gcc/tree-vect-data-refs.c

   1 /* Data References Analysis and Manipulation Utilities for Vectorization.
   2    Copyright (C) 2003-2021 Free Software Foundation, Inc.
   3    Contributed by Dorit Naishlos <dorit@il.ibm.com>
   4    and Ira Rosen <irar@il.ibm.com>
   5
   6 This file is part of GCC.
   7
   8 GCC is free software; you can redistribute it and/or modify it under
   9 the terms of the GNU General Public License as published by the Free
  10 Software Foundation; either version 3, or (at your option) any later
  11 version.
  12
  13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  16 for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with GCC; see the file COPYING3.  If not see
  20 <http://www.gnu.org/licenses/>.  */
  21
  22 #include "config.h"
  23 #include "system.h"
  24 #include "coretypes.h"
  25 #include "backend.h"
  26 #include "target.h"
  27 #include "rtl.h"
  28 #include "tree.h"
  29 #include "gimple.h"
  30 #include "predict.h"
  31 #include "memmodel.h"
  32 #include "tm_p.h"
  33 #include "ssa.h"
  34 #include "optabs-tree.h"
  35 #include "cgraph.h"
  36 #include "dumpfile.h"
  37 #include "alias.h"
  38 #include "fold-const.h"
  39 #include "stor-layout.h"
  40 #include "tree-eh.h"
  41 #include "gimplify.h"
  42 #include "gimple-iterator.h"
  43 #include "gimplify-me.h"
  44 #include "tree-ssa-loop-ivopts.h"
  45 #include "tree-ssa-loop-manip.h"
  46 #include "tree-ssa-loop.h"
  47 #include "cfgloop.h"
  48 #include "tree-scalar-evolution.h"
  49 #include "tree-vectorizer.h"
  50 #include "expr.h"
  51 #include "builtins.h"
  52 #include "tree-cfg.h"
  53 #include "tree-hash-traits.h"
  54 #include "vec-perm-indices.h"
  55 #include "internal-fn.h"
  56 #include "gimple-fold.h"
  57
  58 /* Return true if load- or store-lanes optab OPTAB is implemented for COUNT
  59    vectors of type VECTYPE.  NAME names OPTAB and is used in dump messages.  */
  60
  61 static bool
  62 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
  63                               tree vectype, unsigned HOST_WIDE_INT count)
  64 {
  65   machine_mode mode, array_mode;
  66   bool limit_p;
  67
  68   mode = TYPE_MODE (vectype);
  69   if (!targetm.array_mode (mode, count).exists (&array_mode))
  70     {  /* No target array mode; fall back to an integer mode.  */
  71       poly_uint64 bits = count * GET_MODE_BITSIZE (mode);
  72       limit_p = !targetm.array_mode_supported_p (mode, count);
  73       if (!int_mode_for_size (bits, limit_p).exists (&array_mode))
  74         {
  75           if (dump_enabled_p ())
  76             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
  77                              "no array mode for %s[%wu]\n",
  78                              GET_MODE_NAME (mode), count);
  79           return false;
  80         }
  81     }
  82
  83   if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
  84     {
  85       if (dump_enabled_p ())
  86         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
  87                          "cannot use %s<%s><%s>\n", name,
  88                          GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
  89       return false;
  90     }
  91
  92   if (dump_enabled_p ())
  93     dump_printf_loc (MSG_NOTE, vect_location,
  94                      "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
  95                      GET_MODE_NAME (mode));
  96
  97   return true;
  98 }
  99
 100
 101 /* Return the smallest scalar part of STMT_INFO.
 102    This is used to determine the vectype of the stmt.  We generally set the
 103    vectype according to the type of the result (lhs).  For stmts whose
 104    result-type is different than the type of the arguments (e.g., demotion,
 105    promotion), vectype will be reset appropriately (later).  Note that we have
 106    to visit the smallest datatype in this function, because that determines the
 107    VF.  If the smallest datatype in the loop is present only as the rhs of a
 108    promotion operation - we'd miss it.
 109    Such a case, where a variable of this datatype does not appear in the lhs
 110    anywhere in the loop, can only occur if it's an invariant: e.g.:
 111    'int_x = (int) short_inv', which we'd expect to have been optimized away by
 112    invariant motion.  However, we cannot rely on invariant motion to always
 113    take invariants out of the loop, and so in the case of promotion we also
 114    have to check the rhs.
 115    SCALAR_TYPE is the caller's initial candidate; it is returned unchanged
 116    when its TYPE_SIZE_UNIT does not fit an unsigned HOST_WIDE_INT.  */
 117
 118 tree
 119 vect_get_smallest_scalar_type (stmt_vec_info stmt_info, tree scalar_type)
 120 {
 121   HOST_WIDE_INT lhs, rhs;
 122
 123   /* During the analysis phase, this function is called on arbitrary
 124      statements that might not have scalar results.  */
 125   if (!tree_fits_uhwi_p (TYPE_SIZE_UNIT (scalar_type)))
 126     return scalar_type;
 127
 128   lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
 129
 130   gassign *assign = dyn_cast <gassign *> (stmt_info->stmt);
 131   if (assign)
 132     {
 133       scalar_type = TREE_TYPE (gimple_assign_lhs (assign));
 134       if (gimple_assign_cast_p (assign)
 135           || gimple_assign_rhs_code (assign) == DOT_PROD_EXPR
 136           || gimple_assign_rhs_code (assign) == WIDEN_SUM_EXPR
 137           || gimple_assign_rhs_code (assign) == WIDEN_MULT_EXPR
 138           || gimple_assign_rhs_code (assign) == WIDEN_LSHIFT_EXPR
 139           || gimple_assign_rhs_code (assign) == WIDEN_PLUS_EXPR
 140           || gimple_assign_rhs_code (assign) == WIDEN_MINUS_EXPR
 141           || gimple_assign_rhs_code (assign) == FLOAT_EXPR)
 142         {
 143           tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (assign));
 144
 145           rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
 146           if (rhs < lhs)
 147             scalar_type = rhs_type;
 148         }
 149     }
 150   else if (gcall *call = dyn_cast <gcall *> (stmt_info->stmt))
 151     {
 152       unsigned int i = 0;
 153       if (gimple_call_internal_p (call))
 154         {
 155           internal_fn ifn = gimple_call_internal_fn (call);
 156           if (internal_load_fn_p (ifn))
 157             /* For loads the LHS type does the trick.  */
 158             i = ~0U;
 159           else if (internal_store_fn_p (ifn))
 160             {
 161               /* For stores use the type of the stored value.  */
 162               i = internal_fn_stored_value_index (ifn);
 163               scalar_type = TREE_TYPE (gimple_call_arg (call, i));
 164               i = ~0U;
 165             }
 166           else if (internal_fn_mask_index (ifn) == 0)
 167             i = 1;  /* Argument 0 is the mask; inspect argument 1.  */
 168         }
 169       if (i < gimple_call_num_args (call))
 170         {
 171           tree rhs_type = TREE_TYPE (gimple_call_arg (call, i));
 172           if (tree_fits_uhwi_p (TYPE_SIZE_UNIT (rhs_type)))
 173             {
 174               rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
 175               if (rhs < lhs)
 176                 scalar_type = rhs_type;
 177             }
 178         }
 179     }
 180
 181   return scalar_type;
 182 }
 183
 184
 185 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
 186    tested at run-time.  Return success if DDR was inserted, or a failure
 187    opt_result if alias checks are disabled or DDR cannot be checked.  */
 188
 189 static opt_result
 190 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
 191 {
 192   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 193
 194   if ((unsigned) param_vect_max_version_for_alias_checks == 0)
 195     return opt_result::failure_at (vect_location,
 196                                    "will not create alias checks, as"
 197                                    " --param vect-max-version-for-alias-checks"
 198                                    " == 0\n");
 199
 200   opt_result res
 201     = runtime_alias_check_p (ddr, loop,
 202                              optimize_loop_nest_for_speed_p (loop));
 203   if (!res)
 204     return res;
 205
 206   LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
 207   return opt_result::success ();
 208 }
 209
 210 /* Record once that loop LOOP_VINFO needs to check that VALUE is nonzero.  */
 211
 212 static void
 213 vect_check_nonzero_value (loop_vec_info loop_vinfo, tree value)
 214 {
 215   const vec<tree> &checks = LOOP_VINFO_CHECK_NONZERO (loop_vinfo);
 216   for (unsigned int i = 0; i < checks.length(); ++i)
 217     if (checks[i] == value)
 218       return;  /* VALUE is already recorded.  */
 219
 220   if (dump_enabled_p ())
 221     dump_printf_loc (MSG_NOTE, vect_location,
 222                      "need run-time check that %T is nonzero\n",
 223                      value);
 224   LOOP_VINFO_CHECK_NONZERO (loop_vinfo).safe_push (value);
 225 }
 226
 227 /* Return true if we know that the order of vectorized DR_INFO_A and
 228    vectorized DR_INFO_B will be the same as the order of DR_INFO_A and
 229    DR_INFO_B.  At least one of the accesses is a write.  */
 230
 231 static bool
 232 vect_preserves_scalar_order_p (dr_vec_info *dr_info_a, dr_vec_info *dr_info_b)
 233 {
 234   stmt_vec_info stmtinfo_a = dr_info_a->stmt;
 235   stmt_vec_info stmtinfo_b = dr_info_b->stmt;
 236
 237   /* Single statements are always kept in their original order.  */
 238   if (!STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
 239       && !STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
 240     return true;
 241
 242   /* STMT_A and STMT_B belong to overlapping groups.  All loads are
 243      emitted at the position of the first scalar load.
 244      Stores in a group are emitted at the position of the last scalar store.
 245      Compute that position and check whether the resulting order matches
 246      the current one.  */
 247   stmt_vec_info il_a = DR_GROUP_FIRST_ELEMENT (stmtinfo_a);
 248   if (il_a)
 249     {
 250       if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmtinfo_a)))
 251         for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_a); s;
 252              s = DR_GROUP_NEXT_ELEMENT (s))
 253           il_a = get_later_stmt (il_a, s);  /* Latest store so far.  */
 254       else /* DR_IS_READ */
 255         for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_a); s;
 256              s = DR_GROUP_NEXT_ELEMENT (s))
 257           if (get_later_stmt (il_a, s) == il_a)
 258             il_a = s;  /* S precedes IL_A; keep the earliest load.  */
 259     }
 260   else
 261     il_a = stmtinfo_a;  /* No group leader: use the stmt itself.  */
 262   stmt_vec_info il_b = DR_GROUP_FIRST_ELEMENT (stmtinfo_b);
 263   if (il_b)
 264     {
 265       if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmtinfo_b)))
 266         for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_b); s;
 267              s = DR_GROUP_NEXT_ELEMENT (s))
 268           il_b = get_later_stmt (il_b, s);  /* Latest store so far.  */
 269       else /* DR_IS_READ */
 270         for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_b); s;
 271              s = DR_GROUP_NEXT_ELEMENT (s))
 272           if (get_later_stmt (il_b, s) == il_b)
 273             il_b = s;  /* S precedes IL_B; keep the earliest load.  */
 274     }
 275   else
 276     il_b = stmtinfo_b;  /* No group leader: use the stmt itself.  */
 277   bool a_after_b = (get_later_stmt (stmtinfo_a, stmtinfo_b) == stmtinfo_a);
 278   return (get_later_stmt (il_a, il_b) == il_a) == a_after_b;
 279 }
 280
 281 /* A subroutine of vect_analyze_data_ref_dependence.  Handle
 282    DDR_COULD_BE_INDEPENDENT_P ddr DDR that has a known set of dependence
 283    distances.  These distances are conservatively correct but they don't
 284    reflect a guaranteed dependence.
 285
 286    Return true if this function does all the work necessary to avoid
 287    an alias or false if the caller should use the dependence distances
 288    to limit the vectorization factor in the usual way.  LOOP_DEPTH is
 289    the depth of the loop described by LOOP_VINFO and the other arguments
 290    are as for vect_analyze_data_ref_dependence.  */
 291
 292 static bool
 293 vect_analyze_possibly_independent_ddr (data_dependence_relation *ddr,
 294                                        loop_vec_info loop_vinfo,
 295                                        int loop_depth, unsigned int *max_vf)
 296 {
 297   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 298   for (lambda_vector &dist_v : DDR_DIST_VECTS (ddr))
 299     {
 300       int dist = dist_v[loop_depth];
 301       if (dist != 0 && !(dist > 0 && DDR_REVERSED_P (ddr)))
 302         {
 303           /* If the user asserted safelen >= DIST consecutive iterations
 304              can be executed concurrently, assume independence.
 305
 306              ??? An alternative would be to add the alias check even
 307              in this case, and vectorize the fallback loop with the
 308              maximum VF set to safelen.  However, if the user has
 309              explicitly given a length, it's less likely that that
 310              would be a win.  */
 311           if (loop->safelen >= 2 && abs_hwi (dist) <= loop->safelen)
 312             {
 313               if ((unsigned int) loop->safelen < *max_vf)
 314                 *max_vf = loop->safelen;
 315               LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
 316               continue;
 317             }
 318
 319           /* For dependence distances of 2 or more, we have the option
 320              of limiting VF or checking for an alias at runtime.
 321              Prefer to check at runtime if we can, to avoid limiting
 322              the VF unnecessarily when the bases are in fact independent.
 323
 324              Note that the alias checks will be removed if the VF ends up
 325              being small enough.  */
 326           dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (DDR_A (ddr));
 327           dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (DDR_B (ddr));
 328           return (!STMT_VINFO_GATHER_SCATTER_P (dr_info_a->stmt)
 329                   && !STMT_VINFO_GATHER_SCATTER_P (dr_info_b->stmt)
 330                   && vect_mark_for_runtime_alias_test (ddr, loop_vinfo));
 331         }
 332     }
 333   return true;  /* Every distance vector was dealt with above.  */
 334 }
 335
 336
 337 /* Function vect_analyze_data_ref_dependence.
 338
 339    Analyze the dependence relation DDR between its two memory references
 340    DRA and DRB in the loop described by LOOP_VINFO.
 341    Return a failure opt_result if there (might) exist a dependence between
 342    DRA and DRB that prevents vectorization.  When the dependence is instead
 343    left to a run-time check (versioning for alias), return the result of
 344    recording DDR for that check.  Adjust *MAX_VF according to the dependence.  */
 345
 346 static opt_result
 347 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
 348                                   loop_vec_info loop_vinfo,
 349                                   unsigned int *max_vf)
 350 {
 351   unsigned int i;
 352   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 353   struct data_reference *dra = DDR_A (ddr);
 354   struct data_reference *drb = DDR_B (ddr);
 355   dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (dra);
 356   dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (drb);
 357   stmt_vec_info stmtinfo_a = dr_info_a->stmt;
 358   stmt_vec_info stmtinfo_b = dr_info_b->stmt;
 359   lambda_vector dist_v;
 360   unsigned int loop_depth;
 361
 362   /* In loop analysis all data references should be vectorizable.  */
 363   if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
 364       || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
 365     gcc_unreachable ();
 366
 367   /* Independent data accesses.  */
 368   if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
 369     return opt_result::success ();
 370
 371   if (dra == drb
 372       || (DR_IS_READ (dra) && DR_IS_READ (drb)))
 373     return opt_result::success ();
 374
 375   /* We do not have to consider dependences between accesses that belong
 376      to the same group, unless the stride could be smaller than the
 377      group size.  */
 378   if (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
 379       && (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
 380           == DR_GROUP_FIRST_ELEMENT (stmtinfo_b))
 381       && !STMT_VINFO_STRIDED_P (stmtinfo_a))
 382     return opt_result::success ();
 383
 384   /* Even if we have an anti-dependence then, as the vectorized loop covers at
 385      least two scalar iterations, there is always also a true dependence.
 386      As the vectorizer does not re-order loads and stores we can ignore
 387      the anti-dependence if TBAA can disambiguate both DRs similar to the
 388      case with known negative distance anti-dependences (positive
 389      distance anti-dependences would violate TBAA constraints).  */
 390   if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
 391        || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
 392       && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
 393                                  get_alias_set (DR_REF (drb))))
 394     return opt_result::success ();
 395
 396   /* Unknown data dependence.  */
 397   if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
 398     {
 399       /* If user asserted safelen consecutive iterations can be
 400          executed concurrently, assume independence.  */
 401       if (loop->safelen >= 2)
 402         {
 403           if ((unsigned int) loop->safelen < *max_vf)
 404             *max_vf = loop->safelen;
 405           LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
 406           return opt_result::success ();
 407         }
 408
 409       if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
 410           || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
 411         return opt_result::failure_at
 412           (stmtinfo_a->stmt,
 413            "versioning for alias not supported for: "
 414            "can't determine dependence between %T and %T\n",
 415            DR_REF (dra), DR_REF (drb));
 416
 417       if (dump_enabled_p ())
 418         dump_printf_loc (MSG_MISSED_OPTIMIZATION, stmtinfo_a->stmt,
 419                          "versioning for alias required: "
 420                          "can't determine dependence between %T and %T\n",
 421                          DR_REF (dra), DR_REF (drb));
 422
 423       /* Add to list of ddrs that need to be tested at run-time.  */
 424       return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
 425     }
 426
 427   /* Known data dependence.  */
 428   if (DDR_NUM_DIST_VECTS (ddr) == 0)
 429     {
 430       /* If user asserted safelen consecutive iterations can be
 431          executed concurrently, assume independence.  */
 432       if (loop->safelen >= 2)
 433         {
 434           if ((unsigned int) loop->safelen < *max_vf)
 435             *max_vf = loop->safelen;
 436           LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
 437           return opt_result::success ();
 438         }
 439
 440       if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
 441           || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
 442         return opt_result::failure_at
 443           (stmtinfo_a->stmt,
 444            "versioning for alias not supported for: "
 445            "bad dist vector for %T and %T\n",
 446            DR_REF (dra), DR_REF (drb));
 447
 448       if (dump_enabled_p ())
 449         dump_printf_loc (MSG_MISSED_OPTIMIZATION, stmtinfo_a->stmt,
 450                          "versioning for alias required: "
 451                          "bad dist vector for %T and %T\n",
 452                          DR_REF (dra), DR_REF (drb));
 453       /* Add to list of ddrs that need to be tested at run-time.  */
 454       return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
 455     }
 456
 457   loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
 458
 459   if (DDR_COULD_BE_INDEPENDENT_P (ddr)
 460       && vect_analyze_possibly_independent_ddr (ddr, loop_vinfo,
 461                                                 loop_depth, max_vf))
 462     return opt_result::success ();
 463
 464   FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
 465     {
 466       int dist = dist_v[loop_depth];
 467
 468       if (dump_enabled_p ())
 469         dump_printf_loc (MSG_NOTE, vect_location,
 470                          "dependence distance  = %d.\n", dist);
 471
 472       if (dist == 0)
 473         {
 474           if (dump_enabled_p ())
 475             dump_printf_loc (MSG_NOTE, vect_location,
 476                              "dependence distance == 0 between %T and %T\n",
 477                              DR_REF (dra), DR_REF (drb));
 478
 479           /* When we perform grouped accesses and perform implicit CSE
 480              by detecting equal accesses and doing disambiguation with
 481              runtime alias tests like for
 482                 .. = a[i];
 483                 .. = a[i+1];
 484                 a[i] = ..;
 485                 a[i+1] = ..;
 486                 *p = ..;
 487                 .. = a[i];
 488                 .. = a[i+1];
 489              where we will end up loading { a[i], a[i+1] } once, make
 490              sure that inserting group loads before the first load and
 491              stores after the last store will do the right thing.
 492              Similar for groups like
 493                 a[i] = ...;
 494                 ... = a[i];
 495                 a[i+1] = ...;
 496              where loads from the group interleave with the store.  */
 497           if (!vect_preserves_scalar_order_p (dr_info_a, dr_info_b))
 498             return opt_result::failure_at (stmtinfo_a->stmt,
 499                                            "READ_WRITE dependence"
 500                                            " in interleaving.\n");
 501
 502           if (loop->safelen < 2)
 503             {
 504               tree indicator = dr_zero_step_indicator (dra);
 505               if (!indicator || integer_zerop (indicator))
 506                 return opt_result::failure_at (stmtinfo_a->stmt,
 507                                                "access also has a zero step\n");
 508               else if (TREE_CODE (indicator) != INTEGER_CST)
 509                 vect_check_nonzero_value (loop_vinfo, indicator);
 510             }
 511           continue;
 512         }
 513
 514       if (dist > 0 && DDR_REVERSED_P (ddr))
 515         {
 516           /* If DDR_REVERSED_P the order of the data-refs in DDR was
 517              reversed (to make distance vector positive), and the actual
 518              distance is negative.  */
 519           if (dump_enabled_p ())
 520             dump_printf_loc (MSG_NOTE, vect_location,
 521                              "dependence distance negative.\n");
 522           /* When doing outer loop vectorization, we need to check if there is
 523              a backward dependence at the inner loop level if the dependence
 524              at the outer loop is reversed.  See PR81740.  */
 525           if (nested_in_vect_loop_p (loop, stmtinfo_a)
 526               || nested_in_vect_loop_p (loop, stmtinfo_b))
 527             {
 528               unsigned inner_depth = index_in_loop_nest (loop->inner->num,
 529                                                          DDR_LOOP_NEST (ddr));
 530               if (dist_v[inner_depth] < 0)
 531                 return opt_result::failure_at (stmtinfo_a->stmt,
 532                                                "not vectorized, dependence "
 533                                                "between data-refs %T and %T\n",
 534                                                DR_REF (dra), DR_REF (drb));
 535             }
 536           /* Record a negative dependence distance to later limit the
 537              amount of stmt copying / unrolling we can perform.
 538              Only need to handle read-after-write dependence.  */
 539           if (DR_IS_READ (drb)
 540               && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
 541                   || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
 542             STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
 543           continue;
 544         }
 545
 546       unsigned int abs_dist = abs (dist);
 547       if (abs_dist >= 2 && abs_dist < *max_vf)
 548         {
 549           /* The dependence distance requires reduction of the maximal
 550              vectorization factor.  */
 551           *max_vf = abs_dist;
 552           if (dump_enabled_p ())
 553             dump_printf_loc (MSG_NOTE, vect_location,
 554                              "adjusting maximal vectorization factor to %i\n",
 555                              *max_vf);
 556         }
 557
 558       if (abs_dist >= *max_vf)
 559         {
 560           /* Dependence distance does not create dependence, as far as
 561              vectorization is concerned, in this case.  */
 562           if (dump_enabled_p ())
 563             dump_printf_loc (MSG_NOTE, vect_location,
 564                              "dependence distance >= VF.\n");
 565           continue;
 566         }
 567
 568       return opt_result::failure_at (stmtinfo_a->stmt,
 569                                      "not vectorized, possible dependence "
 570                                      "between data-refs %T and %T\n",
 571                                      DR_REF (dra), DR_REF (drb));
 572     }
 573
 574   return opt_result::success ();
 575 }
 576
 577 /* Function vect_analyze_data_ref_dependences.
 578
 579    Examine all the data references in the loop described by LOOP_VINFO and
 580    make sure no data dependences between them prevent vectorization.  Adjust
 581    *MAX_VF accordingly.  Return the first failure found, otherwise success.  */
 582
 583 opt_result
 584 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo,
 585                                    unsigned int *max_vf)
 586 {
 587   unsigned int i;
 588   struct data_dependence_relation *ddr;
 589
 590   DUMP_VECT_SCOPE ("vect_analyze_data_ref_dependences");
 591
 592   if (!LOOP_VINFO_DDRS (loop_vinfo).exists ())
 593     {
 594       LOOP_VINFO_DDRS (loop_vinfo)
 595         .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
 596                  * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
 597       /* We do not need read-read dependences.  */
 598       bool res = compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
 599                                           &LOOP_VINFO_DDRS (loop_vinfo),
 600                                           LOOP_VINFO_LOOP_NEST (loop_vinfo),
 601                                           false);
 602       gcc_assert (res);
 603     }
 604
 605   LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
 606
 607   /* For epilogues we either have no aliases or alias versioning
 608      was applied to original loop.  Therefore we may just get max_vf
 609      using VF of original loop.  */
 610   if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
 611     *max_vf = LOOP_VINFO_ORIG_MAX_VECT_FACTOR (loop_vinfo);
 612   else
 613     FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
 614       {
 615         opt_result res
 616           = vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf);
 617         if (!res)
 618           return res;
 619       }
 620
 621   return opt_result::success ();
 622 }
 623
 624
 625 /* Function vect_slp_analyze_data_ref_dependence.
 626
 627    Return TRUE if there (might) exist a dependence between a memory-reference
 628    DRA and a memory-reference DRB for VINFO.  Return FALSE when DDR is known
 629    to be independent, or when it relates a reference to itself, two reads,
 630    or two members of the same interleaving chain.  */
 631
 632 static bool
 633 vect_slp_analyze_data_ref_dependence (vec_info *vinfo,
 634                                       struct data_dependence_relation *ddr)
 635 {
 636   struct data_reference *dra = DDR_A (ddr);
 637   struct data_reference *drb = DDR_B (ddr);
 638   dr_vec_info *dr_info_a = vinfo->lookup_dr (dra);
 639   dr_vec_info *dr_info_b = vinfo->lookup_dr (drb);
 640
 641   /* We need to check dependences of statements marked as unvectorizable
 642      as well, they still can prohibit vectorization.  */
 643
 644   /* Independent data accesses.  */
 645   if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
 646     return false;
 647
 648   if (dra == drb)
 649     return false;
 650
 651   /* Read-read is OK.  */
 652   if (DR_IS_READ (dra) && DR_IS_READ (drb))
 653     return false;
 654
 655   /* If dra and drb are part of the same interleaving chain consider
 656      them independent.  */
 657   if (STMT_VINFO_GROUPED_ACCESS (dr_info_a->stmt)
 658       && (DR_GROUP_FIRST_ELEMENT (dr_info_a->stmt)
 659           == DR_GROUP_FIRST_ELEMENT (dr_info_b->stmt)))
 660     return false;
 661
 662   /* Unknown data dependence.  */
 663   if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
 664     {
 665       if  (dump_enabled_p ())
 666         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 667                          "can't determine dependence between %T and %T\n",
 668                          DR_REF (dra), DR_REF (drb));
 669     }
 670   else if (dump_enabled_p ())
 671     dump_printf_loc (MSG_NOTE, vect_location,
 672                      "determined dependence between %T and %T\n",
 673                      DR_REF (dra), DR_REF (drb));
 674
 675   return true;
 676 }
 677
 678
 679 /* Analyze dependences involved in the transform of SLP NODE.  STORES
 680    contain the vector of scalar stores of this instance if we are
 681    disambiguating the loads.  */
 682
 683 static bool
 684 vect_slp_analyze_node_dependences (vec_info *vinfo, slp_tree node,
 685                                    vec<stmt_vec_info> stores,
 686                                    stmt_vec_info last_store_info)
 687 {
 688   /* This walks over all stmts involved in the SLP load/store done
 689      in NODE verifying we can sink them up to the last stmt in the
 690      group.  */
 691   if (DR_IS_WRITE (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (node))))
 692     {
 693       stmt_vec_info last_access_info = vect_find_last_scalar_stmt_in_slp (node);
 694       for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (node).length (); ++k)
 695         {
 696           stmt_vec_info access_info
 697             = vect_orig_stmt (SLP_TREE_SCALAR_STMTS (node)[k]);
 698           if (access_info == last_access_info)
 699             continue;
 700           data_reference *dr_a = STMT_VINFO_DATA_REF (access_info);
 701           ao_ref ref;
 702           bool ref_initialized_p = false;
 703           for (gimple_stmt_iterator gsi = gsi_for_stmt (access_info->stmt);
 704                gsi_stmt (gsi) != last_access_info->stmt; gsi_next (&gsi))
 705             {
 706               gimple *stmt = gsi_stmt (gsi);
 707               if (! gimple_vuse (stmt))
 708                 continue;
 709
 710               /* If we couldn't record a (single) data reference for this
 711                  stmt we have to resort to the alias oracle.  */
 712               stmt_vec_info stmt_info = vinfo->lookup_stmt (stmt);
 713               data_reference *dr_b = STMT_VINFO_DATA_REF (stmt_info);
 714               if (!dr_b)
 715                 {
 716                   /* We are moving a store - this means
 717                      we cannot use TBAA for disambiguation.  */
 718                   if (!ref_initialized_p)
 719                     ao_ref_init (&ref, DR_REF (dr_a));
 720                   if (stmt_may_clobber_ref_p_1 (stmt, &ref, false)
 721                       || ref_maybe_used_by_stmt_p (stmt, &ref, false))
 722                     return false;
 723                   continue;
 724                 }
 725
 726               bool dependent = false;
 727               /* If we run into a store of this same instance (we've just
 728                  marked those) then delay dependence checking until we run
 729                  into the last store because this is where it will have
 730                  been sunk to (and we verify if we can do that as well).  */
 731               if (gimple_visited_p (stmt))
 732                 {
 733                   if (stmt_info != last_store_info)
 734                     continue;
 735
 736                   for (stmt_vec_info &store_info : stores)
 737                     {
 738                       data_reference *store_dr
 739                         = STMT_VINFO_DATA_REF (store_info);
 740                       ddr_p ddr = initialize_data_dependence_relation
 741                                     (dr_a, store_dr, vNULL);
 742                       dependent
 743                         = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
 744                       free_dependence_relation (ddr);
 745                       if (dependent)
 746                         break;
 747                     }
 748                 }
 749               else
 750                 {
 751                   ddr_p ddr = initialize_data_dependence_relation (dr_a,
 752                                                                    dr_b, vNULL);
 753                   dependent = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
 754                   free_dependence_relation (ddr);
 755                 }
 756               if (dependent)
 757                 return false;
 758             }
 759         }
 760     }
 761   else /* DR_IS_READ */
 762     {
 763       stmt_vec_info first_access_info
 764         = vect_find_first_scalar_stmt_in_slp (node);
 765       for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (node).length (); ++k)
 766         {
 767           stmt_vec_info access_info
 768             = vect_orig_stmt (SLP_TREE_SCALAR_STMTS (node)[k]);
 769           if (access_info == first_access_info)
 770             continue;
 771           data_reference *dr_a = STMT_VINFO_DATA_REF (access_info);
 772           ao_ref ref;
 773           bool ref_initialized_p = false;
 774           for (gimple_stmt_iterator gsi = gsi_for_stmt (access_info->stmt);
 775                gsi_stmt (gsi) != first_access_info->stmt; gsi_prev (&gsi))
 776             {
 777               gimple *stmt = gsi_stmt (gsi);
 778               if (! gimple_vdef (stmt))
 779                 continue;
 780
 781               /* If we couldn't record a (single) data reference for this
 782                  stmt we have to resort to the alias oracle.  */
 783               stmt_vec_info stmt_info = vinfo->lookup_stmt (stmt);
 784               data_reference *dr_b = STMT_VINFO_DATA_REF (stmt_info);
 785
 786               /* We are hoisting a load - this means we can use
 787                  TBAA for disambiguation.  */
 788               if (!ref_initialized_p)
 789                 ao_ref_init (&ref, DR_REF (dr_a));
 790               if (stmt_may_clobber_ref_p_1 (stmt, &ref, true))
 791                 {
 792                   if (!dr_b)
 793                     return false;
 794                   /* Resort to dependence checking below.  */
 795                 }
 796               else
 797                 /* No dependence.  */
 798                 continue;
 799
 800               bool dependent = false;
 801               /* If we run into a store of this same instance (we've just
 802                  marked those) then delay dependence checking until we run
 803                  into the last store because this is where it will have
 804                  been sunk to (and we verify if we can do that as well).  */
 805               if (gimple_visited_p (stmt))
 806                 {
 807                   if (stmt_info != last_store_info)
 808                     continue;
 809
 810                   for (stmt_vec_info &store_info : stores)
 811                     {
 812                       data_reference *store_dr
 813                         = STMT_VINFO_DATA_REF (store_info);
 814                       ddr_p ddr = initialize_data_dependence_relation
 815                                     (dr_a, store_dr, vNULL);
 816                       dependent
 817                         = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
 818                       free_dependence_relation (ddr);
 819                       if (dependent)
 820                         break;
 821                     }
 822                 }
 823               else
 824                 {
 825                   ddr_p ddr = initialize_data_dependence_relation (dr_a,
 826                                                                    dr_b, vNULL);
 827                   dependent = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
 828                   free_dependence_relation (ddr);
 829                 }
 830               if (dependent)
 831                 return false;
 832             }
 833         }
 834     }
 835   return true;
 836 }
 837
 838
 839 /* Function vect_slp_analyze_instance_dependence.
 840
 841    Check whether the scalar stores and loads of the SLP instance INSTANCE
 842    can be moved to the vectorized stmt insert location without violating
 843    data dependences.  Return true if that is possible, false otherwise.  */
 844
 845 bool
 846 vect_slp_analyze_instance_dependence (vec_info *vinfo, slp_instance instance)
 847 {
 848   DUMP_VECT_SCOPE ("vect_slp_analyze_instance_dependence");
 849
 850   /* The stores of this instance are at the root of the SLP tree.  */
 851   slp_tree store = NULL;
 852   if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_store)
 853     store = SLP_INSTANCE_TREE (instance);
 854
 855   /* Verify we can sink stores to the vectorized stmt insert location.  */
 856   stmt_vec_info last_store_info = NULL;
 857   if (store)
 858     {
 859       if (! vect_slp_analyze_node_dependences (vinfo, store, vNULL, NULL))
 860         return false;
 861
 862       /* Mark stores in this instance and remember the last one.  */
 863       last_store_info = vect_find_last_scalar_stmt_in_slp (store);
 864       for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (store).length (); ++k)
 865         gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, true);
 866     }
 867
 868   bool res = true;
 869
 870   /* Verify we can hoist loads to the vectorized stmt insert location,
 871      special-casing stores of this instance.  */
 872   for (slp_tree &load : SLP_INSTANCE_LOADS (instance))
 873     if (! vect_slp_analyze_node_dependences (vinfo, load,
 874                                              store
 875                                              ? SLP_TREE_SCALAR_STMTS (store)
 876                                              : vNULL, last_store_info))
 877       {
             /* Do not return directly: the visited flags set above still
                have to be cleared below.  */
 878         res = false;
 879         break;
 880       }
 881
 882   /* Unset the visited flag.  */
 883   if (store)
 884     for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (store).length (); ++k)
 885       gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, false);
 886
 887   return res;
 888 }
 889
 890 /* Return the misalignment of DR_INFO accessed in VECTYPE with OFFSET
 891    applied.  Return DR_MISALIGNMENT_UNKNOWN if the recorded misalignment
        is unknown, if the recorded target alignment is possibly smaller than
        the preferred alignment of VECTYPE, or if known_misalignment cannot
        compute the adjusted misalignment.  */
 892
 893 int
 894 dr_misalignment (dr_vec_info *dr_info, tree vectype, poly_int64 offset)
 895 {
 896   HOST_WIDE_INT diff = 0;
 897   /* Alignment is only analyzed for the first element of a DR group,
 898      use that but adjust misalignment by the offset of the access.  */
 899   if (STMT_VINFO_GROUPED_ACCESS (dr_info->stmt))
 900     {
 901       dr_vec_info *first_dr
 902         = STMT_VINFO_DR_INFO (DR_GROUP_FIRST_ELEMENT (dr_info->stmt));
 903       /* vect_analyze_data_ref_accesses guarantees that DR_INIT are
 904          INTEGER_CSTs and the first element in the group has the lowest
 905          address.  */
 906       diff = (TREE_INT_CST_LOW (DR_INIT (dr_info->dr))
 907               - TREE_INT_CST_LOW (DR_INIT (first_dr->dr)));
 908       gcc_assert (diff >= 0);
           /* From here on all alignment data is read from the group leader.  */
 909       dr_info = first_dr;
 910     }
 911
 912   int misalign = dr_info->misalignment;
 913   gcc_assert (misalign != DR_MISALIGNMENT_UNINITIALIZED);
 914   if (misalign == DR_MISALIGNMENT_UNKNOWN)
 915     return misalign;
 916
 917   /* If the access is only aligned for a vector type with smaller alignment
 918      requirement the access has unknown misalignment.  */
 919   if (maybe_lt (dr_info->target_alignment * BITS_PER_UNIT,
 920                 targetm.vectorize.preferred_vector_alignment (vectype)))
 921     return DR_MISALIGNMENT_UNKNOWN;
 922
 923   /* Apply the offset from the DR group start and the externally supplied
 924      offset which can for example result from a negative stride access.  */
 925   poly_int64 misalignment = misalign + diff + offset;
 926
 927   /* vect_compute_data_ref_alignment will have ensured that target_alignment
 928      is constant and otherwise set misalign to DR_MISALIGNMENT_UNKNOWN.  */
 929   unsigned HOST_WIDE_INT target_alignment_c
 930     = dr_info->target_alignment.to_constant ();
 931   if (!known_misalignment (misalignment, target_alignment_c, &misalign))
 932     return DR_MISALIGNMENT_UNKNOWN;
 933   return misalign;
 934 }
 935
 936 /* Record the base alignment guarantee given by DRB, which occurs
 937    in STMT_INFO.  An entry already recorded in VINFO for the same base
        address is only replaced if DRB provides a strictly larger base
        alignment.  */
 938
 939 static void
 940 vect_record_base_alignment (vec_info *vinfo, stmt_vec_info stmt_info,
 941                             innermost_loop_behavior *drb)
 942 {
 943   bool existed;
 944   std::pair<stmt_vec_info, innermost_loop_behavior *> &entry
 945     = vinfo->base_alignments.get_or_insert (drb->base_address, &existed);
       /* ENTRY is a reference into the map, so assigning to it below updates
          the recorded guarantee in place.  */
 946   if (!existed || entry.second->base_alignment < drb->base_alignment)
 947     {
 948       entry = std::make_pair (stmt_info, drb);
 949       if (dump_enabled_p ())
 950         dump_printf_loc (MSG_NOTE, vect_location,
 951                          "recording new base alignment for %T\n"
 952                          "  alignment:    %d\n"
 953                          "  misalignment: %d\n"
 954                          "  based on:     %G",
 955                          drb->base_address,
 956                          drb->base_alignment,
 957                          drb->base_misalignment,
 958                          stmt_info->stmt);
 959     }
 960 }
 961
 962 /* If the region we're going to vectorize is reached, all unconditional
 963    data references occur at least once.  We can therefore pool the base
 964    alignment guarantees from each unconditional reference.  Do this by
 965    going through all the data references in VINFO and checking whether
 966    the containing statement makes the reference unconditionally.  If so,
 967    record the alignment of the base address in VINFO so that it can be
 968    used for all other references with the same base.  */
 969
 970 void
 971 vect_record_base_alignments (vec_info *vinfo)
 972 {
 973   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
       /* LOOP stays NULL when VINFO is not a loop_vec_info.  */
 974   class loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
 975   for (data_reference *dr : vinfo->shared->datarefs)
 976     {
 977       dr_vec_info *dr_info = vinfo->lookup_dr (dr);
 978       stmt_vec_info stmt_info = dr_info->stmt;
           /* Only pool guarantees from references that are unconditional in
              their stmt, vectorizable and not gather/scatter accesses.  */
 979       if (!DR_IS_CONDITIONAL_IN_STMT (dr)
 980           && STMT_VINFO_VECTORIZABLE (stmt_info)
 981           && !STMT_VINFO_GATHER_SCATTER_P (stmt_info))
 982         {
 983           vect_record_base_alignment (vinfo, stmt_info, &DR_INNERMOST (dr));
 984
 985           /* If DR is nested in the loop that is being vectorized, we can also
 986              record the alignment of the base wrt the outer loop.  */
 987           if (loop && nested_in_vect_loop_p (loop, stmt_info))
 988             vect_record_base_alignment
 989               (vinfo, stmt_info, &STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info));
 990         }
 991     }
 992 }
 993
 994 /* Function vect_compute_data_ref_alignment
 995
 996    Compute the misalignment of the data reference DR_INFO when vectorizing
 997    with VECTYPE.
 998
 999    Output:
1000    1. initialized misalignment info for DR_INFO
        2. the target alignment of DR_INFO (except for gather/scatter accesses)
        3. DR_INFO's base_decl and base_misaligned when the alignment of the
           base has to be forced
1001
1002    The misalignment is left as DR_MISALIGNMENT_UNKNOWN on every early
1003    return; a constant misalignment is only recorded at the very end.  */
1004
1005 static void
1006 vect_compute_data_ref_alignment (vec_info *vinfo, dr_vec_info *dr_info,
1007                                  tree vectype)
1008 {
1009   stmt_vec_info stmt_info = dr_info->stmt;
1010   vec_base_alignments *base_alignments = &vinfo->base_alignments;
1011   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
1012   class loop *loop = NULL;
1013   tree ref = DR_REF (dr_info->dr);
1014
1015   if (dump_enabled_p ())
1016     dump_printf_loc (MSG_NOTE, vect_location,
1017                      "vect_compute_data_ref_alignment:\n");
1018
1019   if (loop_vinfo)
1020     loop = LOOP_VINFO_LOOP (loop_vinfo);
1021
1022   /* Initialize misalignment to unknown.  */
1023   SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN);
1024
1025   if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
1026     return;
1027
1028   innermost_loop_behavior *drb = vect_dr_behavior (vinfo, dr_info);
       /* Assigned in each of the three branches of the if-chain below.  */
1029   bool step_preserves_misalignment_p;
1030
1031   poly_uint64 vector_alignment
1032     = exact_div (targetm.vectorize.preferred_vector_alignment (vectype),
1033                  BITS_PER_UNIT);
1034   SET_DR_TARGET_ALIGNMENT (dr_info, vector_alignment);
1035
1036   /* If the main loop has peeled for alignment we have no way of knowing
1037      whether the data accesses in the epilogues are aligned.  We can't at
1038      compile time answer the question whether we have entered the main loop or
1039      not.  Fixes PR 92351.  */
1040   if (loop_vinfo)
1041     {
1042       loop_vec_info orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
1043       if (orig_loop_vinfo
1044           && LOOP_VINFO_PEELING_FOR_ALIGNMENT (orig_loop_vinfo) != 0)
1045         return;
1046     }
1047
       /* The remaining analysis needs a compile-time constant alignment.  */
1048   unsigned HOST_WIDE_INT vect_align_c;
1049   if (!vector_alignment.is_constant (&vect_align_c))
1050     return;
1051
1052   /* No step for BB vectorization.  */
1053   if (!loop)
1054     {
1055       gcc_assert (integer_zerop (drb->step));
1056       step_preserves_misalignment_p = true;
1057     }
1058
1059   /* In case the dataref is in an inner-loop of the loop that is being
1060      vectorized (LOOP), we use the base and misalignment information
1061      relative to the outer-loop (LOOP).  This is ok only if the misalignment
1062      stays the same throughout the execution of the inner-loop, which is why
1063      we have to check that the stride of the dataref in the inner-loop evenly
1064      divides by the vector alignment.  */
1065   else if (nested_in_vect_loop_p (loop, stmt_info))
1066     {
1067       step_preserves_misalignment_p
1068         = (DR_STEP_ALIGNMENT (dr_info->dr) % vect_align_c) == 0;
1069
1070       if (dump_enabled_p ())
1071         {
1072           if (step_preserves_misalignment_p)
1073             dump_printf_loc (MSG_NOTE, vect_location,
1074                              "inner step divides the vector alignment.\n");
1075           else
1076             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1077                              "inner step doesn't divide the vector"
1078                              " alignment.\n");
1079         }
1080     }
1081
1082   /* Similarly we can only use base and misalignment information relative to
1083      an innermost loop if the misalignment stays the same throughout the
1084      execution of the loop.  As above, this is the case if the stride of
1085      the dataref evenly divides by the alignment.  */
1086   else
1087     {
1088       poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1089       step_preserves_misalignment_p
1090         = multiple_p (DR_STEP_ALIGNMENT (dr_info->dr) * vf, vect_align_c);
1091
1092       if (!step_preserves_misalignment_p && dump_enabled_p ())
1093         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1094                          "step doesn't divide the vector alignment.\n");
1095     }
1096
1097   unsigned int base_alignment = drb->base_alignment;
1098   unsigned int base_misalignment = drb->base_misalignment;
1099
1100   /* Calculate the maximum of the pooled base address alignment and the
1101      alignment that we can compute for DR itself.  */
1102   std::pair<stmt_vec_info, innermost_loop_behavior *> *entry
1103     = base_alignments->get (drb->base_address);
       /* Without a loop_vinfo the pooled entry is only used when the stmt
          that recorded it dominates this one and, within the same basic
          block, its dr_aux.group is not larger than DR_INFO's group.  */
1104   if (entry
1105       && base_alignment < (*entry).second->base_alignment
1106       && (loop_vinfo
1107           || (dominated_by_p (CDI_DOMINATORS, gimple_bb (stmt_info->stmt),
1108                               gimple_bb (entry->first->stmt))
1109               && (gimple_bb (stmt_info->stmt) != gimple_bb (entry->first->stmt)
1110                   || (entry->first->dr_aux.group <= dr_info->group)))))
1111     {
1112       base_alignment = entry->second->base_alignment;
1113       base_misalignment = entry->second->base_misalignment;
1114     }
1115
1116   if (drb->offset_alignment < vect_align_c
1117       || !step_preserves_misalignment_p
1118       /* We need to know whether the step wrt the vectorized loop is
1119          negative when computing the starting misalignment below.  */
1120       || TREE_CODE (drb->step) != INTEGER_CST)
1121     {
1122       if (dump_enabled_p ())
1123         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1124                          "Unknown alignment for access: %T\n", ref);
1125       return;
1126     }
1127
1128   if (base_alignment < vect_align_c)
1129     {
1130       unsigned int max_alignment;
1131       tree base = get_base_for_alignment (drb->base_address, &max_alignment);
1132       if (max_alignment < vect_align_c
1133           || !vect_can_force_dr_alignment_p (base,
1134                                              vect_align_c * BITS_PER_UNIT))
1135         {
1136           if (dump_enabled_p ())
1137             dump_printf_loc (MSG_NOTE, vect_location,
1138                              "can't force alignment of ref: %T\n", ref);
1139           return;
1140         }
1141
1142       /* Force the alignment of the decl.
1143          NOTE: This is the only change to the code we make during
1144          the analysis phase, before deciding to vectorize the loop.  */
1145       if (dump_enabled_p ())
1146         dump_printf_loc (MSG_NOTE, vect_location,
1147                          "force alignment of %T\n", ref);
1148
           /* Only record the request here; the base is treated as aligned
              (misalignment zero) for the computation below.  */
1149       dr_info->base_decl = base;
1150       dr_info->base_misaligned = true;
1151       base_misalignment = 0;
1152     }
1153   poly_int64 misalignment
1154     = base_misalignment + wi::to_poly_offset (drb->init).force_shwi ();
1155
1156   unsigned int const_misalignment;
1157   if (!known_misalignment (misalignment, vect_align_c, &const_misalignment))
1158     {
1159       if (dump_enabled_p ())
1160         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1161                          "Non-constant misalignment for access: %T\n", ref);
1162       return;
1163     }
1164
1165   SET_DR_MISALIGNMENT (dr_info, const_misalignment);
1166
       /* NOTE(review): this reports a successfully computed misalignment but
          is emitted as MSG_MISSED_OPTIMIZATION rather than MSG_NOTE - confirm
          that this is intended.  */
1167   if (dump_enabled_p ())
1168     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1169                      "misalign = %d bytes of ref %T\n",
1170                      const_misalignment, ref);
1171
1172   return;
1173 }
1174
1175 /* Return whether DR_INFO, which is related to DR_PEEL_INFO in
1176    that it only differs in DR_INIT, is aligned if DR_PEEL_INFO
1177    is made aligned via peeling.  */
1178
1179 static bool
1180 vect_dr_aligned_if_related_peeled_dr_is (dr_vec_info *dr_info,
1181                                          dr_vec_info *dr_peel_info)
1182 {
       /* The target alignment of the peeled DR has to be a multiple of the
          target alignment of DR_INFO ...  */
1183   if (multiple_p (DR_TARGET_ALIGNMENT (dr_peel_info),
1184                   DR_TARGET_ALIGNMENT (dr_info)))
1185     {
1186       poly_offset_int diff
1187         = (wi::to_poly_offset (DR_INIT (dr_peel_info->dr))
1188            - wi::to_poly_offset (DR_INIT (dr_info->dr)));
           /* ... and the DR_INITs have to be equal or differ by a multiple of
              DR_INFO's target alignment.  */
1189       if (known_eq (diff, 0)
1190           || multiple_p (diff, DR_TARGET_ALIGNMENT (dr_info)))
1191         return true;
1192     }
1193   return false;
1194 }
1195
1196 /* Return whether DR_INFO is aligned if DR_PEEL_INFO is made
1197    aligned via peeling.  */
1198
1199 static bool
1200 vect_dr_aligned_if_peeled_dr_is (dr_vec_info *dr_info,
1201                                  dr_vec_info *dr_peel_info)
1202 {
       /* The two references have to agree in base address, offset and step,
          so that they differ at most in DR_INIT.  */
1203   if (!operand_equal_p (DR_BASE_ADDRESS (dr_info->dr),
1204                         DR_BASE_ADDRESS (dr_peel_info->dr), 0)
1205       || !operand_equal_p (DR_OFFSET (dr_info->dr),
1206                            DR_OFFSET (dr_peel_info->dr), 0)
1207       || !operand_equal_p (DR_STEP (dr_info->dr),
1208                            DR_STEP (dr_peel_info->dr), 0))
1209     return false;
1210
1211   return vect_dr_aligned_if_related_peeled_dr_is (dr_info, dr_peel_info);
1212 }
1213
1214 /* Compute the value for dr_info->misalignment so that the access appears
1215    aligned.  This is used by peeling to compensate for dr_misalignment
1216    applying the offset for negative step.  Return 0 if the step of DR_INFO
        is not negative and DR_MISALIGNMENT_UNKNOWN if the target alignment is
        not constant or known_misalignment cannot compute the value.  */
1217
1218 int
1219 vect_dr_misalign_for_aligned_access (dr_vec_info *dr_info)
1220 {
1221   if (tree_int_cst_sgn (DR_STEP (dr_info->dr)) >= 0)
1222     return 0;
1223
       /* For a negative step use the size of all but one vector element.  */
1224   tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
1225   poly_int64 misalignment
1226     = ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
1227        * TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))));
1228
1229   unsigned HOST_WIDE_INT target_alignment_c;
1230   int misalign;
1231   if (!dr_info->target_alignment.is_constant (&target_alignment_c)
1232       || !known_misalignment (misalignment, target_alignment_c, &misalign))
1233     return DR_MISALIGNMENT_UNKNOWN;
1234   return misalign;
1235 }
1236
1237 /* Function vect_update_misalignment_for_peel.
1238    Sets DR_INFO's misalignment
1239    - to the value computed by vect_dr_misalign_for_aligned_access if DR_INFO
          is aligned whenever DR_PEEL_INFO is,
1240    - to the misalignment computed using NPEEL if the alignments of DR_INFO
          and DR_PEEL_INFO are known and DR_INFO's target alignment is constant,
1241    - to -1 (unknown) otherwise.
1242
1243    DR_INFO - the data reference whose misalignment is to be adjusted.
1244    DR_PEEL_INFO - the data reference whose misalignment is being made
1245                   zero in the vector loop by the peel.
1246    NPEEL - the number of iterations in the peel loop if the misalignment
1247            of DR_PEEL_INFO is known at compile time.  */
1248
1249 static void
1250 vect_update_misalignment_for_peel (dr_vec_info *dr_info,
1251                                    dr_vec_info *dr_peel_info, int npeel)
1252 {
1253   /* If dr_info is aligned if dr_peel_info is, then mark it so.  */
1254   if (vect_dr_aligned_if_peeled_dr_is (dr_info, dr_peel_info))
1255     {
1256       SET_DR_MISALIGNMENT (dr_info,
1257                            vect_dr_misalign_for_aligned_access (dr_peel_info));
1258       return;
1259     }
1260
1261   unsigned HOST_WIDE_INT alignment;
1262   if (DR_TARGET_ALIGNMENT (dr_info).is_constant (&alignment)
1263       && known_alignment_for_access_p (dr_info,
1264                                        STMT_VINFO_VECTYPE (dr_info->stmt))
1265       && known_alignment_for_access_p (dr_peel_info,
1266                                        STMT_VINFO_VECTYPE (dr_peel_info->stmt)))
1267     {
           /* Advance the recorded misalignment by NPEEL steps and reduce it
              with the mask ALIGNMENT - 1.  */
1268       int misal = dr_info->misalignment;
1269       misal += npeel * TREE_INT_CST_LOW (DR_STEP (dr_info->dr));
1270       misal &= alignment - 1;
1271       set_dr_misalignment (dr_info, misal);
1272       return;
1273     }
1274
1275   if (dump_enabled_p ())
1276     dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment " \
1277                      "to unknown (-1).\n");
1278   SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN);
1279 }
1280
1281 /* Return true if alignment is relevant for DR_INFO.  That is the case
        unless its stmt is not relevant, is a non-leading member of an access
        group, is a gather/scatter or zero-step access, or is a strided access
        that is not grouped.  */
1282
1283 static bool
1284 vect_relevant_for_alignment_p (dr_vec_info *dr_info)
1285 {
1286   stmt_vec_info stmt_info = dr_info->stmt;
1287
1288   if (!STMT_VINFO_RELEVANT_P (stmt_info))
1289     return false;
1290
1291   /* For interleaving, only the alignment of the first access matters.  */
1292   if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1293       && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1294     return false;
1295
1296   /* Scatter-gather and invariant accesses continue to address individual
1297      scalars, so vector-level alignment is irrelevant.  */
1298   if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
1299       || integer_zerop (DR_STEP (dr_info->dr)))
1300     return false;
1301
1302   /* Strided accesses perform only component accesses, alignment is
1303      irrelevant for them.  */
1304   if (STMT_VINFO_STRIDED_P (stmt_info)
1305       && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1306     return false;
1307
1308   return true;
1309 }
1310
1311 /* Given a memory reference EXP return whether its alignment is less
1312    than its size.  Also return true if the size of EXP's type does not
        pass tree_fits_uhwi_p.  */
1313
1314 static bool
1315 not_size_aligned (tree exp)
1316 {
1317   if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
1318     return true;
1319
1320   return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
1321           > get_object_alignment (exp));
1322 }
1323
1324 /* Function vector_alignment_reachable_p
1325
1326    Return true if vector alignment for DR_INFO is reachable by peeling
1327    a few loop iterations.  Return false otherwise.  */
1328
1329 static bool
1330 vector_alignment_reachable_p (dr_vec_info *dr_info)
1331 {
1332   stmt_vec_info stmt_info = dr_info->stmt;
1333   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1334
1335   if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1336     {
1337       /* For interleaved access we peel only if number of iterations in
1338          the prolog loop ({VF - misalignment}), is a multiple of the
1339          number of the interleaved accesses.  */
1340       int elem_size, mis_in_elements;
1341
1342       /* FORNOW: handle only known alignment.  */
1343       if (!known_alignment_for_access_p (dr_info, vectype))
1344         return false;
1345
1346       poly_uint64 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1347       poly_uint64 vector_size = GET_MODE_SIZE (TYPE_MODE (vectype));
1348       elem_size = vector_element_size (vector_size, nelements);
1349       mis_in_elements = dr_misalignment (dr_info, vectype) / elem_size;
1350
1351       if (!multiple_p (nelements - mis_in_elements, DR_GROUP_SIZE (stmt_info)))
1352         return false;
1353     }
1354
1355   /* If misalignment is known at the compile time then allow peeling
1356      only if natural alignment is reachable through peeling.  */
1357   if (known_alignment_for_access_p (dr_info, vectype)
1358       && !aligned_access_p (dr_info, vectype))
1359     {
1360       HOST_WIDE_INT elmsize =
1361                 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1362       if (dump_enabled_p ())
1363         {
1364           dump_printf_loc (MSG_NOTE, vect_location,
1365                            "data size = %wd. misalignment = %d.\n", elmsize,
1366                            dr_misalignment (dr_info, vectype));
1367         }
           /* A misalignment that is not a multiple of the element size is
              rejected.  */
1368       if (dr_misalignment (dr_info, vectype) % elmsize)
1369         {
1370           if (dump_enabled_p ())
1371             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1372                              "data size does not divide the misalignment.\n");
1373           return false;
1374         }
1375     }
1376
       /* With a misalignment that is not known at compile time the decision
          is left to the target hook.  */
1377   if (!known_alignment_for_access_p (dr_info, vectype))
1378     {
1379       tree type = TREE_TYPE (DR_REF (dr_info->dr));
1380       bool is_packed = not_size_aligned (DR_REF (dr_info->dr));
1381       if (dump_enabled_p ())
1382         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1383                          "Unknown misalignment, %snaturally aligned\n",
1384                          is_packed ? "not " : "");
1385       return targetm.vectorize.vector_alignment_reachable (type, is_packed);
1386     }
1387
1388   return true;
1389 }
1390
1391
1392 /* Calculate the cost of the memory access represented by DR_INFO, using
        ALIGNMENT_SUPPORT_SCHEME and MISALIGNMENT.  Loads are costed with
        vect_get_load_cost, stores with vect_get_store_cost; the store path
        only passes INSIDE_COST and BODY_COST_VEC on.  */
1393
1394 static void
1395 vect_get_data_access_cost (vec_info *vinfo, dr_vec_info *dr_info,
1396                            dr_alignment_support alignment_support_scheme,
1397                            int misalignment,
1398                            unsigned int *inside_cost,
1399                            unsigned int *outside_cost,
1400                            stmt_vector_for_cost *body_cost_vec,
1401                            stmt_vector_for_cost *prologue_cost_vec)
1402 {
1403   stmt_vec_info stmt_info = dr_info->stmt;
1404   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
1405   int ncopies;
1406
       /* Pure SLP stmts are costed as a single copy.  */
1407   if (PURE_SLP_STMT (stmt_info))
1408     ncopies = 1;
1409   else
1410     ncopies = vect_get_num_copies (loop_vinfo, STMT_VINFO_VECTYPE (stmt_info));
1411
1412   if (DR_IS_READ (dr_info->dr))
1413     vect_get_load_cost (vinfo, stmt_info, ncopies, alignment_support_scheme,
1414                         misalignment, true, inside_cost,
1415                         outside_cost, prologue_cost_vec, body_cost_vec, false);
1416   else
1417     vect_get_store_cost (vinfo,stmt_info, ncopies, alignment_support_scheme,
1418                          misalignment, inside_cost, body_cost_vec);
1419
1420   if (dump_enabled_p ())
1421     dump_printf_loc (MSG_NOTE, vect_location,
1422                      "vect_get_data_access_cost: inside_cost = %d, "
1423                      "outside_cost = %d.\n", *inside_cost, *outside_cost);
1424 }
1425
1426
1427 typedef struct _vect_peel_info
1428 {
       /* The data reference that created this entry in
          vect_peeling_hash_insert.  */
1429   dr_vec_info *dr_info;
       /* Number of peeled iterations; the only field used for hashing and
          equality by peel_info_hasher.  */
1430   int npeel;
       /* Number of data references inserted with this NPEEL, possibly biased
          by VECT_MAX_COST in vect_peeling_hash_insert.  */
1431   unsigned int count;
1432 } *vect_peel_info;
1433
1434 typedef struct _vect_peel_extended_info
1435 {
1436   vec_info *vinfo;
       /* The peeling candidate; vect_peeling_hash_get_most_frequent stores
          the best entry seen so far here.  */
1437   struct _vect_peel_info peel_info;
       /* NOTE(review): presumably the costs of the peeling candidate inside
          and outside of the loop body - confirm against the users.  */
1438   unsigned int inside_cost;
1439   unsigned int outside_cost;
1440 } *vect_peel_extended_info;
1441
1442
1443 /* Peeling hashtable helpers.  Entries are hashed and compared by their
        NPEEL field only.  */
1444
1445 struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
1446 {
1447   static inline hashval_t hash (const _vect_peel_info *);
1448   static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
1449 };
1450
1451 inline hashval_t
1452 peel_info_hasher::hash (const _vect_peel_info *peel_info)
1453 {
       /* The number of peeled iterations itself serves as the hash value.  */
1454   return (hashval_t) peel_info->npeel;
1455 }
1456
1457 inline bool
1458 peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
1459 {
       /* Two entries are the same if they peel the same number of
          iterations; the other fields are not compared.  */
1460   return (a->npeel == b->npeel);
1461 }
1462
1463
1464 /* Insert DR_INFO into peeling hash table with NPEEL as key.  If an entry
        for NPEEL already exists only its count is incremented and its
        recorded data reference is kept.  SUPPORTABLE_IF_NOT_ALIGNED tells
        whether the caller considers DR_INFO supportable when it stays
        unaligned.  */
1465
1466 static void
1467 vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
1468                           loop_vec_info loop_vinfo, dr_vec_info *dr_info,
1469                           int npeel, bool supportable_if_not_aligned)
1470 {
1471   struct _vect_peel_info elem, *slot;
1472   _vect_peel_info **new_slot;
1473
       /* Only NPEEL needs to be set in the lookup key, see
          peel_info_hasher.  */
1474   elem.npeel = npeel;
1475   slot = peeling_htab->find (&elem);
1476   if (slot)
1477     slot->count++;
1478   else
1479     {
1480       slot = XNEW (struct _vect_peel_info);
1481       slot->npeel = npeel;
1482       slot->dr_info = dr_info;
1483       slot->count = 1;
1484       new_slot = peeling_htab->find_slot (slot, INSERT);
1485       *new_slot = slot;
1486     }
1487
1488   /* If this DR is not supported with unknown misalignment then bias
1489      this slot when the cost model is disabled.  */
1490   if (!supportable_if_not_aligned
1491       && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1492     slot->count += VECT_MAX_COST;
1493 }
1494
1495
1496 /* Traverse peeling hash table to find peeling option that aligns maximum
1497    number of data accesses.  MAX is overwritten with the entry in *SLOT if
        that has a larger count, or the same count and a smaller NPEEL.
        Always returns 1 (presumably so that the traversal continues - confirm
        against the hash_table traverse contract).  */
1498
1499 int
1500 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1501                                      _vect_peel_extended_info *max)
1502 {
1503   vect_peel_info elem = *slot;
1504
1505   if (elem->count > max->peel_info.count
1506       || (elem->count == max->peel_info.count
1507           && max->peel_info.npeel > elem->npeel))
1508     {
1509       max->peel_info.npeel = elem->npeel;
1510       max->peel_info.count = elem->count;
1511       max->peel_info.dr_info = elem->dr_info;
1512     }
1513
1514   return 1;
1515 }
1516
1517 /* Get the costs of peeling NPEEL iterations for LOOP_VINFO, checking
1518    data access costs for all data refs.  If DR0_INFO's misalignment is not
1519    known at compile time, npeel is computed at runtime but DR0_INFO's
1520    misalignment will be zero after peeling.  (There is no
        UNKNOWN_MISALIGNMENT parameter; the function derives this itself
        from known_alignment_for_access_p on DR0_INFO.)

        DR0_INFO is the reference being peeled for and may be NULL, in which
        case its alignment is treated as not known.  For every reference
        that is relevant for alignment a post-peel misalignment is
        estimated and passed to vect_supportable_dr_alignment and
        vect_get_data_access_cost, together with INSIDE_COST, OUTSIDE_COST,
        BODY_COST_VEC and PROLOGUE_COST_VEC.  */
1521
1522 static void
1523 vect_get_peeling_costs_all_drs (loop_vec_info loop_vinfo,
1524                                 dr_vec_info *dr0_info,
1525                                 unsigned int *inside_cost,
1526                                 unsigned int *outside_cost,
1527                                 stmt_vector_for_cost *body_cost_vec,
1528                                 stmt_vector_for_cost *prologue_cost_vec,
1529                                 unsigned int npeel)
1530 {
1531   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1532
       /* False when DR0_INFO is NULL.  */
1533   bool dr0_alignment_known_p
1534     = (dr0_info
1535        && known_alignment_for_access_p (dr0_info,
1536                                         STMT_VINFO_VECTYPE (dr0_info->stmt)));
1537
1538   for (data_reference *dr : datarefs)
1539     {
1540       dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1541       if (!vect_relevant_for_alignment_p (dr_info))
1542         continue;
1543
1544       tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
1545       dr_alignment_support alignment_support_scheme;
1546       int misalignment;
1547       unsigned HOST_WIDE_INT alignment;
1548
           /* For a negative step, OFF is minus the size (in TYPE_SIZE_UNIT
              units) of all but one vector element; it is handed to
              dr_misalignment below.  */
1549       bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr),
1550                                             size_zero_node) < 0;
1551       poly_int64 off = 0;
1552       if (negative)
1553         off = ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
1554                * -TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))));
1555
           /* Estimate the misalignment of this ref after peeling:
              - NPEEL == 0: its current misalignment;
              - the peeled ref itself, or a ref that is aligned whenever
                the peeled ref is: 0;
              - either alignment not known at compile time, or a
                non-constant target alignment: unknown;
              - otherwise: the current misalignment advanced by NPEEL steps
                and masked with ALIGNMENT - 1.  */
1556       if (npeel == 0)
1557         misalignment = dr_misalignment (dr_info, vectype, off);
1558       else if (dr_info == dr0_info
1559                || vect_dr_aligned_if_peeled_dr_is (dr_info, dr0_info))
1560         misalignment = 0;
1561       else if (!dr0_alignment_known_p
1562                || !known_alignment_for_access_p (dr_info, vectype)
1563                || !DR_TARGET_ALIGNMENT (dr_info).is_constant (&alignment))
1564         misalignment = DR_MISALIGNMENT_UNKNOWN;
1565       else
1566         {
1567           misalignment = dr_misalignment (dr_info, vectype, off);
1568           misalignment += npeel * TREE_INT_CST_LOW (DR_STEP (dr_info->dr));
1569           misalignment &= alignment - 1;
1570         }
1571       alignment_support_scheme
1572         = vect_supportable_dr_alignment (loop_vinfo, dr_info, vectype,
1573                                          misalignment);
1574
1575       vect_get_data_access_cost (loop_vinfo, dr_info,
1576                                  alignment_support_scheme, misalignment,
1577                                  inside_cost, outside_cost,
1578                                  body_cost_vec, prologue_cost_vec);
1579     }
1580 }
1581
1582 /* Traverse peeling hash table and calculate cost for each peeling option.
1583    Find the one with the lowest cost.
        This is the per-entry callback: *SLOT is the entry being visited and
        MIN holds the cheapest option seen so far (MIN->vinfo supplies the
        loop_vec_info).  MIN is overwritten when the entry has a lower
        inside cost, or the same inside cost and a lower outside cost.
        Always returns 1 (presumably so that the traversal continues --
        confirm against hash_table::traverse).  */
1584
1585 int
1586 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1587                                    _vect_peel_extended_info *min)
1588 {
1589   vect_peel_info elem = *slot;
1590   int dummy;
1591   unsigned int inside_cost = 0, outside_cost = 0;
1592   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (min->vinfo);
1593   stmt_vector_for_cost prologue_cost_vec, body_cost_vec,
1594                        epilogue_cost_vec;
1595
       /* Scratch cost vectors; all three are released again below and only
          the scalar totals INSIDE_COST / OUTSIDE_COST are kept.  */
1596   prologue_cost_vec.create (2);
1597   body_cost_vec.create (2);
1598   epilogue_cost_vec.create (2);
1599
1600   vect_get_peeling_costs_all_drs (loop_vinfo, elem->dr_info, &inside_cost,
1601                                   &outside_cost, &body_cost_vec,
1602                                   &prologue_cost_vec, elem->npeel);
1603
1604   body_cost_vec.release ();
1605
       /* Add the value returned by vect_get_known_peeling_cost for
          ELEM->npeel iterations; DUMMY only receives an output of that
          call which is not used here.  */
1606   outside_cost += vect_get_known_peeling_cost
1607     (loop_vinfo, elem->npeel, &dummy,
1608      &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1609      &prologue_cost_vec, &epilogue_cost_vec);
1610
1611   /* Prologue and epilogue costs are added to the target model later.
1612      These costs depend only on the scalar iteration cost, the
1613      number of peeling iterations finally chosen, and the number of
1614      misaligned statements.  So discard the information found here.  */
1615   prologue_cost_vec.release ();
1616   epilogue_cost_vec.release ();
1617
       /* Inside cost is the primary key, outside cost the tie-breaker.  */
1618   if (inside_cost < min->inside_cost
1619       || (inside_cost == min->inside_cost
1620           && outside_cost < min->outside_cost))
1621     {
1622       min->inside_cost = inside_cost;
1623       min->outside_cost = outside_cost;
1624       min->peel_info.dr_info = elem->dr_info;
1625       min->peel_info.npeel = elem->npeel;
1626       min->peel_info.count = elem->count;
1627     }
1628
1629   return 1;
1630 }
1631
1632
1633 /* Choose best peeling option by traversing peeling hash table and either
1634    choosing an option with the lowest cost (if cost model is enabled) or the
1635    option that aligns as many accesses as possible.
        The result is returned by value.  Its peel_info.dr_info starts out
        as NULL and stays NULL if no traversal callback selected an entry,
        so callers can test that field to see whether an option was found.
        In the "most accesses" case both costs of the result are set
        to 0.  */
1636
1637 static struct _vect_peel_extended_info
1638 vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
1639                                        loop_vec_info loop_vinfo)
1640 {
1641    struct _vect_peel_extended_info res;
1642
1643    res.peel_info.dr_info = NULL;
1644    res.vinfo = loop_vinfo;
1645
1646    if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1647      {
            /* Cost model in use: start from the worst possible costs so
               that the traversal callback can lower them.  */
1648        res.inside_cost = INT_MAX;
1649        res.outside_cost = INT_MAX;
1650        peeling_htab->traverse <_vect_peel_extended_info *,
1651                                vect_peeling_hash_get_lowest_cost> (&res);
1652      }
1653    else
1654      {
            /* Unlimited cost model: pick by count, starting from 0.  */
1655        res.peel_info.count = 0;
1656        peeling_htab->traverse <_vect_peel_extended_info *,
1657                                vect_peeling_hash_get_most_frequent> (&res);
1658        res.inside_cost = 0;
1659        res.outside_cost = 0;
1660      }
1661
1662    return res;
1663 }
1664
1665 /* Return true if the new peeling NPEEL is supported.
        That is, return false as soon as some data reference of LOOP_VINFO
        other than DR0_INFO, relevant for alignment and not aligned
        whenever DR0_INFO is, gets dr_unaligned_unsupported from
        vect_supportable_dr_alignment for its estimated misalignment after
        peeling NPEEL iterations.  DR0_INFO is the reference being peeled
        for; it is dereferenced unconditionally and so must not be NULL.  */
1666
1667 static bool
1668 vect_peeling_supportable (loop_vec_info loop_vinfo, dr_vec_info *dr0_info,
1669                           unsigned npeel)
1670 {
1671   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1672   enum dr_alignment_support supportable_dr_alignment;
1673
1674   bool dr0_alignment_known_p
1675     = known_alignment_for_access_p (dr0_info,
1676                                     STMT_VINFO_VECTYPE (dr0_info->stmt));
1677
1678   /* Ensure that all data refs can be vectorized after the peel.  */
1679   for (data_reference *dr : datarefs)
1680     {
1681       if (dr == dr0_info->dr)
1682         continue;
1683
1684       dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1685       if (!vect_relevant_for_alignment_p (dr_info)
1686           || vect_dr_aligned_if_peeled_dr_is (dr_info, dr0_info))
1687         continue;
1688
1689       tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
1690       int misalignment;
1691       unsigned HOST_WIDE_INT alignment;
           /* The misalignment after peeling can only be computed when both
              alignments are known at compile time and the target alignment
              is a constant; otherwise it is unknown.  */
1692       if (!dr0_alignment_known_p
1693           || !known_alignment_for_access_p (dr_info, vectype)
1694           || !DR_TARGET_ALIGNMENT (dr_info).is_constant (&alignment))
1695         misalignment = DR_MISALIGNMENT_UNKNOWN;
1696       else
1697         {
               /* Current misalignment advanced by NPEEL steps and masked
                  with ALIGNMENT - 1.  */
1698           misalignment = dr_misalignment (dr_info, vectype);
1699           misalignment += npeel * TREE_INT_CST_LOW (DR_STEP (dr_info->dr));
1700           misalignment &= alignment - 1;
1701         }
1702       supportable_dr_alignment
1703         = vect_supportable_dr_alignment (loop_vinfo, dr_info, vectype,
1704                                          misalignment);
1705       if (supportable_dr_alignment == dr_unaligned_unsupported)
1706         return false;
1707     }
1708
1709   return true;
1710 }
1711
1712 /* Compare two data-references DRA and DRB to group them into chunks
1713    with related alignment.
        Sort comparator: DRA_ and DRB_ point at array elements of type
        data_reference_p.  The keys, in order, are DR_BASE_ADDRESS,
        DR_OFFSET, DR_STEP and DR_INIT, each compared with
        data_ref_compare_tree; if all of them compare equal the statement
        UIDs decide.  Returns 0 only when both elements are the same
        data reference.  */
1714
1715 static int
1716 dr_align_group_sort_cmp (const void *dra_, const void *drb_)
1717 {
1718   data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
1719   data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
1720   int cmp;
1721
1722   /* Stabilize sort.  */
1723   if (dra == drb)
1724     return 0;
1725
1726   /* Ordering of DRs according to base.  */
1727   cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
1728                                DR_BASE_ADDRESS (drb));
1729   if (cmp != 0)
1730     return cmp;
1731
1732   /* And according to DR_OFFSET.  */
1733   cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
1734   if (cmp != 0)
1735     return cmp;
1736
1737   /* And after step.  */
1738   cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
1739   if (cmp != 0)
1740     return cmp;
1741
1742   /* Then sort after DR_INIT.  In case of identical DRs sort after stmt UID.  */
1743   cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
1744   if (cmp == 0)
1745     return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
1746   return cmp;
1747 }
1748
1749 /* Function vect_enhance_data_refs_alignment
1750
1751    This pass will use loop versioning and loop peeling in order to enhance
1752    the alignment of data references in the loop.
1753
1754    FOR NOW: we assume that whatever versioning/peeling takes place, only the
1755    original loop is to be vectorized.  Any other loops that are created by
1756    the transformations performed in this pass - are not supposed to be
1757    vectorized.  This restriction will be relaxed.
1758
1759    This pass will require a cost model to guide it whether to apply peeling
1760    or versioning or a combination of the two.  For example, the scheme that
1761    Intel uses when given a loop with several memory accesses is as follows:
1762    choose one memory access ('p') whose alignment you want to force by doing
1763    peeling.  Then, either (1) generate a loop in which 'p' is aligned and all
1764    other accesses are not necessarily aligned, or (2) use loop versioning to
1765    generate one loop in which all accesses are aligned, and another loop in
1766    which only 'p' is necessarily aligned.
1767
1768    ("Automatic Intra-Register Vectorization for the Intel Architecture",
1769    Aart J.C. Bik, Milind Girkar, Paul M. Grey and Xinmin Tian, International
1770    Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1771
1772    Devising a cost model is the most critical aspect of this work.  It will
1773    guide us on which access to peel for, whether to use loop versioning, how
1774    many versions to create, etc.  The cost model will probably consist of
1775    generic considerations as well as target specific considerations (on
1776    powerpc for example, misaligned stores are more painful than misaligned
1777    loads).
1778
1779    Here are the general steps involved in alignment enhancements:
1780
1781      -- original loop, before alignment analysis:
1782         for (i=0; i<N; i++){
1783           x = q[i];                     # DR_MISALIGNMENT(q) = unknown
1784           p[i] = y;                     # DR_MISALIGNMENT(p) = unknown
1785         }
1786
1787      -- After vect_compute_data_refs_alignment:
1788         for (i=0; i<N; i++){
1789           x = q[i];                     # DR_MISALIGNMENT(q) = 3
1790           p[i] = y;                     # DR_MISALIGNMENT(p) = unknown
1791         }
1792
1793      -- Possibility 1: we do loop versioning:
1794      if (p is aligned) {
1795         for (i=0; i<N; i++){    # loop 1A
1796           x = q[i];                     # DR_MISALIGNMENT(q) = 3
1797           p[i] = y;                     # DR_MISALIGNMENT(p) = 0
1798         }
1799      }
1800      else {
1801         for (i=0; i<N; i++){    # loop 1B
1802           x = q[i];                     # DR_MISALIGNMENT(q) = 3
1803           p[i] = y;                     # DR_MISALIGNMENT(p) = unaligned
1804         }
1805      }
1806
1807      -- Possibility 2: we do loop peeling:
1808      for (i = 0; i < 3; i++){   # (scalar loop, not to be vectorized).
1809         x = q[i];
1810         p[i] = y;
1811      }
1812      for (i = 3; i < N; i++){   # loop 2A
1813         x = q[i];                       # DR_MISALIGNMENT(q) = 0
1814         p[i] = y;                       # DR_MISALIGNMENT(p) = unknown
1815      }
1816
1817      -- Possibility 3: combination of loop peeling and versioning:
1818      for (i = 0; i < 3; i++){   # (scalar loop, not to be vectorized).
1819         x = q[i];
1820         p[i] = y;
1821      }
1822      if (p is aligned) {
1823         for (i = 3; i<N; i++){  # loop 3A
1824           x = q[i];                     # DR_MISALIGNMENT(q) = 0
1825           p[i] = y;                     # DR_MISALIGNMENT(p) = 0
1826         }
1827      }
1828      else {
1829         for (i = 3; i<N; i++){  # loop 3B
1830           x = q[i];                     # DR_MISALIGNMENT(q) = 0
1831           p[i] = y;                     # DR_MISALIGNMENT(p) = unaligned
1832         }
1833      }
1834
1835      These loops are later passed to loop_transform to be vectorized.  The
1836      vectorizer will use the alignment information to guide the transformation
1837      (whether to generate regular loads/stores, or with special handling for
1838      misalignment).  */
1839
1840 opt_result
1841 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1842 {
1843   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1844   dr_vec_info *first_store = NULL;
1845   dr_vec_info *dr0_info = NULL;
1846   struct data_reference *dr;
1847   unsigned int i;
1848   bool do_peeling = false;
1849   bool do_versioning = false;
1850   unsigned int npeel = 0;
1851   bool one_misalignment_known = false;
1852   bool one_misalignment_unknown = false;
1853   bool one_dr_unsupportable = false;
1854   dr_vec_info *unsupportable_dr_info = NULL;
1855   unsigned int dr0_same_align_drs = 0, first_store_same_align_drs = 0;
1856   hash_table<peel_info_hasher> peeling_htab (1);
1857
1858   DUMP_VECT_SCOPE ("vect_enhance_data_refs_alignment");
1859
1860   /* Reset data so we can safely be called multiple times.  */
1861   LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1862   LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
1863
1864   if (LOOP_VINFO_DATAREFS (loop_vinfo).is_empty ())
1865     return opt_result::success ();
1866
1867   /* Sort the vector of datarefs so DRs that have the same or dependent
1868      alignment are next to each other.  */
1869   auto_vec<data_reference_p> datarefs
1870     = LOOP_VINFO_DATAREFS (loop_vinfo).copy ();
1871   datarefs.qsort (dr_align_group_sort_cmp);
1872
1873   /* Compute the number of DRs that become aligned when we peel
1874      a dataref so it becomes aligned.  */
1875   auto_vec<unsigned> n_same_align_refs (datarefs.length ());
1876   n_same_align_refs.quick_grow_cleared (datarefs.length ());
1877   unsigned i0;
1878   for (i0 = 0; i0 < datarefs.length (); ++i0)
1879     if (DR_BASE_ADDRESS (datarefs[i0]))
1880       break;
1881   for (i = i0 + 1; i <= datarefs.length (); ++i)
1882     {
1883       if (i == datarefs.length ()
1884           || !operand_equal_p (DR_BASE_ADDRESS (datarefs[i0]),
1885                                DR_BASE_ADDRESS (datarefs[i]), 0)
1886           || !operand_equal_p (DR_OFFSET (datarefs[i0]),
1887                                DR_OFFSET (datarefs[i]), 0)
1888           || !operand_equal_p (DR_STEP (datarefs[i0]),
1889                                DR_STEP (datarefs[i]), 0))
1890         {
1891           /* The subgroup [i0, i-1] now only differs in DR_INIT and
1892              possibly DR_TARGET_ALIGNMENT.  Still the whole subgroup
1893              will get known misalignment if we align one of the refs
1894              with the largest DR_TARGET_ALIGNMENT.  */
1895           for (unsigned j = i0; j < i; ++j)
1896             {
1897               dr_vec_info *dr_infoj = loop_vinfo->lookup_dr (datarefs[j]);
1898               for (unsigned k = i0; k < i; ++k)
1899                 {
1900                   if (k == j)
1901                     continue;
1902                   dr_vec_info *dr_infok = loop_vinfo->lookup_dr (datarefs[k]);
1903                   if (vect_dr_aligned_if_related_peeled_dr_is (dr_infok,
1904                                                                dr_infoj))
1905                     n_same_align_refs[j]++;
1906                 }
1907             }
1908           i0 = i;
1909         }
1910     }
1911
1912   /* While cost model enhancements are expected in the future, the high level
1913      view of the code at this time is as follows:
1914
1915      A) If there is a misaligned access then see if peeling to align
1916         this access can make all data references satisfy
1917         vect_supportable_dr_alignment.  If so, update data structures
1918         as needed and return true.
1919
1920      B) If peeling wasn't possible and there is a data reference with an
1921         unknown misalignment that does not satisfy vect_supportable_dr_alignment
1922         then see if loop versioning checks can be used to make all data
1923         references satisfy vect_supportable_dr_alignment.  If so, update
1924         data structures as needed and return true.
1925
1926      C) If neither peeling nor versioning were successful then return false if
1927         any data reference does not satisfy vect_supportable_dr_alignment.
1928
1929      D) Return true (all data references satisfy vect_supportable_dr_alignment).
1930
1931      Note, Possibility 3 above (which is peeling and versioning together) is not
1932      being done at this time.  */
1933
1934   /* (1) Peeling to force alignment.  */
1935
1936   /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1937      Considerations:
1938      + How many accesses will become aligned due to the peeling
1939      - How many accesses will become unaligned due to the peeling,
1940        and the cost of misaligned accesses.
1941      - The cost of peeling (the extra runtime checks, the increase
1942        in code size).  */
1943
1944   FOR_EACH_VEC_ELT (datarefs, i, dr)
1945     {
1946       dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1947       if (!vect_relevant_for_alignment_p (dr_info))
1948         continue;
1949
1950       stmt_vec_info stmt_info = dr_info->stmt;
1951       tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1952       do_peeling = vector_alignment_reachable_p (dr_info);
1953       if (do_peeling)
1954         {
1955           if (known_alignment_for_access_p (dr_info, vectype))
1956             {
1957               unsigned int npeel_tmp = 0;
1958               bool negative = tree_int_cst_compare (DR_STEP (dr),
1959                                                     size_zero_node) < 0;
1960
1961               /* If known_alignment_for_access_p then we have set
1962                  DR_MISALIGNMENT which is only done if we know it at compiler
1963                  time, so it is safe to assume target alignment is constant.
1964                */
1965               unsigned int target_align =
1966                 DR_TARGET_ALIGNMENT (dr_info).to_constant ();
1967               unsigned HOST_WIDE_INT dr_size = vect_get_scalar_dr_size (dr_info);
1968               poly_int64 off = 0;
1969               if (negative)
1970                 off = (TYPE_VECTOR_SUBPARTS (vectype) - 1) * -dr_size;
1971               unsigned int mis = dr_misalignment (dr_info, vectype, off);
1972               mis = negative ? mis : -mis;
1973               if (mis != 0)
1974                 npeel_tmp = (mis & (target_align - 1)) / dr_size;
1975
1976               /* For multiple types, it is possible that the bigger type access
1977                  will have more than one peeling option.  E.g., a loop with two
1978                  types: one of size (vector size / 4), and the other one of
1979                  size (vector size / 8).  Vectorization factor will be 8.  If both
1980                  accesses are misaligned by 3, the first one needs one scalar
1981                  iteration to be aligned, and the second one needs 5.  But the
1982                  first one will be aligned also by peeling 5 scalar
1983                  iterations, and in that case both accesses will be aligned.
1984                  Hence, except for the immediate peeling amount, we also want
1985                  to try to add full vector size, while we don't exceed
1986                  vectorization factor.
1987                  We do this automatically for cost model, since we calculate
1988                  cost for every peeling option.  */
1989               poly_uint64 nscalars = npeel_tmp;
1990               if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1991                 {
1992                   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1993                   nscalars = (STMT_SLP_TYPE (stmt_info)
1994                               ? vf * DR_GROUP_SIZE (stmt_info) : vf);
1995                 }
1996
1997               /* Save info about DR in the hash table.  Also include peeling
1998                  amounts according to the explanation above.  Indicate
1999                  the alignment status when the ref is not aligned.
2000                  ???  Rather than using unknown alignment here we should
2001                  prune all entries from the peeling hashtable which cause
2002                  DRs to be not supported.  */
2003               bool supportable_if_not_aligned
2004                 = vect_supportable_dr_alignment
2005                     (loop_vinfo, dr_info, vectype, DR_MISALIGNMENT_UNKNOWN);
2006               while (known_le (npeel_tmp, nscalars))
2007                 {
2008                   vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
2009                                             dr_info, npeel_tmp,
2010                                             supportable_if_not_aligned);
2011                   npeel_tmp += MAX (1, target_align / dr_size);
2012                 }
2013
2014               one_misalignment_known = true;
2015             }
2016           else
2017             {
2018               /* If we don't know any misalignment values, we prefer
2019                  peeling for data-ref that has the maximum number of data-refs
2020                  with the same alignment, unless the target prefers to align
2021                  stores over load.  */
2022               unsigned same_align_drs = n_same_align_refs[i];
2023               if (!dr0_info
2024                   || dr0_same_align_drs < same_align_drs)
2025                 {
2026                   dr0_same_align_drs = same_align_drs;
2027                   dr0_info = dr_info;
2028                 }
2029               /* For data-refs with the same number of related
2030                  accesses prefer the one where the misalign
2031                  computation will be invariant in the outermost loop.  */
2032               else if (dr0_same_align_drs == same_align_drs)
2033                 {
2034                   class loop *ivloop0, *ivloop;
2035                   ivloop0 = outermost_invariant_loop_for_expr
2036                     (loop, DR_BASE_ADDRESS (dr0_info->dr));
2037                   ivloop = outermost_invariant_loop_for_expr
2038                     (loop, DR_BASE_ADDRESS (dr));
2039                   if ((ivloop && !ivloop0)
2040                       || (ivloop && ivloop0
2041                           && flow_loop_nested_p (ivloop, ivloop0)))
2042                     dr0_info = dr_info;
2043                 }
2044
2045               one_misalignment_unknown = true;
2046
2047               /* Check for data refs with unsupportable alignment that
2048                  can be peeled.  */
2049               enum dr_alignment_support supportable_dr_alignment
2050                 = vect_supportable_dr_alignment (loop_vinfo, dr_info, vectype,
2051                                                  DR_MISALIGNMENT_UNKNOWN);
2052               if (supportable_dr_alignment == dr_unaligned_unsupported)
2053                 {
2054                   one_dr_unsupportable = true;
2055                   unsupportable_dr_info = dr_info;
2056                 }
2057
2058               if (!first_store && DR_IS_WRITE (dr))
2059                 {
2060                   first_store = dr_info;
2061                   first_store_same_align_drs = same_align_drs;
2062                 }
2063             }
2064         }
2065       else
2066         {
2067           if (!aligned_access_p (dr_info, vectype))
2068             {
2069               if (dump_enabled_p ())
2070                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2071                                  "vector alignment may not be reachable\n");
2072               break;
2073             }
2074         }
2075     }
2076
2077   /* Check if we can possibly peel the loop.  */
2078   if (!vect_can_advance_ivs_p (loop_vinfo)
2079       || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))
2080       || loop->inner)
2081     do_peeling = false;
2082
2083   struct _vect_peel_extended_info peel_for_known_alignment;
2084   struct _vect_peel_extended_info peel_for_unknown_alignment;
2085   struct _vect_peel_extended_info best_peel;
2086
2087   peel_for_unknown_alignment.inside_cost = INT_MAX;
2088   peel_for_unknown_alignment.outside_cost = INT_MAX;
2089   peel_for_unknown_alignment.peel_info.count = 0;
2090
2091   if (do_peeling
2092       && one_misalignment_unknown)
2093     {
2094       /* Check if the target requires to prefer stores over loads, i.e., if
2095          misaligned stores are more expensive than misaligned loads (taking
2096          drs with same alignment into account).  */
2097       unsigned int load_inside_cost = 0;
2098       unsigned int load_outside_cost = 0;
2099       unsigned int store_inside_cost = 0;
2100       unsigned int store_outside_cost = 0;
2101       unsigned int estimated_npeels = vect_vf_for_cost (loop_vinfo) / 2;
2102
2103       stmt_vector_for_cost dummy;
2104       dummy.create (2);
2105       vect_get_peeling_costs_all_drs (loop_vinfo, dr0_info,
2106                                       &load_inside_cost,
2107                                       &load_outside_cost,
2108                                       &dummy, &dummy, estimated_npeels);
2109       dummy.release ();
2110
2111       if (first_store)
2112         {
2113           dummy.create (2);
2114           vect_get_peeling_costs_all_drs (loop_vinfo, first_store,
2115                                           &store_inside_cost,
2116                                           &store_outside_cost,
2117                                           &dummy, &dummy,
2118                                           estimated_npeels);
2119           dummy.release ();
2120         }
2121       else
2122         {
2123           store_inside_cost = INT_MAX;
2124           store_outside_cost = INT_MAX;
2125         }
2126
2127       if (load_inside_cost > store_inside_cost
2128           || (load_inside_cost == store_inside_cost
2129               && load_outside_cost > store_outside_cost))
2130         {
2131           dr0_info = first_store;
2132           dr0_same_align_drs = first_store_same_align_drs;
2133           peel_for_unknown_alignment.inside_cost = store_inside_cost;
2134           peel_for_unknown_alignment.outside_cost = store_outside_cost;
2135         }
2136       else
2137         {
2138           peel_for_unknown_alignment.inside_cost = load_inside_cost;
2139           peel_for_unknown_alignment.outside_cost = load_outside_cost;
2140         }
2141
2142       stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
2143       prologue_cost_vec.create (2);
2144       epilogue_cost_vec.create (2);
2145
2146       int dummy2;
2147       peel_for_unknown_alignment.outside_cost += vect_get_known_peeling_cost
2148         (loop_vinfo, estimated_npeels, &dummy2,
2149          &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
2150          &prologue_cost_vec, &epilogue_cost_vec);
2151
2152       prologue_cost_vec.release ();
2153       epilogue_cost_vec.release ();
2154
2155       peel_for_unknown_alignment.peel_info.count = dr0_same_align_drs + 1;
2156     }
2157
2158   peel_for_unknown_alignment.peel_info.npeel = 0;
2159   peel_for_unknown_alignment.peel_info.dr_info = dr0_info;
2160
2161   best_peel = peel_for_unknown_alignment;
2162
2163   peel_for_known_alignment.inside_cost = INT_MAX;
2164   peel_for_known_alignment.outside_cost = INT_MAX;
2165   peel_for_known_alignment.peel_info.count = 0;
2166   peel_for_known_alignment.peel_info.dr_info = NULL;
2167
2168   if (do_peeling && one_misalignment_known)
2169     {
2170       /* Peeling is possible, but there is no data access that is not supported
2171          unless aligned.  So we try to choose the best possible peeling from
2172          the hash table.  */
2173       peel_for_known_alignment = vect_peeling_hash_choose_best_peeling
2174         (&peeling_htab, loop_vinfo);
2175     }
2176
2177   /* Compare costs of peeling for known and unknown alignment. */
2178   if (peel_for_known_alignment.peel_info.dr_info != NULL
2179       && peel_for_unknown_alignment.inside_cost
2180       >= peel_for_known_alignment.inside_cost)
2181     {
2182       best_peel = peel_for_known_alignment;
2183
2184       /* If the best peeling for known alignment has NPEEL == 0, perform no
2185          peeling at all except if there is an unsupportable dr that we can
2186          align.  */
2187       if (best_peel.peel_info.npeel == 0 && !one_dr_unsupportable)
2188         do_peeling = false;
2189     }
2190
2191   /* If there is an unsupportable data ref, prefer this over all choices so far
2192      since we'd have to discard a chosen peeling except when it accidentally
2193      aligned the unsupportable data ref.  */
2194   if (one_dr_unsupportable)
2195     dr0_info = unsupportable_dr_info;
2196   else if (do_peeling)
2197     {
2198       /* Calculate the penalty for no peeling, i.e. leaving everything as-is.
2199          TODO: Use nopeel_outside_cost or get rid of it?  */
2200       unsigned nopeel_inside_cost = 0;
2201       unsigned nopeel_outside_cost = 0;
2202
2203       stmt_vector_for_cost dummy;
2204       dummy.create (2);
2205       vect_get_peeling_costs_all_drs (loop_vinfo, NULL, &nopeel_inside_cost,
2206                                       &nopeel_outside_cost, &dummy, &dummy, 0);
2207       dummy.release ();
2208
2209       /* Add epilogue costs.  As we do not peel for alignment here, no prologue
2210          costs will be recorded.  */
2211       stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
2212       prologue_cost_vec.create (2);
2213       epilogue_cost_vec.create (2);
2214
2215       int dummy2;
2216       nopeel_outside_cost += vect_get_known_peeling_cost
2217         (loop_vinfo, 0, &dummy2,
2218          &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
2219          &prologue_cost_vec, &epilogue_cost_vec);
2220
2221       prologue_cost_vec.release ();
2222       epilogue_cost_vec.release ();
2223
2224       npeel = best_peel.peel_info.npeel;
2225       dr0_info = best_peel.peel_info.dr_info;
2226
2227       /* If no peeling is not more expensive than the best peeling we
2228          have so far, don't perform any peeling.  */
2229       if (nopeel_inside_cost <= best_peel.inside_cost)
2230         do_peeling = false;
2231     }
2232
2233   if (do_peeling)
2234     {
2235       stmt_vec_info stmt_info = dr0_info->stmt;
2236       if (known_alignment_for_access_p (dr0_info,
2237                                         STMT_VINFO_VECTYPE (stmt_info)))
2238         {
2239           bool negative = tree_int_cst_compare (DR_STEP (dr0_info->dr),
2240                                                 size_zero_node) < 0;
2241           if (!npeel)
2242             {
2243               /* Since it's known at compile time, compute the number of
2244                  iterations in the peeled loop (the peeling factor) for use in
2245                  updating DR_MISALIGNMENT values.  The peeling factor is the
2246                  vectorization factor minus the misalignment as an element
2247                  count.  */
2248               tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2249               poly_int64 off = 0;
2250               if (negative)
2251                 off = ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
2252                        * -TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))));
2253               unsigned int mis
2254                 = dr_misalignment (dr0_info, vectype, off);
2255               mis = negative ? mis : -mis;
2256               /* If known_alignment_for_access_p then we have set
2257                  DR_MISALIGNMENT which is only done if we know it at compiler
2258                  time, so it is safe to assume target alignment is constant.
2259                */
2260               unsigned int target_align =
2261                 DR_TARGET_ALIGNMENT (dr0_info).to_constant ();
2262               npeel = ((mis & (target_align - 1))
2263                        / vect_get_scalar_dr_size (dr0_info));
2264             }
2265
2266           /* For interleaved data access every iteration accesses all the
2267              members of the group, therefore we divide the number of iterations
2268              by the group size.  */
2269           if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2270             npeel /= DR_GROUP_SIZE (stmt_info);
2271
2272           if (dump_enabled_p ())
2273             dump_printf_loc (MSG_NOTE, vect_location,
2274                              "Try peeling by %d\n", npeel);
2275         }
2276
2277       /* Ensure that all datarefs can be vectorized after the peel.  */
2278       if (!vect_peeling_supportable (loop_vinfo, dr0_info, npeel))
2279         do_peeling = false;
2280
2281       /* Check if all datarefs are supportable and log.  */
2282       if (do_peeling
2283           && npeel == 0
2284           && known_alignment_for_access_p (dr0_info,
2285                                            STMT_VINFO_VECTYPE (stmt_info)))
2286         return opt_result::success ();
2287
2288       /* Cost model #1 - honor --param vect-max-peeling-for-alignment.  */
2289       if (do_peeling)
2290         {
2291           unsigned max_allowed_peel
2292             = param_vect_max_peeling_for_alignment;
2293           if (loop_cost_model (loop) <= VECT_COST_MODEL_CHEAP)
2294             max_allowed_peel = 0;
2295           if (max_allowed_peel != (unsigned)-1)
2296             {
2297               unsigned max_peel = npeel;
2298               if (max_peel == 0)
2299                 {
2300                   poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr0_info);
2301                   unsigned HOST_WIDE_INT target_align_c;
2302                   if (target_align.is_constant (&target_align_c))
2303                     max_peel =
2304                       target_align_c / vect_get_scalar_dr_size (dr0_info) - 1;
2305                   else
2306                     {
2307                       do_peeling = false;
2308                       if (dump_enabled_p ())
2309                         dump_printf_loc (MSG_NOTE, vect_location,
2310                           "Disable peeling, max peels set and vector"
2311                           " alignment unknown\n");
2312                     }
2313                 }
2314               if (max_peel > max_allowed_peel)
2315                 {
2316                   do_peeling = false;
2317                   if (dump_enabled_p ())
2318                     dump_printf_loc (MSG_NOTE, vect_location,
2319                         "Disable peeling, max peels reached: %d\n", max_peel);
2320                 }
2321             }
2322         }
2323
2324       /* Cost model #2 - if peeling may result in a remaining loop not
2325          iterating enough to be vectorized then do not peel.  Since this
2326          is a cost heuristic rather than a correctness decision, use the
2327          most likely runtime value for variable vectorization factors.  */
2328       if (do_peeling
2329           && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2330         {
2331           unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2332           unsigned int max_peel = npeel == 0 ? assumed_vf - 1 : npeel;
2333           if ((unsigned HOST_WIDE_INT) LOOP_VINFO_INT_NITERS (loop_vinfo)
2334               < assumed_vf + max_peel)
2335             do_peeling = false;
2336         }
2337
2338       if (do_peeling)
2339         {
2340           /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
2341              If the misalignment of DR_i is identical to that of dr0 then set
2342              DR_MISALIGNMENT (DR_i) to zero.  If the misalignment of DR_i and
2343              dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
2344              by the peeling factor times the element size of DR_i (MOD the
2345              vectorization factor times the size).  Otherwise, the
2346              misalignment of DR_i must be set to unknown.  */
2347           FOR_EACH_VEC_ELT (datarefs, i, dr)
2348             if (dr != dr0_info->dr)
2349               {
2350                 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2351                 if (!vect_relevant_for_alignment_p (dr_info))
2352                   continue;
2353
2354                 vect_update_misalignment_for_peel (dr_info, dr0_info, npeel);
2355               }
2356
2357           LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0_info;
2358           if (npeel)
2359             LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
2360           else
2361             LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = -1;
2362           SET_DR_MISALIGNMENT (dr0_info,
2363                                vect_dr_misalign_for_aligned_access (dr0_info));
2364           if (dump_enabled_p ())
2365             {
2366               dump_printf_loc (MSG_NOTE, vect_location,
2367                                "Alignment of access forced using peeling.\n");
2368               dump_printf_loc (MSG_NOTE, vect_location,
2369                                "Peeling for alignment will be applied.\n");
2370             }
2371
2372           /* The inside-loop cost will be accounted for in vectorizable_load
2373              and vectorizable_store correctly with adjusted alignments.
2374              Drop the body_cst_vec on the floor here.  */
2375           return opt_result::success ();
2376         }
2377     }
2378
2379   /* (2) Versioning to force alignment.  */
2380
2381   /* Try versioning if:
2382      1) optimize loop for speed and the cost-model is not cheap
2383      2) there is at least one unsupported misaligned data ref with an unknown
2384         misalignment, and
2385      3) all misaligned data refs with a known misalignment are supported, and
2386      4) the number of runtime alignment checks is within reason.  */
2387
2388   do_versioning
2389     = (optimize_loop_nest_for_speed_p (loop)
2390        && !loop->inner /* FORNOW */
2391        && loop_cost_model (loop) > VECT_COST_MODEL_CHEAP);
2392
2393   if (do_versioning)
2394     {
2395       FOR_EACH_VEC_ELT (datarefs, i, dr)
2396         {
2397           dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2398           if (!vect_relevant_for_alignment_p (dr_info))
2399             continue;
2400
2401           stmt_vec_info stmt_info = dr_info->stmt;
2402           if (STMT_VINFO_STRIDED_P (stmt_info))
2403             {
2404               do_versioning = false;
2405               break;
2406             }
2407
2408           tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2409           bool negative = tree_int_cst_compare (DR_STEP (dr),
2410                                                 size_zero_node) < 0;
2411           poly_int64 off = 0;
2412           if (negative)
2413             off = ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
2414                    * -TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))));
2415           int misalignment;
2416           if ((misalignment = dr_misalignment (dr_info, vectype, off)) == 0)
2417             continue;
2418
2419           enum dr_alignment_support supportable_dr_alignment
2420             = vect_supportable_dr_alignment (loop_vinfo, dr_info, vectype,
2421                                              misalignment);
2422           if (supportable_dr_alignment == dr_unaligned_unsupported)
2423             {
2424               if (misalignment != DR_MISALIGNMENT_UNKNOWN
2425                   || (LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
2426                       >= (unsigned) param_vect_max_version_for_alignment_checks))
2427                 {
2428                   do_versioning = false;
2429                   break;
2430                 }
2431
2432               /* At present we don't support versioning for alignment
2433                  with variable VF, since there's no guarantee that the
2434                  VF is a power of two.  We could relax this if we added
2435                  a way of enforcing a power-of-two size.  */
2436               unsigned HOST_WIDE_INT size;
2437               if (!GET_MODE_SIZE (TYPE_MODE (vectype)).is_constant (&size))
2438                 {
2439                   do_versioning = false;
2440                   break;
2441                 }
2442
2443               /* Forcing alignment in the first iteration is no good if
2444                  we don't keep it across iterations.  For now, just disable
2445                  versioning in this case.
2446                  ?? We could actually unroll the loop to achieve the required
2447                  overall step alignment, and forcing the alignment could be
2448                  done by doing some iterations of the non-vectorized loop.  */
2449               if (!multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
2450                                * DR_STEP_ALIGNMENT (dr),
2451                                DR_TARGET_ALIGNMENT (dr_info)))
2452                 {
2453                   do_versioning = false;
2454                   break;
2455                 }
2456
2457               /* The rightmost bits of an aligned address must be zeros.
2458                  Construct the mask needed for this test.  For example,
2459                  GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
2460                  mask must be 15 = 0xf. */
2461               int mask = size - 1;
2462
2463               /* FORNOW: use the same mask to test all potentially unaligned
2464                  references in the loop.  */
2465               if (LOOP_VINFO_PTR_MASK (loop_vinfo)
2466                   && LOOP_VINFO_PTR_MASK (loop_vinfo) != mask)
2467                 {
2468                   do_versioning = false;
2469                   break;
2470                 }
2471
2472               LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
2473               LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (stmt_info);
2474             }
2475         }
2476
2477       /* Versioning requires at least one misaligned data reference.  */
2478       if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2479         do_versioning = false;
2480       else if (!do_versioning)
2481         LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
2482     }
2483
2484   if (do_versioning)
2485     {
2486       const vec<stmt_vec_info> &may_misalign_stmts
2487         = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
2488       stmt_vec_info stmt_info;
2489
2490       /* It can now be assumed that the data references in the statements
2491          in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
2492          of the loop being vectorized.  */
2493       FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt_info)
2494         {
2495           dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
2496           SET_DR_MISALIGNMENT (dr_info,
2497                                vect_dr_misalign_for_aligned_access (dr_info));
2498           if (dump_enabled_p ())
2499             dump_printf_loc (MSG_NOTE, vect_location,
2500                              "Alignment of access forced using versioning.\n");
2501         }
2502
2503       if (dump_enabled_p ())
2504         dump_printf_loc (MSG_NOTE, vect_location,
2505                          "Versioning for alignment will be applied.\n");
2506
2507       /* Peeling and versioning can't be done together at this time.  */
2508       gcc_assert (! (do_peeling && do_versioning));
2509
2510       return opt_result::success ();
2511     }
2512
2513   /* This point is reached if neither peeling nor versioning is being done.  */
2514   gcc_assert (! (do_peeling || do_versioning));
2515
2516   return opt_result::success ();
2517 }
2518
2519
2520 /* Function vect_analyze_data_refs_alignment
2521
2522    Analyze the alignment of the data-references in the loop.
2523    Return FALSE if a data reference is found that cannot be vectorized.  */
2524
2525 opt_result
2526 vect_analyze_data_refs_alignment (loop_vec_info loop_vinfo)
2527 {
2528   DUMP_VECT_SCOPE ("vect_analyze_data_refs_alignment");
2529
2530   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2531   struct data_reference *dr;
2532   unsigned int i;
2533
2534   vect_record_base_alignments (loop_vinfo);
2535   FOR_EACH_VEC_ELT (datarefs, i, dr)
2536     {
2537       dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2538       if (STMT_VINFO_VECTORIZABLE (dr_info->stmt))
2539         {
2540           if (STMT_VINFO_GROUPED_ACCESS (dr_info->stmt)
2541               && DR_GROUP_FIRST_ELEMENT (dr_info->stmt) != dr_info->stmt)
2542             continue;
2543           vect_compute_data_ref_alignment (loop_vinfo, dr_info,
2544                                            STMT_VINFO_VECTYPE (dr_info->stmt));
2545         }
2546     }
2547
2548   return opt_result::success ();
2549 }
2550
2551
2552 /* Analyze alignment of DRs of stmts in NODE.  */
2553
2554 static bool
2555 vect_slp_analyze_node_alignment (vec_info *vinfo, slp_tree node)
2556 {
2557   /* Alignment is maintained in the first element of the group.  */
2558   stmt_vec_info first_stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2559   first_stmt_info = DR_GROUP_FIRST_ELEMENT (first_stmt_info);
2560   dr_vec_info *dr_info = STMT_VINFO_DR_INFO (first_stmt_info);
2561   tree vectype = SLP_TREE_VECTYPE (node);
2562   poly_uint64 vector_alignment
2563     = exact_div (targetm.vectorize.preferred_vector_alignment (vectype),
2564                  BITS_PER_UNIT);
2565   if (dr_info->misalignment == DR_MISALIGNMENT_UNINITIALIZED)
2566     vect_compute_data_ref_alignment (vinfo, dr_info, SLP_TREE_VECTYPE (node));
2567   /* Re-analyze alignment when we're facing a vectorization with a bigger
2568      alignment requirement.  */
2569   else if (known_lt (dr_info->target_alignment, vector_alignment))
2570     {
2571       poly_uint64 old_target_alignment = dr_info->target_alignment;
2572       int old_misalignment = dr_info->misalignment;
2573       vect_compute_data_ref_alignment (vinfo, dr_info, SLP_TREE_VECTYPE (node));
2574       /* But keep knowledge about a smaller alignment.  */
2575       if (old_misalignment != DR_MISALIGNMENT_UNKNOWN
2576           && dr_info->misalignment == DR_MISALIGNMENT_UNKNOWN)
2577         {
2578           dr_info->target_alignment = old_target_alignment;
2579           dr_info->misalignment = old_misalignment;
2580         }
2581     }
2582   /* When we ever face unordered target alignments the first one wins in terms
2583      of analyzing and the other will become unknown in dr_misalignment.  */
2584   return true;
2585 }
2586
2587 /* Function vect_slp_analyze_instance_alignment
2588
2589    Analyze the alignment of the data-references in the SLP instance.
2590    Return FALSE if a data reference is found that cannot be vectorized.  */
2591
2592 bool
2593 vect_slp_analyze_instance_alignment (vec_info *vinfo,
2594                                                 slp_instance instance)
2595 {
2596   DUMP_VECT_SCOPE ("vect_slp_analyze_instance_alignment");
2597
2598   slp_tree node;
2599   unsigned i;
2600   FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
2601     if (! vect_slp_analyze_node_alignment (vinfo, node))
2602       return false;
2603
2604   if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_store
2605       && ! vect_slp_analyze_node_alignment
2606              (vinfo, SLP_INSTANCE_TREE (instance)))
2607     return false;
2608
2609   return true;
2610 }
2611
2612
2613 /* Analyze groups of accesses: check that DR_INFO belongs to a group of
2614    accesses of legal size, step, etc.  Detect gaps, single element
2615    interleaving, and other special cases. Set grouped access info.
2616    Collect groups of strided stores for further use in SLP analysis.
2617    Worker for vect_analyze_group_access.  */
2618
2619 static bool
2620 vect_analyze_group_access_1 (vec_info *vinfo, dr_vec_info *dr_info)
2621 {
2622   data_reference *dr = dr_info->dr;
2623   tree step = DR_STEP (dr);
2624   tree scalar_type = TREE_TYPE (DR_REF (dr));
2625   HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2626   stmt_vec_info stmt_info = dr_info->stmt;
2627   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
2628   bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo);
2629   HOST_WIDE_INT dr_step = -1;
2630   HOST_WIDE_INT groupsize, last_accessed_element = 1;
2631   bool slp_impossible = false;
2632
2633   /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2634      size of the interleaving group (including gaps).  */
2635   if (tree_fits_shwi_p (step))
2636     {
2637       dr_step = tree_to_shwi (step);
2638       /* Check that STEP is a multiple of type size.  Otherwise there is
2639          a non-element-sized gap at the end of the group which we
2640          cannot represent in DR_GROUP_GAP or DR_GROUP_SIZE.
2641          ???  As we can handle non-constant step fine here we should
2642          simply remove uses of DR_GROUP_GAP between the last and first
2643          element and instead rely on DR_STEP.  DR_GROUP_SIZE then would
2644          simply not include that gap.  */
2645       if ((dr_step % type_size) != 0)
2646         {
2647           if (dump_enabled_p ())
2648             dump_printf_loc (MSG_NOTE, vect_location,
2649                              "Step %T is not a multiple of the element size"
2650                              " for %T\n",
2651                              step, DR_REF (dr));
2652           return false;
2653         }
2654       groupsize = absu_hwi (dr_step) / type_size;
2655     }
2656   else
2657     groupsize = 0;
2658
2659   /* Not consecutive access is possible only if it is a part of interleaving.  */
2660   if (!DR_GROUP_FIRST_ELEMENT (stmt_info))
2661     {
2662       /* Check if it this DR is a part of interleaving, and is a single
2663          element of the group that is accessed in the loop.  */
2664
2665       /* Gaps are supported only for loads. STEP must be a multiple of the type
2666          size.  */
2667       if (DR_IS_READ (dr)
2668           && (dr_step % type_size) == 0
2669           && groupsize > 0
2670           /* This could be UINT_MAX but as we are generating code in a very
2671              inefficient way we have to cap earlier.
2672              See PR91403 for example.  */
2673           && groupsize <= 4096)
2674         {
2675           DR_GROUP_FIRST_ELEMENT (stmt_info) = stmt_info;
2676           DR_GROUP_SIZE (stmt_info) = groupsize;
2677           DR_GROUP_GAP (stmt_info) = groupsize - 1;
2678           if (dump_enabled_p ())
2679             dump_printf_loc (MSG_NOTE, vect_location,
2680                              "Detected single element interleaving %T"
2681                              " step %T\n",
2682                              DR_REF (dr), step);
2683
2684           return true;
2685         }
2686
2687       if (dump_enabled_p ())
2688         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2689                          "not consecutive access %G", stmt_info->stmt);
2690
2691       if (bb_vinfo)
2692         {
2693           /* Mark the statement as unvectorizable.  */
2694           STMT_VINFO_VECTORIZABLE (stmt_info) = false;
2695           return true;
2696         }
2697
2698       if (dump_enabled_p ())
2699         dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
2700       STMT_VINFO_STRIDED_P (stmt_info) = true;
2701       return true;
2702     }
2703
2704   if (DR_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info)
2705     {
2706       /* First stmt in the interleaving chain. Check the chain.  */
2707       stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2708       struct data_reference *data_ref = dr;
2709       unsigned int count = 1;
2710       tree prev_init = DR_INIT (data_ref);
2711       HOST_WIDE_INT diff, gaps = 0;
2712
2713       /* By construction, all group members have INTEGER_CST DR_INITs.  */
2714       while (next)
2715         {
2716           /* We never have the same DR multiple times.  */
2717           gcc_assert (tree_int_cst_compare (DR_INIT (data_ref),
2718                                 DR_INIT (STMT_VINFO_DATA_REF (next))) != 0);
2719
2720           data_ref = STMT_VINFO_DATA_REF (next);
2721
2722           /* All group members have the same STEP by construction.  */
2723           gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2724
2725           /* Check that the distance between two accesses is equal to the type
2726              size. Otherwise, we have gaps.  */
2727           diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2728                   - TREE_INT_CST_LOW (prev_init)) / type_size;
2729           if (diff != 1)
2730             {
2731               /* FORNOW: SLP of accesses with gaps is not supported.  */
2732               slp_impossible = true;
2733               if (DR_IS_WRITE (data_ref))
2734                 {
2735                   if (dump_enabled_p ())
2736                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2737                                      "interleaved store with gaps\n");
2738                   return false;
2739                 }
2740
2741               gaps += diff - 1;
2742             }
2743
2744           last_accessed_element += diff;
2745
2746           /* Store the gap from the previous member of the group. If there is no
2747              gap in the access, DR_GROUP_GAP is always 1.  */
2748           DR_GROUP_GAP (next) = diff;
2749
2750           prev_init = DR_INIT (data_ref);
2751           next = DR_GROUP_NEXT_ELEMENT (next);
2752           /* Count the number of data-refs in the chain.  */
2753           count++;
2754         }
2755
2756       if (groupsize == 0)
2757         groupsize = count + gaps;
2758
2759       /* This could be UINT_MAX but as we are generating code in a very
2760          inefficient way we have to cap earlier.  See PR78699 for example.  */
2761       if (groupsize > 4096)
2762         {
2763           if (dump_enabled_p ())
2764             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2765                              "group is too large\n");
2766           return false;
2767         }
2768
2769       /* Check that the size of the interleaving is equal to count for stores,
2770          i.e., that there are no gaps.  */
2771       if (groupsize != count
2772           && !DR_IS_READ (dr))
2773         {
2774           groupsize = count;
2775           STMT_VINFO_STRIDED_P (stmt_info) = true;
2776         }
2777
2778       /* If there is a gap after the last load in the group it is the
2779          difference between the groupsize and the last accessed
2780          element.
2781          When there is no gap, this difference should be 0.  */
2782       DR_GROUP_GAP (stmt_info) = groupsize - last_accessed_element;
2783
2784       DR_GROUP_SIZE (stmt_info) = groupsize;
2785       if (dump_enabled_p ())
2786         {
2787           dump_printf_loc (MSG_NOTE, vect_location,
2788                            "Detected interleaving ");
2789           if (DR_IS_READ (dr))
2790             dump_printf (MSG_NOTE, "load ");
2791           else if (STMT_VINFO_STRIDED_P (stmt_info))
2792             dump_printf (MSG_NOTE, "strided store ");
2793           else
2794             dump_printf (MSG_NOTE, "store ");
2795           dump_printf (MSG_NOTE, "of size %u\n",
2796                        (unsigned)groupsize);
2797           dump_printf_loc (MSG_NOTE, vect_location, "\t%G", stmt_info->stmt);
2798           next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2799           while (next)
2800             {
2801               if (DR_GROUP_GAP (next) != 1)
2802                 dump_printf_loc (MSG_NOTE, vect_location,
2803                                  "\t<gap of %d elements>\n",
2804                                  DR_GROUP_GAP (next) - 1);
2805               dump_printf_loc (MSG_NOTE, vect_location, "\t%G", next->stmt);
2806               next = DR_GROUP_NEXT_ELEMENT (next);
2807             }
2808           if (DR_GROUP_GAP (stmt_info) != 0)
2809             dump_printf_loc (MSG_NOTE, vect_location,
2810                              "\t<gap of %d elements>\n",
2811                              DR_GROUP_GAP (stmt_info));
2812         }
2813
2814       /* SLP: create an SLP data structure for every interleaving group of
2815          stores for further analysis in vect_analyse_slp.  */
2816       if (DR_IS_WRITE (dr) && !slp_impossible)
2817         {
2818           if (loop_vinfo)
2819             LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt_info);
2820           if (bb_vinfo)
2821             BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt_info);
2822         }
2823     }
2824
2825   return true;
2826 }
2827
2828 /* Analyze groups of accesses: check that DR_INFO belongs to a group of
2829    accesses of legal size, step, etc.  Detect gaps, single element
2830    interleaving, and other special cases. Set grouped access info.
2831    Collect groups of strided stores for further use in SLP analysis.  */
2832
2833 static bool
2834 vect_analyze_group_access (vec_info *vinfo, dr_vec_info *dr_info)
2835 {
2836   if (!vect_analyze_group_access_1 (vinfo, dr_info))
2837     {
2838       /* Dissolve the group if present.  */
2839       stmt_vec_info stmt_info = DR_GROUP_FIRST_ELEMENT (dr_info->stmt);
2840       while (stmt_info)
2841         {
2842           stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2843           DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2844           DR_GROUP_NEXT_ELEMENT (stmt_info) = NULL;
2845           stmt_info = next;
2846         }
2847       return false;
2848     }
2849   return true;
2850 }
2851
2852 /* Analyze the access pattern of the data-reference DR_INFO.
2853    In case of non-consecutive accesses call vect_analyze_group_access() to
2854    analyze groups of accesses.  */
2855
2856 static bool
2857 vect_analyze_data_ref_access (vec_info *vinfo, dr_vec_info *dr_info)
2858 {
2859   data_reference *dr = dr_info->dr;
2860   tree step = DR_STEP (dr);
2861   tree scalar_type = TREE_TYPE (DR_REF (dr));
2862   stmt_vec_info stmt_info = dr_info->stmt;
2863   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
2864   class loop *loop = NULL;
2865
2866   if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
2867     return true;
2868
2869   if (loop_vinfo)
2870     loop = LOOP_VINFO_LOOP (loop_vinfo);
2871
2872   if (loop_vinfo && !step)
2873     {
2874       if (dump_enabled_p ())
2875         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2876                          "bad data-ref access in loop\n");
2877       return false;
2878     }
2879
2880   /* Allow loads with zero step in inner-loop vectorization.  */
2881   if (loop_vinfo && integer_zerop (step))
2882     {
2883       DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2884       if (!nested_in_vect_loop_p (loop, stmt_info))
2885         return DR_IS_READ (dr);
2886       /* Allow references with zero step for outer loops marked
2887          with pragma omp simd only - it guarantees absence of
2888          loop-carried dependencies between inner loop iterations.  */
2889       if (loop->safelen < 2)
2890         {
2891           if (dump_enabled_p ())
2892             dump_printf_loc (MSG_NOTE, vect_location,
2893                              "zero step in inner loop of nest\n");
2894           return false;
2895         }
2896     }
2897
2898   if (loop && nested_in_vect_loop_p (loop, stmt_info))
2899     {
2900       /* Interleaved accesses are not yet supported within outer-loop
2901         vectorization for references in the inner-loop.  */
2902       DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2903
2904       /* For the rest of the analysis we use the outer-loop step.  */
2905       step = STMT_VINFO_DR_STEP (stmt_info);
2906       if (integer_zerop (step))
2907         {
2908           if (dump_enabled_p ())
2909             dump_printf_loc (MSG_NOTE, vect_location,
2910                              "zero step in outer loop.\n");
2911           return DR_IS_READ (dr);
2912         }
2913     }
2914
2915   /* Consecutive?  */
2916   if (TREE_CODE (step) == INTEGER_CST)
2917     {
2918       HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2919       if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2920           || (dr_step < 0
2921               && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2922         {
2923           /* Mark that it is not interleaving.  */
2924           DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2925           return true;
2926         }
2927     }
2928
2929   if (loop && nested_in_vect_loop_p (loop, stmt_info))
2930     {
2931       if (dump_enabled_p ())
2932         dump_printf_loc (MSG_NOTE, vect_location,
2933                          "grouped access in outer loop.\n");
2934       return false;
2935     }
2936
2937
2938   /* Assume this is a DR handled by non-constant strided load case.  */
2939   if (TREE_CODE (step) != INTEGER_CST)
2940     return (STMT_VINFO_STRIDED_P (stmt_info)
2941             && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2942                 || vect_analyze_group_access (vinfo, dr_info)));
2943
2944   /* Not consecutive access - check if it's a part of interleaving group.  */
2945   return vect_analyze_group_access (vinfo, dr_info);
2946 }
2947
2948 /* Compare two data-references DRA and DRB to group them into chunks
2949    suitable for grouping.  */
2950
2951 static int
2952 dr_group_sort_cmp (const void *dra_, const void *drb_)
2953 {
2954   dr_vec_info *dra_info = *(dr_vec_info **)const_cast<void *>(dra_);
2955   dr_vec_info *drb_info = *(dr_vec_info **)const_cast<void *>(drb_);
2956   data_reference_p dra = dra_info->dr;
2957   data_reference_p drb = drb_info->dr;
2958   int cmp;
2959
2960   /* Stabilize sort.  */
2961   if (dra == drb)
2962     return 0;
2963
2964   /* Different group IDs lead never belong to the same group.  */
2965   if (dra_info->group != drb_info->group)
2966     return dra_info->group < drb_info->group ? -1 : 1;
2967
2968   /* Ordering of DRs according to base.  */
2969   cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2970                                DR_BASE_ADDRESS (drb));
2971   if (cmp != 0)
2972     return cmp;
2973
2974   /* And according to DR_OFFSET.  */
2975   cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2976   if (cmp != 0)
2977     return cmp;
2978
2979   /* Put reads before writes.  */
2980   if (DR_IS_READ (dra) != DR_IS_READ (drb))
2981     return DR_IS_READ (dra) ? -1 : 1;
2982
2983   /* Then sort after access size.  */
2984   cmp = data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2985                                TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2986   if (cmp != 0)
2987     return cmp;
2988
2989   /* And after step.  */
2990   cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
2991   if (cmp != 0)
2992     return cmp;
2993
2994   /* Then sort after DR_INIT.  In case of identical DRs sort after stmt UID.  */
2995   cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
2996   if (cmp == 0)
2997     return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2998   return cmp;
2999 }
3000
3001 /* If OP is the result of a conversion, return the unconverted value,
3002    otherwise return null.  */
3003
3004 static tree
3005 strip_conversion (tree op)
3006 {
3007   if (TREE_CODE (op) != SSA_NAME)
3008     return NULL_TREE;
3009   gimple *stmt = SSA_NAME_DEF_STMT (op);
3010   if (!is_gimple_assign (stmt)
3011       || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (stmt)))
3012     return NULL_TREE;
3013   return gimple_assign_rhs1 (stmt);
3014 }
3015
3016 /* Return true if vectorizable_* routines can handle statements STMT1_INFO
3017    and STMT2_INFO being in a single group.  When ALLOW_SLP_P, masked loads can
3018    be grouped in SLP mode.  */
3019
3020 static bool
3021 can_group_stmts_p (stmt_vec_info stmt1_info, stmt_vec_info stmt2_info,
3022                    bool allow_slp_p)
3023 {
3024   if (gimple_assign_single_p (stmt1_info->stmt))
3025     return gimple_assign_single_p (stmt2_info->stmt);
3026
3027   gcall *call1 = dyn_cast <gcall *> (stmt1_info->stmt);
3028   if (call1 && gimple_call_internal_p (call1))
3029     {
3030       /* Check for two masked loads or two masked stores.  */
3031       gcall *call2 = dyn_cast <gcall *> (stmt2_info->stmt);
3032       if (!call2 || !gimple_call_internal_p (call2))
3033         return false;
3034       internal_fn ifn = gimple_call_internal_fn (call1);
3035       if (ifn != IFN_MASK_LOAD && ifn != IFN_MASK_STORE)
3036         return false;
3037       if (ifn != gimple_call_internal_fn (call2))
3038         return false;
3039
3040       /* Check that the masks are the same.  Cope with casts of masks,
3041          like those created by build_mask_conversion.  */
3042       tree mask1 = gimple_call_arg (call1, 2);
3043       tree mask2 = gimple_call_arg (call2, 2);
3044       if (!operand_equal_p (mask1, mask2, 0)
3045           && (ifn == IFN_MASK_STORE || !allow_slp_p))
3046         {
3047           mask1 = strip_conversion (mask1);
3048           if (!mask1)
3049             return false;
3050           mask2 = strip_conversion (mask2);
3051           if (!mask2)
3052             return false;
3053           if (!operand_equal_p (mask1, mask2, 0))
3054             return false;
3055         }
3056       return true;
3057     }
3058
3059   return false;
3060 }
3061
3062 /* Function vect_analyze_data_ref_accesses.
3063
3064    Analyze the access pattern of all the data references in the loop.
3065
3066    FORNOW: the only access pattern that is considered vectorizable is a
3067            simple step 1 (consecutive) access.
3068
3069    FORNOW: handle only arrays and pointer accesses.  */
3070
3071 opt_result
3072 vect_analyze_data_ref_accesses (vec_info *vinfo,
3073                                 vec<int> *dataref_groups)
3074 {
3075   unsigned int i;
3076   vec<data_reference_p> datarefs = vinfo->shared->datarefs;
3077
3078   DUMP_VECT_SCOPE ("vect_analyze_data_ref_accesses");
3079
3080   if (datarefs.is_empty ())
3081     return opt_result::success ();
3082
3083   /* Sort the array of datarefs to make building the interleaving chains
3084      linear.  Don't modify the original vector's order, it is needed for
3085      determining what dependencies are reversed.  */
3086   vec<dr_vec_info *> datarefs_copy;
3087   datarefs_copy.create (datarefs.length ());
3088   for (unsigned i = 0; i < datarefs.length (); i++)
3089     {
3090       dr_vec_info *dr_info = vinfo->lookup_dr (datarefs[i]);
3091       /* If the caller computed DR grouping use that, otherwise group by
3092          basic blocks.  */
3093       if (dataref_groups)
3094         dr_info->group = (*dataref_groups)[i];
3095       else
3096         dr_info->group = gimple_bb (DR_STMT (datarefs[i]))->index;
3097       datarefs_copy.quick_push (dr_info);
3098     }
3099   datarefs_copy.qsort (dr_group_sort_cmp);
3100   hash_set<stmt_vec_info> to_fixup;
3101
3102   /* Build the interleaving chains.  */
3103   for (i = 0; i < datarefs_copy.length () - 1;)
3104     {
3105       dr_vec_info *dr_info_a = datarefs_copy[i];
3106       data_reference_p dra = dr_info_a->dr;
3107       int dra_group_id = dr_info_a->group;
3108       stmt_vec_info stmtinfo_a = dr_info_a->stmt;
3109       stmt_vec_info lastinfo = NULL;
3110       if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
3111           || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a))
3112         {
3113           ++i;
3114           continue;
3115         }
3116       for (i = i + 1; i < datarefs_copy.length (); ++i)
3117         {
3118           dr_vec_info *dr_info_b = datarefs_copy[i];
3119           data_reference_p drb = dr_info_b->dr;
3120           int drb_group_id = dr_info_b->group;
3121           stmt_vec_info stmtinfo_b = dr_info_b->stmt;
3122           if (!STMT_VINFO_VECTORIZABLE (stmtinfo_b)
3123               || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
3124             break;
3125
3126           /* ???  Imperfect sorting (non-compatible types, non-modulo
3127              accesses, same accesses) can lead to a group to be artificially
3128              split here as we don't just skip over those.  If it really
3129              matters we can push those to a worklist and re-iterate
3130              over them.  The we can just skip ahead to the next DR here.  */
3131
3132           /* DRs in a different DR group should not be put into the same
3133              interleaving group.  */
3134           if (dra_group_id != drb_group_id)
3135             break;
3136
3137           /* Check that the data-refs have same first location (except init)
3138              and they are both either store or load (not load and store,
3139              not masked loads or stores).  */
3140           if (DR_IS_READ (dra) != DR_IS_READ (drb)
3141               || data_ref_compare_tree (DR_BASE_ADDRESS (dra),
3142                                         DR_BASE_ADDRESS (drb)) != 0
3143               || data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb)) != 0
3144               || !can_group_stmts_p (stmtinfo_a, stmtinfo_b, true))
3145             break;
3146
3147           /* Check that the data-refs have the same constant size.  */
3148           tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
3149           tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
3150           if (!tree_fits_uhwi_p (sza)
3151               || !tree_fits_uhwi_p (szb)
3152               || !tree_int_cst_equal (sza, szb))
3153             break;
3154
3155           /* Check that the data-refs have the same step.  */
3156           if (data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb)) != 0)
3157             break;
3158
3159           /* Check the types are compatible.
3160              ???  We don't distinguish this during sorting.  */
3161           if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
3162                                    TREE_TYPE (DR_REF (drb))))
3163             break;
3164
3165           /* Check that the DR_INITs are compile-time constants.  */
3166           if (TREE_CODE (DR_INIT (dra)) != INTEGER_CST
3167               || TREE_CODE (DR_INIT (drb)) != INTEGER_CST)
3168             break;
3169
3170           /* Different .GOMP_SIMD_LANE calls still give the same lane,
3171              just hold extra information.  */
3172           if (STMT_VINFO_SIMD_LANE_ACCESS_P (stmtinfo_a)
3173               && STMT_VINFO_SIMD_LANE_ACCESS_P (stmtinfo_b)
3174               && data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb)) == 0)
3175             break;
3176
3177           /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb).  */
3178           HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
3179           HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
3180           HOST_WIDE_INT init_prev
3181             = TREE_INT_CST_LOW (DR_INIT (datarefs_copy[i-1]->dr));
3182           gcc_assert (init_a <= init_b
3183                       && init_a <= init_prev
3184                       && init_prev <= init_b);
3185
3186           /* Do not place the same access in the interleaving chain twice.  */
3187           if (init_b == init_prev)
3188             {
3189               gcc_assert (gimple_uid (DR_STMT (datarefs_copy[i-1]->dr))
3190                           < gimple_uid (DR_STMT (drb)));
3191               /* Simply link in duplicates and fix up the chain below.  */
3192             }
3193           else
3194             {
3195               /* If init_b == init_a + the size of the type * k, we have an
3196                  interleaving, and DRA is accessed before DRB.  */
3197               HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
3198               if (type_size_a == 0
3199                   || (init_b - init_a) % type_size_a != 0)
3200                 break;
3201
3202               /* If we have a store, the accesses are adjacent.  This splits
3203                  groups into chunks we support (we don't support vectorization
3204                  of stores with gaps).  */
3205               if (!DR_IS_READ (dra) && init_b - init_prev != type_size_a)
3206                 break;
3207
3208               /* If the step (if not zero or non-constant) is smaller than the
3209                  difference between data-refs' inits this splits groups into
3210                  suitable sizes.  */
3211               if (tree_fits_shwi_p (DR_STEP (dra)))
3212                 {
3213                   unsigned HOST_WIDE_INT step
3214                     = absu_hwi (tree_to_shwi (DR_STEP (dra)));
3215                   if (step != 0
3216                       && step <= (unsigned HOST_WIDE_INT)(init_b - init_a))
3217                     break;
3218                 }
3219             }
3220
3221           if (dump_enabled_p ())
3222             dump_printf_loc (MSG_NOTE, vect_location,
3223                              DR_IS_READ (dra)
3224                              ? "Detected interleaving load %T and %T\n"
3225                              : "Detected interleaving store %T and %T\n",
3226                              DR_REF (dra), DR_REF (drb));
3227
3228           /* Link the found element into the group list.  */
3229           if (!DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
3230             {
3231               DR_GROUP_FIRST_ELEMENT (stmtinfo_a) = stmtinfo_a;
3232               lastinfo = stmtinfo_a;
3233             }
3234           DR_GROUP_FIRST_ELEMENT (stmtinfo_b) = stmtinfo_a;
3235           DR_GROUP_NEXT_ELEMENT (lastinfo) = stmtinfo_b;
3236           lastinfo = stmtinfo_b;
3237
3238           STMT_VINFO_SLP_VECT_ONLY (stmtinfo_a)
3239             = !can_group_stmts_p (stmtinfo_a, stmtinfo_b, false);
3240
3241           if (dump_enabled_p () && STMT_VINFO_SLP_VECT_ONLY (stmtinfo_a))
3242             dump_printf_loc (MSG_NOTE, vect_location,
3243                              "Load suitable for SLP vectorization only.\n");
3244
3245           if (init_b == init_prev
3246               && !to_fixup.add (DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
3247               && dump_enabled_p ())
3248             dump_printf_loc (MSG_NOTE, vect_location,
3249                              "Queuing group with duplicate access for fixup\n");
3250         }
3251     }
3252
3253   /* Fixup groups with duplicate entries by splitting it.  */
3254   while (1)
3255     {
3256       hash_set<stmt_vec_info>::iterator it = to_fixup.begin ();
3257       if (!(it != to_fixup.end ()))
3258         break;
3259       stmt_vec_info grp = *it;
3260       to_fixup.remove (grp);
3261
3262       /* Find the earliest duplicate group member.  */
3263       unsigned first_duplicate = -1u;
3264       stmt_vec_info next, g = grp;
3265       while ((next = DR_GROUP_NEXT_ELEMENT (g)))
3266         {
3267           if (tree_int_cst_equal (DR_INIT (STMT_VINFO_DR_INFO (next)->dr),
3268                                   DR_INIT (STMT_VINFO_DR_INFO (g)->dr))
3269               && gimple_uid (STMT_VINFO_STMT (next)) < first_duplicate)
3270             first_duplicate = gimple_uid (STMT_VINFO_STMT (next));
3271           g = next;
3272         }
3273       if (first_duplicate == -1U)
3274         continue;
3275
3276       /* Then move all stmts after the first duplicate to a new group.
3277          Note this is a heuristic but one with the property that *it
3278          is fixed up completely.  */
3279       g = grp;
3280       stmt_vec_info newgroup = NULL, ng = grp;
3281       while ((next = DR_GROUP_NEXT_ELEMENT (g)))
3282         {
3283           if (gimple_uid (STMT_VINFO_STMT (next)) >= first_duplicate)
3284             {
3285               DR_GROUP_NEXT_ELEMENT (g) = DR_GROUP_NEXT_ELEMENT (next);
3286               if (!newgroup)
3287                 newgroup = next;
3288               else
3289                 DR_GROUP_NEXT_ELEMENT (ng) = next;
3290               ng = next;
3291               DR_GROUP_FIRST_ELEMENT (ng) = newgroup;
3292             }
3293           else
3294             g = DR_GROUP_NEXT_ELEMENT (g);
3295         }
3296       DR_GROUP_NEXT_ELEMENT (ng) = NULL;
3297
3298       /* Fixup the new group which still may contain duplicates.  */
3299       to_fixup.add (newgroup);
3300     }
3301
3302   dr_vec_info *dr_info;
3303   FOR_EACH_VEC_ELT (datarefs_copy, i, dr_info)
3304     {
3305       if (STMT_VINFO_VECTORIZABLE (dr_info->stmt)
3306           && !vect_analyze_data_ref_access (vinfo, dr_info))
3307         {
3308           if (dump_enabled_p ())
3309             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3310                              "not vectorized: complicated access pattern.\n");
3311
3312           if (is_a <bb_vec_info> (vinfo))
3313             {
3314               /* Mark the statement as not vectorizable.  */
3315               STMT_VINFO_VECTORIZABLE (dr_info->stmt) = false;
3316               continue;
3317             }
3318           else
3319             {
3320               datarefs_copy.release ();
3321               return opt_result::failure_at (dr_info->stmt->stmt,
3322                                              "not vectorized:"
3323                                              " complicated access pattern.\n");
3324             }
3325         }
3326     }
3327
3328   datarefs_copy.release ();
3329   return opt_result::success ();
3330 }
3331
3332 /* Function vect_vfa_segment_size.
3333
3334    Input:
3335      DR_INFO: The data reference.
3336      LENGTH_FACTOR: segment length to consider.
3337
3338    Return a value suitable for the dr_with_seg_len::seg_len field.
3339    This is the "distance travelled" by the pointer from the first
3340    iteration in the segment to the last.  Note that it does not include
3341    the size of the access; in effect it only describes the first byte.  */
3342
3343 static tree
3344 vect_vfa_segment_size (dr_vec_info *dr_info, tree length_factor)
3345 {
3346   length_factor = size_binop (MINUS_EXPR,
3347                               fold_convert (sizetype, length_factor),
3348                               size_one_node);
3349   return size_binop (MULT_EXPR, fold_convert (sizetype, DR_STEP (dr_info->dr)),
3350                      length_factor);
3351 }
3352
3353 /* Return a value that, when added to abs (vect_vfa_segment_size (DR_INFO)),
3354    gives the worst-case number of bytes covered by the segment.  */
3355
3356 static unsigned HOST_WIDE_INT
3357 vect_vfa_access_size (vec_info *vinfo, dr_vec_info *dr_info)
3358 {
3359   stmt_vec_info stmt_vinfo = dr_info->stmt;
3360   tree ref_type = TREE_TYPE (DR_REF (dr_info->dr));
3361   unsigned HOST_WIDE_INT ref_size = tree_to_uhwi (TYPE_SIZE_UNIT (ref_type));
3362   unsigned HOST_WIDE_INT access_size = ref_size;
3363   if (DR_GROUP_FIRST_ELEMENT (stmt_vinfo))
3364     {
3365       gcc_assert (DR_GROUP_FIRST_ELEMENT (stmt_vinfo) == stmt_vinfo);
3366       access_size *= DR_GROUP_SIZE (stmt_vinfo) - DR_GROUP_GAP (stmt_vinfo);
3367     }
3368   tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
3369   int misalignment;
3370   if (STMT_VINFO_VEC_STMTS (stmt_vinfo).exists ()
3371       && ((misalignment = dr_misalignment (dr_info, vectype)), true)
3372       && (vect_supportable_dr_alignment (vinfo, dr_info, vectype, misalignment)
3373           == dr_explicit_realign_optimized))
3374     {
3375       /* We might access a full vector's worth.  */
3376       access_size += tree_to_uhwi (TYPE_SIZE_UNIT (vectype)) - ref_size;
3377     }
3378   return access_size;
3379 }
3380
3381 /* Get the minimum alignment for all the scalar accesses that DR_INFO
3382    describes.  */
3383
3384 static unsigned int
3385 vect_vfa_align (dr_vec_info *dr_info)
3386 {
3387   return dr_alignment (dr_info->dr);
3388 }
3389
3390 /* Function vect_no_alias_p.
3391
3392    Given data references A and B with equal base and offset, see whether
3393    the alias relation can be decided at compilation time.  Return 1 if
3394    it can and the references alias, 0 if it can and the references do
3395    not alias, and -1 if we cannot decide at compile time.  SEGMENT_LENGTH_A,
3396    SEGMENT_LENGTH_B, ACCESS_SIZE_A and ACCESS_SIZE_B are the equivalent
3397    of dr_with_seg_len::{seg_len,access_size} for A and B.  */
3398
3399 static int
3400 vect_compile_time_alias (dr_vec_info *a, dr_vec_info *b,
3401                          tree segment_length_a, tree segment_length_b,
3402                          unsigned HOST_WIDE_INT access_size_a,
3403                          unsigned HOST_WIDE_INT access_size_b)
3404 {
3405   poly_offset_int offset_a = wi::to_poly_offset (DR_INIT (a->dr));
3406   poly_offset_int offset_b = wi::to_poly_offset (DR_INIT (b->dr));
3407   poly_uint64 const_length_a;
3408   poly_uint64 const_length_b;
3409
3410   /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
3411      bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
3412      [a, a+12) */
3413   if (tree_int_cst_compare (DR_STEP (a->dr), size_zero_node) < 0)
3414     {
3415       const_length_a = (-wi::to_poly_wide (segment_length_a)).force_uhwi ();
3416       offset_a -= const_length_a;
3417     }
3418   else
3419     const_length_a = tree_to_poly_uint64 (segment_length_a);
3420   if (tree_int_cst_compare (DR_STEP (b->dr), size_zero_node) < 0)
3421     {
3422       const_length_b = (-wi::to_poly_wide (segment_length_b)).force_uhwi ();
3423       offset_b -= const_length_b;
3424     }
3425   else
3426     const_length_b = tree_to_poly_uint64 (segment_length_b);
3427
3428   const_length_a += access_size_a;
3429   const_length_b += access_size_b;
3430
3431   if (ranges_known_overlap_p (offset_a, const_length_a,
3432                               offset_b, const_length_b))
3433     return 1;
3434
3435   if (!ranges_maybe_overlap_p (offset_a, const_length_a,
3436                                offset_b, const_length_b))
3437     return 0;
3438
3439   return -1;
3440 }
3441
3442 /* Return true if the minimum nonzero dependence distance for loop LOOP_DEPTH
3443    in DDR is >= VF.  */
3444
3445 static bool
3446 dependence_distance_ge_vf (data_dependence_relation *ddr,
3447                            unsigned int loop_depth, poly_uint64 vf)
3448 {
3449   if (DDR_ARE_DEPENDENT (ddr) != NULL_TREE
3450       || DDR_NUM_DIST_VECTS (ddr) == 0)
3451     return false;
3452
3453   /* If the dependence is exact, we should have limited the VF instead.  */
3454   gcc_checking_assert (DDR_COULD_BE_INDEPENDENT_P (ddr));
3455
3456   unsigned int i;
3457   lambda_vector dist_v;
3458   FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
3459     {
3460       HOST_WIDE_INT dist = dist_v[loop_depth];
3461       if (dist != 0
3462           && !(dist > 0 && DDR_REVERSED_P (ddr))
3463           && maybe_lt ((unsigned HOST_WIDE_INT) abs_hwi (dist), vf))
3464         return false;
3465     }
3466
3467   if (dump_enabled_p ())
3468     dump_printf_loc (MSG_NOTE, vect_location,
3469                      "dependence distance between %T and %T is >= VF\n",
3470                      DR_REF (DDR_A (ddr)), DR_REF (DDR_B (ddr)));
3471
3472   return true;
3473 }
3474
3475 /* Dump LOWER_BOUND using flags DUMP_KIND.  Dumps are known to be enabled.  */
3476
3477 static void
3478 dump_lower_bound (dump_flags_t dump_kind, const vec_lower_bound &lower_bound)
3479 {
3480   dump_printf (dump_kind, "%s (%T) >= ",
3481                lower_bound.unsigned_p ? "unsigned" : "abs",
3482                lower_bound.expr);
3483   dump_dec (dump_kind, lower_bound.min_value);
3484 }
3485
3486 /* Record that the vectorized loop requires the vec_lower_bound described
3487    by EXPR, UNSIGNED_P and MIN_VALUE.  */
3488
3489 static void
3490 vect_check_lower_bound (loop_vec_info loop_vinfo, tree expr, bool unsigned_p,
3491                         poly_uint64 min_value)
3492 {
3493   vec<vec_lower_bound> &lower_bounds
3494     = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
3495   for (unsigned int i = 0; i < lower_bounds.length (); ++i)
3496     if (operand_equal_p (lower_bounds[i].expr, expr, 0))
3497       {
3498         unsigned_p &= lower_bounds[i].unsigned_p;
3499         min_value = upper_bound (lower_bounds[i].min_value, min_value);
3500         if (lower_bounds[i].unsigned_p != unsigned_p
3501             || maybe_lt (lower_bounds[i].min_value, min_value))
3502           {
3503             lower_bounds[i].unsigned_p = unsigned_p;
3504             lower_bounds[i].min_value = min_value;
3505             if (dump_enabled_p ())
3506               {
3507                 dump_printf_loc (MSG_NOTE, vect_location,
3508                                  "updating run-time check to ");
3509                 dump_lower_bound (MSG_NOTE, lower_bounds[i]);
3510                 dump_printf (MSG_NOTE, "\n");
3511               }
3512           }
3513         return;
3514       }
3515
3516   vec_lower_bound lower_bound (expr, unsigned_p, min_value);
3517   if (dump_enabled_p ())
3518     {
3519       dump_printf_loc (MSG_NOTE, vect_location, "need a run-time check that ");
3520       dump_lower_bound (MSG_NOTE, lower_bound);
3521       dump_printf (MSG_NOTE, "\n");
3522     }
3523   LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).safe_push (lower_bound);
3524 }
3525
3526 /* Return true if it's unlikely that the step of the vectorized form of DR_INFO
3527    will span fewer than GAP bytes.  */
3528
3529 static bool
3530 vect_small_gap_p (loop_vec_info loop_vinfo, dr_vec_info *dr_info,
3531                   poly_int64 gap)
3532 {
3533   stmt_vec_info stmt_info = dr_info->stmt;
3534   HOST_WIDE_INT count
3535     = estimated_poly_value (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
3536   if (DR_GROUP_FIRST_ELEMENT (stmt_info))
3537     count *= DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (stmt_info));
3538   return (estimated_poly_value (gap)
3539           <= count * vect_get_scalar_dr_size (dr_info));
3540 }
3541
3542 /* Return true if we know that there is no alias between DR_INFO_A and
3543    DR_INFO_B when abs (DR_STEP (DR_INFO_A->dr)) >= N for some N.
3544    When returning true, set *LOWER_BOUND_OUT to this N.  */
3545
3546 static bool
3547 vectorizable_with_step_bound_p (dr_vec_info *dr_info_a, dr_vec_info *dr_info_b,
3548                                 poly_uint64 *lower_bound_out)
3549 {
3550   /* Check that there is a constant gap of known sign between DR_A
3551      and DR_B.  */
3552   data_reference *dr_a = dr_info_a->dr;
3553   data_reference *dr_b = dr_info_b->dr;
3554   poly_int64 init_a, init_b;
3555   if (!operand_equal_p (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b), 0)
3556       || !operand_equal_p (DR_OFFSET (dr_a), DR_OFFSET (dr_b), 0)
3557       || !operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0)
3558       || !poly_int_tree_p (DR_INIT (dr_a), &init_a)
3559       || !poly_int_tree_p (DR_INIT (dr_b), &init_b)
3560       || !ordered_p (init_a, init_b))
3561     return false;
3562
3563   /* Sort DR_A and DR_B by the address they access.  */
3564   if (maybe_lt (init_b, init_a))
3565     {
3566       std::swap (init_a, init_b);
3567       std::swap (dr_info_a, dr_info_b);
3568       std::swap (dr_a, dr_b);
3569     }
3570
3571   /* If the two accesses could be dependent within a scalar iteration,
3572      make sure that we'd retain their order.  */
3573   if (maybe_gt (init_a + vect_get_scalar_dr_size (dr_info_a), init_b)
3574       && !vect_preserves_scalar_order_p (dr_info_a, dr_info_b))
3575     return false;
3576
3577   /* There is no alias if abs (DR_STEP) is greater than or equal to
3578      the bytes spanned by the combination of the two accesses.  */
3579   *lower_bound_out = init_b + vect_get_scalar_dr_size (dr_info_b) - init_a;
3580   return true;
3581 }
3582
3583 /* Function vect_prune_runtime_alias_test_list.
3584
3585    Prune a list of ddrs to be tested at run-time by versioning for alias.
3586    Merge several alias checks into one if possible.
3587    Return FALSE if resulting list of ddrs is longer than allowed by
3588    PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE.  */
3589
3590 opt_result
3591 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
3592 {
3593   typedef pair_hash <tree_operand_hash, tree_operand_hash> tree_pair_hash;
3594   hash_set <tree_pair_hash> compared_objects;
3595
3596   const vec<ddr_p> &may_alias_ddrs = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
3597   vec<dr_with_seg_len_pair_t> &comp_alias_ddrs
3598     = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3599   const vec<vec_object_pair> &check_unequal_addrs
3600     = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3601   poly_uint64 vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3602   tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
3603
3604   ddr_p ddr;
3605   unsigned int i;
3606   tree length_factor;
3607
3608   DUMP_VECT_SCOPE ("vect_prune_runtime_alias_test_list");
3609
3610   /* Step values are irrelevant for aliasing if the number of vector
3611      iterations is equal to the number of scalar iterations (which can
3612      happen for fully-SLP loops).  */
3613   bool vf_one_p = known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U);
3614
3615   if (!vf_one_p)
3616     {
3617       /* Convert the checks for nonzero steps into bound tests.  */
3618       tree value;
3619       FOR_EACH_VEC_ELT (LOOP_VINFO_CHECK_NONZERO (loop_vinfo), i, value)
3620         vect_check_lower_bound (loop_vinfo, value, true, 1);
3621     }
3622
3623   if (may_alias_ddrs.is_empty ())
3624     return opt_result::success ();
3625
3626   comp_alias_ddrs.create (may_alias_ddrs.length ());
3627
3628   unsigned int loop_depth
3629     = index_in_loop_nest (LOOP_VINFO_LOOP (loop_vinfo)->num,
3630                           LOOP_VINFO_LOOP_NEST (loop_vinfo));
3631
3632   /* First, we collect all data ref pairs for aliasing checks.  */
3633   FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
3634     {
3635       poly_uint64 lower_bound;
3636       tree segment_length_a, segment_length_b;
3637       unsigned HOST_WIDE_INT access_size_a, access_size_b;
3638       unsigned int align_a, align_b;
3639
3640       /* Ignore the alias if the VF we chose ended up being no greater
3641          than the dependence distance.  */
3642       if (dependence_distance_ge_vf (ddr, loop_depth, vect_factor))
3643         continue;
3644
3645       if (DDR_OBJECT_A (ddr))
3646         {
3647           vec_object_pair new_pair (DDR_OBJECT_A (ddr), DDR_OBJECT_B (ddr));
3648           if (!compared_objects.add (new_pair))
3649             {
3650               if (dump_enabled_p ())
3651                 dump_printf_loc (MSG_NOTE, vect_location,
3652                                  "checking that %T and %T"
3653                                  " have different addresses\n",
3654                                  new_pair.first, new_pair.second);
3655               LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).safe_push (new_pair);
3656             }
3657           continue;
3658         }
3659
3660       dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (DDR_A (ddr));
3661       stmt_vec_info stmt_info_a = dr_info_a->stmt;
3662
3663       dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (DDR_B (ddr));
3664       stmt_vec_info stmt_info_b = dr_info_b->stmt;
3665
3666       bool preserves_scalar_order_p
3667         = vect_preserves_scalar_order_p (dr_info_a, dr_info_b);
3668       bool ignore_step_p
3669           = (vf_one_p
3670              && (preserves_scalar_order_p
3671                  || operand_equal_p (DR_STEP (dr_info_a->dr),
3672                                      DR_STEP (dr_info_b->dr))));
3673
3674       /* Skip the pair if inter-iteration dependencies are irrelevant
3675          and intra-iteration dependencies are guaranteed to be honored.  */
3676       if (ignore_step_p
3677           && (preserves_scalar_order_p
3678               || vectorizable_with_step_bound_p (dr_info_a, dr_info_b,
3679                                                  &lower_bound)))
3680         {
3681           if (dump_enabled_p ())
3682             dump_printf_loc (MSG_NOTE, vect_location,
3683                              "no need for alias check between "
3684                              "%T and %T when VF is 1\n",
3685                              DR_REF (dr_info_a->dr), DR_REF (dr_info_b->dr));
3686           continue;
3687         }
3688
3689       /* See whether we can handle the alias using a bounds check on
3690          the step, and whether that's likely to be the best approach.
3691          (It might not be, for example, if the minimum step is much larger
3692          than the number of bytes handled by one vector iteration.)  */
3693       if (!ignore_step_p
3694           && TREE_CODE (DR_STEP (dr_info_a->dr)) != INTEGER_CST
3695           && vectorizable_with_step_bound_p (dr_info_a, dr_info_b,
3696                                              &lower_bound)
3697           && (vect_small_gap_p (loop_vinfo, dr_info_a, lower_bound)
3698               || vect_small_gap_p (loop_vinfo, dr_info_b, lower_bound)))
3699         {
3700           bool unsigned_p = dr_known_forward_stride_p (dr_info_a->dr);
3701           if (dump_enabled_p ())
3702             {
3703               dump_printf_loc (MSG_NOTE, vect_location, "no alias between "
3704                                "%T and %T when the step %T is outside ",
3705                                DR_REF (dr_info_a->dr),
3706                                DR_REF (dr_info_b->dr),
3707                                DR_STEP (dr_info_a->dr));
3708               if (unsigned_p)
3709                 dump_printf (MSG_NOTE, "[0");
3710               else
3711                 {
3712                   dump_printf (MSG_NOTE, "(");
3713                   dump_dec (MSG_NOTE, poly_int64 (-lower_bound));
3714                 }
3715               dump_printf (MSG_NOTE, ", ");
3716               dump_dec (MSG_NOTE, lower_bound);
3717               dump_printf (MSG_NOTE, ")\n");
3718             }
3719           vect_check_lower_bound (loop_vinfo, DR_STEP (dr_info_a->dr),
3720                                   unsigned_p, lower_bound);
3721           continue;
3722         }
3723
3724       stmt_vec_info dr_group_first_a = DR_GROUP_FIRST_ELEMENT (stmt_info_a);
3725       if (dr_group_first_a)
3726         {
3727           stmt_info_a = dr_group_first_a;
3728           dr_info_a = STMT_VINFO_DR_INFO (stmt_info_a);
3729         }
3730
3731       stmt_vec_info dr_group_first_b = DR_GROUP_FIRST_ELEMENT (stmt_info_b);
3732       if (dr_group_first_b)
3733         {
3734           stmt_info_b = dr_group_first_b;
3735           dr_info_b = STMT_VINFO_DR_INFO (stmt_info_b);
3736         }
3737
3738       if (ignore_step_p)
3739         {
3740           segment_length_a = size_zero_node;
3741           segment_length_b = size_zero_node;
3742         }
3743       else
3744         {
3745           if (!operand_equal_p (DR_STEP (dr_info_a->dr),
3746                                 DR_STEP (dr_info_b->dr), 0))
3747             length_factor = scalar_loop_iters;
3748           else
3749             length_factor = size_int (vect_factor);
3750           segment_length_a = vect_vfa_segment_size (dr_info_a, length_factor);
3751           segment_length_b = vect_vfa_segment_size (dr_info_b, length_factor);
3752         }
3753       access_size_a = vect_vfa_access_size (loop_vinfo, dr_info_a);
3754       access_size_b = vect_vfa_access_size (loop_vinfo, dr_info_b);
3755       align_a = vect_vfa_align (dr_info_a);
3756       align_b = vect_vfa_align (dr_info_b);
3757
3758       /* See whether the alias is known at compilation time.  */
3759       if (operand_equal_p (DR_BASE_ADDRESS (dr_info_a->dr),
3760                            DR_BASE_ADDRESS (dr_info_b->dr), 0)
3761           && operand_equal_p (DR_OFFSET (dr_info_a->dr),
3762                               DR_OFFSET (dr_info_b->dr), 0)
3763           && TREE_CODE (DR_STEP (dr_info_a->dr)) == INTEGER_CST
3764           && TREE_CODE (DR_STEP (dr_info_b->dr)) == INTEGER_CST
3765           && poly_int_tree_p (segment_length_a)
3766           && poly_int_tree_p (segment_length_b))
3767         {
3768           int res = vect_compile_time_alias (dr_info_a, dr_info_b,
3769                                              segment_length_a,
3770                                              segment_length_b,
3771                                              access_size_a,
3772                                              access_size_b);
3773           if (res >= 0 && dump_enabled_p ())
3774             {
3775               dump_printf_loc (MSG_NOTE, vect_location,
3776                                "can tell at compile time that %T and %T",
3777                                DR_REF (dr_info_a->dr), DR_REF (dr_info_b->dr));
3778               if (res == 0)
3779                 dump_printf (MSG_NOTE, " do not alias\n");
3780               else
3781                 dump_printf (MSG_NOTE, " alias\n");
3782             }
3783
3784           if (res == 0)
3785             continue;
3786
3787           if (res == 1)
3788             return opt_result::failure_at (stmt_info_b->stmt,
3789                                            "not vectorized:"
3790                                            " compilation time alias: %G%G",
3791                                            stmt_info_a->stmt,
3792                                            stmt_info_b->stmt);
3793         }
3794
3795       dr_with_seg_len dr_a (dr_info_a->dr, segment_length_a,
3796                             access_size_a, align_a);
3797       dr_with_seg_len dr_b (dr_info_b->dr, segment_length_b,
3798                             access_size_b, align_b);
3799       /* Canonicalize the order to be the one that's needed for accurate
3800          RAW, WAR and WAW flags, in cases where the data references are
3801          well-ordered.  The order doesn't really matter otherwise,
3802          but we might as well be consistent.  */
3803       if (get_later_stmt (stmt_info_a, stmt_info_b) == stmt_info_a)
3804         std::swap (dr_a, dr_b);
3805
3806       dr_with_seg_len_pair_t dr_with_seg_len_pair
3807         (dr_a, dr_b, (preserves_scalar_order_p
3808                       ? dr_with_seg_len_pair_t::WELL_ORDERED
3809                       : dr_with_seg_len_pair_t::REORDERED));
3810
3811       comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
3812     }
3813
3814   prune_runtime_alias_test_list (&comp_alias_ddrs, vect_factor);
3815
3816   unsigned int count = (comp_alias_ddrs.length ()
3817                         + check_unequal_addrs.length ());
3818
3819   if (count
3820       && (loop_cost_model (LOOP_VINFO_LOOP (loop_vinfo))
3821           == VECT_COST_MODEL_VERY_CHEAP))
3822     return opt_result::failure_at
3823       (vect_location, "would need a runtime alias check\n");
3824
3825   if (dump_enabled_p ())
3826     dump_printf_loc (MSG_NOTE, vect_location,
3827                      "improved number of alias checks from %d to %d\n",
3828                      may_alias_ddrs.length (), count);
3829   unsigned limit = param_vect_max_version_for_alias_checks;
3830   if (loop_cost_model (LOOP_VINFO_LOOP (loop_vinfo)) == VECT_COST_MODEL_CHEAP)
3831     limit = param_vect_max_version_for_alias_checks * 6 / 10;
3832   if (count > limit)
3833     return opt_result::failure_at
3834       (vect_location,
3835        "number of versioning for alias run-time tests exceeds %d "
3836        "(--param vect-max-version-for-alias-checks)\n", limit);
3837
3838   return opt_result::success ();
3839 }
3840
3841 /* Determine whether the target provides an internal function for a
3842    gather load (READ_P) or scatter store (!READ_P) of vector type VECTYPE
3843    whose memory elements have type MEMORY_TYPE.  MASKED_P says whether
3844    the access is conditional.  OFFSET_TYPE is the scalar type of the
3845    offset applied to the invariant base address and SCALE is the factor
3846    by which that offset is multiplied *after* conversion to address
3847    width.  Vector types are looked up through VINFO.
3848
3849    On success return true, with *IFN_OUT set to the function to use and
3850    *OFFSET_VECTYPE_OUT set to the vector type of the offset.  */
3851
3852 bool
3853 vect_gather_scatter_fn_p (vec_info *vinfo, bool read_p, bool masked_p,
3854                           tree vectype, tree memory_type, tree offset_type,
3855                           int scale, internal_fn *ifn_out,
3856                           tree *offset_vectype_out)
3857 {
3858   /* Only same-width vector and memory elements are handled so far.  */
3859   unsigned int mem_width = tree_to_uhwi (TYPE_SIZE (memory_type));
3860   unsigned int elt_width = vector_element_bits (vectype);
3861   if (mem_width != elt_width)
3862     return false;
3863
3864   /* PREFERRED_FN matches MASKED_P exactly.  FALLBACK_FN is the masked
3865      form, which is also tried when the access is unconditional.  */
3866   internal_fn fallback_fn
3867     = read_p ? IFN_MASK_GATHER_LOAD : IFN_MASK_SCATTER_STORE;
3868   internal_fn preferred_fn = fallback_fn;
3869   if (!masked_p)
3870     preferred_fn = read_p ? IFN_GATHER_LOAD : IFN_SCATTER_STORE;
3871
3872   /* Start with OFFSET_TYPE and double its precision after each miss.  */
3873   while (tree offset_vectype
3874            = get_vectype_for_scalar_type (vinfo, offset_type))
3875     {
3876       /* Ask the target about the preferred function first.  */
3877       internal_fn chosen_fn = preferred_fn;
3878       bool found_p
3879         = internal_gather_scatter_fn_supported_p (preferred_fn, vectype,
3880                                                   memory_type,
3881                                                   offset_vectype, scale);
3882       if (!found_p && !masked_p)
3883         {
3884           chosen_fn = fallback_fn;
3885           found_p
3886             = internal_gather_scatter_fn_supported_p (fallback_fn, vectype,
3887                                                       memory_type,
3888                                                       offset_vectype, scale);
3889         }
3890
3891       if (found_p)
3892         {
3893           *ifn_out = chosen_fn;
3894           *offset_vectype_out = offset_vectype;
3895           return true;
3896         }
3897
3898       /* Stop once the offset is at least as wide as both a pointer and
3899          a vector element.  */
3900       if (!(TYPE_PRECISION (offset_type) < POINTER_SIZE
3901             || TYPE_PRECISION (offset_type) < elt_width))
3902         return false;
3903
3904       offset_type = build_nonstandard_integer_type
3905         (TYPE_PRECISION (offset_type) * 2, TYPE_UNSIGNED (offset_type));
3906     }
3907
3908   /* No vector type exists for the current offset type.  */
3909   return false;
3910 }
3911
3912 /* Fill in *INFO from STMT_INFO, whose statement is already a call to an
3913    internal gather load or scatter store function.  */
3914
3915 static void
3916 vect_describe_gather_scatter_call (stmt_vec_info stmt_info,
3917                                    gather_scatter_info *info)
3918 {
3919   gcall *gs_call = as_a <gcall *> (stmt_info->stmt);
3920   /* The call itself supplies the function, base, offset and scale.  */
3921   info->ifn = gimple_call_internal_fn (gs_call);
3922   info->base = gimple_call_arg (gs_call, 0);
3923   info->offset = gimple_call_arg (gs_call, 1);
3924   info->scale = TREE_INT_CST_LOW (gimple_call_arg (gs_call, 2));
3925
3926   /* No builtin decl is recorded and the offset def type is unknown.  */
3927   info->decl = NULL_TREE;
3928   info->offset_dt = vect_unknown_def_type;
3929   info->offset_vectype = NULL_TREE;
3930   info->element_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
3931   info->memory_type = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
3932 }
3933
3934 /* Return true if a non-affine read or write in STMT_INFO is suitable for a
3935    gather load or scatter store.  Describe the operation in *INFO if so.

        Invariance is judged relative to the loop of LOOP_VINFO.  If the
        statement is already a call to an internal gather/scatter function it
        is described as-is.  Otherwise the address is split into a
        loop-invariant part, stored in INFO->base, and an SSA_NAME defined in
        the loop, stored in INFO->offset, that is multiplied by INFO->scale.
        Return false if no such split is found.

        A true return does not by itself say that the target supports the
        access: INFO->ifn can be IFN_LAST and INFO->decl can be NULL_TREE at
        the same time.  */
3936
3937 bool
3938 vect_check_gather_scatter (stmt_vec_info stmt_info, loop_vec_info loop_vinfo,
3939                            gather_scatter_info *info)
3940 {
3941   HOST_WIDE_INT scale = 1;
3942   poly_int64 pbitpos, pbitsize;
3943   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3944   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3945   tree offtype = NULL_TREE;
3946   tree decl = NULL_TREE, base, off;
3947   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3948   tree memory_type = TREE_TYPE (DR_REF (dr));
3949   machine_mode pmode;
3950   int punsignedp, reversep, pvolatilep = 0;
3951   internal_fn ifn;
       /* Start from NULL_TREE so that INFO->offset_vectype is well defined
          even if no vect_gather_scatter_fn_p call below succeeds.  */
3952   tree offset_vectype = NULL_TREE;
3953   bool masked_p = false;
3954
3955   /* See whether this is already a call to a gather/scatter internal function.
3956      If not, see whether it's a masked load or store.  */
3957   gcall *call = dyn_cast <gcall *> (stmt_info->stmt);
3958   if (call && gimple_call_internal_p (call))
3959     {
3960       ifn = gimple_call_internal_fn (call);
3961       if (internal_gather_scatter_fn_p (ifn))
3962         {
3963           vect_describe_gather_scatter_call (stmt_info, info);
3964           return true;
3965         }
3966       masked_p = (ifn == IFN_MASK_LOAD || ifn == IFN_MASK_STORE);
3967     }
3968
3969   /* True if we should aim to use internal functions rather than
3970      built-in functions.  */
3971   bool use_ifn_p = (DR_IS_READ (dr)
3972                     ? supports_vec_gather_load_p ()
3973                     : supports_vec_scatter_store_p ());
3974
3975   base = DR_REF (dr);
3976   /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
3977      see if we can use the def stmt of the address.  */
3978   if (masked_p
3979       && TREE_CODE (base) == MEM_REF
3980       && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
3981       && integer_zerop (TREE_OPERAND (base, 1))
3982       && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
3983     {
3984       gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
3985       if (is_gimple_assign (def_stmt)
3986           && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
3987         base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
3988     }
3989
3990   /* The gather and scatter builtins need address of the form
3991      loop_invariant + vector * {1, 2, 4, 8}
3992      or
3993      loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3994      Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3995      of loop invariants/SSA_NAMEs defined in the loop, with casts,
3996      multiplications and additions in it.  To get a vector, we need
3997      a single SSA_NAME that will be defined in the loop and will
3998      contain everything that is not loop invariant and that can be
3999      vectorized.  The following code attempts to find such a preexisting
4000      SSA_NAME OFF and put the loop invariants into a tree BASE
4001      that can be gimplified before the loop.  */
4002   base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
4003                               &punsignedp, &reversep, &pvolatilep);
4004   if (reversep)
4005     return false;
4006
4007   poly_int64 pbytepos = exact_div (pbitpos, BITS_PER_UNIT);
4008
       /* Reduce BASE to an address: for a MEM_REF take its pointer operand and
          move any nonzero constant offset into OFF, otherwise take the address
          of the object.  */
4009   if (TREE_CODE (base) == MEM_REF)
4010     {
4011       if (!integer_zerop (TREE_OPERAND (base, 1)))
4012         {
4013           if (off == NULL_TREE)
4014             off = wide_int_to_tree (sizetype, mem_ref_offset (base));
4015           else
4016             off = size_binop (PLUS_EXPR, off,
4017                               fold_convert (sizetype, TREE_OPERAND (base, 1)));
4018         }
4019       base = TREE_OPERAND (base, 0);
4020     }
4021   else
4022     base = build_fold_addr_expr (base);
4023
4024   if (off == NULL_TREE)
4025     off = size_zero_node;
4026
4027   /* If base is not loop invariant, either off is 0, then we start with just
4028      the constant offset in the loop invariant BASE and continue with base
4029      as OFF, otherwise give up.
4030      We could handle that case by gimplifying the addition of base + off
4031      into some SSA_NAME and use that as off, but for now punt.  */
4032   if (!expr_invariant_in_loop_p (loop, base))
4033     {
4034       if (!integer_zerop (off))
4035         return false;
4036       off = base;
4037       base = size_int (pbytepos);
4038     }
4039   /* Otherwise put base + constant offset into the loop invariant BASE
4040      and continue with OFF.  */
4041   else
4042     {
4043       base = fold_convert (sizetype, base);
4044       base = size_binop (PLUS_EXPR, base, size_int (pbytepos));
4045     }
4046
4047   /* OFF at this point may be either a SSA_NAME or some tree expression
4048      from get_inner_reference.  Try to peel off loop invariants from it
4049      into BASE as long as possible.  */
4050   STRIP_NOPS (off);
       /* Each iteration looks at the operation that computes OFF.  The loop
          ends once a widening conversion has fixed OFFTYPE, or through the
          trailing break when nothing more can be peeled.  */
4051   while (offtype == NULL_TREE)
4052     {
4053       enum tree_code code;
4054       tree op0, op1, add = NULL_TREE;
4055
4056       if (TREE_CODE (off) == SSA_NAME)
4057         {
4058           gimple *def_stmt = SSA_NAME_DEF_STMT (off);
4059
4060           if (expr_invariant_in_loop_p (loop, off))
4061             return false;
4062
4063           if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
4064             break;
4065
4066           op0 = gimple_assign_rhs1 (def_stmt);
4067           code = gimple_assign_rhs_code (def_stmt);
4068           op1 = gimple_assign_rhs2 (def_stmt);
4069         }
4070       else
4071         {
4072           if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
4073             return false;
4074           code = TREE_CODE (off);
4075           extract_ops_from_tree (off, &code, &op0, &op1);
4076         }
4077       switch (code)
4078         {
4079         case POINTER_PLUS_EXPR:
4080         case PLUS_EXPR:
4081           if (expr_invariant_in_loop_p (loop, op0))
4082             {
4083               add = op0;
4084               off = op1;
                   /* Shared tail for every invariant addend: fold ADD into
                      BASE, multiplied by the SCALE already peeled off OFF,
                      then look at the new OFF.  */
4085             do_add:
4086               add = fold_convert (sizetype, add);
4087               if (scale != 1)
4088                 add = size_binop (MULT_EXPR, add, size_int (scale));
4089               base = size_binop (PLUS_EXPR, base, add);
4090               continue;
4091             }
4092           if (expr_invariant_in_loop_p (loop, op1))
4093             {
4094               add = op1;
4095               off = op0;
4096               goto do_add;
4097             }
4098           break;
4099         case MINUS_EXPR:
                 /* Only an invariant subtrahend can be peeled; it is negated
                    and then handled like an addend.  */
4100           if (expr_invariant_in_loop_p (loop, op1))
4101             {
4102               add = fold_convert (sizetype, op1);
4103               add = size_binop (MINUS_EXPR, size_zero_node, add);
4104               off = op0;
4105               goto do_add;
4106             }
4107           break;
4108         case MULT_EXPR:
                 /* At most one multiplication is turned into SCALE.  */
4109           if (scale == 1 && tree_fits_shwi_p (op1))
4110             {
4111               int new_scale = tree_to_shwi (op1);
4112               /* Only treat this as a scaling operation if the target
4113                  supports it for at least some offset type.  */
4114               if (use_ifn_p
4115                   && !vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
4116                                                 masked_p, vectype, memory_type,
4117                                                 signed_char_type_node,
4118                                                 new_scale, &ifn,
4119                                                 &offset_vectype)
4120                   && !vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
4121                                                 masked_p, vectype, memory_type,
4122                                                 unsigned_char_type_node,
4123                                                 new_scale, &ifn,
4124                                                 &offset_vectype))
4125                 break;
4126               scale = new_scale;
4127               off = op0;
4128               continue;
4129             }
4130           break;
4131         case SSA_NAME:
                 /* OFF is computed from another SSA_NAME with no operation
                    applied; continue with that name.  */
4132           off = op0;
4133           continue;
4134         CASE_CONVERT:
4135           if (!POINTER_TYPE_P (TREE_TYPE (op0))
4136               && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
4137             break;
4138
4139           /* Don't include the conversion if the target is happy with
4140              the current offset type.  */
4141           if (use_ifn_p
4142               && vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
4143                                            masked_p, vectype, memory_type,
4144                                            TREE_TYPE (off), scale, &ifn,
4145                                            &offset_vectype))
4146             break;
4147
                 /* A conversion that keeps the precision is simply looked
                    through.  */
4148           if (TYPE_PRECISION (TREE_TYPE (op0))
4149               == TYPE_PRECISION (TREE_TYPE (off)))
4150             {
4151               off = op0;
4152               continue;
4153             }
4154
4155           /* Include the conversion if it is widening and we're using
4156              the IFN path or the target can handle the converted from
4157              offset or the current size is not already the same as the
4158              data vector element size.  */
4159           if ((TYPE_PRECISION (TREE_TYPE (op0))
4160                < TYPE_PRECISION (TREE_TYPE (off)))
4161               && (use_ifn_p
4162                   || (DR_IS_READ (dr)
4163                       ? (targetm.vectorize.builtin_gather
4164                          && targetm.vectorize.builtin_gather (vectype,
4165                                                               TREE_TYPE (op0),
4166                                                               scale))
4167                       : (targetm.vectorize.builtin_scatter
4168                          && targetm.vectorize.builtin_scatter (vectype,
4169                                                                TREE_TYPE (op0),
4170                                                                scale)))
4171                   || !operand_equal_p (TYPE_SIZE (TREE_TYPE (off)),
4172                                        TYPE_SIZE (TREE_TYPE (vectype)), 0)))
4173             {
4174               off = op0;
4175               offtype = TREE_TYPE (off);
4176               STRIP_NOPS (off);
4177               continue;
4178             }
4179           break;
4180         default:
4181           break;
4182         }
4183       break;
4184     }
4185
4186   /* If at the end OFF still isn't a SSA_NAME or isn't
4187      defined in the loop, punt.  */
4188   if (TREE_CODE (off) != SSA_NAME
4189       || expr_invariant_in_loop_p (loop, off))
4190     return false;
4191
4192   if (offtype == NULL_TREE)
4193     offtype = TREE_TYPE (off);
4194
       /* Pick the implementation: an internal function when USE_IFN_P,
          otherwise whatever decl the target's builtin hook returns.  Failing
          to find one does not make this function return false.  */
4195   if (use_ifn_p)
4196     {
4197       if (!vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr), masked_p,
4198                                      vectype, memory_type, offtype, scale,
4199                                      &ifn, &offset_vectype))
4200         ifn = IFN_LAST;
4201       decl = NULL_TREE;
4202     }
4203   else
4204     {
4205       if (DR_IS_READ (dr))
4206         {
4207           if (targetm.vectorize.builtin_gather)
4208             decl = targetm.vectorize.builtin_gather (vectype, offtype, scale);
4209         }
4210       else
4211         {
4212           if (targetm.vectorize.builtin_scatter)
4213             decl = targetm.vectorize.builtin_scatter (vectype, offtype, scale);
4214         }
4215       ifn = IFN_LAST;
4216       /* The offset vector type will be read from DECL when needed.  */
4217       offset_vectype = NULL_TREE;
4218     }
4219
4220   info->ifn = ifn;
4221   info->decl = decl;
4222   info->base = base;
4223   info->offset = off;
4224   info->offset_dt = vect_unknown_def_type;
4225   info->offset_vectype = offset_vectype;
4226   info->scale = scale;
4227   info->element_type = TREE_TYPE (vectype);
4228   info->memory_type = memory_type;
4229   return true;
4230 }
4231
4232 /* Find the data references in STMT, analyze them with respect to LOOP and
4233    append them to DATAREFS.  Return a failure result if datarefs in this
4234    stmt cannot be handled.

        A clobber, or a statement without data references, succeeds without
        appending anything.  When DATAREF_GROUPS is nonnull, GROUP_ID is pushed
        onto it each time a data reference is appended to DATAREFS.  Data
        references that are found but rejected are freed before returning.  */
4235
4236 opt_result
4237 vect_find_stmt_data_reference (loop_p loop, gimple *stmt,
4238                                vec<data_reference_p> *datarefs,
4239                                vec<int> *dataref_groups, int group_id)
4240 {
4241   /* We can ignore clobbers for dataref analysis - they are removed during
4242      loop vectorization and BB vectorization checks dependences with a
4243      stmt walk.  */
4244   if (gimple_clobber_p (stmt))
4245     return opt_result::success ();
4246
4247   if (gimple_has_volatile_ops (stmt))
4248     return opt_result::failure_at (stmt, "not vectorized: volatile type: %G",
4249                                    stmt);
4250
4251   if (stmt_can_throw_internal (cfun, stmt))
4252     return opt_result::failure_at (stmt,
4253                                    "not vectorized:"
4254                                    " statement can throw an exception: %G",
4255                                    stmt);
4256
4257   auto_vec<data_reference_p, 2> refs;
4258   opt_result res = find_data_references_in_stmt (loop, stmt, &refs);
4259   if (!res)
4260     return res;
4261
4262   if (refs.is_empty ())
4263     return opt_result::success ();
4264
       /* Only a single data reference per statement is handled; release all
          of them before failing.  */
4265   if (refs.length () > 1)
4266     {
4267       while (!refs.is_empty ())
4268         free_data_ref (refs.pop ());
4269       return opt_result::failure_at (stmt,
4270                                      "not vectorized: more than one "
4271                                      "data ref in stmt: %G", stmt);
4272     }
4273
       /* From here on DR is the only reference; every failure path below
          frees it.  The only calls accepted are the internal functions
          IFN_MASK_LOAD and IFN_MASK_STORE.  */
4274   data_reference_p dr = refs.pop ();
4275   if (gcall *call = dyn_cast <gcall *> (stmt))
4276     if (!gimple_call_internal_p (call)
4277         || (gimple_call_internal_fn (call) != IFN_MASK_LOAD
4278             && gimple_call_internal_fn (call) != IFN_MASK_STORE))
4279       {
4280         free_data_ref (dr);
4281         return opt_result::failure_at (stmt,
4282                                        "not vectorized: dr in a call %G", stmt);
4283       }
4284
4285   if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
4286       && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
4287     {
4288       free_data_ref (dr);
4289       return opt_result::failure_at (stmt,
4290                                      "not vectorized:"
4291                                      " statement is bitfield access %G", stmt);
4292     }
4293
4294   if (DR_BASE_ADDRESS (dr)
4295       && TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
4296     {
4297       free_data_ref (dr);
4298       return opt_result::failure_at (stmt,
4299                                      "not vectorized:"
4300                                      " base addr of dr is a constant\n");
4301     }
4302
4303   /* Check whether this may be a SIMD lane access and adjust the
4304      DR to make it easier for us to handle it.  */
4305   if (loop
4306       && loop->simduid
4307       && (!DR_BASE_ADDRESS (dr)
4308           || !DR_OFFSET (dr)
4309           || !DR_INIT (dr)
4310           || !DR_STEP (dr)))
4311     {
           /* Analyze the same reference again into NEWDR.
              NOTE(review): the NULL first argument presumably requests an
              analysis without an enclosing loop nest - confirm against
              create_data_ref.  */
4312       struct data_reference *newdr
4313         = create_data_ref (NULL, loop_containing_stmt (stmt), DR_REF (dr), stmt,
4314                            DR_IS_READ (dr), DR_IS_CONDITIONAL_IN_STMT (dr));
           /* NEWDR is only interesting if it was fully analyzed, has a
              constant DR_INIT and a zero DR_STEP.  */
4315       if (DR_BASE_ADDRESS (newdr)
4316           && DR_OFFSET (newdr)
4317           && DR_INIT (newdr)
4318           && DR_STEP (newdr)
4319           && TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
4320           && integer_zerop (DR_STEP (newdr)))
4321         {
4322           tree base_address = DR_BASE_ADDRESS (newdr);
4323           tree off = DR_OFFSET (newdr);
4324           tree step = ssize_int (1);
               /* With a zero offset, the variable part may instead be the
                  second operand of a POINTER_PLUS_EXPR base address.  */
4325           if (integer_zerop (off)
4326               && TREE_CODE (base_address) == POINTER_PLUS_EXPR)
4327             {
4328               off = TREE_OPERAND (base_address, 1);
4329               base_address = TREE_OPERAND (base_address, 0);
4330             }
4331           STRIP_NOPS (off);
               /* Split OFF * constant into the candidate STEP and the
                  remaining OFF.  */
4332           if (TREE_CODE (off) == MULT_EXPR
4333               && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
4334             {
4335               step = TREE_OPERAND (off, 1);
4336               off = TREE_OPERAND (off, 0);
4337               STRIP_NOPS (off);
4338             }
               /* Look through a widening conversion in the expression.  */
4339           if (CONVERT_EXPR_P (off)
4340               && (TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off, 0)))
4341                   < TYPE_PRECISION (TREE_TYPE (off))))
4342             off = TREE_OPERAND (off, 0);
4343           if (TREE_CODE (off) == SSA_NAME)
4344             {
4345               gimple *def = SSA_NAME_DEF_STMT (off);
4346               /* Look through widening conversion.  */
4347               if (is_gimple_assign (def)
4348                   && CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (def)))
4349                 {
4350                   tree rhs1 = gimple_assign_rhs1 (def);
4351                   if (TREE_CODE (rhs1) == SSA_NAME
4352                       && INTEGRAL_TYPE_P (TREE_TYPE (rhs1))
4353                       && (TYPE_PRECISION (TREE_TYPE (off))
4354                           > TYPE_PRECISION (TREE_TYPE (rhs1))))
4355                     def = SSA_NAME_DEF_STMT (rhs1);
4356                 }
                   /* The offset must come from an IFN_GOMP_SIMD_LANE call
                      whose first argument is based on this loop's simduid.  */
4357               if (is_gimple_call (def)
4358                   && gimple_call_internal_p (def)
4359                   && (gimple_call_internal_fn (def) == IFN_GOMP_SIMD_LANE))
4360                 {
4361                   tree arg = gimple_call_arg (def, 0);
4362                   tree reft = TREE_TYPE (DR_REF (newdr));
4363                   gcc_assert (TREE_CODE (arg) == SSA_NAME);
4364                   arg = SSA_NAME_VAR (arg);
4365                   if (arg == loop->simduid
4366                       /* For now.  */
4367                       && tree_int_cst_equal (TYPE_SIZE_UNIT (reft), step))
4368                     {
4369                       DR_BASE_ADDRESS (newdr) = base_address;
4370                       DR_OFFSET (newdr) = ssize_int (0);
4371                       DR_STEP (newdr) = step;
4372                       DR_OFFSET_ALIGNMENT (newdr) = BIGGEST_ALIGNMENT;
4373                       DR_STEP_ALIGNMENT (newdr) = highest_pow2_factor (step);
4374                       /* Mark as simd-lane access.  */
                           /* AUX holds -1 minus the second argument of the
                              IFN_GOMP_SIMD_LANE call.  */
4375                       tree arg2 = gimple_call_arg (def, 1);
4376                       newdr->aux = (void *) (-1 - tree_to_uhwi (arg2));
                           /* NEWDR replaces DR: free the original and record
                              the adjusted reference instead.  */
4377                       free_data_ref (dr);
4378                       datarefs->safe_push (newdr);
4379                       if (dataref_groups)
4380                         dataref_groups->safe_push (group_id);
4381                       return opt_result::success ();
4382                     }
4383                 }
4384             }
4385         }
           /* NEWDR did not match the SIMD lane pattern; discard it and keep
              DR.  */
4386       free_data_ref (newdr);
4387     }
4388
4389   datarefs->safe_push (dr);
4390   if (dataref_groups)
4391     dataref_groups->safe_push (group_id);
4392   return opt_result::success ();
4393 }
4394
4395 /* Function vect_analyze_data_refs.
4396
4397   Find all the data references in the loop or basic block.
4398
4399    The general structure of the analysis of data refs in the vectorizer is as
4400    follows:
4401    1- vect_analyze_data_refs(loop/bb): call
4402       compute_data_dependences_for_loop/bb to find and analyze all data-refs
4403       in the loop/bb and their dependences.
4404    2- vect_analyze_dependences(): apply dependence testing using ddrs.
4405    3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
4406    4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
4407
4408 */
4409
4410 opt_result
4411 vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf, bool *fatal)
4412 {
4413   class loop *loop = NULL;
4414   unsigned int i;
4415   struct data_reference *dr;
4416   tree scalar_type;
4417
4418   DUMP_VECT_SCOPE ("vect_analyze_data_refs");
4419
4420   if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
4421     loop = LOOP_VINFO_LOOP (loop_vinfo);
4422
4423   /* Go through the data-refs, check that the analysis succeeded.  Update
4424      pointer from stmt_vec_info struct to DR and vectype.  */
4425
4426   vec<data_reference_p> datarefs = vinfo->shared->datarefs;
4427   FOR_EACH_VEC_ELT (datarefs, i, dr)
4428     {
4429       enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
4430       poly_uint64 vf;
4431
4432       gcc_assert (DR_REF (dr));
4433       stmt_vec_info stmt_info = vinfo->lookup_stmt (DR_STMT (dr));
4434       gcc_assert (!stmt_info->dr_aux.dr);
4435       stmt_info->dr_aux.dr = dr;
4436       stmt_info->dr_aux.stmt = stmt_info;
4437
4438       /* Check that analysis of the data-ref succeeded.  */
4439       if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
4440           || !DR_STEP (dr))
4441         {
4442           bool maybe_gather
4443             = DR_IS_READ (dr)
4444               && !TREE_THIS_VOLATILE (DR_REF (dr));
4445           bool maybe_scatter
4446             = DR_IS_WRITE (dr)
4447               && !TREE_THIS_VOLATILE (DR_REF (dr))
4448               && (targetm.vectorize.builtin_scatter != NULL
4449                   || supports_vec_scatter_store_p ());
4450
4451           /* If target supports vector gather loads or scatter stores,
4452              see if they can't be used.  */
4453           if (is_a <loop_vec_info> (vinfo)
4454               && !nested_in_vect_loop_p (loop, stmt_info))
4455             {
4456               if (maybe_gather || maybe_scatter)
4457                 {
4458                   if (maybe_gather)
4459                     gatherscatter = GATHER;
4460                   else
4461                     gatherscatter = SCATTER;
4462                 }
4463             }
4464
4465           if (gatherscatter == SG_NONE)
4466             {
4467               if (dump_enabled_p ())
4468                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4469                                  "not vectorized: data ref analysis "
4470                                  "failed %G", stmt_info->stmt);
4471               if (is_a <bb_vec_info> (vinfo))
4472                 {
4473                   /* In BB vectorization the ref can still participate
4474                      in dependence analysis, we just can't vectorize it.  */
4475                   STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4476                   continue;
4477                 }
4478               return opt_result::failure_at (stmt_info->stmt,
4479                                              "not vectorized:"
4480                                              " data ref analysis failed: %G",
4481                                              stmt_info->stmt);
4482             }
4483         }
4484
4485       /* See if this was detected as SIMD lane access.  */
4486       if (dr->aux == (void *)-1
4487           || dr->aux == (void *)-2
4488           || dr->aux == (void *)-3
4489           || dr->aux == (void *)-4)
4490         {
4491           if (nested_in_vect_loop_p (loop, stmt_info))
4492             return opt_result::failure_at (stmt_info->stmt,
4493                                            "not vectorized:"
4494                                            " data ref analysis failed: %G",
4495                                            stmt_info->stmt);
4496           STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info)
4497             = -(uintptr_t) dr->aux;
4498         }
4499
4500       tree base = get_base_address (DR_REF (dr));
4501       if (base && VAR_P (base) && DECL_NONALIASED (base))
4502         {
4503           if (dump_enabled_p ())
4504             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4505                              "not vectorized: base object not addressable "
4506                              "for stmt: %G", stmt_info->stmt);
4507           if (is_a <bb_vec_info> (vinfo))
4508             {
4509               /* In BB vectorization the ref can still participate
4510                  in dependence analysis, we just can't vectorize it.  */
4511               STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4512               continue;
4513             }
4514           return opt_result::failure_at (stmt_info->stmt,
4515                                          "not vectorized: base object not"
4516                                          " addressable for stmt: %G",
4517                                          stmt_info->stmt);
4518         }
4519
4520       if (is_a <loop_vec_info> (vinfo)
4521           && DR_STEP (dr)
4522           && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
4523         {
4524           if (nested_in_vect_loop_p (loop, stmt_info))
4525             return opt_result::failure_at (stmt_info->stmt,
4526                                            "not vectorized: "
4527                                            "not suitable for strided load %G",
4528                                            stmt_info->stmt);
4529           STMT_VINFO_STRIDED_P (stmt_info) = true;
4530         }
4531
4532       /* Update DR field in stmt_vec_info struct.  */
4533
4534       /* If the dataref is in an inner-loop of the loop that is considered for
4535          for vectorization, we also want to analyze the access relative to
4536          the outer-loop (DR contains information only relative to the
4537          inner-most enclosing loop).  We do that by building a reference to the
4538          first location accessed by the inner-loop, and analyze it relative to
4539          the outer-loop.  */
4540       if (loop && nested_in_vect_loop_p (loop, stmt_info))
4541         {
4542           /* Build a reference to the first location accessed by the
4543              inner loop: *(BASE + INIT + OFFSET).  By construction,
4544              this address must be invariant in the inner loop, so we
4545              can consider it as being used in the outer loop.  */
4546           tree base = unshare_expr (DR_BASE_ADDRESS (dr));
4547           tree offset = unshare_expr (DR_OFFSET (dr));
4548           tree init = unshare_expr (DR_INIT (dr));
4549           tree init_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset),
4550                                           init, offset);
4551           tree init_addr = fold_build_pointer_plus (base, init_offset);
4552           tree init_ref = build_fold_indirect_ref (init_addr);
4553
4554           if (dump_enabled_p ())
4555             dump_printf_loc (MSG_NOTE, vect_location,
4556                              "analyze in outer loop: %T\n", init_ref);
4557
4558           opt_result res
4559             = dr_analyze_innermost (&STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info),
4560                                     init_ref, loop, stmt_info->stmt);
4561           if (!res)
4562             /* dr_analyze_innermost already explained the failure.  */
4563             return res;
4564
4565           if (dump_enabled_p ())
4566             dump_printf_loc (MSG_NOTE, vect_location,
4567                              "\touter base_address: %T\n"
4568                              "\touter offset from base address: %T\n"
4569                              "\touter constant offset from base address: %T\n"
4570                              "\touter step: %T\n"
4571                              "\touter base alignment: %d\n\n"
4572                              "\touter base misalignment: %d\n"
4573                              "\touter offset alignment: %d\n"
4574                              "\touter step alignment: %d\n",
4575                              STMT_VINFO_DR_BASE_ADDRESS (stmt_info),
4576                              STMT_VINFO_DR_OFFSET (stmt_info),
4577                              STMT_VINFO_DR_INIT (stmt_info),
4578                              STMT_VINFO_DR_STEP (stmt_info),
4579                              STMT_VINFO_DR_BASE_ALIGNMENT (stmt_info),
4580                              STMT_VINFO_DR_BASE_MISALIGNMENT (stmt_info),
4581                              STMT_VINFO_DR_OFFSET_ALIGNMENT (stmt_info),
4582                              STMT_VINFO_DR_STEP_ALIGNMENT (stmt_info));
4583         }
4584
4585       /* Set vectype for STMT.  */
4586       scalar_type = TREE_TYPE (DR_REF (dr));
4587       tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type);
4588       if (!vectype)
4589         {
4590           if (dump_enabled_p ())
4591             {
4592               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4593                                "not vectorized: no vectype for stmt: %G",
4594                                stmt_info->stmt);
4595               dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
4596               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
4597                                  scalar_type);
4598               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
4599             }
4600
4601           if (is_a <bb_vec_info> (vinfo))
4602             {
4603               /* No vector type is fine, the ref can still participate
4604                  in dependence analysis, we just can't vectorize it.  */
4605               STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4606               continue;
4607             }
4608           if (fatal)
4609             *fatal = false;
4610           return opt_result::failure_at (stmt_info->stmt,
4611                                          "not vectorized:"
4612                                          " no vectype for stmt: %G"
4613                                          " scalar_type: %T\n",
4614                                          stmt_info->stmt, scalar_type);
4615         }
4616       else
4617         {
4618           if (dump_enabled_p ())
4619             dump_printf_loc (MSG_NOTE, vect_location,
4620                              "got vectype for stmt: %G%T\n",
4621                              stmt_info->stmt, vectype);
4622         }
4623
4624       /* Adjust the minimal vectorization factor according to the
4625          vector type.  */
4626       vf = TYPE_VECTOR_SUBPARTS (vectype);
4627       *min_vf = upper_bound (*min_vf, vf);
4628
4629       /* Leave the BB vectorizer to pick the vector type later, based on
4630          the final dataref group size and SLP node size.  */
4631       if (is_a <loop_vec_info> (vinfo))
4632         STMT_VINFO_VECTYPE (stmt_info) = vectype;
4633
4634       if (gatherscatter != SG_NONE)
4635         {
4636           gather_scatter_info gs_info;
4637           if (!vect_check_gather_scatter (stmt_info,
4638                                           as_a <loop_vec_info> (vinfo),
4639                                           &gs_info)
4640               || !get_vectype_for_scalar_type (vinfo,
4641                                                TREE_TYPE (gs_info.offset)))
4642             {
4643               if (fatal)
4644                 *fatal = false;
4645               return opt_result::failure_at
4646                         (stmt_info->stmt,
4647                          (gatherscatter == GATHER)
4648                          ? "not vectorized: not suitable for gather load %G"
4649                          : "not vectorized: not suitable for scatter store %G",
4650                          stmt_info->stmt);
4651             }
4652           STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
4653         }
4654     }
4655
4656   /* We used to stop processing and prune the list here.  Verify we no
4657      longer need to.  */
4658   gcc_assert (i == datarefs.length ());
4659
4660   return opt_result::success ();
4661 }
4662
4663
4664 /* Function vect_get_new_vect_var.
4665
4666    Returns a name for a new variable.  The current naming scheme appends the
4667    prefix "vect_" or "vect_p" (depending on the value of VAR_KIND) to
4668    the name of vectorizer generated variables, and appends that to NAME if
4669    provided.  */
4670
4671 tree
4672 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
4673 {
4674   const char *prefix;
4675   tree new_vect_var;
4676
4677   switch (var_kind)
4678   {
4679   case vect_simple_var:
4680     prefix = "vect";
4681     break;
4682   case vect_scalar_var:
4683     prefix = "stmp";
4684     break;
4685   case vect_mask_var:
4686     prefix = "mask";
4687     break;
4688   case vect_pointer_var:
4689     prefix = "vectp";
4690     break;
4691   default:
4692     gcc_unreachable ();
4693   }
4694
4695   if (name)
4696     {
4697       char* tmp = concat (prefix, "_", name, NULL);
4698       new_vect_var = create_tmp_reg (type, tmp);
4699       free (tmp);
4700     }
4701   else
4702     new_vect_var = create_tmp_reg (type, prefix);
4703
4704   return new_vect_var;
4705 }
4706
4707 /* Like vect_get_new_vect_var but return an SSA name.  */
4708
4709 tree
4710 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
4711 {
4712   const char *prefix;
4713   tree new_vect_var;
4714
4715   switch (var_kind)
4716   {
4717   case vect_simple_var:
4718     prefix = "vect";
4719     break;
4720   case vect_scalar_var:
4721     prefix = "stmp";
4722     break;
4723   case vect_pointer_var:
4724     prefix = "vectp";
4725     break;
4726   default:
4727     gcc_unreachable ();
4728   }
4729
4730   if (name)
4731     {
4732       char* tmp = concat (prefix, "_", name, NULL);
4733       new_vect_var = make_temp_ssa_name (type, NULL, tmp);
4734       free (tmp);
4735     }
4736   else
4737     new_vect_var = make_temp_ssa_name (type, NULL, prefix);
4738
4739   return new_vect_var;
4740 }
4741
4742 /* Duplicate points-to info on NAME from DR_INFO.  */
4743
4744 static void
4745 vect_duplicate_ssa_name_ptr_info (tree name, dr_vec_info *dr_info)
4746 {
4747   duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr_info->dr));
4748   /* DR_PTR_INFO is for a base SSA name, not including constant or
4749      variable offsets in the ref so its alignment info does not apply.  */
4750   mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
4751 }
4752
4753 /* Function vect_create_addr_base_for_vector_ref.
4754
4755    Create an expression that computes the address of the first memory location
4756    that will be accessed for a data reference.
4757
4758    Input:
4759    STMT_INFO: The statement containing the data reference.
4760    NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
4761    OFFSET: Optional. If supplied, it is be added to the initial address.
4762    LOOP:    Specify relative to which loop-nest should the address be computed.
4763             For example, when the dataref is in an inner-loop nested in an
4764             outer-loop that is now being vectorized, LOOP can be either the
4765             outer-loop, or the inner-loop.  The first memory location accessed
4766             by the following dataref ('in' points to short):
4767
4768                 for (i=0; i<N; i++)
4769                    for (j=0; j<M; j++)
4770                      s += in[i+j]
4771
4772             is as follows:
4773             if LOOP=i_loop:     &in             (relative to i_loop)
4774             if LOOP=j_loop:     &in+i*2B        (relative to j_loop)
4775
4776    Output:
4777    1. Return an SSA_NAME whose value is the address of the memory location of
4778       the first vector of the data reference.
4779    2. If new_stmt_list is not NULL_TREE after return then the caller must insert
4780       these statement(s) which define the returned SSA_NAME.
4781
4782    FORNOW: We are only handling array accesses with step 1.  */
4783
4784 tree
4785 vect_create_addr_base_for_vector_ref (vec_info *vinfo, stmt_vec_info stmt_info,
4786                                       gimple_seq *new_stmt_list,
4787                                       tree offset)
4788 {
4789   dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
4790   struct data_reference *dr = dr_info->dr;
4791   const char *base_name;
4792   tree addr_base;
4793   tree dest;
4794   gimple_seq seq = NULL;
4795   tree vect_ptr_type;
4796   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
4797   innermost_loop_behavior *drb = vect_dr_behavior (vinfo, dr_info);
4798
4799   tree data_ref_base = unshare_expr (drb->base_address);
4800   tree base_offset = unshare_expr (get_dr_vinfo_offset (vinfo, dr_info, true));
4801   tree init = unshare_expr (drb->init);
4802
4803   if (loop_vinfo)
4804     base_name = get_name (data_ref_base);
4805   else
4806     {
4807       base_offset = ssize_int (0);
4808       init = ssize_int (0);
4809       base_name = get_name (DR_REF (dr));
4810     }
4811
4812   /* Create base_offset */
4813   base_offset = size_binop (PLUS_EXPR,
4814                             fold_convert (sizetype, base_offset),
4815                             fold_convert (sizetype, init));
4816
4817   if (offset)
4818     {
4819       offset = fold_convert (sizetype, offset);
4820       base_offset = fold_build2 (PLUS_EXPR, sizetype,
4821                                  base_offset, offset);
4822     }
4823
4824   /* base + base_offset */
4825   if (loop_vinfo)
4826     addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
4827   else
4828     {
4829       addr_base = build1 (ADDR_EXPR,
4830                           build_pointer_type (TREE_TYPE (DR_REF (dr))),
4831                           unshare_expr (DR_REF (dr)));
4832     }
4833
4834   vect_ptr_type = build_pointer_type (TREE_TYPE (DR_REF (dr)));
4835   dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
4836   addr_base = force_gimple_operand (addr_base, &seq, true, dest);
4837   gimple_seq_add_seq (new_stmt_list, seq);
4838
4839   if (DR_PTR_INFO (dr)
4840       && TREE_CODE (addr_base) == SSA_NAME
4841       /* We should only duplicate pointer info to newly created SSA names.  */
4842       && SSA_NAME_VAR (addr_base) == dest)
4843     {
4844       gcc_assert (!SSA_NAME_PTR_INFO (addr_base));
4845       vect_duplicate_ssa_name_ptr_info (addr_base, dr_info);
4846     }
4847
4848   if (dump_enabled_p ())
4849     dump_printf_loc (MSG_NOTE, vect_location, "created %T\n", addr_base);
4850
4851   return addr_base;
4852 }
4853
4854
4855 /* Function vect_create_data_ref_ptr.
4856
4857    Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
4858    location accessed in the loop by STMT_INFO, along with the def-use update
4859    chain to appropriately advance the pointer through the loop iterations.
4860    Also set aliasing information for the pointer.  This pointer is used by
4861    the callers to this function to create a memory reference expression for
4862    vector load/store access.
4863
4864    Input:
4865    1. STMT_INFO: a stmt that references memory. Expected to be of the form
4866          GIMPLE_ASSIGN <name, data-ref> or
4867          GIMPLE_ASSIGN <data-ref, name>.
4868    2. AGGR_TYPE: the type of the reference, which should be either a vector
4869         or an array.
4870    3. AT_LOOP: the loop where the vector memref is to be created.
4871    4. OFFSET (optional): a byte offset to be added to the initial address
4872         accessed by the data-ref in STMT_INFO.
4873    5. BSI: location where the new stmts are to be placed if there is no loop
4874    6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
4875         pointing to the initial address.
4876    8. IV_STEP (optional, defaults to NULL): the amount that should be added
4877         to the IV during each iteration of the loop.  NULL says to move
4878         by one copy of AGGR_TYPE up or down, depending on the step of the
4879         data reference.
4880
4881    Output:
4882    1. Declare a new ptr to vector_type, and have it point to the base of the
4883       data reference (initial addressed accessed by the data reference).
4884       For example, for vector of type V8HI, the following code is generated:
4885
4886       v8hi *ap;
4887       ap = (v8hi *)initial_address;
4888
4889       if OFFSET is not supplied:
4890          initial_address = &a[init];
4891       if OFFSET is supplied:
4892          initial_address = &a[init] + OFFSET;
4893       if BYTE_OFFSET is supplied:
4894          initial_address = &a[init] + BYTE_OFFSET;
4895
4896       Return the initial_address in INITIAL_ADDRESS.
4897
4898    2. If ONLY_INIT is true, just return the initial pointer.  Otherwise, also
4899       update the pointer in each iteration of the loop.
4900
4901       Return the increment stmt that updates the pointer in PTR_INCR.
4902
4903    3. Return the pointer.  */
4904
4905 tree
4906 vect_create_data_ref_ptr (vec_info *vinfo, stmt_vec_info stmt_info,
4907                           tree aggr_type, class loop *at_loop, tree offset,
4908                           tree *initial_address, gimple_stmt_iterator *gsi,
4909                           gimple **ptr_incr, bool only_init,
4910                           tree iv_step)
4911 {
4912   const char *base_name;
4913   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
4914   class loop *loop = NULL;
4915   bool nested_in_vect_loop = false;
4916   class loop *containing_loop = NULL;
4917   tree aggr_ptr_type;
4918   tree aggr_ptr;
4919   tree new_temp;
4920   gimple_seq new_stmt_list = NULL;
4921   edge pe = NULL;
4922   basic_block new_bb;
4923   tree aggr_ptr_init;
4924   dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
4925   struct data_reference *dr = dr_info->dr;
4926   tree aptr;
4927   gimple_stmt_iterator incr_gsi;
4928   bool insert_after;
4929   tree indx_before_incr, indx_after_incr;
4930   gimple *incr;
4931   bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo);
4932
4933   gcc_assert (iv_step != NULL_TREE
4934               || TREE_CODE (aggr_type) == ARRAY_TYPE
4935               || TREE_CODE (aggr_type) == VECTOR_TYPE);
4936
4937   if (loop_vinfo)
4938     {
4939       loop = LOOP_VINFO_LOOP (loop_vinfo);
4940       nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info);
4941       containing_loop = (gimple_bb (stmt_info->stmt))->loop_father;
4942       pe = loop_preheader_edge (loop);
4943     }
4944   else
4945     {
4946       gcc_assert (bb_vinfo);
4947       only_init = true;
4948       *ptr_incr = NULL;
4949     }
4950
4951   /* Create an expression for the first address accessed by this load
4952      in LOOP.  */
4953   base_name = get_name (DR_BASE_ADDRESS (dr));
4954
4955   if (dump_enabled_p ())
4956     {
4957       tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4958       dump_printf_loc (MSG_NOTE, vect_location,
4959                        "create %s-pointer variable to type: %T",
4960                        get_tree_code_name (TREE_CODE (aggr_type)),
4961                        aggr_type);
4962       if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4963         dump_printf (MSG_NOTE, "  vectorizing an array ref: ");
4964       else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4965         dump_printf (MSG_NOTE, "  vectorizing a vector ref: ");
4966       else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4967         dump_printf (MSG_NOTE, "  vectorizing a record based array ref: ");
4968       else
4969         dump_printf (MSG_NOTE, "  vectorizing a pointer ref: ");
4970       dump_printf (MSG_NOTE, "%T\n", DR_BASE_OBJECT (dr));
4971     }
4972
4973   /* (1) Create the new aggregate-pointer variable.
4974      Vector and array types inherit the alias set of their component
4975      type by default so we need to use a ref-all pointer if the data
4976      reference does not conflict with the created aggregated data
4977      reference because it is not addressable.  */
4978   bool need_ref_all = false;
4979   if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4980                               get_alias_set (DR_REF (dr))))
4981     need_ref_all = true;
4982   /* Likewise for any of the data references in the stmt group.  */
4983   else if (DR_GROUP_SIZE (stmt_info) > 1)
4984     {
4985       stmt_vec_info sinfo = DR_GROUP_FIRST_ELEMENT (stmt_info);
4986       do
4987         {
4988           struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
4989           if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4990                                       get_alias_set (DR_REF (sdr))))
4991             {
4992               need_ref_all = true;
4993               break;
4994             }
4995           sinfo = DR_GROUP_NEXT_ELEMENT (sinfo);
4996         }
4997       while (sinfo);
4998     }
4999   aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
5000                                                need_ref_all);
5001   aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
5002
5003
5004   /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
5005      vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
5006      def-use update cycles for the pointer: one relative to the outer-loop
5007      (LOOP), which is what steps (3) and (4) below do.  The other is relative
5008      to the inner-loop (which is the inner-most loop containing the dataref),
5009      and this is done be step (5) below.
5010
5011      When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
5012      inner-most loop, and so steps (3),(4) work the same, and step (5) is
5013      redundant.  Steps (3),(4) create the following:
5014
5015         vp0 = &base_addr;
5016         LOOP:   vp1 = phi(vp0,vp2)
5017                 ...
5018                 ...
5019                 vp2 = vp1 + step
5020                 goto LOOP
5021
5022      If there is an inner-loop nested in loop, then step (5) will also be
5023      applied, and an additional update in the inner-loop will be created:
5024
5025         vp0 = &base_addr;
5026         LOOP:   vp1 = phi(vp0,vp2)
5027                 ...
5028         inner:     vp3 = phi(vp1,vp4)
5029                    vp4 = vp3 + inner_step
5030                    if () goto inner
5031                 ...
5032                 vp2 = vp1 + step
5033                 if () goto LOOP   */
5034
5035   /* (2) Calculate the initial address of the aggregate-pointer, and set
5036      the aggregate-pointer to point to it before the loop.  */
5037
5038   /* Create: (&(base[init_val]+offset) in the loop preheader.  */
5039
5040   new_temp = vect_create_addr_base_for_vector_ref (vinfo,
5041                                                    stmt_info, &new_stmt_list,
5042                                                    offset);
5043   if (new_stmt_list)
5044     {
5045       if (pe)
5046         {
5047           new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
5048           gcc_assert (!new_bb);
5049         }
5050       else
5051         gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
5052     }
5053
5054   *initial_address = new_temp;
5055   aggr_ptr_init = new_temp;
5056
5057   /* (3) Handle the updating of the aggregate-pointer inside the loop.
5058      This is needed when ONLY_INIT is false, and also when AT_LOOP is the
5059      inner-loop nested in LOOP (during outer-loop vectorization).  */
5060
5061   /* No update in loop is required.  */
5062   if (only_init && (!loop_vinfo || at_loop == loop))
5063     aptr = aggr_ptr_init;
5064   else
5065     {
5066       /* Accesses to invariant addresses should be handled specially
5067          by the caller.  */
5068       tree step = vect_dr_behavior (vinfo, dr_info)->step;
5069       gcc_assert (!integer_zerop (step));
5070
5071       if (iv_step == NULL_TREE)
5072         {
5073           /* The step of the aggregate pointer is the type size,
5074              negated for downward accesses.  */
5075           iv_step = TYPE_SIZE_UNIT (aggr_type);
5076           if (tree_int_cst_sgn (step) == -1)
5077             iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
5078         }
5079
5080       standard_iv_increment_position (loop, &incr_gsi, &insert_after);
5081
5082       create_iv (aggr_ptr_init,
5083                  fold_convert (aggr_ptr_type, iv_step),
5084                  aggr_ptr, loop, &incr_gsi, insert_after,
5085                  &indx_before_incr, &indx_after_incr);
5086       incr = gsi_stmt (incr_gsi);
5087
5088       /* Copy the points-to information if it exists. */
5089       if (DR_PTR_INFO (dr))
5090         {
5091           vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr_info);
5092           vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr_info);
5093         }
5094       if (ptr_incr)
5095         *ptr_incr = incr;
5096
5097       aptr = indx_before_incr;
5098     }
5099
5100   if (!nested_in_vect_loop || only_init)
5101     return aptr;
5102
5103
5104   /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
5105      nested in LOOP, if exists.  */
5106
5107   gcc_assert (nested_in_vect_loop);
5108   if (!only_init)
5109     {
5110       standard_iv_increment_position (containing_loop, &incr_gsi,
5111                                       &insert_after);
5112       create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
5113                  containing_loop, &incr_gsi, insert_after, &indx_before_incr,
5114                  &indx_after_incr);
5115       incr = gsi_stmt (incr_gsi);
5116
5117       /* Copy the points-to information if it exists. */
5118       if (DR_PTR_INFO (dr))
5119         {
5120           vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr_info);
5121           vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr_info);
5122         }
5123       if (ptr_incr)
5124         *ptr_incr = incr;
5125
5126       return indx_before_incr;
5127     }
5128   else
5129     gcc_unreachable ();
5130 }
5131
5132
5133 /* Function bump_vector_ptr
5134
5135    Increment a pointer (to a vector type) by vector-size. If requested,
5136    i.e. if PTR-INCR is given, then also connect the new increment stmt
5137    to the existing def-use update-chain of the pointer, by modifying
5138    the PTR_INCR as illustrated below:
5139
5140    The pointer def-use update-chain before this function:
5141                         DATAREF_PTR = phi (p_0, p_2)
5142                         ....
5143         PTR_INCR:       p_2 = DATAREF_PTR + step
5144
5145    The pointer def-use update-chain after this function:
5146                         DATAREF_PTR = phi (p_0, p_2)
5147                         ....
5148                         NEW_DATAREF_PTR = DATAREF_PTR + BUMP
5149                         ....
5150         PTR_INCR:       p_2 = NEW_DATAREF_PTR + step
5151
5152    Input:
5153    DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
5154                  in the loop.
5155    PTR_INCR - optional. The stmt that updates the pointer in each iteration of
5156               the loop.  The increment amount across iterations is expected
5157               to be vector_size.
5158    BSI - location where the new update stmt is to be placed.
5159    STMT_INFO - the original scalar memory-access stmt that is being vectorized.
5160    BUMP - optional. The offset by which to bump the pointer. If not given,
5161           the offset is assumed to be vector_size.
5162
5163    Output: Return NEW_DATAREF_PTR as illustrated above.
5164
5165 */
5166
5167 tree
5168 bump_vector_ptr (vec_info *vinfo,
5169                  tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
5170                  stmt_vec_info stmt_info, tree bump)
5171 {
5172   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
5173   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5174   tree update = TYPE_SIZE_UNIT (vectype);
5175   gimple *incr_stmt;
5176   ssa_op_iter iter;
5177   use_operand_p use_p;
5178   tree new_dataref_ptr;
5179
5180   if (bump)
5181     update = bump;
5182
5183   if (TREE_CODE (dataref_ptr) == SSA_NAME)
5184     new_dataref_ptr = copy_ssa_name (dataref_ptr);
5185   else
5186     new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
5187   incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
5188                                    dataref_ptr, update);
5189   vect_finish_stmt_generation (vinfo, stmt_info, incr_stmt, gsi);
5190   /* Fold the increment, avoiding excessive chains use-def chains of
5191      those, leading to compile-time issues for passes until the next
5192      forwprop pass which would do this as well.  */
5193   gimple_stmt_iterator fold_gsi = gsi_for_stmt (incr_stmt);
5194   if (fold_stmt (&fold_gsi, follow_all_ssa_edges))
5195     {
5196       incr_stmt = gsi_stmt (fold_gsi);
5197       update_stmt (incr_stmt);
5198     }
5199
5200   /* Copy the points-to information if it exists. */
5201   if (DR_PTR_INFO (dr))
5202     {
5203       duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
5204       mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
5205     }
5206
5207   if (!ptr_incr)
5208     return new_dataref_ptr;
5209
5210   /* Update the vector-pointer's cross-iteration increment.  */
5211   FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
5212     {
5213       tree use = USE_FROM_PTR (use_p);
5214
5215       if (use == dataref_ptr)
5216         SET_USE (use_p, new_dataref_ptr);
5217       else
5218         gcc_assert (operand_equal_p (use, update, 0));
5219     }
5220
5221   return new_dataref_ptr;
5222 }
5223
5224
5225 /* Copy memory reference info such as base/clique from the SRC reference
5226    to the DEST MEM_REF.  */
5227
5228 void
5229 vect_copy_ref_info (tree dest, tree src)
5230 {
5231   if (TREE_CODE (dest) != MEM_REF)
5232     return;
5233
5234   tree src_base = src;
5235   while (handled_component_p (src_base))
5236     src_base = TREE_OPERAND (src_base, 0);
5237   if (TREE_CODE (src_base) != MEM_REF
5238       && TREE_CODE (src_base) != TARGET_MEM_REF)
5239     return;
5240
5241   MR_DEPENDENCE_CLIQUE (dest) = MR_DEPENDENCE_CLIQUE (src_base);
5242   MR_DEPENDENCE_BASE (dest) = MR_DEPENDENCE_BASE (src_base);
5243 }
5244
5245
5246 /* Function vect_create_destination_var.
5247
5248    Create a new temporary named after the SSA name SCALAR_DEST.  Its type is
5249    VECTYPE when that is nonnull and the type of SCALAR_DEST otherwise.  */
5250
5251 tree
5252 vect_create_destination_var (tree scalar_dest, tree vectype)
5253 {
5254   enum vect_var_kind var_kind = vect_scalar_var;
5255   tree var_type;
5256
5257   /* Choose the kind and type of the new variable.  */
5258   if (!vectype)
5259     var_type = TREE_TYPE (scalar_dest);
5260   else
5261     {
5262       var_type = vectype;
5263       var_kind = (VECTOR_BOOLEAN_TYPE_P (vectype)
5264                   ? vect_mask_var : vect_simple_var);
5265     }
5266
5267   gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
5268
5269   /* "<name>_<version>", or "_<version>" when get_name returns null.  */
5270   const char *base = get_name (scalar_dest);
5271   unsigned int version = SSA_NAME_VERSION (scalar_dest);
5272   char *label = (base ? xasprintf ("%s_%u", base, version)
5273                  : xasprintf ("_%u", version));
5274   tree result = vect_get_new_vect_var (var_type, var_kind, label);
5275   free (label);
5276   return result;
5277 }
5278
5279 /* Function vect_grouped_store_supported.
5280
5281    Return TRUE if the target supports the permutations needed to interleave
5282    a group of COUNT stores of vectors of type VECTYPE, and FALSE otherwise.  */
5283
5284 bool
5285 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
5286 {
5287   machine_mode mode = TYPE_MODE (vectype);
5288
5289   /* vect_permute_store_chain requires the group size to be equal to 3 or
5290      be a power of two.  */
5291   if (count != 3 && exact_log2 (count) == -1)
5292     {
5293       if (dump_enabled_p ())
5294         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5295                          "the size of the group of accesses"
5296                          " is not a power of 2 or not equal to 3\n");
5297       return false;
5298     }
5299
5300   /* Check that the permutation is supported.  */
5301   if (VECTOR_MODE_P (mode))
5302     {
5303       unsigned int i;
5304       if (count == 3)
5305         {
5306           unsigned int j0 = 0, j1 = 0, j2 = 0;
5307           unsigned int j;
5308
5309           unsigned int nelt;
5310           if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5311             {
5312               if (dump_enabled_p ())
5313                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5314                                  "cannot handle groups of 3 stores for"
5315                                  " variable-length vectors\n");
5316               return false;
5317             }
5318           /* Check both masks used by vect_permute_store_chain per vector.  */
5319           vec_perm_builder sel (nelt, nelt, 1);
5320           sel.quick_grow (nelt);
5321           vec_perm_indices indices;
5322           for (j = 0; j < 3; j++)
5323             {
5324               int nelt0 = ((3 - j) * nelt) % 3;
5325               int nelt1 = ((3 - j) * nelt + 1) % 3;
5326               int nelt2 = ((3 - j) * nelt + 2) % 3;
5327               for (i = 0; i < nelt; i++)
5328                 {
5329                   if (3 * i + nelt0 < nelt)
5330                     sel[3 * i + nelt0] = j0++;
5331                   if (3 * i + nelt1 < nelt)
5332                     sel[3 * i + nelt1] = nelt + j1++;
5333                   if (3 * i + nelt2 < nelt)
5334                     sel[3 * i + nelt2] = 0;
5335                 }
5336               indices.new_vector (sel, 2, nelt);
5337               if (!can_vec_perm_const_p (mode, indices))
5338                 {
5339                   if (dump_enabled_p ())
5340                     dump_printf (MSG_MISSED_OPTIMIZATION,
5341                                  "permutation op not supported by target.\n");
5342                   return false;
5343                 }
5344               /* Second mask: keep the merged lanes, fill the rest from operand 2.  */
5345               for (i = 0; i < nelt; i++)
5346                 {
5347                   if (3 * i + nelt0 < nelt)
5348                     sel[3 * i + nelt0] = 3 * i + nelt0;
5349                   if (3 * i + nelt1 < nelt)
5350                     sel[3 * i + nelt1] = 3 * i + nelt1;
5351                   if (3 * i + nelt2 < nelt)
5352                     sel[3 * i + nelt2] = nelt + j2++;
5353                 }
5354               indices.new_vector (sel, 2, nelt);
5355               if (!can_vec_perm_const_p (mode, indices))
5356                 {
5357                   if (dump_enabled_p ())
5358                     dump_printf (MSG_MISSED_OPTIMIZATION,
5359                                  "permutation op not supported by target.\n");
5360                   return false;
5361                 }
5362             }
5363           return true;
5364         }
5365       else
5366         {
5367           /* If length is not equal to 3 then only power of 2 is supported.  */
5368           gcc_assert (pow2p_hwi (count));
5369           poly_uint64 nelt = GET_MODE_NUNITS (mode);
5370           /* The encoding has 2 interleaved stepped patterns; the first mask
5371              is {0, nelt, 1, nelt + 1, ...}, the second adds nelt / 2 to it.  */
5372           vec_perm_builder sel (nelt, 2, 3);
5373           sel.quick_grow (6);
5374           for (i = 0; i < 3; i++)
5375             {
5376               sel[i * 2] = i;
5377               sel[i * 2 + 1] = i + nelt;
5378             }
5379           vec_perm_indices indices (sel, 2, nelt);
5380           if (can_vec_perm_const_p (mode, indices))
5381             {
5382               for (i = 0; i < 6; i++)
5383                 sel[i] += exact_div (nelt, 2);
5384               indices.new_vector (sel, 2, nelt);
5385               if (can_vec_perm_const_p (mode, indices))
5386                 return true;
5387             }
5388         }
5389     }
5390
5391   if (dump_enabled_p ())
5392     dump_printf (MSG_MISSED_OPTIMIZATION,
5393                  "permutation op not supported by target.\n");
5394   return false;
5395 }
5396
5397
5398 /* Return TRUE if vec_store_lanes (or vec_mask_store_lanes when MASKED_P) is
5399    available for COUNT vectors of type VECTYPE.  */
5400
5401 bool
5402 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5403                             bool masked_p)
5404 {
5405   /* Pick the optab name and code according to MASKED_P.  */
5406   const char *optab_name
5407     = masked_p ? "vec_mask_store_lanes" : "vec_store_lanes";
5408   return vect_lanes_optab_supported_p (optab_name,
5409                                        (masked_p
5410                                         ? vec_mask_store_lanes_optab
5411                                         : vec_store_lanes_optab),
5412                                        vectype, count);
5413 }
5414
5415
5416 /* Function vect_permute_store_chain.
5417
5418    Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
5419    a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
5420    the data correctly for the stores.  Return the final references for stores
5421    in RESULT_CHAIN.  DR_CHAIN is overwritten when LENGTH is a power of 2 > 1.
5422
5423    E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5424    The input is 4 vectors each containing 8 elements.  We assign a number to
5425    each element, the input sequence is:
5426
5427    1st vec:   0  1  2  3  4  5  6  7
5428    2nd vec:   8  9 10 11 12 13 14 15
5429    3rd vec:  16 17 18 19 20 21 22 23
5430    4th vec:  24 25 26 27 28 29 30 31
5431
5432    The output sequence should be:
5433
5434    1st vec:  0  8 16 24  1  9 17 25
5435    2nd vec:  2 10 18 26  3 11 19 27
5436    3rd vec:  4 12 20 28  5 13 21 29
5437    4th vec:  6 14 22 30  7 15 23 31
5438
5439    i.e., we interleave the contents of the four vectors in their order.
5440
5441    We use interleave_high/low instructions to create such output.  The input of
5442    each interleave_high/low operation is two vectors:
5443    1st vec    2nd vec
5444    0 1 2 3    4 5 6 7
5445    the even elements of the result vector are obtained left-to-right from the
5446    high/low elements of the first vector.  The odd elements of the result are
5447    obtained left-to-right from the high/low elements of the second vector.
5448    The output of interleave_high will be:   0 4 1 5
5449    and of interleave_low:                   2 6 3 7
5450
5451
5452    The permutation is done in log LENGTH stages.  In each stage interleave_high
5453    and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
5454    where the first argument is taken from the first half of DR_CHAIN and the
5455    second argument from its second half.
5456    In our example,
5457
5458    I1: interleave_high (1st vec, 3rd vec)
5459    I2: interleave_low (1st vec, 3rd vec)
5460    I3: interleave_high (2nd vec, 4th vec)
5461    I4: interleave_low (2nd vec, 4th vec)
5462
5463    The output for the first stage is:
5464
5465    I1:  0 16  1 17  2 18  3 19
5466    I2:  4 20  5 21  6 22  7 23
5467    I3:  8 24  9 25 10 26 11 27
5468    I4: 12 28 13 29 14 30 15 31
5469
5470    The output of the second stage, i.e. the final result is:
5471
5472    I1:  0  8 16 24  1  9 17 25
5473    I2:  2 10 18 26  3 11 19 27
5474    I3:  4 12 20 28  5 13 21 29
5475    I4:  6 14 22 30  7 15 23 31.  */
5476
5477 void
5478 vect_permute_store_chain (vec_info *vinfo, vec<tree> &dr_chain,
5479                           unsigned int length,
5480                           stmt_vec_info stmt_info,
5481                           gimple_stmt_iterator *gsi,
5482                           vec<tree> *result_chain)
5483 {
5484   tree vect1, vect2, high, low;
5485   gimple *perm_stmt;
5486   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5487   tree perm_mask_low, perm_mask_high;
5488   tree data_ref;
5489   tree perm3_mask_low, perm3_mask_high;
5490   unsigned int i, j, n, log_length = exact_log2 (length);
5491   /* Start RESULT_CHAIN as a copy of the first LENGTH entries of DR_CHAIN.  */
5492   result_chain->quick_grow (length);
5493   memcpy (result_chain->address (), dr_chain.address (),
5494           length * sizeof (tree));
5495
5496   if (length == 3)
5497     {
5498       /* vect_grouped_store_supported ensures that this is constant.  */
5499       unsigned int nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5500       unsigned int j0 = 0, j1 = 0, j2 = 0;
5501
5502       vec_perm_builder sel (nelt, nelt, 1);
5503       sel.quick_grow (nelt);
5504       vec_perm_indices indices;
5505       for (j = 0; j < 3; j++)
5506         {
5507           int nelt0 = ((3 - j) * nelt) % 3;
5508           int nelt1 = ((3 - j) * nelt + 1) % 3;
5509           int nelt2 = ((3 - j) * nelt + 2) % 3;
5510           /* First mask: merge dr_chain[0] and dr_chain[1]; the lanes at
              offset nelt2 get a dummy index 0 and are replaced below.  */
5511           for (i = 0; i < nelt; i++)
5512             {
5513               if (3 * i + nelt0 < nelt)
5514                 sel[3 * i + nelt0] = j0++;
5515               if (3 * i + nelt1 < nelt)
5516                 sel[3 * i + nelt1] = nelt + j1++;
5517               if (3 * i + nelt2 < nelt)
5518                 sel[3 * i + nelt2] = 0;
5519             }
5520           indices.new_vector (sel, 2, nelt);
5521           perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5522           /* Second mask: keep those lanes, take the rest from dr_chain[2].  */
5523           for (i = 0; i < nelt; i++)
5524             {
5525               if (3 * i + nelt0 < nelt)
5526                 sel[3 * i + nelt0] = 3 * i + nelt0;
5527               if (3 * i + nelt1 < nelt)
5528                 sel[3 * i + nelt1] = 3 * i + nelt1;
5529               if (3 * i + nelt2 < nelt)
5530                 sel[3 * i + nelt2] = nelt + j2++;
5531             }
5532           indices.new_vector (sel, 2, nelt);
5533           perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5534
5535           vect1 = dr_chain[0];
5536           vect2 = dr_chain[1];
5537
5538           /* Create interleaving stmt:
5539              low = VEC_PERM_EXPR <vect1, vect2,
5540                                   {j, nelt, *, j + 1, nelt + j + 1, *,
5541                                    j + 2, nelt + j + 2, *, ...}>  */
5542           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5543           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5544                                            vect2, perm3_mask_low);
5545           vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5546
5547           vect1 = data_ref;
5548           vect2 = dr_chain[2];
5549           /* Create interleaving stmt:
5550              high = VEC_PERM_EXPR <vect1, vect2,
5551                                    {0, 1, nelt + j, 3, 4, nelt + j + 1,
5552                                     6, 7, nelt + j + 2, ...}>  */
5553           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5554           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5555                                            vect2, perm3_mask_high);
5556           vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5557           (*result_chain)[j] = data_ref;
5558         }
5559     }
5560   else
5561     {
5562       /* If length is not equal to 3 then only power of 2 is supported.  */
5563       gcc_assert (pow2p_hwi (length));
5564
5565       /* The encoding has 2 interleaved stepped patterns.  */
5566       poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5567       vec_perm_builder sel (nelt, 2, 3);
5568       sel.quick_grow (6);
5569       for (i = 0; i < 3; i++)
5570         {
5571           sel[i * 2] = i;
5572           sel[i * 2 + 1] = i + nelt;
5573         }
5574         vec_perm_indices indices (sel, 2, nelt);
5575         perm_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5576         /* Shift every index by nelt / 2 to get the interleave_low mask.  */
5577         for (i = 0; i < 6; i++)
5578           sel[i] += exact_div (nelt, 2);
5579         indices.new_vector (sel, 2, nelt);
5580         perm_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5581         /* Run log2 (LENGTH) stages; DR_CHAIN is refreshed after each one.  */
5582         for (i = 0, n = log_length; i < n; i++)
5583           {
5584             for (j = 0; j < length/2; j++)
5585               {
5586                 vect1 = dr_chain[j];
5587                 vect2 = dr_chain[j+length/2];
5588
5589                 /* Create interleaving stmt:
5590                    high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
5591                                                         ...}>  */
5592                 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
5593                 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
5594                                                  vect2, perm_mask_high);
5595                 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5596                 (*result_chain)[2*j] = high;
5597
5598                 /* Create interleaving stmt:
5599                    low = VEC_PERM_EXPR <vect1, vect2,
5600                                         {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
5601                                          ...}>  */
5602                 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
5603                 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
5604                                                  vect2, perm_mask_low);
5605                 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5606                 (*result_chain)[2*j+1] = low;
5607               }
5608             memcpy (dr_chain.address (), result_chain->address (),
5609                     length * sizeof (tree));
5610           }
5611     }
5612 }
5613
5614 /* Function vect_setup_realignment
5615
5616    This function is called when vectorizing an unaligned load using
5617    the dr_explicit_realign[_optimized] scheme.
5618    This function generates the following code at the loop prolog:
5619
5620       p = initial_addr;
5621    x  msq_init = *(floor(p));   # prolog load
5622       realignment_token = call target_builtin;
5623     loop:
5624    x  msq = phi (msq_init, ---)
5625
5626    The stmts marked with x are generated only for the case of
5627    dr_explicit_realign_optimized.
5628
5629    The code above sets up a new (vector) pointer, pointing to the first
5630    location accessed by STMT_INFO, and a "floor-aligned" load using that
5631    pointer.  It also generates code to compute the "realignment-token"
5632    (if the relevant target hook was defined), and creates a phi-node at the
5633    loop-header bb whose arguments are the result of the prolog-load (created
5634    by this function) and the result of a load that takes place in the loop
5635    (to be created by the caller to this function).
5636
5637    For the case of dr_explicit_realign_optimized:
5638    The caller to this function uses the phi-result (msq) to create the
5639    realignment code inside the loop, and sets up the missing phi argument,
5640    as follows:
5641     loop:
5642       msq = phi (msq_init, lsq)
5643       lsq = *(floor(p'));        # load in loop
5644       result = realign_load (msq, lsq, realignment_token);
5645
5646    For the case of dr_explicit_realign:
5647     loop:
5648       msq = *(floor(p));        # load in loop
5649       p' = p + (VS-1);
5650       lsq = *(floor(p'));       # load in loop
5651       result = realign_load (msq, lsq, realignment_token);
5652
5653    Input:
5654    STMT_INFO - (scalar) load stmt to be vectorized.  This load accesses
5655                a memory location that may be unaligned.
5656    GSI - place where new code is to be inserted.
5657    ALIGNMENT_SUPPORT_SCHEME - which of the two realignment schemes is used.
5658    INIT_ADDR - address already computed by the caller, or NULL_TREE.
5659
5660    Output:
5661    REALIGNMENT_TOKEN - result of the builtin_mask_for_load target hook, if any.
5662    AT_LOOP - if nonnull, set to the loop chosen for the initial vector load.
5663    Return value - the phi result, or NULL_TREE for dr_explicit_realign.  */
5664
5665 tree
5666 vect_setup_realignment (vec_info *vinfo, stmt_vec_info stmt_info,
5667                         gimple_stmt_iterator *gsi, tree *realignment_token,
5668                         enum dr_alignment_support alignment_support_scheme,
5669                         tree init_addr,
5670                         class loop **at_loop)
5671 {
5672   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5673   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
5674   dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
5675   struct data_reference *dr = dr_info->dr;
5676   class loop *loop = NULL;
5677   edge pe = NULL;
5678   tree scalar_dest = gimple_assign_lhs (stmt_info->stmt);
5679   tree vec_dest;
5680   gimple *inc;
5681   tree ptr;
5682   tree data_ref;
5683   basic_block new_bb;
5684   tree msq_init = NULL_TREE;
5685   tree new_temp;
5686   gphi *phi_stmt;
5687   tree msq = NULL_TREE;
5688   gimple_seq stmts = NULL;
5689   bool compute_in_loop = false;
5690   bool nested_in_vect_loop = false;
5691   class loop *containing_loop = (gimple_bb (stmt_info->stmt))->loop_father;
5692   class loop *loop_for_initial_load = NULL;
5693
5694   if (loop_vinfo)
5695     {
5696       loop = LOOP_VINFO_LOOP (loop_vinfo);
5697       nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info);
5698     }
5699
5700   gcc_assert (alignment_support_scheme == dr_explicit_realign
5701               || alignment_support_scheme == dr_explicit_realign_optimized);
5702
5703   /* We need to generate three things:
5704      1. the misalignment computation
5705      2. the extra vector load (for the optimized realignment scheme).
5706      3. the phi node for the two vectors from which the realignment is
5707       done (for the optimized realignment scheme).  */
5708
5709   /* 1. Determine where to generate the misalignment computation.
5710
5711      If INIT_ADDR is NULL_TREE, this indicates that the misalignment
5712      calculation will be generated by this function, outside the loop (in the
5713      preheader).  Otherwise, INIT_ADDR had already been computed for us by the
5714      caller, inside the loop.
5715
5716      Background: If the misalignment remains fixed throughout the iterations of
5717      the loop, then both realignment schemes are applicable, and also the
5718      misalignment computation can be done outside LOOP.  This is because we are
5719      vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
5720      are a multiple of VS (the Vector Size), and therefore the misalignment in
5721      different vectorized LOOP iterations is always the same.
5722      The problem arises only if the memory access is in an inner-loop nested
5723      inside LOOP, which is now being vectorized using outer-loop vectorization.
5724      This is the only case when the misalignment of the memory access may not
5725      remain fixed throughout the iterations of the inner-loop (as explained in
5726      detail in vect_supportable_dr_alignment).  In this case, not only is the
5727      optimized realignment scheme not applicable, but also the misalignment
5728      computation (and generation of the realignment token that is passed to
5729      REALIGN_LOAD) have to be done inside the loop.
5730
5731      In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
5732      or not, which in turn determines if the misalignment is computed inside
5733      the inner-loop, or outside LOOP.  */
5734
5735   if (init_addr != NULL_TREE || !loop_vinfo)
5736     {
5737       compute_in_loop = true;
5738       gcc_assert (alignment_support_scheme == dr_explicit_realign);
5739     }
5740
5741
5742   /* 2. Determine where to generate the extra vector load.
5743
5744      For the optimized realignment scheme, instead of generating two vector
5745      loads in each iteration, we generate a single extra vector load in the
5746      preheader of the loop, and in each iteration reuse the result of the
5747      vector load from the previous iteration.  In case the memory access is in
5748      an inner-loop nested inside LOOP, which is now being vectorized using
5749      outer-loop vectorization, we need to determine whether this initial vector
5750      load should be generated at the preheader of the inner-loop, or can be
5751      generated at the preheader of LOOP.  If the memory access has no evolution
5752      in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
5753      to be generated inside LOOP (in the preheader of the inner-loop).  */
5754
5755   if (nested_in_vect_loop)
5756     {
5757       tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
5758       bool invariant_in_outerloop =
5759             (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
5760       loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
5761     }
5762   else
5763     loop_for_initial_load = loop;
5764   if (at_loop)
5765     *at_loop = loop_for_initial_load;
5766
5767   if (loop_for_initial_load)
5768     pe = loop_preheader_edge (loop_for_initial_load);
5769
5770   /* 3. For the case of the optimized realignment, create the first vector
5771       load at the loop preheader.  */
5772
5773   if (alignment_support_scheme == dr_explicit_realign_optimized)
5774     {
5775       /* Create msq_init = *(floor(p1)) in the loop preheader  */
5776       gassign *new_stmt;
5777
5778       gcc_assert (!compute_in_loop);
5779       vec_dest = vect_create_destination_var (scalar_dest, vectype);
5780       ptr = vect_create_data_ref_ptr (vinfo, stmt_info, vectype,
5781                                       loop_for_initial_load, NULL_TREE,
5782                                       &init_addr, NULL, &inc, true);
5783       if (TREE_CODE (ptr) == SSA_NAME)
5784         new_temp = copy_ssa_name (ptr);
5785       else
5786         new_temp = make_ssa_name (TREE_TYPE (ptr));
5787       poly_uint64 align = DR_TARGET_ALIGNMENT (dr_info);
5788       tree type = TREE_TYPE (ptr);
5789       new_stmt = gimple_build_assign
5790                    (new_temp, BIT_AND_EXPR, ptr,
5791                     fold_build2 (MINUS_EXPR, type,
5792                                  build_int_cst (type, 0),
5793                                  build_int_cst (type, align)));
5794       new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5795       gcc_assert (!new_bb);
5796       data_ref
5797         = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
5798                   build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
5799       vect_copy_ref_info (data_ref, DR_REF (dr));
5800       new_stmt = gimple_build_assign (vec_dest, data_ref);
5801       new_temp = make_ssa_name (vec_dest, new_stmt);
5802       gimple_assign_set_lhs (new_stmt, new_temp);
5803       if (pe)
5804         {
5805           new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5806           gcc_assert (!new_bb);
5807         }
5808       else
5809          gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5810
5811       msq_init = gimple_assign_lhs (new_stmt);
5812     }
5813
5814   /* 4. Create realignment token using a target builtin, if available.
5815       It is done either inside the containing loop, or before LOOP (as
5816       determined above).  */
5817
5818   if (targetm.vectorize.builtin_mask_for_load)
5819     {
5820       gcall *new_stmt;
5821       tree builtin_decl;
5822
5823       /* Compute INIT_ADDR - the initial address accessed by this memref.  */
5824       if (!init_addr)
5825         {
5826           /* Generate the INIT_ADDR computation outside LOOP.  */
5827           init_addr = vect_create_addr_base_for_vector_ref (vinfo,
5828                                                             stmt_info, &stmts,
5829                                                             NULL_TREE);
5830           if (loop)
5831             {
5832               pe = loop_preheader_edge (loop);
5833               new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
5834               gcc_assert (!new_bb);
5835             }
5836           else
5837              gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
5838         }
5839
5840       builtin_decl = targetm.vectorize.builtin_mask_for_load ();
5841       new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
5842       vec_dest =
5843         vect_create_destination_var (scalar_dest,
5844                                      gimple_call_return_type (new_stmt));
5845       new_temp = make_ssa_name (vec_dest, new_stmt);
5846       gimple_call_set_lhs (new_stmt, new_temp);
5847
5848       if (compute_in_loop)
5849         gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5850       else
5851         {
5852           /* Generate the misalignment computation outside LOOP.  */
5853           pe = loop_preheader_edge (loop);
5854           new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5855           gcc_assert (!new_bb);
5856         }
5857
5858       *realignment_token = gimple_call_lhs (new_stmt);
5859
5860       /* The result of the CALL_EXPR to this builtin is determined from
5861          the value of the parameter and no global variables are touched
5862          which makes the builtin a "const" function.  Requiring the
5863          builtin to have the "const" attribute makes it unnecessary
5864          to call mark_call_clobbered.  */
5865       gcc_assert (TREE_READONLY (builtin_decl));
5866     }
5867   /* The non-optimized scheme needs no phi; MSQ is still NULL_TREE here.  */
5868   if (alignment_support_scheme == dr_explicit_realign)
5869     return msq;
5870
5871   gcc_assert (!compute_in_loop);
5872   gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
5873
5874
5875   /* 5. Create msq = phi <msq_init, lsq> in loop  */
5876
5877   pe = loop_preheader_edge (containing_loop);
5878   vec_dest = vect_create_destination_var (scalar_dest, vectype);
5879   msq = make_ssa_name (vec_dest);
5880   phi_stmt = create_phi_node (msq, containing_loop->header);
5881   add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
5882
5883   return msq;
5884 }
5885
5886
5887 /* Function vect_grouped_load_supported.
5888
5889    COUNT is the size of the load group (the number of statements plus the
5890    number of gaps).  SINGLE_ELEMENT_P is true if there is actually
5891    only one statement, with a gap of COUNT - 1.
5892
5893    Returns true if a suitable permute exists for vectors of type VECTYPE.  */
5894
5895 bool
5896 vect_grouped_load_supported (tree vectype, bool single_element_p,
5897                              unsigned HOST_WIDE_INT count)
5898 {
5899   machine_mode mode = TYPE_MODE (vectype);
5900
5901   /* If this is single-element interleaving with an element distance
5902      that leaves unused vector loads around punt - we at least create
5903      very sub-optimal code in that case (and blow up memory,
5904      see PR65518).  */
5905   if (single_element_p && maybe_gt (count, TYPE_VECTOR_SUBPARTS (vectype)))
5906     {
5907       if (dump_enabled_p ())
5908         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5909                          "single-element interleaving not supported "
5910                          "for not adjacent vector loads\n");
5911       return false;
5912     }
5913
5914   /* vect_permute_load_chain requires the group size to be equal to 3 or
5915      be a power of two.  */
5916   if (count != 3 && exact_log2 (count) == -1)
5917     {
5918       if (dump_enabled_p ())
5919         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5920                          "the size of the group of accesses"
5921                          " is not a power of 2 or not equal to 3\n");
5922       return false;
5923     }
5924
5925   /* Check that the permutation is supported.  */
5926   if (VECTOR_MODE_P (mode))
5927     {
5928       unsigned int i, j;
5929       if (count == 3)
5930         {
5931           unsigned int nelt;
5932           if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5933             {
5934               if (dump_enabled_p ())
5935                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5936                                  "cannot handle groups of 3 loads for"
5937                                  " variable-length vectors\n");
5938               return false;
5939             }
5940           /* For each K in 0..2 check two masks: one picking every third
              index starting at K, one filling the lanes the first left over.  */
5941           vec_perm_builder sel (nelt, nelt, 1);
5942           sel.quick_grow (nelt);
5943           vec_perm_indices indices;
5944           unsigned int k;
5945           for (k = 0; k < 3; k++)
5946             {
5947               for (i = 0; i < nelt; i++)
5948                 if (3 * i + k < 2 * nelt)
5949                   sel[i] = 3 * i + k;
5950                 else
5951                   sel[i] = 0;
5952               indices.new_vector (sel, 2, nelt);
5953               if (!can_vec_perm_const_p (mode, indices))
5954                 {
5955                   if (dump_enabled_p ())
5956                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5957                                      "shuffle of 3 loads is not supported by"
5958                                      " target\n");
5959                   return false;
5960                 }
5961               for (i = 0, j = 0; i < nelt; i++)
5962                 if (3 * i + k < 2 * nelt)
5963                   sel[i] = i;
5964                 else
5965                   sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5966               indices.new_vector (sel, 2, nelt);
5967               if (!can_vec_perm_const_p (mode, indices))
5968                 {
5969                   if (dump_enabled_p ())
5970                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5971                                      "shuffle of 3 loads is not supported by"
5972                                      " target\n");
5973                   return false;
5974                 }
5975             }
5976           return true;
5977         }
5978       else
5979         {
5980           /* If length is not equal to 3 then only power of 2 is supported.  */
5981           gcc_assert (pow2p_hwi (count));
5982           poly_uint64 nelt = GET_MODE_NUNITS (mode);
5983
5984           /* The encoding has a single stepped pattern: {0, 2, 4, ...} is
              checked first, then {1, 3, 5, ...}.  */
5985           vec_perm_builder sel (nelt, 1, 3);
5986           sel.quick_grow (3);
5987           for (i = 0; i < 3; i++)
5988             sel[i] = i * 2;
5989           vec_perm_indices indices (sel, 2, nelt);
5990           if (can_vec_perm_const_p (mode, indices))
5991             {
5992               for (i = 0; i < 3; i++)
5993                 sel[i] = i * 2 + 1;
5994               indices.new_vector (sel, 2, nelt);
5995               if (can_vec_perm_const_p (mode, indices))
5996                 return true;
5997             }
5998         }
5999     }
6000
6001   if (dump_enabled_p ())
6002     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6003                      "extract even/odd not supported by target\n");
6004   return false;
6005 }
6006
6007 /* Return TRUE if vec_load_lanes (or vec_mask_load_lanes when MASKED_P) is
6008    available for COUNT vectors of type VECTYPE.  */
6009
6010 bool
6011 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
6012                            bool masked_p)
6013 {
6014   /* Pick the optab name and code according to MASKED_P.  */
6015   const char *optab_name
6016     = masked_p ? "vec_mask_load_lanes" : "vec_load_lanes";
6017   return vect_lanes_optab_supported_p (optab_name,
6018                                        (masked_p
6019                                         ? vec_mask_load_lanes_optab
6020                                         : vec_load_lanes_optab),
6021                                        vectype, count);
6022 }
6023
6024 /* Function vect_permute_load_chain.
6025
6026    Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
6027    a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
6028    the input data correctly.  Return the final references for loads in
6029    RESULT_CHAIN.
6030
6031    E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
6032    The input is 4 vectors each containing 8 elements. We assign a number to each
6033    element, the input sequence is:
6034
6035    1st vec:   0  1  2  3  4  5  6  7
6036    2nd vec:   8  9 10 11 12 13 14 15
6037    3rd vec:  16 17 18 19 20 21 22 23
6038    4th vec:  24 25 26 27 28 29 30 31
6039
6040    The output sequence should be:
6041
6042    1st vec:  0 4  8 12 16 20 24 28
6043    2nd vec:  1 5  9 13 17 21 25 29
6044    3rd vec:  2 6 10 14 18 22 26 30
6045    4th vec:  3 7 11 15 19 23 27 31
6046
6047    i.e., the first output vector should contain the first elements of each
6048    interleaving group, etc.
6049
6050    We use extract_even/odd instructions to create such output.  The input of
6051    each extract_even/odd operation is two vectors
6052    1st vec    2nd vec
6053    0 1 2 3    4 5 6 7
6054
6055    and the output is the vector of extracted even/odd elements.  The output of
6056    extract_even will be:   0 2 4 6
6057    and of extract_odd:     1 3 5 7
6058
6059
6060    The permutation is done in log LENGTH stages.  In each stage extract_even
6061    and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
6062    their order.  In our example,
6063
6064    E1: extract_even (1st vec, 2nd vec)
6065    E2: extract_odd (1st vec, 2nd vec)
6066    E3: extract_even (3rd vec, 4th vec)
6067    E4: extract_odd (3rd vec, 4th vec)
6068
6069    The output for the first stage will be:
6070
6071    E1:  0  2  4  6  8 10 12 14
6072    E2:  1  3  5  7  9 11 13 15
6073    E3: 16 18 20 22 24 26 28 30
6074    E4: 17 19 21 23 25 27 29 31
6075
6076    In order to proceed and create the correct sequence for the next stage (or
6077    for the correct output, if the second stage is the last one, as in our
6078    example), we first put the output of extract_even operation and then the
6079    output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
6080    The input for the second stage is:
6081
6082    1st vec (E1):  0  2  4  6  8 10 12 14
6083    2nd vec (E3): 16 18 20 22 24 26 28 30
6084    3rd vec (E2):  1  3  5  7  9 11 13 15
6085    4th vec (E4): 17 19 21 23 25 27 29 31
6086
6087    The output of the second stage:
6088
6089    E1: 0 4  8 12 16 20 24 28
6090    E2: 2 6 10 14 18 22 26 30
6091    E3: 1 5  9 13 17 21 25 29
6092    E4: 3 7 11 15 19 23 27 31
6093
6094    And RESULT_CHAIN after reordering:
6095
6096    1st vec (E1):  0 4  8 12 16 20 24 28
6097    2nd vec (E3):  1 5  9 13 17 21 25 29
6098    3rd vec (E2):  2 6 10 14 18 22 26 30
6099    4th vec (E4):  3 7 11 15 19 23 27 31.  */
6100
6101 static void
6102 vect_permute_load_chain (vec_info *vinfo, vec<tree> dr_chain,
6103                          unsigned int length,
6104                          stmt_vec_info stmt_info,
6105                          gimple_stmt_iterator *gsi,
6106                          vec<tree> *result_chain)
6107 {
6108   tree data_ref, first_vect, second_vect;
6109   tree perm_mask_even, perm_mask_odd;
6110   tree perm3_mask_low, perm3_mask_high;
6111   gimple *perm_stmt;
6112   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6113   unsigned int i, j, log_length = exact_log2 (length);
6114
6115   result_chain->quick_grow (length);
6116   memcpy (result_chain->address (), dr_chain.address (),
6117           length * sizeof (tree));
6118
6119   if (length == 3)
6120     {
6121       /* vect_grouped_load_supported ensures that this is constant.  */
6122       unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
6123       unsigned int k;
6124
6125       vec_perm_builder sel (nelt, nelt, 1);
6126       sel.quick_grow (nelt);
6127       vec_perm_indices indices;
6128       for (k = 0; k < 3; k++)
6129         {
6130           for (i = 0; i < nelt; i++)
6131             if (3 * i + k < 2 * nelt)
6132               sel[i] = 3 * i + k;
6133             else
6134               sel[i] = 0;
6135           indices.new_vector (sel, 2, nelt);
6136           perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
6137
6138           for (i = 0, j = 0; i < nelt; i++)
6139             if (3 * i + k < 2 * nelt)
6140               sel[i] = i;
6141             else
6142               sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
6143           indices.new_vector (sel, 2, nelt);
6144           perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
6145
6146           first_vect = dr_chain[0];
6147           second_vect = dr_chain[1];
6148
6149           /* Create interleaving stmt (low part of):
6150              low = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
6151                                                              ...}>  */
6152           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
6153           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
6154                                            second_vect, perm3_mask_low);
6155           vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6156
6157           /* Create interleaving stmt (high part of):
6158              high = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
6159                                                               ...}>  */
6160           first_vect = data_ref;
6161           second_vect = dr_chain[2];
6162           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
6163           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
6164                                            second_vect, perm3_mask_high);
6165           vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6166           (*result_chain)[k] = data_ref;
6167         }
6168     }
6169   else
6170     {
6171       /* If length is not equal to 3 then only power of 2 is supported.  */
6172       gcc_assert (pow2p_hwi (length));
6173
6174       /* The encoding has a single stepped pattern.  */
6175       poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
6176       vec_perm_builder sel (nelt, 1, 3);
6177       sel.quick_grow (3);
6178       for (i = 0; i < 3; ++i)
6179         sel[i] = i * 2;
6180       vec_perm_indices indices (sel, 2, nelt);
6181       perm_mask_even = vect_gen_perm_mask_checked (vectype, indices);
6182
6183       for (i = 0; i < 3; ++i)
6184         sel[i] = i * 2 + 1;
6185       indices.new_vector (sel, 2, nelt);
6186       perm_mask_odd = vect_gen_perm_mask_checked (vectype, indices);
6187
6188       for (i = 0; i < log_length; i++)
6189         {
6190           for (j = 0; j < length; j += 2)
6191             {
6192               first_vect = dr_chain[j];
6193               second_vect = dr_chain[j+1];
6194
6195               /* data_ref = permute_even (first_data_ref, second_data_ref);  */
6196               data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
6197               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6198                                                first_vect, second_vect,
6199                                                perm_mask_even);
6200               vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6201               (*result_chain)[j/2] = data_ref;
6202
6203               /* data_ref = permute_odd (first_data_ref, second_data_ref);  */
6204               data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
6205               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6206                                                first_vect, second_vect,
6207                                                perm_mask_odd);
6208               vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6209               (*result_chain)[j/2+length/2] = data_ref;
6210             }
6211           memcpy (dr_chain.address (), result_chain->address (),
6212                   length * sizeof (tree));
6213         }
6214     }
6215 }
6216
6217 /* Function vect_shift_permute_load_chain.
6218
6219    Given a chain of loads in DR_CHAIN whose LENGTH is either 3 or a power of 2,
6220    generate a sequence of stmts to reorder the input data accordingly.
6221    Return the final references for loads in RESULT_CHAIN.
6222    Return true if succeeded, false otherwise.
6223
6224    E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
6225    The input is 3 vectors each containing 8 elements.  We assign a
6226    number to each element, the input sequence is:
6227
6228    1st vec:   0  1  2  3  4  5  6  7
6229    2nd vec:   8  9 10 11 12 13 14 15
6230    3rd vec:  16 17 18 19 20 21 22 23
6231
6232    The output sequence should be:
6233
6234    1st vec:  0 3 6  9 12 15 18 21
6235    2nd vec:  1 4 7 10 13 16 19 22
6236    3rd vec:  2 5 8 11 14 17 20 23
6237
6238    We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
6239
6240    First we shuffle all 3 vectors to get correct elements order:
6241
6242    1st vec:  ( 0  3  6) ( 1  4  7) ( 2  5)
6243    2nd vec:  ( 8 11 14) ( 9 12 15) (10 13)
6244    3rd vec:  (16 19 22) (17 20 23) (18 21)
6245
6246    Next we unite and shift vector 3 times:
6247
6248    1st step:
6249      shift right by 6 the concatenation of:
6250      "1st vec" and  "2nd vec"
6251        ( 0  3  6) ( 1  4  7) |( 2  5) _ ( 8 11 14) ( 9 12 15)| (10 13)
6252      "2nd vec" and  "3rd vec"
6253        ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
6254      "3rd vec" and  "1st vec"
6255        (16 19 22) (17 20 23) |(18 21) _ ( 0  3  6) ( 1  4  7)| ( 2  5)
6256                              | New vectors                   |
6257
6258      So that now new vectors are:
6259
6260      1st vec:  ( 2  5) ( 8 11 14) ( 9 12 15)
6261      2nd vec:  (10 13) (16 19 22) (17 20 23)
6262      3rd vec:  (18 21) ( 0  3  6) ( 1  4  7)
6263
6264    2nd step:
6265      shift right by 5 the concatenation of:
6266      "1st vec" and  "3rd vec"
6267        ( 2  5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0  3  6)| ( 1  4  7)
6268      "2nd vec" and  "1st vec"
6269        (10 13) (16 19 22) |(17 20 23) _ ( 2  5) ( 8 11 14)| ( 9 12 15)
6270      "3rd vec" and  "2nd vec"
6271        (18 21) ( 0  3  6) |( 1  4  7) _ (10 13) (16 19 22)| (17 20 23)
6272                           | New vectors                   |
6273
6274      So that now new vectors are:
6275
6276      1st vec:  ( 9 12 15) (18 21) ( 0  3  6)
6277      2nd vec:  (17 20 23) ( 2  5) ( 8 11 14)
6278      3rd vec:  ( 1  4  7) (10 13) (16 19 22) READY
6279
6280    3rd step:
6281      shift right by 5 the concatenation of:
6282      "1st vec" and  "1st vec"
6283        ( 9 12 15) (18 21) |( 0  3  6) _ ( 9 12 15) (18 21)| ( 0  3  6)
6284      shift right by 3 the concatenation of:
6285      "2nd vec" and  "2nd vec"
6286                (17 20 23) |( 2  5) ( 8 11 14) _ (17 20 23)| ( 2  5) ( 8 11 14)
6287                           | New vectors                   |
6288
6289      So that now all vectors are READY:
6290      1st vec:  ( 0  3  6) ( 9 12 15) (18 21)
6291      2nd vec:  ( 2  5) ( 8 11 14) (17 20 23)
6292      3rd vec:  ( 1  4  7) (10 13) (16 19 22)
6293
6294    This algorithm is faster than one in vect_permute_load_chain if:
6295      1.  "shift of a concatenation" is faster than general permutation.
6296          This is usually so.
6297      2.  The TARGET machine can't execute vector instructions in parallel.
6298          This is because each step of the algorithm depends on previous.
6299          The algorithm in vect_permute_load_chain is much more parallel.
6300
6301    The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
6302 */
6303
6304 static bool
6305 vect_shift_permute_load_chain (vec_info *vinfo, vec<tree> dr_chain,
6306                                unsigned int length,
6307                                stmt_vec_info stmt_info,
6308                                gimple_stmt_iterator *gsi,
6309                                vec<tree> *result_chain)
6310 {
6311   tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
6312   tree perm2_mask1, perm2_mask2, perm3_mask;
6313   tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
6314   gimple *perm_stmt;
6315
6316   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6317   unsigned int i;
6318   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
6319
6320   unsigned HOST_WIDE_INT nelt, vf;
6321   if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nelt)
6322       || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&vf))
6323     /* Not supported for variable-length vectors.  */
6324     return false;
6325
6326   vec_perm_builder sel (nelt, nelt, 1);
6327   sel.quick_grow (nelt);
6328
6329   result_chain->quick_grow (length);
6330   memcpy (result_chain->address (), dr_chain.address (),
6331           length * sizeof (tree));
6332
6333   if (pow2p_hwi (length) && vf > 4)
6334     {
6335       unsigned int j, log_length = exact_log2 (length);
6336       for (i = 0; i < nelt / 2; ++i)
6337         sel[i] = i * 2;
6338       for (i = 0; i < nelt / 2; ++i)
6339         sel[nelt / 2 + i] = i * 2 + 1;
6340       vec_perm_indices indices (sel, 2, nelt);
6341       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6342         {
6343           if (dump_enabled_p ())
6344             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6345                              "shuffle of 2 fields structure is not \
6346                               supported by target\n");
6347           return false;
6348         }
6349       perm2_mask1 = vect_gen_perm_mask_checked (vectype, indices);
6350
6351       for (i = 0; i < nelt / 2; ++i)
6352         sel[i] = i * 2 + 1;
6353       for (i = 0; i < nelt / 2; ++i)
6354         sel[nelt / 2 + i] = i * 2;
6355       indices.new_vector (sel, 2, nelt);
6356       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6357         {
6358           if (dump_enabled_p ())
6359             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6360                              "shuffle of 2 fields structure is not \
6361                               supported by target\n");
6362           return false;
6363         }
6364       perm2_mask2 = vect_gen_perm_mask_checked (vectype, indices);
6365
6366       /* Generating permutation constant to shift all elements.
6367          For vector length 8 it is {4 5 6 7 8 9 10 11}.  */
6368       for (i = 0; i < nelt; i++)
6369         sel[i] = nelt / 2 + i;
6370       indices.new_vector (sel, 2, nelt);
6371       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6372         {
6373           if (dump_enabled_p ())
6374             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6375                              "shift permutation is not supported by target\n");
6376           return false;
6377         }
6378       shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6379
6380       /* Generating permutation constant to select vector from 2.
6381          For vector length 8 it is {0 1 2 3 12 13 14 15}.  */
6382       for (i = 0; i < nelt / 2; i++)
6383         sel[i] = i;
6384       for (i = nelt / 2; i < nelt; i++)
6385         sel[i] = nelt + i;
6386       indices.new_vector (sel, 2, nelt);
6387       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6388         {
6389           if (dump_enabled_p ())
6390             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6391                              "select is not supported by target\n");
6392           return false;
6393         }
6394       select_mask = vect_gen_perm_mask_checked (vectype, indices);
6395
6396       for (i = 0; i < log_length; i++)
6397         {
6398           for (j = 0; j < length; j += 2)
6399             {
6400               first_vect = dr_chain[j];
6401               second_vect = dr_chain[j + 1];
6402
6403               data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6404               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6405                                                first_vect, first_vect,
6406                                                perm2_mask1);
6407               vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6408               vect[0] = data_ref;
6409
6410               data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6411               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6412                                                second_vect, second_vect,
6413                                                perm2_mask2);
6414               vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6415               vect[1] = data_ref;
6416
6417               data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
6418               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6419                                                vect[0], vect[1], shift1_mask);
6420               vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6421               (*result_chain)[j/2 + length/2] = data_ref;
6422
6423               data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
6424               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6425                                                vect[0], vect[1], select_mask);
6426               vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6427               (*result_chain)[j/2] = data_ref;
6428             }
6429           memcpy (dr_chain.address (), result_chain->address (),
6430                   length * sizeof (tree));
6431         }
6432       return true;
6433     }
6434   if (length == 3 && vf > 2)
6435     {
6436       unsigned int k = 0, l = 0;
6437
6438       /* Generating permutation constant to get all elements in rigth order.
6439          For vector length 8 it is {0 3 6 1 4 7 2 5}.  */
6440       for (i = 0; i < nelt; i++)
6441         {
6442           if (3 * k + (l % 3) >= nelt)
6443             {
6444               k = 0;
6445               l += (3 - (nelt % 3));
6446             }
6447           sel[i] = 3 * k + (l % 3);
6448           k++;
6449         }
6450       vec_perm_indices indices (sel, 2, nelt);
6451       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6452         {
6453           if (dump_enabled_p ())
6454             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6455                              "shuffle of 3 fields structure is not \
6456                               supported by target\n");
6457           return false;
6458         }
6459       perm3_mask = vect_gen_perm_mask_checked (vectype, indices);
6460
6461       /* Generating permutation constant to shift all elements.
6462          For vector length 8 it is {6 7 8 9 10 11 12 13}.  */
6463       for (i = 0; i < nelt; i++)
6464         sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
6465       indices.new_vector (sel, 2, nelt);
6466       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6467         {
6468           if (dump_enabled_p ())
6469             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6470                              "shift permutation is not supported by target\n");
6471           return false;
6472         }
6473       shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6474
6475       /* Generating permutation constant to shift all elements.
6476          For vector length 8 it is {5 6 7 8 9 10 11 12}.  */
6477       for (i = 0; i < nelt; i++)
6478         sel[i] = 2 * (nelt / 3) + 1 + i;
6479       indices.new_vector (sel, 2, nelt);
6480       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6481         {
6482           if (dump_enabled_p ())
6483             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6484                              "shift permutation is not supported by target\n");
6485           return false;
6486         }
6487       shift2_mask = vect_gen_perm_mask_checked (vectype, indices);
6488
6489       /* Generating permutation constant to shift all elements.
6490          For vector length 8 it is {3 4 5 6 7 8 9 10}.  */
6491       for (i = 0; i < nelt; i++)
6492         sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
6493       indices.new_vector (sel, 2, nelt);
6494       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6495         {
6496           if (dump_enabled_p ())
6497             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6498                              "shift permutation is not supported by target\n");
6499           return false;
6500         }
6501       shift3_mask = vect_gen_perm_mask_checked (vectype, indices);
6502
6503       /* Generating permutation constant to shift all elements.
6504          For vector length 8 it is {5 6 7 8 9 10 11 12}.  */
6505       for (i = 0; i < nelt; i++)
6506         sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
6507       indices.new_vector (sel, 2, nelt);
6508       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6509         {
6510           if (dump_enabled_p ())
6511             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6512                              "shift permutation is not supported by target\n");
6513           return false;
6514         }
6515       shift4_mask = vect_gen_perm_mask_checked (vectype, indices);
6516
6517       for (k = 0; k < 3; k++)
6518         {
6519           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
6520           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6521                                            dr_chain[k], dr_chain[k],
6522                                            perm3_mask);
6523           vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6524           vect[k] = data_ref;
6525         }
6526
6527       for (k = 0; k < 3; k++)
6528         {
6529           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
6530           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6531                                            vect[k % 3], vect[(k + 1) % 3],
6532                                            shift1_mask);
6533           vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6534           vect_shift[k] = data_ref;
6535         }
6536
6537       for (k = 0; k < 3; k++)
6538         {
6539           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
6540           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6541                                            vect_shift[(4 - k) % 3],
6542                                            vect_shift[(3 - k) % 3],
6543                                            shift2_mask);
6544           vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6545           vect[k] = data_ref;
6546         }
6547
6548       (*result_chain)[3 - (nelt % 3)] = vect[2];
6549
6550       data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
6551       perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
6552                                        vect[0], shift3_mask);
6553       vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6554       (*result_chain)[nelt % 3] = data_ref;
6555
6556       data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
6557       perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
6558                                        vect[1], shift4_mask);
6559       vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6560       (*result_chain)[0] = data_ref;
6561       return true;
6562     }
6563   return false;
6564 }
6565
6566 /* Function vect_transform_grouped_load.
6567
6568    Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
6569    to perform their permutation and ascribe the result vectorized statements to
6570    the scalar statements.
6571 */
6572
6573 void
6574 vect_transform_grouped_load (vec_info *vinfo, stmt_vec_info stmt_info,
6575                              vec<tree> dr_chain,
6576                              int size, gimple_stmt_iterator *gsi)
6577 {
6578   machine_mode mode;
6579   vec<tree> result_chain = vNULL;
6580
6581   /* DR_CHAIN contains input data-refs that are a part of the interleaving.
6582      RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
6583      vectors, that are ready for vector computation.  */
6584   result_chain.create (size);
6585
6586   /* If reassociation width for vector type is 2 or greater target machine can
6587      execute 2 or more vector instructions in parallel.  Otherwise try to
6588      get chain for loads group using vect_shift_permute_load_chain.  */
6589   mode = TYPE_MODE (STMT_VINFO_VECTYPE (stmt_info));
6590   if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
6591       || pow2p_hwi (size)
6592       || !vect_shift_permute_load_chain (vinfo, dr_chain, size, stmt_info,
6593                                          gsi, &result_chain))
6594     vect_permute_load_chain (vinfo, dr_chain,
6595                              size, stmt_info, gsi, &result_chain);
6596   vect_record_grouped_load_vectors (vinfo, stmt_info, result_chain);
6597   result_chain.release ();
6598 }
6599
6600 /* RESULT_CHAIN contains the output of a group of grouped loads that were
6601    generated as part of the vectorization of STMT_INFO.  Assign the statement
6602    for each vector to the associated scalar statement.  */
6603
6604 void
6605 vect_record_grouped_load_vectors (vec_info *, stmt_vec_info stmt_info,
6606                                   vec<tree> result_chain)
6607 {
6608   stmt_vec_info first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
6609   unsigned int i, gap_count;
6610   tree tmp_data_ref;
6611
6612   /* Put a permuted data-ref in the VECTORIZED_STMT field.
6613      Since we scan the chain starting from it's first node, their order
6614      corresponds the order of data-refs in RESULT_CHAIN.  */
6615   stmt_vec_info next_stmt_info = first_stmt_info;
6616   gap_count = 1;
6617   FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
6618     {
6619       if (!next_stmt_info)
6620         break;
6621
6622       /* Skip the gaps.  Loads created for the gaps will be removed by dead
6623        code elimination pass later.  No need to check for the first stmt in
6624        the group, since it always exists.
6625        DR_GROUP_GAP is the number of steps in elements from the previous
6626        access (if there is no gap DR_GROUP_GAP is 1).  We skip loads that
6627        correspond to the gaps.  */
6628       if (next_stmt_info != first_stmt_info
6629           && gap_count < DR_GROUP_GAP (next_stmt_info))
6630         {
6631           gap_count++;
6632           continue;
6633         }
6634
6635       /* ???  The following needs cleanup after the removal of
6636          DR_GROUP_SAME_DR_STMT.  */
6637       if (next_stmt_info)
6638         {
6639           gimple *new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
6640           /* We assume that if VEC_STMT is not NULL, this is a case of multiple
6641              copies, and we put the new vector statement last.  */
6642           STMT_VINFO_VEC_STMTS (next_stmt_info).safe_push (new_stmt);
6643
6644           next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
6645           gap_count = 1;
6646         }
6647     }
6648 }
6649
6650 /* Function vect_force_dr_alignment_p.
6651
6652    Returns whether the alignment of a DECL can be forced to be aligned
6653    on ALIGNMENT bit boundary.  */
6654
6655 bool
6656 vect_can_force_dr_alignment_p (const_tree decl, poly_uint64 alignment)
6657 {
6658   if (!VAR_P (decl))
6659     return false;
6660
6661   if (decl_in_symtab_p (decl)
6662       && !symtab_node::get (decl)->can_increase_alignment_p ())
6663     return false;
6664
6665   if (TREE_STATIC (decl))
6666     return (known_le (alignment,
6667                       (unsigned HOST_WIDE_INT) MAX_OFILE_ALIGNMENT));
6668   else
6669     return (known_le (alignment, (unsigned HOST_WIDE_INT) MAX_STACK_ALIGNMENT));
6670 }
6671
6672 /* Return whether the data reference DR_INFO is supported with respect to its
6673    alignment.
6674    If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
6675    it is aligned, i.e., check if it is possible to vectorize it with different
6676    alignment.  */
6677
6678 enum dr_alignment_support
6679 vect_supportable_dr_alignment (vec_info *vinfo, dr_vec_info *dr_info,
6680                                tree vectype, int misalignment)
6681 {
6682   data_reference *dr = dr_info->dr;
6683   stmt_vec_info stmt_info = dr_info->stmt;
6684   machine_mode mode = TYPE_MODE (vectype);
6685   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
6686   class loop *vect_loop = NULL;
6687   bool nested_in_vect_loop = false;
6688
6689   if (misalignment == 0)
6690     return dr_aligned;
6691
6692   /* For now assume all conditional loads/stores support unaligned
6693      access without any special code.  */
6694   if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
6695     if (gimple_call_internal_p (stmt)
6696         && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
6697             || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
6698       return dr_unaligned_supported;
6699
6700   if (loop_vinfo)
6701     {
6702       vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
6703       nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt_info);
6704     }
6705
6706   /* Possibly unaligned access.  */
6707
6708   /* We can choose between using the implicit realignment scheme (generating
6709      a misaligned_move stmt) and the explicit realignment scheme (generating
6710      aligned loads with a REALIGN_LOAD).  There are two variants to the
6711      explicit realignment scheme: optimized, and unoptimized.
6712      We can optimize the realignment only if the step between consecutive
6713      vector loads is equal to the vector size.  Since the vector memory
6714      accesses advance in steps of VS (Vector Size) in the vectorized loop, it
6715      is guaranteed that the misalignment amount remains the same throughout the
6716      execution of the vectorized loop.  Therefore, we can create the
6717      "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
6718      at the loop preheader.
6719
6720      However, in the case of outer-loop vectorization, when vectorizing a
6721      memory access in the inner-loop nested within the LOOP that is now being
6722      vectorized, while it is guaranteed that the misalignment of the
6723      vectorized memory access will remain the same in different outer-loop
6724      iterations, it is *not* guaranteed that is will remain the same throughout
6725      the execution of the inner-loop.  This is because the inner-loop advances
6726      with the original scalar step (and not in steps of VS).  If the inner-loop
6727      step happens to be a multiple of VS, then the misalignment remains fixed
6728      and we can use the optimized realignment scheme.  For example:
6729
6730       for (i=0; i<N; i++)
6731         for (j=0; j<M; j++)
6732           s += a[i+j];
6733
6734      When vectorizing the i-loop in the above example, the step between
6735      consecutive vector loads is 1, and so the misalignment does not remain
6736      fixed across the execution of the inner-loop, and the realignment cannot
6737      be optimized (as illustrated in the following pseudo vectorized loop):
6738
6739       for (i=0; i<N; i+=4)
6740         for (j=0; j<M; j++){
6741           vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
6742                          // when j is {0,1,2,3,4,5,6,7,...} respectively.
6743                          // (assuming that we start from an aligned address).
6744           }
6745
6746      We therefore have to use the unoptimized realignment scheme:
6747
6748       for (i=0; i<N; i+=4)
6749           for (j=k; j<M; j+=4)
6750           vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
6751                            // that the misalignment of the initial address is
6752                            // 0).
6753
6754      The loop can then be vectorized as follows:
6755
6756       for (k=0; k<4; k++){
6757         rt = get_realignment_token (&vp[k]);
6758         for (i=0; i<N; i+=4){
6759           v1 = vp[i+k];
6760           for (j=k; j<M; j+=4){
6761             v2 = vp[i+j+VS-1];
6762             va = REALIGN_LOAD <v1,v2,rt>;
6763             vs += va;
6764             v1 = v2;
6765           }
6766         }
6767     } */
6768
6769   if (DR_IS_READ (dr))
6770     {
6771       if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
6772           && (!targetm.vectorize.builtin_mask_for_load
6773               || targetm.vectorize.builtin_mask_for_load ()))
6774         {
6775           /* If we are doing SLP then the accesses need not have the
6776              same alignment, instead it depends on the SLP group size.  */
6777           if (loop_vinfo
6778               && STMT_SLP_TYPE (stmt_info)
6779               && !multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6780                               * (DR_GROUP_SIZE
6781                                  (DR_GROUP_FIRST_ELEMENT (stmt_info))),
6782                               TYPE_VECTOR_SUBPARTS (vectype)))
6783             ;
6784           else if (!loop_vinfo
6785                    || (nested_in_vect_loop
6786                        && maybe_ne (TREE_INT_CST_LOW (DR_STEP (dr)),
6787                                     GET_MODE_SIZE (TYPE_MODE (vectype)))))
6788             return dr_explicit_realign;
6789           else
6790             return dr_explicit_realign_optimized;
6791         }
6792     }
6793
6794   bool is_packed = false;
6795   tree type = TREE_TYPE (DR_REF (dr));
6796   if (misalignment == DR_MISALIGNMENT_UNKNOWN)
6797     is_packed = not_size_aligned (DR_REF (dr));
6798   if (targetm.vectorize.support_vector_misalignment (mode, type, misalignment,
6799                                                      is_packed))
6800     return dr_unaligned_supported;
6801
6802   /* Unsupported.  */
6803   return dr_unaligned_unsupported;
6804 }