gcc/tree-vect-data-refs.c
1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2 Copyright (C) 2003-2019 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "predict.h"
31 #include "memmodel.h"
32 #include "tm_p.h"
33 #include "ssa.h"
34 #include "optabs-tree.h"
35 #include "cgraph.h"
36 #include "dumpfile.h"
37 #include "alias.h"
38 #include "fold-const.h"
39 #include "stor-layout.h"
40 #include "tree-eh.h"
41 #include "gimplify.h"
42 #include "gimple-iterator.h"
43 #include "gimplify-me.h"
44 #include "tree-ssa-loop-ivopts.h"
45 #include "tree-ssa-loop-manip.h"
46 #include "tree-ssa-loop.h"
47 #include "cfgloop.h"
48 #include "tree-scalar-evolution.h"
49 #include "tree-vectorizer.h"
50 #include "expr.h"
51 #include "builtins.h"
52 #include "tree-cfg.h"
53 #include "tree-hash-traits.h"
54 #include "vec-perm-indices.h"
55 #include "internal-fn.h"
56
57 /* Return true if load- or store-lanes optab OPTAB is implemented for
58 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
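/* For example, with VECTYPE V4SI and COUNT 2 the array mode must hold
   256 bits; if the target provides no dedicated array mode we fall back
   to an integer mode of that width (OImode, say), provided the target
   allows it, and then require OPTAB to have a handler for that
   <array mode, vector mode> pair.  */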
59
60 static bool
61 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
62 tree vectype, unsigned HOST_WIDE_INT count)
63 {
64 machine_mode mode, array_mode;
65 bool limit_p;
66
67 mode = TYPE_MODE (vectype);
68 if (!targetm.array_mode (mode, count).exists (&array_mode))
69 {
70 poly_uint64 bits = count * GET_MODE_BITSIZE (mode);
71 limit_p = !targetm.array_mode_supported_p (mode, count);
72 if (!int_mode_for_size (bits, limit_p).exists (&array_mode))
73 {
74 if (dump_enabled_p ())
75 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
76 "no array mode for %s[%wu]\n",
77 GET_MODE_NAME (mode), count);
78 return false;
79 }
80 }
81
82 if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
83 {
84 if (dump_enabled_p ())
85 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
86 "cannot use %s<%s><%s>\n", name,
87 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
88 return false;
89 }
90
91 if (dump_enabled_p ())
92 dump_printf_loc (MSG_NOTE, vect_location,
93 "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
94 GET_MODE_NAME (mode));
95
96 return true;
97 }
98
99
100 /* Return the smallest scalar part of STMT_INFO.
101 This is used to determine the vectype of the stmt. We generally set the
102 vectype according to the type of the result (lhs). For stmts whose
103 result-type is different than the type of the arguments (e.g., demotion,
104 promotion), vectype will be reset appropriately (later). Note that we have
105 to visit the smallest datatype in this function, because that determines the
106 VF. If the smallest datatype in the loop is present only as the rhs of a
107 promotion operation - we'd miss it.
108 Such a case, where a variable of this datatype does not appear in the lhs
109 anywhere in the loop, can only occur if it's an invariant: e.g.:
110 'int_x = (int) short_inv', which we'd expect to have been optimized away by
111 invariant motion. However, we cannot rely on invariant motion to always
112 take invariants out of the loop, and so in the case of promotion we also
113 have to check the rhs.
114 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
115 types. */
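/* For example, for a widening statement such as
   'int_x = (int) short_y * (int) short_z' represented as a
   WIDEN_MULT_EXPR, the result type is int but the smallest scalar type
   is short, and it is short that should determine the VF.  */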
116
117 tree
118 vect_get_smallest_scalar_type (stmt_vec_info stmt_info,
119 HOST_WIDE_INT *lhs_size_unit,
120 HOST_WIDE_INT *rhs_size_unit)
121 {
122 tree scalar_type = gimple_expr_type (stmt_info->stmt);
123 HOST_WIDE_INT lhs, rhs;
124
125 /* During the analysis phase, this function is called on arbitrary
126 statements that might not have scalar results. */
127 if (!tree_fits_uhwi_p (TYPE_SIZE_UNIT (scalar_type)))
128 return scalar_type;
129
130 lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
131
132 gassign *assign = dyn_cast <gassign *> (stmt_info->stmt);
133 if (assign
134 && (gimple_assign_cast_p (assign)
135 || gimple_assign_rhs_code (assign) == DOT_PROD_EXPR
136 || gimple_assign_rhs_code (assign) == WIDEN_SUM_EXPR
137 || gimple_assign_rhs_code (assign) == WIDEN_MULT_EXPR
138 || gimple_assign_rhs_code (assign) == WIDEN_LSHIFT_EXPR
139 || gimple_assign_rhs_code (assign) == FLOAT_EXPR))
140 {
141 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (assign));
142
143 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
144 if (rhs < lhs)
145 scalar_type = rhs_type;
146 }
147 else if (gcall *call = dyn_cast <gcall *> (stmt_info->stmt))
148 {
149 unsigned int i = 0;
150 if (gimple_call_internal_p (call))
151 {
152 internal_fn ifn = gimple_call_internal_fn (call);
153 if (internal_load_fn_p (ifn) || internal_store_fn_p (ifn))
154 /* gimple_expr_type already picked the type of the loaded
155 or stored data. */
156 i = ~0U;
157 else if (internal_fn_mask_index (ifn) == 0)
158 i = 1;
159 }
160 if (i < gimple_call_num_args (call))
161 {
162 tree rhs_type = TREE_TYPE (gimple_call_arg (call, i));
163 if (tree_fits_uhwi_p (TYPE_SIZE_UNIT (rhs_type)))
164 {
165 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
166 if (rhs < lhs)
167 scalar_type = rhs_type;
168 }
169 }
170 }
171
172 *lhs_size_unit = lhs;
173 *rhs_size_unit = rhs;
174 return scalar_type;
175 }
176
177
178 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
179 tested at run-time. Return TRUE if DDR was successfully inserted.
180 Return false if versioning is not supported. */
181
182 static opt_result
183 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
184 {
185 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
186
187 if ((unsigned) param_vect_max_version_for_alias_checks == 0)
188 return opt_result::failure_at (vect_location,
189 "will not create alias checks, as"
190 " --param vect-max-version-for-alias-checks"
191 " == 0\n");
192
193 opt_result res
194 = runtime_alias_check_p (ddr, loop,
195 optimize_loop_nest_for_speed_p (loop));
196 if (!res)
197 return res;
198
199 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
200 return opt_result::success ();
201 }
202
203 /* Record that loop LOOP_VINFO needs to check that VALUE is nonzero. */
204
205 static void
206 vect_check_nonzero_value (loop_vec_info loop_vinfo, tree value)
207 {
208 vec<tree> checks = LOOP_VINFO_CHECK_NONZERO (loop_vinfo);
209 for (unsigned int i = 0; i < checks.length(); ++i)
210 if (checks[i] == value)
211 return;
212
213 if (dump_enabled_p ())
214 dump_printf_loc (MSG_NOTE, vect_location,
215 "need run-time check that %T is nonzero\n",
216 value);
217 LOOP_VINFO_CHECK_NONZERO (loop_vinfo).safe_push (value);
218 }
219
220 /* Return true if we know that the order of vectorized DR_INFO_A and
221 vectorized DR_INFO_B will be the same as the order of DR_INFO_A and
222 DR_INFO_B. At least one of the accesses is a write. */
223
224 static bool
225 vect_preserves_scalar_order_p (dr_vec_info *dr_info_a, dr_vec_info *dr_info_b)
226 {
227 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
228 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
229
230 /* Single statements are always kept in their original order. */
231 if (!STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
232 && !STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
233 return true;
234
235 /* STMT_A and STMT_B belong to overlapping groups. All loads in a
236 SLP group are emitted at the position of the last scalar load and
237 all loads in an interleaving group are emitted at the position
238 of the first scalar load.
239 Stores in a group are emitted at the position of the last scalar store.
240 Compute that position and check whether the resulting order matches
241 the current one.
242 We have not yet decided between SLP and interleaving so we have
243 to conservatively assume both. */
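  /* Below, LAST_A is the latest scalar statement of STMT_A's group
     (the position used for SLP loads and for all stores), while IL_A is
     the position an interleaving scheme would use: the first scalar
     load of a load group, and again the last scalar store of a store
     group.  Likewise for LAST_B and IL_B.  */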
244 stmt_vec_info il_a;
245 stmt_vec_info last_a = il_a = DR_GROUP_FIRST_ELEMENT (stmtinfo_a);
246 if (last_a)
247 {
248 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (last_a); s;
249 s = DR_GROUP_NEXT_ELEMENT (s))
250 last_a = get_later_stmt (last_a, s);
251 if (!DR_IS_WRITE (STMT_VINFO_DATA_REF (stmtinfo_a)))
252 {
253 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_a); s;
254 s = DR_GROUP_NEXT_ELEMENT (s))
255 if (get_later_stmt (il_a, s) == il_a)
256 il_a = s;
257 }
258 else
259 il_a = last_a;
260 }
261 else
262 last_a = il_a = stmtinfo_a;
263 stmt_vec_info il_b;
264 stmt_vec_info last_b = il_b = DR_GROUP_FIRST_ELEMENT (stmtinfo_b);
265 if (last_b)
266 {
267 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (last_b); s;
268 s = DR_GROUP_NEXT_ELEMENT (s))
269 last_b = get_later_stmt (last_b, s);
270 if (!DR_IS_WRITE (STMT_VINFO_DATA_REF (stmtinfo_b)))
271 {
272 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_b); s;
273 s = DR_GROUP_NEXT_ELEMENT (s))
274 if (get_later_stmt (il_b, s) == il_b)
275 il_b = s;
276 }
277 else
278 il_b = last_b;
279 }
280 else
281 last_b = il_b = stmtinfo_b;
282 bool a_after_b = (get_later_stmt (stmtinfo_a, stmtinfo_b) == stmtinfo_a);
283 return (/* SLP */
284 (get_later_stmt (last_a, last_b) == last_a) == a_after_b
285 /* Interleaving */
286 && (get_later_stmt (il_a, il_b) == il_a) == a_after_b
287 /* Mixed */
288 && (get_later_stmt (il_a, last_b) == il_a) == a_after_b
289 && (get_later_stmt (last_a, il_b) == last_a) == a_after_b);
290 }
291
292 /* A subroutine of vect_analyze_data_ref_dependence. Handle
293 DDR_COULD_BE_INDEPENDENT_P ddr DDR that has a known set of dependence
294 distances. These distances are conservatively correct but they don't
295 reflect a guaranteed dependence.
296
297 Return true if this function does all the work necessary to avoid
298 an alias or false if the caller should use the dependence distances
299 to limit the vectorization factor in the usual way. LOOP_DEPTH is
300 the depth of the loop described by LOOP_VINFO and the other arguments
301 are as for vect_analyze_data_ref_dependence. */
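/* For example, if A[i] and B[i - 4] access what might be the same array
   through two different bases and the conservative dependence distance
   is 4, it is usually better to emit a runtime alias check than to cap
   the VF at 4, because the bases may well turn out to be independent.  */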
302
303 static bool
304 vect_analyze_possibly_independent_ddr (data_dependence_relation *ddr,
305 loop_vec_info loop_vinfo,
306 int loop_depth, unsigned int *max_vf)
307 {
308 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
309 lambda_vector dist_v;
310 unsigned int i;
311 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
312 {
313 int dist = dist_v[loop_depth];
314 if (dist != 0 && !(dist > 0 && DDR_REVERSED_P (ddr)))
315 {
316 /* If the user asserted safelen >= DIST consecutive iterations
317 can be executed concurrently, assume independence.
318
319 ??? An alternative would be to add the alias check even
320 in this case, and vectorize the fallback loop with the
321 maximum VF set to safelen. However, if the user has
322 explicitly given a length, it's less likely that that
323 would be a win. */
324 if (loop->safelen >= 2 && abs_hwi (dist) <= loop->safelen)
325 {
326 if ((unsigned int) loop->safelen < *max_vf)
327 *max_vf = loop->safelen;
328 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
329 continue;
330 }
331
332 /* For dependence distances of 2 or more, we have the option
333 of limiting VF or checking for an alias at runtime.
334 Prefer to check at runtime if we can, to avoid limiting
335 the VF unnecessarily when the bases are in fact independent.
336
337 Note that the alias checks will be removed if the VF ends up
338 being small enough. */
339 dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (DDR_A (ddr));
340 dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (DDR_B (ddr));
341 return (!STMT_VINFO_GATHER_SCATTER_P (dr_info_a->stmt)
342 && !STMT_VINFO_GATHER_SCATTER_P (dr_info_b->stmt)
343 && vect_mark_for_runtime_alias_test (ddr, loop_vinfo));
344 }
345 }
346 return true;
347 }
348
349
350 /* Function vect_analyze_data_ref_dependence.
351
352 FIXME: I needed to change the sense of the returned flag.
353
354 Return FALSE if there (might) exist a dependence between a memory-reference
355 DRA and a memory-reference DRB. When versioning for alias may check a
356 dependence at run-time, return TRUE. Adjust *MAX_VF according to
357 the data dependence. */
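/* For example, in

     for (i = 0; i < n; i++)
       a[i + 3] = a[i] + 1;

   the dependence distance is 3, so *MAX_VF is capped at 3.  A distance
   of *MAX_VF or more needs no action, and a distance of zero
   essentially requires that the scalar order of the two accesses is
   preserved.  */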
358
359 static opt_result
360 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
361 loop_vec_info loop_vinfo,
362 unsigned int *max_vf)
363 {
364 unsigned int i;
365 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
366 struct data_reference *dra = DDR_A (ddr);
367 struct data_reference *drb = DDR_B (ddr);
368 dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (dra);
369 dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (drb);
370 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
371 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
372 lambda_vector dist_v;
373 unsigned int loop_depth;
374
375 /* In loop analysis all data references should be vectorizable. */
376 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
377 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
378 gcc_unreachable ();
379
380 /* Independent data accesses. */
381 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
382 return opt_result::success ();
383
384 if (dra == drb
385 || (DR_IS_READ (dra) && DR_IS_READ (drb)))
386 return opt_result::success ();
387
388 /* We do not have to consider dependences between accesses that belong
389 to the same group, unless the stride could be smaller than the
390 group size. */
391 if (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
392 && (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
393 == DR_GROUP_FIRST_ELEMENT (stmtinfo_b))
394 && !STMT_VINFO_STRIDED_P (stmtinfo_a))
395 return opt_result::success ();
396
397 /* Even if we have an anti-dependence then, as the vectorized loop covers at
398 least two scalar iterations, there is always also a true dependence.
399 As the vectorizer does not re-order loads and stores we can ignore
400 the anti-dependence if TBAA can disambiguate both DRs similar to the
401 case with known negative distance anti-dependences (positive
402 distance anti-dependences would violate TBAA constraints). */
403 if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
404 || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
405 && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
406 get_alias_set (DR_REF (drb))))
407 return opt_result::success ();
408
409 /* Unknown data dependence. */
410 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
411 {
412 /* If user asserted safelen consecutive iterations can be
413 executed concurrently, assume independence. */
414 if (loop->safelen >= 2)
415 {
416 if ((unsigned int) loop->safelen < *max_vf)
417 *max_vf = loop->safelen;
418 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
419 return opt_result::success ();
420 }
421
422 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
423 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
424 return opt_result::failure_at
425 (stmtinfo_a->stmt,
426 "versioning for alias not supported for: "
427 "can't determine dependence between %T and %T\n",
428 DR_REF (dra), DR_REF (drb));
429
430 if (dump_enabled_p ())
431 dump_printf_loc (MSG_MISSED_OPTIMIZATION, stmtinfo_a->stmt,
432 "versioning for alias required: "
433 "can't determine dependence between %T and %T\n",
434 DR_REF (dra), DR_REF (drb));
435
436 /* Add to list of ddrs that need to be tested at run-time. */
437 return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
438 }
439
440 /* Known data dependence. */
441 if (DDR_NUM_DIST_VECTS (ddr) == 0)
442 {
443 /* If user asserted safelen consecutive iterations can be
444 executed concurrently, assume independence. */
445 if (loop->safelen >= 2)
446 {
447 if ((unsigned int) loop->safelen < *max_vf)
448 *max_vf = loop->safelen;
449 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
450 return opt_result::success ();
451 }
452
453 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
454 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
455 return opt_result::failure_at
456 (stmtinfo_a->stmt,
457 "versioning for alias not supported for: "
458 "bad dist vector for %T and %T\n",
459 DR_REF (dra), DR_REF (drb));
460
461 if (dump_enabled_p ())
462 dump_printf_loc (MSG_MISSED_OPTIMIZATION, stmtinfo_a->stmt,
463 "versioning for alias required: "
464 "bad dist vector for %T and %T\n",
465 DR_REF (dra), DR_REF (drb));
466 /* Add to list of ddrs that need to be tested at run-time. */
467 return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
468 }
469
470 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
471
472 if (DDR_COULD_BE_INDEPENDENT_P (ddr)
473 && vect_analyze_possibly_independent_ddr (ddr, loop_vinfo,
474 loop_depth, max_vf))
475 return opt_result::success ();
476
477 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
478 {
479 int dist = dist_v[loop_depth];
480
481 if (dump_enabled_p ())
482 dump_printf_loc (MSG_NOTE, vect_location,
483 "dependence distance = %d.\n", dist);
484
485 if (dist == 0)
486 {
487 if (dump_enabled_p ())
488 dump_printf_loc (MSG_NOTE, vect_location,
489 "dependence distance == 0 between %T and %T\n",
490 DR_REF (dra), DR_REF (drb));
491
492 /* When we perform grouped accesses and perform implicit CSE
493 by detecting equal accesses and doing disambiguation with
494 runtime alias tests like for
495 .. = a[i];
496 .. = a[i+1];
497 a[i] = ..;
498 a[i+1] = ..;
499 *p = ..;
500 .. = a[i];
501 .. = a[i+1];
502 where we will end up loading { a[i], a[i+1] } once, make
503 sure that inserting group loads before the first load and
504 stores after the last store will do the right thing.
505 Similar for groups like
506 a[i] = ...;
507 ... = a[i];
508 a[i+1] = ...;
509 where loads from the group interleave with the store. */
510 if (!vect_preserves_scalar_order_p (dr_info_a, dr_info_b))
511 return opt_result::failure_at (stmtinfo_a->stmt,
512 "READ_WRITE dependence"
513 " in interleaving.\n");
514
515 if (loop->safelen < 2)
516 {
517 tree indicator = dr_zero_step_indicator (dra);
518 if (!indicator || integer_zerop (indicator))
519 return opt_result::failure_at (stmtinfo_a->stmt,
520 "access also has a zero step\n");
521 else if (TREE_CODE (indicator) != INTEGER_CST)
522 vect_check_nonzero_value (loop_vinfo, indicator);
523 }
524 continue;
525 }
526
527 if (dist > 0 && DDR_REVERSED_P (ddr))
528 {
529 /* If DDR_REVERSED_P the order of the data-refs in DDR was
530 reversed (to make distance vector positive), and the actual
531 distance is negative. */
532 if (dump_enabled_p ())
533 dump_printf_loc (MSG_NOTE, vect_location,
534 "dependence distance negative.\n");
535 /* When doing outer loop vectorization, we need to check if there is
536 a backward dependence at the inner loop level if the dependence
537 at the outer loop is reversed. See PR81740. */
538 if (nested_in_vect_loop_p (loop, stmtinfo_a)
539 || nested_in_vect_loop_p (loop, stmtinfo_b))
540 {
541 unsigned inner_depth = index_in_loop_nest (loop->inner->num,
542 DDR_LOOP_NEST (ddr));
543 if (dist_v[inner_depth] < 0)
544 return opt_result::failure_at (stmtinfo_a->stmt,
545 "not vectorized, dependence "
546 "between data-refs %T and %T\n",
547 DR_REF (dra), DR_REF (drb));
548 }
549 /* Record a negative dependence distance to later limit the
550 amount of stmt copying / unrolling we can perform.
551 Only need to handle read-after-write dependence. */
552 if (DR_IS_READ (drb)
553 && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
554 || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
555 STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
556 continue;
557 }
558
559 unsigned int abs_dist = abs (dist);
560 if (abs_dist >= 2 && abs_dist < *max_vf)
561 {
562 /* The dependence distance requires reduction of the maximal
563 vectorization factor. */
564 *max_vf = abs_dist;
565 if (dump_enabled_p ())
566 dump_printf_loc (MSG_NOTE, vect_location,
567 "adjusting maximal vectorization factor to %i\n",
568 *max_vf);
569 }
570
571 if (abs_dist >= *max_vf)
572 {
573 /* Dependence distance does not create dependence, as far as
574 vectorization is concerned, in this case. */
575 if (dump_enabled_p ())
576 dump_printf_loc (MSG_NOTE, vect_location,
577 "dependence distance >= VF.\n");
578 continue;
579 }
580
581 return opt_result::failure_at (stmtinfo_a->stmt,
582 "not vectorized, possible dependence "
583 "between data-refs %T and %T\n",
584 DR_REF (dra), DR_REF (drb));
585 }
586
587 return opt_result::success ();
588 }
589
590 /* Function vect_analyze_data_ref_dependences.
591
592 Examine all the data references in the loop, and make sure there do not
593 exist any data dependences between them. Set *MAX_VF according to
594 the maximum vectorization factor the data dependences allow. */
595
596 opt_result
597 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo,
598 unsigned int *max_vf)
599 {
600 unsigned int i;
601 struct data_dependence_relation *ddr;
602
603 DUMP_VECT_SCOPE ("vect_analyze_data_ref_dependences");
604
605 if (!LOOP_VINFO_DDRS (loop_vinfo).exists ())
606 {
607 LOOP_VINFO_DDRS (loop_vinfo)
608 .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
609 * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
610 /* We need read-read dependences to compute
611 STMT_VINFO_SAME_ALIGN_REFS. */
612 bool res = compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
613 &LOOP_VINFO_DDRS (loop_vinfo),
614 LOOP_VINFO_LOOP_NEST (loop_vinfo),
615 true);
616 gcc_assert (res);
617 }
618
619 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
620
621 /* For epilogues we either have no aliases or alias versioning
622 was applied to original loop. Therefore we may just get max_vf
623 using VF of original loop. */
624 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
625 *max_vf = LOOP_VINFO_ORIG_MAX_VECT_FACTOR (loop_vinfo);
626 else
627 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
628 {
629 opt_result res
630 = vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf);
631 if (!res)
632 return res;
633 }
634
635 return opt_result::success ();
636 }
637
638
639 /* Function vect_slp_analyze_data_ref_dependence.
640
641 Return TRUE if there (might) exist a dependence between a memory-reference
642 DRA and a memory-reference DRB for VINFO. When versioning for alias
643 may check a dependence at run-time, return FALSE. */
644
645
646 static bool
647 vect_slp_analyze_data_ref_dependence (vec_info *vinfo,
648 struct data_dependence_relation *ddr)
649 {
650 struct data_reference *dra = DDR_A (ddr);
651 struct data_reference *drb = DDR_B (ddr);
652 dr_vec_info *dr_info_a = vinfo->lookup_dr (dra);
653 dr_vec_info *dr_info_b = vinfo->lookup_dr (drb);
654
655 /* We need to check dependences of statements marked as unvectorizable
656 as well, they still can prohibit vectorization. */
657
658 /* Independent data accesses. */
659 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
660 return false;
661
662 if (dra == drb)
663 return false;
664
665 /* Read-read is OK. */
666 if (DR_IS_READ (dra) && DR_IS_READ (drb))
667 return false;
668
669 /* If dra and drb are part of the same interleaving chain consider
670 them independent. */
671 if (STMT_VINFO_GROUPED_ACCESS (dr_info_a->stmt)
672 && (DR_GROUP_FIRST_ELEMENT (dr_info_a->stmt)
673 == DR_GROUP_FIRST_ELEMENT (dr_info_b->stmt)))
674 return false;
675
676 /* Unknown data dependence. */
677 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
678 {
679 if (dump_enabled_p ())
680 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
681 "can't determine dependence between %T and %T\n",
682 DR_REF (dra), DR_REF (drb));
683 }
684 else if (dump_enabled_p ())
685 dump_printf_loc (MSG_NOTE, vect_location,
686 "determined dependence between %T and %T\n",
687 DR_REF (dra), DR_REF (drb));
688
689 return true;
690 }
691
692
693 /* Analyze dependences involved in the transform of SLP NODE. STORES
694 contain the vector of scalar stores of this instance if we are
695 disambiguating the loads. */
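/* Each intervening statement between a scalar access of NODE and the
   last scalar access of the group is checked in one of three ways:
   through a data dependence relation if it has a recorded data
   reference, through the alias oracle if it has not, or, if it is a
   store belonging to this same instance, by deferring the test to the
   position of LAST_STORE_INFO, where the store group itself will be
   sunk.  */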
696
697 static bool
698 vect_slp_analyze_node_dependences (slp_instance instance, slp_tree node,
699 vec<stmt_vec_info> stores,
700 stmt_vec_info last_store_info)
701 {
702 /* This walks over all stmts involved in the SLP load/store done
703 in NODE verifying we can sink them up to the last stmt in the
704 group. */
705 stmt_vec_info last_access_info = vect_find_last_scalar_stmt_in_slp (node);
706 vec_info *vinfo = last_access_info->vinfo;
707 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
708 {
709 stmt_vec_info access_info = SLP_TREE_SCALAR_STMTS (node)[k];
710 if (access_info == last_access_info)
711 continue;
712 data_reference *dr_a = STMT_VINFO_DATA_REF (access_info);
713 ao_ref ref;
714 bool ref_initialized_p = false;
715 for (gimple_stmt_iterator gsi = gsi_for_stmt (access_info->stmt);
716 gsi_stmt (gsi) != last_access_info->stmt; gsi_next (&gsi))
717 {
718 gimple *stmt = gsi_stmt (gsi);
719 if (! gimple_vuse (stmt)
720 || (DR_IS_READ (dr_a) && ! gimple_vdef (stmt)))
721 continue;
722
723 /* If we couldn't record a (single) data reference for this
724 stmt we have to resort to the alias oracle. */
725 stmt_vec_info stmt_info = vinfo->lookup_stmt (stmt);
726 data_reference *dr_b = STMT_VINFO_DATA_REF (stmt_info);
727 if (!dr_b)
728 {
729 /* We are moving a store or sinking a load - this means
730 we cannot use TBAA for disambiguation. */
731 if (!ref_initialized_p)
732 ao_ref_init (&ref, DR_REF (dr_a));
733 if (stmt_may_clobber_ref_p_1 (stmt, &ref, false)
734 || ref_maybe_used_by_stmt_p (stmt, &ref, false))
735 return false;
736 continue;
737 }
738
739 bool dependent = false;
740 /* If we run into a store of this same instance (we've just
741 marked those) then delay dependence checking until we run
742 into the last store because this is where it will have
743 been sunk to (and we verify if we can do that as well). */
744 if (gimple_visited_p (stmt))
745 {
746 if (stmt_info != last_store_info)
747 continue;
748 unsigned i;
749 stmt_vec_info store_info;
750 FOR_EACH_VEC_ELT (stores, i, store_info)
751 {
752 data_reference *store_dr = STMT_VINFO_DATA_REF (store_info);
753 ddr_p ddr = initialize_data_dependence_relation
754 (dr_a, store_dr, vNULL);
755 dependent
756 = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
757 free_dependence_relation (ddr);
758 if (dependent)
759 break;
760 }
761 }
762 else
763 {
764 ddr_p ddr = initialize_data_dependence_relation (dr_a,
765 dr_b, vNULL);
766 dependent = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
767 free_dependence_relation (ddr);
768 }
769 if (dependent)
770 return false;
771 }
772 }
773 return true;
774 }
775
776
777 /* Function vect_slp_analyze_instance_dependence.
778
779 Examine all the data references in the basic-block, and make sure there
780 do not exist any data dependences between them. */
781
782
783 bool
784 vect_slp_analyze_instance_dependence (slp_instance instance)
785 {
786 DUMP_VECT_SCOPE ("vect_slp_analyze_instance_dependence");
787
788 /* The stores of this instance are at the root of the SLP tree. */
789 slp_tree store = SLP_INSTANCE_TREE (instance);
790 if (! STMT_VINFO_DATA_REF (SLP_TREE_SCALAR_STMTS (store)[0]))
791 store = NULL;
792
793 /* Verify we can sink stores to the vectorized stmt insert location. */
794 stmt_vec_info last_store_info = NULL;
795 if (store)
796 {
797 if (! vect_slp_analyze_node_dependences (instance, store, vNULL, NULL))
798 return false;
799
800 /* Mark stores in this instance and remember the last one. */
801 last_store_info = vect_find_last_scalar_stmt_in_slp (store);
802 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
803 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, true);
804 }
805
806 bool res = true;
807
808 /* Verify we can sink loads to the vectorized stmt insert location,
809 special-casing stores of this instance. */
810 slp_tree load;
811 unsigned int i;
812 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load)
813 if (! vect_slp_analyze_node_dependences (instance, load,
814 store
815 ? SLP_TREE_SCALAR_STMTS (store)
816 : vNULL, last_store_info))
817 {
818 res = false;
819 break;
820 }
821
822 /* Unset the visited flag. */
823 if (store)
824 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
825 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, false);
826
827 return res;
828 }
829
830 /* Record the base alignment guarantee given by DRB, which occurs
831 in STMT_INFO. */
832
833 static void
834 vect_record_base_alignment (stmt_vec_info stmt_info,
835 innermost_loop_behavior *drb)
836 {
837 vec_info *vinfo = stmt_info->vinfo;
838 bool existed;
839 innermost_loop_behavior *&entry
840 = vinfo->base_alignments.get_or_insert (drb->base_address, &existed);
841 if (!existed || entry->base_alignment < drb->base_alignment)
842 {
843 entry = drb;
844 if (dump_enabled_p ())
845 dump_printf_loc (MSG_NOTE, vect_location,
846 "recording new base alignment for %T\n"
847 " alignment: %d\n"
848 " misalignment: %d\n"
849 " based on: %G",
850 drb->base_address,
851 drb->base_alignment,
852 drb->base_misalignment,
853 stmt_info->stmt);
854 }
855 }
856
857 /* If the region we're going to vectorize is reached, all unconditional
858 data references occur at least once. We can therefore pool the base
859 alignment guarantees from each unconditional reference. Do this by
860 going through all the data references in VINFO and checking whether
861 the containing statement makes the reference unconditionally. If so,
862 record the alignment of the base address in VINFO so that it can be
863 used for all other references with the same base. */
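/* For example, if the region unconditionally reads both p->a and p->b,
   a base alignment guarantee established for P by one of the two
   references can be reused when computing the misalignment of the
   other.  */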
864
865 void
866 vect_record_base_alignments (vec_info *vinfo)
867 {
868 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
869 class loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
870 data_reference *dr;
871 unsigned int i;
872 FOR_EACH_VEC_ELT (vinfo->shared->datarefs, i, dr)
873 {
874 dr_vec_info *dr_info = vinfo->lookup_dr (dr);
875 stmt_vec_info stmt_info = dr_info->stmt;
876 if (!DR_IS_CONDITIONAL_IN_STMT (dr)
877 && STMT_VINFO_VECTORIZABLE (stmt_info)
878 && !STMT_VINFO_GATHER_SCATTER_P (stmt_info))
879 {
880 vect_record_base_alignment (stmt_info, &DR_INNERMOST (dr));
881
882 /* If DR is nested in the loop that is being vectorized, we can also
883 record the alignment of the base wrt the outer loop. */
884 if (loop && nested_in_vect_loop_p (loop, stmt_info))
885 vect_record_base_alignment
886 (stmt_info, &STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info));
887 }
888 }
889 }
890
891 /* Return the target alignment for the vectorized form of DR_INFO. */
892
893 static poly_uint64
894 vect_calculate_target_alignment (dr_vec_info *dr_info)
895 {
896 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
897 return targetm.vectorize.preferred_vector_alignment (vectype);
898 }
899
900 /* Function vect_compute_data_ref_alignment
901
902 Compute the misalignment of the data reference DR_INFO.
903
904 Output:
905 1. DR_MISALIGNMENT (DR_INFO) is defined.
906
907 FOR NOW: No analysis is actually performed. Misalignment is calculated
908 only for trivial cases. TODO. */
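/* For example, with a 16-byte vector alignment, a reference whose base
   is known to be 32-byte aligned and whose constant start offset is 4
   gets DR_MISALIGNMENT 4.  If the offset alignment is too weak, the
   base alignment is too weak and cannot be forced, the step is not an
   INTEGER_CST, or the step does not preserve the misalignment from one
   vector iteration to the next, the misalignment is left as
   DR_MISALIGNMENT_UNKNOWN.  */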
909
910 static void
911 vect_compute_data_ref_alignment (dr_vec_info *dr_info)
912 {
913 stmt_vec_info stmt_info = dr_info->stmt;
914 vec_base_alignments *base_alignments = &stmt_info->vinfo->base_alignments;
915 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
916 class loop *loop = NULL;
917 tree ref = DR_REF (dr_info->dr);
918 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
919
920 if (dump_enabled_p ())
921 dump_printf_loc (MSG_NOTE, vect_location,
922 "vect_compute_data_ref_alignment:\n");
923
924 if (loop_vinfo)
925 loop = LOOP_VINFO_LOOP (loop_vinfo);
926
927 /* Initialize misalignment to unknown. */
928 SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN);
929
930 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
931 return;
932
933 innermost_loop_behavior *drb = vect_dr_behavior (dr_info);
934 bool step_preserves_misalignment_p;
935
936 poly_uint64 vector_alignment
937 = exact_div (vect_calculate_target_alignment (dr_info), BITS_PER_UNIT);
938 DR_TARGET_ALIGNMENT (dr_info) = vector_alignment;
939
940 /* If the main loop has peeled for alignment we have no way of knowing
941 whether the data accesses in the epilogues are aligned. We can't at
942 compile time answer the question whether we have entered the main loop or
943 not. Fixes PR 92351. */
944 if (loop_vinfo)
945 {
946 loop_vec_info orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
947 if (orig_loop_vinfo
948 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (orig_loop_vinfo) != 0)
949 return;
950 }
951
952 unsigned HOST_WIDE_INT vect_align_c;
953 if (!vector_alignment.is_constant (&vect_align_c))
954 return;
955
956 /* No step for BB vectorization. */
957 if (!loop)
958 {
959 gcc_assert (integer_zerop (drb->step));
960 step_preserves_misalignment_p = true;
961 }
962
963 /* In case the dataref is in an inner-loop of the loop that is being
964 vectorized (LOOP), we use the base and misalignment information
965 relative to the outer-loop (LOOP). This is ok only if the misalignment
966 stays the same throughout the execution of the inner-loop, which is why
967 we have to check that the stride of the dataref in the inner-loop evenly
968 divides by the vector alignment. */
969 else if (nested_in_vect_loop_p (loop, stmt_info))
970 {
971 step_preserves_misalignment_p
972 = (DR_STEP_ALIGNMENT (dr_info->dr) % vect_align_c) == 0;
973
974 if (dump_enabled_p ())
975 {
976 if (step_preserves_misalignment_p)
977 dump_printf_loc (MSG_NOTE, vect_location,
978 "inner step divides the vector alignment.\n");
979 else
980 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
981 "inner step doesn't divide the vector"
982 " alignment.\n");
983 }
984 }
985
986 /* Similarly we can only use base and misalignment information relative to
987 an innermost loop if the misalignment stays the same throughout the
988 execution of the loop. As above, this is the case if the stride of
989 the dataref evenly divides by the alignment. */
990 else
991 {
992 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
993 step_preserves_misalignment_p
994 = multiple_p (DR_STEP_ALIGNMENT (dr_info->dr) * vf, vect_align_c);
995
996 if (!step_preserves_misalignment_p && dump_enabled_p ())
997 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
998 "step doesn't divide the vector alignment.\n");
999 }
1000
1001 unsigned int base_alignment = drb->base_alignment;
1002 unsigned int base_misalignment = drb->base_misalignment;
1003
1004 /* Calculate the maximum of the pooled base address alignment and the
1005 alignment that we can compute for DR itself. */
1006 innermost_loop_behavior **entry = base_alignments->get (drb->base_address);
1007 if (entry && base_alignment < (*entry)->base_alignment)
1008 {
1009 base_alignment = (*entry)->base_alignment;
1010 base_misalignment = (*entry)->base_misalignment;
1011 }
1012
1013 if (drb->offset_alignment < vect_align_c
1014 || !step_preserves_misalignment_p
1015 /* We need to know whether the step wrt the vectorized loop is
1016 negative when computing the starting misalignment below. */
1017 || TREE_CODE (drb->step) != INTEGER_CST)
1018 {
1019 if (dump_enabled_p ())
1020 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1021 "Unknown alignment for access: %T\n", ref);
1022 return;
1023 }
1024
1025 if (base_alignment < vect_align_c)
1026 {
1027 unsigned int max_alignment;
1028 tree base = get_base_for_alignment (drb->base_address, &max_alignment);
1029 if (max_alignment < vect_align_c
1030 || !vect_can_force_dr_alignment_p (base,
1031 vect_align_c * BITS_PER_UNIT))
1032 {
1033 if (dump_enabled_p ())
1034 dump_printf_loc (MSG_NOTE, vect_location,
1035 "can't force alignment of ref: %T\n", ref);
1036 return;
1037 }
1038
1039 /* Force the alignment of the decl.
1040 NOTE: This is the only change to the code we make during
1041 the analysis phase, before deciding to vectorize the loop. */
1042 if (dump_enabled_p ())
1043 dump_printf_loc (MSG_NOTE, vect_location,
1044 "force alignment of %T\n", ref);
1045
1046 dr_info->base_decl = base;
1047 dr_info->base_misaligned = true;
1048 base_misalignment = 0;
1049 }
1050 poly_int64 misalignment
1051 = base_misalignment + wi::to_poly_offset (drb->init).force_shwi ();
1052
1053 /* If this is a backward running DR then the first access in the larger
1054 vectype actually is N-1 elements before the address in the DR.
1055 Adjust misalign accordingly. */
1056 if (tree_int_cst_sgn (drb->step) < 0)
1057 /* PLUS because STEP is negative. */
1058 misalignment += ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
1059 * TREE_INT_CST_LOW (drb->step));
1060
1061 unsigned int const_misalignment;
1062 if (!known_misalignment (misalignment, vect_align_c, &const_misalignment))
1063 {
1064 if (dump_enabled_p ())
1065 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1066 "Non-constant misalignment for access: %T\n", ref);
1067 return;
1068 }
1069
1070 SET_DR_MISALIGNMENT (dr_info, const_misalignment);
1071
1072 if (dump_enabled_p ())
1073 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1074 "misalign = %d bytes of ref %T\n",
1075 DR_MISALIGNMENT (dr_info), ref);
1076
1077 return;
1078 }
1079
1080 /* Function vect_update_misalignment_for_peel.
1081 Sets DR_INFO's misalignment
1082 - to 0 if it has the same alignment as DR_PEEL_INFO,
1083 - to the misalignment computed using NPEEL if DR_INFO's misalignment is known,
1084 - to -1 (unknown) otherwise.
1085
1086 DR_INFO - the data reference whose misalignment is to be adjusted.
1087 DR_PEEL_INFO - the data reference whose misalignment is being made
1088 zero in the vector loop by the peel.
1089 NPEEL - the number of iterations in the peel loop if the misalignment
1090 of DR_PEEL_INFO is known at compile time. */
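/* When the misalignments of DR_INFO and DR_PEEL_INFO are known, the new
   value is simply (DR_MISALIGNMENT (DR_INFO) + NPEEL * DR_STEP) modulo
   the target alignment, since every peeled scalar iteration advances
   the access by DR_STEP bytes.  */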
1091
1092 static void
1093 vect_update_misalignment_for_peel (dr_vec_info *dr_info,
1094 dr_vec_info *dr_peel_info, int npeel)
1095 {
1096 unsigned int i;
1097 vec<dr_p> same_aligned_drs;
1098 struct data_reference *current_dr;
1099 stmt_vec_info peel_stmt_info = dr_peel_info->stmt;
1100
1101 /* It can be assumed that if dr_info has the same alignment as dr_peel,
1102 it is aligned in the vector loop. */
1103 same_aligned_drs = STMT_VINFO_SAME_ALIGN_REFS (peel_stmt_info);
1104 FOR_EACH_VEC_ELT (same_aligned_drs, i, current_dr)
1105 {
1106 if (current_dr != dr_info->dr)
1107 continue;
1108 gcc_assert (!known_alignment_for_access_p (dr_info)
1109 || !known_alignment_for_access_p (dr_peel_info)
1110 || (DR_MISALIGNMENT (dr_info)
1111 == DR_MISALIGNMENT (dr_peel_info)));
1112 SET_DR_MISALIGNMENT (dr_info, 0);
1113 return;
1114 }
1115
1116 unsigned HOST_WIDE_INT alignment;
1117 if (DR_TARGET_ALIGNMENT (dr_info).is_constant (&alignment)
1118 && known_alignment_for_access_p (dr_info)
1119 && known_alignment_for_access_p (dr_peel_info))
1120 {
1121 int misal = DR_MISALIGNMENT (dr_info);
1122 misal += npeel * TREE_INT_CST_LOW (DR_STEP (dr_info->dr));
1123 misal &= alignment - 1;
1124 SET_DR_MISALIGNMENT (dr_info, misal);
1125 return;
1126 }
1127
1128 if (dump_enabled_p ())
1129 dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment " \
1130 "to unknown (-1).\n");
1131 SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN);
1132 }
1133
1134
1135 /* Function verify_data_ref_alignment
1136
1137 Return TRUE if DR_INFO can be handled with respect to alignment. */
1138
1139 static opt_result
1140 verify_data_ref_alignment (dr_vec_info *dr_info)
1141 {
1142 enum dr_alignment_support supportable_dr_alignment
1143 = vect_supportable_dr_alignment (dr_info, false);
1144 if (!supportable_dr_alignment)
1145 return opt_result::failure_at
1146 (dr_info->stmt->stmt,
1147 DR_IS_READ (dr_info->dr)
1148 ? "not vectorized: unsupported unaligned load: %T\n"
1149 : "not vectorized: unsupported unaligned store: %T\n",
1150 DR_REF (dr_info->dr));
1151
1152 if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
1153 dump_printf_loc (MSG_NOTE, vect_location,
1154 "Vectorizing an unaligned access.\n");
1155
1156 return opt_result::success ();
1157 }
1158
1159 /* Function vect_verify_datarefs_alignment
1160
1161 Return TRUE if all data references in the loop can be
1162 handled with respect to alignment. */
1163
1164 opt_result
1165 vect_verify_datarefs_alignment (loop_vec_info vinfo)
1166 {
1167 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
1168 struct data_reference *dr;
1169 unsigned int i;
1170
1171 FOR_EACH_VEC_ELT (datarefs, i, dr)
1172 {
1173 dr_vec_info *dr_info = vinfo->lookup_dr (dr);
1174 stmt_vec_info stmt_info = dr_info->stmt;
1175
1176 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1177 continue;
1178
1179 /* For interleaving, only the alignment of the first access matters. */
1180 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1181 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1182 continue;
1183
1184 /* Strided accesses perform only component accesses, alignment is
1185 irrelevant for them. */
1186 if (STMT_VINFO_STRIDED_P (stmt_info)
1187 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1188 continue;
1189
1190 opt_result res = verify_data_ref_alignment (dr_info);
1191 if (!res)
1192 return res;
1193 }
1194
1195 return opt_result::success ();
1196 }
1197
1198 /* Given a memory reference EXP return whether its alignment is less
1199 than its size. */
1200
1201 static bool
1202 not_size_aligned (tree exp)
1203 {
1204 if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
1205 return true;
1206
1207 return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
1208 > get_object_alignment (exp));
1209 }
1210
1211 /* Function vector_alignment_reachable_p
1212
1213 Return true if vector alignment for DR_INFO is reachable by peeling
1214 a few loop iterations. Return false otherwise. */
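/* For example, a non-grouped, unit-stride int access with a 16-byte
   target alignment and DR_MISALIGNMENT 8 becomes aligned after peeling
   two scalar iterations, whereas a misalignment of 6 is not reachable
   because the element size (4) does not divide it.  */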
1215
1216 static bool
1217 vector_alignment_reachable_p (dr_vec_info *dr_info)
1218 {
1219 stmt_vec_info stmt_info = dr_info->stmt;
1220 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1221
1222 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1223 {
1224 /* For interleaved access we peel only if the number of iterations in
1225 the prolog loop (VF - misalignment) is a multiple of the
1226 number of interleaved accesses. */
1227 int elem_size, mis_in_elements;
1228
1229 /* FORNOW: handle only known alignment. */
1230 if (!known_alignment_for_access_p (dr_info))
1231 return false;
1232
1233 poly_uint64 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1234 poly_uint64 vector_size = GET_MODE_SIZE (TYPE_MODE (vectype));
1235 elem_size = vector_element_size (vector_size, nelements);
1236 mis_in_elements = DR_MISALIGNMENT (dr_info) / elem_size;
1237
1238 if (!multiple_p (nelements - mis_in_elements, DR_GROUP_SIZE (stmt_info)))
1239 return false;
1240 }
1241
1242 /* If misalignment is known at the compile time then allow peeling
1243 only if natural alignment is reachable through peeling. */
1244 if (known_alignment_for_access_p (dr_info) && !aligned_access_p (dr_info))
1245 {
1246 HOST_WIDE_INT elmsize =
1247 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1248 if (dump_enabled_p ())
1249 {
1250 dump_printf_loc (MSG_NOTE, vect_location,
1251 "data size = %wd. misalignment = %d.\n", elmsize,
1252 DR_MISALIGNMENT (dr_info));
1253 }
1254 if (DR_MISALIGNMENT (dr_info) % elmsize)
1255 {
1256 if (dump_enabled_p ())
1257 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1258 "data size does not divide the misalignment.\n");
1259 return false;
1260 }
1261 }
1262
1263 if (!known_alignment_for_access_p (dr_info))
1264 {
1265 tree type = TREE_TYPE (DR_REF (dr_info->dr));
1266 bool is_packed = not_size_aligned (DR_REF (dr_info->dr));
1267 if (dump_enabled_p ())
1268 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1269 "Unknown misalignment, %snaturally aligned\n",
1270 is_packed ? "not " : "");
1271 return targetm.vectorize.vector_alignment_reachable (type, is_packed);
1272 }
1273
1274 return true;
1275 }
1276
1277
1278 /* Calculate the cost of the memory access represented by DR_INFO. */
1279
1280 static void
1281 vect_get_data_access_cost (dr_vec_info *dr_info,
1282 unsigned int *inside_cost,
1283 unsigned int *outside_cost,
1284 stmt_vector_for_cost *body_cost_vec,
1285 stmt_vector_for_cost *prologue_cost_vec)
1286 {
1287 stmt_vec_info stmt_info = dr_info->stmt;
1288 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1289 int ncopies;
1290
1291 if (PURE_SLP_STMT (stmt_info))
1292 ncopies = 1;
1293 else
1294 ncopies = vect_get_num_copies (loop_vinfo, STMT_VINFO_VECTYPE (stmt_info));
1295
1296 if (DR_IS_READ (dr_info->dr))
1297 vect_get_load_cost (stmt_info, ncopies, true, inside_cost, outside_cost,
1298 prologue_cost_vec, body_cost_vec, false);
1299 else
1300 vect_get_store_cost (stmt_info, ncopies, inside_cost, body_cost_vec);
1301
1302 if (dump_enabled_p ())
1303 dump_printf_loc (MSG_NOTE, vect_location,
1304 "vect_get_data_access_cost: inside_cost = %d, "
1305 "outside_cost = %d.\n", *inside_cost, *outside_cost);
1306 }
1307
1308
1309 typedef struct _vect_peel_info
1310 {
1311 dr_vec_info *dr_info;
1312 int npeel;
1313 unsigned int count;
1314 } *vect_peel_info;
1315
1316 typedef struct _vect_peel_extended_info
1317 {
1318 struct _vect_peel_info peel_info;
1319 unsigned int inside_cost;
1320 unsigned int outside_cost;
1321 } *vect_peel_extended_info;
1322
1323
1324 /* Peeling hashtable helpers. */
1325
1326 struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
1327 {
1328 static inline hashval_t hash (const _vect_peel_info *);
1329 static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
1330 };
1331
1332 inline hashval_t
1333 peel_info_hasher::hash (const _vect_peel_info *peel_info)
1334 {
1335 return (hashval_t) peel_info->npeel;
1336 }
1337
1338 inline bool
1339 peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
1340 {
1341 return (a->npeel == b->npeel);
1342 }
1343
1344
1345 /* Insert DR_INFO into peeling hash table with NPEEL as key. */
1346
1347 static void
1348 vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
1349 loop_vec_info loop_vinfo, dr_vec_info *dr_info,
1350 int npeel)
1351 {
1352 struct _vect_peel_info elem, *slot;
1353 _vect_peel_info **new_slot;
1354 bool supportable_dr_alignment
1355 = vect_supportable_dr_alignment (dr_info, true);
1356
1357 elem.npeel = npeel;
1358 slot = peeling_htab->find (&elem);
1359 if (slot)
1360 slot->count++;
1361 else
1362 {
1363 slot = XNEW (struct _vect_peel_info);
1364 slot->npeel = npeel;
1365 slot->dr_info = dr_info;
1366 slot->count = 1;
1367 new_slot = peeling_htab->find_slot (slot, INSERT);
1368 *new_slot = slot;
1369 }
1370
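  /* If DR_INFO cannot be vectorized at all at its current misalignment,
     inflate the count so that this peeling amount is strongly preferred
     by the most-frequent traversal under the unlimited cost model.  */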
1371 if (!supportable_dr_alignment
1372 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1373 slot->count += VECT_MAX_COST;
1374 }
1375
1376
1377 /* Traverse peeling hash table to find peeling option that aligns maximum
1378 number of data accesses. */
1379
1380 int
1381 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1382 _vect_peel_extended_info *max)
1383 {
1384 vect_peel_info elem = *slot;
1385
1386 if (elem->count > max->peel_info.count
1387 || (elem->count == max->peel_info.count
1388 && max->peel_info.npeel > elem->npeel))
1389 {
1390 max->peel_info.npeel = elem->npeel;
1391 max->peel_info.count = elem->count;
1392 max->peel_info.dr_info = elem->dr_info;
1393 }
1394
1395 return 1;
1396 }
1397
1398 /* Get the costs of peeling NPEEL iterations for LOOP_VINFO, checking
1399 data access costs for all data refs. If UNKNOWN_MISALIGNMENT is true,
1400 we assume DR0_INFO's misalignment will be zero after peeling. */
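/* The costs are accumulated with each data reference's misalignment
   temporarily set to the value it would have after peeling NPEEL
   iterations for DR0_INFO, and restored afterwards.  */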
1401
1402 static void
1403 vect_get_peeling_costs_all_drs (loop_vec_info loop_vinfo,
1404 dr_vec_info *dr0_info,
1405 unsigned int *inside_cost,
1406 unsigned int *outside_cost,
1407 stmt_vector_for_cost *body_cost_vec,
1408 stmt_vector_for_cost *prologue_cost_vec,
1409 unsigned int npeel,
1410 bool unknown_misalignment)
1411 {
1412 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1413 unsigned i;
1414 data_reference *dr;
1415
1416 FOR_EACH_VEC_ELT (datarefs, i, dr)
1417 {
1418 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1419 stmt_vec_info stmt_info = dr_info->stmt;
1420 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1421 continue;
1422
1423 /* For interleaving, only the alignment of the first access
1424 matters. */
1425 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1426 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1427 continue;
1428
1429 /* Strided accesses perform only component accesses, alignment is
1430 irrelevant for them. */
1431 if (STMT_VINFO_STRIDED_P (stmt_info)
1432 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1433 continue;
1434
1435 int save_misalignment;
1436 save_misalignment = DR_MISALIGNMENT (dr_info);
1437 if (npeel == 0)
1438 ;
1439 else if (unknown_misalignment && dr_info == dr0_info)
1440 SET_DR_MISALIGNMENT (dr_info, 0);
1441 else
1442 vect_update_misalignment_for_peel (dr_info, dr0_info, npeel);
1443 vect_get_data_access_cost (dr_info, inside_cost, outside_cost,
1444 body_cost_vec, prologue_cost_vec);
1445 SET_DR_MISALIGNMENT (dr_info, save_misalignment);
1446 }
1447 }
1448
1449 /* Traverse peeling hash table and calculate cost for each peeling option.
1450 Find the one with the lowest cost. */
1451
1452 int
1453 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1454 _vect_peel_extended_info *min)
1455 {
1456 vect_peel_info elem = *slot;
1457 int dummy;
1458 unsigned int inside_cost = 0, outside_cost = 0;
1459 stmt_vec_info stmt_info = elem->dr_info->stmt;
1460 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1461 stmt_vector_for_cost prologue_cost_vec, body_cost_vec,
1462 epilogue_cost_vec;
1463
1464 prologue_cost_vec.create (2);
1465 body_cost_vec.create (2);
1466 epilogue_cost_vec.create (2);
1467
1468 vect_get_peeling_costs_all_drs (loop_vinfo, elem->dr_info, &inside_cost,
1469 &outside_cost, &body_cost_vec,
1470 &prologue_cost_vec, elem->npeel, false);
1471
1472 body_cost_vec.release ();
1473
1474 outside_cost += vect_get_known_peeling_cost
1475 (loop_vinfo, elem->npeel, &dummy,
1476 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1477 &prologue_cost_vec, &epilogue_cost_vec);
1478
1479 /* Prologue and epilogue costs are added to the target model later.
1480 These costs depend only on the scalar iteration cost, the
1481 number of peeling iterations finally chosen, and the number of
1482 misaligned statements. So discard the information found here. */
1483 prologue_cost_vec.release ();
1484 epilogue_cost_vec.release ();
1485
1486 if (inside_cost < min->inside_cost
1487 || (inside_cost == min->inside_cost
1488 && outside_cost < min->outside_cost))
1489 {
1490 min->inside_cost = inside_cost;
1491 min->outside_cost = outside_cost;
1492 min->peel_info.dr_info = elem->dr_info;
1493 min->peel_info.npeel = elem->npeel;
1494 min->peel_info.count = elem->count;
1495 }
1496
1497 return 1;
1498 }
1499
1500
1501 /* Choose best peeling option by traversing peeling hash table and either
1502 choosing an option with the lowest cost (if cost model is enabled) or the
1503 option that aligns as many accesses as possible. */
1504
1505 static struct _vect_peel_extended_info
1506 vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
1507 loop_vec_info loop_vinfo)
1508 {
1509 struct _vect_peel_extended_info res;
1510
1511 res.peel_info.dr_info = NULL;
1512
1513 if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1514 {
1515 res.inside_cost = INT_MAX;
1516 res.outside_cost = INT_MAX;
1517 peeling_htab->traverse <_vect_peel_extended_info *,
1518 vect_peeling_hash_get_lowest_cost> (&res);
1519 }
1520 else
1521 {
1522 res.peel_info.count = 0;
1523 peeling_htab->traverse <_vect_peel_extended_info *,
1524 vect_peeling_hash_get_most_frequent> (&res);
1525 res.inside_cost = 0;
1526 res.outside_cost = 0;
1527 }
1528
1529 return res;
1530 }
1531
1532 /* Return true if the new peeling NPEEL is supported. */
1533
1534 static bool
1535 vect_peeling_supportable (loop_vec_info loop_vinfo, dr_vec_info *dr0_info,
1536 unsigned npeel)
1537 {
1538 unsigned i;
1539 struct data_reference *dr = NULL;
1540 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1541 enum dr_alignment_support supportable_dr_alignment;
1542
1543 /* Ensure that all data refs can be vectorized after the peel. */
1544 FOR_EACH_VEC_ELT (datarefs, i, dr)
1545 {
1546 int save_misalignment;
1547
1548 if (dr == dr0_info->dr)
1549 continue;
1550
1551 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1552 stmt_vec_info stmt_info = dr_info->stmt;
1553 /* For interleaving, only the alignment of the first access
1554 matters. */
1555 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1556 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1557 continue;
1558
1559 /* Strided accesses perform only component accesses, alignment is
1560 irrelevant for them. */
1561 if (STMT_VINFO_STRIDED_P (stmt_info)
1562 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1563 continue;
1564
1565 save_misalignment = DR_MISALIGNMENT (dr_info);
1566 vect_update_misalignment_for_peel (dr_info, dr0_info, npeel);
1567 supportable_dr_alignment
1568 = vect_supportable_dr_alignment (dr_info, false);
1569 SET_DR_MISALIGNMENT (dr_info, save_misalignment);
1570
1571 if (!supportable_dr_alignment)
1572 return false;
1573 }
1574
1575 return true;
1576 }
1577
1578 /* Function vect_enhance_data_refs_alignment
1579
1580 This pass will use loop versioning and loop peeling in order to enhance
1581 the alignment of data references in the loop.
1582
1583 FOR NOW: we assume that whatever versioning/peeling takes place, only the
1584 original loop is to be vectorized. Any other loops that are created by
1585 the transformations performed in this pass are not supposed to be
1586 vectorized. This restriction will be relaxed.
1587
1588 This pass will require a cost model to guide it whether to apply peeling
1589 or versioning or a combination of the two. For example, the scheme that
1590 Intel uses when given a loop with several memory accesses, is as follows:
1591 choose one memory access ('p') whose alignment you want to force by doing
1592 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
1593 other accesses are not necessarily aligned, or (2) use loop versioning to
1594 generate one loop in which all accesses are aligned, and another loop in
1595 which only 'p' is necessarily aligned.
1596
1597 ("Automatic Intra-Register Vectorization for the Intel Architecture",
1598 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Ximmin Tian, International
1599 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1600
1601 Devising a cost model is the most critical aspect of this work. It will
1602 guide us on which access to peel for, whether to use loop versioning, how
1603 many versions to create, etc. The cost model will probably consist of
1604 generic considerations as well as target specific considerations (on
1605 powerpc for example, misaligned stores are more painful than misaligned
1606 loads).
1607
1608 Here are the general steps involved in alignment enhancements:
1609
1610 -- original loop, before alignment analysis:
1611 for (i=0; i<N; i++){
1612 x = q[i]; # DR_MISALIGNMENT(q) = unknown
1613 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1614 }
1615
1616 -- After vect_compute_data_refs_alignment:
1617 for (i=0; i<N; i++){
1618 x = q[i]; # DR_MISALIGNMENT(q) = 3
1619 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1620 }
1621
1622 -- Possibility 1: we do loop versioning:
1623 if (p is aligned) {
1624 for (i=0; i<N; i++){ # loop 1A
1625 x = q[i]; # DR_MISALIGNMENT(q) = 3
1626 p[i] = y; # DR_MISALIGNMENT(p) = 0
1627 }
1628 }
1629 else {
1630 for (i=0; i<N; i++){ # loop 1B
1631 x = q[i]; # DR_MISALIGNMENT(q) = 3
1632 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1633 }
1634 }
1635
1636 -- Possibility 2: we do loop peeling:
1637 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1638 x = q[i];
1639 p[i] = y;
1640 }
1641 for (i = 3; i < N; i++){ # loop 2A
1642 x = q[i]; # DR_MISALIGNMENT(q) = 0
1643 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1644 }
1645
1646 -- Possibility 3: combination of loop peeling and versioning:
1647 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1648 x = q[i];
1649 p[i] = y;
1650 }
1651 if (p is aligned) {
1652 for (i = 3; i<N; i++){ # loop 3A
1653 x = q[i]; # DR_MISALIGNMENT(q) = 0
1654 p[i] = y; # DR_MISALIGNMENT(p) = 0
1655 }
1656 }
1657 else {
1658 for (i = 3; i<N; i++){ # loop 3B
1659 x = q[i]; # DR_MISALIGNMENT(q) = 0
1660 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1661 }
1662 }
1663
1664 These loops are later passed to loop_transform to be vectorized. The
1665 vectorizer will use the alignment information to guide the transformation
1666 (whether to generate regular loads/stores, or with special handling for
1667 misalignment). */
1668
1669 opt_result
1670 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1671 {
1672 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1673 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1674 enum dr_alignment_support supportable_dr_alignment;
1675 dr_vec_info *first_store = NULL;
1676 dr_vec_info *dr0_info = NULL;
1677 struct data_reference *dr;
1678 unsigned int i, j;
1679 bool do_peeling = false;
1680 bool do_versioning = false;
1681 unsigned int npeel = 0;
1682 bool one_misalignment_known = false;
1683 bool one_misalignment_unknown = false;
1684 bool one_dr_unsupportable = false;
1685 dr_vec_info *unsupportable_dr_info = NULL;
1686 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1687 unsigned possible_npeel_number = 1;
1688 tree vectype;
1689 unsigned int mis, same_align_drs_max = 0;
1690 hash_table<peel_info_hasher> peeling_htab (1);
1691
1692 DUMP_VECT_SCOPE ("vect_enhance_data_refs_alignment");
1693
1694 /* Reset data so we can safely be called multiple times. */
1695 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1696 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
1697
1698 /* While cost model enhancements are expected in the future, the high level
1699 view of the code at this time is as follows:
1700
1701 A) If there is a misaligned access then see if peeling to align
1702 this access can make all data references satisfy
1703 vect_supportable_dr_alignment. If so, update data structures
1704 as needed and return true.
1705
1706 B) If peeling wasn't possible and there is a data reference with an
1707 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1708 then see if loop versioning checks can be used to make all data
1709 references satisfy vect_supportable_dr_alignment. If so, update
1710 data structures as needed and return true.
1711
1712 C) If neither peeling nor versioning were successful then return false if
1713 any data reference does not satisfy vect_supportable_dr_alignment.
1714
1715 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1716
1717 Note, Possibility 3 above (which is peeling and versioning together) is not
1718 being done at this time. */
1719
1720 /* (1) Peeling to force alignment. */
1721
1722 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1723 Considerations:
1724 + How many accesses will become aligned due to the peeling
1725 - How many accesses will become unaligned due to the peeling,
1726 and the cost of misaligned accesses.
1727 - The cost of peeling (the extra runtime checks, the increase
1728 in code size). */
1729
1730 FOR_EACH_VEC_ELT (datarefs, i, dr)
1731 {
1732 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1733 stmt_vec_info stmt_info = dr_info->stmt;
1734
1735 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1736 continue;
1737
1738 /* For interleaving, only the alignment of the first access
1739 matters. */
1740 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1741 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1742 continue;
1743
1744 /* For scatter-gather or invariant accesses there is nothing
1745 to enhance. */
1746 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
1747 || integer_zerop (DR_STEP (dr)))
1748 continue;
1749
1750 /* Strided accesses perform only component accesses, alignment is
1751 irrelevant for them. */
1752 if (STMT_VINFO_STRIDED_P (stmt_info)
1753 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1754 continue;
1755
1756 supportable_dr_alignment = vect_supportable_dr_alignment (dr_info, true);
1757 do_peeling = vector_alignment_reachable_p (dr_info);
1758 if (do_peeling)
1759 {
1760 if (known_alignment_for_access_p (dr_info))
1761 {
1762 unsigned int npeel_tmp = 0;
1763 bool negative = tree_int_cst_compare (DR_STEP (dr),
1764 size_zero_node) < 0;
1765
1766 vectype = STMT_VINFO_VECTYPE (stmt_info);
1767 /* If known_alignment_for_access_p then we have set
1768 DR_MISALIGNMENT, which is only done if we know it at compile
1769 time, so it is safe to assume the target alignment is constant.
1770 */
1771 unsigned int target_align =
1772 DR_TARGET_ALIGNMENT (dr_info).to_constant ();
1773 unsigned int dr_size = vect_get_scalar_dr_size (dr_info);
1774 mis = (negative
1775 ? DR_MISALIGNMENT (dr_info)
1776 : -DR_MISALIGNMENT (dr_info));
1777 if (DR_MISALIGNMENT (dr_info) != 0)
1778 npeel_tmp = (mis & (target_align - 1)) / dr_size;
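/* Illustrative example (numbers chosen arbitrarily): with a positive
   step, a target alignment of 16 bytes, a 4-byte element and
   DR_MISALIGNMENT == 4, MIS is -4, so (MIS & 15) == 12 and
   NPEEL_TMP == 3: peeling three scalar iterations advances the access
   by 12 bytes, up to the next 16-byte boundary.  */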
1779
1780 /* For multiple types, it is possible that the bigger type access
1781 will have more than one peeling option. E.g., a loop with two
1782 types: one of size (vector size / 4), and the other one of
1783 size (vector size / 8). The vectorization factor will be 8. If both
1784 accesses are misaligned by 3, the first one needs one scalar
1785 iteration to be aligned, and the second one needs 5. But the
1786 first one will also be aligned by peeling 5 scalar
1787 iterations, and in that case both accesses will be aligned.
1788 Hence, besides the immediate peeling amount, we also want
1789 to try adding a full vector size, as long as we don't exceed
1790 the vectorization factor.
1791 We do this automatically under the unlimited cost model, since
1792 we calculate the cost for every peeling option. */
1793 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1794 {
1795 poly_uint64 nscalars = (STMT_SLP_TYPE (stmt_info)
1796 ? vf * DR_GROUP_SIZE (stmt_info) : vf);
1797 possible_npeel_number
1798 = vect_get_num_vectors (nscalars, vectype);
1799
1800 /* NPEEL_TMP is 0 when there is no misalignment, but also
1801 allow peeling NELEMENTS. */
1802 if (DR_MISALIGNMENT (dr_info) == 0)
1803 possible_npeel_number++;
1804 }
1805
1806 /* Save info about DR in the hash table. Also include peeling
1807 amounts according to the explanation above. */
1808 for (j = 0; j < possible_npeel_number; j++)
1809 {
1810 vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
1811 dr_info, npeel_tmp);
1812 npeel_tmp += target_align / dr_size;
1813 }
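/* With the illustrative numbers above (target alignment 16, element
   size 4), the candidates recorded in the hash table are NPEEL_TMP,
   NPEEL_TMP + 4, NPEEL_TMP + 8, ..., i.e. the same residue modulo a
   full vector's worth of elements, up to POSSIBLE_NPEEL_NUMBER
   entries.  */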
1814
1815 one_misalignment_known = true;
1816 }
1817 else
1818 {
1819 /* If we don't know any misalignment values, we prefer
1820 peeling for the data-ref that has the maximum number of data-refs
1821 with the same alignment, unless the target prefers to align
1822 stores over loads. */
1823 unsigned same_align_drs
1824 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1825 if (!dr0_info
1826 || same_align_drs_max < same_align_drs)
1827 {
1828 same_align_drs_max = same_align_drs;
1829 dr0_info = dr_info;
1830 }
1831 /* For data-refs with the same number of related
1832 accesses prefer the one where the misalign
1833 computation will be invariant in the outermost loop. */
1834 else if (same_align_drs_max == same_align_drs)
1835 {
1836 class loop *ivloop0, *ivloop;
1837 ivloop0 = outermost_invariant_loop_for_expr
1838 (loop, DR_BASE_ADDRESS (dr0_info->dr));
1839 ivloop = outermost_invariant_loop_for_expr
1840 (loop, DR_BASE_ADDRESS (dr));
1841 if ((ivloop && !ivloop0)
1842 || (ivloop && ivloop0
1843 && flow_loop_nested_p (ivloop, ivloop0)))
1844 dr0_info = dr_info;
1845 }
1846
1847 one_misalignment_unknown = true;
1848
1849 /* Check for data refs with unsupportable alignment that
1850 can be peeled. */
1851 if (!supportable_dr_alignment)
1852 {
1853 one_dr_unsupportable = true;
1854 unsupportable_dr_info = dr_info;
1855 }
1856
1857 if (!first_store && DR_IS_WRITE (dr))
1858 first_store = dr_info;
1859 }
1860 }
1861 else
1862 {
1863 if (!aligned_access_p (dr_info))
1864 {
1865 if (dump_enabled_p ())
1866 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1867 "vector alignment may not be reachable\n");
1868 break;
1869 }
1870 }
1871 }
1872
1873 /* Check if we can possibly peel the loop. */
1874 if (!vect_can_advance_ivs_p (loop_vinfo)
1875 || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))
1876 || loop->inner)
1877 do_peeling = false;
1878
1879 struct _vect_peel_extended_info peel_for_known_alignment;
1880 struct _vect_peel_extended_info peel_for_unknown_alignment;
1881 struct _vect_peel_extended_info best_peel;
1882
1883 peel_for_unknown_alignment.inside_cost = INT_MAX;
1884 peel_for_unknown_alignment.outside_cost = INT_MAX;
1885 peel_for_unknown_alignment.peel_info.count = 0;
1886
1887 if (do_peeling
1888 && one_misalignment_unknown)
1889 {
1890 /* Check whether the target prefers to align stores over loads, i.e.,
1891 whether misaligned stores are more expensive than misaligned loads
1892 (taking drs with the same alignment into account). */
1893 unsigned int load_inside_cost = 0;
1894 unsigned int load_outside_cost = 0;
1895 unsigned int store_inside_cost = 0;
1896 unsigned int store_outside_cost = 0;
1897 unsigned int estimated_npeels = vect_vf_for_cost (loop_vinfo) / 2;
1898
1899 stmt_vector_for_cost dummy;
1900 dummy.create (2);
1901 vect_get_peeling_costs_all_drs (loop_vinfo, dr0_info,
1902 &load_inside_cost,
1903 &load_outside_cost,
1904 &dummy, &dummy, estimated_npeels, true);
1905 dummy.release ();
1906
1907 if (first_store)
1908 {
1909 dummy.create (2);
1910 vect_get_peeling_costs_all_drs (loop_vinfo, first_store,
1911 &store_inside_cost,
1912 &store_outside_cost,
1913 &dummy, &dummy,
1914 estimated_npeels, true);
1915 dummy.release ();
1916 }
1917 else
1918 {
1919 store_inside_cost = INT_MAX;
1920 store_outside_cost = INT_MAX;
1921 }
1922
1923 if (load_inside_cost > store_inside_cost
1924 || (load_inside_cost == store_inside_cost
1925 && load_outside_cost > store_outside_cost))
1926 {
1927 dr0_info = first_store;
1928 peel_for_unknown_alignment.inside_cost = store_inside_cost;
1929 peel_for_unknown_alignment.outside_cost = store_outside_cost;
1930 }
1931 else
1932 {
1933 peel_for_unknown_alignment.inside_cost = load_inside_cost;
1934 peel_for_unknown_alignment.outside_cost = load_outside_cost;
1935 }
1936
1937 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1938 prologue_cost_vec.create (2);
1939 epilogue_cost_vec.create (2);
1940
1941 int dummy2;
1942 peel_for_unknown_alignment.outside_cost += vect_get_known_peeling_cost
1943 (loop_vinfo, estimated_npeels, &dummy2,
1944 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1945 &prologue_cost_vec, &epilogue_cost_vec);
1946
1947 prologue_cost_vec.release ();
1948 epilogue_cost_vec.release ();
1949
1950 peel_for_unknown_alignment.peel_info.count = 1
1951 + STMT_VINFO_SAME_ALIGN_REFS (dr0_info->stmt).length ();
1952 }
1953
1954 peel_for_unknown_alignment.peel_info.npeel = 0;
1955 peel_for_unknown_alignment.peel_info.dr_info = dr0_info;
1956
1957 best_peel = peel_for_unknown_alignment;
1958
1959 peel_for_known_alignment.inside_cost = INT_MAX;
1960 peel_for_known_alignment.outside_cost = INT_MAX;
1961 peel_for_known_alignment.peel_info.count = 0;
1962 peel_for_known_alignment.peel_info.dr_info = NULL;
1963
1964 if (do_peeling && one_misalignment_known)
1965 {
1966 /* Peeling is possible, but no data access requires alignment in order
1967 to be supported. So we try to choose the best possible peeling from
1968 the hash table. */
1969 peel_for_known_alignment = vect_peeling_hash_choose_best_peeling
1970 (&peeling_htab, loop_vinfo);
1971 }
1972
1973 /* Compare costs of peeling for known and unknown alignment. */
1974 if (peel_for_known_alignment.peel_info.dr_info != NULL
1975 && peel_for_unknown_alignment.inside_cost
1976 >= peel_for_known_alignment.inside_cost)
1977 {
1978 best_peel = peel_for_known_alignment;
1979
1980 /* If the best peeling for known alignment has NPEEL == 0, perform no
1981 peeling at all except if there is an unsupportable dr that we can
1982 align. */
1983 if (best_peel.peel_info.npeel == 0 && !one_dr_unsupportable)
1984 do_peeling = false;
1985 }
1986
1987 /* If there is an unsupportable data ref, prefer this over all choices so far
1988 since we'd have to discard a chosen peeling except when it accidentally
1989 aligned the unsupportable data ref. */
1990 if (one_dr_unsupportable)
1991 dr0_info = unsupportable_dr_info;
1992 else if (do_peeling)
1993 {
1994 /* Calculate the penalty for no peeling, i.e. leaving everything as-is.
1995 TODO: Use nopeel_outside_cost or get rid of it? */
1996 unsigned nopeel_inside_cost = 0;
1997 unsigned nopeel_outside_cost = 0;
1998
1999 stmt_vector_for_cost dummy;
2000 dummy.create (2);
2001 vect_get_peeling_costs_all_drs (loop_vinfo, NULL, &nopeel_inside_cost,
2002 &nopeel_outside_cost, &dummy, &dummy,
2003 0, false);
2004 dummy.release ();
2005
2006 /* Add epilogue costs. As we do not peel for alignment here, no prologue
2007 costs will be recorded. */
2008 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
2009 prologue_cost_vec.create (2);
2010 epilogue_cost_vec.create (2);
2011
2012 int dummy2;
2013 nopeel_outside_cost += vect_get_known_peeling_cost
2014 (loop_vinfo, 0, &dummy2,
2015 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
2016 &prologue_cost_vec, &epilogue_cost_vec);
2017
2018 prologue_cost_vec.release ();
2019 epilogue_cost_vec.release ();
2020
2021 npeel = best_peel.peel_info.npeel;
2022 dr0_info = best_peel.peel_info.dr_info;
2023
2024 /* If not peeling at all is no more expensive than the best peeling we
2025 have found so far, don't perform any peeling. */
2026 if (nopeel_inside_cost <= best_peel.inside_cost)
2027 do_peeling = false;
2028 }
2029
2030 if (do_peeling)
2031 {
2032 stmt_vec_info stmt_info = dr0_info->stmt;
2033 vectype = STMT_VINFO_VECTYPE (stmt_info);
2034
2035 if (known_alignment_for_access_p (dr0_info))
2036 {
2037 bool negative = tree_int_cst_compare (DR_STEP (dr0_info->dr),
2038 size_zero_node) < 0;
2039 if (!npeel)
2040 {
2041 /* Since it's known at compile time, compute the number of
2042 iterations in the peeled loop (the peeling factor) for use in
2043 updating DR_MISALIGNMENT values. The peeling factor is the
2044 vectorization factor minus the misalignment as an element
2045 count. */
2046 mis = (negative
2047 ? DR_MISALIGNMENT (dr0_info)
2048 : -DR_MISALIGNMENT (dr0_info));
2049 /* If known_alignment_for_access_p then we have set
2050 DR_MISALIGNMENT, which is only done if we know it at compile
2051 time, so it is safe to assume the target alignment is constant.
2052 */
2053 unsigned int target_align =
2054 DR_TARGET_ALIGNMENT (dr0_info).to_constant ();
2055 npeel = ((mis & (target_align - 1))
2056 / vect_get_scalar_dr_size (dr0_info));
2057 }
2058
2059 /* For interleaved data access every iteration accesses all the
2060 members of the group, therefore we divide the number of iterations
2061 by the group size. */
2062 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2063 npeel /= DR_GROUP_SIZE (stmt_info);
2064
2065 if (dump_enabled_p ())
2066 dump_printf_loc (MSG_NOTE, vect_location,
2067 "Try peeling by %d\n", npeel);
2068 }
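/* Illustrative example (arbitrary numbers): with a positive step, a
   16-byte target alignment, 4-byte elements and
   DR_MISALIGNMENT (dr0_info) == 8, the computation above gives
   NPEEL == (-8 & 15) / 4 == 2 scalar iterations; for an interleaved
   group of size 2 this is further divided down to 1.  */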
2069
2070 /* Ensure that all datarefs can be vectorized after the peel. */
2071 if (!vect_peeling_supportable (loop_vinfo, dr0_info, npeel))
2072 do_peeling = false;
2073
2074 /* Check if all datarefs are supportable and log. */
2075 if (do_peeling && known_alignment_for_access_p (dr0_info) && npeel == 0)
2076 {
2077 opt_result stat = vect_verify_datarefs_alignment (loop_vinfo);
2078 if (!stat)
2079 do_peeling = false;
2080 else
2081 return stat;
2082 }
2083
2084 /* Cost model #1 - honor --param vect-max-peeling-for-alignment. */
2085 if (do_peeling)
2086 {
2087 unsigned max_allowed_peel
2088 = param_vect_max_peeling_for_alignment;
2089 if (flag_vect_cost_model == VECT_COST_MODEL_CHEAP)
2090 max_allowed_peel = 0;
2091 if (max_allowed_peel != (unsigned)-1)
2092 {
2093 unsigned max_peel = npeel;
2094 if (max_peel == 0)
2095 {
2096 poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr0_info);
2097 unsigned HOST_WIDE_INT target_align_c;
2098 if (target_align.is_constant (&target_align_c))
2099 max_peel =
2100 target_align_c / vect_get_scalar_dr_size (dr0_info) - 1;
2101 else
2102 {
2103 do_peeling = false;
2104 if (dump_enabled_p ())
2105 dump_printf_loc (MSG_NOTE, vect_location,
2106 "Disable peeling, max peels set and vector"
2107 " alignment unknown\n");
2108 }
2109 }
2110 if (max_peel > max_allowed_peel)
2111 {
2112 do_peeling = false;
2113 if (dump_enabled_p ())
2114 dump_printf_loc (MSG_NOTE, vect_location,
2115 "Disable peeling, max peels reached: %d\n", max_peel);
2116 }
2117 }
2118 }
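/* Illustrative example (arbitrary numbers): if the peeling amount is
   only known at run time (NPEEL == 0), a 16-byte target alignment
   with 4-byte elements needs at most 16 / 4 - 1 == 3 peeled
   iterations, so peeling is disabled whenever the parameter is below
   3, and in particular under the cheap cost model, which forces the
   limit to 0.  */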
2119
2120 /* Cost model #2 - if peeling may result in a remaining loop not
2121 iterating enough to be vectorized then do not peel. Since this
2122 is a cost heuristic rather than a correctness decision, use the
2123 most likely runtime value for variable vectorization factors. */
2124 if (do_peeling
2125 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2126 {
2127 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2128 unsigned int max_peel = npeel == 0 ? assumed_vf - 1 : npeel;
2129 if ((unsigned HOST_WIDE_INT) LOOP_VINFO_INT_NITERS (loop_vinfo)
2130 < assumed_vf + max_peel)
2131 do_peeling = false;
2132 }
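/* Illustrative example (arbitrary numbers): with an assumed
   vectorization factor of 4 and a runtime peeling amount (NPEEL == 0),
   up to 3 iterations may be peeled, so a known iteration count below
   4 + 3 == 7 leaves too few iterations for the vector loop and
   peeling is skipped.  */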
2133
2134 if (do_peeling)
2135 {
2136 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
2137 If the misalignment of DR_i is identical to that of dr0 then set
2138 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
2139 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
2140 by the peeling factor times the element size of DR_i (MOD the
2141 vectorization factor times the size). Otherwise, the
2142 misalignment of DR_i must be set to unknown. */
2143 FOR_EACH_VEC_ELT (datarefs, i, dr)
2144 if (dr != dr0_info->dr)
2145 {
2146 /* Strided accesses perform only component accesses, alignment
2147 is irrelevant for them. */
2148 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2149 stmt_info = dr_info->stmt;
2150 if (STMT_VINFO_STRIDED_P (stmt_info)
2151 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2152 continue;
2153
2154 vect_update_misalignment_for_peel (dr_info, dr0_info, npeel);
2155 }
2156
2157 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0_info;
2158 if (npeel)
2159 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
2160 else
2161 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2162 = DR_MISALIGNMENT (dr0_info);
2163 SET_DR_MISALIGNMENT (dr0_info, 0);
2164 if (dump_enabled_p ())
2165 {
2166 dump_printf_loc (MSG_NOTE, vect_location,
2167 "Alignment of access forced using peeling.\n");
2168 dump_printf_loc (MSG_NOTE, vect_location,
2169 "Peeling for alignment will be applied.\n");
2170 }
2171
2172 /* The inside-loop cost will be accounted for in vectorizable_load
2173 and vectorizable_store correctly with adjusted alignments.
2174 Drop the body_cst_vec on the floor here. */
2175 opt_result stat = vect_verify_datarefs_alignment (loop_vinfo);
2176 gcc_assert (stat);
2177 return stat;
2178 }
2179 }
2180
2181 /* (2) Versioning to force alignment. */
2182
2183 /* Try versioning if:
2184 1) optimize loop for speed and the cost-model is not cheap
2185 2) there is at least one unsupported misaligned data ref with an unknown
2186 misalignment, and
2187 3) all misaligned data refs with a known misalignment are supported, and
2188 4) the number of runtime alignment checks is within reason. */
2189
2190 do_versioning
2191 = (optimize_loop_nest_for_speed_p (loop)
2192 && !loop->inner /* FORNOW */
2193 && flag_vect_cost_model != VECT_COST_MODEL_CHEAP);
2194
2195 if (do_versioning)
2196 {
2197 FOR_EACH_VEC_ELT (datarefs, i, dr)
2198 {
2199 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2200 stmt_vec_info stmt_info = dr_info->stmt;
2201
2202 /* For interleaving, only the alignment of the first access
2203 matters. */
2204 if (aligned_access_p (dr_info)
2205 || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
2206 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info))
2207 continue;
2208
2209 if (STMT_VINFO_STRIDED_P (stmt_info))
2210 {
2211 /* Strided loads perform only component accesses, alignment is
2212 irrelevant for them. */
2213 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
2214 continue;
2215 do_versioning = false;
2216 break;
2217 }
2218
2219 supportable_dr_alignment
2220 = vect_supportable_dr_alignment (dr_info, false);
2221
2222 if (!supportable_dr_alignment)
2223 {
2224 int mask;
2225 tree vectype;
2226
2227 if (known_alignment_for_access_p (dr_info)
2228 || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
2229 >= (unsigned) param_vect_max_version_for_alignment_checks)
2230 {
2231 do_versioning = false;
2232 break;
2233 }
2234
2235 vectype = STMT_VINFO_VECTYPE (stmt_info);
2236 gcc_assert (vectype);
2237
2238 /* At present we don't support versioning for alignment
2239 with variable VF, since there's no guarantee that the
2240 VF is a power of two. We could relax this if we added
2241 a way of enforcing a power-of-two size. */
2242 unsigned HOST_WIDE_INT size;
2243 if (!GET_MODE_SIZE (TYPE_MODE (vectype)).is_constant (&size))
2244 {
2245 do_versioning = false;
2246 break;
2247 }
2248
2249 /* Forcing alignment in the first iteration is no good if
2250 we don't keep it across iterations. For now, just disable
2251 versioning in this case.
2252 ?? We could actually unroll the loop to achieve the required
2253 overall step alignment, and forcing the alignment could be
2254 done by doing some iterations of the non-vectorized loop. */
2255 if (!multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
2256 * DR_STEP_ALIGNMENT (dr),
2257 DR_TARGET_ALIGNMENT (dr_info)))
2258 {
2259 do_versioning = false;
2260 break;
2261 }
2262
2263 /* The rightmost bits of an aligned address must be zeros.
2264 Construct the mask needed for this test. For example,
2265 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
2266 mask must be 15 = 0xf. */
2267 mask = size - 1;
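/* The versioning check emitted later verifies that the low bits of
   each such address are zero; e.g. for a 16-byte vector an address
   of 0x1004 fails the test ((0x1004 & 0xf) == 0x4) while 0x1010
   passes (hypothetical addresses, for illustration only).  */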
2268
2269 /* FORNOW: use the same mask to test all potentially unaligned
2270 references in the loop. The vectorizer currently supports
2271 a single vector size, see the reference to
2272 GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
2273 vectorization factor is computed. */
2274 gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
2275 || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
2276 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
2277 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (stmt_info);
2278 }
2279 }
2280
2281 /* Versioning requires at least one misaligned data reference. */
2282 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2283 do_versioning = false;
2284 else if (!do_versioning)
2285 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
2286 }
2287
2288 if (do_versioning)
2289 {
2290 vec<stmt_vec_info> may_misalign_stmts
2291 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
2292 stmt_vec_info stmt_info;
2293
2294 /* It can now be assumed that the data references in the statements
2295 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
2296 of the loop being vectorized. */
2297 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt_info)
2298 {
2299 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
2300 SET_DR_MISALIGNMENT (dr_info, 0);
2301 if (dump_enabled_p ())
2302 dump_printf_loc (MSG_NOTE, vect_location,
2303 "Alignment of access forced using versioning.\n");
2304 }
2305
2306 if (dump_enabled_p ())
2307 dump_printf_loc (MSG_NOTE, vect_location,
2308 "Versioning for alignment will be applied.\n");
2309
2310 /* Peeling and versioning can't be done together at this time. */
2311 gcc_assert (! (do_peeling && do_versioning));
2312
2313 opt_result stat = vect_verify_datarefs_alignment (loop_vinfo);
2314 gcc_assert (stat);
2315 return stat;
2316 }
2317
2318 /* This point is reached if neither peeling nor versioning is being done. */
2319 gcc_assert (! (do_peeling || do_versioning));
2320
2321 opt_result stat = vect_verify_datarefs_alignment (loop_vinfo);
2322 return stat;
2323 }
2324
2325
2326 /* Function vect_find_same_alignment_drs.
2327
2328 Update group and alignment relations in VINFO according to the chosen
2329 vectorization factor. */
2330
2331 static void
2332 vect_find_same_alignment_drs (vec_info *vinfo, data_dependence_relation *ddr)
2333 {
2334 struct data_reference *dra = DDR_A (ddr);
2335 struct data_reference *drb = DDR_B (ddr);
2336 dr_vec_info *dr_info_a = vinfo->lookup_dr (dra);
2337 dr_vec_info *dr_info_b = vinfo->lookup_dr (drb);
2338 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
2339 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
2340
2341 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
2342 return;
2343
2344 if (dra == drb)
2345 return;
2346
2347 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
2348 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
2349 return;
2350
2351 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0)
2352 || !operand_equal_p (DR_OFFSET (dra), DR_OFFSET (drb), 0)
2353 || !operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2354 return;
2355
2356 /* Two references with distance zero have the same alignment. */
2357 poly_offset_int diff = (wi::to_poly_offset (DR_INIT (dra))
2358 - wi::to_poly_offset (DR_INIT (drb)));
2359 if (maybe_ne (diff, 0))
2360 {
2361 /* Get the wider of the two alignments. */
2362 poly_uint64 align_a =
2363 exact_div (vect_calculate_target_alignment (dr_info_a),
2364 BITS_PER_UNIT);
2365 poly_uint64 align_b =
2366 exact_div (vect_calculate_target_alignment (dr_info_b),
2367 BITS_PER_UNIT);
2368 unsigned HOST_WIDE_INT align_a_c, align_b_c;
2369 if (!align_a.is_constant (&align_a_c)
2370 || !align_b.is_constant (&align_b_c))
2371 return;
2372
2373 unsigned HOST_WIDE_INT max_align = MAX (align_a_c, align_b_c);
2374
2375 /* Require the gap to be a multiple of the larger vector alignment. */
2376 if (!multiple_p (diff, max_align))
2377 return;
2378 }
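/* Illustrative example (arbitrary types and sizes): for 4-byte
   elements and a 16-byte vector alignment, a[i] and a[i+4] have
   DR_INITs that differ by 16 bytes, a multiple of the vector
   alignment, so the two references are recorded below as having the
   same alignment; a[i] and a[i+1], with a gap of only 4 bytes, are
   not.  */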
2379
2380 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
2381 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
2382 if (dump_enabled_p ())
2383 dump_printf_loc (MSG_NOTE, vect_location,
2384 "accesses have the same alignment: %T and %T\n",
2385 DR_REF (dra), DR_REF (drb));
2386 }
2387
2388
2389 /* Function vect_analyze_data_refs_alignment
2390
2391 Analyze the alignment of the data-references in the loop.
2392 Return FALSE if a data reference is found that cannot be vectorized. */
2393
2394 opt_result
2395 vect_analyze_data_refs_alignment (loop_vec_info vinfo)
2396 {
2397 DUMP_VECT_SCOPE ("vect_analyze_data_refs_alignment");
2398
2399 /* Mark groups of data references with same alignment using
2400 data dependence information. */
2401 vec<ddr_p> ddrs = vinfo->shared->ddrs;
2402 struct data_dependence_relation *ddr;
2403 unsigned int i;
2404
2405 FOR_EACH_VEC_ELT (ddrs, i, ddr)
2406 vect_find_same_alignment_drs (vinfo, ddr);
2407
2408 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
2409 struct data_reference *dr;
2410
2411 vect_record_base_alignments (vinfo);
2412 FOR_EACH_VEC_ELT (datarefs, i, dr)
2413 {
2414 dr_vec_info *dr_info = vinfo->lookup_dr (dr);
2415 if (STMT_VINFO_VECTORIZABLE (dr_info->stmt))
2416 vect_compute_data_ref_alignment (dr_info);
2417 }
2418
2419 return opt_result::success ();
2420 }
2421
2422
2423 /* Analyze alignment of DRs of stmts in NODE. */
2424
2425 static bool
2426 vect_slp_analyze_and_verify_node_alignment (slp_tree node)
2427 {
2428 /* We vectorize from the first scalar stmt in the node unless
2429 the node is permuted in which case we start from the first
2430 element in the group. */
2431 stmt_vec_info first_stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2432 dr_vec_info *first_dr_info = STMT_VINFO_DR_INFO (first_stmt_info);
2433 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2434 first_stmt_info = DR_GROUP_FIRST_ELEMENT (first_stmt_info);
2435
2436 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (first_stmt_info);
2437 vect_compute_data_ref_alignment (dr_info);
2438 /* For creating the data-ref pointer we need alignment of the
2439 first element anyway. */
2440 if (dr_info != first_dr_info)
2441 vect_compute_data_ref_alignment (first_dr_info);
2442 if (! verify_data_ref_alignment (dr_info))
2443 {
2444 if (dump_enabled_p ())
2445 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2446 "not vectorized: bad data alignment in basic "
2447 "block.\n");
2448 return false;
2449 }
2450
2451 return true;
2452 }
2453
2454 /* Function vect_slp_analyze_instance_alignment
2455
2456 Analyze the alignment of the data-references in the SLP instance.
2457 Return FALSE if a data reference is found that cannot be vectorized. */
2458
2459 bool
2460 vect_slp_analyze_and_verify_instance_alignment (slp_instance instance)
2461 {
2462 DUMP_VECT_SCOPE ("vect_slp_analyze_and_verify_instance_alignment");
2463
2464 slp_tree node;
2465 unsigned i;
2466 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
2467 if (! vect_slp_analyze_and_verify_node_alignment (node))
2468 return false;
2469
2470 node = SLP_INSTANCE_TREE (instance);
2471 if (STMT_VINFO_DATA_REF (SLP_TREE_SCALAR_STMTS (node)[0])
2472 && ! vect_slp_analyze_and_verify_node_alignment
2473 (SLP_INSTANCE_TREE (instance)))
2474 return false;
2475
2476 return true;
2477 }
2478
2479
2480 /* Analyze groups of accesses: check that DR_INFO belongs to a group of
2481 accesses of legal size, step, etc. Detect gaps, single element
2482 interleaving, and other special cases. Set grouped access info.
2483 Collect groups of strided stores for further use in SLP analysis.
2484 Worker for vect_analyze_group_access. */
2485
2486 static bool
2487 vect_analyze_group_access_1 (dr_vec_info *dr_info)
2488 {
2489 data_reference *dr = dr_info->dr;
2490 tree step = DR_STEP (dr);
2491 tree scalar_type = TREE_TYPE (DR_REF (dr));
2492 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2493 stmt_vec_info stmt_info = dr_info->stmt;
2494 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2495 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2496 HOST_WIDE_INT dr_step = -1;
2497 HOST_WIDE_INT groupsize, last_accessed_element = 1;
2498 bool slp_impossible = false;
2499
2500 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2501 size of the interleaving group (including gaps). */
2502 if (tree_fits_shwi_p (step))
2503 {
2504 dr_step = tree_to_shwi (step);
2505 /* Check that STEP is a multiple of type size. Otherwise there is
2506 a non-element-sized gap at the end of the group which we
2507 cannot represent in DR_GROUP_GAP or DR_GROUP_SIZE.
2508 ??? As we can handle non-constant step fine here we should
2509 simply remove uses of DR_GROUP_GAP between the last and first
2510 element and instead rely on DR_STEP. DR_GROUP_SIZE then would
2511 simply not include that gap. */
2512 if ((dr_step % type_size) != 0)
2513 {
2514 if (dump_enabled_p ())
2515 dump_printf_loc (MSG_NOTE, vect_location,
2516 "Step %T is not a multiple of the element size"
2517 " for %T\n",
2518 step, DR_REF (dr));
2519 return false;
2520 }
2521 groupsize = absu_hwi (dr_step) / type_size;
2522 }
2523 else
2524 groupsize = 0;
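/* Illustrative example (arbitrary sizes): for 4-byte elements and a
   constant step of 32 bytes, GROUPSIZE is 8, i.e. each loop iteration
   strides over eight elements (group members plus any gaps).  */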
2525
2526 /* A non-consecutive access is possible only if it is part of an interleaving group. */
2527 if (!DR_GROUP_FIRST_ELEMENT (stmt_info))
2528 {
2529 /* Check if this DR is a part of an interleaving group, and is a single
2530 element of the group that is accessed in the loop. */
2531
2532 /* Gaps are supported only for loads. STEP must be a multiple of the type
2533 size. */
2534 if (DR_IS_READ (dr)
2535 && (dr_step % type_size) == 0
2536 && groupsize > 0)
2537 {
2538 DR_GROUP_FIRST_ELEMENT (stmt_info) = stmt_info;
2539 DR_GROUP_SIZE (stmt_info) = groupsize;
2540 DR_GROUP_GAP (stmt_info) = groupsize - 1;
2541 if (dump_enabled_p ())
2542 dump_printf_loc (MSG_NOTE, vect_location,
2543 "Detected single element interleaving %T"
2544 " step %T\n",
2545 DR_REF (dr), step);
2546
2547 return true;
2548 }
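/* Illustrative example (arbitrary access pattern): a load of a[4*i]
   with 4-byte elements has a step of 16 bytes, so GROUPSIZE is 4 and
   DR_GROUP_GAP is 3: only one of every four consecutive elements is
   actually accessed.  */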
2549
2550 if (dump_enabled_p ())
2551 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2552 "not consecutive access %G", stmt_info->stmt);
2553
2554 if (bb_vinfo)
2555 {
2556 /* Mark the statement as unvectorizable. */
2557 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
2558 return true;
2559 }
2560
2561 if (dump_enabled_p ())
2562 dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
2563 STMT_VINFO_STRIDED_P (stmt_info) = true;
2564 return true;
2565 }
2566
2567 if (DR_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info)
2568 {
2569 /* First stmt in the interleaving chain. Check the chain. */
2570 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2571 struct data_reference *data_ref = dr;
2572 unsigned int count = 1;
2573 tree prev_init = DR_INIT (data_ref);
2574 HOST_WIDE_INT diff, gaps = 0;
2575
2576 /* By construction, all group members have INTEGER_CST DR_INITs. */
2577 while (next)
2578 {
2579 /* We never have the same DR multiple times. */
2580 gcc_assert (tree_int_cst_compare (DR_INIT (data_ref),
2581 DR_INIT (STMT_VINFO_DATA_REF (next))) != 0);
2582
2583 data_ref = STMT_VINFO_DATA_REF (next);
2584
2585 /* All group members have the same STEP by construction. */
2586 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2587
2588 /* Check that the distance between two accesses is equal to the type
2589 size. Otherwise, we have gaps. */
2590 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2591 - TREE_INT_CST_LOW (prev_init)) / type_size;
2592 if (diff != 1)
2593 {
2594 /* FORNOW: SLP of accesses with gaps is not supported. */
2595 slp_impossible = true;
2596 if (DR_IS_WRITE (data_ref))
2597 {
2598 if (dump_enabled_p ())
2599 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2600 "interleaved store with gaps\n");
2601 return false;
2602 }
2603
2604 gaps += diff - 1;
2605 }
2606
2607 last_accessed_element += diff;
2608
2609 /* Store the gap from the previous member of the group. If there is no
2610 gap in the access, DR_GROUP_GAP is always 1. */
2611 DR_GROUP_GAP (next) = diff;
2612
2613 prev_init = DR_INIT (data_ref);
2614 next = DR_GROUP_NEXT_ELEMENT (next);
2615 /* Count the number of data-refs in the chain. */
2616 count++;
2617 }
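/* Illustrative example (arbitrary inits): three 4-byte loads in a
   chain with DR_INITs 0, 4 and 12 give DIFFs of 1 and 2, so COUNT is
   3 and GAPS is 1; with a non-constant step GROUPSIZE becomes
   COUNT + GAPS == 4 below, and the member with DR_INIT 12 records
   DR_GROUP_GAP == 2.  */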
2618
2619 if (groupsize == 0)
2620 groupsize = count + gaps;
2621
2622 /* This could be UINT_MAX but as we are generating code in a very
2623 inefficient way we have to cap earlier. See PR78699 for example. */
2624 if (groupsize > 4096)
2625 {
2626 if (dump_enabled_p ())
2627 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2628 "group is too large\n");
2629 return false;
2630 }
2631
2632 /* Check that the size of the interleaving is equal to count for stores,
2633 i.e., that there are no gaps. */
2634 if (groupsize != count
2635 && !DR_IS_READ (dr))
2636 {
2637 groupsize = count;
2638 STMT_VINFO_STRIDED_P (stmt_info) = true;
2639 }
2640
2641 /* If there is a gap after the last load in the group it is the
2642 difference between the groupsize and the last accessed
2643 element.
2644 When there is no gap, this difference should be 0. */
2645 DR_GROUP_GAP (stmt_info) = groupsize - last_accessed_element;
2646
2647 DR_GROUP_SIZE (stmt_info) = groupsize;
2648 if (dump_enabled_p ())
2649 {
2650 dump_printf_loc (MSG_NOTE, vect_location,
2651 "Detected interleaving ");
2652 if (DR_IS_READ (dr))
2653 dump_printf (MSG_NOTE, "load ");
2654 else if (STMT_VINFO_STRIDED_P (stmt_info))
2655 dump_printf (MSG_NOTE, "strided store ");
2656 else
2657 dump_printf (MSG_NOTE, "store ");
2658 dump_printf (MSG_NOTE, "of size %u\n",
2659 (unsigned)groupsize);
2660 dump_printf_loc (MSG_NOTE, vect_location, "\t%G", stmt_info->stmt);
2661 next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2662 while (next)
2663 {
2664 if (DR_GROUP_GAP (next) != 1)
2665 dump_printf_loc (MSG_NOTE, vect_location,
2666 "\t<gap of %d elements>\n",
2667 DR_GROUP_GAP (next) - 1);
2668 dump_printf_loc (MSG_NOTE, vect_location, "\t%G", next->stmt);
2669 next = DR_GROUP_NEXT_ELEMENT (next);
2670 }
2671 if (DR_GROUP_GAP (stmt_info) != 0)
2672 dump_printf_loc (MSG_NOTE, vect_location,
2673 "\t<gap of %d elements>\n",
2674 DR_GROUP_GAP (stmt_info));
2675 }
2676
2677 /* SLP: create an SLP data structure for every interleaving group of
2678 stores for further analysis in vect_analyze_slp. */
2679 if (DR_IS_WRITE (dr) && !slp_impossible)
2680 {
2681 if (loop_vinfo)
2682 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt_info);
2683 if (bb_vinfo)
2684 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt_info);
2685 }
2686 }
2687
2688 return true;
2689 }
2690
2691 /* Analyze groups of accesses: check that DR_INFO belongs to a group of
2692 accesses of legal size, step, etc. Detect gaps, single element
2693 interleaving, and other special cases. Set grouped access info.
2694 Collect groups of strided stores for further use in SLP analysis. */
2695
2696 static bool
2697 vect_analyze_group_access (dr_vec_info *dr_info)
2698 {
2699 if (!vect_analyze_group_access_1 (dr_info))
2700 {
2701 /* Dissolve the group if present. */
2702 stmt_vec_info stmt_info = DR_GROUP_FIRST_ELEMENT (dr_info->stmt);
2703 while (stmt_info)
2704 {
2705 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2706 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2707 DR_GROUP_NEXT_ELEMENT (stmt_info) = NULL;
2708 stmt_info = next;
2709 }
2710 return false;
2711 }
2712 return true;
2713 }
2714
2715 /* Analyze the access pattern of the data-reference DR_INFO.
2716 In case of non-consecutive accesses call vect_analyze_group_access() to
2717 analyze groups of accesses. */
2718
2719 static bool
2720 vect_analyze_data_ref_access (dr_vec_info *dr_info)
2721 {
2722 data_reference *dr = dr_info->dr;
2723 tree step = DR_STEP (dr);
2724 tree scalar_type = TREE_TYPE (DR_REF (dr));
2725 stmt_vec_info stmt_info = dr_info->stmt;
2726 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2727 class loop *loop = NULL;
2728
2729 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
2730 return true;
2731
2732 if (loop_vinfo)
2733 loop = LOOP_VINFO_LOOP (loop_vinfo);
2734
2735 if (loop_vinfo && !step)
2736 {
2737 if (dump_enabled_p ())
2738 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2739 "bad data-ref access in loop\n");
2740 return false;
2741 }
2742
2743 /* Allow loads with zero step in inner-loop vectorization. */
2744 if (loop_vinfo && integer_zerop (step))
2745 {
2746 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2747 if (!nested_in_vect_loop_p (loop, stmt_info))
2748 return DR_IS_READ (dr);
2749 /* Allow references with zero step for outer loops marked
2750 with pragma omp simd only - it guarantees absence of
2751 loop-carried dependencies between inner loop iterations. */
2752 if (loop->safelen < 2)
2753 {
2754 if (dump_enabled_p ())
2755 dump_printf_loc (MSG_NOTE, vect_location,
2756 "zero step in inner loop of nest\n");
2757 return false;
2758 }
2759 }
2760
2761 if (loop && nested_in_vect_loop_p (loop, stmt_info))
2762 {
2763 /* Interleaved accesses are not yet supported within outer-loop
2764 vectorization for references in the inner-loop. */
2765 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2766
2767 /* For the rest of the analysis we use the outer-loop step. */
2768 step = STMT_VINFO_DR_STEP (stmt_info);
2769 if (integer_zerop (step))
2770 {
2771 if (dump_enabled_p ())
2772 dump_printf_loc (MSG_NOTE, vect_location,
2773 "zero step in outer loop.\n");
2774 return DR_IS_READ (dr);
2775 }
2776 }
2777
2778 /* Consecutive? */
2779 if (TREE_CODE (step) == INTEGER_CST)
2780 {
2781 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2782 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2783 || (dr_step < 0
2784 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2785 {
2786 /* Mark that it is not interleaving. */
2787 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2788 return true;
2789 }
2790 }
2791
2792 if (loop && nested_in_vect_loop_p (loop, stmt_info))
2793 {
2794 if (dump_enabled_p ())
2795 dump_printf_loc (MSG_NOTE, vect_location,
2796 "grouped access in outer loop.\n");
2797 return false;
2798 }
2799
2800
2801 /* Assume this is a DR handled by the non-constant strided load case. */
2802 if (TREE_CODE (step) != INTEGER_CST)
2803 return (STMT_VINFO_STRIDED_P (stmt_info)
2804 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2805 || vect_analyze_group_access (dr_info)));
2806
2807 /* Not consecutive access - check if it's a part of interleaving group. */
2808 return vect_analyze_group_access (dr_info);
2809 }
2810
2811 /* Compare two data-references DRA and DRB in order to sort them into
2812 chunks suitable for grouping. */
2813
2814 static int
2815 dr_group_sort_cmp (const void *dra_, const void *drb_)
2816 {
2817 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2818 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2819 int cmp;
2820
2821 /* Stabilize sort. */
2822 if (dra == drb)
2823 return 0;
2824
2825 /* DRs in different loops never belong to the same group. */
2826 loop_p loopa = gimple_bb (DR_STMT (dra))->loop_father;
2827 loop_p loopb = gimple_bb (DR_STMT (drb))->loop_father;
2828 if (loopa != loopb)
2829 return loopa->num < loopb->num ? -1 : 1;
2830
2831 /* Ordering of DRs according to base. */
2832 cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2833 DR_BASE_ADDRESS (drb));
2834 if (cmp != 0)
2835 return cmp;
2836
2837 /* And according to DR_OFFSET. */
2838 cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2839 if (cmp != 0)
2840 return cmp;
2841
2842 /* Put reads before writes. */
2843 if (DR_IS_READ (dra) != DR_IS_READ (drb))
2844 return DR_IS_READ (dra) ? -1 : 1;
2845
2846 /* Then sort by access size. */
2847 cmp = data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2848 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2849 if (cmp != 0)
2850 return cmp;
2851
2852 /* And by step. */
2853 cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
2854 if (cmp != 0)
2855 return cmp;
2856
2857 /* Then sort by DR_INIT. In case of identical DRs sort by stmt UID. */
2858 cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
2859 if (cmp == 0)
2860 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2861 return cmp;
2862 }
2863
2864 /* If OP is the result of a conversion, return the unconverted value,
2865 otherwise return null. */
2866
2867 static tree
2868 strip_conversion (tree op)
2869 {
2870 if (TREE_CODE (op) != SSA_NAME)
2871 return NULL_TREE;
2872 gimple *stmt = SSA_NAME_DEF_STMT (op);
2873 if (!is_gimple_assign (stmt)
2874 || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (stmt)))
2875 return NULL_TREE;
2876 return gimple_assign_rhs1 (stmt);
2877 }
2878
2879 /* Return true if vectorizable_* routines can handle statements STMT1_INFO
2880 and STMT2_INFO being in a single group. When ALLOW_SLP_P, masked loads can
2881 be grouped in SLP mode. */
2882
2883 static bool
2884 can_group_stmts_p (stmt_vec_info stmt1_info, stmt_vec_info stmt2_info,
2885 bool allow_slp_p)
2886 {
2887 if (gimple_assign_single_p (stmt1_info->stmt))
2888 return gimple_assign_single_p (stmt2_info->stmt);
2889
2890 gcall *call1 = dyn_cast <gcall *> (stmt1_info->stmt);
2891 if (call1 && gimple_call_internal_p (call1))
2892 {
2893 /* Check for two masked loads or two masked stores. */
2894 gcall *call2 = dyn_cast <gcall *> (stmt2_info->stmt);
2895 if (!call2 || !gimple_call_internal_p (call2))
2896 return false;
2897 internal_fn ifn = gimple_call_internal_fn (call1);
2898 if (ifn != IFN_MASK_LOAD && ifn != IFN_MASK_STORE)
2899 return false;
2900 if (ifn != gimple_call_internal_fn (call2))
2901 return false;
2902
2903 /* Check that the masks are the same. Cope with casts of masks,
2904 like those created by build_mask_conversion. */
2905 tree mask1 = gimple_call_arg (call1, 2);
2906 tree mask2 = gimple_call_arg (call2, 2);
2907 if (!operand_equal_p (mask1, mask2, 0)
2908 && (ifn == IFN_MASK_STORE || !allow_slp_p))
2909 {
2910 mask1 = strip_conversion (mask1);
2911 if (!mask1)
2912 return false;
2913 mask2 = strip_conversion (mask2);
2914 if (!mask2)
2915 return false;
2916 if (!operand_equal_p (mask1, mask2, 0))
2917 return false;
2918 }
2919 return true;
2920 }
2921
2922 return false;
2923 }
2924
2925 /* Function vect_analyze_data_ref_accesses.
2926
2927 Analyze the access pattern of all the data references in the loop.
2928
2929 FORNOW: the only access pattern that is considered vectorizable is a
2930 simple step 1 (consecutive) access.
2931
2932 FORNOW: handle only arrays and pointer accesses. */
2933
2934 opt_result
2935 vect_analyze_data_ref_accesses (vec_info *vinfo)
2936 {
2937 unsigned int i;
2938 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
2939 struct data_reference *dr;
2940
2941 DUMP_VECT_SCOPE ("vect_analyze_data_ref_accesses");
2942
2943 if (datarefs.is_empty ())
2944 return opt_result::success ();
2945
2946 /* Sort the array of datarefs to make building the interleaving chains
2947 linear. Don't modify the original vector's order; it is needed for
2948 determining which dependencies are reversed. */
2949 vec<data_reference_p> datarefs_copy = datarefs.copy ();
2950 datarefs_copy.qsort (dr_group_sort_cmp);
2951 hash_set<stmt_vec_info> to_fixup;
2952
2953 /* Build the interleaving chains. */
2954 for (i = 0; i < datarefs_copy.length () - 1;)
2955 {
2956 data_reference_p dra = datarefs_copy[i];
2957 dr_vec_info *dr_info_a = vinfo->lookup_dr (dra);
2958 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
2959 stmt_vec_info lastinfo = NULL;
2960 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
2961 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a))
2962 {
2963 ++i;
2964 continue;
2965 }
2966 for (i = i + 1; i < datarefs_copy.length (); ++i)
2967 {
2968 data_reference_p drb = datarefs_copy[i];
2969 dr_vec_info *dr_info_b = vinfo->lookup_dr (drb);
2970 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
2971 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_b)
2972 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
2973 break;
2974
2975 /* ??? Imperfect sorting (non-compatible types, non-modulo
2976 accesses, same accesses) can lead to a group being artificially
2977 split here, as we don't just skip over those. If it really
2978 matters we can push those to a worklist and re-iterate
2979 over them. Then we can just skip ahead to the next DR here. */
2980
2981 /* DRs in a different loop should not be put into the same
2982 interleaving group. */
2983 if (gimple_bb (DR_STMT (dra))->loop_father
2984 != gimple_bb (DR_STMT (drb))->loop_father)
2985 break;
2986
2987 /* Check that the data-refs have same first location (except init)
2988 and they are both either store or load (not load and store,
2989 not masked loads or stores). */
2990 if (DR_IS_READ (dra) != DR_IS_READ (drb)
2991 || data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2992 DR_BASE_ADDRESS (drb)) != 0
2993 || data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb)) != 0
2994 || !can_group_stmts_p (stmtinfo_a, stmtinfo_b, true))
2995 break;
2996
2997 /* Check that the data-refs have the same constant size. */
2998 tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
2999 tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
3000 if (!tree_fits_uhwi_p (sza)
3001 || !tree_fits_uhwi_p (szb)
3002 || !tree_int_cst_equal (sza, szb))
3003 break;
3004
3005 /* Check that the data-refs have the same step. */
3006 if (data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb)) != 0)
3007 break;
3008
3009 /* Check the types are compatible.
3010 ??? We don't distinguish this during sorting. */
3011 if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
3012 TREE_TYPE (DR_REF (drb))))
3013 break;
3014
3015 /* Check that the DR_INITs are compile-time constants. */
3016 if (TREE_CODE (DR_INIT (dra)) != INTEGER_CST
3017 || TREE_CODE (DR_INIT (drb)) != INTEGER_CST)
3018 break;
3019
3020 /* Different .GOMP_SIMD_LANE calls still give the same lane,
3021 just hold extra information. */
3022 if (STMT_VINFO_SIMD_LANE_ACCESS_P (stmtinfo_a)
3023 && STMT_VINFO_SIMD_LANE_ACCESS_P (stmtinfo_b)
3024 && data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb)) == 0)
3025 break;
3026
3027 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
3028 HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
3029 HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
3030 HOST_WIDE_INT init_prev
3031 = TREE_INT_CST_LOW (DR_INIT (datarefs_copy[i-1]));
3032 gcc_assert (init_a <= init_b
3033 && init_a <= init_prev
3034 && init_prev <= init_b);
3035
3036 /* Do not place the same access in the interleaving chain twice. */
3037 if (init_b == init_prev)
3038 {
3039 gcc_assert (gimple_uid (DR_STMT (datarefs_copy[i-1]))
3040 < gimple_uid (DR_STMT (drb)));
3041 /* Simply link in duplicates and fix up the chain below. */
3042 }
3043 else
3044 {
3045 /* If init_b == init_a + the size of the type * k, we have an
3046 interleaving, and DRA is accessed before DRB. */
3047 HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
3048 if (type_size_a == 0
3049 || (init_b - init_a) % type_size_a != 0)
3050 break;
3051
3052 /* For stores, require that the accesses be adjacent. This splits
3053 groups into chunks we support (we don't support vectorization
3054 of stores with gaps). */
3055 if (!DR_IS_READ (dra) && init_b - init_prev != type_size_a)
3056 break;
3057
3058 /* If the step (if not zero or non-constant) is greater than the
3059 difference between data-refs' inits this splits groups into
3060 suitable sizes. */
3061 if (tree_fits_shwi_p (DR_STEP (dra)))
3062 {
3063 HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra));
3064 if (step != 0 && step <= (init_b - init_a))
3065 break;
3066 }
3067 }
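/* Illustrative example (arbitrary inits): two 4-byte loads with
   DR_INITs 0 and 8 and a common constant step of 16 pass the checks
   above (8 - 0 is a multiple of 4 and smaller than the step), so
   they are linked into one interleaving group below; had the step
   been 8, the chain would have been split here.  */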
3068
3069 if (dump_enabled_p ())
3070 dump_printf_loc (MSG_NOTE, vect_location,
3071 DR_IS_READ (dra)
3072 ? "Detected interleaving load %T and %T\n"
3073 : "Detected interleaving store %T and %T\n",
3074 DR_REF (dra), DR_REF (drb));
3075
3076 /* Link the found element into the group list. */
3077 if (!DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
3078 {
3079 DR_GROUP_FIRST_ELEMENT (stmtinfo_a) = stmtinfo_a;
3080 lastinfo = stmtinfo_a;
3081 }
3082 DR_GROUP_FIRST_ELEMENT (stmtinfo_b) = stmtinfo_a;
3083 DR_GROUP_NEXT_ELEMENT (lastinfo) = stmtinfo_b;
3084 lastinfo = stmtinfo_b;
3085
3086 STMT_VINFO_SLP_VECT_ONLY (stmtinfo_a)
3087 = !can_group_stmts_p (stmtinfo_a, stmtinfo_b, false);
3088
3089 if (dump_enabled_p () && STMT_VINFO_SLP_VECT_ONLY (stmtinfo_a))
3090 dump_printf_loc (MSG_NOTE, vect_location,
3091 "Load suitable for SLP vectorization only.\n");
3092
3093 if (init_b == init_prev
3094 && !to_fixup.add (DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
3095 && dump_enabled_p ())
3096 dump_printf_loc (MSG_NOTE, vect_location,
3097 "Queuing group with duplicate access for fixup\n");
3098 }
3099 }
3100
3101 /* Fix up groups with duplicate entries by splitting them. */
3102 while (1)
3103 {
3104 hash_set<stmt_vec_info>::iterator it = to_fixup.begin ();
3105 if (!(it != to_fixup.end ()))
3106 break;
3107 stmt_vec_info grp = *it;
3108 to_fixup.remove (grp);
3109
3110 /* Find the earliest duplicate group member. */
3111 unsigned first_duplicate = -1u;
3112 stmt_vec_info next, g = grp;
3113 while ((next = DR_GROUP_NEXT_ELEMENT (g)))
3114 {
3115 if (tree_int_cst_equal (DR_INIT (STMT_VINFO_DR_INFO (next)->dr),
3116 DR_INIT (STMT_VINFO_DR_INFO (g)->dr))
3117 && gimple_uid (STMT_VINFO_STMT (next)) < first_duplicate)
3118 first_duplicate = gimple_uid (STMT_VINFO_STMT (next));
3119 g = next;
3120 }
3121 if (first_duplicate == -1U)
3122 continue;
3123
3124 /* Then move all stmts after the first duplicate to a new group.
3125 Note this is a heuristic, but one with the property that the group
3126 GRP taken from the worklist is fixed up completely. */
3127 g = grp;
3128 stmt_vec_info newgroup = NULL, ng = grp;
3129 while ((next = DR_GROUP_NEXT_ELEMENT (g)))
3130 {
3131 if (gimple_uid (STMT_VINFO_STMT (next)) >= first_duplicate)
3132 {
3133 DR_GROUP_NEXT_ELEMENT (g) = DR_GROUP_NEXT_ELEMENT (next);
3134 if (!newgroup)
3135 newgroup = next;
3136 else
3137 DR_GROUP_NEXT_ELEMENT (ng) = next;
3138 ng = next;
3139 DR_GROUP_FIRST_ELEMENT (ng) = newgroup;
3140 }
3141 else
3142 g = DR_GROUP_NEXT_ELEMENT (g);
3143 }
3144 DR_GROUP_NEXT_ELEMENT (ng) = NULL;
3145
3146 /* Fixup the new group which still may contain duplicates. */
3147 to_fixup.add (newgroup);
3148 }
3149
3150 FOR_EACH_VEC_ELT (datarefs_copy, i, dr)
3151 {
3152 dr_vec_info *dr_info = vinfo->lookup_dr (dr);
3153 if (STMT_VINFO_VECTORIZABLE (dr_info->stmt)
3154 && !vect_analyze_data_ref_access (dr_info))
3155 {
3156 if (dump_enabled_p ())
3157 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3158 "not vectorized: complicated access pattern.\n");
3159
3160 if (is_a <bb_vec_info> (vinfo))
3161 {
3162 /* Mark the statement as not vectorizable. */
3163 STMT_VINFO_VECTORIZABLE (dr_info->stmt) = false;
3164 continue;
3165 }
3166 else
3167 {
3168 datarefs_copy.release ();
3169 return opt_result::failure_at (dr_info->stmt->stmt,
3170 "not vectorized:"
3171 " complicated access pattern.\n");
3172 }
3173 }
3174 }
3175
3176 datarefs_copy.release ();
3177 return opt_result::success ();
3178 }
3179
3180 /* Function vect_vfa_segment_size.
3181
3182 Input:
3183 DR_INFO: The data reference.
3184 LENGTH_FACTOR: segment length to consider.
3185
3186 Return a value suitable for the dr_with_seg_len::seg_len field.
3187 This is the "distance travelled" by the pointer from the first
3188 iteration in the segment to the last. Note that it does not include
3189 the size of the access; in effect it only describes the first byte. */
3190
3191 static tree
3192 vect_vfa_segment_size (dr_vec_info *dr_info, tree length_factor)
3193 {
3194 length_factor = size_binop (MINUS_EXPR,
3195 fold_convert (sizetype, length_factor),
3196 size_one_node);
3197 return size_binop (MULT_EXPR, fold_convert (sizetype, DR_STEP (dr_info->dr)),
3198 length_factor);
3199 }
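/* Illustrative example (arbitrary sizes): for a unit-stride access to
   4-byte elements and LENGTH_FACTOR == 8, the segment length is
   4 * (8 - 1) == 28 bytes, the pointer distance from the first to the
   last scalar iteration of the segment, excluding the access size
   itself.  */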
3200
3201 /* Return a value that, when added to abs (vect_vfa_segment_size (DR_INFO)),
3202 gives the worst-case number of bytes covered by the segment. */
3203
3204 static unsigned HOST_WIDE_INT
3205 vect_vfa_access_size (dr_vec_info *dr_info)
3206 {
3207 stmt_vec_info stmt_vinfo = dr_info->stmt;
3208 tree ref_type = TREE_TYPE (DR_REF (dr_info->dr));
3209 unsigned HOST_WIDE_INT ref_size = tree_to_uhwi (TYPE_SIZE_UNIT (ref_type));
3210 unsigned HOST_WIDE_INT access_size = ref_size;
3211 if (DR_GROUP_FIRST_ELEMENT (stmt_vinfo))
3212 {
3213 gcc_assert (DR_GROUP_FIRST_ELEMENT (stmt_vinfo) == stmt_vinfo);
3214 access_size *= DR_GROUP_SIZE (stmt_vinfo) - DR_GROUP_GAP (stmt_vinfo);
3215 }
3216 if (STMT_VINFO_VEC_STMT (stmt_vinfo)
3217 && (vect_supportable_dr_alignment (dr_info, false)
3218 == dr_explicit_realign_optimized))
3219 {
3220 /* We might access a full vector's worth. */
3221 tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
3222 access_size += tree_to_uhwi (TYPE_SIZE_UNIT (vectype)) - ref_size;
3223 }
3224 return access_size;
3225 }
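/* Illustrative example (arbitrary group shape): for the leading
   4-byte element of a group of size 4 with a gap of 1, the worst-case
   access size is 4 * (4 - 1) == 12 bytes, plus a further
   vector-size - 4 bytes if an optimized explicit realignment scheme
   may read a whole vector.  */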
3226
3227 /* Get the minimum alignment for all the scalar accesses that DR_INFO
3228 describes. */
3229
3230 static unsigned int
3231 vect_vfa_align (dr_vec_info *dr_info)
3232 {
3233 return TYPE_ALIGN_UNIT (TREE_TYPE (DR_REF (dr_info->dr)));
3234 }
3235
3236 /* Function vect_no_alias_p.
3237
3238 Given data references A and B with equal base and offset, see whether
3239 the alias relation can be decided at compilation time. Return 1 if
3240 it can and the references alias, 0 if it can and the references do
3241 not alias, and -1 if we cannot decide at compile time. SEGMENT_LENGTH_A,
3242 SEGMENT_LENGTH_B, ACCESS_SIZE_A and ACCESS_SIZE_B are the equivalent
3243 of dr_with_seg_len::{seg_len,access_size} for A and B. */
3244
3245 static int
3246 vect_compile_time_alias (dr_vec_info *a, dr_vec_info *b,
3247 tree segment_length_a, tree segment_length_b,
3248 unsigned HOST_WIDE_INT access_size_a,
3249 unsigned HOST_WIDE_INT access_size_b)
3250 {
3251 poly_offset_int offset_a = wi::to_poly_offset (DR_INIT (a->dr));
3252 poly_offset_int offset_b = wi::to_poly_offset (DR_INIT (b->dr));
3253 poly_uint64 const_length_a;
3254 poly_uint64 const_length_b;
3255
3256 /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
3257 bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
3258 [a, a+12) */
3259 if (tree_int_cst_compare (DR_STEP (a->dr), size_zero_node) < 0)
3260 {
3261 const_length_a = (-wi::to_poly_wide (segment_length_a)).force_uhwi ();
3262 offset_a = (offset_a + access_size_a) - const_length_a;
3263 }
3264 else
3265 const_length_a = tree_to_poly_uint64 (segment_length_a);
3266 if (tree_int_cst_compare (DR_STEP (b->dr), size_zero_node) < 0)
3267 {
3268 const_length_b = (-wi::to_poly_wide (segment_length_b)).force_uhwi ();
3269 offset_b = (offset_b + access_size_b) - const_length_b;
3270 }
3271 else
3272 const_length_b = tree_to_poly_uint64 (segment_length_b);
3273
3274 const_length_a += access_size_a;
3275 const_length_b += access_size_b;
3276
3277 if (ranges_known_overlap_p (offset_a, const_length_a,
3278 offset_b, const_length_b))
3279 return 1;
3280
3281 if (!ranges_maybe_overlap_p (offset_a, const_length_a,
3282 offset_b, const_length_b))
3283 return 0;
3284
3285 return -1;
3286 }
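/* Illustrative example (arbitrary offsets): with DR_INITs of 0 and 64,
   positive steps, segment lengths of 28 bytes and access sizes of 4,
   the covered ranges are [0, 32) and [64, 96); they cannot overlap,
   so the function returns 0 and no runtime alias check is needed for
   this pair.  */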
3287
3288 /* Return true if the minimum nonzero dependence distance for loop LOOP_DEPTH
3289 in DDR is >= VF. */
3290
3291 static bool
3292 dependence_distance_ge_vf (data_dependence_relation *ddr,
3293 unsigned int loop_depth, poly_uint64 vf)
3294 {
3295 if (DDR_ARE_DEPENDENT (ddr) != NULL_TREE
3296 || DDR_NUM_DIST_VECTS (ddr) == 0)
3297 return false;
3298
3299 /* If the dependence is exact, we should have limited the VF instead. */
3300 gcc_checking_assert (DDR_COULD_BE_INDEPENDENT_P (ddr));
3301
3302 unsigned int i;
3303 lambda_vector dist_v;
3304 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
3305 {
3306 HOST_WIDE_INT dist = dist_v[loop_depth];
3307 if (dist != 0
3308 && !(dist > 0 && DDR_REVERSED_P (ddr))
3309 && maybe_lt ((unsigned HOST_WIDE_INT) abs_hwi (dist), vf))
3310 return false;
3311 }
3312
3313 if (dump_enabled_p ())
3314 dump_printf_loc (MSG_NOTE, vect_location,
3315 "dependence distance between %T and %T is >= VF\n",
3316 DR_REF (DDR_A (ddr)), DR_REF (DDR_B (ddr)));
3317
3318 return true;
3319 }
3320
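/* Worked example (editorial, not part of GCC): in the loop below, if A and
   B turn out to be the same array, the write to b[i + 8] and the read of
   a[i] have dependence distance 8 iterations.  With VF <= 8 each vector
   iteration's loads and stores stay disjoint and the producer/consumer
   order is preserved, so the check above can drop this DDR from the
   runtime alias tests; VF = 16 would not satisfy dist >= VF.  */

void
example_dependence_distance (int *a, int *b, int n)
{
  for (int i = 0; i < n; i++)
    b[i + 8] = a[i] + 1;
}
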
3321 /* Dump LOWER_BOUND using flags DUMP_KIND. Dumps are known to be enabled. */
3322
3323 static void
3324 dump_lower_bound (dump_flags_t dump_kind, const vec_lower_bound &lower_bound)
3325 {
3326 dump_printf (dump_kind, "%s (%T) >= ",
3327 lower_bound.unsigned_p ? "unsigned" : "abs",
3328 lower_bound.expr);
3329 dump_dec (dump_kind, lower_bound.min_value);
3330 }
3331
3332 /* Record that the vectorized loop requires the vec_lower_bound described
3333 by EXPR, UNSIGNED_P and MIN_VALUE. */
3334
3335 static void
3336 vect_check_lower_bound (loop_vec_info loop_vinfo, tree expr, bool unsigned_p,
3337 poly_uint64 min_value)
3338 {
3339 vec<vec_lower_bound> lower_bounds = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
3340 for (unsigned int i = 0; i < lower_bounds.length (); ++i)
3341 if (operand_equal_p (lower_bounds[i].expr, expr, 0))
3342 {
3343 unsigned_p &= lower_bounds[i].unsigned_p;
3344 min_value = upper_bound (lower_bounds[i].min_value, min_value);
3345 if (lower_bounds[i].unsigned_p != unsigned_p
3346 || maybe_lt (lower_bounds[i].min_value, min_value))
3347 {
3348 lower_bounds[i].unsigned_p = unsigned_p;
3349 lower_bounds[i].min_value = min_value;
3350 if (dump_enabled_p ())
3351 {
3352 dump_printf_loc (MSG_NOTE, vect_location,
3353 "updating run-time check to ");
3354 dump_lower_bound (MSG_NOTE, lower_bounds[i]);
3355 dump_printf (MSG_NOTE, "\n");
3356 }
3357 }
3358 return;
3359 }
3360
3361 vec_lower_bound lower_bound (expr, unsigned_p, min_value);
3362 if (dump_enabled_p ())
3363 {
3364 dump_printf_loc (MSG_NOTE, vect_location, "need a run-time check that ");
3365 dump_lower_bound (MSG_NOTE, lower_bound);
3366 dump_printf (MSG_NOTE, "\n");
3367 }
3368 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).safe_push (lower_bound);
3369 }
3370
3371 /* Return true if it's unlikely that the step of the vectorized form of DR_INFO
3372 will span fewer than GAP bytes. */
3373
3374 static bool
3375 vect_small_gap_p (loop_vec_info loop_vinfo, dr_vec_info *dr_info,
3376 poly_int64 gap)
3377 {
3378 stmt_vec_info stmt_info = dr_info->stmt;
3379 HOST_WIDE_INT count
3380 = estimated_poly_value (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
3381 if (DR_GROUP_FIRST_ELEMENT (stmt_info))
3382 count *= DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (stmt_info));
3383 return (estimated_poly_value (gap)
3384 <= count * vect_get_scalar_dr_size (dr_info));
3385 }
3386
3387 /* Return true if we know that there is no alias between DR_INFO_A and
3388 DR_INFO_B when abs (DR_STEP (DR_INFO_A->dr)) >= N for some N.
3389 When returning true, set *LOWER_BOUND_OUT to this N. */
3390
3391 static bool
3392 vectorizable_with_step_bound_p (dr_vec_info *dr_info_a, dr_vec_info *dr_info_b,
3393 poly_uint64 *lower_bound_out)
3394 {
3395 /* Check that there is a constant gap of known sign between DR_A
3396 and DR_B. */
3397 data_reference *dr_a = dr_info_a->dr;
3398 data_reference *dr_b = dr_info_b->dr;
3399 poly_int64 init_a, init_b;
3400 if (!operand_equal_p (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b), 0)
3401 || !operand_equal_p (DR_OFFSET (dr_a), DR_OFFSET (dr_b), 0)
3402 || !operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0)
3403 || !poly_int_tree_p (DR_INIT (dr_a), &init_a)
3404 || !poly_int_tree_p (DR_INIT (dr_b), &init_b)
3405 || !ordered_p (init_a, init_b))
3406 return false;
3407
3408 /* Sort DR_A and DR_B by the address they access. */
3409 if (maybe_lt (init_b, init_a))
3410 {
3411 std::swap (init_a, init_b);
3412 std::swap (dr_info_a, dr_info_b);
3413 std::swap (dr_a, dr_b);
3414 }
3415
3416 /* If the two accesses could be dependent within a scalar iteration,
3417 make sure that we'd retain their order. */
3418 if (maybe_gt (init_a + vect_get_scalar_dr_size (dr_info_a), init_b)
3419 && !vect_preserves_scalar_order_p (dr_info_a, dr_info_b))
3420 return false;
3421
3422 /* There is no alias if abs (DR_STEP) is greater than or equal to
3423 the bytes spanned by the combination of the two accesses. */
3424 *lower_bound_out = init_b + vect_get_scalar_dr_size (dr_info_b) - init_a;
3425 return true;
3426 }
3427
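/* Editorial sketch (not part of GCC): once the two accesses share base,
   offset and step, the lower bound N computed above is simply the span
   from the lower starting byte to the end of the higher access.  Plain
   integers stand in for the poly_int64 values here.  */

static long
example_step_lower_bound (long init_a, long size_a, long init_b, long size_b)
{
  /* Sort by start address, as the real code does.  */
  if (init_b < init_a)
    {
      long tmp;
      tmp = init_a, init_a = init_b, init_b = tmp;
      tmp = size_a, size_a = size_b, size_b = tmp;
    }
  /* There is no alias once abs (DR_STEP) covers at least this many bytes.  */
  return init_b + size_b - init_a;
}
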
3428 /* Function vect_prune_runtime_alias_test_list.
3429
3430 Prune a list of ddrs to be tested at run-time by versioning for alias.
3431 Merge several alias checks into one if possible.
3432 Return FALSE if the resulting list of ddrs is longer than allowed by
3433 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
3434
3435 opt_result
3436 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
3437 {
3438 typedef pair_hash <tree_operand_hash, tree_operand_hash> tree_pair_hash;
3439 hash_set <tree_pair_hash> compared_objects;
3440
3441 vec<ddr_p> may_alias_ddrs = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
3442 vec<dr_with_seg_len_pair_t> &comp_alias_ddrs
3443 = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3444 vec<vec_object_pair> &check_unequal_addrs
3445 = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3446 poly_uint64 vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3447 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
3448
3449 ddr_p ddr;
3450 unsigned int i;
3451 tree length_factor;
3452
3453 DUMP_VECT_SCOPE ("vect_prune_runtime_alias_test_list");
3454
3455 /* Step values are irrelevant for aliasing if the number of vector
3456 iterations is equal to the number of scalar iterations (which can
3457 happen for fully-SLP loops). */
3458 bool ignore_step_p = known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U);
3459
3460 if (!ignore_step_p)
3461 {
3462 /* Convert the checks for nonzero steps into bound tests. */
3463 tree value;
3464 FOR_EACH_VEC_ELT (LOOP_VINFO_CHECK_NONZERO (loop_vinfo), i, value)
3465 vect_check_lower_bound (loop_vinfo, value, true, 1);
3466 }
3467
3468 if (may_alias_ddrs.is_empty ())
3469 return opt_result::success ();
3470
3471 comp_alias_ddrs.create (may_alias_ddrs.length ());
3472
3473 unsigned int loop_depth
3474 = index_in_loop_nest (LOOP_VINFO_LOOP (loop_vinfo)->num,
3475 LOOP_VINFO_LOOP_NEST (loop_vinfo));
3476
3477 /* First, we collect all data ref pairs for aliasing checks. */
3478 FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
3479 {
3480 poly_uint64 lower_bound;
3481 tree segment_length_a, segment_length_b;
3482 unsigned HOST_WIDE_INT access_size_a, access_size_b;
3483 unsigned int align_a, align_b;
3484
3485 /* Ignore the alias if the VF we chose ended up being no greater
3486 than the dependence distance. */
3487 if (dependence_distance_ge_vf (ddr, loop_depth, vect_factor))
3488 continue;
3489
3490 if (DDR_OBJECT_A (ddr))
3491 {
3492 vec_object_pair new_pair (DDR_OBJECT_A (ddr), DDR_OBJECT_B (ddr));
3493 if (!compared_objects.add (new_pair))
3494 {
3495 if (dump_enabled_p ())
3496 dump_printf_loc (MSG_NOTE, vect_location,
3497 "checking that %T and %T"
3498 " have different addresses\n",
3499 new_pair.first, new_pair.second);
3500 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).safe_push (new_pair);
3501 }
3502 continue;
3503 }
3504
3505 dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (DDR_A (ddr));
3506 stmt_vec_info stmt_info_a = dr_info_a->stmt;
3507
3508 dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (DDR_B (ddr));
3509 stmt_vec_info stmt_info_b = dr_info_b->stmt;
3510
3511 bool preserves_scalar_order_p
3512 = vect_preserves_scalar_order_p (dr_info_a, dr_info_b);
3513
3514 /* Skip the pair if inter-iteration dependencies are irrelevant
3515 and intra-iteration dependencies are guaranteed to be honored. */
3516 if (ignore_step_p
3517 && (preserves_scalar_order_p
3518 || vectorizable_with_step_bound_p (dr_info_a, dr_info_b,
3519 &lower_bound)))
3520 {
3521 if (dump_enabled_p ())
3522 dump_printf_loc (MSG_NOTE, vect_location,
3523 "no need for alias check between "
3524 "%T and %T when VF is 1\n",
3525 DR_REF (dr_info_a->dr), DR_REF (dr_info_b->dr));
3526 continue;
3527 }
3528
3529 /* See whether we can handle the alias using a bounds check on
3530 the step, and whether that's likely to be the best approach.
3531 (It might not be, for example, if the minimum step is much larger
3532 than the number of bytes handled by one vector iteration.) */
3533 if (!ignore_step_p
3534 && TREE_CODE (DR_STEP (dr_info_a->dr)) != INTEGER_CST
3535 && vectorizable_with_step_bound_p (dr_info_a, dr_info_b,
3536 &lower_bound)
3537 && (vect_small_gap_p (loop_vinfo, dr_info_a, lower_bound)
3538 || vect_small_gap_p (loop_vinfo, dr_info_b, lower_bound)))
3539 {
3540 bool unsigned_p = dr_known_forward_stride_p (dr_info_a->dr);
3541 if (dump_enabled_p ())
3542 {
3543 dump_printf_loc (MSG_NOTE, vect_location, "no alias between "
3544 "%T and %T when the step %T is outside ",
3545 DR_REF (dr_info_a->dr),
3546 DR_REF (dr_info_b->dr),
3547 DR_STEP (dr_info_a->dr));
3548 if (unsigned_p)
3549 dump_printf (MSG_NOTE, "[0");
3550 else
3551 {
3552 dump_printf (MSG_NOTE, "(");
3553 dump_dec (MSG_NOTE, poly_int64 (-lower_bound));
3554 }
3555 dump_printf (MSG_NOTE, ", ");
3556 dump_dec (MSG_NOTE, lower_bound);
3557 dump_printf (MSG_NOTE, ")\n");
3558 }
3559 vect_check_lower_bound (loop_vinfo, DR_STEP (dr_info_a->dr),
3560 unsigned_p, lower_bound);
3561 continue;
3562 }
3563
3564 stmt_vec_info dr_group_first_a = DR_GROUP_FIRST_ELEMENT (stmt_info_a);
3565 if (dr_group_first_a)
3566 {
3567 stmt_info_a = dr_group_first_a;
3568 dr_info_a = STMT_VINFO_DR_INFO (stmt_info_a);
3569 }
3570
3571 stmt_vec_info dr_group_first_b = DR_GROUP_FIRST_ELEMENT (stmt_info_b);
3572 if (dr_group_first_b)
3573 {
3574 stmt_info_b = dr_group_first_b;
3575 dr_info_b = STMT_VINFO_DR_INFO (stmt_info_b);
3576 }
3577
3578 if (ignore_step_p)
3579 {
3580 segment_length_a = size_zero_node;
3581 segment_length_b = size_zero_node;
3582 }
3583 else
3584 {
3585 if (!operand_equal_p (DR_STEP (dr_info_a->dr),
3586 DR_STEP (dr_info_b->dr), 0))
3587 length_factor = scalar_loop_iters;
3588 else
3589 length_factor = size_int (vect_factor);
3590 segment_length_a = vect_vfa_segment_size (dr_info_a, length_factor);
3591 segment_length_b = vect_vfa_segment_size (dr_info_b, length_factor);
3592 }
3593 access_size_a = vect_vfa_access_size (dr_info_a);
3594 access_size_b = vect_vfa_access_size (dr_info_b);
3595 align_a = vect_vfa_align (dr_info_a);
3596 align_b = vect_vfa_align (dr_info_b);
3597
3598 /* See whether the alias is known at compilation time. */
3599 if (operand_equal_p (DR_BASE_ADDRESS (dr_info_a->dr),
3600 DR_BASE_ADDRESS (dr_info_b->dr), 0)
3601 && operand_equal_p (DR_OFFSET (dr_info_a->dr),
3602 DR_OFFSET (dr_info_b->dr), 0)
3603 && TREE_CODE (DR_STEP (dr_info_a->dr)) == INTEGER_CST
3604 && TREE_CODE (DR_STEP (dr_info_b->dr)) == INTEGER_CST
3605 && poly_int_tree_p (segment_length_a)
3606 && poly_int_tree_p (segment_length_b))
3607 {
3608 int res = vect_compile_time_alias (dr_info_a, dr_info_b,
3609 segment_length_a,
3610 segment_length_b,
3611 access_size_a,
3612 access_size_b);
3613 if (res >= 0 && dump_enabled_p ())
3614 {
3615 dump_printf_loc (MSG_NOTE, vect_location,
3616 "can tell at compile time that %T and %T",
3617 DR_REF (dr_info_a->dr), DR_REF (dr_info_b->dr));
3618 if (res == 0)
3619 dump_printf (MSG_NOTE, " do not alias\n");
3620 else
3621 dump_printf (MSG_NOTE, " alias\n");
3622 }
3623
3624 if (res == 0)
3625 continue;
3626
3627 if (res == 1)
3628 return opt_result::failure_at (stmt_info_b->stmt,
3629 "not vectorized:"
3630 " compilation time alias: %G%G",
3631 stmt_info_a->stmt,
3632 stmt_info_b->stmt);
3633 }
3634
3635 dr_with_seg_len dr_a (dr_info_a->dr, segment_length_a,
3636 access_size_a, align_a);
3637 dr_with_seg_len dr_b (dr_info_b->dr, segment_length_b,
3638 access_size_b, align_b);
3639 /* Canonicalize the order to be the one that's needed for accurate
3640 RAW, WAR and WAW flags, in cases where the data references are
3641 well-ordered. The order doesn't really matter otherwise,
3642 but we might as well be consistent. */
3643 if (get_later_stmt (stmt_info_a, stmt_info_b) == stmt_info_a)
3644 std::swap (dr_a, dr_b);
3645
3646 dr_with_seg_len_pair_t dr_with_seg_len_pair
3647 (dr_a, dr_b, (preserves_scalar_order_p
3648 ? dr_with_seg_len_pair_t::WELL_ORDERED
3649 : dr_with_seg_len_pair_t::REORDERED));
3650
3651 comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
3652 }
3653
3654 prune_runtime_alias_test_list (&comp_alias_ddrs, vect_factor);
3655
3656 unsigned int count = (comp_alias_ddrs.length ()
3657 + check_unequal_addrs.length ());
3658
3659 if (dump_enabled_p ())
3660 dump_printf_loc (MSG_NOTE, vect_location,
3661 "improved number of alias checks from %d to %d\n",
3662 may_alias_ddrs.length (), count);
3663 unsigned limit = param_vect_max_version_for_alias_checks;
3664 if (flag_simd_cost_model == VECT_COST_MODEL_CHEAP)
3665 limit = param_vect_max_version_for_alias_checks * 6 / 10;
3666 if (count > limit)
3667 return opt_result::failure_at
3668 (vect_location,
3669 "number of versioning for alias run-time tests exceeds %d "
3670 "(--param vect-max-version-for-alias-checks)\n", limit);
3671
3672 return opt_result::success ();
3673 }
3674
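/* Editorial sketch (not part of GCC): each pair that survives the pruning
   above is later expanded, by the code that emits the versioning condition,
   into a test of roughly the shape below, where the "full" lengths combine
   dr_with_seg_len::seg_len and ::access_size.  Hypothetical plain-integer
   addresses stand in for the generated gimple.  */

static int
example_segments_do_not_overlap (unsigned long addr_a, unsigned long len_a,
				 unsigned long addr_b, unsigned long len_b)
{
  /* The vectorized loop version is entered only if the two segments are
     provably disjoint.  */
  return addr_a + len_a <= addr_b || addr_b + len_b <= addr_a;
}
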
3675 /* Check whether we can use an internal function for a gather load
3676 or scatter store. READ_P is true for loads and false for stores.
3677 MASKED_P is true if the load or store is conditional. MEMORY_TYPE is
3678 the type of the memory elements being loaded or stored. OFFSET_TYPE
3679 is the type of the offset that is being applied to the invariant
3680 base address. SCALE is the amount by which the offset should
3681 be multiplied *after* it has been converted to address width.
3682
3683 Return true if the function is supported, storing the function id in
3684 *IFN_OUT and the vector type for the offset in *OFFSET_VECTYPE_OUT. */
3685
3686 bool
3687 vect_gather_scatter_fn_p (vec_info *vinfo, bool read_p, bool masked_p,
3688 tree vectype, tree memory_type, tree offset_type,
3689 int scale, internal_fn *ifn_out,
3690 tree *offset_vectype_out)
3691 {
3692 unsigned int memory_bits = tree_to_uhwi (TYPE_SIZE (memory_type));
3693 unsigned int element_bits = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (vectype)));
3694 if (element_bits != memory_bits)
3695 /* For now the vector elements must be the same width as the
3696 memory elements. */
3697 return false;
3698
3699 /* Work out which function we need. */
3700 internal_fn ifn;
3701 if (read_p)
3702 ifn = masked_p ? IFN_MASK_GATHER_LOAD : IFN_GATHER_LOAD;
3703 else
3704 ifn = masked_p ? IFN_MASK_SCATTER_STORE : IFN_SCATTER_STORE;
3705
3706 for (;;)
3707 {
3708 tree offset_vectype = get_vectype_for_scalar_type (vinfo, offset_type);
3709 if (!offset_vectype)
3710 return false;
3711
3712 /* Test whether the target supports this combination. */
3713 if (internal_gather_scatter_fn_supported_p (ifn, vectype, memory_type,
3714 offset_vectype, scale))
3715 {
3716 *ifn_out = ifn;
3717 *offset_vectype_out = offset_vectype;
3718 return true;
3719 }
3720
3721 if (TYPE_PRECISION (offset_type) >= POINTER_SIZE
3722 && TYPE_PRECISION (offset_type) >= element_bits)
3723 return false;
3724
3725 offset_type = build_nonstandard_integer_type
3726 (TYPE_PRECISION (offset_type) * 2, TYPE_UNSIGNED (offset_type));
3727 }
3728 }
3729
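/* Editorial sketch (not part of GCC): the retry loop above keeps doubling
   the offset precision until either the target accepts the combination or
   the offset is already at least pointer-sized and element-sized.  A scalar
   analogue, with a hypothetical target-query callback:  */

static int
example_pick_offset_bits (unsigned int offset_bits, unsigned int element_bits,
			  unsigned int pointer_bits,
			  int (*target_supports_p) (unsigned int))
{
  for (;;)
    {
      if (target_supports_p (offset_bits))
	return (int) offset_bits;
      if (offset_bits >= pointer_bits && offset_bits >= element_bits)
	return -1;	/* Give up, as the real code returns false.  */
      offset_bits *= 2;
    }
}
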
3730 /* STMT_INFO is a call to an internal gather load or scatter store function.
3731 Describe the operation in INFO. */
3732
3733 static void
3734 vect_describe_gather_scatter_call (stmt_vec_info stmt_info,
3735 gather_scatter_info *info)
3736 {
3737 gcall *call = as_a <gcall *> (stmt_info->stmt);
3738 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3739 data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3740
3741 info->ifn = gimple_call_internal_fn (call);
3742 info->decl = NULL_TREE;
3743 info->base = gimple_call_arg (call, 0);
3744 info->offset = gimple_call_arg (call, 1);
3745 info->offset_dt = vect_unknown_def_type;
3746 info->offset_vectype = NULL_TREE;
3747 info->scale = TREE_INT_CST_LOW (gimple_call_arg (call, 2));
3748 info->element_type = TREE_TYPE (vectype);
3749 info->memory_type = TREE_TYPE (DR_REF (dr));
3750 }
3751
3752 /* Return true if a non-affine read or write in STMT_INFO is suitable for a
3753 gather load or scatter store. Describe the operation in *INFO if so. */
3754
3755 bool
3756 vect_check_gather_scatter (stmt_vec_info stmt_info, loop_vec_info loop_vinfo,
3757 gather_scatter_info *info)
3758 {
3759 HOST_WIDE_INT scale = 1;
3760 poly_int64 pbitpos, pbitsize;
3761 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3762 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3763 tree offtype = NULL_TREE;
3764 tree decl = NULL_TREE, base, off;
3765 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3766 tree memory_type = TREE_TYPE (DR_REF (dr));
3767 machine_mode pmode;
3768 int punsignedp, reversep, pvolatilep = 0;
3769 internal_fn ifn;
3770 tree offset_vectype;
3771 bool masked_p = false;
3772
3773 /* See whether this is already a call to a gather/scatter internal function.
3774 If not, see whether it's a masked load or store. */
3775 gcall *call = dyn_cast <gcall *> (stmt_info->stmt);
3776 if (call && gimple_call_internal_p (call))
3777 {
3778 ifn = gimple_call_internal_fn (call);
3779 if (internal_gather_scatter_fn_p (ifn))
3780 {
3781 vect_describe_gather_scatter_call (stmt_info, info);
3782 return true;
3783 }
3784 masked_p = (ifn == IFN_MASK_LOAD || ifn == IFN_MASK_STORE);
3785 }
3786
3787 /* True if we should aim to use internal functions rather than
3788 built-in functions. */
3789 bool use_ifn_p = (DR_IS_READ (dr)
3790 ? supports_vec_gather_load_p ()
3791 : supports_vec_scatter_store_p ());
3792
3793 base = DR_REF (dr);
3794 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
3795 see if we can use the def stmt of the address. */
3796 if (masked_p
3797 && TREE_CODE (base) == MEM_REF
3798 && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
3799 && integer_zerop (TREE_OPERAND (base, 1))
3800 && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
3801 {
3802 gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
3803 if (is_gimple_assign (def_stmt)
3804 && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
3805 base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
3806 }
3807
3808 /* The gather and scatter builtins need an address of the form
3809 loop_invariant + vector * {1, 2, 4, 8}
3810 or
3811 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3812 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3813 of loop invariants/SSA_NAMEs defined in the loop, with casts,
3814 multiplications and additions in it. To get a vector, we need
3815 a single SSA_NAME that will be defined in the loop and will
3816 contain everything that is not loop invariant and that can be
3817 vectorized. The following code attempts to find such a preexisting
3818 SSA_NAME OFF and put the loop invariants into a tree BASE
3819 that can be gimplified before the loop. */
3820 base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
3821 &punsignedp, &reversep, &pvolatilep);
3822 if (reversep)
3823 return false;
3824
3825 poly_int64 pbytepos = exact_div (pbitpos, BITS_PER_UNIT);
3826
3827 if (TREE_CODE (base) == MEM_REF)
3828 {
3829 if (!integer_zerop (TREE_OPERAND (base, 1)))
3830 {
3831 if (off == NULL_TREE)
3832 off = wide_int_to_tree (sizetype, mem_ref_offset (base));
3833 else
3834 off = size_binop (PLUS_EXPR, off,
3835 fold_convert (sizetype, TREE_OPERAND (base, 1)));
3836 }
3837 base = TREE_OPERAND (base, 0);
3838 }
3839 else
3840 base = build_fold_addr_expr (base);
3841
3842 if (off == NULL_TREE)
3843 off = size_zero_node;
3844
3845 /* If BASE is not loop invariant, then if OFF is zero we start with just
3846 the constant offset in the loop-invariant BASE and continue with BASE
3847 as OFF; otherwise give up.
3848 We could handle that case by gimplifying the addition of BASE + OFF
3849 into some SSA_NAME and using that as OFF, but for now punt. */
3850 if (!expr_invariant_in_loop_p (loop, base))
3851 {
3852 if (!integer_zerop (off))
3853 return false;
3854 off = base;
3855 base = size_int (pbytepos);
3856 }
3857 /* Otherwise put base + constant offset into the loop invariant BASE
3858 and continue with OFF. */
3859 else
3860 {
3861 base = fold_convert (sizetype, base);
3862 base = size_binop (PLUS_EXPR, base, size_int (pbytepos));
3863 }
3864
3865 /* OFF at this point may be either a SSA_NAME or some tree expression
3866 from get_inner_reference. Try to peel off loop invariants from it
3867 into BASE as long as possible. */
3868 STRIP_NOPS (off);
3869 while (offtype == NULL_TREE)
3870 {
3871 enum tree_code code;
3872 tree op0, op1, add = NULL_TREE;
3873
3874 if (TREE_CODE (off) == SSA_NAME)
3875 {
3876 gimple *def_stmt = SSA_NAME_DEF_STMT (off);
3877
3878 if (expr_invariant_in_loop_p (loop, off))
3879 return false;
3880
3881 if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
3882 break;
3883
3884 op0 = gimple_assign_rhs1 (def_stmt);
3885 code = gimple_assign_rhs_code (def_stmt);
3886 op1 = gimple_assign_rhs2 (def_stmt);
3887 }
3888 else
3889 {
3890 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
3891 return false;
3892 code = TREE_CODE (off);
3893 extract_ops_from_tree (off, &code, &op0, &op1);
3894 }
3895 switch (code)
3896 {
3897 case POINTER_PLUS_EXPR:
3898 case PLUS_EXPR:
3899 if (expr_invariant_in_loop_p (loop, op0))
3900 {
3901 add = op0;
3902 off = op1;
3903 do_add:
3904 add = fold_convert (sizetype, add);
3905 if (scale != 1)
3906 add = size_binop (MULT_EXPR, add, size_int (scale));
3907 base = size_binop (PLUS_EXPR, base, add);
3908 continue;
3909 }
3910 if (expr_invariant_in_loop_p (loop, op1))
3911 {
3912 add = op1;
3913 off = op0;
3914 goto do_add;
3915 }
3916 break;
3917 case MINUS_EXPR:
3918 if (expr_invariant_in_loop_p (loop, op1))
3919 {
3920 add = fold_convert (sizetype, op1);
3921 add = size_binop (MINUS_EXPR, size_zero_node, add);
3922 off = op0;
3923 goto do_add;
3924 }
3925 break;
3926 case MULT_EXPR:
3927 if (scale == 1 && tree_fits_shwi_p (op1))
3928 {
3929 int new_scale = tree_to_shwi (op1);
3930 /* Only treat this as a scaling operation if the target
3931 supports it for at least some offset type. */
3932 if (use_ifn_p
3933 && !vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
3934 masked_p, vectype, memory_type,
3935 signed_char_type_node,
3936 new_scale, &ifn,
3937 &offset_vectype)
3938 && !vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
3939 masked_p, vectype, memory_type,
3940 unsigned_char_type_node,
3941 new_scale, &ifn,
3942 &offset_vectype))
3943 break;
3944 scale = new_scale;
3945 off = op0;
3946 continue;
3947 }
3948 break;
3949 case SSA_NAME:
3950 off = op0;
3951 continue;
3952 CASE_CONVERT:
3953 if (!POINTER_TYPE_P (TREE_TYPE (op0))
3954 && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3955 break;
3956
3957 /* Don't include the conversion if the target is happy with
3958 the current offset type. */
3959 if (use_ifn_p
3960 && vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
3961 masked_p, vectype, memory_type,
3962 TREE_TYPE (off), scale, &ifn,
3963 &offset_vectype))
3964 break;
3965
3966 if (TYPE_PRECISION (TREE_TYPE (op0))
3967 == TYPE_PRECISION (TREE_TYPE (off)))
3968 {
3969 off = op0;
3970 continue;
3971 }
3972
3973 if (TYPE_PRECISION (TREE_TYPE (op0))
3974 < TYPE_PRECISION (TREE_TYPE (off)))
3975 {
3976 off = op0;
3977 offtype = TREE_TYPE (off);
3978 STRIP_NOPS (off);
3979 continue;
3980 }
3981 break;
3982 default:
3983 break;
3984 }
3985 break;
3986 }
3987
3988 /* If at the end OFF still isn't a SSA_NAME or isn't
3989 defined in the loop, punt. */
3990 if (TREE_CODE (off) != SSA_NAME
3991 || expr_invariant_in_loop_p (loop, off))
3992 return false;
3993
3994 if (offtype == NULL_TREE)
3995 offtype = TREE_TYPE (off);
3996
3997 if (use_ifn_p)
3998 {
3999 if (!vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr), masked_p,
4000 vectype, memory_type, offtype, scale,
4001 &ifn, &offset_vectype))
4002 return false;
4003 }
4004 else
4005 {
4006 if (DR_IS_READ (dr))
4007 {
4008 if (targetm.vectorize.builtin_gather)
4009 decl = targetm.vectorize.builtin_gather (vectype, offtype, scale);
4010 }
4011 else
4012 {
4013 if (targetm.vectorize.builtin_scatter)
4014 decl = targetm.vectorize.builtin_scatter (vectype, offtype, scale);
4015 }
4016
4017 if (!decl)
4018 return false;
4019
4020 ifn = IFN_LAST;
4021 /* The offset vector type will be read from DECL when needed. */
4022 offset_vectype = NULL_TREE;
4023 }
4024
4025 info->ifn = ifn;
4026 info->decl = decl;
4027 info->base = base;
4028 info->offset = off;
4029 info->offset_dt = vect_unknown_def_type;
4030 info->offset_vectype = offset_vectype;
4031 info->scale = scale;
4032 info->element_type = TREE_TYPE (vectype);
4033 info->memory_type = memory_type;
4034 return true;
4035 }
4036
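/* Worked example (editorial, not part of GCC): for an indexed load such as
   the one below, the decomposition above would typically leave BASE holding
   the loop-invariant address of b[0], OFF the SSA name carrying idx[i]
   (possibly after a widening conversion), and SCALE absorbing the
   element-size multiplication, which is the form the gather/scatter
   builtins and internal functions expect.  */

void
example_gather_load (float *out, const float *b, const int *idx, int n)
{
  for (int i = 0; i < n; i++)
    out[i] = b[idx[i]];
}
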
4037 /* Find the data references in STMT, analyze them with respect to LOOP and
4038 append them to DATAREFS. Return false if datarefs in this stmt cannot
4039 be handled. */
4040
4041 opt_result
4042 vect_find_stmt_data_reference (loop_p loop, gimple *stmt,
4043 vec<data_reference_p> *datarefs)
4044 {
4045 /* We can ignore clobbers for dataref analysis - they are removed during
4046 loop vectorization and BB vectorization checks dependences with a
4047 stmt walk. */
4048 if (gimple_clobber_p (stmt))
4049 return opt_result::success ();
4050
4051 if (gimple_has_volatile_ops (stmt))
4052 return opt_result::failure_at (stmt, "not vectorized: volatile type: %G",
4053 stmt);
4054
4055 if (stmt_can_throw_internal (cfun, stmt))
4056 return opt_result::failure_at (stmt,
4057 "not vectorized:"
4058 " statement can throw an exception: %G",
4059 stmt);
4060
4061 auto_vec<data_reference_p, 2> refs;
4062 opt_result res = find_data_references_in_stmt (loop, stmt, &refs);
4063 if (!res)
4064 return res;
4065
4066 if (refs.is_empty ())
4067 return opt_result::success ();
4068
4069 if (refs.length () > 1)
4070 return opt_result::failure_at (stmt,
4071 "not vectorized:"
4072 " more than one data ref in stmt: %G", stmt);
4073
4074 if (gcall *call = dyn_cast <gcall *> (stmt))
4075 if (!gimple_call_internal_p (call)
4076 || (gimple_call_internal_fn (call) != IFN_MASK_LOAD
4077 && gimple_call_internal_fn (call) != IFN_MASK_STORE))
4078 return opt_result::failure_at (stmt,
4079 "not vectorized: dr in a call %G", stmt);
4080
4081 data_reference_p dr = refs.pop ();
4082 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
4083 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
4084 return opt_result::failure_at (stmt,
4085 "not vectorized:"
4086 " statement is bitfield access %G", stmt);
4087
4088 if (DR_BASE_ADDRESS (dr)
4089 && TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
4090 return opt_result::failure_at (stmt,
4091 "not vectorized:"
4092 " base addr of dr is a constant\n");
4093
4094 /* Check whether this may be a SIMD lane access and adjust the
4095 DR to make it easier for us to handle it. */
4096 if (loop
4097 && loop->simduid
4098 && (!DR_BASE_ADDRESS (dr)
4099 || !DR_OFFSET (dr)
4100 || !DR_INIT (dr)
4101 || !DR_STEP (dr)))
4102 {
4103 struct data_reference *newdr
4104 = create_data_ref (NULL, loop_containing_stmt (stmt), DR_REF (dr), stmt,
4105 DR_IS_READ (dr), DR_IS_CONDITIONAL_IN_STMT (dr));
4106 if (DR_BASE_ADDRESS (newdr)
4107 && DR_OFFSET (newdr)
4108 && DR_INIT (newdr)
4109 && DR_STEP (newdr)
4110 && TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
4111 && integer_zerop (DR_STEP (newdr)))
4112 {
4113 tree base_address = DR_BASE_ADDRESS (newdr);
4114 tree off = DR_OFFSET (newdr);
4115 tree step = ssize_int (1);
4116 if (integer_zerop (off)
4117 && TREE_CODE (base_address) == POINTER_PLUS_EXPR)
4118 {
4119 off = TREE_OPERAND (base_address, 1);
4120 base_address = TREE_OPERAND (base_address, 0);
4121 }
4122 STRIP_NOPS (off);
4123 if (TREE_CODE (off) == MULT_EXPR
4124 && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
4125 {
4126 step = TREE_OPERAND (off, 1);
4127 off = TREE_OPERAND (off, 0);
4128 STRIP_NOPS (off);
4129 }
4130 if (CONVERT_EXPR_P (off)
4131 && (TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off, 0)))
4132 < TYPE_PRECISION (TREE_TYPE (off))))
4133 off = TREE_OPERAND (off, 0);
4134 if (TREE_CODE (off) == SSA_NAME)
4135 {
4136 gimple *def = SSA_NAME_DEF_STMT (off);
4137 /* Look through widening conversion. */
4138 if (is_gimple_assign (def)
4139 && CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (def)))
4140 {
4141 tree rhs1 = gimple_assign_rhs1 (def);
4142 if (TREE_CODE (rhs1) == SSA_NAME
4143 && INTEGRAL_TYPE_P (TREE_TYPE (rhs1))
4144 && (TYPE_PRECISION (TREE_TYPE (off))
4145 > TYPE_PRECISION (TREE_TYPE (rhs1))))
4146 def = SSA_NAME_DEF_STMT (rhs1);
4147 }
4148 if (is_gimple_call (def)
4149 && gimple_call_internal_p (def)
4150 && (gimple_call_internal_fn (def) == IFN_GOMP_SIMD_LANE))
4151 {
4152 tree arg = gimple_call_arg (def, 0);
4153 tree reft = TREE_TYPE (DR_REF (newdr));
4154 gcc_assert (TREE_CODE (arg) == SSA_NAME);
4155 arg = SSA_NAME_VAR (arg);
4156 if (arg == loop->simduid
4157 /* For now. */
4158 && tree_int_cst_equal (TYPE_SIZE_UNIT (reft), step))
4159 {
4160 DR_BASE_ADDRESS (newdr) = base_address;
4161 DR_OFFSET (newdr) = ssize_int (0);
4162 DR_STEP (newdr) = step;
4163 DR_OFFSET_ALIGNMENT (newdr) = BIGGEST_ALIGNMENT;
4164 DR_STEP_ALIGNMENT (newdr) = highest_pow2_factor (step);
4165 /* Mark as simd-lane access. */
4166 tree arg2 = gimple_call_arg (def, 1);
4167 newdr->aux = (void *) (-1 - tree_to_uhwi (arg2));
4168 free_data_ref (dr);
4169 datarefs->safe_push (newdr);
4170 return opt_result::success ();
4171 }
4172 }
4173 }
4174 }
4175 free_data_ref (newdr);
4176 }
4177
4178 datarefs->safe_push (dr);
4179 return opt_result::success ();
4180 }
4181
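/* Worked example (editorial, not part of GCC): the SIMD-lane handling above
   targets the per-lane arrays that the OpenMP lowering creates for
   privatized storage inside a simd loop, indexed by .GOMP_SIMD_LANE.
   Source of roughly the shape below (compiled with -fopenmp-simd) can give
   rise to such accesses once TMP is forced into memory.  */

void
example_simd_lane_access (float *x, int n)
{
#pragma omp simd
  for (int i = 0; i < n; i++)
    {
      float tmp[2];
      tmp[0] = x[i];
      tmp[1] = tmp[0] * 2.0f;
      x[i] = tmp[1] + 1.0f;
    }
}
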
4182 /* Function vect_analyze_data_refs.
4183
4184 Find all the data references in the loop or basic block.
4185
4186 The general structure of the analysis of data refs in the vectorizer is as
4187 follows:
4188 1- vect_analyze_data_refs(loop/bb): call
4189 compute_data_dependences_for_loop/bb to find and analyze all data-refs
4190 in the loop/bb and their dependences.
4191 2- vect_analyze_dependences(): apply dependence testing using ddrs.
4192 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
4193 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
4194
4195 */
4196
4197 opt_result
4198 vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf, bool *fatal)
4199 {
4200 class loop *loop = NULL;
4201 unsigned int i;
4202 struct data_reference *dr;
4203 tree scalar_type;
4204
4205 DUMP_VECT_SCOPE ("vect_analyze_data_refs");
4206
4207 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
4208 loop = LOOP_VINFO_LOOP (loop_vinfo);
4209
4210 /* Go through the data-refs, check that the analysis succeeded. Update
4211 pointer from stmt_vec_info struct to DR and vectype. */
4212
4213 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
4214 FOR_EACH_VEC_ELT (datarefs, i, dr)
4215 {
4216 enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
4217 poly_uint64 vf;
4218
4219 gcc_assert (DR_REF (dr));
4220 stmt_vec_info stmt_info = vinfo->lookup_stmt (DR_STMT (dr));
4221 gcc_assert (!stmt_info->dr_aux.dr);
4222 stmt_info->dr_aux.dr = dr;
4223 stmt_info->dr_aux.stmt = stmt_info;
4224
4225 /* Check that analysis of the data-ref succeeded. */
4226 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
4227 || !DR_STEP (dr))
4228 {
4229 bool maybe_gather
4230 = DR_IS_READ (dr)
4231 && !TREE_THIS_VOLATILE (DR_REF (dr))
4232 && (targetm.vectorize.builtin_gather != NULL
4233 || supports_vec_gather_load_p ());
4234 bool maybe_scatter
4235 = DR_IS_WRITE (dr)
4236 && !TREE_THIS_VOLATILE (DR_REF (dr))
4237 && (targetm.vectorize.builtin_scatter != NULL
4238 || supports_vec_scatter_store_p ());
4239
4240 /* If the target supports vector gather loads or scatter stores,
4241 see if they can be used. */
4242 if (is_a <loop_vec_info> (vinfo)
4243 && !nested_in_vect_loop_p (loop, stmt_info))
4244 {
4245 if (maybe_gather || maybe_scatter)
4246 {
4247 if (maybe_gather)
4248 gatherscatter = GATHER;
4249 else
4250 gatherscatter = SCATTER;
4251 }
4252 }
4253
4254 if (gatherscatter == SG_NONE)
4255 {
4256 if (dump_enabled_p ())
4257 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4258 "not vectorized: data ref analysis "
4259 "failed %G", stmt_info->stmt);
4260 if (is_a <bb_vec_info> (vinfo))
4261 {
4262 /* In BB vectorization the ref can still participate
4263 in dependence analysis, we just can't vectorize it. */
4264 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4265 continue;
4266 }
4267 return opt_result::failure_at (stmt_info->stmt,
4268 "not vectorized:"
4269 " data ref analysis failed: %G",
4270 stmt_info->stmt);
4271 }
4272 }
4273
4274 /* See if this was detected as SIMD lane access. */
4275 if (dr->aux == (void *)-1
4276 || dr->aux == (void *)-2
4277 || dr->aux == (void *)-3
4278 || dr->aux == (void *)-4)
4279 {
4280 if (nested_in_vect_loop_p (loop, stmt_info))
4281 return opt_result::failure_at (stmt_info->stmt,
4282 "not vectorized:"
4283 " data ref analysis failed: %G",
4284 stmt_info->stmt);
4285 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info)
4286 = -(uintptr_t) dr->aux;
4287 }
4288
4289 tree base = get_base_address (DR_REF (dr));
4290 if (base && VAR_P (base) && DECL_NONALIASED (base))
4291 {
4292 if (dump_enabled_p ())
4293 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4294 "not vectorized: base object not addressable "
4295 "for stmt: %G", stmt_info->stmt);
4296 if (is_a <bb_vec_info> (vinfo))
4297 {
4298 /* In BB vectorization the ref can still participate
4299 in dependence analysis, we just can't vectorize it. */
4300 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4301 continue;
4302 }
4303 return opt_result::failure_at (stmt_info->stmt,
4304 "not vectorized: base object not"
4305 " addressable for stmt: %G",
4306 stmt_info->stmt);
4307 }
4308
4309 if (is_a <loop_vec_info> (vinfo)
4310 && DR_STEP (dr)
4311 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
4312 {
4313 if (nested_in_vect_loop_p (loop, stmt_info))
4314 return opt_result::failure_at (stmt_info->stmt,
4315 "not vectorized: "
4316 "not suitable for strided load %G",
4317 stmt_info->stmt);
4318 STMT_VINFO_STRIDED_P (stmt_info) = true;
4319 }
4320
4321 /* Update DR field in stmt_vec_info struct. */
4322
4323 /* If the dataref is in an inner-loop of the loop that is considered
4324 for vectorization, we also want to analyze the access relative to
4325 the outer-loop (DR contains information only relative to the
4326 inner-most enclosing loop). We do that by building a reference to the
4327 first location accessed by the inner-loop, and analyze it relative to
4328 the outer-loop. */
4329 if (loop && nested_in_vect_loop_p (loop, stmt_info))
4330 {
4331 /* Build a reference to the first location accessed by the
4332 inner loop: *(BASE + INIT + OFFSET). By construction,
4333 this address must be invariant in the inner loop, so we
4334 can consider it as being used in the outer loop. */
4335 tree base = unshare_expr (DR_BASE_ADDRESS (dr));
4336 tree offset = unshare_expr (DR_OFFSET (dr));
4337 tree init = unshare_expr (DR_INIT (dr));
4338 tree init_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset),
4339 init, offset);
4340 tree init_addr = fold_build_pointer_plus (base, init_offset);
4341 tree init_ref = build_fold_indirect_ref (init_addr);
4342
4343 if (dump_enabled_p ())
4344 dump_printf_loc (MSG_NOTE, vect_location,
4345 "analyze in outer loop: %T\n", init_ref);
4346
4347 opt_result res
4348 = dr_analyze_innermost (&STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info),
4349 init_ref, loop, stmt_info->stmt);
4350 if (!res)
4351 /* dr_analyze_innermost already explained the failure. */
4352 return res;
4353
4354 if (dump_enabled_p ())
4355 dump_printf_loc (MSG_NOTE, vect_location,
4356 "\touter base_address: %T\n"
4357 "\touter offset from base address: %T\n"
4358 "\touter constant offset from base address: %T\n"
4359 "\touter step: %T\n"
4360 "\touter base alignment: %d\n\n"
4361 "\touter base misalignment: %d\n"
4362 "\touter offset alignment: %d\n"
4363 "\touter step alignment: %d\n",
4364 STMT_VINFO_DR_BASE_ADDRESS (stmt_info),
4365 STMT_VINFO_DR_OFFSET (stmt_info),
4366 STMT_VINFO_DR_INIT (stmt_info),
4367 STMT_VINFO_DR_STEP (stmt_info),
4368 STMT_VINFO_DR_BASE_ALIGNMENT (stmt_info),
4369 STMT_VINFO_DR_BASE_MISALIGNMENT (stmt_info),
4370 STMT_VINFO_DR_OFFSET_ALIGNMENT (stmt_info),
4371 STMT_VINFO_DR_STEP_ALIGNMENT (stmt_info));
4372 }
4373
4374 /* Set vectype for STMT. */
4375 scalar_type = TREE_TYPE (DR_REF (dr));
4376 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type);
4377 if (!vectype)
4378 {
4379 if (dump_enabled_p ())
4380 {
4381 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4382 "not vectorized: no vectype for stmt: %G",
4383 stmt_info->stmt);
4384 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
4385 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
4386 scalar_type);
4387 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
4388 }
4389
4390 if (is_a <bb_vec_info> (vinfo))
4391 {
4392 /* No vector type is fine, the ref can still participate
4393 in dependence analysis, we just can't vectorize it. */
4394 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4395 continue;
4396 }
4397 if (fatal)
4398 *fatal = false;
4399 return opt_result::failure_at (stmt_info->stmt,
4400 "not vectorized:"
4401 " no vectype for stmt: %G"
4402 " scalar_type: %T\n",
4403 stmt_info->stmt, scalar_type);
4404 }
4405 else
4406 {
4407 if (dump_enabled_p ())
4408 dump_printf_loc (MSG_NOTE, vect_location,
4409 "got vectype for stmt: %G%T\n",
4410 stmt_info->stmt, vectype);
4411 }
4412
4413 /* Adjust the minimal vectorization factor according to the
4414 vector type. */
4415 vf = TYPE_VECTOR_SUBPARTS (vectype);
4416 *min_vf = upper_bound (*min_vf, vf);
4417
4418 /* Leave the BB vectorizer to pick the vector type later, based on
4419 the final dataref group size and SLP node size. */
4420 if (is_a <loop_vec_info> (vinfo))
4421 STMT_VINFO_VECTYPE (stmt_info) = vectype;
4422
4423 if (gatherscatter != SG_NONE)
4424 {
4425 gather_scatter_info gs_info;
4426 if (!vect_check_gather_scatter (stmt_info,
4427 as_a <loop_vec_info> (vinfo),
4428 &gs_info)
4429 || !get_vectype_for_scalar_type (vinfo,
4430 TREE_TYPE (gs_info.offset)))
4431 {
4432 if (fatal)
4433 *fatal = false;
4434 return opt_result::failure_at
4435 (stmt_info->stmt,
4436 (gatherscatter == GATHER)
4437 ? "not vectorized: not suitable for gather load %G"
4438 : "not vectorized: not suitable for scatter store %G",
4439 stmt_info->stmt);
4440 }
4441 STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
4442 }
4443 }
4444
4445 /* We used to stop processing and prune the list here. Verify we no
4446 longer need to. */
4447 gcc_assert (i == datarefs.length ());
4448
4449 return opt_result::success ();
4450 }
4451
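/* Worked example (editorial, not part of GCC): when the outer i-loop of the
   nest below is vectorized, the inner access in[i + j] is re-analyzed
   relative to the i-loop via the *(BASE + INIT + OFFSET) reference built
   above; relative to the outer loop it behaves like a unit-stride access
   advancing by one short (2 bytes) per outer iteration.  */

int
example_outer_loop_dr (const short *in, int n, int m)
{
  int s = 0;
  for (int i = 0; i < n; i++)
    for (int j = 0; j < m; j++)
      s += in[i + j];
  return s;
}
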
4452
4453 /* Function vect_get_new_vect_var.
4454
4455 Returns a name for a new variable. The current naming scheme uses the
4456 prefix "vect", "stmp", "mask" or "vectp" (depending on the value of
4457 VAR_KIND), followed by "_" and NAME when NAME is provided, for the
4458 names of vectorizer-generated variables. */
4459
4460 tree
4461 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
4462 {
4463 const char *prefix;
4464 tree new_vect_var;
4465
4466 switch (var_kind)
4467 {
4468 case vect_simple_var:
4469 prefix = "vect";
4470 break;
4471 case vect_scalar_var:
4472 prefix = "stmp";
4473 break;
4474 case vect_mask_var:
4475 prefix = "mask";
4476 break;
4477 case vect_pointer_var:
4478 prefix = "vectp";
4479 break;
4480 default:
4481 gcc_unreachable ();
4482 }
4483
4484 if (name)
4485 {
4486 char* tmp = concat (prefix, "_", name, NULL);
4487 new_vect_var = create_tmp_reg (type, tmp);
4488 free (tmp);
4489 }
4490 else
4491 new_vect_var = create_tmp_reg (type, prefix);
4492
4493 return new_vect_var;
4494 }
4495
4496 /* Like vect_get_new_vect_var but return an SSA name. */
4497
4498 tree
4499 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
4500 {
4501 const char *prefix;
4502 tree new_vect_var;
4503
4504 switch (var_kind)
4505 {
4506 case vect_simple_var:
4507 prefix = "vect";
4508 break;
4509 case vect_scalar_var:
4510 prefix = "stmp";
4511 break;
4512 case vect_pointer_var:
4513 prefix = "vectp";
4514 break;
4515 default:
4516 gcc_unreachable ();
4517 }
4518
4519 if (name)
4520 {
4521 char* tmp = concat (prefix, "_", name, NULL);
4522 new_vect_var = make_temp_ssa_name (type, NULL, tmp);
4523 free (tmp);
4524 }
4525 else
4526 new_vect_var = make_temp_ssa_name (type, NULL, prefix);
4527
4528 return new_vect_var;
4529 }
4530
4531 /* Duplicate ptr info and set alignment/misalignment on NAME from DR_INFO. */
4532
4533 static void
4534 vect_duplicate_ssa_name_ptr_info (tree name, dr_vec_info *dr_info)
4535 {
4536 duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr_info->dr));
4537 int misalign = DR_MISALIGNMENT (dr_info);
4538 if (misalign == DR_MISALIGNMENT_UNKNOWN)
4539 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
4540 else
4541 set_ptr_info_alignment (SSA_NAME_PTR_INFO (name),
4542 known_alignment (DR_TARGET_ALIGNMENT (dr_info)),
4543 misalign);
4544 }
4545
4546 /* Function vect_create_addr_base_for_vector_ref.
4547
4548 Create an expression that computes the address of the first memory location
4549 that will be accessed for a data reference.
4550
4551 Input:
4552 STMT_INFO: The statement containing the data reference.
4553 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
4554 OFFSET: Optional. If supplied, it is added to the initial address.
4555 LOOP: Specify relative to which loop-nest the address should be computed.
4556 For example, when the dataref is in an inner-loop nested in an
4557 outer-loop that is now being vectorized, LOOP can be either the
4558 outer-loop, or the inner-loop. The first memory location accessed
4559 by the following dataref ('in' points to short):
4560
4561 for (i=0; i<N; i++)
4562 for (j=0; j<M; j++)
4563 s += in[i+j]
4564
4565 is as follows:
4566 if LOOP=i_loop: &in (relative to i_loop)
4567 if LOOP=j_loop: &in+i*2B (relative to j_loop)
4568 BYTE_OFFSET: Optional, defaulted to NULL. If supplied, it is added to the
4569 initial address. Unlike OFFSET, which is a number of elements to
4570 be added, BYTE_OFFSET is measured in bytes.
4571
4572 Output:
4573 1. Return an SSA_NAME whose value is the address of the memory location of
4574 the first vector of the data reference.
4575 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
4576 these statement(s) which define the returned SSA_NAME.
4577
4578 FORNOW: We are only handling array accesses with step 1. */
4579
4580 tree
4581 vect_create_addr_base_for_vector_ref (stmt_vec_info stmt_info,
4582 gimple_seq *new_stmt_list,
4583 tree offset,
4584 tree byte_offset)
4585 {
4586 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
4587 struct data_reference *dr = dr_info->dr;
4588 const char *base_name;
4589 tree addr_base;
4590 tree dest;
4591 gimple_seq seq = NULL;
4592 tree vect_ptr_type;
4593 tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
4594 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4595 innermost_loop_behavior *drb = vect_dr_behavior (dr_info);
4596
4597 tree data_ref_base = unshare_expr (drb->base_address);
4598 tree base_offset = unshare_expr (drb->offset);
4599 tree init = unshare_expr (drb->init);
4600
4601 if (loop_vinfo)
4602 base_name = get_name (data_ref_base);
4603 else
4604 {
4605 base_offset = ssize_int (0);
4606 init = ssize_int (0);
4607 base_name = get_name (DR_REF (dr));
4608 }
4609
4610 /* Create base_offset */
4611 base_offset = size_binop (PLUS_EXPR,
4612 fold_convert (sizetype, base_offset),
4613 fold_convert (sizetype, init));
4614
4615 if (offset)
4616 {
4617 offset = fold_build2 (MULT_EXPR, sizetype,
4618 fold_convert (sizetype, offset), step);
4619 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4620 base_offset, offset);
4621 }
4622 if (byte_offset)
4623 {
4624 byte_offset = fold_convert (sizetype, byte_offset);
4625 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4626 base_offset, byte_offset);
4627 }
4628
4629 /* base + base_offset */
4630 if (loop_vinfo)
4631 addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
4632 else
4633 {
4634 addr_base = build1 (ADDR_EXPR,
4635 build_pointer_type (TREE_TYPE (DR_REF (dr))),
4636 unshare_expr (DR_REF (dr)));
4637 }
4638
4639 vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
4640 dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
4641 addr_base = force_gimple_operand (addr_base, &seq, true, dest);
4642 gimple_seq_add_seq (new_stmt_list, seq);
4643
4644 if (DR_PTR_INFO (dr)
4645 && TREE_CODE (addr_base) == SSA_NAME
4646 && !SSA_NAME_PTR_INFO (addr_base))
4647 {
4648 vect_duplicate_ssa_name_ptr_info (addr_base, dr_info);
4649 if (offset || byte_offset)
4650 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
4651 }
4652
4653 if (dump_enabled_p ())
4654 dump_printf_loc (MSG_NOTE, vect_location, "created %T\n", addr_base);
4655
4656 return addr_base;
4657 }
4658
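/* Editorial sketch (not part of GCC): in byte terms the address built above
   is BASE_ADDRESS + (DRB_OFFSET + INIT) + OFFSET * ELT_SIZE + BYTE_OFFSET,
   with OFFSET counted in elements and BYTE_OFFSET in bytes.  A plain-pointer
   analogue with hypothetical parameter names:  */

static char *
example_addr_base (char *base_address, unsigned long drb_offset,
		   unsigned long init, unsigned long elt_offset,
		   unsigned long elt_size, unsigned long byte_offset)
{
  unsigned long total = drb_offset + init + elt_offset * elt_size
			+ byte_offset;
  return base_address + total;
}
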
4659
4660 /* Function vect_create_data_ref_ptr.
4661
4662 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
4663 location accessed in the loop by STMT_INFO, along with the def-use update
4664 chain to appropriately advance the pointer through the loop iterations.
4665 Also set aliasing information for the pointer. This pointer is used by
4666 the callers to this function to create a memory reference expression for
4667 vector load/store access.
4668
4669 Input:
4670 1. STMT_INFO: a stmt that references memory. Expected to be of the form
4671 GIMPLE_ASSIGN <name, data-ref> or
4672 GIMPLE_ASSIGN <data-ref, name>.
4673 2. AGGR_TYPE: the type of the reference, which should be either a vector
4674 or an array.
4675 3. AT_LOOP: the loop where the vector memref is to be created.
4676 4. OFFSET (optional): an offset to be added to the initial address accessed
4677 by the data-ref in STMT_INFO.
4678 5. BSI: location where the new stmts are to be placed if there is no loop
4679 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
4680 pointing to the initial address.
4681 7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
4682 to the initial address accessed by the data-ref in STMT_INFO. This is
4683 similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
4684 is counted in bytes.
4685 8. IV_STEP (optional, defaults to NULL): the amount that should be added
4686 to the IV during each iteration of the loop. NULL says to move
4687 by one copy of AGGR_TYPE up or down, depending on the step of the
4688 data reference.
4689
4690 Output:
4691 1. Declare a new ptr to vector_type, and have it point to the base of the
4692 data reference (initial address accessed by the data reference).
4693 For example, for vector of type V8HI, the following code is generated:
4694
4695 v8hi *ap;
4696 ap = (v8hi *)initial_address;
4697
4698 if OFFSET is not supplied:
4699 initial_address = &a[init];
4700 if OFFSET is supplied:
4701 initial_address = &a[init + OFFSET];
4702 if BYTE_OFFSET is supplied:
4703 initial_address = &a[init] + BYTE_OFFSET;
4704
4705 Return the initial_address in INITIAL_ADDRESS.
4706
4707 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
4708 update the pointer in each iteration of the loop.
4709
4710 Return the increment stmt that updates the pointer in PTR_INCR.
4711
4712 3. Return the pointer. */
4713
4714 tree
4715 vect_create_data_ref_ptr (stmt_vec_info stmt_info, tree aggr_type,
4716 class loop *at_loop, tree offset,
4717 tree *initial_address, gimple_stmt_iterator *gsi,
4718 gimple **ptr_incr, bool only_init,
4719 tree byte_offset, tree iv_step)
4720 {
4721 const char *base_name;
4722 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4723 class loop *loop = NULL;
4724 bool nested_in_vect_loop = false;
4725 class loop *containing_loop = NULL;
4726 tree aggr_ptr_type;
4727 tree aggr_ptr;
4728 tree new_temp;
4729 gimple_seq new_stmt_list = NULL;
4730 edge pe = NULL;
4731 basic_block new_bb;
4732 tree aggr_ptr_init;
4733 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
4734 struct data_reference *dr = dr_info->dr;
4735 tree aptr;
4736 gimple_stmt_iterator incr_gsi;
4737 bool insert_after;
4738 tree indx_before_incr, indx_after_incr;
4739 gimple *incr;
4740 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
4741
4742 gcc_assert (iv_step != NULL_TREE
4743 || TREE_CODE (aggr_type) == ARRAY_TYPE
4744 || TREE_CODE (aggr_type) == VECTOR_TYPE);
4745
4746 if (loop_vinfo)
4747 {
4748 loop = LOOP_VINFO_LOOP (loop_vinfo);
4749 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info);
4750 containing_loop = (gimple_bb (stmt_info->stmt))->loop_father;
4751 pe = loop_preheader_edge (loop);
4752 }
4753 else
4754 {
4755 gcc_assert (bb_vinfo);
4756 only_init = true;
4757 *ptr_incr = NULL;
4758 }
4759
4760 /* Create an expression for the first address accessed by this load
4761 in LOOP. */
4762 base_name = get_name (DR_BASE_ADDRESS (dr));
4763
4764 if (dump_enabled_p ())
4765 {
4766 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4767 dump_printf_loc (MSG_NOTE, vect_location,
4768 "create %s-pointer variable to type: %T",
4769 get_tree_code_name (TREE_CODE (aggr_type)),
4770 aggr_type);
4771 if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4772 dump_printf (MSG_NOTE, " vectorizing an array ref: ");
4773 else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4774 dump_printf (MSG_NOTE, " vectorizing a vector ref: ");
4775 else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4776 dump_printf (MSG_NOTE, " vectorizing a record based array ref: ");
4777 else
4778 dump_printf (MSG_NOTE, " vectorizing a pointer ref: ");
4779 dump_printf (MSG_NOTE, "%T\n", DR_BASE_OBJECT (dr));
4780 }
4781
4782 /* (1) Create the new aggregate-pointer variable.
4783 Vector and array types inherit the alias set of their component
4784 type by default so we need to use a ref-all pointer if the data
4785 reference does not conflict with the created aggregated data
4786 reference because it is not addressable. */
4787 bool need_ref_all = false;
4788 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4789 get_alias_set (DR_REF (dr))))
4790 need_ref_all = true;
4791 /* Likewise for any of the data references in the stmt group. */
4792 else if (DR_GROUP_SIZE (stmt_info) > 1)
4793 {
4794 stmt_vec_info sinfo = DR_GROUP_FIRST_ELEMENT (stmt_info);
4795 do
4796 {
4797 struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
4798 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4799 get_alias_set (DR_REF (sdr))))
4800 {
4801 need_ref_all = true;
4802 break;
4803 }
4804 sinfo = DR_GROUP_NEXT_ELEMENT (sinfo);
4805 }
4806 while (sinfo);
4807 }
4808 aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
4809 need_ref_all);
4810 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
4811
4812
4813 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4814 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4815 def-use update cycles for the pointer: one relative to the outer-loop
4816 (LOOP), which is what step (3) below does. The other is relative
4817 to the inner-loop (which is the inner-most loop containing the dataref),
4818 and this is done by step (4) below.
4819
4820 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4821 inner-most loop, and so step (3) works the same, and step (4) is
4822 redundant. Step (3) creates the following:
4823
4824 vp0 = &base_addr;
4825 LOOP: vp1 = phi(vp0,vp2)
4826 ...
4827 ...
4828 vp2 = vp1 + step
4829 goto LOOP
4830
4831 If there is an inner-loop nested in loop, then step (4) will also be
4832 applied, and an additional update in the inner-loop will be created:
4833
4834 vp0 = &base_addr;
4835 LOOP: vp1 = phi(vp0,vp2)
4836 ...
4837 inner: vp3 = phi(vp1,vp4)
4838 vp4 = vp3 + inner_step
4839 if () goto inner
4840 ...
4841 vp2 = vp1 + step
4842 if () goto LOOP */
4843
4844 /* (2) Calculate the initial address of the aggregate-pointer, and set
4845 the aggregate-pointer to point to it before the loop. */
4846
4847 /* Create &(base[init_val+offset]) + byte_offset in the loop preheader. */
4848
4849 new_temp = vect_create_addr_base_for_vector_ref (stmt_info, &new_stmt_list,
4850 offset, byte_offset);
4851 if (new_stmt_list)
4852 {
4853 if (pe)
4854 {
4855 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
4856 gcc_assert (!new_bb);
4857 }
4858 else
4859 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
4860 }
4861
4862 *initial_address = new_temp;
4863 aggr_ptr_init = new_temp;
4864
4865 /* (3) Handle the updating of the aggregate-pointer inside the loop.
4866 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4867 inner-loop nested in LOOP (during outer-loop vectorization). */
4868
4869 /* No update in loop is required. */
4870 if (only_init && (!loop_vinfo || at_loop == loop))
4871 aptr = aggr_ptr_init;
4872 else
4873 {
4874 /* Accesses to invariant addresses should be handled specially
4875 by the caller. */
4876 tree step = vect_dr_behavior (dr_info)->step;
4877 gcc_assert (!integer_zerop (step));
4878
4879 if (iv_step == NULL_TREE)
4880 {
4881 /* The step of the aggregate pointer is the type size,
4882 negated for downward accesses. */
4883 iv_step = TYPE_SIZE_UNIT (aggr_type);
4884 if (tree_int_cst_sgn (step) == -1)
4885 iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
4886 }
4887
4888 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4889
4890 create_iv (aggr_ptr_init,
4891 fold_convert (aggr_ptr_type, iv_step),
4892 aggr_ptr, loop, &incr_gsi, insert_after,
4893 &indx_before_incr, &indx_after_incr);
4894 incr = gsi_stmt (incr_gsi);
4895 loop_vinfo->add_stmt (incr);
4896
4897 /* Copy the points-to information if it exists. */
4898 if (DR_PTR_INFO (dr))
4899 {
4900 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr_info);
4901 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr_info);
4902 }
4903 if (ptr_incr)
4904 *ptr_incr = incr;
4905
4906 aptr = indx_before_incr;
4907 }
4908
4909 if (!nested_in_vect_loop || only_init)
4910 return aptr;
4911
4912
4913 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4914 nested in LOOP, if it exists. */
4915
4916 gcc_assert (nested_in_vect_loop);
4917 if (!only_init)
4918 {
4919 standard_iv_increment_position (containing_loop, &incr_gsi,
4920 &insert_after);
4921 create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
4922 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
4923 &indx_after_incr);
4924 incr = gsi_stmt (incr_gsi);
4925 loop_vinfo->add_stmt (incr);
4926
4927 /* Copy the points-to information if it exists. */
4928 if (DR_PTR_INFO (dr))
4929 {
4930 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr_info);
4931 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr_info);
4932 }
4933 if (ptr_incr)
4934 *ptr_incr = incr;
4935
4936 return indx_before_incr;
4937 }
4938 else
4939 gcc_unreachable ();
4940 }
4941
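/* Editorial sketch (not part of GCC): the IV created in step (3) behaves
   like the hand-written pointer walk below, advancing by one AGGR_TYPE per
   iteration (or by IV_STEP when the caller supplies one).  The GCC
   vector_size attribute provides a stand-in for a V4SI vector type.  */

typedef int example_v4si __attribute__ ((vector_size (16)));

static void
example_data_ref_ptr_walk (example_v4si *initial_address,
			   example_v4si *out, long vec_iters)
{
  example_v4si *ap = initial_address;	/* vp0 = &base_addr  */
  for (long i = 0; i < vec_iters; i++)	/* vp1 = phi (vp0, vp2)  */
    {
      out[i] = *ap;
      ap = ap + 1;			/* vp2 = vp1 + step  */
    }
}
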
4942
4943 /* Function bump_vector_ptr
4944
4945 Increment a pointer (to a vector type) by vector-size. If requested,
4946 i.e. if PTR-INCR is given, then also connect the new increment stmt
4947 to the existing def-use update-chain of the pointer, by modifying
4948 the PTR_INCR as illustrated below:
4949
4950 The pointer def-use update-chain before this function:
4951 DATAREF_PTR = phi (p_0, p_2)
4952 ....
4953 PTR_INCR: p_2 = DATAREF_PTR + step
4954
4955 The pointer def-use update-chain after this function:
4956 DATAREF_PTR = phi (p_0, p_2)
4957 ....
4958 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4959 ....
4960 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
4961
4962 Input:
4963 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4964 in the loop.
4965 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4966 the loop. The increment amount across iterations is expected
4967 to be vector_size.
4968 BSI - location where the new update stmt is to be placed.
4969 STMT_INFO - the original scalar memory-access stmt that is being vectorized.
4970 BUMP - optional. The offset by which to bump the pointer. If not given,
4971 the offset is assumed to be vector_size.
4972
4973 Output: Return NEW_DATAREF_PTR as illustrated above.
4974
4975 */
4976
4977 tree
4978 bump_vector_ptr (tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
4979 stmt_vec_info stmt_info, tree bump)
4980 {
4981 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4982 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4983 tree update = TYPE_SIZE_UNIT (vectype);
4984 gassign *incr_stmt;
4985 ssa_op_iter iter;
4986 use_operand_p use_p;
4987 tree new_dataref_ptr;
4988
4989 if (bump)
4990 update = bump;
4991
4992 if (TREE_CODE (dataref_ptr) == SSA_NAME)
4993 new_dataref_ptr = copy_ssa_name (dataref_ptr);
4994 else
4995 new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
4996 incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
4997 dataref_ptr, update);
4998 vect_finish_stmt_generation (stmt_info, incr_stmt, gsi);
4999
5000 /* Copy the points-to information if it exists. */
5001 if (DR_PTR_INFO (dr))
5002 {
5003 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
5004 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
5005 }
5006
5007 if (!ptr_incr)
5008 return new_dataref_ptr;
5009
5010 /* Update the vector-pointer's cross-iteration increment. */
5011 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
5012 {
5013 tree use = USE_FROM_PTR (use_p);
5014
5015 if (use == dataref_ptr)
5016 SET_USE (use_p, new_dataref_ptr);
5017 else
5018 gcc_assert (operand_equal_p (use, update, 0));
5019 }
5020
5021 return new_dataref_ptr;
5022 }
5023
5024
5025 /* Copy memory reference info such as base/clique from the SRC reference
5026 to the DEST MEM_REF. */
5027
5028 void
5029 vect_copy_ref_info (tree dest, tree src)
5030 {
5031 if (TREE_CODE (dest) != MEM_REF)
5032 return;
5033
5034 tree src_base = src;
5035 while (handled_component_p (src_base))
5036 src_base = TREE_OPERAND (src_base, 0);
5037 if (TREE_CODE (src_base) != MEM_REF
5038 && TREE_CODE (src_base) != TARGET_MEM_REF)
5039 return;
5040
5041 MR_DEPENDENCE_CLIQUE (dest) = MR_DEPENDENCE_CLIQUE (src_base);
5042 MR_DEPENDENCE_BASE (dest) = MR_DEPENDENCE_BASE (src_base);
5043 }
5044
5045
5046 /* Function vect_create_destination_var.
5047
5048 Create a new temporary of type VECTYPE. */
5049
5050 tree
5051 vect_create_destination_var (tree scalar_dest, tree vectype)
5052 {
5053 tree vec_dest;
5054 const char *name;
5055 char *new_name;
5056 tree type;
5057 enum vect_var_kind kind;
5058
5059 kind = vectype
5060 ? VECTOR_BOOLEAN_TYPE_P (vectype)
5061 ? vect_mask_var
5062 : vect_simple_var
5063 : vect_scalar_var;
5064 type = vectype ? vectype : TREE_TYPE (scalar_dest);
5065
5066 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
5067
5068 name = get_name (scalar_dest);
5069 if (name)
5070 new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
5071 else
5072 new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
5073 vec_dest = vect_get_new_vect_var (type, kind, new_name);
5074 free (new_name);
5075
5076 return vec_dest;
5077 }
5078
5079 /* Function vect_grouped_store_supported.
5080
5081 Returns TRUE if interleave high and interleave low permutations
5082 are supported, and FALSE otherwise. */
5083
5084 bool
5085 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
5086 {
5087 machine_mode mode = TYPE_MODE (vectype);
5088
5089 /* vect_permute_store_chain requires the group size to be equal to 3 or
5090 be a power of two. */
5091 if (count != 3 && exact_log2 (count) == -1)
5092 {
5093 if (dump_enabled_p ())
5094 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5095 "the size of the group of accesses"
5096 " is not a power of 2 or not equal to 3\n");
5097 return false;
5098 }
5099
5100 /* Check that the permutation is supported. */
5101 if (VECTOR_MODE_P (mode))
5102 {
5103 unsigned int i;
5104 if (count == 3)
5105 {
5106 unsigned int j0 = 0, j1 = 0, j2 = 0;
5107 unsigned int i, j;
5108
5109 unsigned int nelt;
5110 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5111 {
5112 if (dump_enabled_p ())
5113 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5114 "cannot handle groups of 3 stores for"
5115 " variable-length vectors\n");
5116 return false;
5117 }
5118
5119 vec_perm_builder sel (nelt, nelt, 1);
5120 sel.quick_grow (nelt);
5121 vec_perm_indices indices;
5122 for (j = 0; j < 3; j++)
5123 {
5124 int nelt0 = ((3 - j) * nelt) % 3;
5125 int nelt1 = ((3 - j) * nelt + 1) % 3;
5126 int nelt2 = ((3 - j) * nelt + 2) % 3;
5127 for (i = 0; i < nelt; i++)
5128 {
5129 if (3 * i + nelt0 < nelt)
5130 sel[3 * i + nelt0] = j0++;
5131 if (3 * i + nelt1 < nelt)
5132 sel[3 * i + nelt1] = nelt + j1++;
5133 if (3 * i + nelt2 < nelt)
5134 sel[3 * i + nelt2] = 0;
5135 }
5136 indices.new_vector (sel, 2, nelt);
5137 if (!can_vec_perm_const_p (mode, indices))
5138 {
5139 if (dump_enabled_p ())
5140 dump_printf (MSG_MISSED_OPTIMIZATION,
5141 "permutation op not supported by target.\n");
5142 return false;
5143 }
5144
5145 for (i = 0; i < nelt; i++)
5146 {
5147 if (3 * i + nelt0 < nelt)
5148 sel[3 * i + nelt0] = 3 * i + nelt0;
5149 if (3 * i + nelt1 < nelt)
5150 sel[3 * i + nelt1] = 3 * i + nelt1;
5151 if (3 * i + nelt2 < nelt)
5152 sel[3 * i + nelt2] = nelt + j2++;
5153 }
5154 indices.new_vector (sel, 2, nelt);
5155 if (!can_vec_perm_const_p (mode, indices))
5156 {
5157 if (dump_enabled_p ())
5158 dump_printf (MSG_MISSED_OPTIMIZATION,
5159 "permutation op not supported by target.\n");
5160 return false;
5161 }
5162 }
5163 return true;
5164 }
5165 else
5166 {
5167 /* If length is not equal to 3 then only power of 2 is supported. */
5168 gcc_assert (pow2p_hwi (count));
5169 poly_uint64 nelt = GET_MODE_NUNITS (mode);
5170
5171 /* The encoding has 2 interleaved stepped patterns. */
5172 vec_perm_builder sel (nelt, 2, 3);
5173 sel.quick_grow (6);
5174 for (i = 0; i < 3; i++)
5175 {
5176 sel[i * 2] = i;
5177 sel[i * 2 + 1] = i + nelt;
5178 }
5179 vec_perm_indices indices (sel, 2, nelt);
5180 if (can_vec_perm_const_p (mode, indices))
5181 {
5182 for (i = 0; i < 6; i++)
5183 sel[i] += exact_div (nelt, 2);
5184 indices.new_vector (sel, 2, nelt);
5185 if (can_vec_perm_const_p (mode, indices))
5186 return true;
5187 }
5188 }
5189 }
5190
5191 if (dump_enabled_p ())
5192 dump_printf (MSG_MISSED_OPTIMIZATION,
5193 "permutation op not supported by target.\n");
5194 return false;
5195 }
5196
5197
5198 /* Return TRUE if vec_{mask_}store_lanes is available for COUNT vectors of
5199 type VECTYPE. MASKED_P says whether the masked form is needed. */
5200
5201 bool
5202 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5203 bool masked_p)
5204 {
5205 if (masked_p)
5206 return vect_lanes_optab_supported_p ("vec_mask_store_lanes",
5207 vec_mask_store_lanes_optab,
5208 vectype, count);
5209 else
5210 return vect_lanes_optab_supported_p ("vec_store_lanes",
5211 vec_store_lanes_optab,
5212 vectype, count);
5213 }
5214
5215
5216 /* Function vect_permute_store_chain.
5217
5218 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
5219 a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
5220 the data correctly for the stores. Return the final references for stores
5221 in RESULT_CHAIN.
5222
5223 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5224 The input is 4 vectors each containing 8 elements. We assign a number to
5225 each element, the input sequence is:
5226
5227 1st vec: 0 1 2 3 4 5 6 7
5228 2nd vec: 8 9 10 11 12 13 14 15
5229 3rd vec: 16 17 18 19 20 21 22 23
5230 4th vec: 24 25 26 27 28 29 30 31
5231
5232 The output sequence should be:
5233
5234 1st vec: 0 8 16 24 1 9 17 25
5235 2nd vec: 2 10 18 26 3 11 19 27
5236 3rd vec: 4 12 20 28 5 13 21 29
5237 4th vec: 6 14 22 30 7 15 23 31
5238
5239 i.e., we interleave the contents of the four vectors in their order.
5240
5241 We use interleave_high/low instructions to create such output. The input of
5242 each interleave_high/low operation is two vectors:
5243 1st vec 2nd vec
5244 0 1 2 3 4 5 6 7
5245 the even elements of the result vector are obtained left-to-right from the
5246 high/low elements of the first vector. The odd elements of the result are
5247 obtained left-to-right from the high/low elements of the second vector.
5248 The output of interleave_high will be: 0 4 1 5
5249 and of interleave_low: 2 6 3 7
5250
5251
5252 The permutation is done in log LENGTH stages. In each stage interleave_high
5253 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
5254 where the first argument is taken from the first half of DR_CHAIN and the
5255 second argument from its second half.
5256 In our example,
5257
5258 I1: interleave_high (1st vec, 3rd vec)
5259 I2: interleave_low (1st vec, 3rd vec)
5260 I3: interleave_high (2nd vec, 4th vec)
5261 I4: interleave_low (2nd vec, 4th vec)
5262
5263 The output for the first stage is:
5264
5265 I1: 0 16 1 17 2 18 3 19
5266 I2: 4 20 5 21 6 22 7 23
5267 I3: 8 24 9 25 10 26 11 27
5268 I4: 12 28 13 29 14 30 15 31
5269
5270 The output of the second stage, i.e. the final result is:
5271
5272 I1: 0 8 16 24 1 9 17 25
5273 I2: 2 10 18 26 3 11 19 27
5274 I3: 4 12 20 28 5 13 21 29
5275 I4: 6 14 22 30 7 15 23 31. */
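/* As a concrete illustration (a sketch assuming an 8-element vector mode
   such as V8HI), the two permutation masks built by the power-of-two
   path below are

     interleave_high: { 0, 8, 1, 9, 2, 10, 3, 11 }
     interleave_low:  { 4, 12, 5, 13, 6, 14, 7, 15 }

   i.e. the high mask interleaves the first halves of the two input
   vectors and the low mask interleaves their second halves.  */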
5276
5277 void
5278 vect_permute_store_chain (vec<tree> dr_chain,
5279 unsigned int length,
5280 stmt_vec_info stmt_info,
5281 gimple_stmt_iterator *gsi,
5282 vec<tree> *result_chain)
5283 {
5284 tree vect1, vect2, high, low;
5285 gimple *perm_stmt;
5286 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5287 tree perm_mask_low, perm_mask_high;
5288 tree data_ref;
5289 tree perm3_mask_low, perm3_mask_high;
5290 unsigned int i, j, n, log_length = exact_log2 (length);
5291
5292 result_chain->quick_grow (length);
5293 memcpy (result_chain->address (), dr_chain.address (),
5294 length * sizeof (tree));
5295
5296 if (length == 3)
5297 {
5298 /* vect_grouped_store_supported ensures that this is constant. */
5299 unsigned int nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5300 unsigned int j0 = 0, j1 = 0, j2 = 0;
5301
5302 vec_perm_builder sel (nelt, nelt, 1);
5303 sel.quick_grow (nelt);
5304 vec_perm_indices indices;
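      /* As an illustration (assuming nelt == 8), the masks built for
	 j == 0 are

	   perm3_mask_low:  { 0, 8, _, 1, 9, _, 2, 10 }  (_ = don't care, 0)
	   perm3_mask_high: { 0, 1, 8, 3, 4, 9, 6, 7 }

	 so the low permutation interleaves dr_chain[0] and dr_chain[1],
	 leaving every third slot as a don't-care, and the high
	 permutation keeps that result and fills the don't-care slots
	 from dr_chain[2].  */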
5305 for (j = 0; j < 3; j++)
5306 {
5307 int nelt0 = ((3 - j) * nelt) % 3;
5308 int nelt1 = ((3 - j) * nelt + 1) % 3;
5309 int nelt2 = ((3 - j) * nelt + 2) % 3;
5310
5311 for (i = 0; i < nelt; i++)
5312 {
5313 if (3 * i + nelt0 < nelt)
5314 sel[3 * i + nelt0] = j0++;
5315 if (3 * i + nelt1 < nelt)
5316 sel[3 * i + nelt1] = nelt + j1++;
5317 if (3 * i + nelt2 < nelt)
5318 sel[3 * i + nelt2] = 0;
5319 }
5320 indices.new_vector (sel, 2, nelt);
5321 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5322
5323 for (i = 0; i < nelt; i++)
5324 {
5325 if (3 * i + nelt0 < nelt)
5326 sel[3 * i + nelt0] = 3 * i + nelt0;
5327 if (3 * i + nelt1 < nelt)
5328 sel[3 * i + nelt1] = 3 * i + nelt1;
5329 if (3 * i + nelt2 < nelt)
5330 sel[3 * i + nelt2] = nelt + j2++;
5331 }
5332 indices.new_vector (sel, 2, nelt);
5333 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5334
5335 vect1 = dr_chain[0];
5336 vect2 = dr_chain[1];
5337
5338 /* Create interleaving stmt:
5339 low = VEC_PERM_EXPR <vect1, vect2,
5340 {j, nelt, *, j + 1, nelt + j + 1, *,
5341 j + 2, nelt + j + 2, *, ...}> */
5342 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5343 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5344 vect2, perm3_mask_low);
5345 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5346
5347 vect1 = data_ref;
5348 vect2 = dr_chain[2];
5349 /* Create interleaving stmt:
5350 low = VEC_PERM_EXPR <vect1, vect2,
5351 {0, 1, nelt + j, 3, 4, nelt + j + 1,
5352 6, 7, nelt + j + 2, ...}> */
5353 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5354 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5355 vect2, perm3_mask_high);
5356 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5357 (*result_chain)[j] = data_ref;
5358 }
5359 }
5360 else
5361 {
5362 /* If length is not equal to 3 then only power of 2 is supported. */
5363 gcc_assert (pow2p_hwi (length));
5364
5365 /* The encoding has 2 interleaved stepped patterns. */
5366 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5367 vec_perm_builder sel (nelt, 2, 3);
5368 sel.quick_grow (6);
5369 for (i = 0; i < 3; i++)
5370 {
5371 sel[i * 2] = i;
5372 sel[i * 2 + 1] = i + nelt;
5373 }
5374 vec_perm_indices indices (sel, 2, nelt);
5375 perm_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5376
5377 for (i = 0; i < 6; i++)
5378 sel[i] += exact_div (nelt, 2);
5379 indices.new_vector (sel, 2, nelt);
5380 perm_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5381
5382 for (i = 0, n = log_length; i < n; i++)
5383 {
5384 for (j = 0; j < length/2; j++)
5385 {
5386 vect1 = dr_chain[j];
5387 vect2 = dr_chain[j+length/2];
5388
5389 /* Create interleaving stmt:
5390 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
5391 ...}> */
5392 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
5393 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
5394 vect2, perm_mask_high);
5395 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5396 (*result_chain)[2*j] = high;
5397
5398 /* Create interleaving stmt:
5399 low = VEC_PERM_EXPR <vect1, vect2,
5400 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
5401 ...}> */
5402 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
5403 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
5404 vect2, perm_mask_low);
5405 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5406 (*result_chain)[2*j+1] = low;
5407 }
5408 memcpy (dr_chain.address (), result_chain->address (),
5409 length * sizeof (tree));
5410 }
5411 }
5412 }
5413
5414 /* Function vect_setup_realignment
5415
5416 This function is called when vectorizing an unaligned load using
5417 the dr_explicit_realign[_optimized] scheme.
5418 This function generates the following code at the loop prolog:
5419
5420 p = initial_addr;
5421 x msq_init = *(floor(p)); # prolog load
5422 realignment_token = call target_builtin;
5423 loop:
5424 x msq = phi (msq_init, ---)
5425
5426 The stmts marked with x are generated only for the case of
5427 dr_explicit_realign_optimized.
5428
5429 The code above sets up a new (vector) pointer, pointing to the first
5430 location accessed by STMT_INFO, and a "floor-aligned" load using that
5431 pointer. It also generates code to compute the "realignment-token"
5432 (if the relevant target hook was defined), and creates a phi-node at the
5433 loop-header bb whose arguments are the result of the prolog-load (created
5434 by this function) and the result of a load that takes place in the loop
5435 (to be created by the caller to this function).
5436
5437 For the case of dr_explicit_realign_optimized:
5438 The caller to this function uses the phi-result (msq) to create the
5439 realignment code inside the loop, and sets up the missing phi argument,
5440 as follows:
5441 loop:
5442 msq = phi (msq_init, lsq)
5443 lsq = *(floor(p')); # load in loop
5444 result = realign_load (msq, lsq, realignment_token);
5445
5446 For the case of dr_explicit_realign:
5447 loop:
5448 msq = *(floor(p)); # load in loop
5449 p' = p + (VS-1);
5450 lsq = *(floor(p')); # load in loop
5451 result = realign_load (msq, lsq, realignment_token);
5452
5453 Input:
5454 STMT_INFO - (scalar) load stmt to be vectorized. This load accesses
5455 a memory location that may be unaligned.
5456 GSI - place where new code is to be inserted.
5457 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
5458 is used.
5459
5460 Output:
5461 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
5462 target hook, if defined.
5463 Return value - the result of the loop-header phi node. */
5464
5465 tree
5466 vect_setup_realignment (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
5467 tree *realignment_token,
5468 enum dr_alignment_support alignment_support_scheme,
5469 tree init_addr,
5470 class loop **at_loop)
5471 {
5472 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5473 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5474 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
5475 struct data_reference *dr = dr_info->dr;
5476 class loop *loop = NULL;
5477 edge pe = NULL;
5478 tree scalar_dest = gimple_assign_lhs (stmt_info->stmt);
5479 tree vec_dest;
5480 gimple *inc;
5481 tree ptr;
5482 tree data_ref;
5483 basic_block new_bb;
5484 tree msq_init = NULL_TREE;
5485 tree new_temp;
5486 gphi *phi_stmt;
5487 tree msq = NULL_TREE;
5488 gimple_seq stmts = NULL;
5489 bool compute_in_loop = false;
5490 bool nested_in_vect_loop = false;
5491 class loop *containing_loop = (gimple_bb (stmt_info->stmt))->loop_father;
5492 class loop *loop_for_initial_load = NULL;
5493
5494 if (loop_vinfo)
5495 {
5496 loop = LOOP_VINFO_LOOP (loop_vinfo);
5497 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info);
5498 }
5499
5500 gcc_assert (alignment_support_scheme == dr_explicit_realign
5501 || alignment_support_scheme == dr_explicit_realign_optimized);
5502
5503 /* We need to generate three things:
5504 1. the misalignment computation
5505 2. the extra vector load (for the optimized realignment scheme).
5506 3. the phi node for the two vectors from which the realignment is
5507 done (for the optimized realignment scheme). */
5508
5509 /* 1. Determine where to generate the misalignment computation.
5510
5511 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
5512 calculation will be generated by this function, outside the loop (in the
5513 preheader). Otherwise, INIT_ADDR had already been computed for us by the
5514 caller, inside the loop.
5515
5516 Background: If the misalignment remains fixed throughout the iterations of
5517 the loop, then both realignment schemes are applicable, and also the
5518 misalignment computation can be done outside LOOP. This is because we are
5519 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
5520 are a multiple of VS (the Vector Size), and therefore the misalignment in
5521 different vectorized LOOP iterations is always the same.
5522 The problem arises only if the memory access is in an inner-loop nested
5523 inside LOOP, which is now being vectorized using outer-loop vectorization.
5524 This is the only case when the misalignment of the memory access may not
5525 remain fixed throughout the iterations of the inner-loop (as explained in
5526 detail in vect_supportable_dr_alignment). In this case, not only is the
5527 optimized realignment scheme not applicable, but also the misalignment
5528 computation (and generation of the realignment token that is passed to
5529 REALIGN_LOAD) have to be done inside the loop.
5530
5531 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
5532 or not, which in turn determines if the misalignment is computed inside
5533 the inner-loop, or outside LOOP. */
5534
5535 if (init_addr != NULL_TREE || !loop_vinfo)
5536 {
5537 compute_in_loop = true;
5538 gcc_assert (alignment_support_scheme == dr_explicit_realign);
5539 }
5540
5541
5542 /* 2. Determine where to generate the extra vector load.
5543
5544 For the optimized realignment scheme, instead of generating two vector
5545 loads in each iteration, we generate a single extra vector load in the
5546 preheader of the loop, and in each iteration reuse the result of the
5547 vector load from the previous iteration. In case the memory access is in
5548 an inner-loop nested inside LOOP, which is now being vectorized using
5549 outer-loop vectorization, we need to determine whether this initial vector
5550 load should be generated at the preheader of the inner-loop, or can be
5551 generated at the preheader of LOOP. If the memory access has no evolution
5552 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
5553 to be generated inside LOOP (in the preheader of the inner-loop). */
5554
5555 if (nested_in_vect_loop)
5556 {
5557 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
5558 bool invariant_in_outerloop =
5559 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
5560 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
5561 }
5562 else
5563 loop_for_initial_load = loop;
5564 if (at_loop)
5565 *at_loop = loop_for_initial_load;
5566
5567 if (loop_for_initial_load)
5568 pe = loop_preheader_edge (loop_for_initial_load);
5569
5570 /* 3. For the case of the optimized realignment, create the first vector
5571 load at the loop preheader. */
5572
5573 if (alignment_support_scheme == dr_explicit_realign_optimized)
5574 {
5575 /* Create msq_init = *(floor(p1)) in the loop preheader */
5576 gassign *new_stmt;
5577
5578 gcc_assert (!compute_in_loop);
5579 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5580 ptr = vect_create_data_ref_ptr (stmt_info, vectype,
5581 loop_for_initial_load, NULL_TREE,
5582 &init_addr, NULL, &inc, true);
5583 if (TREE_CODE (ptr) == SSA_NAME)
5584 new_temp = copy_ssa_name (ptr);
5585 else
5586 new_temp = make_ssa_name (TREE_TYPE (ptr));
5587 poly_uint64 align = DR_TARGET_ALIGNMENT (dr_info);
5588 tree type = TREE_TYPE (ptr);
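      /* The following statement masks the pointer down to the target
	 alignment boundary, i.e. it computes floor (ptr) as ptr & -ALIGN
	 (this relies on ALIGN being a power of two).  For instance, with
	 a 16-byte target alignment the mask 0 - 16 is ...fffffff0.  */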
5589 new_stmt = gimple_build_assign
5590 (new_temp, BIT_AND_EXPR, ptr,
5591 fold_build2 (MINUS_EXPR, type,
5592 build_int_cst (type, 0),
5593 build_int_cst (type, align)));
5594 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5595 gcc_assert (!new_bb);
5596 data_ref
5597 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
5598 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
5599 vect_copy_ref_info (data_ref, DR_REF (dr));
5600 new_stmt = gimple_build_assign (vec_dest, data_ref);
5601 new_temp = make_ssa_name (vec_dest, new_stmt);
5602 gimple_assign_set_lhs (new_stmt, new_temp);
5603 if (pe)
5604 {
5605 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5606 gcc_assert (!new_bb);
5607 }
5608 else
5609 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5610
5611 msq_init = gimple_assign_lhs (new_stmt);
5612 }
5613
5614 /* 4. Create realignment token using a target builtin, if available.
5615 It is done either inside the containing loop, or before LOOP (as
5616 determined above). */
5617
5618 if (targetm.vectorize.builtin_mask_for_load)
5619 {
5620 gcall *new_stmt;
5621 tree builtin_decl;
5622
5623 /* Compute INIT_ADDR - the initial address accessed by this memref. */
5624 if (!init_addr)
5625 {
5626 /* Generate the INIT_ADDR computation outside LOOP. */
5627 init_addr = vect_create_addr_base_for_vector_ref (stmt_info, &stmts,
5628 NULL_TREE);
5629 if (loop)
5630 {
5631 pe = loop_preheader_edge (loop);
5632 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
5633 gcc_assert (!new_bb);
5634 }
5635 else
5636 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
5637 }
5638
5639 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
5640 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
5641 vec_dest =
5642 vect_create_destination_var (scalar_dest,
5643 gimple_call_return_type (new_stmt));
5644 new_temp = make_ssa_name (vec_dest, new_stmt);
5645 gimple_call_set_lhs (new_stmt, new_temp);
5646
5647 if (compute_in_loop)
5648 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5649 else
5650 {
5651 /* Generate the misalignment computation outside LOOP. */
5652 pe = loop_preheader_edge (loop);
5653 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5654 gcc_assert (!new_bb);
5655 }
5656
5657 *realignment_token = gimple_call_lhs (new_stmt);
5658
5659 /* The result of the CALL_EXPR to this builtin is determined from
5660 the value of the parameter and no global variables are touched
5661 which makes the builtin a "const" function. Requiring the
5662 builtin to have the "const" attribute makes it unnecessary
5663 to call mark_call_clobbered. */
5664 gcc_assert (TREE_READONLY (builtin_decl));
5665 }
5666
5667 if (alignment_support_scheme == dr_explicit_realign)
5668 return msq;
5669
5670 gcc_assert (!compute_in_loop);
5671 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
5672
5673
5674 /* 5. Create msq = phi <msq_init, lsq> in loop */
5675
5676 pe = loop_preheader_edge (containing_loop);
5677 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5678 msq = make_ssa_name (vec_dest);
5679 phi_stmt = create_phi_node (msq, containing_loop->header);
5680 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
5681
5682 return msq;
5683 }
5684
5685
5686 /* Function vect_grouped_load_supported.
5687
5688 COUNT is the size of the load group (the number of statements plus the
5689 number of gaps). SINGLE_ELEMENT_P is true if there is actually
5690 only one statement, with a gap of COUNT - 1.
5691
5692 Returns true if a suitable permute exists. */
5693
5694 bool
5695 vect_grouped_load_supported (tree vectype, bool single_element_p,
5696 unsigned HOST_WIDE_INT count)
5697 {
5698 machine_mode mode = TYPE_MODE (vectype);
5699
5700 /* If this is single-element interleaving with an element distance
5701 that leaves unused vector loads around, punt - we at least create
5702 very sub-optimal code in that case (and blow up memory,
5703 see PR65518). */
5704 if (single_element_p && maybe_gt (count, TYPE_VECTOR_SUBPARTS (vectype)))
5705 {
5706 if (dump_enabled_p ())
5707 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5708 "single-element interleaving not supported "
5709 "for non-adjacent vector loads\n");
5710 return false;
5711 }
5712
5713 /* vect_permute_load_chain requires the group size to be equal to 3 or
5714 be a power of two. */
5715 if (count != 3 && exact_log2 (count) == -1)
5716 {
5717 if (dump_enabled_p ())
5718 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5719 "the size of the group of accesses"
5720 " is not a power of 2 or not equal to 3\n");
5721 return false;
5722 }
5723
5724 /* Check that the permutation is supported. */
5725 if (VECTOR_MODE_P (mode))
5726 {
5727 unsigned int i, j;
5728 if (count == 3)
5729 {
5730 unsigned int nelt;
5731 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5732 {
5733 if (dump_enabled_p ())
5734 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5735 "cannot handle groups of 3 loads for"
5736 " variable-length vectors\n");
5737 return false;
5738 }
5739
5740 vec_perm_builder sel (nelt, nelt, 1);
5741 sel.quick_grow (nelt);
5742 vec_perm_indices indices;
5743 unsigned int k;
5744 for (k = 0; k < 3; k++)
5745 {
5746 for (i = 0; i < nelt; i++)
5747 if (3 * i + k < 2 * nelt)
5748 sel[i] = 3 * i + k;
5749 else
5750 sel[i] = 0;
5751 indices.new_vector (sel, 2, nelt);
5752 if (!can_vec_perm_const_p (mode, indices))
5753 {
5754 if (dump_enabled_p ())
5755 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5756 "shuffle of 3 loads is not supported by"
5757 " target\n");
5758 return false;
5759 }
5760 for (i = 0, j = 0; i < nelt; i++)
5761 if (3 * i + k < 2 * nelt)
5762 sel[i] = i;
5763 else
5764 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5765 indices.new_vector (sel, 2, nelt);
5766 if (!can_vec_perm_const_p (mode, indices))
5767 {
5768 if (dump_enabled_p ())
5769 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5770 "shuffle of 3 loads is not supported by"
5771 " target\n");
5772 return false;
5773 }
5774 }
5775 return true;
5776 }
5777 else
5778 {
5779 /* If length is not equal to 3 then only power of 2 is supported. */
5780 gcc_assert (pow2p_hwi (count));
5781 poly_uint64 nelt = GET_MODE_NUNITS (mode);
5782
5783 /* The encoding has a single stepped pattern. */
5784 vec_perm_builder sel (nelt, 1, 3);
5785 sel.quick_grow (3);
5786 for (i = 0; i < 3; i++)
5787 sel[i] = i * 2;
5788 vec_perm_indices indices (sel, 2, nelt);
5789 if (can_vec_perm_const_p (mode, indices))
5790 {
5791 for (i = 0; i < 3; i++)
5792 sel[i] = i * 2 + 1;
5793 indices.new_vector (sel, 2, nelt);
5794 if (can_vec_perm_const_p (mode, indices))
5795 return true;
5796 }
5797 }
5798 }
5799
5800 if (dump_enabled_p ())
5801 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5802 "extract even/odd not supported by target\n");
5803 return false;
5804 }
5805
5806 /* Return TRUE if vec_{mask_}load_lanes is available for COUNT vectors of
5807 type VECTYPE. MASKED_P says whether the masked form is needed. */
5808
5809 bool
5810 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5811 bool masked_p)
5812 {
5813 if (masked_p)
5814 return vect_lanes_optab_supported_p ("vec_mask_load_lanes",
5815 vec_mask_load_lanes_optab,
5816 vectype, count);
5817 else
5818 return vect_lanes_optab_supported_p ("vec_load_lanes",
5819 vec_load_lanes_optab,
5820 vectype, count);
5821 }
5822
5823 /* Function vect_permute_load_chain.
5824
5825 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5826 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5827 the input data correctly. Return the final references for loads in
5828 RESULT_CHAIN.
5829
5830 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5831 The input is 4 vectors each containing 8 elements. We assign a number to each
5832 element, the input sequence is:
5833
5834 1st vec: 0 1 2 3 4 5 6 7
5835 2nd vec: 8 9 10 11 12 13 14 15
5836 3rd vec: 16 17 18 19 20 21 22 23
5837 4th vec: 24 25 26 27 28 29 30 31
5838
5839 The output sequence should be:
5840
5841 1st vec: 0 4 8 12 16 20 24 28
5842 2nd vec: 1 5 9 13 17 21 25 29
5843 3rd vec: 2 6 10 14 18 22 26 30
5844 4th vec: 3 7 11 15 19 23 27 31
5845
5846 i.e., the first output vector should contain the first elements of each
5847 interleaving group, etc.
5848
5849 We use extract_even/odd instructions to create such output. The input of
5850 each extract_even/odd operation is two vectors
5851 1st vec 2nd vec
5852 0 1 2 3 4 5 6 7
5853
5854 and the output is the vector of extracted even/odd elements. The output of
5855 extract_even will be: 0 2 4 6
5856 and of extract_odd: 1 3 5 7
5857
5858
5859 The permutation is done in log LENGTH stages. In each stage extract_even
5860 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5861 their order. In our example,
5862
5863 E1: extract_even (1st vec, 2nd vec)
5864 E2: extract_odd (1st vec, 2nd vec)
5865 E3: extract_even (3rd vec, 4th vec)
5866 E4: extract_odd (3rd vec, 4th vec)
5867
5868 The output for the first stage will be:
5869
5870 E1: 0 2 4 6 8 10 12 14
5871 E2: 1 3 5 7 9 11 13 15
5872 E3: 16 18 20 22 24 26 28 30
5873 E4: 17 19 21 23 25 27 29 31
5874
5875 In order to proceed and create the correct sequence for the next stage (or
5876 for the correct output, if the second stage is the last one, as in our
5877 example), we first put the output of extract_even operation and then the
5878 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5879 The input for the second stage is:
5880
5881 1st vec (E1): 0 2 4 6 8 10 12 14
5882 2nd vec (E3): 16 18 20 22 24 26 28 30
5883 3rd vec (E2): 1 3 5 7 9 11 13 15
5884 4th vec (E4): 17 19 21 23 25 27 29 31
5885
5886 The output of the second stage:
5887
5888 E1: 0 4 8 12 16 20 24 28
5889 E2: 2 6 10 14 18 22 26 30
5890 E3: 1 5 9 13 17 21 25 29
5891 E4: 3 7 11 15 19 23 27 31
5892
5893 And RESULT_CHAIN after reordering:
5894
5895 1st vec (E1): 0 4 8 12 16 20 24 28
5896 2nd vec (E3): 1 5 9 13 17 21 25 29
5897 3rd vec (E2): 2 6 10 14 18 22 26 30
5898 4th vec (E4): 3 7 11 15 19 23 27 31. */
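/* As a concrete illustration (a sketch assuming an 8-element vector mode
   such as V8HI), the masks built by the power-of-two path below are

     extract_even: { 0, 2, 4, 6, 8, 10, 12, 14 }
     extract_odd:  { 1, 3, 5, 7, 9, 11, 13, 15 }

   each applied to the concatenation of a pair of input vectors.  */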
5899
5900 static void
5901 vect_permute_load_chain (vec<tree> dr_chain,
5902 unsigned int length,
5903 stmt_vec_info stmt_info,
5904 gimple_stmt_iterator *gsi,
5905 vec<tree> *result_chain)
5906 {
5907 tree data_ref, first_vect, second_vect;
5908 tree perm_mask_even, perm_mask_odd;
5909 tree perm3_mask_low, perm3_mask_high;
5910 gimple *perm_stmt;
5911 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5912 unsigned int i, j, log_length = exact_log2 (length);
5913
5914 result_chain->quick_grow (length);
5915 memcpy (result_chain->address (), dr_chain.address (),
5916 length * sizeof (tree));
5917
5918 if (length == 3)
5919 {
5920 /* vect_grouped_load_supported ensures that this is constant. */
5921 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5922 unsigned int k;
5923
5924 vec_perm_builder sel (nelt, nelt, 1);
5925 sel.quick_grow (nelt);
5926 vec_perm_indices indices;
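      /* As an illustration (assuming nelt == 8), the masks built for
	 k == 0 are

	   perm3_mask_low:  { 0, 3, 6, 9, 12, 15, _, _ }  (_ = don't care, 0)
	   perm3_mask_high: { 0, 1, 2, 3, 4, 5, 10, 13 }

	 so the first permutation gathers every third element of
	 dr_chain[0] and dr_chain[1], and the second keeps that result
	 and completes it from dr_chain[2], yielding elements
	 0, 3, 6, 9, 12, 15, 18, 21 of the interleaved data.  */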
5927 for (k = 0; k < 3; k++)
5928 {
5929 for (i = 0; i < nelt; i++)
5930 if (3 * i + k < 2 * nelt)
5931 sel[i] = 3 * i + k;
5932 else
5933 sel[i] = 0;
5934 indices.new_vector (sel, 2, nelt);
5935 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5936
5937 for (i = 0, j = 0; i < nelt; i++)
5938 if (3 * i + k < 2 * nelt)
5939 sel[i] = i;
5940 else
5941 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5942 indices.new_vector (sel, 2, nelt);
5943 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5944
5945 first_vect = dr_chain[0];
5946 second_vect = dr_chain[1];
5947
5948 /* Create interleaving stmt (low part of):
5949 low = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
5950 ...}> */
5951 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5952 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5953 second_vect, perm3_mask_low);
5954 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5955
5956 /* Create interleaving stmt (high part of):
5957 high = VEC_PERM_EXPR <first_vect, second_vect,
5958 {0, 1, ..., nelt + ((nelt + k) % 3), nelt + ((nelt + k) % 3) + 3, ...}> */
5959 first_vect = data_ref;
5960 second_vect = dr_chain[2];
5961 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5962 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5963 second_vect, perm3_mask_high);
5964 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5965 (*result_chain)[k] = data_ref;
5966 }
5967 }
5968 else
5969 {
5970 /* If length is not equal to 3 then only power of 2 is supported. */
5971 gcc_assert (pow2p_hwi (length));
5972
5973 /* The encoding has a single stepped pattern. */
5974 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5975 vec_perm_builder sel (nelt, 1, 3);
5976 sel.quick_grow (3);
5977 for (i = 0; i < 3; ++i)
5978 sel[i] = i * 2;
5979 vec_perm_indices indices (sel, 2, nelt);
5980 perm_mask_even = vect_gen_perm_mask_checked (vectype, indices);
5981
5982 for (i = 0; i < 3; ++i)
5983 sel[i] = i * 2 + 1;
5984 indices.new_vector (sel, 2, nelt);
5985 perm_mask_odd = vect_gen_perm_mask_checked (vectype, indices);
5986
5987 for (i = 0; i < log_length; i++)
5988 {
5989 for (j = 0; j < length; j += 2)
5990 {
5991 first_vect = dr_chain[j];
5992 second_vect = dr_chain[j+1];
5993
5994 /* data_ref = permute_even (first_data_ref, second_data_ref); */
5995 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
5996 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5997 first_vect, second_vect,
5998 perm_mask_even);
5999 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6000 (*result_chain)[j/2] = data_ref;
6001
6002 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
6003 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
6004 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6005 first_vect, second_vect,
6006 perm_mask_odd);
6007 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6008 (*result_chain)[j/2+length/2] = data_ref;
6009 }
6010 memcpy (dr_chain.address (), result_chain->address (),
6011 length * sizeof (tree));
6012 }
6013 }
6014 }
6015
6016 /* Function vect_shift_permute_load_chain.
6017
6018 Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate a
6019 sequence of stmts to reorder the input data accordingly.
6020 Return the final references for loads in RESULT_CHAIN.
6021 Return true if successful, false otherwise.
6022
6023 E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
6024 The input is 3 vectors each containing 8 elements. We assign a
6025 number to each element, the input sequence is:
6026
6027 1st vec: 0 1 2 3 4 5 6 7
6028 2nd vec: 8 9 10 11 12 13 14 15
6029 3rd vec: 16 17 18 19 20 21 22 23
6030
6031 The output sequence should be:
6032
6033 1st vec: 0 3 6 9 12 15 18 21
6034 2nd vec: 1 4 7 10 13 16 19 22
6035 3rd vec: 2 5 8 11 14 17 20 23
6036
6037 We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
6038
6039 First we shuffle all 3 vectors to get correct elements order:
6040
6041 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
6042 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
6043 3rd vec: (16 19 22) (17 20 23) (18 21)
6044
6045 Next we unite and shift vector 3 times:
6046
6047 1st step:
6048 shift right by 6 the concatenation of:
6049 "1st vec" and "2nd vec"
6050 ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
6051 "2nd vec" and "3rd vec"
6052 ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
6053 "3rd vec" and "1st vec"
6054 (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
6055 | New vectors |
6056
6057 So that now new vectors are:
6058
6059 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
6060 2nd vec: (10 13) (16 19 22) (17 20 23)
6061 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
6062
6063 2nd step:
6064 shift right by 5 the concatenation of:
6065 "1st vec" and "3rd vec"
6066 ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
6067 "2nd vec" and "1st vec"
6068 (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
6069 "3rd vec" and "2nd vec"
6070 (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
6071 | New vectors |
6072
6073 So that now new vectors are:
6074
6075 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
6076 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
6077 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
6078
6079 3rd step:
6080 shift right by 5 the concatenation of:
6081 "1st vec" and "1st vec"
6082 ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
6083 shift right by 3 the concatenation of:
6084 "2nd vec" and "2nd vec"
6085 (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
6086 | New vectors |
6087
6088 So that now all vectors are READY:
6089 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
6090 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
6091 3rd vec: ( 1 4 7) (10 13) (16 19 22)
6092
6093 This algorithm is faster than the one in vect_permute_load_chain if:
6094 1. "shift of a concatenation" is faster than a general permutation.
6095 This is usually so.
6096 2. The TARGET machine can't execute vector instructions in parallel.
6097 This is because each step of the algorithm depends on the previous one.
6098 The algorithm in vect_permute_load_chain is much more parallel.
6099
6100 The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
6101 */
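/* Note that each "shift right by N of a concatenation" above is realized
   below as a two-operand VEC_PERM_EXPR whose mask is a run of consecutive
   indices; e.g. (assuming nelt == 8) the shift right by 6 in the first
   step uses the mask { 6, 7, 8, 9, 10, 11, 12, 13 }.  */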
6102
6103 static bool
6104 vect_shift_permute_load_chain (vec<tree> dr_chain,
6105 unsigned int length,
6106 stmt_vec_info stmt_info,
6107 gimple_stmt_iterator *gsi,
6108 vec<tree> *result_chain)
6109 {
6110 tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
6111 tree perm2_mask1, perm2_mask2, perm3_mask;
6112 tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
6113 gimple *perm_stmt;
6114
6115 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6116 unsigned int i;
6117 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6118
6119 unsigned HOST_WIDE_INT nelt, vf;
6120 if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nelt)
6121 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&vf))
6122 /* Not supported for variable-length vectors. */
6123 return false;
6124
6125 vec_perm_builder sel (nelt, nelt, 1);
6126 sel.quick_grow (nelt);
6127
6128 result_chain->quick_grow (length);
6129 memcpy (result_chain->address (), dr_chain.address (),
6130 length * sizeof (tree));
6131
6132 if (pow2p_hwi (length) && vf > 4)
6133 {
6134 unsigned int j, log_length = exact_log2 (length);
6135 for (i = 0; i < nelt / 2; ++i)
6136 sel[i] = i * 2;
6137 for (i = 0; i < nelt / 2; ++i)
6138 sel[nelt / 2 + i] = i * 2 + 1;
6139 vec_perm_indices indices (sel, 2, nelt);
6140 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6141 {
6142 if (dump_enabled_p ())
6143 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6144 "shuffle of 2 fields structure is not "
6145 "supported by target\n");
6146 return false;
6147 }
6148 perm2_mask1 = vect_gen_perm_mask_checked (vectype, indices);
6149
6150 for (i = 0; i < nelt / 2; ++i)
6151 sel[i] = i * 2 + 1;
6152 for (i = 0; i < nelt / 2; ++i)
6153 sel[nelt / 2 + i] = i * 2;
6154 indices.new_vector (sel, 2, nelt);
6155 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6156 {
6157 if (dump_enabled_p ())
6158 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6159 "shuffle of 2 fields structure is not "
6160 "supported by target\n");
6161 return false;
6162 }
6163 perm2_mask2 = vect_gen_perm_mask_checked (vectype, indices);
6164
6165 /* Generating permutation constant to shift all elements.
6166 For vector length 8 it is {4 5 6 7 8 9 10 11}. */
6167 for (i = 0; i < nelt; i++)
6168 sel[i] = nelt / 2 + i;
6169 indices.new_vector (sel, 2, nelt);
6170 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6171 {
6172 if (dump_enabled_p ())
6173 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6174 "shift permutation is not supported by target\n");
6175 return false;
6176 }
6177 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6178
6179 /* Generating permutation constant to select vector from 2.
6180 For vector length 8 it is {0 1 2 3 12 13 14 15}. */
6181 for (i = 0; i < nelt / 2; i++)
6182 sel[i] = i;
6183 for (i = nelt / 2; i < nelt; i++)
6184 sel[i] = nelt + i;
6185 indices.new_vector (sel, 2, nelt);
6186 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6187 {
6188 if (dump_enabled_p ())
6189 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6190 "select is not supported by target\n");
6191 return false;
6192 }
6193 select_mask = vect_gen_perm_mask_checked (vectype, indices);
6194
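      /* As an illustration (assuming nelt == 8 and a pair of input vectors
	 e0..e7 and e8..e15 of a two-field structure): perm2_mask1 reorders
	 the first vector to { e0, e2, e4, e6, e1, e3, e5, e7 }, perm2_mask2
	 reorders the second to { e9, e11, e13, e15, e8, e10, e12, e14 },
	 shift1_mask then extracts the odd field
	 { e1, e3, e5, e7, e9, e11, e13, e15 } and select_mask the even
	 field { e0, e2, e4, e6, e8, e10, e12, e14 }.  */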
6195 for (i = 0; i < log_length; i++)
6196 {
6197 for (j = 0; j < length; j += 2)
6198 {
6199 first_vect = dr_chain[j];
6200 second_vect = dr_chain[j + 1];
6201
6202 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6203 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6204 first_vect, first_vect,
6205 perm2_mask1);
6206 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6207 vect[0] = data_ref;
6208
6209 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6210 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6211 second_vect, second_vect,
6212 perm2_mask2);
6213 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6214 vect[1] = data_ref;
6215
6216 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
6217 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6218 vect[0], vect[1], shift1_mask);
6219 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6220 (*result_chain)[j/2 + length/2] = data_ref;
6221
6222 data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
6223 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6224 vect[0], vect[1], select_mask);
6225 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6226 (*result_chain)[j/2] = data_ref;
6227 }
6228 memcpy (dr_chain.address (), result_chain->address (),
6229 length * sizeof (tree));
6230 }
6231 return true;
6232 }
6233 if (length == 3 && vf > 2)
6234 {
6235 unsigned int k = 0, l = 0;
6236
6237 /* Generating permutation constant to get all elements in right order.
6238 For vector length 8 it is {0 3 6 1 4 7 2 5}. */
6239 for (i = 0; i < nelt; i++)
6240 {
6241 if (3 * k + (l % 3) >= nelt)
6242 {
6243 k = 0;
6244 l += (3 - (nelt % 3));
6245 }
6246 sel[i] = 3 * k + (l % 3);
6247 k++;
6248 }
6249 vec_perm_indices indices (sel, 2, nelt);
6250 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6251 {
6252 if (dump_enabled_p ())
6253 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6254 "shuffle of 3 fields structure is not "
6255 "supported by target\n");
6256 return false;
6257 }
6258 perm3_mask = vect_gen_perm_mask_checked (vectype, indices);
6259
6260 /* Generating permutation constant to shift all elements.
6261 For vector length 8 it is {6 7 8 9 10 11 12 13}. */
6262 for (i = 0; i < nelt; i++)
6263 sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
6264 indices.new_vector (sel, 2, nelt);
6265 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6266 {
6267 if (dump_enabled_p ())
6268 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6269 "shift permutation is not supported by target\n");
6270 return false;
6271 }
6272 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6273
6274 /* Generating permutation constant to shift all elements.
6275 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6276 for (i = 0; i < nelt; i++)
6277 sel[i] = 2 * (nelt / 3) + 1 + i;
6278 indices.new_vector (sel, 2, nelt);
6279 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6280 {
6281 if (dump_enabled_p ())
6282 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6283 "shift permutation is not supported by target\n");
6284 return false;
6285 }
6286 shift2_mask = vect_gen_perm_mask_checked (vectype, indices);
6287
6288 /* Generating permutation constant to shift all elements.
6289 For vector length 8 it is {3 4 5 6 7 8 9 10}. */
6290 for (i = 0; i < nelt; i++)
6291 sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
6292 indices.new_vector (sel, 2, nelt);
6293 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6294 {
6295 if (dump_enabled_p ())
6296 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6297 "shift permutation is not supported by target\n");
6298 return false;
6299 }
6300 shift3_mask = vect_gen_perm_mask_checked (vectype, indices);
6301
6302 /* Generating permutation constant to shift all elements.
6303 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6304 for (i = 0; i < nelt; i++)
6305 sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
6306 indices.new_vector (sel, 2, nelt);
6307 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6308 {
6309 if (dump_enabled_p ())
6310 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6311 "shift permutation is not supported by target\n");
6312 return false;
6313 }
6314 shift4_mask = vect_gen_perm_mask_checked (vectype, indices);
6315
6316 for (k = 0; k < 3; k++)
6317 {
6318 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
6319 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6320 dr_chain[k], dr_chain[k],
6321 perm3_mask);
6322 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6323 vect[k] = data_ref;
6324 }
6325
6326 for (k = 0; k < 3; k++)
6327 {
6328 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
6329 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6330 vect[k % 3], vect[(k + 1) % 3],
6331 shift1_mask);
6332 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6333 vect_shift[k] = data_ref;
6334 }
6335
6336 for (k = 0; k < 3; k++)
6337 {
6338 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
6339 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6340 vect_shift[(4 - k) % 3],
6341 vect_shift[(3 - k) % 3],
6342 shift2_mask);
6343 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6344 vect[k] = data_ref;
6345 }
6346
6347 (*result_chain)[3 - (nelt % 3)] = vect[2];
6348
6349 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
6350 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
6351 vect[0], shift3_mask);
6352 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6353 (*result_chain)[nelt % 3] = data_ref;
6354
6355 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
6356 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
6357 vect[1], shift4_mask);
6358 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6359 (*result_chain)[0] = data_ref;
6360 return true;
6361 }
6362 return false;
6363 }
6364
6365 /* Function vect_transform_grouped_load.
6366
6367 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
6368 to perform their permutation and ascribe the result vectorized statements to
6369 to perform their permutation and ascribe the resulting vectorized statements to
6370 */
6371
6372 void
6373 vect_transform_grouped_load (stmt_vec_info stmt_info, vec<tree> dr_chain,
6374 int size, gimple_stmt_iterator *gsi)
6375 {
6376 machine_mode mode;
6377 vec<tree> result_chain = vNULL;
6378
6379 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
6380 RESULT_CHAIN is the output of vect_permute_load_chain; it contains permuted
6381 vectors that are ready for vector computation. */
6382 result_chain.create (size);
6383
6384 /* If the reassociation width for the vector type is 2 or greater, the target
6385 machine can execute 2 or more vector instructions in parallel. Otherwise
6386 try to get the chain for the load group using vect_shift_permute_load_chain. */
6387 mode = TYPE_MODE (STMT_VINFO_VECTYPE (stmt_info));
6388 if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
6389 || pow2p_hwi (size)
6390 || !vect_shift_permute_load_chain (dr_chain, size, stmt_info,
6391 gsi, &result_chain))
6392 vect_permute_load_chain (dr_chain, size, stmt_info, gsi, &result_chain);
6393 vect_record_grouped_load_vectors (stmt_info, result_chain);
6394 result_chain.release ();
6395 }
6396
6397 /* RESULT_CHAIN contains the output of a group of grouped loads that were
6398 generated as part of the vectorization of STMT_INFO. Assign the statement
6399 for each vector to the associated scalar statement. */
6400
6401 void
6402 vect_record_grouped_load_vectors (stmt_vec_info stmt_info,
6403 vec<tree> result_chain)
6404 {
6405 vec_info *vinfo = stmt_info->vinfo;
6406 stmt_vec_info first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
6407 unsigned int i, gap_count;
6408 tree tmp_data_ref;
6409
6410 /* Put a permuted data-ref in the VECTORIZED_STMT field.
6411 Since we scan the chain starting from its first node, their order
6412 corresponds to the order of data-refs in RESULT_CHAIN. */
6413 stmt_vec_info next_stmt_info = first_stmt_info;
6414 gap_count = 1;
6415 FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
6416 {
6417 if (!next_stmt_info)
6418 break;
6419
6420 /* Skip the gaps. Loads created for the gaps will be removed by dead
6421 code elimination pass later. No need to check for the first stmt in
6422 the group, since it always exists.
6423 DR_GROUP_GAP is the number of steps in elements from the previous
6424 access (if there is no gap DR_GROUP_GAP is 1). We skip loads that
6425 correspond to the gaps. */
6426 if (next_stmt_info != first_stmt_info
6427 && gap_count < DR_GROUP_GAP (next_stmt_info))
6428 {
6429 gap_count++;
6430 continue;
6431 }
6432
6433 /* ??? The following needs cleanup after the removal of
6434 DR_GROUP_SAME_DR_STMT. */
6435 if (next_stmt_info)
6436 {
6437 stmt_vec_info new_stmt_info = vinfo->lookup_def (tmp_data_ref);
6438 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
6439 copies, and we put the new vector statement in the first available
6440 RELATED_STMT. */
6441 if (!STMT_VINFO_VEC_STMT (next_stmt_info))
6442 STMT_VINFO_VEC_STMT (next_stmt_info) = new_stmt_info;
6443 else
6444 {
6445 stmt_vec_info prev_stmt_info
6446 = STMT_VINFO_VEC_STMT (next_stmt_info);
6447 stmt_vec_info rel_stmt_info
6448 = STMT_VINFO_RELATED_STMT (prev_stmt_info);
6449 while (rel_stmt_info)
6450 {
6451 prev_stmt_info = rel_stmt_info;
6452 rel_stmt_info = STMT_VINFO_RELATED_STMT (rel_stmt_info);
6453 }
6454
6455 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt_info;
6456 }
6457
6458 next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
6459 gap_count = 1;
6460 }
6461 }
6462 }
6463
6464 /* Function vect_can_force_dr_alignment_p.
6465
6466 Returns whether the alignment of a DECL can be forced to be aligned
6467 on an ALIGNMENT-bit boundary. */
6468
6469 bool
6470 vect_can_force_dr_alignment_p (const_tree decl, poly_uint64 alignment)
6471 {
6472 if (!VAR_P (decl))
6473 return false;
6474
6475 if (decl_in_symtab_p (decl)
6476 && !symtab_node::get (decl)->can_increase_alignment_p ())
6477 return false;
6478
6479 if (TREE_STATIC (decl))
6480 return (known_le (alignment,
6481 (unsigned HOST_WIDE_INT) MAX_OFILE_ALIGNMENT));
6482 else
6483 return (known_le (alignment, (unsigned HOST_WIDE_INT) MAX_STACK_ALIGNMENT));
6484 }
6485
6486
6487 /* Return whether the data reference DR_INFO is supported with respect to its
6488 alignment.
6489 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
6490 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even if
6491 alignment. */
6492
6493 enum dr_alignment_support
6494 vect_supportable_dr_alignment (dr_vec_info *dr_info,
6495 bool check_aligned_accesses)
6496 {
6497 data_reference *dr = dr_info->dr;
6498 stmt_vec_info stmt_info = dr_info->stmt;
6499 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6500 machine_mode mode = TYPE_MODE (vectype);
6501 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6502 class loop *vect_loop = NULL;
6503 bool nested_in_vect_loop = false;
6504
6505 if (aligned_access_p (dr_info) && !check_aligned_accesses)
6506 return dr_aligned;
6507
6508 /* For now assume all conditional loads/stores support unaligned
6509 access without any special code. */
6510 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
6511 if (gimple_call_internal_p (stmt)
6512 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
6513 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
6514 return dr_unaligned_supported;
6515
6516 if (loop_vinfo)
6517 {
6518 vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
6519 nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt_info);
6520 }
6521
6522 /* Possibly unaligned access. */
6523
6524 /* We can choose between using the implicit realignment scheme (generating
6525 a misaligned_move stmt) and the explicit realignment scheme (generating
6526 aligned loads with a REALIGN_LOAD). There are two variants to the
6527 explicit realignment scheme: optimized, and unoptimized.
6528 We can optimize the realignment only if the step between consecutive
6529 vector loads is equal to the vector size. Since the vector memory
6530 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
6531 is guaranteed that the misalignment amount remains the same throughout the
6532 execution of the vectorized loop. Therefore, we can create the
6533 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
6534 at the loop preheader.
6535
6536 However, in the case of outer-loop vectorization, when vectorizing a
6537 memory access in the inner-loop nested within the LOOP that is now being
6538 vectorized, while it is guaranteed that the misalignment of the
6539 vectorized memory access will remain the same in different outer-loop
6540 iterations, it is *not* guaranteed that it will remain the same throughout
6541 the execution of the inner-loop. This is because the inner-loop advances
6542 with the original scalar step (and not in steps of VS). If the inner-loop
6543 step happens to be a multiple of VS, then the misalignment remains fixed
6544 and we can use the optimized realignment scheme. For example:
6545
6546 for (i=0; i<N; i++)
6547 for (j=0; j<M; j++)
6548 s += a[i+j];
6549
6550 When vectorizing the i-loop in the above example, the step between
6551 consecutive vector loads is 1, and so the misalignment does not remain
6552 fixed across the execution of the inner-loop, and the realignment cannot
6553 be optimized (as illustrated in the following pseudo vectorized loop):
6554
6555 for (i=0; i<N; i+=4)
6556 for (j=0; j<M; j++){
6557 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
6558 // when j is {0,1,2,3,4,5,6,7,...} respectively.
6559 // (assuming that we start from an aligned address).
6560 }
6561
6562 We therefore have to use the unoptimized realignment scheme:
6563
6564 for (i=0; i<N; i+=4)
6565 for (j=k; j<M; j+=4)
6566 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
6567 // that the misalignment of the initial address is
6568 // 0).
6569
6570 The loop can then be vectorized as follows:
6571
6572 for (k=0; k<4; k++){
6573 rt = get_realignment_token (&vp[k]);
6574 for (i=0; i<N; i+=4){
6575 v1 = vp[i+k];
6576 for (j=k; j<M; j+=4){
6577 v2 = vp[i+j+VS-1];
6578 va = REALIGN_LOAD <v1,v2,rt>;
6579 vs += va;
6580 v1 = v2;
6581 }
6582 }
6583 } */
6584
6585 if (DR_IS_READ (dr))
6586 {
6587 bool is_packed = false;
6588 tree type = (TREE_TYPE (DR_REF (dr)));
6589
6590 if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
6591 && (!targetm.vectorize.builtin_mask_for_load
6592 || targetm.vectorize.builtin_mask_for_load ()))
6593 {
6594 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6595
6596 /* If we are doing SLP then the accesses need not have the
6597 same alignment; instead it depends on the SLP group size. */
6598 if (loop_vinfo
6599 && STMT_SLP_TYPE (stmt_info)
6600 && !multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6601 * (DR_GROUP_SIZE
6602 (DR_GROUP_FIRST_ELEMENT (stmt_info))),
6603 TYPE_VECTOR_SUBPARTS (vectype)))
6604 ;
6605 else if (!loop_vinfo
6606 || (nested_in_vect_loop
6607 && maybe_ne (TREE_INT_CST_LOW (DR_STEP (dr)),
6608 GET_MODE_SIZE (TYPE_MODE (vectype)))))
6609 return dr_explicit_realign;
6610 else
6611 return dr_explicit_realign_optimized;
6612 }
6613 if (!known_alignment_for_access_p (dr_info))
6614 is_packed = not_size_aligned (DR_REF (dr));
6615
6616 if (targetm.vectorize.support_vector_misalignment
6617 (mode, type, DR_MISALIGNMENT (dr_info), is_packed))
6618 /* Can't software pipeline the loads, but can at least do them. */
6619 return dr_unaligned_supported;
6620 }
6621 else
6622 {
6623 bool is_packed = false;
6624 tree type = (TREE_TYPE (DR_REF (dr)));
6625
6626 if (!known_alignment_for_access_p (dr_info))
6627 is_packed = not_size_aligned (DR_REF (dr));
6628
6629 if (targetm.vectorize.support_vector_misalignment
6630 (mode, type, DR_MISALIGNMENT (dr_info), is_packed))
6631 return dr_unaligned_supported;
6632 }
6633
6634 /* Unsupported. */
6635 return dr_unaligned_unsupported;
6636 }