1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2 Copyright (C) 2003-2020 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "predict.h"
31 #include "memmodel.h"
32 #include "tm_p.h"
33 #include "ssa.h"
34 #include "optabs-tree.h"
35 #include "cgraph.h"
36 #include "dumpfile.h"
37 #include "alias.h"
38 #include "fold-const.h"
39 #include "stor-layout.h"
40 #include "tree-eh.h"
41 #include "gimplify.h"
42 #include "gimple-iterator.h"
43 #include "gimplify-me.h"
44 #include "tree-ssa-loop-ivopts.h"
45 #include "tree-ssa-loop-manip.h"
46 #include "tree-ssa-loop.h"
47 #include "cfgloop.h"
48 #include "tree-scalar-evolution.h"
49 #include "tree-vectorizer.h"
50 #include "expr.h"
51 #include "builtins.h"
52 #include "tree-cfg.h"
53 #include "tree-hash-traits.h"
54 #include "vec-perm-indices.h"
55 #include "internal-fn.h"
56
57 /* Return true if load- or store-lanes optab OPTAB is implemented for
58 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
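/* As an illustrative example (the exact modes depend on the target):
   for COUNT == 2 and a 128-bit VECTYPE, ARRAY_MODE is a 256-bit mode
   holding both vectors, and the load-lanes form corresponds to
   de-interleaving instructions such as AArch64 LD2, which fill two
   vector registers from consecutive memory in one operation.  */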
59
60 static bool
61 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
62 tree vectype, unsigned HOST_WIDE_INT count)
63 {
64 machine_mode mode, array_mode;
65 bool limit_p;
66
67 mode = TYPE_MODE (vectype);
68 if (!targetm.array_mode (mode, count).exists (&array_mode))
69 {
70 poly_uint64 bits = count * GET_MODE_BITSIZE (mode);
71 limit_p = !targetm.array_mode_supported_p (mode, count);
72 if (!int_mode_for_size (bits, limit_p).exists (&array_mode))
73 {
74 if (dump_enabled_p ())
75 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
76 "no array mode for %s[%wu]\n",
77 GET_MODE_NAME (mode), count);
78 return false;
79 }
80 }
81
82 if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
83 {
84 if (dump_enabled_p ())
85 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
86 "cannot use %s<%s><%s>\n", name,
87 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
88 return false;
89 }
90
91 if (dump_enabled_p ())
92 dump_printf_loc (MSG_NOTE, vect_location,
93 "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
94 GET_MODE_NAME (mode));
95
96 return true;
97 }
98
99
100 /* Return the smallest scalar part of STMT_INFO.
101 This is used to determine the vectype of the stmt. We generally set the
102 vectype according to the type of the result (lhs). For stmts whose
103 result-type is different than the type of the arguments (e.g., demotion,
104 promotion), vectype will be reset appropriately (later). Note that we have
105 to visit the smallest datatype in this function, because that determines the
106 VF. If the smallest datatype in the loop is present only as the rhs of a
107 promotion operation, looking only at the lhs would miss it.
108 Such a case, where a variable of this datatype does not appear in the lhs
109 anywhere in the loop, can only occur if it's an invariant: e.g.:
110 'int_x = (int) short_inv', which we'd expect to have been optimized away by
111 invariant motion. However, we cannot rely on invariant motion to always
112 take invariants out of the loop, and so in the case of promotion we also
113 have to check the rhs.
114 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
115 types. */
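/* As an illustrative example, for the promotion statement

     int_x = (int) short_inv;

   the lhs type is 4 bytes and the rhs type is 2 bytes, so this function
   returns the "short" type with *LHS_SIZE_UNIT == 4 and
   *RHS_SIZE_UNIT == 2; assuming 128-bit vectors, the short type then
   implies a VF of 8 rather than the 4 that the lhs alone would suggest.  */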
116
117 tree
118 vect_get_smallest_scalar_type (stmt_vec_info stmt_info,
119 HOST_WIDE_INT *lhs_size_unit,
120 HOST_WIDE_INT *rhs_size_unit)
121 {
122 tree scalar_type = gimple_expr_type (stmt_info->stmt);
123 HOST_WIDE_INT lhs, rhs;
124
125 /* During the analysis phase, this function is called on arbitrary
126 statements that might not have scalar results. */
127 if (!tree_fits_uhwi_p (TYPE_SIZE_UNIT (scalar_type)))
128 return scalar_type;
129
130 lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
131
132 gassign *assign = dyn_cast <gassign *> (stmt_info->stmt);
133 if (assign
134 && (gimple_assign_cast_p (assign)
135 || gimple_assign_rhs_code (assign) == DOT_PROD_EXPR
136 || gimple_assign_rhs_code (assign) == WIDEN_SUM_EXPR
137 || gimple_assign_rhs_code (assign) == WIDEN_MULT_EXPR
138 || gimple_assign_rhs_code (assign) == WIDEN_LSHIFT_EXPR
139 || gimple_assign_rhs_code (assign) == FLOAT_EXPR))
140 {
141 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (assign));
142
143 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
144 if (rhs < lhs)
145 scalar_type = rhs_type;
146 }
147 else if (gcall *call = dyn_cast <gcall *> (stmt_info->stmt))
148 {
149 unsigned int i = 0;
150 if (gimple_call_internal_p (call))
151 {
152 internal_fn ifn = gimple_call_internal_fn (call);
153 if (internal_load_fn_p (ifn) || internal_store_fn_p (ifn))
154 /* gimple_expr_type already picked the type of the loaded
155 or stored data. */
156 i = ~0U;
157 else if (internal_fn_mask_index (ifn) == 0)
158 i = 1;
159 }
160 if (i < gimple_call_num_args (call))
161 {
162 tree rhs_type = TREE_TYPE (gimple_call_arg (call, i));
163 if (tree_fits_uhwi_p (TYPE_SIZE_UNIT (rhs_type)))
164 {
165 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
166 if (rhs < lhs)
167 scalar_type = rhs_type;
168 }
169 }
170 }
171
172 *lhs_size_unit = lhs;
173 *rhs_size_unit = rhs;
174 return scalar_type;
175 }
176
177
178 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
179 tested at run-time. Return TRUE if DDR was successfully inserted.
180 Return false if versioning is not supported. */
181
182 static opt_result
183 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
184 {
185 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
186
187 if ((unsigned) param_vect_max_version_for_alias_checks == 0)
188 return opt_result::failure_at (vect_location,
189 "will not create alias checks, as"
190 " --param vect-max-version-for-alias-checks"
191 " == 0\n");
192
193 opt_result res
194 = runtime_alias_check_p (ddr, loop,
195 optimize_loop_nest_for_speed_p (loop));
196 if (!res)
197 return res;
198
199 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
200 return opt_result::success ();
201 }
202
203 /* Record that loop LOOP_VINFO needs to check that VALUE is nonzero. */
204
205 static void
206 vect_check_nonzero_value (loop_vec_info loop_vinfo, tree value)
207 {
208 vec<tree> checks = LOOP_VINFO_CHECK_NONZERO (loop_vinfo);
209 for (unsigned int i = 0; i < checks.length(); ++i)
210 if (checks[i] == value)
211 return;
212
213 if (dump_enabled_p ())
214 dump_printf_loc (MSG_NOTE, vect_location,
215 "need run-time check that %T is nonzero\n",
216 value);
217 LOOP_VINFO_CHECK_NONZERO (loop_vinfo).safe_push (value);
218 }
219
220 /* Return true if we know that the order of vectorized DR_INFO_A and
221 vectorized DR_INFO_B will be the same as the order of DR_INFO_A and
222 DR_INFO_B. At least one of the accesses is a write. */
223
224 static bool
225 vect_preserves_scalar_order_p (dr_vec_info *dr_info_a, dr_vec_info *dr_info_b)
226 {
227 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
228 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
229
230 /* Single statements are always kept in their original order. */
231 if (!STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
232 && !STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
233 return true;
234
235 /* STMT_A and STMT_B belong to overlapping groups. All loads in a
236 SLP group are emitted at the position of the last scalar load and
237 all loads in an interleaving group are emitted at the position
238 of the first scalar load.
239 Stores in a group are emitted at the position of the last scalar store.
240 Compute that position and check whether the resulting order matches
241 the current one.
242 We have not yet decided between SLP and interleaving so we have
243 to conservatively assume both. */
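/* As an illustrative example, for the scalar sequence

     a[i] = x;      <- first store of a two-element store group
     y = a[i];      <- ungrouped load
     a[i+1] = z;    <- last store of the group

   the vectorized store is emitted at the position of the last scalar
   store, i.e. after the load, whereas the scalar store of a[i] preceded
   it; for the pair (store of a[i], load of a[i]) the order is therefore
   not preserved and this function returns false.  */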
244 stmt_vec_info il_a;
245 stmt_vec_info last_a = il_a = DR_GROUP_FIRST_ELEMENT (stmtinfo_a);
246 if (last_a)
247 {
248 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (last_a); s;
249 s = DR_GROUP_NEXT_ELEMENT (s))
250 last_a = get_later_stmt (last_a, s);
251 if (!DR_IS_WRITE (STMT_VINFO_DATA_REF (stmtinfo_a)))
252 {
253 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_a); s;
254 s = DR_GROUP_NEXT_ELEMENT (s))
255 if (get_later_stmt (il_a, s) == il_a)
256 il_a = s;
257 }
258 else
259 il_a = last_a;
260 }
261 else
262 last_a = il_a = stmtinfo_a;
263 stmt_vec_info il_b;
264 stmt_vec_info last_b = il_b = DR_GROUP_FIRST_ELEMENT (stmtinfo_b);
265 if (last_b)
266 {
267 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (last_b); s;
268 s = DR_GROUP_NEXT_ELEMENT (s))
269 last_b = get_later_stmt (last_b, s);
270 if (!DR_IS_WRITE (STMT_VINFO_DATA_REF (stmtinfo_b)))
271 {
272 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_b); s;
273 s = DR_GROUP_NEXT_ELEMENT (s))
274 if (get_later_stmt (il_b, s) == il_b)
275 il_b = s;
276 }
277 else
278 il_b = last_b;
279 }
280 else
281 last_b = il_b = stmtinfo_b;
282 bool a_after_b = (get_later_stmt (stmtinfo_a, stmtinfo_b) == stmtinfo_a);
283 return (/* SLP */
284 (get_later_stmt (last_a, last_b) == last_a) == a_after_b
285 /* Interleaving */
286 && (get_later_stmt (il_a, il_b) == il_a) == a_after_b
287 /* Mixed */
288 && (get_later_stmt (il_a, last_b) == il_a) == a_after_b
289 && (get_later_stmt (last_a, il_b) == last_a) == a_after_b);
290 }
291
292 /* A subroutine of vect_analyze_data_ref_dependence. Handle
293 a DDR_COULD_BE_INDEPENDENT_P relation DDR that has a known set of dependence
294 distances. These distances are conservatively correct but they don't
295 reflect a guaranteed dependence.
296
297 Return true if this function does all the work necessary to avoid
298 an alias or false if the caller should use the dependence distances
299 to limit the vectorization factor in the usual way. LOOP_DEPTH is
300 the depth of the loop described by LOOP_VINFO and the other arguments
301 are as for vect_analyze_data_ref_dependence. */
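/* As an illustrative example (A and B are hypothetical pointer
   parameters), in

     for (i = 0; i < n; i++)
       a->x[i] = b->x[i + 4];

   the recorded distance is only meaningful under the assumption that A
   and B refer to the same object; if they do not, the accesses are
   independent, which is what the runtime alias check below can
   establish.  */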
302
303 static bool
304 vect_analyze_possibly_independent_ddr (data_dependence_relation *ddr,
305 loop_vec_info loop_vinfo,
306 int loop_depth, unsigned int *max_vf)
307 {
308 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
309 lambda_vector dist_v;
310 unsigned int i;
311 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
312 {
313 int dist = dist_v[loop_depth];
314 if (dist != 0 && !(dist > 0 && DDR_REVERSED_P (ddr)))
315 {
316 /* If the user asserted safelen >= DIST consecutive iterations
317 can be executed concurrently, assume independence.
318
319 ??? An alternative would be to add the alias check even
320 in this case, and vectorize the fallback loop with the
321 maximum VF set to safelen. However, if the user has
322 explicitly given a length, it's less likely that that
323 would be a win. */
324 if (loop->safelen >= 2 && abs_hwi (dist) <= loop->safelen)
325 {
326 if ((unsigned int) loop->safelen < *max_vf)
327 *max_vf = loop->safelen;
328 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
329 continue;
330 }
331
332 /* For dependence distances of 2 or more, we have the option
333 of limiting VF or checking for an alias at runtime.
334 Prefer to check at runtime if we can, to avoid limiting
335 the VF unnecessarily when the bases are in fact independent.
336
337 Note that the alias checks will be removed if the VF ends up
338 being small enough. */
339 dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (DDR_A (ddr));
340 dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (DDR_B (ddr));
341 return (!STMT_VINFO_GATHER_SCATTER_P (dr_info_a->stmt)
342 && !STMT_VINFO_GATHER_SCATTER_P (dr_info_b->stmt)
343 && vect_mark_for_runtime_alias_test (ddr, loop_vinfo));
344 }
345 }
346 return true;
347 }
348
349
350 /* Function vect_analyze_data_ref_dependence.
351
352 Return an opt_result failure if there (might) exist a dependence
353 between memory-reference DRA and memory-reference DRB that cannot
354 be handled. If the dependence can be checked at run time by
355 versioning for alias, record the check in LOOP_VINFO and return
356 success. Adjust *MAX_VF according to any known dependence
357 distance. */
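/* As an illustrative example, in

     for (i = 0; i < n; i++)
       a[i + 3] = a[i] + 1;

   the value stored in iteration I is read again in iteration I + 3, a
   dependence distance of 3, so at most three consecutive iterations can
   be executed as one vector and *MAX_VF is capped at 3.  */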
358
359 static opt_result
360 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
361 loop_vec_info loop_vinfo,
362 unsigned int *max_vf)
363 {
364 unsigned int i;
365 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
366 struct data_reference *dra = DDR_A (ddr);
367 struct data_reference *drb = DDR_B (ddr);
368 dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (dra);
369 dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (drb);
370 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
371 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
372 lambda_vector dist_v;
373 unsigned int loop_depth;
374
375 /* In loop analysis all data references should be vectorizable. */
376 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
377 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
378 gcc_unreachable ();
379
380 /* Independent data accesses. */
381 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
382 return opt_result::success ();
383
384 if (dra == drb
385 || (DR_IS_READ (dra) && DR_IS_READ (drb)))
386 return opt_result::success ();
387
388 /* We do not have to consider dependences between accesses that belong
389 to the same group, unless the stride could be smaller than the
390 group size. */
391 if (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
392 && (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
393 == DR_GROUP_FIRST_ELEMENT (stmtinfo_b))
394 && !STMT_VINFO_STRIDED_P (stmtinfo_a))
395 return opt_result::success ();
396
397 /* Even if we have an anti-dependence then, as the vectorized loop covers at
398 least two scalar iterations, there is always also a true dependence.
399 As the vectorizer does not re-order loads and stores we can ignore
400 the anti-dependence if TBAA can disambiguate both DRs similar to the
401 case with known negative distance anti-dependences (positive
402 distance anti-dependences would violate TBAA constraints). */
403 if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
404 || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
405 && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
406 get_alias_set (DR_REF (drb))))
407 return opt_result::success ();
408
409 /* Unknown data dependence. */
410 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
411 {
412 /* If user asserted safelen consecutive iterations can be
413 executed concurrently, assume independence. */
414 if (loop->safelen >= 2)
415 {
416 if ((unsigned int) loop->safelen < *max_vf)
417 *max_vf = loop->safelen;
418 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
419 return opt_result::success ();
420 }
421
422 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
423 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
424 return opt_result::failure_at
425 (stmtinfo_a->stmt,
426 "versioning for alias not supported for: "
427 "can't determine dependence between %T and %T\n",
428 DR_REF (dra), DR_REF (drb));
429
430 if (dump_enabled_p ())
431 dump_printf_loc (MSG_MISSED_OPTIMIZATION, stmtinfo_a->stmt,
432 "versioning for alias required: "
433 "can't determine dependence between %T and %T\n",
434 DR_REF (dra), DR_REF (drb));
435
436 /* Add to list of ddrs that need to be tested at run-time. */
437 return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
438 }
439
440 /* Known data dependence. */
441 if (DDR_NUM_DIST_VECTS (ddr) == 0)
442 {
443 /* If user asserted safelen consecutive iterations can be
444 executed concurrently, assume independence. */
445 if (loop->safelen >= 2)
446 {
447 if ((unsigned int) loop->safelen < *max_vf)
448 *max_vf = loop->safelen;
449 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
450 return opt_result::success ();
451 }
452
453 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
454 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
455 return opt_result::failure_at
456 (stmtinfo_a->stmt,
457 "versioning for alias not supported for: "
458 "bad dist vector for %T and %T\n",
459 DR_REF (dra), DR_REF (drb));
460
461 if (dump_enabled_p ())
462 dump_printf_loc (MSG_MISSED_OPTIMIZATION, stmtinfo_a->stmt,
463 "versioning for alias required: "
464 "bad dist vector for %T and %T\n",
465 DR_REF (dra), DR_REF (drb));
466 /* Add to list of ddrs that need to be tested at run-time. */
467 return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
468 }
469
470 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
471
472 if (DDR_COULD_BE_INDEPENDENT_P (ddr)
473 && vect_analyze_possibly_independent_ddr (ddr, loop_vinfo,
474 loop_depth, max_vf))
475 return opt_result::success ();
476
477 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
478 {
479 int dist = dist_v[loop_depth];
480
481 if (dump_enabled_p ())
482 dump_printf_loc (MSG_NOTE, vect_location,
483 "dependence distance = %d.\n", dist);
484
485 if (dist == 0)
486 {
487 if (dump_enabled_p ())
488 dump_printf_loc (MSG_NOTE, vect_location,
489 "dependence distance == 0 between %T and %T\n",
490 DR_REF (dra), DR_REF (drb));
491
492 /* When we perform grouped accesses and perform implicit CSE
493 by detecting equal accesses and doing disambiguation with
494 runtime alias tests like for
495 .. = a[i];
496 .. = a[i+1];
497 a[i] = ..;
498 a[i+1] = ..;
499 *p = ..;
500 .. = a[i];
501 .. = a[i+1];
502 where we will end up loading { a[i], a[i+1] } once, make
503 sure that inserting group loads before the first load and
504 stores after the last store will do the right thing.
505 Similar for groups like
506 a[i] = ...;
507 ... = a[i];
508 a[i+1] = ...;
509 where loads from the group interleave with the store. */
510 if (!vect_preserves_scalar_order_p (dr_info_a, dr_info_b))
511 return opt_result::failure_at (stmtinfo_a->stmt,
512 "READ_WRITE dependence"
513 " in interleaving.\n");
514
515 if (loop->safelen < 2)
516 {
517 tree indicator = dr_zero_step_indicator (dra);
518 if (!indicator || integer_zerop (indicator))
519 return opt_result::failure_at (stmtinfo_a->stmt,
520 "access also has a zero step\n");
521 else if (TREE_CODE (indicator) != INTEGER_CST)
522 vect_check_nonzero_value (loop_vinfo, indicator);
523 }
524 continue;
525 }
526
527 if (dist > 0 && DDR_REVERSED_P (ddr))
528 {
529 /* If DDR_REVERSED_P the order of the data-refs in DDR was
530 reversed (to make distance vector positive), and the actual
531 distance is negative. */
532 if (dump_enabled_p ())
533 dump_printf_loc (MSG_NOTE, vect_location,
534 "dependence distance negative.\n");
535 /* When doing outer loop vectorization, we need to check if there is
536 a backward dependence at the inner loop level if the dependence
537 at the outer loop is reversed. See PR81740. */
538 if (nested_in_vect_loop_p (loop, stmtinfo_a)
539 || nested_in_vect_loop_p (loop, stmtinfo_b))
540 {
541 unsigned inner_depth = index_in_loop_nest (loop->inner->num,
542 DDR_LOOP_NEST (ddr));
543 if (dist_v[inner_depth] < 0)
544 return opt_result::failure_at (stmtinfo_a->stmt,
545 "not vectorized, dependence "
546 "between data-refs %T and %T\n",
547 DR_REF (dra), DR_REF (drb));
548 }
549 /* Record a negative dependence distance to later limit the
550 amount of stmt copying / unrolling we can perform.
551 Only need to handle read-after-write dependence. */
552 if (DR_IS_READ (drb)
553 && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
554 || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
555 STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
556 continue;
557 }
558
559 unsigned int abs_dist = abs (dist);
560 if (abs_dist >= 2 && abs_dist < *max_vf)
561 {
562 /* The dependence distance requires reduction of the maximal
563 vectorization factor. */
564 *max_vf = abs_dist;
565 if (dump_enabled_p ())
566 dump_printf_loc (MSG_NOTE, vect_location,
567 "adjusting maximal vectorization factor to %i\n",
568 *max_vf);
569 }
570
571 if (abs_dist >= *max_vf)
572 {
573 /* Dependence distance does not create dependence, as far as
574 vectorization is concerned, in this case. */
575 if (dump_enabled_p ())
576 dump_printf_loc (MSG_NOTE, vect_location,
577 "dependence distance >= VF.\n");
578 continue;
579 }
580
581 return opt_result::failure_at (stmtinfo_a->stmt,
582 "not vectorized, possible dependence "
583 "between data-refs %T and %T\n",
584 DR_REF (dra), DR_REF (drb));
585 }
586
587 return opt_result::success ();
588 }
589
590 /* Function vect_analyze_data_ref_dependences.
591
592 Examine all the data references in the loop, and make sure there do not
593 exist any data dependences between them. Set *MAX_VF according to
594 the maximum vectorization factor the data dependences allow. */
595
596 opt_result
597 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo,
598 unsigned int *max_vf)
599 {
600 unsigned int i;
601 struct data_dependence_relation *ddr;
602
603 DUMP_VECT_SCOPE ("vect_analyze_data_ref_dependences");
604
605 if (!LOOP_VINFO_DDRS (loop_vinfo).exists ())
606 {
607 LOOP_VINFO_DDRS (loop_vinfo)
608 .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
609 * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
610 /* We need read-read dependences to compute
611 STMT_VINFO_SAME_ALIGN_REFS. */
612 bool res = compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
613 &LOOP_VINFO_DDRS (loop_vinfo),
614 LOOP_VINFO_LOOP_NEST (loop_vinfo),
615 true);
616 gcc_assert (res);
617 }
618
619 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
620
621 /* For epilogues we either have no aliases or alias versioning
622 was applied to the original loop. Therefore we may just get max_vf
623 using the VF of the original loop. */
624 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
625 *max_vf = LOOP_VINFO_ORIG_MAX_VECT_FACTOR (loop_vinfo);
626 else
627 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
628 {
629 opt_result res
630 = vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf);
631 if (!res)
632 return res;
633 }
634
635 return opt_result::success ();
636 }
637
638
639 /* Function vect_slp_analyze_data_ref_dependence.
640
641 Return TRUE if there (might) exist a dependence between the memory
642 references DRA and DRB of VINFO that prohibits reordering their
643 statements; return FALSE if they are known to be independent or the
644 dependence is harmless (e.g. read-read or within one interleaving chain). */
645
646 static bool
647 vect_slp_analyze_data_ref_dependence (vec_info *vinfo,
648 struct data_dependence_relation *ddr)
649 {
650 struct data_reference *dra = DDR_A (ddr);
651 struct data_reference *drb = DDR_B (ddr);
652 dr_vec_info *dr_info_a = vinfo->lookup_dr (dra);
653 dr_vec_info *dr_info_b = vinfo->lookup_dr (drb);
654
655 /* We need to check dependences of statements marked as unvectorizable
656 as well, they still can prohibit vectorization. */
657
658 /* Independent data accesses. */
659 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
660 return false;
661
662 if (dra == drb)
663 return false;
664
665 /* Read-read is OK. */
666 if (DR_IS_READ (dra) && DR_IS_READ (drb))
667 return false;
668
669 /* If dra and drb are part of the same interleaving chain consider
670 them independent. */
671 if (STMT_VINFO_GROUPED_ACCESS (dr_info_a->stmt)
672 && (DR_GROUP_FIRST_ELEMENT (dr_info_a->stmt)
673 == DR_GROUP_FIRST_ELEMENT (dr_info_b->stmt)))
674 return false;
675
676 /* Unknown data dependence. */
677 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
678 {
679 if (dump_enabled_p ())
680 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
681 "can't determine dependence between %T and %T\n",
682 DR_REF (dra), DR_REF (drb));
683 }
684 else if (dump_enabled_p ())
685 dump_printf_loc (MSG_NOTE, vect_location,
686 "determined dependence between %T and %T\n",
687 DR_REF (dra), DR_REF (drb));
688
689 return true;
690 }
691
692
693 /* Analyze dependences involved in the transform of SLP NODE. STORES
694 contain the vector of scalar stores of this instance if we are
695 disambiguating the loads. */
696
697 static bool
698 vect_slp_analyze_node_dependences (vec_info *vinfo, slp_tree node,
699 vec<stmt_vec_info> stores,
700 stmt_vec_info last_store_info)
701 {
702 /* This walks over all stmts involved in the SLP load/store done
703 in NODE verifying we can sink them up to the last stmt in the
704 group. */
705 stmt_vec_info last_access_info = vect_find_last_scalar_stmt_in_slp (node);
706 for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (node).length (); ++k)
707 {
708 stmt_vec_info access_info = SLP_TREE_SCALAR_STMTS (node)[k];
709 if (access_info == last_access_info)
710 continue;
711 data_reference *dr_a = STMT_VINFO_DATA_REF (access_info);
712 ao_ref ref;
713 bool ref_initialized_p = false;
714 for (gimple_stmt_iterator gsi = gsi_for_stmt (access_info->stmt);
715 gsi_stmt (gsi) != last_access_info->stmt; gsi_next (&gsi))
716 {
717 gimple *stmt = gsi_stmt (gsi);
718 if (! gimple_vuse (stmt)
719 || (DR_IS_READ (dr_a) && ! gimple_vdef (stmt)))
720 continue;
721
722 /* If we couldn't record a (single) data reference for this
723 stmt we have to resort to the alias oracle. */
724 stmt_vec_info stmt_info = vinfo->lookup_stmt (stmt);
725 data_reference *dr_b = STMT_VINFO_DATA_REF (stmt_info);
726 if (!dr_b)
727 {
728 /* We are moving a store or sinking a load - this means
729 we cannot use TBAA for disambiguation. */
730 if (!ref_initialized_p)
731 ao_ref_init (&ref, DR_REF (dr_a));
732 if (stmt_may_clobber_ref_p_1 (stmt, &ref, false)
733 || ref_maybe_used_by_stmt_p (stmt, &ref, false))
734 return false;
735 continue;
736 }
737
738 bool dependent = false;
739 /* If we run into a store of this same instance (we've just
740 marked those) then delay dependence checking until we run
741 into the last store because this is where it will have
742 been sunk to (and we verify if we can do that as well). */
743 if (gimple_visited_p (stmt))
744 {
745 if (stmt_info != last_store_info)
746 continue;
747 unsigned i;
748 stmt_vec_info store_info;
749 FOR_EACH_VEC_ELT (stores, i, store_info)
750 {
751 data_reference *store_dr = STMT_VINFO_DATA_REF (store_info);
752 ddr_p ddr = initialize_data_dependence_relation
753 (dr_a, store_dr, vNULL);
754 dependent
755 = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
756 free_dependence_relation (ddr);
757 if (dependent)
758 break;
759 }
760 }
761 else
762 {
763 ddr_p ddr = initialize_data_dependence_relation (dr_a,
764 dr_b, vNULL);
765 dependent = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
766 free_dependence_relation (ddr);
767 }
768 if (dependent)
769 return false;
770 }
771 }
772 return true;
773 }
774
775
776 /* Function vect_slp_analyze_instance_dependence.
777 
778 Examine all the data references of SLP instance INSTANCE in VINFO and
779 return TRUE if no data dependence prevents emitting its loads and
780 stores at the positions chosen for the vectorized statements. */
781
782 bool
783 vect_slp_analyze_instance_dependence (vec_info *vinfo, slp_instance instance)
784 {
785 DUMP_VECT_SCOPE ("vect_slp_analyze_instance_dependence");
786
787 /* The stores of this instance are at the root of the SLP tree. */
788 slp_tree store = SLP_INSTANCE_TREE (instance);
789 if (! STMT_VINFO_DATA_REF (SLP_TREE_SCALAR_STMTS (store)[0]))
790 store = NULL;
791
792 /* Verify we can sink stores to the vectorized stmt insert location. */
793 stmt_vec_info last_store_info = NULL;
794 if (store)
795 {
796 if (! vect_slp_analyze_node_dependences (vinfo, store, vNULL, NULL))
797 return false;
798
799 /* Mark stores in this instance and remember the last one. */
800 last_store_info = vect_find_last_scalar_stmt_in_slp (store);
801 for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (store).length (); ++k)
802 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, true);
803 }
804
805 bool res = true;
806
807 /* Verify we can sink loads to the vectorized stmt insert location,
808 special-casing stores of this instance. */
809 slp_tree load;
810 unsigned int i;
811 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load)
812 if (! vect_slp_analyze_node_dependences (vinfo, load,
813 store
814 ? SLP_TREE_SCALAR_STMTS (store)
815 : vNULL, last_store_info))
816 {
817 res = false;
818 break;
819 }
820
821 /* Unset the visited flag. */
822 if (store)
823 for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (store).length (); ++k)
824 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, false);
825
826 return res;
827 }
828
829 /* Record the base alignment guarantee given by DRB, which occurs
830 in STMT_INFO. */
831
832 static void
833 vect_record_base_alignment (vec_info *vinfo, stmt_vec_info stmt_info,
834 innermost_loop_behavior *drb)
835 {
836 bool existed;
837 innermost_loop_behavior *&entry
838 = vinfo->base_alignments.get_or_insert (drb->base_address, &existed);
839 if (!existed || entry->base_alignment < drb->base_alignment)
840 {
841 entry = drb;
842 if (dump_enabled_p ())
843 dump_printf_loc (MSG_NOTE, vect_location,
844 "recording new base alignment for %T\n"
845 " alignment: %d\n"
846 " misalignment: %d\n"
847 " based on: %G",
848 drb->base_address,
849 drb->base_alignment,
850 drb->base_misalignment,
851 stmt_info->stmt);
852 }
853 }
854
855 /* If the region we're going to vectorize is reached, all unconditional
856 data references occur at least once. We can therefore pool the base
857 alignment guarantees from each unconditional reference. Do this by
858 going through all the data references in VINFO and checking whether
859 the containing statement makes the reference unconditionally. If so,
860 record the alignment of the base address in VINFO so that it can be
861 used for all other references with the same base. */
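/* As an illustrative example, if the region contains an unconditional
   access *P whose analysis guarantees a 16-byte aligned base, then a
   second reference through the same base address that only executes
   conditionally can reuse that 16-byte guarantee, even though on its
   own it would not provide one.  */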
862
863 void
864 vect_record_base_alignments (vec_info *vinfo)
865 {
866 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
867 class loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
868 data_reference *dr;
869 unsigned int i;
870 FOR_EACH_VEC_ELT (vinfo->shared->datarefs, i, dr)
871 {
872 dr_vec_info *dr_info = vinfo->lookup_dr (dr);
873 stmt_vec_info stmt_info = dr_info->stmt;
874 if (!DR_IS_CONDITIONAL_IN_STMT (dr)
875 && STMT_VINFO_VECTORIZABLE (stmt_info)
876 && !STMT_VINFO_GATHER_SCATTER_P (stmt_info))
877 {
878 vect_record_base_alignment (vinfo, stmt_info, &DR_INNERMOST (dr));
879
880 /* If DR is nested in the loop that is being vectorized, we can also
881 record the alignment of the base wrt the outer loop. */
882 if (loop && nested_in_vect_loop_p (loop, stmt_info))
883 vect_record_base_alignment
884 (vinfo, stmt_info, &STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info));
885 }
886 }
887 }
888
889 /* Return the target alignment for the vectorized form of DR_INFO. */
890
891 static poly_uint64
892 vect_calculate_target_alignment (dr_vec_info *dr_info)
893 {
894 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
895 return targetm.vectorize.preferred_vector_alignment (vectype);
896 }
897
898 /* Function vect_compute_data_ref_alignment
899
900 Compute the misalignment of the data reference DR_INFO.
901
902 Output:
903 1. DR_MISALIGNMENT (DR_INFO) is defined.
904
905 FOR NOW: No analysis is actually performed. Misalignment is calculated
906 only for trivial cases. TODO. */
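/* As an illustrative example, with a 16-byte target alignment, an
   access a[i + 1] into a 16-byte-aligned array of 4-byte ints has
   base misalignment 0 and a constant offset (init) of 4, giving
   DR_MISALIGNMENT == (0 + 4) % 16 == 4 bytes.  */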
907
908 static void
909 vect_compute_data_ref_alignment (vec_info *vinfo, dr_vec_info *dr_info)
910 {
911 stmt_vec_info stmt_info = dr_info->stmt;
912 vec_base_alignments *base_alignments = &vinfo->base_alignments;
913 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
914 class loop *loop = NULL;
915 tree ref = DR_REF (dr_info->dr);
916 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
917
918 if (dump_enabled_p ())
919 dump_printf_loc (MSG_NOTE, vect_location,
920 "vect_compute_data_ref_alignment:\n");
921
922 if (loop_vinfo)
923 loop = LOOP_VINFO_LOOP (loop_vinfo);
924
925 /* Initialize misalignment to unknown. */
926 SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN);
927
928 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
929 return;
930
931 innermost_loop_behavior *drb = vect_dr_behavior (vinfo, dr_info);
932 bool step_preserves_misalignment_p;
933
934 poly_uint64 vector_alignment
935 = exact_div (vect_calculate_target_alignment (dr_info), BITS_PER_UNIT);
936 DR_TARGET_ALIGNMENT (dr_info) = vector_alignment;
937
938 /* If the main loop has peeled for alignment we have no way of knowing
939 whether the data accesses in the epilogues are aligned. We can't at
940 compile time answer the question whether we have entered the main loop or
941 not. Fixes PR 92351. */
942 if (loop_vinfo)
943 {
944 loop_vec_info orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
945 if (orig_loop_vinfo
946 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (orig_loop_vinfo) != 0)
947 return;
948 }
949
950 unsigned HOST_WIDE_INT vect_align_c;
951 if (!vector_alignment.is_constant (&vect_align_c))
952 return;
953
954 /* No step for BB vectorization. */
955 if (!loop)
956 {
957 gcc_assert (integer_zerop (drb->step));
958 step_preserves_misalignment_p = true;
959 }
960
961 /* In case the dataref is in an inner-loop of the loop that is being
962 vectorized (LOOP), we use the base and misalignment information
963 relative to the outer-loop (LOOP). This is ok only if the misalignment
964 stays the same throughout the execution of the inner-loop, which is why
965 we have to check that the stride of the dataref in the inner-loop is a
966 multiple of the vector alignment. */
967 else if (nested_in_vect_loop_p (loop, stmt_info))
968 {
969 step_preserves_misalignment_p
970 = (DR_STEP_ALIGNMENT (dr_info->dr) % vect_align_c) == 0;
971
972 if (dump_enabled_p ())
973 {
974 if (step_preserves_misalignment_p)
975 dump_printf_loc (MSG_NOTE, vect_location,
976 "inner step divides the vector alignment.\n");
977 else
978 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
979 "inner step doesn't divide the vector"
980 " alignment.\n");
981 }
982 }
983
984 /* Similarly we can only use base and misalignment information relative to
985 an innermost loop if the misalignment stays the same throughout the
986 execution of the loop. As above, this is the case if the step of the
987 dataref times the vectorization factor is a multiple of the alignment. */
988 else
989 {
990 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
991 step_preserves_misalignment_p
992 = multiple_p (DR_STEP_ALIGNMENT (dr_info->dr) * vf, vect_align_c);
993
994 if (!step_preserves_misalignment_p && dump_enabled_p ())
995 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
996 "step doesn't divide the vector alignment.\n");
997 }
998
999 unsigned int base_alignment = drb->base_alignment;
1000 unsigned int base_misalignment = drb->base_misalignment;
1001
1002 /* Calculate the maximum of the pooled base address alignment and the
1003 alignment that we can compute for DR itself. */
1004 innermost_loop_behavior **entry = base_alignments->get (drb->base_address);
1005 if (entry && base_alignment < (*entry)->base_alignment)
1006 {
1007 base_alignment = (*entry)->base_alignment;
1008 base_misalignment = (*entry)->base_misalignment;
1009 }
1010
1011 if (drb->offset_alignment < vect_align_c
1012 || !step_preserves_misalignment_p
1013 /* We need to know whether the step wrt the vectorized loop is
1014 negative when computing the starting misalignment below. */
1015 || TREE_CODE (drb->step) != INTEGER_CST)
1016 {
1017 if (dump_enabled_p ())
1018 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1019 "Unknown alignment for access: %T\n", ref);
1020 return;
1021 }
1022
1023 if (base_alignment < vect_align_c)
1024 {
1025 unsigned int max_alignment;
1026 tree base = get_base_for_alignment (drb->base_address, &max_alignment);
1027 if (max_alignment < vect_align_c
1028 || !vect_can_force_dr_alignment_p (base,
1029 vect_align_c * BITS_PER_UNIT))
1030 {
1031 if (dump_enabled_p ())
1032 dump_printf_loc (MSG_NOTE, vect_location,
1033 "can't force alignment of ref: %T\n", ref);
1034 return;
1035 }
1036
1037 /* Force the alignment of the decl.
1038 NOTE: This is the only change to the code we make during
1039 the analysis phase, before deciding to vectorize the loop. */
1040 if (dump_enabled_p ())
1041 dump_printf_loc (MSG_NOTE, vect_location,
1042 "force alignment of %T\n", ref);
1043
1044 dr_info->base_decl = base;
1045 dr_info->base_misaligned = true;
1046 base_misalignment = 0;
1047 }
1048 poly_int64 misalignment
1049 = base_misalignment + wi::to_poly_offset (drb->init).force_shwi ();
1050
1051 /* If this is a backward running DR then the first access in the larger
1052 vectype is actually N-1 elements before the address in the DR.
1053 Adjust misalign accordingly. */
1054 if (tree_int_cst_sgn (drb->step) < 0)
1055 /* PLUS because STEP is negative. */
1056 misalignment += ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
1057 * TREE_INT_CST_LOW (drb->step));
1058
1059 unsigned int const_misalignment;
1060 if (!known_misalignment (misalignment, vect_align_c, &const_misalignment))
1061 {
1062 if (dump_enabled_p ())
1063 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1064 "Non-constant misalignment for access: %T\n", ref);
1065 return;
1066 }
1067
1068 SET_DR_MISALIGNMENT (dr_info, const_misalignment);
1069
1070 if (dump_enabled_p ())
1071 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1072 "misalign = %d bytes of ref %T\n",
1073 DR_MISALIGNMENT (dr_info), ref);
1074
1075 return;
1076 }
1077
1078 /* Function vect_update_misalignment_for_peel.
1079 Sets DR_INFO's misalignment
1080 - to 0 if it has the same alignment as DR_PEEL_INFO,
1081 - to the misalignment computed using NPEEL if DR_INFO's misalignment is known,
1082 - to -1 (unknown) otherwise.
1083
1084 DR_INFO - the data reference whose misalignment is to be adjusted.
1085 DR_PEEL_INFO - the data reference whose misalignment is being made
1086 zero in the vector loop by the peel.
1087 NPEEL - the number of iterations in the peel loop if the misalignment
1088 of DR_PEEL_INFO is known at compile time. */
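/* As an illustrative example, peeling NPEEL == 3 iterations for a
   reference with DR_MISALIGNMENT 4, DR_STEP 4 and a 16-byte target
   alignment gives (4 + 3 * 4) & 15 == 0, i.e. the reference becomes
   aligned by the peel as well.  */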
1089
1090 static void
1091 vect_update_misalignment_for_peel (dr_vec_info *dr_info,
1092 dr_vec_info *dr_peel_info, int npeel)
1093 {
1094 unsigned int i;
1095 vec<dr_p> same_aligned_drs;
1096 struct data_reference *current_dr;
1097 stmt_vec_info peel_stmt_info = dr_peel_info->stmt;
1098
1099 /* It can be assumed that if dr_info has the same alignment as dr_peel,
1100 it is aligned in the vector loop. */
1101 same_aligned_drs = STMT_VINFO_SAME_ALIGN_REFS (peel_stmt_info);
1102 FOR_EACH_VEC_ELT (same_aligned_drs, i, current_dr)
1103 {
1104 if (current_dr != dr_info->dr)
1105 continue;
1106 gcc_assert (!known_alignment_for_access_p (dr_info)
1107 || !known_alignment_for_access_p (dr_peel_info)
1108 || (DR_MISALIGNMENT (dr_info)
1109 == DR_MISALIGNMENT (dr_peel_info)));
1110 SET_DR_MISALIGNMENT (dr_info, 0);
1111 return;
1112 }
1113
1114 unsigned HOST_WIDE_INT alignment;
1115 if (DR_TARGET_ALIGNMENT (dr_info).is_constant (&alignment)
1116 && known_alignment_for_access_p (dr_info)
1117 && known_alignment_for_access_p (dr_peel_info))
1118 {
1119 int misal = DR_MISALIGNMENT (dr_info);
1120 misal += npeel * TREE_INT_CST_LOW (DR_STEP (dr_info->dr));
1121 misal &= alignment - 1;
1122 SET_DR_MISALIGNMENT (dr_info, misal);
1123 return;
1124 }
1125
1126 if (dump_enabled_p ())
1127 dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment " \
1128 "to unknown (-1).\n");
1129 SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN);
1130 }
1131
1132
1133 /* Function verify_data_ref_alignment
1134
1135 Return TRUE if DR_INFO can be handled with respect to alignment. */
1136
1137 static opt_result
1138 verify_data_ref_alignment (vec_info *vinfo, dr_vec_info *dr_info)
1139 {
1140 enum dr_alignment_support supportable_dr_alignment
1141 = vect_supportable_dr_alignment (vinfo, dr_info, false);
1142 if (!supportable_dr_alignment)
1143 return opt_result::failure_at
1144 (dr_info->stmt->stmt,
1145 DR_IS_READ (dr_info->dr)
1146 ? "not vectorized: unsupported unaligned load: %T\n"
1147 : "not vectorized: unsupported unaligned store: %T\n",
1148 DR_REF (dr_info->dr));
1149
1150 if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
1151 dump_printf_loc (MSG_NOTE, vect_location,
1152 "Vectorizing an unaligned access.\n");
1153
1154 return opt_result::success ();
1155 }
1156
1157 /* Function vect_verify_datarefs_alignment
1158
1159 Return TRUE if all data references in the loop can be
1160 handled with respect to alignment. */
1161
1162 opt_result
1163 vect_verify_datarefs_alignment (loop_vec_info vinfo)
1164 {
1165 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
1166 struct data_reference *dr;
1167 unsigned int i;
1168
1169 FOR_EACH_VEC_ELT (datarefs, i, dr)
1170 {
1171 dr_vec_info *dr_info = vinfo->lookup_dr (dr);
1172 stmt_vec_info stmt_info = dr_info->stmt;
1173
1174 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1175 continue;
1176
1177 /* For interleaving, only the alignment of the first access matters. */
1178 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1179 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1180 continue;
1181
1182 /* Strided accesses perform only component accesses, alignment is
1183 irrelevant for them. */
1184 if (STMT_VINFO_STRIDED_P (stmt_info)
1185 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1186 continue;
1187
1188 opt_result res = verify_data_ref_alignment (vinfo, dr_info);
1189 if (!res)
1190 return res;
1191 }
1192
1193 return opt_result::success ();
1194 }
1195
1196 /* Given a memory reference EXP, return whether its alignment is less
1197 than its size. */
1198
1199 static bool
1200 not_size_aligned (tree exp)
1201 {
1202 if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
1203 return true;
1204
1205 return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
1206 > get_object_alignment (exp));
1207 }
1208
1209 /* Function vector_alignment_reachable_p
1210
1211 Return true if vector alignment for DR_INFO is reachable by peeling
1212 a few loop iterations. Return false otherwise. */
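/* As an illustrative example, with V8HI vectors and a misalignment of
   8 bytes (4 elements), the prologue needs 8 - 4 = 4 peeled iterations;
   that is fine for an interleaving group of size 2 but not for one of
   size 3, since the group must stay intact across the peel.  */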
1213
1214 static bool
1215 vector_alignment_reachable_p (dr_vec_info *dr_info)
1216 {
1217 stmt_vec_info stmt_info = dr_info->stmt;
1218 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1219
1220 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1221 {
1222 /* For interleaved accesses we peel only if the number of iterations in
1223 the prolog loop ({VF - misalignment}) is a multiple of the
1224 number of interleaved accesses. */
1225 int elem_size, mis_in_elements;
1226
1227 /* FORNOW: handle only known alignment. */
1228 if (!known_alignment_for_access_p (dr_info))
1229 return false;
1230
1231 poly_uint64 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1232 poly_uint64 vector_size = GET_MODE_SIZE (TYPE_MODE (vectype));
1233 elem_size = vector_element_size (vector_size, nelements);
1234 mis_in_elements = DR_MISALIGNMENT (dr_info) / elem_size;
1235
1236 if (!multiple_p (nelements - mis_in_elements, DR_GROUP_SIZE (stmt_info)))
1237 return false;
1238 }
1239
1240 /* If misalignment is known at the compile time then allow peeling
1241 only if natural alignment is reachable through peeling. */
1242 if (known_alignment_for_access_p (dr_info) && !aligned_access_p (dr_info))
1243 {
1244 HOST_WIDE_INT elmsize =
1245 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1246 if (dump_enabled_p ())
1247 {
1248 dump_printf_loc (MSG_NOTE, vect_location,
1249 "data size = %wd. misalignment = %d.\n", elmsize,
1250 DR_MISALIGNMENT (dr_info));
1251 }
1252 if (DR_MISALIGNMENT (dr_info) % elmsize)
1253 {
1254 if (dump_enabled_p ())
1255 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1256 "data size does not divide the misalignment.\n");
1257 return false;
1258 }
1259 }
1260
1261 if (!known_alignment_for_access_p (dr_info))
1262 {
1263 tree type = TREE_TYPE (DR_REF (dr_info->dr));
1264 bool is_packed = not_size_aligned (DR_REF (dr_info->dr));
1265 if (dump_enabled_p ())
1266 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1267 "Unknown misalignment, %snaturally aligned\n",
1268 is_packed ? "not " : "");
1269 return targetm.vectorize.vector_alignment_reachable (type, is_packed);
1270 }
1271
1272 return true;
1273 }
1274
1275
1276 /* Calculate the cost of the memory access represented by DR_INFO. */
1277
1278 static void
1279 vect_get_data_access_cost (vec_info *vinfo, dr_vec_info *dr_info,
1280 unsigned int *inside_cost,
1281 unsigned int *outside_cost,
1282 stmt_vector_for_cost *body_cost_vec,
1283 stmt_vector_for_cost *prologue_cost_vec)
1284 {
1285 stmt_vec_info stmt_info = dr_info->stmt;
1286 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
1287 int ncopies;
1288
1289 if (PURE_SLP_STMT (stmt_info))
1290 ncopies = 1;
1291 else
1292 ncopies = vect_get_num_copies (loop_vinfo, STMT_VINFO_VECTYPE (stmt_info));
1293
1294 if (DR_IS_READ (dr_info->dr))
1295 vect_get_load_cost (vinfo, stmt_info, ncopies, true, inside_cost,
1296 outside_cost, prologue_cost_vec, body_cost_vec, false);
1297 else
1298 vect_get_store_cost (vinfo, stmt_info, ncopies, inside_cost, body_cost_vec);
1299
1300 if (dump_enabled_p ())
1301 dump_printf_loc (MSG_NOTE, vect_location,
1302 "vect_get_data_access_cost: inside_cost = %d, "
1303 "outside_cost = %d.\n", *inside_cost, *outside_cost);
1304 }
1305
1306
1307 typedef struct _vect_peel_info
1308 {
1309 dr_vec_info *dr_info;
1310 int npeel;
1311 unsigned int count;
1312 } *vect_peel_info;
1313
1314 typedef struct _vect_peel_extended_info
1315 {
1316 vec_info *vinfo;
1317 struct _vect_peel_info peel_info;
1318 unsigned int inside_cost;
1319 unsigned int outside_cost;
1320 } *vect_peel_extended_info;
1321
1322
1323 /* Peeling hashtable helpers. */
1324
1325 struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
1326 {
1327 static inline hashval_t hash (const _vect_peel_info *);
1328 static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
1329 };
1330
1331 inline hashval_t
1332 peel_info_hasher::hash (const _vect_peel_info *peel_info)
1333 {
1334 return (hashval_t) peel_info->npeel;
1335 }
1336
1337 inline bool
1338 peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
1339 {
1340 return (a->npeel == b->npeel);
1341 }
1342
1343
1344 /* Insert DR_INFO into peeling hash table with NPEEL as key. */
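/* As an illustrative example, if three data references would all become
   aligned by peeling two iterations, the table entry keyed by NPEEL == 2
   accumulates a count of 3; with the unlimited cost model the entry with
   the highest count wins, otherwise the per-entry costs computed below
   decide.  */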
1345
1346 static void
1347 vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
1348 loop_vec_info loop_vinfo, dr_vec_info *dr_info,
1349 int npeel)
1350 {
1351 struct _vect_peel_info elem, *slot;
1352 _vect_peel_info **new_slot;
1353 bool supportable_dr_alignment
1354 = vect_supportable_dr_alignment (loop_vinfo, dr_info, true);
1355
1356 elem.npeel = npeel;
1357 slot = peeling_htab->find (&elem);
1358 if (slot)
1359 slot->count++;
1360 else
1361 {
1362 slot = XNEW (struct _vect_peel_info);
1363 slot->npeel = npeel;
1364 slot->dr_info = dr_info;
1365 slot->count = 1;
1366 new_slot = peeling_htab->find_slot (slot, INSERT);
1367 *new_slot = slot;
1368 }
1369
1370 if (!supportable_dr_alignment
1371 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1372 slot->count += VECT_MAX_COST;
1373 }
1374
1375
1376 /* Traverse peeling hash table to find peeling option that aligns maximum
1377 number of data accesses. */
1378
1379 int
1380 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1381 _vect_peel_extended_info *max)
1382 {
1383 vect_peel_info elem = *slot;
1384
1385 if (elem->count > max->peel_info.count
1386 || (elem->count == max->peel_info.count
1387 && max->peel_info.npeel > elem->npeel))
1388 {
1389 max->peel_info.npeel = elem->npeel;
1390 max->peel_info.count = elem->count;
1391 max->peel_info.dr_info = elem->dr_info;
1392 }
1393
1394 return 1;
1395 }
1396
1397 /* Get the costs of peeling NPEEL iterations for LOOP_VINFO, checking
1398 data access costs for all data refs. If UNKNOWN_MISALIGNMENT is true,
1399 we assume DR0_INFO's misalignment will be zero after peeling. */
1400
1401 static void
1402 vect_get_peeling_costs_all_drs (loop_vec_info loop_vinfo,
1403 dr_vec_info *dr0_info,
1404 unsigned int *inside_cost,
1405 unsigned int *outside_cost,
1406 stmt_vector_for_cost *body_cost_vec,
1407 stmt_vector_for_cost *prologue_cost_vec,
1408 unsigned int npeel,
1409 bool unknown_misalignment)
1410 {
1411 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1412 unsigned i;
1413 data_reference *dr;
1414
1415 FOR_EACH_VEC_ELT (datarefs, i, dr)
1416 {
1417 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1418 stmt_vec_info stmt_info = dr_info->stmt;
1419 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1420 continue;
1421
1422 /* For interleaving, only the alignment of the first access
1423 matters. */
1424 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1425 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1426 continue;
1427
1428 /* Strided accesses perform only component accesses, alignment is
1429 irrelevant for them. */
1430 if (STMT_VINFO_STRIDED_P (stmt_info)
1431 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1432 continue;
1433
1434 int save_misalignment;
1435 save_misalignment = DR_MISALIGNMENT (dr_info);
1436 if (npeel == 0)
1437 ;
1438 else if (unknown_misalignment && dr_info == dr0_info)
1439 SET_DR_MISALIGNMENT (dr_info, 0);
1440 else
1441 vect_update_misalignment_for_peel (dr_info, dr0_info, npeel);
1442 vect_get_data_access_cost (loop_vinfo, dr_info, inside_cost, outside_cost,
1443 body_cost_vec, prologue_cost_vec);
1444 SET_DR_MISALIGNMENT (dr_info, save_misalignment);
1445 }
1446 }
1447
1448 /* Traverse peeling hash table and calculate cost for each peeling option.
1449 Find the one with the lowest cost. */
1450
1451 int
1452 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1453 _vect_peel_extended_info *min)
1454 {
1455 vect_peel_info elem = *slot;
1456 int dummy;
1457 unsigned int inside_cost = 0, outside_cost = 0;
1458 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (min->vinfo);
1459 stmt_vector_for_cost prologue_cost_vec, body_cost_vec,
1460 epilogue_cost_vec;
1461
1462 prologue_cost_vec.create (2);
1463 body_cost_vec.create (2);
1464 epilogue_cost_vec.create (2);
1465
1466 vect_get_peeling_costs_all_drs (loop_vinfo, elem->dr_info, &inside_cost,
1467 &outside_cost, &body_cost_vec,
1468 &prologue_cost_vec, elem->npeel, false);
1469
1470 body_cost_vec.release ();
1471
1472 outside_cost += vect_get_known_peeling_cost
1473 (loop_vinfo, elem->npeel, &dummy,
1474 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1475 &prologue_cost_vec, &epilogue_cost_vec);
1476
1477 /* Prologue and epilogue costs are added to the target model later.
1478 These costs depend only on the scalar iteration cost, the
1479 number of peeling iterations finally chosen, and the number of
1480 misaligned statements. So discard the information found here. */
1481 prologue_cost_vec.release ();
1482 epilogue_cost_vec.release ();
1483
1484 if (inside_cost < min->inside_cost
1485 || (inside_cost == min->inside_cost
1486 && outside_cost < min->outside_cost))
1487 {
1488 min->inside_cost = inside_cost;
1489 min->outside_cost = outside_cost;
1490 min->peel_info.dr_info = elem->dr_info;
1491 min->peel_info.npeel = elem->npeel;
1492 min->peel_info.count = elem->count;
1493 }
1494
1495 return 1;
1496 }
1497
1498
1499 /* Choose best peeling option by traversing peeling hash table and either
1500 choosing an option with the lowest cost (if cost model is enabled) or the
1501 option that aligns as many accesses as possible. */
1502
1503 static struct _vect_peel_extended_info
1504 vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
1505 loop_vec_info loop_vinfo)
1506 {
1507 struct _vect_peel_extended_info res;
1508
1509 res.peel_info.dr_info = NULL;
1510 res.vinfo = loop_vinfo;
1511
1512 if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1513 {
1514 res.inside_cost = INT_MAX;
1515 res.outside_cost = INT_MAX;
1516 peeling_htab->traverse <_vect_peel_extended_info *,
1517 vect_peeling_hash_get_lowest_cost> (&res);
1518 }
1519 else
1520 {
1521 res.peel_info.count = 0;
1522 peeling_htab->traverse <_vect_peel_extended_info *,
1523 vect_peeling_hash_get_most_frequent> (&res);
1524 res.inside_cost = 0;
1525 res.outside_cost = 0;
1526 }
1527
1528 return res;
1529 }
1530
1531 /* Return true if peeling by NPEEL iterations is supported. */
1532
1533 static bool
1534 vect_peeling_supportable (loop_vec_info loop_vinfo, dr_vec_info *dr0_info,
1535 unsigned npeel)
1536 {
1537 unsigned i;
1538 struct data_reference *dr = NULL;
1539 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1540 enum dr_alignment_support supportable_dr_alignment;
1541
1542 /* Ensure that all data refs can be vectorized after the peel. */
1543 FOR_EACH_VEC_ELT (datarefs, i, dr)
1544 {
1545 int save_misalignment;
1546
1547 if (dr == dr0_info->dr)
1548 continue;
1549
1550 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1551 stmt_vec_info stmt_info = dr_info->stmt;
1552 /* For interleaving, only the alignment of the first access
1553 matters. */
1554 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1555 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1556 continue;
1557
1558 /* Strided accesses perform only component accesses, alignment is
1559 irrelevant for them. */
1560 if (STMT_VINFO_STRIDED_P (stmt_info)
1561 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1562 continue;
1563
1564 save_misalignment = DR_MISALIGNMENT (dr_info);
1565 vect_update_misalignment_for_peel (dr_info, dr0_info, npeel);
1566 supportable_dr_alignment
1567 = vect_supportable_dr_alignment (loop_vinfo, dr_info, false);
1568 SET_DR_MISALIGNMENT (dr_info, save_misalignment);
1569
1570 if (!supportable_dr_alignment)
1571 return false;
1572 }
1573
1574 return true;
1575 }
1576
1577 /* Function vect_enhance_data_refs_alignment
1578
1579 This pass will use loop versioning and loop peeling in order to enhance
1580 the alignment of data references in the loop.
1581
1582 FOR NOW: we assume that whatever versioning/peeling takes place, only the
1583 original loop is to be vectorized. Any other loops that are created by
1584 the transformations performed in this pass are not supposed to be
1585 vectorized. This restriction will be relaxed.
1586
1587 This pass will require a cost model to guide it whether to apply peeling
1588 or versioning or a combination of the two. For example, the scheme that
1589 Intel uses when given a loop with several memory accesses is as follows:
1590 choose one memory access ('p') whose alignment you want to force by doing
1591 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
1592 other accesses are not necessarily aligned, or (2) use loop versioning to
1593 generate one loop in which all accesses are aligned, and another loop in
1594 which only 'p' is necessarily aligned.
1595
1596 ("Automatic Intra-Register Vectorization for the Intel Architecture",
1597 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Xinmin Tian, International
1598 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1599
1600 Devising a cost model is the most critical aspect of this work. It will
1601 guide us on which access to peel for, whether to use loop versioning, how
1602 many versions to create, etc. The cost model will probably consist of
1603 generic considerations as well as target specific considerations (on
1604 powerpc for example, misaligned stores are more painful than misaligned
1605 loads).
1606
1607 Here are the general steps involved in alignment enhancements:
1608
1609 -- original loop, before alignment analysis:
1610 for (i=0; i<N; i++){
1611 x = q[i]; # DR_MISALIGNMENT(q) = unknown
1612 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1613 }
1614
1615 -- After vect_compute_data_refs_alignment:
1616 for (i=0; i<N; i++){
1617 x = q[i]; # DR_MISALIGNMENT(q) = 3
1618 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1619 }
1620
1621 -- Possibility 1: we do loop versioning:
1622 if (p is aligned) {
1623 for (i=0; i<N; i++){ # loop 1A
1624 x = q[i]; # DR_MISALIGNMENT(q) = 3
1625 p[i] = y; # DR_MISALIGNMENT(p) = 0
1626 }
1627 }
1628 else {
1629 for (i=0; i<N; i++){ # loop 1B
1630 x = q[i]; # DR_MISALIGNMENT(q) = 3
1631 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1632 }
1633 }
1634
1635 -- Possibility 2: we do loop peeling:
1636 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1637 x = q[i];
1638 p[i] = y;
1639 }
1640 for (i = 3; i < N; i++){ # loop 2A
1641 x = q[i]; # DR_MISALIGNMENT(q) = 0
1642 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1643 }
1644
1645 -- Possibility 3: combination of loop peeling and versioning:
1646 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1647 x = q[i];
1648 p[i] = y;
1649 }
1650 if (p is aligned) {
1651 for (i = 3; i<N; i++){ # loop 3A
1652 x = q[i]; # DR_MISALIGNMENT(q) = 0
1653 p[i] = y; # DR_MISALIGNMENT(p) = 0
1654 }
1655 }
1656 else {
1657 for (i = 3; i<N; i++){ # loop 3B
1658 x = q[i]; # DR_MISALIGNMENT(q) = 0
1659 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1660 }
1661 }
1662
1663 These loops are later passed to loop_transform to be vectorized. The
1664 vectorizer will use the alignment information to guide the transformation
1665 (whether to generate regular loads/stores, or with special handling for
1666 misalignment). */
1667
1668 opt_result
1669 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1670 {
1671 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1672 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1673 enum dr_alignment_support supportable_dr_alignment;
1674 dr_vec_info *first_store = NULL;
1675 dr_vec_info *dr0_info = NULL;
1676 struct data_reference *dr;
1677 unsigned int i, j;
1678 bool do_peeling = false;
1679 bool do_versioning = false;
1680 unsigned int npeel = 0;
1681 bool one_misalignment_known = false;
1682 bool one_misalignment_unknown = false;
1683 bool one_dr_unsupportable = false;
1684 dr_vec_info *unsupportable_dr_info = NULL;
1685 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1686 unsigned possible_npeel_number = 1;
1687 tree vectype;
1688 unsigned int mis, same_align_drs_max = 0;
1689 hash_table<peel_info_hasher> peeling_htab (1);
1690
1691 DUMP_VECT_SCOPE ("vect_enhance_data_refs_alignment");
1692
1693 /* Reset data so we can safely be called multiple times. */
1694 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1695 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
1696
1697 /* While cost model enhancements are expected in the future, the high level
1698 view of the code at this time is as follows:
1699
1700 A) If there is a misaligned access then see if peeling to align
1701 this access can make all data references satisfy
1702 vect_supportable_dr_alignment. If so, update data structures
1703 as needed and return true.
1704
1705 B) If peeling wasn't possible and there is a data reference with an
1706 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1707 then see if loop versioning checks can be used to make all data
1708 references satisfy vect_supportable_dr_alignment. If so, update
1709 data structures as needed and return true.
1710
1711 C) If neither peeling nor versioning were successful then return false if
1712 any data reference does not satisfy vect_supportable_dr_alignment.
1713
1714 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1715
1716 Note, Possibility 3 above (which is peeling and versioning together) is not
1717 being done at this time. */
1718
1719 /* (1) Peeling to force alignment. */
1720
1721 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1722 Considerations:
1723 + How many accesses will become aligned due to the peeling
1724 - How many accesses will become unaligned due to the peeling,
1725 and the cost of misaligned accesses.
1726 - The cost of peeling (the extra runtime checks, the increase
1727 in code size). */
1728
1729 FOR_EACH_VEC_ELT (datarefs, i, dr)
1730 {
1731 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1732 stmt_vec_info stmt_info = dr_info->stmt;
1733
1734 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1735 continue;
1736
1737 /* For interleaving, only the alignment of the first access
1738 matters. */
1739 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1740 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1741 continue;
1742
1743 /* For scatter-gather or invariant accesses there is nothing
1744 to enhance. */
1745 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
1746 || integer_zerop (DR_STEP (dr)))
1747 continue;
1748
1749 /* Strided accesses perform only component accesses, alignment is
1750 irrelevant for them. */
1751 if (STMT_VINFO_STRIDED_P (stmt_info)
1752 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1753 continue;
1754
1755 supportable_dr_alignment
1756 = vect_supportable_dr_alignment (loop_vinfo, dr_info, true);
1757 do_peeling = vector_alignment_reachable_p (dr_info);
1758 if (do_peeling)
1759 {
1760 if (known_alignment_for_access_p (dr_info))
1761 {
1762 unsigned int npeel_tmp = 0;
1763 bool negative = tree_int_cst_compare (DR_STEP (dr),
1764 size_zero_node) < 0;
1765
1766 vectype = STMT_VINFO_VECTYPE (stmt_info);
1767 /* If known_alignment_for_access_p then we have set
1768 DR_MISALIGNMENT, which is only done if we know it at compile
1769 time, so it is safe to assume the target alignment is
1770 constant.  */
1771 unsigned int target_align =
1772 DR_TARGET_ALIGNMENT (dr_info).to_constant ();
1773 unsigned int dr_size = vect_get_scalar_dr_size (dr_info);
1774 mis = (negative
1775 ? DR_MISALIGNMENT (dr_info)
1776 : -DR_MISALIGNMENT (dr_info));
1777 if (DR_MISALIGNMENT (dr_info) != 0)
1778 npeel_tmp = (mis & (target_align - 1)) / dr_size;
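              /* As a concrete illustration (all values assumed for the sake
                 of example): with a positive step, a 16-byte target
                 alignment, 4-byte elements and a DR_MISALIGNMENT of 4 bytes,
                 MIS is set to -4 (as an unsigned value), so (MIS & 15) is 12
                 and NPEEL_TMP becomes 12 / 4 = 3; peeling 3 scalar iterations
                 advances the access by 12 bytes onto the next 16-byte
                 boundary.  */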
1779
1780 /* For multiple types, it is possible that the bigger type access
1781 will have more than one peeling option. E.g., a loop with two
1782 types: one of size (vector size / 4), and the other one of
1783 size (vector size / 8). The vectorization factor will be 8.
1784 If both accesses are misaligned by 3, the first one needs one
1785 scalar iteration to be aligned, and the second one needs 5.
1786 But the first one will also be aligned by peeling 5 scalar
1787 iterations, and in that case both accesses will be aligned.
1788 Hence, besides the immediate peeling amount, we also want to
1789 try adding full vector sizes, as long as we do not exceed the
1790 vectorization factor.
1791 We do this automatically for the cost model, since we calculate
1792 the cost for every peeling option. */
1793 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1794 {
1795 poly_uint64 nscalars = (STMT_SLP_TYPE (stmt_info)
1796 ? vf * DR_GROUP_SIZE (stmt_info) : vf);
1797 possible_npeel_number
1798 = vect_get_num_vectors (nscalars, vectype);
1799
1800 /* NPEEL_TMP is 0 when there is no misalignment, but in that
1801 case also allow peeling NELEMENTS. */
1802 if (DR_MISALIGNMENT (dr_info) == 0)
1803 possible_npeel_number++;
1804 }
1805
1806 /* Save info about DR in the hash table. Also include peeling
1807 amounts according to the explanation above. */
1808 for (j = 0; j < possible_npeel_number; j++)
1809 {
1810 vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
1811 dr_info, npeel_tmp);
1812 npeel_tmp += target_align / dr_size;
1813 }
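              /* Continuing the assumed example above (16-byte target
                 alignment, 4-byte elements, NPEEL_TMP = 3): if the unlimited
                 cost model considers two vectors' worth of scalars, the hash
                 table receives the candidate peeling amounts 3 and
                 3 + 16/4 = 7, and the cheapest recorded candidate is picked
                 later by vect_peeling_hash_choose_best_peeling.  */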
1814
1815 one_misalignment_known = true;
1816 }
1817 else
1818 {
1819 /* If we don't know any misalignment values, we prefer
1820 peeling for the data-ref that has the maximum number of data-refs
1821 with the same alignment, unless the target prefers to align
1822 stores over loads. */
1823 unsigned same_align_drs
1824 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1825 if (!dr0_info
1826 || same_align_drs_max < same_align_drs)
1827 {
1828 same_align_drs_max = same_align_drs;
1829 dr0_info = dr_info;
1830 }
1831 /* For data-refs with the same number of related
1832 accesses prefer the one where the misalign
1833 computation will be invariant in the outermost loop. */
1834 else if (same_align_drs_max == same_align_drs)
1835 {
1836 class loop *ivloop0, *ivloop;
1837 ivloop0 = outermost_invariant_loop_for_expr
1838 (loop, DR_BASE_ADDRESS (dr0_info->dr));
1839 ivloop = outermost_invariant_loop_for_expr
1840 (loop, DR_BASE_ADDRESS (dr));
1841 if ((ivloop && !ivloop0)
1842 || (ivloop && ivloop0
1843 && flow_loop_nested_p (ivloop, ivloop0)))
1844 dr0_info = dr_info;
1845 }
1846
1847 one_misalignment_unknown = true;
1848
1849 /* Check for data refs with unsupportable alignment that
1850 can be peeled. */
1851 if (!supportable_dr_alignment)
1852 {
1853 one_dr_unsupportable = true;
1854 unsupportable_dr_info = dr_info;
1855 }
1856
1857 if (!first_store && DR_IS_WRITE (dr))
1858 first_store = dr_info;
1859 }
1860 }
1861 else
1862 {
1863 if (!aligned_access_p (dr_info))
1864 {
1865 if (dump_enabled_p ())
1866 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1867 "vector alignment may not be reachable\n");
1868 break;
1869 }
1870 }
1871 }
1872
1873 /* Check if we can possibly peel the loop. */
1874 if (!vect_can_advance_ivs_p (loop_vinfo)
1875 || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))
1876 || loop->inner)
1877 do_peeling = false;
1878
1879 struct _vect_peel_extended_info peel_for_known_alignment;
1880 struct _vect_peel_extended_info peel_for_unknown_alignment;
1881 struct _vect_peel_extended_info best_peel;
1882
1883 peel_for_unknown_alignment.inside_cost = INT_MAX;
1884 peel_for_unknown_alignment.outside_cost = INT_MAX;
1885 peel_for_unknown_alignment.peel_info.count = 0;
1886
1887 if (do_peeling
1888 && one_misalignment_unknown)
1889 {
1890 /* Check if the target prefers aligning stores over loads, i.e., if
1891 misaligned stores are more expensive than misaligned loads (taking
1892 drs with the same alignment into account). */
1893 unsigned int load_inside_cost = 0;
1894 unsigned int load_outside_cost = 0;
1895 unsigned int store_inside_cost = 0;
1896 unsigned int store_outside_cost = 0;
1897 unsigned int estimated_npeels = vect_vf_for_cost (loop_vinfo) / 2;
1898
1899 stmt_vector_for_cost dummy;
1900 dummy.create (2);
1901 vect_get_peeling_costs_all_drs (loop_vinfo, dr0_info,
1902 &load_inside_cost,
1903 &load_outside_cost,
1904 &dummy, &dummy, estimated_npeels, true);
1905 dummy.release ();
1906
1907 if (first_store)
1908 {
1909 dummy.create (2);
1910 vect_get_peeling_costs_all_drs (loop_vinfo, first_store,
1911 &store_inside_cost,
1912 &store_outside_cost,
1913 &dummy, &dummy,
1914 estimated_npeels, true);
1915 dummy.release ();
1916 }
1917 else
1918 {
1919 store_inside_cost = INT_MAX;
1920 store_outside_cost = INT_MAX;
1921 }
1922
1923 if (load_inside_cost > store_inside_cost
1924 || (load_inside_cost == store_inside_cost
1925 && load_outside_cost > store_outside_cost))
1926 {
1927 dr0_info = first_store;
1928 peel_for_unknown_alignment.inside_cost = store_inside_cost;
1929 peel_for_unknown_alignment.outside_cost = store_outside_cost;
1930 }
1931 else
1932 {
1933 peel_for_unknown_alignment.inside_cost = load_inside_cost;
1934 peel_for_unknown_alignment.outside_cost = load_outside_cost;
1935 }
1936
1937 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1938 prologue_cost_vec.create (2);
1939 epilogue_cost_vec.create (2);
1940
1941 int dummy2;
1942 peel_for_unknown_alignment.outside_cost += vect_get_known_peeling_cost
1943 (loop_vinfo, estimated_npeels, &dummy2,
1944 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1945 &prologue_cost_vec, &epilogue_cost_vec);
1946
1947 prologue_cost_vec.release ();
1948 epilogue_cost_vec.release ();
1949
1950 peel_for_unknown_alignment.peel_info.count = 1
1951 + STMT_VINFO_SAME_ALIGN_REFS (dr0_info->stmt).length ();
1952 }
1953
1954 peel_for_unknown_alignment.peel_info.npeel = 0;
1955 peel_for_unknown_alignment.peel_info.dr_info = dr0_info;
1956
1957 best_peel = peel_for_unknown_alignment;
1958
1959 peel_for_known_alignment.inside_cost = INT_MAX;
1960 peel_for_known_alignment.outside_cost = INT_MAX;
1961 peel_for_known_alignment.peel_info.count = 0;
1962 peel_for_known_alignment.peel_info.dr_info = NULL;
1963
1964 if (do_peeling && one_misalignment_known)
1965 {
1966 /* Peeling is possible, but there is no data access that would be
1967 unsupportable without alignment. So we try to choose the best possible
1968 peeling from the hash table. */
1969 peel_for_known_alignment = vect_peeling_hash_choose_best_peeling
1970 (&peeling_htab, loop_vinfo);
1971 }
1972
1973 /* Compare costs of peeling for known and unknown alignment. */
1974 if (peel_for_known_alignment.peel_info.dr_info != NULL
1975 && peel_for_unknown_alignment.inside_cost
1976 >= peel_for_known_alignment.inside_cost)
1977 {
1978 best_peel = peel_for_known_alignment;
1979
1980 /* If the best peeling for known alignment has NPEEL == 0, perform no
1981 peeling at all except if there is an unsupportable dr that we can
1982 align. */
1983 if (best_peel.peel_info.npeel == 0 && !one_dr_unsupportable)
1984 do_peeling = false;
1985 }
1986
1987 /* If there is an unsupportable data ref, prefer this over all choices so far
1988 since we'd have to discard a chosen peeling except when it accidentally
1989 aligned the unsupportable data ref. */
1990 if (one_dr_unsupportable)
1991 dr0_info = unsupportable_dr_info;
1992 else if (do_peeling)
1993 {
1994 /* Calculate the penalty for no peeling, i.e. leaving everything as-is.
1995 TODO: Use nopeel_outside_cost or get rid of it? */
1996 unsigned nopeel_inside_cost = 0;
1997 unsigned nopeel_outside_cost = 0;
1998
1999 stmt_vector_for_cost dummy;
2000 dummy.create (2);
2001 vect_get_peeling_costs_all_drs (loop_vinfo, NULL, &nopeel_inside_cost,
2002 &nopeel_outside_cost, &dummy, &dummy,
2003 0, false);
2004 dummy.release ();
2005
2006 /* Add epilogue costs. As we do not peel for alignment here, no prologue
2007 costs will be recorded. */
2008 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
2009 prologue_cost_vec.create (2);
2010 epilogue_cost_vec.create (2);
2011
2012 int dummy2;
2013 nopeel_outside_cost += vect_get_known_peeling_cost
2014 (loop_vinfo, 0, &dummy2,
2015 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
2016 &prologue_cost_vec, &epilogue_cost_vec);
2017
2018 prologue_cost_vec.release ();
2019 epilogue_cost_vec.release ();
2020
2021 npeel = best_peel.peel_info.npeel;
2022 dr0_info = best_peel.peel_info.dr_info;
2023
2024 /* If not peeling at all is no more expensive than the best peeling
2025 found so far, don't perform any peeling. */
2026 if (nopeel_inside_cost <= best_peel.inside_cost)
2027 do_peeling = false;
2028 }
2029
2030 if (do_peeling)
2031 {
2032 stmt_vec_info stmt_info = dr0_info->stmt;
2033 vectype = STMT_VINFO_VECTYPE (stmt_info);
2034
2035 if (known_alignment_for_access_p (dr0_info))
2036 {
2037 bool negative = tree_int_cst_compare (DR_STEP (dr0_info->dr),
2038 size_zero_node) < 0;
2039 if (!npeel)
2040 {
2041 /* Since it's known at compile time, compute the number of
2042 iterations in the peeled loop (the peeling factor) for use in
2043 updating DR_MISALIGNMENT values. The peeling factor is the
2044 vectorization factor minus the misalignment as an element
2045 count. */
2046 mis = (negative
2047 ? DR_MISALIGNMENT (dr0_info)
2048 : -DR_MISALIGNMENT (dr0_info));
2049 /* If known_alignment_for_access_p then we have set
2050 DR_MISALIGNMENT, which is only done if we know it at compile
2051 time, so it is safe to assume the target alignment is
2052 constant.  */
2053 unsigned int target_align =
2054 DR_TARGET_ALIGNMENT (dr0_info).to_constant ();
2055 npeel = ((mis & (target_align - 1))
2056 / vect_get_scalar_dr_size (dr0_info));
2057 }
2058
2059 /* For interleaved data access every iteration accesses all the
2060 members of the group, therefore we divide the number of iterations
2061 by the group size. */
2062 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2063 npeel /= DR_GROUP_SIZE (stmt_info);
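          /* E.g., for an assumed grouped access with DR_GROUP_SIZE 4 and a
             per-element peeling amount of 12 computed above, each scalar
             iteration touches all 4 group members, so 12 / 4 = 3 loop
             iterations are peeled.  */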
2064
2065 if (dump_enabled_p ())
2066 dump_printf_loc (MSG_NOTE, vect_location,
2067 "Try peeling by %d\n", npeel);
2068 }
2069
2070 /* Ensure that all datarefs can be vectorized after the peel. */
2071 if (!vect_peeling_supportable (loop_vinfo, dr0_info, npeel))
2072 do_peeling = false;
2073
2074 /* Check if all datarefs are supportable and log. */
2075 if (do_peeling && known_alignment_for_access_p (dr0_info) && npeel == 0)
2076 {
2077 opt_result stat = vect_verify_datarefs_alignment (loop_vinfo);
2078 if (!stat)
2079 do_peeling = false;
2080 else
2081 return stat;
2082 }
2083
2084 /* Cost model #1 - honor --param vect-max-peeling-for-alignment. */
2085 if (do_peeling)
2086 {
2087 unsigned max_allowed_peel
2088 = param_vect_max_peeling_for_alignment;
2089 if (flag_vect_cost_model == VECT_COST_MODEL_CHEAP)
2090 max_allowed_peel = 0;
2091 if (max_allowed_peel != (unsigned)-1)
2092 {
2093 unsigned max_peel = npeel;
2094 if (max_peel == 0)
2095 {
2096 poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr0_info);
2097 unsigned HOST_WIDE_INT target_align_c;
2098 if (target_align.is_constant (&target_align_c))
2099 max_peel =
2100 target_align_c / vect_get_scalar_dr_size (dr0_info) - 1;
2101 else
2102 {
2103 do_peeling = false;
2104 if (dump_enabled_p ())
2105 dump_printf_loc (MSG_NOTE, vect_location,
2106 "Disable peeling, max peels set and vector"
2107 " alignment unknown\n");
2108 }
2109 }
2110 if (max_peel > max_allowed_peel)
2111 {
2112 do_peeling = false;
2113 if (dump_enabled_p ())
2114 dump_printf_loc (MSG_NOTE, vect_location,
2115 "Disable peeling, max peels reached: %d\n", max_peel);
2116 }
2117 }
2118 }
2119
2120 /* Cost model #2 - if peeling may result in a remaining loop not
2121 iterating enough to be vectorized then do not peel. Since this
2122 is a cost heuristic rather than a correctness decision, use the
2123 most likely runtime value for variable vectorization factors. */
2124 if (do_peeling
2125 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2126 {
2127 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2128 unsigned int max_peel = npeel == 0 ? assumed_vf - 1 : npeel;
2129 if ((unsigned HOST_WIDE_INT) LOOP_VINFO_INT_NITERS (loop_vinfo)
2130 < assumed_vf + max_peel)
2131 do_peeling = false;
2132 }
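      /* For example, with an assumed vectorization factor of 4, NPEEL of 0
         (hence MAX_PEEL = 3) and a known iteration count of 6: 6 < 4 + 3,
         so the loop remaining after peeling could not fill even a single
         vector iteration and peeling is abandoned.  */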
2133
2134 if (do_peeling)
2135 {
2136 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
2137 If the misalignment of DR_i is identical to that of dr0 then set
2138 DR_MISALIGNMENT (DR_i) to zero. If the misalignments of DR_i and
2139 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
2140 by the peeling factor times the element size of DR_i (MOD the
2141 vectorization factor times the size). Otherwise, the
2142 misalignment of DR_i must be set to unknown. */
2143 FOR_EACH_VEC_ELT (datarefs, i, dr)
2144 if (dr != dr0_info->dr)
2145 {
2146 /* Strided accesses perform only component accesses, alignment
2147 is irrelevant for them. */
2148 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2149 stmt_info = dr_info->stmt;
2150 if (STMT_VINFO_STRIDED_P (stmt_info)
2151 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2152 continue;
2153
2154 vect_update_misalignment_for_peel (dr_info, dr0_info, npeel);
2155 }
2156
2157 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0_info;
2158 if (npeel)
2159 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
2160 else
2161 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2162 = DR_MISALIGNMENT (dr0_info);
2163 SET_DR_MISALIGNMENT (dr0_info, 0);
2164 if (dump_enabled_p ())
2165 {
2166 dump_printf_loc (MSG_NOTE, vect_location,
2167 "Alignment of access forced using peeling.\n");
2168 dump_printf_loc (MSG_NOTE, vect_location,
2169 "Peeling for alignment will be applied.\n");
2170 }
2171
2172 /* The inside-loop cost will be accounted for in vectorizable_load
2173 and vectorizable_store correctly with adjusted alignments.
2174 Drop the body_cst_vec on the floor here. */
2175 opt_result stat = vect_verify_datarefs_alignment (loop_vinfo);
2176 gcc_assert (stat);
2177 return stat;
2178 }
2179 }
2180
2181 /* (2) Versioning to force alignment. */
2182
2183 /* Try versioning if:
2184 1) the loop is optimized for speed and the cost model is not cheap,
2185 2) there is at least one unsupported misaligned data ref with an unknown
2186 misalignment, and
2187 3) all misaligned data refs with a known misalignment are supported, and
2188 4) the number of runtime alignment checks is within reason. */
2189
2190 do_versioning
2191 = (optimize_loop_nest_for_speed_p (loop)
2192 && !loop->inner /* FORNOW */
2193 && flag_vect_cost_model != VECT_COST_MODEL_CHEAP);
2194
2195 if (do_versioning)
2196 {
2197 FOR_EACH_VEC_ELT (datarefs, i, dr)
2198 {
2199 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2200 stmt_vec_info stmt_info = dr_info->stmt;
2201
2202 /* For interleaving, only the alignment of the first access
2203 matters. */
2204 if (aligned_access_p (dr_info)
2205 || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
2206 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info))
2207 continue;
2208
2209 if (STMT_VINFO_STRIDED_P (stmt_info))
2210 {
2211 /* Strided loads perform only component accesses, alignment is
2212 irrelevant for them. */
2213 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
2214 continue;
2215 do_versioning = false;
2216 break;
2217 }
2218
2219 supportable_dr_alignment
2220 = vect_supportable_dr_alignment (loop_vinfo, dr_info, false);
2221
2222 if (!supportable_dr_alignment)
2223 {
2224 int mask;
2225 tree vectype;
2226
2227 if (known_alignment_for_access_p (dr_info)
2228 || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
2229 >= (unsigned) param_vect_max_version_for_alignment_checks)
2230 {
2231 do_versioning = false;
2232 break;
2233 }
2234
2235 vectype = STMT_VINFO_VECTYPE (stmt_info);
2236 gcc_assert (vectype);
2237
2238 /* At present we don't support versioning for alignment
2239 with variable VF, since there's no guarantee that the
2240 VF is a power of two. We could relax this if we added
2241 a way of enforcing a power-of-two size. */
2242 unsigned HOST_WIDE_INT size;
2243 if (!GET_MODE_SIZE (TYPE_MODE (vectype)).is_constant (&size))
2244 {
2245 do_versioning = false;
2246 break;
2247 }
2248
2249 /* Forcing alignment in the first iteration is no good if
2250 we don't keep it across iterations. For now, just disable
2251 versioning in this case.
2252 ??? We could actually unroll the loop to achieve the required
2253 overall step alignment, and forcing the alignment could be
2254 done by doing some iterations of the non-vectorized loop. */
2255 if (!multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
2256 * DR_STEP_ALIGNMENT (dr),
2257 DR_TARGET_ALIGNMENT (dr_info)))
2258 {
2259 do_versioning = false;
2260 break;
2261 }
2262
2263 /* The rightmost bits of an aligned address must be zeros.
2264 Construct the mask needed for this test. For example,
2265 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
2266 mask must be 15 = 0xf. */
2267 mask = size - 1;
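                  /* Conceptually, the versioning condition emitted later
                     amounts to a check along the lines of the following
                     sketch (the pointer name is only illustrative):

                         if (((uintptr_t) p & mask) == 0)
                           ... vectorized loop, p assumed aligned ...
                         else
                           ... scalar fallback loop ...  */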
2268
2269 /* FORNOW: use the same mask to test all potentially unaligned
2270 references in the loop. */
2271 if (LOOP_VINFO_PTR_MASK (loop_vinfo)
2272 && LOOP_VINFO_PTR_MASK (loop_vinfo) != mask)
2273 {
2274 do_versioning = false;
2275 break;
2276 }
2277
2278 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
2279 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (stmt_info);
2280 }
2281 }
2282
2283 /* Versioning requires at least one misaligned data reference. */
2284 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2285 do_versioning = false;
2286 else if (!do_versioning)
2287 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
2288 }
2289
2290 if (do_versioning)
2291 {
2292 vec<stmt_vec_info> may_misalign_stmts
2293 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
2294 stmt_vec_info stmt_info;
2295
2296 /* It can now be assumed that the data references in the statements
2297 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
2298 of the loop being vectorized. */
2299 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt_info)
2300 {
2301 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
2302 SET_DR_MISALIGNMENT (dr_info, 0);
2303 if (dump_enabled_p ())
2304 dump_printf_loc (MSG_NOTE, vect_location,
2305 "Alignment of access forced using versioning.\n");
2306 }
2307
2308 if (dump_enabled_p ())
2309 dump_printf_loc (MSG_NOTE, vect_location,
2310 "Versioning for alignment will be applied.\n");
2311
2312 /* Peeling and versioning can't be done together at this time. */
2313 gcc_assert (! (do_peeling && do_versioning));
2314
2315 opt_result stat = vect_verify_datarefs_alignment (loop_vinfo);
2316 gcc_assert (stat);
2317 return stat;
2318 }
2319
2320 /* This point is reached if neither peeling nor versioning is being done. */
2321 gcc_assert (! (do_peeling || do_versioning));
2322
2323 opt_result stat = vect_verify_datarefs_alignment (loop_vinfo);
2324 return stat;
2325 }
2326
2327
2328 /* Function vect_find_same_alignment_drs.
2329
2330 Update group and alignment relations in VINFO according to the chosen
2331 vectorization factor. */
2332
2333 static void
2334 vect_find_same_alignment_drs (vec_info *vinfo, data_dependence_relation *ddr)
2335 {
2336 struct data_reference *dra = DDR_A (ddr);
2337 struct data_reference *drb = DDR_B (ddr);
2338 dr_vec_info *dr_info_a = vinfo->lookup_dr (dra);
2339 dr_vec_info *dr_info_b = vinfo->lookup_dr (drb);
2340 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
2341 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
2342
2343 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
2344 return;
2345
2346 if (dra == drb)
2347 return;
2348
2349 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
2350 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
2351 return;
2352
2353 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0)
2354 || !operand_equal_p (DR_OFFSET (dra), DR_OFFSET (drb), 0)
2355 || !operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2356 return;
2357
2358 /* Two references with distance zero have the same alignment. */
2359 poly_offset_int diff = (wi::to_poly_offset (DR_INIT (dra))
2360 - wi::to_poly_offset (DR_INIT (drb)));
2361 if (maybe_ne (diff, 0))
2362 {
2363 /* Get the wider of the two alignments. */
2364 poly_uint64 align_a =
2365 exact_div (vect_calculate_target_alignment (dr_info_a),
2366 BITS_PER_UNIT);
2367 poly_uint64 align_b =
2368 exact_div (vect_calculate_target_alignment (dr_info_b),
2369 BITS_PER_UNIT);
2370 unsigned HOST_WIDE_INT align_a_c, align_b_c;
2371 if (!align_a.is_constant (&align_a_c)
2372 || !align_b.is_constant (&align_b_c))
2373 return;
2374
2375 unsigned HOST_WIDE_INT max_align = MAX (align_a_c, align_b_c);
2376
2377 /* Require the gap to be a multiple of the larger vector alignment. */
2378 if (!multiple_p (diff, max_align))
2379 return;
2380 }
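  /* For example, assuming 4-byte elements and a 16-byte target alignment
     for both references: accesses to a[i] and a[i + 8] are 32 bytes apart,
     which is a multiple of 16, so whatever alignment the first access has,
     the second has the same one and the pair is recorded below.  */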
2381
2382 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
2383 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
2384 if (dump_enabled_p ())
2385 dump_printf_loc (MSG_NOTE, vect_location,
2386 "accesses have the same alignment: %T and %T\n",
2387 DR_REF (dra), DR_REF (drb));
2388 }
2389
2390
2391 /* Function vect_analyze_data_refs_alignment
2392
2393 Analyze the alignment of the data-references in the loop.
2394 Return FALSE if a data reference is found that cannot be vectorized. */
2395
2396 opt_result
2397 vect_analyze_data_refs_alignment (loop_vec_info vinfo)
2398 {
2399 DUMP_VECT_SCOPE ("vect_analyze_data_refs_alignment");
2400
2401 /* Mark groups of data references with the same alignment using
2402 data dependence information. */
2403 vec<ddr_p> ddrs = vinfo->shared->ddrs;
2404 struct data_dependence_relation *ddr;
2405 unsigned int i;
2406
2407 FOR_EACH_VEC_ELT (ddrs, i, ddr)
2408 vect_find_same_alignment_drs (vinfo, ddr);
2409
2410 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
2411 struct data_reference *dr;
2412
2413 vect_record_base_alignments (vinfo);
2414 FOR_EACH_VEC_ELT (datarefs, i, dr)
2415 {
2416 dr_vec_info *dr_info = vinfo->lookup_dr (dr);
2417 if (STMT_VINFO_VECTORIZABLE (dr_info->stmt))
2418 vect_compute_data_ref_alignment (vinfo, dr_info);
2419 }
2420
2421 return opt_result::success ();
2422 }
2423
2424
2425 /* Analyze alignment of DRs of stmts in NODE. */
2426
2427 static bool
2428 vect_slp_analyze_and_verify_node_alignment (vec_info *vinfo, slp_tree node)
2429 {
2430 /* We vectorize from the first scalar stmt in the node unless
2431 the node is permuted, in which case we start from the first
2432 element in the group. */
2433 stmt_vec_info first_stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2434 dr_vec_info *first_dr_info = STMT_VINFO_DR_INFO (first_stmt_info);
2435 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2436 first_stmt_info = DR_GROUP_FIRST_ELEMENT (first_stmt_info);
2437
2438 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (first_stmt_info);
2439 vect_compute_data_ref_alignment (vinfo, dr_info);
2440 /* For creating the data-ref pointer we need alignment of the
2441 first element anyway. */
2442 if (dr_info != first_dr_info)
2443 vect_compute_data_ref_alignment (vinfo, first_dr_info);
2444 if (! verify_data_ref_alignment (vinfo, dr_info))
2445 {
2446 if (dump_enabled_p ())
2447 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2448 "not vectorized: bad data alignment in basic "
2449 "block.\n");
2450 return false;
2451 }
2452
2453 return true;
2454 }
2455
2456 /* Function vect_slp_analyze_and_verify_instance_alignment
2457
2458 Analyze the alignment of the data-references in the SLP instance.
2459 Return FALSE if a data reference is found that cannot be vectorized. */
2460
2461 bool
2462 vect_slp_analyze_and_verify_instance_alignment (vec_info *vinfo,
2463 slp_instance instance)
2464 {
2465 DUMP_VECT_SCOPE ("vect_slp_analyze_and_verify_instance_alignment");
2466
2467 slp_tree node;
2468 unsigned i;
2469 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
2470 if (! vect_slp_analyze_and_verify_node_alignment (vinfo, node))
2471 return false;
2472
2473 node = SLP_INSTANCE_TREE (instance);
2474 if (STMT_VINFO_DATA_REF (SLP_TREE_SCALAR_STMTS (node)[0])
2475 && ! vect_slp_analyze_and_verify_node_alignment
2476 (vinfo, SLP_INSTANCE_TREE (instance)))
2477 return false;
2478
2479 return true;
2480 }
2481
2482
2483 /* Analyze groups of accesses: check that DR_INFO belongs to a group of
2484 accesses of legal size, step, etc. Detect gaps, single element
2485 interleaving, and other special cases. Set grouped access info.
2486 Collect groups of strided stores for further use in SLP analysis.
2487 Worker for vect_analyze_group_access. */
2488
2489 static bool
2490 vect_analyze_group_access_1 (vec_info *vinfo, dr_vec_info *dr_info)
2491 {
2492 data_reference *dr = dr_info->dr;
2493 tree step = DR_STEP (dr);
2494 tree scalar_type = TREE_TYPE (DR_REF (dr));
2495 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2496 stmt_vec_info stmt_info = dr_info->stmt;
2497 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
2498 bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo);
2499 HOST_WIDE_INT dr_step = -1;
2500 HOST_WIDE_INT groupsize, last_accessed_element = 1;
2501 bool slp_impossible = false;
2502
2503 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2504 size of the interleaving group (including gaps). */
2505 if (tree_fits_shwi_p (step))
2506 {
2507 dr_step = tree_to_shwi (step);
2508 /* Check that STEP is a multiple of type size. Otherwise there is
2509 a non-element-sized gap at the end of the group which we
2510 cannot represent in DR_GROUP_GAP or DR_GROUP_SIZE.
2511 ??? As we can handle non-constant step fine here we should
2512 simply remove uses of DR_GROUP_GAP between the last and first
2513 element and instead rely on DR_STEP. DR_GROUP_SIZE then would
2514 simply not include that gap. */
2515 if ((dr_step % type_size) != 0)
2516 {
2517 if (dump_enabled_p ())
2518 dump_printf_loc (MSG_NOTE, vect_location,
2519 "Step %T is not a multiple of the element size"
2520 " for %T\n",
2521 step, DR_REF (dr));
2522 return false;
2523 }
2524 groupsize = absu_hwi (dr_step) / type_size;
2525 }
2526 else
2527 groupsize = 0;
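  /* For instance, a constant step of 32 bytes over 4-byte elements gives a
     GROUPSIZE of 8, while a non-constant step leaves GROUPSIZE at 0 and it
     is derived from the interleaving chain further below.  */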
2528
2529 /* A non-consecutive access is possible only if it is part of an interleaving group. */
2530 if (!DR_GROUP_FIRST_ELEMENT (stmt_info))
2531 {
2532 /* Check if this DR is part of an interleaving group, and is a single
2533 element of the group that is accessed in the loop. */
2534
2535 /* Gaps are supported only for loads. STEP must be a multiple of the type
2536 size. */
2537 if (DR_IS_READ (dr)
2538 && (dr_step % type_size) == 0
2539 && groupsize > 0)
2540 {
2541 DR_GROUP_FIRST_ELEMENT (stmt_info) = stmt_info;
2542 DR_GROUP_SIZE (stmt_info) = groupsize;
2543 DR_GROUP_GAP (stmt_info) = groupsize - 1;
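          /* E.g., if only a[4*i] is read in the loop (4-byte elements, a
             step of 16 bytes), this single load forms its own group with
             DR_GROUP_SIZE 4 and DR_GROUP_GAP 3: three elements are skipped
             after it in every iteration.  */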
2544 if (dump_enabled_p ())
2545 dump_printf_loc (MSG_NOTE, vect_location,
2546 "Detected single element interleaving %T"
2547 " step %T\n",
2548 DR_REF (dr), step);
2549
2550 return true;
2551 }
2552
2553 if (dump_enabled_p ())
2554 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2555 "not consecutive access %G", stmt_info->stmt);
2556
2557 if (bb_vinfo)
2558 {
2559 /* Mark the statement as unvectorizable. */
2560 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
2561 return true;
2562 }
2563
2564 if (dump_enabled_p ())
2565 dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
2566 STMT_VINFO_STRIDED_P (stmt_info) = true;
2567 return true;
2568 }
2569
2570 if (DR_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info)
2571 {
2572 /* First stmt in the interleaving chain. Check the chain. */
2573 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2574 struct data_reference *data_ref = dr;
2575 unsigned int count = 1;
2576 tree prev_init = DR_INIT (data_ref);
2577 HOST_WIDE_INT diff, gaps = 0;
2578
2579 /* By construction, all group members have INTEGER_CST DR_INITs. */
2580 while (next)
2581 {
2582 /* We never have the same DR multiple times. */
2583 gcc_assert (tree_int_cst_compare (DR_INIT (data_ref),
2584 DR_INIT (STMT_VINFO_DATA_REF (next))) != 0);
2585
2586 data_ref = STMT_VINFO_DATA_REF (next);
2587
2588 /* All group members have the same STEP by construction. */
2589 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2590
2591 /* Check that the distance between two accesses is equal to the type
2592 size. Otherwise, we have gaps. */
2593 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2594 - TREE_INT_CST_LOW (prev_init)) / type_size;
2595 if (diff != 1)
2596 {
2597 /* FORNOW: SLP of accesses with gaps is not supported. */
2598 slp_impossible = true;
2599 if (DR_IS_WRITE (data_ref))
2600 {
2601 if (dump_enabled_p ())
2602 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2603 "interleaved store with gaps\n");
2604 return false;
2605 }
2606
2607 gaps += diff - 1;
2608 }
2609
2610 last_accessed_element += diff;
2611
2612 /* Store the gap from the previous member of the group. If there is no
2613 gap in the access, DR_GROUP_GAP is always 1. */
2614 DR_GROUP_GAP (next) = diff;
2615
2616 prev_init = DR_INIT (data_ref);
2617 next = DR_GROUP_NEXT_ELEMENT (next);
2618 /* Count the number of data-refs in the chain. */
2619 count++;
2620 }
2621
2622 if (groupsize == 0)
2623 groupsize = count + gaps;
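      /* As an example with an assumed non-constant outer step: member
         accesses at element offsets 0, 1 and 3 give DIFFs of 1 and 2, so
         COUNT = 3, GAPS = 1 and GROUPSIZE becomes 3 + 1 = 4, while
         LAST_ACCESSED_ELEMENT is 1 + 1 + 2 = 4, leaving no trailing gap.  */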
2624
2625 /* This could be UINT_MAX but as we are generating code in a very
2626 inefficient way we have to cap earlier. See PR78699 for example. */
2627 if (groupsize > 4096)
2628 {
2629 if (dump_enabled_p ())
2630 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2631 "group is too large\n");
2632 return false;
2633 }
2634
2635 /* Check that the size of the interleaving is equal to count for stores,
2636 i.e., that there are no gaps. */
2637 if (groupsize != count
2638 && !DR_IS_READ (dr))
2639 {
2640 groupsize = count;
2641 STMT_VINFO_STRIDED_P (stmt_info) = true;
2642 }
2643
2644 /* If there is a gap after the last load in the group it is the
2645 difference between the groupsize and the last accessed
2646 element.
2647 When there is no gap, this difference should be 0. */
2648 DR_GROUP_GAP (stmt_info) = groupsize - last_accessed_element;
2649
2650 DR_GROUP_SIZE (stmt_info) = groupsize;
2651 if (dump_enabled_p ())
2652 {
2653 dump_printf_loc (MSG_NOTE, vect_location,
2654 "Detected interleaving ");
2655 if (DR_IS_READ (dr))
2656 dump_printf (MSG_NOTE, "load ");
2657 else if (STMT_VINFO_STRIDED_P (stmt_info))
2658 dump_printf (MSG_NOTE, "strided store ");
2659 else
2660 dump_printf (MSG_NOTE, "store ");
2661 dump_printf (MSG_NOTE, "of size %u\n",
2662 (unsigned)groupsize);
2663 dump_printf_loc (MSG_NOTE, vect_location, "\t%G", stmt_info->stmt);
2664 next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2665 while (next)
2666 {
2667 if (DR_GROUP_GAP (next) != 1)
2668 dump_printf_loc (MSG_NOTE, vect_location,
2669 "\t<gap of %d elements>\n",
2670 DR_GROUP_GAP (next) - 1);
2671 dump_printf_loc (MSG_NOTE, vect_location, "\t%G", next->stmt);
2672 next = DR_GROUP_NEXT_ELEMENT (next);
2673 }
2674 if (DR_GROUP_GAP (stmt_info) != 0)
2675 dump_printf_loc (MSG_NOTE, vect_location,
2676 "\t<gap of %d elements>\n",
2677 DR_GROUP_GAP (stmt_info));
2678 }
2679
2680 /* SLP: create an SLP data structure for every interleaving group of
2681 stores for further analysis in vect_analyze_slp. */
2682 if (DR_IS_WRITE (dr) && !slp_impossible)
2683 {
2684 if (loop_vinfo)
2685 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt_info);
2686 if (bb_vinfo)
2687 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt_info);
2688 }
2689 }
2690
2691 return true;
2692 }
2693
2694 /* Analyze groups of accesses: check that DR_INFO belongs to a group of
2695 accesses of legal size, step, etc. Detect gaps, single element
2696 interleaving, and other special cases. Set grouped access info.
2697 Collect groups of strided stores for further use in SLP analysis. */
2698
2699 static bool
2700 vect_analyze_group_access (vec_info *vinfo, dr_vec_info *dr_info)
2701 {
2702 if (!vect_analyze_group_access_1 (vinfo, dr_info))
2703 {
2704 /* Dissolve the group if present. */
2705 stmt_vec_info stmt_info = DR_GROUP_FIRST_ELEMENT (dr_info->stmt);
2706 while (stmt_info)
2707 {
2708 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2709 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2710 DR_GROUP_NEXT_ELEMENT (stmt_info) = NULL;
2711 stmt_info = next;
2712 }
2713 return false;
2714 }
2715 return true;
2716 }
2717
2718 /* Analyze the access pattern of the data-reference DR_INFO.
2719 In case of non-consecutive accesses call vect_analyze_group_access() to
2720 analyze groups of accesses. */
2721
2722 static bool
2723 vect_analyze_data_ref_access (vec_info *vinfo, dr_vec_info *dr_info)
2724 {
2725 data_reference *dr = dr_info->dr;
2726 tree step = DR_STEP (dr);
2727 tree scalar_type = TREE_TYPE (DR_REF (dr));
2728 stmt_vec_info stmt_info = dr_info->stmt;
2729 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
2730 class loop *loop = NULL;
2731
2732 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
2733 return true;
2734
2735 if (loop_vinfo)
2736 loop = LOOP_VINFO_LOOP (loop_vinfo);
2737
2738 if (loop_vinfo && !step)
2739 {
2740 if (dump_enabled_p ())
2741 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2742 "bad data-ref access in loop\n");
2743 return false;
2744 }
2745
2746 /* Allow loads with zero step in inner-loop vectorization. */
2747 if (loop_vinfo && integer_zerop (step))
2748 {
2749 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2750 if (!nested_in_vect_loop_p (loop, stmt_info))
2751 return DR_IS_READ (dr);
2752 /* Allow references with zero step for outer loops marked
2753 with pragma omp simd only - it guarantees absence of
2754 loop-carried dependencies between inner loop iterations. */
2755 if (loop->safelen < 2)
2756 {
2757 if (dump_enabled_p ())
2758 dump_printf_loc (MSG_NOTE, vect_location,
2759 "zero step in inner loop of nest\n");
2760 return false;
2761 }
2762 }
2763
2764 if (loop && nested_in_vect_loop_p (loop, stmt_info))
2765 {
2766 /* Interleaved accesses are not yet supported within outer-loop
2767 vectorization for references in the inner-loop. */
2768 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2769
2770 /* For the rest of the analysis we use the outer-loop step. */
2771 step = STMT_VINFO_DR_STEP (stmt_info);
2772 if (integer_zerop (step))
2773 {
2774 if (dump_enabled_p ())
2775 dump_printf_loc (MSG_NOTE, vect_location,
2776 "zero step in outer loop.\n");
2777 return DR_IS_READ (dr);
2778 }
2779 }
2780
2781 /* Consecutive? */
2782 if (TREE_CODE (step) == INTEGER_CST)
2783 {
2784 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2785 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2786 || (dr_step < 0
2787 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2788 {
2789 /* Mark that it is not interleaving. */
2790 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2791 return true;
2792 }
2793 }
2794
2795 if (loop && nested_in_vect_loop_p (loop, stmt_info))
2796 {
2797 if (dump_enabled_p ())
2798 dump_printf_loc (MSG_NOTE, vect_location,
2799 "grouped access in outer loop.\n");
2800 return false;
2801 }
2802
2803
2804 /* Assume this is a DR handled by non-constant strided load case. */
2805 if (TREE_CODE (step) != INTEGER_CST)
2806 return (STMT_VINFO_STRIDED_P (stmt_info)
2807 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2808 || vect_analyze_group_access (vinfo, dr_info)));
2809
2810 /* Not a consecutive access - check if it's part of an interleaving group. */
2811 return vect_analyze_group_access (vinfo, dr_info);
2812 }
2813
2814 /* Comparison function used to sort data-references DRA and DRB into
2815 chunks suitable for grouping. */
2816
2817 static int
2818 dr_group_sort_cmp (const void *dra_, const void *drb_)
2819 {
2820 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2821 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2822 int cmp;
2823
2824 /* Stabilize sort. */
2825 if (dra == drb)
2826 return 0;
2827
2828 /* DRs in different loops never belong to the same group. */
2829 loop_p loopa = gimple_bb (DR_STMT (dra))->loop_father;
2830 loop_p loopb = gimple_bb (DR_STMT (drb))->loop_father;
2831 if (loopa != loopb)
2832 return loopa->num < loopb->num ? -1 : 1;
2833
2834 /* Ordering of DRs according to base. */
2835 cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2836 DR_BASE_ADDRESS (drb));
2837 if (cmp != 0)
2838 return cmp;
2839
2840 /* And according to DR_OFFSET. */
2841 cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2842 if (cmp != 0)
2843 return cmp;
2844
2845 /* Put reads before writes. */
2846 if (DR_IS_READ (dra) != DR_IS_READ (drb))
2847 return DR_IS_READ (dra) ? -1 : 1;
2848
2849 /* Then sort by access size. */
2850 cmp = data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2851 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2852 if (cmp != 0)
2853 return cmp;
2854
2855 /* And then by step. */
2856 cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
2857 if (cmp != 0)
2858 return cmp;
2859
2860 /* Then sort by DR_INIT. In case of identical DRs, sort by stmt UID. */
2861 cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
2862 if (cmp == 0)
2863 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2864 return cmp;
2865 }
2866
2867 /* If OP is the result of a conversion, return the unconverted value,
2868 otherwise return null. */
2869
2870 static tree
2871 strip_conversion (tree op)
2872 {
2873 if (TREE_CODE (op) != SSA_NAME)
2874 return NULL_TREE;
2875 gimple *stmt = SSA_NAME_DEF_STMT (op);
2876 if (!is_gimple_assign (stmt)
2877 || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (stmt)))
2878 return NULL_TREE;
2879 return gimple_assign_rhs1 (stmt);
2880 }
2881
2882 /* Return true if vectorizable_* routines can handle statements STMT1_INFO
2883 and STMT2_INFO being in a single group. When ALLOW_SLP_P, masked loads can
2884 be grouped in SLP mode. */
2885
2886 static bool
2887 can_group_stmts_p (stmt_vec_info stmt1_info, stmt_vec_info stmt2_info,
2888 bool allow_slp_p)
2889 {
2890 if (gimple_assign_single_p (stmt1_info->stmt))
2891 return gimple_assign_single_p (stmt2_info->stmt);
2892
2893 gcall *call1 = dyn_cast <gcall *> (stmt1_info->stmt);
2894 if (call1 && gimple_call_internal_p (call1))
2895 {
2896 /* Check for two masked loads or two masked stores. */
2897 gcall *call2 = dyn_cast <gcall *> (stmt2_info->stmt);
2898 if (!call2 || !gimple_call_internal_p (call2))
2899 return false;
2900 internal_fn ifn = gimple_call_internal_fn (call1);
2901 if (ifn != IFN_MASK_LOAD && ifn != IFN_MASK_STORE)
2902 return false;
2903 if (ifn != gimple_call_internal_fn (call2))
2904 return false;
2905
2906 /* Check that the masks are the same. Cope with casts of masks,
2907 like those created by build_mask_conversion. */
2908 tree mask1 = gimple_call_arg (call1, 2);
2909 tree mask2 = gimple_call_arg (call2, 2);
2910 if (!operand_equal_p (mask1, mask2, 0)
2911 && (ifn == IFN_MASK_STORE || !allow_slp_p))
2912 {
2913 mask1 = strip_conversion (mask1);
2914 if (!mask1)
2915 return false;
2916 mask2 = strip_conversion (mask2);
2917 if (!mask2)
2918 return false;
2919 if (!operand_equal_p (mask1, mask2, 0))
2920 return false;
2921 }
2922 return true;
2923 }
2924
2925 return false;
2926 }
2927
2928 /* Function vect_analyze_data_ref_accesses.
2929
2930 Analyze the access pattern of all the data references in the loop.
2931
2932 FORNOW: the only access pattern that is considered vectorizable is a
2933 simple step 1 (consecutive) access.
2934
2935 FORNOW: handle only arrays and pointer accesses. */
2936
2937 opt_result
2938 vect_analyze_data_ref_accesses (vec_info *vinfo)
2939 {
2940 unsigned int i;
2941 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
2942 struct data_reference *dr;
2943
2944 DUMP_VECT_SCOPE ("vect_analyze_data_ref_accesses");
2945
2946 if (datarefs.is_empty ())
2947 return opt_result::success ();
2948
2949 /* Sort the array of datarefs to make building the interleaving chains
2950 linear. Don't modify the original vector's order; it is needed for
2951 determining what dependencies are reversed. */
2952 vec<data_reference_p> datarefs_copy = datarefs.copy ();
2953 datarefs_copy.qsort (dr_group_sort_cmp);
2954 hash_set<stmt_vec_info> to_fixup;
2955
2956 /* Build the interleaving chains. */
2957 for (i = 0; i < datarefs_copy.length () - 1;)
2958 {
2959 data_reference_p dra = datarefs_copy[i];
2960 dr_vec_info *dr_info_a = vinfo->lookup_dr (dra);
2961 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
2962 stmt_vec_info lastinfo = NULL;
2963 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
2964 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a))
2965 {
2966 ++i;
2967 continue;
2968 }
2969 for (i = i + 1; i < datarefs_copy.length (); ++i)
2970 {
2971 data_reference_p drb = datarefs_copy[i];
2972 dr_vec_info *dr_info_b = vinfo->lookup_dr (drb);
2973 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
2974 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_b)
2975 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
2976 break;
2977
2978 /* ??? Imperfect sorting (non-compatible types, non-modulo
2979 accesses, same accesses) can lead to a group being artificially
2980 split here as we don't just skip over those. If it really
2981 matters we can push those to a worklist and re-iterate
2982 over them. Then we can just skip ahead to the next DR here. */
2983
2984 /* DRs in different loops should not be put into the same
2985 interleaving group. */
2986 if (gimple_bb (DR_STMT (dra))->loop_father
2987 != gimple_bb (DR_STMT (drb))->loop_father)
2988 break;
2989
2990 /* Check that the data-refs have the same first location (except init)
2991 and that they are both either stores or loads (not a load and a store,
2992 nor incompatible masked accesses). */
2993 if (DR_IS_READ (dra) != DR_IS_READ (drb)
2994 || data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2995 DR_BASE_ADDRESS (drb)) != 0
2996 || data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb)) != 0
2997 || !can_group_stmts_p (stmtinfo_a, stmtinfo_b, true))
2998 break;
2999
3000 /* Check that the data-refs have the same constant size. */
3001 tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
3002 tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
3003 if (!tree_fits_uhwi_p (sza)
3004 || !tree_fits_uhwi_p (szb)
3005 || !tree_int_cst_equal (sza, szb))
3006 break;
3007
3008 /* Check that the data-refs have the same step. */
3009 if (data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb)) != 0)
3010 break;
3011
3012 /* Check the types are compatible.
3013 ??? We don't distinguish this during sorting. */
3014 if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
3015 TREE_TYPE (DR_REF (drb))))
3016 break;
3017
3018 /* Check that the DR_INITs are compile-time constants. */
3019 if (TREE_CODE (DR_INIT (dra)) != INTEGER_CST
3020 || TREE_CODE (DR_INIT (drb)) != INTEGER_CST)
3021 break;
3022
3023 /* Different .GOMP_SIMD_LANE calls still give the same lane,
3024 just hold extra information. */
3025 if (STMT_VINFO_SIMD_LANE_ACCESS_P (stmtinfo_a)
3026 && STMT_VINFO_SIMD_LANE_ACCESS_P (stmtinfo_b)
3027 && data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb)) == 0)
3028 break;
3029
3030 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
3031 HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
3032 HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
3033 HOST_WIDE_INT init_prev
3034 = TREE_INT_CST_LOW (DR_INIT (datarefs_copy[i-1]));
3035 gcc_assert (init_a <= init_b
3036 && init_a <= init_prev
3037 && init_prev <= init_b);
3038
3039 /* Do not place the same access in the interleaving chain twice. */
3040 if (init_b == init_prev)
3041 {
3042 gcc_assert (gimple_uid (DR_STMT (datarefs_copy[i-1]))
3043 < gimple_uid (DR_STMT (drb)));
3044 /* Simply link in duplicates and fix up the chain below. */
3045 }
3046 else
3047 {
3048 /* If init_b == init_a + the size of the type * k, we have an
3049 interleaving, and DRA is accessed before DRB. */
3050 HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
3051 if (type_size_a == 0
3052 || (init_b - init_a) % type_size_a != 0)
3053 break;
3054
3055 /* If we have a store, the accesses must be adjacent. This splits
3056 groups into chunks we support (we don't support vectorization
3057 of stores with gaps). */
3058 if (!DR_IS_READ (dra) && init_b - init_prev != type_size_a)
3059 break;
3060
3061 /* If the step (when constant and nonzero) is greater than the
3062 difference between the data-refs' inits, this splits groups into
3063 suitable sizes. */
3064 if (tree_fits_shwi_p (DR_STEP (dra)))
3065 {
3066 HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra));
3067 if (step != 0 && step <= (init_b - init_a))
3068 break;
3069 }
3070 }
3071
3072 if (dump_enabled_p ())
3073 dump_printf_loc (MSG_NOTE, vect_location,
3074 DR_IS_READ (dra)
3075 ? "Detected interleaving load %T and %T\n"
3076 : "Detected interleaving store %T and %T\n",
3077 DR_REF (dra), DR_REF (drb));
3078
3079 /* Link the found element into the group list. */
3080 if (!DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
3081 {
3082 DR_GROUP_FIRST_ELEMENT (stmtinfo_a) = stmtinfo_a;
3083 lastinfo = stmtinfo_a;
3084 }
3085 DR_GROUP_FIRST_ELEMENT (stmtinfo_b) = stmtinfo_a;
3086 DR_GROUP_NEXT_ELEMENT (lastinfo) = stmtinfo_b;
3087 lastinfo = stmtinfo_b;
3088
3089 STMT_VINFO_SLP_VECT_ONLY (stmtinfo_a)
3090 = !can_group_stmts_p (stmtinfo_a, stmtinfo_b, false);
3091
3092 if (dump_enabled_p () && STMT_VINFO_SLP_VECT_ONLY (stmtinfo_a))
3093 dump_printf_loc (MSG_NOTE, vect_location,
3094 "Load suitable for SLP vectorization only.\n");
3095
3096 if (init_b == init_prev
3097 && !to_fixup.add (DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
3098 && dump_enabled_p ())
3099 dump_printf_loc (MSG_NOTE, vect_location,
3100 "Queuing group with duplicate access for fixup\n");
3101 }
3102 }
3103
3104 /* Fix up groups with duplicate entries by splitting them. */
3105 while (1)
3106 {
3107 hash_set<stmt_vec_info>::iterator it = to_fixup.begin ();
3108 if (!(it != to_fixup.end ()))
3109 break;
3110 stmt_vec_info grp = *it;
3111 to_fixup.remove (grp);
3112
3113 /* Find the earliest duplicate group member. */
3114 unsigned first_duplicate = -1u;
3115 stmt_vec_info next, g = grp;
3116 while ((next = DR_GROUP_NEXT_ELEMENT (g)))
3117 {
3118 if (tree_int_cst_equal (DR_INIT (STMT_VINFO_DR_INFO (next)->dr),
3119 DR_INIT (STMT_VINFO_DR_INFO (g)->dr))
3120 && gimple_uid (STMT_VINFO_STMT (next)) < first_duplicate)
3121 first_duplicate = gimple_uid (STMT_VINFO_STMT (next));
3122 g = next;
3123 }
3124 if (first_duplicate == -1U)
3125 continue;
3126
3127 /* Then move all stmts after the first duplicate to a new group.
3128 Note this is a heuristic, but one with the property that the
3129 group GRP itself is fixed up completely. */
3130 g = grp;
3131 stmt_vec_info newgroup = NULL, ng = grp;
3132 while ((next = DR_GROUP_NEXT_ELEMENT (g)))
3133 {
3134 if (gimple_uid (STMT_VINFO_STMT (next)) >= first_duplicate)
3135 {
3136 DR_GROUP_NEXT_ELEMENT (g) = DR_GROUP_NEXT_ELEMENT (next);
3137 if (!newgroup)
3138 newgroup = next;
3139 else
3140 DR_GROUP_NEXT_ELEMENT (ng) = next;
3141 ng = next;
3142 DR_GROUP_FIRST_ELEMENT (ng) = newgroup;
3143 }
3144 else
3145 g = DR_GROUP_NEXT_ELEMENT (g);
3146 }
3147 DR_GROUP_NEXT_ELEMENT (ng) = NULL;
3148
3149 /* Fix up the new group, which may still contain duplicates. */
3150 to_fixup.add (newgroup);
3151 }
3152
3153 FOR_EACH_VEC_ELT (datarefs_copy, i, dr)
3154 {
3155 dr_vec_info *dr_info = vinfo->lookup_dr (dr);
3156 if (STMT_VINFO_VECTORIZABLE (dr_info->stmt)
3157 && !vect_analyze_data_ref_access (vinfo, dr_info))
3158 {
3159 if (dump_enabled_p ())
3160 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3161 "not vectorized: complicated access pattern.\n");
3162
3163 if (is_a <bb_vec_info> (vinfo))
3164 {
3165 /* Mark the statement as not vectorizable. */
3166 STMT_VINFO_VECTORIZABLE (dr_info->stmt) = false;
3167 continue;
3168 }
3169 else
3170 {
3171 datarefs_copy.release ();
3172 return opt_result::failure_at (dr_info->stmt->stmt,
3173 "not vectorized:"
3174 " complicated access pattern.\n");
3175 }
3176 }
3177 }
3178
3179 datarefs_copy.release ();
3180 return opt_result::success ();
3181 }
3182
3183 /* Function vect_vfa_segment_size.
3184
3185 Input:
3186 DR_INFO: The data reference.
3187 LENGTH_FACTOR: segment length to consider.
3188
3189 Return a value suitable for the dr_with_seg_len::seg_len field.
3190 This is the "distance travelled" by the pointer from the first
3191 iteration in the segment to the last. Note that it does not include
3192 the size of the access; in effect it only describes the first byte. */
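/* As a minimal sketch with assumed values: a 4-byte DR_STEP and a
   LENGTH_FACTOR (vectorization factor) of 8 give a segment length of
   4 * (8 - 1) = 28 bytes; the bytes touched by the accesses themselves are
   added separately via the access size.  */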
3193
3194 static tree
3195 vect_vfa_segment_size (dr_vec_info *dr_info, tree length_factor)
3196 {
3197 length_factor = size_binop (MINUS_EXPR,
3198 fold_convert (sizetype, length_factor),
3199 size_one_node);
3200 return size_binop (MULT_EXPR, fold_convert (sizetype, DR_STEP (dr_info->dr)),
3201 length_factor);
3202 }
3203
3204 /* Return a value that, when added to abs (vect_vfa_segment_size (DR_INFO)),
3205 gives the worst-case number of bytes covered by the segment. */
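/* For example (assumed values): a group of 4-byte scalars with
   DR_GROUP_SIZE 4 and DR_GROUP_GAP 1 yields an access size of
   4 * (4 - 1) = 12 bytes, so together with a 28-byte segment length the
   segment spans at most 40 bytes.  */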
3206
3207 static unsigned HOST_WIDE_INT
3208 vect_vfa_access_size (vec_info *vinfo, dr_vec_info *dr_info)
3209 {
3210 stmt_vec_info stmt_vinfo = dr_info->stmt;
3211 tree ref_type = TREE_TYPE (DR_REF (dr_info->dr));
3212 unsigned HOST_WIDE_INT ref_size = tree_to_uhwi (TYPE_SIZE_UNIT (ref_type));
3213 unsigned HOST_WIDE_INT access_size = ref_size;
3214 if (DR_GROUP_FIRST_ELEMENT (stmt_vinfo))
3215 {
3216 gcc_assert (DR_GROUP_FIRST_ELEMENT (stmt_vinfo) == stmt_vinfo);
3217 access_size *= DR_GROUP_SIZE (stmt_vinfo) - DR_GROUP_GAP (stmt_vinfo);
3218 }
3219 if (STMT_VINFO_VEC_STMT (stmt_vinfo)
3220 && (vect_supportable_dr_alignment (vinfo, dr_info, false)
3221 == dr_explicit_realign_optimized))
3222 {
3223 /* We might access a full vector's worth. */
3224 tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
3225 access_size += tree_to_uhwi (TYPE_SIZE_UNIT (vectype)) - ref_size;
3226 }
3227 return access_size;
3228 }
3229
3230 /* Get the minimum alignment for all the scalar accesses that DR_INFO
3231 describes. */
3232
3233 static unsigned int
3234 vect_vfa_align (dr_vec_info *dr_info)
3235 {
3236 return TYPE_ALIGN_UNIT (TREE_TYPE (DR_REF (dr_info->dr)));
3237 }
3238
3239 /* Function vect_compile_time_alias.
3240
3241 Given data references A and B with equal base and offset, see whether
3242 the alias relation can be decided at compilation time. Return 1 if
3243 it can and the references alias, 0 if it can and the references do
3244 not alias, and -1 if we cannot decide at compile time. SEGMENT_LENGTH_A,
3245 SEGMENT_LENGTH_B, ACCESS_SIZE_A and ACCESS_SIZE_B are the equivalent
3246 of dr_with_seg_len::{seg_len,access_size} for A and B. */
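/* As an illustrative example with assumed values: if both steps are
   positive, the segment lengths are 28 bytes, the access sizes are 4 bytes
   and DR_INIT gives offsets 0 and 64, then the ranges [0, 32) and [64, 96)
   cannot overlap and 0 is returned; with offsets 0 and 16 the ranges
   [0, 32) and [16, 48) are known to overlap and 1 is returned.  */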
3247
3248 static int
3249 vect_compile_time_alias (dr_vec_info *a, dr_vec_info *b,
3250 tree segment_length_a, tree segment_length_b,
3251 unsigned HOST_WIDE_INT access_size_a,
3252 unsigned HOST_WIDE_INT access_size_b)
3253 {
3254 poly_offset_int offset_a = wi::to_poly_offset (DR_INIT (a->dr));
3255 poly_offset_int offset_b = wi::to_poly_offset (DR_INIT (b->dr));
3256 poly_uint64 const_length_a;
3257 poly_uint64 const_length_b;
3258
3259 /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
3260 bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
3261 [a, a+12) */
3262 if (tree_int_cst_compare (DR_STEP (a->dr), size_zero_node) < 0)
3263 {
3264 const_length_a = (-wi::to_poly_wide (segment_length_a)).force_uhwi ();
3265 offset_a -= const_length_a;
3266 }
3267 else
3268 const_length_a = tree_to_poly_uint64 (segment_length_a);
3269 if (tree_int_cst_compare (DR_STEP (b->dr), size_zero_node) < 0)
3270 {
3271 const_length_b = (-wi::to_poly_wide (segment_length_b)).force_uhwi ();
3272 offset_b -= const_length_b;
3273 }
3274 else
3275 const_length_b = tree_to_poly_uint64 (segment_length_b);
3276
3277 const_length_a += access_size_a;
3278 const_length_b += access_size_b;
3279
3280 if (ranges_known_overlap_p (offset_a, const_length_a,
3281 offset_b, const_length_b))
3282 return 1;
3283
3284 if (!ranges_maybe_overlap_p (offset_a, const_length_a,
3285 offset_b, const_length_b))
3286 return 0;
3287
3288 return -1;
3289 }
3290
3291 /* Return true if the minimum nonzero dependence distance for loop LOOP_DEPTH
3292 in DDR is >= VF. */
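/* E.g., for an assumed VF of 4, a distance vector entry of 8 at LOOP_DEPTH
   satisfies the check (8 >= 4), whereas an entry of 2 would make the
   function return false.  */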
3293
3294 static bool
3295 dependence_distance_ge_vf (data_dependence_relation *ddr,
3296 unsigned int loop_depth, poly_uint64 vf)
3297 {
3298 if (DDR_ARE_DEPENDENT (ddr) != NULL_TREE
3299 || DDR_NUM_DIST_VECTS (ddr) == 0)
3300 return false;
3301
3302 /* If the dependence is exact, we should have limited the VF instead. */
3303 gcc_checking_assert (DDR_COULD_BE_INDEPENDENT_P (ddr));
3304
3305 unsigned int i;
3306 lambda_vector dist_v;
3307 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
3308 {
3309 HOST_WIDE_INT dist = dist_v[loop_depth];
3310 if (dist != 0
3311 && !(dist > 0 && DDR_REVERSED_P (ddr))
3312 && maybe_lt ((unsigned HOST_WIDE_INT) abs_hwi (dist), vf))
3313 return false;
3314 }
3315
3316 if (dump_enabled_p ())
3317 dump_printf_loc (MSG_NOTE, vect_location,
3318 "dependence distance between %T and %T is >= VF\n",
3319 DR_REF (DDR_A (ddr)), DR_REF (DDR_B (ddr)));
3320
3321 return true;
3322 }
3323
3324 /* Dump LOWER_BOUND using flags DUMP_KIND. Dumps are known to be enabled. */
3325
3326 static void
3327 dump_lower_bound (dump_flags_t dump_kind, const vec_lower_bound &lower_bound)
3328 {
3329 dump_printf (dump_kind, "%s (%T) >= ",
3330 lower_bound.unsigned_p ? "unsigned" : "abs",
3331 lower_bound.expr);
3332 dump_dec (dump_kind, lower_bound.min_value);
3333 }
3334
3335 /* Record that the vectorized loop requires the vec_lower_bound described
3336 by EXPR, UNSIGNED_P and MIN_VALUE. */
3337
3338 static void
3339 vect_check_lower_bound (loop_vec_info loop_vinfo, tree expr, bool unsigned_p,
3340 poly_uint64 min_value)
3341 {
3342 vec<vec_lower_bound> lower_bounds = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
3343 for (unsigned int i = 0; i < lower_bounds.length (); ++i)
3344 if (operand_equal_p (lower_bounds[i].expr, expr, 0))
3345 {
3346 unsigned_p &= lower_bounds[i].unsigned_p;
3347 min_value = upper_bound (lower_bounds[i].min_value, min_value);
3348 if (lower_bounds[i].unsigned_p != unsigned_p
3349 || maybe_lt (lower_bounds[i].min_value, min_value))
3350 {
3351 lower_bounds[i].unsigned_p = unsigned_p;
3352 lower_bounds[i].min_value = min_value;
3353 if (dump_enabled_p ())
3354 {
3355 dump_printf_loc (MSG_NOTE, vect_location,
3356 "updating run-time check to ");
3357 dump_lower_bound (MSG_NOTE, lower_bounds[i]);
3358 dump_printf (MSG_NOTE, "\n");
3359 }
3360 }
3361 return;
3362 }
3363
3364 vec_lower_bound lower_bound (expr, unsigned_p, min_value);
3365 if (dump_enabled_p ())
3366 {
3367 dump_printf_loc (MSG_NOTE, vect_location, "need a run-time check that ");
3368 dump_lower_bound (MSG_NOTE, lower_bound);
3369 dump_printf (MSG_NOTE, "\n");
3370 }
3371 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).safe_push (lower_bound);
3372 }
3373
3374 /* Return true if it's unlikely that the step of the vectorized form of DR_INFO
3375 will span fewer than GAP bytes. */
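/* Rough illustration: with an estimated vectorization factor of 4,
   a group size of 2 and 4-byte scalar accesses, one vector iteration
   is expected to span about 4 * 2 * 4 = 32 bytes, so any GAP of up to
   32 bytes is treated as small.  */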
3376
3377 static bool
3378 vect_small_gap_p (loop_vec_info loop_vinfo, dr_vec_info *dr_info,
3379 poly_int64 gap)
3380 {
3381 stmt_vec_info stmt_info = dr_info->stmt;
3382 HOST_WIDE_INT count
3383 = estimated_poly_value (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
3384 if (DR_GROUP_FIRST_ELEMENT (stmt_info))
3385 count *= DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (stmt_info));
3386 return (estimated_poly_value (gap)
3387 <= count * vect_get_scalar_dr_size (dr_info));
3388 }
3389
3390 /* Return true if we know that there is no alias between DR_INFO_A and
3391 DR_INFO_B when abs (DR_STEP (DR_INFO_A->dr)) >= N for some N.
3392 When returning true, set *LOWER_BOUND_OUT to this N. */
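/* For illustration only: for two 4-byte accesses a[i * stride] and
   a[i * stride + 4] (equal base, offset and variable step), DR_INIT
   differs by 16 bytes, so one scalar iteration spans 16 + 4 = 20
   bytes; the function then returns true with *LOWER_BOUND_OUT set to
   20, meaning there is no alias whenever abs (DR_STEP) >= 20.  */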
3393
3394 static bool
3395 vectorizable_with_step_bound_p (dr_vec_info *dr_info_a, dr_vec_info *dr_info_b,
3396 poly_uint64 *lower_bound_out)
3397 {
3398 /* Check that there is a constant gap of known sign between DR_A
3399 and DR_B. */
3400 data_reference *dr_a = dr_info_a->dr;
3401 data_reference *dr_b = dr_info_b->dr;
3402 poly_int64 init_a, init_b;
3403 if (!operand_equal_p (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b), 0)
3404 || !operand_equal_p (DR_OFFSET (dr_a), DR_OFFSET (dr_b), 0)
3405 || !operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0)
3406 || !poly_int_tree_p (DR_INIT (dr_a), &init_a)
3407 || !poly_int_tree_p (DR_INIT (dr_b), &init_b)
3408 || !ordered_p (init_a, init_b))
3409 return false;
3410
3411 /* Sort DR_A and DR_B by the address they access. */
3412 if (maybe_lt (init_b, init_a))
3413 {
3414 std::swap (init_a, init_b);
3415 std::swap (dr_info_a, dr_info_b);
3416 std::swap (dr_a, dr_b);
3417 }
3418
3419 /* If the two accesses could be dependent within a scalar iteration,
3420 make sure that we'd retain their order. */
3421 if (maybe_gt (init_a + vect_get_scalar_dr_size (dr_info_a), init_b)
3422 && !vect_preserves_scalar_order_p (dr_info_a, dr_info_b))
3423 return false;
3424
3425 /* There is no alias if abs (DR_STEP) is greater than or equal to
3426 the bytes spanned by the combination of the two accesses. */
3427 *lower_bound_out = init_b + vect_get_scalar_dr_size (dr_info_b) - init_a;
3428 return true;
3429 }
3430
3431 /* Function vect_prune_runtime_alias_test_list.
3432
3433 Prune a list of ddrs to be tested at run-time by versioning for alias.
3434 Merge several alias checks into one if possible.
3435 Return FALSE if the resulting list of ddrs is longer than allowed by
3436 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
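/* Each check that survives the pruning below is, conceptually, a test
   that the two segments do not overlap, roughly of the form

     addr_a + seg_len_a <= addr_b || addr_b + seg_len_b <= addr_a

   with the access sizes and alignments folded in; this is only a
   sketch, the exact expression is built later when the loop is
   versioned for alias.  */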
3437
3438 opt_result
3439 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
3440 {
3441 typedef pair_hash <tree_operand_hash, tree_operand_hash> tree_pair_hash;
3442 hash_set <tree_pair_hash> compared_objects;
3443
3444 vec<ddr_p> may_alias_ddrs = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
3445 vec<dr_with_seg_len_pair_t> &comp_alias_ddrs
3446 = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3447 vec<vec_object_pair> &check_unequal_addrs
3448 = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3449 poly_uint64 vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3450 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
3451
3452 ddr_p ddr;
3453 unsigned int i;
3454 tree length_factor;
3455
3456 DUMP_VECT_SCOPE ("vect_prune_runtime_alias_test_list");
3457
3458 /* Step values are irrelevant for aliasing if the number of vector
3459 iterations is equal to the number of scalar iterations (which can
3460 happen for fully-SLP loops). */
3461 bool ignore_step_p = known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U);
3462
3463 if (!ignore_step_p)
3464 {
3465 /* Convert the checks for nonzero steps into bound tests. */
3466 tree value;
3467 FOR_EACH_VEC_ELT (LOOP_VINFO_CHECK_NONZERO (loop_vinfo), i, value)
3468 vect_check_lower_bound (loop_vinfo, value, true, 1);
3469 }
3470
3471 if (may_alias_ddrs.is_empty ())
3472 return opt_result::success ();
3473
3474 comp_alias_ddrs.create (may_alias_ddrs.length ());
3475
3476 unsigned int loop_depth
3477 = index_in_loop_nest (LOOP_VINFO_LOOP (loop_vinfo)->num,
3478 LOOP_VINFO_LOOP_NEST (loop_vinfo));
3479
3480 /* First, we collect all data ref pairs for aliasing checks. */
3481 FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
3482 {
3483 poly_uint64 lower_bound;
3484 tree segment_length_a, segment_length_b;
3485 unsigned HOST_WIDE_INT access_size_a, access_size_b;
3486 unsigned int align_a, align_b;
3487
3488 /* Ignore the alias if the VF we chose ended up being no greater
3489 than the dependence distance. */
3490 if (dependence_distance_ge_vf (ddr, loop_depth, vect_factor))
3491 continue;
3492
3493 if (DDR_OBJECT_A (ddr))
3494 {
3495 vec_object_pair new_pair (DDR_OBJECT_A (ddr), DDR_OBJECT_B (ddr));
3496 if (!compared_objects.add (new_pair))
3497 {
3498 if (dump_enabled_p ())
3499 dump_printf_loc (MSG_NOTE, vect_location,
3500 "checking that %T and %T"
3501 " have different addresses\n",
3502 new_pair.first, new_pair.second);
3503 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).safe_push (new_pair);
3504 }
3505 continue;
3506 }
3507
3508 dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (DDR_A (ddr));
3509 stmt_vec_info stmt_info_a = dr_info_a->stmt;
3510
3511 dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (DDR_B (ddr));
3512 stmt_vec_info stmt_info_b = dr_info_b->stmt;
3513
3514 bool preserves_scalar_order_p
3515 = vect_preserves_scalar_order_p (dr_info_a, dr_info_b);
3516
3517 /* Skip the pair if inter-iteration dependencies are irrelevant
3518 and intra-iteration dependencies are guaranteed to be honored. */
3519 if (ignore_step_p
3520 && (preserves_scalar_order_p
3521 || vectorizable_with_step_bound_p (dr_info_a, dr_info_b,
3522 &lower_bound)))
3523 {
3524 if (dump_enabled_p ())
3525 dump_printf_loc (MSG_NOTE, vect_location,
3526 "no need for alias check between "
3527 "%T and %T when VF is 1\n",
3528 DR_REF (dr_info_a->dr), DR_REF (dr_info_b->dr));
3529 continue;
3530 }
3531
3532 /* See whether we can handle the alias using a bounds check on
3533 the step, and whether that's likely to be the best approach.
3534 (It might not be, for example, if the minimum step is much larger
3535 than the number of bytes handled by one vector iteration.) */
3536 if (!ignore_step_p
3537 && TREE_CODE (DR_STEP (dr_info_a->dr)) != INTEGER_CST
3538 && vectorizable_with_step_bound_p (dr_info_a, dr_info_b,
3539 &lower_bound)
3540 && (vect_small_gap_p (loop_vinfo, dr_info_a, lower_bound)
3541 || vect_small_gap_p (loop_vinfo, dr_info_b, lower_bound)))
3542 {
3543 bool unsigned_p = dr_known_forward_stride_p (dr_info_a->dr);
3544 if (dump_enabled_p ())
3545 {
3546 dump_printf_loc (MSG_NOTE, vect_location, "no alias between "
3547 "%T and %T when the step %T is outside ",
3548 DR_REF (dr_info_a->dr),
3549 DR_REF (dr_info_b->dr),
3550 DR_STEP (dr_info_a->dr));
3551 if (unsigned_p)
3552 dump_printf (MSG_NOTE, "[0");
3553 else
3554 {
3555 dump_printf (MSG_NOTE, "(");
3556 dump_dec (MSG_NOTE, poly_int64 (-lower_bound));
3557 }
3558 dump_printf (MSG_NOTE, ", ");
3559 dump_dec (MSG_NOTE, lower_bound);
3560 dump_printf (MSG_NOTE, ")\n");
3561 }
3562 vect_check_lower_bound (loop_vinfo, DR_STEP (dr_info_a->dr),
3563 unsigned_p, lower_bound);
3564 continue;
3565 }
3566
3567 stmt_vec_info dr_group_first_a = DR_GROUP_FIRST_ELEMENT (stmt_info_a);
3568 if (dr_group_first_a)
3569 {
3570 stmt_info_a = dr_group_first_a;
3571 dr_info_a = STMT_VINFO_DR_INFO (stmt_info_a);
3572 }
3573
3574 stmt_vec_info dr_group_first_b = DR_GROUP_FIRST_ELEMENT (stmt_info_b);
3575 if (dr_group_first_b)
3576 {
3577 stmt_info_b = dr_group_first_b;
3578 dr_info_b = STMT_VINFO_DR_INFO (stmt_info_b);
3579 }
3580
3581 if (ignore_step_p)
3582 {
3583 segment_length_a = size_zero_node;
3584 segment_length_b = size_zero_node;
3585 }
3586 else
3587 {
3588 if (!operand_equal_p (DR_STEP (dr_info_a->dr),
3589 DR_STEP (dr_info_b->dr), 0))
3590 length_factor = scalar_loop_iters;
3591 else
3592 length_factor = size_int (vect_factor);
3593 segment_length_a = vect_vfa_segment_size (dr_info_a, length_factor);
3594 segment_length_b = vect_vfa_segment_size (dr_info_b, length_factor);
3595 }
3596 access_size_a = vect_vfa_access_size (loop_vinfo, dr_info_a);
3597 access_size_b = vect_vfa_access_size (loop_vinfo, dr_info_b);
3598 align_a = vect_vfa_align (dr_info_a);
3599 align_b = vect_vfa_align (dr_info_b);
3600
3601 /* See whether the alias is known at compilation time. */
3602 if (operand_equal_p (DR_BASE_ADDRESS (dr_info_a->dr),
3603 DR_BASE_ADDRESS (dr_info_b->dr), 0)
3604 && operand_equal_p (DR_OFFSET (dr_info_a->dr),
3605 DR_OFFSET (dr_info_b->dr), 0)
3606 && TREE_CODE (DR_STEP (dr_info_a->dr)) == INTEGER_CST
3607 && TREE_CODE (DR_STEP (dr_info_b->dr)) == INTEGER_CST
3608 && poly_int_tree_p (segment_length_a)
3609 && poly_int_tree_p (segment_length_b))
3610 {
3611 int res = vect_compile_time_alias (dr_info_a, dr_info_b,
3612 segment_length_a,
3613 segment_length_b,
3614 access_size_a,
3615 access_size_b);
3616 if (res >= 0 && dump_enabled_p ())
3617 {
3618 dump_printf_loc (MSG_NOTE, vect_location,
3619 "can tell at compile time that %T and %T",
3620 DR_REF (dr_info_a->dr), DR_REF (dr_info_b->dr));
3621 if (res == 0)
3622 dump_printf (MSG_NOTE, " do not alias\n");
3623 else
3624 dump_printf (MSG_NOTE, " alias\n");
3625 }
3626
3627 if (res == 0)
3628 continue;
3629
3630 if (res == 1)
3631 return opt_result::failure_at (stmt_info_b->stmt,
3632 "not vectorized:"
3633 " compilation time alias: %G%G",
3634 stmt_info_a->stmt,
3635 stmt_info_b->stmt);
3636 }
3637
3638 dr_with_seg_len dr_a (dr_info_a->dr, segment_length_a,
3639 access_size_a, align_a);
3640 dr_with_seg_len dr_b (dr_info_b->dr, segment_length_b,
3641 access_size_b, align_b);
3642 /* Canonicalize the order to be the one that's needed for accurate
3643 RAW, WAR and WAW flags, in cases where the data references are
3644 well-ordered. The order doesn't really matter otherwise,
3645 but we might as well be consistent. */
3646 if (get_later_stmt (stmt_info_a, stmt_info_b) == stmt_info_a)
3647 std::swap (dr_a, dr_b);
3648
3649 dr_with_seg_len_pair_t dr_with_seg_len_pair
3650 (dr_a, dr_b, (preserves_scalar_order_p
3651 ? dr_with_seg_len_pair_t::WELL_ORDERED
3652 : dr_with_seg_len_pair_t::REORDERED));
3653
3654 comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
3655 }
3656
3657 prune_runtime_alias_test_list (&comp_alias_ddrs, vect_factor);
3658
3659 unsigned int count = (comp_alias_ddrs.length ()
3660 + check_unequal_addrs.length ());
3661
3662 if (dump_enabled_p ())
3663 dump_printf_loc (MSG_NOTE, vect_location,
3664 "improved number of alias checks from %d to %d\n",
3665 may_alias_ddrs.length (), count);
3666 unsigned limit = param_vect_max_version_for_alias_checks;
3667 if (flag_simd_cost_model == VECT_COST_MODEL_CHEAP)
3668 limit = param_vect_max_version_for_alias_checks * 6 / 10;
3669 if (count > limit)
3670 return opt_result::failure_at
3671 (vect_location,
3672 "number of versioning for alias run-time tests exceeds %d "
3673 "(--param vect-max-version-for-alias-checks)\n", limit);
3674
3675 return opt_result::success ();
3676 }
3677
3678 /* Check whether we can use an internal function for a gather load
3679 or scatter store. READ_P is true for loads and false for stores.
3680 MASKED_P is true if the load or store is conditional. MEMORY_TYPE is
3681 the type of the memory elements being loaded or stored. OFFSET_TYPE
3682 is the type of the offset that is being applied to the invariant
3683 base address. SCALE is the amount by which the offset should
3684 be multiplied *after* it has been converted to address width.
3685
3686 Return true if the function is supported, storing the function id in
3687 *IFN_OUT and the vector type for the offset in *OFFSET_VECTYPE_OUT. */
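/* For example: starting from a 16-bit OFFSET_TYPE, the loop below
   first asks the target about a gather/scatter with a 16-bit offset
   vector; if that is not supported it retries with a 32-bit and then
   a 64-bit offset type, and gives up once the offset type is at least
   as wide as both a pointer and the vector elements.  */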
3688
3689 bool
3690 vect_gather_scatter_fn_p (vec_info *vinfo, bool read_p, bool masked_p,
3691 tree vectype, tree memory_type, tree offset_type,
3692 int scale, internal_fn *ifn_out,
3693 tree *offset_vectype_out)
3694 {
3695 unsigned int memory_bits = tree_to_uhwi (TYPE_SIZE (memory_type));
3696 unsigned int element_bits = vector_element_bits (vectype);
3697 if (element_bits != memory_bits)
3698 /* For now the vector elements must be the same width as the
3699 memory elements. */
3700 return false;
3701
3702 /* Work out which function we need. */
3703 internal_fn ifn;
3704 if (read_p)
3705 ifn = masked_p ? IFN_MASK_GATHER_LOAD : IFN_GATHER_LOAD;
3706 else
3707 ifn = masked_p ? IFN_MASK_SCATTER_STORE : IFN_SCATTER_STORE;
3708
3709 for (;;)
3710 {
3711 tree offset_vectype = get_vectype_for_scalar_type (vinfo, offset_type);
3712 if (!offset_vectype)
3713 return false;
3714
3715 /* Test whether the target supports this combination. */
3716 if (internal_gather_scatter_fn_supported_p (ifn, vectype, memory_type,
3717 offset_vectype, scale))
3718 {
3719 *ifn_out = ifn;
3720 *offset_vectype_out = offset_vectype;
3721 return true;
3722 }
3723
3724 if (TYPE_PRECISION (offset_type) >= POINTER_SIZE
3725 && TYPE_PRECISION (offset_type) >= element_bits)
3726 return false;
3727
3728 offset_type = build_nonstandard_integer_type
3729 (TYPE_PRECISION (offset_type) * 2, TYPE_UNSIGNED (offset_type));
3730 }
3731 }
3732
3733 /* STMT_INFO is a call to an internal gather load or scatter store function.
3734 Describe the operation in INFO. */
3735
3736 static void
3737 vect_describe_gather_scatter_call (stmt_vec_info stmt_info,
3738 gather_scatter_info *info)
3739 {
3740 gcall *call = as_a <gcall *> (stmt_info->stmt);
3741 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3742 data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3743
3744 info->ifn = gimple_call_internal_fn (call);
3745 info->decl = NULL_TREE;
3746 info->base = gimple_call_arg (call, 0);
3747 info->offset = gimple_call_arg (call, 1);
3748 info->offset_dt = vect_unknown_def_type;
3749 info->offset_vectype = NULL_TREE;
3750 info->scale = TREE_INT_CST_LOW (gimple_call_arg (call, 2));
3751 info->element_type = TREE_TYPE (vectype);
3752 info->memory_type = TREE_TYPE (DR_REF (dr));
3753 }
3754
3755 /* Return true if a non-affine read or write in STMT_INFO is suitable for a
3756 gather load or scatter store. Describe the operation in *INFO if so. */
3757
3758 bool
3759 vect_check_gather_scatter (stmt_vec_info stmt_info, loop_vec_info loop_vinfo,
3760 gather_scatter_info *info)
3761 {
3762 HOST_WIDE_INT scale = 1;
3763 poly_int64 pbitpos, pbitsize;
3764 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3765 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3766 tree offtype = NULL_TREE;
3767 tree decl = NULL_TREE, base, off;
3768 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3769 tree memory_type = TREE_TYPE (DR_REF (dr));
3770 machine_mode pmode;
3771 int punsignedp, reversep, pvolatilep = 0;
3772 internal_fn ifn;
3773 tree offset_vectype;
3774 bool masked_p = false;
3775
3776 /* See whether this is already a call to a gather/scatter internal function.
3777 If not, see whether it's a masked load or store. */
3778 gcall *call = dyn_cast <gcall *> (stmt_info->stmt);
3779 if (call && gimple_call_internal_p (call))
3780 {
3781 ifn = gimple_call_internal_fn (call);
3782 if (internal_gather_scatter_fn_p (ifn))
3783 {
3784 vect_describe_gather_scatter_call (stmt_info, info);
3785 return true;
3786 }
3787 masked_p = (ifn == IFN_MASK_LOAD || ifn == IFN_MASK_STORE);
3788 }
3789
3790 /* True if we should aim to use internal functions rather than
3791 built-in functions. */
3792 bool use_ifn_p = (DR_IS_READ (dr)
3793 ? supports_vec_gather_load_p ()
3794 : supports_vec_scatter_store_p ());
3795
3796 base = DR_REF (dr);
3797 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF;
3798 see if we can use the def stmt of the address. */
3799 if (masked_p
3800 && TREE_CODE (base) == MEM_REF
3801 && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
3802 && integer_zerop (TREE_OPERAND (base, 1))
3803 && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
3804 {
3805 gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
3806 if (is_gimple_assign (def_stmt)
3807 && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
3808 base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
3809 }
3810
3811 /* The gather and scatter builtins need address of the form
3812 loop_invariant + vector * {1, 2, 4, 8}
3813 or
3814 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3815 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3816 of loop invariants/SSA_NAMEs defined in the loop, with casts,
3817 multiplications and additions in it. To get a vector, we need
3818 a single SSA_NAME that will be defined in the loop and will
3819 contain everything that is not loop invariant and that can be
3820 vectorized. The following code attempts to find such a preexisting
3821 SSA_NAME OFF and put the loop invariants into a tree BASE
3822 that can be gimplified before the loop. */
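/* For instance (illustrative only, assuming the target supports the
   scale): for a gather-like load a[b[i]] from an int array, the code
   below ends up with BASE = &a, OFF = the SSA_NAME holding b[i]
   (defined in the loop) and SCALE = 4, matching the
   loop_invariant + vector * scale form described above.  */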
3823 base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
3824 &punsignedp, &reversep, &pvolatilep);
3825 if (reversep)
3826 return false;
3827
3828 poly_int64 pbytepos = exact_div (pbitpos, BITS_PER_UNIT);
3829
3830 if (TREE_CODE (base) == MEM_REF)
3831 {
3832 if (!integer_zerop (TREE_OPERAND (base, 1)))
3833 {
3834 if (off == NULL_TREE)
3835 off = wide_int_to_tree (sizetype, mem_ref_offset (base));
3836 else
3837 off = size_binop (PLUS_EXPR, off,
3838 fold_convert (sizetype, TREE_OPERAND (base, 1)));
3839 }
3840 base = TREE_OPERAND (base, 0);
3841 }
3842 else
3843 base = build_fold_addr_expr (base);
3844
3845 if (off == NULL_TREE)
3846 off = size_zero_node;
3847
3848 /* If BASE is not loop invariant, then if OFF is 0 we start with just
3849 the constant offset in the loop-invariant BASE and continue with BASE
3850 as OFF; otherwise give up.
3851 We could handle that case by gimplifying the addition of BASE + OFF
3852 into some SSA_NAME and using that as OFF, but for now punt. */
3853 if (!expr_invariant_in_loop_p (loop, base))
3854 {
3855 if (!integer_zerop (off))
3856 return false;
3857 off = base;
3858 base = size_int (pbytepos);
3859 }
3860 /* Otherwise put base + constant offset into the loop invariant BASE
3861 and continue with OFF. */
3862 else
3863 {
3864 base = fold_convert (sizetype, base);
3865 base = size_binop (PLUS_EXPR, base, size_int (pbytepos));
3866 }
3867
3868 /* OFF at this point may be either a SSA_NAME or some tree expression
3869 from get_inner_reference. Try to peel off loop invariants from it
3870 into BASE as long as possible. */
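/* E.g. (illustrative only, assuming the target accepts the resulting
   scale): if OFF is ((sizetype) j_3 + 4) * 8, the MULT_EXPR case
   records SCALE = 8, the PLUS_EXPR case moves the invariant 4 (scaled
   to 32 bytes) into BASE, and the conversion is then stripped,
   leaving OFF = j_3 with SCALE = 8.  */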
3871 STRIP_NOPS (off);
3872 while (offtype == NULL_TREE)
3873 {
3874 enum tree_code code;
3875 tree op0, op1, add = NULL_TREE;
3876
3877 if (TREE_CODE (off) == SSA_NAME)
3878 {
3879 gimple *def_stmt = SSA_NAME_DEF_STMT (off);
3880
3881 if (expr_invariant_in_loop_p (loop, off))
3882 return false;
3883
3884 if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
3885 break;
3886
3887 op0 = gimple_assign_rhs1 (def_stmt);
3888 code = gimple_assign_rhs_code (def_stmt);
3889 op1 = gimple_assign_rhs2 (def_stmt);
3890 }
3891 else
3892 {
3893 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
3894 return false;
3895 code = TREE_CODE (off);
3896 extract_ops_from_tree (off, &code, &op0, &op1);
3897 }
3898 switch (code)
3899 {
3900 case POINTER_PLUS_EXPR:
3901 case PLUS_EXPR:
3902 if (expr_invariant_in_loop_p (loop, op0))
3903 {
3904 add = op0;
3905 off = op1;
3906 do_add:
3907 add = fold_convert (sizetype, add);
3908 if (scale != 1)
3909 add = size_binop (MULT_EXPR, add, size_int (scale));
3910 base = size_binop (PLUS_EXPR, base, add);
3911 continue;
3912 }
3913 if (expr_invariant_in_loop_p (loop, op1))
3914 {
3915 add = op1;
3916 off = op0;
3917 goto do_add;
3918 }
3919 break;
3920 case MINUS_EXPR:
3921 if (expr_invariant_in_loop_p (loop, op1))
3922 {
3923 add = fold_convert (sizetype, op1);
3924 add = size_binop (MINUS_EXPR, size_zero_node, add);
3925 off = op0;
3926 goto do_add;
3927 }
3928 break;
3929 case MULT_EXPR:
3930 if (scale == 1 && tree_fits_shwi_p (op1))
3931 {
3932 int new_scale = tree_to_shwi (op1);
3933 /* Only treat this as a scaling operation if the target
3934 supports it for at least some offset type. */
3935 if (use_ifn_p
3936 && !vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
3937 masked_p, vectype, memory_type,
3938 signed_char_type_node,
3939 new_scale, &ifn,
3940 &offset_vectype)
3941 && !vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
3942 masked_p, vectype, memory_type,
3943 unsigned_char_type_node,
3944 new_scale, &ifn,
3945 &offset_vectype))
3946 break;
3947 scale = new_scale;
3948 off = op0;
3949 continue;
3950 }
3951 break;
3952 case SSA_NAME:
3953 off = op0;
3954 continue;
3955 CASE_CONVERT:
3956 if (!POINTER_TYPE_P (TREE_TYPE (op0))
3957 && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3958 break;
3959
3960 /* Don't include the conversion if the target is happy with
3961 the current offset type. */
3962 if (use_ifn_p
3963 && vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
3964 masked_p, vectype, memory_type,
3965 TREE_TYPE (off), scale, &ifn,
3966 &offset_vectype))
3967 break;
3968
3969 if (TYPE_PRECISION (TREE_TYPE (op0))
3970 == TYPE_PRECISION (TREE_TYPE (off)))
3971 {
3972 off = op0;
3973 continue;
3974 }
3975
3976 if (TYPE_PRECISION (TREE_TYPE (op0))
3977 < TYPE_PRECISION (TREE_TYPE (off)))
3978 {
3979 off = op0;
3980 offtype = TREE_TYPE (off);
3981 STRIP_NOPS (off);
3982 continue;
3983 }
3984 break;
3985 default:
3986 break;
3987 }
3988 break;
3989 }
3990
3991 /* If at the end OFF still isn't a SSA_NAME or isn't
3992 defined in the loop, punt. */
3993 if (TREE_CODE (off) != SSA_NAME
3994 || expr_invariant_in_loop_p (loop, off))
3995 return false;
3996
3997 if (offtype == NULL_TREE)
3998 offtype = TREE_TYPE (off);
3999
4000 if (use_ifn_p)
4001 {
4002 if (!vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr), masked_p,
4003 vectype, memory_type, offtype, scale,
4004 &ifn, &offset_vectype))
4005 return false;
4006 }
4007 else
4008 {
4009 if (DR_IS_READ (dr))
4010 {
4011 if (targetm.vectorize.builtin_gather)
4012 decl = targetm.vectorize.builtin_gather (vectype, offtype, scale);
4013 }
4014 else
4015 {
4016 if (targetm.vectorize.builtin_scatter)
4017 decl = targetm.vectorize.builtin_scatter (vectype, offtype, scale);
4018 }
4019
4020 if (!decl)
4021 return false;
4022
4023 ifn = IFN_LAST;
4024 /* The offset vector type will be read from DECL when needed. */
4025 offset_vectype = NULL_TREE;
4026 }
4027
4028 info->ifn = ifn;
4029 info->decl = decl;
4030 info->base = base;
4031 info->offset = off;
4032 info->offset_dt = vect_unknown_def_type;
4033 info->offset_vectype = offset_vectype;
4034 info->scale = scale;
4035 info->element_type = TREE_TYPE (vectype);
4036 info->memory_type = memory_type;
4037 return true;
4038 }
4039
4040 /* Find the data references in STMT, analyze them with respect to LOOP and
4041 append them to DATAREFS. Return false if datarefs in this stmt cannot
4042 be handled. */
4043
4044 opt_result
4045 vect_find_stmt_data_reference (loop_p loop, gimple *stmt,
4046 vec<data_reference_p> *datarefs)
4047 {
4048 /* We can ignore clobbers for dataref analysis - they are removed during
4049 loop vectorization and BB vectorization checks dependences with a
4050 stmt walk. */
4051 if (gimple_clobber_p (stmt))
4052 return opt_result::success ();
4053
4054 if (gimple_has_volatile_ops (stmt))
4055 return opt_result::failure_at (stmt, "not vectorized: volatile type: %G",
4056 stmt);
4057
4058 if (stmt_can_throw_internal (cfun, stmt))
4059 return opt_result::failure_at (stmt,
4060 "not vectorized:"
4061 " statement can throw an exception: %G",
4062 stmt);
4063
4064 auto_vec<data_reference_p, 2> refs;
4065 opt_result res = find_data_references_in_stmt (loop, stmt, &refs);
4066 if (!res)
4067 return res;
4068
4069 if (refs.is_empty ())
4070 return opt_result::success ();
4071
4072 if (refs.length () > 1)
4073 return opt_result::failure_at (stmt,
4074 "not vectorized:"
4075 " more than one data ref in stmt: %G", stmt);
4076
4077 if (gcall *call = dyn_cast <gcall *> (stmt))
4078 if (!gimple_call_internal_p (call)
4079 || (gimple_call_internal_fn (call) != IFN_MASK_LOAD
4080 && gimple_call_internal_fn (call) != IFN_MASK_STORE))
4081 return opt_result::failure_at (stmt,
4082 "not vectorized: dr in a call %G", stmt);
4083
4084 data_reference_p dr = refs.pop ();
4085 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
4086 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
4087 return opt_result::failure_at (stmt,
4088 "not vectorized:"
4089 " statement is bitfield access %G", stmt);
4090
4091 if (DR_BASE_ADDRESS (dr)
4092 && TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
4093 return opt_result::failure_at (stmt,
4094 "not vectorized:"
4095 " base addr of dr is a constant\n");
4096
4097 /* Check whether this may be a SIMD lane access and adjust the
4098 DR to make it easier for us to handle it. */
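/* Roughly (illustrative only): for an access like array[off_7] inside
   an "omp simd" loop, where off_7 is ultimately defined by a call to
   the internal function GOMP_SIMD_LANE for loop->simduid, the DR is
   rewritten below to use &array as the base, a zero offset and the
   element size as the step, and is marked as a simd-lane access.  */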
4099 if (loop
4100 && loop->simduid
4101 && (!DR_BASE_ADDRESS (dr)
4102 || !DR_OFFSET (dr)
4103 || !DR_INIT (dr)
4104 || !DR_STEP (dr)))
4105 {
4106 struct data_reference *newdr
4107 = create_data_ref (NULL, loop_containing_stmt (stmt), DR_REF (dr), stmt,
4108 DR_IS_READ (dr), DR_IS_CONDITIONAL_IN_STMT (dr));
4109 if (DR_BASE_ADDRESS (newdr)
4110 && DR_OFFSET (newdr)
4111 && DR_INIT (newdr)
4112 && DR_STEP (newdr)
4113 && TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
4114 && integer_zerop (DR_STEP (newdr)))
4115 {
4116 tree base_address = DR_BASE_ADDRESS (newdr);
4117 tree off = DR_OFFSET (newdr);
4118 tree step = ssize_int (1);
4119 if (integer_zerop (off)
4120 && TREE_CODE (base_address) == POINTER_PLUS_EXPR)
4121 {
4122 off = TREE_OPERAND (base_address, 1);
4123 base_address = TREE_OPERAND (base_address, 0);
4124 }
4125 STRIP_NOPS (off);
4126 if (TREE_CODE (off) == MULT_EXPR
4127 && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
4128 {
4129 step = TREE_OPERAND (off, 1);
4130 off = TREE_OPERAND (off, 0);
4131 STRIP_NOPS (off);
4132 }
4133 if (CONVERT_EXPR_P (off)
4134 && (TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off, 0)))
4135 < TYPE_PRECISION (TREE_TYPE (off))))
4136 off = TREE_OPERAND (off, 0);
4137 if (TREE_CODE (off) == SSA_NAME)
4138 {
4139 gimple *def = SSA_NAME_DEF_STMT (off);
4140 /* Look through widening conversion. */
4141 if (is_gimple_assign (def)
4142 && CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (def)))
4143 {
4144 tree rhs1 = gimple_assign_rhs1 (def);
4145 if (TREE_CODE (rhs1) == SSA_NAME
4146 && INTEGRAL_TYPE_P (TREE_TYPE (rhs1))
4147 && (TYPE_PRECISION (TREE_TYPE (off))
4148 > TYPE_PRECISION (TREE_TYPE (rhs1))))
4149 def = SSA_NAME_DEF_STMT (rhs1);
4150 }
4151 if (is_gimple_call (def)
4152 && gimple_call_internal_p (def)
4153 && (gimple_call_internal_fn (def) == IFN_GOMP_SIMD_LANE))
4154 {
4155 tree arg = gimple_call_arg (def, 0);
4156 tree reft = TREE_TYPE (DR_REF (newdr));
4157 gcc_assert (TREE_CODE (arg) == SSA_NAME);
4158 arg = SSA_NAME_VAR (arg);
4159 if (arg == loop->simduid
4160 /* For now. */
4161 && tree_int_cst_equal (TYPE_SIZE_UNIT (reft), step))
4162 {
4163 DR_BASE_ADDRESS (newdr) = base_address;
4164 DR_OFFSET (newdr) = ssize_int (0);
4165 DR_STEP (newdr) = step;
4166 DR_OFFSET_ALIGNMENT (newdr) = BIGGEST_ALIGNMENT;
4167 DR_STEP_ALIGNMENT (newdr) = highest_pow2_factor (step);
4168 /* Mark as simd-lane access. */
4169 tree arg2 = gimple_call_arg (def, 1);
4170 newdr->aux = (void *) (-1 - tree_to_uhwi (arg2));
4171 free_data_ref (dr);
4172 datarefs->safe_push (newdr);
4173 return opt_result::success ();
4174 }
4175 }
4176 }
4177 }
4178 free_data_ref (newdr);
4179 }
4180
4181 datarefs->safe_push (dr);
4182 return opt_result::success ();
4183 }
4184
4185 /* Function vect_analyze_data_refs.
4186
4187 Find all the data references in the loop or basic block.
4188
4189 The general structure of the analysis of data refs in the vectorizer is as
4190 follows:
4191 1- vect_analyze_data_refs(loop/bb): call
4192 compute_data_dependences_for_loop/bb to find and analyze all data-refs
4193 in the loop/bb and their dependences.
4194 2- vect_analyze_data_ref_dependences(): apply dependence testing using ddrs.
4195 3- vect_analyze_data_refs_alignment(): check that ref_stmt.alignment is ok.
4196 4- vect_analyze_data_ref_accesses(): check that ref_stmt.step is ok.
4197
4198 */
4199
4200 opt_result
4201 vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf, bool *fatal)
4202 {
4203 class loop *loop = NULL;
4204 unsigned int i;
4205 struct data_reference *dr;
4206 tree scalar_type;
4207
4208 DUMP_VECT_SCOPE ("vect_analyze_data_refs");
4209
4210 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
4211 loop = LOOP_VINFO_LOOP (loop_vinfo);
4212
4213 /* Go through the data-refs, check that the analysis succeeded. Update
4214 pointer from stmt_vec_info struct to DR and vectype. */
4215
4216 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
4217 FOR_EACH_VEC_ELT (datarefs, i, dr)
4218 {
4219 enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
4220 poly_uint64 vf;
4221
4222 gcc_assert (DR_REF (dr));
4223 stmt_vec_info stmt_info = vinfo->lookup_stmt (DR_STMT (dr));
4224 gcc_assert (!stmt_info->dr_aux.dr);
4225 stmt_info->dr_aux.dr = dr;
4226 stmt_info->dr_aux.stmt = stmt_info;
4227
4228 /* Check that analysis of the data-ref succeeded. */
4229 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
4230 || !DR_STEP (dr))
4231 {
4232 bool maybe_gather
4233 = DR_IS_READ (dr)
4234 && !TREE_THIS_VOLATILE (DR_REF (dr))
4235 && (targetm.vectorize.builtin_gather != NULL
4236 || supports_vec_gather_load_p ());
4237 bool maybe_scatter
4238 = DR_IS_WRITE (dr)
4239 && !TREE_THIS_VOLATILE (DR_REF (dr))
4240 && (targetm.vectorize.builtin_scatter != NULL
4241 || supports_vec_scatter_store_p ());
4242
4243 /* If the target supports vector gather loads or scatter stores,
4244 see whether they can be used. */
4245 if (is_a <loop_vec_info> (vinfo)
4246 && !nested_in_vect_loop_p (loop, stmt_info))
4247 {
4248 if (maybe_gather || maybe_scatter)
4249 {
4250 if (maybe_gather)
4251 gatherscatter = GATHER;
4252 else
4253 gatherscatter = SCATTER;
4254 }
4255 }
4256
4257 if (gatherscatter == SG_NONE)
4258 {
4259 if (dump_enabled_p ())
4260 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4261 "not vectorized: data ref analysis "
4262 "failed %G", stmt_info->stmt);
4263 if (is_a <bb_vec_info> (vinfo))
4264 {
4265 /* In BB vectorization the ref can still participate
4266 in dependence analysis, we just can't vectorize it. */
4267 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4268 continue;
4269 }
4270 return opt_result::failure_at (stmt_info->stmt,
4271 "not vectorized:"
4272 " data ref analysis failed: %G",
4273 stmt_info->stmt);
4274 }
4275 }
4276
4277 /* See if this was detected as SIMD lane access. */
4278 if (dr->aux == (void *)-1
4279 || dr->aux == (void *)-2
4280 || dr->aux == (void *)-3
4281 || dr->aux == (void *)-4)
4282 {
4283 if (nested_in_vect_loop_p (loop, stmt_info))
4284 return opt_result::failure_at (stmt_info->stmt,
4285 "not vectorized:"
4286 " data ref analysis failed: %G",
4287 stmt_info->stmt);
4288 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info)
4289 = -(uintptr_t) dr->aux;
4290 }
4291
4292 tree base = get_base_address (DR_REF (dr));
4293 if (base && VAR_P (base) && DECL_NONALIASED (base))
4294 {
4295 if (dump_enabled_p ())
4296 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4297 "not vectorized: base object not addressable "
4298 "for stmt: %G", stmt_info->stmt);
4299 if (is_a <bb_vec_info> (vinfo))
4300 {
4301 /* In BB vectorization the ref can still participate
4302 in dependence analysis, we just can't vectorize it. */
4303 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4304 continue;
4305 }
4306 return opt_result::failure_at (stmt_info->stmt,
4307 "not vectorized: base object not"
4308 " addressable for stmt: %G",
4309 stmt_info->stmt);
4310 }
4311
4312 if (is_a <loop_vec_info> (vinfo)
4313 && DR_STEP (dr)
4314 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
4315 {
4316 if (nested_in_vect_loop_p (loop, stmt_info))
4317 return opt_result::failure_at (stmt_info->stmt,
4318 "not vectorized: "
4319 "not suitable for strided load %G",
4320 stmt_info->stmt);
4321 STMT_VINFO_STRIDED_P (stmt_info) = true;
4322 }
4323
4324 /* Update DR field in stmt_vec_info struct. */
4325
4326 /* If the dataref is in an inner-loop of the loop that is considered
4327 for vectorization, we also want to analyze the access relative to
4328 the outer-loop (DR contains information only relative to the
4329 inner-most enclosing loop). We do that by building a reference to the
4330 first location accessed by the inner-loop, and analyze it relative to
4331 the outer-loop. */
4332 if (loop && nested_in_vect_loop_p (loop, stmt_info))
4333 {
4334 /* Build a reference to the first location accessed by the
4335 inner loop: *(BASE + INIT + OFFSET). By construction,
4336 this address must be invariant in the inner loop, so we
4337 can consider it as being used in the outer loop. */
4338 tree base = unshare_expr (DR_BASE_ADDRESS (dr));
4339 tree offset = unshare_expr (DR_OFFSET (dr));
4340 tree init = unshare_expr (DR_INIT (dr));
4341 tree init_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset),
4342 init, offset);
4343 tree init_addr = fold_build_pointer_plus (base, init_offset);
4344 tree init_ref = build_fold_indirect_ref (init_addr);
4345
4346 if (dump_enabled_p ())
4347 dump_printf_loc (MSG_NOTE, vect_location,
4348 "analyze in outer loop: %T\n", init_ref);
4349
4350 opt_result res
4351 = dr_analyze_innermost (&STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info),
4352 init_ref, loop, stmt_info->stmt);
4353 if (!res)
4354 /* dr_analyze_innermost already explained the failure. */
4355 return res;
4356
4357 if (dump_enabled_p ())
4358 dump_printf_loc (MSG_NOTE, vect_location,
4359 "\touter base_address: %T\n"
4360 "\touter offset from base address: %T\n"
4361 "\touter constant offset from base address: %T\n"
4362 "\touter step: %T\n"
4363 "\touter base alignment: %d\n\n"
4364 "\touter base misalignment: %d\n"
4365 "\touter offset alignment: %d\n"
4366 "\touter step alignment: %d\n",
4367 STMT_VINFO_DR_BASE_ADDRESS (stmt_info),
4368 STMT_VINFO_DR_OFFSET (stmt_info),
4369 STMT_VINFO_DR_INIT (stmt_info),
4370 STMT_VINFO_DR_STEP (stmt_info),
4371 STMT_VINFO_DR_BASE_ALIGNMENT (stmt_info),
4372 STMT_VINFO_DR_BASE_MISALIGNMENT (stmt_info),
4373 STMT_VINFO_DR_OFFSET_ALIGNMENT (stmt_info),
4374 STMT_VINFO_DR_STEP_ALIGNMENT (stmt_info));
4375 }
4376
4377 /* Set vectype for STMT. */
4378 scalar_type = TREE_TYPE (DR_REF (dr));
4379 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type);
4380 if (!vectype)
4381 {
4382 if (dump_enabled_p ())
4383 {
4384 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4385 "not vectorized: no vectype for stmt: %G",
4386 stmt_info->stmt);
4387 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
4388 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
4389 scalar_type);
4390 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
4391 }
4392
4393 if (is_a <bb_vec_info> (vinfo))
4394 {
4395 /* No vector type is fine, the ref can still participate
4396 in dependence analysis, we just can't vectorize it. */
4397 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4398 continue;
4399 }
4400 if (fatal)
4401 *fatal = false;
4402 return opt_result::failure_at (stmt_info->stmt,
4403 "not vectorized:"
4404 " no vectype for stmt: %G"
4405 " scalar_type: %T\n",
4406 stmt_info->stmt, scalar_type);
4407 }
4408 else
4409 {
4410 if (dump_enabled_p ())
4411 dump_printf_loc (MSG_NOTE, vect_location,
4412 "got vectype for stmt: %G%T\n",
4413 stmt_info->stmt, vectype);
4414 }
4415
4416 /* Adjust the minimal vectorization factor according to the
4417 vector type. */
4418 vf = TYPE_VECTOR_SUBPARTS (vectype);
4419 *min_vf = upper_bound (*min_vf, vf);
4420
4421 /* Leave the BB vectorizer to pick the vector type later, based on
4422 the final dataref group size and SLP node size. */
4423 if (is_a <loop_vec_info> (vinfo))
4424 STMT_VINFO_VECTYPE (stmt_info) = vectype;
4425
4426 if (gatherscatter != SG_NONE)
4427 {
4428 gather_scatter_info gs_info;
4429 if (!vect_check_gather_scatter (stmt_info,
4430 as_a <loop_vec_info> (vinfo),
4431 &gs_info)
4432 || !get_vectype_for_scalar_type (vinfo,
4433 TREE_TYPE (gs_info.offset)))
4434 {
4435 if (fatal)
4436 *fatal = false;
4437 return opt_result::failure_at
4438 (stmt_info->stmt,
4439 (gatherscatter == GATHER)
4440 ? "not vectorized: not suitable for gather load %G"
4441 : "not vectorized: not suitable for scatter store %G",
4442 stmt_info->stmt);
4443 }
4444 STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
4445 }
4446 }
4447
4448 /* We used to stop processing and prune the list here. Verify we no
4449 longer need to. */
4450 gcc_assert (i == datarefs.length ());
4451
4452 return opt_result::success ();
4453 }
4454
4455
4456 /* Function vect_get_new_vect_var.
4457
4458 Returns a new variable. The name of the variable is formed from the
4459 prefix "vect", "stmp", "mask" or "vectp" (depending on the value of
4460 VAR_KIND), followed by "_" and NAME if NAME is provided, or just the
4461 prefix otherwise. */
4462
4463 tree
4464 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
4465 {
4466 const char *prefix;
4467 tree new_vect_var;
4468
4469 switch (var_kind)
4470 {
4471 case vect_simple_var:
4472 prefix = "vect";
4473 break;
4474 case vect_scalar_var:
4475 prefix = "stmp";
4476 break;
4477 case vect_mask_var:
4478 prefix = "mask";
4479 break;
4480 case vect_pointer_var:
4481 prefix = "vectp";
4482 break;
4483 default:
4484 gcc_unreachable ();
4485 }
4486
4487 if (name)
4488 {
4489 char* tmp = concat (prefix, "_", name, NULL);
4490 new_vect_var = create_tmp_reg (type, tmp);
4491 free (tmp);
4492 }
4493 else
4494 new_vect_var = create_tmp_reg (type, prefix);
4495
4496 return new_vect_var;
4497 }
4498
4499 /* Like vect_get_new_vect_var but return an SSA name. */
4500
4501 tree
4502 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
4503 {
4504 const char *prefix;
4505 tree new_vect_var;
4506
4507 switch (var_kind)
4508 {
4509 case vect_simple_var:
4510 prefix = "vect";
4511 break;
4512 case vect_scalar_var:
4513 prefix = "stmp";
4514 break;
4515 case vect_pointer_var:
4516 prefix = "vectp";
4517 break;
4518 default:
4519 gcc_unreachable ();
4520 }
4521
4522 if (name)
4523 {
4524 char* tmp = concat (prefix, "_", name, NULL);
4525 new_vect_var = make_temp_ssa_name (type, NULL, tmp);
4526 free (tmp);
4527 }
4528 else
4529 new_vect_var = make_temp_ssa_name (type, NULL, prefix);
4530
4531 return new_vect_var;
4532 }
4533
4534 /* Duplicate ptr info and set alignment/misalignment on NAME from DR_INFO. */
4535
4536 static void
4537 vect_duplicate_ssa_name_ptr_info (tree name, dr_vec_info *dr_info)
4538 {
4539 duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr_info->dr));
4540 int misalign = DR_MISALIGNMENT (dr_info);
4541 if (misalign == DR_MISALIGNMENT_UNKNOWN)
4542 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
4543 else
4544 set_ptr_info_alignment (SSA_NAME_PTR_INFO (name),
4545 known_alignment (DR_TARGET_ALIGNMENT (dr_info)),
4546 misalign);
4547 }
4548
4549 /* Function vect_create_addr_base_for_vector_ref.
4550
4551 Create an expression that computes the address of the first memory location
4552 that will be accessed for a data reference.
4553
4554 Input:
4555 STMT_INFO: The statement containing the data reference.
4556 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
4557 OFFSET: Optional. If supplied, it is added to the initial address.
4558 LOOP: Specify relative to which loop-nest the address should be computed.
4559 For example, when the dataref is in an inner-loop nested in an
4560 outer-loop that is now being vectorized, LOOP can be either the
4561 outer-loop, or the inner-loop. The first memory location accessed
4562 by the following dataref ('in' points to short):
4563
4564 for (i=0; i<N; i++)
4565 for (j=0; j<M; j++)
4566 s += in[i+j]
4567
4568 is as follows:
4569 if LOOP=i_loop: &in (relative to i_loop)
4570 if LOOP=j_loop: &in+i*2B (relative to j_loop)
4571 BYTE_OFFSET: Optional, defaulted to NULL. If supplied, it is added to the
4572 initial address. Unlike OFFSET, which is the number of elements to
4573 be added, BYTE_OFFSET is measured in bytes.
4574
4575 Output:
4576 1. Return an SSA_NAME whose value is the address of the memory location of
4577 the first vector of the data reference.
4578 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
4579 these statement(s) which define the returned SSA_NAME.
4580
4581 FORNOW: We are only handling array accesses with step 1. */
4582
4583 tree
4584 vect_create_addr_base_for_vector_ref (vec_info *vinfo, stmt_vec_info stmt_info,
4585 gimple_seq *new_stmt_list,
4586 tree offset,
4587 tree byte_offset)
4588 {
4589 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
4590 struct data_reference *dr = dr_info->dr;
4591 const char *base_name;
4592 tree addr_base;
4593 tree dest;
4594 gimple_seq seq = NULL;
4595 tree vect_ptr_type;
4596 tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
4597 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
4598 innermost_loop_behavior *drb = vect_dr_behavior (vinfo, dr_info);
4599
4600 tree data_ref_base = unshare_expr (drb->base_address);
4601 tree base_offset = unshare_expr (get_dr_vinfo_offset (vinfo, dr_info, true));
4602 tree init = unshare_expr (drb->init);
4603
4604 if (loop_vinfo)
4605 base_name = get_name (data_ref_base);
4606 else
4607 {
4608 base_offset = ssize_int (0);
4609 init = ssize_int (0);
4610 base_name = get_name (DR_REF (dr));
4611 }
4612
4613 /* Create base_offset.  */
4614 base_offset = size_binop (PLUS_EXPR,
4615 fold_convert (sizetype, base_offset),
4616 fold_convert (sizetype, init));
4617
4618 if (offset)
4619 {
4620 offset = fold_build2 (MULT_EXPR, sizetype,
4621 fold_convert (sizetype, offset), step);
4622 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4623 base_offset, offset);
4624 }
4625 if (byte_offset)
4626 {
4627 byte_offset = fold_convert (sizetype, byte_offset);
4628 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4629 base_offset, byte_offset);
4630 }
4631
4632 /* base + base_offset */
4633 if (loop_vinfo)
4634 addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
4635 else
4636 {
4637 addr_base = build1 (ADDR_EXPR,
4638 build_pointer_type (TREE_TYPE (DR_REF (dr))),
4639 unshare_expr (DR_REF (dr)));
4640 }
4641
4642 vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
4643 dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
4644 addr_base = force_gimple_operand (addr_base, &seq, true, dest);
4645 gimple_seq_add_seq (new_stmt_list, seq);
4646
4647 if (DR_PTR_INFO (dr)
4648 && TREE_CODE (addr_base) == SSA_NAME
4649 && !SSA_NAME_PTR_INFO (addr_base))
4650 {
4651 vect_duplicate_ssa_name_ptr_info (addr_base, dr_info);
4652 if (offset || byte_offset)
4653 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
4654 }
4655
4656 if (dump_enabled_p ())
4657 dump_printf_loc (MSG_NOTE, vect_location, "created %T\n", addr_base);
4658
4659 return addr_base;
4660 }
4661
4662
4663 /* Function vect_create_data_ref_ptr.
4664
4665 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
4666 location accessed in the loop by STMT_INFO, along with the def-use update
4667 chain to appropriately advance the pointer through the loop iterations.
4668 Also set aliasing information for the pointer. This pointer is used by
4669 the callers to this function to create a memory reference expression for
4670 vector load/store access.
4671
4672 Input:
4673 1. STMT_INFO: a stmt that references memory. Expected to be of the form
4674 GIMPLE_ASSIGN <name, data-ref> or
4675 GIMPLE_ASSIGN <data-ref, name>.
4676 2. AGGR_TYPE: the type of the reference, which should be either a vector
4677 or an array.
4678 3. AT_LOOP: the loop where the vector memref is to be created.
4679 4. OFFSET (optional): an offset to be added to the initial address accessed
4680 by the data-ref in STMT_INFO.
4681 5. BSI: location where the new stmts are to be placed if there is no loop
4682 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
4683 pointing to the initial address.
4684 7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
4685 to the initial address accessed by the data-ref in STMT_INFO. This is
4686 similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
4687 in bytes.
4688 8. IV_STEP (optional, defaults to NULL): the amount that should be added
4689 to the IV during each iteration of the loop. NULL says to move
4690 by one copy of AGGR_TYPE up or down, depending on the step of the
4691 data reference.
4692
4693 Output:
4694 1. Declare a new ptr to vector_type, and have it point to the base of the
4695 data reference (the initial address accessed by the data reference).
4696 For example, for vector of type V8HI, the following code is generated:
4697
4698 v8hi *ap;
4699 ap = (v8hi *)initial_address;
4700
4701 if OFFSET is not supplied:
4702 initial_address = &a[init];
4703 if OFFSET is supplied:
4704 initial_address = &a[init + OFFSET];
4705 if BYTE_OFFSET is supplied:
4706 initial_address = &a[init] + BYTE_OFFSET;
4707
4708 Return the initial_address in INITIAL_ADDRESS.
4709
4710 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
4711 update the pointer in each iteration of the loop.
4712
4713 Return the increment stmt that updates the pointer in PTR_INCR.
4714
4715 3. Return the pointer. */
4716
4717 tree
4718 vect_create_data_ref_ptr (vec_info *vinfo, stmt_vec_info stmt_info,
4719 tree aggr_type, class loop *at_loop, tree offset,
4720 tree *initial_address, gimple_stmt_iterator *gsi,
4721 gimple **ptr_incr, bool only_init,
4722 tree byte_offset, tree iv_step)
4723 {
4724 const char *base_name;
4725 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
4726 class loop *loop = NULL;
4727 bool nested_in_vect_loop = false;
4728 class loop *containing_loop = NULL;
4729 tree aggr_ptr_type;
4730 tree aggr_ptr;
4731 tree new_temp;
4732 gimple_seq new_stmt_list = NULL;
4733 edge pe = NULL;
4734 basic_block new_bb;
4735 tree aggr_ptr_init;
4736 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
4737 struct data_reference *dr = dr_info->dr;
4738 tree aptr;
4739 gimple_stmt_iterator incr_gsi;
4740 bool insert_after;
4741 tree indx_before_incr, indx_after_incr;
4742 gimple *incr;
4743 bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo);
4744
4745 gcc_assert (iv_step != NULL_TREE
4746 || TREE_CODE (aggr_type) == ARRAY_TYPE
4747 || TREE_CODE (aggr_type) == VECTOR_TYPE);
4748
4749 if (loop_vinfo)
4750 {
4751 loop = LOOP_VINFO_LOOP (loop_vinfo);
4752 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info);
4753 containing_loop = (gimple_bb (stmt_info->stmt))->loop_father;
4754 pe = loop_preheader_edge (loop);
4755 }
4756 else
4757 {
4758 gcc_assert (bb_vinfo);
4759 only_init = true;
4760 *ptr_incr = NULL;
4761 }
4762
4763 /* Create an expression for the first address accessed by this load
4764 in LOOP. */
4765 base_name = get_name (DR_BASE_ADDRESS (dr));
4766
4767 if (dump_enabled_p ())
4768 {
4769 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4770 dump_printf_loc (MSG_NOTE, vect_location,
4771 "create %s-pointer variable to type: %T",
4772 get_tree_code_name (TREE_CODE (aggr_type)),
4773 aggr_type);
4774 if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4775 dump_printf (MSG_NOTE, " vectorizing an array ref: ");
4776 else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4777 dump_printf (MSG_NOTE, " vectorizing a vector ref: ");
4778 else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4779 dump_printf (MSG_NOTE, " vectorizing a record based array ref: ");
4780 else
4781 dump_printf (MSG_NOTE, " vectorizing a pointer ref: ");
4782 dump_printf (MSG_NOTE, "%T\n", DR_BASE_OBJECT (dr));
4783 }
4784
4785 /* (1) Create the new aggregate-pointer variable.
4786 Vector and array types inherit the alias set of their component
4787 type by default, so we need to use a ref-all pointer if the alias
4788 set of the data reference does not conflict with that of the created
4789 aggregate type (e.g. because the reference is not addressable). */
4790 bool need_ref_all = false;
4791 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4792 get_alias_set (DR_REF (dr))))
4793 need_ref_all = true;
4794 /* Likewise for any of the data references in the stmt group. */
4795 else if (DR_GROUP_SIZE (stmt_info) > 1)
4796 {
4797 stmt_vec_info sinfo = DR_GROUP_FIRST_ELEMENT (stmt_info);
4798 do
4799 {
4800 struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
4801 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4802 get_alias_set (DR_REF (sdr))))
4803 {
4804 need_ref_all = true;
4805 break;
4806 }
4807 sinfo = DR_GROUP_NEXT_ELEMENT (sinfo);
4808 }
4809 while (sinfo);
4810 }
4811 aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
4812 need_ref_all);
4813 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
4814
4815
4816 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4817 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4818 def-use update cycles for the pointer: one relative to the outer-loop
4819 (LOOP), which is what steps (3) and (4) below do. The other is relative
4820 to the inner-loop (which is the inner-most loop containing the dataref),
4821 and this is done by step (5) below.
4822
4823 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4824 inner-most loop, and so steps (3),(4) work the same, and step (5) is
4825 redundant. Steps (3),(4) create the following:
4826
4827 vp0 = &base_addr;
4828 LOOP: vp1 = phi(vp0,vp2)
4829 ...
4830 ...
4831 vp2 = vp1 + step
4832 goto LOOP
4833
4834 If there is an inner-loop nested in loop, then step (5) will also be
4835 applied, and an additional update in the inner-loop will be created:
4836
4837 vp0 = &base_addr;
4838 LOOP: vp1 = phi(vp0,vp2)
4839 ...
4840 inner: vp3 = phi(vp1,vp4)
4841 vp4 = vp3 + inner_step
4842 if () goto inner
4843 ...
4844 vp2 = vp1 + step
4845 if () goto LOOP */
4846
4847 /* (2) Calculate the initial address of the aggregate-pointer, and set
4848 the aggregate-pointer to point to it before the loop. */
4849
4850 /* Create: (&(base[init_val+offset]+byte_offset) in the loop preheader. */
4851
4852 new_temp = vect_create_addr_base_for_vector_ref (vinfo,
4853 stmt_info, &new_stmt_list,
4854 offset, byte_offset);
4855 if (new_stmt_list)
4856 {
4857 if (pe)
4858 {
4859 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
4860 gcc_assert (!new_bb);
4861 }
4862 else
4863 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
4864 }
4865
4866 *initial_address = new_temp;
4867 aggr_ptr_init = new_temp;
4868
4869 /* (3) Handle the updating of the aggregate-pointer inside the loop.
4870 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4871 inner-loop nested in LOOP (during outer-loop vectorization). */
4872
4873 /* No update in loop is required. */
4874 if (only_init && (!loop_vinfo || at_loop == loop))
4875 aptr = aggr_ptr_init;
4876 else
4877 {
4878 /* Accesses to invariant addresses should be handled specially
4879 by the caller. */
4880 tree step = vect_dr_behavior (vinfo, dr_info)->step;
4881 gcc_assert (!integer_zerop (step));
4882
4883 if (iv_step == NULL_TREE)
4884 {
4885 /* The step of the aggregate pointer is the type size,
4886 negated for downward accesses. */
4887 iv_step = TYPE_SIZE_UNIT (aggr_type);
4888 if (tree_int_cst_sgn (step) == -1)
4889 iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
4890 }
4891
4892 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4893
4894 create_iv (aggr_ptr_init,
4895 fold_convert (aggr_ptr_type, iv_step),
4896 aggr_ptr, loop, &incr_gsi, insert_after,
4897 &indx_before_incr, &indx_after_incr);
4898 incr = gsi_stmt (incr_gsi);
4899 loop_vinfo->add_stmt (incr);
4900
4901 /* Copy the points-to information if it exists. */
4902 if (DR_PTR_INFO (dr))
4903 {
4904 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr_info);
4905 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr_info);
4906 }
4907 if (ptr_incr)
4908 *ptr_incr = incr;
4909
4910 aptr = indx_before_incr;
4911 }
4912
4913 if (!nested_in_vect_loop || only_init)
4914 return aptr;
4915
4916
4917 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4918 nested in LOOP, if it exists. */
4919
4920 gcc_assert (nested_in_vect_loop);
4921 if (!only_init)
4922 {
4923 standard_iv_increment_position (containing_loop, &incr_gsi,
4924 &insert_after);
4925 create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
4926 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
4927 &indx_after_incr);
4928 incr = gsi_stmt (incr_gsi);
4929 loop_vinfo->add_stmt (incr);
4930
4931 /* Copy the points-to information if it exists. */
4932 if (DR_PTR_INFO (dr))
4933 {
4934 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr_info);
4935 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr_info);
4936 }
4937 if (ptr_incr)
4938 *ptr_incr = incr;
4939
4940 return indx_before_incr;
4941 }
4942 else
4943 gcc_unreachable ();
4944 }
4945
4946
4947 /* Function bump_vector_ptr
4948
4949 Increment a pointer (to a vector type) by vector-size. If requested,
4950 i.e. if PTR_INCR is given, then also connect the new increment stmt
4951 to the existing def-use update-chain of the pointer, by modifying
4952 the PTR_INCR as illustrated below:
4953
4954 The pointer def-use update-chain before this function:
4955 DATAREF_PTR = phi (p_0, p_2)
4956 ....
4957 PTR_INCR: p_2 = DATAREF_PTR + step
4958
4959 The pointer def-use update-chain after this function:
4960 DATAREF_PTR = phi (p_0, p_2)
4961 ....
4962 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4963 ....
4964 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
4965
4966 Input:
4967 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4968 in the loop.
4969 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4970 the loop. The increment amount across iterations is expected
4971 to be vector_size.
4972 BSI - location where the new update stmt is to be placed.
4973 STMT_INFO - the original scalar memory-access stmt that is being vectorized.
4974 BUMP - optional. The offset by which to bump the pointer. If not given,
4975 the offset is assumed to be vector_size.
4976
4977 Output: Return NEW_DATAREF_PTR as illustrated above.
4978
4979 */
4980
4981 tree
4982 bump_vector_ptr (vec_info *vinfo,
4983 tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
4984 stmt_vec_info stmt_info, tree bump)
4985 {
4986 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4987 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4988 tree update = TYPE_SIZE_UNIT (vectype);
4989 gassign *incr_stmt;
4990 ssa_op_iter iter;
4991 use_operand_p use_p;
4992 tree new_dataref_ptr;
4993
4994 if (bump)
4995 update = bump;
4996
4997 if (TREE_CODE (dataref_ptr) == SSA_NAME)
4998 new_dataref_ptr = copy_ssa_name (dataref_ptr);
4999 else
5000 new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
5001 incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
5002 dataref_ptr, update);
5003 vect_finish_stmt_generation (vinfo, stmt_info, incr_stmt, gsi);
5004
5005 /* Copy the points-to information if it exists. */
5006 if (DR_PTR_INFO (dr))
5007 {
5008 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
5009 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
5010 }
5011
5012 if (!ptr_incr)
5013 return new_dataref_ptr;
5014
5015 /* Update the vector-pointer's cross-iteration increment. */
5016 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
5017 {
5018 tree use = USE_FROM_PTR (use_p);
5019
5020 if (use == dataref_ptr)
5021 SET_USE (use_p, new_dataref_ptr);
5022 else
5023 gcc_assert (operand_equal_p (use, update, 0));
5024 }
5025
5026 return new_dataref_ptr;
5027 }
5028
5029
5030 /* Copy memory reference info such as base/clique from the SRC reference
5031 to the DEST MEM_REF. */
5032
5033 void
5034 vect_copy_ref_info (tree dest, tree src)
5035 {
5036 if (TREE_CODE (dest) != MEM_REF)
5037 return;
5038
5039 tree src_base = src;
5040 while (handled_component_p (src_base))
5041 src_base = TREE_OPERAND (src_base, 0);
5042 if (TREE_CODE (src_base) != MEM_REF
5043 && TREE_CODE (src_base) != TARGET_MEM_REF)
5044 return;
5045
5046 MR_DEPENDENCE_CLIQUE (dest) = MR_DEPENDENCE_CLIQUE (src_base);
5047 MR_DEPENDENCE_BASE (dest) = MR_DEPENDENCE_BASE (src_base);
5048 }
5049
5050
5051 /* Function vect_create_destination_var.
5052
5053 Create a new temporary of type VECTYPE. */
5054
5055 tree
5056 vect_create_destination_var (tree scalar_dest, tree vectype)
5057 {
5058 tree vec_dest;
5059 const char *name;
5060 char *new_name;
5061 tree type;
5062 enum vect_var_kind kind;
5063
5064 kind = vectype
5065 ? VECTOR_BOOLEAN_TYPE_P (vectype)
5066 ? vect_mask_var
5067 : vect_simple_var
5068 : vect_scalar_var;
5069 type = vectype ? vectype : TREE_TYPE (scalar_dest);
5070
5071 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
5072
5073 name = get_name (scalar_dest);
5074 if (name)
5075 new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
5076 else
5077 new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
5078 vec_dest = vect_get_new_vect_var (type, kind, new_name);
5079 free (new_name);
5080
5081 return vec_dest;
5082 }
5083
5084 /* Function vect_grouped_store_supported.
5085
5086 Returns TRUE if interleave high and interleave low permutations
5087 are supported, and FALSE otherwise. */
5088
5089 bool
5090 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
5091 {
5092 machine_mode mode = TYPE_MODE (vectype);
5093
5094 /* vect_permute_store_chain requires the group size to be equal to 3 or
5095 be a power of two. */
5096 if (count != 3 && exact_log2 (count) == -1)
5097 {
5098 if (dump_enabled_p ())
5099 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5100 "the size of the group of accesses"
5101 " is not a power of 2 or not eqaul to 3\n");
5102 return false;
5103 }
5104
5105 /* Check that the permutation is supported. */
5106 if (VECTOR_MODE_P (mode))
5107 {
5108 unsigned int i;
5109 if (count == 3)
5110 {
5111 unsigned int j0 = 0, j1 = 0, j2 = 0;
5112 unsigned int i, j;
5113
5114 unsigned int nelt;
5115 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5116 {
5117 if (dump_enabled_p ())
5118 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5119 "cannot handle groups of 3 stores for"
5120 " variable-length vectors\n");
5121 return false;
5122 }
5123
5124 vec_perm_builder sel (nelt, nelt, 1);
5125 sel.quick_grow (nelt);
5126 vec_perm_indices indices;
5127 for (j = 0; j < 3; j++)
5128 {
5129 int nelt0 = ((3 - j) * nelt) % 3;
5130 int nelt1 = ((3 - j) * nelt + 1) % 3;
5131 int nelt2 = ((3 - j) * nelt + 2) % 3;
5132 for (i = 0; i < nelt; i++)
5133 {
5134 if (3 * i + nelt0 < nelt)
5135 sel[3 * i + nelt0] = j0++;
5136 if (3 * i + nelt1 < nelt)
5137 sel[3 * i + nelt1] = nelt + j1++;
5138 if (3 * i + nelt2 < nelt)
5139 sel[3 * i + nelt2] = 0;
5140 }
5141 indices.new_vector (sel, 2, nelt);
5142 if (!can_vec_perm_const_p (mode, indices))
5143 {
5144 if (dump_enabled_p ())
5145 dump_printf (MSG_MISSED_OPTIMIZATION,
5146 "permutation op not supported by target.\n");
5147 return false;
5148 }
5149
5150 for (i = 0; i < nelt; i++)
5151 {
5152 if (3 * i + nelt0 < nelt)
5153 sel[3 * i + nelt0] = 3 * i + nelt0;
5154 if (3 * i + nelt1 < nelt)
5155 sel[3 * i + nelt1] = 3 * i + nelt1;
5156 if (3 * i + nelt2 < nelt)
5157 sel[3 * i + nelt2] = nelt + j2++;
5158 }
5159 indices.new_vector (sel, 2, nelt);
5160 if (!can_vec_perm_const_p (mode, indices))
5161 {
5162 if (dump_enabled_p ())
5163 dump_printf (MSG_MISSED_OPTIMIZATION,
5164 "permutation op not supported by target.\n");
5165 return false;
5166 }
5167 }
5168 return true;
5169 }
5170 else
5171 {
5172 /* If length is not equal to 3 then only power of 2 is supported. */
5173 gcc_assert (pow2p_hwi (count));
5174 poly_uint64 nelt = GET_MODE_NUNITS (mode);
5175
5176 /* The encoding has 2 interleaved stepped patterns. */
5177 vec_perm_builder sel (nelt, 2, 3);
5178 sel.quick_grow (6);
5179 for (i = 0; i < 3; i++)
5180 {
5181 sel[i * 2] = i;
5182 sel[i * 2 + 1] = i + nelt;
5183 }
5184 vec_perm_indices indices (sel, 2, nelt);
5185 if (can_vec_perm_const_p (mode, indices))
5186 {
5187 for (i = 0; i < 6; i++)
5188 sel[i] += exact_div (nelt, 2);
5189 indices.new_vector (sel, 2, nelt);
5190 if (can_vec_perm_const_p (mode, indices))
5191 return true;
5192 }
5193 }
5194 }
5195
5196 if (dump_enabled_p ())
5197 dump_printf (MSG_MISSED_OPTIMIZATION,
5198 "permutation op not supported by target.\n");
5199 return false;
5200 }
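
/* Illustrative sketch only, in plain C rather than GCC internals: the
   power-of-two path above encodes the interleave-high selector with two
   stepped patterns of three elements each ({0, nelt, 1, nelt + 1, 2,
   nelt + 2}); conceptually vec_perm_indices extends those patterns into
   the full selector built below.  Adding NELT / 2 to every element then
   yields the interleave-low selector that is checked second.  */

static void
model_interleave_high_sel (unsigned int nelt, unsigned int *sel)
{
  /* Even positions step through the first vector, odd positions through
     the second; both patterns advance by one element per pair.  */
  for (unsigned int i = 0; i < nelt / 2; i++)
    {
      sel[2 * i] = i;
      sel[2 * i + 1] = nelt + i;
    }
}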
5201
5202
5203 /* Return TRUE if vec_{mask_}store_lanes is available for COUNT vectors of
5204 type VECTYPE. MASKED_P says whether the masked form is needed. */
5205
5206 bool
5207 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5208 bool masked_p)
5209 {
5210 if (masked_p)
5211 return vect_lanes_optab_supported_p ("vec_mask_store_lanes",
5212 vec_mask_store_lanes_optab,
5213 vectype, count);
5214 else
5215 return vect_lanes_optab_supported_p ("vec_store_lanes",
5216 vec_store_lanes_optab,
5217 vectype, count);
5218 }
5219
5220
5221 /* Function vect_permute_store_chain.
5222
5223 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
5224 a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
5225 the data correctly for the stores. Return the final references for stores
5226 in RESULT_CHAIN.
5227
5228 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5229 The input is 4 vectors each containing 8 elements. We assign a number to
5230 each element, the input sequence is:
5231
5232 1st vec: 0 1 2 3 4 5 6 7
5233 2nd vec: 8 9 10 11 12 13 14 15
5234 3rd vec: 16 17 18 19 20 21 22 23
5235 4th vec: 24 25 26 27 28 29 30 31
5236
5237 The output sequence should be:
5238
5239 1st vec: 0 8 16 24 1 9 17 25
5240 2nd vec: 2 10 18 26 3 11 19 27
5241 3rd vec: 4 12 20 28 5 13 21 29
5242 4th vec: 6 14 22 30 7 15 23 31
5243
5244 i.e., we interleave the contents of the four vectors in their order.
5245
5246 We use interleave_high/low instructions to create such output. The input of
5247 each interleave_high/low operation is two vectors:
5248 1st vec 2nd vec
5249 0 1 2 3 4 5 6 7
5250 the even elements of the result vector are obtained left-to-right from the
5251 high/low elements of the first vector. The odd elements of the result are
5252 obtained left-to-right from the high/low elements of the second vector.
5253 The output of interleave_high will be: 0 4 1 5
5254 and of interleave_low: 2 6 3 7
5255
5256
5257 The permutation is done in log LENGTH stages. In each stage interleave_high
5258 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
5259 where the first argument is taken from the first half of DR_CHAIN and the
5260 second argument from its second half.
5261 In our example,
5262
5263 I1: interleave_high (1st vec, 3rd vec)
5264 I2: interleave_low (1st vec, 3rd vec)
5265 I3: interleave_high (2nd vec, 4th vec)
5266 I4: interleave_low (2nd vec, 4th vec)
5267
5268 The output for the first stage is:
5269
5270 I1: 0 16 1 17 2 18 3 19
5271 I2: 4 20 5 21 6 22 7 23
5272 I3: 8 24 9 25 10 26 11 27
5273 I4: 12 28 13 29 14 30 15 31
5274
5275 The output of the second stage, i.e. the final result is:
5276
5277 I1: 0 8 16 24 1 9 17 25
5278 I2: 2 10 18 26 3 11 19 27
5279 I3: 4 12 20 28 5 13 21 29
5280 I4: 6 14 22 30 7 15 23 31. */
5281
5282 void
5283 vect_permute_store_chain (vec_info *vinfo, vec<tree> dr_chain,
5284 unsigned int length,
5285 stmt_vec_info stmt_info,
5286 gimple_stmt_iterator *gsi,
5287 vec<tree> *result_chain)
5288 {
5289 tree vect1, vect2, high, low;
5290 gimple *perm_stmt;
5291 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5292 tree perm_mask_low, perm_mask_high;
5293 tree data_ref;
5294 tree perm3_mask_low, perm3_mask_high;
5295 unsigned int i, j, n, log_length = exact_log2 (length);
5296
5297 result_chain->quick_grow (length);
5298 memcpy (result_chain->address (), dr_chain.address (),
5299 length * sizeof (tree));
5300
5301 if (length == 3)
5302 {
5303 /* vect_grouped_store_supported ensures that this is constant. */
5304 unsigned int nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5305 unsigned int j0 = 0, j1 = 0, j2 = 0;
5306
5307 vec_perm_builder sel (nelt, nelt, 1);
5308 sel.quick_grow (nelt);
5309 vec_perm_indices indices;
5310 for (j = 0; j < 3; j++)
5311 {
5312 int nelt0 = ((3 - j) * nelt) % 3;
5313 int nelt1 = ((3 - j) * nelt + 1) % 3;
5314 int nelt2 = ((3 - j) * nelt + 2) % 3;
5315
5316 for (i = 0; i < nelt; i++)
5317 {
5318 if (3 * i + nelt0 < nelt)
5319 sel[3 * i + nelt0] = j0++;
5320 if (3 * i + nelt1 < nelt)
5321 sel[3 * i + nelt1] = nelt + j1++;
5322 if (3 * i + nelt2 < nelt)
5323 sel[3 * i + nelt2] = 0;
5324 }
5325 indices.new_vector (sel, 2, nelt);
5326 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5327
5328 for (i = 0; i < nelt; i++)
5329 {
5330 if (3 * i + nelt0 < nelt)
5331 sel[3 * i + nelt0] = 3 * i + nelt0;
5332 if (3 * i + nelt1 < nelt)
5333 sel[3 * i + nelt1] = 3 * i + nelt1;
5334 if (3 * i + nelt2 < nelt)
5335 sel[3 * i + nelt2] = nelt + j2++;
5336 }
5337 indices.new_vector (sel, 2, nelt);
5338 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5339
5340 vect1 = dr_chain[0];
5341 vect2 = dr_chain[1];
5342
5343 /* Create interleaving stmt:
5344 low = VEC_PERM_EXPR <vect1, vect2,
5345 {j, nelt, *, j + 1, nelt + j + 1, *,
5346 j + 2, nelt + j + 2, *, ...}> */
5347 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5348 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5349 vect2, perm3_mask_low);
5350 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5351
5352 vect1 = data_ref;
5353 vect2 = dr_chain[2];
5354 /* Create interleaving stmt:
5355 low = VEC_PERM_EXPR <vect1, vect2,
5356 {0, 1, nelt + j, 3, 4, nelt + j + 1,
5357 6, 7, nelt + j + 2, ...}> */
5358 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5359 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5360 vect2, perm3_mask_high);
5361 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5362 (*result_chain)[j] = data_ref;
5363 }
5364 }
5365 else
5366 {
5367 /* If length is not equal to 3 then only power of 2 is supported. */
5368 gcc_assert (pow2p_hwi (length));
5369
5370 /* The encoding has 2 interleaved stepped patterns. */
5371 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5372 vec_perm_builder sel (nelt, 2, 3);
5373 sel.quick_grow (6);
5374 for (i = 0; i < 3; i++)
5375 {
5376 sel[i * 2] = i;
5377 sel[i * 2 + 1] = i + nelt;
5378 }
5379 vec_perm_indices indices (sel, 2, nelt);
5380 perm_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5381
5382 for (i = 0; i < 6; i++)
5383 sel[i] += exact_div (nelt, 2);
5384 indices.new_vector (sel, 2, nelt);
5385 perm_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5386
5387 for (i = 0, n = log_length; i < n; i++)
5388 {
5389 for (j = 0; j < length/2; j++)
5390 {
5391 vect1 = dr_chain[j];
5392 vect2 = dr_chain[j+length/2];
5393
5394 /* Create interleaving stmt:
5395 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
5396 ...}> */
5397 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
5398 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
5399 vect2, perm_mask_high);
5400 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5401 (*result_chain)[2*j] = high;
5402
5403 /* Create interleaving stmt:
5404 low = VEC_PERM_EXPR <vect1, vect2,
5405 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
5406 ...}> */
5407 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
5408 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
5409 vect2, perm_mask_low);
5410 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5411 (*result_chain)[2*j+1] = low;
5412 }
5413 memcpy (dr_chain.address (), result_chain->address (),
5414 length * sizeof (tree));
5415 }
5416 }
5417 }
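
/* Plain-C reference model of the power-of-two path above (illustration
   only, not used by GCC): apply the same log2 (LENGTH) interleave
   high/low passes to arrays of element indices.  For LENGTH == 4 and
   NELT == 8 it reproduces the output documented in the comment:
   0 8 16 24 1 9 17 25 / 2 10 18 26 3 11 19 27 / 4 12 20 28 5 13 21 29 /
   6 14 22 30 7 15 23 31.  The fixed buffer bound is an assumption made
   only to keep the sketch self-contained.  */

static void
model_permute_store_chain (unsigned int *chain, unsigned int length,
			   unsigned int nelt)
{
  unsigned int result[256];	/* assumes length * nelt <= 256 */

  for (unsigned int stage = 1; stage < length; stage *= 2)
    {
      for (unsigned int j = 0; j < length / 2; j++)
	{
	  unsigned int *v1 = chain + j * nelt;
	  unsigned int *v2 = chain + (j + length / 2) * nelt;
	  unsigned int *high = result + 2 * j * nelt;
	  unsigned int *low = result + (2 * j + 1) * nelt;
	  for (unsigned int i = 0; i < nelt / 2; i++)
	    {
	      high[2 * i] = v1[i];		 /* {0, nelt, 1, nelt+1, ...}  */
	      high[2 * i + 1] = v2[i];
	      low[2 * i] = v1[nelt / 2 + i];	 /* {nelt/2, nelt*3/2, ...}    */
	      low[2 * i + 1] = v2[nelt / 2 + i];
	    }
	}
      memcpy (chain, result, length * nelt * sizeof (unsigned int));
    }
}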
5418
5419 /* Function vect_setup_realignment
5420
5421 This function is called when vectorizing an unaligned load using
5422 the dr_explicit_realign[_optimized] scheme.
5423 This function generates the following code at the loop prolog:
5424
5425 p = initial_addr;
5426 x msq_init = *(floor(p)); # prolog load
5427 realignment_token = call target_builtin;
5428 loop:
5429 x msq = phi (msq_init, ---)
5430
5431 The stmts marked with x are generated only for the case of
5432 dr_explicit_realign_optimized.
5433
5434 The code above sets up a new (vector) pointer, pointing to the first
5435 location accessed by STMT_INFO, and a "floor-aligned" load using that
5436 pointer. It also generates code to compute the "realignment-token"
5437 (if the relevant target hook was defined), and creates a phi-node at the
5438 loop-header bb whose arguments are the result of the prolog-load (created
5439 by this function) and the result of a load that takes place in the loop
5440 (to be created by the caller to this function).
5441
5442 For the case of dr_explicit_realign_optimized:
5443 The caller to this function uses the phi-result (msq) to create the
5444 realignment code inside the loop, and sets up the missing phi argument,
5445 as follows:
5446 loop:
5447 msq = phi (msq_init, lsq)
5448 lsq = *(floor(p')); # load in loop
5449 result = realign_load (msq, lsq, realignment_token);
5450
5451 For the case of dr_explicit_realign:
5452 loop:
5453 msq = *(floor(p)); # load in loop
5454 p' = p + (VS-1);
5455 lsq = *(floor(p')); # load in loop
5456 result = realign_load (msq, lsq, realignment_token);
5457
5458 Input:
5459 STMT_INFO - (scalar) load stmt to be vectorized. This load accesses
5460 a memory location that may be unaligned.
5461 BSI - place where new code is to be inserted.
5462 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
5463 is used.
5464
5465 Output:
5466 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
5467 target hook, if defined.
5468 Return value - the result of the loop-header phi node. */
5469
5470 tree
5471 vect_setup_realignment (vec_info *vinfo, stmt_vec_info stmt_info,
5472 gimple_stmt_iterator *gsi, tree *realignment_token,
5473 enum dr_alignment_support alignment_support_scheme,
5474 tree init_addr,
5475 class loop **at_loop)
5476 {
5477 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5478 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
5479 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
5480 struct data_reference *dr = dr_info->dr;
5481 class loop *loop = NULL;
5482 edge pe = NULL;
5483 tree scalar_dest = gimple_assign_lhs (stmt_info->stmt);
5484 tree vec_dest;
5485 gimple *inc;
5486 tree ptr;
5487 tree data_ref;
5488 basic_block new_bb;
5489 tree msq_init = NULL_TREE;
5490 tree new_temp;
5491 gphi *phi_stmt;
5492 tree msq = NULL_TREE;
5493 gimple_seq stmts = NULL;
5494 bool compute_in_loop = false;
5495 bool nested_in_vect_loop = false;
5496 class loop *containing_loop = (gimple_bb (stmt_info->stmt))->loop_father;
5497 class loop *loop_for_initial_load = NULL;
5498
5499 if (loop_vinfo)
5500 {
5501 loop = LOOP_VINFO_LOOP (loop_vinfo);
5502 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info);
5503 }
5504
5505 gcc_assert (alignment_support_scheme == dr_explicit_realign
5506 || alignment_support_scheme == dr_explicit_realign_optimized);
5507
5508 /* We need to generate three things:
5509 1. the misalignment computation
5510 2. the extra vector load (for the optimized realignment scheme).
5511 3. the phi node for the two vectors from which the realignment is
5512 done (for the optimized realignment scheme). */
5513
5514 /* 1. Determine where to generate the misalignment computation.
5515
5516 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
5517 calculation will be generated by this function, outside the loop (in the
5518 preheader). Otherwise, INIT_ADDR had already been computed for us by the
5519 caller, inside the loop.
5520
5521 Background: If the misalignment remains fixed throughout the iterations of
5522 the loop, then both realignment schemes are applicable, and also the
5523 misalignment computation can be done outside LOOP. This is because we are
5524 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
5525 are a multiple of VS (the Vector Size), and therefore the misalignment in
5526 different vectorized LOOP iterations is always the same.
5527 The problem arises only if the memory access is in an inner-loop nested
5528 inside LOOP, which is now being vectorized using outer-loop vectorization.
5529 This is the only case when the misalignment of the memory access may not
5530 remain fixed throughout the iterations of the inner-loop (as explained in
5531 detail in vect_supportable_dr_alignment). In this case, not only is the
5532 optimized realignment scheme not applicable, but also the misalignment
5533 computation (and generation of the realignment token that is passed to
5534 REALIGN_LOAD) have to be done inside the loop.
5535
5536 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
5537 or not, which in turn determines if the misalignment is computed inside
5538 the inner-loop, or outside LOOP. */
5539
5540 if (init_addr != NULL_TREE || !loop_vinfo)
5541 {
5542 compute_in_loop = true;
5543 gcc_assert (alignment_support_scheme == dr_explicit_realign);
5544 }
5545
5546
5547 /* 2. Determine where to generate the extra vector load.
5548
5549 For the optimized realignment scheme, instead of generating two vector
5550 loads in each iteration, we generate a single extra vector load in the
5551 preheader of the loop, and in each iteration reuse the result of the
5552 vector load from the previous iteration. In case the memory access is in
5553 an inner-loop nested inside LOOP, which is now being vectorized using
5554 outer-loop vectorization, we need to determine whether this initial vector
5555 load should be generated at the preheader of the inner-loop, or can be
5556 generated at the preheader of LOOP. If the memory access has no evolution
5557 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
5558 to be generated inside LOOP (in the preheader of the inner-loop). */
5559
5560 if (nested_in_vect_loop)
5561 {
5562 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
5563 bool invariant_in_outerloop =
5564 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
5565 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
5566 }
5567 else
5568 loop_for_initial_load = loop;
5569 if (at_loop)
5570 *at_loop = loop_for_initial_load;
5571
5572 if (loop_for_initial_load)
5573 pe = loop_preheader_edge (loop_for_initial_load);
5574
5575 /* 3. For the case of the optimized realignment, create the first vector
5576 load at the loop preheader. */
5577
5578 if (alignment_support_scheme == dr_explicit_realign_optimized)
5579 {
5580 /* Create msq_init = *(floor(p1)) in the loop preheader */
5581 gassign *new_stmt;
5582
5583 gcc_assert (!compute_in_loop);
5584 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5585 ptr = vect_create_data_ref_ptr (vinfo, stmt_info, vectype,
5586 loop_for_initial_load, NULL_TREE,
5587 &init_addr, NULL, &inc, true);
5588 if (TREE_CODE (ptr) == SSA_NAME)
5589 new_temp = copy_ssa_name (ptr);
5590 else
5591 new_temp = make_ssa_name (TREE_TYPE (ptr));
5592 poly_uint64 align = DR_TARGET_ALIGNMENT (dr_info);
5593 tree type = TREE_TYPE (ptr);
5594 new_stmt = gimple_build_assign
5595 (new_temp, BIT_AND_EXPR, ptr,
5596 fold_build2 (MINUS_EXPR, type,
5597 build_int_cst (type, 0),
5598 build_int_cst (type, align)));
5599 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5600 gcc_assert (!new_bb);
5601 data_ref
5602 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
5603 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
5604 vect_copy_ref_info (data_ref, DR_REF (dr));
5605 new_stmt = gimple_build_assign (vec_dest, data_ref);
5606 new_temp = make_ssa_name (vec_dest, new_stmt);
5607 gimple_assign_set_lhs (new_stmt, new_temp);
5608 if (pe)
5609 {
5610 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5611 gcc_assert (!new_bb);
5612 }
5613 else
5614 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5615
5616 msq_init = gimple_assign_lhs (new_stmt);
5617 }
5618
5619 /* 4. Create realignment token using a target builtin, if available.
5620 It is done either inside the containing loop, or before LOOP (as
5621 determined above). */
5622
5623 if (targetm.vectorize.builtin_mask_for_load)
5624 {
5625 gcall *new_stmt;
5626 tree builtin_decl;
5627
5628 /* Compute INIT_ADDR - the initial address accessed by this memref. */
5629 if (!init_addr)
5630 {
5631 /* Generate the INIT_ADDR computation outside LOOP. */
5632 init_addr = vect_create_addr_base_for_vector_ref (vinfo,
5633 stmt_info, &stmts,
5634 NULL_TREE);
5635 if (loop)
5636 {
5637 pe = loop_preheader_edge (loop);
5638 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
5639 gcc_assert (!new_bb);
5640 }
5641 else
5642 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
5643 }
5644
5645 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
5646 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
5647 vec_dest =
5648 vect_create_destination_var (scalar_dest,
5649 gimple_call_return_type (new_stmt));
5650 new_temp = make_ssa_name (vec_dest, new_stmt);
5651 gimple_call_set_lhs (new_stmt, new_temp);
5652
5653 if (compute_in_loop)
5654 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5655 else
5656 {
5657 /* Generate the misalignment computation outside LOOP. */
5658 pe = loop_preheader_edge (loop);
5659 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5660 gcc_assert (!new_bb);
5661 }
5662
5663 *realignment_token = gimple_call_lhs (new_stmt);
5664
5665 /* The result of the CALL_EXPR to this builtin is determined from
5666 the value of the parameter and no global variables are touched
5667 which makes the builtin a "const" function. Requiring the
5668 builtin to have the "const" attribute makes it unnecessary
5669 to call mark_call_clobbered. */
5670 gcc_assert (TREE_READONLY (builtin_decl));
5671 }
5672
5673 if (alignment_support_scheme == dr_explicit_realign)
5674 return msq;
5675
5676 gcc_assert (!compute_in_loop);
5677 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
5678
5679
5680 /* 5. Create msq = phi <msq_init, lsq> in loop */
5681
5682 pe = loop_preheader_edge (containing_loop);
5683 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5684 msq = make_ssa_name (vec_dest);
5685 phi_stmt = create_phi_node (msq, containing_loop->header);
5686 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
5687
5688 return msq;
5689 }
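
/* Plain-C model of the value REALIGN_LOAD <msq, lsq, token> is expected to
   produce (illustration only; the precise semantics come from the target's
   realignment token): the NELT elements that start at the original,
   possibly unaligned address, taken from the concatenation of the two
   floor-aligned loads MSQ and LSQ.  MISALIGN is the misalignment counted
   in elements.  */

static void
model_realign_load (const unsigned int *msq, const unsigned int *lsq,
		    unsigned int nelt, unsigned int misalign,
		    unsigned int *result)
{
  for (unsigned int i = 0; i < nelt; i++)
    result[i] = (misalign + i < nelt
		 ? msq[misalign + i]
		 : lsq[misalign + i - nelt]);
}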
5690
5691
5692 /* Function vect_grouped_load_supported.
5693
5694 COUNT is the size of the load group (the number of statements plus the
5695 number of gaps). SINGLE_ELEMENT_P is true if there is actually
5696 only one statement, with a gap of COUNT - 1.
5697
5698 Returns true if a suitable permute exists. */
5699
5700 bool
5701 vect_grouped_load_supported (tree vectype, bool single_element_p,
5702 unsigned HOST_WIDE_INT count)
5703 {
5704 machine_mode mode = TYPE_MODE (vectype);
5705
5706 /* If this is single-element interleaving with an element distance
5707 that leaves unused vector loads around, punt - we would at least
5708 create very sub-optimal code in that case (and blow up memory;
5709 see PR65518). */
5710 if (single_element_p && maybe_gt (count, TYPE_VECTOR_SUBPARTS (vectype)))
5711 {
5712 if (dump_enabled_p ())
5713 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5714 "single-element interleaving not supported "
5715 "for not adjacent vector loads\n");
5716 return false;
5717 }
5718
5719 /* vect_permute_load_chain requires the group size to be equal to 3 or
5720 be a power of two. */
5721 if (count != 3 && exact_log2 (count) == -1)
5722 {
5723 if (dump_enabled_p ())
5724 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5725 "the size of the group of accesses"
5726 " is not a power of 2 or not equal to 3\n");
5727 return false;
5728 }
5729
5730 /* Check that the permutation is supported. */
5731 if (VECTOR_MODE_P (mode))
5732 {
5733 unsigned int i, j;
5734 if (count == 3)
5735 {
5736 unsigned int nelt;
5737 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5738 {
5739 if (dump_enabled_p ())
5740 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5741 "cannot handle groups of 3 loads for"
5742 " variable-length vectors\n");
5743 return false;
5744 }
5745
5746 vec_perm_builder sel (nelt, nelt, 1);
5747 sel.quick_grow (nelt);
5748 vec_perm_indices indices;
5749 unsigned int k;
5750 for (k = 0; k < 3; k++)
5751 {
5752 for (i = 0; i < nelt; i++)
5753 if (3 * i + k < 2 * nelt)
5754 sel[i] = 3 * i + k;
5755 else
5756 sel[i] = 0;
5757 indices.new_vector (sel, 2, nelt);
5758 if (!can_vec_perm_const_p (mode, indices))
5759 {
5760 if (dump_enabled_p ())
5761 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5762 "shuffle of 3 loads is not supported by"
5763 " target\n");
5764 return false;
5765 }
5766 for (i = 0, j = 0; i < nelt; i++)
5767 if (3 * i + k < 2 * nelt)
5768 sel[i] = i;
5769 else
5770 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5771 indices.new_vector (sel, 2, nelt);
5772 if (!can_vec_perm_const_p (mode, indices))
5773 {
5774 if (dump_enabled_p ())
5775 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5776 "shuffle of 3 loads is not supported by"
5777 " target\n");
5778 return false;
5779 }
5780 }
5781 return true;
5782 }
5783 else
5784 {
5785 /* If length is not equal to 3 then only power of 2 is supported. */
5786 gcc_assert (pow2p_hwi (count));
5787 poly_uint64 nelt = GET_MODE_NUNITS (mode);
5788
5789 /* The encoding has a single stepped pattern. */
5790 vec_perm_builder sel (nelt, 1, 3);
5791 sel.quick_grow (3);
5792 for (i = 0; i < 3; i++)
5793 sel[i] = i * 2;
5794 vec_perm_indices indices (sel, 2, nelt);
5795 if (can_vec_perm_const_p (mode, indices))
5796 {
5797 for (i = 0; i < 3; i++)
5798 sel[i] = i * 2 + 1;
5799 indices.new_vector (sel, 2, nelt);
5800 if (can_vec_perm_const_p (mode, indices))
5801 return true;
5802 }
5803 }
5804 }
5805
5806 if (dump_enabled_p ())
5807 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5808 "extract even/odd not supported by target\n");
5809 return false;
5810 }
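
/* Illustration only (plain C, not GCC internals): the two selectors built
   above for COUNT == 3 combine so that the first VEC_PERM gathers the
   elements 3 * i + K that live in the first two vectors and the second
   VEC_PERM fills the remaining slots from the third vector.  The net
   effect, for each K in {0, 1, 2}, is the gather below over the
   concatenation of the three loaded vectors.  */

static void
model_gather_every_third (const unsigned int *concat /* 3 * nelt elements */,
			  unsigned int nelt, unsigned int k,
			  unsigned int *result)
{
  for (unsigned int i = 0; i < nelt; i++)
    result[i] = concat[3 * i + k];
}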
5811
5812 /* Return TRUE if vec_{mask_}load_lanes is available for COUNT vectors of
5813 type VECTYPE. MASKED_P says whether the masked form is needed. */
5814
5815 bool
5816 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5817 bool masked_p)
5818 {
5819 if (masked_p)
5820 return vect_lanes_optab_supported_p ("vec_mask_load_lanes",
5821 vec_mask_load_lanes_optab,
5822 vectype, count);
5823 else
5824 return vect_lanes_optab_supported_p ("vec_load_lanes",
5825 vec_load_lanes_optab,
5826 vectype, count);
5827 }
5828
5829 /* Function vect_permute_load_chain.
5830
5831 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5832 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5833 the input data correctly. Return the final references for loads in
5834 RESULT_CHAIN.
5835
5836 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5837 The input is 4 vectors each containing 8 elements. We assign a number to each
5838 element, the input sequence is:
5839
5840 1st vec: 0 1 2 3 4 5 6 7
5841 2nd vec: 8 9 10 11 12 13 14 15
5842 3rd vec: 16 17 18 19 20 21 22 23
5843 4th vec: 24 25 26 27 28 29 30 31
5844
5845 The output sequence should be:
5846
5847 1st vec: 0 4 8 12 16 20 24 28
5848 2nd vec: 1 5 9 13 17 21 25 29
5849 3rd vec: 2 6 10 14 18 22 26 30
5850 4th vec: 3 7 11 15 19 23 27 31
5851
5852 i.e., the first output vector should contain the first elements of each
5853 interleaving group, etc.
5854
5855 We use extract_even/odd instructions to create such output. The input of
5856 each extract_even/odd operation is two vectors
5857 1st vec 2nd vec
5858 0 1 2 3 4 5 6 7
5859
5860 and the output is the vector of extracted even/odd elements. The output of
5861 extract_even will be: 0 2 4 6
5862 and of extract_odd: 1 3 5 7
5863
5864
5865 The permutation is done in log LENGTH stages. In each stage extract_even
5866 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5867 their order. In our example,
5868
5869 E1: extract_even (1st vec, 2nd vec)
5870 E2: extract_odd (1st vec, 2nd vec)
5871 E3: extract_even (3rd vec, 4th vec)
5872 E4: extract_odd (3rd vec, 4th vec)
5873
5874 The output for the first stage will be:
5875
5876 E1: 0 2 4 6 8 10 12 14
5877 E2: 1 3 5 7 9 11 13 15
5878 E3: 16 18 20 22 24 26 28 30
5879 E4: 17 19 21 23 25 27 29 31
5880
5881 In order to proceed and create the correct sequence for the next stage (or
5882 for the correct output, if the second stage is the last one, as in our
5883 example), we first put the output of extract_even operation and then the
5884 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5885 The input for the second stage is:
5886
5887 1st vec (E1): 0 2 4 6 8 10 12 14
5888 2nd vec (E3): 16 18 20 22 24 26 28 30
5889 3rd vec (E2): 1 3 5 7 9 11 13 15
5890 4th vec (E4): 17 19 21 23 25 27 29 31
5891
5892 The output of the second stage:
5893
5894 E1: 0 4 8 12 16 20 24 28
5895 E2: 2 6 10 14 18 22 26 30
5896 E3: 1 5 9 13 17 21 25 29
5897 E4: 3 7 11 15 19 23 27 31
5898
5899 And RESULT_CHAIN after reordering:
5900
5901 1st vec (E1): 0 4 8 12 16 20 24 28
5902 2nd vec (E3): 1 5 9 13 17 21 25 29
5903 3rd vec (E2): 2 6 10 14 18 22 26 30
5904 4th vec (E4): 3 7 11 15 19 23 27 31. */
5905
5906 static void
5907 vect_permute_load_chain (vec_info *vinfo, vec<tree> dr_chain,
5908 unsigned int length,
5909 stmt_vec_info stmt_info,
5910 gimple_stmt_iterator *gsi,
5911 vec<tree> *result_chain)
5912 {
5913 tree data_ref, first_vect, second_vect;
5914 tree perm_mask_even, perm_mask_odd;
5915 tree perm3_mask_low, perm3_mask_high;
5916 gimple *perm_stmt;
5917 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5918 unsigned int i, j, log_length = exact_log2 (length);
5919
5920 result_chain->quick_grow (length);
5921 memcpy (result_chain->address (), dr_chain.address (),
5922 length * sizeof (tree));
5923
5924 if (length == 3)
5925 {
5926 /* vect_grouped_load_supported ensures that this is constant. */
5927 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5928 unsigned int k;
5929
5930 vec_perm_builder sel (nelt, nelt, 1);
5931 sel.quick_grow (nelt);
5932 vec_perm_indices indices;
5933 for (k = 0; k < 3; k++)
5934 {
5935 for (i = 0; i < nelt; i++)
5936 if (3 * i + k < 2 * nelt)
5937 sel[i] = 3 * i + k;
5938 else
5939 sel[i] = 0;
5940 indices.new_vector (sel, 2, nelt);
5941 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5942
5943 for (i = 0, j = 0; i < nelt; i++)
5944 if (3 * i + k < 2 * nelt)
5945 sel[i] = i;
5946 else
5947 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5948 indices.new_vector (sel, 2, nelt);
5949 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5950
5951 first_vect = dr_chain[0];
5952 second_vect = dr_chain[1];
5953
5954 /* Create interleaving stmt (low part of):
5955 low = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
5956 ...}> */
5957 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5958 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5959 second_vect, perm3_mask_low);
5960 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5961
5962 /* Create interleaving stmt (high part of):
5963 high = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
5964 ...}> */
5965 first_vect = data_ref;
5966 second_vect = dr_chain[2];
5967 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5968 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5969 second_vect, perm3_mask_high);
5970 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5971 (*result_chain)[k] = data_ref;
5972 }
5973 }
5974 else
5975 {
5976 /* If length is not equal to 3 then only power of 2 is supported. */
5977 gcc_assert (pow2p_hwi (length));
5978
5979 /* The encoding has a single stepped pattern. */
5980 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5981 vec_perm_builder sel (nelt, 1, 3);
5982 sel.quick_grow (3);
5983 for (i = 0; i < 3; ++i)
5984 sel[i] = i * 2;
5985 vec_perm_indices indices (sel, 2, nelt);
5986 perm_mask_even = vect_gen_perm_mask_checked (vectype, indices);
5987
5988 for (i = 0; i < 3; ++i)
5989 sel[i] = i * 2 + 1;
5990 indices.new_vector (sel, 2, nelt);
5991 perm_mask_odd = vect_gen_perm_mask_checked (vectype, indices);
5992
5993 for (i = 0; i < log_length; i++)
5994 {
5995 for (j = 0; j < length; j += 2)
5996 {
5997 first_vect = dr_chain[j];
5998 second_vect = dr_chain[j+1];
5999
6000 /* data_ref = permute_even (first_data_ref, second_data_ref); */
6001 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
6002 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6003 first_vect, second_vect,
6004 perm_mask_even);
6005 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6006 (*result_chain)[j/2] = data_ref;
6007
6008 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
6009 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
6010 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6011 first_vect, second_vect,
6012 perm_mask_odd);
6013 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6014 (*result_chain)[j/2+length/2] = data_ref;
6015 }
6016 memcpy (dr_chain.address (), result_chain->address (),
6017 length * sizeof (tree));
6018 }
6019 }
6020 }
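
/* Plain-C reference model of the power-of-two path above (illustration
   only): apply the same log2 (LENGTH) extract-even/odd passes to arrays
   of element indices, storing the even outputs in the first half of the
   chain and the odd outputs in the second half, exactly as the comment
   describes.  For LENGTH == 4 and NELT == 8 the result is
   0 4 8 ... 28 / 1 5 9 ... 29 / 2 6 10 ... 30 / 3 7 11 ... 31.  The
   fixed buffer bound is an assumption made only to keep the sketch
   self-contained.  */

static void
model_permute_load_chain (unsigned int *chain, unsigned int length,
			  unsigned int nelt)
{
  unsigned int result[256];	/* assumes length * nelt <= 256 */

  for (unsigned int stage = 1; stage < length; stage *= 2)
    {
      for (unsigned int j = 0; j < length; j += 2)
	{
	  unsigned int *v1 = chain + j * nelt;
	  unsigned int *v2 = chain + (j + 1) * nelt;
	  unsigned int *even = result + (j / 2) * nelt;
	  unsigned int *odd = result + (j / 2 + length / 2) * nelt;
	  for (unsigned int i = 0; i < nelt / 2; i++)
	    {
	      even[i] = v1[2 * i];		  /* {0, 2, 4, ...}  */
	      even[nelt / 2 + i] = v2[2 * i];
	      odd[i] = v1[2 * i + 1];		  /* {1, 3, 5, ...}  */
	      odd[nelt / 2 + i] = v2[2 * i + 1];
	    }
	}
      memcpy (chain, result, length * nelt * sizeof (unsigned int));
    }
}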
6021
6022 /* Function vect_shift_permute_load_chain.
6023
6024 Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate a
6025 sequence of stmts to reorder the input data accordingly.
6026 Return the final references for loads in RESULT_CHAIN.
6027 Return true if it succeeded, false otherwise.
6028
6029 E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
6030 The input is 3 vectors each containing 8 elements. We assign a
6031 number to each element, the input sequence is:
6032
6033 1st vec: 0 1 2 3 4 5 6 7
6034 2nd vec: 8 9 10 11 12 13 14 15
6035 3rd vec: 16 17 18 19 20 21 22 23
6036
6037 The output sequence should be:
6038
6039 1st vec: 0 3 6 9 12 15 18 21
6040 2nd vec: 1 4 7 10 13 16 19 22
6041 3rd vec: 2 5 8 11 14 17 20 23
6042
6043 We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
6044
6045 First we shuffle all 3 vectors to get correct elements order:
6046
6047 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
6048 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
6049 3rd vec: (16 19 22) (17 20 23) (18 21)
6050
6051 Next we unite and shift vector 3 times:
6052
6053 1st step:
6054 shift right by 6 the concatenation of:
6055 "1st vec" and "2nd vec"
6056 ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
6057 "2nd vec" and "3rd vec"
6058 ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
6059 "3rd vec" and "1st vec"
6060 (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
6061 | New vectors |
6062
6063 So that now new vectors are:
6064
6065 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
6066 2nd vec: (10 13) (16 19 22) (17 20 23)
6067 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
6068
6069 2nd step:
6070 shift right by 5 the concatenation of:
6071 "1st vec" and "3rd vec"
6072 ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
6073 "2nd vec" and "1st vec"
6074 (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
6075 "3rd vec" and "2nd vec"
6076 (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
6077 | New vectors |
6078
6079 So that now new vectors are:
6080
6081 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
6082 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
6083 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
6084
6085 3rd step:
6086 shift right by 5 the concatenation of:
6087 "1st vec" and "1st vec"
6088 ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
6089 shift right by 3 the concatenation of:
6090 "2nd vec" and "2nd vec"
6091 (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
6092 | New vectors |
6093
6094 So that now all vectors are READY:
6095 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
6096 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
6097 3rd vec: ( 1 4 7) (10 13) (16 19 22)
6098
6099 This algorithm is faster than the one in vect_permute_load_chain if:
6100 1. "shift of a concatenation" is faster than general permutation.
6101 This is usually so.
6102 2. The TARGET machine can't execute vector instructions in parallel.
6103 This is because each step of the algorithm depends on the previous
6104 one. The algorithm in vect_permute_load_chain is much more parallel.
6105
6106 The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
6107 */
6108
6109 static bool
6110 vect_shift_permute_load_chain (vec_info *vinfo, vec<tree> dr_chain,
6111 unsigned int length,
6112 stmt_vec_info stmt_info,
6113 gimple_stmt_iterator *gsi,
6114 vec<tree> *result_chain)
6115 {
6116 tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
6117 tree perm2_mask1, perm2_mask2, perm3_mask;
6118 tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
6119 gimple *perm_stmt;
6120
6121 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6122 unsigned int i;
6123 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
6124
6125 unsigned HOST_WIDE_INT nelt, vf;
6126 if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nelt)
6127 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&vf))
6128 /* Not supported for variable-length vectors. */
6129 return false;
6130
6131 vec_perm_builder sel (nelt, nelt, 1);
6132 sel.quick_grow (nelt);
6133
6134 result_chain->quick_grow (length);
6135 memcpy (result_chain->address (), dr_chain.address (),
6136 length * sizeof (tree));
6137
6138 if (pow2p_hwi (length) && vf > 4)
6139 {
6140 unsigned int j, log_length = exact_log2 (length);
6141 for (i = 0; i < nelt / 2; ++i)
6142 sel[i] = i * 2;
6143 for (i = 0; i < nelt / 2; ++i)
6144 sel[nelt / 2 + i] = i * 2 + 1;
6145 vec_perm_indices indices (sel, 2, nelt);
6146 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6147 {
6148 if (dump_enabled_p ())
6149 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6150 "shuffle of 2 fields structure is not \
6151 supported by target\n");
6152 return false;
6153 }
6154 perm2_mask1 = vect_gen_perm_mask_checked (vectype, indices);
6155
6156 for (i = 0; i < nelt / 2; ++i)
6157 sel[i] = i * 2 + 1;
6158 for (i = 0; i < nelt / 2; ++i)
6159 sel[nelt / 2 + i] = i * 2;
6160 indices.new_vector (sel, 2, nelt);
6161 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6162 {
6163 if (dump_enabled_p ())
6164 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6165 "shuffle of 2 fields structure is not \
6166 supported by target\n");
6167 return false;
6168 }
6169 perm2_mask2 = vect_gen_perm_mask_checked (vectype, indices);
6170
6171 /* Generating permutation constant to shift all elements.
6172 For vector length 8 it is {4 5 6 7 8 9 10 11}. */
6173 for (i = 0; i < nelt; i++)
6174 sel[i] = nelt / 2 + i;
6175 indices.new_vector (sel, 2, nelt);
6176 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6177 {
6178 if (dump_enabled_p ())
6179 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6180 "shift permutation is not supported by target\n");
6181 return false;
6182 }
6183 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6184
6185 /* Generating permutation constant to select vector from 2.
6186 For vector length 8 it is {0 1 2 3 12 13 14 15}. */
6187 for (i = 0; i < nelt / 2; i++)
6188 sel[i] = i;
6189 for (i = nelt / 2; i < nelt; i++)
6190 sel[i] = nelt + i;
6191 indices.new_vector (sel, 2, nelt);
6192 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6193 {
6194 if (dump_enabled_p ())
6195 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6196 "select is not supported by target\n");
6197 return false;
6198 }
6199 select_mask = vect_gen_perm_mask_checked (vectype, indices);
6200
6201 for (i = 0; i < log_length; i++)
6202 {
6203 for (j = 0; j < length; j += 2)
6204 {
6205 first_vect = dr_chain[j];
6206 second_vect = dr_chain[j + 1];
6207
6208 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6209 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6210 first_vect, first_vect,
6211 perm2_mask1);
6212 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6213 vect[0] = data_ref;
6214
6215 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6216 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6217 second_vect, second_vect,
6218 perm2_mask2);
6219 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6220 vect[1] = data_ref;
6221
6222 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
6223 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6224 vect[0], vect[1], shift1_mask);
6225 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6226 (*result_chain)[j/2 + length/2] = data_ref;
6227
6228 data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
6229 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6230 vect[0], vect[1], select_mask);
6231 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6232 (*result_chain)[j/2] = data_ref;
6233 }
6234 memcpy (dr_chain.address (), result_chain->address (),
6235 length * sizeof (tree));
6236 }
6237 return true;
6238 }
6239 if (length == 3 && vf > 2)
6240 {
6241 unsigned int k = 0, l = 0;
6242
6243 /* Generating permutation constant to get all elements in right order.
6244 For vector length 8 it is {0 3 6 1 4 7 2 5}. */
6245 for (i = 0; i < nelt; i++)
6246 {
6247 if (3 * k + (l % 3) >= nelt)
6248 {
6249 k = 0;
6250 l += (3 - (nelt % 3));
6251 }
6252 sel[i] = 3 * k + (l % 3);
6253 k++;
6254 }
6255 vec_perm_indices indices (sel, 2, nelt);
6256 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6257 {
6258 if (dump_enabled_p ())
6259 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6260 "shuffle of 3 fields structure is not \
6261 supported by target\n");
6262 return false;
6263 }
6264 perm3_mask = vect_gen_perm_mask_checked (vectype, indices);
6265
6266 /* Generating permutation constant to shift all elements.
6267 For vector length 8 it is {6 7 8 9 10 11 12 13}. */
6268 for (i = 0; i < nelt; i++)
6269 sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
6270 indices.new_vector (sel, 2, nelt);
6271 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6272 {
6273 if (dump_enabled_p ())
6274 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6275 "shift permutation is not supported by target\n");
6276 return false;
6277 }
6278 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6279
6280 /* Generating permutation constant to shift all elements.
6281 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6282 for (i = 0; i < nelt; i++)
6283 sel[i] = 2 * (nelt / 3) + 1 + i;
6284 indices.new_vector (sel, 2, nelt);
6285 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6286 {
6287 if (dump_enabled_p ())
6288 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6289 "shift permutation is not supported by target\n");
6290 return false;
6291 }
6292 shift2_mask = vect_gen_perm_mask_checked (vectype, indices);
6293
6294 /* Generating permutation constant to shift all elements.
6295 For vector length 8 it is {3 4 5 6 7 8 9 10}. */
6296 for (i = 0; i < nelt; i++)
6297 sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
6298 indices.new_vector (sel, 2, nelt);
6299 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6300 {
6301 if (dump_enabled_p ())
6302 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6303 "shift permutation is not supported by target\n");
6304 return false;
6305 }
6306 shift3_mask = vect_gen_perm_mask_checked (vectype, indices);
6307
6308 /* Generating permutation constant to shift all elements.
6309 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6310 for (i = 0; i < nelt; i++)
6311 sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
6312 indices.new_vector (sel, 2, nelt);
6313 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6314 {
6315 if (dump_enabled_p ())
6316 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6317 "shift permutation is not supported by target\n");
6318 return false;
6319 }
6320 shift4_mask = vect_gen_perm_mask_checked (vectype, indices);
6321
6322 for (k = 0; k < 3; k++)
6323 {
6324 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
6325 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6326 dr_chain[k], dr_chain[k],
6327 perm3_mask);
6328 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6329 vect[k] = data_ref;
6330 }
6331
6332 for (k = 0; k < 3; k++)
6333 {
6334 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
6335 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6336 vect[k % 3], vect[(k + 1) % 3],
6337 shift1_mask);
6338 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6339 vect_shift[k] = data_ref;
6340 }
6341
6342 for (k = 0; k < 3; k++)
6343 {
6344 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
6345 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6346 vect_shift[(4 - k) % 3],
6347 vect_shift[(3 - k) % 3],
6348 shift2_mask);
6349 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6350 vect[k] = data_ref;
6351 }
6352
6353 (*result_chain)[3 - (nelt % 3)] = vect[2];
6354
6355 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
6356 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
6357 vect[0], shift3_mask);
6358 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6359 (*result_chain)[nelt % 3] = data_ref;
6360
6361 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
6362 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
6363 vect[1], shift4_mask);
6364 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6365 (*result_chain)[0] = data_ref;
6366 return true;
6367 }
6368 return false;
6369 }
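
/* Plain-C illustration (not GCC internals) of the "shift of a
   concatenation" building block used repeatedly above:
   VEC_PERM_EXPR <v0, v1, {S, S + 1, ..., S + NELT - 1}> selects the NELT
   consecutive elements starting at offset S of the concatenation v0|v1.
   shift1_mask, shift2_mask, shift3_mask and shift4_mask are all of this
   form, differing only in S.  */

static void
model_shift_of_concat (const unsigned int *v0, const unsigned int *v1,
		       unsigned int nelt, unsigned int s,
		       unsigned int *result)
{
  for (unsigned int i = 0; i < nelt; i++)
    {
      unsigned int idx = s + i;		/* index into the concatenation  */
      result[i] = idx < nelt ? v0[idx] : v1[idx - nelt];
    }
}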
6370
6371 /* Function vect_transform_grouped_load.
6372
6373 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
6374 to perform their permutation and ascribe the result vectorized statements to
6375 the scalar statements.
6376 */
6377
6378 void
6379 vect_transform_grouped_load (vec_info *vinfo, stmt_vec_info stmt_info,
6380 vec<tree> dr_chain,
6381 int size, gimple_stmt_iterator *gsi)
6382 {
6383 machine_mode mode;
6384 vec<tree> result_chain = vNULL;
6385
6386 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
6387 RESULT_CHAIN is the output of vect_permute_load_chain: it contains the
6388 permuted vectors, which are ready for vector computation. */
6389 result_chain.create (size);
6390
6391 /* If the reassociation width for the vector type is 2 or greater, the target
6392 machine can execute 2 or more vector instructions in parallel.  Otherwise
6393 try to get the chain for the load group using vect_shift_permute_load_chain. */
6394 mode = TYPE_MODE (STMT_VINFO_VECTYPE (stmt_info));
6395 if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
6396 || pow2p_hwi (size)
6397 || !vect_shift_permute_load_chain (vinfo, dr_chain, size, stmt_info,
6398 gsi, &result_chain))
6399 vect_permute_load_chain (vinfo, dr_chain,
6400 size, stmt_info, gsi, &result_chain);
6401 vect_record_grouped_load_vectors (vinfo, stmt_info, result_chain);
6402 result_chain.release ();
6403 }
6404
6405 /* RESULT_CHAIN contains the output of a group of grouped loads that were
6406 generated as part of the vectorization of STMT_INFO. Assign the statement
6407 for each vector to the associated scalar statement. */
6408
6409 void
6410 vect_record_grouped_load_vectors (vec_info *vinfo, stmt_vec_info stmt_info,
6411 vec<tree> result_chain)
6412 {
6413 stmt_vec_info first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
6414 unsigned int i, gap_count;
6415 tree tmp_data_ref;
6416
6417 /* Put a permuted data-ref in the VECTORIZED_STMT field.
6418 Since we scan the chain starting from its first node, their order
6419 corresponds to the order of data-refs in RESULT_CHAIN. */
6420 stmt_vec_info next_stmt_info = first_stmt_info;
6421 gap_count = 1;
6422 FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
6423 {
6424 if (!next_stmt_info)
6425 break;
6426
6427 /* Skip the gaps. Loads created for the gaps will be removed by dead
6428 code elimination pass later. No need to check for the first stmt in
6429 the group, since it always exists.
6430 DR_GROUP_GAP is the number of steps in elements from the previous
6431 access (if there is no gap DR_GROUP_GAP is 1). We skip loads that
6432 correspond to the gaps. */
6433 if (next_stmt_info != first_stmt_info
6434 && gap_count < DR_GROUP_GAP (next_stmt_info))
6435 {
6436 gap_count++;
6437 continue;
6438 }
6439
6440 /* ??? The following needs cleanup after the removal of
6441 DR_GROUP_SAME_DR_STMT. */
6442 if (next_stmt_info)
6443 {
6444 stmt_vec_info new_stmt_info = vinfo->lookup_def (tmp_data_ref);
6445 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
6446 copies, and we put the new vector statement in the first available
6447 RELATED_STMT. */
6448 if (!STMT_VINFO_VEC_STMT (next_stmt_info))
6449 STMT_VINFO_VEC_STMT (next_stmt_info) = new_stmt_info;
6450 else
6451 {
6452 stmt_vec_info prev_stmt_info
6453 = STMT_VINFO_VEC_STMT (next_stmt_info);
6454 stmt_vec_info rel_stmt_info
6455 = STMT_VINFO_RELATED_STMT (prev_stmt_info);
6456 while (rel_stmt_info)
6457 {
6458 prev_stmt_info = rel_stmt_info;
6459 rel_stmt_info = STMT_VINFO_RELATED_STMT (rel_stmt_info);
6460 }
6461
6462 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt_info;
6463 }
6464
6465 next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
6466 gap_count = 1;
6467 }
6468 }
6469 }
6470
6471 /* Function vect_can_force_dr_alignment_p.
6472
6473 Returns whether DECL can be forced to be aligned on an ALIGNMENT-bit
6474 boundary. */
6475
6476 bool
6477 vect_can_force_dr_alignment_p (const_tree decl, poly_uint64 alignment)
6478 {
6479 if (!VAR_P (decl))
6480 return false;
6481
6482 if (decl_in_symtab_p (decl)
6483 && !symtab_node::get (decl)->can_increase_alignment_p ())
6484 return false;
6485
6486 if (TREE_STATIC (decl))
6487 return (known_le (alignment,
6488 (unsigned HOST_WIDE_INT) MAX_OFILE_ALIGNMENT));
6489 else
6490 return (known_le (alignment, (unsigned HOST_WIDE_INT) MAX_STACK_ALIGNMENT));
6491 }
6492
6493
6494 /* Return whether the data reference DR_INFO is supported with respect to its
6495 alignment.
6496 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
6497 if it is aligned, i.e., check if it is possible to vectorize it with a
6498 different alignment. */
6499
6500 enum dr_alignment_support
6501 vect_supportable_dr_alignment (vec_info *vinfo, dr_vec_info *dr_info,
6502 bool check_aligned_accesses)
6503 {
6504 data_reference *dr = dr_info->dr;
6505 stmt_vec_info stmt_info = dr_info->stmt;
6506 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6507 machine_mode mode = TYPE_MODE (vectype);
6508 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
6509 class loop *vect_loop = NULL;
6510 bool nested_in_vect_loop = false;
6511
6512 if (aligned_access_p (dr_info) && !check_aligned_accesses)
6513 return dr_aligned;
6514
6515 /* For now assume all conditional loads/stores support unaligned
6516 access without any special code. */
6517 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
6518 if (gimple_call_internal_p (stmt)
6519 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
6520 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
6521 return dr_unaligned_supported;
6522
6523 if (loop_vinfo)
6524 {
6525 vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
6526 nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt_info);
6527 }
6528
6529 /* Possibly unaligned access. */
6530
6531 /* We can choose between using the implicit realignment scheme (generating
6532 a misaligned_move stmt) and the explicit realignment scheme (generating
6533 aligned loads with a REALIGN_LOAD). There are two variants to the
6534 explicit realignment scheme: optimized, and unoptimized.
6535 We can optimize the realignment only if the step between consecutive
6536 vector loads is equal to the vector size. Since the vector memory
6537 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
6538 is guaranteed that the misalignment amount remains the same throughout the
6539 execution of the vectorized loop. Therefore, we can create the
6540 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
6541 at the loop preheader.
6542
6543 However, in the case of outer-loop vectorization, when vectorizing a
6544 memory access in the inner-loop nested within the LOOP that is now being
6545 vectorized, while it is guaranteed that the misalignment of the
6546 vectorized memory access will remain the same in different outer-loop
6547 iterations, it is *not* guaranteed that is will remain the same throughout
6548 the execution of the inner-loop. This is because the inner-loop advances
6549 with the original scalar step (and not in steps of VS). If the inner-loop
6550 step happens to be a multiple of VS, then the misalignment remains fixed
6551 and we can use the optimized realignment scheme. For example:
6552
6553 for (i=0; i<N; i++)
6554 for (j=0; j<M; j++)
6555 s += a[i+j];
6556
6557 When vectorizing the i-loop in the above example, the step between
6558 consecutive vector loads is 1, and so the misalignment does not remain
6559 fixed across the execution of the inner-loop, and the realignment cannot
6560 be optimized (as illustrated in the following pseudo vectorized loop):
6561
6562 for (i=0; i<N; i+=4)
6563 for (j=0; j<M; j++){
6564 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
6565 // when j is {0,1,2,3,4,5,6,7,...} respectively.
6566 // (assuming that we start from an aligned address).
6567 }
6568
6569 We therefore have to use the unoptimized realignment scheme:
6570
6571 for (i=0; i<N; i+=4)
6572 for (j=k; j<M; j+=4)
6573 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
6574 // that the misalignment of the initial address is
6575 // 0).
6576
6577 The loop can then be vectorized as follows:
6578
6579 for (k=0; k<4; k++){
6580 rt = get_realignment_token (&vp[k]);
6581 for (i=0; i<N; i+=4){
6582 v1 = vp[i+k];
6583 for (j=k; j<M; j+=4){
6584 v2 = vp[i+j+VS-1];
6585 va = REALIGN_LOAD <v1,v2,rt>;
6586 vs += va;
6587 v1 = v2;
6588 }
6589 }
6590 } */
6591
6592 if (DR_IS_READ (dr))
6593 {
6594 bool is_packed = false;
6595 tree type = (TREE_TYPE (DR_REF (dr)));
6596
6597 if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
6598 && (!targetm.vectorize.builtin_mask_for_load
6599 || targetm.vectorize.builtin_mask_for_load ()))
6600 {
6601 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6602
6603 /* If we are doing SLP then the accesses need not have the
6604 same alignment; instead it depends on the SLP group size. */
6605 if (loop_vinfo
6606 && STMT_SLP_TYPE (stmt_info)
6607 && !multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6608 * (DR_GROUP_SIZE
6609 (DR_GROUP_FIRST_ELEMENT (stmt_info))),
6610 TYPE_VECTOR_SUBPARTS (vectype)))
6611 ;
6612 else if (!loop_vinfo
6613 || (nested_in_vect_loop
6614 && maybe_ne (TREE_INT_CST_LOW (DR_STEP (dr)),
6615 GET_MODE_SIZE (TYPE_MODE (vectype)))))
6616 return dr_explicit_realign;
6617 else
6618 return dr_explicit_realign_optimized;
6619 }
6620 if (!known_alignment_for_access_p (dr_info))
6621 is_packed = not_size_aligned (DR_REF (dr));
6622
6623 if (targetm.vectorize.support_vector_misalignment
6624 (mode, type, DR_MISALIGNMENT (dr_info), is_packed))
6625 /* Can't software pipeline the loads, but can at least do them. */
6626 return dr_unaligned_supported;
6627 }
6628 else
6629 {
6630 bool is_packed = false;
6631 tree type = (TREE_TYPE (DR_REF (dr)));
6632
6633 if (!known_alignment_for_access_p (dr_info))
6634 is_packed = not_size_aligned (DR_REF (dr));
6635
6636 if (targetm.vectorize.support_vector_misalignment
6637 (mode, type, DR_MISALIGNMENT (dr_info), is_packed))
6638 return dr_unaligned_supported;
6639 }
6640
6641 /* Unsupported. */
6642 return dr_unaligned_unsupported;
6643 }