gcc/tree-vect-data-refs.c
1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2 Copyright (C) 2003-2019 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "predict.h"
31 #include "memmodel.h"
32 #include "tm_p.h"
33 #include "ssa.h"
34 #include "optabs-tree.h"
35 #include "cgraph.h"
36 #include "dumpfile.h"
37 #include "alias.h"
38 #include "fold-const.h"
39 #include "stor-layout.h"
40 #include "tree-eh.h"
41 #include "gimplify.h"
42 #include "gimple-iterator.h"
43 #include "gimplify-me.h"
44 #include "tree-ssa-loop-ivopts.h"
45 #include "tree-ssa-loop-manip.h"
46 #include "tree-ssa-loop.h"
47 #include "cfgloop.h"
48 #include "tree-scalar-evolution.h"
49 #include "tree-vectorizer.h"
50 #include "expr.h"
51 #include "builtins.h"
52 #include "params.h"
53 #include "tree-cfg.h"
54 #include "tree-hash-traits.h"
55 #include "vec-perm-indices.h"
56 #include "internal-fn.h"
57
58 /* Return true if load- or store-lanes optab OPTAB is implemented for
59 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
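/* For example, with VECTYPE V4SI (a 128-bit vector) and COUNT 3, the
   target either provides an array mode holding three V4SI vectors or we
   fall back to looking for an integer mode of 3 * 128 = 384 bits; if no
   usable mode exists, or the load- or store-lanes pattern is not
   implemented for it, the function returns false.  */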
60
61 static bool
62 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
63 tree vectype, unsigned HOST_WIDE_INT count)
64 {
65 machine_mode mode, array_mode;
66 bool limit_p;
67
68 mode = TYPE_MODE (vectype);
69 if (!targetm.array_mode (mode, count).exists (&array_mode))
70 {
71 poly_uint64 bits = count * GET_MODE_BITSIZE (mode);
72 limit_p = !targetm.array_mode_supported_p (mode, count);
73 if (!int_mode_for_size (bits, limit_p).exists (&array_mode))
74 {
75 if (dump_enabled_p ())
76 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
77 "no array mode for %s[%wu]\n",
78 GET_MODE_NAME (mode), count);
79 return false;
80 }
81 }
82
83 if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
84 {
85 if (dump_enabled_p ())
86 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
87 "cannot use %s<%s><%s>\n", name,
88 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
89 return false;
90 }
91
92 if (dump_enabled_p ())
93 dump_printf_loc (MSG_NOTE, vect_location,
94 "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
95 GET_MODE_NAME (mode));
96
97 return true;
98 }
99
100
101 /* Return the smallest scalar part of STMT_INFO.
102 This is used to determine the vectype of the stmt. We generally set the
103 vectype according to the type of the result (lhs). For stmts whose
104 result-type is different than the type of the arguments (e.g., demotion,
105 promotion), vectype will be reset appropriately (later). Note that we have
106 to visit the smallest datatype in this function, because that determines the
107 VF. If the smallest datatype in the loop is present only as the rhs of a
108 promotion operation, we would miss it.
109 Such a case, where a variable of this datatype does not appear in the lhs
110 anywhere in the loop, can only occur if it's an invariant: e.g.:
111 'int_x = (int) short_inv', which we'd expect to have been optimized away by
112 invariant motion. However, we cannot rely on invariant motion to always
113 take invariants out of the loop, and so in the case of promotion we also
114 have to check the rhs.
115 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
116 types. */
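/* For example, on a target where int is 4 bytes and short is 2 bytes,
   the widening statement 'int_x = (int) short_y' has a 4-byte result and
   a 2-byte rhs, so the function returns the short type and sets
   *LHS_SIZE_UNIT to 4 and *RHS_SIZE_UNIT to 2.  */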
117
118 tree
119 vect_get_smallest_scalar_type (stmt_vec_info stmt_info,
120 HOST_WIDE_INT *lhs_size_unit,
121 HOST_WIDE_INT *rhs_size_unit)
122 {
123 tree scalar_type = gimple_expr_type (stmt_info->stmt);
124 HOST_WIDE_INT lhs, rhs;
125
126 /* During the analysis phase, this function is called on arbitrary
127 statements that might not have scalar results. */
128 if (!tree_fits_uhwi_p (TYPE_SIZE_UNIT (scalar_type)))
129 return scalar_type;
130
131 lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
132
133 gassign *assign = dyn_cast <gassign *> (stmt_info->stmt);
134 if (assign
135 && (gimple_assign_cast_p (assign)
136 || gimple_assign_rhs_code (assign) == DOT_PROD_EXPR
137 || gimple_assign_rhs_code (assign) == WIDEN_SUM_EXPR
138 || gimple_assign_rhs_code (assign) == WIDEN_MULT_EXPR
139 || gimple_assign_rhs_code (assign) == WIDEN_LSHIFT_EXPR
140 || gimple_assign_rhs_code (assign) == FLOAT_EXPR))
141 {
142 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (assign));
143
144 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
145 if (rhs < lhs)
146 scalar_type = rhs_type;
147 }
148 else if (gcall *call = dyn_cast <gcall *> (stmt_info->stmt))
149 {
150 unsigned int i = 0;
151 if (gimple_call_internal_p (call))
152 {
153 internal_fn ifn = gimple_call_internal_fn (call);
154 if (internal_load_fn_p (ifn) || internal_store_fn_p (ifn))
155 /* gimple_expr_type already picked the type of the loaded
156 or stored data. */
157 i = ~0U;
158 else if (internal_fn_mask_index (ifn) == 0)
159 i = 1;
160 }
161 if (i < gimple_call_num_args (call))
162 {
163 tree rhs_type = TREE_TYPE (gimple_call_arg (call, i));
164 if (tree_fits_uhwi_p (TYPE_SIZE_UNIT (rhs_type)))
165 {
166 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
167 if (rhs < lhs)
168 scalar_type = rhs_type;
169 }
170 }
171 }
172
173 *lhs_size_unit = lhs;
174 *rhs_size_unit = rhs;
175 return scalar_type;
176 }
177
178
179 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
180 tested at run-time. Return TRUE if DDR was successfully inserted.
181 Return false if versioning is not supported. */
182
183 static opt_result
184 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
185 {
186 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
187
188 if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
189 return opt_result::failure_at (vect_location,
190 "will not create alias checks, as"
191 " --param vect-max-version-for-alias-checks"
192 " == 0\n");
193
194 opt_result res
195 = runtime_alias_check_p (ddr, loop,
196 optimize_loop_nest_for_speed_p (loop));
197 if (!res)
198 return res;
199
200 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
201 return opt_result::success ();
202 }
203
204 /* Record that loop LOOP_VINFO needs to check that VALUE is nonzero. */
205
206 static void
207 vect_check_nonzero_value (loop_vec_info loop_vinfo, tree value)
208 {
209 vec<tree> checks = LOOP_VINFO_CHECK_NONZERO (loop_vinfo);
210 for (unsigned int i = 0; i < checks.length(); ++i)
211 if (checks[i] == value)
212 return;
213
214 if (dump_enabled_p ())
215 dump_printf_loc (MSG_NOTE, vect_location,
216 "need run-time check that %T is nonzero\n",
217 value);
218 LOOP_VINFO_CHECK_NONZERO (loop_vinfo).safe_push (value);
219 }
220
221 /* Return true if we know that the order of vectorized DR_INFO_A and
222 vectorized DR_INFO_B will be the same as the order of DR_INFO_A and
223 DR_INFO_B. At least one of the accesses is a write. */
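/* For example, for a scalar sequence such as
     ... = a[i];   <-- load, part of a group
     a[i] = ...;   <-- store, part of a group
   the vectorized store must not end up being emitted before the
   vectorized load.  Group loads may be emitted at the position of the
   first or the last scalar load (interleaving vs. SLP) and group stores
   at the position of the last scalar store, so both possibilities are
   checked against the original statement order.  */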
224
225 static bool
226 vect_preserves_scalar_order_p (dr_vec_info *dr_info_a, dr_vec_info *dr_info_b)
227 {
228 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
229 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
230
231 /* Single statements are always kept in their original order. */
232 if (!STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
233 && !STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
234 return true;
235
236 /* STMT_A and STMT_B belong to overlapping groups. All loads in a
237 SLP group are emitted at the position of the last scalar load and
238 all loads in an interleaving group are emitted at the position
239 of the first scalar load.
240 Stores in a group are emitted at the position of the last scalar store.
241 Compute that position and check whether the resulting order matches
242 the current one.
243 We have not yet decided between SLP and interleaving so we have
244 to conservatively assume both. */
245 stmt_vec_info il_a;
246 stmt_vec_info last_a = il_a = DR_GROUP_FIRST_ELEMENT (stmtinfo_a);
247 if (last_a)
248 {
249 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (last_a); s;
250 s = DR_GROUP_NEXT_ELEMENT (s))
251 last_a = get_later_stmt (last_a, s);
252 if (!DR_IS_WRITE (STMT_VINFO_DATA_REF (stmtinfo_a)))
253 {
254 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_a); s;
255 s = DR_GROUP_NEXT_ELEMENT (s))
256 if (get_later_stmt (il_a, s) == il_a)
257 il_a = s;
258 }
259 else
260 il_a = last_a;
261 }
262 else
263 last_a = il_a = stmtinfo_a;
264 stmt_vec_info il_b;
265 stmt_vec_info last_b = il_b = DR_GROUP_FIRST_ELEMENT (stmtinfo_b);
266 if (last_b)
267 {
268 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (last_b); s;
269 s = DR_GROUP_NEXT_ELEMENT (s))
270 last_b = get_later_stmt (last_b, s);
271 if (!DR_IS_WRITE (STMT_VINFO_DATA_REF (stmtinfo_b)))
272 {
273 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_b); s;
274 s = DR_GROUP_NEXT_ELEMENT (s))
275 if (get_later_stmt (il_b, s) == il_b)
276 il_b = s;
277 }
278 else
279 il_b = last_b;
280 }
281 else
282 last_b = il_b = stmtinfo_b;
283 bool a_after_b = (get_later_stmt (stmtinfo_a, stmtinfo_b) == stmtinfo_a);
284 return (/* SLP */
285 (get_later_stmt (last_a, last_b) == last_a) == a_after_b
286 /* Interleaving */
287 && (get_later_stmt (il_a, il_b) == il_a) == a_after_b
288 /* Mixed */
289 && (get_later_stmt (il_a, last_b) == il_a) == a_after_b
290 && (get_later_stmt (last_a, il_b) == last_a) == a_after_b);
291 }
292
293 /* A subroutine of vect_analyze_data_ref_dependence. Handle
294 DDR_COULD_BE_INDEPENDENT_P ddr DDR that has a known set of dependence
295 distances. These distances are conservatively correct but they don't
296 reflect a guaranteed dependence.
297
298 Return true if this function does all the work necessary to avoid
299 an alias or false if the caller should use the dependence distances
300 to limit the vectorization factor in the usual way. LOOP_DEPTH is
301 the depth of the loop described by LOOP_VINFO and the other arguments
302 are as for vect_analyze_data_ref_dependence. */
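/* For example, if the loop was annotated with '#pragma omp simd
   safelen(8)' and the recorded dependence distance is 4, the distance is
   within the asserted safe length, so the accesses are treated as
   independent and *MAX_VF is merely capped at 8 rather than adding a
   run-time alias check or limiting the VF to the distance.  */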
303
304 static bool
305 vect_analyze_possibly_independent_ddr (data_dependence_relation *ddr,
306 loop_vec_info loop_vinfo,
307 int loop_depth, unsigned int *max_vf)
308 {
309 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
310 lambda_vector dist_v;
311 unsigned int i;
312 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
313 {
314 int dist = dist_v[loop_depth];
315 if (dist != 0 && !(dist > 0 && DDR_REVERSED_P (ddr)))
316 {
317 /* If the user asserted safelen >= DIST consecutive iterations
318 can be executed concurrently, assume independence.
319
320 ??? An alternative would be to add the alias check even
321 in this case, and vectorize the fallback loop with the
322 maximum VF set to safelen. However, if the user has
323 explicitly given a length, it's less likely that that
324 would be a win. */
325 if (loop->safelen >= 2 && abs_hwi (dist) <= loop->safelen)
326 {
327 if ((unsigned int) loop->safelen < *max_vf)
328 *max_vf = loop->safelen;
329 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
330 continue;
331 }
332
333 /* For dependence distances of 2 or more, we have the option
334 of limiting VF or checking for an alias at runtime.
335 Prefer to check at runtime if we can, to avoid limiting
336 the VF unnecessarily when the bases are in fact independent.
337
338 Note that the alias checks will be removed if the VF ends up
339 being small enough. */
340 dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (DDR_A (ddr));
341 dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (DDR_B (ddr));
342 return (!STMT_VINFO_GATHER_SCATTER_P (dr_info_a->stmt)
343 && !STMT_VINFO_GATHER_SCATTER_P (dr_info_b->stmt)
344 && vect_mark_for_runtime_alias_test (ddr, loop_vinfo));
345 }
346 }
347 return true;
348 }
349
350
351 /* Function vect_analyze_data_ref_dependence.
352
353 FIXME: I needed to change the sense of the returned flag.
354
355 Return a failure result if there (might) exist a dependence between a
356 memory-reference DRA and a memory-reference DRB that prevents vectorization.
357 When versioning for alias can check the dependence at run-time, return
358 success. Adjust *MAX_VF according to the data dependence. */
359
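/* For example, in
     for (i = 0; i < n; i++)
       a[i + 4] = a[i] + 1;
   the read of a[i] and the write of a[i + 4] have dependence distance 4,
   so the loop can still be vectorized as long as no more than four
   iterations are executed at once; *MAX_VF is capped at 4 below.  */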
360 static opt_result
361 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
362 loop_vec_info loop_vinfo,
363 unsigned int *max_vf)
364 {
365 unsigned int i;
366 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
367 struct data_reference *dra = DDR_A (ddr);
368 struct data_reference *drb = DDR_B (ddr);
369 dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (dra);
370 dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (drb);
371 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
372 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
373 lambda_vector dist_v;
374 unsigned int loop_depth;
375
376 /* In loop analysis all data references should be vectorizable. */
377 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
378 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
379 gcc_unreachable ();
380
381 /* Independent data accesses. */
382 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
383 return opt_result::success ();
384
385 if (dra == drb
386 || (DR_IS_READ (dra) && DR_IS_READ (drb)))
387 return opt_result::success ();
388
389 /* We do not have to consider dependences between accesses that belong
390 to the same group, unless the stride could be smaller than the
391 group size. */
392 if (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
393 && (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
394 == DR_GROUP_FIRST_ELEMENT (stmtinfo_b))
395 && !STMT_VINFO_STRIDED_P (stmtinfo_a))
396 return opt_result::success ();
397
398 /* Even if we have an anti-dependence then, as the vectorized loop covers at
399 least two scalar iterations, there is always also a true dependence.
400 As the vectorizer does not re-order loads and stores we can ignore
401 the anti-dependence if TBAA can disambiguate both DRs similar to the
402 case with known negative distance anti-dependences (positive
403 distance anti-dependences would violate TBAA constraints). */
404 if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
405 || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
406 && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
407 get_alias_set (DR_REF (drb))))
408 return opt_result::success ();
409
410 /* Unknown data dependence. */
411 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
412 {
413 /* If user asserted safelen consecutive iterations can be
414 executed concurrently, assume independence. */
415 if (loop->safelen >= 2)
416 {
417 if ((unsigned int) loop->safelen < *max_vf)
418 *max_vf = loop->safelen;
419 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
420 return opt_result::success ();
421 }
422
423 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
424 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
425 return opt_result::failure_at
426 (stmtinfo_a->stmt,
427 "versioning for alias not supported for: "
428 "can't determine dependence between %T and %T\n",
429 DR_REF (dra), DR_REF (drb));
430
431 if (dump_enabled_p ())
432 dump_printf_loc (MSG_MISSED_OPTIMIZATION, stmtinfo_a->stmt,
433 "versioning for alias required: "
434 "can't determine dependence between %T and %T\n",
435 DR_REF (dra), DR_REF (drb));
436
437 /* Add to list of ddrs that need to be tested at run-time. */
438 return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
439 }
440
441 /* Known data dependence. */
442 if (DDR_NUM_DIST_VECTS (ddr) == 0)
443 {
444 /* If user asserted safelen consecutive iterations can be
445 executed concurrently, assume independence. */
446 if (loop->safelen >= 2)
447 {
448 if ((unsigned int) loop->safelen < *max_vf)
449 *max_vf = loop->safelen;
450 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
451 return opt_result::success ();
452 }
453
454 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
455 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
456 return opt_result::failure_at
457 (stmtinfo_a->stmt,
458 "versioning for alias not supported for: "
459 "bad dist vector for %T and %T\n",
460 DR_REF (dra), DR_REF (drb));
461
462 if (dump_enabled_p ())
463 dump_printf_loc (MSG_MISSED_OPTIMIZATION, stmtinfo_a->stmt,
464 "versioning for alias required: "
465 "bad dist vector for %T and %T\n",
466 DR_REF (dra), DR_REF (drb));
467 /* Add to list of ddrs that need to be tested at run-time. */
468 return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
469 }
470
471 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
472
473 if (DDR_COULD_BE_INDEPENDENT_P (ddr)
474 && vect_analyze_possibly_independent_ddr (ddr, loop_vinfo,
475 loop_depth, max_vf))
476 return opt_result::success ();
477
478 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
479 {
480 int dist = dist_v[loop_depth];
481
482 if (dump_enabled_p ())
483 dump_printf_loc (MSG_NOTE, vect_location,
484 "dependence distance = %d.\n", dist);
485
486 if (dist == 0)
487 {
488 if (dump_enabled_p ())
489 dump_printf_loc (MSG_NOTE, vect_location,
490 "dependence distance == 0 between %T and %T\n",
491 DR_REF (dra), DR_REF (drb));
492
493 /* When we perform grouped accesses and perform implicit CSE
494 by detecting equal accesses and doing disambiguation with
495 runtime alias tests like for
496 .. = a[i];
497 .. = a[i+1];
498 a[i] = ..;
499 a[i+1] = ..;
500 *p = ..;
501 .. = a[i];
502 .. = a[i+1];
503 where we will end up loading { a[i], a[i+1] } once, make
504 sure that inserting group loads before the first load and
505 stores after the last store will do the right thing.
506 Similar for groups like
507 a[i] = ...;
508 ... = a[i];
509 a[i+1] = ...;
510 where loads from the group interleave with the store. */
511 if (!vect_preserves_scalar_order_p (dr_info_a, dr_info_b))
512 return opt_result::failure_at (stmtinfo_a->stmt,
513 "READ_WRITE dependence"
514 " in interleaving.\n");
515
516 if (loop->safelen < 2)
517 {
518 tree indicator = dr_zero_step_indicator (dra);
519 if (!indicator || integer_zerop (indicator))
520 return opt_result::failure_at (stmtinfo_a->stmt,
521 "access also has a zero step\n");
522 else if (TREE_CODE (indicator) != INTEGER_CST)
523 vect_check_nonzero_value (loop_vinfo, indicator);
524 }
525 continue;
526 }
527
528 if (dist > 0 && DDR_REVERSED_P (ddr))
529 {
530 /* If DDR_REVERSED_P the order of the data-refs in DDR was
531 reversed (to make distance vector positive), and the actual
532 distance is negative. */
533 if (dump_enabled_p ())
534 dump_printf_loc (MSG_NOTE, vect_location,
535 "dependence distance negative.\n");
536 /* When doing outer loop vectorization, we need to check if there is
537 a backward dependence at the inner loop level if the dependence
538 at the outer loop is reversed. See PR81740. */
539 if (nested_in_vect_loop_p (loop, stmtinfo_a)
540 || nested_in_vect_loop_p (loop, stmtinfo_b))
541 {
542 unsigned inner_depth = index_in_loop_nest (loop->inner->num,
543 DDR_LOOP_NEST (ddr));
544 if (dist_v[inner_depth] < 0)
545 return opt_result::failure_at (stmtinfo_a->stmt,
546 "not vectorized, dependence "
547 "between data-refs %T and %T\n",
548 DR_REF (dra), DR_REF (drb));
549 }
550 /* Record a negative dependence distance to later limit the
551 amount of stmt copying / unrolling we can perform.
552 Only need to handle read-after-write dependence. */
553 if (DR_IS_READ (drb)
554 && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
555 || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
556 STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
557 continue;
558 }
559
560 unsigned int abs_dist = abs (dist);
561 if (abs_dist >= 2 && abs_dist < *max_vf)
562 {
563 /* The dependence distance requires reduction of the maximal
564 vectorization factor. */
565 *max_vf = abs_dist;
566 if (dump_enabled_p ())
567 dump_printf_loc (MSG_NOTE, vect_location,
568 "adjusting maximal vectorization factor to %i\n",
569 *max_vf);
570 }
571
572 if (abs_dist >= *max_vf)
573 {
574 /* Dependence distance does not create dependence, as far as
575 vectorization is concerned, in this case. */
576 if (dump_enabled_p ())
577 dump_printf_loc (MSG_NOTE, vect_location,
578 "dependence distance >= VF.\n");
579 continue;
580 }
581
582 return opt_result::failure_at (stmtinfo_a->stmt,
583 "not vectorized, possible dependence "
584 "between data-refs %T and %T\n",
585 DR_REF (dra), DR_REF (drb));
586 }
587
588 return opt_result::success ();
589 }
590
591 /* Function vect_analyze_data_ref_dependences.
592
593 Examine all the data references in the loop, and make sure there do not
594 exist any data dependences between them. Set *MAX_VF according to
595 the maximum vectorization factor the data dependences allow. */
596
597 opt_result
598 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo,
599 unsigned int *max_vf)
600 {
601 unsigned int i;
602 struct data_dependence_relation *ddr;
603
604 DUMP_VECT_SCOPE ("vect_analyze_data_ref_dependences");
605
606 if (!LOOP_VINFO_DDRS (loop_vinfo).exists ())
607 {
608 LOOP_VINFO_DDRS (loop_vinfo)
609 .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
610 * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
611 /* We need read-read dependences to compute
612 STMT_VINFO_SAME_ALIGN_REFS. */
613 bool res = compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
614 &LOOP_VINFO_DDRS (loop_vinfo),
615 LOOP_VINFO_LOOP_NEST (loop_vinfo),
616 true);
617 gcc_assert (res);
618 }
619
620 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
621
622 /* For epilogues we either have no aliases or alias versioning
623 was applied to original loop. Therefore we may just get max_vf
624 using VF of original loop. */
625 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
626 *max_vf = LOOP_VINFO_ORIG_MAX_VECT_FACTOR (loop_vinfo);
627 else
628 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
629 {
630 opt_result res
631 = vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf);
632 if (!res)
633 return res;
634 }
635
636 return opt_result::success ();
637 }
638
639
640 /* Function vect_slp_analyze_data_ref_dependence.
641
642 Return TRUE if there (might) exist a dependence between a memory-reference
643 DRA and a memory-reference DRB for VINFO. Return FALSE if the accesses
644 are provably independent, are both reads, or belong to the same
645 interleaving chain. */
646
647 static bool
648 vect_slp_analyze_data_ref_dependence (vec_info *vinfo,
649 struct data_dependence_relation *ddr)
650 {
651 struct data_reference *dra = DDR_A (ddr);
652 struct data_reference *drb = DDR_B (ddr);
653 dr_vec_info *dr_info_a = vinfo->lookup_dr (dra);
654 dr_vec_info *dr_info_b = vinfo->lookup_dr (drb);
655
656 /* We need to check dependences of statements marked as unvectorizable
657 as well, they still can prohibit vectorization. */
658
659 /* Independent data accesses. */
660 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
661 return false;
662
663 if (dra == drb)
664 return false;
665
666 /* Read-read is OK. */
667 if (DR_IS_READ (dra) && DR_IS_READ (drb))
668 return false;
669
670 /* If dra and drb are part of the same interleaving chain consider
671 them independent. */
672 if (STMT_VINFO_GROUPED_ACCESS (dr_info_a->stmt)
673 && (DR_GROUP_FIRST_ELEMENT (dr_info_a->stmt)
674 == DR_GROUP_FIRST_ELEMENT (dr_info_b->stmt)))
675 return false;
676
677 /* Unknown data dependence. */
678 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
679 {
680 if (dump_enabled_p ())
681 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
682 "can't determine dependence between %T and %T\n",
683 DR_REF (dra), DR_REF (drb));
684 }
685 else if (dump_enabled_p ())
686 dump_printf_loc (MSG_NOTE, vect_location,
687 "determined dependence between %T and %T\n",
688 DR_REF (dra), DR_REF (drb));
689
690 return true;
691 }
692
693
694 /* Analyze dependences involved in the transform of SLP NODE. STORES
695 contain the vector of scalar stores of this instance if we are
696 disambiguating the loads. */
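/* For example, when a load in NODE has to be sunk past an intervening
   statement for which no single data reference could be recorded (say an
   aggregate assignment), the walk below falls back to the alias oracle
   to prove that the intervening statement does not clobber the loaded
   memory.  */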
697
698 static bool
699 vect_slp_analyze_node_dependences (slp_instance instance, slp_tree node,
700 vec<stmt_vec_info> stores,
701 stmt_vec_info last_store_info)
702 {
703 /* This walks over all stmts involved in the SLP load/store done
704 in NODE verifying we can sink them up to the last stmt in the
705 group. */
706 stmt_vec_info last_access_info = vect_find_last_scalar_stmt_in_slp (node);
707 vec_info *vinfo = last_access_info->vinfo;
708 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
709 {
710 stmt_vec_info access_info = SLP_TREE_SCALAR_STMTS (node)[k];
711 if (access_info == last_access_info)
712 continue;
713 data_reference *dr_a = STMT_VINFO_DATA_REF (access_info);
714 ao_ref ref;
715 bool ref_initialized_p = false;
716 for (gimple_stmt_iterator gsi = gsi_for_stmt (access_info->stmt);
717 gsi_stmt (gsi) != last_access_info->stmt; gsi_next (&gsi))
718 {
719 gimple *stmt = gsi_stmt (gsi);
720 if (! gimple_vuse (stmt)
721 || (DR_IS_READ (dr_a) && ! gimple_vdef (stmt)))
722 continue;
723
724 /* If we couldn't record a (single) data reference for this
725 stmt we have to resort to the alias oracle. */
726 stmt_vec_info stmt_info = vinfo->lookup_stmt (stmt);
727 data_reference *dr_b = STMT_VINFO_DATA_REF (stmt_info);
728 if (!dr_b)
729 {
730 /* We are moving a store or sinking a load - this means
731 we cannot use TBAA for disambiguation. */
732 if (!ref_initialized_p)
733 ao_ref_init (&ref, DR_REF (dr_a));
734 if (stmt_may_clobber_ref_p_1 (stmt, &ref, false)
735 || ref_maybe_used_by_stmt_p (stmt, &ref, false))
736 return false;
737 continue;
738 }
739
740 bool dependent = false;
741 /* If we run into a store of this same instance (we've just
742 marked those) then delay dependence checking until we run
743 into the last store because this is where it will have
744 been sunk to (and we verify if we can do that as well). */
745 if (gimple_visited_p (stmt))
746 {
747 if (stmt_info != last_store_info)
748 continue;
749 unsigned i;
750 stmt_vec_info store_info;
751 FOR_EACH_VEC_ELT (stores, i, store_info)
752 {
753 data_reference *store_dr = STMT_VINFO_DATA_REF (store_info);
754 ddr_p ddr = initialize_data_dependence_relation
755 (dr_a, store_dr, vNULL);
756 dependent
757 = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
758 free_dependence_relation (ddr);
759 if (dependent)
760 break;
761 }
762 }
763 else
764 {
765 ddr_p ddr = initialize_data_dependence_relation (dr_a,
766 dr_b, vNULL);
767 dependent = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
768 free_dependence_relation (ddr);
769 }
770 if (dependent)
771 return false;
772 }
773 }
774 return true;
775 }
776
777
778 /* Function vect_slp_analyze_instance_dependence.
779
780 Examine all the data references in the SLP instance INSTANCE, and make
781 sure there do not exist any data dependences that would prevent the
782 loads and stores from being sunk to their vectorized insert locations. */
783
784 bool
785 vect_slp_analyze_instance_dependence (slp_instance instance)
786 {
787 DUMP_VECT_SCOPE ("vect_slp_analyze_instance_dependence");
788
789 /* The stores of this instance are at the root of the SLP tree. */
790 slp_tree store = SLP_INSTANCE_TREE (instance);
791 if (! STMT_VINFO_DATA_REF (SLP_TREE_SCALAR_STMTS (store)[0]))
792 store = NULL;
793
794 /* Verify we can sink stores to the vectorized stmt insert location. */
795 stmt_vec_info last_store_info = NULL;
796 if (store)
797 {
798 if (! vect_slp_analyze_node_dependences (instance, store, vNULL, NULL))
799 return false;
800
801 /* Mark stores in this instance and remember the last one. */
802 last_store_info = vect_find_last_scalar_stmt_in_slp (store);
803 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
804 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, true);
805 }
806
807 bool res = true;
808
809 /* Verify we can sink loads to the vectorized stmt insert location,
810 special-casing stores of this instance. */
811 slp_tree load;
812 unsigned int i;
813 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load)
814 if (! vect_slp_analyze_node_dependences (instance, load,
815 store
816 ? SLP_TREE_SCALAR_STMTS (store)
817 : vNULL, last_store_info))
818 {
819 res = false;
820 break;
821 }
822
823 /* Unset the visited flag. */
824 if (store)
825 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
826 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, false);
827
828 return res;
829 }
830
831 /* Record the base alignment guarantee given by DRB, which occurs
832 in STMT_INFO. */
833
834 static void
835 vect_record_base_alignment (stmt_vec_info stmt_info,
836 innermost_loop_behavior *drb)
837 {
838 vec_info *vinfo = stmt_info->vinfo;
839 bool existed;
840 innermost_loop_behavior *&entry
841 = vinfo->base_alignments.get_or_insert (drb->base_address, &existed);
842 if (!existed || entry->base_alignment < drb->base_alignment)
843 {
844 entry = drb;
845 if (dump_enabled_p ())
846 dump_printf_loc (MSG_NOTE, vect_location,
847 "recording new base alignment for %T\n"
848 " alignment: %d\n"
849 " misalignment: %d\n"
850 " based on: %G",
851 drb->base_address,
852 drb->base_alignment,
853 drb->base_misalignment,
854 stmt_info->stmt);
855 }
856 }
857
858 /* If the region we're going to vectorize is reached, all unconditional
859 data references occur at least once. We can therefore pool the base
860 alignment guarantees from each unconditional reference. Do this by
861 going through all the data references in VINFO and checking whether
862 the containing statement makes the reference unconditionally. If so,
863 record the alignment of the base address in VINFO so that it can be
864 used for all other references with the same base. */
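/* For example, if a loop unconditionally loads p->a but only
   conditionally loads p->b, the alignment guarantee derived from the
   unconditional access is recorded for the common base address and can
   later be reused when computing the alignment of the access to p->b.  */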
865
866 void
867 vect_record_base_alignments (vec_info *vinfo)
868 {
869 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
870 class loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
871 data_reference *dr;
872 unsigned int i;
873 FOR_EACH_VEC_ELT (vinfo->shared->datarefs, i, dr)
874 {
875 dr_vec_info *dr_info = vinfo->lookup_dr (dr);
876 stmt_vec_info stmt_info = dr_info->stmt;
877 if (!DR_IS_CONDITIONAL_IN_STMT (dr)
878 && STMT_VINFO_VECTORIZABLE (stmt_info)
879 && !STMT_VINFO_GATHER_SCATTER_P (stmt_info))
880 {
881 vect_record_base_alignment (stmt_info, &DR_INNERMOST (dr));
882
883 /* If DR is nested in the loop that is being vectorized, we can also
884 record the alignment of the base wrt the outer loop. */
885 if (loop && nested_in_vect_loop_p (loop, stmt_info))
886 vect_record_base_alignment
887 (stmt_info, &STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info));
888 }
889 }
890 }
891
892 /* Return the target alignment for the vectorized form of DR_INFO. */
893
894 static poly_uint64
895 vect_calculate_target_alignment (dr_vec_info *dr_info)
896 {
897 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
898 return targetm.vectorize.preferred_vector_alignment (vectype);
899 }
900
901 /* Function vect_compute_data_ref_alignment
902
903 Compute the misalignment of the data reference DR_INFO.
904
905 Output:
906 1. DR_MISALIGNMENT (DR_INFO) is defined.
907
908 FOR NOW: No analysis is actually performed. Misalignment is calculated
909 only for trivial cases. TODO. */
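/* For example, with a 16-byte preferred vector alignment, a base address
   known to be 16-byte aligned and a constant offset (DR_INIT) of 4, the
   access is misaligned by 4 bytes and DR_MISALIGNMENT is set to 4.  */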
910
911 static void
912 vect_compute_data_ref_alignment (dr_vec_info *dr_info)
913 {
914 stmt_vec_info stmt_info = dr_info->stmt;
915 vec_base_alignments *base_alignments = &stmt_info->vinfo->base_alignments;
916 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
917 class loop *loop = NULL;
918 tree ref = DR_REF (dr_info->dr);
919 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
920
921 if (dump_enabled_p ())
922 dump_printf_loc (MSG_NOTE, vect_location,
923 "vect_compute_data_ref_alignment:\n");
924
925 if (loop_vinfo)
926 loop = LOOP_VINFO_LOOP (loop_vinfo);
927
928 /* Initialize misalignment to unknown. */
929 SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN);
930
931 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
932 return;
933
934 innermost_loop_behavior *drb = vect_dr_behavior (dr_info);
935 bool step_preserves_misalignment_p;
936
937 poly_uint64 vector_alignment
938 = exact_div (vect_calculate_target_alignment (dr_info), BITS_PER_UNIT);
939 DR_TARGET_ALIGNMENT (dr_info) = vector_alignment;
940
941 unsigned HOST_WIDE_INT vect_align_c;
942 if (!vector_alignment.is_constant (&vect_align_c))
943 return;
944
945 /* No step for BB vectorization. */
946 if (!loop)
947 {
948 gcc_assert (integer_zerop (drb->step));
949 step_preserves_misalignment_p = true;
950 }
951
952 /* In case the dataref is in an inner-loop of the loop that is being
953 vectorized (LOOP), we use the base and misalignment information
954 relative to the outer-loop (LOOP). This is ok only if the misalignment
955 stays the same throughout the execution of the inner-loop, which is why
956 we have to check that the stride of the dataref in the inner-loop is a
957 multiple of the vector alignment. */
958 else if (nested_in_vect_loop_p (loop, stmt_info))
959 {
960 step_preserves_misalignment_p
961 = (DR_STEP_ALIGNMENT (dr_info->dr) % vect_align_c) == 0;
962
963 if (dump_enabled_p ())
964 {
965 if (step_preserves_misalignment_p)
966 dump_printf_loc (MSG_NOTE, vect_location,
967 "inner step divides the vector alignment.\n");
968 else
969 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
970 "inner step doesn't divide the vector"
971 " alignment.\n");
972 }
973 }
974
975 /* Similarly we can only use base and misalignment information relative to
976 an innermost loop if the misalignment stays the same throughout the
977 execution of the loop. As above, this is the case if the stride of
978 the dataref times the vectorization factor is a multiple of the alignment. */
979 else
980 {
981 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
982 step_preserves_misalignment_p
983 = multiple_p (DR_STEP_ALIGNMENT (dr_info->dr) * vf, vect_align_c);
984
985 if (!step_preserves_misalignment_p && dump_enabled_p ())
986 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
987 "step doesn't divide the vector alignment.\n");
988 }
989
990 unsigned int base_alignment = drb->base_alignment;
991 unsigned int base_misalignment = drb->base_misalignment;
992
993 /* Calculate the maximum of the pooled base address alignment and the
994 alignment that we can compute for DR itself. */
995 innermost_loop_behavior **entry = base_alignments->get (drb->base_address);
996 if (entry && base_alignment < (*entry)->base_alignment)
997 {
998 base_alignment = (*entry)->base_alignment;
999 base_misalignment = (*entry)->base_misalignment;
1000 }
1001
1002 if (drb->offset_alignment < vect_align_c
1003 || !step_preserves_misalignment_p
1004 /* We need to know whether the step wrt the vectorized loop is
1005 negative when computing the starting misalignment below. */
1006 || TREE_CODE (drb->step) != INTEGER_CST)
1007 {
1008 if (dump_enabled_p ())
1009 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1010 "Unknown alignment for access: %T\n", ref);
1011 return;
1012 }
1013
1014 if (base_alignment < vect_align_c)
1015 {
1016 unsigned int max_alignment;
1017 tree base = get_base_for_alignment (drb->base_address, &max_alignment);
1018 if (max_alignment < vect_align_c
1019 || !vect_can_force_dr_alignment_p (base,
1020 vect_align_c * BITS_PER_UNIT))
1021 {
1022 if (dump_enabled_p ())
1023 dump_printf_loc (MSG_NOTE, vect_location,
1024 "can't force alignment of ref: %T\n", ref);
1025 return;
1026 }
1027
1028 /* Force the alignment of the decl.
1029 NOTE: This is the only change to the code we make during
1030 the analysis phase, before deciding to vectorize the loop. */
1031 if (dump_enabled_p ())
1032 dump_printf_loc (MSG_NOTE, vect_location,
1033 "force alignment of %T\n", ref);
1034
1035 dr_info->base_decl = base;
1036 dr_info->base_misaligned = true;
1037 base_misalignment = 0;
1038 }
1039 poly_int64 misalignment
1040 = base_misalignment + wi::to_poly_offset (drb->init).force_shwi ();
1041
1042 /* If this is a backward running DR then the first access in the larger
1043 vectype actually is N-1 elements before the address in the DR.
1044 Adjust misalign accordingly. */
1045 if (tree_int_cst_sgn (drb->step) < 0)
1046 /* PLUS because STEP is negative. */
1047 misalignment += ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
1048 * TREE_INT_CST_LOW (drb->step));
1049
1050 unsigned int const_misalignment;
1051 if (!known_misalignment (misalignment, vect_align_c, &const_misalignment))
1052 {
1053 if (dump_enabled_p ())
1054 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1055 "Non-constant misalignment for access: %T\n", ref);
1056 return;
1057 }
1058
1059 SET_DR_MISALIGNMENT (dr_info, const_misalignment);
1060
1061 if (dump_enabled_p ())
1062 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1063 "misalign = %d bytes of ref %T\n",
1064 DR_MISALIGNMENT (dr_info), ref);
1065
1066 return;
1067 }
1068
1069 /* Function vect_update_misalignment_for_peel.
1070 Sets DR_INFO's misalignment
1071 - to 0 if it has the same alignment as DR_PEEL_INFO,
1072 - to the misalignment computed using NPEEL if DR_INFO's misalignment is known,
1073 - to -1 (unknown) otherwise.
1074
1075 DR_INFO - the data reference whose misalignment is to be adjusted.
1076 DR_PEEL_INFO - the data reference whose misalignment is being made
1077 zero in the vector loop by the peel.
1078 NPEEL - the number of iterations in the peel loop if the misalignment
1079 of DR_PEEL_INFO is known at compile time. */
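/* For example, with a 16-byte target alignment, a 4-byte step and a known
   misalignment of 8 bytes, peeling NPEEL = 2 iterations gives a new
   misalignment of (8 + 2 * 4) & 15 == 0, i.e. the access becomes
   aligned.  */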
1080
1081 static void
1082 vect_update_misalignment_for_peel (dr_vec_info *dr_info,
1083 dr_vec_info *dr_peel_info, int npeel)
1084 {
1085 unsigned int i;
1086 vec<dr_p> same_aligned_drs;
1087 struct data_reference *current_dr;
1088 stmt_vec_info peel_stmt_info = dr_peel_info->stmt;
1089
1090 /* It can be assumed that if dr_info has the same alignment as dr_peel,
1091 it is aligned in the vector loop. */
1092 same_aligned_drs = STMT_VINFO_SAME_ALIGN_REFS (peel_stmt_info);
1093 FOR_EACH_VEC_ELT (same_aligned_drs, i, current_dr)
1094 {
1095 if (current_dr != dr_info->dr)
1096 continue;
1097 gcc_assert (!known_alignment_for_access_p (dr_info)
1098 || !known_alignment_for_access_p (dr_peel_info)
1099 || (DR_MISALIGNMENT (dr_info)
1100 == DR_MISALIGNMENT (dr_peel_info)));
1101 SET_DR_MISALIGNMENT (dr_info, 0);
1102 return;
1103 }
1104
1105 unsigned HOST_WIDE_INT alignment;
1106 if (DR_TARGET_ALIGNMENT (dr_info).is_constant (&alignment)
1107 && known_alignment_for_access_p (dr_info)
1108 && known_alignment_for_access_p (dr_peel_info))
1109 {
1110 int misal = DR_MISALIGNMENT (dr_info);
1111 misal += npeel * TREE_INT_CST_LOW (DR_STEP (dr_info->dr));
1112 misal &= alignment - 1;
1113 SET_DR_MISALIGNMENT (dr_info, misal);
1114 return;
1115 }
1116
1117 if (dump_enabled_p ())
1118 dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment " \
1119 "to unknown (-1).\n");
1120 SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN);
1121 }
1122
1123
1124 /* Function verify_data_ref_alignment
1125
1126 Return TRUE if DR_INFO can be handled with respect to alignment. */
1127
1128 static opt_result
1129 verify_data_ref_alignment (dr_vec_info *dr_info)
1130 {
1131 enum dr_alignment_support supportable_dr_alignment
1132 = vect_supportable_dr_alignment (dr_info, false);
1133 if (!supportable_dr_alignment)
1134 return opt_result::failure_at
1135 (dr_info->stmt->stmt,
1136 DR_IS_READ (dr_info->dr)
1137 ? "not vectorized: unsupported unaligned load: %T\n"
1138 : "not vectorized: unsupported unaligned store: %T\n",
1139 DR_REF (dr_info->dr));
1140
1141 if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
1142 dump_printf_loc (MSG_NOTE, vect_location,
1143 "Vectorizing an unaligned access.\n");
1144
1145 return opt_result::success ();
1146 }
1147
1148 /* Function vect_verify_datarefs_alignment
1149
1150 Return TRUE if all data references in the loop can be
1151 handled with respect to alignment. */
1152
1153 opt_result
1154 vect_verify_datarefs_alignment (loop_vec_info vinfo)
1155 {
1156 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
1157 struct data_reference *dr;
1158 unsigned int i;
1159
1160 FOR_EACH_VEC_ELT (datarefs, i, dr)
1161 {
1162 dr_vec_info *dr_info = vinfo->lookup_dr (dr);
1163 stmt_vec_info stmt_info = dr_info->stmt;
1164
1165 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1166 continue;
1167
1168 /* For interleaving, only the alignment of the first access matters. */
1169 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1170 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1171 continue;
1172
1173 /* Strided accesses perform only component accesses, alignment is
1174 irrelevant for them. */
1175 if (STMT_VINFO_STRIDED_P (stmt_info)
1176 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1177 continue;
1178
1179 opt_result res = verify_data_ref_alignment (dr_info);
1180 if (!res)
1181 return res;
1182 }
1183
1184 return opt_result::success ();
1185 }
1186
1187 /* Given a memory reference EXP, return whether its alignment is less
1188 than its size. */
1189
1190 static bool
1191 not_size_aligned (tree exp)
1192 {
1193 if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
1194 return true;
1195
1196 return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
1197 > get_object_alignment (exp));
1198 }
1199
1200 /* Function vector_alignment_reachable_p
1201
1202 Return true if vector alignment for DR_INFO is reachable by peeling
1203 a few loop iterations. Return false otherwise. */
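/* For example, a 4-byte element whose known misalignment is 8 bytes can
   reach a 16-byte boundary after peeling two scalar iterations, whereas a
   misalignment of 2 bytes is not a multiple of the element size and can
   never be removed by peeling whole iterations.  */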
1204
1205 static bool
1206 vector_alignment_reachable_p (dr_vec_info *dr_info)
1207 {
1208 stmt_vec_info stmt_info = dr_info->stmt;
1209 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1210
1211 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1212 {
1213 /* For interleaved access we peel only if the number of iterations in
1214 the prolog loop ({VF - misalignment}) is a multiple of the
1215 number of interleaved accesses. */
1216 int elem_size, mis_in_elements;
1217
1218 /* FORNOW: handle only known alignment. */
1219 if (!known_alignment_for_access_p (dr_info))
1220 return false;
1221
1222 poly_uint64 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1223 poly_uint64 vector_size = GET_MODE_SIZE (TYPE_MODE (vectype));
1224 elem_size = vector_element_size (vector_size, nelements);
1225 mis_in_elements = DR_MISALIGNMENT (dr_info) / elem_size;
1226
1227 if (!multiple_p (nelements - mis_in_elements, DR_GROUP_SIZE (stmt_info)))
1228 return false;
1229 }
1230
1231 /* If misalignment is known at the compile time then allow peeling
1232 only if natural alignment is reachable through peeling. */
1233 if (known_alignment_for_access_p (dr_info) && !aligned_access_p (dr_info))
1234 {
1235 HOST_WIDE_INT elmsize =
1236 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1237 if (dump_enabled_p ())
1238 {
1239 dump_printf_loc (MSG_NOTE, vect_location,
1240 "data size = %wd. misalignment = %d.\n", elmsize,
1241 DR_MISALIGNMENT (dr_info));
1242 }
1243 if (DR_MISALIGNMENT (dr_info) % elmsize)
1244 {
1245 if (dump_enabled_p ())
1246 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1247 "data size does not divide the misalignment.\n");
1248 return false;
1249 }
1250 }
1251
1252 if (!known_alignment_for_access_p (dr_info))
1253 {
1254 tree type = TREE_TYPE (DR_REF (dr_info->dr));
1255 bool is_packed = not_size_aligned (DR_REF (dr_info->dr));
1256 if (dump_enabled_p ())
1257 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1258 "Unknown misalignment, %snaturally aligned\n",
1259 is_packed ? "not " : "");
1260 return targetm.vectorize.vector_alignment_reachable (type, is_packed);
1261 }
1262
1263 return true;
1264 }
1265
1266
1267 /* Calculate the cost of the memory access represented by DR_INFO. */
1268
1269 static void
1270 vect_get_data_access_cost (dr_vec_info *dr_info,
1271 unsigned int *inside_cost,
1272 unsigned int *outside_cost,
1273 stmt_vector_for_cost *body_cost_vec,
1274 stmt_vector_for_cost *prologue_cost_vec)
1275 {
1276 stmt_vec_info stmt_info = dr_info->stmt;
1277 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1278 int ncopies;
1279
1280 if (PURE_SLP_STMT (stmt_info))
1281 ncopies = 1;
1282 else
1283 ncopies = vect_get_num_copies (loop_vinfo, STMT_VINFO_VECTYPE (stmt_info));
1284
1285 if (DR_IS_READ (dr_info->dr))
1286 vect_get_load_cost (stmt_info, ncopies, true, inside_cost, outside_cost,
1287 prologue_cost_vec, body_cost_vec, false);
1288 else
1289 vect_get_store_cost (stmt_info, ncopies, inside_cost, body_cost_vec);
1290
1291 if (dump_enabled_p ())
1292 dump_printf_loc (MSG_NOTE, vect_location,
1293 "vect_get_data_access_cost: inside_cost = %d, "
1294 "outside_cost = %d.\n", *inside_cost, *outside_cost);
1295 }
1296
1297
1298 typedef struct _vect_peel_info
1299 {
1300 dr_vec_info *dr_info;
1301 int npeel;
1302 unsigned int count;
1303 } *vect_peel_info;
1304
1305 typedef struct _vect_peel_extended_info
1306 {
1307 struct _vect_peel_info peel_info;
1308 unsigned int inside_cost;
1309 unsigned int outside_cost;
1310 } *vect_peel_extended_info;
1311
1312
1313 /* Peeling hashtable helpers. */
1314
1315 struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
1316 {
1317 static inline hashval_t hash (const _vect_peel_info *);
1318 static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
1319 };
1320
1321 inline hashval_t
1322 peel_info_hasher::hash (const _vect_peel_info *peel_info)
1323 {
1324 return (hashval_t) peel_info->npeel;
1325 }
1326
1327 inline bool
1328 peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
1329 {
1330 return (a->npeel == b->npeel);
1331 }
1332
1333
1334 /* Insert DR_INFO into peeling hash table with NPEEL as key. */
1335
1336 static void
1337 vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
1338 loop_vec_info loop_vinfo, dr_vec_info *dr_info,
1339 int npeel)
1340 {
1341 struct _vect_peel_info elem, *slot;
1342 _vect_peel_info **new_slot;
1343 bool supportable_dr_alignment
1344 = vect_supportable_dr_alignment (dr_info, true);
1345
1346 elem.npeel = npeel;
1347 slot = peeling_htab->find (&elem);
1348 if (slot)
1349 slot->count++;
1350 else
1351 {
1352 slot = XNEW (struct _vect_peel_info);
1353 slot->npeel = npeel;
1354 slot->dr_info = dr_info;
1355 slot->count = 1;
1356 new_slot = peeling_htab->find_slot (slot, INSERT);
1357 *new_slot = slot;
1358 }
1359
1360 if (!supportable_dr_alignment
1361 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1362 slot->count += VECT_MAX_COST;
1363 }
1364
1365
1366 /* Traverse the peeling hash table to find the peeling option that aligns
1367 the maximum number of data accesses. */
1368
1369 int
1370 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1371 _vect_peel_extended_info *max)
1372 {
1373 vect_peel_info elem = *slot;
1374
1375 if (elem->count > max->peel_info.count
1376 || (elem->count == max->peel_info.count
1377 && max->peel_info.npeel > elem->npeel))
1378 {
1379 max->peel_info.npeel = elem->npeel;
1380 max->peel_info.count = elem->count;
1381 max->peel_info.dr_info = elem->dr_info;
1382 }
1383
1384 return 1;
1385 }
1386
1387 /* Get the costs of peeling NPEEL iterations for LOOP_VINFO, checking
1388 data access costs for all data refs. If UNKNOWN_MISALIGNMENT is true,
1389 we assume DR0_INFO's misalignment will be zero after peeling. */
1390
1391 static void
1392 vect_get_peeling_costs_all_drs (loop_vec_info loop_vinfo,
1393 dr_vec_info *dr0_info,
1394 unsigned int *inside_cost,
1395 unsigned int *outside_cost,
1396 stmt_vector_for_cost *body_cost_vec,
1397 stmt_vector_for_cost *prologue_cost_vec,
1398 unsigned int npeel,
1399 bool unknown_misalignment)
1400 {
1401 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1402 unsigned i;
1403 data_reference *dr;
1404
1405 FOR_EACH_VEC_ELT (datarefs, i, dr)
1406 {
1407 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1408 stmt_vec_info stmt_info = dr_info->stmt;
1409 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1410 continue;
1411
1412 /* For interleaving, only the alignment of the first access
1413 matters. */
1414 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1415 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1416 continue;
1417
1418 /* Strided accesses perform only component accesses, alignment is
1419 irrelevant for them. */
1420 if (STMT_VINFO_STRIDED_P (stmt_info)
1421 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1422 continue;
1423
1424 int save_misalignment;
1425 save_misalignment = DR_MISALIGNMENT (dr_info);
1426 if (npeel == 0)
1427 ;
1428 else if (unknown_misalignment && dr_info == dr0_info)
1429 SET_DR_MISALIGNMENT (dr_info, 0);
1430 else
1431 vect_update_misalignment_for_peel (dr_info, dr0_info, npeel);
1432 vect_get_data_access_cost (dr_info, inside_cost, outside_cost,
1433 body_cost_vec, prologue_cost_vec);
1434 SET_DR_MISALIGNMENT (dr_info, save_misalignment);
1435 }
1436 }
1437
1438 /* Traverse the peeling hash table and calculate the cost of each peeling
1439 option. Find the one with the lowest cost. */
1440
1441 int
1442 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1443 _vect_peel_extended_info *min)
1444 {
1445 vect_peel_info elem = *slot;
1446 int dummy;
1447 unsigned int inside_cost = 0, outside_cost = 0;
1448 stmt_vec_info stmt_info = elem->dr_info->stmt;
1449 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1450 stmt_vector_for_cost prologue_cost_vec, body_cost_vec,
1451 epilogue_cost_vec;
1452
1453 prologue_cost_vec.create (2);
1454 body_cost_vec.create (2);
1455 epilogue_cost_vec.create (2);
1456
1457 vect_get_peeling_costs_all_drs (loop_vinfo, elem->dr_info, &inside_cost,
1458 &outside_cost, &body_cost_vec,
1459 &prologue_cost_vec, elem->npeel, false);
1460
1461 body_cost_vec.release ();
1462
1463 outside_cost += vect_get_known_peeling_cost
1464 (loop_vinfo, elem->npeel, &dummy,
1465 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1466 &prologue_cost_vec, &epilogue_cost_vec);
1467
1468 /* Prologue and epilogue costs are added to the target model later.
1469 These costs depend only on the scalar iteration cost, the
1470 number of peeling iterations finally chosen, and the number of
1471 misaligned statements. So discard the information found here. */
1472 prologue_cost_vec.release ();
1473 epilogue_cost_vec.release ();
1474
1475 if (inside_cost < min->inside_cost
1476 || (inside_cost == min->inside_cost
1477 && outside_cost < min->outside_cost))
1478 {
1479 min->inside_cost = inside_cost;
1480 min->outside_cost = outside_cost;
1481 min->peel_info.dr_info = elem->dr_info;
1482 min->peel_info.npeel = elem->npeel;
1483 min->peel_info.count = elem->count;
1484 }
1485
1486 return 1;
1487 }
1488
1489
1490 /* Choose best peeling option by traversing peeling hash table and either
1491 choosing an option with the lowest cost (if cost model is enabled) or the
1492 option that aligns as many accesses as possible. */
1493
1494 static struct _vect_peel_extended_info
1495 vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
1496 loop_vec_info loop_vinfo)
1497 {
1498 struct _vect_peel_extended_info res;
1499
1500 res.peel_info.dr_info = NULL;
1501
1502 if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1503 {
1504 res.inside_cost = INT_MAX;
1505 res.outside_cost = INT_MAX;
1506 peeling_htab->traverse <_vect_peel_extended_info *,
1507 vect_peeling_hash_get_lowest_cost> (&res);
1508 }
1509 else
1510 {
1511 res.peel_info.count = 0;
1512 peeling_htab->traverse <_vect_peel_extended_info *,
1513 vect_peeling_hash_get_most_frequent> (&res);
1514 res.inside_cost = 0;
1515 res.outside_cost = 0;
1516 }
1517
1518 return res;
1519 }
1520
1521 /* Return true if the new peeling NPEEL is supported. */
1522
1523 static bool
1524 vect_peeling_supportable (loop_vec_info loop_vinfo, dr_vec_info *dr0_info,
1525 unsigned npeel)
1526 {
1527 unsigned i;
1528 struct data_reference *dr = NULL;
1529 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1530 enum dr_alignment_support supportable_dr_alignment;
1531
1532 /* Ensure that all data refs can be vectorized after the peel. */
1533 FOR_EACH_VEC_ELT (datarefs, i, dr)
1534 {
1535 int save_misalignment;
1536
1537 if (dr == dr0_info->dr)
1538 continue;
1539
1540 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1541 stmt_vec_info stmt_info = dr_info->stmt;
1542 /* For interleaving, only the alignment of the first access
1543 matters. */
1544 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1545 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1546 continue;
1547
1548 /* Strided accesses perform only component accesses, alignment is
1549 irrelevant for them. */
1550 if (STMT_VINFO_STRIDED_P (stmt_info)
1551 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1552 continue;
1553
1554 save_misalignment = DR_MISALIGNMENT (dr_info);
1555 vect_update_misalignment_for_peel (dr_info, dr0_info, npeel);
1556 supportable_dr_alignment
1557 = vect_supportable_dr_alignment (dr_info, false);
1558 SET_DR_MISALIGNMENT (dr_info, save_misalignment);
1559
1560 if (!supportable_dr_alignment)
1561 return false;
1562 }
1563
1564 return true;
1565 }
1566
1567 /* Function vect_enhance_data_refs_alignment
1568
1569 This pass will use loop versioning and loop peeling in order to enhance
1570 the alignment of data references in the loop.
1571
1572 FOR NOW: we assume that whatever versioning/peeling takes place, only the
1573 original loop is to be vectorized. Any other loops that are created by
1574 the transformations performed in this pass - are not supposed to be
1575 vectorized. This restriction will be relaxed.
1576
1577 This pass will require a cost model to guide it whether to apply peeling
1578 or versioning or a combination of the two. For example, the scheme that
1579 Intel uses when given a loop with several memory accesses is as follows:
1580 choose one memory access ('p') whose alignment you want to force by doing
1581 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
1582 other accesses are not necessarily aligned, or (2) use loop versioning to
1583 generate one loop in which all accesses are aligned, and another loop in
1584 which only 'p' is necessarily aligned.
1585
1586 ("Automatic Intra-Register Vectorization for the Intel Architecture",
1587 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Xinmin Tian, International
1588 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1589
1590 Devising a cost model is the most critical aspect of this work. It will
1591 guide us on which access to peel for, whether to use loop versioning, how
1592 many versions to create, etc. The cost model will probably consist of
1593 generic considerations as well as target specific considerations (on
1594 powerpc for example, misaligned stores are more painful than misaligned
1595 loads).
1596
1597 Here are the general steps involved in alignment enhancements:
1598
1599 -- original loop, before alignment analysis:
1600 for (i=0; i<N; i++){
1601 x = q[i]; # DR_MISALIGNMENT(q) = unknown
1602 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1603 }
1604
1605 -- After vect_compute_data_refs_alignment:
1606 for (i=0; i<N; i++){
1607 x = q[i]; # DR_MISALIGNMENT(q) = 3
1608 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1609 }
1610
1611 -- Possibility 1: we do loop versioning:
1612 if (p is aligned) {
1613 for (i=0; i<N; i++){ # loop 1A
1614 x = q[i]; # DR_MISALIGNMENT(q) = 3
1615 p[i] = y; # DR_MISALIGNMENT(p) = 0
1616 }
1617 }
1618 else {
1619 for (i=0; i<N; i++){ # loop 1B
1620 x = q[i]; # DR_MISALIGNMENT(q) = 3
1621 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1622 }
1623 }
1624
1625 -- Possibility 2: we do loop peeling:
1626 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1627 x = q[i];
1628 p[i] = y;
1629 }
1630 for (i = 3; i < N; i++){ # loop 2A
1631 x = q[i]; # DR_MISALIGNMENT(q) = 0
1632 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1633 }
1634
1635 -- Possibility 3: combination of loop peeling and versioning:
1636 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1637 x = q[i];
1638 p[i] = y;
1639 }
1640 if (p is aligned) {
1641 for (i = 3; i<N; i++){ # loop 3A
1642 x = q[i]; # DR_MISALIGNMENT(q) = 0
1643 p[i] = y; # DR_MISALIGNMENT(p) = 0
1644 }
1645 }
1646 else {
1647 for (i = 3; i<N; i++){ # loop 3B
1648 x = q[i]; # DR_MISALIGNMENT(q) = 0
1649 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1650 }
1651 }
1652
1653 These loops are later passed to loop_transform to be vectorized. The
1654 vectorizer will use the alignment information to guide the transformation
1655 (whether to generate regular loads/stores, or with special handling for
1656 misalignment). */
1657
1658 opt_result
1659 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1660 {
1661 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1662 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1663 enum dr_alignment_support supportable_dr_alignment;
1664 dr_vec_info *first_store = NULL;
1665 dr_vec_info *dr0_info = NULL;
1666 struct data_reference *dr;
1667 unsigned int i, j;
1668 bool do_peeling = false;
1669 bool do_versioning = false;
1670 unsigned int npeel = 0;
1671 bool one_misalignment_known = false;
1672 bool one_misalignment_unknown = false;
1673 bool one_dr_unsupportable = false;
1674 dr_vec_info *unsupportable_dr_info = NULL;
1675 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1676 unsigned possible_npeel_number = 1;
1677 tree vectype;
1678 unsigned int mis, same_align_drs_max = 0;
1679 hash_table<peel_info_hasher> peeling_htab (1);
1680
1681 DUMP_VECT_SCOPE ("vect_enhance_data_refs_alignment");
1682
1683 /* Reset data so we can safely be called multiple times. */
1684 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1685 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
1686
1687 /* While cost model enhancements are expected in the future, the high level
1688 view of the code at this time is as follows:
1689
1690 A) If there is a misaligned access then see if peeling to align
1691 this access can make all data references satisfy
1692 vect_supportable_dr_alignment. If so, update data structures
1693 as needed and return true.
1694
1695 B) If peeling wasn't possible and there is a data reference with an
1696 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1697 then see if loop versioning checks can be used to make all data
1698 references satisfy vect_supportable_dr_alignment. If so, update
1699 data structures as needed and return true.
1700
1701 C) If neither peeling nor versioning were successful then return false if
1702 any data reference does not satisfy vect_supportable_dr_alignment.
1703
1704 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1705
1706 Note, Possibility 3 above (which is peeling and versioning together) is not
1707 being done at this time. */
1708
1709 /* (1) Peeling to force alignment. */
1710
1711 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1712 Considerations:
1713 + How many accesses will become aligned due to the peeling
1714 - How many accesses will become unaligned due to the peeling,
1715 and the cost of misaligned accesses.
1716 - The cost of peeling (the extra runtime checks, the increase
1717 in code size). */
1718
1719 FOR_EACH_VEC_ELT (datarefs, i, dr)
1720 {
1721 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1722 stmt_vec_info stmt_info = dr_info->stmt;
1723
1724 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1725 continue;
1726
1727 /* For interleaving, only the alignment of the first access
1728 matters. */
1729 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1730 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1731 continue;
1732
1733 /* For scatter-gather or invariant accesses there is nothing
1734 to enhance. */
1735 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
1736 || integer_zerop (DR_STEP (dr)))
1737 continue;
1738
1739 /* Strided accesses perform only component accesses; alignment is
1740 irrelevant for them. */
1741 if (STMT_VINFO_STRIDED_P (stmt_info)
1742 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1743 continue;
1744
1745 supportable_dr_alignment = vect_supportable_dr_alignment (dr_info, true);
1746 do_peeling = vector_alignment_reachable_p (dr_info);
1747 if (do_peeling)
1748 {
1749 if (known_alignment_for_access_p (dr_info))
1750 {
1751 unsigned int npeel_tmp = 0;
1752 bool negative = tree_int_cst_compare (DR_STEP (dr),
1753 size_zero_node) < 0;
1754
1755 vectype = STMT_VINFO_VECTYPE (stmt_info);
1756 /* If known_alignment_for_access_p then we have set
1757 DR_MISALIGNMENT, which is only done if we know it at compile
1758 time, so it is safe to assume the target alignment is
1759 constant. */
1760 unsigned int target_align =
1761 DR_TARGET_ALIGNMENT (dr_info).to_constant ();
1762 unsigned int dr_size = vect_get_scalar_dr_size (dr_info);
1763 mis = (negative
1764 ? DR_MISALIGNMENT (dr_info)
1765 : -DR_MISALIGNMENT (dr_info));
1766 if (DR_MISALIGNMENT (dr_info) != 0)
1767 npeel_tmp = (mis & (target_align - 1)) / dr_size;
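/* Illustrative example (numbers chosen for exposition only): with a
   16-byte target alignment, 4-byte scalar elements and a positive step,
   DR_MISALIGNMENT == 8 gives mis = -8, so (mis & 15) == 8 and
   npeel_tmp == 8 / 4 == 2, i.e. peeling two scalar iterations brings
   the access back to a 16-byte boundary (8 + 2 * 4 == 16).  */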
1768
1769 /* For multiple types, it is possible that the bigger type access
1770 will have more than one peeling option. E.g., a loop with two
1771 types: one of size (vector size / 4), and the other one of
1772 size (vector size / 8). The vectorization factor will be 8.
1773 If both accesses are misaligned by 3, the first one needs one
1774 scalar iteration to be aligned, and the second one needs 5.
1775 But the first one will also be aligned by peeling 5 scalar
1776 iterations, and in that case both accesses will be aligned.
1777 Hence, in addition to the immediate peeling amount, we also
1778 want to try adding full vector sizes, as long as we don't
1779 exceed the vectorization factor.
1780 We do this automatically for the cost model, since we
1781 calculate the cost of every peeling option. */
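/* As a sketch of the loop below, assuming a 16-byte target alignment
   and 4-byte elements (illustrative values), the candidate peel amounts
   recorded in the hash table are npeel_tmp, npeel_tmp + 4,
   npeel_tmp + 8, and so on, with one entry per vector's worth of scalar
   elements covered by the vectorization factor.  */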
1782 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1783 {
1784 poly_uint64 nscalars = (STMT_SLP_TYPE (stmt_info)
1785 ? vf * DR_GROUP_SIZE (stmt_info) : vf);
1786 possible_npeel_number
1787 = vect_get_num_vectors (nscalars, vectype);
1788
1789 /* NPEEL_TMP is 0 when there is no misalignment, but also
1790 allow peeling NELEMENTS. */
1791 if (DR_MISALIGNMENT (dr_info) == 0)
1792 possible_npeel_number++;
1793 }
1794
1795 /* Save info about DR in the hash table. Also include peeling
1796 amounts according to the explanation above. */
1797 for (j = 0; j < possible_npeel_number; j++)
1798 {
1799 vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
1800 dr_info, npeel_tmp);
1801 npeel_tmp += target_align / dr_size;
1802 }
1803
1804 one_misalignment_known = true;
1805 }
1806 else
1807 {
1808 /* If we don't know any misalignment values, we prefer
1809 peeling for the data-ref that has the maximum number of
1810 data-refs with the same alignment, unless the target
1811 prefers to align stores over loads. */
1812 unsigned same_align_drs
1813 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1814 if (!dr0_info
1815 || same_align_drs_max < same_align_drs)
1816 {
1817 same_align_drs_max = same_align_drs;
1818 dr0_info = dr_info;
1819 }
1820 /* For data-refs with the same number of related
1821 accesses prefer the one where the misalign
1822 computation will be invariant in the outermost loop. */
1823 else if (same_align_drs_max == same_align_drs)
1824 {
1825 class loop *ivloop0, *ivloop;
1826 ivloop0 = outermost_invariant_loop_for_expr
1827 (loop, DR_BASE_ADDRESS (dr0_info->dr));
1828 ivloop = outermost_invariant_loop_for_expr
1829 (loop, DR_BASE_ADDRESS (dr));
1830 if ((ivloop && !ivloop0)
1831 || (ivloop && ivloop0
1832 && flow_loop_nested_p (ivloop, ivloop0)))
1833 dr0_info = dr_info;
1834 }
1835
1836 one_misalignment_unknown = true;
1837
1838 /* Check for data refs with unsupportable alignment that
1839 can be peeled. */
1840 if (!supportable_dr_alignment)
1841 {
1842 one_dr_unsupportable = true;
1843 unsupportable_dr_info = dr_info;
1844 }
1845
1846 if (!first_store && DR_IS_WRITE (dr))
1847 first_store = dr_info;
1848 }
1849 }
1850 else
1851 {
1852 if (!aligned_access_p (dr_info))
1853 {
1854 if (dump_enabled_p ())
1855 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1856 "vector alignment may not be reachable\n");
1857 break;
1858 }
1859 }
1860 }
1861
1862 /* Check if we can possibly peel the loop. */
1863 if (!vect_can_advance_ivs_p (loop_vinfo)
1864 || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))
1865 || loop->inner)
1866 do_peeling = false;
1867
1868 struct _vect_peel_extended_info peel_for_known_alignment;
1869 struct _vect_peel_extended_info peel_for_unknown_alignment;
1870 struct _vect_peel_extended_info best_peel;
1871
1872 peel_for_unknown_alignment.inside_cost = INT_MAX;
1873 peel_for_unknown_alignment.outside_cost = INT_MAX;
1874 peel_for_unknown_alignment.peel_info.count = 0;
1875
1876 if (do_peeling
1877 && one_misalignment_unknown)
1878 {
1879 /* Check whether the target prefers to align stores over loads, i.e.,
1880 whether misaligned stores are more expensive than misaligned loads
1881 (taking drs with the same alignment into account). */
1882 unsigned int load_inside_cost = 0;
1883 unsigned int load_outside_cost = 0;
1884 unsigned int store_inside_cost = 0;
1885 unsigned int store_outside_cost = 0;
1886 unsigned int estimated_npeels = vect_vf_for_cost (loop_vinfo) / 2;
1887
1888 stmt_vector_for_cost dummy;
1889 dummy.create (2);
1890 vect_get_peeling_costs_all_drs (loop_vinfo, dr0_info,
1891 &load_inside_cost,
1892 &load_outside_cost,
1893 &dummy, &dummy, estimated_npeels, true);
1894 dummy.release ();
1895
1896 if (first_store)
1897 {
1898 dummy.create (2);
1899 vect_get_peeling_costs_all_drs (loop_vinfo, first_store,
1900 &store_inside_cost,
1901 &store_outside_cost,
1902 &dummy, &dummy,
1903 estimated_npeels, true);
1904 dummy.release ();
1905 }
1906 else
1907 {
1908 store_inside_cost = INT_MAX;
1909 store_outside_cost = INT_MAX;
1910 }
1911
1912 if (load_inside_cost > store_inside_cost
1913 || (load_inside_cost == store_inside_cost
1914 && load_outside_cost > store_outside_cost))
1915 {
1916 dr0_info = first_store;
1917 peel_for_unknown_alignment.inside_cost = store_inside_cost;
1918 peel_for_unknown_alignment.outside_cost = store_outside_cost;
1919 }
1920 else
1921 {
1922 peel_for_unknown_alignment.inside_cost = load_inside_cost;
1923 peel_for_unknown_alignment.outside_cost = load_outside_cost;
1924 }
1925
1926 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1927 prologue_cost_vec.create (2);
1928 epilogue_cost_vec.create (2);
1929
1930 int dummy2;
1931 peel_for_unknown_alignment.outside_cost += vect_get_known_peeling_cost
1932 (loop_vinfo, estimated_npeels, &dummy2,
1933 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1934 &prologue_cost_vec, &epilogue_cost_vec);
1935
1936 prologue_cost_vec.release ();
1937 epilogue_cost_vec.release ();
1938
1939 peel_for_unknown_alignment.peel_info.count = 1
1940 + STMT_VINFO_SAME_ALIGN_REFS (dr0_info->stmt).length ();
1941 }
1942
1943 peel_for_unknown_alignment.peel_info.npeel = 0;
1944 peel_for_unknown_alignment.peel_info.dr_info = dr0_info;
1945
1946 best_peel = peel_for_unknown_alignment;
1947
1948 peel_for_known_alignment.inside_cost = INT_MAX;
1949 peel_for_known_alignment.outside_cost = INT_MAX;
1950 peel_for_known_alignment.peel_info.count = 0;
1951 peel_for_known_alignment.peel_info.dr_info = NULL;
1952
1953 if (do_peeling && one_misalignment_known)
1954 {
1955 /* Peeling is possible, but no data access strictly requires alignment
1956 in order to be supported. So we try to choose the best possible
1957 peeling from the hash table. */
1958 peel_for_known_alignment = vect_peeling_hash_choose_best_peeling
1959 (&peeling_htab, loop_vinfo);
1960 }
1961
1962 /* Compare costs of peeling for known and unknown alignment. */
1963 if (peel_for_known_alignment.peel_info.dr_info != NULL
1964 && peel_for_unknown_alignment.inside_cost
1965 >= peel_for_known_alignment.inside_cost)
1966 {
1967 best_peel = peel_for_known_alignment;
1968
1969 /* If the best peeling for known alignment has NPEEL == 0, perform no
1970 peeling at all unless there is an unsupportable dr that we can
1971 align. */
1972 if (best_peel.peel_info.npeel == 0 && !one_dr_unsupportable)
1973 do_peeling = false;
1974 }
1975
1976 /* If there is an unsupportable data ref, prefer it over all choices so
1977 far, since we would have to discard any chosen peeling unless it
1978 happened to align the unsupportable data ref as well. */
1979 if (one_dr_unsupportable)
1980 dr0_info = unsupportable_dr_info;
1981 else if (do_peeling)
1982 {
1983 /* Calculate the penalty for no peeling, i.e. leaving everything as-is.
1984 TODO: Use nopeel_outside_cost or get rid of it? */
1985 unsigned nopeel_inside_cost = 0;
1986 unsigned nopeel_outside_cost = 0;
1987
1988 stmt_vector_for_cost dummy;
1989 dummy.create (2);
1990 vect_get_peeling_costs_all_drs (loop_vinfo, NULL, &nopeel_inside_cost,
1991 &nopeel_outside_cost, &dummy, &dummy,
1992 0, false);
1993 dummy.release ();
1994
1995 /* Add epilogue costs. As we do not peel for alignment here, no prologue
1996 costs will be recorded. */
1997 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1998 prologue_cost_vec.create (2);
1999 epilogue_cost_vec.create (2);
2000
2001 int dummy2;
2002 nopeel_outside_cost += vect_get_known_peeling_cost
2003 (loop_vinfo, 0, &dummy2,
2004 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
2005 &prologue_cost_vec, &epilogue_cost_vec);
2006
2007 prologue_cost_vec.release ();
2008 epilogue_cost_vec.release ();
2009
2010 npeel = best_peel.peel_info.npeel;
2011 dr0_info = best_peel.peel_info.dr_info;
2012
2013 /* If not peeling is no more expensive than the best peeling we
2014 have found so far, don't perform any peeling. */
2015 if (nopeel_inside_cost <= best_peel.inside_cost)
2016 do_peeling = false;
2017 }
2018
2019 if (do_peeling)
2020 {
2021 stmt_vec_info stmt_info = dr0_info->stmt;
2022 vectype = STMT_VINFO_VECTYPE (stmt_info);
2023
2024 if (known_alignment_for_access_p (dr0_info))
2025 {
2026 bool negative = tree_int_cst_compare (DR_STEP (dr0_info->dr),
2027 size_zero_node) < 0;
2028 if (!npeel)
2029 {
2030 /* Since it's known at compile time, compute the number of
2031 iterations in the peeled loop (the peeling factor) for use in
2032 updating DR_MISALIGNMENT values. The peeling factor is the
2033 vectorization factor minus the misalignment as an element
2034 count. */
2035 mis = (negative
2036 ? DR_MISALIGNMENT (dr0_info)
2037 : -DR_MISALIGNMENT (dr0_info));
2038 /* If known_alignment_for_access_p then we have set
2039 DR_MISALIGNMENT, which is only done if we know it at compile
2040 time, so it is safe to assume the target alignment is
2041 constant. */
2042 unsigned int target_align =
2043 DR_TARGET_ALIGNMENT (dr0_info).to_constant ();
2044 npeel = ((mis & (target_align - 1))
2045 / vect_get_scalar_dr_size (dr0_info));
2046 }
2047
2048 /* For an interleaved data access every iteration accesses all the
2049 members of the group; therefore we divide the number of iterations
2050 by the group size. */
2051 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2052 npeel /= DR_GROUP_SIZE (stmt_info);
2053
2054 if (dump_enabled_p ())
2055 dump_printf_loc (MSG_NOTE, vect_location,
2056 "Try peeling by %d\n", npeel);
2057 }
2058
2059 /* Ensure that all datarefs can be vectorized after the peel. */
2060 if (!vect_peeling_supportable (loop_vinfo, dr0_info, npeel))
2061 do_peeling = false;
2062
2063 /* Check if all datarefs are supportable and log. */
2064 if (do_peeling && known_alignment_for_access_p (dr0_info) && npeel == 0)
2065 {
2066 opt_result stat = vect_verify_datarefs_alignment (loop_vinfo);
2067 if (!stat)
2068 do_peeling = false;
2069 else
2070 return stat;
2071 }
2072
2073 /* Cost model #1 - honor --param vect-max-peeling-for-alignment. */
2074 if (do_peeling)
2075 {
2076 unsigned max_allowed_peel
2077 = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT);
2078 if (max_allowed_peel != (unsigned)-1)
2079 {
2080 unsigned max_peel = npeel;
2081 if (max_peel == 0)
2082 {
2083 poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr0_info);
2084 unsigned HOST_WIDE_INT target_align_c;
2085 if (target_align.is_constant (&target_align_c))
2086 max_peel =
2087 target_align_c / vect_get_scalar_dr_size (dr0_info) - 1;
2088 else
2089 {
2090 do_peeling = false;
2091 if (dump_enabled_p ())
2092 dump_printf_loc (MSG_NOTE, vect_location,
2093 "Disable peeling, max peels set and vector"
2094 " alignment unknown\n");
2095 }
2096 }
2097 if (max_peel > max_allowed_peel)
2098 {
2099 do_peeling = false;
2100 if (dump_enabled_p ())
2101 dump_printf_loc (MSG_NOTE, vect_location,
2102 "Disable peeling, max peels reached: %d\n", max_peel);
2103 }
2104 }
2105 }
2106
2107 /* Cost model #2 - if peeling may result in a remaining loop not
2108 iterating enough to be vectorized then do not peel. Since this
2109 is a cost heuristic rather than a correctness decision, use the
2110 most likely runtime value for variable vectorization factors. */
2111 if (do_peeling
2112 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2113 {
2114 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2115 unsigned int max_peel = npeel == 0 ? assumed_vf - 1 : npeel;
2116 if ((unsigned HOST_WIDE_INT) LOOP_VINFO_INT_NITERS (loop_vinfo)
2117 < assumed_vf + max_peel)
2118 do_peeling = false;
2119 }
2120
2121 if (do_peeling)
2122 {
2123 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
2124 If the misalignment of DR_i is identical to that of dr0 then set
2125 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
2126 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
2127 by the peeling factor times the element size of DR_i (MOD the
2128 vectorization factor times the size). Otherwise, the
2129 misalignment of DR_i must be set to unknown. */
2130 FOR_EACH_VEC_ELT (datarefs, i, dr)
2131 if (dr != dr0_info->dr)
2132 {
2133 /* Strided accesses perform only component accesses; alignment
2134 is irrelevant for them. */
2135 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2136 stmt_info = dr_info->stmt;
2137 if (STMT_VINFO_STRIDED_P (stmt_info)
2138 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2139 continue;
2140
2141 vect_update_misalignment_for_peel (dr_info, dr0_info, npeel);
2142 }
2143
2144 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0_info;
2145 if (npeel)
2146 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
2147 else
2148 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2149 = DR_MISALIGNMENT (dr0_info);
2150 SET_DR_MISALIGNMENT (dr0_info, 0);
2151 if (dump_enabled_p ())
2152 {
2153 dump_printf_loc (MSG_NOTE, vect_location,
2154 "Alignment of access forced using peeling.\n");
2155 dump_printf_loc (MSG_NOTE, vect_location,
2156 "Peeling for alignment will be applied.\n");
2157 }
2158
2159 /* The inside-loop cost will be accounted for in vectorizable_load
2160 and vectorizable_store correctly with adjusted alignments.
2161 Drop the body_cst_vec on the floor here. */
2162 opt_result stat = vect_verify_datarefs_alignment (loop_vinfo);
2163 gcc_assert (stat);
2164 return stat;
2165 }
2166 }
2167
2168 /* (2) Versioning to force alignment. */
2169
2170 /* Try versioning if:
2171 1) optimize loop for speed
2172 2) there is at least one unsupported misaligned data ref with an unknown
2173 misalignment, and
2174 3) all misaligned data refs with a known misalignment are supported, and
2175 4) the number of runtime alignment checks is within reason. */
2176
2177 do_versioning =
2178 optimize_loop_nest_for_speed_p (loop)
2179 && (!loop->inner); /* FORNOW */
2180
2181 if (do_versioning)
2182 {
2183 FOR_EACH_VEC_ELT (datarefs, i, dr)
2184 {
2185 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2186 stmt_vec_info stmt_info = dr_info->stmt;
2187
2188 /* For interleaving, only the alignment of the first access
2189 matters. */
2190 if (aligned_access_p (dr_info)
2191 || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
2192 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info))
2193 continue;
2194
2195 if (STMT_VINFO_STRIDED_P (stmt_info))
2196 {
2197 /* Strided loads perform only component accesses; alignment is
2198 irrelevant for them. */
2199 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
2200 continue;
2201 do_versioning = false;
2202 break;
2203 }
2204
2205 supportable_dr_alignment
2206 = vect_supportable_dr_alignment (dr_info, false);
2207
2208 if (!supportable_dr_alignment)
2209 {
2210 int mask;
2211 tree vectype;
2212
2213 if (known_alignment_for_access_p (dr_info)
2214 || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
2215 >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
2216 {
2217 do_versioning = false;
2218 break;
2219 }
2220
2221 vectype = STMT_VINFO_VECTYPE (stmt_info);
2222 gcc_assert (vectype);
2223
2224 /* At present we don't support versioning for alignment
2225 with variable VF, since there's no guarantee that the
2226 VF is a power of two. We could relax this if we added
2227 a way of enforcing a power-of-two size. */
2228 unsigned HOST_WIDE_INT size;
2229 if (!GET_MODE_SIZE (TYPE_MODE (vectype)).is_constant (&size))
2230 {
2231 do_versioning = false;
2232 break;
2233 }
2234
2235 /* Forcing alignment in the first iteration is no good if
2236 we don't keep it across iterations. For now, just disable
2237 versioning in this case.
2238 ?? We could actually unroll the loop to achieve the required
2239 overall step alignment, and forcing the alignment could be
2240 done by doing some iterations of the non-vectorized loop. */
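/* For instance (illustrative numbers), with a vectorization factor of 4,
   a DR_STEP known to be a multiple of 4 bytes and a 16-byte target
   alignment, 4 * 4 == 16 is a multiple of 16, so alignment forced for
   the first iteration is preserved by every vector iteration.  If the
   step were only known to be a multiple of 2 bytes, 4 * 2 == 8 would
   not be, and versioning is disabled below.  */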
2241 if (!multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
2242 * DR_STEP_ALIGNMENT (dr),
2243 DR_TARGET_ALIGNMENT (dr_info)))
2244 {
2245 do_versioning = false;
2246 break;
2247 }
2248
2249 /* The rightmost bits of an aligned address must be zeros.
2250 Construct the mask needed for this test. For example,
2251 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
2252 mask must be 15 = 0xf. */
2253 mask = size - 1;
2254
2255 /* FORNOW: use the same mask to test all potentially unaligned
2256 references in the loop. The vectorizer currently supports
2257 a single vector size, see the reference to
2258 GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
2259 vectorization factor is computed. */
2260 gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
2261 || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
2262 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
2263 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (stmt_info);
2264 }
2265 }
2266
2267 /* Versioning requires at least one misaligned data reference. */
2268 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2269 do_versioning = false;
2270 else if (!do_versioning)
2271 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
2272 }
2273
2274 if (do_versioning)
2275 {
2276 vec<stmt_vec_info> may_misalign_stmts
2277 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
2278 stmt_vec_info stmt_info;
2279
2280 /* It can now be assumed that the data references in the statements
2281 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
2282 of the loop being vectorized. */
2283 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt_info)
2284 {
2285 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
2286 SET_DR_MISALIGNMENT (dr_info, 0);
2287 if (dump_enabled_p ())
2288 dump_printf_loc (MSG_NOTE, vect_location,
2289 "Alignment of access forced using versioning.\n");
2290 }
2291
2292 if (dump_enabled_p ())
2293 dump_printf_loc (MSG_NOTE, vect_location,
2294 "Versioning for alignment will be applied.\n");
2295
2296 /* Peeling and versioning can't be done together at this time. */
2297 gcc_assert (! (do_peeling && do_versioning));
2298
2299 opt_result stat = vect_verify_datarefs_alignment (loop_vinfo);
2300 gcc_assert (stat);
2301 return stat;
2302 }
2303
2304 /* This point is reached if neither peeling nor versioning is being done. */
2305 gcc_assert (! (do_peeling || do_versioning));
2306
2307 opt_result stat = vect_verify_datarefs_alignment (loop_vinfo);
2308 return stat;
2309 }
2310
2311
2312 /* Function vect_find_same_alignment_drs.
2313
2314 Update group and alignment relations in VINFO according to the chosen
2315 vectorization factor. */
2316
2317 static void
2318 vect_find_same_alignment_drs (vec_info *vinfo, data_dependence_relation *ddr)
2319 {
2320 struct data_reference *dra = DDR_A (ddr);
2321 struct data_reference *drb = DDR_B (ddr);
2322 dr_vec_info *dr_info_a = vinfo->lookup_dr (dra);
2323 dr_vec_info *dr_info_b = vinfo->lookup_dr (drb);
2324 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
2325 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
2326
2327 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
2328 return;
2329
2330 if (dra == drb)
2331 return;
2332
2333 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
2334 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
2335 return;
2336
2337 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0)
2338 || !operand_equal_p (DR_OFFSET (dra), DR_OFFSET (drb), 0)
2339 || !operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2340 return;
2341
2342 /* Two references with distance zero have the same alignment. */
2343 poly_offset_int diff = (wi::to_poly_offset (DR_INIT (dra))
2344 - wi::to_poly_offset (DR_INIT (drb)));
2345 if (maybe_ne (diff, 0))
2346 {
2347 /* Get the wider of the two alignments. */
2348 poly_uint64 align_a =
2349 exact_div (vect_calculate_target_alignment (dr_info_a),
2350 BITS_PER_UNIT);
2351 poly_uint64 align_b =
2352 exact_div (vect_calculate_target_alignment (dr_info_b),
2353 BITS_PER_UNIT);
2354 unsigned HOST_WIDE_INT align_a_c, align_b_c;
2355 if (!align_a.is_constant (&align_a_c)
2356 || !align_b.is_constant (&align_b_c))
2357 return;
2358
2359 unsigned HOST_WIDE_INT max_align = MAX (align_a_c, align_b_c);
2360
2361 /* Require the gap to be a multiple of the larger vector alignment. */
2362 if (!multiple_p (diff, max_align))
2363 return;
2364 }
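/* For instance (illustrative numbers), two accesses whose DR_INITs
   differ by 32 bytes and whose vector target alignments are both
   16 bytes are known to share the same misalignment, whereas a
   24-byte gap would not pass the multiple_p test above.  */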
2365
2366 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
2367 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
2368 if (dump_enabled_p ())
2369 dump_printf_loc (MSG_NOTE, vect_location,
2370 "accesses have the same alignment: %T and %T\n",
2371 DR_REF (dra), DR_REF (drb));
2372 }
2373
2374
2375 /* Function vect_analyze_data_refs_alignment
2376
2377 Analyze the alignment of the data-references in the loop.
2378 Return FALSE if a data reference is found that cannot be vectorized. */
2379
2380 opt_result
2381 vect_analyze_data_refs_alignment (loop_vec_info vinfo)
2382 {
2383 DUMP_VECT_SCOPE ("vect_analyze_data_refs_alignment");
2384
2385 /* Mark groups of data references with same alignment using
2386 data dependence information. */
2387 vec<ddr_p> ddrs = vinfo->shared->ddrs;
2388 struct data_dependence_relation *ddr;
2389 unsigned int i;
2390
2391 FOR_EACH_VEC_ELT (ddrs, i, ddr)
2392 vect_find_same_alignment_drs (vinfo, ddr);
2393
2394 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
2395 struct data_reference *dr;
2396
2397 vect_record_base_alignments (vinfo);
2398 FOR_EACH_VEC_ELT (datarefs, i, dr)
2399 {
2400 dr_vec_info *dr_info = vinfo->lookup_dr (dr);
2401 if (STMT_VINFO_VECTORIZABLE (dr_info->stmt))
2402 vect_compute_data_ref_alignment (dr_info);
2403 }
2404
2405 return opt_result::success ();
2406 }
2407
2408
2409 /* Analyze alignment of DRs of stmts in NODE. */
2410
2411 static bool
2412 vect_slp_analyze_and_verify_node_alignment (slp_tree node)
2413 {
2414 /* We vectorize from the first scalar stmt in the node unless
2415 the node is permuted, in which case we start from the first
2416 element in the group. */
2417 stmt_vec_info first_stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2418 dr_vec_info *first_dr_info = STMT_VINFO_DR_INFO (first_stmt_info);
2419 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2420 first_stmt_info = DR_GROUP_FIRST_ELEMENT (first_stmt_info);
2421
2422 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (first_stmt_info);
2423 vect_compute_data_ref_alignment (dr_info);
2424 /* For creating the data-ref pointer we need the alignment of
2425 the first element anyway. */
2426 if (dr_info != first_dr_info)
2427 vect_compute_data_ref_alignment (first_dr_info);
2428 if (! verify_data_ref_alignment (dr_info))
2429 {
2430 if (dump_enabled_p ())
2431 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2432 "not vectorized: bad data alignment in basic "
2433 "block.\n");
2434 return false;
2435 }
2436
2437 return true;
2438 }
2439
2440 /* Function vect_slp_analyze_and_verify_instance_alignment
2441
2442 Analyze the alignment of the data-references in the SLP instance.
2443 Return FALSE if a data reference is found that cannot be vectorized. */
2444
2445 bool
2446 vect_slp_analyze_and_verify_instance_alignment (slp_instance instance)
2447 {
2448 DUMP_VECT_SCOPE ("vect_slp_analyze_and_verify_instance_alignment");
2449
2450 slp_tree node;
2451 unsigned i;
2452 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
2453 if (! vect_slp_analyze_and_verify_node_alignment (node))
2454 return false;
2455
2456 node = SLP_INSTANCE_TREE (instance);
2457 if (STMT_VINFO_DATA_REF (SLP_TREE_SCALAR_STMTS (node)[0])
2458 && ! vect_slp_analyze_and_verify_node_alignment
2459 (SLP_INSTANCE_TREE (instance)))
2460 return false;
2461
2462 return true;
2463 }
2464
2465
2466 /* Analyze groups of accesses: check that DR_INFO belongs to a group of
2467 accesses of legal size, step, etc. Detect gaps, single element
2468 interleaving, and other special cases. Set grouped access info.
2469 Collect groups of strided stores for further use in SLP analysis.
2470 Worker for vect_analyze_group_access. */
2471
2472 static bool
2473 vect_analyze_group_access_1 (dr_vec_info *dr_info)
2474 {
2475 data_reference *dr = dr_info->dr;
2476 tree step = DR_STEP (dr);
2477 tree scalar_type = TREE_TYPE (DR_REF (dr));
2478 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2479 stmt_vec_info stmt_info = dr_info->stmt;
2480 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2481 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2482 HOST_WIDE_INT dr_step = -1;
2483 HOST_WIDE_INT groupsize, last_accessed_element = 1;
2484 bool slp_impossible = false;
2485
2486 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2487 size of the interleaving group (including gaps). */
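/* For example (illustrative access pattern), loads of a[3*i] and
   a[3*i + 1] from an array of 4-byte ints have DR_STEP == 12, so
   GROUPSIZE == 12 / 4 == 3: two group members plus one gap element.  */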
2488 if (tree_fits_shwi_p (step))
2489 {
2490 dr_step = tree_to_shwi (step);
2491 /* Check that STEP is a multiple of the type size. Otherwise there
2492 is a non-element-sized gap at the end of the group which we
2493 cannot represent in DR_GROUP_GAP or DR_GROUP_SIZE.
2494 ??? As we can handle a non-constant step fine here, we should
2495 simply remove uses of DR_GROUP_GAP between the last and first
2496 element and instead rely on DR_STEP. DR_GROUP_SIZE would then
2497 simply not include that gap. */
2498 if ((dr_step % type_size) != 0)
2499 {
2500 if (dump_enabled_p ())
2501 dump_printf_loc (MSG_NOTE, vect_location,
2502 "Step %T is not a multiple of the element size"
2503 " for %T\n",
2504 step, DR_REF (dr));
2505 return false;
2506 }
2507 groupsize = absu_hwi (dr_step) / type_size;
2508 }
2509 else
2510 groupsize = 0;
2511
2512 /* A non-consecutive access is possible only if it is part of interleaving. */
2513 if (!DR_GROUP_FIRST_ELEMENT (stmt_info))
2514 {
2515 /* Check if this DR is part of an interleaving group, and is a single
2516 element of the group that is accessed in the loop. */
2517
2518 /* Gaps are supported only for loads. STEP must be a multiple of the type
2519 size. */
2520 if (DR_IS_READ (dr)
2521 && (dr_step % type_size) == 0
2522 && groupsize > 0)
2523 {
2524 DR_GROUP_FIRST_ELEMENT (stmt_info) = stmt_info;
2525 DR_GROUP_SIZE (stmt_info) = groupsize;
2526 DR_GROUP_GAP (stmt_info) = groupsize - 1;
2527 if (dump_enabled_p ())
2528 dump_printf_loc (MSG_NOTE, vect_location,
2529 "Detected single element interleaving %T"
2530 " step %T\n",
2531 DR_REF (dr), step);
2532
2533 return true;
2534 }
2535
2536 if (dump_enabled_p ())
2537 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2538 "not consecutive access %G", stmt_info->stmt);
2539
2540 if (bb_vinfo)
2541 {
2542 /* Mark the statement as unvectorizable. */
2543 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
2544 return true;
2545 }
2546
2547 if (dump_enabled_p ())
2548 dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
2549 STMT_VINFO_STRIDED_P (stmt_info) = true;
2550 return true;
2551 }
2552
2553 if (DR_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info)
2554 {
2555 /* First stmt in the interleaving chain. Check the chain. */
2556 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2557 struct data_reference *data_ref = dr;
2558 unsigned int count = 1;
2559 tree prev_init = DR_INIT (data_ref);
2560 HOST_WIDE_INT diff, gaps = 0;
2561
2562 /* By construction, all group members have INTEGER_CST DR_INITs. */
2563 while (next)
2564 {
2565 /* We never have the same DR multiple times. */
2566 gcc_assert (tree_int_cst_compare (DR_INIT (data_ref),
2567 DR_INIT (STMT_VINFO_DATA_REF (next))) != 0);
2568
2569 data_ref = STMT_VINFO_DATA_REF (next);
2570
2571 /* All group members have the same STEP by construction. */
2572 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2573
2574 /* Check that the distance between two accesses is equal to the type
2575 size. Otherwise, we have gaps. */
2576 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2577 - TREE_INT_CST_LOW (prev_init)) / type_size;
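/* For example (illustrative values), group members reading a[4*i] and
   a[4*i + 2] from an array of 4-byte ints have DR_INITs 0 and 8, so
   diff == 2 and a gap of one element is recorded between them.  */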
2578 if (diff != 1)
2579 {
2580 /* FORNOW: SLP of accesses with gaps is not supported. */
2581 slp_impossible = true;
2582 if (DR_IS_WRITE (data_ref))
2583 {
2584 if (dump_enabled_p ())
2585 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2586 "interleaved store with gaps\n");
2587 return false;
2588 }
2589
2590 gaps += diff - 1;
2591 }
2592
2593 last_accessed_element += diff;
2594
2595 /* Store the gap from the previous member of the group. If there is no
2596 gap in the access, DR_GROUP_GAP is always 1. */
2597 DR_GROUP_GAP (next) = diff;
2598
2599 prev_init = DR_INIT (data_ref);
2600 next = DR_GROUP_NEXT_ELEMENT (next);
2601 /* Count the number of data-refs in the chain. */
2602 count++;
2603 }
2604
2605 if (groupsize == 0)
2606 groupsize = count + gaps;
2607
2608 /* This could be UINT_MAX, but as we are generating code in a very
2609 inefficient way we have to cap it earlier. See PR78699 for example. */
2610 if (groupsize > 4096)
2611 {
2612 if (dump_enabled_p ())
2613 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2614 "group is too large\n");
2615 return false;
2616 }
2617
2618 /* Check that the size of the interleaving is equal to count for stores,
2619 i.e., that there are no gaps. */
2620 if (groupsize != count
2621 && !DR_IS_READ (dr))
2622 {
2623 groupsize = count;
2624 STMT_VINFO_STRIDED_P (stmt_info) = true;
2625 }
2626
2627 /* If there is a gap after the last load in the group it is the
2628 difference between the groupsize and the last accessed
2629 element.
2630 When there is no gap, this difference should be 0. */
2631 DR_GROUP_GAP (stmt_info) = groupsize - last_accessed_element;
2632
2633 DR_GROUP_SIZE (stmt_info) = groupsize;
2634 if (dump_enabled_p ())
2635 {
2636 dump_printf_loc (MSG_NOTE, vect_location,
2637 "Detected interleaving ");
2638 if (DR_IS_READ (dr))
2639 dump_printf (MSG_NOTE, "load ");
2640 else if (STMT_VINFO_STRIDED_P (stmt_info))
2641 dump_printf (MSG_NOTE, "strided store ");
2642 else
2643 dump_printf (MSG_NOTE, "store ");
2644 dump_printf (MSG_NOTE, "of size %u\n",
2645 (unsigned)groupsize);
2646 dump_printf_loc (MSG_NOTE, vect_location, "\t%G", stmt_info->stmt);
2647 next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2648 while (next)
2649 {
2650 if (DR_GROUP_GAP (next) != 1)
2651 dump_printf_loc (MSG_NOTE, vect_location,
2652 "\t<gap of %d elements>\n",
2653 DR_GROUP_GAP (next) - 1);
2654 dump_printf_loc (MSG_NOTE, vect_location, "\t%G", next->stmt);
2655 next = DR_GROUP_NEXT_ELEMENT (next);
2656 }
2657 if (DR_GROUP_GAP (stmt_info) != 0)
2658 dump_printf_loc (MSG_NOTE, vect_location,
2659 "\t<gap of %d elements>\n",
2660 DR_GROUP_GAP (stmt_info));
2661 }
2662
2663 /* SLP: create an SLP data structure for every interleaving group of
2664 stores for further analysis in vect_analyze_slp. */
2665 if (DR_IS_WRITE (dr) && !slp_impossible)
2666 {
2667 if (loop_vinfo)
2668 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt_info);
2669 if (bb_vinfo)
2670 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt_info);
2671 }
2672 }
2673
2674 return true;
2675 }
2676
2677 /* Analyze groups of accesses: check that DR_INFO belongs to a group of
2678 accesses of legal size, step, etc. Detect gaps, single element
2679 interleaving, and other special cases. Set grouped access info.
2680 Collect groups of strided stores for further use in SLP analysis. */
2681
2682 static bool
2683 vect_analyze_group_access (dr_vec_info *dr_info)
2684 {
2685 if (!vect_analyze_group_access_1 (dr_info))
2686 {
2687 /* Dissolve the group if present. */
2688 stmt_vec_info stmt_info = DR_GROUP_FIRST_ELEMENT (dr_info->stmt);
2689 while (stmt_info)
2690 {
2691 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2692 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2693 DR_GROUP_NEXT_ELEMENT (stmt_info) = NULL;
2694 stmt_info = next;
2695 }
2696 return false;
2697 }
2698 return true;
2699 }
2700
2701 /* Analyze the access pattern of the data-reference DR_INFO.
2702 In case of non-consecutive accesses call vect_analyze_group_access() to
2703 analyze groups of accesses. */
2704
2705 static bool
2706 vect_analyze_data_ref_access (dr_vec_info *dr_info)
2707 {
2708 data_reference *dr = dr_info->dr;
2709 tree step = DR_STEP (dr);
2710 tree scalar_type = TREE_TYPE (DR_REF (dr));
2711 stmt_vec_info stmt_info = dr_info->stmt;
2712 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2713 class loop *loop = NULL;
2714
2715 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
2716 return true;
2717
2718 if (loop_vinfo)
2719 loop = LOOP_VINFO_LOOP (loop_vinfo);
2720
2721 if (loop_vinfo && !step)
2722 {
2723 if (dump_enabled_p ())
2724 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2725 "bad data-ref access in loop\n");
2726 return false;
2727 }
2728
2729 /* Allow loads with zero step in inner-loop vectorization. */
2730 if (loop_vinfo && integer_zerop (step))
2731 {
2732 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2733 if (!nested_in_vect_loop_p (loop, stmt_info))
2734 return DR_IS_READ (dr);
2735 /* Allow references with zero step for outer loops marked
2736 with pragma omp simd only - it guarantees absence of
2737 loop-carried dependencies between inner loop iterations. */
2738 if (loop->safelen < 2)
2739 {
2740 if (dump_enabled_p ())
2741 dump_printf_loc (MSG_NOTE, vect_location,
2742 "zero step in inner loop of nest\n");
2743 return false;
2744 }
2745 }
2746
2747 if (loop && nested_in_vect_loop_p (loop, stmt_info))
2748 {
2749 /* Interleaved accesses are not yet supported within outer-loop
2750 vectorization for references in the inner-loop. */
2751 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2752
2753 /* For the rest of the analysis we use the outer-loop step. */
2754 step = STMT_VINFO_DR_STEP (stmt_info);
2755 if (integer_zerop (step))
2756 {
2757 if (dump_enabled_p ())
2758 dump_printf_loc (MSG_NOTE, vect_location,
2759 "zero step in outer loop.\n");
2760 return DR_IS_READ (dr);
2761 }
2762 }
2763
2764 /* Consecutive? */
2765 if (TREE_CODE (step) == INTEGER_CST)
2766 {
2767 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2768 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2769 || (dr_step < 0
2770 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2771 {
2772 /* Mark that it is not interleaving. */
2773 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2774 return true;
2775 }
2776 }
2777
2778 if (loop && nested_in_vect_loop_p (loop, stmt_info))
2779 {
2780 if (dump_enabled_p ())
2781 dump_printf_loc (MSG_NOTE, vect_location,
2782 "grouped access in outer loop.\n");
2783 return false;
2784 }
2785
2786
2787 /* Assume this is a DR handled by the non-constant strided load case. */
2788 if (TREE_CODE (step) != INTEGER_CST)
2789 return (STMT_VINFO_STRIDED_P (stmt_info)
2790 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2791 || vect_analyze_group_access (dr_info)));
2792
2793 /* Not a consecutive access - check whether it is part of an interleaving group. */
2794 return vect_analyze_group_access (dr_info);
2795 }
2796
2797 /* Compare two data-references DRA and DRB, used to sort them into
2798 chunks suitable for grouping. */
2799
2800 static int
2801 dr_group_sort_cmp (const void *dra_, const void *drb_)
2802 {
2803 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2804 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2805 int cmp;
2806
2807 /* Stabilize sort. */
2808 if (dra == drb)
2809 return 0;
2810
2811 /* DRs in different loops never belong to the same group. */
2812 loop_p loopa = gimple_bb (DR_STMT (dra))->loop_father;
2813 loop_p loopb = gimple_bb (DR_STMT (drb))->loop_father;
2814 if (loopa != loopb)
2815 return loopa->num < loopb->num ? -1 : 1;
2816
2817 /* Ordering of DRs according to base. */
2818 cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2819 DR_BASE_ADDRESS (drb));
2820 if (cmp != 0)
2821 return cmp;
2822
2823 /* And according to DR_OFFSET. */
2824 cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2825 if (cmp != 0)
2826 return cmp;
2827
2828 /* Put reads before writes. */
2829 if (DR_IS_READ (dra) != DR_IS_READ (drb))
2830 return DR_IS_READ (dra) ? -1 : 1;
2831
2832 /* Then sort by access size. */
2833 cmp = data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2834 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2835 if (cmp != 0)
2836 return cmp;
2837
2838 /* And by step. */
2839 cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
2840 if (cmp != 0)
2841 return cmp;
2842
2843 /* Then sort by DR_INIT. In case of identical DRs, sort by stmt UID. */
2844 cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
2845 if (cmp == 0)
2846 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2847 return cmp;
2848 }
2849
2850 /* If OP is the result of a conversion, return the unconverted value,
2851 otherwise return null. */
2852
2853 static tree
2854 strip_conversion (tree op)
2855 {
2856 if (TREE_CODE (op) != SSA_NAME)
2857 return NULL_TREE;
2858 gimple *stmt = SSA_NAME_DEF_STMT (op);
2859 if (!is_gimple_assign (stmt)
2860 || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (stmt)))
2861 return NULL_TREE;
2862 return gimple_assign_rhs1 (stmt);
2863 }
2864
2865 /* Return true if vectorizable_* routines can handle statements STMT1_INFO
2866 and STMT2_INFO being in a single group. When ALLOW_SLP_P, masked loads can
2867 be grouped in SLP mode. */
2868
2869 static bool
2870 can_group_stmts_p (stmt_vec_info stmt1_info, stmt_vec_info stmt2_info,
2871 bool allow_slp_p)
2872 {
2873 if (gimple_assign_single_p (stmt1_info->stmt))
2874 return gimple_assign_single_p (stmt2_info->stmt);
2875
2876 gcall *call1 = dyn_cast <gcall *> (stmt1_info->stmt);
2877 if (call1 && gimple_call_internal_p (call1))
2878 {
2879 /* Check for two masked loads or two masked stores. */
2880 gcall *call2 = dyn_cast <gcall *> (stmt2_info->stmt);
2881 if (!call2 || !gimple_call_internal_p (call2))
2882 return false;
2883 internal_fn ifn = gimple_call_internal_fn (call1);
2884 if (ifn != IFN_MASK_LOAD && ifn != IFN_MASK_STORE)
2885 return false;
2886 if (ifn != gimple_call_internal_fn (call2))
2887 return false;
2888
2889 /* Check that the masks are the same. Cope with casts of masks,
2890 like those created by build_mask_conversion. */
2891 tree mask1 = gimple_call_arg (call1, 2);
2892 tree mask2 = gimple_call_arg (call2, 2);
2893 if (!operand_equal_p (mask1, mask2, 0)
2894 && (ifn == IFN_MASK_STORE || !allow_slp_p))
2895 {
2896 mask1 = strip_conversion (mask1);
2897 if (!mask1)
2898 return false;
2899 mask2 = strip_conversion (mask2);
2900 if (!mask2)
2901 return false;
2902 if (!operand_equal_p (mask1, mask2, 0))
2903 return false;
2904 }
2905 return true;
2906 }
2907
2908 return false;
2909 }
2910
2911 /* Function vect_analyze_data_ref_accesses.
2912
2913 Analyze the access pattern of all the data references in the loop.
2914
2915 FORNOW: the only access pattern that is considered vectorizable is a
2916 simple step 1 (consecutive) access.
2917
2918 FORNOW: handle only arrays and pointer accesses. */
2919
2920 opt_result
2921 vect_analyze_data_ref_accesses (vec_info *vinfo)
2922 {
2923 unsigned int i;
2924 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
2925 struct data_reference *dr;
2926
2927 DUMP_VECT_SCOPE ("vect_analyze_data_ref_accesses");
2928
2929 if (datarefs.is_empty ())
2930 return opt_result::success ();
2931
2932 /* Sort the array of datarefs to make building the interleaving chains
2933 linear. Don't modify the original vector's order; it is needed for
2934 determining which dependencies are reversed. */
2935 vec<data_reference_p> datarefs_copy = datarefs.copy ();
2936 datarefs_copy.qsort (dr_group_sort_cmp);
2937 hash_set<stmt_vec_info> to_fixup;
2938
2939 /* Build the interleaving chains. */
2940 for (i = 0; i < datarefs_copy.length () - 1;)
2941 {
2942 data_reference_p dra = datarefs_copy[i];
2943 dr_vec_info *dr_info_a = vinfo->lookup_dr (dra);
2944 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
2945 stmt_vec_info lastinfo = NULL;
2946 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
2947 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a))
2948 {
2949 ++i;
2950 continue;
2951 }
2952 for (i = i + 1; i < datarefs_copy.length (); ++i)
2953 {
2954 data_reference_p drb = datarefs_copy[i];
2955 dr_vec_info *dr_info_b = vinfo->lookup_dr (drb);
2956 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
2957 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_b)
2958 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
2959 break;
2960
2961 /* ??? Imperfect sorting (non-compatible types, non-modulo
2962 accesses, same accesses) can lead to a group being artificially
2963 split here as we don't just skip over those. If it really
2964 matters we can push those to a worklist and re-iterate
2965 over them. Then we can just skip ahead to the next DR here. */
2966
2967 /* DRs in different loops should not be put into the same
2968 interleaving group. */
2969 if (gimple_bb (DR_STMT (dra))->loop_father
2970 != gimple_bb (DR_STMT (drb))->loop_father)
2971 break;
2972
2973 /* Check that the data-refs have the same first location (except init)
2974 and that they are both either stores or loads (not a load and a
2975 store, and not masked loads or stores). */
2976 if (DR_IS_READ (dra) != DR_IS_READ (drb)
2977 || data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2978 DR_BASE_ADDRESS (drb)) != 0
2979 || data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb)) != 0
2980 || !can_group_stmts_p (stmtinfo_a, stmtinfo_b, true))
2981 break;
2982
2983 /* Check that the data-refs have the same constant size. */
2984 tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
2985 tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
2986 if (!tree_fits_uhwi_p (sza)
2987 || !tree_fits_uhwi_p (szb)
2988 || !tree_int_cst_equal (sza, szb))
2989 break;
2990
2991 /* Check that the data-refs have the same step. */
2992 if (data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb)) != 0)
2993 break;
2994
2995 /* Check the types are compatible.
2996 ??? We don't distinguish this during sorting. */
2997 if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
2998 TREE_TYPE (DR_REF (drb))))
2999 break;
3000
3001 /* Check that the DR_INITs are compile-time constants. */
3002 if (TREE_CODE (DR_INIT (dra)) != INTEGER_CST
3003 || TREE_CODE (DR_INIT (drb)) != INTEGER_CST)
3004 break;
3005
3006 /* Different .GOMP_SIMD_LANE calls still give the same lane,
3007 just hold extra information. */
3008 if (STMT_VINFO_SIMD_LANE_ACCESS_P (stmtinfo_a)
3009 && STMT_VINFO_SIMD_LANE_ACCESS_P (stmtinfo_b)
3010 && data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb)) == 0)
3011 break;
3012
3013 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
3014 HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
3015 HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
3016 HOST_WIDE_INT init_prev
3017 = TREE_INT_CST_LOW (DR_INIT (datarefs_copy[i-1]));
3018 gcc_assert (init_a <= init_b
3019 && init_a <= init_prev
3020 && init_prev <= init_b);
3021
3022 /* Do not place the same access in the interleaving chain twice. */
3023 if (init_b == init_prev)
3024 {
3025 gcc_assert (gimple_uid (DR_STMT (datarefs_copy[i-1]))
3026 < gimple_uid (DR_STMT (drb)));
3027 /* Simply link in duplicates and fix up the chain below. */
3028 }
3029 else
3030 {
3031 /* If init_b == init_a + the size of the type * k, we have an
3032 interleaving, and DRA is accessed before DRB. */
3033 HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
3034 if (type_size_a == 0
3035 || (init_b - init_a) % type_size_a != 0)
3036 break;
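/* For example (illustrative values), with 4-byte elements and
   init_a == 0, init_b == 8, the difference 8 is a multiple of the
   type size, so DRB can be placed two elements after DRA in the
   same interleaving group.  */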
3037
3038 /* If we have a store, the accesses must be adjacent. This splits
3039 groups into chunks we support (we don't support vectorization
3040 of stores with gaps). */
3041 if (!DR_IS_READ (dra) && init_b - init_prev != type_size_a)
3042 break;
3043
3044 /* Checking that the step (when constant and nonzero) is greater
3045 than the difference between the data-refs' inits splits groups
3046 into suitable sizes. */
3047 if (tree_fits_shwi_p (DR_STEP (dra)))
3048 {
3049 HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra));
3050 if (step != 0 && step <= (init_b - init_a))
3051 break;
3052 }
3053 }
3054
3055 if (dump_enabled_p ())
3056 dump_printf_loc (MSG_NOTE, vect_location,
3057 DR_IS_READ (dra)
3058 ? "Detected interleaving load %T and %T\n"
3059 : "Detected interleaving store %T and %T\n",
3060 DR_REF (dra), DR_REF (drb));
3061
3062 /* Link the found element into the group list. */
3063 if (!DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
3064 {
3065 DR_GROUP_FIRST_ELEMENT (stmtinfo_a) = stmtinfo_a;
3066 lastinfo = stmtinfo_a;
3067 }
3068 DR_GROUP_FIRST_ELEMENT (stmtinfo_b) = stmtinfo_a;
3069 DR_GROUP_NEXT_ELEMENT (lastinfo) = stmtinfo_b;
3070 lastinfo = stmtinfo_b;
3071
3072 STMT_VINFO_SLP_VECT_ONLY (stmtinfo_a)
3073 = !can_group_stmts_p (stmtinfo_a, stmtinfo_b, false);
3074
3075 if (dump_enabled_p () && STMT_VINFO_SLP_VECT_ONLY (stmtinfo_a))
3076 dump_printf_loc (MSG_NOTE, vect_location,
3077 "Load suitable for SLP vectorization only.\n");
3078
3079 if (init_b == init_prev
3080 && !to_fixup.add (DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
3081 && dump_enabled_p ())
3082 dump_printf_loc (MSG_NOTE, vect_location,
3083 "Queuing group with duplicate access for fixup\n");
3084 }
3085 }
3086
3087 /* Fix up groups with duplicate entries by splitting them. */
3088 while (1)
3089 {
3090 hash_set<stmt_vec_info>::iterator it = to_fixup.begin ();
3091 if (!(it != to_fixup.end ()))
3092 break;
3093 stmt_vec_info grp = *it;
3094 to_fixup.remove (grp);
3095
3096 /* Find the earliest duplicate group member. */
3097 unsigned first_duplicate = -1u;
3098 stmt_vec_info next, g = grp;
3099 while ((next = DR_GROUP_NEXT_ELEMENT (g)))
3100 {
3101 if (tree_int_cst_equal (DR_INIT (STMT_VINFO_DR_INFO (next)->dr),
3102 DR_INIT (STMT_VINFO_DR_INFO (g)->dr))
3103 && gimple_uid (STMT_VINFO_STMT (next)) < first_duplicate)
3104 first_duplicate = gimple_uid (STMT_VINFO_STMT (next));
3105 g = next;
3106 }
3107 if (first_duplicate == -1U)
3108 continue;
3109
3110 /* Then move all stmts after the first duplicate to a new group.
3111 Note this is a heuristic but one with the property that *it
3112 is fixed up completely. */
3113 g = grp;
3114 stmt_vec_info newgroup = NULL, ng = grp;
3115 while ((next = DR_GROUP_NEXT_ELEMENT (g)))
3116 {
3117 if (gimple_uid (STMT_VINFO_STMT (next)) >= first_duplicate)
3118 {
3119 DR_GROUP_NEXT_ELEMENT (g) = DR_GROUP_NEXT_ELEMENT (next);
3120 if (!newgroup)
3121 newgroup = next;
3122 else
3123 DR_GROUP_NEXT_ELEMENT (ng) = next;
3124 ng = next;
3125 DR_GROUP_FIRST_ELEMENT (ng) = newgroup;
3126 }
3127 else
3128 g = DR_GROUP_NEXT_ELEMENT (g);
3129 }
3130 DR_GROUP_NEXT_ELEMENT (ng) = NULL;
3131
3132 /* Fix up the new group, which may still contain duplicates. */
3133 to_fixup.add (newgroup);
3134 }
3135
3136 FOR_EACH_VEC_ELT (datarefs_copy, i, dr)
3137 {
3138 dr_vec_info *dr_info = vinfo->lookup_dr (dr);
3139 if (STMT_VINFO_VECTORIZABLE (dr_info->stmt)
3140 && !vect_analyze_data_ref_access (dr_info))
3141 {
3142 if (dump_enabled_p ())
3143 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3144 "not vectorized: complicated access pattern.\n");
3145
3146 if (is_a <bb_vec_info> (vinfo))
3147 {
3148 /* Mark the statement as not vectorizable. */
3149 STMT_VINFO_VECTORIZABLE (dr_info->stmt) = false;
3150 continue;
3151 }
3152 else
3153 {
3154 datarefs_copy.release ();
3155 return opt_result::failure_at (dr_info->stmt->stmt,
3156 "not vectorized:"
3157 " complicated access pattern.\n");
3158 }
3159 }
3160 }
3161
3162 datarefs_copy.release ();
3163 return opt_result::success ();
3164 }
3165
3166 /* Function vect_vfa_segment_size.
3167
3168 Input:
3169 DR_INFO: The data reference.
3170 LENGTH_FACTOR: segment length to consider.
3171
3172 Return a value suitable for the dr_with_seg_len::seg_len field.
3173 This is the "distance travelled" by the pointer from the first
3174 iteration in the segment to the last. Note that it does not include
3175 the size of the access; in effect it only describes the first byte. */
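
   As an illustration (values chosen for exposition only), for a segment
   of LENGTH_FACTOR == 4 iterations with DR_STEP == 8 bytes the result is
   (4 - 1) * 8 == 24 bytes; the size of the access itself is accounted
   for separately by vect_vfa_access_size below.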
3176
3177 static tree
3178 vect_vfa_segment_size (dr_vec_info *dr_info, tree length_factor)
3179 {
3180 length_factor = size_binop (MINUS_EXPR,
3181 fold_convert (sizetype, length_factor),
3182 size_one_node);
3183 return size_binop (MULT_EXPR, fold_convert (sizetype, DR_STEP (dr_info->dr)),
3184 length_factor);
3185 }
3186
3187 /* Return a value that, when added to abs (vect_vfa_segment_size (DR_INFO)),
3188 gives the worst-case number of bytes covered by the segment. */
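/* For example (illustrative values), a group of four 4-byte elements
   with DR_GROUP_GAP == 1 yields an access size of 4 * (4 - 1) == 12
   bytes, before any adjustment for explicit realignment.  */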
3189
3190 static unsigned HOST_WIDE_INT
3191 vect_vfa_access_size (dr_vec_info *dr_info)
3192 {
3193 stmt_vec_info stmt_vinfo = dr_info->stmt;
3194 tree ref_type = TREE_TYPE (DR_REF (dr_info->dr));
3195 unsigned HOST_WIDE_INT ref_size = tree_to_uhwi (TYPE_SIZE_UNIT (ref_type));
3196 unsigned HOST_WIDE_INT access_size = ref_size;
3197 if (DR_GROUP_FIRST_ELEMENT (stmt_vinfo))
3198 {
3199 gcc_assert (DR_GROUP_FIRST_ELEMENT (stmt_vinfo) == stmt_vinfo);
3200 access_size *= DR_GROUP_SIZE (stmt_vinfo) - DR_GROUP_GAP (stmt_vinfo);
3201 }
3202 if (STMT_VINFO_VEC_STMT (stmt_vinfo)
3203 && (vect_supportable_dr_alignment (dr_info, false)
3204 == dr_explicit_realign_optimized))
3205 {
3206 /* We might access a full vector's worth. */
3207 tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
3208 access_size += tree_to_uhwi (TYPE_SIZE_UNIT (vectype)) - ref_size;
3209 }
3210 return access_size;
3211 }
3212
3213 /* Get the minimum alignment for all the scalar accesses that DR_INFO
3214 describes. */
3215
3216 static unsigned int
3217 vect_vfa_align (dr_vec_info *dr_info)
3218 {
3219 return TYPE_ALIGN_UNIT (TREE_TYPE (DR_REF (dr_info->dr)));
3220 }
3221
3222 /* Function vect_compile_time_alias.
3223
3224 Given data references A and B with equal base and offset, see whether
3225 the alias relation can be decided at compilation time. Return 1 if
3226 it can and the references alias, 0 if it can and the references do
3227 not alias, and -1 if we cannot decide at compile time. SEGMENT_LENGTH_A,
3228 SEGMENT_LENGTH_B, ACCESS_SIZE_A and ACCESS_SIZE_B are the equivalent
3229 of dr_with_seg_len::{seg_len,access_size} for A and B. */
3230
3231 static int
3232 vect_compile_time_alias (dr_vec_info *a, dr_vec_info *b,
3233 tree segment_length_a, tree segment_length_b,
3234 unsigned HOST_WIDE_INT access_size_a,
3235 unsigned HOST_WIDE_INT access_size_b)
3236 {
3237 poly_offset_int offset_a = wi::to_poly_offset (DR_INIT (a->dr));
3238 poly_offset_int offset_b = wi::to_poly_offset (DR_INIT (b->dr));
3239 poly_uint64 const_length_a;
3240 poly_uint64 const_length_b;
3241
3242 /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
3243 bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
3244 [a, a+12) */
3245 if (tree_int_cst_compare (DR_STEP (a->dr), size_zero_node) < 0)
3246 {
3247 const_length_a = (-wi::to_poly_wide (segment_length_a)).force_uhwi ();
3248 offset_a = (offset_a + access_size_a) - const_length_a;
3249 }
3250 else
3251 const_length_a = tree_to_poly_uint64 (segment_length_a);
3252 if (tree_int_cst_compare (DR_STEP (b->dr), size_zero_node) < 0)
3253 {
3254 const_length_b = (-wi::to_poly_wide (segment_length_b)).force_uhwi ();
3255 offset_b = (offset_b + access_size_b) - const_length_b;
3256 }
3257 else
3258 const_length_b = tree_to_poly_uint64 (segment_length_b);
3259
3260 const_length_a += access_size_a;
3261 const_length_b += access_size_b;
3262
3263 if (ranges_known_overlap_p (offset_a, const_length_a,
3264 offset_b, const_length_b))
3265 return 1;
3266
3267 if (!ranges_maybe_overlap_p (offset_a, const_length_a,
3268 offset_b, const_length_b))
3269 return 0;
3270
3271 return -1;
3272 }
3273
3274 /* Return true if the minimum nonzero dependence distance for loop LOOP_DEPTH
3275 in DDR is >= VF. */
3276
3277 static bool
3278 dependence_distance_ge_vf (data_dependence_relation *ddr,
3279 unsigned int loop_depth, poly_uint64 vf)
3280 {
3281 if (DDR_ARE_DEPENDENT (ddr) != NULL_TREE
3282 || DDR_NUM_DIST_VECTS (ddr) == 0)
3283 return false;
3284
3285 /* If the dependence is exact, we should have limited the VF instead. */
3286 gcc_checking_assert (DDR_COULD_BE_INDEPENDENT_P (ddr));
3287
3288 unsigned int i;
3289 lambda_vector dist_v;
3290 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
3291 {
3292 HOST_WIDE_INT dist = dist_v[loop_depth];
3293 if (dist != 0
3294 && !(dist > 0 && DDR_REVERSED_P (ddr))
3295 && maybe_lt ((unsigned HOST_WIDE_INT) abs_hwi (dist), vf))
3296 return false;
3297 }
3298
3299 if (dump_enabled_p ())
3300 dump_printf_loc (MSG_NOTE, vect_location,
3301 "dependence distance between %T and %T is >= VF\n",
3302 DR_REF (DDR_A (ddr)), DR_REF (DDR_B (ddr)));
3303
3304 return true;
3305 }
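
/* For instance (an illustrative case only): if the only dependence in
   the loop is between a[i] and a[i + 8], the distance vector entry is
   8; with a chosen VF of 4 every nonzero distance is >= VF, so the
   caller can drop the corresponding runtime alias check.  */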
3306
3307 /* Dump LOWER_BOUND using flags DUMP_KIND. Dumps are known to be enabled. */
3308
3309 static void
3310 dump_lower_bound (dump_flags_t dump_kind, const vec_lower_bound &lower_bound)
3311 {
3312 dump_printf (dump_kind, "%s (%T) >= ",
3313 lower_bound.unsigned_p ? "unsigned" : "abs",
3314 lower_bound.expr);
3315 dump_dec (dump_kind, lower_bound.min_value);
3316 }
3317
3318 /* Record that the vectorized loop requires the vec_lower_bound described
3319 by EXPR, UNSIGNED_P and MIN_VALUE. */
3320
3321 static void
3322 vect_check_lower_bound (loop_vec_info loop_vinfo, tree expr, bool unsigned_p,
3323 poly_uint64 min_value)
3324 {
3325 vec<vec_lower_bound> lower_bounds = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
3326 for (unsigned int i = 0; i < lower_bounds.length (); ++i)
3327 if (operand_equal_p (lower_bounds[i].expr, expr, 0))
3328 {
3329 unsigned_p &= lower_bounds[i].unsigned_p;
3330 min_value = upper_bound (lower_bounds[i].min_value, min_value);
3331 if (lower_bounds[i].unsigned_p != unsigned_p
3332 || maybe_lt (lower_bounds[i].min_value, min_value))
3333 {
3334 lower_bounds[i].unsigned_p = unsigned_p;
3335 lower_bounds[i].min_value = min_value;
3336 if (dump_enabled_p ())
3337 {
3338 dump_printf_loc (MSG_NOTE, vect_location,
3339 "updating run-time check to ");
3340 dump_lower_bound (MSG_NOTE, lower_bounds[i]);
3341 dump_printf (MSG_NOTE, "\n");
3342 }
3343 }
3344 return;
3345 }
3346
3347 vec_lower_bound lower_bound (expr, unsigned_p, min_value);
3348 if (dump_enabled_p ())
3349 {
3350 dump_printf_loc (MSG_NOTE, vect_location, "need a run-time check that ");
3351 dump_lower_bound (MSG_NOTE, lower_bound);
3352 dump_printf (MSG_NOTE, "\n");
3353 }
3354 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).safe_push (lower_bound);
3355 }
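
/* As a hypothetical example of the merging above: a first call with
   (STEP, true, 4) followed by a second call with (STEP, false, 8) for
   the same STEP expression leaves one entry requiring abs (STEP) >= 8,
   rather than two separate runtime checks.  */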
3356
3357 /* Return true if it's unlikely that the step of the vectorized form of DR_INFO
3358 will span fewer than GAP bytes. */
3359
3360 static bool
3361 vect_small_gap_p (loop_vec_info loop_vinfo, dr_vec_info *dr_info,
3362 poly_int64 gap)
3363 {
3364 stmt_vec_info stmt_info = dr_info->stmt;
3365 HOST_WIDE_INT count
3366 = estimated_poly_value (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
3367 if (DR_GROUP_FIRST_ELEMENT (stmt_info))
3368 count *= DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (stmt_info));
3369 return (estimated_poly_value (gap)
3370 <= count * vect_get_scalar_dr_size (dr_info));
3371 }
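
/* E.g. (estimates invented for illustration): with an estimated VF of
   4, a group of 2 accesses and a 4-byte scalar type, gaps of up to
   4 * 2 * 4 = 32 bytes count as small, so the caller prefers a runtime
   lower bound on DR_STEP over a full segment-overlap check.  */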
3372
3373 /* Return true if we know that there is no alias between DR_INFO_A and
3374 DR_INFO_B when abs (DR_STEP (DR_INFO_A->dr)) >= N for some N.
3375 When returning true, set *LOWER_BOUND_OUT to this N. */
3376
3377 static bool
3378 vectorizable_with_step_bound_p (dr_vec_info *dr_info_a, dr_vec_info *dr_info_b,
3379 poly_uint64 *lower_bound_out)
3380 {
3381 /* Check that there is a constant gap of known sign between DR_A
3382 and DR_B. */
3383 data_reference *dr_a = dr_info_a->dr;
3384 data_reference *dr_b = dr_info_b->dr;
3385 poly_int64 init_a, init_b;
3386 if (!operand_equal_p (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b), 0)
3387 || !operand_equal_p (DR_OFFSET (dr_a), DR_OFFSET (dr_b), 0)
3388 || !operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0)
3389 || !poly_int_tree_p (DR_INIT (dr_a), &init_a)
3390 || !poly_int_tree_p (DR_INIT (dr_b), &init_b)
3391 || !ordered_p (init_a, init_b))
3392 return false;
3393
3394 /* Sort DR_A and DR_B by the address they access. */
3395 if (maybe_lt (init_b, init_a))
3396 {
3397 std::swap (init_a, init_b);
3398 std::swap (dr_info_a, dr_info_b);
3399 std::swap (dr_a, dr_b);
3400 }
3401
3402 /* If the two accesses could be dependent within a scalar iteration,
3403 make sure that we'd retain their order. */
3404 if (maybe_gt (init_a + vect_get_scalar_dr_size (dr_info_a), init_b)
3405 && !vect_preserves_scalar_order_p (dr_info_a, dr_info_b))
3406 return false;
3407
3408 /* There is no alias if abs (DR_STEP) is greater than or equal to
3409 the bytes spanned by the combination of the two accesses. */
3410 *lower_bound_out = init_b + vect_get_scalar_dr_size (dr_info_b) - init_a;
3411 return true;
3412 }
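
/* A sketch of the computation above (types and offsets invented for
   illustration): for two int accesses p[i * n] and p[i * n + 1] the
   DR_INITs differ by 4 and the second access is 4 bytes wide, so
   *LOWER_BOUND_OUT becomes 8; the references are then known not to
   alias whenever abs (DR_STEP), here abs (4 * n) bytes, is >= 8.  */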
3413
3414 /* Function vect_prune_runtime_alias_test_list.
3415
3416 Prune a list of ddrs to be tested at run-time by versioning for alias.
3417 Merge several alias checks into one if possible.
3418 Return FALSE if the resulting list of ddrs is longer than allowed by
3419 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
3420
3421 opt_result
3422 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
3423 {
3424 typedef pair_hash <tree_operand_hash, tree_operand_hash> tree_pair_hash;
3425 hash_set <tree_pair_hash> compared_objects;
3426
3427 vec<ddr_p> may_alias_ddrs = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
3428 vec<dr_with_seg_len_pair_t> &comp_alias_ddrs
3429 = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3430 vec<vec_object_pair> &check_unequal_addrs
3431 = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3432 poly_uint64 vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3433 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
3434
3435 ddr_p ddr;
3436 unsigned int i;
3437 tree length_factor;
3438
3439 DUMP_VECT_SCOPE ("vect_prune_runtime_alias_test_list");
3440
3441 /* Step values are irrelevant for aliasing if the number of vector
3442 iterations is equal to the number of scalar iterations (which can
3443 happen for fully-SLP loops). */
3444 bool ignore_step_p = known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U);
3445
3446 if (!ignore_step_p)
3447 {
3448 /* Convert the checks for nonzero steps into bound tests. */
3449 tree value;
3450 FOR_EACH_VEC_ELT (LOOP_VINFO_CHECK_NONZERO (loop_vinfo), i, value)
3451 vect_check_lower_bound (loop_vinfo, value, true, 1);
3452 }
3453
3454 if (may_alias_ddrs.is_empty ())
3455 return opt_result::success ();
3456
3457 comp_alias_ddrs.create (may_alias_ddrs.length ());
3458
3459 unsigned int loop_depth
3460 = index_in_loop_nest (LOOP_VINFO_LOOP (loop_vinfo)->num,
3461 LOOP_VINFO_LOOP_NEST (loop_vinfo));
3462
3463 /* First, we collect all data ref pairs for aliasing checks. */
3464 FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
3465 {
3466 int comp_res;
3467 poly_uint64 lower_bound;
3468 tree segment_length_a, segment_length_b;
3469 unsigned HOST_WIDE_INT access_size_a, access_size_b;
3470 unsigned int align_a, align_b;
3471
3472 /* Ignore the alias if the VF we chose ended up being no greater
3473 than the dependence distance. */
3474 if (dependence_distance_ge_vf (ddr, loop_depth, vect_factor))
3475 continue;
3476
3477 if (DDR_OBJECT_A (ddr))
3478 {
3479 vec_object_pair new_pair (DDR_OBJECT_A (ddr), DDR_OBJECT_B (ddr));
3480 if (!compared_objects.add (new_pair))
3481 {
3482 if (dump_enabled_p ())
3483 dump_printf_loc (MSG_NOTE, vect_location,
3484 "checking that %T and %T"
3485 " have different addresses\n",
3486 new_pair.first, new_pair.second);
3487 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).safe_push (new_pair);
3488 }
3489 continue;
3490 }
3491
3492 dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (DDR_A (ddr));
3493 stmt_vec_info stmt_info_a = dr_info_a->stmt;
3494
3495 dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (DDR_B (ddr));
3496 stmt_vec_info stmt_info_b = dr_info_b->stmt;
3497
3498 /* Skip the pair if inter-iteration dependencies are irrelevant
3499 and intra-iteration dependencies are guaranteed to be honored. */
3500 if (ignore_step_p
3501 && (vect_preserves_scalar_order_p (dr_info_a, dr_info_b)
3502 || vectorizable_with_step_bound_p (dr_info_a, dr_info_b,
3503 &lower_bound)))
3504 {
3505 if (dump_enabled_p ())
3506 dump_printf_loc (MSG_NOTE, vect_location,
3507 "no need for alias check between "
3508 "%T and %T when VF is 1\n",
3509 DR_REF (dr_info_a->dr), DR_REF (dr_info_b->dr));
3510 continue;
3511 }
3512
3513 /* See whether we can handle the alias using a bounds check on
3514 the step, and whether that's likely to be the best approach.
3515 (It might not be, for example, if the minimum step is much larger
3516 than the number of bytes handled by one vector iteration.) */
3517 if (!ignore_step_p
3518 && TREE_CODE (DR_STEP (dr_info_a->dr)) != INTEGER_CST
3519 && vectorizable_with_step_bound_p (dr_info_a, dr_info_b,
3520 &lower_bound)
3521 && (vect_small_gap_p (loop_vinfo, dr_info_a, lower_bound)
3522 || vect_small_gap_p (loop_vinfo, dr_info_b, lower_bound)))
3523 {
3524 bool unsigned_p = dr_known_forward_stride_p (dr_info_a->dr);
3525 if (dump_enabled_p ())
3526 {
3527 dump_printf_loc (MSG_NOTE, vect_location, "no alias between "
3528 "%T and %T when the step %T is outside ",
3529 DR_REF (dr_info_a->dr),
3530 DR_REF (dr_info_b->dr),
3531 DR_STEP (dr_info_a->dr));
3532 if (unsigned_p)
3533 dump_printf (MSG_NOTE, "[0");
3534 else
3535 {
3536 dump_printf (MSG_NOTE, "(");
3537 dump_dec (MSG_NOTE, poly_int64 (-lower_bound));
3538 }
3539 dump_printf (MSG_NOTE, ", ");
3540 dump_dec (MSG_NOTE, lower_bound);
3541 dump_printf (MSG_NOTE, ")\n");
3542 }
3543 vect_check_lower_bound (loop_vinfo, DR_STEP (dr_info_a->dr),
3544 unsigned_p, lower_bound);
3545 continue;
3546 }
3547
3548 stmt_vec_info dr_group_first_a = DR_GROUP_FIRST_ELEMENT (stmt_info_a);
3549 if (dr_group_first_a)
3550 {
3551 stmt_info_a = dr_group_first_a;
3552 dr_info_a = STMT_VINFO_DR_INFO (stmt_info_a);
3553 }
3554
3555 stmt_vec_info dr_group_first_b = DR_GROUP_FIRST_ELEMENT (stmt_info_b);
3556 if (dr_group_first_b)
3557 {
3558 stmt_info_b = dr_group_first_b;
3559 dr_info_b = STMT_VINFO_DR_INFO (stmt_info_b);
3560 }
3561
3562 if (ignore_step_p)
3563 {
3564 segment_length_a = size_zero_node;
3565 segment_length_b = size_zero_node;
3566 }
3567 else
3568 {
3569 if (!operand_equal_p (DR_STEP (dr_info_a->dr),
3570 DR_STEP (dr_info_b->dr), 0))
3571 length_factor = scalar_loop_iters;
3572 else
3573 length_factor = size_int (vect_factor);
3574 segment_length_a = vect_vfa_segment_size (dr_info_a, length_factor);
3575 segment_length_b = vect_vfa_segment_size (dr_info_b, length_factor);
3576 }
3577 access_size_a = vect_vfa_access_size (dr_info_a);
3578 access_size_b = vect_vfa_access_size (dr_info_b);
3579 align_a = vect_vfa_align (dr_info_a);
3580 align_b = vect_vfa_align (dr_info_b);
3581
3582 comp_res = data_ref_compare_tree (DR_BASE_ADDRESS (dr_info_a->dr),
3583 DR_BASE_ADDRESS (dr_info_b->dr));
3584 if (comp_res == 0)
3585 comp_res = data_ref_compare_tree (DR_OFFSET (dr_info_a->dr),
3586 DR_OFFSET (dr_info_b->dr));
3587
3588 /* See whether the alias is known at compilation time. */
3589 if (comp_res == 0
3590 && TREE_CODE (DR_STEP (dr_info_a->dr)) == INTEGER_CST
3591 && TREE_CODE (DR_STEP (dr_info_b->dr)) == INTEGER_CST
3592 && poly_int_tree_p (segment_length_a)
3593 && poly_int_tree_p (segment_length_b))
3594 {
3595 int res = vect_compile_time_alias (dr_info_a, dr_info_b,
3596 segment_length_a,
3597 segment_length_b,
3598 access_size_a,
3599 access_size_b);
3600 if (res >= 0 && dump_enabled_p ())
3601 {
3602 dump_printf_loc (MSG_NOTE, vect_location,
3603 "can tell at compile time that %T and %T",
3604 DR_REF (dr_info_a->dr), DR_REF (dr_info_b->dr));
3605 if (res == 0)
3606 dump_printf (MSG_NOTE, " do not alias\n");
3607 else
3608 dump_printf (MSG_NOTE, " alias\n");
3609 }
3610
3611 if (res == 0)
3612 continue;
3613
3614 if (res == 1)
3615 return opt_result::failure_at (stmt_info_b->stmt,
3616 "not vectorized:"
3617 " compilation time alias: %G%G",
3618 stmt_info_a->stmt,
3619 stmt_info_b->stmt);
3620 }
3621
3622 dr_with_seg_len_pair_t dr_with_seg_len_pair
3623 (dr_with_seg_len (dr_info_a->dr, segment_length_a,
3624 access_size_a, align_a),
3625 dr_with_seg_len (dr_info_b->dr, segment_length_b,
3626 access_size_b, align_b));
3627
3628 /* Canonicalize pairs by sorting the two DR members. */
3629 if (comp_res > 0)
3630 std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
3631
3632 comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
3633 }
3634
3635 prune_runtime_alias_test_list (&comp_alias_ddrs, vect_factor);
3636
3637 unsigned int count = (comp_alias_ddrs.length ()
3638 + check_unequal_addrs.length ());
3639
3640 if (dump_enabled_p ())
3641 dump_printf_loc (MSG_NOTE, vect_location,
3642 "improved number of alias checks from %d to %d\n",
3643 may_alias_ddrs.length (), count);
3644 if ((int) count > PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
3645 return opt_result::failure_at
3646 (vect_location,
3647 "number of versioning for alias "
3648 "run-time tests exceeds %d "
3649 "(--param vect-max-version-for-alias-checks)\n",
3650 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
3651
3652 return opt_result::success ();
3653 }
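
/* A minimal example of the kind of loop this function versions for
   (hypothetical user code, not part of GCC):

     void
     f (int *a, int *b, int n)
     {
       for (int i = 0; i < n; i++)
         a[i] = b[i] + 1;
     }

   Nothing is known about A and B at compile time, so the vectorized
   loop is guarded by a runtime test that the accessed segments do not
   overlap; the code above merges and prunes such tests and fails if
   more than PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS remain.  */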
3654
3655 /* Check whether we can use an internal function for a gather load
3656 or scatter store. READ_P is true for loads and false for stores.
3657 MASKED_P is true if the load or store is conditional. MEMORY_TYPE is
3658 the type of the memory elements being loaded or stored. OFFSET_BITS
3659 is the number of bits in each scalar offset and OFFSET_SIGN is the
3660 sign of the offset. SCALE is the amount by which the offset should
3661 be multiplied *after* it has been converted to address width.
3662
3663 Return true if the function is supported, storing the function
3664 id in *IFN_OUT and the type of a vector element in *ELEMENT_TYPE_OUT. */
3665
3666 bool
3667 vect_gather_scatter_fn_p (bool read_p, bool masked_p, tree vectype,
3668 tree memory_type, unsigned int offset_bits,
3669 signop offset_sign, int scale,
3670 internal_fn *ifn_out, tree *element_type_out)
3671 {
3672 unsigned int memory_bits = tree_to_uhwi (TYPE_SIZE (memory_type));
3673 unsigned int element_bits = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (vectype)));
3674 if (offset_bits > element_bits)
3675 /* Internal functions require the offset to be the same width as
3676 the vector elements. We can extend narrower offsets, but it isn't
3677 safe to truncate wider offsets. */
3678 return false;
3679
3680 if (element_bits != memory_bits)
3681 /* For now the vector elements must be the same width as the
3682 memory elements. */
3683 return false;
3684
3685 /* Work out which function we need. */
3686 internal_fn ifn;
3687 if (read_p)
3688 ifn = masked_p ? IFN_MASK_GATHER_LOAD : IFN_GATHER_LOAD;
3689 else
3690 ifn = masked_p ? IFN_MASK_SCATTER_STORE : IFN_SCATTER_STORE;
3691
3692 /* Test whether the target supports this combination. */
3693 if (!internal_gather_scatter_fn_supported_p (ifn, vectype, memory_type,
3694 offset_sign, scale))
3695 return false;
3696
3697 *ifn_out = ifn;
3698 *element_type_out = TREE_TYPE (vectype);
3699 return true;
3700 }
3701
3702 /* STMT_INFO is a call to an internal gather load or scatter store function.
3703 Describe the operation in INFO. */
3704
3705 static void
3706 vect_describe_gather_scatter_call (stmt_vec_info stmt_info,
3707 gather_scatter_info *info)
3708 {
3709 gcall *call = as_a <gcall *> (stmt_info->stmt);
3710 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3711 data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3712
3713 info->ifn = gimple_call_internal_fn (call);
3714 info->decl = NULL_TREE;
3715 info->base = gimple_call_arg (call, 0);
3716 info->offset = gimple_call_arg (call, 1);
3717 info->offset_dt = vect_unknown_def_type;
3718 info->offset_vectype = NULL_TREE;
3719 info->scale = TREE_INT_CST_LOW (gimple_call_arg (call, 2));
3720 info->element_type = TREE_TYPE (vectype);
3721 info->memory_type = TREE_TYPE (DR_REF (dr));
3722 }
3723
3724 /* Return true if a non-affine read or write in STMT_INFO is suitable for a
3725 gather load or scatter store. Describe the operation in *INFO if so. */
3726
3727 bool
3728 vect_check_gather_scatter (stmt_vec_info stmt_info, loop_vec_info loop_vinfo,
3729 gather_scatter_info *info)
3730 {
3731 HOST_WIDE_INT scale = 1;
3732 poly_int64 pbitpos, pbitsize;
3733 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3734 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3735 tree offtype = NULL_TREE;
3736 tree decl = NULL_TREE, base, off;
3737 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3738 tree memory_type = TREE_TYPE (DR_REF (dr));
3739 machine_mode pmode;
3740 int punsignedp, reversep, pvolatilep = 0;
3741 internal_fn ifn;
3742 tree element_type;
3743 bool masked_p = false;
3744
3745 /* See whether this is already a call to a gather/scatter internal function.
3746 If not, see whether it's a masked load or store. */
3747 gcall *call = dyn_cast <gcall *> (stmt_info->stmt);
3748 if (call && gimple_call_internal_p (call))
3749 {
3750 ifn = gimple_call_internal_fn (call);
3751 if (internal_gather_scatter_fn_p (ifn))
3752 {
3753 vect_describe_gather_scatter_call (stmt_info, info);
3754 return true;
3755 }
3756 masked_p = (ifn == IFN_MASK_LOAD || ifn == IFN_MASK_STORE);
3757 }
3758
3759 /* True if we should aim to use internal functions rather than
3760 built-in functions. */
3761 bool use_ifn_p = (DR_IS_READ (dr)
3762 ? supports_vec_gather_load_p ()
3763 : supports_vec_scatter_store_p ());
3764
3765 base = DR_REF (dr);
3766 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
3767 see if we can use the def stmt of the address. */
3768 if (masked_p
3769 && TREE_CODE (base) == MEM_REF
3770 && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
3771 && integer_zerop (TREE_OPERAND (base, 1))
3772 && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
3773 {
3774 gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
3775 if (is_gimple_assign (def_stmt)
3776 && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
3777 base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
3778 }
3779
3780 /* The gather and scatter builtins need address of the form
3781 loop_invariant + vector * {1, 2, 4, 8}
3782 or
3783 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3784 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3785 of loop invariants/SSA_NAMEs defined in the loop, with casts,
3786 multiplications and additions in it. To get a vector, we need
3787 a single SSA_NAME that will be defined in the loop and will
3788 contain everything that is not loop invariant and that can be
3789 vectorized. The following code attempts to find such a preexisting
3790 SSA_NAME OFF and put the loop invariants into a tree BASE
3791 that can be gimplified before the loop. */
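/* For instance, a loop like the following (illustrative source only)
   produces such an address, with BASE the loop-invariant pointer and
   OFF the SSA_NAME fed by idx[i]:

     for (int i = 0; i < n; i++)
       out[i] = base[idx[i]];

   i.e. an access of the form base + sign_extend (idx[i]) * 4 for a
   4-byte element type.  */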
3792 base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
3793 &punsignedp, &reversep, &pvolatilep);
3794 if (reversep)
3795 return false;
3796
3797 poly_int64 pbytepos = exact_div (pbitpos, BITS_PER_UNIT);
3798
3799 if (TREE_CODE (base) == MEM_REF)
3800 {
3801 if (!integer_zerop (TREE_OPERAND (base, 1)))
3802 {
3803 if (off == NULL_TREE)
3804 off = wide_int_to_tree (sizetype, mem_ref_offset (base));
3805 else
3806 off = size_binop (PLUS_EXPR, off,
3807 fold_convert (sizetype, TREE_OPERAND (base, 1)));
3808 }
3809 base = TREE_OPERAND (base, 0);
3810 }
3811 else
3812 base = build_fold_addr_expr (base);
3813
3814 if (off == NULL_TREE)
3815 off = size_zero_node;
3816
3817 /* If base is not loop invariant then, if off is 0, we start with just
3818 the constant offset in the loop invariant BASE and continue with base
3819 as OFF; otherwise we give up.
3820 We could handle that case by gimplifying the addition of base + off
3821 into some SSA_NAME and using that as off, but for now punt. */
3822 if (!expr_invariant_in_loop_p (loop, base))
3823 {
3824 if (!integer_zerop (off))
3825 return false;
3826 off = base;
3827 base = size_int (pbytepos);
3828 }
3829 /* Otherwise put base + constant offset into the loop invariant BASE
3830 and continue with OFF. */
3831 else
3832 {
3833 base = fold_convert (sizetype, base);
3834 base = size_binop (PLUS_EXPR, base, size_int (pbytepos));
3835 }
3836
3837 /* OFF at this point may be either a SSA_NAME or some tree expression
3838 from get_inner_reference. Try to peel off loop invariants from it
3839 into BASE as long as possible. */
3840 STRIP_NOPS (off);
3841 while (offtype == NULL_TREE)
3842 {
3843 enum tree_code code;
3844 tree op0, op1, add = NULL_TREE;
3845
3846 if (TREE_CODE (off) == SSA_NAME)
3847 {
3848 gimple *def_stmt = SSA_NAME_DEF_STMT (off);
3849
3850 if (expr_invariant_in_loop_p (loop, off))
3851 return false;
3852
3853 if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
3854 break;
3855
3856 op0 = gimple_assign_rhs1 (def_stmt);
3857 code = gimple_assign_rhs_code (def_stmt);
3858 op1 = gimple_assign_rhs2 (def_stmt);
3859 }
3860 else
3861 {
3862 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
3863 return false;
3864 code = TREE_CODE (off);
3865 extract_ops_from_tree (off, &code, &op0, &op1);
3866 }
3867 switch (code)
3868 {
3869 case POINTER_PLUS_EXPR:
3870 case PLUS_EXPR:
3871 if (expr_invariant_in_loop_p (loop, op0))
3872 {
3873 add = op0;
3874 off = op1;
3875 do_add:
3876 add = fold_convert (sizetype, add);
3877 if (scale != 1)
3878 add = size_binop (MULT_EXPR, add, size_int (scale));
3879 base = size_binop (PLUS_EXPR, base, add);
3880 continue;
3881 }
3882 if (expr_invariant_in_loop_p (loop, op1))
3883 {
3884 add = op1;
3885 off = op0;
3886 goto do_add;
3887 }
3888 break;
3889 case MINUS_EXPR:
3890 if (expr_invariant_in_loop_p (loop, op1))
3891 {
3892 add = fold_convert (sizetype, op1);
3893 add = size_binop (MINUS_EXPR, size_zero_node, add);
3894 off = op0;
3895 goto do_add;
3896 }
3897 break;
3898 case MULT_EXPR:
3899 if (scale == 1 && tree_fits_shwi_p (op1))
3900 {
3901 int new_scale = tree_to_shwi (op1);
3902 /* Only treat this as a scaling operation if the target
3903 supports it. */
3904 if (use_ifn_p
3905 && !vect_gather_scatter_fn_p (DR_IS_READ (dr), masked_p,
3906 vectype, memory_type, 1,
3907 TYPE_SIGN (TREE_TYPE (op0)),
3908 new_scale, &ifn,
3909 &element_type))
3910 break;
3911 scale = new_scale;
3912 off = op0;
3913 continue;
3914 }
3915 break;
3916 case SSA_NAME:
3917 off = op0;
3918 continue;
3919 CASE_CONVERT:
3920 if (!POINTER_TYPE_P (TREE_TYPE (op0))
3921 && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3922 break;
3923 if (TYPE_PRECISION (TREE_TYPE (op0))
3924 == TYPE_PRECISION (TREE_TYPE (off)))
3925 {
3926 off = op0;
3927 continue;
3928 }
3929
3930 /* The internal functions need the offset to be the same width
3931 as the elements of VECTYPE. Don't include operations that
3932 cast the offset from that width to a different width. */
3933 if (use_ifn_p
3934 && (int_size_in_bytes (TREE_TYPE (vectype))
3935 == int_size_in_bytes (TREE_TYPE (off))))
3936 break;
3937
3938 if (TYPE_PRECISION (TREE_TYPE (op0))
3939 < TYPE_PRECISION (TREE_TYPE (off)))
3940 {
3941 off = op0;
3942 offtype = TREE_TYPE (off);
3943 STRIP_NOPS (off);
3944 continue;
3945 }
3946 break;
3947 default:
3948 break;
3949 }
3950 break;
3951 }
3952
3953 /* If at the end OFF still isn't a SSA_NAME or isn't
3954 defined in the loop, punt. */
3955 if (TREE_CODE (off) != SSA_NAME
3956 || expr_invariant_in_loop_p (loop, off))
3957 return false;
3958
3959 if (offtype == NULL_TREE)
3960 offtype = TREE_TYPE (off);
3961
3962 if (use_ifn_p)
3963 {
3964 if (!vect_gather_scatter_fn_p (DR_IS_READ (dr), masked_p, vectype,
3965 memory_type, TYPE_PRECISION (offtype),
3966 TYPE_SIGN (offtype), scale, &ifn,
3967 &element_type))
3968 return false;
3969 }
3970 else
3971 {
3972 if (DR_IS_READ (dr))
3973 {
3974 if (targetm.vectorize.builtin_gather)
3975 decl = targetm.vectorize.builtin_gather (vectype, offtype, scale);
3976 }
3977 else
3978 {
3979 if (targetm.vectorize.builtin_scatter)
3980 decl = targetm.vectorize.builtin_scatter (vectype, offtype, scale);
3981 }
3982
3983 if (!decl)
3984 return false;
3985
3986 ifn = IFN_LAST;
3987 element_type = TREE_TYPE (vectype);
3988 }
3989
3990 info->ifn = ifn;
3991 info->decl = decl;
3992 info->base = base;
3993 info->offset = off;
3994 info->offset_dt = vect_unknown_def_type;
3995 info->offset_vectype = NULL_TREE;
3996 info->scale = scale;
3997 info->element_type = element_type;
3998 info->memory_type = memory_type;
3999 return true;
4000 }
4001
4002 /* Find the data references in STMT, analyze them with respect to LOOP and
4003 append them to DATAREFS. Return false if datarefs in this stmt cannot
4004 be handled. */
4005
4006 opt_result
4007 vect_find_stmt_data_reference (loop_p loop, gimple *stmt,
4008 vec<data_reference_p> *datarefs)
4009 {
4010 /* We can ignore clobbers for dataref analysis - they are removed during
4011 loop vectorization and BB vectorization checks dependences with a
4012 stmt walk. */
4013 if (gimple_clobber_p (stmt))
4014 return opt_result::success ();
4015
4016 if (gimple_has_volatile_ops (stmt))
4017 return opt_result::failure_at (stmt, "not vectorized: volatile type: %G",
4018 stmt);
4019
4020 if (stmt_can_throw_internal (cfun, stmt))
4021 return opt_result::failure_at (stmt,
4022 "not vectorized:"
4023 " statement can throw an exception: %G",
4024 stmt);
4025
4026 auto_vec<data_reference_p, 2> refs;
4027 opt_result res = find_data_references_in_stmt (loop, stmt, &refs);
4028 if (!res)
4029 return res;
4030
4031 if (refs.is_empty ())
4032 return opt_result::success ();
4033
4034 if (refs.length () > 1)
4035 return opt_result::failure_at (stmt,
4036 "not vectorized:"
4037 " more than one data ref in stmt: %G", stmt);
4038
4039 if (gcall *call = dyn_cast <gcall *> (stmt))
4040 if (!gimple_call_internal_p (call)
4041 || (gimple_call_internal_fn (call) != IFN_MASK_LOAD
4042 && gimple_call_internal_fn (call) != IFN_MASK_STORE))
4043 return opt_result::failure_at (stmt,
4044 "not vectorized: dr in a call %G", stmt);
4045
4046 data_reference_p dr = refs.pop ();
4047 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
4048 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
4049 return opt_result::failure_at (stmt,
4050 "not vectorized:"
4051 " statement is bitfield access %G", stmt);
4052
4053 if (DR_BASE_ADDRESS (dr)
4054 && TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
4055 return opt_result::failure_at (stmt,
4056 "not vectorized:"
4057 " base addr of dr is a constant\n");
4058
4059 /* Check whether this may be a SIMD lane access and adjust the
4060 DR to make it easier for us to handle it. */
4061 if (loop
4062 && loop->simduid
4063 && (!DR_BASE_ADDRESS (dr)
4064 || !DR_OFFSET (dr)
4065 || !DR_INIT (dr)
4066 || !DR_STEP (dr)))
4067 {
4068 struct data_reference *newdr
4069 = create_data_ref (NULL, loop_containing_stmt (stmt), DR_REF (dr), stmt,
4070 DR_IS_READ (dr), DR_IS_CONDITIONAL_IN_STMT (dr));
4071 if (DR_BASE_ADDRESS (newdr)
4072 && DR_OFFSET (newdr)
4073 && DR_INIT (newdr)
4074 && DR_STEP (newdr)
4075 && TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
4076 && integer_zerop (DR_STEP (newdr)))
4077 {
4078 tree base_address = DR_BASE_ADDRESS (newdr);
4079 tree off = DR_OFFSET (newdr);
4080 tree step = ssize_int (1);
4081 if (integer_zerop (off)
4082 && TREE_CODE (base_address) == POINTER_PLUS_EXPR)
4083 {
4084 off = TREE_OPERAND (base_address, 1);
4085 base_address = TREE_OPERAND (base_address, 0);
4086 }
4087 STRIP_NOPS (off);
4088 if (TREE_CODE (off) == MULT_EXPR
4089 && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
4090 {
4091 step = TREE_OPERAND (off, 1);
4092 off = TREE_OPERAND (off, 0);
4093 STRIP_NOPS (off);
4094 }
4095 if (CONVERT_EXPR_P (off)
4096 && (TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off, 0)))
4097 < TYPE_PRECISION (TREE_TYPE (off))))
4098 off = TREE_OPERAND (off, 0);
4099 if (TREE_CODE (off) == SSA_NAME)
4100 {
4101 gimple *def = SSA_NAME_DEF_STMT (off);
4102 /* Look through widening conversion. */
4103 if (is_gimple_assign (def)
4104 && CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (def)))
4105 {
4106 tree rhs1 = gimple_assign_rhs1 (def);
4107 if (TREE_CODE (rhs1) == SSA_NAME
4108 && INTEGRAL_TYPE_P (TREE_TYPE (rhs1))
4109 && (TYPE_PRECISION (TREE_TYPE (off))
4110 > TYPE_PRECISION (TREE_TYPE (rhs1))))
4111 def = SSA_NAME_DEF_STMT (rhs1);
4112 }
4113 if (is_gimple_call (def)
4114 && gimple_call_internal_p (def)
4115 && (gimple_call_internal_fn (def) == IFN_GOMP_SIMD_LANE))
4116 {
4117 tree arg = gimple_call_arg (def, 0);
4118 tree reft = TREE_TYPE (DR_REF (newdr));
4119 gcc_assert (TREE_CODE (arg) == SSA_NAME);
4120 arg = SSA_NAME_VAR (arg);
4121 if (arg == loop->simduid
4122 /* For now. */
4123 && tree_int_cst_equal (TYPE_SIZE_UNIT (reft), step))
4124 {
4125 DR_BASE_ADDRESS (newdr) = base_address;
4126 DR_OFFSET (newdr) = ssize_int (0);
4127 DR_STEP (newdr) = step;
4128 DR_OFFSET_ALIGNMENT (newdr) = BIGGEST_ALIGNMENT;
4129 DR_STEP_ALIGNMENT (newdr) = highest_pow2_factor (step);
4130 /* Mark as simd-lane access. */
4131 tree arg2 = gimple_call_arg (def, 1);
4132 newdr->aux = (void *) (-1 - tree_to_uhwi (arg2));
4133 free_data_ref (dr);
4134 datarefs->safe_push (newdr);
4135 return opt_result::success ();
4136 }
4137 }
4138 }
4139 }
4140 free_data_ref (newdr);
4141 }
4142
4143 datarefs->safe_push (dr);
4144 return opt_result::success ();
4145 }
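
/* The SIMD lane handling above typically triggers for variables that
   are privatized in an OpenMP simd loop and need to live in memory
   (illustrative source only, assuming -fopenmp-simd):

     #pragma omp simd
     for (int i = 0; i < n; i++)
       {
         int t[2];
         t[0] = a[i];
         t[1] = a[i] + 1;
         b[i] = t[0] * t[1];
       }

   The privatized T can be lowered to a per-lane array indexed by the
   result of a .GOMP_SIMD_LANE call on the loop's simduid; the code
   above recognizes that pattern and records the reference as a
   simd-lane access via newdr->aux.  */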
4146
4147 /* Function vect_analyze_data_refs.
4148
4149 Find all the data references in the loop or basic block.
4150
4151 The general structure of the analysis of data refs in the vectorizer is as
4152 follows:
4153 1- vect_analyze_data_refs(loop/bb): call
4154 compute_data_dependences_for_loop/bb to find and analyze all data-refs
4155 in the loop/bb and their dependences.
4156 2- vect_analyze_data_ref_dependences(): apply dependence testing using ddrs.
4157 3- vect_analyze_data_refs_alignment(): check that ref_stmt.alignment is ok.
4158 4- vect_analyze_data_ref_accesses(): check that ref_stmt.step is ok.
4159
4160 */
4161
4162 opt_result
4163 vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf, bool *fatal)
4164 {
4165 class loop *loop = NULL;
4166 unsigned int i;
4167 struct data_reference *dr;
4168 tree scalar_type;
4169
4170 DUMP_VECT_SCOPE ("vect_analyze_data_refs");
4171
4172 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
4173 loop = LOOP_VINFO_LOOP (loop_vinfo);
4174
4175 /* Go through the data-refs, check that the analysis succeeded. Update
4176 pointer from stmt_vec_info struct to DR and vectype. */
4177
4178 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
4179 FOR_EACH_VEC_ELT (datarefs, i, dr)
4180 {
4181 enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
4182 poly_uint64 vf;
4183
4184 gcc_assert (DR_REF (dr));
4185 stmt_vec_info stmt_info = vinfo->lookup_stmt (DR_STMT (dr));
4186 gcc_assert (!stmt_info->dr_aux.dr);
4187 stmt_info->dr_aux.dr = dr;
4188 stmt_info->dr_aux.stmt = stmt_info;
4189
4190 /* Check that analysis of the data-ref succeeded. */
4191 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
4192 || !DR_STEP (dr))
4193 {
4194 bool maybe_gather
4195 = DR_IS_READ (dr)
4196 && !TREE_THIS_VOLATILE (DR_REF (dr))
4197 && (targetm.vectorize.builtin_gather != NULL
4198 || supports_vec_gather_load_p ());
4199 bool maybe_scatter
4200 = DR_IS_WRITE (dr)
4201 && !TREE_THIS_VOLATILE (DR_REF (dr))
4202 && (targetm.vectorize.builtin_scatter != NULL
4203 || supports_vec_scatter_store_p ());
4204
4205 /* If target supports vector gather loads or scatter stores,
4206 see whether they can be used. */
4207 if (is_a <loop_vec_info> (vinfo)
4208 && !nested_in_vect_loop_p (loop, stmt_info))
4209 {
4210 if (maybe_gather || maybe_scatter)
4211 {
4212 if (maybe_gather)
4213 gatherscatter = GATHER;
4214 else
4215 gatherscatter = SCATTER;
4216 }
4217 }
4218
4219 if (gatherscatter == SG_NONE)
4220 {
4221 if (dump_enabled_p ())
4222 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4223 "not vectorized: data ref analysis "
4224 "failed %G", stmt_info->stmt);
4225 if (is_a <bb_vec_info> (vinfo))
4226 {
4227 /* In BB vectorization the ref can still participate
4228 in dependence analysis, we just can't vectorize it. */
4229 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4230 continue;
4231 }
4232 return opt_result::failure_at (stmt_info->stmt,
4233 "not vectorized:"
4234 " data ref analysis failed: %G",
4235 stmt_info->stmt);
4236 }
4237 }
4238
4239 /* See if this was detected as SIMD lane access. */
4240 if (dr->aux == (void *)-1
4241 || dr->aux == (void *)-2
4242 || dr->aux == (void *)-3
4243 || dr->aux == (void *)-4)
4244 {
4245 if (nested_in_vect_loop_p (loop, stmt_info))
4246 return opt_result::failure_at (stmt_info->stmt,
4247 "not vectorized:"
4248 " data ref analysis failed: %G",
4249 stmt_info->stmt);
4250 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info)
4251 = -(uintptr_t) dr->aux;
4252 }
4253
4254 tree base = get_base_address (DR_REF (dr));
4255 if (base && VAR_P (base) && DECL_NONALIASED (base))
4256 {
4257 if (dump_enabled_p ())
4258 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4259 "not vectorized: base object not addressable "
4260 "for stmt: %G", stmt_info->stmt);
4261 if (is_a <bb_vec_info> (vinfo))
4262 {
4263 /* In BB vectorization the ref can still participate
4264 in dependence analysis, we just can't vectorize it. */
4265 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4266 continue;
4267 }
4268 return opt_result::failure_at (stmt_info->stmt,
4269 "not vectorized: base object not"
4270 " addressable for stmt: %G",
4271 stmt_info->stmt);
4272 }
4273
4274 if (is_a <loop_vec_info> (vinfo)
4275 && DR_STEP (dr)
4276 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
4277 {
4278 if (nested_in_vect_loop_p (loop, stmt_info))
4279 return opt_result::failure_at (stmt_info->stmt,
4280 "not vectorized:"
4281 " not suitable for strided load %G",
4282 stmt_info->stmt);
4283 STMT_VINFO_STRIDED_P (stmt_info) = true;
4284 }
4285
4286 /* Update DR field in stmt_vec_info struct. */
4287
4288 /* If the dataref is in an inner-loop of the loop that is considered
4289 for vectorization, we also want to analyze the access relative to
4290 the outer-loop (DR contains information only relative to the
4291 inner-most enclosing loop). We do that by building a reference to the
4292 first location accessed by the inner-loop, and analyze it relative to
4293 the outer-loop. */
4294 if (loop && nested_in_vect_loop_p (loop, stmt_info))
4295 {
4296 /* Build a reference to the first location accessed by the
4297 inner loop: *(BASE + INIT + OFFSET). By construction,
4298 this address must be invariant in the inner loop, so we
4299 can consider it as being used in the outer loop. */
4300 tree base = unshare_expr (DR_BASE_ADDRESS (dr));
4301 tree offset = unshare_expr (DR_OFFSET (dr));
4302 tree init = unshare_expr (DR_INIT (dr));
4303 tree init_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset),
4304 init, offset);
4305 tree init_addr = fold_build_pointer_plus (base, init_offset);
4306 tree init_ref = build_fold_indirect_ref (init_addr);
4307
4308 if (dump_enabled_p ())
4309 dump_printf_loc (MSG_NOTE, vect_location,
4310 "analyze in outer loop: %T\n", init_ref);
4311
4312 opt_result res
4313 = dr_analyze_innermost (&STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info),
4314 init_ref, loop, stmt_info->stmt);
4315 if (!res)
4316 /* dr_analyze_innermost already explained the failure. */
4317 return res;
4318
4319 if (dump_enabled_p ())
4320 dump_printf_loc (MSG_NOTE, vect_location,
4321 "\touter base_address: %T\n"
4322 "\touter offset from base address: %T\n"
4323 "\touter constant offset from base address: %T\n"
4324 "\touter step: %T\n"
4325 "\touter base alignment: %d\n\n"
4326 "\touter base misalignment: %d\n"
4327 "\touter offset alignment: %d\n"
4328 "\touter step alignment: %d\n",
4329 STMT_VINFO_DR_BASE_ADDRESS (stmt_info),
4330 STMT_VINFO_DR_OFFSET (stmt_info),
4331 STMT_VINFO_DR_INIT (stmt_info),
4332 STMT_VINFO_DR_STEP (stmt_info),
4333 STMT_VINFO_DR_BASE_ALIGNMENT (stmt_info),
4334 STMT_VINFO_DR_BASE_MISALIGNMENT (stmt_info),
4335 STMT_VINFO_DR_OFFSET_ALIGNMENT (stmt_info),
4336 STMT_VINFO_DR_STEP_ALIGNMENT (stmt_info));
4337 }
4338
4339 /* Set vectype for STMT. */
4340 scalar_type = TREE_TYPE (DR_REF (dr));
4341 STMT_VINFO_VECTYPE (stmt_info)
4342 = get_vectype_for_scalar_type (scalar_type);
4343 if (!STMT_VINFO_VECTYPE (stmt_info))
4344 {
4345 if (dump_enabled_p ())
4346 {
4347 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4348 "not vectorized: no vectype for stmt: %G",
4349 stmt_info->stmt);
4350 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
4351 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
4352 scalar_type);
4353 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
4354 }
4355
4356 if (is_a <bb_vec_info> (vinfo))
4357 {
4358 /* No vector type is fine, the ref can still participate
4359 in dependence analysis, we just can't vectorize it. */
4360 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4361 continue;
4362 }
4363 if (fatal)
4364 *fatal = false;
4365 return opt_result::failure_at (stmt_info->stmt,
4366 "not vectorized:"
4367 " no vectype for stmt: %G"
4368 " scalar_type: %T\n",
4369 stmt_info->stmt, scalar_type);
4370 }
4371 else
4372 {
4373 if (dump_enabled_p ())
4374 dump_printf_loc (MSG_NOTE, vect_location,
4375 "got vectype for stmt: %G%T\n",
4376 stmt_info->stmt, STMT_VINFO_VECTYPE (stmt_info));
4377 }
4378
4379 /* Adjust the minimal vectorization factor according to the
4380 vector type. */
4381 vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
4382 *min_vf = upper_bound (*min_vf, vf);
4383
4384 if (gatherscatter != SG_NONE)
4385 {
4386 gather_scatter_info gs_info;
4387 if (!vect_check_gather_scatter (stmt_info,
4388 as_a <loop_vec_info> (vinfo),
4389 &gs_info)
4390 || !get_vectype_for_scalar_type (TREE_TYPE (gs_info.offset)))
4391 {
4392 if (fatal)
4393 *fatal = false;
4394 return opt_result::failure_at
4395 (stmt_info->stmt,
4396 (gatherscatter == GATHER)
4397 ? "not vectorized: not suitable for gather load %G"
4398 : "not vectorized: not suitable for scatter store %G",
4399 stmt_info->stmt);
4400 }
4401 STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
4402 }
4403 }
4404
4405 /* We used to stop processing and prune the list here. Verify we no
4406 longer need to. */
4407 gcc_assert (i == datarefs.length ());
4408
4409 return opt_result::success ();
4410 }
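
/* Two of the cases classified above, as illustrative user code only:

     for (i = 0; i < n; i++)
       x[i] = a[i * stride];

   has a DR_STEP that is not an INTEGER_CST, so the access is marked
   STMT_VINFO_STRIDED_P, whereas

     for (i = 0; i < n; i++)
       x[i] = a[idx[i]];

   yields no affine base/offset/init/step at all and is instead treated
   as a gather candidate when the target supports gather loads.  */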
4411
4412
4413 /* Function vect_get_new_vect_var.
4414
4415 Returns a name for a new variable. The current naming scheme prepends
4416 a prefix such as "vect_" or "vectp_" (depending on the value of VAR_KIND)
4417 to the name of the vectorizer-generated variable, appending NAME to the
4418 prefix if it is provided. */
4419
4420 tree
4421 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
4422 {
4423 const char *prefix;
4424 tree new_vect_var;
4425
4426 switch (var_kind)
4427 {
4428 case vect_simple_var:
4429 prefix = "vect";
4430 break;
4431 case vect_scalar_var:
4432 prefix = "stmp";
4433 break;
4434 case vect_mask_var:
4435 prefix = "mask";
4436 break;
4437 case vect_pointer_var:
4438 prefix = "vectp";
4439 break;
4440 default:
4441 gcc_unreachable ();
4442 }
4443
4444 if (name)
4445 {
4446 char* tmp = concat (prefix, "_", name, NULL);
4447 new_vect_var = create_tmp_reg (type, tmp);
4448 free (tmp);
4449 }
4450 else
4451 new_vect_var = create_tmp_reg (type, prefix);
4452
4453 return new_vect_var;
4454 }
4455
4456 /* Like vect_get_new_vect_var but return an SSA name. */
4457
4458 tree
4459 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
4460 {
4461 const char *prefix;
4462 tree new_vect_var;
4463
4464 switch (var_kind)
4465 {
4466 case vect_simple_var:
4467 prefix = "vect";
4468 break;
4469 case vect_scalar_var:
4470 prefix = "stmp";
4471 break;
4472 case vect_pointer_var:
4473 prefix = "vectp";
4474 break;
4475 default:
4476 gcc_unreachable ();
4477 }
4478
4479 if (name)
4480 {
4481 char* tmp = concat (prefix, "_", name, NULL);
4482 new_vect_var = make_temp_ssa_name (type, NULL, tmp);
4483 free (tmp);
4484 }
4485 else
4486 new_vect_var = make_temp_ssa_name (type, NULL, prefix);
4487
4488 return new_vect_var;
4489 }
4490
4491 /* Duplicate ptr info and set alignment/misalignment on NAME from DR_INFO. */
4492
4493 static void
4494 vect_duplicate_ssa_name_ptr_info (tree name, dr_vec_info *dr_info)
4495 {
4496 duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr_info->dr));
4497 int misalign = DR_MISALIGNMENT (dr_info);
4498 if (misalign == DR_MISALIGNMENT_UNKNOWN)
4499 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
4500 else
4501 set_ptr_info_alignment (SSA_NAME_PTR_INFO (name),
4502 known_alignment (DR_TARGET_ALIGNMENT (dr_info)),
4503 misalign);
4504 }
4505
4506 /* Function vect_create_addr_base_for_vector_ref.
4507
4508 Create an expression that computes the address of the first memory location
4509 that will be accessed for a data reference.
4510
4511 Input:
4512 STMT_INFO: The statement containing the data reference.
4513 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
4514 OFFSET: Optional. If supplied, it is added to the initial address.
4515 LOOP: Specify relative to which loop-nest should the address be computed.
4516 For example, when the dataref is in an inner-loop nested in an
4517 outer-loop that is now being vectorized, LOOP can be either the
4518 outer-loop, or the inner-loop. The first memory location accessed
4519 by the following dataref ('in' points to short):
4520
4521 for (i=0; i<N; i++)
4522 for (j=0; j<M; j++)
4523 s += in[i+j]
4524
4525 is as follows:
4526 if LOOP=i_loop: &in (relative to i_loop)
4527 if LOOP=j_loop: &in+i*2B (relative to j_loop)
4528 BYTE_OFFSET: Optional, defaulted to NULL. If supplied, it is added to the
4529 initial address. Unlike OFFSET, which is the number of elements to
4530 be added, BYTE_OFFSET is measured in bytes.
4531
4532 Output:
4533 1. Return an SSA_NAME whose value is the address of the memory location of
4534 the first vector of the data reference.
4535 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
4536 these statement(s) which define the returned SSA_NAME.
4537
4538 FORNOW: We are only handling array accesses with step 1. */
4539
4540 tree
4541 vect_create_addr_base_for_vector_ref (stmt_vec_info stmt_info,
4542 gimple_seq *new_stmt_list,
4543 tree offset,
4544 tree byte_offset)
4545 {
4546 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
4547 struct data_reference *dr = dr_info->dr;
4548 const char *base_name;
4549 tree addr_base;
4550 tree dest;
4551 gimple_seq seq = NULL;
4552 tree vect_ptr_type;
4553 tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
4554 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4555 innermost_loop_behavior *drb = vect_dr_behavior (dr_info);
4556
4557 tree data_ref_base = unshare_expr (drb->base_address);
4558 tree base_offset = unshare_expr (drb->offset);
4559 tree init = unshare_expr (drb->init);
4560
4561 if (loop_vinfo)
4562 base_name = get_name (data_ref_base);
4563 else
4564 {
4565 base_offset = ssize_int (0);
4566 init = ssize_int (0);
4567 base_name = get_name (DR_REF (dr));
4568 }
4569
4570 /* Create base_offset */
4571 base_offset = size_binop (PLUS_EXPR,
4572 fold_convert (sizetype, base_offset),
4573 fold_convert (sizetype, init));
4574
4575 if (offset)
4576 {
4577 offset = fold_build2 (MULT_EXPR, sizetype,
4578 fold_convert (sizetype, offset), step);
4579 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4580 base_offset, offset);
4581 }
4582 if (byte_offset)
4583 {
4584 byte_offset = fold_convert (sizetype, byte_offset);
4585 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4586 base_offset, byte_offset);
4587 }
4588
4589 /* base + base_offset */
4590 if (loop_vinfo)
4591 addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
4592 else
4593 {
4594 addr_base = build1 (ADDR_EXPR,
4595 build_pointer_type (TREE_TYPE (DR_REF (dr))),
4596 unshare_expr (DR_REF (dr)));
4597 }
4598
4599 vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
4600 dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
4601 addr_base = force_gimple_operand (addr_base, &seq, true, dest);
4602 gimple_seq_add_seq (new_stmt_list, seq);
4603
4604 if (DR_PTR_INFO (dr)
4605 && TREE_CODE (addr_base) == SSA_NAME
4606 && !SSA_NAME_PTR_INFO (addr_base))
4607 {
4608 vect_duplicate_ssa_name_ptr_info (addr_base, dr_info);
4609 if (offset || byte_offset)
4610 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
4611 }
4612
4613 if (dump_enabled_p ())
4614 dump_printf_loc (MSG_NOTE, vect_location, "created %T\n", addr_base);
4615
4616 return addr_base;
4617 }
4618
4619
4620 /* Function vect_create_data_ref_ptr.
4621
4622 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
4623 location accessed in the loop by STMT_INFO, along with the def-use update
4624 chain to appropriately advance the pointer through the loop iterations.
4625 Also set aliasing information for the pointer. This pointer is used by
4626 the callers to this function to create a memory reference expression for
4627 vector load/store access.
4628
4629 Input:
4630 1. STMT_INFO: a stmt that references memory. Expected to be of the form
4631 GIMPLE_ASSIGN <name, data-ref> or
4632 GIMPLE_ASSIGN <data-ref, name>.
4633 2. AGGR_TYPE: the type of the reference, which should be either a vector
4634 or an array.
4635 3. AT_LOOP: the loop where the vector memref is to be created.
4636 4. OFFSET (optional): an offset to be added to the initial address accessed
4637 by the data-ref in STMT_INFO.
4638 5. BSI: location where the new stmts are to be placed if there is no loop
4639 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
4640 pointing to the initial address.
4641 7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
4642 to the initial address accessed by the data-ref in STMT_INFO. This is
4643 similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
4644 in bytes.
4645 8. IV_STEP (optional, defaults to NULL): the amount that should be added
4646 to the IV during each iteration of the loop. NULL says to move
4647 by one copy of AGGR_TYPE up or down, depending on the step of the
4648 data reference.
4649
4650 Output:
4651 1. Declare a new ptr to vector_type, and have it point to the base of the
4652 data reference (initial address accessed by the data reference).
4653 For example, for vector of type V8HI, the following code is generated:
4654
4655 v8hi *ap;
4656 ap = (v8hi *)initial_address;
4657
4658 if OFFSET is not supplied:
4659 initial_address = &a[init];
4660 if OFFSET is supplied:
4661 initial_address = &a[init + OFFSET];
4662 if BYTE_OFFSET is supplied:
4663 initial_address = &a[init] + BYTE_OFFSET;
4664
4665 Return the initial_address in INITIAL_ADDRESS.
4666
4667 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
4668 update the pointer in each iteration of the loop.
4669
4670 Return the increment stmt that updates the pointer in PTR_INCR.
4671
4672 3. Return the pointer. */
4673
4674 tree
4675 vect_create_data_ref_ptr (stmt_vec_info stmt_info, tree aggr_type,
4676 class loop *at_loop, tree offset,
4677 tree *initial_address, gimple_stmt_iterator *gsi,
4678 gimple **ptr_incr, bool only_init,
4679 tree byte_offset, tree iv_step)
4680 {
4681 const char *base_name;
4682 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4683 class loop *loop = NULL;
4684 bool nested_in_vect_loop = false;
4685 class loop *containing_loop = NULL;
4686 tree aggr_ptr_type;
4687 tree aggr_ptr;
4688 tree new_temp;
4689 gimple_seq new_stmt_list = NULL;
4690 edge pe = NULL;
4691 basic_block new_bb;
4692 tree aggr_ptr_init;
4693 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
4694 struct data_reference *dr = dr_info->dr;
4695 tree aptr;
4696 gimple_stmt_iterator incr_gsi;
4697 bool insert_after;
4698 tree indx_before_incr, indx_after_incr;
4699 gimple *incr;
4700 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
4701
4702 gcc_assert (iv_step != NULL_TREE
4703 || TREE_CODE (aggr_type) == ARRAY_TYPE
4704 || TREE_CODE (aggr_type) == VECTOR_TYPE);
4705
4706 if (loop_vinfo)
4707 {
4708 loop = LOOP_VINFO_LOOP (loop_vinfo);
4709 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info);
4710 containing_loop = (gimple_bb (stmt_info->stmt))->loop_father;
4711 pe = loop_preheader_edge (loop);
4712 }
4713 else
4714 {
4715 gcc_assert (bb_vinfo);
4716 only_init = true;
4717 *ptr_incr = NULL;
4718 }
4719
4720 /* Create an expression for the first address accessed by this load
4721 in LOOP. */
4722 base_name = get_name (DR_BASE_ADDRESS (dr));
4723
4724 if (dump_enabled_p ())
4725 {
4726 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4727 dump_printf_loc (MSG_NOTE, vect_location,
4728 "create %s-pointer variable to type: %T",
4729 get_tree_code_name (TREE_CODE (aggr_type)),
4730 aggr_type);
4731 if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4732 dump_printf (MSG_NOTE, " vectorizing an array ref: ");
4733 else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4734 dump_printf (MSG_NOTE, " vectorizing a vector ref: ");
4735 else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4736 dump_printf (MSG_NOTE, " vectorizing a record based array ref: ");
4737 else
4738 dump_printf (MSG_NOTE, " vectorizing a pointer ref: ");
4739 dump_printf (MSG_NOTE, "%T\n", DR_BASE_OBJECT (dr));
4740 }
4741
4742 /* (1) Create the new aggregate-pointer variable.
4743 Vector and array types inherit the alias set of their component
4744 type by default so we need to use a ref-all pointer if the data
4745 reference does not conflict with the created aggregated data
4746 reference because it is not addressable. */
4747 bool need_ref_all = false;
4748 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4749 get_alias_set (DR_REF (dr))))
4750 need_ref_all = true;
4751 /* Likewise for any of the data references in the stmt group. */
4752 else if (DR_GROUP_SIZE (stmt_info) > 1)
4753 {
4754 stmt_vec_info sinfo = DR_GROUP_FIRST_ELEMENT (stmt_info);
4755 do
4756 {
4757 struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
4758 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4759 get_alias_set (DR_REF (sdr))))
4760 {
4761 need_ref_all = true;
4762 break;
4763 }
4764 sinfo = DR_GROUP_NEXT_ELEMENT (sinfo);
4765 }
4766 while (sinfo);
4767 }
4768 aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
4769 need_ref_all);
4770 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
4771
4772
4773 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4774 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4775 def-use update cycles for the pointer: one relative to the outer-loop
4776 (LOOP), which is what steps (3) and (4) below do. The other is relative
4777 to the inner-loop (which is the inner-most loop containing the dataref),
4778 and this is done by step (5) below.
4779
4780 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4781 inner-most loop, and so steps (3),(4) work the same, and step (5) is
4782 redundant. Steps (3),(4) create the following:
4783
4784 vp0 = &base_addr;
4785 LOOP: vp1 = phi(vp0,vp2)
4786 ...
4787 ...
4788 vp2 = vp1 + step
4789 goto LOOP
4790
4791 If there is an inner-loop nested in loop, then step (5) will also be
4792 applied, and an additional update in the inner-loop will be created:
4793
4794 vp0 = &base_addr;
4795 LOOP: vp1 = phi(vp0,vp2)
4796 ...
4797 inner: vp3 = phi(vp1,vp4)
4798 vp4 = vp3 + inner_step
4799 if () goto inner
4800 ...
4801 vp2 = vp1 + step
4802 if () goto LOOP */
4803
4804 /* (2) Calculate the initial address of the aggregate-pointer, and set
4805 the aggregate-pointer to point to it before the loop. */
4806
4807 /* Create: (&(base[init_val+offset]+byte_offset) in the loop preheader. */
4808
4809 new_temp = vect_create_addr_base_for_vector_ref (stmt_info, &new_stmt_list,
4810 offset, byte_offset);
4811 if (new_stmt_list)
4812 {
4813 if (pe)
4814 {
4815 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
4816 gcc_assert (!new_bb);
4817 }
4818 else
4819 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
4820 }
4821
4822 *initial_address = new_temp;
4823 aggr_ptr_init = new_temp;
4824
4825 /* (3) Handle the updating of the aggregate-pointer inside the loop.
4826 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4827 inner-loop nested in LOOP (during outer-loop vectorization). */
4828
4829 /* No update in loop is required. */
4830 if (only_init && (!loop_vinfo || at_loop == loop))
4831 aptr = aggr_ptr_init;
4832 else
4833 {
4834 /* Accesses to invariant addresses should be handled specially
4835 by the caller. */
4836 tree step = vect_dr_behavior (dr_info)->step;
4837 gcc_assert (!integer_zerop (step));
4838
4839 if (iv_step == NULL_TREE)
4840 {
4841 /* The step of the aggregate pointer is the type size,
4842 negated for downward accesses. */
4843 iv_step = TYPE_SIZE_UNIT (aggr_type);
4844 if (tree_int_cst_sgn (step) == -1)
4845 iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
4846 }
4847
4848 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4849
4850 create_iv (aggr_ptr_init,
4851 fold_convert (aggr_ptr_type, iv_step),
4852 aggr_ptr, loop, &incr_gsi, insert_after,
4853 &indx_before_incr, &indx_after_incr);
4854 incr = gsi_stmt (incr_gsi);
4855 loop_vinfo->add_stmt (incr);
4856
4857 /* Copy the points-to information if it exists. */
4858 if (DR_PTR_INFO (dr))
4859 {
4860 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr_info);
4861 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr_info);
4862 }
4863 if (ptr_incr)
4864 *ptr_incr = incr;
4865
4866 aptr = indx_before_incr;
4867 }
4868
4869 if (!nested_in_vect_loop || only_init)
4870 return aptr;
4871
4872
4873 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4874 nested in LOOP, if it exists. */
4875
4876 gcc_assert (nested_in_vect_loop);
4877 if (!only_init)
4878 {
4879 standard_iv_increment_position (containing_loop, &incr_gsi,
4880 &insert_after);
4881 create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
4882 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
4883 &indx_after_incr);
4884 incr = gsi_stmt (incr_gsi);
4885 loop_vinfo->add_stmt (incr);
4886
4887 /* Copy the points-to information if it exists. */
4888 if (DR_PTR_INFO (dr))
4889 {
4890 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr_info);
4891 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr_info);
4892 }
4893 if (ptr_incr)
4894 *ptr_incr = incr;
4895
4896 return indx_before_incr;
4897 }
4898 else
4899 gcc_unreachable ();
4900 }
4901
4902
4903 /* Function bump_vector_ptr
4904
4905 Increment a pointer (to a vector type) by vector-size. If requested,
4906 i.e. if PTR-INCR is given, then also connect the new increment stmt
4907 to the existing def-use update-chain of the pointer, by modifying
4908 the PTR_INCR as illustrated below:
4909
4910 The pointer def-use update-chain before this function:
4911 DATAREF_PTR = phi (p_0, p_2)
4912 ....
4913 PTR_INCR: p_2 = DATAREF_PTR + step
4914
4915 The pointer def-use update-chain after this function:
4916 DATAREF_PTR = phi (p_0, p_2)
4917 ....
4918 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4919 ....
4920 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
4921
4922 Input:
4923 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4924 in the loop.
4925 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4926 the loop. The increment amount across iterations is expected
4927 to be vector_size.
4928 BSI - location where the new update stmt is to be placed.
4929 STMT_INFO - the original scalar memory-access stmt that is being vectorized.
4930 BUMP - optional. The offset by which to bump the pointer. If not given,
4931 the offset is assumed to be vector_size.
4932
4933 Output: Return NEW_DATAREF_PTR as illustrated above.
4934
4935 */
4936
4937 tree
4938 bump_vector_ptr (tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
4939 stmt_vec_info stmt_info, tree bump)
4940 {
4941 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4942 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4943 tree update = TYPE_SIZE_UNIT (vectype);
4944 gassign *incr_stmt;
4945 ssa_op_iter iter;
4946 use_operand_p use_p;
4947 tree new_dataref_ptr;
4948
4949 if (bump)
4950 update = bump;
4951
4952 if (TREE_CODE (dataref_ptr) == SSA_NAME)
4953 new_dataref_ptr = copy_ssa_name (dataref_ptr);
4954 else
4955 new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
4956 incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
4957 dataref_ptr, update);
4958 vect_finish_stmt_generation (stmt_info, incr_stmt, gsi);
4959
4960 /* Copy the points-to information if it exists. */
4961 if (DR_PTR_INFO (dr))
4962 {
4963 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
4964 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
4965 }
4966
4967 if (!ptr_incr)
4968 return new_dataref_ptr;
4969
4970 /* Update the vector-pointer's cross-iteration increment. */
4971 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
4972 {
4973 tree use = USE_FROM_PTR (use_p);
4974
4975 if (use == dataref_ptr)
4976 SET_USE (use_p, new_dataref_ptr);
4977 else
4978 gcc_assert (operand_equal_p (use, update, 0));
4979 }
4980
4981 return new_dataref_ptr;
4982 }
4983
4984
4985 /* Copy memory reference info such as base/clique from the SRC reference
4986 to the DEST MEM_REF. */
4987
4988 void
4989 vect_copy_ref_info (tree dest, tree src)
4990 {
4991 if (TREE_CODE (dest) != MEM_REF)
4992 return;
4993
4994 tree src_base = src;
4995 while (handled_component_p (src_base))
4996 src_base = TREE_OPERAND (src_base, 0);
4997 if (TREE_CODE (src_base) != MEM_REF
4998 && TREE_CODE (src_base) != TARGET_MEM_REF)
4999 return;
5000
5001 MR_DEPENDENCE_CLIQUE (dest) = MR_DEPENDENCE_CLIQUE (src_base);
5002 MR_DEPENDENCE_BASE (dest) = MR_DEPENDENCE_BASE (src_base);
5003 }
5004
5005
5006 /* Function vect_create_destination_var.
5007
5008 Create a new temporary of type VECTYPE. */
5009
5010 tree
5011 vect_create_destination_var (tree scalar_dest, tree vectype)
5012 {
5013 tree vec_dest;
5014 const char *name;
5015 char *new_name;
5016 tree type;
5017 enum vect_var_kind kind;
5018
5019 kind = vectype
5020 ? VECTOR_BOOLEAN_TYPE_P (vectype)
5021 ? vect_mask_var
5022 : vect_simple_var
5023 : vect_scalar_var;
5024 type = vectype ? vectype : TREE_TYPE (scalar_dest);
5025
5026 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
5027
5028 name = get_name (scalar_dest);
5029 if (name)
5030 new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
5031 else
5032 new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
5033 vec_dest = vect_get_new_vect_var (type, kind, new_name);
5034 free (new_name);
5035
5036 return vec_dest;
5037 }
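/* As a rough illustration (names are hypothetical): for a scalar_dest "x"
   with SSA version 5 and a non-boolean VECTYPE, the code above builds the
   string "x_5" and vect_get_new_vect_var derives the vector temporary from
   it, typically with a "vect_" prefix, so the dumps show something like
   vect_x_5.  */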
5038
5039 /* Function vect_grouped_store_supported.
5040
5041 Returns TRUE if interleave high and interleave low permutations
5042 are supported, and FALSE otherwise. */
5043
5044 bool
5045 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
5046 {
5047 machine_mode mode = TYPE_MODE (vectype);
5048
5049 /* vect_permute_store_chain requires the group size to be equal to 3 or
5050 be a power of two. */
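/* For example, COUNT values of 2, 3, 4 or 8 pass the check below, while
   5, 6 or 7 do not.  */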
5051 if (count != 3 && exact_log2 (count) == -1)
5052 {
5053 if (dump_enabled_p ())
5054 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5055 "the size of the group of accesses"
5056 " is not a power of 2 or not eqaul to 3\n");
5057 return false;
5058 }
5059
5060 /* Check that the permutation is supported. */
5061 if (VECTOR_MODE_P (mode))
5062 {
5063 unsigned int i;
5064 if (count == 3)
5065 {
5066 unsigned int j0 = 0, j1 = 0, j2 = 0;
5067 unsigned int i, j;
5068
5069 unsigned int nelt;
5070 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5071 {
5072 if (dump_enabled_p ())
5073 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5074 "cannot handle groups of 3 stores for"
5075 " variable-length vectors\n");
5076 return false;
5077 }
5078
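/* Dry-run the two selector constructions that vect_permute_store_chain
   uses for a group of 3 stores (perm3_mask_low and perm3_mask_high
   there) and verify that the target supports both permutations.  */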
5079 vec_perm_builder sel (nelt, nelt, 1);
5080 sel.quick_grow (nelt);
5081 vec_perm_indices indices;
5082 for (j = 0; j < 3; j++)
5083 {
5084 int nelt0 = ((3 - j) * nelt) % 3;
5085 int nelt1 = ((3 - j) * nelt + 1) % 3;
5086 int nelt2 = ((3 - j) * nelt + 2) % 3;
5087 for (i = 0; i < nelt; i++)
5088 {
5089 if (3 * i + nelt0 < nelt)
5090 sel[3 * i + nelt0] = j0++;
5091 if (3 * i + nelt1 < nelt)
5092 sel[3 * i + nelt1] = nelt + j1++;
5093 if (3 * i + nelt2 < nelt)
5094 sel[3 * i + nelt2] = 0;
5095 }
5096 indices.new_vector (sel, 2, nelt);
5097 if (!can_vec_perm_const_p (mode, indices))
5098 {
5099 if (dump_enabled_p ())
5100 dump_printf (MSG_MISSED_OPTIMIZATION,
5101 "permutation op not supported by target.\n");
5102 return false;
5103 }
5104
5105 for (i = 0; i < nelt; i++)
5106 {
5107 if (3 * i + nelt0 < nelt)
5108 sel[3 * i + nelt0] = 3 * i + nelt0;
5109 if (3 * i + nelt1 < nelt)
5110 sel[3 * i + nelt1] = 3 * i + nelt1;
5111 if (3 * i + nelt2 < nelt)
5112 sel[3 * i + nelt2] = nelt + j2++;
5113 }
5114 indices.new_vector (sel, 2, nelt);
5115 if (!can_vec_perm_const_p (mode, indices))
5116 {
5117 if (dump_enabled_p ())
5118 dump_printf (MSG_MISSED_OPTIMIZATION,
5119 "permutation op not supported by target.\n");
5120 return false;
5121 }
5122 }
5123 return true;
5124 }
5125 else
5126 {
5127 /* If the length is not equal to 3 then only a power of 2 is supported. */
5128 gcc_assert (pow2p_hwi (count));
5129 poly_uint64 nelt = GET_MODE_NUNITS (mode);
5130
5131 /* The encoding has 2 interleaved stepped patterns. */
5132 vec_perm_builder sel (nelt, 2, 3);
5133 sel.quick_grow (6);
5134 for (i = 0; i < 3; i++)
5135 {
5136 sel[i * 2] = i;
5137 sel[i * 2 + 1] = i + nelt;
5138 }
5139 vec_perm_indices indices (sel, 2, nelt);
5140 if (can_vec_perm_const_p (mode, indices))
5141 {
5142 for (i = 0; i < 6; i++)
5143 sel[i] += exact_div (nelt, 2);
5144 indices.new_vector (sel, 2, nelt);
5145 if (can_vec_perm_const_p (mode, indices))
5146 return true;
5147 }
5148 }
5149 }
5150
5151 if (dump_enabled_p ())
5152 dump_printf (MSG_MISSED_OPTIMIZATION,
5153 "permutation op not supported by target.\n");
5154 return false;
5155 }
5156
5157
5158 /* Return TRUE if vec_{mask_}store_lanes is available for COUNT vectors of
5159 type VECTYPE. MASKED_P says whether the masked form is needed. */
5160
5161 bool
5162 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5163 bool masked_p)
5164 {
5165 if (masked_p)
5166 return vect_lanes_optab_supported_p ("vec_mask_store_lanes",
5167 vec_mask_store_lanes_optab,
5168 vectype, count);
5169 else
5170 return vect_lanes_optab_supported_p ("vec_store_lanes",
5171 vec_store_lanes_optab,
5172 vectype, count);
5173 }
5174
5175
5176 /* Function vect_permute_store_chain.
5177
5178 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
5179 a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
5180 the data correctly for the stores. Return the final references for stores
5181 in RESULT_CHAIN.
5182
5183 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5184 The input is 4 vectors each containing 8 elements. We assign a number to
5185 each element, the input sequence is:
5186
5187 1st vec: 0 1 2 3 4 5 6 7
5188 2nd vec: 8 9 10 11 12 13 14 15
5189 3rd vec: 16 17 18 19 20 21 22 23
5190 4th vec: 24 25 26 27 28 29 30 31
5191
5192 The output sequence should be:
5193
5194 1st vec: 0 8 16 24 1 9 17 25
5195 2nd vec: 2 10 18 26 3 11 19 27
5196 3rd vec: 4 12 20 28 5 13 21 29
5197 4th vec: 6 14 22 30 7 15 23 31
5198
5199 i.e., we interleave the contents of the four vectors in their order.
5200
5201 We use interleave_high/low instructions to create such output. The input of
5202 each interleave_high/low operation is two vectors:
5203 1st vec 2nd vec
5204 0 1 2 3 4 5 6 7
5205 the even elements of the result vector are obtained left-to-right from the
5206 high/low elements of the first vector. The odd elements of the result are
5207 obtained left-to-right from the high/low elements of the second vector.
5208 The output of interleave_high will be: 0 4 1 5
5209 and of interleave_low: 2 6 3 7
5210
5211
5212 The permutation is done in log LENGTH stages. In each stage interleave_high
5213 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
5214 where the first argument is taken from the first half of DR_CHAIN and the
5215 second argument from its second half.
5216 In our example,
5217
5218 I1: interleave_high (1st vec, 3rd vec)
5219 I2: interleave_low (1st vec, 3rd vec)
5220 I3: interleave_high (2nd vec, 4th vec)
5221 I4: interleave_low (2nd vec, 4th vec)
5222
5223 The output for the first stage is:
5224
5225 I1: 0 16 1 17 2 18 3 19
5226 I2: 4 20 5 21 6 22 7 23
5227 I3: 8 24 9 25 10 26 11 27
5228 I4: 12 28 13 29 14 30 15 31
5229
5230 The output of the second stage, i.e. the final result is:
5231
5232 I1: 0 8 16 24 1 9 17 25
5233 I2: 2 10 18 26 3 11 19 27
5234 I3: 4 12 20 28 5 13 21 29
5235 I4: 6 14 22 30 7 15 23 31. */
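/* As a concrete (illustrative) instance of the masks built below for the
   power-of-2 case with V8HI, i.e. NELT == 8:

     perm_mask_high selects { 0, 8, 1, 9, 2, 10, 3, 11 }
     perm_mask_low selects { 4, 12, 5, 13, 6, 14, 7, 15 }

   from the concatenation of the two input vectors, which is exactly the
   interleave_high/interleave_low operation described above.  */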
5236
5237 void
5238 vect_permute_store_chain (vec<tree> dr_chain,
5239 unsigned int length,
5240 stmt_vec_info stmt_info,
5241 gimple_stmt_iterator *gsi,
5242 vec<tree> *result_chain)
5243 {
5244 tree vect1, vect2, high, low;
5245 gimple *perm_stmt;
5246 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5247 tree perm_mask_low, perm_mask_high;
5248 tree data_ref;
5249 tree perm3_mask_low, perm3_mask_high;
5250 unsigned int i, j, n, log_length = exact_log2 (length);
5251
5252 result_chain->quick_grow (length);
5253 memcpy (result_chain->address (), dr_chain.address (),
5254 length * sizeof (tree));
5255
5256 if (length == 3)
5257 {
5258 /* vect_grouped_store_supported ensures that this is constant. */
5259 unsigned int nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5260 unsigned int j0 = 0, j1 = 0, j2 = 0;
5261
5262 vec_perm_builder sel (nelt, nelt, 1);
5263 sel.quick_grow (nelt);
5264 vec_perm_indices indices;
5265 for (j = 0; j < 3; j++)
5266 {
5267 int nelt0 = ((3 - j) * nelt) % 3;
5268 int nelt1 = ((3 - j) * nelt + 1) % 3;
5269 int nelt2 = ((3 - j) * nelt + 2) % 3;
5270
5271 for (i = 0; i < nelt; i++)
5272 {
5273 if (3 * i + nelt0 < nelt)
5274 sel[3 * i + nelt0] = j0++;
5275 if (3 * i + nelt1 < nelt)
5276 sel[3 * i + nelt1] = nelt + j1++;
5277 if (3 * i + nelt2 < nelt)
5278 sel[3 * i + nelt2] = 0;
5279 }
5280 indices.new_vector (sel, 2, nelt);
5281 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5282
5283 for (i = 0; i < nelt; i++)
5284 {
5285 if (3 * i + nelt0 < nelt)
5286 sel[3 * i + nelt0] = 3 * i + nelt0;
5287 if (3 * i + nelt1 < nelt)
5288 sel[3 * i + nelt1] = 3 * i + nelt1;
5289 if (3 * i + nelt2 < nelt)
5290 sel[3 * i + nelt2] = nelt + j2++;
5291 }
5292 indices.new_vector (sel, 2, nelt);
5293 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5294
5295 vect1 = dr_chain[0];
5296 vect2 = dr_chain[1];
5297
5298 /* Create interleaving stmt:
5299 low = VEC_PERM_EXPR <vect1, vect2,
5300 {j, nelt, *, j + 1, nelt + j + 1, *,
5301 j + 2, nelt + j + 2, *, ...}> */
5302 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5303 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5304 vect2, perm3_mask_low);
5305 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5306
5307 vect1 = data_ref;
5308 vect2 = dr_chain[2];
5309 /* Create interleaving stmt:
5310 low = VEC_PERM_EXPR <vect1, vect2,
5311 {0, 1, nelt + j, 3, 4, nelt + j + 1,
5312 6, 7, nelt + j + 2, ...}> */
5313 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5314 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5315 vect2, perm3_mask_high);
5316 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5317 (*result_chain)[j] = data_ref;
5318 }
5319 }
5320 else
5321 {
5322 /* If the length is not equal to 3 then only a power of 2 is supported. */
5323 gcc_assert (pow2p_hwi (length));
5324
5325 /* The encoding has 2 interleaved stepped patterns. */
5326 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5327 vec_perm_builder sel (nelt, 2, 3);
5328 sel.quick_grow (6);
5329 for (i = 0; i < 3; i++)
5330 {
5331 sel[i * 2] = i;
5332 sel[i * 2 + 1] = i + nelt;
5333 }
5334 vec_perm_indices indices (sel, 2, nelt);
5335 perm_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5336
5337 for (i = 0; i < 6; i++)
5338 sel[i] += exact_div (nelt, 2);
5339 indices.new_vector (sel, 2, nelt);
5340 perm_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5341
5342 for (i = 0, n = log_length; i < n; i++)
5343 {
5344 for (j = 0; j < length/2; j++)
5345 {
5346 vect1 = dr_chain[j];
5347 vect2 = dr_chain[j+length/2];
5348
5349 /* Create interleaving stmt:
5350 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
5351 ...}> */
5352 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
5353 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
5354 vect2, perm_mask_high);
5355 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5356 (*result_chain)[2*j] = high;
5357
5358 /* Create interleaving stmt:
5359 low = VEC_PERM_EXPR <vect1, vect2,
5360 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
5361 ...}> */
5362 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
5363 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
5364 vect2, perm_mask_low);
5365 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5366 (*result_chain)[2*j+1] = low;
5367 }
5368 memcpy (dr_chain.address (), result_chain->address (),
5369 length * sizeof (tree));
5370 }
5371 }
5372 }
5373
5374 /* Function vect_setup_realignment
5375
5376 This function is called when vectorizing an unaligned load using
5377 the dr_explicit_realign[_optimized] scheme.
5378 This function generates the following code at the loop prolog:
5379
5380 p = initial_addr;
5381 x msq_init = *(floor(p)); # prolog load
5382 realignment_token = call target_builtin;
5383 loop:
5384 x msq = phi (msq_init, ---)
5385
5386 The stmts marked with x are generated only for the case of
5387 dr_explicit_realign_optimized.
5388
5389 The code above sets up a new (vector) pointer, pointing to the first
5390 location accessed by STMT_INFO, and a "floor-aligned" load using that
5391 pointer. It also generates code to compute the "realignment-token"
5392 (if the relevant target hook was defined), and creates a phi-node at the
5393 loop-header bb whose arguments are the result of the prolog-load (created
5394 by this function) and the result of a load that takes place in the loop
5395 (to be created by the caller to this function).
5396
5397 For the case of dr_explicit_realign_optimized:
5398 The caller to this function uses the phi-result (msq) to create the
5399 realignment code inside the loop, and sets up the missing phi argument,
5400 as follows:
5401 loop:
5402 msq = phi (msq_init, lsq)
5403 lsq = *(floor(p')); # load in loop
5404 result = realign_load (msq, lsq, realignment_token);
5405
5406 For the case of dr_explicit_realign:
5407 loop:
5408 msq = *(floor(p)); # load in loop
5409 p' = p + (VS-1);
5410 lsq = *(floor(p')); # load in loop
5411 result = realign_load (msq, lsq, realignment_token);
5412
5413 Input:
5414 STMT_INFO - (scalar) load stmt to be vectorized. This load accesses
5415 a memory location that may be unaligned.
5416 BSI - place where new code is to be inserted.
5417 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
5418 is used.
5419
5420 Output:
5421 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
5422 target hook, if defined.
5423 Return value - the result of the loop-header phi node. */
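/* As a point of reference (the target details below are illustrative, not
   exhaustive): on Altivec-style targets the builtin_mask_for_load hook
   typically yields the permute-control vector of an lvsl-like instruction,
   and the REALIGN_LOAD above is implemented as a vperm of the two aligned
   loads.  Targets that do not provide the hooks simply never use this
   scheme.  */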
5424
5425 tree
5426 vect_setup_realignment (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
5427 tree *realignment_token,
5428 enum dr_alignment_support alignment_support_scheme,
5429 tree init_addr,
5430 class loop **at_loop)
5431 {
5432 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5433 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5434 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
5435 struct data_reference *dr = dr_info->dr;
5436 class loop *loop = NULL;
5437 edge pe = NULL;
5438 tree scalar_dest = gimple_assign_lhs (stmt_info->stmt);
5439 tree vec_dest;
5440 gimple *inc;
5441 tree ptr;
5442 tree data_ref;
5443 basic_block new_bb;
5444 tree msq_init = NULL_TREE;
5445 tree new_temp;
5446 gphi *phi_stmt;
5447 tree msq = NULL_TREE;
5448 gimple_seq stmts = NULL;
5449 bool compute_in_loop = false;
5450 bool nested_in_vect_loop = false;
5451 class loop *containing_loop = (gimple_bb (stmt_info->stmt))->loop_father;
5452 class loop *loop_for_initial_load = NULL;
5453
5454 if (loop_vinfo)
5455 {
5456 loop = LOOP_VINFO_LOOP (loop_vinfo);
5457 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info);
5458 }
5459
5460 gcc_assert (alignment_support_scheme == dr_explicit_realign
5461 || alignment_support_scheme == dr_explicit_realign_optimized);
5462
5463 /* We need to generate three things:
5464 1. the misalignment computation
5465 2. the extra vector load (for the optimized realignment scheme).
5466 3. the phi node for the two vectors from which the realignment is
5467 done (for the optimized realignment scheme). */
5468
5469 /* 1. Determine where to generate the misalignment computation.
5470
5471 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
5472 calculation will be generated by this function, outside the loop (in the
5473 preheader). Otherwise, INIT_ADDR had already been computed for us by the
5474 caller, inside the loop.
5475
5476 Background: If the misalignment remains fixed throughout the iterations of
5477 the loop, then both realignment schemes are applicable, and also the
5478 misalignment computation can be done outside LOOP. This is because we are
5479 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
5480 are a multiple of VS (the Vector Size), and therefore the misalignment in
5481 different vectorized LOOP iterations is always the same.
5482 The problem arises only if the memory access is in an inner-loop nested
5483 inside LOOP, which is now being vectorized using outer-loop vectorization.
5484 This is the only case when the misalignment of the memory access may not
5485 remain fixed throughout the iterations of the inner-loop (as explained in
5486 detail in vect_supportable_dr_alignment). In this case, not only is the
5487 optimized realignment scheme not applicable, but also the misalignment
5488 computation (and generation of the realignment token that is passed to
5489 REALIGN_LOAD) have to be done inside the loop.
5490
5491 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
5492 or not, which in turn determines if the misalignment is computed inside
5493 the inner-loop, or outside LOOP. */
5494
5495 if (init_addr != NULL_TREE || !loop_vinfo)
5496 {
5497 compute_in_loop = true;
5498 gcc_assert (alignment_support_scheme == dr_explicit_realign);
5499 }
5500
5501
5502 /* 2. Determine where to generate the extra vector load.
5503
5504 For the optimized realignment scheme, instead of generating two vector
5505 loads in each iteration, we generate a single extra vector load in the
5506 preheader of the loop, and in each iteration reuse the result of the
5507 vector load from the previous iteration. In case the memory access is in
5508 an inner-loop nested inside LOOP, which is now being vectorized using
5509 outer-loop vectorization, we need to determine whether this initial vector
5510 load should be generated at the preheader of the inner-loop, or can be
5511 generated at the preheader of LOOP. If the memory access has no evolution
5512 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
5513 to be generated inside LOOP (in the preheader of the inner-loop). */
5514
5515 if (nested_in_vect_loop)
5516 {
5517 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
5518 bool invariant_in_outerloop =
5519 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
5520 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
5521 }
5522 else
5523 loop_for_initial_load = loop;
5524 if (at_loop)
5525 *at_loop = loop_for_initial_load;
5526
5527 if (loop_for_initial_load)
5528 pe = loop_preheader_edge (loop_for_initial_load);
5529
5530 /* 3. For the case of the optimized realignment, create the first vector
5531 load at the loop preheader. */
5532
5533 if (alignment_support_scheme == dr_explicit_realign_optimized)
5534 {
5535 /* Create msq_init = *(floor(p1)) in the loop preheader */
5536 gassign *new_stmt;
5537
5538 gcc_assert (!compute_in_loop);
5539 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5540 ptr = vect_create_data_ref_ptr (stmt_info, vectype,
5541 loop_for_initial_load, NULL_TREE,
5542 &init_addr, NULL, &inc, true);
5543 if (TREE_CODE (ptr) == SSA_NAME)
5544 new_temp = copy_ssa_name (ptr);
5545 else
5546 new_temp = make_ssa_name (TREE_TYPE (ptr));
5547 poly_uint64 align = DR_TARGET_ALIGNMENT (dr_info);
5548 tree type = TREE_TYPE (ptr);
5549 new_stmt = gimple_build_assign
5550 (new_temp, BIT_AND_EXPR, ptr,
5551 fold_build2 (MINUS_EXPR, type,
5552 build_int_cst (type, 0),
5553 build_int_cst (type, align)));
5554 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5555 gcc_assert (!new_bb);
5556 data_ref
5557 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
5558 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
5559 vect_copy_ref_info (data_ref, DR_REF (dr));
5560 new_stmt = gimple_build_assign (vec_dest, data_ref);
5561 new_temp = make_ssa_name (vec_dest, new_stmt);
5562 gimple_assign_set_lhs (new_stmt, new_temp);
5563 if (pe)
5564 {
5565 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5566 gcc_assert (!new_bb);
5567 }
5568 else
5569 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5570
5571 msq_init = gimple_assign_lhs (new_stmt);
5572 }
5573
5574 /* 4. Create realignment token using a target builtin, if available.
5575 It is done either inside the containing loop, or before LOOP (as
5576 determined above). */
5577
5578 if (targetm.vectorize.builtin_mask_for_load)
5579 {
5580 gcall *new_stmt;
5581 tree builtin_decl;
5582
5583 /* Compute INIT_ADDR - the initial address accessed by this memref. */
5584 if (!init_addr)
5585 {
5586 /* Generate the INIT_ADDR computation outside LOOP. */
5587 init_addr = vect_create_addr_base_for_vector_ref (stmt_info, &stmts,
5588 NULL_TREE);
5589 if (loop)
5590 {
5591 pe = loop_preheader_edge (loop);
5592 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
5593 gcc_assert (!new_bb);
5594 }
5595 else
5596 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
5597 }
5598
5599 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
5600 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
5601 vec_dest =
5602 vect_create_destination_var (scalar_dest,
5603 gimple_call_return_type (new_stmt));
5604 new_temp = make_ssa_name (vec_dest, new_stmt);
5605 gimple_call_set_lhs (new_stmt, new_temp);
5606
5607 if (compute_in_loop)
5608 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5609 else
5610 {
5611 /* Generate the misalignment computation outside LOOP. */
5612 pe = loop_preheader_edge (loop);
5613 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5614 gcc_assert (!new_bb);
5615 }
5616
5617 *realignment_token = gimple_call_lhs (new_stmt);
5618
5619 /* The result of the CALL_EXPR to this builtin is determined from
5620 the value of the parameter and no global variables are touched
5621 which makes the builtin a "const" function. Requiring the
5622 builtin to have the "const" attribute makes it unnecessary
5623 to call mark_call_clobbered. */
5624 gcc_assert (TREE_READONLY (builtin_decl));
5625 }
5626
5627 if (alignment_support_scheme == dr_explicit_realign)
5628 return msq;
5629
5630 gcc_assert (!compute_in_loop);
5631 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
5632
5633
5634 /* 5. Create msq = phi <msq_init, lsq> in loop */
5635
5636 pe = loop_preheader_edge (containing_loop);
5637 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5638 msq = make_ssa_name (vec_dest);
5639 phi_stmt = create_phi_node (msq, containing_loop->header);
5640 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
5641
5642 return msq;
5643 }
5644
5645
5646 /* Function vect_grouped_load_supported.
5647
5648 COUNT is the size of the load group (the number of statements plus the
5649 number of gaps). SINGLE_ELEMENT_P is true if there is actually
5650 only one statement, with a gap of COUNT - 1.
5651
5652 Returns true if a suitable permute exists. */
5653
5654 bool
5655 vect_grouped_load_supported (tree vectype, bool single_element_p,
5656 unsigned HOST_WIDE_INT count)
5657 {
5658 machine_mode mode = TYPE_MODE (vectype);
5659
5660 /* If this is single-element interleaving with an element distance
5661 that leaves unused vector loads around, punt - we at least create
5662 very sub-optimal code in that case (and blow up memory,
5663 see PR65518). */
5664 if (single_element_p && maybe_gt (count, TYPE_VECTOR_SUBPARTS (vectype)))
5665 {
5666 if (dump_enabled_p ())
5667 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5668 "single-element interleaving not supported "
5669 "for not adjacent vector loads\n");
5670 return false;
5671 }
5672
5673 /* vect_permute_load_chain requires the group size to be equal to 3 or
5674 be a power of two. */
5675 if (count != 3 && exact_log2 (count) == -1)
5676 {
5677 if (dump_enabled_p ())
5678 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5679 "the size of the group of accesses"
5680 " is not a power of 2 or not equal to 3\n");
5681 return false;
5682 }
5683
5684 /* Check that the permutation is supported. */
5685 if (VECTOR_MODE_P (mode))
5686 {
5687 unsigned int i, j;
5688 if (count == 3)
5689 {
5690 unsigned int nelt;
5691 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5692 {
5693 if (dump_enabled_p ())
5694 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5695 "cannot handle groups of 3 loads for"
5696 " variable-length vectors\n");
5697 return false;
5698 }
5699
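/* Dry-run the selector constructions that vect_permute_load_chain uses
   for a group of 3 loads (perm3_mask_low and perm3_mask_high there) and
   check that the target supports each required permutation.  */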
5700 vec_perm_builder sel (nelt, nelt, 1);
5701 sel.quick_grow (nelt);
5702 vec_perm_indices indices;
5703 unsigned int k;
5704 for (k = 0; k < 3; k++)
5705 {
5706 for (i = 0; i < nelt; i++)
5707 if (3 * i + k < 2 * nelt)
5708 sel[i] = 3 * i + k;
5709 else
5710 sel[i] = 0;
5711 indices.new_vector (sel, 2, nelt);
5712 if (!can_vec_perm_const_p (mode, indices))
5713 {
5714 if (dump_enabled_p ())
5715 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5716 "shuffle of 3 loads is not supported by"
5717 " target\n");
5718 return false;
5719 }
5720 for (i = 0, j = 0; i < nelt; i++)
5721 if (3 * i + k < 2 * nelt)
5722 sel[i] = i;
5723 else
5724 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5725 indices.new_vector (sel, 2, nelt);
5726 if (!can_vec_perm_const_p (mode, indices))
5727 {
5728 if (dump_enabled_p ())
5729 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5730 "shuffle of 3 loads is not supported by"
5731 " target\n");
5732 return false;
5733 }
5734 }
5735 return true;
5736 }
5737 else
5738 {
5739 /* If the length is not equal to 3 then only a power of 2 is supported. */
5740 gcc_assert (pow2p_hwi (count));
5741 poly_uint64 nelt = GET_MODE_NUNITS (mode);
5742
5743 /* The encoding has a single stepped pattern. */
5744 vec_perm_builder sel (nelt, 1, 3);
5745 sel.quick_grow (3);
5746 for (i = 0; i < 3; i++)
5747 sel[i] = i * 2;
5748 vec_perm_indices indices (sel, 2, nelt);
5749 if (can_vec_perm_const_p (mode, indices))
5750 {
5751 for (i = 0; i < 3; i++)
5752 sel[i] = i * 2 + 1;
5753 indices.new_vector (sel, 2, nelt);
5754 if (can_vec_perm_const_p (mode, indices))
5755 return true;
5756 }
5757 }
5758 }
5759
5760 if (dump_enabled_p ())
5761 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5762 "extract even/odd not supported by target\n");
5763 return false;
5764 }
5765
5766 /* Return TRUE if vec_{mask_}load_lanes is available for COUNT vectors of
5767 type VECTYPE. MASKED_P says whether the masked form is needed. */
5768
5769 bool
5770 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5771 bool masked_p)
5772 {
5773 if (masked_p)
5774 return vect_lanes_optab_supported_p ("vec_mask_load_lanes",
5775 vec_mask_load_lanes_optab,
5776 vectype, count);
5777 else
5778 return vect_lanes_optab_supported_p ("vec_load_lanes",
5779 vec_load_lanes_optab,
5780 vectype, count);
5781 }
5782
5783 /* Function vect_permute_load_chain.
5784
5785 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5786 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5787 the input data correctly. Return the final references for loads in
5788 RESULT_CHAIN.
5789
5790 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5791 The input is 4 vectors each containing 8 elements. We assign a number to each
5792 element; the input sequence is:
5793
5794 1st vec: 0 1 2 3 4 5 6 7
5795 2nd vec: 8 9 10 11 12 13 14 15
5796 3rd vec: 16 17 18 19 20 21 22 23
5797 4th vec: 24 25 26 27 28 29 30 31
5798
5799 The output sequence should be:
5800
5801 1st vec: 0 4 8 12 16 20 24 28
5802 2nd vec: 1 5 9 13 17 21 25 29
5803 3rd vec: 2 6 10 14 18 22 26 30
5804 4th vec: 3 7 11 15 19 23 27 31
5805
5806 i.e., the first output vector should contain the first elements of each
5807 interleaving group, etc.
5808
5809 We use extract_even/odd instructions to create such output. The input of
5810 each extract_even/odd operation is two vectors
5811 1st vec 2nd vec
5812 0 1 2 3 4 5 6 7
5813
5814 and the output is the vector of extracted even/odd elements. The output of
5815 extract_even will be: 0 2 4 6
5816 and of extract_odd: 1 3 5 7
5817
5818
5819 The permutation is done in log LENGTH stages. In each stage extract_even
5820 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5821 their order. In our example,
5822
5823 E1: extract_even (1st vec, 2nd vec)
5824 E2: extract_odd (1st vec, 2nd vec)
5825 E3: extract_even (3rd vec, 4th vec)
5826 E4: extract_odd (3rd vec, 4th vec)
5827
5828 The output for the first stage will be:
5829
5830 E1: 0 2 4 6 8 10 12 14
5831 E2: 1 3 5 7 9 11 13 15
5832 E3: 16 18 20 22 24 26 28 30
5833 E4: 17 19 21 23 25 27 29 31
5834
5835 In order to proceed and create the correct sequence for the next stage (or
5836 for the correct output, if the second stage is the last one, as in our
5837 example), we first put the output of extract_even operation and then the
5838 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5839 The input for the second stage is:
5840
5841 1st vec (E1): 0 2 4 6 8 10 12 14
5842 2nd vec (E3): 16 18 20 22 24 26 28 30
5843 3rd vec (E2): 1 3 5 7 9 11 13 15
5844 4th vec (E4): 17 19 21 23 25 27 29 31
5845
5846 The output of the second stage:
5847
5848 E1: 0 4 8 12 16 20 24 28
5849 E2: 2 6 10 14 18 22 26 30
5850 E3: 1 5 9 13 17 21 25 29
5851 E4: 3 7 11 15 19 23 27 31
5852
5853 And RESULT_CHAIN after reordering:
5854
5855 1st vec (E1): 0 4 8 12 16 20 24 28
5856 2nd vec (E3): 1 5 9 13 17 21 25 29
5857 3rd vec (E2): 2 6 10 14 18 22 26 30
5858 4th vec (E4): 3 7 11 15 19 23 27 31. */
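/* For a concrete (illustrative) picture of the masks built below for the
   power-of-2 case with V8HI, i.e. NELT == 8:

     perm_mask_even selects { 0, 2, 4, 6, 8, 10, 12, 14 }
     perm_mask_odd selects { 1, 3, 5, 7, 9, 11, 13, 15 }

   from the concatenation of the two input vectors, i.e. the extract_even
   and extract_odd operations described above.  */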
5859
5860 static void
5861 vect_permute_load_chain (vec<tree> dr_chain,
5862 unsigned int length,
5863 stmt_vec_info stmt_info,
5864 gimple_stmt_iterator *gsi,
5865 vec<tree> *result_chain)
5866 {
5867 tree data_ref, first_vect, second_vect;
5868 tree perm_mask_even, perm_mask_odd;
5869 tree perm3_mask_low, perm3_mask_high;
5870 gimple *perm_stmt;
5871 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5872 unsigned int i, j, log_length = exact_log2 (length);
5873
5874 result_chain->quick_grow (length);
5875 memcpy (result_chain->address (), dr_chain.address (),
5876 length * sizeof (tree));
5877
5878 if (length == 3)
5879 {
5880 /* vect_grouped_load_supported ensures that this is constant. */
5881 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5882 unsigned int k;
5883
5884 vec_perm_builder sel (nelt, nelt, 1);
5885 sel.quick_grow (nelt);
5886 vec_perm_indices indices;
5887 for (k = 0; k < 3; k++)
5888 {
5889 for (i = 0; i < nelt; i++)
5890 if (3 * i + k < 2 * nelt)
5891 sel[i] = 3 * i + k;
5892 else
5893 sel[i] = 0;
5894 indices.new_vector (sel, 2, nelt);
5895 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5896
5897 for (i = 0, j = 0; i < nelt; i++)
5898 if (3 * i + k < 2 * nelt)
5899 sel[i] = i;
5900 else
5901 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5902 indices.new_vector (sel, 2, nelt);
5903 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5904
5905 first_vect = dr_chain[0];
5906 second_vect = dr_chain[1];
5907
5908 /* Create interleaving stmt (low part of):
5909 low = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
5910 ...}> */
5911 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5912 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5913 second_vect, perm3_mask_low);
5914 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5915
5916 /* Create interleaving stmt (high part of):
5917 high = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
5918 ...}> */
5919 first_vect = data_ref;
5920 second_vect = dr_chain[2];
5921 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5922 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5923 second_vect, perm3_mask_high);
5924 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5925 (*result_chain)[k] = data_ref;
5926 }
5927 }
5928 else
5929 {
5930 /* If the length is not equal to 3 then only a power of 2 is supported. */
5931 gcc_assert (pow2p_hwi (length));
5932
5933 /* The encoding has a single stepped pattern. */
5934 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5935 vec_perm_builder sel (nelt, 1, 3);
5936 sel.quick_grow (3);
5937 for (i = 0; i < 3; ++i)
5938 sel[i] = i * 2;
5939 vec_perm_indices indices (sel, 2, nelt);
5940 perm_mask_even = vect_gen_perm_mask_checked (vectype, indices);
5941
5942 for (i = 0; i < 3; ++i)
5943 sel[i] = i * 2 + 1;
5944 indices.new_vector (sel, 2, nelt);
5945 perm_mask_odd = vect_gen_perm_mask_checked (vectype, indices);
5946
5947 for (i = 0; i < log_length; i++)
5948 {
5949 for (j = 0; j < length; j += 2)
5950 {
5951 first_vect = dr_chain[j];
5952 second_vect = dr_chain[j+1];
5953
5954 /* data_ref = permute_even (first_data_ref, second_data_ref); */
5955 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
5956 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5957 first_vect, second_vect,
5958 perm_mask_even);
5959 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5960 (*result_chain)[j/2] = data_ref;
5961
5962 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
5963 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
5964 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5965 first_vect, second_vect,
5966 perm_mask_odd);
5967 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5968 (*result_chain)[j/2+length/2] = data_ref;
5969 }
5970 memcpy (dr_chain.address (), result_chain->address (),
5971 length * sizeof (tree));
5972 }
5973 }
5974 }
5975
5976 /* Function vect_shift_permute_load_chain.
5977
5978 Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
5979 sequence of stmts to reorder the input data accordingly.
5980 Return the final references for loads in RESULT_CHAIN.
5981 Return true if successful, false otherwise.
5982
5983 E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
5984 The input is 3 vectors each containing 8 elements. We assign a
5985 number to each element, the input sequence is:
5986
5987 1st vec: 0 1 2 3 4 5 6 7
5988 2nd vec: 8 9 10 11 12 13 14 15
5989 3rd vec: 16 17 18 19 20 21 22 23
5990
5991 The output sequence should be:
5992
5993 1st vec: 0 3 6 9 12 15 18 21
5994 2nd vec: 1 4 7 10 13 16 19 22
5995 3rd vec: 2 5 8 11 14 17 20 23
5996
5997 We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
5998
5999 First we shuffle all 3 vectors to get the correct element order:
6000
6001 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
6002 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
6003 3rd vec: (16 19 22) (17 20 23) (18 21)
6004
6005 Next we unite and shift the vectors 3 times:
6006
6007 1st step:
6008 shift right by 6 the concatenation of:
6009 "1st vec" and "2nd vec"
6010 ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
6011 "2nd vec" and "3rd vec"
6012 ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
6013 "3rd vec" and "1st vec"
6014 (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
6015 | New vectors |
6016
6017 So that now new vectors are:
6018
6019 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
6020 2nd vec: (10 13) (16 19 22) (17 20 23)
6021 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
6022
6023 2nd step:
6024 shift right by 5 the concatenation of:
6025 "1st vec" and "3rd vec"
6026 ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
6027 "2nd vec" and "1st vec"
6028 (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
6029 "3rd vec" and "2nd vec"
6030 (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
6031 | New vectors |
6032
6033 So that now new vectors are:
6034
6035 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
6036 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
6037 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
6038
6039 3rd step:
6040 shift right by 5 the concatenation of:
6041 "1st vec" and "1st vec"
6042 ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
6043 shift right by 3 the concatenation of:
6044 "2nd vec" and "2nd vec"
6045 (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
6046 | New vectors |
6047
6048 So that now all vectors are READY:
6049 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
6050 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
6051 3rd vec: ( 1 4 7) (10 13) (16 19 22)
6052
6053 This algorithm is faster than the one in vect_permute_load_chain if:
6054 1. "shift of a concatenation" is faster than a general permutation.
6055 This is usually so.
6056 2. The TARGET machine can't execute vector instructions in parallel.
6057 This is because each step of the algorithm depends on the previous one.
6058 The algorithm in vect_permute_load_chain is much more parallel.
6059
6060 The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
6061 */
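/* The "LOAD CHAIN LENGTH less than VF" restriction shows up below as the
   "vf > 4" test for power-of-2 lengths and the "vf > 2" test for LENGTH 3;
   e.g. a chain of 3 V8HI loads (VF 8) is handled here, while a chain of 3
   V2DI loads (VF 2) makes this function return false and the caller fall
   back to vect_permute_load_chain.  */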
6062
6063 static bool
6064 vect_shift_permute_load_chain (vec<tree> dr_chain,
6065 unsigned int length,
6066 stmt_vec_info stmt_info,
6067 gimple_stmt_iterator *gsi,
6068 vec<tree> *result_chain)
6069 {
6070 tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
6071 tree perm2_mask1, perm2_mask2, perm3_mask;
6072 tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
6073 gimple *perm_stmt;
6074
6075 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6076 unsigned int i;
6077 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6078
6079 unsigned HOST_WIDE_INT nelt, vf;
6080 if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nelt)
6081 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&vf))
6082 /* Not supported for variable-length vectors. */
6083 return false;
6084
6085 vec_perm_builder sel (nelt, nelt, 1);
6086 sel.quick_grow (nelt);
6087
6088 result_chain->quick_grow (length);
6089 memcpy (result_chain->address (), dr_chain.address (),
6090 length * sizeof (tree));
6091
6092 if (pow2p_hwi (length) && vf > 4)
6093 {
6094 unsigned int j, log_length = exact_log2 (length);
6095 for (i = 0; i < nelt / 2; ++i)
6096 sel[i] = i * 2;
6097 for (i = 0; i < nelt / 2; ++i)
6098 sel[nelt / 2 + i] = i * 2 + 1;
6099 vec_perm_indices indices (sel, 2, nelt);
6100 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6101 {
6102 if (dump_enabled_p ())
6103 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6104 "shuffle of 2 fields structure is not \
6105 supported by target\n");
6106 return false;
6107 }
6108 perm2_mask1 = vect_gen_perm_mask_checked (vectype, indices);
6109
6110 for (i = 0; i < nelt / 2; ++i)
6111 sel[i] = i * 2 + 1;
6112 for (i = 0; i < nelt / 2; ++i)
6113 sel[nelt / 2 + i] = i * 2;
6114 indices.new_vector (sel, 2, nelt);
6115 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6116 {
6117 if (dump_enabled_p ())
6118 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6119 "shuffle of 2 fields structure is not \
6120 supported by target\n");
6121 return false;
6122 }
6123 perm2_mask2 = vect_gen_perm_mask_checked (vectype, indices);
6124
6125 /* Generating permutation constant to shift all elements.
6126 For vector length 8 it is {4 5 6 7 8 9 10 11}. */
6127 for (i = 0; i < nelt; i++)
6128 sel[i] = nelt / 2 + i;
6129 indices.new_vector (sel, 2, nelt);
6130 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6131 {
6132 if (dump_enabled_p ())
6133 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6134 "shift permutation is not supported by target\n");
6135 return false;
6136 }
6137 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6138
6139 /* Generating permutation constant to select elements from the two vectors.
6140 For vector length 8 it is {0 1 2 3 12 13 14 15}. */
6141 for (i = 0; i < nelt / 2; i++)
6142 sel[i] = i;
6143 for (i = nelt / 2; i < nelt; i++)
6144 sel[i] = nelt + i;
6145 indices.new_vector (sel, 2, nelt);
6146 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6147 {
6148 if (dump_enabled_p ())
6149 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6150 "select is not supported by target\n");
6151 return false;
6152 }
6153 select_mask = vect_gen_perm_mask_checked (vectype, indices);
6154
6155 for (i = 0; i < log_length; i++)
6156 {
6157 for (j = 0; j < length; j += 2)
6158 {
6159 first_vect = dr_chain[j];
6160 second_vect = dr_chain[j + 1];
6161
6162 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6163 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6164 first_vect, first_vect,
6165 perm2_mask1);
6166 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6167 vect[0] = data_ref;
6168
6169 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6170 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6171 second_vect, second_vect,
6172 perm2_mask2);
6173 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6174 vect[1] = data_ref;
6175
6176 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
6177 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6178 vect[0], vect[1], shift1_mask);
6179 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6180 (*result_chain)[j/2 + length/2] = data_ref;
6181
6182 data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
6183 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6184 vect[0], vect[1], select_mask);
6185 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6186 (*result_chain)[j/2] = data_ref;
6187 }
6188 memcpy (dr_chain.address (), result_chain->address (),
6189 length * sizeof (tree));
6190 }
6191 return true;
6192 }
6193 if (length == 3 && vf > 2)
6194 {
6195 unsigned int k = 0, l = 0;
6196
6197 /* Generating permutation constant to get all elements in the right order.
6198 For vector length 8 it is {0 3 6 1 4 7 2 5}. */
6199 for (i = 0; i < nelt; i++)
6200 {
6201 if (3 * k + (l % 3) >= nelt)
6202 {
6203 k = 0;
6204 l += (3 - (nelt % 3));
6205 }
6206 sel[i] = 3 * k + (l % 3);
6207 k++;
6208 }
6209 vec_perm_indices indices (sel, 2, nelt);
6210 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6211 {
6212 if (dump_enabled_p ())
6213 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6214 "shuffle of 3 fields structure is not \
6215 supported by target\n");
6216 return false;
6217 }
6218 perm3_mask = vect_gen_perm_mask_checked (vectype, indices);
6219
6220 /* Generating permutation constant to shift all elements.
6221 For vector length 8 it is {6 7 8 9 10 11 12 13}. */
6222 for (i = 0; i < nelt; i++)
6223 sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
6224 indices.new_vector (sel, 2, nelt);
6225 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6226 {
6227 if (dump_enabled_p ())
6228 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6229 "shift permutation is not supported by target\n");
6230 return false;
6231 }
6232 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6233
6234 /* Generating permutation constant to shift all elements.
6235 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6236 for (i = 0; i < nelt; i++)
6237 sel[i] = 2 * (nelt / 3) + 1 + i;
6238 indices.new_vector (sel, 2, nelt);
6239 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6240 {
6241 if (dump_enabled_p ())
6242 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6243 "shift permutation is not supported by target\n");
6244 return false;
6245 }
6246 shift2_mask = vect_gen_perm_mask_checked (vectype, indices);
6247
6248 /* Generating permutation constant to shift all elements.
6249 For vector length 8 it is {3 4 5 6 7 8 9 10}. */
6250 for (i = 0; i < nelt; i++)
6251 sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
6252 indices.new_vector (sel, 2, nelt);
6253 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6254 {
6255 if (dump_enabled_p ())
6256 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6257 "shift permutation is not supported by target\n");
6258 return false;
6259 }
6260 shift3_mask = vect_gen_perm_mask_checked (vectype, indices);
6261
6262 /* Generating permutation constant to shift all elements.
6263 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6264 for (i = 0; i < nelt; i++)
6265 sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
6266 indices.new_vector (sel, 2, nelt);
6267 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6268 {
6269 if (dump_enabled_p ())
6270 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6271 "shift permutation is not supported by target\n");
6272 return false;
6273 }
6274 shift4_mask = vect_gen_perm_mask_checked (vectype, indices);
6275
6276 for (k = 0; k < 3; k++)
6277 {
6278 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
6279 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6280 dr_chain[k], dr_chain[k],
6281 perm3_mask);
6282 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6283 vect[k] = data_ref;
6284 }
6285
6286 for (k = 0; k < 3; k++)
6287 {
6288 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
6289 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6290 vect[k % 3], vect[(k + 1) % 3],
6291 shift1_mask);
6292 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6293 vect_shift[k] = data_ref;
6294 }
6295
6296 for (k = 0; k < 3; k++)
6297 {
6298 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
6299 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6300 vect_shift[(4 - k) % 3],
6301 vect_shift[(3 - k) % 3],
6302 shift2_mask);
6303 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6304 vect[k] = data_ref;
6305 }
6306
6307 (*result_chain)[3 - (nelt % 3)] = vect[2];
6308
6309 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
6310 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
6311 vect[0], shift3_mask);
6312 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6313 (*result_chain)[nelt % 3] = data_ref;
6314
6315 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
6316 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
6317 vect[1], shift4_mask);
6318 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6319 (*result_chain)[0] = data_ref;
6320 return true;
6321 }
6322 return false;
6323 }
6324
6325 /* Function vect_transform_grouped_load.
6326
6327 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
6328 to perform their permutation and ascribe the result vectorized statements to
6329 the scalar statements.
6330 */
6331
6332 void
6333 vect_transform_grouped_load (stmt_vec_info stmt_info, vec<tree> dr_chain,
6334 int size, gimple_stmt_iterator *gsi)
6335 {
6336 machine_mode mode;
6337 vec<tree> result_chain = vNULL;
6338
6339 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
6340 RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
6341 vectors, that are ready for vector computation. */
6342 result_chain.create (size);
6343
6344 /* If the reassociation width for the vector type is 2 or greater, the target
6345 machine can execute 2 or more vector instructions in parallel. Otherwise try
6346 to get the chain for the load group using vect_shift_permute_load_chain. */
6347 mode = TYPE_MODE (STMT_VINFO_VECTYPE (stmt_info));
6348 if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
6349 || pow2p_hwi (size)
6350 || !vect_shift_permute_load_chain (dr_chain, size, stmt_info,
6351 gsi, &result_chain))
6352 vect_permute_load_chain (dr_chain, size, stmt_info, gsi, &result_chain);
6353 vect_record_grouped_load_vectors (stmt_info, result_chain);
6354 result_chain.release ();
6355 }
6356
6357 /* RESULT_CHAIN contains the output of a group of grouped loads that were
6358 generated as part of the vectorization of STMT_INFO. Assign the statement
6359 for each vector to the associated scalar statement. */
6360
6361 void
6362 vect_record_grouped_load_vectors (stmt_vec_info stmt_info,
6363 vec<tree> result_chain)
6364 {
6365 vec_info *vinfo = stmt_info->vinfo;
6366 stmt_vec_info first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
6367 unsigned int i, gap_count;
6368 tree tmp_data_ref;
6369
6370 /* Put a permuted data-ref in the VECTORIZED_STMT field.
6371 Since we scan the chain starting from its first node, their order
6372 corresponds to the order of data-refs in RESULT_CHAIN. */
6373 stmt_vec_info next_stmt_info = first_stmt_info;
6374 gap_count = 1;
6375 FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
6376 {
6377 if (!next_stmt_info)
6378 break;
6379
6380 /* Skip the gaps. Loads created for the gaps will be removed by dead
6381 code elimination pass later. No need to check for the first stmt in
6382 the group, since it always exists.
6383 DR_GROUP_GAP is the number of steps in elements from the previous
6384 access (if there is no gap DR_GROUP_GAP is 1). We skip loads that
6385 correspond to the gaps. */
6386 if (next_stmt_info != first_stmt_info
6387 && gap_count < DR_GROUP_GAP (next_stmt_info))
6388 {
6389 gap_count++;
6390 continue;
6391 }
6392
6393 /* ??? The following needs cleanup after the removal of
6394 DR_GROUP_SAME_DR_STMT. */
6395 if (next_stmt_info)
6396 {
6397 stmt_vec_info new_stmt_info = vinfo->lookup_def (tmp_data_ref);
6398 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
6399 copies, and we put the new vector statement in the first available
6400 RELATED_STMT. */
6401 if (!STMT_VINFO_VEC_STMT (next_stmt_info))
6402 STMT_VINFO_VEC_STMT (next_stmt_info) = new_stmt_info;
6403 else
6404 {
6405 stmt_vec_info prev_stmt_info
6406 = STMT_VINFO_VEC_STMT (next_stmt_info);
6407 stmt_vec_info rel_stmt_info
6408 = STMT_VINFO_RELATED_STMT (prev_stmt_info);
6409 while (rel_stmt_info)
6410 {
6411 prev_stmt_info = rel_stmt_info;
6412 rel_stmt_info = STMT_VINFO_RELATED_STMT (rel_stmt_info);
6413 }
6414
6415 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt_info;
6416 }
6417
6418 next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
6419 gap_count = 1;
6420 }
6421 }
6422 }
6423
6424 /* Function vect_can_force_dr_alignment_p.
6425
6426 Returns whether the alignment of a DECL can be forced to be aligned
6427 on an ALIGNMENT-bit boundary. */
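/* For instance (illustrative): a file-scope "static float a[N]" can
   normally be forced to vector alignment, bounded by MAX_OFILE_ALIGNMENT;
   an automatic variable is bounded by MAX_STACK_ALIGNMENT instead; and a
   decl whose symtab node cannot have its alignment increased (e.g. one
   that is merely declared here and defined elsewhere) is rejected.  */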
6428
6429 bool
6430 vect_can_force_dr_alignment_p (const_tree decl, poly_uint64 alignment)
6431 {
6432 if (!VAR_P (decl))
6433 return false;
6434
6435 if (decl_in_symtab_p (decl)
6436 && !symtab_node::get (decl)->can_increase_alignment_p ())
6437 return false;
6438
6439 if (TREE_STATIC (decl))
6440 return (known_le (alignment,
6441 (unsigned HOST_WIDE_INT) MAX_OFILE_ALIGNMENT));
6442 else
6443 return (known_le (alignment, (unsigned HOST_WIDE_INT) MAX_STACK_ALIGNMENT));
6444 }
6445
6446
6447 /* Return whether the data reference DR_INFO is supported with respect to its
6448 alignment.
6449 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
6450 if it is aligned, i.e., check if it is possible to vectorize it with different
6451 alignment. */
6452
6453 enum dr_alignment_support
6454 vect_supportable_dr_alignment (dr_vec_info *dr_info,
6455 bool check_aligned_accesses)
6456 {
6457 data_reference *dr = dr_info->dr;
6458 stmt_vec_info stmt_info = dr_info->stmt;
6459 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6460 machine_mode mode = TYPE_MODE (vectype);
6461 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6462 class loop *vect_loop = NULL;
6463 bool nested_in_vect_loop = false;
6464
6465 if (aligned_access_p (dr_info) && !check_aligned_accesses)
6466 return dr_aligned;
6467
6468 /* For now assume all conditional loads/stores support unaligned
6469 access without any special code. */
6470 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
6471 if (gimple_call_internal_p (stmt)
6472 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
6473 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
6474 return dr_unaligned_supported;
6475
6476 if (loop_vinfo)
6477 {
6478 vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
6479 nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt_info);
6480 }
6481
6482 /* Possibly unaligned access. */
6483
6484 /* We can choose between using the implicit realignment scheme (generating
6485 a misaligned_move stmt) and the explicit realignment scheme (generating
6486 aligned loads with a REALIGN_LOAD). There are two variants to the
6487 explicit realignment scheme: optimized, and unoptimized.
6488 We can optimize the realignment only if the step between consecutive
6489 vector loads is equal to the vector size. Since the vector memory
6490 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
6491 is guaranteed that the misalignment amount remains the same throughout the
6492 execution of the vectorized loop. Therefore, we can create the
6493 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
6494 at the loop preheader.
6495
6496 However, in the case of outer-loop vectorization, when vectorizing a
6497 memory access in the inner-loop nested within the LOOP that is now being
6498 vectorized, while it is guaranteed that the misalignment of the
6499 vectorized memory access will remain the same in different outer-loop
6500 iterations, it is *not* guaranteed that it will remain the same throughout
6501 the execution of the inner-loop. This is because the inner-loop advances
6502 with the original scalar step (and not in steps of VS). If the inner-loop
6503 step happens to be a multiple of VS, then the misalignment remains fixed
6504 and we can use the optimized realignment scheme. For example:
6505
6506 for (i=0; i<N; i++)
6507 for (j=0; j<M; j++)
6508 s += a[i+j];
6509
6510 When vectorizing the i-loop in the above example, the step between
6511 consecutive vector loads is 1, and so the misalignment does not remain
6512 fixed across the execution of the inner-loop, and the realignment cannot
6513 be optimized (as illustrated in the following pseudo vectorized loop):
6514
6515 for (i=0; i<N; i+=4)
6516 for (j=0; j<M; j++){
6517 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
6518 // when j is {0,1,2,3,4,5,6,7,...} respectively.
6519 // (assuming that we start from an aligned address).
6520 }
6521
6522 We therefore have to use the unoptimized realignment scheme:
6523
6524 for (i=0; i<N; i+=4)
6525 for (j=k; j<M; j+=4)
6526 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
6527 // that the misalignment of the initial address is
6528 // 0).
6529
6530 The loop can then be vectorized as follows:
6531
6532 for (k=0; k<4; k++){
6533 rt = get_realignment_token (&vp[k]);
6534 for (i=0; i<N; i+=4){
6535 v1 = vp[i+k];
6536 for (j=k; j<M; j+=4){
6537 v2 = vp[i+j+VS-1];
6538 va = REALIGN_LOAD <v1,v2,rt>;
6539 vs += va;
6540 v1 = v2;
6541 }
6542 }
6543 } */
6544
6545 if (DR_IS_READ (dr))
6546 {
6547 bool is_packed = false;
6548 tree type = (TREE_TYPE (DR_REF (dr)));
6549
6550 if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
6551 && (!targetm.vectorize.builtin_mask_for_load
6552 || targetm.vectorize.builtin_mask_for_load ()))
6553 {
6554 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6555
6556 /* If we are doing SLP then the accesses need not have the
6557 same alignment, instead it depends on the SLP group size. */
6558 if (loop_vinfo
6559 && STMT_SLP_TYPE (stmt_info)
6560 && !multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6561 * (DR_GROUP_SIZE
6562 (DR_GROUP_FIRST_ELEMENT (stmt_info))),
6563 TYPE_VECTOR_SUBPARTS (vectype)))
6564 ;
6565 else if (!loop_vinfo
6566 || (nested_in_vect_loop
6567 && maybe_ne (TREE_INT_CST_LOW (DR_STEP (dr)),
6568 GET_MODE_SIZE (TYPE_MODE (vectype)))))
6569 return dr_explicit_realign;
6570 else
6571 return dr_explicit_realign_optimized;
6572 }
6573 if (!known_alignment_for_access_p (dr_info))
6574 is_packed = not_size_aligned (DR_REF (dr));
6575
6576 if (targetm.vectorize.support_vector_misalignment
6577 (mode, type, DR_MISALIGNMENT (dr_info), is_packed))
6578 /* Can't software pipeline the loads, but can at least do them. */
6579 return dr_unaligned_supported;
6580 }
6581 else
6582 {
6583 bool is_packed = false;
6584 tree type = (TREE_TYPE (DR_REF (dr)));
6585
6586 if (!known_alignment_for_access_p (dr_info))
6587 is_packed = not_size_aligned (DR_REF (dr));
6588
6589 if (targetm.vectorize.support_vector_misalignment
6590 (mode, type, DR_MISALIGNMENT (dr_info), is_packed))
6591 return dr_unaligned_supported;
6592 }
6593
6594 /* Unsupported. */
6595 return dr_unaligned_unsupported;
6596 }