1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2 Copyright (C) 2003-2019 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "predict.h"
31 #include "memmodel.h"
32 #include "tm_p.h"
33 #include "ssa.h"
34 #include "optabs-tree.h"
35 #include "cgraph.h"
36 #include "dumpfile.h"
37 #include "alias.h"
38 #include "fold-const.h"
39 #include "stor-layout.h"
40 #include "tree-eh.h"
41 #include "gimplify.h"
42 #include "gimple-iterator.h"
43 #include "gimplify-me.h"
44 #include "tree-ssa-loop-ivopts.h"
45 #include "tree-ssa-loop-manip.h"
46 #include "tree-ssa-loop.h"
47 #include "cfgloop.h"
48 #include "tree-scalar-evolution.h"
49 #include "tree-vectorizer.h"
50 #include "expr.h"
51 #include "builtins.h"
52 #include "params.h"
53 #include "tree-cfg.h"
54 #include "tree-hash-traits.h"
55 #include "vec-perm-indices.h"
56 #include "internal-fn.h"
57
58 /* Return true if load- or store-lanes optab OPTAB is implemented for
59 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
60
61 static bool
62 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
63 tree vectype, unsigned HOST_WIDE_INT count)
64 {
65 machine_mode mode, array_mode;
66 bool limit_p;
67
68 mode = TYPE_MODE (vectype);
69 if (!targetm.array_mode (mode, count).exists (&array_mode))
70 {
71 poly_uint64 bits = count * GET_MODE_BITSIZE (mode);
72 limit_p = !targetm.array_mode_supported_p (mode, count);
73 if (!int_mode_for_size (bits, limit_p).exists (&array_mode))
74 {
75 if (dump_enabled_p ())
76 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
77 "no array mode for %s[%wu]\n",
78 GET_MODE_NAME (mode), count);
79 return false;
80 }
81 }
82
83 if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
84 {
85 if (dump_enabled_p ())
86 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
87 "cannot use %s<%s><%s>\n", name,
88 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
89 return false;
90 }
91
92 if (dump_enabled_p ())
93 dump_printf_loc (MSG_NOTE, vect_location,
94 "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
95 GET_MODE_NAME (mode));
96
97 return true;
98 }
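/* As an illustrative sketch (a hypothetical loop, not taken from the
   code above): a group of three interleaved loads such as

     for (i = 0; i < n; i++)
       {
         r += rgb[3 * i];
         g += rgb[3 * i + 1];
         b += rgb[3 * i + 2];
       }

   is the kind of access for which the vectorizer queries a load-lanes
   optab with COUNT == 3; whether the group can use it depends on the
   target checks performed above.  */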
99
100
101 /* Return the smallest scalar part of STMT_INFO.
102 This is used to determine the vectype of the stmt. We generally set the
103 vectype according to the type of the result (lhs). For stmts whose
104 result-type is different from the type of the arguments (e.g., demotion,
105 promotion), vectype will be reset appropriately (later). Note that we have
106 to visit the smallest datatype in this function, because that determines the
106 VF. If the smallest datatype in the loop is present only as the rhs of a
107 promotion operation, we would otherwise miss it.
109 Such a case, where a variable of this datatype does not appear in the lhs
110 anywhere in the loop, can only occur if it's an invariant: e.g.:
111 'int_x = (int) short_inv', which we'd expect to have been optimized away by
112 invariant motion. However, we cannot rely on invariant motion to always
113 take invariants out of the loop, and so in the case of promotion we also
114 have to check the rhs.
115 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
116 types. */
117
118 tree
119 vect_get_smallest_scalar_type (stmt_vec_info stmt_info,
120 HOST_WIDE_INT *lhs_size_unit,
121 HOST_WIDE_INT *rhs_size_unit)
122 {
123 tree scalar_type = gimple_expr_type (stmt_info->stmt);
124 HOST_WIDE_INT lhs, rhs;
125
126 /* During the analysis phase, this function is called on arbitrary
127 statements that might not have scalar results. */
128 if (!tree_fits_uhwi_p (TYPE_SIZE_UNIT (scalar_type)))
129 return scalar_type;
130
131 lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
132
133 gassign *assign = dyn_cast <gassign *> (stmt_info->stmt);
134 if (assign
135 && (gimple_assign_cast_p (assign)
136 || gimple_assign_rhs_code (assign) == DOT_PROD_EXPR
137 || gimple_assign_rhs_code (assign) == WIDEN_SUM_EXPR
138 || gimple_assign_rhs_code (assign) == WIDEN_MULT_EXPR
139 || gimple_assign_rhs_code (assign) == WIDEN_LSHIFT_EXPR
140 || gimple_assign_rhs_code (assign) == FLOAT_EXPR))
141 {
142 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (assign));
143
144 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
145 if (rhs < lhs)
146 scalar_type = rhs_type;
147 }
148 else if (gcall *call = dyn_cast <gcall *> (stmt_info->stmt))
149 {
150 unsigned int i = 0;
151 if (gimple_call_internal_p (call))
152 {
153 internal_fn ifn = gimple_call_internal_fn (call);
154 if (internal_load_fn_p (ifn) || internal_store_fn_p (ifn))
155 /* gimple_expr_type already picked the type of the loaded
156 or stored data. */
157 i = ~0U;
158 else if (internal_fn_mask_index (ifn) == 0)
159 i = 1;
160 }
161 if (i < gimple_call_num_args (call))
162 {
163 tree rhs_type = TREE_TYPE (gimple_call_arg (call, i));
164 if (tree_fits_uhwi_p (TYPE_SIZE_UNIT (rhs_type)))
165 {
166 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
167 if (rhs < lhs)
168 scalar_type = rhs_type;
169 }
170 }
171 }
172
173 *lhs_size_unit = lhs;
174 *rhs_size_unit = rhs;
175 return scalar_type;
176 }
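/* A minimal example of the promotion case described above, assuming IN
   has type short int * and OUT has type int *:

     for (i = 0; i < n; i++)
       out[i] = (int) in[i];

   Here the smallest scalar type is the 2-byte short on the rhs, so the
   function returns that type with *LHS_SIZE_UNIT == 4 and
   *RHS_SIZE_UNIT == 2, and it is the short type that determines the
   vectorization factor.  */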
177
178
179 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
180 tested at run-time. Return TRUE if DDR was successfully inserted.
181 Return false if versioning is not supported. */
182
183 static opt_result
184 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
185 {
186 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
187
188 if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
189 return opt_result::failure_at (vect_location,
190 "will not create alias checks, as"
191 " --param vect-max-version-for-alias-checks"
192 " == 0\n");
193
194 opt_result res
195 = runtime_alias_check_p (ddr, loop,
196 optimize_loop_nest_for_speed_p (loop));
197 if (!res)
198 return res;
199
200 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
201 return opt_result::success ();
202 }
203
204 /* Record that loop LOOP_VINFO needs to check that VALUE is nonzero. */
205
206 static void
207 vect_check_nonzero_value (loop_vec_info loop_vinfo, tree value)
208 {
209 vec<tree> checks = LOOP_VINFO_CHECK_NONZERO (loop_vinfo);
210 for (unsigned int i = 0; i < checks.length(); ++i)
211 if (checks[i] == value)
212 return;
213
214 if (dump_enabled_p ())
215 dump_printf_loc (MSG_NOTE, vect_location,
216 "need run-time check that %T is nonzero\n",
217 value);
218 LOOP_VINFO_CHECK_NONZERO (loop_vinfo).safe_push (value);
219 }
220
221 /* Return true if we know that the order of vectorized DR_INFO_A and
222 vectorized DR_INFO_B will be the same as the order of DR_INFO_A and
223 DR_INFO_B. At least one of the accesses is a write. */
224
225 static bool
226 vect_preserves_scalar_order_p (dr_vec_info *dr_info_a, dr_vec_info *dr_info_b)
227 {
228 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
229 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
230
231 /* Single statements are always kept in their original order. */
232 if (!STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
233 && !STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
234 return true;
235
236 /* STMT_A and STMT_B belong to overlapping groups. All loads in a
237 SLP group are emitted at the position of the last scalar load and
238 all loads in an interleaving group are emitted at the position
239 of the first scalar load.
240 Stores in a group are emitted at the position of the last scalar store.
241 Compute that position and check whether the resulting order matches
242 the current one.
243 We have not yet decided between SLP and interleaving so we have
244 to conservatively assume both. */
245 stmt_vec_info il_a;
246 stmt_vec_info last_a = il_a = DR_GROUP_FIRST_ELEMENT (stmtinfo_a);
247 if (last_a)
248 {
249 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (last_a); s;
250 s = DR_GROUP_NEXT_ELEMENT (s))
251 last_a = get_later_stmt (last_a, s);
252 if (!DR_IS_WRITE (STMT_VINFO_DATA_REF (stmtinfo_a)))
253 {
254 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_a); s;
255 s = DR_GROUP_NEXT_ELEMENT (s))
256 if (get_later_stmt (il_a, s) == il_a)
257 il_a = s;
258 }
259 else
260 il_a = last_a;
261 }
262 else
263 last_a = il_a = stmtinfo_a;
264 stmt_vec_info il_b;
265 stmt_vec_info last_b = il_b = DR_GROUP_FIRST_ELEMENT (stmtinfo_b);
266 if (last_b)
267 {
268 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (last_b); s;
269 s = DR_GROUP_NEXT_ELEMENT (s))
270 last_b = get_later_stmt (last_b, s);
271 if (!DR_IS_WRITE (STMT_VINFO_DATA_REF (stmtinfo_b)))
272 {
273 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_b); s;
274 s = DR_GROUP_NEXT_ELEMENT (s))
275 if (get_later_stmt (il_b, s) == il_b)
276 il_b = s;
277 }
278 else
279 il_b = last_b;
280 }
281 else
282 last_b = il_b = stmtinfo_b;
283 bool a_after_b = (get_later_stmt (stmtinfo_a, stmtinfo_b) == stmtinfo_a);
284 return (/* SLP */
285 (get_later_stmt (last_a, last_b) == last_a) == a_after_b
286 /* Interleaving */
287 && (get_later_stmt (il_a, il_b) == il_a) == a_after_b
288 /* Mixed */
289 && (get_later_stmt (il_a, last_b) == il_a) == a_after_b
290 && (get_later_stmt (last_a, il_b) == last_a) == a_after_b);
291 }
292
293 /* A subroutine of vect_analyze_data_ref_dependence. Handle
294 DDR_COULD_BE_INDEPENDENT_P ddr DDR that has a known set of dependence
295 distances. These distances are conservatively correct but they don't
296 reflect a guaranteed dependence.
297
298 Return true if this function does all the work necessary to avoid
299 an alias or false if the caller should use the dependence distances
300 to limit the vectorization factor in the usual way. LOOP_DEPTH is
301 the depth of the loop described by LOOP_VINFO and the other arguments
302 are as for vect_analyze_data_ref_dependence. */
303
304 static bool
305 vect_analyze_possibly_independent_ddr (data_dependence_relation *ddr,
306 loop_vec_info loop_vinfo,
307 int loop_depth, unsigned int *max_vf)
308 {
309 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
310 lambda_vector dist_v;
311 unsigned int i;
312 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
313 {
314 int dist = dist_v[loop_depth];
315 if (dist != 0 && !(dist > 0 && DDR_REVERSED_P (ddr)))
316 {
317 /* If the user asserted safelen >= DIST consecutive iterations
318 can be executed concurrently, assume independence.
319
320 ??? An alternative would be to add the alias check even
321 in this case, and vectorize the fallback loop with the
322 maximum VF set to safelen. However, if the user has
323 explicitly given a length, it's less likely that that
324 would be a win. */
325 if (loop->safelen >= 2 && abs_hwi (dist) <= loop->safelen)
326 {
327 if ((unsigned int) loop->safelen < *max_vf)
328 *max_vf = loop->safelen;
329 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
330 continue;
331 }
332
333 /* For dependence distances of 2 or more, we have the option
334 of limiting VF or checking for an alias at runtime.
335 Prefer to check at runtime if we can, to avoid limiting
336 the VF unnecessarily when the bases are in fact independent.
337
338 Note that the alias checks will be removed if the VF ends up
339 being small enough. */
340 dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (DDR_A (ddr));
341 dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (DDR_B (ddr));
342 return (!STMT_VINFO_GATHER_SCATTER_P (dr_info_a->stmt)
343 && !STMT_VINFO_GATHER_SCATTER_P (dr_info_b->stmt)
344 && vect_mark_for_runtime_alias_test (ddr, loop_vinfo));
345 }
346 }
347 return true;
348 }
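/* A sketch of the situation handled above (simplified; the exact
   conditions under which the dependence analysis marks a ddr as
   possibly independent are not shown): two accesses whose bases may or
   may not refer to the same object, e.g.

     void f (int *p, int *q, int n)
     {
       for (int i = 0; i < n; i++)
         p[i + 4] = q[i] + 1;
     }

   If the bases do overlap, the distance is 4, so with a user assertion
   such as "#pragma omp simd safelen(8)" the distance is ignored and
   *MAX_VF is capped at 8; without it, the code above prefers a runtime
   alias check over limiting the vectorization factor to 4.  */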
349
350
351 /* Function vect_analyze_data_ref_dependence.
352
353 FIXME: I needed to change the sense of the returned flag.
354
355 Return FALSE if there (might) exist a dependence between a memory-reference
356 DRA and a memory-reference DRB. When versioning for alias may check a
357 dependence at run-time, return TRUE. Adjust *MAX_VF according to
358 the data dependence. */
359
360 static opt_result
361 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
362 loop_vec_info loop_vinfo,
363 unsigned int *max_vf)
364 {
365 unsigned int i;
366 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
367 struct data_reference *dra = DDR_A (ddr);
368 struct data_reference *drb = DDR_B (ddr);
369 dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (dra);
370 dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (drb);
371 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
372 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
373 lambda_vector dist_v;
374 unsigned int loop_depth;
375
376 /* In loop analysis all data references should be vectorizable. */
377 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
378 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
379 gcc_unreachable ();
380
381 /* Independent data accesses. */
382 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
383 return opt_result::success ();
384
385 if (dra == drb
386 || (DR_IS_READ (dra) && DR_IS_READ (drb)))
387 return opt_result::success ();
388
389 /* We do not have to consider dependences between accesses that belong
390 to the same group, unless the stride could be smaller than the
391 group size. */
392 if (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
393 && (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
394 == DR_GROUP_FIRST_ELEMENT (stmtinfo_b))
395 && !STMT_VINFO_STRIDED_P (stmtinfo_a))
396 return opt_result::success ();
397
398 /* Even if we have an anti-dependence then, as the vectorized loop covers at
399 least two scalar iterations, there is always also a true dependence.
400 As the vectorizer does not re-order loads and stores we can ignore
401 the anti-dependence if TBAA can disambiguate both DRs similar to the
402 case with known negative distance anti-dependences (positive
403 distance anti-dependences would violate TBAA constraints). */
404 if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
405 || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
406 && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
407 get_alias_set (DR_REF (drb))))
408 return opt_result::success ();
409
410 /* Unknown data dependence. */
411 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
412 {
413 /* If user asserted safelen consecutive iterations can be
414 executed concurrently, assume independence. */
415 if (loop->safelen >= 2)
416 {
417 if ((unsigned int) loop->safelen < *max_vf)
418 *max_vf = loop->safelen;
419 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
420 return opt_result::success ();
421 }
422
423 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
424 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
425 return opt_result::failure_at
426 (stmtinfo_a->stmt,
427 "versioning for alias not supported for: "
428 "can't determine dependence between %T and %T\n",
429 DR_REF (dra), DR_REF (drb));
430
431 if (dump_enabled_p ())
432 dump_printf_loc (MSG_MISSED_OPTIMIZATION, stmtinfo_a->stmt,
433 "versioning for alias required: "
434 "can't determine dependence between %T and %T\n",
435 DR_REF (dra), DR_REF (drb));
436
437 /* Add to list of ddrs that need to be tested at run-time. */
438 return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
439 }
440
441 /* Known data dependence. */
442 if (DDR_NUM_DIST_VECTS (ddr) == 0)
443 {
444 /* If user asserted safelen consecutive iterations can be
445 executed concurrently, assume independence. */
446 if (loop->safelen >= 2)
447 {
448 if ((unsigned int) loop->safelen < *max_vf)
449 *max_vf = loop->safelen;
450 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
451 return opt_result::success ();
452 }
453
454 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
455 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
456 return opt_result::failure_at
457 (stmtinfo_a->stmt,
458 "versioning for alias not supported for: "
459 "bad dist vector for %T and %T\n",
460 DR_REF (dra), DR_REF (drb));
461
462 if (dump_enabled_p ())
463 dump_printf_loc (MSG_MISSED_OPTIMIZATION, stmtinfo_a->stmt,
464 "versioning for alias required: "
465 "bad dist vector for %T and %T\n",
466 DR_REF (dra), DR_REF (drb));
467 /* Add to list of ddrs that need to be tested at run-time. */
468 return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
469 }
470
471 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
472
473 if (DDR_COULD_BE_INDEPENDENT_P (ddr)
474 && vect_analyze_possibly_independent_ddr (ddr, loop_vinfo,
475 loop_depth, max_vf))
476 return opt_result::success ();
477
478 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
479 {
480 int dist = dist_v[loop_depth];
481
482 if (dump_enabled_p ())
483 dump_printf_loc (MSG_NOTE, vect_location,
484 "dependence distance = %d.\n", dist);
485
486 if (dist == 0)
487 {
488 if (dump_enabled_p ())
489 dump_printf_loc (MSG_NOTE, vect_location,
490 "dependence distance == 0 between %T and %T\n",
491 DR_REF (dra), DR_REF (drb));
492
493 /* When we perform grouped accesses and perform implicit CSE
494 by detecting equal accesses and doing disambiguation with
495 runtime alias tests like for
496 .. = a[i];
497 .. = a[i+1];
498 a[i] = ..;
499 a[i+1] = ..;
500 *p = ..;
501 .. = a[i];
502 .. = a[i+1];
503 where we will end up loading { a[i], a[i+1] } once, make
504 sure that inserting group loads before the first load and
505 stores after the last store will do the right thing.
506 Similar for groups like
507 a[i] = ...;
508 ... = a[i];
509 a[i+1] = ...;
510 where loads from the group interleave with the store. */
511 if (!vect_preserves_scalar_order_p (dr_info_a, dr_info_b))
512 return opt_result::failure_at (stmtinfo_a->stmt,
513 "READ_WRITE dependence"
514 " in interleaving.\n");
515
516 if (loop->safelen < 2)
517 {
518 tree indicator = dr_zero_step_indicator (dra);
519 if (!indicator || integer_zerop (indicator))
520 return opt_result::failure_at (stmtinfo_a->stmt,
521 "access also has a zero step\n");
522 else if (TREE_CODE (indicator) != INTEGER_CST)
523 vect_check_nonzero_value (loop_vinfo, indicator);
524 }
525 continue;
526 }
527
528 if (dist > 0 && DDR_REVERSED_P (ddr))
529 {
530 /* If DDR_REVERSED_P the order of the data-refs in DDR was
531 reversed (to make distance vector positive), and the actual
532 distance is negative. */
533 if (dump_enabled_p ())
534 dump_printf_loc (MSG_NOTE, vect_location,
535 "dependence distance negative.\n");
536 /* When doing outer loop vectorization, we need to check if there is
537 a backward dependence at the inner loop level if the dependence
538 at the outer loop is reversed. See PR81740. */
539 if (nested_in_vect_loop_p (loop, stmtinfo_a)
540 || nested_in_vect_loop_p (loop, stmtinfo_b))
541 {
542 unsigned inner_depth = index_in_loop_nest (loop->inner->num,
543 DDR_LOOP_NEST (ddr));
544 if (dist_v[inner_depth] < 0)
545 return opt_result::failure_at (stmtinfo_a->stmt,
546 "not vectorized, dependence "
547 "between data-refs %T and %T\n",
548 DR_REF (dra), DR_REF (drb));
549 }
550 /* Record a negative dependence distance to later limit the
551 amount of stmt copying / unrolling we can perform.
552 Only need to handle read-after-write dependence. */
553 if (DR_IS_READ (drb)
554 && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
555 || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
556 STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
557 continue;
558 }
559
560 unsigned int abs_dist = abs (dist);
561 if (abs_dist >= 2 && abs_dist < *max_vf)
562 {
563 /* The dependence distance requires reduction of the maximal
564 vectorization factor. */
565 *max_vf = abs_dist;
566 if (dump_enabled_p ())
567 dump_printf_loc (MSG_NOTE, vect_location,
568 "adjusting maximal vectorization factor to %i\n",
569 *max_vf);
570 }
571
572 if (abs_dist >= *max_vf)
573 {
574 /* Dependence distance does not create dependence, as far as
575 vectorization is concerned, in this case. */
576 if (dump_enabled_p ())
577 dump_printf_loc (MSG_NOTE, vect_location,
578 "dependence distance >= VF.\n");
579 continue;
580 }
581
582 return opt_result::failure_at (stmtinfo_a->stmt,
583 "not vectorized, possible dependence "
584 "between data-refs %T and %T\n",
585 DR_REF (dra), DR_REF (drb));
586 }
587
588 return opt_result::success ();
589 }
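/* A worked example of the distance handling above:

     for (i = 0; i < n; i++)
       a[i + 3] = a[i] + 1;

   has a dependence distance of 3, so the loop is still vectorizable
   but *MAX_VF is reduced to 3. A distance of 0 (the same location
   read and written in one iteration) is accepted subject to the
   ordering and zero-step checks above, and a distance that is already
   >= *MAX_VF does not constrain vectorization at all.  */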
590
591 /* Function vect_analyze_data_ref_dependences.
592
593 Examine all the data references in the loop, and make sure there do not
594 exist any data dependences between them. Set *MAX_VF according to
595 the maximum vectorization factor the data dependences allow. */
596
597 opt_result
598 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo,
599 unsigned int *max_vf)
600 {
601 unsigned int i;
602 struct data_dependence_relation *ddr;
603
604 DUMP_VECT_SCOPE ("vect_analyze_data_ref_dependences");
605
606 if (!LOOP_VINFO_DDRS (loop_vinfo).exists ())
607 {
608 LOOP_VINFO_DDRS (loop_vinfo)
609 .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
610 * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
611 /* We need read-read dependences to compute
612 STMT_VINFO_SAME_ALIGN_REFS. */
613 bool res = compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
614 &LOOP_VINFO_DDRS (loop_vinfo),
615 LOOP_VINFO_LOOP_NEST (loop_vinfo),
616 true);
617 gcc_assert (res);
618 }
619
620 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
621
622 /* For epilogues we either have no aliases or alias versioning
623 was applied to the original loop. Therefore we may just get max_vf
624 using the VF of the original loop. */
625 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
626 *max_vf = LOOP_VINFO_ORIG_MAX_VECT_FACTOR (loop_vinfo);
627 else
628 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
629 {
630 opt_result res
631 = vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf);
632 if (!res)
633 return res;
634 }
635
636 return opt_result::success ();
637 }
638
639
640 /* Function vect_slp_analyze_data_ref_dependence.
641
642 Return TRUE if there (might) exist a dependence between a memory-reference
643 DRA and a memory-reference DRB for VINFO. Return FALSE if the two
644 references are known to be independent, e.g. because both are reads
645 or they belong to the same interleaving chain. */
646
647 static bool
648 vect_slp_analyze_data_ref_dependence (vec_info *vinfo,
649 struct data_dependence_relation *ddr)
650 {
651 struct data_reference *dra = DDR_A (ddr);
652 struct data_reference *drb = DDR_B (ddr);
653 dr_vec_info *dr_info_a = vinfo->lookup_dr (dra);
654 dr_vec_info *dr_info_b = vinfo->lookup_dr (drb);
655
656 /* We need to check dependences of statements marked as unvectorizable
657 as well; they can still prohibit vectorization. */
658
659 /* Independent data accesses. */
660 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
661 return false;
662
663 if (dra == drb)
664 return false;
665
666 /* Read-read is OK. */
667 if (DR_IS_READ (dra) && DR_IS_READ (drb))
668 return false;
669
670 /* If dra and drb are part of the same interleaving chain consider
671 them independent. */
672 if (STMT_VINFO_GROUPED_ACCESS (dr_info_a->stmt)
673 && (DR_GROUP_FIRST_ELEMENT (dr_info_a->stmt)
674 == DR_GROUP_FIRST_ELEMENT (dr_info_b->stmt)))
675 return false;
676
677 /* Unknown data dependence. */
678 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
679 {
680 if (dump_enabled_p ())
681 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
682 "can't determine dependence between %T and %T\n",
683 DR_REF (dra), DR_REF (drb));
684 }
685 else if (dump_enabled_p ())
686 dump_printf_loc (MSG_NOTE, vect_location,
687 "determined dependence between %T and %T\n",
688 DR_REF (dra), DR_REF (drb));
689
690 return true;
691 }
692
693
694 /* Analyze dependences involved in the transform of SLP NODE. STORES
695 contains the scalar stores of this instance if we are
696 disambiguating the loads. */
697
698 static bool
699 vect_slp_analyze_node_dependences (slp_instance instance, slp_tree node,
700 vec<stmt_vec_info> stores,
701 stmt_vec_info last_store_info)
702 {
703 /* This walks over all stmts involved in the SLP load/store done
704 in NODE verifying we can sink them up to the last stmt in the
705 group. */
706 stmt_vec_info last_access_info = vect_find_last_scalar_stmt_in_slp (node);
707 vec_info *vinfo = last_access_info->vinfo;
708 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
709 {
710 stmt_vec_info access_info = SLP_TREE_SCALAR_STMTS (node)[k];
711 if (access_info == last_access_info)
712 continue;
713 data_reference *dr_a = STMT_VINFO_DATA_REF (access_info);
714 ao_ref ref;
715 bool ref_initialized_p = false;
716 for (gimple_stmt_iterator gsi = gsi_for_stmt (access_info->stmt);
717 gsi_stmt (gsi) != last_access_info->stmt; gsi_next (&gsi))
718 {
719 gimple *stmt = gsi_stmt (gsi);
720 if (! gimple_vuse (stmt)
721 || (DR_IS_READ (dr_a) && ! gimple_vdef (stmt)))
722 continue;
723
724 /* If we couldn't record a (single) data reference for this
725 stmt we have to resort to the alias oracle. */
726 stmt_vec_info stmt_info = vinfo->lookup_stmt (stmt);
727 data_reference *dr_b = STMT_VINFO_DATA_REF (stmt_info);
728 if (!dr_b)
729 {
730 /* We are moving a store or sinking a load - this means
731 we cannot use TBAA for disambiguation. */
732 if (!ref_initialized_p)
733 ao_ref_init (&ref, DR_REF (dr_a));
734 if (stmt_may_clobber_ref_p_1 (stmt, &ref, false)
735 || ref_maybe_used_by_stmt_p (stmt, &ref, false))
736 return false;
737 continue;
738 }
739
740 bool dependent = false;
741 /* If we run into a store of this same instance (we've just
742 marked those) then delay dependence checking until we run
743 into the last store because this is where it will have
744 been sunk to (and we verify if we can do that as well). */
745 if (gimple_visited_p (stmt))
746 {
747 if (stmt_info != last_store_info)
748 continue;
749 unsigned i;
750 stmt_vec_info store_info;
751 FOR_EACH_VEC_ELT (stores, i, store_info)
752 {
753 data_reference *store_dr = STMT_VINFO_DATA_REF (store_info);
754 ddr_p ddr = initialize_data_dependence_relation
755 (dr_a, store_dr, vNULL);
756 dependent
757 = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
758 free_dependence_relation (ddr);
759 if (dependent)
760 break;
761 }
762 }
763 else
764 {
765 ddr_p ddr = initialize_data_dependence_relation (dr_a,
766 dr_b, vNULL);
767 dependent = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
768 free_dependence_relation (ddr);
769 }
770 if (dependent)
771 return false;
772 }
773 }
774 return true;
775 }
776
777
778 /* Function vect_slp_analyze_instance_dependence.
779
780 Examine the data references involved in SLP instance INSTANCE and
781 return TRUE if no data dependence prevents the loads and stores of
782 the instance from being vectorized. */
783
784 bool
785 vect_slp_analyze_instance_dependence (slp_instance instance)
786 {
787 DUMP_VECT_SCOPE ("vect_slp_analyze_instance_dependence");
788
789 /* The stores of this instance are at the root of the SLP tree. */
790 slp_tree store = SLP_INSTANCE_TREE (instance);
791 if (! STMT_VINFO_DATA_REF (SLP_TREE_SCALAR_STMTS (store)[0]))
792 store = NULL;
793
794 /* Verify we can sink stores to the vectorized stmt insert location. */
795 stmt_vec_info last_store_info = NULL;
796 if (store)
797 {
798 if (! vect_slp_analyze_node_dependences (instance, store, vNULL, NULL))
799 return false;
800
801 /* Mark stores in this instance and remember the last one. */
802 last_store_info = vect_find_last_scalar_stmt_in_slp (store);
803 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
804 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, true);
805 }
806
807 bool res = true;
808
809 /* Verify we can sink loads to the vectorized stmt insert location,
810 special-casing stores of this instance. */
811 slp_tree load;
812 unsigned int i;
813 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load)
814 if (! vect_slp_analyze_node_dependences (instance, load,
815 store
816 ? SLP_TREE_SCALAR_STMTS (store)
817 : vNULL, last_store_info))
818 {
819 res = false;
820 break;
821 }
822
823 /* Unset the visited flag. */
824 if (store)
825 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
826 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, false);
827
828 return res;
829 }
830
831 /* Record the base alignment guarantee given by DRB, which occurs
832 in STMT_INFO. */
833
834 static void
835 vect_record_base_alignment (stmt_vec_info stmt_info,
836 innermost_loop_behavior *drb)
837 {
838 vec_info *vinfo = stmt_info->vinfo;
839 bool existed;
840 innermost_loop_behavior *&entry
841 = vinfo->base_alignments.get_or_insert (drb->base_address, &existed);
842 if (!existed || entry->base_alignment < drb->base_alignment)
843 {
844 entry = drb;
845 if (dump_enabled_p ())
846 dump_printf_loc (MSG_NOTE, vect_location,
847 "recording new base alignment for %T\n"
848 " alignment: %d\n"
849 " misalignment: %d\n"
850 " based on: %G",
851 drb->base_address,
852 drb->base_alignment,
853 drb->base_misalignment,
854 stmt_info->stmt);
855 }
856 }
857
858 /* If the region we're going to vectorize is reached, all unconditional
859 data references occur at least once. We can therefore pool the base
860 alignment guarantees from each unconditional reference. Do this by
861 going through all the data references in VINFO and checking whether
862 the containing statement makes the reference unconditionally. If so,
863 record the alignment of the base address in VINFO so that it can be
864 used for all other references with the same base. */
865
866 void
867 vect_record_base_alignments (vec_info *vinfo)
868 {
869 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
870 class loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
871 data_reference *dr;
872 unsigned int i;
873 FOR_EACH_VEC_ELT (vinfo->shared->datarefs, i, dr)
874 {
875 dr_vec_info *dr_info = vinfo->lookup_dr (dr);
876 stmt_vec_info stmt_info = dr_info->stmt;
877 if (!DR_IS_CONDITIONAL_IN_STMT (dr)
878 && STMT_VINFO_VECTORIZABLE (stmt_info)
879 && !STMT_VINFO_GATHER_SCATTER_P (stmt_info))
880 {
881 vect_record_base_alignment (stmt_info, &DR_INNERMOST (dr));
882
883 /* If DR is nested in the loop that is being vectorized, we can also
884 record the alignment of the base wrt the outer loop. */
885 if (loop && nested_in_vect_loop_p (loop, stmt_info))
886 vect_record_base_alignment
887 (stmt_info, &STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info));
888 }
889 }
890 }
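/* A small sketch of the pooling above: assuming P is a float pointer
   whose unconditional references let us record a 32-byte base
   alignment,

     for (i = 0; i < n; i++)
       p[i] = p[i] + 1.0f;

   the guarantee recorded for the load's base address can later be
   reused by vect_compute_data_ref_alignment for the store, which
   shares the same base.  */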
891
892 /* Return the target alignment for the vectorized form of DR_INFO. */
893
894 static poly_uint64
895 vect_calculate_target_alignment (dr_vec_info *dr_info)
896 {
897 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
898 return targetm.vectorize.preferred_vector_alignment (vectype);
899 }
900
901 /* Function vect_compute_data_ref_alignment
902
903 Compute the misalignment of the data reference DR_INFO.
904
905 Output:
906 1. DR_MISALIGNMENT (DR_INFO) is defined.
907
908 FOR NOW: No analysis is actually performed. Misalignment is calculated
909 only for trivial cases. TODO. */
910
911 static void
912 vect_compute_data_ref_alignment (dr_vec_info *dr_info)
913 {
914 stmt_vec_info stmt_info = dr_info->stmt;
915 vec_base_alignments *base_alignments = &stmt_info->vinfo->base_alignments;
916 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
917 class loop *loop = NULL;
918 tree ref = DR_REF (dr_info->dr);
919 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
920
921 if (dump_enabled_p ())
922 dump_printf_loc (MSG_NOTE, vect_location,
923 "vect_compute_data_ref_alignment:\n");
924
925 if (loop_vinfo)
926 loop = LOOP_VINFO_LOOP (loop_vinfo);
927
928 /* Initialize misalignment to unknown. */
929 SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN);
930
931 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
932 return;
933
934 innermost_loop_behavior *drb = vect_dr_behavior (dr_info);
935 bool step_preserves_misalignment_p;
936
937 poly_uint64 vector_alignment
938 = exact_div (vect_calculate_target_alignment (dr_info), BITS_PER_UNIT);
939 DR_TARGET_ALIGNMENT (dr_info) = vector_alignment;
940
941 unsigned HOST_WIDE_INT vect_align_c;
942 if (!vector_alignment.is_constant (&vect_align_c))
943 return;
944
945 /* No step for BB vectorization. */
946 if (!loop)
947 {
948 gcc_assert (integer_zerop (drb->step));
949 step_preserves_misalignment_p = true;
950 }
951
952 /* In case the dataref is in an inner-loop of the loop that is being
953 vectorized (LOOP), we use the base and misalignment information
954 relative to the outer-loop (LOOP). This is ok only if the misalignment
955 stays the same throughout the execution of the inner-loop, which is why
956 we have to check that the stride of the dataref in the inner-loop is a
957 multiple of the vector alignment. */
958 else if (nested_in_vect_loop_p (loop, stmt_info))
959 {
960 step_preserves_misalignment_p
961 = (DR_STEP_ALIGNMENT (dr_info->dr) % vect_align_c) == 0;
962
963 if (dump_enabled_p ())
964 {
965 if (step_preserves_misalignment_p)
966 dump_printf_loc (MSG_NOTE, vect_location,
967 "inner step divides the vector alignment.\n");
968 else
969 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
970 "inner step doesn't divide the vector"
971 " alignment.\n");
972 }
973 }
974
975 /* Similarly we can only use base and misalignment information relative to
976 an innermost loop if the misalignment stays the same throughout the
977 execution of the loop. As above, this is the case if the stride of
978 the dataref times the vectorization factor is a multiple of the alignment. */
979 else
980 {
981 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
982 step_preserves_misalignment_p
983 = multiple_p (DR_STEP_ALIGNMENT (dr_info->dr) * vf, vect_align_c);
984
985 if (!step_preserves_misalignment_p && dump_enabled_p ())
986 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
987 "step doesn't divide the vector alignment.\n");
988 }
989
990 unsigned int base_alignment = drb->base_alignment;
991 unsigned int base_misalignment = drb->base_misalignment;
992
993 /* Calculate the maximum of the pooled base address alignment and the
994 alignment that we can compute for DR itself. */
995 innermost_loop_behavior **entry = base_alignments->get (drb->base_address);
996 if (entry && base_alignment < (*entry)->base_alignment)
997 {
998 base_alignment = (*entry)->base_alignment;
999 base_misalignment = (*entry)->base_misalignment;
1000 }
1001
1002 if (drb->offset_alignment < vect_align_c
1003 || !step_preserves_misalignment_p
1004 /* We need to know whether the step wrt the vectorized loop is
1005 negative when computing the starting misalignment below. */
1006 || TREE_CODE (drb->step) != INTEGER_CST)
1007 {
1008 if (dump_enabled_p ())
1009 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1010 "Unknown alignment for access: %T\n", ref);
1011 return;
1012 }
1013
1014 if (base_alignment < vect_align_c)
1015 {
1016 unsigned int max_alignment;
1017 tree base = get_base_for_alignment (drb->base_address, &max_alignment);
1018 if (max_alignment < vect_align_c
1019 || !vect_can_force_dr_alignment_p (base,
1020 vect_align_c * BITS_PER_UNIT))
1021 {
1022 if (dump_enabled_p ())
1023 dump_printf_loc (MSG_NOTE, vect_location,
1024 "can't force alignment of ref: %T\n", ref);
1025 return;
1026 }
1027
1028 /* Force the alignment of the decl.
1029 NOTE: This is the only change to the code we make during
1030 the analysis phase, before deciding to vectorize the loop. */
1031 if (dump_enabled_p ())
1032 dump_printf_loc (MSG_NOTE, vect_location,
1033 "force alignment of %T\n", ref);
1034
1035 dr_info->base_decl = base;
1036 dr_info->base_misaligned = true;
1037 base_misalignment = 0;
1038 }
1039 poly_int64 misalignment
1040 = base_misalignment + wi::to_poly_offset (drb->init).force_shwi ();
1041
1042 /* If this is a backward running DR then the first access in the larger
1043 vectype actually is N-1 elements before the address in the DR.
1044 Adjust misalign accordingly. */
1045 if (tree_int_cst_sgn (drb->step) < 0)
1046 /* PLUS because STEP is negative. */
1047 misalignment += ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
1048 * TREE_INT_CST_LOW (drb->step));
1049
1050 unsigned int const_misalignment;
1051 if (!known_misalignment (misalignment, vect_align_c, &const_misalignment))
1052 {
1053 if (dump_enabled_p ())
1054 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1055 "Non-constant misalignment for access: %T\n", ref);
1056 return;
1057 }
1058
1059 SET_DR_MISALIGNMENT (dr_info, const_misalignment);
1060
1061 if (dump_enabled_p ())
1062 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1063 "misalign = %d bytes of ref %T\n",
1064 DR_MISALIGNMENT (dr_info), ref);
1065
1066 return;
1067 }
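/* A worked example of the computation above, assuming a 16-byte target
   vector alignment: for a reference to a[1] where "a" is an array of
   4-byte ints known to be 16-byte aligned, base_misalignment is 0 and
   DRB->init is 4, so DR_MISALIGNMENT becomes 4 bytes. If instead the
   base alignment were unknown and could not be forced up to 16 bytes,
   the misalignment would stay DR_MISALIGNMENT_UNKNOWN.  */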
1068
1069 /* Function vect_update_misalignment_for_peel.
1070 Sets DR_INFO's misalignment
1071 - to 0 if it has the same alignment as DR_PEEL_INFO,
1072 - to the misalignment computed using NPEEL if DR_INFO's misalignment is known,
1073 - to -1 (unknown) otherwise.
1074
1075 DR_INFO - the data reference whose misalignment is to be adjusted.
1076 DR_PEEL_INFO - the data reference whose misalignment is being made
1077 zero in the vector loop by the peel.
1078 NPEEL - the number of iterations in the peel loop if the misalignment
1079 of DR_PEEL_INFO is known at compile time. */
1080
1081 static void
1082 vect_update_misalignment_for_peel (dr_vec_info *dr_info,
1083 dr_vec_info *dr_peel_info, int npeel)
1084 {
1085 unsigned int i;
1086 vec<dr_p> same_aligned_drs;
1087 struct data_reference *current_dr;
1088 stmt_vec_info peel_stmt_info = dr_peel_info->stmt;
1089
1090 /* It can be assumed that if dr_info has the same alignment as dr_peel,
1091 it is aligned in the vector loop. */
1092 same_aligned_drs = STMT_VINFO_SAME_ALIGN_REFS (peel_stmt_info);
1093 FOR_EACH_VEC_ELT (same_aligned_drs, i, current_dr)
1094 {
1095 if (current_dr != dr_info->dr)
1096 continue;
1097 gcc_assert (!known_alignment_for_access_p (dr_info)
1098 || !known_alignment_for_access_p (dr_peel_info)
1099 || (DR_MISALIGNMENT (dr_info)
1100 == DR_MISALIGNMENT (dr_peel_info)));
1101 SET_DR_MISALIGNMENT (dr_info, 0);
1102 return;
1103 }
1104
1105 unsigned HOST_WIDE_INT alignment;
1106 if (DR_TARGET_ALIGNMENT (dr_info).is_constant (&alignment)
1107 && known_alignment_for_access_p (dr_info)
1108 && known_alignment_for_access_p (dr_peel_info))
1109 {
1110 int misal = DR_MISALIGNMENT (dr_info);
1111 misal += npeel * TREE_INT_CST_LOW (DR_STEP (dr_info->dr));
1112 misal &= alignment - 1;
1113 SET_DR_MISALIGNMENT (dr_info, misal);
1114 return;
1115 }
1116
1117 if (dump_enabled_p ())
1118 dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment " \
1119 "to unknown (-1).\n");
1120 SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN);
1121 }
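/* A small arithmetic sketch of the NPEEL adjustment above: with a
   16-byte target alignment, a data reference whose misalignment is 8
   bytes and whose step is 4 bytes ends up with (8 + 2 * 4) & 15 == 0
   after peeling NPEEL == 2 iterations, i.e. the peel happens to make
   it aligned.  */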
1122
1123
1124 /* Function verify_data_ref_alignment
1125
1126 Return TRUE if DR_INFO can be handled with respect to alignment. */
1127
1128 static opt_result
1129 verify_data_ref_alignment (dr_vec_info *dr_info)
1130 {
1131 enum dr_alignment_support supportable_dr_alignment
1132 = vect_supportable_dr_alignment (dr_info, false);
1133 if (!supportable_dr_alignment)
1134 return opt_result::failure_at
1135 (dr_info->stmt->stmt,
1136 DR_IS_READ (dr_info->dr)
1137 ? "not vectorized: unsupported unaligned load: %T\n"
1138 : "not vectorized: unsupported unaligned store: %T\n",
1139 DR_REF (dr_info->dr));
1140
1141 if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
1142 dump_printf_loc (MSG_NOTE, vect_location,
1143 "Vectorizing an unaligned access.\n");
1144
1145 return opt_result::success ();
1146 }
1147
1148 /* Function vect_verify_datarefs_alignment
1149
1150 Return TRUE if all data references in the loop can be
1151 handled with respect to alignment. */
1152
1153 opt_result
1154 vect_verify_datarefs_alignment (loop_vec_info vinfo)
1155 {
1156 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
1157 struct data_reference *dr;
1158 unsigned int i;
1159
1160 FOR_EACH_VEC_ELT (datarefs, i, dr)
1161 {
1162 dr_vec_info *dr_info = vinfo->lookup_dr (dr);
1163 stmt_vec_info stmt_info = dr_info->stmt;
1164
1165 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1166 continue;
1167
1168 /* For interleaving, only the alignment of the first access matters. */
1169 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1170 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1171 continue;
1172
1173 /* Strided accesses perform only component accesses, alignment is
1174 irrelevant for them. */
1175 if (STMT_VINFO_STRIDED_P (stmt_info)
1176 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1177 continue;
1178
1179 opt_result res = verify_data_ref_alignment (dr_info);
1180 if (!res)
1181 return res;
1182 }
1183
1184 return opt_result::success ();
1185 }
1186
1187 /* Given a memory reference EXP, return whether its alignment is less
1188 than its size. */
1189
1190 static bool
1191 not_size_aligned (tree exp)
1192 {
1193 if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
1194 return true;
1195
1196 return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
1197 > get_object_alignment (exp));
1198 }
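/* For example, a 4-byte int member of a packed struct is typically
   only byte aligned, so get_object_alignment returns 8 bits while
   TYPE_SIZE is 32 bits and the access counts as not size aligned.  */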
1199
1200 /* Function vector_alignment_reachable_p
1201
1202 Return true if vector alignment for DR_INFO is reachable by peeling
1203 a few loop iterations. Return false otherwise. */
1204
1205 static bool
1206 vector_alignment_reachable_p (dr_vec_info *dr_info)
1207 {
1208 stmt_vec_info stmt_info = dr_info->stmt;
1209 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1210
1211 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1212 {
1213 /* For interleaved accesses we peel only if the number of iterations
1214 in the prologue loop (VF - misalignment) is a multiple of the
1215 number of interleaved accesses. */
1216 int elem_size, mis_in_elements;
1217
1218 /* FORNOW: handle only known alignment. */
1219 if (!known_alignment_for_access_p (dr_info))
1220 return false;
1221
1222 poly_uint64 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1223 poly_uint64 vector_size = GET_MODE_SIZE (TYPE_MODE (vectype));
1224 elem_size = vector_element_size (vector_size, nelements);
1225 mis_in_elements = DR_MISALIGNMENT (dr_info) / elem_size;
1226
1227 if (!multiple_p (nelements - mis_in_elements, DR_GROUP_SIZE (stmt_info)))
1228 return false;
1229 }
1230
1231 /* If misalignment is known at compile time then allow peeling
1232 only if natural alignment is reachable through peeling. */
1233 if (known_alignment_for_access_p (dr_info) && !aligned_access_p (dr_info))
1234 {
1235 HOST_WIDE_INT elmsize =
1236 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1237 if (dump_enabled_p ())
1238 {
1239 dump_printf_loc (MSG_NOTE, vect_location,
1240 "data size = %wd. misalignment = %d.\n", elmsize,
1241 DR_MISALIGNMENT (dr_info));
1242 }
1243 if (DR_MISALIGNMENT (dr_info) % elmsize)
1244 {
1245 if (dump_enabled_p ())
1246 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1247 "data size does not divide the misalignment.\n");
1248 return false;
1249 }
1250 }
1251
1252 if (!known_alignment_for_access_p (dr_info))
1253 {
1254 tree type = TREE_TYPE (DR_REF (dr_info->dr));
1255 bool is_packed = not_size_aligned (DR_REF (dr_info->dr));
1256 if (dump_enabled_p ())
1257 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1258 "Unknown misalignment, %snaturally aligned\n",
1259 is_packed ? "not " : "");
1260 return targetm.vectorize.vector_alignment_reachable (type, is_packed);
1261 }
1262
1263 return true;
1264 }
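/* To illustrate the divisibility check above: an access with 4-byte
   elements and a known misalignment of 6 bytes can never be made
   aligned by peeling whole iterations (6 % 4 != 0), whereas a
   misalignment of 8 bytes can be peeled away.  */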
1265
1266
1267 /* Calculate the cost of the memory access represented by DR_INFO. */
1268
1269 static void
1270 vect_get_data_access_cost (dr_vec_info *dr_info,
1271 unsigned int *inside_cost,
1272 unsigned int *outside_cost,
1273 stmt_vector_for_cost *body_cost_vec,
1274 stmt_vector_for_cost *prologue_cost_vec)
1275 {
1276 stmt_vec_info stmt_info = dr_info->stmt;
1277 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1278 int ncopies;
1279
1280 if (PURE_SLP_STMT (stmt_info))
1281 ncopies = 1;
1282 else
1283 ncopies = vect_get_num_copies (loop_vinfo, STMT_VINFO_VECTYPE (stmt_info));
1284
1285 if (DR_IS_READ (dr_info->dr))
1286 vect_get_load_cost (stmt_info, ncopies, true, inside_cost, outside_cost,
1287 prologue_cost_vec, body_cost_vec, false);
1288 else
1289 vect_get_store_cost (stmt_info, ncopies, inside_cost, body_cost_vec);
1290
1291 if (dump_enabled_p ())
1292 dump_printf_loc (MSG_NOTE, vect_location,
1293 "vect_get_data_access_cost: inside_cost = %d, "
1294 "outside_cost = %d.\n", *inside_cost, *outside_cost);
1295 }
1296
1297
1298 typedef struct _vect_peel_info
1299 {
1300 dr_vec_info *dr_info;
1301 int npeel;
1302 unsigned int count;
1303 } *vect_peel_info;
1304
1305 typedef struct _vect_peel_extended_info
1306 {
1307 struct _vect_peel_info peel_info;
1308 unsigned int inside_cost;
1309 unsigned int outside_cost;
1310 } *vect_peel_extended_info;
1311
1312
1313 /* Peeling hashtable helpers. */
1314
1315 struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
1316 {
1317 static inline hashval_t hash (const _vect_peel_info *);
1318 static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
1319 };
1320
1321 inline hashval_t
1322 peel_info_hasher::hash (const _vect_peel_info *peel_info)
1323 {
1324 return (hashval_t) peel_info->npeel;
1325 }
1326
1327 inline bool
1328 peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
1329 {
1330 return (a->npeel == b->npeel);
1331 }
1332
1333
1334 /* Insert DR_INFO into peeling hash table with NPEEL as key. */
1335
1336 static void
1337 vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
1338 loop_vec_info loop_vinfo, dr_vec_info *dr_info,
1339 int npeel)
1340 {
1341 struct _vect_peel_info elem, *slot;
1342 _vect_peel_info **new_slot;
1343 bool supportable_dr_alignment
1344 = vect_supportable_dr_alignment (dr_info, true);
1345
1346 elem.npeel = npeel;
1347 slot = peeling_htab->find (&elem);
1348 if (slot)
1349 slot->count++;
1350 else
1351 {
1352 slot = XNEW (struct _vect_peel_info);
1353 slot->npeel = npeel;
1354 slot->dr_info = dr_info;
1355 slot->count = 1;
1356 new_slot = peeling_htab->find_slot (slot, INSERT);
1357 *new_slot = slot;
1358 }
1359
1360 if (!supportable_dr_alignment
1361 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1362 slot->count += VECT_MAX_COST;
1363 }
1364
1365
1366 /* Traverse peeling hash table to find peeling option that aligns maximum
1367 number of data accesses. */
1368
1369 int
1370 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1371 _vect_peel_extended_info *max)
1372 {
1373 vect_peel_info elem = *slot;
1374
1375 if (elem->count > max->peel_info.count
1376 || (elem->count == max->peel_info.count
1377 && max->peel_info.npeel > elem->npeel))
1378 {
1379 max->peel_info.npeel = elem->npeel;
1380 max->peel_info.count = elem->count;
1381 max->peel_info.dr_info = elem->dr_info;
1382 }
1383
1384 return 1;
1385 }
1386
1387 /* Get the costs of peeling NPEEL iterations for LOOP_VINFO, checking
1388 data access costs for all data refs. If UNKNOWN_MISALIGNMENT is true,
1389 we assume DR0_INFO's misalignment will be zero after peeling. */
1390
1391 static void
1392 vect_get_peeling_costs_all_drs (loop_vec_info loop_vinfo,
1393 dr_vec_info *dr0_info,
1394 unsigned int *inside_cost,
1395 unsigned int *outside_cost,
1396 stmt_vector_for_cost *body_cost_vec,
1397 stmt_vector_for_cost *prologue_cost_vec,
1398 unsigned int npeel,
1399 bool unknown_misalignment)
1400 {
1401 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1402 unsigned i;
1403 data_reference *dr;
1404
1405 FOR_EACH_VEC_ELT (datarefs, i, dr)
1406 {
1407 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1408 stmt_vec_info stmt_info = dr_info->stmt;
1409 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1410 continue;
1411
1412 /* For interleaving, only the alignment of the first access
1413 matters. */
1414 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1415 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1416 continue;
1417
1418 /* Strided accesses perform only component accesses, alignment is
1419 irrelevant for them. */
1420 if (STMT_VINFO_STRIDED_P (stmt_info)
1421 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1422 continue;
1423
1424 int save_misalignment;
1425 save_misalignment = DR_MISALIGNMENT (dr_info);
1426 if (npeel == 0)
1427 ;
1428 else if (unknown_misalignment && dr_info == dr0_info)
1429 SET_DR_MISALIGNMENT (dr_info, 0);
1430 else
1431 vect_update_misalignment_for_peel (dr_info, dr0_info, npeel);
1432 vect_get_data_access_cost (dr_info, inside_cost, outside_cost,
1433 body_cost_vec, prologue_cost_vec);
1434 SET_DR_MISALIGNMENT (dr_info, save_misalignment);
1435 }
1436 }
1437
1438 /* Traverse peeling hash table and calculate cost for each peeling option.
1439 Find the one with the lowest cost. */
1440
1441 int
1442 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1443 _vect_peel_extended_info *min)
1444 {
1445 vect_peel_info elem = *slot;
1446 int dummy;
1447 unsigned int inside_cost = 0, outside_cost = 0;
1448 stmt_vec_info stmt_info = elem->dr_info->stmt;
1449 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1450 stmt_vector_for_cost prologue_cost_vec, body_cost_vec,
1451 epilogue_cost_vec;
1452
1453 prologue_cost_vec.create (2);
1454 body_cost_vec.create (2);
1455 epilogue_cost_vec.create (2);
1456
1457 vect_get_peeling_costs_all_drs (loop_vinfo, elem->dr_info, &inside_cost,
1458 &outside_cost, &body_cost_vec,
1459 &prologue_cost_vec, elem->npeel, false);
1460
1461 body_cost_vec.release ();
1462
1463 outside_cost += vect_get_known_peeling_cost
1464 (loop_vinfo, elem->npeel, &dummy,
1465 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1466 &prologue_cost_vec, &epilogue_cost_vec);
1467
1468 /* Prologue and epilogue costs are added to the target model later.
1469 These costs depend only on the scalar iteration cost, the
1470 number of peeling iterations finally chosen, and the number of
1471 misaligned statements. So discard the information found here. */
1472 prologue_cost_vec.release ();
1473 epilogue_cost_vec.release ();
1474
1475 if (inside_cost < min->inside_cost
1476 || (inside_cost == min->inside_cost
1477 && outside_cost < min->outside_cost))
1478 {
1479 min->inside_cost = inside_cost;
1480 min->outside_cost = outside_cost;
1481 min->peel_info.dr_info = elem->dr_info;
1482 min->peel_info.npeel = elem->npeel;
1483 min->peel_info.count = elem->count;
1484 }
1485
1486 return 1;
1487 }
1488
1489
1490 /* Choose best peeling option by traversing peeling hash table and either
1491 choosing an option with the lowest cost (if cost model is enabled) or the
1492 option that aligns as many accesses as possible. */
1493
1494 static struct _vect_peel_extended_info
1495 vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
1496 loop_vec_info loop_vinfo)
1497 {
1498 struct _vect_peel_extended_info res;
1499
1500 res.peel_info.dr_info = NULL;
1501
1502 if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1503 {
1504 res.inside_cost = INT_MAX;
1505 res.outside_cost = INT_MAX;
1506 peeling_htab->traverse <_vect_peel_extended_info *,
1507 vect_peeling_hash_get_lowest_cost> (&res);
1508 }
1509 else
1510 {
1511 res.peel_info.count = 0;
1512 peeling_htab->traverse <_vect_peel_extended_info *,
1513 vect_peeling_hash_get_most_frequent> (&res);
1514 res.inside_cost = 0;
1515 res.outside_cost = 0;
1516 }
1517
1518 return res;
1519 }
1520
1521 /* Return true if the new peeling NPEEL is supported. */
1522
1523 static bool
1524 vect_peeling_supportable (loop_vec_info loop_vinfo, dr_vec_info *dr0_info,
1525 unsigned npeel)
1526 {
1527 unsigned i;
1528 struct data_reference *dr = NULL;
1529 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1530 enum dr_alignment_support supportable_dr_alignment;
1531
1532 /* Ensure that all data refs can be vectorized after the peel. */
1533 FOR_EACH_VEC_ELT (datarefs, i, dr)
1534 {
1535 int save_misalignment;
1536
1537 if (dr == dr0_info->dr)
1538 continue;
1539
1540 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1541 stmt_vec_info stmt_info = dr_info->stmt;
1542 /* For interleaving, only the alignment of the first access
1543 matters. */
1544 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1545 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1546 continue;
1547
1548 /* Strided accesses perform only component accesses, alignment is
1549 irrelevant for them. */
1550 if (STMT_VINFO_STRIDED_P (stmt_info)
1551 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1552 continue;
1553
1554 save_misalignment = DR_MISALIGNMENT (dr_info);
1555 vect_update_misalignment_for_peel (dr_info, dr0_info, npeel);
1556 supportable_dr_alignment
1557 = vect_supportable_dr_alignment (dr_info, false);
1558 SET_DR_MISALIGNMENT (dr_info, save_misalignment);
1559
1560 if (!supportable_dr_alignment)
1561 return false;
1562 }
1563
1564 return true;
1565 }
1566
1567 /* Function vect_enhance_data_refs_alignment
1568
1569 This pass will use loop versioning and loop peeling in order to enhance
1570 the alignment of data references in the loop.
1571
1572 FOR NOW: we assume that whatever versioning/peeling takes place, only the
1573 original loop is to be vectorized. Any other loops that are created by
1574 the transformations performed in this pass are not supposed to be
1575 vectorized. This restriction will be relaxed.
1576
1577 This pass will require a cost model to guide it whether to apply peeling
1578 or versioning or a combination of the two. For example, the scheme that
1579 Intel uses when given a loop with several memory accesses, is as follows:
1580 choose one memory access ('p') whose alignment you want to force by doing
1581 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
1582 other accesses are not necessarily aligned, or (2) use loop versioning to
1583 generate one loop in which all accesses are aligned, and another loop in
1584 which only 'p' is necessarily aligned.
1585
1586 ("Automatic Intra-Register Vectorization for the Intel Architecture",
1587 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Xinmin Tian, International
1588 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1589
1590 Devising a cost model is the most critical aspect of this work. It will
1591 guide us on which access to peel for, whether to use loop versioning, how
1592 many versions to create, etc. The cost model will probably consist of
1593 generic considerations as well as target specific considerations (on
1594 powerpc for example, misaligned stores are more painful than misaligned
1595 loads).
1596
1597 Here are the general steps involved in alignment enhancements:
1598
1599 -- original loop, before alignment analysis:
1600 for (i=0; i<N; i++){
1601 x = q[i]; # DR_MISALIGNMENT(q) = unknown
1602 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1603 }
1604
1605 -- After vect_compute_data_refs_alignment:
1606 for (i=0; i<N; i++){
1607 x = q[i]; # DR_MISALIGNMENT(q) = 3
1608 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1609 }
1610
1611 -- Possibility 1: we do loop versioning:
1612 if (p is aligned) {
1613 for (i=0; i<N; i++){ # loop 1A
1614 x = q[i]; # DR_MISALIGNMENT(q) = 3
1615 p[i] = y; # DR_MISALIGNMENT(p) = 0
1616 }
1617 }
1618 else {
1619 for (i=0; i<N; i++){ # loop 1B
1620 x = q[i]; # DR_MISALIGNMENT(q) = 3
1621 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1622 }
1623 }
1624
1625 -- Possibility 2: we do loop peeling:
1626 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1627 x = q[i];
1628 p[i] = y;
1629 }
1630 for (i = 3; i < N; i++){ # loop 2A
1631 x = q[i]; # DR_MISALIGNMENT(q) = 0
1632 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1633 }
1634
1635 -- Possibility 3: combination of loop peeling and versioning:
1636 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1637 x = q[i];
1638 p[i] = y;
1639 }
1640 if (p is aligned) {
1641 for (i = 3; i<N; i++){ # loop 3A
1642 x = q[i]; # DR_MISALIGNMENT(q) = 0
1643 p[i] = y; # DR_MISALIGNMENT(p) = 0
1644 }
1645 }
1646 else {
1647 for (i = 3; i<N; i++){ # loop 3B
1648 x = q[i]; # DR_MISALIGNMENT(q) = 0
1649 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1650 }
1651 }
1652
1653 These loops are later passed to loop_transform to be vectorized. The
1654 vectorizer will use the alignment information to guide the transformation
1655 (whether to generate regular loads/stores, or with special handling for
1656 misalignment). */
1657
1658 opt_result
1659 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1660 {
1661 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1662 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1663 enum dr_alignment_support supportable_dr_alignment;
1664 dr_vec_info *first_store = NULL;
1665 dr_vec_info *dr0_info = NULL;
1666 struct data_reference *dr;
1667 unsigned int i, j;
1668 bool do_peeling = false;
1669 bool do_versioning = false;
1670 unsigned int npeel = 0;
1671 bool one_misalignment_known = false;
1672 bool one_misalignment_unknown = false;
1673 bool one_dr_unsupportable = false;
1674 dr_vec_info *unsupportable_dr_info = NULL;
1675 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1676 unsigned possible_npeel_number = 1;
1677 tree vectype;
1678 unsigned int mis, same_align_drs_max = 0;
1679 hash_table<peel_info_hasher> peeling_htab (1);
1680
1681 DUMP_VECT_SCOPE ("vect_enhance_data_refs_alignment");
1682
1683 /* Reset data so we can safely be called multiple times. */
1684 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1685 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
1686
1687 /* While cost model enhancements are expected in the future, the high level
1688 view of the code at this time is as follows:
1689
1690 A) If there is a misaligned access then see if peeling to align
1691 this access can make all data references satisfy
1692 vect_supportable_dr_alignment. If so, update data structures
1693 as needed and return true.
1694
1695 B) If peeling wasn't possible and there is a data reference with an
1696 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1697 then see if loop versioning checks can be used to make all data
1698 references satisfy vect_supportable_dr_alignment. If so, update
1699 data structures as needed and return true.
1700
1701 C) If neither peeling nor versioning were successful then return false if
1702 any data reference does not satisfy vect_supportable_dr_alignment.
1703
1704 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1705
1706 Note, Possibility 3 above (which is peeling and versioning together) is not
1707 being done at this time. */
1708
1709 /* (1) Peeling to force alignment. */
1710
1711 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1712 Considerations:
1713 + How many accesses will become aligned due to the peeling
1714 - How many accesses will become unaligned due to the peeling,
1715 and the cost of misaligned accesses.
1716 - The cost of peeling (the extra runtime checks, the increase
1717 in code size). */
1718
1719 FOR_EACH_VEC_ELT (datarefs, i, dr)
1720 {
1721 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1722 stmt_vec_info stmt_info = dr_info->stmt;
1723
1724 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1725 continue;
1726
1727 /* For interleaving, only the alignment of the first access
1728 matters. */
1729 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1730 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1731 continue;
1732
1733 /* For scatter-gather or invariant accesses there is nothing
1734 to enhance. */
1735 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
1736 || integer_zerop (DR_STEP (dr)))
1737 continue;
1738
1739 /* Strided accesses perform only component accesses, alignment is
1740 irrelevant for them. */
1741 if (STMT_VINFO_STRIDED_P (stmt_info)
1742 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1743 continue;
1744
1745 supportable_dr_alignment = vect_supportable_dr_alignment (dr_info, true);
1746 do_peeling = vector_alignment_reachable_p (dr_info);
1747 if (do_peeling)
1748 {
1749 if (known_alignment_for_access_p (dr_info))
1750 {
1751 unsigned int npeel_tmp = 0;
1752 bool negative = tree_int_cst_compare (DR_STEP (dr),
1753 size_zero_node) < 0;
1754
1755 vectype = STMT_VINFO_VECTYPE (stmt_info);
1756 /* If known_alignment_for_access_p then we have set
1757 DR_MISALIGNMENT, which is only done if we know the misalignment
1758 at compile time, so it is safe to assume the target alignment is
1759 constant. */
1760 unsigned int target_align =
1761 DR_TARGET_ALIGNMENT (dr_info).to_constant ();
1762 unsigned int dr_size = vect_get_scalar_dr_size (dr_info);
1763 mis = (negative
1764 ? DR_MISALIGNMENT (dr_info)
1765 : -DR_MISALIGNMENT (dr_info));
1766 if (DR_MISALIGNMENT (dr_info) != 0)
1767 npeel_tmp = (mis & (target_align - 1)) / dr_size;
1768
1769 /* For multiple types, it is possible that the bigger type access
1770 will have more than one peeling option. E.g., a loop with two
1771 types: one of size (vector size / 4), and the other one of
1772 size (vector size / 8). The vectorization factor will be 8. If
1773 both accesses are misaligned by 3, the first one needs one scalar
1774 iteration to be aligned, and the second one needs 5. But the
1775 first one will also be aligned by peeling 5 scalar iterations,
1776 and in that case both accesses will be aligned.
1777 Hence, besides the immediate peeling amount, we also want to try
1778 adding a full vector's worth of elements, as long as we don't
1779 exceed the vectorization factor.
1780 We do this automatically for the cost model, since we calculate
1781 the cost for every peeling option. */
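/* Worked example with illustrative numbers (not tied to any particular
   target): with 4-byte elements, TARGET_ALIGN == 16 and
   DR_MISALIGNMENT == 12, MIS is -12 and
   npeel_tmp == ((-12) & 15) / 4 == 1, so one peeled scalar iteration
   aligns this access; the loop below then also records the alternative
   peeling amounts 1 + 4, 1 + 8, ... for the cost model to compare.  */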
1782 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1783 {
1784 poly_uint64 nscalars = (STMT_SLP_TYPE (stmt_info)
1785 ? vf * DR_GROUP_SIZE (stmt_info) : vf);
1786 possible_npeel_number
1787 = vect_get_num_vectors (nscalars, vectype);
1788
1789 /* NPEEL_TMP is 0 when there is no misalignment, but also
1790 allow peeling NELEMENTS. */
1791 if (DR_MISALIGNMENT (dr_info) == 0)
1792 possible_npeel_number++;
1793 }
1794
1795 /* Save info about DR in the hash table. Also include peeling
1796 amounts according to the explanation above. */
1797 for (j = 0; j < possible_npeel_number; j++)
1798 {
1799 vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
1800 dr_info, npeel_tmp);
1801 npeel_tmp += target_align / dr_size;
1802 }
1803
1804 one_misalignment_known = true;
1805 }
1806 else
1807 {
1808 /* If we don't know any misalignment values, we prefer
1809 peeling for the data-ref that has the maximum number of data-refs
1810 with the same alignment, unless the target prefers to align
1811 stores over loads. */
1812 unsigned same_align_drs
1813 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1814 if (!dr0_info
1815 || same_align_drs_max < same_align_drs)
1816 {
1817 same_align_drs_max = same_align_drs;
1818 dr0_info = dr_info;
1819 }
1820 /* For data-refs with the same number of related
1821 accesses prefer the one where the misalign
1822 computation will be invariant in the outermost loop. */
1823 else if (same_align_drs_max == same_align_drs)
1824 {
1825 class loop *ivloop0, *ivloop;
1826 ivloop0 = outermost_invariant_loop_for_expr
1827 (loop, DR_BASE_ADDRESS (dr0_info->dr));
1828 ivloop = outermost_invariant_loop_for_expr
1829 (loop, DR_BASE_ADDRESS (dr));
1830 if ((ivloop && !ivloop0)
1831 || (ivloop && ivloop0
1832 && flow_loop_nested_p (ivloop, ivloop0)))
1833 dr0_info = dr_info;
1834 }
1835
1836 one_misalignment_unknown = true;
1837
1838 /* Check for data refs with unsupportable alignment that
1839 can be peeled. */
1840 if (!supportable_dr_alignment)
1841 {
1842 one_dr_unsupportable = true;
1843 unsupportable_dr_info = dr_info;
1844 }
1845
1846 if (!first_store && DR_IS_WRITE (dr))
1847 first_store = dr_info;
1848 }
1849 }
1850 else
1851 {
1852 if (!aligned_access_p (dr_info))
1853 {
1854 if (dump_enabled_p ())
1855 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1856 "vector alignment may not be reachable\n");
1857 break;
1858 }
1859 }
1860 }
1861
1862 /* Check if we can possibly peel the loop. */
1863 if (!vect_can_advance_ivs_p (loop_vinfo)
1864 || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))
1865 || loop->inner)
1866 do_peeling = false;
1867
1868 struct _vect_peel_extended_info peel_for_known_alignment;
1869 struct _vect_peel_extended_info peel_for_unknown_alignment;
1870 struct _vect_peel_extended_info best_peel;
1871
1872 peel_for_unknown_alignment.inside_cost = INT_MAX;
1873 peel_for_unknown_alignment.outside_cost = INT_MAX;
1874 peel_for_unknown_alignment.peel_info.count = 0;
1875
1876 if (do_peeling
1877 && one_misalignment_unknown)
1878 {
1879 /* Check if the target requires us to prefer peeling for stores over
1880 loads, i.e., if misaligned stores are more expensive than misaligned
1881 loads (taking drs with the same alignment into account). */
1882 unsigned int load_inside_cost = 0;
1883 unsigned int load_outside_cost = 0;
1884 unsigned int store_inside_cost = 0;
1885 unsigned int store_outside_cost = 0;
1886 unsigned int estimated_npeels = vect_vf_for_cost (loop_vinfo) / 2;
1887
1888 stmt_vector_for_cost dummy;
1889 dummy.create (2);
1890 vect_get_peeling_costs_all_drs (loop_vinfo, dr0_info,
1891 &load_inside_cost,
1892 &load_outside_cost,
1893 &dummy, &dummy, estimated_npeels, true);
1894 dummy.release ();
1895
1896 if (first_store)
1897 {
1898 dummy.create (2);
1899 vect_get_peeling_costs_all_drs (loop_vinfo, first_store,
1900 &store_inside_cost,
1901 &store_outside_cost,
1902 &dummy, &dummy,
1903 estimated_npeels, true);
1904 dummy.release ();
1905 }
1906 else
1907 {
1908 store_inside_cost = INT_MAX;
1909 store_outside_cost = INT_MAX;
1910 }
1911
1912 if (load_inside_cost > store_inside_cost
1913 || (load_inside_cost == store_inside_cost
1914 && load_outside_cost > store_outside_cost))
1915 {
1916 dr0_info = first_store;
1917 peel_for_unknown_alignment.inside_cost = store_inside_cost;
1918 peel_for_unknown_alignment.outside_cost = store_outside_cost;
1919 }
1920 else
1921 {
1922 peel_for_unknown_alignment.inside_cost = load_inside_cost;
1923 peel_for_unknown_alignment.outside_cost = load_outside_cost;
1924 }
1925
1926 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1927 prologue_cost_vec.create (2);
1928 epilogue_cost_vec.create (2);
1929
1930 int dummy2;
1931 peel_for_unknown_alignment.outside_cost += vect_get_known_peeling_cost
1932 (loop_vinfo, estimated_npeels, &dummy2,
1933 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1934 &prologue_cost_vec, &epilogue_cost_vec);
1935
1936 prologue_cost_vec.release ();
1937 epilogue_cost_vec.release ();
1938
1939 peel_for_unknown_alignment.peel_info.count = 1
1940 + STMT_VINFO_SAME_ALIGN_REFS (dr0_info->stmt).length ();
1941 }
1942
1943 peel_for_unknown_alignment.peel_info.npeel = 0;
1944 peel_for_unknown_alignment.peel_info.dr_info = dr0_info;
1945
1946 best_peel = peel_for_unknown_alignment;
1947
1948 peel_for_known_alignment.inside_cost = INT_MAX;
1949 peel_for_known_alignment.outside_cost = INT_MAX;
1950 peel_for_known_alignment.peel_info.count = 0;
1951 peel_for_known_alignment.peel_info.dr_info = NULL;
1952
1953 if (do_peeling && one_misalignment_known)
1954 {
1955 /* Peeling is possible, but no data access strictly requires alignment
1956 in order to be supported. So we try to choose the best possible
1957 peeling from the hash table. */
1958 peel_for_known_alignment = vect_peeling_hash_choose_best_peeling
1959 (&peeling_htab, loop_vinfo);
1960 }
1961
1962 /* Compare costs of peeling for known and unknown alignment. */
1963 if (peel_for_known_alignment.peel_info.dr_info != NULL
1964 && peel_for_unknown_alignment.inside_cost
1965 >= peel_for_known_alignment.inside_cost)
1966 {
1967 best_peel = peel_for_known_alignment;
1968
1969 /* If the best peeling for known alignment has NPEEL == 0, perform no
1970 peeling at all unless there is an unsupportable dr that we can
1971 align. */
1972 if (best_peel.peel_info.npeel == 0 && !one_dr_unsupportable)
1973 do_peeling = false;
1974 }
1975
1976 /* If there is an unsupportable data ref, prefer this over all choices so far
1977 since we'd have to discard a chosen peeling unless it happened to
1978 align the unsupportable data ref as well. */
1979 if (one_dr_unsupportable)
1980 dr0_info = unsupportable_dr_info;
1981 else if (do_peeling)
1982 {
1983 /* Calculate the penalty for no peeling, i.e. leaving everything as-is.
1984 TODO: Use nopeel_outside_cost or get rid of it? */
1985 unsigned nopeel_inside_cost = 0;
1986 unsigned nopeel_outside_cost = 0;
1987
1988 stmt_vector_for_cost dummy;
1989 dummy.create (2);
1990 vect_get_peeling_costs_all_drs (loop_vinfo, NULL, &nopeel_inside_cost,
1991 &nopeel_outside_cost, &dummy, &dummy,
1992 0, false);
1993 dummy.release ();
1994
1995 /* Add epilogue costs. As we do not peel for alignment here, no prologue
1996 costs will be recorded. */
1997 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1998 prologue_cost_vec.create (2);
1999 epilogue_cost_vec.create (2);
2000
2001 int dummy2;
2002 nopeel_outside_cost += vect_get_known_peeling_cost
2003 (loop_vinfo, 0, &dummy2,
2004 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
2005 &prologue_cost_vec, &epilogue_cost_vec);
2006
2007 prologue_cost_vec.release ();
2008 epilogue_cost_vec.release ();
2009
2010 npeel = best_peel.peel_info.npeel;
2011 dr0_info = best_peel.peel_info.dr_info;
2012
2013 /* If leaving the loop unpeeled is no more expensive than the best
2014 peeling found so far, don't perform any peeling. */
2015 if (nopeel_inside_cost <= best_peel.inside_cost)
2016 do_peeling = false;
2017 }
2018
2019 if (do_peeling)
2020 {
2021 stmt_vec_info stmt_info = dr0_info->stmt;
2022 vectype = STMT_VINFO_VECTYPE (stmt_info);
2023
2024 if (known_alignment_for_access_p (dr0_info))
2025 {
2026 bool negative = tree_int_cst_compare (DR_STEP (dr0_info->dr),
2027 size_zero_node) < 0;
2028 if (!npeel)
2029 {
2030 /* Since it's known at compile time, compute the number of
2031 iterations in the peeled loop (the peeling factor) for use in
2032 updating DR_MISALIGNMENT values. The peeling factor is the
2033 vectorization factor minus the misalignment as an element
2034 count. */
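/* Illustrative numbers only: with 4-byte elements, a target alignment of
   16 and DR_MISALIGNMENT == 4, MIS is -4 and NPEEL becomes
   ((-4) & 15) / 4 == 3, i.e. three scalar iterations move the access to
   an aligned address.  */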
2035 mis = (negative
2036 ? DR_MISALIGNMENT (dr0_info)
2037 : -DR_MISALIGNMENT (dr0_info));
2038 /* If known_alignment_for_access_p then we have set
2039 DR_MISALIGNMENT, which is only done if we know the misalignment
2040 at compile time, so it is safe to assume the target alignment is
2041 constant. */
2042 unsigned int target_align =
2043 DR_TARGET_ALIGNMENT (dr0_info).to_constant ();
2044 npeel = ((mis & (target_align - 1))
2045 / vect_get_scalar_dr_size (dr0_info));
2046 }
2047
2048 /* For interleaved data access every iteration accesses all the
2049 members of the group, therefore we divide the number of iterations
2050 by the group size. */
2051 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2052 npeel /= DR_GROUP_SIZE (stmt_info);
2053
2054 if (dump_enabled_p ())
2055 dump_printf_loc (MSG_NOTE, vect_location,
2056 "Try peeling by %d\n", npeel);
2057 }
2058
2059 /* Ensure that all datarefs can be vectorized after the peel. */
2060 if (!vect_peeling_supportable (loop_vinfo, dr0_info, npeel))
2061 do_peeling = false;
2062
2063 /* Check if all datarefs are supportable and log. */
2064 if (do_peeling && known_alignment_for_access_p (dr0_info) && npeel == 0)
2065 {
2066 opt_result stat = vect_verify_datarefs_alignment (loop_vinfo);
2067 if (!stat)
2068 do_peeling = false;
2069 else
2070 return stat;
2071 }
2072
2073 /* Cost model #1 - honor --param vect-max-peeling-for-alignment. */
2074 if (do_peeling)
2075 {
2076 unsigned max_allowed_peel
2077 = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT);
2078 if (flag_vect_cost_model == VECT_COST_MODEL_CHEAP)
2079 max_allowed_peel = 0;
2080 if (max_allowed_peel != (unsigned)-1)
2081 {
2082 unsigned max_peel = npeel;
2083 if (max_peel == 0)
2084 {
2085 poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr0_info);
2086 unsigned HOST_WIDE_INT target_align_c;
2087 if (target_align.is_constant (&target_align_c))
2088 max_peel =
2089 target_align_c / vect_get_scalar_dr_size (dr0_info) - 1;
2090 else
2091 {
2092 do_peeling = false;
2093 if (dump_enabled_p ())
2094 dump_printf_loc (MSG_NOTE, vect_location,
2095 "Disable peeling, max peels set and vector"
2096 " alignment unknown\n");
2097 }
2098 }
2099 if (max_peel > max_allowed_peel)
2100 {
2101 do_peeling = false;
2102 if (dump_enabled_p ())
2103 dump_printf_loc (MSG_NOTE, vect_location,
2104 "Disable peeling, max peels reached: %d\n", max_peel);
2105 }
2106 }
2107 }
2108
2109 /* Cost model #2 - if peeling may result in a remaining loop not
2110 iterating enough to be vectorized then do not peel. Since this
2111 is a cost heuristic rather than a correctness decision, use the
2112 most likely runtime value for variable vectorization factors. */
2113 if (do_peeling
2114 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2115 {
2116 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2117 unsigned int max_peel = npeel == 0 ? assumed_vf - 1 : npeel;
2118 if ((unsigned HOST_WIDE_INT) LOOP_VINFO_INT_NITERS (loop_vinfo)
2119 < assumed_vf + max_peel)
2120 do_peeling = false;
2121 }
2122
2123 if (do_peeling)
2124 {
2125 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
2126 If the misalignment of DR_i is identical to that of dr0 then set
2127 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
2128 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
2129 by the peeling factor times the element size of DR_i (MOD the
2130 vectorization factor times the size). Otherwise, the
2131 misalignment of DR_i must be set to unknown. */
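/* For instance (illustrative values): peeling NPEEL == 1 iteration of
   4-byte accesses turns a DR with known misalignment 8 and target
   alignment 16 into one with misalignment (8 + 1 * 4) & 15 == 12, while
   DRs whose misalignment is unknown stay unknown.  */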
2132 FOR_EACH_VEC_ELT (datarefs, i, dr)
2133 if (dr != dr0_info->dr)
2134 {
2135 /* Strided accesses perform only component accesses, alignment
2136 is irrelevant for them. */
2137 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2138 stmt_info = dr_info->stmt;
2139 if (STMT_VINFO_STRIDED_P (stmt_info)
2140 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2141 continue;
2142
2143 vect_update_misalignment_for_peel (dr_info, dr0_info, npeel);
2144 }
2145
2146 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0_info;
2147 if (npeel)
2148 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
2149 else
2150 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2151 = DR_MISALIGNMENT (dr0_info);
2152 SET_DR_MISALIGNMENT (dr0_info, 0);
2153 if (dump_enabled_p ())
2154 {
2155 dump_printf_loc (MSG_NOTE, vect_location,
2156 "Alignment of access forced using peeling.\n");
2157 dump_printf_loc (MSG_NOTE, vect_location,
2158 "Peeling for alignment will be applied.\n");
2159 }
2160
2161 /* The inside-loop cost will be accounted for in vectorizable_load
2162 and vectorizable_store correctly with adjusted alignments.
2163 Drop the body_cst_vec on the floor here. */
2164 opt_result stat = vect_verify_datarefs_alignment (loop_vinfo);
2165 gcc_assert (stat);
2166 return stat;
2167 }
2168 }
2169
2170 /* (2) Versioning to force alignment. */
2171
2172 /* Try versioning if:
2173 1) the loop is optimized for speed and the cost model is not cheap, and
2174 2) there is at least one unsupported misaligned data ref with an unknown
2175 misalignment, and
2176 3) all misaligned data refs with a known misalignment are supported, and
2177 4) the number of runtime alignment checks is within reason. */
2178
2179 do_versioning
2180 = (optimize_loop_nest_for_speed_p (loop)
2181 && !loop->inner /* FORNOW */
2182 && flag_vect_cost_model != VECT_COST_MODEL_CHEAP);
2183
2184 if (do_versioning)
2185 {
2186 FOR_EACH_VEC_ELT (datarefs, i, dr)
2187 {
2188 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2189 stmt_vec_info stmt_info = dr_info->stmt;
2190
2191 /* For interleaving, only the alignment of the first access
2192 matters. */
2193 if (aligned_access_p (dr_info)
2194 || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
2195 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info))
2196 continue;
2197
2198 if (STMT_VINFO_STRIDED_P (stmt_info))
2199 {
2200 /* Strided loads perform only component accesses, alignment is
2201 irrelevant for them. */
2202 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
2203 continue;
2204 do_versioning = false;
2205 break;
2206 }
2207
2208 supportable_dr_alignment
2209 = vect_supportable_dr_alignment (dr_info, false);
2210
2211 if (!supportable_dr_alignment)
2212 {
2213 int mask;
2214 tree vectype;
2215
2216 if (known_alignment_for_access_p (dr_info)
2217 || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
2218 >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
2219 {
2220 do_versioning = false;
2221 break;
2222 }
2223
2224 vectype = STMT_VINFO_VECTYPE (stmt_info);
2225 gcc_assert (vectype);
2226
2227 /* At present we don't support versioning for alignment
2228 with variable VF, since there's no guarantee that the
2229 VF is a power of two. We could relax this if we added
2230 a way of enforcing a power-of-two size. */
2231 unsigned HOST_WIDE_INT size;
2232 if (!GET_MODE_SIZE (TYPE_MODE (vectype)).is_constant (&size))
2233 {
2234 do_versioning = false;
2235 break;
2236 }
2237
2238 /* Forcing alignment in the first iteration is no good if
2239 we don't keep it across iterations. For now, just disable
2240 versioning in this case.
2241 ?? We could actually unroll the loop to achieve the required
2242 overall step alignment, and forcing the alignment could be
2243 done by doing some iterations of the non-vectorized loop. */
2244 if (!multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
2245 * DR_STEP_ALIGNMENT (dr),
2246 DR_TARGET_ALIGNMENT (dr_info)))
2247 {
2248 do_versioning = false;
2249 break;
2250 }
2251
2252 /* The rightmost bits of an aligned address must be zeros.
2253 Construct the mask needed for this test. For example,
2254 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
2255 mask must be 15 = 0xf. */
2256 mask = size - 1;
2257
2258 /* FORNOW: use the same mask to test all potentially unaligned
2259 references in the loop. The vectorizer currently supports
2260 a single vector size, see the reference to
2261 GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
2262 vectorization factor is computed. */
2263 gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
2264 || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
2265 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
2266 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (stmt_info);
2267 }
2268 }
2269
2270 /* Versioning requires at least one misaligned data reference. */
2271 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2272 do_versioning = false;
2273 else if (!do_versioning)
2274 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
2275 }
2276
2277 if (do_versioning)
2278 {
2279 vec<stmt_vec_info> may_misalign_stmts
2280 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
2281 stmt_vec_info stmt_info;
2282
2283 /* It can now be assumed that the data references in the statements
2284 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
2285 of the loop being vectorized. */
2286 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt_info)
2287 {
2288 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
2289 SET_DR_MISALIGNMENT (dr_info, 0);
2290 if (dump_enabled_p ())
2291 dump_printf_loc (MSG_NOTE, vect_location,
2292 "Alignment of access forced using versioning.\n");
2293 }
2294
2295 if (dump_enabled_p ())
2296 dump_printf_loc (MSG_NOTE, vect_location,
2297 "Versioning for alignment will be applied.\n");
2298
2299 /* Peeling and versioning can't be done together at this time. */
2300 gcc_assert (! (do_peeling && do_versioning));
2301
2302 opt_result stat = vect_verify_datarefs_alignment (loop_vinfo);
2303 gcc_assert (stat);
2304 return stat;
2305 }
2306
2307 /* This point is reached if neither peeling nor versioning is being done. */
2308 gcc_assert (! (do_peeling || do_versioning));
2309
2310 opt_result stat = vect_verify_datarefs_alignment (loop_vinfo);
2311 return stat;
2312 }
2313
2314
2315 /* Function vect_find_same_alignment_drs.
2316
2317 Update group and alignment relations in VINFO according to the chosen
2318 vectorization factor. */
2319
2320 static void
2321 vect_find_same_alignment_drs (vec_info *vinfo, data_dependence_relation *ddr)
2322 {
2323 struct data_reference *dra = DDR_A (ddr);
2324 struct data_reference *drb = DDR_B (ddr);
2325 dr_vec_info *dr_info_a = vinfo->lookup_dr (dra);
2326 dr_vec_info *dr_info_b = vinfo->lookup_dr (drb);
2327 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
2328 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
2329
2330 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
2331 return;
2332
2333 if (dra == drb)
2334 return;
2335
2336 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
2337 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
2338 return;
2339
2340 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0)
2341 || !operand_equal_p (DR_OFFSET (dra), DR_OFFSET (drb), 0)
2342 || !operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2343 return;
2344
2345 /* Two references with distance zero have the same alignment. */
2346 poly_offset_int diff = (wi::to_poly_offset (DR_INIT (dra))
2347 - wi::to_poly_offset (DR_INIT (drb)));
2348 if (maybe_ne (diff, 0))
2349 {
2350 /* Get the wider of the two alignments. */
2351 poly_uint64 align_a =
2352 exact_div (vect_calculate_target_alignment (dr_info_a),
2353 BITS_PER_UNIT);
2354 poly_uint64 align_b =
2355 exact_div (vect_calculate_target_alignment (dr_info_b),
2356 BITS_PER_UNIT);
2357 unsigned HOST_WIDE_INT align_a_c, align_b_c;
2358 if (!align_a.is_constant (&align_a_c)
2359 || !align_b.is_constant (&align_b_c))
2360 return;
2361
2362 unsigned HOST_WIDE_INT max_align = MAX (align_a_c, align_b_c);
2363
2364 /* Require the gap to be a multiple of the larger vector alignment. */
2365 if (!multiple_p (diff, max_align))
2366 return;
2367 }
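/* E.g. (illustrative): if the gap between the two DR_INITs is 32 bytes and
   the computed target alignments are 16 and 32 bytes, MAX_ALIGN is 32 and
   the gap is a multiple of it, so the references are recorded below as
   having the same alignment.  */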
2368
2369 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
2370 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
2371 if (dump_enabled_p ())
2372 dump_printf_loc (MSG_NOTE, vect_location,
2373 "accesses have the same alignment: %T and %T\n",
2374 DR_REF (dra), DR_REF (drb));
2375 }
2376
2377
2378 /* Function vect_analyze_data_refs_alignment
2379
2380 Analyze the alignment of the data-references in the loop.
2381 Return a failing opt_result if a data reference is found that cannot be vectorized. */
2382
2383 opt_result
2384 vect_analyze_data_refs_alignment (loop_vec_info vinfo)
2385 {
2386 DUMP_VECT_SCOPE ("vect_analyze_data_refs_alignment");
2387
2388 /* Mark groups of data references with same alignment using
2389 data dependence information. */
2390 vec<ddr_p> ddrs = vinfo->shared->ddrs;
2391 struct data_dependence_relation *ddr;
2392 unsigned int i;
2393
2394 FOR_EACH_VEC_ELT (ddrs, i, ddr)
2395 vect_find_same_alignment_drs (vinfo, ddr);
2396
2397 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
2398 struct data_reference *dr;
2399
2400 vect_record_base_alignments (vinfo);
2401 FOR_EACH_VEC_ELT (datarefs, i, dr)
2402 {
2403 dr_vec_info *dr_info = vinfo->lookup_dr (dr);
2404 if (STMT_VINFO_VECTORIZABLE (dr_info->stmt))
2405 vect_compute_data_ref_alignment (dr_info);
2406 }
2407
2408 return opt_result::success ();
2409 }
2410
2411
2412 /* Analyze alignment of DRs of stmts in NODE. */
2413
2414 static bool
2415 vect_slp_analyze_and_verify_node_alignment (slp_tree node)
2416 {
2417 /* We vectorize from the first scalar stmt in the node unless
2418 the node is permuted, in which case we start from the first
2419 element in the group. */
2420 stmt_vec_info first_stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2421 dr_vec_info *first_dr_info = STMT_VINFO_DR_INFO (first_stmt_info);
2422 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2423 first_stmt_info = DR_GROUP_FIRST_ELEMENT (first_stmt_info);
2424
2425 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (first_stmt_info);
2426 vect_compute_data_ref_alignment (dr_info);
2427 /* For creating the data-ref pointer we need alignment of the
2428 first element anyway. */
2429 if (dr_info != first_dr_info)
2430 vect_compute_data_ref_alignment (first_dr_info);
2431 if (! verify_data_ref_alignment (dr_info))
2432 {
2433 if (dump_enabled_p ())
2434 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2435 "not vectorized: bad data alignment in basic "
2436 "block.\n");
2437 return false;
2438 }
2439
2440 return true;
2441 }
2442
2443 /* Function vect_slp_analyze_and_verify_instance_alignment
2444
2445 Analyze the alignment of the data-references in the SLP instance.
2446 Return FALSE if a data reference is found that cannot be vectorized. */
2447
2448 bool
2449 vect_slp_analyze_and_verify_instance_alignment (slp_instance instance)
2450 {
2451 DUMP_VECT_SCOPE ("vect_slp_analyze_and_verify_instance_alignment");
2452
2453 slp_tree node;
2454 unsigned i;
2455 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
2456 if (! vect_slp_analyze_and_verify_node_alignment (node))
2457 return false;
2458
2459 node = SLP_INSTANCE_TREE (instance);
2460 if (STMT_VINFO_DATA_REF (SLP_TREE_SCALAR_STMTS (node)[0])
2461 && ! vect_slp_analyze_and_verify_node_alignment
2462 (SLP_INSTANCE_TREE (instance)))
2463 return false;
2464
2465 return true;
2466 }
2467
2468
2469 /* Analyze groups of accesses: check that DR_INFO belongs to a group of
2470 accesses of legal size, step, etc. Detect gaps, single element
2471 interleaving, and other special cases. Set grouped access info.
2472 Collect groups of strided stores for further use in SLP analysis.
2473 Worker for vect_analyze_group_access. */
2474
2475 static bool
2476 vect_analyze_group_access_1 (dr_vec_info *dr_info)
2477 {
2478 data_reference *dr = dr_info->dr;
2479 tree step = DR_STEP (dr);
2480 tree scalar_type = TREE_TYPE (DR_REF (dr));
2481 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2482 stmt_vec_info stmt_info = dr_info->stmt;
2483 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2484 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2485 HOST_WIDE_INT dr_step = -1;
2486 HOST_WIDE_INT groupsize, last_accessed_element = 1;
2487 bool slp_impossible = false;
2488
2489 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2490 size of the interleaving group (including gaps). */
2491 if (tree_fits_shwi_p (step))
2492 {
2493 dr_step = tree_to_shwi (step);
2494 /* Check that STEP is a multiple of type size. Otherwise there is
2495 a non-element-sized gap at the end of the group which we
2496 cannot represent in DR_GROUP_GAP or DR_GROUP_SIZE.
2497 ??? As we can handle non-constant step fine here we should
2498 simply remove uses of DR_GROUP_GAP between the last and first
2499 element and instead rely on DR_STEP. DR_GROUP_SIZE then would
2500 simply not include that gap. */
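/* E.g. (illustrative): 4-byte elements accessed with DR_STEP == 32 give a
   groupsize of 8, no matter how many of those 8 slots the group actually
   touches; the untouched slots are gaps.  */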
2501 if ((dr_step % type_size) != 0)
2502 {
2503 if (dump_enabled_p ())
2504 dump_printf_loc (MSG_NOTE, vect_location,
2505 "Step %T is not a multiple of the element size"
2506 " for %T\n",
2507 step, DR_REF (dr));
2508 return false;
2509 }
2510 groupsize = absu_hwi (dr_step) / type_size;
2511 }
2512 else
2513 groupsize = 0;
2514
2515 /* A non-consecutive access is possible only if it is part of an interleaving group. */
2516 if (!DR_GROUP_FIRST_ELEMENT (stmt_info))
2517 {
2518 /* Check whether this DR is part of an interleaving group of which
2519 only this single element is accessed in the loop. */
2520
2521 /* Gaps are supported only for loads. STEP must be a multiple of the type
2522 size. */
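/* E.g. (illustrative): a load of a[4*i] with 4-byte elements has
   DR_STEP == 16, so it becomes a single-element group of size 4 with
   DR_GROUP_GAP == 3.  */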
2523 if (DR_IS_READ (dr)
2524 && (dr_step % type_size) == 0
2525 && groupsize > 0)
2526 {
2527 DR_GROUP_FIRST_ELEMENT (stmt_info) = stmt_info;
2528 DR_GROUP_SIZE (stmt_info) = groupsize;
2529 DR_GROUP_GAP (stmt_info) = groupsize - 1;
2530 if (dump_enabled_p ())
2531 dump_printf_loc (MSG_NOTE, vect_location,
2532 "Detected single element interleaving %T"
2533 " step %T\n",
2534 DR_REF (dr), step);
2535
2536 return true;
2537 }
2538
2539 if (dump_enabled_p ())
2540 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2541 "not consecutive access %G", stmt_info->stmt);
2542
2543 if (bb_vinfo)
2544 {
2545 /* Mark the statement as unvectorizable. */
2546 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
2547 return true;
2548 }
2549
2550 if (dump_enabled_p ())
2551 dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
2552 STMT_VINFO_STRIDED_P (stmt_info) = true;
2553 return true;
2554 }
2555
2556 if (DR_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info)
2557 {
2558 /* First stmt in the interleaving chain. Check the chain. */
2559 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2560 struct data_reference *data_ref = dr;
2561 unsigned int count = 1;
2562 tree prev_init = DR_INIT (data_ref);
2563 HOST_WIDE_INT diff, gaps = 0;
2564
2565 /* By construction, all group members have INTEGER_CST DR_INITs. */
2566 while (next)
2567 {
2568 /* We never have the same DR multiple times. */
2569 gcc_assert (tree_int_cst_compare (DR_INIT (data_ref),
2570 DR_INIT (STMT_VINFO_DATA_REF (next))) != 0);
2571
2572 data_ref = STMT_VINFO_DATA_REF (next);
2573
2574 /* All group members have the same STEP by construction. */
2575 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2576
2577 /* Check that the distance between two accesses is equal to the type
2578 size. Otherwise, we have gaps. */
2579 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2580 - TREE_INT_CST_LOW (prev_init)) / type_size;
2581 if (diff != 1)
2582 {
2583 /* FORNOW: SLP of accesses with gaps is not supported. */
2584 slp_impossible = true;
2585 if (DR_IS_WRITE (data_ref))
2586 {
2587 if (dump_enabled_p ())
2588 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2589 "interleaved store with gaps\n");
2590 return false;
2591 }
2592
2593 gaps += diff - 1;
2594 }
2595
2596 last_accessed_element += diff;
2597
2598 /* Store the gap from the previous member of the group. If there is no
2599 gap in the access, DR_GROUP_GAP is always 1. */
2600 DR_GROUP_GAP (next) = diff;
2601
2602 prev_init = DR_INIT (data_ref);
2603 next = DR_GROUP_NEXT_ELEMENT (next);
2604 /* Count the number of data-refs in the chain. */
2605 count++;
2606 }
2607
2608 if (groupsize == 0)
2609 groupsize = count + gaps;
2610
2611 /* This could be UINT_MAX but as we are generating code in a very
2612 inefficient way we have to cap earlier. See PR78699 for example. */
2613 if (groupsize > 4096)
2614 {
2615 if (dump_enabled_p ())
2616 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2617 "group is too large\n");
2618 return false;
2619 }
2620
2621 /* Check that the size of the interleaving is equal to count for stores,
2622 i.e., that there are no gaps. */
2623 if (groupsize != count
2624 && !DR_IS_READ (dr))
2625 {
2626 groupsize = count;
2627 STMT_VINFO_STRIDED_P (stmt_info) = true;
2628 }
2629
2630 /* If there is a gap after the last load in the group it is the
2631 difference between the groupsize and the last accessed
2632 element.
2633 When there is no gap, this difference should be 0. */
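/* E.g. (illustrative): for loads of a[3*i] and a[3*i+1] the groupsize is 3
   and the last accessed element is 2, so the gap recorded here is 1.  */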
2634 DR_GROUP_GAP (stmt_info) = groupsize - last_accessed_element;
2635
2636 DR_GROUP_SIZE (stmt_info) = groupsize;
2637 if (dump_enabled_p ())
2638 {
2639 dump_printf_loc (MSG_NOTE, vect_location,
2640 "Detected interleaving ");
2641 if (DR_IS_READ (dr))
2642 dump_printf (MSG_NOTE, "load ");
2643 else if (STMT_VINFO_STRIDED_P (stmt_info))
2644 dump_printf (MSG_NOTE, "strided store ");
2645 else
2646 dump_printf (MSG_NOTE, "store ");
2647 dump_printf (MSG_NOTE, "of size %u\n",
2648 (unsigned)groupsize);
2649 dump_printf_loc (MSG_NOTE, vect_location, "\t%G", stmt_info->stmt);
2650 next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2651 while (next)
2652 {
2653 if (DR_GROUP_GAP (next) != 1)
2654 dump_printf_loc (MSG_NOTE, vect_location,
2655 "\t<gap of %d elements>\n",
2656 DR_GROUP_GAP (next) - 1);
2657 dump_printf_loc (MSG_NOTE, vect_location, "\t%G", next->stmt);
2658 next = DR_GROUP_NEXT_ELEMENT (next);
2659 }
2660 if (DR_GROUP_GAP (stmt_info) != 0)
2661 dump_printf_loc (MSG_NOTE, vect_location,
2662 "\t<gap of %d elements>\n",
2663 DR_GROUP_GAP (stmt_info));
2664 }
2665
2666 /* SLP: create an SLP data structure for every interleaving group of
2667 stores for further analysis in vect_analyze_slp. */
2668 if (DR_IS_WRITE (dr) && !slp_impossible)
2669 {
2670 if (loop_vinfo)
2671 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt_info);
2672 if (bb_vinfo)
2673 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt_info);
2674 }
2675 }
2676
2677 return true;
2678 }
2679
2680 /* Analyze groups of accesses: check that DR_INFO belongs to a group of
2681 accesses of legal size, step, etc. Detect gaps, single element
2682 interleaving, and other special cases. Set grouped access info.
2683 Collect groups of strided stores for further use in SLP analysis. */
2684
2685 static bool
2686 vect_analyze_group_access (dr_vec_info *dr_info)
2687 {
2688 if (!vect_analyze_group_access_1 (dr_info))
2689 {
2690 /* Dissolve the group if present. */
2691 stmt_vec_info stmt_info = DR_GROUP_FIRST_ELEMENT (dr_info->stmt);
2692 while (stmt_info)
2693 {
2694 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2695 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2696 DR_GROUP_NEXT_ELEMENT (stmt_info) = NULL;
2697 stmt_info = next;
2698 }
2699 return false;
2700 }
2701 return true;
2702 }
2703
2704 /* Analyze the access pattern of the data-reference DR_INFO.
2705 In case of non-consecutive accesses call vect_analyze_group_access() to
2706 analyze groups of accesses. */
2707
2708 static bool
2709 vect_analyze_data_ref_access (dr_vec_info *dr_info)
2710 {
2711 data_reference *dr = dr_info->dr;
2712 tree step = DR_STEP (dr);
2713 tree scalar_type = TREE_TYPE (DR_REF (dr));
2714 stmt_vec_info stmt_info = dr_info->stmt;
2715 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2716 class loop *loop = NULL;
2717
2718 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
2719 return true;
2720
2721 if (loop_vinfo)
2722 loop = LOOP_VINFO_LOOP (loop_vinfo);
2723
2724 if (loop_vinfo && !step)
2725 {
2726 if (dump_enabled_p ())
2727 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2728 "bad data-ref access in loop\n");
2729 return false;
2730 }
2731
2732 /* Allow loads with zero step in inner-loop vectorization. */
2733 if (loop_vinfo && integer_zerop (step))
2734 {
2735 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2736 if (!nested_in_vect_loop_p (loop, stmt_info))
2737 return DR_IS_READ (dr);
2738 /* Allow references with zero step for outer loops marked
2739 with pragma omp simd only - it guarantees absence of
2740 loop-carried dependencies between inner loop iterations. */
2741 if (loop->safelen < 2)
2742 {
2743 if (dump_enabled_p ())
2744 dump_printf_loc (MSG_NOTE, vect_location,
2745 "zero step in inner loop of nest\n");
2746 return false;
2747 }
2748 }
2749
2750 if (loop && nested_in_vect_loop_p (loop, stmt_info))
2751 {
2752 /* Interleaved accesses are not yet supported within outer-loop
2753 vectorization for references in the inner-loop. */
2754 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2755
2756 /* For the rest of the analysis we use the outer-loop step. */
2757 step = STMT_VINFO_DR_STEP (stmt_info);
2758 if (integer_zerop (step))
2759 {
2760 if (dump_enabled_p ())
2761 dump_printf_loc (MSG_NOTE, vect_location,
2762 "zero step in outer loop.\n");
2763 return DR_IS_READ (dr);
2764 }
2765 }
2766
2767 /* Consecutive? */
2768 if (TREE_CODE (step) == INTEGER_CST)
2769 {
2770 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2771 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2772 || (dr_step < 0
2773 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2774 {
2775 /* Mark that it is not interleaving. */
2776 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2777 return true;
2778 }
2779 }
2780
2781 if (loop && nested_in_vect_loop_p (loop, stmt_info))
2782 {
2783 if (dump_enabled_p ())
2784 dump_printf_loc (MSG_NOTE, vect_location,
2785 "grouped access in outer loop.\n");
2786 return false;
2787 }
2788
2789
2790 /* Assume this is a DR handled by non-constant strided load case. */
2791 if (TREE_CODE (step) != INTEGER_CST)
2792 return (STMT_VINFO_STRIDED_P (stmt_info)
2793 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2794 || vect_analyze_group_access (dr_info)));
2795
2796 /* Not consecutive access - check if it's a part of interleaving group. */
2797 return vect_analyze_group_access (dr_info);
2798 }
2799
2800 /* Compare two data-references DRA and DRB, used to sort them into
2801 chunks suitable for grouping. */
2802
2803 static int
2804 dr_group_sort_cmp (const void *dra_, const void *drb_)
2805 {
2806 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2807 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2808 int cmp;
2809
2810 /* Stabilize sort. */
2811 if (dra == drb)
2812 return 0;
2813
2814 /* DRs in different loops never belong to the same group. */
2815 loop_p loopa = gimple_bb (DR_STMT (dra))->loop_father;
2816 loop_p loopb = gimple_bb (DR_STMT (drb))->loop_father;
2817 if (loopa != loopb)
2818 return loopa->num < loopb->num ? -1 : 1;
2819
2820 /* Ordering of DRs according to base. */
2821 cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2822 DR_BASE_ADDRESS (drb));
2823 if (cmp != 0)
2824 return cmp;
2825
2826 /* And according to DR_OFFSET. */
2827 cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2828 if (cmp != 0)
2829 return cmp;
2830
2831 /* Put reads before writes. */
2832 if (DR_IS_READ (dra) != DR_IS_READ (drb))
2833 return DR_IS_READ (dra) ? -1 : 1;
2834
2835 /* Then sort by access size. */
2836 cmp = data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2837 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2838 if (cmp != 0)
2839 return cmp;
2840
2841 /* And by step. */
2842 cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
2843 if (cmp != 0)
2844 return cmp;
2845
2846 /* Then sort by DR_INIT. In case of identical DRs, sort by stmt UID. */
2847 cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
2848 if (cmp == 0)
2849 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2850 return cmp;
2851 }
2852
2853 /* If OP is the result of a conversion, return the unconverted value,
2854 otherwise return null. */
2855
2856 static tree
2857 strip_conversion (tree op)
2858 {
2859 if (TREE_CODE (op) != SSA_NAME)
2860 return NULL_TREE;
2861 gimple *stmt = SSA_NAME_DEF_STMT (op);
2862 if (!is_gimple_assign (stmt)
2863 || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (stmt)))
2864 return NULL_TREE;
2865 return gimple_assign_rhs1 (stmt);
2866 }
2867
2868 /* Return true if vectorizable_* routines can handle statements STMT1_INFO
2869 and STMT2_INFO being in a single group. When ALLOW_SLP_P, masked loads can
2870 be grouped in SLP mode. */
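/* For example (illustrative GIMPLE, not taken from a real dump): two
   .MASK_STORE calls whose mask operands are _m and (othermasktype) _m can
   be grouped, because strip_conversion exposes the same underlying mask;
   genuinely different masks make the stores ungroupable.  */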
2871
2872 static bool
2873 can_group_stmts_p (stmt_vec_info stmt1_info, stmt_vec_info stmt2_info,
2874 bool allow_slp_p)
2875 {
2876 if (gimple_assign_single_p (stmt1_info->stmt))
2877 return gimple_assign_single_p (stmt2_info->stmt);
2878
2879 gcall *call1 = dyn_cast <gcall *> (stmt1_info->stmt);
2880 if (call1 && gimple_call_internal_p (call1))
2881 {
2882 /* Check for two masked loads or two masked stores. */
2883 gcall *call2 = dyn_cast <gcall *> (stmt2_info->stmt);
2884 if (!call2 || !gimple_call_internal_p (call2))
2885 return false;
2886 internal_fn ifn = gimple_call_internal_fn (call1);
2887 if (ifn != IFN_MASK_LOAD && ifn != IFN_MASK_STORE)
2888 return false;
2889 if (ifn != gimple_call_internal_fn (call2))
2890 return false;
2891
2892 /* Check that the masks are the same. Cope with casts of masks,
2893 like those created by build_mask_conversion. */
2894 tree mask1 = gimple_call_arg (call1, 2);
2895 tree mask2 = gimple_call_arg (call2, 2);
2896 if (!operand_equal_p (mask1, mask2, 0)
2897 && (ifn == IFN_MASK_STORE || !allow_slp_p))
2898 {
2899 mask1 = strip_conversion (mask1);
2900 if (!mask1)
2901 return false;
2902 mask2 = strip_conversion (mask2);
2903 if (!mask2)
2904 return false;
2905 if (!operand_equal_p (mask1, mask2, 0))
2906 return false;
2907 }
2908 return true;
2909 }
2910
2911 return false;
2912 }
2913
2914 /* Function vect_analyze_data_ref_accesses.
2915
2916 Analyze the access pattern of all the data references in the loop.
2917
2918 FORNOW: the only access pattern that is considered vectorizable is a
2919 simple step 1 (consecutive) access.
2920
2921 FORNOW: handle only arrays and pointer accesses. */
2922
2923 opt_result
2924 vect_analyze_data_ref_accesses (vec_info *vinfo)
2925 {
2926 unsigned int i;
2927 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
2928 struct data_reference *dr;
2929
2930 DUMP_VECT_SCOPE ("vect_analyze_data_ref_accesses");
2931
2932 if (datarefs.is_empty ())
2933 return opt_result::success ();
2934
2935 /* Sort the array of datarefs to make building the interleaving chains
2936 linear. Don't modify the original vector's order; it is needed for
2937 determining which dependencies are reversed. */
2938 vec<data_reference_p> datarefs_copy = datarefs.copy ();
2939 datarefs_copy.qsort (dr_group_sort_cmp);
2940 hash_set<stmt_vec_info> to_fixup;
2941
2942 /* Build the interleaving chains. */
2943 for (i = 0; i < datarefs_copy.length () - 1;)
2944 {
2945 data_reference_p dra = datarefs_copy[i];
2946 dr_vec_info *dr_info_a = vinfo->lookup_dr (dra);
2947 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
2948 stmt_vec_info lastinfo = NULL;
2949 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
2950 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a))
2951 {
2952 ++i;
2953 continue;
2954 }
2955 for (i = i + 1; i < datarefs_copy.length (); ++i)
2956 {
2957 data_reference_p drb = datarefs_copy[i];
2958 dr_vec_info *dr_info_b = vinfo->lookup_dr (drb);
2959 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
2960 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_b)
2961 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
2962 break;
2963
2964 /* ??? Imperfect sorting (non-compatible types, non-modulo
2965 accesses, same accesses) can lead to a group being artificially
2966 split here, as we don't just skip over those. If it really
2967 matters we can push those to a worklist and re-iterate
2968 over them. Then we can just skip ahead to the next DR here. */
2969
2970 /* DRs in a different loop should not be put into the same
2971 interleaving group. */
2972 if (gimple_bb (DR_STMT (dra))->loop_father
2973 != gimple_bb (DR_STMT (drb))->loop_father)
2974 break;
2975
2976 /* Check that the data-refs have same first location (except init)
2977 and they are both either store or load (not load and store,
2978 not masked loads or stores). */
2979 if (DR_IS_READ (dra) != DR_IS_READ (drb)
2980 || data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2981 DR_BASE_ADDRESS (drb)) != 0
2982 || data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb)) != 0
2983 || !can_group_stmts_p (stmtinfo_a, stmtinfo_b, true))
2984 break;
2985
2986 /* Check that the data-refs have the same constant size. */
2987 tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
2988 tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
2989 if (!tree_fits_uhwi_p (sza)
2990 || !tree_fits_uhwi_p (szb)
2991 || !tree_int_cst_equal (sza, szb))
2992 break;
2993
2994 /* Check that the data-refs have the same step. */
2995 if (data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb)) != 0)
2996 break;
2997
2998 /* Check the types are compatible.
2999 ??? We don't distinguish this during sorting. */
3000 if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
3001 TREE_TYPE (DR_REF (drb))))
3002 break;
3003
3004 /* Check that the DR_INITs are compile-time constants. */
3005 if (TREE_CODE (DR_INIT (dra)) != INTEGER_CST
3006 || TREE_CODE (DR_INIT (drb)) != INTEGER_CST)
3007 break;
3008
3009 /* Different .GOMP_SIMD_LANE calls still give the same lane,
3010 just hold extra information. */
3011 if (STMT_VINFO_SIMD_LANE_ACCESS_P (stmtinfo_a)
3012 && STMT_VINFO_SIMD_LANE_ACCESS_P (stmtinfo_b)
3013 && data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb)) == 0)
3014 break;
3015
3016 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
3017 HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
3018 HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
3019 HOST_WIDE_INT init_prev
3020 = TREE_INT_CST_LOW (DR_INIT (datarefs_copy[i-1]));
3021 gcc_assert (init_a <= init_b
3022 && init_a <= init_prev
3023 && init_prev <= init_b);
3024
3025 /* Do not place the same access in the interleaving chain twice. */
3026 if (init_b == init_prev)
3027 {
3028 gcc_assert (gimple_uid (DR_STMT (datarefs_copy[i-1]))
3029 < gimple_uid (DR_STMT (drb)));
3030 /* Simply link in duplicates and fix up the chain below. */
3031 }
3032 else
3033 {
3034 /* If init_b == init_a + the size of the type * k, we have an
3035 interleaving, and DRA is accessed before DRB. */
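/* E.g. (illustrative): 4-byte accesses with DR_INITs 0 and 8 can
   interleave (k == 2), whereas inits 0 and 6 cannot, since 6 is not a
   multiple of the element size.  */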
3036 HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
3037 if (type_size_a == 0
3038 || (init_b - init_a) % type_size_a != 0)
3039 break;
3040
3041 /* If we have a store, the accesses are adjacent. This splits
3042 groups into chunks we support (we don't support vectorization
3043 of stores with gaps). */
3044 if (!DR_IS_READ (dra) && init_b - init_prev != type_size_a)
3045 break;
3046
3047 /* If the step (when constant and nonzero) is not greater than the
3048 difference between the data-refs' inits, do not group them;
3049 breaking here splits groups into suitable sizes. */
3050 if (tree_fits_shwi_p (DR_STEP (dra)))
3051 {
3052 HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra));
3053 if (step != 0 && step <= (init_b - init_a))
3054 break;
3055 }
3056 }
3057
3058 if (dump_enabled_p ())
3059 dump_printf_loc (MSG_NOTE, vect_location,
3060 DR_IS_READ (dra)
3061 ? "Detected interleaving load %T and %T\n"
3062 : "Detected interleaving store %T and %T\n",
3063 DR_REF (dra), DR_REF (drb));
3064
3065 /* Link the found element into the group list. */
3066 if (!DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
3067 {
3068 DR_GROUP_FIRST_ELEMENT (stmtinfo_a) = stmtinfo_a;
3069 lastinfo = stmtinfo_a;
3070 }
3071 DR_GROUP_FIRST_ELEMENT (stmtinfo_b) = stmtinfo_a;
3072 DR_GROUP_NEXT_ELEMENT (lastinfo) = stmtinfo_b;
3073 lastinfo = stmtinfo_b;
3074
3075 STMT_VINFO_SLP_VECT_ONLY (stmtinfo_a)
3076 = !can_group_stmts_p (stmtinfo_a, stmtinfo_b, false);
3077
3078 if (dump_enabled_p () && STMT_VINFO_SLP_VECT_ONLY (stmtinfo_a))
3079 dump_printf_loc (MSG_NOTE, vect_location,
3080 "Load suitable for SLP vectorization only.\n");
3081
3082 if (init_b == init_prev
3083 && !to_fixup.add (DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
3084 && dump_enabled_p ())
3085 dump_printf_loc (MSG_NOTE, vect_location,
3086 "Queuing group with duplicate access for fixup\n");
3087 }
3088 }
3089
3090 /* Fix up groups with duplicate entries by splitting them. */
3091 while (1)
3092 {
3093 hash_set<stmt_vec_info>::iterator it = to_fixup.begin ();
3094 if (!(it != to_fixup.end ()))
3095 break;
3096 stmt_vec_info grp = *it;
3097 to_fixup.remove (grp);
3098
3099 /* Find the earliest duplicate group member. */
3100 unsigned first_duplicate = -1u;
3101 stmt_vec_info next, g = grp;
3102 while ((next = DR_GROUP_NEXT_ELEMENT (g)))
3103 {
3104 if (tree_int_cst_equal (DR_INIT (STMT_VINFO_DR_INFO (next)->dr),
3105 DR_INIT (STMT_VINFO_DR_INFO (g)->dr))
3106 && gimple_uid (STMT_VINFO_STMT (next)) < first_duplicate)
3107 first_duplicate = gimple_uid (STMT_VINFO_STMT (next));
3108 g = next;
3109 }
3110 if (first_duplicate == -1U)
3111 continue;
3112
3113 /* Then move all stmts after the first duplicate to a new group.
3114 Note this is a heuristic but one with the property that *it
3115 is fixed up completely. */
3116 g = grp;
3117 stmt_vec_info newgroup = NULL, ng = grp;
3118 while ((next = DR_GROUP_NEXT_ELEMENT (g)))
3119 {
3120 if (gimple_uid (STMT_VINFO_STMT (next)) >= first_duplicate)
3121 {
3122 DR_GROUP_NEXT_ELEMENT (g) = DR_GROUP_NEXT_ELEMENT (next);
3123 if (!newgroup)
3124 newgroup = next;
3125 else
3126 DR_GROUP_NEXT_ELEMENT (ng) = next;
3127 ng = next;
3128 DR_GROUP_FIRST_ELEMENT (ng) = newgroup;
3129 }
3130 else
3131 g = DR_GROUP_NEXT_ELEMENT (g);
3132 }
3133 DR_GROUP_NEXT_ELEMENT (ng) = NULL;
3134
3135 /* Fixup the new group which still may contain duplicates. */
3136 to_fixup.add (newgroup);
3137 }
3138
3139 FOR_EACH_VEC_ELT (datarefs_copy, i, dr)
3140 {
3141 dr_vec_info *dr_info = vinfo->lookup_dr (dr);
3142 if (STMT_VINFO_VECTORIZABLE (dr_info->stmt)
3143 && !vect_analyze_data_ref_access (dr_info))
3144 {
3145 if (dump_enabled_p ())
3146 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3147 "not vectorized: complicated access pattern.\n");
3148
3149 if (is_a <bb_vec_info> (vinfo))
3150 {
3151 /* Mark the statement as not vectorizable. */
3152 STMT_VINFO_VECTORIZABLE (dr_info->stmt) = false;
3153 continue;
3154 }
3155 else
3156 {
3157 datarefs_copy.release ();
3158 return opt_result::failure_at (dr_info->stmt->stmt,
3159 "not vectorized:"
3160 " complicated access pattern.\n");
3161 }
3162 }
3163 }
3164
3165 datarefs_copy.release ();
3166 return opt_result::success ();
3167 }
3168
3169 /* Function vect_vfa_segment_size.
3170
3171 Input:
3172 DR_INFO: The data reference.
3173 LENGTH_FACTOR: segment length to consider.
3174
3175 Return a value suitable for the dr_with_seg_len::seg_len field.
3176 This is the "distance travelled" by the pointer from the first
3177 iteration in the segment to the last. Note that it does not include
3178 the size of the access; in effect it only describes the first byte. */
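/* E.g. (illustrative): DR_STEP == 4 and LENGTH_FACTOR == 16 give a segment
   length of 4 * (16 - 1) == 60 bytes.  */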
3179
3180 static tree
3181 vect_vfa_segment_size (dr_vec_info *dr_info, tree length_factor)
3182 {
3183 length_factor = size_binop (MINUS_EXPR,
3184 fold_convert (sizetype, length_factor),
3185 size_one_node);
3186 return size_binop (MULT_EXPR, fold_convert (sizetype, DR_STEP (dr_info->dr)),
3187 length_factor);
3188 }
3189
3190 /* Return a value that, when added to abs (vect_vfa_segment_size (DR_INFO)),
3191 gives the worst-case number of bytes covered by the segment. */
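/* E.g. (illustrative): a group of 4-byte loads with DR_GROUP_SIZE == 4 and
   DR_GROUP_GAP == 1 gives an access size of 4 * (4 - 1) == 12 bytes; with
   dr_explicit_realign_optimized the size grows further to cover a whole
   vector.  */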
3192
3193 static unsigned HOST_WIDE_INT
3194 vect_vfa_access_size (dr_vec_info *dr_info)
3195 {
3196 stmt_vec_info stmt_vinfo = dr_info->stmt;
3197 tree ref_type = TREE_TYPE (DR_REF (dr_info->dr));
3198 unsigned HOST_WIDE_INT ref_size = tree_to_uhwi (TYPE_SIZE_UNIT (ref_type));
3199 unsigned HOST_WIDE_INT access_size = ref_size;
3200 if (DR_GROUP_FIRST_ELEMENT (stmt_vinfo))
3201 {
3202 gcc_assert (DR_GROUP_FIRST_ELEMENT (stmt_vinfo) == stmt_vinfo);
3203 access_size *= DR_GROUP_SIZE (stmt_vinfo) - DR_GROUP_GAP (stmt_vinfo);
3204 }
3205 if (STMT_VINFO_VEC_STMT (stmt_vinfo)
3206 && (vect_supportable_dr_alignment (dr_info, false)
3207 == dr_explicit_realign_optimized))
3208 {
3209 /* We might access a full vector's worth. */
3210 tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
3211 access_size += tree_to_uhwi (TYPE_SIZE_UNIT (vectype)) - ref_size;
3212 }
3213 return access_size;
3214 }
3215
3216 /* Get the minimum alignment for all the scalar accesses that DR_INFO
3217 describes. */
3218
3219 static unsigned int
3220 vect_vfa_align (dr_vec_info *dr_info)
3221 {
3222 return TYPE_ALIGN_UNIT (TREE_TYPE (DR_REF (dr_info->dr)));
3223 }
3224
3225 /* Function vect_no_alias_p.
3226
3227 Given data references A and B with equal base and offset, see whether
3228 the alias relation can be decided at compilation time. Return 1 if
3229 it can and the references alias, 0 if it can and the references do
3230 not alias, and -1 if we cannot decide at compile time. SEGMENT_LENGTH_A,
3231 SEGMENT_LENGTH_B, ACCESS_SIZE_A and ACCESS_SIZE_B are the equivalent
3232 of dr_with_seg_len::{seg_len,access_size} for A and B. */
3233
3234 static int
3235 vect_compile_time_alias (dr_vec_info *a, dr_vec_info *b,
3236 tree segment_length_a, tree segment_length_b,
3237 unsigned HOST_WIDE_INT access_size_a,
3238 unsigned HOST_WIDE_INT access_size_b)
3239 {
3240 poly_offset_int offset_a = wi::to_poly_offset (DR_INIT (a->dr));
3241 poly_offset_int offset_b = wi::to_poly_offset (DR_INIT (b->dr));
3242 poly_uint64 const_length_a;
3243 poly_uint64 const_length_b;
3244
3245 /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
3246 bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
3247 [a, a+12) */
3248 if (tree_int_cst_compare (DR_STEP (a->dr), size_zero_node) < 0)
3249 {
3250 const_length_a = (-wi::to_poly_wide (segment_length_a)).force_uhwi ();
3251 offset_a = (offset_a + access_size_a) - const_length_a;
3252 }
3253 else
3254 const_length_a = tree_to_poly_uint64 (segment_length_a);
3255 if (tree_int_cst_compare (DR_STEP (b->dr), size_zero_node) < 0)
3256 {
3257 const_length_b = (-wi::to_poly_wide (segment_length_b)).force_uhwi ();
3258 offset_b = (offset_b + access_size_b) - const_length_b;
3259 }
3260 else
3261 const_length_b = tree_to_poly_uint64 (segment_length_b);
3262
3263 const_length_a += access_size_a;
3264 const_length_b += access_size_b;
3265
3266 if (ranges_known_overlap_p (offset_a, const_length_a,
3267 offset_b, const_length_b))
3268 return 1;
3269
3270 if (!ranges_maybe_overlap_p (offset_a, const_length_a,
3271 offset_b, const_length_b))
3272 return 0;
3273
3274 return -1;
3275 }
3276
3277 /* Return true if the minimum nonzero dependence distance for loop LOOP_DEPTH
3278 in DDR is >= VF. */
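/* E.g. (illustrative, assuming the dependence is not reversed): with
   VF == 4, a distance vector entry of 8 at LOOP_DEPTH satisfies the check,
   while an entry of 2 does not.  */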
3279
3280 static bool
3281 dependence_distance_ge_vf (data_dependence_relation *ddr,
3282 unsigned int loop_depth, poly_uint64 vf)
3283 {
3284 if (DDR_ARE_DEPENDENT (ddr) != NULL_TREE
3285 || DDR_NUM_DIST_VECTS (ddr) == 0)
3286 return false;
3287
3288 /* If the dependence is exact, we should have limited the VF instead. */
3289 gcc_checking_assert (DDR_COULD_BE_INDEPENDENT_P (ddr));
3290
3291 unsigned int i;
3292 lambda_vector dist_v;
3293 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
3294 {
3295 HOST_WIDE_INT dist = dist_v[loop_depth];
3296 if (dist != 0
3297 && !(dist > 0 && DDR_REVERSED_P (ddr))
3298 && maybe_lt ((unsigned HOST_WIDE_INT) abs_hwi (dist), vf))
3299 return false;
3300 }
3301
3302 if (dump_enabled_p ())
3303 dump_printf_loc (MSG_NOTE, vect_location,
3304 "dependence distance between %T and %T is >= VF\n",
3305 DR_REF (DDR_A (ddr)), DR_REF (DDR_B (ddr)));
3306
3307 return true;
3308 }
3309
3310 /* Dump LOWER_BOUND using flags DUMP_KIND. Dumps are known to be enabled. */
3311
3312 static void
3313 dump_lower_bound (dump_flags_t dump_kind, const vec_lower_bound &lower_bound)
3314 {
3315 dump_printf (dump_kind, "%s (%T) >= ",
3316 lower_bound.unsigned_p ? "unsigned" : "abs",
3317 lower_bound.expr);
3318 dump_dec (dump_kind, lower_bound.min_value);
3319 }
3320
3321 /* Record that the vectorized loop requires the vec_lower_bound described
3322 by EXPR, UNSIGNED_P and MIN_VALUE. */
3323
3324 static void
3325 vect_check_lower_bound (loop_vec_info loop_vinfo, tree expr, bool unsigned_p,
3326 poly_uint64 min_value)
3327 {
3328 vec<vec_lower_bound> lower_bounds = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
3329 for (unsigned int i = 0; i < lower_bounds.length (); ++i)
3330 if (operand_equal_p (lower_bounds[i].expr, expr, 0))
3331 {
3332 unsigned_p &= lower_bounds[i].unsigned_p;
3333 min_value = upper_bound (lower_bounds[i].min_value, min_value);
3334 if (lower_bounds[i].unsigned_p != unsigned_p
3335 || maybe_lt (lower_bounds[i].min_value, min_value))
3336 {
3337 lower_bounds[i].unsigned_p = unsigned_p;
3338 lower_bounds[i].min_value = min_value;
3339 if (dump_enabled_p ())
3340 {
3341 dump_printf_loc (MSG_NOTE, vect_location,
3342 "updating run-time check to ");
3343 dump_lower_bound (MSG_NOTE, lower_bounds[i]);
3344 dump_printf (MSG_NOTE, "\n");
3345 }
3346 }
3347 return;
3348 }
3349
3350 vec_lower_bound lower_bound (expr, unsigned_p, min_value);
3351 if (dump_enabled_p ())
3352 {
3353 dump_printf_loc (MSG_NOTE, vect_location, "need a run-time check that ");
3354 dump_lower_bound (MSG_NOTE, lower_bound);
3355 dump_printf (MSG_NOTE, "\n");
3356 }
3357 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).safe_push (lower_bound);
3358 }
3359
3360 /* Return true if it's unlikely that the step of the vectorized form of DR_INFO
3361 will span fewer than GAP bytes. */
3362
3363 static bool
3364 vect_small_gap_p (loop_vec_info loop_vinfo, dr_vec_info *dr_info,
3365 poly_int64 gap)
3366 {
3367 stmt_vec_info stmt_info = dr_info->stmt;
3368 HOST_WIDE_INT count
3369 = estimated_poly_value (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
3370 if (DR_GROUP_FIRST_ELEMENT (stmt_info))
3371 count *= DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (stmt_info));
3372 return (estimated_poly_value (gap)
3373 <= count * vect_get_scalar_dr_size (dr_info));
3374 }
3375
3376 /* Return true if we know that there is no alias between DR_INFO_A and
3377 DR_INFO_B when abs (DR_STEP (DR_INFO_A->dr)) >= N for some N.
3378 When returning true, set *LOWER_BOUND_OUT to this N. */
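/* A small worked case (numbers invented for illustration): for two
   4-byte accesses with the same base, offset and unknown step and with
   DR_INITs 0 and 4, the combined accesses span 4 + 4 - 0 = 8 bytes, so
   *LOWER_BOUND_OUT is set to 8 and there is no alias whenever
   abs (DR_STEP) >= 8.  */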
3379
3380 static bool
3381 vectorizable_with_step_bound_p (dr_vec_info *dr_info_a, dr_vec_info *dr_info_b,
3382 poly_uint64 *lower_bound_out)
3383 {
3384 /* Check that there is a constant gap of known sign between DR_A
3385 and DR_B. */
3386 data_reference *dr_a = dr_info_a->dr;
3387 data_reference *dr_b = dr_info_b->dr;
3388 poly_int64 init_a, init_b;
3389 if (!operand_equal_p (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b), 0)
3390 || !operand_equal_p (DR_OFFSET (dr_a), DR_OFFSET (dr_b), 0)
3391 || !operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0)
3392 || !poly_int_tree_p (DR_INIT (dr_a), &init_a)
3393 || !poly_int_tree_p (DR_INIT (dr_b), &init_b)
3394 || !ordered_p (init_a, init_b))
3395 return false;
3396
3397 /* Sort DR_A and DR_B by the address they access. */
3398 if (maybe_lt (init_b, init_a))
3399 {
3400 std::swap (init_a, init_b);
3401 std::swap (dr_info_a, dr_info_b);
3402 std::swap (dr_a, dr_b);
3403 }
3404
3405 /* If the two accesses could be dependent within a scalar iteration,
3406 make sure that we'd retain their order. */
3407 if (maybe_gt (init_a + vect_get_scalar_dr_size (dr_info_a), init_b)
3408 && !vect_preserves_scalar_order_p (dr_info_a, dr_info_b))
3409 return false;
3410
3411 /* There is no alias if abs (DR_STEP) is greater than or equal to
3412 the bytes spanned by the combination of the two accesses. */
3413 *lower_bound_out = init_b + vect_get_scalar_dr_size (dr_info_b) - init_a;
3414 return true;
3415 }
3416
3417 /* Function vect_prune_runtime_alias_test_list.
3418
3419 Prune a list of ddrs to be tested at run-time by versioning for alias.
3420 Merge several alias checks into one if possible.
3421 Return FALSE if the resulting list of ddrs is longer than allowed by
3422 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
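/* As an illustration (the loop below is only an example), vectorizing

     void
     f (int *a, int *b, int n)
     {
       for (int i = 0; i < n; i++)
         a[i] = b[i] + 1;
     }

   when nothing is known about the relationship of A and B typically
   needs one such run-time test, checking that the byte ranges accessed
   through A and B in the vectorized loop do not overlap.  */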
3423
3424 opt_result
3425 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
3426 {
3427 typedef pair_hash <tree_operand_hash, tree_operand_hash> tree_pair_hash;
3428 hash_set <tree_pair_hash> compared_objects;
3429
3430 vec<ddr_p> may_alias_ddrs = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
3431 vec<dr_with_seg_len_pair_t> &comp_alias_ddrs
3432 = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3433 vec<vec_object_pair> &check_unequal_addrs
3434 = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3435 poly_uint64 vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3436 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
3437
3438 ddr_p ddr;
3439 unsigned int i;
3440 tree length_factor;
3441
3442 DUMP_VECT_SCOPE ("vect_prune_runtime_alias_test_list");
3443
3444 /* Step values are irrelevant for aliasing if the number of vector
3445 iterations is equal to the number of scalar iterations (which can
3446 happen for fully-SLP loops). */
3447 bool ignore_step_p = known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U);
3448
3449 if (!ignore_step_p)
3450 {
3451 /* Convert the checks for nonzero steps into bound tests. */
3452 tree value;
3453 FOR_EACH_VEC_ELT (LOOP_VINFO_CHECK_NONZERO (loop_vinfo), i, value)
3454 vect_check_lower_bound (loop_vinfo, value, true, 1);
3455 }
3456
3457 if (may_alias_ddrs.is_empty ())
3458 return opt_result::success ();
3459
3460 comp_alias_ddrs.create (may_alias_ddrs.length ());
3461
3462 unsigned int loop_depth
3463 = index_in_loop_nest (LOOP_VINFO_LOOP (loop_vinfo)->num,
3464 LOOP_VINFO_LOOP_NEST (loop_vinfo));
3465
3466 /* First, we collect all data ref pairs for aliasing checks. */
3467 FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
3468 {
3469 int comp_res;
3470 poly_uint64 lower_bound;
3471 tree segment_length_a, segment_length_b;
3472 unsigned HOST_WIDE_INT access_size_a, access_size_b;
3473 unsigned int align_a, align_b;
3474
3475 /* Ignore the alias if the VF we chose ended up being no greater
3476 than the dependence distance. */
3477 if (dependence_distance_ge_vf (ddr, loop_depth, vect_factor))
3478 continue;
3479
3480 if (DDR_OBJECT_A (ddr))
3481 {
3482 vec_object_pair new_pair (DDR_OBJECT_A (ddr), DDR_OBJECT_B (ddr));
3483 if (!compared_objects.add (new_pair))
3484 {
3485 if (dump_enabled_p ())
3486 dump_printf_loc (MSG_NOTE, vect_location,
3487 "checking that %T and %T"
3488 " have different addresses\n",
3489 new_pair.first, new_pair.second);
3490 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).safe_push (new_pair);
3491 }
3492 continue;
3493 }
3494
3495 dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (DDR_A (ddr));
3496 stmt_vec_info stmt_info_a = dr_info_a->stmt;
3497
3498 dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (DDR_B (ddr));
3499 stmt_vec_info stmt_info_b = dr_info_b->stmt;
3500
3501 /* Skip the pair if inter-iteration dependencies are irrelevant
3502 and intra-iteration dependencies are guaranteed to be honored. */
3503 if (ignore_step_p
3504 && (vect_preserves_scalar_order_p (dr_info_a, dr_info_b)
3505 || vectorizable_with_step_bound_p (dr_info_a, dr_info_b,
3506 &lower_bound)))
3507 {
3508 if (dump_enabled_p ())
3509 dump_printf_loc (MSG_NOTE, vect_location,
3510 "no need for alias check between "
3511 "%T and %T when VF is 1\n",
3512 DR_REF (dr_info_a->dr), DR_REF (dr_info_b->dr));
3513 continue;
3514 }
3515
3516 /* See whether we can handle the alias using a bounds check on
3517 the step, and whether that's likely to be the best approach.
3518 (It might not be, for example, if the minimum step is much larger
3519 than the number of bytes handled by one vector iteration.) */
3520 if (!ignore_step_p
3521 && TREE_CODE (DR_STEP (dr_info_a->dr)) != INTEGER_CST
3522 && vectorizable_with_step_bound_p (dr_info_a, dr_info_b,
3523 &lower_bound)
3524 && (vect_small_gap_p (loop_vinfo, dr_info_a, lower_bound)
3525 || vect_small_gap_p (loop_vinfo, dr_info_b, lower_bound)))
3526 {
3527 bool unsigned_p = dr_known_forward_stride_p (dr_info_a->dr);
3528 if (dump_enabled_p ())
3529 {
3530 dump_printf_loc (MSG_NOTE, vect_location, "no alias between "
3531 "%T and %T when the step %T is outside ",
3532 DR_REF (dr_info_a->dr),
3533 DR_REF (dr_info_b->dr),
3534 DR_STEP (dr_info_a->dr));
3535 if (unsigned_p)
3536 dump_printf (MSG_NOTE, "[0");
3537 else
3538 {
3539 dump_printf (MSG_NOTE, "(");
3540 dump_dec (MSG_NOTE, poly_int64 (-lower_bound));
3541 }
3542 dump_printf (MSG_NOTE, ", ");
3543 dump_dec (MSG_NOTE, lower_bound);
3544 dump_printf (MSG_NOTE, ")\n");
3545 }
3546 vect_check_lower_bound (loop_vinfo, DR_STEP (dr_info_a->dr),
3547 unsigned_p, lower_bound);
3548 continue;
3549 }
3550
3551 stmt_vec_info dr_group_first_a = DR_GROUP_FIRST_ELEMENT (stmt_info_a);
3552 if (dr_group_first_a)
3553 {
3554 stmt_info_a = dr_group_first_a;
3555 dr_info_a = STMT_VINFO_DR_INFO (stmt_info_a);
3556 }
3557
3558 stmt_vec_info dr_group_first_b = DR_GROUP_FIRST_ELEMENT (stmt_info_b);
3559 if (dr_group_first_b)
3560 {
3561 stmt_info_b = dr_group_first_b;
3562 dr_info_b = STMT_VINFO_DR_INFO (stmt_info_b);
3563 }
3564
3565 if (ignore_step_p)
3566 {
3567 segment_length_a = size_zero_node;
3568 segment_length_b = size_zero_node;
3569 }
3570 else
3571 {
3572 if (!operand_equal_p (DR_STEP (dr_info_a->dr),
3573 DR_STEP (dr_info_b->dr), 0))
3574 length_factor = scalar_loop_iters;
3575 else
3576 length_factor = size_int (vect_factor);
3577 segment_length_a = vect_vfa_segment_size (dr_info_a, length_factor);
3578 segment_length_b = vect_vfa_segment_size (dr_info_b, length_factor);
3579 }
3580 access_size_a = vect_vfa_access_size (dr_info_a);
3581 access_size_b = vect_vfa_access_size (dr_info_b);
3582 align_a = vect_vfa_align (dr_info_a);
3583 align_b = vect_vfa_align (dr_info_b);
3584
3585 comp_res = data_ref_compare_tree (DR_BASE_ADDRESS (dr_info_a->dr),
3586 DR_BASE_ADDRESS (dr_info_b->dr));
3587 if (comp_res == 0)
3588 comp_res = data_ref_compare_tree (DR_OFFSET (dr_info_a->dr),
3589 DR_OFFSET (dr_info_b->dr));
3590
3591 /* See whether the alias is known at compilation time. */
3592 if (comp_res == 0
3593 && TREE_CODE (DR_STEP (dr_info_a->dr)) == INTEGER_CST
3594 && TREE_CODE (DR_STEP (dr_info_b->dr)) == INTEGER_CST
3595 && poly_int_tree_p (segment_length_a)
3596 && poly_int_tree_p (segment_length_b))
3597 {
3598 int res = vect_compile_time_alias (dr_info_a, dr_info_b,
3599 segment_length_a,
3600 segment_length_b,
3601 access_size_a,
3602 access_size_b);
3603 if (res >= 0 && dump_enabled_p ())
3604 {
3605 dump_printf_loc (MSG_NOTE, vect_location,
3606 "can tell at compile time that %T and %T",
3607 DR_REF (dr_info_a->dr), DR_REF (dr_info_b->dr));
3608 if (res == 0)
3609 dump_printf (MSG_NOTE, " do not alias\n");
3610 else
3611 dump_printf (MSG_NOTE, " alias\n");
3612 }
3613
3614 if (res == 0)
3615 continue;
3616
3617 if (res == 1)
3618 return opt_result::failure_at (stmt_info_b->stmt,
3619 "not vectorized:"
3620 " compilation time alias: %G%G",
3621 stmt_info_a->stmt,
3622 stmt_info_b->stmt);
3623 }
3624
3625 dr_with_seg_len_pair_t dr_with_seg_len_pair
3626 (dr_with_seg_len (dr_info_a->dr, segment_length_a,
3627 access_size_a, align_a),
3628 dr_with_seg_len (dr_info_b->dr, segment_length_b,
3629 access_size_b, align_b));
3630
3631 /* Canonicalize pairs by sorting the two DR members. */
3632 if (comp_res > 0)
3633 std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
3634
3635 comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
3636 }
3637
3638 prune_runtime_alias_test_list (&comp_alias_ddrs, vect_factor);
3639
3640 unsigned int count = (comp_alias_ddrs.length ()
3641 + check_unequal_addrs.length ());
3642
3643 if (dump_enabled_p ())
3644 dump_printf_loc (MSG_NOTE, vect_location,
3645 "improved number of alias checks from %d to %d\n",
3646 may_alias_ddrs.length (), count);
3647 unsigned limit = PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS);
3648 if (flag_simd_cost_model == VECT_COST_MODEL_CHEAP)
3649 limit = default_param_value
3650 (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) * 6 / 10;
3651 if (count > limit)
3652 return opt_result::failure_at
3653 (vect_location,
3654 "number of versioning for alias run-time tests exceeds %d "
3655 "(--param vect-max-version-for-alias-checks)\n", limit);
3656
3657 return opt_result::success ();
3658 }
3659
3660 /* Check whether we can use an internal function for a gather load
3661 or scatter store. READ_P is true for loads and false for stores.
3662 MASKED_P is true if the load or store is conditional. MEMORY_TYPE is
3663 the type of the memory elements being loaded or stored. OFFSET_BITS
3664 is the number of bits in each scalar offset and OFFSET_SIGN is the
3665 sign of the offset. SCALE is the amount by which the offset should
3666 be multiplied *after* it has been converted to address width.
3667
3668 Return true if the function is supported, storing the function
3669 id in *IFN_OUT and the type of a vector element in *ELEMENT_TYPE_OUT. */
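/* For instance (a hypothetical target): gathering 32-bit elements with
   64-bit offsets cannot use the internal functions, since the offsets
   would have to be truncated, whereas 16-bit offsets can be extended
   to 32 bits first and so may be usable, subject to the
   internal_gather_scatter_fn_supported_p check below.  */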
3670
3671 bool
3672 vect_gather_scatter_fn_p (bool read_p, bool masked_p, tree vectype,
3673 tree memory_type, unsigned int offset_bits,
3674 signop offset_sign, int scale,
3675 internal_fn *ifn_out, tree *element_type_out)
3676 {
3677 unsigned int memory_bits = tree_to_uhwi (TYPE_SIZE (memory_type));
3678 unsigned int element_bits = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (vectype)));
3679 if (offset_bits > element_bits)
3680 /* Internal functions require the offset to be the same width as
3681 the vector elements. We can extend narrower offsets, but it isn't
3682 safe to truncate wider offsets. */
3683 return false;
3684
3685 if (element_bits != memory_bits)
3686 /* For now the vector elements must be the same width as the
3687 memory elements. */
3688 return false;
3689
3690 /* Work out which function we need. */
3691 internal_fn ifn;
3692 if (read_p)
3693 ifn = masked_p ? IFN_MASK_GATHER_LOAD : IFN_GATHER_LOAD;
3694 else
3695 ifn = masked_p ? IFN_MASK_SCATTER_STORE : IFN_SCATTER_STORE;
3696
3697 /* Test whether the target supports this combination. */
3698 if (!internal_gather_scatter_fn_supported_p (ifn, vectype, memory_type,
3699 offset_sign, scale))
3700 return false;
3701
3702 *ifn_out = ifn;
3703 *element_type_out = TREE_TYPE (vectype);
3704 return true;
3705 }
3706
3707 /* STMT_INFO is a call to an internal gather load or scatter store function.
3708 Describe the operation in INFO. */
3709
3710 static void
3711 vect_describe_gather_scatter_call (stmt_vec_info stmt_info,
3712 gather_scatter_info *info)
3713 {
3714 gcall *call = as_a <gcall *> (stmt_info->stmt);
3715 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3716 data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3717
3718 info->ifn = gimple_call_internal_fn (call);
3719 info->decl = NULL_TREE;
3720 info->base = gimple_call_arg (call, 0);
3721 info->offset = gimple_call_arg (call, 1);
3722 info->offset_dt = vect_unknown_def_type;
3723 info->offset_vectype = NULL_TREE;
3724 info->scale = TREE_INT_CST_LOW (gimple_call_arg (call, 2));
3725 info->element_type = TREE_TYPE (vectype);
3726 info->memory_type = TREE_TYPE (DR_REF (dr));
3727 }
3728
3729 /* Return true if a non-affine read or write in STMT_INFO is suitable for a
3730 gather load or scatter store. Describe the operation in *INFO if so. */
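/* A typical candidate (example code only) is an indexed access such as

     for (i = 0; i < n; i++)
       res[i] = data[idx[i]];

   where the load from DATA is non-affine in I.  If the analysis below
   succeeds, *INFO roughly describes it as base DATA, offset taken from
   IDX[I] and the element size as the scale.  */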
3731
3732 bool
3733 vect_check_gather_scatter (stmt_vec_info stmt_info, loop_vec_info loop_vinfo,
3734 gather_scatter_info *info)
3735 {
3736 HOST_WIDE_INT scale = 1;
3737 poly_int64 pbitpos, pbitsize;
3738 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3739 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3740 tree offtype = NULL_TREE;
3741 tree decl = NULL_TREE, base, off;
3742 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3743 tree memory_type = TREE_TYPE (DR_REF (dr));
3744 machine_mode pmode;
3745 int punsignedp, reversep, pvolatilep = 0;
3746 internal_fn ifn;
3747 tree element_type;
3748 bool masked_p = false;
3749
3750 /* See whether this is already a call to a gather/scatter internal function.
3751 If not, see whether it's a masked load or store. */
3752 gcall *call = dyn_cast <gcall *> (stmt_info->stmt);
3753 if (call && gimple_call_internal_p (call))
3754 {
3755 ifn = gimple_call_internal_fn (call);
3756 if (internal_gather_scatter_fn_p (ifn))
3757 {
3758 vect_describe_gather_scatter_call (stmt_info, info);
3759 return true;
3760 }
3761 masked_p = (ifn == IFN_MASK_LOAD || ifn == IFN_MASK_STORE);
3762 }
3763
3764 /* True if we should aim to use internal functions rather than
3765 built-in functions. */
3766 bool use_ifn_p = (DR_IS_READ (dr)
3767 ? supports_vec_gather_load_p ()
3768 : supports_vec_scatter_store_p ());
3769
3770 base = DR_REF (dr);
3771 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
3772 see if we can use the def stmt of the address. */
3773 if (masked_p
3774 && TREE_CODE (base) == MEM_REF
3775 && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
3776 && integer_zerop (TREE_OPERAND (base, 1))
3777 && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
3778 {
3779 gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
3780 if (is_gimple_assign (def_stmt)
3781 && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
3782 base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
3783 }
3784
3785 /* The gather and scatter builtins need an address of the form
3786 loop_invariant + vector * {1, 2, 4, 8}
3787 or
3788 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3789 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3790 of loop invariants/SSA_NAMEs defined in the loop, with casts,
3791 multiplications and additions in it. To get a vector, we need
3792 a single SSA_NAME that will be defined in the loop and will
3793 contain everything that is not loop invariant and that can be
3794 vectorized. The following code attempts to find such a preexisting
3795 SSA_NAME OFF and put the loop invariants into a tree BASE
3796 that can be gimplified before the loop. */
3797 base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
3798 &punsignedp, &reversep, &pvolatilep);
3799 if (reversep)
3800 return false;
3801
3802 poly_int64 pbytepos = exact_div (pbitpos, BITS_PER_UNIT);
3803
3804 if (TREE_CODE (base) == MEM_REF)
3805 {
3806 if (!integer_zerop (TREE_OPERAND (base, 1)))
3807 {
3808 if (off == NULL_TREE)
3809 off = wide_int_to_tree (sizetype, mem_ref_offset (base));
3810 else
3811 off = size_binop (PLUS_EXPR, off,
3812 fold_convert (sizetype, TREE_OPERAND (base, 1)));
3813 }
3814 base = TREE_OPERAND (base, 0);
3815 }
3816 else
3817 base = build_fold_addr_expr (base);
3818
3819 if (off == NULL_TREE)
3820 off = size_zero_node;
3821
3822 /* If base is not loop invariant, then either off must be 0 (in which
3823 case we start with just the constant offset in the loop invariant BASE
3824 and continue with base as OFF), or we have to give up.
3825 We could handle the latter case by gimplifying the addition of base + off
3826 into some SSA_NAME and using that as off, but for now punt. */
3827 if (!expr_invariant_in_loop_p (loop, base))
3828 {
3829 if (!integer_zerop (off))
3830 return false;
3831 off = base;
3832 base = size_int (pbytepos);
3833 }
3834 /* Otherwise put base + constant offset into the loop invariant BASE
3835 and continue with OFF. */
3836 else
3837 {
3838 base = fold_convert (sizetype, base);
3839 base = size_binop (PLUS_EXPR, base, size_int (pbytepos));
3840 }
3841
3842 /* OFF at this point may be either a SSA_NAME or some tree expression
3843 from get_inner_reference. Try to peel off loop invariants from it
3844 into BASE as long as possible. */
3845 STRIP_NOPS (off);
3846 while (offtype == NULL_TREE)
3847 {
3848 enum tree_code code;
3849 tree op0, op1, add = NULL_TREE;
3850
3851 if (TREE_CODE (off) == SSA_NAME)
3852 {
3853 gimple *def_stmt = SSA_NAME_DEF_STMT (off);
3854
3855 if (expr_invariant_in_loop_p (loop, off))
3856 return false;
3857
3858 if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
3859 break;
3860
3861 op0 = gimple_assign_rhs1 (def_stmt);
3862 code = gimple_assign_rhs_code (def_stmt);
3863 op1 = gimple_assign_rhs2 (def_stmt);
3864 }
3865 else
3866 {
3867 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
3868 return false;
3869 code = TREE_CODE (off);
3870 extract_ops_from_tree (off, &code, &op0, &op1);
3871 }
3872 switch (code)
3873 {
3874 case POINTER_PLUS_EXPR:
3875 case PLUS_EXPR:
3876 if (expr_invariant_in_loop_p (loop, op0))
3877 {
3878 add = op0;
3879 off = op1;
3880 do_add:
3881 add = fold_convert (sizetype, add);
3882 if (scale != 1)
3883 add = size_binop (MULT_EXPR, add, size_int (scale));
3884 base = size_binop (PLUS_EXPR, base, add);
3885 continue;
3886 }
3887 if (expr_invariant_in_loop_p (loop, op1))
3888 {
3889 add = op1;
3890 off = op0;
3891 goto do_add;
3892 }
3893 break;
3894 case MINUS_EXPR:
3895 if (expr_invariant_in_loop_p (loop, op1))
3896 {
3897 add = fold_convert (sizetype, op1);
3898 add = size_binop (MINUS_EXPR, size_zero_node, add);
3899 off = op0;
3900 goto do_add;
3901 }
3902 break;
3903 case MULT_EXPR:
3904 if (scale == 1 && tree_fits_shwi_p (op1))
3905 {
3906 int new_scale = tree_to_shwi (op1);
3907 /* Only treat this as a scaling operation if the target
3908 supports it. */
3909 if (use_ifn_p
3910 && !vect_gather_scatter_fn_p (DR_IS_READ (dr), masked_p,
3911 vectype, memory_type, 1,
3912 TYPE_SIGN (TREE_TYPE (op0)),
3913 new_scale, &ifn,
3914 &element_type))
3915 break;
3916 scale = new_scale;
3917 off = op0;
3918 continue;
3919 }
3920 break;
3921 case SSA_NAME:
3922 off = op0;
3923 continue;
3924 CASE_CONVERT:
3925 if (!POINTER_TYPE_P (TREE_TYPE (op0))
3926 && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3927 break;
3928 if (TYPE_PRECISION (TREE_TYPE (op0))
3929 == TYPE_PRECISION (TREE_TYPE (off)))
3930 {
3931 off = op0;
3932 continue;
3933 }
3934
3935 /* The internal functions need the offset to be the same width
3936 as the elements of VECTYPE. Don't include operations that
3937 cast the offset from that width to a different width. */
3938 if (use_ifn_p
3939 && (int_size_in_bytes (TREE_TYPE (vectype))
3940 == int_size_in_bytes (TREE_TYPE (off))))
3941 break;
3942
3943 if (TYPE_PRECISION (TREE_TYPE (op0))
3944 < TYPE_PRECISION (TREE_TYPE (off)))
3945 {
3946 off = op0;
3947 offtype = TREE_TYPE (off);
3948 STRIP_NOPS (off);
3949 continue;
3950 }
3951 break;
3952 default:
3953 break;
3954 }
3955 break;
3956 }
3957
3958 /* If at the end OFF still isn't a SSA_NAME or isn't
3959 defined in the loop, punt. */
3960 if (TREE_CODE (off) != SSA_NAME
3961 || expr_invariant_in_loop_p (loop, off))
3962 return false;
3963
3964 if (offtype == NULL_TREE)
3965 offtype = TREE_TYPE (off);
3966
3967 if (use_ifn_p)
3968 {
3969 if (!vect_gather_scatter_fn_p (DR_IS_READ (dr), masked_p, vectype,
3970 memory_type, TYPE_PRECISION (offtype),
3971 TYPE_SIGN (offtype), scale, &ifn,
3972 &element_type))
3973 return false;
3974 }
3975 else
3976 {
3977 if (DR_IS_READ (dr))
3978 {
3979 if (targetm.vectorize.builtin_gather)
3980 decl = targetm.vectorize.builtin_gather (vectype, offtype, scale);
3981 }
3982 else
3983 {
3984 if (targetm.vectorize.builtin_scatter)
3985 decl = targetm.vectorize.builtin_scatter (vectype, offtype, scale);
3986 }
3987
3988 if (!decl)
3989 return false;
3990
3991 ifn = IFN_LAST;
3992 element_type = TREE_TYPE (vectype);
3993 }
3994
3995 info->ifn = ifn;
3996 info->decl = decl;
3997 info->base = base;
3998 info->offset = off;
3999 info->offset_dt = vect_unknown_def_type;
4000 info->offset_vectype = NULL_TREE;
4001 info->scale = scale;
4002 info->element_type = element_type;
4003 info->memory_type = memory_type;
4004 return true;
4005 }
4006
4007 /* Find the data references in STMT, analyze them with respect to LOOP and
4008 append them to DATAREFS. Return false if datarefs in this stmt cannot
4009 be handled. */
4010
4011 opt_result
4012 vect_find_stmt_data_reference (loop_p loop, gimple *stmt,
4013 vec<data_reference_p> *datarefs)
4014 {
4015 /* We can ignore clobbers for dataref analysis - they are removed during
4016 loop vectorization and BB vectorization checks dependences with a
4017 stmt walk. */
4018 if (gimple_clobber_p (stmt))
4019 return opt_result::success ();
4020
4021 if (gimple_has_volatile_ops (stmt))
4022 return opt_result::failure_at (stmt, "not vectorized: volatile type: %G",
4023 stmt);
4024
4025 if (stmt_can_throw_internal (cfun, stmt))
4026 return opt_result::failure_at (stmt,
4027 "not vectorized:"
4028 " statement can throw an exception: %G",
4029 stmt);
4030
4031 auto_vec<data_reference_p, 2> refs;
4032 opt_result res = find_data_references_in_stmt (loop, stmt, &refs);
4033 if (!res)
4034 return res;
4035
4036 if (refs.is_empty ())
4037 return opt_result::success ();
4038
4039 if (refs.length () > 1)
4040 return opt_result::failure_at (stmt,
4041 "not vectorized:"
4042 " more than one data ref in stmt: %G", stmt);
4043
4044 if (gcall *call = dyn_cast <gcall *> (stmt))
4045 if (!gimple_call_internal_p (call)
4046 || (gimple_call_internal_fn (call) != IFN_MASK_LOAD
4047 && gimple_call_internal_fn (call) != IFN_MASK_STORE))
4048 return opt_result::failure_at (stmt,
4049 "not vectorized: dr in a call %G", stmt);
4050
4051 data_reference_p dr = refs.pop ();
4052 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
4053 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
4054 return opt_result::failure_at (stmt,
4055 "not vectorized:"
4056 " statement is bitfield access %G", stmt);
4057
4058 if (DR_BASE_ADDRESS (dr)
4059 && TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
4060 return opt_result::failure_at (stmt,
4061 "not vectorized:"
4062 " base addr of dr is a constant\n");
4063
4064 /* Check whether this may be a SIMD lane access and adjust the
4065 DR to make it easier for us to handle it. */
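  /* A typical instance (illustrative) is the per-lane array that OpenMP
     lowering creates for a private variable of a "#pragma omp simd"
     loop, accessed as array[.GOMP_SIMD_LANE (simduid)]; the code below
     looks for the IFN_GOMP_SIMD_LANE call feeding the index and
     rewrites the data reference accordingly.  */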
4066 if (loop
4067 && loop->simduid
4068 && (!DR_BASE_ADDRESS (dr)
4069 || !DR_OFFSET (dr)
4070 || !DR_INIT (dr)
4071 || !DR_STEP (dr)))
4072 {
4073 struct data_reference *newdr
4074 = create_data_ref (NULL, loop_containing_stmt (stmt), DR_REF (dr), stmt,
4075 DR_IS_READ (dr), DR_IS_CONDITIONAL_IN_STMT (dr));
4076 if (DR_BASE_ADDRESS (newdr)
4077 && DR_OFFSET (newdr)
4078 && DR_INIT (newdr)
4079 && DR_STEP (newdr)
4080 && TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
4081 && integer_zerop (DR_STEP (newdr)))
4082 {
4083 tree base_address = DR_BASE_ADDRESS (newdr);
4084 tree off = DR_OFFSET (newdr);
4085 tree step = ssize_int (1);
4086 if (integer_zerop (off)
4087 && TREE_CODE (base_address) == POINTER_PLUS_EXPR)
4088 {
4089 off = TREE_OPERAND (base_address, 1);
4090 base_address = TREE_OPERAND (base_address, 0);
4091 }
4092 STRIP_NOPS (off);
4093 if (TREE_CODE (off) == MULT_EXPR
4094 && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
4095 {
4096 step = TREE_OPERAND (off, 1);
4097 off = TREE_OPERAND (off, 0);
4098 STRIP_NOPS (off);
4099 }
4100 if (CONVERT_EXPR_P (off)
4101 && (TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off, 0)))
4102 < TYPE_PRECISION (TREE_TYPE (off))))
4103 off = TREE_OPERAND (off, 0);
4104 if (TREE_CODE (off) == SSA_NAME)
4105 {
4106 gimple *def = SSA_NAME_DEF_STMT (off);
4107 /* Look through widening conversion. */
4108 if (is_gimple_assign (def)
4109 && CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (def)))
4110 {
4111 tree rhs1 = gimple_assign_rhs1 (def);
4112 if (TREE_CODE (rhs1) == SSA_NAME
4113 && INTEGRAL_TYPE_P (TREE_TYPE (rhs1))
4114 && (TYPE_PRECISION (TREE_TYPE (off))
4115 > TYPE_PRECISION (TREE_TYPE (rhs1))))
4116 def = SSA_NAME_DEF_STMT (rhs1);
4117 }
4118 if (is_gimple_call (def)
4119 && gimple_call_internal_p (def)
4120 && (gimple_call_internal_fn (def) == IFN_GOMP_SIMD_LANE))
4121 {
4122 tree arg = gimple_call_arg (def, 0);
4123 tree reft = TREE_TYPE (DR_REF (newdr));
4124 gcc_assert (TREE_CODE (arg) == SSA_NAME);
4125 arg = SSA_NAME_VAR (arg);
4126 if (arg == loop->simduid
4127 /* For now. */
4128 && tree_int_cst_equal (TYPE_SIZE_UNIT (reft), step))
4129 {
4130 DR_BASE_ADDRESS (newdr) = base_address;
4131 DR_OFFSET (newdr) = ssize_int (0);
4132 DR_STEP (newdr) = step;
4133 DR_OFFSET_ALIGNMENT (newdr) = BIGGEST_ALIGNMENT;
4134 DR_STEP_ALIGNMENT (newdr) = highest_pow2_factor (step);
4135 /* Mark as simd-lane access. */
4136 tree arg2 = gimple_call_arg (def, 1);
4137 newdr->aux = (void *) (-1 - tree_to_uhwi (arg2));
4138 free_data_ref (dr);
4139 datarefs->safe_push (newdr);
4140 return opt_result::success ();
4141 }
4142 }
4143 }
4144 }
4145 free_data_ref (newdr);
4146 }
4147
4148 datarefs->safe_push (dr);
4149 return opt_result::success ();
4150 }
4151
4152 /* Function vect_analyze_data_refs.
4153
4154 Find all the data references in the loop or basic block.
4155
4156 The general structure of the analysis of data refs in the vectorizer is as
4157 follows:
4158 1- vect_analyze_data_refs(loop/bb): call
4159 compute_data_dependences_for_loop/bb to find and analyze all data-refs
4160 in the loop/bb and their dependences.
4161 2- vect_analyze_dependences(): apply dependence testing using ddrs.
4162 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
4163 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
4164
4165 */
4166
4167 opt_result
4168 vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf, bool *fatal)
4169 {
4170 class loop *loop = NULL;
4171 unsigned int i;
4172 struct data_reference *dr;
4173 tree scalar_type;
4174
4175 DUMP_VECT_SCOPE ("vect_analyze_data_refs");
4176
4177 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
4178 loop = LOOP_VINFO_LOOP (loop_vinfo);
4179
4180 /* Go through the data-refs, check that the analysis succeeded. Update
4181 pointer from stmt_vec_info struct to DR and vectype. */
4182
4183 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
4184 FOR_EACH_VEC_ELT (datarefs, i, dr)
4185 {
4186 enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
4187 poly_uint64 vf;
4188
4189 gcc_assert (DR_REF (dr));
4190 stmt_vec_info stmt_info = vinfo->lookup_stmt (DR_STMT (dr));
4191 gcc_assert (!stmt_info->dr_aux.dr);
4192 stmt_info->dr_aux.dr = dr;
4193 stmt_info->dr_aux.stmt = stmt_info;
4194
4195 /* Check that analysis of the data-ref succeeded. */
4196 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
4197 || !DR_STEP (dr))
4198 {
4199 bool maybe_gather
4200 = DR_IS_READ (dr)
4201 && !TREE_THIS_VOLATILE (DR_REF (dr))
4202 && (targetm.vectorize.builtin_gather != NULL
4203 || supports_vec_gather_load_p ());
4204 bool maybe_scatter
4205 = DR_IS_WRITE (dr)
4206 && !TREE_THIS_VOLATILE (DR_REF (dr))
4207 && (targetm.vectorize.builtin_scatter != NULL
4208 || supports_vec_scatter_store_p ());
4209
4210 /* If the target supports vector gather loads or scatter stores,
4211 see if they can be used. */
4212 if (is_a <loop_vec_info> (vinfo)
4213 && !nested_in_vect_loop_p (loop, stmt_info))
4214 {
4215 if (maybe_gather || maybe_scatter)
4216 {
4217 if (maybe_gather)
4218 gatherscatter = GATHER;
4219 else
4220 gatherscatter = SCATTER;
4221 }
4222 }
4223
4224 if (gatherscatter == SG_NONE)
4225 {
4226 if (dump_enabled_p ())
4227 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4228 "not vectorized: data ref analysis "
4229 "failed %G", stmt_info->stmt);
4230 if (is_a <bb_vec_info> (vinfo))
4231 {
4232 /* In BB vectorization the ref can still participate
4233 in dependence analysis, we just can't vectorize it. */
4234 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4235 continue;
4236 }
4237 return opt_result::failure_at (stmt_info->stmt,
4238 "not vectorized:"
4239 " data ref analysis failed: %G",
4240 stmt_info->stmt);
4241 }
4242 }
4243
4244 /* See if this was detected as SIMD lane access. */
4245 if (dr->aux == (void *)-1
4246 || dr->aux == (void *)-2
4247 || dr->aux == (void *)-3
4248 || dr->aux == (void *)-4)
4249 {
4250 if (nested_in_vect_loop_p (loop, stmt_info))
4251 return opt_result::failure_at (stmt_info->stmt,
4252 "not vectorized:"
4253 " data ref analysis failed: %G",
4254 stmt_info->stmt);
4255 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info)
4256 = -(uintptr_t) dr->aux;
4257 }
4258
4259 tree base = get_base_address (DR_REF (dr));
4260 if (base && VAR_P (base) && DECL_NONALIASED (base))
4261 {
4262 if (dump_enabled_p ())
4263 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4264 "not vectorized: base object not addressable "
4265 "for stmt: %G", stmt_info->stmt);
4266 if (is_a <bb_vec_info> (vinfo))
4267 {
4268 /* In BB vectorization the ref can still participate
4269 in dependence analysis, we just can't vectorize it. */
4270 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4271 continue;
4272 }
4273 return opt_result::failure_at (stmt_info->stmt,
4274 "not vectorized: base object not"
4275 " addressable for stmt: %G",
4276 stmt_info->stmt);
4277 }
4278
4279 if (is_a <loop_vec_info> (vinfo)
4280 && DR_STEP (dr)
4281 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
4282 {
4283 if (nested_in_vect_loop_p (loop, stmt_info))
4284 return opt_result::failure_at (stmt_info->stmt,
4285 "not vectorized: "
4286 "not suitable for strided load %G",
4287 stmt_info->stmt);
4288 STMT_VINFO_STRIDED_P (stmt_info) = true;
4289 }
4290
4291 /* Update DR field in stmt_vec_info struct. */
4292
4293 /* If the dataref is in an inner-loop of the loop that is considered
4294 for vectorization, we also want to analyze the access relative to
4295 the outer-loop (DR contains information only relative to the
4296 inner-most enclosing loop). We do that by building a reference to the
4297 first location accessed by the inner-loop, and analyze it relative to
4298 the outer-loop. */
4299 if (loop && nested_in_vect_loop_p (loop, stmt_info))
4300 {
4301 /* Build a reference to the first location accessed by the
4302 inner loop: *(BASE + INIT + OFFSET). By construction,
4303 this address must be invariant in the inner loop, so we
4304 can consider it as being used in the outer loop. */
4305 tree base = unshare_expr (DR_BASE_ADDRESS (dr));
4306 tree offset = unshare_expr (DR_OFFSET (dr));
4307 tree init = unshare_expr (DR_INIT (dr));
4308 tree init_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset),
4309 init, offset);
4310 tree init_addr = fold_build_pointer_plus (base, init_offset);
4311 tree init_ref = build_fold_indirect_ref (init_addr);
4312
4313 if (dump_enabled_p ())
4314 dump_printf_loc (MSG_NOTE, vect_location,
4315 "analyze in outer loop: %T\n", init_ref);
4316
4317 opt_result res
4318 = dr_analyze_innermost (&STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info),
4319 init_ref, loop, stmt_info->stmt);
4320 if (!res)
4321 /* dr_analyze_innermost already explained the failure. */
4322 return res;
4323
4324 if (dump_enabled_p ())
4325 dump_printf_loc (MSG_NOTE, vect_location,
4326 "\touter base_address: %T\n"
4327 "\touter offset from base address: %T\n"
4328 "\touter constant offset from base address: %T\n"
4329 "\touter step: %T\n"
4330 "\touter base alignment: %d\n\n"
4331 "\touter base misalignment: %d\n"
4332 "\touter offset alignment: %d\n"
4333 "\touter step alignment: %d\n",
4334 STMT_VINFO_DR_BASE_ADDRESS (stmt_info),
4335 STMT_VINFO_DR_OFFSET (stmt_info),
4336 STMT_VINFO_DR_INIT (stmt_info),
4337 STMT_VINFO_DR_STEP (stmt_info),
4338 STMT_VINFO_DR_BASE_ALIGNMENT (stmt_info),
4339 STMT_VINFO_DR_BASE_MISALIGNMENT (stmt_info),
4340 STMT_VINFO_DR_OFFSET_ALIGNMENT (stmt_info),
4341 STMT_VINFO_DR_STEP_ALIGNMENT (stmt_info));
4342 }
4343
4344 /* Set vectype for STMT. */
4345 scalar_type = TREE_TYPE (DR_REF (dr));
4346 STMT_VINFO_VECTYPE (stmt_info)
4347 = get_vectype_for_scalar_type (vinfo, scalar_type);
4348 if (!STMT_VINFO_VECTYPE (stmt_info))
4349 {
4350 if (dump_enabled_p ())
4351 {
4352 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4353 "not vectorized: no vectype for stmt: %G",
4354 stmt_info->stmt);
4355 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
4356 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
4357 scalar_type);
4358 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
4359 }
4360
4361 if (is_a <bb_vec_info> (vinfo))
4362 {
4363 /* No vector type is fine, the ref can still participate
4364 in dependence analysis, we just can't vectorize it. */
4365 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4366 continue;
4367 }
4368 if (fatal)
4369 *fatal = false;
4370 return opt_result::failure_at (stmt_info->stmt,
4371 "not vectorized:"
4372 " no vectype for stmt: %G"
4373 " scalar_type: %T\n",
4374 stmt_info->stmt, scalar_type);
4375 }
4376 else
4377 {
4378 if (dump_enabled_p ())
4379 dump_printf_loc (MSG_NOTE, vect_location,
4380 "got vectype for stmt: %G%T\n",
4381 stmt_info->stmt, STMT_VINFO_VECTYPE (stmt_info));
4382 }
4383
4384 /* Adjust the minimal vectorization factor according to the
4385 vector type. */
4386 vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
4387 *min_vf = upper_bound (*min_vf, vf);
4388
4389 if (gatherscatter != SG_NONE)
4390 {
4391 gather_scatter_info gs_info;
4392 if (!vect_check_gather_scatter (stmt_info,
4393 as_a <loop_vec_info> (vinfo),
4394 &gs_info)
4395 || !get_vectype_for_scalar_type (vinfo,
4396 TREE_TYPE (gs_info.offset)))
4397 {
4398 if (fatal)
4399 *fatal = false;
4400 return opt_result::failure_at
4401 (stmt_info->stmt,
4402 (gatherscatter == GATHER)
4403 ? "not vectorized: not suitable for gather load %G"
4404 : "not vectorized: not suitable for scatter store %G",
4405 stmt_info->stmt);
4406 }
4407 STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
4408 }
4409 }
4410
4411 /* We used to stop processing and prune the list here. Verify we no
4412 longer need to. */
4413 gcc_assert (i == datarefs.length ());
4414
4415 return opt_result::success ();
4416 }
4417
4418
4419 /* Function vect_get_new_vect_var.
4420
4421 Returns a name for a new variable. The current naming scheme prepends
4422 a prefix ("vect", "stmp", "mask" or "vectp", depending on the value of
4423 VAR_KIND) and an underscore to NAME if provided, and uses the bare
4424 prefix when no NAME is given. */
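/* A sketch of the resulting names (arguments invented for illustration):
   vect_get_new_vect_var (ptr_type, vect_pointer_var, "in") creates a
   temporary whose base name is "vectp_in", while passing a NULL NAME
   yields one named just "vectp".  */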
4425
4426 tree
4427 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
4428 {
4429 const char *prefix;
4430 tree new_vect_var;
4431
4432 switch (var_kind)
4433 {
4434 case vect_simple_var:
4435 prefix = "vect";
4436 break;
4437 case vect_scalar_var:
4438 prefix = "stmp";
4439 break;
4440 case vect_mask_var:
4441 prefix = "mask";
4442 break;
4443 case vect_pointer_var:
4444 prefix = "vectp";
4445 break;
4446 default:
4447 gcc_unreachable ();
4448 }
4449
4450 if (name)
4451 {
4452 char* tmp = concat (prefix, "_", name, NULL);
4453 new_vect_var = create_tmp_reg (type, tmp);
4454 free (tmp);
4455 }
4456 else
4457 new_vect_var = create_tmp_reg (type, prefix);
4458
4459 return new_vect_var;
4460 }
4461
4462 /* Like vect_get_new_vect_var but return an SSA name. */
4463
4464 tree
4465 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
4466 {
4467 const char *prefix;
4468 tree new_vect_var;
4469
4470 switch (var_kind)
4471 {
4472 case vect_simple_var:
4473 prefix = "vect";
4474 break;
4475 case vect_scalar_var:
4476 prefix = "stmp";
4477 break;
4478 case vect_pointer_var:
4479 prefix = "vectp";
4480 break;
4481 default:
4482 gcc_unreachable ();
4483 }
4484
4485 if (name)
4486 {
4487 char* tmp = concat (prefix, "_", name, NULL);
4488 new_vect_var = make_temp_ssa_name (type, NULL, tmp);
4489 free (tmp);
4490 }
4491 else
4492 new_vect_var = make_temp_ssa_name (type, NULL, prefix);
4493
4494 return new_vect_var;
4495 }
4496
4497 /* Duplicate ptr info and set alignment/misalignment on NAME from DR_INFO. */
4498
4499 static void
4500 vect_duplicate_ssa_name_ptr_info (tree name, dr_vec_info *dr_info)
4501 {
4502 duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr_info->dr));
4503 int misalign = DR_MISALIGNMENT (dr_info);
4504 if (misalign == DR_MISALIGNMENT_UNKNOWN)
4505 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
4506 else
4507 set_ptr_info_alignment (SSA_NAME_PTR_INFO (name),
4508 known_alignment (DR_TARGET_ALIGNMENT (dr_info)),
4509 misalign);
4510 }
4511
4512 /* Function vect_create_addr_base_for_vector_ref.
4513
4514 Create an expression that computes the address of the first memory location
4515 that will be accessed for a data reference.
4516
4517 Input:
4518 STMT_INFO: The statement containing the data reference.
4519 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
4520 OFFSET: Optional. If supplied, it is added to the initial address.
4521 LOOP: Specify relative to which loop-nest should the address be computed.
4522 For example, when the dataref is in an inner-loop nested in an
4523 outer-loop that is now being vectorized, LOOP can be either the
4524 outer-loop, or the inner-loop. The first memory location accessed
4525 by the following dataref ('in' points to short):
4526
4527 for (i=0; i<N; i++)
4528 for (j=0; j<M; j++)
4529 s += in[i+j]
4530
4531 is as follows:
4532 if LOOP=i_loop: &in (relative to i_loop)
4533 if LOOP=j_loop: &in+i*2B (relative to j_loop)
4534 BYTE_OFFSET: Optional, defaulted to NULL. If supplied, it is added to the
4535 initial address. Unlike OFFSET, which is a number of elements to
4536 be added, BYTE_OFFSET is measured in bytes.
4537
4538 Output:
4539 1. Return an SSA_NAME whose value is the address of the memory location of
4540 the first vector of the data reference.
4541 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
4542 these statement(s) which define the returned SSA_NAME.
4543
4544 FORNOW: We are only handling array accesses with step 1. */
4545
4546 tree
4547 vect_create_addr_base_for_vector_ref (stmt_vec_info stmt_info,
4548 gimple_seq *new_stmt_list,
4549 tree offset,
4550 tree byte_offset)
4551 {
4552 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
4553 struct data_reference *dr = dr_info->dr;
4554 const char *base_name;
4555 tree addr_base;
4556 tree dest;
4557 gimple_seq seq = NULL;
4558 tree vect_ptr_type;
4559 tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
4560 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4561 innermost_loop_behavior *drb = vect_dr_behavior (dr_info);
4562
4563 tree data_ref_base = unshare_expr (drb->base_address);
4564 tree base_offset = unshare_expr (drb->offset);
4565 tree init = unshare_expr (drb->init);
4566
4567 if (loop_vinfo)
4568 base_name = get_name (data_ref_base);
4569 else
4570 {
4571 base_offset = ssize_int (0);
4572 init = ssize_int (0);
4573 base_name = get_name (DR_REF (dr));
4574 }
4575
4576 /* Create base_offset */
4577 base_offset = size_binop (PLUS_EXPR,
4578 fold_convert (sizetype, base_offset),
4579 fold_convert (sizetype, init));
4580
4581 if (offset)
4582 {
4583 offset = fold_build2 (MULT_EXPR, sizetype,
4584 fold_convert (sizetype, offset), step);
4585 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4586 base_offset, offset);
4587 }
4588 if (byte_offset)
4589 {
4590 byte_offset = fold_convert (sizetype, byte_offset);
4591 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4592 base_offset, byte_offset);
4593 }
4594
4595 /* base + base_offset */
4596 if (loop_vinfo)
4597 addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
4598 else
4599 {
4600 addr_base = build1 (ADDR_EXPR,
4601 build_pointer_type (TREE_TYPE (DR_REF (dr))),
4602 unshare_expr (DR_REF (dr)));
4603 }
4604
4605 vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
4606 dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
4607 addr_base = force_gimple_operand (addr_base, &seq, true, dest);
4608 gimple_seq_add_seq (new_stmt_list, seq);
4609
4610 if (DR_PTR_INFO (dr)
4611 && TREE_CODE (addr_base) == SSA_NAME
4612 && !SSA_NAME_PTR_INFO (addr_base))
4613 {
4614 vect_duplicate_ssa_name_ptr_info (addr_base, dr_info);
4615 if (offset || byte_offset)
4616 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
4617 }
4618
4619 if (dump_enabled_p ())
4620 dump_printf_loc (MSG_NOTE, vect_location, "created %T\n", addr_base);
4621
4622 return addr_base;
4623 }
4624
4625
4626 /* Function vect_create_data_ref_ptr.
4627
4628 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
4629 location accessed in the loop by STMT_INFO, along with the def-use update
4630 chain to appropriately advance the pointer through the loop iterations.
4631 Also set aliasing information for the pointer. This pointer is used by
4632 the callers to this function to create a memory reference expression for
4633 vector load/store access.
4634
4635 Input:
4636 1. STMT_INFO: a stmt that references memory. Expected to be of the form
4637 GIMPLE_ASSIGN <name, data-ref> or
4638 GIMPLE_ASSIGN <data-ref, name>.
4639 2. AGGR_TYPE: the type of the reference, which should be either a vector
4640 or an array.
4641 3. AT_LOOP: the loop where the vector memref is to be created.
4642 4. OFFSET (optional): an offset to be added to the initial address accessed
4643 by the data-ref in STMT_INFO.
4644 5. BSI: location where the new stmts are to be placed if there is no loop
4645 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
4646 pointing to the initial address.
4647 7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
4648 to the initial address accessed by the data-ref in STMT_INFO. This is
4649 similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
4650 in bytes.
4651 8. IV_STEP (optional, defaults to NULL): the amount that should be added
4652 to the IV during each iteration of the loop. NULL says to move
4653 by one copy of AGGR_TYPE up or down, depending on the step of the
4654 data reference.
4655
4656 Output:
4657 1. Declare a new ptr to vector_type, and have it point to the base of the
4658 data reference (initial address accessed by the data reference).
4659 For example, for vector of type V8HI, the following code is generated:
4660
4661 v8hi *ap;
4662 ap = (v8hi *)initial_address;
4663
4664 if OFFSET is not supplied:
4665 initial_address = &a[init];
4666 if OFFSET is supplied:
4667 initial_address = &a[init + OFFSET];
4668 if BYTE_OFFSET is supplied:
4669 initial_address = &a[init] + BYTE_OFFSET;
4670
4671 Return the initial_address in INITIAL_ADDRESS.
4672
4673 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
4674 update the pointer in each iteration of the loop.
4675
4676 Return the increment stmt that updates the pointer in PTR_INCR.
4677
4678 3. Return the pointer. */
4679
4680 tree
4681 vect_create_data_ref_ptr (stmt_vec_info stmt_info, tree aggr_type,
4682 class loop *at_loop, tree offset,
4683 tree *initial_address, gimple_stmt_iterator *gsi,
4684 gimple **ptr_incr, bool only_init,
4685 tree byte_offset, tree iv_step)
4686 {
4687 const char *base_name;
4688 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4689 class loop *loop = NULL;
4690 bool nested_in_vect_loop = false;
4691 class loop *containing_loop = NULL;
4692 tree aggr_ptr_type;
4693 tree aggr_ptr;
4694 tree new_temp;
4695 gimple_seq new_stmt_list = NULL;
4696 edge pe = NULL;
4697 basic_block new_bb;
4698 tree aggr_ptr_init;
4699 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
4700 struct data_reference *dr = dr_info->dr;
4701 tree aptr;
4702 gimple_stmt_iterator incr_gsi;
4703 bool insert_after;
4704 tree indx_before_incr, indx_after_incr;
4705 gimple *incr;
4706 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
4707
4708 gcc_assert (iv_step != NULL_TREE
4709 || TREE_CODE (aggr_type) == ARRAY_TYPE
4710 || TREE_CODE (aggr_type) == VECTOR_TYPE);
4711
4712 if (loop_vinfo)
4713 {
4714 loop = LOOP_VINFO_LOOP (loop_vinfo);
4715 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info);
4716 containing_loop = (gimple_bb (stmt_info->stmt))->loop_father;
4717 pe = loop_preheader_edge (loop);
4718 }
4719 else
4720 {
4721 gcc_assert (bb_vinfo);
4722 only_init = true;
4723 *ptr_incr = NULL;
4724 }
4725
4726 /* Create an expression for the first address accessed by this load
4727 in LOOP. */
4728 base_name = get_name (DR_BASE_ADDRESS (dr));
4729
4730 if (dump_enabled_p ())
4731 {
4732 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4733 dump_printf_loc (MSG_NOTE, vect_location,
4734 "create %s-pointer variable to type: %T",
4735 get_tree_code_name (TREE_CODE (aggr_type)),
4736 aggr_type);
4737 if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4738 dump_printf (MSG_NOTE, " vectorizing an array ref: ");
4739 else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4740 dump_printf (MSG_NOTE, " vectorizing a vector ref: ");
4741 else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4742 dump_printf (MSG_NOTE, " vectorizing a record based array ref: ");
4743 else
4744 dump_printf (MSG_NOTE, " vectorizing a pointer ref: ");
4745 dump_printf (MSG_NOTE, "%T\n", DR_BASE_OBJECT (dr));
4746 }
4747
4748 /* (1) Create the new aggregate-pointer variable.
4749 Vector and array types inherit the alias set of their component
4750 type by default so we need to use a ref-all pointer if the data
4751 reference does not conflict with the created aggregated data
4752 reference because it is not addressable. */
4753 bool need_ref_all = false;
4754 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4755 get_alias_set (DR_REF (dr))))
4756 need_ref_all = true;
4757 /* Likewise for any of the data references in the stmt group. */
4758 else if (DR_GROUP_SIZE (stmt_info) > 1)
4759 {
4760 stmt_vec_info sinfo = DR_GROUP_FIRST_ELEMENT (stmt_info);
4761 do
4762 {
4763 struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
4764 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4765 get_alias_set (DR_REF (sdr))))
4766 {
4767 need_ref_all = true;
4768 break;
4769 }
4770 sinfo = DR_GROUP_NEXT_ELEMENT (sinfo);
4771 }
4772 while (sinfo);
4773 }
4774 aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
4775 need_ref_all);
4776 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
4777
4778
4779 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4780 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4781 def-use update cycles for the pointer: one relative to the outer-loop
4782 (LOOP), which is what steps (2) and (3) below do. The other is relative
4783 to the inner-loop (which is the inner-most loop containing the dataref),
4784 and this is done by step (4) below.
4785
4786 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4787 inner-most loop, and so steps (2),(3) work the same, and step (4) is
4788 redundant. Steps (2),(3) create the following:
4789
4790 vp0 = &base_addr;
4791 LOOP: vp1 = phi(vp0,vp2)
4792 ...
4793 ...
4794 vp2 = vp1 + step
4795 goto LOOP
4796
4797 If there is an inner-loop nested in loop, then step (4) will also be
4798 applied, and an additional update in the inner-loop will be created:
4799
4800 vp0 = &base_addr;
4801 LOOP: vp1 = phi(vp0,vp2)
4802 ...
4803 inner: vp3 = phi(vp1,vp4)
4804 vp4 = vp3 + inner_step
4805 if () goto inner
4806 ...
4807 vp2 = vp1 + step
4808 if () goto LOOP */
4809
4810 /* (2) Calculate the initial address of the aggregate-pointer, and set
4811 the aggregate-pointer to point to it before the loop. */
4812
4813 /* Create: &(base[init_val+offset]) + byte_offset in the loop preheader. */
4814
4815 new_temp = vect_create_addr_base_for_vector_ref (stmt_info, &new_stmt_list,
4816 offset, byte_offset);
4817 if (new_stmt_list)
4818 {
4819 if (pe)
4820 {
4821 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
4822 gcc_assert (!new_bb);
4823 }
4824 else
4825 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
4826 }
4827
4828 *initial_address = new_temp;
4829 aggr_ptr_init = new_temp;
4830
4831 /* (3) Handle the updating of the aggregate-pointer inside the loop.
4832 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4833 inner-loop nested in LOOP (during outer-loop vectorization). */
4834
4835 /* No update in loop is required. */
4836 if (only_init && (!loop_vinfo || at_loop == loop))
4837 aptr = aggr_ptr_init;
4838 else
4839 {
4840 /* Accesses to invariant addresses should be handled specially
4841 by the caller. */
4842 tree step = vect_dr_behavior (dr_info)->step;
4843 gcc_assert (!integer_zerop (step));
4844
4845 if (iv_step == NULL_TREE)
4846 {
4847 /* The step of the aggregate pointer is the type size,
4848 negated for downward accesses. */
4849 iv_step = TYPE_SIZE_UNIT (aggr_type);
4850 if (tree_int_cst_sgn (step) == -1)
4851 iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
4852 }
4853
4854 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4855
4856 create_iv (aggr_ptr_init,
4857 fold_convert (aggr_ptr_type, iv_step),
4858 aggr_ptr, loop, &incr_gsi, insert_after,
4859 &indx_before_incr, &indx_after_incr);
4860 incr = gsi_stmt (incr_gsi);
4861 loop_vinfo->add_stmt (incr);
4862
4863 /* Copy the points-to information if it exists. */
4864 if (DR_PTR_INFO (dr))
4865 {
4866 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr_info);
4867 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr_info);
4868 }
4869 if (ptr_incr)
4870 *ptr_incr = incr;
4871
4872 aptr = indx_before_incr;
4873 }
4874
4875 if (!nested_in_vect_loop || only_init)
4876 return aptr;
4877
4878
4879 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4880 nested in LOOP, if one exists. */
4881
4882 gcc_assert (nested_in_vect_loop);
4883 if (!only_init)
4884 {
4885 standard_iv_increment_position (containing_loop, &incr_gsi,
4886 &insert_after);
4887 create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
4888 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
4889 &indx_after_incr);
4890 incr = gsi_stmt (incr_gsi);
4891 loop_vinfo->add_stmt (incr);
4892
4893 /* Copy the points-to information if it exists. */
4894 if (DR_PTR_INFO (dr))
4895 {
4896 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr_info);
4897 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr_info);
4898 }
4899 if (ptr_incr)
4900 *ptr_incr = incr;
4901
4902 return indx_before_incr;
4903 }
4904 else
4905 gcc_unreachable ();
4906 }
4907
4908
4909 /* Function bump_vector_ptr
4910
4911 Increment a pointer (to a vector type) by vector-size. If requested,
4912 i.e. if PTR_INCR is given, then also connect the new increment stmt
4913 to the existing def-use update-chain of the pointer, by modifying
4914 the PTR_INCR as illustrated below:
4915
4916 The pointer def-use update-chain before this function:
4917 DATAREF_PTR = phi (p_0, p_2)
4918 ....
4919 PTR_INCR: p_2 = DATAREF_PTR + step
4920
4921 The pointer def-use update-chain after this function:
4922 DATAREF_PTR = phi (p_0, p_2)
4923 ....
4924 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4925 ....
4926 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
4927
4928 Input:
4929 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4930 in the loop.
4931 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4932 the loop. The increment amount across iterations is expected
4933 to be vector_size.
4934 BSI - location where the new update stmt is to be placed.
4935 STMT_INFO - the original scalar memory-access stmt that is being vectorized.
4936 BUMP - optional. The offset by which to bump the pointer. If not given,
4937 the offset is assumed to be vector_size.
4938
4939 Output: Return NEW_DATAREF_PTR as illustrated above.
4940
4941 */
4942
4943 tree
4944 bump_vector_ptr (tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
4945 stmt_vec_info stmt_info, tree bump)
4946 {
4947 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4948 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4949 tree update = TYPE_SIZE_UNIT (vectype);
4950 gassign *incr_stmt;
4951 ssa_op_iter iter;
4952 use_operand_p use_p;
4953 tree new_dataref_ptr;
4954
4955 if (bump)
4956 update = bump;
4957
4958 if (TREE_CODE (dataref_ptr) == SSA_NAME)
4959 new_dataref_ptr = copy_ssa_name (dataref_ptr);
4960 else
4961 new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
4962 incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
4963 dataref_ptr, update);
4964 vect_finish_stmt_generation (stmt_info, incr_stmt, gsi);
4965
4966 /* Copy the points-to information if it exists. */
4967 if (DR_PTR_INFO (dr))
4968 {
4969 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
4970 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
4971 }
4972
4973 if (!ptr_incr)
4974 return new_dataref_ptr;
4975
4976 /* Update the vector-pointer's cross-iteration increment. */
4977 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
4978 {
4979 tree use = USE_FROM_PTR (use_p);
4980
4981 if (use == dataref_ptr)
4982 SET_USE (use_p, new_dataref_ptr);
4983 else
4984 gcc_assert (operand_equal_p (use, update, 0));
4985 }
4986
4987 return new_dataref_ptr;
4988 }
4989
4990
4991 /* Copy memory reference info such as base/clique from the SRC reference
4992 to the DEST MEM_REF. */
4993
4994 void
4995 vect_copy_ref_info (tree dest, tree src)
4996 {
4997 if (TREE_CODE (dest) != MEM_REF)
4998 return;
4999
5000 tree src_base = src;
5001 while (handled_component_p (src_base))
5002 src_base = TREE_OPERAND (src_base, 0);
5003 if (TREE_CODE (src_base) != MEM_REF
5004 && TREE_CODE (src_base) != TARGET_MEM_REF)
5005 return;
5006
5007 MR_DEPENDENCE_CLIQUE (dest) = MR_DEPENDENCE_CLIQUE (src_base);
5008 MR_DEPENDENCE_BASE (dest) = MR_DEPENDENCE_BASE (src_base);
5009 }
5010
5011
5012 /* Function vect_create_destination_var.
5013
5014 Create a new temporary of type VECTYPE. */
5015
5016 tree
5017 vect_create_destination_var (tree scalar_dest, tree vectype)
5018 {
5019 tree vec_dest;
5020 const char *name;
5021 char *new_name;
5022 tree type;
5023 enum vect_var_kind kind;
5024
5025 kind = vectype
5026 ? VECTOR_BOOLEAN_TYPE_P (vectype)
5027 ? vect_mask_var
5028 : vect_simple_var
5029 : vect_scalar_var;
5030 type = vectype ? vectype : TREE_TYPE (scalar_dest);
5031
5032 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
5033
5034 name = get_name (scalar_dest);
5035 if (name)
5036 new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
5037 else
5038 new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
5039 vec_dest = vect_get_new_vect_var (type, kind, new_name);
5040 free (new_name);
5041
5042 return vec_dest;
5043 }
5044
5045 /* Function vect_grouped_store_supported.
5046
5047 Returns TRUE if interleave high and interleave low permutations
5048 are supported, and FALSE otherwise. */
5049
5050 bool
5051 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
5052 {
5053 machine_mode mode = TYPE_MODE (vectype);
5054
5055 /* vect_permute_store_chain requires the group size to be equal to 3 or
5056 be a power of two. */
5057 if (count != 3 && exact_log2 (count) == -1)
5058 {
5059 if (dump_enabled_p ())
5060 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5061 "the size of the group of accesses"
5062 " is not a power of 2 or not equal to 3\n");
5063 return false;
5064 }
5065
5066 /* Check that the permutation is supported. */
5067 if (VECTOR_MODE_P (mode))
5068 {
5069 unsigned int i;
5070 if (count == 3)
5071 {
5072 unsigned int j0 = 0, j1 = 0, j2 = 0;
5073 unsigned int i, j;
5074
5075 unsigned int nelt;
5076 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5077 {
5078 if (dump_enabled_p ())
5079 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5080 "cannot handle groups of 3 stores for"
5081 " variable-length vectors\n");
5082 return false;
5083 }
5084
5085 vec_perm_builder sel (nelt, nelt, 1);
5086 sel.quick_grow (nelt);
5087 vec_perm_indices indices;
5088 for (j = 0; j < 3; j++)
5089 {
5090 int nelt0 = ((3 - j) * nelt) % 3;
5091 int nelt1 = ((3 - j) * nelt + 1) % 3;
5092 int nelt2 = ((3 - j) * nelt + 2) % 3;
5093 for (i = 0; i < nelt; i++)
5094 {
5095 if (3 * i + nelt0 < nelt)
5096 sel[3 * i + nelt0] = j0++;
5097 if (3 * i + nelt1 < nelt)
5098 sel[3 * i + nelt1] = nelt + j1++;
5099 if (3 * i + nelt2 < nelt)
5100 sel[3 * i + nelt2] = 0;
5101 }
5102 indices.new_vector (sel, 2, nelt);
5103 if (!can_vec_perm_const_p (mode, indices))
5104 {
5105 if (dump_enabled_p ())
5106 dump_printf (MSG_MISSED_OPTIMIZATION,
5107 "permutation op not supported by target.\n");
5108 return false;
5109 }
5110
5111 for (i = 0; i < nelt; i++)
5112 {
5113 if (3 * i + nelt0 < nelt)
5114 sel[3 * i + nelt0] = 3 * i + nelt0;
5115 if (3 * i + nelt1 < nelt)
5116 sel[3 * i + nelt1] = 3 * i + nelt1;
5117 if (3 * i + nelt2 < nelt)
5118 sel[3 * i + nelt2] = nelt + j2++;
5119 }
5120 indices.new_vector (sel, 2, nelt);
5121 if (!can_vec_perm_const_p (mode, indices))
5122 {
5123 if (dump_enabled_p ())
5124 dump_printf (MSG_MISSED_OPTIMIZATION,
5125 "permutation op not supported by target.\n");
5126 return false;
5127 }
5128 }
5129 return true;
5130 }
5131 else
5132 {
5133 /* If length is not equal to 3 then only a power of 2 is supported. */
5134 gcc_assert (pow2p_hwi (count));
5135 poly_uint64 nelt = GET_MODE_NUNITS (mode);
5136
5137 /* The encoding has 2 interleaved stepped patterns. */
5138 vec_perm_builder sel (nelt, 2, 3);
5139 sel.quick_grow (6);
5140 for (i = 0; i < 3; i++)
5141 {
5142 sel[i * 2] = i;
5143 sel[i * 2 + 1] = i + nelt;
5144 }
5145 vec_perm_indices indices (sel, 2, nelt);
5146 if (can_vec_perm_const_p (mode, indices))
5147 {
5148 for (i = 0; i < 6; i++)
5149 sel[i] += exact_div (nelt, 2);
5150 indices.new_vector (sel, 2, nelt);
5151 if (can_vec_perm_const_p (mode, indices))
5152 return true;
5153 }
5154 }
5155 }
5156
5157 if (dump_enabled_p ())
5158 dump_printf (MSG_MISSED_OPTIMIZATION,
5159 "permutation op not supported by target.\n");
5160 return false;
5161 }
5162
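/* A minimal stand-alone sketch (not compiled as part of GCC) of the
   two-operand permute whose selectors are built and queried above via
   can_vec_perm_const_p: element I of the result comes from the first
   input when SEL[I] < NELT and from the second input otherwise.  The
   array size and the driver values are illustrative assumptions only.  */
#if 0
#include <stdio.h>

#define NELT 8

static void
vec_perm_emulate (const int *a, const int *b, const unsigned *sel, int *out)
{
  for (unsigned i = 0; i < NELT; i++)
    out[i] = sel[i] < NELT ? a[sel[i]] : b[sel[i] - NELT];
}

int
main (void)
{
  int a[NELT] = { 0, 1, 2, 3, 4, 5, 6, 7 };
  int b[NELT] = { 8, 9, 10, 11, 12, 13, 14, 15 };
  /* The interleave-high selector {0, NELT, 1, NELT + 1, ...}.  */
  unsigned sel[NELT] = { 0, 8, 1, 9, 2, 10, 3, 11 };
  int out[NELT];

  vec_perm_emulate (a, b, sel, out);
  for (unsigned i = 0; i < NELT; i++)
    printf ("%d ", out[i]);		/* Prints: 0 8 1 9 2 10 3 11  */
  printf ("\n");
  return 0;
}
#endif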
5163
5164 /* Return TRUE if vec_{mask_}store_lanes is available for COUNT vectors of
5165 type VECTYPE. MASKED_P says whether the masked form is needed. */
5166
5167 bool
5168 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5169 bool masked_p)
5170 {
5171 if (masked_p)
5172 return vect_lanes_optab_supported_p ("vec_mask_store_lanes",
5173 vec_mask_store_lanes_optab,
5174 vectype, count);
5175 else
5176 return vect_lanes_optab_supported_p ("vec_store_lanes",
5177 vec_store_lanes_optab,
5178 vectype, count);
5179 }
5180
5181
5182 /* Function vect_permute_store_chain.
5183
5184 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
5185 a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
5186 the data correctly for the stores. Return the final references for stores
5187 in RESULT_CHAIN.
5188
5189 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5190 The input is 4 vectors each containing 8 elements. We assign a number to
5191 each element, the input sequence is:
5192
5193 1st vec: 0 1 2 3 4 5 6 7
5194 2nd vec: 8 9 10 11 12 13 14 15
5195 3rd vec: 16 17 18 19 20 21 22 23
5196 4th vec: 24 25 26 27 28 29 30 31
5197
5198 The output sequence should be:
5199
5200 1st vec: 0 8 16 24 1 9 17 25
5201 2nd vec: 2 10 18 26 3 11 19 27
5202 3rd vec: 4 12 20 28 5 13 21 29
5203 4th vec: 6 14 22 30 7 15 23 31
5204
5205 i.e., we interleave the contents of the four vectors in their order.
5206
5207 We use interleave_high/low instructions to create such output. The input of
5208 each interleave_high/low operation is two vectors:
5209 1st vec 2nd vec
5210 0 1 2 3 4 5 6 7
5211 the even elements of the result vector are obtained left-to-right from the
5212 high/low elements of the first vector. The odd elements of the result are
5213 obtained left-to-right from the high/low elements of the second vector.
5214 The output of interleave_high will be: 0 4 1 5
5215 and of interleave_low: 2 6 3 7
5216
5217
5218 The permutation is done in log LENGTH stages. In each stage interleave_high
5219 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
5220 where the first argument is taken from the first half of DR_CHAIN and the
5221 second argument from its second half.
5222 In our example,
5223
5224 I1: interleave_high (1st vec, 3rd vec)
5225 I2: interleave_low (1st vec, 3rd vec)
5226 I3: interleave_high (2nd vec, 4th vec)
5227 I4: interleave_low (2nd vec, 4th vec)
5228
5229 The output for the first stage is:
5230
5231 I1: 0 16 1 17 2 18 3 19
5232 I2: 4 20 5 21 6 22 7 23
5233 I3: 8 24 9 25 10 26 11 27
5234 I4: 12 28 13 29 14 30 15 31
5235
5236 The output of the second stage, i.e. the final result is:
5237
5238 I1: 0 8 16 24 1 9 17 25
5239 I2: 2 10 18 26 3 11 19 27
5240 I3: 4 12 20 28 5 13 21 29
5241 I4: 6 14 22 30 7 15 23 31. */
5242
5243 void
5244 vect_permute_store_chain (vec<tree> dr_chain,
5245 unsigned int length,
5246 stmt_vec_info stmt_info,
5247 gimple_stmt_iterator *gsi,
5248 vec<tree> *result_chain)
5249 {
5250 tree vect1, vect2, high, low;
5251 gimple *perm_stmt;
5252 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5253 tree perm_mask_low, perm_mask_high;
5254 tree data_ref;
5255 tree perm3_mask_low, perm3_mask_high;
5256 unsigned int i, j, n, log_length = exact_log2 (length);
5257
5258 result_chain->quick_grow (length);
5259 memcpy (result_chain->address (), dr_chain.address (),
5260 length * sizeof (tree));
5261
5262 if (length == 3)
5263 {
5264 /* vect_grouped_store_supported ensures that this is constant. */
5265 unsigned int nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5266 unsigned int j0 = 0, j1 = 0, j2 = 0;
5267
5268 vec_perm_builder sel (nelt, nelt, 1);
5269 sel.quick_grow (nelt);
5270 vec_perm_indices indices;
5271 for (j = 0; j < 3; j++)
5272 {
5273 int nelt0 = ((3 - j) * nelt) % 3;
5274 int nelt1 = ((3 - j) * nelt + 1) % 3;
5275 int nelt2 = ((3 - j) * nelt + 2) % 3;
5276
5277 for (i = 0; i < nelt; i++)
5278 {
5279 if (3 * i + nelt0 < nelt)
5280 sel[3 * i + nelt0] = j0++;
5281 if (3 * i + nelt1 < nelt)
5282 sel[3 * i + nelt1] = nelt + j1++;
5283 if (3 * i + nelt2 < nelt)
5284 sel[3 * i + nelt2] = 0;
5285 }
5286 indices.new_vector (sel, 2, nelt);
5287 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5288
5289 for (i = 0; i < nelt; i++)
5290 {
5291 if (3 * i + nelt0 < nelt)
5292 sel[3 * i + nelt0] = 3 * i + nelt0;
5293 if (3 * i + nelt1 < nelt)
5294 sel[3 * i + nelt1] = 3 * i + nelt1;
5295 if (3 * i + nelt2 < nelt)
5296 sel[3 * i + nelt2] = nelt + j2++;
5297 }
5298 indices.new_vector (sel, 2, nelt);
5299 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5300
5301 vect1 = dr_chain[0];
5302 vect2 = dr_chain[1];
5303
5304 /* Create interleaving stmt:
5305 low = VEC_PERM_EXPR <vect1, vect2,
5306 {j, nelt, *, j + 1, nelt + j + 1, *,
5307 j + 2, nelt + j + 2, *, ...}> */
5308 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5309 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5310 vect2, perm3_mask_low);
5311 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5312
5313 vect1 = data_ref;
5314 vect2 = dr_chain[2];
5315 /* Create interleaving stmt:
5316 low = VEC_PERM_EXPR <vect1, vect2,
5317 {0, 1, nelt + j, 3, 4, nelt + j + 1,
5318 6, 7, nelt + j + 2, ...}> */
5319 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5320 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5321 vect2, perm3_mask_high);
5322 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5323 (*result_chain)[j] = data_ref;
5324 }
5325 }
5326 else
5327 {
5328 /* If length is not equal to 3 then only a power of 2 is supported. */
5329 gcc_assert (pow2p_hwi (length));
5330
5331 /* The encoding has 2 interleaved stepped patterns. */
5332 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5333 vec_perm_builder sel (nelt, 2, 3);
5334 sel.quick_grow (6);
5335 for (i = 0; i < 3; i++)
5336 {
5337 sel[i * 2] = i;
5338 sel[i * 2 + 1] = i + nelt;
5339 }
5340 vec_perm_indices indices (sel, 2, nelt);
5341 perm_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5342
5343 for (i = 0; i < 6; i++)
5344 sel[i] += exact_div (nelt, 2);
5345 indices.new_vector (sel, 2, nelt);
5346 perm_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5347
5348 for (i = 0, n = log_length; i < n; i++)
5349 {
5350 for (j = 0; j < length/2; j++)
5351 {
5352 vect1 = dr_chain[j];
5353 vect2 = dr_chain[j+length/2];
5354
5355 /* Create interleaving stmt:
5356 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
5357 ...}> */
5358 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
5359 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
5360 vect2, perm_mask_high);
5361 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5362 (*result_chain)[2*j] = high;
5363
5364 /* Create interleaving stmt:
5365 low = VEC_PERM_EXPR <vect1, vect2,
5366 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
5367 ...}> */
5368 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
5369 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
5370 vect2, perm_mask_low);
5371 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5372 (*result_chain)[2*j+1] = low;
5373 }
5374 memcpy (dr_chain.address (), result_chain->address (),
5375 length * sizeof (tree));
5376 }
5377 }
5378 }
5379
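/* A minimal stand-alone sketch (not compiled as part of GCC) of the
   log2 (LENGTH) interleave_high/low scheme described in the comment
   above vect_permute_store_chain, applied to plain int arrays.  NELT,
   LEN and the helper below are illustrative assumptions, not
   vectorizer interfaces.  */
#if 0
#include <stdio.h>
#include <string.h>

#define NELT 8
#define LEN  4

/* Interleave elements FROM .. FROM + NELT/2 - 1 of A and B.  */
static void
interleave (const int *a, const int *b, int from, int *out)
{
  for (int i = 0; i < NELT / 2; i++)
    {
      out[2 * i] = a[from + i];
      out[2 * i + 1] = b[from + i];
    }
}

int
main (void)
{
  int chain[LEN][NELT], result[LEN][NELT];
  for (int v = 0; v < LEN; v++)
    for (int i = 0; i < NELT; i++)
      chain[v][i] = v * NELT + i;

  /* log2 (LEN) = 2 stages; vector J is paired with vector J + LEN/2.  */
  for (int stage = 0; stage < 2; stage++)
    {
      for (int j = 0; j < LEN / 2; j++)
        {
          interleave (chain[j], chain[j + LEN / 2], 0, result[2 * j]);
          interleave (chain[j], chain[j + LEN / 2], NELT / 2,
                      result[2 * j + 1]);
        }
      memcpy (chain, result, sizeof (chain));
    }

  for (int v = 0; v < LEN; v++)
    {
      for (int i = 0; i < NELT; i++)
        printf ("%2d ", chain[v][i]);
      printf ("\n");	/* Rows: 0 8 16 24 1 9 17 25, then 2 10 18 26 ...  */
    }
  return 0;
}
#endif
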
5380 /* Function vect_setup_realignment
5381
5382 This function is called when vectorizing an unaligned load using
5383 the dr_explicit_realign[_optimized] scheme.
5384 This function generates the following code at the loop prolog:
5385
5386 p = initial_addr;
5387 x msq_init = *(floor(p)); # prolog load
5388 realignment_token = call target_builtin;
5389 loop:
5390 x msq = phi (msq_init, ---)
5391
5392 The stmts marked with x are generated only for the case of
5393 dr_explicit_realign_optimized.
5394
5395 The code above sets up a new (vector) pointer, pointing to the first
5396 location accessed by STMT_INFO, and a "floor-aligned" load using that
5397 pointer. It also generates code to compute the "realignment-token"
5398 (if the relevant target hook was defined), and creates a phi-node at the
5399 loop-header bb whose arguments are the result of the prolog-load (created
5400 by this function) and the result of a load that takes place in the loop
5401 (to be created by the caller to this function).
5402
5403 For the case of dr_explicit_realign_optimized:
5404 The caller to this function uses the phi-result (msq) to create the
5405 realignment code inside the loop, and sets up the missing phi argument,
5406 as follows:
5407 loop:
5408 msq = phi (msq_init, lsq)
5409 lsq = *(floor(p')); # load in loop
5410 result = realign_load (msq, lsq, realignment_token);
5411
5412 For the case of dr_explicit_realign:
5413 loop:
5414 msq = *(floor(p)); # load in loop
5415 p' = p + (VS-1);
5416 lsq = *(floor(p')); # load in loop
5417 result = realign_load (msq, lsq, realignment_token);
5418
5419 Input:
5420 STMT_INFO - (scalar) load stmt to be vectorized. This load accesses
5421 a memory location that may be unaligned.
5422 BSI - place where new code is to be inserted.
5423 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
5424 is used.
5425
5426 Output:
5427 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
5428 target hook, if defined.
5429 Return value - the result of the loop-header phi node. */
5430
5431 tree
5432 vect_setup_realignment (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
5433 tree *realignment_token,
5434 enum dr_alignment_support alignment_support_scheme,
5435 tree init_addr,
5436 class loop **at_loop)
5437 {
5438 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5439 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5440 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
5441 struct data_reference *dr = dr_info->dr;
5442 class loop *loop = NULL;
5443 edge pe = NULL;
5444 tree scalar_dest = gimple_assign_lhs (stmt_info->stmt);
5445 tree vec_dest;
5446 gimple *inc;
5447 tree ptr;
5448 tree data_ref;
5449 basic_block new_bb;
5450 tree msq_init = NULL_TREE;
5451 tree new_temp;
5452 gphi *phi_stmt;
5453 tree msq = NULL_TREE;
5454 gimple_seq stmts = NULL;
5455 bool compute_in_loop = false;
5456 bool nested_in_vect_loop = false;
5457 class loop *containing_loop = (gimple_bb (stmt_info->stmt))->loop_father;
5458 class loop *loop_for_initial_load = NULL;
5459
5460 if (loop_vinfo)
5461 {
5462 loop = LOOP_VINFO_LOOP (loop_vinfo);
5463 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info);
5464 }
5465
5466 gcc_assert (alignment_support_scheme == dr_explicit_realign
5467 || alignment_support_scheme == dr_explicit_realign_optimized);
5468
5469 /* We need to generate three things:
5470 1. the misalignment computation
5471 2. the extra vector load (for the optimized realignment scheme).
5472 3. the phi node for the two vectors from which the realignment is
5473 done (for the optimized realignment scheme). */
5474
5475 /* 1. Determine where to generate the misalignment computation.
5476
5477 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
5478 calculation will be generated by this function, outside the loop (in the
5479 preheader). Otherwise, INIT_ADDR had already been computed for us by the
5480 caller, inside the loop.
5481
5482 Background: If the misalignment remains fixed throughout the iterations of
5483 the loop, then both realignment schemes are applicable, and also the
5484 misalignment computation can be done outside LOOP. This is because we are
5485 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
5486 are a multiple of VS (the Vector Size), and therefore the misalignment in
5487 different vectorized LOOP iterations is always the same.
5488 The problem arises only if the memory access is in an inner-loop nested
5489 inside LOOP, which is now being vectorized using outer-loop vectorization.
5490 This is the only case when the misalignment of the memory access may not
5491 remain fixed throughout the iterations of the inner-loop (as explained in
5492 detail in vect_supportable_dr_alignment). In this case, not only is the
5493 optimized realignment scheme not applicable, but also the misalignment
5494 computation (and generation of the realignment token that is passed to
5495 REALIGN_LOAD) has to be done inside the loop.
5496
5497 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
5498 or not, which in turn determines if the misalignment is computed inside
5499 the inner-loop, or outside LOOP. */
5500
5501 if (init_addr != NULL_TREE || !loop_vinfo)
5502 {
5503 compute_in_loop = true;
5504 gcc_assert (alignment_support_scheme == dr_explicit_realign);
5505 }
5506
5507
5508 /* 2. Determine where to generate the extra vector load.
5509
5510 For the optimized realignment scheme, instead of generating two vector
5511 loads in each iteration, we generate a single extra vector load in the
5512 preheader of the loop, and in each iteration reuse the result of the
5513 vector load from the previous iteration. In case the memory access is in
5514 an inner-loop nested inside LOOP, which is now being vectorized using
5515 outer-loop vectorization, we need to determine whether this initial vector
5516 load should be generated at the preheader of the inner-loop, or can be
5517 generated at the preheader of LOOP. If the memory access has no evolution
5518 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
5519 to be generated inside LOOP (in the preheader of the inner-loop). */
5520
5521 if (nested_in_vect_loop)
5522 {
5523 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
5524 bool invariant_in_outerloop =
5525 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
5526 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
5527 }
5528 else
5529 loop_for_initial_load = loop;
5530 if (at_loop)
5531 *at_loop = loop_for_initial_load;
5532
5533 if (loop_for_initial_load)
5534 pe = loop_preheader_edge (loop_for_initial_load);
5535
5536 /* 3. For the case of the optimized realignment, create the first vector
5537 load at the loop preheader. */
5538
5539 if (alignment_support_scheme == dr_explicit_realign_optimized)
5540 {
5541 /* Create msq_init = *(floor(p1)) in the loop preheader */
5542 gassign *new_stmt;
5543
5544 gcc_assert (!compute_in_loop);
5545 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5546 ptr = vect_create_data_ref_ptr (stmt_info, vectype,
5547 loop_for_initial_load, NULL_TREE,
5548 &init_addr, NULL, &inc, true);
5549 if (TREE_CODE (ptr) == SSA_NAME)
5550 new_temp = copy_ssa_name (ptr);
5551 else
5552 new_temp = make_ssa_name (TREE_TYPE (ptr));
5553 poly_uint64 align = DR_TARGET_ALIGNMENT (dr_info);
5554 tree type = TREE_TYPE (ptr);
5555 new_stmt = gimple_build_assign
5556 (new_temp, BIT_AND_EXPR, ptr,
5557 fold_build2 (MINUS_EXPR, type,
5558 build_int_cst (type, 0),
5559 build_int_cst (type, align)));
5560 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5561 gcc_assert (!new_bb);
5562 data_ref
5563 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
5564 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
5565 vect_copy_ref_info (data_ref, DR_REF (dr));
5566 new_stmt = gimple_build_assign (vec_dest, data_ref);
5567 new_temp = make_ssa_name (vec_dest, new_stmt);
5568 gimple_assign_set_lhs (new_stmt, new_temp);
5569 if (pe)
5570 {
5571 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5572 gcc_assert (!new_bb);
5573 }
5574 else
5575 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5576
5577 msq_init = gimple_assign_lhs (new_stmt);
5578 }
5579
5580 /* 4. Create realignment token using a target builtin, if available.
5581 It is done either inside the containing loop, or before LOOP (as
5582 determined above). */
5583
5584 if (targetm.vectorize.builtin_mask_for_load)
5585 {
5586 gcall *new_stmt;
5587 tree builtin_decl;
5588
5589 /* Compute INIT_ADDR - the initial address accessed by this memref. */
5590 if (!init_addr)
5591 {
5592 /* Generate the INIT_ADDR computation outside LOOP. */
5593 init_addr = vect_create_addr_base_for_vector_ref (stmt_info, &stmts,
5594 NULL_TREE);
5595 if (loop)
5596 {
5597 pe = loop_preheader_edge (loop);
5598 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
5599 gcc_assert (!new_bb);
5600 }
5601 else
5602 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
5603 }
5604
5605 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
5606 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
5607 vec_dest =
5608 vect_create_destination_var (scalar_dest,
5609 gimple_call_return_type (new_stmt));
5610 new_temp = make_ssa_name (vec_dest, new_stmt);
5611 gimple_call_set_lhs (new_stmt, new_temp);
5612
5613 if (compute_in_loop)
5614 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5615 else
5616 {
5617 /* Generate the misalignment computation outside LOOP. */
5618 pe = loop_preheader_edge (loop);
5619 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5620 gcc_assert (!new_bb);
5621 }
5622
5623 *realignment_token = gimple_call_lhs (new_stmt);
5624
5625 /* The result of the CALL_EXPR to this builtin is determined from
5626 the value of the parameter and no global variables are touched
5627 which makes the builtin a "const" function. Requiring the
5628 builtin to have the "const" attribute makes it unnecessary
5629 to call mark_call_clobbered. */
5630 gcc_assert (TREE_READONLY (builtin_decl));
5631 }
5632
5633 if (alignment_support_scheme == dr_explicit_realign)
5634 return msq;
5635
5636 gcc_assert (!compute_in_loop);
5637 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
5638
5639
5640 /* 5. Create msq = phi <msq_init, lsq> in loop */
5641
5642 pe = loop_preheader_edge (containing_loop);
5643 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5644 msq = make_ssa_name (vec_dest);
5645 phi_stmt = create_phi_node (msq, containing_loop->header);
5646 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
5647
5648 return msq;
5649 }
5650
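/* A minimal stand-alone sketch (not compiled as part of GCC) of the
   effect the REALIGN_LOAD sequence set up above is meant to achieve:
   combining the two aligned loads around an unaligned address so that
   the result equals the vector starting at that address.  The
   byte-indexed model, VS and the driver are illustrative assumptions;
   real targets hide the misalignment inside the realignment token.  */
#if 0
#include <stdio.h>

#define VS 16		/* Vector size in bytes.  */

static void
realign_load_model (const unsigned char *msq, const unsigned char *lsq,
                    unsigned misalign, unsigned char *out)
{
  for (unsigned i = 0; i < VS; i++)
    out[i] = (misalign + i < VS
              ? msq[misalign + i]
              : lsq[misalign + i - VS]);
}

int
main (void)
{
  unsigned char buf[3 * VS];
  for (unsigned i = 0; i < sizeof buf; i++)
    buf[i] = i;

  unsigned misalign = 5;		/* p = buf + 5.  */
  const unsigned char *msq = buf;	/* *(floor (p))  */
  const unsigned char *lsq = buf + VS;	/* *(floor (p + VS - 1))  */
  unsigned char out[VS];

  realign_load_model (msq, lsq, misalign, out);
  for (unsigned i = 0; i < VS; i++)
    printf ("%u ", out[i]);		/* Prints: 5 6 ... 20  */
  printf ("\n");
  return 0;
}
#endif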
5651
5652 /* Function vect_grouped_load_supported.
5653
5654 COUNT is the size of the load group (the number of statements plus the
5655 number of gaps). SINGLE_ELEMENT_P is true if there is actually
5656 only one statement, with a gap of COUNT - 1.
5657
5658 Returns true if a suitable permute exists. */
5659
5660 bool
5661 vect_grouped_load_supported (tree vectype, bool single_element_p,
5662 unsigned HOST_WIDE_INT count)
5663 {
5664 machine_mode mode = TYPE_MODE (vectype);
5665
5666 /* If this is single-element interleaving with an element distance
5667 that leaves unused vector loads around, punt: we would at least create
5668 very sub-optimal code in that case (and blow up memory,
5669 see PR65518). */
5670 if (single_element_p && maybe_gt (count, TYPE_VECTOR_SUBPARTS (vectype)))
5671 {
5672 if (dump_enabled_p ())
5673 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5674 "single-element interleaving not supported "
5675 "for non-adjacent vector loads\n");
5676 return false;
5677 }
5678
5679 /* vect_permute_load_chain requires the group size to be equal to 3 or
5680 be a power of two. */
5681 if (count != 3 && exact_log2 (count) == -1)
5682 {
5683 if (dump_enabled_p ())
5684 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5685 "the size of the group of accesses"
5686 " is not a power of 2 or not equal to 3\n");
5687 return false;
5688 }
5689
5690 /* Check that the permutation is supported. */
5691 if (VECTOR_MODE_P (mode))
5692 {
5693 unsigned int i, j;
5694 if (count == 3)
5695 {
5696 unsigned int nelt;
5697 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5698 {
5699 if (dump_enabled_p ())
5700 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5701 "cannot handle groups of 3 loads for"
5702 " variable-length vectors\n");
5703 return false;
5704 }
5705
5706 vec_perm_builder sel (nelt, nelt, 1);
5707 sel.quick_grow (nelt);
5708 vec_perm_indices indices;
5709 unsigned int k;
5710 for (k = 0; k < 3; k++)
5711 {
5712 for (i = 0; i < nelt; i++)
5713 if (3 * i + k < 2 * nelt)
5714 sel[i] = 3 * i + k;
5715 else
5716 sel[i] = 0;
5717 indices.new_vector (sel, 2, nelt);
5718 if (!can_vec_perm_const_p (mode, indices))
5719 {
5720 if (dump_enabled_p ())
5721 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5722 "shuffle of 3 loads is not supported by"
5723 " target\n");
5724 return false;
5725 }
5726 for (i = 0, j = 0; i < nelt; i++)
5727 if (3 * i + k < 2 * nelt)
5728 sel[i] = i;
5729 else
5730 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5731 indices.new_vector (sel, 2, nelt);
5732 if (!can_vec_perm_const_p (mode, indices))
5733 {
5734 if (dump_enabled_p ())
5735 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5736 "shuffle of 3 loads is not supported by"
5737 " target\n");
5738 return false;
5739 }
5740 }
5741 return true;
5742 }
5743 else
5744 {
5745 /* If length is not equal to 3 then only a power of 2 is supported. */
5746 gcc_assert (pow2p_hwi (count));
5747 poly_uint64 nelt = GET_MODE_NUNITS (mode);
5748
5749 /* The encoding has a single stepped pattern. */
5750 vec_perm_builder sel (nelt, 1, 3);
5751 sel.quick_grow (3);
5752 for (i = 0; i < 3; i++)
5753 sel[i] = i * 2;
5754 vec_perm_indices indices (sel, 2, nelt);
5755 if (can_vec_perm_const_p (mode, indices))
5756 {
5757 for (i = 0; i < 3; i++)
5758 sel[i] = i * 2 + 1;
5759 indices.new_vector (sel, 2, nelt);
5760 if (can_vec_perm_const_p (mode, indices))
5761 return true;
5762 }
5763 }
5764 }
5765
5766 if (dump_enabled_p ())
5767 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5768 "extract even/odd not supported by target\n");
5769 return false;
5770 }
5771
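/* A minimal stand-alone sketch (not compiled as part of GCC) of the
   two-permute scheme checked above for groups of three loads, using
   the same selector formulas on plain int arrays with NELT == 8.  The
   helper and driver are illustrative assumptions; the real code emits
   VEC_PERM_EXPRs with these selectors as constant masks.  */
#if 0
#include <stdio.h>

#define NELT 8

static void
vperm (const int *a, const int *b, const unsigned *sel, int *out)
{
  for (unsigned i = 0; i < NELT; i++)
    out[i] = sel[i] < NELT ? a[sel[i]] : b[sel[i] - NELT];
}

int
main (void)
{
  int chain[3][NELT], out[3][NELT];
  for (int v = 0; v < 3; v++)
    for (int i = 0; i < NELT; i++)
      chain[v][i] = v * NELT + i;

  for (unsigned k = 0; k < 3; k++)
    {
      unsigned sel[NELT], i, j;
      int tmp[NELT];

      /* First permute: elements K, K + 3, K + 6, ... of the
         concatenation of vectors 0 and 1; leftover slots are
         don't-care and get index 0.  */
      for (i = 0; i < NELT; i++)
        sel[i] = 3 * i + k < 2 * NELT ? 3 * i + k : 0;
      vperm (chain[0], chain[1], sel, tmp);

      /* Second permute: keep the elements gathered so far and fill
         the tail from vector 2.  */
      for (i = 0, j = 0; i < NELT; i++)
        sel[i] = (3 * i + k < 2 * NELT
                  ? i
                  : NELT + ((NELT + k) % 3) + 3 * j++);
      vperm (tmp, chain[2], sel, out[k]);
    }

  for (int k = 0; k < 3; k++)
    {
      for (int i = 0; i < NELT; i++)
        printf ("%2d ", out[k][i]);
      printf ("\n");	/* Rows: 0 3 6 ... 21 / 1 4 7 ... 22 / 2 5 8 ... 23  */
    }
  return 0;
}
#endif
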
5772 /* Return TRUE if vec_{mask_}load_lanes is available for COUNT vectors of
5773 type VECTYPE. MASKED_P says whether the masked form is needed. */
5774
5775 bool
5776 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5777 bool masked_p)
5778 {
5779 if (masked_p)
5780 return vect_lanes_optab_supported_p ("vec_mask_load_lanes",
5781 vec_mask_load_lanes_optab,
5782 vectype, count);
5783 else
5784 return vect_lanes_optab_supported_p ("vec_load_lanes",
5785 vec_load_lanes_optab,
5786 vectype, count);
5787 }
5788
5789 /* Function vect_permute_load_chain.
5790
5791 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5792 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5793 the input data correctly. Return the final references for loads in
5794 RESULT_CHAIN.
5795
5796 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5797 The input is 4 vectors each containing 8 elements. We assign a number to each
5798 element, the input sequence is:
5799
5800 1st vec: 0 1 2 3 4 5 6 7
5801 2nd vec: 8 9 10 11 12 13 14 15
5802 3rd vec: 16 17 18 19 20 21 22 23
5803 4th vec: 24 25 26 27 28 29 30 31
5804
5805 The output sequence should be:
5806
5807 1st vec: 0 4 8 12 16 20 24 28
5808 2nd vec: 1 5 9 13 17 21 25 29
5809 3rd vec: 2 6 10 14 18 22 26 30
5810 4th vec: 3 7 11 15 19 23 27 31
5811
5812 i.e., the first output vector should contain the first elements of each
5813 interleaving group, etc.
5814
5815 We use extract_even/odd instructions to create such output. The input of
5816 each extract_even/odd operation is two vectors
5817 1st vec 2nd vec
5818 0 1 2 3 4 5 6 7
5819
5820 and the output is the vector of extracted even/odd elements. The output of
5821 extract_even will be: 0 2 4 6
5822 and of extract_odd: 1 3 5 7
5823
5824
5825 The permutation is done in log LENGTH stages. In each stage extract_even
5826 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5827 their order. In our example,
5828
5829 E1: extract_even (1st vec, 2nd vec)
5830 E2: extract_odd (1st vec, 2nd vec)
5831 E3: extract_even (3rd vec, 4th vec)
5832 E4: extract_odd (3rd vec, 4th vec)
5833
5834 The output for the first stage will be:
5835
5836 E1: 0 2 4 6 8 10 12 14
5837 E2: 1 3 5 7 9 11 13 15
5838 E3: 16 18 20 22 24 26 28 30
5839 E4: 17 19 21 23 25 27 29 31
5840
5841 In order to proceed and create the correct sequence for the next stage (or
5842 for the correct output, if the second stage is the last one, as in our
5843 example), we first put the output of extract_even operation and then the
5844 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5845 The input for the second stage is:
5846
5847 1st vec (E1): 0 2 4 6 8 10 12 14
5848 2nd vec (E3): 16 18 20 22 24 26 28 30
5849 3rd vec (E2): 1 3 5 7 9 11 13 15
5850 4th vec (E4): 17 19 21 23 25 27 29 31
5851
5852 The output of the second stage:
5853
5854 E1: 0 4 8 12 16 20 24 28
5855 E2: 2 6 10 14 18 22 26 30
5856 E3: 1 5 9 13 17 21 25 29
5857 E4: 3 7 11 15 19 23 27 31
5858
5859 And RESULT_CHAIN after reordering:
5860
5861 1st vec (E1): 0 4 8 12 16 20 24 28
5862 2nd vec (E3): 1 5 9 13 17 21 25 29
5863 3rd vec (E2): 2 6 10 14 18 22 26 30
5864 4th vec (E4): 3 7 11 15 19 23 27 31. */
5865
5866 static void
5867 vect_permute_load_chain (vec<tree> dr_chain,
5868 unsigned int length,
5869 stmt_vec_info stmt_info,
5870 gimple_stmt_iterator *gsi,
5871 vec<tree> *result_chain)
5872 {
5873 tree data_ref, first_vect, second_vect;
5874 tree perm_mask_even, perm_mask_odd;
5875 tree perm3_mask_low, perm3_mask_high;
5876 gimple *perm_stmt;
5877 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5878 unsigned int i, j, log_length = exact_log2 (length);
5879
5880 result_chain->quick_grow (length);
5881 memcpy (result_chain->address (), dr_chain.address (),
5882 length * sizeof (tree));
5883
5884 if (length == 3)
5885 {
5886 /* vect_grouped_load_supported ensures that this is constant. */
5887 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5888 unsigned int k;
5889
5890 vec_perm_builder sel (nelt, nelt, 1);
5891 sel.quick_grow (nelt);
5892 vec_perm_indices indices;
5893 for (k = 0; k < 3; k++)
5894 {
5895 for (i = 0; i < nelt; i++)
5896 if (3 * i + k < 2 * nelt)
5897 sel[i] = 3 * i + k;
5898 else
5899 sel[i] = 0;
5900 indices.new_vector (sel, 2, nelt);
5901 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5902
5903 for (i = 0, j = 0; i < nelt; i++)
5904 if (3 * i + k < 2 * nelt)
5905 sel[i] = i;
5906 else
5907 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5908 indices.new_vector (sel, 2, nelt);
5909 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5910
5911 first_vect = dr_chain[0];
5912 second_vect = dr_chain[1];
5913
5914 /* Create interleaving stmt (low part of):
5915 low = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
5916 ...}> */
5917 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5918 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5919 second_vect, perm3_mask_low);
5920 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5921
5922 /* Create interleaving stmt (high part of):
5923 high = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
5924 ...}> */
5925 first_vect = data_ref;
5926 second_vect = dr_chain[2];
5927 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5928 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5929 second_vect, perm3_mask_high);
5930 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5931 (*result_chain)[k] = data_ref;
5932 }
5933 }
5934 else
5935 {
5936 /* If length is not equal to 3 then only a power of 2 is supported. */
5937 gcc_assert (pow2p_hwi (length));
5938
5939 /* The encoding has a single stepped pattern. */
5940 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5941 vec_perm_builder sel (nelt, 1, 3);
5942 sel.quick_grow (3);
5943 for (i = 0; i < 3; ++i)
5944 sel[i] = i * 2;
5945 vec_perm_indices indices (sel, 2, nelt);
5946 perm_mask_even = vect_gen_perm_mask_checked (vectype, indices);
5947
5948 for (i = 0; i < 3; ++i)
5949 sel[i] = i * 2 + 1;
5950 indices.new_vector (sel, 2, nelt);
5951 perm_mask_odd = vect_gen_perm_mask_checked (vectype, indices);
5952
5953 for (i = 0; i < log_length; i++)
5954 {
5955 for (j = 0; j < length; j += 2)
5956 {
5957 first_vect = dr_chain[j];
5958 second_vect = dr_chain[j+1];
5959
5960 /* data_ref = permute_even (first_data_ref, second_data_ref); */
5961 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
5962 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5963 first_vect, second_vect,
5964 perm_mask_even);
5965 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5966 (*result_chain)[j/2] = data_ref;
5967
5968 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
5969 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
5970 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5971 first_vect, second_vect,
5972 perm_mask_odd);
5973 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5974 (*result_chain)[j/2+length/2] = data_ref;
5975 }
5976 memcpy (dr_chain.address (), result_chain->address (),
5977 length * sizeof (tree));
5978 }
5979 }
5980 }
5981
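/* A minimal stand-alone sketch (not compiled as part of GCC) of the
   log2 (LENGTH) extract_even/odd scheme described in the comment above
   vect_permute_load_chain, including the placement of the even results
   in slot J/2 and the odd results in slot J/2 + LENGTH/2.  NELT, LEN
   and the helper are illustrative assumptions only.  */
#if 0
#include <stdio.h>
#include <string.h>

#define NELT 8
#define LEN  4

/* Extract the even (ODD == 0) or odd (ODD == 1) elements of the
   concatenation of A and B.  */
static void
extract (const int *a, const int *b, int odd, int *out)
{
  for (int i = 0; i < NELT; i++)
    {
      int idx = 2 * i + odd;
      out[i] = idx < NELT ? a[idx] : b[idx - NELT];
    }
}

int
main (void)
{
  int chain[LEN][NELT], result[LEN][NELT];
  for (int v = 0; v < LEN; v++)
    for (int i = 0; i < NELT; i++)
      chain[v][i] = v * NELT + i;

  /* log2 (LEN) = 2 stages over adjacent pairs of vectors.  */
  for (int stage = 0; stage < 2; stage++)
    {
      for (int j = 0; j < LEN; j += 2)
        {
          extract (chain[j], chain[j + 1], 0, result[j / 2]);
          extract (chain[j], chain[j + 1], 1, result[j / 2 + LEN / 2]);
        }
      memcpy (chain, result, sizeof (chain));
    }

  for (int v = 0; v < LEN; v++)
    {
      for (int i = 0; i < NELT; i++)
        printf ("%2d ", chain[v][i]);
      printf ("\n");	/* Rows: 0 4 ... 28 / 1 5 ... 29 / 2 6 ... 30 / 3 7 ... 31  */
    }
  return 0;
}
#endif
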
5982 /* Function vect_shift_permute_load_chain.
5983
5984 Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
5985 sequence of stmts to reorder the input data accordingly.
5986 Return the final references for loads in RESULT_CHAIN.
5987 Return true if successful, false otherwise.
5988
5989 E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
5990 The input is 3 vectors each containing 8 elements. We assign a
5991 number to each element, the input sequence is:
5992
5993 1st vec: 0 1 2 3 4 5 6 7
5994 2nd vec: 8 9 10 11 12 13 14 15
5995 3rd vec: 16 17 18 19 20 21 22 23
5996
5997 The output sequence should be:
5998
5999 1st vec: 0 3 6 9 12 15 18 21
6000 2nd vec: 1 4 7 10 13 16 19 22
6001 3rd vec: 2 5 8 11 14 17 20 23
6002
6003 We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
6004
6005 First we shuffle all 3 vectors to get the correct element order:
6006
6007 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
6008 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
6009 3rd vec: (16 19 22) (17 20 23) (18 21)
6010
6011 Next we unite and shift the vectors 3 times:
6012
6013 1st step:
6014 shift right by 6 the concatenation of:
6015 "1st vec" and "2nd vec"
6016 ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
6017 "2nd vec" and "3rd vec"
6018 ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
6019 "3rd vec" and "1st vec"
6020 (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
6021 | New vectors |
6022
6023 So that now new vectors are:
6024
6025 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
6026 2nd vec: (10 13) (16 19 22) (17 20 23)
6027 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
6028
6029 2nd step:
6030 shift right by 5 the concatenation of:
6031 "1st vec" and "3rd vec"
6032 ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
6033 "2nd vec" and "1st vec"
6034 (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
6035 "3rd vec" and "2nd vec"
6036 (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
6037 | New vectors |
6038
6039 So that now new vectors are:
6040
6041 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
6042 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
6043 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
6044
6045 3rd step:
6046 shift right by 5 the concatenation of:
6047 "1st vec" and "1st vec"
6048 ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
6049 shift right by 3 the concatenation of:
6050 "2nd vec" and "2nd vec"
6051 (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
6052 | New vectors |
6053
6054 So that now all vectors are READY:
6055 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
6056 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
6057 3rd vec: ( 1 4 7) (10 13) (16 19 22)
6058
6059 This algorithm is faster than the one in vect_permute_load_chain if:
6060 1. "shift of a concatenation" is faster than general permutation.
6061 This is usually so.
6062 2. The TARGET machine can't execute vector instructions in parallel.
6063 This is because each step of the algorithm depends on the previous one.
6064 The algorithm in vect_permute_load_chain is much more parallel.
6065
6066 The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
6067 */
6068
6069 static bool
6070 vect_shift_permute_load_chain (vec<tree> dr_chain,
6071 unsigned int length,
6072 stmt_vec_info stmt_info,
6073 gimple_stmt_iterator *gsi,
6074 vec<tree> *result_chain)
6075 {
6076 tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
6077 tree perm2_mask1, perm2_mask2, perm3_mask;
6078 tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
6079 gimple *perm_stmt;
6080
6081 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6082 unsigned int i;
6083 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6084
6085 unsigned HOST_WIDE_INT nelt, vf;
6086 if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nelt)
6087 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&vf))
6088 /* Not supported for variable-length vectors. */
6089 return false;
6090
6091 vec_perm_builder sel (nelt, nelt, 1);
6092 sel.quick_grow (nelt);
6093
6094 result_chain->quick_grow (length);
6095 memcpy (result_chain->address (), dr_chain.address (),
6096 length * sizeof (tree));
6097
6098 if (pow2p_hwi (length) && vf > 4)
6099 {
6100 unsigned int j, log_length = exact_log2 (length);
6101 for (i = 0; i < nelt / 2; ++i)
6102 sel[i] = i * 2;
6103 for (i = 0; i < nelt / 2; ++i)
6104 sel[nelt / 2 + i] = i * 2 + 1;
6105 vec_perm_indices indices (sel, 2, nelt);
6106 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6107 {
6108 if (dump_enabled_p ())
6109 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6110 "shuffle of 2 fields structure is not"
6111 " supported by target\n");
6112 return false;
6113 }
6114 perm2_mask1 = vect_gen_perm_mask_checked (vectype, indices);
6115
6116 for (i = 0; i < nelt / 2; ++i)
6117 sel[i] = i * 2 + 1;
6118 for (i = 0; i < nelt / 2; ++i)
6119 sel[nelt / 2 + i] = i * 2;
6120 indices.new_vector (sel, 2, nelt);
6121 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6122 {
6123 if (dump_enabled_p ())
6124 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6125 "shuffle of 2 fields structure is not"
6126 " supported by target\n");
6127 return false;
6128 }
6129 perm2_mask2 = vect_gen_perm_mask_checked (vectype, indices);
6130
6131 /* Generating permutation constant to shift all elements.
6132 For vector length 8 it is {4 5 6 7 8 9 10 11}. */
6133 for (i = 0; i < nelt; i++)
6134 sel[i] = nelt / 2 + i;
6135 indices.new_vector (sel, 2, nelt);
6136 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6137 {
6138 if (dump_enabled_p ())
6139 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6140 "shift permutation is not supported by target\n");
6141 return false;
6142 }
6143 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6144
6145 /* Generating permutation constant to select vector from 2.
6146 For vector length 8 it is {0 1 2 3 12 13 14 15}. */
6147 for (i = 0; i < nelt / 2; i++)
6148 sel[i] = i;
6149 for (i = nelt / 2; i < nelt; i++)
6150 sel[i] = nelt + i;
6151 indices.new_vector (sel, 2, nelt);
6152 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6153 {
6154 if (dump_enabled_p ())
6155 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6156 "select is not supported by target\n");
6157 return false;
6158 }
6159 select_mask = vect_gen_perm_mask_checked (vectype, indices);
6160
6161 for (i = 0; i < log_length; i++)
6162 {
6163 for (j = 0; j < length; j += 2)
6164 {
6165 first_vect = dr_chain[j];
6166 second_vect = dr_chain[j + 1];
6167
6168 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6169 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6170 first_vect, first_vect,
6171 perm2_mask1);
6172 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6173 vect[0] = data_ref;
6174
6175 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6176 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6177 second_vect, second_vect,
6178 perm2_mask2);
6179 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6180 vect[1] = data_ref;
6181
6182 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
6183 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6184 vect[0], vect[1], shift1_mask);
6185 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6186 (*result_chain)[j/2 + length/2] = data_ref;
6187
6188 data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
6189 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6190 vect[0], vect[1], select_mask);
6191 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6192 (*result_chain)[j/2] = data_ref;
6193 }
6194 memcpy (dr_chain.address (), result_chain->address (),
6195 length * sizeof (tree));
6196 }
6197 return true;
6198 }
6199 if (length == 3 && vf > 2)
6200 {
6201 unsigned int k = 0, l = 0;
6202
6203 /* Generating permutation constant to get all elements in right order.
6204 For vector length 8 it is {0 3 6 1 4 7 2 5}. */
6205 for (i = 0; i < nelt; i++)
6206 {
6207 if (3 * k + (l % 3) >= nelt)
6208 {
6209 k = 0;
6210 l += (3 - (nelt % 3));
6211 }
6212 sel[i] = 3 * k + (l % 3);
6213 k++;
6214 }
6215 vec_perm_indices indices (sel, 2, nelt);
6216 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6217 {
6218 if (dump_enabled_p ())
6219 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6220 "shuffle of 3 fields structure is not"
6221 " supported by target\n");
6222 return false;
6223 }
6224 perm3_mask = vect_gen_perm_mask_checked (vectype, indices);
6225
6226 /* Generating permutation constant to shift all elements.
6227 For vector length 8 it is {6 7 8 9 10 11 12 13}. */
6228 for (i = 0; i < nelt; i++)
6229 sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
6230 indices.new_vector (sel, 2, nelt);
6231 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6232 {
6233 if (dump_enabled_p ())
6234 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6235 "shift permutation is not supported by target\n");
6236 return false;
6237 }
6238 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6239
6240 /* Generating permutation constant to shift all elements.
6241 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6242 for (i = 0; i < nelt; i++)
6243 sel[i] = 2 * (nelt / 3) + 1 + i;
6244 indices.new_vector (sel, 2, nelt);
6245 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6246 {
6247 if (dump_enabled_p ())
6248 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6249 "shift permutation is not supported by target\n");
6250 return false;
6251 }
6252 shift2_mask = vect_gen_perm_mask_checked (vectype, indices);
6253
6254 /* Generating permutation constant to shift all elements.
6255 For vector length 8 it is {3 4 5 6 7 8 9 10}. */
6256 for (i = 0; i < nelt; i++)
6257 sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
6258 indices.new_vector (sel, 2, nelt);
6259 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6260 {
6261 if (dump_enabled_p ())
6262 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6263 "shift permutation is not supported by target\n");
6264 return false;
6265 }
6266 shift3_mask = vect_gen_perm_mask_checked (vectype, indices);
6267
6268 /* Generating permutation constant to shift all elements.
6269 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6270 for (i = 0; i < nelt; i++)
6271 sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
6272 indices.new_vector (sel, 2, nelt);
6273 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6274 {
6275 if (dump_enabled_p ())
6276 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6277 "shift permutation is not supported by target\n");
6278 return false;
6279 }
6280 shift4_mask = vect_gen_perm_mask_checked (vectype, indices);
6281
6282 for (k = 0; k < 3; k++)
6283 {
6284 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
6285 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6286 dr_chain[k], dr_chain[k],
6287 perm3_mask);
6288 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6289 vect[k] = data_ref;
6290 }
6291
6292 for (k = 0; k < 3; k++)
6293 {
6294 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
6295 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6296 vect[k % 3], vect[(k + 1) % 3],
6297 shift1_mask);
6298 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6299 vect_shift[k] = data_ref;
6300 }
6301
6302 for (k = 0; k < 3; k++)
6303 {
6304 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
6305 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6306 vect_shift[(4 - k) % 3],
6307 vect_shift[(3 - k) % 3],
6308 shift2_mask);
6309 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6310 vect[k] = data_ref;
6311 }
6312
6313 (*result_chain)[3 - (nelt % 3)] = vect[2];
6314
6315 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
6316 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
6317 vect[0], shift3_mask);
6318 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6319 (*result_chain)[nelt % 3] = data_ref;
6320
6321 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
6322 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
6323 vect[1], shift4_mask);
6324 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6325 (*result_chain)[0] = data_ref;
6326 return true;
6327 }
6328 return false;
6329 }
6330
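/* A minimal stand-alone sketch (not compiled as part of GCC) of the
   "shift of a concatenation" primitive that the shiftN_mask selectors
   above describe: the result is the NELT-element window starting at
   offset K of the concatenation of the two inputs.  NELT and the
   driver values are illustrative assumptions only.  */
#if 0
#include <stdio.h>

#define NELT 8

static void
shift_concat (const int *a, const int *b, unsigned k, int *out)
{
  for (unsigned i = 0; i < NELT; i++)
    out[i] = k + i < NELT ? a[k + i] : b[k + i - NELT];
}

int
main (void)
{
  int a[NELT] = { 0, 1, 2, 3, 4, 5, 6, 7 };
  int b[NELT] = { 8, 9, 10, 11, 12, 13, 14, 15 };
  int out[NELT];

  /* The selector {6 7 8 9 10 11 12 13} used above for NELT == 8.  */
  shift_concat (a, b, 6, out);
  for (unsigned i = 0; i < NELT; i++)
    printf ("%d ", out[i]);		/* Prints: 6 7 8 9 10 11 12 13  */
  printf ("\n");
  return 0;
}
#endif
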
6331 /* Function vect_transform_grouped_load.
6332
6333 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
6334 to perform their permutation and ascribe the resulting vectorized statements to
6335 the scalar statements.
6336 */
6337
6338 void
6339 vect_transform_grouped_load (stmt_vec_info stmt_info, vec<tree> dr_chain,
6340 int size, gimple_stmt_iterator *gsi)
6341 {
6342 machine_mode mode;
6343 vec<tree> result_chain = vNULL;
6344
6345 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
6346 RESULT_CHAIN is the output of vect_permute_load_chain; it contains the permuted
6347 vectors that are ready for vector computation. */
6348 result_chain.create (size);
6349
6350 /* If the reassociation width for the vector type is 2 or greater, the target
6351 machine can execute 2 or more vector instructions in parallel. Otherwise try
6352 to get the chain for the load group using vect_shift_permute_load_chain. */
6353 mode = TYPE_MODE (STMT_VINFO_VECTYPE (stmt_info));
6354 if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
6355 || pow2p_hwi (size)
6356 || !vect_shift_permute_load_chain (dr_chain, size, stmt_info,
6357 gsi, &result_chain))
6358 vect_permute_load_chain (dr_chain, size, stmt_info, gsi, &result_chain);
6359 vect_record_grouped_load_vectors (stmt_info, result_chain);
6360 result_chain.release ();
6361 }
6362
6363 /* RESULT_CHAIN contains the output of a group of grouped loads that were
6364 generated as part of the vectorization of STMT_INFO. Assign the statement
6365 for each vector to the associated scalar statement. */
6366
6367 void
6368 vect_record_grouped_load_vectors (stmt_vec_info stmt_info,
6369 vec<tree> result_chain)
6370 {
6371 vec_info *vinfo = stmt_info->vinfo;
6372 stmt_vec_info first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
6373 unsigned int i, gap_count;
6374 tree tmp_data_ref;
6375
6376 /* Put a permuted data-ref in the VECTORIZED_STMT field.
6377 Since we scan the chain starting from its first node, their order
6378 corresponds to the order of data-refs in RESULT_CHAIN. */
6379 stmt_vec_info next_stmt_info = first_stmt_info;
6380 gap_count = 1;
6381 FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
6382 {
6383 if (!next_stmt_info)
6384 break;
6385
6386 /* Skip the gaps. Loads created for the gaps will be removed by the dead
6387 code elimination pass later. No need to check for the first stmt in
6388 the group, since it always exists.
6389 DR_GROUP_GAP is the number of steps in elements from the previous
6390 access (if there is no gap DR_GROUP_GAP is 1). We skip loads that
6391 correspond to the gaps. */
6392 if (next_stmt_info != first_stmt_info
6393 && gap_count < DR_GROUP_GAP (next_stmt_info))
6394 {
6395 gap_count++;
6396 continue;
6397 }
6398
6399 /* ??? The following needs cleanup after the removal of
6400 DR_GROUP_SAME_DR_STMT. */
6401 if (next_stmt_info)
6402 {
6403 stmt_vec_info new_stmt_info = vinfo->lookup_def (tmp_data_ref);
6404 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
6405 copies, and we put the new vector statement in the first available
6406 RELATED_STMT. */
6407 if (!STMT_VINFO_VEC_STMT (next_stmt_info))
6408 STMT_VINFO_VEC_STMT (next_stmt_info) = new_stmt_info;
6409 else
6410 {
6411 stmt_vec_info prev_stmt_info
6412 = STMT_VINFO_VEC_STMT (next_stmt_info);
6413 stmt_vec_info rel_stmt_info
6414 = STMT_VINFO_RELATED_STMT (prev_stmt_info);
6415 while (rel_stmt_info)
6416 {
6417 prev_stmt_info = rel_stmt_info;
6418 rel_stmt_info = STMT_VINFO_RELATED_STMT (rel_stmt_info);
6419 }
6420
6421 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt_info;
6422 }
6423
6424 next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
6425 gap_count = 1;
6426 }
6427 }
6428 }
6429
6430 /* Function vect_can_force_dr_alignment_p.
6431 
6432 Returns whether the alignment of a DECL can be forced to be aligned
6433 on an ALIGNMENT-bit boundary. */
6434
6435 bool
6436 vect_can_force_dr_alignment_p (const_tree decl, poly_uint64 alignment)
6437 {
6438 if (!VAR_P (decl))
6439 return false;
6440
6441 if (decl_in_symtab_p (decl)
6442 && !symtab_node::get (decl)->can_increase_alignment_p ())
6443 return false;
6444
6445 if (TREE_STATIC (decl))
6446 return (known_le (alignment,
6447 (unsigned HOST_WIDE_INT) MAX_OFILE_ALIGNMENT));
6448 else
6449 return (known_le (alignment, (unsigned HOST_WIDE_INT) MAX_STACK_ALIGNMENT));
6450 }
6451
6452
6453 /* Return whether the data reference DR_INFO is supported with respect to its
6454 alignment.
6455 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
6456 if it is aligned, i.e., check if it is possible to vectorize it with different
6457 alignment. */
6458
6459 enum dr_alignment_support
6460 vect_supportable_dr_alignment (dr_vec_info *dr_info,
6461 bool check_aligned_accesses)
6462 {
6463 data_reference *dr = dr_info->dr;
6464 stmt_vec_info stmt_info = dr_info->stmt;
6465 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6466 machine_mode mode = TYPE_MODE (vectype);
6467 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6468 class loop *vect_loop = NULL;
6469 bool nested_in_vect_loop = false;
6470
6471 if (aligned_access_p (dr_info) && !check_aligned_accesses)
6472 return dr_aligned;
6473
6474 /* For now assume all conditional loads/stores support unaligned
6475 access without any special code. */
6476 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
6477 if (gimple_call_internal_p (stmt)
6478 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
6479 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
6480 return dr_unaligned_supported;
6481
6482 if (loop_vinfo)
6483 {
6484 vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
6485 nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt_info);
6486 }
6487
6488 /* Possibly unaligned access. */
6489
6490 /* We can choose between using the implicit realignment scheme (generating
6491 a misaligned_move stmt) and the explicit realignment scheme (generating
6492 aligned loads with a REALIGN_LOAD). There are two variants to the
6493 explicit realignment scheme: optimized, and unoptimized.
6494 We can optimize the realignment only if the step between consecutive
6495 vector loads is equal to the vector size. Since the vector memory
6496 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
6497 is guaranteed that the misalignment amount remains the same throughout the
6498 execution of the vectorized loop. Therefore, we can create the
6499 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
6500 at the loop preheader.
6501
6502 However, in the case of outer-loop vectorization, when vectorizing a
6503 memory access in the inner-loop nested within the LOOP that is now being
6504 vectorized, while it is guaranteed that the misalignment of the
6505 vectorized memory access will remain the same in different outer-loop
6506 iterations, it is *not* guaranteed that it will remain the same throughout
6507 the execution of the inner-loop. This is because the inner-loop advances
6508 with the original scalar step (and not in steps of VS). If the inner-loop
6509 step happens to be a multiple of VS, then the misalignment remains fixed
6510 and we can use the optimized realignment scheme. For example:
6511
6512 for (i=0; i<N; i++)
6513 for (j=0; j<M; j++)
6514 s += a[i+j];
6515
6516 When vectorizing the i-loop in the above example, the step between
6517 consecutive vector loads is 1, and so the misalignment does not remain
6518 fixed across the execution of the inner-loop, and the realignment cannot
6519 be optimized (as illustrated in the following pseudo vectorized loop):
6520
6521 for (i=0; i<N; i+=4)
6522 for (j=0; j<M; j++){
6523 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
6524 // when j is {0,1,2,3,4,5,6,7,...} respectively.
6525 // (assuming that we start from an aligned address).
6526 }
6527
6528 We therefore have to use the unoptimized realignment scheme:
6529
6530 for (i=0; i<N; i+=4)
6531 for (j=k; j<M; j+=4)
6532 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
6533 // that the misalignment of the initial address is
6534 // 0).
6535
6536 The loop can then be vectorized as follows:
6537
6538 for (k=0; k<4; k++){
6539 rt = get_realignment_token (&vp[k]);
6540 for (i=0; i<N; i+=4){
6541 v1 = vp[i+k];
6542 for (j=k; j<M; j+=4){
6543 v2 = vp[i+j+VS-1];
6544 va = REALIGN_LOAD <v1,v2,rt>;
6545 vs += va;
6546 v1 = v2;
6547 }
6548 }
6549 } */
6550
6551 if (DR_IS_READ (dr))
6552 {
6553 bool is_packed = false;
6554 tree type = (TREE_TYPE (DR_REF (dr)));
6555
6556 if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
6557 && (!targetm.vectorize.builtin_mask_for_load
6558 || targetm.vectorize.builtin_mask_for_load ()))
6559 {
6560 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6561
6562 /* If we are doing SLP then the accesses need not have the
6563 same alignment, instead it depends on the SLP group size. */
6564 if (loop_vinfo
6565 && STMT_SLP_TYPE (stmt_info)
6566 && !multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6567 * (DR_GROUP_SIZE
6568 (DR_GROUP_FIRST_ELEMENT (stmt_info))),
6569 TYPE_VECTOR_SUBPARTS (vectype)))
6570 ;
6571 else if (!loop_vinfo
6572 || (nested_in_vect_loop
6573 && maybe_ne (TREE_INT_CST_LOW (DR_STEP (dr)),
6574 GET_MODE_SIZE (TYPE_MODE (vectype)))))
6575 return dr_explicit_realign;
6576 else
6577 return dr_explicit_realign_optimized;
6578 }
6579 if (!known_alignment_for_access_p (dr_info))
6580 is_packed = not_size_aligned (DR_REF (dr));
6581
6582 if (targetm.vectorize.support_vector_misalignment
6583 (mode, type, DR_MISALIGNMENT (dr_info), is_packed))
6584 /* Can't software pipeline the loads, but can at least do them. */
6585 return dr_unaligned_supported;
6586 }
6587 else
6588 {
6589 bool is_packed = false;
6590 tree type = (TREE_TYPE (DR_REF (dr)));
6591
6592 if (!known_alignment_for_access_p (dr_info))
6593 is_packed = not_size_aligned (DR_REF (dr));
6594
6595 if (targetm.vectorize.support_vector_misalignment
6596 (mode, type, DR_MISALIGNMENT (dr_info), is_packed))
6597 return dr_unaligned_supported;
6598 }
6599
6600 /* Unsupported. */
6601 return dr_unaligned_unsupported;
6602 }
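
/* A minimal stand-alone sketch (not compiled as part of GCC) of the
   misalignment behaviour discussed in the comment inside
   vect_supportable_dr_alignment: with an inner-loop step of one scalar
   element the misalignment of the access cycles through 0 .. VS-1, so
   only the unoptimized realignment scheme applies, while a step that
   is a multiple of VS keeps it fixed.  VS and the step values are
   illustrative assumptions; misalignment is measured in elements here
   for simplicity.  */
#if 0
#include <stdio.h>

#define VS 4

int
main (void)
{
  int steps[2] = { 1, VS };
  for (int s = 0; s < 2; s++)
    {
      printf ("inner step %d:", steps[s]);
      for (int j = 0; j < 2 * VS; j += steps[s])
        printf (" %d", j % VS);		/* Misalignment of a[i + j].  */
      printf ("\n");
    }
  return 0;
}
#endif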