1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2 Copyright (C) 2003-2016 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "predict.h"
31 #include "tm_p.h"
32 #include "ssa.h"
33 #include "optabs-tree.h"
34 #include "cgraph.h"
35 #include "dumpfile.h"
36 #include "alias.h"
37 #include "fold-const.h"
38 #include "stor-layout.h"
39 #include "tree-eh.h"
40 #include "gimplify.h"
41 #include "gimple-iterator.h"
42 #include "gimplify-me.h"
43 #include "tree-ssa-loop-ivopts.h"
44 #include "tree-ssa-loop-manip.h"
45 #include "tree-ssa-loop.h"
46 #include "cfgloop.h"
47 #include "tree-scalar-evolution.h"
48 #include "tree-vectorizer.h"
49 #include "expr.h"
50 #include "builtins.h"
51 #include "params.h"
52
53 /* Return true if load- or store-lanes optab OPTAB is implemented for
54 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
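/* Editorial example (illustrative only, assuming a target with 128-bit
   V4SI vectors): for COUNT == 2 we ask for an integer mode of
   2 * 128 = 256 bits and then check whether the target implements the
   vec_load_lanes / vec_store_lanes optab converting between that array
   mode and V4SImode; if either the array mode or the optab handler is
   missing, load/store-lanes cannot be used for the group.  */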
55
56 static bool
57 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
58 tree vectype, unsigned HOST_WIDE_INT count)
59 {
60 machine_mode mode, array_mode;
61 bool limit_p;
62
63 mode = TYPE_MODE (vectype);
64 limit_p = !targetm.array_mode_supported_p (mode, count);
65 array_mode = mode_for_size (count * GET_MODE_BITSIZE (mode),
66 MODE_INT, limit_p);
67
68 if (array_mode == BLKmode)
69 {
70 if (dump_enabled_p ())
71 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
72 "no array mode for %s[" HOST_WIDE_INT_PRINT_DEC "]\n",
73 GET_MODE_NAME (mode), count);
74 return false;
75 }
76
77 if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
78 {
79 if (dump_enabled_p ())
80 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
81 "cannot use %s<%s><%s>\n", name,
82 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
83 return false;
84 }
85
86 if (dump_enabled_p ())
87 dump_printf_loc (MSG_NOTE, vect_location,
88 "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
89 GET_MODE_NAME (mode));
90
91 return true;
92 }
93
94
95 /* Return the smallest scalar type used in STMT.
96 This is used to determine the vectype of the stmt. We generally set the
97 vectype according to the type of the result (lhs). For stmts whose
98 result-type differs from the type of the arguments (e.g., demotion or
99 promotion), vectype will be reset appropriately (later). Note that we have
100 to visit the smallest datatype in this function, because that determines the
101 VF. If the smallest datatype in the loop is present only as the rhs of a
102 promotion operation, we would otherwise miss it.
103 Such a case, where a variable of this datatype does not appear in the lhs
104 anywhere in the loop, can only occur if it's an invariant: e.g.:
105 'int_x = (int) short_inv', which we'd expect to have been optimized away by
106 invariant motion. However, we cannot rely on invariant motion to always
107 take invariants out of the loop, and so in the case of promotion we also
108 have to check the rhs.
109 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
110 types. */
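/* Editorial example (not part of the original comment): for a promotion
   such as 'int_a = (int) short_b' the expression type is int (4 bytes)
   but the rhs type is short (2 bytes), so the short type is returned;
   the vectorization factor has to be derived from the smaller element.  */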
111
112 tree
113 vect_get_smallest_scalar_type (gimple *stmt, HOST_WIDE_INT *lhs_size_unit,
114 HOST_WIDE_INT *rhs_size_unit)
115 {
116 tree scalar_type = gimple_expr_type (stmt);
117 HOST_WIDE_INT lhs, rhs;
118
119 lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
120
121 if (is_gimple_assign (stmt)
122 && (gimple_assign_cast_p (stmt)
123 || gimple_assign_rhs_code (stmt) == WIDEN_MULT_EXPR
124 || gimple_assign_rhs_code (stmt) == WIDEN_LSHIFT_EXPR
125 || gimple_assign_rhs_code (stmt) == FLOAT_EXPR))
126 {
127 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
128
129 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
130 if (rhs < lhs)
131 scalar_type = rhs_type;
132 }
133
134 *lhs_size_unit = lhs;
135 *rhs_size_unit = rhs;
136 return scalar_type;
137 }
138
139
140 /* Insert DDR into the LOOP_VINFO list of ddrs that may alias and need to be
141 tested at run-time. Return TRUE if DDR was successfully inserted.
142 Return FALSE if versioning is not supported. */
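/* Editorial sketch of the run-time check this leads to (simplified):
   for two data-refs A and B with constant step, the vectorized loop is
   guarded by a segment-overlap test roughly of the form
     addr_A + segment_len_A <= addr_B || addr_B + segment_len_B <= addr_A
   and the scalar loop is used when the test fails.  */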
143
144 static bool
145 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
146 {
147 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
148
149 if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
150 return false;
151
152 if (dump_enabled_p ())
153 {
154 dump_printf_loc (MSG_NOTE, vect_location,
155 "mark for run-time aliasing test between ");
156 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr)));
157 dump_printf (MSG_NOTE, " and ");
158 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr)));
159 dump_printf (MSG_NOTE, "\n");
160 }
161
162 if (optimize_loop_nest_for_size_p (loop))
163 {
164 if (dump_enabled_p ())
165 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
166 "versioning not supported when optimizing"
167 " for size.\n");
168 return false;
169 }
170
171 /* FORNOW: We don't support versioning with outer-loop vectorization. */
172 if (loop->inner)
173 {
174 if (dump_enabled_p ())
175 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
176 "versioning not yet supported for outer-loops.\n");
177 return false;
178 }
179
180 /* FORNOW: We don't support creating runtime alias tests for non-constant
181 step. */
182 if (TREE_CODE (DR_STEP (DDR_A (ddr))) != INTEGER_CST
183 || TREE_CODE (DR_STEP (DDR_B (ddr))) != INTEGER_CST)
184 {
185 if (dump_enabled_p ())
186 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
187 "versioning not yet supported for non-constant "
188 "step\n");
189 return false;
190 }
191
192 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
193 return true;
194 }
195
196
197 /* Function vect_analyze_data_ref_dependence.
198
199 Return TRUE if there (might) exist a dependence between a memory-reference
200 DRA and a memory-reference DRB. When the dependence can instead be
201 checked at run-time by versioning for alias, return FALSE. Adjust *MAX_VF
202 according to the data dependence. */
203
204 static bool
205 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
206 loop_vec_info loop_vinfo, int *max_vf)
207 {
208 unsigned int i;
209 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
210 struct data_reference *dra = DDR_A (ddr);
211 struct data_reference *drb = DDR_B (ddr);
212 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
213 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
214 lambda_vector dist_v;
215 unsigned int loop_depth;
216
217 /* In loop analysis all data references should be vectorizable. */
218 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
219 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
220 gcc_unreachable ();
221
222 /* Independent data accesses. */
223 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
224 return false;
225
226 if (dra == drb
227 || (DR_IS_READ (dra) && DR_IS_READ (drb)))
228 return false;
229
230 /* We do not have to consider dependences between accesses that belong
231 to the same group. */
232 if (GROUP_FIRST_ELEMENT (stmtinfo_a)
233 && GROUP_FIRST_ELEMENT (stmtinfo_a) == GROUP_FIRST_ELEMENT (stmtinfo_b))
234 return false;
235
236 /* Even if we have an anti-dependence then, as the vectorized loop covers at
237 least two scalar iterations, there is always also a true dependence.
238 As the vectorizer does not re-order loads and stores we can ignore
239 the anti-dependence if TBAA can disambiguate both DRs similar to the
240 case with known negative distance anti-dependences (positive
241 distance anti-dependences would violate TBAA constraints). */
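  /* Editorial example: a read through an 'int *' and a write through a
     'float *' have non-conflicting alias sets, so even if the dependence
     analyzer reports an anti-dependence between them it can be ignored
     here.  */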
242 if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
243 || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
244 && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
245 get_alias_set (DR_REF (drb))))
246 return false;
247
248 /* Unknown data dependence. */
249 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
250 {
251 /* If user asserted safelen consecutive iterations can be
252 executed concurrently, assume independence. */
253 if (loop->safelen >= 2)
254 {
255 if (loop->safelen < *max_vf)
256 *max_vf = loop->safelen;
257 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
258 return false;
259 }
260
261 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
262 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
263 {
264 if (dump_enabled_p ())
265 {
266 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
267 "versioning for alias not supported for: "
268 "can't determine dependence between ");
269 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
270 DR_REF (dra));
271 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
272 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
273 DR_REF (drb));
274 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
275 }
276 return true;
277 }
278
279 if (dump_enabled_p ())
280 {
281 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
282 "versioning for alias required: "
283 "can't determine dependence between ");
284 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
285 DR_REF (dra));
286 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
287 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
288 DR_REF (drb));
289 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
290 }
291
292 /* Add to list of ddrs that need to be tested at run-time. */
293 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
294 }
295
296 /* Known data dependence. */
297 if (DDR_NUM_DIST_VECTS (ddr) == 0)
298 {
299 /* If user asserted safelen consecutive iterations can be
300 executed concurrently, assume independence. */
301 if (loop->safelen >= 2)
302 {
303 if (loop->safelen < *max_vf)
304 *max_vf = loop->safelen;
305 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
306 return false;
307 }
308
309 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
310 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
311 {
312 if (dump_enabled_p ())
313 {
314 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
315 "versioning for alias not supported for: "
316 "bad dist vector for ");
317 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
318 DR_REF (dra));
319 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
320 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
321 DR_REF (drb));
322 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
323 }
324 return true;
325 }
326
327 if (dump_enabled_p ())
328 {
329 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
330 "versioning for alias required: "
331 "bad dist vector for ");
332 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
333 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
334 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
335 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
336 }
337 /* Add to list of ddrs that need to be tested at run-time. */
338 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
339 }
340
341 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
342 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
343 {
344 int dist = dist_v[loop_depth];
345
346 if (dump_enabled_p ())
347 dump_printf_loc (MSG_NOTE, vect_location,
348 "dependence distance = %d.\n", dist);
349
350 if (dist == 0)
351 {
352 if (dump_enabled_p ())
353 {
354 dump_printf_loc (MSG_NOTE, vect_location,
355 "dependence distance == 0 between ");
356 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
357 dump_printf (MSG_NOTE, " and ");
358 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
359 dump_printf (MSG_NOTE, "\n");
360 }
361
362 /* When we perform grouped accesses and perform implicit CSE
363 by detecting equal accesses and doing disambiguation with
364 runtime alias tests like for
365 .. = a[i];
366 .. = a[i+1];
367 a[i] = ..;
368 a[i+1] = ..;
369 *p = ..;
370 .. = a[i];
371 .. = a[i+1];
372 where we will end up loading { a[i], a[i+1] } once, make
373 sure that inserting group loads before the first load and
374 stores after the last store will do the right thing.
375 Similar for groups like
376 a[i] = ...;
377 ... = a[i];
378 a[i+1] = ...;
379 where loads from the group interleave with the store. */
380 if (STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
381 || STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
382 {
383 gimple *earlier_stmt;
384 earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
385 if (DR_IS_WRITE
386 (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
387 {
388 if (dump_enabled_p ())
389 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
390 "READ_WRITE dependence in interleaving."
391 "\n");
392 return true;
393 }
394 }
395
396 continue;
397 }
398
399 if (dist > 0 && DDR_REVERSED_P (ddr))
400 {
401 /* If DDR_REVERSED_P the order of the data-refs in DDR was
402 reversed (to make distance vector positive), and the actual
403 distance is negative. */
404 if (dump_enabled_p ())
405 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
406 "dependence distance negative.\n");
407 /* Record a negative dependence distance to later limit the
408 amount of stmt copying / unrolling we can perform.
409 Only need to handle read-after-write dependence. */
410 if (DR_IS_READ (drb)
411 && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
412 || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
413 STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
414 continue;
415 }
416
417 if (abs (dist) >= 2
418 && abs (dist) < *max_vf)
419 {
420 /* The dependence distance requires reduction of the maximal
421 vectorization factor. */
422 *max_vf = abs (dist);
423 if (dump_enabled_p ())
424 dump_printf_loc (MSG_NOTE, vect_location,
425 "adjusting maximal vectorization factor to %i\n",
426 *max_vf);
427 }
428
429 if (abs (dist) >= *max_vf)
430 {
431 /* Dependence distance does not create dependence, as far as
432 vectorization is concerned, in this case. */
433 if (dump_enabled_p ())
434 dump_printf_loc (MSG_NOTE, vect_location,
435 "dependence distance >= VF.\n");
436 continue;
437 }
438
439 if (dump_enabled_p ())
440 {
441 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
442 "not vectorized, possible dependence "
443 "between data-refs ");
444 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
445 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
446 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
447 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
448 }
449
450 return true;
451 }
452
453 return false;
454 }
455
456 /* Function vect_analyze_data_ref_dependences.
457
458 Examine all the data references in the loop, and make sure there do not
459 exist any data dependences between them. Set *MAX_VF according to
460 the maximum vectorization factor the data dependences allow. */
461
462 bool
463 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo, int *max_vf)
464 {
465 unsigned int i;
466 struct data_dependence_relation *ddr;
467
468 if (dump_enabled_p ())
469 dump_printf_loc (MSG_NOTE, vect_location,
470 "=== vect_analyze_data_ref_dependences ===\n");
471
472 LOOP_VINFO_DDRS (loop_vinfo)
473 .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
474 * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
475 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
476 /* We need read-read dependences to compute STMT_VINFO_SAME_ALIGN_REFS. */
477 if (!compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
478 &LOOP_VINFO_DDRS (loop_vinfo),
479 LOOP_VINFO_LOOP_NEST (loop_vinfo), true))
480 return false;
481
482 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
483 if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf))
484 return false;
485
486 return true;
487 }
488
489
490 /* Function vect_slp_analyze_data_ref_dependence.
491
492 Return TRUE if there (might) exist a dependence between a memory-reference
493 DRA and a memory-reference DRB in DDR. Unlike the loop variant there is
494 no run-time versioning for alias here, so an unknown dependence simply
495 makes the check fail. */
496
497 static bool
498 vect_slp_analyze_data_ref_dependence (struct data_dependence_relation *ddr)
499 {
500 struct data_reference *dra = DDR_A (ddr);
501 struct data_reference *drb = DDR_B (ddr);
502
503 /* We need to check dependences of statements marked as unvectorizable
504 as well; they can still prohibit vectorization. */
505
506 /* Independent data accesses. */
507 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
508 return false;
509
510 if (dra == drb)
511 return false;
512
513 /* Read-read is OK. */
514 if (DR_IS_READ (dra) && DR_IS_READ (drb))
515 return false;
516
517 /* If dra and drb are part of the same interleaving chain consider
518 them independent. */
519 if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (DR_STMT (dra)))
520 && (GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dra)))
521 == GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (drb)))))
522 return false;
523
524 /* Unknown data dependence. */
525 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
526 {
527 if (dump_enabled_p ())
528 {
529 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
530 "can't determine dependence between ");
531 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
532 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
533 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
534 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
535 }
536 }
537 else if (dump_enabled_p ())
538 {
539 dump_printf_loc (MSG_NOTE, vect_location,
540 "determined dependence between ");
541 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
542 dump_printf (MSG_NOTE, " and ");
543 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
544 dump_printf (MSG_NOTE, "\n");
545 }
546
547 return true;
548 }
549
550
551 /* Analyze dependences involved in the transform of SLP NODE. STORES
552 contains the vector of scalar stores of this instance when we are
553 disambiguating the loads, and LAST_STORE is the store they are sunk to. */
554
555 static bool
556 vect_slp_analyze_node_dependences (slp_instance instance, slp_tree node,
557 vec<gimple *> stores, gimple *last_store)
558 {
559 /* This walks over all stmts involved in the SLP load/store done
560 in NODE verifying we can sink them up to the last stmt in the
561 group. */
562 gimple *last_access = vect_find_last_scalar_stmt_in_slp (node);
563 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
564 {
565 gimple *access = SLP_TREE_SCALAR_STMTS (node)[k];
566 if (access == last_access)
567 continue;
568 data_reference *dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (access));
569 for (gimple_stmt_iterator gsi = gsi_for_stmt (access);
570 gsi_stmt (gsi) != last_access; gsi_next (&gsi))
571 {
572 gimple *stmt = gsi_stmt (gsi);
573 if (! gimple_vuse (stmt)
574 || (DR_IS_READ (dr_a) && ! gimple_vdef (stmt)))
575 continue;
576
577 /* If we couldn't record a (single) data reference for this
578 stmt we have to give up. */
579 /* ??? Here and below if dependence analysis fails we can resort
580 to the alias oracle which can handle more kinds of stmts. */
581 data_reference *dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
582 if (!dr_b)
583 return false;
584
585 /* If we run into a store of this same instance (we've just
586 marked those) then delay dependence checking until we run
587 into the last store because this is where it will have
588 been sunk to (and we verify if we can do that as well). */
589 if (gimple_visited_p (stmt))
590 {
591 if (stmt != last_store)
592 continue;
593 unsigned i;
594 gimple *store;
595 FOR_EACH_VEC_ELT (stores, i, store)
596 {
597 data_reference *store_dr
598 = STMT_VINFO_DATA_REF (vinfo_for_stmt (store));
599 ddr_p ddr = initialize_data_dependence_relation
600 (dr_a, store_dr, vNULL);
601 if (vect_slp_analyze_data_ref_dependence (ddr))
602 {
603 free_dependence_relation (ddr);
604 return false;
605 }
606 free_dependence_relation (ddr);
607 }
608 }
609
610 ddr_p ddr = initialize_data_dependence_relation (dr_a, dr_b, vNULL);
611 if (vect_slp_analyze_data_ref_dependence (ddr))
612 {
613 free_dependence_relation (ddr);
614 return false;
615 }
616 free_dependence_relation (ddr);
617 }
618 }
619 return true;
620 }
621
622
623 /* Function vect_slp_analyze_instance_dependence.
624
625 Examine the data references of SLP instance INSTANCE and make sure no
626 data dependence prevents sinking its loads and stores to the place the
627 vectorized statements will be emitted. */
628
629 bool
630 vect_slp_analyze_instance_dependence (slp_instance instance)
631 {
632 if (dump_enabled_p ())
633 dump_printf_loc (MSG_NOTE, vect_location,
634 "=== vect_slp_analyze_instance_dependence ===\n");
635
636 /* The stores of this instance are at the root of the SLP tree. */
637 slp_tree store = SLP_INSTANCE_TREE (instance);
638 if (! STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (store)[0])))
639 store = NULL;
640
641 /* Verify we can sink stores to the vectorized stmt insert location. */
642 gimple *last_store = NULL;
643 if (store)
644 {
645 if (! vect_slp_analyze_node_dependences (instance, store, vNULL, NULL))
646 return false;
647
648 /* Mark stores in this instance and remember the last one. */
649 last_store = vect_find_last_scalar_stmt_in_slp (store);
650 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
651 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], true);
652 }
653
654 bool res = true;
655
656 /* Verify we can sink loads to the vectorized stmt insert location,
657 special-casing stores of this instance. */
658 slp_tree load;
659 unsigned int i;
660 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load)
661 if (! vect_slp_analyze_node_dependences (instance, load,
662 store
663 ? SLP_TREE_SCALAR_STMTS (store)
664 : vNULL, last_store))
665 {
666 res = false;
667 break;
668 }
669
670 /* Unset the visited flag. */
671 if (store)
672 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
673 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], false);
674
675 return res;
676 }
677
678 /* Function vect_compute_data_ref_alignment
679
680 Compute the misalignment of the data reference DR.
681
682 Output:
683 1. If during the misalignment computation it is found that the data reference
684 cannot be vectorized then false is returned.
685 2. DR_MISALIGNMENT (DR) is defined.
686
687 FOR NOW: No analysis is actually performed. Misalignment is calculated
688 only for trivial cases. TODO. */
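/* Editorial sketch of the trivial case handled below (assumed numbers):
   with a 16-byte vector type, a base address known to be aligned to at
   least 16 bytes (DR_ALIGNED_TO >= 16) and a constant DR_INIT of 4, the
   recorded misalignment is 4 mod 16 == 4; if DR_ALIGNED_TO is smaller
   than the vector alignment or no constant offset is known, the
   misalignment is left at -1 (unknown).  */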
689
690 bool
691 vect_compute_data_ref_alignment (struct data_reference *dr)
692 {
693 gimple *stmt = DR_STMT (dr);
694 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
695 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
696 struct loop *loop = NULL;
697 tree ref = DR_REF (dr);
698 tree vectype;
699 tree base, base_addr;
700 tree misalign = NULL_TREE;
701 tree aligned_to;
702 tree step;
703 unsigned HOST_WIDE_INT alignment;
704
705 if (dump_enabled_p ())
706 dump_printf_loc (MSG_NOTE, vect_location,
707 "vect_compute_data_ref_alignment:\n");
708
709 if (loop_vinfo)
710 loop = LOOP_VINFO_LOOP (loop_vinfo);
711
712 /* Initialize misalignment to unknown. */
713 SET_DR_MISALIGNMENT (dr, -1);
714
715 if (tree_fits_shwi_p (DR_STEP (dr)))
716 misalign = DR_INIT (dr);
717 aligned_to = DR_ALIGNED_TO (dr);
718 base_addr = DR_BASE_ADDRESS (dr);
719 vectype = STMT_VINFO_VECTYPE (stmt_info);
720
721 /* In case the dataref is in an inner-loop of the loop that is being
722 vectorized (LOOP), we use the base and misalignment information
723 relative to the outer-loop (LOOP). This is ok only if the misalignment
724 stays the same throughout the execution of the inner-loop, which is why
725 we have to check that the stride of the dataref in the inner-loop is a
726 multiple of the vector size. */
727 if (loop && nested_in_vect_loop_p (loop, stmt))
728 {
729 tree step = DR_STEP (dr);
730
731 if (tree_fits_shwi_p (step)
732 && tree_to_shwi (step) % GET_MODE_SIZE (TYPE_MODE (vectype)) == 0)
733 {
734 if (dump_enabled_p ())
735 dump_printf_loc (MSG_NOTE, vect_location,
736 "inner step divides the vector-size.\n");
737 misalign = STMT_VINFO_DR_INIT (stmt_info);
738 aligned_to = STMT_VINFO_DR_ALIGNED_TO (stmt_info);
739 base_addr = STMT_VINFO_DR_BASE_ADDRESS (stmt_info);
740 }
741 else
742 {
743 if (dump_enabled_p ())
744 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
745 "inner step doesn't divide the vector-size.\n");
746 misalign = NULL_TREE;
747 }
748 }
749
750 /* Similarly we can only use base and misalignment information relative to
751 an innermost loop if the misalignment stays the same throughout the
752 execution of the loop. This is the case if the stride of the dataref
753 times the vectorization factor is a multiple of the vector size. */
754 else
755 {
756 tree step = DR_STEP (dr);
757 unsigned vf = loop ? LOOP_VINFO_VECT_FACTOR (loop_vinfo) : 1;
758
759 if (tree_fits_shwi_p (step)
760 && ((tree_to_shwi (step) * vf)
761 % GET_MODE_SIZE (TYPE_MODE (vectype)) != 0))
762 {
763 if (dump_enabled_p ())
764 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
765 "step doesn't divide the vector-size.\n");
766 misalign = NULL_TREE;
767 }
768 }
769
770 /* To look at alignment of the base we have to preserve an inner MEM_REF
771 as that carries alignment information of the actual access. */
772 base = ref;
773 while (handled_component_p (base))
774 base = TREE_OPERAND (base, 0);
775 if (TREE_CODE (base) == MEM_REF)
776 base = build2 (MEM_REF, TREE_TYPE (base), base_addr,
777 build_int_cst (TREE_TYPE (TREE_OPERAND (base, 1)), 0));
778 unsigned int base_alignment = get_object_alignment (base);
779
780 if (base_alignment >= TYPE_ALIGN (TREE_TYPE (vectype)))
781 DR_VECT_AUX (dr)->base_element_aligned = true;
782
783 alignment = TYPE_ALIGN_UNIT (vectype);
784
785 if ((compare_tree_int (aligned_to, alignment) < 0)
786 || !misalign)
787 {
788 if (dump_enabled_p ())
789 {
790 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
791 "Unknown alignment for access: ");
792 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
793 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
794 }
795 return true;
796 }
797
798 if (base_alignment < TYPE_ALIGN (vectype))
799 {
800 /* Strip an inner MEM_REF to a bare decl if possible. */
801 if (TREE_CODE (base) == MEM_REF
802 && integer_zerop (TREE_OPERAND (base, 1))
803 && TREE_CODE (TREE_OPERAND (base, 0)) == ADDR_EXPR)
804 base = TREE_OPERAND (TREE_OPERAND (base, 0), 0);
805
806 if (!vect_can_force_dr_alignment_p (base, TYPE_ALIGN (vectype)))
807 {
808 if (dump_enabled_p ())
809 {
810 dump_printf_loc (MSG_NOTE, vect_location,
811 "can't force alignment of ref: ");
812 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
813 dump_printf (MSG_NOTE, "\n");
814 }
815 return true;
816 }
817
818 /* Force the alignment of the decl.
819 NOTE: This is the only change to the code we make during
820 the analysis phase, before deciding to vectorize the loop. */
821 if (dump_enabled_p ())
822 {
823 dump_printf_loc (MSG_NOTE, vect_location, "force alignment of ");
824 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
825 dump_printf (MSG_NOTE, "\n");
826 }
827
828 DR_VECT_AUX (dr)->base_decl = base;
829 DR_VECT_AUX (dr)->base_misaligned = true;
830 DR_VECT_AUX (dr)->base_element_aligned = true;
831 }
832
833 if (loop && nested_in_vect_loop_p (loop, stmt))
834 step = STMT_VINFO_DR_STEP (stmt_info);
835 else
836 step = DR_STEP (dr);
837 /* If this is a backward running DR then the first access in the larger
838 vectype is actually N-1 elements before the address in the DR.
839 Adjust misalign accordingly. */
840 if (tree_int_cst_sgn (step) < 0)
841 {
842 tree offset = ssize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1);
843 /* DR_STEP(dr) is the same as -TYPE_SIZE of the scalar type,
844 otherwise we wouldn't be here. */
845 offset = fold_build2 (MULT_EXPR, ssizetype, offset, step);
846 /* PLUS because STEP was negative. */
847 misalign = size_binop (PLUS_EXPR, misalign, offset);
848 }
849
850 SET_DR_MISALIGNMENT (dr,
851 wi::mod_floor (misalign, alignment, SIGNED).to_uhwi ());
852
853 if (dump_enabled_p ())
854 {
855 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
856 "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr));
857 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
858 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
859 }
860
861 return true;
862 }
863
864
865 /* Function vect_update_misalignment_for_peel
866
867 DR - the data reference whose misalignment is to be adjusted.
868 DR_PEEL - the data reference whose misalignment is being made
869 zero in the vector loop by the peel.
870 NPEEL - the number of iterations in the peel loop if the misalignment
871 of DR_PEEL is known at compile time. */
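/* Editorial example (assuming 16-byte vectors and 4-byte elements): if
   DR_PEEL has misalignment 8, peeling NPEEL = 2 iterations makes it
   aligned; a DR with the same step and known misalignment 12 is then
   updated to (12 + 2 * 4) & 15 == 4, while a DR whose misalignment is
   unknown stays unknown (-1).  */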
872
873 static void
874 vect_update_misalignment_for_peel (struct data_reference *dr,
875 struct data_reference *dr_peel, int npeel)
876 {
877 unsigned int i;
878 vec<dr_p> same_align_drs;
879 struct data_reference *current_dr;
880 int dr_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
881 int dr_peel_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr_peel))));
882 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
883 stmt_vec_info peel_stmt_info = vinfo_for_stmt (DR_STMT (dr_peel));
884
885 /* For interleaved data accesses the step in the loop must be multiplied by
886 the size of the interleaving group. */
887 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
888 dr_size *= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)));
889 if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info))
890 dr_peel_size *= GROUP_SIZE (peel_stmt_info);
891
892 /* It can be assumed that the data refs with the same alignment as dr_peel
893 are aligned in the vector loop. */
894 same_align_drs
895 = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (DR_STMT (dr_peel)));
896 FOR_EACH_VEC_ELT (same_align_drs, i, current_dr)
897 {
898 if (current_dr != dr)
899 continue;
900 gcc_assert (DR_MISALIGNMENT (dr) / dr_size ==
901 DR_MISALIGNMENT (dr_peel) / dr_peel_size);
902 SET_DR_MISALIGNMENT (dr, 0);
903 return;
904 }
905
906 if (known_alignment_for_access_p (dr)
907 && known_alignment_for_access_p (dr_peel))
908 {
909 bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
910 int misal = DR_MISALIGNMENT (dr);
911 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
912 misal += negative ? -npeel * dr_size : npeel * dr_size;
913 misal &= (TYPE_ALIGN (vectype) / BITS_PER_UNIT) - 1;
914 SET_DR_MISALIGNMENT (dr, misal);
915 return;
916 }
917
918 if (dump_enabled_p ())
919 dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment to -1.\n");
920 SET_DR_MISALIGNMENT (dr, -1);
921 }
922
923
924 /* Function verify_data_ref_alignment
925
926 Return TRUE if DR can be handled with respect to alignment. */
927
928 static bool
929 verify_data_ref_alignment (data_reference_p dr)
930 {
931 enum dr_alignment_support supportable_dr_alignment
932 = vect_supportable_dr_alignment (dr, false);
933 if (!supportable_dr_alignment)
934 {
935 if (dump_enabled_p ())
936 {
937 if (DR_IS_READ (dr))
938 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
939 "not vectorized: unsupported unaligned load.");
940 else
941 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
942 "not vectorized: unsupported unaligned "
943 "store.");
944
945 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
946 DR_REF (dr));
947 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
948 }
949 return false;
950 }
951
952 if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
953 dump_printf_loc (MSG_NOTE, vect_location,
954 "Vectorizing an unaligned access.\n");
955
956 return true;
957 }
958
959 /* Function vect_verify_datarefs_alignment
960
961 Return TRUE if all data references in the loop can be
962 handled with respect to alignment. */
963
964 bool
965 vect_verify_datarefs_alignment (loop_vec_info vinfo)
966 {
967 vec<data_reference_p> datarefs = vinfo->datarefs;
968 struct data_reference *dr;
969 unsigned int i;
970
971 FOR_EACH_VEC_ELT (datarefs, i, dr)
972 {
973 gimple *stmt = DR_STMT (dr);
974 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
975
976 if (!STMT_VINFO_RELEVANT_P (stmt_info))
977 continue;
978
979 /* For interleaving, only the alignment of the first access matters. */
980 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
981 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
982 continue;
983
984 /* Strided accesses perform only component accesses, alignment is
985 irrelevant for them. */
986 if (STMT_VINFO_STRIDED_P (stmt_info)
987 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
988 continue;
989
990 if (! verify_data_ref_alignment (dr))
991 return false;
992 }
993
994 return true;
995 }
996
997 /* Given a memory reference EXP, return whether its alignment is less
998 than its size. */
999
1000 static bool
1001 not_size_aligned (tree exp)
1002 {
1003 if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
1004 return true;
1005
1006 return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
1007 > get_object_alignment (exp));
1008 }
1009
1010 /* Function vector_alignment_reachable_p
1011
1012 Return true if vector alignment for DR is reachable by peeling
1013 a few loop iterations. Return false otherwise. */
1014
1015 static bool
1016 vector_alignment_reachable_p (struct data_reference *dr)
1017 {
1018 gimple *stmt = DR_STMT (dr);
1019 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1020 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1021
1022 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1023 {
1024 /* For interleaved access we peel only if the number of iterations in
1025 the prologue loop ({VF - misalignment}) is a multiple of the
1026 number of interleaved accesses. */
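      /* Editorial example: with V4SI (4 elements), a known misalignment of
         one element and GROUP_SIZE 2, the prologue would need 3 scalar
         iterations; 3 is not a multiple of 2, so peeling cannot align this
         interleaved access and we give up below.  */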
1027 int elem_size, mis_in_elements;
1028 int nelements = TYPE_VECTOR_SUBPARTS (vectype);
1029
1030 /* FORNOW: handle only known alignment. */
1031 if (!known_alignment_for_access_p (dr))
1032 return false;
1033
1034 elem_size = GET_MODE_SIZE (TYPE_MODE (vectype)) / nelements;
1035 mis_in_elements = DR_MISALIGNMENT (dr) / elem_size;
1036
1037 if ((nelements - mis_in_elements) % GROUP_SIZE (stmt_info))
1038 return false;
1039 }
1040
1041 /* If misalignment is known at compile time then allow peeling
1042 only if natural alignment is reachable through peeling. */
1043 if (known_alignment_for_access_p (dr) && !aligned_access_p (dr))
1044 {
1045 HOST_WIDE_INT elmsize =
1046 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1047 if (dump_enabled_p ())
1048 {
1049 dump_printf_loc (MSG_NOTE, vect_location,
1050 "data size =" HOST_WIDE_INT_PRINT_DEC, elmsize);
1051 dump_printf (MSG_NOTE,
1052 ". misalignment = %d.\n", DR_MISALIGNMENT (dr));
1053 }
1054 if (DR_MISALIGNMENT (dr) % elmsize)
1055 {
1056 if (dump_enabled_p ())
1057 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1058 "data size does not divide the misalignment.\n");
1059 return false;
1060 }
1061 }
1062
1063 if (!known_alignment_for_access_p (dr))
1064 {
1065 tree type = TREE_TYPE (DR_REF (dr));
1066 bool is_packed = not_size_aligned (DR_REF (dr));
1067 if (dump_enabled_p ())
1068 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1069 "Unknown misalignment, is_packed = %d\n",is_packed);
1070 if ((TYPE_USER_ALIGN (type) && !is_packed)
1071 || targetm.vectorize.vector_alignment_reachable (type, is_packed))
1072 return true;
1073 else
1074 return false;
1075 }
1076
1077 return true;
1078 }
1079
1080
1081 /* Calculate the cost of the memory access represented by DR. */
1082
1083 static void
1084 vect_get_data_access_cost (struct data_reference *dr,
1085 unsigned int *inside_cost,
1086 unsigned int *outside_cost,
1087 stmt_vector_for_cost *body_cost_vec)
1088 {
1089 gimple *stmt = DR_STMT (dr);
1090 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1091 int nunits = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
1092 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1093 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1094 int ncopies = vf / nunits;
1095
1096 if (DR_IS_READ (dr))
1097 vect_get_load_cost (dr, ncopies, true, inside_cost, outside_cost,
1098 NULL, body_cost_vec, false);
1099 else
1100 vect_get_store_cost (dr, ncopies, inside_cost, body_cost_vec);
1101
1102 if (dump_enabled_p ())
1103 dump_printf_loc (MSG_NOTE, vect_location,
1104 "vect_get_data_access_cost: inside_cost = %d, "
1105 "outside_cost = %d.\n", *inside_cost, *outside_cost);
1106 }
1107
1108
1109 typedef struct _vect_peel_info
1110 {
1111 int npeel;
1112 struct data_reference *dr;
1113 unsigned int count;
1114 } *vect_peel_info;
1115
1116 typedef struct _vect_peel_extended_info
1117 {
1118 struct _vect_peel_info peel_info;
1119 unsigned int inside_cost;
1120 unsigned int outside_cost;
1121 stmt_vector_for_cost body_cost_vec;
1122 } *vect_peel_extended_info;
1123
1124
1125 /* Peeling hashtable helpers. */
1126
1127 struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
1128 {
1129 static inline hashval_t hash (const _vect_peel_info *);
1130 static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
1131 };
1132
1133 inline hashval_t
1134 peel_info_hasher::hash (const _vect_peel_info *peel_info)
1135 {
1136 return (hashval_t) peel_info->npeel;
1137 }
1138
1139 inline bool
1140 peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
1141 {
1142 return (a->npeel == b->npeel);
1143 }
1144
1145
1146 /* Insert DR into peeling hash table with NPEEL as key. */
1147
1148 static void
1149 vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
1150 loop_vec_info loop_vinfo, struct data_reference *dr,
1151 int npeel)
1152 {
1153 struct _vect_peel_info elem, *slot;
1154 _vect_peel_info **new_slot;
1155 bool supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1156
1157 elem.npeel = npeel;
1158 slot = peeling_htab->find (&elem);
1159 if (slot)
1160 slot->count++;
1161 else
1162 {
1163 slot = XNEW (struct _vect_peel_info);
1164 slot->npeel = npeel;
1165 slot->dr = dr;
1166 slot->count = 1;
1167 new_slot = peeling_htab->find_slot (slot, INSERT);
1168 *new_slot = slot;
1169 }
1170
1171 if (!supportable_dr_alignment
1172 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1173 slot->count += VECT_MAX_COST;
1174 }
1175
1176
1177 /* Traverse the peeling hash table to find the peeling option that aligns
1178 the maximum number of data accesses. */
1179
1180 int
1181 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1182 _vect_peel_extended_info *max)
1183 {
1184 vect_peel_info elem = *slot;
1185
1186 if (elem->count > max->peel_info.count
1187 || (elem->count == max->peel_info.count
1188 && max->peel_info.npeel > elem->npeel))
1189 {
1190 max->peel_info.npeel = elem->npeel;
1191 max->peel_info.count = elem->count;
1192 max->peel_info.dr = elem->dr;
1193 }
1194
1195 return 1;
1196 }
1197
1198
1199 /* Traverse peeling hash table and calculate cost for each peeling option.
1200 Find the one with the lowest cost. */
1201
1202 int
1203 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1204 _vect_peel_extended_info *min)
1205 {
1206 vect_peel_info elem = *slot;
1207 int save_misalignment, dummy;
1208 unsigned int inside_cost = 0, outside_cost = 0, i;
1209 gimple *stmt = DR_STMT (elem->dr);
1210 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1211 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1212 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1213 struct data_reference *dr;
1214 stmt_vector_for_cost prologue_cost_vec, body_cost_vec, epilogue_cost_vec;
1215
1216 prologue_cost_vec.create (2);
1217 body_cost_vec.create (2);
1218 epilogue_cost_vec.create (2);
1219
1220 FOR_EACH_VEC_ELT (datarefs, i, dr)
1221 {
1222 stmt = DR_STMT (dr);
1223 stmt_info = vinfo_for_stmt (stmt);
1224 /* For interleaving, only the alignment of the first access
1225 matters. */
1226 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1227 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1228 continue;
1229
1230 /* Strided accesses perform only component accesses, alignment is
1231 irrelevant for them. */
1232 if (STMT_VINFO_STRIDED_P (stmt_info)
1233 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1234 continue;
1235
1236 save_misalignment = DR_MISALIGNMENT (dr);
1237 vect_update_misalignment_for_peel (dr, elem->dr, elem->npeel);
1238 vect_get_data_access_cost (dr, &inside_cost, &outside_cost,
1239 &body_cost_vec);
1240 SET_DR_MISALIGNMENT (dr, save_misalignment);
1241 }
1242
1243 outside_cost += vect_get_known_peeling_cost
1244 (loop_vinfo, elem->npeel, &dummy,
1245 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1246 &prologue_cost_vec, &epilogue_cost_vec);
1247
1248 /* Prologue and epilogue costs are added to the target model later.
1249 These costs depend only on the scalar iteration cost, the
1250 number of peeling iterations finally chosen, and the number of
1251 misaligned statements. So discard the information found here. */
1252 prologue_cost_vec.release ();
1253 epilogue_cost_vec.release ();
1254
1255 if (inside_cost < min->inside_cost
1256 || (inside_cost == min->inside_cost && outside_cost < min->outside_cost))
1257 {
1258 min->inside_cost = inside_cost;
1259 min->outside_cost = outside_cost;
1260 min->body_cost_vec.release ();
1261 min->body_cost_vec = body_cost_vec;
1262 min->peel_info.dr = elem->dr;
1263 min->peel_info.npeel = elem->npeel;
1264 }
1265 else
1266 body_cost_vec.release ();
1267
1268 return 1;
1269 }
1270
1271
1272 /* Choose best peeling option by traversing peeling hash table and either
1273 choosing an option with the lowest cost (if cost model is enabled) or the
1274 option that aligns as many accesses as possible. */
1275
1276 static struct data_reference *
1277 vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
1278 loop_vec_info loop_vinfo,
1279 unsigned int *npeel,
1280 stmt_vector_for_cost *body_cost_vec)
1281 {
1282 struct _vect_peel_extended_info res;
1283
1284 res.peel_info.dr = NULL;
1285 res.body_cost_vec = stmt_vector_for_cost ();
1286
1287 if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1288 {
1289 res.inside_cost = INT_MAX;
1290 res.outside_cost = INT_MAX;
1291 peeling_htab->traverse <_vect_peel_extended_info *,
1292 vect_peeling_hash_get_lowest_cost> (&res);
1293 }
1294 else
1295 {
1296 res.peel_info.count = 0;
1297 peeling_htab->traverse <_vect_peel_extended_info *,
1298 vect_peeling_hash_get_most_frequent> (&res);
1299 }
1300
1301 *npeel = res.peel_info.npeel;
1302 *body_cost_vec = res.body_cost_vec;
1303 return res.peel_info.dr;
1304 }
1305
1306
1307 /* Function vect_enhance_data_refs_alignment
1308
1309 This pass will use loop versioning and loop peeling in order to enhance
1310 the alignment of data references in the loop.
1311
1312 FOR NOW: we assume that whatever versioning/peeling takes place, only the
1313 original loop is to be vectorized. Any other loops that are created by
1314 the transformations performed in this pass are not supposed to be
1315 vectorized. This restriction will be relaxed.
1316
1317 This pass will require a cost model to guide it on whether to apply peeling
1318 or versioning or a combination of the two. For example, the scheme that
1319 Intel uses when given a loop with several memory accesses is as follows:
1320 choose one memory access ('p') whose alignment you want to force by doing
1321 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
1322 other accesses are not necessarily aligned, or (2) use loop versioning to
1323 generate one loop in which all accesses are aligned, and another loop in
1324 which only 'p' is necessarily aligned.
1325
1326 ("Automatic Intra-Register Vectorization for the Intel Architecture",
1327 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Xinmin Tian, International
1328 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1329
1330 Devising a cost model is the most critical aspect of this work. It will
1331 guide us on which access to peel for, whether to use loop versioning, how
1332 many versions to create, etc. The cost model will probably consist of
1333 generic considerations as well as target specific considerations (on
1334 powerpc for example, misaligned stores are more painful than misaligned
1335 loads).
1336
1337 Here are the general steps involved in alignment enhancements:
1338
1339 -- original loop, before alignment analysis:
1340 for (i=0; i<N; i++){
1341 x = q[i]; # DR_MISALIGNMENT(q) = unknown
1342 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1343 }
1344
1345 -- After vect_compute_data_refs_alignment:
1346 for (i=0; i<N; i++){
1347 x = q[i]; # DR_MISALIGNMENT(q) = 3
1348 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1349 }
1350
1351 -- Possibility 1: we do loop versioning:
1352 if (p is aligned) {
1353 for (i=0; i<N; i++){ # loop 1A
1354 x = q[i]; # DR_MISALIGNMENT(q) = 3
1355 p[i] = y; # DR_MISALIGNMENT(p) = 0
1356 }
1357 }
1358 else {
1359 for (i=0; i<N; i++){ # loop 1B
1360 x = q[i]; # DR_MISALIGNMENT(q) = 3
1361 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1362 }
1363 }
1364
1365 -- Possibility 2: we do loop peeling:
1366 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1367 x = q[i];
1368 p[i] = y;
1369 }
1370 for (i = 3; i < N; i++){ # loop 2A
1371 x = q[i]; # DR_MISALIGNMENT(q) = 0
1372 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1373 }
1374
1375 -- Possibility 3: combination of loop peeling and versioning:
1376 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1377 x = q[i];
1378 p[i] = y;
1379 }
1380 if (p is aligned) {
1381 for (i = 3; i<N; i++){ # loop 3A
1382 x = q[i]; # DR_MISALIGNMENT(q) = 0
1383 p[i] = y; # DR_MISALIGNMENT(p) = 0
1384 }
1385 }
1386 else {
1387 for (i = 3; i<N; i++){ # loop 3B
1388 x = q[i]; # DR_MISALIGNMENT(q) = 0
1389 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1390 }
1391 }
1392
1393 These loops are later passed to loop_transform to be vectorized. The
1394 vectorizer will use the alignment information to guide the transformation
1395 (whether to generate regular loads/stores, or with special handling for
1396 misalignment). */
1397
1398 bool
1399 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1400 {
1401 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1402 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1403 enum dr_alignment_support supportable_dr_alignment;
1404 struct data_reference *dr0 = NULL, *first_store = NULL;
1405 struct data_reference *dr;
1406 unsigned int i, j;
1407 bool do_peeling = false;
1408 bool do_versioning = false;
1409 bool stat;
1410 gimple *stmt;
1411 stmt_vec_info stmt_info;
1412 unsigned int npeel = 0;
1413 bool all_misalignments_unknown = true;
1414 unsigned int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1415 unsigned possible_npeel_number = 1;
1416 tree vectype;
1417 unsigned int nelements, mis, same_align_drs_max = 0;
1418 stmt_vector_for_cost body_cost_vec = stmt_vector_for_cost ();
1419 hash_table<peel_info_hasher> peeling_htab (1);
1420
1421 if (dump_enabled_p ())
1422 dump_printf_loc (MSG_NOTE, vect_location,
1423 "=== vect_enhance_data_refs_alignment ===\n");
1424
1425 /* Reset data so we can safely be called multiple times. */
1426 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1427 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
1428
1429 /* While cost model enhancements are expected in the future, the high level
1430 view of the code at this time is as follows:
1431
1432 A) If there is a misaligned access then see if peeling to align
1433 this access can make all data references satisfy
1434 vect_supportable_dr_alignment. If so, update data structures
1435 as needed and return true.
1436
1437 B) If peeling wasn't possible and there is a data reference with an
1438 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1439 then see if loop versioning checks can be used to make all data
1440 references satisfy vect_supportable_dr_alignment. If so, update
1441 data structures as needed and return true.
1442
1443 C) If neither peeling nor versioning were successful then return false if
1444 any data reference does not satisfy vect_supportable_dr_alignment.
1445
1446 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1447
1448 Note, Possibility 3 above (which is peeling and versioning together) is not
1449 being done at this time. */
1450
1451 /* (1) Peeling to force alignment. */
1452
1453 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1454 Considerations:
1455 + How many accesses will become aligned due to the peeling
1456 - How many accesses will become unaligned due to the peeling,
1457 and the cost of misaligned accesses.
1458 - The cost of peeling (the extra runtime checks, the increase
1459 in code size). */
1460
1461 FOR_EACH_VEC_ELT (datarefs, i, dr)
1462 {
1463 stmt = DR_STMT (dr);
1464 stmt_info = vinfo_for_stmt (stmt);
1465
1466 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1467 continue;
1468
1469 /* For interleaving, only the alignment of the first access
1470 matters. */
1471 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1472 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1473 continue;
1474
1475 /* For invariant accesses there is nothing to enhance. */
1476 if (integer_zerop (DR_STEP (dr)))
1477 continue;
1478
1479 /* Strided accesses perform only component accesses, alignment is
1480 irrelevant for them. */
1481 if (STMT_VINFO_STRIDED_P (stmt_info)
1482 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1483 continue;
1484
1485 supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1486 do_peeling = vector_alignment_reachable_p (dr);
1487 if (do_peeling)
1488 {
1489 if (known_alignment_for_access_p (dr))
1490 {
1491 unsigned int npeel_tmp;
1492 bool negative = tree_int_cst_compare (DR_STEP (dr),
1493 size_zero_node) < 0;
1494
1495 /* Save info about DR in the hash table. */
1496 vectype = STMT_VINFO_VECTYPE (stmt_info);
1497 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1498 mis = DR_MISALIGNMENT (dr) / GET_MODE_SIZE (TYPE_MODE (
1499 TREE_TYPE (DR_REF (dr))));
1500 npeel_tmp = (negative
1501 ? (mis - nelements) : (nelements - mis))
1502 & (nelements - 1);
1503
1504 /* For multiple types, it is possible that the bigger type access
1505 will have more than one peeling option. E.g., a loop with two
1506 types: one of size (vector size / 4), and the other one of
1507 size (vector size / 8). The vectorization factor will be 8. If both
1508 accesses are misaligned by 3, the first one needs one scalar
1509 iteration to be aligned, and the second one needs 5. But the
1510 first one will also be aligned by peeling 5 scalar
1511 iterations, and in that case both accesses will be aligned.
1512 Hence, except for the immediate peeling amount, we also want
1513 to try adding a full vector size, as long as we don't exceed
1514 the vectorization factor.
1515 We do this automatically when using the cost model, since we
1516 calculate the cost for every peeling option. */
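              /* Editorial note spelling out the example above: with
                 nelements 4 and 8 and both accesses misaligned by 3
                 elements, the immediate peeling amounts are 1 and 5;
                 the loop below also records 5 (= 1 + 4) for the
                 4-element (larger type) access, so a single npeel of 5
                 aligns both.  */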
1517 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1518 {
1519 if (STMT_SLP_TYPE (stmt_info))
1520 possible_npeel_number
1521 = (vf * GROUP_SIZE (stmt_info)) / nelements;
1522 else
1523 possible_npeel_number = vf / nelements;
1524 }
1525
1526 /* Handle the aligned case. We may decide to align some other
1527 access, making DR unaligned. */
1528 if (DR_MISALIGNMENT (dr) == 0)
1529 {
1530 npeel_tmp = 0;
1531 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1532 possible_npeel_number++;
1533 }
1534
1535 for (j = 0; j < possible_npeel_number; j++)
1536 {
1537 vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
1538 dr, npeel_tmp);
1539 npeel_tmp += nelements;
1540 }
1541
1542 all_misalignments_unknown = false;
1543 /* Data-ref that was chosen for the case that all the
1544 misalignments are unknown is not relevant anymore, since we
1545 have a data-ref with known alignment. */
1546 dr0 = NULL;
1547 }
1548 else
1549 {
1550 /* If we don't know any misalignment values, we prefer
1551 peeling for the data-ref that has the maximum number of data-refs
1552 with the same alignment, unless the target prefers to align
1553 stores over loads. */
1554 if (all_misalignments_unknown)
1555 {
1556 unsigned same_align_drs
1557 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1558 if (!dr0
1559 || same_align_drs_max < same_align_drs)
1560 {
1561 same_align_drs_max = same_align_drs;
1562 dr0 = dr;
1563 }
1564 /* For data-refs with the same number of related
1565 accesses prefer the one where the misalign
1566 computation will be invariant in the outermost loop. */
1567 else if (same_align_drs_max == same_align_drs)
1568 {
1569 struct loop *ivloop0, *ivloop;
1570 ivloop0 = outermost_invariant_loop_for_expr
1571 (loop, DR_BASE_ADDRESS (dr0));
1572 ivloop = outermost_invariant_loop_for_expr
1573 (loop, DR_BASE_ADDRESS (dr));
1574 if ((ivloop && !ivloop0)
1575 || (ivloop && ivloop0
1576 && flow_loop_nested_p (ivloop, ivloop0)))
1577 dr0 = dr;
1578 }
1579
1580 if (!first_store && DR_IS_WRITE (dr))
1581 first_store = dr;
1582 }
1583
1584 /* If there are both known and unknown misaligned accesses in the
1585 loop, we choose peeling amount according to the known
1586 accesses. */
1587 if (!supportable_dr_alignment)
1588 {
1589 dr0 = dr;
1590 if (!first_store && DR_IS_WRITE (dr))
1591 first_store = dr;
1592 }
1593 }
1594 }
1595 else
1596 {
1597 if (!aligned_access_p (dr))
1598 {
1599 if (dump_enabled_p ())
1600 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1601 "vector alignment may not be reachable\n");
1602 break;
1603 }
1604 }
1605 }
1606
1607 /* Check if we can possibly peel the loop. */
1608 if (!vect_can_advance_ivs_p (loop_vinfo)
1609 || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))
1610 || loop->inner)
1611 do_peeling = false;
1612
1613 if (do_peeling
1614 && all_misalignments_unknown
1615 && vect_supportable_dr_alignment (dr0, false))
1616 {
1617 /* Check if the target prefers aligning stores over loads, i.e., if
1618 misaligned stores are more expensive than misaligned loads (taking
1619 drs with the same alignment into account). */
1620 if (first_store && DR_IS_READ (dr0))
1621 {
1622 unsigned int load_inside_cost = 0, load_outside_cost = 0;
1623 unsigned int store_inside_cost = 0, store_outside_cost = 0;
1624 unsigned int load_inside_penalty = 0, load_outside_penalty = 0;
1625 unsigned int store_inside_penalty = 0, store_outside_penalty = 0;
1626 stmt_vector_for_cost dummy;
1627 dummy.create (2);
1628
1629 vect_get_data_access_cost (dr0, &load_inside_cost, &load_outside_cost,
1630 &dummy);
1631 vect_get_data_access_cost (first_store, &store_inside_cost,
1632 &store_outside_cost, &dummy);
1633
1634 dummy.release ();
1635
1636 /* Calculate the penalty for leaving FIRST_STORE unaligned (by
1637 aligning the load DR0). */
1638 load_inside_penalty = store_inside_cost;
1639 load_outside_penalty = store_outside_cost;
1640 for (i = 0;
1641 STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1642 DR_STMT (first_store))).iterate (i, &dr);
1643 i++)
1644 if (DR_IS_READ (dr))
1645 {
1646 load_inside_penalty += load_inside_cost;
1647 load_outside_penalty += load_outside_cost;
1648 }
1649 else
1650 {
1651 load_inside_penalty += store_inside_cost;
1652 load_outside_penalty += store_outside_cost;
1653 }
1654
1655 /* Calculate the penalty for leaving DR0 unaligned (by
1656 aligning the FIRST_STORE). */
1657 store_inside_penalty = load_inside_cost;
1658 store_outside_penalty = load_outside_cost;
1659 for (i = 0;
1660 STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1661 DR_STMT (dr0))).iterate (i, &dr);
1662 i++)
1663 if (DR_IS_READ (dr))
1664 {
1665 store_inside_penalty += load_inside_cost;
1666 store_outside_penalty += load_outside_cost;
1667 }
1668 else
1669 {
1670 store_inside_penalty += store_inside_cost;
1671 store_outside_penalty += store_outside_cost;
1672 }
1673
1674 if (load_inside_penalty > store_inside_penalty
1675 || (load_inside_penalty == store_inside_penalty
1676 && load_outside_penalty > store_outside_penalty))
1677 dr0 = first_store;
1678 }
1679
1680 /* In case there are only loads with different unknown misalignments, use
1681 peeling only if it may help to align other accesses in the loop or
1682 if it may help improve load bandwidth when we'd end up using
1683 unaligned loads. */
1684 tree dr0_vt = STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr0)));
1685 if (!first_store
1686 && !STMT_VINFO_SAME_ALIGN_REFS (
1687 vinfo_for_stmt (DR_STMT (dr0))).length ()
1688 && (vect_supportable_dr_alignment (dr0, false)
1689 != dr_unaligned_supported
1690 || (builtin_vectorization_cost (vector_load, dr0_vt, 0)
1691 == builtin_vectorization_cost (unaligned_load, dr0_vt, -1))))
1692 do_peeling = false;
1693 }
1694
1695 if (do_peeling && !dr0)
1696 {
1697 /* Peeling is possible, but there is no data access that is supported
1698 only when aligned. So we try to choose the best possible peeling. */
1699
1700 /* We should get here only if there are drs with known misalignment. */
1701 gcc_assert (!all_misalignments_unknown);
1702
1703 /* Choose the best peeling from the hash table. */
1704 dr0 = vect_peeling_hash_choose_best_peeling (&peeling_htab,
1705 loop_vinfo, &npeel,
1706 &body_cost_vec);
1707 if (!dr0 || !npeel)
1708 do_peeling = false;
1709 }
1710
1711 if (do_peeling)
1712 {
1713 stmt = DR_STMT (dr0);
1714 stmt_info = vinfo_for_stmt (stmt);
1715 vectype = STMT_VINFO_VECTYPE (stmt_info);
1716 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1717
1718 if (known_alignment_for_access_p (dr0))
1719 {
1720 bool negative = tree_int_cst_compare (DR_STEP (dr0),
1721 size_zero_node) < 0;
1722 if (!npeel)
1723 {
1724 /* Since it's known at compile time, compute the number of
1725 iterations in the peeled loop (the peeling factor) for use in
1726 updating DR_MISALIGNMENT values. The peeling factor is the
1727 vectorization factor minus the misalignment as an element
1728 count. */
1729 mis = DR_MISALIGNMENT (dr0);
1730 mis /= GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr0))));
1731 npeel = ((negative ? mis - nelements : nelements - mis)
1732 & (nelements - 1));
1733 }
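/* Illustrative example of the computation above (hedged, not from the
   original source): with a V4SI vectype NELEMENTS is 4; if
   DR_MISALIGNMENT (dr0) is 8 bytes and the element size is 4, then
   mis = 2 elements and, for a positive step,
   npeel = (4 - 2) & 3 = 2 peeled iterations.  */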
1734
1735 /* For interleaved data access every iteration accesses all the
1736 members of the group, therefore we divide the number of iterations
1737 by the group size. */
1738 stmt_info = vinfo_for_stmt (DR_STMT (dr0));
1739 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1740 npeel /= GROUP_SIZE (stmt_info);
1741
1742 if (dump_enabled_p ())
1743 dump_printf_loc (MSG_NOTE, vect_location,
1744 "Try peeling by %d\n", npeel);
1745 }
1746
1747 /* Ensure that all data refs can be vectorized after the peel. */
1748 FOR_EACH_VEC_ELT (datarefs, i, dr)
1749 {
1750 int save_misalignment;
1751
1752 if (dr == dr0)
1753 continue;
1754
1755 stmt = DR_STMT (dr);
1756 stmt_info = vinfo_for_stmt (stmt);
1757 /* For interleaving, only the alignment of the first access
1758 matters. */
1759 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1760 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1761 continue;
1762
1763 /* Strided accesses perform only component accesses, alignment is
1764 irrelevant for them. */
1765 if (STMT_VINFO_STRIDED_P (stmt_info)
1766 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1767 continue;
1768
1769 save_misalignment = DR_MISALIGNMENT (dr);
1770 vect_update_misalignment_for_peel (dr, dr0, npeel);
1771 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1772 SET_DR_MISALIGNMENT (dr, save_misalignment);
1773
1774 if (!supportable_dr_alignment)
1775 {
1776 do_peeling = false;
1777 break;
1778 }
1779 }
1780
1781 if (do_peeling && known_alignment_for_access_p (dr0) && npeel == 0)
1782 {
1783 stat = vect_verify_datarefs_alignment (loop_vinfo);
1784 if (!stat)
1785 do_peeling = false;
1786 else
1787 {
1788 body_cost_vec.release ();
1789 return stat;
1790 }
1791 }
1792
1793 /* Cost model #1 - honor --param vect-max-peeling-for-alignment. */
1794 if (do_peeling)
1795 {
1796 unsigned max_allowed_peel
1797 = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT);
1798 if (max_allowed_peel != (unsigned)-1)
1799 {
1800 unsigned max_peel = npeel;
1801 if (max_peel == 0)
1802 {
1803 gimple *dr_stmt = DR_STMT (dr0);
1804 stmt_vec_info vinfo = vinfo_for_stmt (dr_stmt);
1805 tree vtype = STMT_VINFO_VECTYPE (vinfo);
1806 max_peel = TYPE_VECTOR_SUBPARTS (vtype) - 1;
1807 }
1808 if (max_peel > max_allowed_peel)
1809 {
1810 do_peeling = false;
1811 if (dump_enabled_p ())
1812 dump_printf_loc (MSG_NOTE, vect_location,
1813 "Disable peeling, max peels reached: %d\n", max_peel);
1814 }
1815 }
1816 }
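/* For example (illustrative): with a V4SI vectype and npeel unknown at
   compile time, max_peel is 3, so --param vect-max-peeling-for-alignment=2
   would disable peeling here.  */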
1817
1818 /* Cost model #2 - if peeling may result in a remaining loop not
1819 iterating enough to be vectorized then do not peel. */
1820 if (do_peeling
1821 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1822 {
1823 unsigned max_peel
1824 = npeel == 0 ? LOOP_VINFO_VECT_FACTOR (loop_vinfo) - 1 : npeel;
1825 if (LOOP_VINFO_INT_NITERS (loop_vinfo)
1826 < LOOP_VINFO_VECT_FACTOR (loop_vinfo) + max_peel)
1827 do_peeling = false;
1828 }
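/* For example (illustrative): with a vectorization factor of 4 and npeel
   unknown, max_peel is 3, so a loop known to run fewer than 4 + 3 = 7
   iterations is not peeled.  */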
1829
1830 if (do_peeling)
1831 {
1832 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
1833 If the misalignment of DR_i is identical to that of dr0 then set
1834 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
1835 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
1836 by the peeling factor times the element size of DR_i (MOD the
1837 vectorization factor times the size). Otherwise, the
1838 misalignment of DR_i must be set to unknown. */
1839 FOR_EACH_VEC_ELT (datarefs, i, dr)
1840 if (dr != dr0)
1841 {
1842 /* Strided accesses perform only component accesses, alignment
1843 is irrelevant for them. */
1844 stmt_info = vinfo_for_stmt (DR_STMT (dr));
1845 if (STMT_VINFO_STRIDED_P (stmt_info)
1846 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1847 continue;
1848
1849 vect_update_misalignment_for_peel (dr, dr0, npeel);
1850 }
1851
1852 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0;
1853 if (npeel)
1854 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
1855 else
1856 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1857 = DR_MISALIGNMENT (dr0);
1858 SET_DR_MISALIGNMENT (dr0, 0);
1859 if (dump_enabled_p ())
1860 {
1861 dump_printf_loc (MSG_NOTE, vect_location,
1862 "Alignment of access forced using peeling.\n");
1863 dump_printf_loc (MSG_NOTE, vect_location,
1864 "Peeling for alignment will be applied.\n");
1865 }
1866 /* The inside-loop cost will be accounted for in vectorizable_load
1867 and vectorizable_store correctly with adjusted alignments.
1868 Drop the body_cost_vec on the floor here. */
1869 body_cost_vec.release ();
1870
1871 stat = vect_verify_datarefs_alignment (loop_vinfo);
1872 gcc_assert (stat);
1873 return stat;
1874 }
1875 }
1876
1877 body_cost_vec.release ();
1878
1879 /* (2) Versioning to force alignment. */
1880
1881 /* Try versioning if:
1882 1) optimize loop for speed
1883 2) there is at least one unsupported misaligned data ref with an unknown
1884 misalignment, and
1885 3) all misaligned data refs with a known misalignment are supported, and
1886 4) the number of runtime alignment checks is within reason. */
1887
1888 do_versioning =
1889 optimize_loop_nest_for_speed_p (loop)
1890 && (!loop->inner); /* FORNOW */
1891
1892 if (do_versioning)
1893 {
1894 FOR_EACH_VEC_ELT (datarefs, i, dr)
1895 {
1896 stmt = DR_STMT (dr);
1897 stmt_info = vinfo_for_stmt (stmt);
1898
1899 /* For interleaving, only the alignment of the first access
1900 matters. */
1901 if (aligned_access_p (dr)
1902 || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1903 && GROUP_FIRST_ELEMENT (stmt_info) != stmt))
1904 continue;
1905
1906 if (STMT_VINFO_STRIDED_P (stmt_info))
1907 {
1908 /* Strided loads perform only component accesses, alignment is
1909 irrelevant for them. */
1910 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
1911 continue;
1912 do_versioning = false;
1913 break;
1914 }
1915
1916 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1917
1918 if (!supportable_dr_alignment)
1919 {
1920 gimple *stmt;
1921 int mask;
1922 tree vectype;
1923
1924 if (known_alignment_for_access_p (dr)
1925 || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
1926 >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
1927 {
1928 do_versioning = false;
1929 break;
1930 }
1931
1932 stmt = DR_STMT (dr);
1933 vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
1934 gcc_assert (vectype);
1935
1936 /* The rightmost bits of an aligned address must be zeros.
1937 Construct the mask needed for this test. For example,
1938 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
1939 mask must be 15 = 0xf. */
1940 mask = GET_MODE_SIZE (TYPE_MODE (vectype)) - 1;
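/* Illustrative sketch of the runtime test this mask feeds (built later,
   e.g. by vect_create_cond_for_align_checks): the addresses of all
   may-misalign refs are OR-ed together and the aligned loop version is
   taken only if ((addr_1 | addr_2 | ...) & mask) == 0.  */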
1941
1942 /* FORNOW: use the same mask to test all potentially unaligned
1943 references in the loop. The vectorizer currently supports
1944 a single vector size, see the reference to
1945 GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
1946 vectorization factor is computed. */
1947 gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
1948 || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
1949 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
1950 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (
1951 DR_STMT (dr));
1952 }
1953 }
1954
1955 /* Versioning requires at least one misaligned data reference. */
1956 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
1957 do_versioning = false;
1958 else if (!do_versioning)
1959 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1960 }
1961
1962 if (do_versioning)
1963 {
1964 vec<gimple *> may_misalign_stmts
1965 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
1966 gimple *stmt;
1967
1968 /* It can now be assumed that the data references in the statements
1969 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
1970 of the loop being vectorized. */
1971 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt)
1972 {
1973 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1974 dr = STMT_VINFO_DATA_REF (stmt_info);
1975 SET_DR_MISALIGNMENT (dr, 0);
1976 if (dump_enabled_p ())
1977 dump_printf_loc (MSG_NOTE, vect_location,
1978 "Alignment of access forced using versioning.\n");
1979 }
1980
1981 if (dump_enabled_p ())
1982 dump_printf_loc (MSG_NOTE, vect_location,
1983 "Versioning for alignment will be applied.\n");
1984
1985 /* Peeling and versioning can't be done together at this time. */
1986 gcc_assert (! (do_peeling && do_versioning));
1987
1988 stat = vect_verify_datarefs_alignment (loop_vinfo);
1989 gcc_assert (stat);
1990 return stat;
1991 }
1992
1993 /* This point is reached if neither peeling nor versioning is being done. */
1994 gcc_assert (! (do_peeling || do_versioning));
1995
1996 stat = vect_verify_datarefs_alignment (loop_vinfo);
1997 return stat;
1998 }
1999
2000
2001 /* Function vect_find_same_alignment_drs.
2002
2003 Update group and alignment relations according to the chosen
2004 vectorization factor. */
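/* For example (illustrative): with a vectorization factor of 4, int
   accesses a[i] and a[i+4] have dependence distance 4; since 4 % 4 == 0
   and the element sizes match, both refs are recorded as having the same
   alignment.  */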
2005
2006 static void
2007 vect_find_same_alignment_drs (struct data_dependence_relation *ddr,
2008 loop_vec_info loop_vinfo)
2009 {
2010 unsigned int i;
2011 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2012 int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2013 struct data_reference *dra = DDR_A (ddr);
2014 struct data_reference *drb = DDR_B (ddr);
2015 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2016 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2017 int dra_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dra))));
2018 int drb_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (drb))));
2019 lambda_vector dist_v;
2020 unsigned int loop_depth;
2021
2022 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
2023 return;
2024
2025 if (dra == drb)
2026 return;
2027
2028 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
2029 return;
2030
2031 /* Loop-based vectorization and known data dependence. */
2032 if (DDR_NUM_DIST_VECTS (ddr) == 0)
2033 return;
2034
2035 /* Data-dependence analysis reports a distance vector of zero
2036 for data-references that overlap only in the first iteration
2037 but have steps of different sign (see PR45764).
2038 So as a sanity check require equal DR_STEP. */
2039 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2040 return;
2041
2042 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
2043 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
2044 {
2045 int dist = dist_v[loop_depth];
2046
2047 if (dump_enabled_p ())
2048 dump_printf_loc (MSG_NOTE, vect_location,
2049 "dependence distance = %d.\n", dist);
2050
2051 /* Same loop iteration. */
2052 if (dist == 0
2053 || (dist % vectorization_factor == 0 && dra_size == drb_size))
2054 {
2055 /* Two references with distance zero have the same alignment. */
2056 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
2057 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
2058 if (dump_enabled_p ())
2059 {
2060 dump_printf_loc (MSG_NOTE, vect_location,
2061 "accesses have the same alignment.\n");
2062 dump_printf (MSG_NOTE,
2063 "dependence distance modulo vf == 0 between ");
2064 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2065 dump_printf (MSG_NOTE, " and ");
2066 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2067 dump_printf (MSG_NOTE, "\n");
2068 }
2069 }
2070 }
2071 }
2072
2073
2074 /* Function vect_analyze_data_refs_alignment
2075
2076 Analyze the alignment of the data-references in the loop.
2077 Return FALSE if a data reference is found that cannot be vectorized. */
2078
2079 bool
2080 vect_analyze_data_refs_alignment (loop_vec_info vinfo)
2081 {
2082 if (dump_enabled_p ())
2083 dump_printf_loc (MSG_NOTE, vect_location,
2084 "=== vect_analyze_data_refs_alignment ===\n");
2085
2086 /* Mark groups of data references with same alignment using
2087 data dependence information. */
2088 vec<ddr_p> ddrs = vinfo->ddrs;
2089 struct data_dependence_relation *ddr;
2090 unsigned int i;
2091
2092 FOR_EACH_VEC_ELT (ddrs, i, ddr)
2093 vect_find_same_alignment_drs (ddr, vinfo);
2094
2095 vec<data_reference_p> datarefs = vinfo->datarefs;
2096 struct data_reference *dr;
2097
2098 FOR_EACH_VEC_ELT (datarefs, i, dr)
2099 {
2100 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
2101 if (STMT_VINFO_VECTORIZABLE (stmt_info)
2102 && !vect_compute_data_ref_alignment (dr))
2103 {
2104 /* Strided accesses perform only component accesses, misalignment
2105 information is irrelevant for them. */
2106 if (STMT_VINFO_STRIDED_P (stmt_info)
2107 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2108 continue;
2109
2110 if (dump_enabled_p ())
2111 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2112 "not vectorized: can't calculate alignment "
2113 "for data ref.\n");
2114
2115 return false;
2116 }
2117 }
2118
2119 return true;
2120 }
2121
2122
2123 /* Analyze alignment of DRs of stmts in NODE. */
2124
2125 static bool
2126 vect_slp_analyze_and_verify_node_alignment (slp_tree node)
2127 {
2128 /* We vectorize from the first scalar stmt in the node unless
2129 the node is permuted, in which case we start from the first
2130 element in the group. */
2131 gimple *first_stmt = SLP_TREE_SCALAR_STMTS (node)[0];
2132 data_reference_p first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2133 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2134 first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (first_stmt));
2135
2136 data_reference_p dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2137 if (! vect_compute_data_ref_alignment (dr)
2138 /* For creating the data-ref pointer we need alignment of the
2139 first element anyway. */
2140 || (dr != first_dr
2141 && ! vect_compute_data_ref_alignment (first_dr))
2142 || ! verify_data_ref_alignment (dr))
2143 {
2144 if (dump_enabled_p ())
2145 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2146 "not vectorized: bad data alignment in basic "
2147 "block.\n");
2148 return false;
2149 }
2150
2151 return true;
2152 }
2153
2154 /* Function vect_slp_analyze_instance_alignment
2155
2156 Analyze the alignment of the data-references in the SLP instance.
2157 Return FALSE if a data reference is found that cannot be vectorized. */
2158
2159 bool
2160 vect_slp_analyze_and_verify_instance_alignment (slp_instance instance)
2161 {
2162 if (dump_enabled_p ())
2163 dump_printf_loc (MSG_NOTE, vect_location,
2164 "=== vect_slp_analyze_and_verify_instance_alignment ===\n");
2165
2166 slp_tree node;
2167 unsigned i;
2168 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
2169 if (! vect_slp_analyze_and_verify_node_alignment (node))
2170 return false;
2171
2172 node = SLP_INSTANCE_TREE (instance);
2173 if (STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]))
2174 && ! vect_slp_analyze_and_verify_node_alignment
2175 (SLP_INSTANCE_TREE (instance)))
2176 return false;
2177
2178 return true;
2179 }
2180
2181
2182 /* Analyze groups of accesses: check that DR belongs to a group of
2183 accesses of legal size, step, etc. Detect gaps, single element
2184 interleaving, and other special cases. Set grouped access info.
2185 Collect groups of strided stores for further use in SLP analysis.
2186 Worker for vect_analyze_group_access. */
2187
2188 static bool
2189 vect_analyze_group_access_1 (struct data_reference *dr)
2190 {
2191 tree step = DR_STEP (dr);
2192 tree scalar_type = TREE_TYPE (DR_REF (dr));
2193 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2194 gimple *stmt = DR_STMT (dr);
2195 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2196 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2197 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2198 HOST_WIDE_INT dr_step = -1;
2199 HOST_WIDE_INT groupsize, last_accessed_element = 1;
2200 bool slp_impossible = false;
2201
2202 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2203 size of the interleaving group (including gaps). */
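/* For example (illustrative): two int loads a[4*i] and a[4*i + 2] both
   have STEP 16 and a type size of 4, so GROUPSIZE is 4 even though only
   two of the four slots are accessed; the remaining slots are gaps.  */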
2204 if (tree_fits_shwi_p (step))
2205 {
2206 dr_step = tree_to_shwi (step);
2207 /* Check that STEP is a multiple of type size. Otherwise there is
2208 a non-element-sized gap at the end of the group which we
2209 cannot represent in GROUP_GAP or GROUP_SIZE.
2210 ??? As we can handle non-constant step fine here we should
2211 simply remove uses of GROUP_GAP between the last and first
2212 element and instead rely on DR_STEP. GROUP_SIZE then would
2213 simply not include that gap. */
2214 if ((dr_step % type_size) != 0)
2215 {
2216 if (dump_enabled_p ())
2217 {
2218 dump_printf_loc (MSG_NOTE, vect_location,
2219 "Step ");
2220 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2221 dump_printf (MSG_NOTE,
2222 " is not a multiple of the element size for ");
2223 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2224 dump_printf (MSG_NOTE, "\n");
2225 }
2226 return false;
2227 }
2228 groupsize = absu_hwi (dr_step) / type_size;
2229 }
2230 else
2231 groupsize = 0;
2232
2233 /* A non-consecutive access is possible only if it is part of an interleaving group. */
2234 if (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
2235 {
2236 /* Check if this DR is part of an interleaving group, and is a single
2237 element of the group that is accessed in the loop. */
2238
2239 /* Gaps are supported only for loads. STEP must be a multiple of the type
2240 size. The size of the group must be a power of 2. */
2241 if (DR_IS_READ (dr)
2242 && (dr_step % type_size) == 0
2243 && groupsize > 0
2244 && exact_log2 (groupsize) != -1)
2245 {
2246 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = stmt;
2247 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2248 GROUP_GAP (stmt_info) = groupsize - 1;
2249 if (dump_enabled_p ())
2250 {
2251 dump_printf_loc (MSG_NOTE, vect_location,
2252 "Detected single element interleaving ");
2253 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2254 dump_printf (MSG_NOTE, " step ");
2255 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2256 dump_printf (MSG_NOTE, "\n");
2257 }
2258
2259 return true;
2260 }
2261
2262 if (dump_enabled_p ())
2263 {
2264 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2265 "not consecutive access ");
2266 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2267 }
2268
2269 if (bb_vinfo)
2270 {
2271 /* Mark the statement as unvectorizable. */
2272 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2273 return true;
2274 }
2275
2276 dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
2277 STMT_VINFO_STRIDED_P (stmt_info) = true;
2278 return true;
2279 }
2280
2281 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt)
2282 {
2283 /* First stmt in the interleaving chain. Check the chain. */
2284 gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
2285 struct data_reference *data_ref = dr;
2286 unsigned int count = 1;
2287 tree prev_init = DR_INIT (data_ref);
2288 gimple *prev = stmt;
2289 HOST_WIDE_INT diff, gaps = 0;
2290
2291 while (next)
2292 {
2293 /* Skip same data-refs. If two or more stmts share a
2294 data-ref (supported only for loads), we vectorize only the first
2295 stmt, and the rest get their vectorized loads from the first
2296 one. */
2297 if (!tree_int_cst_compare (DR_INIT (data_ref),
2298 DR_INIT (STMT_VINFO_DATA_REF (
2299 vinfo_for_stmt (next)))))
2300 {
2301 if (DR_IS_WRITE (data_ref))
2302 {
2303 if (dump_enabled_p ())
2304 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2305 "Two store stmts share the same dr.\n");
2306 return false;
2307 }
2308
2309 if (dump_enabled_p ())
2310 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2311 "Two or more load stmts share the same dr.\n");
2312
2313 /* For load use the same data-ref load. */
2314 GROUP_SAME_DR_STMT (vinfo_for_stmt (next)) = prev;
2315
2316 prev = next;
2317 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2318 continue;
2319 }
2320
2321 prev = next;
2322 data_ref = STMT_VINFO_DATA_REF (vinfo_for_stmt (next));
2323
2324 /* All group members have the same STEP by construction. */
2325 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2326
2327 /* Check that the distance between two accesses is equal to the type
2328 size. Otherwise, we have gaps. */
2329 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2330 - TREE_INT_CST_LOW (prev_init)) / type_size;
2331 if (diff != 1)
2332 {
2333 /* FORNOW: SLP of accesses with gaps is not supported. */
2334 slp_impossible = true;
2335 if (DR_IS_WRITE (data_ref))
2336 {
2337 if (dump_enabled_p ())
2338 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2339 "interleaved store with gaps\n");
2340 return false;
2341 }
2342
2343 gaps += diff - 1;
2344 }
2345
2346 last_accessed_element += diff;
2347
2348 /* Store the gap from the previous member of the group. If there is no
2349 gap in the access, GROUP_GAP is always 1. */
2350 GROUP_GAP (vinfo_for_stmt (next)) = diff;
2351
2352 prev_init = DR_INIT (data_ref);
2353 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2354 /* Count the number of data-refs in the chain. */
2355 count++;
2356 }
2357
2358 if (groupsize == 0)
2359 groupsize = count + gaps;
2360
2361 if (groupsize > UINT_MAX)
2362 {
2363 if (dump_enabled_p ())
2364 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2365 "group is too large\n");
2366 return false;
2367 }
2368
2369 /* Check that the size of the interleaving is equal to count for stores,
2370 i.e., that there are no gaps. */
2371 if (groupsize != count
2372 && !DR_IS_READ (dr))
2373 {
2374 if (dump_enabled_p ())
2375 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2376 "interleaved store with gaps\n");
2377 return false;
2378 }
2379
2380 /* If there is a gap after the last load in the group it is the
2381 difference between the groupsize and the last accessed
2382 element.
2383 When there is no gap, this difference should be 0. */
2384 GROUP_GAP (vinfo_for_stmt (stmt)) = groupsize - last_accessed_element;
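/* For example (illustrative): in a group of GROUPSIZE 4 where only
   elements 0 and 1 are loaded, LAST_ACCESSED_ELEMENT is 2 and the
   trailing gap recorded above is 4 - 2 = 2.  */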
2385
2386 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2387 if (dump_enabled_p ())
2388 {
2389 dump_printf_loc (MSG_NOTE, vect_location,
2390 "Detected interleaving ");
2391 if (DR_IS_READ (dr))
2392 dump_printf (MSG_NOTE, "load ");
2393 else
2394 dump_printf (MSG_NOTE, "store ");
2395 dump_printf (MSG_NOTE, "of size %u starting with ",
2396 (unsigned)groupsize);
2397 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
2398 if (GROUP_GAP (vinfo_for_stmt (stmt)) != 0)
2399 dump_printf_loc (MSG_NOTE, vect_location,
2400 "There is a gap of %u elements after the group\n",
2401 GROUP_GAP (vinfo_for_stmt (stmt)));
2402 }
2403
2404 /* SLP: create an SLP data structure for every interleaving group of
2405 stores for further analysis in vect_analyze_slp. */
2406 if (DR_IS_WRITE (dr) && !slp_impossible)
2407 {
2408 if (loop_vinfo)
2409 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt);
2410 if (bb_vinfo)
2411 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt);
2412 }
2413 }
2414
2415 return true;
2416 }
2417
2418 /* Analyze groups of accesses: check that DR belongs to a group of
2419 accesses of legal size, step, etc. Detect gaps, single element
2420 interleaving, and other special cases. Set grouped access info.
2421 Collect groups of strided stores for further use in SLP analysis. */
2422
2423 static bool
2424 vect_analyze_group_access (struct data_reference *dr)
2425 {
2426 if (!vect_analyze_group_access_1 (dr))
2427 {
2428 /* Dissolve the group if present. */
2429 gimple *next;
2430 gimple *stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dr)));
2431 while (stmt)
2432 {
2433 stmt_vec_info vinfo = vinfo_for_stmt (stmt);
2434 next = GROUP_NEXT_ELEMENT (vinfo);
2435 GROUP_FIRST_ELEMENT (vinfo) = NULL;
2436 GROUP_NEXT_ELEMENT (vinfo) = NULL;
2437 stmt = next;
2438 }
2439 return false;
2440 }
2441 return true;
2442 }
2443
2444 /* Analyze the access pattern of the data-reference DR.
2445 In case of non-consecutive accesses call vect_analyze_group_access() to
2446 analyze groups of accesses. */
2447
2448 static bool
2449 vect_analyze_data_ref_access (struct data_reference *dr)
2450 {
2451 tree step = DR_STEP (dr);
2452 tree scalar_type = TREE_TYPE (DR_REF (dr));
2453 gimple *stmt = DR_STMT (dr);
2454 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2455 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2456 struct loop *loop = NULL;
2457
2458 if (loop_vinfo)
2459 loop = LOOP_VINFO_LOOP (loop_vinfo);
2460
2461 if (loop_vinfo && !step)
2462 {
2463 if (dump_enabled_p ())
2464 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2465 "bad data-ref access in loop\n");
2466 return false;
2467 }
2468
2469 /* Allow loads with zero step in inner-loop vectorization. */
2470 if (loop_vinfo && integer_zerop (step))
2471 {
2472 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2473 if (!nested_in_vect_loop_p (loop, stmt))
2474 return DR_IS_READ (dr);
2475 /* Allow references with zero step for outer loops marked
2476 with pragma omp simd only - it guarantees absence of
2477 loop-carried dependencies between inner loop iterations. */
2478 if (!loop->force_vectorize)
2479 {
2480 if (dump_enabled_p ())
2481 dump_printf_loc (MSG_NOTE, vect_location,
2482 "zero step in inner loop of nest\n");
2483 return false;
2484 }
2485 }
2486
2487 if (loop && nested_in_vect_loop_p (loop, stmt))
2488 {
2489 /* Interleaved accesses are not yet supported within outer-loop
2490 vectorization for references in the inner-loop. */
2491 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2492
2493 /* For the rest of the analysis we use the outer-loop step. */
2494 step = STMT_VINFO_DR_STEP (stmt_info);
2495 if (integer_zerop (step))
2496 {
2497 if (dump_enabled_p ())
2498 dump_printf_loc (MSG_NOTE, vect_location,
2499 "zero step in outer loop.\n");
2500 return DR_IS_READ (dr);
2501 }
2502 }
2503
2504 /* Consecutive? */
2505 if (TREE_CODE (step) == INTEGER_CST)
2506 {
2507 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2508 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2509 || (dr_step < 0
2510 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2511 {
2512 /* Mark that it is not interleaving. */
2513 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2514 return true;
2515 }
2516 }
2517
2518 if (loop && nested_in_vect_loop_p (loop, stmt))
2519 {
2520 if (dump_enabled_p ())
2521 dump_printf_loc (MSG_NOTE, vect_location,
2522 "grouped access in outer loop.\n");
2523 return false;
2524 }
2525
2526
2527 /* Assume this is a DR handled by non-constant strided load case. */
2528 if (TREE_CODE (step) != INTEGER_CST)
2529 return (STMT_VINFO_STRIDED_P (stmt_info)
2530 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2531 || vect_analyze_group_access (dr)));
2532
2533 /* Not a consecutive access - check if it is part of an interleaving group. */
2534 return vect_analyze_group_access (dr);
2535 }
2536
2537
2538
2539 /* A helper function used in the comparator function to sort data
2540 references. T1 and T2 are two trees to be compared. The function
2541 returns -1, 0, or 1. */
2542
2543 static int
2544 compare_tree (tree t1, tree t2)
2545 {
2546 int i, cmp;
2547 enum tree_code code;
2548 char tclass;
2549
2550 if (t1 == t2)
2551 return 0;
2552 if (t1 == NULL)
2553 return -1;
2554 if (t2 == NULL)
2555 return 1;
2556
2557 STRIP_NOPS (t1);
2558 STRIP_NOPS (t2);
2559
2560 if (TREE_CODE (t1) != TREE_CODE (t2))
2561 return TREE_CODE (t1) < TREE_CODE (t2) ? -1 : 1;
2562
2563 code = TREE_CODE (t1);
2564 switch (code)
2565 {
2566 /* For const values, we can just use hash values for comparisons. */
2567 case INTEGER_CST:
2568 case REAL_CST:
2569 case FIXED_CST:
2570 case STRING_CST:
2571 case COMPLEX_CST:
2572 case VECTOR_CST:
2573 {
2574 hashval_t h1 = iterative_hash_expr (t1, 0);
2575 hashval_t h2 = iterative_hash_expr (t2, 0);
2576 if (h1 != h2)
2577 return h1 < h2 ? -1 : 1;
2578 break;
2579 }
2580
2581 case SSA_NAME:
2582 cmp = compare_tree (SSA_NAME_VAR (t1), SSA_NAME_VAR (t2));
2583 if (cmp != 0)
2584 return cmp;
2585
2586 if (SSA_NAME_VERSION (t1) != SSA_NAME_VERSION (t2))
2587 return SSA_NAME_VERSION (t1) < SSA_NAME_VERSION (t2) ? -1 : 1;
2588 break;
2589
2590 default:
2591 tclass = TREE_CODE_CLASS (code);
2592
2593 /* For var-decl, we could compare their UIDs. */
2594 if (tclass == tcc_declaration)
2595 {
2596 if (DECL_UID (t1) != DECL_UID (t2))
2597 return DECL_UID (t1) < DECL_UID (t2) ? -1 : 1;
2598 break;
2599 }
2600
2601 /* For expressions with operands, compare their operands recursively. */
2602 for (i = TREE_OPERAND_LENGTH (t1) - 1; i >= 0; --i)
2603 {
2604 cmp = compare_tree (TREE_OPERAND (t1, i), TREE_OPERAND (t2, i));
2605 if (cmp != 0)
2606 return cmp;
2607 }
2608 }
2609
2610 return 0;
2611 }
2612
2613
2614 /* Compare two data-references DRA and DRB to sort them into chunks
2615 suitable for grouping. */
2616
2617 static int
2618 dr_group_sort_cmp (const void *dra_, const void *drb_)
2619 {
2620 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2621 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2622 int cmp;
2623
2624 /* Stabilize sort. */
2625 if (dra == drb)
2626 return 0;
2627
2628 /* DRs in different loops never belong to the same group. */
2629 loop_p loopa = gimple_bb (DR_STMT (dra))->loop_father;
2630 loop_p loopb = gimple_bb (DR_STMT (drb))->loop_father;
2631 if (loopa != loopb)
2632 return loopa->num < loopb->num ? -1 : 1;
2633
2634 /* Ordering of DRs according to base. */
2635 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0))
2636 {
2637 cmp = compare_tree (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb));
2638 if (cmp != 0)
2639 return cmp;
2640 }
2641
2642 /* And according to DR_OFFSET. */
2643 if (!dr_equal_offsets_p (dra, drb))
2644 {
2645 cmp = compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2646 if (cmp != 0)
2647 return cmp;
2648 }
2649
2650 /* Put reads before writes. */
2651 if (DR_IS_READ (dra) != DR_IS_READ (drb))
2652 return DR_IS_READ (dra) ? -1 : 1;
2653
2654 /* Then sort by access size. */
2655 if (!operand_equal_p (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2656 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))), 0))
2657 {
2658 cmp = compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2659 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2660 if (cmp != 0)
2661 return cmp;
2662 }
2663
2664 /* And by step. */
2665 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2666 {
2667 cmp = compare_tree (DR_STEP (dra), DR_STEP (drb));
2668 if (cmp != 0)
2669 return cmp;
2670 }
2671
2672 /* Then sort by DR_INIT. In case of identical DRs sort by stmt UID. */
2673 cmp = tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb));
2674 if (cmp == 0)
2675 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2676 return cmp;
2677 }
2678
2679 /* Function vect_analyze_data_ref_accesses.
2680
2681 Analyze the access pattern of all the data references in the loop.
2682
2683 FORNOW: the only access pattern that is considered vectorizable is a
2684 simple step 1 (consecutive) access.
2685
2686 FORNOW: handle only arrays and pointer accesses. */
2687
2688 bool
2689 vect_analyze_data_ref_accesses (vec_info *vinfo)
2690 {
2691 unsigned int i;
2692 vec<data_reference_p> datarefs = vinfo->datarefs;
2693 struct data_reference *dr;
2694
2695 if (dump_enabled_p ())
2696 dump_printf_loc (MSG_NOTE, vect_location,
2697 "=== vect_analyze_data_ref_accesses ===\n");
2698
2699 if (datarefs.is_empty ())
2700 return true;
2701
2702 /* Sort the array of datarefs to make building the interleaving chains
2703 linear. Don't modify the original vector's order; it is needed for
2704 determining what dependencies are reversed. */
2705 vec<data_reference_p> datarefs_copy = datarefs.copy ();
2706 datarefs_copy.qsort (dr_group_sort_cmp);
2707
2708 /* Build the interleaving chains. */
2709 for (i = 0; i < datarefs_copy.length () - 1;)
2710 {
2711 data_reference_p dra = datarefs_copy[i];
2712 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2713 stmt_vec_info lastinfo = NULL;
2714 for (i = i + 1; i < datarefs_copy.length (); ++i)
2715 {
2716 data_reference_p drb = datarefs_copy[i];
2717 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2718
2719 /* ??? Imperfect sorting (non-compatible types, non-modulo
2720 accesses, same accesses) can lead to a group being artificially
2721 split here as we don't just skip over those. If it really
2722 matters we can push those to a worklist and re-iterate
2723 over them. Then we can just skip ahead to the next DR here. */
2724
2725 /* DRs in a different loop should not be put into the same
2726 interleaving group. */
2727 if (gimple_bb (DR_STMT (dra))->loop_father
2728 != gimple_bb (DR_STMT (drb))->loop_father)
2729 break;
2730
2731 /* Check that the data-refs have same first location (except init)
2732 and they are both either store or load (not load and store,
2733 not masked loads or stores). */
2734 if (DR_IS_READ (dra) != DR_IS_READ (drb)
2735 || !operand_equal_p (DR_BASE_ADDRESS (dra),
2736 DR_BASE_ADDRESS (drb), 0)
2737 || !dr_equal_offsets_p (dra, drb)
2738 || !gimple_assign_single_p (DR_STMT (dra))
2739 || !gimple_assign_single_p (DR_STMT (drb)))
2740 break;
2741
2742 /* Check that the data-refs have the same constant size. */
2743 tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
2744 tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
2745 if (!tree_fits_uhwi_p (sza)
2746 || !tree_fits_uhwi_p (szb)
2747 || !tree_int_cst_equal (sza, szb))
2748 break;
2749
2750 /* Check that the data-refs have the same step. */
2751 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2752 break;
2753
2754 /* Do not place the same access in the interleaving chain twice. */
2755 if (tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb)) == 0)
2756 break;
2757
2758 /* Check the types are compatible.
2759 ??? We don't distinguish this during sorting. */
2760 if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
2761 TREE_TYPE (DR_REF (drb))))
2762 break;
2763
2764 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
2765 HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
2766 HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
2767 gcc_assert (init_a <= init_b);
2768
2769 /* If init_b == init_a + the size of the type * k, we have an
2770 interleaving, and DRA is accessed before DRB. */
2771 HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
2772 if (type_size_a == 0
2773 || (init_b - init_a) % type_size_a != 0)
2774 break;
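/* For example (illustrative): int refs with init_a = 0 and init_b = 8
   and type_size_a = 4 pass this check ((8 - 0) % 4 == 0), so DRB may
   join DRA's interleaving group two elements further on.  */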
2775
2776 /* If we have a store, the accesses are adjacent. This splits
2777 groups into chunks we support (we don't support vectorization
2778 of stores with gaps). */
2779 if (!DR_IS_READ (dra)
2780 && (init_b - (HOST_WIDE_INT) TREE_INT_CST_LOW
2781 (DR_INIT (datarefs_copy[i-1]))
2782 != type_size_a))
2783 break;
2784
2785 /* If the step (when constant and nonzero) is not greater than the
2786 difference between the data-refs' inits, split the group here;
2787 this keeps the groups at suitable sizes. */
2788 if (tree_fits_shwi_p (DR_STEP (dra)))
2789 {
2790 HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra));
2791 if (step != 0 && step <= (init_b - init_a))
2792 break;
2793 }
2794
2795 if (dump_enabled_p ())
2796 {
2797 dump_printf_loc (MSG_NOTE, vect_location,
2798 "Detected interleaving ");
2799 if (DR_IS_READ (dra))
2800 dump_printf (MSG_NOTE, "load ");
2801 else
2802 dump_printf (MSG_NOTE, "store ");
2803 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2804 dump_printf (MSG_NOTE, " and ");
2805 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2806 dump_printf (MSG_NOTE, "\n");
2807 }
2808
2809 /* Link the found element into the group list. */
2810 if (!GROUP_FIRST_ELEMENT (stmtinfo_a))
2811 {
2812 GROUP_FIRST_ELEMENT (stmtinfo_a) = DR_STMT (dra);
2813 lastinfo = stmtinfo_a;
2814 }
2815 GROUP_FIRST_ELEMENT (stmtinfo_b) = DR_STMT (dra);
2816 GROUP_NEXT_ELEMENT (lastinfo) = DR_STMT (drb);
2817 lastinfo = stmtinfo_b;
2818 }
2819 }
2820
2821 FOR_EACH_VEC_ELT (datarefs_copy, i, dr)
2822 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
2823 && !vect_analyze_data_ref_access (dr))
2824 {
2825 if (dump_enabled_p ())
2826 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2827 "not vectorized: complicated access pattern.\n");
2828
2829 if (is_a <bb_vec_info> (vinfo))
2830 {
2831 /* Mark the statement as not vectorizable. */
2832 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2833 continue;
2834 }
2835 else
2836 {
2837 datarefs_copy.release ();
2838 return false;
2839 }
2840 }
2841
2842 datarefs_copy.release ();
2843 return true;
2844 }
2845
2846
2847 /* Operator == between two dr_with_seg_len objects.
2848
2849 This equality operator is used to make sure two data refs
2850 are the same one so that we will consider to combine the
2851 aliasing checks of those two pairs of data dependent data
2852 refs. */
2853
2854 static bool
2855 operator == (const dr_with_seg_len& d1,
2856 const dr_with_seg_len& d2)
2857 {
2858 return operand_equal_p (DR_BASE_ADDRESS (d1.dr),
2859 DR_BASE_ADDRESS (d2.dr), 0)
2860 && compare_tree (DR_OFFSET (d1.dr), DR_OFFSET (d2.dr)) == 0
2861 && compare_tree (DR_INIT (d1.dr), DR_INIT (d2.dr)) == 0
2862 && compare_tree (d1.seg_len, d2.seg_len) == 0;
2863 }
2864
2865 /* Function comp_dr_with_seg_len_pair.
2866
2867 Comparison function for sorting objects of dr_with_seg_len_pair_t
2868 so that we can combine aliasing checks in one scan. */
2869
2870 static int
2871 comp_dr_with_seg_len_pair (const void *pa_, const void *pb_)
2872 {
2873 const dr_with_seg_len_pair_t* pa = (const dr_with_seg_len_pair_t *) pa_;
2874 const dr_with_seg_len_pair_t* pb = (const dr_with_seg_len_pair_t *) pb_;
2875 const dr_with_seg_len &a1 = pa->first, &a2 = pa->second;
2876 const dr_with_seg_len &b1 = pb->first, &b2 = pb->second;
2877
2878 /* For DR pairs (a, b) and (c, d), we only consider merging the alias checks
2879 if a and c have the same base address and step, and b and d have the same
2880 address and step. Therefore, if either a&c or b&d don't have the same address
2881 and step, we don't care about the order of those two pairs after sorting. */
2882 int comp_res;
2883
2884 if ((comp_res = compare_tree (DR_BASE_ADDRESS (a1.dr),
2885 DR_BASE_ADDRESS (b1.dr))) != 0)
2886 return comp_res;
2887 if ((comp_res = compare_tree (DR_BASE_ADDRESS (a2.dr),
2888 DR_BASE_ADDRESS (b2.dr))) != 0)
2889 return comp_res;
2890 if ((comp_res = compare_tree (DR_STEP (a1.dr), DR_STEP (b1.dr))) != 0)
2891 return comp_res;
2892 if ((comp_res = compare_tree (DR_STEP (a2.dr), DR_STEP (b2.dr))) != 0)
2893 return comp_res;
2894 if ((comp_res = compare_tree (DR_OFFSET (a1.dr), DR_OFFSET (b1.dr))) != 0)
2895 return comp_res;
2896 if ((comp_res = compare_tree (DR_INIT (a1.dr), DR_INIT (b1.dr))) != 0)
2897 return comp_res;
2898 if ((comp_res = compare_tree (DR_OFFSET (a2.dr), DR_OFFSET (b2.dr))) != 0)
2899 return comp_res;
2900 if ((comp_res = compare_tree (DR_INIT (a2.dr), DR_INIT (b2.dr))) != 0)
2901 return comp_res;
2902
2903 return 0;
2904 }
2905
2906 /* Function vect_vfa_segment_size.
2907
2908 Create an expression that computes the size of the segment
2909 that will be accessed for a data reference. The function takes into
2910 account that realignment loads may access one more vector.
2911
2912 Input:
2913 DR: The data reference.
2914 LENGTH_FACTOR: segment length to consider.
2915
2916 Return an expression whose value is the size of segment which will be
2917 accessed by DR. */
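/* For example (illustrative): an int access with DR_STEP 4 and
   LENGTH_FACTOR equal to a vectorization factor of 4 gets a segment
   length of 16 bytes; a dr_explicit_realign_optimized access gets one
   extra vector size added on top of that.  */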
2918
2919 static tree
2920 vect_vfa_segment_size (struct data_reference *dr, tree length_factor)
2921 {
2922 tree segment_length;
2923
2924 if (integer_zerop (DR_STEP (dr)))
2925 segment_length = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
2926 else
2927 segment_length = size_binop (MULT_EXPR,
2928 fold_convert (sizetype, DR_STEP (dr)),
2929 fold_convert (sizetype, length_factor));
2930
2931 if (vect_supportable_dr_alignment (dr, false)
2932 == dr_explicit_realign_optimized)
2933 {
2934 tree vector_size = TYPE_SIZE_UNIT
2935 (STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr))));
2936
2937 segment_length = size_binop (PLUS_EXPR, segment_length, vector_size);
2938 }
2939 return segment_length;
2940 }
2941
2942 /* Function vect_no_alias_p.
2943
2944 Given data references A and B with equal base and offset, whose alias
2945 relation can be decided at compilation time, return TRUE if they do
2946 not alias each other; return FALSE otherwise. SEGMENT_LENGTH_A
2947 and SEGMENT_LENGTH_B are the memory lengths accessed by A and B
2948 respectively. */
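/* For example (illustrative): two int refs with DR_INIT 0 and 16 and
   segment lengths of 16 bytes each do not overlap, because seg_a_max (16)
   is not greater than seg_b_min (16), so the pair needs no runtime
   alias check.  */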
2949
2950 static bool
2951 vect_no_alias_p (struct data_reference *a, struct data_reference *b,
2952 tree segment_length_a, tree segment_length_b)
2953 {
2954 gcc_assert (TREE_CODE (DR_INIT (a)) == INTEGER_CST
2955 && TREE_CODE (DR_INIT (b)) == INTEGER_CST);
2956 if (tree_int_cst_equal (DR_INIT (a), DR_INIT (b)))
2957 return false;
2958
2959 tree seg_a_min = DR_INIT (a);
2960 tree seg_a_max = fold_build2 (PLUS_EXPR, TREE_TYPE (seg_a_min),
2961 seg_a_min, segment_length_a);
2962 /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
2963 bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
2964 [a, a+12) */
2965 if (tree_int_cst_compare (DR_STEP (a), size_zero_node) < 0)
2966 {
2967 tree unit_size = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (a)));
2968 seg_a_min = fold_build2 (PLUS_EXPR, TREE_TYPE (seg_a_max),
2969 seg_a_max, unit_size);
2970 seg_a_max = fold_build2 (PLUS_EXPR, TREE_TYPE (DR_INIT (a)),
2971 DR_INIT (a), unit_size);
2972 }
2973 tree seg_b_min = DR_INIT (b);
2974 tree seg_b_max = fold_build2 (PLUS_EXPR, TREE_TYPE (seg_b_min),
2975 seg_b_min, segment_length_b);
2976 if (tree_int_cst_compare (DR_STEP (b), size_zero_node) < 0)
2977 {
2978 tree unit_size = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (b)));
2979 seg_b_min = fold_build2 (PLUS_EXPR, TREE_TYPE (seg_b_max),
2980 seg_b_max, unit_size);
2981 seg_b_max = fold_build2 (PLUS_EXPR, TREE_TYPE (DR_INIT (b)),
2982 DR_INIT (b), unit_size);
2983 }
2984
2985 if (tree_int_cst_le (seg_a_max, seg_b_min)
2986 || tree_int_cst_le (seg_b_max, seg_a_min))
2987 return true;
2988
2989 return false;
2990 }
2991
2992 /* Function vect_prune_runtime_alias_test_list.
2993
2994 Prune a list of ddrs to be tested at run-time by versioning for alias.
2995 Merge several alias checks into one if possible.
2996 Return FALSE if the resulting list of ddrs is longer than allowed by
2997 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
2998
2999 bool
3000 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
3001 {
3002 vec<ddr_p> may_alias_ddrs =
3003 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
3004 vec<dr_with_seg_len_pair_t>& comp_alias_ddrs =
3005 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3006 int vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3007 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
3008
3009 ddr_p ddr;
3010 unsigned int i;
3011 tree length_factor;
3012
3013 if (dump_enabled_p ())
3014 dump_printf_loc (MSG_NOTE, vect_location,
3015 "=== vect_prune_runtime_alias_test_list ===\n");
3016
3017 if (may_alias_ddrs.is_empty ())
3018 return true;
3019
3020 /* Basically, for each pair of dependent data refs store_ptr_0
3021 and load_ptr_0, we create an expression:
3022
3023 ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
3024 || (load_ptr_0 + load_segment_length_0) <= store_ptr_0))
3025
3026 for aliasing checks. However, in some cases we can decrease
3027 the number of checks by combining two checks into one. For
3028 example, suppose we have another pair of data refs store_ptr_0
3029 and load_ptr_1, and if the following condition is satisfied:
3030
3031 load_ptr_0 < load_ptr_1 &&
3032 load_ptr_1 - load_ptr_0 - load_segment_length_0 < store_segment_length_0
3033
3034 (this condition means, in each iteration of vectorized loop,
3035 the accessed memory of store_ptr_0 cannot be between the memory
3036 of load_ptr_0 and load_ptr_1.)
3037
3038 we can then use only the following expression to finish the
3039 aliasing checks between store_ptr_0 & load_ptr_0 and
3040 store_ptr_0 & load_ptr_1:
3041
3042 ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
3043 || (load_ptr_1 + load_segment_length_1 <= store_ptr_0))
3044
3045 Note that we only consider the case where load_ptr_0 and load_ptr_1
3046 have the same base address. */
3047
3048 comp_alias_ddrs.create (may_alias_ddrs.length ());
3049
3050 /* First, we collect all data ref pairs for aliasing checks. */
3051 FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
3052 {
3053 int comp_res;
3054 struct data_reference *dr_a, *dr_b;
3055 gimple *dr_group_first_a, *dr_group_first_b;
3056 tree segment_length_a, segment_length_b;
3057 gimple *stmt_a, *stmt_b;
3058
3059 dr_a = DDR_A (ddr);
3060 stmt_a = DR_STMT (DDR_A (ddr));
3061 dr_group_first_a = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_a));
3062 if (dr_group_first_a)
3063 {
3064 stmt_a = dr_group_first_a;
3065 dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a));
3066 }
3067
3068 dr_b = DDR_B (ddr);
3069 stmt_b = DR_STMT (DDR_B (ddr));
3070 dr_group_first_b = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_b));
3071 if (dr_group_first_b)
3072 {
3073 stmt_b = dr_group_first_b;
3074 dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b));
3075 }
3076
3077 if (!operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0))
3078 length_factor = scalar_loop_iters;
3079 else
3080 length_factor = size_int (vect_factor);
3081 segment_length_a = vect_vfa_segment_size (dr_a, length_factor);
3082 segment_length_b = vect_vfa_segment_size (dr_b, length_factor);
3083
3084 comp_res = compare_tree (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b));
3085 if (comp_res == 0)
3086 comp_res = compare_tree (DR_OFFSET (dr_a), DR_OFFSET (dr_b));
3087
3088 /* Alias is known at compilation time. */
3089 if (comp_res == 0
3090 && TREE_CODE (DR_STEP (dr_a)) == INTEGER_CST
3091 && TREE_CODE (DR_STEP (dr_b)) == INTEGER_CST
3092 && TREE_CODE (segment_length_a) == INTEGER_CST
3093 && TREE_CODE (segment_length_b) == INTEGER_CST)
3094 {
3095 if (vect_no_alias_p (dr_a, dr_b, segment_length_a, segment_length_b))
3096 continue;
3097
3098 if (dump_enabled_p ())
3099 dump_printf_loc (MSG_NOTE, vect_location,
3100 "not vectorized: compilation time alias.\n");
3101
3102 return false;
3103 }
3104
3105 dr_with_seg_len_pair_t dr_with_seg_len_pair
3106 (dr_with_seg_len (dr_a, segment_length_a),
3107 dr_with_seg_len (dr_b, segment_length_b));
3108
3109 /* Canonicalize pairs by sorting the two DR members. */
3110 if (comp_res > 0)
3111 std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
3112
3113 comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
3114 }
3115
3116 /* Second, we sort the collected data ref pairs so that we can scan
3117 them once to combine all possible aliasing checks. */
3118 comp_alias_ddrs.qsort (comp_dr_with_seg_len_pair);
3119
3120 /* Third, we scan the sorted dr pairs and check if we can combine
3121 alias checks of two neighboring dr pairs. */
3122 for (size_t i = 1; i < comp_alias_ddrs.length (); ++i)
3123 {
3124 /* Deal with two ddrs (dr_a1, dr_b1) and (dr_a2, dr_b2). */
3125 dr_with_seg_len *dr_a1 = &comp_alias_ddrs[i-1].first,
3126 *dr_b1 = &comp_alias_ddrs[i-1].second,
3127 *dr_a2 = &comp_alias_ddrs[i].first,
3128 *dr_b2 = &comp_alias_ddrs[i].second;
3129
3130 /* Remove duplicate data ref pairs. */
3131 if (*dr_a1 == *dr_a2 && *dr_b1 == *dr_b2)
3132 {
3133 if (dump_enabled_p ())
3134 {
3135 dump_printf_loc (MSG_NOTE, vect_location,
3136 "found equal ranges ");
3137 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3138 DR_REF (dr_a1->dr));
3139 dump_printf (MSG_NOTE, ", ");
3140 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3141 DR_REF (dr_b1->dr));
3142 dump_printf (MSG_NOTE, " and ");
3143 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3144 DR_REF (dr_a2->dr));
3145 dump_printf (MSG_NOTE, ", ");
3146 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3147 DR_REF (dr_b2->dr));
3148 dump_printf (MSG_NOTE, "\n");
3149 }
3150
3151 comp_alias_ddrs.ordered_remove (i--);
3152 continue;
3153 }
3154
3155 if (*dr_a1 == *dr_a2 || *dr_b1 == *dr_b2)
3156 {
3157 /* We consider the case that DR_B1 and DR_B2 are the same memref,
3158 and DR_A1 and DR_A2 are two consecutive memrefs. */
3159 if (*dr_a1 == *dr_a2)
3160 {
3161 std::swap (dr_a1, dr_b1);
3162 std::swap (dr_a2, dr_b2);
3163 }
3164
3165 if (!operand_equal_p (DR_BASE_ADDRESS (dr_a1->dr),
3166 DR_BASE_ADDRESS (dr_a2->dr), 0)
3167 || !operand_equal_p (DR_OFFSET (dr_a1->dr),
3168 DR_OFFSET (dr_a2->dr), 0)
3169 || !tree_fits_shwi_p (DR_INIT (dr_a1->dr))
3170 || !tree_fits_shwi_p (DR_INIT (dr_a2->dr)))
3171 continue;
3172
3173 /* Make sure dr_a1 starts left of dr_a2. */
3174 if (tree_int_cst_lt (DR_INIT (dr_a2->dr), DR_INIT (dr_a1->dr)))
3175 std::swap (*dr_a1, *dr_a2);
3176
3177 bool do_remove = false;
3178 unsigned HOST_WIDE_INT diff
3179 = (tree_to_shwi (DR_INIT (dr_a2->dr))
3180 - tree_to_shwi (DR_INIT (dr_a1->dr)));
3181
3182 /* If the left segment does not extend beyond the start of the
3183 right segment, the new segment length is that of the right
3184 plus the segment distance. */
3185 if (tree_fits_uhwi_p (dr_a1->seg_len)
3186 && compare_tree_int (dr_a1->seg_len, diff) <= 0)
3187 {
3188 dr_a1->seg_len = size_binop (PLUS_EXPR, dr_a2->seg_len,
3189 size_int (diff));
3190 do_remove = true;
3191 }
3192 /* Generally the new segment length is the maximum of the
3193 left segment size and the right segment size plus the distance.
3194 ??? We can also build tree MAX_EXPR here but it's not clear this
3195 is profitable. */
3196 else if (tree_fits_uhwi_p (dr_a1->seg_len)
3197 && tree_fits_uhwi_p (dr_a2->seg_len))
3198 {
3199 unsigned HOST_WIDE_INT seg_len_a1 = tree_to_uhwi (dr_a1->seg_len);
3200 unsigned HOST_WIDE_INT seg_len_a2 = tree_to_uhwi (dr_a2->seg_len);
3201 dr_a1->seg_len = size_int (MAX (seg_len_a1, diff + seg_len_a2));
3202 do_remove = true;
3203 }
3204 /* Now we check if the following condition is satisfied:
3205
3206 DIFF - SEGMENT_LENGTH_A < SEGMENT_LENGTH_B
3207
3208 where DIFF = DR_A2_INIT - DR_A1_INIT. However,
3209 SEGMENT_LENGTH_A or SEGMENT_LENGTH_B may not be constant so we
3210 have to make a best estimation. We can get the minimum value
3211 of SEGMENT_LENGTH_B as a constant, represented by MIN_SEG_LEN_B,
3212 then either of the following two conditions can guarantee the
3213 one above:
3214
3215 1: DIFF <= MIN_SEG_LEN_B
3216 2: DIFF - SEGMENT_LENGTH_A < MIN_SEG_LEN_B */
3217 else
3218 {
3219 unsigned HOST_WIDE_INT min_seg_len_b
3220 = (tree_fits_uhwi_p (dr_b1->seg_len)
3221 ? tree_to_uhwi (dr_b1->seg_len)
3222 : vect_factor);
3223
3224 if (diff <= min_seg_len_b
3225 || (tree_fits_uhwi_p (dr_a1->seg_len)
3226 && diff - tree_to_uhwi (dr_a1->seg_len) < min_seg_len_b))
3227 {
3228 dr_a1->seg_len = size_binop (PLUS_EXPR,
3229 dr_a2->seg_len, size_int (diff));
3230 do_remove = true;
3231 }
3232 }
3233
3234 if (do_remove)
3235 {
3236 if (dump_enabled_p ())
3237 {
3238 dump_printf_loc (MSG_NOTE, vect_location,
3239 "merging ranges for ");
3240 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a1->dr));
3241 dump_printf (MSG_NOTE, ", ");
3242 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b1->dr));
3243 dump_printf (MSG_NOTE, " and ");
3244 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a2->dr));
3245 dump_printf (MSG_NOTE, ", ");
3246 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b2->dr));
3247 dump_printf (MSG_NOTE, "\n");
3248 }
3249 comp_alias_ddrs.ordered_remove (i--);
3250 }
3251 }
3252 }
3253
3254 dump_printf_loc (MSG_NOTE, vect_location,
3255 "improved number of alias checks from %d to %d\n",
3256 may_alias_ddrs.length (), comp_alias_ddrs.length ());
3257 if ((int) comp_alias_ddrs.length () >
3258 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
3259 {
3260 if (dump_enabled_p ())
3261 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3262 "number of versioning for alias "
3263 "run-time tests exceeds %d "
3264 "(--param vect-max-version-for-alias-checks)\n",
3265 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
3266 return false;
3267 }
3268
3269 /* All alias checks have been resolved at compilation time. */
3270 if (!comp_alias_ddrs.length ())
3271 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).truncate (0);
3272
3273 return true;
3274 }
3275
3276 /* Return true if a non-affine read or write in STMT is suitable for a
3277 gather load or scatter store. Describe the operation in *INFO if so. */
3278
3279 bool
3280 vect_check_gather_scatter (gimple *stmt, loop_vec_info loop_vinfo,
3281 gather_scatter_info *info)
3282 {
3283 HOST_WIDE_INT scale = 1, pbitpos, pbitsize;
3284 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3285 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3286 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3287 tree offtype = NULL_TREE;
3288 tree decl, base, off;
3289 machine_mode pmode;
3290 int punsignedp, reversep, pvolatilep = 0;
3291
3292 base = DR_REF (dr);
3293 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
3294 see if we can use the def stmt of the address. */
3295 if (is_gimple_call (stmt)
3296 && gimple_call_internal_p (stmt)
3297 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
3298 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
3299 && TREE_CODE (base) == MEM_REF
3300 && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
3301 && integer_zerop (TREE_OPERAND (base, 1))
3302 && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
3303 {
3304 gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
3305 if (is_gimple_assign (def_stmt)
3306 && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
3307 base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
3308 }
3309
3310 /* The gather and scatter builtins need address of the form
3311 loop_invariant + vector * {1, 2, 4, 8}
3312 or
3313 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3314 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3315 of loop invariants/SSA_NAMEs defined in the loop, with casts,
3316 multiplications and additions in it. To get a vector, we need
3317 a single SSA_NAME that will be defined in the loop and will
3318 contain everything that is not loop invariant and that can be
3319 vectorized. The following code attempts to find such a preexisting
3320 SSA_NAME OFF and put the loop invariants into a tree BASE
3321 that can be gimplified before the loop. */
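/* For example (illustrative): for a loop reading y[idx[i]] with int
   elements, BASE ends up as &y plus any constant byte offset, OFF as the
   SSA_NAME holding idx[i] (possibly sign-extended), and SCALE as 4.  */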
3322 base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
3323 &punsignedp, &reversep, &pvolatilep);
3324 gcc_assert (base && (pbitpos % BITS_PER_UNIT) == 0 && !reversep);
3325
3326 if (TREE_CODE (base) == MEM_REF)
3327 {
3328 if (!integer_zerop (TREE_OPERAND (base, 1)))
3329 {
3330 if (off == NULL_TREE)
3331 {
3332 offset_int moff = mem_ref_offset (base);
3333 off = wide_int_to_tree (sizetype, moff);
3334 }
3335 else
3336 off = size_binop (PLUS_EXPR, off,
3337 fold_convert (sizetype, TREE_OPERAND (base, 1)));
3338 }
3339 base = TREE_OPERAND (base, 0);
3340 }
3341 else
3342 base = build_fold_addr_expr (base);
3343
3344 if (off == NULL_TREE)
3345 off = size_zero_node;
3346
3347 /* If base is not loop invariant, then if off is 0 we start with just
3348 the constant offset in the loop invariant BASE and continue with base
3349 as OFF; otherwise give up.
3350 We could handle that case by gimplifying the addition of base + off
3351 into some SSA_NAME and use that as off, but for now punt. */
3352 if (!expr_invariant_in_loop_p (loop, base))
3353 {
3354 if (!integer_zerop (off))
3355 return false;
3356 off = base;
3357 base = size_int (pbitpos / BITS_PER_UNIT);
3358 }
3359 /* Otherwise put base + constant offset into the loop invariant BASE
3360 and continue with OFF. */
3361 else
3362 {
3363 base = fold_convert (sizetype, base);
3364 base = size_binop (PLUS_EXPR, base, size_int (pbitpos / BITS_PER_UNIT));
3365 }
3366
3367 /* OFF at this point may be either a SSA_NAME or some tree expression
3368 from get_inner_reference. Try to peel off loop invariants from it
3369 into BASE as long as possible. */
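  /* For instance (illustration only): if OFF is invar_3 + n_7 with invar_3
     loop invariant, the PLUS_EXPR case below folds invar_3 (multiplied by
     the current SCALE) into BASE and continues the walk with n_7, while a
     MULT_EXPR by an integer constant instead becomes the SCALE itself.  */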
3370 STRIP_NOPS (off);
3371 while (offtype == NULL_TREE)
3372 {
3373 enum tree_code code;
3374 tree op0, op1, add = NULL_TREE;
3375
3376 if (TREE_CODE (off) == SSA_NAME)
3377 {
3378 gimple *def_stmt = SSA_NAME_DEF_STMT (off);
3379
3380 if (expr_invariant_in_loop_p (loop, off))
3381 return false;
3382
3383 if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
3384 break;
3385
3386 op0 = gimple_assign_rhs1 (def_stmt);
3387 code = gimple_assign_rhs_code (def_stmt);
3388 op1 = gimple_assign_rhs2 (def_stmt);
3389 }
3390 else
3391 {
3392 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
3393 return false;
3394 code = TREE_CODE (off);
3395 extract_ops_from_tree (off, &code, &op0, &op1);
3396 }
3397 switch (code)
3398 {
3399 case POINTER_PLUS_EXPR:
3400 case PLUS_EXPR:
3401 if (expr_invariant_in_loop_p (loop, op0))
3402 {
3403 add = op0;
3404 off = op1;
3405 do_add:
3406 add = fold_convert (sizetype, add);
3407 if (scale != 1)
3408 add = size_binop (MULT_EXPR, add, size_int (scale));
3409 base = size_binop (PLUS_EXPR, base, add);
3410 continue;
3411 }
3412 if (expr_invariant_in_loop_p (loop, op1))
3413 {
3414 add = op1;
3415 off = op0;
3416 goto do_add;
3417 }
3418 break;
3419 case MINUS_EXPR:
3420 if (expr_invariant_in_loop_p (loop, op1))
3421 {
3422 add = fold_convert (sizetype, op1);
3423 add = size_binop (MINUS_EXPR, size_zero_node, add);
3424 off = op0;
3425 goto do_add;
3426 }
3427 break;
3428 case MULT_EXPR:
3429 if (scale == 1 && tree_fits_shwi_p (op1))
3430 {
3431 scale = tree_to_shwi (op1);
3432 off = op0;
3433 continue;
3434 }
3435 break;
3436 case SSA_NAME:
3437 off = op0;
3438 continue;
3439 CASE_CONVERT:
3440 if (!POINTER_TYPE_P (TREE_TYPE (op0))
3441 && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3442 break;
3443 if (TYPE_PRECISION (TREE_TYPE (op0))
3444 == TYPE_PRECISION (TREE_TYPE (off)))
3445 {
3446 off = op0;
3447 continue;
3448 }
3449 if (TYPE_PRECISION (TREE_TYPE (op0))
3450 < TYPE_PRECISION (TREE_TYPE (off)))
3451 {
3452 off = op0;
3453 offtype = TREE_TYPE (off);
3454 STRIP_NOPS (off);
3455 continue;
3456 }
3457 break;
3458 default:
3459 break;
3460 }
3461 break;
3462 }
3463
3464 /* If at the end OFF still isn't a SSA_NAME or isn't
3465 defined in the loop, punt. */
3466 if (TREE_CODE (off) != SSA_NAME
3467 || expr_invariant_in_loop_p (loop, off))
3468 return false;
3469
3470 if (offtype == NULL_TREE)
3471 offtype = TREE_TYPE (off);
3472
3473 if (DR_IS_READ (dr))
3474 decl = targetm.vectorize.builtin_gather (STMT_VINFO_VECTYPE (stmt_info),
3475 offtype, scale);
3476 else
3477 decl = targetm.vectorize.builtin_scatter (STMT_VINFO_VECTYPE (stmt_info),
3478 offtype, scale);
3479
3480 if (decl == NULL_TREE)
3481 return false;
3482
3483 info->decl = decl;
3484 info->base = base;
3485 info->offset = off;
3486 info->offset_dt = vect_unknown_def_type;
3487 info->offset_vectype = NULL_TREE;
3488 info->scale = scale;
3489 return true;
3490 }
3491
3492 /* Function vect_analyze_data_refs.
3493
3494 Find all the data references in the loop or basic block.
3495
3496 The general structure of the analysis of data refs in the vectorizer is as
3497 follows:
3498 1- vect_analyze_data_refs(loop/bb): call
3499 compute_data_dependences_for_loop/bb to find and analyze all data-refs
3500 in the loop/bb and their dependences.
3501 2- vect_analyze_dependences(): apply dependence testing using ddrs.
3502 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
3503 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
3504
3505 */
3506
3507 bool
3508 vect_analyze_data_refs (vec_info *vinfo, int *min_vf)
3509 {
3510 struct loop *loop = NULL;
3511 unsigned int i;
3512 struct data_reference *dr;
3513 tree scalar_type;
3514
3515 if (dump_enabled_p ())
3516 dump_printf_loc (MSG_NOTE, vect_location,
3517 "=== vect_analyze_data_refs ===\n");
3518
3519 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
3520 loop = LOOP_VINFO_LOOP (loop_vinfo);
3521
3522 /* Go through the data-refs, check that the analysis succeeded. Update
3523 pointer from stmt_vec_info struct to DR and vectype. */
3524
3525 vec<data_reference_p> datarefs = vinfo->datarefs;
3526 FOR_EACH_VEC_ELT (datarefs, i, dr)
3527 {
3528 gimple *stmt;
3529 stmt_vec_info stmt_info;
3530 tree base, offset, init;
3531 enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
3532 bool simd_lane_access = false;
3533 int vf;
3534
3535 again:
3536 if (!dr || !DR_REF (dr))
3537 {
3538 if (dump_enabled_p ())
3539 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3540 "not vectorized: unhandled data-ref\n");
3541 return false;
3542 }
3543
3544 stmt = DR_STMT (dr);
3545 stmt_info = vinfo_for_stmt (stmt);
3546
3547 /* Discard clobbers from the dataref vector. We will remove
3548 clobber stmts during vectorization. */
3549 if (gimple_clobber_p (stmt))
3550 {
3551 free_data_ref (dr);
3552 if (i == datarefs.length () - 1)
3553 {
3554 datarefs.pop ();
3555 break;
3556 }
3557 datarefs.ordered_remove (i);
3558 dr = datarefs[i];
3559 goto again;
3560 }
3561
3562 /* Check that analysis of the data-ref succeeded. */
3563 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
3564 || !DR_STEP (dr))
3565 {
3566 bool maybe_gather
3567 = DR_IS_READ (dr)
3568 && !TREE_THIS_VOLATILE (DR_REF (dr))
3569 && targetm.vectorize.builtin_gather != NULL;
3570 bool maybe_scatter
3571 = DR_IS_WRITE (dr)
3572 && !TREE_THIS_VOLATILE (DR_REF (dr))
3573 && targetm.vectorize.builtin_scatter != NULL;
3574 bool maybe_simd_lane_access
3575 = is_a <loop_vec_info> (vinfo) && loop->simduid;
3576
3577         /* If the target supports vector gather loads or scatter stores,
3578            or if this might be a SIMD lane access, see if we can use them.  */
3579 if (is_a <loop_vec_info> (vinfo)
3580 && (maybe_gather || maybe_scatter || maybe_simd_lane_access)
3581 && !nested_in_vect_loop_p (loop, stmt))
3582 {
3583 struct data_reference *newdr
3584 = create_data_ref (NULL, loop_containing_stmt (stmt),
3585 DR_REF (dr), stmt, maybe_scatter ? false : true);
3586 gcc_assert (newdr != NULL && DR_REF (newdr));
3587 if (DR_BASE_ADDRESS (newdr)
3588 && DR_OFFSET (newdr)
3589 && DR_INIT (newdr)
3590 && DR_STEP (newdr)
3591 && integer_zerop (DR_STEP (newdr)))
3592 {
3593 if (maybe_simd_lane_access)
3594 {
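                  /* The offset of a SIMD lane access is expected to look like
                     .GOMP_SIMD_LANE (simduid) * element_size, which is how
                     accesses to simd-lane-private arrays created for OpenMP
                     simd loops appear; the checks below verify that shape.  */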
3595 tree off = DR_OFFSET (newdr);
3596 STRIP_NOPS (off);
3597 if (TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
3598 && TREE_CODE (off) == MULT_EXPR
3599 && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
3600 {
3601 tree step = TREE_OPERAND (off, 1);
3602 off = TREE_OPERAND (off, 0);
3603 STRIP_NOPS (off);
3604 if (CONVERT_EXPR_P (off)
3605 && TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off,
3606 0)))
3607 < TYPE_PRECISION (TREE_TYPE (off)))
3608 off = TREE_OPERAND (off, 0);
3609 if (TREE_CODE (off) == SSA_NAME)
3610 {
3611 gimple *def = SSA_NAME_DEF_STMT (off);
3612 tree reft = TREE_TYPE (DR_REF (newdr));
3613 if (is_gimple_call (def)
3614 && gimple_call_internal_p (def)
3615 && (gimple_call_internal_fn (def)
3616 == IFN_GOMP_SIMD_LANE))
3617 {
3618 tree arg = gimple_call_arg (def, 0);
3619 gcc_assert (TREE_CODE (arg) == SSA_NAME);
3620 arg = SSA_NAME_VAR (arg);
3621 if (arg == loop->simduid
3622 /* For now. */
3623 && tree_int_cst_equal
3624 (TYPE_SIZE_UNIT (reft),
3625 step))
3626 {
3627 DR_OFFSET (newdr) = ssize_int (0);
3628 DR_STEP (newdr) = step;
3629 DR_ALIGNED_TO (newdr)
3630 = size_int (BIGGEST_ALIGNMENT);
3631 dr = newdr;
3632 simd_lane_access = true;
3633 }
3634 }
3635 }
3636 }
3637 }
3638 if (!simd_lane_access && (maybe_gather || maybe_scatter))
3639 {
3640 dr = newdr;
3641 if (maybe_gather)
3642 gatherscatter = GATHER;
3643 else
3644 gatherscatter = SCATTER;
3645 }
3646 }
3647 if (gatherscatter == SG_NONE && !simd_lane_access)
3648 free_data_ref (newdr);
3649 }
3650
3651 if (gatherscatter == SG_NONE && !simd_lane_access)
3652 {
3653 if (dump_enabled_p ())
3654 {
3655 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3656 "not vectorized: data ref analysis "
3657 "failed ");
3658 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3659 }
3660
3661 if (is_a <bb_vec_info> (vinfo))
3662 break;
3663
3664 return false;
3665 }
3666 }
3667
3668 if (TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
3669 {
3670 if (dump_enabled_p ())
3671 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3672 "not vectorized: base addr of dr is a "
3673 "constant\n");
3674
3675 if (is_a <bb_vec_info> (vinfo))
3676 break;
3677
3678 if (gatherscatter != SG_NONE || simd_lane_access)
3679 free_data_ref (dr);
3680 return false;
3681 }
3682
3683 if (TREE_THIS_VOLATILE (DR_REF (dr)))
3684 {
3685 if (dump_enabled_p ())
3686 {
3687 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3688 "not vectorized: volatile type ");
3689 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3690 }
3691
3692 if (is_a <bb_vec_info> (vinfo))
3693 break;
3694
3695 return false;
3696 }
3697
3698 if (stmt_can_throw_internal (stmt))
3699 {
3700 if (dump_enabled_p ())
3701 {
3702 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3703 "not vectorized: statement can throw an "
3704 "exception ");
3705 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3706 }
3707
3708 if (is_a <bb_vec_info> (vinfo))
3709 break;
3710
3711 if (gatherscatter != SG_NONE || simd_lane_access)
3712 free_data_ref (dr);
3713 return false;
3714 }
3715
3716 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
3717 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
3718 {
3719 if (dump_enabled_p ())
3720 {
3721 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3722 "not vectorized: statement is bitfield "
3723 "access ");
3724 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3725 }
3726
3727 if (is_a <bb_vec_info> (vinfo))
3728 break;
3729
3730 if (gatherscatter != SG_NONE || simd_lane_access)
3731 free_data_ref (dr);
3732 return false;
3733 }
3734
3735 base = unshare_expr (DR_BASE_ADDRESS (dr));
3736 offset = unshare_expr (DR_OFFSET (dr));
3737 init = unshare_expr (DR_INIT (dr));
3738
3739 if (is_gimple_call (stmt)
3740 && (!gimple_call_internal_p (stmt)
3741 || (gimple_call_internal_fn (stmt) != IFN_MASK_LOAD
3742 && gimple_call_internal_fn (stmt) != IFN_MASK_STORE)))
3743 {
3744 if (dump_enabled_p ())
3745 {
3746 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3747 "not vectorized: dr in a call ");
3748 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3749 }
3750
3751 if (is_a <bb_vec_info> (vinfo))
3752 break;
3753
3754 if (gatherscatter != SG_NONE || simd_lane_access)
3755 free_data_ref (dr);
3756 return false;
3757 }
3758
3759 /* Update DR field in stmt_vec_info struct. */
3760
3761       /* If the dataref is in an inner-loop of the loop that is considered
3762          for vectorization, we also want to analyze the access relative to
3763 the outer-loop (DR contains information only relative to the
3764 inner-most enclosing loop). We do that by building a reference to the
3765 first location accessed by the inner-loop, and analyze it relative to
3766 the outer-loop. */
3767 if (loop && nested_in_vect_loop_p (loop, stmt))
3768 {
3769 tree outer_step, outer_base, outer_init;
3770 HOST_WIDE_INT pbitsize, pbitpos;
3771 tree poffset;
3772 machine_mode pmode;
3773 int punsignedp, preversep, pvolatilep;
3774 affine_iv base_iv, offset_iv;
3775 tree dinit;
3776
3777 /* Build a reference to the first location accessed by the
3778 inner-loop: *(BASE+INIT). (The first location is actually
3779 BASE+INIT+OFFSET, but we add OFFSET separately later). */
3780 tree inner_base = build_fold_indirect_ref
3781 (fold_build_pointer_plus (base, init));
3782
3783 if (dump_enabled_p ())
3784 {
3785 dump_printf_loc (MSG_NOTE, vect_location,
3786 "analyze in outer-loop: ");
3787 dump_generic_expr (MSG_NOTE, TDF_SLIM, inner_base);
3788 dump_printf (MSG_NOTE, "\n");
3789 }
3790
3791 outer_base = get_inner_reference (inner_base, &pbitsize, &pbitpos,
3792 &poffset, &pmode, &punsignedp,
3793 &preversep, &pvolatilep);
3794 gcc_assert (outer_base != NULL_TREE);
3795
3796 if (pbitpos % BITS_PER_UNIT != 0)
3797 {
3798 if (dump_enabled_p ())
3799 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3800 "failed: bit offset alignment.\n");
3801 return false;
3802 }
3803
3804 if (preversep)
3805 {
3806 if (dump_enabled_p ())
3807 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3808 "failed: reverse storage order.\n");
3809 return false;
3810 }
3811
3812 outer_base = build_fold_addr_expr (outer_base);
3813 if (!simple_iv (loop, loop_containing_stmt (stmt), outer_base,
3814 &base_iv, false))
3815 {
3816 if (dump_enabled_p ())
3817 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3818 "failed: evolution of base is not affine.\n");
3819 return false;
3820 }
3821
3822 if (offset)
3823 {
3824 if (poffset)
3825 poffset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset), offset,
3826 poffset);
3827 else
3828 poffset = offset;
3829 }
3830
3831 if (!poffset)
3832 {
3833 offset_iv.base = ssize_int (0);
3834 offset_iv.step = ssize_int (0);
3835 }
3836 else if (!simple_iv (loop, loop_containing_stmt (stmt), poffset,
3837 &offset_iv, false))
3838 {
3839 if (dump_enabled_p ())
3840 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3841 "evolution of offset is not affine.\n");
3842 return false;
3843 }
3844
3845 outer_init = ssize_int (pbitpos / BITS_PER_UNIT);
3846 split_constant_offset (base_iv.base, &base_iv.base, &dinit);
3847 outer_init = size_binop (PLUS_EXPR, outer_init, dinit);
3848 split_constant_offset (offset_iv.base, &offset_iv.base, &dinit);
3849 outer_init = size_binop (PLUS_EXPR, outer_init, dinit);
3850
3851 outer_step = size_binop (PLUS_EXPR,
3852 fold_convert (ssizetype, base_iv.step),
3853 fold_convert (ssizetype, offset_iv.step));
3854
3855 STMT_VINFO_DR_STEP (stmt_info) = outer_step;
3856 /* FIXME: Use canonicalize_base_object_address (base_iv.base); */
3857 STMT_VINFO_DR_BASE_ADDRESS (stmt_info) = base_iv.base;
3858 STMT_VINFO_DR_INIT (stmt_info) = outer_init;
3859 STMT_VINFO_DR_OFFSET (stmt_info) =
3860 fold_convert (ssizetype, offset_iv.base);
3861 STMT_VINFO_DR_ALIGNED_TO (stmt_info) =
3862 size_int (highest_pow2_factor (offset_iv.base));
3863
3864 if (dump_enabled_p ())
3865 {
3866 dump_printf_loc (MSG_NOTE, vect_location,
3867 "\touter base_address: ");
3868 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3869 STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
3870 dump_printf (MSG_NOTE, "\n\touter offset from base address: ");
3871 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3872 STMT_VINFO_DR_OFFSET (stmt_info));
3873 dump_printf (MSG_NOTE,
3874 "\n\touter constant offset from base address: ");
3875 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3876 STMT_VINFO_DR_INIT (stmt_info));
3877 dump_printf (MSG_NOTE, "\n\touter step: ");
3878 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3879 STMT_VINFO_DR_STEP (stmt_info));
3880 dump_printf (MSG_NOTE, "\n\touter aligned to: ");
3881 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3882 STMT_VINFO_DR_ALIGNED_TO (stmt_info));
3883 dump_printf (MSG_NOTE, "\n");
3884 }
3885 }
3886
3887 if (STMT_VINFO_DATA_REF (stmt_info))
3888 {
3889 if (dump_enabled_p ())
3890 {
3891 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3892 "not vectorized: more than one data ref "
3893 "in stmt: ");
3894 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3895 }
3896
3897 if (is_a <bb_vec_info> (vinfo))
3898 break;
3899
3900 if (gatherscatter != SG_NONE || simd_lane_access)
3901 free_data_ref (dr);
3902 return false;
3903 }
3904
3905 STMT_VINFO_DATA_REF (stmt_info) = dr;
3906 if (simd_lane_access)
3907 {
3908 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) = true;
3909 free_data_ref (datarefs[i]);
3910 datarefs[i] = dr;
3911 }
3912
3913 /* Set vectype for STMT. */
3914 scalar_type = TREE_TYPE (DR_REF (dr));
3915 STMT_VINFO_VECTYPE (stmt_info)
3916 = get_vectype_for_scalar_type (scalar_type);
3917 if (!STMT_VINFO_VECTYPE (stmt_info))
3918 {
3919 if (dump_enabled_p ())
3920 {
3921 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3922 "not vectorized: no vectype for stmt: ");
3923 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3924 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
3925 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
3926 scalar_type);
3927 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3928 }
3929
3930 if (is_a <bb_vec_info> (vinfo))
3931 {
3932 /* No vector type is fine, the ref can still participate
3933 in dependence analysis, we just can't vectorize it. */
3934 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
3935 continue;
3936 }
3937
3938 if (gatherscatter != SG_NONE || simd_lane_access)
3939 {
3940 STMT_VINFO_DATA_REF (stmt_info) = NULL;
3941 if (gatherscatter != SG_NONE)
3942 free_data_ref (dr);
3943 }
3944 return false;
3945 }
3946 else
3947 {
3948 if (dump_enabled_p ())
3949 {
3950 dump_printf_loc (MSG_NOTE, vect_location,
3951 "got vectype for stmt: ");
3952 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
3953 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3954 STMT_VINFO_VECTYPE (stmt_info));
3955 dump_printf (MSG_NOTE, "\n");
3956 }
3957 }
3958
3959 /* Adjust the minimal vectorization factor according to the
3960 vector type. */
3961 vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
3962 if (vf > *min_vf)
3963 *min_vf = vf;
3964
3965 if (gatherscatter != SG_NONE)
3966 {
3967 gather_scatter_info gs_info;
3968 if (!vect_check_gather_scatter (stmt, as_a <loop_vec_info> (vinfo),
3969 &gs_info)
3970 || !get_vectype_for_scalar_type (TREE_TYPE (gs_info.offset)))
3971 {
3972 STMT_VINFO_DATA_REF (stmt_info) = NULL;
3973 free_data_ref (dr);
3974 if (dump_enabled_p ())
3975 {
3976 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3977 (gatherscatter == GATHER) ?
3978 "not vectorized: not suitable for gather "
3979 "load " :
3980 "not vectorized: not suitable for scatter "
3981 "store ");
3982 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3983 }
3984 return false;
3985 }
3986
3987 free_data_ref (datarefs[i]);
3988 datarefs[i] = dr;
3989 STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
3990 }
3991
3992 else if (is_a <loop_vec_info> (vinfo)
3993 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
3994 {
3995 if (nested_in_vect_loop_p (loop, stmt))
3996 {
3997 if (dump_enabled_p ())
3998 {
3999 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4000 "not vectorized: not suitable for strided "
4001 "load ");
4002 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4003 }
4004 return false;
4005 }
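            /* DR_STEP is not a compile-time constant here, e.g. for
               a[i * stride] with a stride only known at run time; record the
               access as strided rather than giving up.  */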
4006 STMT_VINFO_STRIDED_P (stmt_info) = true;
4007 }
4008 }
4009
4010 /* If we stopped analysis at the first dataref we could not analyze
4011 when trying to vectorize a basic-block mark the rest of the datarefs
4012 as not vectorizable and truncate the vector of datarefs. That
4013 avoids spending useless time in analyzing their dependence. */
4014 if (i != datarefs.length ())
4015 {
4016 gcc_assert (is_a <bb_vec_info> (vinfo));
4017 for (unsigned j = i; j < datarefs.length (); ++j)
4018 {
4019 data_reference_p dr = datarefs[j];
4020 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
4021 free_data_ref (dr);
4022 }
4023 datarefs.truncate (i);
4024 }
4025
4026 return true;
4027 }
4028
4029
4030 /* Function vect_get_new_vect_var.
4031
4032    Create and return a new temporary variable.  The current naming scheme
4033    prepends the prefix "vect", "stmp", "mask" or "vectp" (depending on the
4034    value of VAR_KIND) and an underscore to NAME when NAME is provided, and
4035    otherwise uses the prefix alone.  */
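/* For instance (illustration only): with VAR_KIND == vect_pointer_var and
   NAME == "a", the new temporary's name starts with "vectp_a"; with a NULL
   NAME it is just the prefix.  */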
4036
4037 tree
4038 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
4039 {
4040 const char *prefix;
4041 tree new_vect_var;
4042
4043 switch (var_kind)
4044 {
4045 case vect_simple_var:
4046 prefix = "vect";
4047 break;
4048 case vect_scalar_var:
4049 prefix = "stmp";
4050 break;
4051 case vect_mask_var:
4052 prefix = "mask";
4053 break;
4054 case vect_pointer_var:
4055 prefix = "vectp";
4056 break;
4057 default:
4058 gcc_unreachable ();
4059 }
4060
4061 if (name)
4062 {
4063 char* tmp = concat (prefix, "_", name, NULL);
4064 new_vect_var = create_tmp_reg (type, tmp);
4065 free (tmp);
4066 }
4067 else
4068 new_vect_var = create_tmp_reg (type, prefix);
4069
4070 return new_vect_var;
4071 }
4072
4073 /* Like vect_get_new_vect_var but return an SSA name. */
4074
4075 tree
4076 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
4077 {
4078 const char *prefix;
4079 tree new_vect_var;
4080
4081 switch (var_kind)
4082 {
4083 case vect_simple_var:
4084 prefix = "vect";
4085 break;
4086 case vect_scalar_var:
4087 prefix = "stmp";
4088 break;
4089 case vect_pointer_var:
4090 prefix = "vectp";
4091 break;
4092 default:
4093 gcc_unreachable ();
4094 }
4095
4096 if (name)
4097 {
4098 char* tmp = concat (prefix, "_", name, NULL);
4099 new_vect_var = make_temp_ssa_name (type, NULL, tmp);
4100 free (tmp);
4101 }
4102 else
4103 new_vect_var = make_temp_ssa_name (type, NULL, prefix);
4104
4105 return new_vect_var;
4106 }
4107
4108 /* Duplicate ptr info and set alignment/misalignment on NAME from DR.  */
4109
4110 static void
4111 vect_duplicate_ssa_name_ptr_info (tree name, data_reference *dr,
4112 stmt_vec_info stmt_info)
4113 {
4114 duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr));
4115 unsigned int align = TYPE_ALIGN_UNIT (STMT_VINFO_VECTYPE (stmt_info));
4116 int misalign = DR_MISALIGNMENT (dr);
4117 if (misalign == -1)
4118 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
4119 else
4120 set_ptr_info_alignment (SSA_NAME_PTR_INFO (name), align, misalign);
4121 }
4122
4123 /* Function vect_create_addr_base_for_vector_ref.
4124
4125 Create an expression that computes the address of the first memory location
4126 that will be accessed for a data reference.
4127
4128 Input:
4129 STMT: The statement containing the data reference.
4130 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
4131    OFFSET: Optional.  If supplied, it is added to the initial address.
4132    LOOP: Specify relative to which loop-nest the address should be computed.
4133 For example, when the dataref is in an inner-loop nested in an
4134 outer-loop that is now being vectorized, LOOP can be either the
4135 outer-loop, or the inner-loop. The first memory location accessed
4136 by the following dataref ('in' points to short):
4137
4138 for (i=0; i<N; i++)
4139 for (j=0; j<M; j++)
4140 s += in[i+j]
4141
4142 is as follows:
4143 if LOOP=i_loop: &in (relative to i_loop)
4144 if LOOP=j_loop: &in+i*2B (relative to j_loop)
4145 BYTE_OFFSET: Optional, defaulted to NULL. If supplied, it is added to the
4146                initial address.  Unlike OFFSET, which is the number of elements to
4147 be added, BYTE_OFFSET is measured in bytes.
4148
4149 Output:
4150 1. Return an SSA_NAME whose value is the address of the memory location of
4151 the first vector of the data reference.
4152 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
4153 these statement(s) which define the returned SSA_NAME.
4154
4155 FORNOW: We are only handling array accesses with step 1. */
4156
4157 tree
4158 vect_create_addr_base_for_vector_ref (gimple *stmt,
4159 gimple_seq *new_stmt_list,
4160 tree offset,
4161 struct loop *loop,
4162 tree byte_offset)
4163 {
4164 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4165 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4166 tree data_ref_base;
4167 const char *base_name;
4168 tree addr_base;
4169 tree dest;
4170 gimple_seq seq = NULL;
4171 tree base_offset;
4172 tree init;
4173 tree vect_ptr_type;
4174 tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
4175 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4176
4177 if (loop_vinfo && loop && loop != (gimple_bb (stmt))->loop_father)
4178 {
4179 struct loop *outer_loop = LOOP_VINFO_LOOP (loop_vinfo);
4180
4181 gcc_assert (nested_in_vect_loop_p (outer_loop, stmt));
4182
4183 data_ref_base = unshare_expr (STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
4184 base_offset = unshare_expr (STMT_VINFO_DR_OFFSET (stmt_info));
4185 init = unshare_expr (STMT_VINFO_DR_INIT (stmt_info));
4186 }
4187 else
4188 {
4189 data_ref_base = unshare_expr (DR_BASE_ADDRESS (dr));
4190 base_offset = unshare_expr (DR_OFFSET (dr));
4191 init = unshare_expr (DR_INIT (dr));
4192 }
4193
4194 if (loop_vinfo)
4195 base_name = get_name (data_ref_base);
4196 else
4197 {
4198 base_offset = ssize_int (0);
4199 init = ssize_int (0);
4200 base_name = get_name (DR_REF (dr));
4201 }
4202
4203 /* Create base_offset */
4204 base_offset = size_binop (PLUS_EXPR,
4205 fold_convert (sizetype, base_offset),
4206 fold_convert (sizetype, init));
4207
4208 if (offset)
4209 {
4210 offset = fold_build2 (MULT_EXPR, sizetype,
4211 fold_convert (sizetype, offset), step);
4212 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4213 base_offset, offset);
4214 }
4215 if (byte_offset)
4216 {
4217 byte_offset = fold_convert (sizetype, byte_offset);
4218 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4219 base_offset, byte_offset);
4220 }
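  /* BASE_OFFSET now holds base_offset + init from above, plus OFFSET scaled
     by the element size and/or BYTE_OFFSET when those arguments were given.  */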
4221
4222 /* base + base_offset */
4223 if (loop_vinfo)
4224 addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
4225 else
4226 {
4227 addr_base = build1 (ADDR_EXPR,
4228 build_pointer_type (TREE_TYPE (DR_REF (dr))),
4229 unshare_expr (DR_REF (dr)));
4230 }
4231
4232 vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
4233 dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
4234 addr_base = force_gimple_operand (addr_base, &seq, true, dest);
4235 gimple_seq_add_seq (new_stmt_list, seq);
4236
4237 if (DR_PTR_INFO (dr)
4238 && TREE_CODE (addr_base) == SSA_NAME
4239 && !SSA_NAME_PTR_INFO (addr_base))
4240 {
4241 vect_duplicate_ssa_name_ptr_info (addr_base, dr, stmt_info);
4242 if (offset || byte_offset)
4243 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
4244 }
4245
4246 if (dump_enabled_p ())
4247 {
4248 dump_printf_loc (MSG_NOTE, vect_location, "created ");
4249 dump_generic_expr (MSG_NOTE, TDF_SLIM, addr_base);
4250 dump_printf (MSG_NOTE, "\n");
4251 }
4252
4253 return addr_base;
4254 }
4255
4256
4257 /* Function vect_create_data_ref_ptr.
4258
4259 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
4260 location accessed in the loop by STMT, along with the def-use update
4261 chain to appropriately advance the pointer through the loop iterations.
4262 Also set aliasing information for the pointer. This pointer is used by
4263 the callers to this function to create a memory reference expression for
4264 vector load/store access.
4265
4266 Input:
4267 1. STMT: a stmt that references memory. Expected to be of the form
4268 GIMPLE_ASSIGN <name, data-ref> or
4269 GIMPLE_ASSIGN <data-ref, name>.
4270 2. AGGR_TYPE: the type of the reference, which should be either a vector
4271 or an array.
4272 3. AT_LOOP: the loop where the vector memref is to be created.
4273 4. OFFSET (optional): an offset to be added to the initial address accessed
4274 by the data-ref in STMT.
4275 5. BSI: location where the new stmts are to be placed if there is no loop
4276 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
4277 pointing to the initial address.
4278 7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
4279 to the initial address accessed by the data-ref in STMT. This is
4280 similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
4281 in bytes.
4282
4283 Output:
4284 1. Declare a new ptr to vector_type, and have it point to the base of the
4285       data reference (initial address accessed by the data reference).
4286 For example, for vector of type V8HI, the following code is generated:
4287
4288 v8hi *ap;
4289 ap = (v8hi *)initial_address;
4290
4291 if OFFSET is not supplied:
4292 initial_address = &a[init];
4293 if OFFSET is supplied:
4294 initial_address = &a[init + OFFSET];
4295 if BYTE_OFFSET is supplied:
4296 initial_address = &a[init] + BYTE_OFFSET;
4297
4298 Return the initial_address in INITIAL_ADDRESS.
4299
4300 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
4301 update the pointer in each iteration of the loop.
4302
4303 Return the increment stmt that updates the pointer in PTR_INCR.
4304
4305 3. Set INV_P to true if the access pattern of the data reference in the
4306 vectorized loop is invariant. Set it to false otherwise.
4307
4308 4. Return the pointer. */
4309
4310 tree
4311 vect_create_data_ref_ptr (gimple *stmt, tree aggr_type, struct loop *at_loop,
4312 tree offset, tree *initial_address,
4313 gimple_stmt_iterator *gsi, gimple **ptr_incr,
4314 bool only_init, bool *inv_p, tree byte_offset)
4315 {
4316 const char *base_name;
4317 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4318 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4319 struct loop *loop = NULL;
4320 bool nested_in_vect_loop = false;
4321 struct loop *containing_loop = NULL;
4322 tree aggr_ptr_type;
4323 tree aggr_ptr;
4324 tree new_temp;
4325 gimple_seq new_stmt_list = NULL;
4326 edge pe = NULL;
4327 basic_block new_bb;
4328 tree aggr_ptr_init;
4329 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4330 tree aptr;
4331 gimple_stmt_iterator incr_gsi;
4332 bool insert_after;
4333 tree indx_before_incr, indx_after_incr;
4334 gimple *incr;
4335 tree step;
4336 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
4337
4338 gcc_assert (TREE_CODE (aggr_type) == ARRAY_TYPE
4339 || TREE_CODE (aggr_type) == VECTOR_TYPE);
4340
4341 if (loop_vinfo)
4342 {
4343 loop = LOOP_VINFO_LOOP (loop_vinfo);
4344 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4345 containing_loop = (gimple_bb (stmt))->loop_father;
4346 pe = loop_preheader_edge (loop);
4347 }
4348 else
4349 {
4350 gcc_assert (bb_vinfo);
4351 only_init = true;
4352 *ptr_incr = NULL;
4353 }
4354
4355 /* Check the step (evolution) of the load in LOOP, and record
4356 whether it's invariant. */
4357 if (nested_in_vect_loop)
4358 step = STMT_VINFO_DR_STEP (stmt_info);
4359 else
4360 step = DR_STEP (STMT_VINFO_DATA_REF (stmt_info));
4361
4362 if (integer_zerop (step))
4363 *inv_p = true;
4364 else
4365 *inv_p = false;
4366
4367 /* Create an expression for the first address accessed by this load
4368 in LOOP. */
4369 base_name = get_name (DR_BASE_ADDRESS (dr));
4370
4371 if (dump_enabled_p ())
4372 {
4373 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4374 dump_printf_loc (MSG_NOTE, vect_location,
4375 "create %s-pointer variable to type: ",
4376 get_tree_code_name (TREE_CODE (aggr_type)));
4377 dump_generic_expr (MSG_NOTE, TDF_SLIM, aggr_type);
4378 if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4379 dump_printf (MSG_NOTE, " vectorizing an array ref: ");
4380 else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4381 dump_printf (MSG_NOTE, " vectorizing a vector ref: ");
4382 else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4383 dump_printf (MSG_NOTE, " vectorizing a record based array ref: ");
4384 else
4385 dump_printf (MSG_NOTE, " vectorizing a pointer ref: ");
4386 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_BASE_OBJECT (dr));
4387 dump_printf (MSG_NOTE, "\n");
4388 }
4389
4390 /* (1) Create the new aggregate-pointer variable.
4391 Vector and array types inherit the alias set of their component
4392 type by default so we need to use a ref-all pointer if the data
4393 reference does not conflict with the created aggregated data
4394 reference because it is not addressable. */
4395 bool need_ref_all = false;
4396 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4397 get_alias_set (DR_REF (dr))))
4398 need_ref_all = true;
4399 /* Likewise for any of the data references in the stmt group. */
4400 else if (STMT_VINFO_GROUP_SIZE (stmt_info) > 1)
4401 {
4402 gimple *orig_stmt = STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info);
4403 do
4404 {
4405 stmt_vec_info sinfo = vinfo_for_stmt (orig_stmt);
4406 struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
4407 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4408 get_alias_set (DR_REF (sdr))))
4409 {
4410 need_ref_all = true;
4411 break;
4412 }
4413 orig_stmt = STMT_VINFO_GROUP_NEXT_ELEMENT (sinfo);
4414 }
4415 while (orig_stmt);
4416 }
4417 aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
4418 need_ref_all);
4419 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
4420
4421
4422 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4423 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4424 def-use update cycles for the pointer: one relative to the outer-loop
4425 (LOOP), which is what steps (3) and (4) below do. The other is relative
4426 to the inner-loop (which is the inner-most loop containing the dataref),
4427      and this is done by step (5) below.
4428
4429 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4430 inner-most loop, and so steps (3),(4) work the same, and step (5) is
4431 redundant. Steps (3),(4) create the following:
4432
4433 vp0 = &base_addr;
4434 LOOP: vp1 = phi(vp0,vp2)
4435 ...
4436 ...
4437 vp2 = vp1 + step
4438 goto LOOP
4439
4440 If there is an inner-loop nested in loop, then step (5) will also be
4441 applied, and an additional update in the inner-loop will be created:
4442
4443 vp0 = &base_addr;
4444 LOOP: vp1 = phi(vp0,vp2)
4445 ...
4446 inner: vp3 = phi(vp1,vp4)
4447 vp4 = vp3 + inner_step
4448 if () goto inner
4449 ...
4450 vp2 = vp1 + step
4451 if () goto LOOP */
4452
4453 /* (2) Calculate the initial address of the aggregate-pointer, and set
4454 the aggregate-pointer to point to it before the loop. */
4455
4456   /* Create: &(base[init_val+offset]) + byte_offset in the loop preheader.  */
4457
4458 new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
4459 offset, loop, byte_offset);
4460 if (new_stmt_list)
4461 {
4462 if (pe)
4463 {
4464 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
4465 gcc_assert (!new_bb);
4466 }
4467 else
4468 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
4469 }
4470
4471 *initial_address = new_temp;
4472 aggr_ptr_init = new_temp;
4473
4474 /* (3) Handle the updating of the aggregate-pointer inside the loop.
4475 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4476 inner-loop nested in LOOP (during outer-loop vectorization). */
4477
4478 /* No update in loop is required. */
4479 if (only_init && (!loop_vinfo || at_loop == loop))
4480 aptr = aggr_ptr_init;
4481 else
4482 {
4483 /* The step of the aggregate pointer is the type size. */
4484 tree iv_step = TYPE_SIZE_UNIT (aggr_type);
4485 /* One exception to the above is when the scalar step of the load in
4486 LOOP is zero. In this case the step here is also zero. */
4487 if (*inv_p)
4488 iv_step = size_zero_node;
4489 else if (tree_int_cst_sgn (step) == -1)
4490 iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
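      /* A negative scalar step means the access walks backwards through
         memory, so the pointer IV is decremented by the aggregate size each
         iteration instead of incremented.  */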
4491
4492 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4493
4494 create_iv (aggr_ptr_init,
4495 fold_convert (aggr_ptr_type, iv_step),
4496 aggr_ptr, loop, &incr_gsi, insert_after,
4497 &indx_before_incr, &indx_after_incr);
4498 incr = gsi_stmt (incr_gsi);
4499 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4500
4501 /* Copy the points-to information if it exists. */
4502 if (DR_PTR_INFO (dr))
4503 {
4504 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr, stmt_info);
4505 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr, stmt_info);
4506 }
4507 if (ptr_incr)
4508 *ptr_incr = incr;
4509
4510 aptr = indx_before_incr;
4511 }
4512
4513 if (!nested_in_vect_loop || only_init)
4514 return aptr;
4515
4516
4517 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4518      nested in LOOP, if it exists.  */
4519
4520 gcc_assert (nested_in_vect_loop);
4521 if (!only_init)
4522 {
4523 standard_iv_increment_position (containing_loop, &incr_gsi,
4524 &insert_after);
4525 create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
4526 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
4527 &indx_after_incr);
4528 incr = gsi_stmt (incr_gsi);
4529 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4530
4531 /* Copy the points-to information if it exists. */
4532 if (DR_PTR_INFO (dr))
4533 {
4534 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr, stmt_info);
4535 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr, stmt_info);
4536 }
4537 if (ptr_incr)
4538 *ptr_incr = incr;
4539
4540 return indx_before_incr;
4541 }
4542 else
4543 gcc_unreachable ();
4544 }
4545
4546
4547 /* Function bump_vector_ptr
4548
4549 Increment a pointer (to a vector type) by vector-size. If requested,
4550 i.e. if PTR-INCR is given, then also connect the new increment stmt
4551 to the existing def-use update-chain of the pointer, by modifying
4552 the PTR_INCR as illustrated below:
4553
4554 The pointer def-use update-chain before this function:
4555 DATAREF_PTR = phi (p_0, p_2)
4556 ....
4557 PTR_INCR: p_2 = DATAREF_PTR + step
4558
4559 The pointer def-use update-chain after this function:
4560 DATAREF_PTR = phi (p_0, p_2)
4561 ....
4562 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4563 ....
4564 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
4565
4566 Input:
4567 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4568 in the loop.
4569 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4570 the loop. The increment amount across iterations is expected
4571 to be vector_size.
4572 BSI - location where the new update stmt is to be placed.
4573 STMT - the original scalar memory-access stmt that is being vectorized.
4574 BUMP - optional. The offset by which to bump the pointer. If not given,
4575 the offset is assumed to be vector_size.
4576
4577 Output: Return NEW_DATAREF_PTR as illustrated above.
4578
4579 */
4580
4581 tree
4582 bump_vector_ptr (tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
4583 gimple *stmt, tree bump)
4584 {
4585 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4586 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4587 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4588 tree update = TYPE_SIZE_UNIT (vectype);
4589 gassign *incr_stmt;
4590 ssa_op_iter iter;
4591 use_operand_p use_p;
4592 tree new_dataref_ptr;
4593
4594 if (bump)
4595 update = bump;
4596
4597 if (TREE_CODE (dataref_ptr) == SSA_NAME)
4598 new_dataref_ptr = copy_ssa_name (dataref_ptr);
4599 else
4600 new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
4601 incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
4602 dataref_ptr, update);
4603 vect_finish_stmt_generation (stmt, incr_stmt, gsi);
4604
4605 /* Copy the points-to information if it exists. */
4606 if (DR_PTR_INFO (dr))
4607 {
4608 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
4609 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
4610 }
4611
4612 if (!ptr_incr)
4613 return new_dataref_ptr;
4614
4615 /* Update the vector-pointer's cross-iteration increment. */
4616 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
4617 {
4618 tree use = USE_FROM_PTR (use_p);
4619
4620 if (use == dataref_ptr)
4621 SET_USE (use_p, new_dataref_ptr);
4622 else
4623 gcc_assert (tree_int_cst_compare (use, update) == 0);
4624 }
4625
4626 return new_dataref_ptr;
4627 }
4628
4629
4630 /* Function vect_create_destination_var.
4631
4632 Create a new temporary of type VECTYPE. */
4633
4634 tree
4635 vect_create_destination_var (tree scalar_dest, tree vectype)
4636 {
4637 tree vec_dest;
4638 const char *name;
4639 char *new_name;
4640 tree type;
4641 enum vect_var_kind kind;
4642
4643 kind = vectype
4644 ? VECTOR_BOOLEAN_TYPE_P (vectype)
4645 ? vect_mask_var
4646 : vect_simple_var
4647 : vect_scalar_var;
4648 type = vectype ? vectype : TREE_TYPE (scalar_dest);
4649
4650 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
4651
4652 name = get_name (scalar_dest);
4653 if (name)
4654 new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
4655 else
4656 new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
4657 vec_dest = vect_get_new_vect_var (type, kind, new_name);
4658 free (new_name);
4659
4660 return vec_dest;
4661 }
4662
4663 /* Function vect_grouped_store_supported.
4664
4665 Returns TRUE if interleave high and interleave low permutations
4666 are supported, and FALSE otherwise. */
4667
4668 bool
4669 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
4670 {
4671 machine_mode mode = TYPE_MODE (vectype);
4672
4673 /* vect_permute_store_chain requires the group size to be equal to 3 or
4674 be a power of two. */
4675 if (count != 3 && exact_log2 (count) == -1)
4676 {
4677 if (dump_enabled_p ())
4678 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4679 "the size of the group of accesses"
4680 " is not a power of 2 or not eqaul to 3\n");
4681 return false;
4682 }
4683
4684 /* Check that the permutation is supported. */
4685 if (VECTOR_MODE_P (mode))
4686 {
4687 unsigned int i, nelt = GET_MODE_NUNITS (mode);
4688 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4689
4690 if (count == 3)
4691 {
4692 unsigned int j0 = 0, j1 = 0, j2 = 0;
4693 unsigned int i, j;
4694
4695 for (j = 0; j < 3; j++)
4696 {
4697 int nelt0 = ((3 - j) * nelt) % 3;
4698 int nelt1 = ((3 - j) * nelt + 1) % 3;
4699 int nelt2 = ((3 - j) * nelt + 2) % 3;
4700 for (i = 0; i < nelt; i++)
4701 {
4702 if (3 * i + nelt0 < nelt)
4703 sel[3 * i + nelt0] = j0++;
4704 if (3 * i + nelt1 < nelt)
4705 sel[3 * i + nelt1] = nelt + j1++;
4706 if (3 * i + nelt2 < nelt)
4707 sel[3 * i + nelt2] = 0;
4708 }
4709 if (!can_vec_perm_p (mode, false, sel))
4710 {
4711 if (dump_enabled_p ())
4712 dump_printf (MSG_MISSED_OPTIMIZATION,
4713 "permutaion op not supported by target.\n");
4714 return false;
4715 }
4716
4717 for (i = 0; i < nelt; i++)
4718 {
4719 if (3 * i + nelt0 < nelt)
4720 sel[3 * i + nelt0] = 3 * i + nelt0;
4721 if (3 * i + nelt1 < nelt)
4722 sel[3 * i + nelt1] = 3 * i + nelt1;
4723 if (3 * i + nelt2 < nelt)
4724 sel[3 * i + nelt2] = nelt + j2++;
4725 }
4726 if (!can_vec_perm_p (mode, false, sel))
4727 {
4728 if (dump_enabled_p ())
4729 dump_printf (MSG_MISSED_OPTIMIZATION,
4730 "permutaion op not supported by target.\n");
4731 return false;
4732 }
4733 }
4734 return true;
4735 }
4736 else
4737 {
4738 /* If length is not equal to 3 then only power of 2 is supported. */
4739 gcc_assert (exact_log2 (count) != -1);
4740
4741 for (i = 0; i < nelt / 2; i++)
4742 {
4743 sel[i * 2] = i;
4744 sel[i * 2 + 1] = i + nelt;
4745 }
4746 if (can_vec_perm_p (mode, false, sel))
4747 {
4748 for (i = 0; i < nelt; i++)
4749 sel[i] += nelt / 2;
4750 if (can_vec_perm_p (mode, false, sel))
4751 return true;
4752 }
4753 }
4754 }
4755
4756 if (dump_enabled_p ())
4757 dump_printf (MSG_MISSED_OPTIMIZATION,
4758 "permutaion op not supported by target.\n");
4759 return false;
4760 }
4761
4762
4763 /* Return TRUE if vec_store_lanes is available for COUNT vectors of
4764 type VECTYPE. */
4765
4766 bool
4767 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
4768 {
4769 return vect_lanes_optab_supported_p ("vec_store_lanes",
4770 vec_store_lanes_optab,
4771 vectype, count);
4772 }
4773
4774
4775 /* Function vect_permute_store_chain.
4776
4777 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
4778 a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
4779 the data correctly for the stores. Return the final references for stores
4780 in RESULT_CHAIN.
4781
4782 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
4783 The input is 4 vectors each containing 8 elements. We assign a number to
4784 each element, the input sequence is:
4785
4786 1st vec: 0 1 2 3 4 5 6 7
4787 2nd vec: 8 9 10 11 12 13 14 15
4788 3rd vec: 16 17 18 19 20 21 22 23
4789 4th vec: 24 25 26 27 28 29 30 31
4790
4791 The output sequence should be:
4792
4793 1st vec: 0 8 16 24 1 9 17 25
4794 2nd vec: 2 10 18 26 3 11 19 27
4795    3rd vec:   4 12 20 28  5 13 21 29
4796 4th vec: 6 14 22 30 7 15 23 31
4797
4798 i.e., we interleave the contents of the four vectors in their order.
4799
4800 We use interleave_high/low instructions to create such output. The input of
4801 each interleave_high/low operation is two vectors:
4802 1st vec 2nd vec
4803 0 1 2 3 4 5 6 7
4804 the even elements of the result vector are obtained left-to-right from the
4805 high/low elements of the first vector. The odd elements of the result are
4806 obtained left-to-right from the high/low elements of the second vector.
4807 The output of interleave_high will be: 0 4 1 5
4808 and of interleave_low: 2 6 3 7
4809
4810
4811 The permutation is done in log LENGTH stages. In each stage interleave_high
4812 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
4813 where the first argument is taken from the first half of DR_CHAIN and the
4814    second argument from its second half.
4815 In our example,
4816
4817 I1: interleave_high (1st vec, 3rd vec)
4818 I2: interleave_low (1st vec, 3rd vec)
4819 I3: interleave_high (2nd vec, 4th vec)
4820 I4: interleave_low (2nd vec, 4th vec)
4821
4822 The output for the first stage is:
4823
4824 I1: 0 16 1 17 2 18 3 19
4825 I2: 4 20 5 21 6 22 7 23
4826 I3: 8 24 9 25 10 26 11 27
4827 I4: 12 28 13 29 14 30 15 31
4828
4829 The output of the second stage, i.e. the final result is:
4830
4831 I1: 0 8 16 24 1 9 17 25
4832 I2: 2 10 18 26 3 11 19 27
4833    I3:  4 12 20 28  5 13 21 29
4834 I4: 6 14 22 30 7 15 23 31. */
4835
4836 void
4837 vect_permute_store_chain (vec<tree> dr_chain,
4838 unsigned int length,
4839 gimple *stmt,
4840 gimple_stmt_iterator *gsi,
4841 vec<tree> *result_chain)
4842 {
4843 tree vect1, vect2, high, low;
4844 gimple *perm_stmt;
4845 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
4846 tree perm_mask_low, perm_mask_high;
4847 tree data_ref;
4848 tree perm3_mask_low, perm3_mask_high;
4849 unsigned int i, n, log_length = exact_log2 (length);
4850 unsigned int j, nelt = TYPE_VECTOR_SUBPARTS (vectype);
4851 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4852
4853 result_chain->quick_grow (length);
4854 memcpy (result_chain->address (), dr_chain.address (),
4855 length * sizeof (tree));
4856
4857 if (length == 3)
4858 {
4859 unsigned int j0 = 0, j1 = 0, j2 = 0;
4860
4861 for (j = 0; j < 3; j++)
4862 {
4863 int nelt0 = ((3 - j) * nelt) % 3;
4864 int nelt1 = ((3 - j) * nelt + 1) % 3;
4865 int nelt2 = ((3 - j) * nelt + 2) % 3;
4866
4867 for (i = 0; i < nelt; i++)
4868 {
4869 if (3 * i + nelt0 < nelt)
4870 sel[3 * i + nelt0] = j0++;
4871 if (3 * i + nelt1 < nelt)
4872 sel[3 * i + nelt1] = nelt + j1++;
4873 if (3 * i + nelt2 < nelt)
4874 sel[3 * i + nelt2] = 0;
4875 }
4876 perm3_mask_low = vect_gen_perm_mask_checked (vectype, sel);
4877
4878 for (i = 0; i < nelt; i++)
4879 {
4880 if (3 * i + nelt0 < nelt)
4881 sel[3 * i + nelt0] = 3 * i + nelt0;
4882 if (3 * i + nelt1 < nelt)
4883 sel[3 * i + nelt1] = 3 * i + nelt1;
4884 if (3 * i + nelt2 < nelt)
4885 sel[3 * i + nelt2] = nelt + j2++;
4886 }
4887 perm3_mask_high = vect_gen_perm_mask_checked (vectype, sel);
4888
4889 vect1 = dr_chain[0];
4890 vect2 = dr_chain[1];
4891
4892 /* Create interleaving stmt:
4893 low = VEC_PERM_EXPR <vect1, vect2,
4894 {j, nelt, *, j + 1, nelt + j + 1, *,
4895 j + 2, nelt + j + 2, *, ...}> */
4896 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
4897 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
4898 vect2, perm3_mask_low);
4899 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4900
4901 vect1 = data_ref;
4902 vect2 = dr_chain[2];
4903 /* Create interleaving stmt:
4904 low = VEC_PERM_EXPR <vect1, vect2,
4905 {0, 1, nelt + j, 3, 4, nelt + j + 1,
4906 6, 7, nelt + j + 2, ...}> */
4907 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
4908 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
4909 vect2, perm3_mask_high);
4910 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4911 (*result_chain)[j] = data_ref;
4912 }
4913 }
4914 else
4915 {
4916 /* If length is not equal to 3 then only power of 2 is supported. */
4917 gcc_assert (exact_log2 (length) != -1);
4918
4919 for (i = 0, n = nelt / 2; i < n; i++)
4920 {
4921 sel[i * 2] = i;
4922 sel[i * 2 + 1] = i + nelt;
4923 }
4924 perm_mask_high = vect_gen_perm_mask_checked (vectype, sel);
4925
4926 for (i = 0; i < nelt; i++)
4927 sel[i] += nelt / 2;
4928 perm_mask_low = vect_gen_perm_mask_checked (vectype, sel);
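      /* For the V8HI example in the function comment (nelt == 8) these are
         {0, 8, 1, 9, 2, 10, 3, 11} for PERM_MASK_HIGH and
         {4, 12, 5, 13, 6, 14, 7, 15} for PERM_MASK_LOW.  */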
4929
4930 for (i = 0, n = log_length; i < n; i++)
4931 {
4932 for (j = 0; j < length/2; j++)
4933 {
4934 vect1 = dr_chain[j];
4935 vect2 = dr_chain[j+length/2];
4936
4937 /* Create interleaving stmt:
4938 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
4939 ...}> */
4940 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
4941 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
4942 vect2, perm_mask_high);
4943 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4944 (*result_chain)[2*j] = high;
4945
4946 /* Create interleaving stmt:
4947 low = VEC_PERM_EXPR <vect1, vect2,
4948 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
4949 ...}> */
4950 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
4951 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
4952 vect2, perm_mask_low);
4953 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4954 (*result_chain)[2*j+1] = low;
4955 }
4956 memcpy (dr_chain.address (), result_chain->address (),
4957 length * sizeof (tree));
4958 }
4959 }
4960 }
4961
4962 /* Function vect_setup_realignment
4963
4964 This function is called when vectorizing an unaligned load using
4965 the dr_explicit_realign[_optimized] scheme.
4966 This function generates the following code at the loop prolog:
4967
4968 p = initial_addr;
4969 x msq_init = *(floor(p)); # prolog load
4970 realignment_token = call target_builtin;
4971 loop:
4972 x msq = phi (msq_init, ---)
4973
4974 The stmts marked with x are generated only for the case of
4975 dr_explicit_realign_optimized.
4976
4977 The code above sets up a new (vector) pointer, pointing to the first
4978 location accessed by STMT, and a "floor-aligned" load using that pointer.
4979 It also generates code to compute the "realignment-token" (if the relevant
4980 target hook was defined), and creates a phi-node at the loop-header bb
4981 whose arguments are the result of the prolog-load (created by this
4982 function) and the result of a load that takes place in the loop (to be
4983 created by the caller to this function).
4984
4985 For the case of dr_explicit_realign_optimized:
4986 The caller to this function uses the phi-result (msq) to create the
4987 realignment code inside the loop, and sets up the missing phi argument,
4988 as follows:
4989 loop:
4990 msq = phi (msq_init, lsq)
4991 lsq = *(floor(p')); # load in loop
4992 result = realign_load (msq, lsq, realignment_token);
4993
4994 For the case of dr_explicit_realign:
4995 loop:
4996 msq = *(floor(p)); # load in loop
4997 p' = p + (VS-1);
4998 lsq = *(floor(p')); # load in loop
4999 result = realign_load (msq, lsq, realignment_token);
5000
5001 Input:
5002 STMT - (scalar) load stmt to be vectorized. This load accesses
5003 a memory location that may be unaligned.
5004 BSI - place where new code is to be inserted.
5005 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
5006 is used.
5007
5008 Output:
5009 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
5010 target hook, if defined.
5011 Return value - the result of the loop-header phi node. */
5012
5013 tree
5014 vect_setup_realignment (gimple *stmt, gimple_stmt_iterator *gsi,
5015 tree *realignment_token,
5016 enum dr_alignment_support alignment_support_scheme,
5017 tree init_addr,
5018 struct loop **at_loop)
5019 {
5020 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5021 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5022 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5023 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
5024 struct loop *loop = NULL;
5025 edge pe = NULL;
5026 tree scalar_dest = gimple_assign_lhs (stmt);
5027 tree vec_dest;
5028 gimple *inc;
5029 tree ptr;
5030 tree data_ref;
5031 basic_block new_bb;
5032 tree msq_init = NULL_TREE;
5033 tree new_temp;
5034 gphi *phi_stmt;
5035 tree msq = NULL_TREE;
5036 gimple_seq stmts = NULL;
5037 bool inv_p;
5038 bool compute_in_loop = false;
5039 bool nested_in_vect_loop = false;
5040 struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
5041 struct loop *loop_for_initial_load = NULL;
5042
5043 if (loop_vinfo)
5044 {
5045 loop = LOOP_VINFO_LOOP (loop_vinfo);
5046 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
5047 }
5048
5049 gcc_assert (alignment_support_scheme == dr_explicit_realign
5050 || alignment_support_scheme == dr_explicit_realign_optimized);
5051
5052 /* We need to generate three things:
5053 1. the misalignment computation
5054 2. the extra vector load (for the optimized realignment scheme).
5055 3. the phi node for the two vectors from which the realignment is
5056 done (for the optimized realignment scheme). */
5057
5058 /* 1. Determine where to generate the misalignment computation.
5059
5060 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
5061 calculation will be generated by this function, outside the loop (in the
5062 preheader). Otherwise, INIT_ADDR had already been computed for us by the
5063 caller, inside the loop.
5064
5065 Background: If the misalignment remains fixed throughout the iterations of
5066 the loop, then both realignment schemes are applicable, and also the
5067 misalignment computation can be done outside LOOP. This is because we are
5068 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
5069 are a multiple of VS (the Vector Size), and therefore the misalignment in
5070 different vectorized LOOP iterations is always the same.
5071 The problem arises only if the memory access is in an inner-loop nested
5072 inside LOOP, which is now being vectorized using outer-loop vectorization.
5073 This is the only case when the misalignment of the memory access may not
5074 remain fixed throughout the iterations of the inner-loop (as explained in
5075 detail in vect_supportable_dr_alignment). In this case, not only is the
5076 optimized realignment scheme not applicable, but also the misalignment
5077 computation (and generation of the realignment token that is passed to
5078 REALIGN_LOAD) have to be done inside the loop.
5079
5080 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
5081 or not, which in turn determines if the misalignment is computed inside
5082 the inner-loop, or outside LOOP. */
5083
5084 if (init_addr != NULL_TREE || !loop_vinfo)
5085 {
5086 compute_in_loop = true;
5087 gcc_assert (alignment_support_scheme == dr_explicit_realign);
5088 }
5089
5090
5091 /* 2. Determine where to generate the extra vector load.
5092
5093 For the optimized realignment scheme, instead of generating two vector
5094 loads in each iteration, we generate a single extra vector load in the
5095 preheader of the loop, and in each iteration reuse the result of the
5096 vector load from the previous iteration. In case the memory access is in
5097 an inner-loop nested inside LOOP, which is now being vectorized using
5098 outer-loop vectorization, we need to determine whether this initial vector
5099 load should be generated at the preheader of the inner-loop, or can be
5100 generated at the preheader of LOOP. If the memory access has no evolution
5101 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
5102 to be generated inside LOOP (in the preheader of the inner-loop). */
5103
5104 if (nested_in_vect_loop)
5105 {
5106 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
5107 bool invariant_in_outerloop =
5108 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
5109 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
5110 }
5111 else
5112 loop_for_initial_load = loop;
5113 if (at_loop)
5114 *at_loop = loop_for_initial_load;
5115
5116 if (loop_for_initial_load)
5117 pe = loop_preheader_edge (loop_for_initial_load);
5118
5119 /* 3. For the case of the optimized realignment, create the first vector
5120 load at the loop preheader. */
5121
5122 if (alignment_support_scheme == dr_explicit_realign_optimized)
5123 {
5124 /* Create msq_init = *(floor(p1)) in the loop preheader */
5125 gassign *new_stmt;
5126
5127 gcc_assert (!compute_in_loop);
5128 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5129 ptr = vect_create_data_ref_ptr (stmt, vectype, loop_for_initial_load,
5130 NULL_TREE, &init_addr, NULL, &inc,
5131 true, &inv_p);
5132 if (TREE_CODE (ptr) == SSA_NAME)
5133 new_temp = copy_ssa_name (ptr);
5134 else
5135 new_temp = make_ssa_name (TREE_TYPE (ptr));
5136 new_stmt = gimple_build_assign
5137 (new_temp, BIT_AND_EXPR, ptr,
5138 build_int_cst (TREE_TYPE (ptr),
5139 -(HOST_WIDE_INT)TYPE_ALIGN_UNIT (vectype)));
5140 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5141 gcc_assert (!new_bb);
5142 data_ref
5143 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
5144 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
5145 new_stmt = gimple_build_assign (vec_dest, data_ref);
5146 new_temp = make_ssa_name (vec_dest, new_stmt);
5147 gimple_assign_set_lhs (new_stmt, new_temp);
5148 if (pe)
5149 {
5150 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5151 gcc_assert (!new_bb);
5152 }
5153 else
5154 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5155
5156 msq_init = gimple_assign_lhs (new_stmt);
5157 }
5158
5159 /* 4. Create realignment token using a target builtin, if available.
5160 It is done either inside the containing loop, or before LOOP (as
5161 determined above). */
5162
5163 if (targetm.vectorize.builtin_mask_for_load)
5164 {
5165 gcall *new_stmt;
5166 tree builtin_decl;
5167
5168 /* Compute INIT_ADDR - the initial address accessed by this memref. */
5169 if (!init_addr)
5170 {
5171 /* Generate the INIT_ADDR computation outside LOOP. */
5172 init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
5173 NULL_TREE, loop);
5174 if (loop)
5175 {
5176 pe = loop_preheader_edge (loop);
5177 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
5178 gcc_assert (!new_bb);
5179 }
5180 else
5181 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
5182 }
5183
5184 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
5185 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
5186 vec_dest =
5187 vect_create_destination_var (scalar_dest,
5188 gimple_call_return_type (new_stmt));
5189 new_temp = make_ssa_name (vec_dest, new_stmt);
5190 gimple_call_set_lhs (new_stmt, new_temp);
5191
5192 if (compute_in_loop)
5193 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5194 else
5195 {
5196 /* Generate the misalignment computation outside LOOP. */
5197 pe = loop_preheader_edge (loop);
5198 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5199 gcc_assert (!new_bb);
5200 }
5201
5202 *realignment_token = gimple_call_lhs (new_stmt);
5203
5204 /* The result of the CALL_EXPR to this builtin is determined from
5205 the value of the parameter and no global variables are touched
5206 which makes the builtin a "const" function. Requiring the
5207 builtin to have the "const" attribute makes it unnecessary
5208 to call mark_call_clobbered. */
5209 gcc_assert (TREE_READONLY (builtin_decl));
5210 }
5211
5212 if (alignment_support_scheme == dr_explicit_realign)
5213 return msq;
5214
5215 gcc_assert (!compute_in_loop);
5216 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
5217
5218
5219 /* 5. Create msq = phi <msq_init, lsq> in loop */
5220
5221 pe = loop_preheader_edge (containing_loop);
5222 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5223 msq = make_ssa_name (vec_dest);
5224 phi_stmt = create_phi_node (msq, containing_loop->header);
5225 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
5226
5227 return msq;
5228 }
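
/* A minimal standalone model of the address flooring used above to build
   msq_init = *(floor(p1)): AND-ing the pointer value with the negated
   vector alignment (a power of two) clears the low bits and rounds the
   address down to the previous aligned boundary.  The model and the
   16-byte alignment in the example below are illustrative assumptions,
   not part of the vectorizer API.  */

#include <stdint.h>

static inline uintptr_t
model_floor_address (uintptr_t addr, uintptr_t vec_align_unit)
{
  /* Mirrors BIT_AND_EXPR (ptr, -TYPE_ALIGN_UNIT (vectype)).  */
  return addr & -vec_align_unit;
}

/* For example, model_floor_address (0x1009, 16) yields 0x1000, the aligned
   address from which the preheader vector load (msq_init) is issued.  */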
5229
5230
5231 /* Function vect_grouped_load_supported.
5232
5233 COUNT is the size of the load group (the number of statements plus the
5234 number of gaps). SINGLE_ELEMENT_P is true if there is actually
5235 only one statement, with a gap of COUNT - 1.
5236
5237 Returns true if a suitable permute exists. */
5238
5239 bool
5240 vect_grouped_load_supported (tree vectype, bool single_element_p,
5241 unsigned HOST_WIDE_INT count)
5242 {
5243 machine_mode mode = TYPE_MODE (vectype);
5244
5245 /* If this is single-element interleaving with an element distance
5246 that leaves unused vector loads around, punt - we would at least create
5247 very sub-optimal code in that case (and blow up memory,
5248 see PR65518). */
5249 if (single_element_p && count > TYPE_VECTOR_SUBPARTS (vectype))
5250 {
5251 if (dump_enabled_p ())
5252 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5253 "single-element interleaving not supported "
5254 "for not adjacent vector loads\n");
5255 return false;
5256 }
5257
5258 /* vect_permute_load_chain requires the group size to be equal to 3 or
5259 be a power of two. */
5260 if (count != 3 && exact_log2 (count) == -1)
5261 {
5262 if (dump_enabled_p ())
5263 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5264 "the size of the group of accesses"
5265 " is not a power of 2 or not equal to 3\n");
5266 return false;
5267 }
5268
5269 /* Check that the permutation is supported. */
5270 if (VECTOR_MODE_P (mode))
5271 {
5272 unsigned int i, j, nelt = GET_MODE_NUNITS (mode);
5273 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5274
5275 if (count == 3)
5276 {
5277 unsigned int k;
5278 for (k = 0; k < 3; k++)
5279 {
5280 for (i = 0; i < nelt; i++)
5281 if (3 * i + k < 2 * nelt)
5282 sel[i] = 3 * i + k;
5283 else
5284 sel[i] = 0;
5285 if (!can_vec_perm_p (mode, false, sel))
5286 {
5287 if (dump_enabled_p ())
5288 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5289 "shuffle of 3 loads is not supported by"
5290 " target\n");
5291 return false;
5292 }
5293 for (i = 0, j = 0; i < nelt; i++)
5294 if (3 * i + k < 2 * nelt)
5295 sel[i] = i;
5296 else
5297 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5298 if (!can_vec_perm_p (mode, false, sel))
5299 {
5300 if (dump_enabled_p ())
5301 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5302 "shuffle of 3 loads is not supported by"
5303 " target\n");
5304 return false;
5305 }
5306 }
5307 return true;
5308 }
5309 else
5310 {
5311 /* If length is not equal to 3 then only a power of 2 is supported. */
5312 gcc_assert (exact_log2 (count) != -1);
5313 for (i = 0; i < nelt; i++)
5314 sel[i] = i * 2;
5315 if (can_vec_perm_p (mode, false, sel))
5316 {
5317 for (i = 0; i < nelt; i++)
5318 sel[i] = i * 2 + 1;
5319 if (can_vec_perm_p (mode, false, sel))
5320 return true;
5321 }
5322 }
5323 }
5324
5325 if (dump_enabled_p ())
5326 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5327 "extract even/odd not supported by target\n");
5328 return false;
5329 }
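
/* A standalone sketch of the selector vectors whose support is queried
   above: SEL3 receives the two selectors per K used for a group of 3,
   SELEO the extract-even/extract-odd selectors used for power-of-two
   groups.  The fixed bound of 64 elements and the array-of-arrays
   interface are assumptions of this model only.  */

static void
model_grouped_load_selectors (unsigned nelt,
			      unsigned char sel3[3][2][64],
			      unsigned char seleo[2][64])
{
  for (unsigned k = 0; k < 3; k++)
    {
      unsigned j = 0;
      /* First shuffle: pick every third element starting at K from the
	 concatenation of the first two vectors.  */
      for (unsigned i = 0; i < nelt; i++)
	sel3[k][0][i] = 3 * i + k < 2 * nelt ? 3 * i + k : 0;
      /* Second shuffle: keep what was already picked and fill the tail
	 from the third vector.  */
      for (unsigned i = 0; i < nelt; i++)
	sel3[k][1][i] = (3 * i + k < 2 * nelt
			 ? i : nelt + ((nelt + k) % 3) + 3 * j++);
    }

  for (unsigned i = 0; i < nelt; i++)
    {
      seleo[0][i] = i * 2;	/* extract even: 0 2 4 6 ...  */
      seleo[1][i] = i * 2 + 1;	/* extract odd:  1 3 5 7 ...  */
    }
}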
5330
5331 /* Return TRUE if vec_load_lanes is available for COUNT vectors of
5332 type VECTYPE. */
5333
5334 bool
5335 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
5336 {
5337 return vect_lanes_optab_supported_p ("vec_load_lanes",
5338 vec_load_lanes_optab,
5339 vectype, count);
5340 }
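
/* A scalar sketch of what a vec_load_lanes operation provides, assuming
   COUNT == 2 and 4-element vectors: a single operation that loads COUNT
   interleaved structures and de-interleaves them into COUNT vectors,
   which is what the optab query above tests for.  The function and its
   fixed sizes are illustrative only.  */

static void
model_load_2_lanes (const int *src, int lane0[4], int lane1[4])
{
  for (unsigned i = 0; i < 4; i++)
    {
      lane0[i] = src[2 * i];		/* field 0 of each structure  */
      lane1[i] = src[2 * i + 1];	/* field 1 of each structure  */
    }
}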
5341
5342 /* Function vect_permute_load_chain.
5343
5344 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5345 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5346 the input data correctly. Return the final references for loads in
5347 RESULT_CHAIN.
5348
5349 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5350 The input is 4 vectors each containing 8 elements. We assign a number to each
5351 element, the input sequence is:
5352
5353 1st vec: 0 1 2 3 4 5 6 7
5354 2nd vec: 8 9 10 11 12 13 14 15
5355 3rd vec: 16 17 18 19 20 21 22 23
5356 4th vec: 24 25 26 27 28 29 30 31
5357
5358 The output sequence should be:
5359
5360 1st vec: 0 4 8 12 16 20 24 28
5361 2nd vec: 1 5 9 13 17 21 25 29
5362 3rd vec: 2 6 10 14 18 22 26 30
5363 4th vec: 3 7 11 15 19 23 27 31
5364
5365 i.e., the first output vector should contain the first elements of each
5366 interleaving group, etc.
5367
5368 We use extract_even/odd instructions to create such output. The input of
5369 each extract_even/odd operation is two vectors
5370 1st vec 2nd vec
5371 0 1 2 3 4 5 6 7
5372
5373 and the output is the vector of extracted even/odd elements. The output of
5374 extract_even will be: 0 2 4 6
5375 and of extract_odd: 1 3 5 7
5376
5377
5378 The permutation is done in log LENGTH stages. In each stage extract_even
5379 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5380 their order. In our example,
5381
5382 E1: extract_even (1st vec, 2nd vec)
5383 E2: extract_odd (1st vec, 2nd vec)
5384 E3: extract_even (3rd vec, 4th vec)
5385 E4: extract_odd (3rd vec, 4th vec)
5386
5387 The output for the first stage will be:
5388
5389 E1: 0 2 4 6 8 10 12 14
5390 E2: 1 3 5 7 9 11 13 15
5391 E3: 16 18 20 22 24 26 28 30
5392 E4: 17 19 21 23 25 27 29 31
5393
5394 In order to proceed and create the correct sequence for the next stage (or
5395 for the correct output, if the second stage is the last one, as in our
5396 example), we first put the output of the extract_even operation and then the
5397 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5398 The input for the second stage is:
5399
5400 1st vec (E1): 0 2 4 6 8 10 12 14
5401 2nd vec (E3): 16 18 20 22 24 26 28 30
5402 3rd vec (E2): 1 3 5 7 9 11 13 15
5403 4th vec (E4): 17 19 21 23 25 27 29 31
5404
5405 The output of the second stage:
5406
5407 E1: 0 4 8 12 16 20 24 28
5408 E2: 2 6 10 14 18 22 26 30
5409 E3: 1 5 9 13 17 21 25 29
5410 E4: 3 7 11 15 19 23 27 31
5411
5412 And RESULT_CHAIN after reordering:
5413
5414 1st vec (E1): 0 4 8 12 16 20 24 28
5415 2nd vec (E3): 1 5 9 13 17 21 25 29
5416 3rd vec (E2): 2 6 10 14 18 22 26 30
5417 4th vec (E4): 3 7 11 15 19 23 27 31. */
5418
5419 static void
5420 vect_permute_load_chain (vec<tree> dr_chain,
5421 unsigned int length,
5422 gimple *stmt,
5423 gimple_stmt_iterator *gsi,
5424 vec<tree> *result_chain)
5425 {
5426 tree data_ref, first_vect, second_vect;
5427 tree perm_mask_even, perm_mask_odd;
5428 tree perm3_mask_low, perm3_mask_high;
5429 gimple *perm_stmt;
5430 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5431 unsigned int i, j, log_length = exact_log2 (length);
5432 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
5433 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5434
5435 result_chain->quick_grow (length);
5436 memcpy (result_chain->address (), dr_chain.address (),
5437 length * sizeof (tree));
5438
5439 if (length == 3)
5440 {
5441 unsigned int k;
5442
5443 for (k = 0; k < 3; k++)
5444 {
5445 for (i = 0; i < nelt; i++)
5446 if (3 * i + k < 2 * nelt)
5447 sel[i] = 3 * i + k;
5448 else
5449 sel[i] = 0;
5450 perm3_mask_low = vect_gen_perm_mask_checked (vectype, sel);
5451
5452 for (i = 0, j = 0; i < nelt; i++)
5453 if (3 * i + k < 2 * nelt)
5454 sel[i] = i;
5455 else
5456 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5457
5458 perm3_mask_high = vect_gen_perm_mask_checked (vectype, sel);
5459
5460 first_vect = dr_chain[0];
5461 second_vect = dr_chain[1];
5462
5463 /* Create interleaving stmt (low part of):
5464 low = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
5465 ...}> */
5466 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5467 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5468 second_vect, perm3_mask_low);
5469 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5470
5471 /* Create interleaving stmt (high part of):
5472 high = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
5473 ...}> */
5474 first_vect = data_ref;
5475 second_vect = dr_chain[2];
5476 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5477 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5478 second_vect, perm3_mask_high);
5479 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5480 (*result_chain)[k] = data_ref;
5481 }
5482 }
5483 else
5484 {
5485 /* If length is not equal to 3 then only a power of 2 is supported. */
5486 gcc_assert (exact_log2 (length) != -1);
5487
5488 for (i = 0; i < nelt; ++i)
5489 sel[i] = i * 2;
5490 perm_mask_even = vect_gen_perm_mask_checked (vectype, sel);
5491
5492 for (i = 0; i < nelt; ++i)
5493 sel[i] = i * 2 + 1;
5494 perm_mask_odd = vect_gen_perm_mask_checked (vectype, sel);
5495
5496 for (i = 0; i < log_length; i++)
5497 {
5498 for (j = 0; j < length; j += 2)
5499 {
5500 first_vect = dr_chain[j];
5501 second_vect = dr_chain[j+1];
5502
5503 /* data_ref = permute_even (first_data_ref, second_data_ref); */
5504 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
5505 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5506 first_vect, second_vect,
5507 perm_mask_even);
5508 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5509 (*result_chain)[j/2] = data_ref;
5510
5511 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
5512 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
5513 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5514 first_vect, second_vect,
5515 perm_mask_odd);
5516 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5517 (*result_chain)[j/2+length/2] = data_ref;
5518 }
5519 memcpy (dr_chain.address (), result_chain->address (),
5520 length * sizeof (tree));
5521 }
5522 }
5523 }
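
/* A standalone scalar model of the extract-even/odd stages above, sized
   for the 4-vector, 8-element example in the comment; integer element
   values stand in for the SSA names in DR_CHAIN.  Running it on
   chain[v][i] = v * 8 + i produces the documented RESULT_CHAIN order
   0 4 ... 28 / 1 5 ... 29 / 2 6 ... 30 / 3 7 ... 31.  The fixed sizes
   are assumptions of the model only.  */

#define MODEL_NELT 8
#define MODEL_LEN 4

static void
model_permute_load_chain (int chain[MODEL_LEN][MODEL_NELT])
{
  int result[MODEL_LEN][MODEL_NELT];

  /* log2 (MODEL_LEN) == 2 stages of extract-even/extract-odd.  */
  for (int stage = 0; stage < 2; stage++)
    {
      for (int j = 0; j < MODEL_LEN; j += 2)
	for (int i = 0; i < MODEL_NELT; i++)
	  {
	    /* Even and odd elements of the concatenation
	       chain[j] | chain[j + 1].  */
	    int e = 2 * i, o = 2 * i + 1;
	    result[j / 2][i]
	      = e < MODEL_NELT ? chain[j][e] : chain[j + 1][e - MODEL_NELT];
	    result[j / 2 + MODEL_LEN / 2][i]
	      = o < MODEL_NELT ? chain[j][o] : chain[j + 1][o - MODEL_NELT];
	  }
      for (int v = 0; v < MODEL_LEN; v++)
	for (int i = 0; i < MODEL_NELT; i++)
	  chain[v][i] = result[v][i];
    }
}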
5524
5525 /* Function vect_shift_permute_load_chain.
5526
5527 Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
5528 a sequence of stmts to reorder the input data accordingly.
5529 Return the final references for loads in RESULT_CHAIN.
5530 Return true if successful, false otherwise.
5531
5532 E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
5533 The input is 3 vectors each containing 8 elements. We assign a
5534 number to each element, the input sequence is:
5535
5536 1st vec: 0 1 2 3 4 5 6 7
5537 2nd vec: 8 9 10 11 12 13 14 15
5538 3rd vec: 16 17 18 19 20 21 22 23
5539
5540 The output sequence should be:
5541
5542 1st vec: 0 3 6 9 12 15 18 21
5543 2nd vec: 1 4 7 10 13 16 19 22
5544 3rd vec: 2 5 8 11 14 17 20 23
5545
5546 We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
5547
5548 First we shuffle all 3 vectors to get correct elements order:
5549
5550 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
5551 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
5552 3rd vec: (16 19 22) (17 20 23) (18 21)
5553
5554 Next we unite and shift the vectors 3 times:
5555
5556 1st step:
5557 shift right by 6 the concatenation of:
5558 "1st vec" and "2nd vec"
5559 ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
5560 "2nd vec" and "3rd vec"
5561 ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
5562 "3rd vec" and "1st vec"
5563 (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
5564 | New vectors |
5565
5566 So that now new vectors are:
5567
5568 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
5569 2nd vec: (10 13) (16 19 22) (17 20 23)
5570 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
5571
5572 2nd step:
5573 shift right by 5 the concatenation of:
5574 "1st vec" and "3rd vec"
5575 ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
5576 "2nd vec" and "1st vec"
5577 (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
5578 "3rd vec" and "2nd vec"
5579 (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
5580 | New vectors |
5581
5582 So that now new vectors are:
5583
5584 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
5585 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
5586 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
5587
5588 3rd step:
5589 shift right by 5 the concatenation of:
5590 "1st vec" and "1st vec"
5591 ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
5592 shift right by 3 the concatenation of:
5593 "2nd vec" and "2nd vec"
5594 (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
5595 | New vectors |
5596
5597 So that now all vectors are READY:
5598 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
5599 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
5600 3rd vec: ( 1 4 7) (10 13) (16 19 22)
5601
5602 This algorithm is faster than the one in vect_permute_load_chain if:
5603 1. "shift of a concatenation" is faster than general permutation.
5604 This is usually so.
5605 2. The TARGET machine can't execute vector instructions in parallel.
5606 This is because each step of the algorithm depends on the previous one.
5607 The algorithm in vect_permute_load_chain is much more parallel.
5608
5609 The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
5610 */
5611
5612 static bool
5613 vect_shift_permute_load_chain (vec<tree> dr_chain,
5614 unsigned int length,
5615 gimple *stmt,
5616 gimple_stmt_iterator *gsi,
5617 vec<tree> *result_chain)
5618 {
5619 tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
5620 tree perm2_mask1, perm2_mask2, perm3_mask;
5621 tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
5622 gimple *perm_stmt;
5623
5624 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5625 unsigned int i;
5626 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
5627 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5628 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5629 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5630
5631 result_chain->quick_grow (length);
5632 memcpy (result_chain->address (), dr_chain.address (),
5633 length * sizeof (tree));
5634
5635 if (exact_log2 (length) != -1 && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 4)
5636 {
5637 unsigned int j, log_length = exact_log2 (length);
5638 for (i = 0; i < nelt / 2; ++i)
5639 sel[i] = i * 2;
5640 for (i = 0; i < nelt / 2; ++i)
5641 sel[nelt / 2 + i] = i * 2 + 1;
5642 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5643 {
5644 if (dump_enabled_p ())
5645 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5646 "shuffle of 2 fields structure is not \
5647 supported by target\n");
5648 return false;
5649 }
5650 perm2_mask1 = vect_gen_perm_mask_checked (vectype, sel);
5651
5652 for (i = 0; i < nelt / 2; ++i)
5653 sel[i] = i * 2 + 1;
5654 for (i = 0; i < nelt / 2; ++i)
5655 sel[nelt / 2 + i] = i * 2;
5656 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5657 {
5658 if (dump_enabled_p ())
5659 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5660 "shuffle of 2 fields structure is not \
5661 supported by target\n");
5662 return false;
5663 }
5664 perm2_mask2 = vect_gen_perm_mask_checked (vectype, sel);
5665
5666 /* Generating permutation constant to shift all elements.
5667 For vector length 8 it is {4 5 6 7 8 9 10 11}. */
5668 for (i = 0; i < nelt; i++)
5669 sel[i] = nelt / 2 + i;
5670 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5671 {
5672 if (dump_enabled_p ())
5673 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5674 "shift permutation is not supported by target\n");
5675 return false;
5676 }
5677 shift1_mask = vect_gen_perm_mask_checked (vectype, sel);
5678
5679 /* Generating permutation constant to select vector from 2.
5680 For vector length 8 it is {0 1 2 3 12 13 14 15}. */
5681 for (i = 0; i < nelt / 2; i++)
5682 sel[i] = i;
5683 for (i = nelt / 2; i < nelt; i++)
5684 sel[i] = nelt + i;
5685 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5686 {
5687 if (dump_enabled_p ())
5688 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5689 "select is not supported by target\n");
5690 return false;
5691 }
5692 select_mask = vect_gen_perm_mask_checked (vectype, sel);
5693
5694 for (i = 0; i < log_length; i++)
5695 {
5696 for (j = 0; j < length; j += 2)
5697 {
5698 first_vect = dr_chain[j];
5699 second_vect = dr_chain[j + 1];
5700
5701 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
5702 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5703 first_vect, first_vect,
5704 perm2_mask1);
5705 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5706 vect[0] = data_ref;
5707
5708 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
5709 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5710 second_vect, second_vect,
5711 perm2_mask2);
5712 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5713 vect[1] = data_ref;
5714
5715 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
5716 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5717 vect[0], vect[1], shift1_mask);
5718 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5719 (*result_chain)[j/2 + length/2] = data_ref;
5720
5721 data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
5722 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5723 vect[0], vect[1], select_mask);
5724 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5725 (*result_chain)[j/2] = data_ref;
5726 }
5727 memcpy (dr_chain.address (), result_chain->address (),
5728 length * sizeof (tree));
5729 }
5730 return true;
5731 }
5732 if (length == 3 && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 2)
5733 {
5734 unsigned int k = 0, l = 0;
5735
5736 /* Generating permutation constant to get all elements in the right order.
5737 For vector length 8 it is {0 3 6 1 4 7 2 5}. */
5738 for (i = 0; i < nelt; i++)
5739 {
5740 if (3 * k + (l % 3) >= nelt)
5741 {
5742 k = 0;
5743 l += (3 - (nelt % 3));
5744 }
5745 sel[i] = 3 * k + (l % 3);
5746 k++;
5747 }
5748 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5749 {
5750 if (dump_enabled_p ())
5751 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5752 "shuffle of 3 fields structure is not \
5753 supported by target\n");
5754 return false;
5755 }
5756 perm3_mask = vect_gen_perm_mask_checked (vectype, sel);
5757
5758 /* Generating permutation constant to shift all elements.
5759 For vector length 8 it is {6 7 8 9 10 11 12 13}. */
5760 for (i = 0; i < nelt; i++)
5761 sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
5762 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5763 {
5764 if (dump_enabled_p ())
5765 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5766 "shift permutation is not supported by target\n");
5767 return false;
5768 }
5769 shift1_mask = vect_gen_perm_mask_checked (vectype, sel);
5770
5771 /* Generating permutation constant to shift all elements.
5772 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
5773 for (i = 0; i < nelt; i++)
5774 sel[i] = 2 * (nelt / 3) + 1 + i;
5775 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5776 {
5777 if (dump_enabled_p ())
5778 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5779 "shift permutation is not supported by target\n");
5780 return false;
5781 }
5782 shift2_mask = vect_gen_perm_mask_checked (vectype, sel);
5783
5784 /* Generating permutation constant to shift all elements.
5785 For vector length 8 it is {3 4 5 6 7 8 9 10}. */
5786 for (i = 0; i < nelt; i++)
5787 sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
5788 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5789 {
5790 if (dump_enabled_p ())
5791 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5792 "shift permutation is not supported by target\n");
5793 return false;
5794 }
5795 shift3_mask = vect_gen_perm_mask_checked (vectype, sel);
5796
5797 /* Generating permutation constant to shift all elements.
5798 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
5799 for (i = 0; i < nelt; i++)
5800 sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
5801 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5802 {
5803 if (dump_enabled_p ())
5804 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5805 "shift permutation is not supported by target\n");
5806 return false;
5807 }
5808 shift4_mask = vect_gen_perm_mask_checked (vectype, sel);
5809
5810 for (k = 0; k < 3; k++)
5811 {
5812 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
5813 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5814 dr_chain[k], dr_chain[k],
5815 perm3_mask);
5816 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5817 vect[k] = data_ref;
5818 }
5819
5820 for (k = 0; k < 3; k++)
5821 {
5822 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
5823 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5824 vect[k % 3], vect[(k + 1) % 3],
5825 shift1_mask);
5826 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5827 vect_shift[k] = data_ref;
5828 }
5829
5830 for (k = 0; k < 3; k++)
5831 {
5832 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
5833 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5834 vect_shift[(4 - k) % 3],
5835 vect_shift[(3 - k) % 3],
5836 shift2_mask);
5837 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5838 vect[k] = data_ref;
5839 }
5840
5841 (*result_chain)[3 - (nelt % 3)] = vect[2];
5842
5843 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
5844 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
5845 vect[0], shift3_mask);
5846 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5847 (*result_chain)[nelt % 3] = data_ref;
5848
5849 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
5850 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
5851 vect[1], shift4_mask);
5852 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5853 (*result_chain)[0] = data_ref;
5854 return true;
5855 }
5856 return false;
5857 }
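
/* A standalone sketch of the five selector vectors built above for the
   LENGTH == 3 path, assuming an 8-element vector (nelt == 8):
   perm3  {0 3 6 1 4 7 2 5}, shift1 {6..13}, shift2 {5..12},
   shift3 {3..10} and shift4 {5..12}.  Only the sel[] contents are
   recomputed here; the VEC_PERM_EXPRs themselves are not modelled.  */

static void
model_shift3_selectors (unsigned nelt, unsigned char sel[5][64])
{
  unsigned i, k = 0, l = 0;

  for (i = 0; i < nelt; i++)
    {
      if (3 * k + (l % 3) >= nelt)
	{
	  k = 0;
	  l += 3 - (nelt % 3);
	}
      sel[0][i] = 3 * k + (l % 3);			/* perm3  */
      k++;
    }
  for (i = 0; i < nelt; i++)
    {
      sel[1][i] = 2 * (nelt / 3) + (nelt % 3) + i;	/* shift1  */
      sel[2][i] = 2 * (nelt / 3) + 1 + i;		/* shift2  */
      sel[3][i] = (nelt / 3) + (nelt % 3) / 2 + i;	/* shift3  */
      sel[4][i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;	/* shift4  */
    }
}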
5858
5859 /* Function vect_transform_grouped_load.
5860
5861 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
5862 to perform their permutation and ascribe the result vectorized statements to
5863 the scalar statements.
5864 */
5865
5866 void
5867 vect_transform_grouped_load (gimple *stmt, vec<tree> dr_chain, int size,
5868 gimple_stmt_iterator *gsi)
5869 {
5870 machine_mode mode;
5871 vec<tree> result_chain = vNULL;
5872
5873 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
5874 RESULT_CHAIN is the output of vect_permute_load_chain; it contains permuted
5875 vectors that are ready for vector computation. */
5876 result_chain.create (size);
5877
5878 /* If the reassociation width for the vector type is 2 or greater, the target
5879 machine can execute 2 or more vector instructions in parallel. Otherwise try
5880 to get the chain for the load group using vect_shift_permute_load_chain. */
5881 mode = TYPE_MODE (STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)));
5882 if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
5883 || exact_log2 (size) != -1
5884 || !vect_shift_permute_load_chain (dr_chain, size, stmt,
5885 gsi, &result_chain))
5886 vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain);
5887 vect_record_grouped_load_vectors (stmt, result_chain);
5888 result_chain.release ();
5889 }
5890
5891 /* RESULT_CHAIN contains the output of a group of grouped loads that were
5892 generated as part of the vectorization of STMT. Assign the statement
5893 for each vector to the associated scalar statement. */
5894
5895 void
5896 vect_record_grouped_load_vectors (gimple *stmt, vec<tree> result_chain)
5897 {
5898 gimple *first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
5899 gimple *next_stmt, *new_stmt;
5900 unsigned int i, gap_count;
5901 tree tmp_data_ref;
5902
5903 /* Put a permuted data-ref in the VECTORIZED_STMT field.
5904 Since we scan the chain starting from its first node, their order
5905 corresponds to the order of data-refs in RESULT_CHAIN. */
5906 next_stmt = first_stmt;
5907 gap_count = 1;
5908 FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
5909 {
5910 if (!next_stmt)
5911 break;
5912
5913 /* Skip the gaps. Loads created for the gaps will be removed by the dead
5914 code elimination pass later. No need to check for the first stmt in
5915 the group, since it always exists.
5916 GROUP_GAP is the number of steps in elements from the previous
5917 access (if there is no gap GROUP_GAP is 1). We skip loads that
5918 correspond to the gaps. */
5919 if (next_stmt != first_stmt
5920 && gap_count < GROUP_GAP (vinfo_for_stmt (next_stmt)))
5921 {
5922 gap_count++;
5923 continue;
5924 }
5925
5926 while (next_stmt)
5927 {
5928 new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
5929 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
5930 copies, and we put the new vector statement in the first available
5931 RELATED_STMT. */
5932 if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
5933 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
5934 else
5935 {
5936 if (!GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5937 {
5938 gimple *prev_stmt =
5939 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
5940 gimple *rel_stmt =
5941 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt));
5942 while (rel_stmt)
5943 {
5944 prev_stmt = rel_stmt;
5945 rel_stmt =
5946 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
5947 }
5948
5949 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) =
5950 new_stmt;
5951 }
5952 }
5953
5954 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
5955 gap_count = 1;
5956 /* If NEXT_STMT accesses the same DR as the previous statement,
5957 put the same TMP_DATA_REF as its vectorized statement; otherwise
5958 get the next data-ref from RESULT_CHAIN. */
5959 if (!next_stmt || !GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5960 break;
5961 }
5962 }
5963 }
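
/* A standalone sketch of the GROUP_GAP-driven skipping above, ignoring
   the GROUP_SAME_DR_STMT case: GROUP_GAP[S] is the distance in elements
   from the previous access, so GROUP_GAP[S] - 1 entries of the result
   chain are skipped before statement S is given its vectorized
   statement.  The array-based interface is an assumption of the model
   only.  */

static void
model_assign_grouped_loads (const unsigned *group_gap, unsigned nstmts,
			    unsigned nvects, int *vect_for_stmt)
{
  unsigned s = 0, gap_count = 1;

  for (unsigned v = 0; v < nvects && s < nstmts; v++)
    {
      /* Entries that correspond to gaps are skipped; the loads created
	 for them are removed later as dead code.  */
      if (s != 0 && gap_count < group_gap[s])
	{
	  gap_count++;
	  continue;
	}
      vect_for_stmt[s++] = v;
      gap_count = 1;
    }
}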
5964
5965 /* Function vect_can_force_dr_alignment_p.
5966
5967 Returns whether the alignment of a DECL can be forced to be aligned
5968 on an ALIGNMENT-bit boundary. */
5969
5970 bool
5971 vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
5972 {
5973 if (TREE_CODE (decl) != VAR_DECL)
5974 return false;
5975
5976 if (decl_in_symtab_p (decl)
5977 && !symtab_node::get (decl)->can_increase_alignment_p ())
5978 return false;
5979
5980 if (TREE_STATIC (decl))
5981 return (alignment <= MAX_OFILE_ALIGNMENT);
5982 else
5983 return (alignment <= MAX_STACK_ALIGNMENT);
5984 }
5985
5986
5987 /* Return whether the data reference DR is supported with respect to its
5988 alignment.
5989 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
5990 if it is aligned, i.e., check if it is possible to vectorize it with a
5991 different alignment. */
5992
5993 enum dr_alignment_support
5994 vect_supportable_dr_alignment (struct data_reference *dr,
5995 bool check_aligned_accesses)
5996 {
5997 gimple *stmt = DR_STMT (dr);
5998 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5999 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6000 machine_mode mode = TYPE_MODE (vectype);
6001 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6002 struct loop *vect_loop = NULL;
6003 bool nested_in_vect_loop = false;
6004
6005 if (aligned_access_p (dr) && !check_aligned_accesses)
6006 return dr_aligned;
6007
6008 /* For now assume all conditional loads/stores support unaligned
6009 access without any special code. */
6010 if (is_gimple_call (stmt)
6011 && gimple_call_internal_p (stmt)
6012 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
6013 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
6014 return dr_unaligned_supported;
6015
6016 if (loop_vinfo)
6017 {
6018 vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
6019 nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
6020 }
6021
6022 /* Possibly unaligned access. */
6023
6024 /* We can choose between using the implicit realignment scheme (generating
6025 a misaligned_move stmt) and the explicit realignment scheme (generating
6026 aligned loads with a REALIGN_LOAD). There are two variants to the
6027 explicit realignment scheme: optimized, and unoptimized.
6028 We can optimize the realignment only if the step between consecutive
6029 vector loads is equal to the vector size. Since the vector memory
6030 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
6031 is guaranteed that the misalignment amount remains the same throughout the
6032 execution of the vectorized loop. Therefore, we can create the
6033 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
6034 at the loop preheader.
6035
6036 However, in the case of outer-loop vectorization, when vectorizing a
6037 memory access in the inner-loop nested within the LOOP that is now being
6038 vectorized, while it is guaranteed that the misalignment of the
6039 vectorized memory access will remain the same in different outer-loop
6040 iterations, it is *not* guaranteed that it will remain the same throughout
6041 the execution of the inner-loop. This is because the inner-loop advances
6042 with the original scalar step (and not in steps of VS). If the inner-loop
6043 step happens to be a multiple of VS, then the misalignment remains fixed
6044 and we can use the optimized realignment scheme. For example:
6045
6046 for (i=0; i<N; i++)
6047 for (j=0; j<M; j++)
6048 s += a[i+j];
6049
6050 When vectorizing the i-loop in the above example, the step between
6051 consecutive vector loads is 1, and so the misalignment does not remain
6052 fixed across the execution of the inner-loop, and the realignment cannot
6053 be optimized (as illustrated in the following pseudo vectorized loop):
6054
6055 for (i=0; i<N; i+=4)
6056 for (j=0; j<M; j++){
6057 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
6058 // when j is {0,1,2,3,4,5,6,7,...} respectively.
6059 // (assuming that we start from an aligned address).
6060 }
6061
6062 We therefore have to use the unoptimized realignment scheme:
6063
6064 for (i=0; i<N; i+=4)
6065 for (j=k; j<M; j+=4)
6066 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
6067 // that the misalignment of the initial address is
6068 // 0).
6069
6070 The loop can then be vectorized as follows:
6071
6072 for (k=0; k<4; k++){
6073 rt = get_realignment_token (&vp[k]);
6074 for (i=0; i<N; i+=4){
6075 v1 = vp[i+k];
6076 for (j=k; j<M; j+=4){
6077 v2 = vp[i+j+VS-1];
6078 va = REALIGN_LOAD <v1,v2,rt>;
6079 vs += va;
6080 v1 = v2;
6081 }
6082 }
6083 } */
6084
6085 if (DR_IS_READ (dr))
6086 {
6087 bool is_packed = false;
6088 tree type = (TREE_TYPE (DR_REF (dr)));
6089
6090 if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
6091 && (!targetm.vectorize.builtin_mask_for_load
6092 || targetm.vectorize.builtin_mask_for_load ()))
6093 {
6094 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6095
6096 /* If we are doing SLP then the accesses need not have the
6097 same alignment; instead it depends on the SLP group size. */
6098 if (loop_vinfo
6099 && STMT_SLP_TYPE (stmt_info)
6100 && (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6101 * GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)))
6102 % TYPE_VECTOR_SUBPARTS (vectype) != 0))
6103 ;
6104 else if (!loop_vinfo
6105 || (nested_in_vect_loop
6106 && (TREE_INT_CST_LOW (DR_STEP (dr))
6107 != GET_MODE_SIZE (TYPE_MODE (vectype)))))
6108 return dr_explicit_realign;
6109 else
6110 return dr_explicit_realign_optimized;
6111 }
6112 if (!known_alignment_for_access_p (dr))
6113 is_packed = not_size_aligned (DR_REF (dr));
6114
6115 if ((TYPE_USER_ALIGN (type) && !is_packed)
6116 || targetm.vectorize.
6117 support_vector_misalignment (mode, type,
6118 DR_MISALIGNMENT (dr), is_packed))
6119 /* Can't software pipeline the loads, but can at least do them. */
6120 return dr_unaligned_supported;
6121 }
6122 else
6123 {
6124 bool is_packed = false;
6125 tree type = (TREE_TYPE (DR_REF (dr)));
6126
6127 if (!known_alignment_for_access_p (dr))
6128 is_packed = not_size_aligned (DR_REF (dr));
6129
6130 if ((TYPE_USER_ALIGN (type) && !is_packed)
6131 || targetm.vectorize.
6132 support_vector_misalignment (mode, type,
6133 DR_MISALIGNMENT (dr), is_packed))
6134 return dr_unaligned_supported;
6135 }
6136
6137 /* Unsupported. */
6138 return dr_unaligned_unsupported;
6139 }
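
/* A standalone scalar model of the REALIGN_LOAD used in the pseudo code
   above: given two consecutive aligned vectors V1 and V2 that together
   cover the misaligned access, and a realignment token reduced here to
   the element misalignment OFF (0 <= OFF < VS), the result is elements
   OFF .. OFF + VS - 1 of the concatenation V1 | V2.  VS == 4 and the
   integer element type are assumptions of the model only.  */

#define MODEL_VS 4

static void
model_realign_load (const int v1[MODEL_VS], const int v2[MODEL_VS],
		    unsigned off, int out[MODEL_VS])
{
  for (unsigned i = 0; i < MODEL_VS; i++)
    out[i] = off + i < MODEL_VS ? v1[off + i] : v2[off + i - MODEL_VS];
}

/* E.g. with v1 = {a0, a1, a2, a3}, v2 = {a4, a5, a6, a7} and off == 1
   the result is {a1, a2, a3, a4}: the misaligned vector starting one
   element past the aligned address, as in va = REALIGN_LOAD <v1,v2,rt>.  */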