gcc/tree-vect-data-refs.c

   1 /* Data References Analysis and Manipulation Utilities for Vectorization.
   2    Copyright (C) 2003-2016 Free Software Foundation, Inc.
   3    Contributed by Dorit Naishlos <dorit@il.ibm.com>
   4    and Ira Rosen <irar@il.ibm.com>
   5
   6 This file is part of GCC.
   7
   8 GCC is free software; you can redistribute it and/or modify it under
   9 the terms of the GNU General Public License as published by the Free
  10 Software Foundation; either version 3, or (at your option) any later
  11 version.
  12
  13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  16 for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with GCC; see the file COPYING3.  If not see
  20 <http://www.gnu.org/licenses/>.  */
  21
  22 #include "config.h"
  23 #include "system.h"
  24 #include "coretypes.h"
  25 #include "backend.h"
  26 #include "target.h"
  27 #include "rtl.h"
  28 #include "tree.h"
  29 #include "gimple.h"
  30 #include "predict.h"
  31 #include "tm_p.h"
  32 #include "ssa.h"
  33 #include "optabs-tree.h"
  34 #include "cgraph.h"
  35 #include "dumpfile.h"
  36 #include "alias.h"
  37 #include "fold-const.h"
  38 #include "stor-layout.h"
  39 #include "tree-eh.h"
  40 #include "gimplify.h"
  41 #include "gimple-iterator.h"
  42 #include "gimplify-me.h"
  43 #include "tree-ssa-loop-ivopts.h"
  44 #include "tree-ssa-loop-manip.h"
  45 #include "tree-ssa-loop.h"
  46 #include "cfgloop.h"
  47 #include "tree-scalar-evolution.h"
  48 #include "tree-vectorizer.h"
  49 #include "expr.h"
  50 #include "builtins.h"
  51 #include "params.h"
  52
  53 /* Return true if load- or store-lanes optab OPTAB is implemented for
  54    COUNT vectors of type VECTYPE.  NAME is the name of OPTAB.  */
  55
  56 static bool
  57 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
  58                               tree vectype, unsigned HOST_WIDE_INT count)
  59 {
  60   machine_mode mode, array_mode;
  61   bool limit_p;
  62
  63   mode = TYPE_MODE (vectype);
  64   limit_p = !targetm.array_mode_supported_p (mode, count);
  65   array_mode = mode_for_size (count * GET_MODE_BITSIZE (mode),
  66                               MODE_INT, limit_p);
  67
  68   if (array_mode == BLKmode)
  69     {
  70       if (dump_enabled_p ())
  71         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
  72                          "no array mode for %s[" HOST_WIDE_INT_PRINT_DEC "]\n",
  73                          GET_MODE_NAME (mode), count);
  74       return false;
  75     }
  76
  77   if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
  78     {
  79       if (dump_enabled_p ())
  80         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
  81                          "cannot use %s<%s><%s>\n", name,
  82                          GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
  83       return false;
  84     }
  85
  86   if (dump_enabled_p ())
  87     dump_printf_loc (MSG_NOTE, vect_location,
  88                      "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
  89                      GET_MODE_NAME (mode));
  90
  91   return true;
  92 }
  93
  94
  95 /* Return the smallest scalar part of STMT.
  96    This is used to determine the vectype of the stmt.  We generally set the
  97    vectype according to the type of the result (lhs).  For stmts whose
  98    result-type is different than the type of the arguments (e.g., demotion,
  99    promotion), vectype will be reset appropriately (later).  Note that we have
 100    to visit the smallest datatype in this function, because that determines the
 101    VF.  If the smallest datatype in the loop is present only as the rhs of a
 102    promotion operation - we'd miss it.
 103    Such a case, where a variable of this datatype does not appear in the lhs
 104    anywhere in the loop, can only occur if it's an invariant: e.g.:
 105    'int_x = (int) short_inv', which we'd expect to have been optimized away by
 106    invariant motion.  However, we cannot rely on invariant motion to always
 107    take invariants out of the loop, and so in the case of promotion we also
 108    have to check the rhs.
 109    LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
 110    types.  */
 111
 112 tree
 113 vect_get_smallest_scalar_type (gimple *stmt, HOST_WIDE_INT *lhs_size_unit,
 114                                HOST_WIDE_INT *rhs_size_unit)
 115 {
 116   tree scalar_type = gimple_expr_type (stmt);
 117   HOST_WIDE_INT lhs, rhs;
 118
 119   lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
 120
 121   if (is_gimple_assign (stmt)
 122       && (gimple_assign_cast_p (stmt)
 123           || gimple_assign_rhs_code (stmt) == WIDEN_MULT_EXPR
 124           || gimple_assign_rhs_code (stmt) == WIDEN_LSHIFT_EXPR
 125           || gimple_assign_rhs_code (stmt) == FLOAT_EXPR))
 126     {
 127       tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
 128
 129       rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
 130       if (rhs < lhs)
 131         scalar_type = rhs_type;
 132     }
 133
 134   *lhs_size_unit = lhs;
 135   *rhs_size_unit = rhs;
 136   return scalar_type;
 137 }
 138
 139
 140 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
 141    tested at run-time.  Return TRUE if DDR was successfully inserted.
 142    Return false if versioning is not supported.  */
 143
 144 static bool
 145 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
 146 {
 147   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 148
 149   if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
 150     return false;
 151
 152   if (dump_enabled_p ())
 153     {
 154       dump_printf_loc (MSG_NOTE, vect_location,
 155                        "mark for run-time aliasing test between ");
 156       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr)));
 157       dump_printf (MSG_NOTE,  " and ");
 158       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr)));
 159       dump_printf (MSG_NOTE, "\n");
 160     }
 161
 162   if (optimize_loop_nest_for_size_p (loop))
 163     {
 164       if (dump_enabled_p ())
 165         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 166                          "versioning not supported when optimizing"
 167                          " for size.\n");
 168       return false;
 169     }
 170
 171   /* FORNOW: We don't support versioning with outer-loop vectorization.  */
 172   if (loop->inner)
 173     {
 174       if (dump_enabled_p ())
 175         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 176                          "versioning not yet supported for outer-loops.\n");
 177       return false;
 178     }
 179
 180   /* FORNOW: We don't support creating runtime alias tests for non-constant
 181      step.  */
 182   if (TREE_CODE (DR_STEP (DDR_A (ddr))) != INTEGER_CST
 183       || TREE_CODE (DR_STEP (DDR_B (ddr))) != INTEGER_CST)
 184     {
 185       if (dump_enabled_p ())
 186         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 187                          "versioning not yet supported for non-constant "
 188                          "step\n");
 189       return false;
 190     }
 191
 192   LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
 193   return true;
 194 }
 195
 196
 197 /* Function vect_analyze_data_ref_dependence.
 198
 199    Return TRUE if there (might) exist a dependence between a memory-reference
 200    DRA and a memory-reference DRB.  When versioning for alias may check a
 201    dependence at run-time, return FALSE.  Adjust *MAX_VF according to
 202    the data dependence.  */
 203
 204 static bool
 205 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
 206                                   loop_vec_info loop_vinfo, int *max_vf)
 207 {
 208   unsigned int i;
 209   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 210   struct data_reference *dra = DDR_A (ddr);
 211   struct data_reference *drb = DDR_B (ddr);
 212   stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
 213   stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
 214   lambda_vector dist_v;
 215   unsigned int loop_depth;
 216
 217   /* In loop analysis all data references should be vectorizable.  */
 218   if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
 219       || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
 220     gcc_unreachable ();
 221
 222   /* Independent data accesses.  */
 223   if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
 224     return false;
 225
 226   if (dra == drb
 227       || (DR_IS_READ (dra) && DR_IS_READ (drb)))
 228     return false;
 229
 230   /* Even if we have an anti-dependence then, as the vectorized loop covers at
 231      least two scalar iterations, there is always also a true dependence.
 232      As the vectorizer does not re-order loads and stores we can ignore
 233      the anti-dependence if TBAA can disambiguate both DRs similar to the
 234      case with known negative distance anti-dependences (positive
 235      distance anti-dependences would violate TBAA constraints).  */
 236   if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
 237        || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
 238       && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
 239                                  get_alias_set (DR_REF (drb))))
 240     return false;
 241
 242   /* Unknown data dependence.  */
 243   if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
 244     {
 245       /* If user asserted safelen consecutive iterations can be
 246          executed concurrently, assume independence.  */
 247       if (loop->safelen >= 2)
 248         {
 249           if (loop->safelen < *max_vf)
 250             *max_vf = loop->safelen;
 251           LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
 252           return false;
 253         }
 254
 255       if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
 256           || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
 257         {
 258           if (dump_enabled_p ())
 259             {
 260               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 261                                "versioning for alias not supported for: "
 262                                "can't determine dependence between ");
 263               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 264                                  DR_REF (dra));
 265               dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
 266               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 267                                  DR_REF (drb));
 268               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 269             }
 270           return true;
 271         }
 272
 273       if (dump_enabled_p ())
 274         {
 275           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 276                            "versioning for alias required: "
 277                            "can't determine dependence between ");
 278           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 279                              DR_REF (dra));
 280           dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
 281           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 282                              DR_REF (drb));
 283           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 284         }
 285
 286       /* Add to list of ddrs that need to be tested at run-time.  */
 287       return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
 288     }
 289
 290   /* Known data dependence.  */
 291   if (DDR_NUM_DIST_VECTS (ddr) == 0)
 292     {
 293       /* If user asserted safelen consecutive iterations can be
 294          executed concurrently, assume independence.  */
 295       if (loop->safelen >= 2)
 296         {
 297           if (loop->safelen < *max_vf)
 298             *max_vf = loop->safelen;
 299           LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
 300           return false;
 301         }
 302
 303       if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
 304           || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
 305         {
 306           if (dump_enabled_p ())
 307             {
 308               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 309                                "versioning for alias not supported for: "
 310                                "bad dist vector for ");
 311               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 312                                  DR_REF (dra));
 313               dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
 314               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 315                                  DR_REF (drb));
 316               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 317             }
 318           return true;
 319         }
 320
 321       if (dump_enabled_p ())
 322         {
 323           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 324                            "versioning for alias required: "
 325                            "bad dist vector for ");
 326           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
 327           dump_printf (MSG_MISSED_OPTIMIZATION,  " and ");
 328           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
 329           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 330         }
 331       /* Add to list of ddrs that need to be tested at run-time.  */
 332       return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
 333     }
 334
 335   loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
 336   FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
 337     {
 338       int dist = dist_v[loop_depth];
 339
 340       if (dump_enabled_p ())
 341         dump_printf_loc (MSG_NOTE, vect_location,
 342                          "dependence distance  = %d.\n", dist);
 343
 344       if (dist == 0)
 345         {
 346           if (dump_enabled_p ())
 347             {
 348               dump_printf_loc (MSG_NOTE, vect_location,
 349                                "dependence distance == 0 between ");
 350               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
 351               dump_printf (MSG_NOTE, " and ");
 352               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
 353               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 354             }
 355
 356           /* When we perform grouped accesses and perform implicit CSE
 357              by detecting equal accesses and doing disambiguation with
 358              runtime alias tests like for
 359                 .. = a[i];
 360                 .. = a[i+1];
 361                 a[i] = ..;
 362                 a[i+1] = ..;
 363                 *p = ..;
 364                 .. = a[i];
 365                 .. = a[i+1];
 366              where we will end up loading { a[i], a[i+1] } once, make
 367              sure that inserting group loads before the first load and
 368              stores after the last store will do the right thing.
 369              Similar for groups like
 370                 a[i] = ...;
 371                 ... = a[i];
 372                 a[i+1] = ...;
 373              where loads from the group interleave with the store.  */
 374           if (STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
 375               || STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
 376             {
 377               gimple *earlier_stmt;
 378               earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
 379               if (DR_IS_WRITE
 380                     (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
 381                 {
 382                   if (dump_enabled_p ())
 383                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 384                                      "READ_WRITE dependence in interleaving."
 385                                      "\n");
 386                   return true;
 387                 }
 388             }
 389
 390           continue;
 391         }
 392
 393       if (dist > 0 && DDR_REVERSED_P (ddr))
 394         {
 395           /* If DDR_REVERSED_P the order of the data-refs in DDR was
 396              reversed (to make distance vector positive), and the actual
 397              distance is negative.  */
 398           if (dump_enabled_p ())
 399             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 400                              "dependence distance negative.\n");
 401           /* Record a negative dependence distance to later limit the
 402              amount of stmt copying / unrolling we can perform.
 403              Only need to handle read-after-write dependence.  */
 404           if (DR_IS_READ (drb)
 405               && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
 406                   || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
 407             STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
 408           continue;
 409         }
 410
 411       if (abs (dist) >= 2
 412           && abs (dist) < *max_vf)
 413         {
 414           /* The dependence distance requires reduction of the maximal
 415              vectorization factor.  */
 416           *max_vf = abs (dist);
 417           if (dump_enabled_p ())
 418             dump_printf_loc (MSG_NOTE, vect_location,
 419                              "adjusting maximal vectorization factor to %i\n",
 420                              *max_vf);
 421         }
 422
 423       if (abs (dist) >= *max_vf)
 424         {
 425           /* Dependence distance does not create dependence, as far as
 426              vectorization is concerned, in this case.  */
 427           if (dump_enabled_p ())
 428             dump_printf_loc (MSG_NOTE, vect_location,
 429                              "dependence distance >= VF.\n");
 430           continue;
 431         }
 432
 433       if (dump_enabled_p ())
 434         {
 435           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 436                        "not vectorized, possible dependence "
 437                        "between data-refs ");
 438           dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
 439           dump_printf (MSG_NOTE,  " and ");
 440           dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
 441           dump_printf (MSG_NOTE,  "\n");
 442         }
 443
 444       return true;
 445     }
 446
 447   return false;
 448 }
 449
 450 /* Function vect_analyze_data_ref_dependences.
 451
 452    Examine all the data references in the loop, and make sure there do not
 453    exist any data dependences between them.  Set *MAX_VF according to
 454    the maximum vectorization factor the data dependences allow.  */
 455
 456 bool
 457 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo, int *max_vf)
 458 {
 459   unsigned int i;
 460   struct data_dependence_relation *ddr;
 461
 462   if (dump_enabled_p ())
 463     dump_printf_loc (MSG_NOTE, vect_location,
 464                      "=== vect_analyze_data_ref_dependences ===\n");
 465
 466   LOOP_VINFO_DDRS (loop_vinfo)
 467     .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
 468              * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
 469   LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
 470   if (!compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
 471                                 &LOOP_VINFO_DDRS (loop_vinfo),
 472                                 LOOP_VINFO_LOOP_NEST (loop_vinfo), true))
 473     return false;
 474
 475   FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
 476     if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf))
 477       return false;
 478
 479   return true;
 480 }
 481
 482
 483 /* Function vect_slp_analyze_data_ref_dependence.
 484
 485    Return TRUE if there (might) exist a dependence between a memory-reference
 486    DRA and a memory-reference DRB.  When versioning for alias may check a
 487    dependence at run-time, return FALSE.  Adjust *MAX_VF according to
 488    the data dependence.  */
 489
 490 static bool
 491 vect_slp_analyze_data_ref_dependence (struct data_dependence_relation *ddr)
 492 {
 493   struct data_reference *dra = DDR_A (ddr);
 494   struct data_reference *drb = DDR_B (ddr);
 495
 496   /* We need to check dependences of statements marked as unvectorizable
 497      as well, they still can prohibit vectorization.  */
 498
 499   /* Independent data accesses.  */
 500   if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
 501     return false;
 502
 503   if (dra == drb)
 504     return false;
 505
 506   /* Read-read is OK.  */
 507   if (DR_IS_READ (dra) && DR_IS_READ (drb))
 508     return false;
 509
 510   /* If dra and drb are part of the same interleaving chain consider
 511      them independent.  */
 512   if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (DR_STMT (dra)))
 513       && (GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dra)))
 514           == GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (drb)))))
 515     return false;
 516
 517   /* Unknown data dependence.  */
 518   if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
 519     {
 520       if  (dump_enabled_p ())
 521         {
 522           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 523                            "can't determine dependence between ");
 524           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
 525           dump_printf (MSG_MISSED_OPTIMIZATION,  " and ");
 526           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
 527           dump_printf (MSG_MISSED_OPTIMIZATION,  "\n");
 528         }
 529     }
 530   else if (dump_enabled_p ())
 531     {
 532       dump_printf_loc (MSG_NOTE, vect_location,
 533                        "determined dependence between ");
 534       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
 535       dump_printf (MSG_NOTE, " and ");
 536       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
 537       dump_printf (MSG_NOTE,  "\n");
 538     }
 539
 540   return true;
 541 }
 542
 543
 544 /* Analyze dependences involved in the transform of SLP NODE.  STORES
 545    contain the vector of scalar stores of this instance if we are
 546    disambiguating the loads.  */
 547
 548 static bool
 549 vect_slp_analyze_node_dependences (slp_instance instance, slp_tree node,
 550                                    vec<gimple *> stores, gimple *last_store)
 551 {
 552   /* This walks over all stmts involved in the SLP load/store done
 553      in NODE verifying we can sink them up to the last stmt in the
 554      group.  */
 555   gimple *last_access = vect_find_last_scalar_stmt_in_slp (node);
 556   for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
 557     {
 558       gimple *access = SLP_TREE_SCALAR_STMTS (node)[k];
 559       if (access == last_access)
 560         continue;
 561       data_reference *dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (access));
 562       for (gimple_stmt_iterator gsi = gsi_for_stmt (access);
 563            gsi_stmt (gsi) != last_access; gsi_next (&gsi))
 564         {
 565           gimple *stmt = gsi_stmt (gsi);
 566           if (! gimple_vuse (stmt)
 567               || (DR_IS_READ (dr_a) && ! gimple_vdef (stmt)))
 568             continue;
 569
 570           /* If we couldn't record a (single) data reference for this
 571              stmt we have to give up.  */
 572           /* ???  Here and below if dependence analysis fails we can resort
 573              to the alias oracle which can handle more kinds of stmts.  */
 574           data_reference *dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
 575           if (!dr_b)
 576             return false;
 577
 578           /* If we run into a store of this same instance (we've just
 579              marked those) then delay dependence checking until we run
 580              into the last store because this is where it will have
 581              been sunk to (and we verify if we can do that as well).  */
 582           if (gimple_visited_p (stmt))
 583             {
 584               if (stmt != last_store)
 585                 continue;
 586               unsigned i;
 587               gimple *store;
 588               FOR_EACH_VEC_ELT (stores, i, store)
 589                 {
 590                   data_reference *store_dr
 591                     = STMT_VINFO_DATA_REF (vinfo_for_stmt (store));
 592                   ddr_p ddr = initialize_data_dependence_relation
 593                                 (dr_a, store_dr, vNULL);
 594                   if (vect_slp_analyze_data_ref_dependence (ddr))
 595                     {
 596                       free_dependence_relation (ddr);
 597                       return false;
 598                     }
 599                   free_dependence_relation (ddr);
 600                 }
 601             }
 602
 603           ddr_p ddr = initialize_data_dependence_relation (dr_a, dr_b, vNULL);
 604           if (vect_slp_analyze_data_ref_dependence (ddr))
 605             {
 606               free_dependence_relation (ddr);
 607               return false;
 608             }
 609           free_dependence_relation (ddr);
 610         }
 611     }
 612   return true;
 613 }
 614
 615
 616 /* Function vect_analyze_data_ref_dependences.
 617
 618    Examine all the data references in the basic-block, and make sure there
 619    do not exist any data dependences between them.  Set *MAX_VF according to
 620    the maximum vectorization factor the data dependences allow.  */
 621
 622 bool
 623 vect_slp_analyze_instance_dependence (slp_instance instance)
 624 {
 625   if (dump_enabled_p ())
 626     dump_printf_loc (MSG_NOTE, vect_location,
 627                      "=== vect_slp_analyze_instance_dependence ===\n");
 628
 629   /* The stores of this instance are at the root of the SLP tree.  */
 630   slp_tree store = SLP_INSTANCE_TREE (instance);
 631   if (! STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (store)[0])))
 632     store = NULL;
 633
 634   /* Verify we can sink stores to the vectorized stmt insert location.  */
 635   gimple *last_store = NULL;
 636   if (store)
 637     {
 638       if (! vect_slp_analyze_node_dependences (instance, store, vNULL, NULL))
 639         return false;
 640
 641       /* Mark stores in this instance and remember the last one.  */
 642       last_store = vect_find_last_scalar_stmt_in_slp (store);
 643       for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
 644         gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], true);
 645     }
 646
 647   bool res = true;
 648
 649   /* Verify we can sink loads to the vectorized stmt insert location,
 650      special-casing stores of this instance.  */
 651   slp_tree load;
 652   unsigned int i;
 653   FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load)
 654     if (! vect_slp_analyze_node_dependences (instance, load,
 655                                              store
 656                                              ? SLP_TREE_SCALAR_STMTS (store)
 657                                              : vNULL, last_store))
 658       {
 659         res = false;
 660         break;
 661       }
 662
 663   /* Unset the visited flag.  */
 664   if (store)
 665     for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
 666       gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], false);
 667
 668   return res;
 669 }
 670
 671 /* Function vect_compute_data_ref_alignment
 672
 673    Compute the misalignment of the data reference DR.
 674
 675    Output:
 676    1. If during the misalignment computation it is found that the data reference
 677       cannot be vectorized then false is returned.
 678    2. DR_MISALIGNMENT (DR) is defined.
 679
 680    FOR NOW: No analysis is actually performed. Misalignment is calculated
 681    only for trivial cases. TODO.  */
 682
 683 bool
 684 vect_compute_data_ref_alignment (struct data_reference *dr)
 685 {
 686   gimple *stmt = DR_STMT (dr);
 687   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
 688   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
 689   struct loop *loop = NULL;
 690   tree ref = DR_REF (dr);
 691   tree vectype;
 692   tree base, base_addr;
 693   tree misalign = NULL_TREE;
 694   tree aligned_to;
 695   unsigned HOST_WIDE_INT alignment;
 696
 697   if (dump_enabled_p ())
 698     dump_printf_loc (MSG_NOTE, vect_location,
 699                      "vect_compute_data_ref_alignment:\n");
 700
 701   if (loop_vinfo)
 702     loop = LOOP_VINFO_LOOP (loop_vinfo);
 703
 704   /* Initialize misalignment to unknown.  */
 705   SET_DR_MISALIGNMENT (dr, -1);
 706
 707   if (tree_fits_shwi_p (DR_STEP (dr)))
 708     misalign = DR_INIT (dr);
 709   aligned_to = DR_ALIGNED_TO (dr);
 710   base_addr = DR_BASE_ADDRESS (dr);
 711   vectype = STMT_VINFO_VECTYPE (stmt_info);
 712
 713   /* In case the dataref is in an inner-loop of the loop that is being
 714      vectorized (LOOP), we use the base and misalignment information
 715      relative to the outer-loop (LOOP).  This is ok only if the misalignment
 716      stays the same throughout the execution of the inner-loop, which is why
 717      we have to check that the stride of the dataref in the inner-loop evenly
 718      divides by the vector size.  */
 719   if (loop && nested_in_vect_loop_p (loop, stmt))
 720     {
 721       tree step = DR_STEP (dr);
 722
 723       if (tree_fits_shwi_p (step)
 724           && tree_to_shwi (step) % GET_MODE_SIZE (TYPE_MODE (vectype)) == 0)
 725         {
 726           if (dump_enabled_p ())
 727             dump_printf_loc (MSG_NOTE, vect_location,
 728                              "inner step divides the vector-size.\n");
 729           misalign = STMT_VINFO_DR_INIT (stmt_info);
 730           aligned_to = STMT_VINFO_DR_ALIGNED_TO (stmt_info);
 731           base_addr = STMT_VINFO_DR_BASE_ADDRESS (stmt_info);
 732         }
 733       else
 734         {
 735           if (dump_enabled_p ())
 736             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 737                              "inner step doesn't divide the vector-size.\n");
 738           misalign = NULL_TREE;
 739         }
 740     }
 741
 742   /* Similarly we can only use base and misalignment information relative to
 743      an innermost loop if the misalignment stays the same throughout the
 744      execution of the loop.  As above, this is the case if the stride of
 745      the dataref evenly divides by the vector size.  */
 746   else
 747     {
 748       tree step = DR_STEP (dr);
 749       unsigned vf = loop ? LOOP_VINFO_VECT_FACTOR (loop_vinfo) : 1;
 750
 751       if (tree_fits_shwi_p (step)
 752           && ((tree_to_shwi (step) * vf)
 753               % GET_MODE_SIZE (TYPE_MODE (vectype)) != 0))
 754         {
 755           if (dump_enabled_p ())
 756             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 757                              "step doesn't divide the vector-size.\n");
 758           misalign = NULL_TREE;
 759         }
 760     }
 761
 762   /* To look at alignment of the base we have to preserve an inner MEM_REF
 763      as that carries alignment information of the actual access.  */
 764   base = ref;
 765   while (handled_component_p (base))
 766     base = TREE_OPERAND (base, 0);
 767   if (TREE_CODE (base) == MEM_REF)
 768     base = build2 (MEM_REF, TREE_TYPE (base), base_addr,
 769                    build_int_cst (TREE_TYPE (TREE_OPERAND (base, 1)), 0));
 770   unsigned int base_alignment = get_object_alignment (base);
 771
 772   if (base_alignment >= TYPE_ALIGN (TREE_TYPE (vectype)))
 773     DR_VECT_AUX (dr)->base_element_aligned = true;
 774
 775   alignment = TYPE_ALIGN_UNIT (vectype);
 776
 777   if ((compare_tree_int (aligned_to, alignment) < 0)
 778       || !misalign)
 779     {
 780       if (dump_enabled_p ())
 781         {
 782           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 783                            "Unknown alignment for access: ");
 784           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
 785           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 786         }
 787       return true;
 788     }
 789
 790   if (base_alignment < TYPE_ALIGN (vectype))
 791     {
 792       /* Strip an inner MEM_REF to a bare decl if possible.  */
 793       if (TREE_CODE (base) == MEM_REF
 794           && integer_zerop (TREE_OPERAND (base, 1))
 795           && TREE_CODE (TREE_OPERAND (base, 0)) == ADDR_EXPR)
 796         base = TREE_OPERAND (TREE_OPERAND (base, 0), 0);
 797
 798       if (!vect_can_force_dr_alignment_p (base, TYPE_ALIGN (vectype)))
 799         {
 800           if (dump_enabled_p ())
 801             {
 802               dump_printf_loc (MSG_NOTE, vect_location,
 803                                "can't force alignment of ref: ");
 804               dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
 805               dump_printf (MSG_NOTE, "\n");
 806             }
 807           return true;
 808         }
 809
 810       /* Force the alignment of the decl.
 811          NOTE: This is the only change to the code we make during
 812          the analysis phase, before deciding to vectorize the loop.  */
 813       if (dump_enabled_p ())
 814         {
 815           dump_printf_loc (MSG_NOTE, vect_location, "force alignment of ");
 816           dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
 817           dump_printf (MSG_NOTE, "\n");
 818         }
 819
 820       DR_VECT_AUX (dr)->base_decl = base;
 821       DR_VECT_AUX (dr)->base_misaligned = true;
 822       DR_VECT_AUX (dr)->base_element_aligned = true;
 823     }
 824
 825   /* If this is a backward running DR then first access in the larger
 826      vectype actually is N-1 elements before the address in the DR.
 827      Adjust misalign accordingly.  */
 828   if (tree_int_cst_sgn (DR_STEP (dr)) < 0)
 829     {
 830       tree offset = ssize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1);
 831       /* DR_STEP(dr) is the same as -TYPE_SIZE of the scalar type,
 832          otherwise we wouldn't be here.  */
 833       offset = fold_build2 (MULT_EXPR, ssizetype, offset, DR_STEP (dr));
 834       /* PLUS because DR_STEP was negative.  */
 835       misalign = size_binop (PLUS_EXPR, misalign, offset);
 836     }
 837
 838   SET_DR_MISALIGNMENT (dr,
 839                        wi::mod_floor (misalign, alignment, SIGNED).to_uhwi ());
 840
 841   if (dump_enabled_p ())
 842     {
 843       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 844                        "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr));
 845       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
 846       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 847     }
 848
 849   return true;
 850 }
 851
 852
 853 /* Function vect_update_misalignment_for_peel
 854
 855    DR - the data reference whose misalignment is to be adjusted.
 856    DR_PEEL - the data reference whose misalignment is being made
 857              zero in the vector loop by the peel.
 858    NPEEL - the number of iterations in the peel loop if the misalignment
 859            of DR_PEEL is known at compile time.  */
 860
 861 static void
 862 vect_update_misalignment_for_peel (struct data_reference *dr,
 863                                    struct data_reference *dr_peel, int npeel)
 864 {
 865   unsigned int i;
 866   vec<dr_p> same_align_drs;
 867   struct data_reference *current_dr;
 868   int dr_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
 869   int dr_peel_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr_peel))));
 870   stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
 871   stmt_vec_info peel_stmt_info = vinfo_for_stmt (DR_STMT (dr_peel));
 872
 873  /* For interleaved data accesses the step in the loop must be multiplied by
 874      the size of the interleaving group.  */
 875   if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
 876     dr_size *= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)));
 877   if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info))
 878     dr_peel_size *= GROUP_SIZE (peel_stmt_info);
 879
 880   /* It can be assumed that the data refs with the same alignment as dr_peel
 881      are aligned in the vector loop.  */
 882   same_align_drs
 883     = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (DR_STMT (dr_peel)));
 884   FOR_EACH_VEC_ELT (same_align_drs, i, current_dr)
 885     {
 886       if (current_dr != dr)
 887         continue;
 888       gcc_assert (DR_MISALIGNMENT (dr) / dr_size ==
 889                   DR_MISALIGNMENT (dr_peel) / dr_peel_size);
 890       SET_DR_MISALIGNMENT (dr, 0);
 891       return;
 892     }
 893
 894   if (known_alignment_for_access_p (dr)
 895       && known_alignment_for_access_p (dr_peel))
 896     {
 897       bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
 898       int misal = DR_MISALIGNMENT (dr);
 899       tree vectype = STMT_VINFO_VECTYPE (stmt_info);
 900       misal += negative ? -npeel * dr_size : npeel * dr_size;
 901       misal &= (TYPE_ALIGN (vectype) / BITS_PER_UNIT) - 1;
 902       SET_DR_MISALIGNMENT (dr, misal);
 903       return;
 904     }
 905
 906   if (dump_enabled_p ())
 907     dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment to -1.\n");
 908   SET_DR_MISALIGNMENT (dr, -1);
 909 }
 910
 911
 912 /* Function verify_data_ref_alignment
 913
 914    Return TRUE if DR can be handled with respect to alignment.  */
 915
 916 static bool
 917 verify_data_ref_alignment (data_reference_p dr)
 918 {
 919   enum dr_alignment_support supportable_dr_alignment
 920     = vect_supportable_dr_alignment (dr, false);
 921   if (!supportable_dr_alignment)
 922     {
 923       if (dump_enabled_p ())
 924         {
 925           if (DR_IS_READ (dr))
 926             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 927                              "not vectorized: unsupported unaligned load.");
 928           else
 929             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 930                              "not vectorized: unsupported unaligned "
 931                              "store.");
 932
 933           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 934                              DR_REF (dr));
 935           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 936         }
 937       return false;
 938     }
 939
 940   if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
 941     dump_printf_loc (MSG_NOTE, vect_location,
 942                      "Vectorizing an unaligned access.\n");
 943
 944   return true;
 945 }
 946
 947 /* Function vect_verify_datarefs_alignment
 948
 949    Return TRUE if all data references in the loop can be
 950    handled with respect to alignment.  */
 951
 952 bool
 953 vect_verify_datarefs_alignment (loop_vec_info vinfo)
 954 {
 955   vec<data_reference_p> datarefs = vinfo->datarefs;
 956   struct data_reference *dr;
 957   unsigned int i;
 958
 959   FOR_EACH_VEC_ELT (datarefs, i, dr)
 960     {
 961       gimple *stmt = DR_STMT (dr);
 962       stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
 963
 964       if (!STMT_VINFO_RELEVANT_P (stmt_info))
 965         continue;
 966
 967       /* For interleaving, only the alignment of the first access matters.   */
 968       if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
 969           && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
 970         continue;
 971
 972       /* Strided accesses perform only component accesses, alignment is
 973          irrelevant for them.  */
 974       if (STMT_VINFO_STRIDED_P (stmt_info)
 975           && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
 976         continue;
 977
 978       if (! verify_data_ref_alignment (dr))
 979         return false;
 980     }
 981
 982   return true;
 983 }
 984
 985 /* Given an memory reference EXP return whether its alignment is less
 986    than its size.  */
 987
 988 static bool
 989 not_size_aligned (tree exp)
 990 {
 991   if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
 992     return true;
 993
 994   return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
 995           > get_object_alignment (exp));
 996 }
 997
 998 /* Function vector_alignment_reachable_p
 999
1000    Return true if vector alignment for DR is reachable by peeling
1001    a few loop iterations.  Return false otherwise.  */
1002
1003 static bool
1004 vector_alignment_reachable_p (struct data_reference *dr)
1005 {
1006   gimple *stmt = DR_STMT (dr);
1007   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1008   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1009
1010   if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1011     {
1012       /* For interleaved access we peel only if number of iterations in
1013          the prolog loop ({VF - misalignment}), is a multiple of the
1014          number of the interleaved accesses.  */
1015       int elem_size, mis_in_elements;
1016       int nelements = TYPE_VECTOR_SUBPARTS (vectype);
1017
1018       /* FORNOW: handle only known alignment.  */
1019       if (!known_alignment_for_access_p (dr))
1020         return false;
1021
1022       elem_size = GET_MODE_SIZE (TYPE_MODE (vectype)) / nelements;
1023       mis_in_elements = DR_MISALIGNMENT (dr) / elem_size;
1024
1025       if ((nelements - mis_in_elements) % GROUP_SIZE (stmt_info))
1026         return false;
1027     }
1028
1029   /* If misalignment is known at the compile time then allow peeling
1030      only if natural alignment is reachable through peeling.  */
1031   if (known_alignment_for_access_p (dr) && !aligned_access_p (dr))
1032     {
1033       HOST_WIDE_INT elmsize =
1034                 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1035       if (dump_enabled_p ())
1036         {
1037           dump_printf_loc (MSG_NOTE, vect_location,
1038                            "data size =" HOST_WIDE_INT_PRINT_DEC, elmsize);
1039           dump_printf (MSG_NOTE,
1040                        ". misalignment = %d.\n", DR_MISALIGNMENT (dr));
1041         }
1042       if (DR_MISALIGNMENT (dr) % elmsize)
1043         {
1044           if (dump_enabled_p ())
1045             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1046                              "data size does not divide the misalignment.\n");
1047           return false;
1048         }
1049     }
1050
1051   if (!known_alignment_for_access_p (dr))
1052     {
1053       tree type = TREE_TYPE (DR_REF (dr));
1054       bool is_packed = not_size_aligned (DR_REF (dr));
1055       if (dump_enabled_p ())
1056         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1057                          "Unknown misalignment, is_packed = %d\n",is_packed);
1058       if ((TYPE_USER_ALIGN (type) && !is_packed)
1059           || targetm.vectorize.vector_alignment_reachable (type, is_packed))
1060         return true;
1061       else
1062         return false;
1063     }
1064
1065   return true;
1066 }
1067
1068
1069 /* Calculate the cost of the memory access represented by DR.  */
1070
1071 static void
1072 vect_get_data_access_cost (struct data_reference *dr,
1073                            unsigned int *inside_cost,
1074                            unsigned int *outside_cost,
1075                            stmt_vector_for_cost *body_cost_vec)
1076 {
1077   gimple *stmt = DR_STMT (dr);
1078   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1079   int nunits = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
1080   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1081   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1082   int ncopies = vf / nunits;
1083
1084   if (DR_IS_READ (dr))
1085     vect_get_load_cost (dr, ncopies, true, inside_cost, outside_cost,
1086                         NULL, body_cost_vec, false);
1087   else
1088     vect_get_store_cost (dr, ncopies, inside_cost, body_cost_vec);
1089
1090   if (dump_enabled_p ())
1091     dump_printf_loc (MSG_NOTE, vect_location,
1092                      "vect_get_data_access_cost: inside_cost = %d, "
1093                      "outside_cost = %d.\n", *inside_cost, *outside_cost);
1094 }
1095
1096
1097 typedef struct _vect_peel_info
1098 {
1099   int npeel;
1100   struct data_reference *dr;
1101   unsigned int count;
1102 } *vect_peel_info;
1103
1104 typedef struct _vect_peel_extended_info
1105 {
1106   struct _vect_peel_info peel_info;
1107   unsigned int inside_cost;
1108   unsigned int outside_cost;
1109   stmt_vector_for_cost body_cost_vec;
1110 } *vect_peel_extended_info;
1111
1112
1113 /* Peeling hashtable helpers.  */
1114
1115 struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
1116 {
1117   static inline hashval_t hash (const _vect_peel_info *);
1118   static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
1119 };
1120
1121 inline hashval_t
1122 peel_info_hasher::hash (const _vect_peel_info *peel_info)
1123 {
1124   return (hashval_t) peel_info->npeel;
1125 }
1126
1127 inline bool
1128 peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
1129 {
1130   return (a->npeel == b->npeel);
1131 }
1132
1133
1134 /* Insert DR into peeling hash table with NPEEL as key.  */
1135
1136 static void
1137 vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
1138                           loop_vec_info loop_vinfo, struct data_reference *dr,
1139                           int npeel)
1140 {
1141   struct _vect_peel_info elem, *slot;
1142   _vect_peel_info **new_slot;
1143   bool supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1144
1145   elem.npeel = npeel;
1146   slot = peeling_htab->find (&elem);
1147   if (slot)
1148     slot->count++;
1149   else
1150     {
1151       slot = XNEW (struct _vect_peel_info);
1152       slot->npeel = npeel;
1153       slot->dr = dr;
1154       slot->count = 1;
1155       new_slot = peeling_htab->find_slot (slot, INSERT);
1156       *new_slot = slot;
1157     }
1158
1159   if (!supportable_dr_alignment
1160       && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1161     slot->count += VECT_MAX_COST;
1162 }
1163
1164
1165 /* Traverse peeling hash table to find peeling option that aligns maximum
1166    number of data accesses.  */
1167
1168 int
1169 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1170                                      _vect_peel_extended_info *max)
1171 {
1172   vect_peel_info elem = *slot;
1173
1174   if (elem->count > max->peel_info.count
1175       || (elem->count == max->peel_info.count
1176           && max->peel_info.npeel > elem->npeel))
1177     {
1178       max->peel_info.npeel = elem->npeel;
1179       max->peel_info.count = elem->count;
1180       max->peel_info.dr = elem->dr;
1181     }
1182
1183   return 1;
1184 }
1185
1186
1187 /* Traverse peeling hash table and calculate cost for each peeling option.
1188    Find the one with the lowest cost.  */
1189
1190 int
1191 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1192                                    _vect_peel_extended_info *min)
1193 {
1194   vect_peel_info elem = *slot;
1195   int save_misalignment, dummy;
1196   unsigned int inside_cost = 0, outside_cost = 0, i;
1197   gimple *stmt = DR_STMT (elem->dr);
1198   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1199   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1200   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1201   struct data_reference *dr;
1202   stmt_vector_for_cost prologue_cost_vec, body_cost_vec, epilogue_cost_vec;
1203
1204   prologue_cost_vec.create (2);
1205   body_cost_vec.create (2);
1206   epilogue_cost_vec.create (2);
1207
1208   FOR_EACH_VEC_ELT (datarefs, i, dr)
1209     {
1210       stmt = DR_STMT (dr);
1211       stmt_info = vinfo_for_stmt (stmt);
1212       /* For interleaving, only the alignment of the first access
1213          matters.  */
1214       if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1215           && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1216         continue;
1217
1218       /* Strided accesses perform only component accesses, alignment is
1219          irrelevant for them.  */
1220       if (STMT_VINFO_STRIDED_P (stmt_info)
1221           && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1222         continue;
1223
1224       save_misalignment = DR_MISALIGNMENT (dr);
1225       vect_update_misalignment_for_peel (dr, elem->dr, elem->npeel);
1226       vect_get_data_access_cost (dr, &inside_cost, &outside_cost,
1227                                  &body_cost_vec);
1228       SET_DR_MISALIGNMENT (dr, save_misalignment);
1229     }
1230
1231   outside_cost += vect_get_known_peeling_cost
1232     (loop_vinfo, elem->npeel, &dummy,
1233      &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1234      &prologue_cost_vec, &epilogue_cost_vec);
1235
1236   /* Prologue and epilogue costs are added to the target model later.
1237      These costs depend only on the scalar iteration cost, the
1238      number of peeling iterations finally chosen, and the number of
1239      misaligned statements.  So discard the information found here.  */
1240   prologue_cost_vec.release ();
1241   epilogue_cost_vec.release ();
1242
1243   if (inside_cost < min->inside_cost
1244       || (inside_cost == min->inside_cost && outside_cost < min->outside_cost))
1245     {
1246       min->inside_cost = inside_cost;
1247       min->outside_cost = outside_cost;
1248       min->body_cost_vec.release ();
1249       min->body_cost_vec = body_cost_vec;
1250       min->peel_info.dr = elem->dr;
1251       min->peel_info.npeel = elem->npeel;
1252     }
1253   else
1254     body_cost_vec.release ();
1255
1256   return 1;
1257 }
1258
1259
1260 /* Choose best peeling option by traversing peeling hash table and either
1261    choosing an option with the lowest cost (if cost model is enabled) or the
1262    option that aligns as many accesses as possible.  */
1263
1264 static struct data_reference *
1265 vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
1266                                        loop_vec_info loop_vinfo,
1267                                        unsigned int *npeel,
1268                                        stmt_vector_for_cost *body_cost_vec)
1269 {
1270    struct _vect_peel_extended_info res;
1271
1272    res.peel_info.dr = NULL;
1273    res.body_cost_vec = stmt_vector_for_cost ();
1274
1275    if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1276      {
1277        res.inside_cost = INT_MAX;
1278        res.outside_cost = INT_MAX;
1279        peeling_htab->traverse <_vect_peel_extended_info *,
1280                                vect_peeling_hash_get_lowest_cost> (&res);
1281      }
1282    else
1283      {
1284        res.peel_info.count = 0;
1285        peeling_htab->traverse <_vect_peel_extended_info *,
1286                                vect_peeling_hash_get_most_frequent> (&res);
1287      }
1288
1289    *npeel = res.peel_info.npeel;
1290    *body_cost_vec = res.body_cost_vec;
1291    return res.peel_info.dr;
1292 }
1293
1294
1295 /* Function vect_enhance_data_refs_alignment
1296
1297    This pass will use loop versioning and loop peeling in order to enhance
1298    the alignment of data references in the loop.
1299
1300    FOR NOW: we assume that whatever versioning/peeling takes place, only the
1301    original loop is to be vectorized.  Any other loops that are created by
1302    the transformations performed in this pass - are not supposed to be
1303    vectorized.  This restriction will be relaxed.
1304
1305    This pass will require a cost model to guide it whether to apply peeling
1306    or versioning or a combination of the two.  For example, the scheme that
1307    intel uses when given a loop with several memory accesses, is as follows:
1308    choose one memory access ('p') which alignment you want to force by doing
1309    peeling.  Then, either (1) generate a loop in which 'p' is aligned and all
1310    other accesses are not necessarily aligned, or (2) use loop versioning to
1311    generate one loop in which all accesses are aligned, and another loop in
1312    which only 'p' is necessarily aligned.
1313
1314    ("Automatic Intra-Register Vectorization for the Intel Architecture",
1315    Aart J.C. Bik, Milind Girkar, Paul M. Grey and Xinmin Tian, International
1316    Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1317
1318    Devising a cost model is the most critical aspect of this work.  It will
1319    guide us on which access to peel for, whether to use loop versioning, how
1320    many versions to create, etc.  The cost model will probably consist of
1321    generic considerations as well as target specific considerations (on
1322    powerpc for example, misaligned stores are more painful than misaligned
1323    loads).
1324
1325    Here are the general steps involved in alignment enhancements:
1326
1327      -- original loop, before alignment analysis:
1328         for (i=0; i<N; i++){
1329           x = q[i];                     # DR_MISALIGNMENT(q) = unknown
1330           p[i] = y;                     # DR_MISALIGNMENT(p) = unknown
1331         }
1332
1333      -- After vect_compute_data_refs_alignment:
1334         for (i=0; i<N; i++){
1335           x = q[i];                     # DR_MISALIGNMENT(q) = 3
1336           p[i] = y;                     # DR_MISALIGNMENT(p) = unknown
1337         }
1338
1339      -- Possibility 1: we do loop versioning:
1340      if (p is aligned) {
1341         for (i=0; i<N; i++){    # loop 1A
1342           x = q[i];                     # DR_MISALIGNMENT(q) = 3
1343           p[i] = y;                     # DR_MISALIGNMENT(p) = 0
1344         }
1345      }
1346      else {
1347         for (i=0; i<N; i++){    # loop 1B
1348           x = q[i];                     # DR_MISALIGNMENT(q) = 3
1349           p[i] = y;                     # DR_MISALIGNMENT(p) = unaligned
1350         }
1351      }
1352
1353      -- Possibility 2: we do loop peeling:
1354      for (i = 0; i < 3; i++){   # (scalar loop, not to be vectorized).
1355         x = q[i];
1356         p[i] = y;
1357      }
1358      for (i = 3; i < N; i++){   # loop 2A
1359         x = q[i];                       # DR_MISALIGNMENT(q) = 0
1360         p[i] = y;                       # DR_MISALIGNMENT(p) = unknown
1361      }
1362
1363      -- Possibility 3: combination of loop peeling and versioning:
1364      for (i = 0; i < 3; i++){   # (scalar loop, not to be vectorized).
1365         x = q[i];
1366         p[i] = y;
1367      }
1368      if (p is aligned) {
1369         for (i = 3; i<N; i++){  # loop 3A
1370           x = q[i];                     # DR_MISALIGNMENT(q) = 0
1371           p[i] = y;                     # DR_MISALIGNMENT(p) = 0
1372         }
1373      }
1374      else {
1375         for (i = 3; i<N; i++){  # loop 3B
1376           x = q[i];                     # DR_MISALIGNMENT(q) = 0
1377           p[i] = y;                     # DR_MISALIGNMENT(p) = unaligned
1378         }
1379      }
1380
1381      These loops are later passed to loop_transform to be vectorized.  The
1382      vectorizer will use the alignment information to guide the transformation
1383      (whether to generate regular loads/stores, or with special handling for
1384      misalignment).  */
1385
1386 bool
1387 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1388 {
1389   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1390   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1391   enum dr_alignment_support supportable_dr_alignment;
1392   struct data_reference *dr0 = NULL, *first_store = NULL;
1393   struct data_reference *dr;
1394   unsigned int i, j;
1395   bool do_peeling = false;
1396   bool do_versioning = false;
1397   bool stat;
1398   gimple *stmt;
1399   stmt_vec_info stmt_info;
1400   unsigned int npeel = 0;
1401   bool all_misalignments_unknown = true;
1402   unsigned int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1403   unsigned possible_npeel_number = 1;
1404   tree vectype;
1405   unsigned int nelements, mis, same_align_drs_max = 0;
1406   stmt_vector_for_cost body_cost_vec = stmt_vector_for_cost ();
1407   hash_table<peel_info_hasher> peeling_htab (1);
1408
1409   if (dump_enabled_p ())
1410     dump_printf_loc (MSG_NOTE, vect_location,
1411                      "=== vect_enhance_data_refs_alignment ===\n");
1412
1413   /* Reset data so we can safely be called multiple times.  */
1414   LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1415   LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
1416
1417   /* While cost model enhancements are expected in the future, the high level
1418      view of the code at this time is as follows:
1419
1420      A) If there is a misaligned access then see if peeling to align
1421         this access can make all data references satisfy
1422         vect_supportable_dr_alignment.  If so, update data structures
1423         as needed and return true.
1424
1425      B) If peeling wasn't possible and there is a data reference with an
1426         unknown misalignment that does not satisfy vect_supportable_dr_alignment
1427         then see if loop versioning checks can be used to make all data
1428         references satisfy vect_supportable_dr_alignment.  If so, update
1429         data structures as needed and return true.
1430
1431      C) If neither peeling nor versioning were successful then return false if
1432         any data reference does not satisfy vect_supportable_dr_alignment.
1433
1434      D) Return true (all data references satisfy vect_supportable_dr_alignment).
1435
1436      Note, Possibility 3 above (which is peeling and versioning together) is not
1437      being done at this time.  */
1438
1439   /* (1) Peeling to force alignment.  */
1440
1441   /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1442      Considerations:
1443      + How many accesses will become aligned due to the peeling
1444      - How many accesses will become unaligned due to the peeling,
1445        and the cost of misaligned accesses.
1446      - The cost of peeling (the extra runtime checks, the increase
1447        in code size).  */
1448
1449   FOR_EACH_VEC_ELT (datarefs, i, dr)
1450     {
1451       stmt = DR_STMT (dr);
1452       stmt_info = vinfo_for_stmt (stmt);
1453
1454       if (!STMT_VINFO_RELEVANT_P (stmt_info))
1455         continue;
1456
1457       /* For interleaving, only the alignment of the first access
1458          matters.  */
1459       if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1460           && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1461         continue;
1462
1463       /* For invariant accesses there is nothing to enhance.  */
1464       if (integer_zerop (DR_STEP (dr)))
1465         continue;
1466
1467       /* Strided accesses perform only component accesses, alignment is
1468          irrelevant for them.  */
1469       if (STMT_VINFO_STRIDED_P (stmt_info)
1470           && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1471         continue;
1472
1473       supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1474       do_peeling = vector_alignment_reachable_p (dr);
1475       if (do_peeling)
1476         {
1477           if (known_alignment_for_access_p (dr))
1478             {
1479               unsigned int npeel_tmp;
1480               bool negative = tree_int_cst_compare (DR_STEP (dr),
1481                                                     size_zero_node) < 0;
1482
1483               /* Save info about DR in the hash table.  */
1484               vectype = STMT_VINFO_VECTYPE (stmt_info);
1485               nelements = TYPE_VECTOR_SUBPARTS (vectype);
1486               mis = DR_MISALIGNMENT (dr) / GET_MODE_SIZE (TYPE_MODE (
1487                                                 TREE_TYPE (DR_REF (dr))));
1488               npeel_tmp = (negative
1489                            ? (mis - nelements) : (nelements - mis))
1490                   & (nelements - 1);
1491
1492               /* For multiple types, it is possible that the bigger type access
1493                  will have more than one peeling option.  E.g., a loop with two
1494                  types: one of size (vector size / 4), and the other one of
1495                  size (vector size / 8).  Vectorization factor will be 8.  If both
1496                  accesses are misaligned by 3, the first one needs one scalar
1497                  iteration to be aligned, and the second one needs 5.  But
1498                  the first one will be aligned also by peeling 5 scalar
1499                  iterations, and in that case both accesses will be aligned.
1500                  Hence, except for the immediate peeling amount, we also want
1501                  to try to add full vector size, while we don't exceed
1502                  vectorization factor.
1503                  We do this automatically for cost model, since we calculate cost
1504                  for every peeling option.  */
1505               if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1506                 {
1507                   if (STMT_SLP_TYPE (stmt_info))
1508                     possible_npeel_number
1509                       = (vf * GROUP_SIZE (stmt_info)) / nelements;
1510                   else
1511                     possible_npeel_number = vf / nelements;
1512                 }
1513
1514               /* Handle the aligned case. We may decide to align some other
1515                  access, making DR unaligned.  */
1516               if (DR_MISALIGNMENT (dr) == 0)
1517                 {
1518                   npeel_tmp = 0;
1519                   if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1520                     possible_npeel_number++;
1521                 }
1522
1523               for (j = 0; j < possible_npeel_number; j++)
1524                 {
1525                   vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
1526                                             dr, npeel_tmp);
1527                   npeel_tmp += nelements;
1528                 }
1529
1530               all_misalignments_unknown = false;
1531               /* Data-ref that was chosen for the case that all the
1532                  misalignments are unknown is not relevant anymore, since we
1533                  have a data-ref with known alignment.  */
1534               dr0 = NULL;
1535             }
1536           else
1537             {
1538               /* If we don't know any misalignment values, we prefer
1539                  peeling for data-ref that has the maximum number of data-refs
1540                  with the same alignment, unless the target prefers to align
1541                  stores over loads.  */
1542               if (all_misalignments_unknown)
1543                 {
1544                   unsigned same_align_drs
1545                     = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1546                   if (!dr0
1547                       || same_align_drs_max < same_align_drs)
1548                     {
1549                       same_align_drs_max = same_align_drs;
1550                       dr0 = dr;
1551                     }
1552                   /* For data-refs with the same number of related
1553                      accesses prefer the one where the misalign
1554                      computation will be invariant in the outermost loop.  */
1555                   else if (same_align_drs_max == same_align_drs)
1556                     {
1557                       struct loop *ivloop0, *ivloop;
1558                       ivloop0 = outermost_invariant_loop_for_expr
1559                           (loop, DR_BASE_ADDRESS (dr0));
1560                       ivloop = outermost_invariant_loop_for_expr
1561                           (loop, DR_BASE_ADDRESS (dr));
1562                       if ((ivloop && !ivloop0)
1563                           || (ivloop && ivloop0
1564                               && flow_loop_nested_p (ivloop, ivloop0)))
1565                         dr0 = dr;
1566                     }
1567
1568                   if (!first_store && DR_IS_WRITE (dr))
1569                     first_store = dr;
1570                 }
1571
1572               /* If there are both known and unknown misaligned accesses in the
1573                  loop, we choose peeling amount according to the known
1574                  accesses.  */
1575               if (!supportable_dr_alignment)
1576                 {
1577                   dr0 = dr;
1578                   if (!first_store && DR_IS_WRITE (dr))
1579                     first_store = dr;
1580                 }
1581             }
1582         }
1583       else
1584         {
1585           if (!aligned_access_p (dr))
1586             {
1587               if (dump_enabled_p ())
1588                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1589                                  "vector alignment may not be reachable\n");
1590               break;
1591             }
1592         }
1593     }
1594
1595   /* Check if we can possibly peel the loop.  */
1596   if (!vect_can_advance_ivs_p (loop_vinfo)
1597       || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))
1598       || loop->inner)
1599     do_peeling = false;
1600
1601   if (do_peeling
1602       && all_misalignments_unknown
1603       && vect_supportable_dr_alignment (dr0, false))
1604     {
1605       /* Check if the target requires to prefer stores over loads, i.e., if
1606          misaligned stores are more expensive than misaligned loads (taking
1607          drs with same alignment into account).  */
1608       if (first_store && DR_IS_READ (dr0))
1609         {
1610           unsigned int load_inside_cost = 0, load_outside_cost = 0;
1611           unsigned int store_inside_cost = 0, store_outside_cost = 0;
1612           unsigned int load_inside_penalty = 0, load_outside_penalty = 0;
1613           unsigned int store_inside_penalty = 0, store_outside_penalty = 0;
1614           stmt_vector_for_cost dummy;
1615           dummy.create (2);
1616
1617           vect_get_data_access_cost (dr0, &load_inside_cost, &load_outside_cost,
1618                                      &dummy);
1619           vect_get_data_access_cost (first_store, &store_inside_cost,
1620                                      &store_outside_cost, &dummy);
1621
1622           dummy.release ();
1623
1624           /* Calculate the penalty for leaving FIRST_STORE unaligned (by
1625              aligning the load DR0).  */
1626           load_inside_penalty = store_inside_cost;
1627           load_outside_penalty = store_outside_cost;
1628           for (i = 0;
1629                STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1630                           DR_STMT (first_store))).iterate (i, &dr);
1631                i++)
1632             if (DR_IS_READ (dr))
1633               {
1634                 load_inside_penalty += load_inside_cost;
1635                 load_outside_penalty += load_outside_cost;
1636               }
1637             else
1638               {
1639                 load_inside_penalty += store_inside_cost;
1640                 load_outside_penalty += store_outside_cost;
1641               }
1642
1643           /* Calculate the penalty for leaving DR0 unaligned (by
1644              aligning the FIRST_STORE).  */
1645           store_inside_penalty = load_inside_cost;
1646           store_outside_penalty = load_outside_cost;
1647           for (i = 0;
1648                STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1649                       DR_STMT (dr0))).iterate (i, &dr);
1650                i++)
1651             if (DR_IS_READ (dr))
1652               {
1653                 store_inside_penalty += load_inside_cost;
1654                 store_outside_penalty += load_outside_cost;
1655               }
1656             else
1657               {
1658                 store_inside_penalty += store_inside_cost;
1659                 store_outside_penalty += store_outside_cost;
1660               }
1661
1662           if (load_inside_penalty > store_inside_penalty
1663               || (load_inside_penalty == store_inside_penalty
1664                   && load_outside_penalty > store_outside_penalty))
1665             dr0 = first_store;
1666         }
1667
1668       /* In case there are only loads with different unknown misalignments, use
1669          peeling only if it may help to align other accesses in the loop or
1670          if it may help improving load bandwidth when we'd end up using
1671          unaligned loads.  */
1672       tree dr0_vt = STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr0)));
1673       if (!first_store
1674           && !STMT_VINFO_SAME_ALIGN_REFS (
1675                   vinfo_for_stmt (DR_STMT (dr0))).length ()
1676           && (vect_supportable_dr_alignment (dr0, false)
1677               != dr_unaligned_supported
1678               || (builtin_vectorization_cost (vector_load, dr0_vt, 0)
1679                   == builtin_vectorization_cost (unaligned_load, dr0_vt, -1))))
1680         do_peeling = false;
1681     }
1682
1683   if (do_peeling && !dr0)
1684     {
1685       /* Peeling is possible, but there is no data access that is not supported
1686          unless aligned. So we try to choose the best possible peeling.  */
1687
1688       /* We should get here only if there are drs with known misalignment.  */
1689       gcc_assert (!all_misalignments_unknown);
1690
1691       /* Choose the best peeling from the hash table.  */
1692       dr0 = vect_peeling_hash_choose_best_peeling (&peeling_htab,
1693                                                    loop_vinfo, &npeel,
1694                                                    &body_cost_vec);
1695       if (!dr0 || !npeel)
1696         do_peeling = false;
1697     }
1698
1699   if (do_peeling)
1700     {
1701       stmt = DR_STMT (dr0);
1702       stmt_info = vinfo_for_stmt (stmt);
1703       vectype = STMT_VINFO_VECTYPE (stmt_info);
1704       nelements = TYPE_VECTOR_SUBPARTS (vectype);
1705
1706       if (known_alignment_for_access_p (dr0))
1707         {
1708           bool negative = tree_int_cst_compare (DR_STEP (dr0),
1709                                                 size_zero_node) < 0;
1710           if (!npeel)
1711             {
1712               /* Since it's known at compile time, compute the number of
1713                  iterations in the peeled loop (the peeling factor) for use in
1714                  updating DR_MISALIGNMENT values.  The peeling factor is the
1715                  vectorization factor minus the misalignment as an element
1716                  count.  */
1717               mis = DR_MISALIGNMENT (dr0);
1718               mis /= GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr0))));
1719               npeel = ((negative ? mis - nelements : nelements - mis)
1720                        & (nelements - 1));
1721             }
1722
1723           /* For interleaved data access every iteration accesses all the
1724              members of the group, therefore we divide the number of iterations
1725              by the group size.  */
1726           stmt_info = vinfo_for_stmt (DR_STMT (dr0));
1727           if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1728             npeel /= GROUP_SIZE (stmt_info);
1729
1730           if (dump_enabled_p ())
1731             dump_printf_loc (MSG_NOTE, vect_location,
1732                              "Try peeling by %d\n", npeel);
1733         }
1734
1735       /* Ensure that all data refs can be vectorized after the peel.  */
1736       FOR_EACH_VEC_ELT (datarefs, i, dr)
1737         {
1738           int save_misalignment;
1739
1740           if (dr == dr0)
1741             continue;
1742
1743           stmt = DR_STMT (dr);
1744           stmt_info = vinfo_for_stmt (stmt);
1745           /* For interleaving, only the alignment of the first access
1746             matters.  */
1747           if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1748               && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1749             continue;
1750
1751           /* Strided accesses perform only component accesses, alignment is
1752              irrelevant for them.  */
1753           if (STMT_VINFO_STRIDED_P (stmt_info)
1754               && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1755             continue;
1756
1757           save_misalignment = DR_MISALIGNMENT (dr);
1758           vect_update_misalignment_for_peel (dr, dr0, npeel);
1759           supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1760           SET_DR_MISALIGNMENT (dr, save_misalignment);
1761
1762           if (!supportable_dr_alignment)
1763             {
1764               do_peeling = false;
1765               break;
1766             }
1767         }
1768
1769       if (do_peeling && known_alignment_for_access_p (dr0) && npeel == 0)
1770         {
1771           stat = vect_verify_datarefs_alignment (loop_vinfo);
1772           if (!stat)
1773             do_peeling = false;
1774           else
1775             {
1776               body_cost_vec.release ();
1777               return stat;
1778             }
1779         }
1780
1781       /* Cost model #1 - honor --param vect-max-peeling-for-alignment.  */
1782       if (do_peeling)
1783         {
1784           unsigned max_allowed_peel
1785             = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT);
1786           if (max_allowed_peel != (unsigned)-1)
1787             {
1788               unsigned max_peel = npeel;
1789               if (max_peel == 0)
1790                 {
1791                   gimple *dr_stmt = DR_STMT (dr0);
1792                   stmt_vec_info vinfo = vinfo_for_stmt (dr_stmt);
1793                   tree vtype = STMT_VINFO_VECTYPE (vinfo);
1794                   max_peel = TYPE_VECTOR_SUBPARTS (vtype) - 1;
1795                 }
1796               if (max_peel > max_allowed_peel)
1797                 {
1798                   do_peeling = false;
1799                   if (dump_enabled_p ())
1800                     dump_printf_loc (MSG_NOTE, vect_location,
1801                         "Disable peeling, max peels reached: %d\n", max_peel);
1802                 }
1803             }
1804         }
1805
1806       /* Cost model #2 - if peeling may result in a remaining loop not
1807          iterating enough to be vectorized then do not peel.  */
1808       if (do_peeling
1809           && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1810         {
1811           unsigned max_peel
1812             = npeel == 0 ? LOOP_VINFO_VECT_FACTOR (loop_vinfo) - 1 : npeel;
1813           if (LOOP_VINFO_INT_NITERS (loop_vinfo)
1814               < LOOP_VINFO_VECT_FACTOR (loop_vinfo) + max_peel)
1815             do_peeling = false;
1816         }
1817
1818       if (do_peeling)
1819         {
1820           /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
1821              If the misalignment of DR_i is identical to that of dr0 then set
1822              DR_MISALIGNMENT (DR_i) to zero.  If the misalignment of DR_i and
1823              dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
1824              by the peeling factor times the element size of DR_i (MOD the
1825              vectorization factor times the size).  Otherwise, the
1826              misalignment of DR_i must be set to unknown.  */
1827           FOR_EACH_VEC_ELT (datarefs, i, dr)
1828             if (dr != dr0)
1829               vect_update_misalignment_for_peel (dr, dr0, npeel);
1830
1831           LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0;
1832           if (npeel)
1833             LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
1834           else
1835             LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1836               = DR_MISALIGNMENT (dr0);
1837           SET_DR_MISALIGNMENT (dr0, 0);
1838           if (dump_enabled_p ())
1839             {
1840               dump_printf_loc (MSG_NOTE, vect_location,
1841                                "Alignment of access forced using peeling.\n");
1842               dump_printf_loc (MSG_NOTE, vect_location,
1843                                "Peeling for alignment will be applied.\n");
1844             }
1845           /* The inside-loop cost will be accounted for in vectorizable_load
1846              and vectorizable_store correctly with adjusted alignments.
1847              Drop the body_cost_vec on the floor here.  */
1848           body_cost_vec.release ();
1849
1850           stat = vect_verify_datarefs_alignment (loop_vinfo);
1851           gcc_assert (stat);
1852           return stat;
1853         }
1854     }
1855
1856   body_cost_vec.release ();
1857
1858   /* (2) Versioning to force alignment.  */
1859
1860   /* Try versioning if:
1861      1) optimize loop for speed
1862      2) there is at least one unsupported misaligned data ref with an unknown
1863         misalignment, and
1864      3) all misaligned data refs with a known misalignment are supported, and
1865      4) the number of runtime alignment checks is within reason.  */
1866
1867   do_versioning =
1868         optimize_loop_nest_for_speed_p (loop)
1869         && (!loop->inner); /* FORNOW */
1870
1871   if (do_versioning)
1872     {
1873       FOR_EACH_VEC_ELT (datarefs, i, dr)
1874         {
1875           stmt = DR_STMT (dr);
1876           stmt_info = vinfo_for_stmt (stmt);
1877
1878           /* For interleaving, only the alignment of the first access
1879              matters.  */
1880           if (aligned_access_p (dr)
1881               || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1882                   && GROUP_FIRST_ELEMENT (stmt_info) != stmt))
1883             continue;
1884
1885           if (STMT_VINFO_STRIDED_P (stmt_info))
1886             {
1887               /* Strided loads perform only component accesses, alignment is
1888                  irrelevant for them.  */
1889               if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
1890                 continue;
1891               do_versioning = false;
1892               break;
1893             }
1894
1895           supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1896
1897           if (!supportable_dr_alignment)
1898             {
1899               gimple *stmt;
1900               int mask;
1901               tree vectype;
1902
1903               if (known_alignment_for_access_p (dr)
1904                   || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
1905                      >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
1906                 {
1907                   do_versioning = false;
1908                   break;
1909                 }
1910
1911               stmt = DR_STMT (dr);
1912               vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
1913               gcc_assert (vectype);
1914
1915               /* The rightmost bits of an aligned address must be zeros.
1916                  Construct the mask needed for this test.  For example,
1917                  GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
1918                  mask must be 15 = 0xf. */
1919               mask = GET_MODE_SIZE (TYPE_MODE (vectype)) - 1;
1920
1921               /* FORNOW: use the same mask to test all potentially unaligned
1922                  references in the loop.  The vectorizer currently supports
1923                  a single vector size, see the reference to
1924                  GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
1925                  vectorization factor is computed.  */
1926               gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
1927                           || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
1928               LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
1929               LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (
1930                       DR_STMT (dr));
1931             }
1932         }
1933
1934       /* Versioning requires at least one misaligned data reference.  */
1935       if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
1936         do_versioning = false;
1937       else if (!do_versioning)
1938         LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1939     }
1940
1941   if (do_versioning)
1942     {
1943       vec<gimple *> may_misalign_stmts
1944         = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
1945       gimple *stmt;
1946
1947       /* It can now be assumed that the data references in the statements
1948          in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
1949          of the loop being vectorized.  */
1950       FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt)
1951         {
1952           stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1953           dr = STMT_VINFO_DATA_REF (stmt_info);
1954           SET_DR_MISALIGNMENT (dr, 0);
1955           if (dump_enabled_p ())
1956             dump_printf_loc (MSG_NOTE, vect_location,
1957                              "Alignment of access forced using versioning.\n");
1958         }
1959
1960       if (dump_enabled_p ())
1961         dump_printf_loc (MSG_NOTE, vect_location,
1962                          "Versioning for alignment will be applied.\n");
1963
1964       /* Peeling and versioning can't be done together at this time.  */
1965       gcc_assert (! (do_peeling && do_versioning));
1966
1967       stat = vect_verify_datarefs_alignment (loop_vinfo);
1968       gcc_assert (stat);
1969       return stat;
1970     }
1971
1972   /* This point is reached if neither peeling nor versioning is being done.  */
1973   gcc_assert (! (do_peeling || do_versioning));
1974
1975   stat = vect_verify_datarefs_alignment (loop_vinfo);
1976   return stat;
1977 }
1978
1979
1980 /* Function vect_find_same_alignment_drs.
1981
1982    DDR relates two data references.  For each of its distance vectors,
1983    take the distance at the depth of the loop of LOOP_VINFO; when that
1984    distance is zero, or is a multiple of the vectorization factor while
1985    both references have the same element mode size, push each reference
1986    onto the other's STMT_VINFO_SAME_ALIGN_REFS vector.  */
1987
1988 static void
1989 vect_find_same_alignment_drs (struct data_dependence_relation *ddr,
1990                               loop_vec_info loop_vinfo)
1991 {
1992   struct data_reference *ref_a = DDR_A (ddr);
1993   struct data_reference *ref_b = DDR_B (ddr);
1994   stmt_vec_info info_a = vinfo_for_stmt (DR_STMT (ref_a));
1995   stmt_vec_info info_b = vinfo_for_stmt (DR_STMT (ref_b));
1996   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1997   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1998   int size_a = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (ref_a))));
1999   int size_b = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (ref_b))));
2000   lambda_vector dv;
2001   unsigned int ix, depth;
2002
2003   /* Nothing to record when DDR_ARE_DEPENDENT is chrec_known or
2004      chrec_dont_know, when both ends of DDR are the same reference, or
2005      when DDR carries no distance vectors.  */
2006   if (DDR_ARE_DEPENDENT (ddr) == chrec_known
2007       || ref_a == ref_b
2008       || DDR_ARE_DEPENDENT (ddr) == chrec_dont_know
2009       || DDR_NUM_DIST_VECTS (ddr) == 0)
2010     return;
2011
2012   /* A zero distance vector is also reported for references that overlap
2013      only in the first iteration but step with different signs (see
2014      PR45764), so as a sanity check insist on equal DR_STEP.  */
2015   if (!operand_equal_p (DR_STEP (ref_a), DR_STEP (ref_b), 0))
2016     return;
2017
2018   depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
2019   FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), ix, dv)
2020     {
2021       int dist = dv[depth];
2022       bool same_align;
2023
2024       if (dump_enabled_p ())
2025         dump_printf_loc (MSG_NOTE, vect_location,
2026                          "dependence distance  = %d.\n", dist);
2027
2028       /* Zero distance means the same loop iteration; a distance that is
2029          a multiple of the vectorization factor qualifies only when the
2030          two element sizes agree.  */
2031       same_align = (dist == 0
2032                     || (dist % vf == 0 && size_a == size_b));
2033       if (!same_align)
2034         continue;
2035
2036       STMT_VINFO_SAME_ALIGN_REFS (info_a).safe_push (ref_b);
2037       STMT_VINFO_SAME_ALIGN_REFS (info_b).safe_push (ref_a);
2038       if (!dump_enabled_p ())
2039         continue;
2040
2041       dump_printf_loc (MSG_NOTE, vect_location,
2042                        "accesses have the same alignment.\n");
2043       dump_printf (MSG_NOTE,
2044                    "dependence distance modulo vf == 0 between ");
2045       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (ref_a));
2046       dump_printf (MSG_NOTE,  " and ");
2047       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (ref_b));
2048       dump_printf (MSG_NOTE, "\n");
2049     }
2050 }
2051
2052
2053 /* Function vect_analyze_data_refs_alignment
2054
2055    Mark same-alignment relations from the data dependences of VINFO and
2056    compute the alignment of every vectorizable data reference.  Return
2057    FALSE if that fails for anything but a non-grouped strided access.  */
2058
2059 bool
2060 vect_analyze_data_refs_alignment (loop_vec_info vinfo)
2061 {
2062   unsigned int ix;
2063   struct data_dependence_relation *rel;
2064   struct data_reference *ref;
2065
2066   if (dump_enabled_p ())
2067     dump_printf_loc (MSG_NOTE, vect_location,
2068                      "=== vect_analyze_data_refs_alignment ===\n");
2069
2070   /* Mark groups of same-aligned references using dependence info.  */
2071   vec<ddr_p> relations = vinfo->ddrs;
2072   FOR_EACH_VEC_ELT (relations, ix, rel)
2073     vect_find_same_alignment_drs (rel, vinfo);
2074
2075   vec<data_reference_p> refs = vinfo->datarefs;
2076   FOR_EACH_VEC_ELT (refs, ix, ref)
2077     {
2078       stmt_vec_info info = vinfo_for_stmt (DR_STMT (ref));
2079
2080       /* Only a vectorizable statement whose alignment cannot be
2081          computed needs further attention.  */
2082       if (!STMT_VINFO_VECTORIZABLE (info)
2083           || vect_compute_data_ref_alignment (ref))
2084         continue;
2085
2086       /* Strided accesses perform only component accesses, so missing
2087          misalignment information is irrelevant for them.  */
2088       if (STMT_VINFO_STRIDED_P (info) && !STMT_VINFO_GROUPED_ACCESS (info))
2089         continue;
2090
2091       if (dump_enabled_p ())
2092         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2093                          "not vectorized: can't calculate alignment "
2094                          "for data ref.\n");
2095       return false;
2096     }
2097
2098   return true;
2099 }
2100
2101
2102 /* Compute the alignment of the data reference that vectorization of
2103    NODE starts from and verify it; return FALSE if either step fails.  */
2104
2105 static bool
2106 vect_slp_analyze_and_verify_node_alignment (slp_tree node)
2107 {
2108   gimple *head_stmt = SLP_TREE_SCALAR_STMTS (node)[0];
2109   data_reference_p head_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (head_stmt));
2110
2111   /* Vectorization starts at the node's first scalar stmt, or at the
2112      first element of the group when the node is permuted.  */
2113   gimple *start_stmt = head_stmt;
2114   if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2115     start_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (head_stmt));
2116   data_reference_p dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (start_stmt));
2117
2118   bool ok = vect_compute_data_ref_alignment (dr);
2119   /* The alignment of the first element is needed anyway for creating
2120      the data-ref pointer.  */
2121   if (ok && dr != head_dr)
2122     ok = vect_compute_data_ref_alignment (head_dr);
2123   if (ok)
2124     ok = verify_data_ref_alignment (dr);
2125
2126   if (!ok && dump_enabled_p ())
2127     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2128                      "not vectorized: bad data alignment in basic "
2129                      "block.\n");
2130   return ok;
2131 }
2132
2133 /* Function vect_slp_analyze_and_verify_instance_alignment
2134
2135    Analyze and verify the alignment of the load nodes of INSTANCE and of
2136    its root node if that has a data reference.  Return FALSE on failure.  */
2137
2138 bool
2139 vect_slp_analyze_and_verify_instance_alignment (slp_instance instance)
2140 {
2141   unsigned ix;
2142   slp_tree load_node;
2143
2144   if (dump_enabled_p ())
2145     dump_printf_loc (MSG_NOTE, vect_location,
2146                      "=== vect_slp_analyze_and_verify_instance_alignment ===\n");
2147
2148   FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), ix, load_node)
2149     if (! vect_slp_analyze_and_verify_node_alignment (load_node))
2150       return false;
2151
2152   /* Check the root only if its first scalar stmt has a data-ref.  */
2153   slp_tree root = SLP_INSTANCE_TREE (instance);
2154   gimple *root_stmt = SLP_TREE_SCALAR_STMTS (root)[0];
2155   if (!STMT_VINFO_DATA_REF (vinfo_for_stmt (root_stmt)))
2156     return true;
2157   return vect_slp_analyze_and_verify_node_alignment (root);
2158 }
2159
2160
2161 /* Analyze groups of accesses: check that DR belongs to a group of
2162    accesses of legal size, step, etc.  Detect gaps, single element
2163    interleaving, and other special cases. Set grouped access info.
2164    Collect groups of strided stores for further use in SLP analysis.
2165    Worker for vect_analyze_group_access.  */
2166
2167 static bool
2168 vect_analyze_group_access_1 (struct data_reference *dr)
2169 {
2170   tree step = DR_STEP (dr);
2171   tree scalar_type = TREE_TYPE (DR_REF (dr));
2172   HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2173   gimple *stmt = DR_STMT (dr);
2174   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2175   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2176   bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2177   HOST_WIDE_INT dr_step = -1;
2178   HOST_WIDE_INT groupsize, last_accessed_element = 1;
2179   bool slp_impossible = false;
2180
2181   /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2182      size of the interleaving group (including gaps).  */
2183   if (tree_fits_shwi_p (step))
2184     {
2185       dr_step = tree_to_shwi (step);
2186       /* Check that STEP is a multiple of type size.  Otherwise there is
2187          a non-element-sized gap at the end of the group which we
2188          cannot represent in GROUP_GAP or GROUP_SIZE.
2189          ???  As we can handle non-constant step fine here we should
2190          simply remove uses of GROUP_GAP between the last and first
2191          element and instead rely on DR_STEP.  GROUP_SIZE then would
2192          simply not include that gap.  */
2193       if ((dr_step % type_size) != 0)
2194         {
2195           if (dump_enabled_p ())
2196             {
2197               dump_printf_loc (MSG_NOTE, vect_location,
2198                                "Step ");
2199               dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2200               dump_printf (MSG_NOTE,
2201                            " is not a multiple of the element size for ");
2202               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2203               dump_printf (MSG_NOTE, "\n");
2204             }
2205           return false;
2206         }
2207       groupsize = absu_hwi (dr_step) / type_size;
2208     }
2209   else
2210     groupsize = 0;
2211
2212   /* Not consecutive access is possible only if it is a part of interleaving.  */
2213   if (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
2214     {
2215       /* Check if this DR is a part of interleaving, and is a single
2216          element of the group that is accessed in the loop.  */
2217
2218       /* Gaps are supported only for loads. STEP must be a multiple of the type
2219          size.  The size of the group must be a power of 2.  */
2220       if (DR_IS_READ (dr)
2221           && (dr_step % type_size) == 0
2222           && groupsize > 0
2223           && exact_log2 (groupsize) != -1)
2224         {
2225           GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = stmt;
2226           GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2227           if (dump_enabled_p ())
2228             {
2229               dump_printf_loc (MSG_NOTE, vect_location,
2230                                "Detected single element interleaving ");
2231               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2232               dump_printf (MSG_NOTE, " step ");
2233               dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2234               dump_printf (MSG_NOTE, "\n");
2235             }
2236
2237           return true;
2238         }
2239
2240       if (dump_enabled_p ())
2241         {
2242           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2243                            "not consecutive access ");
2244           dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2245         }
2246
2247       if (bb_vinfo)
2248         {
2249           /* Mark the statement as unvectorizable.  */
2250           STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2251           return true;
2252         }
2253
2254       dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
2255       STMT_VINFO_STRIDED_P (stmt_info) = true;
2256       return true;
2257     }
2258
2259   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt)
2260     {
2261       /* First stmt in the interleaving chain. Check the chain.  */
2262       gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
2263       struct data_reference *data_ref = dr;
2264       unsigned int count = 1;
2265       tree prev_init = DR_INIT (data_ref);
2266       gimple *prev = stmt;
2267       HOST_WIDE_INT diff, gaps = 0;
2268
2269       while (next)
2270         {
2271           /* Skip same data-refs.  In case that two or more stmts share
2272              data-ref (supported only for loads), we vectorize only the first
2273              stmt, and the rest get their vectorized loads from the first
2274              one.  */
2275           if (!tree_int_cst_compare (DR_INIT (data_ref),
2276                                      DR_INIT (STMT_VINFO_DATA_REF (
2277                                                    vinfo_for_stmt (next)))))
2278             {
2279               if (DR_IS_WRITE (data_ref))
2280                 {
2281                   if (dump_enabled_p ())
2282                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2283                                      "Two store stmts share the same dr.\n");
2284                   return false;
2285                 }
2286
2287               if (dump_enabled_p ())
2288                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2289                                  "Two or more load stmts share the same dr.\n");
2290
2291               /* For load use the same data-ref load.  */
2292               GROUP_SAME_DR_STMT (vinfo_for_stmt (next)) = prev;
2293
2294               prev = next;
2295               next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2296               continue;
2297             }
2298
2299           prev = next;
2300           data_ref = STMT_VINFO_DATA_REF (vinfo_for_stmt (next));
2301
2302           /* All group members have the same STEP by construction.  */
2303           gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2304
2305           /* Check that the distance between two accesses is equal to the type
2306              size. Otherwise, we have gaps.  */
2307           diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2308                   - TREE_INT_CST_LOW (prev_init)) / type_size;
2309           if (diff != 1)
2310             {
2311               /* FORNOW: SLP of accesses with gaps is not supported.  */
2312               slp_impossible = true;
2313               if (DR_IS_WRITE (data_ref))
2314                 {
2315                   if (dump_enabled_p ())
2316                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2317                                      "interleaved store with gaps\n");
2318                   return false;
2319                 }
2320
2321               gaps += diff - 1;
2322             }
2323
2324           last_accessed_element += diff;
2325
2326           /* Store the gap from the previous member of the group. If there is no
2327              gap in the access, GROUP_GAP is always 1.  */
2328           GROUP_GAP (vinfo_for_stmt (next)) = diff;
2329
2330           prev_init = DR_INIT (data_ref);
2331           next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2332           /* Count the number of data-refs in the chain.  */
2333           count++;
2334         }
2335
2336       if (groupsize == 0)
2337         groupsize = count + gaps;
2338
2339       if (groupsize > UINT_MAX)
2340         {
2341           if (dump_enabled_p ())
2342             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2343                              "group is too large\n");
2344           return false;
2345         }
2346
2347       /* Check that the size of the interleaving is equal to count for stores,
2348          i.e., that there are no gaps.  */
2349       if (groupsize != count
2350           && !DR_IS_READ (dr))
2351         {
2352           if (dump_enabled_p ())
2353             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2354                              "interleaved store with gaps\n");
2355           return false;
2356         }
2357
2358       /* If there is a gap after the last load in the group it is the
2359          difference between the groupsize and the last accessed
2360          element.
2361          When there is no gap, this difference should be 0.  */
2362       GROUP_GAP (vinfo_for_stmt (stmt)) = groupsize - last_accessed_element;
2363
2364       GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2365       if (dump_enabled_p ())
2366         {
2367           dump_printf_loc (MSG_NOTE, vect_location,
2368                            "Detected interleaving ");
2369           if (DR_IS_READ (dr))
2370             dump_printf (MSG_NOTE, "load ");
2371           else
2372             dump_printf (MSG_NOTE, "store ");
2373           dump_printf (MSG_NOTE, "of size %u starting with ",
2374                        (unsigned)groupsize);
2375           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
2376           if (GROUP_GAP (vinfo_for_stmt (stmt)) != 0)
2377             dump_printf_loc (MSG_NOTE, vect_location,
2378                              "There is a gap of %u elements after the group\n",
2379                              GROUP_GAP (vinfo_for_stmt (stmt)));
2380         }
2381
2382       /* SLP: create an SLP data structure for every interleaving group of
2383          stores for further analysis in vect_analyse_slp.  */
2384       if (DR_IS_WRITE (dr) && !slp_impossible)
2385         {
2386           if (loop_vinfo)
2387             LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt);
2388           if (bb_vinfo)
2389             BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt);
2390         }
2391     }
2392
2393   return true;
2394 }
2395
2396 /* Run vect_analyze_group_access_1 on the data reference DR and return
2397    true if it accepts the access.  If it rejects it, dissolve the group
2398    DR's statement belongs to (if any) by clearing the first/next links of
2399    every member of the chain, and return false.  */
2400
2401 static bool
2402 vect_analyze_group_access (struct data_reference *dr)
2403 {
2404   if (vect_analyze_group_access_1 (dr))
2405     return true;
2406
2407   /* The analysis failed: walk the chain from its first element and clear
2408      both group links of each member.  The successor has to be fetched
2409      before the links of the current member are reset.  */
2410   gimple *member = GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dr)));
2411   while (member)
2412     {
2413       stmt_vec_info member_info = vinfo_for_stmt (member);
2414       gimple *successor = GROUP_NEXT_ELEMENT (member_info);
2415       GROUP_FIRST_ELEMENT (member_info) = NULL;
2416       GROUP_NEXT_ELEMENT (member_info) = NULL;
2417       member = successor;
2418     }
2419   return false;
2420 }
2421
2422 /* Analyze the access pattern of the data-reference DR and return true if
2423    it is supported.  Zero-step and consecutive accesses are decided here;
2424    the rest is rejected or handed to vect_analyze_group_access ().  */
2425
2426 static bool
2427 vect_analyze_data_ref_access (struct data_reference *dr)
2428 {
2429   tree step = DR_STEP (dr);
2430   tree scalar_type = TREE_TYPE (DR_REF (dr));
2431   gimple *stmt = DR_STMT (dr);
2432   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2433   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2434   struct loop *loop = NULL;
2435   /* LOOP is only set when there is a LOOP_VINFO; otherwise it stays NULL.  */
2436   if (loop_vinfo)
2437     loop = LOOP_VINFO_LOOP (loop_vinfo);
2438   /* With a LOOP_VINFO, a DR without a step is rejected.  */
2439   if (loop_vinfo && !step)
2440     {
2441       if (dump_enabled_p ())
2442         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2443                          "bad data-ref access in loop\n");
2444       return false;
2445     }
2446
2447   /* Zero step in a loop: ungroup the DR; unless nested, only loads are OK.  */
2448   if (loop_vinfo && integer_zerop (step))
2449     {
2450       GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2451       if (!nested_in_vect_loop_p (loop, stmt))
2452         return DR_IS_READ (dr);
2453       /* Allow references with zero step for outer loops marked
2454          with pragma omp simd only - it guarantees absence of
2455          loop-carried dependencies between inner loop iterations.  */
2456       if (!loop->force_vectorize)
2457         {
2458           if (dump_enabled_p ())
2459             dump_printf_loc (MSG_NOTE, vect_location,
2460                              "zero step in inner loop of nest\n");
2461           return false;
2462         }
2463     }
2464   /* Statements nested in the loop being vectorized: switch to the outer step.  */
2465   if (loop && nested_in_vect_loop_p (loop, stmt))
2466     {
2467       /* Interleaved accesses are not yet supported within outer-loop
2468         vectorization for references in the inner-loop.  */
2469       GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2470
2471       /* For the rest of the analysis we use the outer-loop step.  */
2472       step = STMT_VINFO_DR_STEP (stmt_info);
2473       if (integer_zerop (step))
2474         {
2475           if (dump_enabled_p ())
2476             dump_printf_loc (MSG_NOTE, vect_location,
2477                              "zero step in outer loop.\n");
2478           return DR_IS_READ (dr);
2479         }
2480     }
2481
2482   /* Consecutive?  I.e. a constant step equal to plus or minus the scalar size.  */
2483   if (TREE_CODE (step) == INTEGER_CST)
2484     {
2485       HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2486       if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2487           || (dr_step < 0
2488               && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2489         {
2490           /* Mark that it is not interleaving.  */
2491           GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2492           return true;
2493         }
2494     }
2495   /* A non-consecutive access of a nested statement is rejected.  */
2496   if (loop && nested_in_vect_loop_p (loop, stmt))
2497     {
2498       if (dump_enabled_p ())
2499         dump_printf_loc (MSG_NOTE, vect_location,
2500                          "grouped access in outer loop.\n");
2501       return false;
2502     }
2503
2504
2505   /* A non-constant step needs a strided DR; if grouped, the group must pass.  */
2506   if (TREE_CODE (step) != INTEGER_CST)
2507     return (STMT_VINFO_STRIDED_P (stmt_info)
2508             && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2509                 || vect_analyze_group_access (dr)));
2510
2511   /* Not consecutive access - check if it's a part of interleaving group.  */
2512   return vect_analyze_group_access (dr);
2513 }
2514
2515
2516
2517 /* Helper for the comparators used to sort data references.  Impose an
2518    order on the trees T1 and T2 and return -1 if T1 sorts first, 1 if T2
2519    sorts first, or 0 if no difference between them was found.  */
2520
2521 static int
2522 compare_tree (tree t1, tree t2)
2523 {
2524   /* The same pointer, which includes two NULLs, compares equal; a single
2525      NULL sorts before any real tree.  */
2526   if (t1 == t2)
2527     return 0;
2528   if (t1 == NULL)
2529     return -1;
2530   if (t2 == NULL)
2531     return 1;
2532
2533   STRIP_NOPS (t1);
2534   STRIP_NOPS (t2);
2535
2536   /* Trees of different kinds are ordered by their tree code.  */
2537   const enum tree_code code = TREE_CODE (t1);
2538   const enum tree_code other_code = TREE_CODE (t2);
2539   if (code != other_code)
2540     return code < other_code ? -1 : 1;
2541
2542   if (code == INTEGER_CST
2543       || code == REAL_CST
2544       || code == FIXED_CST
2545       || code == STRING_CST
2546       || code == COMPLEX_CST
2547       || code == VECTOR_CST)
2548     {
2549       /* Constants are ordered by their hash values; equal hashes are
2550          reported as no difference.  */
2551       const hashval_t hash1 = iterative_hash_expr (t1, 0);
2552       const hashval_t hash2 = iterative_hash_expr (t2, 0);
2553       if (hash1 == hash2)
2554         return 0;
2555       return hash1 < hash2 ? -1 : 1;
2556     }
2557
2558   if (code == SSA_NAME)
2559     {
2560       /* SSA names are ordered by their underlying variable first and by
2561          their version number second.  */
2562       const int var_order = compare_tree (SSA_NAME_VAR (t1), SSA_NAME_VAR (t2));
2563       if (var_order != 0)
2564         return var_order;
2565       if (SSA_NAME_VERSION (t1) == SSA_NAME_VERSION (t2))
2566         return 0;
2567       return SSA_NAME_VERSION (t1) < SSA_NAME_VERSION (t2) ? -1 : 1;
2568     }
2569
2570   if (TREE_CODE_CLASS (code) == tcc_declaration)
2571     {
2572       /* Declarations are ordered by their UIDs.  */
2573       if (DECL_UID (t1) == DECL_UID (t2))
2574         return 0;
2575       return DECL_UID (t1) < DECL_UID (t2) ? -1 : 1;
2576     }
2577
2578   /* Anything else is compared operand by operand, starting with the last
2579      operand; the first operand pair that differs decides.  */
2580   for (int op = TREE_OPERAND_LENGTH (t1) - 1; op >= 0; op--)
2581     {
2582       const int op_order
2583         = compare_tree (TREE_OPERAND (t1, op), TREE_OPERAND (t2, op));
2584       if (op_order != 0)
2585         return op_order;
2586     }
2587
2588   return 0;
2589 }
2590
2591
2592 /* qsort callback for sorting data references: order DRA_ and DRB_ so
2593    that candidates for the same interleaving group become neighbours.  */
2594
2595 static int
2596 dr_group_sort_cmp (const void *dra_, const void *drb_)
2597 {
2598   data_reference_p dra = *(const data_reference_p *) dra_;
2599   data_reference_p drb = *(const data_reference_p *) drb_;
2600   int order;
2601
2602   /* An element compares equal only to itself.  */
2603   if (dra == drb)
2604     return 0;
2605
2606   /* First key: the loop containing the statement, by loop number.  DRs
2607      in different loops never belong to the same group.  */
2608   loop_p loop_of_a = gimple_bb (DR_STMT (dra))->loop_father;
2609   loop_p loop_of_b = gimple_bb (DR_STMT (drb))->loop_father;
2610   if (loop_of_a != loop_of_b)
2611     return loop_of_a->num < loop_of_b->num ? -1 : 1;
2612
2613   /* Second key: the base address.  For this and the following tree keys
2614      compare_tree is only consulted when the operands are not known to
2615      be equal.  */
2616   tree base_a = DR_BASE_ADDRESS (dra);
2617   tree base_b = DR_BASE_ADDRESS (drb);
2618   order = (operand_equal_p (base_a, base_b, 0)
2619            ? 0 : compare_tree (base_a, base_b));
2620   if (order != 0)
2621     return order;
2622
2623   /* Third key: DR_OFFSET.  */
2624   order = (dr_equal_offsets_p (dra, drb)
2625            ? 0 : compare_tree (DR_OFFSET (dra), DR_OFFSET (drb)));
2626   if (order != 0)
2627     return order;
2628
2629   /* Fourth key: reads go before writes.  */
2630   if (DR_IS_READ (dra) != DR_IS_READ (drb))
2631     return DR_IS_READ (dra) ? -1 : 1;
2632
2633   /* Fifth key: the size of the accessed type.  */
2634   tree size_a = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
2635   tree size_b = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
2636   order = (operand_equal_p (size_a, size_b, 0)
2637            ? 0 : compare_tree (size_a, size_b));
2638   if (order != 0)
2639     return order;
2640
2641   /* Sixth key: the step.  */
2642   tree step_a = DR_STEP (dra);
2643   tree step_b = DR_STEP (drb);
2644   order = (operand_equal_p (step_a, step_b, 0)
2645            ? 0 : compare_tree (step_a, step_b));
2646   if (order != 0)
2647     return order;
2648
2649   /* Last key: DR_INIT.  DRs that agree on all of the above are told
2650      apart by the UID of their statement.  */
2651   order = tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb));
2652   if (order != 0)
2653     return order;
2654   return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2655 }
2656
2657 /* Function vect_analyze_data_ref_accesses.
2658
2659    Analyze the access pattern of all the data references in VINFO.
2660    A sorted copy of the datarefs is used to link interleaved accesses
2661    into groups; then every DR of a still vectorizable stmt is checked by
2662    vect_analyze_data_ref_access.  When a check fails and VINFO is a
2663    bb_vec_info, the stmt is marked as not vectorizable and analysis goes
2664    on; for any other VINFO false is returned.  Otherwise return true.  */
2665
2666 bool
2667 vect_analyze_data_ref_accesses (vec_info *vinfo)
2668 {
2669   unsigned int i;
2670   vec<data_reference_p> datarefs = vinfo->datarefs;
2671   struct data_reference *dr;
2672
2673   if (dump_enabled_p ())
2674     dump_printf_loc (MSG_NOTE, vect_location,
2675                      "=== vect_analyze_data_ref_accesses ===\n");
2676
2677   if (datarefs.is_empty ())
2678     return true;
2679
2680   /* Sort the array of datarefs to make building the interleaving chains
2681      linear.  Don't modify the original vector's order, it is needed for
2682      determining what dependencies are reversed.  */
2683   vec<data_reference_p> datarefs_copy = datarefs.copy ();
2684   datarefs_copy.qsort (dr_group_sort_cmp);
2685
2686   /* Build the interleaving chains: each DRA collects the DRs following it.  */
2687   for (i = 0; i < datarefs_copy.length () - 1;)
2688     {
2689       data_reference_p dra = datarefs_copy[i];
2690       stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2691       stmt_vec_info lastinfo = NULL;
2692       for (i = i + 1; i < datarefs_copy.length (); ++i)
2693         {
2694           data_reference_p drb = datarefs_copy[i];
2695           stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2696
2697           /* ???  Imperfect sorting (non-compatible types, non-modulo
2698              accesses, same accesses) can lead to a group to be artificially
2699              split here as we don't just skip over those.  If it really
2700              matters we can push those to a worklist and re-iterate
2701              over them.  Then we can just skip ahead to the next DR here.  */
2702
2703           /* DRs in a different loop should not be put into the same
2704              interleaving group.  */
2705           if (gimple_bb (DR_STMT (dra))->loop_father
2706               != gimple_bb (DR_STMT (drb))->loop_father)
2707             break;
2708
2709           /* Check that the data-refs have same first location (except init)
2710              and they are both either store or load (not load and store,
2711              not masked loads or stores).  */
2712           if (DR_IS_READ (dra) != DR_IS_READ (drb)
2713               || !operand_equal_p (DR_BASE_ADDRESS (dra),
2714                                    DR_BASE_ADDRESS (drb), 0)
2715               || !dr_equal_offsets_p (dra, drb)
2716               || !gimple_assign_single_p (DR_STMT (dra))
2717               || !gimple_assign_single_p (DR_STMT (drb)))
2718             break;
2719
2720           /* Check that the data-refs have the same constant size.  */
2721           tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
2722           tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
2723           if (!tree_fits_uhwi_p (sza)
2724               || !tree_fits_uhwi_p (szb)
2725               || !tree_int_cst_equal (sza, szb))
2726             break;
2727
2728           /* Check that the data-refs have the same step.  */
2729           if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2730             break;
2731
2732           /* Do not place the same access in the interleaving chain twice.  */
2733           if (tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb)) == 0)
2734             break;
2735
2736           /* Check the types are compatible.
2737              ???  We don't distinguish this during sorting.  */
2738           if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
2739                                    TREE_TYPE (DR_REF (drb))))
2740             break;
2741
2742           /* Sorting gave DR_INIT (dra) <= DR_INIT (drb); equality was excluded above.  */
2743           HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
2744           HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
2745           gcc_assert (init_a < init_b);
2746
2747           /* If init_b == init_a + the size of the type * k, we have an
2748              interleaving, and DRA is accessed before DRB.  */
2749           HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
2750           if (type_size_a == 0
2751               || (init_b - init_a) % type_size_a != 0)
2752             break;
2753
2754           /* For stores, DRB has to be adjacent to the previous DR in the
2755              sorted array.  This splits groups into chunks we support (we
2756              don't support vectorization of stores with gaps).  */
2757           if (!DR_IS_READ (dra)
2758               && (init_b - (HOST_WIDE_INT) TREE_INT_CST_LOW
2759                                              (DR_INIT (datarefs_copy[i-1]))
2760                   != type_size_a))
2761             break;
2762
2763           /* If the step is a known non-zero constant, the distance between
2764              the data-refs' inits has to be smaller than it; ending the
2765              group here otherwise splits groups into suitable sizes.  */
2766           if (tree_fits_shwi_p (DR_STEP (dra)))
2767             {
2768               HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra));
2769               if (step != 0 && step <= (init_b - init_a))
2770                 break;
2771             }
2772
2773           if (dump_enabled_p ())
2774             {
2775               dump_printf_loc (MSG_NOTE, vect_location,
2776                                "Detected interleaving ");
2777               if (DR_IS_READ (dra))
2778                 dump_printf (MSG_NOTE, "load ");
2779               else
2780                 dump_printf (MSG_NOTE, "store ");
2781               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2782               dump_printf (MSG_NOTE,  " and ");
2783               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2784               dump_printf (MSG_NOTE, "\n");
2785             }
2786
2787           /* Link the found element into the group list.  */
2788           if (!GROUP_FIRST_ELEMENT (stmtinfo_a))
2789             {
2790               GROUP_FIRST_ELEMENT (stmtinfo_a) = DR_STMT (dra);
2791               lastinfo = stmtinfo_a;
2792             }
2793           GROUP_FIRST_ELEMENT (stmtinfo_b) = DR_STMT (dra);
2794           GROUP_NEXT_ELEMENT (lastinfo) = DR_STMT (drb);
2795           lastinfo = stmtinfo_b;
2796         }
2797     }
2798   /* Now analyze the access of every DR whose stmt is still vectorizable.  */
2799   FOR_EACH_VEC_ELT (datarefs_copy, i, dr)
2800     if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
2801         && !vect_analyze_data_ref_access (dr))
2802       {
2803         if (dump_enabled_p ())
2804           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2805                            "not vectorized: complicated access pattern.\n");
2806
2807         if (is_a <bb_vec_info> (vinfo))
2808           {
2809             /* Mark the statement as not vectorizable.  */
2810             STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2811             continue;
2812           }
2813         else
2814           {
2815             datarefs_copy.release ();
2816             return false;
2817           }
2818       }
2819
2820   datarefs_copy.release ();
2821   return true;
2822 }
2823
2824
2825 /* Equality operator for dr_with_seg_len objects.
2826
2827    D1 and D2 compare equal when their data references have the same
2828    base address and compare_tree finds no difference between their
2829    offsets nor between their segment lengths.  It is used below to decide
2830    whether the alias checks of two pairs of data refs may be combined.  */
2831
2832 static bool
2833 operator == (const dr_with_seg_len& d1, const dr_with_seg_len& d2)
2834 {
2835   if (!operand_equal_p (DR_BASE_ADDRESS (d1.dr), DR_BASE_ADDRESS (d2.dr), 0))
2836     return false;
2837   if (compare_tree (d1.offset, d2.offset) != 0)
2838     return false;
2839   return compare_tree (d1.seg_len, d2.seg_len) == 0;
2840 }
2841
2842 /* Function comp_dr_with_seg_len_pair.
2843
2844    qsort callback ordering the dr_with_seg_len_pair_t objects P1_ and P2_
2845    so that all possible alias checks can be combined in a single scan.  */
2846
2847 static int
2848 comp_dr_with_seg_len_pair (const void *p1_, const void *p2_)
2849 {
2850   const dr_with_seg_len_pair_t *lhs = (const dr_with_seg_len_pair_t *) p1_;
2851   const dr_with_seg_len_pair_t *rhs = (const dr_with_seg_len_pair_t *) p2_;
2852
2853   const dr_with_seg_len &lhs_a = lhs->first;
2854   const dr_with_seg_len &lhs_b = lhs->second;
2855   const dr_with_seg_len &rhs_a = rhs->first;
2856   const dr_with_seg_len &rhs_b = rhs->second;
2857
2858   /* For DR pairs (a, b) and (c, d), merging the alias checks is only
2859      considered if a and c have the same base address and step, and b and
2860      d have the same base address and step.  So the keys are, in order of
2861      significance: both base addresses, both steps, then both offsets.
2862      Every key is consulted only if all previous ones compared equal.  */
2863   int order = compare_tree (DR_BASE_ADDRESS (lhs_a.dr),
2864                             DR_BASE_ADDRESS (rhs_a.dr));
2865   if (order == 0)
2866     order = compare_tree (DR_BASE_ADDRESS (lhs_b.dr),
2867                           DR_BASE_ADDRESS (rhs_b.dr));
2868
2869   if (order == 0)
2870     order = compare_tree (DR_STEP (lhs_a.dr), DR_STEP (rhs_a.dr));
2871   if (order == 0)
2872     order = compare_tree (DR_STEP (lhs_b.dr), DR_STEP (rhs_b.dr));
2873
2874   if (order == 0)
2875     order = compare_tree (lhs_a.offset, rhs_a.offset);
2876   if (order == 0)
2877     order = compare_tree (lhs_b.offset, rhs_b.offset);
2878
2879   return order;
2880 }
2881
2882 /* Function vect_vfa_segment_size.
2883
2884    Build an expression for the size of the segment that the data
2885    reference DR will access, as used by the runtime alias checks.
2886
2887    Input:
2888      DR: The data reference.
2889      LENGTH_FACTOR: segment length to consider, as a multiple of DR_STEP.
2890
2891    The result is the size of the accessed type when DR has a zero step
2892    and DR_STEP * LENGTH_FACTOR (in sizetype) otherwise, plus the size of
2893    one vector when DR uses the dr_explicit_realign_optimized scheme.  */
2894
2895 static tree
2896 vect_vfa_segment_size (struct data_reference *dr, tree length_factor)
2897 {
2898   /* With a zero step the segment is a single element of the accessed
2899      type; otherwise it is the step scaled by LENGTH_FACTOR.  */
2900   tree seg_len
2901     = (integer_zerop (DR_STEP (dr))
2902        ? TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)))
2903        : size_binop (MULT_EXPR,
2904                      fold_convert (sizetype, DR_STEP (dr)),
2905                      fold_convert (sizetype, length_factor)));
2906
2907   if (vect_supportable_dr_alignment (dr, false)
2908       != dr_explicit_realign_optimized)
2909     return seg_len;
2910
2911   /* Realignment loads may access one more vector, so account for it
2912      in the segment.  */
2913   tree vector_size
2914     = TYPE_SIZE_UNIT (STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr))));
2915   return size_binop (PLUS_EXPR, seg_len, vector_size);
2916 }
2917
2918 /* Function vect_prune_runtime_alias_test_list.
2919
2920    Prune a list of ddrs to be tested at run-time by versioning for alias.
2921    Merge several alias checks into one if possible.
2922    Return FALSE if resulting list of ddrs is longer than allowed by
2923    PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE.  */
2924
2925 bool
2926 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
2927 {
2928   vec<ddr_p> may_alias_ddrs =
2929     LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
2930   vec<dr_with_seg_len_pair_t>& comp_alias_ddrs =
2931     LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
2932   int vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2933   tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
2934
2935   ddr_p ddr;
2936   unsigned int i;
2937   tree length_factor;
2938
2939   if (dump_enabled_p ())
2940     dump_printf_loc (MSG_NOTE, vect_location,
2941                      "=== vect_prune_runtime_alias_test_list ===\n");
2942
2943   if (may_alias_ddrs.is_empty ())
2944     return true;
2945
2946   /* Basically, for each pair of dependent data refs store_ptr_0
2947      and load_ptr_0, we create an expression:
2948
2949      ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
2950      || (load_ptr_0 + load_segment_length_0) <= store_ptr_0))
2951
2952      for aliasing checks.  However, in some cases we can decrease
2953      the number of checks by combining two checks into one.  For
2954      example, suppose we have another pair of data refs store_ptr_0
2955      and load_ptr_1, and if the following condition is satisfied:
2956
2957      load_ptr_0 < load_ptr_1  &&
2958      load_ptr_1 - load_ptr_0 - load_segment_length_0 < store_segment_length_0
2959
2960      (this condition means, in each iteration of vectorized loop,
2961      the accessed memory of store_ptr_0 cannot be between the memory
2962      of load_ptr_0 and load_ptr_1.)
2963
2964      we then can use only the following expression to finish the
2965      aliasing checks between store_ptr_0 & load_ptr_0 and
2966      store_ptr_0 & load_ptr_1:
2967
2968      ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
2969      || (load_ptr_1 + load_segment_length_1 <= store_ptr_0))
2970
2971      Note that we only consider that load_ptr_0 and load_ptr_1 have the
2972      same basic address.  */
2973
2974   comp_alias_ddrs.create (may_alias_ddrs.length ());
2975
2976   /* First, we collect all data ref pairs for aliasing checks.  */
2977   FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
2978     {
2979       struct data_reference *dr_a, *dr_b;
2980       gimple *dr_group_first_a, *dr_group_first_b;
2981       tree segment_length_a, segment_length_b;
2982       gimple *stmt_a, *stmt_b;
2983       /* A DR in a group is represented by the DR of the group's first stmt.  */
2984       dr_a = DDR_A (ddr);
2985       stmt_a = DR_STMT (DDR_A (ddr));
2986       dr_group_first_a = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_a));
2987       if (dr_group_first_a)
2988         {
2989           stmt_a = dr_group_first_a;
2990           dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a));
2991         }
2992
2993       dr_b = DDR_B (ddr);
2994       stmt_b = DR_STMT (DDR_B (ddr));
2995       dr_group_first_b = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_b));
2996       if (dr_group_first_b)
2997         {
2998           stmt_b = dr_group_first_b;
2999           dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b));
3000         }
3001       /* Steps differ: use the scalar iteration count; else VECT_FACTOR.  */
3002       if (!operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0))
3003         length_factor = scalar_loop_iters;
3004       else
3005         length_factor = size_int (vect_factor);
3006       segment_length_a = vect_vfa_segment_size (dr_a, length_factor);
3007       segment_length_b = vect_vfa_segment_size (dr_b, length_factor);
3008
3009       dr_with_seg_len_pair_t dr_with_seg_len_pair
3010           (dr_with_seg_len (dr_a, segment_length_a),
3011            dr_with_seg_len (dr_b, segment_length_b));
3012       /* Canonicalize: the DR whose base address sorts first goes first.  */
3013       if (compare_tree (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b)) > 0)
3014         std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
3015
3016       comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
3017     }
3018
3019   /* Second, we sort the collected data ref pairs so that we can scan
3020      them once to combine all possible aliasing checks.  */
3021   comp_alias_ddrs.qsort (comp_dr_with_seg_len_pair);
3022
3023   /* Third, we scan the sorted dr pairs and check if we can combine
3024      alias checks of two neighbouring dr pairs.  */
3025   for (size_t i = 1; i < comp_alias_ddrs.length (); ++i)
3026     {
3027       /* Deal with two ddrs (dr_a1, dr_b1) and (dr_a2, dr_b2).  */
3028       dr_with_seg_len *dr_a1 = &comp_alias_ddrs[i-1].first,
3029                       *dr_b1 = &comp_alias_ddrs[i-1].second,
3030                       *dr_a2 = &comp_alias_ddrs[i].first,
3031                       *dr_b2 = &comp_alias_ddrs[i].second;
3032
3033       /* Remove duplicate data ref pairs.  */
3034       if (*dr_a1 == *dr_a2 && *dr_b1 == *dr_b2)
3035         {
3036           if (dump_enabled_p ())
3037             {
3038               dump_printf_loc (MSG_NOTE, vect_location,
3039                                "found equal ranges ");
3040               dump_generic_expr (MSG_NOTE, TDF_SLIM,
3041                                  DR_REF (dr_a1->dr));
3042               dump_printf (MSG_NOTE,  ", ");
3043               dump_generic_expr (MSG_NOTE, TDF_SLIM,
3044                                  DR_REF (dr_b1->dr));
3045               dump_printf (MSG_NOTE,  " and ");
3046               dump_generic_expr (MSG_NOTE, TDF_SLIM,
3047                                  DR_REF (dr_a2->dr));
3048               dump_printf (MSG_NOTE,  ", ");
3049               dump_generic_expr (MSG_NOTE, TDF_SLIM,
3050                                  DR_REF (dr_b2->dr));
3051               dump_printf (MSG_NOTE, "\n");
3052             }
3053
3054           comp_alias_ddrs.ordered_remove (i--);
3055           continue;
3056         }
3057
3058       if (*dr_a1 == *dr_a2 || *dr_b1 == *dr_b2)
3059         {
3060           /* We consider the case that DR_B1 and DR_B2 are same memrefs,
3061              and DR_A1 and DR_A2 are two consecutive memrefs.  */
3062           if (*dr_a1 == *dr_a2)
3063             {
3064               std::swap (dr_a1, dr_b1);
3065               std::swap (dr_a2, dr_b2);
3066             }
3067           /* Merging needs a common base address and constant offsets.  */
3068           if (!operand_equal_p (DR_BASE_ADDRESS (dr_a1->dr),
3069                                 DR_BASE_ADDRESS (dr_a2->dr),
3070                                 0)
3071               || !tree_fits_shwi_p (dr_a1->offset)
3072               || !tree_fits_shwi_p (dr_a2->offset))
3073             continue;
3074
3075           HOST_WIDE_INT diff = (tree_to_shwi (dr_a2->offset)
3076                                 - tree_to_shwi (dr_a1->offset));
3077
3078
3079           /* Now we check if the following condition is satisfied:
3080
3081              DIFF - SEGMENT_LENGTH_A < SEGMENT_LENGTH_B
3082
3083              where DIFF = DR_A2->OFFSET - DR_A1->OFFSET.  However,
3084              SEGMENT_LENGTH_A or SEGMENT_LENGTH_B may not be constant so we
3085              have to make a best estimation.  We can get the minimum value
3086              of SEGMENT_LENGTH_B as a constant, represented by MIN_SEG_LEN_B,
3087              then either of the following two conditions can guarantee the
3088              one above:
3089
3090              1: DIFF <= MIN_SEG_LEN_B
3091              2: DIFF - SEGMENT_LENGTH_A < MIN_SEG_LEN_B
3092
3093              */
3094
3095           HOST_WIDE_INT  min_seg_len_b = (tree_fits_shwi_p (dr_b1->seg_len)
3096                                           ? tree_to_shwi (dr_b1->seg_len)
3097                                           : vect_factor);
3098
3099           if (diff <= min_seg_len_b
3100               || (tree_fits_shwi_p (dr_a1->seg_len)
3101                   && diff - tree_to_shwi (dr_a1->seg_len) < min_seg_len_b))
3102             {
3103               if (dump_enabled_p ())
3104                 {
3105                   dump_printf_loc (MSG_NOTE, vect_location,
3106                                    "merging ranges for ");
3107                   dump_generic_expr (MSG_NOTE, TDF_SLIM,
3108                                      DR_REF (dr_a1->dr));
3109                   dump_printf (MSG_NOTE,  ", ");
3110                   dump_generic_expr (MSG_NOTE, TDF_SLIM,
3111                                      DR_REF (dr_b1->dr));
3112                   dump_printf (MSG_NOTE,  " and ");
3113                   dump_generic_expr (MSG_NOTE, TDF_SLIM,
3114                                      DR_REF (dr_a2->dr));
3115                   dump_printf (MSG_NOTE,  ", ");
3116                   dump_generic_expr (MSG_NOTE, TDF_SLIM,
3117                                      DR_REF (dr_b2->dr));
3118                   dump_printf (MSG_NOTE, "\n");
3119                 }
3120               /* DR_A1's segment becomes DIFF plus DR_A2's; pair I is removed.  */
3121               dr_a1->seg_len = size_binop (PLUS_EXPR,
3122                                            dr_a2->seg_len, size_int (diff));
3123               comp_alias_ddrs.ordered_remove (i--);
3124             }
3125         }
3126     }
3127
3128   dump_printf_loc (MSG_NOTE, vect_location,
3129                    "improved number of alias checks from %d to %d\n",
3130                    may_alias_ddrs.length (), comp_alias_ddrs.length ());
3131   if ((int) comp_alias_ddrs.length () >
3132       PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
3133     return false;
3134
3135   return true;
3136 }
3137
3138 /* Check whether a non-affine read or write in stmt is suitable for gather load
3139    or scatter store and if so, return a builtin decl for that operation.

        STMT is the statement whose data reference (STMT_VINFO_DATA_REF) is
        examined and LOOP_VINFO describes the loop being vectorized.  The
        address is decomposed into a part that is invariant in the loop, a
        single SSA_NAME that is not invariant in the loop, and a constant
        multiplier that was peeled off on the way to that SSA_NAME.

        On success the decl obtained from targetm.vectorize.builtin_gather
        (for reads) or targetm.vectorize.builtin_scatter (for writes) is
        returned and, for each of BASEP, OFFP and SCALEP that is non-NULL,
        *BASEP receives the invariant part, *OFFP the SSA_NAME and *SCALEP
        the multiplier.  On failure NULL_TREE is returned and the output
        parameters are left untouched.  */
3140
3141 tree
3142 vect_check_gather_scatter (gimple *stmt, loop_vec_info loop_vinfo, tree *basep,
3143                            tree *offp, int *scalep)
3144 {
3145   HOST_WIDE_INT scale = 1, pbitpos, pbitsize;
3146   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3147   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3148   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3149   tree offtype = NULL_TREE;
3150   tree decl, base, off;
3151   machine_mode pmode;
3152   int punsignedp, reversep, pvolatilep = 0;
3153
3154   base = DR_REF (dr);
3155   /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
3156      see if we can use the def stmt of the address.  */
3157   if (is_gimple_call (stmt)
3158       && gimple_call_internal_p (stmt)
3159       && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
3160           || gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
3161       && TREE_CODE (base) == MEM_REF
3162       && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
3163       && integer_zerop (TREE_OPERAND (base, 1))
3164       && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
3165     {
3166       gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
3167       if (is_gimple_assign (def_stmt)
3168           && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
3169         base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
3170     }
3171
3172   /* The gather and scatter builtins need address of the form
3173      loop_invariant + vector * {1, 2, 4, 8}
3174      or
3175      loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3176      Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3177      of loop invariants/SSA_NAMEs defined in the loop, with casts,
3178      multiplications and additions in it.  To get a vector, we need
3179      a single SSA_NAME that will be defined in the loop and will
3180      contain everything that is not loop invariant and that can be
3181      vectorized.  The following code attempts to find such a preexisting
3182      SSA_NAME OFF and put the loop invariants into a tree BASE
3183      that can be gimplified before the loop.  */
3184   base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
3185                               &punsignedp, &reversep, &pvolatilep, false);
3186   gcc_assert (base && (pbitpos % BITS_PER_UNIT) == 0 && !reversep);
3187
       /* For a MEM_REF fold its constant offset operand into OFF and continue
          with its pointer operand as BASE; otherwise take the address of
          BASE.  */
3188   if (TREE_CODE (base) == MEM_REF)
3189     {
3190       if (!integer_zerop (TREE_OPERAND (base, 1)))
3191         {
3192           if (off == NULL_TREE)
3193             {
3194               offset_int moff = mem_ref_offset (base);
3195               off = wide_int_to_tree (sizetype, moff);
3196             }
3197           else
3198             off = size_binop (PLUS_EXPR, off,
3199                               fold_convert (sizetype, TREE_OPERAND (base, 1)));
3200         }
3201       base = TREE_OPERAND (base, 0);
3202     }
3203   else
3204     base = build_fold_addr_expr (base);
3205
3206   if (off == NULL_TREE)
3207     off = size_zero_node;
3208
3209   /* If base is not loop invariant, either off is 0, then we start with just
3210      the constant offset in the loop invariant BASE and continue with base
3211      as OFF, otherwise give up.
3212      We could handle that case by gimplifying the addition of base + off
3213      into some SSA_NAME and use that as off, but for now punt.  */
3214   if (!expr_invariant_in_loop_p (loop, base))
3215     {
3216       if (!integer_zerop (off))
3217         return NULL_TREE;
3218       off = base;
3219       base = size_int (pbitpos / BITS_PER_UNIT);
3220     }
3221   /* Otherwise put base + constant offset into the loop invariant BASE
3222      and continue with OFF.  */
3223   else
3224     {
3225       base = fold_convert (sizetype, base);
3226       base = size_binop (PLUS_EXPR, base, size_int (pbitpos / BITS_PER_UNIT));
3227     }
3228
3229   /* OFF at this point may be either a SSA_NAME or some tree expression
3230      from get_inner_reference.  Try to peel off loop invariants from it
3231      into BASE as long as possible.  Each iteration looks at the operation
          that produces OFF; the loop ends once a widening conversion has
          fixed OFFTYPE or when nothing more can be peeled off.  */
3232   STRIP_NOPS (off);
3233   while (offtype == NULL_TREE)
3234     {
3235       enum tree_code code;
3236       tree op0, op1, add = NULL_TREE;
3237
3238       if (TREE_CODE (off) == SSA_NAME)
3239         {
3240           gimple *def_stmt = SSA_NAME_DEF_STMT (off);
3241
               /* OFF itself is loop invariant, so nothing is left that varies
                  in the loop: give up.  */
3242           if (expr_invariant_in_loop_p (loop, off))
3243             return NULL_TREE;
3244
               /* A definition that is not an assignment cannot be looked
                  through; stop peeling and keep this SSA_NAME as OFF.  */
3245           if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
3246             break;
3247
3248           op0 = gimple_assign_rhs1 (def_stmt);
3249           code = gimple_assign_rhs_code (def_stmt);
3250           op1 = gimple_assign_rhs2 (def_stmt);
3251         }
3252       else
3253         {
3254           if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
3255             return NULL_TREE;
3256           code = TREE_CODE (off);
3257           extract_ops_from_tree (off, &code, &op0, &op1);
3258         }
3259       switch (code)
3260         {
3261         case POINTER_PLUS_EXPR:
3262         case PLUS_EXPR:
3263           if (expr_invariant_in_loop_p (loop, op0))
3264             {
3265               add = op0;
3266               off = op1;
                   /* Shared tail, also entered by goto from the cases below:
                      ADD sits underneath any multiplication already peeled
                      off, so scale it by SCALE before accumulating it into
                      BASE.  */
3267             do_add:
3268               add = fold_convert (sizetype, add);
3269               if (scale != 1)
3270                 add = size_binop (MULT_EXPR, add, size_int (scale));
3271               base = size_binop (PLUS_EXPR, base, add);
3272               continue;
3273             }
3274           if (expr_invariant_in_loop_p (loop, op1))
3275             {
3276               add = op1;
3277               off = op0;
3278               goto do_add;
3279             }
3280           break;
3281         case MINUS_EXPR:
               /* op0 - invariant: accumulate the negated invariant.  */
3282           if (expr_invariant_in_loop_p (loop, op1))
3283             {
3284               add = fold_convert (sizetype, op1);
3285               add = size_binop (MINUS_EXPR, size_zero_node, add);
3286               off = op0;
3287               goto do_add;
3288             }
3289           break;
3290         case MULT_EXPR:
               /* Only the first multiplication by a constant that fits a
                  signed HOST_WIDE_INT is peeled off; it becomes SCALE.  */
3291           if (scale == 1 && tree_fits_shwi_p (op1))
3292             {
3293               scale = tree_to_shwi (op1);
3294               off = op0;
3295               continue;
3296             }
3297           break;
3298         case SSA_NAME:
               /* The operation producing OFF is just another SSA_NAME: look
                  through to it.  */
3299           off = op0;
3300           continue;
3301         CASE_CONVERT:
3302           if (!POINTER_TYPE_P (TREE_TYPE (op0))
3303               && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3304             break;
               /* A conversion between types of equal precision is simply
                  looked through.  */
3305           if (TYPE_PRECISION (TREE_TYPE (op0))
3306               == TYPE_PRECISION (TREE_TYPE (off)))
3307             {
3308               off = op0;
3309               continue;
3310             }
               /* A widening conversion: continue with the narrower operand
                  and record its type as OFFTYPE, which also terminates the
                  loop.  */
3311           if (TYPE_PRECISION (TREE_TYPE (op0))
3312               < TYPE_PRECISION (TREE_TYPE (off)))
3313             {
3314               off = op0;
3315               offtype = TREE_TYPE (off);
3316               STRIP_NOPS (off);
3317               continue;
3318             }
3319           break;
3320         default:
3321           break;
3322         }
           /* Reached only when the switch could not peel anything off.  */
3323       break;
3324     }
3325
3326   /* If at the end OFF still isn't a SSA_NAME or isn't
3327      defined in the loop, punt.  */
3328   if (TREE_CODE (off) != SSA_NAME
3329       || expr_invariant_in_loop_p (loop, off))
3330     return NULL_TREE;
3331
3332   if (offtype == NULL_TREE)
3333     offtype = TREE_TYPE (off);
3334
       /* Ask the target for a builtin matching the vector type of the access,
          the type of the offset and the scale.  */
3335   if (DR_IS_READ (dr))
3336     decl = targetm.vectorize.builtin_gather (STMT_VINFO_VECTYPE (stmt_info),
3337                                              offtype, scale);
3338   else
3339     decl = targetm.vectorize.builtin_scatter (STMT_VINFO_VECTYPE (stmt_info),
3340                                               offtype, scale);
3341
3342   if (decl == NULL_TREE)
3343     return NULL_TREE;
3344
       /* NOTE(review): SCALE is a HOST_WIDE_INT and is narrowed to int when
          stored through SCALEP below; presumably the target hook only accepts
          small scales -- confirm.  */
3345   if (basep)
3346     *basep = base;
3347   if (offp)
3348     *offp = off;
3349   if (scalep)
3350     *scalep = scale;
3351   return decl;
3352 }
3353
3354 /* Function vect_analyze_data_refs.
3355
3356   Find all the data references in the loop or basic block.
3357
3358    The general structure of the analysis of data refs in the vectorizer is as
3359    follows:
3360    1- vect_analyze_data_refs(loop/bb): call
3361       compute_data_dependences_for_loop/bb to find and analyze all data-refs
3362       in the loop/bb and their dependences.
3363    2- vect_analyze_dependences(): apply dependence testing using ddrs.
3364    3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
3365    4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
3366

        This function walks VINFO->datarefs, validates each data reference,
        records it in the stmt_vec_info of its statement together with the
        vector type chosen for it, and raises *MIN_VF to the largest number
        of vector subparts among the chosen vector types.

        Returns false when a data reference is missing, or when one cannot be
        handled while VINFO describes a loop.  When VINFO describes a basic
        block, most failures instead stop the walk: the offending data
        reference and all following ones are marked not vectorizable, freed
        and removed from the vector, and true is returned.  A missing vector
        type in a basic block only marks the statement not vectorizable and
        the walk continues.

3367 */
3368
3369 bool
3370 vect_analyze_data_refs (vec_info *vinfo, int *min_vf)
3371 {
3372   struct loop *loop = NULL;
3373   unsigned int i;
3374   struct data_reference *dr;
3375   tree scalar_type;
3376
3377   if (dump_enabled_p ())
3378     dump_printf_loc (MSG_NOTE, vect_location,
3379                      "=== vect_analyze_data_refs ===\n");
3380
       /* LOOP stays NULL when VINFO is not a loop_vec_info.  */
3381   if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
3382     loop = LOOP_VINFO_LOOP (loop_vinfo);
3383
3384   /* Go through the data-refs, check that the analysis succeeded.  Update
3385      pointer from stmt_vec_info struct to DR and vectype.  */
3386
3387   vec<data_reference_p> datarefs = vinfo->datarefs;
3388   FOR_EACH_VEC_ELT (datarefs, i, dr)
3389     {
3390       gimple *stmt;
3391       stmt_vec_info stmt_info;
3392       tree base, offset, init;
3393       enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
3394       bool simd_lane_access = false;
3395       int vf;
3396
           /* Re-entered by goto after a clobber has been removed from slot I,
              so that the element that moved into that slot is examined.  */
3397 again:
3398       if (!dr || !DR_REF (dr))
3399         {
3400           if (dump_enabled_p ())
3401             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3402                              "not vectorized: unhandled data-ref\n");
3403           return false;
3404         }
3405
3406       stmt = DR_STMT (dr);
3407       stmt_info = vinfo_for_stmt (stmt);
3408
3409       /* Discard clobbers from the dataref vector.  We will remove
3410          clobber stmts during vectorization.  */
3411       if (gimple_clobber_p (stmt))
3412         {
3413           free_data_ref (dr);
               /* The clobber is the last element: drop it and finish.  I then
                  equals the new length, so the truncation code after the loop
                  is not triggered.  */
3414           if (i == datarefs.length () - 1)
3415             {
3416               datarefs.pop ();
3417               break;
3418             }
3419           datarefs.ordered_remove (i);
3420           dr = datarefs[i];
3421           goto again;
3422         }
3423
3424       /* Check that analysis of the data-ref succeeded.  */
3425       if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
3426           || !DR_STEP (dr))
3427         {
3428           bool maybe_gather
3429             = DR_IS_READ (dr)
3430               && !TREE_THIS_VOLATILE (DR_REF (dr))
3431               && targetm.vectorize.builtin_gather != NULL;
3432           bool maybe_scatter
3433             = DR_IS_WRITE (dr)
3434               && !TREE_THIS_VOLATILE (DR_REF (dr))
3435               && targetm.vectorize.builtin_scatter != NULL;
3436           bool maybe_simd_lane_access
3437             = is_a <loop_vec_info> (vinfo) && loop->simduid;
3438
3439           /* If target supports vector gather loads or scatter stores, or if
3440              this might be a SIMD lane access, see if they can't be used.  */
3441           if (is_a <loop_vec_info> (vinfo)
3442               && (maybe_gather || maybe_scatter || maybe_simd_lane_access)
3443               && !nested_in_vect_loop_p (loop, stmt))
3444             {
                   /* Build a second data reference for the same access, this
                      time relative to the loop containing STMT.  NEWDR is
                      owned here until it is either adopted as DR or freed
                      below.  */
3445               struct data_reference *newdr
3446                 = create_data_ref (NULL, loop_containing_stmt (stmt),
3447                                    DR_REF (dr), stmt, maybe_scatter ? false : true);
3448               gcc_assert (newdr != NULL && DR_REF (newdr));
                   /* Only a fully analyzed NEWDR with a zero step is usable
                      for either special form.  */
3449               if (DR_BASE_ADDRESS (newdr)
3450                   && DR_OFFSET (newdr)
3451                   && DR_INIT (newdr)
3452                   && DR_STEP (newdr)
3453                   && integer_zerop (DR_STEP (newdr)))
3454                 {
3455                   if (maybe_simd_lane_access)
3456                     {
                           /* Recognize an offset of the form LANE * STEP,
                              where LANE is (a possibly widened copy of) the
                              result of an IFN_GOMP_SIMD_LANE call on this
                              loop's simduid and STEP equals the size of the
                              accessed element.  */
3457                       tree off = DR_OFFSET (newdr);
3458                       STRIP_NOPS (off);
3459                       if (TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
3460                           && TREE_CODE (off) == MULT_EXPR
3461                           && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
3462                         {
3463                           tree step = TREE_OPERAND (off, 1);
3464                           off = TREE_OPERAND (off, 0);
3465                           STRIP_NOPS (off);
3466                           if (CONVERT_EXPR_P (off)
3467                               && TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off,
3468                                                                           0)))
3469                                  < TYPE_PRECISION (TREE_TYPE (off)))
3470                             off = TREE_OPERAND (off, 0);
3471                           if (TREE_CODE (off) == SSA_NAME)
3472                             {
3473                               gimple *def = SSA_NAME_DEF_STMT (off);
3474                               tree reft = TREE_TYPE (DR_REF (newdr));
3475                               if (is_gimple_call (def)
3476                                   && gimple_call_internal_p (def)
3477                                   && (gimple_call_internal_fn (def)
3478                                       == IFN_GOMP_SIMD_LANE))
3479                                 {
3480                                   tree arg = gimple_call_arg (def, 0);
3481                                   gcc_assert (TREE_CODE (arg) == SSA_NAME);
3482                                   arg = SSA_NAME_VAR (arg);
3483                                   if (arg == loop->simduid
3484                                       /* For now.  */
3485                                       && tree_int_cst_equal
3486                                            (TYPE_SIZE_UNIT (reft),
3487                                             step))
3488                                     {
                                           /* Rewrite NEWDR as an access with
                                              zero offset and step STEP, and
                                              adopt it as DR.  */
3489                                       DR_OFFSET (newdr) = ssize_int (0);
3490                                       DR_STEP (newdr) = step;
3491                                       DR_ALIGNED_TO (newdr)
3492                                         = size_int (BIGGEST_ALIGNMENT);
3493                                       dr = newdr;
3494                                       simd_lane_access = true;
3495                                     }
3496                                 }
3497                             }
3498                         }
3499                     }
                       /* Not a SIMD lane access: treat it as a gather/scatter
                          candidate; it is verified further down once the
                          vector type is known.  */
3500                   if (!simd_lane_access && (maybe_gather || maybe_scatter))
3501                     {
3502                       dr = newdr;
3503                       if (maybe_gather)
3504                         gatherscatter = GATHER;
3505                       else
3506                         gatherscatter = SCATTER;
3507                     }
3508                 }
                   /* NEWDR was not adopted as DR: release it.  */
3509               if (gatherscatter == SG_NONE && !simd_lane_access)
3510                 free_data_ref (newdr);
3511             }
3512
3513           if (gatherscatter == SG_NONE && !simd_lane_access)
3514             {
3515               if (dump_enabled_p ())
3516                 {
3517                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3518                                    "not vectorized: data ref analysis "
3519                                    "failed ");
3520                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3521                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3522                 }
3523
                   /* For a basic block, leave the loop with I at the failing
                      data reference; the code after the loop discards it and
                      everything after it.  The same convention is used by the
                      other bb_vec_info breaks below.  */
3524               if (is_a <bb_vec_info> (vinfo))
3525                 break;
3526
3527               return false;
3528             }
3529         }
3530
           /* From here on DR may be the replacement created above rather than
              datarefs[I]; in that case this function still owns it, which is
              why several failure paths below free it before returning.  */
3531       if (TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
3532         {
3533           if (dump_enabled_p ())
3534             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3535                              "not vectorized: base addr of dr is a "
3536                              "constant\n");
3537
3538           if (is_a <bb_vec_info> (vinfo))
3539             break;
3540
3541           if (gatherscatter != SG_NONE || simd_lane_access)
3542             free_data_ref (dr);
3543           return false;
3544         }
3545
           /* NOTE(review): unlike the neighbouring failure paths this one does
              not free a replacement DR; gather/scatter candidates were
              already required to be non-volatile above, but a SIMD lane
              access was not -- confirm whether that can leak.  */
3546       if (TREE_THIS_VOLATILE (DR_REF (dr)))
3547         {
3548           if (dump_enabled_p ())
3549             {
3550               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3551                                "not vectorized: volatile type ");
3552               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3553               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3554             }
3555
3556           if (is_a <bb_vec_info> (vinfo))
3557             break;
3558
3559           return false;
3560         }
3561
3562       if (stmt_can_throw_internal (stmt))
3563         {
3564           if (dump_enabled_p ())
3565             {
3566               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3567                                "not vectorized: statement can throw an "
3568                                "exception ");
3569               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3570               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3571             }
3572
3573           if (is_a <bb_vec_info> (vinfo))
3574             break;
3575
3576           if (gatherscatter != SG_NONE || simd_lane_access)
3577             free_data_ref (dr);
3578           return false;
3579         }
3580
3581       if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
3582           && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
3583         {
3584           if (dump_enabled_p ())
3585             {
3586               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3587                                "not vectorized: statement is bitfield "
3588                                "access ");
3589               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3590               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3591             }
3592
3593           if (is_a <bb_vec_info> (vinfo))
3594             break;
3595
3596           if (gatherscatter != SG_NONE || simd_lane_access)
3597             free_data_ref (dr);
3598           return false;
3599         }
3600
3601       base = unshare_expr (DR_BASE_ADDRESS (dr));
3602       offset = unshare_expr (DR_OFFSET (dr));
3603       init = unshare_expr (DR_INIT (dr));
3604
           /* The only calls accepted as carrying a data reference are the
              internal functions IFN_MASK_LOAD and IFN_MASK_STORE.  */
3605       if (is_gimple_call (stmt)
3606           && (!gimple_call_internal_p (stmt)
3607               || (gimple_call_internal_fn (stmt) != IFN_MASK_LOAD
3608                   && gimple_call_internal_fn (stmt) != IFN_MASK_STORE)))
3609         {
3610           if (dump_enabled_p ())
3611             {
3612               dump_printf_loc (MSG_MISSED_OPTIMIZATION,  vect_location,
3613                                "not vectorized: dr in a call ");
3614               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3615               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3616             }
3617
3618           if (is_a <bb_vec_info> (vinfo))
3619             break;
3620
3621           if (gatherscatter != SG_NONE || simd_lane_access)
3622             free_data_ref (dr);
3623           return false;
3624         }
3625
3626       /* Update DR field in stmt_vec_info struct.  */
3627
3628       /* If the dataref is in an inner-loop of the loop that is considered
3629          for vectorization, we also want to analyze the access relative to
3630          the outer-loop (DR contains information only relative to the
3631          inner-most enclosing loop).  We do that by building a reference to the
3632          first location accessed by the inner-loop, and analyze it relative to
3633          the outer-loop.  */
3634       if (loop && nested_in_vect_loop_p (loop, stmt))
3635         {
3636           tree outer_step, outer_base, outer_init;
3637           HOST_WIDE_INT pbitsize, pbitpos;
3638           tree poffset;
3639           machine_mode pmode;
3640           int punsignedp, preversep, pvolatilep;
3641           affine_iv base_iv, offset_iv;
3642           tree dinit;
3643
3644           /* Build a reference to the first location accessed by the
3645              inner-loop: *(BASE+INIT).  (The first location is actually
3646              BASE+INIT+OFFSET, but we add OFFSET separately later).  */
3647           tree inner_base = build_fold_indirect_ref
3648                                 (fold_build_pointer_plus (base, init));
3649
3650           if (dump_enabled_p ())
3651             {
3652               dump_printf_loc (MSG_NOTE, vect_location,
3653                                "analyze in outer-loop: ");
3654               dump_generic_expr (MSG_NOTE, TDF_SLIM, inner_base);
3655               dump_printf (MSG_NOTE, "\n");
3656             }
3657
3658           outer_base = get_inner_reference (inner_base, &pbitsize, &pbitpos,
3659                                             &poffset, &pmode, &punsignedp,
3660                                             &preversep, &pvolatilep, false);
3661           gcc_assert (outer_base != NULL_TREE);
3662
3663           if (pbitpos % BITS_PER_UNIT != 0)
3664             {
3665               if (dump_enabled_p ())
3666                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3667                                  "failed: bit offset alignment.\n");
3668               return false;
3669             }
3670
3671           if (preversep)
3672             {
3673               if (dump_enabled_p ())
3674                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3675                                  "failed: reverse storage order.\n");
3676               return false;
3677             }
3678
               /* The address of the base object must evolve as a simple
                  induction variable with respect to LOOP.  */
3679           outer_base = build_fold_addr_expr (outer_base);
3680           if (!simple_iv (loop, loop_containing_stmt (stmt), outer_base,
3681                           &base_iv, false))
3682             {
3683               if (dump_enabled_p ())
3684                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3685                                  "failed: evolution of base is not affine.\n");
3686               return false;
3687             }
3688
               /* Combine the offset of DR with the offset returned by
                  get_inner_reference.  */
3689           if (offset)
3690             {
3691               if (poffset)
3692                 poffset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset), offset,
3693                                        poffset);
3694               else
3695                 poffset = offset;
3696             }
3697
3698           if (!poffset)
3699             {
3700               offset_iv.base = ssize_int (0);
3701               offset_iv.step = ssize_int (0);
3702             }
3703           else if (!simple_iv (loop, loop_containing_stmt (stmt), poffset,
3704                                &offset_iv, false))
3705             {
3706               if (dump_enabled_p ())
3707                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3708                                  "evolution of offset is not affine.\n");
3709               return false;
3710             }
3711
               /* Collect the byte position and the constant parts split off
                  both induction-variable bases into OUTER_INIT; the steps of
                  both induction variables add up to OUTER_STEP.  */
3712           outer_init = ssize_int (pbitpos / BITS_PER_UNIT);
3713           split_constant_offset (base_iv.base, &base_iv.base, &dinit);
3714           outer_init =  size_binop (PLUS_EXPR, outer_init, dinit);
3715           split_constant_offset (offset_iv.base, &offset_iv.base, &dinit);
3716           outer_init =  size_binop (PLUS_EXPR, outer_init, dinit);
3717
3718           outer_step = size_binop (PLUS_EXPR,
3719                                 fold_convert (ssizetype, base_iv.step),
3720                                 fold_convert (ssizetype, offset_iv.step));
3721
3722           STMT_VINFO_DR_STEP (stmt_info) = outer_step;
3723           /* FIXME: Use canonicalize_base_object_address (base_iv.base); */
3724           STMT_VINFO_DR_BASE_ADDRESS (stmt_info) = base_iv.base;
3725           STMT_VINFO_DR_INIT (stmt_info) = outer_init;
3726           STMT_VINFO_DR_OFFSET (stmt_info) =
3727                                 fold_convert (ssizetype, offset_iv.base);
3728           STMT_VINFO_DR_ALIGNED_TO (stmt_info) =
3729                                 size_int (highest_pow2_factor (offset_iv.base));
3730
3731           if (dump_enabled_p ())
3732             {
3733               dump_printf_loc (MSG_NOTE, vect_location,
3734                                "\touter base_address: ");
3735               dump_generic_expr (MSG_NOTE, TDF_SLIM,
3736                                  STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
3737               dump_printf (MSG_NOTE, "\n\touter offset from base address: ");
3738               dump_generic_expr (MSG_NOTE, TDF_SLIM,
3739                                  STMT_VINFO_DR_OFFSET (stmt_info));
3740               dump_printf (MSG_NOTE,
3741                            "\n\touter constant offset from base address: ");
3742               dump_generic_expr (MSG_NOTE, TDF_SLIM,
3743                                  STMT_VINFO_DR_INIT (stmt_info));
3744               dump_printf (MSG_NOTE, "\n\touter step: ");
3745               dump_generic_expr (MSG_NOTE, TDF_SLIM,
3746                                  STMT_VINFO_DR_STEP (stmt_info));
3747               dump_printf (MSG_NOTE, "\n\touter aligned to: ");
3748               dump_generic_expr (MSG_NOTE, TDF_SLIM,
3749                                  STMT_VINFO_DR_ALIGNED_TO (stmt_info));
3750               dump_printf (MSG_NOTE, "\n");
3751             }
3752         }
3753
           /* A statement that already has a data reference recorded would end
              up with two of them, which is not supported.  */
3754       if (STMT_VINFO_DATA_REF (stmt_info))
3755         {
3756           if (dump_enabled_p ())
3757             {
3758               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3759                                "not vectorized: more than one data ref "
3760                                "in stmt: ");
3761               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3762               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3763             }
3764
3765           if (is_a <bb_vec_info> (vinfo))
3766             break;
3767
3768           if (gatherscatter != SG_NONE || simd_lane_access)
3769             free_data_ref (dr);
3770           return false;
3771         }
3772
3773       STMT_VINFO_DATA_REF (stmt_info) = dr;
           /* For a SIMD lane access the replacement DR takes the place of the
              original entry in the vector right away, so from here on the
              vector owns it.  */
3774       if (simd_lane_access)
3775         {
3776           STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) = true;
3777           free_data_ref (datarefs[i]);
3778           datarefs[i] = dr;
3779         }
3780
3781       /* Set vectype for STMT.  */
3782       scalar_type = TREE_TYPE (DR_REF (dr));
3783       STMT_VINFO_VECTYPE (stmt_info)
3784         = get_vectype_for_scalar_type (scalar_type);
3785       if (!STMT_VINFO_VECTYPE (stmt_info))
3786         {
3787           if (dump_enabled_p ())
3788             {
3789               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3790                                "not vectorized: no vectype for stmt: ");
3791               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3792               dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
3793               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
3794                                  scalar_type);
3795               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3796             }
3797
3798           if (is_a <bb_vec_info> (vinfo))
3799             {
3800               /* No vector type is fine, the ref can still participate
3801                  in dependence analysis, we just can't vectorize it.  */
3802               STMT_VINFO_VECTORIZABLE (stmt_info) = false;
3803               continue;
3804             }
3805
               /* Detach the replacement DR from the statement again.  Only a
                  gather/scatter DR is freed here: a SIMD lane access DR was
                  already stored into the vector above.  */
3806           if (gatherscatter != SG_NONE || simd_lane_access)
3807             {
3808               STMT_VINFO_DATA_REF (stmt_info) = NULL;
3809               if (gatherscatter != SG_NONE)
3810                 free_data_ref (dr);
3811             }
3812           return false;
3813         }
3814       else
3815         {
3816           if (dump_enabled_p ())
3817             {
3818               dump_printf_loc (MSG_NOTE, vect_location,
3819                                "got vectype for stmt: ");
3820               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
3821               dump_generic_expr (MSG_NOTE, TDF_SLIM,
3822                                  STMT_VINFO_VECTYPE (stmt_info));
3823               dump_printf (MSG_NOTE, "\n");
3824             }
3825         }
3826
3827       /* Adjust the minimal vectorization factor according to the
3828          vector type.  */
3829       vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
3830       if (vf > *min_vf)
3831         *min_vf = vf;
3832
3833       if (gatherscatter != SG_NONE)
3834         {
               /* Confirm the candidate: a gather/scatter builtin must exist
                  for it and the type of its offset must have a vector type
                  as well.  */
3835           tree off;
3836           if (!vect_check_gather_scatter (stmt, as_a <loop_vec_info> (vinfo),
3837                                           NULL, &off, NULL)
3838               || get_vectype_for_scalar_type (TREE_TYPE (off)) == NULL_TREE)
3839             {
3840               STMT_VINFO_DATA_REF (stmt_info) = NULL;
3841               free_data_ref (dr);
3842               if (dump_enabled_p ())
3843                 {
3844                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3845                                    (gatherscatter == GATHER) ?
3846                                    "not vectorized: not suitable for gather "
3847                                    "load " :
3848                                    "not vectorized: not suitable for scatter "
3849                                    "store ");
3850                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3851                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3852                 }
3853               return false;
3854             }
3855
               /* Accepted: the replacement DR now takes the place of the
                  original entry in the vector.  */
3856           free_data_ref (datarefs[i]);
3857           datarefs[i] = dr;
3858           STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
3859         }
3860
           /* In a loop, a step that is not a compile-time constant makes this
              a strided access, which is rejected for statements nested in an
              inner loop.  */
3861       else if (is_a <loop_vec_info> (vinfo)
3862                && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
3863         {
3864           if (nested_in_vect_loop_p (loop, stmt))
3865             {
3866               if (dump_enabled_p ())
3867                 {
3868                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3869                                    "not vectorized: not suitable for strided "
3870                                    "load ");
3871                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3872                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3873                 }
3874               return false;
3875             }
3876           STMT_VINFO_STRIDED_P (stmt_info) = true;
3877         }
3878     }
3879
3880   /* If we stopped analysis at the first dataref we could not analyze
3881      when trying to vectorize a basic-block mark the rest of the datarefs
3882      as not vectorizable and truncate the vector of datarefs.  That
3883      avoids spending useless time in analyzing their dependence.  */
3884   if (i != datarefs.length ())
3885     {
3886       gcc_assert (is_a <bb_vec_info> (vinfo));
3887       for (unsigned j = i; j < datarefs.length (); ++j)
3888         {
3889           data_reference_p dr = datarefs[j];
3890           STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
3891           free_data_ref (dr);
3892         }
3893       datarefs.truncate (i);
3894     }
3895
3896   return true;
3897 }
3898
3899
3900 /* Function vect_get_new_vect_var.
3901
3902    Returns a name for a new variable.  The current naming scheme appends the
3903    prefix "vect_" or "vect_p" (depending on the value of VAR_KIND) to
3904    the name of vectorizer generated variables, and appends that to NAME if
3905    provided.  */
3906
3907 tree
3908 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
3909 {
3910   const char *prefix;
3911   tree new_vect_var;
3912
3913   switch (var_kind)
3914   {
3915   case vect_simple_var:
3916     prefix = "vect";
3917     break;
3918   case vect_scalar_var:
3919     prefix = "stmp";
3920     break;
3921   case vect_mask_var:
3922     prefix = "mask";
3923     break;
3924   case vect_pointer_var:
3925     prefix = "vectp";
3926     break;
3927   default:
3928     gcc_unreachable ();
3929   }
3930
3931   if (name)
3932     {
3933       char* tmp = concat (prefix, "_", name, NULL);
3934       new_vect_var = create_tmp_reg (type, tmp);
3935       free (tmp);
3936     }
3937   else
3938     new_vect_var = create_tmp_reg (type, prefix);
3939
3940   return new_vect_var;
3941 }
3942
3943 /* Like vect_get_new_vect_var but return an SSA name.  */
3944
3945 tree
3946 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
3947 {
3948   const char *prefix;
3949   tree new_vect_var;
3950
3951   switch (var_kind)
3952   {
3953   case vect_simple_var:
3954     prefix = "vect";
3955     break;
3956   case vect_scalar_var:
3957     prefix = "stmp";
3958     break;
3959   case vect_pointer_var:
3960     prefix = "vectp";
3961     break;
3962   default:
3963     gcc_unreachable ();
3964   }
3965
3966   if (name)
3967     {
3968       char* tmp = concat (prefix, "_", name, NULL);
3969       new_vect_var = make_temp_ssa_name (type, NULL, tmp);
3970       free (tmp);
3971     }
3972   else
3973     new_vect_var = make_temp_ssa_name (type, NULL, prefix);
3974
3975   return new_vect_var;
3976 }
3977
3978 /* Duplicate ptr info and set alignment/misaligment on NAME from DR.  */
3979
3980 static void
3981 vect_duplicate_ssa_name_ptr_info (tree name, data_reference *dr,
3982                                   stmt_vec_info stmt_info)
3983 {
3984   duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr));
3985   unsigned int align = TYPE_ALIGN_UNIT (STMT_VINFO_VECTYPE (stmt_info));
3986   int misalign = DR_MISALIGNMENT (dr);
3987   if (misalign == -1)
3988     mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
3989   else
3990     set_ptr_info_alignment (SSA_NAME_PTR_INFO (name), align, misalign);
3991 }
3992
3993 /* Function vect_create_addr_base_for_vector_ref.
3994
3995    Create an expression that computes the address of the first memory location
3996    that will be accessed for a data reference.
3997
3998    Input:
3999    STMT: The statement containing the data reference.
4000    NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
4001    OFFSET: Optional. If supplied, it is be added to the initial address.
4002    LOOP:    Specify relative to which loop-nest should the address be computed.
4003             For example, when the dataref is in an inner-loop nested in an
4004             outer-loop that is now being vectorized, LOOP can be either the
4005             outer-loop, or the inner-loop.  The first memory location accessed
4006             by the following dataref ('in' points to short):
4007
4008                 for (i=0; i<N; i++)
4009                    for (j=0; j<M; j++)
4010                      s += in[i+j]
4011
4012             is as follows:
4013             if LOOP=i_loop:     &in             (relative to i_loop)
4014             if LOOP=j_loop:     &in+i*2B        (relative to j_loop)
4015    BYTE_OFFSET: Optional, defaulted to NULL.  If supplied, it is added to the
4016             initial address.  Unlike OFFSET, which is number of elements to
4017             be added, BYTE_OFFSET is measured in bytes.
4018
4019    Output:
4020    1. Return an SSA_NAME whose value is the address of the memory location of
4021       the first vector of the data reference.
4022    2. If new_stmt_list is not NULL_TREE after return then the caller must insert
4023       these statement(s) which define the returned SSA_NAME.
4024
4025    FORNOW: We are only handling array accesses with step 1.  */
4026
4027 tree
4028 vect_create_addr_base_for_vector_ref (gimple *stmt,
4029                                       gimple_seq *new_stmt_list,
4030                                       tree offset,
4031                                       struct loop *loop,
4032                                       tree byte_offset)
4033 {
4034   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4035   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4036   tree data_ref_base;
4037   const char *base_name;
4038   tree addr_base;
4039   tree dest;
4040   gimple_seq seq = NULL;
4041   tree base_offset;
4042   tree init;
4043   tree vect_ptr_type;
4044   tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
4045   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4046
4047   if (loop_vinfo && loop && loop != (gimple_bb (stmt))->loop_father)
4048     {
4049       struct loop *outer_loop = LOOP_VINFO_LOOP (loop_vinfo);
4050
4051       gcc_assert (nested_in_vect_loop_p (outer_loop, stmt));
4052
4053       data_ref_base = unshare_expr (STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
4054       base_offset = unshare_expr (STMT_VINFO_DR_OFFSET (stmt_info));
4055       init = unshare_expr (STMT_VINFO_DR_INIT (stmt_info));
4056     }
4057   else
4058     {
4059       data_ref_base = unshare_expr (DR_BASE_ADDRESS (dr));
4060       base_offset = unshare_expr (DR_OFFSET (dr));
4061       init = unshare_expr (DR_INIT (dr));
4062     }
4063
4064   if (loop_vinfo)
4065     base_name = get_name (data_ref_base);
4066   else
4067     {
4068       base_offset = ssize_int (0);
4069       init = ssize_int (0);
4070       base_name = get_name (DR_REF (dr));
4071     }
4072
4073   /* Create base_offset */
4074   base_offset = size_binop (PLUS_EXPR,
4075                             fold_convert (sizetype, base_offset),
4076                             fold_convert (sizetype, init));
4077
4078   if (offset)
4079     {
4080       offset = fold_build2 (MULT_EXPR, sizetype,
4081                             fold_convert (sizetype, offset), step);
4082       base_offset = fold_build2 (PLUS_EXPR, sizetype,
4083                                  base_offset, offset);
4084     }
4085   if (byte_offset)
4086     {
4087       byte_offset = fold_convert (sizetype, byte_offset);
4088       base_offset = fold_build2 (PLUS_EXPR, sizetype,
4089                                  base_offset, byte_offset);
4090     }
4091
4092   /* base + base_offset */
4093   if (loop_vinfo)
4094     addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
4095   else
4096     {
4097       addr_base = build1 (ADDR_EXPR,
4098                           build_pointer_type (TREE_TYPE (DR_REF (dr))),
4099                           unshare_expr (DR_REF (dr)));
4100     }
4101
4102   vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
4103   dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
4104   addr_base = force_gimple_operand (addr_base, &seq, true, dest);
4105   gimple_seq_add_seq (new_stmt_list, seq);
4106
4107   if (DR_PTR_INFO (dr)
4108       && TREE_CODE (addr_base) == SSA_NAME
4109       && !SSA_NAME_PTR_INFO (addr_base))
4110     {
4111       vect_duplicate_ssa_name_ptr_info (addr_base, dr, stmt_info);
4112       if (offset || byte_offset)
4113         mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
4114     }
4115
4116   if (dump_enabled_p ())
4117     {
4118       dump_printf_loc (MSG_NOTE, vect_location, "created ");
4119       dump_generic_expr (MSG_NOTE, TDF_SLIM, addr_base);
4120       dump_printf (MSG_NOTE, "\n");
4121     }
4122
4123   return addr_base;
4124 }
4125
4126
4127 /* Function vect_create_data_ref_ptr.
4128
4129    Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
4130    location accessed in the loop by STMT, along with the def-use update
4131    chain to appropriately advance the pointer through the loop iterations.
4132    Also set aliasing information for the pointer.  This pointer is used by
4133    the callers to this function to create a memory reference expression for
4134    vector load/store access.
4135
4136    Input:
4137    1. STMT: a stmt that references memory. Expected to be of the form
4138          GIMPLE_ASSIGN <name, data-ref> or
4139          GIMPLE_ASSIGN <data-ref, name>.
4140    2. AGGR_TYPE: the type of the reference, which should be either a vector
4141         or an array.
4142    3. AT_LOOP: the loop where the vector memref is to be created.
4143    4. OFFSET (optional): an offset to be added to the initial address accessed
4144         by the data-ref in STMT.
4145    5. BSI: location where the new stmts are to be placed if there is no loop
4146    6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
4147         pointing to the initial address.
4148    7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
4149         to the initial address accessed by the data-ref in STMT.  This is
4150         similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
4151         in bytes.
4152
4153    Output:
4154    1. Declare a new ptr to vector_type, and have it point to the base of the
4155       data reference (initial addressed accessed by the data reference).
4156       For example, for vector of type V8HI, the following code is generated:
4157
4158       v8hi *ap;
4159       ap = (v8hi *)initial_address;
4160
4161       if OFFSET is not supplied:
4162          initial_address = &a[init];
4163       if OFFSET is supplied:
4164          initial_address = &a[init + OFFSET];
4165       if BYTE_OFFSET is supplied:
4166          initial_address = &a[init] + BYTE_OFFSET;
4167
4168       Return the initial_address in INITIAL_ADDRESS.
4169
4170    2. If ONLY_INIT is true, just return the initial pointer.  Otherwise, also
4171       update the pointer in each iteration of the loop.
4172
4173       Return the increment stmt that updates the pointer in PTR_INCR.
4174
4175    3. Set INV_P to true if the access pattern of the data reference in the
4176       vectorized loop is invariant.  Set it to false otherwise.
4177
4178    4. Return the pointer.  */
4179
4180 tree
4181 vect_create_data_ref_ptr (gimple *stmt, tree aggr_type, struct loop *at_loop,
4182                           tree offset, tree *initial_address,
4183                           gimple_stmt_iterator *gsi, gimple **ptr_incr,
4184                           bool only_init, bool *inv_p, tree byte_offset)
4185 {
4186   const char *base_name;
4187   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4188   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4189   struct loop *loop = NULL;
4190   bool nested_in_vect_loop = false;
4191   struct loop *containing_loop = NULL;
4192   tree aggr_ptr_type;
4193   tree aggr_ptr;
4194   tree new_temp;
4195   gimple_seq new_stmt_list = NULL;
4196   edge pe = NULL;
4197   basic_block new_bb;
4198   tree aggr_ptr_init;
4199   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4200   tree aptr;
4201   gimple_stmt_iterator incr_gsi;
4202   bool insert_after;
4203   tree indx_before_incr, indx_after_incr;
4204   gimple *incr;
4205   tree step;
4206   bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
4207
4208   gcc_assert (TREE_CODE (aggr_type) == ARRAY_TYPE
4209               || TREE_CODE (aggr_type) == VECTOR_TYPE);
4210
4211   if (loop_vinfo)
4212     {
4213       loop = LOOP_VINFO_LOOP (loop_vinfo);
4214       nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4215       containing_loop = (gimple_bb (stmt))->loop_father;
4216       pe = loop_preheader_edge (loop);
4217     }
4218   else
4219     {
4220       gcc_assert (bb_vinfo);
4221       only_init = true;
4222       *ptr_incr = NULL;
4223     }
4224
4225   /* Check the step (evolution) of the load in LOOP, and record
4226      whether it's invariant.  */
4227   if (nested_in_vect_loop)
4228     step = STMT_VINFO_DR_STEP (stmt_info);
4229   else
4230     step = DR_STEP (STMT_VINFO_DATA_REF (stmt_info));
4231
4232   if (integer_zerop (step))
4233     *inv_p = true;
4234   else
4235     *inv_p = false;
4236
4237   /* Create an expression for the first address accessed by this load
4238      in LOOP.  */
4239   base_name = get_name (DR_BASE_ADDRESS (dr));
4240
4241   if (dump_enabled_p ())
4242     {
4243       tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4244       dump_printf_loc (MSG_NOTE, vect_location,
4245                        "create %s-pointer variable to type: ",
4246                        get_tree_code_name (TREE_CODE (aggr_type)));
4247       dump_generic_expr (MSG_NOTE, TDF_SLIM, aggr_type);
4248       if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4249         dump_printf (MSG_NOTE, "  vectorizing an array ref: ");
4250       else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4251         dump_printf (MSG_NOTE, "  vectorizing a vector ref: ");
4252       else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4253         dump_printf (MSG_NOTE, "  vectorizing a record based array ref: ");
4254       else
4255         dump_printf (MSG_NOTE, "  vectorizing a pointer ref: ");
4256       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_BASE_OBJECT (dr));
4257       dump_printf (MSG_NOTE, "\n");
4258     }
4259
4260   /* (1) Create the new aggregate-pointer variable.
4261      Vector and array types inherit the alias set of their component
4262      type by default so we need to use a ref-all pointer if the data
4263      reference does not conflict with the created aggregated data
4264      reference because it is not addressable.  */
4265   bool need_ref_all = false;
4266   if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4267                               get_alias_set (DR_REF (dr))))
4268     need_ref_all = true;
4269   /* Likewise for any of the data references in the stmt group.  */
4270   else if (STMT_VINFO_GROUP_SIZE (stmt_info) > 1)
4271     {
4272       gimple *orig_stmt = STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info);
4273       do
4274         {
4275           stmt_vec_info sinfo = vinfo_for_stmt (orig_stmt);
4276           struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
4277           if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4278                                       get_alias_set (DR_REF (sdr))))
4279             {
4280               need_ref_all = true;
4281               break;
4282             }
4283           orig_stmt = STMT_VINFO_GROUP_NEXT_ELEMENT (sinfo);
4284         }
4285       while (orig_stmt);
4286     }
4287   aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
4288                                                need_ref_all);
4289   aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
4290
4291
4292   /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4293      vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4294      def-use update cycles for the pointer: one relative to the outer-loop
4295      (LOOP), which is what steps (3) and (4) below do.  The other is relative
4296      to the inner-loop (which is the inner-most loop containing the dataref),
4297      and this is done be step (5) below.
4298
4299      When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4300      inner-most loop, and so steps (3),(4) work the same, and step (5) is
4301      redundant.  Steps (3),(4) create the following:
4302
4303         vp0 = &base_addr;
4304         LOOP:   vp1 = phi(vp0,vp2)
4305                 ...
4306                 ...
4307                 vp2 = vp1 + step
4308                 goto LOOP
4309
4310      If there is an inner-loop nested in loop, then step (5) will also be
4311      applied, and an additional update in the inner-loop will be created:
4312
4313         vp0 = &base_addr;
4314         LOOP:   vp1 = phi(vp0,vp2)
4315                 ...
4316         inner:     vp3 = phi(vp1,vp4)
4317                    vp4 = vp3 + inner_step
4318                    if () goto inner
4319                 ...
4320                 vp2 = vp1 + step
4321                 if () goto LOOP   */
4322
4323   /* (2) Calculate the initial address of the aggregate-pointer, and set
4324      the aggregate-pointer to point to it before the loop.  */
4325
4326   /* Create: (&(base[init_val+offset]+byte_offset) in the loop preheader.  */
4327
4328   new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
4329                                                    offset, loop, byte_offset);
4330   if (new_stmt_list)
4331     {
4332       if (pe)
4333         {
4334           new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
4335           gcc_assert (!new_bb);
4336         }
4337       else
4338         gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
4339     }
4340
4341   *initial_address = new_temp;
4342   aggr_ptr_init = new_temp;
4343
4344   /* (3) Handle the updating of the aggregate-pointer inside the loop.
4345      This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4346      inner-loop nested in LOOP (during outer-loop vectorization).  */
4347
4348   /* No update in loop is required.  */
4349   if (only_init && (!loop_vinfo || at_loop == loop))
4350     aptr = aggr_ptr_init;
4351   else
4352     {
4353       /* The step of the aggregate pointer is the type size.  */
4354       tree iv_step = TYPE_SIZE_UNIT (aggr_type);
4355       /* One exception to the above is when the scalar step of the load in
4356          LOOP is zero. In this case the step here is also zero.  */
4357       if (*inv_p)
4358         iv_step = size_zero_node;
4359       else if (tree_int_cst_sgn (step) == -1)
4360         iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
4361
4362       standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4363
4364       create_iv (aggr_ptr_init,
4365                  fold_convert (aggr_ptr_type, iv_step),
4366                  aggr_ptr, loop, &incr_gsi, insert_after,
4367                  &indx_before_incr, &indx_after_incr);
4368       incr = gsi_stmt (incr_gsi);
4369       set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4370
4371       /* Copy the points-to information if it exists. */
4372       if (DR_PTR_INFO (dr))
4373         {
4374           vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr, stmt_info);
4375           vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr, stmt_info);
4376         }
4377       if (ptr_incr)
4378         *ptr_incr = incr;
4379
4380       aptr = indx_before_incr;
4381     }
4382
4383   if (!nested_in_vect_loop || only_init)
4384     return aptr;
4385
4386
4387   /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4388      nested in LOOP, if exists.  */
4389
4390   gcc_assert (nested_in_vect_loop);
4391   if (!only_init)
4392     {
4393       standard_iv_increment_position (containing_loop, &incr_gsi,
4394                                       &insert_after);
4395       create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
4396                  containing_loop, &incr_gsi, insert_after, &indx_before_incr,
4397                  &indx_after_incr);
4398       incr = gsi_stmt (incr_gsi);
4399       set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4400
4401       /* Copy the points-to information if it exists. */
4402       if (DR_PTR_INFO (dr))
4403         {
4404           vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr, stmt_info);
4405           vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr, stmt_info);
4406         }
4407       if (ptr_incr)
4408         *ptr_incr = incr;
4409
4410       return indx_before_incr;
4411     }
4412   else
4413     gcc_unreachable ();
4414 }
4415
4416
4417 /* Function bump_vector_ptr
4418
4419    Increment a pointer (to a vector type) by vector-size. If requested,
4420    i.e. if PTR-INCR is given, then also connect the new increment stmt
4421    to the existing def-use update-chain of the pointer, by modifying
4422    the PTR_INCR as illustrated below:
4423
4424    The pointer def-use update-chain before this function:
4425                         DATAREF_PTR = phi (p_0, p_2)
4426                         ....
4427         PTR_INCR:       p_2 = DATAREF_PTR + step
4428
4429    The pointer def-use update-chain after this function:
4430                         DATAREF_PTR = phi (p_0, p_2)
4431                         ....
4432                         NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4433                         ....
4434         PTR_INCR:       p_2 = NEW_DATAREF_PTR + step
4435
4436    Input:
4437    DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4438                  in the loop.
4439    PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4440               the loop.  The increment amount across iterations is expected
4441               to be vector_size.
4442    BSI - location where the new update stmt is to be placed.
4443    STMT - the original scalar memory-access stmt that is being vectorized.
4444    BUMP - optional. The offset by which to bump the pointer. If not given,
4445           the offset is assumed to be vector_size.
4446
4447    Output: Return NEW_DATAREF_PTR as illustrated above.
4448
4449 */
4450
4451 tree
4452 bump_vector_ptr (tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
4453                  gimple *stmt, tree bump)
4454 {
4455   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4456   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4457   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4458   tree update = TYPE_SIZE_UNIT (vectype);
4459   gassign *incr_stmt;
4460   ssa_op_iter iter;
4461   use_operand_p use_p;
4462   tree new_dataref_ptr;
4463
4464   if (bump)
4465     update = bump;
4466
4467   if (TREE_CODE (dataref_ptr) == SSA_NAME)
4468     new_dataref_ptr = copy_ssa_name (dataref_ptr);
4469   else
4470     new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
4471   incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
4472                                    dataref_ptr, update);
4473   vect_finish_stmt_generation (stmt, incr_stmt, gsi);
4474
4475   /* Copy the points-to information if it exists. */
4476   if (DR_PTR_INFO (dr))
4477     {
4478       duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
4479       mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
4480     }
4481
4482   if (!ptr_incr)
4483     return new_dataref_ptr;
4484
4485   /* Update the vector-pointer's cross-iteration increment.  */
4486   FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
4487     {
4488       tree use = USE_FROM_PTR (use_p);
4489
4490       if (use == dataref_ptr)
4491         SET_USE (use_p, new_dataref_ptr);
4492       else
4493         gcc_assert (tree_int_cst_compare (use, update) == 0);
4494     }
4495
4496   return new_dataref_ptr;
4497 }
4498
4499
4500 /* Function vect_create_destination_var.
4501
4502    Create a new temporary of type VECTYPE.  */
4503
4504 tree
4505 vect_create_destination_var (tree scalar_dest, tree vectype)
4506 {
4507   tree vec_dest;
4508   const char *name;
4509   char *new_name;
4510   tree type;
4511   enum vect_var_kind kind;
4512
4513   kind = vectype
4514     ? VECTOR_BOOLEAN_TYPE_P (vectype)
4515     ? vect_mask_var
4516     : vect_simple_var
4517     : vect_scalar_var;
4518   type = vectype ? vectype : TREE_TYPE (scalar_dest);
4519
4520   gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
4521
4522   name = get_name (scalar_dest);
4523   if (name)
4524     new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
4525   else
4526     new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
4527   vec_dest = vect_get_new_vect_var (type, kind, new_name);
4528   free (new_name);
4529
4530   return vec_dest;
4531 }
4532
4533 /* Function vect_grouped_store_supported.
4534
4535    Returns TRUE if interleave high and interleave low permutations
4536    are supported, and FALSE otherwise.  */
4537
4538 bool
4539 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
4540 {
4541   machine_mode mode = TYPE_MODE (vectype);
4542
4543   /* vect_permute_store_chain requires the group size to be equal to 3 or
4544      be a power of two.  */
4545   if (count != 3 && exact_log2 (count) == -1)
4546     {
4547       if (dump_enabled_p ())
4548         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4549                          "the size of the group of accesses"
4550                          " is not a power of 2 or not eqaul to 3\n");
4551       return false;
4552     }
4553
4554   /* Check that the permutation is supported.  */
4555   if (VECTOR_MODE_P (mode))
4556     {
4557       unsigned int i, nelt = GET_MODE_NUNITS (mode);
4558       unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4559
4560       if (count == 3)
4561         {
4562           unsigned int j0 = 0, j1 = 0, j2 = 0;
4563           unsigned int i, j;
4564
4565           for (j = 0; j < 3; j++)
4566             {
4567               int nelt0 = ((3 - j) * nelt) % 3;
4568               int nelt1 = ((3 - j) * nelt + 1) % 3;
4569               int nelt2 = ((3 - j) * nelt + 2) % 3;
4570               for (i = 0; i < nelt; i++)
4571                 {
4572                   if (3 * i + nelt0 < nelt)
4573                     sel[3 * i + nelt0] = j0++;
4574                   if (3 * i + nelt1 < nelt)
4575                     sel[3 * i + nelt1] = nelt + j1++;
4576                   if (3 * i + nelt2 < nelt)
4577                     sel[3 * i + nelt2] = 0;
4578                 }
4579               if (!can_vec_perm_p (mode, false, sel))
4580                 {
4581                   if (dump_enabled_p ())
4582                     dump_printf (MSG_MISSED_OPTIMIZATION,
4583                                  "permutaion op not supported by target.\n");
4584                   return false;
4585                 }
4586
4587               for (i = 0; i < nelt; i++)
4588                 {
4589                   if (3 * i + nelt0 < nelt)
4590                     sel[3 * i + nelt0] = 3 * i + nelt0;
4591                   if (3 * i + nelt1 < nelt)
4592                     sel[3 * i + nelt1] = 3 * i + nelt1;
4593                   if (3 * i + nelt2 < nelt)
4594                     sel[3 * i + nelt2] = nelt + j2++;
4595                 }
4596               if (!can_vec_perm_p (mode, false, sel))
4597                 {
4598                   if (dump_enabled_p ())
4599                     dump_printf (MSG_MISSED_OPTIMIZATION,
4600                                  "permutaion op not supported by target.\n");
4601                   return false;
4602                 }
4603             }
4604           return true;
4605         }
4606       else
4607         {
4608           /* If length is not equal to 3 then only power of 2 is supported.  */
4609           gcc_assert (exact_log2 (count) != -1);
4610
4611           for (i = 0; i < nelt / 2; i++)
4612             {
4613               sel[i * 2] = i;
4614               sel[i * 2 + 1] = i + nelt;
4615             }
4616             if (can_vec_perm_p (mode, false, sel))
4617               {
4618                 for (i = 0; i < nelt; i++)
4619                   sel[i] += nelt / 2;
4620                 if (can_vec_perm_p (mode, false, sel))
4621                   return true;
4622               }
4623         }
4624     }
4625
4626   if (dump_enabled_p ())
4627     dump_printf (MSG_MISSED_OPTIMIZATION,
4628                  "permutaion op not supported by target.\n");
4629   return false;
4630 }
4631
4632
4633 /* Return TRUE if vec_store_lanes is available for COUNT vectors of
4634    type VECTYPE.  */
4635
4636 bool
4637 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
4638 {
4639   return vect_lanes_optab_supported_p ("vec_store_lanes",
4640                                        vec_store_lanes_optab,
4641                                        vectype, count);
4642 }
4643
4644
4645 /* Function vect_permute_store_chain.
4646
4647    Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
4648    a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
4649    the data correctly for the stores.  Return the final references for stores
4650    in RESULT_CHAIN.
4651
4652    E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
4653    The input is 4 vectors each containing 8 elements.  We assign a number to
4654    each element, the input sequence is:
4655
4656    1st vec:   0  1  2  3  4  5  6  7
4657    2nd vec:   8  9 10 11 12 13 14 15
4658    3rd vec:  16 17 18 19 20 21 22 23
4659    4th vec:  24 25 26 27 28 29 30 31
4660
4661    The output sequence should be:
4662
4663    1st vec:  0  8 16 24  1  9 17 25
4664    2nd vec:  2 10 18 26  3 11 19 27
4665    3rd vec:  4 12 20 28  5 13 21 29
4666    4th vec:  6 14 22 30  7 15 23 31
4667
4668    i.e., we interleave the contents of the four vectors in their order.
4669
4670    We use interleave_high/low instructions to create such output.  The input of
4671    each interleave_high/low operation is two vectors:
4672    1st vec    2nd vec
4673    0 1 2 3    4 5 6 7
4674    the even elements of the result vector are obtained left-to-right from the
4675    high/low elements of the first vector.  The odd elements of the result are
4676    obtained left-to-right from the high/low elements of the second vector.
4677    The output of interleave_high will be:   0 4 1 5
4678    and of interleave_low:                   2 6 3 7
4679
4680
4681    The permutation is done in log LENGTH stages.  In each stage interleave_high
4682    and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
4683    where the first argument is taken from the first half of DR_CHAIN and the
4684    second argument from its second half.
4685    In our example,
4686
4687    I1: interleave_high (1st vec, 3rd vec)
4688    I2: interleave_low (1st vec, 3rd vec)
4689    I3: interleave_high (2nd vec, 4th vec)
4690    I4: interleave_low (2nd vec, 4th vec)
4691
4692    The output for the first stage is:
4693
4694    I1:  0 16  1 17  2 18  3 19
4695    I2:  4 20  5 21  6 22  7 23
4696    I3:  8 24  9 25 10 26 11 27
4697    I4: 12 28 13 29 14 30 15 31
4698
4699    The output of the second stage, i.e. the final result is:
4700
4701    I1:  0  8 16 24  1  9 17 25
4702    I2:  2 10 18 26  3 11 19 27
4703    I3:  4 12 20 28  5 13 21 29
4704    I4:  6 14 22 30  7 15 23 31.  */
4705
4706 void
4707 vect_permute_store_chain (vec<tree> dr_chain,
4708                           unsigned int length,
4709                           gimple *stmt,
4710                           gimple_stmt_iterator *gsi,
4711                           vec<tree> *result_chain)
4712 {
4713   tree vect1, vect2, high, low;
4714   gimple *perm_stmt;
4715   tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
4716   tree perm_mask_low, perm_mask_high;
4717   tree data_ref;
4718   tree perm3_mask_low, perm3_mask_high;
4719   unsigned int i, n, log_length = exact_log2 (length);
4720   unsigned int j, nelt = TYPE_VECTOR_SUBPARTS (vectype);
4721   unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4722
       /* Seed RESULT_CHAIN with a copy of the first LENGTH entries of DR_CHAIN;
          the permuted vectors are stored over these entries below.  */
4723   result_chain->quick_grow (length);
4724   memcpy (result_chain->address (), dr_chain.address (),
4725           length * sizeof (tree));
4726
4727   if (length == 3)
4728     {
           /* J0, J1 and J2 count the elements consumed so far from the first,
              second and third input vector.  They are not reset between the
              three output vectors built below.  */
4729       unsigned int j0 = 0, j1 = 0, j2 = 0;
4730
4731       for (j = 0; j < 3; j++)
4732         {
               /* Residues (mod 3) of the lanes of output vector J that receive
                  elements of the first, second and third input respectively.  */
4733           int nelt0 = ((3 - j) * nelt) % 3;
4734           int nelt1 = ((3 - j) * nelt + 1) % 3;
4735           int nelt2 = ((3 - j) * nelt + 2) % 3;
4736
               /* Mask for the first permute: place elements of the first two
                  inputs in their final lanes.  Lanes reserved for the third
                  input get a dummy index 0.  */
4737           for (i = 0; i < nelt; i++)
4738             {
4739               if (3 * i + nelt0 < nelt)
4740                 sel[3 * i + nelt0] = j0++;
4741               if (3 * i + nelt1 < nelt)
4742                 sel[3 * i + nelt1] = nelt + j1++;
4743               if (3 * i + nelt2 < nelt)
4744                 sel[3 * i + nelt2] = 0;
4745             }
4746           perm3_mask_low = vect_gen_perm_mask_checked (vectype, sel);
4747
               /* Mask for the second permute: lanes filled by the first permute
                  keep their own index (identity), the remaining lanes select
                  from the second operand.  */
4748           for (i = 0; i < nelt; i++)
4749             {
4750               if (3 * i + nelt0 < nelt)
4751                 sel[3 * i + nelt0] = 3 * i + nelt0;
4752               if (3 * i + nelt1 < nelt)
4753                 sel[3 * i + nelt1] = 3 * i + nelt1;
4754               if (3 * i + nelt2 < nelt)
4755                 sel[3 * i + nelt2] = nelt + j2++;
4756             }
4757           perm3_mask_high = vect_gen_perm_mask_checked (vectype, sel);
4758
4759           vect1 = dr_chain[0];
4760           vect2 = dr_chain[1];
4761
4762           /* Create interleaving stmt:
4763              low = VEC_PERM_EXPR <vect1, vect2,
4764                                   {j, nelt, *, j + 1, nelt + j + 1, *,
4765                                    j + 2, nelt + j + 2, *, ...}>  */
4766           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
4767           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
4768                                            vect2, perm3_mask_low);
4769           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4770
               /* The result of the first permute is combined with the third
                  input vector.  */
4771           vect1 = data_ref;
4772           vect2 = dr_chain[2];
4773           /* Create interleaving stmt:
4774              high = VEC_PERM_EXPR <vect1, vect2,
4775                                    {0, 1, nelt + j, 3, 4, nelt + j + 1,
4776                                     6, 7, nelt + j + 2, ...}>  */
4777           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
4778           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
4779                                            vect2, perm3_mask_high);
4780           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4781           (*result_chain)[j] = data_ref;
4782         }
4783     }
4784   else
4785     {
4786       /* If length is not equal to 3 then only power of 2 is supported.  */
4787       gcc_assert (exact_log2 (length) != -1);
4788
           /* Interleave-high mask: {0, nelt, 1, nelt + 1, ...}.  */
4789       for (i = 0, n = nelt / 2; i < n; i++)
4790         {
4791           sel[i * 2] = i;
4792           sel[i * 2 + 1] = i + nelt;
4793         }
4794         perm_mask_high = vect_gen_perm_mask_checked (vectype, sel);
4795
             /* Adding nelt / 2 to every index of the high mask gives the
                interleave-low mask.  */
4796         for (i = 0; i < nelt; i++)
4797           sel[i] += nelt / 2;
4798         perm_mask_low = vect_gen_perm_mask_checked (vectype, sel);
4799
             /* Run LOG_LENGTH stages; each stage pairs vector J of DR_CHAIN
                with vector J + LENGTH / 2.  */
4800         for (i = 0, n = log_length; i < n; i++)
4801           {
4802             for (j = 0; j < length/2; j++)
4803               {
4804                 vect1 = dr_chain[j];
4805                 vect2 = dr_chain[j+length/2];
4806
4807                 /* Create interleaving stmt:
4808                    high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
4809                                                         ...}>  */
4810                 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
4811                 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
4812                                                  vect2, perm_mask_high);
4813                 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4814                 (*result_chain)[2*j] = high;
4815
4816                 /* Create interleaving stmt:
4817                    low = VEC_PERM_EXPR <vect1, vect2,
4818                                         {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
4819                                          ...}>  */
4820                 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
4821                 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
4822                                                  vect2, perm_mask_low);
4823                 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4824                 (*result_chain)[2*j+1] = low;
4825               }
                 /* The vectors produced by this stage are the inputs of the
                    next one.  */
4826             memcpy (dr_chain.address (), result_chain->address (),
4827                     length * sizeof (tree));
4828           }
4829     }
4830 }
4831
4832 /* Function vect_setup_realignment
4833
4834    This function is called when vectorizing an unaligned load using
4835    the dr_explicit_realign[_optimized] scheme.
4836    This function generates the following code at the loop prolog:
4837
4838       p = initial_addr;
4839    x  msq_init = *(floor(p));   # prolog load
4840       realignment_token = call target_builtin;
4841     loop:
4842    x  msq = phi (msq_init, ---)
4843
4844    The stmts marked with x are generated only for the case of
4845    dr_explicit_realign_optimized.
4846
4847    The code above sets up a new (vector) pointer, pointing to the first
4848    location accessed by STMT, and a "floor-aligned" load using that pointer.
4849    It also generates code to compute the "realignment-token" (if the relevant
4850    target hook was defined), and creates a phi-node at the loop-header bb
4851    whose arguments are the result of the prolog-load (created by this
4852    function) and the result of a load that takes place in the loop (to be
4853    created by the caller to this function).
4854
4855    For the case of dr_explicit_realign_optimized:
4856    The caller to this function uses the phi-result (msq) to create the
4857    realignment code inside the loop, and sets up the missing phi argument,
4858    as follows:
4859     loop:
4860       msq = phi (msq_init, lsq)
4861       lsq = *(floor(p'));        # load in loop
4862       result = realign_load (msq, lsq, realignment_token);
4863
4864    For the case of dr_explicit_realign:
4865     loop:
4866       msq = *(floor(p));        # load in loop
4867       p' = p + (VS-1);
4868       lsq = *(floor(p'));       # load in loop
4869       result = realign_load (msq, lsq, realignment_token);
4870
4871    Input:
4872    STMT - (scalar) load stmt to be vectorized. This load accesses
4873           a memory location that may be unaligned.
4874    GSI - place where new code is to be inserted.
4875    ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
4876                               is used.
4877
4878    Output:
4879    REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
4880                        target hook, if defined.
4881    Return value - the result of the loop-header phi node.  */
4882
4883 tree
4884 vect_setup_realignment (gimple *stmt, gimple_stmt_iterator *gsi,
4885                         tree *realignment_token,
4886                         enum dr_alignment_support alignment_support_scheme,
4887                         tree init_addr,
4888                         struct loop **at_loop)
4889 {
4890   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4891   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4892   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4893   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4894   struct loop *loop = NULL;
4895   edge pe = NULL;
4896   tree scalar_dest = gimple_assign_lhs (stmt);
4897   tree vec_dest;
4898   gimple *inc;
4899   tree ptr;
4900   tree data_ref;
4901   basic_block new_bb;
4902   tree msq_init = NULL_TREE;
4903   tree new_temp;
4904   gphi *phi_stmt;
4905   tree msq = NULL_TREE;
4906   gimple_seq stmts = NULL;
4907   bool inv_p;
4908   bool compute_in_loop = false;
4909   bool nested_in_vect_loop = false;
4910   struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
4911   struct loop *loop_for_initial_load = NULL;
4912
       /* LOOP stays NULL when STMT has no loop_vec_info.  */
4913   if (loop_vinfo)
4914     {
4915       loop = LOOP_VINFO_LOOP (loop_vinfo);
4916       nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4917     }
4918
4919   gcc_assert (alignment_support_scheme == dr_explicit_realign
4920               || alignment_support_scheme == dr_explicit_realign_optimized);
4921
4922   /* We need to generate three things:
4923      1. the misalignment computation
4924      2. the extra vector load (for the optimized realignment scheme).
4925      3. the phi node for the two vectors from which the realignment is
4926       done (for the optimized realignment scheme).  */
4927
4928   /* 1. Determine where to generate the misalignment computation.
4929
4930      If INIT_ADDR is NULL_TREE, this indicates that the misalignment
4931      calculation will be generated by this function, outside the loop (in the
4932      preheader).  Otherwise, INIT_ADDR had already been computed for us by the
4933      caller, inside the loop.
4934
4935      Background: If the misalignment remains fixed throughout the iterations of
4936      the loop, then both realignment schemes are applicable, and also the
4937      misalignment computation can be done outside LOOP.  This is because we are
4938      vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
4939      are a multiple of VS (the Vector Size), and therefore the misalignment in
4940      different vectorized LOOP iterations is always the same.
4941      The problem arises only if the memory access is in an inner-loop nested
4942      inside LOOP, which is now being vectorized using outer-loop vectorization.
4943      This is the only case when the misalignment of the memory access may not
4944      remain fixed throughout the iterations of the inner-loop (as explained in
4945      detail in vect_supportable_dr_alignment).  In this case, not only is the
4946      optimized realignment scheme not applicable, but also the misalignment
4947      computation (and generation of the realignment token that is passed to
4948      REALIGN_LOAD) have to be done inside the loop.
4949
4950      In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
4951      or not, which in turn determines if the misalignment is computed inside
4952      the inner-loop, or outside LOOP.  */
4953
       /* COMPUTE_IN_LOOP is also forced when there is no loop_vec_info; only
          the non-optimized scheme is allowed in this mode.  */
4954   if (init_addr != NULL_TREE || !loop_vinfo)
4955     {
4956       compute_in_loop = true;
4957       gcc_assert (alignment_support_scheme == dr_explicit_realign);
4958     }
4959
4960
4961   /* 2. Determine where to generate the extra vector load.
4962
4963      For the optimized realignment scheme, instead of generating two vector
4964      loads in each iteration, we generate a single extra vector load in the
4965      preheader of the loop, and in each iteration reuse the result of the
4966      vector load from the previous iteration.  In case the memory access is in
4967      an inner-loop nested inside LOOP, which is now being vectorized using
4968      outer-loop vectorization, we need to determine whether this initial vector
4969      load should be generated at the preheader of the inner-loop, or can be
4970      generated at the preheader of LOOP.  If the memory access has no evolution
4971      in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
4972      to be generated inside LOOP (in the preheader of the inner-loop).  */
4973
4974   if (nested_in_vect_loop)
4975     {
4976       tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
4977       bool invariant_in_outerloop =
4978             (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
4979       loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
4980     }
4981   else
4982     loop_for_initial_load = loop;
       /* If the caller passed AT_LOOP, report through it the loop chosen for
          the initial load (possibly NULL).  */
4983   if (at_loop)
4984     *at_loop = loop_for_initial_load;
4985
4986   if (loop_for_initial_load)
4987     pe = loop_preheader_edge (loop_for_initial_load);
4988
4989   /* 3. For the case of the optimized realignment, create the first vector
4990       load at the loop preheader.  */
4991
4992   if (alignment_support_scheme == dr_explicit_realign_optimized)
4993     {
4994       /* Create msq_init = *(floor(p1)) in the loop preheader  */
4995       gassign *new_stmt;
4996
4997       gcc_assert (!compute_in_loop);
4998       vec_dest = vect_create_destination_var (scalar_dest, vectype);
4999       ptr = vect_create_data_ref_ptr (stmt, vectype, loop_for_initial_load,
5000                                       NULL_TREE, &init_addr, NULL, &inc,
5001                                       true, &inv_p);
5002       if (TREE_CODE (ptr) == SSA_NAME)
5003         new_temp = copy_ssa_name (ptr);
5004       else
5005         new_temp = make_ssa_name (TREE_TYPE (ptr));
           /* floor (ptr): mask PTR with the negated TYPE_ALIGN_UNIT of
              VECTYPE.  */
5006       new_stmt = gimple_build_assign
5007                    (new_temp, BIT_AND_EXPR, ptr,
5008                     build_int_cst (TREE_TYPE (ptr),
5009                                    -(HOST_WIDE_INT)TYPE_ALIGN_UNIT (vectype)));
5010       new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5011       gcc_assert (!new_bb);
           /* Load the vector at the rounded-down address.  */
5012       data_ref
5013         = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
5014                   build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
5015       new_stmt = gimple_build_assign (vec_dest, data_ref);
5016       new_temp = make_ssa_name (vec_dest, new_stmt);
5017       gimple_assign_set_lhs (new_stmt, new_temp);
5018       if (pe)
5019         {
5020           new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5021           gcc_assert (!new_bb);
5022         }
5023       else
5024          gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5025
5026       msq_init = gimple_assign_lhs (new_stmt);
5027     }
5028
5029   /* 4. Create realignment token using a target builtin, if available.
5030       It is done either inside the containing loop, or before LOOP (as
5031       determined above).  */
5032
5033   if (targetm.vectorize.builtin_mask_for_load)
5034     {
5035       gcall *new_stmt;
5036       tree builtin_decl;
5037
5038       /* Compute INIT_ADDR - the initial address accessed by this memref.  */
5039       if (!init_addr)
5040         {
5041           /* Generate the INIT_ADDR computation outside LOOP.  */
5042           init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
5043                                                         NULL_TREE, loop);
5044           if (loop)
5045             {
5046               pe = loop_preheader_edge (loop);
5047               new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
5048               gcc_assert (!new_bb);
5049             }
5050           else
5051              gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
5052         }
5053
           /* Build realignment_token = builtin (INIT_ADDR).  */
5054       builtin_decl = targetm.vectorize.builtin_mask_for_load ();
5055       new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
5056       vec_dest =
5057         vect_create_destination_var (scalar_dest,
5058                                      gimple_call_return_type (new_stmt));
5059       new_temp = make_ssa_name (vec_dest, new_stmt);
5060       gimple_call_set_lhs (new_stmt, new_temp);
5061
5062       if (compute_in_loop)
5063         gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5064       else
5065         {
5066           /* Generate the misalignment computation outside LOOP.  */
5067           pe = loop_preheader_edge (loop);
5068           new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5069           gcc_assert (!new_bb);
5070         }
5071
5072       *realignment_token = gimple_call_lhs (new_stmt);
5073
5074       /* The result of the CALL_EXPR to this builtin is determined from
5075          the value of the parameter and no global variables are touched
5076          which makes the builtin a "const" function.  Requiring the
5077          builtin to have the "const" attribute makes it unnecessary
5078          to call mark_call_clobbered.  */
5079       gcc_assert (TREE_READONLY (builtin_decl));
5080     }
5081
       /* No phi is created for dr_explicit_realign; MSQ is still NULL_TREE
          at this point.  */
5082   if (alignment_support_scheme == dr_explicit_realign)
5083     return msq;
5084
5085   gcc_assert (!compute_in_loop);
5086   gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
5087
5088
5089   /* 5. Create msq = phi <msq_init, lsq> in loop  */
5090
       /* Only the preheader argument (MSQ_INIT) is added here.  */
5091   pe = loop_preheader_edge (containing_loop);
5092   vec_dest = vect_create_destination_var (scalar_dest, vectype);
5093   msq = make_ssa_name (vec_dest);
5094   phi_stmt = create_phi_node (msq, containing_loop->header);
5095   add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
5096
5097   return msq;
5098 }
5099
5100
5101 /* Function vect_grouped_load_supported.
5102
5103    Returns TRUE if the target supports the permutations needed to
5104    de-interleave a group of COUNT vectors of type VECTYPE, and FALSE
        otherwise.  For COUNT == 3 both shuffle masks are checked for each of
        the three output vectors; for a power of two the extract-even and
        extract-odd masks are checked.  Any other COUNT is rejected.  When
        dumping is enabled, the reason for a failure is written to the dump.  */
5105
5106 bool
5107 vect_grouped_load_supported (tree vectype, unsigned HOST_WIDE_INT count)
5108 {
5109   machine_mode mode = TYPE_MODE (vectype);
5110
5111   /* vect_permute_load_chain requires the group size to be equal to 3 or
5112      be a power of two.  */
5113   if (count != 3 && exact_log2 (count) == -1)
5114     {
5115       if (dump_enabled_p ())
5116         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5117                          "the size of the group of accesses"
5118                          " is not a power of 2 or not equal to 3\n");
5119       return false;
5120     }
5121
5122   /* Check that the permutation is supported.  */
5123   if (VECTOR_MODE_P (mode))
5124     {
5125       unsigned int i, j, nelt = GET_MODE_NUNITS (mode);
5126       unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5127
5128       if (count == 3)
5129         {
5130           unsigned int k;
5131           for (k = 0; k < 3; k++)
5132             {
                   /* First shuffle: every third element, starting at K, of the
                      two concatenated operands.  Lanes whose index would be
                      2 * nelt or more get a dummy index 0.  */
5133               for (i = 0; i < nelt; i++)
5134                 if (3 * i + k < 2 * nelt)
5135                   sel[i] = 3 * i + k;
5136                 else
5137                   sel[i] = 0;
5138               if (!can_vec_perm_p (mode, false, sel))
5139                 {
5140                   if (dump_enabled_p ())
5141                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5142                                      "shuffle of 3 loads is not supported by"
5143                                      " target\n");
5144                   return false;
5145                 }
                   /* Second shuffle: lanes filled by the first shuffle keep
                      their own index, the remaining lanes select from the
                      second operand.  */
5146               for (i = 0, j = 0; i < nelt; i++)
5147                 if (3 * i + k < 2 * nelt)
5148                   sel[i] = i;
5149                 else
5150                   sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5151               if (!can_vec_perm_p (mode, false, sel))
5152                 {
5153                   if (dump_enabled_p ())
5154                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5155                                      "shuffle of 3 loads is not supported by"
5156                                      " target\n");
5157                   return false;
5158                 }
5159             }
5160           return true;
5161         }
5162       else
5163         {
5164           /* If length is not equal to 3 then only power of 2 is supported.  */
5165           gcc_assert (exact_log2 (count) != -1);
               /* Extract-even mask {0, 2, 4, ...}.  */
5166           for (i = 0; i < nelt; i++)
5167             sel[i] = i * 2;
5168           if (can_vec_perm_p (mode, false, sel))
5169             {
                   /* Extract-odd mask {1, 3, 5, ...}.  */
5170               for (i = 0; i < nelt; i++)
5171                 sel[i] = i * 2 + 1;
5172               if (can_vec_perm_p (mode, false, sel))
5173                 return true;
5174             }
5175         }
5176     }
5177
       /* Reached when MODE is not a vector mode or one of the even/odd masks
          is not supported.  */
5178   if (dump_enabled_p ())
5179     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5180                      "extract even/odd not supported by target\n");
5181   return false;
5182 }
5183
5184 /* Return TRUE if vec_load_lanes is available for COUNT vectors of
5185    type VECTYPE.  The query is delegated to vect_lanes_optab_supported_p
        with vec_load_lanes_optab.  */
5186
5187 bool
5188 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
5189 {
5190   return vect_lanes_optab_supported_p ("vec_load_lanes",
5191                                        vec_load_lanes_optab,
5192                                        vectype, count);
5193 }
5194
5195 /* Function vect_permute_load_chain.
5196
5197    Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5198    a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5199    the input data correctly.  Return the final references for loads in
5200    RESULT_CHAIN.
5201
5202    E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5203    The input is 4 vectors each containing 8 elements. We assign a number to each
5204    element, the input sequence is:
5205
5206    1st vec:   0  1  2  3  4  5  6  7
5207    2nd vec:   8  9 10 11 12 13 14 15
5208    3rd vec:  16 17 18 19 20 21 22 23
5209    4th vec:  24 25 26 27 28 29 30 31
5210
5211    The output sequence should be:
5212
5213    1st vec:  0 4  8 12 16 20 24 28
5214    2nd vec:  1 5  9 13 17 21 25 29
5215    3rd vec:  2 6 10 14 18 22 26 30
5216    4th vec:  3 7 11 15 19 23 27 31
5217
5218    i.e., the first output vector should contain the first elements of each
5219    interleaving group, etc.
5220
5221    We use extract_even/odd instructions to create such output.  The input of
5222    each extract_even/odd operation is two vectors
5223    1st vec    2nd vec
5224    0 1 2 3    4 5 6 7
5225
5226    and the output is the vector of extracted even/odd elements.  The output of
5227    extract_even will be:   0 2 4 6
5228    and of extract_odd:     1 3 5 7
5229
5230
5231    The permutation is done in log LENGTH stages.  In each stage extract_even
5232    and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5233    their order.  In our example,
5234
5235    E1: extract_even (1st vec, 2nd vec)
5236    E2: extract_odd (1st vec, 2nd vec)
5237    E3: extract_even (3rd vec, 4th vec)
5238    E4: extract_odd (3rd vec, 4th vec)
5239
5240    The output for the first stage will be:
5241
5242    E1:  0  2  4  6  8 10 12 14
5243    E2:  1  3  5  7  9 11 13 15
5244    E3: 16 18 20 22 24 26 28 30
5245    E4: 17 19 21 23 25 27 29 31
5246
5247    In order to proceed and create the correct sequence for the next stage (or
5248    for the correct output, if the second stage is the last one, as in our
5249    example), we first put the output of extract_even operation and then the
5250    output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5251    The input for the second stage is:
5252
5253    1st vec (E1):  0  2  4  6  8 10 12 14
5254    2nd vec (E3): 16 18 20 22 24 26 28 30
5255    3rd vec (E2):  1  3  5  7  9 11 13 15
5256    4th vec (E4): 17 19 21 23 25 27 29 31
5257
5258    The output of the second stage:
5259
5260    E1: 0 4  8 12 16 20 24 28
5261    E2: 2 6 10 14 18 22 26 30
5262    E3: 1 5  9 13 17 21 25 29
5263    E4: 3 7 11 15 19 23 27 31
5264
5265    And RESULT_CHAIN after reordering:
5266
5267    1st vec (E1):  0 4  8 12 16 20 24 28
5268    2nd vec (E3):  1 5  9 13 17 21 25 29
5269    3rd vec (E2):  2 6 10 14 18 22 26 30
5270    4th vec (E4):  3 7 11 15 19 23 27 31.  */
5271
5272 static void
5273 vect_permute_load_chain (vec<tree> dr_chain,
5274                          unsigned int length,
5275                          gimple *stmt,
5276                          gimple_stmt_iterator *gsi,
5277                          vec<tree> *result_chain)
5278 {
5279   tree data_ref, first_vect, second_vect;
5280   tree perm_mask_even, perm_mask_odd;
5281   tree perm3_mask_low, perm3_mask_high;
5282   gimple *perm_stmt;
5283   tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5284   unsigned int i, j, log_length = exact_log2 (length);
5285   unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
5286   unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5287
       /* Seed RESULT_CHAIN with a copy of the first LENGTH entries of DR_CHAIN;
          the permuted vectors are stored over these entries below.  */
5288   result_chain->quick_grow (length);
5289   memcpy (result_chain->address (), dr_chain.address (),
5290           length * sizeof (tree));
5291
5292   if (length == 3)
5293     {
5294       unsigned int k;
5295
           /* Output vector K is built with two permutes: first over
              dr_chain[0] and dr_chain[1], then over that result and
              dr_chain[2].  */
5296       for (k = 0; k < 3; k++)
5297         {
               /* Mask for the first permute: every third element, starting at
                  K, of the two concatenated operands.  Lanes whose index would
                  be 2 * nelt or more get a dummy index 0.  */
5298           for (i = 0; i < nelt; i++)
5299             if (3 * i + k < 2 * nelt)
5300               sel[i] = 3 * i + k;
5301             else
5302               sel[i] = 0;
5303           perm3_mask_low = vect_gen_perm_mask_checked (vectype, sel);
5304
               /* Mask for the second permute: lanes filled by the first permute
                  keep their own index, the remaining lanes select from the
                  second operand.  */
5305           for (i = 0, j = 0; i < nelt; i++)
5306             if (3 * i + k < 2 * nelt)
5307               sel[i] = i;
5308             else
5309               sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5310
5311           perm3_mask_high = vect_gen_perm_mask_checked (vectype, sel);
5312
5313           first_vect = dr_chain[0];
5314           second_vect = dr_chain[1];
5315
5316           /* Create interleaving stmt (low part of):
5317              low = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
5318                                                             ...}>  */
5319           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5320           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5321                                            second_vect, perm3_mask_low);
5322           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5323
5324           /* Create interleaving stmt (high part of):
5325              high = VEC_PERM_EXPR <low, dr_chain[2], perm3_mask_high>
5326              i.e. LOW with its unfilled lanes taken from dr_chain[2].  */
5327           first_vect = data_ref;
5328           second_vect = dr_chain[2];
5329           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5330           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5331                                            second_vect, perm3_mask_high);
5332           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5333           (*result_chain)[k] = data_ref;
5334         }
5335     }
5336   else
5337     {
5338       /* If length is not equal to 3 then only power of 2 is supported.  */
5339       gcc_assert (exact_log2 (length) != -1);
5340
           /* Extract-even mask {0, 2, 4, ...}.  */
5341       for (i = 0; i < nelt; ++i)
5342         sel[i] = i * 2;
5343       perm_mask_even = vect_gen_perm_mask_checked (vectype, sel);
5344
           /* Extract-odd mask {1, 3, 5, ...}.  */
5345       for (i = 0; i < nelt; ++i)
5346         sel[i] = i * 2 + 1;
5347       perm_mask_odd = vect_gen_perm_mask_checked (vectype, sel);
5348
           /* Run LOG_LENGTH stages.  Each stage pairs adjacent vectors of
              DR_CHAIN and stores the even results in the first half of
              RESULT_CHAIN and the odd results in the second half.  */
5349       for (i = 0; i < log_length; i++)
5350         {
5351           for (j = 0; j < length; j += 2)
5352             {
5353               first_vect = dr_chain[j];
5354               second_vect = dr_chain[j+1];
5355
5356               /* data_ref = permute_even (first_data_ref, second_data_ref);  */
5357               data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
5358               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5359                                                first_vect, second_vect,
5360                                                perm_mask_even);
5361               vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5362               (*result_chain)[j/2] = data_ref;
5363
5364               /* data_ref = permute_odd (first_data_ref, second_data_ref);  */
5365               data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
5366               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5367                                                first_vect, second_vect,
5368                                                perm_mask_odd);
5369               vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5370               (*result_chain)[j/2+length/2] = data_ref;
5371             }
               /* The vectors produced by this stage are the inputs of the
                  next one.  */
5372           memcpy (dr_chain.address (), result_chain->address (),
5373                   length * sizeof (tree));
5374         }
5375     }
5376 }
5377
5378 /* Function vect_shift_permute_load_chain.
5379
5380    Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
5381    sequence of stmts to reorder the input data accordingly.
5382    Return the final references for loads in RESULT_CHAIN.
5383    Return true if succeeded, false otherwise.
5384
5385    E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
5386    The input is 3 vectors each containing 8 elements.  We assign a
5387    number to each element, the input sequence is:
5388
5389    1st vec:   0  1  2  3  4  5  6  7
5390    2nd vec:   8  9 10 11 12 13 14 15
5391    3rd vec:  16 17 18 19 20 21 22 23
5392
5393    The output sequence should be:
5394
5395    1st vec:  0 3 6  9 12 15 18 21
5396    2nd vec:  1 4 7 10 13 16 19 22
5397    3rd vec:  2 5 8 11 14 17 20 23
5398
5399    We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
5400
5401    First we shuffle all 3 vectors to get correct elements order:
5402
5403    1st vec:  ( 0  3  6) ( 1  4  7) ( 2  5)
5404    2nd vec:  ( 8 11 14) ( 9 12 15) (10 13)
5405    3rd vec:  (16 19 22) (17 20 23) (18 21)
5406
5407    Next we unite and shift vector 3 times:
5408
5409    1st step:
5410      shift right by 6 the concatenation of:
5411      "1st vec" and  "2nd vec"
5412        ( 0  3  6) ( 1  4  7) |( 2  5) _ ( 8 11 14) ( 9 12 15)| (10 13)
5413      "2nd vec" and  "3rd vec"
5414        ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
5415      "3rd vec" and  "1st vec"
5416        (16 19 22) (17 20 23) |(18 21) _ ( 0  3  6) ( 1  4  7)| ( 2  5)
5417                              | New vectors                   |
5418
5419      So that now new vectors are:
5420
5421      1st vec:  ( 2  5) ( 8 11 14) ( 9 12 15)
5422      2nd vec:  (10 13) (16 19 22) (17 20 23)
5423      3rd vec:  (18 21) ( 0  3  6) ( 1  4  7)
5424
5425    2nd step:
5426      shift right by 5 the concatenation of:
5427      "1st vec" and  "3rd vec"
5428        ( 2  5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0  3  6)| ( 1  4  7)
5429      "2nd vec" and  "1st vec"
5430        (10 13) (16 19 22) |(17 20 23) _ ( 2  5) ( 8 11 14)| ( 9 12 15)
5431      "3rd vec" and  "2nd vec"
5432        (18 21) ( 0  3  6) |( 1  4  7) _ (10 13) (16 19 22)| (17 20 23)
5433                           | New vectors                   |
5434
5435      So that now new vectors are:
5436
5437      1st vec:  ( 9 12 15) (18 21) ( 0  3  6)
5438      2nd vec:  (17 20 23) ( 2  5) ( 8 11 14)
5439      3rd vec:  ( 1  4  7) (10 13) (16 19 22) READY
5440
5441    3rd step:
5442      shift right by 5 the concatenation of:
5443      "1st vec" and  "1st vec"
5444        ( 9 12 15) (18 21) |( 0  3  6) _ ( 9 12 15) (18 21)| ( 0  3  6)
5445      shift right by 3 the concatenation of:
5446      "2nd vec" and  "2nd vec"
5447                (17 20 23) |( 2  5) ( 8 11 14) _ (17 20 23)| ( 2  5) ( 8 11 14)
5448                           | New vectors                   |
5449
5450      So that now all vectors are READY:
5451      1st vec:  ( 0  3  6) ( 9 12 15) (18 21)
5452      2nd vec:  ( 2  5) ( 8 11 14) (17 20 23)
5453      3rd vec:  ( 1  4  7) (10 13) (16 19 22)
5454
5455    This algorithm is faster than one in vect_permute_load_chain if:
5456      1.  "shift of a concatenation" is faster than general permutation.
5457          This is usually so.
5458      2.  The TARGET machine can't execute vector instructions in parallel.
5459          This is because each step of the algorithm depends on previous.
5460          The algorithm in vect_permute_load_chain is much more parallel.
5461
5462    The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
5463 */
5464
5465 static bool
5466 vect_shift_permute_load_chain (vec<tree> dr_chain,
5467                                unsigned int length,
5468                                gimple *stmt,
5469                                gimple_stmt_iterator *gsi,
5470                                vec<tree> *result_chain)
5471 {
5472   tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
5473   tree perm2_mask1, perm2_mask2, perm3_mask;
5474   tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
5475   gimple *perm_stmt;
5476
5477   tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5478   unsigned int i;
5479   unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
5480   unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5481   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5482   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5483
5484   result_chain->quick_grow (length);
5485   memcpy (result_chain->address (), dr_chain.address (),
5486           length * sizeof (tree));
5487
5488   if (exact_log2 (length) != -1 && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 4)
5489     {
5490       unsigned int j, log_length = exact_log2 (length);
5491       for (i = 0; i < nelt / 2; ++i)
5492         sel[i] = i * 2;
5493       for (i = 0; i < nelt / 2; ++i)
5494         sel[nelt / 2 + i] = i * 2 + 1;
5495       if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5496         {
5497           if (dump_enabled_p ())
5498             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5499                              "shuffle of 2 fields structure is not \
5500                               supported by target\n");
5501           return false;
5502         }
5503       perm2_mask1 = vect_gen_perm_mask_checked (vectype, sel);
5504
5505       for (i = 0; i < nelt / 2; ++i)
5506         sel[i] = i * 2 + 1;
5507       for (i = 0; i < nelt / 2; ++i)
5508         sel[nelt / 2 + i] = i * 2;
5509       if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5510         {
5511           if (dump_enabled_p ())
5512             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5513                              "shuffle of 2 fields structure is not \
5514                               supported by target\n");
5515           return false;
5516         }
5517       perm2_mask2 = vect_gen_perm_mask_checked (vectype, sel);
5518
5519       /* Generating permutation constant to shift all elements.
5520          For vector length 8 it is {4 5 6 7 8 9 10 11}.  */
5521       for (i = 0; i < nelt; i++)
5522         sel[i] = nelt / 2 + i;
5523       if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5524         {
5525           if (dump_enabled_p ())
5526             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5527                              "shift permutation is not supported by target\n");
5528           return false;
5529         }
5530       shift1_mask = vect_gen_perm_mask_checked (vectype, sel);
5531
5532       /* Generating permutation constant to select vector from 2.
5533          For vector length 8 it is {0 1 2 3 12 13 14 15}.  */
5534       for (i = 0; i < nelt / 2; i++)
5535         sel[i] = i;
5536       for (i = nelt / 2; i < nelt; i++)
5537         sel[i] = nelt + i;
5538       if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5539         {
5540           if (dump_enabled_p ())
5541             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5542                              "select is not supported by target\n");
5543           return false;
5544         }
5545       select_mask = vect_gen_perm_mask_checked (vectype, sel);
5546
5547       for (i = 0; i < log_length; i++)
5548         {
5549           for (j = 0; j < length; j += 2)
5550             {
5551               first_vect = dr_chain[j];
5552               second_vect = dr_chain[j + 1];
5553
5554               data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
5555               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5556                                                first_vect, first_vect,
5557                                                perm2_mask1);
5558               vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5559               vect[0] = data_ref;
5560
5561               data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
5562               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5563                                                second_vect, second_vect,
5564                                                perm2_mask2);
5565               vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5566               vect[1] = data_ref;
5567
5568               data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
5569               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5570                                                vect[0], vect[1], shift1_mask);
5571               vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5572               (*result_chain)[j/2 + length/2] = data_ref;
5573
5574               data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
5575               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5576                                                vect[0], vect[1], select_mask);
5577               vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5578               (*result_chain)[j/2] = data_ref;
5579             }
5580           memcpy (dr_chain.address (), result_chain->address (),
5581                   length * sizeof (tree));
5582         }
5583       return true;
5584     }
5585   if (length == 3 && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 2)
5586     {
5587       unsigned int k = 0, l = 0;
5588
5589       /* Generating permutation constant to get all elements in rigth order.
5590          For vector length 8 it is {0 3 6 1 4 7 2 5}.  */
5591       for (i = 0; i < nelt; i++)
5592         {
5593           if (3 * k + (l % 3) >= nelt)
5594             {
5595               k = 0;
5596               l += (3 - (nelt % 3));
5597             }
5598           sel[i] = 3 * k + (l % 3);
5599           k++;
5600         }
5601       if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5602         {
5603           if (dump_enabled_p ())
5604             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5605                              "shuffle of 3 fields structure is not \
5606                               supported by target\n");
5607           return false;
5608         }
5609       perm3_mask = vect_gen_perm_mask_checked (vectype, sel);
5610
5611       /* Generating permutation constant to shift all elements.
5612          For vector length 8 it is {6 7 8 9 10 11 12 13}.  */
5613       for (i = 0; i < nelt; i++)
5614         sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
5615       if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5616         {
5617           if (dump_enabled_p ())
5618             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5619                              "shift permutation is not supported by target\n");
5620           return false;
5621         }
5622       shift1_mask = vect_gen_perm_mask_checked (vectype, sel);
5623
5624       /* Generating permutation constant to shift all elements.
5625          For vector length 8 it is {5 6 7 8 9 10 11 12}.  */
5626       for (i = 0; i < nelt; i++)
5627         sel[i] = 2 * (nelt / 3) + 1 + i;
5628       if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5629         {
5630           if (dump_enabled_p ())
5631             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5632                              "shift permutation is not supported by target\n");
5633           return false;
5634         }
5635       shift2_mask = vect_gen_perm_mask_checked (vectype, sel);
5636
5637       /* Generating permutation constant to shift all elements.
5638          For vector length 8 it is {3 4 5 6 7 8 9 10}.  */
5639       for (i = 0; i < nelt; i++)
5640         sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
5641       if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5642         {
5643           if (dump_enabled_p ())
5644             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5645                              "shift permutation is not supported by target\n");
5646           return false;
5647         }
5648       shift3_mask = vect_gen_perm_mask_checked (vectype, sel);
5649
5650       /* Generating permutation constant to shift all elements.
5651          For vector length 8 it is {5 6 7 8 9 10 11 12}.  */
5652       for (i = 0; i < nelt; i++)
5653         sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
5654       if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5655         {
5656           if (dump_enabled_p ())
5657             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5658                              "shift permutation is not supported by target\n");
5659           return false;
5660         }
5661       shift4_mask = vect_gen_perm_mask_checked (vectype, sel);
5662
5663       for (k = 0; k < 3; k++)
5664         {
5665           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
5666           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5667                                            dr_chain[k], dr_chain[k],
5668                                            perm3_mask);
5669           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5670           vect[k] = data_ref;
5671         }
5672
5673       for (k = 0; k < 3; k++)
5674         {
5675           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
5676           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5677                                            vect[k % 3], vect[(k + 1) % 3],
5678                                            shift1_mask);
5679           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5680           vect_shift[k] = data_ref;
5681         }
5682
5683       for (k = 0; k < 3; k++)
5684         {
5685           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
5686           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5687                                            vect_shift[(4 - k) % 3],
5688                                            vect_shift[(3 - k) % 3],
5689                                            shift2_mask);
5690           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5691           vect[k] = data_ref;
5692         }
5693
5694       (*result_chain)[3 - (nelt % 3)] = vect[2];
5695
5696       data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
5697       perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
5698                                        vect[0], shift3_mask);
5699       vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5700       (*result_chain)[nelt % 3] = data_ref;
5701
5702       data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
5703       perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
5704                                        vect[1], shift4_mask);
5705       vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5706       (*result_chain)[0] = data_ref;
5707       return true;
5708     }
5709   return false;
5710 }
5711
5712 /* Function vect_transform_grouped_load.
5713
5714    Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
5715    to perform their permutation and ascribe the result vectorized statements to
5716    the scalar statements.
5717 */
5718
5719 void
5720 vect_transform_grouped_load (gimple *stmt, vec<tree> dr_chain, int size,
5721                              gimple_stmt_iterator *gsi)
5722 {
5723   machine_mode mode;
5724   vec<tree> result_chain = vNULL;
5725
5726   /* DR_CHAIN contains input data-refs that are a part of the interleaving.
5727      RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
5728      vectors, that are ready for vector computation.  */
5729   result_chain.create (size);
5730
5731   /* If reassociation width for vector type is 2 or greater target machine can
5732      execute 2 or more vector instructions in parallel.  Otherwise try to
5733      get chain for loads group using vect_shift_permute_load_chain.  */
5734   mode = TYPE_MODE (STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)));
5735   if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
5736       || exact_log2 (size) != -1
5737       || !vect_shift_permute_load_chain (dr_chain, size, stmt,
5738                                          gsi, &result_chain))
5739     vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain);
5740   vect_record_grouped_load_vectors (stmt, result_chain);
5741   result_chain.release ();
5742 }
5743
5744 /* RESULT_CHAIN contains the output of a group of grouped loads that were
5745    generated as part of the vectorization of STMT.  Assign the statement
5746    for each vector to the associated scalar statement.  */
5747
5748 void
5749 vect_record_grouped_load_vectors (gimple *stmt, vec<tree> result_chain)
5750 {
5751   gimple *first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
5752   gimple *next_stmt, *new_stmt;
5753   unsigned int i, gap_count;
5754   tree tmp_data_ref;
5755
5756   /* Put a permuted data-ref in the VECTORIZED_STMT field.
5757      Since we scan the chain starting from it's first node, their order
5758      corresponds the order of data-refs in RESULT_CHAIN.  */
5759   next_stmt = first_stmt;
5760   gap_count = 1;
5761   FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
5762     {
5763       if (!next_stmt)
5764         break;
5765
5766       /* Skip the gaps.  Loads created for the gaps will be removed by dead
5767        code elimination pass later.  No need to check for the first stmt in
5768        the group, since it always exists.
5769        GROUP_GAP is the number of steps in elements from the previous
5770        access (if there is no gap GROUP_GAP is 1).  We skip loads that
5771        correspond to the gaps.  */
5772       if (next_stmt != first_stmt
5773           && gap_count < GROUP_GAP (vinfo_for_stmt (next_stmt)))
5774       {
5775         gap_count++;
5776         continue;
5777       }
5778
5779       while (next_stmt)
5780         {
5781           new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
5782           /* We assume that if VEC_STMT is not NULL, this is a case of multiple
5783              copies, and we put the new vector statement in the first available
5784              RELATED_STMT.  */
5785           if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
5786             STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
5787           else
5788             {
5789               if (!GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5790                 {
5791                   gimple *prev_stmt =
5792                     STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
5793                   gimple *rel_stmt =
5794                     STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt));
5795                   while (rel_stmt)
5796                     {
5797                       prev_stmt = rel_stmt;
5798                       rel_stmt =
5799                         STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
5800                     }
5801
5802                   STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) =
5803                     new_stmt;
5804                 }
5805             }
5806
5807           next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
5808           gap_count = 1;
5809           /* If NEXT_STMT accesses the same DR as the previous statement,
5810              put the same TMP_DATA_REF as its vectorized statement; otherwise
5811              get the next data-ref from RESULT_CHAIN.  */
5812           if (!next_stmt || !GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5813             break;
5814         }
5815     }
5816 }
5817
5818 /* Function vect_force_dr_alignment_p.
5819
5820    Returns whether the alignment of a DECL can be forced to be aligned
5821    on ALIGNMENT bit boundary.  */
5822
5823 bool
5824 vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
5825 {
5826   if (TREE_CODE (decl) != VAR_DECL)
5827     return false;
5828
5829   if (decl_in_symtab_p (decl)
5830       && !symtab_node::get (decl)->can_increase_alignment_p ())
5831     return false;
5832
5833   if (TREE_STATIC (decl))
5834     return (alignment <= MAX_OFILE_ALIGNMENT);
5835   else
5836     return (alignment <= MAX_STACK_ALIGNMENT);
5837 }
5838
5839
5840 /* Return whether the data reference DR is supported with respect to its
5841    alignment.
5842    If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
5843    it is aligned, i.e., check if it is possible to vectorize it with different
5844    alignment.  */
5845
5846 enum dr_alignment_support
5847 vect_supportable_dr_alignment (struct data_reference *dr,
5848                                bool check_aligned_accesses)
5849 {
5850   gimple *stmt = DR_STMT (dr);
5851   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5852   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5853   machine_mode mode = TYPE_MODE (vectype);
5854   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5855   struct loop *vect_loop = NULL;
5856   bool nested_in_vect_loop = false;
5857
5858   if (aligned_access_p (dr) && !check_aligned_accesses)
5859     return dr_aligned;
5860
5861   /* For now assume all conditional loads/stores support unaligned
5862      access without any special code.  */
5863   if (is_gimple_call (stmt)
5864       && gimple_call_internal_p (stmt)
5865       && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
5866           || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
5867     return dr_unaligned_supported;
5868
5869   if (loop_vinfo)
5870     {
5871       vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
5872       nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
5873     }
5874
5875   /* Possibly unaligned access.  */
5876
5877   /* We can choose between using the implicit realignment scheme (generating
5878      a misaligned_move stmt) and the explicit realignment scheme (generating
5879      aligned loads with a REALIGN_LOAD).  There are two variants to the
5880      explicit realignment scheme: optimized, and unoptimized.
5881      We can optimize the realignment only if the step between consecutive
5882      vector loads is equal to the vector size.  Since the vector memory
5883      accesses advance in steps of VS (Vector Size) in the vectorized loop, it
5884      is guaranteed that the misalignment amount remains the same throughout the
5885      execution of the vectorized loop.  Therefore, we can create the
5886      "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
5887      at the loop preheader.
5888
5889      However, in the case of outer-loop vectorization, when vectorizing a
5890      memory access in the inner-loop nested within the LOOP that is now being
5891      vectorized, while it is guaranteed that the misalignment of the
5892      vectorized memory access will remain the same in different outer-loop
5893      iterations, it is *not* guaranteed that is will remain the same throughout
5894      the execution of the inner-loop.  This is because the inner-loop advances
5895      with the original scalar step (and not in steps of VS).  If the inner-loop
5896      step happens to be a multiple of VS, then the misalignment remains fixed
5897      and we can use the optimized realignment scheme.  For example:
5898
5899       for (i=0; i<N; i++)
5900         for (j=0; j<M; j++)
5901           s += a[i+j];
5902
5903      When vectorizing the i-loop in the above example, the step between
5904      consecutive vector loads is 1, and so the misalignment does not remain
5905      fixed across the execution of the inner-loop, and the realignment cannot
5906      be optimized (as illustrated in the following pseudo vectorized loop):
5907
5908       for (i=0; i<N; i+=4)
5909         for (j=0; j<M; j++){
5910           vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
5911                          // when j is {0,1,2,3,4,5,6,7,...} respectively.
5912                          // (assuming that we start from an aligned address).
5913           }
5914
5915      We therefore have to use the unoptimized realignment scheme:
5916
5917       for (i=0; i<N; i+=4)
5918           for (j=k; j<M; j+=4)
5919           vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
5920                            // that the misalignment of the initial address is
5921                            // 0).
5922
5923      The loop can then be vectorized as follows:
5924
5925       for (k=0; k<4; k++){
5926         rt = get_realignment_token (&vp[k]);
5927         for (i=0; i<N; i+=4){
5928           v1 = vp[i+k];
5929           for (j=k; j<M; j+=4){
5930             v2 = vp[i+j+VS-1];
5931             va = REALIGN_LOAD <v1,v2,rt>;
5932             vs += va;
5933             v1 = v2;
5934           }
5935         }
5936     } */
5937
5938   if (DR_IS_READ (dr))
5939     {
5940       bool is_packed = false;
5941       tree type = (TREE_TYPE (DR_REF (dr)));
5942
5943       if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
5944           && (!targetm.vectorize.builtin_mask_for_load
5945               || targetm.vectorize.builtin_mask_for_load ()))
5946         {
5947           tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5948           if ((nested_in_vect_loop
5949                && (TREE_INT_CST_LOW (DR_STEP (dr))
5950                    != GET_MODE_SIZE (TYPE_MODE (vectype))))
5951               || !loop_vinfo)
5952             return dr_explicit_realign;
5953           else
5954             return dr_explicit_realign_optimized;
5955         }
5956       if (!known_alignment_for_access_p (dr))
5957         is_packed = not_size_aligned (DR_REF (dr));
5958
5959       if ((TYPE_USER_ALIGN (type) && !is_packed)
5960           || targetm.vectorize.
5961                support_vector_misalignment (mode, type,
5962                                             DR_MISALIGNMENT (dr), is_packed))
5963         /* Can't software pipeline the loads, but can at least do them.  */
5964         return dr_unaligned_supported;
5965     }
5966   else
5967     {
5968       bool is_packed = false;
5969       tree type = (TREE_TYPE (DR_REF (dr)));
5970
5971       if (!known_alignment_for_access_p (dr))
5972         is_packed = not_size_aligned (DR_REF (dr));
5973
5974      if ((TYPE_USER_ALIGN (type) && !is_packed)
5975          || targetm.vectorize.
5976               support_vector_misalignment (mode, type,
5977                                            DR_MISALIGNMENT (dr), is_packed))
5978        return dr_unaligned_supported;
5979     }
5980
5981   /* Unsupported.  */
5982   return dr_unaligned_unsupported;
5983 }