gcc/tree-vect-data-refs.c

   1 /* Data References Analysis and Manipulation Utilities for Vectorization.
   2    Copyright (C) 2003-2014 Free Software Foundation, Inc.
   3    Contributed by Dorit Naishlos <dorit@il.ibm.com>
   4    and Ira Rosen <irar@il.ibm.com>
   5
   6 This file is part of GCC.
   7
   8 GCC is free software; you can redistribute it and/or modify it under
   9 the terms of the GNU General Public License as published by the Free
  10 Software Foundation; either version 3, or (at your option) any later
  11 version.
  12
  13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  16 for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with GCC; see the file COPYING3.  If not see
  20 <http://www.gnu.org/licenses/>.  */
  21
  22 #include "config.h"
  23 #include "system.h"
  24 #include "coretypes.h"
  25 #include "dumpfile.h"
  26 #include "tm.h"
  27 #include "tree.h"
  28 #include "stor-layout.h"
  29 #include "tm_p.h"
  30 #include "target.h"
  31 #include "predict.h"
  32 #include "vec.h"
  33 #include "hashtab.h"
  34 #include "hash-set.h"
  35 #include "machmode.h"
  36 #include "hard-reg-set.h"
  37 #include "input.h"
  38 #include "function.h"
  39 #include "dominance.h"
  40 #include "cfg.h"
  41 #include "basic-block.h"
  42 #include "gimple-pretty-print.h"
  43 #include "tree-ssa-alias.h"
  44 #include "internal-fn.h"
  45 #include "tree-eh.h"
  46 #include "gimple-expr.h"
  47 #include "is-a.h"
  48 #include "gimple.h"
  49 #include "gimplify.h"
  50 #include "gimple-iterator.h"
  51 #include "gimplify-me.h"
  52 #include "gimple-ssa.h"
  53 #include "tree-phinodes.h"
  54 #include "ssa-iterators.h"
  55 #include "stringpool.h"
  56 #include "tree-ssanames.h"
  57 #include "tree-ssa-loop-ivopts.h"
  58 #include "tree-ssa-loop-manip.h"
  59 #include "tree-ssa-loop.h"
  60 #include "dumpfile.h"
  61 #include "cfgloop.h"
  62 #include "tree-chrec.h"
  63 #include "tree-scalar-evolution.h"
  64 #include "tree-vectorizer.h"
  65 #include "diagnostic-core.h"
  66 #include "hash-map.h"
  67 #include "plugin-api.h"
  68 #include "ipa-ref.h"
  69 #include "cgraph.h"
  70 /* Need to include rtl.h, expr.h, etc. for optabs.  */
  71 #include "expr.h"
  72 #include "optabs.h"
  73 #include "builtins.h"
  74 #include "varasm.h"
  75
  76 /* Return true if load- or store-lanes optab OPTAB is implemented for
  77    COUNT vectors of type VECTYPE.  NAME is the name of OPTAB.  */
  78
  79 static bool
  80 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
  81                               tree vectype, unsigned HOST_WIDE_INT count)
  82 {
  83   machine_mode mode, array_mode;
  84   bool limit_p;
  85
  86   mode = TYPE_MODE (vectype);
  87   limit_p = !targetm.array_mode_supported_p (mode, count);
  88   array_mode = mode_for_size (count * GET_MODE_BITSIZE (mode),
  89                               MODE_INT, limit_p);
  90
  91   if (array_mode == BLKmode)
  92     {
  93       if (dump_enabled_p ())
  94         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
  95                          "no array mode for %s[" HOST_WIDE_INT_PRINT_DEC "]\n",
  96                          GET_MODE_NAME (mode), count);
  97       return false;
  98     }
  99
 100   if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
 101     {
 102       if (dump_enabled_p ())
 103         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 104                          "cannot use %s<%s><%s>\n", name,
 105                          GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
 106       return false;
 107     }
 108
 109   if (dump_enabled_p ())
 110     dump_printf_loc (MSG_NOTE, vect_location,
 111                      "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
 112                      GET_MODE_NAME (mode));
 113
 114   return true;
 115 }
 116
 117
 118 /* Return the smallest scalar part of STMT.
 119    This is used to determine the vectype of the stmt.  We generally set the
 120    vectype according to the type of the result (lhs).  For stmts whose
 121    result-type is different than the type of the arguments (e.g., demotion,
 122    promotion), vectype will be reset appropriately (later).  Note that we have
 123    to visit the smallest datatype in this function, because that determines the
 124    VF.  If the smallest datatype in the loop is present only as the rhs of a
 125    promotion operation - we'd miss it.
 126    Such a case, where a variable of this datatype does not appear in the lhs
 127    anywhere in the loop, can only occur if it's an invariant: e.g.:
 128    'int_x = (int) short_inv', which we'd expect to have been optimized away by
 129    invariant motion.  However, we cannot rely on invariant motion to always
 130    take invariants out of the loop, and so in the case of promotion we also
 131    have to check the rhs.
 132    LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
 133    types.  */
 134
 135 tree
 136 vect_get_smallest_scalar_type (gimple stmt, HOST_WIDE_INT *lhs_size_unit,
 137                                HOST_WIDE_INT *rhs_size_unit)
 138 {
 139   tree scalar_type = gimple_expr_type (stmt);
 140   HOST_WIDE_INT lhs, rhs;
 141
 142   lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
 143
 144   if (is_gimple_assign (stmt)
 145       && (gimple_assign_cast_p (stmt)
 146           || gimple_assign_rhs_code (stmt) == WIDEN_MULT_EXPR
 147           || gimple_assign_rhs_code (stmt) == WIDEN_LSHIFT_EXPR
 148           || gimple_assign_rhs_code (stmt) == FLOAT_EXPR))
 149     {
 150       tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
 151
 152       rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
 153       if (rhs < lhs)
 154         scalar_type = rhs_type;
 155     }
 156
 157   *lhs_size_unit = lhs;
 158   *rhs_size_unit = rhs;
 159   return scalar_type;
 160 }
 161
 162
 163 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
 164    tested at run-time.  Return TRUE if DDR was successfully inserted.
 165    Return false if versioning is not supported.  */
 166
 167 static bool
 168 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
 169 {
 170   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 171
 172   if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
 173     return false;
 174
 175   if (dump_enabled_p ())
 176     {
 177       dump_printf_loc (MSG_NOTE, vect_location,
 178                        "mark for run-time aliasing test between ");
 179       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr)));
 180       dump_printf (MSG_NOTE,  " and ");
 181       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr)));
 182       dump_printf (MSG_NOTE, "\n");
 183     }
 184
 185   if (optimize_loop_nest_for_size_p (loop))
 186     {
 187       if (dump_enabled_p ())
 188         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 189                          "versioning not supported when optimizing"
 190                          " for size.\n");
 191       return false;
 192     }
 193
 194   /* FORNOW: We don't support versioning with outer-loop vectorization.  */
 195   if (loop->inner)
 196     {
 197       if (dump_enabled_p ())
 198         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 199                          "versioning not yet supported for outer-loops.\n");
 200       return false;
 201     }
 202
 203   /* FORNOW: We don't support creating runtime alias tests for non-constant
 204      step.  */
 205   if (TREE_CODE (DR_STEP (DDR_A (ddr))) != INTEGER_CST
 206       || TREE_CODE (DR_STEP (DDR_B (ddr))) != INTEGER_CST)
 207     {
 208       if (dump_enabled_p ())
 209         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 210                          "versioning not yet supported for non-constant "
 211                          "step\n");
 212       return false;
 213     }
 214
 215   LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
 216   return true;
 217 }
 218
 219
 220 /* Function vect_analyze_data_ref_dependence.
 221
 222    Return TRUE if there (might) exist a dependence between a memory-reference
 223    DRA and a memory-reference DRB.  When versioning for alias may check a
 224    dependence at run-time, return FALSE.  Adjust *MAX_VF according to
 225    the data dependence.  */
 226
 227 static bool
 228 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
 229                                   loop_vec_info loop_vinfo, int *max_vf)
 230 {
 231   unsigned int i;
 232   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 233   struct data_reference *dra = DDR_A (ddr);
 234   struct data_reference *drb = DDR_B (ddr);
 235   stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
 236   stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
 237   lambda_vector dist_v;
 238   unsigned int loop_depth;
 239
 240   /* In loop analysis all data references should be vectorizable.  */
 241   if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
 242       || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
 243     gcc_unreachable ();
 244
 245   /* Independent data accesses.  */
 246   if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
 247     return false;
 248
 249   if (dra == drb
 250       || (DR_IS_READ (dra) && DR_IS_READ (drb)))
 251     return false;
 252
 253   /* Even if we have an anti-dependence then, as the vectorized loop covers at
 254      least two scalar iterations, there is always also a true dependence.
 255      As the vectorizer does not re-order loads and stores we can ignore
 256      the anti-dependence if TBAA can disambiguate both DRs similar to the
 257      case with known negative distance anti-dependences (positive
 258      distance anti-dependences would violate TBAA constraints).  */
 259   if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
 260        || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
 261       && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
 262                                  get_alias_set (DR_REF (drb))))
 263     return false;
 264
 265   /* Unknown data dependence.  */
 266   if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
 267     {
 268       /* If user asserted safelen consecutive iterations can be
 269          executed concurrently, assume independence.  */
 270       if (loop->safelen >= 2)
 271         {
 272           if (loop->safelen < *max_vf)
 273             *max_vf = loop->safelen;
 274           LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
 275           return false;
 276         }
 277
 278       if (STMT_VINFO_GATHER_P (stmtinfo_a)
 279           || STMT_VINFO_GATHER_P (stmtinfo_b))
 280         {
 281           if (dump_enabled_p ())
 282             {
 283               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 284                                "versioning for alias not supported for: "
 285                                "can't determine dependence between ");
 286               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 287                                  DR_REF (dra));
 288               dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
 289               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 290                                  DR_REF (drb));
 291               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 292             }
 293           return true;
 294         }
 295
 296       if (dump_enabled_p ())
 297         {
 298           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 299                            "versioning for alias required: "
 300                            "can't determine dependence between ");
 301           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 302                              DR_REF (dra));
 303           dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
 304           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 305                              DR_REF (drb));
 306           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 307         }
 308
 309       /* Add to list of ddrs that need to be tested at run-time.  */
 310       return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
 311     }
 312
 313   /* Known data dependence.  */
 314   if (DDR_NUM_DIST_VECTS (ddr) == 0)
 315     {
 316       /* If user asserted safelen consecutive iterations can be
 317          executed concurrently, assume independence.  */
 318       if (loop->safelen >= 2)
 319         {
 320           if (loop->safelen < *max_vf)
 321             *max_vf = loop->safelen;
 322           LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
 323           return false;
 324         }
 325
 326       if (STMT_VINFO_GATHER_P (stmtinfo_a)
 327           || STMT_VINFO_GATHER_P (stmtinfo_b))
 328         {
 329           if (dump_enabled_p ())
 330             {
 331               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 332                                "versioning for alias not supported for: "
 333                                "bad dist vector for ");
 334               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 335                                  DR_REF (dra));
 336               dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
 337               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 338                                  DR_REF (drb));
 339               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 340             }
 341           return true;
 342         }
 343
 344       if (dump_enabled_p ())
 345         {
 346           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 347                            "versioning for alias required: "
 348                            "bad dist vector for ");
 349           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
 350           dump_printf (MSG_MISSED_OPTIMIZATION,  " and ");
 351           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
 352           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 353         }
 354       /* Add to list of ddrs that need to be tested at run-time.  */
 355       return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
 356     }
 357
 358   loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
 359   FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
 360     {
 361       int dist = dist_v[loop_depth];
 362
 363       if (dump_enabled_p ())
 364         dump_printf_loc (MSG_NOTE, vect_location,
 365                          "dependence distance  = %d.\n", dist);
 366
 367       if (dist == 0)
 368         {
 369           if (dump_enabled_p ())
 370             {
 371               dump_printf_loc (MSG_NOTE, vect_location,
 372                                "dependence distance == 0 between ");
 373               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
 374               dump_printf (MSG_NOTE, " and ");
 375               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
 376               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 377             }
 378
 379           /* When we perform grouped accesses and perform implicit CSE
 380              by detecting equal accesses and doing disambiguation with
 381              runtime alias tests like for
 382                 .. = a[i];
 383                 .. = a[i+1];
 384                 a[i] = ..;
 385                 a[i+1] = ..;
 386                 *p = ..;
 387                 .. = a[i];
 388                 .. = a[i+1];
 389              where we will end up loading { a[i], a[i+1] } once, make
 390              sure that inserting group loads before the first load and
 391              stores after the last store will do the right thing.
 392              Similar for groups like
 393                 a[i] = ...;
 394                 ... = a[i];
 395                 a[i+1] = ...;
 396              where loads from the group interleave with the store.  */
 397           if (STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
 398               || STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
 399             {
 400               gimple earlier_stmt;
 401               earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
 402               if (DR_IS_WRITE
 403                     (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
 404                 {
 405                   if (dump_enabled_p ())
 406                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 407                                      "READ_WRITE dependence in interleaving."
 408                                      "\n");
 409                   return true;
 410                 }
 411             }
 412
 413           continue;
 414         }
 415
 416       if (dist > 0 && DDR_REVERSED_P (ddr))
 417         {
 418           /* If DDR_REVERSED_P the order of the data-refs in DDR was
 419              reversed (to make distance vector positive), and the actual
 420              distance is negative.  */
 421           if (dump_enabled_p ())
 422             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 423                              "dependence distance negative.\n");
 424           /* Record a negative dependence distance to later limit the
 425              amount of stmt copying / unrolling we can perform.
 426              Only need to handle read-after-write dependence.  */
 427           if (DR_IS_READ (drb)
 428               && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
 429                   || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
 430             STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
 431           continue;
 432         }
 433
 434       if (abs (dist) >= 2
 435           && abs (dist) < *max_vf)
 436         {
 437           /* The dependence distance requires reduction of the maximal
 438              vectorization factor.  */
 439           *max_vf = abs (dist);
 440           if (dump_enabled_p ())
 441             dump_printf_loc (MSG_NOTE, vect_location,
 442                              "adjusting maximal vectorization factor to %i\n",
 443                              *max_vf);
 444         }
 445
 446       if (abs (dist) >= *max_vf)
 447         {
 448           /* Dependence distance does not create dependence, as far as
 449              vectorization is concerned, in this case.  */
 450           if (dump_enabled_p ())
 451             dump_printf_loc (MSG_NOTE, vect_location,
 452                              "dependence distance >= VF.\n");
 453           continue;
 454         }
 455
 456       if (dump_enabled_p ())
 457         {
 458           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 459                        "not vectorized, possible dependence "
 460                        "between data-refs ");
 461           dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
 462           dump_printf (MSG_NOTE,  " and ");
 463           dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
 464           dump_printf (MSG_NOTE,  "\n");
 465         }
 466
 467       return true;
 468     }
 469
 470   return false;
 471 }
 472
 473 /* Function vect_analyze_data_ref_dependences.
 474
 475    Examine all the data references in the loop, and make sure there do not
 476    exist any data dependences between them.  Set *MAX_VF according to
 477    the maximum vectorization factor the data dependences allow.  */
 478
 479 bool
 480 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo, int *max_vf)
 481 {
 482   unsigned int i;
 483   struct data_dependence_relation *ddr;
 484
 485   if (dump_enabled_p ())
 486     dump_printf_loc (MSG_NOTE, vect_location,
 487                      "=== vect_analyze_data_ref_dependences ===\n");
 488
 489   LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
 490   if (!compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
 491                                 &LOOP_VINFO_DDRS (loop_vinfo),
 492                                 LOOP_VINFO_LOOP_NEST (loop_vinfo), true))
 493     return false;
 494
 495   FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
 496     if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf))
 497       return false;
 498
 499   return true;
 500 }
 501
 502
 503 /* Function vect_slp_analyze_data_ref_dependence.
 504
 505    Return TRUE if there (might) exist a dependence between a memory-reference
 506    DRA and a memory-reference DRB.  When versioning for alias may check a
 507    dependence at run-time, return FALSE.  Adjust *MAX_VF according to
 508    the data dependence.  */
 509
 510 static bool
 511 vect_slp_analyze_data_ref_dependence (struct data_dependence_relation *ddr)
 512 {
 513   struct data_reference *dra = DDR_A (ddr);
 514   struct data_reference *drb = DDR_B (ddr);
 515
 516   /* We need to check dependences of statements marked as unvectorizable
 517      as well, they still can prohibit vectorization.  */
 518
 519   /* Independent data accesses.  */
 520   if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
 521     return false;
 522
 523   if (dra == drb)
 524     return false;
 525
 526   /* Read-read is OK.  */
 527   if (DR_IS_READ (dra) && DR_IS_READ (drb))
 528     return false;
 529
 530   /* If dra and drb are part of the same interleaving chain consider
 531      them independent.  */
 532   if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (DR_STMT (dra)))
 533       && (GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dra)))
 534           == GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (drb)))))
 535     return false;
 536
 537   /* Unknown data dependence.  */
 538   if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
 539     {
 540       if  (dump_enabled_p ())
 541         {
 542           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 543                            "can't determine dependence between ");
 544           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
 545           dump_printf (MSG_MISSED_OPTIMIZATION,  " and ");
 546           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
 547           dump_printf (MSG_MISSED_OPTIMIZATION,  "\n");
 548         }
 549     }
 550   else if (dump_enabled_p ())
 551     {
 552       dump_printf_loc (MSG_NOTE, vect_location,
 553                        "determined dependence between ");
 554       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
 555       dump_printf (MSG_NOTE, " and ");
 556       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
 557       dump_printf (MSG_NOTE,  "\n");
 558     }
 559
 560   /* We do not vectorize basic blocks with write-write dependencies.  */
 561   if (DR_IS_WRITE (dra) && DR_IS_WRITE (drb))
 562     return true;
 563
 564   /* If we have a read-write dependence check that the load is before the store.
 565      When we vectorize basic blocks, vector load can be only before
 566      corresponding scalar load, and vector store can be only after its
 567      corresponding scalar store.  So the order of the acceses is preserved in
 568      case the load is before the store.  */
 569   gimple earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
 570   if (DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
 571     {
 572       /* That only holds for load-store pairs taking part in vectorization.  */
 573       if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dra)))
 574           && STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (drb))))
 575         return false;
 576     }
 577
 578   return true;
 579 }
 580
 581
 582 /* Function vect_analyze_data_ref_dependences.
 583
 584    Examine all the data references in the basic-block, and make sure there
 585    do not exist any data dependences between them.  Set *MAX_VF according to
 586    the maximum vectorization factor the data dependences allow.  */
 587
 588 bool
 589 vect_slp_analyze_data_ref_dependences (bb_vec_info bb_vinfo)
 590 {
 591   struct data_dependence_relation *ddr;
 592   unsigned int i;
 593
 594   if (dump_enabled_p ())
 595     dump_printf_loc (MSG_NOTE, vect_location,
 596                      "=== vect_slp_analyze_data_ref_dependences ===\n");
 597
 598   if (!compute_all_dependences (BB_VINFO_DATAREFS (bb_vinfo),
 599                                 &BB_VINFO_DDRS (bb_vinfo),
 600                                 vNULL, true))
 601     return false;
 602
 603   FOR_EACH_VEC_ELT (BB_VINFO_DDRS (bb_vinfo), i, ddr)
 604     if (vect_slp_analyze_data_ref_dependence (ddr))
 605       return false;
 606
 607   return true;
 608 }
 609
 610
 611 /* Function vect_compute_data_ref_alignment
 612
 613    Compute the misalignment of the data reference DR.
 614
 615    Output:
 616    1. If during the misalignment computation it is found that the data reference
 617       cannot be vectorized then false is returned.
 618    2. DR_MISALIGNMENT (DR) is defined.
 619
 620    FOR NOW: No analysis is actually performed. Misalignment is calculated
 621    only for trivial cases. TODO.  */
 622
 623 static bool
 624 vect_compute_data_ref_alignment (struct data_reference *dr)
 625 {
 626   gimple stmt = DR_STMT (dr);
 627   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
 628   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
 629   struct loop *loop = NULL;
 630   tree ref = DR_REF (dr);
 631   tree vectype;
 632   tree base, base_addr;
 633   bool base_aligned;
 634   tree misalign;
 635   tree aligned_to, alignment;
 636
 637   if (dump_enabled_p ())
 638     dump_printf_loc (MSG_NOTE, vect_location,
 639                      "vect_compute_data_ref_alignment:\n");
 640
 641   if (loop_vinfo)
 642     loop = LOOP_VINFO_LOOP (loop_vinfo);
 643
 644   /* Initialize misalignment to unknown.  */
 645   SET_DR_MISALIGNMENT (dr, -1);
 646
 647   /* Strided loads perform only component accesses, misalignment information
 648      is irrelevant for them.  */
 649   if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
 650     return true;
 651
 652   misalign = DR_INIT (dr);
 653   aligned_to = DR_ALIGNED_TO (dr);
 654   base_addr = DR_BASE_ADDRESS (dr);
 655   vectype = STMT_VINFO_VECTYPE (stmt_info);
 656
 657   /* In case the dataref is in an inner-loop of the loop that is being
 658      vectorized (LOOP), we use the base and misalignment information
 659      relative to the outer-loop (LOOP).  This is ok only if the misalignment
 660      stays the same throughout the execution of the inner-loop, which is why
 661      we have to check that the stride of the dataref in the inner-loop evenly
 662      divides by the vector size.  */
 663   if (loop && nested_in_vect_loop_p (loop, stmt))
 664     {
 665       tree step = DR_STEP (dr);
 666       HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
 667
 668       if (dr_step % GET_MODE_SIZE (TYPE_MODE (vectype)) == 0)
 669         {
 670           if (dump_enabled_p ())
 671             dump_printf_loc (MSG_NOTE, vect_location,
 672                              "inner step divides the vector-size.\n");
 673           misalign = STMT_VINFO_DR_INIT (stmt_info);
 674           aligned_to = STMT_VINFO_DR_ALIGNED_TO (stmt_info);
 675           base_addr = STMT_VINFO_DR_BASE_ADDRESS (stmt_info);
 676         }
 677       else
 678         {
 679           if (dump_enabled_p ())
 680             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 681                              "inner step doesn't divide the vector-size.\n");
 682           misalign = NULL_TREE;
 683         }
 684     }
 685
 686   /* Similarly, if we're doing basic-block vectorization, we can only use
 687      base and misalignment information relative to an innermost loop if the
 688      misalignment stays the same throughout the execution of the loop.
 689      As above, this is the case if the stride of the dataref evenly divides
 690      by the vector size.  */
 691   if (!loop)
 692     {
 693       tree step = DR_STEP (dr);
 694       HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
 695
 696       if (dr_step % GET_MODE_SIZE (TYPE_MODE (vectype)) != 0)
 697         {
 698           if (dump_enabled_p ())
 699             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 700                              "SLP: step doesn't divide the vector-size.\n");
 701           misalign = NULL_TREE;
 702         }
 703     }
 704
 705   base = build_fold_indirect_ref (base_addr);
 706   alignment = ssize_int (TYPE_ALIGN (vectype)/BITS_PER_UNIT);
 707
 708   if ((aligned_to && tree_int_cst_compare (aligned_to, alignment) < 0)
 709       || !misalign)
 710     {
 711       if (dump_enabled_p ())
 712         {
 713           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 714                            "Unknown alignment for access: ");
 715           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, base);
 716           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 717         }
 718       return true;
 719     }
 720
 721   if ((DECL_P (base)
 722        && tree_int_cst_compare (ssize_int (DECL_ALIGN_UNIT (base)),
 723                                 alignment) >= 0)
 724       || (TREE_CODE (base_addr) == SSA_NAME
 725           && tree_int_cst_compare (ssize_int (TYPE_ALIGN_UNIT (TREE_TYPE (
 726                                                       TREE_TYPE (base_addr)))),
 727                                    alignment) >= 0)
 728       || (get_pointer_alignment (base_addr) >= TYPE_ALIGN (vectype)))
 729     base_aligned = true;
 730   else
 731     base_aligned = false;
 732
 733   if (!base_aligned)
 734     {
 735       /* Do not change the alignment of global variables here if
 736          flag_section_anchors is enabled as we already generated
 737          RTL for other functions.  Most global variables should
 738          have been aligned during the IPA increase_alignment pass.  */
 739       if (!vect_can_force_dr_alignment_p (base, TYPE_ALIGN (vectype))
 740           || (TREE_STATIC (base) && flag_section_anchors))
 741         {
 742           if (dump_enabled_p ())
 743             {
 744               dump_printf_loc (MSG_NOTE, vect_location,
 745                                "can't force alignment of ref: ");
 746               dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
 747               dump_printf (MSG_NOTE, "\n");
 748             }
 749           return true;
 750         }
 751
 752       /* Force the alignment of the decl.
 753          NOTE: This is the only change to the code we make during
 754          the analysis phase, before deciding to vectorize the loop.  */
 755       if (dump_enabled_p ())
 756         {
 757           dump_printf_loc (MSG_NOTE, vect_location, "force alignment of ");
 758           dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
 759           dump_printf (MSG_NOTE, "\n");
 760         }
 761
 762       ((dataref_aux *)dr->aux)->base_decl = base;
 763       ((dataref_aux *)dr->aux)->base_misaligned = true;
 764     }
 765
 766   /* If this is a backward running DR then first access in the larger
 767      vectype actually is N-1 elements before the address in the DR.
 768      Adjust misalign accordingly.  */
 769   if (tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0)
 770     {
 771       tree offset = ssize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1);
 772       /* DR_STEP(dr) is the same as -TYPE_SIZE of the scalar type,
 773          otherwise we wouldn't be here.  */
 774       offset = fold_build2 (MULT_EXPR, ssizetype, offset, DR_STEP (dr));
 775       /* PLUS because DR_STEP was negative.  */
 776       misalign = size_binop (PLUS_EXPR, misalign, offset);
 777     }
 778
 779   /* Modulo alignment.  */
 780   misalign = size_binop (FLOOR_MOD_EXPR, misalign, alignment);
 781
 782   if (!tree_fits_uhwi_p (misalign))
 783     {
 784       /* Negative or overflowed misalignment value.  */
 785       if (dump_enabled_p ())
 786         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 787                          "unexpected misalign value\n");
 788       return false;
 789     }
 790
 791   SET_DR_MISALIGNMENT (dr, tree_to_uhwi (misalign));
 792
 793   if (dump_enabled_p ())
 794     {
 795       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 796                        "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr));
 797       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
 798       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 799     }
 800
 801   return true;
 802 }
 803
 804
 805 /* Function vect_compute_data_refs_alignment
 806
 807    Call vect_compute_data_ref_alignment on each still-vectorizable data
 808    reference of LOOP_VINFO, or of BB_VINFO when LOOP_VINFO is null.  Return
 809    FALSE on the first failure in a loop; for a basic block only mark the
 810    failing statement as not vectorizable and carry on.  */
 811
 812 static bool
 813 vect_compute_data_refs_alignment (loop_vec_info loop_vinfo,
 814                                   bb_vec_info bb_vinfo)
 815 {
 816   vec<data_reference_p> drs = (loop_vinfo
 817                                ? LOOP_VINFO_DATAREFS (loop_vinfo)
 818                                : BB_VINFO_DATAREFS (bb_vinfo));
 819   struct data_reference *cur;
 820   unsigned int idx;
 821
 822   FOR_EACH_VEC_ELT (drs, idx, cur)
 823     {
 824       stmt_vec_info info = vinfo_for_stmt (DR_STMT (cur));
 825
 826       /* Skip statements already given up on and successful computations.  */
 827       if (!STMT_VINFO_VECTORIZABLE (info)
 828           || vect_compute_data_ref_alignment (cur))
 829         continue;
 830       if (!bb_vinfo)
 831         return false;
 832
 833       /* Basic-block case: give up on this statement only.  */
 834       STMT_VINFO_VECTORIZABLE (info) = false;
 835     }
 836
 837   return true;
 838 }
 839
 840
 841 /* Function vect_update_misalignment_for_peel
 842
 843    DR - the data reference whose misalignment is to be adjusted.
 844    DR_PEEL - the data reference that the peel makes aligned in the
 845              vector loop.
 846    NPEEL - the number of peeled iterations, meaningful when the
 847            misalignment of DR_PEEL is known at compile time.  */
 848
 849 static void
 850 vect_update_misalignment_for_peel (struct data_reference *dr,
 851                                    struct data_reference *dr_peel, int npeel)
 852 {
 853   stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
 854   stmt_vec_info peel_stmt_info = vinfo_for_stmt (DR_STMT (dr_peel));
 855   int dr_bytes = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
 856   int peel_bytes = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr_peel))));
 857   vec<dr_p> peers = STMT_VINFO_SAME_ALIGN_REFS (peel_stmt_info);
 858   struct data_reference *peer;
 859   unsigned int ix;
 860
 861   /* For interleaved data accesses the step in the loop must be multiplied
 862      by the size of the interleaving group.  */
 863   if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
 864     dr_bytes *= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)));
 865   if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info))
 866     peel_bytes *= GROUP_SIZE (peel_stmt_info);
 867
 868   /* Data refs recorded as having the same alignment as DR_PEEL are
 869      assumed to be aligned in the vector loop.  */
 870   FOR_EACH_VEC_ELT (peers, ix, peer)
 871     if (peer == dr)
 872       {
 873         gcc_assert (DR_MISALIGNMENT (dr) / dr_bytes
 874                     == DR_MISALIGNMENT (dr_peel) / peel_bytes);
 875         SET_DR_MISALIGNMENT (dr, 0);
 876         return;
 877       }
 878
 879   if (!known_alignment_for_access_p (dr)
 880       || !known_alignment_for_access_p (dr_peel))
 881     {
 882       /* Without both misalignments known, DR's is recorded as -1.  */
 883       if (dump_enabled_p ())
 884         dump_printf_loc (MSG_NOTE, vect_location,
 885                          "Setting misalignment to -1.\n");
 886       SET_DR_MISALIGNMENT (dr, -1);
 887       return;
 888     }
 889
 890   /* Both known: move DR by NPEEL steps in its direction of travel, then
 891      mask with (TYPE_ALIGN / BITS_PER_UNIT) - 1 of DR's vector type.  */
 892   bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
 893   int misal = DR_MISALIGNMENT (dr);
 894   misal += negative ? -npeel * dr_bytes : npeel * dr_bytes;
 895   misal &= (TYPE_ALIGN (STMT_VINFO_VECTYPE (stmt_info)) / BITS_PER_UNIT) - 1;
 896   SET_DR_MISALIGNMENT (dr, misal);
 897 }
 898
 899
 900 /* Function vect_verify_datarefs_alignment
 901
 902    Return TRUE if all data references in the loop (LOOP_VINFO) or, when
 903    LOOP_VINFO is null, in the basic block (BB_VINFO) can be handled with
 904    respect to alignment.  */
 905
 906 bool
 907 vect_verify_datarefs_alignment (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
 908 {
 909   vec<data_reference_p> drs = (loop_vinfo
 910                                ? LOOP_VINFO_DATAREFS (loop_vinfo)
 911                                : BB_VINFO_DATAREFS (bb_vinfo));
 912   struct data_reference *cur;
 913   unsigned int idx;
 914
 915   FOR_EACH_VEC_ELT (drs, idx, cur)
 916     {
 917       gimple stmt = DR_STMT (cur);
 918       stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
 919       enum dr_alignment_support support;
 920
 921       /* Irrelevant statements and statements marked as not vectorizable
 922          are not checked.  */
 923       if (!STMT_VINFO_RELEVANT_P (stmt_info)
 924           || !STMT_VINFO_VECTORIZABLE (stmt_info))
 925         continue;
 926
 927       /* For interleaving, only the alignment of the first access matters.  */
 928       if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
 929           && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
 930         continue;
 931
 932       /* Strided loads perform only component accesses, alignment is
 933          irrelevant for them.  */
 934       if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
 935         continue;
 936
 937       support = vect_supportable_dr_alignment (cur, false);
 938       if (support)
 939         {
 940           if (support != dr_aligned && dump_enabled_p ())
 941             dump_printf_loc (MSG_NOTE, vect_location,
 942                              "Vectorizing an unaligned access.\n");
 943           continue;
 944         }
 945
 946       /* The access cannot be handled; report it and fail.  */
 947       if (dump_enabled_p ())
 948         {
 949           if (!DR_IS_READ (cur))
 950             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 951                              "not vectorized: unsupported unaligned store.");
 952           else
 953             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 954                              "not vectorized: unsupported unaligned load.");
 955           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (cur));
 956           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 957         }
 958       return false;
 959     }
 960
 961   return true;
 962 }
 963
 964 /* Given a memory reference EXP, return true if its alignment is less than
 965    its size, or if its size does not satisfy tree_fits_uhwi_p.  */
 966
 967 static bool
 968 not_size_aligned (tree exp)
 969 {
 970   tree size = TYPE_SIZE (TREE_TYPE (exp));
 971
 972   /* A size that cannot be converted is reported as not size-aligned.  */
 973   return (!tree_fits_uhwi_p (size)
 974           || tree_to_uhwi (size) > get_object_alignment (exp));
 975 }
 976
 977 /* Function vector_alignment_reachable_p
 978
 979    Return true if vector alignment for DR is reachable by peeling
 980    a few loop iterations.  Return false otherwise.  */
 981
 982 static bool
 983 vector_alignment_reachable_p (struct data_reference *dr)
 984 {
 985   stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
 986   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
 987   bool known = known_alignment_for_access_p (dr);
 988
 989   if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
 990     {
 991       /* FORNOW: handle only known alignment.  */
 992       if (!known)
 993         return false;
 994
 995       /* For interleaved access we peel only if number of iterations in
 996          the prolog loop ({VF - misalignment}), is a multiple of the
 997          number of the interleaved accesses.  */
 998       int nelements = TYPE_VECTOR_SUBPARTS (vectype);
 999       int elem_size = GET_MODE_SIZE (TYPE_MODE (vectype)) / nelements;
1000       int mis_in_elements = DR_MISALIGNMENT (dr) / elem_size;
1001
1002       if ((nelements - mis_in_elements) % GROUP_SIZE (stmt_info) != 0)
1003         return false;
1004     }
1005
1006   if (!known)
1007     {
1008       /* Unknown misalignment: decide from the user alignment of the
1009          accessed type, whether the reference is aligned to less than its
1010          size, and the target hook.  */
1011       tree type = TREE_TYPE (DR_REF (dr));
1012       bool is_packed = not_size_aligned (DR_REF (dr));
1013
1014       if (dump_enabled_p ())
1015         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1016                          "Unknown misalignment, is_packed = %d\n",is_packed);
1017       return ((TYPE_USER_ALIGN (type) && !is_packed)
1018               || targetm.vectorize.vector_alignment_reachable (type,
1019                                                                is_packed));
1020     }
1021
1022   /* If misalignment is known at the compile time then allow peeling
1023      only if natural alignment is reachable through peeling.  */
1024   if (!aligned_access_p (dr))
1025     {
1026       HOST_WIDE_INT elmsize
1027         = int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1028       if (dump_enabled_p ())
1029         {
1030           dump_printf_loc (MSG_NOTE, vect_location,
1031                            "data size =" HOST_WIDE_INT_PRINT_DEC, elmsize);
1032           dump_printf (MSG_NOTE,
1033                        ". misalignment = %d.\n", DR_MISALIGNMENT (dr));
1034         }
1035       if (DR_MISALIGNMENT (dr) % elmsize)
1036         {
1037           if (dump_enabled_p ())
1038             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1039                              "data size does not divide the misalignment.\n");
1040           return false;
1041         }
1042     }
1043
1044   return true;
1045 }
1046
1047
1048 /* Calculate the cost of the memory access represented by DR: a read is
1049    costed with vect_get_load_cost, anything else with vect_get_store_cost.  */
1050
1051 static void
1052 vect_get_data_access_cost (struct data_reference *dr,
1053                            unsigned int *inside_cost,
1054                            unsigned int *outside_cost,
1055                            stmt_vector_for_cost *body_cost_vec)
1056 {
1057   stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
1058   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1059   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1060   int nunits = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
1061   int ncopies = vf / nunits;
1062
1063   if (!DR_IS_READ (dr))
1064     vect_get_store_cost (dr, ncopies, inside_cost, body_cost_vec);
1065   else
1066     vect_get_load_cost (dr, ncopies, true, inside_cost, outside_cost,
1067                         NULL, body_cost_vec, false);
1068
1069   if (dump_enabled_p ())
1070     dump_printf_loc (MSG_NOTE, vect_location,
1071                      "vect_get_data_access_cost: inside_cost = %d, "
1072                      "outside_cost = %d.\n", *inside_cost, *outside_cost);
1073 }
1074
1075
1076 /* Insert DR into peeling hash table with NPEEL as key.  */
1077
1078 static void
1079 vect_peeling_hash_insert (loop_vec_info loop_vinfo, struct data_reference *dr,
1080                           int npeel)
1081 {
1082   bool supported = vect_supportable_dr_alignment (dr, true);
1083   struct _vect_peel_info key;
1084   struct _vect_peel_info *entry;
1085
1086   key.npeel = npeel;
1087   entry = LOOP_VINFO_PEELING_HTAB (loop_vinfo)->find (&key);
1088   if (!entry)
1089     {
1090       entry = XNEW (struct _vect_peel_info);
1091       entry->npeel = npeel;
1092       entry->dr = dr;
1093       entry->count = 1;
1094       *LOOP_VINFO_PEELING_HTAB (loop_vinfo)->find_slot (entry, INSERT)
1095         = entry;
1096     }
1097   else
1098     entry->count++;
1099
1100   /* With the unlimited cost model, add VECT_MAX_COST to the count when
1101      vect_supportable_dr_alignment reports DR as unsupported.  */
1102   if (!supported && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1103     entry->count += VECT_MAX_COST;
1104 }
1105
1106
1107 /* Traverse peeling hash table to find peeling option that aligns maximum
1108    number of data accesses; on equal counts keep the smaller NPEEL.  */
1109
1110 int
1111 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1112                                      _vect_peel_extended_info *max)
1113 {
1114   vect_peel_info cand = *slot;
1115   bool better = cand->count > max->peel_info.count;
1116
1117   if (!better && cand->count == max->peel_info.count)
1118     better = max->peel_info.npeel > cand->npeel;
1119   if (better)
1120     {
1121       max->peel_info.npeel = cand->npeel;
1122       max->peel_info.count = cand->count;
1123       max->peel_info.dr = cand->dr;
1124     }
1125   return 1;
1126 }
1127
1128
1129 /* Traverse peeling hash table and calculate cost for each peeling option.
1130    MIN is updated when the option in SLOT is cheaper than the best so far.  */
1131
1132 int
1133 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1134                                    _vect_peel_extended_info *min)
1135 {
1136   vect_peel_info cand = *slot;
1137   loop_vec_info loop_vinfo
1138     = STMT_VINFO_LOOP_VINFO (vinfo_for_stmt (DR_STMT (cand->dr)));
1139   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1140   stmt_vector_for_cost prologue_cost_vec, body_cost_vec, epilogue_cost_vec;
1141   unsigned int inside_cost = 0, outside_cost = 0, ix;
1142   struct data_reference *dr;
1143   int scalar_iter_cost, unused;
1144
1145   prologue_cost_vec.create (2);
1146   body_cost_vec.create (2);
1147   epilogue_cost_vec.create (2);
1148
1149   /* Cost every access with the misalignment it would have after peeling
1150      CAND->npeel iterations for CAND->dr, then put the recorded value back.  */
1151   FOR_EACH_VEC_ELT (datarefs, ix, dr)
1152     {
1153       gimple stmt = DR_STMT (dr);
1154       stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1155       /* For interleaving, only the alignment of the first access matters.  */
1156       if (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
1157           || GROUP_FIRST_ELEMENT (stmt_info) == stmt)
1158         {
1159           int saved_misalignment = DR_MISALIGNMENT (dr);
1160           vect_update_misalignment_for_peel (dr, cand->dr, cand->npeel);
1161           vect_get_data_access_cost (dr, &inside_cost, &outside_cost,
1162                                      &body_cost_vec);
1163           SET_DR_MISALIGNMENT (dr, saved_misalignment);
1164         }
1165     }
1166
1167   scalar_iter_cost = vect_get_single_scalar_iteration_cost (loop_vinfo);
1168   outside_cost += vect_get_known_peeling_cost (loop_vinfo, cand->npeel,
1169                                                &unused, scalar_iter_cost,
1170                                                &prologue_cost_vec,
1171                                                &epilogue_cost_vec);
1172
1173   /* Prologue and epilogue costs are added to the target model later.  They
1174      depend only on the scalar iteration cost, the final number of peeling
1175      iterations and the number of misaligned statements, so drop them here.  */
1176   prologue_cost_vec.release ();
1177   epilogue_cost_vec.release ();
1178
1179   bool cheaper = (inside_cost < min->inside_cost
1180                   || (inside_cost == min->inside_cost
1181                       && outside_cost < min->outside_cost));
1182   if (!cheaper)
1183     {
1184       body_cost_vec.release ();
1185       return 1;
1186     }
1187   /* New best: MIN takes over BODY_COST_VEC in place of its old vector.  */
1188   min->inside_cost = inside_cost;
1189   min->outside_cost = outside_cost;
1190   min->body_cost_vec.release ();
1191   min->body_cost_vec = body_cost_vec;
1192   min->peel_info.dr = cand->dr;
1193   min->peel_info.npeel = cand->npeel;
1194   return 1;
1195 }
1196
1197
1198 /* Choose best peeling option by traversing peeling hash table and either
1199    choosing an option with the lowest cost (if cost model is enabled) or the
1200    option that aligns as many accesses as possible.  Store its peeling amount
1201    in *NPEEL and its body costs in *BODY_COST_VEC and return its data
1202    reference; if no option is selected return NULL with *NPEEL zero.  */
1203
1204 static struct data_reference *
1205 vect_peeling_hash_choose_best_peeling (loop_vec_info loop_vinfo,
1206                                        unsigned int *npeel,
1207                                        stmt_vector_for_cost *body_cost_vec)
1208 {
1209   struct _vect_peel_extended_info res;
1210
1211   /* Start from a fully initialized "nothing chosen" result.  */
1212   res.peel_info.dr = NULL;
1213   res.peel_info.npeel = 0;
1214   res.peel_info.count = 0;
1215   res.body_cost_vec = stmt_vector_for_cost ();
1216
1217   if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1218     {
1219       res.inside_cost = INT_MAX;
1220       res.outside_cost = INT_MAX;
1221       LOOP_VINFO_PEELING_HTAB (loop_vinfo)->traverse
1222         <_vect_peel_extended_info *, vect_peeling_hash_get_lowest_cost> (&res);
1223     }
1224   else
1225     LOOP_VINFO_PEELING_HTAB (loop_vinfo)->traverse
1226       <_vect_peel_extended_info *, vect_peeling_hash_get_most_frequent> (&res);
1227
1228   *npeel = res.peel_info.npeel;
1229   *body_cost_vec = res.body_cost_vec;
1230   return res.peel_info.dr;
1231 }
1232
1233
1234 /* Function vect_enhance_data_refs_alignment
1235
1236    This pass will use loop versioning and loop peeling in order to enhance
1237    the alignment of data references in the loop.
1238
1239    FOR NOW: we assume that whatever versioning/peeling takes place, only the
1240    original loop is to be vectorized.  Any other loops that are created by
1241    the transformations performed in this pass - are not supposed to be
1242    vectorized.  This restriction will be relaxed.
1243
1244    This pass will require a cost model to guide it whether to apply peeling
1245    or versioning or a combination of the two.  For example, the scheme that
1246    Intel uses when given a loop with several memory accesses, is as follows:
1247    choose one memory access ('p') whose alignment you want to force by doing
1248    peeling.  Then, either (1) generate a loop in which 'p' is aligned and all
1249    other accesses are not necessarily aligned, or (2) use loop versioning to
1250    generate one loop in which all accesses are aligned, and another loop in
1251    which only 'p' is necessarily aligned.
1252
1253    ("Automatic Intra-Register Vectorization for the Intel Architecture",
1254    Aart J.C. Bik, Milind Girkar, Paul M. Grey and Xinmin Tian, International
1255    Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1256
1257    Devising a cost model is the most critical aspect of this work.  It will
1258    guide us on which access to peel for, whether to use loop versioning, how
1259    many versions to create, etc.  The cost model will probably consist of
1260    generic considerations as well as target specific considerations (on
1261    powerpc for example, misaligned stores are more painful than misaligned
1262    loads).
1263
1264    Here are the general steps involved in alignment enhancements:
1265
1266      -- original loop, before alignment analysis:
1267         for (i=0; i<N; i++){
1268           x = q[i];                     # DR_MISALIGNMENT(q) = unknown
1269           p[i] = y;                     # DR_MISALIGNMENT(p) = unknown
1270         }
1271
1272      -- After vect_compute_data_refs_alignment:
1273         for (i=0; i<N; i++){
1274           x = q[i];                     # DR_MISALIGNMENT(q) = 3
1275           p[i] = y;                     # DR_MISALIGNMENT(p) = unknown
1276         }
1277
1278      -- Possibility 1: we do loop versioning:
1279      if (p is aligned) {
1280         for (i=0; i<N; i++){    # loop 1A
1281           x = q[i];                     # DR_MISALIGNMENT(q) = 3
1282           p[i] = y;                     # DR_MISALIGNMENT(p) = 0
1283         }
1284      }
1285      else {
1286         for (i=0; i<N; i++){    # loop 1B
1287           x = q[i];                     # DR_MISALIGNMENT(q) = 3
1288           p[i] = y;                     # DR_MISALIGNMENT(p) = unaligned
1289         }
1290      }
1291
1292      -- Possibility 2: we do loop peeling:
1293      for (i = 0; i < 3; i++){   # (scalar loop, not to be vectorized).
1294         x = q[i];
1295         p[i] = y;
1296      }
1297      for (i = 3; i < N; i++){   # loop 2A
1298         x = q[i];                       # DR_MISALIGNMENT(q) = 0
1299         p[i] = y;                       # DR_MISALIGNMENT(p) = unknown
1300      }
1301
1302      -- Possibility 3: combination of loop peeling and versioning:
1303      for (i = 0; i < 3; i++){   # (scalar loop, not to be vectorized).
1304         x = q[i];
1305         p[i] = y;
1306      }
1307      if (p is aligned) {
1308         for (i = 3; i<N; i++){  # loop 3A
1309           x = q[i];                     # DR_MISALIGNMENT(q) = 0
1310           p[i] = y;                     # DR_MISALIGNMENT(p) = 0
1311         }
1312      }
1313      else {
1314         for (i = 3; i<N; i++){  # loop 3B
1315           x = q[i];                     # DR_MISALIGNMENT(q) = 0
1316           p[i] = y;                     # DR_MISALIGNMENT(p) = unaligned
1317         }
1318      }
1319
1320      These loops are later passed to loop_transform to be vectorized.  The
1321      vectorizer will use the alignment information to guide the transformation
1322      (whether to generate regular loads/stores, or with special handling for
1323      misalignment).  */
1324
1325 bool
1326 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1327 {
1328   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1329   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1330   enum dr_alignment_support supportable_dr_alignment;
1331   struct data_reference *dr0 = NULL, *first_store = NULL;
1332   struct data_reference *dr;
1333   unsigned int i, j;
1334   bool do_peeling = false;
1335   bool do_versioning = false;
1336   bool stat;
1337   gimple stmt;
1338   stmt_vec_info stmt_info;
1339   unsigned int npeel = 0;
1340   bool all_misalignments_unknown = true;
1341   unsigned int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1342   unsigned possible_npeel_number = 1;
1343   tree vectype;
1344   unsigned int nelements, mis, same_align_drs_max = 0;
1345   stmt_vector_for_cost body_cost_vec = stmt_vector_for_cost ();
1346
1347   if (dump_enabled_p ())
1348     dump_printf_loc (MSG_NOTE, vect_location,
1349                      "=== vect_enhance_data_refs_alignment ===\n");
1350
1351   /* While cost model enhancements are expected in the future, the high level
1352      view of the code at this time is as follows:
1353
1354      A) If there is a misaligned access then see if peeling to align
1355         this access can make all data references satisfy
1356         vect_supportable_dr_alignment.  If so, update data structures
1357         as needed and return true.
1358
1359      B) If peeling wasn't possible and there is a data reference with an
1360         unknown misalignment that does not satisfy vect_supportable_dr_alignment
1361         then see if loop versioning checks can be used to make all data
1362         references satisfy vect_supportable_dr_alignment.  If so, update
1363         data structures as needed and return true.
1364
1365      C) If neither peeling nor versioning were successful then return false if
1366         any data reference does not satisfy vect_supportable_dr_alignment.
1367
1368      D) Return true (all data references satisfy vect_supportable_dr_alignment).
1369
1370      Note, Possibility 3 above (which is peeling and versioning together) is not
1371      being done at this time.  */
1372
1373   /* (1) Peeling to force alignment.  */
1374
1375   /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1376      Considerations:
1377      + How many accesses will become aligned due to the peeling
1378      - How many accesses will become unaligned due to the peeling,
1379        and the cost of misaligned accesses.
1380      - The cost of peeling (the extra runtime checks, the increase
1381        in code size).  */
1382
1383   FOR_EACH_VEC_ELT (datarefs, i, dr)
1384     {
1385       stmt = DR_STMT (dr);
1386       stmt_info = vinfo_for_stmt (stmt);
1387
1388       if (!STMT_VINFO_RELEVANT_P (stmt_info))
1389         continue;
1390
1391       /* For interleaving, only the alignment of the first access
1392          matters.  */
1393       if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1394           && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1395         continue;
1396
1397       /* For invariant accesses there is nothing to enhance.  */
1398       if (integer_zerop (DR_STEP (dr)))
1399         continue;
1400
1401       /* Strided loads perform only component accesses, alignment is
1402          irrelevant for them.  */
1403       if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
1404         continue;
1405
1406       supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1407       do_peeling = vector_alignment_reachable_p (dr);
1408       if (do_peeling)
1409         {
1410           if (known_alignment_for_access_p (dr))
1411             {
1412               unsigned int npeel_tmp;
1413               bool negative = tree_int_cst_compare (DR_STEP (dr),
1414                                                     size_zero_node) < 0;
1415
1416               /* Save info about DR in the hash table.  */
1417               if (!LOOP_VINFO_PEELING_HTAB (loop_vinfo))
1418                 LOOP_VINFO_PEELING_HTAB (loop_vinfo)
1419                   = new hash_table<peel_info_hasher> (1);
1420
1421               vectype = STMT_VINFO_VECTYPE (stmt_info);
1422               nelements = TYPE_VECTOR_SUBPARTS (vectype);
1423               mis = DR_MISALIGNMENT (dr) / GET_MODE_SIZE (TYPE_MODE (
1424                                                 TREE_TYPE (DR_REF (dr))));
1425               npeel_tmp = (negative
1426                            ? (mis - nelements) : (nelements - mis))
1427                   & (nelements - 1);
1428
1429               /* For multiple types, it is possible that the bigger type access
1430                  will have more than one peeling option.  E.g., a loop with two
1431                  types: one of size (vector size / 4), and the other one of
1432                  size (vector size / 8).  Vectorization factor will be 8.  If
1433                  both accesses are misaligned by 3, the first one needs one
1434                  scalar iteration to be aligned, and the second one needs 5.
1435                  But the first one will be aligned also by peeling 5 scalar
1436                  iterations, and in that case both accesses will be aligned.
1437                  Hence, except for the immediate peeling amount, we also want
1438                  to try to add full vector size, while we don't exceed
1439                  vectorization factor.
1440                  We do this automatically for cost model, since we calculate
1441                  cost for every peeling option.  */
1442               if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1443                 possible_npeel_number = vf /nelements;
1444
1445               /* Handle the aligned case. We may decide to align some other
1446                  access, making DR unaligned.  */
1447               if (DR_MISALIGNMENT (dr) == 0)
1448                 {
1449                   npeel_tmp = 0;
1450                   if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1451                     possible_npeel_number++;
1452                 }
1453
1454               for (j = 0; j < possible_npeel_number; j++)
1455                 {
1456                   gcc_assert (npeel_tmp <= vf);
1457                   vect_peeling_hash_insert (loop_vinfo, dr, npeel_tmp);
1458                   npeel_tmp += nelements;
1459                 }
1460
1461               all_misalignments_unknown = false;
1462               /* Data-ref that was chosen for the case that all the
1463                  misalignments are unknown is not relevant anymore, since we
1464                  have a data-ref with known alignment.  */
1465               dr0 = NULL;
1466             }
1467           else
1468             {
1469               /* If we don't know any misalignment values, we prefer
1470                  peeling for data-ref that has the maximum number of data-refs
1471                  with the same alignment, unless the target prefers to align
1472                  stores over load.  */
1473               if (all_misalignments_unknown)
1474                 {
1475                   unsigned same_align_drs
1476                     = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1477                   if (!dr0
1478                       || same_align_drs_max < same_align_drs)
1479                     {
1480                       same_align_drs_max = same_align_drs;
1481                       dr0 = dr;
1482                     }
1483                   /* For data-refs with the same number of related
1484                      accesses prefer the one where the misalign
1485                      computation will be invariant in the outermost loop.  */
1486                   else if (same_align_drs_max == same_align_drs)
1487                     {
1488                       struct loop *ivloop0, *ivloop;
1489                       ivloop0 = outermost_invariant_loop_for_expr
1490                           (loop, DR_BASE_ADDRESS (dr0));
1491                       ivloop = outermost_invariant_loop_for_expr
1492                           (loop, DR_BASE_ADDRESS (dr));
1493                       if ((ivloop && !ivloop0)
1494                           || (ivloop && ivloop0
1495                               && flow_loop_nested_p (ivloop, ivloop0)))
1496                         dr0 = dr;
1497                     }
1498
1499                   if (!first_store && DR_IS_WRITE (dr))
1500                     first_store = dr;
1501                 }
1502
1503               /* If there are both known and unknown misaligned accesses in the
1504                  loop, we choose peeling amount according to the known
1505                  accesses.  */
1506               if (!supportable_dr_alignment)
1507                 {
1508                   dr0 = dr;
1509                   if (!first_store && DR_IS_WRITE (dr))
1510                     first_store = dr;
1511                 }
1512             }
1513         }
1514       else
1515         {
1516           if (!aligned_access_p (dr))
1517             {
1518               if (dump_enabled_p ())
1519                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1520                                  "vector alignment may not be reachable\n");
1521               break;
1522             }
1523         }
1524     }
1525
1526   /* Check if we can possibly peel the loop.  */
1527   if (!vect_can_advance_ivs_p (loop_vinfo)
1528       || !slpeel_can_duplicate_loop_p (loop, single_exit (loop)))
1529     do_peeling = false;
1530
1531   /* If we don't know how many times the peeling loop will run
1532      assume it will run VF-1 times and disable peeling if the remaining
1533      iters are less than the vectorization factor.  */
1534   if (do_peeling
1535       && all_misalignments_unknown
1536       && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1537       && (LOOP_VINFO_INT_NITERS (loop_vinfo)
1538           < 2 * (unsigned) LOOP_VINFO_VECT_FACTOR (loop_vinfo) - 1))
1539     do_peeling = false;
1540
1541   if (do_peeling
1542       && all_misalignments_unknown
1543       && vect_supportable_dr_alignment (dr0, false))
1544     {
1545       /* Check if the target requires to prefer stores over loads, i.e., if
1546          misaligned stores are more expensive than misaligned loads (taking
1547          drs with same alignment into account).  */
1548       if (first_store && DR_IS_READ (dr0))
1549         {
1550           unsigned int load_inside_cost = 0, load_outside_cost = 0;
1551           unsigned int store_inside_cost = 0, store_outside_cost = 0;
1552           unsigned int load_inside_penalty = 0, load_outside_penalty = 0;
1553           unsigned int store_inside_penalty = 0, store_outside_penalty = 0;
1554           stmt_vector_for_cost dummy;
1555           dummy.create (2);
1556
1557           vect_get_data_access_cost (dr0, &load_inside_cost, &load_outside_cost,
1558                                      &dummy);
1559           vect_get_data_access_cost (first_store, &store_inside_cost,
1560                                      &store_outside_cost, &dummy);
1561
1562           dummy.release ();
1563
1564           /* Calculate the penalty for leaving FIRST_STORE unaligned (by
1565              aligning the load DR0).  */
1566           load_inside_penalty = store_inside_cost;
1567           load_outside_penalty = store_outside_cost;
1568           for (i = 0;
1569                STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1570                           DR_STMT (first_store))).iterate (i, &dr);
1571                i++)
1572             if (DR_IS_READ (dr))
1573               {
1574                 load_inside_penalty += load_inside_cost;
1575                 load_outside_penalty += load_outside_cost;
1576               }
1577             else
1578               {
1579                 load_inside_penalty += store_inside_cost;
1580                 load_outside_penalty += store_outside_cost;
1581               }
1582
1583           /* Calculate the penalty for leaving DR0 unaligned (by
1584              aligning the FIRST_STORE).  */
1585           store_inside_penalty = load_inside_cost;
1586           store_outside_penalty = load_outside_cost;
1587           for (i = 0;
1588                STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1589                       DR_STMT (dr0))).iterate (i, &dr);
1590                i++)
1591             if (DR_IS_READ (dr))
1592               {
1593                 store_inside_penalty += load_inside_cost;
1594                 store_outside_penalty += load_outside_cost;
1595               }
1596             else
1597               {
1598                 store_inside_penalty += store_inside_cost;
1599                 store_outside_penalty += store_outside_cost;
1600               }
1601
1602           if (load_inside_penalty > store_inside_penalty
1603               || (load_inside_penalty == store_inside_penalty
1604                   && load_outside_penalty > store_outside_penalty))
1605             dr0 = first_store;
1606         }
1607
1608       /* In case there are only loads with different unknown misalignments, use
1609          peeling only if it may help to align other accesses in the loop.  */
1610       if (!first_store
1611           && !STMT_VINFO_SAME_ALIGN_REFS (
1612                   vinfo_for_stmt (DR_STMT (dr0))).length ()
1613           && vect_supportable_dr_alignment (dr0, false)
1614               != dr_unaligned_supported)
1615         do_peeling = false;
1616     }
1617
1618   if (do_peeling && !dr0)
1619     {
1620       /* Peeling is possible, but there is no data access that is not supported
1621          unless aligned. So we try to choose the best possible peeling.  */
1622
1623       /* We should get here only if there are drs with known misalignment.  */
1624       gcc_assert (!all_misalignments_unknown);
1625
1626       /* Choose the best peeling from the hash table.  */
1627       dr0 = vect_peeling_hash_choose_best_peeling (loop_vinfo, &npeel,
1628                                                    &body_cost_vec);
1629       if (!dr0 || !npeel)
1630         do_peeling = false;
1631
1632       /* If peeling by npeel will result in a remaining loop not iterating
1633          enough to be vectorized then do not peel.  */
1634       if (do_peeling
1635           && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1636           && (LOOP_VINFO_INT_NITERS (loop_vinfo)
1637               < LOOP_VINFO_VECT_FACTOR (loop_vinfo) + npeel))
1638         do_peeling = false;
1639     }
1640
1641   if (do_peeling)
1642     {
1643       stmt = DR_STMT (dr0);
1644       stmt_info = vinfo_for_stmt (stmt);
1645       vectype = STMT_VINFO_VECTYPE (stmt_info);
1646       nelements = TYPE_VECTOR_SUBPARTS (vectype);
1647
1648       if (known_alignment_for_access_p (dr0))
1649         {
1650           bool negative = tree_int_cst_compare (DR_STEP (dr0),
1651                                                 size_zero_node) < 0;
1652           if (!npeel)
1653             {
1654               /* Since it's known at compile time, compute the number of
1655                  iterations in the peeled loop (the peeling factor) for use in
1656                  updating DR_MISALIGNMENT values.  The peeling factor is the
1657                  vectorization factor minus the misalignment as an element
1658                  count.  */
1659               mis = DR_MISALIGNMENT (dr0);
1660               mis /= GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr0))));
1661               npeel = ((negative ? mis - nelements : nelements - mis)
1662                        & (nelements - 1));
1663             }
1664
1665           /* For interleaved data access every iteration accesses all the
1666              members of the group, therefore we divide the number of iterations
1667              by the group size.  */
1668           stmt_info = vinfo_for_stmt (DR_STMT (dr0));
1669           if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1670             npeel /= GROUP_SIZE (stmt_info);
1671
1672           if (dump_enabled_p ())
1673             dump_printf_loc (MSG_NOTE, vect_location,
1674                              "Try peeling by %d\n", npeel);
1675         }
1676
1677       /* Ensure that all data refs can be vectorized after the peel.  */
1678       FOR_EACH_VEC_ELT (datarefs, i, dr)
1679         {
1680           int save_misalignment;
1681
1682           if (dr == dr0)
1683             continue;
1684
1685           stmt = DR_STMT (dr);
1686           stmt_info = vinfo_for_stmt (stmt);
1687           /* For interleaving, only the alignment of the first access
1688             matters.  */
1689           if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1690               && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1691             continue;
1692
1693           /* Strided loads perform only component accesses, alignment is
1694              irrelevant for them.  */
1695           if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
1696             continue;
1697
1698           save_misalignment = DR_MISALIGNMENT (dr);
1699           vect_update_misalignment_for_peel (dr, dr0, npeel);
1700           supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1701           SET_DR_MISALIGNMENT (dr, save_misalignment);
1702
1703           if (!supportable_dr_alignment)
1704             {
1705               do_peeling = false;
1706               break;
1707             }
1708         }
1709
1710       if (do_peeling && known_alignment_for_access_p (dr0) && npeel == 0)
1711         {
1712           stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1713           if (!stat)
1714             do_peeling = false;
1715           else
1716             {
1717               body_cost_vec.release ();
1718               return stat;
1719             }
1720         }
1721
1722       if (do_peeling)
1723         {
1724           unsigned max_allowed_peel
1725             = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT);
1726           if (max_allowed_peel != (unsigned)-1)
1727             {
1728               unsigned max_peel = npeel;
1729               if (max_peel == 0)
1730                 {
1731                   gimple dr_stmt = DR_STMT (dr0);
1732                   stmt_vec_info vinfo = vinfo_for_stmt (dr_stmt);
1733                   tree vtype = STMT_VINFO_VECTYPE (vinfo);
1734                   max_peel = TYPE_VECTOR_SUBPARTS (vtype) - 1;
1735                 }
1736               if (max_peel > max_allowed_peel)
1737                 {
1738                   do_peeling = false;
1739                   if (dump_enabled_p ())
1740                     dump_printf_loc (MSG_NOTE, vect_location,
1741                         "Disable peeling, max peels reached: %d\n", max_peel);
1742                 }
1743             }
1744         }
1745
1746       if (do_peeling)
1747         {
1748           stmt_info_for_cost *si;
1749           void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
1750
1751           /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
1752              If the misalignment of DR_i is identical to that of dr0 then set
1753              DR_MISALIGNMENT (DR_i) to zero.  If the misalignment of DR_i and
1754              dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
1755              by the peeling factor times the element size of DR_i (MOD the
1756              vectorization factor times the size).  Otherwise, the
1757              misalignment of DR_i must be set to unknown.  */
1758           FOR_EACH_VEC_ELT (datarefs, i, dr)
1759             if (dr != dr0)
1760               vect_update_misalignment_for_peel (dr, dr0, npeel);
1761
1762           LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0;
1763           if (npeel)
1764             LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
1765           else
1766             LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1767               = DR_MISALIGNMENT (dr0);
1768           SET_DR_MISALIGNMENT (dr0, 0);
1769           if (dump_enabled_p ())
1770             {
1771               dump_printf_loc (MSG_NOTE, vect_location,
1772                                "Alignment of access forced using peeling.\n");
1773               dump_printf_loc (MSG_NOTE, vect_location,
1774                                "Peeling for alignment will be applied.\n");
1775             }
1776           /* We've delayed passing the inside-loop peeling costs to the
1777              target cost model until we were sure peeling would happen.
1778              Do so now.  */
1779           if (body_cost_vec.exists ())
1780             {
1781               FOR_EACH_VEC_ELT (body_cost_vec, i, si)
1782                 {
1783                   struct _stmt_vec_info *stmt_info
1784                     = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
1785                   (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
1786                                         si->misalign, vect_body);
1787                 }
1788               body_cost_vec.release ();
1789             }
1790
1791           stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1792           gcc_assert (stat);
1793           return stat;
1794         }
1795     }
1796
1797   body_cost_vec.release ();
1798
1799   /* (2) Versioning to force alignment.  */
1800
1801   /* Try versioning if:
1802      1) optimize loop for speed
1803      2) there is at least one unsupported misaligned data ref with an unknown
1804         misalignment, and
1805      3) all misaligned data refs with a known misalignment are supported, and
1806      4) the number of runtime alignment checks is within reason.  */
1807
1808   do_versioning =
1809         optimize_loop_nest_for_speed_p (loop)
1810         && (!loop->inner); /* FORNOW */
1811
1812   if (do_versioning)
1813     {
1814       FOR_EACH_VEC_ELT (datarefs, i, dr)
1815         {
1816           stmt = DR_STMT (dr);
1817           stmt_info = vinfo_for_stmt (stmt);
1818
1819           /* For interleaving, only the alignment of the first access
1820              matters.  */
1821           if (aligned_access_p (dr)
1822               || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1823                   && GROUP_FIRST_ELEMENT (stmt_info) != stmt))
1824             continue;
1825
1826           /* Strided loads perform only component accesses, alignment is
1827              irrelevant for them.  */
1828           if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
1829             continue;
1830
1831           supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1832
1833           if (!supportable_dr_alignment)
1834             {
1835               gimple stmt;
1836               int mask;
1837               tree vectype;
1838
1839               if (known_alignment_for_access_p (dr)
1840                   || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
1841                      >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
1842                 {
1843                   do_versioning = false;
1844                   break;
1845                 }
1846
1847               stmt = DR_STMT (dr);
1848               vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
1849               gcc_assert (vectype);
1850
1851               /* The rightmost bits of an aligned address must be zeros.
1852                  Construct the mask needed for this test.  For example,
1853                  GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
1854                  mask must be 15 = 0xf. */
1855               mask = GET_MODE_SIZE (TYPE_MODE (vectype)) - 1;
1856
1857               /* FORNOW: use the same mask to test all potentially unaligned
1858                  references in the loop.  The vectorizer currently supports
1859                  a single vector size, see the reference to
1860                  GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
1861                  vectorization factor is computed.  */
1862               gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
1863                           || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
1864               LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
1865               LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (
1866                       DR_STMT (dr));
1867             }
1868         }
1869
1870       /* Versioning requires at least one misaligned data reference.  */
1871       if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
1872         do_versioning = false;
1873       else if (!do_versioning)
1874         LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1875     }
1876
1877   if (do_versioning)
1878     {
1879       vec<gimple> may_misalign_stmts
1880         = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
1881       gimple stmt;
1882
1883       /* It can now be assumed that the data references in the statements
1884          in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
1885          of the loop being vectorized.  */
1886       FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt)
1887         {
1888           stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1889           dr = STMT_VINFO_DATA_REF (stmt_info);
1890           SET_DR_MISALIGNMENT (dr, 0);
1891           if (dump_enabled_p ())
1892             dump_printf_loc (MSG_NOTE, vect_location,
1893                              "Alignment of access forced using versioning.\n");
1894         }
1895
1896       if (dump_enabled_p ())
1897         dump_printf_loc (MSG_NOTE, vect_location,
1898                          "Versioning for alignment will be applied.\n");
1899
1900       /* Peeling and versioning can't be done together at this time.  */
1901       gcc_assert (! (do_peeling && do_versioning));
1902
1903       stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1904       gcc_assert (stat);
1905       return stat;
1906     }
1907
1908   /* This point is reached if neither peeling nor versioning is being done.  */
1909   gcc_assert (! (do_peeling || do_versioning));
1910
1911   stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1912   return stat;
1913 }
1914
1915
1916 /* Function vect_find_same_alignment_drs.
1917
1918    Update group and alignment relations according to the chosen
1919    vectorization factor.  */
1920
1921 static void
1922 vect_find_same_alignment_drs (struct data_dependence_relation *ddr,
1923                               loop_vec_info loop_vinfo)
1924 {
1925   unsigned int i;
1926   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1927   int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1928   struct data_reference *dra = DDR_A (ddr);
1929   struct data_reference *drb = DDR_B (ddr);
1930   stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
1931   stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
1932   int dra_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dra))));
1933   int drb_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (drb))));
1934   lambda_vector dist_v;
1935   unsigned int loop_depth;
1936
1937   if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
1938     return;
1939
1940   if (dra == drb)
1941     return;
1942
1943   if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
1944     return;
1945
1946   /* Loop-based vectorization and known data dependence.  */
1947   if (DDR_NUM_DIST_VECTS (ddr) == 0)
1948     return;
1949
1950   /* Data-dependence analysis reports a distance vector of zero
1951      for data-references that overlap only in the first iteration
1952      but have different sign step (see PR45764).
1953      So as a sanity check require equal DR_STEP.  */
1954   if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
1955     return;
1956
1957   loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
1958   FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
1959     {
1960       int dist = dist_v[loop_depth];
1961
1962       if (dump_enabled_p ())
1963         dump_printf_loc (MSG_NOTE, vect_location,
1964                          "dependence distance  = %d.\n", dist);
1965
1966       /* Same loop iteration.  */
1967       if (dist == 0
1968           || (dist % vectorization_factor == 0 && dra_size == drb_size))
1969         {
1970           /* Two references with distance zero have the same alignment.  */
1971           STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
1972           STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
1973           if (dump_enabled_p ())
1974             {
1975               dump_printf_loc (MSG_NOTE, vect_location,
1976                                "accesses have the same alignment.\n");
1977               dump_printf (MSG_NOTE,
1978                            "dependence distance modulo vf == 0 between ");
1979               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
1980               dump_printf (MSG_NOTE,  " and ");
1981               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
1982               dump_printf (MSG_NOTE, "\n");
1983             }
1984         }
1985     }
1986 }
1987
1988
1989 /* Function vect_analyze_data_refs_alignment
1990
1991    Analyze the alignment of the data-references in the loop.
1992    Return FALSE if a data reference is found that cannot be vectorized.  */
1993
1994 bool
1995 vect_analyze_data_refs_alignment (loop_vec_info loop_vinfo,
1996                                   bb_vec_info bb_vinfo)
1997 {
1998   if (dump_enabled_p ())
1999     dump_printf_loc (MSG_NOTE, vect_location,
2000                      "=== vect_analyze_data_refs_alignment ===\n");
2001
2002   /* Mark groups of data references with same alignment using
2003      data dependence information.  */
2004   if (loop_vinfo)
2005     {
2006       vec<ddr_p> ddrs = LOOP_VINFO_DDRS (loop_vinfo);
2007       struct data_dependence_relation *ddr;
2008       unsigned int i;
2009
2010       FOR_EACH_VEC_ELT (ddrs, i, ddr)
2011         vect_find_same_alignment_drs (ddr, loop_vinfo);
2012     }
2013
2014   if (!vect_compute_data_refs_alignment (loop_vinfo, bb_vinfo))
2015     {
2016       if (dump_enabled_p ())
2017         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2018                          "not vectorized: can't calculate alignment "
2019                          "for data ref.\n");
2020       return false;
2021     }
2022
2023   return true;
2024 }
2025
2026
2027 /* Analyze groups of accesses: check that DR belongs to a group of
2028    accesses of legal size, step, etc.  Detect gaps, single element
2029    interleaving, and other special cases. Set grouped access info.
2030    Collect groups of strided stores for further use in SLP analysis.  */
2031
2032 static bool
2033 vect_analyze_group_access (struct data_reference *dr)
2034 {
2035   tree step = DR_STEP (dr);
2036   tree scalar_type = TREE_TYPE (DR_REF (dr));
2037   HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2038   gimple stmt = DR_STMT (dr);
2039   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2040   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2041   bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2042   HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2043   HOST_WIDE_INT groupsize, last_accessed_element = 1;
2044   bool slp_impossible = false;
2045   struct loop *loop = NULL;
2046
2047   if (loop_vinfo)
2048     loop = LOOP_VINFO_LOOP (loop_vinfo);
2049
2050   /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2051      size of the interleaving group (including gaps).  */
2052   groupsize = absu_hwi (dr_step) / type_size;
2053
2054   /* Not consecutive access is possible only if it is a part of interleaving.  */
2055   if (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
2056     {
2057       /* Check if it this DR is a part of interleaving, and is a single
2058          element of the group that is accessed in the loop.  */
2059
2060       /* Gaps are supported only for loads. STEP must be a multiple of the type
2061          size.  The size of the group must be a power of 2.  */
2062       if (DR_IS_READ (dr)
2063           && (dr_step % type_size) == 0
2064           && groupsize > 0
2065           && exact_log2 (groupsize) != -1)
2066         {
2067           GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = stmt;
2068           GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2069           if (dump_enabled_p ())
2070             {
2071               dump_printf_loc (MSG_NOTE, vect_location,
2072                                "Detected single element interleaving ");
2073               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2074               dump_printf (MSG_NOTE, " step ");
2075               dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2076               dump_printf (MSG_NOTE, "\n");
2077             }
2078
2079           if (loop_vinfo)
2080             {
2081               if (dump_enabled_p ())
2082                 dump_printf_loc (MSG_NOTE, vect_location,
2083                                  "Data access with gaps requires scalar "
2084                                  "epilogue loop\n");
2085               if (loop->inner)
2086                 {
2087                   if (dump_enabled_p ())
2088                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2089                                      "Peeling for outer loop is not"
2090                                      " supported\n");
2091                   return false;
2092                 }
2093
2094               LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = true;
2095             }
2096
2097           return true;
2098         }
2099
2100       if (dump_enabled_p ())
2101         {
2102           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2103                            "not consecutive access ");
2104           dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2105           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2106         }
2107
2108       if (bb_vinfo)
2109         {
2110           /* Mark the statement as unvectorizable.  */
2111           STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2112           return true;
2113         }
2114
2115       return false;
2116     }
2117
2118   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt)
2119     {
2120       /* First stmt in the interleaving chain. Check the chain.  */
2121       gimple next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
2122       struct data_reference *data_ref = dr;
2123       unsigned int count = 1;
2124       tree prev_init = DR_INIT (data_ref);
2125       gimple prev = stmt;
2126       HOST_WIDE_INT diff, gaps = 0;
2127       unsigned HOST_WIDE_INT count_in_bytes;
2128
2129       while (next)
2130         {
2131           /* Skip same data-refs.  In case that two or more stmts share
2132              data-ref (supported only for loads), we vectorize only the first
2133              stmt, and the rest get their vectorized loads from the first
2134              one.  */
2135           if (!tree_int_cst_compare (DR_INIT (data_ref),
2136                                      DR_INIT (STMT_VINFO_DATA_REF (
2137                                                    vinfo_for_stmt (next)))))
2138             {
2139               if (DR_IS_WRITE (data_ref))
2140                 {
2141                   if (dump_enabled_p ())
2142                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2143                                      "Two store stmts share the same dr.\n");
2144                   return false;
2145                 }
2146
2147               /* For load use the same data-ref load.  */
2148               GROUP_SAME_DR_STMT (vinfo_for_stmt (next)) = prev;
2149
2150               prev = next;
2151               next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2152               continue;
2153             }
2154
2155           prev = next;
2156           data_ref = STMT_VINFO_DATA_REF (vinfo_for_stmt (next));
2157
2158           /* All group members have the same STEP by construction.  */
2159           gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2160
2161           /* Check that the distance between two accesses is equal to the type
2162              size. Otherwise, we have gaps.  */
2163           diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2164                   - TREE_INT_CST_LOW (prev_init)) / type_size;
2165           if (diff != 1)
2166             {
2167               /* FORNOW: SLP of accesses with gaps is not supported.  */
2168               slp_impossible = true;
2169               if (DR_IS_WRITE (data_ref))
2170                 {
2171                   if (dump_enabled_p ())
2172                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2173                                      "interleaved store with gaps\n");
2174                   return false;
2175                 }
2176
2177               gaps += diff - 1;
2178             }
2179
2180           last_accessed_element += diff;
2181
2182           /* Store the gap from the previous member of the group. If there is no
2183              gap in the access, GROUP_GAP is always 1.  */
2184           GROUP_GAP (vinfo_for_stmt (next)) = diff;
2185
2186           prev_init = DR_INIT (data_ref);
2187           next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2188           /* Count the number of data-refs in the chain.  */
2189           count++;
2190         }
2191
2192       /* COUNT is the number of accesses found, we multiply it by the size of
2193          the type to get COUNT_IN_BYTES.  */
2194       count_in_bytes = type_size * count;
2195
2196       /* Check that the size of the interleaving (including gaps) is not
2197          greater than STEP.  */
2198       if (dr_step != 0
2199           && absu_hwi (dr_step) < count_in_bytes + gaps * type_size)
2200         {
2201           if (dump_enabled_p ())
2202             {
2203               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2204                                "interleaving size is greater than step for ");
2205               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
2206                                  DR_REF (dr));
2207               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2208             }
2209           return false;
2210         }
2211
2212       /* Check that the size of the interleaving is equal to STEP for stores,
2213          i.e., that there are no gaps.  */
2214       if (dr_step != 0
2215           && absu_hwi (dr_step) != count_in_bytes)
2216         {
2217           if (DR_IS_READ (dr))
2218             {
2219               slp_impossible = true;
2220               /* There is a gap after the last load in the group. This gap is a
2221                  difference between the groupsize and the number of elements.
2222                  When there is no gap, this difference should be 0.  */
2223               GROUP_GAP (vinfo_for_stmt (stmt)) = groupsize - count;
2224             }
2225           else
2226             {
2227               if (dump_enabled_p ())
2228                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2229                                  "interleaved store with gaps\n");
2230               return false;
2231             }
2232         }
2233
2234       /* Check that STEP is a multiple of type size.  */
2235       if (dr_step != 0
2236           && (dr_step % type_size) != 0)
2237         {
2238           if (dump_enabled_p ())
2239             {
2240               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2241                                "step is not a multiple of type size: step ");
2242               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, step);
2243               dump_printf (MSG_MISSED_OPTIMIZATION, " size ");
2244               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
2245                                  TYPE_SIZE_UNIT (scalar_type));
2246               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2247             }
2248           return false;
2249         }
2250
2251       if (groupsize == 0)
2252         groupsize = count;
2253
2254       GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2255       if (dump_enabled_p ())
2256         dump_printf_loc (MSG_NOTE, vect_location,
2257                          "Detected interleaving of size %d\n", (int)groupsize);
2258
2259       /* SLP: create an SLP data structure for every interleaving group of
2260          stores for further analysis in vect_analyse_slp.  */
2261       if (DR_IS_WRITE (dr) && !slp_impossible)
2262         {
2263           if (loop_vinfo)
2264             LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt);
2265           if (bb_vinfo)
2266             BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt);
2267         }
2268
2269       /* There is a gap in the end of the group.  */
2270       if (groupsize - last_accessed_element > 0 && loop_vinfo)
2271         {
2272           if (dump_enabled_p ())
2273             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2274                              "Data access with gaps requires scalar "
2275                              "epilogue loop\n");
2276           if (loop->inner)
2277             {
2278               if (dump_enabled_p ())
2279                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2280                                  "Peeling for outer loop is not supported\n");
2281               return false;
2282             }
2283
2284           LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = true;
2285         }
2286     }
2287
2288   return true;
2289 }
2290
2291
2292 /* Analyze the access pattern of the data-reference DR.
2293    In case of non-consecutive accesses call vect_analyze_group_access() to
2294    analyze groups of accesses.  */
2295
2296 static bool
2297 vect_analyze_data_ref_access (struct data_reference *dr)
2298 {
2299   tree step = DR_STEP (dr);
2300   tree scalar_type = TREE_TYPE (DR_REF (dr));
2301   gimple stmt = DR_STMT (dr);
2302   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2303   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2304   struct loop *loop = NULL;
2305
2306   if (loop_vinfo)
2307     loop = LOOP_VINFO_LOOP (loop_vinfo);
2308
2309   if (loop_vinfo && !step)
2310     {
2311       if (dump_enabled_p ())
2312         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2313                          "bad data-ref access in loop\n");
2314       return false;
2315     }
2316
2317   /* Allow invariant loads in not nested loops.  */
2318   if (loop_vinfo && integer_zerop (step))
2319     {
2320       GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2321       if (nested_in_vect_loop_p (loop, stmt))
2322         {
2323           if (dump_enabled_p ())
2324             dump_printf_loc (MSG_NOTE, vect_location,
2325                              "zero step in inner loop of nest\n");
2326           return false;
2327         }
2328       return DR_IS_READ (dr);
2329     }
2330
2331   if (loop && nested_in_vect_loop_p (loop, stmt))
2332     {
2333       /* Interleaved accesses are not yet supported within outer-loop
2334         vectorization for references in the inner-loop.  */
2335       GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2336
2337       /* For the rest of the analysis we use the outer-loop step.  */
2338       step = STMT_VINFO_DR_STEP (stmt_info);
2339       if (integer_zerop (step))
2340         {
2341           if (dump_enabled_p ())
2342             dump_printf_loc (MSG_NOTE, vect_location,
2343                              "zero step in outer loop.\n");
2344           if (DR_IS_READ (dr))
2345             return true;
2346           else
2347             return false;
2348         }
2349     }
2350
2351   /* Consecutive?  */
2352   if (TREE_CODE (step) == INTEGER_CST)
2353     {
2354       HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2355       if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2356           || (dr_step < 0
2357               && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2358         {
2359           /* Mark that it is not interleaving.  */
2360           GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2361           return true;
2362         }
2363     }
2364
2365   if (loop && nested_in_vect_loop_p (loop, stmt))
2366     {
2367       if (dump_enabled_p ())
2368         dump_printf_loc (MSG_NOTE, vect_location,
2369                          "grouped access in outer loop.\n");
2370       return false;
2371     }
2372
2373   /* Assume this is a DR handled by non-constant strided load case.  */
2374   if (TREE_CODE (step) != INTEGER_CST)
2375     return STMT_VINFO_STRIDE_LOAD_P (stmt_info);
2376
2377   /* Not consecutive access - check if it's a part of interleaving group.  */
2378   return vect_analyze_group_access (dr);
2379 }
2380
2381
2382
2383 /*  A helper function used in the comparator function to sort data
2384     references.  T1 and T2 are two data references to be compared.
2385     The function returns -1, 0, or 1.  */
2386
2387 static int
2388 compare_tree (tree t1, tree t2)
2389 {
2390   int i, cmp;
2391   enum tree_code code;
2392   char tclass;
2393
2394   if (t1 == t2)
2395     return 0;
2396   if (t1 == NULL)
2397     return -1;
2398   if (t2 == NULL)
2399     return 1;
2400
2401
2402   if (TREE_CODE (t1) != TREE_CODE (t2))
2403     return TREE_CODE (t1) < TREE_CODE (t2) ? -1 : 1;
2404
2405   code = TREE_CODE (t1);
2406   switch (code)
2407     {
2408     /* For const values, we can just use hash values for comparisons.  */
2409     case INTEGER_CST:
2410     case REAL_CST:
2411     case FIXED_CST:
2412     case STRING_CST:
2413     case COMPLEX_CST:
2414     case VECTOR_CST:
2415       {
2416         hashval_t h1 = iterative_hash_expr (t1, 0);
2417         hashval_t h2 = iterative_hash_expr (t2, 0);
2418         if (h1 != h2)
2419           return h1 < h2 ? -1 : 1;
2420         break;
2421       }
2422
2423     case SSA_NAME:
2424       cmp = compare_tree (SSA_NAME_VAR (t1), SSA_NAME_VAR (t2));
2425       if (cmp != 0)
2426         return cmp;
2427
2428       if (SSA_NAME_VERSION (t1) != SSA_NAME_VERSION (t2))
2429         return SSA_NAME_VERSION (t1) < SSA_NAME_VERSION (t2) ? -1 : 1;
2430       break;
2431
2432     default:
2433       tclass = TREE_CODE_CLASS (code);
2434
2435       /* For var-decl, we could compare their UIDs.  */
2436       if (tclass == tcc_declaration)
2437         {
2438           if (DECL_UID (t1) != DECL_UID (t2))
2439             return DECL_UID (t1) < DECL_UID (t2) ? -1 : 1;
2440           break;
2441         }
2442
2443       /* For expressions with operands, compare their operands recursively.  */
2444       for (i = TREE_OPERAND_LENGTH (t1) - 1; i >= 0; --i)
2445         {
2446           cmp = compare_tree (TREE_OPERAND (t1, i), TREE_OPERAND (t2, i));
2447           if (cmp != 0)
2448             return cmp;
2449         }
2450     }
2451
2452   return 0;
2453 }
2454
2455
2456 /* Compare two data-references DRA and DRB to group them into chunks
2457    suitable for grouping.  */
2458
2459 static int
2460 dr_group_sort_cmp (const void *dra_, const void *drb_)
2461 {
2462   data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2463   data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2464   int cmp;
2465
2466   /* Stabilize sort.  */
2467   if (dra == drb)
2468     return 0;
2469
2470   /* Ordering of DRs according to base.  */
2471   if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0))
2472     {
2473       cmp = compare_tree (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb));
2474       if (cmp != 0)
2475         return cmp;
2476     }
2477
2478   /* And according to DR_OFFSET.  */
2479   if (!dr_equal_offsets_p (dra, drb))
2480     {
2481       cmp = compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2482       if (cmp != 0)
2483         return cmp;
2484     }
2485
2486   /* Put reads before writes.  */
2487   if (DR_IS_READ (dra) != DR_IS_READ (drb))
2488     return DR_IS_READ (dra) ? -1 : 1;
2489
2490   /* Then sort after access size.  */
2491   if (!operand_equal_p (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2492                         TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))), 0))
2493     {
2494       cmp = compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2495                           TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2496       if (cmp != 0)
2497         return cmp;
2498     }
2499
2500   /* And after step.  */
2501   if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2502     {
2503       cmp = compare_tree (DR_STEP (dra), DR_STEP (drb));
2504       if (cmp != 0)
2505         return cmp;
2506     }
2507
2508   /* Then sort after DR_INIT.  In case of identical DRs sort after stmt UID.  */
2509   cmp = tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb));
2510   if (cmp == 0)
2511     return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2512   return cmp;
2513 }
2514
2515 /* Function vect_analyze_data_ref_accesses.
2516
2517    Analyze the access pattern of all the data references in the loop.
2518
2519    FORNOW: the only access pattern that is considered vectorizable is a
2520            simple step 1 (consecutive) access.
2521
2522    FORNOW: handle only arrays and pointer accesses.  */
2523
2524 bool
2525 vect_analyze_data_ref_accesses (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
2526 {
2527   unsigned int i;
2528   vec<data_reference_p> datarefs;
2529   struct data_reference *dr;
2530
2531   if (dump_enabled_p ())
2532     dump_printf_loc (MSG_NOTE, vect_location,
2533                      "=== vect_analyze_data_ref_accesses ===\n");
2534
2535   if (loop_vinfo)
2536     datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2537   else
2538     datarefs = BB_VINFO_DATAREFS (bb_vinfo);
2539
2540   if (datarefs.is_empty ())
2541     return true;
2542
2543   /* Sort the array of datarefs to make building the interleaving chains
2544      linear.  Don't modify the original vector's order, it is needed for
2545      determining what dependencies are reversed.  */
2546   vec<data_reference_p> datarefs_copy = datarefs.copy ();
2547   datarefs_copy.qsort (dr_group_sort_cmp);
2548
2549   /* Build the interleaving chains.  */
2550   for (i = 0; i < datarefs_copy.length () - 1;)
2551     {
2552       data_reference_p dra = datarefs_copy[i];
2553       stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2554       stmt_vec_info lastinfo = NULL;
2555       for (i = i + 1; i < datarefs_copy.length (); ++i)
2556         {
2557           data_reference_p drb = datarefs_copy[i];
2558           stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2559
2560           /* ???  Imperfect sorting (non-compatible types, non-modulo
2561              accesses, same accesses) can lead to a group to be artificially
2562              split here as we don't just skip over those.  If it really
2563              matters we can push those to a worklist and re-iterate
2564              over them.  The we can just skip ahead to the next DR here.  */
2565
2566           /* Check that the data-refs have same first location (except init)
2567              and they are both either store or load (not load and store,
2568              not masked loads or stores).  */
2569           if (DR_IS_READ (dra) != DR_IS_READ (drb)
2570               || !operand_equal_p (DR_BASE_ADDRESS (dra),
2571                                    DR_BASE_ADDRESS (drb), 0)
2572               || !dr_equal_offsets_p (dra, drb)
2573               || !gimple_assign_single_p (DR_STMT (dra))
2574               || !gimple_assign_single_p (DR_STMT (drb)))
2575             break;
2576
2577           /* Check that the data-refs have the same constant size and step.  */
2578           tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
2579           tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
2580           if (!tree_fits_uhwi_p (sza)
2581               || !tree_fits_uhwi_p (szb)
2582               || !tree_int_cst_equal (sza, szb)
2583               || !tree_fits_shwi_p (DR_STEP (dra))
2584               || !tree_fits_shwi_p (DR_STEP (drb))
2585               || !tree_int_cst_equal (DR_STEP (dra), DR_STEP (drb)))
2586             break;
2587
2588           /* Do not place the same access in the interleaving chain twice.  */
2589           if (tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb)) == 0)
2590             break;
2591
2592           /* Check the types are compatible.
2593              ???  We don't distinguish this during sorting.  */
2594           if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
2595                                    TREE_TYPE (DR_REF (drb))))
2596             break;
2597
2598           /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb).  */
2599           HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
2600           HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
2601           gcc_assert (init_a < init_b);
2602
2603           /* If init_b == init_a + the size of the type * k, we have an
2604              interleaving, and DRA is accessed before DRB.  */
2605           HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
2606           if ((init_b - init_a) % type_size_a != 0)
2607             break;
2608
2609           /* The step (if not zero) is greater than the difference between
2610              data-refs' inits.  This splits groups into suitable sizes.  */
2611           HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra));
2612           if (step != 0 && step <= (init_b - init_a))
2613             break;
2614
2615           if (dump_enabled_p ())
2616             {
2617               dump_printf_loc (MSG_NOTE, vect_location,
2618                                "Detected interleaving ");
2619               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2620               dump_printf (MSG_NOTE,  " and ");
2621               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2622               dump_printf (MSG_NOTE, "\n");
2623             }
2624
2625           /* Link the found element into the group list.  */
2626           if (!GROUP_FIRST_ELEMENT (stmtinfo_a))
2627             {
2628               GROUP_FIRST_ELEMENT (stmtinfo_a) = DR_STMT (dra);
2629               lastinfo = stmtinfo_a;
2630             }
2631           GROUP_FIRST_ELEMENT (stmtinfo_b) = DR_STMT (dra);
2632           GROUP_NEXT_ELEMENT (lastinfo) = DR_STMT (drb);
2633           lastinfo = stmtinfo_b;
2634         }
2635     }
2636
2637   FOR_EACH_VEC_ELT (datarefs_copy, i, dr)
2638     if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
2639         && !vect_analyze_data_ref_access (dr))
2640       {
2641         if (dump_enabled_p ())
2642           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2643                            "not vectorized: complicated access pattern.\n");
2644
2645         if (bb_vinfo)
2646           {
2647             /* Mark the statement as not vectorizable.  */
2648             STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2649             continue;
2650           }
2651         else
2652           {
2653             datarefs_copy.release ();
2654             return false;
2655           }
2656       }
2657
2658   datarefs_copy.release ();
2659   return true;
2660 }
2661
2662
2663 /* Operator == between two dr_with_seg_len objects.
2664
2665    This equality operator is used to make sure two data refs
2666    are the same one so that we will consider to combine the
2667    aliasing checks of those two pairs of data dependent data
2668    refs.  */
2669
2670 static bool
2671 operator == (const dr_with_seg_len& d1,
2672              const dr_with_seg_len& d2)
2673 {
2674   return operand_equal_p (DR_BASE_ADDRESS (d1.dr),
2675                           DR_BASE_ADDRESS (d2.dr), 0)
2676            && compare_tree (d1.offset, d2.offset) == 0
2677            && compare_tree (d1.seg_len, d2.seg_len) == 0;
2678 }
2679
2680 /* Function comp_dr_with_seg_len_pair.
2681
2682    Comparison function for sorting objects of dr_with_seg_len_pair_t
2683    so that we can combine aliasing checks in one scan.  */
2684
2685 static int
2686 comp_dr_with_seg_len_pair (const void *p1_, const void *p2_)
2687 {
2688   const dr_with_seg_len_pair_t* p1 = (const dr_with_seg_len_pair_t *) p1_;
2689   const dr_with_seg_len_pair_t* p2 = (const dr_with_seg_len_pair_t *) p2_;
2690
2691   const dr_with_seg_len &p11 = p1->first,
2692                         &p12 = p1->second,
2693                         &p21 = p2->first,
2694                         &p22 = p2->second;
2695
2696   /* For DR pairs (a, b) and (c, d), we only consider to merge the alias checks
2697      if a and c have the same basic address snd step, and b and d have the same
2698      address and step.  Therefore, if any a&c or b&d don't have the same address
2699      and step, we don't care the order of those two pairs after sorting.  */
2700   int comp_res;
2701
2702   if ((comp_res = compare_tree (DR_BASE_ADDRESS (p11.dr),
2703                                 DR_BASE_ADDRESS (p21.dr))) != 0)
2704     return comp_res;
2705   if ((comp_res = compare_tree (DR_BASE_ADDRESS (p12.dr),
2706                                 DR_BASE_ADDRESS (p22.dr))) != 0)
2707     return comp_res;
2708   if ((comp_res = compare_tree (DR_STEP (p11.dr), DR_STEP (p21.dr))) != 0)
2709     return comp_res;
2710   if ((comp_res = compare_tree (DR_STEP (p12.dr), DR_STEP (p22.dr))) != 0)
2711     return comp_res;
2712   if ((comp_res = compare_tree (p11.offset, p21.offset)) != 0)
2713     return comp_res;
2714   if ((comp_res = compare_tree (p12.offset, p22.offset)) != 0)
2715     return comp_res;
2716
2717   return 0;
2718 }
2719
/* Exchange the values of A and B through a temporary copy.  */

template <class T> static void
swap (T& a, T& b)
{
  T saved (a);

  a = b;
  b = saved;
}
2727
2728 /* Function vect_vfa_segment_size.
2729
2730    Create an expression that computes the size of segment
2731    that will be accessed for a data reference.  The functions takes into
2732    account that realignment loads may access one more vector.
2733
2734    Input:
2735      DR: The data reference.
2736      LENGTH_FACTOR: segment length to consider.
2737
2738    Return an expression whose value is the size of segment which will be
2739    accessed by DR.  */
2740
2741 static tree
2742 vect_vfa_segment_size (struct data_reference *dr, tree length_factor)
2743 {
2744   tree segment_length;
2745
2746   if (integer_zerop (DR_STEP (dr)))
2747     segment_length = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
2748   else
2749     segment_length = size_binop (MULT_EXPR,
2750                                  fold_convert (sizetype, DR_STEP (dr)),
2751                                  fold_convert (sizetype, length_factor));
2752
2753   if (vect_supportable_dr_alignment (dr, false)
2754         == dr_explicit_realign_optimized)
2755     {
2756       tree vector_size = TYPE_SIZE_UNIT
2757                           (STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr))));
2758
2759       segment_length = size_binop (PLUS_EXPR, segment_length, vector_size);
2760     }
2761   return segment_length;
2762 }
2763
2764 /* Function vect_prune_runtime_alias_test_list.
2765
2766    Prune a list of ddrs to be tested at run-time by versioning for alias.
2767    Merge several alias checks into one if possible.
2768    Return FALSE if resulting list of ddrs is longer then allowed by
2769    PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE.  */
2770
2771 bool
2772 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
2773 {
2774   vec<ddr_p> may_alias_ddrs =
2775     LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
2776   vec<dr_with_seg_len_pair_t>& comp_alias_ddrs =
2777     LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
2778   int vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2779   tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
2780
2781   ddr_p ddr;
2782   unsigned int i;
2783   tree length_factor;
2784
2785   if (dump_enabled_p ())
2786     dump_printf_loc (MSG_NOTE, vect_location,
2787                      "=== vect_prune_runtime_alias_test_list ===\n");
2788
2789   if (may_alias_ddrs.is_empty ())
2790     return true;
2791
2792   /* Basically, for each pair of dependent data refs store_ptr_0
2793      and load_ptr_0, we create an expression:
2794
2795      ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
2796      || (load_ptr_0 + load_segment_length_0) <= store_ptr_0))
2797
2798      for aliasing checks.  However, in some cases we can decrease
2799      the number of checks by combining two checks into one.  For
2800      example, suppose we have another pair of data refs store_ptr_0
2801      and load_ptr_1, and if the following condition is satisfied:
2802
2803      load_ptr_0 < load_ptr_1  &&
2804      load_ptr_1 - load_ptr_0 - load_segment_length_0 < store_segment_length_0
2805
2806      (this condition means, in each iteration of vectorized loop,
2807      the accessed memory of store_ptr_0 cannot be between the memory
2808      of load_ptr_0 and load_ptr_1.)
2809
2810      we then can use only the following expression to finish the
2811      alising checks between store_ptr_0 & load_ptr_0 and
2812      store_ptr_0 & load_ptr_1:
2813
2814      ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
2815      || (load_ptr_1 + load_segment_length_1 <= store_ptr_0))
2816
2817      Note that we only consider that load_ptr_0 and load_ptr_1 have the
2818      same basic address.  */
2819
2820   comp_alias_ddrs.create (may_alias_ddrs.length ());
2821
2822   /* First, we collect all data ref pairs for aliasing checks.  */
2823   FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
2824     {
2825       struct data_reference *dr_a, *dr_b;
2826       gimple dr_group_first_a, dr_group_first_b;
2827       tree segment_length_a, segment_length_b;
2828       gimple stmt_a, stmt_b;
2829
2830       dr_a = DDR_A (ddr);
2831       stmt_a = DR_STMT (DDR_A (ddr));
2832       dr_group_first_a = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_a));
2833       if (dr_group_first_a)
2834         {
2835           stmt_a = dr_group_first_a;
2836           dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a));
2837         }
2838
2839       dr_b = DDR_B (ddr);
2840       stmt_b = DR_STMT (DDR_B (ddr));
2841       dr_group_first_b = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_b));
2842       if (dr_group_first_b)
2843         {
2844           stmt_b = dr_group_first_b;
2845           dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b));
2846         }
2847
2848       if (!operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0))
2849         length_factor = scalar_loop_iters;
2850       else
2851         length_factor = size_int (vect_factor);
2852       segment_length_a = vect_vfa_segment_size (dr_a, length_factor);
2853       segment_length_b = vect_vfa_segment_size (dr_b, length_factor);
2854
2855       dr_with_seg_len_pair_t dr_with_seg_len_pair
2856           (dr_with_seg_len (dr_a, segment_length_a),
2857            dr_with_seg_len (dr_b, segment_length_b));
2858
2859       if (compare_tree (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b)) > 0)
2860         swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
2861
2862       comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
2863     }
2864
2865   /* Second, we sort the collected data ref pairs so that we can scan
2866      them once to combine all possible aliasing checks.  */
2867   comp_alias_ddrs.qsort (comp_dr_with_seg_len_pair);
2868
2869   /* Third, we scan the sorted dr pairs and check if we can combine
2870      alias checks of two neighbouring dr pairs.  */
2871   for (size_t i = 1; i < comp_alias_ddrs.length (); ++i)
2872     {
2873       /* Deal with two ddrs (dr_a1, dr_b1) and (dr_a2, dr_b2).  */
2874       dr_with_seg_len *dr_a1 = &comp_alias_ddrs[i-1].first,
2875                       *dr_b1 = &comp_alias_ddrs[i-1].second,
2876                       *dr_a2 = &comp_alias_ddrs[i].first,
2877                       *dr_b2 = &comp_alias_ddrs[i].second;
2878
2879       /* Remove duplicate data ref pairs.  */
2880       if (*dr_a1 == *dr_a2 && *dr_b1 == *dr_b2)
2881         {
2882           if (dump_enabled_p ())
2883             {
2884               dump_printf_loc (MSG_NOTE, vect_location,
2885                                "found equal ranges ");
2886               dump_generic_expr (MSG_NOTE, TDF_SLIM,
2887                                  DR_REF (dr_a1->dr));
2888               dump_printf (MSG_NOTE,  ", ");
2889               dump_generic_expr (MSG_NOTE, TDF_SLIM,
2890                                  DR_REF (dr_b1->dr));
2891               dump_printf (MSG_NOTE,  " and ");
2892               dump_generic_expr (MSG_NOTE, TDF_SLIM,
2893                                  DR_REF (dr_a2->dr));
2894               dump_printf (MSG_NOTE,  ", ");
2895               dump_generic_expr (MSG_NOTE, TDF_SLIM,
2896                                  DR_REF (dr_b2->dr));
2897               dump_printf (MSG_NOTE, "\n");
2898             }
2899
2900           comp_alias_ddrs.ordered_remove (i--);
2901           continue;
2902         }
2903
2904       if (*dr_a1 == *dr_a2 || *dr_b1 == *dr_b2)
2905         {
2906           /* We consider the case that DR_B1 and DR_B2 are same memrefs,
2907              and DR_A1 and DR_A2 are two consecutive memrefs.  */
2908           if (*dr_a1 == *dr_a2)
2909             {
2910               swap (dr_a1, dr_b1);
2911               swap (dr_a2, dr_b2);
2912             }
2913
2914           if (!operand_equal_p (DR_BASE_ADDRESS (dr_a1->dr),
2915                                 DR_BASE_ADDRESS (dr_a2->dr),
2916                                 0)
2917               || !tree_fits_shwi_p (dr_a1->offset)
2918               || !tree_fits_shwi_p (dr_a2->offset))
2919             continue;
2920
2921           HOST_WIDE_INT diff = (tree_to_shwi (dr_a2->offset)
2922                                 - tree_to_shwi (dr_a1->offset));
2923
2924
2925           /* Now we check if the following condition is satisfied:
2926
2927              DIFF - SEGMENT_LENGTH_A < SEGMENT_LENGTH_B
2928
2929              where DIFF = DR_A2->OFFSET - DR_A1->OFFSET.  However,
2930              SEGMENT_LENGTH_A or SEGMENT_LENGTH_B may not be constant so we
2931              have to make a best estimation.  We can get the minimum value
2932              of SEGMENT_LENGTH_B as a constant, represented by MIN_SEG_LEN_B,
2933              then either of the following two conditions can guarantee the
2934              one above:
2935
2936              1: DIFF <= MIN_SEG_LEN_B
2937              2: DIFF - SEGMENT_LENGTH_A < MIN_SEG_LEN_B
2938
2939              */
2940
2941           HOST_WIDE_INT  min_seg_len_b = (tree_fits_shwi_p (dr_b1->seg_len)
2942                                           ? tree_to_shwi (dr_b1->seg_len)
2943                                           : vect_factor);
2944
2945           if (diff <= min_seg_len_b
2946               || (tree_fits_shwi_p (dr_a1->seg_len)
2947                   && diff - tree_to_shwi (dr_a1->seg_len) < min_seg_len_b))
2948             {
2949               if (dump_enabled_p ())
2950                 {
2951                   dump_printf_loc (MSG_NOTE, vect_location,
2952                                    "merging ranges for ");
2953                   dump_generic_expr (MSG_NOTE, TDF_SLIM,
2954                                      DR_REF (dr_a1->dr));
2955                   dump_printf (MSG_NOTE,  ", ");
2956                   dump_generic_expr (MSG_NOTE, TDF_SLIM,
2957                                      DR_REF (dr_b1->dr));
2958                   dump_printf (MSG_NOTE,  " and ");
2959                   dump_generic_expr (MSG_NOTE, TDF_SLIM,
2960                                      DR_REF (dr_a2->dr));
2961                   dump_printf (MSG_NOTE,  ", ");
2962                   dump_generic_expr (MSG_NOTE, TDF_SLIM,
2963                                      DR_REF (dr_b2->dr));
2964                   dump_printf (MSG_NOTE, "\n");
2965                 }
2966
2967               dr_a1->seg_len = size_binop (PLUS_EXPR,
2968                                            dr_a2->seg_len, size_int (diff));
2969               comp_alias_ddrs.ordered_remove (i--);
2970             }
2971         }
2972     }
2973
2974   dump_printf_loc (MSG_NOTE, vect_location,
2975                    "improved number of alias checks from %d to %d\n",
2976                    may_alias_ddrs.length (), comp_alias_ddrs.length ());
2977   if ((int) comp_alias_ddrs.length () >
2978       PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
2979     return false;
2980
2981   return true;
2982 }
2983
2984 /* Check whether a non-affine read in stmt is suitable for gather load
2985    and if so, return a builtin decl for that operation.  */
2986
2987 tree
2988 vect_check_gather (gimple stmt, loop_vec_info loop_vinfo, tree *basep,
2989                    tree *offp, int *scalep)
2990 {
2991   HOST_WIDE_INT scale = 1, pbitpos, pbitsize;
2992   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2993   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2994   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
2995   tree offtype = NULL_TREE;
2996   tree decl, base, off;
2997   machine_mode pmode;
2998   int punsignedp, pvolatilep;
2999
3000   base = DR_REF (dr);
3001   /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
3002      see if we can use the def stmt of the address.  */
3003   if (is_gimple_call (stmt)
3004       && gimple_call_internal_p (stmt)
3005       && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
3006           || gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
3007       && TREE_CODE (base) == MEM_REF
3008       && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
3009       && integer_zerop (TREE_OPERAND (base, 1))
3010       && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
3011     {
3012       gimple def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
3013       if (is_gimple_assign (def_stmt)
3014           && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
3015         base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
3016     }
3017
3018   /* The gather builtins need address of the form
3019      loop_invariant + vector * {1, 2, 4, 8}
3020      or
3021      loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3022      Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3023      of loop invariants/SSA_NAMEs defined in the loop, with casts,
3024      multiplications and additions in it.  To get a vector, we need
3025      a single SSA_NAME that will be defined in the loop and will
3026      contain everything that is not loop invariant and that can be
3027      vectorized.  The following code attempts to find such a preexistng
3028      SSA_NAME OFF and put the loop invariants into a tree BASE
3029      that can be gimplified before the loop.  */
3030   base = get_inner_reference (base, &pbitsize, &pbitpos, &off,
3031                               &pmode, &punsignedp, &pvolatilep, false);
3032   gcc_assert (base != NULL_TREE && (pbitpos % BITS_PER_UNIT) == 0);
3033
3034   if (TREE_CODE (base) == MEM_REF)
3035     {
3036       if (!integer_zerop (TREE_OPERAND (base, 1)))
3037         {
3038           if (off == NULL_TREE)
3039             {
3040               offset_int moff = mem_ref_offset (base);
3041               off = wide_int_to_tree (sizetype, moff);
3042             }
3043           else
3044             off = size_binop (PLUS_EXPR, off,
3045                               fold_convert (sizetype, TREE_OPERAND (base, 1)));
3046         }
3047       base = TREE_OPERAND (base, 0);
3048     }
3049   else
3050     base = build_fold_addr_expr (base);
3051
3052   if (off == NULL_TREE)
3053     off = size_zero_node;
3054
3055   /* If base is not loop invariant, either off is 0, then we start with just
3056      the constant offset in the loop invariant BASE and continue with base
3057      as OFF, otherwise give up.
3058      We could handle that case by gimplifying the addition of base + off
3059      into some SSA_NAME and use that as off, but for now punt.  */
3060   if (!expr_invariant_in_loop_p (loop, base))
3061     {
3062       if (!integer_zerop (off))
3063         return NULL_TREE;
3064       off = base;
3065       base = size_int (pbitpos / BITS_PER_UNIT);
3066     }
3067   /* Otherwise put base + constant offset into the loop invariant BASE
3068      and continue with OFF.  */
3069   else
3070     {
3071       base = fold_convert (sizetype, base);
3072       base = size_binop (PLUS_EXPR, base, size_int (pbitpos / BITS_PER_UNIT));
3073     }
3074
3075   /* OFF at this point may be either a SSA_NAME or some tree expression
3076      from get_inner_reference.  Try to peel off loop invariants from it
3077      into BASE as long as possible.  */
3078   STRIP_NOPS (off);
3079   while (offtype == NULL_TREE)
3080     {
3081       enum tree_code code;
3082       tree op0, op1, add = NULL_TREE;
3083
3084       if (TREE_CODE (off) == SSA_NAME)
3085         {
3086           gimple def_stmt = SSA_NAME_DEF_STMT (off);
3087
3088           if (expr_invariant_in_loop_p (loop, off))
3089             return NULL_TREE;
3090
3091           if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
3092             break;
3093
3094           op0 = gimple_assign_rhs1 (def_stmt);
3095           code = gimple_assign_rhs_code (def_stmt);
3096           op1 = gimple_assign_rhs2 (def_stmt);
3097         }
3098       else
3099         {
3100           if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
3101             return NULL_TREE;
3102           code = TREE_CODE (off);
3103           extract_ops_from_tree (off, &code, &op0, &op1);
3104         }
3105       switch (code)
3106         {
3107         case POINTER_PLUS_EXPR:
3108         case PLUS_EXPR:
3109           if (expr_invariant_in_loop_p (loop, op0))
3110             {
3111               add = op0;
3112               off = op1;
3113             do_add:
3114               add = fold_convert (sizetype, add);
3115               if (scale != 1)
3116                 add = size_binop (MULT_EXPR, add, size_int (scale));
3117               base = size_binop (PLUS_EXPR, base, add);
3118               continue;
3119             }
3120           if (expr_invariant_in_loop_p (loop, op1))
3121             {
3122               add = op1;
3123               off = op0;
3124               goto do_add;
3125             }
3126           break;
3127         case MINUS_EXPR:
3128           if (expr_invariant_in_loop_p (loop, op1))
3129             {
3130               add = fold_convert (sizetype, op1);
3131               add = size_binop (MINUS_EXPR, size_zero_node, add);
3132               off = op0;
3133               goto do_add;
3134             }
3135           break;
3136         case MULT_EXPR:
3137           if (scale == 1 && tree_fits_shwi_p (op1))
3138             {
3139               scale = tree_to_shwi (op1);
3140               off = op0;
3141               continue;
3142             }
3143           break;
3144         case SSA_NAME:
3145           off = op0;
3146           continue;
3147         CASE_CONVERT:
3148           if (!POINTER_TYPE_P (TREE_TYPE (op0))
3149               && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3150             break;
3151           if (TYPE_PRECISION (TREE_TYPE (op0))
3152               == TYPE_PRECISION (TREE_TYPE (off)))
3153             {
3154               off = op0;
3155               continue;
3156             }
3157           if (TYPE_PRECISION (TREE_TYPE (op0))
3158               < TYPE_PRECISION (TREE_TYPE (off)))
3159             {
3160               off = op0;
3161               offtype = TREE_TYPE (off);
3162               STRIP_NOPS (off);
3163               continue;
3164             }
3165           break;
3166         default:
3167           break;
3168         }
3169       break;
3170     }
3171
3172   /* If at the end OFF still isn't a SSA_NAME or isn't
3173      defined in the loop, punt.  */
3174   if (TREE_CODE (off) != SSA_NAME
3175       || expr_invariant_in_loop_p (loop, off))
3176     return NULL_TREE;
3177
3178   if (offtype == NULL_TREE)
3179     offtype = TREE_TYPE (off);
3180
3181   decl = targetm.vectorize.builtin_gather (STMT_VINFO_VECTYPE (stmt_info),
3182                                            offtype, scale);
3183   if (decl == NULL_TREE)
3184     return NULL_TREE;
3185
3186   if (basep)
3187     *basep = base;
3188   if (offp)
3189     *offp = off;
3190   if (scalep)
3191     *scalep = scale;
3192   return decl;
3193 }
3194
3195 /* Function vect_analyze_data_refs.
3196
3197   Find all the data references in the loop or basic block.
3198
3199    The general structure of the analysis of data refs in the vectorizer is as
3200    follows:
3201    1- vect_analyze_data_refs(loop/bb): call
3202       compute_data_dependences_for_loop/bb to find and analyze all data-refs
3203       in the loop/bb and their dependences.
3204    2- vect_analyze_dependences(): apply dependence testing using ddrs.
3205    3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
3206    4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
3207
3208 */
3209
3210 bool
3211 vect_analyze_data_refs (loop_vec_info loop_vinfo,
3212                         bb_vec_info bb_vinfo,
3213                         int *min_vf, unsigned *n_stmts)
3214 {
3215   struct loop *loop = NULL;
3216   basic_block bb = NULL;
3217   unsigned int i;
3218   vec<data_reference_p> datarefs;
3219   struct data_reference *dr;
3220   tree scalar_type;
3221
3222   if (dump_enabled_p ())
3223     dump_printf_loc (MSG_NOTE, vect_location,
3224                      "=== vect_analyze_data_refs ===\n");
3225
3226   if (loop_vinfo)
3227     {
3228       basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
3229
3230       loop = LOOP_VINFO_LOOP (loop_vinfo);
3231       datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
3232       if (!find_loop_nest (loop, &LOOP_VINFO_LOOP_NEST (loop_vinfo)))
3233         {
3234           if (dump_enabled_p ())
3235             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3236                              "not vectorized: loop contains function calls"
3237                              " or data references that cannot be analyzed\n");
3238           return false;
3239         }
3240
3241       for (i = 0; i < loop->num_nodes; i++)
3242         {
3243           gimple_stmt_iterator gsi;
3244
3245           for (gsi = gsi_start_bb (bbs[i]); !gsi_end_p (gsi); gsi_next (&gsi))
3246             {
3247               gimple stmt = gsi_stmt (gsi);
3248               if (is_gimple_debug (stmt))
3249                 continue;
3250               ++*n_stmts;
3251               if (!find_data_references_in_stmt (loop, stmt, &datarefs))
3252                 {
3253                   if (is_gimple_call (stmt) && loop->safelen)
3254                     {
3255                       tree fndecl = gimple_call_fndecl (stmt), op;
3256                       if (fndecl != NULL_TREE)
3257                         {
3258                           struct cgraph_node *node = cgraph_node::get (fndecl);
3259                           if (node != NULL && node->simd_clones != NULL)
3260                             {
3261                               unsigned int j, n = gimple_call_num_args (stmt);
3262                               for (j = 0; j < n; j++)
3263                                 {
3264                                   op = gimple_call_arg (stmt, j);
3265                                   if (DECL_P (op)
3266                                       || (REFERENCE_CLASS_P (op)
3267                                           && get_base_address (op)))
3268                                     break;
3269                                 }
3270                               op = gimple_call_lhs (stmt);
3271                               /* Ignore #pragma omp declare simd functions
3272                                  if they don't have data references in the
3273                                  call stmt itself.  */
3274                               if (j == n
3275                                   && !(op
3276                                        && (DECL_P (op)
3277                                            || (REFERENCE_CLASS_P (op)
3278                                                && get_base_address (op)))))
3279                                 continue;
3280                             }
3281                         }
3282                     }
3283                   LOOP_VINFO_DATAREFS (loop_vinfo) = datarefs;
3284                   if (dump_enabled_p ())
3285                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3286                                      "not vectorized: loop contains function "
3287                                      "calls or data references that cannot "
3288                                      "be analyzed\n");
3289                   return false;
3290                 }
3291             }
3292         }
3293
3294       LOOP_VINFO_DATAREFS (loop_vinfo) = datarefs;
3295     }
3296   else
3297     {
3298       gimple_stmt_iterator gsi;
3299
3300       bb = BB_VINFO_BB (bb_vinfo);
3301       for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
3302         {
3303           gimple stmt = gsi_stmt (gsi);
3304           if (is_gimple_debug (stmt))
3305             continue;
3306           ++*n_stmts;
3307           if (!find_data_references_in_stmt (NULL, stmt,
3308                                              &BB_VINFO_DATAREFS (bb_vinfo)))
3309             {
3310               /* Mark the rest of the basic-block as unvectorizable.  */
3311               for (; !gsi_end_p (gsi); gsi_next (&gsi))
3312                 {
3313                   stmt = gsi_stmt (gsi);
3314                   STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (stmt)) = false;
3315                 }
3316               break;
3317             }
3318         }
3319
3320       datarefs = BB_VINFO_DATAREFS (bb_vinfo);
3321     }
3322
3323   /* Go through the data-refs, check that the analysis succeeded.  Update
3324      pointer from stmt_vec_info struct to DR and vectype.  */
3325
3326   FOR_EACH_VEC_ELT (datarefs, i, dr)
3327     {
3328       gimple stmt;
3329       stmt_vec_info stmt_info;
3330       tree base, offset, init;
3331       bool gather = false;
3332       bool simd_lane_access = false;
3333       int vf;
3334
3335 again:
3336       if (!dr || !DR_REF (dr))
3337         {
3338           if (dump_enabled_p ())
3339             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3340                              "not vectorized: unhandled data-ref\n");
3341           return false;
3342         }
3343
3344       stmt = DR_STMT (dr);
3345       stmt_info = vinfo_for_stmt (stmt);
3346
3347       /* Discard clobbers from the dataref vector.  We will remove
3348          clobber stmts during vectorization.  */
3349       if (gimple_clobber_p (stmt))
3350         {
3351           free_data_ref (dr);
3352           if (i == datarefs.length () - 1)
3353             {
3354               datarefs.pop ();
3355               break;
3356             }
3357           datarefs.ordered_remove (i);
3358           dr = datarefs[i];
3359           goto again;
3360         }
3361
3362       /* Check that analysis of the data-ref succeeded.  */
3363       if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
3364           || !DR_STEP (dr))
3365         {
3366           bool maybe_gather
3367             = DR_IS_READ (dr)
3368               && !TREE_THIS_VOLATILE (DR_REF (dr))
3369               && targetm.vectorize.builtin_gather != NULL;
3370           bool maybe_simd_lane_access
3371             = loop_vinfo && loop->simduid;
3372
3373           /* If target supports vector gather loads, or if this might be
3374              a SIMD lane access, see if they can't be used.  */
3375           if (loop_vinfo
3376               && (maybe_gather || maybe_simd_lane_access)
3377               && !nested_in_vect_loop_p (loop, stmt))
3378             {
3379               struct data_reference *newdr
3380                 = create_data_ref (NULL, loop_containing_stmt (stmt),
3381                                    DR_REF (dr), stmt, true);
3382               gcc_assert (newdr != NULL && DR_REF (newdr));
3383               if (DR_BASE_ADDRESS (newdr)
3384                   && DR_OFFSET (newdr)
3385                   && DR_INIT (newdr)
3386                   && DR_STEP (newdr)
3387                   && integer_zerop (DR_STEP (newdr)))
3388                 {
3389                   if (maybe_simd_lane_access)
3390                     {
3391                       tree off = DR_OFFSET (newdr);
3392                       STRIP_NOPS (off);
3393                       if (TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
3394                           && TREE_CODE (off) == MULT_EXPR
3395                           && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
3396                         {
3397                           tree step = TREE_OPERAND (off, 1);
3398                           off = TREE_OPERAND (off, 0);
3399                           STRIP_NOPS (off);
3400                           if (CONVERT_EXPR_P (off)
3401                               && TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off,
3402                                                                           0)))
3403                                  < TYPE_PRECISION (TREE_TYPE (off)))
3404                             off = TREE_OPERAND (off, 0);
3405                           if (TREE_CODE (off) == SSA_NAME)
3406                             {
3407                               gimple def = SSA_NAME_DEF_STMT (off);
3408                               tree reft = TREE_TYPE (DR_REF (newdr));
3409                               if (is_gimple_call (def)
3410                                   && gimple_call_internal_p (def)
3411                                   && (gimple_call_internal_fn (def)
3412                                       == IFN_GOMP_SIMD_LANE))
3413                                 {
3414                                   tree arg = gimple_call_arg (def, 0);
3415                                   gcc_assert (TREE_CODE (arg) == SSA_NAME);
3416                                   arg = SSA_NAME_VAR (arg);
3417                                   if (arg == loop->simduid
3418                                       /* For now.  */
3419                                       && tree_int_cst_equal
3420                                            (TYPE_SIZE_UNIT (reft),
3421                                             step))
3422                                     {
3423                                       DR_OFFSET (newdr) = ssize_int (0);
3424                                       DR_STEP (newdr) = step;
3425                                       DR_ALIGNED_TO (newdr)
3426                                         = size_int (BIGGEST_ALIGNMENT);
3427                                       dr = newdr;
3428                                       simd_lane_access = true;
3429                                     }
3430                                 }
3431                             }
3432                         }
3433                     }
3434                   if (!simd_lane_access && maybe_gather)
3435                     {
3436                       dr = newdr;
3437                       gather = true;
3438                     }
3439                 }
3440               if (!gather && !simd_lane_access)
3441                 free_data_ref (newdr);
3442             }
3443
3444           if (!gather && !simd_lane_access)
3445             {
3446               if (dump_enabled_p ())
3447                 {
3448                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3449                                    "not vectorized: data ref analysis "
3450                                    "failed ");
3451                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3452                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3453                 }
3454
3455               if (bb_vinfo)
3456                 break;
3457
3458               return false;
3459             }
3460         }
3461
3462       if (TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
3463         {
3464           if (dump_enabled_p ())
3465             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3466                              "not vectorized: base addr of dr is a "
3467                              "constant\n");
3468
3469           if (bb_vinfo)
3470             break;
3471
3472           if (gather || simd_lane_access)
3473             free_data_ref (dr);
3474           return false;
3475         }
3476
3477       if (TREE_THIS_VOLATILE (DR_REF (dr)))
3478         {
3479           if (dump_enabled_p ())
3480             {
3481               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3482                                "not vectorized: volatile type ");
3483               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3484               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3485             }
3486
3487           if (bb_vinfo)
3488             break;
3489
3490           return false;
3491         }
3492
3493       if (stmt_can_throw_internal (stmt))
3494         {
3495           if (dump_enabled_p ())
3496             {
3497               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3498                                "not vectorized: statement can throw an "
3499                                "exception ");
3500               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3501               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3502             }
3503
3504           if (bb_vinfo)
3505             break;
3506
3507           if (gather || simd_lane_access)
3508             free_data_ref (dr);
3509           return false;
3510         }
3511
3512       if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
3513           && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
3514         {
3515           if (dump_enabled_p ())
3516             {
3517               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3518                                "not vectorized: statement is bitfield "
3519                                "access ");
3520               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3521               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3522             }
3523
3524           if (bb_vinfo)
3525             break;
3526
3527           if (gather || simd_lane_access)
3528             free_data_ref (dr);
3529           return false;
3530         }
3531
3532       base = unshare_expr (DR_BASE_ADDRESS (dr));
3533       offset = unshare_expr (DR_OFFSET (dr));
3534       init = unshare_expr (DR_INIT (dr));
3535
3536       if (is_gimple_call (stmt)
3537           && (!gimple_call_internal_p (stmt)
3538               || (gimple_call_internal_fn (stmt) != IFN_MASK_LOAD
3539                   && gimple_call_internal_fn (stmt) != IFN_MASK_STORE)))
3540         {
3541           if (dump_enabled_p ())
3542             {
3543               dump_printf_loc (MSG_MISSED_OPTIMIZATION,  vect_location,
3544                                "not vectorized: dr in a call ");
3545               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3546               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3547             }
3548
3549           if (bb_vinfo)
3550             break;
3551
3552           if (gather || simd_lane_access)
3553             free_data_ref (dr);
3554           return false;
3555         }
3556
3557       /* Update DR field in stmt_vec_info struct.  */
3558
3559       /* If the dataref is in an inner-loop of the loop that is considered for
3560          for vectorization, we also want to analyze the access relative to
3561          the outer-loop (DR contains information only relative to the
3562          inner-most enclosing loop).  We do that by building a reference to the
3563          first location accessed by the inner-loop, and analyze it relative to
3564          the outer-loop.  */
3565       if (loop && nested_in_vect_loop_p (loop, stmt))
3566         {
3567           tree outer_step, outer_base, outer_init;
3568           HOST_WIDE_INT pbitsize, pbitpos;
3569           tree poffset;
3570           machine_mode pmode;
3571           int punsignedp, pvolatilep;
3572           affine_iv base_iv, offset_iv;
3573           tree dinit;
3574
3575           /* Build a reference to the first location accessed by the
3576              inner-loop: *(BASE+INIT).  (The first location is actually
3577              BASE+INIT+OFFSET, but we add OFFSET separately later).  */
3578           tree inner_base = build_fold_indirect_ref
3579                                 (fold_build_pointer_plus (base, init));
3580
3581           if (dump_enabled_p ())
3582             {
3583               dump_printf_loc (MSG_NOTE, vect_location,
3584                                "analyze in outer-loop: ");
3585               dump_generic_expr (MSG_NOTE, TDF_SLIM, inner_base);
3586               dump_printf (MSG_NOTE, "\n");
3587             }
3588
3589           outer_base = get_inner_reference (inner_base, &pbitsize, &pbitpos,
3590                           &poffset, &pmode, &punsignedp, &pvolatilep, false);
3591           gcc_assert (outer_base != NULL_TREE);
3592
3593           if (pbitpos % BITS_PER_UNIT != 0)
3594             {
3595               if (dump_enabled_p ())
3596                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3597                                  "failed: bit offset alignment.\n");
3598               return false;
3599             }
3600
3601           outer_base = build_fold_addr_expr (outer_base);
3602           if (!simple_iv (loop, loop_containing_stmt (stmt), outer_base,
3603                           &base_iv, false))
3604             {
3605               if (dump_enabled_p ())
3606                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3607                                  "failed: evolution of base is not affine.\n");
3608               return false;
3609             }
3610
3611           if (offset)
3612             {
3613               if (poffset)
3614                 poffset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset), offset,
3615                                        poffset);
3616               else
3617                 poffset = offset;
3618             }
3619
3620           if (!poffset)
3621             {
3622               offset_iv.base = ssize_int (0);
3623               offset_iv.step = ssize_int (0);
3624             }
3625           else if (!simple_iv (loop, loop_containing_stmt (stmt), poffset,
3626                                &offset_iv, false))
3627             {
3628               if (dump_enabled_p ())
3629                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3630                                  "evolution of offset is not affine.\n");
3631               return false;
3632             }
3633
3634           outer_init = ssize_int (pbitpos / BITS_PER_UNIT);
3635           split_constant_offset (base_iv.base, &base_iv.base, &dinit);
3636           outer_init =  size_binop (PLUS_EXPR, outer_init, dinit);
3637           split_constant_offset (offset_iv.base, &offset_iv.base, &dinit);
3638           outer_init =  size_binop (PLUS_EXPR, outer_init, dinit);
3639
3640           outer_step = size_binop (PLUS_EXPR,
3641                                 fold_convert (ssizetype, base_iv.step),
3642                                 fold_convert (ssizetype, offset_iv.step));
3643
3644           STMT_VINFO_DR_STEP (stmt_info) = outer_step;
3645           /* FIXME: Use canonicalize_base_object_address (base_iv.base); */
3646           STMT_VINFO_DR_BASE_ADDRESS (stmt_info) = base_iv.base;
3647           STMT_VINFO_DR_INIT (stmt_info) = outer_init;
3648           STMT_VINFO_DR_OFFSET (stmt_info) =
3649                                 fold_convert (ssizetype, offset_iv.base);
3650           STMT_VINFO_DR_ALIGNED_TO (stmt_info) =
3651                                 size_int (highest_pow2_factor (offset_iv.base));
3652
3653           if (dump_enabled_p ())
3654             {
3655               dump_printf_loc (MSG_NOTE, vect_location,
3656                                "\touter base_address: ");
3657               dump_generic_expr (MSG_NOTE, TDF_SLIM,
3658                                  STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
3659               dump_printf (MSG_NOTE, "\n\touter offset from base address: ");
3660               dump_generic_expr (MSG_NOTE, TDF_SLIM,
3661                                  STMT_VINFO_DR_OFFSET (stmt_info));
3662               dump_printf (MSG_NOTE,
3663                            "\n\touter constant offset from base address: ");
3664               dump_generic_expr (MSG_NOTE, TDF_SLIM,
3665                                  STMT_VINFO_DR_INIT (stmt_info));
3666               dump_printf (MSG_NOTE, "\n\touter step: ");
3667               dump_generic_expr (MSG_NOTE, TDF_SLIM,
3668                                  STMT_VINFO_DR_STEP (stmt_info));
3669               dump_printf (MSG_NOTE, "\n\touter aligned to: ");
3670               dump_generic_expr (MSG_NOTE, TDF_SLIM,
3671                                  STMT_VINFO_DR_ALIGNED_TO (stmt_info));
3672               dump_printf (MSG_NOTE, "\n");
3673             }
3674         }
3675
3676       if (STMT_VINFO_DATA_REF (stmt_info))
3677         {
3678           if (dump_enabled_p ())
3679             {
3680               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3681                                "not vectorized: more than one data ref "
3682                                "in stmt: ");
3683               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3684               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3685             }
3686
3687           if (bb_vinfo)
3688             break;
3689
3690           if (gather || simd_lane_access)
3691             free_data_ref (dr);
3692           return false;
3693         }
3694
3695       STMT_VINFO_DATA_REF (stmt_info) = dr;
3696       if (simd_lane_access)
3697         {
3698           STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) = true;
3699           free_data_ref (datarefs[i]);
3700           datarefs[i] = dr;
3701         }
3702
3703       /* Set vectype for STMT.  */
3704       scalar_type = TREE_TYPE (DR_REF (dr));
3705       STMT_VINFO_VECTYPE (stmt_info)
3706         = get_vectype_for_scalar_type (scalar_type);
3707       if (!STMT_VINFO_VECTYPE (stmt_info))
3708         {
3709           if (dump_enabled_p ())
3710             {
3711               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3712                                "not vectorized: no vectype for stmt: ");
3713               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3714               dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
3715               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
3716                                  scalar_type);
3717               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3718             }
3719
3720           if (bb_vinfo)
3721             break;
3722
3723           if (gather || simd_lane_access)
3724             {
3725               STMT_VINFO_DATA_REF (stmt_info) = NULL;
3726               if (gather)
3727                 free_data_ref (dr);
3728             }
3729           return false;
3730         }
3731       else
3732         {
3733           if (dump_enabled_p ())
3734             {
3735               dump_printf_loc (MSG_NOTE, vect_location,
3736                                "got vectype for stmt: ");
3737               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
3738               dump_generic_expr (MSG_NOTE, TDF_SLIM,
3739                                  STMT_VINFO_VECTYPE (stmt_info));
3740               dump_printf (MSG_NOTE, "\n");
3741             }
3742         }
3743
3744       /* Adjust the minimal vectorization factor according to the
3745          vector type.  */
3746       vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
3747       if (vf > *min_vf)
3748         *min_vf = vf;
3749
3750       if (gather)
3751         {
3752           tree off;
3753
3754           gather = 0 != vect_check_gather (stmt, loop_vinfo, NULL, &off, NULL);
3755           if (gather
3756               && get_vectype_for_scalar_type (TREE_TYPE (off)) == NULL_TREE)
3757             gather = false;
3758           if (!gather)
3759             {
3760               STMT_VINFO_DATA_REF (stmt_info) = NULL;
3761               free_data_ref (dr);
3762               if (dump_enabled_p ())
3763                 {
3764                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3765                                    "not vectorized: not suitable for gather "
3766                                    "load ");
3767                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3768                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3769                 }
3770               return false;
3771             }
3772
3773           datarefs[i] = dr;
3774           STMT_VINFO_GATHER_P (stmt_info) = true;
3775         }
3776       else if (loop_vinfo
3777                && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
3778         {
3779           if (nested_in_vect_loop_p (loop, stmt)
3780               || !DR_IS_READ (dr))
3781             {
3782               if (dump_enabled_p ())
3783                 {
3784                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3785                                    "not vectorized: not suitable for strided "
3786                                    "load ");
3787                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3788                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3789                 }
3790               return false;
3791             }
3792           STMT_VINFO_STRIDE_LOAD_P (stmt_info) = true;
3793         }
3794     }
3795
3796   /* If we stopped analysis at the first dataref we could not analyze
3797      when trying to vectorize a basic-block mark the rest of the datarefs
3798      as not vectorizable and truncate the vector of datarefs.  That
3799      avoids spending useless time in analyzing their dependence.  */
3800   if (i != datarefs.length ())
3801     {
3802       gcc_assert (bb_vinfo != NULL);
3803       for (unsigned j = i; j < datarefs.length (); ++j)
3804         {
3805           data_reference_p dr = datarefs[j];
3806           STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
3807           free_data_ref (dr);
3808         }
3809       datarefs.truncate (i);
3810     }
3811
3812   return true;
3813 }
3814
3815
3816 /* Function vect_get_new_vect_var.
3817
3818    Returns a name for a new variable.  The current naming scheme appends the
3819    prefix "vect_" or "vect_p" (depending on the value of VAR_KIND) to
3820    the name of vectorizer generated variables, and appends that to NAME if
3821    provided.  */
3822
3823 tree
3824 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
3825 {
3826   const char *prefix;
3827   tree new_vect_var;
3828
3829   switch (var_kind)
3830   {
3831   case vect_simple_var:
3832     prefix = "vect";
3833     break;
3834   case vect_scalar_var:
3835     prefix = "stmp";
3836     break;
3837   case vect_pointer_var:
3838     prefix = "vectp";
3839     break;
3840   default:
3841     gcc_unreachable ();
3842   }
3843
3844   if (name)
3845     {
3846       char* tmp = concat (prefix, "_", name, NULL);
3847       new_vect_var = create_tmp_reg (type, tmp);
3848       free (tmp);
3849     }
3850   else
3851     new_vect_var = create_tmp_reg (type, prefix);
3852
3853   return new_vect_var;
3854 }
3855
3856
3857 /* Function vect_create_addr_base_for_vector_ref.
3858
3859    Create an expression that computes the address of the first memory location
3860    that will be accessed for a data reference.
3861
3862    Input:
3863    STMT: The statement containing the data reference.
3864    NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
3865    OFFSET: Optional. If supplied, it is added to the initial address.
3866    LOOP:    Specify relative to which loop-nest should the address be computed.
3867             For example, when the dataref is in an inner-loop nested in an
3868             outer-loop that is now being vectorized, LOOP can be either the
3869             outer-loop, or the inner-loop.  The first memory location accessed
3870             by the following dataref ('in' points to short):
3871
3872                 for (i=0; i<N; i++)
3873                    for (j=0; j<M; j++)
3874                      s += in[i+j]
3875
3876             is as follows:
3877             if LOOP=i_loop:     &in             (relative to i_loop)
3878             if LOOP=j_loop:     &in+i*2B        (relative to j_loop)
3879    BYTE_OFFSET: Optional, defaulted to NULL.  If supplied, it is added to the
3880             initial address.  Unlike OFFSET, which is number of elements to
3881             be added, BYTE_OFFSET is measured in bytes.
3882
3883    Output:
3884    1. Return an SSA_NAME whose value is the address of the memory location of
3885       the first vector of the data reference.
3886    2. If new_stmt_list is not NULL_TREE after return then the caller must insert
3887       these statement(s) which define the returned SSA_NAME.
3888
3889    FORNOW: We are only handling array accesses with step 1.  */
3890
3891 tree
3892 vect_create_addr_base_for_vector_ref (gimple stmt,
3893                                       gimple_seq *new_stmt_list,
3894                                       tree offset,
3895                                       struct loop *loop,
3896                                       tree byte_offset)
3897 {
3898   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3899   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3900   tree data_ref_base;
3901   const char *base_name;
3902   tree addr_base;
3903   tree dest;
3904   gimple_seq seq = NULL;
3905   tree base_offset;
3906   tree init;
3907   tree vect_ptr_type;
3908   tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
3909   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3910
3911   if (loop_vinfo && loop && loop != (gimple_bb (stmt))->loop_father)
3912     {
3913       struct loop *outer_loop = LOOP_VINFO_LOOP (loop_vinfo);
3914
3915       gcc_assert (nested_in_vect_loop_p (outer_loop, stmt));
3916
3917       data_ref_base = unshare_expr (STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
3918       base_offset = unshare_expr (STMT_VINFO_DR_OFFSET (stmt_info));
3919       init = unshare_expr (STMT_VINFO_DR_INIT (stmt_info));
3920     }
3921   else
3922     {
3923       data_ref_base = unshare_expr (DR_BASE_ADDRESS (dr));
3924       base_offset = unshare_expr (DR_OFFSET (dr));
3925       init = unshare_expr (DR_INIT (dr));
3926     }
3927
3928   if (loop_vinfo)
3929     base_name = get_name (data_ref_base);
3930   else
3931     {
3932       base_offset = ssize_int (0);
3933       init = ssize_int (0);
3934       base_name = get_name (DR_REF (dr));
3935     }
3936
3937   /* Create base_offset */
3938   base_offset = size_binop (PLUS_EXPR,
3939                             fold_convert (sizetype, base_offset),
3940                             fold_convert (sizetype, init));
3941
3942   if (offset)
3943     {
3944       offset = fold_build2 (MULT_EXPR, sizetype,
3945                             fold_convert (sizetype, offset), step);
3946       base_offset = fold_build2 (PLUS_EXPR, sizetype,
3947                                  base_offset, offset);
3948     }
3949   if (byte_offset)
3950     {
3951       byte_offset = fold_convert (sizetype, byte_offset);
3952       base_offset = fold_build2 (PLUS_EXPR, sizetype,
3953                                  base_offset, byte_offset);
3954     }
3955
3956   /* base + base_offset */
3957   if (loop_vinfo)
3958     addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
3959   else
3960     {
3961       addr_base = build1 (ADDR_EXPR,
3962                           build_pointer_type (TREE_TYPE (DR_REF (dr))),
3963                           unshare_expr (DR_REF (dr)));
3964     }
3965
3966   vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
3967   addr_base = fold_convert (vect_ptr_type, addr_base);
3968   dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
3969   addr_base = force_gimple_operand (addr_base, &seq, false, dest);
3970   gimple_seq_add_seq (new_stmt_list, seq);
3971
3972   if (DR_PTR_INFO (dr)
3973       && TREE_CODE (addr_base) == SSA_NAME)
3974     {
3975       duplicate_ssa_name_ptr_info (addr_base, DR_PTR_INFO (dr));
3976       unsigned int align = TYPE_ALIGN_UNIT (STMT_VINFO_VECTYPE (stmt_info));
3977       int misalign = DR_MISALIGNMENT (dr);
3978       if (offset || byte_offset || (misalign == -1))
3979         mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
3980       else
3981         set_ptr_info_alignment (SSA_NAME_PTR_INFO (addr_base), align, misalign);
3982     }
3983
3984   if (dump_enabled_p ())
3985     {
3986       dump_printf_loc (MSG_NOTE, vect_location, "created ");
3987       dump_generic_expr (MSG_NOTE, TDF_SLIM, addr_base);
3988       dump_printf (MSG_NOTE, "\n");
3989     }
3990
3991   return addr_base;
3992 }
3993
3994
3995 /* Function vect_create_data_ref_ptr.
3996
3997    Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
3998    location accessed in the loop by STMT, along with the def-use update
3999    chain to appropriately advance the pointer through the loop iterations.
4000    Also set aliasing information for the pointer.  This pointer is used by
4001    the callers to this function to create a memory reference expression for
4002    vector load/store access.
4003
4004    Input:
4005    1. STMT: a stmt that references memory. Expected to be of the form
4006          GIMPLE_ASSIGN <name, data-ref> or
4007          GIMPLE_ASSIGN <data-ref, name>.
4008    2. AGGR_TYPE: the type of the reference, which should be either a vector
4009         or an array.
4010    3. AT_LOOP: the loop where the vector memref is to be created.
4011    4. OFFSET (optional): an offset to be added to the initial address accessed
4012         by the data-ref in STMT.
4013    5. BSI: location where the new stmts are to be placed if there is no loop
4014    6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
4015         pointing to the initial address.
4016    7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
4017         to the initial address accessed by the data-ref in STMT.  This is
4018         similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
4019         in bytes.
4020
4021    Output:
4022    1. Declare a new ptr to vector_type, and have it point to the base of the
4023       data reference (initial addressed accessed by the data reference).
4024       For example, for vector of type V8HI, the following code is generated:
4025
4026       v8hi *ap;
4027       ap = (v8hi *)initial_address;
4028
4029       if OFFSET is not supplied:
4030          initial_address = &a[init];
4031       if OFFSET is supplied:
4032          initial_address = &a[init + OFFSET];
4033       if BYTE_OFFSET is supplied:
4034          initial_address = &a[init] + BYTE_OFFSET;
4035
4036       Return the initial_address in INITIAL_ADDRESS.
4037
4038    2. If ONLY_INIT is true, just return the initial pointer.  Otherwise, also
4039       update the pointer in each iteration of the loop.
4040
4041       Return the increment stmt that updates the pointer in PTR_INCR.
4042
4043    3. Set INV_P to true if the access pattern of the data reference in the
4044       vectorized loop is invariant.  Set it to false otherwise.
4045
4046    4. Return the pointer.  */
4047
4048 tree
4049 vect_create_data_ref_ptr (gimple stmt, tree aggr_type, struct loop *at_loop,
4050                           tree offset, tree *initial_address,
4051                           gimple_stmt_iterator *gsi, gimple *ptr_incr,
4052                           bool only_init, bool *inv_p, tree byte_offset)
4053 {
4054   const char *base_name;
4055   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4056   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4057   struct loop *loop = NULL;
4058   bool nested_in_vect_loop = false;
4059   struct loop *containing_loop = NULL;
4060   tree aggr_ptr_type;
4061   tree aggr_ptr;
4062   tree new_temp;
4063   gimple vec_stmt;
4064   gimple_seq new_stmt_list = NULL;
4065   edge pe = NULL;
4066   basic_block new_bb;
4067   tree aggr_ptr_init;
4068   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4069   tree aptr;
4070   gimple_stmt_iterator incr_gsi;
4071   bool insert_after;
4072   tree indx_before_incr, indx_after_incr;
4073   gimple incr;
4074   tree step;
4075   bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
4076
4077   gcc_assert (TREE_CODE (aggr_type) == ARRAY_TYPE
4078               || TREE_CODE (aggr_type) == VECTOR_TYPE);
4079
4080   if (loop_vinfo)
4081     {
4082       loop = LOOP_VINFO_LOOP (loop_vinfo);
4083       nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4084       containing_loop = (gimple_bb (stmt))->loop_father;
4085       pe = loop_preheader_edge (loop);
4086     }
4087   else
4088     {
4089       gcc_assert (bb_vinfo);
4090       only_init = true;
4091       *ptr_incr = NULL;
4092     }
4093
4094   /* Check the step (evolution) of the load in LOOP, and record
4095      whether it's invariant.  */
4096   if (nested_in_vect_loop)
4097     step = STMT_VINFO_DR_STEP (stmt_info);
4098   else
4099     step = DR_STEP (STMT_VINFO_DATA_REF (stmt_info));
4100
4101   if (integer_zerop (step))
4102     *inv_p = true;
4103   else
4104     *inv_p = false;
4105
4106   /* Create an expression for the first address accessed by this load
4107      in LOOP.  */
4108   base_name = get_name (DR_BASE_ADDRESS (dr));
4109
4110   if (dump_enabled_p ())
4111     {
4112       tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4113       dump_printf_loc (MSG_NOTE, vect_location,
4114                        "create %s-pointer variable to type: ",
4115                        get_tree_code_name (TREE_CODE (aggr_type)));
4116       dump_generic_expr (MSG_NOTE, TDF_SLIM, aggr_type);
4117       if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4118         dump_printf (MSG_NOTE, "  vectorizing an array ref: ");
4119       else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4120         dump_printf (MSG_NOTE, "  vectorizing a vector ref: ");
4121       else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4122         dump_printf (MSG_NOTE, "  vectorizing a record based array ref: ");
4123       else
4124         dump_printf (MSG_NOTE, "  vectorizing a pointer ref: ");
4125       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_BASE_OBJECT (dr));
4126       dump_printf (MSG_NOTE, "\n");
4127     }
4128
4129   /* (1) Create the new aggregate-pointer variable.
4130      Vector and array types inherit the alias set of their component
4131      type by default so we need to use a ref-all pointer if the data
4132      reference does not conflict with the created aggregated data
4133      reference because it is not addressable.  */
4134   bool need_ref_all = false;
4135   if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4136                               get_alias_set (DR_REF (dr))))
4137     need_ref_all = true;
4138   /* Likewise for any of the data references in the stmt group.  */
4139   else if (STMT_VINFO_GROUP_SIZE (stmt_info) > 1)
4140     {
4141       gimple orig_stmt = STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info);
4142       do
4143         {
4144           stmt_vec_info sinfo = vinfo_for_stmt (orig_stmt);
4145           struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
4146           if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4147                                       get_alias_set (DR_REF (sdr))))
4148             {
4149               need_ref_all = true;
4150               break;
4151             }
4152           orig_stmt = STMT_VINFO_GROUP_NEXT_ELEMENT (sinfo);
4153         }
4154       while (orig_stmt);
4155     }
4156   aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
4157                                                need_ref_all);
4158   aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
4159
4160
4161   /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4162      vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4163      def-use update cycles for the pointer: one relative to the outer-loop
4164      (LOOP), which is what steps (3) and (4) below do.  The other is relative
4165      to the inner-loop (which is the inner-most loop containing the dataref),
4166      and this is done be step (5) below.
4167
4168      When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4169      inner-most loop, and so steps (3),(4) work the same, and step (5) is
4170      redundant.  Steps (3),(4) create the following:
4171
4172         vp0 = &base_addr;
4173         LOOP:   vp1 = phi(vp0,vp2)
4174                 ...
4175                 ...
4176                 vp2 = vp1 + step
4177                 goto LOOP
4178
4179      If there is an inner-loop nested in loop, then step (5) will also be
4180      applied, and an additional update in the inner-loop will be created:
4181
4182         vp0 = &base_addr;
4183         LOOP:   vp1 = phi(vp0,vp2)
4184                 ...
4185         inner:     vp3 = phi(vp1,vp4)
4186                    vp4 = vp3 + inner_step
4187                    if () goto inner
4188                 ...
4189                 vp2 = vp1 + step
4190                 if () goto LOOP   */
4191
4192   /* (2) Calculate the initial address of the aggregate-pointer, and set
4193      the aggregate-pointer to point to it before the loop.  */
4194
4195   /* Create: (&(base[init_val+offset]+byte_offset) in the loop preheader.  */
4196
4197   new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
4198                                                    offset, loop, byte_offset);
4199   if (new_stmt_list)
4200     {
4201       if (pe)
4202         {
4203           new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
4204           gcc_assert (!new_bb);
4205         }
4206       else
4207         gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
4208     }
4209
4210   *initial_address = new_temp;
4211
4212   /* Create: p = (aggr_type *) initial_base  */
4213   if (TREE_CODE (new_temp) != SSA_NAME
4214       || !useless_type_conversion_p (aggr_ptr_type, TREE_TYPE (new_temp)))
4215     {
4216       vec_stmt = gimple_build_assign (aggr_ptr,
4217                                       fold_convert (aggr_ptr_type, new_temp));
4218       aggr_ptr_init = make_ssa_name (aggr_ptr, vec_stmt);
4219       /* Copy the points-to information if it exists. */
4220       if (DR_PTR_INFO (dr))
4221         duplicate_ssa_name_ptr_info (aggr_ptr_init, DR_PTR_INFO (dr));
4222       gimple_assign_set_lhs (vec_stmt, aggr_ptr_init);
4223       if (pe)
4224         {
4225           new_bb = gsi_insert_on_edge_immediate (pe, vec_stmt);
4226           gcc_assert (!new_bb);
4227         }
4228       else
4229         gsi_insert_before (gsi, vec_stmt, GSI_SAME_STMT);
4230     }
4231   else
4232     aggr_ptr_init = new_temp;
4233
4234   /* (3) Handle the updating of the aggregate-pointer inside the loop.
4235      This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4236      inner-loop nested in LOOP (during outer-loop vectorization).  */
4237
4238   /* No update in loop is required.  */
4239   if (only_init && (!loop_vinfo || at_loop == loop))
4240     aptr = aggr_ptr_init;
4241   else
4242     {
4243       /* The step of the aggregate pointer is the type size.  */
4244       tree iv_step = TYPE_SIZE_UNIT (aggr_type);
4245       /* One exception to the above is when the scalar step of the load in
4246          LOOP is zero. In this case the step here is also zero.  */
4247       if (*inv_p)
4248         iv_step = size_zero_node;
4249       else if (tree_int_cst_sgn (step) == -1)
4250         iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
4251
4252       standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4253
4254       create_iv (aggr_ptr_init,
4255                  fold_convert (aggr_ptr_type, iv_step),
4256                  aggr_ptr, loop, &incr_gsi, insert_after,
4257                  &indx_before_incr, &indx_after_incr);
4258       incr = gsi_stmt (incr_gsi);
4259       set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo, NULL));
4260
4261       /* Copy the points-to information if it exists. */
4262       if (DR_PTR_INFO (dr))
4263         {
4264           duplicate_ssa_name_ptr_info (indx_before_incr, DR_PTR_INFO (dr));
4265           duplicate_ssa_name_ptr_info (indx_after_incr, DR_PTR_INFO (dr));
4266         }
4267       if (ptr_incr)
4268         *ptr_incr = incr;
4269
4270       aptr = indx_before_incr;
4271     }
4272
4273   if (!nested_in_vect_loop || only_init)
4274     return aptr;
4275
4276
4277   /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4278      nested in LOOP, if exists.  */
4279
4280   gcc_assert (nested_in_vect_loop);
4281   if (!only_init)
4282     {
4283       standard_iv_increment_position (containing_loop, &incr_gsi,
4284                                       &insert_after);
4285       create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
4286                  containing_loop, &incr_gsi, insert_after, &indx_before_incr,
4287                  &indx_after_incr);
4288       incr = gsi_stmt (incr_gsi);
4289       set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo, NULL));
4290
4291       /* Copy the points-to information if it exists. */
4292       if (DR_PTR_INFO (dr))
4293         {
4294           duplicate_ssa_name_ptr_info (indx_before_incr, DR_PTR_INFO (dr));
4295           duplicate_ssa_name_ptr_info (indx_after_incr, DR_PTR_INFO (dr));
4296         }
4297       if (ptr_incr)
4298         *ptr_incr = incr;
4299
4300       return indx_before_incr;
4301     }
4302   else
4303     gcc_unreachable ();
4304 }
4305
4306
4307 /* Function bump_vector_ptr
4308
4309    Increment a pointer (to a vector type) by vector-size. If requested,
4310    i.e. if PTR-INCR is given, then also connect the new increment stmt
4311    to the existing def-use update-chain of the pointer, by modifying
4312    the PTR_INCR as illustrated below:
4313
4314    The pointer def-use update-chain before this function:
4315                         DATAREF_PTR = phi (p_0, p_2)
4316                         ....
4317         PTR_INCR:       p_2 = DATAREF_PTR + step
4318
4319    The pointer def-use update-chain after this function:
4320                         DATAREF_PTR = phi (p_0, p_2)
4321                         ....
4322                         NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4323                         ....
4324         PTR_INCR:       p_2 = NEW_DATAREF_PTR + step
4325
4326    Input:
4327    DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4328                  in the loop.
4329    PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4330               the loop.  The increment amount across iterations is expected
4331               to be vector_size.
4332    BSI - location where the new update stmt is to be placed.
4333    STMT - the original scalar memory-access stmt that is being vectorized.
4334    BUMP - optional. The offset by which to bump the pointer. If not given,
4335           the offset is assumed to be vector_size.
4336
4337    Output: Return NEW_DATAREF_PTR as illustrated above.
4338
4339 */
4340
4341 tree
4342 bump_vector_ptr (tree dataref_ptr, gimple ptr_incr, gimple_stmt_iterator *gsi,
4343                  gimple stmt, tree bump)
4344 {
4345   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4346   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4347   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4348   tree update = TYPE_SIZE_UNIT (vectype);
4349   gimple incr_stmt;
4350   ssa_op_iter iter;
4351   use_operand_p use_p;
4352   tree new_dataref_ptr;
4353
4354   if (bump)
4355     update = bump;
4356
4357   new_dataref_ptr = copy_ssa_name (dataref_ptr, NULL);
4358   incr_stmt = gimple_build_assign_with_ops (POINTER_PLUS_EXPR, new_dataref_ptr,
4359                                             dataref_ptr, update);
4360   vect_finish_stmt_generation (stmt, incr_stmt, gsi);
4361
4362   /* Copy the points-to information if it exists. */
4363   if (DR_PTR_INFO (dr))
4364     {
4365       duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
4366       mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
4367     }
4368
4369   if (!ptr_incr)
4370     return new_dataref_ptr;
4371
4372   /* Update the vector-pointer's cross-iteration increment.  */
4373   FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
4374     {
4375       tree use = USE_FROM_PTR (use_p);
4376
4377       if (use == dataref_ptr)
4378         SET_USE (use_p, new_dataref_ptr);
4379       else
4380         gcc_assert (tree_int_cst_compare (use, update) == 0);
4381     }
4382
4383   return new_dataref_ptr;
4384 }
4385
4386
4387 /* Function vect_create_destination_var.
4388
4389    Create a new temporary of type VECTYPE.  */
4390
4391 tree
4392 vect_create_destination_var (tree scalar_dest, tree vectype)
4393 {
4394   tree vec_dest;
4395   const char *name;
4396   char *new_name;
4397   tree type;
4398   enum vect_var_kind kind;
4399
4400   kind = vectype ? vect_simple_var : vect_scalar_var;
4401   type = vectype ? vectype : TREE_TYPE (scalar_dest);
4402
4403   gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
4404
4405   name = get_name (scalar_dest);
4406   if (name)
4407     asprintf (&new_name, "%s_%u", name, SSA_NAME_VERSION (scalar_dest));
4408   else
4409     asprintf (&new_name, "_%u", SSA_NAME_VERSION (scalar_dest));
4410   vec_dest = vect_get_new_vect_var (type, kind, new_name);
4411   free (new_name);
4412
4413   return vec_dest;
4414 }
4415
4416 /* Function vect_grouped_store_supported.
4417
4418    Returns TRUE if interleave high and interleave low permutations
4419    are supported, and FALSE otherwise.  */
4420
4421 bool
4422 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
4423 {
4424   machine_mode mode = TYPE_MODE (vectype);
4425
4426   /* vect_permute_store_chain requires the group size to be equal to 3 or
4427      be a power of two.  */
4428   if (count != 3 && exact_log2 (count) == -1)
4429     {
4430       if (dump_enabled_p ())
4431         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4432                          "the size of the group of accesses"
4433                          " is not a power of 2 or not eqaul to 3\n");
4434       return false;
4435     }
4436
4437   /* Check that the permutation is supported.  */
4438   if (VECTOR_MODE_P (mode))
4439     {
4440       unsigned int i, nelt = GET_MODE_NUNITS (mode);
4441       unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4442
4443       if (count == 3)
4444         {
4445           unsigned int j0 = 0, j1 = 0, j2 = 0;
4446           unsigned int i, j;
4447
4448           for (j = 0; j < 3; j++)
4449             {
4450               int nelt0 = ((3 - j) * nelt) % 3;
4451               int nelt1 = ((3 - j) * nelt + 1) % 3;
4452               int nelt2 = ((3 - j) * nelt + 2) % 3;
4453               for (i = 0; i < nelt; i++)
4454                 {
4455                   if (3 * i + nelt0 < nelt)
4456                     sel[3 * i + nelt0] = j0++;
4457                   if (3 * i + nelt1 < nelt)
4458                     sel[3 * i + nelt1] = nelt + j1++;
4459                   if (3 * i + nelt2 < nelt)
4460                     sel[3 * i + nelt2] = 0;
4461                 }
4462               if (!can_vec_perm_p (mode, false, sel))
4463                 {
4464                   if (dump_enabled_p ())
4465                     dump_printf (MSG_MISSED_OPTIMIZATION,
4466                                  "permutaion op not supported by target.\n");
4467                   return false;
4468                 }
4469
4470               for (i = 0; i < nelt; i++)
4471                 {
4472                   if (3 * i + nelt0 < nelt)
4473                     sel[3 * i + nelt0] = 3 * i + nelt0;
4474                   if (3 * i + nelt1 < nelt)
4475                     sel[3 * i + nelt1] = 3 * i + nelt1;
4476                   if (3 * i + nelt2 < nelt)
4477                     sel[3 * i + nelt2] = nelt + j2++;
4478                 }
4479               if (!can_vec_perm_p (mode, false, sel))
4480                 {
4481                   if (dump_enabled_p ())
4482                     dump_printf (MSG_MISSED_OPTIMIZATION,
4483                                  "permutaion op not supported by target.\n");
4484                   return false;
4485                 }
4486             }
4487           return true;
4488         }
4489       else
4490         {
4491           /* If length is not equal to 3 then only power of 2 is supported.  */
4492           gcc_assert (exact_log2 (count) != -1);
4493
4494           for (i = 0; i < nelt / 2; i++)
4495             {
4496               sel[i * 2] = i;
4497               sel[i * 2 + 1] = i + nelt;
4498             }
4499             if (can_vec_perm_p (mode, false, sel))
4500               {
4501                 for (i = 0; i < nelt; i++)
4502                   sel[i] += nelt / 2;
4503                 if (can_vec_perm_p (mode, false, sel))
4504                   return true;
4505               }
4506         }
4507     }
4508
4509   if (dump_enabled_p ())
4510     dump_printf (MSG_MISSED_OPTIMIZATION,
4511                  "permutaion op not supported by target.\n");
4512   return false;
4513 }
4514
4515
4516 /* Return TRUE if vec_store_lanes is available for COUNT vectors of
4517    type VECTYPE.  */
4518
4519 bool
4520 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
4521 {
4522   return vect_lanes_optab_supported_p ("vec_store_lanes",
4523                                        vec_store_lanes_optab,
4524                                        vectype, count);
4525 }
4526
4527
4528 /* Function vect_permute_store_chain.
4529
4530    Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
4531    a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
4532    the data correctly for the stores.  Return the final references for stores
4533    in RESULT_CHAIN.
4534
4535    E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
4536    The input is 4 vectors each containing 8 elements.  We assign a number to
4537    each element, the input sequence is:
4538
4539    1st vec:   0  1  2  3  4  5  6  7
4540    2nd vec:   8  9 10 11 12 13 14 15
4541    3rd vec:  16 17 18 19 20 21 22 23
4542    4th vec:  24 25 26 27 28 29 30 31
4543
4544    The output sequence should be:
4545
4546    1st vec:  0  8 16 24  1  9 17 25
4547    2nd vec:  2 10 18 26  3 11 19 27
4548    3rd vec:  4 12 20 28  5 13 21 30
4549    4th vec:  6 14 22 30  7 15 23 31
4550
4551    i.e., we interleave the contents of the four vectors in their order.
4552
4553    We use interleave_high/low instructions to create such output.  The input of
4554    each interleave_high/low operation is two vectors:
4555    1st vec    2nd vec
4556    0 1 2 3    4 5 6 7
4557    the even elements of the result vector are obtained left-to-right from the
4558    high/low elements of the first vector.  The odd elements of the result are
4559    obtained left-to-right from the high/low elements of the second vector.
4560    The output of interleave_high will be:   0 4 1 5
4561    and of interleave_low:                   2 6 3 7
4562
4563
4564    The permutation is done in log LENGTH stages.  In each stage interleave_high
4565    and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
4566    where the first argument is taken from the first half of DR_CHAIN and the
4567    second argument from it's second half.
4568    In our example,
4569
4570    I1: interleave_high (1st vec, 3rd vec)
4571    I2: interleave_low (1st vec, 3rd vec)
4572    I3: interleave_high (2nd vec, 4th vec)
4573    I4: interleave_low (2nd vec, 4th vec)
4574
4575    The output for the first stage is:
4576
4577    I1:  0 16  1 17  2 18  3 19
4578    I2:  4 20  5 21  6 22  7 23
4579    I3:  8 24  9 25 10 26 11 27
4580    I4: 12 28 13 29 14 30 15 31
4581
4582    The output of the second stage, i.e. the final result is:
4583
4584    I1:  0  8 16 24  1  9 17 25
4585    I2:  2 10 18 26  3 11 19 27
4586    I3:  4 12 20 28  5 13 21 30
4587    I4:  6 14 22 30  7 15 23 31.  */
4588
4589 void
4590 vect_permute_store_chain (vec<tree> dr_chain,
4591                           unsigned int length,
4592                           gimple stmt,
4593                           gimple_stmt_iterator *gsi,
4594                           vec<tree> *result_chain)
4595 {
4596   tree vect1, vect2, high, low;
4597   gimple perm_stmt;
4598   tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
4599   tree perm_mask_low, perm_mask_high;
4600   tree data_ref;
4601   tree perm3_mask_low, perm3_mask_high;
4602   unsigned int i, n, log_length = exact_log2 (length);
4603   unsigned int j, nelt = TYPE_VECTOR_SUBPARTS (vectype);
4604   unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4605
4606   result_chain->quick_grow (length);
4607   memcpy (result_chain->address (), dr_chain.address (),
4608           length * sizeof (tree));
4609
4610   if (length == 3)
4611     {
4612       unsigned int j0 = 0, j1 = 0, j2 = 0;
4613
4614       for (j = 0; j < 3; j++)
4615         {
4616           int nelt0 = ((3 - j) * nelt) % 3;
4617           int nelt1 = ((3 - j) * nelt + 1) % 3;
4618           int nelt2 = ((3 - j) * nelt + 2) % 3;
4619
4620           for (i = 0; i < nelt; i++)
4621             {
4622               if (3 * i + nelt0 < nelt)
4623                 sel[3 * i + nelt0] = j0++;
4624               if (3 * i + nelt1 < nelt)
4625                 sel[3 * i + nelt1] = nelt + j1++;
4626               if (3 * i + nelt2 < nelt)
4627                 sel[3 * i + nelt2] = 0;
4628             }
4629           perm3_mask_low = vect_gen_perm_mask (vectype, sel);
4630           gcc_assert (perm3_mask_low != NULL);
4631
4632           for (i = 0; i < nelt; i++)
4633             {
4634               if (3 * i + nelt0 < nelt)
4635                 sel[3 * i + nelt0] = 3 * i + nelt0;
4636               if (3 * i + nelt1 < nelt)
4637                 sel[3 * i + nelt1] = 3 * i + nelt1;
4638               if (3 * i + nelt2 < nelt)
4639                 sel[3 * i + nelt2] = nelt + j2++;
4640             }
4641           perm3_mask_high = vect_gen_perm_mask (vectype, sel);
4642           gcc_assert (perm3_mask_high != NULL);
4643
4644           vect1 = dr_chain[0];
4645           vect2 = dr_chain[1];
4646
4647           /* Create interleaving stmt:
4648              low = VEC_PERM_EXPR <vect1, vect2,
4649                                   {j, nelt, *, j + 1, nelt + j + 1, *,
4650                                    j + 2, nelt + j + 2, *, ...}>  */
4651           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
4652           perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
4653                                                     vect1, vect2,
4654                                                     perm3_mask_low);
4655           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4656
4657           vect1 = data_ref;
4658           vect2 = dr_chain[2];
4659           /* Create interleaving stmt:
4660              low = VEC_PERM_EXPR <vect1, vect2,
4661                                   {0, 1, nelt + j, 3, 4, nelt + j + 1,
4662                                    6, 7, nelt + j + 2, ...}>  */
4663           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
4664           perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
4665                                                     vect1, vect2,
4666                                                     perm3_mask_high);
4667           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4668           (*result_chain)[j] = data_ref;
4669         }
4670     }
4671   else
4672     {
4673       /* If length is not equal to 3 then only power of 2 is supported.  */
4674       gcc_assert (exact_log2 (length) != -1);
4675
4676       for (i = 0, n = nelt / 2; i < n; i++)
4677         {
4678           sel[i * 2] = i;
4679           sel[i * 2 + 1] = i + nelt;
4680         }
4681         perm_mask_high = vect_gen_perm_mask (vectype, sel);
4682         gcc_assert (perm_mask_high != NULL);
4683
4684         for (i = 0; i < nelt; i++)
4685           sel[i] += nelt / 2;
4686         perm_mask_low = vect_gen_perm_mask (vectype, sel);
4687         gcc_assert (perm_mask_low != NULL);
4688
4689         for (i = 0, n = log_length; i < n; i++)
4690           {
4691             for (j = 0; j < length/2; j++)
4692               {
4693                 vect1 = dr_chain[j];
4694                 vect2 = dr_chain[j+length/2];
4695
4696                 /* Create interleaving stmt:
4697                    high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
4698                                                         ...}>  */
4699                 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
4700                 perm_stmt
4701                   = gimple_build_assign_with_ops (VEC_PERM_EXPR, high,
4702                                                   vect1, vect2, perm_mask_high);
4703                 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4704                 (*result_chain)[2*j] = high;
4705
4706                 /* Create interleaving stmt:
4707                    low = VEC_PERM_EXPR <vect1, vect2,
4708                                         {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
4709                                          ...}>  */
4710                 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
4711                 perm_stmt
4712                   = gimple_build_assign_with_ops (VEC_PERM_EXPR, low,
4713                                                   vect1, vect2, perm_mask_low);
4714                 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4715                 (*result_chain)[2*j+1] = low;
4716               }
4717             memcpy (dr_chain.address (), result_chain->address (),
4718                     length * sizeof (tree));
4719           }
4720     }
4721 }
4722
4723 /* Function vect_setup_realignment
4724
4725    This function is called when vectorizing an unaligned load using
4726    the dr_explicit_realign[_optimized] scheme.
4727    This function generates the following code at the loop prolog:
4728
4729       p = initial_addr;
4730    x  msq_init = *(floor(p));   # prolog load
4731       realignment_token = call target_builtin;
4732     loop:
4733    x  msq = phi (msq_init, ---)
4734
4735    The stmts marked with x are generated only for the case of
4736    dr_explicit_realign_optimized.
4737
4738    The code above sets up a new (vector) pointer, pointing to the first
4739    location accessed by STMT, and a "floor-aligned" load using that pointer.
4740    It also generates code to compute the "realignment-token" (if the relevant
4741    target hook was defined), and creates a phi-node at the loop-header bb
4742    whose arguments are the result of the prolog-load (created by this
4743    function) and the result of a load that takes place in the loop (to be
4744    created by the caller to this function).
4745
4746    For the case of dr_explicit_realign_optimized:
4747    The caller to this function uses the phi-result (msq) to create the
4748    realignment code inside the loop, and sets up the missing phi argument,
4749    as follows:
4750     loop:
4751       msq = phi (msq_init, lsq)
4752       lsq = *(floor(p'));        # load in loop
4753       result = realign_load (msq, lsq, realignment_token);
4754
4755    For the case of dr_explicit_realign:
4756     loop:
4757       msq = *(floor(p));        # load in loop
4758       p' = p + (VS-1);
4759       lsq = *(floor(p'));       # load in loop
4760       result = realign_load (msq, lsq, realignment_token);
4761
4762    Input:
4763    STMT - (scalar) load stmt to be vectorized. This load accesses
4764           a memory location that may be unaligned.
4765    BSI - place where new code is to be inserted.
4766    ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
4767                               is used.
4768
4769    Output:
4770    REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
4771                        target hook, if defined.
4772    Return value - the result of the loop-header phi node.  */
4773
4774 tree
4775 vect_setup_realignment (gimple stmt, gimple_stmt_iterator *gsi,
4776                         tree *realignment_token,
4777                         enum dr_alignment_support alignment_support_scheme,
4778                         tree init_addr,
4779                         struct loop **at_loop)
4780 {
4781   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4782   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4783   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4784   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4785   struct loop *loop = NULL;
4786   edge pe = NULL;
4787   tree scalar_dest = gimple_assign_lhs (stmt);
4788   tree vec_dest;
4789   gimple inc;
4790   tree ptr;
4791   tree data_ref;
4792   gimple new_stmt;
4793   basic_block new_bb;
4794   tree msq_init = NULL_TREE;
4795   tree new_temp;
4796   gimple phi_stmt;
4797   tree msq = NULL_TREE;
4798   gimple_seq stmts = NULL;
4799   bool inv_p;
4800   bool compute_in_loop = false;
4801   bool nested_in_vect_loop = false;
4802   struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
4803   struct loop *loop_for_initial_load = NULL;
4804
4805   if (loop_vinfo)
4806     {
4807       loop = LOOP_VINFO_LOOP (loop_vinfo);
4808       nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4809     }
4810
4811   gcc_assert (alignment_support_scheme == dr_explicit_realign
4812               || alignment_support_scheme == dr_explicit_realign_optimized);
4813
4814   /* We need to generate three things:
4815      1. the misalignment computation
4816      2. the extra vector load (for the optimized realignment scheme).
4817      3. the phi node for the two vectors from which the realignment is
4818       done (for the optimized realignment scheme).  */
4819
4820   /* 1. Determine where to generate the misalignment computation.
4821
4822      If INIT_ADDR is NULL_TREE, this indicates that the misalignment
4823      calculation will be generated by this function, outside the loop (in the
4824      preheader).  Otherwise, INIT_ADDR had already been computed for us by the
4825      caller, inside the loop.
4826
4827      Background: If the misalignment remains fixed throughout the iterations of
4828      the loop, then both realignment schemes are applicable, and also the
4829      misalignment computation can be done outside LOOP.  This is because we are
4830      vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
4831      are a multiple of VS (the Vector Size), and therefore the misalignment in
4832      different vectorized LOOP iterations is always the same.
4833      The problem arises only if the memory access is in an inner-loop nested
4834      inside LOOP, which is now being vectorized using outer-loop vectorization.
4835      This is the only case when the misalignment of the memory access may not
4836      remain fixed throughout the iterations of the inner-loop (as explained in
4837      detail in vect_supportable_dr_alignment).  In this case, not only is the
4838      optimized realignment scheme not applicable, but also the misalignment
4839      computation (and generation of the realignment token that is passed to
4840      REALIGN_LOAD) have to be done inside the loop.
4841
4842      In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
4843      or not, which in turn determines if the misalignment is computed inside
4844      the inner-loop, or outside LOOP.  */
4845
4846   if (init_addr != NULL_TREE || !loop_vinfo)
4847     {
4848       compute_in_loop = true;
4849       gcc_assert (alignment_support_scheme == dr_explicit_realign);
4850     }
4851
4852
4853   /* 2. Determine where to generate the extra vector load.
4854
4855      For the optimized realignment scheme, instead of generating two vector
4856      loads in each iteration, we generate a single extra vector load in the
4857      preheader of the loop, and in each iteration reuse the result of the
4858      vector load from the previous iteration.  In case the memory access is in
4859      an inner-loop nested inside LOOP, which is now being vectorized using
4860      outer-loop vectorization, we need to determine whether this initial vector
4861      load should be generated at the preheader of the inner-loop, or can be
4862      generated at the preheader of LOOP.  If the memory access has no evolution
4863      in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
4864      to be generated inside LOOP (in the preheader of the inner-loop).  */
4865
4866   if (nested_in_vect_loop)
4867     {
4868       tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
4869       bool invariant_in_outerloop =
4870             (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
4871       loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
4872     }
4873   else
4874     loop_for_initial_load = loop;
4875   if (at_loop)
4876     *at_loop = loop_for_initial_load;
4877
4878   if (loop_for_initial_load)
4879     pe = loop_preheader_edge (loop_for_initial_load);
4880
4881   /* 3. For the case of the optimized realignment, create the first vector
4882       load at the loop preheader.  */
4883
4884   if (alignment_support_scheme == dr_explicit_realign_optimized)
4885     {
4886       /* Create msq_init = *(floor(p1)) in the loop preheader  */
4887
4888       gcc_assert (!compute_in_loop);
4889       vec_dest = vect_create_destination_var (scalar_dest, vectype);
4890       ptr = vect_create_data_ref_ptr (stmt, vectype, loop_for_initial_load,
4891                                       NULL_TREE, &init_addr, NULL, &inc,
4892                                       true, &inv_p);
4893       new_temp = copy_ssa_name (ptr, NULL);
4894       new_stmt = gimple_build_assign_with_ops
4895                    (BIT_AND_EXPR, new_temp, ptr,
4896                     build_int_cst (TREE_TYPE (ptr),
4897                                    -(HOST_WIDE_INT)TYPE_ALIGN_UNIT (vectype)));
4898       new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4899       gcc_assert (!new_bb);
4900       data_ref
4901         = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
4902                   build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
4903       new_stmt = gimple_build_assign (vec_dest, data_ref);
4904       new_temp = make_ssa_name (vec_dest, new_stmt);
4905       gimple_assign_set_lhs (new_stmt, new_temp);
4906       if (pe)
4907         {
4908           new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4909           gcc_assert (!new_bb);
4910         }
4911       else
4912          gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
4913
4914       msq_init = gimple_assign_lhs (new_stmt);
4915     }
4916
4917   /* 4. Create realignment token using a target builtin, if available.
4918       It is done either inside the containing loop, or before LOOP (as
4919       determined above).  */
4920
4921   if (targetm.vectorize.builtin_mask_for_load)
4922     {
4923       tree builtin_decl;
4924
4925       /* Compute INIT_ADDR - the initial addressed accessed by this memref.  */
4926       if (!init_addr)
4927         {
4928           /* Generate the INIT_ADDR computation outside LOOP.  */
4929           init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
4930                                                         NULL_TREE, loop);
4931           if (loop)
4932             {
4933               pe = loop_preheader_edge (loop);
4934               new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
4935               gcc_assert (!new_bb);
4936             }
4937           else
4938              gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
4939         }
4940
4941       builtin_decl = targetm.vectorize.builtin_mask_for_load ();
4942       new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
4943       vec_dest =
4944         vect_create_destination_var (scalar_dest,
4945                                      gimple_call_return_type (new_stmt));
4946       new_temp = make_ssa_name (vec_dest, new_stmt);
4947       gimple_call_set_lhs (new_stmt, new_temp);
4948
4949       if (compute_in_loop)
4950         gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
4951       else
4952         {
4953           /* Generate the misalignment computation outside LOOP.  */
4954           pe = loop_preheader_edge (loop);
4955           new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4956           gcc_assert (!new_bb);
4957         }
4958
4959       *realignment_token = gimple_call_lhs (new_stmt);
4960
4961       /* The result of the CALL_EXPR to this builtin is determined from
4962          the value of the parameter and no global variables are touched
4963          which makes the builtin a "const" function.  Requiring the
4964          builtin to have the "const" attribute makes it unnecessary
4965          to call mark_call_clobbered.  */
4966       gcc_assert (TREE_READONLY (builtin_decl));
4967     }
4968
4969   if (alignment_support_scheme == dr_explicit_realign)
4970     return msq;
4971
4972   gcc_assert (!compute_in_loop);
4973   gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
4974
4975
4976   /* 5. Create msq = phi <msq_init, lsq> in loop  */
4977
4978   pe = loop_preheader_edge (containing_loop);
4979   vec_dest = vect_create_destination_var (scalar_dest, vectype);
4980   msq = make_ssa_name (vec_dest, NULL);
4981   phi_stmt = create_phi_node (msq, containing_loop->header);
4982   add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
4983
4984   return msq;
4985 }
4986
4987
4988 /* Function vect_grouped_load_supported.
4989
4990    Returns TRUE if even and odd permutations are supported,
4991    and FALSE otherwise.  */
4992
4993 bool
4994 vect_grouped_load_supported (tree vectype, unsigned HOST_WIDE_INT count)
4995 {
4996   machine_mode mode = TYPE_MODE (vectype);
4997
4998   /* vect_permute_load_chain requires the group size to be equal to 3 or
4999      be a power of two.  */
5000   if (count != 3 && exact_log2 (count) == -1)
5001     {
5002       if (dump_enabled_p ())
5003         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5004                          "the size of the group of accesses"
5005                          " is not a power of 2 or not equal to 3\n");
5006       return false;
5007     }
5008
5009   /* Check that the permutation is supported.  */
5010   if (VECTOR_MODE_P (mode))
5011     {
5012       unsigned int i, j, nelt = GET_MODE_NUNITS (mode);
5013       unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5014
5015       if (count == 3)
5016         {
5017           unsigned int k;
5018           for (k = 0; k < 3; k++)
5019             {
5020               for (i = 0; i < nelt; i++)
5021                 if (3 * i + k < 2 * nelt)
5022                   sel[i] = 3 * i + k;
5023                 else
5024                   sel[i] = 0;
5025               if (!can_vec_perm_p (mode, false, sel))
5026                 {
5027                   if (dump_enabled_p ())
5028                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5029                                      "shuffle of 3 loads is not supported by"
5030                                      " target\n");
5031                     return false;
5032                 }
5033               for (i = 0, j = 0; i < nelt; i++)
5034                 if (3 * i + k < 2 * nelt)
5035                   sel[i] = i;
5036                 else
5037                   sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5038               if (!can_vec_perm_p (mode, false, sel))
5039                 {
5040                   if (dump_enabled_p ())
5041                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5042                                      "shuffle of 3 loads is not supported by"
5043                                      " target\n");
5044                   return false;
5045                 }
5046             }
5047           return true;
5048         }
5049       else
5050         {
5051           /* If length is not equal to 3 then only power of 2 is supported.  */
5052           gcc_assert (exact_log2 (count) != -1);
5053           for (i = 0; i < nelt; i++)
5054             sel[i] = i * 2;
5055           if (can_vec_perm_p (mode, false, sel))
5056             {
5057               for (i = 0; i < nelt; i++)
5058                 sel[i] = i * 2 + 1;
5059               if (can_vec_perm_p (mode, false, sel))
5060                 return true;
5061             }
5062         }
5063     }
5064
5065   if (dump_enabled_p ())
5066     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5067                      "extract even/odd not supported by target\n");
5068   return false;
5069 }
5070
5071 /* Return TRUE if vec_load_lanes is available for COUNT vectors of
5072    type VECTYPE.  */
5073
5074 bool
5075 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
5076 {
5077   return vect_lanes_optab_supported_p ("vec_load_lanes",
5078                                        vec_load_lanes_optab,
5079                                        vectype, count);
5080 }
5081
5082 /* Function vect_permute_load_chain.
5083
5084    Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5085    a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5086    the input data correctly.  Return the final references for loads in
5087    RESULT_CHAIN.
5088
5089    E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5090    The input is 4 vectors each containing 8 elements. We assign a number to each
5091    element, the input sequence is:
5092
5093    1st vec:   0  1  2  3  4  5  6  7
5094    2nd vec:   8  9 10 11 12 13 14 15
5095    3rd vec:  16 17 18 19 20 21 22 23
5096    4th vec:  24 25 26 27 28 29 30 31
5097
5098    The output sequence should be:
5099
5100    1st vec:  0 4  8 12 16 20 24 28
5101    2nd vec:  1 5  9 13 17 21 25 29
5102    3rd vec:  2 6 10 14 18 22 26 30
5103    4th vec:  3 7 11 15 19 23 27 31
5104
5105    i.e., the first output vector should contain the first elements of each
5106    interleaving group, etc.
5107
5108    We use extract_even/odd instructions to create such output.  The input of
5109    each extract_even/odd operation is two vectors
5110    1st vec    2nd vec
5111    0 1 2 3    4 5 6 7
5112
5113    and the output is the vector of extracted even/odd elements.  The output of
5114    extract_even will be:   0 2 4 6
5115    and of extract_odd:     1 3 5 7
5116
5117
5118    The permutation is done in log LENGTH stages.  In each stage extract_even
5119    and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5120    their order.  In our example,
5121
5122    E1: extract_even (1st vec, 2nd vec)
5123    E2: extract_odd (1st vec, 2nd vec)
5124    E3: extract_even (3rd vec, 4th vec)
5125    E4: extract_odd (3rd vec, 4th vec)
5126
5127    The output for the first stage will be:
5128
5129    E1:  0  2  4  6  8 10 12 14
5130    E2:  1  3  5  7  9 11 13 15
5131    E3: 16 18 20 22 24 26 28 30
5132    E4: 17 19 21 23 25 27 29 31
5133
5134    In order to proceed and create the correct sequence for the next stage (or
5135    for the correct output, if the second stage is the last one, as in our
5136    example), we first put the output of extract_even operation and then the
5137    output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5138    The input for the second stage is:
5139
5140    1st vec (E1):  0  2  4  6  8 10 12 14
5141    2nd vec (E3): 16 18 20 22 24 26 28 30
5142    3rd vec (E2):  1  3  5  7  9 11 13 15
5143    4th vec (E4): 17 19 21 23 25 27 29 31
5144
5145    The output of the second stage:
5146
5147    E1: 0 4  8 12 16 20 24 28
5148    E2: 2 6 10 14 18 22 26 30
5149    E3: 1 5  9 13 17 21 25 29
5150    E4: 3 7 11 15 19 23 27 31
5151
5152    And RESULT_CHAIN after reordering:
5153
5154    1st vec (E1):  0 4  8 12 16 20 24 28
5155    2nd vec (E3):  1 5  9 13 17 21 25 29
5156    3rd vec (E2):  2 6 10 14 18 22 26 30
5157    4th vec (E4):  3 7 11 15 19 23 27 31.  */
5158
5159 static void
5160 vect_permute_load_chain (vec<tree> dr_chain,
5161                          unsigned int length,
5162                          gimple stmt,
5163                          gimple_stmt_iterator *gsi,
5164                          vec<tree> *result_chain)
5165 {
5166   tree data_ref, first_vect, second_vect;
5167   tree perm_mask_even, perm_mask_odd;
5168   tree perm3_mask_low, perm3_mask_high;
5169   gimple perm_stmt;
5170   tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5171   unsigned int i, j, log_length = exact_log2 (length);
5172   unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
5173   unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5174
5175   result_chain->quick_grow (length);
5176   memcpy (result_chain->address (), dr_chain.address (),
5177           length * sizeof (tree));
5178
5179   if (length == 3)
5180     {
5181       unsigned int k;
5182
5183       for (k = 0; k < 3; k++)
5184         {
5185           for (i = 0; i < nelt; i++)
5186             if (3 * i + k < 2 * nelt)
5187               sel[i] = 3 * i + k;
5188             else
5189               sel[i] = 0;
5190           perm3_mask_low = vect_gen_perm_mask (vectype, sel);
5191           gcc_assert (perm3_mask_low != NULL);
5192
5193           for (i = 0, j = 0; i < nelt; i++)
5194             if (3 * i + k < 2 * nelt)
5195               sel[i] = i;
5196             else
5197               sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5198
5199           perm3_mask_high = vect_gen_perm_mask (vectype, sel);
5200           gcc_assert (perm3_mask_high != NULL);
5201
5202           first_vect = dr_chain[0];
5203           second_vect = dr_chain[1];
5204
5205           /* Create interleaving stmt (low part of):
5206              low = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
5207                                                              ...}>  */
5208           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5209           perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
5210                                                     first_vect, second_vect,
5211                                                     perm3_mask_low);
5212           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5213
5214           /* Create interleaving stmt (high part of):
5215              high = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
5216                                                               ...}>  */
5217           first_vect = data_ref;
5218           second_vect = dr_chain[2];
5219           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5220           perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
5221                                                     first_vect, second_vect,
5222                                                     perm3_mask_high);
5223           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5224           (*result_chain)[k] = data_ref;
5225         }
5226     }
5227   else
5228     {
5229       /* If length is not equal to 3 then only power of 2 is supported.  */
5230       gcc_assert (exact_log2 (length) != -1);
5231
5232       for (i = 0; i < nelt; ++i)
5233         sel[i] = i * 2;
5234       perm_mask_even = vect_gen_perm_mask (vectype, sel);
5235       gcc_assert (perm_mask_even != NULL);
5236
5237       for (i = 0; i < nelt; ++i)
5238         sel[i] = i * 2 + 1;
5239       perm_mask_odd = vect_gen_perm_mask (vectype, sel);
5240       gcc_assert (perm_mask_odd != NULL);
5241
5242       for (i = 0; i < log_length; i++)
5243         {
5244           for (j = 0; j < length; j += 2)
5245             {
5246               first_vect = dr_chain[j];
5247               second_vect = dr_chain[j+1];
5248
5249               /* data_ref = permute_even (first_data_ref, second_data_ref);  */
5250               data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
5251               perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
5252                                                         first_vect, second_vect,
5253                                                         perm_mask_even);
5254               vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5255               (*result_chain)[j/2] = data_ref;
5256
5257               /* data_ref = permute_odd (first_data_ref, second_data_ref);  */
5258               data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
5259               perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
5260                                                         first_vect, second_vect,
5261                                                         perm_mask_odd);
5262               vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5263               (*result_chain)[j/2+length/2] = data_ref;
5264             }
5265           memcpy (dr_chain.address (), result_chain->address (),
5266                   length * sizeof (tree));
5267         }
5268     }
5269 }
5270
5271 /* Function vect_shift_permute_load_chain.
5272
5273    Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
5274    sequence of stmts to reorder the input data accordingly.
5275    Return the final references for loads in RESULT_CHAIN.
5276    Return true if succeeded, false otherwise.
5277
5278    E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
5279    The input is 3 vectors each containing 8 elements.  We assign a
5280    number to each element, the input sequence is:
5281
5282    1st vec:   0  1  2  3  4  5  6  7
5283    2nd vec:   8  9 10 11 12 13 14 15
5284    3rd vec:  16 17 18 19 20 21 22 23
5285
5286    The output sequence should be:
5287
5288    1st vec:  0 3 6  9 12 15 18 21
5289    2nd vec:  1 4 7 10 13 16 19 22
5290    3rd vec:  2 5 8 11 14 17 20 23
5291
5292    We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
5293
5294    First we shuffle all 3 vectors to get correct elements order:
5295
5296    1st vec:  ( 0  3  6) ( 1  4  7) ( 2  5)
5297    2nd vec:  ( 8 11 14) ( 9 12 15) (10 13)
5298    3rd vec:  (16 19 22) (17 20 23) (18 21)
5299
5300    Next we unite and shift vector 3 times:
5301
5302    1st step:
5303      shift right by 6 the concatenation of:
5304      "1st vec" and  "2nd vec"
5305        ( 0  3  6) ( 1  4  7) |( 2  5) _ ( 8 11 14) ( 9 12 15)| (10 13)
5306      "2nd vec" and  "3rd vec"
5307        ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
5308      "3rd vec" and  "1st vec"
5309        (16 19 22) (17 20 23) |(18 21) _ ( 0  3  6) ( 1  4  7)| ( 2  5)
5310                              | New vectors                   |
5311
5312      So that now new vectors are:
5313
5314      1st vec:  ( 2  5) ( 8 11 14) ( 9 12 15)
5315      2nd vec:  (10 13) (16 19 22) (17 20 23)
5316      3rd vec:  (18 21) ( 0  3  6) ( 1  4  7)
5317
5318    2nd step:
5319      shift right by 5 the concatenation of:
5320      "1st vec" and  "3rd vec"
5321        ( 2  5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0  3  6)| ( 1  4  7)
5322      "2nd vec" and  "1st vec"
5323        (10 13) (16 19 22) |(17 20 23) _ ( 2  5) ( 8 11 14)| ( 9 12 15)
5324      "3rd vec" and  "2nd vec"
5325        (18 21) ( 0  3  6) |( 1  4  7) _ (10 13) (16 19 22)| (17 20 23)
5326                           | New vectors                   |
5327
5328      So that now new vectors are:
5329
5330      1st vec:  ( 9 12 15) (18 21) ( 0  3  6)
5331      2nd vec:  (17 20 23) ( 2  5) ( 8 11 14)
5332      3rd vec:  ( 1  4  7) (10 13) (16 19 22) READY
5333
5334    3rd step:
5335      shift right by 5 the concatenation of:
5336      "1st vec" and  "1st vec"
5337        ( 9 12 15) (18 21) |( 0  3  6) _ ( 9 12 15) (18 21)| ( 0  3  6)
5338      shift right by 3 the concatenation of:
5339      "2nd vec" and  "2nd vec"
5340                (17 20 23) |( 2  5) ( 8 11 14) _ (17 20 23)| ( 2  5) ( 8 11 14)
5341                           | New vectors                   |
5342
5343      So that now all vectors are READY:
5344      1st vec:  ( 0  3  6) ( 9 12 15) (18 21)
5345      2nd vec:  ( 2  5) ( 8 11 14) (17 20 23)
5346      3rd vec:  ( 1  4  7) (10 13) (16 19 22)
5347
5348    This algorithm is faster than one in vect_permute_load_chain if:
5349      1.  "shift of a concatenation" is faster than general permutation.
5350          This is usually so.
5351      2.  The TARGET machine can't execute vector instructions in parallel.
5352          This is because each step of the algorithm depends on previous.
5353          The algorithm in vect_permute_load_chain is much more parallel.
5354
5355    The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
5356 */
5357
5358 static bool
5359 vect_shift_permute_load_chain (vec<tree> dr_chain,
5360                                unsigned int length,
5361                                gimple stmt,
5362                                gimple_stmt_iterator *gsi,
5363                                vec<tree> *result_chain)
5364 {
5365   tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
5366   tree perm2_mask1, perm2_mask2, perm3_mask;
5367   tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
5368   gimple perm_stmt;
5369
5370   tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5371   unsigned int i;
5372   unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
5373   unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5374   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5375   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5376
5377   result_chain->quick_grow (length);
5378   memcpy (result_chain->address (), dr_chain.address (),
5379           length * sizeof (tree));
5380
5381   if (length == 2 && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 4)
5382     {
5383       for (i = 0; i < nelt / 2; ++i)
5384         sel[i] = i * 2;
5385       for (i = 0; i < nelt / 2; ++i)
5386         sel[nelt / 2 + i] = i * 2 + 1;
5387       if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5388         {
5389           if (dump_enabled_p ())
5390             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5391                              "shuffle of 2 fields structure is not \
5392                               supported by target\n");
5393           return false;
5394         }
5395       perm2_mask1 = vect_gen_perm_mask (vectype, sel);
5396       gcc_assert (perm2_mask1 != NULL);
5397
5398       for (i = 0; i < nelt / 2; ++i)
5399         sel[i] = i * 2 + 1;
5400       for (i = 0; i < nelt / 2; ++i)
5401         sel[nelt / 2 + i] = i * 2;
5402       if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5403         {
5404           if (dump_enabled_p ())
5405             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5406                              "shuffle of 2 fields structure is not \
5407                               supported by target\n");
5408           return false;
5409         }
5410       perm2_mask2 = vect_gen_perm_mask (vectype, sel);
5411       gcc_assert (perm2_mask2 != NULL);
5412
5413       /* Generating permutation constant to shift all elements.
5414          For vector length 8 it is {4 5 6 7 8 9 10 11}.  */
5415       for (i = 0; i < nelt; i++)
5416         sel[i] = nelt / 2 + i;
5417       if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5418         {
5419           if (dump_enabled_p ())
5420             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5421                              "shift permutation is not supported by target\n");
5422           return false;
5423         }
5424       shift1_mask = vect_gen_perm_mask (vectype, sel);
5425       gcc_assert (shift1_mask != NULL);
5426
5427       /* Generating permutation constant to select vector from 2.
5428          For vector length 8 it is {0 1 2 3 12 13 14 15}.  */
5429       for (i = 0; i < nelt / 2; i++)
5430         sel[i] = i;
5431       for (i = nelt / 2; i < nelt; i++)
5432         sel[i] = nelt + i;
5433       if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5434         {
5435           if (dump_enabled_p ())
5436             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5437                              "select is not supported by target\n");
5438           return false;
5439         }
5440       select_mask = vect_gen_perm_mask (vectype, sel);
5441       gcc_assert (select_mask != NULL);
5442
5443       first_vect = dr_chain[0];
5444       second_vect = dr_chain[1];
5445
5446       data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
5447       perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
5448                                                 first_vect, first_vect,
5449                                                 perm2_mask1);
5450       vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5451       vect[0] = data_ref;
5452
5453       data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
5454       perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
5455                                                 second_vect, second_vect,
5456                                                 perm2_mask2);
5457       vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5458       vect[1] = data_ref;
5459
5460       data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
5461       perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
5462                                                 vect[0], vect[1],
5463                                                 shift1_mask);
5464       vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5465       (*result_chain)[1] = data_ref;
5466
5467       data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
5468       perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
5469                                                 vect[0], vect[1],
5470                                                 select_mask);
5471       vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5472       (*result_chain)[0] = data_ref;
5473
5474       return true;
5475     }
5476   if (length == 3 && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 2)
5477     {
5478       unsigned int k = 0, l = 0;
5479
5480       /* Generating permutation constant to get all elements in rigth order.
5481          For vector length 8 it is {0 3 6 1 4 7 2 5}.  */
5482       for (i = 0; i < nelt; i++)
5483         {
5484           if (3 * k + (l % 3) >= nelt)
5485             {
5486               k = 0;
5487               l += (3 - (nelt % 3));
5488             }
5489           sel[i] = 3 * k + (l % 3);
5490           k++;
5491         }
5492       if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5493         {
5494           if (dump_enabled_p ())
5495             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5496                              "shuffle of 3 fields structure is not \
5497                               supported by target\n");
5498           return false;
5499         }
5500       perm3_mask = vect_gen_perm_mask (vectype, sel);
5501       gcc_assert (perm3_mask != NULL);
5502
5503       /* Generating permutation constant to shift all elements.
5504          For vector length 8 it is {6 7 8 9 10 11 12 13}.  */
5505       for (i = 0; i < nelt; i++)
5506         sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
5507       if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5508         {
5509           if (dump_enabled_p ())
5510             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5511                              "shift permutation is not supported by target\n");
5512           return false;
5513         }
5514       shift1_mask = vect_gen_perm_mask (vectype, sel);
5515       gcc_assert (shift1_mask != NULL);
5516
5517       /* Generating permutation constant to shift all elements.
5518          For vector length 8 it is {5 6 7 8 9 10 11 12}.  */
5519       for (i = 0; i < nelt; i++)
5520         sel[i] = 2 * (nelt / 3) + 1 + i;
5521       if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5522         {
5523           if (dump_enabled_p ())
5524             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5525                              "shift permutation is not supported by target\n");
5526           return false;
5527         }
5528       shift2_mask = vect_gen_perm_mask (vectype, sel);
5529       gcc_assert (shift2_mask != NULL);
5530
5531       /* Generating permutation constant to shift all elements.
5532          For vector length 8 it is {3 4 5 6 7 8 9 10}.  */
5533       for (i = 0; i < nelt; i++)
5534         sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
5535       if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5536         {
5537           if (dump_enabled_p ())
5538             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5539                              "shift permutation is not supported by target\n");
5540           return false;
5541         }
5542       shift3_mask = vect_gen_perm_mask (vectype, sel);
5543       gcc_assert (shift3_mask != NULL);
5544
5545       /* Generating permutation constant to shift all elements.
5546          For vector length 8 it is {5 6 7 8 9 10 11 12}.  */
5547       for (i = 0; i < nelt; i++)
5548         sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
5549       if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5550         {
5551           if (dump_enabled_p ())
5552             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5553                              "shift permutation is not supported by target\n");
5554           return false;
5555         }
5556       shift4_mask = vect_gen_perm_mask (vectype, sel);
5557       gcc_assert (shift4_mask != NULL);
5558
5559       for (k = 0; k < 3; k++)
5560         {
5561           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
5562           perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
5563                                                     dr_chain[k], dr_chain[k],
5564                                                     perm3_mask);
5565           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5566           vect[k] = data_ref;
5567         }
5568
5569       for (k = 0; k < 3; k++)
5570         {
5571           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
5572           perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
5573                                                     vect[k % 3],
5574                                                     vect[(k + 1) % 3],
5575                                                     shift1_mask);
5576           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5577           vect_shift[k] = data_ref;
5578         }
5579
5580       for (k = 0; k < 3; k++)
5581         {
5582           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
5583           perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
5584                                                     vect_shift[(4 - k) % 3],
5585                                                     vect_shift[(3 - k) % 3],
5586                                                     shift2_mask);
5587           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5588           vect[k] = data_ref;
5589         }
5590
5591       (*result_chain)[3 - (nelt % 3)] = vect[2];
5592
5593       data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
5594       perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
5595                                                 vect[0], vect[0],
5596                                                 shift3_mask);
5597       vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5598       (*result_chain)[nelt % 3] = data_ref;
5599
5600       data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
5601       perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
5602                                                 vect[1], vect[1],
5603                                                 shift4_mask);
5604       vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5605       (*result_chain)[0] = data_ref;
5606       return true;
5607     }
5608   return false;
5609 }
5610
5611 /* Function vect_transform_grouped_load.
5612
5613    Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
5614    to perform their permutation and ascribe the result vectorized statements to
5615    the scalar statements.
5616 */
5617
5618 void
5619 vect_transform_grouped_load (gimple stmt, vec<tree> dr_chain, int size,
5620                              gimple_stmt_iterator *gsi)
5621 {
5622   machine_mode mode;
5623   vec<tree> result_chain = vNULL;
5624
5625   /* DR_CHAIN contains input data-refs that are a part of the interleaving.
5626      RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
5627      vectors, that are ready for vector computation.  */
5628   result_chain.create (size);
5629
5630   /* If reassociation width for vector type is 2 or greater target machine can
5631      execute 2 or more vector instructions in parallel.  Otherwise try to
5632      get chain for loads group using vect_shift_permute_load_chain.  */
5633   mode = TYPE_MODE (STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)));
5634   if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
5635       || !vect_shift_permute_load_chain (dr_chain, size, stmt,
5636                                          gsi, &result_chain))
5637     vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain);
5638   vect_record_grouped_load_vectors (stmt, result_chain);
5639   result_chain.release ();
5640 }
5641
5642 /* RESULT_CHAIN contains the output of a group of grouped loads that were
5643    generated as part of the vectorization of STMT.  Assign the statement
5644    for each vector to the associated scalar statement.  */
5645
5646 void
5647 vect_record_grouped_load_vectors (gimple stmt, vec<tree> result_chain)
5648 {
5649   gimple first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
5650   gimple next_stmt, new_stmt;
5651   unsigned int i, gap_count;
5652   tree tmp_data_ref;
5653
5654   /* Put a permuted data-ref in the VECTORIZED_STMT field.
5655      Since we scan the chain starting from it's first node, their order
5656      corresponds the order of data-refs in RESULT_CHAIN.  */
5657   next_stmt = first_stmt;
5658   gap_count = 1;
5659   FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
5660     {
5661       if (!next_stmt)
5662         break;
5663
5664       /* Skip the gaps.  Loads created for the gaps will be removed by dead
5665        code elimination pass later.  No need to check for the first stmt in
5666        the group, since it always exists.
5667        GROUP_GAP is the number of steps in elements from the previous
5668        access (if there is no gap GROUP_GAP is 1).  We skip loads that
5669        correspond to the gaps.  */
5670       if (next_stmt != first_stmt
5671           && gap_count < GROUP_GAP (vinfo_for_stmt (next_stmt)))
5672       {
5673         gap_count++;
5674         continue;
5675       }
5676
5677       while (next_stmt)
5678         {
5679           new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
5680           /* We assume that if VEC_STMT is not NULL, this is a case of multiple
5681              copies, and we put the new vector statement in the first available
5682              RELATED_STMT.  */
5683           if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
5684             STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
5685           else
5686             {
5687               if (!GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5688                 {
5689                   gimple prev_stmt =
5690                     STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
5691                   gimple rel_stmt =
5692                     STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt));
5693                   while (rel_stmt)
5694                     {
5695                       prev_stmt = rel_stmt;
5696                       rel_stmt =
5697                         STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
5698                     }
5699
5700                   STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) =
5701                     new_stmt;
5702                 }
5703             }
5704
5705           next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
5706           gap_count = 1;
5707           /* If NEXT_STMT accesses the same DR as the previous statement,
5708              put the same TMP_DATA_REF as its vectorized statement; otherwise
5709              get the next data-ref from RESULT_CHAIN.  */
5710           if (!next_stmt || !GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5711             break;
5712         }
5713     }
5714 }
5715
5716 /* Function vect_force_dr_alignment_p.
5717
5718    Returns whether the alignment of a DECL can be forced to be aligned
5719    on ALIGNMENT bit boundary.  */
5720
5721 bool
5722 vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
5723 {
5724   if (TREE_CODE (decl) != VAR_DECL)
5725     return false;
5726
5727   /* With -fno-toplevel-reorder we may have already output the constant.  */
5728   if (TREE_ASM_WRITTEN (decl))
5729     return false;
5730
5731   /* Constant pool entries may be shared and not properly merged by LTO.  */
5732   if (DECL_IN_CONSTANT_POOL (decl))
5733     return false;
5734
5735   if (TREE_PUBLIC (decl) || DECL_EXTERNAL (decl))
5736     {
5737       symtab_node *snode;
5738
5739       /* We cannot change alignment of symbols that may bind to symbols
5740          in other translation unit that may contain a definition with lower
5741          alignment.  */
5742       if (!decl_binds_to_current_def_p (decl))
5743         return false;
5744
5745       /* When compiling partition, be sure the symbol is not output by other
5746          partition.  */
5747       snode = symtab_node::get (decl);
5748       if (flag_ltrans
5749           && (snode->in_other_partition
5750               || snode->get_partitioning_class () == SYMBOL_DUPLICATE))
5751         return false;
5752     }
5753
5754   /* Do not override the alignment as specified by the ABI when the used
5755      attribute is set.  */
5756   if (DECL_PRESERVE_P (decl))
5757     return false;
5758
5759   /* Do not override explicit alignment set by the user when an explicit
5760      section name is also used.  This is a common idiom used by many
5761      software projects.  */
5762   if (TREE_STATIC (decl)
5763       && DECL_SECTION_NAME (decl) != NULL
5764       && !symtab_node::get (decl)->implicit_section)
5765     return false;
5766
5767   /* If symbol is an alias, we need to check that target is OK.  */
5768   if (TREE_STATIC (decl))
5769     {
5770       tree target = symtab_node::get (decl)->ultimate_alias_target ()->decl;
5771       if (target != decl)
5772         {
5773           if (DECL_PRESERVE_P (target))
5774             return false;
5775           decl = target;
5776         }
5777     }
5778
5779   if (TREE_STATIC (decl))
5780     return (alignment <= MAX_OFILE_ALIGNMENT);
5781   else
5782     return (alignment <= MAX_STACK_ALIGNMENT);
5783 }
5784
5785
5786 /* Return whether the data reference DR is supported with respect to its
5787    alignment.
5788    If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
5789    it is aligned, i.e., check if it is possible to vectorize it with different
5790    alignment.  */
5791
5792 enum dr_alignment_support
5793 vect_supportable_dr_alignment (struct data_reference *dr,
5794                                bool check_aligned_accesses)
5795 {
5796   gimple stmt = DR_STMT (dr);
5797   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5798   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5799   machine_mode mode = TYPE_MODE (vectype);
5800   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5801   struct loop *vect_loop = NULL;
5802   bool nested_in_vect_loop = false;
5803
5804   if (aligned_access_p (dr) && !check_aligned_accesses)
5805     return dr_aligned;
5806
5807   /* For now assume all conditional loads/stores support unaligned
5808      access without any special code.  */
5809   if (is_gimple_call (stmt)
5810       && gimple_call_internal_p (stmt)
5811       && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
5812           || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
5813     return dr_unaligned_supported;
5814
5815   if (loop_vinfo)
5816     {
5817       vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
5818       nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
5819     }
5820
5821   /* Possibly unaligned access.  */
5822
5823   /* We can choose between using the implicit realignment scheme (generating
5824      a misaligned_move stmt) and the explicit realignment scheme (generating
5825      aligned loads with a REALIGN_LOAD).  There are two variants to the
5826      explicit realignment scheme: optimized, and unoptimized.
5827      We can optimize the realignment only if the step between consecutive
5828      vector loads is equal to the vector size.  Since the vector memory
5829      accesses advance in steps of VS (Vector Size) in the vectorized loop, it
5830      is guaranteed that the misalignment amount remains the same throughout the
5831      execution of the vectorized loop.  Therefore, we can create the
5832      "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
5833      at the loop preheader.
5834
5835      However, in the case of outer-loop vectorization, when vectorizing a
5836      memory access in the inner-loop nested within the LOOP that is now being
5837      vectorized, while it is guaranteed that the misalignment of the
5838      vectorized memory access will remain the same in different outer-loop
5839      iterations, it is *not* guaranteed that is will remain the same throughout
5840      the execution of the inner-loop.  This is because the inner-loop advances
5841      with the original scalar step (and not in steps of VS).  If the inner-loop
5842      step happens to be a multiple of VS, then the misalignment remains fixed
5843      and we can use the optimized realignment scheme.  For example:
5844
5845       for (i=0; i<N; i++)
5846         for (j=0; j<M; j++)
5847           s += a[i+j];
5848
5849      When vectorizing the i-loop in the above example, the step between
5850      consecutive vector loads is 1, and so the misalignment does not remain
5851      fixed across the execution of the inner-loop, and the realignment cannot
5852      be optimized (as illustrated in the following pseudo vectorized loop):
5853
5854       for (i=0; i<N; i+=4)
5855         for (j=0; j<M; j++){
5856           vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
5857                          // when j is {0,1,2,3,4,5,6,7,...} respectively.
5858                          // (assuming that we start from an aligned address).
5859           }
5860
5861      We therefore have to use the unoptimized realignment scheme:
5862
5863       for (i=0; i<N; i+=4)
5864           for (j=k; j<M; j+=4)
5865           vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
5866                            // that the misalignment of the initial address is
5867                            // 0).
5868
5869      The loop can then be vectorized as follows:
5870
5871       for (k=0; k<4; k++){
5872         rt = get_realignment_token (&vp[k]);
5873         for (i=0; i<N; i+=4){
5874           v1 = vp[i+k];
5875           for (j=k; j<M; j+=4){
5876             v2 = vp[i+j+VS-1];
5877             va = REALIGN_LOAD <v1,v2,rt>;
5878             vs += va;
5879             v1 = v2;
5880           }
5881         }
5882     } */
5883
5884   if (DR_IS_READ (dr))
5885     {
5886       bool is_packed = false;
5887       tree type = (TREE_TYPE (DR_REF (dr)));
5888
5889       if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
5890           && (!targetm.vectorize.builtin_mask_for_load
5891               || targetm.vectorize.builtin_mask_for_load ()))
5892         {
5893           tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5894           if ((nested_in_vect_loop
5895                && (TREE_INT_CST_LOW (DR_STEP (dr))
5896                    != GET_MODE_SIZE (TYPE_MODE (vectype))))
5897               || !loop_vinfo)
5898             return dr_explicit_realign;
5899           else
5900             return dr_explicit_realign_optimized;
5901         }
5902       if (!known_alignment_for_access_p (dr))
5903         is_packed = not_size_aligned (DR_REF (dr));
5904
5905       if ((TYPE_USER_ALIGN (type) && !is_packed)
5906           || targetm.vectorize.
5907                support_vector_misalignment (mode, type,
5908                                             DR_MISALIGNMENT (dr), is_packed))
5909         /* Can't software pipeline the loads, but can at least do them.  */
5910         return dr_unaligned_supported;
5911     }
5912   else
5913     {
5914       bool is_packed = false;
5915       tree type = (TREE_TYPE (DR_REF (dr)));
5916
5917       if (!known_alignment_for_access_p (dr))
5918         is_packed = not_size_aligned (DR_REF (dr));
5919
5920      if ((TYPE_USER_ALIGN (type) && !is_packed)
5921          || targetm.vectorize.
5922               support_vector_misalignment (mode, type,
5923                                            DR_MISALIGNMENT (dr), is_packed))
5924        return dr_unaligned_supported;
5925     }
5926
5927   /* Unsupported.  */
5928   return dr_unaligned_unsupported;
5929 }