gcc/tree-vect-data-refs.c
48e1416a 1/* Data References Analysis and Manipulation Utilities for Vectorization.
d353bf18 2 Copyright (C) 2003-2015 Free Software Foundation, Inc.
48e1416a 3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
fb85abff 4 and Ira Rosen <irar@il.ibm.com>
5
6This file is part of GCC.
7
8GCC is free software; you can redistribute it and/or modify it under
9the terms of the GNU General Public License as published by the Free
10Software Foundation; either version 3, or (at your option) any later
11version.
12
13GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14WARRANTY; without even the implied warranty of MERCHANTABILITY or
15FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16for more details.
17
18You should have received a copy of the GNU General Public License
19along with GCC; see the file COPYING3. If not see
20<http://www.gnu.org/licenses/>. */
21
22#include "config.h"
23#include "system.h"
24#include "coretypes.h"
9ef16211 25#include "backend.h"
7c29e30e 26#include "target.h"
27#include "rtl.h"
fb85abff 28#include "tree.h"
9ef16211 29#include "gimple.h"
7c29e30e 30#include "predict.h"
31#include "tm_p.h"
9ef16211 32#include "ssa.h"
7c29e30e 33#include "optabs-tree.h"
34#include "cgraph.h"
35#include "gimple-pretty-print.h"
36#include "diagnostic-core.h"
37#include "dumpfile.h"
9ef16211 38#include "alias.h"
b20a8bb4 39#include "fold-const.h"
9ed99284 40#include "stor-layout.h"
bc61cadb 41#include "internal-fn.h"
42#include "tree-eh.h"
a8783bee 43#include "gimplify.h"
dcf1a1ec 44#include "gimple-iterator.h"
e795d6e1 45#include "gimplify-me.h"
05d9c18a 46#include "tree-ssa-loop-ivopts.h"
47#include "tree-ssa-loop-manip.h"
073c1fd5 48#include "tree-ssa-loop.h"
fb85abff 49#include "cfgloop.h"
fb85abff 50#include "tree-chrec.h"
51#include "tree-scalar-evolution.h"
52#include "tree-vectorizer.h"
8e3cb73b 53#include "expr.h"
f7715905 54#include "builtins.h"
0d8001a7 55#include "params.h"
fb85abff 56
94b7b4dd 57/* Return true if load- or store-lanes optab OPTAB is implemented for
58 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
59
60static bool
61vect_lanes_optab_supported_p (const char *name, convert_optab optab,
62 tree vectype, unsigned HOST_WIDE_INT count)
63{
3754d046 64 machine_mode mode, array_mode;
94b7b4dd 65 bool limit_p;
66
67 mode = TYPE_MODE (vectype);
68 limit_p = !targetm.array_mode_supported_p (mode, count);
69 array_mode = mode_for_size (count * GET_MODE_BITSIZE (mode),
70 MODE_INT, limit_p);
71
72 if (array_mode == BLKmode)
73 {
6d8fb6cf 74 if (dump_enabled_p ())
78bb46f5 75 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
76 "no array mode for %s[" HOST_WIDE_INT_PRINT_DEC "]\n",
7bd765d4 77 GET_MODE_NAME (mode), count);
94b7b4dd 78 return false;
79 }
80
81 if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
82 {
6d8fb6cf 83 if (dump_enabled_p ())
7bd765d4 84 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
78bb46f5 85 "cannot use %s<%s><%s>\n", name,
7bd765d4 86 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
94b7b4dd 87 return false;
88 }
89
6d8fb6cf 90 if (dump_enabled_p ())
7bd765d4 91 dump_printf_loc (MSG_NOTE, vect_location,
78bb46f5 92 "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
7bd765d4 93 GET_MODE_NAME (mode));
94b7b4dd 94
95 return true;
96}
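/* A hypothetical usage sketch: for a 128-bit vector mode such as V4SImode
   and COUNT == 2, the routine above asks for a 256-bit integer array mode
   and then checks for a target handler of the load-/store-lanes optab.
   A caller along the lines of

     if (vect_lanes_optab_supported_p ("vec_load_lanes",
                                       vec_load_lanes_optab,
                                       vectype, group_size))
       ... use the load-lanes scheme for this interleaving group ...

   (with VECTYPE and GROUP_SIZE taken from the group being analyzed)
   therefore succeeds only when such an array mode and handler exist.  */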
97
98
fb85abff 99/* Return the smallest scalar part of STMT.
282bf14c 100 This is used to determine the vectype of the stmt. We generally set the
101 vectype according to the type of the result (lhs). For stmts whose
fb85abff 102 result-type is different than the type of the arguments (e.g., demotion,
48e1416a 103 promotion), vectype will be reset appropriately (later). Note that we have
fb85abff 104 to visit the smallest datatype in this function, because that determines the
282bf14c 105 VF. If the smallest datatype in the loop is present only as the rhs of a
fb85abff 106 promotion operation, we would otherwise miss it.
107 Such a case, where a variable of this datatype does not appear in the lhs
108 anywhere in the loop, can only occur if it's an invariant: e.g.:
48e1416a 109 'int_x = (int) short_inv', which we'd expect to have been optimized away by
282bf14c 110 invariant motion. However, we cannot rely on invariant motion to always
111 take invariants out of the loop, and so in the case of promotion we also
112 have to check the rhs.
fb85abff 113 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
114 types. */
115
116tree
42acab1c 117vect_get_smallest_scalar_type (gimple *stmt, HOST_WIDE_INT *lhs_size_unit,
fb85abff 118 HOST_WIDE_INT *rhs_size_unit)
119{
120 tree scalar_type = gimple_expr_type (stmt);
121 HOST_WIDE_INT lhs, rhs;
122
f9ae6f95 123 lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
fb85abff 124
125 if (is_gimple_assign (stmt)
126 && (gimple_assign_cast_p (stmt)
127 || gimple_assign_rhs_code (stmt) == WIDEN_MULT_EXPR
1c928a0c 128 || gimple_assign_rhs_code (stmt) == WIDEN_LSHIFT_EXPR
fb85abff 129 || gimple_assign_rhs_code (stmt) == FLOAT_EXPR))
130 {
131 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
132
f9ae6f95 133 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
fb85abff 134 if (rhs < lhs)
135 scalar_type = rhs_type;
136 }
48e1416a 137
138 *lhs_size_unit = lhs;
fb85abff 139 *rhs_size_unit = rhs;
140 return scalar_type;
141}
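/* A hypothetical example: in a loop such as

     short *s; int *d;
     for (i = 0; i < n; i++)
       d[i] = (int) s[i];

   the statement producing d[i] has an int result but a short rhs, so the
   function above returns the 2-byte type and the vectorization factor is
   derived from the smallest (2-byte) element rather than from the int
   result.  */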
142
143
fb85abff 144/* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
145 tested at run-time. Return TRUE if DDR was successfully inserted.
146 Return false if versioning is not supported. */
147
148static bool
149vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
150{
151 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
152
153 if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
154 return false;
155
6d8fb6cf 156 if (dump_enabled_p ())
fb85abff 157 {
7bd765d4 158 dump_printf_loc (MSG_NOTE, vect_location,
159 "mark for run-time aliasing test between ");
160 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr)));
161 dump_printf (MSG_NOTE, " and ");
162 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr)));
78bb46f5 163 dump_printf (MSG_NOTE, "\n");
fb85abff 164 }
165
166 if (optimize_loop_nest_for_size_p (loop))
167 {
6d8fb6cf 168 if (dump_enabled_p ())
78bb46f5 169 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
170 "versioning not supported when optimizing"
171 " for size.\n");
fb85abff 172 return false;
173 }
174
175 /* FORNOW: We don't support versioning with outer-loop vectorization. */
176 if (loop->inner)
177 {
6d8fb6cf 178 if (dump_enabled_p ())
78bb46f5 179 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
180 "versioning not yet supported for outer-loops.\n");
fb85abff 181 return false;
182 }
183
f634c3e9 184 /* FORNOW: We don't support creating runtime alias tests for non-constant
185 step. */
186 if (TREE_CODE (DR_STEP (DDR_A (ddr))) != INTEGER_CST
187 || TREE_CODE (DR_STEP (DDR_B (ddr))) != INTEGER_CST)
188 {
6d8fb6cf 189 if (dump_enabled_p ())
78bb46f5 190 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7bd765d4 191 "versioning not yet supported for non-constant "
78bb46f5 192 "step\n");
f634c3e9 193 return false;
194 }
195
f1f41a6c 196 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
fb85abff 197 return true;
198}
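/* A hypothetical illustration: for

     void f (int *a, int *b, int n)
     {
       for (int i = 0; i < n; i++)
         a[i] = b[i] + 1;
     }

   the dependence between a[i] and b[i] is unknown at compile time, so the
   DDR is pushed onto LOOP_VINFO_MAY_ALIAS_DDRS and the vectorized loop is
   later guarded by a run-time check of roughly the form

     if (a + n <= b || b + n <= a)
       ... vectorized loop ...
     else
       ... scalar fallback loop ...  */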
199
37545e54 200
fb85abff 201/* Function vect_analyze_data_ref_dependence.
202
203 Return TRUE if there (might) exist a dependence between a memory-reference
204 DRA and a memory-reference DRB. When the dependence can instead be checked
91a74fc6 205 at run-time by versioning for alias, return FALSE. Adjust *MAX_VF according to
206 the data dependence. */
48e1416a 207
fb85abff 208static bool
209vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
80508571 210 loop_vec_info loop_vinfo, int *max_vf)
fb85abff 211{
212 unsigned int i;
68f15e9d 213 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
fb85abff 214 struct data_reference *dra = DDR_A (ddr);
215 struct data_reference *drb = DDR_B (ddr);
48e1416a 216 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
fb85abff 217 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
fb85abff 218 lambda_vector dist_v;
219 unsigned int loop_depth;
48e1416a 220
68f15e9d 221 /* In loop analysis all data references should be vectorizable. */
6ea6a380 222 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
223 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
68f15e9d 224 gcc_unreachable ();
6ea6a380 225
68f15e9d 226 /* Independent data accesses. */
fb85abff 227 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
68f15e9d 228 return false;
37545e54 229
68f15e9d 230 if (dra == drb
231 || (DR_IS_READ (dra) && DR_IS_READ (drb)))
fb85abff 232 return false;
48e1416a 233
0f52e33a 234 /* Even if we have an anti-dependence then, as the vectorized loop covers at
235 least two scalar iterations, there is always also a true dependence.
236 As the vectorizer does not re-order loads and stores we can ignore
237 the anti-dependence if TBAA can disambiguate both DRs similar to the
238 case with known negative distance anti-dependences (positive
239 distance anti-dependences would violate TBAA constraints). */
240 if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
241 || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
242 && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
243 get_alias_set (DR_REF (drb))))
244 return false;
48e1416a 245
68f15e9d 246 /* Unknown data dependence. */
fb85abff 247 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
248 {
3d483a94 249 /* If user asserted safelen consecutive iterations can be
250 executed concurrently, assume independence. */
251 if (loop->safelen >= 2)
252 {
253 if (loop->safelen < *max_vf)
254 *max_vf = loop->safelen;
c7a8722c 255 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
3d483a94 256 return false;
257 }
258
0bd6d857 259 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
260 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
95e19962 261 {
262 if (dump_enabled_p ())
263 {
264 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
265 "versioning for alias not supported for: "
266 "can't determine dependence between ");
267 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
268 DR_REF (dra));
269 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
270 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
271 DR_REF (drb));
78bb46f5 272 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
95e19962 273 }
4c2623ad 274 return true;
95e19962 275 }
276
6d8fb6cf 277 if (dump_enabled_p ())
68f15e9d 278 {
279 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
280 "versioning for alias required: "
281 "can't determine dependence between ");
282 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
283 DR_REF (dra));
284 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
285 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
286 DR_REF (drb));
78bb46f5 287 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
68f15e9d 288 }
d4b21757 289
68f15e9d 290 /* Add to list of ddrs that need to be tested at run-time. */
291 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
37545e54 292 }
293
68f15e9d 294 /* Known data dependence. */
fb85abff 295 if (DDR_NUM_DIST_VECTS (ddr) == 0)
296 {
3d483a94 297 /* If user asserted safelen consecutive iterations can be
298 executed concurrently, assume independence. */
299 if (loop->safelen >= 2)
300 {
301 if (loop->safelen < *max_vf)
302 *max_vf = loop->safelen;
c7a8722c 303 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
3d483a94 304 return false;
305 }
306
0bd6d857 307 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
308 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
95e19962 309 {
310 if (dump_enabled_p ())
311 {
312 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
313 "versioning for alias not supported for: "
314 "bad dist vector for ");
315 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
316 DR_REF (dra));
317 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
318 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
319 DR_REF (drb));
78bb46f5 320 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
95e19962 321 }
4c2623ad 322 return true;
95e19962 323 }
324
6d8fb6cf 325 if (dump_enabled_p ())
fb85abff 326 {
78bb46f5 327 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7bd765d4 328 "versioning for alias required: "
329 "bad dist vector for ");
330 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
331 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
332 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
78bb46f5 333 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
fb85abff 334 }
335 /* Add to list of ddrs that need to be tested at run-time. */
336 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
48e1416a 337 }
fb85abff 338
339 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
f1f41a6c 340 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
fb85abff 341 {
342 int dist = dist_v[loop_depth];
343
6d8fb6cf 344 if (dump_enabled_p ())
7bd765d4 345 dump_printf_loc (MSG_NOTE, vect_location,
78bb46f5 346 "dependence distance = %d.\n", dist);
fb85abff 347
91a74fc6 348 if (dist == 0)
fb85abff 349 {
6d8fb6cf 350 if (dump_enabled_p ())
fb85abff 351 {
78bb46f5 352 dump_printf_loc (MSG_NOTE, vect_location,
353 "dependence distance == 0 between ");
7bd765d4 354 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
355 dump_printf (MSG_NOTE, " and ");
356 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
78bb46f5 357 dump_printf (MSG_NOTE, "\n");
fb85abff 358 }
359
4d525783 360 /* When we perform grouped accesses and perform implicit CSE
361 by detecting equal accesses and doing disambiguation with
362 runtime alias tests like for
363 .. = a[i];
364 .. = a[i+1];
365 a[i] = ..;
366 a[i+1] = ..;
367 *p = ..;
368 .. = a[i];
369 .. = a[i+1];
370 where we will end up loading { a[i], a[i+1] } once, make
371 sure that inserting group loads before the first load and
5a91be9e 372 stores after the last store will do the right thing.
373 Similar for groups like
374 a[i] = ...;
375 ... = a[i];
376 a[i+1] = ...;
377 where loads from the group interleave with the store. */
378 if (STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
379 || STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
4d525783 380 {
42acab1c 381 gimple *earlier_stmt;
4d525783 382 earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
383 if (DR_IS_WRITE
384 (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
385 {
386 if (dump_enabled_p ())
387 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
78bb46f5 388 "READ_WRITE dependence in interleaving."
389 "\n");
4d525783 390 return true;
391 }
fb85abff 392 }
48e1416a 393
91a74fc6 394 continue;
395 }
396
397 if (dist > 0 && DDR_REVERSED_P (ddr))
398 {
399 /* If DDR_REVERSED_P the order of the data-refs in DDR was
400 reversed (to make distance vector positive), and the actual
401 distance is negative. */
6d8fb6cf 402 if (dump_enabled_p ())
7bd765d4 403 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
78bb46f5 404 "dependence distance negative.\n");
a8cf7702 405 /* Record a negative dependence distance to later limit the
406 amount of stmt copying / unrolling we can perform.
407 Only need to handle read-after-write dependence. */
408 if (DR_IS_READ (drb)
409 && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
410 || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
411 STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
91a74fc6 412 continue;
413 }
414
415 if (abs (dist) >= 2
416 && abs (dist) < *max_vf)
417 {
418 /* The dependence distance requires reduction of the maximal
419 vectorization factor. */
420 *max_vf = abs (dist);
6d8fb6cf 421 if (dump_enabled_p ())
7bd765d4 422 dump_printf_loc (MSG_NOTE, vect_location,
78bb46f5 423 "adjusting maximal vectorization factor to %i\n",
424 *max_vf);
fb85abff 425 }
426
91a74fc6 427 if (abs (dist) >= *max_vf)
fb85abff 428 {
48e1416a 429 /* Dependence distance does not create dependence, as far as
91a74fc6 430 vectorization is concerned, in this case. */
6d8fb6cf 431 if (dump_enabled_p ())
7bd765d4 432 dump_printf_loc (MSG_NOTE, vect_location,
78bb46f5 433 "dependence distance >= VF.\n");
fb85abff 434 continue;
435 }
436
6d8fb6cf 437 if (dump_enabled_p ())
fb85abff 438 {
7bd765d4 439 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
78bb46f5 440 "not vectorized, possible dependence "
441 "between data-refs ");
7bd765d4 442 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
 443 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
 444 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
78bb46f5 445 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
fb85abff 446 }
447
448 return true;
449 }
450
451 return false;
452}
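/* A hypothetical worked example: in

     for (i = 0; i < n; i++)
       a[i + 3] = a[i] + 1.0f;

   the distance vector entry for the loop is 3.  Starting from, say,
   *MAX_VF == 8, the code above reduces *MAX_VF to 3; the distance is then
   no smaller than *MAX_VF, so the dependence no longer prevents
   vectorization, it only limits the vectorization factor.  */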
453
454/* Function vect_analyze_data_ref_dependences.
48e1416a 455
fb85abff 456 Examine all the data references in the loop, and make sure there do not
91a74fc6 457 exist any data dependences between them. Set *MAX_VF according to
458 the maximum vectorization factor the data dependences allow. */
48e1416a 459
fb85abff 460bool
68f15e9d 461vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo, int *max_vf)
fb85abff 462{
463 unsigned int i;
fb85abff 464 struct data_dependence_relation *ddr;
465
6d8fb6cf 466 if (dump_enabled_p ())
7bd765d4 467 dump_printf_loc (MSG_NOTE, vect_location,
78bb46f5 468 "=== vect_analyze_data_ref_dependences ===\n");
68f15e9d 469
41500e78 470 LOOP_VINFO_DDRS (loop_vinfo)
471 .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
472 * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
c7a8722c 473 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
68f15e9d 474 if (!compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
475 &LOOP_VINFO_DDRS (loop_vinfo),
476 LOOP_VINFO_LOOP_NEST (loop_vinfo), true))
477 return false;
478
479 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
480 if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf))
481 return false;
482
483 return true;
484}
485
486
487/* Function vect_slp_analyze_data_ref_dependence.
488
489 Return TRUE if there (might) exist a dependence between a memory-reference
490 DRA and a memory-reference DRB.  Unlike the loop variant above, no run-time
 491 alias versioning is attempted here and there is no vectorization factor
 492 to adjust; TRUE simply means the dependence prohibits vectorization. */
493
494static bool
495vect_slp_analyze_data_ref_dependence (struct data_dependence_relation *ddr)
496{
497 struct data_reference *dra = DDR_A (ddr);
498 struct data_reference *drb = DDR_B (ddr);
499
500 /* We need to check dependences of statements marked as unvectorizable
501 as well; they can still prohibit vectorization. */
502
503 /* Independent data accesses. */
504 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
505 return false;
506
507 if (dra == drb)
508 return false;
509
510 /* Read-read is OK. */
511 if (DR_IS_READ (dra) && DR_IS_READ (drb))
512 return false;
513
1fa434e3 514 /* If dra and drb are part of the same interleaving chain consider
515 them independent. */
516 if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (DR_STMT (dra)))
517 && (GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dra)))
518 == GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (drb)))))
519 return false;
520
68f15e9d 521 /* Unknown data dependence. */
522 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
07e3bcbf 523 {
50e6c257 524 if (dump_enabled_p ())
525 {
526 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
527 "can't determine dependence between ");
528 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
529 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
530 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
78bb46f5 531 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
50e6c257 532 }
07e3bcbf 533 }
50e6c257 534 else if (dump_enabled_p ())
07e3bcbf 535 {
68f15e9d 536 dump_printf_loc (MSG_NOTE, vect_location,
537 "determined dependence between ");
538 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
539 dump_printf (MSG_NOTE, " and ");
540 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
78bb46f5 541 dump_printf (MSG_NOTE, "\n");
07e3bcbf 542 }
48e1416a 543
50e6c257 544 /* We do not vectorize basic blocks with write-write dependencies. */
68f15e9d 545 if (DR_IS_WRITE (dra) && DR_IS_WRITE (drb))
546 return true;
547
50e6c257 548 /* If we have a read-write dependence, check that the load comes before the store.
68f15e9d 549 When we vectorize basic blocks, a vector load can only be placed before the
 550 corresponding scalar load, and a vector store can only be placed after its
 551 corresponding scalar store. So the order of the accesses is preserved if
 552 the load is before the store. */
42acab1c 553 gimple *earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
68f15e9d 554 if (DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
50e6c257 555 {
556 /* That only holds for load-store pairs taking part in vectorization. */
557 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dra)))
558 && STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (drb))))
559 return false;
560 }
68f15e9d 561
562 return true;
563}
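/* A hypothetical illustration: inside a single basic block, a scalar load
   from a[0] followed by a scalar store to a[0] is acceptable, because the
   vector load is emitted no later than the first scalar load and the
   vector store no earlier than the last scalar store, so the load still
   observes the old value.  The opposite order (store before load from the
   same location), like a write-write pair, makes the function above return
   true and prevents basic-block vectorization.  */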
564
565
566/* Function vect_slp_analyze_data_ref_dependences.
 567
 568 Examine all the data references in the basic-block, and make sure there
 569 do not exist any data dependences between them. Return FALSE if a
 570 prohibiting dependence was found. */
571
572bool
573vect_slp_analyze_data_ref_dependences (bb_vec_info bb_vinfo)
574{
575 struct data_dependence_relation *ddr;
576 unsigned int i;
577
578 if (dump_enabled_p ())
579 dump_printf_loc (MSG_NOTE, vect_location,
78bb46f5 580 "=== vect_slp_analyze_data_ref_dependences ===\n");
68f15e9d 581
582 if (!compute_all_dependences (BB_VINFO_DATAREFS (bb_vinfo),
583 &BB_VINFO_DDRS (bb_vinfo),
584 vNULL, true))
585 return false;
586
587 FOR_EACH_VEC_ELT (BB_VINFO_DDRS (bb_vinfo), i, ddr)
588 if (vect_slp_analyze_data_ref_dependence (ddr))
fb85abff 589 return false;
590
591 return true;
592}
593
594
595/* Function vect_compute_data_ref_alignment
596
597 Compute the misalignment of the data reference DR.
598
599 Output:
600 1. If during the misalignment computation it is found that the data reference
601 cannot be vectorized then false is returned.
602 2. DR_MISALIGNMENT (DR) is defined.
603
604 FOR NOW: No analysis is actually performed. Misalignment is calculated
605 only for trivial cases. TODO. */
606
607static bool
608vect_compute_data_ref_alignment (struct data_reference *dr)
609{
42acab1c 610 gimple *stmt = DR_STMT (dr);
48e1416a 611 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
fb85abff 612 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
37545e54 613 struct loop *loop = NULL;
fb85abff 614 tree ref = DR_REF (dr);
615 tree vectype;
616 tree base, base_addr;
994be998 617 tree misalign = NULL_TREE;
b4142d58 618 tree aligned_to;
619 unsigned HOST_WIDE_INT alignment;
48e1416a 620
6d8fb6cf 621 if (dump_enabled_p ())
7bd765d4 622 dump_printf_loc (MSG_NOTE, vect_location,
78bb46f5 623 "vect_compute_data_ref_alignment:\n");
fb85abff 624
37545e54 625 if (loop_vinfo)
626 loop = LOOP_VINFO_LOOP (loop_vinfo);
48e1416a 627
fb85abff 628 /* Initialize misalignment to unknown. */
629 SET_DR_MISALIGNMENT (dr, -1);
630
994be998 631 if (tree_fits_shwi_p (DR_STEP (dr)))
632 misalign = DR_INIT (dr);
fb85abff 633 aligned_to = DR_ALIGNED_TO (dr);
634 base_addr = DR_BASE_ADDRESS (dr);
635 vectype = STMT_VINFO_VECTYPE (stmt_info);
636
637 /* In case the dataref is in an inner-loop of the loop that is being
638 vectorized (LOOP), we use the base and misalignment information
282bf14c 639 relative to the outer-loop (LOOP). This is ok only if the misalignment
fb85abff 640 stays the same throughout the execution of the inner-loop, which is why
641 we have to check that the stride of the dataref in the inner-loop is
 642 evenly divisible by the vector size. */
37545e54 643 if (loop && nested_in_vect_loop_p (loop, stmt))
fb85abff 644 {
645 tree step = DR_STEP (dr);
48e1416a 646
994be998 647 if (tree_fits_shwi_p (step)
648 && tree_to_shwi (step) % GET_MODE_SIZE (TYPE_MODE (vectype)) == 0)
fb85abff 649 {
6d8fb6cf 650 if (dump_enabled_p ())
7bd765d4 651 dump_printf_loc (MSG_NOTE, vect_location,
78bb46f5 652 "inner step divides the vector-size.\n");
fb85abff 653 misalign = STMT_VINFO_DR_INIT (stmt_info);
654 aligned_to = STMT_VINFO_DR_ALIGNED_TO (stmt_info);
655 base_addr = STMT_VINFO_DR_BASE_ADDRESS (stmt_info);
656 }
657 else
658 {
6d8fb6cf 659 if (dump_enabled_p ())
7bd765d4 660 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
78bb46f5 661 "inner step doesn't divide the vector-size.\n");
fb85abff 662 misalign = NULL_TREE;
663 }
664 }
665
c1bee668 666 /* Similarly we can only use base and misalignment information relative to
667 an innermost loop if the misalignment stays the same throughout the
668 execution of the loop. As above, this is the case if the stride of
669 the dataref is evenly divisible by the vector size. */
670 else
38682b67 671 {
672 tree step = DR_STEP (dr);
c1bee668 673 unsigned vf = loop ? LOOP_VINFO_VECT_FACTOR (loop_vinfo) : 1;
38682b67 674
994be998 675 if (tree_fits_shwi_p (step)
c1bee668 676 && ((tree_to_shwi (step) * vf)
677 % GET_MODE_SIZE (TYPE_MODE (vectype)) != 0))
38682b67 678 {
6d8fb6cf 679 if (dump_enabled_p ())
78bb46f5 680 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
c1bee668 681 "step doesn't divide the vector-size.\n");
38682b67 682 misalign = NULL_TREE;
683 }
684 }
685
9dd88d41 686 /* To look at alignment of the base we have to preserve an inner MEM_REF
687 as that carries alignment information of the actual access. */
688 base = ref;
689 while (handled_component_p (base))
690 base = TREE_OPERAND (base, 0);
691 if (TREE_CODE (base) == MEM_REF)
692 base = build2 (MEM_REF, TREE_TYPE (base), base_addr,
693 build_int_cst (TREE_TYPE (TREE_OPERAND (base, 1)), 0));
694 unsigned int base_alignment = get_object_alignment (base);
695
696 if (base_alignment >= TYPE_ALIGN (TREE_TYPE (vectype)))
697 DR_VECT_AUX (dr)->base_element_aligned = true;
698
b4142d58 699 alignment = TYPE_ALIGN_UNIT (vectype);
fb85abff 700
b4142d58 701 if ((compare_tree_int (aligned_to, alignment) < 0)
fb85abff 702 || !misalign)
703 {
6d8fb6cf 704 if (dump_enabled_p ())
fb85abff 705 {
7bd765d4 706 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
78bb46f5 707 "Unknown alignment for access: ");
b4142d58 708 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
78bb46f5 709 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
fb85abff 710 }
711 return true;
712 }
713
9dd88d41 714 if (base_alignment < TYPE_ALIGN (vectype))
fb85abff 715 {
b4142d58 716 /* Strip an inner MEM_REF to a bare decl if possible. */
717 if (TREE_CODE (base) == MEM_REF
718 && integer_zerop (TREE_OPERAND (base, 1))
719 && TREE_CODE (TREE_OPERAND (base, 0)) == ADDR_EXPR)
720 base = TREE_OPERAND (TREE_OPERAND (base, 0), 0);
721
5c8bc079 722 if (!vect_can_force_dr_alignment_p (base, TYPE_ALIGN (vectype)))
fb85abff 723 {
6d8fb6cf 724 if (dump_enabled_p ())
fb85abff 725 {
7bd765d4 726 dump_printf_loc (MSG_NOTE, vect_location,
78bb46f5 727 "can't force alignment of ref: ");
7bd765d4 728 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
78bb46f5 729 dump_printf (MSG_NOTE, "\n");
fb85abff 730 }
731 return true;
732 }
48e1416a 733
fb85abff 734 /* Force the alignment of the decl.
735 NOTE: This is the only change to the code we make during
736 the analysis phase, before deciding to vectorize the loop. */
6d8fb6cf 737 if (dump_enabled_p ())
0822b158 738 {
7bd765d4 739 dump_printf_loc (MSG_NOTE, vect_location, "force alignment of ");
740 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
78bb46f5 741 dump_printf (MSG_NOTE, "\n");
0822b158 742 }
743
9dd88d41 744 DR_VECT_AUX (dr)->base_decl = base;
745 DR_VECT_AUX (dr)->base_misaligned = true;
746 DR_VECT_AUX (dr)->base_element_aligned = true;
fb85abff 747 }
748
85a846a2 749 /* If this is a backward running DR then first access in the larger
750 vectype actually is N-1 elements before the address in the DR.
751 Adjust misalign accordingly. */
b4142d58 752 if (tree_int_cst_sgn (DR_STEP (dr)) < 0)
85a846a2 753 {
754 tree offset = ssize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1);
755 /* DR_STEP(dr) is the same as -TYPE_SIZE of the scalar type,
756 otherwise we wouldn't be here. */
757 offset = fold_build2 (MULT_EXPR, ssizetype, offset, DR_STEP (dr));
758 /* PLUS because DR_STEP was negative. */
759 misalign = size_binop (PLUS_EXPR, misalign, offset);
760 }
761
b4142d58 762 SET_DR_MISALIGNMENT (dr,
763 wi::mod_floor (misalign, alignment, SIGNED).to_uhwi ());
fb85abff 764
6d8fb6cf 765 if (dump_enabled_p ())
fb85abff 766 {
7bd765d4 767 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
768 "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr));
769 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
78bb46f5 770 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
fb85abff 771 }
772
773 return true;
774}
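/* A worked example with hypothetical numbers: for a V4SF access (16-byte
   vector type, 4-byte elements) whose DR_BASE_ADDRESS is known to be
   16-byte aligned and whose DR_INIT is 4, the code above records
   DR_MISALIGNMENT == 4.  For a negative step the misalignment is first
   shifted by (nelements - 1) * |step|, so that it describes the lowest
   address actually touched by the vector access.  */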
775
776
777/* Function vect_compute_data_refs_alignment
778
779 Compute the misalignment of data references in the loop.
780 Return FALSE if a data reference is found that cannot be vectorized. */
781
782static bool
e2c5c678 783vect_compute_data_refs_alignment (vec_info *vinfo)
fb85abff 784{
e2c5c678 785 vec<data_reference_p> datarefs = vinfo->datarefs;
fb85abff 786 struct data_reference *dr;
787 unsigned int i;
788
f1f41a6c 789 FOR_EACH_VEC_ELT (datarefs, i, dr)
25359ccf 790 {
791 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
792 if (STMT_VINFO_VECTORIZABLE (stmt_info)
793 && !vect_compute_data_ref_alignment (dr))
794 {
795 /* Strided accesses perform only component accesses, misalignment
796 information is irrelevant for them. */
797 if (STMT_VINFO_STRIDED_P (stmt_info)
798 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
799 continue;
800
801 if (is_a <bb_vec_info> (vinfo))
802 {
803 /* Mark unsupported statement as unvectorizable. */
804 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
805 continue;
806 }
807 else
808 return false;
809 }
810 }
fb85abff 811
812 return true;
813}
814
815
816/* Function vect_update_misalignment_for_peel
817
818 DR - the data reference whose misalignment is to be adjusted.
819 DR_PEEL - the data reference whose misalignment is being made
820 zero in the vector loop by the peel.
821 NPEEL - the number of iterations in the peel loop if the misalignment
822 of DR_PEEL is known at compile time. */
823
824static void
825vect_update_misalignment_for_peel (struct data_reference *dr,
826 struct data_reference *dr_peel, int npeel)
827{
828 unsigned int i;
f1f41a6c 829 vec<dr_p> same_align_drs;
fb85abff 830 struct data_reference *current_dr;
831 int dr_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
832 int dr_peel_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr_peel))));
833 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
834 stmt_vec_info peel_stmt_info = vinfo_for_stmt (DR_STMT (dr_peel));
835
836 /* For interleaved data accesses the step in the loop must be multiplied by
837 the size of the interleaving group. */
ee612634 838 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
21009880 839 dr_size *= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)));
ee612634 840 if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info))
21009880 841 dr_peel_size *= GROUP_SIZE (peel_stmt_info);
fb85abff 842
843 /* It can be assumed that the data refs with the same alignment as dr_peel
844 are aligned in the vector loop. */
845 same_align_drs
846 = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (DR_STMT (dr_peel)));
f1f41a6c 847 FOR_EACH_VEC_ELT (same_align_drs, i, current_dr)
fb85abff 848 {
849 if (current_dr != dr)
850 continue;
851 gcc_assert (DR_MISALIGNMENT (dr) / dr_size ==
852 DR_MISALIGNMENT (dr_peel) / dr_peel_size);
853 SET_DR_MISALIGNMENT (dr, 0);
854 return;
855 }
856
857 if (known_alignment_for_access_p (dr)
858 && known_alignment_for_access_p (dr_peel))
859 {
f1b8c740 860 bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
fb85abff 861 int misal = DR_MISALIGNMENT (dr);
862 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
f1b8c740 863 misal += negative ? -npeel * dr_size : npeel * dr_size;
482a44fa 864 misal &= (TYPE_ALIGN (vectype) / BITS_PER_UNIT) - 1;
fb85abff 865 SET_DR_MISALIGNMENT (dr, misal);
866 return;
867 }
868
6d8fb6cf 869 if (dump_enabled_p ())
78bb46f5 870 dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment to -1.\n");
fb85abff 871 SET_DR_MISALIGNMENT (dr, -1);
872}
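/* A worked example with hypothetical numbers: with a 16-byte vector type
   and 4-byte elements, suppose DR_PEEL has misalignment 8, so the prologue
   peels NPEEL == 2 scalar iterations to align it.  A second data-ref with
   misalignment 12 and a positive step is then updated to
   (12 + 2 * 4) & 15 == 4, while any data-ref whose misalignment (or whose
   peel misalignment) is unknown is simply reset to -1.  */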
873
874
875/* Function vect_verify_datarefs_alignment
876
877 Return TRUE if all data references in the loop can be
878 handled with respect to alignment. */
879
37545e54 880bool
e2c5c678 881vect_verify_datarefs_alignment (vec_info *vinfo)
fb85abff 882{
e2c5c678 883 vec<data_reference_p> datarefs = vinfo->datarefs;
fb85abff 884 struct data_reference *dr;
885 enum dr_alignment_support supportable_dr_alignment;
886 unsigned int i;
887
f1f41a6c 888 FOR_EACH_VEC_ELT (datarefs, i, dr)
fb85abff 889 {
42acab1c 890 gimple *stmt = DR_STMT (dr);
fb85abff 891 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
892
1ad41595 893 if (!STMT_VINFO_RELEVANT_P (stmt_info))
894 continue;
895
6ea6a380 896 /* For interleaving, only the alignment of the first access matters.
897 Skip statements marked as not vectorizable. */
ee612634 898 if ((STMT_VINFO_GROUPED_ACCESS (stmt_info)
21009880 899 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
6ea6a380 900 || !STMT_VINFO_VECTORIZABLE (stmt_info))
fb85abff 901 continue;
902
e1c75243 903 /* Strided accesses perform only component accesses, alignment is
506aa8fc 904 irrelevant for them. */
e1c75243 905 if (STMT_VINFO_STRIDED_P (stmt_info)
994be998 906 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
506aa8fc 907 continue;
908
0822b158 909 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
fb85abff 910 if (!supportable_dr_alignment)
911 {
6d8fb6cf 912 if (dump_enabled_p ())
fb85abff 913 {
914 if (DR_IS_READ (dr))
7bd765d4 915 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
916 "not vectorized: unsupported unaligned load.");
fb85abff 917 else
7bd765d4 918 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
919 "not vectorized: unsupported unaligned "
920 "store.");
6ea6a380 921
7bd765d4 922 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
923 DR_REF (dr));
78bb46f5 924 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
fb85abff 925 }
926 return false;
927 }
6d8fb6cf 928 if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
7bd765d4 929 dump_printf_loc (MSG_NOTE, vect_location,
78bb46f5 930 "Vectorizing an unaligned access.\n");
fb85abff 931 }
932 return true;
933}
934
cfa724cf 935/* Given a memory reference EXP, return whether its alignment is less
936 than its size. */
937
938static bool
939not_size_aligned (tree exp)
940{
e913b5cd 941 if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
cfa724cf 942 return true;
943
e913b5cd 944 return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
cfa724cf 945 > get_object_alignment (exp));
946}
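/* A hypothetical example: for

     struct __attribute__ ((packed)) s { char c; int i; } *p;
     ... = p->i;

   the reference p->i is four bytes wide but only guaranteed one-byte
   aligned, so not_size_aligned returns true; for an ordinary int object
   it returns false.  */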
fb85abff 947
948/* Function vector_alignment_reachable_p
949
950 Return true if vector alignment for DR is reachable by peeling
951 a few loop iterations. Return false otherwise. */
952
953static bool
954vector_alignment_reachable_p (struct data_reference *dr)
955{
42acab1c 956 gimple *stmt = DR_STMT (dr);
fb85abff 957 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
958 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
959
ee612634 960 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
fb85abff 961 {
962 /* For interleaved access we peel only if number of iterations in
963 the prolog loop ({VF - misalignment}), is a multiple of the
964 number of the interleaved accesses. */
965 int elem_size, mis_in_elements;
966 int nelements = TYPE_VECTOR_SUBPARTS (vectype);
967
968 /* FORNOW: handle only known alignment. */
969 if (!known_alignment_for_access_p (dr))
970 return false;
971
972 elem_size = GET_MODE_SIZE (TYPE_MODE (vectype)) / nelements;
973 mis_in_elements = DR_MISALIGNMENT (dr) / elem_size;
974
21009880 975 if ((nelements - mis_in_elements) % GROUP_SIZE (stmt_info))
fb85abff 976 return false;
977 }
978
979 /* If misalignment is known at the compile time then allow peeling
980 only if natural alignment is reachable through peeling. */
981 if (known_alignment_for_access_p (dr) && !aligned_access_p (dr))
982 {
48e1416a 983 HOST_WIDE_INT elmsize =
fb85abff 984 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
6d8fb6cf 985 if (dump_enabled_p ())
fb85abff 986 {
78bb46f5 987 dump_printf_loc (MSG_NOTE, vect_location,
988 "data size =" HOST_WIDE_INT_PRINT_DEC, elmsize);
989 dump_printf (MSG_NOTE,
990 ". misalignment = %d.\n", DR_MISALIGNMENT (dr));
fb85abff 991 }
992 if (DR_MISALIGNMENT (dr) % elmsize)
993 {
6d8fb6cf 994 if (dump_enabled_p ())
78bb46f5 995 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
996 "data size does not divide the misalignment.\n");
fb85abff 997 return false;
998 }
999 }
1000
1001 if (!known_alignment_for_access_p (dr))
1002 {
cfa724cf 1003 tree type = TREE_TYPE (DR_REF (dr));
1004 bool is_packed = not_size_aligned (DR_REF (dr));
6d8fb6cf 1005 if (dump_enabled_p ())
78bb46f5 1006 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1007 "Unknown misalignment, is_packed = %d\n",is_packed);
2cd0995e 1008 if ((TYPE_USER_ALIGN (type) && !is_packed)
1009 || targetm.vectorize.vector_alignment_reachable (type, is_packed))
fb85abff 1010 return true;
1011 else
1012 return false;
1013 }
1014
1015 return true;
1016}
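/* A worked example with hypothetical numbers: for a 16-byte vector of
   4-byte elements, a known misalignment of 8 bytes is reachable by peeling
   two scalar iterations, so the function above returns true; a misalignment
   of 6 bytes is not a multiple of the element size, so no number of peeled
   iterations can reach vector alignment and false is returned.  */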
1017
0822b158 1018
1019/* Calculate the cost of the memory access represented by DR. */
1020
f97dec81 1021static void
0822b158 1022vect_get_data_access_cost (struct data_reference *dr,
1023 unsigned int *inside_cost,
f97dec81 1024 unsigned int *outside_cost,
1025 stmt_vector_for_cost *body_cost_vec)
0822b158 1026{
42acab1c 1027 gimple *stmt = DR_STMT (dr);
0822b158 1028 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1029 int nunits = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
1030 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1031 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1032 int ncopies = vf / nunits;
0822b158 1033
1ad41595 1034 if (DR_IS_READ (dr))
f97dec81 1035 vect_get_load_cost (dr, ncopies, true, inside_cost, outside_cost,
1036 NULL, body_cost_vec, false);
0822b158 1037 else
f97dec81 1038 vect_get_store_cost (dr, ncopies, inside_cost, body_cost_vec);
0822b158 1039
6d8fb6cf 1040 if (dump_enabled_p ())
7bd765d4 1041 dump_printf_loc (MSG_NOTE, vect_location,
1042 "vect_get_data_access_cost: inside_cost = %d, "
78bb46f5 1043 "outside_cost = %d.\n", *inside_cost, *outside_cost);
0822b158 1044}
1045
1046
41500e78 1047typedef struct _vect_peel_info
1048{
1049 int npeel;
1050 struct data_reference *dr;
1051 unsigned int count;
1052} *vect_peel_info;
1053
1054typedef struct _vect_peel_extended_info
1055{
1056 struct _vect_peel_info peel_info;
1057 unsigned int inside_cost;
1058 unsigned int outside_cost;
1059 stmt_vector_for_cost body_cost_vec;
1060} *vect_peel_extended_info;
1061
1062
1063/* Peeling hashtable helpers. */
1064
1065struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
1066{
1067 static inline hashval_t hash (const _vect_peel_info *);
1068 static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
1069};
1070
1071inline hashval_t
1072peel_info_hasher::hash (const _vect_peel_info *peel_info)
1073{
1074 return (hashval_t) peel_info->npeel;
1075}
1076
1077inline bool
1078peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
1079{
1080 return (a->npeel == b->npeel);
1081}
1082
1083
0822b158 1084/* Insert DR into peeling hash table with NPEEL as key. */
1085
1086static void
41500e78 1087vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
1088 loop_vec_info loop_vinfo, struct data_reference *dr,
0822b158 1089 int npeel)
1090{
1091 struct _vect_peel_info elem, *slot;
3e871d4d 1092 _vect_peel_info **new_slot;
0822b158 1093 bool supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1094
1095 elem.npeel = npeel;
41500e78 1096 slot = peeling_htab->find (&elem);
0822b158 1097 if (slot)
1098 slot->count++;
1099 else
1100 {
1101 slot = XNEW (struct _vect_peel_info);
1102 slot->npeel = npeel;
1103 slot->dr = dr;
1104 slot->count = 1;
41500e78 1105 new_slot = peeling_htab->find_slot (slot, INSERT);
0822b158 1106 *new_slot = slot;
1107 }
1108
3e398f5b 1109 if (!supportable_dr_alignment
1110 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
0822b158 1111 slot->count += VECT_MAX_COST;
1112}
1113
1114
1115/* Traverse peeling hash table to find peeling option that aligns maximum
1116 number of data accesses. */
1117
3e871d4d 1118int
1119vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1120 _vect_peel_extended_info *max)
0822b158 1121{
3e871d4d 1122 vect_peel_info elem = *slot;
0822b158 1123
593fa4d1 1124 if (elem->count > max->peel_info.count
1125 || (elem->count == max->peel_info.count
1126 && max->peel_info.npeel > elem->npeel))
0822b158 1127 {
1128 max->peel_info.npeel = elem->npeel;
1129 max->peel_info.count = elem->count;
1130 max->peel_info.dr = elem->dr;
1131 }
1132
1133 return 1;
1134}
1135
1136
282bf14c 1137/* Traverse peeling hash table and calculate cost for each peeling option.
1138 Find the one with the lowest cost. */
0822b158 1139
3e871d4d 1140int
1141vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1142 _vect_peel_extended_info *min)
0822b158 1143{
3e871d4d 1144 vect_peel_info elem = *slot;
0822b158 1145 int save_misalignment, dummy;
1146 unsigned int inside_cost = 0, outside_cost = 0, i;
42acab1c 1147 gimple *stmt = DR_STMT (elem->dr);
0822b158 1148 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1149 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
f1f41a6c 1150 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
0822b158 1151 struct data_reference *dr;
f97dec81 1152 stmt_vector_for_cost prologue_cost_vec, body_cost_vec, epilogue_cost_vec;
f97dec81 1153
f1f41a6c 1154 prologue_cost_vec.create (2);
1155 body_cost_vec.create (2);
1156 epilogue_cost_vec.create (2);
0822b158 1157
f1f41a6c 1158 FOR_EACH_VEC_ELT (datarefs, i, dr)
0822b158 1159 {
1160 stmt = DR_STMT (dr);
1161 stmt_info = vinfo_for_stmt (stmt);
1162 /* For interleaving, only the alignment of the first access
1163 matters. */
ee612634 1164 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
21009880 1165 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
0822b158 1166 continue;
1167
1168 save_misalignment = DR_MISALIGNMENT (dr);
1169 vect_update_misalignment_for_peel (dr, elem->dr, elem->npeel);
f97dec81 1170 vect_get_data_access_cost (dr, &inside_cost, &outside_cost,
1171 &body_cost_vec);
0822b158 1172 SET_DR_MISALIGNMENT (dr, save_misalignment);
1173 }
1174
41ae9eb4 1175 outside_cost += vect_get_known_peeling_cost
1176 (loop_vinfo, elem->npeel, &dummy,
2a9a3444 1177 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1178 &prologue_cost_vec, &epilogue_cost_vec);
f97dec81 1179
1180 /* Prologue and epilogue costs are added to the target model later.
1181 These costs depend only on the scalar iteration cost, the
1182 number of peeling iterations finally chosen, and the number of
1183 misaligned statements. So discard the information found here. */
f1f41a6c 1184 prologue_cost_vec.release ();
1185 epilogue_cost_vec.release ();
0822b158 1186
1187 if (inside_cost < min->inside_cost
1188 || (inside_cost == min->inside_cost && outside_cost < min->outside_cost))
1189 {
1190 min->inside_cost = inside_cost;
1191 min->outside_cost = outside_cost;
f1f41a6c 1192 min->body_cost_vec.release ();
f97dec81 1193 min->body_cost_vec = body_cost_vec;
0822b158 1194 min->peel_info.dr = elem->dr;
1195 min->peel_info.npeel = elem->npeel;
1196 }
f97dec81 1197 else
f1f41a6c 1198 body_cost_vec.release ();
0822b158 1199
1200 return 1;
1201}
1202
1203
1204/* Choose best peeling option by traversing peeling hash table and either
1205 choosing an option with the lowest cost (if cost model is enabled) or the
1206 option that aligns as many accesses as possible. */
1207
1208static struct data_reference *
41500e78 1209vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
1210 loop_vec_info loop_vinfo,
4db2b577 1211 unsigned int *npeel,
f97dec81 1212 stmt_vector_for_cost *body_cost_vec)
0822b158 1213{
1214 struct _vect_peel_extended_info res;
1215
1216 res.peel_info.dr = NULL;
9af5ce0c 1217 res.body_cost_vec = stmt_vector_for_cost ();
0822b158 1218
3e398f5b 1219 if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
0822b158 1220 {
1221 res.inside_cost = INT_MAX;
1222 res.outside_cost = INT_MAX;
41500e78 1223 peeling_htab->traverse <_vect_peel_extended_info *,
1224 vect_peeling_hash_get_lowest_cost> (&res);
0822b158 1225 }
1226 else
1227 {
1228 res.peel_info.count = 0;
41500e78 1229 peeling_htab->traverse <_vect_peel_extended_info *,
1230 vect_peeling_hash_get_most_frequent> (&res);
0822b158 1231 }
1232
1233 *npeel = res.peel_info.npeel;
f97dec81 1234 *body_cost_vec = res.body_cost_vec;
0822b158 1235 return res.peel_info.dr;
1236}
1237
1238
fb85abff 1239/* Function vect_enhance_data_refs_alignment
1240
1241 This pass will use loop versioning and loop peeling in order to enhance
1242 the alignment of data references in the loop.
1243
1244 FOR NOW: we assume that whatever versioning/peeling takes place, only the
282bf14c 1245 original loop is to be vectorized. Any other loops that are created by
fb85abff 1246 the transformations performed in this pass are not supposed to be
282bf14c 1247 vectorized. This restriction will be relaxed.
fb85abff 1248
1249 This pass will require a cost model to guide it whether to apply peeling
282bf14c 1250 or versioning or a combination of the two. For example, the scheme that
fb85abff 1251 intel uses when given a loop with several memory accesses, is as follows:
1252 choose one memory access ('p') which alignment you want to force by doing
282bf14c 1253 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
fb85abff 1254 other accesses are not necessarily aligned, or (2) use loop versioning to
1255 generate one loop in which all accesses are aligned, and another loop in
1256 which only 'p' is necessarily aligned.
1257
1258 ("Automatic Intra-Register Vectorization for the Intel Architecture",
1259 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Ximmin Tian, International
1260 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1261
282bf14c 1262 Devising a cost model is the most critical aspect of this work. It will
fb85abff 1263 guide us on which access to peel for, whether to use loop versioning, how
282bf14c 1264 many versions to create, etc. The cost model will probably consist of
fb85abff 1265 generic considerations as well as target specific considerations (on
1266 powerpc for example, misaligned stores are more painful than misaligned
1267 loads).
1268
1269 Here are the general steps involved in alignment enhancements:
1270
1271 -- original loop, before alignment analysis:
1272 for (i=0; i<N; i++){
1273 x = q[i]; # DR_MISALIGNMENT(q) = unknown
1274 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1275 }
1276
1277 -- After vect_compute_data_refs_alignment:
1278 for (i=0; i<N; i++){
1279 x = q[i]; # DR_MISALIGNMENT(q) = 3
1280 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1281 }
1282
1283 -- Possibility 1: we do loop versioning:
1284 if (p is aligned) {
1285 for (i=0; i<N; i++){ # loop 1A
1286 x = q[i]; # DR_MISALIGNMENT(q) = 3
1287 p[i] = y; # DR_MISALIGNMENT(p) = 0
1288 }
1289 }
1290 else {
1291 for (i=0; i<N; i++){ # loop 1B
1292 x = q[i]; # DR_MISALIGNMENT(q) = 3
1293 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1294 }
1295 }
1296
1297 -- Possibility 2: we do loop peeling:
1298 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1299 x = q[i];
1300 p[i] = y;
1301 }
1302 for (i = 3; i < N; i++){ # loop 2A
1303 x = q[i]; # DR_MISALIGNMENT(q) = 0
1304 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1305 }
1306
1307 -- Possibility 3: combination of loop peeling and versioning:
1308 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1309 x = q[i];
1310 p[i] = y;
1311 }
1312 if (p is aligned) {
1313 for (i = 3; i<N; i++){ # loop 3A
1314 x = q[i]; # DR_MISALIGNMENT(q) = 0
1315 p[i] = y; # DR_MISALIGNMENT(p) = 0
1316 }
1317 }
1318 else {
1319 for (i = 3; i<N; i++){ # loop 3B
1320 x = q[i]; # DR_MISALIGNMENT(q) = 0
1321 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1322 }
1323 }
1324
282bf14c 1325 These loops are later passed to loop_transform to be vectorized. The
fb85abff 1326 vectorizer will use the alignment information to guide the transformation
1327 (whether to generate regular loads/stores, or with special handling for
1328 misalignment). */
1329
1330bool
1331vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1332{
f1f41a6c 1333 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
fb85abff 1334 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1335 enum dr_alignment_support supportable_dr_alignment;
0822b158 1336 struct data_reference *dr0 = NULL, *first_store = NULL;
fb85abff 1337 struct data_reference *dr;
0822b158 1338 unsigned int i, j;
fb85abff 1339 bool do_peeling = false;
1340 bool do_versioning = false;
1341 bool stat;
42acab1c 1342 gimple *stmt;
fb85abff 1343 stmt_vec_info stmt_info;
0822b158 1344 unsigned int npeel = 0;
1345 bool all_misalignments_unknown = true;
1346 unsigned int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1347 unsigned possible_npeel_number = 1;
1348 tree vectype;
1349 unsigned int nelements, mis, same_align_drs_max = 0;
9af5ce0c 1350 stmt_vector_for_cost body_cost_vec = stmt_vector_for_cost ();
41500e78 1351 hash_table<peel_info_hasher> peeling_htab (1);
fb85abff 1352
6d8fb6cf 1353 if (dump_enabled_p ())
7bd765d4 1354 dump_printf_loc (MSG_NOTE, vect_location,
78bb46f5 1355 "=== vect_enhance_data_refs_alignment ===\n");
fb85abff 1356
00ecf4da 1357 /* Reset data so we can safely be called multiple times. */
1358 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1359 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
1360
fb85abff 1361 /* While cost model enhancements are expected in the future, the high level
1362 view of the code at this time is as follows:
1363
ec2886ed 1364 A) If there is a misaligned access then see if peeling to align
1365 this access can make all data references satisfy
454f25be 1366 vect_supportable_dr_alignment. If so, update data structures
1367 as needed and return true.
fb85abff 1368
1369 B) If peeling wasn't possible and there is a data reference with an
1370 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1371 then see if loop versioning checks can be used to make all data
1372 references satisfy vect_supportable_dr_alignment. If so, update
1373 data structures as needed and return true.
1374
1375 C) If neither peeling nor versioning were successful then return false if
1376 any data reference does not satisfy vect_supportable_dr_alignment.
1377
1378 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1379
1380 Note, Possibility 3 above (which is peeling and versioning together) is not
1381 being done at this time. */
1382
1383 /* (1) Peeling to force alignment. */
1384
1385 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1386 Considerations:
1387 + How many accesses will become aligned due to the peeling
1388 - How many accesses will become unaligned due to the peeling,
1389 and the cost of misaligned accesses.
48e1416a 1390 - The cost of peeling (the extra runtime checks, the increase
0822b158 1391 in code size). */
fb85abff 1392
f1f41a6c 1393 FOR_EACH_VEC_ELT (datarefs, i, dr)
fb85abff 1394 {
1395 stmt = DR_STMT (dr);
1396 stmt_info = vinfo_for_stmt (stmt);
1397
1ad41595 1398 if (!STMT_VINFO_RELEVANT_P (stmt_info))
b04940e7 1399 continue;
1400
fb85abff 1401 /* For interleaving, only the alignment of the first access
1402 matters. */
ee612634 1403 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
21009880 1404 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
fb85abff 1405 continue;
1406
b04940e7 1407 /* For invariant accesses there is nothing to enhance. */
1408 if (integer_zerop (DR_STEP (dr)))
1409 continue;
1410
e1c75243 1411 /* Strided accesses perform only component accesses, alignment is
f634c3e9 1412 irrelevant for them. */
e1c75243 1413 if (STMT_VINFO_STRIDED_P (stmt_info)
994be998 1414 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
f634c3e9 1415 continue;
1416
0822b158 1417 supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1418 do_peeling = vector_alignment_reachable_p (dr);
1419 if (do_peeling)
fb85abff 1420 {
0822b158 1421 if (known_alignment_for_access_p (dr))
1422 {
1423 unsigned int npeel_tmp;
f1b8c740 1424 bool negative = tree_int_cst_compare (DR_STEP (dr),
1425 size_zero_node) < 0;
0822b158 1426
1427 /* Save info about DR in the hash table. */
0822b158 1428 vectype = STMT_VINFO_VECTYPE (stmt_info);
1429 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1430 mis = DR_MISALIGNMENT (dr) / GET_MODE_SIZE (TYPE_MODE (
1431 TREE_TYPE (DR_REF (dr))));
f1b8c740 1432 npeel_tmp = (negative
184a9a0f 1433 ? (mis - nelements) : (nelements - mis))
1434 & (nelements - 1);
0822b158 1435
1436 /* For multiple types, it is possible that the bigger type access
282bf14c 1437 will have more than one peeling option. E.g., a loop with two
0822b158 1438 types: one of size (vector size / 4), and the other one of
282bf14c 1439 size (vector size / 8). The vectorization factor will be 8. If both
0822b158 1440 accesses are misaligned by 3, the first one needs one scalar
282bf14c 1441 iteration to be aligned, and the second one needs 5. But
0822b158 1442 the first one will also be aligned by peeling 5 scalar
 1443 iterations, and in that case both accesses will be aligned.
 1444 Hence, besides the immediate peeling amount, we also want
 1445 to try adding a full vector size, as long as we don't exceed the
 1446 vectorization factor.
 1447 We do this automatically for the cost model, since we calculate the cost
1448 for every peeling option. */
3e398f5b 1449 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
c1bee668 1450 {
1451 if (STMT_SLP_TYPE (stmt_info))
1452 possible_npeel_number
1453 = (vf * GROUP_SIZE (stmt_info)) / nelements;
1454 else
1455 possible_npeel_number = vf / nelements;
1456 }
0822b158 1457
1458 /* Handle the aligned case. We may decide to align some other
1459 access, making DR unaligned. */
1460 if (DR_MISALIGNMENT (dr) == 0)
1461 {
1462 npeel_tmp = 0;
3e398f5b 1463 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
0822b158 1464 possible_npeel_number++;
1465 }
1466
1467 for (j = 0; j < possible_npeel_number; j++)
1468 {
41500e78 1469 vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
1470 dr, npeel_tmp);
0822b158 1471 npeel_tmp += nelements;
1472 }
1473
1474 all_misalignments_unknown = false;
1475 /* Data-ref that was chosen for the case that all the
1476 misalignments are unknown is not relevant anymore, since we
1477 have a data-ref with known alignment. */
1478 dr0 = NULL;
1479 }
1480 else
1481 {
6046367e 1482 /* If we don't know any misalignment values, we prefer
1483 peeling for the data-ref that has the maximum number of data-refs
0822b158 1484 with the same alignment, unless the target prefers to align
 1485 stores over loads. */
1486 if (all_misalignments_unknown)
1487 {
6046367e 1488 unsigned same_align_drs
1489 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1490 if (!dr0
1491 || same_align_drs_max < same_align_drs)
0822b158 1492 {
6046367e 1493 same_align_drs_max = same_align_drs;
0822b158 1494 dr0 = dr;
1495 }
6046367e 1496 /* For data-refs with the same number of related
1497 accesses prefer the one where the misalign
1498 computation will be invariant in the outermost loop. */
1499 else if (same_align_drs_max == same_align_drs)
1500 {
1501 struct loop *ivloop0, *ivloop;
1502 ivloop0 = outermost_invariant_loop_for_expr
1503 (loop, DR_BASE_ADDRESS (dr0));
1504 ivloop = outermost_invariant_loop_for_expr
1505 (loop, DR_BASE_ADDRESS (dr));
1506 if ((ivloop && !ivloop0)
1507 || (ivloop && ivloop0
1508 && flow_loop_nested_p (ivloop, ivloop0)))
1509 dr0 = dr;
1510 }
0822b158 1511
9ff25603 1512 if (!first_store && DR_IS_WRITE (dr))
0822b158 1513 first_store = dr;
1514 }
1515
1516 /* If there are both known and unknown misaligned accesses in the
1517 loop, we choose peeling amount according to the known
1518 accesses. */
0822b158 1519 if (!supportable_dr_alignment)
1520 {
1521 dr0 = dr;
9ff25603 1522 if (!first_store && DR_IS_WRITE (dr))
0822b158 1523 first_store = dr;
1524 }
1525 }
1526 }
1527 else
1528 {
1529 if (!aligned_access_p (dr))
1530 {
6d8fb6cf 1531 if (dump_enabled_p ())
78bb46f5 1532 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1533 "vector alignment may not be reachable\n");
0822b158 1534 break;
1535 }
1536 }
fb85abff 1537 }
1538
2cd0995e 1539 /* Check if we can possibly peel the loop. */
1540 if (!vect_can_advance_ivs_p (loop_vinfo)
5ee742c4 1541 || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))
1542 || loop->inner)
fb85abff 1543 do_peeling = false;
1544
192f7876 1545 if (do_peeling
1546 && all_misalignments_unknown
0822b158 1547 && vect_supportable_dr_alignment (dr0, false))
1548 {
0822b158 1549 /* Check if the target prefers to align stores over loads, i.e., if
1550 misaligned stores are more expensive than misaligned loads (taking
1551 drs with the same alignment into account). */
1552 if (first_store && DR_IS_READ (dr0))
1553 {
1554 unsigned int load_inside_cost = 0, load_outside_cost = 0;
1555 unsigned int store_inside_cost = 0, store_outside_cost = 0;
1556 unsigned int load_inside_penalty = 0, load_outside_penalty = 0;
1557 unsigned int store_inside_penalty = 0, store_outside_penalty = 0;
f1f41a6c 1558 stmt_vector_for_cost dummy;
1559 dummy.create (2);
f97dec81 1560
1561 vect_get_data_access_cost (dr0, &load_inside_cost, &load_outside_cost,
1562 &dummy);
1563 vect_get_data_access_cost (first_store, &store_inside_cost,
1564 &store_outside_cost, &dummy);
0822b158 1565
f1f41a6c 1566 dummy.release ();
0822b158 1567
1568 /* Calculate the penalty for leaving FIRST_STORE unaligned (by
1569 aligning the load DR0). */
1570 load_inside_penalty = store_inside_cost;
1571 load_outside_penalty = store_outside_cost;
f1f41a6c 1572 for (i = 0;
1573 STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1574 DR_STMT (first_store))).iterate (i, &dr);
0822b158 1575 i++)
1576 if (DR_IS_READ (dr))
1577 {
1578 load_inside_penalty += load_inside_cost;
1579 load_outside_penalty += load_outside_cost;
1580 }
1581 else
1582 {
1583 load_inside_penalty += store_inside_cost;
1584 load_outside_penalty += store_outside_cost;
1585 }
1586
1587 /* Calculate the penalty for leaving DR0 unaligned (by
1588 aligning the FIRST_STORE). */
1589 store_inside_penalty = load_inside_cost;
1590 store_outside_penalty = load_outside_cost;
f1f41a6c 1591 for (i = 0;
1592 STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1593 DR_STMT (dr0))).iterate (i, &dr);
0822b158 1594 i++)
1595 if (DR_IS_READ (dr))
1596 {
1597 store_inside_penalty += load_inside_cost;
1598 store_outside_penalty += load_outside_cost;
1599 }
1600 else
1601 {
1602 store_inside_penalty += store_inside_cost;
1603 store_outside_penalty += store_outside_cost;
1604 }
1605
1606 if (load_inside_penalty > store_inside_penalty
1607 || (load_inside_penalty == store_inside_penalty
1608 && load_outside_penalty > store_outside_penalty))
1609 dr0 = first_store;
1610 }
1611
1612 /* In case there are only loads with different unknown misalignments, use
eb10b471 1613 peeling only if it may help to align other accesses in the loop or
1614 if it may help improve load bandwidth when we'd end up using
1615 unaligned loads. */
1616 tree dr0_vt = STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr0)));
f1f41a6c 1617 if (!first_store
1618 && !STMT_VINFO_SAME_ALIGN_REFS (
1619 vinfo_for_stmt (DR_STMT (dr0))).length ()
eb10b471 1620 && (vect_supportable_dr_alignment (dr0, false)
1621 != dr_unaligned_supported
1622 || (builtin_vectorization_cost (vector_load, dr0_vt, 0)
1623 == builtin_vectorization_cost (unaligned_load, dr0_vt, -1))))
0822b158 1624 do_peeling = false;
1625 }
1626
1627 if (do_peeling && !dr0)
1628 {
1629 /* Peeling is possible, but there is no data access that requires alignment
1630 in order to be supported. So we try to choose the best possible peeling. */
1631
1632 /* We should get here only if there are drs with known misalignment. */
1633 gcc_assert (!all_misalignments_unknown);
1634
1635 /* Choose the best peeling from the hash table. */
41500e78 1636 dr0 = vect_peeling_hash_choose_best_peeling (&peeling_htab,
1637 loop_vinfo, &npeel,
f97dec81 1638 &body_cost_vec);
0822b158 1639 if (!dr0 || !npeel)
1640 do_peeling = false;
1641 }
1642
fb85abff 1643 if (do_peeling)
1644 {
0822b158 1645 stmt = DR_STMT (dr0);
1646 stmt_info = vinfo_for_stmt (stmt);
1647 vectype = STMT_VINFO_VECTYPE (stmt_info);
1648 nelements = TYPE_VECTOR_SUBPARTS (vectype);
fb85abff 1649
1650 if (known_alignment_for_access_p (dr0))
1651 {
f1b8c740 1652 bool negative = tree_int_cst_compare (DR_STEP (dr0),
1653 size_zero_node) < 0;
0822b158 1654 if (!npeel)
1655 {
1656 /* Since it's known at compile time, compute the number of
1657 iterations in the peeled loop (the peeling factor) for use in
1658 updating DR_MISALIGNMENT values. The peeling factor is the
1659 vectorization factor minus the misalignment as an element
1660 count. */
1661 mis = DR_MISALIGNMENT (dr0);
1662 mis /= GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr0))));
184a9a0f 1663 npeel = ((negative ? mis - nelements : nelements - mis)
1664 & (nelements - 1));
0822b158 1665 }
fb85abff 1666
48e1416a 1667 /* For interleaved data access every iteration accesses all the
fb85abff 1668 members of the group, therefore we divide the number of iterations
1669 by the group size. */
48e1416a 1670 stmt_info = vinfo_for_stmt (DR_STMT (dr0));
ee612634 1671 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
21009880 1672 npeel /= GROUP_SIZE (stmt_info);
fb85abff 1673
6d8fb6cf 1674 if (dump_enabled_p ())
7bd765d4 1675 dump_printf_loc (MSG_NOTE, vect_location,
78bb46f5 1676 "Try peeling by %d\n", npeel);
fb85abff 1677 }
1678
1679 /* Ensure that all data refs can be vectorized after the peel. */
f1f41a6c 1680 FOR_EACH_VEC_ELT (datarefs, i, dr)
fb85abff 1681 {
1682 int save_misalignment;
1683
1684 if (dr == dr0)
1685 continue;
1686
1687 stmt = DR_STMT (dr);
1688 stmt_info = vinfo_for_stmt (stmt);
1689 /* For interleaving, only the alignment of the first access
1690 matters. */
ee612634 1691 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
21009880 1692 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
fb85abff 1693 continue;
1694
e1c75243 1695 /* Strided accesses perform only component accesses, alignment is
f634c3e9 1696 irrelevant for them. */
e1c75243 1697 if (STMT_VINFO_STRIDED_P (stmt_info)
994be998 1698 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
f634c3e9 1699 continue;
1700
fb85abff 1701 save_misalignment = DR_MISALIGNMENT (dr);
1702 vect_update_misalignment_for_peel (dr, dr0, npeel);
0822b158 1703 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
fb85abff 1704 SET_DR_MISALIGNMENT (dr, save_misalignment);
48e1416a 1705
fb85abff 1706 if (!supportable_dr_alignment)
1707 {
1708 do_peeling = false;
1709 break;
1710 }
1711 }
1712
0822b158 1713 if (do_peeling && known_alignment_for_access_p (dr0) && npeel == 0)
1714 {
e2c5c678 1715 stat = vect_verify_datarefs_alignment (loop_vinfo);
0822b158 1716 if (!stat)
1717 do_peeling = false;
1718 else
9f793cdf 1719 {
f1f41a6c 1720 body_cost_vec.release ();
9f793cdf 1721 return stat;
1722 }
0822b158 1723 }
1724
eb10b471 1725 /* Cost model #1 - honor --param vect-max-peeling-for-alignment. */
d7d7032a 1726 if (do_peeling)
1727 {
1728 unsigned max_allowed_peel
1729 = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT);
1730 if (max_allowed_peel != (unsigned)-1)
1731 {
1732 unsigned max_peel = npeel;
1733 if (max_peel == 0)
1734 {
42acab1c 1735 gimple *dr_stmt = DR_STMT (dr0);
d7d7032a 1736 stmt_vec_info vinfo = vinfo_for_stmt (dr_stmt);
1737 tree vtype = STMT_VINFO_VECTYPE (vinfo);
1738 max_peel = TYPE_VECTOR_SUBPARTS (vtype) - 1;
1739 }
1740 if (max_peel > max_allowed_peel)
1741 {
1742 do_peeling = false;
1743 if (dump_enabled_p ())
1744 dump_printf_loc (MSG_NOTE, vect_location,
1745 "Disable peeling, max peels reached: %d\n", max_peel);
1746 }
1747 }
1748 }
1749
eb10b471 1750 /* Cost model #2 - if peeling may result in a remaining loop not
1751 iterating enough to be vectorized then do not peel. */
1752 if (do_peeling
1753 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1754 {
1755 unsigned max_peel
1756 = npeel == 0 ? LOOP_VINFO_VECT_FACTOR (loop_vinfo) - 1 : npeel;
1757 if (LOOP_VINFO_INT_NITERS (loop_vinfo)
1758 < LOOP_VINFO_VECT_FACTOR (loop_vinfo) + max_peel)
1759 do_peeling = false;
1760 }
1761
fb85abff 1762 if (do_peeling)
1763 {
1764 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
1765 If the misalignment of DR_i is identical to that of dr0 then set
1766 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
1767 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
1768 by the peeling factor times the element size of DR_i (MOD the
1769 vectorization factor times the size). Otherwise, the
1770 misalignment of DR_i must be set to unknown. */
f1f41a6c 1771 FOR_EACH_VEC_ELT (datarefs, i, dr)
fb85abff 1772 if (dr != dr0)
1773 vect_update_misalignment_for_peel (dr, dr0, npeel);
1774
1775 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0;
0822b158 1776 if (npeel)
313a5120 1777 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
0822b158 1778 else
313a5120 1779 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1780 = DR_MISALIGNMENT (dr0);
fb85abff 1781 SET_DR_MISALIGNMENT (dr0, 0);
6d8fb6cf 1782 if (dump_enabled_p ())
7bd765d4 1783 {
1784 dump_printf_loc (MSG_NOTE, vect_location,
78bb46f5 1785 "Alignment of access forced using peeling.\n");
7bd765d4 1786 dump_printf_loc (MSG_NOTE, vect_location,
78bb46f5 1787 "Peeling for alignment will be applied.\n");
7bd765d4 1788 }
e4eca2de 1789 /* The inside-loop cost will be accounted for in vectorizable_load
1790 and vectorizable_store correctly with adjusted alignments.
1791 Drop the body_cost_vec on the floor here. */
1792 body_cost_vec.release ();
4db2b577 1793
e2c5c678 1794 stat = vect_verify_datarefs_alignment (loop_vinfo);
fb85abff 1795 gcc_assert (stat);
1796 return stat;
1797 }
1798 }
1799
f1f41a6c 1800 body_cost_vec.release ();
fb85abff 1801
1802 /* (2) Versioning to force alignment. */
1803
1804 /* Try versioning if:
1dbf9bd1 1805 1) optimize loop for speed
1806 2) there is at least one unsupported misaligned data ref with an unknown
fb85abff 1807 misalignment, and
1dbf9bd1 1808 3) all misaligned data refs with a known misalignment are supported, and
1809 4) the number of runtime alignment checks is within reason. */
fb85abff 1810
48e1416a 1811 do_versioning =
1dbf9bd1 1812 optimize_loop_nest_for_speed_p (loop)
fb85abff 1813 && (!loop->inner); /* FORNOW */
1814
1815 if (do_versioning)
1816 {
f1f41a6c 1817 FOR_EACH_VEC_ELT (datarefs, i, dr)
fb85abff 1818 {
1819 stmt = DR_STMT (dr);
1820 stmt_info = vinfo_for_stmt (stmt);
1821
1822 /* For interleaving, only the alignment of the first access
1823 matters. */
1824 if (aligned_access_p (dr)
ee612634 1825 || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
21009880 1826 && GROUP_FIRST_ELEMENT (stmt_info) != stmt))
fb85abff 1827 continue;
1828
e1c75243 1829 if (STMT_VINFO_STRIDED_P (stmt_info))
994be998 1830 {
1831 /* Strided loads perform only component accesses, alignment is
1832 irrelevant for them. */
1833 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
1834 continue;
1835 do_versioning = false;
1836 break;
1837 }
f634c3e9 1838
0822b158 1839 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
fb85abff 1840
1841 if (!supportable_dr_alignment)
1842 {
42acab1c 1843 gimple *stmt;
fb85abff 1844 int mask;
1845 tree vectype;
1846
1847 if (known_alignment_for_access_p (dr)
f1f41a6c 1848 || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
fb85abff 1849 >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
1850 {
1851 do_versioning = false;
1852 break;
1853 }
1854
1855 stmt = DR_STMT (dr);
1856 vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
1857 gcc_assert (vectype);
48e1416a 1858
fb85abff 1859 /* The rightmost bits of an aligned address must be zeros.
1860 Construct the mask needed for this test. For example,
1861 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
1862 mask must be 15 = 0xf. */
1863 mask = GET_MODE_SIZE (TYPE_MODE (vectype)) - 1;
1864
1865 /* FORNOW: use the same mask to test all potentially unaligned
1866 references in the loop. The vectorizer currently supports
1867 a single vector size, see the reference to
1868 GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
1869 vectorization factor is computed. */
1870 gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
1871 || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
1872 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
f1f41a6c 1873 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (
1874 DR_STMT (dr));
fb85abff 1875 }
1876 }
48e1416a 1877
fb85abff 1878 /* Versioning requires at least one misaligned data reference. */
10095225 1879 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
fb85abff 1880 do_versioning = false;
1881 else if (!do_versioning)
f1f41a6c 1882 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
fb85abff 1883 }
1884
1885 if (do_versioning)
1886 {
42acab1c 1887 vec<gimple *> may_misalign_stmts
fb85abff 1888 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
42acab1c 1889 gimple *stmt;
fb85abff 1890
1891 /* It can now be assumed that the data references in the statements
1892 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
1893 of the loop being vectorized. */
f1f41a6c 1894 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt)
fb85abff 1895 {
1896 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1897 dr = STMT_VINFO_DATA_REF (stmt_info);
1898 SET_DR_MISALIGNMENT (dr, 0);
6d8fb6cf 1899 if (dump_enabled_p ())
78bb46f5 1900 dump_printf_loc (MSG_NOTE, vect_location,
1901 "Alignment of access forced using versioning.\n");
fb85abff 1902 }
1903
6d8fb6cf 1904 if (dump_enabled_p ())
78bb46f5 1905 dump_printf_loc (MSG_NOTE, vect_location,
1906 "Versioning for alignment will be applied.\n");
fb85abff 1907
1908 /* Peeling and versioning can't be done together at this time. */
1909 gcc_assert (! (do_peeling && do_versioning));
1910
e2c5c678 1911 stat = vect_verify_datarefs_alignment (loop_vinfo);
fb85abff 1912 gcc_assert (stat);
1913 return stat;
1914 }
1915
1916 /* This point is reached if neither peeling nor versioning is being done. */
1917 gcc_assert (! (do_peeling || do_versioning));
1918
e2c5c678 1919 stat = vect_verify_datarefs_alignment (loop_vinfo);
fb85abff 1920 return stat;
1921}
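
/* Illustrative, self-contained sketch (hypothetical names, not part of the
   vectorizer): the arithmetic used above to turn a known misalignment into
   a peeling factor.  MISALIGN_BYTES is the byte misalignment of the chosen
   data-ref, ELEM_SIZE its scalar element size, NELEMENTS the number of
   elements per vector (a power of two), and NEGATIVE_STEP non-zero for a
   decreasing access.  For a grouped access the result would additionally
   be divided by the group size, as done above.  */

static unsigned
example_peeling_factor (unsigned misalign_bytes, unsigned elem_size,
                        unsigned nelements, int negative_step)
{
  /* Misalignment counted in vector elements.  */
  unsigned mis = misalign_bytes / elem_size;

  /* Number of scalar iterations to peel so that the access becomes
     vector-aligned; the mask is valid because NELEMENTS is a power of
     two.  */
  return (negative_step ? mis - nelements : nelements - mis)
         & (nelements - 1);
}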
1922
1923
91a74fc6 1924/* Function vect_find_same_alignment_drs.
1925
1926 Update group and alignment relations according to the chosen
1927 vectorization factor. */
1928
1929static void
1930vect_find_same_alignment_drs (struct data_dependence_relation *ddr,
1931 loop_vec_info loop_vinfo)
1932{
1933 unsigned int i;
1934 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1935 int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1936 struct data_reference *dra = DDR_A (ddr);
1937 struct data_reference *drb = DDR_B (ddr);
1938 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
1939 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
1940 int dra_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dra))));
1941 int drb_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (drb))));
1942 lambda_vector dist_v;
1943 unsigned int loop_depth;
1944
1945 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
1946 return;
1947
0822b158 1948 if (dra == drb)
91a74fc6 1949 return;
1950
1951 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
1952 return;
1953
1954 /* Loop-based vectorization and known data dependence. */
1955 if (DDR_NUM_DIST_VECTS (ddr) == 0)
1956 return;
1957
85a846a2 1958 /* Data-dependence analysis reports a distance vector of zero
1959 for data-references that overlap only in the first iteration
1960 but have steps of different sign (see PR45764).
1961 So as a sanity check require equal DR_STEP. */
1962 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
1963 return;
1964
91a74fc6 1965 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
f1f41a6c 1966 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
91a74fc6 1967 {
1968 int dist = dist_v[loop_depth];
1969
6d8fb6cf 1970 if (dump_enabled_p ())
7bd765d4 1971 dump_printf_loc (MSG_NOTE, vect_location,
78bb46f5 1972 "dependence distance = %d.\n", dist);
91a74fc6 1973
1974 /* Same loop iteration. */
1975 if (dist == 0
1976 || (dist % vectorization_factor == 0 && dra_size == drb_size))
1977 {
1978 /* Two references with distance zero have the same alignment. */
f1f41a6c 1979 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
1980 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
6d8fb6cf 1981 if (dump_enabled_p ())
91a74fc6 1982 {
78bb46f5 1983 dump_printf_loc (MSG_NOTE, vect_location,
1984 "accesses have the same alignment.\n");
7bd765d4 1985 dump_printf (MSG_NOTE,
78bb46f5 1986 "dependence distance modulo vf == 0 between ");
7bd765d4 1987 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
1988 dump_printf (MSG_NOTE, " and ");
1989 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
78bb46f5 1990 dump_printf (MSG_NOTE, "\n");
91a74fc6 1991 }
1992 }
1993 }
1994}
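
/* Illustrative, self-contained sketch (hypothetical helper, not GCC API) of
   the reasoning above: if two references with equal element size are DIST
   iterations apart and DIST is a multiple of the vectorization factor VF,
   their byte distance is a multiple of the vector size VF * ELEM_SIZE, so
   they share the same misalignment with respect to the vector size.  */

static int
example_same_vector_alignment_p (int dist, int vf,
                                 unsigned elem_size_a, unsigned elem_size_b)
{
  /* Mirrors the test used above: distance zero, or a distance that is a
     multiple of VF between references of equal element size.  */
  return dist == 0 || (dist % vf == 0 && elem_size_a == elem_size_b);
}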
1995
1996
fb85abff 1997/* Function vect_analyze_data_refs_alignment
1998
1999 Analyze the alignment of the data-references in the loop.
2000 Return FALSE if a data reference is found that cannot be vectorized. */
2001
2002bool
e2c5c678 2003vect_analyze_data_refs_alignment (vec_info *vinfo)
fb85abff 2004{
6d8fb6cf 2005 if (dump_enabled_p ())
7bd765d4 2006 dump_printf_loc (MSG_NOTE, vect_location,
78bb46f5 2007 "=== vect_analyze_data_refs_alignment ===\n");
fb85abff 2008
91a74fc6 2009 /* Mark groups of data references with same alignment using
2010 data dependence information. */
e2c5c678 2011 if (is_a <loop_vec_info> (vinfo))
91a74fc6 2012 {
e2c5c678 2013 vec<ddr_p> ddrs = vinfo->ddrs;
91a74fc6 2014 struct data_dependence_relation *ddr;
2015 unsigned int i;
2016
f1f41a6c 2017 FOR_EACH_VEC_ELT (ddrs, i, ddr)
e2c5c678 2018 vect_find_same_alignment_drs (ddr, as_a <loop_vec_info> (vinfo));
91a74fc6 2019 }
2020
e2c5c678 2021 if (!vect_compute_data_refs_alignment (vinfo))
fb85abff 2022 {
6d8fb6cf 2023 if (dump_enabled_p ())
78bb46f5 2024 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2025 "not vectorized: can't calculate alignment "
2026 "for data ref.\n");
fb85abff 2027 return false;
2028 }
2029
2030 return true;
2031}
2032
2033
ee612634 2034/* Analyze groups of accesses: check that DR belongs to a group of
2035 accesses of legal size, step, etc. Detect gaps, single element
2036 interleaving, and other special cases. Set grouped access info.
39e23eaa 2037 Collect groups of strided stores for further use in SLP analysis.
2038 Worker for vect_analyze_group_access. */
fb85abff 2039
2040static bool
39e23eaa 2041vect_analyze_group_access_1 (struct data_reference *dr)
fb85abff 2042{
2043 tree step = DR_STEP (dr);
2044 tree scalar_type = TREE_TYPE (DR_REF (dr));
f9ae6f95 2045 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
42acab1c 2046 gimple *stmt = DR_STMT (dr);
fb85abff 2047 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2048 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
37545e54 2049 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
994be998 2050 HOST_WIDE_INT dr_step = -1;
ee612634 2051 HOST_WIDE_INT groupsize, last_accessed_element = 1;
fb85abff 2052 bool slp_impossible = false;
d88ecd95 2053 struct loop *loop = NULL;
2054
2055 if (loop_vinfo)
2056 loop = LOOP_VINFO_LOOP (loop_vinfo);
fb85abff 2057
ee612634 2058 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2059 size of the interleaving group (including gaps). */
994be998 2060 if (tree_fits_shwi_p (step))
2061 {
2062 dr_step = tree_to_shwi (step);
2063 groupsize = absu_hwi (dr_step) / type_size;
2064 }
2065 else
2066 groupsize = 0;
fb85abff 2067
2068 /* A non-consecutive access is possible only if it is part of an interleaving group. */
21009880 2069 if (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
fb85abff 2070 {
2071 /* Check if this DR is a part of interleaving, and is a single
2072 element of the group that is accessed in the loop. */
48e1416a 2073
fb85abff 2074 /* Gaps are supported only for loads. STEP must be a multiple of the type
2075 size. The size of the group must be a power of 2. */
2076 if (DR_IS_READ (dr)
2077 && (dr_step % type_size) == 0
ee612634 2078 && groupsize > 0
2079 && exact_log2 (groupsize) != -1)
fb85abff 2080 {
21009880 2081 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = stmt;
ee612634 2082 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
6d8fb6cf 2083 if (dump_enabled_p ())
fb85abff 2084 {
78bb46f5 2085 dump_printf_loc (MSG_NOTE, vect_location,
2086 "Detected single element interleaving ");
7bd765d4 2087 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2088 dump_printf (MSG_NOTE, " step ");
2089 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
78bb46f5 2090 dump_printf (MSG_NOTE, "\n");
fb85abff 2091 }
a4ee7fac 2092
2093 if (loop_vinfo)
2094 {
6d8fb6cf 2095 if (dump_enabled_p ())
7bd765d4 2096 dump_printf_loc (MSG_NOTE, vect_location,
78bb46f5 2097 "Data access with gaps requires scalar "
2098 "epilogue loop\n");
d88ecd95 2099 if (loop->inner)
2100 {
6d8fb6cf 2101 if (dump_enabled_p ())
7bd765d4 2102 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2103 "Peeling for outer loop is not"
78bb46f5 2104 " supported\n");
d88ecd95 2105 return false;
2106 }
2107
2108 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = true;
a4ee7fac 2109 }
2110
fb85abff 2111 return true;
2112 }
6ea6a380 2113
6d8fb6cf 2114 if (dump_enabled_p ())
6ea6a380 2115 {
7bd765d4 2116 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
78bb46f5 2117 "not consecutive access ");
2118 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
6ea6a380 2119 }
2120
2121 if (bb_vinfo)
2122 {
2123 /* Mark the statement as unvectorizable. */
2124 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2125 return true;
2126 }
7bd765d4 2127
71de77d8 2128 dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
2129 STMT_VINFO_STRIDED_P (stmt_info) = true;
2130 return true;
fb85abff 2131 }
2132
21009880 2133 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt)
fb85abff 2134 {
2135 /* First stmt in the interleaving chain. Check the chain. */
42acab1c 2136 gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
fb85abff 2137 struct data_reference *data_ref = dr;
1a0e7d51 2138 unsigned int count = 1;
fb85abff 2139 tree prev_init = DR_INIT (data_ref);
42acab1c 2140 gimple *prev = stmt;
8bbe6b75 2141 HOST_WIDE_INT diff, gaps = 0;
fb85abff 2142
2143 while (next)
2144 {
282bf14c 2145 /* Skip same data-refs. If two or more stmts share a
2146 data-ref (supported only for loads), we vectorize only the first
2147 stmt, and the rest get their vectorized loads from the first
2148 one. */
fb85abff 2149 if (!tree_int_cst_compare (DR_INIT (data_ref),
2150 DR_INIT (STMT_VINFO_DATA_REF (
2151 vinfo_for_stmt (next)))))
2152 {
9ff25603 2153 if (DR_IS_WRITE (data_ref))
fb85abff 2154 {
6d8fb6cf 2155 if (dump_enabled_p ())
78bb46f5 2156 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2157 "Two store stmts share the same dr.\n");
fb85abff 2158 return false;
2159 }
2160
00ecf4da 2161 if (dump_enabled_p ())
2162 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2163 "Two or more load stmts share the same dr.\n");
2164
fb85abff 2165 /* For load use the same data-ref load. */
21009880 2166 GROUP_SAME_DR_STMT (vinfo_for_stmt (next)) = prev;
fb85abff 2167
2168 prev = next;
21009880 2169 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
fb85abff 2170 continue;
2171 }
a4ee7fac 2172
fb85abff 2173 prev = next;
8bbe6b75 2174 data_ref = STMT_VINFO_DATA_REF (vinfo_for_stmt (next));
fb85abff 2175
8bbe6b75 2176 /* All group members have the same STEP by construction. */
2177 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
fb85abff 2178
fb85abff 2179 /* Check that the distance between two accesses is equal to the type
2180 size. Otherwise, we have gaps. */
f9ae6f95 2181 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2182 - TREE_INT_CST_LOW (prev_init)) / type_size;
fb85abff 2183 if (diff != 1)
2184 {
2185 /* FORNOW: SLP of accesses with gaps is not supported. */
2186 slp_impossible = true;
9ff25603 2187 if (DR_IS_WRITE (data_ref))
fb85abff 2188 {
6d8fb6cf 2189 if (dump_enabled_p ())
78bb46f5 2190 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2191 "interleaved store with gaps\n");
fb85abff 2192 return false;
2193 }
b11576bf 2194
2195 gaps += diff - 1;
fb85abff 2196 }
2197
a4ee7fac 2198 last_accessed_element += diff;
2199
fb85abff 2200 /* Store the gap from the previous member of the group. If there is no
21009880 2201 gap in the access, GROUP_GAP is always 1. */
2202 GROUP_GAP (vinfo_for_stmt (next)) = diff;
fb85abff 2203
2204 prev_init = DR_INIT (data_ref);
21009880 2205 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
fb85abff 2206 /* Count the number of data-refs in the chain. */
2207 count++;
2208 }
2209
994be998 2210 if (groupsize == 0)
2211 groupsize = count + gaps;
fb85abff 2212
39e23eaa 2213 if (groupsize > UINT_MAX)
2214 {
2215 if (dump_enabled_p ())
2216 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2217 "group is too large\n");
2218 return false;
2219 }
2220
994be998 2221 /* Check that the size of the interleaving group is equal to count for
fb85abff 2222 stores, i.e., that there are no gaps. */
904bd865 2223 if (groupsize != count
2224 && !DR_IS_READ (dr))
fb85abff 2225 {
904bd865 2226 if (dump_enabled_p ())
2227 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2228 "interleaved store with gaps\n");
2229 return false;
2230 }
2231
2232 /* If there is a gap after the last load in the group it is the
2233 difference between the groupsize and the last accessed
2234 element.
2235 When there is no gap, this difference should be 0. */
2236 GROUP_GAP (vinfo_for_stmt (stmt)) = groupsize - last_accessed_element;
fb85abff 2237
ee612634 2238 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
6d8fb6cf 2239 if (dump_enabled_p ())
904bd865 2240 {
2241 dump_printf_loc (MSG_NOTE, vect_location,
39e23eaa 2242 "Detected interleaving ");
2243 if (DR_IS_READ (dr))
2244 dump_printf (MSG_NOTE, "load ");
2245 else
2246 dump_printf (MSG_NOTE, "store ");
2247 dump_printf (MSG_NOTE, "of size %u starting with ",
2248 (unsigned)groupsize);
904bd865 2249 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
2250 if (GROUP_GAP (vinfo_for_stmt (stmt)) != 0)
2251 dump_printf_loc (MSG_NOTE, vect_location,
39e23eaa 2252 "There is a gap of %u elements after the group\n",
2253 GROUP_GAP (vinfo_for_stmt (stmt)));
904bd865 2254 }
fb85abff 2255
48e1416a 2256 /* SLP: create an SLP data structure for every interleaving group of
fb85abff 2257 stores for further analysis in vect_analyse_slp. */
9ff25603 2258 if (DR_IS_WRITE (dr) && !slp_impossible)
37545e54 2259 {
2260 if (loop_vinfo)
f1f41a6c 2261 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt);
37545e54 2262 if (bb_vinfo)
f1f41a6c 2263 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt);
37545e54 2264 }
a4ee7fac 2265
c1bee668 2266 /* If there is a gap at the end of the group or the group size cannot
2267 be made a multiple of the vector element count then we access excess
2268 elements in the last iteration and thus need to peel that off. */
2269 if (loop_vinfo
2270 && (groupsize - last_accessed_element > 0
2271 || exact_log2 (groupsize) == -1))
2272
a4ee7fac 2273 {
6d8fb6cf 2274 if (dump_enabled_p ())
7bd765d4 2275 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
78bb46f5 2276 "Data access with gaps requires scalar "
2277 "epilogue loop\n");
d88ecd95 2278 if (loop->inner)
2279 {
6d8fb6cf 2280 if (dump_enabled_p ())
78bb46f5 2281 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2282 "Peeling for outer loop is not supported\n");
d88ecd95 2283 return false;
2284 }
2285
2286 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = true;
a4ee7fac 2287 }
fb85abff 2288 }
2289
2290 return true;
2291}
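
/* Illustrative, self-contained sketch (hypothetical names, not GCC code) of
   the group-size and gap bookkeeping performed above.  INITS is an array of
   N element offsets (in bytes, sorted ascending) of the members of one
   interleaving chain, TYPE_SIZE the scalar element size and DR_STEP the
   (non-negative) byte step of the chain.  */

static unsigned
example_group_size (const long *inits, unsigned n, long type_size,
                    long dr_step, unsigned *gap_after_last)
{
  /* STEP counted in elements gives the group size including gaps; with a
     zero step it is derived from the member count and gaps below.  */
  unsigned groupsize = dr_step ? (unsigned) (dr_step / type_size) : 0;
  unsigned last_accessed_element = 1;
  unsigned count = 1, gaps = 0;

  for (unsigned i = 1; i < n; i++)
    {
      /* Distance between two members in elements; a distance larger than
         one means a gap inside the group.  */
      long diff = (inits[i] - inits[i - 1]) / type_size;
      gaps += diff - 1;
      last_accessed_element += diff;
      count++;
    }

  if (groupsize == 0)
    groupsize = count + gaps;

  /* Gap between the last accessed element and the end of the group.  */
  *gap_after_last = groupsize - last_accessed_element;
  return groupsize;
}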
2292
39e23eaa 2293/* Analyze groups of accesses: check that DR belongs to a group of
2294 accesses of legal size, step, etc. Detect gaps, single element
2295 interleaving, and other special cases. Set grouped access info.
2296 Collect groups of strided stores for further use in SLP analysis. */
2297
2298static bool
2299vect_analyze_group_access (struct data_reference *dr)
2300{
2301 if (!vect_analyze_group_access_1 (dr))
2302 {
2303 /* Dissolve the group if present. */
42acab1c 2304 gimple *next;
2305 gimple *stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dr)));
39e23eaa 2306 while (stmt)
2307 {
2308 stmt_vec_info vinfo = vinfo_for_stmt (stmt);
2309 next = GROUP_NEXT_ELEMENT (vinfo);
2310 GROUP_FIRST_ELEMENT (vinfo) = NULL;
2311 GROUP_NEXT_ELEMENT (vinfo) = NULL;
2312 stmt = next;
2313 }
2314 return false;
2315 }
2316 return true;
2317}
fb85abff 2318
2319/* Analyze the access pattern of the data-reference DR.
2320 In case of non-consecutive accesses call vect_analyze_group_access() to
ee612634 2321 analyze groups of accesses. */
fb85abff 2322
2323static bool
2324vect_analyze_data_ref_access (struct data_reference *dr)
2325{
2326 tree step = DR_STEP (dr);
2327 tree scalar_type = TREE_TYPE (DR_REF (dr));
42acab1c 2328 gimple *stmt = DR_STMT (dr);
fb85abff 2329 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2330 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
37545e54 2331 struct loop *loop = NULL;
fb85abff 2332
37545e54 2333 if (loop_vinfo)
2334 loop = LOOP_VINFO_LOOP (loop_vinfo);
48e1416a 2335
37545e54 2336 if (loop_vinfo && !step)
fb85abff 2337 {
6d8fb6cf 2338 if (dump_enabled_p ())
78bb46f5 2339 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2340 "bad data-ref access in loop\n");
fb85abff 2341 return false;
2342 }
2343
9b0be19c 2344 /* Allow loads with zero step in inner-loop vectorization. */
f634c3e9 2345 if (loop_vinfo && integer_zerop (step))
b04940e7 2346 {
2347 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
9b0be19c 2348 if (!nested_in_vect_loop_p (loop, stmt))
2349 return DR_IS_READ (dr);
2350 /* Allow references with zero step for outer loops marked
2351 with pragma omp simd only - it guarantees absence of
2352 loop-carried dependencies between inner loop iterations. */
2353 if (!loop->force_vectorize)
afa60cb4 2354 {
2355 if (dump_enabled_p ())
2356 dump_printf_loc (MSG_NOTE, vect_location,
78bb46f5 2357 "zero step in inner loop of nest\n");
afa60cb4 2358 return false;
2359 }
b04940e7 2360 }
fb85abff 2361
37545e54 2362 if (loop && nested_in_vect_loop_p (loop, stmt))
fb85abff 2363 {
2364 /* Interleaved accesses are not yet supported within outer-loop
2365 vectorization for references in the inner-loop. */
21009880 2366 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
fb85abff 2367
2368 /* For the rest of the analysis we use the outer-loop step. */
2369 step = STMT_VINFO_DR_STEP (stmt_info);
f634c3e9 2370 if (integer_zerop (step))
fb85abff 2371 {
6d8fb6cf 2372 if (dump_enabled_p ())
7bd765d4 2373 dump_printf_loc (MSG_NOTE, vect_location,
78bb46f5 2374 "zero step in outer loop.\n");
0bd6d857 2375 return DR_IS_READ (dr);
fb85abff 2376 }
2377 }
2378
2379 /* Consecutive? */
f634c3e9 2380 if (TREE_CODE (step) == INTEGER_CST)
fb85abff 2381 {
f9ae6f95 2382 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
f634c3e9 2383 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2384 || (dr_step < 0
2385 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2386 {
2387 /* Mark that it is not interleaving. */
2388 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2389 return true;
2390 }
fb85abff 2391 }
2392
37545e54 2393 if (loop && nested_in_vect_loop_p (loop, stmt))
fb85abff 2394 {
6d8fb6cf 2395 if (dump_enabled_p ())
7bd765d4 2396 dump_printf_loc (MSG_NOTE, vect_location,
78bb46f5 2397 "grouped access in outer loop.\n");
fb85abff 2398 return false;
2399 }
2400
994be998 2401
f634c3e9 2402 /* Assume this is a DR handled by the non-constant strided load case. */
2403 if (TREE_CODE (step) != INTEGER_CST)
e1c75243 2404 return (STMT_VINFO_STRIDED_P (stmt_info)
994be998 2405 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2406 || vect_analyze_group_access (dr)));
f634c3e9 2407
fb85abff 2408 /* Not consecutive access - check if it's a part of interleaving group. */
2409 return vect_analyze_group_access (dr);
2410}
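
/* Illustrative, self-contained sketch (hypothetical, not GCC API) of the
   "consecutive access" test used above: an access is consecutive when its
   constant byte step equals the scalar type size, or equals its negation
   for an access that walks backwards.  */

static int
example_consecutive_access_p (long dr_step, long type_size)
{
  return dr_step == type_size || (dr_step < 0 && -dr_step == type_size);
}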
2411
9b44c85a 2412
2413
2414/* A helper function used in the comparator function to sort data
2415 references. T1 and T2 are trees (components of the data references)
2416 to be compared. The function returns -1, 0, or 1. */
2417
2418static int
2419compare_tree (tree t1, tree t2)
2420{
2421 int i, cmp;
2422 enum tree_code code;
2423 char tclass;
2424
2425 if (t1 == t2)
2426 return 0;
2427 if (t1 == NULL)
2428 return -1;
2429 if (t2 == NULL)
2430 return 1;
2431
2432
2433 if (TREE_CODE (t1) != TREE_CODE (t2))
2434 return TREE_CODE (t1) < TREE_CODE (t2) ? -1 : 1;
2435
2436 code = TREE_CODE (t1);
2437 switch (code)
2438 {
2439 /* For const values, we can just use hash values for comparisons. */
2440 case INTEGER_CST:
2441 case REAL_CST:
2442 case FIXED_CST:
2443 case STRING_CST:
2444 case COMPLEX_CST:
2445 case VECTOR_CST:
2446 {
2447 hashval_t h1 = iterative_hash_expr (t1, 0);
2448 hashval_t h2 = iterative_hash_expr (t2, 0);
2449 if (h1 != h2)
2450 return h1 < h2 ? -1 : 1;
2451 break;
2452 }
2453
2454 case SSA_NAME:
2455 cmp = compare_tree (SSA_NAME_VAR (t1), SSA_NAME_VAR (t2));
2456 if (cmp != 0)
2457 return cmp;
2458
2459 if (SSA_NAME_VERSION (t1) != SSA_NAME_VERSION (t2))
2460 return SSA_NAME_VERSION (t1) < SSA_NAME_VERSION (t2) ? -1 : 1;
2461 break;
2462
2463 default:
2464 tclass = TREE_CODE_CLASS (code);
2465
2466 /* For var-decl, we could compare their UIDs. */
2467 if (tclass == tcc_declaration)
2468 {
2469 if (DECL_UID (t1) != DECL_UID (t2))
2470 return DECL_UID (t1) < DECL_UID (t2) ? -1 : 1;
2471 break;
2472 }
2473
2474 /* For expressions with operands, compare their operands recursively. */
2475 for (i = TREE_OPERAND_LENGTH (t1) - 1; i >= 0; --i)
2476 {
2477 cmp = compare_tree (TREE_OPERAND (t1, i), TREE_OPERAND (t2, i));
2478 if (cmp != 0)
2479 return cmp;
2480 }
2481 }
2482
2483 return 0;
2484}
2485
2486
68f15e9d 2487/* Compare two data-references DRA and DRB to order them into chunks
2488 suitable for grouping. */
2489
2490static int
2491dr_group_sort_cmp (const void *dra_, const void *drb_)
2492{
2493 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2494 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
68f15e9d 2495 int cmp;
2496
2497 /* Stabilize sort. */
2498 if (dra == drb)
2499 return 0;
2500
2501 /* Ordering of DRs according to base. */
2502 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0))
2503 {
9b44c85a 2504 cmp = compare_tree (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb));
2505 if (cmp != 0)
2506 return cmp;
68f15e9d 2507 }
2508
2509 /* And according to DR_OFFSET. */
2510 if (!dr_equal_offsets_p (dra, drb))
2511 {
9b44c85a 2512 cmp = compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2513 if (cmp != 0)
2514 return cmp;
68f15e9d 2515 }
2516
2517 /* Put reads before writes. */
2518 if (DR_IS_READ (dra) != DR_IS_READ (drb))
2519 return DR_IS_READ (dra) ? -1 : 1;
2520
2521 /* Then sort by access size. */
2522 if (!operand_equal_p (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2523 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))), 0))
2524 {
9b44c85a 2525 cmp = compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2526 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2527 if (cmp != 0)
2528 return cmp;
68f15e9d 2529 }
2530
2531 /* And by step. */
2532 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2533 {
9b44c85a 2534 cmp = compare_tree (DR_STEP (dra), DR_STEP (drb));
2535 if (cmp != 0)
2536 return cmp;
68f15e9d 2537 }
2538
2539 /* Then sort by DR_INIT. In case of identical DRs sort by stmt UID. */
2540 cmp = tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb));
2541 if (cmp == 0)
2542 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2543 return cmp;
2544}
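
/* Illustrative, self-contained sketch (hypothetical types and names, not
   GCC code) of the composite-key comparator pattern used above, suitable
   for qsort from <stdlib.h>: order by base address, then reads before
   writes, then by access size, step and constant offset, using the last
   key only to keep the ordering deterministic.  */

struct example_ref
{
  long base;    /* stands in for DR_BASE_ADDRESS */
  int is_read;  /* non-zero for loads */
  long size;    /* access size in bytes */
  long step;    /* byte step */
  long init;    /* constant offset, final tie-breaker */
};

static int
example_ref_cmp (const void *a_, const void *b_)
{
  const struct example_ref *a = (const struct example_ref *) a_;
  const struct example_ref *b = (const struct example_ref *) b_;

  if (a->base != b->base)
    return a->base < b->base ? -1 : 1;
  if (a->is_read != b->is_read)
    return a->is_read ? -1 : 1;        /* reads before writes */
  if (a->size != b->size)
    return a->size < b->size ? -1 : 1;
  if (a->step != b->step)
    return a->step < b->step ? -1 : 1;
  if (a->init != b->init)
    return a->init < b->init ? -1 : 1;
  return 0;
}

/* Usage: qsort (refs, n, sizeof (struct example_ref), example_ref_cmp);  */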
fb85abff 2545
2546/* Function vect_analyze_data_ref_accesses.
2547
2548 Analyze the access pattern of all the data references in the loop.
2549
2550 FORNOW: the only access pattern that is considered vectorizable is a
2551 simple step 1 (consecutive) access.
2552
2553 FORNOW: handle only arrays and pointer accesses. */
2554
2555bool
e2c5c678 2556vect_analyze_data_ref_accesses (vec_info *vinfo)
fb85abff 2557{
2558 unsigned int i;
e2c5c678 2559 vec<data_reference_p> datarefs = vinfo->datarefs;
fb85abff 2560 struct data_reference *dr;
2561
6d8fb6cf 2562 if (dump_enabled_p ())
7bd765d4 2563 dump_printf_loc (MSG_NOTE, vect_location,
78bb46f5 2564 "=== vect_analyze_data_ref_accesses ===\n");
fb85abff 2565
68f15e9d 2566 if (datarefs.is_empty ())
2567 return true;
2568
2569 /* Sort the array of datarefs to make building the interleaving chains
863a3781 2570 linear. Don't modify the original vector's order, it is needed for
2571 determining what dependencies are reversed. */
2572 vec<data_reference_p> datarefs_copy = datarefs.copy ();
90a2d741 2573 datarefs_copy.qsort (dr_group_sort_cmp);
68f15e9d 2574
2575 /* Build the interleaving chains. */
863a3781 2576 for (i = 0; i < datarefs_copy.length () - 1;)
68f15e9d 2577 {
863a3781 2578 data_reference_p dra = datarefs_copy[i];
68f15e9d 2579 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2580 stmt_vec_info lastinfo = NULL;
863a3781 2581 for (i = i + 1; i < datarefs_copy.length (); ++i)
68f15e9d 2582 {
863a3781 2583 data_reference_p drb = datarefs_copy[i];
68f15e9d 2584 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2585
2586 /* ??? Imperfect sorting (non-compatible types, non-modulo
2587 accesses, same accesses) can lead to a group being artificially
2588 split here as we don't just skip over those. If it really
2589 matters we can push those to a worklist and re-iterate
2590 over them. Then we can just skip ahead to the next DR here. */
2591
2592 /* Check that the data-refs have the same first location (except init)
5c0fac99 2593 and that they are both either stores or loads (not a load and a store,
2594 and not masked loads or stores). */
68f15e9d 2595 if (DR_IS_READ (dra) != DR_IS_READ (drb)
2596 || !operand_equal_p (DR_BASE_ADDRESS (dra),
2597 DR_BASE_ADDRESS (drb), 0)
5c0fac99 2598 || !dr_equal_offsets_p (dra, drb)
2599 || !gimple_assign_single_p (DR_STMT (dra))
2600 || !gimple_assign_single_p (DR_STMT (drb)))
68f15e9d 2601 break;
2602
994be998 2603 /* Check that the data-refs have the same constant size. */
68f15e9d 2604 tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
2605 tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
e913b5cd 2606 if (!tree_fits_uhwi_p (sza)
2607 || !tree_fits_uhwi_p (szb)
994be998 2608 || !tree_int_cst_equal (sza, szb))
2609 break;
2610
2611 /* Check that the data-refs have the same step. */
2612 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
68f15e9d 2613 break;
2614
2615 /* Do not place the same access in the interleaving chain twice. */
2616 if (tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb)) == 0)
2617 break;
2618
2619 /* Check the types are compatible.
2620 ??? We don't distinguish this during sorting. */
2621 if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
2622 TREE_TYPE (DR_REF (drb))))
2623 break;
2624
2625 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
f9ae6f95 2626 HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
2627 HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
68f15e9d 2628 gcc_assert (init_a < init_b);
2629
2630 /* If init_b == init_a + the size of the type * k, we have an
2631 interleaving, and DRA is accessed before DRB. */
8c53c46c 2632 HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
68f15e9d 2633 if ((init_b - init_a) % type_size_a != 0)
2634 break;
2635
3599384d 2636 /* If we have a store, the accesses are adjacent. This splits
2637 groups into chunks we support (we don't support vectorization
2638 of stores with gaps). */
2639 if (!DR_IS_READ (dra)
2640 && (init_b - (HOST_WIDE_INT) TREE_INT_CST_LOW
2641 (DR_INIT (datarefs_copy[i-1]))
2642 != type_size_a))
2643 break;
2644
994be998 2645 /* If the step (when a non-zero constant) is not greater than the
2646 difference between the data-refs' inits, split the group here;
2647 this keeps groups at suitable sizes. */
2648 if (tree_fits_shwi_p (DR_STEP (dra)))
2649 {
2650 HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra));
2651 if (step != 0 && step <= (init_b - init_a))
2652 break;
2653 }
68f15e9d 2654
2655 if (dump_enabled_p ())
2656 {
2657 dump_printf_loc (MSG_NOTE, vect_location,
2658 "Detected interleaving ");
39e23eaa 2659 if (DR_IS_READ (dra))
2660 dump_printf (MSG_NOTE, "load ");
2661 else
2662 dump_printf (MSG_NOTE, "store ");
68f15e9d 2663 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2664 dump_printf (MSG_NOTE, " and ");
2665 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
78bb46f5 2666 dump_printf (MSG_NOTE, "\n");
68f15e9d 2667 }
2668
2669 /* Link the found element into the group list. */
2670 if (!GROUP_FIRST_ELEMENT (stmtinfo_a))
2671 {
2672 GROUP_FIRST_ELEMENT (stmtinfo_a) = DR_STMT (dra);
2673 lastinfo = stmtinfo_a;
2674 }
2675 GROUP_FIRST_ELEMENT (stmtinfo_b) = DR_STMT (dra);
2676 GROUP_NEXT_ELEMENT (lastinfo) = DR_STMT (drb);
2677 lastinfo = stmtinfo_b;
2678 }
2679 }
2680
863a3781 2681 FOR_EACH_VEC_ELT (datarefs_copy, i, dr)
6ea6a380 2682 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
2683 && !vect_analyze_data_ref_access (dr))
fb85abff 2684 {
6d8fb6cf 2685 if (dump_enabled_p ())
78bb46f5 2686 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2687 "not vectorized: complicated access pattern.\n");
6ea6a380 2688
e2c5c678 2689 if (is_a <bb_vec_info> (vinfo))
6ea6a380 2690 {
2691 /* Mark the statement as not vectorizable. */
2692 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2693 continue;
2694 }
2695 else
863a3781 2696 {
2697 datarefs_copy.release ();
2698 return false;
2699 }
fb85abff 2700 }
2701
863a3781 2702 datarefs_copy.release ();
fb85abff 2703 return true;
2704}
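
/* Illustrative, self-contained sketch (hypothetical helper, not GCC API) of
   the conditions above that decide whether a data-ref with constant offset
   INIT_B can join the interleaving chain started by a data-ref with offset
   INIT_A, element size TYPE_SIZE and byte step STEP.  PREV_INIT is the
   offset of the member added last, IS_READ whether the chain consists of
   loads.  */

static int
example_can_join_group_p (long init_a, long init_b, long prev_init,
                          long type_size, long step, int is_read)
{
  /* Members must be a whole number of elements apart.  */
  if ((init_b - init_a) % type_size != 0)
    return 0;

  /* Stores with gaps are not vectorized, so store members must be
     adjacent to the previously added member.  */
  if (!is_read && init_b - prev_init != type_size)
    return 0;

  /* A step no larger than the distance between the two offsets means the
     second reference already falls into the next iteration's group.  */
  if (step != 0 && step <= init_b - init_a)
    return 0;

  return 1;
}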
2705
8a7b0f48 2706
43d14b66 2707/* Operator == between two dr_with_seg_len objects.
8a7b0f48 2708
2709 This equality operator is used to make sure two data refs
2710 are the same one so that we will consider to combine the
2711 aliasing checks of those two pairs of data dependent data
2712 refs. */
2713
2714static bool
43d14b66 2715operator == (const dr_with_seg_len& d1,
2716 const dr_with_seg_len& d2)
8a7b0f48 2717{
43d14b66 2718 return operand_equal_p (DR_BASE_ADDRESS (d1.dr),
2719 DR_BASE_ADDRESS (d2.dr), 0)
2720 && compare_tree (d1.offset, d2.offset) == 0
2721 && compare_tree (d1.seg_len, d2.seg_len) == 0;
8a7b0f48 2722}
2723
43d14b66 2724/* Function comp_dr_with_seg_len_pair.
8a7b0f48 2725
43d14b66 2726 Comparison function for sorting objects of dr_with_seg_len_pair_t
8a7b0f48 2727 so that we can combine aliasing checks in one scan. */
2728
2729static int
43d14b66 2730comp_dr_with_seg_len_pair (const void *p1_, const void *p2_)
8a7b0f48 2731{
43d14b66 2732 const dr_with_seg_len_pair_t* p1 = (const dr_with_seg_len_pair_t *) p1_;
2733 const dr_with_seg_len_pair_t* p2 = (const dr_with_seg_len_pair_t *) p2_;
2734
2735 const dr_with_seg_len &p11 = p1->first,
2736 &p12 = p1->second,
2737 &p21 = p2->first,
2738 &p22 = p2->second;
2739
2740 /* For DR pairs (a, b) and (c, d), we only consider merging the alias checks
2741 if a and c have the same base address and step, and b and d have the same
2742 address and step. Therefore, if either a&c or b&d don't have the same address
2743 and step, we don't care about the order of those two pairs after sorting. */
2744 int comp_res;
2745
2746 if ((comp_res = compare_tree (DR_BASE_ADDRESS (p11.dr),
2747 DR_BASE_ADDRESS (p21.dr))) != 0)
8a7b0f48 2748 return comp_res;
43d14b66 2749 if ((comp_res = compare_tree (DR_BASE_ADDRESS (p12.dr),
2750 DR_BASE_ADDRESS (p22.dr))) != 0)
2751 return comp_res;
2752 if ((comp_res = compare_tree (DR_STEP (p11.dr), DR_STEP (p21.dr))) != 0)
2753 return comp_res;
2754 if ((comp_res = compare_tree (DR_STEP (p12.dr), DR_STEP (p22.dr))) != 0)
2755 return comp_res;
2756 if ((comp_res = compare_tree (p11.offset, p21.offset)) != 0)
2757 return comp_res;
2758 if ((comp_res = compare_tree (p12.offset, p22.offset)) != 0)
8a7b0f48 2759 return comp_res;
8a7b0f48 2760
2761 return 0;
2762}
2763
8a7b0f48 2764/* Function vect_vfa_segment_size.
2765
2766 Create an expression that computes the size of the segment
2767 that will be accessed for a data reference. The function takes into
2768 account that realignment loads may access one more vector.
2769
2770 Input:
2771 DR: The data reference.
2772 LENGTH_FACTOR: segment length to consider.
2773
2774 Return an expression whose value is the size of segment which will be
2775 accessed by DR. */
2776
2777static tree
2778vect_vfa_segment_size (struct data_reference *dr, tree length_factor)
2779{
2780 tree segment_length;
2781
2782 if (integer_zerop (DR_STEP (dr)))
2783 segment_length = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
2784 else
2785 segment_length = size_binop (MULT_EXPR,
43d14b66 2786 fold_convert (sizetype, DR_STEP (dr)),
2787 fold_convert (sizetype, length_factor));
8a7b0f48 2788
2789 if (vect_supportable_dr_alignment (dr, false)
43d14b66 2790 == dr_explicit_realign_optimized)
8a7b0f48 2791 {
2792 tree vector_size = TYPE_SIZE_UNIT
2793 (STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr))));
2794
2795 segment_length = size_binop (PLUS_EXPR, segment_length, vector_size);
2796 }
2797 return segment_length;
2798}
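
/* Illustrative, self-contained sketch (hypothetical names, not GCC code) of
   the segment-size computation above: a zero-step reference touches a
   single element, otherwise the segment is the byte step times the chosen
   length factor, and an extra vector is added when the optimized
   realignment scheme may read one vector beyond the segment.  */

static long
example_segment_size (long step, long elem_size, long length_factor,
                      long vector_size, int realign_optimized)
{
  long segment_length = step == 0 ? elem_size : step * length_factor;

  if (realign_optimized)
    segment_length += vector_size;

  return segment_length;
}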
2799
fb85abff 2800/* Function vect_prune_runtime_alias_test_list.
2801
2802 Prune a list of ddrs to be tested at run-time by versioning for alias.
8a7b0f48 2803 Merge several alias checks into one if possible.
fb85abff 2804 Return FALSE if the resulting list of ddrs is longer than allowed by
2805 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
2806
2807bool
2808vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
2809{
8a7b0f48 2810 vec<ddr_p> may_alias_ddrs =
fb85abff 2811 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
43d14b66 2812 vec<dr_with_seg_len_pair_t>& comp_alias_ddrs =
8a7b0f48 2813 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
2814 int vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2815 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
2816
2817 ddr_p ddr;
2818 unsigned int i;
2819 tree length_factor;
fb85abff 2820
6d8fb6cf 2821 if (dump_enabled_p ())
7bd765d4 2822 dump_printf_loc (MSG_NOTE, vect_location,
78bb46f5 2823 "=== vect_prune_runtime_alias_test_list ===\n");
fb85abff 2824
8a7b0f48 2825 if (may_alias_ddrs.is_empty ())
2826 return true;
2827
2828 /* Basically, for each pair of dependent data refs store_ptr_0
2829 and load_ptr_0, we create an expression:
2830
2831 ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
2832 || (load_ptr_0 + load_segment_length_0) <= store_ptr_0))
2833
2834 for aliasing checks. However, in some cases we can decrease
2835 the number of checks by combining two checks into one. For
2836 example, suppose we have another pair of data refs store_ptr_0
2837 and load_ptr_1, and if the following condition is satisfied:
2838
2839 load_ptr_0 < load_ptr_1 &&
2840 load_ptr_1 - load_ptr_0 - load_segment_length_0 < store_segment_length_0
2841
2842 (this condition means that, in each iteration of the vectorized loop,
2843 the accessed memory of store_ptr_0 cannot be between the memory
2844 of load_ptr_0 and load_ptr_1.)
2845
2846 we can then use only the following expression to finish the
2847 aliasing checks between store_ptr_0 & load_ptr_0 and
2848 store_ptr_0 & load_ptr_1:
2849
2850 ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
2851 || (load_ptr_1 + load_segment_length_1 <= store_ptr_0))
2852
2853 Note that we only consider that load_ptr_0 and load_ptr_1 have the
2854 same base address. */
2855
2856 comp_alias_ddrs.create (may_alias_ddrs.length ());
2857
2858 /* First, we collect all data ref pairs for aliasing checks. */
2859 FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
fb85abff 2860 {
8a7b0f48 2861 struct data_reference *dr_a, *dr_b;
42acab1c 2862 gimple *dr_group_first_a, *dr_group_first_b;
8a7b0f48 2863 tree segment_length_a, segment_length_b;
42acab1c 2864 gimple *stmt_a, *stmt_b;
8a7b0f48 2865
2866 dr_a = DDR_A (ddr);
2867 stmt_a = DR_STMT (DDR_A (ddr));
2868 dr_group_first_a = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_a));
2869 if (dr_group_first_a)
2870 {
2871 stmt_a = dr_group_first_a;
2872 dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a));
2873 }
fb85abff 2874
8a7b0f48 2875 dr_b = DDR_B (ddr);
2876 stmt_b = DR_STMT (DDR_B (ddr));
2877 dr_group_first_b = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_b));
2878 if (dr_group_first_b)
2879 {
2880 stmt_b = dr_group_first_b;
2881 dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b));
2882 }
fb85abff 2883
8a7b0f48 2884 if (!operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0))
2885 length_factor = scalar_loop_iters;
2886 else
2887 length_factor = size_int (vect_factor);
2888 segment_length_a = vect_vfa_segment_size (dr_a, length_factor);
2889 segment_length_b = vect_vfa_segment_size (dr_b, length_factor);
2890
43d14b66 2891 dr_with_seg_len_pair_t dr_with_seg_len_pair
2892 (dr_with_seg_len (dr_a, segment_length_a),
2893 dr_with_seg_len (dr_b, segment_length_b));
2894
2895 if (compare_tree (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b)) > 0)
3d4d7ad1 2896 std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
8a7b0f48 2897
2898 comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
2899 }
2900
2901 /* Second, we sort the collected data ref pairs so that we can scan
2902 them once to combine all possible aliasing checks. */
43d14b66 2903 comp_alias_ddrs.qsort (comp_dr_with_seg_len_pair);
fb85abff 2904
8a7b0f48 2905 /* Third, we scan the sorted dr pairs and check if we can combine
2906 alias checks of two neighbouring dr pairs. */
2907 for (size_t i = 1; i < comp_alias_ddrs.length (); ++i)
2908 {
2909 /* Deal with two ddrs (dr_a1, dr_b1) and (dr_a2, dr_b2). */
43d14b66 2910 dr_with_seg_len *dr_a1 = &comp_alias_ddrs[i-1].first,
2911 *dr_b1 = &comp_alias_ddrs[i-1].second,
2912 *dr_a2 = &comp_alias_ddrs[i].first,
2913 *dr_b2 = &comp_alias_ddrs[i].second;
8a7b0f48 2914
2915 /* Remove duplicate data ref pairs. */
2916 if (*dr_a1 == *dr_a2 && *dr_b1 == *dr_b2)
2917 {
2918 if (dump_enabled_p ())
fb85abff 2919 {
8a7b0f48 2920 dump_printf_loc (MSG_NOTE, vect_location,
2921 "found equal ranges ");
2922 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2923 DR_REF (dr_a1->dr));
2924 dump_printf (MSG_NOTE, ", ");
2925 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2926 DR_REF (dr_b1->dr));
2927 dump_printf (MSG_NOTE, " and ");
2928 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2929 DR_REF (dr_a2->dr));
2930 dump_printf (MSG_NOTE, ", ");
2931 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2932 DR_REF (dr_b2->dr));
2933 dump_printf (MSG_NOTE, "\n");
fb85abff 2934 }
8a7b0f48 2935
2936 comp_alias_ddrs.ordered_remove (i--);
2937 continue;
fb85abff 2938 }
48e1416a 2939
8a7b0f48 2940 if (*dr_a1 == *dr_a2 || *dr_b1 == *dr_b2)
2941 {
2942 /* We consider the case that DR_B1 and DR_B2 are the same memrefs,
2943 and DR_A1 and DR_A2 are two consecutive memrefs. */
2944 if (*dr_a1 == *dr_a2)
2945 {
3d4d7ad1 2946 std::swap (dr_a1, dr_b1);
2947 std::swap (dr_a2, dr_b2);
8a7b0f48 2948 }
2949
43d14b66 2950 if (!operand_equal_p (DR_BASE_ADDRESS (dr_a1->dr),
2951 DR_BASE_ADDRESS (dr_a2->dr),
2952 0)
55af3bae 2953 || !tree_fits_shwi_p (dr_a1->offset)
2954 || !tree_fits_shwi_p (dr_a2->offset))
8a7b0f48 2955 continue;
2956
55af3bae 2957 HOST_WIDE_INT diff = (tree_to_shwi (dr_a2->offset)
2958 - tree_to_shwi (dr_a1->offset));
8a7b0f48 2959
2960
2961 /* Now we check if the following condition is satisfied:
2962
2963 DIFF - SEGMENT_LENGTH_A < SEGMENT_LENGTH_B
2964
2965 where DIFF = DR_A2->OFFSET - DR_A1->OFFSET. However,
2966 SEGMENT_LENGTH_A or SEGMENT_LENGTH_B may not be constant so we
2967 have to make a best estimation. We can get the minimum value
2968 of SEGMENT_LENGTH_B as a constant, represented by MIN_SEG_LEN_B,
2969 then either of the following two conditions can guarantee the
2970 one above:
2971
2972 1: DIFF <= MIN_SEG_LEN_B
2973 2: DIFF - SEGMENT_LENGTH_A < MIN_SEG_LEN_B
2974
2975 */
2976
55af3bae 2977 HOST_WIDE_INT min_seg_len_b = (tree_fits_shwi_p (dr_b1->seg_len)
2978 ? tree_to_shwi (dr_b1->seg_len)
2979 : vect_factor);
8a7b0f48 2980
2981 if (diff <= min_seg_len_b
55af3bae 2982 || (tree_fits_shwi_p (dr_a1->seg_len)
2983 && diff - tree_to_shwi (dr_a1->seg_len) < min_seg_len_b))
8a7b0f48 2984 {
1c7f4889 2985 if (dump_enabled_p ())
2986 {
2987 dump_printf_loc (MSG_NOTE, vect_location,
2988 "merging ranges for ");
2989 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2990 DR_REF (dr_a1->dr));
2991 dump_printf (MSG_NOTE, ", ");
2992 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2993 DR_REF (dr_b1->dr));
2994 dump_printf (MSG_NOTE, " and ");
2995 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2996 DR_REF (dr_a2->dr));
2997 dump_printf (MSG_NOTE, ", ");
2998 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2999 DR_REF (dr_b2->dr));
3000 dump_printf (MSG_NOTE, "\n");
3001 }
3002
8a7b0f48 3003 dr_a1->seg_len = size_binop (PLUS_EXPR,
3004 dr_a2->seg_len, size_int (diff));
3005 comp_alias_ddrs.ordered_remove (i--);
3006 }
3007 }
fb85abff 3008 }
3009
1c7f4889 3010 dump_printf_loc (MSG_NOTE, vect_location,
3011 "improved number of alias checks from %d to %d\n",
3012 may_alias_ddrs.length (), comp_alias_ddrs.length ());
8a7b0f48 3013 if ((int) comp_alias_ddrs.length () >
3014 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
1c7f4889 3015 return false;
fb85abff 3016
3017 return true;
3018}
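
/* Illustrative, self-contained sketch (hypothetical, not GCC API) of the
   two checks discussed above.  The first is the runtime test emitted for
   one pair of segments, assuming non-negative segment lengths: they do not
   alias iff one segment ends before the other begins.  The second is the
   condition under which the check for (DR_A2, DR_B) can be folded into the
   check for (DR_A1, DR_B), where DIFF = OFFSET_A2 - OFFSET_A1 and
   MIN_SEG_LEN_B is a lower bound on DR_B's segment length.  */

static int
example_segments_do_not_alias_p (unsigned long addr_a, long seg_len_a,
                                 unsigned long addr_b, long seg_len_b)
{
  return addr_a + seg_len_a <= addr_b || addr_b + seg_len_b <= addr_a;
}

static int
example_can_merge_alias_checks_p (long diff, long seg_len_a1,
                                  long min_seg_len_b)
{
  /* Either condition guarantees DIFF - SEG_LEN_A1 < MIN_SEG_LEN_B, the
     property that lets the merged range cover both original checks.  */
  return diff <= min_seg_len_b || diff - seg_len_a1 < min_seg_len_b;
}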
3019
0bd6d857 3020/* Check whether a non-affine read or write in stmt is suitable for gather load
3021 or scatter store and if so, return a builtin decl for that operation. */
16dfb112 3022
3023tree
42acab1c 3024vect_check_gather_scatter (gimple *stmt, loop_vec_info loop_vinfo, tree *basep,
0bd6d857 3025 tree *offp, int *scalep)
16dfb112 3026{
3027 HOST_WIDE_INT scale = 1, pbitpos, pbitsize;
3028 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3029 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3030 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3031 tree offtype = NULL_TREE;
3032 tree decl, base, off;
3754d046 3033 machine_mode pmode;
16dfb112 3034 int punsignedp, pvolatilep;
3035
c71d3c24 3036 base = DR_REF (dr);
3037 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
3038 see if we can use the def stmt of the address. */
3039 if (is_gimple_call (stmt)
3040 && gimple_call_internal_p (stmt)
3041 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
3042 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
3043 && TREE_CODE (base) == MEM_REF
3044 && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
3045 && integer_zerop (TREE_OPERAND (base, 1))
3046 && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
3047 {
42acab1c 3048 gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
c71d3c24 3049 if (is_gimple_assign (def_stmt)
3050 && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
3051 base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
3052 }
3053
0bd6d857 3054 /* The gather and scatter builtins need an address of the form
16dfb112 3055 loop_invariant + vector * {1, 2, 4, 8}
3056 or
3057 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3058 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3059 of loop invariants/SSA_NAMEs defined in the loop, with casts,
3060 multiplications and additions in it. To get a vector, we need
3061 a single SSA_NAME that will be defined in the loop and will
3062 contain everything that is not loop invariant and that can be
3063 vectorized. The following code attempts to find such a preexisting
3064 SSA_NAME OFF and put the loop invariants into a tree BASE
3065 that can be gimplified before the loop. */
c71d3c24 3066 base = get_inner_reference (base, &pbitsize, &pbitpos, &off,
16dfb112 3067 &pmode, &punsignedp, &pvolatilep, false);
3068 gcc_assert (base != NULL_TREE && (pbitpos % BITS_PER_UNIT) == 0);
3069
3070 if (TREE_CODE (base) == MEM_REF)
3071 {
3072 if (!integer_zerop (TREE_OPERAND (base, 1)))
3073 {
3074 if (off == NULL_TREE)
3075 {
5de9d3ed 3076 offset_int moff = mem_ref_offset (base);
e913b5cd 3077 off = wide_int_to_tree (sizetype, moff);
16dfb112 3078 }
3079 else
3080 off = size_binop (PLUS_EXPR, off,
3081 fold_convert (sizetype, TREE_OPERAND (base, 1)));
3082 }
3083 base = TREE_OPERAND (base, 0);
3084 }
3085 else
3086 base = build_fold_addr_expr (base);
3087
3088 if (off == NULL_TREE)
3089 off = size_zero_node;
3090
3091 /* If base is not loop invariant, then if off is 0 we start with just
3092 the constant offset in the loop invariant BASE and continue with base
3093 as OFF; otherwise give up.
3094 We could handle that case by gimplifying the addition of base + off
3095 into some SSA_NAME and use that as off, but for now punt. */
3096 if (!expr_invariant_in_loop_p (loop, base))
3097 {
3098 if (!integer_zerop (off))
3099 return NULL_TREE;
3100 off = base;
3101 base = size_int (pbitpos / BITS_PER_UNIT);
3102 }
3103 /* Otherwise put base + constant offset into the loop invariant BASE
3104 and continue with OFF. */
3105 else
3106 {
3107 base = fold_convert (sizetype, base);
3108 base = size_binop (PLUS_EXPR, base, size_int (pbitpos / BITS_PER_UNIT));
3109 }
3110
3111 /* OFF at this point may be either a SSA_NAME or some tree expression
3112 from get_inner_reference. Try to peel off loop invariants from it
3113 into BASE as long as possible. */
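  /* An illustrative sketch with made-up SSA names, not from the original
     source: if OFF were (sizetype) idx_5 * 4 + 16 with idx_5 defined in
     the loop, the peeling below would move the invariant parts into BASE
     (BASE += 16, SCALE = 4) and leave OFF = idx_5.  */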
3114 STRIP_NOPS (off);
3115 while (offtype == NULL_TREE)
3116 {
3117 enum tree_code code;
3118 tree op0, op1, add = NULL_TREE;
3119
3120 if (TREE_CODE (off) == SSA_NAME)
3121 {
42acab1c 3122 gimple *def_stmt = SSA_NAME_DEF_STMT (off);
16dfb112 3123
3124 if (expr_invariant_in_loop_p (loop, off))
3125 return NULL_TREE;
3126
3127 if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
3128 break;
3129
3130 op0 = gimple_assign_rhs1 (def_stmt);
3131 code = gimple_assign_rhs_code (def_stmt);
3132 op1 = gimple_assign_rhs2 (def_stmt);
3133 }
3134 else
3135 {
3136 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
3137 return NULL_TREE;
3138 code = TREE_CODE (off);
3139 extract_ops_from_tree (off, &code, &op0, &op1);
3140 }
3141 switch (code)
3142 {
3143 case POINTER_PLUS_EXPR:
3144 case PLUS_EXPR:
3145 if (expr_invariant_in_loop_p (loop, op0))
3146 {
3147 add = op0;
3148 off = op1;
3149 do_add:
3150 add = fold_convert (sizetype, add);
3151 if (scale != 1)
3152 add = size_binop (MULT_EXPR, add, size_int (scale));
3153 base = size_binop (PLUS_EXPR, base, add);
3154 continue;
3155 }
3156 if (expr_invariant_in_loop_p (loop, op1))
3157 {
3158 add = op1;
3159 off = op0;
3160 goto do_add;
3161 }
3162 break;
3163 case MINUS_EXPR:
3164 if (expr_invariant_in_loop_p (loop, op1))
3165 {
3166 add = fold_convert (sizetype, op1);
3167 add = size_binop (MINUS_EXPR, size_zero_node, add);
3168 off = op0;
3169 goto do_add;
3170 }
3171 break;
3172 case MULT_EXPR:
e913b5cd 3173 if (scale == 1 && tree_fits_shwi_p (op1))
16dfb112 3174 {
e913b5cd 3175 scale = tree_to_shwi (op1);
16dfb112 3176 off = op0;
3177 continue;
3178 }
3179 break;
3180 case SSA_NAME:
3181 off = op0;
3182 continue;
3183 CASE_CONVERT:
3184 if (!POINTER_TYPE_P (TREE_TYPE (op0))
3185 && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3186 break;
3187 if (TYPE_PRECISION (TREE_TYPE (op0))
3188 == TYPE_PRECISION (TREE_TYPE (off)))
3189 {
3190 off = op0;
3191 continue;
3192 }
3193 if (TYPE_PRECISION (TREE_TYPE (op0))
3194 < TYPE_PRECISION (TREE_TYPE (off)))
3195 {
3196 off = op0;
3197 offtype = TREE_TYPE (off);
3198 STRIP_NOPS (off);
3199 continue;
3200 }
3201 break;
3202 default:
3203 break;
3204 }
3205 break;
3206 }
3207
3208 /* If at the end OFF still isn't a SSA_NAME or isn't
3209 defined in the loop, punt. */
3210 if (TREE_CODE (off) != SSA_NAME
3211 || expr_invariant_in_loop_p (loop, off))
3212 return NULL_TREE;
3213
3214 if (offtype == NULL_TREE)
3215 offtype = TREE_TYPE (off);
3216
0bd6d857 3217 if (DR_IS_READ (dr))
3218 decl = targetm.vectorize.builtin_gather (STMT_VINFO_VECTYPE (stmt_info),
3219 offtype, scale);
3220 else
3221 decl = targetm.vectorize.builtin_scatter (STMT_VINFO_VECTYPE (stmt_info),
3222 offtype, scale);
3223
16dfb112 3224 if (decl == NULL_TREE)
3225 return NULL_TREE;
3226
3227 if (basep)
3228 *basep = base;
3229 if (offp)
3230 *offp = off;
3231 if (scalep)
3232 *scalep = scale;
3233 return decl;
3234}
3235
fb85abff 3236/* Function vect_analyze_data_refs.
3237
37545e54 3238 Find all the data references in the loop or basic block.
fb85abff 3239
3240 The general structure of the analysis of data refs in the vectorizer is as
3241 follows:
48e1416a 3242 1- vect_analyze_data_refs(loop/bb): call
37545e54 3243 compute_data_dependences_for_loop/bb to find and analyze all data-refs
3244 in the loop/bb and their dependences.
fb85abff 3245 2- vect_analyze_dependences(): apply dependence testing using ddrs.
3246 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
3247 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
3248
3249*/
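/* An illustrative example, not from the original source: for a loop such
   as

     for (i = 0; i < n; i++)
       a[i] = b[i] + c[i];

   this function collects three data references (the store to a[i] and the
   loads from b[i] and c[i]) and, subject to the checks below, records a DR
   and a vectype in each statement's stmt_vec_info.  */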
3250
3251bool
e2c5c678 3252vect_analyze_data_refs (vec_info *vinfo, int *min_vf, unsigned *n_stmts)
fb85abff 3253{
37545e54 3254 struct loop *loop = NULL;
3255 basic_block bb = NULL;
fb85abff 3256 unsigned int i;
f1f41a6c 3257 vec<data_reference_p> datarefs;
fb85abff 3258 struct data_reference *dr;
3259 tree scalar_type;
3260
6d8fb6cf 3261 if (dump_enabled_p ())
7bd765d4 3262 dump_printf_loc (MSG_NOTE, vect_location,
3263 "=== vect_analyze_data_refs ===\n");
48e1416a 3264
e2c5c678 3265 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
37545e54 3266 {
d09768a4 3267 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
3268
37545e54 3269 loop = LOOP_VINFO_LOOP (loop_vinfo);
d09768a4 3270 datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
3271 if (!find_loop_nest (loop, &LOOP_VINFO_LOOP_NEST (loop_vinfo)))
590a32b6 3272 {
6d8fb6cf 3273 if (dump_enabled_p ())
78bb46f5 3274 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3275 "not vectorized: loop contains function calls"
3276 " or data references that cannot be analyzed\n");
590a32b6 3277 return false;
3278 }
3279
d09768a4 3280 for (i = 0; i < loop->num_nodes; i++)
3281 {
3282 gimple_stmt_iterator gsi;
3283
3284 for (gsi = gsi_start_bb (bbs[i]); !gsi_end_p (gsi); gsi_next (&gsi))
3285 {
42acab1c 3286 gimple *stmt = gsi_stmt (gsi);
ca91d3f8 3287 if (is_gimple_debug (stmt))
3288 continue;
3289 ++*n_stmts;
d09768a4 3290 if (!find_data_references_in_stmt (loop, stmt, &datarefs))
3291 {
3292 if (is_gimple_call (stmt) && loop->safelen)
3293 {
3294 tree fndecl = gimple_call_fndecl (stmt), op;
3295 if (fndecl != NULL_TREE)
3296 {
415d1b9a 3297 struct cgraph_node *node = cgraph_node::get (fndecl);
d09768a4 3298 if (node != NULL && node->simd_clones != NULL)
3299 {
3300 unsigned int j, n = gimple_call_num_args (stmt);
3301 for (j = 0; j < n; j++)
3302 {
3303 op = gimple_call_arg (stmt, j);
3304 if (DECL_P (op)
3305 || (REFERENCE_CLASS_P (op)
3306 && get_base_address (op)))
3307 break;
3308 }
3309 op = gimple_call_lhs (stmt);
3310 /* Ignore #pragma omp declare simd functions
3311 if they don't have data references in the
3312 call stmt itself. */
3313 if (j == n
3314 && !(op
3315 && (DECL_P (op)
3316 || (REFERENCE_CLASS_P (op)
3317 && get_base_address (op)))))
3318 continue;
3319 }
3320 }
3321 }
3322 LOOP_VINFO_DATAREFS (loop_vinfo) = datarefs;
3323 if (dump_enabled_p ())
3324 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3325 "not vectorized: loop contains function "
3326 "calls or data references that cannot "
3327 "be analyzed\n");
3328 return false;
3329 }
3330 }
3331 }
3332
3333 LOOP_VINFO_DATAREFS (loop_vinfo) = datarefs;
37545e54 3334 }
3335 else
3336 {
e2c5c678 3337 bb_vec_info bb_vinfo = as_a <bb_vec_info> (vinfo);
ec84182d 3338 gimple_stmt_iterator gsi;
3339
37545e54 3340 bb = BB_VINFO_BB (bb_vinfo);
ec84182d 3341 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
3342 {
42acab1c 3343 gimple *stmt = gsi_stmt (gsi);
ca91d3f8 3344 if (is_gimple_debug (stmt))
3345 continue;
3346 ++*n_stmts;
ec84182d 3347 if (!find_data_references_in_stmt (NULL, stmt,
3348 &BB_VINFO_DATAREFS (bb_vinfo)))
3349 {
3350 /* Mark the rest of the basic-block as unvectorizable. */
3351 for (; !gsi_end_p (gsi); gsi_next (&gsi))
156b8feb 3352 {
3353 stmt = gsi_stmt (gsi);
3354 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (stmt)) = false;
3355 }
ec84182d 3356 break;
3357 }
3358 }
590a32b6 3359
37545e54 3360 datarefs = BB_VINFO_DATAREFS (bb_vinfo);
3361 }
fb85abff 3362
282bf14c 3363 /* Go through the data-refs, check that the analysis succeeded. Update
3364 pointer from stmt_vec_info struct to DR and vectype. */
fb85abff 3365
f1f41a6c 3366 FOR_EACH_VEC_ELT (datarefs, i, dr)
fb85abff 3367 {
42acab1c 3368 gimple *stmt;
fb85abff 3369 stmt_vec_info stmt_info;
48e1416a 3370 tree base, offset, init;
0bd6d857 3371 enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
3d483a94 3372 bool simd_lane_access = false;
91a74fc6 3373 int vf;
48e1416a 3374
8911f4de 3375again:
fb85abff 3376 if (!dr || !DR_REF (dr))
3377 {
6d8fb6cf 3378 if (dump_enabled_p ())
7bd765d4 3379 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
78bb46f5 3380 "not vectorized: unhandled data-ref\n");
fb85abff 3381 return false;
3382 }
3383
3384 stmt = DR_STMT (dr);
3385 stmt_info = vinfo_for_stmt (stmt);
3386
8911f4de 3387 /* Discard clobbers from the dataref vector. We will remove
3388 clobber stmts during vectorization. */
3389 if (gimple_clobber_p (stmt))
3390 {
53c3c39b 3391 free_data_ref (dr);
8911f4de 3392 if (i == datarefs.length () - 1)
3393 {
3394 datarefs.pop ();
3395 break;
3396 }
4aaf064d 3397 datarefs.ordered_remove (i);
3398 dr = datarefs[i];
8911f4de 3399 goto again;
3400 }
3401
fb85abff 3402 /* Check that analysis of the data-ref succeeded. */
3403 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
16dfb112 3404 || !DR_STEP (dr))
fb85abff 3405 {
3d483a94 3406 bool maybe_gather
3407 = DR_IS_READ (dr)
16dfb112 3408 && !TREE_THIS_VOLATILE (DR_REF (dr))
3d483a94 3409 && targetm.vectorize.builtin_gather != NULL;
0bd6d857 3410 bool maybe_scatter
3411 = DR_IS_WRITE (dr)
3412 && !TREE_THIS_VOLATILE (DR_REF (dr))
3413 && targetm.vectorize.builtin_scatter != NULL;
3d483a94 3414 bool maybe_simd_lane_access
e2c5c678 3415 = is_a <loop_vec_info> (vinfo) && loop->simduid;
3d483a94 3416
0bd6d857 3417 /* If the target supports vector gather loads or scatter stores, or if
 3418 this might be a SIMD lane access, see if they can be used. */
e2c5c678 3419 if (is_a <loop_vec_info> (vinfo)
0bd6d857 3420 && (maybe_gather || maybe_scatter || maybe_simd_lane_access)
16dfb112 3421 && !nested_in_vect_loop_p (loop, stmt))
3422 {
3423 struct data_reference *newdr
3424 = create_data_ref (NULL, loop_containing_stmt (stmt),
0bd6d857 3425 DR_REF (dr), stmt, maybe_scatter ? false : true);
16dfb112 3426 gcc_assert (newdr != NULL && DR_REF (newdr));
3427 if (DR_BASE_ADDRESS (newdr)
3428 && DR_OFFSET (newdr)
3429 && DR_INIT (newdr)
3430 && DR_STEP (newdr)
3431 && integer_zerop (DR_STEP (newdr)))
3432 {
3d483a94 3433 if (maybe_simd_lane_access)
3434 {
3435 tree off = DR_OFFSET (newdr);
3436 STRIP_NOPS (off);
3437 if (TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
3438 && TREE_CODE (off) == MULT_EXPR
d85a2013 3439 && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
3d483a94 3440 {
3441 tree step = TREE_OPERAND (off, 1);
3442 off = TREE_OPERAND (off, 0);
3443 STRIP_NOPS (off);
3444 if (CONVERT_EXPR_P (off)
3445 && TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off,
3446 0)))
3447 < TYPE_PRECISION (TREE_TYPE (off)))
3448 off = TREE_OPERAND (off, 0);
3449 if (TREE_CODE (off) == SSA_NAME)
3450 {
42acab1c 3451 gimple *def = SSA_NAME_DEF_STMT (off);
3d483a94 3452 tree reft = TREE_TYPE (DR_REF (newdr));
cae17039 3453 if (is_gimple_call (def)
3454 && gimple_call_internal_p (def)
3455 && (gimple_call_internal_fn (def)
3456 == IFN_GOMP_SIMD_LANE))
3d483a94 3457 {
3458 tree arg = gimple_call_arg (def, 0);
3459 gcc_assert (TREE_CODE (arg) == SSA_NAME);
3460 arg = SSA_NAME_VAR (arg);
3461 if (arg == loop->simduid
3462 /* For now. */
3463 && tree_int_cst_equal
3464 (TYPE_SIZE_UNIT (reft),
3465 step))
3466 {
3467 DR_OFFSET (newdr) = ssize_int (0);
3468 DR_STEP (newdr) = step;
3b36c9f7 3469 DR_ALIGNED_TO (newdr)
3470 = size_int (BIGGEST_ALIGNMENT);
3d483a94 3471 dr = newdr;
3472 simd_lane_access = true;
3473 }
3474 }
3475 }
3476 }
3477 }
0bd6d857 3478 if (!simd_lane_access && (maybe_gather || maybe_scatter))
3d483a94 3479 {
3480 dr = newdr;
0bd6d857 3481 if (maybe_gather)
3482 gatherscatter = GATHER;
3483 else
3484 gatherscatter = SCATTER;
3d483a94 3485 }
16dfb112 3486 }
0bd6d857 3487 if (gatherscatter == SG_NONE && !simd_lane_access)
16dfb112 3488 free_data_ref (newdr);
3489 }
6ea6a380 3490
0bd6d857 3491 if (gatherscatter == SG_NONE && !simd_lane_access)
16dfb112 3492 {
6d8fb6cf 3493 if (dump_enabled_p ())
16dfb112 3494 {
78bb46f5 3495 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7bd765d4 3496 "not vectorized: data ref analysis "
3497 "failed ");
3498 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
78bb46f5 3499 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
16dfb112 3500 }
3c9feaca 3501
e2c5c678 3502 if (is_a <bb_vec_info> (vinfo))
07e3bcbf 3503 break;
16dfb112 3504
3505 return false;
3506 }
fb85abff 3507 }
3508
3509 if (TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
3510 {
6d8fb6cf 3511 if (dump_enabled_p ())
7bd765d4 3512 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3513 "not vectorized: base addr of dr is a "
78bb46f5 3514 "constant\n");
3c9feaca 3515
e2c5c678 3516 if (is_a <bb_vec_info> (vinfo))
07e3bcbf 3517 break;
3c9feaca 3518
0bd6d857 3519 if (gatherscatter != SG_NONE || simd_lane_access)
16dfb112 3520 free_data_ref (dr);
3521 return false;
fb85abff 3522 }
3523
83ec4165 3524 if (TREE_THIS_VOLATILE (DR_REF (dr)))
3525 {
6d8fb6cf 3526 if (dump_enabled_p ())
83ec4165 3527 {
7bd765d4 3528 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3529 "not vectorized: volatile type ");
3530 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
78bb46f5 3531 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
83ec4165 3532 }
3c9feaca 3533
e2c5c678 3534 if (is_a <bb_vec_info> (vinfo))
07e3bcbf 3535 break;
3c9feaca 3536
83ec4165 3537 return false;
3538 }
3539
96b07a49 3540 if (stmt_can_throw_internal (stmt))
42a6710e 3541 {
6d8fb6cf 3542 if (dump_enabled_p ())
42a6710e 3543 {
7bd765d4 3544 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3545 "not vectorized: statement can throw an "
3546 "exception ");
3547 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
78bb46f5 3548 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
42a6710e 3549 }
3c9feaca 3550
e2c5c678 3551 if (is_a <bb_vec_info> (vinfo))
07e3bcbf 3552 break;
3c9feaca 3553
0bd6d857 3554 if (gatherscatter != SG_NONE || simd_lane_access)
16dfb112 3555 free_data_ref (dr);
42a6710e 3556 return false;
3557 }
3558
87c952b8 3559 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
3560 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
3561 {
6d8fb6cf 3562 if (dump_enabled_p ())
87c952b8 3563 {
7bd765d4 3564 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3565 "not vectorized: statement is bitfield "
3566 "access ");
3567 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
78bb46f5 3568 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
87c952b8 3569 }
3570
e2c5c678 3571 if (is_a <bb_vec_info> (vinfo))
07e3bcbf 3572 break;
87c952b8 3573
0bd6d857 3574 if (gatherscatter != SG_NONE || simd_lane_access)
87c952b8 3575 free_data_ref (dr);
3576 return false;
3577 }
3578
3579 base = unshare_expr (DR_BASE_ADDRESS (dr));
3580 offset = unshare_expr (DR_OFFSET (dr));
3581 init = unshare_expr (DR_INIT (dr));
3582
c71d3c24 3583 if (is_gimple_call (stmt)
3584 && (!gimple_call_internal_p (stmt)
3585 || (gimple_call_internal_fn (stmt) != IFN_MASK_LOAD
3586 && gimple_call_internal_fn (stmt) != IFN_MASK_STORE)))
635bf3aa 3587 {
6d8fb6cf 3588 if (dump_enabled_p ())
635bf3aa 3589 {
7bd765d4 3590 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
78bb46f5 3591 "not vectorized: dr in a call ");
7bd765d4 3592 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
78bb46f5 3593 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
635bf3aa 3594 }
3595
e2c5c678 3596 if (is_a <bb_vec_info> (vinfo))
07e3bcbf 3597 break;
635bf3aa 3598
0bd6d857 3599 if (gatherscatter != SG_NONE || simd_lane_access)
635bf3aa 3600 free_data_ref (dr);
3601 return false;
3602 }
3603
fb85abff 3604 /* Update DR field in stmt_vec_info struct. */
fb85abff 3605
 3606 /* If the dataref is in an inner-loop of the loop that is considered
3607 for vectorization, we also want to analyze the access relative to
48e1416a 3608 the outer-loop (DR contains information only relative to the
fb85abff 3609 inner-most enclosing loop). We do that by building a reference to the
3610 first location accessed by the inner-loop, and analyze it relative to
48e1416a 3611 the outer-loop. */
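      /* An illustrative sketch, not from the original source: for a doubly
	 nested loop like

	   for (i = 0; i < N; i++)
	     for (j = 0; j < M; j++)
	       s += in[i + j];

	 the DR for in[i + j] describes the access relative to the inner
	 j-loop; the code below builds the first location accessed by the
	 inner loop, *(in + i), and re-analyzes it as an affine IV of the
	 outer i-loop.  */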
3612 if (loop && nested_in_vect_loop_p (loop, stmt))
fb85abff 3613 {
3614 tree outer_step, outer_base, outer_init;
3615 HOST_WIDE_INT pbitsize, pbitpos;
3616 tree poffset;
3754d046 3617 machine_mode pmode;
fb85abff 3618 int punsignedp, pvolatilep;
3619 affine_iv base_iv, offset_iv;
3620 tree dinit;
3621
48e1416a 3622 /* Build a reference to the first location accessed by the
282bf14c 3623 inner-loop: *(BASE+INIT). (The first location is actually
fb85abff 3624 BASE+INIT+OFFSET, but we add OFFSET separately later). */
3625 tree inner_base = build_fold_indirect_ref
2cc66f2a 3626 (fold_build_pointer_plus (base, init));
fb85abff 3627
6d8fb6cf 3628 if (dump_enabled_p ())
fb85abff 3629 {
7bd765d4 3630 dump_printf_loc (MSG_NOTE, vect_location,
3631 "analyze in outer-loop: ");
3632 dump_generic_expr (MSG_NOTE, TDF_SLIM, inner_base);
78bb46f5 3633 dump_printf (MSG_NOTE, "\n");
fb85abff 3634 }
3635
48e1416a 3636 outer_base = get_inner_reference (inner_base, &pbitsize, &pbitpos,
fb85abff 3637 &poffset, &pmode, &punsignedp, &pvolatilep, false);
3638 gcc_assert (outer_base != NULL_TREE);
3639
3640 if (pbitpos % BITS_PER_UNIT != 0)
3641 {
6d8fb6cf 3642 if (dump_enabled_p ())
7bd765d4 3643 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3644 "failed: bit offset alignment.\n");
fb85abff 3645 return false;
3646 }
3647
3648 outer_base = build_fold_addr_expr (outer_base);
48e1416a 3649 if (!simple_iv (loop, loop_containing_stmt (stmt), outer_base,
fb85abff 3650 &base_iv, false))
3651 {
6d8fb6cf 3652 if (dump_enabled_p ())
78bb46f5 3653 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7bd765d4 3654 "failed: evolution of base is not affine.\n");
fb85abff 3655 return false;
3656 }
3657
3658 if (offset)
3659 {
3660 if (poffset)
48e1416a 3661 poffset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset), offset,
fb85abff 3662 poffset);
3663 else
3664 poffset = offset;
3665 }
3666
3667 if (!poffset)
3668 {
3669 offset_iv.base = ssize_int (0);
3670 offset_iv.step = ssize_int (0);
3671 }
48e1416a 3672 else if (!simple_iv (loop, loop_containing_stmt (stmt), poffset,
fb85abff 3673 &offset_iv, false))
3674 {
6d8fb6cf 3675 if (dump_enabled_p ())
78bb46f5 3676 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7bd765d4 3677 "evolution of offset is not affine.\n");
fb85abff 3678 return false;
3679 }
3680
3681 outer_init = ssize_int (pbitpos / BITS_PER_UNIT);
3682 split_constant_offset (base_iv.base, &base_iv.base, &dinit);
3683 outer_init = size_binop (PLUS_EXPR, outer_init, dinit);
3684 split_constant_offset (offset_iv.base, &offset_iv.base, &dinit);
3685 outer_init = size_binop (PLUS_EXPR, outer_init, dinit);
3686
3687 outer_step = size_binop (PLUS_EXPR,
3688 fold_convert (ssizetype, base_iv.step),
3689 fold_convert (ssizetype, offset_iv.step));
3690
3691 STMT_VINFO_DR_STEP (stmt_info) = outer_step;
3692 /* FIXME: Use canonicalize_base_object_address (base_iv.base); */
48e1416a 3693 STMT_VINFO_DR_BASE_ADDRESS (stmt_info) = base_iv.base;
fb85abff 3694 STMT_VINFO_DR_INIT (stmt_info) = outer_init;
48e1416a 3695 STMT_VINFO_DR_OFFSET (stmt_info) =
fb85abff 3696 fold_convert (ssizetype, offset_iv.base);
48e1416a 3697 STMT_VINFO_DR_ALIGNED_TO (stmt_info) =
fb85abff 3698 size_int (highest_pow2_factor (offset_iv.base));
3699
6d8fb6cf 3700 if (dump_enabled_p ())
fb85abff 3701 {
7bd765d4 3702 dump_printf_loc (MSG_NOTE, vect_location,
3703 "\touter base_address: ");
3704 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3705 STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
3706 dump_printf (MSG_NOTE, "\n\touter offset from base address: ");
3707 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3708 STMT_VINFO_DR_OFFSET (stmt_info));
3709 dump_printf (MSG_NOTE,
3710 "\n\touter constant offset from base address: ");
3711 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3712 STMT_VINFO_DR_INIT (stmt_info));
3713 dump_printf (MSG_NOTE, "\n\touter step: ");
3714 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3715 STMT_VINFO_DR_STEP (stmt_info));
3716 dump_printf (MSG_NOTE, "\n\touter aligned to: ");
3717 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3718 STMT_VINFO_DR_ALIGNED_TO (stmt_info));
78bb46f5 3719 dump_printf (MSG_NOTE, "\n");
fb85abff 3720 }
3721 }
3722
3723 if (STMT_VINFO_DATA_REF (stmt_info))
3724 {
6d8fb6cf 3725 if (dump_enabled_p ())
fb85abff 3726 {
7bd765d4 3727 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3728 "not vectorized: more than one data ref "
3729 "in stmt: ");
3730 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
78bb46f5 3731 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
fb85abff 3732 }
3c9feaca 3733
e2c5c678 3734 if (is_a <bb_vec_info> (vinfo))
07e3bcbf 3735 break;
3c9feaca 3736
0bd6d857 3737 if (gatherscatter != SG_NONE || simd_lane_access)
16dfb112 3738 free_data_ref (dr);
fb85abff 3739 return false;
3740 }
f083cd24 3741
fb85abff 3742 STMT_VINFO_DATA_REF (stmt_info) = dr;
3d483a94 3743 if (simd_lane_access)
3744 {
3745 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) = true;
53c3c39b 3746 free_data_ref (datarefs[i]);
3d483a94 3747 datarefs[i] = dr;
3748 }
48e1416a 3749
fb85abff 3750 /* Set vectype for STMT. */
3751 scalar_type = TREE_TYPE (DR_REF (dr));
53c3c39b 3752 STMT_VINFO_VECTYPE (stmt_info)
3753 = get_vectype_for_scalar_type (scalar_type);
48e1416a 3754 if (!STMT_VINFO_VECTYPE (stmt_info))
fb85abff 3755 {
6d8fb6cf 3756 if (dump_enabled_p ())
fb85abff 3757 {
78bb46f5 3758 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7bd765d4 3759 "not vectorized: no vectype for stmt: ");
3760 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3761 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
3762 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
3763 scalar_type);
78bb46f5 3764 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
fb85abff 3765 }
6ea6a380 3766
e2c5c678 3767 if (is_a <bb_vec_info> (vinfo))
07e3bcbf 3768 break;
16dfb112 3769
0bd6d857 3770 if (gatherscatter != SG_NONE || simd_lane_access)
16dfb112 3771 {
3772 STMT_VINFO_DATA_REF (stmt_info) = NULL;
0bd6d857 3773 if (gatherscatter != SG_NONE)
53c3c39b 3774 free_data_ref (dr);
16dfb112 3775 }
3776 return false;
fb85abff 3777 }
0bf5f81b 3778 else
3779 {
3780 if (dump_enabled_p ())
3781 {
3782 dump_printf_loc (MSG_NOTE, vect_location,
3783 "got vectype for stmt: ");
3784 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
3785 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3786 STMT_VINFO_VECTYPE (stmt_info));
78bb46f5 3787 dump_printf (MSG_NOTE, "\n");
0bf5f81b 3788 }
3789 }
91a74fc6 3790
3791 /* Adjust the minimal vectorization factor according to the
3792 vector type. */
3793 vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
3794 if (vf > *min_vf)
3795 *min_vf = vf;
16dfb112 3796
0bd6d857 3797 if (gatherscatter != SG_NONE)
16dfb112 3798 {
16dfb112 3799 tree off;
e2c5c678 3800 if (!vect_check_gather_scatter (stmt, as_a <loop_vec_info> (vinfo),
3801 NULL, &off, NULL)
0bd6d857 3802 || get_vectype_for_scalar_type (TREE_TYPE (off)) == NULL_TREE)
16dfb112 3803 {
706567b8 3804 STMT_VINFO_DATA_REF (stmt_info) = NULL;
3805 free_data_ref (dr);
6d8fb6cf 3806 if (dump_enabled_p ())
16dfb112 3807 {
0bd6d857 3808 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3809 (gatherscatter == GATHER) ?
3810 "not vectorized: not suitable for gather "
3811 "load " :
3812 "not vectorized: not suitable for scatter "
3813 "store ");
7bd765d4 3814 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
78bb46f5 3815 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
16dfb112 3816 }
3817 return false;
3818 }
3819
f1f41a6c 3820 datarefs[i] = dr;
0bd6d857 3821 STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
f634c3e9 3822 }
0bd6d857 3823
e2c5c678 3824 else if (is_a <loop_vec_info> (vinfo)
f634c3e9 3825 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
3826 {
e1c75243 3827 if (nested_in_vect_loop_p (loop, stmt))
f634c3e9 3828 {
6d8fb6cf 3829 if (dump_enabled_p ())
f634c3e9 3830 {
7bd765d4 3831 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3832 "not vectorized: not suitable for strided "
3833 "load ");
3834 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
78bb46f5 3835 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
f634c3e9 3836 }
3837 return false;
3838 }
e1c75243 3839 STMT_VINFO_STRIDED_P (stmt_info) = true;
16dfb112 3840 }
fb85abff 3841 }
48e1416a 3842
07e3bcbf 3843 /* If we stopped analysis at the first dataref we could not analyze
 3844 when trying to vectorize a basic block, mark the rest of the datarefs
3845 as not vectorizable and truncate the vector of datarefs. That
3846 avoids spending useless time in analyzing their dependence. */
3847 if (i != datarefs.length ())
3848 {
e2c5c678 3849 gcc_assert (is_a <bb_vec_info> (vinfo));
07e3bcbf 3850 for (unsigned j = i; j < datarefs.length (); ++j)
3851 {
3852 data_reference_p dr = datarefs[j];
3853 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
3854 free_data_ref (dr);
3855 }
3856 datarefs.truncate (i);
3857 }
3858
fb85abff 3859 return true;
3860}
3861
3862
3863/* Function vect_get_new_vect_var.
3864
282bf14c 3865 Returns a name for a new variable. The current naming scheme prefixes
48e1416a 3866 vectorizer-generated variables with "vect", "stmp" or "vectp" (depending
 3867 on the value of VAR_KIND), followed by an underscore and NAME if NAME is
fb85abff 3868 provided. */
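/* Illustrative example (not from the original source): with VAR_KIND
   vect_pointer_var and NAME "a" this creates a temporary named "vectp_a";
   with a NULL NAME it uses the bare prefix, e.g. "vect".  */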
3869
3870tree
3871vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
3872{
3873 const char *prefix;
3874 tree new_vect_var;
3875
3876 switch (var_kind)
3877 {
3878 case vect_simple_var:
0bf5f81b 3879 prefix = "vect";
fb85abff 3880 break;
3881 case vect_scalar_var:
0bf5f81b 3882 prefix = "stmp";
fb85abff 3883 break;
3884 case vect_pointer_var:
0bf5f81b 3885 prefix = "vectp";
fb85abff 3886 break;
3887 default:
3888 gcc_unreachable ();
3889 }
3890
3891 if (name)
3892 {
0bf5f81b 3893 char* tmp = concat (prefix, "_", name, NULL);
35244493 3894 new_vect_var = create_tmp_reg (type, tmp);
fb85abff 3895 free (tmp);
3896 }
3897 else
35244493 3898 new_vect_var = create_tmp_reg (type, prefix);
fb85abff 3899
3900 return new_vect_var;
3901}
3902
23ffec42 3903/* Like vect_get_new_vect_var but return an SSA name. */
3904
3905tree
3906vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
3907{
3908 const char *prefix;
3909 tree new_vect_var;
3910
3911 switch (var_kind)
3912 {
3913 case vect_simple_var:
3914 prefix = "vect";
3915 break;
3916 case vect_scalar_var:
3917 prefix = "stmp";
3918 break;
3919 case vect_pointer_var:
3920 prefix = "vectp";
3921 break;
3922 default:
3923 gcc_unreachable ();
3924 }
3925
3926 if (name)
3927 {
3928 char* tmp = concat (prefix, "_", name, NULL);
3929 new_vect_var = make_temp_ssa_name (type, NULL, tmp);
3930 free (tmp);
3931 }
3932 else
3933 new_vect_var = make_temp_ssa_name (type, NULL, prefix);
3934
3935 return new_vect_var;
3936}
3937
4a2edd22 3938/* Duplicate ptr info and set alignment/misalignment on NAME from DR. */
3939
3940static void
3941vect_duplicate_ssa_name_ptr_info (tree name, data_reference *dr,
3942 stmt_vec_info stmt_info)
3943{
3944 duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr));
3945 unsigned int align = TYPE_ALIGN_UNIT (STMT_VINFO_VECTYPE (stmt_info));
3946 int misalign = DR_MISALIGNMENT (dr);
3947 if (misalign == -1)
3948 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
3949 else
3950 set_ptr_info_alignment (SSA_NAME_PTR_INFO (name), align, misalign);
3951}
fb85abff 3952
3953/* Function vect_create_addr_base_for_vector_ref.
3954
3955 Create an expression that computes the address of the first memory location
3956 that will be accessed for a data reference.
3957
3958 Input:
3959 STMT: The statement containing the data reference.
3960 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
 3961 OFFSET: Optional. If supplied, it is added to the initial address.
3962 LOOP: Specify relative to which loop-nest should the address be computed.
3963 For example, when the dataref is in an inner-loop nested in an
3964 outer-loop that is now being vectorized, LOOP can be either the
282bf14c 3965 outer-loop, or the inner-loop. The first memory location accessed
fb85abff 3966 by the following dataref ('in' points to short):
3967
3968 for (i=0; i<N; i++)
3969 for (j=0; j<M; j++)
3970 s += in[i+j]
3971
3972 is as follows:
3973 if LOOP=i_loop: &in (relative to i_loop)
3974 if LOOP=j_loop: &in+i*2B (relative to j_loop)
1ec61bbd 3975 BYTE_OFFSET: Optional, defaulted to NULL. If supplied, it is added to the
 3976 initial address. Unlike OFFSET, which is the number of elements to
3977 be added, BYTE_OFFSET is measured in bytes.
fb85abff 3978
3979 Output:
48e1416a 3980 1. Return an SSA_NAME whose value is the address of the memory location of
fb85abff 3981 the first vector of the data reference.
3982 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
3983 these statement(s) which define the returned SSA_NAME.
3984
3985 FORNOW: We are only handling array accesses with step 1. */
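/* A rough sketch of the address arithmetic below (illustrative only, for
   the loop case, treating all quantities as plain integers):

     addr_base = DR_BASE_ADDRESS
		 + (DR_OFFSET + DR_INIT)
		 + OFFSET * sizeof (element)	  // only if OFFSET is given
		 + BYTE_OFFSET;			  // only if BYTE_OFFSET is given

   the result is forced into an SSA_NAME and its defining statements are
   appended to NEW_STMT_LIST.  */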
3986
3987tree
42acab1c 3988vect_create_addr_base_for_vector_ref (gimple *stmt,
fb85abff 3989 gimple_seq *new_stmt_list,
3990 tree offset,
1ec61bbd 3991 struct loop *loop,
3992 tree byte_offset)
fb85abff 3993{
3994 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3995 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
90d4c4af 3996 tree data_ref_base;
3c18ea71 3997 const char *base_name;
90d4c4af 3998 tree addr_base;
fb85abff 3999 tree dest;
4000 gimple_seq seq = NULL;
90d4c4af 4001 tree base_offset;
4002 tree init;
f083cd24 4003 tree vect_ptr_type;
fb85abff 4004 tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
37545e54 4005 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
fb85abff 4006
37545e54 4007 if (loop_vinfo && loop && loop != (gimple_bb (stmt))->loop_father)
fb85abff 4008 {
37545e54 4009 struct loop *outer_loop = LOOP_VINFO_LOOP (loop_vinfo);
fb85abff 4010
37545e54 4011 gcc_assert (nested_in_vect_loop_p (outer_loop, stmt));
fb85abff 4012
4013 data_ref_base = unshare_expr (STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
4014 base_offset = unshare_expr (STMT_VINFO_DR_OFFSET (stmt_info));
4015 init = unshare_expr (STMT_VINFO_DR_INIT (stmt_info));
4016 }
90d4c4af 4017 else
4018 {
4019 data_ref_base = unshare_expr (DR_BASE_ADDRESS (dr));
4020 base_offset = unshare_expr (DR_OFFSET (dr));
4021 init = unshare_expr (DR_INIT (dr));
4022 }
fb85abff 4023
37545e54 4024 if (loop_vinfo)
3c18ea71 4025 base_name = get_name (data_ref_base);
37545e54 4026 else
4027 {
4028 base_offset = ssize_int (0);
4029 init = ssize_int (0);
3c18ea71 4030 base_name = get_name (DR_REF (dr));
48e1416a 4031 }
37545e54 4032
fb85abff 4033 /* Create base_offset */
4034 base_offset = size_binop (PLUS_EXPR,
4035 fold_convert (sizetype, base_offset),
4036 fold_convert (sizetype, init));
fb85abff 4037
4038 if (offset)
4039 {
fb85abff 4040 offset = fold_build2 (MULT_EXPR, sizetype,
4041 fold_convert (sizetype, offset), step);
4042 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4043 base_offset, offset);
fb85abff 4044 }
1ec61bbd 4045 if (byte_offset)
4046 {
4047 byte_offset = fold_convert (sizetype, byte_offset);
4048 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4049 base_offset, byte_offset);
4050 }
fb85abff 4051
4052 /* base + base_offset */
37545e54 4053 if (loop_vinfo)
2cc66f2a 4054 addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
37545e54 4055 else
4056 {
182cf5a9 4057 addr_base = build1 (ADDR_EXPR,
4058 build_pointer_type (TREE_TYPE (DR_REF (dr))),
4059 unshare_expr (DR_REF (dr)));
37545e54 4060 }
48e1416a 4061
fb85abff 4062 vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
90d4c4af 4063 dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
8ee959f8 4064 addr_base = force_gimple_operand (addr_base, &seq, true, dest);
fb85abff 4065 gimple_seq_add_seq (new_stmt_list, seq);
4066
f544b9a4 4067 if (DR_PTR_INFO (dr)
8ee959f8 4068 && TREE_CODE (addr_base) == SSA_NAME
4069 && !SSA_NAME_PTR_INFO (addr_base))
1259ab70 4070 {
4a2edd22 4071 vect_duplicate_ssa_name_ptr_info (addr_base, dr, stmt_info);
4072 if (offset || byte_offset)
90d4c4af 4073 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
1259ab70 4074 }
f544b9a4 4075
6d8fb6cf 4076 if (dump_enabled_p ())
fb85abff 4077 {
7bd765d4 4078 dump_printf_loc (MSG_NOTE, vect_location, "created ");
90d4c4af 4079 dump_generic_expr (MSG_NOTE, TDF_SLIM, addr_base);
78bb46f5 4080 dump_printf (MSG_NOTE, "\n");
fb85abff 4081 }
f083cd24 4082
90d4c4af 4083 return addr_base;
fb85abff 4084}
4085
4086
4087/* Function vect_create_data_ref_ptr.
4088
bd5ba09f 4089 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
4090 location accessed in the loop by STMT, along with the def-use update
4091 chain to appropriately advance the pointer through the loop iterations.
4092 Also set aliasing information for the pointer. This pointer is used by
4093 the callers to this function to create a memory reference expression for
4094 vector load/store access.
fb85abff 4095
4096 Input:
4097 1. STMT: a stmt that references memory. Expected to be of the form
4098 GIMPLE_ASSIGN <name, data-ref> or
4099 GIMPLE_ASSIGN <data-ref, name>.
bd5ba09f 4100 2. AGGR_TYPE: the type of the reference, which should be either a vector
4101 or an array.
4102 3. AT_LOOP: the loop where the vector memref is to be created.
4103 4. OFFSET (optional): an offset to be added to the initial address accessed
fb85abff 4104 by the data-ref in STMT.
bd5ba09f 4105 5. BSI: location where the new stmts are to be placed if there is no loop
4106 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
fb85abff 4107 pointing to the initial address.
1ec61bbd 4108 7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
4109 to the initial address accessed by the data-ref in STMT. This is
4110 similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
4111 in bytes.
fb85abff 4112
4113 Output:
4114 1. Declare a new ptr to vector_type, and have it point to the base of the
 4115 data reference (initial address accessed by the data reference).
4116 For example, for vector of type V8HI, the following code is generated:
4117
bd5ba09f 4118 v8hi *ap;
4119 ap = (v8hi *)initial_address;
fb85abff 4120
4121 if OFFSET is not supplied:
4122 initial_address = &a[init];
4123 if OFFSET is supplied:
4124 initial_address = &a[init + OFFSET];
1ec61bbd 4125 if BYTE_OFFSET is supplied:
4126 initial_address = &a[init] + BYTE_OFFSET;
fb85abff 4127
4128 Return the initial_address in INITIAL_ADDRESS.
4129
4130 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
48e1416a 4131 update the pointer in each iteration of the loop.
fb85abff 4132
4133 Return the increment stmt that updates the pointer in PTR_INCR.
4134
48e1416a 4135 3. Set INV_P to true if the access pattern of the data reference in the
282bf14c 4136 vectorized loop is invariant. Set it to false otherwise.
fb85abff 4137
4138 4. Return the pointer. */
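/* An illustrative sketch, not from the original source: for a simple V4SI
   access the pointer created here corresponds roughly to

     ap = (v4si *) initial_address;	   // in the loop preheader
   loop:
     p = PHI <ap (preheader), p' (latch)>
     ... vector load/store through p ...
     p' = p + 16;			   // step = sizeof (v4si)

   with the step negated for a negative DR_STEP and zero for an invariant
   access.  */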
4139
4140tree
42acab1c 4141vect_create_data_ref_ptr (gimple *stmt, tree aggr_type, struct loop *at_loop,
bd5ba09f 4142 tree offset, tree *initial_address,
42acab1c 4143 gimple_stmt_iterator *gsi, gimple **ptr_incr,
1ec61bbd 4144 bool only_init, bool *inv_p, tree byte_offset)
fb85abff 4145{
3c18ea71 4146 const char *base_name;
fb85abff 4147 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4148 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
37545e54 4149 struct loop *loop = NULL;
4150 bool nested_in_vect_loop = false;
4151 struct loop *containing_loop = NULL;
bd5ba09f 4152 tree aggr_ptr_type;
4153 tree aggr_ptr;
fb85abff 4154 tree new_temp;
fb85abff 4155 gimple_seq new_stmt_list = NULL;
37545e54 4156 edge pe = NULL;
fb85abff 4157 basic_block new_bb;
bd5ba09f 4158 tree aggr_ptr_init;
fb85abff 4159 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
bd5ba09f 4160 tree aptr;
fb85abff 4161 gimple_stmt_iterator incr_gsi;
4162 bool insert_after;
4163 tree indx_before_incr, indx_after_incr;
42acab1c 4164 gimple *incr;
fb85abff 4165 tree step;
37545e54 4166 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
48e1416a 4167
bd5ba09f 4168 gcc_assert (TREE_CODE (aggr_type) == ARRAY_TYPE
4169 || TREE_CODE (aggr_type) == VECTOR_TYPE);
4170
37545e54 4171 if (loop_vinfo)
4172 {
4173 loop = LOOP_VINFO_LOOP (loop_vinfo);
4174 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4175 containing_loop = (gimple_bb (stmt))->loop_father;
4176 pe = loop_preheader_edge (loop);
4177 }
4178 else
4179 {
4180 gcc_assert (bb_vinfo);
4181 only_init = true;
4182 *ptr_incr = NULL;
4183 }
48e1416a 4184
fb85abff 4185 /* Check the step (evolution) of the load in LOOP, and record
4186 whether it's invariant. */
4187 if (nested_in_vect_loop)
4188 step = STMT_VINFO_DR_STEP (stmt_info);
4189 else
4190 step = DR_STEP (STMT_VINFO_DATA_REF (stmt_info));
48e1416a 4191
8bbe6b75 4192 if (integer_zerop (step))
fb85abff 4193 *inv_p = true;
4194 else
4195 *inv_p = false;
4196
4197 /* Create an expression for the first address accessed by this load
48e1416a 4198 in LOOP. */
3c18ea71 4199 base_name = get_name (DR_BASE_ADDRESS (dr));
fb85abff 4200
6d8fb6cf 4201 if (dump_enabled_p ())
fb85abff 4202 {
3c18ea71 4203 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
7bd765d4 4204 dump_printf_loc (MSG_NOTE, vect_location,
4205 "create %s-pointer variable to type: ",
f3d35d4d 4206 get_tree_code_name (TREE_CODE (aggr_type)));
7bd765d4 4207 dump_generic_expr (MSG_NOTE, TDF_SLIM, aggr_type);
3c18ea71 4208 if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
7bd765d4 4209 dump_printf (MSG_NOTE, " vectorizing an array ref: ");
19bacd59 4210 else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4211 dump_printf (MSG_NOTE, " vectorizing a vector ref: ");
3c18ea71 4212 else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
7bd765d4 4213 dump_printf (MSG_NOTE, " vectorizing a record based array ref: ");
3c18ea71 4214 else
7bd765d4 4215 dump_printf (MSG_NOTE, " vectorizing a pointer ref: ");
3c18ea71 4216 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_BASE_OBJECT (dr));
78bb46f5 4217 dump_printf (MSG_NOTE, "\n");
fb85abff 4218 }
4219
90d4c4af 4220 /* (1) Create the new aggregate-pointer variable.
4221 Vector and array types inherit the alias set of their component
bd5ba09f 4222 type by default so we need to use a ref-all pointer if the data
4223 reference does not conflict with the created aggregated data
4224 reference because it is not addressable. */
90d4c4af 4225 bool need_ref_all = false;
4226 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
a34701c9 4227 get_alias_set (DR_REF (dr))))
90d4c4af 4228 need_ref_all = true;
a34701c9 4229 /* Likewise for any of the data references in the stmt group. */
21009880 4230 else if (STMT_VINFO_GROUP_SIZE (stmt_info) > 1)
fb85abff 4231 {
42acab1c 4232 gimple *orig_stmt = STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info);
dd277d48 4233 do
4234 {
90d4c4af 4235 stmt_vec_info sinfo = vinfo_for_stmt (orig_stmt);
4236 struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
4237 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4238 get_alias_set (DR_REF (sdr))))
dd277d48 4239 {
90d4c4af 4240 need_ref_all = true;
dd277d48 4241 break;
4242 }
90d4c4af 4243 orig_stmt = STMT_VINFO_GROUP_NEXT_ELEMENT (sinfo);
dd277d48 4244 }
4245 while (orig_stmt);
fb85abff 4246 }
90d4c4af 4247 aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
4248 need_ref_all);
4249 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
4250
fb85abff 4251
282bf14c 4252 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4253 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4254 def-use update cycles for the pointer: one relative to the outer-loop
4255 (LOOP), which is what steps (3) and (4) below do. The other is relative
4256 to the inner-loop (which is the inner-most loop containing the dataref),
 4257 and this is done by step (5) below.
fb85abff 4258
282bf14c 4259 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4260 inner-most loop, and so steps (3),(4) work the same, and step (5) is
4261 redundant. Steps (3),(4) create the following:
fb85abff 4262
4263 vp0 = &base_addr;
4264 LOOP: vp1 = phi(vp0,vp2)
48e1416a 4265 ...
fb85abff 4266 ...
4267 vp2 = vp1 + step
4268 goto LOOP
48e1416a 4269
282bf14c 4270 If there is an inner-loop nested in loop, then step (5) will also be
4271 applied, and an additional update in the inner-loop will be created:
fb85abff 4272
4273 vp0 = &base_addr;
4274 LOOP: vp1 = phi(vp0,vp2)
4275 ...
4276 inner: vp3 = phi(vp1,vp4)
4277 vp4 = vp3 + inner_step
4278 if () goto inner
4279 ...
4280 vp2 = vp1 + step
4281 if () goto LOOP */
4282
bd5ba09f 4283 /* (2) Calculate the initial address of the aggregate-pointer, and set
4284 the aggregate-pointer to point to it before the loop. */
fb85abff 4285
1ec61bbd 4286 /* Create: (&(base[init_val+offset]+byte_offset)) in the loop preheader. */
fb85abff 4287
4288 new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
1ec61bbd 4289 offset, loop, byte_offset);
fb85abff 4290 if (new_stmt_list)
4291 {
37545e54 4292 if (pe)
4293 {
4294 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
4295 gcc_assert (!new_bb);
4296 }
4297 else
bee862b6 4298 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
fb85abff 4299 }
4300
4301 *initial_address = new_temp;
8ee959f8 4302 aggr_ptr_init = new_temp;
fb85abff 4303
bd5ba09f 4304 /* (3) Handle the updating of the aggregate-pointer inside the loop.
282bf14c 4305 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4306 inner-loop nested in LOOP (during outer-loop vectorization). */
fb85abff 4307
37545e54 4308 /* No update in loop is required. */
48e1416a 4309 if (only_init && (!loop_vinfo || at_loop == loop))
bd5ba09f 4310 aptr = aggr_ptr_init;
fb85abff 4311 else
4312 {
bd5ba09f 4313 /* The step of the aggregate pointer is the type size. */
8bbe6b75 4314 tree iv_step = TYPE_SIZE_UNIT (aggr_type);
48e1416a 4315 /* One exception to the above is when the scalar step of the load in
fb85abff 4316 LOOP is zero. In this case the step here is also zero. */
4317 if (*inv_p)
8bbe6b75 4318 iv_step = size_zero_node;
4319 else if (tree_int_cst_sgn (step) == -1)
4320 iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
fb85abff 4321
4322 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4323
bd5ba09f 4324 create_iv (aggr_ptr_init,
8bbe6b75 4325 fold_convert (aggr_ptr_type, iv_step),
bd5ba09f 4326 aggr_ptr, loop, &incr_gsi, insert_after,
fb85abff 4327 &indx_before_incr, &indx_after_incr);
4328 incr = gsi_stmt (incr_gsi);
e2c5c678 4329 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
fb85abff 4330
4331 /* Copy the points-to information if it exists. */
4332 if (DR_PTR_INFO (dr))
4333 {
4a2edd22 4334 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr, stmt_info);
4335 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr, stmt_info);
fb85abff 4336 }
fb85abff 4337 if (ptr_incr)
4338 *ptr_incr = incr;
4339
bd5ba09f 4340 aptr = indx_before_incr;
fb85abff 4341 }
4342
4343 if (!nested_in_vect_loop || only_init)
bd5ba09f 4344 return aptr;
fb85abff 4345
4346
bd5ba09f 4347 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
282bf14c 4348 nested in LOOP, if it exists. */
fb85abff 4349
4350 gcc_assert (nested_in_vect_loop);
4351 if (!only_init)
4352 {
4353 standard_iv_increment_position (containing_loop, &incr_gsi,
4354 &insert_after);
bd5ba09f 4355 create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
fb85abff 4356 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
4357 &indx_after_incr);
4358 incr = gsi_stmt (incr_gsi);
e2c5c678 4359 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
fb85abff 4360
4361 /* Copy the points-to information if it exists. */
4362 if (DR_PTR_INFO (dr))
4363 {
4a2edd22 4364 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr, stmt_info);
4365 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr, stmt_info);
fb85abff 4366 }
fb85abff 4367 if (ptr_incr)
4368 *ptr_incr = incr;
4369
48e1416a 4370 return indx_before_incr;
fb85abff 4371 }
4372 else
4373 gcc_unreachable ();
4374}
4375
4376
4377/* Function bump_vector_ptr
4378
4379 Increment a pointer (to a vector type) by vector-size. If requested,
48e1416a 4380 i.e. if PTR-INCR is given, then also connect the new increment stmt
fb85abff 4381 to the existing def-use update-chain of the pointer, by modifying
4382 the PTR_INCR as illustrated below:
4383
4384 The pointer def-use update-chain before this function:
4385 DATAREF_PTR = phi (p_0, p_2)
4386 ....
48e1416a 4387 PTR_INCR: p_2 = DATAREF_PTR + step
fb85abff 4388
4389 The pointer def-use update-chain after this function:
4390 DATAREF_PTR = phi (p_0, p_2)
4391 ....
4392 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4393 ....
4394 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
4395
4396 Input:
48e1416a 4397 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
fb85abff 4398 in the loop.
48e1416a 4399 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
fb85abff 4400 the loop. The increment amount across iterations is expected
48e1416a 4401 to be vector_size.
fb85abff 4402 BSI - location where the new update stmt is to be placed.
4403 STMT - the original scalar memory-access stmt that is being vectorized.
4404 BUMP - optional. The offset by which to bump the pointer. If not given,
4405 the offset is assumed to be vector_size.
4406
4407 Output: Return NEW_DATAREF_PTR as illustrated above.
48e1416a 4408
fb85abff 4409*/
4410
4411tree
42acab1c 4412bump_vector_ptr (tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
4413 gimple *stmt, tree bump)
fb85abff 4414{
4415 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4416 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4417 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
fb85abff 4418 tree update = TYPE_SIZE_UNIT (vectype);
1a91d914 4419 gassign *incr_stmt;
fb85abff 4420 ssa_op_iter iter;
4421 use_operand_p use_p;
4422 tree new_dataref_ptr;
4423
4424 if (bump)
4425 update = bump;
48e1416a 4426
8ee959f8 4427 if (TREE_CODE (dataref_ptr) == SSA_NAME)
4428 new_dataref_ptr = copy_ssa_name (dataref_ptr);
4429 else
4430 new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
e9cf809e 4431 incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
4432 dataref_ptr, update);
fb85abff 4433 vect_finish_stmt_generation (stmt, incr_stmt, gsi);
4434
4435 /* Copy the points-to information if it exists. */
4436 if (DR_PTR_INFO (dr))
1259ab70 4437 {
4438 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
ceea063b 4439 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
1259ab70 4440 }
fb85abff 4441
4442 if (!ptr_incr)
4443 return new_dataref_ptr;
4444
4445 /* Update the vector-pointer's cross-iteration increment. */
4446 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
4447 {
4448 tree use = USE_FROM_PTR (use_p);
4449
4450 if (use == dataref_ptr)
4451 SET_USE (use_p, new_dataref_ptr);
4452 else
4453 gcc_assert (tree_int_cst_compare (use, update) == 0);
4454 }
4455
4456 return new_dataref_ptr;
4457}
4458
4459
4460/* Function vect_create_destination_var.
4461
4462 Create a new temporary of type VECTYPE. */
4463
4464tree
4465vect_create_destination_var (tree scalar_dest, tree vectype)
4466{
4467 tree vec_dest;
0bf5f81b 4468 const char *name;
4469 char *new_name;
fb85abff 4470 tree type;
4471 enum vect_var_kind kind;
4472
4473 kind = vectype ? vect_simple_var : vect_scalar_var;
4474 type = vectype ? vectype : TREE_TYPE (scalar_dest);
4475
4476 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
4477
0bf5f81b 4478 name = get_name (scalar_dest);
4479 if (name)
b33b6e58 4480 new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
0bf5f81b 4481 else
b33b6e58 4482 new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
fb85abff 4483 vec_dest = vect_get_new_vect_var (type, kind, new_name);
0bf5f81b 4484 free (new_name);
fb85abff 4485
4486 return vec_dest;
4487}
4488
ee612634 4489/* Function vect_grouped_store_supported.
fb85abff 4490
42f6a6e8 4491 Returns TRUE if interleave high and interleave low permutations
4492 are supported, and FALSE otherwise. */
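/* Illustrative example (not from the original source): for V4SI vectors
   and COUNT == 2 the permutations checked below are the interleave-high
   selector {0, 4, 1, 5} and the interleave-low selector {2, 6, 3, 7}.  */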
fb85abff 4493
4494bool
ee612634 4495vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
fb85abff 4496{
3754d046 4497 machine_mode mode = TYPE_MODE (vectype);
48e1416a 4498
d53391a8 4499 /* vect_permute_store_chain requires the group size to be equal to 3 or
4500 be a power of two. */
4501 if (count != 3 && exact_log2 (count) == -1)
481fc474 4502 {
6d8fb6cf 4503 if (dump_enabled_p ())
7bd765d4 4504 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
d53391a8 4505 "the size of the group of accesses"
4506 " is not a power of 2 or not eqaul to 3\n");
481fc474 4507 return false;
4508 }
4509
42f6a6e8 4510 /* Check that the permutation is supported. */
8bec2124 4511 if (VECTOR_MODE_P (mode))
4512 {
4513 unsigned int i, nelt = GET_MODE_NUNITS (mode);
4514 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
d53391a8 4515
4516 if (count == 3)
8bec2124 4517 {
d53391a8 4518 unsigned int j0 = 0, j1 = 0, j2 = 0;
4519 unsigned int i, j;
4520
4521 for (j = 0; j < 3; j++)
4522 {
4523 int nelt0 = ((3 - j) * nelt) % 3;
4524 int nelt1 = ((3 - j) * nelt + 1) % 3;
4525 int nelt2 = ((3 - j) * nelt + 2) % 3;
4526 for (i = 0; i < nelt; i++)
4527 {
4528 if (3 * i + nelt0 < nelt)
4529 sel[3 * i + nelt0] = j0++;
4530 if (3 * i + nelt1 < nelt)
4531 sel[3 * i + nelt1] = nelt + j1++;
4532 if (3 * i + nelt2 < nelt)
4533 sel[3 * i + nelt2] = 0;
4534 }
4535 if (!can_vec_perm_p (mode, false, sel))
4536 {
4537 if (dump_enabled_p ())
4538 dump_printf (MSG_MISSED_OPTIMIZATION,
4539 "permutaion op not supported by target.\n");
4540 return false;
4541 }
4542
4543 for (i = 0; i < nelt; i++)
4544 {
4545 if (3 * i + nelt0 < nelt)
4546 sel[3 * i + nelt0] = 3 * i + nelt0;
4547 if (3 * i + nelt1 < nelt)
4548 sel[3 * i + nelt1] = 3 * i + nelt1;
4549 if (3 * i + nelt2 < nelt)
4550 sel[3 * i + nelt2] = nelt + j2++;
4551 }
4552 if (!can_vec_perm_p (mode, false, sel))
4553 {
4554 if (dump_enabled_p ())
4555 dump_printf (MSG_MISSED_OPTIMIZATION,
4556 "permutaion op not supported by target.\n");
4557 return false;
4558 }
4559 }
4560 return true;
8bec2124 4561 }
d53391a8 4562 else
8bec2124 4563 {
d53391a8 4564 /* If length is not equal to 3 then only a power of 2 is supported. */
4565 gcc_assert (exact_log2 (count) != -1);
4566
4567 for (i = 0; i < nelt / 2; i++)
4568 {
4569 sel[i * 2] = i;
4570 sel[i * 2 + 1] = i + nelt;
4571 }
4572 if (can_vec_perm_p (mode, false, sel))
4573 {
4574 for (i = 0; i < nelt; i++)
4575 sel[i] += nelt / 2;
4576 if (can_vec_perm_p (mode, false, sel))
4577 return true;
4578 }
8bec2124 4579 }
4580 }
fb85abff 4581
6d8fb6cf 4582 if (dump_enabled_p ())
7bd765d4 4583 dump_printf (MSG_MISSED_OPTIMIZATION,
d53391a8 4584 "permutaion op not supported by target.\n");
6620d7d7 4585 return false;
fb85abff 4586}
4587
4588
94b7b4dd 4589/* Return TRUE if vec_store_lanes is available for COUNT vectors of
4590 type VECTYPE. */
4591
4592bool
4593vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
4594{
4595 return vect_lanes_optab_supported_p ("vec_store_lanes",
4596 vec_store_lanes_optab,
4597 vectype, count);
4598}
4599
4600
fb85abff 4601/* Function vect_permute_store_chain.
4602
4603 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
d53391a8 4604 a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
4605 the data correctly for the stores. Return the final references for stores
4606 in RESULT_CHAIN.
fb85abff 4607
4608 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
282bf14c 4609 The input is 4 vectors each containing 8 elements. We assign a number to
4610 each element, the input sequence is:
fb85abff 4611
4612 1st vec: 0 1 2 3 4 5 6 7
4613 2nd vec: 8 9 10 11 12 13 14 15
48e1416a 4614 3rd vec: 16 17 18 19 20 21 22 23
fb85abff 4615 4th vec: 24 25 26 27 28 29 30 31
4616
4617 The output sequence should be:
4618
4619 1st vec: 0 8 16 24 1 9 17 25
4620 2nd vec: 2 10 18 26 3 11 19 27
 4621 3rd vec: 4 12 20 28 5 13 21 29
4622 4th vec: 6 14 22 30 7 15 23 31
4623
4624 i.e., we interleave the contents of the four vectors in their order.
4625
282bf14c 4626 We use interleave_high/low instructions to create such output. The input of
fb85abff 4627 each interleave_high/low operation is two vectors:
48e1416a 4628 1st vec 2nd vec
4629 0 1 2 3 4 5 6 7
4630 the even elements of the result vector are obtained left-to-right from the
282bf14c 4631 high/low elements of the first vector. The odd elements of the result are
fb85abff 4632 obtained left-to-right from the high/low elements of the second vector.
4633 The output of interleave_high will be: 0 4 1 5
4634 and of interleave_low: 2 6 3 7
4635
48e1416a 4636
282bf14c 4637 The permutation is done in log LENGTH stages. In each stage interleave_high
48e1416a 4638 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
4639 where the first argument is taken from the first half of DR_CHAIN and the
4640 second argument from it's second half.
4641 In our example,
fb85abff 4642
4643 I1: interleave_high (1st vec, 3rd vec)
4644 I2: interleave_low (1st vec, 3rd vec)
4645 I3: interleave_high (2nd vec, 4th vec)
4646 I4: interleave_low (2nd vec, 4th vec)
4647
4648 The output for the first stage is:
4649
4650 I1: 0 16 1 17 2 18 3 19
4651 I2: 4 20 5 21 6 22 7 23
4652 I3: 8 24 9 25 10 26 11 27
4653 I4: 12 28 13 29 14 30 15 31
4654
4655 The output of the second stage, i.e. the final result is:
4656
4657 I1: 0 8 16 24 1 9 17 25
4658 I2: 2 10 18 26 3 11 19 27
 4659 I3: 4 12 20 28 5 13 21 29
4660 I4: 6 14 22 30 7 15 23 31. */
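/* A minimal scalar sketch of one interleave step (illustrative only, not
   from the original source).  With v1 and v2 holding NELT elements each,
   this mirrors the high/low VEC_PERM_EXPR masks built below:

     for (i = 0; i < nelt / 2; i++)
       {
	 high[2 * i]	 = v1[i];	     // mask {0, nelt, 1, nelt+1, ...}
	 high[2 * i + 1] = v2[i];
	 low[2 * i]	 = v1[nelt / 2 + i]; // same mask shifted by nelt/2
	 low[2 * i + 1]	 = v2[nelt / 2 + i];
       }

   Applying this step log2 (LENGTH) times over the whole chain yields the
   interleaved order shown above.  */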
48e1416a 4661
481fc474 4662void
f1f41a6c 4663vect_permute_store_chain (vec<tree> dr_chain,
48e1416a 4664 unsigned int length,
42acab1c 4665 gimple *stmt,
fb85abff 4666 gimple_stmt_iterator *gsi,
f1f41a6c 4667 vec<tree> *result_chain)
fb85abff 4668{
03d37e4e 4669 tree vect1, vect2, high, low;
42acab1c 4670 gimple *perm_stmt;
fb85abff 4671 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
8bec2124 4672 tree perm_mask_low, perm_mask_high;
d53391a8 4673 tree data_ref;
4674 tree perm3_mask_low, perm3_mask_high;
4675 unsigned int i, n, log_length = exact_log2 (length);
42f6a6e8 4676 unsigned int j, nelt = TYPE_VECTOR_SUBPARTS (vectype);
8bec2124 4677 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
48e1416a 4678
f40aaf2d 4679 result_chain->quick_grow (length);
4680 memcpy (result_chain->address (), dr_chain.address (),
4681 length * sizeof (tree));
fb85abff 4682
d53391a8 4683 if (length == 3)
8bec2124 4684 {
d53391a8 4685 unsigned int j0 = 0, j1 = 0, j2 = 0;
42f6a6e8 4686
d53391a8 4687 for (j = 0; j < 3; j++)
4688 {
4689 int nelt0 = ((3 - j) * nelt) % 3;
4690 int nelt1 = ((3 - j) * nelt + 1) % 3;
4691 int nelt2 = ((3 - j) * nelt + 2) % 3;
8bec2124 4692
d53391a8 4693 for (i = 0; i < nelt; i++)
4694 {
4695 if (3 * i + nelt0 < nelt)
4696 sel[3 * i + nelt0] = j0++;
4697 if (3 * i + nelt1 < nelt)
4698 sel[3 * i + nelt1] = nelt + j1++;
4699 if (3 * i + nelt2 < nelt)
4700 sel[3 * i + nelt2] = 0;
4701 }
0761848d 4702 perm3_mask_low = vect_gen_perm_mask_checked (vectype, sel);
d53391a8 4703
4704 for (i = 0; i < nelt; i++)
4705 {
4706 if (3 * i + nelt0 < nelt)
4707 sel[3 * i + nelt0] = 3 * i + nelt0;
4708 if (3 * i + nelt1 < nelt)
4709 sel[3 * i + nelt1] = 3 * i + nelt1;
4710 if (3 * i + nelt2 < nelt)
4711 sel[3 * i + nelt2] = nelt + j2++;
4712 }
0761848d 4713 perm3_mask_high = vect_gen_perm_mask_checked (vectype, sel);
d53391a8 4714
4715 vect1 = dr_chain[0];
4716 vect2 = dr_chain[1];
fb85abff 4717
4718 /* Create interleaving stmt:
d53391a8 4719 low = VEC_PERM_EXPR <vect1, vect2,
4720 {j, nelt, *, j + 1, nelt + j + 1, *,
4721 j + 2, nelt + j + 2, *, ...}> */
4722 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
e9cf809e 4723 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
4724 vect2, perm3_mask_low);
fb85abff 4725 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
fb85abff 4726
d53391a8 4727 vect1 = data_ref;
4728 vect2 = dr_chain[2];
fb85abff 4729 /* Create interleaving stmt:
d53391a8 4730 low = VEC_PERM_EXPR <vect1, vect2,
4731 {0, 1, nelt + j, 3, 4, nelt + j + 1,
4732 6, 7, nelt + j + 2, ...}> */
4733 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
e9cf809e 4734 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
4735 vect2, perm3_mask_high);
fb85abff 4736 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
d53391a8 4737 (*result_chain)[j] = data_ref;
fb85abff 4738 }
d53391a8 4739 }
4740 else
4741 {
 4742 /* If length is not equal to 3 then only a power of 2 is supported. */
4743 gcc_assert (exact_log2 (length) != -1);
4744
4745 for (i = 0, n = nelt / 2; i < n; i++)
4746 {
4747 sel[i * 2] = i;
4748 sel[i * 2 + 1] = i + nelt;
4749 }
0761848d 4750 perm_mask_high = vect_gen_perm_mask_checked (vectype, sel);
d53391a8 4751
4752 for (i = 0; i < nelt; i++)
4753 sel[i] += nelt / 2;
0761848d 4754 perm_mask_low = vect_gen_perm_mask_checked (vectype, sel);
d53391a8 4755
4756 for (i = 0, n = log_length; i < n; i++)
4757 {
4758 for (j = 0; j < length/2; j++)
4759 {
4760 vect1 = dr_chain[j];
4761 vect2 = dr_chain[j+length/2];
4762
4763 /* Create interleaving stmt:
4764 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
4765 ...}> */
4766 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
e9cf809e 4767 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
4768 vect2, perm_mask_high);
d53391a8 4769 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4770 (*result_chain)[2*j] = high;
4771
4772 /* Create interleaving stmt:
4773 low = VEC_PERM_EXPR <vect1, vect2,
4774 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
4775 ...}> */
4776 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
e9cf809e 4777 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
4778 vect2, perm_mask_low);
d53391a8 4779 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4780 (*result_chain)[2*j+1] = low;
4781 }
4782 memcpy (dr_chain.address (), result_chain->address (),
4783 length * sizeof (tree));
4784 }
fb85abff 4785 }
fb85abff 4786}
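/* Illustrative sketch (not part of GCC): a stand-alone emulation of the
   VEC_PERM_EXPR interleave-high/low masks built above, assuming NELT == 8
   and a chain of two vectors.  The helper name emulate_vec_perm is made up
   for this sketch; it only mirrors the semantics
   result[i] = concat (v1, v2)[sel[i]].  Guarded out so it is not compiled
   as part of this file.  */
#if 0
#include <stdio.h>

#define NELT 8

static void
emulate_vec_perm (const int *v1, const int *v2,
		  const unsigned *sel, int *result)
{
  for (unsigned i = 0; i < NELT; i++)
    result[i] = sel[i] < NELT ? v1[sel[i]] : v2[sel[i] - NELT];
}

int
main (void)
{
  int v1[NELT] = { 0, 1, 2, 3, 4, 5, 6, 7 };
  int v2[NELT] = { 8, 9, 10, 11, 12, 13, 14, 15 };
  unsigned sel[NELT];
  int high[NELT], low[NELT];

  /* Same construction as the power-of-2 branch above:
     high mask = {0, NELT, 1, NELT + 1, ...}, low mask = high + NELT/2.  */
  for (unsigned i = 0; i < NELT / 2; i++)
    {
      sel[i * 2] = i;
      sel[i * 2 + 1] = i + NELT;
    }
  emulate_vec_perm (v1, v2, sel, high);

  for (unsigned i = 0; i < NELT; i++)
    sel[i] += NELT / 2;
  emulate_vec_perm (v1, v2, sel, low);

  for (unsigned i = 0; i < NELT; i++)
    printf ("%d ", high[i]);		/* 0 8 1 9 2 10 3 11 */
  printf ("\n");
  for (unsigned i = 0; i < NELT; i++)
    printf ("%d ", low[i]);		/* 4 12 5 13 6 14 7 15 */
  printf ("\n");
  return 0;
}
#endif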
4787
4788/* Function vect_setup_realignment
48e1416a 4789
fb85abff 4790 This function is called when vectorizing an unaligned load using
4791 the dr_explicit_realign[_optimized] scheme.
4792 This function generates the following code at the loop prolog:
4793
4794 p = initial_addr;
4795 x msq_init = *(floor(p)); # prolog load
48e1416a 4796 realignment_token = call target_builtin;
fb85abff 4797 loop:
4798 x msq = phi (msq_init, ---)
4799
48e1416a 4800 The stmts marked with x are generated only for the case of
fb85abff 4801 dr_explicit_realign_optimized.
4802
48e1416a 4803 The code above sets up a new (vector) pointer, pointing to the first
fb85abff 4804 location accessed by STMT, and a "floor-aligned" load using that pointer.
4805 It also generates code to compute the "realignment-token" (if the relevant
4806 target hook was defined), and creates a phi-node at the loop-header bb
4807 whose arguments are the result of the prolog-load (created by this
4808 function) and the result of a load that takes place in the loop (to be
4809 created by the caller to this function).
4810
4811 For the case of dr_explicit_realign_optimized:
48e1416a 4812 The caller to this function uses the phi-result (msq) to create the
fb85abff 4813 realignment code inside the loop, and sets up the missing phi argument,
4814 as follows:
48e1416a 4815 loop:
fb85abff 4816 msq = phi (msq_init, lsq)
4817 lsq = *(floor(p')); # load in loop
4818 result = realign_load (msq, lsq, realignment_token);
4819
4820 For the case of dr_explicit_realign:
4821 loop:
4822 msq = *(floor(p)); # load in loop
4823 p' = p + (VS-1);
4824 lsq = *(floor(p')); # load in loop
4825 result = realign_load (msq, lsq, realignment_token);
4826
4827 Input:
4828 STMT - (scalar) load stmt to be vectorized. This load accesses
4829 a memory location that may be unaligned.
4830 BSI - place where new code is to be inserted.
4831 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
48e1416a 4832 is used.
4833
fb85abff 4834 Output:
4835 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
4836 target hook, if defined.
4837 Return value - the result of the loop-header phi node. */
4838
4839tree
42acab1c 4840vect_setup_realignment (gimple *stmt, gimple_stmt_iterator *gsi,
fb85abff 4841 tree *realignment_token,
4842 enum dr_alignment_support alignment_support_scheme,
4843 tree init_addr,
4844 struct loop **at_loop)
4845{
4846 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4847 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4848 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2cb9ef39 4849 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
ad074595 4850 struct loop *loop = NULL;
4851 edge pe = NULL;
fb85abff 4852 tree scalar_dest = gimple_assign_lhs (stmt);
4853 tree vec_dest;
42acab1c 4854 gimple *inc;
fb85abff 4855 tree ptr;
4856 tree data_ref;
fb85abff 4857 basic_block new_bb;
4858 tree msq_init = NULL_TREE;
4859 tree new_temp;
1a91d914 4860 gphi *phi_stmt;
fb85abff 4861 tree msq = NULL_TREE;
4862 gimple_seq stmts = NULL;
4863 bool inv_p;
4864 bool compute_in_loop = false;
ad074595 4865 bool nested_in_vect_loop = false;
fb85abff 4866 struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
ad074595 4867 struct loop *loop_for_initial_load = NULL;
4868
4869 if (loop_vinfo)
4870 {
4871 loop = LOOP_VINFO_LOOP (loop_vinfo);
4872 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4873 }
fb85abff 4874
4875 gcc_assert (alignment_support_scheme == dr_explicit_realign
4876 || alignment_support_scheme == dr_explicit_realign_optimized);
4877
4878 /* We need to generate three things:
4879 1. the misalignment computation
4880 2. the extra vector load (for the optimized realignment scheme).
4881 3. the phi node for the two vectors from which the realignment is
282bf14c 4882 done (for the optimized realignment scheme). */
fb85abff 4883
4884 /* 1. Determine where to generate the misalignment computation.
4885
4886 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
4887 calculation will be generated by this function, outside the loop (in the
4888 preheader). Otherwise, INIT_ADDR had already been computed for us by the
4889 caller, inside the loop.
4890
4891 Background: If the misalignment remains fixed throughout the iterations of
4892 the loop, then both realignment schemes are applicable, and also the
4893 misalignment computation can be done outside LOOP. This is because we are
4894 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
4895 are a multiple of VS (the Vector Size), and therefore the misalignment in
4896 different vectorized LOOP iterations is always the same.
4897 The problem arises only if the memory access is in an inner-loop nested
4898 inside LOOP, which is now being vectorized using outer-loop vectorization.
4899 This is the only case when the misalignment of the memory access may not
4900 remain fixed throughout the iterations of the inner-loop (as explained in
4901 detail in vect_supportable_dr_alignment). In this case, not only is the
4902 optimized realignment scheme not applicable, but also the misalignment
4903 computation (and generation of the realignment token that is passed to
4904 REALIGN_LOAD) have to be done inside the loop.
4905
4906 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
4907 or not, which in turn determines if the misalignment is computed inside
4908 the inner-loop, or outside LOOP. */
4909
ad074595 4910 if (init_addr != NULL_TREE || !loop_vinfo)
fb85abff 4911 {
4912 compute_in_loop = true;
4913 gcc_assert (alignment_support_scheme == dr_explicit_realign);
4914 }
4915
4916
4917 /* 2. Determine where to generate the extra vector load.
4918
4919 For the optimized realignment scheme, instead of generating two vector
4920 loads in each iteration, we generate a single extra vector load in the
4921 preheader of the loop, and in each iteration reuse the result of the
4922 vector load from the previous iteration. In case the memory access is in
4923 an inner-loop nested inside LOOP, which is now being vectorized using
4924 outer-loop vectorization, we need to determine whether this initial vector
4925 load should be generated at the preheader of the inner-loop, or can be
4926 generated at the preheader of LOOP. If the memory access has no evolution
4927 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
4928 to be generated inside LOOP (in the preheader of the inner-loop). */
4929
4930 if (nested_in_vect_loop)
4931 {
4932 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
4933 bool invariant_in_outerloop =
4934 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
4935 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
4936 }
4937 else
4938 loop_for_initial_load = loop;
4939 if (at_loop)
4940 *at_loop = loop_for_initial_load;
4941
ad074595 4942 if (loop_for_initial_load)
4943 pe = loop_preheader_edge (loop_for_initial_load);
4944
fb85abff 4945 /* 3. For the case of the optimized realignment, create the first vector
4946 load at the loop preheader. */
4947
4948 if (alignment_support_scheme == dr_explicit_realign_optimized)
4949 {
4950 /* Create msq_init = *(floor(p1)) in the loop preheader */
1a91d914 4951 gassign *new_stmt;
fb85abff 4952
4953 gcc_assert (!compute_in_loop);
fb85abff 4954 vec_dest = vect_create_destination_var (scalar_dest, vectype);
bd5ba09f 4955 ptr = vect_create_data_ref_ptr (stmt, vectype, loop_for_initial_load,
4956 NULL_TREE, &init_addr, NULL, &inc,
4957 true, &inv_p);
23bab442 4958 if (TREE_CODE (ptr) == SSA_NAME)
4959 new_temp = copy_ssa_name (ptr);
4960 else
4961 new_temp = make_ssa_name (TREE_TYPE (ptr));
e9cf809e 4962 new_stmt = gimple_build_assign
4963 (new_temp, BIT_AND_EXPR, ptr,
86638c2e 4964 build_int_cst (TREE_TYPE (ptr),
4965 -(HOST_WIDE_INT)TYPE_ALIGN_UNIT (vectype)));
86638c2e 4966 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4967 gcc_assert (!new_bb);
2cb9ef39 4968 data_ref
4969 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
4970 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
fb85abff 4971 new_stmt = gimple_build_assign (vec_dest, data_ref);
4972 new_temp = make_ssa_name (vec_dest, new_stmt);
4973 gimple_assign_set_lhs (new_stmt, new_temp);
ad074595 4974 if (pe)
4975 {
4976 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4977 gcc_assert (!new_bb);
4978 }
4979 else
4980 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
4981
fb85abff 4982 msq_init = gimple_assign_lhs (new_stmt);
4983 }
4984
4985 /* 4. Create realignment token using a target builtin, if available.
4986 It is done either inside the containing loop, or before LOOP (as
4987 determined above). */
4988
4989 if (targetm.vectorize.builtin_mask_for_load)
4990 {
1a91d914 4991 gcall *new_stmt;
fb85abff 4992 tree builtin_decl;
4993
4994 /* Compute INIT_ADDR - the initial addressed accessed by this memref. */
ad074595 4995 if (!init_addr)
fb85abff 4996 {
4997 /* Generate the INIT_ADDR computation outside LOOP. */
4998 init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
4999 NULL_TREE, loop);
ad074595 5000 if (loop)
5001 {
5002 pe = loop_preheader_edge (loop);
5003 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
5004 gcc_assert (!new_bb);
5005 }
5006 else
5007 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
fb85abff 5008 }
5009
5010 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
5011 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
5012 vec_dest =
5013 vect_create_destination_var (scalar_dest,
5014 gimple_call_return_type (new_stmt));
5015 new_temp = make_ssa_name (vec_dest, new_stmt);
5016 gimple_call_set_lhs (new_stmt, new_temp);
5017
5018 if (compute_in_loop)
5019 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5020 else
5021 {
5022 /* Generate the misalignment computation outside LOOP. */
5023 pe = loop_preheader_edge (loop);
5024 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5025 gcc_assert (!new_bb);
5026 }
5027
5028 *realignment_token = gimple_call_lhs (new_stmt);
5029
5030 /* The result of the CALL_EXPR to this builtin is determined from
5031 the value of the parameter, and no global variables are touched,
5032 which makes the builtin a "const" function. Requiring the
5033 builtin to have the "const" attribute makes it unnecessary
5034 to call mark_call_clobbered. */
5035 gcc_assert (TREE_READONLY (builtin_decl));
5036 }
5037
5038 if (alignment_support_scheme == dr_explicit_realign)
5039 return msq;
5040
5041 gcc_assert (!compute_in_loop);
5042 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
5043
5044
5045 /* 5. Create msq = phi <msq_init, lsq> in loop */
5046
5047 pe = loop_preheader_edge (containing_loop);
5048 vec_dest = vect_create_destination_var (scalar_dest, vectype);
f9e245b2 5049 msq = make_ssa_name (vec_dest);
fb85abff 5050 phi_stmt = create_phi_node (msq, containing_loop->header);
60d535d2 5051 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
fb85abff 5052
5053 return msq;
5054}
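/* Illustrative sketch (not part of GCC): a scalar emulation of the
   dr_explicit_realign scheme documented above, assuming a 4-element int
   "vector" and modelling the realignment token simply as the misalignment
   in elements.  The names VF and realign_load_emul are made up for this
   sketch.  Guarded out so it is not compiled as part of this file.  */
#if 0
#include <stdio.h>

#define VF 4	/* elements per vector */

/* REALIGN_LOAD <msq, lsq, token>: elements OFF .. OFF+VF-1 of the
   concatenation of the two floor-aligned loads MSQ and LSQ.  */
static void
realign_load_emul (const int *msq, const int *lsq, unsigned off, int *result)
{
  for (unsigned i = 0; i < VF; i++)
    result[i] = off + i < VF ? msq[off + i] : lsq[off + i - VF];
}

int
main (void)
{
  int a[16], result[VF];
  for (int i = 0; i < 16; i++)
    a[i] = i;

  /* Unaligned access a[3..6]; a[0] is taken as the vector-aligned origin,
     so the misalignment in elements is 3 and floor (p) is &a[0].  */
  unsigned off = 3 % VF;
  const int *base = &a[3] - off;

  realign_load_emul (base, base + VF, off, result);
  for (unsigned i = 0; i < VF; i++)
    printf ("%d ", result[i]);		/* 3 4 5 6 */
  printf ("\n");
  return 0;
}
#endif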
5055
5056
ee612634 5057/* Function vect_grouped_load_supported.
fb85abff 5058
42f6a6e8 5059 Returns TRUE if the even/odd permutations (for a power-of-2 COUNT) or the
fb85abff 5060 3-load shuffles (for COUNT == 3) are supported, and FALSE otherwise. */
5061
5062bool
ee612634 5063vect_grouped_load_supported (tree vectype, unsigned HOST_WIDE_INT count)
fb85abff 5064{
3754d046 5065 machine_mode mode = TYPE_MODE (vectype);
fb85abff 5066
1e1bca71 5067 /* vect_permute_load_chain requires the group size to be equal to 3 or
5068 be a power of two. */
5069 if (count != 3 && exact_log2 (count) == -1)
481fc474 5070 {
6d8fb6cf 5071 if (dump_enabled_p ())
7bd765d4 5072 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1e1bca71 5073 "the size of the group of accesses"
5074 " is not a power of 2 or not equal to 3\n");
481fc474 5075 return false;
5076 }
5077
42f6a6e8 5078 /* Check that the permutation is supported. */
5079 if (VECTOR_MODE_P (mode))
5080 {
1e1bca71 5081 unsigned int i, j, nelt = GET_MODE_NUNITS (mode);
42f6a6e8 5082 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
fb85abff 5083
1e1bca71 5084 if (count == 3)
42f6a6e8 5085 {
1e1bca71 5086 unsigned int k;
5087 for (k = 0; k < 3; k++)
5088 {
5089 for (i = 0; i < nelt; i++)
5090 if (3 * i + k < 2 * nelt)
5091 sel[i] = 3 * i + k;
5092 else
5093 sel[i] = 0;
5094 if (!can_vec_perm_p (mode, false, sel))
5095 {
5096 if (dump_enabled_p ())
5097 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5098 "shuffle of 3 loads is not supported by"
5099 " target\n");
5c6f6a61 5100 return false;
1e1bca71 5101 }
5102 for (i = 0, j = 0; i < nelt; i++)
5103 if (3 * i + k < 2 * nelt)
5104 sel[i] = i;
5105 else
5106 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5107 if (!can_vec_perm_p (mode, false, sel))
5108 {
5109 if (dump_enabled_p ())
5110 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5111 "shuffle of 3 loads is not supported by"
5112 " target\n");
5113 return false;
5114 }
5115 }
5116 return true;
5117 }
5118 else
5119 {
5120 /* If length is not equal to 3 then only a power of 2 is supported. */
5121 gcc_assert (exact_log2 (count) != -1);
42f6a6e8 5122 for (i = 0; i < nelt; i++)
1e1bca71 5123 sel[i] = i * 2;
42f6a6e8 5124 if (can_vec_perm_p (mode, false, sel))
1e1bca71 5125 {
5126 for (i = 0; i < nelt; i++)
5127 sel[i] = i * 2 + 1;
5128 if (can_vec_perm_p (mode, false, sel))
5129 return true;
5130 }
5131 }
42f6a6e8 5132 }
fb85abff 5133
6d8fb6cf 5134 if (dump_enabled_p ())
7bd765d4 5135 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1e1bca71 5136 "extract even/odd not supported by target\n");
6620d7d7 5137 return false;
fb85abff 5138}
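/* Illustrative sketch (not part of GCC): prints, for NELT == 8, the two
   selectors per K that the COUNT == 3 branch above asks can_vec_perm_p
   about; these are the same masks later generated in
   vect_permute_load_chain.  Stand-alone and guarded out, so it is not
   compiled as part of this file.  */
#if 0
#include <stdio.h>

int
main (void)
{
  const unsigned nelt = 8;
  unsigned sel[8];

  for (unsigned k = 0; k < 3; k++)
    {
      unsigned j = 0;

      /* First selector: elements k, 3+k, 6+k, ... of the first two vectors,
	 with don't-care slots filled by 0.  */
      for (unsigned i = 0; i < nelt; i++)
	sel[i] = 3 * i + k < 2 * nelt ? 3 * i + k : 0;
      printf ("k=%u first :", k);
      for (unsigned i = 0; i < nelt; i++)
	printf (" %u", sel[i]);
      printf ("\n");

      /* Second selector: keep the gathered elements and fill the tail from
	 the third vector.  */
      for (unsigned i = 0; i < nelt; i++)
	if (3 * i + k < 2 * nelt)
	  sel[i] = i;
	else
	  sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
      printf ("k=%u second:", k);
      for (unsigned i = 0; i < nelt; i++)
	printf (" %u", sel[i]);
      printf ("\n");
    }
  return 0;
}
#endif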
5139
94b7b4dd 5140/* Return TRUE if vec_load_lanes is available for COUNT vectors of
5141 type VECTYPE. */
5142
5143bool
5144vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
5145{
5146 return vect_lanes_optab_supported_p ("vec_load_lanes",
5147 vec_load_lanes_optab,
5148 vectype, count);
5149}
fb85abff 5150
5151/* Function vect_permute_load_chain.
5152
5153 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
1e1bca71 5154 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5155 the input data correctly. Return the final references for loads in
5156 RESULT_CHAIN.
fb85abff 5157
5158 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5159 The input is 4 vectors each containing 8 elements. We assign a number to each
5160 element, the input sequence is:
5161
5162 1st vec: 0 1 2 3 4 5 6 7
5163 2nd vec: 8 9 10 11 12 13 14 15
48e1416a 5164 3rd vec: 16 17 18 19 20 21 22 23
fb85abff 5165 4th vec: 24 25 26 27 28 29 30 31
5166
5167 The output sequence should be:
5168
5169 1st vec: 0 4 8 12 16 20 24 28
5170 2nd vec: 1 5 9 13 17 21 25 29
48e1416a 5171 3rd vec: 2 6 10 14 18 22 26 30
fb85abff 5172 4th vec: 3 7 11 15 19 23 27 31
5173
5174 i.e., the first output vector should contain the first elements of each
5175 interleaving group, etc.
5176
282bf14c 5177 We use extract_even/odd instructions to create such output. The input of
5178 each extract_even/odd operation is two vectors
48e1416a 5179 1st vec 2nd vec
5180 0 1 2 3 4 5 6 7
fb85abff 5181
282bf14c 5182 and the output is the vector of extracted even/odd elements. The output of
fb85abff 5183 extract_even will be: 0 2 4 6
5184 and of extract_odd: 1 3 5 7
5185
48e1416a 5186
282bf14c 5187 The permutation is done in log LENGTH stages. In each stage extract_even
5188 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5189 their order. In our example,
fb85abff 5190
5191 E1: extract_even (1st vec, 2nd vec)
5192 E2: extract_odd (1st vec, 2nd vec)
5193 E3: extract_even (3rd vec, 4th vec)
5194 E4: extract_odd (3rd vec, 4th vec)
5195
5196 The output for the first stage will be:
5197
5198 E1: 0 2 4 6 8 10 12 14
5199 E2: 1 3 5 7 9 11 13 15
48e1416a 5200 E3: 16 18 20 22 24 26 28 30
fb85abff 5201 E4: 17 19 21 23 25 27 29 31
5202
5203 In order to proceed and create the correct sequence for the next stage (or
48e1416a 5204 for the correct output, if the second stage is the last one, as in our
5205 example), we first put the output of extract_even operation and then the
fb85abff 5206 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5207 The input for the second stage is:
5208
5209 1st vec (E1): 0 2 4 6 8 10 12 14
48e1416a 5210 2nd vec (E3): 16 18 20 22 24 26 28 30
5211 3rd vec (E2): 1 3 5 7 9 11 13 15
fb85abff 5212 4th vec (E4): 17 19 21 23 25 27 29 31
5213
5214 The output of the second stage:
5215
5216 E1: 0 4 8 12 16 20 24 28
5217 E2: 2 6 10 14 18 22 26 30
5218 E3: 1 5 9 13 17 21 25 29
5219 E4: 3 7 11 15 19 23 27 31
5220
5221 And RESULT_CHAIN after reordering:
5222
5223 1st vec (E1): 0 4 8 12 16 20 24 28
5224 2nd vec (E3): 1 5 9 13 17 21 25 29
48e1416a 5225 3rd vec (E2): 2 6 10 14 18 22 26 30
fb85abff 5226 4th vec (E4): 3 7 11 15 19 23 27 31. */
5227
481fc474 5228static void
f1f41a6c 5229vect_permute_load_chain (vec<tree> dr_chain,
48e1416a 5230 unsigned int length,
42acab1c 5231 gimple *stmt,
fb85abff 5232 gimple_stmt_iterator *gsi,
f1f41a6c 5233 vec<tree> *result_chain)
fb85abff 5234{
03d37e4e 5235 tree data_ref, first_vect, second_vect;
42f6a6e8 5236 tree perm_mask_even, perm_mask_odd;
1e1bca71 5237 tree perm3_mask_low, perm3_mask_high;
42acab1c 5238 gimple *perm_stmt;
fb85abff 5239 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
42f6a6e8 5240 unsigned int i, j, log_length = exact_log2 (length);
5241 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
5242 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
fb85abff 5243
1648f21f 5244 result_chain->quick_grow (length);
5245 memcpy (result_chain->address (), dr_chain.address (),
5246 length * sizeof (tree));
42f6a6e8 5247
1e1bca71 5248 if (length == 3)
fb85abff 5249 {
1e1bca71 5250 unsigned int k;
fb85abff 5251
1e1bca71 5252 for (k = 0; k < 3; k++)
5253 {
5254 for (i = 0; i < nelt; i++)
5255 if (3 * i + k < 2 * nelt)
5256 sel[i] = 3 * i + k;
5257 else
5258 sel[i] = 0;
0761848d 5259 perm3_mask_low = vect_gen_perm_mask_checked (vectype, sel);
1e1bca71 5260
5261 for (i = 0, j = 0; i < nelt; i++)
5262 if (3 * i + k < 2 * nelt)
5263 sel[i] = i;
5264 else
5265 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5266
0761848d 5267 perm3_mask_high = vect_gen_perm_mask_checked (vectype, sel);
1e1bca71 5268
5269 first_vect = dr_chain[0];
5270 second_vect = dr_chain[1];
5271
5272 /* Create interleaving stmt (low part of):
5273 low = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
5274 ...}> */
321d85d9 5275 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
e9cf809e 5276 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5277 second_vect, perm3_mask_low);
fb85abff 5278 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
48e1416a 5279
1e1bca71 5280 /* Create interleaving stmt (high part of):
5281 high = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
5282 ...}> */
5283 first_vect = data_ref;
5284 second_vect = dr_chain[2];
321d85d9 5285 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
e9cf809e 5286 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5287 second_vect, perm3_mask_high);
fb85abff 5288 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
1e1bca71 5289 (*result_chain)[k] = data_ref;
fb85abff 5290 }
fb85abff 5291 }
1e1bca71 5292 else
5293 {
5294 /* If length is not equal to 3 then only a power of 2 is supported. */
5295 gcc_assert (exact_log2 (length) != -1);
5296
5297 for (i = 0; i < nelt; ++i)
5298 sel[i] = i * 2;
0761848d 5299 perm_mask_even = vect_gen_perm_mask_checked (vectype, sel);
1e1bca71 5300
5301 for (i = 0; i < nelt; ++i)
5302 sel[i] = i * 2 + 1;
0761848d 5303 perm_mask_odd = vect_gen_perm_mask_checked (vectype, sel);
fb85abff 5304
1e1bca71 5305 for (i = 0; i < log_length; i++)
5306 {
5307 for (j = 0; j < length; j += 2)
5308 {
5309 first_vect = dr_chain[j];
5310 second_vect = dr_chain[j+1];
5311
5312 /* data_ref = permute_even (first_data_ref, second_data_ref); */
5313 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
e9cf809e 5314 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5315 first_vect, second_vect,
5316 perm_mask_even);
1e1bca71 5317 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5318 (*result_chain)[j/2] = data_ref;
5319
5320 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
5321 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
e9cf809e 5322 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5323 first_vect, second_vect,
5324 perm_mask_odd);
1e1bca71 5325 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5326 (*result_chain)[j/2+length/2] = data_ref;
5327 }
5328 memcpy (dr_chain.address (), result_chain->address (),
5329 length * sizeof (tree));
5330 }
5331 }
5332}
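/* Illustrative sketch (not part of GCC): runs the power-of-2 branch above on
   plain arrays for the example in the comment (LENGTH == 4, NELT == 8) and
   prints the documented output order 0 4 8 ... 28 / 1 5 ... 29 / 2 6 ... 30 /
   3 7 ... 31.  The helper name vec_perm_emul is made up for this sketch.
   Guarded out so it is not compiled as part of this file.  */
#if 0
#include <stdio.h>
#include <string.h>

#define NELT 8
#define LENGTH 4

/* result[i] = concat (v1, v2)[sel[i]], i.e. VEC_PERM_EXPR semantics.  */
static void
vec_perm_emul (const int *v1, const int *v2, const unsigned *sel, int *result)
{
  for (unsigned i = 0; i < NELT; i++)
    result[i] = sel[i] < NELT ? v1[sel[i]] : v2[sel[i] - NELT];
}

int
main (void)
{
  int chain[LENGTH][NELT], out[LENGTH][NELT];
  unsigned sel_even[NELT], sel_odd[NELT];

  for (unsigned v = 0; v < LENGTH; v++)
    for (unsigned i = 0; i < NELT; i++)
      chain[v][i] = v * NELT + i;	/* elements 0..31 as in the comment */

  for (unsigned i = 0; i < NELT; i++)
    {
      sel_even[i] = i * 2;
      sel_odd[i] = i * 2 + 1;
    }

  /* log2 (LENGTH) == 2 stages of extract even/odd, exactly as above:
     even results go to the first half of the chain, odd to the second.  */
  for (unsigned stage = 0; stage < 2; stage++)
    {
      for (unsigned j = 0; j < LENGTH; j += 2)
	{
	  vec_perm_emul (chain[j], chain[j + 1], sel_even, out[j / 2]);
	  vec_perm_emul (chain[j], chain[j + 1], sel_odd,
			 out[j / 2 + LENGTH / 2]);
	}
      memcpy (chain, out, sizeof chain);
    }

  for (unsigned v = 0; v < LENGTH; v++)
    {
      for (unsigned i = 0; i < NELT; i++)
	printf ("%d ", chain[v][i]);
      printf ("\n");
    }
  return 0;
}
#endif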
fb85abff 5333
926f7a02 5334/* Function vect_shift_permute_load_chain.
5335
5336 Given a chain of loads in DR_CHAIN of LENGTH 3 or a power of 2, generate a
5337 sequence of stmts to reorder the input data accordingly.
5338 Return the final references for loads in RESULT_CHAIN.
5339 Return true if this succeeded, false otherwise.
5340
5341 E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
5342 The input is 3 vectors each containing 8 elements. We assign a
5343 number to each element, the input sequence is:
5344
5345 1st vec: 0 1 2 3 4 5 6 7
5346 2nd vec: 8 9 10 11 12 13 14 15
5347 3rd vec: 16 17 18 19 20 21 22 23
5348
5349 The output sequence should be:
5350
5351 1st vec: 0 3 6 9 12 15 18 21
5352 2nd vec: 1 4 7 10 13 16 19 22
5353 3rd vec: 2 5 8 11 14 17 20 23
5354
5355 We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
5356
5357 First we shuffle all 3 vectors to get correct elements order:
5358
5359 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
5360 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
5361 3rd vec: (16 19 22) (17 20 23) (18 21)
5362
5363 Next we unite and shift vector 3 times:
5364
5365 1st step:
5366 shift right by 6 the concatenation of:
5367 "1st vec" and "2nd vec"
5368 ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
5369 "2nd vec" and "3rd vec"
5370 ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
5371 "3rd vec" and "1st vec"
5372 (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
5373 | New vectors |
5374
5375 So that now new vectors are:
5376
5377 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
5378 2nd vec: (10 13) (16 19 22) (17 20 23)
5379 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
5380
5381 2nd step:
5382 shift right by 5 the concatenation of:
5383 "1st vec" and "3rd vec"
5384 ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
5385 "2nd vec" and "1st vec"
5386 (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
5387 "3rd vec" and "2nd vec"
5388 (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
5389 | New vectors |
5390
5391 So that now new vectors are:
5392
5393 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
5394 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
5395 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
5396
5397 3rd step:
5398 shift right by 5 the concatenation of:
5399 "1st vec" and "1st vec"
5400 ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
5401 shift right by 3 the concatenation of:
5402 "2nd vec" and "2nd vec"
5403 (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
5404 | New vectors |
5405
5406 So that now all vectors are READY:
5407 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
5408 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
5409 3rd vec: ( 1 4 7) (10 13) (16 19 22)
5410
5411 This algorithm is faster than the one in vect_permute_load_chain if:
5412 1. "shift of a concatenation" is faster than a general permutation.
5413 This is usually so.
5414 2. The TARGET machine can't execute vector instructions in parallel.
5415 This is because each step of the algorithm depends on the previous one.
5416 The algorithm in vect_permute_load_chain is much more parallel.
5417
5418 The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
5419*/
5420
5421static bool
5422vect_shift_permute_load_chain (vec<tree> dr_chain,
5423 unsigned int length,
42acab1c 5424 gimple *stmt,
926f7a02 5425 gimple_stmt_iterator *gsi,
5426 vec<tree> *result_chain)
5427{
5428 tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
5429 tree perm2_mask1, perm2_mask2, perm3_mask;
5430 tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
42acab1c 5431 gimple *perm_stmt;
926f7a02 5432
5433 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5434 unsigned int i;
5435 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
5436 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5437 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5438 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5439
5440 result_chain->quick_grow (length);
5441 memcpy (result_chain->address (), dr_chain.address (),
5442 length * sizeof (tree));
5443
2cc1223e 5444 if (exact_log2 (length) != -1 && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 4)
926f7a02 5445 {
2cc1223e 5446 unsigned int j, log_length = exact_log2 (length);
926f7a02 5447 for (i = 0; i < nelt / 2; ++i)
5448 sel[i] = i * 2;
5449 for (i = 0; i < nelt / 2; ++i)
5450 sel[nelt / 2 + i] = i * 2 + 1;
5451 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5452 {
5453 if (dump_enabled_p ())
5454 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5455 "shuffle of 2 fields structure is not"
5456 " supported by target\n");
5457 return false;
5458 }
0761848d 5459 perm2_mask1 = vect_gen_perm_mask_checked (vectype, sel);
926f7a02 5460
5461 for (i = 0; i < nelt / 2; ++i)
5462 sel[i] = i * 2 + 1;
5463 for (i = 0; i < nelt / 2; ++i)
5464 sel[nelt / 2 + i] = i * 2;
5465 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5466 {
5467 if (dump_enabled_p ())
5468 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5469 "shuffle of 2 fields structure is not"
5470 " supported by target\n");
5471 return false;
5472 }
0761848d 5473 perm2_mask2 = vect_gen_perm_mask_checked (vectype, sel);
926f7a02 5474
5475 /* Generating permutation constant to shift all elements.
5476 For vector length 8 it is {4 5 6 7 8 9 10 11}. */
5477 for (i = 0; i < nelt; i++)
5478 sel[i] = nelt / 2 + i;
5479 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5480 {
5481 if (dump_enabled_p ())
5482 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5483 "shift permutation is not supported by target\n");
5484 return false;
5485 }
0761848d 5486 shift1_mask = vect_gen_perm_mask_checked (vectype, sel);
926f7a02 5487
5488 /* Generating permutation constant to select vector from 2.
5489 For vector length 8 it is {0 1 2 3 12 13 14 15}. */
5490 for (i = 0; i < nelt / 2; i++)
5491 sel[i] = i;
5492 for (i = nelt / 2; i < nelt; i++)
5493 sel[i] = nelt + i;
5494 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5495 {
5496 if (dump_enabled_p ())
5497 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5498 "select is not supported by target\n");
5499 return false;
5500 }
0761848d 5501 select_mask = vect_gen_perm_mask_checked (vectype, sel);
926f7a02 5502
2cc1223e 5503 for (i = 0; i < log_length; i++)
5504 {
5505 for (j = 0; j < length; j += 2)
5506 {
5507 first_vect = dr_chain[j];
5508 second_vect = dr_chain[j + 1];
5509
5510 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
e9cf809e 5511 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5512 first_vect, first_vect,
5513 perm2_mask1);
2cc1223e 5514 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5515 vect[0] = data_ref;
5516
5517 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
e9cf809e 5518 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5519 second_vect, second_vect,
5520 perm2_mask2);
2cc1223e 5521 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5522 vect[1] = data_ref;
926f7a02 5523
2cc1223e 5524 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
e9cf809e 5525 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5526 vect[0], vect[1], shift1_mask);
2cc1223e 5527 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5528 (*result_chain)[j/2 + length/2] = data_ref;
5529
5530 data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
e9cf809e 5531 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5532 vect[0], vect[1], select_mask);
2cc1223e 5533 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5534 (*result_chain)[j/2] = data_ref;
5535 }
5536 memcpy (dr_chain.address (), result_chain->address (),
5537 length * sizeof (tree));
5538 }
926f7a02 5539 return true;
5540 }
5541 if (length == 3 && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 2)
5542 {
5543 unsigned int k = 0, l = 0;
5544
5545 /* Generating permutation constant to get all elements in the right order.
5546 For vector length 8 it is {0 3 6 1 4 7 2 5}. */
5547 for (i = 0; i < nelt; i++)
5548 {
5549 if (3 * k + (l % 3) >= nelt)
5550 {
5551 k = 0;
5552 l += (3 - (nelt % 3));
5553 }
5554 sel[i] = 3 * k + (l % 3);
5555 k++;
5556 }
5557 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5558 {
5559 if (dump_enabled_p ())
5560 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5561 "shuffle of 3 fields structure is not"
5562 " supported by target\n");
5563 return false;
5564 }
0761848d 5565 perm3_mask = vect_gen_perm_mask_checked (vectype, sel);
926f7a02 5566
5567 /* Generating permutation constant to shift all elements.
5568 For vector length 8 it is {6 7 8 9 10 11 12 13}. */
5569 for (i = 0; i < nelt; i++)
5570 sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
5571 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5572 {
5573 if (dump_enabled_p ())
5574 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5575 "shift permutation is not supported by target\n");
5576 return false;
5577 }
0761848d 5578 shift1_mask = vect_gen_perm_mask_checked (vectype, sel);
926f7a02 5579
5580 /* Generating permutation constant to shift all elements.
5581 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
5582 for (i = 0; i < nelt; i++)
5583 sel[i] = 2 * (nelt / 3) + 1 + i;
5584 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5585 {
5586 if (dump_enabled_p ())
5587 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5588 "shift permutation is not supported by target\n");
5589 return false;
5590 }
0761848d 5591 shift2_mask = vect_gen_perm_mask_checked (vectype, sel);
926f7a02 5592
5593 /* Generating permutation constant to shift all elements.
5594 For vector length 8 it is {3 4 5 6 7 8 9 10}. */
5595 for (i = 0; i < nelt; i++)
5596 sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
5597 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5598 {
5599 if (dump_enabled_p ())
5600 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5601 "shift permutation is not supported by target\n");
5602 return false;
5603 }
0761848d 5604 shift3_mask = vect_gen_perm_mask_checked (vectype, sel);
926f7a02 5605
5606 /* Generating permutation constant to shift all elements.
5607 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
5608 for (i = 0; i < nelt; i++)
5609 sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
5610 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5611 {
5612 if (dump_enabled_p ())
5613 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5614 "shift permutation is not supported by target\n");
5615 return false;
5616 }
0761848d 5617 shift4_mask = vect_gen_perm_mask_checked (vectype, sel);
926f7a02 5618
5619 for (k = 0; k < 3; k++)
5620 {
321d85d9 5621 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
e9cf809e 5622 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5623 dr_chain[k], dr_chain[k],
5624 perm3_mask);
926f7a02 5625 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5626 vect[k] = data_ref;
5627 }
5628
5629 for (k = 0; k < 3; k++)
5630 {
5631 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
e9cf809e 5632 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5633 vect[k % 3], vect[(k + 1) % 3],
5634 shift1_mask);
926f7a02 5635 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5636 vect_shift[k] = data_ref;
5637 }
5638
5639 for (k = 0; k < 3; k++)
5640 {
5641 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
e9cf809e 5642 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5643 vect_shift[(4 - k) % 3],
5644 vect_shift[(3 - k) % 3],
5645 shift2_mask);
926f7a02 5646 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5647 vect[k] = data_ref;
5648 }
5649
5650 (*result_chain)[3 - (nelt % 3)] = vect[2];
5651
5652 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
e9cf809e 5653 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
5654 vect[0], shift3_mask);
926f7a02 5655 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5656 (*result_chain)[nelt % 3] = data_ref;
5657
5658 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
e9cf809e 5659 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
5660 vect[1], shift4_mask);
926f7a02 5661 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5662 (*result_chain)[0] = data_ref;
5663 return true;
5664 }
5665 return false;
5666}
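/* Illustrative sketch (not part of GCC): computes, for NELT == 8, the five
   selectors used by the LENGTH == 3 branch above and checks the first one
   against the value quoted in the comment ({0 3 6 1 4 7 2 5}); the four
   shift masks are the arithmetic progressions starting at 6, 5, 3 and 5.
   Stand-alone and guarded out, so it is not compiled as part of this
   file.  */
#if 0
#include <assert.h>
#include <stdio.h>

int
main (void)
{
  const unsigned nelt = 8;
  const unsigned expected[8] = { 0, 3, 6, 1, 4, 7, 2, 5 };
  unsigned sel[8], k = 0, l = 0;

  /* perm3_mask: gather elements 0,3,6,..., then 1,4,..., then 2,5,...  */
  for (unsigned i = 0; i < nelt; i++)
    {
      if (3 * k + (l % 3) >= nelt)
	{
	  k = 0;
	  l += 3 - (nelt % 3);
	}
      sel[i] = 3 * k + (l % 3);
      k++;
    }
  for (unsigned i = 0; i < nelt; i++)
    assert (sel[i] == expected[i]);

  /* The shift masks select NELT consecutive elements of the concatenation
     of two vectors, starting at the offsets below.  */
  printf ("shift1 starts at %u\n", 2 * (nelt / 3) + (nelt % 3));	/* 6 */
  printf ("shift2 starts at %u\n", 2 * (nelt / 3) + 1);			/* 5 */
  printf ("shift3 starts at %u\n", nelt / 3 + (nelt % 3) / 2);		/* 3 */
  printf ("shift4 starts at %u\n", 2 * (nelt / 3) + (nelt % 3) / 2);	/* 5 */
  return 0;
}
#endif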
5667
ee612634 5668/* Function vect_transform_grouped_load.
fb85abff 5669
5670 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
5671 to perform their permutation and ascribe the result vectorized statements to
5672 the scalar statements.
5673*/
5674
481fc474 5675void
42acab1c 5676vect_transform_grouped_load (gimple *stmt, vec<tree> dr_chain, int size,
fb85abff 5677 gimple_stmt_iterator *gsi)
5678{
3754d046 5679 machine_mode mode;
1e094109 5680 vec<tree> result_chain = vNULL;
fb85abff 5681
48e1416a 5682 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
5683 RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
fb85abff 5684 vectors, that are ready for vector computation. */
f1f41a6c 5685 result_chain.create (size);
926f7a02 5686
5687 /* If the reassociation width for the vector type is 2 or greater, the target
5688 machine can execute 2 or more vector instructions in parallel. Otherwise
5689 try to get the chain for the load group using vect_shift_permute_load_chain. */
5690 mode = TYPE_MODE (STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)));
5691 if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
6da5aa28 5692 || exact_log2 (size) != -1
926f7a02 5693 || !vect_shift_permute_load_chain (dr_chain, size, stmt,
5694 gsi, &result_chain))
5695 vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain);
ee612634 5696 vect_record_grouped_load_vectors (stmt, result_chain);
f1f41a6c 5697 result_chain.release ();
94b7b4dd 5698}
5699
ee612634 5700/* RESULT_CHAIN contains the output of a group of grouped loads that were
94b7b4dd 5701 generated as part of the vectorization of STMT. Assign the statement
5702 for each vector to the associated scalar statement. */
5703
5704void
42acab1c 5705vect_record_grouped_load_vectors (gimple *stmt, vec<tree> result_chain)
94b7b4dd 5706{
42acab1c 5707 gimple *first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
5708 gimple *next_stmt, *new_stmt;
94b7b4dd 5709 unsigned int i, gap_count;
5710 tree tmp_data_ref;
fb85abff 5711
48e1416a 5712 /* Put a permuted data-ref in the VECTORIZED_STMT field.
5713 Since we scan the chain starting from its first node, their order
fb85abff 5714 corresponds to the order of data-refs in RESULT_CHAIN. */
5715 next_stmt = first_stmt;
5716 gap_count = 1;
f1f41a6c 5717 FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
fb85abff 5718 {
5719 if (!next_stmt)
5720 break;
5721
282bf14c 5722 /* Skip the gaps. Loads created for the gaps will be removed by the dead
5723 code elimination pass later. No need to check for the first stmt in
fb85abff 5724 the group, since it always exists.
21009880 5725 GROUP_GAP is the number of steps in elements from the previous
5726 access (if there is no gap GROUP_GAP is 1). We skip loads that
282bf14c 5727 correspond to the gaps. */
48e1416a 5728 if (next_stmt != first_stmt
21009880 5729 && gap_count < GROUP_GAP (vinfo_for_stmt (next_stmt)))
fb85abff 5730 {
5731 gap_count++;
5732 continue;
5733 }
5734
5735 while (next_stmt)
5736 {
5737 new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
5738 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
5739 copies, and we put the new vector statement in the first available
5740 RELATED_STMT. */
5741 if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
5742 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
5743 else
5744 {
21009880 5745 if (!GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
fb85abff 5746 {
42acab1c 5747 gimple *prev_stmt =
fb85abff 5748 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
42acab1c 5749 gimple *rel_stmt =
fb85abff 5750 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt));
5751 while (rel_stmt)
5752 {
5753 prev_stmt = rel_stmt;
48e1416a 5754 rel_stmt =
fb85abff 5755 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
5756 }
5757
48e1416a 5758 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) =
fb85abff 5759 new_stmt;
5760 }
5761 }
5762
21009880 5763 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
fb85abff 5764 gap_count = 1;
5765 /* If NEXT_STMT accesses the same DR as the previous statement,
5766 put the same TMP_DATA_REF as its vectorized statement; otherwise
5767 get the next data-ref from RESULT_CHAIN. */
21009880 5768 if (!next_stmt || !GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
fb85abff 5769 break;
5770 }
5771 }
fb85abff 5772}
5773
5774/* Function vect_can_force_dr_alignment_p.
5775
5776 Returns whether DECL can be forced to be aligned on an ALIGNMENT-bit
5777 boundary. */
5778
48e1416a 5779bool
fb85abff 5780vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
5781{
5782 if (TREE_CODE (decl) != VAR_DECL)
5783 return false;
5784
331d5983 5785 if (decl_in_symtab_p (decl)
5786 && !symtab_node::get (decl)->can_increase_alignment_p ())
8cab13cf 5787 return false;
5788
fb85abff 5789 if (TREE_STATIC (decl))
5790 return (alignment <= MAX_OFILE_ALIGNMENT);
5791 else
5792 return (alignment <= MAX_STACK_ALIGNMENT);
5793}
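/* Illustrative sketch (not part of GCC): a user-level view of what the check
   above allows.  For a definition this translation unit owns (local_data
   below) the vectorizer may raise the declared alignment, up to
   MAX_OFILE_ALIGNMENT; for a declaration whose definition lives elsewhere
   (external_data) it must work with the alignment as declared.  The symbol
   names are made up.  Guarded out so it is not compiled as part of this
   file.  */
#if 0
static float local_data[1024];		/* alignment may be forced higher */
extern float external_data[1024];	/* alignment cannot be changed here */

float
sum_local (void)
{
  float s = 0.0f;
  for (int i = 0; i < 1024; i++)	/* aligned vector loads possible */
    s += local_data[i];
  return s;
}

float
sum_external (void)
{
  float s = 0.0f;
  for (int i = 0; i < 1024; i++)	/* may need peeling/unaligned loads */
    s += external_data[i];
  return s;
}
#endif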
5794
fb85abff 5795
0822b158 5796/* Return whether the data reference DR is supported with respect to its
5797 alignment.
5798 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
5799 if it is aligned, i.e., check if it is possible to vectorize it with a different
fb85abff 5800 alignment. */
5801
5802enum dr_alignment_support
0822b158 5803vect_supportable_dr_alignment (struct data_reference *dr,
5804 bool check_aligned_accesses)
fb85abff 5805{
42acab1c 5806 gimple *stmt = DR_STMT (dr);
fb85abff 5807 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5808 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3754d046 5809 machine_mode mode = TYPE_MODE (vectype);
37545e54 5810 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5811 struct loop *vect_loop = NULL;
5812 bool nested_in_vect_loop = false;
fb85abff 5813
0822b158 5814 if (aligned_access_p (dr) && !check_aligned_accesses)
fb85abff 5815 return dr_aligned;
5816
c71d3c24 5817 /* For now assume all conditional loads/stores support unaligned
5818 access without any special code. */
5819 if (is_gimple_call (stmt)
5820 && gimple_call_internal_p (stmt)
5821 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
5822 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
5823 return dr_unaligned_supported;
5824
ad074595 5825 if (loop_vinfo)
5826 {
5827 vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
5828 nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
5829 }
37545e54 5830
fb85abff 5831 /* Possibly unaligned access. */
5832
5833 /* We can choose between using the implicit realignment scheme (generating
5834 a misaligned_move stmt) and the explicit realignment scheme (generating
282bf14c 5835 aligned loads with a REALIGN_LOAD). There are two variants to the
5836 explicit realignment scheme: optimized, and unoptimized.
fb85abff 5837 We can optimize the realignment only if the step between consecutive
5838 vector loads is equal to the vector size. Since the vector memory
5839 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
5840 is guaranteed that the misalignment amount remains the same throughout the
5841 execution of the vectorized loop. Therefore, we can create the
5842 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
5843 at the loop preheader.
5844
5845 However, in the case of outer-loop vectorization, when vectorizing a
5846 memory access in the inner-loop nested within the LOOP that is now being
5847 vectorized, while it is guaranteed that the misalignment of the
5848 vectorized memory access will remain the same in different outer-loop
5849 iterations, it is *not* guaranteed that it will remain the same throughout
5850 the execution of the inner-loop. This is because the inner-loop advances
5851 with the original scalar step (and not in steps of VS). If the inner-loop
5852 step happens to be a multiple of VS, then the misalignment remains fixed
5853 and we can use the optimized realignment scheme. For example:
5854
5855 for (i=0; i<N; i++)
5856 for (j=0; j<M; j++)
5857 s += a[i+j];
5858
5859 When vectorizing the i-loop in the above example, the step between
5860 consecutive vector loads is 1, and so the misalignment does not remain
5861 fixed across the execution of the inner-loop, and the realignment cannot
5862 be optimized (as illustrated in the following pseudo vectorized loop):
5863
5864 for (i=0; i<N; i+=4)
5865 for (j=0; j<M; j++){
5866 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
5867 // when j is {0,1,2,3,4,5,6,7,...} respectively.
5868 // (assuming that we start from an aligned address).
5869 }
5870
5871 We therefore have to use the unoptimized realignment scheme:
5872
5873 for (i=0; i<N; i+=4)
5874 for (j=k; j<M; j+=4)
5875 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
5876 // that the misalignment of the initial address is
5877 // 0).
5878
5879 The loop can then be vectorized as follows:
5880
5881 for (k=0; k<4; k++){
5882 rt = get_realignment_token (&vp[k]);
5883 for (i=0; i<N; i+=4){
5884 v1 = vp[i+k];
5885 for (j=k; j<M; j+=4){
5886 v2 = vp[i+j+VS-1];
5887 va = REALIGN_LOAD <v1,v2,rt>;
5888 vs += va;
5889 v1 = v2;
5890 }
5891 }
5892 } */
5893
5894 if (DR_IS_READ (dr))
5895 {
c6b19c5f 5896 bool is_packed = false;
5897 tree type = (TREE_TYPE (DR_REF (dr)));
5898
d6bf3b14 5899 if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
fb85abff 5900 && (!targetm.vectorize.builtin_mask_for_load
5901 || targetm.vectorize.builtin_mask_for_load ()))
5902 {
5903 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
ad074595 5904 if ((nested_in_vect_loop
f9ae6f95 5905 && (TREE_INT_CST_LOW (DR_STEP (dr))
ad074595 5906 != GET_MODE_SIZE (TYPE_MODE (vectype))))
5907 || !loop_vinfo)
fb85abff 5908 return dr_explicit_realign;
5909 else
5910 return dr_explicit_realign_optimized;
5911 }
c6b19c5f 5912 if (!known_alignment_for_access_p (dr))
cfa724cf 5913 is_packed = not_size_aligned (DR_REF (dr));
48e1416a 5914
2cd0995e 5915 if ((TYPE_USER_ALIGN (type) && !is_packed)
5916 || targetm.vectorize.
5917 support_vector_misalignment (mode, type,
5918 DR_MISALIGNMENT (dr), is_packed))
fb85abff 5919 /* Can't software pipeline the loads, but can at least do them. */
5920 return dr_unaligned_supported;
5921 }
c6b19c5f 5922 else
5923 {
5924 bool is_packed = false;
5925 tree type = (TREE_TYPE (DR_REF (dr)));
fb85abff 5926
c6b19c5f 5927 if (!known_alignment_for_access_p (dr))
cfa724cf 5928 is_packed = not_size_aligned (DR_REF (dr));
48e1416a 5929
2cd0995e 5930 if ((TYPE_USER_ALIGN (type) && !is_packed)
5931 || targetm.vectorize.
5932 support_vector_misalignment (mode, type,
5933 DR_MISALIGNMENT (dr), is_packed))
c6b19c5f 5934 return dr_unaligned_supported;
5935 }
48e1416a 5936
fb85abff 5937 /* Unsupported. */
5938 return dr_unaligned_unsupported;
5939}
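/* Illustrative sketch (not part of GCC): demonstrates the point made in the
   comment above about outer-loop vectorization.  With the outer index I
   fixed, the misalignment of &a[i + j] modulo a 16-byte vector size changes
   on every inner-loop iteration, so the realignment token cannot be hoisted
   out of the inner loop.  Stand-alone and guarded out, so it is not compiled
   as part of this file.  */
#if 0
#include <stdint.h>
#include <stdio.h>

int
main (void)
{
  static float a[64] __attribute__ ((aligned (16)));
  const int i = 0;

  for (int j = 0; j < 8; j++)
    printf ("j=%d misalignment=%u\n", j,
	    (unsigned) ((uintptr_t) &a[i + j] % 16));
  /* Prints 0 4 8 12 0 4 8 12: not invariant across the j-loop, unlike the
     misalignment across iterations of the (vectorized) i-loop.  */
  return 0;
}
#endif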