48e1416a 1/* Data References Analysis and Manipulation Utilities for Vectorization.
d353bf18 2 Copyright (C) 2003-2015 Free Software Foundation, Inc.
48e1416a 3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
fb85abff 4 and Ira Rosen <irar@il.ibm.com>
5
6This file is part of GCC.
7
8GCC is free software; you can redistribute it and/or modify it under
9the terms of the GNU General Public License as published by the Free
10Software Foundation; either version 3, or (at your option) any later
11version.
12
13GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14WARRANTY; without even the implied warranty of MERCHANTABILITY or
15FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16for more details.
17
18You should have received a copy of the GNU General Public License
19along with GCC; see the file COPYING3. If not see
20<http://www.gnu.org/licenses/>. */
21
22#include "config.h"
23#include "system.h"
24#include "coretypes.h"
7bd765d4 25#include "dumpfile.h"
fb85abff 26#include "tm.h"
b20a8bb4 27#include "alias.h"
28#include "symtab.h"
fb85abff 29#include "tree.h"
b20a8bb4 30#include "fold-const.h"
9ed99284 31#include "stor-layout.h"
218a4f88 32#include "tm_p.h"
fb85abff 33#include "target.h"
94ea8568 34#include "predict.h"
94ea8568 35#include "hard-reg-set.h"
94ea8568 36#include "function.h"
37#include "dominance.h"
38#include "cfg.h"
fb85abff 39#include "basic-block.h"
ce084dfc 40#include "gimple-pretty-print.h"
bc61cadb 41#include "tree-ssa-alias.h"
42#include "internal-fn.h"
43#include "tree-eh.h"
44#include "gimple-expr.h"
073c1fd5 45#include "gimple.h"
a8783bee 46#include "gimplify.h"
dcf1a1ec 47#include "gimple-iterator.h"
e795d6e1 48#include "gimplify-me.h"
073c1fd5 49#include "gimple-ssa.h"
50#include "tree-phinodes.h"
51#include "ssa-iterators.h"
9ed99284 52#include "stringpool.h"
073c1fd5 53#include "tree-ssanames.h"
05d9c18a 54#include "tree-ssa-loop-ivopts.h"
55#include "tree-ssa-loop-manip.h"
073c1fd5 56#include "tree-ssa-loop.h"
fb85abff 57#include "cfgloop.h"
fb85abff 58#include "tree-chrec.h"
59#include "tree-scalar-evolution.h"
60#include "tree-vectorizer.h"
0b205f4c 61#include "diagnostic-core.h"
1140c305 62#include "plugin-api.h"
63#include "ipa-ref.h"
d09768a4 64#include "cgraph.h"
8e3cb73b 65/* Need to include rtl.h, expr.h, etc. for optabs. */
d53441c8 66#include "rtl.h"
67#include "flags.h"
d53441c8 68#include "insn-config.h"
69#include "expmed.h"
70#include "dojump.h"
71#include "explow.h"
72#include "calls.h"
73#include "emit-rtl.h"
74#include "varasm.h"
75#include "stmt.h"
8e3cb73b 76#include "expr.h"
34517c64 77#include "insn-codes.h"
8e3cb73b 78#include "optabs.h"
f7715905 79#include "builtins.h"
fb85abff 80
94b7b4dd 81/* Return true if load- or store-lanes optab OPTAB is implemented for
82 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
83
84static bool
85vect_lanes_optab_supported_p (const char *name, convert_optab optab,
86 tree vectype, unsigned HOST_WIDE_INT count)
87{
3754d046 88 machine_mode mode, array_mode;
94b7b4dd 89 bool limit_p;
90
91 mode = TYPE_MODE (vectype);
92 limit_p = !targetm.array_mode_supported_p (mode, count);
93 array_mode = mode_for_size (count * GET_MODE_BITSIZE (mode),
94 MODE_INT, limit_p);
95
96 if (array_mode == BLKmode)
97 {
6d8fb6cf 98 if (dump_enabled_p ())
78bb46f5 99 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
100 "no array mode for %s[" HOST_WIDE_INT_PRINT_DEC "]\n",
7bd765d4 101 GET_MODE_NAME (mode), count);
94b7b4dd 102 return false;
103 }
104
105 if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
106 {
6d8fb6cf 107 if (dump_enabled_p ())
7bd765d4 108 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
78bb46f5 109 "cannot use %s<%s><%s>\n", name,
7bd765d4 110 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
94b7b4dd 111 return false;
112 }
113
6d8fb6cf 114 if (dump_enabled_p ())
7bd765d4 115 dump_printf_loc (MSG_NOTE, vect_location,
78bb46f5 116 "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
7bd765d4 117 GET_MODE_NAME (mode));
94b7b4dd 118
119 return true;
120}
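
/* For illustration, a grouped access such as

     for (i = 0; i < n; i++)
       {
         x = a[3 * i];
         y = a[3 * i + 1];
         z = a[3 * i + 2];
       }

   can be implemented with a load-lanes style instruction (for example
   AArch64 LD3) only when the target provides an integer mode wide enough
   to hold COUNT == 3 vectors and a handler for the conversion optab on
   that mode, which is exactly what the predicate above checks.  */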
121
122
fb85abff 123/* Return the smallest scalar type used in STMT.
282bf14c 124 This is used to determine the vectype of the stmt. We generally set the
125 vectype according to the type of the result (lhs). For stmts whose
fb85abff 126 result-type is different from the type of the arguments (e.g., demotion,
48e1416a 127 promotion), vectype will be reset appropriately (later). Note that we have
fb85abff 128 to visit the smallest datatype in this function, because that determines the
282bf14c 129 VF. If the smallest datatype in the loop is present only as the rhs of a
fb85abff 130 promotion operation - we'd miss it.
131 Such a case, where a variable of this datatype does not appear in the lhs
132 anywhere in the loop, can only occur if it's an invariant: e.g.:
48e1416a 133 'int_x = (int) short_inv', which we'd expect to have been optimized away by
282bf14c 134 invariant motion. However, we cannot rely on invariant motion to always
135 take invariants out of the loop, and so in the case of promotion we also
136 have to check the rhs.
fb85abff 137 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
138 types. */
139
140tree
141vect_get_smallest_scalar_type (gimple stmt, HOST_WIDE_INT *lhs_size_unit,
142 HOST_WIDE_INT *rhs_size_unit)
143{
144 tree scalar_type = gimple_expr_type (stmt);
145 HOST_WIDE_INT lhs, rhs;
146
f9ae6f95 147 lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
fb85abff 148
149 if (is_gimple_assign (stmt)
150 && (gimple_assign_cast_p (stmt)
151 || gimple_assign_rhs_code (stmt) == WIDEN_MULT_EXPR
1c928a0c 152 || gimple_assign_rhs_code (stmt) == WIDEN_LSHIFT_EXPR
fb85abff 153 || gimple_assign_rhs_code (stmt) == FLOAT_EXPR))
154 {
155 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
156
f9ae6f95 157 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
fb85abff 158 if (rhs < lhs)
159 scalar_type = rhs_type;
160 }
48e1416a 161
162 *lhs_size_unit = lhs;
fb85abff 163 *rhs_size_unit = rhs;
164 return scalar_type;
165}
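
/* For example, in

     short *s;
     int *d;
     for (i = 0; i < n; i++)
       d[i] = (int) s[i];

   the result type of the assignment is int, but the smallest scalar type
   involved is short, and it is the short accesses that determine the
   vectorization factor (twice as many shorts as ints fit in one vector),
   which is why the rhs type is inspected above for promotions.  */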
166
167
fb85abff 168/* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
169 tested at run-time. Return TRUE if DDR was successfully inserted.
170 Return false if versioning is not supported. */
171
172static bool
173vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
174{
175 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
176
177 if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
178 return false;
179
6d8fb6cf 180 if (dump_enabled_p ())
fb85abff 181 {
7bd765d4 182 dump_printf_loc (MSG_NOTE, vect_location,
183 "mark for run-time aliasing test between ");
184 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr)));
185 dump_printf (MSG_NOTE, " and ");
186 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr)));
78bb46f5 187 dump_printf (MSG_NOTE, "\n");
fb85abff 188 }
189
190 if (optimize_loop_nest_for_size_p (loop))
191 {
6d8fb6cf 192 if (dump_enabled_p ())
78bb46f5 193 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
194 "versioning not supported when optimizing"
195 " for size.\n");
fb85abff 196 return false;
197 }
198
199 /* FORNOW: We don't support versioning with outer-loop vectorization. */
200 if (loop->inner)
201 {
6d8fb6cf 202 if (dump_enabled_p ())
78bb46f5 203 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
204 "versioning not yet supported for outer-loops.\n");
fb85abff 205 return false;
206 }
207
f634c3e9 208 /* FORNOW: We don't support creating runtime alias tests for non-constant
209 step. */
210 if (TREE_CODE (DR_STEP (DDR_A (ddr))) != INTEGER_CST
211 || TREE_CODE (DR_STEP (DDR_B (ddr))) != INTEGER_CST)
212 {
6d8fb6cf 213 if (dump_enabled_p ())
78bb46f5 214 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7bd765d4 215 "versioning not yet supported for non-constant "
78bb46f5 216 "step\n");
f634c3e9 217 return false;
218 }
219
f1f41a6c 220 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
fb85abff 221 return true;
222}
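
/* For the DDRs recorded above the vectorizer later emits a run-time
   disambiguation test.  For two data-refs with constant step the
   generated check is roughly of the form

     if (a + segment_len <= b || b + segment_len <= a)
       ... vectorized loop ...
     else
       ... scalar loop ...

   where segment_len covers the bytes accessed by the vectorized loop;
   this is also why a non-constant step or an enclosing outer loop makes
   this kind of versioning unsupported here.  */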
223
37545e54 224
fb85abff 225/* Function vect_analyze_data_ref_dependence.
226
227 Return TRUE if there (might) exist a dependence between a memory-reference
228 DRA and a memory-reference DRB. When versioning for alias can handle the
91a74fc6 229 dependence with a run-time check, return FALSE. Adjust *MAX_VF according to
230 the data dependence. */
48e1416a 231
fb85abff 232static bool
233vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
80508571 234 loop_vec_info loop_vinfo, int *max_vf)
fb85abff 235{
236 unsigned int i;
68f15e9d 237 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
fb85abff 238 struct data_reference *dra = DDR_A (ddr);
239 struct data_reference *drb = DDR_B (ddr);
48e1416a 240 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
fb85abff 241 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
fb85abff 242 lambda_vector dist_v;
243 unsigned int loop_depth;
48e1416a 244
68f15e9d 245 /* In loop analysis all data references should be vectorizable. */
6ea6a380 246 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
247 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
68f15e9d 248 gcc_unreachable ();
6ea6a380 249
68f15e9d 250 /* Independent data accesses. */
fb85abff 251 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
68f15e9d 252 return false;
37545e54 253
68f15e9d 254 if (dra == drb
255 || (DR_IS_READ (dra) && DR_IS_READ (drb)))
fb85abff 256 return false;
48e1416a 257
0f52e33a 258 /* Even if we have an anti-dependence then, as the vectorized loop covers at
259 least two scalar iterations, there is always also a true dependence.
260 As the vectorizer does not re-order loads and stores we can ignore
261 the anti-dependence if TBAA can disambiguate both DRs similar to the
262 case with known negative distance anti-dependences (positive
263 distance anti-dependences would violate TBAA constraints). */
264 if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
265 || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
266 && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
267 get_alias_set (DR_REF (drb))))
268 return false;
48e1416a 269
68f15e9d 270 /* Unknown data dependence. */
fb85abff 271 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
272 {
3d483a94 273 /* If user asserted safelen consecutive iterations can be
274 executed concurrently, assume independence. */
275 if (loop->safelen >= 2)
276 {
277 if (loop->safelen < *max_vf)
278 *max_vf = loop->safelen;
c7a8722c 279 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
3d483a94 280 return false;
281 }
282
95e19962 283 if (STMT_VINFO_GATHER_P (stmtinfo_a)
284 || STMT_VINFO_GATHER_P (stmtinfo_b))
285 {
286 if (dump_enabled_p ())
287 {
288 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
289 "versioning for alias not supported for: "
290 "can't determine dependence between ");
291 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
292 DR_REF (dra));
293 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
294 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
295 DR_REF (drb));
78bb46f5 296 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
95e19962 297 }
4c2623ad 298 return true;
95e19962 299 }
300
6d8fb6cf 301 if (dump_enabled_p ())
68f15e9d 302 {
303 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
304 "versioning for alias required: "
305 "can't determine dependence between ");
306 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
307 DR_REF (dra));
308 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
309 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
310 DR_REF (drb));
78bb46f5 311 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
68f15e9d 312 }
d4b21757 313
68f15e9d 314 /* Add to list of ddrs that need to be tested at run-time. */
315 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
37545e54 316 }
317
68f15e9d 318 /* Known data dependence. */
fb85abff 319 if (DDR_NUM_DIST_VECTS (ddr) == 0)
320 {
3d483a94 321 /* If user asserted safelen consecutive iterations can be
322 executed concurrently, assume independence. */
323 if (loop->safelen >= 2)
324 {
325 if (loop->safelen < *max_vf)
326 *max_vf = loop->safelen;
c7a8722c 327 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
3d483a94 328 return false;
329 }
330
95e19962 331 if (STMT_VINFO_GATHER_P (stmtinfo_a)
332 || STMT_VINFO_GATHER_P (stmtinfo_b))
333 {
334 if (dump_enabled_p ())
335 {
336 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
337 "versioning for alias not supported for: "
338 "bad dist vector for ");
339 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
340 DR_REF (dra));
341 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
342 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
343 DR_REF (drb));
78bb46f5 344 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
95e19962 345 }
4c2623ad 346 return true;
95e19962 347 }
348
6d8fb6cf 349 if (dump_enabled_p ())
fb85abff 350 {
78bb46f5 351 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7bd765d4 352 "versioning for alias required: "
353 "bad dist vector for ");
354 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
355 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
356 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
78bb46f5 357 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
fb85abff 358 }
359 /* Add to list of ddrs that need to be tested at run-time. */
360 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
48e1416a 361 }
fb85abff 362
363 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
f1f41a6c 364 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
fb85abff 365 {
366 int dist = dist_v[loop_depth];
367
6d8fb6cf 368 if (dump_enabled_p ())
7bd765d4 369 dump_printf_loc (MSG_NOTE, vect_location,
78bb46f5 370 "dependence distance = %d.\n", dist);
fb85abff 371
91a74fc6 372 if (dist == 0)
fb85abff 373 {
6d8fb6cf 374 if (dump_enabled_p ())
fb85abff 375 {
78bb46f5 376 dump_printf_loc (MSG_NOTE, vect_location,
377 "dependence distance == 0 between ");
7bd765d4 378 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
379 dump_printf (MSG_NOTE, " and ");
380 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
78bb46f5 381 dump_printf (MSG_NOTE, "\n");
fb85abff 382 }
383
4d525783 384 /* When we perform grouped accesses and perform implicit CSE
385 by detecting equal accesses and doing disambiguation with
386 runtime alias tests like for
387 .. = a[i];
388 .. = a[i+1];
389 a[i] = ..;
390 a[i+1] = ..;
391 *p = ..;
392 .. = a[i];
393 .. = a[i+1];
394 where we will end up loading { a[i], a[i+1] } once, make
395 sure that inserting group loads before the first load and
5a91be9e 396 stores after the last store will do the right thing.
397 Similar for groups like
398 a[i] = ...;
399 ... = a[i];
400 a[i+1] = ...;
401 where loads from the group interleave with the store. */
402 if (STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
403 || STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
4d525783 404 {
405 gimple earlier_stmt;
406 earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
407 if (DR_IS_WRITE
408 (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
409 {
410 if (dump_enabled_p ())
411 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
78bb46f5 412 "READ_WRITE dependence in interleaving."
413 "\n");
4d525783 414 return true;
415 }
fb85abff 416 }
48e1416a 417
91a74fc6 418 continue;
419 }
420
421 if (dist > 0 && DDR_REVERSED_P (ddr))
422 {
423 /* If DDR_REVERSED_P the order of the data-refs in DDR was
424 reversed (to make distance vector positive), and the actual
425 distance is negative. */
6d8fb6cf 426 if (dump_enabled_p ())
7bd765d4 427 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
78bb46f5 428 "dependence distance negative.\n");
a8cf7702 429 /* Record a negative dependence distance to later limit the
430 amount of stmt copying / unrolling we can perform.
431 Only need to handle read-after-write dependence. */
432 if (DR_IS_READ (drb)
433 && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
434 || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
435 STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
91a74fc6 436 continue;
437 }
438
439 if (abs (dist) >= 2
440 && abs (dist) < *max_vf)
441 {
442 /* The dependence distance requires reduction of the maximal
443 vectorization factor. */
444 *max_vf = abs (dist);
6d8fb6cf 445 if (dump_enabled_p ())
7bd765d4 446 dump_printf_loc (MSG_NOTE, vect_location,
78bb46f5 447 "adjusting maximal vectorization factor to %i\n",
448 *max_vf);
fb85abff 449 }
450
91a74fc6 451 if (abs (dist) >= *max_vf)
fb85abff 452 {
48e1416a 453 /* Dependence distance does not create dependence, as far as
91a74fc6 454 vectorization is concerned, in this case. */
6d8fb6cf 455 if (dump_enabled_p ())
7bd765d4 456 dump_printf_loc (MSG_NOTE, vect_location,
78bb46f5 457 "dependence distance >= VF.\n");
fb85abff 458 continue;
459 }
460
6d8fb6cf 461 if (dump_enabled_p ())
fb85abff 462 {
7bd765d4 463 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
78bb46f5 464 "not vectorized, possible dependence "
465 "between data-refs ");
7bd765d4 466 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
467 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
468 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
78bb46f5 469 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
fb85abff 470 }
471
472 return true;
473 }
474
475 return false;
476}
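
/* For example, in

     for (i = 0; i < n; i++)
       a[i + 3] = a[i] + 1;

   the dependence distance is 3, so at most 3 iterations can be combined
   into one vector iteration and *MAX_VF is reduced to 3 above; a zero
   distance or a distance of at least the vectorization factor does not
   restrict vectorization.  */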
477
478/* Function vect_analyze_data_ref_dependences.
48e1416a 479
fb85abff 480 Examine all the data references in the loop, and make sure there do not
91a74fc6 481 exist any data dependences between them. Set *MAX_VF according to
482 the maximum vectorization factor the data dependences allow. */
48e1416a 483
fb85abff 484bool
68f15e9d 485vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo, int *max_vf)
fb85abff 486{
487 unsigned int i;
fb85abff 488 struct data_dependence_relation *ddr;
489
6d8fb6cf 490 if (dump_enabled_p ())
7bd765d4 491 dump_printf_loc (MSG_NOTE, vect_location,
78bb46f5 492 "=== vect_analyze_data_ref_dependences ===\n");
68f15e9d 493
c7a8722c 494 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
68f15e9d 495 if (!compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
496 &LOOP_VINFO_DDRS (loop_vinfo),
497 LOOP_VINFO_LOOP_NEST (loop_vinfo), true))
498 return false;
499
500 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
501 if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf))
502 return false;
503
504 return true;
505}
506
507
508/* Function vect_slp_analyze_data_ref_dependence.
509
510 Return TRUE if there (might) exist a dependence between a memory-reference
511 DRA and a memory-reference DRB that would prevent vectorization of
512 the basic block. */
514
515static bool
516vect_slp_analyze_data_ref_dependence (struct data_dependence_relation *ddr)
517{
518 struct data_reference *dra = DDR_A (ddr);
519 struct data_reference *drb = DDR_B (ddr);
520
521 /* We need to check dependences of statements marked as unvectorizable
522 as well; they can still prohibit vectorization.
523
524 /* Independent data accesses. */
525 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
526 return false;
527
528 if (dra == drb)
529 return false;
530
531 /* Read-read is OK. */
532 if (DR_IS_READ (dra) && DR_IS_READ (drb))
533 return false;
534
1fa434e3 535 /* If dra and drb are part of the same interleaving chain consider
536 them independent. */
537 if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (DR_STMT (dra)))
538 && (GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dra)))
539 == GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (drb)))))
540 return false;
541
68f15e9d 542 /* Unknown data dependence. */
543 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
07e3bcbf 544 {
50e6c257 545 if (dump_enabled_p ())
546 {
547 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
548 "can't determine dependence between ");
549 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
550 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
551 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
78bb46f5 552 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
50e6c257 553 }
07e3bcbf 554 }
50e6c257 555 else if (dump_enabled_p ())
07e3bcbf 556 {
68f15e9d 557 dump_printf_loc (MSG_NOTE, vect_location,
558 "determined dependence between ");
559 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
560 dump_printf (MSG_NOTE, " and ");
561 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
78bb46f5 562 dump_printf (MSG_NOTE, "\n");
07e3bcbf 563 }
48e1416a 564
50e6c257 565 /* We do not vectorize basic blocks with write-write dependencies. */
68f15e9d 566 if (DR_IS_WRITE (dra) && DR_IS_WRITE (drb))
567 return true;
568
50e6c257 569 /* If we have a read-write dependence, check that the load is before the store.
68f15e9d 570 When we vectorize basic blocks, vector load can be only before
571 corresponding scalar load, and vector store can be only after its
572 corresponding scalar store. So the order of the accesses is preserved in
573 case the load is before the store. */
50e6c257 574 gimple earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
68f15e9d 575 if (DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
50e6c257 576 {
577 /* That only holds for load-store pairs taking part in vectorization. */
578 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dra)))
579 && STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (drb))))
580 return false;
581 }
68f15e9d 582
583 return true;
584}
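
/* For example, a basic block containing

     x0 = a[0];
     x1 = a[1];
     a[0] = y0;
     a[1] = y1;

   is accepted: the vector load is emitted at the position of the first
   scalar load and the vector store at the position of the last scalar
   store, so the scalar read-before-write order is preserved.  If the
   stores appeared before the loads of the same locations, the dependence
   would be reported above and the block not vectorized.  */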
585
586
587/* Function vect_slp_analyze_data_ref_dependences.
588
589 Examine all the data references in the basic-block, and make sure there
590 do not exist any data dependences between them that would prevent
591 vectorization of the basic block. */
592
593bool
594vect_slp_analyze_data_ref_dependences (bb_vec_info bb_vinfo)
595{
596 struct data_dependence_relation *ddr;
597 unsigned int i;
598
599 if (dump_enabled_p ())
600 dump_printf_loc (MSG_NOTE, vect_location,
78bb46f5 601 "=== vect_slp_analyze_data_ref_dependences ===\n");
68f15e9d 602
603 if (!compute_all_dependences (BB_VINFO_DATAREFS (bb_vinfo),
604 &BB_VINFO_DDRS (bb_vinfo),
605 vNULL, true))
606 return false;
607
608 FOR_EACH_VEC_ELT (BB_VINFO_DDRS (bb_vinfo), i, ddr)
609 if (vect_slp_analyze_data_ref_dependence (ddr))
fb85abff 610 return false;
611
612 return true;
613}
614
615
616/* Function vect_compute_data_ref_alignment
617
618 Compute the misalignment of the data reference DR.
619
620 Output:
621 1. If during the misalignment computation it is found that the data reference
622 cannot be vectorized then false is returned.
623 2. DR_MISALIGNMENT (DR) is defined.
624
625 FOR NOW: No analysis is actually performed. Misalignment is calculated
626 only for trivial cases. TODO. */
627
628static bool
629vect_compute_data_ref_alignment (struct data_reference *dr)
630{
631 gimple stmt = DR_STMT (dr);
48e1416a 632 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
fb85abff 633 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
37545e54 634 struct loop *loop = NULL;
fb85abff 635 tree ref = DR_REF (dr);
636 tree vectype;
637 tree base, base_addr;
638 bool base_aligned;
994be998 639 tree misalign = NULL_TREE;
b4142d58 640 tree aligned_to;
641 unsigned HOST_WIDE_INT alignment;
48e1416a 642
6d8fb6cf 643 if (dump_enabled_p ())
7bd765d4 644 dump_printf_loc (MSG_NOTE, vect_location,
78bb46f5 645 "vect_compute_data_ref_alignment:\n");
fb85abff 646
37545e54 647 if (loop_vinfo)
648 loop = LOOP_VINFO_LOOP (loop_vinfo);
48e1416a 649
fb85abff 650 /* Initialize misalignment to unknown. */
651 SET_DR_MISALIGNMENT (dr, -1);
652
e1c75243 653 /* Strided accesses perform only component accesses; misalignment information
a72a38f4 654 is irrelevant for them. */
e1c75243 655 if (STMT_VINFO_STRIDED_P (stmt_info)
994be998 656 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
a72a38f4 657 return true;
658
994be998 659 if (tree_fits_shwi_p (DR_STEP (dr)))
660 misalign = DR_INIT (dr);
fb85abff 661 aligned_to = DR_ALIGNED_TO (dr);
662 base_addr = DR_BASE_ADDRESS (dr);
663 vectype = STMT_VINFO_VECTYPE (stmt_info);
664
665 /* In case the dataref is in an inner-loop of the loop that is being
666 vectorized (LOOP), we use the base and misalignment information
282bf14c 667 relative to the outer-loop (LOOP). This is ok only if the misalignment
fb85abff 668 stays the same throughout the execution of the inner-loop, which is why
669 we have to check that the stride of the dataref in the inner-loop is a
670 multiple of the vector size. */
37545e54 671 if (loop && nested_in_vect_loop_p (loop, stmt))
fb85abff 672 {
673 tree step = DR_STEP (dr);
48e1416a 674
994be998 675 if (tree_fits_shwi_p (step)
676 && tree_to_shwi (step) % GET_MODE_SIZE (TYPE_MODE (vectype)) == 0)
fb85abff 677 {
6d8fb6cf 678 if (dump_enabled_p ())
7bd765d4 679 dump_printf_loc (MSG_NOTE, vect_location,
78bb46f5 680 "inner step divides the vector-size.\n");
fb85abff 681 misalign = STMT_VINFO_DR_INIT (stmt_info);
682 aligned_to = STMT_VINFO_DR_ALIGNED_TO (stmt_info);
683 base_addr = STMT_VINFO_DR_BASE_ADDRESS (stmt_info);
684 }
685 else
686 {
6d8fb6cf 687 if (dump_enabled_p ())
7bd765d4 688 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
78bb46f5 689 "inner step doesn't divide the vector-size.\n");
fb85abff 690 misalign = NULL_TREE;
691 }
692 }
693
38682b67 694 /* Similarly, if we're doing basic-block vectorization, we can only use
695 base and misalignment information relative to an innermost loop if the
696 misalignment stays the same throughout the execution of the loop.
697 As above, this is the case if the stride of the dataref is a multiple
698 of the vector size. */
699 if (!loop)
700 {
701 tree step = DR_STEP (dr);
38682b67 702
994be998 703 if (tree_fits_shwi_p (step)
704 && tree_to_shwi (step) % GET_MODE_SIZE (TYPE_MODE (vectype)) != 0)
38682b67 705 {
6d8fb6cf 706 if (dump_enabled_p ())
78bb46f5 707 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
708 "SLP: step doesn't divide the vector-size.\n");
38682b67 709 misalign = NULL_TREE;
710 }
711 }
712
b4142d58 713 alignment = TYPE_ALIGN_UNIT (vectype);
fb85abff 714
b4142d58 715 if ((compare_tree_int (aligned_to, alignment) < 0)
fb85abff 716 || !misalign)
717 {
6d8fb6cf 718 if (dump_enabled_p ())
fb85abff 719 {
7bd765d4 720 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
78bb46f5 721 "Unknown alignment for access: ");
b4142d58 722 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
78bb46f5 723 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
fb85abff 724 }
725 return true;
726 }
727
b4142d58 728 /* To look at alignment of the base we have to preserve an inner MEM_REF
729 as that carries alignment information of the actual access. */
730 base = ref;
731 while (handled_component_p (base))
732 base = TREE_OPERAND (base, 0);
733 if (TREE_CODE (base) == MEM_REF)
734 base = build2 (MEM_REF, TREE_TYPE (base), base_addr,
735 build_int_cst (TREE_TYPE (TREE_OPERAND (base, 1)), 0));
736
737 if (get_object_alignment (base) >= TYPE_ALIGN (vectype))
fb85abff 738 base_aligned = true;
739 else
48e1416a 740 base_aligned = false;
fb85abff 741
48e1416a 742 if (!base_aligned)
fb85abff 743 {
b4142d58 744 /* Strip an inner MEM_REF to a bare decl if possible. */
745 if (TREE_CODE (base) == MEM_REF
746 && integer_zerop (TREE_OPERAND (base, 1))
747 && TREE_CODE (TREE_OPERAND (base, 0)) == ADDR_EXPR)
748 base = TREE_OPERAND (TREE_OPERAND (base, 0), 0);
749
5c8bc079 750 if (!vect_can_force_dr_alignment_p (base, TYPE_ALIGN (vectype)))
fb85abff 751 {
6d8fb6cf 752 if (dump_enabled_p ())
fb85abff 753 {
7bd765d4 754 dump_printf_loc (MSG_NOTE, vect_location,
78bb46f5 755 "can't force alignment of ref: ");
7bd765d4 756 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
78bb46f5 757 dump_printf (MSG_NOTE, "\n");
fb85abff 758 }
759 return true;
760 }
48e1416a 761
fb85abff 762 /* Force the alignment of the decl.
763 NOTE: This is the only change to the code we make during
764 the analysis phase, before deciding to vectorize the loop. */
6d8fb6cf 765 if (dump_enabled_p ())
0822b158 766 {
7bd765d4 767 dump_printf_loc (MSG_NOTE, vect_location, "force alignment of ");
768 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
78bb46f5 769 dump_printf (MSG_NOTE, "\n");
0822b158 770 }
771
23e1875f 772 ((dataref_aux *)dr->aux)->base_decl = base;
773 ((dataref_aux *)dr->aux)->base_misaligned = true;
fb85abff 774 }
775
85a846a2 776 /* If this is a backward running DR then the first access in the larger
777 vectype is actually N-1 elements before the address in the DR.
778 Adjust misalign accordingly. */
b4142d58 779 if (tree_int_cst_sgn (DR_STEP (dr)) < 0)
85a846a2 780 {
781 tree offset = ssize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1);
782 /* DR_STEP(dr) is the same as -TYPE_SIZE of the scalar type,
783 otherwise we wouldn't be here. */
784 offset = fold_build2 (MULT_EXPR, ssizetype, offset, DR_STEP (dr));
785 /* PLUS because DR_STEP was negative. */
786 misalign = size_binop (PLUS_EXPR, misalign, offset);
787 }
788
b4142d58 789 SET_DR_MISALIGNMENT (dr,
790 wi::mod_floor (misalign, alignment, SIGNED).to_uhwi ());
fb85abff 791
6d8fb6cf 792 if (dump_enabled_p ())
fb85abff 793 {
7bd765d4 794 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
795 "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr));
796 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
78bb46f5 797 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
fb85abff 798 }
799
800 return true;
801}
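
/* For example, for a V4SI access whose base is a 16-byte aligned array
   of ints and whose constant offset (DR_INIT) is 4 bytes, i.e. the
   reference starts at element 1, the misalignment computed above is 4
   bytes.  If the base alignment or the constant offset cannot be
   determined at compile time, DR_MISALIGNMENT stays -1 (unknown).  */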
802
803
804/* Function vect_compute_data_refs_alignment
805
806 Compute the misalignment of data references in the loop.
807 Return FALSE if a data reference is found that cannot be vectorized. */
808
809static bool
48e1416a 810vect_compute_data_refs_alignment (loop_vec_info loop_vinfo,
37545e54 811 bb_vec_info bb_vinfo)
fb85abff 812{
f1f41a6c 813 vec<data_reference_p> datarefs;
fb85abff 814 struct data_reference *dr;
815 unsigned int i;
816
37545e54 817 if (loop_vinfo)
818 datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
819 else
820 datarefs = BB_VINFO_DATAREFS (bb_vinfo);
48e1416a 821
f1f41a6c 822 FOR_EACH_VEC_ELT (datarefs, i, dr)
6ea6a380 823 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
824 && !vect_compute_data_ref_alignment (dr))
825 {
826 if (bb_vinfo)
827 {
828 /* Mark unsupported statement as unvectorizable. */
829 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
830 continue;
831 }
832 else
833 return false;
834 }
fb85abff 835
836 return true;
837}
838
839
840/* Function vect_update_misalignment_for_peel
841
842 DR - the data reference whose misalignment is to be adjusted.
843 DR_PEEL - the data reference whose misalignment is being made
844 zero in the vector loop by the peel.
845 NPEEL - the number of iterations in the peel loop if the misalignment
846 of DR_PEEL is known at compile time. */
847
848static void
849vect_update_misalignment_for_peel (struct data_reference *dr,
850 struct data_reference *dr_peel, int npeel)
851{
852 unsigned int i;
f1f41a6c 853 vec<dr_p> same_align_drs;
fb85abff 854 struct data_reference *current_dr;
855 int dr_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
856 int dr_peel_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr_peel))));
857 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
858 stmt_vec_info peel_stmt_info = vinfo_for_stmt (DR_STMT (dr_peel));
859
860 /* For interleaved data accesses the step in the loop must be multiplied by
861 the size of the interleaving group. */
ee612634 862 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
21009880 863 dr_size *= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)));
ee612634 864 if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info))
21009880 865 dr_peel_size *= GROUP_SIZE (peel_stmt_info);
fb85abff 866
867 /* It can be assumed that the data refs with the same alignment as dr_peel
868 are aligned in the vector loop. */
869 same_align_drs
870 = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (DR_STMT (dr_peel)));
f1f41a6c 871 FOR_EACH_VEC_ELT (same_align_drs, i, current_dr)
fb85abff 872 {
873 if (current_dr != dr)
874 continue;
875 gcc_assert (DR_MISALIGNMENT (dr) / dr_size ==
876 DR_MISALIGNMENT (dr_peel) / dr_peel_size);
877 SET_DR_MISALIGNMENT (dr, 0);
878 return;
879 }
880
881 if (known_alignment_for_access_p (dr)
882 && known_alignment_for_access_p (dr_peel))
883 {
f1b8c740 884 bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
fb85abff 885 int misal = DR_MISALIGNMENT (dr);
886 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
f1b8c740 887 misal += negative ? -npeel * dr_size : npeel * dr_size;
482a44fa 888 misal &= (TYPE_ALIGN (vectype) / BITS_PER_UNIT) - 1;
fb85abff 889 SET_DR_MISALIGNMENT (dr, misal);
890 return;
891 }
892
6d8fb6cf 893 if (dump_enabled_p ())
78bb46f5 894 dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment to -1.\n");
fb85abff 895 SET_DR_MISALIGNMENT (dr, -1);
896}
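
/* For example, with V4SI accesses, if DR_PEEL was misaligned by 8 bytes
   then NPEEL == 2 scalar iterations are peeled to align it, and another
   int reference DR that was misaligned by 12 bytes becomes
   (12 + 2 * 4) & 15 == 4 bytes misaligned in the vector loop.  */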
897
898
899/* Function vect_verify_datarefs_alignment
900
901 Return TRUE if all data references in the loop can be
902 handled with respect to alignment. */
903
37545e54 904bool
905vect_verify_datarefs_alignment (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
fb85abff 906{
f1f41a6c 907 vec<data_reference_p> datarefs;
fb85abff 908 struct data_reference *dr;
909 enum dr_alignment_support supportable_dr_alignment;
910 unsigned int i;
911
37545e54 912 if (loop_vinfo)
913 datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
914 else
915 datarefs = BB_VINFO_DATAREFS (bb_vinfo);
916
f1f41a6c 917 FOR_EACH_VEC_ELT (datarefs, i, dr)
fb85abff 918 {
919 gimple stmt = DR_STMT (dr);
920 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
921
1ad41595 922 if (!STMT_VINFO_RELEVANT_P (stmt_info))
923 continue;
924
6ea6a380 925 /* For interleaving, only the alignment of the first access matters.
926 Skip statements marked as not vectorizable. */
ee612634 927 if ((STMT_VINFO_GROUPED_ACCESS (stmt_info)
21009880 928 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
6ea6a380 929 || !STMT_VINFO_VECTORIZABLE (stmt_info))
fb85abff 930 continue;
931
e1c75243 932 /* Strided accesses perform only component accesses; alignment is
506aa8fc 933 irrelevant for them. */
e1c75243 934 if (STMT_VINFO_STRIDED_P (stmt_info)
994be998 935 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
506aa8fc 936 continue;
937
0822b158 938 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
fb85abff 939 if (!supportable_dr_alignment)
940 {
6d8fb6cf 941 if (dump_enabled_p ())
fb85abff 942 {
943 if (DR_IS_READ (dr))
7bd765d4 944 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
945 "not vectorized: unsupported unaligned load.");
fb85abff 946 else
7bd765d4 947 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
948 "not vectorized: unsupported unaligned "
949 "store.");
6ea6a380 950
7bd765d4 951 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
952 DR_REF (dr));
78bb46f5 953 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
fb85abff 954 }
955 return false;
956 }
6d8fb6cf 957 if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
7bd765d4 958 dump_printf_loc (MSG_NOTE, vect_location,
78bb46f5 959 "Vectorizing an unaligned access.\n");
fb85abff 960 }
961 return true;
962}
963
cfa724cf 964/* Given a memory reference EXP, return whether its alignment is less
965 than its size. */
966
967static bool
968not_size_aligned (tree exp)
969{
e913b5cd 970 if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
cfa724cf 971 return true;
972
e913b5cd 973 return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
cfa724cf 974 > get_object_alignment (exp));
975}
fb85abff 976
977/* Function vector_alignment_reachable_p
978
979 Return true if vector alignment for DR is reachable by peeling
980 a few loop iterations. Return false otherwise. */
981
982static bool
983vector_alignment_reachable_p (struct data_reference *dr)
984{
985 gimple stmt = DR_STMT (dr);
986 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
987 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
988
ee612634 989 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
fb85abff 990 {
991 /* For interleaved accesses we peel only if the number of iterations in
992 the prolog loop (VF - misalignment) is a multiple of the
993 number of interleaved accesses. */
994 int elem_size, mis_in_elements;
995 int nelements = TYPE_VECTOR_SUBPARTS (vectype);
996
997 /* FORNOW: handle only known alignment. */
998 if (!known_alignment_for_access_p (dr))
999 return false;
1000
1001 elem_size = GET_MODE_SIZE (TYPE_MODE (vectype)) / nelements;
1002 mis_in_elements = DR_MISALIGNMENT (dr) / elem_size;
1003
21009880 1004 if ((nelements - mis_in_elements) % GROUP_SIZE (stmt_info))
fb85abff 1005 return false;
1006 }
1007
1008 /* If misalignment is known at the compile time then allow peeling
1009 only if natural alignment is reachable through peeling. */
1010 if (known_alignment_for_access_p (dr) && !aligned_access_p (dr))
1011 {
48e1416a 1012 HOST_WIDE_INT elmsize =
fb85abff 1013 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
6d8fb6cf 1014 if (dump_enabled_p ())
fb85abff 1015 {
78bb46f5 1016 dump_printf_loc (MSG_NOTE, vect_location,
1017 "data size =" HOST_WIDE_INT_PRINT_DEC, elmsize);
1018 dump_printf (MSG_NOTE,
1019 ". misalignment = %d.\n", DR_MISALIGNMENT (dr));
fb85abff 1020 }
1021 if (DR_MISALIGNMENT (dr) % elmsize)
1022 {
6d8fb6cf 1023 if (dump_enabled_p ())
78bb46f5 1024 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1025 "data size does not divide the misalignment.\n");
fb85abff 1026 return false;
1027 }
1028 }
1029
1030 if (!known_alignment_for_access_p (dr))
1031 {
cfa724cf 1032 tree type = TREE_TYPE (DR_REF (dr));
1033 bool is_packed = not_size_aligned (DR_REF (dr));
6d8fb6cf 1034 if (dump_enabled_p ())
78bb46f5 1035 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1036 "Unknown misalignment, is_packed = %d\n",is_packed);
2cd0995e 1037 if ((TYPE_USER_ALIGN (type) && !is_packed)
1038 || targetm.vectorize.vector_alignment_reachable (type, is_packed))
fb85abff 1039 return true;
1040 else
1041 return false;
1042 }
1043
1044 return true;
1045}
1046
0822b158 1047
1048/* Calculate the cost of the memory access represented by DR. */
1049
f97dec81 1050static void
0822b158 1051vect_get_data_access_cost (struct data_reference *dr,
1052 unsigned int *inside_cost,
f97dec81 1053 unsigned int *outside_cost,
1054 stmt_vector_for_cost *body_cost_vec)
0822b158 1055{
1056 gimple stmt = DR_STMT (dr);
1057 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1058 int nunits = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
1059 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1060 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1061 int ncopies = vf / nunits;
0822b158 1062
1ad41595 1063 if (DR_IS_READ (dr))
f97dec81 1064 vect_get_load_cost (dr, ncopies, true, inside_cost, outside_cost,
1065 NULL, body_cost_vec, false);
0822b158 1066 else
f97dec81 1067 vect_get_store_cost (dr, ncopies, inside_cost, body_cost_vec);
0822b158 1068
6d8fb6cf 1069 if (dump_enabled_p ())
7bd765d4 1070 dump_printf_loc (MSG_NOTE, vect_location,
1071 "vect_get_data_access_cost: inside_cost = %d, "
78bb46f5 1072 "outside_cost = %d.\n", *inside_cost, *outside_cost);
0822b158 1073}
1074
1075
0822b158 1076/* Insert DR into peeling hash table with NPEEL as key. */
1077
1078static void
1079vect_peeling_hash_insert (loop_vec_info loop_vinfo, struct data_reference *dr,
1080 int npeel)
1081{
1082 struct _vect_peel_info elem, *slot;
3e871d4d 1083 _vect_peel_info **new_slot;
0822b158 1084 bool supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1085
1086 elem.npeel = npeel;
c1f445d2 1087 slot = LOOP_VINFO_PEELING_HTAB (loop_vinfo)->find (&elem);
0822b158 1088 if (slot)
1089 slot->count++;
1090 else
1091 {
1092 slot = XNEW (struct _vect_peel_info);
1093 slot->npeel = npeel;
1094 slot->dr = dr;
1095 slot->count = 1;
c1f445d2 1096 new_slot
1097 = LOOP_VINFO_PEELING_HTAB (loop_vinfo)->find_slot (slot, INSERT);
0822b158 1098 *new_slot = slot;
1099 }
1100
3e398f5b 1101 if (!supportable_dr_alignment
1102 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
0822b158 1103 slot->count += VECT_MAX_COST;
1104}
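
/* For example, if two V4SI loads are misaligned by 4 and 12 bytes, the
   peel amounts inserted above are 3 and 1 scalar iterations respectively
   (plus further candidates a whole vector apart under the unlimited cost
   model), and the table counts how many references each candidate amount
   would align.  */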
1105
1106
1107/* Traverse the peeling hash table to find the peeling option that aligns the
1108 maximum number of data accesses. */
1109
3e871d4d 1110int
1111vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1112 _vect_peel_extended_info *max)
0822b158 1113{
3e871d4d 1114 vect_peel_info elem = *slot;
0822b158 1115
593fa4d1 1116 if (elem->count > max->peel_info.count
1117 || (elem->count == max->peel_info.count
1118 && max->peel_info.npeel > elem->npeel))
0822b158 1119 {
1120 max->peel_info.npeel = elem->npeel;
1121 max->peel_info.count = elem->count;
1122 max->peel_info.dr = elem->dr;
1123 }
1124
1125 return 1;
1126}
1127
1128
282bf14c 1129/* Traverse the peeling hash table and calculate the cost of each peeling
1130 option. Find the one with the lowest cost. */
0822b158 1131
3e871d4d 1132int
1133vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1134 _vect_peel_extended_info *min)
0822b158 1135{
3e871d4d 1136 vect_peel_info elem = *slot;
0822b158 1137 int save_misalignment, dummy;
1138 unsigned int inside_cost = 0, outside_cost = 0, i;
1139 gimple stmt = DR_STMT (elem->dr);
1140 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1141 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
f1f41a6c 1142 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
0822b158 1143 struct data_reference *dr;
f97dec81 1144 stmt_vector_for_cost prologue_cost_vec, body_cost_vec, epilogue_cost_vec;
f97dec81 1145
f1f41a6c 1146 prologue_cost_vec.create (2);
1147 body_cost_vec.create (2);
1148 epilogue_cost_vec.create (2);
0822b158 1149
f1f41a6c 1150 FOR_EACH_VEC_ELT (datarefs, i, dr)
0822b158 1151 {
1152 stmt = DR_STMT (dr);
1153 stmt_info = vinfo_for_stmt (stmt);
1154 /* For interleaving, only the alignment of the first access
1155 matters. */
ee612634 1156 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
21009880 1157 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
0822b158 1158 continue;
1159
1160 save_misalignment = DR_MISALIGNMENT (dr);
1161 vect_update_misalignment_for_peel (dr, elem->dr, elem->npeel);
f97dec81 1162 vect_get_data_access_cost (dr, &inside_cost, &outside_cost,
1163 &body_cost_vec);
0822b158 1164 SET_DR_MISALIGNMENT (dr, save_misalignment);
1165 }
1166
7a66d0cf 1167 auto_vec<stmt_info_for_cost> scalar_cost_vec;
1168 vect_get_single_scalar_iteration_cost (loop_vinfo, &scalar_cost_vec);
41ae9eb4 1169 outside_cost += vect_get_known_peeling_cost
1170 (loop_vinfo, elem->npeel, &dummy,
7a66d0cf 1171 &scalar_cost_vec, &prologue_cost_vec, &epilogue_cost_vec);
f97dec81 1172
1173 /* Prologue and epilogue costs are added to the target model later.
1174 These costs depend only on the scalar iteration cost, the
1175 number of peeling iterations finally chosen, and the number of
1176 misaligned statements. So discard the information found here. */
f1f41a6c 1177 prologue_cost_vec.release ();
1178 epilogue_cost_vec.release ();
0822b158 1179
1180 if (inside_cost < min->inside_cost
1181 || (inside_cost == min->inside_cost && outside_cost < min->outside_cost))
1182 {
1183 min->inside_cost = inside_cost;
1184 min->outside_cost = outside_cost;
f1f41a6c 1185 min->body_cost_vec.release ();
f97dec81 1186 min->body_cost_vec = body_cost_vec;
0822b158 1187 min->peel_info.dr = elem->dr;
1188 min->peel_info.npeel = elem->npeel;
1189 }
f97dec81 1190 else
f1f41a6c 1191 body_cost_vec.release ();
0822b158 1192
1193 return 1;
1194}
1195
1196
1197/* Choose best peeling option by traversing peeling hash table and either
1198 choosing an option with the lowest cost (if cost model is enabled) or the
1199 option that aligns as many accesses as possible. */
1200
1201static struct data_reference *
1202vect_peeling_hash_choose_best_peeling (loop_vec_info loop_vinfo,
4db2b577 1203 unsigned int *npeel,
f97dec81 1204 stmt_vector_for_cost *body_cost_vec)
0822b158 1205{
1206 struct _vect_peel_extended_info res;
1207
1208 res.peel_info.dr = NULL;
9af5ce0c 1209 res.body_cost_vec = stmt_vector_for_cost ();
0822b158 1210
3e398f5b 1211 if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
0822b158 1212 {
1213 res.inside_cost = INT_MAX;
1214 res.outside_cost = INT_MAX;
3e871d4d 1215 LOOP_VINFO_PEELING_HTAB (loop_vinfo)
c1f445d2 1216 ->traverse <_vect_peel_extended_info *,
1217 vect_peeling_hash_get_lowest_cost> (&res);
0822b158 1218 }
1219 else
1220 {
1221 res.peel_info.count = 0;
3e871d4d 1222 LOOP_VINFO_PEELING_HTAB (loop_vinfo)
c1f445d2 1223 ->traverse <_vect_peel_extended_info *,
1224 vect_peeling_hash_get_most_frequent> (&res);
0822b158 1225 }
1226
1227 *npeel = res.peel_info.npeel;
f97dec81 1228 *body_cost_vec = res.body_cost_vec;
0822b158 1229 return res.peel_info.dr;
1230}
1231
1232
fb85abff 1233/* Function vect_enhance_data_refs_alignment
1234
1235 This pass will use loop versioning and loop peeling in order to enhance
1236 the alignment of data references in the loop.
1237
1238 FOR NOW: we assume that whatever versioning/peeling takes place, only the
282bf14c 1239 original loop is to be vectorized. Any other loops that are created by
fb85abff 1240 the transformations performed in this pass are not supposed to be
282bf14c 1241 vectorized. This restriction will be relaxed.
fb85abff 1242
1243 This pass will require a cost model to guide it whether to apply peeling
282bf14c 1244 or versioning or a combination of the two. For example, the scheme that
fb85abff 1245 intel uses when given a loop with several memory accesses, is as follows:
1246 choose one memory access ('p') which alignment you want to force by doing
282bf14c 1247 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
fb85abff 1248 other accesses are not necessarily aligned, or (2) use loop versioning to
1249 generate one loop in which all accesses are aligned, and another loop in
1250 which only 'p' is necessarily aligned.
1251
1252 ("Automatic Intra-Register Vectorization for the Intel Architecture",
1253 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Xinmin Tian, International
1254 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1255
282bf14c 1256 Devising a cost model is the most critical aspect of this work. It will
fb85abff 1257 guide us on which access to peel for, whether to use loop versioning, how
282bf14c 1258 many versions to create, etc. The cost model will probably consist of
fb85abff 1259 generic considerations as well as target specific considerations (on
1260 powerpc for example, misaligned stores are more painful than misaligned
1261 loads).
1262
1263 Here are the general steps involved in alignment enhancements:
1264
1265 -- original loop, before alignment analysis:
1266 for (i=0; i<N; i++){
1267 x = q[i]; # DR_MISALIGNMENT(q) = unknown
1268 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1269 }
1270
1271 -- After vect_compute_data_refs_alignment:
1272 for (i=0; i<N; i++){
1273 x = q[i]; # DR_MISALIGNMENT(q) = 3
1274 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1275 }
1276
1277 -- Possibility 1: we do loop versioning:
1278 if (p is aligned) {
1279 for (i=0; i<N; i++){ # loop 1A
1280 x = q[i]; # DR_MISALIGNMENT(q) = 3
1281 p[i] = y; # DR_MISALIGNMENT(p) = 0
1282 }
1283 }
1284 else {
1285 for (i=0; i<N; i++){ # loop 1B
1286 x = q[i]; # DR_MISALIGNMENT(q) = 3
1287 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1288 }
1289 }
1290
1291 -- Possibility 2: we do loop peeling:
1292 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1293 x = q[i];
1294 p[i] = y;
1295 }
1296 for (i = 3; i < N; i++){ # loop 2A
1297 x = q[i]; # DR_MISALIGNMENT(q) = 0
1298 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1299 }
1300
1301 -- Possibility 3: combination of loop peeling and versioning:
1302 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1303 x = q[i];
1304 p[i] = y;
1305 }
1306 if (p is aligned) {
1307 for (i = 3; i<N; i++){ # loop 3A
1308 x = q[i]; # DR_MISALIGNMENT(q) = 0
1309 p[i] = y; # DR_MISALIGNMENT(p) = 0
1310 }
1311 }
1312 else {
1313 for (i = 3; i<N; i++){ # loop 3B
1314 x = q[i]; # DR_MISALIGNMENT(q) = 0
1315 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1316 }
1317 }
1318
282bf14c 1319 These loops are later passed to loop_transform to be vectorized. The
fb85abff 1320 vectorizer will use the alignment information to guide the transformation
1321 (whether to generate regular loads/stores, or with special handling for
1322 misalignment). */
1323
1324bool
1325vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1326{
f1f41a6c 1327 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
fb85abff 1328 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1329 enum dr_alignment_support supportable_dr_alignment;
0822b158 1330 struct data_reference *dr0 = NULL, *first_store = NULL;
fb85abff 1331 struct data_reference *dr;
0822b158 1332 unsigned int i, j;
fb85abff 1333 bool do_peeling = false;
1334 bool do_versioning = false;
1335 bool stat;
1336 gimple stmt;
1337 stmt_vec_info stmt_info;
0822b158 1338 unsigned int npeel = 0;
1339 bool all_misalignments_unknown = true;
1340 unsigned int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1341 unsigned possible_npeel_number = 1;
1342 tree vectype;
1343 unsigned int nelements, mis, same_align_drs_max = 0;
9af5ce0c 1344 stmt_vector_for_cost body_cost_vec = stmt_vector_for_cost ();
fb85abff 1345
6d8fb6cf 1346 if (dump_enabled_p ())
7bd765d4 1347 dump_printf_loc (MSG_NOTE, vect_location,
78bb46f5 1348 "=== vect_enhance_data_refs_alignment ===\n");
fb85abff 1349
1350 /* While cost model enhancements are expected in the future, the high level
1351 view of the code at this time is as follows:
1352
ec2886ed 1353 A) If there is a misaligned access then see if peeling to align
1354 this access can make all data references satisfy
454f25be 1355 vect_supportable_dr_alignment. If so, update data structures
1356 as needed and return true.
fb85abff 1357
1358 B) If peeling wasn't possible and there is a data reference with an
1359 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1360 then see if loop versioning checks can be used to make all data
1361 references satisfy vect_supportable_dr_alignment. If so, update
1362 data structures as needed and return true.
1363
1364 C) If neither peeling nor versioning were successful then return false if
1365 any data reference does not satisfy vect_supportable_dr_alignment.
1366
1367 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1368
1369 Note, Possibility 3 above (which is peeling and versioning together) is not
1370 being done at this time. */
1371
1372 /* (1) Peeling to force alignment. */
1373
1374 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1375 Considerations:
1376 + How many accesses will become aligned due to the peeling
1377 - How many accesses will become unaligned due to the peeling,
1378 and the cost of misaligned accesses.
48e1416a 1379 - The cost of peeling (the extra runtime checks, the increase
0822b158 1380 in code size). */
fb85abff 1381
f1f41a6c 1382 FOR_EACH_VEC_ELT (datarefs, i, dr)
fb85abff 1383 {
1384 stmt = DR_STMT (dr);
1385 stmt_info = vinfo_for_stmt (stmt);
1386
1ad41595 1387 if (!STMT_VINFO_RELEVANT_P (stmt_info))
b04940e7 1388 continue;
1389
fb85abff 1390 /* For interleaving, only the alignment of the first access
1391 matters. */
ee612634 1392 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
21009880 1393 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
fb85abff 1394 continue;
1395
b04940e7 1396 /* For invariant accesses there is nothing to enhance. */
1397 if (integer_zerop (DR_STEP (dr)))
1398 continue;
1399
e1c75243 1400 /* Strided accesses perform only component accesses; alignment is
f634c3e9 1401 irrelevant for them. */
e1c75243 1402 if (STMT_VINFO_STRIDED_P (stmt_info)
994be998 1403 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
f634c3e9 1404 continue;
1405
0822b158 1406 supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1407 do_peeling = vector_alignment_reachable_p (dr);
1408 if (do_peeling)
fb85abff 1409 {
0822b158 1410 if (known_alignment_for_access_p (dr))
1411 {
1412 unsigned int npeel_tmp;
f1b8c740 1413 bool negative = tree_int_cst_compare (DR_STEP (dr),
1414 size_zero_node) < 0;
0822b158 1415
1416 /* Save info about DR in the hash table. */
c1f445d2 1417 if (!LOOP_VINFO_PEELING_HTAB (loop_vinfo))
1418 LOOP_VINFO_PEELING_HTAB (loop_vinfo)
1419 = new hash_table<peel_info_hasher> (1);
0822b158 1420
1421 vectype = STMT_VINFO_VECTYPE (stmt_info);
1422 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1423 mis = DR_MISALIGNMENT (dr) / GET_MODE_SIZE (TYPE_MODE (
1424 TREE_TYPE (DR_REF (dr))));
f1b8c740 1425 npeel_tmp = (negative
184a9a0f 1426 ? (mis - nelements) : (nelements - mis))
1427 & (nelements - 1);
0822b158 1428
1429 /* For multiple types, it is possible that the bigger type access
282bf14c 1430 will have more than one peeling option. E.g., a loop with two
0822b158 1431 types: one of size (vector size / 4), and the other one of
282bf14c 1432 size (vector size / 8). The vectorization factor will be 8. If both
0822b158 1433 accesses are misaligned by 3, the first one needs one scalar
282bf14c 1434 iteration to be aligned, and the second one needs 5. But the
0822b158 1435 first one will also be aligned by peeling 5 scalar
1436 iterations, and in that case both accesses will be aligned.
1437 Hence, except for the immediate peeling amount, we also want
1438 to try adding a full vector size, as long as we don't exceed the
1439 vectorization factor.
1440 We do this automatically for the cost model, since we calculate the cost
1441 for every peeling option. */
3e398f5b 1442 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
0822b158 1443 possible_npeel_number = vf /nelements;
1444
1445 /* Handle the aligned case. We may decide to align some other
1446 access, making DR unaligned. */
1447 if (DR_MISALIGNMENT (dr) == 0)
1448 {
1449 npeel_tmp = 0;
3e398f5b 1450 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
0822b158 1451 possible_npeel_number++;
1452 }
1453
1454 for (j = 0; j < possible_npeel_number; j++)
1455 {
1456 gcc_assert (npeel_tmp <= vf);
1457 vect_peeling_hash_insert (loop_vinfo, dr, npeel_tmp);
1458 npeel_tmp += nelements;
1459 }
1460
1461 all_misalignments_unknown = false;
1462 /* Data-ref that was chosen for the case that all the
1463 misalignments are unknown is not relevant anymore, since we
1464 have a data-ref with known alignment. */
1465 dr0 = NULL;
1466 }
1467 else
1468 {
6046367e 1469 /* If we don't know any misalignment values, we prefer
1470 peeling for the data-ref that has the maximum number of data-refs
0822b158 1471 with the same alignment, unless the target prefers to align
1472 stores over loads. */
1473 if (all_misalignments_unknown)
1474 {
6046367e 1475 unsigned same_align_drs
1476 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1477 if (!dr0
1478 || same_align_drs_max < same_align_drs)
0822b158 1479 {
6046367e 1480 same_align_drs_max = same_align_drs;
0822b158 1481 dr0 = dr;
1482 }
6046367e 1483 /* For data-refs with the same number of related
1484 accesses prefer the one where the misalign
1485 computation will be invariant in the outermost loop. */
1486 else if (same_align_drs_max == same_align_drs)
1487 {
1488 struct loop *ivloop0, *ivloop;
1489 ivloop0 = outermost_invariant_loop_for_expr
1490 (loop, DR_BASE_ADDRESS (dr0));
1491 ivloop = outermost_invariant_loop_for_expr
1492 (loop, DR_BASE_ADDRESS (dr));
1493 if ((ivloop && !ivloop0)
1494 || (ivloop && ivloop0
1495 && flow_loop_nested_p (ivloop, ivloop0)))
1496 dr0 = dr;
1497 }
0822b158 1498
9ff25603 1499 if (!first_store && DR_IS_WRITE (dr))
0822b158 1500 first_store = dr;
1501 }
1502
1503 /* If there are both known and unknown misaligned accesses in the
1504 loop, we choose peeling amount according to the known
1505 accesses. */
0822b158 1506 if (!supportable_dr_alignment)
1507 {
1508 dr0 = dr;
9ff25603 1509 if (!first_store && DR_IS_WRITE (dr))
0822b158 1510 first_store = dr;
1511 }
1512 }
1513 }
1514 else
1515 {
1516 if (!aligned_access_p (dr))
1517 {
6d8fb6cf 1518 if (dump_enabled_p ())
78bb46f5 1519 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1520 "vector alignment may not be reachable\n");
0822b158 1521 break;
1522 }
1523 }
fb85abff 1524 }
1525
2cd0995e 1526 /* Check if we can possibly peel the loop. */
1527 if (!vect_can_advance_ivs_p (loop_vinfo)
fb85abff 1528 || !slpeel_can_duplicate_loop_p (loop, single_exit (loop)))
1529 do_peeling = false;
1530
192f7876 1531 if (do_peeling
1532 && all_misalignments_unknown
0822b158 1533 && vect_supportable_dr_alignment (dr0, false))
1534 {
0822b158 1535 /* Check if the target requires to prefer stores over loads, i.e., if
1536 misaligned stores are more expensive than misaligned loads (taking
1537 drs with same alignment into account). */
1538 if (first_store && DR_IS_READ (dr0))
1539 {
1540 unsigned int load_inside_cost = 0, load_outside_cost = 0;
1541 unsigned int store_inside_cost = 0, store_outside_cost = 0;
1542 unsigned int load_inside_penalty = 0, load_outside_penalty = 0;
1543 unsigned int store_inside_penalty = 0, store_outside_penalty = 0;
f1f41a6c 1544 stmt_vector_for_cost dummy;
1545 dummy.create (2);
f97dec81 1546
1547 vect_get_data_access_cost (dr0, &load_inside_cost, &load_outside_cost,
1548 &dummy);
1549 vect_get_data_access_cost (first_store, &store_inside_cost,
1550 &store_outside_cost, &dummy);
0822b158 1551
f1f41a6c 1552 dummy.release ();
0822b158 1553
1554 /* Calculate the penalty for leaving FIRST_STORE unaligned (by
1555 aligning the load DR0). */
1556 load_inside_penalty = store_inside_cost;
1557 load_outside_penalty = store_outside_cost;
f1f41a6c 1558 for (i = 0;
1559 STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1560 DR_STMT (first_store))).iterate (i, &dr);
0822b158 1561 i++)
1562 if (DR_IS_READ (dr))
1563 {
1564 load_inside_penalty += load_inside_cost;
1565 load_outside_penalty += load_outside_cost;
1566 }
1567 else
1568 {
1569 load_inside_penalty += store_inside_cost;
1570 load_outside_penalty += store_outside_cost;
1571 }
1572
1573 /* Calculate the penalty for leaving DR0 unaligned (by
1574 aligning the FIRST_STORE). */
1575 store_inside_penalty = load_inside_cost;
1576 store_outside_penalty = load_outside_cost;
f1f41a6c 1577 for (i = 0;
1578 STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1579 DR_STMT (dr0))).iterate (i, &dr);
0822b158 1580 i++)
1581 if (DR_IS_READ (dr))
1582 {
1583 store_inside_penalty += load_inside_cost;
1584 store_outside_penalty += load_outside_cost;
1585 }
1586 else
1587 {
1588 store_inside_penalty += store_inside_cost;
1589 store_outside_penalty += store_outside_cost;
1590 }
1591
1592 if (load_inside_penalty > store_inside_penalty
1593 || (load_inside_penalty == store_inside_penalty
1594 && load_outside_penalty > store_outside_penalty))
1595 dr0 = first_store;
1596 }
1597
1598 /* In case there are only loads with different unknown misalignments, use
eb10b471 1599 peeling only if it may help to align other accesses in the loop or
1600	 if it may help improve load bandwidth when we'd end up using
1601 unaligned loads. */
1602 tree dr0_vt = STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr0)));
f1f41a6c 1603 if (!first_store
1604 && !STMT_VINFO_SAME_ALIGN_REFS (
1605 vinfo_for_stmt (DR_STMT (dr0))).length ()
eb10b471 1606 && (vect_supportable_dr_alignment (dr0, false)
1607 != dr_unaligned_supported
1608 || (builtin_vectorization_cost (vector_load, dr0_vt, 0)
1609 == builtin_vectorization_cost (unaligned_load, dr0_vt, -1))))
0822b158 1610 do_peeling = false;
1611 }
1612
1613 if (do_peeling && !dr0)
1614 {
1615      /* Peeling is possible, but there is no data access that is supported
1616	 only when aligned. So we try to choose the best possible peeling. */
1617
1618 /* We should get here only if there are drs with known misalignment. */
1619 gcc_assert (!all_misalignments_unknown);
1620
1621 /* Choose the best peeling from the hash table. */
4db2b577 1622 dr0 = vect_peeling_hash_choose_best_peeling (loop_vinfo, &npeel,
f97dec81 1623 &body_cost_vec);
0822b158 1624 if (!dr0 || !npeel)
1625 do_peeling = false;
1626 }
1627
fb85abff 1628 if (do_peeling)
1629 {
0822b158 1630 stmt = DR_STMT (dr0);
1631 stmt_info = vinfo_for_stmt (stmt);
1632 vectype = STMT_VINFO_VECTYPE (stmt_info);
1633 nelements = TYPE_VECTOR_SUBPARTS (vectype);
fb85abff 1634
1635 if (known_alignment_for_access_p (dr0))
1636 {
f1b8c740 1637 bool negative = tree_int_cst_compare (DR_STEP (dr0),
1638 size_zero_node) < 0;
0822b158 1639 if (!npeel)
1640 {
1641 /* Since it's known at compile time, compute the number of
1642 iterations in the peeled loop (the peeling factor) for use in
1643 updating DR_MISALIGNMENT values. The peeling factor is the
1644 vectorization factor minus the misalignment as an element
1645 count. */
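	      /* For illustration (hypothetical values): with a V4SI access
		 (nelements == 4, 4-byte elements), a positive step and
		 DR_MISALIGNMENT (dr0) == 8 bytes, mis becomes 2 elements
		 and npeel == (4 - 2) & 3 == 2 scalar iterations.  */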
1646 mis = DR_MISALIGNMENT (dr0);
1647 mis /= GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr0))));
184a9a0f 1648 npeel = ((negative ? mis - nelements : nelements - mis)
1649 & (nelements - 1));
0822b158 1650 }
fb85abff 1651
48e1416a 1652 /* For interleaved data access every iteration accesses all the
fb85abff 1653 members of the group, therefore we divide the number of iterations
1654 by the group size. */
48e1416a 1655 stmt_info = vinfo_for_stmt (DR_STMT (dr0));
ee612634 1656 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
21009880 1657 npeel /= GROUP_SIZE (stmt_info);
fb85abff 1658
6d8fb6cf 1659 if (dump_enabled_p ())
7bd765d4 1660 dump_printf_loc (MSG_NOTE, vect_location,
78bb46f5 1661 "Try peeling by %d\n", npeel);
fb85abff 1662 }
1663
1664 /* Ensure that all data refs can be vectorized after the peel. */
f1f41a6c 1665 FOR_EACH_VEC_ELT (datarefs, i, dr)
fb85abff 1666 {
1667 int save_misalignment;
1668
1669 if (dr == dr0)
1670 continue;
1671
1672 stmt = DR_STMT (dr);
1673 stmt_info = vinfo_for_stmt (stmt);
1674 /* For interleaving, only the alignment of the first access
1675 matters. */
ee612634 1676 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
21009880 1677 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
fb85abff 1678 continue;
1679
e1c75243 1680 /* Strided accesses perform only component accesses, alignment is
f634c3e9 1681 irrelevant for them. */
e1c75243 1682 if (STMT_VINFO_STRIDED_P (stmt_info)
994be998 1683 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
f634c3e9 1684 continue;
1685
fb85abff 1686 save_misalignment = DR_MISALIGNMENT (dr);
1687 vect_update_misalignment_for_peel (dr, dr0, npeel);
0822b158 1688 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
fb85abff 1689 SET_DR_MISALIGNMENT (dr, save_misalignment);
48e1416a 1690
fb85abff 1691 if (!supportable_dr_alignment)
1692 {
1693 do_peeling = false;
1694 break;
1695 }
1696 }
1697
0822b158 1698 if (do_peeling && known_alignment_for_access_p (dr0) && npeel == 0)
1699 {
1700 stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1701 if (!stat)
1702 do_peeling = false;
1703 else
9f793cdf 1704 {
f1f41a6c 1705 body_cost_vec.release ();
9f793cdf 1706 return stat;
1707 }
0822b158 1708 }
1709
eb10b471 1710 /* Cost model #1 - honor --param vect-max-peeling-for-alignment. */
d7d7032a 1711 if (do_peeling)
1712 {
1713 unsigned max_allowed_peel
1714 = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT);
1715 if (max_allowed_peel != (unsigned)-1)
1716 {
1717 unsigned max_peel = npeel;
1718 if (max_peel == 0)
1719 {
1720 gimple dr_stmt = DR_STMT (dr0);
1721 stmt_vec_info vinfo = vinfo_for_stmt (dr_stmt);
1722 tree vtype = STMT_VINFO_VECTYPE (vinfo);
1723 max_peel = TYPE_VECTOR_SUBPARTS (vtype) - 1;
1724 }
1725 if (max_peel > max_allowed_peel)
1726 {
1727 do_peeling = false;
1728 if (dump_enabled_p ())
1729 dump_printf_loc (MSG_NOTE, vect_location,
1730 "Disable peeling, max peels reached: %d\n", max_peel);
1731 }
1732 }
1733 }
1734
eb10b471 1735 /* Cost model #2 - if peeling may result in a remaining loop not
1736 iterating enough to be vectorized then do not peel. */
1737 if (do_peeling
1738 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1739 {
1740 unsigned max_peel
1741 = npeel == 0 ? LOOP_VINFO_VECT_FACTOR (loop_vinfo) - 1 : npeel;
1742 if (LOOP_VINFO_INT_NITERS (loop_vinfo)
1743 < LOOP_VINFO_VECT_FACTOR (loop_vinfo) + max_peel)
1744 do_peeling = false;
1745 }
1746
fb85abff 1747 if (do_peeling)
1748 {
1749 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
1750 If the misalignment of DR_i is identical to that of dr0 then set
1751 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
1752 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
1753 by the peeling factor times the element size of DR_i (MOD the
1754 vectorization factor times the size). Otherwise, the
1755 misalignment of DR_i must be set to unknown. */
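      /* A worked example with hypothetical values: with 4-byte elements,
	 a vectorization factor of 4 and npeel == 2, a DR_i with known
	 misalignment 8 becomes (8 + 2 * 4) mod 16 == 0, one with
	 misalignment 4 becomes 12, and a DR_i with unknown misalignment
	 stays unknown.  */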
f1f41a6c 1756 FOR_EACH_VEC_ELT (datarefs, i, dr)
fb85abff 1757 if (dr != dr0)
1758 vect_update_misalignment_for_peel (dr, dr0, npeel);
1759
1760 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0;
0822b158 1761 if (npeel)
313a5120 1762 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
0822b158 1763 else
313a5120 1764 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1765 = DR_MISALIGNMENT (dr0);
fb85abff 1766 SET_DR_MISALIGNMENT (dr0, 0);
6d8fb6cf 1767 if (dump_enabled_p ())
7bd765d4 1768 {
1769 dump_printf_loc (MSG_NOTE, vect_location,
78bb46f5 1770 "Alignment of access forced using peeling.\n");
7bd765d4 1771 dump_printf_loc (MSG_NOTE, vect_location,
78bb46f5 1772 "Peeling for alignment will be applied.\n");
7bd765d4 1773 }
e4eca2de 1774 /* The inside-loop cost will be accounted for in vectorizable_load
1775 and vectorizable_store correctly with adjusted alignments.
1776	 Drop the body_cost_vec on the floor here.  */
1777 body_cost_vec.release ();
4db2b577 1778
37545e54 1779 stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
fb85abff 1780 gcc_assert (stat);
1781 return stat;
1782 }
1783 }
1784
f1f41a6c 1785 body_cost_vec.release ();
fb85abff 1786
1787 /* (2) Versioning to force alignment. */
1788
1789 /* Try versioning if:
1dbf9bd1 1790 1) optimize loop for speed
1791 2) there is at least one unsupported misaligned data ref with an unknown
fb85abff 1792 misalignment, and
1dbf9bd1 1793 3) all misaligned data refs with a known misalignment are supported, and
1794 4) the number of runtime alignment checks is within reason. */
fb85abff 1795
48e1416a 1796 do_versioning =
1dbf9bd1 1797 optimize_loop_nest_for_speed_p (loop)
fb85abff 1798 && (!loop->inner); /* FORNOW */
1799
1800 if (do_versioning)
1801 {
f1f41a6c 1802 FOR_EACH_VEC_ELT (datarefs, i, dr)
fb85abff 1803 {
1804 stmt = DR_STMT (dr);
1805 stmt_info = vinfo_for_stmt (stmt);
1806
1807 /* For interleaving, only the alignment of the first access
1808 matters. */
1809 if (aligned_access_p (dr)
ee612634 1810 || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
21009880 1811 && GROUP_FIRST_ELEMENT (stmt_info) != stmt))
fb85abff 1812 continue;
1813
e1c75243 1814 if (STMT_VINFO_STRIDED_P (stmt_info))
994be998 1815 {
1816 /* Strided loads perform only component accesses, alignment is
1817 irrelevant for them. */
1818 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
1819 continue;
1820 do_versioning = false;
1821 break;
1822 }
f634c3e9 1823
0822b158 1824 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
fb85abff 1825
1826 if (!supportable_dr_alignment)
1827 {
1828 gimple stmt;
1829 int mask;
1830 tree vectype;
1831
1832 if (known_alignment_for_access_p (dr)
f1f41a6c 1833 || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
fb85abff 1834 >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
1835 {
1836 do_versioning = false;
1837 break;
1838 }
1839
1840 stmt = DR_STMT (dr);
1841 vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
1842 gcc_assert (vectype);
48e1416a 1843
fb85abff 1844 /* The rightmost bits of an aligned address must be zeros.
1845 Construct the mask needed for this test. For example,
1846 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
1847 mask must be 15 = 0xf. */
1848 mask = GET_MODE_SIZE (TYPE_MODE (vectype)) - 1;
1849
1850 /* FORNOW: use the same mask to test all potentially unaligned
1851 references in the loop. The vectorizer currently supports
1852 a single vector size, see the reference to
1853 GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
1854 vectorization factor is computed. */
1855 gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
1856 || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
1857 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
f1f41a6c 1858 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (
1859 DR_STMT (dr));
fb85abff 1860 }
1861 }
48e1416a 1862
fb85abff 1863 /* Versioning requires at least one misaligned data reference. */
10095225 1864 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
fb85abff 1865 do_versioning = false;
1866 else if (!do_versioning)
f1f41a6c 1867 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
fb85abff 1868 }
1869
1870 if (do_versioning)
1871 {
f1f41a6c 1872 vec<gimple> may_misalign_stmts
fb85abff 1873 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
1874 gimple stmt;
1875
1876 /* It can now be assumed that the data references in the statements
1877 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
1878 of the loop being vectorized. */
f1f41a6c 1879 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt)
fb85abff 1880 {
1881 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1882 dr = STMT_VINFO_DATA_REF (stmt_info);
1883 SET_DR_MISALIGNMENT (dr, 0);
6d8fb6cf 1884 if (dump_enabled_p ())
78bb46f5 1885 dump_printf_loc (MSG_NOTE, vect_location,
1886 "Alignment of access forced using versioning.\n");
fb85abff 1887 }
1888
6d8fb6cf 1889 if (dump_enabled_p ())
78bb46f5 1890 dump_printf_loc (MSG_NOTE, vect_location,
1891 "Versioning for alignment will be applied.\n");
fb85abff 1892
1893 /* Peeling and versioning can't be done together at this time. */
1894 gcc_assert (! (do_peeling && do_versioning));
1895
37545e54 1896 stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
fb85abff 1897 gcc_assert (stat);
1898 return stat;
1899 }
1900
1901 /* This point is reached if neither peeling nor versioning is being done. */
1902 gcc_assert (! (do_peeling || do_versioning));
1903
37545e54 1904 stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
fb85abff 1905 return stat;
1906}
1907
1908
91a74fc6 1909/* Function vect_find_same_alignment_drs.
1910
1911 Update group and alignment relations according to the chosen
1912 vectorization factor. */
1913
1914static void
1915vect_find_same_alignment_drs (struct data_dependence_relation *ddr,
1916 loop_vec_info loop_vinfo)
1917{
1918 unsigned int i;
1919 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1920 int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1921 struct data_reference *dra = DDR_A (ddr);
1922 struct data_reference *drb = DDR_B (ddr);
1923 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
1924 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
1925 int dra_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dra))));
1926 int drb_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (drb))));
1927 lambda_vector dist_v;
1928 unsigned int loop_depth;
1929
1930 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
1931 return;
1932
0822b158 1933 if (dra == drb)
91a74fc6 1934 return;
1935
1936 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
1937 return;
1938
1939 /* Loop-based vectorization and known data dependence. */
1940 if (DDR_NUM_DIST_VECTS (ddr) == 0)
1941 return;
1942
85a846a2 1943 /* Data-dependence analysis reports a distance vector of zero
1944 for data-references that overlap only in the first iteration
1945 but have different sign step (see PR45764).
1946 So as a sanity check require equal DR_STEP. */
1947 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
1948 return;
1949
91a74fc6 1950 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
f1f41a6c 1951 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
91a74fc6 1952 {
1953 int dist = dist_v[loop_depth];
1954
6d8fb6cf 1955 if (dump_enabled_p ())
7bd765d4 1956 dump_printf_loc (MSG_NOTE, vect_location,
78bb46f5 1957 "dependence distance = %d.\n", dist);
91a74fc6 1958
1959 /* Same loop iteration. */
1960 if (dist == 0
1961 || (dist % vectorization_factor == 0 && dra_size == drb_size))
1962 {
1963 /* Two references with distance zero have the same alignment. */
f1f41a6c 1964 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
1965 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
6d8fb6cf 1966 if (dump_enabled_p ())
91a74fc6 1967 {
78bb46f5 1968 dump_printf_loc (MSG_NOTE, vect_location,
1969 "accesses have the same alignment.\n");
7bd765d4 1970 dump_printf (MSG_NOTE,
78bb46f5 1971 "dependence distance modulo vf == 0 between ");
7bd765d4 1972 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
1973 dump_printf (MSG_NOTE, " and ");
1974 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
78bb46f5 1975 dump_printf (MSG_NOTE, "\n");
91a74fc6 1976 }
1977 }
1978 }
1979}
1980
1981
fb85abff 1982/* Function vect_analyze_data_refs_alignment
1983
1984 Analyze the alignment of the data-references in the loop.
1985 Return FALSE if a data reference is found that cannot be vectorized. */
1986
1987bool
48e1416a 1988vect_analyze_data_refs_alignment (loop_vec_info loop_vinfo,
37545e54 1989 bb_vec_info bb_vinfo)
fb85abff 1990{
6d8fb6cf 1991 if (dump_enabled_p ())
7bd765d4 1992 dump_printf_loc (MSG_NOTE, vect_location,
78bb46f5 1993 "=== vect_analyze_data_refs_alignment ===\n");
fb85abff 1994
91a74fc6 1995 /* Mark groups of data references with same alignment using
1996 data dependence information. */
1997 if (loop_vinfo)
1998 {
f1f41a6c 1999 vec<ddr_p> ddrs = LOOP_VINFO_DDRS (loop_vinfo);
91a74fc6 2000 struct data_dependence_relation *ddr;
2001 unsigned int i;
2002
f1f41a6c 2003 FOR_EACH_VEC_ELT (ddrs, i, ddr)
91a74fc6 2004 vect_find_same_alignment_drs (ddr, loop_vinfo);
2005 }
2006
37545e54 2007 if (!vect_compute_data_refs_alignment (loop_vinfo, bb_vinfo))
fb85abff 2008 {
6d8fb6cf 2009 if (dump_enabled_p ())
78bb46f5 2010 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2011 "not vectorized: can't calculate alignment "
2012 "for data ref.\n");
fb85abff 2013 return false;
2014 }
2015
2016 return true;
2017}
2018
2019
ee612634 2020/* Analyze groups of accesses: check that DR belongs to a group of
2021 accesses of legal size, step, etc. Detect gaps, single element
2022 interleaving, and other special cases. Set grouped access info.
fb85abff 2023 Collect groups of strided stores for further use in SLP analysis. */
2024
2025static bool
2026vect_analyze_group_access (struct data_reference *dr)
2027{
2028 tree step = DR_STEP (dr);
2029 tree scalar_type = TREE_TYPE (DR_REF (dr));
f9ae6f95 2030 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
fb85abff 2031 gimple stmt = DR_STMT (dr);
2032 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2033 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
37545e54 2034 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
994be998 2035 HOST_WIDE_INT dr_step = -1;
ee612634 2036 HOST_WIDE_INT groupsize, last_accessed_element = 1;
fb85abff 2037 bool slp_impossible = false;
d88ecd95 2038 struct loop *loop = NULL;
2039
2040 if (loop_vinfo)
2041 loop = LOOP_VINFO_LOOP (loop_vinfo);
fb85abff 2042
ee612634 2043 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2044 size of the interleaving group (including gaps). */
994be998 2045 if (tree_fits_shwi_p (step))
2046 {
2047 dr_step = tree_to_shwi (step);
2048 groupsize = absu_hwi (dr_step) / type_size;
2049 }
2050 else
2051 groupsize = 0;
fb85abff 2052
2053  /* A non-consecutive access is possible only if it is part of an interleaving group. */
21009880 2054 if (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
fb85abff 2055 {
2056      /* Check if this DR is a part of interleaving, and is a single
2057 element of the group that is accessed in the loop. */
48e1416a 2058
fb85abff 2059 /* Gaps are supported only for loads. STEP must be a multiple of the type
2060 size. The size of the group must be a power of 2. */
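      /* For illustration (hypothetical values): a load of a[4*i] with
	 4-byte elements has dr_step == 16 and groupsize == 4; the step is
	 a multiple of the type size and the group size is a power of two,
	 so the access is recorded below as single element interleaving of
	 size 4.  */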
2061 if (DR_IS_READ (dr)
2062 && (dr_step % type_size) == 0
ee612634 2063 && groupsize > 0
2064 && exact_log2 (groupsize) != -1)
fb85abff 2065 {
21009880 2066 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = stmt;
ee612634 2067 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
6d8fb6cf 2068 if (dump_enabled_p ())
fb85abff 2069 {
78bb46f5 2070 dump_printf_loc (MSG_NOTE, vect_location,
2071 "Detected single element interleaving ");
7bd765d4 2072 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2073 dump_printf (MSG_NOTE, " step ");
2074 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
78bb46f5 2075 dump_printf (MSG_NOTE, "\n");
fb85abff 2076 }
a4ee7fac 2077
2078 if (loop_vinfo)
2079 {
6d8fb6cf 2080 if (dump_enabled_p ())
7bd765d4 2081 dump_printf_loc (MSG_NOTE, vect_location,
78bb46f5 2082 "Data access with gaps requires scalar "
2083 "epilogue loop\n");
d88ecd95 2084 if (loop->inner)
2085 {
6d8fb6cf 2086 if (dump_enabled_p ())
7bd765d4 2087 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2088 "Peeling for outer loop is not"
78bb46f5 2089 " supported\n");
d88ecd95 2090 return false;
2091 }
2092
2093 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = true;
a4ee7fac 2094 }
2095
fb85abff 2096 return true;
2097 }
6ea6a380 2098
6d8fb6cf 2099 if (dump_enabled_p ())
6ea6a380 2100 {
7bd765d4 2101 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
78bb46f5 2102 "not consecutive access ");
2103 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2104 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
6ea6a380 2105 }
2106
2107 if (bb_vinfo)
2108 {
2109 /* Mark the statement as unvectorizable. */
2110 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2111 return true;
2112 }
7bd765d4 2113
fb85abff 2114 return false;
2115 }
2116
21009880 2117 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt)
fb85abff 2118 {
2119 /* First stmt in the interleaving chain. Check the chain. */
21009880 2120 gimple next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
fb85abff 2121 struct data_reference *data_ref = dr;
1a0e7d51 2122 unsigned int count = 1;
fb85abff 2123 tree prev_init = DR_INIT (data_ref);
2124 gimple prev = stmt;
8bbe6b75 2125 HOST_WIDE_INT diff, gaps = 0;
fb85abff 2126
2127 while (next)
2128 {
282bf14c 2129 /* Skip same data-refs. In case that two or more stmts share
2130 data-ref (supported only for loads), we vectorize only the first
2131 stmt, and the rest get their vectorized loads from the first
2132 one. */
fb85abff 2133 if (!tree_int_cst_compare (DR_INIT (data_ref),
2134 DR_INIT (STMT_VINFO_DATA_REF (
2135 vinfo_for_stmt (next)))))
2136 {
9ff25603 2137 if (DR_IS_WRITE (data_ref))
fb85abff 2138 {
6d8fb6cf 2139 if (dump_enabled_p ())
78bb46f5 2140 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2141 "Two store stmts share the same dr.\n");
fb85abff 2142 return false;
2143 }
2144
fb85abff 2145 /* For load use the same data-ref load. */
21009880 2146 GROUP_SAME_DR_STMT (vinfo_for_stmt (next)) = prev;
fb85abff 2147
2148 prev = next;
21009880 2149 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
fb85abff 2150 continue;
2151 }
a4ee7fac 2152
fb85abff 2153 prev = next;
8bbe6b75 2154 data_ref = STMT_VINFO_DATA_REF (vinfo_for_stmt (next));
fb85abff 2155
8bbe6b75 2156 /* All group members have the same STEP by construction. */
2157 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
fb85abff 2158
fb85abff 2159 /* Check that the distance between two accesses is equal to the type
2160 size. Otherwise, we have gaps. */
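	  /* For illustration (hypothetical values): with 4-byte elements,
	     a previous member at init 0 and the next member at init 8
	     give diff == 2, i.e. a gap of one element between them.  */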
f9ae6f95 2161 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2162 - TREE_INT_CST_LOW (prev_init)) / type_size;
fb85abff 2163 if (diff != 1)
2164 {
2165 /* FORNOW: SLP of accesses with gaps is not supported. */
2166 slp_impossible = true;
9ff25603 2167 if (DR_IS_WRITE (data_ref))
fb85abff 2168 {
6d8fb6cf 2169 if (dump_enabled_p ())
78bb46f5 2170 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2171 "interleaved store with gaps\n");
fb85abff 2172 return false;
2173 }
b11576bf 2174
2175 gaps += diff - 1;
fb85abff 2176 }
2177
a4ee7fac 2178 last_accessed_element += diff;
2179
fb85abff 2180 /* Store the gap from the previous member of the group. If there is no
21009880 2181 gap in the access, GROUP_GAP is always 1. */
2182 GROUP_GAP (vinfo_for_stmt (next)) = diff;
fb85abff 2183
2184 prev_init = DR_INIT (data_ref);
21009880 2185 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
fb85abff 2186 /* Count the number of data-refs in the chain. */
2187 count++;
2188 }
2189
994be998 2190 if (groupsize == 0)
2191 groupsize = count + gaps;
fb85abff 2192
994be998 2193      /* Check that the size of the interleaving group is equal to count for stores,
fb85abff 2194 i.e., that there are no gaps. */
904bd865 2195 if (groupsize != count
2196 && !DR_IS_READ (dr))
fb85abff 2197 {
904bd865 2198 if (dump_enabled_p ())
2199 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2200 "interleaved store with gaps\n");
2201 return false;
2202 }
2203
2204 /* If there is a gap after the last load in the group it is the
2205 difference between the groupsize and the last accessed
2206 element.
2207 When there is no gap, this difference should be 0. */
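      /* As a made-up example: a group of size 4 whose last accessed
	 element is 3 gets GROUP_GAP == 1, i.e. one element of gap after
	 the group.  */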
2208 GROUP_GAP (vinfo_for_stmt (stmt)) = groupsize - last_accessed_element;
fb85abff 2209
ee612634 2210 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
6d8fb6cf 2211 if (dump_enabled_p ())
904bd865 2212 {
2213 dump_printf_loc (MSG_NOTE, vect_location,
2214 "Detected interleaving of size %d starting with ",
2215 (int)groupsize);
2216 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
2217 if (GROUP_GAP (vinfo_for_stmt (stmt)) != 0)
2218 dump_printf_loc (MSG_NOTE, vect_location,
2219 "There is a gap of %d elements after the group\n",
2220 (int)GROUP_GAP (vinfo_for_stmt (stmt)));
2221 }
fb85abff 2222
48e1416a 2223 /* SLP: create an SLP data structure for every interleaving group of
fb85abff 2224 stores for further analysis in vect_analyse_slp. */
9ff25603 2225 if (DR_IS_WRITE (dr) && !slp_impossible)
37545e54 2226 {
2227 if (loop_vinfo)
f1f41a6c 2228 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt);
37545e54 2229 if (bb_vinfo)
f1f41a6c 2230 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt);
37545e54 2231 }
a4ee7fac 2232
2233 /* There is a gap in the end of the group. */
ee612634 2234 if (groupsize - last_accessed_element > 0 && loop_vinfo)
a4ee7fac 2235 {
6d8fb6cf 2236 if (dump_enabled_p ())
7bd765d4 2237 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
78bb46f5 2238 "Data access with gaps requires scalar "
2239 "epilogue loop\n");
d88ecd95 2240 if (loop->inner)
2241 {
6d8fb6cf 2242 if (dump_enabled_p ())
78bb46f5 2243 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2244 "Peeling for outer loop is not supported\n");
d88ecd95 2245 return false;
2246 }
2247
2248 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = true;
a4ee7fac 2249 }
fb85abff 2250 }
2251
2252 return true;
2253}
2254
2255
2256/* Analyze the access pattern of the data-reference DR.
2257 In case of non-consecutive accesses call vect_analyze_group_access() to
ee612634 2258 analyze groups of accesses. */
fb85abff 2259
2260static bool
2261vect_analyze_data_ref_access (struct data_reference *dr)
2262{
2263 tree step = DR_STEP (dr);
2264 tree scalar_type = TREE_TYPE (DR_REF (dr));
2265 gimple stmt = DR_STMT (dr);
2266 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2267 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
37545e54 2268 struct loop *loop = NULL;
fb85abff 2269
37545e54 2270 if (loop_vinfo)
2271 loop = LOOP_VINFO_LOOP (loop_vinfo);
48e1416a 2272
37545e54 2273 if (loop_vinfo && !step)
fb85abff 2274 {
6d8fb6cf 2275 if (dump_enabled_p ())
78bb46f5 2276 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2277 "bad data-ref access in loop\n");
fb85abff 2278 return false;
2279 }
2280
9b0be19c 2281 /* Allow loads with zero step in inner-loop vectorization. */
f634c3e9 2282 if (loop_vinfo && integer_zerop (step))
b04940e7 2283 {
2284 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
9b0be19c 2285 if (!nested_in_vect_loop_p (loop, stmt))
2286 return DR_IS_READ (dr);
2287 /* Allow references with zero step for outer loops marked
2288 with pragma omp simd only - it guarantees absence of
2289 loop-carried dependencies between inner loop iterations. */
2290 if (!loop->force_vectorize)
afa60cb4 2291 {
2292 if (dump_enabled_p ())
2293 dump_printf_loc (MSG_NOTE, vect_location,
78bb46f5 2294 "zero step in inner loop of nest\n");
afa60cb4 2295 return false;
2296 }
b04940e7 2297 }
fb85abff 2298
37545e54 2299 if (loop && nested_in_vect_loop_p (loop, stmt))
fb85abff 2300 {
2301 /* Interleaved accesses are not yet supported within outer-loop
2302 vectorization for references in the inner-loop. */
21009880 2303 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
fb85abff 2304
2305 /* For the rest of the analysis we use the outer-loop step. */
2306 step = STMT_VINFO_DR_STEP (stmt_info);
f634c3e9 2307 if (integer_zerop (step))
fb85abff 2308 {
6d8fb6cf 2309 if (dump_enabled_p ())
7bd765d4 2310 dump_printf_loc (MSG_NOTE, vect_location,
78bb46f5 2311 "zero step in outer loop.\n");
fb85abff 2312 if (DR_IS_READ (dr))
48e1416a 2313 return true;
fb85abff 2314 else
2315 return false;
2316 }
2317 }
2318
2319 /* Consecutive? */
f634c3e9 2320 if (TREE_CODE (step) == INTEGER_CST)
fb85abff 2321 {
f9ae6f95 2322 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
f634c3e9 2323 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2324 || (dr_step < 0
2325 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2326 {
2327 /* Mark that it is not interleaving. */
2328 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2329 return true;
2330 }
fb85abff 2331 }
2332
37545e54 2333 if (loop && nested_in_vect_loop_p (loop, stmt))
fb85abff 2334 {
6d8fb6cf 2335 if (dump_enabled_p ())
7bd765d4 2336 dump_printf_loc (MSG_NOTE, vect_location,
78bb46f5 2337 "grouped access in outer loop.\n");
fb85abff 2338 return false;
2339 }
2340
994be998 2341
f634c3e9 2342 /* Assume this is a DR handled by non-constant strided load case. */
2343 if (TREE_CODE (step) != INTEGER_CST)
e1c75243 2344 return (STMT_VINFO_STRIDED_P (stmt_info)
994be998 2345 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2346 || vect_analyze_group_access (dr)));
f634c3e9 2347
fb85abff 2348  /* Non-consecutive access - check if it's part of an interleaving group. */
2349 return vect_analyze_group_access (dr);
2350}
2351
9b44c85a 2352
2353
2354/* A helper function used in the comparator function to sort data
2355   references. T1 and T2 are the two trees to be compared.
2356 The function returns -1, 0, or 1. */
2357
2358static int
2359compare_tree (tree t1, tree t2)
2360{
2361 int i, cmp;
2362 enum tree_code code;
2363 char tclass;
2364
2365 if (t1 == t2)
2366 return 0;
2367 if (t1 == NULL)
2368 return -1;
2369 if (t2 == NULL)
2370 return 1;
2371
2372
2373 if (TREE_CODE (t1) != TREE_CODE (t2))
2374 return TREE_CODE (t1) < TREE_CODE (t2) ? -1 : 1;
2375
2376 code = TREE_CODE (t1);
2377 switch (code)
2378 {
2379 /* For const values, we can just use hash values for comparisons. */
2380 case INTEGER_CST:
2381 case REAL_CST:
2382 case FIXED_CST:
2383 case STRING_CST:
2384 case COMPLEX_CST:
2385 case VECTOR_CST:
2386 {
2387 hashval_t h1 = iterative_hash_expr (t1, 0);
2388 hashval_t h2 = iterative_hash_expr (t2, 0);
2389 if (h1 != h2)
2390 return h1 < h2 ? -1 : 1;
2391 break;
2392 }
2393
2394 case SSA_NAME:
2395 cmp = compare_tree (SSA_NAME_VAR (t1), SSA_NAME_VAR (t2));
2396 if (cmp != 0)
2397 return cmp;
2398
2399 if (SSA_NAME_VERSION (t1) != SSA_NAME_VERSION (t2))
2400 return SSA_NAME_VERSION (t1) < SSA_NAME_VERSION (t2) ? -1 : 1;
2401 break;
2402
2403 default:
2404 tclass = TREE_CODE_CLASS (code);
2405
2406 /* For var-decl, we could compare their UIDs. */
2407 if (tclass == tcc_declaration)
2408 {
2409 if (DECL_UID (t1) != DECL_UID (t2))
2410 return DECL_UID (t1) < DECL_UID (t2) ? -1 : 1;
2411 break;
2412 }
2413
2414 /* For expressions with operands, compare their operands recursively. */
2415 for (i = TREE_OPERAND_LENGTH (t1) - 1; i >= 0; --i)
2416 {
2417 cmp = compare_tree (TREE_OPERAND (t1, i), TREE_OPERAND (t2, i));
2418 if (cmp != 0)
2419 return cmp;
2420 }
2421 }
2422
2423 return 0;
2424}
2425
2426
68f15e9d 2427/* Compare two data-references DRA and DRB to sort them into chunks
2428 suitable for grouping. */
2429
2430static int
2431dr_group_sort_cmp (const void *dra_, const void *drb_)
2432{
2433 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2434 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
68f15e9d 2435 int cmp;
2436
2437 /* Stabilize sort. */
2438 if (dra == drb)
2439 return 0;
2440
2441 /* Ordering of DRs according to base. */
2442 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0))
2443 {
9b44c85a 2444 cmp = compare_tree (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb));
2445 if (cmp != 0)
2446 return cmp;
68f15e9d 2447 }
2448
2449 /* And according to DR_OFFSET. */
2450 if (!dr_equal_offsets_p (dra, drb))
2451 {
9b44c85a 2452 cmp = compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2453 if (cmp != 0)
2454 return cmp;
68f15e9d 2455 }
2456
2457 /* Put reads before writes. */
2458 if (DR_IS_READ (dra) != DR_IS_READ (drb))
2459 return DR_IS_READ (dra) ? -1 : 1;
2460
2461 /* Then sort after access size. */
2462 if (!operand_equal_p (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2463 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))), 0))
2464 {
9b44c85a 2465 cmp = compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2466 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2467 if (cmp != 0)
2468 return cmp;
68f15e9d 2469 }
2470
2471 /* And after step. */
2472 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2473 {
9b44c85a 2474 cmp = compare_tree (DR_STEP (dra), DR_STEP (drb));
2475 if (cmp != 0)
2476 return cmp;
68f15e9d 2477 }
2478
2479 /* Then sort after DR_INIT. In case of identical DRs sort after stmt UID. */
2480 cmp = tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb));
2481 if (cmp == 0)
2482 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2483 return cmp;
2484}
fb85abff 2485
2486/* Function vect_analyze_data_ref_accesses.
2487
2488 Analyze the access pattern of all the data references in the loop.
2489
2490 FORNOW: the only access pattern that is considered vectorizable is a
2491 simple step 1 (consecutive) access.
2492
2493 FORNOW: handle only arrays and pointer accesses. */
2494
2495bool
37545e54 2496vect_analyze_data_ref_accesses (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
fb85abff 2497{
2498 unsigned int i;
f1f41a6c 2499 vec<data_reference_p> datarefs;
fb85abff 2500 struct data_reference *dr;
2501
6d8fb6cf 2502 if (dump_enabled_p ())
7bd765d4 2503 dump_printf_loc (MSG_NOTE, vect_location,
78bb46f5 2504 "=== vect_analyze_data_ref_accesses ===\n");
fb85abff 2505
37545e54 2506 if (loop_vinfo)
2507 datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2508 else
2509 datarefs = BB_VINFO_DATAREFS (bb_vinfo);
2510
68f15e9d 2511 if (datarefs.is_empty ())
2512 return true;
2513
2514 /* Sort the array of datarefs to make building the interleaving chains
863a3781 2515 linear. Don't modify the original vector's order, it is needed for
2516 determining what dependencies are reversed. */
2517 vec<data_reference_p> datarefs_copy = datarefs.copy ();
90a2d741 2518 datarefs_copy.qsort (dr_group_sort_cmp);
68f15e9d 2519
2520 /* Build the interleaving chains. */
863a3781 2521 for (i = 0; i < datarefs_copy.length () - 1;)
68f15e9d 2522 {
863a3781 2523 data_reference_p dra = datarefs_copy[i];
68f15e9d 2524 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2525 stmt_vec_info lastinfo = NULL;
863a3781 2526 for (i = i + 1; i < datarefs_copy.length (); ++i)
68f15e9d 2527 {
863a3781 2528 data_reference_p drb = datarefs_copy[i];
68f15e9d 2529 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2530
2531 /* ??? Imperfect sorting (non-compatible types, non-modulo
2532	     accesses, same accesses) can lead to a group being artificially
2533 split here as we don't just skip over those. If it really
2534 matters we can push those to a worklist and re-iterate
2535	     over them. Then we can just skip ahead to the next DR here. */
2536
2537 /* Check that the data-refs have same first location (except init)
5c0fac99 2538 and they are both either store or load (not load and store,
2539 not masked loads or stores). */
68f15e9d 2540 if (DR_IS_READ (dra) != DR_IS_READ (drb)
2541 || !operand_equal_p (DR_BASE_ADDRESS (dra),
2542 DR_BASE_ADDRESS (drb), 0)
5c0fac99 2543 || !dr_equal_offsets_p (dra, drb)
2544 || !gimple_assign_single_p (DR_STMT (dra))
2545 || !gimple_assign_single_p (DR_STMT (drb)))
68f15e9d 2546 break;
2547
994be998 2548 /* Check that the data-refs have the same constant size. */
68f15e9d 2549 tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
2550 tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
e913b5cd 2551 if (!tree_fits_uhwi_p (sza)
2552 || !tree_fits_uhwi_p (szb)
994be998 2553 || !tree_int_cst_equal (sza, szb))
2554 break;
2555
2556 /* Check that the data-refs have the same step. */
2557 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
68f15e9d 2558 break;
2559
2560 /* Do not place the same access in the interleaving chain twice. */
2561 if (tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb)) == 0)
2562 break;
2563
2564 /* Check the types are compatible.
2565 ??? We don't distinguish this during sorting. */
2566 if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
2567 TREE_TYPE (DR_REF (drb))))
2568 break;
2569
2570 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
f9ae6f95 2571 HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
2572 HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
68f15e9d 2573 gcc_assert (init_a < init_b);
2574
2575 /* If init_b == init_a + the size of the type * k, we have an
2576 interleaving, and DRA is accessed before DRB. */
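	  /* For illustration (hypothetical values): 4-byte accesses with
	     init_a == 0 and init_b == 8 satisfy (8 - 0) % 4 == 0 and may
	     form one interleaving chain; init_b == 6 would fail the test
	     below and end the chain here.  */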
8c53c46c 2577 HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
68f15e9d 2578 if ((init_b - init_a) % type_size_a != 0)
2579 break;
2580
3599384d 2581 /* If we have a store, the accesses are adjacent. This splits
2582 groups into chunks we support (we don't support vectorization
2583 of stores with gaps). */
2584 if (!DR_IS_READ (dra)
2585 && (init_b - (HOST_WIDE_INT) TREE_INT_CST_LOW
2586 (DR_INIT (datarefs_copy[i-1]))
2587 != type_size_a))
2588 break;
2589
994be998 2590 /* If the step (if not zero or non-constant) is greater than the
2591 difference between data-refs' inits this splits groups into
2592 suitable sizes. */
2593 if (tree_fits_shwi_p (DR_STEP (dra)))
2594 {
2595 HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra));
2596 if (step != 0 && step <= (init_b - init_a))
2597 break;
2598 }
68f15e9d 2599
2600 if (dump_enabled_p ())
2601 {
2602 dump_printf_loc (MSG_NOTE, vect_location,
2603 "Detected interleaving ");
2604 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2605 dump_printf (MSG_NOTE, " and ");
2606 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
78bb46f5 2607 dump_printf (MSG_NOTE, "\n");
68f15e9d 2608 }
2609
2610 /* Link the found element into the group list. */
2611 if (!GROUP_FIRST_ELEMENT (stmtinfo_a))
2612 {
2613 GROUP_FIRST_ELEMENT (stmtinfo_a) = DR_STMT (dra);
2614 lastinfo = stmtinfo_a;
2615 }
2616 GROUP_FIRST_ELEMENT (stmtinfo_b) = DR_STMT (dra);
2617 GROUP_NEXT_ELEMENT (lastinfo) = DR_STMT (drb);
2618 lastinfo = stmtinfo_b;
2619 }
2620 }
2621
863a3781 2622 FOR_EACH_VEC_ELT (datarefs_copy, i, dr)
6ea6a380 2623 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
2624 && !vect_analyze_data_ref_access (dr))
fb85abff 2625 {
6d8fb6cf 2626 if (dump_enabled_p ())
78bb46f5 2627 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2628 "not vectorized: complicated access pattern.\n");
6ea6a380 2629
2630 if (bb_vinfo)
2631 {
2632 /* Mark the statement as not vectorizable. */
2633 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2634 continue;
2635 }
2636 else
863a3781 2637 {
2638 datarefs_copy.release ();
2639 return false;
2640 }
fb85abff 2641 }
2642
863a3781 2643 datarefs_copy.release ();
fb85abff 2644 return true;
2645}
2646
8a7b0f48 2647
43d14b66 2648/* Operator == between two dr_with_seg_len objects.
8a7b0f48 2649
2650 This equality operator is used to make sure two data refs
2651   are the same one so that we will consider combining the
2652 aliasing checks of those two pairs of data dependent data
2653 refs. */
2654
2655static bool
43d14b66 2656operator == (const dr_with_seg_len& d1,
2657 const dr_with_seg_len& d2)
8a7b0f48 2658{
43d14b66 2659 return operand_equal_p (DR_BASE_ADDRESS (d1.dr),
2660 DR_BASE_ADDRESS (d2.dr), 0)
2661 && compare_tree (d1.offset, d2.offset) == 0
2662 && compare_tree (d1.seg_len, d2.seg_len) == 0;
8a7b0f48 2663}
2664
43d14b66 2665/* Function comp_dr_with_seg_len_pair.
8a7b0f48 2666
43d14b66 2667 Comparison function for sorting objects of dr_with_seg_len_pair_t
8a7b0f48 2668 so that we can combine aliasing checks in one scan. */
2669
2670static int
43d14b66 2671comp_dr_with_seg_len_pair (const void *p1_, const void *p2_)
8a7b0f48 2672{
43d14b66 2673 const dr_with_seg_len_pair_t* p1 = (const dr_with_seg_len_pair_t *) p1_;
2674 const dr_with_seg_len_pair_t* p2 = (const dr_with_seg_len_pair_t *) p2_;
2675
2676 const dr_with_seg_len &p11 = p1->first,
2677 &p12 = p1->second,
2678 &p21 = p2->first,
2679 &p22 = p2->second;
2680
2681  /* For DR pairs (a, b) and (c, d), we only consider merging the alias checks
2682     if a and c have the same basic address and step, and b and d have the same
2683     address and step. Therefore, if either a&c or b&d don't have the same address
2684     and step, we don't care about the order of those two pairs after sorting. */
2685 int comp_res;
2686
2687 if ((comp_res = compare_tree (DR_BASE_ADDRESS (p11.dr),
2688 DR_BASE_ADDRESS (p21.dr))) != 0)
8a7b0f48 2689 return comp_res;
43d14b66 2690 if ((comp_res = compare_tree (DR_BASE_ADDRESS (p12.dr),
2691 DR_BASE_ADDRESS (p22.dr))) != 0)
2692 return comp_res;
2693 if ((comp_res = compare_tree (DR_STEP (p11.dr), DR_STEP (p21.dr))) != 0)
2694 return comp_res;
2695 if ((comp_res = compare_tree (DR_STEP (p12.dr), DR_STEP (p22.dr))) != 0)
2696 return comp_res;
2697 if ((comp_res = compare_tree (p11.offset, p21.offset)) != 0)
2698 return comp_res;
2699 if ((comp_res = compare_tree (p12.offset, p22.offset)) != 0)
8a7b0f48 2700 return comp_res;
8a7b0f48 2701
2702 return 0;
2703}
2704
8a7b0f48 2705/* Function vect_vfa_segment_size.
2706
2707   Create an expression that computes the size of the segment
2708   that will be accessed for a data reference. The function takes into
2709 account that realignment loads may access one more vector.
2710
2711 Input:
2712 DR: The data reference.
2713 LENGTH_FACTOR: segment length to consider.
2714
2715 Return an expression whose value is the size of segment which will be
2716 accessed by DR. */
2717
2718static tree
2719vect_vfa_segment_size (struct data_reference *dr, tree length_factor)
2720{
2721 tree segment_length;
2722
2723 if (integer_zerop (DR_STEP (dr)))
2724 segment_length = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
2725 else
2726 segment_length = size_binop (MULT_EXPR,
43d14b66 2727 fold_convert (sizetype, DR_STEP (dr)),
2728 fold_convert (sizetype, length_factor));
8a7b0f48 2729
2730 if (vect_supportable_dr_alignment (dr, false)
43d14b66 2731 == dr_explicit_realign_optimized)
8a7b0f48 2732 {
2733 tree vector_size = TYPE_SIZE_UNIT
2734 (STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr))));
2735
2736 segment_length = size_binop (PLUS_EXPR, segment_length, vector_size);
2737 }
2738 return segment_length;
2739}
2740
fb85abff 2741/* Function vect_prune_runtime_alias_test_list.
2742
2743 Prune a list of ddrs to be tested at run-time by versioning for alias.
8a7b0f48 2744 Merge several alias checks into one if possible.
fb85abff 2745   Return FALSE if the resulting list of ddrs is longer than allowed by
2746 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
2747
2748bool
2749vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
2750{
8a7b0f48 2751 vec<ddr_p> may_alias_ddrs =
fb85abff 2752 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
43d14b66 2753 vec<dr_with_seg_len_pair_t>& comp_alias_ddrs =
8a7b0f48 2754 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
2755 int vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2756 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
2757
2758 ddr_p ddr;
2759 unsigned int i;
2760 tree length_factor;
fb85abff 2761
6d8fb6cf 2762 if (dump_enabled_p ())
7bd765d4 2763 dump_printf_loc (MSG_NOTE, vect_location,
78bb46f5 2764 "=== vect_prune_runtime_alias_test_list ===\n");
fb85abff 2765
8a7b0f48 2766 if (may_alias_ddrs.is_empty ())
2767 return true;
2768
2769 /* Basically, for each pair of dependent data refs store_ptr_0
2770 and load_ptr_0, we create an expression:
2771
2772 ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
2773 || (load_ptr_0 + load_segment_length_0) <= store_ptr_0))
2774
2775 for aliasing checks. However, in some cases we can decrease
2776 the number of checks by combining two checks into one. For
2777 example, suppose we have another pair of data refs store_ptr_0
2778 and load_ptr_1, and if the following condition is satisfied:
2779
2780 load_ptr_0 < load_ptr_1 &&
2781 load_ptr_1 - load_ptr_0 - load_segment_length_0 < store_segment_length_0
2782
2783 (this condition means, in each iteration of vectorized loop,
2784 the accessed memory of store_ptr_0 cannot be between the memory
2785 of load_ptr_0 and load_ptr_1.)
2786
2787 we then can use only the following expression to finish the
2788     aliasing checks between store_ptr_0 & load_ptr_0 and
2789 store_ptr_0 & load_ptr_1:
2790
2791 ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
2792 || (load_ptr_1 + load_segment_length_1 <= store_ptr_0))
2793
2794 Note that we only consider that load_ptr_0 and load_ptr_1 have the
2795 same basic address. */
2796
2797 comp_alias_ddrs.create (may_alias_ddrs.length ());
2798
2799 /* First, we collect all data ref pairs for aliasing checks. */
2800 FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
fb85abff 2801 {
8a7b0f48 2802 struct data_reference *dr_a, *dr_b;
2803 gimple dr_group_first_a, dr_group_first_b;
2804 tree segment_length_a, segment_length_b;
2805 gimple stmt_a, stmt_b;
2806
2807 dr_a = DDR_A (ddr);
2808 stmt_a = DR_STMT (DDR_A (ddr));
2809 dr_group_first_a = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_a));
2810 if (dr_group_first_a)
2811 {
2812 stmt_a = dr_group_first_a;
2813 dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a));
2814 }
fb85abff 2815
8a7b0f48 2816 dr_b = DDR_B (ddr);
2817 stmt_b = DR_STMT (DDR_B (ddr));
2818 dr_group_first_b = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_b));
2819 if (dr_group_first_b)
2820 {
2821 stmt_b = dr_group_first_b;
2822 dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b));
2823 }
fb85abff 2824
8a7b0f48 2825 if (!operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0))
2826 length_factor = scalar_loop_iters;
2827 else
2828 length_factor = size_int (vect_factor);
2829 segment_length_a = vect_vfa_segment_size (dr_a, length_factor);
2830 segment_length_b = vect_vfa_segment_size (dr_b, length_factor);
2831
43d14b66 2832 dr_with_seg_len_pair_t dr_with_seg_len_pair
2833 (dr_with_seg_len (dr_a, segment_length_a),
2834 dr_with_seg_len (dr_b, segment_length_b));
2835
2836 if (compare_tree (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b)) > 0)
3d4d7ad1 2837 std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
8a7b0f48 2838
2839 comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
2840 }
2841
2842 /* Second, we sort the collected data ref pairs so that we can scan
2843 them once to combine all possible aliasing checks. */
43d14b66 2844 comp_alias_ddrs.qsort (comp_dr_with_seg_len_pair);
fb85abff 2845
8a7b0f48 2846 /* Third, we scan the sorted dr pairs and check if we can combine
2847 alias checks of two neighbouring dr pairs. */
2848 for (size_t i = 1; i < comp_alias_ddrs.length (); ++i)
2849 {
2850 /* Deal with two ddrs (dr_a1, dr_b1) and (dr_a2, dr_b2). */
43d14b66 2851 dr_with_seg_len *dr_a1 = &comp_alias_ddrs[i-1].first,
2852 *dr_b1 = &comp_alias_ddrs[i-1].second,
2853 *dr_a2 = &comp_alias_ddrs[i].first,
2854 *dr_b2 = &comp_alias_ddrs[i].second;
8a7b0f48 2855
2856 /* Remove duplicate data ref pairs. */
2857 if (*dr_a1 == *dr_a2 && *dr_b1 == *dr_b2)
2858 {
2859 if (dump_enabled_p ())
fb85abff 2860 {
8a7b0f48 2861 dump_printf_loc (MSG_NOTE, vect_location,
2862 "found equal ranges ");
2863 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2864 DR_REF (dr_a1->dr));
2865 dump_printf (MSG_NOTE, ", ");
2866 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2867 DR_REF (dr_b1->dr));
2868 dump_printf (MSG_NOTE, " and ");
2869 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2870 DR_REF (dr_a2->dr));
2871 dump_printf (MSG_NOTE, ", ");
2872 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2873 DR_REF (dr_b2->dr));
2874 dump_printf (MSG_NOTE, "\n");
fb85abff 2875 }
8a7b0f48 2876
2877 comp_alias_ddrs.ordered_remove (i--);
2878 continue;
fb85abff 2879 }
48e1416a 2880
8a7b0f48 2881 if (*dr_a1 == *dr_a2 || *dr_b1 == *dr_b2)
2882 {
2883	  /* We consider the case that DR_B1 and DR_B2 are the same memrefs,
2884 and DR_A1 and DR_A2 are two consecutive memrefs. */
2885 if (*dr_a1 == *dr_a2)
2886 {
3d4d7ad1 2887 std::swap (dr_a1, dr_b1);
2888 std::swap (dr_a2, dr_b2);
8a7b0f48 2889 }
2890
43d14b66 2891 if (!operand_equal_p (DR_BASE_ADDRESS (dr_a1->dr),
2892 DR_BASE_ADDRESS (dr_a2->dr),
2893 0)
55af3bae 2894 || !tree_fits_shwi_p (dr_a1->offset)
2895 || !tree_fits_shwi_p (dr_a2->offset))
8a7b0f48 2896 continue;
2897
55af3bae 2898 HOST_WIDE_INT diff = (tree_to_shwi (dr_a2->offset)
2899 - tree_to_shwi (dr_a1->offset));
8a7b0f48 2900
2901
2902 /* Now we check if the following condition is satisfied:
2903
2904 DIFF - SEGMENT_LENGTH_A < SEGMENT_LENGTH_B
2905
2906 where DIFF = DR_A2->OFFSET - DR_A1->OFFSET. However,
2907 SEGMENT_LENGTH_A or SEGMENT_LENGTH_B may not be constant so we
2908 have to make a best estimation. We can get the minimum value
2909 of SEGMENT_LENGTH_B as a constant, represented by MIN_SEG_LEN_B,
2910 then either of the following two conditions can guarantee the
2911 one above:
2912
2913 1: DIFF <= MIN_SEG_LEN_B
2914 2: DIFF - SEGMENT_LENGTH_A < MIN_SEG_LEN_B
2915
2916 */
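	  /* A hypothetical example (values not from the source): if DR_A1
	     and DR_A2 have offsets 0 and 16 and DR_B1's segment length is
	     a known 16, then DIFF == 16 <= MIN_SEG_LEN_B, so the two
	     checks are merged below and DR_A1's segment length becomes
	     DR_A2's segment length plus 16.  */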
2917
55af3bae 2918 HOST_WIDE_INT min_seg_len_b = (tree_fits_shwi_p (dr_b1->seg_len)
2919 ? tree_to_shwi (dr_b1->seg_len)
2920 : vect_factor);
8a7b0f48 2921
2922 if (diff <= min_seg_len_b
55af3bae 2923 || (tree_fits_shwi_p (dr_a1->seg_len)
2924 && diff - tree_to_shwi (dr_a1->seg_len) < min_seg_len_b))
8a7b0f48 2925 {
1c7f4889 2926 if (dump_enabled_p ())
2927 {
2928 dump_printf_loc (MSG_NOTE, vect_location,
2929 "merging ranges for ");
2930 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2931 DR_REF (dr_a1->dr));
2932 dump_printf (MSG_NOTE, ", ");
2933 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2934 DR_REF (dr_b1->dr));
2935 dump_printf (MSG_NOTE, " and ");
2936 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2937 DR_REF (dr_a2->dr));
2938 dump_printf (MSG_NOTE, ", ");
2939 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2940 DR_REF (dr_b2->dr));
2941 dump_printf (MSG_NOTE, "\n");
2942 }
2943
8a7b0f48 2944 dr_a1->seg_len = size_binop (PLUS_EXPR,
2945 dr_a2->seg_len, size_int (diff));
2946 comp_alias_ddrs.ordered_remove (i--);
2947 }
2948 }
fb85abff 2949 }
2950
1c7f4889 2951 dump_printf_loc (MSG_NOTE, vect_location,
2952 "improved number of alias checks from %d to %d\n",
2953 may_alias_ddrs.length (), comp_alias_ddrs.length ());
8a7b0f48 2954 if ((int) comp_alias_ddrs.length () >
2955 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
1c7f4889 2956 return false;
fb85abff 2957
2958 return true;
2959}
2960
16dfb112 2961/* Check whether a non-affine read in stmt is suitable for gather load
2962 and if so, return a builtin decl for that operation. */
2963
2964tree
2965vect_check_gather (gimple stmt, loop_vec_info loop_vinfo, tree *basep,
2966 tree *offp, int *scalep)
2967{
2968 HOST_WIDE_INT scale = 1, pbitpos, pbitsize;
2969 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2970 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2971 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
2972 tree offtype = NULL_TREE;
2973 tree decl, base, off;
3754d046 2974 machine_mode pmode;
16dfb112 2975 int punsignedp, pvolatilep;
2976
c71d3c24 2977 base = DR_REF (dr);
2978 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
2979 see if we can use the def stmt of the address. */
2980 if (is_gimple_call (stmt)
2981 && gimple_call_internal_p (stmt)
2982 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
2983 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
2984 && TREE_CODE (base) == MEM_REF
2985 && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
2986 && integer_zerop (TREE_OPERAND (base, 1))
2987 && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
2988 {
2989 gimple def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
2990 if (is_gimple_assign (def_stmt)
2991 && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
2992 base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
2993 }
2994
16dfb112 2995 /* The gather builtins need address of the form
2996 loop_invariant + vector * {1, 2, 4, 8}
2997 or
2998 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
2999 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3000 of loop invariants/SSA_NAMEs defined in the loop, with casts,
3001 multiplications and additions in it. To get a vector, we need
3002 a single SSA_NAME that will be defined in the loop and will
3003 contain everything that is not loop invariant and that can be
3004 vectorized. The following code attempts to find such a preexistng
3005 SSA_NAME OFF and put the loop invariants into a tree BASE
3006 that can be gimplified before the loop. */
c71d3c24 3007 base = get_inner_reference (base, &pbitsize, &pbitpos, &off,
16dfb112 3008 &pmode, &punsignedp, &pvolatilep, false);
3009 gcc_assert (base != NULL_TREE && (pbitpos % BITS_PER_UNIT) == 0);
3010
3011 if (TREE_CODE (base) == MEM_REF)
3012 {
3013 if (!integer_zerop (TREE_OPERAND (base, 1)))
3014 {
3015 if (off == NULL_TREE)
3016 {
5de9d3ed 3017 offset_int moff = mem_ref_offset (base);
e913b5cd 3018 off = wide_int_to_tree (sizetype, moff);
16dfb112 3019 }
3020 else
3021 off = size_binop (PLUS_EXPR, off,
3022 fold_convert (sizetype, TREE_OPERAND (base, 1)));
3023 }
3024 base = TREE_OPERAND (base, 0);
3025 }
3026 else
3027 base = build_fold_addr_expr (base);
3028
3029 if (off == NULL_TREE)
3030 off = size_zero_node;
3031
3032  /* If base is not loop invariant, then if off is 0 we start with just
3033     the constant offset in the loop invariant BASE and continue with base
3034     as OFF; otherwise give up.
3035 We could handle that case by gimplifying the addition of base + off
3036 into some SSA_NAME and use that as off, but for now punt. */
3037 if (!expr_invariant_in_loop_p (loop, base))
3038 {
3039 if (!integer_zerop (off))
3040 return NULL_TREE;
3041 off = base;
3042 base = size_int (pbitpos / BITS_PER_UNIT);
3043 }
3044 /* Otherwise put base + constant offset into the loop invariant BASE
3045 and continue with OFF. */
3046 else
3047 {
3048 base = fold_convert (sizetype, base);
3049 base = size_binop (PLUS_EXPR, base, size_int (pbitpos / BITS_PER_UNIT));
3050 }
3051
3052 /* OFF at this point may be either a SSA_NAME or some tree expression
3053 from get_inner_reference. Try to peel off loop invariants from it
3054 into BASE as long as possible. */
3055 STRIP_NOPS (off);
3056 while (offtype == NULL_TREE)
3057 {
3058 enum tree_code code;
3059 tree op0, op1, add = NULL_TREE;
3060
3061 if (TREE_CODE (off) == SSA_NAME)
3062 {
3063 gimple def_stmt = SSA_NAME_DEF_STMT (off);
3064
3065 if (expr_invariant_in_loop_p (loop, off))
3066 return NULL_TREE;
3067
3068 if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
3069 break;
3070
3071 op0 = gimple_assign_rhs1 (def_stmt);
3072 code = gimple_assign_rhs_code (def_stmt);
3073 op1 = gimple_assign_rhs2 (def_stmt);
3074 }
3075 else
3076 {
3077 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
3078 return NULL_TREE;
3079 code = TREE_CODE (off);
3080 extract_ops_from_tree (off, &code, &op0, &op1);
3081 }
3082 switch (code)
3083 {
3084 case POINTER_PLUS_EXPR:
3085 case PLUS_EXPR:
3086 if (expr_invariant_in_loop_p (loop, op0))
3087 {
3088 add = op0;
3089 off = op1;
3090 do_add:
3091 add = fold_convert (sizetype, add);
3092 if (scale != 1)
3093 add = size_binop (MULT_EXPR, add, size_int (scale));
3094 base = size_binop (PLUS_EXPR, base, add);
3095 continue;
3096 }
3097 if (expr_invariant_in_loop_p (loop, op1))
3098 {
3099 add = op1;
3100 off = op0;
3101 goto do_add;
3102 }
3103 break;
3104 case MINUS_EXPR:
3105 if (expr_invariant_in_loop_p (loop, op1))
3106 {
3107 add = fold_convert (sizetype, op1);
3108 add = size_binop (MINUS_EXPR, size_zero_node, add);
3109 off = op0;
3110 goto do_add;
3111 }
3112 break;
3113 case MULT_EXPR:
e913b5cd 3114 if (scale == 1 && tree_fits_shwi_p (op1))
16dfb112 3115 {
e913b5cd 3116 scale = tree_to_shwi (op1);
16dfb112 3117 off = op0;
3118 continue;
3119 }
3120 break;
3121 case SSA_NAME:
3122 off = op0;
3123 continue;
3124 CASE_CONVERT:
3125 if (!POINTER_TYPE_P (TREE_TYPE (op0))
3126 && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3127 break;
3128 if (TYPE_PRECISION (TREE_TYPE (op0))
3129 == TYPE_PRECISION (TREE_TYPE (off)))
3130 {
3131 off = op0;
3132 continue;
3133 }
3134 if (TYPE_PRECISION (TREE_TYPE (op0))
3135 < TYPE_PRECISION (TREE_TYPE (off)))
3136 {
3137 off = op0;
3138 offtype = TREE_TYPE (off);
3139 STRIP_NOPS (off);
3140 continue;
3141 }
3142 break;
3143 default:
3144 break;
3145 }
3146 break;
3147 }
3148
3149 /* If at the end OFF still isn't a SSA_NAME or isn't
3150 defined in the loop, punt. */
3151 if (TREE_CODE (off) != SSA_NAME
3152 || expr_invariant_in_loop_p (loop, off))
3153 return NULL_TREE;
3154
3155 if (offtype == NULL_TREE)
3156 offtype = TREE_TYPE (off);
3157
3158 decl = targetm.vectorize.builtin_gather (STMT_VINFO_VECTYPE (stmt_info),
3159 offtype, scale);
3160 if (decl == NULL_TREE)
3161 return NULL_TREE;
3162
3163 if (basep)
3164 *basep = base;
3165 if (offp)
3166 *offp = off;
3167 if (scalep)
3168 *scalep = scale;
3169 return decl;
3170}
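/* Illustrative sketch (added for exposition; not part of the original
   sources): assuming a target that provides the builtin_gather hook
   (e.g. AVX2-style gathers), a loop such as

     double f[N]; int idx[N];
     for (i = 0; i < n; i++)
       res += f[idx[i]];

   reads f[idx[i]], whose address is roughly f + (sizetype) idx_i * 8.
   The decomposition above would then return BASE = f (plus a zero
   constant offset), OFF = the SSA_NAME carrying idx[i], OFFTYPE = int
   (recorded when the widening conversion is peeled) and SCALE = 8
   (taken from the MULT_EXPR), provided targetm.vectorize.builtin_gather
   accepts that combination.  */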
3171
fb85abff 3172/* Function vect_analyze_data_refs.
3173
37545e54 3174 Find all the data references in the loop or basic block.
fb85abff 3175
3176 The general structure of the analysis of data refs in the vectorizer is as
3177 follows:
48e1416a 3178 1- vect_analyze_data_refs(loop/bb): call
37545e54 3179 compute_data_dependences_for_loop/bb to find and analyze all data-refs
3180 in the loop/bb and their dependences.
fb85abff 3181 2- vect_analyze_dependences(): apply dependence testing using ddrs.
3182 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
3183 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
3184
3185*/
3186
3187bool
91a74fc6 3188vect_analyze_data_refs (loop_vec_info loop_vinfo,
3189 bb_vec_info bb_vinfo,
ca91d3f8 3190 int *min_vf, unsigned *n_stmts)
fb85abff 3191{
37545e54 3192 struct loop *loop = NULL;
3193 basic_block bb = NULL;
fb85abff 3194 unsigned int i;
f1f41a6c 3195 vec<data_reference_p> datarefs;
fb85abff 3196 struct data_reference *dr;
3197 tree scalar_type;
3198
6d8fb6cf 3199 if (dump_enabled_p ())
7bd765d4 3200 dump_printf_loc (MSG_NOTE, vect_location,
3201 "=== vect_analyze_data_refs ===\n");
48e1416a 3202
37545e54 3203 if (loop_vinfo)
3204 {
d09768a4 3205 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
3206
37545e54 3207 loop = LOOP_VINFO_LOOP (loop_vinfo);
d09768a4 3208 datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
3209 if (!find_loop_nest (loop, &LOOP_VINFO_LOOP_NEST (loop_vinfo)))
590a32b6 3210 {
6d8fb6cf 3211 if (dump_enabled_p ())
78bb46f5 3212 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3213 "not vectorized: loop contains function calls"
3214 " or data references that cannot be analyzed\n");
590a32b6 3215 return false;
3216 }
3217
d09768a4 3218 for (i = 0; i < loop->num_nodes; i++)
3219 {
3220 gimple_stmt_iterator gsi;
3221
3222 for (gsi = gsi_start_bb (bbs[i]); !gsi_end_p (gsi); gsi_next (&gsi))
3223 {
3224 gimple stmt = gsi_stmt (gsi);
ca91d3f8 3225 if (is_gimple_debug (stmt))
3226 continue;
3227 ++*n_stmts;
d09768a4 3228 if (!find_data_references_in_stmt (loop, stmt, &datarefs))
3229 {
3230 if (is_gimple_call (stmt) && loop->safelen)
3231 {
3232 tree fndecl = gimple_call_fndecl (stmt), op;
3233 if (fndecl != NULL_TREE)
3234 {
415d1b9a 3235 struct cgraph_node *node = cgraph_node::get (fndecl);
d09768a4 3236 if (node != NULL && node->simd_clones != NULL)
3237 {
3238 unsigned int j, n = gimple_call_num_args (stmt);
3239 for (j = 0; j < n; j++)
3240 {
3241 op = gimple_call_arg (stmt, j);
3242 if (DECL_P (op)
3243 || (REFERENCE_CLASS_P (op)
3244 && get_base_address (op)))
3245 break;
3246 }
3247 op = gimple_call_lhs (stmt);
3248 /* Ignore #pragma omp declare simd functions
3249 if they don't have data references in the
3250 call stmt itself. */
3251 if (j == n
3252 && !(op
3253 && (DECL_P (op)
3254 || (REFERENCE_CLASS_P (op)
3255 && get_base_address (op)))))
3256 continue;
3257 }
3258 }
3259 }
3260 LOOP_VINFO_DATAREFS (loop_vinfo) = datarefs;
3261 if (dump_enabled_p ())
3262 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3263 "not vectorized: loop contains function "
3264 "calls or data references that cannot "
3265 "be analyzed\n");
3266 return false;
3267 }
3268 }
3269 }
3270
3271 LOOP_VINFO_DATAREFS (loop_vinfo) = datarefs;
37545e54 3272 }
3273 else
3274 {
ec84182d 3275 gimple_stmt_iterator gsi;
3276
37545e54 3277 bb = BB_VINFO_BB (bb_vinfo);
ec84182d 3278 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
3279 {
3280 gimple stmt = gsi_stmt (gsi);
ca91d3f8 3281 if (is_gimple_debug (stmt))
3282 continue;
3283 ++*n_stmts;
ec84182d 3284 if (!find_data_references_in_stmt (NULL, stmt,
3285 &BB_VINFO_DATAREFS (bb_vinfo)))
3286 {
3287 /* Mark the rest of the basic-block as unvectorizable. */
3288 for (; !gsi_end_p (gsi); gsi_next (&gsi))
156b8feb 3289 {
3290 stmt = gsi_stmt (gsi);
3291 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (stmt)) = false;
3292 }
ec84182d 3293 break;
3294 }
3295 }
590a32b6 3296
37545e54 3297 datarefs = BB_VINFO_DATAREFS (bb_vinfo);
3298 }
fb85abff 3299
282bf14c 3300 /* Go through the data-refs, check that the analysis succeeded. Update
3301 pointer from stmt_vec_info struct to DR and vectype. */
fb85abff 3302
f1f41a6c 3303 FOR_EACH_VEC_ELT (datarefs, i, dr)
fb85abff 3304 {
3305 gimple stmt;
3306 stmt_vec_info stmt_info;
48e1416a 3307 tree base, offset, init;
16dfb112 3308 bool gather = false;
3d483a94 3309 bool simd_lane_access = false;
91a74fc6 3310 int vf;
48e1416a 3311
8911f4de 3312again:
fb85abff 3313 if (!dr || !DR_REF (dr))
3314 {
6d8fb6cf 3315 if (dump_enabled_p ())
7bd765d4 3316 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
78bb46f5 3317 "not vectorized: unhandled data-ref\n");
fb85abff 3318 return false;
3319 }
3320
3321 stmt = DR_STMT (dr);
3322 stmt_info = vinfo_for_stmt (stmt);
3323
8911f4de 3324 /* Discard clobbers from the dataref vector. We will remove
3325 clobber stmts during vectorization. */
3326 if (gimple_clobber_p (stmt))
3327 {
53c3c39b 3328 free_data_ref (dr);
8911f4de 3329 if (i == datarefs.length () - 1)
3330 {
3331 datarefs.pop ();
3332 break;
3333 }
4aaf064d 3334 datarefs.ordered_remove (i);
3335 dr = datarefs[i];
8911f4de 3336 goto again;
3337 }
3338
fb85abff 3339 /* Check that analysis of the data-ref succeeded. */
3340 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
16dfb112 3341 || !DR_STEP (dr))
fb85abff 3342 {
3d483a94 3343 bool maybe_gather
3344 = DR_IS_READ (dr)
16dfb112 3345 && !TREE_THIS_VOLATILE (DR_REF (dr))
3d483a94 3346 && targetm.vectorize.builtin_gather != NULL;
3347 bool maybe_simd_lane_access
3348 = loop_vinfo && loop->simduid;
3349
 3350	  /* If the target supports vector gather loads, or if this might be
 3351	     a SIMD lane access, see if they can be used.  */
3352 if (loop_vinfo
3353 && (maybe_gather || maybe_simd_lane_access)
16dfb112 3354 && !nested_in_vect_loop_p (loop, stmt))
3355 {
3356 struct data_reference *newdr
3357 = create_data_ref (NULL, loop_containing_stmt (stmt),
3358 DR_REF (dr), stmt, true);
3359 gcc_assert (newdr != NULL && DR_REF (newdr));
3360 if (DR_BASE_ADDRESS (newdr)
3361 && DR_OFFSET (newdr)
3362 && DR_INIT (newdr)
3363 && DR_STEP (newdr)
3364 && integer_zerop (DR_STEP (newdr)))
3365 {
3d483a94 3366 if (maybe_simd_lane_access)
3367 {
3368 tree off = DR_OFFSET (newdr);
3369 STRIP_NOPS (off);
3370 if (TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
3371 && TREE_CODE (off) == MULT_EXPR
d85a2013 3372 && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
3d483a94 3373 {
3374 tree step = TREE_OPERAND (off, 1);
3375 off = TREE_OPERAND (off, 0);
3376 STRIP_NOPS (off);
3377 if (CONVERT_EXPR_P (off)
3378 && TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off,
3379 0)))
3380 < TYPE_PRECISION (TREE_TYPE (off)))
3381 off = TREE_OPERAND (off, 0);
3382 if (TREE_CODE (off) == SSA_NAME)
3383 {
3384 gimple def = SSA_NAME_DEF_STMT (off);
3385 tree reft = TREE_TYPE (DR_REF (newdr));
cae17039 3386 if (is_gimple_call (def)
3387 && gimple_call_internal_p (def)
3388 && (gimple_call_internal_fn (def)
3389 == IFN_GOMP_SIMD_LANE))
3d483a94 3390 {
3391 tree arg = gimple_call_arg (def, 0);
3392 gcc_assert (TREE_CODE (arg) == SSA_NAME);
3393 arg = SSA_NAME_VAR (arg);
3394 if (arg == loop->simduid
3395 /* For now. */
3396 && tree_int_cst_equal
3397 (TYPE_SIZE_UNIT (reft),
3398 step))
3399 {
3400 DR_OFFSET (newdr) = ssize_int (0);
3401 DR_STEP (newdr) = step;
3b36c9f7 3402 DR_ALIGNED_TO (newdr)
3403 = size_int (BIGGEST_ALIGNMENT);
3d483a94 3404 dr = newdr;
3405 simd_lane_access = true;
3406 }
3407 }
3408 }
3409 }
3410 }
3411 if (!simd_lane_access && maybe_gather)
3412 {
3413 dr = newdr;
3414 gather = true;
3415 }
16dfb112 3416 }
3d483a94 3417 if (!gather && !simd_lane_access)
16dfb112 3418 free_data_ref (newdr);
3419 }
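	  /* Illustrative note (added for exposition; not part of the
	     original sources): the SIMD-lane case above matches accesses
	     created for "#pragma omp simd" privatized arrays, e.g. a
	     DR_OFFSET of the form

	       (sizetype) _5 * 4    where  _5 = GOMP_SIMD_LANE (simduid.0);

	     for a 4-byte element.  Such a reference is rewritten to
	     offset 0 and step 4 (the element size) and marked as a
	     SIMD-lane access for the later analysis.  */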
6ea6a380 3420
3d483a94 3421 if (!gather && !simd_lane_access)
16dfb112 3422 {
6d8fb6cf 3423 if (dump_enabled_p ())
16dfb112 3424 {
78bb46f5 3425 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7bd765d4 3426 "not vectorized: data ref analysis "
3427 "failed ");
3428 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
78bb46f5 3429 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
16dfb112 3430 }
3c9feaca 3431
16dfb112 3432 if (bb_vinfo)
07e3bcbf 3433 break;
16dfb112 3434
3435 return false;
3436 }
fb85abff 3437 }
3438
3439 if (TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
3440 {
6d8fb6cf 3441 if (dump_enabled_p ())
7bd765d4 3442 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3443 "not vectorized: base addr of dr is a "
78bb46f5 3444 "constant\n");
3c9feaca 3445
3446 if (bb_vinfo)
07e3bcbf 3447 break;
3c9feaca 3448
3d483a94 3449 if (gather || simd_lane_access)
16dfb112 3450 free_data_ref (dr);
3451 return false;
fb85abff 3452 }
3453
83ec4165 3454 if (TREE_THIS_VOLATILE (DR_REF (dr)))
3455 {
6d8fb6cf 3456 if (dump_enabled_p ())
83ec4165 3457 {
7bd765d4 3458 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3459 "not vectorized: volatile type ");
3460 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
78bb46f5 3461 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
83ec4165 3462 }
3c9feaca 3463
3464 if (bb_vinfo)
07e3bcbf 3465 break;
3c9feaca 3466
83ec4165 3467 return false;
3468 }
3469
96b07a49 3470 if (stmt_can_throw_internal (stmt))
42a6710e 3471 {
6d8fb6cf 3472 if (dump_enabled_p ())
42a6710e 3473 {
7bd765d4 3474 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3475 "not vectorized: statement can throw an "
3476 "exception ");
3477 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
78bb46f5 3478 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
42a6710e 3479 }
3c9feaca 3480
3481 if (bb_vinfo)
07e3bcbf 3482 break;
3c9feaca 3483
3d483a94 3484 if (gather || simd_lane_access)
16dfb112 3485 free_data_ref (dr);
42a6710e 3486 return false;
3487 }
3488
87c952b8 3489 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
3490 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
3491 {
6d8fb6cf 3492 if (dump_enabled_p ())
87c952b8 3493 {
7bd765d4 3494 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3495 "not vectorized: statement is bitfield "
3496 "access ");
3497 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
78bb46f5 3498 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
87c952b8 3499 }
3500
3501 if (bb_vinfo)
07e3bcbf 3502 break;
87c952b8 3503
3d483a94 3504 if (gather || simd_lane_access)
87c952b8 3505 free_data_ref (dr);
3506 return false;
3507 }
3508
3509 base = unshare_expr (DR_BASE_ADDRESS (dr));
3510 offset = unshare_expr (DR_OFFSET (dr));
3511 init = unshare_expr (DR_INIT (dr));
3512
c71d3c24 3513 if (is_gimple_call (stmt)
3514 && (!gimple_call_internal_p (stmt)
3515 || (gimple_call_internal_fn (stmt) != IFN_MASK_LOAD
3516 && gimple_call_internal_fn (stmt) != IFN_MASK_STORE)))
635bf3aa 3517 {
6d8fb6cf 3518 if (dump_enabled_p ())
635bf3aa 3519 {
7bd765d4 3520 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
78bb46f5 3521 "not vectorized: dr in a call ");
7bd765d4 3522 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
78bb46f5 3523 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
635bf3aa 3524 }
3525
3526 if (bb_vinfo)
07e3bcbf 3527 break;
635bf3aa 3528
3d483a94 3529 if (gather || simd_lane_access)
635bf3aa 3530 free_data_ref (dr);
3531 return false;
3532 }
3533
fb85abff 3534 /* Update DR field in stmt_vec_info struct. */
fb85abff 3535
3536 /* If the dataref is in an inner-loop of the loop that is considered for
 3537	       vectorization, we also want to analyze the access relative to
48e1416a 3538 the outer-loop (DR contains information only relative to the
fb85abff 3539 inner-most enclosing loop). We do that by building a reference to the
3540 first location accessed by the inner-loop, and analyze it relative to
48e1416a 3541 the outer-loop. */
3542 if (loop && nested_in_vect_loop_p (loop, stmt))
fb85abff 3543 {
3544 tree outer_step, outer_base, outer_init;
3545 HOST_WIDE_INT pbitsize, pbitpos;
3546 tree poffset;
3754d046 3547 machine_mode pmode;
fb85abff 3548 int punsignedp, pvolatilep;
3549 affine_iv base_iv, offset_iv;
3550 tree dinit;
3551
48e1416a 3552 /* Build a reference to the first location accessed by the
282bf14c 3553 inner-loop: *(BASE+INIT). (The first location is actually
fb85abff 3554 BASE+INIT+OFFSET, but we add OFFSET separately later). */
3555 tree inner_base = build_fold_indirect_ref
2cc66f2a 3556 (fold_build_pointer_plus (base, init));
fb85abff 3557
6d8fb6cf 3558 if (dump_enabled_p ())
fb85abff 3559 {
7bd765d4 3560 dump_printf_loc (MSG_NOTE, vect_location,
3561 "analyze in outer-loop: ");
3562 dump_generic_expr (MSG_NOTE, TDF_SLIM, inner_base);
78bb46f5 3563 dump_printf (MSG_NOTE, "\n");
fb85abff 3564 }
3565
48e1416a 3566 outer_base = get_inner_reference (inner_base, &pbitsize, &pbitpos,
fb85abff 3567 &poffset, &pmode, &punsignedp, &pvolatilep, false);
3568 gcc_assert (outer_base != NULL_TREE);
3569
3570 if (pbitpos % BITS_PER_UNIT != 0)
3571 {
6d8fb6cf 3572 if (dump_enabled_p ())
7bd765d4 3573 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3574 "failed: bit offset alignment.\n");
fb85abff 3575 return false;
3576 }
3577
3578 outer_base = build_fold_addr_expr (outer_base);
48e1416a 3579 if (!simple_iv (loop, loop_containing_stmt (stmt), outer_base,
fb85abff 3580 &base_iv, false))
3581 {
6d8fb6cf 3582 if (dump_enabled_p ())
78bb46f5 3583 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7bd765d4 3584 "failed: evolution of base is not affine.\n");
fb85abff 3585 return false;
3586 }
3587
3588 if (offset)
3589 {
3590 if (poffset)
48e1416a 3591 poffset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset), offset,
fb85abff 3592 poffset);
3593 else
3594 poffset = offset;
3595 }
3596
3597 if (!poffset)
3598 {
3599 offset_iv.base = ssize_int (0);
3600 offset_iv.step = ssize_int (0);
3601 }
48e1416a 3602 else if (!simple_iv (loop, loop_containing_stmt (stmt), poffset,
fb85abff 3603 &offset_iv, false))
3604 {
6d8fb6cf 3605 if (dump_enabled_p ())
78bb46f5 3606 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7bd765d4 3607 "evolution of offset is not affine.\n");
fb85abff 3608 return false;
3609 }
3610
3611 outer_init = ssize_int (pbitpos / BITS_PER_UNIT);
3612 split_constant_offset (base_iv.base, &base_iv.base, &dinit);
3613 outer_init = size_binop (PLUS_EXPR, outer_init, dinit);
3614 split_constant_offset (offset_iv.base, &offset_iv.base, &dinit);
3615 outer_init = size_binop (PLUS_EXPR, outer_init, dinit);
3616
3617 outer_step = size_binop (PLUS_EXPR,
3618 fold_convert (ssizetype, base_iv.step),
3619 fold_convert (ssizetype, offset_iv.step));
3620
3621 STMT_VINFO_DR_STEP (stmt_info) = outer_step;
3622 /* FIXME: Use canonicalize_base_object_address (base_iv.base); */
48e1416a 3623 STMT_VINFO_DR_BASE_ADDRESS (stmt_info) = base_iv.base;
fb85abff 3624 STMT_VINFO_DR_INIT (stmt_info) = outer_init;
48e1416a 3625 STMT_VINFO_DR_OFFSET (stmt_info) =
fb85abff 3626 fold_convert (ssizetype, offset_iv.base);
48e1416a 3627 STMT_VINFO_DR_ALIGNED_TO (stmt_info) =
fb85abff 3628 size_int (highest_pow2_factor (offset_iv.base));
3629
6d8fb6cf 3630 if (dump_enabled_p ())
fb85abff 3631 {
7bd765d4 3632 dump_printf_loc (MSG_NOTE, vect_location,
3633 "\touter base_address: ");
3634 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3635 STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
3636 dump_printf (MSG_NOTE, "\n\touter offset from base address: ");
3637 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3638 STMT_VINFO_DR_OFFSET (stmt_info));
3639 dump_printf (MSG_NOTE,
3640 "\n\touter constant offset from base address: ");
3641 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3642 STMT_VINFO_DR_INIT (stmt_info));
3643 dump_printf (MSG_NOTE, "\n\touter step: ");
3644 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3645 STMT_VINFO_DR_STEP (stmt_info));
3646 dump_printf (MSG_NOTE, "\n\touter aligned to: ");
3647 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3648 STMT_VINFO_DR_ALIGNED_TO (stmt_info));
78bb46f5 3649 dump_printf (MSG_NOTE, "\n");
fb85abff 3650 }
3651 }
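      /* Illustrative example (added for exposition; not part of the
	 original sources): for the nested access in[i+j] shown in the
	 comment before vect_create_addr_base_for_vector_ref below
	 ('in' pointing to shorts, the outer i-loop being vectorized),
	 the code above records the outer-loop view of the DR:
	 STMT_VINFO_DR_BASE_ADDRESS is &in and STMT_VINFO_DR_STEP is 2,
	 since the first location read by the inner j-loop advances by
	 one short per i-loop iteration.  */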
3652
3653 if (STMT_VINFO_DATA_REF (stmt_info))
3654 {
6d8fb6cf 3655 if (dump_enabled_p ())
fb85abff 3656 {
7bd765d4 3657 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3658 "not vectorized: more than one data ref "
3659 "in stmt: ");
3660 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
78bb46f5 3661 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
fb85abff 3662 }
3c9feaca 3663
3664 if (bb_vinfo)
07e3bcbf 3665 break;
3c9feaca 3666
3d483a94 3667 if (gather || simd_lane_access)
16dfb112 3668 free_data_ref (dr);
fb85abff 3669 return false;
3670 }
f083cd24 3671
fb85abff 3672 STMT_VINFO_DATA_REF (stmt_info) = dr;
3d483a94 3673 if (simd_lane_access)
3674 {
3675 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) = true;
53c3c39b 3676 free_data_ref (datarefs[i]);
3d483a94 3677 datarefs[i] = dr;
3678 }
48e1416a 3679
fb85abff 3680 /* Set vectype for STMT. */
3681 scalar_type = TREE_TYPE (DR_REF (dr));
53c3c39b 3682 STMT_VINFO_VECTYPE (stmt_info)
3683 = get_vectype_for_scalar_type (scalar_type);
48e1416a 3684 if (!STMT_VINFO_VECTYPE (stmt_info))
fb85abff 3685 {
6d8fb6cf 3686 if (dump_enabled_p ())
fb85abff 3687 {
78bb46f5 3688 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7bd765d4 3689 "not vectorized: no vectype for stmt: ");
3690 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3691 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
3692 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
3693 scalar_type);
78bb46f5 3694 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
fb85abff 3695 }
6ea6a380 3696
3697 if (bb_vinfo)
07e3bcbf 3698 break;
16dfb112 3699
3d483a94 3700 if (gather || simd_lane_access)
16dfb112 3701 {
3702 STMT_VINFO_DATA_REF (stmt_info) = NULL;
53c3c39b 3703 if (gather)
3704 free_data_ref (dr);
16dfb112 3705 }
3706 return false;
fb85abff 3707 }
0bf5f81b 3708 else
3709 {
3710 if (dump_enabled_p ())
3711 {
3712 dump_printf_loc (MSG_NOTE, vect_location,
3713 "got vectype for stmt: ");
3714 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
3715 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3716 STMT_VINFO_VECTYPE (stmt_info));
78bb46f5 3717 dump_printf (MSG_NOTE, "\n");
0bf5f81b 3718 }
3719 }
91a74fc6 3720
3721 /* Adjust the minimal vectorization factor according to the
3722 vector type. */
3723 vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
3724 if (vf > *min_vf)
3725 *min_vf = vf;
16dfb112 3726
3727 if (gather)
3728 {
16dfb112 3729 tree off;
16dfb112 3730
7a10f06b 3731 gather = 0 != vect_check_gather (stmt, loop_vinfo, NULL, &off, NULL);
3732 if (gather
3733 && get_vectype_for_scalar_type (TREE_TYPE (off)) == NULL_TREE)
3734 gather = false;
f634c3e9 3735 if (!gather)
16dfb112 3736 {
706567b8 3737 STMT_VINFO_DATA_REF (stmt_info) = NULL;
3738 free_data_ref (dr);
6d8fb6cf 3739 if (dump_enabled_p ())
16dfb112 3740 {
7bd765d4 3741 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3742 "not vectorized: not suitable for gather "
3743 "load ");
3744 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
78bb46f5 3745 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
16dfb112 3746 }
3747 return false;
3748 }
3749
f1f41a6c 3750 datarefs[i] = dr;
f634c3e9 3751 STMT_VINFO_GATHER_P (stmt_info) = true;
3752 }
3753 else if (loop_vinfo
3754 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
3755 {
e1c75243 3756 if (nested_in_vect_loop_p (loop, stmt))
f634c3e9 3757 {
6d8fb6cf 3758 if (dump_enabled_p ())
f634c3e9 3759 {
7bd765d4 3760 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3761 "not vectorized: not suitable for strided "
3762 "load ");
3763 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
78bb46f5 3764 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
f634c3e9 3765 }
3766 return false;
3767 }
e1c75243 3768 STMT_VINFO_STRIDED_P (stmt_info) = true;
16dfb112 3769 }
fb85abff 3770 }
48e1416a 3771
07e3bcbf 3772  /* If we stopped analysis at the first dataref we could not analyze
 3773     when trying to vectorize a basic block, mark the rest of the datarefs
 3774     as not vectorizable and truncate the vector of datarefs.  That
 3775     avoids spending useless time analyzing their dependences.  */
3776 if (i != datarefs.length ())
3777 {
3778 gcc_assert (bb_vinfo != NULL);
3779 for (unsigned j = i; j < datarefs.length (); ++j)
3780 {
3781 data_reference_p dr = datarefs[j];
3782 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
3783 free_data_ref (dr);
3784 }
3785 datarefs.truncate (i);
3786 }
3787
fb85abff 3788 return true;
3789}
3790
3791
3792/* Function vect_get_new_vect_var.
3793
282bf14c 3794   Returns a name for a new variable.  The current naming scheme prepends
48e1416a 3795   one of the prefixes "vect", "stmp" or "vectp" (depending on the value of
 3796   VAR_KIND) and an underscore to NAME if provided; otherwise just the bare
fb85abff 3797   prefix is used.  */
3798
3799tree
3800vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
3801{
3802 const char *prefix;
3803 tree new_vect_var;
3804
3805 switch (var_kind)
3806 {
3807 case vect_simple_var:
0bf5f81b 3808 prefix = "vect";
fb85abff 3809 break;
3810 case vect_scalar_var:
0bf5f81b 3811 prefix = "stmp";
fb85abff 3812 break;
3813 case vect_pointer_var:
0bf5f81b 3814 prefix = "vectp";
fb85abff 3815 break;
3816 default:
3817 gcc_unreachable ();
3818 }
3819
3820 if (name)
3821 {
0bf5f81b 3822 char* tmp = concat (prefix, "_", name, NULL);
35244493 3823 new_vect_var = create_tmp_reg (type, tmp);
fb85abff 3824 free (tmp);
3825 }
3826 else
35244493 3827 new_vect_var = create_tmp_reg (type, prefix);
fb85abff 3828
3829 return new_vect_var;
3830}
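/* Example of the naming scheme above (added for exposition): a call
   like vect_get_new_vect_var (vectype, vect_simple_var, "x") creates a
   register temporary named "vect_x", vect_pointer_var yields "vectp_x",
   and a NULL NAME yields just the bare prefix ("vect", "stmp" or
   "vectp").  */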
3831
4a2edd22 3832/* Duplicate ptr info and set alignment/misalignment on NAME from DR.  */
3833
3834static void
3835vect_duplicate_ssa_name_ptr_info (tree name, data_reference *dr,
3836 stmt_vec_info stmt_info)
3837{
3838 duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr));
3839 unsigned int align = TYPE_ALIGN_UNIT (STMT_VINFO_VECTYPE (stmt_info));
3840 int misalign = DR_MISALIGNMENT (dr);
3841 if (misalign == -1)
3842 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
3843 else
3844 set_ptr_info_alignment (SSA_NAME_PTR_INFO (name), align, misalign);
3845}
fb85abff 3846
3847/* Function vect_create_addr_base_for_vector_ref.
3848
3849 Create an expression that computes the address of the first memory location
3850 that will be accessed for a data reference.
3851
3852 Input:
3853 STMT: The statement containing the data reference.
3854 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
 3855   OFFSET: Optional.  If supplied, it is added to the initial address.
3856 LOOP: Specify relative to which loop-nest should the address be computed.
3857 For example, when the dataref is in an inner-loop nested in an
3858 outer-loop that is now being vectorized, LOOP can be either the
282bf14c 3859 outer-loop, or the inner-loop. The first memory location accessed
fb85abff 3860 by the following dataref ('in' points to short):
3861
3862 for (i=0; i<N; i++)
3863 for (j=0; j<M; j++)
3864 s += in[i+j]
3865
3866 is as follows:
3867 if LOOP=i_loop: &in (relative to i_loop)
3868 if LOOP=j_loop: &in+i*2B (relative to j_loop)
1ec61bbd 3869 BYTE_OFFSET: Optional, defaulted to NULL. If supplied, it is added to the
3870 initial address. Unlike OFFSET, which is number of elements to
3871 be added, BYTE_OFFSET is measured in bytes.
fb85abff 3872
3873 Output:
48e1416a 3874 1. Return an SSA_NAME whose value is the address of the memory location of
fb85abff 3875 the first vector of the data reference.
3876 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
3877 these statement(s) which define the returned SSA_NAME.
3878
3879 FORNOW: We are only handling array accesses with step 1. */
3880
3881tree
3882vect_create_addr_base_for_vector_ref (gimple stmt,
3883 gimple_seq *new_stmt_list,
3884 tree offset,
1ec61bbd 3885 struct loop *loop,
3886 tree byte_offset)
fb85abff 3887{
3888 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3889 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
90d4c4af 3890 tree data_ref_base;
3c18ea71 3891 const char *base_name;
90d4c4af 3892 tree addr_base;
fb85abff 3893 tree dest;
3894 gimple_seq seq = NULL;
90d4c4af 3895 tree base_offset;
3896 tree init;
f083cd24 3897 tree vect_ptr_type;
fb85abff 3898 tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
37545e54 3899 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
fb85abff 3900
37545e54 3901 if (loop_vinfo && loop && loop != (gimple_bb (stmt))->loop_father)
fb85abff 3902 {
37545e54 3903 struct loop *outer_loop = LOOP_VINFO_LOOP (loop_vinfo);
fb85abff 3904
37545e54 3905 gcc_assert (nested_in_vect_loop_p (outer_loop, stmt));
fb85abff 3906
3907 data_ref_base = unshare_expr (STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
3908 base_offset = unshare_expr (STMT_VINFO_DR_OFFSET (stmt_info));
3909 init = unshare_expr (STMT_VINFO_DR_INIT (stmt_info));
3910 }
90d4c4af 3911 else
3912 {
3913 data_ref_base = unshare_expr (DR_BASE_ADDRESS (dr));
3914 base_offset = unshare_expr (DR_OFFSET (dr));
3915 init = unshare_expr (DR_INIT (dr));
3916 }
fb85abff 3917
37545e54 3918 if (loop_vinfo)
3c18ea71 3919 base_name = get_name (data_ref_base);
37545e54 3920 else
3921 {
3922 base_offset = ssize_int (0);
3923 init = ssize_int (0);
3c18ea71 3924 base_name = get_name (DR_REF (dr));
48e1416a 3925 }
37545e54 3926
fb85abff 3927 /* Create base_offset */
3928 base_offset = size_binop (PLUS_EXPR,
3929 fold_convert (sizetype, base_offset),
3930 fold_convert (sizetype, init));
fb85abff 3931
3932 if (offset)
3933 {
fb85abff 3934 offset = fold_build2 (MULT_EXPR, sizetype,
3935 fold_convert (sizetype, offset), step);
3936 base_offset = fold_build2 (PLUS_EXPR, sizetype,
3937 base_offset, offset);
fb85abff 3938 }
1ec61bbd 3939 if (byte_offset)
3940 {
3941 byte_offset = fold_convert (sizetype, byte_offset);
3942 base_offset = fold_build2 (PLUS_EXPR, sizetype,
3943 base_offset, byte_offset);
3944 }
fb85abff 3945
3946 /* base + base_offset */
37545e54 3947 if (loop_vinfo)
2cc66f2a 3948 addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
37545e54 3949 else
3950 {
182cf5a9 3951 addr_base = build1 (ADDR_EXPR,
3952 build_pointer_type (TREE_TYPE (DR_REF (dr))),
3953 unshare_expr (DR_REF (dr)));
37545e54 3954 }
48e1416a 3955
fb85abff 3956 vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
90d4c4af 3957 dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
8ee959f8 3958 addr_base = force_gimple_operand (addr_base, &seq, true, dest);
fb85abff 3959 gimple_seq_add_seq (new_stmt_list, seq);
3960
f544b9a4 3961 if (DR_PTR_INFO (dr)
8ee959f8 3962 && TREE_CODE (addr_base) == SSA_NAME
3963 && !SSA_NAME_PTR_INFO (addr_base))
1259ab70 3964 {
4a2edd22 3965 vect_duplicate_ssa_name_ptr_info (addr_base, dr, stmt_info);
3966 if (offset || byte_offset)
90d4c4af 3967 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
1259ab70 3968 }
f544b9a4 3969
6d8fb6cf 3970 if (dump_enabled_p ())
fb85abff 3971 {
7bd765d4 3972 dump_printf_loc (MSG_NOTE, vect_location, "created ");
90d4c4af 3973 dump_generic_expr (MSG_NOTE, TDF_SLIM, addr_base);
78bb46f5 3974 dump_printf (MSG_NOTE, "\n");
fb85abff 3975 }
f083cd24 3976
90d4c4af 3977 return addr_base;
fb85abff 3978}
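/* Illustrative sketch (added for exposition; not part of the original
   sources): for a loop access to short a[] with DR_OFFSET = 0,
   DR_INIT = init and an OFFSET given in elements, the address built
   above is essentially

     &a + (init + (sizetype) OFFSET * 2)     [2 == element size, POINTER_PLUS_EXPR]

   gimplified into a pointer SSA_NAME named after "vectp_a"; a
   BYTE_OFFSET, when supplied, is added to the same sum without
   scaling.  */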
3979
3980
3981/* Function vect_create_data_ref_ptr.
3982
bd5ba09f 3983 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
3984 location accessed in the loop by STMT, along with the def-use update
3985 chain to appropriately advance the pointer through the loop iterations.
3986 Also set aliasing information for the pointer. This pointer is used by
3987 the callers to this function to create a memory reference expression for
3988 vector load/store access.
fb85abff 3989
3990 Input:
3991 1. STMT: a stmt that references memory. Expected to be of the form
3992 GIMPLE_ASSIGN <name, data-ref> or
3993 GIMPLE_ASSIGN <data-ref, name>.
bd5ba09f 3994 2. AGGR_TYPE: the type of the reference, which should be either a vector
3995 or an array.
3996 3. AT_LOOP: the loop where the vector memref is to be created.
3997 4. OFFSET (optional): an offset to be added to the initial address accessed
fb85abff 3998 by the data-ref in STMT.
bd5ba09f 3999 5. BSI: location where the new stmts are to be placed if there is no loop
4000 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
fb85abff 4001 pointing to the initial address.
1ec61bbd 4002 7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
4003 to the initial address accessed by the data-ref in STMT. This is
4004 similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
4005 in bytes.
fb85abff 4006
4007 Output:
4008 1. Declare a new ptr to vector_type, and have it point to the base of the
 4009      data reference (initial address accessed by the data reference).
4010 For example, for vector of type V8HI, the following code is generated:
4011
bd5ba09f 4012 v8hi *ap;
4013 ap = (v8hi *)initial_address;
fb85abff 4014
4015 if OFFSET is not supplied:
4016 initial_address = &a[init];
4017 if OFFSET is supplied:
4018 initial_address = &a[init + OFFSET];
1ec61bbd 4019 if BYTE_OFFSET is supplied:
4020 initial_address = &a[init] + BYTE_OFFSET;
fb85abff 4021
4022 Return the initial_address in INITIAL_ADDRESS.
4023
4024 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
48e1416a 4025 update the pointer in each iteration of the loop.
fb85abff 4026
4027 Return the increment stmt that updates the pointer in PTR_INCR.
4028
48e1416a 4029 3. Set INV_P to true if the access pattern of the data reference in the
282bf14c 4030 vectorized loop is invariant. Set it to false otherwise.
fb85abff 4031
4032 4. Return the pointer. */
4033
4034tree
bd5ba09f 4035vect_create_data_ref_ptr (gimple stmt, tree aggr_type, struct loop *at_loop,
4036 tree offset, tree *initial_address,
4037 gimple_stmt_iterator *gsi, gimple *ptr_incr,
1ec61bbd 4038 bool only_init, bool *inv_p, tree byte_offset)
fb85abff 4039{
3c18ea71 4040 const char *base_name;
fb85abff 4041 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4042 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
37545e54 4043 struct loop *loop = NULL;
4044 bool nested_in_vect_loop = false;
4045 struct loop *containing_loop = NULL;
bd5ba09f 4046 tree aggr_ptr_type;
4047 tree aggr_ptr;
fb85abff 4048 tree new_temp;
fb85abff 4049 gimple_seq new_stmt_list = NULL;
37545e54 4050 edge pe = NULL;
fb85abff 4051 basic_block new_bb;
bd5ba09f 4052 tree aggr_ptr_init;
fb85abff 4053 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
bd5ba09f 4054 tree aptr;
fb85abff 4055 gimple_stmt_iterator incr_gsi;
4056 bool insert_after;
4057 tree indx_before_incr, indx_after_incr;
4058 gimple incr;
4059 tree step;
37545e54 4060 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
48e1416a 4061
bd5ba09f 4062 gcc_assert (TREE_CODE (aggr_type) == ARRAY_TYPE
4063 || TREE_CODE (aggr_type) == VECTOR_TYPE);
4064
37545e54 4065 if (loop_vinfo)
4066 {
4067 loop = LOOP_VINFO_LOOP (loop_vinfo);
4068 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4069 containing_loop = (gimple_bb (stmt))->loop_father;
4070 pe = loop_preheader_edge (loop);
4071 }
4072 else
4073 {
4074 gcc_assert (bb_vinfo);
4075 only_init = true;
4076 *ptr_incr = NULL;
4077 }
48e1416a 4078
fb85abff 4079 /* Check the step (evolution) of the load in LOOP, and record
4080 whether it's invariant. */
4081 if (nested_in_vect_loop)
4082 step = STMT_VINFO_DR_STEP (stmt_info);
4083 else
4084 step = DR_STEP (STMT_VINFO_DATA_REF (stmt_info));
48e1416a 4085
8bbe6b75 4086 if (integer_zerop (step))
fb85abff 4087 *inv_p = true;
4088 else
4089 *inv_p = false;
4090
4091 /* Create an expression for the first address accessed by this load
48e1416a 4092 in LOOP. */
3c18ea71 4093 base_name = get_name (DR_BASE_ADDRESS (dr));
fb85abff 4094
6d8fb6cf 4095 if (dump_enabled_p ())
fb85abff 4096 {
3c18ea71 4097 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
7bd765d4 4098 dump_printf_loc (MSG_NOTE, vect_location,
4099 "create %s-pointer variable to type: ",
f3d35d4d 4100 get_tree_code_name (TREE_CODE (aggr_type)));
7bd765d4 4101 dump_generic_expr (MSG_NOTE, TDF_SLIM, aggr_type);
3c18ea71 4102 if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
7bd765d4 4103 dump_printf (MSG_NOTE, " vectorizing an array ref: ");
19bacd59 4104 else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4105 dump_printf (MSG_NOTE, " vectorizing a vector ref: ");
3c18ea71 4106 else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
7bd765d4 4107 dump_printf (MSG_NOTE, " vectorizing a record based array ref: ");
3c18ea71 4108 else
7bd765d4 4109 dump_printf (MSG_NOTE, " vectorizing a pointer ref: ");
3c18ea71 4110 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_BASE_OBJECT (dr));
78bb46f5 4111 dump_printf (MSG_NOTE, "\n");
fb85abff 4112 }
4113
90d4c4af 4114 /* (1) Create the new aggregate-pointer variable.
4115 Vector and array types inherit the alias set of their component
bd5ba09f 4116 type by default so we need to use a ref-all pointer if the data
4117 reference does not conflict with the created aggregated data
4118 reference because it is not addressable. */
90d4c4af 4119 bool need_ref_all = false;
4120 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
a34701c9 4121 get_alias_set (DR_REF (dr))))
90d4c4af 4122 need_ref_all = true;
a34701c9 4123 /* Likewise for any of the data references in the stmt group. */
21009880 4124 else if (STMT_VINFO_GROUP_SIZE (stmt_info) > 1)
fb85abff 4125 {
21009880 4126 gimple orig_stmt = STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info);
dd277d48 4127 do
4128 {
90d4c4af 4129 stmt_vec_info sinfo = vinfo_for_stmt (orig_stmt);
4130 struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
4131 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4132 get_alias_set (DR_REF (sdr))))
dd277d48 4133 {
90d4c4af 4134 need_ref_all = true;
dd277d48 4135 break;
4136 }
90d4c4af 4137 orig_stmt = STMT_VINFO_GROUP_NEXT_ELEMENT (sinfo);
dd277d48 4138 }
4139 while (orig_stmt);
fb85abff 4140 }
90d4c4af 4141 aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
4142 need_ref_all);
4143 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
4144
fb85abff 4145
282bf14c 4146 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4147 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4148 def-use update cycles for the pointer: one relative to the outer-loop
4149 (LOOP), which is what steps (3) and (4) below do. The other is relative
4150 to the inner-loop (which is the inner-most loop containing the dataref),
 4151     and this is done by step (5) below.
fb85abff 4152
282bf14c 4153 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4154 inner-most loop, and so steps (3),(4) work the same, and step (5) is
4155 redundant. Steps (3),(4) create the following:
fb85abff 4156
4157 vp0 = &base_addr;
4158 LOOP: vp1 = phi(vp0,vp2)
48e1416a 4159 ...
fb85abff 4160 ...
4161 vp2 = vp1 + step
4162 goto LOOP
48e1416a 4163
282bf14c 4164 If there is an inner-loop nested in loop, then step (5) will also be
4165 applied, and an additional update in the inner-loop will be created:
fb85abff 4166
4167 vp0 = &base_addr;
4168 LOOP: vp1 = phi(vp0,vp2)
4169 ...
4170 inner: vp3 = phi(vp1,vp4)
4171 vp4 = vp3 + inner_step
4172 if () goto inner
4173 ...
4174 vp2 = vp1 + step
4175 if () goto LOOP */
4176
bd5ba09f 4177 /* (2) Calculate the initial address of the aggregate-pointer, and set
4178 the aggregate-pointer to point to it before the loop. */
fb85abff 4179
1ec61bbd 4180 /* Create: (&(base[init_val+offset]+byte_offset) in the loop preheader. */
fb85abff 4181
4182 new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
1ec61bbd 4183 offset, loop, byte_offset);
fb85abff 4184 if (new_stmt_list)
4185 {
37545e54 4186 if (pe)
4187 {
4188 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
4189 gcc_assert (!new_bb);
4190 }
4191 else
bee862b6 4192 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
fb85abff 4193 }
4194
4195 *initial_address = new_temp;
8ee959f8 4196 aggr_ptr_init = new_temp;
fb85abff 4197
bd5ba09f 4198 /* (3) Handle the updating of the aggregate-pointer inside the loop.
282bf14c 4199 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4200 inner-loop nested in LOOP (during outer-loop vectorization). */
fb85abff 4201
37545e54 4202 /* No update in loop is required. */
48e1416a 4203 if (only_init && (!loop_vinfo || at_loop == loop))
bd5ba09f 4204 aptr = aggr_ptr_init;
fb85abff 4205 else
4206 {
bd5ba09f 4207 /* The step of the aggregate pointer is the type size. */
8bbe6b75 4208 tree iv_step = TYPE_SIZE_UNIT (aggr_type);
48e1416a 4209 /* One exception to the above is when the scalar step of the load in
fb85abff 4210 LOOP is zero. In this case the step here is also zero. */
4211 if (*inv_p)
8bbe6b75 4212 iv_step = size_zero_node;
4213 else if (tree_int_cst_sgn (step) == -1)
4214 iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
fb85abff 4215
4216 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4217
bd5ba09f 4218 create_iv (aggr_ptr_init,
8bbe6b75 4219 fold_convert (aggr_ptr_type, iv_step),
bd5ba09f 4220 aggr_ptr, loop, &incr_gsi, insert_after,
fb85abff 4221 &indx_before_incr, &indx_after_incr);
4222 incr = gsi_stmt (incr_gsi);
37545e54 4223 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo, NULL));
fb85abff 4224
4225 /* Copy the points-to information if it exists. */
4226 if (DR_PTR_INFO (dr))
4227 {
4a2edd22 4228 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr, stmt_info);
4229 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr, stmt_info);
fb85abff 4230 }
fb85abff 4231 if (ptr_incr)
4232 *ptr_incr = incr;
4233
bd5ba09f 4234 aptr = indx_before_incr;
fb85abff 4235 }
4236
4237 if (!nested_in_vect_loop || only_init)
bd5ba09f 4238 return aptr;
fb85abff 4239
4240
bd5ba09f 4241 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
282bf14c 4242     nested in LOOP, if it exists.  */
fb85abff 4243
4244 gcc_assert (nested_in_vect_loop);
4245 if (!only_init)
4246 {
4247 standard_iv_increment_position (containing_loop, &incr_gsi,
4248 &insert_after);
bd5ba09f 4249 create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
fb85abff 4250 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
4251 &indx_after_incr);
4252 incr = gsi_stmt (incr_gsi);
37545e54 4253 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo, NULL));
fb85abff 4254
4255 /* Copy the points-to information if it exists. */
4256 if (DR_PTR_INFO (dr))
4257 {
4a2edd22 4258 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr, stmt_info);
4259 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr, stmt_info);
fb85abff 4260 }
fb85abff 4261 if (ptr_incr)
4262 *ptr_incr = incr;
4263
48e1416a 4264 return indx_before_incr;
fb85abff 4265 }
4266 else
4267 gcc_unreachable ();
4268}
4269
4270
4271/* Function bump_vector_ptr
4272
4273 Increment a pointer (to a vector type) by vector-size. If requested,
48e1416a 4274 i.e. if PTR-INCR is given, then also connect the new increment stmt
fb85abff 4275 to the existing def-use update-chain of the pointer, by modifying
4276 the PTR_INCR as illustrated below:
4277
4278 The pointer def-use update-chain before this function:
4279 DATAREF_PTR = phi (p_0, p_2)
4280 ....
48e1416a 4281 PTR_INCR: p_2 = DATAREF_PTR + step
fb85abff 4282
4283 The pointer def-use update-chain after this function:
4284 DATAREF_PTR = phi (p_0, p_2)
4285 ....
4286 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4287 ....
4288 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
4289
4290 Input:
48e1416a 4291 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
fb85abff 4292 in the loop.
48e1416a 4293 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
fb85abff 4294 the loop. The increment amount across iterations is expected
48e1416a 4295 to be vector_size.
fb85abff 4296 BSI - location where the new update stmt is to be placed.
4297 STMT - the original scalar memory-access stmt that is being vectorized.
4298 BUMP - optional. The offset by which to bump the pointer. If not given,
4299 the offset is assumed to be vector_size.
4300
4301 Output: Return NEW_DATAREF_PTR as illustrated above.
48e1416a 4302
fb85abff 4303*/
4304
4305tree
4306bump_vector_ptr (tree dataref_ptr, gimple ptr_incr, gimple_stmt_iterator *gsi,
4307 gimple stmt, tree bump)
4308{
4309 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4310 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4311 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
fb85abff 4312 tree update = TYPE_SIZE_UNIT (vectype);
1a91d914 4313 gassign *incr_stmt;
fb85abff 4314 ssa_op_iter iter;
4315 use_operand_p use_p;
4316 tree new_dataref_ptr;
4317
4318 if (bump)
4319 update = bump;
48e1416a 4320
8ee959f8 4321 if (TREE_CODE (dataref_ptr) == SSA_NAME)
4322 new_dataref_ptr = copy_ssa_name (dataref_ptr);
4323 else
4324 new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
e9cf809e 4325 incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
4326 dataref_ptr, update);
fb85abff 4327 vect_finish_stmt_generation (stmt, incr_stmt, gsi);
4328
4329 /* Copy the points-to information if it exists. */
4330 if (DR_PTR_INFO (dr))
1259ab70 4331 {
4332 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
ceea063b 4333 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
1259ab70 4334 }
fb85abff 4335
4336 if (!ptr_incr)
4337 return new_dataref_ptr;
4338
4339 /* Update the vector-pointer's cross-iteration increment. */
4340 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
4341 {
4342 tree use = USE_FROM_PTR (use_p);
4343
4344 if (use == dataref_ptr)
4345 SET_USE (use_p, new_dataref_ptr);
4346 else
4347 gcc_assert (tree_int_cst_compare (use, update) == 0);
4348 }
4349
4350 return new_dataref_ptr;
4351}
4352
4353
4354/* Function vect_create_destination_var.
4355
4356 Create a new temporary of type VECTYPE. */
4357
4358tree
4359vect_create_destination_var (tree scalar_dest, tree vectype)
4360{
4361 tree vec_dest;
0bf5f81b 4362 const char *name;
4363 char *new_name;
fb85abff 4364 tree type;
4365 enum vect_var_kind kind;
4366
4367 kind = vectype ? vect_simple_var : vect_scalar_var;
4368 type = vectype ? vectype : TREE_TYPE (scalar_dest);
4369
4370 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
4371
0bf5f81b 4372 name = get_name (scalar_dest);
4373 if (name)
b33b6e58 4374 new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
0bf5f81b 4375 else
b33b6e58 4376 new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
fb85abff 4377 vec_dest = vect_get_new_vect_var (type, kind, new_name);
0bf5f81b 4378 free (new_name);
fb85abff 4379
4380 return vec_dest;
4381}
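/* Example of the naming above (added for exposition): for a scalar
   destination x_7 and a vector type, the new temporary is named
   "vect_x_7" ("x" from get_name, "_7" from SSA_NAME_VERSION, the
   "vect" prefix from vect_get_new_vect_var).  */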
4382
ee612634 4383/* Function vect_grouped_store_supported.
fb85abff 4384
42f6a6e8 4385 Returns TRUE if interleave high and interleave low permutations
4386 are supported, and FALSE otherwise. */
fb85abff 4387
4388bool
ee612634 4389vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
fb85abff 4390{
3754d046 4391 machine_mode mode = TYPE_MODE (vectype);
48e1416a 4392
d53391a8 4393 /* vect_permute_store_chain requires the group size to be equal to 3 or
4394 be a power of two. */
4395 if (count != 3 && exact_log2 (count) == -1)
481fc474 4396 {
6d8fb6cf 4397 if (dump_enabled_p ())
7bd765d4 4398 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
d53391a8 4399 "the size of the group of accesses"
4400 " is not a power of 2 or not eqaul to 3\n");
481fc474 4401 return false;
4402 }
4403
42f6a6e8 4404 /* Check that the permutation is supported. */
8bec2124 4405 if (VECTOR_MODE_P (mode))
4406 {
4407 unsigned int i, nelt = GET_MODE_NUNITS (mode);
4408 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
d53391a8 4409
4410 if (count == 3)
8bec2124 4411 {
d53391a8 4412 unsigned int j0 = 0, j1 = 0, j2 = 0;
4413 unsigned int i, j;
4414
4415 for (j = 0; j < 3; j++)
4416 {
4417 int nelt0 = ((3 - j) * nelt) % 3;
4418 int nelt1 = ((3 - j) * nelt + 1) % 3;
4419 int nelt2 = ((3 - j) * nelt + 2) % 3;
4420 for (i = 0; i < nelt; i++)
4421 {
4422 if (3 * i + nelt0 < nelt)
4423 sel[3 * i + nelt0] = j0++;
4424 if (3 * i + nelt1 < nelt)
4425 sel[3 * i + nelt1] = nelt + j1++;
4426 if (3 * i + nelt2 < nelt)
4427 sel[3 * i + nelt2] = 0;
4428 }
4429 if (!can_vec_perm_p (mode, false, sel))
4430 {
4431 if (dump_enabled_p ())
4432 dump_printf (MSG_MISSED_OPTIMIZATION,
4433 "permutaion op not supported by target.\n");
4434 return false;
4435 }
4436
4437 for (i = 0; i < nelt; i++)
4438 {
4439 if (3 * i + nelt0 < nelt)
4440 sel[3 * i + nelt0] = 3 * i + nelt0;
4441 if (3 * i + nelt1 < nelt)
4442 sel[3 * i + nelt1] = 3 * i + nelt1;
4443 if (3 * i + nelt2 < nelt)
4444 sel[3 * i + nelt2] = nelt + j2++;
4445 }
4446 if (!can_vec_perm_p (mode, false, sel))
4447 {
4448 if (dump_enabled_p ())
4449 dump_printf (MSG_MISSED_OPTIMIZATION,
4450 "permutaion op not supported by target.\n");
4451 return false;
4452 }
4453 }
4454 return true;
8bec2124 4455 }
d53391a8 4456 else
8bec2124 4457 {
d53391a8 4458 /* If length is not equal to 3 then only power of 2 is supported. */
4459 gcc_assert (exact_log2 (count) != -1);
4460
4461 for (i = 0; i < nelt / 2; i++)
4462 {
4463 sel[i * 2] = i;
4464 sel[i * 2 + 1] = i + nelt;
4465 }
4466 if (can_vec_perm_p (mode, false, sel))
4467 {
4468 for (i = 0; i < nelt; i++)
4469 sel[i] += nelt / 2;
4470 if (can_vec_perm_p (mode, false, sel))
4471 return true;
4472 }
8bec2124 4473 }
4474 }
fb85abff 4475
6d8fb6cf 4476 if (dump_enabled_p ())
7bd765d4 4477 dump_printf (MSG_MISSED_OPTIMIZATION,
d53391a8 4478 "permutaion op not supported by target.\n");
6620d7d7 4479 return false;
fb85abff 4480}
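/* Worked example for the power-of-two case above (added for exposition;
   not part of the original sources): for a V4SI vector mode (nelt == 4)
   the first selector built is {0, 4, 1, 5}, the interleave-high mask,
   and after adding nelt/2 to each element it becomes {2, 6, 3, 7}, the
   interleave-low mask; both must be accepted by can_vec_perm_p for the
   grouped store to be supported.  */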
4481
4482
94b7b4dd 4483/* Return TRUE if vec_store_lanes is available for COUNT vectors of
4484 type VECTYPE. */
4485
4486bool
4487vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
4488{
4489 return vect_lanes_optab_supported_p ("vec_store_lanes",
4490 vec_store_lanes_optab,
4491 vectype, count);
4492}
4493
4494
fb85abff 4495/* Function vect_permute_store_chain.
4496
4497 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
d53391a8 4498 a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
4499 the data correctly for the stores. Return the final references for stores
4500 in RESULT_CHAIN.
fb85abff 4501
4502 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
282bf14c 4503 The input is 4 vectors each containing 8 elements. We assign a number to
4504 each element, the input sequence is:
fb85abff 4505
4506 1st vec: 0 1 2 3 4 5 6 7
4507 2nd vec: 8 9 10 11 12 13 14 15
48e1416a 4508 3rd vec: 16 17 18 19 20 21 22 23
fb85abff 4509 4th vec: 24 25 26 27 28 29 30 31
4510
4511 The output sequence should be:
4512
4513 1st vec: 0 8 16 24 1 9 17 25
4514 2nd vec: 2 10 18 26 3 11 19 27
 4515   3rd vec:  4 12 20 28  5 13 21 29
4516 4th vec: 6 14 22 30 7 15 23 31
4517
4518 i.e., we interleave the contents of the four vectors in their order.
4519
282bf14c 4520 We use interleave_high/low instructions to create such output. The input of
fb85abff 4521 each interleave_high/low operation is two vectors:
48e1416a 4522   1st vec      2nd vec
 4523   0 1 2 3      4 5 6 7
4524 the even elements of the result vector are obtained left-to-right from the
282bf14c 4525 high/low elements of the first vector. The odd elements of the result are
fb85abff 4526 obtained left-to-right from the high/low elements of the second vector.
4527 The output of interleave_high will be: 0 4 1 5
4528 and of interleave_low: 2 6 3 7
4529
48e1416a 4530
282bf14c 4531 The permutation is done in log LENGTH stages. In each stage interleave_high
48e1416a 4532 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
4533 where the first argument is taken from the first half of DR_CHAIN and the
 4534   second argument from its second half.
4535 In our example,
fb85abff 4536
4537 I1: interleave_high (1st vec, 3rd vec)
4538 I2: interleave_low (1st vec, 3rd vec)
4539 I3: interleave_high (2nd vec, 4th vec)
4540 I4: interleave_low (2nd vec, 4th vec)
4541
4542 The output for the first stage is:
4543
4544 I1: 0 16 1 17 2 18 3 19
4545 I2: 4 20 5 21 6 22 7 23
4546 I3: 8 24 9 25 10 26 11 27
4547 I4: 12 28 13 29 14 30 15 31
4548
4549 The output of the second stage, i.e. the final result is:
4550
4551 I1: 0 8 16 24 1 9 17 25
4552 I2: 2 10 18 26 3 11 19 27
 4553   I3:  4 12 20 28  5 13 21 29
4554 I4: 6 14 22 30 7 15 23 31. */
48e1416a 4555
481fc474 4556void
f1f41a6c 4557vect_permute_store_chain (vec<tree> dr_chain,
48e1416a 4558 unsigned int length,
fb85abff 4559 gimple stmt,
4560 gimple_stmt_iterator *gsi,
f1f41a6c 4561 vec<tree> *result_chain)
fb85abff 4562{
03d37e4e 4563 tree vect1, vect2, high, low;
fb85abff 4564 gimple perm_stmt;
4565 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
8bec2124 4566 tree perm_mask_low, perm_mask_high;
d53391a8 4567 tree data_ref;
4568 tree perm3_mask_low, perm3_mask_high;
4569 unsigned int i, n, log_length = exact_log2 (length);
42f6a6e8 4570 unsigned int j, nelt = TYPE_VECTOR_SUBPARTS (vectype);
8bec2124 4571 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
48e1416a 4572
f40aaf2d 4573 result_chain->quick_grow (length);
4574 memcpy (result_chain->address (), dr_chain.address (),
4575 length * sizeof (tree));
fb85abff 4576
d53391a8 4577 if (length == 3)
8bec2124 4578 {
d53391a8 4579 unsigned int j0 = 0, j1 = 0, j2 = 0;
42f6a6e8 4580
d53391a8 4581 for (j = 0; j < 3; j++)
4582 {
4583 int nelt0 = ((3 - j) * nelt) % 3;
4584 int nelt1 = ((3 - j) * nelt + 1) % 3;
4585 int nelt2 = ((3 - j) * nelt + 2) % 3;
8bec2124 4586
d53391a8 4587 for (i = 0; i < nelt; i++)
4588 {
4589 if (3 * i + nelt0 < nelt)
4590 sel[3 * i + nelt0] = j0++;
4591 if (3 * i + nelt1 < nelt)
4592 sel[3 * i + nelt1] = nelt + j1++;
4593 if (3 * i + nelt2 < nelt)
4594 sel[3 * i + nelt2] = 0;
4595 }
0761848d 4596 perm3_mask_low = vect_gen_perm_mask_checked (vectype, sel);
d53391a8 4597
4598 for (i = 0; i < nelt; i++)
4599 {
4600 if (3 * i + nelt0 < nelt)
4601 sel[3 * i + nelt0] = 3 * i + nelt0;
4602 if (3 * i + nelt1 < nelt)
4603 sel[3 * i + nelt1] = 3 * i + nelt1;
4604 if (3 * i + nelt2 < nelt)
4605 sel[3 * i + nelt2] = nelt + j2++;
4606 }
0761848d 4607 perm3_mask_high = vect_gen_perm_mask_checked (vectype, sel);
d53391a8 4608
4609 vect1 = dr_chain[0];
4610 vect2 = dr_chain[1];
fb85abff 4611
4612 /* Create interleaving stmt:
d53391a8 4613 low = VEC_PERM_EXPR <vect1, vect2,
4614 {j, nelt, *, j + 1, nelt + j + 1, *,
4615 j + 2, nelt + j + 2, *, ...}> */
4616 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
e9cf809e 4617 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
4618 vect2, perm3_mask_low);
fb85abff 4619 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
fb85abff 4620
d53391a8 4621 vect1 = data_ref;
4622 vect2 = dr_chain[2];
fb85abff 4623 /* Create interleaving stmt:
d53391a8 4624 low = VEC_PERM_EXPR <vect1, vect2,
4625 {0, 1, nelt + j, 3, 4, nelt + j + 1,
4626 6, 7, nelt + j + 2, ...}> */
4627 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
e9cf809e 4628 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
4629 vect2, perm3_mask_high);
fb85abff 4630 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
d53391a8 4631 (*result_chain)[j] = data_ref;
fb85abff 4632 }
d53391a8 4633 }
4634 else
4635 {
4636 /* If length is not equal to 3 then only power of 2 is supported. */
4637 gcc_assert (exact_log2 (length) != -1);
4638
4639 for (i = 0, n = nelt / 2; i < n; i++)
4640 {
4641 sel[i * 2] = i;
4642 sel[i * 2 + 1] = i + nelt;
4643 }
0761848d 4644 perm_mask_high = vect_gen_perm_mask_checked (vectype, sel);
d53391a8 4645
4646 for (i = 0; i < nelt; i++)
4647 sel[i] += nelt / 2;
0761848d 4648 perm_mask_low = vect_gen_perm_mask_checked (vectype, sel);
d53391a8 4649
4650 for (i = 0, n = log_length; i < n; i++)
4651 {
4652 for (j = 0; j < length/2; j++)
4653 {
4654 vect1 = dr_chain[j];
4655 vect2 = dr_chain[j+length/2];
4656
4657 /* Create interleaving stmt:
4658 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
4659 ...}> */
4660 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
e9cf809e 4661 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
4662 vect2, perm_mask_high);
d53391a8 4663 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4664 (*result_chain)[2*j] = high;
4665
4666 /* Create interleaving stmt:
4667 low = VEC_PERM_EXPR <vect1, vect2,
4668 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
4669 ...}> */
4670 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
e9cf809e 4671 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
4672 vect2, perm_mask_low);
d53391a8 4673 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4674 (*result_chain)[2*j+1] = low;
4675 }
4676 memcpy (dr_chain.address (), result_chain->address (),
4677 length * sizeof (tree));
4678 }
fb85abff 4679 }
fb85abff 4680}
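/* Worked example for the LENGTH == 3 path above (added for exposition;
   not part of the original sources): with three V4SI input vectors
   1st: 0 1 2 3, 2nd: 4 5 6 7, 3rd: 8 9 10 11, the two VEC_PERM_EXPRs
   generated per output vector produce
   1st: 0 4 8 1, 2nd: 5 9 2 6, 3rd: 10 3 7 11,
   i.e. the elements are interleaved three-way (a0 b0 c0 a1 b1 c1 ...)
   as required for a grouped store of three interleaved accesses.  */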
4681
4682/* Function vect_setup_realignment
48e1416a 4683
fb85abff 4684 This function is called when vectorizing an unaligned load using
4685 the dr_explicit_realign[_optimized] scheme.
4686 This function generates the following code at the loop prolog:
4687
4688 p = initial_addr;
4689 x msq_init = *(floor(p)); # prolog load
48e1416a 4690 realignment_token = call target_builtin;
fb85abff 4691 loop:
4692 x msq = phi (msq_init, ---)
4693
48e1416a 4694 The stmts marked with x are generated only for the case of
fb85abff 4695 dr_explicit_realign_optimized.
4696
48e1416a 4697 The code above sets up a new (vector) pointer, pointing to the first
fb85abff 4698 location accessed by STMT, and a "floor-aligned" load using that pointer.
4699 It also generates code to compute the "realignment-token" (if the relevant
4700 target hook was defined), and creates a phi-node at the loop-header bb
4701 whose arguments are the result of the prolog-load (created by this
4702 function) and the result of a load that takes place in the loop (to be
4703 created by the caller to this function).
4704
4705 For the case of dr_explicit_realign_optimized:
48e1416a 4706 The caller to this function uses the phi-result (msq) to create the
fb85abff 4707 realignment code inside the loop, and sets up the missing phi argument,
4708 as follows:
48e1416a 4709 loop:
fb85abff 4710 msq = phi (msq_init, lsq)
4711 lsq = *(floor(p')); # load in loop
4712 result = realign_load (msq, lsq, realignment_token);
4713
4714 For the case of dr_explicit_realign:
4715 loop:
4716 msq = *(floor(p)); # load in loop
4717 p' = p + (VS-1);
4718 lsq = *(floor(p')); # load in loop
4719 result = realign_load (msq, lsq, realignment_token);
4720
4721 Input:
4722 STMT - (scalar) load stmt to be vectorized. This load accesses
4723 a memory location that may be unaligned.
4724 BSI - place where new code is to be inserted.
4725 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
48e1416a 4726 is used.
4727
fb85abff 4728 Output:
4729 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
4730 target hook, if defined.
4731 Return value - the result of the loop-header phi node. */
4732
4733tree
4734vect_setup_realignment (gimple stmt, gimple_stmt_iterator *gsi,
4735 tree *realignment_token,
4736 enum dr_alignment_support alignment_support_scheme,
4737 tree init_addr,
4738 struct loop **at_loop)
4739{
4740 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4741 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4742 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2cb9ef39 4743 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
ad074595 4744 struct loop *loop = NULL;
4745 edge pe = NULL;
fb85abff 4746 tree scalar_dest = gimple_assign_lhs (stmt);
4747 tree vec_dest;
4748 gimple inc;
4749 tree ptr;
4750 tree data_ref;
fb85abff 4751 basic_block new_bb;
4752 tree msq_init = NULL_TREE;
4753 tree new_temp;
1a91d914 4754 gphi *phi_stmt;
fb85abff 4755 tree msq = NULL_TREE;
4756 gimple_seq stmts = NULL;
4757 bool inv_p;
4758 bool compute_in_loop = false;
ad074595 4759 bool nested_in_vect_loop = false;
fb85abff 4760 struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
ad074595 4761 struct loop *loop_for_initial_load = NULL;
4762
4763 if (loop_vinfo)
4764 {
4765 loop = LOOP_VINFO_LOOP (loop_vinfo);
4766 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4767 }
fb85abff 4768
4769 gcc_assert (alignment_support_scheme == dr_explicit_realign
4770 || alignment_support_scheme == dr_explicit_realign_optimized);
4771
4772 /* We need to generate three things:
4773 1. the misalignment computation
4774 2. the extra vector load (for the optimized realignment scheme).
4775 3. the phi node for the two vectors from which the realignment is
282bf14c 4776 done (for the optimized realignment scheme). */
fb85abff 4777
4778 /* 1. Determine where to generate the misalignment computation.
4779
4780 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
4781 calculation will be generated by this function, outside the loop (in the
 4782 preheader). Otherwise, INIT_ADDR has already been computed for us by the
4783 caller, inside the loop.
4784
4785 Background: If the misalignment remains fixed throughout the iterations of
4786 the loop, then both realignment schemes are applicable, and also the
4787 misalignment computation can be done outside LOOP. This is because we are
4788 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
4789 are a multiple of VS (the Vector Size), and therefore the misalignment in
4790 different vectorized LOOP iterations is always the same.
4791 The problem arises only if the memory access is in an inner-loop nested
4792 inside LOOP, which is now being vectorized using outer-loop vectorization.
4793 This is the only case when the misalignment of the memory access may not
4794 remain fixed throughout the iterations of the inner-loop (as explained in
4795 detail in vect_supportable_dr_alignment). In this case, not only is the
4796 optimized realignment scheme not applicable, but also the misalignment
4797 computation (and generation of the realignment token that is passed to
4798 REALIGN_LOAD) have to be done inside the loop.
4799
4800 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
4801 or not, which in turn determines if the misalignment is computed inside
4802 the inner-loop, or outside LOOP. */
4803
ad074595 4804 if (init_addr != NULL_TREE || !loop_vinfo)
fb85abff 4805 {
4806 compute_in_loop = true;
4807 gcc_assert (alignment_support_scheme == dr_explicit_realign);
4808 }
4809
4810
4811 /* 2. Determine where to generate the extra vector load.
4812
4813 For the optimized realignment scheme, instead of generating two vector
4814 loads in each iteration, we generate a single extra vector load in the
4815 preheader of the loop, and in each iteration reuse the result of the
4816 vector load from the previous iteration. In case the memory access is in
4817 an inner-loop nested inside LOOP, which is now being vectorized using
4818 outer-loop vectorization, we need to determine whether this initial vector
4819 load should be generated at the preheader of the inner-loop, or can be
4820 generated at the preheader of LOOP. If the memory access has no evolution
4821 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
4822 to be generated inside LOOP (in the preheader of the inner-loop). */
4823
4824 if (nested_in_vect_loop)
4825 {
4826 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
4827 bool invariant_in_outerloop =
4828 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
4829 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
4830 }
4831 else
4832 loop_for_initial_load = loop;
4833 if (at_loop)
4834 *at_loop = loop_for_initial_load;
4835
ad074595 4836 if (loop_for_initial_load)
4837 pe = loop_preheader_edge (loop_for_initial_load);
4838
fb85abff 4839 /* 3. For the case of the optimized realignment, create the first vector
4840 load at the loop preheader. */
4841
4842 if (alignment_support_scheme == dr_explicit_realign_optimized)
4843 {
4844 /* Create msq_init = *(floor(p1)) in the loop preheader */
1a91d914 4845 gassign *new_stmt;
fb85abff 4846
4847 gcc_assert (!compute_in_loop);
fb85abff 4848 vec_dest = vect_create_destination_var (scalar_dest, vectype);
bd5ba09f 4849 ptr = vect_create_data_ref_ptr (stmt, vectype, loop_for_initial_load,
4850 NULL_TREE, &init_addr, NULL, &inc,
4851 true, &inv_p);
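      /* Compute floor (p): mask off the low address bits so that the prolog
	 load reads from the vector-aligned address containing the first
	 accessed element.  */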
f9e245b2 4852 new_temp = copy_ssa_name (ptr);
e9cf809e 4853 new_stmt = gimple_build_assign
4854 (new_temp, BIT_AND_EXPR, ptr,
86638c2e 4855 build_int_cst (TREE_TYPE (ptr),
4856 -(HOST_WIDE_INT)TYPE_ALIGN_UNIT (vectype)));
86638c2e 4857 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4858 gcc_assert (!new_bb);
2cb9ef39 4859 data_ref
4860 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
4861 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
fb85abff 4862 new_stmt = gimple_build_assign (vec_dest, data_ref);
4863 new_temp = make_ssa_name (vec_dest, new_stmt);
4864 gimple_assign_set_lhs (new_stmt, new_temp);
ad074595 4865 if (pe)
4866 {
4867 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4868 gcc_assert (!new_bb);
4869 }
4870 else
4871 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
4872
fb85abff 4873 msq_init = gimple_assign_lhs (new_stmt);
4874 }
4875
4876 /* 4. Create realignment token using a target builtin, if available.
4877 It is done either inside the containing loop, or before LOOP (as
4878 determined above). */
4879
4880 if (targetm.vectorize.builtin_mask_for_load)
4881 {
1a91d914 4882 gcall *new_stmt;
fb85abff 4883 tree builtin_decl;
4884
 4885 /* Compute INIT_ADDR - the initial address accessed by this memref. */
ad074595 4886 if (!init_addr)
fb85abff 4887 {
4888 /* Generate the INIT_ADDR computation outside LOOP. */
4889 init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
4890 NULL_TREE, loop);
ad074595 4891 if (loop)
4892 {
4893 pe = loop_preheader_edge (loop);
4894 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
4895 gcc_assert (!new_bb);
4896 }
4897 else
4898 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
fb85abff 4899 }
4900
4901 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
4902 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
4903 vec_dest =
4904 vect_create_destination_var (scalar_dest,
4905 gimple_call_return_type (new_stmt));
4906 new_temp = make_ssa_name (vec_dest, new_stmt);
4907 gimple_call_set_lhs (new_stmt, new_temp);
4908
4909 if (compute_in_loop)
4910 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
4911 else
4912 {
4913 /* Generate the misalignment computation outside LOOP. */
4914 pe = loop_preheader_edge (loop);
4915 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4916 gcc_assert (!new_bb);
4917 }
4918
4919 *realignment_token = gimple_call_lhs (new_stmt);
4920
4921 /* The result of the CALL_EXPR to this builtin is determined from
4922 the value of the parameter and no global variables are touched
4923 which makes the builtin a "const" function. Requiring the
4924 builtin to have the "const" attribute makes it unnecessary
4925 to call mark_call_clobbered. */
4926 gcc_assert (TREE_READONLY (builtin_decl));
4927 }
4928
4929 if (alignment_support_scheme == dr_explicit_realign)
4930 return msq;
4931
4932 gcc_assert (!compute_in_loop);
4933 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
4934
4935
4936 /* 5. Create msq = phi <msq_init, lsq> in loop */
4937
4938 pe = loop_preheader_edge (containing_loop);
4939 vec_dest = vect_create_destination_var (scalar_dest, vectype);
f9e245b2 4940 msq = make_ssa_name (vec_dest);
fb85abff 4941 phi_stmt = create_phi_node (msq, containing_loop->header);
60d535d2 4942 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
fb85abff 4943
4944 return msq;
4945}
4946
4947
ee612634 4948/* Function vect_grouped_load_supported.
fb85abff 4949
42f6a6e8 4950 Returns TRUE if the permutations needed to reorder COUNT vectors of type
fb85abff 4951 VECTYPE (extract even/odd, or the 3-load shuffle when COUNT == 3) are supported, and FALSE otherwise. */
4952
4953bool
ee612634 4954vect_grouped_load_supported (tree vectype, unsigned HOST_WIDE_INT count)
fb85abff 4955{
3754d046 4956 machine_mode mode = TYPE_MODE (vectype);
fb85abff 4957
1e1bca71 4958 /* vect_permute_load_chain requires the group size to be equal to 3 or
4959 be a power of two. */
4960 if (count != 3 && exact_log2 (count) == -1)
481fc474 4961 {
6d8fb6cf 4962 if (dump_enabled_p ())
7bd765d4 4963 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1e1bca71 4964 "the size of the group of accesses"
4965 " is not a power of 2 or not equal to 3\n");
481fc474 4966 return false;
4967 }
4968
42f6a6e8 4969 /* Check that the permutation is supported. */
4970 if (VECTOR_MODE_P (mode))
4971 {
1e1bca71 4972 unsigned int i, j, nelt = GET_MODE_NUNITS (mode);
42f6a6e8 4973 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
fb85abff 4974
1e1bca71 4975 if (count == 3)
42f6a6e8 4976 {
1e1bca71 4977 unsigned int k;
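	  /* Check the two masks that vect_permute_load_chain uses for a group
	     of three loads: the low-part mask {k, 3 + k, 6 + k, ...} that
	     gathers elements from the first two vectors, and the high-part
	     mask that completes the selection from the third vector.  */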
4978 for (k = 0; k < 3; k++)
4979 {
4980 for (i = 0; i < nelt; i++)
4981 if (3 * i + k < 2 * nelt)
4982 sel[i] = 3 * i + k;
4983 else
4984 sel[i] = 0;
4985 if (!can_vec_perm_p (mode, false, sel))
4986 {
4987 if (dump_enabled_p ())
4988 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4989 "shuffle of 3 loads is not supported by"
4990 " target\n");
5c6f6a61 4991 return false;
1e1bca71 4992 }
4993 for (i = 0, j = 0; i < nelt; i++)
4994 if (3 * i + k < 2 * nelt)
4995 sel[i] = i;
4996 else
4997 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
4998 if (!can_vec_perm_p (mode, false, sel))
4999 {
5000 if (dump_enabled_p ())
5001 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5002 "shuffle of 3 loads is not supported by"
5003 " target\n");
5004 return false;
5005 }
5006 }
5007 return true;
5008 }
5009 else
5010 {
 5011 /* If COUNT is not equal to 3, then only a power of 2 is supported. */
5012 gcc_assert (exact_log2 (count) != -1);
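	  /* Check that the extract-even mask {0, 2, 4, ...} and the
	     extract-odd mask {1, 3, 5, ...} are both supported.  */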
42f6a6e8 5013 for (i = 0; i < nelt; i++)
1e1bca71 5014 sel[i] = i * 2;
42f6a6e8 5015 if (can_vec_perm_p (mode, false, sel))
1e1bca71 5016 {
5017 for (i = 0; i < nelt; i++)
5018 sel[i] = i * 2 + 1;
5019 if (can_vec_perm_p (mode, false, sel))
5020 return true;
5021 }
5022 }
42f6a6e8 5023 }
fb85abff 5024
6d8fb6cf 5025 if (dump_enabled_p ())
7bd765d4 5026 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1e1bca71 5027 "extract even/odd not supported by target\n");
6620d7d7 5028 return false;
fb85abff 5029}
5030
94b7b4dd 5031/* Return TRUE if vec_load_lanes is available for COUNT vectors of
5032 type VECTYPE. */
5033
5034bool
5035vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
5036{
5037 return vect_lanes_optab_supported_p ("vec_load_lanes",
5038 vec_load_lanes_optab,
5039 vectype, count);
5040}
fb85abff 5041
5042/* Function vect_permute_load_chain.
5043
5044 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
1e1bca71 5045 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5046 the input data correctly. Return the final references for loads in
5047 RESULT_CHAIN.
fb85abff 5048
5049 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5050 The input is 4 vectors each containing 8 elements. We assign a number to each
 5051 element; the input sequence is:
5052
5053 1st vec: 0 1 2 3 4 5 6 7
5054 2nd vec: 8 9 10 11 12 13 14 15
48e1416a 5055 3rd vec: 16 17 18 19 20 21 22 23
fb85abff 5056 4th vec: 24 25 26 27 28 29 30 31
5057
5058 The output sequence should be:
5059
5060 1st vec: 0 4 8 12 16 20 24 28
5061 2nd vec: 1 5 9 13 17 21 25 29
48e1416a 5062 3rd vec: 2 6 10 14 18 22 26 30
fb85abff 5063 4th vec: 3 7 11 15 19 23 27 31
5064
5065 i.e., the first output vector should contain the first elements of each
5066 interleaving group, etc.
5067
282bf14c 5068 We use extract_even/odd instructions to create such output. The input of
5069 each extract_even/odd operation is two vectors
48e1416a 5070 1st vec 2nd vec
5071 0 1 2 3 4 5 6 7
fb85abff 5072
282bf14c 5073 and the output is the vector of extracted even/odd elements. The output of
fb85abff 5074 extract_even will be: 0 2 4 6
5075 and of extract_odd: 1 3 5 7
5076
48e1416a 5077
282bf14c 5078 The permutation is done in log LENGTH stages. In each stage extract_even
5079 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5080 their order. In our example,
fb85abff 5081
5082 E1: extract_even (1st vec, 2nd vec)
5083 E2: extract_odd (1st vec, 2nd vec)
5084 E3: extract_even (3rd vec, 4th vec)
5085 E4: extract_odd (3rd vec, 4th vec)
5086
5087 The output for the first stage will be:
5088
5089 E1: 0 2 4 6 8 10 12 14
5090 E2: 1 3 5 7 9 11 13 15
48e1416a 5091 E3: 16 18 20 22 24 26 28 30
fb85abff 5092 E4: 17 19 21 23 25 27 29 31
5093
5094 In order to proceed and create the correct sequence for the next stage (or
48e1416a 5095 for the correct output, if the second stage is the last one, as in our
5096 example), we first put the output of extract_even operation and then the
fb85abff 5097 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5098 The input for the second stage is:
5099
5100 1st vec (E1): 0 2 4 6 8 10 12 14
48e1416a 5101 2nd vec (E3): 16 18 20 22 24 26 28 30
5102 3rd vec (E2): 1 3 5 7 9 11 13 15
fb85abff 5103 4th vec (E4): 17 19 21 23 25 27 29 31
5104
5105 The output of the second stage:
5106
5107 E1: 0 4 8 12 16 20 24 28
5108 E2: 2 6 10 14 18 22 26 30
5109 E3: 1 5 9 13 17 21 25 29
5110 E4: 3 7 11 15 19 23 27 31
5111
5112 And RESULT_CHAIN after reordering:
5113
5114 1st vec (E1): 0 4 8 12 16 20 24 28
5115 2nd vec (E3): 1 5 9 13 17 21 25 29
48e1416a 5116 3rd vec (E2): 2 6 10 14 18 22 26 30
fb85abff 5117 4th vec (E4): 3 7 11 15 19 23 27 31. */
5118
481fc474 5119static void
f1f41a6c 5120vect_permute_load_chain (vec<tree> dr_chain,
48e1416a 5121 unsigned int length,
fb85abff 5122 gimple stmt,
5123 gimple_stmt_iterator *gsi,
f1f41a6c 5124 vec<tree> *result_chain)
fb85abff 5125{
03d37e4e 5126 tree data_ref, first_vect, second_vect;
42f6a6e8 5127 tree perm_mask_even, perm_mask_odd;
1e1bca71 5128 tree perm3_mask_low, perm3_mask_high;
fb85abff 5129 gimple perm_stmt;
5130 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
42f6a6e8 5131 unsigned int i, j, log_length = exact_log2 (length);
5132 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
5133 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
fb85abff 5134
1648f21f 5135 result_chain->quick_grow (length);
5136 memcpy (result_chain->address (), dr_chain.address (),
5137 length * sizeof (tree));
42f6a6e8 5138
1e1bca71 5139 if (length == 3)
fb85abff 5140 {
1e1bca71 5141 unsigned int k;
fb85abff 5142
1e1bca71 5143 for (k = 0; k < 3; k++)
5144 {
5145 for (i = 0; i < nelt; i++)
5146 if (3 * i + k < 2 * nelt)
5147 sel[i] = 3 * i + k;
5148 else
5149 sel[i] = 0;
0761848d 5150 perm3_mask_low = vect_gen_perm_mask_checked (vectype, sel);
1e1bca71 5151
5152 for (i = 0, j = 0; i < nelt; i++)
5153 if (3 * i + k < 2 * nelt)
5154 sel[i] = i;
5155 else
5156 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5157
0761848d 5158 perm3_mask_high = vect_gen_perm_mask_checked (vectype, sel);
1e1bca71 5159
5160 first_vect = dr_chain[0];
5161 second_vect = dr_chain[1];
5162
5163 /* Create interleaving stmt (low part of):
 5164 low = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
5165 ...}> */
321d85d9 5166 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
e9cf809e 5167 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5168 second_vect, perm3_mask_low);
fb85abff 5169 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
48e1416a 5170
1e1bca71 5171 /* Create interleaving stmt (high part of):
 5172 high = VEC_PERM_EXPR <first_vect, second_vect,
 5173 perm3_mask_high> */
5174 first_vect = data_ref;
5175 second_vect = dr_chain[2];
321d85d9 5176 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
e9cf809e 5177 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5178 second_vect, perm3_mask_high);
fb85abff 5179 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
1e1bca71 5180 (*result_chain)[k] = data_ref;
fb85abff 5181 }
fb85abff 5182 }
1e1bca71 5183 else
5184 {
 5185 /* If LENGTH is not equal to 3, then only a power of 2 is supported. */
5186 gcc_assert (exact_log2 (length) != -1);
5187
5188 for (i = 0; i < nelt; ++i)
5189 sel[i] = i * 2;
0761848d 5190 perm_mask_even = vect_gen_perm_mask_checked (vectype, sel);
1e1bca71 5191
5192 for (i = 0; i < nelt; ++i)
5193 sel[i] = i * 2 + 1;
0761848d 5194 perm_mask_odd = vect_gen_perm_mask_checked (vectype, sel);
fb85abff 5195
1e1bca71 5196 for (i = 0; i < log_length; i++)
5197 {
5198 for (j = 0; j < length; j += 2)
5199 {
5200 first_vect = dr_chain[j];
5201 second_vect = dr_chain[j+1];
5202
5203 /* data_ref = permute_even (first_data_ref, second_data_ref); */
5204 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
e9cf809e 5205 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5206 first_vect, second_vect,
5207 perm_mask_even);
1e1bca71 5208 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5209 (*result_chain)[j/2] = data_ref;
5210
5211 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
5212 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
e9cf809e 5213 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5214 first_vect, second_vect,
5215 perm_mask_odd);
1e1bca71 5216 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5217 (*result_chain)[j/2+length/2] = data_ref;
5218 }
5219 memcpy (dr_chain.address (), result_chain->address (),
5220 length * sizeof (tree));
5221 }
5222 }
5223}
fb85abff 5224
926f7a02 5225/* Function vect_shift_permute_load_chain.
5226
5227 Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
5228 sequence of stmts to reorder the input data accordingly.
5229 Return the final references for loads in RESULT_CHAIN.
 5230 Return true if successful, false otherwise.
5231
5232 E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
5233 The input is 3 vectors each containing 8 elements. We assign a
 5234 number to each element; the input sequence is:
5235
5236 1st vec: 0 1 2 3 4 5 6 7
5237 2nd vec: 8 9 10 11 12 13 14 15
5238 3rd vec: 16 17 18 19 20 21 22 23
5239
5240 The output sequence should be:
5241
5242 1st vec: 0 3 6 9 12 15 18 21
5243 2nd vec: 1 4 7 10 13 16 19 22
5244 3rd vec: 2 5 8 11 14 17 20 23
5245
5246 We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
5247
5248 First we shuffle all 3 vectors to get correct elements order:
5249
5250 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
5251 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
5252 3rd vec: (16 19 22) (17 20 23) (18 21)
5253
5254 Next we unite and shift vector 3 times:
5255
5256 1st step:
5257 shift right by 6 the concatenation of:
5258 "1st vec" and "2nd vec"
5259 ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
5260 "2nd vec" and "3rd vec"
5261 ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
5262 "3rd vec" and "1st vec"
5263 (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
5264 | New vectors |
5265
5266 So that now new vectors are:
5267
5268 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
5269 2nd vec: (10 13) (16 19 22) (17 20 23)
5270 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
5271
5272 2nd step:
5273 shift right by 5 the concatenation of:
5274 "1st vec" and "3rd vec"
5275 ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
5276 "2nd vec" and "1st vec"
5277 (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
5278 "3rd vec" and "2nd vec"
5279 (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
5280 | New vectors |
5281
5282 So that now new vectors are:
5283
5284 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
5285 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
5286 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
5287
5288 3rd step:
5289 shift right by 5 the concatenation of:
5290 "1st vec" and "1st vec"
5291 ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
5292 shift right by 3 the concatenation of:
5293 "2nd vec" and "2nd vec"
5294 (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
5295 | New vectors |
5296
5297 So that now all vectors are READY:
5298 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
5299 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
5300 3rd vec: ( 1 4 7) (10 13) (16 19 22)
5301
 5302 This algorithm is faster than the one in vect_permute_load_chain if:
 5303 1. "shift of a concatenation" is faster than general permutation.
5304 This is usually so.
5305 2. The TARGET machine can't execute vector instructions in parallel.
 5306 This is because each step of the algorithm depends on the previous one.
5307 The algorithm in vect_permute_load_chain is much more parallel.
5308
5309 The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
5310*/
5311
5312static bool
5313vect_shift_permute_load_chain (vec<tree> dr_chain,
5314 unsigned int length,
5315 gimple stmt,
5316 gimple_stmt_iterator *gsi,
5317 vec<tree> *result_chain)
5318{
5319 tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
5320 tree perm2_mask1, perm2_mask2, perm3_mask;
5321 tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
5322 gimple perm_stmt;
5323
5324 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5325 unsigned int i;
5326 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
5327 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5328 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5329 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5330
5331 result_chain->quick_grow (length);
5332 memcpy (result_chain->address (), dr_chain.address (),
5333 length * sizeof (tree));
5334
2cc1223e 5335 if (exact_log2 (length) != -1 && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 4)
926f7a02 5336 {
2cc1223e 5337 unsigned int j, log_length = exact_log2 (length);
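      /* Generating the permutation constants that reorder a single vector so
	 that its even-indexed elements occupy one half and its odd-indexed
	 elements the other half.  For vector length 8 they are
	 {0 2 4 6 1 3 5 7} and {1 3 5 7 0 2 4 6}.  */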
926f7a02 5338 for (i = 0; i < nelt / 2; ++i)
5339 sel[i] = i * 2;
5340 for (i = 0; i < nelt / 2; ++i)
5341 sel[nelt / 2 + i] = i * 2 + 1;
5342 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5343 {
5344 if (dump_enabled_p ())
5345 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5346 "shuffle of 2 fields structure is not \
5347 supported by target\n");
5348 return false;
5349 }
0761848d 5350 perm2_mask1 = vect_gen_perm_mask_checked (vectype, sel);
926f7a02 5351
5352 for (i = 0; i < nelt / 2; ++i)
5353 sel[i] = i * 2 + 1;
5354 for (i = 0; i < nelt / 2; ++i)
5355 sel[nelt / 2 + i] = i * 2;
5356 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5357 {
5358 if (dump_enabled_p ())
5359 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5360 "shuffle of 2 fields structure is not \
5361 supported by target\n");
5362 return false;
5363 }
0761848d 5364 perm2_mask2 = vect_gen_perm_mask_checked (vectype, sel);
926f7a02 5365
5366 /* Generating permutation constant to shift all elements.
5367 For vector length 8 it is {4 5 6 7 8 9 10 11}. */
5368 for (i = 0; i < nelt; i++)
5369 sel[i] = nelt / 2 + i;
5370 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5371 {
5372 if (dump_enabled_p ())
5373 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5374 "shift permutation is not supported by target\n");
5375 return false;
5376 }
0761848d 5377 shift1_mask = vect_gen_perm_mask_checked (vectype, sel);
926f7a02 5378
 5379 /* Generating permutation constant to select a vector from 2 vectors.
5380 For vector length 8 it is {0 1 2 3 12 13 14 15}. */
5381 for (i = 0; i < nelt / 2; i++)
5382 sel[i] = i;
5383 for (i = nelt / 2; i < nelt; i++)
5384 sel[i] = nelt + i;
5385 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5386 {
5387 if (dump_enabled_p ())
5388 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5389 "select is not supported by target\n");
5390 return false;
5391 }
0761848d 5392 select_mask = vect_gen_perm_mask_checked (vectype, sel);
926f7a02 5393
2cc1223e 5394 for (i = 0; i < log_length; i++)
5395 {
5396 for (j = 0; j < length; j += 2)
5397 {
5398 first_vect = dr_chain[j];
5399 second_vect = dr_chain[j + 1];
5400
5401 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
e9cf809e 5402 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5403 first_vect, first_vect,
5404 perm2_mask1);
2cc1223e 5405 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5406 vect[0] = data_ref;
5407
5408 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
e9cf809e 5409 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5410 second_vect, second_vect,
5411 perm2_mask2);
2cc1223e 5412 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5413 vect[1] = data_ref;
926f7a02 5414
2cc1223e 5415 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
e9cf809e 5416 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5417 vect[0], vect[1], shift1_mask);
2cc1223e 5418 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5419 (*result_chain)[j/2 + length/2] = data_ref;
5420
5421 data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
e9cf809e 5422 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5423 vect[0], vect[1], select_mask);
2cc1223e 5424 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5425 (*result_chain)[j/2] = data_ref;
5426 }
5427 memcpy (dr_chain.address (), result_chain->address (),
5428 length * sizeof (tree));
5429 }
926f7a02 5430 return true;
5431 }
5432 if (length == 3 && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 2)
5433 {
5434 unsigned int k = 0, l = 0;
5435
 5436 /* Generating permutation constant to get all elements in the right order.
5437 For vector length 8 it is {0 3 6 1 4 7 2 5}. */
5438 for (i = 0; i < nelt; i++)
5439 {
5440 if (3 * k + (l % 3) >= nelt)
5441 {
5442 k = 0;
5443 l += (3 - (nelt % 3));
5444 }
5445 sel[i] = 3 * k + (l % 3);
5446 k++;
5447 }
5448 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5449 {
5450 if (dump_enabled_p ())
5451 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5452 "shuffle of 3 fields structure is not \
5453 supported by target\n");
5454 return false;
5455 }
0761848d 5456 perm3_mask = vect_gen_perm_mask_checked (vectype, sel);
926f7a02 5457
5458 /* Generating permutation constant to shift all elements.
5459 For vector length 8 it is {6 7 8 9 10 11 12 13}. */
5460 for (i = 0; i < nelt; i++)
5461 sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
5462 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5463 {
5464 if (dump_enabled_p ())
5465 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5466 "shift permutation is not supported by target\n");
5467 return false;
5468 }
0761848d 5469 shift1_mask = vect_gen_perm_mask_checked (vectype, sel);
926f7a02 5470
5471 /* Generating permutation constant to shift all elements.
5472 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
5473 for (i = 0; i < nelt; i++)
5474 sel[i] = 2 * (nelt / 3) + 1 + i;
5475 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5476 {
5477 if (dump_enabled_p ())
5478 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5479 "shift permutation is not supported by target\n");
5480 return false;
5481 }
0761848d 5482 shift2_mask = vect_gen_perm_mask_checked (vectype, sel);
926f7a02 5483
5484 /* Generating permutation constant to shift all elements.
5485 For vector length 8 it is {3 4 5 6 7 8 9 10}. */
5486 for (i = 0; i < nelt; i++)
5487 sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
5488 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5489 {
5490 if (dump_enabled_p ())
5491 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5492 "shift permutation is not supported by target\n");
5493 return false;
5494 }
0761848d 5495 shift3_mask = vect_gen_perm_mask_checked (vectype, sel);
926f7a02 5496
5497 /* Generating permutation constant to shift all elements.
5498 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
5499 for (i = 0; i < nelt; i++)
5500 sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
5501 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5502 {
5503 if (dump_enabled_p ())
5504 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5505 "shift permutation is not supported by target\n");
5506 return false;
5507 }
0761848d 5508 shift4_mask = vect_gen_perm_mask_checked (vectype, sel);
926f7a02 5509
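      /* Apply the masks built above: PERM3_MASK shuffles each input vector,
	 SHIFT1_MASK and SHIFT2_MASK combine neighbouring shuffled vectors as
	 described in the function comment, and SHIFT3_MASK/SHIFT4_MASK rotate
	 the two remaining vectors into their final positions.  */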
5510 for (k = 0; k < 3; k++)
5511 {
321d85d9 5512 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
e9cf809e 5513 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5514 dr_chain[k], dr_chain[k],
5515 perm3_mask);
926f7a02 5516 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5517 vect[k] = data_ref;
5518 }
5519
5520 for (k = 0; k < 3; k++)
5521 {
5522 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
e9cf809e 5523 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5524 vect[k % 3], vect[(k + 1) % 3],
5525 shift1_mask);
926f7a02 5526 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5527 vect_shift[k] = data_ref;
5528 }
5529
5530 for (k = 0; k < 3; k++)
5531 {
5532 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
e9cf809e 5533 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5534 vect_shift[(4 - k) % 3],
5535 vect_shift[(3 - k) % 3],
5536 shift2_mask);
926f7a02 5537 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5538 vect[k] = data_ref;
5539 }
5540
5541 (*result_chain)[3 - (nelt % 3)] = vect[2];
5542
5543 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
e9cf809e 5544 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
5545 vect[0], shift3_mask);
926f7a02 5546 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5547 (*result_chain)[nelt % 3] = data_ref;
5548
5549 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
e9cf809e 5550 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
5551 vect[1], shift4_mask);
926f7a02 5552 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5553 (*result_chain)[0] = data_ref;
5554 return true;
5555 }
5556 return false;
5557}
5558
ee612634 5559/* Function vect_transform_grouped_load.
fb85abff 5560
5561 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
 5562 to perform their permutation and assign the resulting vectorized statements
 5563 to the scalar statements.
5564*/
5565
481fc474 5566void
f1f41a6c 5567vect_transform_grouped_load (gimple stmt, vec<tree> dr_chain, int size,
fb85abff 5568 gimple_stmt_iterator *gsi)
5569{
3754d046 5570 machine_mode mode;
1e094109 5571 vec<tree> result_chain = vNULL;
fb85abff 5572
48e1416a 5573 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
 5574 RESULT_CHAIN is the output of vect_permute_load_chain; it contains permuted
fb85abff 5575 vectors that are ready for vector computation. */
f1f41a6c 5576 result_chain.create (size);
926f7a02 5577
 5578 /* If the reassociation width for the vector type is 2 or greater, the target
 5579 machine can execute 2 or more vector instructions in parallel. Otherwise try
 5580 to get the chain for the load group using vect_shift_permute_load_chain. */
5581 mode = TYPE_MODE (STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)));
5582 if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
6da5aa28 5583 || exact_log2 (size) != -1
926f7a02 5584 || !vect_shift_permute_load_chain (dr_chain, size, stmt,
5585 gsi, &result_chain))
5586 vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain);
ee612634 5587 vect_record_grouped_load_vectors (stmt, result_chain);
f1f41a6c 5588 result_chain.release ();
94b7b4dd 5589}
5590
ee612634 5591/* RESULT_CHAIN contains the output of a group of grouped loads that were
94b7b4dd 5592 generated as part of the vectorization of STMT. Assign the statement
5593 for each vector to the associated scalar statement. */
5594
5595void
f1f41a6c 5596vect_record_grouped_load_vectors (gimple stmt, vec<tree> result_chain)
94b7b4dd 5597{
21009880 5598 gimple first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
94b7b4dd 5599 gimple next_stmt, new_stmt;
5600 unsigned int i, gap_count;
5601 tree tmp_data_ref;
fb85abff 5602
48e1416a 5603 /* Put a permuted data-ref in the VECTORIZED_STMT field.
 5604 Since we scan the chain starting from its first node, their order
fb85abff 5605 corresponds to the order of data-refs in RESULT_CHAIN. */
5606 next_stmt = first_stmt;
5607 gap_count = 1;
f1f41a6c 5608 FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
fb85abff 5609 {
5610 if (!next_stmt)
5611 break;
5612
282bf14c 5613 /* Skip the gaps. Loads created for the gaps will be removed later by the
 5614 dead code elimination pass. No need to check for the first stmt in
fb85abff 5615 the group, since it always exists.
21009880 5616 GROUP_GAP is the number of steps in elements from the previous
5617 access (if there is no gap GROUP_GAP is 1). We skip loads that
282bf14c 5618 correspond to the gaps. */
48e1416a 5619 if (next_stmt != first_stmt
21009880 5620 && gap_count < GROUP_GAP (vinfo_for_stmt (next_stmt)))
fb85abff 5621 {
5622 gap_count++;
5623 continue;
5624 }
5625
5626 while (next_stmt)
5627 {
5628 new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
5629 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
5630 copies, and we put the new vector statement in the first available
5631 RELATED_STMT. */
5632 if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
5633 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
5634 else
5635 {
21009880 5636 if (!GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
fb85abff 5637 {
5638 gimple prev_stmt =
5639 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
5640 gimple rel_stmt =
5641 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt));
5642 while (rel_stmt)
5643 {
5644 prev_stmt = rel_stmt;
48e1416a 5645 rel_stmt =
fb85abff 5646 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
5647 }
5648
48e1416a 5649 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) =
fb85abff 5650 new_stmt;
5651 }
5652 }
5653
21009880 5654 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
fb85abff 5655 gap_count = 1;
5656 /* If NEXT_STMT accesses the same DR as the previous statement,
5657 put the same TMP_DATA_REF as its vectorized statement; otherwise
5658 get the next data-ref from RESULT_CHAIN. */
21009880 5659 if (!next_stmt || !GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
fb85abff 5660 break;
5661 }
5662 }
fb85abff 5663}
5664
 5665/* Function vect_can_force_dr_alignment_p.
5666
 5667 Returns whether DECL can be forced to be aligned on an ALIGNMENT-bit
 5668 boundary. */
5669
48e1416a 5670bool
fb85abff 5671vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
5672{
5673 if (TREE_CODE (decl) != VAR_DECL)
5674 return false;
5675
331d5983 5676 if (decl_in_symtab_p (decl)
5677 && !symtab_node::get (decl)->can_increase_alignment_p ())
8cab13cf 5678 return false;
5679
fb85abff 5680 if (TREE_STATIC (decl))
5681 return (alignment <= MAX_OFILE_ALIGNMENT);
5682 else
5683 return (alignment <= MAX_STACK_ALIGNMENT);
5684}
5685
fb85abff 5686
0822b158 5687/* Return whether the data reference DR is supported with respect to its
5688 alignment.
5689 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
 5690 if it is aligned, i.e., check if it is possible to vectorize it with different
fb85abff 5691 alignment. */
5692
5693enum dr_alignment_support
0822b158 5694vect_supportable_dr_alignment (struct data_reference *dr,
5695 bool check_aligned_accesses)
fb85abff 5696{
5697 gimple stmt = DR_STMT (dr);
5698 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5699 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3754d046 5700 machine_mode mode = TYPE_MODE (vectype);
37545e54 5701 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5702 struct loop *vect_loop = NULL;
5703 bool nested_in_vect_loop = false;
fb85abff 5704
0822b158 5705 if (aligned_access_p (dr) && !check_aligned_accesses)
fb85abff 5706 return dr_aligned;
5707
c71d3c24 5708 /* For now assume all conditional loads/stores support unaligned
5709 access without any special code. */
5710 if (is_gimple_call (stmt)
5711 && gimple_call_internal_p (stmt)
5712 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
5713 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
5714 return dr_unaligned_supported;
5715
ad074595 5716 if (loop_vinfo)
5717 {
5718 vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
5719 nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
5720 }
37545e54 5721
fb85abff 5722 /* Possibly unaligned access. */
5723
5724 /* We can choose between using the implicit realignment scheme (generating
5725 a misaligned_move stmt) and the explicit realignment scheme (generating
282bf14c 5726 aligned loads with a REALIGN_LOAD). There are two variants to the
5727 explicit realignment scheme: optimized, and unoptimized.
fb85abff 5728 We can optimize the realignment only if the step between consecutive
5729 vector loads is equal to the vector size. Since the vector memory
5730 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
5731 is guaranteed that the misalignment amount remains the same throughout the
5732 execution of the vectorized loop. Therefore, we can create the
5733 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
5734 at the loop preheader.
5735
5736 However, in the case of outer-loop vectorization, when vectorizing a
5737 memory access in the inner-loop nested within the LOOP that is now being
5738 vectorized, while it is guaranteed that the misalignment of the
5739 vectorized memory access will remain the same in different outer-loop
 5740 iterations, it is *not* guaranteed that it will remain the same throughout
5741 the execution of the inner-loop. This is because the inner-loop advances
5742 with the original scalar step (and not in steps of VS). If the inner-loop
5743 step happens to be a multiple of VS, then the misalignment remains fixed
5744 and we can use the optimized realignment scheme. For example:
5745
5746 for (i=0; i<N; i++)
5747 for (j=0; j<M; j++)
5748 s += a[i+j];
5749
5750 When vectorizing the i-loop in the above example, the step between
5751 consecutive vector loads is 1, and so the misalignment does not remain
5752 fixed across the execution of the inner-loop, and the realignment cannot
5753 be optimized (as illustrated in the following pseudo vectorized loop):
5754
5755 for (i=0; i<N; i+=4)
5756 for (j=0; j<M; j++){
5757 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
5758 // when j is {0,1,2,3,4,5,6,7,...} respectively.
5759 // (assuming that we start from an aligned address).
5760 }
5761
5762 We therefore have to use the unoptimized realignment scheme:
5763
5764 for (i=0; i<N; i+=4)
5765 for (j=k; j<M; j+=4)
5766 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
5767 // that the misalignment of the initial address is
5768 // 0).
5769
5770 The loop can then be vectorized as follows:
5771
5772 for (k=0; k<4; k++){
5773 rt = get_realignment_token (&vp[k]);
5774 for (i=0; i<N; i+=4){
5775 v1 = vp[i+k];
5776 for (j=k; j<M; j+=4){
5777 v2 = vp[i+j+VS-1];
5778 va = REALIGN_LOAD <v1,v2,rt>;
5779 vs += va;
5780 v1 = v2;
5781 }
5782 }
5783 } */
5784
5785 if (DR_IS_READ (dr))
5786 {
c6b19c5f 5787 bool is_packed = false;
5788 tree type = (TREE_TYPE (DR_REF (dr)));
5789
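      /* The explicit realignment schemes require the vec_realign_load optab
	 and, if the target defines the mask-for-load hook, a usable builtin
	 returned by it.  */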
d6bf3b14 5790 if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
fb85abff 5791 && (!targetm.vectorize.builtin_mask_for_load
5792 || targetm.vectorize.builtin_mask_for_load ()))
5793 {
5794 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
ad074595 5795 if ((nested_in_vect_loop
f9ae6f95 5796 && (TREE_INT_CST_LOW (DR_STEP (dr))
ad074595 5797 != GET_MODE_SIZE (TYPE_MODE (vectype))))
5798 || !loop_vinfo)
fb85abff 5799 return dr_explicit_realign;
5800 else
5801 return dr_explicit_realign_optimized;
5802 }
c6b19c5f 5803 if (!known_alignment_for_access_p (dr))
cfa724cf 5804 is_packed = not_size_aligned (DR_REF (dr));
48e1416a 5805
2cd0995e 5806 if ((TYPE_USER_ALIGN (type) && !is_packed)
5807 || targetm.vectorize.
5808 support_vector_misalignment (mode, type,
5809 DR_MISALIGNMENT (dr), is_packed))
fb85abff 5810 /* Can't software pipeline the loads, but can at least do them. */
5811 return dr_unaligned_supported;
5812 }
c6b19c5f 5813 else
5814 {
5815 bool is_packed = false;
5816 tree type = (TREE_TYPE (DR_REF (dr)));
fb85abff 5817
c6b19c5f 5818 if (!known_alignment_for_access_p (dr))
cfa724cf 5819 is_packed = not_size_aligned (DR_REF (dr));
48e1416a 5820
2cd0995e 5821 if ((TYPE_USER_ALIGN (type) && !is_packed)
5822 || targetm.vectorize.
5823 support_vector_misalignment (mode, type,
5824 DR_MISALIGNMENT (dr), is_packed))
c6b19c5f 5825 return dr_unaligned_supported;
5826 }
48e1416a 5827
fb85abff 5828 /* Unsupported. */
5829 return dr_unaligned_unsupported;
5830}