]> git.ipfire.org Git - thirdparty/gcc.git/blame - gcc/tree-vect-data-refs.c
genattrtab.c (write_header): Include hash-set.h...
[thirdparty/gcc.git] / gcc / tree-vect-data-refs.c
CommitLineData
b8698a0f 1/* Data References Analysis and Manipulation Utilities for Vectorization.
5624e564 2 Copyright (C) 2003-2015 Free Software Foundation, Inc.
b8698a0f 3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
ebfd146a
IR
4 and Ira Rosen <irar@il.ibm.com>
5
6This file is part of GCC.
7
8GCC is free software; you can redistribute it and/or modify it under
9the terms of the GNU General Public License as published by the Free
10Software Foundation; either version 3, or (at your option) any later
11version.
12
13GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14WARRANTY; without even the implied warranty of MERCHANTABILITY or
15FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16for more details.
17
18You should have received a copy of the GNU General Public License
19along with GCC; see the file COPYING3. If not see
20<http://www.gnu.org/licenses/>. */
21
22#include "config.h"
23#include "system.h"
24#include "coretypes.h"
78c60e3d 25#include "dumpfile.h"
ebfd146a 26#include "tm.h"
40e23961
MC
27#include "hash-set.h"
28#include "machmode.h"
29#include "vec.h"
30#include "double-int.h"
31#include "input.h"
32#include "alias.h"
33#include "symtab.h"
34#include "wide-int.h"
35#include "inchash.h"
ebfd146a 36#include "tree.h"
40e23961 37#include "fold-const.h"
d8a2d370 38#include "stor-layout.h"
237e9c04 39#include "tm_p.h"
ebfd146a 40#include "target.h"
60393bbc 41#include "predict.h"
60393bbc
AM
42#include "hard-reg-set.h"
43#include "input.h"
44#include "function.h"
45#include "dominance.h"
46#include "cfg.h"
ebfd146a 47#include "basic-block.h"
cf835838 48#include "gimple-pretty-print.h"
2fb9a547
AM
49#include "tree-ssa-alias.h"
50#include "internal-fn.h"
51#include "tree-eh.h"
52#include "gimple-expr.h"
53#include "is-a.h"
18f429e2 54#include "gimple.h"
45b0be94 55#include "gimplify.h"
5be5c238 56#include "gimple-iterator.h"
18f429e2 57#include "gimplify-me.h"
442b4905
AM
58#include "gimple-ssa.h"
59#include "tree-phinodes.h"
60#include "ssa-iterators.h"
d8a2d370 61#include "stringpool.h"
442b4905 62#include "tree-ssanames.h"
e28030cf
AM
63#include "tree-ssa-loop-ivopts.h"
64#include "tree-ssa-loop-manip.h"
442b4905 65#include "tree-ssa-loop.h"
7ee2468b 66#include "dumpfile.h"
ebfd146a 67#include "cfgloop.h"
ebfd146a
IR
68#include "tree-chrec.h"
69#include "tree-scalar-evolution.h"
70#include "tree-vectorizer.h"
718f9c0f 71#include "diagnostic-core.h"
c582198b
AM
72#include "hash-map.h"
73#include "plugin-api.h"
74#include "ipa-ref.h"
0136f8f0 75#include "cgraph.h"
2eb79bbb
SB
76/* Need to include rtl.h, expr.h, etc. for optabs. */
77#include "expr.h"
b0710fe1 78#include "insn-codes.h"
2eb79bbb 79#include "optabs.h"
9b2b7279 80#include "builtins.h"
6ad386b7 81#include "varasm.h"
ebfd146a 82
272c6793
RS
83/* Return true if load- or store-lanes optab OPTAB is implemented for
84 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
85
86static bool
87vect_lanes_optab_supported_p (const char *name, convert_optab optab,
88 tree vectype, unsigned HOST_WIDE_INT count)
89{
ef4bddc2 90 machine_mode mode, array_mode;
272c6793
RS
91 bool limit_p;
92
93 mode = TYPE_MODE (vectype);
94 limit_p = !targetm.array_mode_supported_p (mode, count);
95 array_mode = mode_for_size (count * GET_MODE_BITSIZE (mode),
96 MODE_INT, limit_p);
97
98 if (array_mode == BLKmode)
99 {
73fbfcad 100 if (dump_enabled_p ())
e645e942
TJ
101 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
102 "no array mode for %s[" HOST_WIDE_INT_PRINT_DEC "]\n",
78c60e3d 103 GET_MODE_NAME (mode), count);
272c6793
RS
104 return false;
105 }
106
107 if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
108 {
73fbfcad 109 if (dump_enabled_p ())
78c60e3d 110 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
e645e942 111 "cannot use %s<%s><%s>\n", name,
78c60e3d 112 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
272c6793
RS
113 return false;
114 }
115
73fbfcad 116 if (dump_enabled_p ())
78c60e3d 117 dump_printf_loc (MSG_NOTE, vect_location,
e645e942 118 "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
78c60e3d 119 GET_MODE_NAME (mode));
272c6793
RS
120
121 return true;
122}
123
124
ebfd146a 125/* Return the smallest scalar part of STMT.
ff802fa1
IR
126 This is used to determine the vectype of the stmt. We generally set the
127 vectype according to the type of the result (lhs). For stmts whose
ebfd146a 128 result-type is different than the type of the arguments (e.g., demotion,
b8698a0f 129 promotion), vectype will be reset appropriately (later). Note that we have
ebfd146a 130 to visit the smallest datatype in this function, because that determines the
ff802fa1 131 VF. If the smallest datatype in the loop is present only as the rhs of a
ebfd146a
IR
132 promotion operation - we'd miss it.
133 Such a case, where a variable of this datatype does not appear in the lhs
134 anywhere in the loop, can only occur if it's an invariant: e.g.:
b8698a0f 135 'int_x = (int) short_inv', which we'd expect to have been optimized away by
ff802fa1
IR
136 invariant motion. However, we cannot rely on invariant motion to always
137 take invariants out of the loop, and so in the case of promotion we also
138 have to check the rhs.
ebfd146a
IR
139 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
140 types. */
141
142tree
143vect_get_smallest_scalar_type (gimple stmt, HOST_WIDE_INT *lhs_size_unit,
144 HOST_WIDE_INT *rhs_size_unit)
145{
146 tree scalar_type = gimple_expr_type (stmt);
147 HOST_WIDE_INT lhs, rhs;
148
149 lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
150
151 if (is_gimple_assign (stmt)
152 && (gimple_assign_cast_p (stmt)
153 || gimple_assign_rhs_code (stmt) == WIDEN_MULT_EXPR
39f3fed6 154 || gimple_assign_rhs_code (stmt) == WIDEN_LSHIFT_EXPR
ebfd146a
IR
155 || gimple_assign_rhs_code (stmt) == FLOAT_EXPR))
156 {
157 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
158
159 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
160 if (rhs < lhs)
161 scalar_type = rhs_type;
162 }
b8698a0f
L
163
164 *lhs_size_unit = lhs;
ebfd146a
IR
165 *rhs_size_unit = rhs;
166 return scalar_type;
167}
168
169
ebfd146a
IR
170/* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
171 tested at run-time. Return TRUE if DDR was successfully inserted.
172 Return false if versioning is not supported. */
173
174static bool
175vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
176{
177 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
178
179 if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
180 return false;
181
73fbfcad 182 if (dump_enabled_p ())
ebfd146a 183 {
78c60e3d
SS
184 dump_printf_loc (MSG_NOTE, vect_location,
185 "mark for run-time aliasing test between ");
186 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr)));
187 dump_printf (MSG_NOTE, " and ");
188 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr)));
e645e942 189 dump_printf (MSG_NOTE, "\n");
ebfd146a
IR
190 }
191
192 if (optimize_loop_nest_for_size_p (loop))
193 {
73fbfcad 194 if (dump_enabled_p ())
e645e942
TJ
195 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
196 "versioning not supported when optimizing"
197 " for size.\n");
ebfd146a
IR
198 return false;
199 }
200
201 /* FORNOW: We don't support versioning with outer-loop vectorization. */
202 if (loop->inner)
203 {
73fbfcad 204 if (dump_enabled_p ())
e645e942
TJ
205 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
206 "versioning not yet supported for outer-loops.\n");
ebfd146a
IR
207 return false;
208 }
209
319e6439
RG
210 /* FORNOW: We don't support creating runtime alias tests for non-constant
211 step. */
212 if (TREE_CODE (DR_STEP (DDR_A (ddr))) != INTEGER_CST
213 || TREE_CODE (DR_STEP (DDR_B (ddr))) != INTEGER_CST)
214 {
73fbfcad 215 if (dump_enabled_p ())
e645e942 216 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
78c60e3d 217 "versioning not yet supported for non-constant "
e645e942 218 "step\n");
319e6439
RG
219 return false;
220 }
221
9771b263 222 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
ebfd146a
IR
223 return true;
224}
225
a70d6342 226
ebfd146a
IR
227/* Function vect_analyze_data_ref_dependence.
228
229 Return TRUE if there (might) exist a dependence between a memory-reference
230 DRA and a memory-reference DRB. When versioning for alias may check a
777e1f09
RG
231 dependence at run-time, return FALSE. Adjust *MAX_VF according to
232 the data dependence. */
b8698a0f 233
ebfd146a
IR
234static bool
235vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
5bfdb7d8 236 loop_vec_info loop_vinfo, int *max_vf)
ebfd146a
IR
237{
238 unsigned int i;
5abe1e05 239 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
ebfd146a
IR
240 struct data_reference *dra = DDR_A (ddr);
241 struct data_reference *drb = DDR_B (ddr);
b8698a0f 242 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
ebfd146a 243 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
ebfd146a
IR
244 lambda_vector dist_v;
245 unsigned int loop_depth;
b8698a0f 246
5abe1e05 247 /* In loop analysis all data references should be vectorizable. */
4b5caab7
IR
248 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
249 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
5abe1e05 250 gcc_unreachable ();
4b5caab7 251
5abe1e05 252 /* Independent data accesses. */
ebfd146a 253 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
5abe1e05 254 return false;
a70d6342 255
5abe1e05
RB
256 if (dra == drb
257 || (DR_IS_READ (dra) && DR_IS_READ (drb)))
ebfd146a 258 return false;
5961d779
RB
259
260 /* Even if we have an anti-dependence then, as the vectorized loop covers at
261 least two scalar iterations, there is always also a true dependence.
262 As the vectorizer does not re-order loads and stores we can ignore
263 the anti-dependence if TBAA can disambiguate both DRs similar to the
264 case with known negative distance anti-dependences (positive
265 distance anti-dependences would violate TBAA constraints). */
266 if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
267 || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
268 && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
269 get_alias_set (DR_REF (drb))))
270 return false;
b8698a0f 271
5abe1e05 272 /* Unknown data dependence. */
ebfd146a
IR
273 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
274 {
74bf76ed
JJ
275 /* If user asserted safelen consecutive iterations can be
276 executed concurrently, assume independence. */
277 if (loop->safelen >= 2)
278 {
279 if (loop->safelen < *max_vf)
280 *max_vf = loop->safelen;
d1417442 281 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
74bf76ed
JJ
282 return false;
283 }
284
90eb75f2
RB
285 if (STMT_VINFO_GATHER_P (stmtinfo_a)
286 || STMT_VINFO_GATHER_P (stmtinfo_b))
287 {
288 if (dump_enabled_p ())
289 {
290 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
291 "versioning for alias not supported for: "
292 "can't determine dependence between ");
293 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
294 DR_REF (dra));
295 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
296 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
297 DR_REF (drb));
e645e942 298 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
90eb75f2 299 }
fdf6a7b9 300 return true;
90eb75f2
RB
301 }
302
73fbfcad 303 if (dump_enabled_p ())
5abe1e05
RB
304 {
305 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
306 "versioning for alias required: "
307 "can't determine dependence between ");
308 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
309 DR_REF (dra));
310 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
311 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
312 DR_REF (drb));
e645e942 313 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
5abe1e05 314 }
e4a707c4 315
5abe1e05
RB
316 /* Add to list of ddrs that need to be tested at run-time. */
317 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
a70d6342
IR
318 }
319
5abe1e05 320 /* Known data dependence. */
ebfd146a
IR
321 if (DDR_NUM_DIST_VECTS (ddr) == 0)
322 {
74bf76ed
JJ
323 /* If user asserted safelen consecutive iterations can be
324 executed concurrently, assume independence. */
325 if (loop->safelen >= 2)
326 {
327 if (loop->safelen < *max_vf)
328 *max_vf = loop->safelen;
d1417442 329 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
74bf76ed
JJ
330 return false;
331 }
332
90eb75f2
RB
333 if (STMT_VINFO_GATHER_P (stmtinfo_a)
334 || STMT_VINFO_GATHER_P (stmtinfo_b))
335 {
336 if (dump_enabled_p ())
337 {
338 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
339 "versioning for alias not supported for: "
340 "bad dist vector for ");
341 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
342 DR_REF (dra));
343 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
344 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
345 DR_REF (drb));
e645e942 346 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
90eb75f2 347 }
fdf6a7b9 348 return true;
90eb75f2
RB
349 }
350
73fbfcad 351 if (dump_enabled_p ())
ebfd146a 352 {
e645e942 353 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
78c60e3d
SS
354 "versioning for alias required: "
355 "bad dist vector for ");
356 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
357 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
358 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
e645e942 359 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
ebfd146a
IR
360 }
361 /* Add to list of ddrs that need to be tested at run-time. */
362 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
b8698a0f 363 }
ebfd146a
IR
364
365 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
9771b263 366 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
ebfd146a
IR
367 {
368 int dist = dist_v[loop_depth];
369
73fbfcad 370 if (dump_enabled_p ())
78c60e3d 371 dump_printf_loc (MSG_NOTE, vect_location,
e645e942 372 "dependence distance = %d.\n", dist);
ebfd146a 373
777e1f09 374 if (dist == 0)
ebfd146a 375 {
73fbfcad 376 if (dump_enabled_p ())
ebfd146a 377 {
e645e942
TJ
378 dump_printf_loc (MSG_NOTE, vect_location,
379 "dependence distance == 0 between ");
78c60e3d
SS
380 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
381 dump_printf (MSG_NOTE, " and ");
382 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
e645e942 383 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
ebfd146a
IR
384 }
385
5185d248
RB
386 /* When we perform grouped accesses and perform implicit CSE
387 by detecting equal accesses and doing disambiguation with
388 runtime alias tests like for
389 .. = a[i];
390 .. = a[i+1];
391 a[i] = ..;
392 a[i+1] = ..;
393 *p = ..;
394 .. = a[i];
395 .. = a[i+1];
396 where we will end up loading { a[i], a[i+1] } once, make
397 sure that inserting group loads before the first load and
e33f43b9
RB
398 stores after the last store will do the right thing.
399 Similar for groups like
400 a[i] = ...;
401 ... = a[i];
402 a[i+1] = ...;
403 where loads from the group interleave with the store. */
404 if (STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
405 || STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
5185d248
RB
406 {
407 gimple earlier_stmt;
408 earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
409 if (DR_IS_WRITE
410 (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
411 {
412 if (dump_enabled_p ())
413 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
e645e942
TJ
414 "READ_WRITE dependence in interleaving."
415 "\n");
5185d248
RB
416 return true;
417 }
ebfd146a 418 }
b8698a0f 419
777e1f09
RG
420 continue;
421 }
422
423 if (dist > 0 && DDR_REVERSED_P (ddr))
424 {
425 /* If DDR_REVERSED_P the order of the data-refs in DDR was
426 reversed (to make distance vector positive), and the actual
427 distance is negative. */
73fbfcad 428 if (dump_enabled_p ())
78c60e3d 429 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
e645e942 430 "dependence distance negative.\n");
f2556b68
RB
431 /* Record a negative dependence distance to later limit the
432 amount of stmt copying / unrolling we can perform.
433 Only need to handle read-after-write dependence. */
434 if (DR_IS_READ (drb)
435 && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
436 || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
437 STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
777e1f09
RG
438 continue;
439 }
440
441 if (abs (dist) >= 2
442 && abs (dist) < *max_vf)
443 {
444 /* The dependence distance requires reduction of the maximal
445 vectorization factor. */
446 *max_vf = abs (dist);
73fbfcad 447 if (dump_enabled_p ())
78c60e3d 448 dump_printf_loc (MSG_NOTE, vect_location,
e645e942
TJ
449 "adjusting maximal vectorization factor to %i\n",
450 *max_vf);
ebfd146a
IR
451 }
452
777e1f09 453 if (abs (dist) >= *max_vf)
ebfd146a 454 {
b8698a0f 455 /* Dependence distance does not create dependence, as far as
777e1f09 456 vectorization is concerned, in this case. */
73fbfcad 457 if (dump_enabled_p ())
78c60e3d 458 dump_printf_loc (MSG_NOTE, vect_location,
e645e942 459 "dependence distance >= VF.\n");
ebfd146a
IR
460 continue;
461 }
462
73fbfcad 463 if (dump_enabled_p ())
ebfd146a 464 {
78c60e3d 465 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
e645e942
TJ
466 "not vectorized, possible dependence "
467 "between data-refs ");
78c60e3d
SS
468 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
469 dump_printf (MSG_NOTE, " and ");
470 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
e645e942 471 dump_printf (MSG_NOTE, "\n");
ebfd146a
IR
472 }
473
474 return true;
475 }
476
477 return false;
478}
479
480/* Function vect_analyze_data_ref_dependences.
b8698a0f 481
ebfd146a 482 Examine all the data references in the loop, and make sure there do not
777e1f09
RG
483 exist any data dependences between them. Set *MAX_VF according to
484 the maximum vectorization factor the data dependences allow. */
b8698a0f 485
ebfd146a 486bool
5abe1e05 487vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo, int *max_vf)
ebfd146a
IR
488{
489 unsigned int i;
ebfd146a
IR
490 struct data_dependence_relation *ddr;
491
73fbfcad 492 if (dump_enabled_p ())
78c60e3d 493 dump_printf_loc (MSG_NOTE, vect_location,
e645e942 494 "=== vect_analyze_data_ref_dependences ===\n");
5abe1e05 495
d1417442 496 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
5abe1e05
RB
497 if (!compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
498 &LOOP_VINFO_DDRS (loop_vinfo),
499 LOOP_VINFO_LOOP_NEST (loop_vinfo), true))
500 return false;
501
502 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
503 if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf))
504 return false;
505
506 return true;
507}
508
509
510/* Function vect_slp_analyze_data_ref_dependence.
511
512 Return TRUE if there (might) exist a dependence between a memory-reference
513 DRA and a memory-reference DRB. When versioning for alias may check a
514 dependence at run-time, return FALSE. Adjust *MAX_VF according to
515 the data dependence. */
516
517static bool
518vect_slp_analyze_data_ref_dependence (struct data_dependence_relation *ddr)
519{
520 struct data_reference *dra = DDR_A (ddr);
521 struct data_reference *drb = DDR_B (ddr);
522
523 /* We need to check dependences of statements marked as unvectorizable
524 as well, they still can prohibit vectorization. */
525
526 /* Independent data accesses. */
527 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
528 return false;
529
530 if (dra == drb)
531 return false;
532
533 /* Read-read is OK. */
534 if (DR_IS_READ (dra) && DR_IS_READ (drb))
535 return false;
536
e6c9d234
RB
537 /* If dra and drb are part of the same interleaving chain consider
538 them independent. */
539 if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (DR_STMT (dra)))
540 && (GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dra)))
541 == GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (drb)))))
542 return false;
543
5abe1e05
RB
544 /* Unknown data dependence. */
545 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
fcac74a1 546 {
649d196d
RB
547 if (dump_enabled_p ())
548 {
549 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
550 "can't determine dependence between ");
551 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
552 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
553 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
e645e942 554 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
649d196d 555 }
fcac74a1 556 }
649d196d 557 else if (dump_enabled_p ())
fcac74a1 558 {
5abe1e05
RB
559 dump_printf_loc (MSG_NOTE, vect_location,
560 "determined dependence between ");
561 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
562 dump_printf (MSG_NOTE, " and ");
563 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
e645e942 564 dump_printf (MSG_NOTE, "\n");
fcac74a1 565 }
b8698a0f 566
649d196d 567 /* We do not vectorize basic blocks with write-write dependencies. */
5abe1e05
RB
568 if (DR_IS_WRITE (dra) && DR_IS_WRITE (drb))
569 return true;
570
649d196d 571 /* If we have a read-write dependence check that the load is before the store.
5abe1e05
RB
572 When we vectorize basic blocks, vector load can be only before
573 corresponding scalar load, and vector store can be only after its
574 corresponding scalar store. So the order of the acceses is preserved in
575 case the load is before the store. */
649d196d 576 gimple earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
5abe1e05 577 if (DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
649d196d
RB
578 {
579 /* That only holds for load-store pairs taking part in vectorization. */
580 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dra)))
581 && STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (drb))))
582 return false;
583 }
5abe1e05
RB
584
585 return true;
586}
587
588
589/* Function vect_analyze_data_ref_dependences.
590
591 Examine all the data references in the basic-block, and make sure there
592 do not exist any data dependences between them. Set *MAX_VF according to
593 the maximum vectorization factor the data dependences allow. */
594
595bool
596vect_slp_analyze_data_ref_dependences (bb_vec_info bb_vinfo)
597{
598 struct data_dependence_relation *ddr;
599 unsigned int i;
600
601 if (dump_enabled_p ())
602 dump_printf_loc (MSG_NOTE, vect_location,
e645e942 603 "=== vect_slp_analyze_data_ref_dependences ===\n");
5abe1e05
RB
604
605 if (!compute_all_dependences (BB_VINFO_DATAREFS (bb_vinfo),
606 &BB_VINFO_DDRS (bb_vinfo),
607 vNULL, true))
608 return false;
609
610 FOR_EACH_VEC_ELT (BB_VINFO_DDRS (bb_vinfo), i, ddr)
611 if (vect_slp_analyze_data_ref_dependence (ddr))
ebfd146a
IR
612 return false;
613
614 return true;
615}
616
617
618/* Function vect_compute_data_ref_alignment
619
620 Compute the misalignment of the data reference DR.
621
622 Output:
623 1. If during the misalignment computation it is found that the data reference
624 cannot be vectorized then false is returned.
625 2. DR_MISALIGNMENT (DR) is defined.
626
627 FOR NOW: No analysis is actually performed. Misalignment is calculated
628 only for trivial cases. TODO. */
629
630static bool
631vect_compute_data_ref_alignment (struct data_reference *dr)
632{
633 gimple stmt = DR_STMT (dr);
b8698a0f 634 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
ebfd146a 635 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
a70d6342 636 struct loop *loop = NULL;
ebfd146a
IR
637 tree ref = DR_REF (dr);
638 tree vectype;
639 tree base, base_addr;
640 bool base_aligned;
641 tree misalign;
642 tree aligned_to, alignment;
b8698a0f 643
73fbfcad 644 if (dump_enabled_p ())
78c60e3d 645 dump_printf_loc (MSG_NOTE, vect_location,
e645e942 646 "vect_compute_data_ref_alignment:\n");
ebfd146a 647
a70d6342
IR
648 if (loop_vinfo)
649 loop = LOOP_VINFO_LOOP (loop_vinfo);
b8698a0f 650
ebfd146a
IR
651 /* Initialize misalignment to unknown. */
652 SET_DR_MISALIGNMENT (dr, -1);
653
7595989b
RG
654 /* Strided loads perform only component accesses, misalignment information
655 is irrelevant for them. */
656 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
657 return true;
658
ebfd146a
IR
659 misalign = DR_INIT (dr);
660 aligned_to = DR_ALIGNED_TO (dr);
661 base_addr = DR_BASE_ADDRESS (dr);
662 vectype = STMT_VINFO_VECTYPE (stmt_info);
663
664 /* In case the dataref is in an inner-loop of the loop that is being
665 vectorized (LOOP), we use the base and misalignment information
ff802fa1 666 relative to the outer-loop (LOOP). This is ok only if the misalignment
ebfd146a
IR
667 stays the same throughout the execution of the inner-loop, which is why
668 we have to check that the stride of the dataref in the inner-loop evenly
669 divides by the vector size. */
a70d6342 670 if (loop && nested_in_vect_loop_p (loop, stmt))
ebfd146a
IR
671 {
672 tree step = DR_STEP (dr);
673 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
b8698a0f 674
ebfd146a
IR
675 if (dr_step % GET_MODE_SIZE (TYPE_MODE (vectype)) == 0)
676 {
73fbfcad 677 if (dump_enabled_p ())
78c60e3d 678 dump_printf_loc (MSG_NOTE, vect_location,
e645e942 679 "inner step divides the vector-size.\n");
ebfd146a
IR
680 misalign = STMT_VINFO_DR_INIT (stmt_info);
681 aligned_to = STMT_VINFO_DR_ALIGNED_TO (stmt_info);
682 base_addr = STMT_VINFO_DR_BASE_ADDRESS (stmt_info);
683 }
684 else
685 {
73fbfcad 686 if (dump_enabled_p ())
78c60e3d 687 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
e645e942 688 "inner step doesn't divide the vector-size.\n");
ebfd146a
IR
689 misalign = NULL_TREE;
690 }
691 }
692
3ebde0e9
UW
693 /* Similarly, if we're doing basic-block vectorization, we can only use
694 base and misalignment information relative to an innermost loop if the
695 misalignment stays the same throughout the execution of the loop.
696 As above, this is the case if the stride of the dataref evenly divides
697 by the vector size. */
698 if (!loop)
699 {
700 tree step = DR_STEP (dr);
701 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
702
703 if (dr_step % GET_MODE_SIZE (TYPE_MODE (vectype)) != 0)
704 {
73fbfcad 705 if (dump_enabled_p ())
e645e942
TJ
706 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
707 "SLP: step doesn't divide the vector-size.\n");
3ebde0e9
UW
708 misalign = NULL_TREE;
709 }
710 }
711
ebfd146a
IR
712 base = build_fold_indirect_ref (base_addr);
713 alignment = ssize_int (TYPE_ALIGN (vectype)/BITS_PER_UNIT);
714
715 if ((aligned_to && tree_int_cst_compare (aligned_to, alignment) < 0)
716 || !misalign)
717 {
73fbfcad 718 if (dump_enabled_p ())
ebfd146a 719 {
78c60e3d 720 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
e645e942 721 "Unknown alignment for access: ");
78c60e3d 722 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, base);
e645e942 723 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
ebfd146a
IR
724 }
725 return true;
726 }
727
b8698a0f 728 if ((DECL_P (base)
ebfd146a
IR
729 && tree_int_cst_compare (ssize_int (DECL_ALIGN_UNIT (base)),
730 alignment) >= 0)
731 || (TREE_CODE (base_addr) == SSA_NAME
732 && tree_int_cst_compare (ssize_int (TYPE_ALIGN_UNIT (TREE_TYPE (
733 TREE_TYPE (base_addr)))),
7cf64710 734 alignment) >= 0)
0eb77834 735 || (get_pointer_alignment (base_addr) >= TYPE_ALIGN (vectype)))
ebfd146a
IR
736 base_aligned = true;
737 else
b8698a0f 738 base_aligned = false;
ebfd146a 739
b8698a0f 740 if (!base_aligned)
ebfd146a 741 {
d6682315
RG
742 /* Do not change the alignment of global variables here if
743 flag_section_anchors is enabled as we already generated
744 RTL for other functions. Most global variables should
745 have been aligned during the IPA increase_alignment pass. */
746 if (!vect_can_force_dr_alignment_p (base, TYPE_ALIGN (vectype))
747 || (TREE_STATIC (base) && flag_section_anchors))
ebfd146a 748 {
73fbfcad 749 if (dump_enabled_p ())
ebfd146a 750 {
78c60e3d 751 dump_printf_loc (MSG_NOTE, vect_location,
e645e942 752 "can't force alignment of ref: ");
78c60e3d 753 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
e645e942 754 dump_printf (MSG_NOTE, "\n");
ebfd146a
IR
755 }
756 return true;
757 }
b8698a0f 758
ebfd146a
IR
759 /* Force the alignment of the decl.
760 NOTE: This is the only change to the code we make during
761 the analysis phase, before deciding to vectorize the loop. */
73fbfcad 762 if (dump_enabled_p ())
720f5239 763 {
78c60e3d
SS
764 dump_printf_loc (MSG_NOTE, vect_location, "force alignment of ");
765 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
e645e942 766 dump_printf (MSG_NOTE, "\n");
720f5239
IR
767 }
768
c716e67f
XDL
769 ((dataref_aux *)dr->aux)->base_decl = base;
770 ((dataref_aux *)dr->aux)->base_misaligned = true;
ebfd146a
IR
771 }
772
46241ea9
RG
773 /* If this is a backward running DR then first access in the larger
774 vectype actually is N-1 elements before the address in the DR.
775 Adjust misalign accordingly. */
776 if (tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0)
777 {
778 tree offset = ssize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1);
779 /* DR_STEP(dr) is the same as -TYPE_SIZE of the scalar type,
780 otherwise we wouldn't be here. */
781 offset = fold_build2 (MULT_EXPR, ssizetype, offset, DR_STEP (dr));
782 /* PLUS because DR_STEP was negative. */
783 misalign = size_binop (PLUS_EXPR, misalign, offset);
784 }
785
ebfd146a
IR
786 /* Modulo alignment. */
787 misalign = size_binop (FLOOR_MOD_EXPR, misalign, alignment);
788
cc269bb6 789 if (!tree_fits_uhwi_p (misalign))
ebfd146a
IR
790 {
791 /* Negative or overflowed misalignment value. */
73fbfcad 792 if (dump_enabled_p ())
78c60e3d 793 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
e645e942 794 "unexpected misalign value\n");
ebfd146a
IR
795 return false;
796 }
797
eb1ce453 798 SET_DR_MISALIGNMENT (dr, tree_to_uhwi (misalign));
ebfd146a 799
73fbfcad 800 if (dump_enabled_p ())
ebfd146a 801 {
78c60e3d
SS
802 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
803 "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr));
804 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
e645e942 805 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
ebfd146a
IR
806 }
807
808 return true;
809}
810
811
812/* Function vect_compute_data_refs_alignment
813
814 Compute the misalignment of data references in the loop.
815 Return FALSE if a data reference is found that cannot be vectorized. */
816
817static bool
b8698a0f 818vect_compute_data_refs_alignment (loop_vec_info loop_vinfo,
a70d6342 819 bb_vec_info bb_vinfo)
ebfd146a 820{
9771b263 821 vec<data_reference_p> datarefs;
ebfd146a
IR
822 struct data_reference *dr;
823 unsigned int i;
824
a70d6342
IR
825 if (loop_vinfo)
826 datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
827 else
828 datarefs = BB_VINFO_DATAREFS (bb_vinfo);
b8698a0f 829
9771b263 830 FOR_EACH_VEC_ELT (datarefs, i, dr)
4b5caab7
IR
831 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
832 && !vect_compute_data_ref_alignment (dr))
833 {
834 if (bb_vinfo)
835 {
836 /* Mark unsupported statement as unvectorizable. */
837 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
838 continue;
839 }
840 else
841 return false;
842 }
ebfd146a
IR
843
844 return true;
845}
846
847
848/* Function vect_update_misalignment_for_peel
849
850 DR - the data reference whose misalignment is to be adjusted.
851 DR_PEEL - the data reference whose misalignment is being made
852 zero in the vector loop by the peel.
853 NPEEL - the number of iterations in the peel loop if the misalignment
854 of DR_PEEL is known at compile time. */
855
856static void
857vect_update_misalignment_for_peel (struct data_reference *dr,
858 struct data_reference *dr_peel, int npeel)
859{
860 unsigned int i;
9771b263 861 vec<dr_p> same_align_drs;
ebfd146a
IR
862 struct data_reference *current_dr;
863 int dr_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
864 int dr_peel_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr_peel))));
865 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
866 stmt_vec_info peel_stmt_info = vinfo_for_stmt (DR_STMT (dr_peel));
867
868 /* For interleaved data accesses the step in the loop must be multiplied by
869 the size of the interleaving group. */
0d0293ac 870 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
e14c1050 871 dr_size *= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)));
0d0293ac 872 if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info))
e14c1050 873 dr_peel_size *= GROUP_SIZE (peel_stmt_info);
ebfd146a
IR
874
875 /* It can be assumed that the data refs with the same alignment as dr_peel
876 are aligned in the vector loop. */
877 same_align_drs
878 = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (DR_STMT (dr_peel)));
9771b263 879 FOR_EACH_VEC_ELT (same_align_drs, i, current_dr)
ebfd146a
IR
880 {
881 if (current_dr != dr)
882 continue;
883 gcc_assert (DR_MISALIGNMENT (dr) / dr_size ==
884 DR_MISALIGNMENT (dr_peel) / dr_peel_size);
885 SET_DR_MISALIGNMENT (dr, 0);
886 return;
887 }
888
889 if (known_alignment_for_access_p (dr)
890 && known_alignment_for_access_p (dr_peel))
891 {
d8ba5b19 892 bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
ebfd146a
IR
893 int misal = DR_MISALIGNMENT (dr);
894 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
d8ba5b19 895 misal += negative ? -npeel * dr_size : npeel * dr_size;
5aea1e76 896 misal &= (TYPE_ALIGN (vectype) / BITS_PER_UNIT) - 1;
ebfd146a
IR
897 SET_DR_MISALIGNMENT (dr, misal);
898 return;
899 }
900
73fbfcad 901 if (dump_enabled_p ())
e645e942 902 dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment to -1.\n");
ebfd146a
IR
903 SET_DR_MISALIGNMENT (dr, -1);
904}
905
906
907/* Function vect_verify_datarefs_alignment
908
909 Return TRUE if all data references in the loop can be
910 handled with respect to alignment. */
911
a70d6342
IR
912bool
913vect_verify_datarefs_alignment (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
ebfd146a 914{
9771b263 915 vec<data_reference_p> datarefs;
ebfd146a
IR
916 struct data_reference *dr;
917 enum dr_alignment_support supportable_dr_alignment;
918 unsigned int i;
919
a70d6342
IR
920 if (loop_vinfo)
921 datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
922 else
923 datarefs = BB_VINFO_DATAREFS (bb_vinfo);
924
9771b263 925 FOR_EACH_VEC_ELT (datarefs, i, dr)
ebfd146a
IR
926 {
927 gimple stmt = DR_STMT (dr);
928 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
929
38eec4c6
UW
930 if (!STMT_VINFO_RELEVANT_P (stmt_info))
931 continue;
932
4b5caab7
IR
933 /* For interleaving, only the alignment of the first access matters.
934 Skip statements marked as not vectorizable. */
0d0293ac 935 if ((STMT_VINFO_GROUPED_ACCESS (stmt_info)
e14c1050 936 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
4b5caab7 937 || !STMT_VINFO_VECTORIZABLE (stmt_info))
ebfd146a
IR
938 continue;
939
a82960aa
RG
940 /* Strided loads perform only component accesses, alignment is
941 irrelevant for them. */
942 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
943 continue;
944
720f5239 945 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
ebfd146a
IR
946 if (!supportable_dr_alignment)
947 {
73fbfcad 948 if (dump_enabled_p ())
ebfd146a
IR
949 {
950 if (DR_IS_READ (dr))
78c60e3d
SS
951 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
952 "not vectorized: unsupported unaligned load.");
ebfd146a 953 else
78c60e3d
SS
954 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
955 "not vectorized: unsupported unaligned "
956 "store.");
4b5caab7 957
78c60e3d
SS
958 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
959 DR_REF (dr));
e645e942 960 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
ebfd146a
IR
961 }
962 return false;
963 }
73fbfcad 964 if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
78c60e3d 965 dump_printf_loc (MSG_NOTE, vect_location,
e645e942 966 "Vectorizing an unaligned access.\n");
ebfd146a
IR
967 }
968 return true;
969}
970
4c9bcf89
RG
971/* Given an memory reference EXP return whether its alignment is less
972 than its size. */
973
974static bool
975not_size_aligned (tree exp)
976{
cc269bb6 977 if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
4c9bcf89
RG
978 return true;
979
eb1ce453 980 return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
4c9bcf89
RG
981 > get_object_alignment (exp));
982}
ebfd146a
IR
983
984/* Function vector_alignment_reachable_p
985
986 Return true if vector alignment for DR is reachable by peeling
987 a few loop iterations. Return false otherwise. */
988
989static bool
990vector_alignment_reachable_p (struct data_reference *dr)
991{
992 gimple stmt = DR_STMT (dr);
993 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
994 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
995
0d0293ac 996 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
ebfd146a
IR
997 {
998 /* For interleaved access we peel only if number of iterations in
999 the prolog loop ({VF - misalignment}), is a multiple of the
1000 number of the interleaved accesses. */
1001 int elem_size, mis_in_elements;
1002 int nelements = TYPE_VECTOR_SUBPARTS (vectype);
1003
1004 /* FORNOW: handle only known alignment. */
1005 if (!known_alignment_for_access_p (dr))
1006 return false;
1007
1008 elem_size = GET_MODE_SIZE (TYPE_MODE (vectype)) / nelements;
1009 mis_in_elements = DR_MISALIGNMENT (dr) / elem_size;
1010
e14c1050 1011 if ((nelements - mis_in_elements) % GROUP_SIZE (stmt_info))
ebfd146a
IR
1012 return false;
1013 }
1014
1015 /* If misalignment is known at the compile time then allow peeling
1016 only if natural alignment is reachable through peeling. */
1017 if (known_alignment_for_access_p (dr) && !aligned_access_p (dr))
1018 {
b8698a0f 1019 HOST_WIDE_INT elmsize =
ebfd146a 1020 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
73fbfcad 1021 if (dump_enabled_p ())
ebfd146a 1022 {
e645e942
TJ
1023 dump_printf_loc (MSG_NOTE, vect_location,
1024 "data size =" HOST_WIDE_INT_PRINT_DEC, elmsize);
1025 dump_printf (MSG_NOTE,
1026 ". misalignment = %d.\n", DR_MISALIGNMENT (dr));
ebfd146a
IR
1027 }
1028 if (DR_MISALIGNMENT (dr) % elmsize)
1029 {
73fbfcad 1030 if (dump_enabled_p ())
e645e942
TJ
1031 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1032 "data size does not divide the misalignment.\n");
ebfd146a
IR
1033 return false;
1034 }
1035 }
1036
1037 if (!known_alignment_for_access_p (dr))
1038 {
4c9bcf89
RG
1039 tree type = TREE_TYPE (DR_REF (dr));
1040 bool is_packed = not_size_aligned (DR_REF (dr));
73fbfcad 1041 if (dump_enabled_p ())
e645e942
TJ
1042 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1043 "Unknown misalignment, is_packed = %d\n",is_packed);
afb119be
RB
1044 if ((TYPE_USER_ALIGN (type) && !is_packed)
1045 || targetm.vectorize.vector_alignment_reachable (type, is_packed))
ebfd146a
IR
1046 return true;
1047 else
1048 return false;
1049 }
1050
1051 return true;
1052}
1053
720f5239
IR
1054
1055/* Calculate the cost of the memory access represented by DR. */
1056
92345349 1057static void
720f5239
IR
1058vect_get_data_access_cost (struct data_reference *dr,
1059 unsigned int *inside_cost,
92345349
BS
1060 unsigned int *outside_cost,
1061 stmt_vector_for_cost *body_cost_vec)
720f5239
IR
1062{
1063 gimple stmt = DR_STMT (dr);
1064 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1065 int nunits = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
1066 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1067 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1068 int ncopies = vf / nunits;
720f5239 1069
38eec4c6 1070 if (DR_IS_READ (dr))
92345349
BS
1071 vect_get_load_cost (dr, ncopies, true, inside_cost, outside_cost,
1072 NULL, body_cost_vec, false);
720f5239 1073 else
92345349 1074 vect_get_store_cost (dr, ncopies, inside_cost, body_cost_vec);
720f5239 1075
73fbfcad 1076 if (dump_enabled_p ())
78c60e3d
SS
1077 dump_printf_loc (MSG_NOTE, vect_location,
1078 "vect_get_data_access_cost: inside_cost = %d, "
e645e942 1079 "outside_cost = %d.\n", *inside_cost, *outside_cost);
720f5239
IR
1080}
1081
1082
720f5239
IR
1083/* Insert DR into peeling hash table with NPEEL as key. */
1084
1085static void
1086vect_peeling_hash_insert (loop_vec_info loop_vinfo, struct data_reference *dr,
1087 int npeel)
1088{
1089 struct _vect_peel_info elem, *slot;
bf190e8d 1090 _vect_peel_info **new_slot;
720f5239
IR
1091 bool supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1092
1093 elem.npeel = npeel;
c203e8a7 1094 slot = LOOP_VINFO_PEELING_HTAB (loop_vinfo)->find (&elem);
720f5239
IR
1095 if (slot)
1096 slot->count++;
1097 else
1098 {
1099 slot = XNEW (struct _vect_peel_info);
1100 slot->npeel = npeel;
1101 slot->dr = dr;
1102 slot->count = 1;
c203e8a7
TS
1103 new_slot
1104 = LOOP_VINFO_PEELING_HTAB (loop_vinfo)->find_slot (slot, INSERT);
720f5239
IR
1105 *new_slot = slot;
1106 }
1107
8b5e1202
SO
1108 if (!supportable_dr_alignment
1109 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
720f5239
IR
1110 slot->count += VECT_MAX_COST;
1111}
1112
1113
1114/* Traverse peeling hash table to find peeling option that aligns maximum
1115 number of data accesses. */
1116
bf190e8d
LC
1117int
1118vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1119 _vect_peel_extended_info *max)
720f5239 1120{
bf190e8d 1121 vect_peel_info elem = *slot;
720f5239 1122
44542f8e
IR
1123 if (elem->count > max->peel_info.count
1124 || (elem->count == max->peel_info.count
1125 && max->peel_info.npeel > elem->npeel))
720f5239
IR
1126 {
1127 max->peel_info.npeel = elem->npeel;
1128 max->peel_info.count = elem->count;
1129 max->peel_info.dr = elem->dr;
1130 }
1131
1132 return 1;
1133}
1134
1135
ff802fa1
IR
1136/* Traverse peeling hash table and calculate cost for each peeling option.
1137 Find the one with the lowest cost. */
720f5239 1138
bf190e8d
LC
1139int
1140vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1141 _vect_peel_extended_info *min)
720f5239 1142{
bf190e8d 1143 vect_peel_info elem = *slot;
720f5239
IR
1144 int save_misalignment, dummy;
1145 unsigned int inside_cost = 0, outside_cost = 0, i;
1146 gimple stmt = DR_STMT (elem->dr);
1147 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1148 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
9771b263 1149 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
720f5239 1150 struct data_reference *dr;
92345349
BS
1151 stmt_vector_for_cost prologue_cost_vec, body_cost_vec, epilogue_cost_vec;
1152 int single_iter_cost;
1153
9771b263
DN
1154 prologue_cost_vec.create (2);
1155 body_cost_vec.create (2);
1156 epilogue_cost_vec.create (2);
720f5239 1157
9771b263 1158 FOR_EACH_VEC_ELT (datarefs, i, dr)
720f5239
IR
1159 {
1160 stmt = DR_STMT (dr);
1161 stmt_info = vinfo_for_stmt (stmt);
1162 /* For interleaving, only the alignment of the first access
1163 matters. */
0d0293ac 1164 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
e14c1050 1165 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
720f5239
IR
1166 continue;
1167
1168 save_misalignment = DR_MISALIGNMENT (dr);
1169 vect_update_misalignment_for_peel (dr, elem->dr, elem->npeel);
92345349
BS
1170 vect_get_data_access_cost (dr, &inside_cost, &outside_cost,
1171 &body_cost_vec);
720f5239
IR
1172 SET_DR_MISALIGNMENT (dr, save_misalignment);
1173 }
1174
92345349
BS
1175 single_iter_cost = vect_get_single_scalar_iteration_cost (loop_vinfo);
1176 outside_cost += vect_get_known_peeling_cost (loop_vinfo, elem->npeel,
1177 &dummy, single_iter_cost,
1178 &prologue_cost_vec,
1179 &epilogue_cost_vec);
1180
1181 /* Prologue and epilogue costs are added to the target model later.
1182 These costs depend only on the scalar iteration cost, the
1183 number of peeling iterations finally chosen, and the number of
1184 misaligned statements. So discard the information found here. */
9771b263
DN
1185 prologue_cost_vec.release ();
1186 epilogue_cost_vec.release ();
720f5239
IR
1187
1188 if (inside_cost < min->inside_cost
1189 || (inside_cost == min->inside_cost && outside_cost < min->outside_cost))
1190 {
1191 min->inside_cost = inside_cost;
1192 min->outside_cost = outside_cost;
9771b263 1193 min->body_cost_vec.release ();
92345349 1194 min->body_cost_vec = body_cost_vec;
720f5239
IR
1195 min->peel_info.dr = elem->dr;
1196 min->peel_info.npeel = elem->npeel;
1197 }
92345349 1198 else
9771b263 1199 body_cost_vec.release ();
720f5239
IR
1200
1201 return 1;
1202}
1203
1204
1205/* Choose best peeling option by traversing peeling hash table and either
1206 choosing an option with the lowest cost (if cost model is enabled) or the
1207 option that aligns as many accesses as possible. */
1208
1209static struct data_reference *
1210vect_peeling_hash_choose_best_peeling (loop_vec_info loop_vinfo,
c3e7ee41 1211 unsigned int *npeel,
92345349 1212 stmt_vector_for_cost *body_cost_vec)
720f5239
IR
1213{
1214 struct _vect_peel_extended_info res;
1215
1216 res.peel_info.dr = NULL;
c3284718 1217 res.body_cost_vec = stmt_vector_for_cost ();
720f5239 1218
8b5e1202 1219 if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
720f5239
IR
1220 {
1221 res.inside_cost = INT_MAX;
1222 res.outside_cost = INT_MAX;
bf190e8d 1223 LOOP_VINFO_PEELING_HTAB (loop_vinfo)
c203e8a7
TS
1224 ->traverse <_vect_peel_extended_info *,
1225 vect_peeling_hash_get_lowest_cost> (&res);
720f5239
IR
1226 }
1227 else
1228 {
1229 res.peel_info.count = 0;
bf190e8d 1230 LOOP_VINFO_PEELING_HTAB (loop_vinfo)
c203e8a7
TS
1231 ->traverse <_vect_peel_extended_info *,
1232 vect_peeling_hash_get_most_frequent> (&res);
720f5239
IR
1233 }
1234
1235 *npeel = res.peel_info.npeel;
92345349 1236 *body_cost_vec = res.body_cost_vec;
720f5239
IR
1237 return res.peel_info.dr;
1238}
1239
1240
ebfd146a
IR
1241/* Function vect_enhance_data_refs_alignment
1242
1243 This pass will use loop versioning and loop peeling in order to enhance
1244 the alignment of data references in the loop.
1245
1246 FOR NOW: we assume that whatever versioning/peeling takes place, only the
ff802fa1 1247 original loop is to be vectorized. Any other loops that are created by
ebfd146a 1248 the transformations performed in this pass - are not supposed to be
ff802fa1 1249 vectorized. This restriction will be relaxed.
ebfd146a
IR
1250
1251 This pass will require a cost model to guide it whether to apply peeling
ff802fa1 1252 or versioning or a combination of the two. For example, the scheme that
ebfd146a
IR
1253 intel uses when given a loop with several memory accesses, is as follows:
1254 choose one memory access ('p') which alignment you want to force by doing
ff802fa1 1255 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
ebfd146a
IR
1256 other accesses are not necessarily aligned, or (2) use loop versioning to
1257 generate one loop in which all accesses are aligned, and another loop in
1258 which only 'p' is necessarily aligned.
1259
1260 ("Automatic Intra-Register Vectorization for the Intel Architecture",
1261 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Ximmin Tian, International
1262 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1263
ff802fa1 1264 Devising a cost model is the most critical aspect of this work. It will
ebfd146a 1265 guide us on which access to peel for, whether to use loop versioning, how
ff802fa1 1266 many versions to create, etc. The cost model will probably consist of
ebfd146a
IR
1267 generic considerations as well as target specific considerations (on
1268 powerpc for example, misaligned stores are more painful than misaligned
1269 loads).
1270
1271 Here are the general steps involved in alignment enhancements:
1272
1273 -- original loop, before alignment analysis:
1274 for (i=0; i<N; i++){
1275 x = q[i]; # DR_MISALIGNMENT(q) = unknown
1276 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1277 }
1278
1279 -- After vect_compute_data_refs_alignment:
1280 for (i=0; i<N; i++){
1281 x = q[i]; # DR_MISALIGNMENT(q) = 3
1282 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1283 }
1284
1285 -- Possibility 1: we do loop versioning:
1286 if (p is aligned) {
1287 for (i=0; i<N; i++){ # loop 1A
1288 x = q[i]; # DR_MISALIGNMENT(q) = 3
1289 p[i] = y; # DR_MISALIGNMENT(p) = 0
1290 }
1291 }
1292 else {
1293 for (i=0; i<N; i++){ # loop 1B
1294 x = q[i]; # DR_MISALIGNMENT(q) = 3
1295 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1296 }
1297 }
1298
1299 -- Possibility 2: we do loop peeling:
1300 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1301 x = q[i];
1302 p[i] = y;
1303 }
1304 for (i = 3; i < N; i++){ # loop 2A
1305 x = q[i]; # DR_MISALIGNMENT(q) = 0
1306 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1307 }
1308
1309 -- Possibility 3: combination of loop peeling and versioning:
1310 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1311 x = q[i];
1312 p[i] = y;
1313 }
1314 if (p is aligned) {
1315 for (i = 3; i<N; i++){ # loop 3A
1316 x = q[i]; # DR_MISALIGNMENT(q) = 0
1317 p[i] = y; # DR_MISALIGNMENT(p) = 0
1318 }
1319 }
1320 else {
1321 for (i = 3; i<N; i++){ # loop 3B
1322 x = q[i]; # DR_MISALIGNMENT(q) = 0
1323 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1324 }
1325 }
1326
ff802fa1 1327 These loops are later passed to loop_transform to be vectorized. The
ebfd146a
IR
1328 vectorizer will use the alignment information to guide the transformation
1329 (whether to generate regular loads/stores, or with special handling for
1330 misalignment). */
1331
1332bool
1333vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1334{
9771b263 1335 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
ebfd146a
IR
1336 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1337 enum dr_alignment_support supportable_dr_alignment;
720f5239 1338 struct data_reference *dr0 = NULL, *first_store = NULL;
ebfd146a 1339 struct data_reference *dr;
720f5239 1340 unsigned int i, j;
ebfd146a
IR
1341 bool do_peeling = false;
1342 bool do_versioning = false;
1343 bool stat;
1344 gimple stmt;
1345 stmt_vec_info stmt_info;
720f5239
IR
1346 unsigned int npeel = 0;
1347 bool all_misalignments_unknown = true;
1348 unsigned int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1349 unsigned possible_npeel_number = 1;
1350 tree vectype;
1351 unsigned int nelements, mis, same_align_drs_max = 0;
c3284718 1352 stmt_vector_for_cost body_cost_vec = stmt_vector_for_cost ();
ebfd146a 1353
73fbfcad 1354 if (dump_enabled_p ())
78c60e3d 1355 dump_printf_loc (MSG_NOTE, vect_location,
e645e942 1356 "=== vect_enhance_data_refs_alignment ===\n");
ebfd146a
IR
1357
1358 /* While cost model enhancements are expected in the future, the high level
1359 view of the code at this time is as follows:
1360
673beced
RE
1361 A) If there is a misaligned access then see if peeling to align
1362 this access can make all data references satisfy
8f439681
RE
1363 vect_supportable_dr_alignment. If so, update data structures
1364 as needed and return true.
ebfd146a
IR
1365
1366 B) If peeling wasn't possible and there is a data reference with an
1367 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1368 then see if loop versioning checks can be used to make all data
1369 references satisfy vect_supportable_dr_alignment. If so, update
1370 data structures as needed and return true.
1371
1372 C) If neither peeling nor versioning were successful then return false if
1373 any data reference does not satisfy vect_supportable_dr_alignment.
1374
1375 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1376
1377 Note, Possibility 3 above (which is peeling and versioning together) is not
1378 being done at this time. */
1379
1380 /* (1) Peeling to force alignment. */
1381
1382 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1383 Considerations:
1384 + How many accesses will become aligned due to the peeling
1385 - How many accesses will become unaligned due to the peeling,
1386 and the cost of misaligned accesses.
b8698a0f 1387 - The cost of peeling (the extra runtime checks, the increase
720f5239 1388 in code size). */
ebfd146a 1389
9771b263 1390 FOR_EACH_VEC_ELT (datarefs, i, dr)
ebfd146a
IR
1391 {
1392 stmt = DR_STMT (dr);
1393 stmt_info = vinfo_for_stmt (stmt);
1394
38eec4c6 1395 if (!STMT_VINFO_RELEVANT_P (stmt_info))
39becbac
RG
1396 continue;
1397
ebfd146a
IR
1398 /* For interleaving, only the alignment of the first access
1399 matters. */
0d0293ac 1400 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
e14c1050 1401 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
ebfd146a
IR
1402 continue;
1403
39becbac
RG
1404 /* For invariant accesses there is nothing to enhance. */
1405 if (integer_zerop (DR_STEP (dr)))
1406 continue;
1407
319e6439
RG
1408 /* Strided loads perform only component accesses, alignment is
1409 irrelevant for them. */
1410 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
1411 continue;
1412
720f5239
IR
1413 supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1414 do_peeling = vector_alignment_reachable_p (dr);
1415 if (do_peeling)
ebfd146a 1416 {
720f5239
IR
1417 if (known_alignment_for_access_p (dr))
1418 {
1419 unsigned int npeel_tmp;
d8ba5b19
RG
1420 bool negative = tree_int_cst_compare (DR_STEP (dr),
1421 size_zero_node) < 0;
720f5239
IR
1422
1423 /* Save info about DR in the hash table. */
c203e8a7
TS
1424 if (!LOOP_VINFO_PEELING_HTAB (loop_vinfo))
1425 LOOP_VINFO_PEELING_HTAB (loop_vinfo)
1426 = new hash_table<peel_info_hasher> (1);
720f5239
IR
1427
1428 vectype = STMT_VINFO_VECTYPE (stmt_info);
1429 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1430 mis = DR_MISALIGNMENT (dr) / GET_MODE_SIZE (TYPE_MODE (
1431 TREE_TYPE (DR_REF (dr))));
d8ba5b19 1432 npeel_tmp = (negative
8b8bba2d
RG
1433 ? (mis - nelements) : (nelements - mis))
1434 & (nelements - 1);
720f5239
IR
1435
1436 /* For multiple types, it is possible that the bigger type access
ff802fa1 1437 will have more than one peeling option. E.g., a loop with two
720f5239 1438 types: one of size (vector size / 4), and the other one of
ff802fa1 1439 size (vector size / 8). Vectorization factor will 8. If both
720f5239 1440 access are misaligned by 3, the first one needs one scalar
ff802fa1 1441 iteration to be aligned, and the second one needs 5. But the
720f5239
IR
1442 the first one will be aligned also by peeling 5 scalar
1443 iterations, and in that case both accesses will be aligned.
1444 Hence, except for the immediate peeling amount, we also want
1445 to try to add full vector size, while we don't exceed
1446 vectorization factor.
1447 We do this automtically for cost model, since we calculate cost
1448 for every peeling option. */
8b5e1202 1449 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
720f5239
IR
1450 possible_npeel_number = vf /nelements;
1451
1452 /* Handle the aligned case. We may decide to align some other
1453 access, making DR unaligned. */
1454 if (DR_MISALIGNMENT (dr) == 0)
1455 {
1456 npeel_tmp = 0;
8b5e1202 1457 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
720f5239
IR
1458 possible_npeel_number++;
1459 }
1460
1461 for (j = 0; j < possible_npeel_number; j++)
1462 {
1463 gcc_assert (npeel_tmp <= vf);
1464 vect_peeling_hash_insert (loop_vinfo, dr, npeel_tmp);
1465 npeel_tmp += nelements;
1466 }
1467
1468 all_misalignments_unknown = false;
1469 /* Data-ref that was chosen for the case that all the
1470 misalignments are unknown is not relevant anymore, since we
1471 have a data-ref with known alignment. */
1472 dr0 = NULL;
1473 }
1474 else
1475 {
4ba5ea11
RB
1476 /* If we don't know any misalignment values, we prefer
1477 peeling for data-ref that has the maximum number of data-refs
720f5239
IR
1478 with the same alignment, unless the target prefers to align
1479 stores over load. */
1480 if (all_misalignments_unknown)
1481 {
4ba5ea11
RB
1482 unsigned same_align_drs
1483 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1484 if (!dr0
1485 || same_align_drs_max < same_align_drs)
720f5239 1486 {
4ba5ea11 1487 same_align_drs_max = same_align_drs;
720f5239
IR
1488 dr0 = dr;
1489 }
4ba5ea11
RB
1490 /* For data-refs with the same number of related
1491 accesses prefer the one where the misalign
1492 computation will be invariant in the outermost loop. */
1493 else if (same_align_drs_max == same_align_drs)
1494 {
1495 struct loop *ivloop0, *ivloop;
1496 ivloop0 = outermost_invariant_loop_for_expr
1497 (loop, DR_BASE_ADDRESS (dr0));
1498 ivloop = outermost_invariant_loop_for_expr
1499 (loop, DR_BASE_ADDRESS (dr));
1500 if ((ivloop && !ivloop0)
1501 || (ivloop && ivloop0
1502 && flow_loop_nested_p (ivloop, ivloop0)))
1503 dr0 = dr;
1504 }
720f5239 1505
b0af49c4 1506 if (!first_store && DR_IS_WRITE (dr))
720f5239
IR
1507 first_store = dr;
1508 }
1509
1510 /* If there are both known and unknown misaligned accesses in the
1511 loop, we choose peeling amount according to the known
1512 accesses. */
720f5239
IR
1513 if (!supportable_dr_alignment)
1514 {
1515 dr0 = dr;
b0af49c4 1516 if (!first_store && DR_IS_WRITE (dr))
720f5239
IR
1517 first_store = dr;
1518 }
1519 }
1520 }
1521 else
1522 {
1523 if (!aligned_access_p (dr))
1524 {
73fbfcad 1525 if (dump_enabled_p ())
e645e942
TJ
1526 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1527 "vector alignment may not be reachable\n");
720f5239
IR
1528 break;
1529 }
1530 }
ebfd146a
IR
1531 }
1532
afb119be
RB
1533 /* Check if we can possibly peel the loop. */
1534 if (!vect_can_advance_ivs_p (loop_vinfo)
ebfd146a
IR
1535 || !slpeel_can_duplicate_loop_p (loop, single_exit (loop)))
1536 do_peeling = false;
1537
b1aef01e
RB
 1538 /* If we don't know how many times the peeling loop will run,
1539 assume it will run VF-1 times and disable peeling if the remaining
1540 iters are less than the vectorization factor. */
1541 if (do_peeling
1542 && all_misalignments_unknown
1543 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1544 && (LOOP_VINFO_INT_NITERS (loop_vinfo)
1545 < 2 * (unsigned) LOOP_VINFO_VECT_FACTOR (loop_vinfo) - 1))
1546 do_peeling = false;
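 /* E.g., with vf = 4 and 6 known iterations, peeling up to vf - 1 = 3
 iterations could leave only 3 < vf iterations for the vector loop, so
 peeling is disabled; from 7 iterations on at least one full vector
 iteration always remains. */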
1547
1548 if (do_peeling
1549 && all_misalignments_unknown
720f5239
IR
1550 && vect_supportable_dr_alignment (dr0, false))
1551 {
720f5239
IR
 1552 /* Check if the target prefers to align stores over loads, i.e., if
1553 misaligned stores are more expensive than misaligned loads (taking
1554 drs with same alignment into account). */
1555 if (first_store && DR_IS_READ (dr0))
1556 {
1557 unsigned int load_inside_cost = 0, load_outside_cost = 0;
1558 unsigned int store_inside_cost = 0, store_outside_cost = 0;
1559 unsigned int load_inside_penalty = 0, load_outside_penalty = 0;
1560 unsigned int store_inside_penalty = 0, store_outside_penalty = 0;
9771b263
DN
1561 stmt_vector_for_cost dummy;
1562 dummy.create (2);
92345349
BS
1563
1564 vect_get_data_access_cost (dr0, &load_inside_cost, &load_outside_cost,
1565 &dummy);
1566 vect_get_data_access_cost (first_store, &store_inside_cost,
1567 &store_outside_cost, &dummy);
720f5239 1568
9771b263 1569 dummy.release ();
720f5239
IR
1570
1571 /* Calculate the penalty for leaving FIRST_STORE unaligned (by
1572 aligning the load DR0). */
1573 load_inside_penalty = store_inside_cost;
1574 load_outside_penalty = store_outside_cost;
9771b263
DN
1575 for (i = 0;
1576 STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1577 DR_STMT (first_store))).iterate (i, &dr);
720f5239
IR
1578 i++)
1579 if (DR_IS_READ (dr))
1580 {
1581 load_inside_penalty += load_inside_cost;
1582 load_outside_penalty += load_outside_cost;
1583 }
1584 else
1585 {
1586 load_inside_penalty += store_inside_cost;
1587 load_outside_penalty += store_outside_cost;
1588 }
1589
1590 /* Calculate the penalty for leaving DR0 unaligned (by
1591 aligning the FIRST_STORE). */
1592 store_inside_penalty = load_inside_cost;
1593 store_outside_penalty = load_outside_cost;
9771b263
DN
1594 for (i = 0;
1595 STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1596 DR_STMT (dr0))).iterate (i, &dr);
720f5239
IR
1597 i++)
1598 if (DR_IS_READ (dr))
1599 {
1600 store_inside_penalty += load_inside_cost;
1601 store_outside_penalty += load_outside_cost;
1602 }
1603 else
1604 {
1605 store_inside_penalty += store_inside_cost;
1606 store_outside_penalty += store_outside_cost;
1607 }
1608
1609 if (load_inside_penalty > store_inside_penalty
1610 || (load_inside_penalty == store_inside_penalty
1611 && load_outside_penalty > store_outside_penalty))
1612 dr0 = first_store;
1613 }
1614
1615 /* In case there are only loads with different unknown misalignments, use
1616 peeling only if it may help to align other accesses in the loop. */
9771b263
DN
1617 if (!first_store
1618 && !STMT_VINFO_SAME_ALIGN_REFS (
1619 vinfo_for_stmt (DR_STMT (dr0))).length ()
720f5239
IR
1620 && vect_supportable_dr_alignment (dr0, false)
1621 != dr_unaligned_supported)
1622 do_peeling = false;
1623 }
1624
1625 if (do_peeling && !dr0)
1626 {
 1627 /* Peeling is possible, but every data access is supported even when it is
 1628 unaligned. So we try to choose the best possible peeling. */
1629
1630 /* We should get here only if there are drs with known misalignment. */
1631 gcc_assert (!all_misalignments_unknown);
1632
1633 /* Choose the best peeling from the hash table. */
c3e7ee41 1634 dr0 = vect_peeling_hash_choose_best_peeling (loop_vinfo, &npeel,
92345349 1635 &body_cost_vec);
720f5239
IR
1636 if (!dr0 || !npeel)
1637 do_peeling = false;
b1aef01e
RB
1638
1639 /* If peeling by npeel will result in a remaining loop not iterating
1640 enough to be vectorized then do not peel. */
1641 if (do_peeling
1642 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1643 && (LOOP_VINFO_INT_NITERS (loop_vinfo)
1644 < LOOP_VINFO_VECT_FACTOR (loop_vinfo) + npeel))
1645 do_peeling = false;
720f5239
IR
1646 }
1647
ebfd146a
IR
1648 if (do_peeling)
1649 {
720f5239
IR
1650 stmt = DR_STMT (dr0);
1651 stmt_info = vinfo_for_stmt (stmt);
1652 vectype = STMT_VINFO_VECTYPE (stmt_info);
1653 nelements = TYPE_VECTOR_SUBPARTS (vectype);
ebfd146a
IR
1654
1655 if (known_alignment_for_access_p (dr0))
1656 {
d8ba5b19
RG
1657 bool negative = tree_int_cst_compare (DR_STEP (dr0),
1658 size_zero_node) < 0;
720f5239
IR
1659 if (!npeel)
1660 {
1661 /* Since it's known at compile time, compute the number of
1662 iterations in the peeled loop (the peeling factor) for use in
1663 updating DR_MISALIGNMENT values. The peeling factor is the
1664 vectorization factor minus the misalignment as an element
1665 count. */
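 /* E.g., for V4SI and a known misalignment of 8 bytes, MIS is
 8 / 4 = 2 elements and NPEEL becomes (4 - 2) & 3 = 2 scalar
 iterations. */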
1666 mis = DR_MISALIGNMENT (dr0);
1667 mis /= GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr0))));
8b8bba2d
RG
1668 npeel = ((negative ? mis - nelements : nelements - mis)
1669 & (nelements - 1));
720f5239 1670 }
ebfd146a 1671
b8698a0f 1672 /* For interleaved data access every iteration accesses all the
ebfd146a
IR
1673 members of the group, therefore we divide the number of iterations
1674 by the group size. */
b8698a0f 1675 stmt_info = vinfo_for_stmt (DR_STMT (dr0));
0d0293ac 1676 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
e14c1050 1677 npeel /= GROUP_SIZE (stmt_info);
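 /* E.g., for a group of two accesses vectorized with V4SI, a peeling
 amount of 2 elements corresponds to 2 / 2 = 1 scalar iteration,
 since each scalar iteration accesses both group members. */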
ebfd146a 1678
73fbfcad 1679 if (dump_enabled_p ())
78c60e3d 1680 dump_printf_loc (MSG_NOTE, vect_location,
e645e942 1681 "Try peeling by %d\n", npeel);
ebfd146a
IR
1682 }
1683
1684 /* Ensure that all data refs can be vectorized after the peel. */
9771b263 1685 FOR_EACH_VEC_ELT (datarefs, i, dr)
ebfd146a
IR
1686 {
1687 int save_misalignment;
1688
1689 if (dr == dr0)
1690 continue;
1691
1692 stmt = DR_STMT (dr);
1693 stmt_info = vinfo_for_stmt (stmt);
1694 /* For interleaving, only the alignment of the first access
1695 matters. */
0d0293ac 1696 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
e14c1050 1697 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
ebfd146a
IR
1698 continue;
1699
319e6439
RG
 1700 /* Strided loads perform only component accesses; alignment is
1701 irrelevant for them. */
1702 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
1703 continue;
1704
ebfd146a
IR
1705 save_misalignment = DR_MISALIGNMENT (dr);
1706 vect_update_misalignment_for_peel (dr, dr0, npeel);
720f5239 1707 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
ebfd146a 1708 SET_DR_MISALIGNMENT (dr, save_misalignment);
b8698a0f 1709
ebfd146a
IR
1710 if (!supportable_dr_alignment)
1711 {
1712 do_peeling = false;
1713 break;
1714 }
1715 }
1716
720f5239
IR
1717 if (do_peeling && known_alignment_for_access_p (dr0) && npeel == 0)
1718 {
1719 stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1720 if (!stat)
1721 do_peeling = false;
1722 else
c7e62a26 1723 {
9771b263 1724 body_cost_vec.release ();
c7e62a26
RG
1725 return stat;
1726 }
720f5239
IR
1727 }
1728
4f17aa0b
XDL
1729 if (do_peeling)
1730 {
1731 unsigned max_allowed_peel
1732 = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT);
1733 if (max_allowed_peel != (unsigned)-1)
1734 {
1735 unsigned max_peel = npeel;
1736 if (max_peel == 0)
1737 {
1738 gimple dr_stmt = DR_STMT (dr0);
1739 stmt_vec_info vinfo = vinfo_for_stmt (dr_stmt);
1740 tree vtype = STMT_VINFO_VECTYPE (vinfo);
1741 max_peel = TYPE_VECTOR_SUBPARTS (vtype) - 1;
1742 }
1743 if (max_peel > max_allowed_peel)
1744 {
1745 do_peeling = false;
1746 if (dump_enabled_p ())
1747 dump_printf_loc (MSG_NOTE, vect_location,
1748 "Disable peeling, max peels reached: %d\n", max_peel);
1749 }
1750 }
1751 }
1752
ebfd146a
IR
1753 if (do_peeling)
1754 {
c3e7ee41 1755 stmt_info_for_cost *si;
92345349 1756 void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
c3e7ee41 1757
ebfd146a
IR
1758 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
1759 If the misalignment of DR_i is identical to that of dr0 then set
1760 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
1761 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
1762 by the peeling factor times the element size of DR_i (MOD the
1763 vectorization factor times the size). Otherwise, the
1764 misalignment of DR_i must be set to unknown. */
9771b263 1765 FOR_EACH_VEC_ELT (datarefs, i, dr)
ebfd146a
IR
1766 if (dr != dr0)
1767 vect_update_misalignment_for_peel (dr, dr0, npeel);
1768
1769 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0;
720f5239 1770 if (npeel)
15e693cc 1771 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
720f5239 1772 else
15e693cc
RB
1773 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1774 = DR_MISALIGNMENT (dr0);
ebfd146a 1775 SET_DR_MISALIGNMENT (dr0, 0);
73fbfcad 1776 if (dump_enabled_p ())
78c60e3d
SS
1777 {
1778 dump_printf_loc (MSG_NOTE, vect_location,
e645e942 1779 "Alignment of access forced using peeling.\n");
78c60e3d 1780 dump_printf_loc (MSG_NOTE, vect_location,
e645e942 1781 "Peeling for alignment will be applied.\n");
78c60e3d 1782 }
c3e7ee41
BS
1783 /* We've delayed passing the inside-loop peeling costs to the
1784 target cost model until we were sure peeling would happen.
1785 Do so now. */
9771b263 1786 if (body_cost_vec.exists ())
c3e7ee41 1787 {
9771b263 1788 FOR_EACH_VEC_ELT (body_cost_vec, i, si)
92345349
BS
1789 {
1790 struct _stmt_vec_info *stmt_info
1791 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
1792 (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
1793 si->misalign, vect_body);
1794 }
9771b263 1795 body_cost_vec.release ();
c3e7ee41
BS
1796 }
1797
a70d6342 1798 stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
ebfd146a
IR
1799 gcc_assert (stat);
1800 return stat;
1801 }
1802 }
1803
9771b263 1804 body_cost_vec.release ();
ebfd146a
IR
1805
1806 /* (2) Versioning to force alignment. */
1807
1808 /* Try versioning if:
d6d11272
XDL
1809 1) optimize loop for speed
1810 2) there is at least one unsupported misaligned data ref with an unknown
ebfd146a 1811 misalignment, and
d6d11272
XDL
1812 3) all misaligned data refs with a known misalignment are supported, and
1813 4) the number of runtime alignment checks is within reason. */
ebfd146a 1814
b8698a0f 1815 do_versioning =
d6d11272 1816 optimize_loop_nest_for_speed_p (loop)
ebfd146a
IR
1817 && (!loop->inner); /* FORNOW */
1818
1819 if (do_versioning)
1820 {
9771b263 1821 FOR_EACH_VEC_ELT (datarefs, i, dr)
ebfd146a
IR
1822 {
1823 stmt = DR_STMT (dr);
1824 stmt_info = vinfo_for_stmt (stmt);
1825
1826 /* For interleaving, only the alignment of the first access
1827 matters. */
1828 if (aligned_access_p (dr)
0d0293ac 1829 || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
e14c1050 1830 && GROUP_FIRST_ELEMENT (stmt_info) != stmt))
ebfd146a
IR
1831 continue;
1832
319e6439
RG
 1833 /* Strided loads perform only component accesses; alignment is
1834 irrelevant for them. */
1835 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
1836 continue;
1837
720f5239 1838 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
ebfd146a
IR
1839
1840 if (!supportable_dr_alignment)
1841 {
1842 gimple stmt;
1843 int mask;
1844 tree vectype;
1845
1846 if (known_alignment_for_access_p (dr)
9771b263 1847 || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
ebfd146a
IR
1848 >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
1849 {
1850 do_versioning = false;
1851 break;
1852 }
1853
1854 stmt = DR_STMT (dr);
1855 vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
1856 gcc_assert (vectype);
b8698a0f 1857
ebfd146a
IR
1858 /* The rightmost bits of an aligned address must be zeros.
1859 Construct the mask needed for this test. For example,
1860 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
1861 mask must be 15 = 0xf. */
1862 mask = GET_MODE_SIZE (TYPE_MODE (vectype)) - 1;
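 /* The runtime check generated for the versioned loop then effectively
 tests (addr & mask) == 0 for every data reference recorded below. */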
1863
1864 /* FORNOW: use the same mask to test all potentially unaligned
1865 references in the loop. The vectorizer currently supports
1866 a single vector size, see the reference to
1867 GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
1868 vectorization factor is computed. */
1869 gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
1870 || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
1871 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
9771b263
DN
1872 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (
1873 DR_STMT (dr));
ebfd146a
IR
1874 }
1875 }
b8698a0f 1876
ebfd146a 1877 /* Versioning requires at least one misaligned data reference. */
e9dbe7bb 1878 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
ebfd146a
IR
1879 do_versioning = false;
1880 else if (!do_versioning)
9771b263 1881 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
ebfd146a
IR
1882 }
1883
1884 if (do_versioning)
1885 {
9771b263 1886 vec<gimple> may_misalign_stmts
ebfd146a
IR
1887 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
1888 gimple stmt;
1889
1890 /* It can now be assumed that the data references in the statements
1891 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
1892 of the loop being vectorized. */
9771b263 1893 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt)
ebfd146a
IR
1894 {
1895 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1896 dr = STMT_VINFO_DATA_REF (stmt_info);
1897 SET_DR_MISALIGNMENT (dr, 0);
73fbfcad 1898 if (dump_enabled_p ())
e645e942
TJ
1899 dump_printf_loc (MSG_NOTE, vect_location,
1900 "Alignment of access forced using versioning.\n");
ebfd146a
IR
1901 }
1902
73fbfcad 1903 if (dump_enabled_p ())
e645e942
TJ
1904 dump_printf_loc (MSG_NOTE, vect_location,
1905 "Versioning for alignment will be applied.\n");
ebfd146a
IR
1906
1907 /* Peeling and versioning can't be done together at this time. */
1908 gcc_assert (! (do_peeling && do_versioning));
1909
a70d6342 1910 stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
ebfd146a
IR
1911 gcc_assert (stat);
1912 return stat;
1913 }
1914
1915 /* This point is reached if neither peeling nor versioning is being done. */
1916 gcc_assert (! (do_peeling || do_versioning));
1917
a70d6342 1918 stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
ebfd146a
IR
1919 return stat;
1920}
1921
1922
777e1f09
RG
1923/* Function vect_find_same_alignment_drs.
1924
1925 Update group and alignment relations according to the chosen
1926 vectorization factor. */
1927
1928static void
1929vect_find_same_alignment_drs (struct data_dependence_relation *ddr,
1930 loop_vec_info loop_vinfo)
1931{
1932 unsigned int i;
1933 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1934 int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1935 struct data_reference *dra = DDR_A (ddr);
1936 struct data_reference *drb = DDR_B (ddr);
1937 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
1938 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
1939 int dra_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dra))));
1940 int drb_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (drb))));
1941 lambda_vector dist_v;
1942 unsigned int loop_depth;
1943
1944 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
1945 return;
1946
720f5239 1947 if (dra == drb)
777e1f09
RG
1948 return;
1949
1950 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
1951 return;
1952
1953 /* Loop-based vectorization and known data dependence. */
1954 if (DDR_NUM_DIST_VECTS (ddr) == 0)
1955 return;
1956
46241ea9
RG
1957 /* Data-dependence analysis reports a distance vector of zero
1958 for data-references that overlap only in the first iteration
 1959 but have steps of different sign (see PR45764).
1960 So as a sanity check require equal DR_STEP. */
1961 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
1962 return;
1963
777e1f09 1964 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
9771b263 1965 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
777e1f09
RG
1966 {
1967 int dist = dist_v[loop_depth];
1968
73fbfcad 1969 if (dump_enabled_p ())
78c60e3d 1970 dump_printf_loc (MSG_NOTE, vect_location,
e645e942 1971 "dependence distance = %d.\n", dist);
777e1f09
RG
1972
1973 /* Same loop iteration. */
1974 if (dist == 0
1975 || (dist % vectorization_factor == 0 && dra_size == drb_size))
1976 {
1977 /* Two references with distance zero have the same alignment. */
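 /* E.g., with vf = 4, references a[i] and a[i+4] (distance 4) hit
 the same alignment in every vector iteration. */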
9771b263
DN
1978 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
1979 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
73fbfcad 1980 if (dump_enabled_p ())
777e1f09 1981 {
e645e942
TJ
1982 dump_printf_loc (MSG_NOTE, vect_location,
1983 "accesses have the same alignment.\n");
78c60e3d 1984 dump_printf (MSG_NOTE,
e645e942 1985 "dependence distance modulo vf == 0 between ");
78c60e3d
SS
1986 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
1987 dump_printf (MSG_NOTE, " and ");
1988 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
e645e942 1989 dump_printf (MSG_NOTE, "\n");
777e1f09
RG
1990 }
1991 }
1992 }
1993}
1994
1995
ebfd146a
IR
1996/* Function vect_analyze_data_refs_alignment
1997
1998 Analyze the alignment of the data-references in the loop.
1999 Return FALSE if a data reference is found that cannot be vectorized. */
2000
2001bool
b8698a0f 2002vect_analyze_data_refs_alignment (loop_vec_info loop_vinfo,
a70d6342 2003 bb_vec_info bb_vinfo)
ebfd146a 2004{
73fbfcad 2005 if (dump_enabled_p ())
78c60e3d 2006 dump_printf_loc (MSG_NOTE, vect_location,
e645e942 2007 "=== vect_analyze_data_refs_alignment ===\n");
ebfd146a 2008
777e1f09
RG
2009 /* Mark groups of data references with same alignment using
2010 data dependence information. */
2011 if (loop_vinfo)
2012 {
9771b263 2013 vec<ddr_p> ddrs = LOOP_VINFO_DDRS (loop_vinfo);
777e1f09
RG
2014 struct data_dependence_relation *ddr;
2015 unsigned int i;
2016
9771b263 2017 FOR_EACH_VEC_ELT (ddrs, i, ddr)
777e1f09
RG
2018 vect_find_same_alignment_drs (ddr, loop_vinfo);
2019 }
2020
a70d6342 2021 if (!vect_compute_data_refs_alignment (loop_vinfo, bb_vinfo))
ebfd146a 2022 {
73fbfcad 2023 if (dump_enabled_p ())
e645e942
TJ
2024 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2025 "not vectorized: can't calculate alignment "
2026 "for data ref.\n");
ebfd146a
IR
2027 return false;
2028 }
2029
2030 return true;
2031}
2032
2033
0d0293ac
MM
2034/* Analyze groups of accesses: check that DR belongs to a group of
2035 accesses of legal size, step, etc. Detect gaps, single element
2036 interleaving, and other special cases. Set grouped access info.
ebfd146a
IR
2037 Collect groups of strided stores for further use in SLP analysis. */
2038
2039static bool
2040vect_analyze_group_access (struct data_reference *dr)
2041{
2042 tree step = DR_STEP (dr);
2043 tree scalar_type = TREE_TYPE (DR_REF (dr));
2044 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2045 gimple stmt = DR_STMT (dr);
2046 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2047 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
a70d6342 2048 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
ebfd146a 2049 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
0d0293ac 2050 HOST_WIDE_INT groupsize, last_accessed_element = 1;
ebfd146a 2051 bool slp_impossible = false;
deaf836c
IR
2052 struct loop *loop = NULL;
2053
2054 if (loop_vinfo)
2055 loop = LOOP_VINFO_LOOP (loop_vinfo);
ebfd146a 2056
0d0293ac
MM
2057 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2058 size of the interleaving group (including gaps). */
08940f33 2059 groupsize = absu_hwi (dr_step) / type_size;
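 /* E.g., 4-byte accesses a[4*i], a[4*i+1] and a[4*i+2] each have
 DR_STEP 16, so GROUPSIZE is 16 / 4 = 4 even though only three
 members are present (one gap). */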
ebfd146a
IR
2060
 2061 /* A non-consecutive access is possible only if it is part of an interleaving group. */
e14c1050 2062 if (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
ebfd146a
IR
2063 {
 2064 /* Check if this DR is a part of an interleaving group, and is a single
2065 element of the group that is accessed in the loop. */
b8698a0f 2066
ebfd146a
IR
2067 /* Gaps are supported only for loads. STEP must be a multiple of the type
2068 size. The size of the group must be a power of 2. */
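 /* E.g., a lone load of a[4*i] (DR_STEP 16, 4-byte elements) forms a
 single-element group of size 4 and is accepted here, at the cost of
 a scalar epilogue loop for the gaps. */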
2069 if (DR_IS_READ (dr)
2070 && (dr_step % type_size) == 0
0d0293ac
MM
2071 && groupsize > 0
2072 && exact_log2 (groupsize) != -1)
ebfd146a 2073 {
e14c1050 2074 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = stmt;
0d0293ac 2075 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
73fbfcad 2076 if (dump_enabled_p ())
ebfd146a 2077 {
e645e942
TJ
2078 dump_printf_loc (MSG_NOTE, vect_location,
2079 "Detected single element interleaving ");
78c60e3d
SS
2080 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2081 dump_printf (MSG_NOTE, " step ");
2082 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
e645e942 2083 dump_printf (MSG_NOTE, "\n");
ebfd146a 2084 }
48df3fa6
IR
2085
2086 if (loop_vinfo)
2087 {
73fbfcad 2088 if (dump_enabled_p ())
78c60e3d 2089 dump_printf_loc (MSG_NOTE, vect_location,
e645e942
TJ
2090 "Data access with gaps requires scalar "
2091 "epilogue loop\n");
deaf836c
IR
2092 if (loop->inner)
2093 {
73fbfcad 2094 if (dump_enabled_p ())
78c60e3d
SS
2095 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2096 "Peeling for outer loop is not"
e645e942 2097 " supported\n");
deaf836c
IR
2098 return false;
2099 }
2100
2101 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = true;
48df3fa6
IR
2102 }
2103
ebfd146a
IR
2104 return true;
2105 }
4b5caab7 2106
73fbfcad 2107 if (dump_enabled_p ())
4b5caab7 2108 {
78c60e3d 2109 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
e645e942
TJ
2110 "not consecutive access ");
2111 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2112 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
4b5caab7
IR
2113 }
2114
2115 if (bb_vinfo)
2116 {
2117 /* Mark the statement as unvectorizable. */
2118 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2119 return true;
2120 }
78c60e3d 2121
ebfd146a
IR
2122 return false;
2123 }
2124
e14c1050 2125 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt)
ebfd146a
IR
2126 {
2127 /* First stmt in the interleaving chain. Check the chain. */
e14c1050 2128 gimple next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
ebfd146a 2129 struct data_reference *data_ref = dr;
df398a37 2130 unsigned int count = 1;
ebfd146a
IR
2131 tree prev_init = DR_INIT (data_ref);
2132 gimple prev = stmt;
08940f33
RB
2133 HOST_WIDE_INT diff, gaps = 0;
2134 unsigned HOST_WIDE_INT count_in_bytes;
ebfd146a
IR
2135
2136 while (next)
2137 {
ff802fa1
IR
 2138 /* Skip same data-refs. If two or more stmts share a
2139 data-ref (supported only for loads), we vectorize only the first
2140 stmt, and the rest get their vectorized loads from the first
2141 one. */
ebfd146a
IR
2142 if (!tree_int_cst_compare (DR_INIT (data_ref),
2143 DR_INIT (STMT_VINFO_DATA_REF (
2144 vinfo_for_stmt (next)))))
2145 {
b0af49c4 2146 if (DR_IS_WRITE (data_ref))
ebfd146a 2147 {
73fbfcad 2148 if (dump_enabled_p ())
e645e942
TJ
2149 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2150 "Two store stmts share the same dr.\n");
ebfd146a
IR
2151 return false;
2152 }
2153
ebfd146a 2154 /* For load use the same data-ref load. */
e14c1050 2155 GROUP_SAME_DR_STMT (vinfo_for_stmt (next)) = prev;
ebfd146a
IR
2156
2157 prev = next;
e14c1050 2158 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
ebfd146a
IR
2159 continue;
2160 }
48df3fa6 2161
ebfd146a 2162 prev = next;
08940f33 2163 data_ref = STMT_VINFO_DATA_REF (vinfo_for_stmt (next));
ebfd146a 2164
08940f33
RB
2165 /* All group members have the same STEP by construction. */
2166 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
ebfd146a 2167
ebfd146a
IR
2168 /* Check that the distance between two accesses is equal to the type
2169 size. Otherwise, we have gaps. */
2170 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2171 - TREE_INT_CST_LOW (prev_init)) / type_size;
2172 if (diff != 1)
2173 {
2174 /* FORNOW: SLP of accesses with gaps is not supported. */
2175 slp_impossible = true;
b0af49c4 2176 if (DR_IS_WRITE (data_ref))
ebfd146a 2177 {
73fbfcad 2178 if (dump_enabled_p ())
e645e942
TJ
2179 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2180 "interleaved store with gaps\n");
ebfd146a
IR
2181 return false;
2182 }
4da39468
IR
2183
2184 gaps += diff - 1;
ebfd146a
IR
2185 }
2186
48df3fa6
IR
2187 last_accessed_element += diff;
2188
ebfd146a 2189 /* Store the gap from the previous member of the group. If there is no
e14c1050
IR
2190 gap in the access, GROUP_GAP is always 1. */
2191 GROUP_GAP (vinfo_for_stmt (next)) = diff;
ebfd146a
IR
2192
2193 prev_init = DR_INIT (data_ref);
e14c1050 2194 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
ebfd146a
IR
2195 /* Count the number of data-refs in the chain. */
2196 count++;
2197 }
2198
 2199 /* COUNT is the number of accesses found; we multiply it by the size of
2200 the type to get COUNT_IN_BYTES. */
2201 count_in_bytes = type_size * count;
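 /* E.g., for 4-byte accesses a[4*i], a[4*i+1] and a[4*i+3] (step 16),
 COUNT is 3, COUNT_IN_BYTES is 12 and one interior gap was recorded,
 so 12 + 1 * 4 does not exceed the step and the checks below succeed
 for loads (a store with such gaps would be rejected). */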
2202
b8698a0f 2203 /* Check that the size of the interleaving (including gaps) is not
a70d6342 2204 greater than STEP. */
08940f33
RB
2205 if (dr_step != 0
2206 && absu_hwi (dr_step) < count_in_bytes + gaps * type_size)
ebfd146a 2207 {
73fbfcad 2208 if (dump_enabled_p ())
ebfd146a 2209 {
e645e942 2210 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
78c60e3d 2211 "interleaving size is greater than step for ");
e645e942
TJ
2212 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
2213 DR_REF (dr));
2214 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
ebfd146a
IR
2215 }
2216 return false;
2217 }
2218
2219 /* Check that the size of the interleaving is equal to STEP for stores,
2220 i.e., that there are no gaps. */
08940f33
RB
2221 if (dr_step != 0
2222 && absu_hwi (dr_step) != count_in_bytes)
ebfd146a
IR
2223 {
2224 if (DR_IS_READ (dr))
2225 {
2226 slp_impossible = true;
 2227 /* There is a gap after the last load in the group. This gap is the
0d0293ac
MM
2228 difference between the groupsize and the number of elements.
2229 When there is no gap, this difference should be 0. */
2230 GROUP_GAP (vinfo_for_stmt (stmt)) = groupsize - count;
ebfd146a
IR
2231 }
2232 else
2233 {
73fbfcad 2234 if (dump_enabled_p ())
e645e942
TJ
2235 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2236 "interleaved store with gaps\n");
ebfd146a
IR
2237 return false;
2238 }
2239 }
2240
2241 /* Check that STEP is a multiple of type size. */
08940f33
RB
2242 if (dr_step != 0
2243 && (dr_step % type_size) != 0)
ebfd146a 2244 {
73fbfcad 2245 if (dump_enabled_p ())
ebfd146a 2246 {
78c60e3d
SS
2247 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2248 "step is not a multiple of type size: step ");
2249 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, step);
2250 dump_printf (MSG_MISSED_OPTIMIZATION, " size ");
2251 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
2252 TYPE_SIZE_UNIT (scalar_type));
e645e942 2253 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
ebfd146a
IR
2254 }
2255 return false;
2256 }
2257
0d0293ac
MM
2258 if (groupsize == 0)
2259 groupsize = count;
b8698a0f 2260
0d0293ac 2261 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
73fbfcad 2262 if (dump_enabled_p ())
e645e942
TJ
2263 dump_printf_loc (MSG_NOTE, vect_location,
2264 "Detected interleaving of size %d\n", (int)groupsize);
ebfd146a 2265
b8698a0f 2266 /* SLP: create an SLP data structure for every interleaving group of
ebfd146a 2267 stores for further analysis in vect_analyze_slp. */
b0af49c4 2268 if (DR_IS_WRITE (dr) && !slp_impossible)
a70d6342
IR
2269 {
2270 if (loop_vinfo)
9771b263 2271 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt);
a70d6342 2272 if (bb_vinfo)
9771b263 2273 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt);
a70d6342 2274 }
48df3fa6
IR
2275
2276 /* There is a gap in the end of the group. */
0d0293ac 2277 if (groupsize - last_accessed_element > 0 && loop_vinfo)
48df3fa6 2278 {
73fbfcad 2279 if (dump_enabled_p ())
78c60e3d 2280 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
e645e942
TJ
2281 "Data access with gaps requires scalar "
2282 "epilogue loop\n");
deaf836c
IR
2283 if (loop->inner)
2284 {
73fbfcad 2285 if (dump_enabled_p ())
e645e942
TJ
2286 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2287 "Peeling for outer loop is not supported\n");
deaf836c
IR
2288 return false;
2289 }
2290
2291 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = true;
48df3fa6 2292 }
ebfd146a
IR
2293 }
2294
2295 return true;
2296}
2297
2298
2299/* Analyze the access pattern of the data-reference DR.
2300 In case of non-consecutive accesses call vect_analyze_group_access() to
0d0293ac 2301 analyze groups of accesses. */
ebfd146a
IR
2302
2303static bool
2304vect_analyze_data_ref_access (struct data_reference *dr)
2305{
2306 tree step = DR_STEP (dr);
2307 tree scalar_type = TREE_TYPE (DR_REF (dr));
2308 gimple stmt = DR_STMT (dr);
2309 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2310 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
a70d6342 2311 struct loop *loop = NULL;
ebfd146a 2312
a70d6342
IR
2313 if (loop_vinfo)
2314 loop = LOOP_VINFO_LOOP (loop_vinfo);
b8698a0f 2315
a70d6342 2316 if (loop_vinfo && !step)
ebfd146a 2317 {
73fbfcad 2318 if (dump_enabled_p ())
e645e942
TJ
2319 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2320 "bad data-ref access in loop\n");
ebfd146a
IR
2321 return false;
2322 }
2323
6e8dad05 2324 /* Allow invariant loads in loops that are not nested. */
319e6439 2325 if (loop_vinfo && integer_zerop (step))
39becbac
RG
2326 {
2327 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
6e8dad05
RB
2328 if (nested_in_vect_loop_p (loop, stmt))
2329 {
2330 if (dump_enabled_p ())
2331 dump_printf_loc (MSG_NOTE, vect_location,
e645e942 2332 "zero step in inner loop of nest\n");
6e8dad05
RB
2333 return false;
2334 }
39becbac
RG
2335 return DR_IS_READ (dr);
2336 }
ebfd146a 2337
a70d6342 2338 if (loop && nested_in_vect_loop_p (loop, stmt))
ebfd146a
IR
2339 {
2340 /* Interleaved accesses are not yet supported within outer-loop
2341 vectorization for references in the inner-loop. */
e14c1050 2342 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
ebfd146a
IR
2343
2344 /* For the rest of the analysis we use the outer-loop step. */
2345 step = STMT_VINFO_DR_STEP (stmt_info);
319e6439 2346 if (integer_zerop (step))
ebfd146a 2347 {
73fbfcad 2348 if (dump_enabled_p ())
78c60e3d 2349 dump_printf_loc (MSG_NOTE, vect_location,
e645e942 2350 "zero step in outer loop.\n");
ebfd146a 2351 if (DR_IS_READ (dr))
b8698a0f 2352 return true;
ebfd146a
IR
2353 else
2354 return false;
2355 }
2356 }
2357
2358 /* Consecutive? */
319e6439 2359 if (TREE_CODE (step) == INTEGER_CST)
ebfd146a 2360 {
319e6439
RG
2361 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2362 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2363 || (dr_step < 0
2364 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2365 {
2366 /* Mark that it is not interleaving. */
2367 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2368 return true;
2369 }
ebfd146a
IR
2370 }
2371
a70d6342 2372 if (loop && nested_in_vect_loop_p (loop, stmt))
ebfd146a 2373 {
73fbfcad 2374 if (dump_enabled_p ())
78c60e3d 2375 dump_printf_loc (MSG_NOTE, vect_location,
e645e942 2376 "grouped access in outer loop.\n");
ebfd146a
IR
2377 return false;
2378 }
2379
319e6439
RG
 2380 /* Assume this is a DR handled by the non-constant strided load case. */
2381 if (TREE_CODE (step) != INTEGER_CST)
2382 return STMT_VINFO_STRIDE_LOAD_P (stmt_info);
2383
ebfd146a
IR
2384 /* Not consecutive access - check if it's a part of interleaving group. */
2385 return vect_analyze_group_access (dr);
2386}
2387
839c74bc
CH
2388
2389
2390/* A helper function used in the comparator function to sort data
 2391 references. T1 and T2 are the two trees to be compared.
2392 The function returns -1, 0, or 1. */
2393
2394static int
2395compare_tree (tree t1, tree t2)
2396{
2397 int i, cmp;
2398 enum tree_code code;
2399 char tclass;
2400
2401 if (t1 == t2)
2402 return 0;
2403 if (t1 == NULL)
2404 return -1;
2405 if (t2 == NULL)
2406 return 1;
2407
2408
2409 if (TREE_CODE (t1) != TREE_CODE (t2))
2410 return TREE_CODE (t1) < TREE_CODE (t2) ? -1 : 1;
2411
2412 code = TREE_CODE (t1);
2413 switch (code)
2414 {
2415 /* For const values, we can just use hash values for comparisons. */
2416 case INTEGER_CST:
2417 case REAL_CST:
2418 case FIXED_CST:
2419 case STRING_CST:
2420 case COMPLEX_CST:
2421 case VECTOR_CST:
2422 {
2423 hashval_t h1 = iterative_hash_expr (t1, 0);
2424 hashval_t h2 = iterative_hash_expr (t2, 0);
2425 if (h1 != h2)
2426 return h1 < h2 ? -1 : 1;
2427 break;
2428 }
2429
2430 case SSA_NAME:
2431 cmp = compare_tree (SSA_NAME_VAR (t1), SSA_NAME_VAR (t2));
2432 if (cmp != 0)
2433 return cmp;
2434
2435 if (SSA_NAME_VERSION (t1) != SSA_NAME_VERSION (t2))
2436 return SSA_NAME_VERSION (t1) < SSA_NAME_VERSION (t2) ? -1 : 1;
2437 break;
2438
2439 default:
2440 tclass = TREE_CODE_CLASS (code);
2441
2442 /* For var-decl, we could compare their UIDs. */
2443 if (tclass == tcc_declaration)
2444 {
2445 if (DECL_UID (t1) != DECL_UID (t2))
2446 return DECL_UID (t1) < DECL_UID (t2) ? -1 : 1;
2447 break;
2448 }
2449
2450 /* For expressions with operands, compare their operands recursively. */
2451 for (i = TREE_OPERAND_LENGTH (t1) - 1; i >= 0; --i)
2452 {
2453 cmp = compare_tree (TREE_OPERAND (t1, i), TREE_OPERAND (t2, i));
2454 if (cmp != 0)
2455 return cmp;
2456 }
2457 }
2458
2459 return 0;
2460}
2461
2462
5abe1e05
RB
 2463/* Compare two data-references DRA and DRB to sort them into chunks
2464 suitable for grouping. */
2465
2466static int
2467dr_group_sort_cmp (const void *dra_, const void *drb_)
2468{
2469 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2470 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
5abe1e05
RB
2471 int cmp;
2472
2473 /* Stabilize sort. */
2474 if (dra == drb)
2475 return 0;
2476
2477 /* Ordering of DRs according to base. */
2478 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0))
2479 {
839c74bc
CH
2480 cmp = compare_tree (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb));
2481 if (cmp != 0)
2482 return cmp;
5abe1e05
RB
2483 }
2484
2485 /* And according to DR_OFFSET. */
2486 if (!dr_equal_offsets_p (dra, drb))
2487 {
839c74bc
CH
2488 cmp = compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2489 if (cmp != 0)
2490 return cmp;
5abe1e05
RB
2491 }
2492
2493 /* Put reads before writes. */
2494 if (DR_IS_READ (dra) != DR_IS_READ (drb))
2495 return DR_IS_READ (dra) ? -1 : 1;
2496
2497 /* Then sort after access size. */
2498 if (!operand_equal_p (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2499 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))), 0))
2500 {
839c74bc
CH
2501 cmp = compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2502 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2503 if (cmp != 0)
2504 return cmp;
5abe1e05
RB
2505 }
2506
2507 /* And after step. */
2508 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2509 {
839c74bc
CH
2510 cmp = compare_tree (DR_STEP (dra), DR_STEP (drb));
2511 if (cmp != 0)
2512 return cmp;
5abe1e05
RB
2513 }
2514
2515 /* Then sort after DR_INIT. In case of identical DRs sort after stmt UID. */
2516 cmp = tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb));
2517 if (cmp == 0)
2518 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2519 return cmp;
2520}
ebfd146a
IR
2521
2522/* Function vect_analyze_data_ref_accesses.
2523
2524 Analyze the access pattern of all the data references in the loop.
2525
2526 FORNOW: the only access pattern that is considered vectorizable is a
2527 simple step 1 (consecutive) access.
2528
2529 FORNOW: handle only arrays and pointer accesses. */
2530
2531bool
a70d6342 2532vect_analyze_data_ref_accesses (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
ebfd146a
IR
2533{
2534 unsigned int i;
9771b263 2535 vec<data_reference_p> datarefs;
ebfd146a
IR
2536 struct data_reference *dr;
2537
73fbfcad 2538 if (dump_enabled_p ())
78c60e3d 2539 dump_printf_loc (MSG_NOTE, vect_location,
e645e942 2540 "=== vect_analyze_data_ref_accesses ===\n");
ebfd146a 2541
a70d6342
IR
2542 if (loop_vinfo)
2543 datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2544 else
2545 datarefs = BB_VINFO_DATAREFS (bb_vinfo);
2546
5abe1e05
RB
2547 if (datarefs.is_empty ())
2548 return true;
2549
2550 /* Sort the array of datarefs to make building the interleaving chains
3d54b29d
JJ
2551 linear. Don't modify the original vector's order, it is needed for
2552 determining what dependencies are reversed. */
2553 vec<data_reference_p> datarefs_copy = datarefs.copy ();
75509ba2 2554 datarefs_copy.qsort (dr_group_sort_cmp);
5abe1e05
RB
2555
2556 /* Build the interleaving chains. */
3d54b29d 2557 for (i = 0; i < datarefs_copy.length () - 1;)
5abe1e05 2558 {
3d54b29d 2559 data_reference_p dra = datarefs_copy[i];
5abe1e05
RB
2560 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2561 stmt_vec_info lastinfo = NULL;
3d54b29d 2562 for (i = i + 1; i < datarefs_copy.length (); ++i)
5abe1e05 2563 {
3d54b29d 2564 data_reference_p drb = datarefs_copy[i];
5abe1e05
RB
2565 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2566
2567 /* ??? Imperfect sorting (non-compatible types, non-modulo
 2568 accesses, same accesses) can lead to a group being artificially
 2569 split here as we don't just skip over those. If it really
 2570 matters we can push those to a worklist and re-iterate
 2571 over them. Then we can just skip ahead to the next DR here. */
2572
 2573 /* Check that the data-refs have the same first location (except init)
61331c48
JJ
2574 and they are both either store or load (not load and store,
2575 not masked loads or stores). */
5abe1e05
RB
2576 if (DR_IS_READ (dra) != DR_IS_READ (drb)
2577 || !operand_equal_p (DR_BASE_ADDRESS (dra),
2578 DR_BASE_ADDRESS (drb), 0)
61331c48
JJ
2579 || !dr_equal_offsets_p (dra, drb)
2580 || !gimple_assign_single_p (DR_STMT (dra))
2581 || !gimple_assign_single_p (DR_STMT (drb)))
5abe1e05
RB
2582 break;
2583
2584 /* Check that the data-refs have the same constant size and step. */
2585 tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
2586 tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
cc269bb6
RS
2587 if (!tree_fits_uhwi_p (sza)
2588 || !tree_fits_uhwi_p (szb)
5abe1e05 2589 || !tree_int_cst_equal (sza, szb)
9541ffee
RS
2590 || !tree_fits_shwi_p (DR_STEP (dra))
2591 || !tree_fits_shwi_p (DR_STEP (drb))
5abe1e05
RB
2592 || !tree_int_cst_equal (DR_STEP (dra), DR_STEP (drb)))
2593 break;
2594
2595 /* Do not place the same access in the interleaving chain twice. */
2596 if (tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb)) == 0)
2597 break;
2598
2599 /* Check the types are compatible.
2600 ??? We don't distinguish this during sorting. */
2601 if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
2602 TREE_TYPE (DR_REF (drb))))
2603 break;
2604
2605 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
2606 HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
2607 HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
2608 gcc_assert (init_a < init_b);
2609
2610 /* If init_b == init_a + the size of the type * k, we have an
2611 interleaving, and DRA is accessed before DRB. */
eb1ce453 2612 HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
5abe1e05
RB
2613 if ((init_b - init_a) % type_size_a != 0)
2614 break;
2615
2616 /* The step (if not zero) is greater than the difference between
2617 data-refs' inits. This splits groups into suitable sizes. */
eb1ce453 2618 HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra));
5abe1e05
RB
2619 if (step != 0 && step <= (init_b - init_a))
2620 break;
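 /* E.g., 4-byte accesses a[2*i] and a[2*i+1] (step 8, inits 0 and 4)
 survive both checks and are chained below, whereas a[i] and a[i+2]
 (step 4, inits 0 and 8) are split because the step does not exceed
 the distance between their inits. */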
2621
2622 if (dump_enabled_p ())
2623 {
2624 dump_printf_loc (MSG_NOTE, vect_location,
2625 "Detected interleaving ");
2626 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2627 dump_printf (MSG_NOTE, " and ");
2628 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
e645e942 2629 dump_printf (MSG_NOTE, "\n");
5abe1e05
RB
2630 }
2631
2632 /* Link the found element into the group list. */
2633 if (!GROUP_FIRST_ELEMENT (stmtinfo_a))
2634 {
2635 GROUP_FIRST_ELEMENT (stmtinfo_a) = DR_STMT (dra);
2636 lastinfo = stmtinfo_a;
2637 }
2638 GROUP_FIRST_ELEMENT (stmtinfo_b) = DR_STMT (dra);
2639 GROUP_NEXT_ELEMENT (lastinfo) = DR_STMT (drb);
2640 lastinfo = stmtinfo_b;
2641 }
2642 }
2643
3d54b29d 2644 FOR_EACH_VEC_ELT (datarefs_copy, i, dr)
4b5caab7
IR
2645 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
2646 && !vect_analyze_data_ref_access (dr))
ebfd146a 2647 {
73fbfcad 2648 if (dump_enabled_p ())
e645e942
TJ
2649 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2650 "not vectorized: complicated access pattern.\n");
4b5caab7
IR
2651
2652 if (bb_vinfo)
2653 {
2654 /* Mark the statement as not vectorizable. */
2655 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2656 continue;
2657 }
2658 else
3d54b29d
JJ
2659 {
2660 datarefs_copy.release ();
2661 return false;
2662 }
ebfd146a
IR
2663 }
2664
3d54b29d 2665 datarefs_copy.release ();
ebfd146a
IR
2666 return true;
2667}
2668
a05a89fa 2669
93bdc3ed 2670/* Operator == between two dr_with_seg_len objects.
a05a89fa
CH
2671
2672 This equality operator is used to make sure two data refs
2673 are the same one so that we will consider to combine the
2674 aliasing checks of those two pairs of data dependent data
2675 refs. */
2676
2677static bool
93bdc3ed
CH
2678operator == (const dr_with_seg_len& d1,
2679 const dr_with_seg_len& d2)
a05a89fa 2680{
93bdc3ed
CH
2681 return operand_equal_p (DR_BASE_ADDRESS (d1.dr),
2682 DR_BASE_ADDRESS (d2.dr), 0)
2683 && compare_tree (d1.offset, d2.offset) == 0
2684 && compare_tree (d1.seg_len, d2.seg_len) == 0;
a05a89fa
CH
2685}
2686
93bdc3ed 2687/* Function comp_dr_with_seg_len_pair.
a05a89fa 2688
93bdc3ed 2689 Comparison function for sorting objects of dr_with_seg_len_pair_t
a05a89fa
CH
2690 so that we can combine aliasing checks in one scan. */
2691
2692static int
93bdc3ed 2693comp_dr_with_seg_len_pair (const void *p1_, const void *p2_)
a05a89fa 2694{
93bdc3ed
CH
2695 const dr_with_seg_len_pair_t* p1 = (const dr_with_seg_len_pair_t *) p1_;
2696 const dr_with_seg_len_pair_t* p2 = (const dr_with_seg_len_pair_t *) p2_;
2697
2698 const dr_with_seg_len &p11 = p1->first,
2699 &p12 = p1->second,
2700 &p21 = p2->first,
2701 &p22 = p2->second;
2702
 2703 /* For DR pairs (a, b) and (c, d), we only consider merging the alias checks
 2704 if a and c have the same base address and step, and b and d have the same
 2705 address and step. Therefore, if either a&c or b&d don't have the same address
 2706 and step, we don't care about the order of those two pairs after sorting. */
2707 int comp_res;
2708
2709 if ((comp_res = compare_tree (DR_BASE_ADDRESS (p11.dr),
2710 DR_BASE_ADDRESS (p21.dr))) != 0)
a05a89fa 2711 return comp_res;
93bdc3ed
CH
2712 if ((comp_res = compare_tree (DR_BASE_ADDRESS (p12.dr),
2713 DR_BASE_ADDRESS (p22.dr))) != 0)
2714 return comp_res;
2715 if ((comp_res = compare_tree (DR_STEP (p11.dr), DR_STEP (p21.dr))) != 0)
2716 return comp_res;
2717 if ((comp_res = compare_tree (DR_STEP (p12.dr), DR_STEP (p22.dr))) != 0)
2718 return comp_res;
2719 if ((comp_res = compare_tree (p11.offset, p21.offset)) != 0)
2720 return comp_res;
2721 if ((comp_res = compare_tree (p12.offset, p22.offset)) != 0)
a05a89fa 2722 return comp_res;
a05a89fa
CH
2723
2724 return 0;
2725}
2726
a05a89fa
CH
2727/* Function vect_vfa_segment_size.
2728
 2729 Create an expression that computes the size of the segment
 2730 that will be accessed for a data reference. The function takes into
2731 account that realignment loads may access one more vector.
2732
2733 Input:
2734 DR: The data reference.
2735 LENGTH_FACTOR: segment length to consider.
2736
2737 Return an expression whose value is the size of segment which will be
2738 accessed by DR. */
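/* E.g., with DR_STEP 4 and LENGTH_FACTOR equal to a vectorization
 factor of 4, the segment is 16 bytes, plus one vector size when the
 access uses the optimized explicit realignment scheme. */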
2739
2740static tree
2741vect_vfa_segment_size (struct data_reference *dr, tree length_factor)
2742{
2743 tree segment_length;
2744
2745 if (integer_zerop (DR_STEP (dr)))
2746 segment_length = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
2747 else
2748 segment_length = size_binop (MULT_EXPR,
93bdc3ed
CH
2749 fold_convert (sizetype, DR_STEP (dr)),
2750 fold_convert (sizetype, length_factor));
a05a89fa
CH
2751
2752 if (vect_supportable_dr_alignment (dr, false)
93bdc3ed 2753 == dr_explicit_realign_optimized)
a05a89fa
CH
2754 {
2755 tree vector_size = TYPE_SIZE_UNIT
2756 (STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr))));
2757
2758 segment_length = size_binop (PLUS_EXPR, segment_length, vector_size);
2759 }
2760 return segment_length;
2761}
2762
ebfd146a
IR
2763/* Function vect_prune_runtime_alias_test_list.
2764
2765 Prune a list of ddrs to be tested at run-time by versioning for alias.
a05a89fa 2766 Merge several alias checks into one if possible.
ebfd146a
IR
 2767 Return FALSE if the resulting list of ddrs is longer than allowed by
2768 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
2769
2770bool
2771vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
2772{
a05a89fa 2773 vec<ddr_p> may_alias_ddrs =
ebfd146a 2774 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
93bdc3ed 2775 vec<dr_with_seg_len_pair_t>& comp_alias_ddrs =
a05a89fa
CH
2776 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
2777 int vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2778 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
2779
2780 ddr_p ddr;
2781 unsigned int i;
2782 tree length_factor;
ebfd146a 2783
73fbfcad 2784 if (dump_enabled_p ())
78c60e3d 2785 dump_printf_loc (MSG_NOTE, vect_location,
e645e942 2786 "=== vect_prune_runtime_alias_test_list ===\n");
ebfd146a 2787
a05a89fa
CH
2788 if (may_alias_ddrs.is_empty ())
2789 return true;
2790
2791 /* Basically, for each pair of dependent data refs store_ptr_0
2792 and load_ptr_0, we create an expression:
2793
2794 ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
2795 || (load_ptr_0 + load_segment_length_0) <= store_ptr_0))
2796
2797 for aliasing checks. However, in some cases we can decrease
2798 the number of checks by combining two checks into one. For
2799 example, suppose we have another pair of data refs store_ptr_0
2800 and load_ptr_1, and if the following condition is satisfied:
2801
2802 load_ptr_0 < load_ptr_1 &&
2803 load_ptr_1 - load_ptr_0 - load_segment_length_0 < store_segment_length_0
2804
 2805 (this condition means that, in each iteration of the vectorized loop,
2806 the accessed memory of store_ptr_0 cannot be between the memory
2807 of load_ptr_0 and load_ptr_1.)
2808
 2809 we can then use only the following expression to finish the
 2810 aliasing checks between store_ptr_0 & load_ptr_0 and
2811 store_ptr_0 & load_ptr_1:
2812
2813 ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
2814 || (load_ptr_1 + load_segment_length_1 <= store_ptr_0))
2815
2816 Note that we only consider that load_ptr_0 and load_ptr_1 have the
 2817 same base address. */
2818
2819 comp_alias_ddrs.create (may_alias_ddrs.length ());
2820
2821 /* First, we collect all data ref pairs for aliasing checks. */
2822 FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
ebfd146a 2823 {
a05a89fa
CH
2824 struct data_reference *dr_a, *dr_b;
2825 gimple dr_group_first_a, dr_group_first_b;
2826 tree segment_length_a, segment_length_b;
2827 gimple stmt_a, stmt_b;
2828
2829 dr_a = DDR_A (ddr);
2830 stmt_a = DR_STMT (DDR_A (ddr));
2831 dr_group_first_a = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_a));
2832 if (dr_group_first_a)
2833 {
2834 stmt_a = dr_group_first_a;
2835 dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a));
2836 }
ebfd146a 2837
a05a89fa
CH
2838 dr_b = DDR_B (ddr);
2839 stmt_b = DR_STMT (DDR_B (ddr));
2840 dr_group_first_b = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_b));
2841 if (dr_group_first_b)
2842 {
2843 stmt_b = dr_group_first_b;
2844 dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b));
2845 }
ebfd146a 2846
a05a89fa
CH
2847 if (!operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0))
2848 length_factor = scalar_loop_iters;
2849 else
2850 length_factor = size_int (vect_factor);
2851 segment_length_a = vect_vfa_segment_size (dr_a, length_factor);
2852 segment_length_b = vect_vfa_segment_size (dr_b, length_factor);
2853
93bdc3ed
CH
2854 dr_with_seg_len_pair_t dr_with_seg_len_pair
2855 (dr_with_seg_len (dr_a, segment_length_a),
2856 dr_with_seg_len (dr_b, segment_length_b));
2857
2858 if (compare_tree (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b)) > 0)
9310366b 2859 std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
a05a89fa
CH
2860
2861 comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
2862 }
2863
2864 /* Second, we sort the collected data ref pairs so that we can scan
2865 them once to combine all possible aliasing checks. */
93bdc3ed 2866 comp_alias_ddrs.qsort (comp_dr_with_seg_len_pair);
ebfd146a 2867
a05a89fa
CH
2868 /* Third, we scan the sorted dr pairs and check if we can combine
2869 alias checks of two neighbouring dr pairs. */
2870 for (size_t i = 1; i < comp_alias_ddrs.length (); ++i)
2871 {
2872 /* Deal with two ddrs (dr_a1, dr_b1) and (dr_a2, dr_b2). */
93bdc3ed
CH
2873 dr_with_seg_len *dr_a1 = &comp_alias_ddrs[i-1].first,
2874 *dr_b1 = &comp_alias_ddrs[i-1].second,
2875 *dr_a2 = &comp_alias_ddrs[i].first,
2876 *dr_b2 = &comp_alias_ddrs[i].second;
a05a89fa
CH
2877
2878 /* Remove duplicate data ref pairs. */
2879 if (*dr_a1 == *dr_a2 && *dr_b1 == *dr_b2)
2880 {
2881 if (dump_enabled_p ())
ebfd146a 2882 {
a05a89fa
CH
2883 dump_printf_loc (MSG_NOTE, vect_location,
2884 "found equal ranges ");
2885 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2886 DR_REF (dr_a1->dr));
2887 dump_printf (MSG_NOTE, ", ");
2888 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2889 DR_REF (dr_b1->dr));
2890 dump_printf (MSG_NOTE, " and ");
2891 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2892 DR_REF (dr_a2->dr));
2893 dump_printf (MSG_NOTE, ", ");
2894 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2895 DR_REF (dr_b2->dr));
2896 dump_printf (MSG_NOTE, "\n");
ebfd146a 2897 }
a05a89fa
CH
2898
2899 comp_alias_ddrs.ordered_remove (i--);
2900 continue;
ebfd146a 2901 }
b8698a0f 2902
a05a89fa
CH
2903 if (*dr_a1 == *dr_a2 || *dr_b1 == *dr_b2)
2904 {
 2905 /* We consider the case that DR_B1 and DR_B2 are the same memref,
2906 and DR_A1 and DR_A2 are two consecutive memrefs. */
2907 if (*dr_a1 == *dr_a2)
2908 {
9310366b
UB
2909 std::swap (dr_a1, dr_b1);
2910 std::swap (dr_a2, dr_b2);
a05a89fa
CH
2911 }
2912
93bdc3ed
CH
2913 if (!operand_equal_p (DR_BASE_ADDRESS (dr_a1->dr),
2914 DR_BASE_ADDRESS (dr_a2->dr),
2915 0)
9541ffee
RS
2916 || !tree_fits_shwi_p (dr_a1->offset)
2917 || !tree_fits_shwi_p (dr_a2->offset))
a05a89fa
CH
2918 continue;
2919
eb1ce453
KZ
2920 HOST_WIDE_INT diff = (tree_to_shwi (dr_a2->offset)
2921 - tree_to_shwi (dr_a1->offset));
a05a89fa
CH
2922
2923
2924 /* Now we check if the following condition is satisfied:
2925
2926 DIFF - SEGMENT_LENGTH_A < SEGMENT_LENGTH_B
2927
2928 where DIFF = DR_A2->OFFSET - DR_A1->OFFSET. However,
2929 SEGMENT_LENGTH_A or SEGMENT_LENGTH_B may not be constant so we
2930 have to make a best estimation. We can get the minimum value
2931 of SEGMENT_LENGTH_B as a constant, represented by MIN_SEG_LEN_B,
2932 then either of the following two conditions can guarantee the
2933 one above:
2934
2935 1: DIFF <= MIN_SEG_LEN_B
2936 2: DIFF - SEGMENT_LENGTH_A < MIN_SEG_LEN_B
2937
2938 */
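 /* E.g., if DR_A1 and DR_A2 are 16 bytes apart and the segment lengths
 are all 64 bytes, condition 1 holds (16 <= 64) and the two checks are
 merged below into a single check over a 64 + 16 = 80 byte segment
 for DR_A1. */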
2939
807e902e
KZ
2940 HOST_WIDE_INT min_seg_len_b = (tree_fits_shwi_p (dr_b1->seg_len)
2941 ? tree_to_shwi (dr_b1->seg_len)
2942 : vect_factor);
a05a89fa
CH
2943
2944 if (diff <= min_seg_len_b
807e902e
KZ
2945 || (tree_fits_shwi_p (dr_a1->seg_len)
2946 && diff - tree_to_shwi (dr_a1->seg_len) < min_seg_len_b))
a05a89fa 2947 {
d55d9ed0
RB
2948 if (dump_enabled_p ())
2949 {
2950 dump_printf_loc (MSG_NOTE, vect_location,
2951 "merging ranges for ");
2952 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2953 DR_REF (dr_a1->dr));
2954 dump_printf (MSG_NOTE, ", ");
2955 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2956 DR_REF (dr_b1->dr));
2957 dump_printf (MSG_NOTE, " and ");
2958 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2959 DR_REF (dr_a2->dr));
2960 dump_printf (MSG_NOTE, ", ");
2961 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2962 DR_REF (dr_b2->dr));
2963 dump_printf (MSG_NOTE, "\n");
2964 }
2965
a05a89fa
CH
2966 dr_a1->seg_len = size_binop (PLUS_EXPR,
2967 dr_a2->seg_len, size_int (diff));
2968 comp_alias_ddrs.ordered_remove (i--);
2969 }
2970 }
ebfd146a
IR
2971 }
2972
d55d9ed0
RB
2973 dump_printf_loc (MSG_NOTE, vect_location,
2974 "improved number of alias checks from %d to %d\n",
2975 may_alias_ddrs.length (), comp_alias_ddrs.length ());
a05a89fa
CH
2976 if ((int) comp_alias_ddrs.length () >
2977 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
d55d9ed0 2978 return false;
ebfd146a
IR
2979
2980 return true;
2981}
2982
aec7ae7d
JJ
 2983/* Check whether a non-affine read in STMT is suitable for a gather load
2984 and if so, return a builtin decl for that operation. */
2985
2986tree
2987vect_check_gather (gimple stmt, loop_vec_info loop_vinfo, tree *basep,
2988 tree *offp, int *scalep)
2989{
2990 HOST_WIDE_INT scale = 1, pbitpos, pbitsize;
2991 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2992 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2993 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
2994 tree offtype = NULL_TREE;
2995 tree decl, base, off;
ef4bddc2 2996 machine_mode pmode;
aec7ae7d
JJ
2997 int punsignedp, pvolatilep;
2998
5ce9450f
JJ
2999 base = DR_REF (dr);
3000 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
3001 see if we can use the def stmt of the address. */
3002 if (is_gimple_call (stmt)
3003 && gimple_call_internal_p (stmt)
3004 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
3005 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
3006 && TREE_CODE (base) == MEM_REF
3007 && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
3008 && integer_zerop (TREE_OPERAND (base, 1))
3009 && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
3010 {
3011 gimple def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
3012 if (is_gimple_assign (def_stmt)
3013 && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
3014 base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
3015 }
3016
aec7ae7d
JJ
 3017 /* The gather builtins need an address of the form
3018 loop_invariant + vector * {1, 2, 4, 8}
3019 or
3020 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3021 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3022 of loop invariants/SSA_NAMEs defined in the loop, with casts,
3023 multiplications and additions in it. To get a vector, we need
3024 a single SSA_NAME that will be defined in the loop and will
3025 contain everything that is not loop invariant and that can be
 3026 vectorized. The following code attempts to find such a preexisting
3027 SSA_NAME OFF and put the loop invariants into a tree BASE
3028 that can be gimplified before the loop. */
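 /* For illustration, an access like p->a[j + k] with p and j loop
 invariant and k the loop-varying index would ideally be split into
 BASE = p + offsetof (a) + j * sizeof (*p->a) and OFF = k, with the
 element size becoming SCALE. */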
5ce9450f 3029 base = get_inner_reference (base, &pbitsize, &pbitpos, &off,
b3ecff82 3030 &pmode, &punsignedp, &pvolatilep, false);
aec7ae7d
JJ
3031 gcc_assert (base != NULL_TREE && (pbitpos % BITS_PER_UNIT) == 0);
3032
3033 if (TREE_CODE (base) == MEM_REF)
3034 {
3035 if (!integer_zerop (TREE_OPERAND (base, 1)))
3036 {
3037 if (off == NULL_TREE)
3038 {
807e902e
KZ
3039 offset_int moff = mem_ref_offset (base);
3040 off = wide_int_to_tree (sizetype, moff);
aec7ae7d
JJ
3041 }
3042 else
3043 off = size_binop (PLUS_EXPR, off,
3044 fold_convert (sizetype, TREE_OPERAND (base, 1)));
3045 }
3046 base = TREE_OPERAND (base, 0);
3047 }
3048 else
3049 base = build_fold_addr_expr (base);
3050
3051 if (off == NULL_TREE)
3052 off = size_zero_node;
3053
 3054 /* If base is not loop invariant, then if off is 0 we start with just
 3055 the constant offset in the loop invariant BASE and continue with base
 3056 as OFF; otherwise give up.
3057 We could handle that case by gimplifying the addition of base + off
3058 into some SSA_NAME and use that as off, but for now punt. */
3059 if (!expr_invariant_in_loop_p (loop, base))
3060 {
3061 if (!integer_zerop (off))
3062 return NULL_TREE;
3063 off = base;
3064 base = size_int (pbitpos / BITS_PER_UNIT);
3065 }
3066 /* Otherwise put base + constant offset into the loop invariant BASE
3067 and continue with OFF. */
3068 else
3069 {
3070 base = fold_convert (sizetype, base);
3071 base = size_binop (PLUS_EXPR, base, size_int (pbitpos / BITS_PER_UNIT));
3072 }
3073
3074 /* OFF at this point may be either a SSA_NAME or some tree expression
3075 from get_inner_reference. Try to peel off loop invariants from it
3076 into BASE as long as possible. */
3077 STRIP_NOPS (off);
3078 while (offtype == NULL_TREE)
3079 {
3080 enum tree_code code;
3081 tree op0, op1, add = NULL_TREE;
3082
3083 if (TREE_CODE (off) == SSA_NAME)
3084 {
3085 gimple def_stmt = SSA_NAME_DEF_STMT (off);
3086
3087 if (expr_invariant_in_loop_p (loop, off))
3088 return NULL_TREE;
3089
3090 if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
3091 break;
3092
3093 op0 = gimple_assign_rhs1 (def_stmt);
3094 code = gimple_assign_rhs_code (def_stmt);
3095 op1 = gimple_assign_rhs2 (def_stmt);
3096 }
3097 else
3098 {
3099 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
3100 return NULL_TREE;
3101 code = TREE_CODE (off);
3102 extract_ops_from_tree (off, &code, &op0, &op1);
3103 }
3104 switch (code)
3105 {
3106 case POINTER_PLUS_EXPR:
3107 case PLUS_EXPR:
3108 if (expr_invariant_in_loop_p (loop, op0))
3109 {
3110 add = op0;
3111 off = op1;
3112 do_add:
3113 add = fold_convert (sizetype, add);
3114 if (scale != 1)
3115 add = size_binop (MULT_EXPR, add, size_int (scale));
3116 base = size_binop (PLUS_EXPR, base, add);
3117 continue;
3118 }
3119 if (expr_invariant_in_loop_p (loop, op1))
3120 {
3121 add = op1;
3122 off = op0;
3123 goto do_add;
3124 }
3125 break;
3126 case MINUS_EXPR:
3127 if (expr_invariant_in_loop_p (loop, op1))
3128 {
3129 add = fold_convert (sizetype, op1);
3130 add = size_binop (MINUS_EXPR, size_zero_node, add);
3131 off = op0;
3132 goto do_add;
3133 }
3134 break;
3135 case MULT_EXPR:
9541ffee 3136 if (scale == 1 && tree_fits_shwi_p (op1))
aec7ae7d 3137 {
9439e9a1 3138 scale = tree_to_shwi (op1);
aec7ae7d
JJ
3139 off = op0;
3140 continue;
3141 }
3142 break;
3143 case SSA_NAME:
3144 off = op0;
3145 continue;
3146 CASE_CONVERT:
3147 if (!POINTER_TYPE_P (TREE_TYPE (op0))
3148 && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3149 break;
3150 if (TYPE_PRECISION (TREE_TYPE (op0))
3151 == TYPE_PRECISION (TREE_TYPE (off)))
3152 {
3153 off = op0;
3154 continue;
3155 }
3156 if (TYPE_PRECISION (TREE_TYPE (op0))
3157 < TYPE_PRECISION (TREE_TYPE (off)))
3158 {
3159 off = op0;
3160 offtype = TREE_TYPE (off);
3161 STRIP_NOPS (off);
3162 continue;
3163 }
3164 break;
3165 default:
3166 break;
3167 }
3168 break;
3169 }
3170
3171 /* If at the end OFF still isn't a SSA_NAME or isn't
3172 defined in the loop, punt. */
3173 if (TREE_CODE (off) != SSA_NAME
3174 || expr_invariant_in_loop_p (loop, off))
3175 return NULL_TREE;
3176
3177 if (offtype == NULL_TREE)
3178 offtype = TREE_TYPE (off);
3179
3180 decl = targetm.vectorize.builtin_gather (STMT_VINFO_VECTYPE (stmt_info),
3181 offtype, scale);
3182 if (decl == NULL_TREE)
3183 return NULL_TREE;
3184
3185 if (basep)
3186 *basep = base;
3187 if (offp)
3188 *offp = off;
3189 if (scalep)
3190 *scalep = scale;
3191 return decl;
3192}
3193
ebfd146a
IR
3194/* Function vect_analyze_data_refs.
3195
a70d6342 3196 Find all the data references in the loop or basic block.
ebfd146a
IR
3197
3198 The general structure of the analysis of data refs in the vectorizer is as
3199 follows:
b8698a0f 3200 1- vect_analyze_data_refs(loop/bb): call
a70d6342
IR
3201 compute_data_dependences_for_loop/bb to find and analyze all data-refs
3202 in the loop/bb and their dependences.
ebfd146a
IR
3203 2- vect_analyze_dependences(): apply dependence testing using ddrs.
3204 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
3205 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
3206
3207*/
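/* For instance (illustrative only), in

     for (i = 0; i < n; i++)
       a[i] = b[i] + c[i];

   this function records three data references: the store to a[i] and the
   loads from b[i] and c[i]; the later phases listed above then test their
   dependences, alignment and access pattern.  */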
3208
3209bool
777e1f09
RG
3210vect_analyze_data_refs (loop_vec_info loop_vinfo,
3211 bb_vec_info bb_vinfo,
1428105c 3212 int *min_vf, unsigned *n_stmts)
ebfd146a 3213{
a70d6342
IR
3214 struct loop *loop = NULL;
3215 basic_block bb = NULL;
ebfd146a 3216 unsigned int i;
9771b263 3217 vec<data_reference_p> datarefs;
ebfd146a
IR
3218 struct data_reference *dr;
3219 tree scalar_type;
3220
73fbfcad 3221 if (dump_enabled_p ())
78c60e3d
SS
3222 dump_printf_loc (MSG_NOTE, vect_location,
3223 "=== vect_analyze_data_refs ===\n");
b8698a0f 3224
a70d6342
IR
3225 if (loop_vinfo)
3226 {
0136f8f0
AH
3227 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
3228
a70d6342 3229 loop = LOOP_VINFO_LOOP (loop_vinfo);
0136f8f0
AH
3230 datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
3231 if (!find_loop_nest (loop, &LOOP_VINFO_LOOP_NEST (loop_vinfo)))
22a8be9e 3232 {
73fbfcad 3233 if (dump_enabled_p ())
e645e942
TJ
3234 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3235 "not vectorized: loop contains function calls"
3236 " or data references that cannot be analyzed\n");
22a8be9e
SP
3237 return false;
3238 }
3239
0136f8f0
AH
3240 for (i = 0; i < loop->num_nodes; i++)
3241 {
3242 gimple_stmt_iterator gsi;
3243
3244 for (gsi = gsi_start_bb (bbs[i]); !gsi_end_p (gsi); gsi_next (&gsi))
3245 {
3246 gimple stmt = gsi_stmt (gsi);
1428105c
RB
3247 if (is_gimple_debug (stmt))
3248 continue;
3249 ++*n_stmts;
0136f8f0
AH
3250 if (!find_data_references_in_stmt (loop, stmt, &datarefs))
3251 {
3252 if (is_gimple_call (stmt) && loop->safelen)
3253 {
3254 tree fndecl = gimple_call_fndecl (stmt), op;
3255 if (fndecl != NULL_TREE)
3256 {
d52f5295 3257 struct cgraph_node *node = cgraph_node::get (fndecl);
0136f8f0
AH
3258 if (node != NULL && node->simd_clones != NULL)
3259 {
3260 unsigned int j, n = gimple_call_num_args (stmt);
3261 for (j = 0; j < n; j++)
3262 {
3263 op = gimple_call_arg (stmt, j);
3264 if (DECL_P (op)
3265 || (REFERENCE_CLASS_P (op)
3266 && get_base_address (op)))
3267 break;
3268 }
3269 op = gimple_call_lhs (stmt);
3270 /* Ignore #pragma omp declare simd functions
3271 if they don't have data references in the
3272 call stmt itself. */
3273 if (j == n
3274 && !(op
3275 && (DECL_P (op)
3276 || (REFERENCE_CLASS_P (op)
3277 && get_base_address (op)))))
3278 continue;
3279 }
3280 }
3281 }
3282 LOOP_VINFO_DATAREFS (loop_vinfo) = datarefs;
3283 if (dump_enabled_p ())
3284 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3285 "not vectorized: loop contains function "
3286 "calls or data references that cannot "
3287 "be analyzed\n");
3288 return false;
3289 }
3290 }
3291 }
3292
3293 LOOP_VINFO_DATAREFS (loop_vinfo) = datarefs;
a70d6342
IR
3294 }
3295 else
3296 {
1aedeafe
RG
3297 gimple_stmt_iterator gsi;
3298
a70d6342 3299 bb = BB_VINFO_BB (bb_vinfo);
1aedeafe
RG
3300 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
3301 {
3302 gimple stmt = gsi_stmt (gsi);
1428105c
RB
3303 if (is_gimple_debug (stmt))
3304 continue;
3305 ++*n_stmts;
1aedeafe
RG
3306 if (!find_data_references_in_stmt (NULL, stmt,
3307 &BB_VINFO_DATAREFS (bb_vinfo)))
3308 {
3309 /* Mark the rest of the basic-block as unvectorizable. */
3310 for (; !gsi_end_p (gsi); gsi_next (&gsi))
d4d5e146
RG
3311 {
3312 stmt = gsi_stmt (gsi);
3313 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (stmt)) = false;
3314 }
1aedeafe
RG
3315 break;
3316 }
3317 }
22a8be9e 3318
a70d6342
IR
3319 datarefs = BB_VINFO_DATAREFS (bb_vinfo);
3320 }
ebfd146a 3321
ff802fa1
IR
3322 /* Go through the data-refs, check that the analysis succeeded. Update
3323 pointer from stmt_vec_info struct to DR and vectype. */
ebfd146a 3324
9771b263 3325 FOR_EACH_VEC_ELT (datarefs, i, dr)
ebfd146a
IR
3326 {
3327 gimple stmt;
3328 stmt_vec_info stmt_info;
b8698a0f 3329 tree base, offset, init;
aec7ae7d 3330 bool gather = false;
74bf76ed 3331 bool simd_lane_access = false;
777e1f09 3332 int vf;
b8698a0f 3333
fbd7e877 3334again:
ebfd146a
IR
3335 if (!dr || !DR_REF (dr))
3336 {
73fbfcad 3337 if (dump_enabled_p ())
78c60e3d 3338 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
e645e942 3339 "not vectorized: unhandled data-ref\n");
ebfd146a
IR
3340 return false;
3341 }
3342
3343 stmt = DR_STMT (dr);
3344 stmt_info = vinfo_for_stmt (stmt);
3345
fbd7e877
RB
3346 /* Discard clobbers from the dataref vector. We will remove
3347 clobber stmts during vectorization. */
3348 if (gimple_clobber_p (stmt))
3349 {
d3ef8c53 3350 free_data_ref (dr);
fbd7e877
RB
3351 if (i == datarefs.length () - 1)
3352 {
3353 datarefs.pop ();
3354 break;
3355 }
41475e96
JJ
3356 datarefs.ordered_remove (i);
3357 dr = datarefs[i];
fbd7e877
RB
3358 goto again;
3359 }
3360
ebfd146a
IR
3361 /* Check that analysis of the data-ref succeeded. */
3362 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
aec7ae7d 3363 || !DR_STEP (dr))
ebfd146a 3364 {
74bf76ed
JJ
3365 bool maybe_gather
3366 = DR_IS_READ (dr)
aec7ae7d 3367 && !TREE_THIS_VOLATILE (DR_REF (dr))
74bf76ed
JJ
3368 && targetm.vectorize.builtin_gather != NULL;
3369 bool maybe_simd_lane_access
3370 = loop_vinfo && loop->simduid;
3371
3372 /* If target supports vector gather loads, or if this might be
3373 a SIMD lane access, see if they can be used. */
3374 if (loop_vinfo
3375 && (maybe_gather || maybe_simd_lane_access)
aec7ae7d
JJ
3376 && !nested_in_vect_loop_p (loop, stmt))
3377 {
3378 struct data_reference *newdr
3379 = create_data_ref (NULL, loop_containing_stmt (stmt),
3380 DR_REF (dr), stmt, true);
3381 gcc_assert (newdr != NULL && DR_REF (newdr));
3382 if (DR_BASE_ADDRESS (newdr)
3383 && DR_OFFSET (newdr)
3384 && DR_INIT (newdr)
3385 && DR_STEP (newdr)
3386 && integer_zerop (DR_STEP (newdr)))
3387 {
74bf76ed
JJ
3388 if (maybe_simd_lane_access)
3389 {
3390 tree off = DR_OFFSET (newdr);
3391 STRIP_NOPS (off);
3392 if (TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
3393 && TREE_CODE (off) == MULT_EXPR
cc269bb6 3394 && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
74bf76ed
JJ
3395 {
3396 tree step = TREE_OPERAND (off, 1);
3397 off = TREE_OPERAND (off, 0);
3398 STRIP_NOPS (off);
3399 if (CONVERT_EXPR_P (off)
3400 && TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off,
3401 0)))
3402 < TYPE_PRECISION (TREE_TYPE (off)))
3403 off = TREE_OPERAND (off, 0);
3404 if (TREE_CODE (off) == SSA_NAME)
3405 {
3406 gimple def = SSA_NAME_DEF_STMT (off);
3407 tree reft = TREE_TYPE (DR_REF (newdr));
cd4447e2
JJ
3408 if (is_gimple_call (def)
3409 && gimple_call_internal_p (def)
3410 && (gimple_call_internal_fn (def)
3411 == IFN_GOMP_SIMD_LANE))
74bf76ed
JJ
3412 {
3413 tree arg = gimple_call_arg (def, 0);
3414 gcc_assert (TREE_CODE (arg) == SSA_NAME);
3415 arg = SSA_NAME_VAR (arg);
3416 if (arg == loop->simduid
3417 /* For now. */
3418 && tree_int_cst_equal
3419 (TYPE_SIZE_UNIT (reft),
3420 step))
3421 {
3422 DR_OFFSET (newdr) = ssize_int (0);
3423 DR_STEP (newdr) = step;
995a1b4a
JJ
3424 DR_ALIGNED_TO (newdr)
3425 = size_int (BIGGEST_ALIGNMENT);
74bf76ed
JJ
3426 dr = newdr;
3427 simd_lane_access = true;
3428 }
3429 }
3430 }
3431 }
3432 }
3433 if (!simd_lane_access && maybe_gather)
3434 {
3435 dr = newdr;
3436 gather = true;
3437 }
aec7ae7d 3438 }
74bf76ed 3439 if (!gather && !simd_lane_access)
aec7ae7d
JJ
3440 free_data_ref (newdr);
3441 }
4b5caab7 3442
74bf76ed 3443 if (!gather && !simd_lane_access)
aec7ae7d 3444 {
73fbfcad 3445 if (dump_enabled_p ())
aec7ae7d 3446 {
e645e942 3447 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
78c60e3d
SS
3448 "not vectorized: data ref analysis "
3449 "failed ");
3450 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
e645e942 3451 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
aec7ae7d 3452 }
ba65ae42 3453
aec7ae7d 3454 if (bb_vinfo)
fcac74a1 3455 break;
aec7ae7d
JJ
3456
3457 return false;
3458 }
ebfd146a
IR
3459 }
3460
3461 if (TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
3462 {
73fbfcad 3463 if (dump_enabled_p ())
78c60e3d
SS
3464 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3465 "not vectorized: base addr of dr is a "
e645e942 3466 "constant\n");
ba65ae42
IR
3467
3468 if (bb_vinfo)
fcac74a1 3469 break;
ba65ae42 3470
74bf76ed 3471 if (gather || simd_lane_access)
aec7ae7d
JJ
3472 free_data_ref (dr);
3473 return false;
ebfd146a
IR
3474 }
3475
8f7de592
IR
3476 if (TREE_THIS_VOLATILE (DR_REF (dr)))
3477 {
73fbfcad 3478 if (dump_enabled_p ())
8f7de592 3479 {
78c60e3d
SS
3480 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3481 "not vectorized: volatile type ");
3482 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
e645e942 3483 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
8f7de592 3484 }
ba65ae42
IR
3485
3486 if (bb_vinfo)
fcac74a1 3487 break;
ba65ae42 3488
8f7de592
IR
3489 return false;
3490 }
3491
822ba6d7 3492 if (stmt_can_throw_internal (stmt))
5a2c1986 3493 {
73fbfcad 3494 if (dump_enabled_p ())
5a2c1986 3495 {
78c60e3d
SS
3496 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3497 "not vectorized: statement can throw an "
3498 "exception ");
3499 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
e645e942 3500 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
5a2c1986 3501 }
ba65ae42
IR
3502
3503 if (bb_vinfo)
fcac74a1 3504 break;
ba65ae42 3505
74bf76ed 3506 if (gather || simd_lane_access)
aec7ae7d 3507 free_data_ref (dr);
5a2c1986
IR
3508 return false;
3509 }
3510
508ef0c6
RG
3511 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
3512 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
3513 {
73fbfcad 3514 if (dump_enabled_p ())
508ef0c6 3515 {
78c60e3d
SS
3516 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3517 "not vectorized: statement is bitfield "
3518 "access ");
3519 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
e645e942 3520 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
508ef0c6
RG
3521 }
3522
3523 if (bb_vinfo)
fcac74a1 3524 break;
508ef0c6 3525
74bf76ed 3526 if (gather || simd_lane_access)
508ef0c6
RG
3527 free_data_ref (dr);
3528 return false;
3529 }
3530
3531 base = unshare_expr (DR_BASE_ADDRESS (dr));
3532 offset = unshare_expr (DR_OFFSET (dr));
3533 init = unshare_expr (DR_INIT (dr));
3534
5ce9450f
JJ
3535 if (is_gimple_call (stmt)
3536 && (!gimple_call_internal_p (stmt)
3537 || (gimple_call_internal_fn (stmt) != IFN_MASK_LOAD
3538 && gimple_call_internal_fn (stmt) != IFN_MASK_STORE)))
9c239085 3539 {
73fbfcad 3540 if (dump_enabled_p ())
9c239085 3541 {
78c60e3d 3542 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
e645e942 3543 "not vectorized: dr in a call ");
78c60e3d 3544 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
e645e942 3545 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
9c239085
JJ
3546 }
3547
3548 if (bb_vinfo)
fcac74a1 3549 break;
9c239085 3550
74bf76ed 3551 if (gather || simd_lane_access)
9c239085
JJ
3552 free_data_ref (dr);
3553 return false;
3554 }
3555
ebfd146a 3556 /* Update DR field in stmt_vec_info struct. */
ebfd146a
IR
3557
3558 /* If the dataref is in an inner-loop of the loop that is considered for
3559 vectorization, we also want to analyze the access relative to
b8698a0f 3560 the outer-loop (DR contains information only relative to the
ebfd146a
IR
3561 inner-most enclosing loop). We do that by building a reference to the
3562 first location accessed by the inner-loop, and analyze it relative to
b8698a0f
L
3563 the outer-loop. */
3564 if (loop && nested_in_vect_loop_p (loop, stmt))
ebfd146a
IR
3565 {
3566 tree outer_step, outer_base, outer_init;
3567 HOST_WIDE_INT pbitsize, pbitpos;
3568 tree poffset;
ef4bddc2 3569 machine_mode pmode;
ebfd146a
IR
3570 int punsignedp, pvolatilep;
3571 affine_iv base_iv, offset_iv;
3572 tree dinit;
3573
b8698a0f 3574 /* Build a reference to the first location accessed by the
ff802fa1 3575 inner-loop: *(BASE+INIT). (The first location is actually
ebfd146a
IR
3576 BASE+INIT+OFFSET, but we add OFFSET separately later). */
3577 tree inner_base = build_fold_indirect_ref
5d49b6a7 3578 (fold_build_pointer_plus (base, init));
ebfd146a 3579
73fbfcad 3580 if (dump_enabled_p ())
ebfd146a 3581 {
78c60e3d
SS
3582 dump_printf_loc (MSG_NOTE, vect_location,
3583 "analyze in outer-loop: ");
3584 dump_generic_expr (MSG_NOTE, TDF_SLIM, inner_base);
e645e942 3585 dump_printf (MSG_NOTE, "\n");
ebfd146a
IR
3586 }
3587
b8698a0f 3588 outer_base = get_inner_reference (inner_base, &pbitsize, &pbitpos,
b3ecff82 3589 &poffset, &pmode, &punsignedp, &pvolatilep, false);
ebfd146a
IR
3590 gcc_assert (outer_base != NULL_TREE);
3591
3592 if (pbitpos % BITS_PER_UNIT != 0)
3593 {
73fbfcad 3594 if (dump_enabled_p ())
78c60e3d
SS
3595 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3596 "failed: bit offset alignment.\n");
ebfd146a
IR
3597 return false;
3598 }
3599
3600 outer_base = build_fold_addr_expr (outer_base);
b8698a0f 3601 if (!simple_iv (loop, loop_containing_stmt (stmt), outer_base,
ebfd146a
IR
3602 &base_iv, false))
3603 {
73fbfcad 3604 if (dump_enabled_p ())
e645e942 3605 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
78c60e3d 3606 "failed: evolution of base is not affine.\n");
ebfd146a
IR
3607 return false;
3608 }
3609
3610 if (offset)
3611 {
3612 if (poffset)
b8698a0f 3613 poffset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset), offset,
ebfd146a
IR
3614 poffset);
3615 else
3616 poffset = offset;
3617 }
3618
3619 if (!poffset)
3620 {
3621 offset_iv.base = ssize_int (0);
3622 offset_iv.step = ssize_int (0);
3623 }
b8698a0f 3624 else if (!simple_iv (loop, loop_containing_stmt (stmt), poffset,
ebfd146a
IR
3625 &offset_iv, false))
3626 {
73fbfcad 3627 if (dump_enabled_p ())
e645e942 3628 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
78c60e3d 3629 "evolution of offset is not affine.\n");
ebfd146a
IR
3630 return false;
3631 }
3632
3633 outer_init = ssize_int (pbitpos / BITS_PER_UNIT);
3634 split_constant_offset (base_iv.base, &base_iv.base, &dinit);
3635 outer_init = size_binop (PLUS_EXPR, outer_init, dinit);
3636 split_constant_offset (offset_iv.base, &offset_iv.base, &dinit);
3637 outer_init = size_binop (PLUS_EXPR, outer_init, dinit);
3638
3639 outer_step = size_binop (PLUS_EXPR,
3640 fold_convert (ssizetype, base_iv.step),
3641 fold_convert (ssizetype, offset_iv.step));
3642
3643 STMT_VINFO_DR_STEP (stmt_info) = outer_step;
3644 /* FIXME: Use canonicalize_base_object_address (base_iv.base); */
b8698a0f 3645 STMT_VINFO_DR_BASE_ADDRESS (stmt_info) = base_iv.base;
ebfd146a 3646 STMT_VINFO_DR_INIT (stmt_info) = outer_init;
b8698a0f 3647 STMT_VINFO_DR_OFFSET (stmt_info) =
ebfd146a 3648 fold_convert (ssizetype, offset_iv.base);
b8698a0f 3649 STMT_VINFO_DR_ALIGNED_TO (stmt_info) =
ebfd146a
IR
3650 size_int (highest_pow2_factor (offset_iv.base));
3651
73fbfcad 3652 if (dump_enabled_p ())
ebfd146a 3653 {
78c60e3d
SS
3654 dump_printf_loc (MSG_NOTE, vect_location,
3655 "\touter base_address: ");
3656 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3657 STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
3658 dump_printf (MSG_NOTE, "\n\touter offset from base address: ");
3659 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3660 STMT_VINFO_DR_OFFSET (stmt_info));
3661 dump_printf (MSG_NOTE,
3662 "\n\touter constant offset from base address: ");
3663 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3664 STMT_VINFO_DR_INIT (stmt_info));
3665 dump_printf (MSG_NOTE, "\n\touter step: ");
3666 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3667 STMT_VINFO_DR_STEP (stmt_info));
3668 dump_printf (MSG_NOTE, "\n\touter aligned to: ");
3669 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3670 STMT_VINFO_DR_ALIGNED_TO (stmt_info));
e645e942 3671 dump_printf (MSG_NOTE, "\n");
ebfd146a
IR
3672 }
3673 }
3674
3675 if (STMT_VINFO_DATA_REF (stmt_info))
3676 {
73fbfcad 3677 if (dump_enabled_p ())
ebfd146a 3678 {
78c60e3d
SS
3679 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3680 "not vectorized: more than one data ref "
3681 "in stmt: ");
3682 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
e645e942 3683 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
ebfd146a 3684 }
ba65ae42
IR
3685
3686 if (bb_vinfo)
fcac74a1 3687 break;
ba65ae42 3688
74bf76ed 3689 if (gather || simd_lane_access)
aec7ae7d 3690 free_data_ref (dr);
ebfd146a
IR
3691 return false;
3692 }
8644a673 3693
ebfd146a 3694 STMT_VINFO_DATA_REF (stmt_info) = dr;
74bf76ed
JJ
3695 if (simd_lane_access)
3696 {
3697 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) = true;
d3ef8c53 3698 free_data_ref (datarefs[i]);
74bf76ed
JJ
3699 datarefs[i] = dr;
3700 }
b8698a0f 3701
ebfd146a
IR
3702 /* Set vectype for STMT. */
3703 scalar_type = TREE_TYPE (DR_REF (dr));
d3ef8c53
JJ
3704 STMT_VINFO_VECTYPE (stmt_info)
3705 = get_vectype_for_scalar_type (scalar_type);
b8698a0f 3706 if (!STMT_VINFO_VECTYPE (stmt_info))
ebfd146a 3707 {
73fbfcad 3708 if (dump_enabled_p ())
ebfd146a 3709 {
e645e942 3710 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
78c60e3d
SS
3711 "not vectorized: no vectype for stmt: ");
3712 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3713 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
3714 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
3715 scalar_type);
e645e942 3716 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
ebfd146a 3717 }
4b5caab7
IR
3718
3719 if (bb_vinfo)
fcac74a1 3720 break;
aec7ae7d 3721
74bf76ed 3722 if (gather || simd_lane_access)
aec7ae7d
JJ
3723 {
3724 STMT_VINFO_DATA_REF (stmt_info) = NULL;
d3ef8c53
JJ
3725 if (gather)
3726 free_data_ref (dr);
aec7ae7d
JJ
3727 }
3728 return false;
ebfd146a 3729 }
451dabda
RB
3730 else
3731 {
3732 if (dump_enabled_p ())
3733 {
3734 dump_printf_loc (MSG_NOTE, vect_location,
3735 "got vectype for stmt: ");
3736 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
3737 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3738 STMT_VINFO_VECTYPE (stmt_info));
e645e942 3739 dump_printf (MSG_NOTE, "\n");
451dabda
RB
3740 }
3741 }
777e1f09
RG
3742
3743 /* Adjust the minimal vectorization factor according to the
3744 vector type. */
3745 vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
3746 if (vf > *min_vf)
3747 *min_vf = vf;
aec7ae7d
JJ
3748
3749 if (gather)
3750 {
aec7ae7d 3751 tree off;
aec7ae7d 3752
7d75abc8
MM
3753 gather = 0 != vect_check_gather (stmt, loop_vinfo, NULL, &off, NULL);
3754 if (gather
3755 && get_vectype_for_scalar_type (TREE_TYPE (off)) == NULL_TREE)
3756 gather = false;
319e6439 3757 if (!gather)
aec7ae7d 3758 {
6f723d33
JJ
3759 STMT_VINFO_DATA_REF (stmt_info) = NULL;
3760 free_data_ref (dr);
73fbfcad 3761 if (dump_enabled_p ())
aec7ae7d 3762 {
78c60e3d
SS
3763 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3764 "not vectorized: not suitable for gather "
3765 "load ");
3766 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
e645e942 3767 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
aec7ae7d
JJ
3768 }
3769 return false;
3770 }
3771
9771b263 3772 datarefs[i] = dr;
319e6439
RG
3773 STMT_VINFO_GATHER_P (stmt_info) = true;
3774 }
3775 else if (loop_vinfo
3776 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
3777 {
51a905b2
RB
3778 if (nested_in_vect_loop_p (loop, stmt)
3779 || !DR_IS_READ (dr))
319e6439 3780 {
73fbfcad 3781 if (dump_enabled_p ())
319e6439 3782 {
78c60e3d
SS
3783 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3784 "not vectorized: not suitable for strided "
3785 "load ");
3786 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
e645e942 3787 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
319e6439
RG
3788 }
3789 return false;
3790 }
3791 STMT_VINFO_STRIDE_LOAD_P (stmt_info) = true;
aec7ae7d 3792 }
ebfd146a 3793 }
b8698a0f 3794
fcac74a1
RB
3795 /* If we stopped analysis at the first dataref we could not analyze
3796 when trying to vectorize a basic-block, mark the rest of the datarefs
3797 as not vectorizable and truncate the vector of datarefs. That
3798 avoids spending useless time in analyzing their dependence. */
3799 if (i != datarefs.length ())
3800 {
3801 gcc_assert (bb_vinfo != NULL);
3802 for (unsigned j = i; j < datarefs.length (); ++j)
3803 {
3804 data_reference_p dr = datarefs[j];
3805 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
3806 free_data_ref (dr);
3807 }
3808 datarefs.truncate (i);
3809 }
3810
ebfd146a
IR
3811 return true;
3812}
3813
3814
3815/* Function vect_get_new_vect_var.
3816
ff802fa1 3817 Returns a name for a new variable. The current naming scheme appends the
b8698a0f
L
3818 prefix "vect", "stmp" or "vectp" (depending on the value of VAR_KIND) to
3819 the name of vectorizer generated variables, and appends that to NAME if
ebfd146a
IR
3820 provided. */
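/* E.g. (illustrative): vect_get_new_vect_var (vectype, vect_simple_var, "x")
   returns a temporary whose name starts with "vect_x", vect_pointer_var
   gives one starting with "vectp_x", and with NAME omitted just the bare
   prefix is used.  */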
3821
3822tree
3823vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
3824{
3825 const char *prefix;
3826 tree new_vect_var;
3827
3828 switch (var_kind)
3829 {
3830 case vect_simple_var:
451dabda 3831 prefix = "vect";
ebfd146a
IR
3832 break;
3833 case vect_scalar_var:
451dabda 3834 prefix = "stmp";
ebfd146a
IR
3835 break;
3836 case vect_pointer_var:
451dabda 3837 prefix = "vectp";
ebfd146a
IR
3838 break;
3839 default:
3840 gcc_unreachable ();
3841 }
3842
3843 if (name)
3844 {
451dabda 3845 char* tmp = concat (prefix, "_", name, NULL);
65876d24 3846 new_vect_var = create_tmp_reg (type, tmp);
ebfd146a
IR
3847 free (tmp);
3848 }
3849 else
65876d24 3850 new_vect_var = create_tmp_reg (type, prefix);
ebfd146a
IR
3851
3852 return new_vect_var;
3853}
3854
3855
3856/* Function vect_create_addr_base_for_vector_ref.
3857
3858 Create an expression that computes the address of the first memory location
3859 that will be accessed for a data reference.
3860
3861 Input:
3862 STMT: The statement containing the data reference.
3863 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
3864 OFFSET: Optional. If supplied, it is added to the initial address.
3865 LOOP: Specify the loop-nest relative to which the address should be computed.
3866 For example, when the dataref is in an inner-loop nested in an
3867 outer-loop that is now being vectorized, LOOP can be either the
ff802fa1 3868 outer-loop, or the inner-loop. The first memory location accessed
ebfd146a
IR
3869 by the following dataref ('in' points to short):
3870
3871 for (i=0; i<N; i++)
3872 for (j=0; j<M; j++)
3873 s += in[i+j]
3874
3875 is as follows:
3876 if LOOP=i_loop: &in (relative to i_loop)
3877 if LOOP=j_loop: &in+i*2B (relative to j_loop)
356bbc4c
JJ
3878 BYTE_OFFSET: Optional, defaulted to NULL. If supplied, it is added to the
3879 initial address. Unlike OFFSET, which is a number of elements to
3880 be added, BYTE_OFFSET is measured in bytes.
ebfd146a
IR
3881
3882 Output:
b8698a0f 3883 1. Return an SSA_NAME whose value is the address of the memory location of
ebfd146a
IR
3884 the first vector of the data reference.
3885 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
3886 these statement(s) which define the returned SSA_NAME.
3887
3888 FORNOW: We are only handling array accesses with step 1. */
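/* As a rough sketch (hypothetical dataref A[i] with 4-byte elements), the
   code below emits into NEW_STMT_LIST something equivalent to

     base_offset = DR_OFFSET + DR_INIT;      (constant/invariant part)
     base_offset += OFFSET * 4;              (if OFFSET is given)
     base_offset += BYTE_OFFSET;             (if BYTE_OFFSET is given)
     addr_base = (vectype *) (DR_BASE_ADDRESS + base_offset);

   and returns ADDR_BASE as an SSA_NAME after gimplification.  */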
3889
3890tree
3891vect_create_addr_base_for_vector_ref (gimple stmt,
3892 gimple_seq *new_stmt_list,
3893 tree offset,
356bbc4c
JJ
3894 struct loop *loop,
3895 tree byte_offset)
ebfd146a
IR
3896{
3897 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3898 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4bdd44c4 3899 tree data_ref_base;
595c2679 3900 const char *base_name;
4bdd44c4 3901 tree addr_base;
ebfd146a
IR
3902 tree dest;
3903 gimple_seq seq = NULL;
4bdd44c4
RB
3904 tree base_offset;
3905 tree init;
8644a673 3906 tree vect_ptr_type;
ebfd146a 3907 tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
a70d6342 3908 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
ebfd146a 3909
a70d6342 3910 if (loop_vinfo && loop && loop != (gimple_bb (stmt))->loop_father)
ebfd146a 3911 {
a70d6342 3912 struct loop *outer_loop = LOOP_VINFO_LOOP (loop_vinfo);
ebfd146a 3913
a70d6342 3914 gcc_assert (nested_in_vect_loop_p (outer_loop, stmt));
ebfd146a
IR
3915
3916 data_ref_base = unshare_expr (STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
3917 base_offset = unshare_expr (STMT_VINFO_DR_OFFSET (stmt_info));
3918 init = unshare_expr (STMT_VINFO_DR_INIT (stmt_info));
3919 }
4bdd44c4
RB
3920 else
3921 {
3922 data_ref_base = unshare_expr (DR_BASE_ADDRESS (dr));
3923 base_offset = unshare_expr (DR_OFFSET (dr));
3924 init = unshare_expr (DR_INIT (dr));
3925 }
ebfd146a 3926
a70d6342 3927 if (loop_vinfo)
595c2679 3928 base_name = get_name (data_ref_base);
a70d6342
IR
3929 else
3930 {
3931 base_offset = ssize_int (0);
3932 init = ssize_int (0);
595c2679 3933 base_name = get_name (DR_REF (dr));
b8698a0f 3934 }
a70d6342 3935
ebfd146a
IR
3936 /* Create base_offset */
3937 base_offset = size_binop (PLUS_EXPR,
3938 fold_convert (sizetype, base_offset),
3939 fold_convert (sizetype, init));
ebfd146a
IR
3940
3941 if (offset)
3942 {
ebfd146a
IR
3943 offset = fold_build2 (MULT_EXPR, sizetype,
3944 fold_convert (sizetype, offset), step);
3945 base_offset = fold_build2 (PLUS_EXPR, sizetype,
3946 base_offset, offset);
ebfd146a 3947 }
356bbc4c
JJ
3948 if (byte_offset)
3949 {
3950 byte_offset = fold_convert (sizetype, byte_offset);
3951 base_offset = fold_build2 (PLUS_EXPR, sizetype,
3952 base_offset, byte_offset);
3953 }
ebfd146a
IR
3954
3955 /* base + base_offset */
a70d6342 3956 if (loop_vinfo)
5d49b6a7 3957 addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
a70d6342
IR
3958 else
3959 {
70f34814
RG
3960 addr_base = build1 (ADDR_EXPR,
3961 build_pointer_type (TREE_TYPE (DR_REF (dr))),
3962 unshare_expr (DR_REF (dr)));
a70d6342 3963 }
b8698a0f 3964
ebfd146a 3965 vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
4bdd44c4
RB
3966 addr_base = fold_convert (vect_ptr_type, addr_base);
3967 dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
3968 addr_base = force_gimple_operand (addr_base, &seq, false, dest);
ebfd146a
IR
3969 gimple_seq_add_seq (new_stmt_list, seq);
3970
17fc049f 3971 if (DR_PTR_INFO (dr)
4bdd44c4 3972 && TREE_CODE (addr_base) == SSA_NAME)
128aaeed 3973 {
4bdd44c4 3974 duplicate_ssa_name_ptr_info (addr_base, DR_PTR_INFO (dr));
417dfefb
GW
3975 unsigned int align = TYPE_ALIGN_UNIT (STMT_VINFO_VECTYPE (stmt_info));
3976 int misalign = DR_MISALIGNMENT (dr);
3977 if (offset || byte_offset || (misalign == -1))
4bdd44c4 3978 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
417dfefb
GW
3979 else
3980 set_ptr_info_alignment (SSA_NAME_PTR_INFO (addr_base), align, misalign);
128aaeed 3981 }
17fc049f 3982
73fbfcad 3983 if (dump_enabled_p ())
ebfd146a 3984 {
78c60e3d 3985 dump_printf_loc (MSG_NOTE, vect_location, "created ");
4bdd44c4 3986 dump_generic_expr (MSG_NOTE, TDF_SLIM, addr_base);
e645e942 3987 dump_printf (MSG_NOTE, "\n");
ebfd146a 3988 }
8644a673 3989
4bdd44c4 3990 return addr_base;
ebfd146a
IR
3991}
3992
3993
3994/* Function vect_create_data_ref_ptr.
3995
920e8172
RS
3996 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
3997 location accessed in the loop by STMT, along with the def-use update
3998 chain to appropriately advance the pointer through the loop iterations.
3999 Also set aliasing information for the pointer. This pointer is used by
4000 the callers to this function to create a memory reference expression for
4001 vector load/store access.
ebfd146a
IR
4002
4003 Input:
4004 1. STMT: a stmt that references memory. Expected to be of the form
4005 GIMPLE_ASSIGN <name, data-ref> or
4006 GIMPLE_ASSIGN <data-ref, name>.
920e8172
RS
4007 2. AGGR_TYPE: the type of the reference, which should be either a vector
4008 or an array.
4009 3. AT_LOOP: the loop where the vector memref is to be created.
4010 4. OFFSET (optional): an offset to be added to the initial address accessed
ebfd146a 4011 by the data-ref in STMT.
920e8172
RS
4012 5. BSI: location where the new stmts are to be placed if there is no loop
4013 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
ebfd146a 4014 pointing to the initial address.
356bbc4c
JJ
4015 7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
4016 to the initial address accessed by the data-ref in STMT. This is
4017 similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
4018 in bytes.
ebfd146a
IR
4019
4020 Output:
4021 1. Declare a new ptr to vector_type, and have it point to the base of the
4022 data reference (initial address accessed by the data reference).
4023 For example, for vector of type V8HI, the following code is generated:
4024
920e8172
RS
4025 v8hi *ap;
4026 ap = (v8hi *)initial_address;
ebfd146a
IR
4027
4028 if OFFSET is not supplied:
4029 initial_address = &a[init];
4030 if OFFSET is supplied:
4031 initial_address = &a[init + OFFSET];
356bbc4c
JJ
4032 if BYTE_OFFSET is supplied:
4033 initial_address = &a[init] + BYTE_OFFSET;
ebfd146a
IR
4034
4035 Return the initial_address in INITIAL_ADDRESS.
4036
4037 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
b8698a0f 4038 update the pointer in each iteration of the loop.
ebfd146a
IR
4039
4040 Return the increment stmt that updates the pointer in PTR_INCR.
4041
b8698a0f 4042 3. Set INV_P to true if the access pattern of the data reference in the
ff802fa1 4043 vectorized loop is invariant. Set it to false otherwise.
ebfd146a
IR
4044
4045 4. Return the pointer. */
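/* A sketch of typical use by a caller (simplified, hypothetical variable
   names): the returned pointer feeds directly into the vector memory
   reference, e.g.

     aptr = vect_create_data_ref_ptr (stmt, vectype, loop, NULL_TREE,
                                      &init_addr, gsi, &ptr_incr,
                                      false, &inv_p);
     data_ref = build2 (MEM_REF, vectype, aptr,
                        build_int_cst (ref_type, 0));

   with PTR_INCR later adjusted through bump_vector_ptr.  */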
4046
4047tree
920e8172
RS
4048vect_create_data_ref_ptr (gimple stmt, tree aggr_type, struct loop *at_loop,
4049 tree offset, tree *initial_address,
4050 gimple_stmt_iterator *gsi, gimple *ptr_incr,
356bbc4c 4051 bool only_init, bool *inv_p, tree byte_offset)
ebfd146a 4052{
595c2679 4053 const char *base_name;
ebfd146a
IR
4054 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4055 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
a70d6342
IR
4056 struct loop *loop = NULL;
4057 bool nested_in_vect_loop = false;
4058 struct loop *containing_loop = NULL;
920e8172
RS
4059 tree aggr_ptr_type;
4060 tree aggr_ptr;
ebfd146a
IR
4061 tree new_temp;
4062 gimple vec_stmt;
4063 gimple_seq new_stmt_list = NULL;
a70d6342 4064 edge pe = NULL;
ebfd146a 4065 basic_block new_bb;
920e8172 4066 tree aggr_ptr_init;
ebfd146a 4067 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
920e8172 4068 tree aptr;
ebfd146a
IR
4069 gimple_stmt_iterator incr_gsi;
4070 bool insert_after;
4071 tree indx_before_incr, indx_after_incr;
4072 gimple incr;
4073 tree step;
a70d6342 4074 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
b8698a0f 4075
920e8172
RS
4076 gcc_assert (TREE_CODE (aggr_type) == ARRAY_TYPE
4077 || TREE_CODE (aggr_type) == VECTOR_TYPE);
4078
a70d6342
IR
4079 if (loop_vinfo)
4080 {
4081 loop = LOOP_VINFO_LOOP (loop_vinfo);
4082 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4083 containing_loop = (gimple_bb (stmt))->loop_father;
4084 pe = loop_preheader_edge (loop);
4085 }
4086 else
4087 {
4088 gcc_assert (bb_vinfo);
4089 only_init = true;
4090 *ptr_incr = NULL;
4091 }
b8698a0f 4092
ebfd146a
IR
4093 /* Check the step (evolution) of the load in LOOP, and record
4094 whether it's invariant. */
4095 if (nested_in_vect_loop)
4096 step = STMT_VINFO_DR_STEP (stmt_info);
4097 else
4098 step = DR_STEP (STMT_VINFO_DATA_REF (stmt_info));
b8698a0f 4099
08940f33 4100 if (integer_zerop (step))
ebfd146a
IR
4101 *inv_p = true;
4102 else
4103 *inv_p = false;
4104
4105 /* Create an expression for the first address accessed by this load
b8698a0f 4106 in LOOP. */
595c2679 4107 base_name = get_name (DR_BASE_ADDRESS (dr));
ebfd146a 4108
73fbfcad 4109 if (dump_enabled_p ())
ebfd146a 4110 {
595c2679 4111 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
78c60e3d
SS
4112 dump_printf_loc (MSG_NOTE, vect_location,
4113 "create %s-pointer variable to type: ",
5806f481 4114 get_tree_code_name (TREE_CODE (aggr_type)));
78c60e3d 4115 dump_generic_expr (MSG_NOTE, TDF_SLIM, aggr_type);
595c2679 4116 if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
78c60e3d 4117 dump_printf (MSG_NOTE, " vectorizing an array ref: ");
38000232
MG
4118 else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4119 dump_printf (MSG_NOTE, " vectorizing a vector ref: ");
595c2679 4120 else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
78c60e3d 4121 dump_printf (MSG_NOTE, " vectorizing a record based array ref: ");
595c2679 4122 else
78c60e3d 4123 dump_printf (MSG_NOTE, " vectorizing a pointer ref: ");
595c2679 4124 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_BASE_OBJECT (dr));
e645e942 4125 dump_printf (MSG_NOTE, "\n");
ebfd146a
IR
4126 }
4127
4bdd44c4
RB
4128 /* (1) Create the new aggregate-pointer variable.
4129 Vector and array types inherit the alias set of their component
920e8172
RS
4130 type by default so we need to use a ref-all pointer if the data
4131 reference does not conflict with the created aggregated data
4132 reference because it is not addressable. */
4bdd44c4
RB
4133 bool need_ref_all = false;
4134 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
3f49ba3f 4135 get_alias_set (DR_REF (dr))))
4bdd44c4 4136 need_ref_all = true;
3f49ba3f 4137 /* Likewise for any of the data references in the stmt group. */
e14c1050 4138 else if (STMT_VINFO_GROUP_SIZE (stmt_info) > 1)
ebfd146a 4139 {
e14c1050 4140 gimple orig_stmt = STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info);
5006671f
RG
4141 do
4142 {
4bdd44c4
RB
4143 stmt_vec_info sinfo = vinfo_for_stmt (orig_stmt);
4144 struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
4145 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4146 get_alias_set (DR_REF (sdr))))
5006671f 4147 {
4bdd44c4 4148 need_ref_all = true;
5006671f
RG
4149 break;
4150 }
4bdd44c4 4151 orig_stmt = STMT_VINFO_GROUP_NEXT_ELEMENT (sinfo);
5006671f
RG
4152 }
4153 while (orig_stmt);
ebfd146a 4154 }
4bdd44c4
RB
4155 aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
4156 need_ref_all);
4157 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
4158
ebfd146a 4159
ff802fa1
IR
4160 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4161 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4162 def-use update cycles for the pointer: one relative to the outer-loop
4163 (LOOP), which is what steps (3) and (4) below do. The other is relative
4164 to the inner-loop (which is the inner-most loop containing the dataref),
4165 and this is done by step (5) below.
ebfd146a 4166
ff802fa1
IR
4167 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4168 inner-most loop, and so steps (3),(4) work the same, and step (5) is
4169 redundant. Steps (3),(4) create the following:
ebfd146a
IR
4170
4171 vp0 = &base_addr;
4172 LOOP: vp1 = phi(vp0,vp2)
b8698a0f 4173 ...
ebfd146a
IR
4174 ...
4175 vp2 = vp1 + step
4176 goto LOOP
b8698a0f 4177
ff802fa1
IR
4178 If there is an inner-loop nested in loop, then step (5) will also be
4179 applied, and an additional update in the inner-loop will be created:
ebfd146a
IR
4180
4181 vp0 = &base_addr;
4182 LOOP: vp1 = phi(vp0,vp2)
4183 ...
4184 inner: vp3 = phi(vp1,vp4)
4185 vp4 = vp3 + inner_step
4186 if () goto inner
4187 ...
4188 vp2 = vp1 + step
4189 if () goto LOOP */
4190
920e8172
RS
4191 /* (2) Calculate the initial address of the aggregate-pointer, and set
4192 the aggregate-pointer to point to it before the loop. */
ebfd146a 4193
356bbc4c 4194 /* Create: (&(base[init_val+offset]+byte_offset) in the loop preheader. */
ebfd146a
IR
4195
4196 new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
356bbc4c 4197 offset, loop, byte_offset);
ebfd146a
IR
4198 if (new_stmt_list)
4199 {
a70d6342
IR
4200 if (pe)
4201 {
4202 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
4203 gcc_assert (!new_bb);
4204 }
4205 else
1b29f05e 4206 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
ebfd146a
IR
4207 }
4208
4209 *initial_address = new_temp;
4210
920e8172 4211 /* Create: p = (aggr_type *) initial_base */
17fc049f 4212 if (TREE_CODE (new_temp) != SSA_NAME
920e8172 4213 || !useless_type_conversion_p (aggr_ptr_type, TREE_TYPE (new_temp)))
a70d6342 4214 {
920e8172
RS
4215 vec_stmt = gimple_build_assign (aggr_ptr,
4216 fold_convert (aggr_ptr_type, new_temp));
4217 aggr_ptr_init = make_ssa_name (aggr_ptr, vec_stmt);
17fc049f
RG
4218 /* Copy the points-to information if it exists. */
4219 if (DR_PTR_INFO (dr))
920e8172
RS
4220 duplicate_ssa_name_ptr_info (aggr_ptr_init, DR_PTR_INFO (dr));
4221 gimple_assign_set_lhs (vec_stmt, aggr_ptr_init);
17fc049f
RG
4222 if (pe)
4223 {
4224 new_bb = gsi_insert_on_edge_immediate (pe, vec_stmt);
4225 gcc_assert (!new_bb);
4226 }
4227 else
1b29f05e 4228 gsi_insert_before (gsi, vec_stmt, GSI_SAME_STMT);
a70d6342
IR
4229 }
4230 else
920e8172 4231 aggr_ptr_init = new_temp;
ebfd146a 4232
920e8172 4233 /* (3) Handle the updating of the aggregate-pointer inside the loop.
ff802fa1
IR
4234 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4235 inner-loop nested in LOOP (during outer-loop vectorization). */
ebfd146a 4236
a70d6342 4237 /* No update in loop is required. */
b8698a0f 4238 if (only_init && (!loop_vinfo || at_loop == loop))
920e8172 4239 aptr = aggr_ptr_init;
ebfd146a
IR
4240 else
4241 {
920e8172 4242 /* The step of the aggregate pointer is the type size. */
08940f33 4243 tree iv_step = TYPE_SIZE_UNIT (aggr_type);
b8698a0f 4244 /* One exception to the above is when the scalar step of the load in
ebfd146a
IR
4245 LOOP is zero. In this case the step here is also zero. */
4246 if (*inv_p)
08940f33
RB
4247 iv_step = size_zero_node;
4248 else if (tree_int_cst_sgn (step) == -1)
4249 iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
ebfd146a
IR
4250
4251 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4252
920e8172 4253 create_iv (aggr_ptr_init,
08940f33 4254 fold_convert (aggr_ptr_type, iv_step),
920e8172 4255 aggr_ptr, loop, &incr_gsi, insert_after,
ebfd146a
IR
4256 &indx_before_incr, &indx_after_incr);
4257 incr = gsi_stmt (incr_gsi);
a70d6342 4258 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo, NULL));
ebfd146a
IR
4259
4260 /* Copy the points-to information if it exists. */
4261 if (DR_PTR_INFO (dr))
4262 {
4263 duplicate_ssa_name_ptr_info (indx_before_incr, DR_PTR_INFO (dr));
4264 duplicate_ssa_name_ptr_info (indx_after_incr, DR_PTR_INFO (dr));
4265 }
ebfd146a
IR
4266 if (ptr_incr)
4267 *ptr_incr = incr;
4268
920e8172 4269 aptr = indx_before_incr;
ebfd146a
IR
4270 }
4271
4272 if (!nested_in_vect_loop || only_init)
920e8172 4273 return aptr;
ebfd146a
IR
4274
4275
920e8172 4276 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
ff802fa1 4277 nested in LOOP, if it exists. */
ebfd146a
IR
4278
4279 gcc_assert (nested_in_vect_loop);
4280 if (!only_init)
4281 {
4282 standard_iv_increment_position (containing_loop, &incr_gsi,
4283 &insert_after);
920e8172 4284 create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
ebfd146a
IR
4285 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
4286 &indx_after_incr);
4287 incr = gsi_stmt (incr_gsi);
a70d6342 4288 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo, NULL));
ebfd146a
IR
4289
4290 /* Copy the points-to information if it exists. */
4291 if (DR_PTR_INFO (dr))
4292 {
4293 duplicate_ssa_name_ptr_info (indx_before_incr, DR_PTR_INFO (dr));
4294 duplicate_ssa_name_ptr_info (indx_after_incr, DR_PTR_INFO (dr));
4295 }
ebfd146a
IR
4296 if (ptr_incr)
4297 *ptr_incr = incr;
4298
b8698a0f 4299 return indx_before_incr;
ebfd146a
IR
4300 }
4301 else
4302 gcc_unreachable ();
4303}
4304
4305
4306/* Function bump_vector_ptr
4307
4308 Increment a pointer (to a vector type) by vector-size. If requested,
b8698a0f 4309 i.e. if PTR-INCR is given, then also connect the new increment stmt
ebfd146a
IR
4310 to the existing def-use update-chain of the pointer, by modifying
4311 the PTR_INCR as illustrated below:
4312
4313 The pointer def-use update-chain before this function:
4314 DATAREF_PTR = phi (p_0, p_2)
4315 ....
b8698a0f 4316 PTR_INCR: p_2 = DATAREF_PTR + step
ebfd146a
IR
4317
4318 The pointer def-use update-chain after this function:
4319 DATAREF_PTR = phi (p_0, p_2)
4320 ....
4321 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4322 ....
4323 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
4324
4325 Input:
b8698a0f 4326 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
ebfd146a 4327 in the loop.
b8698a0f 4328 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
ebfd146a 4329 the loop. The increment amount across iterations is expected
b8698a0f 4330 to be vector_size.
ebfd146a
IR
4331 BSI - location where the new update stmt is to be placed.
4332 STMT - the original scalar memory-access stmt that is being vectorized.
4333 BUMP - optional. The offset by which to bump the pointer. If not given,
4334 the offset is assumed to be vector_size.
4335
4336 Output: Return NEW_DATAREF_PTR as illustrated above.
b8698a0f 4337
ebfd146a
IR
4338*/
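/* E.g. (illustrative): for a V8HI access and no BUMP argument, UPDATE is
   TYPE_SIZE_UNIT (V8HI), i.e. 16 bytes, so the new stmt is simply
   NEW_DATAREF_PTR = DATAREF_PTR p+ 16.  */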
4339
4340tree
4341bump_vector_ptr (tree dataref_ptr, gimple ptr_incr, gimple_stmt_iterator *gsi,
4342 gimple stmt, tree bump)
4343{
4344 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4345 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4346 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
ebfd146a 4347 tree update = TYPE_SIZE_UNIT (vectype);
538dd0b7 4348 gassign *incr_stmt;
ebfd146a
IR
4349 ssa_op_iter iter;
4350 use_operand_p use_p;
4351 tree new_dataref_ptr;
4352
4353 if (bump)
4354 update = bump;
b8698a0f 4355
b731b390 4356 new_dataref_ptr = copy_ssa_name (dataref_ptr);
0d0e4a03
JJ
4357 incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
4358 dataref_ptr, update);
ebfd146a
IR
4359 vect_finish_stmt_generation (stmt, incr_stmt, gsi);
4360
4361 /* Copy the points-to information if it exists. */
4362 if (DR_PTR_INFO (dr))
128aaeed
RB
4363 {
4364 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
644ffefd 4365 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
128aaeed 4366 }
ebfd146a
IR
4367
4368 if (!ptr_incr)
4369 return new_dataref_ptr;
4370
4371 /* Update the vector-pointer's cross-iteration increment. */
4372 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
4373 {
4374 tree use = USE_FROM_PTR (use_p);
4375
4376 if (use == dataref_ptr)
4377 SET_USE (use_p, new_dataref_ptr);
4378 else
4379 gcc_assert (tree_int_cst_compare (use, update) == 0);
4380 }
4381
4382 return new_dataref_ptr;
4383}
4384
4385
4386/* Function vect_create_destination_var.
4387
4388 Create a new temporary of type VECTYPE. */
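/* E.g. (illustrative): for a scalar destination x_5 and a V4SI VECTYPE,
   this yields a new temporary named along the lines of vect_x_5; with a
   NULL VECTYPE the "stmp" prefix is used instead.  */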
4389
4390tree
4391vect_create_destination_var (tree scalar_dest, tree vectype)
4392{
4393 tree vec_dest;
451dabda
RB
4394 const char *name;
4395 char *new_name;
ebfd146a
IR
4396 tree type;
4397 enum vect_var_kind kind;
4398
4399 kind = vectype ? vect_simple_var : vect_scalar_var;
4400 type = vectype ? vectype : TREE_TYPE (scalar_dest);
4401
4402 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
4403
451dabda
RB
4404 name = get_name (scalar_dest);
4405 if (name)
378b2932 4406 new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
451dabda 4407 else
378b2932 4408 new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
ebfd146a 4409 vec_dest = vect_get_new_vect_var (type, kind, new_name);
451dabda 4410 free (new_name);
ebfd146a
IR
4411
4412 return vec_dest;
4413}
4414
0d0293ac 4415/* Function vect_grouped_store_supported.
ebfd146a 4416
e2c83630
RH
4417 Returns TRUE if interleave high and interleave low permutations
4418 are supported, and FALSE otherwise. */
ebfd146a
IR
4419
4420bool
0d0293ac 4421vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
ebfd146a 4422{
ef4bddc2 4423 machine_mode mode = TYPE_MODE (vectype);
b8698a0f 4424
e1377713
ES
4425 /* vect_permute_store_chain requires the group size to be equal to 3 or
4426 be a power of two. */
4427 if (count != 3 && exact_log2 (count) == -1)
b602d918 4428 {
73fbfcad 4429 if (dump_enabled_p ())
78c60e3d 4430 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
e1377713
ES
4431 "the size of the group of accesses"
4432 " is not a power of 2 or not equal to 3\n");
b602d918
RS
4433 return false;
4434 }
4435
e2c83630 4436 /* Check that the permutation is supported. */
3fcc1b55
JJ
4437 if (VECTOR_MODE_P (mode))
4438 {
4439 unsigned int i, nelt = GET_MODE_NUNITS (mode);
4440 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
e1377713
ES
4441
4442 if (count == 3)
3fcc1b55 4443 {
e1377713
ES
4444 unsigned int j0 = 0, j1 = 0, j2 = 0;
4445 unsigned int i, j;
4446
4447 for (j = 0; j < 3; j++)
4448 {
4449 int nelt0 = ((3 - j) * nelt) % 3;
4450 int nelt1 = ((3 - j) * nelt + 1) % 3;
4451 int nelt2 = ((3 - j) * nelt + 2) % 3;
4452 for (i = 0; i < nelt; i++)
4453 {
4454 if (3 * i + nelt0 < nelt)
4455 sel[3 * i + nelt0] = j0++;
4456 if (3 * i + nelt1 < nelt)
4457 sel[3 * i + nelt1] = nelt + j1++;
4458 if (3 * i + nelt2 < nelt)
4459 sel[3 * i + nelt2] = 0;
4460 }
4461 if (!can_vec_perm_p (mode, false, sel))
4462 {
4463 if (dump_enabled_p ())
4464 dump_printf (MSG_MISSED_OPTIMIZATION,
4465 "permutation op not supported by target.\n");
4466 return false;
4467 }
4468
4469 for (i = 0; i < nelt; i++)
4470 {
4471 if (3 * i + nelt0 < nelt)
4472 sel[3 * i + nelt0] = 3 * i + nelt0;
4473 if (3 * i + nelt1 < nelt)
4474 sel[3 * i + nelt1] = 3 * i + nelt1;
4475 if (3 * i + nelt2 < nelt)
4476 sel[3 * i + nelt2] = nelt + j2++;
4477 }
4478 if (!can_vec_perm_p (mode, false, sel))
4479 {
4480 if (dump_enabled_p ())
4481 dump_printf (MSG_MISSED_OPTIMIZATION,
4482 "permutaion op not supported by target.\n");
4483 return false;
4484 }
4485 }
4486 return true;
3fcc1b55 4487 }
e1377713 4488 else
3fcc1b55 4489 {
e1377713
ES
4490 /* If length is not equal to 3 then only power of 2 is supported. */
4491 gcc_assert (exact_log2 (count) != -1);
4492
4493 for (i = 0; i < nelt / 2; i++)
4494 {
4495 sel[i * 2] = i;
4496 sel[i * 2 + 1] = i + nelt;
4497 }
4498 if (can_vec_perm_p (mode, false, sel))
4499 {
4500 for (i = 0; i < nelt; i++)
4501 sel[i] += nelt / 2;
4502 if (can_vec_perm_p (mode, false, sel))
4503 return true;
4504 }
3fcc1b55
JJ
4505 }
4506 }
ebfd146a 4507
73fbfcad 4508 if (dump_enabled_p ())
78c60e3d 4509 dump_printf (MSG_MISSED_OPTIMIZATION,
e1377713 4510 "permutation op not supported by target.\n");
a6b3dfde 4511 return false;
ebfd146a
IR
4512}
4513
4514
272c6793
RS
4515/* Return TRUE if vec_store_lanes is available for COUNT vectors of
4516 type VECTYPE. */
4517
4518bool
4519vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
4520{
4521 return vect_lanes_optab_supported_p ("vec_store_lanes",
4522 vec_store_lanes_optab,
4523 vectype, count);
4524}
4525
4526
ebfd146a
IR
4527/* Function vect_permute_store_chain.
4528
4529 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
e1377713
ES
4530 a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
4531 the data correctly for the stores. Return the final references for stores
4532 in RESULT_CHAIN.
ebfd146a
IR
4533
4534 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
ff802fa1
IR
4535 The input is 4 vectors each containing 8 elements. We assign a number to
4536 each element; the input sequence is:
ebfd146a
IR
4537
4538 1st vec: 0 1 2 3 4 5 6 7
4539 2nd vec: 8 9 10 11 12 13 14 15
b8698a0f 4540 3rd vec: 16 17 18 19 20 21 22 23
ebfd146a
IR
4541 4th vec: 24 25 26 27 28 29 30 31
4542
4543 The output sequence should be:
4544
4545 1st vec: 0 8 16 24 1 9 17 25
4546 2nd vec: 2 10 18 26 3 11 19 27
4547 3rd vec: 4 12 20 28 5 13 21 29
4548 4th vec: 6 14 22 30 7 15 23 31
4549
4550 i.e., we interleave the contents of the four vectors in their order.
4551
ff802fa1 4552 We use interleave_high/low instructions to create such output. The input of
ebfd146a 4553 each interleave_high/low operation is two vectors:
b8698a0f
L
4554 1st vec 2nd vec
4555 0 1 2 3 4 5 6 7
4556 the even elements of the result vector are obtained left-to-right from the
ff802fa1 4557 high/low elements of the first vector. The odd elements of the result are
ebfd146a
IR
4558 obtained left-to-right from the high/low elements of the second vector.
4559 The output of interleave_high will be: 0 4 1 5
4560 and of interleave_low: 2 6 3 7
4561
b8698a0f 4562
ff802fa1 4563 The permutation is done in log LENGTH stages. In each stage interleave_high
b8698a0f
L
4564 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
4565 where the first argument is taken from the first half of DR_CHAIN and the
4566 second argument from its second half.
4567 In our example,
ebfd146a
IR
4568
4569 I1: interleave_high (1st vec, 3rd vec)
4570 I2: interleave_low (1st vec, 3rd vec)
4571 I3: interleave_high (2nd vec, 4th vec)
4572 I4: interleave_low (2nd vec, 4th vec)
4573
4574 The output for the first stage is:
4575
4576 I1: 0 16 1 17 2 18 3 19
4577 I2: 4 20 5 21 6 22 7 23
4578 I3: 8 24 9 25 10 26 11 27
4579 I4: 12 28 13 29 14 30 15 31
4580
4581 The output of the second stage, i.e. the final result is:
4582
4583 I1: 0 8 16 24 1 9 17 25
4584 I2: 2 10 18 26 3 11 19 27
4585 I3: 4 12 20 28 5 13 21 29
4586 I4: 6 14 22 30 7 15 23 31. */
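/* Concretely (illustrative, for the V8HI example above, i.e. nelt == 8),
   the power-of-2 path below uses the two VEC_PERM_EXPR masks

     interleave_high: { 0, 8, 1, 9, 2, 10, 3, 11 }
     interleave_low:  { 4, 12, 5, 13, 6, 14, 7, 15 }

   which, applied to the pairs (1st vec, 3rd vec) and (2nd vec, 4th vec),
   produce exactly the vectors I1..I4 shown above.  */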
b8698a0f 4587
b602d918 4588void
9771b263 4589vect_permute_store_chain (vec<tree> dr_chain,
b8698a0f 4590 unsigned int length,
ebfd146a
IR
4591 gimple stmt,
4592 gimple_stmt_iterator *gsi,
9771b263 4593 vec<tree> *result_chain)
ebfd146a 4594{
83d5977e 4595 tree vect1, vect2, high, low;
ebfd146a
IR
4596 gimple perm_stmt;
4597 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
3fcc1b55 4598 tree perm_mask_low, perm_mask_high;
e1377713
ES
4599 tree data_ref;
4600 tree perm3_mask_low, perm3_mask_high;
4601 unsigned int i, n, log_length = exact_log2 (length);
e2c83630 4602 unsigned int j, nelt = TYPE_VECTOR_SUBPARTS (vectype);
3fcc1b55 4603 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
b8698a0f 4604
b6b9227d
JJ
4605 result_chain->quick_grow (length);
4606 memcpy (result_chain->address (), dr_chain.address (),
4607 length * sizeof (tree));
ebfd146a 4608
e1377713 4609 if (length == 3)
3fcc1b55 4610 {
e1377713 4611 unsigned int j0 = 0, j1 = 0, j2 = 0;
e2c83630 4612
e1377713
ES
4613 for (j = 0; j < 3; j++)
4614 {
4615 int nelt0 = ((3 - j) * nelt) % 3;
4616 int nelt1 = ((3 - j) * nelt + 1) % 3;
4617 int nelt2 = ((3 - j) * nelt + 2) % 3;
3fcc1b55 4618
e1377713
ES
4619 for (i = 0; i < nelt; i++)
4620 {
4621 if (3 * i + nelt0 < nelt)
4622 sel[3 * i + nelt0] = j0++;
4623 if (3 * i + nelt1 < nelt)
4624 sel[3 * i + nelt1] = nelt + j1++;
4625 if (3 * i + nelt2 < nelt)
4626 sel[3 * i + nelt2] = 0;
4627 }
557be5a8 4628 perm3_mask_low = vect_gen_perm_mask_checked (vectype, sel);
e1377713
ES
4629
4630 for (i = 0; i < nelt; i++)
4631 {
4632 if (3 * i + nelt0 < nelt)
4633 sel[3 * i + nelt0] = 3 * i + nelt0;
4634 if (3 * i + nelt1 < nelt)
4635 sel[3 * i + nelt1] = 3 * i + nelt1;
4636 if (3 * i + nelt2 < nelt)
4637 sel[3 * i + nelt2] = nelt + j2++;
4638 }
557be5a8 4639 perm3_mask_high = vect_gen_perm_mask_checked (vectype, sel);
e1377713
ES
4640
4641 vect1 = dr_chain[0];
4642 vect2 = dr_chain[1];
ebfd146a
IR
4643
4644 /* Create interleaving stmt:
e1377713
ES
4645 low = VEC_PERM_EXPR <vect1, vect2,
4646 {j, nelt, *, j + 1, nelt + j + 1, *,
4647 j + 2, nelt + j + 2, *, ...}> */
4648 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
0d0e4a03
JJ
4649 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
4650 vect2, perm3_mask_low);
ebfd146a 4651 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
ebfd146a 4652
e1377713
ES
4653 vect1 = data_ref;
4654 vect2 = dr_chain[2];
ebfd146a 4655 /* Create interleaving stmt:
e1377713
ES
4656 low = VEC_PERM_EXPR <vect1, vect2,
4657 {0, 1, nelt + j, 3, 4, nelt + j + 1,
4658 6, 7, nelt + j + 2, ...}> */
4659 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
0d0e4a03
JJ
4660 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
4661 vect2, perm3_mask_high);
ebfd146a 4662 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
e1377713 4663 (*result_chain)[j] = data_ref;
ebfd146a 4664 }
e1377713
ES
4665 }
4666 else
4667 {
4668 /* If length is not equal to 3 then only power of 2 is supported. */
4669 gcc_assert (exact_log2 (length) != -1);
4670
4671 for (i = 0, n = nelt / 2; i < n; i++)
4672 {
4673 sel[i * 2] = i;
4674 sel[i * 2 + 1] = i + nelt;
4675 }
557be5a8 4676 perm_mask_high = vect_gen_perm_mask_checked (vectype, sel);
e1377713
ES
4677
4678 for (i = 0; i < nelt; i++)
4679 sel[i] += nelt / 2;
557be5a8 4680 perm_mask_low = vect_gen_perm_mask_checked (vectype, sel);
e1377713
ES
4681
4682 for (i = 0, n = log_length; i < n; i++)
4683 {
4684 for (j = 0; j < length/2; j++)
4685 {
4686 vect1 = dr_chain[j];
4687 vect2 = dr_chain[j+length/2];
4688
4689 /* Create interleaving stmt:
4690 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
4691 ...}> */
4692 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
0d0e4a03
JJ
4693 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
4694 vect2, perm_mask_high);
e1377713
ES
4695 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4696 (*result_chain)[2*j] = high;
4697
4698 /* Create interleaving stmt:
4699 low = VEC_PERM_EXPR <vect1, vect2,
4700 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
4701 ...}> */
4702 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
0d0e4a03
JJ
4703 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
4704 vect2, perm_mask_low);
e1377713
ES
4705 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4706 (*result_chain)[2*j+1] = low;
4707 }
4708 memcpy (dr_chain.address (), result_chain->address (),
4709 length * sizeof (tree));
4710 }
ebfd146a 4711 }
ebfd146a
IR
4712}
4713
4714/* Function vect_setup_realignment
b8698a0f 4715
ebfd146a
IR
4716 This function is called when vectorizing an unaligned load using
4717 the dr_explicit_realign[_optimized] scheme.
4718 This function generates the following code at the loop prolog:
4719
4720 p = initial_addr;
4721 x msq_init = *(floor(p)); # prolog load
b8698a0f 4722 realignment_token = call target_builtin;
ebfd146a
IR
4723 loop:
4724 x msq = phi (msq_init, ---)
4725
b8698a0f 4726 The stmts marked with x are generated only for the case of
ebfd146a
IR
4727 dr_explicit_realign_optimized.
4728
b8698a0f 4729 The code above sets up a new (vector) pointer, pointing to the first
ebfd146a
IR
4730 location accessed by STMT, and a "floor-aligned" load using that pointer.
4731 It also generates code to compute the "realignment-token" (if the relevant
4732 target hook was defined), and creates a phi-node at the loop-header bb
4733 whose arguments are the result of the prolog-load (created by this
4734 function) and the result of a load that takes place in the loop (to be
4735 created by the caller to this function).
4736
4737 For the case of dr_explicit_realign_optimized:
b8698a0f 4738 The caller to this function uses the phi-result (msq) to create the
ebfd146a
IR
4739 realignment code inside the loop, and sets up the missing phi argument,
4740 as follows:
b8698a0f 4741 loop:
ebfd146a
IR
4742 msq = phi (msq_init, lsq)
4743 lsq = *(floor(p')); # load in loop
4744 result = realign_load (msq, lsq, realignment_token);
4745
4746 For the case of dr_explicit_realign:
4747 loop:
4748 msq = *(floor(p)); # load in loop
4749 p' = p + (VS-1);
4750 lsq = *(floor(p')); # load in loop
4751 result = realign_load (msq, lsq, realignment_token);
4752
4753 Input:
4754 STMT - (scalar) load stmt to be vectorized. This load accesses
4755 a memory location that may be unaligned.
4756 BSI - place where new code is to be inserted.
4757 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
b8698a0f
L
4758 is used.
4759
ebfd146a
IR
4760 Output:
4761 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
4762 target hook, if defined.
4763 Return value - the result of the loop-header phi node. */
4764
4765tree
4766vect_setup_realignment (gimple stmt, gimple_stmt_iterator *gsi,
4767 tree *realignment_token,
4768 enum dr_alignment_support alignment_support_scheme,
4769 tree init_addr,
4770 struct loop **at_loop)
4771{
4772 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4773 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4774 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
20ede5c6 4775 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
69f11a13
IR
4776 struct loop *loop = NULL;
4777 edge pe = NULL;
ebfd146a
IR
4778 tree scalar_dest = gimple_assign_lhs (stmt);
4779 tree vec_dest;
4780 gimple inc;
4781 tree ptr;
4782 tree data_ref;
ebfd146a
IR
4783 basic_block new_bb;
4784 tree msq_init = NULL_TREE;
4785 tree new_temp;
538dd0b7 4786 gphi *phi_stmt;
ebfd146a
IR
4787 tree msq = NULL_TREE;
4788 gimple_seq stmts = NULL;
4789 bool inv_p;
4790 bool compute_in_loop = false;
69f11a13 4791 bool nested_in_vect_loop = false;
ebfd146a 4792 struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
69f11a13
IR
4793 struct loop *loop_for_initial_load = NULL;
4794
4795 if (loop_vinfo)
4796 {
4797 loop = LOOP_VINFO_LOOP (loop_vinfo);
4798 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4799 }
ebfd146a
IR
4800
4801 gcc_assert (alignment_support_scheme == dr_explicit_realign
4802 || alignment_support_scheme == dr_explicit_realign_optimized);
4803
4804 /* We need to generate three things:
4805 1. the misalignment computation
4806 2. the extra vector load (for the optimized realignment scheme).
4807 3. the phi node for the two vectors from which the realignment is
ff802fa1 4808 done (for the optimized realignment scheme). */
ebfd146a
IR
4809
4810 /* 1. Determine where to generate the misalignment computation.
4811
4812 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
4813 calculation will be generated by this function, outside the loop (in the
 4814 preheader). Otherwise, INIT_ADDR has already been computed for us by the
4815 caller, inside the loop.
4816
4817 Background: If the misalignment remains fixed throughout the iterations of
4818 the loop, then both realignment schemes are applicable, and also the
4819 misalignment computation can be done outside LOOP. This is because we are
4820 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
4821 are a multiple of VS (the Vector Size), and therefore the misalignment in
4822 different vectorized LOOP iterations is always the same.
4823 The problem arises only if the memory access is in an inner-loop nested
4824 inside LOOP, which is now being vectorized using outer-loop vectorization.
4825 This is the only case when the misalignment of the memory access may not
4826 remain fixed throughout the iterations of the inner-loop (as explained in
4827 detail in vect_supportable_dr_alignment). In this case, not only is the
4828 optimized realignment scheme not applicable, but also the misalignment
4829 computation (and generation of the realignment token that is passed to
4830 REALIGN_LOAD) have to be done inside the loop.
4831
4832 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
4833 or not, which in turn determines if the misalignment is computed inside
4834 the inner-loop, or outside LOOP. */
4835
69f11a13 4836 if (init_addr != NULL_TREE || !loop_vinfo)
ebfd146a
IR
4837 {
4838 compute_in_loop = true;
4839 gcc_assert (alignment_support_scheme == dr_explicit_realign);
4840 }
4841
4842
4843 /* 2. Determine where to generate the extra vector load.
4844
4845 For the optimized realignment scheme, instead of generating two vector
4846 loads in each iteration, we generate a single extra vector load in the
4847 preheader of the loop, and in each iteration reuse the result of the
4848 vector load from the previous iteration. In case the memory access is in
4849 an inner-loop nested inside LOOP, which is now being vectorized using
4850 outer-loop vectorization, we need to determine whether this initial vector
4851 load should be generated at the preheader of the inner-loop, or can be
4852 generated at the preheader of LOOP. If the memory access has no evolution
4853 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
4854 to be generated inside LOOP (in the preheader of the inner-loop). */
4855
4856 if (nested_in_vect_loop)
4857 {
4858 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
4859 bool invariant_in_outerloop =
4860 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
4861 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
4862 }
4863 else
4864 loop_for_initial_load = loop;
4865 if (at_loop)
4866 *at_loop = loop_for_initial_load;
4867
69f11a13
IR
4868 if (loop_for_initial_load)
4869 pe = loop_preheader_edge (loop_for_initial_load);
4870
ebfd146a
IR
4871 /* 3. For the case of the optimized realignment, create the first vector
4872 load at the loop preheader. */
4873
4874 if (alignment_support_scheme == dr_explicit_realign_optimized)
4875 {
4876 /* Create msq_init = *(floor(p1)) in the loop preheader */
538dd0b7 4877 gassign *new_stmt;
ebfd146a
IR
4878
4879 gcc_assert (!compute_in_loop);
ebfd146a 4880 vec_dest = vect_create_destination_var (scalar_dest, vectype);
920e8172
RS
4881 ptr = vect_create_data_ref_ptr (stmt, vectype, loop_for_initial_load,
4882 NULL_TREE, &init_addr, NULL, &inc,
4883 true, &inv_p);
b731b390 4884 new_temp = copy_ssa_name (ptr);
0d0e4a03
JJ
4885 new_stmt = gimple_build_assign
4886 (new_temp, BIT_AND_EXPR, ptr,
75421dcd
RG
4887 build_int_cst (TREE_TYPE (ptr),
4888 -(HOST_WIDE_INT)TYPE_ALIGN_UNIT (vectype)));
75421dcd
RG
4889 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4890 gcc_assert (!new_bb);
20ede5c6
RG
4891 data_ref
4892 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
4893 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
ebfd146a
IR
4894 new_stmt = gimple_build_assign (vec_dest, data_ref);
4895 new_temp = make_ssa_name (vec_dest, new_stmt);
4896 gimple_assign_set_lhs (new_stmt, new_temp);
69f11a13
IR
4897 if (pe)
4898 {
4899 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4900 gcc_assert (!new_bb);
4901 }
4902 else
4903 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
4904
ebfd146a
IR
4905 msq_init = gimple_assign_lhs (new_stmt);
4906 }
4907
4908 /* 4. Create realignment token using a target builtin, if available.
4909 It is done either inside the containing loop, or before LOOP (as
4910 determined above). */
4911
4912 if (targetm.vectorize.builtin_mask_for_load)
4913 {
538dd0b7 4914 gcall *new_stmt;
ebfd146a
IR
4915 tree builtin_decl;
4916
 4917 /* Compute INIT_ADDR - the initial address accessed by this memref. */
69f11a13 4918 if (!init_addr)
ebfd146a
IR
4919 {
4920 /* Generate the INIT_ADDR computation outside LOOP. */
4921 init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
4922 NULL_TREE, loop);
69f11a13
IR
4923 if (loop)
4924 {
4925 pe = loop_preheader_edge (loop);
4926 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
4927 gcc_assert (!new_bb);
4928 }
4929 else
4930 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
ebfd146a
IR
4931 }
4932
4933 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
4934 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
4935 vec_dest =
4936 vect_create_destination_var (scalar_dest,
4937 gimple_call_return_type (new_stmt));
4938 new_temp = make_ssa_name (vec_dest, new_stmt);
4939 gimple_call_set_lhs (new_stmt, new_temp);
4940
4941 if (compute_in_loop)
4942 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
4943 else
4944 {
4945 /* Generate the misalignment computation outside LOOP. */
4946 pe = loop_preheader_edge (loop);
4947 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4948 gcc_assert (!new_bb);
4949 }
4950
4951 *realignment_token = gimple_call_lhs (new_stmt);
4952
4953 /* The result of the CALL_EXPR to this builtin is determined from
4954 the value of the parameter and no global variables are touched
4955 which makes the builtin a "const" function. Requiring the
4956 builtin to have the "const" attribute makes it unnecessary
4957 to call mark_call_clobbered. */
4958 gcc_assert (TREE_READONLY (builtin_decl));
4959 }
4960
4961 if (alignment_support_scheme == dr_explicit_realign)
4962 return msq;
4963
4964 gcc_assert (!compute_in_loop);
4965 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
4966
4967
4968 /* 5. Create msq = phi <msq_init, lsq> in loop */
4969
4970 pe = loop_preheader_edge (containing_loop);
4971 vec_dest = vect_create_destination_var (scalar_dest, vectype);
b731b390 4972 msq = make_ssa_name (vec_dest);
ebfd146a 4973 phi_stmt = create_phi_node (msq, containing_loop->header);
9e227d60 4974 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
ebfd146a
IR
4975
4976 return msq;
4977}
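
/* Illustrative, standalone sketch (not part of the vectorizer) of the two
   ideas used above: flooring a pointer to an aligned address by masking
   with the negated alignment, and emulating one unaligned load from two
   aligned ("floored") loads plus a misalignment token, which is roughly
   what REALIGN_LOAD does with a target permute.  The 16-byte vector size
   and the byte-wise selection are assumptions for illustration only.  */
#if 0
#include <stdint.h>
#include <stdio.h>

#define VS 16   /* assumed vector size in bytes */

/* msq = *(floor (p)), lsq = *(floor (p + VS - 1)),
   result = realign_load (msq, lsq, token) with token = p & (VS - 1).  */
static void
emulated_realign_load (const uint8_t *p, uint8_t out[VS])
{
  uintptr_t addr = (uintptr_t) p;
  const uint8_t *msq = (const uint8_t *) (addr & -(uintptr_t) VS);
  const uint8_t *lsq = (const uint8_t *) ((addr + VS - 1) & -(uintptr_t) VS);
  unsigned token = addr & (VS - 1);

  for (unsigned i = 0; i < VS; i++)
    out[i] = token + i < VS ? msq[token + i] : lsq[token + i - VS];
}

int
main (void)
{
  uint8_t buf[4 * VS];
  for (unsigned i = 0; i < sizeof buf; i++)
    buf[i] = (uint8_t) i;

  uint8_t v[VS];
  emulated_realign_load (buf + 5, v);     /* misaligned by 5 bytes */
  printf ("%d %d\n", v[0], v[VS - 1]);    /* prints "5 20" */
  return 0;
}
#endif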
4978
4979
0d0293ac 4980/* Function vect_grouped_load_supported.
ebfd146a 4981
e2c83630 4982 Returns TRUE if the permutations needed for a load group of size COUNT are supported,
ebfd146a
IR
4983 and FALSE otherwise. */
4984
4985bool
0d0293ac 4986vect_grouped_load_supported (tree vectype, unsigned HOST_WIDE_INT count)
ebfd146a 4987{
ef4bddc2 4988 machine_mode mode = TYPE_MODE (vectype);
ebfd146a 4989
2c23db6d
ES
4990 /* vect_permute_load_chain requires the group size to be equal to 3 or
4991 be a power of two. */
4992 if (count != 3 && exact_log2 (count) == -1)
b602d918 4993 {
73fbfcad 4994 if (dump_enabled_p ())
78c60e3d 4995 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2c23db6d
ES
4996 "the size of the group of accesses"
4997 " is not a power of 2 or not equal to 3\n");
b602d918
RS
4998 return false;
4999 }
5000
e2c83630
RH
5001 /* Check that the permutation is supported. */
5002 if (VECTOR_MODE_P (mode))
5003 {
2c23db6d 5004 unsigned int i, j, nelt = GET_MODE_NUNITS (mode);
e2c83630 5005 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
ebfd146a 5006
2c23db6d 5007 if (count == 3)
e2c83630 5008 {
2c23db6d
ES
5009 unsigned int k;
5010 for (k = 0; k < 3; k++)
5011 {
5012 for (i = 0; i < nelt; i++)
5013 if (3 * i + k < 2 * nelt)
5014 sel[i] = 3 * i + k;
5015 else
5016 sel[i] = 0;
5017 if (!can_vec_perm_p (mode, false, sel))
5018 {
5019 if (dump_enabled_p ())
5020 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5021 "shuffle of 3 loads is not supported by"
5022 " target\n");
5023 return false;
5024 }
5025 for (i = 0, j = 0; i < nelt; i++)
5026 if (3 * i + k < 2 * nelt)
5027 sel[i] = i;
5028 else
5029 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5030 if (!can_vec_perm_p (mode, false, sel))
5031 {
5032 if (dump_enabled_p ())
5033 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5034 "shuffle of 3 loads is not supported by"
5035 " target\n");
5036 return false;
5037 }
5038 }
5039 return true;
5040 }
5041 else
5042 {
5043 /* If length is not equal to 3 then only power of 2 is supported. */
5044 gcc_assert (exact_log2 (count) != -1);
e2c83630 5045 for (i = 0; i < nelt; i++)
2c23db6d 5046 sel[i] = i * 2;
e2c83630 5047 if (can_vec_perm_p (mode, false, sel))
2c23db6d
ES
5048 {
5049 for (i = 0; i < nelt; i++)
5050 sel[i] = i * 2 + 1;
5051 if (can_vec_perm_p (mode, false, sel))
5052 return true;
5053 }
5054 }
e2c83630 5055 }
ebfd146a 5056
73fbfcad 5057 if (dump_enabled_p ())
78c60e3d 5058 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2c23db6d 5059 "extract even/odd not supported by target\n");
a6b3dfde 5060 return false;
ebfd146a
IR
5061}
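
/* Standalone sketch (for illustration only) that prints the two selector
   vectors whose support is checked above for COUNT == 3; NELT == 8 is an
   assumed number of vector elements.  The loops mirror the ones above.  */
#if 0
#include <stdio.h>

#define NELT 8

int
main (void)
{
  unsigned char sel[NELT];

  for (unsigned k = 0; k < 3; k++)
    {
      /* First selector: gather every third element, starting at K, from
	 the concatenation of the first two input vectors.  */
      for (unsigned i = 0; i < NELT; i++)
	sel[i] = 3 * i + k < 2 * NELT ? 3 * i + k : 0;
      printf ("k=%u mask1:", k);
      for (unsigned i = 0; i < NELT; i++)
	printf (" %d", sel[i]);

      /* Second selector: keep what the first shuffle produced and fill
	 the remaining slots from the third input vector.  */
      unsigned j = 0;
      for (unsigned i = 0; i < NELT; i++)
	sel[i] = 3 * i + k < 2 * NELT ? i : NELT + ((NELT + k) % 3) + 3 * j++;
      printf ("  mask2:");
      for (unsigned i = 0; i < NELT; i++)
	printf (" %d", sel[i]);
      printf ("\n");
    }
  return 0;
}
#endif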
5062
272c6793
RS
5063/* Return TRUE if vec_load_lanes is available for COUNT vectors of
5064 type VECTYPE. */
5065
5066bool
5067vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
5068{
5069 return vect_lanes_optab_supported_p ("vec_load_lanes",
5070 vec_load_lanes_optab,
5071 vectype, count);
5072}
ebfd146a
IR
5073
5074/* Function vect_permute_load_chain.
5075
5076 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
2c23db6d
ES
5077 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5078 the input data correctly. Return the final references for loads in
5079 RESULT_CHAIN.
ebfd146a
IR
5080
5081 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5082 The input is 4 vectors each containing 8 elements. We assign a number to each
 5083 element; the input sequence is:
5084
5085 1st vec: 0 1 2 3 4 5 6 7
5086 2nd vec: 8 9 10 11 12 13 14 15
b8698a0f 5087 3rd vec: 16 17 18 19 20 21 22 23
ebfd146a
IR
5088 4th vec: 24 25 26 27 28 29 30 31
5089
5090 The output sequence should be:
5091
5092 1st vec: 0 4 8 12 16 20 24 28
5093 2nd vec: 1 5 9 13 17 21 25 29
b8698a0f 5094 3rd vec: 2 6 10 14 18 22 26 30
ebfd146a
IR
5095 4th vec: 3 7 11 15 19 23 27 31
5096
5097 i.e., the first output vector should contain the first elements of each
5098 interleaving group, etc.
5099
ff802fa1
IR
5100 We use extract_even/odd instructions to create such output. The input of
5101 each extract_even/odd operation is two vectors
b8698a0f
L
5102 1st vec 2nd vec
5103 0 1 2 3 4 5 6 7
ebfd146a 5104
ff802fa1 5105 and the output is the vector of extracted even/odd elements. The output of
ebfd146a
IR
5106 extract_even will be: 0 2 4 6
5107 and of extract_odd: 1 3 5 7
5108
b8698a0f 5109
ff802fa1
IR
5110 The permutation is done in log LENGTH stages. In each stage extract_even
5111 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5112 their order. In our example,
ebfd146a
IR
5113
5114 E1: extract_even (1st vec, 2nd vec)
5115 E2: extract_odd (1st vec, 2nd vec)
5116 E3: extract_even (3rd vec, 4th vec)
5117 E4: extract_odd (3rd vec, 4th vec)
5118
5119 The output for the first stage will be:
5120
5121 E1: 0 2 4 6 8 10 12 14
5122 E2: 1 3 5 7 9 11 13 15
b8698a0f 5123 E3: 16 18 20 22 24 26 28 30
ebfd146a
IR
5124 E4: 17 19 21 23 25 27 29 31
5125
5126 In order to proceed and create the correct sequence for the next stage (or
b8698a0f
L
5127 for the correct output, if the second stage is the last one, as in our
5128 example), we first put the output of extract_even operation and then the
ebfd146a
IR
5129 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5130 The input for the second stage is:
5131
5132 1st vec (E1): 0 2 4 6 8 10 12 14
b8698a0f
L
5133 2nd vec (E3): 16 18 20 22 24 26 28 30
5134 3rd vec (E2): 1 3 5 7 9 11 13 15
ebfd146a
IR
5135 4th vec (E4): 17 19 21 23 25 27 29 31
5136
5137 The output of the second stage:
5138
5139 E1: 0 4 8 12 16 20 24 28
5140 E2: 2 6 10 14 18 22 26 30
5141 E3: 1 5 9 13 17 21 25 29
5142 E4: 3 7 11 15 19 23 27 31
5143
5144 And RESULT_CHAIN after reordering:
5145
5146 1st vec (E1): 0 4 8 12 16 20 24 28
5147 2nd vec (E3): 1 5 9 13 17 21 25 29
b8698a0f 5148 3rd vec (E2): 2 6 10 14 18 22 26 30
ebfd146a
IR
5149 4th vec (E4): 3 7 11 15 19 23 27 31. */
5150
b602d918 5151static void
9771b263 5152vect_permute_load_chain (vec<tree> dr_chain,
b8698a0f 5153 unsigned int length,
ebfd146a
IR
5154 gimple stmt,
5155 gimple_stmt_iterator *gsi,
9771b263 5156 vec<tree> *result_chain)
ebfd146a 5157{
83d5977e 5158 tree data_ref, first_vect, second_vect;
e2c83630 5159 tree perm_mask_even, perm_mask_odd;
2c23db6d 5160 tree perm3_mask_low, perm3_mask_high;
ebfd146a
IR
5161 gimple perm_stmt;
5162 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
e2c83630
RH
5163 unsigned int i, j, log_length = exact_log2 (length);
5164 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
5165 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
ebfd146a 5166
3f292312
JJ
5167 result_chain->quick_grow (length);
5168 memcpy (result_chain->address (), dr_chain.address (),
5169 length * sizeof (tree));
e2c83630 5170
2c23db6d 5171 if (length == 3)
ebfd146a 5172 {
2c23db6d 5173 unsigned int k;
ebfd146a 5174
2c23db6d
ES
5175 for (k = 0; k < 3; k++)
5176 {
5177 for (i = 0; i < nelt; i++)
5178 if (3 * i + k < 2 * nelt)
5179 sel[i] = 3 * i + k;
5180 else
5181 sel[i] = 0;
557be5a8 5182 perm3_mask_low = vect_gen_perm_mask_checked (vectype, sel);
2c23db6d
ES
5183
5184 for (i = 0, j = 0; i < nelt; i++)
5185 if (3 * i + k < 2 * nelt)
5186 sel[i] = i;
5187 else
5188 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5189
557be5a8 5190 perm3_mask_high = vect_gen_perm_mask_checked (vectype, sel);
2c23db6d
ES
5191
5192 first_vect = dr_chain[0];
5193 second_vect = dr_chain[1];
5194
5195 /* Create interleaving stmt (low part of):
 5196 low = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
5197 ...}> */
f598c55c 5198 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
0d0e4a03
JJ
5199 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5200 second_vect, perm3_mask_low);
ebfd146a 5201 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
b8698a0f 5202
2c23db6d
ES
5203 /* Create interleaving stmt (high part of):
 5204 high = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
5205 ...}> */
5206 first_vect = data_ref;
5207 second_vect = dr_chain[2];
f598c55c 5208 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
0d0e4a03
JJ
5209 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5210 second_vect, perm3_mask_high);
ebfd146a 5211 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
2c23db6d 5212 (*result_chain)[k] = data_ref;
ebfd146a 5213 }
ebfd146a 5214 }
2c23db6d
ES
5215 else
5216 {
5217 /* If length is not equal to 3 then only power of 2 is supported. */
5218 gcc_assert (exact_log2 (length) != -1);
5219
5220 for (i = 0; i < nelt; ++i)
5221 sel[i] = i * 2;
557be5a8 5222 perm_mask_even = vect_gen_perm_mask_checked (vectype, sel);
2c23db6d
ES
5223
5224 for (i = 0; i < nelt; ++i)
5225 sel[i] = i * 2 + 1;
557be5a8 5226 perm_mask_odd = vect_gen_perm_mask_checked (vectype, sel);
ebfd146a 5227
2c23db6d
ES
5228 for (i = 0; i < log_length; i++)
5229 {
5230 for (j = 0; j < length; j += 2)
5231 {
5232 first_vect = dr_chain[j];
5233 second_vect = dr_chain[j+1];
5234
5235 /* data_ref = permute_even (first_data_ref, second_data_ref); */
5236 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
0d0e4a03
JJ
5237 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5238 first_vect, second_vect,
5239 perm_mask_even);
2c23db6d
ES
5240 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5241 (*result_chain)[j/2] = data_ref;
5242
5243 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
5244 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
0d0e4a03
JJ
5245 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5246 first_vect, second_vect,
5247 perm_mask_odd);
2c23db6d
ES
5248 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5249 (*result_chain)[j/2+length/2] = data_ref;
5250 }
5251 memcpy (dr_chain.address (), result_chain->address (),
5252 length * sizeof (tree));
5253 }
5254 }
5255}
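
/* Standalone sketch (not part of the vectorizer) of the power-of-two case
   above: log2 (LENGTH) stages of extract_even/extract_odd on plain arrays,
   reproducing the worked example from the function comment (LENGTH == 4,
   NELT == 8, elements 0 .. 31).  */
#if 0
#include <stdio.h>

#define NELT 8
#define LEN  4   /* chain length, a power of two; log2 (LEN) == 2 */

int
main (void)
{
  int chain[LEN][NELT], result[LEN][NELT];

  for (int v = 0; v < LEN; v++)
    for (int i = 0; i < NELT; i++)
      chain[v][i] = v * NELT + i;

  for (int stage = 0; stage < 2; stage++)
    {
      for (int j = 0; j < LEN; j += 2)
	for (int i = 0; i < NELT; i++)
	  {
	    int even = 2 * i, odd = 2 * i + 1;
	    /* Index into the concatenation of chain[j] and chain[j + 1];
	       evens go to slot j/2, odds to slot j/2 + LEN/2, exactly as
	       RESULT_CHAIN is filled above.  */
	    result[j / 2][i]
	      = even < NELT ? chain[j][even] : chain[j + 1][even - NELT];
	    result[j / 2 + LEN / 2][i]
	      = odd < NELT ? chain[j][odd] : chain[j + 1][odd - NELT];
	  }
      for (int v = 0; v < LEN; v++)
	for (int i = 0; i < NELT; i++)
	  chain[v][i] = result[v][i];
    }

  /* Prints 0 4 8 ... 28 / 1 5 ... 29 / 2 6 ... 30 / 3 7 ... 31.  */
  for (int v = 0; v < LEN; v++)
    {
      for (int i = 0; i < NELT; i++)
	printf ("%d ", chain[v][i]);
      printf ("\n");
    }
  return 0;
}
#endif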
ebfd146a 5256
f7917029
ES
5257/* Function vect_shift_permute_load_chain.
5258
5259 Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
5260 sequence of stmts to reorder the input data accordingly.
5261 Return the final references for loads in RESULT_CHAIN.
 5262 Return true if successful, false otherwise.
5263
5264 E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
5265 The input is 3 vectors each containing 8 elements. We assign a
5266 number to each element, the input sequence is:
5267
5268 1st vec: 0 1 2 3 4 5 6 7
5269 2nd vec: 8 9 10 11 12 13 14 15
5270 3rd vec: 16 17 18 19 20 21 22 23
5271
5272 The output sequence should be:
5273
5274 1st vec: 0 3 6 9 12 15 18 21
5275 2nd vec: 1 4 7 10 13 16 19 22
5276 3rd vec: 2 5 8 11 14 17 20 23
5277
5278 We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
5279
5280 First we shuffle all 3 vectors to get correct elements order:
5281
5282 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
5283 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
5284 3rd vec: (16 19 22) (17 20 23) (18 21)
5285
 5286 Next we concatenate and shift the vectors 3 times:
5287
5288 1st step:
5289 shift right by 6 the concatenation of:
5290 "1st vec" and "2nd vec"
5291 ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
5292 "2nd vec" and "3rd vec"
5293 ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
5294 "3rd vec" and "1st vec"
5295 (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
5296 | New vectors |
5297
5298 So that now new vectors are:
5299
5300 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
5301 2nd vec: (10 13) (16 19 22) (17 20 23)
5302 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
5303
5304 2nd step:
5305 shift right by 5 the concatenation of:
5306 "1st vec" and "3rd vec"
5307 ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
5308 "2nd vec" and "1st vec"
5309 (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
5310 "3rd vec" and "2nd vec"
5311 (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
5312 | New vectors |
5313
5314 So that now new vectors are:
5315
5316 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
5317 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
5318 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
5319
5320 3rd step:
5321 shift right by 5 the concatenation of:
5322 "1st vec" and "1st vec"
5323 ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
5324 shift right by 3 the concatenation of:
5325 "2nd vec" and "2nd vec"
5326 (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
5327 | New vectors |
5328
5329 So that now all vectors are READY:
5330 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
5331 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
5332 3rd vec: ( 1 4 7) (10 13) (16 19 22)
5333
 5334 This algorithm is faster than the one in vect_permute_load_chain if:
 5335 1. "shift of a concatenation" is faster than a general permutation.
 5336 This is usually so.
 5337 2. The TARGET machine can't execute vector instructions in parallel.
 5338 This is because each step of the algorithm depends on the previous one.
 5339 The algorithm in vect_permute_load_chain is much more parallel.
5340
5341 The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
5342*/
5343
5344static bool
5345vect_shift_permute_load_chain (vec<tree> dr_chain,
5346 unsigned int length,
5347 gimple stmt,
5348 gimple_stmt_iterator *gsi,
5349 vec<tree> *result_chain)
5350{
5351 tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
5352 tree perm2_mask1, perm2_mask2, perm3_mask;
5353 tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
5354 gimple perm_stmt;
5355
5356 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5357 unsigned int i;
5358 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
5359 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5360 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5361 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5362
5363 result_chain->quick_grow (length);
5364 memcpy (result_chain->address (), dr_chain.address (),
5365 length * sizeof (tree));
5366
af4c011e 5367 if (exact_log2 (length) != -1 && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 4)
f7917029 5368 {
af4c011e 5369 unsigned int j, log_length = exact_log2 (length);
f7917029
ES
5370 for (i = 0; i < nelt / 2; ++i)
5371 sel[i] = i * 2;
5372 for (i = 0; i < nelt / 2; ++i)
5373 sel[nelt / 2 + i] = i * 2 + 1;
5374 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5375 {
5376 if (dump_enabled_p ())
5377 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5378 "shuffle of 2 fields structure is not \
5379 supported by target\n");
5380 return false;
5381 }
557be5a8 5382 perm2_mask1 = vect_gen_perm_mask_checked (vectype, sel);
f7917029
ES
5383
5384 for (i = 0; i < nelt / 2; ++i)
5385 sel[i] = i * 2 + 1;
5386 for (i = 0; i < nelt / 2; ++i)
5387 sel[nelt / 2 + i] = i * 2;
5388 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5389 {
5390 if (dump_enabled_p ())
5391 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5392 "shuffle of 2 fields structure is not \
5393 supported by target\n");
5394 return false;
5395 }
557be5a8 5396 perm2_mask2 = vect_gen_perm_mask_checked (vectype, sel);
f7917029
ES
5397
5398 /* Generating permutation constant to shift all elements.
5399 For vector length 8 it is {4 5 6 7 8 9 10 11}. */
5400 for (i = 0; i < nelt; i++)
5401 sel[i] = nelt / 2 + i;
5402 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5403 {
5404 if (dump_enabled_p ())
5405 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5406 "shift permutation is not supported by target\n");
5407 return false;
5408 }
557be5a8 5409 shift1_mask = vect_gen_perm_mask_checked (vectype, sel);
f7917029
ES
5410
5411 /* Generating permutation constant to select vector from 2.
5412 For vector length 8 it is {0 1 2 3 12 13 14 15}. */
5413 for (i = 0; i < nelt / 2; i++)
5414 sel[i] = i;
5415 for (i = nelt / 2; i < nelt; i++)
5416 sel[i] = nelt + i;
5417 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5418 {
5419 if (dump_enabled_p ())
5420 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5421 "select is not supported by target\n");
5422 return false;
5423 }
557be5a8 5424 select_mask = vect_gen_perm_mask_checked (vectype, sel);
f7917029 5425
af4c011e
ES
5426 for (i = 0; i < log_length; i++)
5427 {
5428 for (j = 0; j < length; j += 2)
5429 {
5430 first_vect = dr_chain[j];
5431 second_vect = dr_chain[j + 1];
5432
5433 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
0d0e4a03
JJ
5434 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5435 first_vect, first_vect,
5436 perm2_mask1);
af4c011e
ES
5437 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5438 vect[0] = data_ref;
5439
5440 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
0d0e4a03
JJ
5441 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5442 second_vect, second_vect,
5443 perm2_mask2);
af4c011e
ES
5444 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5445 vect[1] = data_ref;
f7917029 5446
af4c011e 5447 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
0d0e4a03
JJ
5448 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5449 vect[0], vect[1], shift1_mask);
af4c011e
ES
5450 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5451 (*result_chain)[j/2 + length/2] = data_ref;
5452
5453 data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
0d0e4a03
JJ
5454 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5455 vect[0], vect[1], select_mask);
af4c011e
ES
5456 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5457 (*result_chain)[j/2] = data_ref;
5458 }
5459 memcpy (dr_chain.address (), result_chain->address (),
5460 length * sizeof (tree));
5461 }
f7917029
ES
5462 return true;
5463 }
5464 if (length == 3 && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 2)
5465 {
5466 unsigned int k = 0, l = 0;
5467
 5468 /* Generating permutation constant to get all elements in the right order.
5469 For vector length 8 it is {0 3 6 1 4 7 2 5}. */
5470 for (i = 0; i < nelt; i++)
5471 {
5472 if (3 * k + (l % 3) >= nelt)
5473 {
5474 k = 0;
5475 l += (3 - (nelt % 3));
5476 }
5477 sel[i] = 3 * k + (l % 3);
5478 k++;
5479 }
5480 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5481 {
5482 if (dump_enabled_p ())
5483 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5484 "shuffle of 3 fields structure is not \
5485 supported by target\n");
5486 return false;
5487 }
557be5a8 5488 perm3_mask = vect_gen_perm_mask_checked (vectype, sel);
f7917029
ES
5489
5490 /* Generating permutation constant to shift all elements.
5491 For vector length 8 it is {6 7 8 9 10 11 12 13}. */
5492 for (i = 0; i < nelt; i++)
5493 sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
5494 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5495 {
5496 if (dump_enabled_p ())
5497 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5498 "shift permutation is not supported by target\n");
5499 return false;
5500 }
557be5a8 5501 shift1_mask = vect_gen_perm_mask_checked (vectype, sel);
f7917029
ES
5502
5503 /* Generating permutation constant to shift all elements.
5504 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
5505 for (i = 0; i < nelt; i++)
5506 sel[i] = 2 * (nelt / 3) + 1 + i;
5507 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5508 {
5509 if (dump_enabled_p ())
5510 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5511 "shift permutation is not supported by target\n");
5512 return false;
5513 }
557be5a8 5514 shift2_mask = vect_gen_perm_mask_checked (vectype, sel);
f7917029
ES
5515
5516 /* Generating permutation constant to shift all elements.
5517 For vector length 8 it is {3 4 5 6 7 8 9 10}. */
5518 for (i = 0; i < nelt; i++)
5519 sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
5520 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5521 {
5522 if (dump_enabled_p ())
5523 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5524 "shift permutation is not supported by target\n");
5525 return false;
5526 }
557be5a8 5527 shift3_mask = vect_gen_perm_mask_checked (vectype, sel);
f7917029
ES
5528
5529 /* Generating permutation constant to shift all elements.
5530 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
5531 for (i = 0; i < nelt; i++)
5532 sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
5533 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5534 {
5535 if (dump_enabled_p ())
5536 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5537 "shift permutation is not supported by target\n");
5538 return false;
5539 }
557be5a8 5540 shift4_mask = vect_gen_perm_mask_checked (vectype, sel);
f7917029
ES
5541
5542 for (k = 0; k < 3; k++)
5543 {
f598c55c 5544 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
0d0e4a03
JJ
5545 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5546 dr_chain[k], dr_chain[k],
5547 perm3_mask);
f7917029
ES
5548 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5549 vect[k] = data_ref;
5550 }
5551
5552 for (k = 0; k < 3; k++)
5553 {
5554 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
0d0e4a03
JJ
5555 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5556 vect[k % 3], vect[(k + 1) % 3],
5557 shift1_mask);
f7917029
ES
5558 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5559 vect_shift[k] = data_ref;
5560 }
5561
5562 for (k = 0; k < 3; k++)
5563 {
5564 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
0d0e4a03
JJ
5565 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5566 vect_shift[(4 - k) % 3],
5567 vect_shift[(3 - k) % 3],
5568 shift2_mask);
f7917029
ES
5569 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5570 vect[k] = data_ref;
5571 }
5572
5573 (*result_chain)[3 - (nelt % 3)] = vect[2];
5574
5575 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
0d0e4a03
JJ
5576 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
5577 vect[0], shift3_mask);
f7917029
ES
5578 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5579 (*result_chain)[nelt % 3] = data_ref;
5580
5581 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
0d0e4a03
JJ
5582 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
5583 vect[1], shift4_mask);
f7917029
ES
5584 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5585 (*result_chain)[0] = data_ref;
5586 return true;
5587 }
5588 return false;
5589}
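
/* Standalone sketch (not part of the vectorizer) of the basic primitive
   used above: a "shift of a concatenation", i.e. a VEC_PERM_EXPR whose
   selector is the consecutive run {K, K + 1, ..., K + NELT - 1}.  NELT == 8
   and K == NELT / 2 are assumptions matching the shift1_mask comment for
   the 2-vector case.  */
#if 0
#include <stdio.h>

#define NELT 8

/* out[i] = concat (a, b)[sel[i]], the element-level meaning of
   VEC_PERM_EXPR <a, b, sel>.  */
static void
apply_perm (const int *a, const int *b, const unsigned char *sel, int *out)
{
  for (int i = 0; i < NELT; i++)
    out[i] = sel[i] < NELT ? a[sel[i]] : b[sel[i] - NELT];
}

int
main (void)
{
  int a[NELT] = {0, 1, 2, 3, 4, 5, 6, 7};
  int b[NELT] = {8, 9, 10, 11, 12, 13, 14, 15};
  unsigned char shift1_mask[NELT];
  int out[NELT];

  /* {4 5 6 7 8 9 10 11}: an NELT-wide window of <a, b> starting at
     offset NELT / 2, i.e. the concatenation shifted.  */
  for (int i = 0; i < NELT; i++)
    shift1_mask[i] = NELT / 2 + i;

  apply_perm (a, b, shift1_mask, out);
  for (int i = 0; i < NELT; i++)
    printf ("%d ", out[i]);                /* prints 4 5 6 7 8 9 10 11 */
  printf ("\n");
  return 0;
}
#endif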
5590
0d0293ac 5591/* Function vect_transform_grouped_load.
ebfd146a
IR
5592
5593 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
5594 to perform their permutation and ascribe the result vectorized statements to
5595 the scalar statements.
5596*/
5597
b602d918 5598void
9771b263 5599vect_transform_grouped_load (gimple stmt, vec<tree> dr_chain, int size,
ebfd146a
IR
5600 gimple_stmt_iterator *gsi)
5601{
ef4bddc2 5602 machine_mode mode;
6e1aa848 5603 vec<tree> result_chain = vNULL;
ebfd146a 5604
b8698a0f
L
5605 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
5606 RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
ebfd146a 5607 vectors, that are ready for vector computation. */
9771b263 5608 result_chain.create (size);
f7917029
ES
5609
5610 /* If reassociation width for vector type is 2 or greater target machine can
5611 execute 2 or more vector instructions in parallel. Otherwise try to
5612 get chain for loads group using vect_shift_permute_load_chain. */
5613 mode = TYPE_MODE (STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)));
5614 if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
f101d09c 5615 || exact_log2 (size) != -1
f7917029
ES
5616 || !vect_shift_permute_load_chain (dr_chain, size, stmt,
5617 gsi, &result_chain))
5618 vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain);
0d0293ac 5619 vect_record_grouped_load_vectors (stmt, result_chain);
9771b263 5620 result_chain.release ();
272c6793
RS
5621}
5622
0d0293ac 5623/* RESULT_CHAIN contains the output of a group of grouped loads that were
272c6793
RS
5624 generated as part of the vectorization of STMT. Assign the statement
5625 for each vector to the associated scalar statement. */
5626
5627void
9771b263 5628vect_record_grouped_load_vectors (gimple stmt, vec<tree> result_chain)
272c6793 5629{
e14c1050 5630 gimple first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
272c6793
RS
5631 gimple next_stmt, new_stmt;
5632 unsigned int i, gap_count;
5633 tree tmp_data_ref;
ebfd146a 5634
b8698a0f
L
5635 /* Put a permuted data-ref in the VECTORIZED_STMT field.
 5636 Since we scan the chain starting from its first node, their order
ebfd146a
IR
 5637 corresponds to the order of data-refs in RESULT_CHAIN. */
5638 next_stmt = first_stmt;
5639 gap_count = 1;
9771b263 5640 FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
ebfd146a
IR
5641 {
5642 if (!next_stmt)
5643 break;
5644
ff802fa1
IR
 5645 /* Skip the gaps. Loads created for the gaps will be removed by the dead
 5646 code elimination pass later. No need to check for the first stmt in
ebfd146a 5647 the group, since it always exists.
e14c1050
IR
5648 GROUP_GAP is the number of steps in elements from the previous
5649 access (if there is no gap GROUP_GAP is 1). We skip loads that
ff802fa1 5650 correspond to the gaps. */
b8698a0f 5651 if (next_stmt != first_stmt
e14c1050 5652 && gap_count < GROUP_GAP (vinfo_for_stmt (next_stmt)))
ebfd146a
IR
5653 {
5654 gap_count++;
5655 continue;
5656 }
5657
5658 while (next_stmt)
5659 {
5660 new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
5661 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
5662 copies, and we put the new vector statement in the first available
5663 RELATED_STMT. */
5664 if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
5665 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
5666 else
5667 {
e14c1050 5668 if (!GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
ebfd146a
IR
5669 {
5670 gimple prev_stmt =
5671 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
5672 gimple rel_stmt =
5673 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt));
5674 while (rel_stmt)
5675 {
5676 prev_stmt = rel_stmt;
b8698a0f 5677 rel_stmt =
ebfd146a
IR
5678 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
5679 }
5680
b8698a0f 5681 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) =
ebfd146a
IR
5682 new_stmt;
5683 }
5684 }
5685
e14c1050 5686 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
ebfd146a
IR
5687 gap_count = 1;
5688 /* If NEXT_STMT accesses the same DR as the previous statement,
5689 put the same TMP_DATA_REF as its vectorized statement; otherwise
5690 get the next data-ref from RESULT_CHAIN. */
e14c1050 5691 if (!next_stmt || !GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
ebfd146a
IR
5692 break;
5693 }
5694 }
ebfd146a
IR
5695}
5696
5697/* Function vect_force_dr_alignment_p.
5698
5699 Returns whether the alignment of a DECL can be forced to be aligned
5700 on ALIGNMENT bit boundary. */
5701
b8698a0f 5702bool
ebfd146a
IR
5703vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
5704{
5705 if (TREE_CODE (decl) != VAR_DECL)
5706 return false;
5707
6192fa79
JH
5708 /* With -fno-toplevel-reorder we may have already output the constant. */
5709 if (TREE_ASM_WRITTEN (decl))
5710 return false;
5711
5712 /* Constant pool entries may be shared and not properly merged by LTO. */
5713 if (DECL_IN_CONSTANT_POOL (decl))
5714 return false;
ebfd146a 5715
6ad386b7
JH
5716 if (TREE_PUBLIC (decl) || DECL_EXTERNAL (decl))
5717 {
5718 symtab_node *snode;
5719
5720 /* We cannot change alignment of symbols that may bind to symbols
5721 in other translation unit that may contain a definition with lower
5722 alignment. */
5723 if (!decl_binds_to_current_def_p (decl))
5724 return false;
5725
5726 /* When compiling partition, be sure the symbol is not output by other
5727 partition. */
d52f5295 5728 snode = symtab_node::get (decl);
6ad386b7
JH
5729 if (flag_ltrans
5730 && (snode->in_other_partition
d52f5295 5731 || snode->get_partitioning_class () == SYMBOL_DUPLICATE))
6ad386b7
JH
5732 return false;
5733 }
ebfd146a 5734
f89dcfd8
RG
5735 /* Do not override the alignment as specified by the ABI when the used
5736 attribute is set. */
5737 if (DECL_PRESERVE_P (decl))
af4d0d91
RG
5738 return false;
5739
79e02217
JJ
5740 /* Do not override explicit alignment set by the user when an explicit
5741 section name is also used. This is a common idiom used by many
5742 software projects. */
24d047a3 5743 if (TREE_STATIC (decl)
f961457f 5744 && DECL_SECTION_NAME (decl) != NULL
d52f5295 5745 && !symtab_node::get (decl)->implicit_section)
79e02217
JJ
5746 return false;
5747
6ad386b7
JH
5748 /* If symbol is an alias, we need to check that target is OK. */
5749 if (TREE_STATIC (decl))
5750 {
d52f5295 5751 tree target = symtab_node::get (decl)->ultimate_alias_target ()->decl;
6ad386b7
JH
5752 if (target != decl)
5753 {
5754 if (DECL_PRESERVE_P (target))
5755 return false;
5756 decl = target;
5757 }
5758 }
5759
ebfd146a
IR
5760 if (TREE_STATIC (decl))
5761 return (alignment <= MAX_OFILE_ALIGNMENT);
5762 else
5763 return (alignment <= MAX_STACK_ALIGNMENT);
5764}
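
/* For illustration only: what the alignment forcing decided above amounts
   to at the source level.  A static local definition can have its
   alignment raised (shown here with an explicit aligned attribute, the
   same effect the vectorizer gets by increasing DECL_ALIGN), whereas an
   extern symbol that may bind to a definition in another translation unit
   must be left alone.  */
#if 0
static float local_data[256] __attribute__ ((aligned (32))); /* may be forced */
extern float shared_data[256];        /* must not be forced: binds elsewhere */

float
example_sum (void)
{
  float s = 0.0f;
  for (int i = 0; i < 256; i++)
    s += local_data[i] + shared_data[i];
  return s;
}
#endif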
5765
ebfd146a 5766
720f5239
IR
5767/* Return whether the data reference DR is supported with respect to its
5768 alignment.
5769 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
5770 it is aligned, i.e., check if it is possible to vectorize it with different
ebfd146a
IR
5771 alignment. */
5772
5773enum dr_alignment_support
720f5239
IR
5774vect_supportable_dr_alignment (struct data_reference *dr,
5775 bool check_aligned_accesses)
ebfd146a
IR
5776{
5777 gimple stmt = DR_STMT (dr);
5778 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5779 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
ef4bddc2 5780 machine_mode mode = TYPE_MODE (vectype);
a70d6342
IR
5781 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5782 struct loop *vect_loop = NULL;
5783 bool nested_in_vect_loop = false;
ebfd146a 5784
720f5239 5785 if (aligned_access_p (dr) && !check_aligned_accesses)
ebfd146a
IR
5786 return dr_aligned;
5787
5ce9450f
JJ
5788 /* For now assume all conditional loads/stores support unaligned
5789 access without any special code. */
5790 if (is_gimple_call (stmt)
5791 && gimple_call_internal_p (stmt)
5792 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
5793 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
5794 return dr_unaligned_supported;
5795
69f11a13
IR
5796 if (loop_vinfo)
5797 {
5798 vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
5799 nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
5800 }
a70d6342 5801
ebfd146a
IR
5802 /* Possibly unaligned access. */
5803
5804 /* We can choose between using the implicit realignment scheme (generating
5805 a misaligned_move stmt) and the explicit realignment scheme (generating
ff802fa1
IR
 5806 aligned loads with a REALIGN_LOAD). There are two variants of the
 5807 explicit realignment scheme: optimized and unoptimized.
ebfd146a
IR
5808 We can optimize the realignment only if the step between consecutive
5809 vector loads is equal to the vector size. Since the vector memory
5810 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
5811 is guaranteed that the misalignment amount remains the same throughout the
5812 execution of the vectorized loop. Therefore, we can create the
5813 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
5814 at the loop preheader.
5815
5816 However, in the case of outer-loop vectorization, when vectorizing a
5817 memory access in the inner-loop nested within the LOOP that is now being
5818 vectorized, while it is guaranteed that the misalignment of the
5819 vectorized memory access will remain the same in different outer-loop
 5820 iterations, it is *not* guaranteed that it will remain the same throughout
5821 the execution of the inner-loop. This is because the inner-loop advances
5822 with the original scalar step (and not in steps of VS). If the inner-loop
5823 step happens to be a multiple of VS, then the misalignment remains fixed
5824 and we can use the optimized realignment scheme. For example:
5825
5826 for (i=0; i<N; i++)
5827 for (j=0; j<M; j++)
5828 s += a[i+j];
5829
5830 When vectorizing the i-loop in the above example, the step between
5831 consecutive vector loads is 1, and so the misalignment does not remain
5832 fixed across the execution of the inner-loop, and the realignment cannot
5833 be optimized (as illustrated in the following pseudo vectorized loop):
5834
5835 for (i=0; i<N; i+=4)
5836 for (j=0; j<M; j++){
5837 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
5838 // when j is {0,1,2,3,4,5,6,7,...} respectively.
5839 // (assuming that we start from an aligned address).
5840 }
5841
5842 We therefore have to use the unoptimized realignment scheme:
5843
5844 for (i=0; i<N; i+=4)
5845 for (j=k; j<M; j+=4)
5846 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
5847 // that the misalignment of the initial address is
5848 // 0).
5849
5850 The loop can then be vectorized as follows:
5851
5852 for (k=0; k<4; k++){
5853 rt = get_realignment_token (&vp[k]);
5854 for (i=0; i<N; i+=4){
5855 v1 = vp[i+k];
5856 for (j=k; j<M; j+=4){
5857 v2 = vp[i+j+VS-1];
5858 va = REALIGN_LOAD <v1,v2,rt>;
5859 vs += va;
5860 v1 = v2;
5861 }
5862 }
5863 } */
5864
5865 if (DR_IS_READ (dr))
5866 {
0601d0cf
RE
5867 bool is_packed = false;
5868 tree type = (TREE_TYPE (DR_REF (dr)));
5869
947131ba 5870 if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
ebfd146a
IR
5871 && (!targetm.vectorize.builtin_mask_for_load
5872 || targetm.vectorize.builtin_mask_for_load ()))
5873 {
5874 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
69f11a13
IR
5875 if ((nested_in_vect_loop
5876 && (TREE_INT_CST_LOW (DR_STEP (dr))
5877 != GET_MODE_SIZE (TYPE_MODE (vectype))))
5878 || !loop_vinfo)
ebfd146a
IR
5879 return dr_explicit_realign;
5880 else
5881 return dr_explicit_realign_optimized;
5882 }
0601d0cf 5883 if (!known_alignment_for_access_p (dr))
4c9bcf89 5884 is_packed = not_size_aligned (DR_REF (dr));
b8698a0f 5885
afb119be
RB
5886 if ((TYPE_USER_ALIGN (type) && !is_packed)
5887 || targetm.vectorize.
5888 support_vector_misalignment (mode, type,
5889 DR_MISALIGNMENT (dr), is_packed))
ebfd146a
IR
5890 /* Can't software pipeline the loads, but can at least do them. */
5891 return dr_unaligned_supported;
5892 }
0601d0cf
RE
5893 else
5894 {
5895 bool is_packed = false;
5896 tree type = (TREE_TYPE (DR_REF (dr)));
ebfd146a 5897
0601d0cf 5898 if (!known_alignment_for_access_p (dr))
4c9bcf89 5899 is_packed = not_size_aligned (DR_REF (dr));
b8698a0f 5900
afb119be
RB
5901 if ((TYPE_USER_ALIGN (type) && !is_packed)
5902 || targetm.vectorize.
5903 support_vector_misalignment (mode, type,
5904 DR_MISALIGNMENT (dr), is_packed))
0601d0cf
RE
5905 return dr_unaligned_supported;
5906 }
b8698a0f 5907
ebfd146a
IR
5908 /* Unsupported. */
5909 return dr_unaligned_unsupported;
5910}