1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2 Copyright (C) 2003-2019 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "predict.h"
31 #include "memmodel.h"
32 #include "tm_p.h"
33 #include "ssa.h"
34 #include "optabs-tree.h"
35 #include "cgraph.h"
36 #include "dumpfile.h"
37 #include "alias.h"
38 #include "fold-const.h"
39 #include "stor-layout.h"
40 #include "tree-eh.h"
41 #include "gimplify.h"
42 #include "gimple-iterator.h"
43 #include "gimplify-me.h"
44 #include "tree-ssa-loop-ivopts.h"
45 #include "tree-ssa-loop-manip.h"
46 #include "tree-ssa-loop.h"
47 #include "cfgloop.h"
48 #include "tree-scalar-evolution.h"
49 #include "tree-vectorizer.h"
50 #include "expr.h"
51 #include "builtins.h"
52 #include "params.h"
53 #include "tree-cfg.h"
54 #include "tree-hash-traits.h"
55 #include "vec-perm-indices.h"
56 #include "internal-fn.h"
57
58 /* Return true if load- or store-lanes optab OPTAB is implemented for
59 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
60
61 static bool
62 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
63 tree vectype, unsigned HOST_WIDE_INT count)
64 {
65 machine_mode mode, array_mode;
66 bool limit_p;
67
68 mode = TYPE_MODE (vectype);
69 if (!targetm.array_mode (mode, count).exists (&array_mode))
70 {
71 poly_uint64 bits = count * GET_MODE_BITSIZE (mode);
72 limit_p = !targetm.array_mode_supported_p (mode, count);
73 if (!int_mode_for_size (bits, limit_p).exists (&array_mode))
74 {
75 if (dump_enabled_p ())
76 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
77 "no array mode for %s[%wu]\n",
78 GET_MODE_NAME (mode), count);
79 return false;
80 }
81 }
82
83 if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
84 {
85 if (dump_enabled_p ())
86 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
87 "cannot use %s<%s><%s>\n", name,
88 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
89 return false;
90 }
91
92 if (dump_enabled_p ())
93 dump_printf_loc (MSG_NOTE, vect_location,
94 "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
95 GET_MODE_NAME (mode));
96
97 return true;
98 }
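/* As an illustrative sketch (a hypothetical loop, not taken from the
   code above): a group of three interleaved loads such as

     for (i = 0; i < n; i++)
       {
         r += rgb[3 * i];
         g += rgb[3 * i + 1];
         b += rgb[3 * i + 2];
       }

   is the kind of access for which the vectorizer queries a load-lanes
   optab with COUNT == 3; whether the group can use it depends on the
   target checks performed above.  */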
99
100
101 /* Return the smallest scalar part of STMT_INFO.
102 This is used to determine the vectype of the stmt. We generally set the
103 vectype according to the type of the result (lhs). For stmts whose
104 result-type is different from the type of the arguments (e.g., demotion,
105 promotion), vectype will be reset appropriately (later). Note that we have
106 to visit the smallest datatype in this function, because that determines the
106 VF. If the smallest datatype in the loop is present only as the rhs of a
107 promotion operation, we would otherwise miss it.
109 Such a case, where a variable of this datatype does not appear in the lhs
110 anywhere in the loop, can only occur if it's an invariant: e.g.:
111 'int_x = (int) short_inv', which we'd expect to have been optimized away by
112 invariant motion. However, we cannot rely on invariant motion to always
113 take invariants out of the loop, and so in the case of promotion we also
114 have to check the rhs.
115 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
116 types. */
117
118 tree
119 vect_get_smallest_scalar_type (stmt_vec_info stmt_info,
120 HOST_WIDE_INT *lhs_size_unit,
121 HOST_WIDE_INT *rhs_size_unit)
122 {
123 tree scalar_type = gimple_expr_type (stmt_info->stmt);
124 HOST_WIDE_INT lhs, rhs;
125
126 /* During the analysis phase, this function is called on arbitrary
127 statements that might not have scalar results. */
128 if (!tree_fits_uhwi_p (TYPE_SIZE_UNIT (scalar_type)))
129 return scalar_type;
130
131 lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
132
133 gassign *assign = dyn_cast <gassign *> (stmt_info->stmt);
134 if (assign
135 && (gimple_assign_cast_p (assign)
136 || gimple_assign_rhs_code (assign) == DOT_PROD_EXPR
137 || gimple_assign_rhs_code (assign) == WIDEN_SUM_EXPR
138 || gimple_assign_rhs_code (assign) == WIDEN_MULT_EXPR
139 || gimple_assign_rhs_code (assign) == WIDEN_LSHIFT_EXPR
140 || gimple_assign_rhs_code (assign) == FLOAT_EXPR))
141 {
142 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (assign));
143
144 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
145 if (rhs < lhs)
146 scalar_type = rhs_type;
147 }
148 else if (gcall *call = dyn_cast <gcall *> (stmt_info->stmt))
149 {
150 unsigned int i = 0;
151 if (gimple_call_internal_p (call))
152 {
153 internal_fn ifn = gimple_call_internal_fn (call);
154 if (internal_load_fn_p (ifn) || internal_store_fn_p (ifn))
155 /* gimple_expr_type already picked the type of the loaded
156 or stored data. */
157 i = ~0U;
158 else if (internal_fn_mask_index (ifn) == 0)
159 i = 1;
160 }
161 if (i < gimple_call_num_args (call))
162 {
163 tree rhs_type = TREE_TYPE (gimple_call_arg (call, i));
164 if (tree_fits_uhwi_p (TYPE_SIZE_UNIT (rhs_type)))
165 {
166 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
167 if (rhs < lhs)
168 scalar_type = rhs_type;
169 }
170 }
171 }
172
173 *lhs_size_unit = lhs;
174 *rhs_size_unit = rhs;
175 return scalar_type;
176 }
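/* A minimal example of the promotion case described above, assuming IN
   has type short int * and OUT has type int *:

     for (i = 0; i < n; i++)
       out[i] = (int) in[i];

   Here the smallest scalar type is the 2-byte short on the rhs, so the
   function returns that type with *LHS_SIZE_UNIT == 4 and
   *RHS_SIZE_UNIT == 2, and it is the short type that determines the
   vectorization factor.  */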
177
178
179 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
180 tested at run-time. Return TRUE if DDR was successfully inserted.
181 Return false if versioning is not supported. */
182
183 static opt_result
184 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
185 {
186 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
187
188 if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
189 return opt_result::failure_at (vect_location,
190 "will not create alias checks, as"
191 " --param vect-max-version-for-alias-checks"
192 " == 0\n");
193
194 opt_result res
195 = runtime_alias_check_p (ddr, loop,
196 optimize_loop_nest_for_speed_p (loop));
197 if (!res)
198 return res;
199
200 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
201 return opt_result::success ();
202 }
203
204 /* Record that loop LOOP_VINFO needs to check that VALUE is nonzero. */
205
206 static void
207 vect_check_nonzero_value (loop_vec_info loop_vinfo, tree value)
208 {
209 vec<tree> checks = LOOP_VINFO_CHECK_NONZERO (loop_vinfo);
210 for (unsigned int i = 0; i < checks.length(); ++i)
211 if (checks[i] == value)
212 return;
213
214 if (dump_enabled_p ())
215 dump_printf_loc (MSG_NOTE, vect_location,
216 "need run-time check that %T is nonzero\n",
217 value);
218 LOOP_VINFO_CHECK_NONZERO (loop_vinfo).safe_push (value);
219 }
220
221 /* Return true if we know that the order of vectorized DR_INFO_A and
222 vectorized DR_INFO_B will be the same as the order of DR_INFO_A and
223 DR_INFO_B. At least one of the accesses is a write. */
224
225 static bool
226 vect_preserves_scalar_order_p (dr_vec_info *dr_info_a, dr_vec_info *dr_info_b)
227 {
228 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
229 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
230
231 /* Single statements are always kept in their original order. */
232 if (!STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
233 && !STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
234 return true;
235
236 /* STMT_A and STMT_B belong to overlapping groups. All loads in a
237 SLP group are emitted at the position of the last scalar load and
238 all loads in an interleaving group are emitted at the position
239 of the first scalar load.
240 Stores in a group are emitted at the position of the last scalar store.
241 Compute that position and check whether the resulting order matches
242 the current one.
243 We have not yet decided between SLP and interleaving so we have
244 to conservatively assume both. */
245 stmt_vec_info il_a;
246 stmt_vec_info last_a = il_a = DR_GROUP_FIRST_ELEMENT (stmtinfo_a);
247 if (last_a)
248 {
249 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (last_a); s;
250 s = DR_GROUP_NEXT_ELEMENT (s))
251 last_a = get_later_stmt (last_a, s);
252 if (!DR_IS_WRITE (STMT_VINFO_DATA_REF (stmtinfo_a)))
253 {
254 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_a); s;
255 s = DR_GROUP_NEXT_ELEMENT (s))
256 if (get_later_stmt (il_a, s) == il_a)
257 il_a = s;
258 }
259 else
260 il_a = last_a;
261 }
262 else
263 last_a = il_a = stmtinfo_a;
264 stmt_vec_info il_b;
265 stmt_vec_info last_b = il_b = DR_GROUP_FIRST_ELEMENT (stmtinfo_b);
266 if (last_b)
267 {
268 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (last_b); s;
269 s = DR_GROUP_NEXT_ELEMENT (s))
270 last_b = get_later_stmt (last_b, s);
271 if (!DR_IS_WRITE (STMT_VINFO_DATA_REF (stmtinfo_b)))
272 {
273 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_b); s;
274 s = DR_GROUP_NEXT_ELEMENT (s))
275 if (get_later_stmt (il_b, s) == il_b)
276 il_b = s;
277 }
278 else
279 il_b = last_b;
280 }
281 else
282 last_b = il_b = stmtinfo_b;
283 bool a_after_b = (get_later_stmt (stmtinfo_a, stmtinfo_b) == stmtinfo_a);
284 return (/* SLP */
285 (get_later_stmt (last_a, last_b) == last_a) == a_after_b
286 /* Interleaving */
287 && (get_later_stmt (il_a, il_b) == il_a) == a_after_b
288 /* Mixed */
289 && (get_later_stmt (il_a, last_b) == il_a) == a_after_b
290 && (get_later_stmt (last_a, il_b) == last_a) == a_after_b);
291 }
292
293 /* A subroutine of vect_analyze_data_ref_dependence. Handle
294 DDR_COULD_BE_INDEPENDENT_P ddr DDR that has a known set of dependence
295 distances. These distances are conservatively correct but they don't
296 reflect a guaranteed dependence.
297
298 Return true if this function does all the work necessary to avoid
299 an alias or false if the caller should use the dependence distances
300 to limit the vectorization factor in the usual way. LOOP_DEPTH is
301 the depth of the loop described by LOOP_VINFO and the other arguments
302 are as for vect_analyze_data_ref_dependence. */
303
304 static bool
305 vect_analyze_possibly_independent_ddr (data_dependence_relation *ddr,
306 loop_vec_info loop_vinfo,
307 int loop_depth, unsigned int *max_vf)
308 {
309 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
310 lambda_vector dist_v;
311 unsigned int i;
312 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
313 {
314 int dist = dist_v[loop_depth];
315 if (dist != 0 && !(dist > 0 && DDR_REVERSED_P (ddr)))
316 {
317 /* If the user asserted safelen >= DIST consecutive iterations
318 can be executed concurrently, assume independence.
319
320 ??? An alternative would be to add the alias check even
321 in this case, and vectorize the fallback loop with the
322 maximum VF set to safelen. However, if the user has
323 explicitly given a length, it's less likely that that
324 would be a win. */
325 if (loop->safelen >= 2 && abs_hwi (dist) <= loop->safelen)
326 {
327 if ((unsigned int) loop->safelen < *max_vf)
328 *max_vf = loop->safelen;
329 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
330 continue;
331 }
332
333 /* For dependence distances of 2 or more, we have the option
334 of limiting VF or checking for an alias at runtime.
335 Prefer to check at runtime if we can, to avoid limiting
336 the VF unnecessarily when the bases are in fact independent.
337
338 Note that the alias checks will be removed if the VF ends up
339 being small enough. */
340 dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (DDR_A (ddr));
341 dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (DDR_B (ddr));
342 return (!STMT_VINFO_GATHER_SCATTER_P (dr_info_a->stmt)
343 && !STMT_VINFO_GATHER_SCATTER_P (dr_info_b->stmt)
344 && vect_mark_for_runtime_alias_test (ddr, loop_vinfo));
345 }
346 }
347 return true;
348 }
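/* A sketch of the situation handled above (simplified; the exact
   conditions under which the dependence analysis marks a ddr as
   possibly independent are not shown): two accesses whose bases may or
   may not refer to the same object, e.g.

     void f (int *p, int *q, int n)
     {
       for (int i = 0; i < n; i++)
         p[i + 4] = q[i] + 1;
     }

   If the bases do overlap, the distance is 4, so with a user assertion
   such as "#pragma omp simd safelen(8)" the distance is ignored and
   *MAX_VF is capped at 8; without it, the code above prefers a runtime
   alias check over limiting the vectorization factor to 4.  */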
349
350
351 /* Function vect_analyze_data_ref_dependence.
352
353 FIXME: I needed to change the sense of the returned flag.
354
355 Return FALSE if there (might) exist a dependence between a memory-reference
356 DRA and a memory-reference DRB. When versioning for alias may check a
357 dependence at run-time, return TRUE. Adjust *MAX_VF according to
358 the data dependence. */
359
360 static opt_result
361 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
362 loop_vec_info loop_vinfo,
363 unsigned int *max_vf)
364 {
365 unsigned int i;
366 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
367 struct data_reference *dra = DDR_A (ddr);
368 struct data_reference *drb = DDR_B (ddr);
369 dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (dra);
370 dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (drb);
371 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
372 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
373 lambda_vector dist_v;
374 unsigned int loop_depth;
375
376 /* In loop analysis all data references should be vectorizable. */
377 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
378 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
379 gcc_unreachable ();
380
381 /* Independent data accesses. */
382 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
383 return opt_result::success ();
384
385 if (dra == drb
386 || (DR_IS_READ (dra) && DR_IS_READ (drb)))
387 return opt_result::success ();
388
389 /* We do not have to consider dependences between accesses that belong
390 to the same group, unless the stride could be smaller than the
391 group size. */
392 if (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
393 && (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
394 == DR_GROUP_FIRST_ELEMENT (stmtinfo_b))
395 && !STMT_VINFO_STRIDED_P (stmtinfo_a))
396 return opt_result::success ();
397
398 /* Even if we have an anti-dependence then, as the vectorized loop covers at
399 least two scalar iterations, there is always also a true dependence.
400 As the vectorizer does not re-order loads and stores we can ignore
401 the anti-dependence if TBAA can disambiguate both DRs similar to the
402 case with known negative distance anti-dependences (positive
403 distance anti-dependences would violate TBAA constraints). */
404 if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
405 || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
406 && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
407 get_alias_set (DR_REF (drb))))
408 return opt_result::success ();
409
410 /* Unknown data dependence. */
411 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
412 {
413 /* If user asserted safelen consecutive iterations can be
414 executed concurrently, assume independence. */
415 if (loop->safelen >= 2)
416 {
417 if ((unsigned int) loop->safelen < *max_vf)
418 *max_vf = loop->safelen;
419 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
420 return opt_result::success ();
421 }
422
423 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
424 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
425 return opt_result::failure_at
426 (stmtinfo_a->stmt,
427 "versioning for alias not supported for: "
428 "can't determine dependence between %T and %T\n",
429 DR_REF (dra), DR_REF (drb));
430
431 if (dump_enabled_p ())
432 dump_printf_loc (MSG_MISSED_OPTIMIZATION, stmtinfo_a->stmt,
433 "versioning for alias required: "
434 "can't determine dependence between %T and %T\n",
435 DR_REF (dra), DR_REF (drb));
436
437 /* Add to list of ddrs that need to be tested at run-time. */
438 return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
439 }
440
441 /* Known data dependence. */
442 if (DDR_NUM_DIST_VECTS (ddr) == 0)
443 {
444 /* If user asserted safelen consecutive iterations can be
445 executed concurrently, assume independence. */
446 if (loop->safelen >= 2)
447 {
448 if ((unsigned int) loop->safelen < *max_vf)
449 *max_vf = loop->safelen;
450 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
451 return opt_result::success ();
452 }
453
454 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
455 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
456 return opt_result::failure_at
457 (stmtinfo_a->stmt,
458 "versioning for alias not supported for: "
459 "bad dist vector for %T and %T\n",
460 DR_REF (dra), DR_REF (drb));
461
462 if (dump_enabled_p ())
463 dump_printf_loc (MSG_MISSED_OPTIMIZATION, stmtinfo_a->stmt,
464 "versioning for alias required: "
465 "bad dist vector for %T and %T\n",
466 DR_REF (dra), DR_REF (drb));
467 /* Add to list of ddrs that need to be tested at run-time. */
468 return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
469 }
470
471 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
472
473 if (DDR_COULD_BE_INDEPENDENT_P (ddr)
474 && vect_analyze_possibly_independent_ddr (ddr, loop_vinfo,
475 loop_depth, max_vf))
476 return opt_result::success ();
477
478 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
479 {
480 int dist = dist_v[loop_depth];
481
482 if (dump_enabled_p ())
483 dump_printf_loc (MSG_NOTE, vect_location,
484 "dependence distance = %d.\n", dist);
485
486 if (dist == 0)
487 {
488 if (dump_enabled_p ())
489 dump_printf_loc (MSG_NOTE, vect_location,
490 "dependence distance == 0 between %T and %T\n",
491 DR_REF (dra), DR_REF (drb));
492
493 /* When we perform grouped accesses and perform implicit CSE
494 by detecting equal accesses and doing disambiguation with
495 runtime alias tests like for
496 .. = a[i];
497 .. = a[i+1];
498 a[i] = ..;
499 a[i+1] = ..;
500 *p = ..;
501 .. = a[i];
502 .. = a[i+1];
503 where we will end up loading { a[i], a[i+1] } once, make
504 sure that inserting group loads before the first load and
505 stores after the last store will do the right thing.
506 Similar for groups like
507 a[i] = ...;
508 ... = a[i];
509 a[i+1] = ...;
510 where loads from the group interleave with the store. */
511 if (!vect_preserves_scalar_order_p (dr_info_a, dr_info_b))
512 return opt_result::failure_at (stmtinfo_a->stmt,
513 "READ_WRITE dependence"
514 " in interleaving.\n");
515
516 if (loop->safelen < 2)
517 {
518 tree indicator = dr_zero_step_indicator (dra);
519 if (!indicator || integer_zerop (indicator))
520 return opt_result::failure_at (stmtinfo_a->stmt,
521 "access also has a zero step\n");
522 else if (TREE_CODE (indicator) != INTEGER_CST)
523 vect_check_nonzero_value (loop_vinfo, indicator);
524 }
525 continue;
526 }
527
528 if (dist > 0 && DDR_REVERSED_P (ddr))
529 {
530 /* If DDR_REVERSED_P the order of the data-refs in DDR was
531 reversed (to make distance vector positive), and the actual
532 distance is negative. */
533 if (dump_enabled_p ())
534 dump_printf_loc (MSG_NOTE, vect_location,
535 "dependence distance negative.\n");
536 /* When doing outer loop vectorization, we need to check if there is
537 a backward dependence at the inner loop level if the dependence
538 at the outer loop is reversed. See PR81740. */
539 if (nested_in_vect_loop_p (loop, stmtinfo_a)
540 || nested_in_vect_loop_p (loop, stmtinfo_b))
541 {
542 unsigned inner_depth = index_in_loop_nest (loop->inner->num,
543 DDR_LOOP_NEST (ddr));
544 if (dist_v[inner_depth] < 0)
545 return opt_result::failure_at (stmtinfo_a->stmt,
546 "not vectorized, dependence "
547 "between data-refs %T and %T\n",
548 DR_REF (dra), DR_REF (drb));
549 }
550 /* Record a negative dependence distance to later limit the
551 amount of stmt copying / unrolling we can perform.
552 Only need to handle read-after-write dependence. */
553 if (DR_IS_READ (drb)
554 && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
555 || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
556 STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
557 continue;
558 }
559
560 unsigned int abs_dist = abs (dist);
561 if (abs_dist >= 2 && abs_dist < *max_vf)
562 {
563 /* The dependence distance requires reduction of the maximal
564 vectorization factor. */
565 *max_vf = abs_dist;
566 if (dump_enabled_p ())
567 dump_printf_loc (MSG_NOTE, vect_location,
568 "adjusting maximal vectorization factor to %i\n",
569 *max_vf);
570 }
571
572 if (abs_dist >= *max_vf)
573 {
574 /* Dependence distance does not create dependence, as far as
575 vectorization is concerned, in this case. */
576 if (dump_enabled_p ())
577 dump_printf_loc (MSG_NOTE, vect_location,
578 "dependence distance >= VF.\n");
579 continue;
580 }
581
582 return opt_result::failure_at (stmtinfo_a->stmt,
583 "not vectorized, possible dependence "
584 "between data-refs %T and %T\n",
585 DR_REF (dra), DR_REF (drb));
586 }
587
588 return opt_result::success ();
589 }
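/* A worked example of the distance handling above:

     for (i = 0; i < n; i++)
       a[i + 3] = a[i] + 1;

   has a dependence distance of 3, so the loop is still vectorizable
   but *MAX_VF is reduced to 3. A distance of 0 (the same location
   read and written in one iteration) is accepted subject to the
   ordering and zero-step checks above, and a distance that is already
   >= *MAX_VF does not constrain vectorization at all.  */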
590
591 /* Function vect_analyze_data_ref_dependences.
592
593 Examine all the data references in the loop, and make sure there do not
594 exist any data dependences between them. Set *MAX_VF according to
595 the maximum vectorization factor the data dependences allow. */
596
597 opt_result
598 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo,
599 unsigned int *max_vf)
600 {
601 unsigned int i;
602 struct data_dependence_relation *ddr;
603
604 DUMP_VECT_SCOPE ("vect_analyze_data_ref_dependences");
605
606 if (!LOOP_VINFO_DDRS (loop_vinfo).exists ())
607 {
608 LOOP_VINFO_DDRS (loop_vinfo)
609 .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
610 * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
611 /* We need read-read dependences to compute
612 STMT_VINFO_SAME_ALIGN_REFS. */
613 bool res = compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
614 &LOOP_VINFO_DDRS (loop_vinfo),
615 LOOP_VINFO_LOOP_NEST (loop_vinfo),
616 true);
617 gcc_assert (res);
618 }
619
620 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
621
622 /* For epilogues we either have no aliases or alias versioning
623 was applied to the original loop. Therefore we may just get max_vf
624 using the VF of the original loop. */
625 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
626 *max_vf = LOOP_VINFO_ORIG_MAX_VECT_FACTOR (loop_vinfo);
627 else
628 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
629 {
630 opt_result res
631 = vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf);
632 if (!res)
633 return res;
634 }
635
636 return opt_result::success ();
637 }
638
639
640 /* Function vect_slp_analyze_data_ref_dependence.
641
642 Return TRUE if there (might) exist a dependence between a memory-reference
643 DRA and a memory-reference DRB for VINFO. Return FALSE if the two
644 references are known to be independent, e.g. because both are reads
645 or they belong to the same interleaving chain. */
646
647 static bool
648 vect_slp_analyze_data_ref_dependence (vec_info *vinfo,
649 struct data_dependence_relation *ddr)
650 {
651 struct data_reference *dra = DDR_A (ddr);
652 struct data_reference *drb = DDR_B (ddr);
653 dr_vec_info *dr_info_a = vinfo->lookup_dr (dra);
654 dr_vec_info *dr_info_b = vinfo->lookup_dr (drb);
655
656 /* We need to check dependences of statements marked as unvectorizable
657 as well; they can still prohibit vectorization. */
658
659 /* Independent data accesses. */
660 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
661 return false;
662
663 if (dra == drb)
664 return false;
665
666 /* Read-read is OK. */
667 if (DR_IS_READ (dra) && DR_IS_READ (drb))
668 return false;
669
670 /* If dra and drb are part of the same interleaving chain consider
671 them independent. */
672 if (STMT_VINFO_GROUPED_ACCESS (dr_info_a->stmt)
673 && (DR_GROUP_FIRST_ELEMENT (dr_info_a->stmt)
674 == DR_GROUP_FIRST_ELEMENT (dr_info_b->stmt)))
675 return false;
676
677 /* Unknown data dependence. */
678 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
679 {
680 if (dump_enabled_p ())
681 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
682 "can't determine dependence between %T and %T\n",
683 DR_REF (dra), DR_REF (drb));
684 }
685 else if (dump_enabled_p ())
686 dump_printf_loc (MSG_NOTE, vect_location,
687 "determined dependence between %T and %T\n",
688 DR_REF (dra), DR_REF (drb));
689
690 return true;
691 }
692
693
694 /* Analyze dependences involved in the transform of SLP NODE. STORES
695 contains the scalar stores of this instance if we are
696 disambiguating the loads. */
697
698 static bool
699 vect_slp_analyze_node_dependences (slp_instance instance, slp_tree node,
700 vec<stmt_vec_info> stores,
701 stmt_vec_info last_store_info)
702 {
703 /* This walks over all stmts involved in the SLP load/store done
704 in NODE verifying we can sink them up to the last stmt in the
705 group. */
706 stmt_vec_info last_access_info = vect_find_last_scalar_stmt_in_slp (node);
707 vec_info *vinfo = last_access_info->vinfo;
708 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
709 {
710 stmt_vec_info access_info = SLP_TREE_SCALAR_STMTS (node)[k];
711 if (access_info == last_access_info)
712 continue;
713 data_reference *dr_a = STMT_VINFO_DATA_REF (access_info);
714 ao_ref ref;
715 bool ref_initialized_p = false;
716 for (gimple_stmt_iterator gsi = gsi_for_stmt (access_info->stmt);
717 gsi_stmt (gsi) != last_access_info->stmt; gsi_next (&gsi))
718 {
719 gimple *stmt = gsi_stmt (gsi);
720 if (! gimple_vuse (stmt)
721 || (DR_IS_READ (dr_a) && ! gimple_vdef (stmt)))
722 continue;
723
724 /* If we couldn't record a (single) data reference for this
725 stmt we have to resort to the alias oracle. */
726 stmt_vec_info stmt_info = vinfo->lookup_stmt (stmt);
727 data_reference *dr_b = STMT_VINFO_DATA_REF (stmt_info);
728 if (!dr_b)
729 {
730 /* We are moving a store or sinking a load - this means
731 we cannot use TBAA for disambiguation. */
732 if (!ref_initialized_p)
733 ao_ref_init (&ref, DR_REF (dr_a));
734 if (stmt_may_clobber_ref_p_1 (stmt, &ref, false)
735 || ref_maybe_used_by_stmt_p (stmt, &ref, false))
736 return false;
737 continue;
738 }
739
740 bool dependent = false;
741 /* If we run into a store of this same instance (we've just
742 marked those) then delay dependence checking until we run
743 into the last store because this is where it will have
744 been sunk to (and we verify if we can do that as well). */
745 if (gimple_visited_p (stmt))
746 {
747 if (stmt_info != last_store_info)
748 continue;
749 unsigned i;
750 stmt_vec_info store_info;
751 FOR_EACH_VEC_ELT (stores, i, store_info)
752 {
753 data_reference *store_dr = STMT_VINFO_DATA_REF (store_info);
754 ddr_p ddr = initialize_data_dependence_relation
755 (dr_a, store_dr, vNULL);
756 dependent
757 = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
758 free_dependence_relation (ddr);
759 if (dependent)
760 break;
761 }
762 }
763 else
764 {
765 ddr_p ddr = initialize_data_dependence_relation (dr_a,
766 dr_b, vNULL);
767 dependent = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
768 free_dependence_relation (ddr);
769 }
770 if (dependent)
771 return false;
772 }
773 }
774 return true;
775 }
776
777
778 /* Function vect_slp_analyze_instance_dependence.
779
780 Examine the data references involved in SLP instance INSTANCE and
781 return TRUE if no data dependence prevents the loads and stores of
782 the instance from being vectorized. */
783
784 bool
785 vect_slp_analyze_instance_dependence (slp_instance instance)
786 {
787 DUMP_VECT_SCOPE ("vect_slp_analyze_instance_dependence");
788
789 /* The stores of this instance are at the root of the SLP tree. */
790 slp_tree store = SLP_INSTANCE_TREE (instance);
791 if (! STMT_VINFO_DATA_REF (SLP_TREE_SCALAR_STMTS (store)[0]))
792 store = NULL;
793
794 /* Verify we can sink stores to the vectorized stmt insert location. */
795 stmt_vec_info last_store_info = NULL;
796 if (store)
797 {
798 if (! vect_slp_analyze_node_dependences (instance, store, vNULL, NULL))
799 return false;
800
801 /* Mark stores in this instance and remember the last one. */
802 last_store_info = vect_find_last_scalar_stmt_in_slp (store);
803 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
804 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, true);
805 }
806
807 bool res = true;
808
809 /* Verify we can sink loads to the vectorized stmt insert location,
810 special-casing stores of this instance. */
811 slp_tree load;
812 unsigned int i;
813 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load)
814 if (! vect_slp_analyze_node_dependences (instance, load,
815 store
816 ? SLP_TREE_SCALAR_STMTS (store)
817 : vNULL, last_store_info))
818 {
819 res = false;
820 break;
821 }
822
823 /* Unset the visited flag. */
824 if (store)
825 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
826 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, false);
827
828 return res;
829 }
830
831 /* Record the base alignment guarantee given by DRB, which occurs
832 in STMT_INFO. */
833
834 static void
835 vect_record_base_alignment (stmt_vec_info stmt_info,
836 innermost_loop_behavior *drb)
837 {
838 vec_info *vinfo = stmt_info->vinfo;
839 bool existed;
840 innermost_loop_behavior *&entry
841 = vinfo->base_alignments.get_or_insert (drb->base_address, &existed);
842 if (!existed || entry->base_alignment < drb->base_alignment)
843 {
844 entry = drb;
845 if (dump_enabled_p ())
846 dump_printf_loc (MSG_NOTE, vect_location,
847 "recording new base alignment for %T\n"
848 " alignment: %d\n"
849 " misalignment: %d\n"
850 " based on: %G",
851 drb->base_address,
852 drb->base_alignment,
853 drb->base_misalignment,
854 stmt_info->stmt);
855 }
856 }
857
858 /* If the region we're going to vectorize is reached, all unconditional
859 data references occur at least once. We can therefore pool the base
860 alignment guarantees from each unconditional reference. Do this by
861 going through all the data references in VINFO and checking whether
862 the containing statement makes the reference unconditionally. If so,
863 record the alignment of the base address in VINFO so that it can be
864 used for all other references with the same base. */
865
866 void
867 vect_record_base_alignments (vec_info *vinfo)
868 {
869 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
870 class loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
871 data_reference *dr;
872 unsigned int i;
873 FOR_EACH_VEC_ELT (vinfo->shared->datarefs, i, dr)
874 {
875 dr_vec_info *dr_info = vinfo->lookup_dr (dr);
876 stmt_vec_info stmt_info = dr_info->stmt;
877 if (!DR_IS_CONDITIONAL_IN_STMT (dr)
878 && STMT_VINFO_VECTORIZABLE (stmt_info)
879 && !STMT_VINFO_GATHER_SCATTER_P (stmt_info))
880 {
881 vect_record_base_alignment (stmt_info, &DR_INNERMOST (dr));
882
883 /* If DR is nested in the loop that is being vectorized, we can also
884 record the alignment of the base wrt the outer loop. */
885 if (loop && nested_in_vect_loop_p (loop, stmt_info))
886 vect_record_base_alignment
887 (stmt_info, &STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info));
888 }
889 }
890 }
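/* A small sketch of the pooling above: assuming P is a float pointer
   whose unconditional references let us record a 32-byte base
   alignment,

     for (i = 0; i < n; i++)
       p[i] = p[i] + 1.0f;

   the guarantee recorded for the load's base address can later be
   reused by vect_compute_data_ref_alignment for the store, which
   shares the same base.  */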
891
892 /* Return the target alignment for the vectorized form of DR_INFO. */
893
894 static poly_uint64
895 vect_calculate_target_alignment (dr_vec_info *dr_info)
896 {
897 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
898 return targetm.vectorize.preferred_vector_alignment (vectype);
899 }
900
901 /* Function vect_compute_data_ref_alignment
902
903 Compute the misalignment of the data reference DR_INFO.
904
905 Output:
906 1. DR_MISALIGNMENT (DR_INFO) is defined.
907
908 FOR NOW: No analysis is actually performed. Misalignment is calculated
909 only for trivial cases. TODO. */
910
911 static void
912 vect_compute_data_ref_alignment (dr_vec_info *dr_info)
913 {
914 stmt_vec_info stmt_info = dr_info->stmt;
915 vec_base_alignments *base_alignments = &stmt_info->vinfo->base_alignments;
916 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
917 class loop *loop = NULL;
918 tree ref = DR_REF (dr_info->dr);
919 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
920
921 if (dump_enabled_p ())
922 dump_printf_loc (MSG_NOTE, vect_location,
923 "vect_compute_data_ref_alignment:\n");
924
925 if (loop_vinfo)
926 loop = LOOP_VINFO_LOOP (loop_vinfo);
927
928 /* Initialize misalignment to unknown. */
929 SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN);
930
931 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
932 return;
933
934 innermost_loop_behavior *drb = vect_dr_behavior (dr_info);
935 bool step_preserves_misalignment_p;
936
937 poly_uint64 vector_alignment
938 = exact_div (vect_calculate_target_alignment (dr_info), BITS_PER_UNIT);
939 DR_TARGET_ALIGNMENT (dr_info) = vector_alignment;
940
941 unsigned HOST_WIDE_INT vect_align_c;
942 if (!vector_alignment.is_constant (&vect_align_c))
943 return;
944
945 /* No step for BB vectorization. */
946 if (!loop)
947 {
948 gcc_assert (integer_zerop (drb->step));
949 step_preserves_misalignment_p = true;
950 }
951
952 /* In case the dataref is in an inner-loop of the loop that is being
953 vectorized (LOOP), we use the base and misalignment information
954 relative to the outer-loop (LOOP). This is ok only if the misalignment
955 stays the same throughout the execution of the inner-loop, which is why
956 we have to check that the stride of the dataref in the inner-loop is a
957 multiple of the vector alignment. */
958 else if (nested_in_vect_loop_p (loop, stmt_info))
959 {
960 step_preserves_misalignment_p
961 = (DR_STEP_ALIGNMENT (dr_info->dr) % vect_align_c) == 0;
962
963 if (dump_enabled_p ())
964 {
965 if (step_preserves_misalignment_p)
966 dump_printf_loc (MSG_NOTE, vect_location,
967 "inner step divides the vector alignment.\n");
968 else
969 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
970 "inner step doesn't divide the vector"
971 " alignment.\n");
972 }
973 }
974
975 /* Similarly we can only use base and misalignment information relative to
976 an innermost loop if the misalignment stays the same throughout the
977 execution of the loop. As above, this is the case if the stride of
978 the dataref times the vectorization factor is a multiple of the alignment. */
979 else
980 {
981 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
982 step_preserves_misalignment_p
983 = multiple_p (DR_STEP_ALIGNMENT (dr_info->dr) * vf, vect_align_c);
984
985 if (!step_preserves_misalignment_p && dump_enabled_p ())
986 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
987 "step doesn't divide the vector alignment.\n");
988 }
989
990 unsigned int base_alignment = drb->base_alignment;
991 unsigned int base_misalignment = drb->base_misalignment;
992
993 /* Calculate the maximum of the pooled base address alignment and the
994 alignment that we can compute for DR itself. */
995 innermost_loop_behavior **entry = base_alignments->get (drb->base_address);
996 if (entry && base_alignment < (*entry)->base_alignment)
997 {
998 base_alignment = (*entry)->base_alignment;
999 base_misalignment = (*entry)->base_misalignment;
1000 }
1001
1002 if (drb->offset_alignment < vect_align_c
1003 || !step_preserves_misalignment_p
1004 /* We need to know whether the step wrt the vectorized loop is
1005 negative when computing the starting misalignment below. */
1006 || TREE_CODE (drb->step) != INTEGER_CST)
1007 {
1008 if (dump_enabled_p ())
1009 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1010 "Unknown alignment for access: %T\n", ref);
1011 return;
1012 }
1013
1014 if (base_alignment < vect_align_c)
1015 {
1016 unsigned int max_alignment;
1017 tree base = get_base_for_alignment (drb->base_address, &max_alignment);
1018 if (max_alignment < vect_align_c
1019 || !vect_can_force_dr_alignment_p (base,
1020 vect_align_c * BITS_PER_UNIT))
1021 {
1022 if (dump_enabled_p ())
1023 dump_printf_loc (MSG_NOTE, vect_location,
1024 "can't force alignment of ref: %T\n", ref);
1025 return;
1026 }
1027
1028 /* Force the alignment of the decl.
1029 NOTE: This is the only change to the code we make during
1030 the analysis phase, before deciding to vectorize the loop. */
1031 if (dump_enabled_p ())
1032 dump_printf_loc (MSG_NOTE, vect_location,
1033 "force alignment of %T\n", ref);
1034
1035 dr_info->base_decl = base;
1036 dr_info->base_misaligned = true;
1037 base_misalignment = 0;
1038 }
1039 poly_int64 misalignment
1040 = base_misalignment + wi::to_poly_offset (drb->init).force_shwi ();
1041
1042 /* If this is a backward running DR then the first access in the larger
1043 vectype actually is N-1 elements before the address in the DR.
1044 Adjust misalign accordingly. */
1045 if (tree_int_cst_sgn (drb->step) < 0)
1046 /* PLUS because STEP is negative. */
1047 misalignment += ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
1048 * TREE_INT_CST_LOW (drb->step));
1049
1050 unsigned int const_misalignment;
1051 if (!known_misalignment (misalignment, vect_align_c, &const_misalignment))
1052 {
1053 if (dump_enabled_p ())
1054 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1055 "Non-constant misalignment for access: %T\n", ref);
1056 return;
1057 }
1058
1059 SET_DR_MISALIGNMENT (dr_info, const_misalignment);
1060
1061 if (dump_enabled_p ())
1062 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1063 "misalign = %d bytes of ref %T\n",
1064 DR_MISALIGNMENT (dr_info), ref);
1065
1066 return;
1067 }
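/* A worked example of the computation above, assuming a 16-byte target
   vector alignment: for a reference to a[1] where "a" is an array of
   4-byte ints known to be 16-byte aligned, base_misalignment is 0 and
   DRB->init is 4, so DR_MISALIGNMENT becomes 4 bytes. If instead the
   base alignment were unknown and could not be forced up to 16 bytes,
   the misalignment would stay DR_MISALIGNMENT_UNKNOWN.  */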
1068
1069 /* Function vect_update_misalignment_for_peel.
1070 Sets DR_INFO's misalignment
1071 - to 0 if it has the same alignment as DR_PEEL_INFO,
1072 - to the misalignment computed using NPEEL if DR_INFO's misalignment is known,
1073 - to -1 (unknown) otherwise.
1074
1075 DR_INFO - the data reference whose misalignment is to be adjusted.
1076 DR_PEEL_INFO - the data reference whose misalignment is being made
1077 zero in the vector loop by the peel.
1078 NPEEL - the number of iterations in the peel loop if the misalignment
1079 of DR_PEEL_INFO is known at compile time. */
1080
1081 static void
1082 vect_update_misalignment_for_peel (dr_vec_info *dr_info,
1083 dr_vec_info *dr_peel_info, int npeel)
1084 {
1085 unsigned int i;
1086 vec<dr_p> same_aligned_drs;
1087 struct data_reference *current_dr;
1088 stmt_vec_info peel_stmt_info = dr_peel_info->stmt;
1089
1090 /* It can be assumed that if dr_info has the same alignment as dr_peel,
1091 it is aligned in the vector loop. */
1092 same_aligned_drs = STMT_VINFO_SAME_ALIGN_REFS (peel_stmt_info);
1093 FOR_EACH_VEC_ELT (same_aligned_drs, i, current_dr)
1094 {
1095 if (current_dr != dr_info->dr)
1096 continue;
1097 gcc_assert (!known_alignment_for_access_p (dr_info)
1098 || !known_alignment_for_access_p (dr_peel_info)
1099 || (DR_MISALIGNMENT (dr_info)
1100 == DR_MISALIGNMENT (dr_peel_info)));
1101 SET_DR_MISALIGNMENT (dr_info, 0);
1102 return;
1103 }
1104
1105 unsigned HOST_WIDE_INT alignment;
1106 if (DR_TARGET_ALIGNMENT (dr_info).is_constant (&alignment)
1107 && known_alignment_for_access_p (dr_info)
1108 && known_alignment_for_access_p (dr_peel_info))
1109 {
1110 int misal = DR_MISALIGNMENT (dr_info);
1111 misal += npeel * TREE_INT_CST_LOW (DR_STEP (dr_info->dr));
1112 misal &= alignment - 1;
1113 SET_DR_MISALIGNMENT (dr_info, misal);
1114 return;
1115 }
1116
1117 if (dump_enabled_p ())
1118 dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment " \
1119 "to unknown (-1).\n");
1120 SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN);
1121 }
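/* A small arithmetic sketch of the NPEEL adjustment above: with a
   16-byte target alignment, a data reference whose misalignment is 8
   bytes and whose step is 4 bytes ends up with (8 + 2 * 4) & 15 == 0
   after peeling NPEEL == 2 iterations, i.e. the peel happens to make
   it aligned.  */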
1122
1123
1124 /* Function verify_data_ref_alignment
1125
1126 Return TRUE if DR_INFO can be handled with respect to alignment. */
1127
1128 static opt_result
1129 verify_data_ref_alignment (dr_vec_info *dr_info)
1130 {
1131 enum dr_alignment_support supportable_dr_alignment
1132 = vect_supportable_dr_alignment (dr_info, false);
1133 if (!supportable_dr_alignment)
1134 return opt_result::failure_at
1135 (dr_info->stmt->stmt,
1136 DR_IS_READ (dr_info->dr)
1137 ? "not vectorized: unsupported unaligned load: %T\n"
1138 : "not vectorized: unsupported unaligned store: %T\n",
1139 DR_REF (dr_info->dr));
1140
1141 if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
1142 dump_printf_loc (MSG_NOTE, vect_location,
1143 "Vectorizing an unaligned access.\n");
1144
1145 return opt_result::success ();
1146 }
1147
1148 /* Function vect_verify_datarefs_alignment
1149
1150 Return TRUE if all data references in the loop can be
1151 handled with respect to alignment. */
1152
1153 opt_result
1154 vect_verify_datarefs_alignment (loop_vec_info vinfo)
1155 {
1156 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
1157 struct data_reference *dr;
1158 unsigned int i;
1159
1160 FOR_EACH_VEC_ELT (datarefs, i, dr)
1161 {
1162 dr_vec_info *dr_info = vinfo->lookup_dr (dr);
1163 stmt_vec_info stmt_info = dr_info->stmt;
1164
1165 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1166 continue;
1167
1168 /* For interleaving, only the alignment of the first access matters. */
1169 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1170 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1171 continue;
1172
1173 /* Strided accesses perform only component accesses, alignment is
1174 irrelevant for them. */
1175 if (STMT_VINFO_STRIDED_P (stmt_info)
1176 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1177 continue;
1178
1179 opt_result res = verify_data_ref_alignment (dr_info);
1180 if (!res)
1181 return res;
1182 }
1183
1184 return opt_result::success ();
1185 }
1186
1187 /* Given a memory reference EXP, return whether its alignment is less
1188 than its size. */
1189
1190 static bool
1191 not_size_aligned (tree exp)
1192 {
1193 if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
1194 return true;
1195
1196 return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
1197 > get_object_alignment (exp));
1198 }
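/* For example, a 4-byte int member of a packed struct is typically
   only byte aligned, so get_object_alignment returns 8 bits while
   TYPE_SIZE is 32 bits and the access counts as not size aligned.  */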
1199
1200 /* Function vector_alignment_reachable_p
1201
1202 Return true if vector alignment for DR_INFO is reachable by peeling
1203 a few loop iterations. Return false otherwise. */
1204
1205 static bool
1206 vector_alignment_reachable_p (dr_vec_info *dr_info)
1207 {
1208 stmt_vec_info stmt_info = dr_info->stmt;
1209 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1210
1211 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1212 {
1213 /* For interleaved accesses we peel only if the number of iterations
1214 in the prologue loop (VF - misalignment) is a multiple of the
1215 number of interleaved accesses. */
1216 int elem_size, mis_in_elements;
1217
1218 /* FORNOW: handle only known alignment. */
1219 if (!known_alignment_for_access_p (dr_info))
1220 return false;
1221
1222 poly_uint64 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1223 poly_uint64 vector_size = GET_MODE_SIZE (TYPE_MODE (vectype));
1224 elem_size = vector_element_size (vector_size, nelements);
1225 mis_in_elements = DR_MISALIGNMENT (dr_info) / elem_size;
1226
1227 if (!multiple_p (nelements - mis_in_elements, DR_GROUP_SIZE (stmt_info)))
1228 return false;
1229 }
1230
1231 /* If misalignment is known at compile time then allow peeling
1232 only if natural alignment is reachable through peeling. */
1233 if (known_alignment_for_access_p (dr_info) && !aligned_access_p (dr_info))
1234 {
1235 HOST_WIDE_INT elmsize =
1236 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1237 if (dump_enabled_p ())
1238 {
1239 dump_printf_loc (MSG_NOTE, vect_location,
1240 "data size = %wd. misalignment = %d.\n", elmsize,
1241 DR_MISALIGNMENT (dr_info));
1242 }
1243 if (DR_MISALIGNMENT (dr_info) % elmsize)
1244 {
1245 if (dump_enabled_p ())
1246 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1247 "data size does not divide the misalignment.\n");
1248 return false;
1249 }
1250 }
1251
1252 if (!known_alignment_for_access_p (dr_info))
1253 {
1254 tree type = TREE_TYPE (DR_REF (dr_info->dr));
1255 bool is_packed = not_size_aligned (DR_REF (dr_info->dr));
1256 if (dump_enabled_p ())
1257 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1258 "Unknown misalignment, %snaturally aligned\n",
1259 is_packed ? "not " : "");
1260 return targetm.vectorize.vector_alignment_reachable (type, is_packed);
1261 }
1262
1263 return true;
1264 }
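/* To illustrate the divisibility check above: an access with 4-byte
   elements and a known misalignment of 6 bytes can never be made
   aligned by peeling whole iterations (6 % 4 != 0), whereas a
   misalignment of 8 bytes can be peeled away.  */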
1265
1266
1267 /* Calculate the cost of the memory access represented by DR_INFO. */
1268
1269 static void
1270 vect_get_data_access_cost (dr_vec_info *dr_info,
1271 unsigned int *inside_cost,
1272 unsigned int *outside_cost,
1273 stmt_vector_for_cost *body_cost_vec,
1274 stmt_vector_for_cost *prologue_cost_vec)
1275 {
1276 stmt_vec_info stmt_info = dr_info->stmt;
1277 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1278 int ncopies;
1279
1280 if (PURE_SLP_STMT (stmt_info))
1281 ncopies = 1;
1282 else
1283 ncopies = vect_get_num_copies (loop_vinfo, STMT_VINFO_VECTYPE (stmt_info));
1284
1285 if (DR_IS_READ (dr_info->dr))
1286 vect_get_load_cost (stmt_info, ncopies, true, inside_cost, outside_cost,
1287 prologue_cost_vec, body_cost_vec, false);
1288 else
1289 vect_get_store_cost (stmt_info, ncopies, inside_cost, body_cost_vec);
1290
1291 if (dump_enabled_p ())
1292 dump_printf_loc (MSG_NOTE, vect_location,
1293 "vect_get_data_access_cost: inside_cost = %d, "
1294 "outside_cost = %d.\n", *inside_cost, *outside_cost);
1295 }
1296
1297
1298 typedef struct _vect_peel_info
1299 {
1300 dr_vec_info *dr_info;
1301 int npeel;
1302 unsigned int count;
1303 } *vect_peel_info;
1304
1305 typedef struct _vect_peel_extended_info
1306 {
1307 struct _vect_peel_info peel_info;
1308 unsigned int inside_cost;
1309 unsigned int outside_cost;
1310 } *vect_peel_extended_info;
1311
1312
1313 /* Peeling hashtable helpers. */
1314
1315 struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
1316 {
1317 static inline hashval_t hash (const _vect_peel_info *);
1318 static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
1319 };
1320
1321 inline hashval_t
1322 peel_info_hasher::hash (const _vect_peel_info *peel_info)
1323 {
1324 return (hashval_t) peel_info->npeel;
1325 }
1326
1327 inline bool
1328 peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
1329 {
1330 return (a->npeel == b->npeel);
1331 }
1332
1333
1334 /* Insert DR_INFO into peeling hash table with NPEEL as key. */
1335
1336 static void
1337 vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
1338 loop_vec_info loop_vinfo, dr_vec_info *dr_info,
1339 int npeel)
1340 {
1341 struct _vect_peel_info elem, *slot;
1342 _vect_peel_info **new_slot;
1343 bool supportable_dr_alignment
1344 = vect_supportable_dr_alignment (dr_info, true);
1345
1346 elem.npeel = npeel;
1347 slot = peeling_htab->find (&elem);
1348 if (slot)
1349 slot->count++;
1350 else
1351 {
1352 slot = XNEW (struct _vect_peel_info);
1353 slot->npeel = npeel;
1354 slot->dr_info = dr_info;
1355 slot->count = 1;
1356 new_slot = peeling_htab->find_slot (slot, INSERT);
1357 *new_slot = slot;
1358 }
1359
1360 if (!supportable_dr_alignment
1361 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1362 slot->count += VECT_MAX_COST;
1363 }
1364
1365
1366 /* Traverse peeling hash table to find peeling option that aligns maximum
1367 number of data accesses. */
1368
1369 int
1370 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1371 _vect_peel_extended_info *max)
1372 {
1373 vect_peel_info elem = *slot;
1374
1375 if (elem->count > max->peel_info.count
1376 || (elem->count == max->peel_info.count
1377 && max->peel_info.npeel > elem->npeel))
1378 {
1379 max->peel_info.npeel = elem->npeel;
1380 max->peel_info.count = elem->count;
1381 max->peel_info.dr_info = elem->dr_info;
1382 }
1383
1384 return 1;
1385 }
1386
1387 /* Get the costs of peeling NPEEL iterations for LOOP_VINFO, checking
1388 data access costs for all data refs. If UNKNOWN_MISALIGNMENT is true,
1389 we assume DR0_INFO's misalignment will be zero after peeling. */
1390
1391 static void
1392 vect_get_peeling_costs_all_drs (loop_vec_info loop_vinfo,
1393 dr_vec_info *dr0_info,
1394 unsigned int *inside_cost,
1395 unsigned int *outside_cost,
1396 stmt_vector_for_cost *body_cost_vec,
1397 stmt_vector_for_cost *prologue_cost_vec,
1398 unsigned int npeel,
1399 bool unknown_misalignment)
1400 {
1401 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1402 unsigned i;
1403 data_reference *dr;
1404
1405 FOR_EACH_VEC_ELT (datarefs, i, dr)
1406 {
1407 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1408 stmt_vec_info stmt_info = dr_info->stmt;
1409 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1410 continue;
1411
1412 /* For interleaving, only the alignment of the first access
1413 matters. */
1414 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1415 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1416 continue;
1417
1418 /* Strided accesses perform only component accesses, alignment is
1419 irrelevant for them. */
1420 if (STMT_VINFO_STRIDED_P (stmt_info)
1421 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1422 continue;
1423
1424 int save_misalignment;
1425 save_misalignment = DR_MISALIGNMENT (dr_info);
1426 if (npeel == 0)
1427 ;
1428 else if (unknown_misalignment && dr_info == dr0_info)
1429 SET_DR_MISALIGNMENT (dr_info, 0);
1430 else
1431 vect_update_misalignment_for_peel (dr_info, dr0_info, npeel);
1432 vect_get_data_access_cost (dr_info, inside_cost, outside_cost,
1433 body_cost_vec, prologue_cost_vec);
1434 SET_DR_MISALIGNMENT (dr_info, save_misalignment);
1435 }
1436 }
1437
1438 /* Traverse peeling hash table and calculate cost for each peeling option.
1439 Find the one with the lowest cost. */
1440
1441 int
1442 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1443 _vect_peel_extended_info *min)
1444 {
1445 vect_peel_info elem = *slot;
1446 int dummy;
1447 unsigned int inside_cost = 0, outside_cost = 0;
1448 stmt_vec_info stmt_info = elem->dr_info->stmt;
1449 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1450 stmt_vector_for_cost prologue_cost_vec, body_cost_vec,
1451 epilogue_cost_vec;
1452
1453 prologue_cost_vec.create (2);
1454 body_cost_vec.create (2);
1455 epilogue_cost_vec.create (2);
1456
1457 vect_get_peeling_costs_all_drs (loop_vinfo, elem->dr_info, &inside_cost,
1458 &outside_cost, &body_cost_vec,
1459 &prologue_cost_vec, elem->npeel, false);
1460
1461 body_cost_vec.release ();
1462
1463 outside_cost += vect_get_known_peeling_cost
1464 (loop_vinfo, elem->npeel, &dummy,
1465 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1466 &prologue_cost_vec, &epilogue_cost_vec);
1467
1468 /* Prologue and epilogue costs are added to the target model later.
1469 These costs depend only on the scalar iteration cost, the
1470 number of peeling iterations finally chosen, and the number of
1471 misaligned statements. So discard the information found here. */
1472 prologue_cost_vec.release ();
1473 epilogue_cost_vec.release ();
1474
1475 if (inside_cost < min->inside_cost
1476 || (inside_cost == min->inside_cost
1477 && outside_cost < min->outside_cost))
1478 {
1479 min->inside_cost = inside_cost;
1480 min->outside_cost = outside_cost;
1481 min->peel_info.dr_info = elem->dr_info;
1482 min->peel_info.npeel = elem->npeel;
1483 min->peel_info.count = elem->count;
1484 }
1485
1486 return 1;
1487 }
1488
1489
1490 /* Choose best peeling option by traversing peeling hash table and either
1491 choosing an option with the lowest cost (if cost model is enabled) or the
1492 option that aligns as many accesses as possible. */
1493
1494 static struct _vect_peel_extended_info
1495 vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
1496 loop_vec_info loop_vinfo)
1497 {
1498 struct _vect_peel_extended_info res;
1499
1500 res.peel_info.dr_info = NULL;
1501
1502 if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1503 {
1504 res.inside_cost = INT_MAX;
1505 res.outside_cost = INT_MAX;
1506 peeling_htab->traverse <_vect_peel_extended_info *,
1507 vect_peeling_hash_get_lowest_cost> (&res);
1508 }
1509 else
1510 {
1511 res.peel_info.count = 0;
1512 peeling_htab->traverse <_vect_peel_extended_info *,
1513 vect_peeling_hash_get_most_frequent> (&res);
1514 res.inside_cost = 0;
1515 res.outside_cost = 0;
1516 }
1517
1518 return res;
1519 }
1520
1521 /* Return true if the new peeling NPEEL is supported. */
1522
1523 static bool
1524 vect_peeling_supportable (loop_vec_info loop_vinfo, dr_vec_info *dr0_info,
1525 unsigned npeel)
1526 {
1527 unsigned i;
1528 struct data_reference *dr = NULL;
1529 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1530 enum dr_alignment_support supportable_dr_alignment;
1531
1532 /* Ensure that all data refs can be vectorized after the peel. */
1533 FOR_EACH_VEC_ELT (datarefs, i, dr)
1534 {
1535 int save_misalignment;
1536
1537 if (dr == dr0_info->dr)
1538 continue;
1539
1540 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1541 stmt_vec_info stmt_info = dr_info->stmt;
1542 /* For interleaving, only the alignment of the first access
1543 matters. */
1544 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1545 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1546 continue;
1547
1548 /* Strided accesses perform only component accesses, alignment is
1549 irrelevant for them. */
1550 if (STMT_VINFO_STRIDED_P (stmt_info)
1551 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1552 continue;
1553
1554 save_misalignment = DR_MISALIGNMENT (dr_info);
1555 vect_update_misalignment_for_peel (dr_info, dr0_info, npeel);
1556 supportable_dr_alignment
1557 = vect_supportable_dr_alignment (dr_info, false);
1558 SET_DR_MISALIGNMENT (dr_info, save_misalignment);
1559
1560 if (!supportable_dr_alignment)
1561 return false;
1562 }
1563
1564 return true;
1565 }
1566
1567 /* Function vect_enhance_data_refs_alignment
1568
1569 This pass will use loop versioning and loop peeling in order to enhance
1570 the alignment of data references in the loop.
1571
1572 FOR NOW: we assume that whatever versioning/peeling takes place, only the
1573 original loop is to be vectorized. Any other loops that are created by
1574 the transformations performed in this pass are not supposed to be
1575 vectorized. This restriction will be relaxed.
1576
1577 This pass will require a cost model to guide it whether to apply peeling
1578 or versioning or a combination of the two. For example, the scheme that
1579 Intel uses when given a loop with several memory accesses, is as follows:
1580 choose one memory access ('p') whose alignment you want to force by doing
1581 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
1582 other accesses are not necessarily aligned, or (2) use loop versioning to
1583 generate one loop in which all accesses are aligned, and another loop in
1584 which only 'p' is necessarily aligned.
1585
1586 ("Automatic Intra-Register Vectorization for the Intel Architecture",
1587 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Xinmin Tian, International
1588 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1589
1590 Devising a cost model is the most critical aspect of this work. It will
1591 guide us on which access to peel for, whether to use loop versioning, how
1592 many versions to create, etc. The cost model will probably consist of
1593 generic considerations as well as target specific considerations (on
1594 powerpc for example, misaligned stores are more painful than misaligned
1595 loads).
1596
1597 Here are the general steps involved in alignment enhancements:
1598
1599 -- original loop, before alignment analysis:
1600 for (i=0; i<N; i++){
1601 x = q[i]; # DR_MISALIGNMENT(q) = unknown
1602 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1603 }
1604
1605 -- After vect_compute_data_refs_alignment:
1606 for (i=0; i<N; i++){
1607 x = q[i]; # DR_MISALIGNMENT(q) = 3
1608 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1609 }
1610
1611 -- Possibility 1: we do loop versioning:
1612 if (p is aligned) {
1613 for (i=0; i<N; i++){ # loop 1A
1614 x = q[i]; # DR_MISALIGNMENT(q) = 3
1615 p[i] = y; # DR_MISALIGNMENT(p) = 0
1616 }
1617 }
1618 else {
1619 for (i=0; i<N; i++){ # loop 1B
1620 x = q[i]; # DR_MISALIGNMENT(q) = 3
1621 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1622 }
1623 }
1624
1625 -- Possibility 2: we do loop peeling:
1626 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1627 x = q[i];
1628 p[i] = y;
1629 }
1630 for (i = 3; i < N; i++){ # loop 2A
1631 x = q[i]; # DR_MISALIGNMENT(q) = 0
1632 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1633 }
1634
1635 -- Possibility 3: combination of loop peeling and versioning:
1636 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1637 x = q[i];
1638 p[i] = y;
1639 }
1640 if (p is aligned) {
1641 for (i = 3; i<N; i++){ # loop 3A
1642 x = q[i]; # DR_MISALIGNMENT(q) = 0
1643 p[i] = y; # DR_MISALIGNMENT(p) = 0
1644 }
1645 }
1646 else {
1647 for (i = 3; i<N; i++){ # loop 3B
1648 x = q[i]; # DR_MISALIGNMENT(q) = 0
1649 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1650 }
1651 }
1652
1653 These loops are later passed to loop_transform to be vectorized. The
1654 vectorizer will use the alignment information to guide the transformation
1655 (whether to generate regular loads/stores, or with special handling for
1656 misalignment). */
1657
1658 opt_result
1659 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1660 {
1661 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1662 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1663 enum dr_alignment_support supportable_dr_alignment;
1664 dr_vec_info *first_store = NULL;
1665 dr_vec_info *dr0_info = NULL;
1666 struct data_reference *dr;
1667 unsigned int i, j;
1668 bool do_peeling = false;
1669 bool do_versioning = false;
1670 unsigned int npeel = 0;
1671 bool one_misalignment_known = false;
1672 bool one_misalignment_unknown = false;
1673 bool one_dr_unsupportable = false;
1674 dr_vec_info *unsupportable_dr_info = NULL;
1675 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1676 unsigned possible_npeel_number = 1;
1677 tree vectype;
1678 unsigned int mis, same_align_drs_max = 0;
1679 hash_table<peel_info_hasher> peeling_htab (1);
1680
1681 DUMP_VECT_SCOPE ("vect_enhance_data_refs_alignment");
1682
1683 /* Reset data so we can safely be called multiple times. */
1684 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1685 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
1686
1687 /* While cost model enhancements are expected in the future, the high level
1688 view of the code at this time is as follows:
1689
1690 A) If there is a misaligned access then see if peeling to align
1691 this access can make all data references satisfy
1692 vect_supportable_dr_alignment. If so, update data structures
1693 as needed and return true.
1694
1695 B) If peeling wasn't possible and there is a data reference with an
1696 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1697 then see if loop versioning checks can be used to make all data
1698 references satisfy vect_supportable_dr_alignment. If so, update
1699 data structures as needed and return true.
1700
1701 C) If neither peeling nor versioning were successful then return false if
1702 any data reference does not satisfy vect_supportable_dr_alignment.
1703
1704 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1705
1706 Note, Possibility 3 above (which is peeling and versioning together) is not
1707 being done at this time. */
1708
1709 /* (1) Peeling to force alignment. */
1710
1711 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1712 Considerations:
1713 + How many accesses will become aligned due to the peeling
1714 - How many accesses will become unaligned due to the peeling,
1715 and the cost of misaligned accesses.
1716 - The cost of peeling (the extra runtime checks, the increase
1717 in code size). */
1718
1719 FOR_EACH_VEC_ELT (datarefs, i, dr)
1720 {
1721 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1722 stmt_vec_info stmt_info = dr_info->stmt;
1723
1724 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1725 continue;
1726
1727 /* For interleaving, only the alignment of the first access
1728 matters. */
1729 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1730 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1731 continue;
1732
1733 /* For scatter-gather or invariant accesses there is nothing
1734 to enhance. */
1735 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
1736 || integer_zerop (DR_STEP (dr)))
1737 continue;
1738
1739 /* Strided accesses perform only component accesses, alignment is
1740 irrelevant for them. */
1741 if (STMT_VINFO_STRIDED_P (stmt_info)
1742 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1743 continue;
1744
1745 supportable_dr_alignment = vect_supportable_dr_alignment (dr_info, true);
1746 do_peeling = vector_alignment_reachable_p (dr_info);
1747 if (do_peeling)
1748 {
1749 if (known_alignment_for_access_p (dr_info))
1750 {
1751 unsigned int npeel_tmp = 0;
1752 bool negative = tree_int_cst_compare (DR_STEP (dr),
1753 size_zero_node) < 0;
1754
1755 vectype = STMT_VINFO_VECTYPE (stmt_info);
1756 /* If known_alignment_for_access_p then we have set
1757 DR_MISALIGNMENT, which is only done if we know the misalignment
1758 at compile time, so it is safe to assume the target alignment is
1759 constant. */
1760 unsigned int target_align =
1761 DR_TARGET_ALIGNMENT (dr_info).to_constant ();
1762 unsigned int dr_size = vect_get_scalar_dr_size (dr_info);
1763 mis = (negative
1764 ? DR_MISALIGNMENT (dr_info)
1765 : -DR_MISALIGNMENT (dr_info));
1766 if (DR_MISALIGNMENT (dr_info) != 0)
1767 npeel_tmp = (mis & (target_align - 1)) / dr_size;
1768
1769 /* For multiple types, it is possible that the bigger type access
1770 will have more than one peeling option. E.g., a loop with two
1771 types: one of size (vector size / 4), and the other one of
1772 size (vector size / 8). The vectorization factor will be 8. If
1773 both accesses are misaligned by 3, the first one needs one scalar
1774 iteration to be aligned, and the second one needs 5. But the
1775 first one will also be aligned by peeling 5 scalar iterations,
1776 and in that case both accesses will be aligned.
1777 Hence, besides the immediate peeling amount, we also want to try
1778 adding a full vector's worth of elements, as long as we don't
1779 exceed the vectorization factor.
1780 We do this automatically for the cost model, since we calculate
1781 the cost for every peeling option. */
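/* Worked example with illustrative numbers (not tied to any particular
   target): with 4-byte elements, TARGET_ALIGN == 16 and
   DR_MISALIGNMENT == 12, MIS is -12 and
   npeel_tmp == ((-12) & 15) / 4 == 1, so one peeled scalar iteration
   aligns this access; the loop below then also records the alternative
   peeling amounts 1 + 4, 1 + 8, ... for the cost model to compare.  */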
1782 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1783 {
1784 poly_uint64 nscalars = (STMT_SLP_TYPE (stmt_info)
1785 ? vf * DR_GROUP_SIZE (stmt_info) : vf);
1786 possible_npeel_number
1787 = vect_get_num_vectors (nscalars, vectype);
1788
1789 /* NPEEL_TMP is 0 when there is no misalignment, but also
1790 allow peeling NELEMENTS. */
1791 if (DR_MISALIGNMENT (dr_info) == 0)
1792 possible_npeel_number++;
1793 }
1794
1795 /* Save info about DR in the hash table. Also include peeling
1796 amounts according to the explanation above. */
1797 for (j = 0; j < possible_npeel_number; j++)
1798 {
1799 vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
1800 dr_info, npeel_tmp);
1801 npeel_tmp += target_align / dr_size;
1802 }
1803
1804 one_misalignment_known = true;
1805 }
1806 else
1807 {
1808 /* If we don't know any misalignment values, we prefer
1809 peeling for the data-ref that has the maximum number of data-refs
1810 with the same alignment, unless the target prefers to align
1811 stores over loads. */
1812 unsigned same_align_drs
1813 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1814 if (!dr0_info
1815 || same_align_drs_max < same_align_drs)
1816 {
1817 same_align_drs_max = same_align_drs;
1818 dr0_info = dr_info;
1819 }
1820 /* For data-refs with the same number of related
1821 accesses prefer the one where the misalign
1822 computation will be invariant in the outermost loop. */
1823 else if (same_align_drs_max == same_align_drs)
1824 {
1825 class loop *ivloop0, *ivloop;
1826 ivloop0 = outermost_invariant_loop_for_expr
1827 (loop, DR_BASE_ADDRESS (dr0_info->dr));
1828 ivloop = outermost_invariant_loop_for_expr
1829 (loop, DR_BASE_ADDRESS (dr));
1830 if ((ivloop && !ivloop0)
1831 || (ivloop && ivloop0
1832 && flow_loop_nested_p (ivloop, ivloop0)))
1833 dr0_info = dr_info;
1834 }
1835
1836 one_misalignment_unknown = true;
1837
1838 /* Check for data refs with unsupportable alignment that
1839 can be peeled. */
1840 if (!supportable_dr_alignment)
1841 {
1842 one_dr_unsupportable = true;
1843 unsupportable_dr_info = dr_info;
1844 }
1845
1846 if (!first_store && DR_IS_WRITE (dr))
1847 first_store = dr_info;
1848 }
1849 }
1850 else
1851 {
1852 if (!aligned_access_p (dr_info))
1853 {
1854 if (dump_enabled_p ())
1855 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1856 "vector alignment may not be reachable\n");
1857 break;
1858 }
1859 }
1860 }
1861
1862 /* Check if we can possibly peel the loop. */
1863 if (!vect_can_advance_ivs_p (loop_vinfo)
1864 || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))
1865 || loop->inner)
1866 do_peeling = false;
1867
1868 struct _vect_peel_extended_info peel_for_known_alignment;
1869 struct _vect_peel_extended_info peel_for_unknown_alignment;
1870 struct _vect_peel_extended_info best_peel;
1871
1872 peel_for_unknown_alignment.inside_cost = INT_MAX;
1873 peel_for_unknown_alignment.outside_cost = INT_MAX;
1874 peel_for_unknown_alignment.peel_info.count = 0;
1875
1876 if (do_peeling
1877 && one_misalignment_unknown)
1878 {
1879 /* Check if the target requires us to prefer peeling for stores over
1880 loads, i.e., if misaligned stores are more expensive than misaligned
1881 loads (taking drs with the same alignment into account). */
1882 unsigned int load_inside_cost = 0;
1883 unsigned int load_outside_cost = 0;
1884 unsigned int store_inside_cost = 0;
1885 unsigned int store_outside_cost = 0;
1886 unsigned int estimated_npeels = vect_vf_for_cost (loop_vinfo) / 2;
1887
1888 stmt_vector_for_cost dummy;
1889 dummy.create (2);
1890 vect_get_peeling_costs_all_drs (loop_vinfo, dr0_info,
1891 &load_inside_cost,
1892 &load_outside_cost,
1893 &dummy, &dummy, estimated_npeels, true);
1894 dummy.release ();
1895
1896 if (first_store)
1897 {
1898 dummy.create (2);
1899 vect_get_peeling_costs_all_drs (loop_vinfo, first_store,
1900 &store_inside_cost,
1901 &store_outside_cost,
1902 &dummy, &dummy,
1903 estimated_npeels, true);
1904 dummy.release ();
1905 }
1906 else
1907 {
1908 store_inside_cost = INT_MAX;
1909 store_outside_cost = INT_MAX;
1910 }
1911
1912 if (load_inside_cost > store_inside_cost
1913 || (load_inside_cost == store_inside_cost
1914 && load_outside_cost > store_outside_cost))
1915 {
1916 dr0_info = first_store;
1917 peel_for_unknown_alignment.inside_cost = store_inside_cost;
1918 peel_for_unknown_alignment.outside_cost = store_outside_cost;
1919 }
1920 else
1921 {
1922 peel_for_unknown_alignment.inside_cost = load_inside_cost;
1923 peel_for_unknown_alignment.outside_cost = load_outside_cost;
1924 }
1925
1926 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1927 prologue_cost_vec.create (2);
1928 epilogue_cost_vec.create (2);
1929
1930 int dummy2;
1931 peel_for_unknown_alignment.outside_cost += vect_get_known_peeling_cost
1932 (loop_vinfo, estimated_npeels, &dummy2,
1933 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1934 &prologue_cost_vec, &epilogue_cost_vec);
1935
1936 prologue_cost_vec.release ();
1937 epilogue_cost_vec.release ();
1938
1939 peel_for_unknown_alignment.peel_info.count = 1
1940 + STMT_VINFO_SAME_ALIGN_REFS (dr0_info->stmt).length ();
1941 }
1942
1943 peel_for_unknown_alignment.peel_info.npeel = 0;
1944 peel_for_unknown_alignment.peel_info.dr_info = dr0_info;
1945
1946 best_peel = peel_for_unknown_alignment;
1947
1948 peel_for_known_alignment.inside_cost = INT_MAX;
1949 peel_for_known_alignment.outside_cost = INT_MAX;
1950 peel_for_known_alignment.peel_info.count = 0;
1951 peel_for_known_alignment.peel_info.dr_info = NULL;
1952
1953 if (do_peeling && one_misalignment_known)
1954 {
1955 /* Peeling is possible, but no data access strictly requires alignment
1956 in order to be supported. So we try to choose the best possible
1957 peeling from the hash table. */
1958 peel_for_known_alignment = vect_peeling_hash_choose_best_peeling
1959 (&peeling_htab, loop_vinfo);
1960 }
1961
1962 /* Compare costs of peeling for known and unknown alignment. */
1963 if (peel_for_known_alignment.peel_info.dr_info != NULL
1964 && peel_for_unknown_alignment.inside_cost
1965 >= peel_for_known_alignment.inside_cost)
1966 {
1967 best_peel = peel_for_known_alignment;
1968
1969 /* If the best peeling for known alignment has NPEEL == 0, perform no
1970 peeling at all unless there is an unsupportable dr that we can
1971 align. */
1972 if (best_peel.peel_info.npeel == 0 && !one_dr_unsupportable)
1973 do_peeling = false;
1974 }
1975
1976 /* If there is an unsupportable data ref, prefer this over all choices so far
1977 since we'd have to discard a chosen peeling unless it happened to
1978 align the unsupportable data ref as well. */
1979 if (one_dr_unsupportable)
1980 dr0_info = unsupportable_dr_info;
1981 else if (do_peeling)
1982 {
1983 /* Calculate the penalty for no peeling, i.e. leaving everything as-is.
1984 TODO: Use nopeel_outside_cost or get rid of it? */
1985 unsigned nopeel_inside_cost = 0;
1986 unsigned nopeel_outside_cost = 0;
1987
1988 stmt_vector_for_cost dummy;
1989 dummy.create (2);
1990 vect_get_peeling_costs_all_drs (loop_vinfo, NULL, &nopeel_inside_cost,
1991 &nopeel_outside_cost, &dummy, &dummy,
1992 0, false);
1993 dummy.release ();
1994
1995 /* Add epilogue costs. As we do not peel for alignment here, no prologue
1996 costs will be recorded. */
1997 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1998 prologue_cost_vec.create (2);
1999 epilogue_cost_vec.create (2);
2000
2001 int dummy2;
2002 nopeel_outside_cost += vect_get_known_peeling_cost
2003 (loop_vinfo, 0, &dummy2,
2004 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
2005 &prologue_cost_vec, &epilogue_cost_vec);
2006
2007 prologue_cost_vec.release ();
2008 epilogue_cost_vec.release ();
2009
2010 npeel = best_peel.peel_info.npeel;
2011 dr0_info = best_peel.peel_info.dr_info;
2012
2013 /* If leaving the loop unpeeled is no more expensive than the best
2014 peeling found so far, don't perform any peeling. */
2015 if (nopeel_inside_cost <= best_peel.inside_cost)
2016 do_peeling = false;
2017 }
2018
2019 if (do_peeling)
2020 {
2021 stmt_vec_info stmt_info = dr0_info->stmt;
2022 vectype = STMT_VINFO_VECTYPE (stmt_info);
2023
2024 if (known_alignment_for_access_p (dr0_info))
2025 {
2026 bool negative = tree_int_cst_compare (DR_STEP (dr0_info->dr),
2027 size_zero_node) < 0;
2028 if (!npeel)
2029 {
2030 /* Since it's known at compile time, compute the number of
2031 iterations in the peeled loop (the peeling factor) for use in
2032 updating DR_MISALIGNMENT values. The peeling factor is the
2033 vectorization factor minus the misalignment as an element
2034 count. */
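/* Illustrative numbers only: with 4-byte elements, a target alignment of
   16 and DR_MISALIGNMENT == 4, MIS is -4 and NPEEL becomes
   ((-4) & 15) / 4 == 3, i.e. three scalar iterations move the access to
   an aligned address.  */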
2035 mis = (negative
2036 ? DR_MISALIGNMENT (dr0_info)
2037 : -DR_MISALIGNMENT (dr0_info));
2038 /* If known_alignment_for_access_p then we have set
2039 DR_MISALIGNMENT, which is only done if we know the misalignment
2040 at compile time, so it is safe to assume the target alignment is
2041 constant. */
2042 unsigned int target_align =
2043 DR_TARGET_ALIGNMENT (dr0_info).to_constant ();
2044 npeel = ((mis & (target_align - 1))
2045 / vect_get_scalar_dr_size (dr0_info));
2046 }
2047
2048 /* For interleaved data access every iteration accesses all the
2049 members of the group, therefore we divide the number of iterations
2050 by the group size. */
2051 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2052 npeel /= DR_GROUP_SIZE (stmt_info);
2053
2054 if (dump_enabled_p ())
2055 dump_printf_loc (MSG_NOTE, vect_location,
2056 "Try peeling by %d\n", npeel);
2057 }
2058
2059 /* Ensure that all datarefs can be vectorized after the peel. */
2060 if (!vect_peeling_supportable (loop_vinfo, dr0_info, npeel))
2061 do_peeling = false;
2062
2063 /* Check if all datarefs are supportable and log. */
2064 if (do_peeling && known_alignment_for_access_p (dr0_info) && npeel == 0)
2065 {
2066 opt_result stat = vect_verify_datarefs_alignment (loop_vinfo);
2067 if (!stat)
2068 do_peeling = false;
2069 else
2070 return stat;
2071 }
2072
2073 /* Cost model #1 - honor --param vect-max-peeling-for-alignment. */
2074 if (do_peeling)
2075 {
2076 unsigned max_allowed_peel
2077 = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT);
2078 if (flag_vect_cost_model == VECT_COST_MODEL_CHEAP)
2079 max_allowed_peel = 0;
2080 if (max_allowed_peel != (unsigned)-1)
2081 {
2082 unsigned max_peel = npeel;
2083 if (max_peel == 0)
2084 {
2085 poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr0_info);
2086 unsigned HOST_WIDE_INT target_align_c;
2087 if (target_align.is_constant (&target_align_c))
2088 max_peel =
2089 target_align_c / vect_get_scalar_dr_size (dr0_info) - 1;
2090 else
2091 {
2092 do_peeling = false;
2093 if (dump_enabled_p ())
2094 dump_printf_loc (MSG_NOTE, vect_location,
2095 "Disable peeling, max peels set and vector"
2096 " alignment unknown\n");
2097 }
2098 }
2099 if (max_peel > max_allowed_peel)
2100 {
2101 do_peeling = false;
2102 if (dump_enabled_p ())
2103 dump_printf_loc (MSG_NOTE, vect_location,
2104 "Disable peeling, max peels reached: %d\n", max_peel);
2105 }
2106 }
2107 }
2108
2109 /* Cost model #2 - if peeling may result in a remaining loop not
2110 iterating enough to be vectorized then do not peel. Since this
2111 is a cost heuristic rather than a correctness decision, use the
2112 most likely runtime value for variable vectorization factors. */
2113 if (do_peeling
2114 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2115 {
2116 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2117 unsigned int max_peel = npeel == 0 ? assumed_vf - 1 : npeel;
2118 if ((unsigned HOST_WIDE_INT) LOOP_VINFO_INT_NITERS (loop_vinfo)
2119 < assumed_vf + max_peel)
2120 do_peeling = false;
2121 }
2122
2123 if (do_peeling)
2124 {
2125 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
2126 If the misalignment of DR_i is identical to that of dr0 then set
2127 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
2128 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
2129 by the peeling factor times the element size of DR_i (MOD the
2130 vectorization factor times the size). Otherwise, the
2131 misalignment of DR_i must be set to unknown. */
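/* For instance (illustrative values): peeling NPEEL == 1 iteration of
   4-byte accesses turns a DR with known misalignment 8 and target
   alignment 16 into one with misalignment (8 + 1 * 4) & 15 == 12, while
   DRs whose misalignment is unknown stay unknown.  */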
2132 FOR_EACH_VEC_ELT (datarefs, i, dr)
2133 if (dr != dr0_info->dr)
2134 {
2135 /* Strided accesses perform only component accesses, alignment
2136 is irrelevant for them. */
2137 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2138 stmt_info = dr_info->stmt;
2139 if (STMT_VINFO_STRIDED_P (stmt_info)
2140 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2141 continue;
2142
2143 vect_update_misalignment_for_peel (dr_info, dr0_info, npeel);
2144 }
2145
2146 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0_info;
2147 if (npeel)
2148 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
2149 else
2150 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2151 = DR_MISALIGNMENT (dr0_info);
2152 SET_DR_MISALIGNMENT (dr0_info, 0);
2153 if (dump_enabled_p ())
2154 {
2155 dump_printf_loc (MSG_NOTE, vect_location,
2156 "Alignment of access forced using peeling.\n");
2157 dump_printf_loc (MSG_NOTE, vect_location,
2158 "Peeling for alignment will be applied.\n");
2159 }
2160
2161 /* The inside-loop cost will be accounted for in vectorizable_load
2162 and vectorizable_store correctly with adjusted alignments.
2163 Drop the body_cst_vec on the floor here. */
2164 opt_result stat = vect_verify_datarefs_alignment (loop_vinfo);
2165 gcc_assert (stat);
2166 return stat;
2167 }
2168 }
2169
2170 /* (2) Versioning to force alignment. */
2171
2172 /* Try versioning if:
2173 1) the loop is optimized for speed and the cost model is not cheap, and
2174 2) there is at least one unsupported misaligned data ref with an unknown
2175 misalignment, and
2176 3) all misaligned data refs with a known misalignment are supported, and
2177 4) the number of runtime alignment checks is within reason. */
2178
2179 do_versioning
2180 = (optimize_loop_nest_for_speed_p (loop)
2181 && !loop->inner /* FORNOW */
2182 && flag_vect_cost_model != VECT_COST_MODEL_CHEAP);
2183
2184 if (do_versioning)
2185 {
2186 FOR_EACH_VEC_ELT (datarefs, i, dr)
2187 {
2188 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2189 stmt_vec_info stmt_info = dr_info->stmt;
2190
2191 /* For interleaving, only the alignment of the first access
2192 matters. */
2193 if (aligned_access_p (dr_info)
2194 || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
2195 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info))
2196 continue;
2197
2198 if (STMT_VINFO_STRIDED_P (stmt_info))
2199 {
2200 /* Strided loads perform only component accesses, alignment is
2201 irrelevant for them. */
2202 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
2203 continue;
2204 do_versioning = false;
2205 break;
2206 }
2207
2208 supportable_dr_alignment
2209 = vect_supportable_dr_alignment (dr_info, false);
2210
2211 if (!supportable_dr_alignment)
2212 {
2213 int mask;
2214 tree vectype;
2215
2216 if (known_alignment_for_access_p (dr_info)
2217 || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
2218 >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
2219 {
2220 do_versioning = false;
2221 break;
2222 }
2223
2224 vectype = STMT_VINFO_VECTYPE (stmt_info);
2225 gcc_assert (vectype);
2226
2227 /* At present we don't support versioning for alignment
2228 with variable VF, since there's no guarantee that the
2229 VF is a power of two. We could relax this if we added
2230 a way of enforcing a power-of-two size. */
2231 unsigned HOST_WIDE_INT size;
2232 if (!GET_MODE_SIZE (TYPE_MODE (vectype)).is_constant (&size))
2233 {
2234 do_versioning = false;
2235 break;
2236 }
2237
2238 /* Forcing alignment in the first iteration is no good if
2239 we don't keep it across iterations. For now, just disable
2240 versioning in this case.
2241 ?? We could actually unroll the loop to achieve the required
2242 overall step alignment, and forcing the alignment could be
2243 done by doing some iterations of the non-vectorized loop. */
2244 if (!multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
2245 * DR_STEP_ALIGNMENT (dr),
2246 DR_TARGET_ALIGNMENT (dr_info)))
2247 {
2248 do_versioning = false;
2249 break;
2250 }
2251
2252 /* The rightmost bits of an aligned address must be zeros.
2253 Construct the mask needed for this test. For example,
2254 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
2255 mask must be 15 = 0xf. */
2256 mask = size - 1;
2257
2258 /* FORNOW: use the same mask to test all potentially unaligned
2259 references in the loop. The vectorizer currently supports
2260 a single vector size, see the reference to
2261 GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
2262 vectorization factor is computed. */
2263 gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
2264 || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
2265 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
2266 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (stmt_info);
2267 }
2268 }
2269
2270 /* Versioning requires at least one misaligned data reference. */
2271 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2272 do_versioning = false;
2273 else if (!do_versioning)
2274 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
2275 }
2276
2277 if (do_versioning)
2278 {
2279 vec<stmt_vec_info> may_misalign_stmts
2280 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
2281 stmt_vec_info stmt_info;
2282
2283 /* It can now be assumed that the data references in the statements
2284 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
2285 of the loop being vectorized. */
2286 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt_info)
2287 {
2288 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
2289 SET_DR_MISALIGNMENT (dr_info, 0);
2290 if (dump_enabled_p ())
2291 dump_printf_loc (MSG_NOTE, vect_location,
2292 "Alignment of access forced using versioning.\n");
2293 }
2294
2295 if (dump_enabled_p ())
2296 dump_printf_loc (MSG_NOTE, vect_location,
2297 "Versioning for alignment will be applied.\n");
2298
2299 /* Peeling and versioning can't be done together at this time. */
2300 gcc_assert (! (do_peeling && do_versioning));
2301
2302 opt_result stat = vect_verify_datarefs_alignment (loop_vinfo);
2303 gcc_assert (stat);
2304 return stat;
2305 }
2306
2307 /* This point is reached if neither peeling nor versioning is being done. */
2308 gcc_assert (! (do_peeling || do_versioning));
2309
2310 opt_result stat = vect_verify_datarefs_alignment (loop_vinfo);
2311 return stat;
2312 }
2313
2314
2315 /* Function vect_find_same_alignment_drs.
2316
2317 Update group and alignment relations in VINFO according to the chosen
2318 vectorization factor. */
2319
2320 static void
2321 vect_find_same_alignment_drs (vec_info *vinfo, data_dependence_relation *ddr)
2322 {
2323 struct data_reference *dra = DDR_A (ddr);
2324 struct data_reference *drb = DDR_B (ddr);
2325 dr_vec_info *dr_info_a = vinfo->lookup_dr (dra);
2326 dr_vec_info *dr_info_b = vinfo->lookup_dr (drb);
2327 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
2328 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
2329
2330 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
2331 return;
2332
2333 if (dra == drb)
2334 return;
2335
2336 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
2337 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
2338 return;
2339
2340 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0)
2341 || !operand_equal_p (DR_OFFSET (dra), DR_OFFSET (drb), 0)
2342 || !operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2343 return;
2344
2345 /* Two references with distance zero have the same alignment. */
2346 poly_offset_int diff = (wi::to_poly_offset (DR_INIT (dra))
2347 - wi::to_poly_offset (DR_INIT (drb)));
2348 if (maybe_ne (diff, 0))
2349 {
2350 /* Get the wider of the two alignments. */
2351 poly_uint64 align_a =
2352 exact_div (vect_calculate_target_alignment (dr_info_a),
2353 BITS_PER_UNIT);
2354 poly_uint64 align_b =
2355 exact_div (vect_calculate_target_alignment (dr_info_b),
2356 BITS_PER_UNIT);
2357 unsigned HOST_WIDE_INT align_a_c, align_b_c;
2358 if (!align_a.is_constant (&align_a_c)
2359 || !align_b.is_constant (&align_b_c))
2360 return;
2361
2362 unsigned HOST_WIDE_INT max_align = MAX (align_a_c, align_b_c);
2363
2364 /* Require the gap to be a multiple of the larger vector alignment. */
2365 if (!multiple_p (diff, max_align))
2366 return;
2367 }
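/* E.g. (illustrative): if the gap between the two DR_INITs is 32 bytes and
   the computed target alignments are 16 and 32 bytes, MAX_ALIGN is 32 and
   the gap is a multiple of it, so the references are recorded below as
   having the same alignment.  */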
2368
2369 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
2370 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
2371 if (dump_enabled_p ())
2372 dump_printf_loc (MSG_NOTE, vect_location,
2373 "accesses have the same alignment: %T and %T\n",
2374 DR_REF (dra), DR_REF (drb));
2375 }
2376
2377
2378 /* Function vect_analyze_data_refs_alignment
2379
2380 Analyze the alignment of the data-references in the loop.
2381 Return a failing opt_result if a data reference is found that cannot be vectorized. */
2382
2383 opt_result
2384 vect_analyze_data_refs_alignment (loop_vec_info vinfo)
2385 {
2386 DUMP_VECT_SCOPE ("vect_analyze_data_refs_alignment");
2387
2388 /* Mark groups of data references with same alignment using
2389 data dependence information. */
2390 vec<ddr_p> ddrs = vinfo->shared->ddrs;
2391 struct data_dependence_relation *ddr;
2392 unsigned int i;
2393
2394 FOR_EACH_VEC_ELT (ddrs, i, ddr)
2395 vect_find_same_alignment_drs (vinfo, ddr);
2396
2397 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
2398 struct data_reference *dr;
2399
2400 vect_record_base_alignments (vinfo);
2401 FOR_EACH_VEC_ELT (datarefs, i, dr)
2402 {
2403 dr_vec_info *dr_info = vinfo->lookup_dr (dr);
2404 if (STMT_VINFO_VECTORIZABLE (dr_info->stmt))
2405 vect_compute_data_ref_alignment (dr_info);
2406 }
2407
2408 return opt_result::success ();
2409 }
2410
2411
2412 /* Analyze alignment of DRs of stmts in NODE. */
2413
2414 static bool
2415 vect_slp_analyze_and_verify_node_alignment (slp_tree node)
2416 {
2417 /* We vectorize from the first scalar stmt in the node unless
2418 the node is permuted, in which case we start from the first
2419 element in the group. */
2420 stmt_vec_info first_stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2421 dr_vec_info *first_dr_info = STMT_VINFO_DR_INFO (first_stmt_info);
2422 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2423 first_stmt_info = DR_GROUP_FIRST_ELEMENT (first_stmt_info);
2424
2425 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (first_stmt_info);
2426 vect_compute_data_ref_alignment (dr_info);
2427 /* For creating the data-ref pointer we need alignment of the
2428 first element anyway. */
2429 if (dr_info != first_dr_info)
2430 vect_compute_data_ref_alignment (first_dr_info);
2431 if (! verify_data_ref_alignment (dr_info))
2432 {
2433 if (dump_enabled_p ())
2434 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2435 "not vectorized: bad data alignment in basic "
2436 "block.\n");
2437 return false;
2438 }
2439
2440 return true;
2441 }
2442
2443 /* Function vect_slp_analyze_and_verify_instance_alignment
2444
2445 Analyze the alignment of the data-references in the SLP instance.
2446 Return FALSE if a data reference is found that cannot be vectorized. */
2447
2448 bool
2449 vect_slp_analyze_and_verify_instance_alignment (slp_instance instance)
2450 {
2451 DUMP_VECT_SCOPE ("vect_slp_analyze_and_verify_instance_alignment");
2452
2453 slp_tree node;
2454 unsigned i;
2455 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
2456 if (! vect_slp_analyze_and_verify_node_alignment (node))
2457 return false;
2458
2459 node = SLP_INSTANCE_TREE (instance);
2460 if (STMT_VINFO_DATA_REF (SLP_TREE_SCALAR_STMTS (node)[0])
2461 && ! vect_slp_analyze_and_verify_node_alignment
2462 (SLP_INSTANCE_TREE (instance)))
2463 return false;
2464
2465 return true;
2466 }
2467
2468
2469 /* Analyze groups of accesses: check that DR_INFO belongs to a group of
2470 accesses of legal size, step, etc. Detect gaps, single element
2471 interleaving, and other special cases. Set grouped access info.
2472 Collect groups of strided stores for further use in SLP analysis.
2473 Worker for vect_analyze_group_access. */
2474
2475 static bool
2476 vect_analyze_group_access_1 (dr_vec_info *dr_info)
2477 {
2478 data_reference *dr = dr_info->dr;
2479 tree step = DR_STEP (dr);
2480 tree scalar_type = TREE_TYPE (DR_REF (dr));
2481 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2482 stmt_vec_info stmt_info = dr_info->stmt;
2483 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2484 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2485 HOST_WIDE_INT dr_step = -1;
2486 HOST_WIDE_INT groupsize, last_accessed_element = 1;
2487 bool slp_impossible = false;
2488
2489 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2490 size of the interleaving group (including gaps). */
2491 if (tree_fits_shwi_p (step))
2492 {
2493 dr_step = tree_to_shwi (step);
2494 /* Check that STEP is a multiple of type size. Otherwise there is
2495 a non-element-sized gap at the end of the group which we
2496 cannot represent in DR_GROUP_GAP or DR_GROUP_SIZE.
2497 ??? As we can handle non-constant step fine here we should
2498 simply remove uses of DR_GROUP_GAP between the last and first
2499 element and instead rely on DR_STEP. DR_GROUP_SIZE then would
2500 simply not include that gap. */
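/* E.g. (illustrative): 4-byte elements accessed with DR_STEP == 32 give a
   groupsize of 8, no matter how many of those 8 slots the group actually
   touches; the untouched slots are gaps.  */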
2501 if ((dr_step % type_size) != 0)
2502 {
2503 if (dump_enabled_p ())
2504 dump_printf_loc (MSG_NOTE, vect_location,
2505 "Step %T is not a multiple of the element size"
2506 " for %T\n",
2507 step, DR_REF (dr));
2508 return false;
2509 }
2510 groupsize = absu_hwi (dr_step) / type_size;
2511 }
2512 else
2513 groupsize = 0;
2514
2515 /* A non-consecutive access is possible only if it is part of an interleaving group. */
2516 if (!DR_GROUP_FIRST_ELEMENT (stmt_info))
2517 {
2518 /* Check whether this DR is part of an interleaving group of which
2519 only this single element is accessed in the loop. */
2520
2521 /* Gaps are supported only for loads. STEP must be a multiple of the type
2522 size. */
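/* E.g. (illustrative): a load of a[4*i] with 4-byte elements has
   DR_STEP == 16, so it becomes a single-element group of size 4 with
   DR_GROUP_GAP == 3.  */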
2523 if (DR_IS_READ (dr)
2524 && (dr_step % type_size) == 0
2525 && groupsize > 0)
2526 {
2527 DR_GROUP_FIRST_ELEMENT (stmt_info) = stmt_info;
2528 DR_GROUP_SIZE (stmt_info) = groupsize;
2529 DR_GROUP_GAP (stmt_info) = groupsize - 1;
2530 if (dump_enabled_p ())
2531 dump_printf_loc (MSG_NOTE, vect_location,
2532 "Detected single element interleaving %T"
2533 " step %T\n",
2534 DR_REF (dr), step);
2535
2536 return true;
2537 }
2538
2539 if (dump_enabled_p ())
2540 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2541 "not consecutive access %G", stmt_info->stmt);
2542
2543 if (bb_vinfo)
2544 {
2545 /* Mark the statement as unvectorizable. */
2546 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
2547 return true;
2548 }
2549
2550 if (dump_enabled_p ())
2551 dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
2552 STMT_VINFO_STRIDED_P (stmt_info) = true;
2553 return true;
2554 }
2555
2556 if (DR_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info)
2557 {
2558 /* First stmt in the interleaving chain. Check the chain. */
2559 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2560 struct data_reference *data_ref = dr;
2561 unsigned int count = 1;
2562 tree prev_init = DR_INIT (data_ref);
2563 HOST_WIDE_INT diff, gaps = 0;
2564
2565 /* By construction, all group members have INTEGER_CST DR_INITs. */
2566 while (next)
2567 {
2568 /* We never have the same DR multiple times. */
2569 gcc_assert (tree_int_cst_compare (DR_INIT (data_ref),
2570 DR_INIT (STMT_VINFO_DATA_REF (next))) != 0);
2571
2572 data_ref = STMT_VINFO_DATA_REF (next);
2573
2574 /* All group members have the same STEP by construction. */
2575 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2576
2577 /* Check that the distance between two accesses is equal to the type
2578 size. Otherwise, we have gaps. */
2579 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2580 - TREE_INT_CST_LOW (prev_init)) / type_size;
2581 if (diff != 1)
2582 {
2583 /* FORNOW: SLP of accesses with gaps is not supported. */
2584 slp_impossible = true;
2585 if (DR_IS_WRITE (data_ref))
2586 {
2587 if (dump_enabled_p ())
2588 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2589 "interleaved store with gaps\n");
2590 return false;
2591 }
2592
2593 gaps += diff - 1;
2594 }
2595
2596 last_accessed_element += diff;
2597
2598 /* Store the gap from the previous member of the group. If there is no
2599 gap in the access, DR_GROUP_GAP is always 1. */
2600 DR_GROUP_GAP (next) = diff;
2601
2602 prev_init = DR_INIT (data_ref);
2603 next = DR_GROUP_NEXT_ELEMENT (next);
2604 /* Count the number of data-refs in the chain. */
2605 count++;
2606 }
2607
2608 if (groupsize == 0)
2609 groupsize = count + gaps;
2610
2611 /* This could be UINT_MAX but as we are generating code in a very
2612 inefficient way we have to cap earlier. See PR78699 for example. */
2613 if (groupsize > 4096)
2614 {
2615 if (dump_enabled_p ())
2616 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2617 "group is too large\n");
2618 return false;
2619 }
2620
2621 /* Check that the size of the interleaving is equal to count for stores,
2622 i.e., that there are no gaps. */
2623 if (groupsize != count
2624 && !DR_IS_READ (dr))
2625 {
2626 groupsize = count;
2627 STMT_VINFO_STRIDED_P (stmt_info) = true;
2628 }
2629
2630 /* If there is a gap after the last load in the group it is the
2631 difference between the groupsize and the last accessed
2632 element.
2633 When there is no gap, this difference should be 0. */
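/* E.g. (illustrative): for loads of a[3*i] and a[3*i+1] the groupsize is 3
   and the last accessed element is 2, so the gap recorded here is 1.  */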
2634 DR_GROUP_GAP (stmt_info) = groupsize - last_accessed_element;
2635
2636 DR_GROUP_SIZE (stmt_info) = groupsize;
2637 if (dump_enabled_p ())
2638 {
2639 dump_printf_loc (MSG_NOTE, vect_location,
2640 "Detected interleaving ");
2641 if (DR_IS_READ (dr))
2642 dump_printf (MSG_NOTE, "load ");
2643 else if (STMT_VINFO_STRIDED_P (stmt_info))
2644 dump_printf (MSG_NOTE, "strided store ");
2645 else
2646 dump_printf (MSG_NOTE, "store ");
2647 dump_printf (MSG_NOTE, "of size %u\n",
2648 (unsigned)groupsize);
2649 dump_printf_loc (MSG_NOTE, vect_location, "\t%G", stmt_info->stmt);
2650 next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2651 while (next)
2652 {
2653 if (DR_GROUP_GAP (next) != 1)
2654 dump_printf_loc (MSG_NOTE, vect_location,
2655 "\t<gap of %d elements>\n",
2656 DR_GROUP_GAP (next) - 1);
2657 dump_printf_loc (MSG_NOTE, vect_location, "\t%G", next->stmt);
2658 next = DR_GROUP_NEXT_ELEMENT (next);
2659 }
2660 if (DR_GROUP_GAP (stmt_info) != 0)
2661 dump_printf_loc (MSG_NOTE, vect_location,
2662 "\t<gap of %d elements>\n",
2663 DR_GROUP_GAP (stmt_info));
2664 }
2665
2666 /* SLP: create an SLP data structure for every interleaving group of
2667 stores for further analysis in vect_analyze_slp. */
2668 if (DR_IS_WRITE (dr) && !slp_impossible)
2669 {
2670 if (loop_vinfo)
2671 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt_info);
2672 if (bb_vinfo)
2673 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt_info);
2674 }
2675 }
2676
2677 return true;
2678 }
2679
2680 /* Analyze groups of accesses: check that DR_INFO belongs to a group of
2681 accesses of legal size, step, etc. Detect gaps, single element
2682 interleaving, and other special cases. Set grouped access info.
2683 Collect groups of strided stores for further use in SLP analysis. */
2684
2685 static bool
2686 vect_analyze_group_access (dr_vec_info *dr_info)
2687 {
2688 if (!vect_analyze_group_access_1 (dr_info))
2689 {
2690 /* Dissolve the group if present. */
2691 stmt_vec_info stmt_info = DR_GROUP_FIRST_ELEMENT (dr_info->stmt);
2692 while (stmt_info)
2693 {
2694 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2695 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2696 DR_GROUP_NEXT_ELEMENT (stmt_info) = NULL;
2697 stmt_info = next;
2698 }
2699 return false;
2700 }
2701 return true;
2702 }
2703
2704 /* Analyze the access pattern of the data-reference DR_INFO.
2705 In case of non-consecutive accesses call vect_analyze_group_access() to
2706 analyze groups of accesses. */
2707
2708 static bool
2709 vect_analyze_data_ref_access (dr_vec_info *dr_info)
2710 {
2711 data_reference *dr = dr_info->dr;
2712 tree step = DR_STEP (dr);
2713 tree scalar_type = TREE_TYPE (DR_REF (dr));
2714 stmt_vec_info stmt_info = dr_info->stmt;
2715 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2716 class loop *loop = NULL;
2717
2718 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
2719 return true;
2720
2721 if (loop_vinfo)
2722 loop = LOOP_VINFO_LOOP (loop_vinfo);
2723
2724 if (loop_vinfo && !step)
2725 {
2726 if (dump_enabled_p ())
2727 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2728 "bad data-ref access in loop\n");
2729 return false;
2730 }
2731
2732 /* Allow loads with zero step in inner-loop vectorization. */
2733 if (loop_vinfo && integer_zerop (step))
2734 {
2735 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2736 if (!nested_in_vect_loop_p (loop, stmt_info))
2737 return DR_IS_READ (dr);
2738 /* Allow references with zero step for outer loops marked
2739 with pragma omp simd only - it guarantees absence of
2740 loop-carried dependencies between inner loop iterations. */
2741 if (loop->safelen < 2)
2742 {
2743 if (dump_enabled_p ())
2744 dump_printf_loc (MSG_NOTE, vect_location,
2745 "zero step in inner loop of nest\n");
2746 return false;
2747 }
2748 }
2749
2750 if (loop && nested_in_vect_loop_p (loop, stmt_info))
2751 {
2752 /* Interleaved accesses are not yet supported within outer-loop
2753 vectorization for references in the inner-loop. */
2754 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2755
2756 /* For the rest of the analysis we use the outer-loop step. */
2757 step = STMT_VINFO_DR_STEP (stmt_info);
2758 if (integer_zerop (step))
2759 {
2760 if (dump_enabled_p ())
2761 dump_printf_loc (MSG_NOTE, vect_location,
2762 "zero step in outer loop.\n");
2763 return DR_IS_READ (dr);
2764 }
2765 }
2766
2767 /* Consecutive? */
2768 if (TREE_CODE (step) == INTEGER_CST)
2769 {
2770 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2771 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2772 || (dr_step < 0
2773 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2774 {
2775 /* Mark that it is not interleaving. */
2776 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2777 return true;
2778 }
2779 }
2780
2781 if (loop && nested_in_vect_loop_p (loop, stmt_info))
2782 {
2783 if (dump_enabled_p ())
2784 dump_printf_loc (MSG_NOTE, vect_location,
2785 "grouped access in outer loop.\n");
2786 return false;
2787 }
2788
2789
2790 /* Assume this is a DR handled by non-constant strided load case. */
2791 if (TREE_CODE (step) != INTEGER_CST)
2792 return (STMT_VINFO_STRIDED_P (stmt_info)
2793 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2794 || vect_analyze_group_access (dr_info)));
2795
2796 /* Not consecutive access - check if it's a part of interleaving group. */
2797 return vect_analyze_group_access (dr_info);
2798 }
2799
2800 /* Compare two data-references DRA and DRB, used to sort them into
2801 chunks suitable for grouping. */
2802
2803 static int
2804 dr_group_sort_cmp (const void *dra_, const void *drb_)
2805 {
2806 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2807 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2808 int cmp;
2809
2810 /* Stabilize sort. */
2811 if (dra == drb)
2812 return 0;
2813
2814 /* DRs in different loops never belong to the same group. */
2815 loop_p loopa = gimple_bb (DR_STMT (dra))->loop_father;
2816 loop_p loopb = gimple_bb (DR_STMT (drb))->loop_father;
2817 if (loopa != loopb)
2818 return loopa->num < loopb->num ? -1 : 1;
2819
2820 /* Ordering of DRs according to base. */
2821 cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2822 DR_BASE_ADDRESS (drb));
2823 if (cmp != 0)
2824 return cmp;
2825
2826 /* And according to DR_OFFSET. */
2827 cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2828 if (cmp != 0)
2829 return cmp;
2830
2831 /* Put reads before writes. */
2832 if (DR_IS_READ (dra) != DR_IS_READ (drb))
2833 return DR_IS_READ (dra) ? -1 : 1;
2834
2835 /* Then sort by access size. */
2836 cmp = data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2837 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2838 if (cmp != 0)
2839 return cmp;
2840
2841 /* And by step. */
2842 cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
2843 if (cmp != 0)
2844 return cmp;
2845
2846 /* Then sort by DR_INIT. In case of identical DRs, sort by stmt UID. */
2847 cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
2848 if (cmp == 0)
2849 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2850 return cmp;
2851 }
2852
2853 /* If OP is the result of a conversion, return the unconverted value,
2854 otherwise return null. */
2855
2856 static tree
2857 strip_conversion (tree op)
2858 {
2859 if (TREE_CODE (op) != SSA_NAME)
2860 return NULL_TREE;
2861 gimple *stmt = SSA_NAME_DEF_STMT (op);
2862 if (!is_gimple_assign (stmt)
2863 || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (stmt)))
2864 return NULL_TREE;
2865 return gimple_assign_rhs1 (stmt);
2866 }
2867
2868 /* Return true if vectorizable_* routines can handle statements STMT1_INFO
2869 and STMT2_INFO being in a single group. When ALLOW_SLP_P, masked loads can
2870 be grouped in SLP mode. */
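/* For example (illustrative GIMPLE, not taken from a real dump): two
   .MASK_STORE calls whose mask operands are _m and (othermasktype) _m can
   be grouped, because strip_conversion exposes the same underlying mask;
   genuinely different masks make the stores ungroupable.  */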
2871
2872 static bool
2873 can_group_stmts_p (stmt_vec_info stmt1_info, stmt_vec_info stmt2_info,
2874 bool allow_slp_p)
2875 {
2876 if (gimple_assign_single_p (stmt1_info->stmt))
2877 return gimple_assign_single_p (stmt2_info->stmt);
2878
2879 gcall *call1 = dyn_cast <gcall *> (stmt1_info->stmt);
2880 if (call1 && gimple_call_internal_p (call1))
2881 {
2882 /* Check for two masked loads or two masked stores. */
2883 gcall *call2 = dyn_cast <gcall *> (stmt2_info->stmt);
2884 if (!call2 || !gimple_call_internal_p (call2))
2885 return false;
2886 internal_fn ifn = gimple_call_internal_fn (call1);
2887 if (ifn != IFN_MASK_LOAD && ifn != IFN_MASK_STORE)
2888 return false;
2889 if (ifn != gimple_call_internal_fn (call2))
2890 return false;
2891
2892 /* Check that the masks are the same. Cope with casts of masks,
2893 like those created by build_mask_conversion. */
2894 tree mask1 = gimple_call_arg (call1, 2);
2895 tree mask2 = gimple_call_arg (call2, 2);
2896 if (!operand_equal_p (mask1, mask2, 0)
2897 && (ifn == IFN_MASK_STORE || !allow_slp_p))
2898 {
2899 mask1 = strip_conversion (mask1);
2900 if (!mask1)
2901 return false;
2902 mask2 = strip_conversion (mask2);
2903 if (!mask2)
2904 return false;
2905 if (!operand_equal_p (mask1, mask2, 0))
2906 return false;
2907 }
2908 return true;
2909 }
2910
2911 return false;
2912 }
2913
2914 /* Function vect_analyze_data_ref_accesses.
2915
2916 Analyze the access pattern of all the data references in the loop.
2917
2918 FORNOW: the only access pattern that is considered vectorizable is a
2919 simple step 1 (consecutive) access.
2920
2921 FORNOW: handle only arrays and pointer accesses. */
2922
2923 opt_result
2924 vect_analyze_data_ref_accesses (vec_info *vinfo)
2925 {
2926 unsigned int i;
2927 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
2928 struct data_reference *dr;
2929
2930 DUMP_VECT_SCOPE ("vect_analyze_data_ref_accesses");
2931
2932 if (datarefs.is_empty ())
2933 return opt_result::success ();
2934
2935 /* Sort the array of datarefs to make building the interleaving chains
2936 linear. Don't modify the original vector's order; it is needed for
2937 determining which dependencies are reversed. */
2938 vec<data_reference_p> datarefs_copy = datarefs.copy ();
2939 datarefs_copy.qsort (dr_group_sort_cmp);
2940 hash_set<stmt_vec_info> to_fixup;
2941
2942 /* Build the interleaving chains. */
2943 for (i = 0; i < datarefs_copy.length () - 1;)
2944 {
2945 data_reference_p dra = datarefs_copy[i];
2946 dr_vec_info *dr_info_a = vinfo->lookup_dr (dra);
2947 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
2948 stmt_vec_info lastinfo = NULL;
2949 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
2950 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a))
2951 {
2952 ++i;
2953 continue;
2954 }
2955 for (i = i + 1; i < datarefs_copy.length (); ++i)
2956 {
2957 data_reference_p drb = datarefs_copy[i];
2958 dr_vec_info *dr_info_b = vinfo->lookup_dr (drb);
2959 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
2960 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_b)
2961 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
2962 break;
2963
2964 /* ??? Imperfect sorting (non-compatible types, non-modulo
2965 accesses, same accesses) can lead to a group being artificially
2966 split here, as we don't just skip over those. If it really
2967 matters we can push those to a worklist and re-iterate
2968 over them. Then we can just skip ahead to the next DR here. */
2969
2970 /* DRs in a different loop should not be put into the same
2971 interleaving group. */
2972 if (gimple_bb (DR_STMT (dra))->loop_father
2973 != gimple_bb (DR_STMT (drb))->loop_father)
2974 break;
2975
2976 /* Check that the data-refs have same first location (except init)
2977 and they are both either store or load (not load and store,
2978 not masked loads or stores). */
2979 if (DR_IS_READ (dra) != DR_IS_READ (drb)
2980 || data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2981 DR_BASE_ADDRESS (drb)) != 0
2982 || data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb)) != 0
2983 || !can_group_stmts_p (stmtinfo_a, stmtinfo_b, true))
2984 break;
2985
2986 /* Check that the data-refs have the same constant size. */
2987 tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
2988 tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
2989 if (!tree_fits_uhwi_p (sza)
2990 || !tree_fits_uhwi_p (szb)
2991 || !tree_int_cst_equal (sza, szb))
2992 break;
2993
2994 /* Check that the data-refs have the same step. */
2995 if (data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb)) != 0)
2996 break;
2997
2998 /* Check the types are compatible.
2999 ??? We don't distinguish this during sorting. */
3000 if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
3001 TREE_TYPE (DR_REF (drb))))
3002 break;
3003
3004 /* Check that the DR_INITs are compile-time constants. */
3005 if (TREE_CODE (DR_INIT (dra)) != INTEGER_CST
3006 || TREE_CODE (DR_INIT (drb)) != INTEGER_CST)
3007 break;
3008
3009 /* Different .GOMP_SIMD_LANE calls still give the same lane,
3010 just hold extra information. */
3011 if (STMT_VINFO_SIMD_LANE_ACCESS_P (stmtinfo_a)
3012 && STMT_VINFO_SIMD_LANE_ACCESS_P (stmtinfo_b)
3013 && data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb)) == 0)
3014 break;
3015
3016 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
3017 HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
3018 HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
3019 HOST_WIDE_INT init_prev
3020 = TREE_INT_CST_LOW (DR_INIT (datarefs_copy[i-1]));
3021 gcc_assert (init_a <= init_b
3022 && init_a <= init_prev
3023 && init_prev <= init_b);
3024
3025 /* Do not place the same access in the interleaving chain twice. */
3026 if (init_b == init_prev)
3027 {
3028 gcc_assert (gimple_uid (DR_STMT (datarefs_copy[i-1]))
3029 < gimple_uid (DR_STMT (drb)));
3030 /* Simply link in duplicates and fix up the chain below. */
3031 }
3032 else
3033 {
3034 /* If init_b == init_a + the size of the type * k, we have an
3035 interleaving, and DRA is accessed before DRB. */
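/* E.g. (illustrative): 4-byte accesses with DR_INITs 0 and 8 can
   interleave (k == 2), whereas inits 0 and 6 cannot, since 6 is not a
   multiple of the element size.  */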
3036 HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
3037 if (type_size_a == 0
3038 || (init_b - init_a) % type_size_a != 0)
3039 break;
3040
3041 /* If we have a store, the accesses are adjacent. This splits
3042 groups into chunks we support (we don't support vectorization
3043 of stores with gaps). */
3044 if (!DR_IS_READ (dra) && init_b - init_prev != type_size_a)
3045 break;
3046
3047 /* If the step (when constant and nonzero) is not greater than the
3048 difference between the data-refs' inits, do not group them;
3049 breaking here splits groups into suitable sizes. */
3050 if (tree_fits_shwi_p (DR_STEP (dra)))
3051 {
3052 HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra));
3053 if (step != 0 && step <= (init_b - init_a))
3054 break;
3055 }
3056 }
3057
3058 if (dump_enabled_p ())
3059 dump_printf_loc (MSG_NOTE, vect_location,
3060 DR_IS_READ (dra)
3061 ? "Detected interleaving load %T and %T\n"
3062 : "Detected interleaving store %T and %T\n",
3063 DR_REF (dra), DR_REF (drb));
3064
3065 /* Link the found element into the group list. */
3066 if (!DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
3067 {
3068 DR_GROUP_FIRST_ELEMENT (stmtinfo_a) = stmtinfo_a;
3069 lastinfo = stmtinfo_a;
3070 }
3071 DR_GROUP_FIRST_ELEMENT (stmtinfo_b) = stmtinfo_a;
3072 DR_GROUP_NEXT_ELEMENT (lastinfo) = stmtinfo_b;
3073 lastinfo = stmtinfo_b;
3074
3075 STMT_VINFO_SLP_VECT_ONLY (stmtinfo_a)
3076 = !can_group_stmts_p (stmtinfo_a, stmtinfo_b, false);
3077
3078 if (dump_enabled_p () && STMT_VINFO_SLP_VECT_ONLY (stmtinfo_a))
3079 dump_printf_loc (MSG_NOTE, vect_location,
3080 "Load suitable for SLP vectorization only.\n");
3081
3082 if (init_b == init_prev
3083 && !to_fixup.add (DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
3084 && dump_enabled_p ())
3085 dump_printf_loc (MSG_NOTE, vect_location,
3086 "Queuing group with duplicate access for fixup\n");
3087 }
3088 }
3089
3090 /* Fix up groups with duplicate entries by splitting them. */
3091 while (1)
3092 {
3093 hash_set<stmt_vec_info>::iterator it = to_fixup.begin ();
3094 if (!(it != to_fixup.end ()))
3095 break;
3096 stmt_vec_info grp = *it;
3097 to_fixup.remove (grp);
3098
3099 /* Find the earliest duplicate group member. */
3100 unsigned first_duplicate = -1u;
3101 stmt_vec_info next, g = grp;
3102 while ((next = DR_GROUP_NEXT_ELEMENT (g)))
3103 {
3104 if (tree_int_cst_equal (DR_INIT (STMT_VINFO_DR_INFO (next)->dr),
3105 DR_INIT (STMT_VINFO_DR_INFO (g)->dr))
3106 && gimple_uid (STMT_VINFO_STMT (next)) < first_duplicate)
3107 first_duplicate = gimple_uid (STMT_VINFO_STMT (next));
3108 g = next;
3109 }
3110 if (first_duplicate == -1U)
3111 continue;
3112
3113 /* Then move all stmts after the first duplicate to a new group.
3114 Note this is a heuristic but one with the property that *it
3115 is fixed up completely. */
3116 g = grp;
3117 stmt_vec_info newgroup = NULL, ng = grp;
3118 while ((next = DR_GROUP_NEXT_ELEMENT (g)))
3119 {
3120 if (gimple_uid (STMT_VINFO_STMT (next)) >= first_duplicate)
3121 {
3122 DR_GROUP_NEXT_ELEMENT (g) = DR_GROUP_NEXT_ELEMENT (next);
3123 if (!newgroup)
3124 newgroup = next;
3125 else
3126 DR_GROUP_NEXT_ELEMENT (ng) = next;
3127 ng = next;
3128 DR_GROUP_FIRST_ELEMENT (ng) = newgroup;
3129 }
3130 else
3131 g = DR_GROUP_NEXT_ELEMENT (g);
3132 }
3133 DR_GROUP_NEXT_ELEMENT (ng) = NULL;
3134
3135 /* Fixup the new group which still may contain duplicates. */
3136 to_fixup.add (newgroup);
3137 }
3138
3139 FOR_EACH_VEC_ELT (datarefs_copy, i, dr)
3140 {
3141 dr_vec_info *dr_info = vinfo->lookup_dr (dr);
3142 if (STMT_VINFO_VECTORIZABLE (dr_info->stmt)
3143 && !vect_analyze_data_ref_access (dr_info))
3144 {
3145 if (dump_enabled_p ())
3146 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3147 "not vectorized: complicated access pattern.\n");
3148
3149 if (is_a <bb_vec_info> (vinfo))
3150 {
3151 /* Mark the statement as not vectorizable. */
3152 STMT_VINFO_VECTORIZABLE (dr_info->stmt) = false;
3153 continue;
3154 }
3155 else
3156 {
3157 datarefs_copy.release ();
3158 return opt_result::failure_at (dr_info->stmt->stmt,
3159 "not vectorized:"
3160 " complicated access pattern.\n");
3161 }
3162 }
3163 }
3164
3165 datarefs_copy.release ();
3166 return opt_result::success ();
3167 }
3168
3169 /* Function vect_vfa_segment_size.
3170
3171 Input:
3172 DR_INFO: The data reference.
3173 LENGTH_FACTOR: segment length to consider.
3174
3175 Return a value suitable for the dr_with_seg_len::seg_len field.
3176 This is the "distance travelled" by the pointer from the first
3177 iteration in the segment to the last. Note that it does not include
3178 the size of the access; in effect it only describes the first byte. */
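/* E.g. (illustrative): DR_STEP == 4 and LENGTH_FACTOR == 16 give a segment
   length of 4 * (16 - 1) == 60 bytes.  */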
3179
3180 static tree
3181 vect_vfa_segment_size (dr_vec_info *dr_info, tree length_factor)
3182 {
3183 length_factor = size_binop (MINUS_EXPR,
3184 fold_convert (sizetype, length_factor),
3185 size_one_node);
3186 return size_binop (MULT_EXPR, fold_convert (sizetype, DR_STEP (dr_info->dr)),
3187 length_factor);
3188 }
3189
3190 /* Return a value that, when added to abs (vect_vfa_segment_size (DR_INFO)),
3191 gives the worst-case number of bytes covered by the segment. */
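/* E.g. (illustrative): a group of 4-byte loads with DR_GROUP_SIZE == 4 and
   DR_GROUP_GAP == 1 gives an access size of 4 * (4 - 1) == 12 bytes; with
   dr_explicit_realign_optimized the size grows further to cover a whole
   vector.  */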
3192
3193 static unsigned HOST_WIDE_INT
3194 vect_vfa_access_size (dr_vec_info *dr_info)
3195 {
3196 stmt_vec_info stmt_vinfo = dr_info->stmt;
3197 tree ref_type = TREE_TYPE (DR_REF (dr_info->dr));
3198 unsigned HOST_WIDE_INT ref_size = tree_to_uhwi (TYPE_SIZE_UNIT (ref_type));
3199 unsigned HOST_WIDE_INT access_size = ref_size;
3200 if (DR_GROUP_FIRST_ELEMENT (stmt_vinfo))
3201 {
3202 gcc_assert (DR_GROUP_FIRST_ELEMENT (stmt_vinfo) == stmt_vinfo);
3203 access_size *= DR_GROUP_SIZE (stmt_vinfo) - DR_GROUP_GAP (stmt_vinfo);
3204 }
3205 if (STMT_VINFO_VEC_STMT (stmt_vinfo)
3206 && (vect_supportable_dr_alignment (dr_info, false)
3207 == dr_explicit_realign_optimized))
3208 {
3209 /* We might access a full vector's worth. */
3210 tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
3211 access_size += tree_to_uhwi (TYPE_SIZE_UNIT (vectype)) - ref_size;
3212 }
3213 return access_size;
3214 }
3215
3216 /* Get the minimum alignment for all the scalar accesses that DR_INFO
3217 describes. */
3218
3219 static unsigned int
3220 vect_vfa_align (dr_vec_info *dr_info)
3221 {
3222 return TYPE_ALIGN_UNIT (TREE_TYPE (DR_REF (dr_info->dr)));
3223 }
3224
3225 /* Function vect_no_alias_p.
3226
3227 Given data references A and B with equal base and offset, see whether
3228 the alias relation can be decided at compilation time. Return 1 if
3229 it can and the references alias, 0 if it can and the references do
3230 not alias, and -1 if we cannot decide at compile time. SEGMENT_LENGTH_A,
3231 SEGMENT_LENGTH_B, ACCESS_SIZE_A and ACCESS_SIZE_B are the equivalent
3232 of dr_with_seg_len::{seg_len,access_size} for A and B. */
3233
3234 static int
3235 vect_compile_time_alias (dr_vec_info *a, dr_vec_info *b,
3236 tree segment_length_a, tree segment_length_b,
3237 unsigned HOST_WIDE_INT access_size_a,
3238 unsigned HOST_WIDE_INT access_size_b)
3239 {
3240 poly_offset_int offset_a = wi::to_poly_offset (DR_INIT (a->dr));
3241 poly_offset_int offset_b = wi::to_poly_offset (DR_INIT (b->dr));
3242 poly_uint64 const_length_a;
3243 poly_uint64 const_length_b;
3244
3245 /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
3246 bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
3247 [a, a+12) */
3248 if (tree_int_cst_compare (DR_STEP (a->dr), size_zero_node) < 0)
3249 {
3250 const_length_a = (-wi::to_poly_wide (segment_length_a)).force_uhwi ();
3251 offset_a = (offset_a + access_size_a) - const_length_a;
3252 }
3253 else
3254 const_length_a = tree_to_poly_uint64 (segment_length_a);
3255 if (tree_int_cst_compare (DR_STEP (b->dr), size_zero_node) < 0)
3256 {
3257 const_length_b = (-wi::to_poly_wide (segment_length_b)).force_uhwi ();
3258 offset_b = (offset_b + access_size_b) - const_length_b;
3259 }
3260 else
3261 const_length_b = tree_to_poly_uint64 (segment_length_b);
3262
3263 const_length_a += access_size_a;
3264 const_length_b += access_size_b;
3265
3266 if (ranges_known_overlap_p (offset_a, const_length_a,
3267 offset_b, const_length_b))
3268 return 1;
3269
3270 if (!ranges_maybe_overlap_p (offset_a, const_length_a,
3271 offset_b, const_length_b))
3272 return 0;
3273
3274 return -1;
3275 }
3276
3277 /* Return true if the minimum nonzero dependence distance for loop LOOP_DEPTH
3278 in DDR is >= VF. */
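/* E.g. (illustrative, assuming the dependence is not reversed): with
   VF == 4, a distance vector entry of 8 at LOOP_DEPTH satisfies the check,
   while an entry of 2 does not.  */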
3279
3280 static bool
3281 dependence_distance_ge_vf (data_dependence_relation *ddr,
3282 unsigned int loop_depth, poly_uint64 vf)
3283 {
3284 if (DDR_ARE_DEPENDENT (ddr) != NULL_TREE
3285 || DDR_NUM_DIST_VECTS (ddr) == 0)
3286 return false;
3287
3288 /* If the dependence is exact, we should have limited the VF instead. */
3289 gcc_checking_assert (DDR_COULD_BE_INDEPENDENT_P (ddr));
3290
3291 unsigned int i;
3292 lambda_vector dist_v;
3293 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
3294 {
3295 HOST_WIDE_INT dist = dist_v[loop_depth];
3296 if (dist != 0
3297 && !(dist > 0 && DDR_REVERSED_P (ddr))
3298 && maybe_lt ((unsigned HOST_WIDE_INT) abs_hwi (dist), vf))
3299 return false;
3300 }
3301
3302 if (dump_enabled_p ())
3303 dump_printf_loc (MSG_NOTE, vect_location,
3304 "dependence distance between %T and %T is >= VF\n",
3305 DR_REF (DDR_A (ddr)), DR_REF (DDR_B (ddr)));
3306
3307 return true;
3308 }
3309
3310 /* Dump LOWER_BOUND using flags DUMP_KIND. Dumps are known to be enabled. */
3311
3312 static void
3313 dump_lower_bound (dump_flags_t dump_kind, const vec_lower_bound &lower_bound)
3314 {
3315 dump_printf (dump_kind, "%s (%T) >= ",
3316 lower_bound.unsigned_p ? "unsigned" : "abs",
3317 lower_bound.expr);
3318 dump_dec (dump_kind, lower_bound.min_value);
3319 }
3320
3321 /* Record that the vectorized loop requires the vec_lower_bound described
3322 by EXPR, UNSIGNED_P and MIN_VALUE. */
3323
3324 static void
3325 vect_check_lower_bound (loop_vec_info loop_vinfo, tree expr, bool unsigned_p,
3326 poly_uint64 min_value)
3327 {
3328 vec<vec_lower_bound> lower_bounds = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
3329 for (unsigned int i = 0; i < lower_bounds.length (); ++i)
3330 if (operand_equal_p (lower_bounds[i].expr, expr, 0))
3331 {
3332 unsigned_p &= lower_bounds[i].unsigned_p;
3333 min_value = upper_bound (lower_bounds[i].min_value, min_value);
3334 if (lower_bounds[i].unsigned_p != unsigned_p
3335 || maybe_lt (lower_bounds[i].min_value, min_value))
3336 {
3337 lower_bounds[i].unsigned_p = unsigned_p;
3338 lower_bounds[i].min_value = min_value;
3339 if (dump_enabled_p ())
3340 {
3341 dump_printf_loc (MSG_NOTE, vect_location,
3342 "updating run-time check to ");
3343 dump_lower_bound (MSG_NOTE, lower_bounds[i]);
3344 dump_printf (MSG_NOTE, "\n");
3345 }
3346 }
3347 return;
3348 }
3349
3350 vec_lower_bound lower_bound (expr, unsigned_p, min_value);
3351 if (dump_enabled_p ())
3352 {
3353 dump_printf_loc (MSG_NOTE, vect_location, "need a run-time check that ");
3354 dump_lower_bound (MSG_NOTE, lower_bound);
3355 dump_printf (MSG_NOTE, "\n");
3356 }
3357 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).safe_push (lower_bound);
3358 }
3359
3360 /* Return true if it's unlikely that the step of the vectorized form of DR_INFO
3361 will span fewer than GAP bytes. */
3362
3363 static bool
3364 vect_small_gap_p (loop_vec_info loop_vinfo, dr_vec_info *dr_info,
3365 poly_int64 gap)
3366 {
3367 stmt_vec_info stmt_info = dr_info->stmt;
3368 HOST_WIDE_INT count
3369 = estimated_poly_value (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
3370 if (DR_GROUP_FIRST_ELEMENT (stmt_info))
3371 count *= DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (stmt_info));
3372 return (estimated_poly_value (gap)
3373 <= count * vect_get_scalar_dr_size (dr_info));
3374 }
3375
3376 /* Return true if we know that there is no alias between DR_INFO_A and
3377 DR_INFO_B when abs (DR_STEP (DR_INFO_A->dr)) >= N for some N.
3378 When returning true, set *LOWER_BOUND_OUT to this N. */
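/* A small worked case (numbers invented for illustration): for two
   4-byte accesses with the same base, offset and unknown step and with
   DR_INITs 0 and 4, the combined accesses span 4 + 4 - 0 = 8 bytes, so
   *LOWER_BOUND_OUT is set to 8 and there is no alias whenever
   abs (DR_STEP) >= 8.  */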
3379
3380 static bool
3381 vectorizable_with_step_bound_p (dr_vec_info *dr_info_a, dr_vec_info *dr_info_b,
3382 poly_uint64 *lower_bound_out)
3383 {
3384 /* Check that there is a constant gap of known sign between DR_A
3385 and DR_B. */
3386 data_reference *dr_a = dr_info_a->dr;
3387 data_reference *dr_b = dr_info_b->dr;
3388 poly_int64 init_a, init_b;
3389 if (!operand_equal_p (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b), 0)
3390 || !operand_equal_p (DR_OFFSET (dr_a), DR_OFFSET (dr_b), 0)
3391 || !operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0)
3392 || !poly_int_tree_p (DR_INIT (dr_a), &init_a)
3393 || !poly_int_tree_p (DR_INIT (dr_b), &init_b)
3394 || !ordered_p (init_a, init_b))
3395 return false;
3396
3397 /* Sort DR_A and DR_B by the address they access. */
3398 if (maybe_lt (init_b, init_a))
3399 {
3400 std::swap (init_a, init_b);
3401 std::swap (dr_info_a, dr_info_b);
3402 std::swap (dr_a, dr_b);
3403 }
3404
3405 /* If the two accesses could be dependent within a scalar iteration,
3406 make sure that we'd retain their order. */
3407 if (maybe_gt (init_a + vect_get_scalar_dr_size (dr_info_a), init_b)
3408 && !vect_preserves_scalar_order_p (dr_info_a, dr_info_b))
3409 return false;
3410
3411 /* There is no alias if abs (DR_STEP) is greater than or equal to
3412 the bytes spanned by the combination of the two accesses. */
3413 *lower_bound_out = init_b + vect_get_scalar_dr_size (dr_info_b) - init_a;
3414 return true;
3415 }
3416
3417 /* Function vect_prune_runtime_alias_test_list.
3418
3419 Prune a list of ddrs to be tested at run-time by versioning for alias.
3420 Merge several alias checks into one if possible.
3421 Return FALSE if the resulting list of ddrs is longer than allowed by
3422 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
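/* As an illustration (the loop below is only an example), vectorizing

     void
     f (int *a, int *b, int n)
     {
       for (int i = 0; i < n; i++)
         a[i] = b[i] + 1;
     }

   when nothing is known about the relationship of A and B typically
   needs one such run-time test, checking that the byte ranges accessed
   through A and B in the vectorized loop do not overlap.  */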
3423
3424 opt_result
3425 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
3426 {
3427 typedef pair_hash <tree_operand_hash, tree_operand_hash> tree_pair_hash;
3428 hash_set <tree_pair_hash> compared_objects;
3429
3430 vec<ddr_p> may_alias_ddrs = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
3431 vec<dr_with_seg_len_pair_t> &comp_alias_ddrs
3432 = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3433 vec<vec_object_pair> &check_unequal_addrs
3434 = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3435 poly_uint64 vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3436 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
3437
3438 ddr_p ddr;
3439 unsigned int i;
3440 tree length_factor;
3441
3442 DUMP_VECT_SCOPE ("vect_prune_runtime_alias_test_list");
3443
3444 /* Step values are irrelevant for aliasing if the number of vector
3445 iterations is equal to the number of scalar iterations (which can
3446 happen for fully-SLP loops). */
3447 bool ignore_step_p = known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U);
3448
3449 if (!ignore_step_p)
3450 {
3451 /* Convert the checks for nonzero steps into bound tests. */
3452 tree value;
3453 FOR_EACH_VEC_ELT (LOOP_VINFO_CHECK_NONZERO (loop_vinfo), i, value)
3454 vect_check_lower_bound (loop_vinfo, value, true, 1);
3455 }
3456
3457 if (may_alias_ddrs.is_empty ())
3458 return opt_result::success ();
3459
3460 comp_alias_ddrs.create (may_alias_ddrs.length ());
3461
3462 unsigned int loop_depth
3463 = index_in_loop_nest (LOOP_VINFO_LOOP (loop_vinfo)->num,
3464 LOOP_VINFO_LOOP_NEST (loop_vinfo));
3465
3466 /* First, we collect all data ref pairs for aliasing checks. */
3467 FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
3468 {
3469 int comp_res;
3470 poly_uint64 lower_bound;
3471 tree segment_length_a, segment_length_b;
3472 unsigned HOST_WIDE_INT access_size_a, access_size_b;
3473 unsigned int align_a, align_b;
3474
3475 /* Ignore the alias if the VF we chose ended up being no greater
3476 than the dependence distance. */
3477 if (dependence_distance_ge_vf (ddr, loop_depth, vect_factor))
3478 continue;
3479
3480 if (DDR_OBJECT_A (ddr))
3481 {
3482 vec_object_pair new_pair (DDR_OBJECT_A (ddr), DDR_OBJECT_B (ddr));
3483 if (!compared_objects.add (new_pair))
3484 {
3485 if (dump_enabled_p ())
3486 dump_printf_loc (MSG_NOTE, vect_location,
3487 "checking that %T and %T"
3488 " have different addresses\n",
3489 new_pair.first, new_pair.second);
3490 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).safe_push (new_pair);
3491 }
3492 continue;
3493 }
3494
3495 dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (DDR_A (ddr));
3496 stmt_vec_info stmt_info_a = dr_info_a->stmt;
3497
3498 dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (DDR_B (ddr));
3499 stmt_vec_info stmt_info_b = dr_info_b->stmt;
3500
3501 /* Skip the pair if inter-iteration dependencies are irrelevant
3502 and intra-iteration dependencies are guaranteed to be honored. */
3503 if (ignore_step_p
3504 && (vect_preserves_scalar_order_p (dr_info_a, dr_info_b)
3505 || vectorizable_with_step_bound_p (dr_info_a, dr_info_b,
3506 &lower_bound)))
3507 {
3508 if (dump_enabled_p ())
3509 dump_printf_loc (MSG_NOTE, vect_location,
3510 "no need for alias check between "
3511 "%T and %T when VF is 1\n",
3512 DR_REF (dr_info_a->dr), DR_REF (dr_info_b->dr));
3513 continue;
3514 }
3515
3516 /* See whether we can handle the alias using a bounds check on
3517 the step, and whether that's likely to be the best approach.
3518 (It might not be, for example, if the minimum step is much larger
3519 than the number of bytes handled by one vector iteration.) */
3520 if (!ignore_step_p
3521 && TREE_CODE (DR_STEP (dr_info_a->dr)) != INTEGER_CST
3522 && vectorizable_with_step_bound_p (dr_info_a, dr_info_b,
3523 &lower_bound)
3524 && (vect_small_gap_p (loop_vinfo, dr_info_a, lower_bound)
3525 || vect_small_gap_p (loop_vinfo, dr_info_b, lower_bound)))
3526 {
3527 bool unsigned_p = dr_known_forward_stride_p (dr_info_a->dr);
3528 if (dump_enabled_p ())
3529 {
3530 dump_printf_loc (MSG_NOTE, vect_location, "no alias between "
3531 "%T and %T when the step %T is outside ",
3532 DR_REF (dr_info_a->dr),
3533 DR_REF (dr_info_b->dr),
3534 DR_STEP (dr_info_a->dr));
3535 if (unsigned_p)
3536 dump_printf (MSG_NOTE, "[0");
3537 else
3538 {
3539 dump_printf (MSG_NOTE, "(");
3540 dump_dec (MSG_NOTE, poly_int64 (-lower_bound));
3541 }
3542 dump_printf (MSG_NOTE, ", ");
3543 dump_dec (MSG_NOTE, lower_bound);
3544 dump_printf (MSG_NOTE, ")\n");
3545 }
3546 vect_check_lower_bound (loop_vinfo, DR_STEP (dr_info_a->dr),
3547 unsigned_p, lower_bound);
3548 continue;
3549 }
3550
3551 stmt_vec_info dr_group_first_a = DR_GROUP_FIRST_ELEMENT (stmt_info_a);
3552 if (dr_group_first_a)
3553 {
3554 stmt_info_a = dr_group_first_a;
3555 dr_info_a = STMT_VINFO_DR_INFO (stmt_info_a);
3556 }
3557
3558 stmt_vec_info dr_group_first_b = DR_GROUP_FIRST_ELEMENT (stmt_info_b);
3559 if (dr_group_first_b)
3560 {
3561 stmt_info_b = dr_group_first_b;
3562 dr_info_b = STMT_VINFO_DR_INFO (stmt_info_b);
3563 }
3564
3565 if (ignore_step_p)
3566 {
3567 segment_length_a = size_zero_node;
3568 segment_length_b = size_zero_node;
3569 }
3570 else
3571 {
3572 if (!operand_equal_p (DR_STEP (dr_info_a->dr),
3573 DR_STEP (dr_info_b->dr), 0))
3574 length_factor = scalar_loop_iters;
3575 else
3576 length_factor = size_int (vect_factor);
3577 segment_length_a = vect_vfa_segment_size (dr_info_a, length_factor);
3578 segment_length_b = vect_vfa_segment_size (dr_info_b, length_factor);
3579 }
3580 access_size_a = vect_vfa_access_size (dr_info_a);
3581 access_size_b = vect_vfa_access_size (dr_info_b);
3582 align_a = vect_vfa_align (dr_info_a);
3583 align_b = vect_vfa_align (dr_info_b);
3584
3585 comp_res = data_ref_compare_tree (DR_BASE_ADDRESS (dr_info_a->dr),
3586 DR_BASE_ADDRESS (dr_info_b->dr));
3587 if (comp_res == 0)
3588 comp_res = data_ref_compare_tree (DR_OFFSET (dr_info_a->dr),
3589 DR_OFFSET (dr_info_b->dr));
3590
3591 /* See whether the alias is known at compilation time. */
3592 if (comp_res == 0
3593 && TREE_CODE (DR_STEP (dr_info_a->dr)) == INTEGER_CST
3594 && TREE_CODE (DR_STEP (dr_info_b->dr)) == INTEGER_CST
3595 && poly_int_tree_p (segment_length_a)
3596 && poly_int_tree_p (segment_length_b))
3597 {
3598 int res = vect_compile_time_alias (dr_info_a, dr_info_b,
3599 segment_length_a,
3600 segment_length_b,
3601 access_size_a,
3602 access_size_b);
3603 if (res >= 0 && dump_enabled_p ())
3604 {
3605 dump_printf_loc (MSG_NOTE, vect_location,
3606 "can tell at compile time that %T and %T",
3607 DR_REF (dr_info_a->dr), DR_REF (dr_info_b->dr));
3608 if (res == 0)
3609 dump_printf (MSG_NOTE, " do not alias\n");
3610 else
3611 dump_printf (MSG_NOTE, " alias\n");
3612 }
3613
3614 if (res == 0)
3615 continue;
3616
3617 if (res == 1)
3618 return opt_result::failure_at (stmt_info_b->stmt,
3619 "not vectorized:"
3620 " compilation time alias: %G%G",
3621 stmt_info_a->stmt,
3622 stmt_info_b->stmt);
3623 }
3624
3625 dr_with_seg_len_pair_t dr_with_seg_len_pair
3626 (dr_with_seg_len (dr_info_a->dr, segment_length_a,
3627 access_size_a, align_a),
3628 dr_with_seg_len (dr_info_b->dr, segment_length_b,
3629 access_size_b, align_b));
3630
3631 /* Canonicalize pairs by sorting the two DR members. */
3632 if (comp_res > 0)
3633 std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
3634
3635 comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
3636 }
3637
3638 prune_runtime_alias_test_list (&comp_alias_ddrs, vect_factor);
3639
3640 unsigned int count = (comp_alias_ddrs.length ()
3641 + check_unequal_addrs.length ());
3642
3643 if (dump_enabled_p ())
3644 dump_printf_loc (MSG_NOTE, vect_location,
3645 "improved number of alias checks from %d to %d\n",
3646 may_alias_ddrs.length (), count);
3647 unsigned limit = PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS);
3648 if (flag_simd_cost_model == VECT_COST_MODEL_CHEAP)
3649 limit = default_param_value
3650 (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) * 6 / 10;
3651 if (count > limit)
3652 return opt_result::failure_at
3653 (vect_location,
3654 "number of versioning for alias run-time tests exceeds %d "
3655 "(--param vect-max-version-for-alias-checks)\n", limit);
3656
3657 return opt_result::success ();
3658 }
3659
3660 /* Check whether we can use an internal function for a gather load
3661 or scatter store. READ_P is true for loads and false for stores.
3662 MASKED_P is true if the load or store is conditional. MEMORY_TYPE is
3663 the type of the memory elements being loaded or stored. OFFSET_BITS
3664 is the number of bits in each scalar offset and OFFSET_SIGN is the
3665 sign of the offset. SCALE is the amount by which the offset should
3666 be multiplied *after* it has been converted to address width.
3667
3668 Return true if the function is supported, storing the function
3669 id in *IFN_OUT and the type of a vector element in *ELEMENT_TYPE_OUT. */
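/* For instance (a hypothetical target): gathering 32-bit elements with
   64-bit offsets cannot use the internal functions, since the offsets
   would have to be truncated, whereas 16-bit offsets can be extended
   to 32 bits first and so may be usable, subject to the
   internal_gather_scatter_fn_supported_p check below.  */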
3670
3671 bool
3672 vect_gather_scatter_fn_p (bool read_p, bool masked_p, tree vectype,
3673 tree memory_type, unsigned int offset_bits,
3674 signop offset_sign, int scale,
3675 internal_fn *ifn_out, tree *element_type_out)
3676 {
3677 unsigned int memory_bits = tree_to_uhwi (TYPE_SIZE (memory_type));
3678 unsigned int element_bits = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (vectype)));
3679 if (offset_bits > element_bits)
3680 /* Internal functions require the offset to be the same width as
3681 the vector elements. We can extend narrower offsets, but it isn't
3682 safe to truncate wider offsets. */
3683 return false;
3684
3685 if (element_bits != memory_bits)
3686 /* For now the vector elements must be the same width as the
3687 memory elements. */
3688 return false;
3689
3690 /* Work out which function we need. */
3691 internal_fn ifn;
3692 if (read_p)
3693 ifn = masked_p ? IFN_MASK_GATHER_LOAD : IFN_GATHER_LOAD;
3694 else
3695 ifn = masked_p ? IFN_MASK_SCATTER_STORE : IFN_SCATTER_STORE;
3696
3697 /* Test whether the target supports this combination. */
3698 if (!internal_gather_scatter_fn_supported_p (ifn, vectype, memory_type,
3699 offset_sign, scale))
3700 return false;
3701
3702 *ifn_out = ifn;
3703 *element_type_out = TREE_TYPE (vectype);
3704 return true;
3705 }
3706
3707 /* STMT_INFO is a call to an internal gather load or scatter store function.
3708 Describe the operation in INFO. */
3709
3710 static void
3711 vect_describe_gather_scatter_call (stmt_vec_info stmt_info,
3712 gather_scatter_info *info)
3713 {
3714 gcall *call = as_a <gcall *> (stmt_info->stmt);
3715 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3716 data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3717
3718 info->ifn = gimple_call_internal_fn (call);
3719 info->decl = NULL_TREE;
3720 info->base = gimple_call_arg (call, 0);
3721 info->offset = gimple_call_arg (call, 1);
3722 info->offset_dt = vect_unknown_def_type;
3723 info->offset_vectype = NULL_TREE;
3724 info->scale = TREE_INT_CST_LOW (gimple_call_arg (call, 2));
3725 info->element_type = TREE_TYPE (vectype);
3726 info->memory_type = TREE_TYPE (DR_REF (dr));
3727 }
3728
3729 /* Return true if a non-affine read or write in STMT_INFO is suitable for a
3730 gather load or scatter store. Describe the operation in *INFO if so. */
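/* A typical candidate (example code only) is an indexed access such as

     for (i = 0; i < n; i++)
       res[i] = data[idx[i]];

   where the load from DATA is non-affine in I.  If the analysis below
   succeeds, *INFO roughly describes it as base DATA, offset taken from
   IDX[I] and the element size as the scale.  */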
3731
3732 bool
3733 vect_check_gather_scatter (stmt_vec_info stmt_info, loop_vec_info loop_vinfo,
3734 gather_scatter_info *info)
3735 {
3736 HOST_WIDE_INT scale = 1;
3737 poly_int64 pbitpos, pbitsize;
3738 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3739 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3740 tree offtype = NULL_TREE;
3741 tree decl = NULL_TREE, base, off;
3742 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3743 tree memory_type = TREE_TYPE (DR_REF (dr));
3744 machine_mode pmode;
3745 int punsignedp, reversep, pvolatilep = 0;
3746 internal_fn ifn;
3747 tree element_type;
3748 bool masked_p = false;
3749
3750 /* See whether this is already a call to a gather/scatter internal function.
3751 If not, see whether it's a masked load or store. */
3752 gcall *call = dyn_cast <gcall *> (stmt_info->stmt);
3753 if (call && gimple_call_internal_p (call))
3754 {
3755 ifn = gimple_call_internal_fn (call);
3756 if (internal_gather_scatter_fn_p (ifn))
3757 {
3758 vect_describe_gather_scatter_call (stmt_info, info);
3759 return true;
3760 }
3761 masked_p = (ifn == IFN_MASK_LOAD || ifn == IFN_MASK_STORE);
3762 }
3763
3764 /* True if we should aim to use internal functions rather than
3765 built-in functions. */
3766 bool use_ifn_p = (DR_IS_READ (dr)
3767 ? supports_vec_gather_load_p ()
3768 : supports_vec_scatter_store_p ());
3769
3770 base = DR_REF (dr);
3771 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
3772 see if we can use the def stmt of the address. */
3773 if (masked_p
3774 && TREE_CODE (base) == MEM_REF
3775 && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
3776 && integer_zerop (TREE_OPERAND (base, 1))
3777 && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
3778 {
3779 gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
3780 if (is_gimple_assign (def_stmt)
3781 && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
3782 base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
3783 }
3784
3785 /* The gather and scatter builtins need an address of the form
3786 loop_invariant + vector * {1, 2, 4, 8}
3787 or
3788 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3789 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3790 of loop invariants/SSA_NAMEs defined in the loop, with casts,
3791 multiplications and additions in it. To get a vector, we need
3792 a single SSA_NAME that will be defined in the loop and will
3793 contain everything that is not loop invariant and that can be
3794 vectorized. The following code attempts to find such a preexisting
3795 SSA_NAME OFF and put the loop invariants into a tree BASE
3796 that can be gimplified before the loop. */
3797 base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
3798 &punsignedp, &reversep, &pvolatilep);
3799 if (reversep)
3800 return false;
3801
3802 poly_int64 pbytepos = exact_div (pbitpos, BITS_PER_UNIT);
3803
3804 if (TREE_CODE (base) == MEM_REF)
3805 {
3806 if (!integer_zerop (TREE_OPERAND (base, 1)))
3807 {
3808 if (off == NULL_TREE)
3809 off = wide_int_to_tree (sizetype, mem_ref_offset (base));
3810 else
3811 off = size_binop (PLUS_EXPR, off,
3812 fold_convert (sizetype, TREE_OPERAND (base, 1)));
3813 }
3814 base = TREE_OPERAND (base, 0);
3815 }
3816 else
3817 base = build_fold_addr_expr (base);
3818
3819 if (off == NULL_TREE)
3820 off = size_zero_node;
3821
3822 /* If base is not loop invariant, then either off must be 0 (in which
3823 case we start with just the constant offset in the loop invariant BASE
3824 and continue with base as OFF), or we have to give up.
3825 We could handle the latter case by gimplifying the addition of base + off
3826 into some SSA_NAME and using that as off, but for now punt. */
3827 if (!expr_invariant_in_loop_p (loop, base))
3828 {
3829 if (!integer_zerop (off))
3830 return false;
3831 off = base;
3832 base = size_int (pbytepos);
3833 }
3834 /* Otherwise put base + constant offset into the loop invariant BASE
3835 and continue with OFF. */
3836 else
3837 {
3838 base = fold_convert (sizetype, base);
3839 base = size_binop (PLUS_EXPR, base, size_int (pbytepos));
3840 }
3841
3842 /* OFF at this point may be either a SSA_NAME or some tree expression
3843 from get_inner_reference. Try to peel off loop invariants from it
3844 into BASE as long as possible. */
3845 STRIP_NOPS (off);
3846 while (offtype == NULL_TREE)
3847 {
3848 enum tree_code code;
3849 tree op0, op1, add = NULL_TREE;
3850
3851 if (TREE_CODE (off) == SSA_NAME)
3852 {
3853 gimple *def_stmt = SSA_NAME_DEF_STMT (off);
3854
3855 if (expr_invariant_in_loop_p (loop, off))
3856 return false;
3857
3858 if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
3859 break;
3860
3861 op0 = gimple_assign_rhs1 (def_stmt);
3862 code = gimple_assign_rhs_code (def_stmt);
3863 op1 = gimple_assign_rhs2 (def_stmt);
3864 }
3865 else
3866 {
3867 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
3868 return false;
3869 code = TREE_CODE (off);
3870 extract_ops_from_tree (off, &code, &op0, &op1);
3871 }
3872 switch (code)
3873 {
3874 case POINTER_PLUS_EXPR:
3875 case PLUS_EXPR:
3876 if (expr_invariant_in_loop_p (loop, op0))
3877 {
3878 add = op0;
3879 off = op1;
3880 do_add:
3881 add = fold_convert (sizetype, add);
3882 if (scale != 1)
3883 add = size_binop (MULT_EXPR, add, size_int (scale));
3884 base = size_binop (PLUS_EXPR, base, add);
3885 continue;
3886 }
3887 if (expr_invariant_in_loop_p (loop, op1))
3888 {
3889 add = op1;
3890 off = op0;
3891 goto do_add;
3892 }
3893 break;
3894 case MINUS_EXPR:
3895 if (expr_invariant_in_loop_p (loop, op1))
3896 {
3897 add = fold_convert (sizetype, op1);
3898 add = size_binop (MINUS_EXPR, size_zero_node, add);
3899 off = op0;
3900 goto do_add;
3901 }
3902 break;
3903 case MULT_EXPR:
3904 if (scale == 1 && tree_fits_shwi_p (op1))
3905 {
3906 int new_scale = tree_to_shwi (op1);
3907 /* Only treat this as a scaling operation if the target
3908 supports it. */
3909 if (use_ifn_p
3910 && !vect_gather_scatter_fn_p (DR_IS_READ (dr), masked_p,
3911 vectype, memory_type, 1,
3912 TYPE_SIGN (TREE_TYPE (op0)),
3913 new_scale, &ifn,
3914 &element_type))
3915 break;
3916 scale = new_scale;
3917 off = op0;
3918 continue;
3919 }
3920 break;
3921 case SSA_NAME:
3922 off = op0;
3923 continue;
3924 CASE_CONVERT:
3925 if (!POINTER_TYPE_P (TREE_TYPE (op0))
3926 && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3927 break;
3928 if (TYPE_PRECISION (TREE_TYPE (op0))
3929 == TYPE_PRECISION (TREE_TYPE (off)))
3930 {
3931 off = op0;
3932 continue;
3933 }
3934
3935 /* The internal functions need the offset to be the same width
3936 as the elements of VECTYPE. Don't include operations that
3937 cast the offset from that width to a different width. */
3938 if (use_ifn_p
3939 && (int_size_in_bytes (TREE_TYPE (vectype))
3940 == int_size_in_bytes (TREE_TYPE (off))))
3941 break;
3942
3943 if (TYPE_PRECISION (TREE_TYPE (op0))
3944 < TYPE_PRECISION (TREE_TYPE (off)))
3945 {
3946 off = op0;
3947 offtype = TREE_TYPE (off);
3948 STRIP_NOPS (off);
3949 continue;
3950 }
3951 break;
3952 default:
3953 break;
3954 }
3955 break;
3956 }
3957
3958 /* If at the end OFF still isn't a SSA_NAME or isn't
3959 defined in the loop, punt. */
3960 if (TREE_CODE (off) != SSA_NAME
3961 || expr_invariant_in_loop_p (loop, off))
3962 return false;
3963
3964 if (offtype == NULL_TREE)
3965 offtype = TREE_TYPE (off);
3966
3967 if (use_ifn_p)
3968 {
3969 if (!vect_gather_scatter_fn_p (DR_IS_READ (dr), masked_p, vectype,
3970 memory_type, TYPE_PRECISION (offtype),
3971 TYPE_SIGN (offtype), scale, &ifn,
3972 &element_type))
3973 return false;
3974 }
3975 else
3976 {
3977 if (DR_IS_READ (dr))
3978 {
3979 if (targetm.vectorize.builtin_gather)
3980 decl = targetm.vectorize.builtin_gather (vectype, offtype, scale);
3981 }
3982 else
3983 {
3984 if (targetm.vectorize.builtin_scatter)
3985 decl = targetm.vectorize.builtin_scatter (vectype, offtype, scale);
3986 }
3987
3988 if (!decl)
3989 return false;
3990
3991 ifn = IFN_LAST;
3992 element_type = TREE_TYPE (vectype);
3993 }
3994
3995 info->ifn = ifn;
3996 info->decl = decl;
3997 info->base = base;
3998 info->offset = off;
3999 info->offset_dt = vect_unknown_def_type;
4000 info->offset_vectype = NULL_TREE;
4001 info->scale = scale;
4002 info->element_type = element_type;
4003 info->memory_type = memory_type;
4004 return true;
4005 }
4006
4007 /* Find the data references in STMT, analyze them with respect to LOOP and
4008 append them to DATAREFS. Return false if datarefs in this stmt cannot
4009 be handled. */
4010
4011 opt_result
4012 vect_find_stmt_data_reference (loop_p loop, gimple *stmt,
4013 vec<data_reference_p> *datarefs)
4014 {
4015 /* We can ignore clobbers for dataref analysis - they are removed during
4016 loop vectorization and BB vectorization checks dependences with a
4017 stmt walk. */
4018 if (gimple_clobber_p (stmt))
4019 return opt_result::success ();
4020
4021 if (gimple_has_volatile_ops (stmt))
4022 return opt_result::failure_at (stmt, "not vectorized: volatile type: %G",
4023 stmt);
4024
4025 if (stmt_can_throw_internal (cfun, stmt))
4026 return opt_result::failure_at (stmt,
4027 "not vectorized:"
4028 " statement can throw an exception: %G",
4029 stmt);
4030
4031 auto_vec<data_reference_p, 2> refs;
4032 opt_result res = find_data_references_in_stmt (loop, stmt, &refs);
4033 if (!res)
4034 return res;
4035
4036 if (refs.is_empty ())
4037 return opt_result::success ();
4038
4039 if (refs.length () > 1)
4040 return opt_result::failure_at (stmt,
4041 "not vectorized:"
4042 " more than one data ref in stmt: %G", stmt);
4043
4044 if (gcall *call = dyn_cast <gcall *> (stmt))
4045 if (!gimple_call_internal_p (call)
4046 || (gimple_call_internal_fn (call) != IFN_MASK_LOAD
4047 && gimple_call_internal_fn (call) != IFN_MASK_STORE))
4048 return opt_result::failure_at (stmt,
4049 "not vectorized: dr in a call %G", stmt);
4050
4051 data_reference_p dr = refs.pop ();
4052 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
4053 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
4054 return opt_result::failure_at (stmt,
4055 "not vectorized:"
4056 " statement is bitfield access %G", stmt);
4057
4058 if (DR_BASE_ADDRESS (dr)
4059 && TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
4060 return opt_result::failure_at (stmt,
4061 "not vectorized:"
4062 " base addr of dr is a constant\n");
4063
4064 /* Check whether this may be a SIMD lane access and adjust the
4065 DR to make it easier for us to handle it. */
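  /* A typical instance (illustrative) is the per-lane array that OpenMP
     lowering creates for a private variable of a "#pragma omp simd"
     loop, accessed as array[.GOMP_SIMD_LANE (simduid)]; the code below
     looks for the IFN_GOMP_SIMD_LANE call feeding the index and
     rewrites the data reference accordingly.  */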
4066 if (loop
4067 && loop->simduid
4068 && (!DR_BASE_ADDRESS (dr)
4069 || !DR_OFFSET (dr)
4070 || !DR_INIT (dr)
4071 || !DR_STEP (dr)))
4072 {
4073 struct data_reference *newdr
4074 = create_data_ref (NULL, loop_containing_stmt (stmt), DR_REF (dr), stmt,
4075 DR_IS_READ (dr), DR_IS_CONDITIONAL_IN_STMT (dr));
4076 if (DR_BASE_ADDRESS (newdr)
4077 && DR_OFFSET (newdr)
4078 && DR_INIT (newdr)
4079 && DR_STEP (newdr)
4080 && TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
4081 && integer_zerop (DR_STEP (newdr)))
4082 {
4083 tree base_address = DR_BASE_ADDRESS (newdr);
4084 tree off = DR_OFFSET (newdr);
4085 tree step = ssize_int (1);
4086 if (integer_zerop (off)
4087 && TREE_CODE (base_address) == POINTER_PLUS_EXPR)
4088 {
4089 off = TREE_OPERAND (base_address, 1);
4090 base_address = TREE_OPERAND (base_address, 0);
4091 }
4092 STRIP_NOPS (off);
4093 if (TREE_CODE (off) == MULT_EXPR
4094 && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
4095 {
4096 step = TREE_OPERAND (off, 1);
4097 off = TREE_OPERAND (off, 0);
4098 STRIP_NOPS (off);
4099 }
4100 if (CONVERT_EXPR_P (off)
4101 && (TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off, 0)))
4102 < TYPE_PRECISION (TREE_TYPE (off))))
4103 off = TREE_OPERAND (off, 0);
4104 if (TREE_CODE (off) == SSA_NAME)
4105 {
4106 gimple *def = SSA_NAME_DEF_STMT (off);
4107 /* Look through widening conversion. */
4108 if (is_gimple_assign (def)
4109 && CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (def)))
4110 {
4111 tree rhs1 = gimple_assign_rhs1 (def);
4112 if (TREE_CODE (rhs1) == SSA_NAME
4113 && INTEGRAL_TYPE_P (TREE_TYPE (rhs1))
4114 && (TYPE_PRECISION (TREE_TYPE (off))
4115 > TYPE_PRECISION (TREE_TYPE (rhs1))))
4116 def = SSA_NAME_DEF_STMT (rhs1);
4117 }
4118 if (is_gimple_call (def)
4119 && gimple_call_internal_p (def)
4120 && (gimple_call_internal_fn (def) == IFN_GOMP_SIMD_LANE))
4121 {
4122 tree arg = gimple_call_arg (def, 0);
4123 tree reft = TREE_TYPE (DR_REF (newdr));
4124 gcc_assert (TREE_CODE (arg) == SSA_NAME);
4125 arg = SSA_NAME_VAR (arg);
4126 if (arg == loop->simduid
4127 /* For now. */
4128 && tree_int_cst_equal (TYPE_SIZE_UNIT (reft), step))
4129 {
4130 DR_BASE_ADDRESS (newdr) = base_address;
4131 DR_OFFSET (newdr) = ssize_int (0);
4132 DR_STEP (newdr) = step;
4133 DR_OFFSET_ALIGNMENT (newdr) = BIGGEST_ALIGNMENT;
4134 DR_STEP_ALIGNMENT (newdr) = highest_pow2_factor (step);
4135 /* Mark as simd-lane access. */
4136 tree arg2 = gimple_call_arg (def, 1);
4137 newdr->aux = (void *) (-1 - tree_to_uhwi (arg2));
4138 free_data_ref (dr);
4139 datarefs->safe_push (newdr);
4140 return opt_result::success ();
4141 }
4142 }
4143 }
4144 }
4145 free_data_ref (newdr);
4146 }
4147
4148 datarefs->safe_push (dr);
4149 return opt_result::success ();
4150 }
4151
4152 /* Function vect_analyze_data_refs.
4153
4154 Find all the data references in the loop or basic block.
4155
4156 The general structure of the analysis of data refs in the vectorizer is as
4157 follows:
4158 1- vect_analyze_data_refs(loop/bb): call
4159 compute_data_dependences_for_loop/bb to find and analyze all data-refs
4160 in the loop/bb and their dependences.
4161 2- vect_analyze_dependences(): apply dependence testing using ddrs.
4162 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
4163 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
4164
4165 */
4166
4167 opt_result
4168 vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf, bool *fatal)
4169 {
4170 class loop *loop = NULL;
4171 unsigned int i;
4172 struct data_reference *dr;
4173 tree scalar_type;
4174
4175 DUMP_VECT_SCOPE ("vect_analyze_data_refs");
4176
4177 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
4178 loop = LOOP_VINFO_LOOP (loop_vinfo);
4179
4180 /* Go through the data-refs, check that the analysis succeeded. Update
4181 pointer from stmt_vec_info struct to DR and vectype. */
4182
4183 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
4184 FOR_EACH_VEC_ELT (datarefs, i, dr)
4185 {
4186 enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
4187 poly_uint64 vf;
4188
4189 gcc_assert (DR_REF (dr));
4190 stmt_vec_info stmt_info = vinfo->lookup_stmt (DR_STMT (dr));
4191 gcc_assert (!stmt_info->dr_aux.dr);
4192 stmt_info->dr_aux.dr = dr;
4193 stmt_info->dr_aux.stmt = stmt_info;
4194
4195 /* Check that analysis of the data-ref succeeded. */
4196 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
4197 || !DR_STEP (dr))
4198 {
4199 bool maybe_gather
4200 = DR_IS_READ (dr)
4201 && !TREE_THIS_VOLATILE (DR_REF (dr))
4202 && (targetm.vectorize.builtin_gather != NULL
4203 || supports_vec_gather_load_p ());
4204 bool maybe_scatter
4205 = DR_IS_WRITE (dr)
4206 && !TREE_THIS_VOLATILE (DR_REF (dr))
4207 && (targetm.vectorize.builtin_scatter != NULL
4208 || supports_vec_scatter_store_p ());
4209
4210 /* If the target supports vector gather loads or scatter stores,
4211 see if they can be used. */
4212 if (is_a <loop_vec_info> (vinfo)
4213 && !nested_in_vect_loop_p (loop, stmt_info))
4214 {
4215 if (maybe_gather || maybe_scatter)
4216 {
4217 if (maybe_gather)
4218 gatherscatter = GATHER;
4219 else
4220 gatherscatter = SCATTER;
4221 }
4222 }
4223
4224 if (gatherscatter == SG_NONE)
4225 {
4226 if (dump_enabled_p ())
4227 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4228 "not vectorized: data ref analysis "
4229 "failed %G", stmt_info->stmt);
4230 if (is_a <bb_vec_info> (vinfo))
4231 {
4232 /* In BB vectorization the ref can still participate
4233 in dependence analysis, we just can't vectorize it. */
4234 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4235 continue;
4236 }
4237 return opt_result::failure_at (stmt_info->stmt,
4238 "not vectorized:"
4239 " data ref analysis failed: %G",
4240 stmt_info->stmt);
4241 }
4242 }
4243
4244 /* See if this was detected as SIMD lane access. */
4245 if (dr->aux == (void *)-1
4246 || dr->aux == (void *)-2
4247 || dr->aux == (void *)-3
4248 || dr->aux == (void *)-4)
4249 {
4250 if (nested_in_vect_loop_p (loop, stmt_info))
4251 return opt_result::failure_at (stmt_info->stmt,
4252 "not vectorized:"
4253 " data ref analysis failed: %G",
4254 stmt_info->stmt);
4255 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info)
4256 = -(uintptr_t) dr->aux;
4257 }
4258
4259 tree base = get_base_address (DR_REF (dr));
4260 if (base && VAR_P (base) && DECL_NONALIASED (base))
4261 {
4262 if (dump_enabled_p ())
4263 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4264 "not vectorized: base object not addressable "
4265 "for stmt: %G", stmt_info->stmt);
4266 if (is_a <bb_vec_info> (vinfo))
4267 {
4268 /* In BB vectorization the ref can still participate
4269 in dependence analysis, we just can't vectorize it. */
4270 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4271 continue;
4272 }
4273 return opt_result::failure_at (stmt_info->stmt,
4274 "not vectorized: base object not"
4275 " addressable for stmt: %G",
4276 stmt_info->stmt);
4277 }
4278
4279 if (is_a <loop_vec_info> (vinfo)
4280 && DR_STEP (dr)
4281 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
4282 {
4283 if (nested_in_vect_loop_p (loop, stmt_info))
4284 return opt_result::failure_at (stmt_info->stmt,
4285 "not vectorized: "
4286 "not suitable for strided load %G",
4287 stmt_info->stmt);
4288 STMT_VINFO_STRIDED_P (stmt_info) = true;
4289 }
4290
4291 /* Update DR field in stmt_vec_info struct. */
4292
4293 /* If the dataref is in an inner-loop of the loop that is considered
4294 for vectorization, we also want to analyze the access relative to
4295 the outer-loop (DR contains information only relative to the
4296 inner-most enclosing loop). We do that by building a reference to the
4297 first location accessed by the inner-loop, and analyze it relative to
4298 the outer-loop. */
4299 if (loop && nested_in_vect_loop_p (loop, stmt_info))
4300 {
4301 /* Build a reference to the first location accessed by the
4302 inner loop: *(BASE + INIT + OFFSET). By construction,
4303 this address must be invariant in the inner loop, so we
4304 can consider it as being used in the outer loop. */
4305 tree base = unshare_expr (DR_BASE_ADDRESS (dr));
4306 tree offset = unshare_expr (DR_OFFSET (dr));
4307 tree init = unshare_expr (DR_INIT (dr));
4308 tree init_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset),
4309 init, offset);
4310 tree init_addr = fold_build_pointer_plus (base, init_offset);
4311 tree init_ref = build_fold_indirect_ref (init_addr);
4312
4313 if (dump_enabled_p ())
4314 dump_printf_loc (MSG_NOTE, vect_location,
4315 "analyze in outer loop: %T\n", init_ref);
4316
4317 opt_result res
4318 = dr_analyze_innermost (&STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info),
4319 init_ref, loop, stmt_info->stmt);
4320 if (!res)
4321 /* dr_analyze_innermost already explained the failure. */
4322 return res;
4323
4324 if (dump_enabled_p ())
4325 dump_printf_loc (MSG_NOTE, vect_location,
4326 "\touter base_address: %T\n"
4327 "\touter offset from base address: %T\n"
4328 "\touter constant offset from base address: %T\n"
4329 "\touter step: %T\n"
4330 "\touter base alignment: %d\n\n"
4331 "\touter base misalignment: %d\n"
4332 "\touter offset alignment: %d\n"
4333 "\touter step alignment: %d\n",
4334 STMT_VINFO_DR_BASE_ADDRESS (stmt_info),
4335 STMT_VINFO_DR_OFFSET (stmt_info),
4336 STMT_VINFO_DR_INIT (stmt_info),
4337 STMT_VINFO_DR_STEP (stmt_info),
4338 STMT_VINFO_DR_BASE_ALIGNMENT (stmt_info),
4339 STMT_VINFO_DR_BASE_MISALIGNMENT (stmt_info),
4340 STMT_VINFO_DR_OFFSET_ALIGNMENT (stmt_info),
4341 STMT_VINFO_DR_STEP_ALIGNMENT (stmt_info));
4342 }
4343
4344 /* Set vectype for STMT. */
4345 scalar_type = TREE_TYPE (DR_REF (dr));
4346 STMT_VINFO_VECTYPE (stmt_info)
4347 = get_vectype_for_scalar_type (vinfo, scalar_type);
4348 if (!STMT_VINFO_VECTYPE (stmt_info))
4349 {
4350 if (dump_enabled_p ())
4351 {
4352 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4353 "not vectorized: no vectype for stmt: %G",
4354 stmt_info->stmt);
4355 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
4356 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
4357 scalar_type);
4358 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
4359 }
4360
4361 if (is_a <bb_vec_info> (vinfo))
4362 {
4363 /* No vector type is fine, the ref can still participate
4364 in dependence analysis, we just can't vectorize it. */
4365 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4366 continue;
4367 }
4368 if (fatal)
4369 *fatal = false;
4370 return opt_result::failure_at (stmt_info->stmt,
4371 "not vectorized:"
4372 " no vectype for stmt: %G"
4373 " scalar_type: %T\n",
4374 stmt_info->stmt, scalar_type);
4375 }
4376 else
4377 {
4378 if (dump_enabled_p ())
4379 dump_printf_loc (MSG_NOTE, vect_location,
4380 "got vectype for stmt: %G%T\n",
4381 stmt_info->stmt, STMT_VINFO_VECTYPE (stmt_info));
4382 }
4383
4384 /* Adjust the minimal vectorization factor according to the
4385 vector type. */
4386 vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
4387 *min_vf = upper_bound (*min_vf, vf);
4388
4389 if (gatherscatter != SG_NONE)
4390 {
4391 gather_scatter_info gs_info;
4392 if (!vect_check_gather_scatter (stmt_info,
4393 as_a <loop_vec_info> (vinfo),
4394 &gs_info)
4395 || !get_vectype_for_scalar_type (vinfo,
4396 TREE_TYPE (gs_info.offset)))
4397 {
4398 if (fatal)
4399 *fatal = false;
4400 return opt_result::failure_at
4401 (stmt_info->stmt,
4402 (gatherscatter == GATHER)
4403 ? "not vectorized: not suitable for gather load %G"
4404 : "not vectorized: not suitable for scatter store %G",
4405 stmt_info->stmt);
4406 }
4407 STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
4408 }
4409 }
4410
4411 /* We used to stop processing and prune the list here. Verify we no
4412 longer need to. */
4413 gcc_assert (i == datarefs.length ());
4414
4415 return opt_result::success ();
4416 }
4417
4418
4419 /* Function vect_get_new_vect_var.
4420
4421 Returns a name for a new variable. The current naming scheme prepends
4422 a prefix ("vect", "stmp", "mask" or "vectp", depending on the value of
4423 VAR_KIND) and an underscore to NAME if provided, and uses the bare
4424 prefix when no NAME is given. */
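/* A sketch of the resulting names (arguments invented for illustration):
   vect_get_new_vect_var (ptr_type, vect_pointer_var, "in") creates a
   temporary whose base name is "vectp_in", while passing a NULL NAME
   yields one named just "vectp".  */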
4425
4426 tree
4427 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
4428 {
4429 const char *prefix;
4430 tree new_vect_var;
4431
4432 switch (var_kind)
4433 {
4434 case vect_simple_var:
4435 prefix = "vect";
4436 break;
4437 case vect_scalar_var:
4438 prefix = "stmp";
4439 break;
4440 case vect_mask_var:
4441 prefix = "mask";
4442 break;
4443 case vect_pointer_var:
4444 prefix = "vectp";
4445 break;
4446 default:
4447 gcc_unreachable ();
4448 }
4449
4450 if (name)
4451 {
4452 char* tmp = concat (prefix, "_", name, NULL);
4453 new_vect_var = create_tmp_reg (type, tmp);
4454 free (tmp);
4455 }
4456 else
4457 new_vect_var = create_tmp_reg (type, prefix);
4458
4459 return new_vect_var;
4460 }
4461
4462 /* Like vect_get_new_vect_var but return an SSA name. */
4463
4464 tree
4465 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
4466 {
4467 const char *prefix;
4468 tree new_vect_var;
4469
4470 switch (var_kind)
4471 {
4472 case vect_simple_var:
4473 prefix = "vect";
4474 break;
4475 case vect_scalar_var:
4476 prefix = "stmp";
4477 break;
4478 case vect_pointer_var:
4479 prefix = "vectp";
4480 break;
4481 default:
4482 gcc_unreachable ();
4483 }
4484
4485 if (name)
4486 {
4487 char* tmp = concat (prefix, "_", name, NULL);
4488 new_vect_var = make_temp_ssa_name (type, NULL, tmp);
4489 free (tmp);
4490 }
4491 else
4492 new_vect_var = make_temp_ssa_name (type, NULL, prefix);
4493
4494 return new_vect_var;
4495 }
4496
4497 /* Duplicate ptr info and set alignment/misalignment on NAME from DR_INFO. */
4498
4499 static void
4500 vect_duplicate_ssa_name_ptr_info (tree name, dr_vec_info *dr_info)
4501 {
4502 duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr_info->dr));
4503 int misalign = DR_MISALIGNMENT (dr_info);
4504 if (misalign == DR_MISALIGNMENT_UNKNOWN)
4505 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
4506 else
4507 set_ptr_info_alignment (SSA_NAME_PTR_INFO (name),
4508 known_alignment (DR_TARGET_ALIGNMENT (dr_info)),
4509 misalign);
4510 }
4511
4512 /* Function vect_create_addr_base_for_vector_ref.
4513
4514 Create an expression that computes the address of the first memory location
4515 that will be accessed for a data reference.
4516
4517 Input:
4518 STMT_INFO: The statement containing the data reference.
4519 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
4520 OFFSET: Optional. If supplied, it is added to the initial address.
4521 LOOP: Specify relative to which loop-nest should the address be computed.
4522 For example, when the dataref is in an inner-loop nested in an
4523 outer-loop that is now being vectorized, LOOP can be either the
4524 outer-loop, or the inner-loop. The first memory location accessed
4525 by the following dataref ('in' points to short):
4526
4527 for (i=0; i<N; i++)
4528 for (j=0; j<M; j++)
4529 s += in[i+j]
4530
4531 is as follows:
4532 if LOOP=i_loop: &in (relative to i_loop)
4533 if LOOP=j_loop: &in+i*2B (relative to j_loop)
4534 BYTE_OFFSET: Optional, defaulted to NULL. If supplied, it is added to the
4535 initial address. Unlike OFFSET, which is a number of elements to
4536 be added, BYTE_OFFSET is measured in bytes.
4537
4538 Output:
4539 1. Return an SSA_NAME whose value is the address of the memory location of
4540 the first vector of the data reference.
4541 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
4542 these statement(s) which define the returned SSA_NAME.
4543
4544 FORNOW: We are only handling array accesses with step 1. */
4545
4546 tree
4547 vect_create_addr_base_for_vector_ref (stmt_vec_info stmt_info,
4548 gimple_seq *new_stmt_list,
4549 tree offset,
4550 tree byte_offset)
4551 {
4552 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
4553 struct data_reference *dr = dr_info->dr;
4554 const char *base_name;
4555 tree addr_base;
4556 tree dest;
4557 gimple_seq seq = NULL;
4558 tree vect_ptr_type;
4559 tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
4560 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4561 innermost_loop_behavior *drb = vect_dr_behavior (dr_info);
4562
4563 tree data_ref_base = unshare_expr (drb->base_address);
4564 tree base_offset = unshare_expr (drb->offset);
4565 tree init = unshare_expr (drb->init);
4566
4567 if (loop_vinfo)
4568 base_name = get_name (data_ref_base);
4569 else
4570 {
4571 base_offset = ssize_int (0);
4572 init = ssize_int (0);
4573 base_name = get_name (DR_REF (dr));
4574 }
4575
4576 /* Create base_offset */
4577 base_offset = size_binop (PLUS_EXPR,
4578 fold_convert (sizetype, base_offset),
4579 fold_convert (sizetype, init));
4580
4581 if (offset)
4582 {
4583 offset = fold_build2 (MULT_EXPR, sizetype,
4584 fold_convert (sizetype, offset), step);
4585 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4586 base_offset, offset);
4587 }
4588 if (byte_offset)
4589 {
4590 byte_offset = fold_convert (sizetype, byte_offset);
4591 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4592 base_offset, byte_offset);
4593 }
4594
4595 /* base + base_offset */
4596 if (loop_vinfo)
4597 addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
4598 else
4599 {
4600 addr_base = build1 (ADDR_EXPR,
4601 build_pointer_type (TREE_TYPE (DR_REF (dr))),
4602 unshare_expr (DR_REF (dr)));
4603 }
4604
4605 vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
4606 dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
4607 addr_base = force_gimple_operand (addr_base, &seq, true, dest);
4608 gimple_seq_add_seq (new_stmt_list, seq);
4609
4610 if (DR_PTR_INFO (dr)
4611 && TREE_CODE (addr_base) == SSA_NAME
4612 && !SSA_NAME_PTR_INFO (addr_base))
4613 {
4614 vect_duplicate_ssa_name_ptr_info (addr_base, dr_info);
4615 if (offset || byte_offset)
4616 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
4617 }
4618
4619 if (dump_enabled_p ())
4620 dump_printf_loc (MSG_NOTE, vect_location, "created %T\n", addr_base);
4621
4622 return addr_base;
4623 }
4624
4625
4626 /* Function vect_create_data_ref_ptr.
4627
4628 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
4629 location accessed in the loop by STMT_INFO, along with the def-use update
4630 chain to appropriately advance the pointer through the loop iterations.
4631 Also set aliasing information for the pointer. This pointer is used by
4632 the callers to this function to create a memory reference expression for
4633 vector load/store access.
4634
4635 Input:
4636 1. STMT_INFO: a stmt that references memory. Expected to be of the form
4637 GIMPLE_ASSIGN <name, data-ref> or
4638 GIMPLE_ASSIGN <data-ref, name>.
4639 2. AGGR_TYPE: the type of the reference, which should be either a vector
4640 or an array.
4641 3. AT_LOOP: the loop where the vector memref is to be created.
4642 4. OFFSET (optional): an offset to be added to the initial address accessed
4643 by the data-ref in STMT_INFO.
4644 5. BSI: location where the new stmts are to be placed if there is no loop
4645 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
4646 pointing to the initial address.
4647 7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
4648 to the initial address accessed by the data-ref in STMT_INFO. This is
4649 similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
4650 in bytes.
4651 8. IV_STEP (optional, defaults to NULL): the amount that should be added
4652 to the IV during each iteration of the loop. NULL says to move
4653 by one copy of AGGR_TYPE up or down, depending on the step of the
4654 data reference.
4655
4656 Output:
4657 1. Declare a new ptr to vector_type, and have it point to the base of the
4658 data reference (initial address accessed by the data reference).
4659 For example, for vector of type V8HI, the following code is generated:
4660
4661 v8hi *ap;
4662 ap = (v8hi *)initial_address;
4663
4664 if OFFSET is not supplied:
4665 initial_address = &a[init];
4666 if OFFSET is supplied:
4667 initial_address = &a[init + OFFSET];
4668 if BYTE_OFFSET is supplied:
4669 initial_address = &a[init] + BYTE_OFFSET;
4670
4671 Return the initial_address in INITIAL_ADDRESS.
4672
4673 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
4674 update the pointer in each iteration of the loop.
4675
4676 Return the increment stmt that updates the pointer in PTR_INCR.
4677
4678 3. Return the pointer. */
4679
4680 tree
4681 vect_create_data_ref_ptr (stmt_vec_info stmt_info, tree aggr_type,
4682 class loop *at_loop, tree offset,
4683 tree *initial_address, gimple_stmt_iterator *gsi,
4684 gimple **ptr_incr, bool only_init,
4685 tree byte_offset, tree iv_step)
4686 {
4687 const char *base_name;
4688 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4689 class loop *loop = NULL;
4690 bool nested_in_vect_loop = false;
4691 class loop *containing_loop = NULL;
4692 tree aggr_ptr_type;
4693 tree aggr_ptr;
4694 tree new_temp;
4695 gimple_seq new_stmt_list = NULL;
4696 edge pe = NULL;
4697 basic_block new_bb;
4698 tree aggr_ptr_init;
4699 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
4700 struct data_reference *dr = dr_info->dr;
4701 tree aptr;
4702 gimple_stmt_iterator incr_gsi;
4703 bool insert_after;
4704 tree indx_before_incr, indx_after_incr;
4705 gimple *incr;
4706 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
4707
4708 gcc_assert (iv_step != NULL_TREE
4709 || TREE_CODE (aggr_type) == ARRAY_TYPE
4710 || TREE_CODE (aggr_type) == VECTOR_TYPE);
4711
4712 if (loop_vinfo)
4713 {
4714 loop = LOOP_VINFO_LOOP (loop_vinfo);
4715 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info);
4716 containing_loop = (gimple_bb (stmt_info->stmt))->loop_father;
4717 pe = loop_preheader_edge (loop);
4718 }
4719 else
4720 {
4721 gcc_assert (bb_vinfo);
4722 only_init = true;
4723 *ptr_incr = NULL;
4724 }
4725
4726 /* Create an expression for the first address accessed by this load
4727 in LOOP. */
4728 base_name = get_name (DR_BASE_ADDRESS (dr));
4729
4730 if (dump_enabled_p ())
4731 {
4732 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4733 dump_printf_loc (MSG_NOTE, vect_location,
4734 "create %s-pointer variable to type: %T",
4735 get_tree_code_name (TREE_CODE (aggr_type)),
4736 aggr_type);
4737 if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4738 dump_printf (MSG_NOTE, " vectorizing an array ref: ");
4739 else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4740 dump_printf (MSG_NOTE, " vectorizing a vector ref: ");
4741 else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4742 dump_printf (MSG_NOTE, " vectorizing a record based array ref: ");
4743 else
4744 dump_printf (MSG_NOTE, " vectorizing a pointer ref: ");
4745 dump_printf (MSG_NOTE, "%T\n", DR_BASE_OBJECT (dr));
4746 }
4747
4748 /* (1) Create the new aggregate-pointer variable.
4749 Vector and array types inherit the alias set of their component
4750 type by default so we need to use a ref-all pointer if the data
4751 reference does not conflict with the created aggregated data
4752 reference because it is not addressable. */
4753 bool need_ref_all = false;
4754 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4755 get_alias_set (DR_REF (dr))))
4756 need_ref_all = true;
4757 /* Likewise for any of the data references in the stmt group. */
4758 else if (DR_GROUP_SIZE (stmt_info) > 1)
4759 {
4760 stmt_vec_info sinfo = DR_GROUP_FIRST_ELEMENT (stmt_info);
4761 do
4762 {
4763 struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
4764 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4765 get_alias_set (DR_REF (sdr))))
4766 {
4767 need_ref_all = true;
4768 break;
4769 }
4770 sinfo = DR_GROUP_NEXT_ELEMENT (sinfo);
4771 }
4772 while (sinfo);
4773 }
4774 aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
4775 need_ref_all);
4776 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
4777
4778
4779 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4780 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4781 def-use update cycles for the pointer: one relative to the outer-loop
4782 (LOOP), which is what steps (2) and (3) below do. The other is relative
4783 to the inner-loop (which is the inner-most loop containing the dataref),
4784 and this is done by step (4) below.
4785
4786 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4787 inner-most loop, and so steps (2),(3) work the same, and step (4) is
4788 redundant. Steps (2),(3) create the following:
4789
4790 vp0 = &base_addr;
4791 LOOP: vp1 = phi(vp0,vp2)
4792 ...
4793 ...
4794 vp2 = vp1 + step
4795 goto LOOP
4796
4797 If there is an inner-loop nested in loop, then step (4) will also be
4798 applied, and an additional update in the inner-loop will be created:
4799
4800 vp0 = &base_addr;
4801 LOOP: vp1 = phi(vp0,vp2)
4802 ...
4803 inner: vp3 = phi(vp1,vp4)
4804 vp4 = vp3 + inner_step
4805 if () goto inner
4806 ...
4807 vp2 = vp1 + step
4808 if () goto LOOP */
4809
4810 /* (2) Calculate the initial address of the aggregate-pointer, and set
4811 the aggregate-pointer to point to it before the loop. */
4812
4813 /* Create: &(base[init_val+offset]) + byte_offset in the loop preheader. */
4814
4815 new_temp = vect_create_addr_base_for_vector_ref (stmt_info, &new_stmt_list,
4816 offset, byte_offset);
4817 if (new_stmt_list)
4818 {
4819 if (pe)
4820 {
4821 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
4822 gcc_assert (!new_bb);
4823 }
4824 else
4825 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
4826 }
4827
4828 *initial_address = new_temp;
4829 aggr_ptr_init = new_temp;
4830
4831 /* (3) Handle the updating of the aggregate-pointer inside the loop.
4832 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4833 inner-loop nested in LOOP (during outer-loop vectorization). */
4834
4835 /* No update in loop is required. */
4836 if (only_init && (!loop_vinfo || at_loop == loop))
4837 aptr = aggr_ptr_init;
4838 else
4839 {
4840 /* Accesses to invariant addresses should be handled specially
4841 by the caller. */
4842 tree step = vect_dr_behavior (dr_info)->step;
4843 gcc_assert (!integer_zerop (step));
4844
4845 if (iv_step == NULL_TREE)
4846 {
4847 /* The step of the aggregate pointer is the type size,
4848 negated for downward accesses. */
4849 iv_step = TYPE_SIZE_UNIT (aggr_type);
4850 if (tree_int_cst_sgn (step) == -1)
4851 iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
4852 }
4853
4854 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4855
4856 create_iv (aggr_ptr_init,
4857 fold_convert (aggr_ptr_type, iv_step),
4858 aggr_ptr, loop, &incr_gsi, insert_after,
4859 &indx_before_incr, &indx_after_incr);
4860 incr = gsi_stmt (incr_gsi);
4861 loop_vinfo->add_stmt (incr);
4862
4863 /* Copy the points-to information if it exists. */
4864 if (DR_PTR_INFO (dr))
4865 {
4866 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr_info);
4867 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr_info);
4868 }
4869 if (ptr_incr)
4870 *ptr_incr = incr;
4871
4872 aptr = indx_before_incr;
4873 }
4874
4875 if (!nested_in_vect_loop || only_init)
4876 return aptr;
4877
4878
4879 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4880 nested in LOOP, if one exists. */
4881
4882 gcc_assert (nested_in_vect_loop);
4883 if (!only_init)
4884 {
4885 standard_iv_increment_position (containing_loop, &incr_gsi,
4886 &insert_after);
4887 create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
4888 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
4889 &indx_after_incr);
4890 incr = gsi_stmt (incr_gsi);
4891 loop_vinfo->add_stmt (incr);
4892
4893 /* Copy the points-to information if it exists. */
4894 if (DR_PTR_INFO (dr))
4895 {
4896 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr_info);
4897 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr_info);
4898 }
4899 if (ptr_incr)
4900 *ptr_incr = incr;
4901
4902 return indx_before_incr;
4903 }
4904 else
4905 gcc_unreachable ();
4906 }
4907
4908
4909 /* Function bump_vector_ptr
4910
4911 Increment a pointer (to a vector type) by vector-size. If requested,
4912 i.e. if PTR_INCR is given, then also connect the new increment stmt
4913 to the existing def-use update-chain of the pointer, by modifying
4914 the PTR_INCR as illustrated below:
4915
4916 The pointer def-use update-chain before this function:
4917 DATAREF_PTR = phi (p_0, p_2)
4918 ....
4919 PTR_INCR: p_2 = DATAREF_PTR + step
4920
4921 The pointer def-use update-chain after this function:
4922 DATAREF_PTR = phi (p_0, p_2)
4923 ....
4924 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4925 ....
4926 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
4927
4928 Input:
4929 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4930 in the loop.
4931 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4932 the loop. The increment amount across iterations is expected
4933 to be vector_size.
4934 BSI - location where the new update stmt is to be placed.
4935 STMT_INFO - the original scalar memory-access stmt that is being vectorized.
4936 BUMP - optional. The offset by which to bump the pointer. If not given,
4937 the offset is assumed to be vector_size.
4938
4939 Output: Return NEW_DATAREF_PTR as illustrated above.
4940
4941 */
4942
4943 tree
4944 bump_vector_ptr (tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
4945 stmt_vec_info stmt_info, tree bump)
4946 {
4947 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4948 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4949 tree update = TYPE_SIZE_UNIT (vectype);
4950 gassign *incr_stmt;
4951 ssa_op_iter iter;
4952 use_operand_p use_p;
4953 tree new_dataref_ptr;
4954
4955 if (bump)
4956 update = bump;
4957
4958 if (TREE_CODE (dataref_ptr) == SSA_NAME)
4959 new_dataref_ptr = copy_ssa_name (dataref_ptr);
4960 else
4961 new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
4962 incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
4963 dataref_ptr, update);
4964 vect_finish_stmt_generation (stmt_info, incr_stmt, gsi);
4965
4966 /* Copy the points-to information if it exists. */
4967 if (DR_PTR_INFO (dr))
4968 {
4969 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
4970 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
4971 }
4972
4973 if (!ptr_incr)
4974 return new_dataref_ptr;
4975
4976 /* Update the vector-pointer's cross-iteration increment. */
4977 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
4978 {
4979 tree use = USE_FROM_PTR (use_p);
4980
4981 if (use == dataref_ptr)
4982 SET_USE (use_p, new_dataref_ptr);
4983 else
4984 gcc_assert (operand_equal_p (use, update, 0));
4985 }
4986
4987 return new_dataref_ptr;
4988 }
4989
4990
4991 /* Copy memory reference info such as base/clique from the SRC reference
4992 to the DEST MEM_REF. */
4993
4994 void
4995 vect_copy_ref_info (tree dest, tree src)
4996 {
4997 if (TREE_CODE (dest) != MEM_REF)
4998 return;
4999
5000 tree src_base = src;
5001 while (handled_component_p (src_base))
5002 src_base = TREE_OPERAND (src_base, 0);
5003 if (TREE_CODE (src_base) != MEM_REF
5004 && TREE_CODE (src_base) != TARGET_MEM_REF)
5005 return;
5006
5007 MR_DEPENDENCE_CLIQUE (dest) = MR_DEPENDENCE_CLIQUE (src_base);
5008 MR_DEPENDENCE_BASE (dest) = MR_DEPENDENCE_BASE (src_base);
5009 }
5010
5011
5012 /* Function vect_create_destination_var.
5013
5014 Create a new temporary of type VECTYPE. */
5015
5016 tree
5017 vect_create_destination_var (tree scalar_dest, tree vectype)
5018 {
5019 tree vec_dest;
5020 const char *name;
5021 char *new_name;
5022 tree type;
5023 enum vect_var_kind kind;
5024
5025 kind = vectype
5026 ? VECTOR_BOOLEAN_TYPE_P (vectype)
5027 ? vect_mask_var
5028 : vect_simple_var
5029 : vect_scalar_var;
5030 type = vectype ? vectype : TREE_TYPE (scalar_dest);
5031
5032 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
5033
5034 name = get_name (scalar_dest);
5035 if (name)
5036 new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
5037 else
5038 new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
5039 vec_dest = vect_get_new_vect_var (type, kind, new_name);
5040 free (new_name);
5041
5042 return vec_dest;
5043 }
5044
5045 /* Function vect_grouped_store_supported.
5046
5047 Returns TRUE if interleave high and interleave low permutations
5048 are supported, and FALSE otherwise. */
5049
5050 bool
5051 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
5052 {
5053 machine_mode mode = TYPE_MODE (vectype);
5054
5055 /* vect_permute_store_chain requires the group size to be equal to 3 or
5056 be a power of two. */
5057 if (count != 3 && exact_log2 (count) == -1)
5058 {
5059 if (dump_enabled_p ())
5060 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5061 "the size of the group of accesses"
5062 " is not a power of 2 or not equal to 3\n");
5063 return false;
5064 }
5065
5066 /* Check that the permutation is supported. */
5067 if (VECTOR_MODE_P (mode))
5068 {
5069 unsigned int i;
5070 if (count == 3)
5071 {
5072 unsigned int j0 = 0, j1 = 0, j2 = 0;
5073 unsigned int i, j;
5074
5075 unsigned int nelt;
5076 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5077 {
5078 if (dump_enabled_p ())
5079 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5080 "cannot handle groups of 3 stores for"
5081 " variable-length vectors\n");
5082 return false;
5083 }
5084
5085 vec_perm_builder sel (nelt, nelt, 1);
5086 sel.quick_grow (nelt);
5087 vec_perm_indices indices;
5088 for (j = 0; j < 3; j++)
5089 {
5090 int nelt0 = ((3 - j) * nelt) % 3;
5091 int nelt1 = ((3 - j) * nelt + 1) % 3;
5092 int nelt2 = ((3 - j) * nelt + 2) % 3;
5093 for (i = 0; i < nelt; i++)
5094 {
5095 if (3 * i + nelt0 < nelt)
5096 sel[3 * i + nelt0] = j0++;
5097 if (3 * i + nelt1 < nelt)
5098 sel[3 * i + nelt1] = nelt + j1++;
5099 if (3 * i + nelt2 < nelt)
5100 sel[3 * i + nelt2] = 0;
5101 }
5102 indices.new_vector (sel, 2, nelt);
5103 if (!can_vec_perm_const_p (mode, indices))
5104 {
5105 if (dump_enabled_p ())
5106 dump_printf (MSG_MISSED_OPTIMIZATION,
5107 "permutation op not supported by target.\n");
5108 return false;
5109 }
5110
5111 for (i = 0; i < nelt; i++)
5112 {
5113 if (3 * i + nelt0 < nelt)
5114 sel[3 * i + nelt0] = 3 * i + nelt0;
5115 if (3 * i + nelt1 < nelt)
5116 sel[3 * i + nelt1] = 3 * i + nelt1;
5117 if (3 * i + nelt2 < nelt)
5118 sel[3 * i + nelt2] = nelt + j2++;
5119 }
5120 indices.new_vector (sel, 2, nelt);
5121 if (!can_vec_perm_const_p (mode, indices))
5122 {
5123 if (dump_enabled_p ())
5124 dump_printf (MSG_MISSED_OPTIMIZATION,
5125 "permutation op not supported by target.\n");
5126 return false;
5127 }
5128 }
5129 return true;
5130 }
5131 else
5132 {
5133 /* If length is not equal to 3 then only a power of 2 is supported. */
5134 gcc_assert (pow2p_hwi (count));
5135 poly_uint64 nelt = GET_MODE_NUNITS (mode);
5136
5137 /* The encoding has 2 interleaved stepped patterns. */
5138 vec_perm_builder sel (nelt, 2, 3);
5139 sel.quick_grow (6);
5140 for (i = 0; i < 3; i++)
5141 {
5142 sel[i * 2] = i;
5143 sel[i * 2 + 1] = i + nelt;
5144 }
5145 vec_perm_indices indices (sel, 2, nelt);
5146 if (can_vec_perm_const_p (mode, indices))
5147 {
5148 for (i = 0; i < 6; i++)
5149 sel[i] += exact_div (nelt, 2);
5150 indices.new_vector (sel, 2, nelt);
5151 if (can_vec_perm_const_p (mode, indices))
5152 return true;
5153 }
5154 }
5155 }
5156
5157 if (dump_enabled_p ())
5158 dump_printf (MSG_MISSED_OPTIMIZATION,
5159 "permutation op not supported by target.\n");
5160 return false;
5161 }
5162
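/* A minimal stand-alone sketch (not compiled as part of GCC) of the
   two-operand permute whose selectors are built and queried above via
   can_vec_perm_const_p: element I of the result comes from the first
   input when SEL[I] < NELT and from the second input otherwise.  The
   array size and the driver values are illustrative assumptions only.  */
#if 0
#include <stdio.h>

#define NELT 8

static void
vec_perm_emulate (const int *a, const int *b, const unsigned *sel, int *out)
{
  for (unsigned i = 0; i < NELT; i++)
    out[i] = sel[i] < NELT ? a[sel[i]] : b[sel[i] - NELT];
}

int
main (void)
{
  int a[NELT] = { 0, 1, 2, 3, 4, 5, 6, 7 };
  int b[NELT] = { 8, 9, 10, 11, 12, 13, 14, 15 };
  /* The interleave-high selector {0, NELT, 1, NELT + 1, ...}.  */
  unsigned sel[NELT] = { 0, 8, 1, 9, 2, 10, 3, 11 };
  int out[NELT];

  vec_perm_emulate (a, b, sel, out);
  for (unsigned i = 0; i < NELT; i++)
    printf ("%d ", out[i]);		/* Prints: 0 8 1 9 2 10 3 11  */
  printf ("\n");
  return 0;
}
#endif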
5163
5164 /* Return TRUE if vec_{mask_}store_lanes is available for COUNT vectors of
5165 type VECTYPE. MASKED_P says whether the masked form is needed. */
5166
5167 bool
5168 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5169 bool masked_p)
5170 {
5171 if (masked_p)
5172 return vect_lanes_optab_supported_p ("vec_mask_store_lanes",
5173 vec_mask_store_lanes_optab,
5174 vectype, count);
5175 else
5176 return vect_lanes_optab_supported_p ("vec_store_lanes",
5177 vec_store_lanes_optab,
5178 vectype, count);
5179 }
5180
5181
5182 /* Function vect_permute_store_chain.
5183
5184 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
5185 a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
5186 the data correctly for the stores. Return the final references for stores
5187 in RESULT_CHAIN.
5188
5189 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5190 The input is 4 vectors each containing 8 elements. We assign a number to
5191 each element, the input sequence is:
5192
5193 1st vec: 0 1 2 3 4 5 6 7
5194 2nd vec: 8 9 10 11 12 13 14 15
5195 3rd vec: 16 17 18 19 20 21 22 23
5196 4th vec: 24 25 26 27 28 29 30 31
5197
5198 The output sequence should be:
5199
5200 1st vec: 0 8 16 24 1 9 17 25
5201 2nd vec: 2 10 18 26 3 11 19 27
5202 3rd vec: 4 12 20 28 5 13 21 29
5203 4th vec: 6 14 22 30 7 15 23 31
5204
5205 i.e., we interleave the contents of the four vectors in their order.
5206
5207 We use interleave_high/low instructions to create such output. The input of
5208 each interleave_high/low operation is two vectors:
5209 1st vec 2nd vec
5210 0 1 2 3 4 5 6 7
5211 the even elements of the result vector are obtained left-to-right from the
5212 high/low elements of the first vector. The odd elements of the result are
5213 obtained left-to-right from the high/low elements of the second vector.
5214 The output of interleave_high will be: 0 4 1 5
5215 and of interleave_low: 2 6 3 7
5216
5217
5218 The permutation is done in log LENGTH stages. In each stage interleave_high
5219 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
5220 where the first argument is taken from the first half of DR_CHAIN and the
5221 second argument from its second half.
5222 In our example,
5223
5224 I1: interleave_high (1st vec, 3rd vec)
5225 I2: interleave_low (1st vec, 3rd vec)
5226 I3: interleave_high (2nd vec, 4th vec)
5227 I4: interleave_low (2nd vec, 4th vec)
5228
5229 The output for the first stage is:
5230
5231 I1: 0 16 1 17 2 18 3 19
5232 I2: 4 20 5 21 6 22 7 23
5233 I3: 8 24 9 25 10 26 11 27
5234 I4: 12 28 13 29 14 30 15 31
5235
5236 The output of the second stage, i.e. the final result is:
5237
5238 I1: 0 8 16 24 1 9 17 25
5239 I2: 2 10 18 26 3 11 19 27
5240 I3: 4 12 20 28 5 13 21 29
5241 I4: 6 14 22 30 7 15 23 31. */
5242
5243 void
5244 vect_permute_store_chain (vec<tree> dr_chain,
5245 unsigned int length,
5246 stmt_vec_info stmt_info,
5247 gimple_stmt_iterator *gsi,
5248 vec<tree> *result_chain)
5249 {
5250 tree vect1, vect2, high, low;
5251 gimple *perm_stmt;
5252 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5253 tree perm_mask_low, perm_mask_high;
5254 tree data_ref;
5255 tree perm3_mask_low, perm3_mask_high;
5256 unsigned int i, j, n, log_length = exact_log2 (length);
5257
5258 result_chain->quick_grow (length);
5259 memcpy (result_chain->address (), dr_chain.address (),
5260 length * sizeof (tree));
5261
5262 if (length == 3)
5263 {
5264 /* vect_grouped_store_supported ensures that this is constant. */
5265 unsigned int nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5266 unsigned int j0 = 0, j1 = 0, j2 = 0;
5267
5268 vec_perm_builder sel (nelt, nelt, 1);
5269 sel.quick_grow (nelt);
5270 vec_perm_indices indices;
5271 for (j = 0; j < 3; j++)
5272 {
5273 int nelt0 = ((3 - j) * nelt) % 3;
5274 int nelt1 = ((3 - j) * nelt + 1) % 3;
5275 int nelt2 = ((3 - j) * nelt + 2) % 3;
5276
5277 for (i = 0; i < nelt; i++)
5278 {
5279 if (3 * i + nelt0 < nelt)
5280 sel[3 * i + nelt0] = j0++;
5281 if (3 * i + nelt1 < nelt)
5282 sel[3 * i + nelt1] = nelt + j1++;
5283 if (3 * i + nelt2 < nelt)
5284 sel[3 * i + nelt2] = 0;
5285 }
5286 indices.new_vector (sel, 2, nelt);
5287 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5288
5289 for (i = 0; i < nelt; i++)
5290 {
5291 if (3 * i + nelt0 < nelt)
5292 sel[3 * i + nelt0] = 3 * i + nelt0;
5293 if (3 * i + nelt1 < nelt)
5294 sel[3 * i + nelt1] = 3 * i + nelt1;
5295 if (3 * i + nelt2 < nelt)
5296 sel[3 * i + nelt2] = nelt + j2++;
5297 }
5298 indices.new_vector (sel, 2, nelt);
5299 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5300
5301 vect1 = dr_chain[0];
5302 vect2 = dr_chain[1];
5303
5304 /* Create interleaving stmt:
5305 low = VEC_PERM_EXPR <vect1, vect2,
5306 {j, nelt, *, j + 1, nelt + j + 1, *,
5307 j + 2, nelt + j + 2, *, ...}> */
5308 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5309 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5310 vect2, perm3_mask_low);
5311 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5312
5313 vect1 = data_ref;
5314 vect2 = dr_chain[2];
5315 /* Create interleaving stmt:
5316 low = VEC_PERM_EXPR <vect1, vect2,
5317 {0, 1, nelt + j, 3, 4, nelt + j + 1,
5318 6, 7, nelt + j + 2, ...}> */
5319 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5320 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5321 vect2, perm3_mask_high);
5322 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5323 (*result_chain)[j] = data_ref;
5324 }
5325 }
5326 else
5327 {
5328 /* If length is not equal to 3 then only a power of 2 is supported. */
5329 gcc_assert (pow2p_hwi (length));
5330
5331 /* The encoding has 2 interleaved stepped patterns. */
5332 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5333 vec_perm_builder sel (nelt, 2, 3);
5334 sel.quick_grow (6);
5335 for (i = 0; i < 3; i++)
5336 {
5337 sel[i * 2] = i;
5338 sel[i * 2 + 1] = i + nelt;
5339 }
5340 vec_perm_indices indices (sel, 2, nelt);
5341 perm_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5342
5343 for (i = 0; i < 6; i++)
5344 sel[i] += exact_div (nelt, 2);
5345 indices.new_vector (sel, 2, nelt);
5346 perm_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5347
5348 for (i = 0, n = log_length; i < n; i++)
5349 {
5350 for (j = 0; j < length/2; j++)
5351 {
5352 vect1 = dr_chain[j];
5353 vect2 = dr_chain[j+length/2];
5354
5355 /* Create interleaving stmt:
5356 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
5357 ...}> */
5358 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
5359 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
5360 vect2, perm_mask_high);
5361 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5362 (*result_chain)[2*j] = high;
5363
5364 /* Create interleaving stmt:
5365 low = VEC_PERM_EXPR <vect1, vect2,
5366 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
5367 ...}> */
5368 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
5369 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
5370 vect2, perm_mask_low);
5371 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5372 (*result_chain)[2*j+1] = low;
5373 }
5374 memcpy (dr_chain.address (), result_chain->address (),
5375 length * sizeof (tree));
5376 }
5377 }
5378 }
5379
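/* A minimal stand-alone sketch (not compiled as part of GCC) of the
   log2 (LENGTH) interleave_high/low scheme described in the comment
   above vect_permute_store_chain, applied to plain int arrays.  NELT,
   LEN and the helper below are illustrative assumptions, not
   vectorizer interfaces.  */
#if 0
#include <stdio.h>
#include <string.h>

#define NELT 8
#define LEN  4

/* Interleave elements FROM .. FROM + NELT/2 - 1 of A and B.  */
static void
interleave (const int *a, const int *b, int from, int *out)
{
  for (int i = 0; i < NELT / 2; i++)
    {
      out[2 * i] = a[from + i];
      out[2 * i + 1] = b[from + i];
    }
}

int
main (void)
{
  int chain[LEN][NELT], result[LEN][NELT];
  for (int v = 0; v < LEN; v++)
    for (int i = 0; i < NELT; i++)
      chain[v][i] = v * NELT + i;

  /* log2 (LEN) = 2 stages; vector J is paired with vector J + LEN/2.  */
  for (int stage = 0; stage < 2; stage++)
    {
      for (int j = 0; j < LEN / 2; j++)
        {
          interleave (chain[j], chain[j + LEN / 2], 0, result[2 * j]);
          interleave (chain[j], chain[j + LEN / 2], NELT / 2,
                      result[2 * j + 1]);
        }
      memcpy (chain, result, sizeof (chain));
    }

  for (int v = 0; v < LEN; v++)
    {
      for (int i = 0; i < NELT; i++)
        printf ("%2d ", chain[v][i]);
      printf ("\n");	/* Rows: 0 8 16 24 1 9 17 25, then 2 10 18 26 ...  */
    }
  return 0;
}
#endif
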
5380 /* Function vect_setup_realignment
5381
5382 This function is called when vectorizing an unaligned load using
5383 the dr_explicit_realign[_optimized] scheme.
5384 This function generates the following code at the loop prolog:
5385
5386 p = initial_addr;
5387 x msq_init = *(floor(p)); # prolog load
5388 realignment_token = call target_builtin;
5389 loop:
5390 x msq = phi (msq_init, ---)
5391
5392 The stmts marked with x are generated only for the case of
5393 dr_explicit_realign_optimized.
5394
5395 The code above sets up a new (vector) pointer, pointing to the first
5396 location accessed by STMT_INFO, and a "floor-aligned" load using that
5397 pointer. It also generates code to compute the "realignment-token"
5398 (if the relevant target hook was defined), and creates a phi-node at the
5399 loop-header bb whose arguments are the result of the prolog-load (created
5400 by this function) and the result of a load that takes place in the loop
5401 (to be created by the caller to this function).
5402
5403 For the case of dr_explicit_realign_optimized:
5404 The caller to this function uses the phi-result (msq) to create the
5405 realignment code inside the loop, and sets up the missing phi argument,
5406 as follows:
5407 loop:
5408 msq = phi (msq_init, lsq)
5409 lsq = *(floor(p')); # load in loop
5410 result = realign_load (msq, lsq, realignment_token);
5411
5412 For the case of dr_explicit_realign:
5413 loop:
5414 msq = *(floor(p)); # load in loop
5415 p' = p + (VS-1);
5416 lsq = *(floor(p')); # load in loop
5417 result = realign_load (msq, lsq, realignment_token);
5418
5419 Input:
5420 STMT_INFO - (scalar) load stmt to be vectorized. This load accesses
5421 a memory location that may be unaligned.
5422 BSI - place where new code is to be inserted.
5423 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
5424 is used.
5425
5426 Output:
5427 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
5428 target hook, if defined.
5429 Return value - the result of the loop-header phi node. */
5430
5431 tree
5432 vect_setup_realignment (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
5433 tree *realignment_token,
5434 enum dr_alignment_support alignment_support_scheme,
5435 tree init_addr,
5436 class loop **at_loop)
5437 {
5438 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5439 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5440 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
5441 struct data_reference *dr = dr_info->dr;
5442 class loop *loop = NULL;
5443 edge pe = NULL;
5444 tree scalar_dest = gimple_assign_lhs (stmt_info->stmt);
5445 tree vec_dest;
5446 gimple *inc;
5447 tree ptr;
5448 tree data_ref;
5449 basic_block new_bb;
5450 tree msq_init = NULL_TREE;
5451 tree new_temp;
5452 gphi *phi_stmt;
5453 tree msq = NULL_TREE;
5454 gimple_seq stmts = NULL;
5455 bool compute_in_loop = false;
5456 bool nested_in_vect_loop = false;
5457 class loop *containing_loop = (gimple_bb (stmt_info->stmt))->loop_father;
5458 class loop *loop_for_initial_load = NULL;
5459
5460 if (loop_vinfo)
5461 {
5462 loop = LOOP_VINFO_LOOP (loop_vinfo);
5463 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info);
5464 }
5465
5466 gcc_assert (alignment_support_scheme == dr_explicit_realign
5467 || alignment_support_scheme == dr_explicit_realign_optimized);
5468
5469 /* We need to generate three things:
5470 1. the misalignment computation
5471 2. the extra vector load (for the optimized realignment scheme).
5472 3. the phi node for the two vectors from which the realignment is
5473 done (for the optimized realignment scheme). */
5474
5475 /* 1. Determine where to generate the misalignment computation.
5476
5477 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
5478 calculation will be generated by this function, outside the loop (in the
5479 preheader). Otherwise, INIT_ADDR had already been computed for us by the
5480 caller, inside the loop.
5481
5482 Background: If the misalignment remains fixed throughout the iterations of
5483 the loop, then both realignment schemes are applicable, and also the
5484 misalignment computation can be done outside LOOP. This is because we are
5485 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
5486 are a multiple of VS (the Vector Size), and therefore the misalignment in
5487 different vectorized LOOP iterations is always the same.
5488 The problem arises only if the memory access is in an inner-loop nested
5489 inside LOOP, which is now being vectorized using outer-loop vectorization.
5490 This is the only case when the misalignment of the memory access may not
5491 remain fixed throughout the iterations of the inner-loop (as explained in
5492 detail in vect_supportable_dr_alignment). In this case, not only is the
5493 optimized realignment scheme not applicable, but also the misalignment
5494 computation (and generation of the realignment token that is passed to
5495 REALIGN_LOAD) has to be done inside the loop.
5496
5497 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
5498 or not, which in turn determines if the misalignment is computed inside
5499 the inner-loop, or outside LOOP. */
5500
5501 if (init_addr != NULL_TREE || !loop_vinfo)
5502 {
5503 compute_in_loop = true;
5504 gcc_assert (alignment_support_scheme == dr_explicit_realign);
5505 }
5506
5507
5508 /* 2. Determine where to generate the extra vector load.
5509
5510 For the optimized realignment scheme, instead of generating two vector
5511 loads in each iteration, we generate a single extra vector load in the
5512 preheader of the loop, and in each iteration reuse the result of the
5513 vector load from the previous iteration. In case the memory access is in
5514 an inner-loop nested inside LOOP, which is now being vectorized using
5515 outer-loop vectorization, we need to determine whether this initial vector
5516 load should be generated at the preheader of the inner-loop, or can be
5517 generated at the preheader of LOOP. If the memory access has no evolution
5518 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
5519 to be generated inside LOOP (in the preheader of the inner-loop). */
5520
5521 if (nested_in_vect_loop)
5522 {
5523 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
5524 bool invariant_in_outerloop =
5525 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
5526 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
5527 }
5528 else
5529 loop_for_initial_load = loop;
5530 if (at_loop)
5531 *at_loop = loop_for_initial_load;
5532
5533 if (loop_for_initial_load)
5534 pe = loop_preheader_edge (loop_for_initial_load);
5535
5536 /* 3. For the case of the optimized realignment, create the first vector
5537 load at the loop preheader. */
5538
5539 if (alignment_support_scheme == dr_explicit_realign_optimized)
5540 {
5541 /* Create msq_init = *(floor(p1)) in the loop preheader */
5542 gassign *new_stmt;
5543
5544 gcc_assert (!compute_in_loop);
5545 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5546 ptr = vect_create_data_ref_ptr (stmt_info, vectype,
5547 loop_for_initial_load, NULL_TREE,
5548 &init_addr, NULL, &inc, true);
5549 if (TREE_CODE (ptr) == SSA_NAME)
5550 new_temp = copy_ssa_name (ptr);
5551 else
5552 new_temp = make_ssa_name (TREE_TYPE (ptr));
5553 poly_uint64 align = DR_TARGET_ALIGNMENT (dr_info);
5554 tree type = TREE_TYPE (ptr);
5555 new_stmt = gimple_build_assign
5556 (new_temp, BIT_AND_EXPR, ptr,
5557 fold_build2 (MINUS_EXPR, type,
5558 build_int_cst (type, 0),
5559 build_int_cst (type, align)));
5560 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5561 gcc_assert (!new_bb);
5562 data_ref
5563 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
5564 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
5565 vect_copy_ref_info (data_ref, DR_REF (dr));
5566 new_stmt = gimple_build_assign (vec_dest, data_ref);
5567 new_temp = make_ssa_name (vec_dest, new_stmt);
5568 gimple_assign_set_lhs (new_stmt, new_temp);
5569 if (pe)
5570 {
5571 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5572 gcc_assert (!new_bb);
5573 }
5574 else
5575 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5576
5577 msq_init = gimple_assign_lhs (new_stmt);
5578 }
5579
5580 /* 4. Create realignment token using a target builtin, if available.
5581 It is done either inside the containing loop, or before LOOP (as
5582 determined above). */
5583
5584 if (targetm.vectorize.builtin_mask_for_load)
5585 {
5586 gcall *new_stmt;
5587 tree builtin_decl;
5588
5589 /* Compute INIT_ADDR - the initial address accessed by this memref. */
5590 if (!init_addr)
5591 {
5592 /* Generate the INIT_ADDR computation outside LOOP. */
5593 init_addr = vect_create_addr_base_for_vector_ref (stmt_info, &stmts,
5594 NULL_TREE);
5595 if (loop)
5596 {
5597 pe = loop_preheader_edge (loop);
5598 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
5599 gcc_assert (!new_bb);
5600 }
5601 else
5602 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
5603 }
5604
5605 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
5606 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
5607 vec_dest =
5608 vect_create_destination_var (scalar_dest,
5609 gimple_call_return_type (new_stmt));
5610 new_temp = make_ssa_name (vec_dest, new_stmt);
5611 gimple_call_set_lhs (new_stmt, new_temp);
5612
5613 if (compute_in_loop)
5614 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5615 else
5616 {
5617 /* Generate the misalignment computation outside LOOP. */
5618 pe = loop_preheader_edge (loop);
5619 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5620 gcc_assert (!new_bb);
5621 }
5622
5623 *realignment_token = gimple_call_lhs (new_stmt);
5624
5625 /* The result of the CALL_EXPR to this builtin is determined from
5626 the value of the parameter and no global variables are touched
5627 which makes the builtin a "const" function. Requiring the
5628 builtin to have the "const" attribute makes it unnecessary
5629 to call mark_call_clobbered. */
5630 gcc_assert (TREE_READONLY (builtin_decl));
5631 }
5632
5633 if (alignment_support_scheme == dr_explicit_realign)
5634 return msq;
5635
5636 gcc_assert (!compute_in_loop);
5637 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
5638
5639
5640 /* 5. Create msq = phi <msq_init, lsq> in loop */
5641
5642 pe = loop_preheader_edge (containing_loop);
5643 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5644 msq = make_ssa_name (vec_dest);
5645 phi_stmt = create_phi_node (msq, containing_loop->header);
5646 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
5647
5648 return msq;
5649 }
5650
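/* A minimal stand-alone sketch (not compiled as part of GCC) of the
   effect the REALIGN_LOAD sequence set up above is meant to achieve:
   combining the two aligned loads around an unaligned address so that
   the result equals the vector starting at that address.  The
   byte-indexed model, VS and the driver are illustrative assumptions;
   real targets hide the misalignment inside the realignment token.  */
#if 0
#include <stdio.h>

#define VS 16		/* Vector size in bytes.  */

static void
realign_load_model (const unsigned char *msq, const unsigned char *lsq,
                    unsigned misalign, unsigned char *out)
{
  for (unsigned i = 0; i < VS; i++)
    out[i] = (misalign + i < VS
              ? msq[misalign + i]
              : lsq[misalign + i - VS]);
}

int
main (void)
{
  unsigned char buf[3 * VS];
  for (unsigned i = 0; i < sizeof buf; i++)
    buf[i] = i;

  unsigned misalign = 5;		/* p = buf + 5.  */
  const unsigned char *msq = buf;	/* *(floor (p))  */
  const unsigned char *lsq = buf + VS;	/* *(floor (p + VS - 1))  */
  unsigned char out[VS];

  realign_load_model (msq, lsq, misalign, out);
  for (unsigned i = 0; i < VS; i++)
    printf ("%u ", out[i]);		/* Prints: 5 6 ... 20  */
  printf ("\n");
  return 0;
}
#endif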
5651
5652 /* Function vect_grouped_load_supported.
5653
5654 COUNT is the size of the load group (the number of statements plus the
5655 number of gaps). SINGLE_ELEMENT_P is true if there is actually
5656 only one statement, with a gap of COUNT - 1.
5657
5658 Returns true if a suitable permute exists. */
5659
5660 bool
5661 vect_grouped_load_supported (tree vectype, bool single_element_p,
5662 unsigned HOST_WIDE_INT count)
5663 {
5664 machine_mode mode = TYPE_MODE (vectype);
5665
5666 /* If this is single-element interleaving with an element distance
5667 that leaves unused vector loads around, punt: we would at least create
5668 very sub-optimal code in that case (and blow up memory,
5669 see PR65518). */
5670 if (single_element_p && maybe_gt (count, TYPE_VECTOR_SUBPARTS (vectype)))
5671 {
5672 if (dump_enabled_p ())
5673 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5674 "single-element interleaving not supported "
5675 "for non-adjacent vector loads\n");
5676 return false;
5677 }
5678
5679 /* vect_permute_load_chain requires the group size to be equal to 3 or
5680 be a power of two. */
5681 if (count != 3 && exact_log2 (count) == -1)
5682 {
5683 if (dump_enabled_p ())
5684 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5685 "the size of the group of accesses"
5686 " is not a power of 2 or not equal to 3\n");
5687 return false;
5688 }
5689
5690 /* Check that the permutation is supported. */
5691 if (VECTOR_MODE_P (mode))
5692 {
5693 unsigned int i, j;
5694 if (count == 3)
5695 {
5696 unsigned int nelt;
5697 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5698 {
5699 if (dump_enabled_p ())
5700 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5701 "cannot handle groups of 3 loads for"
5702 " variable-length vectors\n");
5703 return false;
5704 }
5705
5706 vec_perm_builder sel (nelt, nelt, 1);
5707 sel.quick_grow (nelt);
5708 vec_perm_indices indices;
5709 unsigned int k;
5710 for (k = 0; k < 3; k++)
5711 {
5712 for (i = 0; i < nelt; i++)
5713 if (3 * i + k < 2 * nelt)
5714 sel[i] = 3 * i + k;
5715 else
5716 sel[i] = 0;
5717 indices.new_vector (sel, 2, nelt);
5718 if (!can_vec_perm_const_p (mode, indices))
5719 {
5720 if (dump_enabled_p ())
5721 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5722 "shuffle of 3 loads is not supported by"
5723 " target\n");
5724 return false;
5725 }
5726 for (i = 0, j = 0; i < nelt; i++)
5727 if (3 * i + k < 2 * nelt)
5728 sel[i] = i;
5729 else
5730 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5731 indices.new_vector (sel, 2, nelt);
5732 if (!can_vec_perm_const_p (mode, indices))
5733 {
5734 if (dump_enabled_p ())
5735 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5736 "shuffle of 3 loads is not supported by"
5737 " target\n");
5738 return false;
5739 }
5740 }
5741 return true;
5742 }
5743 else
5744 {
5745 /* If length is not equal to 3 then only a power of 2 is supported. */
5746 gcc_assert (pow2p_hwi (count));
5747 poly_uint64 nelt = GET_MODE_NUNITS (mode);
5748
5749 /* The encoding has a single stepped pattern. */
5750 vec_perm_builder sel (nelt, 1, 3);
5751 sel.quick_grow (3);
5752 for (i = 0; i < 3; i++)
5753 sel[i] = i * 2;
5754 vec_perm_indices indices (sel, 2, nelt);
5755 if (can_vec_perm_const_p (mode, indices))
5756 {
5757 for (i = 0; i < 3; i++)
5758 sel[i] = i * 2 + 1;
5759 indices.new_vector (sel, 2, nelt);
5760 if (can_vec_perm_const_p (mode, indices))
5761 return true;
5762 }
5763 }
5764 }
5765
5766 if (dump_enabled_p ())
5767 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5768 "extract even/odd not supported by target\n");
5769 return false;
5770 }
5771
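/* A minimal stand-alone sketch (not compiled as part of GCC) of the
   two-permute scheme checked above for groups of three loads, using
   the same selector formulas on plain int arrays with NELT == 8.  The
   helper and driver are illustrative assumptions; the real code emits
   VEC_PERM_EXPRs with these selectors as constant masks.  */
#if 0
#include <stdio.h>

#define NELT 8

static void
vperm (const int *a, const int *b, const unsigned *sel, int *out)
{
  for (unsigned i = 0; i < NELT; i++)
    out[i] = sel[i] < NELT ? a[sel[i]] : b[sel[i] - NELT];
}

int
main (void)
{
  int chain[3][NELT], out[3][NELT];
  for (int v = 0; v < 3; v++)
    for (int i = 0; i < NELT; i++)
      chain[v][i] = v * NELT + i;

  for (unsigned k = 0; k < 3; k++)
    {
      unsigned sel[NELT], i, j;
      int tmp[NELT];

      /* First permute: elements K, K + 3, K + 6, ... of the
         concatenation of vectors 0 and 1; leftover slots are
         don't-care and get index 0.  */
      for (i = 0; i < NELT; i++)
        sel[i] = 3 * i + k < 2 * NELT ? 3 * i + k : 0;
      vperm (chain[0], chain[1], sel, tmp);

      /* Second permute: keep the elements gathered so far and fill
         the tail from vector 2.  */
      for (i = 0, j = 0; i < NELT; i++)
        sel[i] = (3 * i + k < 2 * NELT
                  ? i
                  : NELT + ((NELT + k) % 3) + 3 * j++);
      vperm (tmp, chain[2], sel, out[k]);
    }

  for (int k = 0; k < 3; k++)
    {
      for (int i = 0; i < NELT; i++)
        printf ("%2d ", out[k][i]);
      printf ("\n");	/* Rows: 0 3 6 ... 21 / 1 4 7 ... 22 / 2 5 8 ... 23  */
    }
  return 0;
}
#endif
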
5772 /* Return TRUE if vec_{mask_}load_lanes is available for COUNT vectors of
5773 type VECTYPE. MASKED_P says whether the masked form is needed. */
5774
5775 bool
5776 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5777 bool masked_p)
5778 {
5779 if (masked_p)
5780 return vect_lanes_optab_supported_p ("vec_mask_load_lanes",
5781 vec_mask_load_lanes_optab,
5782 vectype, count);
5783 else
5784 return vect_lanes_optab_supported_p ("vec_load_lanes",
5785 vec_load_lanes_optab,
5786 vectype, count);
5787 }
5788
5789 /* Function vect_permute_load_chain.
5790
5791 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5792 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5793 the input data correctly. Return the final references for loads in
5794 RESULT_CHAIN.
5795
5796 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5797 The input is 4 vectors each containing 8 elements. We assign a number to each
5798 element, the input sequence is:
5799
5800 1st vec: 0 1 2 3 4 5 6 7
5801 2nd vec: 8 9 10 11 12 13 14 15
5802 3rd vec: 16 17 18 19 20 21 22 23
5803 4th vec: 24 25 26 27 28 29 30 31
5804
5805 The output sequence should be:
5806
5807 1st vec: 0 4 8 12 16 20 24 28
5808 2nd vec: 1 5 9 13 17 21 25 29
5809 3rd vec: 2 6 10 14 18 22 26 30
5810 4th vec: 3 7 11 15 19 23 27 31
5811
5812 i.e., the first output vector should contain the first elements of each
5813 interleaving group, etc.
5814
5815 We use extract_even/odd instructions to create such output. The input of
5816 each extract_even/odd operation is two vectors
5817 1st vec 2nd vec
5818 0 1 2 3 4 5 6 7
5819
5820 and the output is the vector of extracted even/odd elements. The output of
5821 extract_even will be: 0 2 4 6
5822 and of extract_odd: 1 3 5 7
5823
5824
5825 The permutation is done in log LENGTH stages. In each stage extract_even
5826 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5827 their order. In our example,
5828
5829 E1: extract_even (1st vec, 2nd vec)
5830 E2: extract_odd (1st vec, 2nd vec)
5831 E3: extract_even (3rd vec, 4th vec)
5832 E4: extract_odd (3rd vec, 4th vec)
5833
5834 The output for the first stage will be:
5835
5836 E1: 0 2 4 6 8 10 12 14
5837 E2: 1 3 5 7 9 11 13 15
5838 E3: 16 18 20 22 24 26 28 30
5839 E4: 17 19 21 23 25 27 29 31
5840
5841 In order to proceed and create the correct sequence for the next stage (or
5842 for the correct output, if the second stage is the last one, as in our
5843 example), we first put the output of extract_even operation and then the
5844 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5845 The input for the second stage is:
5846
5847 1st vec (E1): 0 2 4 6 8 10 12 14
5848 2nd vec (E3): 16 18 20 22 24 26 28 30
5849 3rd vec (E2): 1 3 5 7 9 11 13 15
5850 4th vec (E4): 17 19 21 23 25 27 29 31
5851
5852 The output of the second stage:
5853
5854 E1: 0 4 8 12 16 20 24 28
5855 E2: 2 6 10 14 18 22 26 30
5856 E3: 1 5 9 13 17 21 25 29
5857 E4: 3 7 11 15 19 23 27 31
5858
5859 And RESULT_CHAIN after reordering:
5860
5861 1st vec (E1): 0 4 8 12 16 20 24 28
5862 2nd vec (E3): 1 5 9 13 17 21 25 29
5863 3rd vec (E2): 2 6 10 14 18 22 26 30
5864 4th vec (E4): 3 7 11 15 19 23 27 31. */
5865
5866 static void
5867 vect_permute_load_chain (vec<tree> dr_chain,
5868 unsigned int length,
5869 stmt_vec_info stmt_info,
5870 gimple_stmt_iterator *gsi,
5871 vec<tree> *result_chain)
5872 {
5873 tree data_ref, first_vect, second_vect;
5874 tree perm_mask_even, perm_mask_odd;
5875 tree perm3_mask_low, perm3_mask_high;
5876 gimple *perm_stmt;
5877 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5878 unsigned int i, j, log_length = exact_log2 (length);
5879
5880 result_chain->quick_grow (length);
5881 memcpy (result_chain->address (), dr_chain.address (),
5882 length * sizeof (tree));
5883
5884 if (length == 3)
5885 {
5886 /* vect_grouped_load_supported ensures that this is constant. */
5887 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5888 unsigned int k;
5889
5890 vec_perm_builder sel (nelt, nelt, 1);
5891 sel.quick_grow (nelt);
5892 vec_perm_indices indices;
5893 for (k = 0; k < 3; k++)
5894 {
5895 for (i = 0; i < nelt; i++)
5896 if (3 * i + k < 2 * nelt)
5897 sel[i] = 3 * i + k;
5898 else
5899 sel[i] = 0;
5900 indices.new_vector (sel, 2, nelt);
5901 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5902
5903 for (i = 0, j = 0; i < nelt; i++)
5904 if (3 * i + k < 2 * nelt)
5905 sel[i] = i;
5906 else
5907 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5908 indices.new_vector (sel, 2, nelt);
5909 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5910
5911 first_vect = dr_chain[0];
5912 second_vect = dr_chain[1];
5913
5914 /* Create interleaving stmt (low part of):
5915 low = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
5916 ...}> */
5917 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5918 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5919 second_vect, perm3_mask_low);
5920 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5921
5922 /* Create interleaving stmt (high part of):
5923 high = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
5924 ...}> */
5925 first_vect = data_ref;
5926 second_vect = dr_chain[2];
5927 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5928 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5929 second_vect, perm3_mask_high);
5930 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5931 (*result_chain)[k] = data_ref;
5932 }
5933 }
5934 else
5935 {
5936 /* If length is not equal to 3 then only a power of 2 is supported. */
5937 gcc_assert (pow2p_hwi (length));
5938
5939 /* The encoding has a single stepped pattern. */
5940 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5941 vec_perm_builder sel (nelt, 1, 3);
5942 sel.quick_grow (3);
5943 for (i = 0; i < 3; ++i)
5944 sel[i] = i * 2;
5945 vec_perm_indices indices (sel, 2, nelt);
5946 perm_mask_even = vect_gen_perm_mask_checked (vectype, indices);
5947
5948 for (i = 0; i < 3; ++i)
5949 sel[i] = i * 2 + 1;
5950 indices.new_vector (sel, 2, nelt);
5951 perm_mask_odd = vect_gen_perm_mask_checked (vectype, indices);
5952
5953 for (i = 0; i < log_length; i++)
5954 {
5955 for (j = 0; j < length; j += 2)
5956 {
5957 first_vect = dr_chain[j];
5958 second_vect = dr_chain[j+1];
5959
5960 /* data_ref = permute_even (first_data_ref, second_data_ref); */
5961 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
5962 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5963 first_vect, second_vect,
5964 perm_mask_even);
5965 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5966 (*result_chain)[j/2] = data_ref;
5967
5968 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
5969 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
5970 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5971 first_vect, second_vect,
5972 perm_mask_odd);
5973 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5974 (*result_chain)[j/2+length/2] = data_ref;
5975 }
5976 memcpy (dr_chain.address (), result_chain->address (),
5977 length * sizeof (tree));
5978 }
5979 }
5980 }
5981
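/* A minimal stand-alone sketch (not compiled as part of GCC) of the
   log2 (LENGTH) extract_even/odd scheme described in the comment above
   vect_permute_load_chain, including the placement of the even results
   in slot J/2 and the odd results in slot J/2 + LENGTH/2.  NELT, LEN
   and the helper are illustrative assumptions only.  */
#if 0
#include <stdio.h>
#include <string.h>

#define NELT 8
#define LEN  4

/* Extract the even (ODD == 0) or odd (ODD == 1) elements of the
   concatenation of A and B.  */
static void
extract (const int *a, const int *b, int odd, int *out)
{
  for (int i = 0; i < NELT; i++)
    {
      int idx = 2 * i + odd;
      out[i] = idx < NELT ? a[idx] : b[idx - NELT];
    }
}

int
main (void)
{
  int chain[LEN][NELT], result[LEN][NELT];
  for (int v = 0; v < LEN; v++)
    for (int i = 0; i < NELT; i++)
      chain[v][i] = v * NELT + i;

  /* log2 (LEN) = 2 stages over adjacent pairs of vectors.  */
  for (int stage = 0; stage < 2; stage++)
    {
      for (int j = 0; j < LEN; j += 2)
        {
          extract (chain[j], chain[j + 1], 0, result[j / 2]);
          extract (chain[j], chain[j + 1], 1, result[j / 2 + LEN / 2]);
        }
      memcpy (chain, result, sizeof (chain));
    }

  for (int v = 0; v < LEN; v++)
    {
      for (int i = 0; i < NELT; i++)
        printf ("%2d ", chain[v][i]);
      printf ("\n");	/* Rows: 0 4 ... 28 / 1 5 ... 29 / 2 6 ... 30 / 3 7 ... 31  */
    }
  return 0;
}
#endif
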
5982 /* Function vect_shift_permute_load_chain.
5983
5984 Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
5985 sequence of stmts to reorder the input data accordingly.
5986 Return the final references for loads in RESULT_CHAIN.
5987 Return true if successful, false otherwise.
5988
5989 E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
5990 The input is 3 vectors each containing 8 elements. We assign a
5991 number to each element, the input sequence is:
5992
5993 1st vec: 0 1 2 3 4 5 6 7
5994 2nd vec: 8 9 10 11 12 13 14 15
5995 3rd vec: 16 17 18 19 20 21 22 23
5996
5997 The output sequence should be:
5998
5999 1st vec: 0 3 6 9 12 15 18 21
6000 2nd vec: 1 4 7 10 13 16 19 22
6001 3rd vec: 2 5 8 11 14 17 20 23
6002
6003 We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
6004
6005 First we shuffle all 3 vectors to get the correct element order:
6006
6007 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
6008 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
6009 3rd vec: (16 19 22) (17 20 23) (18 21)
6010
6011 Next we unite and shift the vectors 3 times:
6012
6013 1st step:
6014 shift right by 6 the concatenation of:
6015 "1st vec" and "2nd vec"
6016 ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
6017 "2nd vec" and "3rd vec"
6018 ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
6019 "3rd vec" and "1st vec"
6020 (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
6021 | New vectors |
6022
6023 So that now new vectors are:
6024
6025 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
6026 2nd vec: (10 13) (16 19 22) (17 20 23)
6027 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
6028
6029 2nd step:
6030 shift right by 5 the concatenation of:
6031 "1st vec" and "3rd vec"
6032 ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
6033 "2nd vec" and "1st vec"
6034 (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
6035 "3rd vec" and "2nd vec"
6036 (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
6037 | New vectors |
6038
6039 So that now new vectors are:
6040
6041 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
6042 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
6043 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
6044
6045 3rd step:
6046 shift right by 5 the concatenation of:
6047 "1st vec" and "1st vec"
6048 ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
6049 shift right by 3 the concatenation of:
6050 "2nd vec" and "2nd vec"
6051 (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
6052 | New vectors |
6053
6054 So that now all vectors are READY:
6055 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
6056 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
6057 3rd vec: ( 1 4 7) (10 13) (16 19 22)
6058
6059 This algorithm is faster than the one in vect_permute_load_chain if:
6060 1. "shift of a concatenation" is faster than general permutation.
6061 This is usually so.
6062 2. The TARGET machine can't execute vector instructions in parallel.
6063 This is because each step of the algorithm depends on the previous one.
6064 The algorithm in vect_permute_load_chain is much more parallel.
6065
6066 The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
6067 */
6068
6069 static bool
6070 vect_shift_permute_load_chain (vec<tree> dr_chain,
6071 unsigned int length,
6072 stmt_vec_info stmt_info,
6073 gimple_stmt_iterator *gsi,
6074 vec<tree> *result_chain)
6075 {
6076 tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
6077 tree perm2_mask1, perm2_mask2, perm3_mask;
6078 tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
6079 gimple *perm_stmt;
6080
6081 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6082 unsigned int i;
6083 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6084
6085 unsigned HOST_WIDE_INT nelt, vf;
6086 if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nelt)
6087 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&vf))
6088 /* Not supported for variable-length vectors. */
6089 return false;
6090
6091 vec_perm_builder sel (nelt, nelt, 1);
6092 sel.quick_grow (nelt);
6093
6094 result_chain->quick_grow (length);
6095 memcpy (result_chain->address (), dr_chain.address (),
6096 length * sizeof (tree));
6097
6098 if (pow2p_hwi (length) && vf > 4)
6099 {
6100 unsigned int j, log_length = exact_log2 (length);
6101 for (i = 0; i < nelt / 2; ++i)
6102 sel[i] = i * 2;
6103 for (i = 0; i < nelt / 2; ++i)
6104 sel[nelt / 2 + i] = i * 2 + 1;
6105 vec_perm_indices indices (sel, 2, nelt);
6106 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6107 {
6108 if (dump_enabled_p ())
6109 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6110 "shuffle of 2 fields structure is not"
6111 " supported by target\n");
6112 return false;
6113 }
6114 perm2_mask1 = vect_gen_perm_mask_checked (vectype, indices);
6115
6116 for (i = 0; i < nelt / 2; ++i)
6117 sel[i] = i * 2 + 1;
6118 for (i = 0; i < nelt / 2; ++i)
6119 sel[nelt / 2 + i] = i * 2;
6120 indices.new_vector (sel, 2, nelt);
6121 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6122 {
6123 if (dump_enabled_p ())
6124 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6125 "shuffle of 2 fields structure is not"
6126 " supported by target\n");
6127 return false;
6128 }
6129 perm2_mask2 = vect_gen_perm_mask_checked (vectype, indices);
6130
6131 /* Generating permutation constant to shift all elements.
6132 For vector length 8 it is {4 5 6 7 8 9 10 11}. */
6133 for (i = 0; i < nelt; i++)
6134 sel[i] = nelt / 2 + i;
6135 indices.new_vector (sel, 2, nelt);
6136 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6137 {
6138 if (dump_enabled_p ())
6139 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6140 "shift permutation is not supported by target\n");
6141 return false;
6142 }
6143 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6144
6145 /* Generating permutation constant to select vector from 2.
6146 For vector length 8 it is {0 1 2 3 12 13 14 15}. */
6147 for (i = 0; i < nelt / 2; i++)
6148 sel[i] = i;
6149 for (i = nelt / 2; i < nelt; i++)
6150 sel[i] = nelt + i;
6151 indices.new_vector (sel, 2, nelt);
6152 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6153 {
6154 if (dump_enabled_p ())
6155 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6156 "select is not supported by target\n");
6157 return false;
6158 }
6159 select_mask = vect_gen_perm_mask_checked (vectype, indices);
6160
6161 for (i = 0; i < log_length; i++)
6162 {
6163 for (j = 0; j < length; j += 2)
6164 {
6165 first_vect = dr_chain[j];
6166 second_vect = dr_chain[j + 1];
6167
6168 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6169 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6170 first_vect, first_vect,
6171 perm2_mask1);
6172 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6173 vect[0] = data_ref;
6174
6175 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6176 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6177 second_vect, second_vect,
6178 perm2_mask2);
6179 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6180 vect[1] = data_ref;
6181
6182 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
6183 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6184 vect[0], vect[1], shift1_mask);
6185 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6186 (*result_chain)[j/2 + length/2] = data_ref;
6187
6188 data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
6189 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6190 vect[0], vect[1], select_mask);
6191 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6192 (*result_chain)[j/2] = data_ref;
6193 }
6194 memcpy (dr_chain.address (), result_chain->address (),
6195 length * sizeof (tree));
6196 }
6197 return true;
6198 }
6199 if (length == 3 && vf > 2)
6200 {
6201 unsigned int k = 0, l = 0;
6202
6203 /* Generating permutation constant to get all elements in right order.
6204 For vector length 8 it is {0 3 6 1 4 7 2 5}. */
6205 for (i = 0; i < nelt; i++)
6206 {
6207 if (3 * k + (l % 3) >= nelt)
6208 {
6209 k = 0;
6210 l += (3 - (nelt % 3));
6211 }
6212 sel[i] = 3 * k + (l % 3);
6213 k++;
6214 }
6215 vec_perm_indices indices (sel, 2, nelt);
6216 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6217 {
6218 if (dump_enabled_p ())
6219 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6220 "shuffle of 3 fields structure is not"
6221 " supported by target\n");
6222 return false;
6223 }
6224 perm3_mask = vect_gen_perm_mask_checked (vectype, indices);
6225
6226 /* Generating permutation constant to shift all elements.
6227 For vector length 8 it is {6 7 8 9 10 11 12 13}. */
6228 for (i = 0; i < nelt; i++)
6229 sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
6230 indices.new_vector (sel, 2, nelt);
6231 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6232 {
6233 if (dump_enabled_p ())
6234 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6235 "shift permutation is not supported by target\n");
6236 return false;
6237 }
6238 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6239
6240 /* Generating permutation constant to shift all elements.
6241 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6242 for (i = 0; i < nelt; i++)
6243 sel[i] = 2 * (nelt / 3) + 1 + i;
6244 indices.new_vector (sel, 2, nelt);
6245 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6246 {
6247 if (dump_enabled_p ())
6248 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6249 "shift permutation is not supported by target\n");
6250 return false;
6251 }
6252 shift2_mask = vect_gen_perm_mask_checked (vectype, indices);
6253
6254 /* Generating permutation constant to shift all elements.
6255 For vector length 8 it is {3 4 5 6 7 8 9 10}. */
6256 for (i = 0; i < nelt; i++)
6257 sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
6258 indices.new_vector (sel, 2, nelt);
6259 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6260 {
6261 if (dump_enabled_p ())
6262 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6263 "shift permutation is not supported by target\n");
6264 return false;
6265 }
6266 shift3_mask = vect_gen_perm_mask_checked (vectype, indices);
6267
6268 /* Generating permutation constant to shift all elements.
6269 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6270 for (i = 0; i < nelt; i++)
6271 sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
6272 indices.new_vector (sel, 2, nelt);
6273 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6274 {
6275 if (dump_enabled_p ())
6276 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6277 "shift permutation is not supported by target\n");
6278 return false;
6279 }
6280 shift4_mask = vect_gen_perm_mask_checked (vectype, indices);
6281
6282 for (k = 0; k < 3; k++)
6283 {
6284 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
6285 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6286 dr_chain[k], dr_chain[k],
6287 perm3_mask);
6288 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6289 vect[k] = data_ref;
6290 }
6291
6292 for (k = 0; k < 3; k++)
6293 {
6294 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
6295 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6296 vect[k % 3], vect[(k + 1) % 3],
6297 shift1_mask);
6298 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6299 vect_shift[k] = data_ref;
6300 }
6301
6302 for (k = 0; k < 3; k++)
6303 {
6304 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
6305 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6306 vect_shift[(4 - k) % 3],
6307 vect_shift[(3 - k) % 3],
6308 shift2_mask);
6309 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6310 vect[k] = data_ref;
6311 }
6312
6313 (*result_chain)[3 - (nelt % 3)] = vect[2];
6314
6315 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
6316 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
6317 vect[0], shift3_mask);
6318 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6319 (*result_chain)[nelt % 3] = data_ref;
6320
6321 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
6322 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
6323 vect[1], shift4_mask);
6324 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6325 (*result_chain)[0] = data_ref;
6326 return true;
6327 }
6328 return false;
6329 }
6330
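/* A minimal stand-alone sketch (not compiled as part of GCC) of the
   "shift of a concatenation" primitive that the shiftN_mask selectors
   above describe: the result is the NELT-element window starting at
   offset K of the concatenation of the two inputs.  NELT and the
   driver values are illustrative assumptions only.  */
#if 0
#include <stdio.h>

#define NELT 8

static void
shift_concat (const int *a, const int *b, unsigned k, int *out)
{
  for (unsigned i = 0; i < NELT; i++)
    out[i] = k + i < NELT ? a[k + i] : b[k + i - NELT];
}

int
main (void)
{
  int a[NELT] = { 0, 1, 2, 3, 4, 5, 6, 7 };
  int b[NELT] = { 8, 9, 10, 11, 12, 13, 14, 15 };
  int out[NELT];

  /* The selector {6 7 8 9 10 11 12 13} used above for NELT == 8.  */
  shift_concat (a, b, 6, out);
  for (unsigned i = 0; i < NELT; i++)
    printf ("%d ", out[i]);		/* Prints: 6 7 8 9 10 11 12 13  */
  printf ("\n");
  return 0;
}
#endif
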
6331 /* Function vect_transform_grouped_load.
6332
6333 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
6334 to perform their permutation and ascribe the resulting vectorized statements to
6335 the scalar statements.
6336 */
6337
6338 void
6339 vect_transform_grouped_load (stmt_vec_info stmt_info, vec<tree> dr_chain,
6340 int size, gimple_stmt_iterator *gsi)
6341 {
6342 machine_mode mode;
6343 vec<tree> result_chain = vNULL;
6344
6345 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
6346 RESULT_CHAIN is the output of vect_permute_load_chain; it contains the permuted
6347 vectors that are ready for vector computation. */
6348 result_chain.create (size);
6349
6350 /* If the reassociation width for the vector type is 2 or greater, the target
6351 machine can execute 2 or more vector instructions in parallel. Otherwise try
6352 to get the chain for the load group using vect_shift_permute_load_chain. */
6353 mode = TYPE_MODE (STMT_VINFO_VECTYPE (stmt_info));
6354 if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
6355 || pow2p_hwi (size)
6356 || !vect_shift_permute_load_chain (dr_chain, size, stmt_info,
6357 gsi, &result_chain))
6358 vect_permute_load_chain (dr_chain, size, stmt_info, gsi, &result_chain);
6359 vect_record_grouped_load_vectors (stmt_info, result_chain);
6360 result_chain.release ();
6361 }
6362
6363 /* RESULT_CHAIN contains the output of a group of grouped loads that were
6364 generated as part of the vectorization of STMT_INFO. Assign the statement
6365 for each vector to the associated scalar statement. */
6366
6367 void
6368 vect_record_grouped_load_vectors (stmt_vec_info stmt_info,
6369 vec<tree> result_chain)
6370 {
6371 vec_info *vinfo = stmt_info->vinfo;
6372 stmt_vec_info first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
6373 unsigned int i, gap_count;
6374 tree tmp_data_ref;
6375
6376 /* Put a permuted data-ref in the VECTORIZED_STMT field.
6377 Since we scan the chain starting from its first node, their order
6378 corresponds to the order of data-refs in RESULT_CHAIN. */
6379 stmt_vec_info next_stmt_info = first_stmt_info;
6380 gap_count = 1;
6381 FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
6382 {
6383 if (!next_stmt_info)
6384 break;
6385
6386 /* Skip the gaps. Loads created for the gaps will be removed by the dead
6387 code elimination pass later. No need to check for the first stmt in
6388 the group, since it always exists.
6389 DR_GROUP_GAP is the number of steps in elements from the previous
6390 access (if there is no gap DR_GROUP_GAP is 1). We skip loads that
6391 correspond to the gaps. */
6392 if (next_stmt_info != first_stmt_info
6393 && gap_count < DR_GROUP_GAP (next_stmt_info))
6394 {
6395 gap_count++;
6396 continue;
6397 }
6398
6399 /* ??? The following needs cleanup after the removal of
6400 DR_GROUP_SAME_DR_STMT. */
6401 if (next_stmt_info)
6402 {
6403 stmt_vec_info new_stmt_info = vinfo->lookup_def (tmp_data_ref);
6404 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
6405 copies, and we put the new vector statement in the first available
6406 RELATED_STMT. */
6407 if (!STMT_VINFO_VEC_STMT (next_stmt_info))
6408 STMT_VINFO_VEC_STMT (next_stmt_info) = new_stmt_info;
6409 else
6410 {
6411 stmt_vec_info prev_stmt_info
6412 = STMT_VINFO_VEC_STMT (next_stmt_info);
6413 stmt_vec_info rel_stmt_info
6414 = STMT_VINFO_RELATED_STMT (prev_stmt_info);
6415 while (rel_stmt_info)
6416 {
6417 prev_stmt_info = rel_stmt_info;
6418 rel_stmt_info = STMT_VINFO_RELATED_STMT (rel_stmt_info);
6419 }
6420
6421 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt_info;
6422 }
6423
6424 next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
6425 gap_count = 1;
6426 }
6427 }
6428 }
6429
6430 /* Function vect_can_force_dr_alignment_p.
6431 
6432 Returns whether the alignment of a DECL can be forced to be aligned
6433 on an ALIGNMENT-bit boundary. */
6434
6435 bool
6436 vect_can_force_dr_alignment_p (const_tree decl, poly_uint64 alignment)
6437 {
6438 if (!VAR_P (decl))
6439 return false;
6440
6441 if (decl_in_symtab_p (decl)
6442 && !symtab_node::get (decl)->can_increase_alignment_p ())
6443 return false;
6444
6445 if (TREE_STATIC (decl))
6446 return (known_le (alignment,
6447 (unsigned HOST_WIDE_INT) MAX_OFILE_ALIGNMENT));
6448 else
6449 return (known_le (alignment, (unsigned HOST_WIDE_INT) MAX_STACK_ALIGNMENT));
6450 }
6451
6452
6453 /* Return whether the data reference DR_INFO is supported with respect to its
6454 alignment.
6455 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
6456 if it is aligned, i.e., check if it is possible to vectorize it with different
6457 alignment. */
6458
6459 enum dr_alignment_support
6460 vect_supportable_dr_alignment (dr_vec_info *dr_info,
6461 bool check_aligned_accesses)
6462 {
6463 data_reference *dr = dr_info->dr;
6464 stmt_vec_info stmt_info = dr_info->stmt;
6465 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6466 machine_mode mode = TYPE_MODE (vectype);
6467 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6468 class loop *vect_loop = NULL;
6469 bool nested_in_vect_loop = false;
6470
6471 if (aligned_access_p (dr_info) && !check_aligned_accesses)
6472 return dr_aligned;
6473
6474 /* For now assume all conditional loads/stores support unaligned
6475 access without any special code. */
6476 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
6477 if (gimple_call_internal_p (stmt)
6478 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
6479 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
6480 return dr_unaligned_supported;
6481
6482 if (loop_vinfo)
6483 {
6484 vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
6485 nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt_info);
6486 }
6487
6488 /* Possibly unaligned access. */
6489
6490 /* We can choose between using the implicit realignment scheme (generating
6491 a misaligned_move stmt) and the explicit realignment scheme (generating
6492 aligned loads with a REALIGN_LOAD). There are two variants to the
6493 explicit realignment scheme: optimized, and unoptimized.
6494 We can optimize the realignment only if the step between consecutive
6495 vector loads is equal to the vector size. Since the vector memory
6496 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
6497 is guaranteed that the misalignment amount remains the same throughout the
6498 execution of the vectorized loop. Therefore, we can create the
6499 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
6500 at the loop preheader.
6501
6502 However, in the case of outer-loop vectorization, when vectorizing a
6503 memory access in the inner-loop nested within the LOOP that is now being
6504 vectorized, while it is guaranteed that the misalignment of the
6505 vectorized memory access will remain the same in different outer-loop
6506 iterations, it is *not* guaranteed that it will remain the same throughout
6507 the execution of the inner-loop. This is because the inner-loop advances
6508 with the original scalar step (and not in steps of VS). If the inner-loop
6509 step happens to be a multiple of VS, then the misalignment remains fixed
6510 and we can use the optimized realignment scheme. For example:
6511
6512 for (i=0; i<N; i++)
6513 for (j=0; j<M; j++)
6514 s += a[i+j];
6515
6516 When vectorizing the i-loop in the above example, the step between
6517 consecutive vector loads is 1, and so the misalignment does not remain
6518 fixed across the execution of the inner-loop, and the realignment cannot
6519 be optimized (as illustrated in the following pseudo vectorized loop):
6520
6521 for (i=0; i<N; i+=4)
6522 for (j=0; j<M; j++){
6523 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
6524 // when j is {0,1,2,3,4,5,6,7,...} respectively.
6525 // (assuming that we start from an aligned address).
6526 }
6527
6528 We therefore have to use the unoptimized realignment scheme:
6529
6530 for (i=0; i<N; i+=4)
6531 for (j=k; j<M; j+=4)
6532 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
6533 // that the misalignment of the initial address is
6534 // 0).
6535
6536 The loop can then be vectorized as follows:
6537
6538 for (k=0; k<4; k++){
6539 rt = get_realignment_token (&vp[k]);
6540 for (i=0; i<N; i+=4){
6541 v1 = vp[i+k];
6542 for (j=k; j<M; j+=4){
6543 v2 = vp[i+j+VS-1];
6544 va = REALIGN_LOAD <v1,v2,rt>;
6545 vs += va;
6546 v1 = v2;
6547 }
6548 }
6549 } */
6550
6551 if (DR_IS_READ (dr))
6552 {
6553 bool is_packed = false;
6554 tree type = (TREE_TYPE (DR_REF (dr)));
6555
6556 if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
6557 && (!targetm.vectorize.builtin_mask_for_load
6558 || targetm.vectorize.builtin_mask_for_load ()))
6559 {
6560 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6561
6562 /* If we are doing SLP then the accesses need not have the
6563 same alignment, instead it depends on the SLP group size. */
6564 if (loop_vinfo
6565 && STMT_SLP_TYPE (stmt_info)
6566 && !multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6567 * (DR_GROUP_SIZE
6568 (DR_GROUP_FIRST_ELEMENT (stmt_info))),
6569 TYPE_VECTOR_SUBPARTS (vectype)))
6570 ;
6571 else if (!loop_vinfo
6572 || (nested_in_vect_loop
6573 && maybe_ne (TREE_INT_CST_LOW (DR_STEP (dr)),
6574 GET_MODE_SIZE (TYPE_MODE (vectype)))))
6575 return dr_explicit_realign;
6576 else
6577 return dr_explicit_realign_optimized;
6578 }
6579 if (!known_alignment_for_access_p (dr_info))
6580 is_packed = not_size_aligned (DR_REF (dr));
6581
6582 if (targetm.vectorize.support_vector_misalignment
6583 (mode, type, DR_MISALIGNMENT (dr_info), is_packed))
6584 /* Can't software pipeline the loads, but can at least do them. */
6585 return dr_unaligned_supported;
6586 }
6587 else
6588 {
6589 bool is_packed = false;
6590 tree type = (TREE_TYPE (DR_REF (dr)));
6591
6592 if (!known_alignment_for_access_p (dr_info))
6593 is_packed = not_size_aligned (DR_REF (dr));
6594
6595 if (targetm.vectorize.support_vector_misalignment
6596 (mode, type, DR_MISALIGNMENT (dr_info), is_packed))
6597 return dr_unaligned_supported;
6598 }
6599
6600 /* Unsupported. */
6601 return dr_unaligned_unsupported;
6602 }
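
/* A minimal stand-alone sketch (not compiled as part of GCC) of the
   misalignment behaviour discussed in the comment inside
   vect_supportable_dr_alignment: with an inner-loop step of one scalar
   element the misalignment of the access cycles through 0 .. VS-1, so
   only the unoptimized realignment scheme applies, while a step that
   is a multiple of VS keeps it fixed.  VS and the step values are
   illustrative assumptions; misalignment is measured in elements here
   for simplicity.  */
#if 0
#include <stdio.h>

#define VS 4

int
main (void)
{
  int steps[2] = { 1, VS };
  for (int s = 0; s < 2; s++)
    {
      printf ("inner step %d:", steps[s]);
      for (int j = 0; j < 2 * VS; j += steps[s])
        printf (" %d", j % VS);		/* Misalignment of a[i + j].  */
      printf ("\n");
    }
  return 0;
}
#endif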