gcc/tree-vect-data-refs.c
1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2 Copyright (C) 2003-2019 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "predict.h"
31 #include "memmodel.h"
32 #include "tm_p.h"
33 #include "ssa.h"
34 #include "optabs-tree.h"
35 #include "cgraph.h"
36 #include "dumpfile.h"
37 #include "alias.h"
38 #include "fold-const.h"
39 #include "stor-layout.h"
40 #include "tree-eh.h"
41 #include "gimplify.h"
42 #include "gimple-iterator.h"
43 #include "gimplify-me.h"
44 #include "tree-ssa-loop-ivopts.h"
45 #include "tree-ssa-loop-manip.h"
46 #include "tree-ssa-loop.h"
47 #include "cfgloop.h"
48 #include "tree-scalar-evolution.h"
49 #include "tree-vectorizer.h"
50 #include "expr.h"
51 #include "builtins.h"
52 #include "tree-cfg.h"
53 #include "tree-hash-traits.h"
54 #include "vec-perm-indices.h"
55 #include "internal-fn.h"
56
57 /* Return true if load- or store-lanes optab OPTAB is implemented for
58 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
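/* For example, with VECTYPE V4SI and COUNT 2 the array mode must hold
   256 bits; if the target provides no dedicated array mode we fall back
   to an integer mode of that width (OImode, say), provided the target
   allows it, and then require OPTAB to have a handler for that
   <array mode, vector mode> pair.  */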
59
60 static bool
61 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
62 tree vectype, unsigned HOST_WIDE_INT count)
63 {
64 machine_mode mode, array_mode;
65 bool limit_p;
66
67 mode = TYPE_MODE (vectype);
68 if (!targetm.array_mode (mode, count).exists (&array_mode))
69 {
70 poly_uint64 bits = count * GET_MODE_BITSIZE (mode);
71 limit_p = !targetm.array_mode_supported_p (mode, count);
72 if (!int_mode_for_size (bits, limit_p).exists (&array_mode))
73 {
74 if (dump_enabled_p ())
75 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
76 "no array mode for %s[%wu]\n",
77 GET_MODE_NAME (mode), count);
78 return false;
79 }
80 }
81
82 if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
83 {
84 if (dump_enabled_p ())
85 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
86 "cannot use %s<%s><%s>\n", name,
87 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
88 return false;
89 }
90
91 if (dump_enabled_p ())
92 dump_printf_loc (MSG_NOTE, vect_location,
93 "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
94 GET_MODE_NAME (mode));
95
96 return true;
97 }
98
99
100 /* Return the smallest scalar part of STMT_INFO.
101 This is used to determine the vectype of the stmt. We generally set the
102 vectype according to the type of the result (lhs). For stmts whose
103 result-type is different than the type of the arguments (e.g., demotion,
104 promotion), vectype will be reset appropriately (later). Note that we have
105 to visit the smallest datatype in this function, because that determines the
106 VF. If the smallest datatype in the loop is present only as the rhs of a
107 promotion operation - we'd miss it.
108 Such a case, where a variable of this datatype does not appear in the lhs
109 anywhere in the loop, can only occur if it's an invariant: e.g.:
110 'int_x = (int) short_inv', which we'd expect to have been optimized away by
111 invariant motion. However, we cannot rely on invariant motion to always
112 take invariants out of the loop, and so in the case of promotion we also
113 have to check the rhs.
114 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
115 types. */
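/* For example, for a widening statement such as
   'int_x = (int) short_y * (int) short_z' represented as a
   WIDEN_MULT_EXPR, the result type is int but the smallest scalar type
   is short, and it is short that should determine the VF.  */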
116
117 tree
118 vect_get_smallest_scalar_type (stmt_vec_info stmt_info,
119 HOST_WIDE_INT *lhs_size_unit,
120 HOST_WIDE_INT *rhs_size_unit)
121 {
122 tree scalar_type = gimple_expr_type (stmt_info->stmt);
123 HOST_WIDE_INT lhs, rhs;
124
125 /* During the analysis phase, this function is called on arbitrary
126 statements that might not have scalar results. */
127 if (!tree_fits_uhwi_p (TYPE_SIZE_UNIT (scalar_type)))
128 return scalar_type;
129
130 lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
131
132 gassign *assign = dyn_cast <gassign *> (stmt_info->stmt);
133 if (assign
134 && (gimple_assign_cast_p (assign)
135 || gimple_assign_rhs_code (assign) == DOT_PROD_EXPR
136 || gimple_assign_rhs_code (assign) == WIDEN_SUM_EXPR
137 || gimple_assign_rhs_code (assign) == WIDEN_MULT_EXPR
138 || gimple_assign_rhs_code (assign) == WIDEN_LSHIFT_EXPR
139 || gimple_assign_rhs_code (assign) == FLOAT_EXPR))
140 {
141 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (assign));
142
143 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
144 if (rhs < lhs)
145 scalar_type = rhs_type;
146 }
147 else if (gcall *call = dyn_cast <gcall *> (stmt_info->stmt))
148 {
149 unsigned int i = 0;
150 if (gimple_call_internal_p (call))
151 {
152 internal_fn ifn = gimple_call_internal_fn (call);
153 if (internal_load_fn_p (ifn) || internal_store_fn_p (ifn))
154 /* gimple_expr_type already picked the type of the loaded
155 or stored data. */
156 i = ~0U;
157 else if (internal_fn_mask_index (ifn) == 0)
158 i = 1;
159 }
160 if (i < gimple_call_num_args (call))
161 {
162 tree rhs_type = TREE_TYPE (gimple_call_arg (call, i));
163 if (tree_fits_uhwi_p (TYPE_SIZE_UNIT (rhs_type)))
164 {
165 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
166 if (rhs < lhs)
167 scalar_type = rhs_type;
168 }
169 }
170 }
171
172 *lhs_size_unit = lhs;
173 *rhs_size_unit = rhs;
174 return scalar_type;
175 }
176
177
178 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
179 tested at run-time. Return TRUE if DDR was successfully inserted.
180 Return false if versioning is not supported. */
181
182 static opt_result
183 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
184 {
185 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
186
187 if ((unsigned) param_vect_max_version_for_alias_checks == 0)
188 return opt_result::failure_at (vect_location,
189 "will not create alias checks, as"
190 " --param vect-max-version-for-alias-checks"
191 " == 0\n");
192
193 opt_result res
194 = runtime_alias_check_p (ddr, loop,
195 optimize_loop_nest_for_speed_p (loop));
196 if (!res)
197 return res;
198
199 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
200 return opt_result::success ();
201 }
202
203 /* Record that loop LOOP_VINFO needs to check that VALUE is nonzero. */
204
205 static void
206 vect_check_nonzero_value (loop_vec_info loop_vinfo, tree value)
207 {
208 vec<tree> checks = LOOP_VINFO_CHECK_NONZERO (loop_vinfo);
209 for (unsigned int i = 0; i < checks.length(); ++i)
210 if (checks[i] == value)
211 return;
212
213 if (dump_enabled_p ())
214 dump_printf_loc (MSG_NOTE, vect_location,
215 "need run-time check that %T is nonzero\n",
216 value);
217 LOOP_VINFO_CHECK_NONZERO (loop_vinfo).safe_push (value);
218 }
219
220 /* Return true if we know that the order of vectorized DR_INFO_A and
221 vectorized DR_INFO_B will be the same as the order of DR_INFO_A and
222 DR_INFO_B. At least one of the accesses is a write. */
223
224 static bool
225 vect_preserves_scalar_order_p (dr_vec_info *dr_info_a, dr_vec_info *dr_info_b)
226 {
227 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
228 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
229
230 /* Single statements are always kept in their original order. */
231 if (!STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
232 && !STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
233 return true;
234
235 /* STMT_A and STMT_B belong to overlapping groups. All loads in a
236 SLP group are emitted at the position of the last scalar load and
237 all loads in an interleaving group are emitted at the position
238 of the first scalar load.
239 Stores in a group are emitted at the position of the last scalar store.
240 Compute that position and check whether the resulting order matches
241 the current one.
242 We have not yet decided between SLP and interleaving so we have
243 to conservatively assume both. */
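  /* Below, LAST_A is the latest scalar statement of STMT_A's group
     (the position used for SLP loads and for all stores), while IL_A is
     the position an interleaving scheme would use: the first scalar
     load of a load group, and again the last scalar store of a store
     group.  Likewise for LAST_B and IL_B.  */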
244 stmt_vec_info il_a;
245 stmt_vec_info last_a = il_a = DR_GROUP_FIRST_ELEMENT (stmtinfo_a);
246 if (last_a)
247 {
248 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (last_a); s;
249 s = DR_GROUP_NEXT_ELEMENT (s))
250 last_a = get_later_stmt (last_a, s);
251 if (!DR_IS_WRITE (STMT_VINFO_DATA_REF (stmtinfo_a)))
252 {
253 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_a); s;
254 s = DR_GROUP_NEXT_ELEMENT (s))
255 if (get_later_stmt (il_a, s) == il_a)
256 il_a = s;
257 }
258 else
259 il_a = last_a;
260 }
261 else
262 last_a = il_a = stmtinfo_a;
263 stmt_vec_info il_b;
264 stmt_vec_info last_b = il_b = DR_GROUP_FIRST_ELEMENT (stmtinfo_b);
265 if (last_b)
266 {
267 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (last_b); s;
268 s = DR_GROUP_NEXT_ELEMENT (s))
269 last_b = get_later_stmt (last_b, s);
270 if (!DR_IS_WRITE (STMT_VINFO_DATA_REF (stmtinfo_b)))
271 {
272 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_b); s;
273 s = DR_GROUP_NEXT_ELEMENT (s))
274 if (get_later_stmt (il_b, s) == il_b)
275 il_b = s;
276 }
277 else
278 il_b = last_b;
279 }
280 else
281 last_b = il_b = stmtinfo_b;
282 bool a_after_b = (get_later_stmt (stmtinfo_a, stmtinfo_b) == stmtinfo_a);
283 return (/* SLP */
284 (get_later_stmt (last_a, last_b) == last_a) == a_after_b
285 /* Interleaving */
286 && (get_later_stmt (il_a, il_b) == il_a) == a_after_b
287 /* Mixed */
288 && (get_later_stmt (il_a, last_b) == il_a) == a_after_b
289 && (get_later_stmt (last_a, il_b) == last_a) == a_after_b);
290 }
291
292 /* A subroutine of vect_analyze_data_ref_dependence. Handle
293 DDR_COULD_BE_INDEPENDENT_P ddr DDR that has a known set of dependence
294 distances. These distances are conservatively correct but they don't
295 reflect a guaranteed dependence.
296
297 Return true if this function does all the work necessary to avoid
298 an alias or false if the caller should use the dependence distances
299 to limit the vectorization factor in the usual way. LOOP_DEPTH is
300 the depth of the loop described by LOOP_VINFO and the other arguments
301 are as for vect_analyze_data_ref_dependence. */
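/* For example, if A[i] and B[i - 4] access what might be the same array
   through two different bases and the conservative dependence distance
   is 4, it is usually better to emit a runtime alias check than to cap
   the VF at 4, because the bases may well turn out to be independent.  */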
302
303 static bool
304 vect_analyze_possibly_independent_ddr (data_dependence_relation *ddr,
305 loop_vec_info loop_vinfo,
306 int loop_depth, unsigned int *max_vf)
307 {
308 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
309 lambda_vector dist_v;
310 unsigned int i;
311 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
312 {
313 int dist = dist_v[loop_depth];
314 if (dist != 0 && !(dist > 0 && DDR_REVERSED_P (ddr)))
315 {
316 /* If the user asserted safelen >= DIST consecutive iterations
317 can be executed concurrently, assume independence.
318
319 ??? An alternative would be to add the alias check even
320 in this case, and vectorize the fallback loop with the
321 maximum VF set to safelen. However, if the user has
322 explicitly given a length, it's less likely that that
323 would be a win. */
324 if (loop->safelen >= 2 && abs_hwi (dist) <= loop->safelen)
325 {
326 if ((unsigned int) loop->safelen < *max_vf)
327 *max_vf = loop->safelen;
328 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
329 continue;
330 }
331
332 /* For dependence distances of 2 or more, we have the option
333 of limiting VF or checking for an alias at runtime.
334 Prefer to check at runtime if we can, to avoid limiting
335 the VF unnecessarily when the bases are in fact independent.
336
337 Note that the alias checks will be removed if the VF ends up
338 being small enough. */
339 dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (DDR_A (ddr));
340 dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (DDR_B (ddr));
341 return (!STMT_VINFO_GATHER_SCATTER_P (dr_info_a->stmt)
342 && !STMT_VINFO_GATHER_SCATTER_P (dr_info_b->stmt)
343 && vect_mark_for_runtime_alias_test (ddr, loop_vinfo));
344 }
345 }
346 return true;
347 }
348
349
350 /* Function vect_analyze_data_ref_dependence.
351
352 FIXME: I needed to change the sense of the returned flag.
353
354 Return FALSE if there (might) exist a dependence between a memory-reference
355 DRA and a memory-reference DRB. When versioning for alias may check a
356 dependence at run-time, return TRUE. Adjust *MAX_VF according to
357 the data dependence. */
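/* For example, in

     for (i = 0; i < n; i++)
       a[i + 3] = a[i] + 1;

   the dependence distance is 3, so *MAX_VF is capped at 3.  A distance
   of *MAX_VF or more needs no action, and a distance of zero
   essentially requires that the scalar order of the two accesses is
   preserved.  */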
358
359 static opt_result
360 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
361 loop_vec_info loop_vinfo,
362 unsigned int *max_vf)
363 {
364 unsigned int i;
365 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
366 struct data_reference *dra = DDR_A (ddr);
367 struct data_reference *drb = DDR_B (ddr);
368 dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (dra);
369 dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (drb);
370 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
371 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
372 lambda_vector dist_v;
373 unsigned int loop_depth;
374
375 /* In loop analysis all data references should be vectorizable. */
376 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
377 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
378 gcc_unreachable ();
379
380 /* Independent data accesses. */
381 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
382 return opt_result::success ();
383
384 if (dra == drb
385 || (DR_IS_READ (dra) && DR_IS_READ (drb)))
386 return opt_result::success ();
387
388 /* We do not have to consider dependences between accesses that belong
389 to the same group, unless the stride could be smaller than the
390 group size. */
391 if (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
392 && (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
393 == DR_GROUP_FIRST_ELEMENT (stmtinfo_b))
394 && !STMT_VINFO_STRIDED_P (stmtinfo_a))
395 return opt_result::success ();
396
397 /* Even if we have an anti-dependence then, as the vectorized loop covers at
398 least two scalar iterations, there is always also a true dependence.
399 As the vectorizer does not re-order loads and stores we can ignore
400 the anti-dependence if TBAA can disambiguate both DRs similar to the
401 case with known negative distance anti-dependences (positive
402 distance anti-dependences would violate TBAA constraints). */
403 if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
404 || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
405 && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
406 get_alias_set (DR_REF (drb))))
407 return opt_result::success ();
408
409 /* Unknown data dependence. */
410 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
411 {
412 /* If user asserted safelen consecutive iterations can be
413 executed concurrently, assume independence. */
414 if (loop->safelen >= 2)
415 {
416 if ((unsigned int) loop->safelen < *max_vf)
417 *max_vf = loop->safelen;
418 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
419 return opt_result::success ();
420 }
421
422 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
423 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
424 return opt_result::failure_at
425 (stmtinfo_a->stmt,
426 "versioning for alias not supported for: "
427 "can't determine dependence between %T and %T\n",
428 DR_REF (dra), DR_REF (drb));
429
430 if (dump_enabled_p ())
431 dump_printf_loc (MSG_MISSED_OPTIMIZATION, stmtinfo_a->stmt,
432 "versioning for alias required: "
433 "can't determine dependence between %T and %T\n",
434 DR_REF (dra), DR_REF (drb));
435
436 /* Add to list of ddrs that need to be tested at run-time. */
437 return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
438 }
439
440 /* Known data dependence. */
441 if (DDR_NUM_DIST_VECTS (ddr) == 0)
442 {
443 /* If user asserted safelen consecutive iterations can be
444 executed concurrently, assume independence. */
445 if (loop->safelen >= 2)
446 {
447 if ((unsigned int) loop->safelen < *max_vf)
448 *max_vf = loop->safelen;
449 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
450 return opt_result::success ();
451 }
452
453 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
454 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
455 return opt_result::failure_at
456 (stmtinfo_a->stmt,
457 "versioning for alias not supported for: "
458 "bad dist vector for %T and %T\n",
459 DR_REF (dra), DR_REF (drb));
460
461 if (dump_enabled_p ())
462 dump_printf_loc (MSG_MISSED_OPTIMIZATION, stmtinfo_a->stmt,
463 "versioning for alias required: "
464 "bad dist vector for %T and %T\n",
465 DR_REF (dra), DR_REF (drb));
466 /* Add to list of ddrs that need to be tested at run-time. */
467 return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
468 }
469
470 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
471
472 if (DDR_COULD_BE_INDEPENDENT_P (ddr)
473 && vect_analyze_possibly_independent_ddr (ddr, loop_vinfo,
474 loop_depth, max_vf))
475 return opt_result::success ();
476
477 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
478 {
479 int dist = dist_v[loop_depth];
480
481 if (dump_enabled_p ())
482 dump_printf_loc (MSG_NOTE, vect_location,
483 "dependence distance = %d.\n", dist);
484
485 if (dist == 0)
486 {
487 if (dump_enabled_p ())
488 dump_printf_loc (MSG_NOTE, vect_location,
489 "dependence distance == 0 between %T and %T\n",
490 DR_REF (dra), DR_REF (drb));
491
492 /* When we perform grouped accesses and perform implicit CSE
493 by detecting equal accesses and doing disambiguation with
494 runtime alias tests like for
495 .. = a[i];
496 .. = a[i+1];
497 a[i] = ..;
498 a[i+1] = ..;
499 *p = ..;
500 .. = a[i];
501 .. = a[i+1];
502 where we will end up loading { a[i], a[i+1] } once, make
503 sure that inserting group loads before the first load and
504 stores after the last store will do the right thing.
505 Similar for groups like
506 a[i] = ...;
507 ... = a[i];
508 a[i+1] = ...;
509 where loads from the group interleave with the store. */
510 if (!vect_preserves_scalar_order_p (dr_info_a, dr_info_b))
511 return opt_result::failure_at (stmtinfo_a->stmt,
512 "READ_WRITE dependence"
513 " in interleaving.\n");
514
515 if (loop->safelen < 2)
516 {
517 tree indicator = dr_zero_step_indicator (dra);
518 if (!indicator || integer_zerop (indicator))
519 return opt_result::failure_at (stmtinfo_a->stmt,
520 "access also has a zero step\n");
521 else if (TREE_CODE (indicator) != INTEGER_CST)
522 vect_check_nonzero_value (loop_vinfo, indicator);
523 }
524 continue;
525 }
526
527 if (dist > 0 && DDR_REVERSED_P (ddr))
528 {
529 /* If DDR_REVERSED_P the order of the data-refs in DDR was
530 reversed (to make distance vector positive), and the actual
531 distance is negative. */
532 if (dump_enabled_p ())
533 dump_printf_loc (MSG_NOTE, vect_location,
534 "dependence distance negative.\n");
535 /* When doing outer loop vectorization, we need to check if there is
536 a backward dependence at the inner loop level if the dependence
537 at the outer loop is reversed. See PR81740. */
538 if (nested_in_vect_loop_p (loop, stmtinfo_a)
539 || nested_in_vect_loop_p (loop, stmtinfo_b))
540 {
541 unsigned inner_depth = index_in_loop_nest (loop->inner->num,
542 DDR_LOOP_NEST (ddr));
543 if (dist_v[inner_depth] < 0)
544 return opt_result::failure_at (stmtinfo_a->stmt,
545 "not vectorized, dependence "
546 "between data-refs %T and %T\n",
547 DR_REF (dra), DR_REF (drb));
548 }
549 /* Record a negative dependence distance to later limit the
550 amount of stmt copying / unrolling we can perform.
551 Only need to handle read-after-write dependence. */
552 if (DR_IS_READ (drb)
553 && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
554 || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
555 STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
556 continue;
557 }
558
559 unsigned int abs_dist = abs (dist);
560 if (abs_dist >= 2 && abs_dist < *max_vf)
561 {
562 /* The dependence distance requires reduction of the maximal
563 vectorization factor. */
564 *max_vf = abs_dist;
565 if (dump_enabled_p ())
566 dump_printf_loc (MSG_NOTE, vect_location,
567 "adjusting maximal vectorization factor to %i\n",
568 *max_vf);
569 }
570
571 if (abs_dist >= *max_vf)
572 {
573 /* Dependence distance does not create dependence, as far as
574 vectorization is concerned, in this case. */
575 if (dump_enabled_p ())
576 dump_printf_loc (MSG_NOTE, vect_location,
577 "dependence distance >= VF.\n");
578 continue;
579 }
580
581 return opt_result::failure_at (stmtinfo_a->stmt,
582 "not vectorized, possible dependence "
583 "between data-refs %T and %T\n",
584 DR_REF (dra), DR_REF (drb));
585 }
586
587 return opt_result::success ();
588 }
589
590 /* Function vect_analyze_data_ref_dependences.
591
592 Examine all the data references in the loop, and make sure there do not
593 exist any data dependences between them. Set *MAX_VF according to
594 the maximum vectorization factor the data dependences allow. */
595
596 opt_result
597 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo,
598 unsigned int *max_vf)
599 {
600 unsigned int i;
601 struct data_dependence_relation *ddr;
602
603 DUMP_VECT_SCOPE ("vect_analyze_data_ref_dependences");
604
605 if (!LOOP_VINFO_DDRS (loop_vinfo).exists ())
606 {
607 LOOP_VINFO_DDRS (loop_vinfo)
608 .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
609 * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
610 /* We need read-read dependences to compute
611 STMT_VINFO_SAME_ALIGN_REFS. */
612 bool res = compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
613 &LOOP_VINFO_DDRS (loop_vinfo),
614 LOOP_VINFO_LOOP_NEST (loop_vinfo),
615 true);
616 gcc_assert (res);
617 }
618
619 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
620
621 /* For epilogues we either have no aliases or alias versioning
622 was applied to original loop. Therefore we may just get max_vf
623 using VF of original loop. */
624 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
625 *max_vf = LOOP_VINFO_ORIG_MAX_VECT_FACTOR (loop_vinfo);
626 else
627 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
628 {
629 opt_result res
630 = vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf);
631 if (!res)
632 return res;
633 }
634
635 return opt_result::success ();
636 }
637
638
639 /* Function vect_slp_analyze_data_ref_dependence.
640
641 Return TRUE if there (might) exist a dependence between a memory-reference
642 DRA and a memory-reference DRB for VINFO. When versioning for alias
643 may check a dependence at run-time, return FALSE. */
644
645
646 static bool
647 vect_slp_analyze_data_ref_dependence (vec_info *vinfo,
648 struct data_dependence_relation *ddr)
649 {
650 struct data_reference *dra = DDR_A (ddr);
651 struct data_reference *drb = DDR_B (ddr);
652 dr_vec_info *dr_info_a = vinfo->lookup_dr (dra);
653 dr_vec_info *dr_info_b = vinfo->lookup_dr (drb);
654
655 /* We need to check dependences of statements marked as unvectorizable
656 as well, they still can prohibit vectorization. */
657
658 /* Independent data accesses. */
659 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
660 return false;
661
662 if (dra == drb)
663 return false;
664
665 /* Read-read is OK. */
666 if (DR_IS_READ (dra) && DR_IS_READ (drb))
667 return false;
668
669 /* If dra and drb are part of the same interleaving chain consider
670 them independent. */
671 if (STMT_VINFO_GROUPED_ACCESS (dr_info_a->stmt)
672 && (DR_GROUP_FIRST_ELEMENT (dr_info_a->stmt)
673 == DR_GROUP_FIRST_ELEMENT (dr_info_b->stmt)))
674 return false;
675
676 /* Unknown data dependence. */
677 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
678 {
679 if (dump_enabled_p ())
680 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
681 "can't determine dependence between %T and %T\n",
682 DR_REF (dra), DR_REF (drb));
683 }
684 else if (dump_enabled_p ())
685 dump_printf_loc (MSG_NOTE, vect_location,
686 "determined dependence between %T and %T\n",
687 DR_REF (dra), DR_REF (drb));
688
689 return true;
690 }
691
692
693 /* Analyze dependences involved in the transform of SLP NODE. STORES
694 contain the vector of scalar stores of this instance if we are
695 disambiguating the loads. */
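/* Each intervening statement between a scalar access of NODE and the
   last scalar access of the group is checked in one of three ways:
   through a data dependence relation if it has a recorded data
   reference, through the alias oracle if it has not, or, if it is a
   store belonging to this same instance, by deferring the test to the
   position of LAST_STORE_INFO, where the store group itself will be
   sunk.  */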
696
697 static bool
698 vect_slp_analyze_node_dependences (slp_instance instance, slp_tree node,
699 vec<stmt_vec_info> stores,
700 stmt_vec_info last_store_info)
701 {
702 /* This walks over all stmts involved in the SLP load/store done
703 in NODE verifying we can sink them up to the last stmt in the
704 group. */
705 stmt_vec_info last_access_info = vect_find_last_scalar_stmt_in_slp (node);
706 vec_info *vinfo = last_access_info->vinfo;
707 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
708 {
709 stmt_vec_info access_info = SLP_TREE_SCALAR_STMTS (node)[k];
710 if (access_info == last_access_info)
711 continue;
712 data_reference *dr_a = STMT_VINFO_DATA_REF (access_info);
713 ao_ref ref;
714 bool ref_initialized_p = false;
715 for (gimple_stmt_iterator gsi = gsi_for_stmt (access_info->stmt);
716 gsi_stmt (gsi) != last_access_info->stmt; gsi_next (&gsi))
717 {
718 gimple *stmt = gsi_stmt (gsi);
719 if (! gimple_vuse (stmt)
720 || (DR_IS_READ (dr_a) && ! gimple_vdef (stmt)))
721 continue;
722
723 /* If we couldn't record a (single) data reference for this
724 stmt we have to resort to the alias oracle. */
725 stmt_vec_info stmt_info = vinfo->lookup_stmt (stmt);
726 data_reference *dr_b = STMT_VINFO_DATA_REF (stmt_info);
727 if (!dr_b)
728 {
729 /* We are moving a store or sinking a load - this means
730 we cannot use TBAA for disambiguation. */
731 if (!ref_initialized_p)
732 ao_ref_init (&ref, DR_REF (dr_a));
733 if (stmt_may_clobber_ref_p_1 (stmt, &ref, false)
734 || ref_maybe_used_by_stmt_p (stmt, &ref, false))
735 return false;
736 continue;
737 }
738
739 bool dependent = false;
740 /* If we run into a store of this same instance (we've just
741 marked those) then delay dependence checking until we run
742 into the last store because this is where it will have
743 been sunk to (and we verify if we can do that as well). */
744 if (gimple_visited_p (stmt))
745 {
746 if (stmt_info != last_store_info)
747 continue;
748 unsigned i;
749 stmt_vec_info store_info;
750 FOR_EACH_VEC_ELT (stores, i, store_info)
751 {
752 data_reference *store_dr = STMT_VINFO_DATA_REF (store_info);
753 ddr_p ddr = initialize_data_dependence_relation
754 (dr_a, store_dr, vNULL);
755 dependent
756 = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
757 free_dependence_relation (ddr);
758 if (dependent)
759 break;
760 }
761 }
762 else
763 {
764 ddr_p ddr = initialize_data_dependence_relation (dr_a,
765 dr_b, vNULL);
766 dependent = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
767 free_dependence_relation (ddr);
768 }
769 if (dependent)
770 return false;
771 }
772 }
773 return true;
774 }
775
776
777 /* Function vect_slp_analyze_instance_dependence.
778
779 Examine all the data references in the basic-block, and make sure there
780 do not exist any data dependences between them. */
781
782
783 bool
784 vect_slp_analyze_instance_dependence (slp_instance instance)
785 {
786 DUMP_VECT_SCOPE ("vect_slp_analyze_instance_dependence");
787
788 /* The stores of this instance are at the root of the SLP tree. */
789 slp_tree store = SLP_INSTANCE_TREE (instance);
790 if (! STMT_VINFO_DATA_REF (SLP_TREE_SCALAR_STMTS (store)[0]))
791 store = NULL;
792
793 /* Verify we can sink stores to the vectorized stmt insert location. */
794 stmt_vec_info last_store_info = NULL;
795 if (store)
796 {
797 if (! vect_slp_analyze_node_dependences (instance, store, vNULL, NULL))
798 return false;
799
800 /* Mark stores in this instance and remember the last one. */
801 last_store_info = vect_find_last_scalar_stmt_in_slp (store);
802 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
803 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, true);
804 }
805
806 bool res = true;
807
808 /* Verify we can sink loads to the vectorized stmt insert location,
809 special-casing stores of this instance. */
810 slp_tree load;
811 unsigned int i;
812 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load)
813 if (! vect_slp_analyze_node_dependences (instance, load,
814 store
815 ? SLP_TREE_SCALAR_STMTS (store)
816 : vNULL, last_store_info))
817 {
818 res = false;
819 break;
820 }
821
822 /* Unset the visited flag. */
823 if (store)
824 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
825 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, false);
826
827 return res;
828 }
829
830 /* Record the base alignment guarantee given by DRB, which occurs
831 in STMT_INFO. */
832
833 static void
834 vect_record_base_alignment (stmt_vec_info stmt_info,
835 innermost_loop_behavior *drb)
836 {
837 vec_info *vinfo = stmt_info->vinfo;
838 bool existed;
839 innermost_loop_behavior *&entry
840 = vinfo->base_alignments.get_or_insert (drb->base_address, &existed);
841 if (!existed || entry->base_alignment < drb->base_alignment)
842 {
843 entry = drb;
844 if (dump_enabled_p ())
845 dump_printf_loc (MSG_NOTE, vect_location,
846 "recording new base alignment for %T\n"
847 " alignment: %d\n"
848 " misalignment: %d\n"
849 " based on: %G",
850 drb->base_address,
851 drb->base_alignment,
852 drb->base_misalignment,
853 stmt_info->stmt);
854 }
855 }
856
857 /* If the region we're going to vectorize is reached, all unconditional
858 data references occur at least once. We can therefore pool the base
859 alignment guarantees from each unconditional reference. Do this by
860 going through all the data references in VINFO and checking whether
861 the containing statement makes the reference unconditionally. If so,
862 record the alignment of the base address in VINFO so that it can be
863 used for all other references with the same base. */
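/* For example, if the region unconditionally reads both p->a and p->b,
   a base alignment guarantee established for P by one of the two
   references can be reused when computing the misalignment of the
   other.  */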
864
865 void
866 vect_record_base_alignments (vec_info *vinfo)
867 {
868 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
869 class loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
870 data_reference *dr;
871 unsigned int i;
872 FOR_EACH_VEC_ELT (vinfo->shared->datarefs, i, dr)
873 {
874 dr_vec_info *dr_info = vinfo->lookup_dr (dr);
875 stmt_vec_info stmt_info = dr_info->stmt;
876 if (!DR_IS_CONDITIONAL_IN_STMT (dr)
877 && STMT_VINFO_VECTORIZABLE (stmt_info)
878 && !STMT_VINFO_GATHER_SCATTER_P (stmt_info))
879 {
880 vect_record_base_alignment (stmt_info, &DR_INNERMOST (dr));
881
882 /* If DR is nested in the loop that is being vectorized, we can also
883 record the alignment of the base wrt the outer loop. */
884 if (loop && nested_in_vect_loop_p (loop, stmt_info))
885 vect_record_base_alignment
886 (stmt_info, &STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info));
887 }
888 }
889 }
890
891 /* Return the target alignment for the vectorized form of DR_INFO. */
892
893 static poly_uint64
894 vect_calculate_target_alignment (dr_vec_info *dr_info)
895 {
896 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
897 return targetm.vectorize.preferred_vector_alignment (vectype);
898 }
899
900 /* Function vect_compute_data_ref_alignment
901
902 Compute the misalignment of the data reference DR_INFO.
903
904 Output:
905 1. DR_MISALIGNMENT (DR_INFO) is defined.
906
907 FOR NOW: No analysis is actually performed. Misalignment is calculated
908 only for trivial cases. TODO. */
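/* For example, with a 16-byte vector alignment, a reference whose base
   is known to be 32-byte aligned and whose constant start offset is 4
   gets DR_MISALIGNMENT 4.  If the offset alignment is too weak, the
   base alignment is too weak and cannot be forced, the step is not an
   INTEGER_CST, or the step does not preserve the misalignment from one
   vector iteration to the next, the misalignment is left as
   DR_MISALIGNMENT_UNKNOWN.  */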
909
910 static void
911 vect_compute_data_ref_alignment (dr_vec_info *dr_info)
912 {
913 stmt_vec_info stmt_info = dr_info->stmt;
914 vec_base_alignments *base_alignments = &stmt_info->vinfo->base_alignments;
915 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
916 class loop *loop = NULL;
917 tree ref = DR_REF (dr_info->dr);
918 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
919
920 if (dump_enabled_p ())
921 dump_printf_loc (MSG_NOTE, vect_location,
922 "vect_compute_data_ref_alignment:\n");
923
924 if (loop_vinfo)
925 loop = LOOP_VINFO_LOOP (loop_vinfo);
926
927 /* Initialize misalignment to unknown. */
928 SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN);
929
930 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
931 return;
932
933 innermost_loop_behavior *drb = vect_dr_behavior (dr_info);
934 bool step_preserves_misalignment_p;
935
936 poly_uint64 vector_alignment
937 = exact_div (vect_calculate_target_alignment (dr_info), BITS_PER_UNIT);
938 DR_TARGET_ALIGNMENT (dr_info) = vector_alignment;
939
940 /* If the main loop has peeled for alignment we have no way of knowing
941 whether the data accesses in the epilogues are aligned. We can't at
942 compile time answer the question whether we have entered the main loop or
943 not. Fixes PR 92351. */
944 if (loop_vinfo)
945 {
946 loop_vec_info orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
947 if (orig_loop_vinfo
948 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (orig_loop_vinfo) != 0)
949 return;
950 }
951
952 unsigned HOST_WIDE_INT vect_align_c;
953 if (!vector_alignment.is_constant (&vect_align_c))
954 return;
955
956 /* No step for BB vectorization. */
957 if (!loop)
958 {
959 gcc_assert (integer_zerop (drb->step));
960 step_preserves_misalignment_p = true;
961 }
962
963 /* In case the dataref is in an inner-loop of the loop that is being
964 vectorized (LOOP), we use the base and misalignment information
965 relative to the outer-loop (LOOP). This is ok only if the misalignment
966 stays the same throughout the execution of the inner-loop, which is why
967 we have to check that the stride of the dataref in the inner-loop evenly
968 divides by the vector alignment. */
969 else if (nested_in_vect_loop_p (loop, stmt_info))
970 {
971 step_preserves_misalignment_p
972 = (DR_STEP_ALIGNMENT (dr_info->dr) % vect_align_c) == 0;
973
974 if (dump_enabled_p ())
975 {
976 if (step_preserves_misalignment_p)
977 dump_printf_loc (MSG_NOTE, vect_location,
978 "inner step divides the vector alignment.\n");
979 else
980 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
981 "inner step doesn't divide the vector"
982 " alignment.\n");
983 }
984 }
985
986 /* Similarly we can only use base and misalignment information relative to
987 an innermost loop if the misalignment stays the same throughout the
988 execution of the loop. As above, this is the case if the stride of
989 the dataref evenly divides by the alignment. */
990 else
991 {
992 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
993 step_preserves_misalignment_p
994 = multiple_p (DR_STEP_ALIGNMENT (dr_info->dr) * vf, vect_align_c);
995
996 if (!step_preserves_misalignment_p && dump_enabled_p ())
997 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
998 "step doesn't divide the vector alignment.\n");
999 }
1000
1001 unsigned int base_alignment = drb->base_alignment;
1002 unsigned int base_misalignment = drb->base_misalignment;
1003
1004 /* Calculate the maximum of the pooled base address alignment and the
1005 alignment that we can compute for DR itself. */
1006 innermost_loop_behavior **entry = base_alignments->get (drb->base_address);
1007 if (entry && base_alignment < (*entry)->base_alignment)
1008 {
1009 base_alignment = (*entry)->base_alignment;
1010 base_misalignment = (*entry)->base_misalignment;
1011 }
1012
1013 if (drb->offset_alignment < vect_align_c
1014 || !step_preserves_misalignment_p
1015 /* We need to know whether the step wrt the vectorized loop is
1016 negative when computing the starting misalignment below. */
1017 || TREE_CODE (drb->step) != INTEGER_CST)
1018 {
1019 if (dump_enabled_p ())
1020 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1021 "Unknown alignment for access: %T\n", ref);
1022 return;
1023 }
1024
1025 if (base_alignment < vect_align_c)
1026 {
1027 unsigned int max_alignment;
1028 tree base = get_base_for_alignment (drb->base_address, &max_alignment);
1029 if (max_alignment < vect_align_c
1030 || !vect_can_force_dr_alignment_p (base,
1031 vect_align_c * BITS_PER_UNIT))
1032 {
1033 if (dump_enabled_p ())
1034 dump_printf_loc (MSG_NOTE, vect_location,
1035 "can't force alignment of ref: %T\n", ref);
1036 return;
1037 }
1038
1039 /* Force the alignment of the decl.
1040 NOTE: This is the only change to the code we make during
1041 the analysis phase, before deciding to vectorize the loop. */
1042 if (dump_enabled_p ())
1043 dump_printf_loc (MSG_NOTE, vect_location,
1044 "force alignment of %T\n", ref);
1045
1046 dr_info->base_decl = base;
1047 dr_info->base_misaligned = true;
1048 base_misalignment = 0;
1049 }
1050 poly_int64 misalignment
1051 = base_misalignment + wi::to_poly_offset (drb->init).force_shwi ();
1052
1053 /* If this is a backward running DR then the first access in the larger
1054 vectype actually is N-1 elements before the address in the DR.
1055 Adjust misalign accordingly. */
1056 if (tree_int_cst_sgn (drb->step) < 0)
1057 /* PLUS because STEP is negative. */
1058 misalignment += ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
1059 * TREE_INT_CST_LOW (drb->step));
1060
1061 unsigned int const_misalignment;
1062 if (!known_misalignment (misalignment, vect_align_c, &const_misalignment))
1063 {
1064 if (dump_enabled_p ())
1065 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1066 "Non-constant misalignment for access: %T\n", ref);
1067 return;
1068 }
1069
1070 SET_DR_MISALIGNMENT (dr_info, const_misalignment);
1071
1072 if (dump_enabled_p ())
1073 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1074 "misalign = %d bytes of ref %T\n",
1075 DR_MISALIGNMENT (dr_info), ref);
1076
1077 return;
1078 }
1079
1080 /* Function vect_update_misalignment_for_peel.
1081 Sets DR_INFO's misalignment
1082 - to 0 if it has the same alignment as DR_PEEL_INFO,
1083 - to the misalignment computed using NPEEL if DR_INFO's misalignment is known,
1084 - to -1 (unknown) otherwise.
1085
1086 DR_INFO - the data reference whose misalignment is to be adjusted.
1087 DR_PEEL_INFO - the data reference whose misalignment is being made
1088 zero in the vector loop by the peel.
1089 NPEEL - the number of iterations in the peel loop if the misalignment
1090 of DR_PEEL_INFO is known at compile time. */
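/* When the misalignments of DR_INFO and DR_PEEL_INFO are known, the new
   value is simply (DR_MISALIGNMENT (DR_INFO) + NPEEL * DR_STEP) modulo
   the target alignment, since every peeled scalar iteration advances
   the access by DR_STEP bytes.  */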
1091
1092 static void
1093 vect_update_misalignment_for_peel (dr_vec_info *dr_info,
1094 dr_vec_info *dr_peel_info, int npeel)
1095 {
1096 unsigned int i;
1097 vec<dr_p> same_aligned_drs;
1098 struct data_reference *current_dr;
1099 stmt_vec_info peel_stmt_info = dr_peel_info->stmt;
1100
1101 /* It can be assumed that if dr_info has the same alignment as dr_peel,
1102 it is aligned in the vector loop. */
1103 same_aligned_drs = STMT_VINFO_SAME_ALIGN_REFS (peel_stmt_info);
1104 FOR_EACH_VEC_ELT (same_aligned_drs, i, current_dr)
1105 {
1106 if (current_dr != dr_info->dr)
1107 continue;
1108 gcc_assert (!known_alignment_for_access_p (dr_info)
1109 || !known_alignment_for_access_p (dr_peel_info)
1110 || (DR_MISALIGNMENT (dr_info)
1111 == DR_MISALIGNMENT (dr_peel_info)));
1112 SET_DR_MISALIGNMENT (dr_info, 0);
1113 return;
1114 }
1115
1116 unsigned HOST_WIDE_INT alignment;
1117 if (DR_TARGET_ALIGNMENT (dr_info).is_constant (&alignment)
1118 && known_alignment_for_access_p (dr_info)
1119 && known_alignment_for_access_p (dr_peel_info))
1120 {
1121 int misal = DR_MISALIGNMENT (dr_info);
1122 misal += npeel * TREE_INT_CST_LOW (DR_STEP (dr_info->dr));
1123 misal &= alignment - 1;
1124 SET_DR_MISALIGNMENT (dr_info, misal);
1125 return;
1126 }
1127
1128 if (dump_enabled_p ())
1129 dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment " \
1130 "to unknown (-1).\n");
1131 SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN);
1132 }
1133
1134
1135 /* Function verify_data_ref_alignment
1136
1137 Return TRUE if DR_INFO can be handled with respect to alignment. */
1138
1139 static opt_result
1140 verify_data_ref_alignment (dr_vec_info *dr_info)
1141 {
1142 enum dr_alignment_support supportable_dr_alignment
1143 = vect_supportable_dr_alignment (dr_info, false);
1144 if (!supportable_dr_alignment)
1145 return opt_result::failure_at
1146 (dr_info->stmt->stmt,
1147 DR_IS_READ (dr_info->dr)
1148 ? "not vectorized: unsupported unaligned load: %T\n"
1149 : "not vectorized: unsupported unaligned store: %T\n",
1150 DR_REF (dr_info->dr));
1151
1152 if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
1153 dump_printf_loc (MSG_NOTE, vect_location,
1154 "Vectorizing an unaligned access.\n");
1155
1156 return opt_result::success ();
1157 }
1158
1159 /* Function vect_verify_datarefs_alignment
1160
1161 Return TRUE if all data references in the loop can be
1162 handled with respect to alignment. */
1163
1164 opt_result
1165 vect_verify_datarefs_alignment (loop_vec_info vinfo)
1166 {
1167 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
1168 struct data_reference *dr;
1169 unsigned int i;
1170
1171 FOR_EACH_VEC_ELT (datarefs, i, dr)
1172 {
1173 dr_vec_info *dr_info = vinfo->lookup_dr (dr);
1174 stmt_vec_info stmt_info = dr_info->stmt;
1175
1176 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1177 continue;
1178
1179 /* For interleaving, only the alignment of the first access matters. */
1180 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1181 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1182 continue;
1183
1184 /* Strided accesses perform only component accesses, alignment is
1185 irrelevant for them. */
1186 if (STMT_VINFO_STRIDED_P (stmt_info)
1187 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1188 continue;
1189
1190 opt_result res = verify_data_ref_alignment (dr_info);
1191 if (!res)
1192 return res;
1193 }
1194
1195 return opt_result::success ();
1196 }
1197
1198 /* Given a memory reference EXP return whether its alignment is less
1199 than its size. */
1200
1201 static bool
1202 not_size_aligned (tree exp)
1203 {
1204 if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
1205 return true;
1206
1207 return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
1208 > get_object_alignment (exp));
1209 }
1210
1211 /* Function vector_alignment_reachable_p
1212
1213 Return true if vector alignment for DR_INFO is reachable by peeling
1214 a few loop iterations. Return false otherwise. */
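/* For example, a non-grouped, unit-stride int access with a 16-byte
   target alignment and DR_MISALIGNMENT 8 becomes aligned after peeling
   two scalar iterations, whereas a misalignment of 6 is not reachable
   because the element size (4) does not divide it.  */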
1215
1216 static bool
1217 vector_alignment_reachable_p (dr_vec_info *dr_info)
1218 {
1219 stmt_vec_info stmt_info = dr_info->stmt;
1220 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1221
1222 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1223 {
1224 /* For interleaved access we peel only if the number of iterations in
1225 the prolog loop (VF - misalignment) is a multiple of the
1226 number of interleaved accesses. */
1227 int elem_size, mis_in_elements;
1228
1229 /* FORNOW: handle only known alignment. */
1230 if (!known_alignment_for_access_p (dr_info))
1231 return false;
1232
1233 poly_uint64 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1234 poly_uint64 vector_size = GET_MODE_SIZE (TYPE_MODE (vectype));
1235 elem_size = vector_element_size (vector_size, nelements);
1236 mis_in_elements = DR_MISALIGNMENT (dr_info) / elem_size;
1237
1238 if (!multiple_p (nelements - mis_in_elements, DR_GROUP_SIZE (stmt_info)))
1239 return false;
1240 }
1241
1242 /* If misalignment is known at the compile time then allow peeling
1243 only if natural alignment is reachable through peeling. */
1244 if (known_alignment_for_access_p (dr_info) && !aligned_access_p (dr_info))
1245 {
1246 HOST_WIDE_INT elmsize =
1247 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1248 if (dump_enabled_p ())
1249 {
1250 dump_printf_loc (MSG_NOTE, vect_location,
1251 "data size = %wd. misalignment = %d.\n", elmsize,
1252 DR_MISALIGNMENT (dr_info));
1253 }
1254 if (DR_MISALIGNMENT (dr_info) % elmsize)
1255 {
1256 if (dump_enabled_p ())
1257 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1258 "data size does not divide the misalignment.\n");
1259 return false;
1260 }
1261 }
1262
1263 if (!known_alignment_for_access_p (dr_info))
1264 {
1265 tree type = TREE_TYPE (DR_REF (dr_info->dr));
1266 bool is_packed = not_size_aligned (DR_REF (dr_info->dr));
1267 if (dump_enabled_p ())
1268 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1269 "Unknown misalignment, %snaturally aligned\n",
1270 is_packed ? "not " : "");
1271 return targetm.vectorize.vector_alignment_reachable (type, is_packed);
1272 }
1273
1274 return true;
1275 }
1276
1277
1278 /* Calculate the cost of the memory access represented by DR_INFO. */
1279
1280 static void
1281 vect_get_data_access_cost (dr_vec_info *dr_info,
1282 unsigned int *inside_cost,
1283 unsigned int *outside_cost,
1284 stmt_vector_for_cost *body_cost_vec,
1285 stmt_vector_for_cost *prologue_cost_vec)
1286 {
1287 stmt_vec_info stmt_info = dr_info->stmt;
1288 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1289 int ncopies;
1290
1291 if (PURE_SLP_STMT (stmt_info))
1292 ncopies = 1;
1293 else
1294 ncopies = vect_get_num_copies (loop_vinfo, STMT_VINFO_VECTYPE (stmt_info));
1295
1296 if (DR_IS_READ (dr_info->dr))
1297 vect_get_load_cost (stmt_info, ncopies, true, inside_cost, outside_cost,
1298 prologue_cost_vec, body_cost_vec, false);
1299 else
1300 vect_get_store_cost (stmt_info, ncopies, inside_cost, body_cost_vec);
1301
1302 if (dump_enabled_p ())
1303 dump_printf_loc (MSG_NOTE, vect_location,
1304 "vect_get_data_access_cost: inside_cost = %d, "
1305 "outside_cost = %d.\n", *inside_cost, *outside_cost);
1306 }
1307
1308
1309 typedef struct _vect_peel_info
1310 {
1311 dr_vec_info *dr_info;
1312 int npeel;
1313 unsigned int count;
1314 } *vect_peel_info;
1315
1316 typedef struct _vect_peel_extended_info
1317 {
1318 struct _vect_peel_info peel_info;
1319 unsigned int inside_cost;
1320 unsigned int outside_cost;
1321 } *vect_peel_extended_info;
1322
1323
1324 /* Peeling hashtable helpers. */
1325
1326 struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
1327 {
1328 static inline hashval_t hash (const _vect_peel_info *);
1329 static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
1330 };
1331
1332 inline hashval_t
1333 peel_info_hasher::hash (const _vect_peel_info *peel_info)
1334 {
1335 return (hashval_t) peel_info->npeel;
1336 }
1337
1338 inline bool
1339 peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
1340 {
1341 return (a->npeel == b->npeel);
1342 }
1343
1344
1345 /* Insert DR_INFO into peeling hash table with NPEEL as key. */
1346
1347 static void
1348 vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
1349 loop_vec_info loop_vinfo, dr_vec_info *dr_info,
1350 int npeel)
1351 {
1352 struct _vect_peel_info elem, *slot;
1353 _vect_peel_info **new_slot;
1354 bool supportable_dr_alignment
1355 = vect_supportable_dr_alignment (dr_info, true);
1356
1357 elem.npeel = npeel;
1358 slot = peeling_htab->find (&elem);
1359 if (slot)
1360 slot->count++;
1361 else
1362 {
1363 slot = XNEW (struct _vect_peel_info);
1364 slot->npeel = npeel;
1365 slot->dr_info = dr_info;
1366 slot->count = 1;
1367 new_slot = peeling_htab->find_slot (slot, INSERT);
1368 *new_slot = slot;
1369 }
1370
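  /* If DR_INFO cannot be vectorized at all at its current misalignment,
     inflate the count so that this peeling amount is strongly preferred
     by the most-frequent traversal under the unlimited cost model.  */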
1371 if (!supportable_dr_alignment
1372 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1373 slot->count += VECT_MAX_COST;
1374 }
1375
1376
1377 /* Traverse peeling hash table to find peeling option that aligns maximum
1378 number of data accesses. */
1379
1380 int
1381 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1382 _vect_peel_extended_info *max)
1383 {
1384 vect_peel_info elem = *slot;
1385
1386 if (elem->count > max->peel_info.count
1387 || (elem->count == max->peel_info.count
1388 && max->peel_info.npeel > elem->npeel))
1389 {
1390 max->peel_info.npeel = elem->npeel;
1391 max->peel_info.count = elem->count;
1392 max->peel_info.dr_info = elem->dr_info;
1393 }
1394
1395 return 1;
1396 }
1397
1398 /* Get the costs of peeling NPEEL iterations for LOOP_VINFO, checking
1399 data access costs for all data refs. If UNKNOWN_MISALIGNMENT is true,
1400 we assume DR0_INFO's misalignment will be zero after peeling. */
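/* The costs are accumulated with each data reference's misalignment
   temporarily set to the value it would have after peeling NPEEL
   iterations for DR0_INFO, and restored afterwards.  */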
1401
1402 static void
1403 vect_get_peeling_costs_all_drs (loop_vec_info loop_vinfo,
1404 dr_vec_info *dr0_info,
1405 unsigned int *inside_cost,
1406 unsigned int *outside_cost,
1407 stmt_vector_for_cost *body_cost_vec,
1408 stmt_vector_for_cost *prologue_cost_vec,
1409 unsigned int npeel,
1410 bool unknown_misalignment)
1411 {
1412 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1413 unsigned i;
1414 data_reference *dr;
1415
1416 FOR_EACH_VEC_ELT (datarefs, i, dr)
1417 {
1418 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1419 stmt_vec_info stmt_info = dr_info->stmt;
1420 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1421 continue;
1422
1423 /* For interleaving, only the alignment of the first access
1424 matters. */
1425 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1426 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1427 continue;
1428
1429 /* Strided accesses perform only component accesses, alignment is
1430 irrelevant for them. */
1431 if (STMT_VINFO_STRIDED_P (stmt_info)
1432 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1433 continue;
1434
1435 int save_misalignment;
1436 save_misalignment = DR_MISALIGNMENT (dr_info);
1437 if (npeel == 0)
1438 ;
1439 else if (unknown_misalignment && dr_info == dr0_info)
1440 SET_DR_MISALIGNMENT (dr_info, 0);
1441 else
1442 vect_update_misalignment_for_peel (dr_info, dr0_info, npeel);
1443 vect_get_data_access_cost (dr_info, inside_cost, outside_cost,
1444 body_cost_vec, prologue_cost_vec);
1445 SET_DR_MISALIGNMENT (dr_info, save_misalignment);
1446 }
1447 }
1448
1449 /* Traverse peeling hash table and calculate cost for each peeling option.
1450 Find the one with the lowest cost. */
1451
1452 int
1453 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1454 _vect_peel_extended_info *min)
1455 {
1456 vect_peel_info elem = *slot;
1457 int dummy;
1458 unsigned int inside_cost = 0, outside_cost = 0;
1459 stmt_vec_info stmt_info = elem->dr_info->stmt;
1460 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1461 stmt_vector_for_cost prologue_cost_vec, body_cost_vec,
1462 epilogue_cost_vec;
1463
1464 prologue_cost_vec.create (2);
1465 body_cost_vec.create (2);
1466 epilogue_cost_vec.create (2);
1467
1468 vect_get_peeling_costs_all_drs (loop_vinfo, elem->dr_info, &inside_cost,
1469 &outside_cost, &body_cost_vec,
1470 &prologue_cost_vec, elem->npeel, false);
1471
1472 body_cost_vec.release ();
1473
1474 outside_cost += vect_get_known_peeling_cost
1475 (loop_vinfo, elem->npeel, &dummy,
1476 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1477 &prologue_cost_vec, &epilogue_cost_vec);
1478
1479 /* Prologue and epilogue costs are added to the target model later.
1480 These costs depend only on the scalar iteration cost, the
1481 number of peeling iterations finally chosen, and the number of
1482 misaligned statements. So discard the information found here. */
1483 prologue_cost_vec.release ();
1484 epilogue_cost_vec.release ();
1485
1486 if (inside_cost < min->inside_cost
1487 || (inside_cost == min->inside_cost
1488 && outside_cost < min->outside_cost))
1489 {
1490 min->inside_cost = inside_cost;
1491 min->outside_cost = outside_cost;
1492 min->peel_info.dr_info = elem->dr_info;
1493 min->peel_info.npeel = elem->npeel;
1494 min->peel_info.count = elem->count;
1495 }
1496
1497 return 1;
1498 }
1499
1500
1501 /* Choose best peeling option by traversing peeling hash table and either
1502 choosing an option with the lowest cost (if cost model is enabled) or the
1503 option that aligns as many accesses as possible. */
1504
1505 static struct _vect_peel_extended_info
1506 vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
1507 loop_vec_info loop_vinfo)
1508 {
1509 struct _vect_peel_extended_info res;
1510
1511 res.peel_info.dr_info = NULL;
1512
1513 if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1514 {
1515 res.inside_cost = INT_MAX;
1516 res.outside_cost = INT_MAX;
1517 peeling_htab->traverse <_vect_peel_extended_info *,
1518 vect_peeling_hash_get_lowest_cost> (&res);
1519 }
1520 else
1521 {
1522 res.peel_info.count = 0;
1523 peeling_htab->traverse <_vect_peel_extended_info *,
1524 vect_peeling_hash_get_most_frequent> (&res);
1525 res.inside_cost = 0;
1526 res.outside_cost = 0;
1527 }
1528
1529 return res;
1530 }
1531
1532 /* Return true if the new peeling NPEEL is supported. */
1533
1534 static bool
1535 vect_peeling_supportable (loop_vec_info loop_vinfo, dr_vec_info *dr0_info,
1536 unsigned npeel)
1537 {
1538 unsigned i;
1539 struct data_reference *dr = NULL;
1540 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1541 enum dr_alignment_support supportable_dr_alignment;
1542
1543 /* Ensure that all data refs can be vectorized after the peel. */
1544 FOR_EACH_VEC_ELT (datarefs, i, dr)
1545 {
1546 int save_misalignment;
1547
1548 if (dr == dr0_info->dr)
1549 continue;
1550
1551 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1552 stmt_vec_info stmt_info = dr_info->stmt;
1553 /* For interleaving, only the alignment of the first access
1554 matters. */
1555 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1556 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1557 continue;
1558
1559 /* Strided accesses perform only component accesses, alignment is
1560 irrelevant for them. */
1561 if (STMT_VINFO_STRIDED_P (stmt_info)
1562 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1563 continue;
1564
1565 save_misalignment = DR_MISALIGNMENT (dr_info);
1566 vect_update_misalignment_for_peel (dr_info, dr0_info, npeel);
1567 supportable_dr_alignment
1568 = vect_supportable_dr_alignment (dr_info, false);
1569 SET_DR_MISALIGNMENT (dr_info, save_misalignment);
1570
1571 if (!supportable_dr_alignment)
1572 return false;
1573 }
1574
1575 return true;
1576 }
1577
1578 /* Function vect_enhance_data_refs_alignment
1579
1580 This pass will use loop versioning and loop peeling in order to enhance
1581 the alignment of data references in the loop.
1582
1583 FOR NOW: we assume that whatever versioning/peeling takes place, only the
1584 original loop is to be vectorized. Any other loops that are created by
1585 the transformations performed in this pass are not supposed to be
1586 vectorized. This restriction will be relaxed.
1587
1588 This pass will require a cost model to guide it whether to apply peeling
1589 or versioning or a combination of the two. For example, the scheme that
1590 Intel uses when given a loop with several memory accesses, is as follows:
1591 choose one memory access ('p') whose alignment you want to force by doing
1592 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
1593 other accesses are not necessarily aligned, or (2) use loop versioning to
1594 generate one loop in which all accesses are aligned, and another loop in
1595 which only 'p' is necessarily aligned.
1596
1597 ("Automatic Intra-Register Vectorization for the Intel Architecture",
1598 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Ximmin Tian, International
1599 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1600
1601 Devising a cost model is the most critical aspect of this work. It will
1602 guide us on which access to peel for, whether to use loop versioning, how
1603 many versions to create, etc. The cost model will probably consist of
1604 generic considerations as well as target specific considerations (on
1605 powerpc for example, misaligned stores are more painful than misaligned
1606 loads).
1607
1608 Here are the general steps involved in alignment enhancements:
1609
1610 -- original loop, before alignment analysis:
1611 for (i=0; i<N; i++){
1612 x = q[i]; # DR_MISALIGNMENT(q) = unknown
1613 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1614 }
1615
1616 -- After vect_compute_data_refs_alignment:
1617 for (i=0; i<N; i++){
1618 x = q[i]; # DR_MISALIGNMENT(q) = 3
1619 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1620 }
1621
1622 -- Possibility 1: we do loop versioning:
1623 if (p is aligned) {
1624 for (i=0; i<N; i++){ # loop 1A
1625 x = q[i]; # DR_MISALIGNMENT(q) = 3
1626 p[i] = y; # DR_MISALIGNMENT(p) = 0
1627 }
1628 }
1629 else {
1630 for (i=0; i<N; i++){ # loop 1B
1631 x = q[i]; # DR_MISALIGNMENT(q) = 3
1632 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1633 }
1634 }
1635
1636 -- Possibility 2: we do loop peeling:
1637 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1638 x = q[i];
1639 p[i] = y;
1640 }
1641 for (i = 3; i < N; i++){ # loop 2A
1642 x = q[i]; # DR_MISALIGNMENT(q) = 0
1643 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1644 }
1645
1646 -- Possibility 3: combination of loop peeling and versioning:
1647 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1648 x = q[i];
1649 p[i] = y;
1650 }
1651 if (p is aligned) {
1652 for (i = 3; i<N; i++){ # loop 3A
1653 x = q[i]; # DR_MISALIGNMENT(q) = 0
1654 p[i] = y; # DR_MISALIGNMENT(p) = 0
1655 }
1656 }
1657 else {
1658 for (i = 3; i<N; i++){ # loop 3B
1659 x = q[i]; # DR_MISALIGNMENT(q) = 0
1660 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1661 }
1662 }
1663
1664 These loops are later passed to loop_transform to be vectorized. The
1665 vectorizer will use the alignment information to guide the transformation
1666 (whether to generate regular loads/stores, or with special handling for
1667 misalignment). */
1668
1669 opt_result
1670 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1671 {
1672 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1673 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1674 enum dr_alignment_support supportable_dr_alignment;
1675 dr_vec_info *first_store = NULL;
1676 dr_vec_info *dr0_info = NULL;
1677 struct data_reference *dr;
1678 unsigned int i, j;
1679 bool do_peeling = false;
1680 bool do_versioning = false;
1681 unsigned int npeel = 0;
1682 bool one_misalignment_known = false;
1683 bool one_misalignment_unknown = false;
1684 bool one_dr_unsupportable = false;
1685 dr_vec_info *unsupportable_dr_info = NULL;
1686 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1687 unsigned possible_npeel_number = 1;
1688 tree vectype;
1689 unsigned int mis, same_align_drs_max = 0;
1690 hash_table<peel_info_hasher> peeling_htab (1);
1691
1692 DUMP_VECT_SCOPE ("vect_enhance_data_refs_alignment");
1693
1694 /* Reset data so we can safely be called multiple times. */
1695 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1696 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
1697
1698 /* While cost model enhancements are expected in the future, the high level
1699 view of the code at this time is as follows:
1700
1701 A) If there is a misaligned access then see if peeling to align
1702 this access can make all data references satisfy
1703 vect_supportable_dr_alignment. If so, update data structures
1704 as needed and return true.
1705
1706 B) If peeling wasn't possible and there is a data reference with an
1707 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1708 then see if loop versioning checks can be used to make all data
1709 references satisfy vect_supportable_dr_alignment. If so, update
1710 data structures as needed and return true.
1711
1712 C) If neither peeling nor versioning were successful then return false if
1713 any data reference does not satisfy vect_supportable_dr_alignment.
1714
1715 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1716
1717 Note, Possibility 3 above (which is peeling and versioning together) is not
1718 being done at this time. */
1719
1720 /* (1) Peeling to force alignment. */
1721
1722 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1723 Considerations:
1724 + How many accesses will become aligned due to the peeling
1725 - How many accesses will become unaligned due to the peeling,
1726 and the cost of misaligned accesses.
1727 - The cost of peeling (the extra runtime checks, the increase
1728 in code size). */
1729
1730 FOR_EACH_VEC_ELT (datarefs, i, dr)
1731 {
1732 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1733 stmt_vec_info stmt_info = dr_info->stmt;
1734
1735 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1736 continue;
1737
1738 /* For interleaving, only the alignment of the first access
1739 matters. */
1740 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1741 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1742 continue;
1743
1744 /* For scatter-gather or invariant accesses there is nothing
1745 to enhance. */
1746 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
1747 || integer_zerop (DR_STEP (dr)))
1748 continue;
1749
1750 /* Strided accesses perform only component accesses, alignment is
1751 irrelevant for them. */
1752 if (STMT_VINFO_STRIDED_P (stmt_info)
1753 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1754 continue;
1755
1756 supportable_dr_alignment = vect_supportable_dr_alignment (dr_info, true);
1757 do_peeling = vector_alignment_reachable_p (dr_info);
1758 if (do_peeling)
1759 {
1760 if (known_alignment_for_access_p (dr_info))
1761 {
1762 unsigned int npeel_tmp = 0;
1763 bool negative = tree_int_cst_compare (DR_STEP (dr),
1764 size_zero_node) < 0;
1765
1766 vectype = STMT_VINFO_VECTYPE (stmt_info);
1767 /* If known_alignment_for_access_p then we have set
1768 DR_MISALIGNMENT, which is only done if we know it at compile
1769 time, so it is safe to assume the target alignment is constant.
1770 */
1771 unsigned int target_align =
1772 DR_TARGET_ALIGNMENT (dr_info).to_constant ();
1773 unsigned int dr_size = vect_get_scalar_dr_size (dr_info);
1774 mis = (negative
1775 ? DR_MISALIGNMENT (dr_info)
1776 : -DR_MISALIGNMENT (dr_info));
1777 if (DR_MISALIGNMENT (dr_info) != 0)
1778 npeel_tmp = (mis & (target_align - 1)) / dr_size;
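/* Illustrative example (numbers chosen arbitrarily): with a positive
   step, a target alignment of 16 bytes, a 4-byte element and
   DR_MISALIGNMENT == 4, MIS is -4, so (MIS & 15) == 12 and
   NPEEL_TMP == 3: peeling three scalar iterations advances the access
   by 12 bytes, up to the next 16-byte boundary.  */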
1779
1780 /* For multiple types, it is possible that the bigger type access
1781 will have more than one peeling option. E.g., a loop with two
1782 types: one of size (vector size / 4), and the other one of
1783 size (vector size / 8). The vectorization factor will be 8. If both
1784 accesses are misaligned by 3, the first one needs one scalar
1785 iteration to be aligned, and the second one needs 5. But the
1786 first one will also be aligned by peeling 5 scalar
1787 iterations, and in that case both accesses will be aligned.
1788 Hence, besides the immediate peeling amount, we also want
1789 to try adding a full vector size, as long as we don't exceed
1790 the vectorization factor.
1791 We do this automatically under the unlimited cost model, since
1792 we calculate the cost for every peeling option. */
1793 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1794 {
1795 poly_uint64 nscalars = (STMT_SLP_TYPE (stmt_info)
1796 ? vf * DR_GROUP_SIZE (stmt_info) : vf);
1797 possible_npeel_number
1798 = vect_get_num_vectors (nscalars, vectype);
1799
1800 /* NPEEL_TMP is 0 when there is no misalignment, but also
1801 allow peeling NELEMENTS. */
1802 if (DR_MISALIGNMENT (dr_info) == 0)
1803 possible_npeel_number++;
1804 }
1805
1806 /* Save info about DR in the hash table. Also include peeling
1807 amounts according to the explanation above. */
1808 for (j = 0; j < possible_npeel_number; j++)
1809 {
1810 vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
1811 dr_info, npeel_tmp);
1812 npeel_tmp += target_align / dr_size;
1813 }
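/* With the illustrative numbers above (target alignment 16, element
   size 4), the candidates recorded in the hash table are NPEEL_TMP,
   NPEEL_TMP + 4, NPEEL_TMP + 8, ..., i.e. the same residue modulo a
   full vector's worth of elements, up to POSSIBLE_NPEEL_NUMBER
   entries.  */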
1814
1815 one_misalignment_known = true;
1816 }
1817 else
1818 {
1819 /* If we don't know any misalignment values, we prefer
1820 peeling for the data-ref that has the maximum number of data-refs
1821 with the same alignment, unless the target prefers to align
1822 stores over loads. */
1823 unsigned same_align_drs
1824 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1825 if (!dr0_info
1826 || same_align_drs_max < same_align_drs)
1827 {
1828 same_align_drs_max = same_align_drs;
1829 dr0_info = dr_info;
1830 }
1831 /* For data-refs with the same number of related
1832 accesses prefer the one where the misalign
1833 computation will be invariant in the outermost loop. */
1834 else if (same_align_drs_max == same_align_drs)
1835 {
1836 class loop *ivloop0, *ivloop;
1837 ivloop0 = outermost_invariant_loop_for_expr
1838 (loop, DR_BASE_ADDRESS (dr0_info->dr));
1839 ivloop = outermost_invariant_loop_for_expr
1840 (loop, DR_BASE_ADDRESS (dr));
1841 if ((ivloop && !ivloop0)
1842 || (ivloop && ivloop0
1843 && flow_loop_nested_p (ivloop, ivloop0)))
1844 dr0_info = dr_info;
1845 }
1846
1847 one_misalignment_unknown = true;
1848
1849 /* Check for data refs with unsupportable alignment that
1850 can be peeled. */
1851 if (!supportable_dr_alignment)
1852 {
1853 one_dr_unsupportable = true;
1854 unsupportable_dr_info = dr_info;
1855 }
1856
1857 if (!first_store && DR_IS_WRITE (dr))
1858 first_store = dr_info;
1859 }
1860 }
1861 else
1862 {
1863 if (!aligned_access_p (dr_info))
1864 {
1865 if (dump_enabled_p ())
1866 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1867 "vector alignment may not be reachable\n");
1868 break;
1869 }
1870 }
1871 }
1872
1873 /* Check if we can possibly peel the loop. */
1874 if (!vect_can_advance_ivs_p (loop_vinfo)
1875 || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))
1876 || loop->inner)
1877 do_peeling = false;
1878
1879 struct _vect_peel_extended_info peel_for_known_alignment;
1880 struct _vect_peel_extended_info peel_for_unknown_alignment;
1881 struct _vect_peel_extended_info best_peel;
1882
1883 peel_for_unknown_alignment.inside_cost = INT_MAX;
1884 peel_for_unknown_alignment.outside_cost = INT_MAX;
1885 peel_for_unknown_alignment.peel_info.count = 0;
1886
1887 if (do_peeling
1888 && one_misalignment_unknown)
1889 {
1890 /* Check whether the target prefers to align stores over loads, i.e.,
1891 whether misaligned stores are more expensive than misaligned loads
1892 (taking drs with the same alignment into account). */
1893 unsigned int load_inside_cost = 0;
1894 unsigned int load_outside_cost = 0;
1895 unsigned int store_inside_cost = 0;
1896 unsigned int store_outside_cost = 0;
1897 unsigned int estimated_npeels = vect_vf_for_cost (loop_vinfo) / 2;
1898
1899 stmt_vector_for_cost dummy;
1900 dummy.create (2);
1901 vect_get_peeling_costs_all_drs (loop_vinfo, dr0_info,
1902 &load_inside_cost,
1903 &load_outside_cost,
1904 &dummy, &dummy, estimated_npeels, true);
1905 dummy.release ();
1906
1907 if (first_store)
1908 {
1909 dummy.create (2);
1910 vect_get_peeling_costs_all_drs (loop_vinfo, first_store,
1911 &store_inside_cost,
1912 &store_outside_cost,
1913 &dummy, &dummy,
1914 estimated_npeels, true);
1915 dummy.release ();
1916 }
1917 else
1918 {
1919 store_inside_cost = INT_MAX;
1920 store_outside_cost = INT_MAX;
1921 }
1922
1923 if (load_inside_cost > store_inside_cost
1924 || (load_inside_cost == store_inside_cost
1925 && load_outside_cost > store_outside_cost))
1926 {
1927 dr0_info = first_store;
1928 peel_for_unknown_alignment.inside_cost = store_inside_cost;
1929 peel_for_unknown_alignment.outside_cost = store_outside_cost;
1930 }
1931 else
1932 {
1933 peel_for_unknown_alignment.inside_cost = load_inside_cost;
1934 peel_for_unknown_alignment.outside_cost = load_outside_cost;
1935 }
1936
1937 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1938 prologue_cost_vec.create (2);
1939 epilogue_cost_vec.create (2);
1940
1941 int dummy2;
1942 peel_for_unknown_alignment.outside_cost += vect_get_known_peeling_cost
1943 (loop_vinfo, estimated_npeels, &dummy2,
1944 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1945 &prologue_cost_vec, &epilogue_cost_vec);
1946
1947 prologue_cost_vec.release ();
1948 epilogue_cost_vec.release ();
1949
1950 peel_for_unknown_alignment.peel_info.count = 1
1951 + STMT_VINFO_SAME_ALIGN_REFS (dr0_info->stmt).length ();
1952 }
1953
1954 peel_for_unknown_alignment.peel_info.npeel = 0;
1955 peel_for_unknown_alignment.peel_info.dr_info = dr0_info;
1956
1957 best_peel = peel_for_unknown_alignment;
1958
1959 peel_for_known_alignment.inside_cost = INT_MAX;
1960 peel_for_known_alignment.outside_cost = INT_MAX;
1961 peel_for_known_alignment.peel_info.count = 0;
1962 peel_for_known_alignment.peel_info.dr_info = NULL;
1963
1964 if (do_peeling && one_misalignment_known)
1965 {
1966 /* Peeling is possible, but no data access requires alignment in order
1967 to be supported. So we try to choose the best possible peeling from
1968 the hash table. */
1969 peel_for_known_alignment = vect_peeling_hash_choose_best_peeling
1970 (&peeling_htab, loop_vinfo);
1971 }
1972
1973 /* Compare costs of peeling for known and unknown alignment. */
1974 if (peel_for_known_alignment.peel_info.dr_info != NULL
1975 && peel_for_unknown_alignment.inside_cost
1976 >= peel_for_known_alignment.inside_cost)
1977 {
1978 best_peel = peel_for_known_alignment;
1979
1980 /* If the best peeling for known alignment has NPEEL == 0, perform no
1981 peeling at all except if there is an unsupportable dr that we can
1982 align. */
1983 if (best_peel.peel_info.npeel == 0 && !one_dr_unsupportable)
1984 do_peeling = false;
1985 }
1986
1987 /* If there is an unsupportable data ref, prefer this over all choices so far
1988 since we'd have to discard a chosen peeling except when it accidentally
1989 aligned the unsupportable data ref. */
1990 if (one_dr_unsupportable)
1991 dr0_info = unsupportable_dr_info;
1992 else if (do_peeling)
1993 {
1994 /* Calculate the penalty for no peeling, i.e. leaving everything as-is.
1995 TODO: Use nopeel_outside_cost or get rid of it? */
1996 unsigned nopeel_inside_cost = 0;
1997 unsigned nopeel_outside_cost = 0;
1998
1999 stmt_vector_for_cost dummy;
2000 dummy.create (2);
2001 vect_get_peeling_costs_all_drs (loop_vinfo, NULL, &nopeel_inside_cost,
2002 &nopeel_outside_cost, &dummy, &dummy,
2003 0, false);
2004 dummy.release ();
2005
2006 /* Add epilogue costs. As we do not peel for alignment here, no prologue
2007 costs will be recorded. */
2008 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
2009 prologue_cost_vec.create (2);
2010 epilogue_cost_vec.create (2);
2011
2012 int dummy2;
2013 nopeel_outside_cost += vect_get_known_peeling_cost
2014 (loop_vinfo, 0, &dummy2,
2015 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
2016 &prologue_cost_vec, &epilogue_cost_vec);
2017
2018 prologue_cost_vec.release ();
2019 epilogue_cost_vec.release ();
2020
2021 npeel = best_peel.peel_info.npeel;
2022 dr0_info = best_peel.peel_info.dr_info;
2023
2024 /* If not peeling at all is no more expensive than the best peeling we
2025 have found so far, don't perform any peeling. */
2026 if (nopeel_inside_cost <= best_peel.inside_cost)
2027 do_peeling = false;
2028 }
2029
2030 if (do_peeling)
2031 {
2032 stmt_vec_info stmt_info = dr0_info->stmt;
2033 vectype = STMT_VINFO_VECTYPE (stmt_info);
2034
2035 if (known_alignment_for_access_p (dr0_info))
2036 {
2037 bool negative = tree_int_cst_compare (DR_STEP (dr0_info->dr),
2038 size_zero_node) < 0;
2039 if (!npeel)
2040 {
2041 /* Since it's known at compile time, compute the number of
2042 iterations in the peeled loop (the peeling factor) for use in
2043 updating DR_MISALIGNMENT values. The peeling factor is the
2044 vectorization factor minus the misalignment as an element
2045 count. */
2046 mis = (negative
2047 ? DR_MISALIGNMENT (dr0_info)
2048 : -DR_MISALIGNMENT (dr0_info));
2049 /* If known_alignment_for_access_p then we have set
2050 DR_MISALIGNMENT, which is only done if we know it at compile
2051 time, so it is safe to assume the target alignment is constant.
2052 */
2053 unsigned int target_align =
2054 DR_TARGET_ALIGNMENT (dr0_info).to_constant ();
2055 npeel = ((mis & (target_align - 1))
2056 / vect_get_scalar_dr_size (dr0_info));
2057 }
2058
2059 /* For interleaved data access every iteration accesses all the
2060 members of the group, therefore we divide the number of iterations
2061 by the group size. */
2062 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2063 npeel /= DR_GROUP_SIZE (stmt_info);
2064
2065 if (dump_enabled_p ())
2066 dump_printf_loc (MSG_NOTE, vect_location,
2067 "Try peeling by %d\n", npeel);
2068 }
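/* Illustrative example (arbitrary numbers): with a positive step, a
   16-byte target alignment, 4-byte elements and
   DR_MISALIGNMENT (dr0_info) == 8, the computation above gives
   NPEEL == (-8 & 15) / 4 == 2 scalar iterations; for an interleaved
   group of size 2 this is further divided down to 1.  */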
2069
2070 /* Ensure that all datarefs can be vectorized after the peel. */
2071 if (!vect_peeling_supportable (loop_vinfo, dr0_info, npeel))
2072 do_peeling = false;
2073
2074 /* Check if all datarefs are supportable and log. */
2075 if (do_peeling && known_alignment_for_access_p (dr0_info) && npeel == 0)
2076 {
2077 opt_result stat = vect_verify_datarefs_alignment (loop_vinfo);
2078 if (!stat)
2079 do_peeling = false;
2080 else
2081 return stat;
2082 }
2083
2084 /* Cost model #1 - honor --param vect-max-peeling-for-alignment. */
2085 if (do_peeling)
2086 {
2087 unsigned max_allowed_peel
2088 = param_vect_max_peeling_for_alignment;
2089 if (flag_vect_cost_model == VECT_COST_MODEL_CHEAP)
2090 max_allowed_peel = 0;
2091 if (max_allowed_peel != (unsigned)-1)
2092 {
2093 unsigned max_peel = npeel;
2094 if (max_peel == 0)
2095 {
2096 poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr0_info);
2097 unsigned HOST_WIDE_INT target_align_c;
2098 if (target_align.is_constant (&target_align_c))
2099 max_peel =
2100 target_align_c / vect_get_scalar_dr_size (dr0_info) - 1;
2101 else
2102 {
2103 do_peeling = false;
2104 if (dump_enabled_p ())
2105 dump_printf_loc (MSG_NOTE, vect_location,
2106 "Disable peeling, max peels set and vector"
2107 " alignment unknown\n");
2108 }
2109 }
2110 if (max_peel > max_allowed_peel)
2111 {
2112 do_peeling = false;
2113 if (dump_enabled_p ())
2114 dump_printf_loc (MSG_NOTE, vect_location,
2115 "Disable peeling, max peels reached: %d\n", max_peel);
2116 }
2117 }
2118 }
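/* Illustrative example (arbitrary numbers): if the peeling amount is
   only known at run time (NPEEL == 0), a 16-byte target alignment
   with 4-byte elements needs at most 16 / 4 - 1 == 3 peeled
   iterations, so peeling is disabled whenever the parameter is below
   3, and in particular under the cheap cost model, which forces the
   limit to 0.  */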
2119
2120 /* Cost model #2 - if peeling may result in a remaining loop not
2121 iterating enough to be vectorized then do not peel. Since this
2122 is a cost heuristic rather than a correctness decision, use the
2123 most likely runtime value for variable vectorization factors. */
2124 if (do_peeling
2125 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2126 {
2127 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2128 unsigned int max_peel = npeel == 0 ? assumed_vf - 1 : npeel;
2129 if ((unsigned HOST_WIDE_INT) LOOP_VINFO_INT_NITERS (loop_vinfo)
2130 < assumed_vf + max_peel)
2131 do_peeling = false;
2132 }
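/* Illustrative example (arbitrary numbers): with an assumed
   vectorization factor of 4 and a runtime peeling amount (NPEEL == 0),
   up to 3 iterations may be peeled, so a known iteration count below
   4 + 3 == 7 leaves too few iterations for the vector loop and
   peeling is skipped.  */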
2133
2134 if (do_peeling)
2135 {
2136 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
2137 If the misalignment of DR_i is identical to that of dr0 then set
2138 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
2139 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
2140 by the peeling factor times the element size of DR_i (MOD the
2141 vectorization factor times the size). Otherwise, the
2142 misalignment of DR_i must be set to unknown. */
2143 FOR_EACH_VEC_ELT (datarefs, i, dr)
2144 if (dr != dr0_info->dr)
2145 {
2146 /* Strided accesses perform only component accesses, alignment
2147 is irrelevant for them. */
2148 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2149 stmt_info = dr_info->stmt;
2150 if (STMT_VINFO_STRIDED_P (stmt_info)
2151 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2152 continue;
2153
2154 vect_update_misalignment_for_peel (dr_info, dr0_info, npeel);
2155 }
2156
2157 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0_info;
2158 if (npeel)
2159 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
2160 else
2161 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2162 = DR_MISALIGNMENT (dr0_info);
2163 SET_DR_MISALIGNMENT (dr0_info, 0);
2164 if (dump_enabled_p ())
2165 {
2166 dump_printf_loc (MSG_NOTE, vect_location,
2167 "Alignment of access forced using peeling.\n");
2168 dump_printf_loc (MSG_NOTE, vect_location,
2169 "Peeling for alignment will be applied.\n");
2170 }
2171
2172 /* The inside-loop cost will be accounted for in vectorizable_load
2173 and vectorizable_store correctly with adjusted alignments.
2174 Drop the body_cst_vec on the floor here. */
2175 opt_result stat = vect_verify_datarefs_alignment (loop_vinfo);
2176 gcc_assert (stat);
2177 return stat;
2178 }
2179 }
2180
2181 /* (2) Versioning to force alignment. */
2182
2183 /* Try versioning if:
2184 1) optimize loop for speed and the cost-model is not cheap
2185 2) there is at least one unsupported misaligned data ref with an unknown
2186 misalignment, and
2187 3) all misaligned data refs with a known misalignment are supported, and
2188 4) the number of runtime alignment checks is within reason. */
2189
2190 do_versioning
2191 = (optimize_loop_nest_for_speed_p (loop)
2192 && !loop->inner /* FORNOW */
2193 && flag_vect_cost_model != VECT_COST_MODEL_CHEAP);
2194
2195 if (do_versioning)
2196 {
2197 FOR_EACH_VEC_ELT (datarefs, i, dr)
2198 {
2199 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2200 stmt_vec_info stmt_info = dr_info->stmt;
2201
2202 /* For interleaving, only the alignment of the first access
2203 matters. */
2204 if (aligned_access_p (dr_info)
2205 || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
2206 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info))
2207 continue;
2208
2209 if (STMT_VINFO_STRIDED_P (stmt_info))
2210 {
2211 /* Strided loads perform only component accesses, alignment is
2212 irrelevant for them. */
2213 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
2214 continue;
2215 do_versioning = false;
2216 break;
2217 }
2218
2219 supportable_dr_alignment
2220 = vect_supportable_dr_alignment (dr_info, false);
2221
2222 if (!supportable_dr_alignment)
2223 {
2224 int mask;
2225 tree vectype;
2226
2227 if (known_alignment_for_access_p (dr_info)
2228 || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
2229 >= (unsigned) param_vect_max_version_for_alignment_checks)
2230 {
2231 do_versioning = false;
2232 break;
2233 }
2234
2235 vectype = STMT_VINFO_VECTYPE (stmt_info);
2236 gcc_assert (vectype);
2237
2238 /* At present we don't support versioning for alignment
2239 with variable VF, since there's no guarantee that the
2240 VF is a power of two. We could relax this if we added
2241 a way of enforcing a power-of-two size. */
2242 unsigned HOST_WIDE_INT size;
2243 if (!GET_MODE_SIZE (TYPE_MODE (vectype)).is_constant (&size))
2244 {
2245 do_versioning = false;
2246 break;
2247 }
2248
2249 /* Forcing alignment in the first iteration is no good if
2250 we don't keep it across iterations. For now, just disable
2251 versioning in this case.
2252 ?? We could actually unroll the loop to achieve the required
2253 overall step alignment, and forcing the alignment could be
2254 done by doing some iterations of the non-vectorized loop. */
2255 if (!multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
2256 * DR_STEP_ALIGNMENT (dr),
2257 DR_TARGET_ALIGNMENT (dr_info)))
2258 {
2259 do_versioning = false;
2260 break;
2261 }
2262
2263 /* The rightmost bits of an aligned address must be zeros.
2264 Construct the mask needed for this test. For example,
2265 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
2266 mask must be 15 = 0xf. */
2267 mask = size - 1;
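/* The versioning check emitted later verifies that the low bits of
   each such address are zero; e.g. for a 16-byte vector an address
   of 0x1004 fails the test ((0x1004 & 0xf) == 0x4) while 0x1010
   passes (hypothetical addresses, for illustration only).  */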
2268
2269 /* FORNOW: use the same mask to test all potentially unaligned
2270 references in the loop. The vectorizer currently supports
2271 a single vector size, see the reference to
2272 GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
2273 vectorization factor is computed. */
2274 gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
2275 || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
2276 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
2277 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (stmt_info);
2278 }
2279 }
2280
2281 /* Versioning requires at least one misaligned data reference. */
2282 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2283 do_versioning = false;
2284 else if (!do_versioning)
2285 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
2286 }
2287
2288 if (do_versioning)
2289 {
2290 vec<stmt_vec_info> may_misalign_stmts
2291 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
2292 stmt_vec_info stmt_info;
2293
2294 /* It can now be assumed that the data references in the statements
2295 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
2296 of the loop being vectorized. */
2297 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt_info)
2298 {
2299 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
2300 SET_DR_MISALIGNMENT (dr_info, 0);
2301 if (dump_enabled_p ())
2302 dump_printf_loc (MSG_NOTE, vect_location,
2303 "Alignment of access forced using versioning.\n");
2304 }
2305
2306 if (dump_enabled_p ())
2307 dump_printf_loc (MSG_NOTE, vect_location,
2308 "Versioning for alignment will be applied.\n");
2309
2310 /* Peeling and versioning can't be done together at this time. */
2311 gcc_assert (! (do_peeling && do_versioning));
2312
2313 opt_result stat = vect_verify_datarefs_alignment (loop_vinfo);
2314 gcc_assert (stat);
2315 return stat;
2316 }
2317
2318 /* This point is reached if neither peeling nor versioning is being done. */
2319 gcc_assert (! (do_peeling || do_versioning));
2320
2321 opt_result stat = vect_verify_datarefs_alignment (loop_vinfo);
2322 return stat;
2323 }
2324
2325
2326 /* Function vect_find_same_alignment_drs.
2327
2328 Update group and alignment relations in VINFO according to the chosen
2329 vectorization factor. */
2330
2331 static void
2332 vect_find_same_alignment_drs (vec_info *vinfo, data_dependence_relation *ddr)
2333 {
2334 struct data_reference *dra = DDR_A (ddr);
2335 struct data_reference *drb = DDR_B (ddr);
2336 dr_vec_info *dr_info_a = vinfo->lookup_dr (dra);
2337 dr_vec_info *dr_info_b = vinfo->lookup_dr (drb);
2338 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
2339 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
2340
2341 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
2342 return;
2343
2344 if (dra == drb)
2345 return;
2346
2347 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
2348 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
2349 return;
2350
2351 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0)
2352 || !operand_equal_p (DR_OFFSET (dra), DR_OFFSET (drb), 0)
2353 || !operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2354 return;
2355
2356 /* Two references with distance zero have the same alignment. */
2357 poly_offset_int diff = (wi::to_poly_offset (DR_INIT (dra))
2358 - wi::to_poly_offset (DR_INIT (drb)));
2359 if (maybe_ne (diff, 0))
2360 {
2361 /* Get the wider of the two alignments. */
2362 poly_uint64 align_a =
2363 exact_div (vect_calculate_target_alignment (dr_info_a),
2364 BITS_PER_UNIT);
2365 poly_uint64 align_b =
2366 exact_div (vect_calculate_target_alignment (dr_info_b),
2367 BITS_PER_UNIT);
2368 unsigned HOST_WIDE_INT align_a_c, align_b_c;
2369 if (!align_a.is_constant (&align_a_c)
2370 || !align_b.is_constant (&align_b_c))
2371 return;
2372
2373 unsigned HOST_WIDE_INT max_align = MAX (align_a_c, align_b_c);
2374
2375 /* Require the gap to be a multiple of the larger vector alignment. */
2376 if (!multiple_p (diff, max_align))
2377 return;
2378 }
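/* Illustrative example (arbitrary types and sizes): for 4-byte
   elements and a 16-byte vector alignment, a[i] and a[i+4] have
   DR_INITs that differ by 16 bytes, a multiple of the vector
   alignment, so the two references are recorded below as having the
   same alignment; a[i] and a[i+1], with a gap of only 4 bytes, are
   not.  */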
2379
2380 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
2381 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
2382 if (dump_enabled_p ())
2383 dump_printf_loc (MSG_NOTE, vect_location,
2384 "accesses have the same alignment: %T and %T\n",
2385 DR_REF (dra), DR_REF (drb));
2386 }
2387
2388
2389 /* Function vect_analyze_data_refs_alignment
2390
2391 Analyze the alignment of the data-references in the loop.
2392 Return FALSE if a data reference is found that cannot be vectorized. */
2393
2394 opt_result
2395 vect_analyze_data_refs_alignment (loop_vec_info vinfo)
2396 {
2397 DUMP_VECT_SCOPE ("vect_analyze_data_refs_alignment");
2398
2399 /* Mark groups of data references with same alignment using
2400 data dependence information. */
2401 vec<ddr_p> ddrs = vinfo->shared->ddrs;
2402 struct data_dependence_relation *ddr;
2403 unsigned int i;
2404
2405 FOR_EACH_VEC_ELT (ddrs, i, ddr)
2406 vect_find_same_alignment_drs (vinfo, ddr);
2407
2408 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
2409 struct data_reference *dr;
2410
2411 vect_record_base_alignments (vinfo);
2412 FOR_EACH_VEC_ELT (datarefs, i, dr)
2413 {
2414 dr_vec_info *dr_info = vinfo->lookup_dr (dr);
2415 if (STMT_VINFO_VECTORIZABLE (dr_info->stmt))
2416 vect_compute_data_ref_alignment (dr_info);
2417 }
2418
2419 return opt_result::success ();
2420 }
2421
2422
2423 /* Analyze alignment of DRs of stmts in NODE. */
2424
2425 static bool
2426 vect_slp_analyze_and_verify_node_alignment (slp_tree node)
2427 {
2428 /* We vectorize from the first scalar stmt in the node unless
2429 the node is permuted in which case we start from the first
2430 element in the group. */
2431 stmt_vec_info first_stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2432 dr_vec_info *first_dr_info = STMT_VINFO_DR_INFO (first_stmt_info);
2433 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2434 first_stmt_info = DR_GROUP_FIRST_ELEMENT (first_stmt_info);
2435
2436 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (first_stmt_info);
2437 vect_compute_data_ref_alignment (dr_info);
2438 /* For creating the data-ref pointer we need alignment of the
2439 first element anyway. */
2440 if (dr_info != first_dr_info)
2441 vect_compute_data_ref_alignment (first_dr_info);
2442 if (! verify_data_ref_alignment (dr_info))
2443 {
2444 if (dump_enabled_p ())
2445 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2446 "not vectorized: bad data alignment in basic "
2447 "block.\n");
2448 return false;
2449 }
2450
2451 return true;
2452 }
2453
2454 /* Function vect_slp_analyze_instance_alignment
2455
2456 Analyze the alignment of the data-references in the SLP instance.
2457 Return FALSE if a data reference is found that cannot be vectorized. */
2458
2459 bool
2460 vect_slp_analyze_and_verify_instance_alignment (slp_instance instance)
2461 {
2462 DUMP_VECT_SCOPE ("vect_slp_analyze_and_verify_instance_alignment");
2463
2464 slp_tree node;
2465 unsigned i;
2466 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
2467 if (! vect_slp_analyze_and_verify_node_alignment (node))
2468 return false;
2469
2470 node = SLP_INSTANCE_TREE (instance);
2471 if (STMT_VINFO_DATA_REF (SLP_TREE_SCALAR_STMTS (node)[0])
2472 && ! vect_slp_analyze_and_verify_node_alignment
2473 (SLP_INSTANCE_TREE (instance)))
2474 return false;
2475
2476 return true;
2477 }
2478
2479
2480 /* Analyze groups of accesses: check that DR_INFO belongs to a group of
2481 accesses of legal size, step, etc. Detect gaps, single element
2482 interleaving, and other special cases. Set grouped access info.
2483 Collect groups of strided stores for further use in SLP analysis.
2484 Worker for vect_analyze_group_access. */
2485
2486 static bool
2487 vect_analyze_group_access_1 (dr_vec_info *dr_info)
2488 {
2489 data_reference *dr = dr_info->dr;
2490 tree step = DR_STEP (dr);
2491 tree scalar_type = TREE_TYPE (DR_REF (dr));
2492 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2493 stmt_vec_info stmt_info = dr_info->stmt;
2494 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2495 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2496 HOST_WIDE_INT dr_step = -1;
2497 HOST_WIDE_INT groupsize, last_accessed_element = 1;
2498 bool slp_impossible = false;
2499
2500 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2501 size of the interleaving group (including gaps). */
2502 if (tree_fits_shwi_p (step))
2503 {
2504 dr_step = tree_to_shwi (step);
2505 /* Check that STEP is a multiple of type size. Otherwise there is
2506 a non-element-sized gap at the end of the group which we
2507 cannot represent in DR_GROUP_GAP or DR_GROUP_SIZE.
2508 ??? As we can handle non-constant step fine here we should
2509 simply remove uses of DR_GROUP_GAP between the last and first
2510 element and instead rely on DR_STEP. DR_GROUP_SIZE then would
2511 simply not include that gap. */
2512 if ((dr_step % type_size) != 0)
2513 {
2514 if (dump_enabled_p ())
2515 dump_printf_loc (MSG_NOTE, vect_location,
2516 "Step %T is not a multiple of the element size"
2517 " for %T\n",
2518 step, DR_REF (dr));
2519 return false;
2520 }
2521 groupsize = absu_hwi (dr_step) / type_size;
2522 }
2523 else
2524 groupsize = 0;
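/* Illustrative example (arbitrary sizes): for 4-byte elements and a
   constant step of 32 bytes, GROUPSIZE is 8, i.e. each loop iteration
   strides over eight elements (group members plus any gaps).  */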
2525
2526 /* A non-consecutive access is possible only if it is part of an interleaving group. */
2527 if (!DR_GROUP_FIRST_ELEMENT (stmt_info))
2528 {
2529 /* Check if this DR is a part of an interleaving group, and is a single
2530 element of the group that is accessed in the loop. */
2531
2532 /* Gaps are supported only for loads. STEP must be a multiple of the type
2533 size. */
2534 if (DR_IS_READ (dr)
2535 && (dr_step % type_size) == 0
2536 && groupsize > 0)
2537 {
2538 DR_GROUP_FIRST_ELEMENT (stmt_info) = stmt_info;
2539 DR_GROUP_SIZE (stmt_info) = groupsize;
2540 DR_GROUP_GAP (stmt_info) = groupsize - 1;
2541 if (dump_enabled_p ())
2542 dump_printf_loc (MSG_NOTE, vect_location,
2543 "Detected single element interleaving %T"
2544 " step %T\n",
2545 DR_REF (dr), step);
2546
2547 return true;
2548 }
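/* Illustrative example (arbitrary access pattern): a load of a[4*i]
   with 4-byte elements has a step of 16 bytes, so GROUPSIZE is 4 and
   DR_GROUP_GAP is 3: only one of every four consecutive elements is
   actually accessed.  */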
2549
2550 if (dump_enabled_p ())
2551 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2552 "not consecutive access %G", stmt_info->stmt);
2553
2554 if (bb_vinfo)
2555 {
2556 /* Mark the statement as unvectorizable. */
2557 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
2558 return true;
2559 }
2560
2561 if (dump_enabled_p ())
2562 dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
2563 STMT_VINFO_STRIDED_P (stmt_info) = true;
2564 return true;
2565 }
2566
2567 if (DR_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info)
2568 {
2569 /* First stmt in the interleaving chain. Check the chain. */
2570 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2571 struct data_reference *data_ref = dr;
2572 unsigned int count = 1;
2573 tree prev_init = DR_INIT (data_ref);
2574 HOST_WIDE_INT diff, gaps = 0;
2575
2576 /* By construction, all group members have INTEGER_CST DR_INITs. */
2577 while (next)
2578 {
2579 /* We never have the same DR multiple times. */
2580 gcc_assert (tree_int_cst_compare (DR_INIT (data_ref),
2581 DR_INIT (STMT_VINFO_DATA_REF (next))) != 0);
2582
2583 data_ref = STMT_VINFO_DATA_REF (next);
2584
2585 /* All group members have the same STEP by construction. */
2586 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2587
2588 /* Check that the distance between two accesses is equal to the type
2589 size. Otherwise, we have gaps. */
2590 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2591 - TREE_INT_CST_LOW (prev_init)) / type_size;
2592 if (diff != 1)
2593 {
2594 /* FORNOW: SLP of accesses with gaps is not supported. */
2595 slp_impossible = true;
2596 if (DR_IS_WRITE (data_ref))
2597 {
2598 if (dump_enabled_p ())
2599 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2600 "interleaved store with gaps\n");
2601 return false;
2602 }
2603
2604 gaps += diff - 1;
2605 }
2606
2607 last_accessed_element += diff;
2608
2609 /* Store the gap from the previous member of the group. If there is no
2610 gap in the access, DR_GROUP_GAP is always 1. */
2611 DR_GROUP_GAP (next) = diff;
2612
2613 prev_init = DR_INIT (data_ref);
2614 next = DR_GROUP_NEXT_ELEMENT (next);
2615 /* Count the number of data-refs in the chain. */
2616 count++;
2617 }
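/* Illustrative example (arbitrary inits): three 4-byte loads in a
   chain with DR_INITs 0, 4 and 12 give DIFFs of 1 and 2, so COUNT is
   3 and GAPS is 1; with a non-constant step GROUPSIZE becomes
   COUNT + GAPS == 4 below, and the member with DR_INIT 12 records
   DR_GROUP_GAP == 2.  */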
2618
2619 if (groupsize == 0)
2620 groupsize = count + gaps;
2621
2622 /* This could be UINT_MAX but as we are generating code in a very
2623 inefficient way we have to cap earlier. See PR78699 for example. */
2624 if (groupsize > 4096)
2625 {
2626 if (dump_enabled_p ())
2627 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2628 "group is too large\n");
2629 return false;
2630 }
2631
2632 /* Check that the size of the interleaving is equal to count for stores,
2633 i.e., that there are no gaps. */
2634 if (groupsize != count
2635 && !DR_IS_READ (dr))
2636 {
2637 groupsize = count;
2638 STMT_VINFO_STRIDED_P (stmt_info) = true;
2639 }
2640
2641 /* If there is a gap after the last load in the group it is the
2642 difference between the groupsize and the last accessed
2643 element.
2644 When there is no gap, this difference should be 0. */
2645 DR_GROUP_GAP (stmt_info) = groupsize - last_accessed_element;
2646
2647 DR_GROUP_SIZE (stmt_info) = groupsize;
2648 if (dump_enabled_p ())
2649 {
2650 dump_printf_loc (MSG_NOTE, vect_location,
2651 "Detected interleaving ");
2652 if (DR_IS_READ (dr))
2653 dump_printf (MSG_NOTE, "load ");
2654 else if (STMT_VINFO_STRIDED_P (stmt_info))
2655 dump_printf (MSG_NOTE, "strided store ");
2656 else
2657 dump_printf (MSG_NOTE, "store ");
2658 dump_printf (MSG_NOTE, "of size %u\n",
2659 (unsigned)groupsize);
2660 dump_printf_loc (MSG_NOTE, vect_location, "\t%G", stmt_info->stmt);
2661 next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2662 while (next)
2663 {
2664 if (DR_GROUP_GAP (next) != 1)
2665 dump_printf_loc (MSG_NOTE, vect_location,
2666 "\t<gap of %d elements>\n",
2667 DR_GROUP_GAP (next) - 1);
2668 dump_printf_loc (MSG_NOTE, vect_location, "\t%G", next->stmt);
2669 next = DR_GROUP_NEXT_ELEMENT (next);
2670 }
2671 if (DR_GROUP_GAP (stmt_info) != 0)
2672 dump_printf_loc (MSG_NOTE, vect_location,
2673 "\t<gap of %d elements>\n",
2674 DR_GROUP_GAP (stmt_info));
2675 }
2676
2677 /* SLP: create an SLP data structure for every interleaving group of
2678 stores for further analysis in vect_analyze_slp. */
2679 if (DR_IS_WRITE (dr) && !slp_impossible)
2680 {
2681 if (loop_vinfo)
2682 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt_info);
2683 if (bb_vinfo)
2684 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt_info);
2685 }
2686 }
2687
2688 return true;
2689 }
2690
2691 /* Analyze groups of accesses: check that DR_INFO belongs to a group of
2692 accesses of legal size, step, etc. Detect gaps, single element
2693 interleaving, and other special cases. Set grouped access info.
2694 Collect groups of strided stores for further use in SLP analysis. */
2695
2696 static bool
2697 vect_analyze_group_access (dr_vec_info *dr_info)
2698 {
2699 if (!vect_analyze_group_access_1 (dr_info))
2700 {
2701 /* Dissolve the group if present. */
2702 stmt_vec_info stmt_info = DR_GROUP_FIRST_ELEMENT (dr_info->stmt);
2703 while (stmt_info)
2704 {
2705 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2706 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2707 DR_GROUP_NEXT_ELEMENT (stmt_info) = NULL;
2708 stmt_info = next;
2709 }
2710 return false;
2711 }
2712 return true;
2713 }
2714
2715 /* Analyze the access pattern of the data-reference DR_INFO.
2716 In case of non-consecutive accesses call vect_analyze_group_access() to
2717 analyze groups of accesses. */
2718
2719 static bool
2720 vect_analyze_data_ref_access (dr_vec_info *dr_info)
2721 {
2722 data_reference *dr = dr_info->dr;
2723 tree step = DR_STEP (dr);
2724 tree scalar_type = TREE_TYPE (DR_REF (dr));
2725 stmt_vec_info stmt_info = dr_info->stmt;
2726 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2727 class loop *loop = NULL;
2728
2729 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
2730 return true;
2731
2732 if (loop_vinfo)
2733 loop = LOOP_VINFO_LOOP (loop_vinfo);
2734
2735 if (loop_vinfo && !step)
2736 {
2737 if (dump_enabled_p ())
2738 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2739 "bad data-ref access in loop\n");
2740 return false;
2741 }
2742
2743 /* Allow loads with zero step in inner-loop vectorization. */
2744 if (loop_vinfo && integer_zerop (step))
2745 {
2746 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2747 if (!nested_in_vect_loop_p (loop, stmt_info))
2748 return DR_IS_READ (dr);
2749 /* Allow references with zero step for outer loops marked
2750 with pragma omp simd only - it guarantees absence of
2751 loop-carried dependencies between inner loop iterations. */
2752 if (loop->safelen < 2)
2753 {
2754 if (dump_enabled_p ())
2755 dump_printf_loc (MSG_NOTE, vect_location,
2756 "zero step in inner loop of nest\n");
2757 return false;
2758 }
2759 }
2760
2761 if (loop && nested_in_vect_loop_p (loop, stmt_info))
2762 {
2763 /* Interleaved accesses are not yet supported within outer-loop
2764 vectorization for references in the inner-loop. */
2765 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2766
2767 /* For the rest of the analysis we use the outer-loop step. */
2768 step = STMT_VINFO_DR_STEP (stmt_info);
2769 if (integer_zerop (step))
2770 {
2771 if (dump_enabled_p ())
2772 dump_printf_loc (MSG_NOTE, vect_location,
2773 "zero step in outer loop.\n");
2774 return DR_IS_READ (dr);
2775 }
2776 }
2777
2778 /* Consecutive? */
2779 if (TREE_CODE (step) == INTEGER_CST)
2780 {
2781 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2782 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2783 || (dr_step < 0
2784 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2785 {
2786 /* Mark that it is not interleaving. */
2787 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2788 return true;
2789 }
2790 }
2791
2792 if (loop && nested_in_vect_loop_p (loop, stmt_info))
2793 {
2794 if (dump_enabled_p ())
2795 dump_printf_loc (MSG_NOTE, vect_location,
2796 "grouped access in outer loop.\n");
2797 return false;
2798 }
2799
2800
2801 /* Assume this is a DR handled by the non-constant strided load case. */
2802 if (TREE_CODE (step) != INTEGER_CST)
2803 return (STMT_VINFO_STRIDED_P (stmt_info)
2804 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2805 || vect_analyze_group_access (dr_info)));
2806
2807 /* Not consecutive access - check if it's a part of interleaving group. */
2808 return vect_analyze_group_access (dr_info);
2809 }
2810
2811 /* Compare two data-references DRA and DRB in order to sort them into
2812 chunks suitable for grouping. */
2813
2814 static int
2815 dr_group_sort_cmp (const void *dra_, const void *drb_)
2816 {
2817 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2818 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2819 int cmp;
2820
2821 /* Stabilize sort. */
2822 if (dra == drb)
2823 return 0;
2824
2825 /* DRs in different loops never belong to the same group. */
2826 loop_p loopa = gimple_bb (DR_STMT (dra))->loop_father;
2827 loop_p loopb = gimple_bb (DR_STMT (drb))->loop_father;
2828 if (loopa != loopb)
2829 return loopa->num < loopb->num ? -1 : 1;
2830
2831 /* Ordering of DRs according to base. */
2832 cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2833 DR_BASE_ADDRESS (drb));
2834 if (cmp != 0)
2835 return cmp;
2836
2837 /* And according to DR_OFFSET. */
2838 cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2839 if (cmp != 0)
2840 return cmp;
2841
2842 /* Put reads before writes. */
2843 if (DR_IS_READ (dra) != DR_IS_READ (drb))
2844 return DR_IS_READ (dra) ? -1 : 1;
2845
2846 /* Then sort by access size. */
2847 cmp = data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2848 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2849 if (cmp != 0)
2850 return cmp;
2851
2852 /* And by step. */
2853 cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
2854 if (cmp != 0)
2855 return cmp;
2856
2857 /* Then sort by DR_INIT. In case of identical DRs sort by stmt UID. */
2858 cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
2859 if (cmp == 0)
2860 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2861 return cmp;
2862 }
2863
2864 /* If OP is the result of a conversion, return the unconverted value,
2865 otherwise return null. */
2866
2867 static tree
2868 strip_conversion (tree op)
2869 {
2870 if (TREE_CODE (op) != SSA_NAME)
2871 return NULL_TREE;
2872 gimple *stmt = SSA_NAME_DEF_STMT (op);
2873 if (!is_gimple_assign (stmt)
2874 || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (stmt)))
2875 return NULL_TREE;
2876 return gimple_assign_rhs1 (stmt);
2877 }
2878
2879 /* Return true if vectorizable_* routines can handle statements STMT1_INFO
2880 and STMT2_INFO being in a single group. When ALLOW_SLP_P, masked loads can
2881 be grouped in SLP mode. */
2882
2883 static bool
2884 can_group_stmts_p (stmt_vec_info stmt1_info, stmt_vec_info stmt2_info,
2885 bool allow_slp_p)
2886 {
2887 if (gimple_assign_single_p (stmt1_info->stmt))
2888 return gimple_assign_single_p (stmt2_info->stmt);
2889
2890 gcall *call1 = dyn_cast <gcall *> (stmt1_info->stmt);
2891 if (call1 && gimple_call_internal_p (call1))
2892 {
2893 /* Check for two masked loads or two masked stores. */
2894 gcall *call2 = dyn_cast <gcall *> (stmt2_info->stmt);
2895 if (!call2 || !gimple_call_internal_p (call2))
2896 return false;
2897 internal_fn ifn = gimple_call_internal_fn (call1);
2898 if (ifn != IFN_MASK_LOAD && ifn != IFN_MASK_STORE)
2899 return false;
2900 if (ifn != gimple_call_internal_fn (call2))
2901 return false;
2902
2903 /* Check that the masks are the same. Cope with casts of masks,
2904 like those created by build_mask_conversion. */
2905 tree mask1 = gimple_call_arg (call1, 2);
2906 tree mask2 = gimple_call_arg (call2, 2);
2907 if (!operand_equal_p (mask1, mask2, 0)
2908 && (ifn == IFN_MASK_STORE || !allow_slp_p))
2909 {
2910 mask1 = strip_conversion (mask1);
2911 if (!mask1)
2912 return false;
2913 mask2 = strip_conversion (mask2);
2914 if (!mask2)
2915 return false;
2916 if (!operand_equal_p (mask1, mask2, 0))
2917 return false;
2918 }
2919 return true;
2920 }
2921
2922 return false;
2923 }
2924
2925 /* Function vect_analyze_data_ref_accesses.
2926
2927 Analyze the access pattern of all the data references in the loop.
2928
2929 FORNOW: the only access pattern that is considered vectorizable is a
2930 simple step 1 (consecutive) access.
2931
2932 FORNOW: handle only arrays and pointer accesses. */
2933
2934 opt_result
2935 vect_analyze_data_ref_accesses (vec_info *vinfo)
2936 {
2937 unsigned int i;
2938 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
2939 struct data_reference *dr;
2940
2941 DUMP_VECT_SCOPE ("vect_analyze_data_ref_accesses");
2942
2943 if (datarefs.is_empty ())
2944 return opt_result::success ();
2945
2946 /* Sort the array of datarefs to make building the interleaving chains
2947 linear. Don't modify the original vector's order; it is needed for
2948 determining which dependencies are reversed. */
2949 vec<data_reference_p> datarefs_copy = datarefs.copy ();
2950 datarefs_copy.qsort (dr_group_sort_cmp);
2951 hash_set<stmt_vec_info> to_fixup;
2952
2953 /* Build the interleaving chains. */
2954 for (i = 0; i < datarefs_copy.length () - 1;)
2955 {
2956 data_reference_p dra = datarefs_copy[i];
2957 dr_vec_info *dr_info_a = vinfo->lookup_dr (dra);
2958 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
2959 stmt_vec_info lastinfo = NULL;
2960 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
2961 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a))
2962 {
2963 ++i;
2964 continue;
2965 }
2966 for (i = i + 1; i < datarefs_copy.length (); ++i)
2967 {
2968 data_reference_p drb = datarefs_copy[i];
2969 dr_vec_info *dr_info_b = vinfo->lookup_dr (drb);
2970 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
2971 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_b)
2972 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
2973 break;
2974
2975 /* ??? Imperfect sorting (non-compatible types, non-modulo
2976 accesses, same accesses) can lead to a group being artificially
2977 split here, as we don't just skip over those. If it really
2978 matters we can push those to a worklist and re-iterate
2979 over them. Then we can just skip ahead to the next DR here. */
2980
2981 /* DRs in a different loop should not be put into the same
2982 interleaving group. */
2983 if (gimple_bb (DR_STMT (dra))->loop_father
2984 != gimple_bb (DR_STMT (drb))->loop_father)
2985 break;
2986
2987 /* Check that the data-refs have same first location (except init)
2988 and they are both either store or load (not load and store,
2989 not masked loads or stores). */
2990 if (DR_IS_READ (dra) != DR_IS_READ (drb)
2991 || data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2992 DR_BASE_ADDRESS (drb)) != 0
2993 || data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb)) != 0
2994 || !can_group_stmts_p (stmtinfo_a, stmtinfo_b, true))
2995 break;
2996
2997 /* Check that the data-refs have the same constant size. */
2998 tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
2999 tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
3000 if (!tree_fits_uhwi_p (sza)
3001 || !tree_fits_uhwi_p (szb)
3002 || !tree_int_cst_equal (sza, szb))
3003 break;
3004
3005 /* Check that the data-refs have the same step. */
3006 if (data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb)) != 0)
3007 break;
3008
3009 /* Check the types are compatible.
3010 ??? We don't distinguish this during sorting. */
3011 if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
3012 TREE_TYPE (DR_REF (drb))))
3013 break;
3014
3015 /* Check that the DR_INITs are compile-time constants. */
3016 if (TREE_CODE (DR_INIT (dra)) != INTEGER_CST
3017 || TREE_CODE (DR_INIT (drb)) != INTEGER_CST)
3018 break;
3019
3020 /* Different .GOMP_SIMD_LANE calls still give the same lane,
3021 just hold extra information. */
3022 if (STMT_VINFO_SIMD_LANE_ACCESS_P (stmtinfo_a)
3023 && STMT_VINFO_SIMD_LANE_ACCESS_P (stmtinfo_b)
3024 && data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb)) == 0)
3025 break;
3026
3027 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
3028 HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
3029 HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
3030 HOST_WIDE_INT init_prev
3031 = TREE_INT_CST_LOW (DR_INIT (datarefs_copy[i-1]));
3032 gcc_assert (init_a <= init_b
3033 && init_a <= init_prev
3034 && init_prev <= init_b);
3035
3036 /* Do not place the same access in the interleaving chain twice. */
3037 if (init_b == init_prev)
3038 {
3039 gcc_assert (gimple_uid (DR_STMT (datarefs_copy[i-1]))
3040 < gimple_uid (DR_STMT (drb)));
3041 /* Simply link in duplicates and fix up the chain below. */
3042 }
3043 else
3044 {
3045 /* If init_b == init_a + the size of the type * k, we have an
3046 interleaving, and DRA is accessed before DRB. */
3047 HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
3048 if (type_size_a == 0
3049 || (init_b - init_a) % type_size_a != 0)
3050 break;
3051
3052 /* For stores, require that the accesses be adjacent. This splits
3053 groups into chunks we support (we don't support vectorization
3054 of stores with gaps). */
3055 if (!DR_IS_READ (dra) && init_b - init_prev != type_size_a)
3056 break;
3057
3058 /* If the step (if not zero or non-constant) is greater than the
3059 difference between data-refs' inits this splits groups into
3060 suitable sizes. */
3061 if (tree_fits_shwi_p (DR_STEP (dra)))
3062 {
3063 HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra));
3064 if (step != 0 && step <= (init_b - init_a))
3065 break;
3066 }
3067 }
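/* Illustrative example (arbitrary inits): two 4-byte loads with
   DR_INITs 0 and 8 and a common constant step of 16 pass the checks
   above (8 - 0 is a multiple of 4 and smaller than the step), so
   they are linked into one interleaving group below; had the step
   been 8, the chain would have been split here.  */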
3068
3069 if (dump_enabled_p ())
3070 dump_printf_loc (MSG_NOTE, vect_location,
3071 DR_IS_READ (dra)
3072 ? "Detected interleaving load %T and %T\n"
3073 : "Detected interleaving store %T and %T\n",
3074 DR_REF (dra), DR_REF (drb));
3075
3076 /* Link the found element into the group list. */
3077 if (!DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
3078 {
3079 DR_GROUP_FIRST_ELEMENT (stmtinfo_a) = stmtinfo_a;
3080 lastinfo = stmtinfo_a;
3081 }
3082 DR_GROUP_FIRST_ELEMENT (stmtinfo_b) = stmtinfo_a;
3083 DR_GROUP_NEXT_ELEMENT (lastinfo) = stmtinfo_b;
3084 lastinfo = stmtinfo_b;
3085
3086 STMT_VINFO_SLP_VECT_ONLY (stmtinfo_a)
3087 = !can_group_stmts_p (stmtinfo_a, stmtinfo_b, false);
3088
3089 if (dump_enabled_p () && STMT_VINFO_SLP_VECT_ONLY (stmtinfo_a))
3090 dump_printf_loc (MSG_NOTE, vect_location,
3091 "Load suitable for SLP vectorization only.\n");
3092
3093 if (init_b == init_prev
3094 && !to_fixup.add (DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
3095 && dump_enabled_p ())
3096 dump_printf_loc (MSG_NOTE, vect_location,
3097 "Queuing group with duplicate access for fixup\n");
3098 }
3099 }
3100
3101 /* Fix up groups with duplicate entries by splitting them. */
3102 while (1)
3103 {
3104 hash_set<stmt_vec_info>::iterator it = to_fixup.begin ();
3105 if (!(it != to_fixup.end ()))
3106 break;
3107 stmt_vec_info grp = *it;
3108 to_fixup.remove (grp);
3109
3110 /* Find the earliest duplicate group member. */
3111 unsigned first_duplicate = -1u;
3112 stmt_vec_info next, g = grp;
3113 while ((next = DR_GROUP_NEXT_ELEMENT (g)))
3114 {
3115 if (tree_int_cst_equal (DR_INIT (STMT_VINFO_DR_INFO (next)->dr),
3116 DR_INIT (STMT_VINFO_DR_INFO (g)->dr))
3117 && gimple_uid (STMT_VINFO_STMT (next)) < first_duplicate)
3118 first_duplicate = gimple_uid (STMT_VINFO_STMT (next));
3119 g = next;
3120 }
3121 if (first_duplicate == -1U)
3122 continue;
3123
3124 /* Then move all stmts after the first duplicate to a new group.
3125 Note this is a heuristic, but one with the property that the group
3126 GRP taken from the worklist is fixed up completely. */
3127 g = grp;
3128 stmt_vec_info newgroup = NULL, ng = grp;
3129 while ((next = DR_GROUP_NEXT_ELEMENT (g)))
3130 {
3131 if (gimple_uid (STMT_VINFO_STMT (next)) >= first_duplicate)
3132 {
3133 DR_GROUP_NEXT_ELEMENT (g) = DR_GROUP_NEXT_ELEMENT (next);
3134 if (!newgroup)
3135 newgroup = next;
3136 else
3137 DR_GROUP_NEXT_ELEMENT (ng) = next;
3138 ng = next;
3139 DR_GROUP_FIRST_ELEMENT (ng) = newgroup;
3140 }
3141 else
3142 g = DR_GROUP_NEXT_ELEMENT (g);
3143 }
3144 DR_GROUP_NEXT_ELEMENT (ng) = NULL;
3145
3146 /* Fixup the new group which still may contain duplicates. */
3147 to_fixup.add (newgroup);
3148 }
3149
3150 FOR_EACH_VEC_ELT (datarefs_copy, i, dr)
3151 {
3152 dr_vec_info *dr_info = vinfo->lookup_dr (dr);
3153 if (STMT_VINFO_VECTORIZABLE (dr_info->stmt)
3154 && !vect_analyze_data_ref_access (dr_info))
3155 {
3156 if (dump_enabled_p ())
3157 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3158 "not vectorized: complicated access pattern.\n");
3159
3160 if (is_a <bb_vec_info> (vinfo))
3161 {
3162 /* Mark the statement as not vectorizable. */
3163 STMT_VINFO_VECTORIZABLE (dr_info->stmt) = false;
3164 continue;
3165 }
3166 else
3167 {
3168 datarefs_copy.release ();
3169 return opt_result::failure_at (dr_info->stmt->stmt,
3170 "not vectorized:"
3171 " complicated access pattern.\n");
3172 }
3173 }
3174 }
3175
3176 datarefs_copy.release ();
3177 return opt_result::success ();
3178 }
3179
3180 /* Function vect_vfa_segment_size.
3181
3182 Input:
3183 DR_INFO: The data reference.
3184 LENGTH_FACTOR: segment length to consider.
3185
3186 Return a value suitable for the dr_with_seg_len::seg_len field.
3187 This is the "distance travelled" by the pointer from the first
3188 iteration in the segment to the last. Note that it does not include
3189 the size of the access; in effect it only describes the first byte. */
3190
3191 static tree
3192 vect_vfa_segment_size (dr_vec_info *dr_info, tree length_factor)
3193 {
3194 length_factor = size_binop (MINUS_EXPR,
3195 fold_convert (sizetype, length_factor),
3196 size_one_node);
3197 return size_binop (MULT_EXPR, fold_convert (sizetype, DR_STEP (dr_info->dr)),
3198 length_factor);
3199 }
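/* Illustrative example (arbitrary sizes): for a unit-stride access to
   4-byte elements and LENGTH_FACTOR == 8, the segment length is
   4 * (8 - 1) == 28 bytes, the pointer distance from the first to the
   last scalar iteration of the segment, excluding the access size
   itself.  */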
3200
3201 /* Return a value that, when added to abs (vect_vfa_segment_size (DR_INFO)),
3202 gives the worst-case number of bytes covered by the segment. */
3203
3204 static unsigned HOST_WIDE_INT
3205 vect_vfa_access_size (dr_vec_info *dr_info)
3206 {
3207 stmt_vec_info stmt_vinfo = dr_info->stmt;
3208 tree ref_type = TREE_TYPE (DR_REF (dr_info->dr));
3209 unsigned HOST_WIDE_INT ref_size = tree_to_uhwi (TYPE_SIZE_UNIT (ref_type));
3210 unsigned HOST_WIDE_INT access_size = ref_size;
3211 if (DR_GROUP_FIRST_ELEMENT (stmt_vinfo))
3212 {
3213 gcc_assert (DR_GROUP_FIRST_ELEMENT (stmt_vinfo) == stmt_vinfo);
3214 access_size *= DR_GROUP_SIZE (stmt_vinfo) - DR_GROUP_GAP (stmt_vinfo);
3215 }
3216 if (STMT_VINFO_VEC_STMT (stmt_vinfo)
3217 && (vect_supportable_dr_alignment (dr_info, false)
3218 == dr_explicit_realign_optimized))
3219 {
3220 /* We might access a full vector's worth. */
3221 tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
3222 access_size += tree_to_uhwi (TYPE_SIZE_UNIT (vectype)) - ref_size;
3223 }
3224 return access_size;
3225 }
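/* Illustrative example (arbitrary group shape): for the leading
   4-byte element of a group of size 4 with a gap of 1, the worst-case
   access size is 4 * (4 - 1) == 12 bytes, plus a further
   vector-size - 4 bytes if an optimized explicit realignment scheme
   may read a whole vector.  */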
3226
3227 /* Get the minimum alignment for all the scalar accesses that DR_INFO
3228 describes. */
3229
3230 static unsigned int
3231 vect_vfa_align (dr_vec_info *dr_info)
3232 {
3233 return TYPE_ALIGN_UNIT (TREE_TYPE (DR_REF (dr_info->dr)));
3234 }
3235
3236 /* Function vect_no_alias_p.
3237
3238 Given data references A and B with equal base and offset, see whether
3239 the alias relation can be decided at compilation time. Return 1 if
3240 it can and the references alias, 0 if it can and the references do
3241 not alias, and -1 if we cannot decide at compile time. SEGMENT_LENGTH_A,
3242 SEGMENT_LENGTH_B, ACCESS_SIZE_A and ACCESS_SIZE_B are the equivalent
3243 of dr_with_seg_len::{seg_len,access_size} for A and B. */
3244
3245 static int
3246 vect_compile_time_alias (dr_vec_info *a, dr_vec_info *b,
3247 tree segment_length_a, tree segment_length_b,
3248 unsigned HOST_WIDE_INT access_size_a,
3249 unsigned HOST_WIDE_INT access_size_b)
3250 {
3251 poly_offset_int offset_a = wi::to_poly_offset (DR_INIT (a->dr));
3252 poly_offset_int offset_b = wi::to_poly_offset (DR_INIT (b->dr));
3253 poly_uint64 const_length_a;
3254 poly_uint64 const_length_b;
3255
3256 /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
3257 bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
3258 [a, a+12) */
3259 if (tree_int_cst_compare (DR_STEP (a->dr), size_zero_node) < 0)
3260 {
3261 const_length_a = (-wi::to_poly_wide (segment_length_a)).force_uhwi ();
3262 offset_a = (offset_a + access_size_a) - const_length_a;
3263 }
3264 else
3265 const_length_a = tree_to_poly_uint64 (segment_length_a);
3266 if (tree_int_cst_compare (DR_STEP (b->dr), size_zero_node) < 0)
3267 {
3268 const_length_b = (-wi::to_poly_wide (segment_length_b)).force_uhwi ();
3269 offset_b = (offset_b + access_size_b) - const_length_b;
3270 }
3271 else
3272 const_length_b = tree_to_poly_uint64 (segment_length_b);
3273
3274 const_length_a += access_size_a;
3275 const_length_b += access_size_b;
3276
3277 if (ranges_known_overlap_p (offset_a, const_length_a,
3278 offset_b, const_length_b))
3279 return 1;
3280
3281 if (!ranges_maybe_overlap_p (offset_a, const_length_a,
3282 offset_b, const_length_b))
3283 return 0;
3284
3285 return -1;
3286 }
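/* Illustrative example (arbitrary offsets): with DR_INITs of 0 and 64,
   positive steps, segment lengths of 28 bytes and access sizes of 4,
   the covered ranges are [0, 32) and [64, 96); they cannot overlap,
   so the function returns 0 and no runtime alias check is needed for
   this pair.  */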
3287
3288 /* Return true if the minimum nonzero dependence distance for loop LOOP_DEPTH
3289 in DDR is >= VF. */
3290
3291 static bool
3292 dependence_distance_ge_vf (data_dependence_relation *ddr,
3293 unsigned int loop_depth, poly_uint64 vf)
3294 {
3295 if (DDR_ARE_DEPENDENT (ddr) != NULL_TREE
3296 || DDR_NUM_DIST_VECTS (ddr) == 0)
3297 return false;
3298
3299 /* If the dependence is exact, we should have limited the VF instead. */
3300 gcc_checking_assert (DDR_COULD_BE_INDEPENDENT_P (ddr));
3301
3302 unsigned int i;
3303 lambda_vector dist_v;
3304 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
3305 {
3306 HOST_WIDE_INT dist = dist_v[loop_depth];
3307 if (dist != 0
3308 && !(dist > 0 && DDR_REVERSED_P (ddr))
3309 && maybe_lt ((unsigned HOST_WIDE_INT) abs_hwi (dist), vf))
3310 return false;
3311 }
3312
3313 if (dump_enabled_p ())
3314 dump_printf_loc (MSG_NOTE, vect_location,
3315 "dependence distance between %T and %T is >= VF\n",
3316 DR_REF (DDR_A (ddr)), DR_REF (DDR_B (ddr)));
3317
3318 return true;
3319 }
3320
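/* Worked example (editorial, not part of GCC): in the loop below, if A and
   B turn out to be the same array, the write to b[i + 8] and the read of
   a[i] have dependence distance 8 iterations.  With VF <= 8 each vector
   iteration's loads and stores stay disjoint and the producer/consumer
   order is preserved, so the check above can drop this DDR from the
   runtime alias tests; VF = 16 would not satisfy dist >= VF.  */

void
example_dependence_distance (int *a, int *b, int n)
{
  for (int i = 0; i < n; i++)
    b[i + 8] = a[i] + 1;
}
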
3321 /* Dump LOWER_BOUND using flags DUMP_KIND. Dumps are known to be enabled. */
3322
3323 static void
3324 dump_lower_bound (dump_flags_t dump_kind, const vec_lower_bound &lower_bound)
3325 {
3326 dump_printf (dump_kind, "%s (%T) >= ",
3327 lower_bound.unsigned_p ? "unsigned" : "abs",
3328 lower_bound.expr);
3329 dump_dec (dump_kind, lower_bound.min_value);
3330 }
3331
3332 /* Record that the vectorized loop requires the vec_lower_bound described
3333 by EXPR, UNSIGNED_P and MIN_VALUE. */
3334
3335 static void
3336 vect_check_lower_bound (loop_vec_info loop_vinfo, tree expr, bool unsigned_p,
3337 poly_uint64 min_value)
3338 {
3339 vec<vec_lower_bound> lower_bounds = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
3340 for (unsigned int i = 0; i < lower_bounds.length (); ++i)
3341 if (operand_equal_p (lower_bounds[i].expr, expr, 0))
3342 {
3343 unsigned_p &= lower_bounds[i].unsigned_p;
3344 min_value = upper_bound (lower_bounds[i].min_value, min_value);
3345 if (lower_bounds[i].unsigned_p != unsigned_p
3346 || maybe_lt (lower_bounds[i].min_value, min_value))
3347 {
3348 lower_bounds[i].unsigned_p = unsigned_p;
3349 lower_bounds[i].min_value = min_value;
3350 if (dump_enabled_p ())
3351 {
3352 dump_printf_loc (MSG_NOTE, vect_location,
3353 "updating run-time check to ");
3354 dump_lower_bound (MSG_NOTE, lower_bounds[i]);
3355 dump_printf (MSG_NOTE, "\n");
3356 }
3357 }
3358 return;
3359 }
3360
3361 vec_lower_bound lower_bound (expr, unsigned_p, min_value);
3362 if (dump_enabled_p ())
3363 {
3364 dump_printf_loc (MSG_NOTE, vect_location, "need a run-time check that ");
3365 dump_lower_bound (MSG_NOTE, lower_bound);
3366 dump_printf (MSG_NOTE, "\n");
3367 }
3368 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).safe_push (lower_bound);
3369 }
3370
3371 /* Return true if it's unlikely that the step of the vectorized form of DR_INFO
3372 will span fewer than GAP bytes. */
3373
3374 static bool
3375 vect_small_gap_p (loop_vec_info loop_vinfo, dr_vec_info *dr_info,
3376 poly_int64 gap)
3377 {
3378 stmt_vec_info stmt_info = dr_info->stmt;
3379 HOST_WIDE_INT count
3380 = estimated_poly_value (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
3381 if (DR_GROUP_FIRST_ELEMENT (stmt_info))
3382 count *= DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (stmt_info));
3383 return (estimated_poly_value (gap)
3384 <= count * vect_get_scalar_dr_size (dr_info));
3385 }
3386
3387 /* Return true if we know that there is no alias between DR_INFO_A and
3388 DR_INFO_B when abs (DR_STEP (DR_INFO_A->dr)) >= N for some N.
3389 When returning true, set *LOWER_BOUND_OUT to this N. */
3390
3391 static bool
3392 vectorizable_with_step_bound_p (dr_vec_info *dr_info_a, dr_vec_info *dr_info_b,
3393 poly_uint64 *lower_bound_out)
3394 {
3395 /* Check that there is a constant gap of known sign between DR_A
3396 and DR_B. */
3397 data_reference *dr_a = dr_info_a->dr;
3398 data_reference *dr_b = dr_info_b->dr;
3399 poly_int64 init_a, init_b;
3400 if (!operand_equal_p (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b), 0)
3401 || !operand_equal_p (DR_OFFSET (dr_a), DR_OFFSET (dr_b), 0)
3402 || !operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0)
3403 || !poly_int_tree_p (DR_INIT (dr_a), &init_a)
3404 || !poly_int_tree_p (DR_INIT (dr_b), &init_b)
3405 || !ordered_p (init_a, init_b))
3406 return false;
3407
3408 /* Sort DR_A and DR_B by the address they access. */
3409 if (maybe_lt (init_b, init_a))
3410 {
3411 std::swap (init_a, init_b);
3412 std::swap (dr_info_a, dr_info_b);
3413 std::swap (dr_a, dr_b);
3414 }
3415
3416 /* If the two accesses could be dependent within a scalar iteration,
3417 make sure that we'd retain their order. */
3418 if (maybe_gt (init_a + vect_get_scalar_dr_size (dr_info_a), init_b)
3419 && !vect_preserves_scalar_order_p (dr_info_a, dr_info_b))
3420 return false;
3421
3422 /* There is no alias if abs (DR_STEP) is greater than or equal to
3423 the bytes spanned by the combination of the two accesses. */
3424 *lower_bound_out = init_b + vect_get_scalar_dr_size (dr_info_b) - init_a;
3425 return true;
3426 }
3427
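/* Editorial sketch (not part of GCC): once the two accesses share base,
   offset and step, the lower bound N computed above is simply the span
   from the lower starting byte to the end of the higher access.  Plain
   integers stand in for the poly_int64 values here.  */

static long
example_step_lower_bound (long init_a, long size_a, long init_b, long size_b)
{
  /* Sort by start address, as the real code does.  */
  if (init_b < init_a)
    {
      long tmp;
      tmp = init_a, init_a = init_b, init_b = tmp;
      tmp = size_a, size_a = size_b, size_b = tmp;
    }
  /* There is no alias once abs (DR_STEP) covers at least this many bytes.  */
  return init_b + size_b - init_a;
}
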
3428 /* Function vect_prune_runtime_alias_test_list.
3429
3430 Prune a list of ddrs to be tested at run-time by versioning for alias.
3431 Merge several alias checks into one if possible.
3432 Return FALSE if the resulting list of ddrs is longer than allowed by
3433 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
3434
3435 opt_result
3436 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
3437 {
3438 typedef pair_hash <tree_operand_hash, tree_operand_hash> tree_pair_hash;
3439 hash_set <tree_pair_hash> compared_objects;
3440
3441 vec<ddr_p> may_alias_ddrs = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
3442 vec<dr_with_seg_len_pair_t> &comp_alias_ddrs
3443 = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3444 vec<vec_object_pair> &check_unequal_addrs
3445 = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3446 poly_uint64 vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3447 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
3448
3449 ddr_p ddr;
3450 unsigned int i;
3451 tree length_factor;
3452
3453 DUMP_VECT_SCOPE ("vect_prune_runtime_alias_test_list");
3454
3455 /* Step values are irrelevant for aliasing if the number of vector
3456 iterations is equal to the number of scalar iterations (which can
3457 happen for fully-SLP loops). */
3458 bool ignore_step_p = known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U);
3459
3460 if (!ignore_step_p)
3461 {
3462 /* Convert the checks for nonzero steps into bound tests. */
3463 tree value;
3464 FOR_EACH_VEC_ELT (LOOP_VINFO_CHECK_NONZERO (loop_vinfo), i, value)
3465 vect_check_lower_bound (loop_vinfo, value, true, 1);
3466 }
3467
3468 if (may_alias_ddrs.is_empty ())
3469 return opt_result::success ();
3470
3471 comp_alias_ddrs.create (may_alias_ddrs.length ());
3472
3473 unsigned int loop_depth
3474 = index_in_loop_nest (LOOP_VINFO_LOOP (loop_vinfo)->num,
3475 LOOP_VINFO_LOOP_NEST (loop_vinfo));
3476
3477 /* First, we collect all data ref pairs for aliasing checks. */
3478 FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
3479 {
3480 poly_uint64 lower_bound;
3481 tree segment_length_a, segment_length_b;
3482 unsigned HOST_WIDE_INT access_size_a, access_size_b;
3483 unsigned int align_a, align_b;
3484
3485 /* Ignore the alias if the VF we chose ended up being no greater
3486 than the dependence distance. */
3487 if (dependence_distance_ge_vf (ddr, loop_depth, vect_factor))
3488 continue;
3489
3490 if (DDR_OBJECT_A (ddr))
3491 {
3492 vec_object_pair new_pair (DDR_OBJECT_A (ddr), DDR_OBJECT_B (ddr));
3493 if (!compared_objects.add (new_pair))
3494 {
3495 if (dump_enabled_p ())
3496 dump_printf_loc (MSG_NOTE, vect_location,
3497 "checking that %T and %T"
3498 " have different addresses\n",
3499 new_pair.first, new_pair.second);
3500 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).safe_push (new_pair);
3501 }
3502 continue;
3503 }
3504
3505 dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (DDR_A (ddr));
3506 stmt_vec_info stmt_info_a = dr_info_a->stmt;
3507
3508 dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (DDR_B (ddr));
3509 stmt_vec_info stmt_info_b = dr_info_b->stmt;
3510
3511 bool preserves_scalar_order_p
3512 = vect_preserves_scalar_order_p (dr_info_a, dr_info_b);
3513
3514 /* Skip the pair if inter-iteration dependencies are irrelevant
3515 and intra-iteration dependencies are guaranteed to be honored. */
3516 if (ignore_step_p
3517 && (preserves_scalar_order_p
3518 || vectorizable_with_step_bound_p (dr_info_a, dr_info_b,
3519 &lower_bound)))
3520 {
3521 if (dump_enabled_p ())
3522 dump_printf_loc (MSG_NOTE, vect_location,
3523 "no need for alias check between "
3524 "%T and %T when VF is 1\n",
3525 DR_REF (dr_info_a->dr), DR_REF (dr_info_b->dr));
3526 continue;
3527 }
3528
3529 /* See whether we can handle the alias using a bounds check on
3530 the step, and whether that's likely to be the best approach.
3531 (It might not be, for example, if the minimum step is much larger
3532 than the number of bytes handled by one vector iteration.) */
3533 if (!ignore_step_p
3534 && TREE_CODE (DR_STEP (dr_info_a->dr)) != INTEGER_CST
3535 && vectorizable_with_step_bound_p (dr_info_a, dr_info_b,
3536 &lower_bound)
3537 && (vect_small_gap_p (loop_vinfo, dr_info_a, lower_bound)
3538 || vect_small_gap_p (loop_vinfo, dr_info_b, lower_bound)))
3539 {
3540 bool unsigned_p = dr_known_forward_stride_p (dr_info_a->dr);
3541 if (dump_enabled_p ())
3542 {
3543 dump_printf_loc (MSG_NOTE, vect_location, "no alias between "
3544 "%T and %T when the step %T is outside ",
3545 DR_REF (dr_info_a->dr),
3546 DR_REF (dr_info_b->dr),
3547 DR_STEP (dr_info_a->dr));
3548 if (unsigned_p)
3549 dump_printf (MSG_NOTE, "[0");
3550 else
3551 {
3552 dump_printf (MSG_NOTE, "(");
3553 dump_dec (MSG_NOTE, poly_int64 (-lower_bound));
3554 }
3555 dump_printf (MSG_NOTE, ", ");
3556 dump_dec (MSG_NOTE, lower_bound);
3557 dump_printf (MSG_NOTE, ")\n");
3558 }
3559 vect_check_lower_bound (loop_vinfo, DR_STEP (dr_info_a->dr),
3560 unsigned_p, lower_bound);
3561 continue;
3562 }
3563
3564 stmt_vec_info dr_group_first_a = DR_GROUP_FIRST_ELEMENT (stmt_info_a);
3565 if (dr_group_first_a)
3566 {
3567 stmt_info_a = dr_group_first_a;
3568 dr_info_a = STMT_VINFO_DR_INFO (stmt_info_a);
3569 }
3570
3571 stmt_vec_info dr_group_first_b = DR_GROUP_FIRST_ELEMENT (stmt_info_b);
3572 if (dr_group_first_b)
3573 {
3574 stmt_info_b = dr_group_first_b;
3575 dr_info_b = STMT_VINFO_DR_INFO (stmt_info_b);
3576 }
3577
3578 if (ignore_step_p)
3579 {
3580 segment_length_a = size_zero_node;
3581 segment_length_b = size_zero_node;
3582 }
3583 else
3584 {
3585 if (!operand_equal_p (DR_STEP (dr_info_a->dr),
3586 DR_STEP (dr_info_b->dr), 0))
3587 length_factor = scalar_loop_iters;
3588 else
3589 length_factor = size_int (vect_factor);
3590 segment_length_a = vect_vfa_segment_size (dr_info_a, length_factor);
3591 segment_length_b = vect_vfa_segment_size (dr_info_b, length_factor);
3592 }
3593 access_size_a = vect_vfa_access_size (dr_info_a);
3594 access_size_b = vect_vfa_access_size (dr_info_b);
3595 align_a = vect_vfa_align (dr_info_a);
3596 align_b = vect_vfa_align (dr_info_b);
3597
3598 /* See whether the alias is known at compilation time. */
3599 if (operand_equal_p (DR_BASE_ADDRESS (dr_info_a->dr),
3600 DR_BASE_ADDRESS (dr_info_b->dr), 0)
3601 && operand_equal_p (DR_OFFSET (dr_info_a->dr),
3602 DR_OFFSET (dr_info_b->dr), 0)
3603 && TREE_CODE (DR_STEP (dr_info_a->dr)) == INTEGER_CST
3604 && TREE_CODE (DR_STEP (dr_info_b->dr)) == INTEGER_CST
3605 && poly_int_tree_p (segment_length_a)
3606 && poly_int_tree_p (segment_length_b))
3607 {
3608 int res = vect_compile_time_alias (dr_info_a, dr_info_b,
3609 segment_length_a,
3610 segment_length_b,
3611 access_size_a,
3612 access_size_b);
3613 if (res >= 0 && dump_enabled_p ())
3614 {
3615 dump_printf_loc (MSG_NOTE, vect_location,
3616 "can tell at compile time that %T and %T",
3617 DR_REF (dr_info_a->dr), DR_REF (dr_info_b->dr));
3618 if (res == 0)
3619 dump_printf (MSG_NOTE, " do not alias\n");
3620 else
3621 dump_printf (MSG_NOTE, " alias\n");
3622 }
3623
3624 if (res == 0)
3625 continue;
3626
3627 if (res == 1)
3628 return opt_result::failure_at (stmt_info_b->stmt,
3629 "not vectorized:"
3630 " compilation time alias: %G%G",
3631 stmt_info_a->stmt,
3632 stmt_info_b->stmt);
3633 }
3634
3635 dr_with_seg_len dr_a (dr_info_a->dr, segment_length_a,
3636 access_size_a, align_a);
3637 dr_with_seg_len dr_b (dr_info_b->dr, segment_length_b,
3638 access_size_b, align_b);
3639 /* Canonicalize the order to be the one that's needed for accurate
3640 RAW, WAR and WAW flags, in cases where the data references are
3641 well-ordered. The order doesn't really matter otherwise,
3642 but we might as well be consistent. */
3643 if (get_later_stmt (stmt_info_a, stmt_info_b) == stmt_info_a)
3644 std::swap (dr_a, dr_b);
3645
3646 dr_with_seg_len_pair_t dr_with_seg_len_pair
3647 (dr_a, dr_b, (preserves_scalar_order_p
3648 ? dr_with_seg_len_pair_t::WELL_ORDERED
3649 : dr_with_seg_len_pair_t::REORDERED));
3650
3651 comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
3652 }
3653
3654 prune_runtime_alias_test_list (&comp_alias_ddrs, vect_factor);
3655
3656 unsigned int count = (comp_alias_ddrs.length ()
3657 + check_unequal_addrs.length ());
3658
3659 if (dump_enabled_p ())
3660 dump_printf_loc (MSG_NOTE, vect_location,
3661 "improved number of alias checks from %d to %d\n",
3662 may_alias_ddrs.length (), count);
3663 unsigned limit = param_vect_max_version_for_alias_checks;
3664 if (flag_simd_cost_model == VECT_COST_MODEL_CHEAP)
3665 limit = param_vect_max_version_for_alias_checks * 6 / 10;
3666 if (count > limit)
3667 return opt_result::failure_at
3668 (vect_location,
3669 "number of versioning for alias run-time tests exceeds %d "
3670 "(--param vect-max-version-for-alias-checks)\n", limit);
3671
3672 return opt_result::success ();
3673 }
3674
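/* Editorial sketch (not part of GCC): each pair that survives the pruning
   above is later expanded, by the code that emits the versioning condition,
   into a test of roughly the shape below, where the "full" lengths combine
   dr_with_seg_len::seg_len and ::access_size.  Hypothetical plain-integer
   addresses stand in for the generated gimple.  */

static int
example_segments_do_not_overlap (unsigned long addr_a, unsigned long len_a,
				 unsigned long addr_b, unsigned long len_b)
{
  /* The vectorized loop version is entered only if the two segments are
     provably disjoint.  */
  return addr_a + len_a <= addr_b || addr_b + len_b <= addr_a;
}
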
3675 /* Check whether we can use an internal function for a gather load
3676 or scatter store. READ_P is true for loads and false for stores.
3677 MASKED_P is true if the load or store is conditional. MEMORY_TYPE is
3678 the type of the memory elements being loaded or stored. OFFSET_TYPE
3679 is the type of the offset that is being applied to the invariant
3680 base address. SCALE is the amount by which the offset should
3681 be multiplied *after* it has been converted to address width.
3682
3683 Return true if the function is supported, storing the function id in
3684 *IFN_OUT and the vector type for the offset in *OFFSET_VECTYPE_OUT. */
3685
3686 bool
3687 vect_gather_scatter_fn_p (vec_info *vinfo, bool read_p, bool masked_p,
3688 tree vectype, tree memory_type, tree offset_type,
3689 int scale, internal_fn *ifn_out,
3690 tree *offset_vectype_out)
3691 {
3692 unsigned int memory_bits = tree_to_uhwi (TYPE_SIZE (memory_type));
3693 unsigned int element_bits = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (vectype)));
3694 if (element_bits != memory_bits)
3695 /* For now the vector elements must be the same width as the
3696 memory elements. */
3697 return false;
3698
3699 /* Work out which function we need. */
3700 internal_fn ifn;
3701 if (read_p)
3702 ifn = masked_p ? IFN_MASK_GATHER_LOAD : IFN_GATHER_LOAD;
3703 else
3704 ifn = masked_p ? IFN_MASK_SCATTER_STORE : IFN_SCATTER_STORE;
3705
3706 for (;;)
3707 {
3708 tree offset_vectype = get_vectype_for_scalar_type (vinfo, offset_type);
3709 if (!offset_vectype)
3710 return false;
3711
3712 /* Test whether the target supports this combination. */
3713 if (internal_gather_scatter_fn_supported_p (ifn, vectype, memory_type,
3714 offset_vectype, scale))
3715 {
3716 *ifn_out = ifn;
3717 *offset_vectype_out = offset_vectype;
3718 return true;
3719 }
3720
3721 if (TYPE_PRECISION (offset_type) >= POINTER_SIZE
3722 && TYPE_PRECISION (offset_type) >= element_bits)
3723 return false;
3724
3725 offset_type = build_nonstandard_integer_type
3726 (TYPE_PRECISION (offset_type) * 2, TYPE_UNSIGNED (offset_type));
3727 }
3728 }
3729
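/* Editorial sketch (not part of GCC): the retry loop above keeps doubling
   the offset precision until either the target accepts the combination or
   the offset is already at least pointer-sized and element-sized.  A scalar
   analogue, with a hypothetical target-query callback:  */

static int
example_pick_offset_bits (unsigned int offset_bits, unsigned int element_bits,
			  unsigned int pointer_bits,
			  int (*target_supports_p) (unsigned int))
{
  for (;;)
    {
      if (target_supports_p (offset_bits))
	return (int) offset_bits;
      if (offset_bits >= pointer_bits && offset_bits >= element_bits)
	return -1;	/* Give up, as the real code returns false.  */
      offset_bits *= 2;
    }
}
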
3730 /* STMT_INFO is a call to an internal gather load or scatter store function.
3731 Describe the operation in INFO. */
3732
3733 static void
3734 vect_describe_gather_scatter_call (stmt_vec_info stmt_info,
3735 gather_scatter_info *info)
3736 {
3737 gcall *call = as_a <gcall *> (stmt_info->stmt);
3738 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3739 data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3740
3741 info->ifn = gimple_call_internal_fn (call);
3742 info->decl = NULL_TREE;
3743 info->base = gimple_call_arg (call, 0);
3744 info->offset = gimple_call_arg (call, 1);
3745 info->offset_dt = vect_unknown_def_type;
3746 info->offset_vectype = NULL_TREE;
3747 info->scale = TREE_INT_CST_LOW (gimple_call_arg (call, 2));
3748 info->element_type = TREE_TYPE (vectype);
3749 info->memory_type = TREE_TYPE (DR_REF (dr));
3750 }
3751
3752 /* Return true if a non-affine read or write in STMT_INFO is suitable for a
3753 gather load or scatter store. Describe the operation in *INFO if so. */
3754
3755 bool
3756 vect_check_gather_scatter (stmt_vec_info stmt_info, loop_vec_info loop_vinfo,
3757 gather_scatter_info *info)
3758 {
3759 HOST_WIDE_INT scale = 1;
3760 poly_int64 pbitpos, pbitsize;
3761 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3762 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3763 tree offtype = NULL_TREE;
3764 tree decl = NULL_TREE, base, off;
3765 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3766 tree memory_type = TREE_TYPE (DR_REF (dr));
3767 machine_mode pmode;
3768 int punsignedp, reversep, pvolatilep = 0;
3769 internal_fn ifn;
3770 tree offset_vectype;
3771 bool masked_p = false;
3772
3773 /* See whether this is already a call to a gather/scatter internal function.
3774 If not, see whether it's a masked load or store. */
3775 gcall *call = dyn_cast <gcall *> (stmt_info->stmt);
3776 if (call && gimple_call_internal_p (call))
3777 {
3778 ifn = gimple_call_internal_fn (call);
3779 if (internal_gather_scatter_fn_p (ifn))
3780 {
3781 vect_describe_gather_scatter_call (stmt_info, info);
3782 return true;
3783 }
3784 masked_p = (ifn == IFN_MASK_LOAD || ifn == IFN_MASK_STORE);
3785 }
3786
3787 /* True if we should aim to use internal functions rather than
3788 built-in functions. */
3789 bool use_ifn_p = (DR_IS_READ (dr)
3790 ? supports_vec_gather_load_p ()
3791 : supports_vec_scatter_store_p ());
3792
3793 base = DR_REF (dr);
3794 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
3795 see if we can use the def stmt of the address. */
3796 if (masked_p
3797 && TREE_CODE (base) == MEM_REF
3798 && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
3799 && integer_zerop (TREE_OPERAND (base, 1))
3800 && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
3801 {
3802 gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
3803 if (is_gimple_assign (def_stmt)
3804 && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
3805 base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
3806 }
3807
3808 /* The gather and scatter builtins need an address of the form
3809 loop_invariant + vector * {1, 2, 4, 8}
3810 or
3811 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3812 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3813 of loop invariants/SSA_NAMEs defined in the loop, with casts,
3814 multiplications and additions in it. To get a vector, we need
3815 a single SSA_NAME that will be defined in the loop and will
3816 contain everything that is not loop invariant and that can be
3817 vectorized. The following code attempts to find such a preexisting
3818 SSA_NAME OFF and put the loop invariants into a tree BASE
3819 that can be gimplified before the loop. */
3820 base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
3821 &punsignedp, &reversep, &pvolatilep);
3822 if (reversep)
3823 return false;
3824
3825 poly_int64 pbytepos = exact_div (pbitpos, BITS_PER_UNIT);
3826
3827 if (TREE_CODE (base) == MEM_REF)
3828 {
3829 if (!integer_zerop (TREE_OPERAND (base, 1)))
3830 {
3831 if (off == NULL_TREE)
3832 off = wide_int_to_tree (sizetype, mem_ref_offset (base));
3833 else
3834 off = size_binop (PLUS_EXPR, off,
3835 fold_convert (sizetype, TREE_OPERAND (base, 1)));
3836 }
3837 base = TREE_OPERAND (base, 0);
3838 }
3839 else
3840 base = build_fold_addr_expr (base);
3841
3842 if (off == NULL_TREE)
3843 off = size_zero_node;
3844
3845 /* If BASE is not loop invariant, then if OFF is zero we start with just
3846 the constant offset in the loop-invariant BASE and continue with BASE
3847 as OFF; otherwise give up.
3848 We could handle that case by gimplifying the addition of BASE + OFF
3849 into some SSA_NAME and using that as OFF, but for now punt. */
3850 if (!expr_invariant_in_loop_p (loop, base))
3851 {
3852 if (!integer_zerop (off))
3853 return false;
3854 off = base;
3855 base = size_int (pbytepos);
3856 }
3857 /* Otherwise put base + constant offset into the loop invariant BASE
3858 and continue with OFF. */
3859 else
3860 {
3861 base = fold_convert (sizetype, base);
3862 base = size_binop (PLUS_EXPR, base, size_int (pbytepos));
3863 }
3864
3865 /* OFF at this point may be either a SSA_NAME or some tree expression
3866 from get_inner_reference. Try to peel off loop invariants from it
3867 into BASE as long as possible. */
3868 STRIP_NOPS (off);
3869 while (offtype == NULL_TREE)
3870 {
3871 enum tree_code code;
3872 tree op0, op1, add = NULL_TREE;
3873
3874 if (TREE_CODE (off) == SSA_NAME)
3875 {
3876 gimple *def_stmt = SSA_NAME_DEF_STMT (off);
3877
3878 if (expr_invariant_in_loop_p (loop, off))
3879 return false;
3880
3881 if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
3882 break;
3883
3884 op0 = gimple_assign_rhs1 (def_stmt);
3885 code = gimple_assign_rhs_code (def_stmt);
3886 op1 = gimple_assign_rhs2 (def_stmt);
3887 }
3888 else
3889 {
3890 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
3891 return false;
3892 code = TREE_CODE (off);
3893 extract_ops_from_tree (off, &code, &op0, &op1);
3894 }
3895 switch (code)
3896 {
3897 case POINTER_PLUS_EXPR:
3898 case PLUS_EXPR:
3899 if (expr_invariant_in_loop_p (loop, op0))
3900 {
3901 add = op0;
3902 off = op1;
3903 do_add:
3904 add = fold_convert (sizetype, add);
3905 if (scale != 1)
3906 add = size_binop (MULT_EXPR, add, size_int (scale));
3907 base = size_binop (PLUS_EXPR, base, add);
3908 continue;
3909 }
3910 if (expr_invariant_in_loop_p (loop, op1))
3911 {
3912 add = op1;
3913 off = op0;
3914 goto do_add;
3915 }
3916 break;
3917 case MINUS_EXPR:
3918 if (expr_invariant_in_loop_p (loop, op1))
3919 {
3920 add = fold_convert (sizetype, op1);
3921 add = size_binop (MINUS_EXPR, size_zero_node, add);
3922 off = op0;
3923 goto do_add;
3924 }
3925 break;
3926 case MULT_EXPR:
3927 if (scale == 1 && tree_fits_shwi_p (op1))
3928 {
3929 int new_scale = tree_to_shwi (op1);
3930 /* Only treat this as a scaling operation if the target
3931 supports it for at least some offset type. */
3932 if (use_ifn_p
3933 && !vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
3934 masked_p, vectype, memory_type,
3935 signed_char_type_node,
3936 new_scale, &ifn,
3937 &offset_vectype)
3938 && !vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
3939 masked_p, vectype, memory_type,
3940 unsigned_char_type_node,
3941 new_scale, &ifn,
3942 &offset_vectype))
3943 break;
3944 scale = new_scale;
3945 off = op0;
3946 continue;
3947 }
3948 break;
3949 case SSA_NAME:
3950 off = op0;
3951 continue;
3952 CASE_CONVERT:
3953 if (!POINTER_TYPE_P (TREE_TYPE (op0))
3954 && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3955 break;
3956
3957 /* Don't include the conversion if the target is happy with
3958 the current offset type. */
3959 if (use_ifn_p
3960 && vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
3961 masked_p, vectype, memory_type,
3962 TREE_TYPE (off), scale, &ifn,
3963 &offset_vectype))
3964 break;
3965
3966 if (TYPE_PRECISION (TREE_TYPE (op0))
3967 == TYPE_PRECISION (TREE_TYPE (off)))
3968 {
3969 off = op0;
3970 continue;
3971 }
3972
3973 if (TYPE_PRECISION (TREE_TYPE (op0))
3974 < TYPE_PRECISION (TREE_TYPE (off)))
3975 {
3976 off = op0;
3977 offtype = TREE_TYPE (off);
3978 STRIP_NOPS (off);
3979 continue;
3980 }
3981 break;
3982 default:
3983 break;
3984 }
3985 break;
3986 }
3987
3988 /* If at the end OFF still isn't a SSA_NAME or isn't
3989 defined in the loop, punt. */
3990 if (TREE_CODE (off) != SSA_NAME
3991 || expr_invariant_in_loop_p (loop, off))
3992 return false;
3993
3994 if (offtype == NULL_TREE)
3995 offtype = TREE_TYPE (off);
3996
3997 if (use_ifn_p)
3998 {
3999 if (!vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr), masked_p,
4000 vectype, memory_type, offtype, scale,
4001 &ifn, &offset_vectype))
4002 return false;
4003 }
4004 else
4005 {
4006 if (DR_IS_READ (dr))
4007 {
4008 if (targetm.vectorize.builtin_gather)
4009 decl = targetm.vectorize.builtin_gather (vectype, offtype, scale);
4010 }
4011 else
4012 {
4013 if (targetm.vectorize.builtin_scatter)
4014 decl = targetm.vectorize.builtin_scatter (vectype, offtype, scale);
4015 }
4016
4017 if (!decl)
4018 return false;
4019
4020 ifn = IFN_LAST;
4021 /* The offset vector type will be read from DECL when needed. */
4022 offset_vectype = NULL_TREE;
4023 }
4024
4025 info->ifn = ifn;
4026 info->decl = decl;
4027 info->base = base;
4028 info->offset = off;
4029 info->offset_dt = vect_unknown_def_type;
4030 info->offset_vectype = offset_vectype;
4031 info->scale = scale;
4032 info->element_type = TREE_TYPE (vectype);
4033 info->memory_type = memory_type;
4034 return true;
4035 }
4036
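/* Worked example (editorial, not part of GCC): for an indexed load such as
   the one below, the decomposition above would typically leave BASE holding
   the loop-invariant address of b[0], OFF the SSA name carrying idx[i]
   (possibly after a widening conversion), and SCALE absorbing the
   element-size multiplication, which is the form the gather/scatter
   builtins and internal functions expect.  */

void
example_gather_load (float *out, const float *b, const int *idx, int n)
{
  for (int i = 0; i < n; i++)
    out[i] = b[idx[i]];
}
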
4037 /* Find the data references in STMT, analyze them with respect to LOOP and
4038 append them to DATAREFS. Return false if datarefs in this stmt cannot
4039 be handled. */
4040
4041 opt_result
4042 vect_find_stmt_data_reference (loop_p loop, gimple *stmt,
4043 vec<data_reference_p> *datarefs)
4044 {
4045 /* We can ignore clobbers for dataref analysis - they are removed during
4046 loop vectorization and BB vectorization checks dependences with a
4047 stmt walk. */
4048 if (gimple_clobber_p (stmt))
4049 return opt_result::success ();
4050
4051 if (gimple_has_volatile_ops (stmt))
4052 return opt_result::failure_at (stmt, "not vectorized: volatile type: %G",
4053 stmt);
4054
4055 if (stmt_can_throw_internal (cfun, stmt))
4056 return opt_result::failure_at (stmt,
4057 "not vectorized:"
4058 " statement can throw an exception: %G",
4059 stmt);
4060
4061 auto_vec<data_reference_p, 2> refs;
4062 opt_result res = find_data_references_in_stmt (loop, stmt, &refs);
4063 if (!res)
4064 return res;
4065
4066 if (refs.is_empty ())
4067 return opt_result::success ();
4068
4069 if (refs.length () > 1)
4070 return opt_result::failure_at (stmt,
4071 "not vectorized:"
4072 " more than one data ref in stmt: %G", stmt);
4073
4074 if (gcall *call = dyn_cast <gcall *> (stmt))
4075 if (!gimple_call_internal_p (call)
4076 || (gimple_call_internal_fn (call) != IFN_MASK_LOAD
4077 && gimple_call_internal_fn (call) != IFN_MASK_STORE))
4078 return opt_result::failure_at (stmt,
4079 "not vectorized: dr in a call %G", stmt);
4080
4081 data_reference_p dr = refs.pop ();
4082 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
4083 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
4084 return opt_result::failure_at (stmt,
4085 "not vectorized:"
4086 " statement is bitfield access %G", stmt);
4087
4088 if (DR_BASE_ADDRESS (dr)
4089 && TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
4090 return opt_result::failure_at (stmt,
4091 "not vectorized:"
4092 " base addr of dr is a constant\n");
4093
4094 /* Check whether this may be a SIMD lane access and adjust the
4095 DR to make it easier for us to handle it. */
4096 if (loop
4097 && loop->simduid
4098 && (!DR_BASE_ADDRESS (dr)
4099 || !DR_OFFSET (dr)
4100 || !DR_INIT (dr)
4101 || !DR_STEP (dr)))
4102 {
4103 struct data_reference *newdr
4104 = create_data_ref (NULL, loop_containing_stmt (stmt), DR_REF (dr), stmt,
4105 DR_IS_READ (dr), DR_IS_CONDITIONAL_IN_STMT (dr));
4106 if (DR_BASE_ADDRESS (newdr)
4107 && DR_OFFSET (newdr)
4108 && DR_INIT (newdr)
4109 && DR_STEP (newdr)
4110 && TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
4111 && integer_zerop (DR_STEP (newdr)))
4112 {
4113 tree base_address = DR_BASE_ADDRESS (newdr);
4114 tree off = DR_OFFSET (newdr);
4115 tree step = ssize_int (1);
4116 if (integer_zerop (off)
4117 && TREE_CODE (base_address) == POINTER_PLUS_EXPR)
4118 {
4119 off = TREE_OPERAND (base_address, 1);
4120 base_address = TREE_OPERAND (base_address, 0);
4121 }
4122 STRIP_NOPS (off);
4123 if (TREE_CODE (off) == MULT_EXPR
4124 && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
4125 {
4126 step = TREE_OPERAND (off, 1);
4127 off = TREE_OPERAND (off, 0);
4128 STRIP_NOPS (off);
4129 }
4130 if (CONVERT_EXPR_P (off)
4131 && (TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off, 0)))
4132 < TYPE_PRECISION (TREE_TYPE (off))))
4133 off = TREE_OPERAND (off, 0);
4134 if (TREE_CODE (off) == SSA_NAME)
4135 {
4136 gimple *def = SSA_NAME_DEF_STMT (off);
4137 /* Look through widening conversion. */
4138 if (is_gimple_assign (def)
4139 && CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (def)))
4140 {
4141 tree rhs1 = gimple_assign_rhs1 (def);
4142 if (TREE_CODE (rhs1) == SSA_NAME
4143 && INTEGRAL_TYPE_P (TREE_TYPE (rhs1))
4144 && (TYPE_PRECISION (TREE_TYPE (off))
4145 > TYPE_PRECISION (TREE_TYPE (rhs1))))
4146 def = SSA_NAME_DEF_STMT (rhs1);
4147 }
4148 if (is_gimple_call (def)
4149 && gimple_call_internal_p (def)
4150 && (gimple_call_internal_fn (def) == IFN_GOMP_SIMD_LANE))
4151 {
4152 tree arg = gimple_call_arg (def, 0);
4153 tree reft = TREE_TYPE (DR_REF (newdr));
4154 gcc_assert (TREE_CODE (arg) == SSA_NAME);
4155 arg = SSA_NAME_VAR (arg);
4156 if (arg == loop->simduid
4157 /* For now. */
4158 && tree_int_cst_equal (TYPE_SIZE_UNIT (reft), step))
4159 {
4160 DR_BASE_ADDRESS (newdr) = base_address;
4161 DR_OFFSET (newdr) = ssize_int (0);
4162 DR_STEP (newdr) = step;
4163 DR_OFFSET_ALIGNMENT (newdr) = BIGGEST_ALIGNMENT;
4164 DR_STEP_ALIGNMENT (newdr) = highest_pow2_factor (step);
4165 /* Mark as simd-lane access. */
4166 tree arg2 = gimple_call_arg (def, 1);
4167 newdr->aux = (void *) (-1 - tree_to_uhwi (arg2));
4168 free_data_ref (dr);
4169 datarefs->safe_push (newdr);
4170 return opt_result::success ();
4171 }
4172 }
4173 }
4174 }
4175 free_data_ref (newdr);
4176 }
4177
4178 datarefs->safe_push (dr);
4179 return opt_result::success ();
4180 }
4181
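/* Worked example (editorial, not part of GCC): the SIMD-lane handling above
   targets the per-lane arrays that the OpenMP lowering creates for
   privatized storage inside a simd loop, indexed by .GOMP_SIMD_LANE.
   Source of roughly the shape below (compiled with -fopenmp-simd) can give
   rise to such accesses once TMP is forced into memory.  */

void
example_simd_lane_access (float *x, int n)
{
#pragma omp simd
  for (int i = 0; i < n; i++)
    {
      float tmp[2];
      tmp[0] = x[i];
      tmp[1] = tmp[0] * 2.0f;
      x[i] = tmp[1] + 1.0f;
    }
}
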
4182 /* Function vect_analyze_data_refs.
4183
4184 Find all the data references in the loop or basic block.
4185
4186 The general structure of the analysis of data refs in the vectorizer is as
4187 follows:
4188 1- vect_analyze_data_refs(loop/bb): call
4189 compute_data_dependences_for_loop/bb to find and analyze all data-refs
4190 in the loop/bb and their dependences.
4191 2- vect_analyze_dependences(): apply dependence testing using ddrs.
4192 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
4193 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
4194
4195 */
4196
4197 opt_result
4198 vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf, bool *fatal)
4199 {
4200 class loop *loop = NULL;
4201 unsigned int i;
4202 struct data_reference *dr;
4203 tree scalar_type;
4204
4205 DUMP_VECT_SCOPE ("vect_analyze_data_refs");
4206
4207 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
4208 loop = LOOP_VINFO_LOOP (loop_vinfo);
4209
4210 /* Go through the data-refs, check that the analysis succeeded. Update
4211 pointer from stmt_vec_info struct to DR and vectype. */
4212
4213 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
4214 FOR_EACH_VEC_ELT (datarefs, i, dr)
4215 {
4216 enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
4217 poly_uint64 vf;
4218
4219 gcc_assert (DR_REF (dr));
4220 stmt_vec_info stmt_info = vinfo->lookup_stmt (DR_STMT (dr));
4221 gcc_assert (!stmt_info->dr_aux.dr);
4222 stmt_info->dr_aux.dr = dr;
4223 stmt_info->dr_aux.stmt = stmt_info;
4224
4225 /* Check that analysis of the data-ref succeeded. */
4226 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
4227 || !DR_STEP (dr))
4228 {
4229 bool maybe_gather
4230 = DR_IS_READ (dr)
4231 && !TREE_THIS_VOLATILE (DR_REF (dr))
4232 && (targetm.vectorize.builtin_gather != NULL
4233 || supports_vec_gather_load_p ());
4234 bool maybe_scatter
4235 = DR_IS_WRITE (dr)
4236 && !TREE_THIS_VOLATILE (DR_REF (dr))
4237 && (targetm.vectorize.builtin_scatter != NULL
4238 || supports_vec_scatter_store_p ());
4239
4240 /* If the target supports vector gather loads or scatter stores,
4241 see if they can be used. */
4242 if (is_a <loop_vec_info> (vinfo)
4243 && !nested_in_vect_loop_p (loop, stmt_info))
4244 {
4245 if (maybe_gather || maybe_scatter)
4246 {
4247 if (maybe_gather)
4248 gatherscatter = GATHER;
4249 else
4250 gatherscatter = SCATTER;
4251 }
4252 }
4253
4254 if (gatherscatter == SG_NONE)
4255 {
4256 if (dump_enabled_p ())
4257 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4258 "not vectorized: data ref analysis "
4259 "failed %G", stmt_info->stmt);
4260 if (is_a <bb_vec_info> (vinfo))
4261 {
4262 /* In BB vectorization the ref can still participate
4263 in dependence analysis, we just can't vectorize it. */
4264 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4265 continue;
4266 }
4267 return opt_result::failure_at (stmt_info->stmt,
4268 "not vectorized:"
4269 " data ref analysis failed: %G",
4270 stmt_info->stmt);
4271 }
4272 }
4273
4274 /* See if this was detected as SIMD lane access. */
4275 if (dr->aux == (void *)-1
4276 || dr->aux == (void *)-2
4277 || dr->aux == (void *)-3
4278 || dr->aux == (void *)-4)
4279 {
4280 if (nested_in_vect_loop_p (loop, stmt_info))
4281 return opt_result::failure_at (stmt_info->stmt,
4282 "not vectorized:"
4283 " data ref analysis failed: %G",
4284 stmt_info->stmt);
4285 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info)
4286 = -(uintptr_t) dr->aux;
4287 }
4288
4289 tree base = get_base_address (DR_REF (dr));
4290 if (base && VAR_P (base) && DECL_NONALIASED (base))
4291 {
4292 if (dump_enabled_p ())
4293 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4294 "not vectorized: base object not addressable "
4295 "for stmt: %G", stmt_info->stmt);
4296 if (is_a <bb_vec_info> (vinfo))
4297 {
4298 /* In BB vectorization the ref can still participate
4299 in dependence analysis, we just can't vectorize it. */
4300 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4301 continue;
4302 }
4303 return opt_result::failure_at (stmt_info->stmt,
4304 "not vectorized: base object not"
4305 " addressable for stmt: %G",
4306 stmt_info->stmt);
4307 }
4308
4309 if (is_a <loop_vec_info> (vinfo)
4310 && DR_STEP (dr)
4311 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
4312 {
4313 if (nested_in_vect_loop_p (loop, stmt_info))
4314 return opt_result::failure_at (stmt_info->stmt,
4315 "not vectorized: "
4316 "not suitable for strided load %G",
4317 stmt_info->stmt);
4318 STMT_VINFO_STRIDED_P (stmt_info) = true;
4319 }
4320
4321 /* Update DR field in stmt_vec_info struct. */
4322
4323 /* If the dataref is in an inner-loop of the loop that is considered
4324 for vectorization, we also want to analyze the access relative to
4325 the outer-loop (DR contains information only relative to the
4326 inner-most enclosing loop). We do that by building a reference to the
4327 first location accessed by the inner-loop, and analyze it relative to
4328 the outer-loop. */
4329 if (loop && nested_in_vect_loop_p (loop, stmt_info))
4330 {
4331 /* Build a reference to the first location accessed by the
4332 inner loop: *(BASE + INIT + OFFSET). By construction,
4333 this address must be invariant in the inner loop, so we
4334 can consider it as being used in the outer loop. */
4335 tree base = unshare_expr (DR_BASE_ADDRESS (dr));
4336 tree offset = unshare_expr (DR_OFFSET (dr));
4337 tree init = unshare_expr (DR_INIT (dr));
4338 tree init_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset),
4339 init, offset);
4340 tree init_addr = fold_build_pointer_plus (base, init_offset);
4341 tree init_ref = build_fold_indirect_ref (init_addr);
4342
4343 if (dump_enabled_p ())
4344 dump_printf_loc (MSG_NOTE, vect_location,
4345 "analyze in outer loop: %T\n", init_ref);
4346
4347 opt_result res
4348 = dr_analyze_innermost (&STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info),
4349 init_ref, loop, stmt_info->stmt);
4350 if (!res)
4351 /* dr_analyze_innermost already explained the failure. */
4352 return res;
4353
4354 if (dump_enabled_p ())
4355 dump_printf_loc (MSG_NOTE, vect_location,
4356 "\touter base_address: %T\n"
4357 "\touter offset from base address: %T\n"
4358 "\touter constant offset from base address: %T\n"
4359 "\touter step: %T\n"
4360 "\touter base alignment: %d\n\n"
4361 "\touter base misalignment: %d\n"
4362 "\touter offset alignment: %d\n"
4363 "\touter step alignment: %d\n",
4364 STMT_VINFO_DR_BASE_ADDRESS (stmt_info),
4365 STMT_VINFO_DR_OFFSET (stmt_info),
4366 STMT_VINFO_DR_INIT (stmt_info),
4367 STMT_VINFO_DR_STEP (stmt_info),
4368 STMT_VINFO_DR_BASE_ALIGNMENT (stmt_info),
4369 STMT_VINFO_DR_BASE_MISALIGNMENT (stmt_info),
4370 STMT_VINFO_DR_OFFSET_ALIGNMENT (stmt_info),
4371 STMT_VINFO_DR_STEP_ALIGNMENT (stmt_info));
4372 }
4373
4374 /* Set vectype for STMT. */
4375 scalar_type = TREE_TYPE (DR_REF (dr));
4376 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type);
4377 if (!vectype)
4378 {
4379 if (dump_enabled_p ())
4380 {
4381 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4382 "not vectorized: no vectype for stmt: %G",
4383 stmt_info->stmt);
4384 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
4385 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
4386 scalar_type);
4387 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
4388 }
4389
4390 if (is_a <bb_vec_info> (vinfo))
4391 {
4392 /* No vector type is fine, the ref can still participate
4393 in dependence analysis, we just can't vectorize it. */
4394 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4395 continue;
4396 }
4397 if (fatal)
4398 *fatal = false;
4399 return opt_result::failure_at (stmt_info->stmt,
4400 "not vectorized:"
4401 " no vectype for stmt: %G"
4402 " scalar_type: %T\n",
4403 stmt_info->stmt, scalar_type);
4404 }
4405 else
4406 {
4407 if (dump_enabled_p ())
4408 dump_printf_loc (MSG_NOTE, vect_location,
4409 "got vectype for stmt: %G%T\n",
4410 stmt_info->stmt, vectype);
4411 }
4412
4413 /* Adjust the minimal vectorization factor according to the
4414 vector type. */
4415 vf = TYPE_VECTOR_SUBPARTS (vectype);
4416 *min_vf = upper_bound (*min_vf, vf);
4417
4418 /* Leave the BB vectorizer to pick the vector type later, based on
4419 the final dataref group size and SLP node size. */
4420 if (is_a <loop_vec_info> (vinfo))
4421 STMT_VINFO_VECTYPE (stmt_info) = vectype;
4422
4423 if (gatherscatter != SG_NONE)
4424 {
4425 gather_scatter_info gs_info;
4426 if (!vect_check_gather_scatter (stmt_info,
4427 as_a <loop_vec_info> (vinfo),
4428 &gs_info)
4429 || !get_vectype_for_scalar_type (vinfo,
4430 TREE_TYPE (gs_info.offset)))
4431 {
4432 if (fatal)
4433 *fatal = false;
4434 return opt_result::failure_at
4435 (stmt_info->stmt,
4436 (gatherscatter == GATHER)
4437 ? "not vectorized: not suitable for gather load %G"
4438 : "not vectorized: not suitable for scatter store %G",
4439 stmt_info->stmt);
4440 }
4441 STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
4442 }
4443 }
4444
4445 /* We used to stop processing and prune the list here. Verify we no
4446 longer need to. */
4447 gcc_assert (i == datarefs.length ());
4448
4449 return opt_result::success ();
4450 }
4451
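/* Worked example (editorial, not part of GCC): when the outer i-loop of the
   nest below is vectorized, the inner access in[i + j] is re-analyzed
   relative to the i-loop via the *(BASE + INIT + OFFSET) reference built
   above; relative to the outer loop it behaves like a unit-stride access
   advancing by one short (2 bytes) per outer iteration.  */

int
example_outer_loop_dr (const short *in, int n, int m)
{
  int s = 0;
  for (int i = 0; i < n; i++)
    for (int j = 0; j < m; j++)
      s += in[i + j];
  return s;
}
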
4452
4453 /* Function vect_get_new_vect_var.
4454
4455 Returns a name for a new variable. The current naming scheme uses the
4456 prefix "vect", "stmp", "mask" or "vectp" (depending on the value of
4457 VAR_KIND), followed by "_" and NAME when NAME is provided, for the
4458 names of vectorizer-generated variables. */
4459
4460 tree
4461 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
4462 {
4463 const char *prefix;
4464 tree new_vect_var;
4465
4466 switch (var_kind)
4467 {
4468 case vect_simple_var:
4469 prefix = "vect";
4470 break;
4471 case vect_scalar_var:
4472 prefix = "stmp";
4473 break;
4474 case vect_mask_var:
4475 prefix = "mask";
4476 break;
4477 case vect_pointer_var:
4478 prefix = "vectp";
4479 break;
4480 default:
4481 gcc_unreachable ();
4482 }
4483
4484 if (name)
4485 {
4486 char* tmp = concat (prefix, "_", name, NULL);
4487 new_vect_var = create_tmp_reg (type, tmp);
4488 free (tmp);
4489 }
4490 else
4491 new_vect_var = create_tmp_reg (type, prefix);
4492
4493 return new_vect_var;
4494 }
4495
4496 /* Like vect_get_new_vect_var but return an SSA name. */
4497
4498 tree
4499 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
4500 {
4501 const char *prefix;
4502 tree new_vect_var;
4503
4504 switch (var_kind)
4505 {
4506 case vect_simple_var:
4507 prefix = "vect";
4508 break;
4509 case vect_scalar_var:
4510 prefix = "stmp";
4511 break;
4512 case vect_pointer_var:
4513 prefix = "vectp";
4514 break;
4515 default:
4516 gcc_unreachable ();
4517 }
4518
4519 if (name)
4520 {
4521 char* tmp = concat (prefix, "_", name, NULL);
4522 new_vect_var = make_temp_ssa_name (type, NULL, tmp);
4523 free (tmp);
4524 }
4525 else
4526 new_vect_var = make_temp_ssa_name (type, NULL, prefix);
4527
4528 return new_vect_var;
4529 }
4530
4531 /* Duplicate ptr info and set alignment/misalignment on NAME from DR_INFO. */
4532
4533 static void
4534 vect_duplicate_ssa_name_ptr_info (tree name, dr_vec_info *dr_info)
4535 {
4536 duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr_info->dr));
4537 int misalign = DR_MISALIGNMENT (dr_info);
4538 if (misalign == DR_MISALIGNMENT_UNKNOWN)
4539 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
4540 else
4541 set_ptr_info_alignment (SSA_NAME_PTR_INFO (name),
4542 known_alignment (DR_TARGET_ALIGNMENT (dr_info)),
4543 misalign);
4544 }
4545
4546 /* Function vect_create_addr_base_for_vector_ref.
4547
4548 Create an expression that computes the address of the first memory location
4549 that will be accessed for a data reference.
4550
4551 Input:
4552 STMT_INFO: The statement containing the data reference.
4553 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
4554 OFFSET: Optional. If supplied, it is added to the initial address.
4555 LOOP: Specify relative to which loop-nest the address should be computed.
4556 For example, when the dataref is in an inner-loop nested in an
4557 outer-loop that is now being vectorized, LOOP can be either the
4558 outer-loop, or the inner-loop. The first memory location accessed
4559 by the following dataref ('in' points to short):
4560
4561 for (i=0; i<N; i++)
4562 for (j=0; j<M; j++)
4563 s += in[i+j]
4564
4565 is as follows:
4566 if LOOP=i_loop: &in (relative to i_loop)
4567 if LOOP=j_loop: &in+i*2B (relative to j_loop)
4568 BYTE_OFFSET: Optional, defaulted to NULL. If supplied, it is added to the
4569 initial address. Unlike OFFSET, which is a number of elements to
4570 be added, BYTE_OFFSET is measured in bytes.
4571
4572 Output:
4573 1. Return an SSA_NAME whose value is the address of the memory location of
4574 the first vector of the data reference.
4575 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
4576 these statement(s) which define the returned SSA_NAME.
4577
4578 FORNOW: We are only handling array accesses with step 1. */
4579
4580 tree
4581 vect_create_addr_base_for_vector_ref (stmt_vec_info stmt_info,
4582 gimple_seq *new_stmt_list,
4583 tree offset,
4584 tree byte_offset)
4585 {
4586 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
4587 struct data_reference *dr = dr_info->dr;
4588 const char *base_name;
4589 tree addr_base;
4590 tree dest;
4591 gimple_seq seq = NULL;
4592 tree vect_ptr_type;
4593 tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
4594 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4595 innermost_loop_behavior *drb = vect_dr_behavior (dr_info);
4596
4597 tree data_ref_base = unshare_expr (drb->base_address);
4598 tree base_offset = unshare_expr (drb->offset);
4599 tree init = unshare_expr (drb->init);
4600
4601 if (loop_vinfo)
4602 base_name = get_name (data_ref_base);
4603 else
4604 {
4605 base_offset = ssize_int (0);
4606 init = ssize_int (0);
4607 base_name = get_name (DR_REF (dr));
4608 }
4609
4610 /* Create base_offset */
4611 base_offset = size_binop (PLUS_EXPR,
4612 fold_convert (sizetype, base_offset),
4613 fold_convert (sizetype, init));
4614
4615 if (offset)
4616 {
4617 offset = fold_build2 (MULT_EXPR, sizetype,
4618 fold_convert (sizetype, offset), step);
4619 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4620 base_offset, offset);
4621 }
4622 if (byte_offset)
4623 {
4624 byte_offset = fold_convert (sizetype, byte_offset);
4625 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4626 base_offset, byte_offset);
4627 }
4628
4629 /* base + base_offset */
4630 if (loop_vinfo)
4631 addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
4632 else
4633 {
4634 addr_base = build1 (ADDR_EXPR,
4635 build_pointer_type (TREE_TYPE (DR_REF (dr))),
4636 unshare_expr (DR_REF (dr)));
4637 }
4638
4639 vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
4640 dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
4641 addr_base = force_gimple_operand (addr_base, &seq, true, dest);
4642 gimple_seq_add_seq (new_stmt_list, seq);
4643
4644 if (DR_PTR_INFO (dr)
4645 && TREE_CODE (addr_base) == SSA_NAME
4646 && !SSA_NAME_PTR_INFO (addr_base))
4647 {
4648 vect_duplicate_ssa_name_ptr_info (addr_base, dr_info);
4649 if (offset || byte_offset)
4650 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
4651 }
4652
4653 if (dump_enabled_p ())
4654 dump_printf_loc (MSG_NOTE, vect_location, "created %T\n", addr_base);
4655
4656 return addr_base;
4657 }
4658
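/* Editorial sketch (not part of GCC): in byte terms the address built above
   is BASE_ADDRESS + (DRB_OFFSET + INIT) + OFFSET * ELT_SIZE + BYTE_OFFSET,
   with OFFSET counted in elements and BYTE_OFFSET in bytes.  A plain-pointer
   analogue with hypothetical parameter names:  */

static char *
example_addr_base (char *base_address, unsigned long drb_offset,
		   unsigned long init, unsigned long elt_offset,
		   unsigned long elt_size, unsigned long byte_offset)
{
  unsigned long total = drb_offset + init + elt_offset * elt_size
			+ byte_offset;
  return base_address + total;
}
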
4659
4660 /* Function vect_create_data_ref_ptr.
4661
4662 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
4663 location accessed in the loop by STMT_INFO, along with the def-use update
4664 chain to appropriately advance the pointer through the loop iterations.
4665 Also set aliasing information for the pointer. This pointer is used by
4666 the callers to this function to create a memory reference expression for
4667 vector load/store access.
4668
4669 Input:
4670 1. STMT_INFO: a stmt that references memory. Expected to be of the form
4671 GIMPLE_ASSIGN <name, data-ref> or
4672 GIMPLE_ASSIGN <data-ref, name>.
4673 2. AGGR_TYPE: the type of the reference, which should be either a vector
4674 or an array.
4675 3. AT_LOOP: the loop where the vector memref is to be created.
4676 4. OFFSET (optional): an offset to be added to the initial address accessed
4677 by the data-ref in STMT_INFO.
4678 5. BSI: location where the new stmts are to be placed if there is no loop
4679 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
4680 pointing to the initial address.
4681 7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
4682 to the initial address accessed by the data-ref in STMT_INFO. This is
4683 similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
4684 is counted in bytes.
4685 8. IV_STEP (optional, defaults to NULL): the amount that should be added
4686 to the IV during each iteration of the loop. NULL says to move
4687 by one copy of AGGR_TYPE up or down, depending on the step of the
4688 data reference.
4689
4690 Output:
4691 1. Declare a new ptr to vector_type, and have it point to the base of the
4692 data reference (initial address accessed by the data reference).
4693 For example, for vector of type V8HI, the following code is generated:
4694
4695 v8hi *ap;
4696 ap = (v8hi *)initial_address;
4697
4698 if OFFSET is not supplied:
4699 initial_address = &a[init];
4700 if OFFSET is supplied:
4701 initial_address = &a[init + OFFSET];
4702 if BYTE_OFFSET is supplied:
4703 initial_address = &a[init] + BYTE_OFFSET;
4704
4705 Return the initial_address in INITIAL_ADDRESS.
4706
4707 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
4708 update the pointer in each iteration of the loop.
4709
4710 Return the increment stmt that updates the pointer in PTR_INCR.
4711
4712 3. Return the pointer. */
4713
4714 tree
4715 vect_create_data_ref_ptr (stmt_vec_info stmt_info, tree aggr_type,
4716 class loop *at_loop, tree offset,
4717 tree *initial_address, gimple_stmt_iterator *gsi,
4718 gimple **ptr_incr, bool only_init,
4719 tree byte_offset, tree iv_step)
4720 {
4721 const char *base_name;
4722 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4723 class loop *loop = NULL;
4724 bool nested_in_vect_loop = false;
4725 class loop *containing_loop = NULL;
4726 tree aggr_ptr_type;
4727 tree aggr_ptr;
4728 tree new_temp;
4729 gimple_seq new_stmt_list = NULL;
4730 edge pe = NULL;
4731 basic_block new_bb;
4732 tree aggr_ptr_init;
4733 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
4734 struct data_reference *dr = dr_info->dr;
4735 tree aptr;
4736 gimple_stmt_iterator incr_gsi;
4737 bool insert_after;
4738 tree indx_before_incr, indx_after_incr;
4739 gimple *incr;
4740 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
4741
4742 gcc_assert (iv_step != NULL_TREE
4743 || TREE_CODE (aggr_type) == ARRAY_TYPE
4744 || TREE_CODE (aggr_type) == VECTOR_TYPE);
4745
4746 if (loop_vinfo)
4747 {
4748 loop = LOOP_VINFO_LOOP (loop_vinfo);
4749 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info);
4750 containing_loop = (gimple_bb (stmt_info->stmt))->loop_father;
4751 pe = loop_preheader_edge (loop);
4752 }
4753 else
4754 {
4755 gcc_assert (bb_vinfo);
4756 only_init = true;
4757 *ptr_incr = NULL;
4758 }
4759
4760 /* Create an expression for the first address accessed by this load
4761 in LOOP. */
4762 base_name = get_name (DR_BASE_ADDRESS (dr));
4763
4764 if (dump_enabled_p ())
4765 {
4766 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4767 dump_printf_loc (MSG_NOTE, vect_location,
4768 "create %s-pointer variable to type: %T",
4769 get_tree_code_name (TREE_CODE (aggr_type)),
4770 aggr_type);
4771 if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4772 dump_printf (MSG_NOTE, " vectorizing an array ref: ");
4773 else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4774 dump_printf (MSG_NOTE, " vectorizing a vector ref: ");
4775 else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4776 dump_printf (MSG_NOTE, " vectorizing a record based array ref: ");
4777 else
4778 dump_printf (MSG_NOTE, " vectorizing a pointer ref: ");
4779 dump_printf (MSG_NOTE, "%T\n", DR_BASE_OBJECT (dr));
4780 }
4781
4782 /* (1) Create the new aggregate-pointer variable.
4783 Vector and array types inherit the alias set of their component
4784 type by default so we need to use a ref-all pointer if the data
4785 reference does not conflict with the created aggregated data
4786 reference because it is not addressable. */
4787 bool need_ref_all = false;
4788 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4789 get_alias_set (DR_REF (dr))))
4790 need_ref_all = true;
4791 /* Likewise for any of the data references in the stmt group. */
4792 else if (DR_GROUP_SIZE (stmt_info) > 1)
4793 {
4794 stmt_vec_info sinfo = DR_GROUP_FIRST_ELEMENT (stmt_info);
4795 do
4796 {
4797 struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
4798 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4799 get_alias_set (DR_REF (sdr))))
4800 {
4801 need_ref_all = true;
4802 break;
4803 }
4804 sinfo = DR_GROUP_NEXT_ELEMENT (sinfo);
4805 }
4806 while (sinfo);
4807 }
4808 aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
4809 need_ref_all);
4810 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
4811
4812
4813 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4814 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4815 def-use update cycles for the pointer: one relative to the outer-loop
4816 (LOOP), which is what step (3) below does. The other is relative
4817 to the inner-loop (which is the inner-most loop containing the dataref),
4818 and this is done by step (4) below.
4819
4820 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4821 inner-most loop, and so step (3) works the same, and step (4) is
4822 redundant. Step (3) creates the following:
4823
4824 vp0 = &base_addr;
4825 LOOP: vp1 = phi(vp0,vp2)
4826 ...
4827 ...
4828 vp2 = vp1 + step
4829 goto LOOP
4830
4831 If there is an inner-loop nested in loop, then step (4) will also be
4832 applied, and an additional update in the inner-loop will be created:
4833
4834 vp0 = &base_addr;
4835 LOOP: vp1 = phi(vp0,vp2)
4836 ...
4837 inner: vp3 = phi(vp1,vp4)
4838 vp4 = vp3 + inner_step
4839 if () goto inner
4840 ...
4841 vp2 = vp1 + step
4842 if () goto LOOP */
4843
4844 /* (2) Calculate the initial address of the aggregate-pointer, and set
4845 the aggregate-pointer to point to it before the loop. */
4846
4847 /* Create &(base[init_val+offset]) + byte_offset in the loop preheader. */
4848
4849 new_temp = vect_create_addr_base_for_vector_ref (stmt_info, &new_stmt_list,
4850 offset, byte_offset);
4851 if (new_stmt_list)
4852 {
4853 if (pe)
4854 {
4855 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
4856 gcc_assert (!new_bb);
4857 }
4858 else
4859 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
4860 }
4861
4862 *initial_address = new_temp;
4863 aggr_ptr_init = new_temp;
4864
4865 /* (3) Handle the updating of the aggregate-pointer inside the loop.
4866 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4867 inner-loop nested in LOOP (during outer-loop vectorization). */
4868
4869 /* No update in loop is required. */
4870 if (only_init && (!loop_vinfo || at_loop == loop))
4871 aptr = aggr_ptr_init;
4872 else
4873 {
4874 /* Accesses to invariant addresses should be handled specially
4875 by the caller. */
4876 tree step = vect_dr_behavior (dr_info)->step;
4877 gcc_assert (!integer_zerop (step));
4878
4879 if (iv_step == NULL_TREE)
4880 {
4881 /* The step of the aggregate pointer is the type size,
4882 negated for downward accesses. */
4883 iv_step = TYPE_SIZE_UNIT (aggr_type);
4884 if (tree_int_cst_sgn (step) == -1)
4885 iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
4886 }
4887
4888 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4889
4890 create_iv (aggr_ptr_init,
4891 fold_convert (aggr_ptr_type, iv_step),
4892 aggr_ptr, loop, &incr_gsi, insert_after,
4893 &indx_before_incr, &indx_after_incr);
4894 incr = gsi_stmt (incr_gsi);
4895 loop_vinfo->add_stmt (incr);
4896
4897 /* Copy the points-to information if it exists. */
4898 if (DR_PTR_INFO (dr))
4899 {
4900 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr_info);
4901 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr_info);
4902 }
4903 if (ptr_incr)
4904 *ptr_incr = incr;
4905
4906 aptr = indx_before_incr;
4907 }
4908
4909 if (!nested_in_vect_loop || only_init)
4910 return aptr;
4911
4912
4913 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4914 nested in LOOP, if it exists. */
4915
4916 gcc_assert (nested_in_vect_loop);
4917 if (!only_init)
4918 {
4919 standard_iv_increment_position (containing_loop, &incr_gsi,
4920 &insert_after);
4921 create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
4922 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
4923 &indx_after_incr);
4924 incr = gsi_stmt (incr_gsi);
4925 loop_vinfo->add_stmt (incr);
4926
4927 /* Copy the points-to information if it exists. */
4928 if (DR_PTR_INFO (dr))
4929 {
4930 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr_info);
4931 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr_info);
4932 }
4933 if (ptr_incr)
4934 *ptr_incr = incr;
4935
4936 return indx_before_incr;
4937 }
4938 else
4939 gcc_unreachable ();
4940 }
4941
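/* Editorial sketch (not part of GCC): the IV created in step (3) behaves
   like the hand-written pointer walk below, advancing by one AGGR_TYPE per
   iteration (or by IV_STEP when the caller supplies one).  The GCC
   vector_size attribute provides a stand-in for a V4SI vector type.  */

typedef int example_v4si __attribute__ ((vector_size (16)));

static void
example_data_ref_ptr_walk (example_v4si *initial_address,
			   example_v4si *out, long vec_iters)
{
  example_v4si *ap = initial_address;	/* vp0 = &base_addr  */
  for (long i = 0; i < vec_iters; i++)	/* vp1 = phi (vp0, vp2)  */
    {
      out[i] = *ap;
      ap = ap + 1;			/* vp2 = vp1 + step  */
    }
}
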
4942
4943 /* Function bump_vector_ptr
4944
4945 Increment a pointer (to a vector type) by vector-size. If requested,
4946 i.e. if PTR-INCR is given, then also connect the new increment stmt
4947 to the existing def-use update-chain of the pointer, by modifying
4948 the PTR_INCR as illustrated below:
4949
4950 The pointer def-use update-chain before this function:
4951 DATAREF_PTR = phi (p_0, p_2)
4952 ....
4953 PTR_INCR: p_2 = DATAREF_PTR + step
4954
4955 The pointer def-use update-chain after this function:
4956 DATAREF_PTR = phi (p_0, p_2)
4957 ....
4958 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4959 ....
4960 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
4961
4962 Input:
4963 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4964 in the loop.
4965 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4966 the loop. The increment amount across iterations is expected
4967 to be vector_size.
4968 BSI - location where the new update stmt is to be placed.
4969 STMT_INFO - the original scalar memory-access stmt that is being vectorized.
4970 BUMP - optional. The offset by which to bump the pointer. If not given,
4971 the offset is assumed to be vector_size.
4972
4973 Output: Return NEW_DATAREF_PTR as illustrated above.
4974
4975 */
4976
4977 tree
4978 bump_vector_ptr (tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
4979 stmt_vec_info stmt_info, tree bump)
4980 {
4981 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4982 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4983 tree update = TYPE_SIZE_UNIT (vectype);
4984 gassign *incr_stmt;
4985 ssa_op_iter iter;
4986 use_operand_p use_p;
4987 tree new_dataref_ptr;
4988
4989 if (bump)
4990 update = bump;
4991
4992 if (TREE_CODE (dataref_ptr) == SSA_NAME)
4993 new_dataref_ptr = copy_ssa_name (dataref_ptr);
4994 else
4995 new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
4996 incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
4997 dataref_ptr, update);
4998 vect_finish_stmt_generation (stmt_info, incr_stmt, gsi);
4999
5000 /* Copy the points-to information if it exists. */
5001 if (DR_PTR_INFO (dr))
5002 {
5003 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
5004 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
5005 }
5006
5007 if (!ptr_incr)
5008 return new_dataref_ptr;
5009
5010 /* Update the vector-pointer's cross-iteration increment. */
5011 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
5012 {
5013 tree use = USE_FROM_PTR (use_p);
5014
5015 if (use == dataref_ptr)
5016 SET_USE (use_p, new_dataref_ptr);
5017 else
5018 gcc_assert (operand_equal_p (use, update, 0));
5019 }
5020
5021 return new_dataref_ptr;
5022 }
5023
5024
5025 /* Copy memory reference info such as base/clique from the SRC reference
5026 to the DEST MEM_REF. */
5027
5028 void
5029 vect_copy_ref_info (tree dest, tree src)
5030 {
5031 if (TREE_CODE (dest) != MEM_REF)
5032 return;
5033
5034 tree src_base = src;
5035 while (handled_component_p (src_base))
5036 src_base = TREE_OPERAND (src_base, 0);
5037 if (TREE_CODE (src_base) != MEM_REF
5038 && TREE_CODE (src_base) != TARGET_MEM_REF)
5039 return;
5040
5041 MR_DEPENDENCE_CLIQUE (dest) = MR_DEPENDENCE_CLIQUE (src_base);
5042 MR_DEPENDENCE_BASE (dest) = MR_DEPENDENCE_BASE (src_base);
5043 }
5044
5045
5046 /* Function vect_create_destination_var.
5047
5048 Create a new temporary of type VECTYPE. */
5049
5050 tree
5051 vect_create_destination_var (tree scalar_dest, tree vectype)
5052 {
5053 tree vec_dest;
5054 const char *name;
5055 char *new_name;
5056 tree type;
5057 enum vect_var_kind kind;
5058
5059 kind = vectype
5060 ? VECTOR_BOOLEAN_TYPE_P (vectype)
5061 ? vect_mask_var
5062 : vect_simple_var
5063 : vect_scalar_var;
5064 type = vectype ? vectype : TREE_TYPE (scalar_dest);
5065
5066 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
5067
5068 name = get_name (scalar_dest);
5069 if (name)
5070 new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
5071 else
5072 new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
5073 vec_dest = vect_get_new_vect_var (type, kind, new_name);
5074 free (new_name);
5075
5076 return vec_dest;
5077 }
5078
5079 /* Function vect_grouped_store_supported.
5080
5081 Returns TRUE if interleave high and interleave low permutations
5082 are supported, and FALSE otherwise. */
5083
5084 bool
5085 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
5086 {
5087 machine_mode mode = TYPE_MODE (vectype);
5088
5089 /* vect_permute_store_chain requires the group size to be equal to 3 or
5090 be a power of two. */
5091 if (count != 3 && exact_log2 (count) == -1)
5092 {
5093 if (dump_enabled_p ())
5094 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5095 "the size of the group of accesses"
5096 " is not a power of 2 or not equal to 3\n");
5097 return false;
5098 }
5099
5100 /* Check that the permutation is supported. */
5101 if (VECTOR_MODE_P (mode))
5102 {
5103 unsigned int i;
5104 if (count == 3)
5105 {
5106 unsigned int j0 = 0, j1 = 0, j2 = 0;
5107 unsigned int i, j;
5108
5109 unsigned int nelt;
5110 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5111 {
5112 if (dump_enabled_p ())
5113 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5114 "cannot handle groups of 3 stores for"
5115 " variable-length vectors\n");
5116 return false;
5117 }
5118
5119 vec_perm_builder sel (nelt, nelt, 1);
5120 sel.quick_grow (nelt);
5121 vec_perm_indices indices;
5122 for (j = 0; j < 3; j++)
5123 {
5124 int nelt0 = ((3 - j) * nelt) % 3;
5125 int nelt1 = ((3 - j) * nelt + 1) % 3;
5126 int nelt2 = ((3 - j) * nelt + 2) % 3;
5127 for (i = 0; i < nelt; i++)
5128 {
5129 if (3 * i + nelt0 < nelt)
5130 sel[3 * i + nelt0] = j0++;
5131 if (3 * i + nelt1 < nelt)
5132 sel[3 * i + nelt1] = nelt + j1++;
5133 if (3 * i + nelt2 < nelt)
5134 sel[3 * i + nelt2] = 0;
5135 }
5136 indices.new_vector (sel, 2, nelt);
5137 if (!can_vec_perm_const_p (mode, indices))
5138 {
5139 if (dump_enabled_p ())
5140 dump_printf (MSG_MISSED_OPTIMIZATION,
5141 "permutation op not supported by target.\n");
5142 return false;
5143 }
5144
5145 for (i = 0; i < nelt; i++)
5146 {
5147 if (3 * i + nelt0 < nelt)
5148 sel[3 * i + nelt0] = 3 * i + nelt0;
5149 if (3 * i + nelt1 < nelt)
5150 sel[3 * i + nelt1] = 3 * i + nelt1;
5151 if (3 * i + nelt2 < nelt)
5152 sel[3 * i + nelt2] = nelt + j2++;
5153 }
5154 indices.new_vector (sel, 2, nelt);
5155 if (!can_vec_perm_const_p (mode, indices))
5156 {
5157 if (dump_enabled_p ())
5158 dump_printf (MSG_MISSED_OPTIMIZATION,
5159 "permutation op not supported by target.\n");
5160 return false;
5161 }
5162 }
5163 return true;
5164 }
5165 else
5166 {
5167 /* If length is not equal to 3 then only power of 2 is supported. */
5168 gcc_assert (pow2p_hwi (count));
5169 poly_uint64 nelt = GET_MODE_NUNITS (mode);
5170
5171 /* The encoding has 2 interleaved stepped patterns. */
5172 vec_perm_builder sel (nelt, 2, 3);
5173 sel.quick_grow (6);
5174 for (i = 0; i < 3; i++)
5175 {
5176 sel[i * 2] = i;
5177 sel[i * 2 + 1] = i + nelt;
5178 }
5179 vec_perm_indices indices (sel, 2, nelt);
5180 if (can_vec_perm_const_p (mode, indices))
5181 {
5182 for (i = 0; i < 6; i++)
5183 sel[i] += exact_div (nelt, 2);
5184 indices.new_vector (sel, 2, nelt);
5185 if (can_vec_perm_const_p (mode, indices))
5186 return true;
5187 }
5188 }
5189 }
5190
5191 if (dump_enabled_p ())
5192 dump_printf (MSG_MISSED_OPTIMIZATION,
5193 "permutation op not supported by target.\n");
5194 return false;
5195 }
5196
5197
5198 /* Return TRUE if vec_{mask_}store_lanes is available for COUNT vectors of
5199 type VECTYPE. MASKED_P says whether the masked form is needed. */
5200
5201 bool
5202 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5203 bool masked_p)
5204 {
5205 if (masked_p)
5206 return vect_lanes_optab_supported_p ("vec_mask_store_lanes",
5207 vec_mask_store_lanes_optab,
5208 vectype, count);
5209 else
5210 return vect_lanes_optab_supported_p ("vec_store_lanes",
5211 vec_store_lanes_optab,
5212 vectype, count);
5213 }
5214
5215
5216 /* Function vect_permute_store_chain.
5217
5218 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
5219 a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
5220 the data correctly for the stores. Return the final references for stores
5221 in RESULT_CHAIN.
5222
5223 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5224 The input is 4 vectors each containing 8 elements. We assign a number to
5225 each element, the input sequence is:
5226
5227 1st vec: 0 1 2 3 4 5 6 7
5228 2nd vec: 8 9 10 11 12 13 14 15
5229 3rd vec: 16 17 18 19 20 21 22 23
5230 4th vec: 24 25 26 27 28 29 30 31
5231
5232 The output sequence should be:
5233
5234 1st vec: 0 8 16 24 1 9 17 25
5235 2nd vec: 2 10 18 26 3 11 19 27
5236 3rd vec: 4 12 20 28 5 13 21 29
5237 4th vec: 6 14 22 30 7 15 23 31
5238
5239 i.e., we interleave the contents of the four vectors in their order.
5240
5241 We use interleave_high/low instructions to create such output. The input of
5242 each interleave_high/low operation is two vectors:
5243 1st vec 2nd vec
5244 0 1 2 3 4 5 6 7
5245 the even elements of the result vector are obtained left-to-right from the
5246 high/low elements of the first vector. The odd elements of the result are
5247 obtained left-to-right from the high/low elements of the second vector.
5248 The output of interleave_high will be: 0 4 1 5
5249 and of interleave_low: 2 6 3 7
5250
5251
5252 The permutation is done in log LENGTH stages. In each stage interleave_high
5253 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
5254 where the first argument is taken from the first half of DR_CHAIN and the
5255 second argument from its second half.
5256 In our example,
5257
5258 I1: interleave_high (1st vec, 3rd vec)
5259 I2: interleave_low (1st vec, 3rd vec)
5260 I3: interleave_high (2nd vec, 4th vec)
5261 I4: interleave_low (2nd vec, 4th vec)
5262
5263 The output for the first stage is:
5264
5265 I1: 0 16 1 17 2 18 3 19
5266 I2: 4 20 5 21 6 22 7 23
5267 I3: 8 24 9 25 10 26 11 27
5268 I4: 12 28 13 29 14 30 15 31
5269
5270 The output of the second stage, i.e. the final result is:
5271
5272 I1: 0 8 16 24 1 9 17 25
5273 I2: 2 10 18 26 3 11 19 27
5274 I3: 4 12 20 28 5 13 21 29
5275 I4: 6 14 22 30 7 15 23 31. */
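/* As a concrete illustration (a sketch assuming an 8-element vector mode
   such as V8HI), the two permutation masks built by the power-of-two
   path below are

     interleave_high: { 0, 8, 1, 9, 2, 10, 3, 11 }
     interleave_low:  { 4, 12, 5, 13, 6, 14, 7, 15 }

   i.e. the high mask interleaves the first halves of the two input
   vectors and the low mask interleaves their second halves.  */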
5276
5277 void
5278 vect_permute_store_chain (vec<tree> dr_chain,
5279 unsigned int length,
5280 stmt_vec_info stmt_info,
5281 gimple_stmt_iterator *gsi,
5282 vec<tree> *result_chain)
5283 {
5284 tree vect1, vect2, high, low;
5285 gimple *perm_stmt;
5286 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5287 tree perm_mask_low, perm_mask_high;
5288 tree data_ref;
5289 tree perm3_mask_low, perm3_mask_high;
5290 unsigned int i, j, n, log_length = exact_log2 (length);
5291
5292 result_chain->quick_grow (length);
5293 memcpy (result_chain->address (), dr_chain.address (),
5294 length * sizeof (tree));
5295
5296 if (length == 3)
5297 {
5298 /* vect_grouped_store_supported ensures that this is constant. */
5299 unsigned int nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5300 unsigned int j0 = 0, j1 = 0, j2 = 0;
5301
5302 vec_perm_builder sel (nelt, nelt, 1);
5303 sel.quick_grow (nelt);
5304 vec_perm_indices indices;
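      /* As an illustration (assuming nelt == 8), the masks built for
	 j == 0 are

	   perm3_mask_low:  { 0, 8, _, 1, 9, _, 2, 10 }  (_ = don't care, 0)
	   perm3_mask_high: { 0, 1, 8, 3, 4, 9, 6, 7 }

	 so the low permutation interleaves dr_chain[0] and dr_chain[1],
	 leaving every third slot as a don't-care, and the high
	 permutation keeps that result and fills the don't-care slots
	 from dr_chain[2].  */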
5305 for (j = 0; j < 3; j++)
5306 {
5307 int nelt0 = ((3 - j) * nelt) % 3;
5308 int nelt1 = ((3 - j) * nelt + 1) % 3;
5309 int nelt2 = ((3 - j) * nelt + 2) % 3;
5310
5311 for (i = 0; i < nelt; i++)
5312 {
5313 if (3 * i + nelt0 < nelt)
5314 sel[3 * i + nelt0] = j0++;
5315 if (3 * i + nelt1 < nelt)
5316 sel[3 * i + nelt1] = nelt + j1++;
5317 if (3 * i + nelt2 < nelt)
5318 sel[3 * i + nelt2] = 0;
5319 }
5320 indices.new_vector (sel, 2, nelt);
5321 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5322
5323 for (i = 0; i < nelt; i++)
5324 {
5325 if (3 * i + nelt0 < nelt)
5326 sel[3 * i + nelt0] = 3 * i + nelt0;
5327 if (3 * i + nelt1 < nelt)
5328 sel[3 * i + nelt1] = 3 * i + nelt1;
5329 if (3 * i + nelt2 < nelt)
5330 sel[3 * i + nelt2] = nelt + j2++;
5331 }
5332 indices.new_vector (sel, 2, nelt);
5333 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5334
5335 vect1 = dr_chain[0];
5336 vect2 = dr_chain[1];
5337
5338 /* Create interleaving stmt:
5339 low = VEC_PERM_EXPR <vect1, vect2,
5340 {j, nelt, *, j + 1, nelt + j + 1, *,
5341 j + 2, nelt + j + 2, *, ...}> */
5342 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5343 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5344 vect2, perm3_mask_low);
5345 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5346
5347 vect1 = data_ref;
5348 vect2 = dr_chain[2];
5349 /* Create interleaving stmt:
5350 low = VEC_PERM_EXPR <vect1, vect2,
5351 {0, 1, nelt + j, 3, 4, nelt + j + 1,
5352 6, 7, nelt + j + 2, ...}> */
5353 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5354 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5355 vect2, perm3_mask_high);
5356 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5357 (*result_chain)[j] = data_ref;
5358 }
5359 }
5360 else
5361 {
5362 /* If length is not equal to 3 then only power of 2 is supported. */
5363 gcc_assert (pow2p_hwi (length));
5364
5365 /* The encoding has 2 interleaved stepped patterns. */
5366 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5367 vec_perm_builder sel (nelt, 2, 3);
5368 sel.quick_grow (6);
5369 for (i = 0; i < 3; i++)
5370 {
5371 sel[i * 2] = i;
5372 sel[i * 2 + 1] = i + nelt;
5373 }
5374 vec_perm_indices indices (sel, 2, nelt);
5375 perm_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5376
5377 for (i = 0; i < 6; i++)
5378 sel[i] += exact_div (nelt, 2);
5379 indices.new_vector (sel, 2, nelt);
5380 perm_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5381
5382 for (i = 0, n = log_length; i < n; i++)
5383 {
5384 for (j = 0; j < length/2; j++)
5385 {
5386 vect1 = dr_chain[j];
5387 vect2 = dr_chain[j+length/2];
5388
5389 /* Create interleaving stmt:
5390 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
5391 ...}> */
5392 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
5393 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
5394 vect2, perm_mask_high);
5395 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5396 (*result_chain)[2*j] = high;
5397
5398 /* Create interleaving stmt:
5399 low = VEC_PERM_EXPR <vect1, vect2,
5400 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
5401 ...}> */
5402 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
5403 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
5404 vect2, perm_mask_low);
5405 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5406 (*result_chain)[2*j+1] = low;
5407 }
5408 memcpy (dr_chain.address (), result_chain->address (),
5409 length * sizeof (tree));
5410 }
5411 }
5412 }
5413
5414 /* Function vect_setup_realignment
5415
5416 This function is called when vectorizing an unaligned load using
5417 the dr_explicit_realign[_optimized] scheme.
5418 This function generates the following code at the loop prolog:
5419
5420 p = initial_addr;
5421 x msq_init = *(floor(p)); # prolog load
5422 realignment_token = call target_builtin;
5423 loop:
5424 x msq = phi (msq_init, ---)
5425
5426 The stmts marked with x are generated only for the case of
5427 dr_explicit_realign_optimized.
5428
5429 The code above sets up a new (vector) pointer, pointing to the first
5430 location accessed by STMT_INFO, and a "floor-aligned" load using that
5431 pointer. It also generates code to compute the "realignment-token"
5432 (if the relevant target hook was defined), and creates a phi-node at the
5433 loop-header bb whose arguments are the result of the prolog-load (created
5434 by this function) and the result of a load that takes place in the loop
5435 (to be created by the caller to this function).
5436
5437 For the case of dr_explicit_realign_optimized:
5438 The caller to this function uses the phi-result (msq) to create the
5439 realignment code inside the loop, and sets up the missing phi argument,
5440 as follows:
5441 loop:
5442 msq = phi (msq_init, lsq)
5443 lsq = *(floor(p')); # load in loop
5444 result = realign_load (msq, lsq, realignment_token);
5445
5446 For the case of dr_explicit_realign:
5447 loop:
5448 msq = *(floor(p)); # load in loop
5449 p' = p + (VS-1);
5450 lsq = *(floor(p')); # load in loop
5451 result = realign_load (msq, lsq, realignment_token);
5452
5453 Input:
5454 STMT_INFO - (scalar) load stmt to be vectorized. This load accesses
5455 a memory location that may be unaligned.
5456 GSI - place where new code is to be inserted.
5457 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
5458 is used.
5459
5460 Output:
5461 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
5462 target hook, if defined.
5463 Return value - the result of the loop-header phi node. */
5464
5465 tree
5466 vect_setup_realignment (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
5467 tree *realignment_token,
5468 enum dr_alignment_support alignment_support_scheme,
5469 tree init_addr,
5470 class loop **at_loop)
5471 {
5472 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5473 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5474 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
5475 struct data_reference *dr = dr_info->dr;
5476 class loop *loop = NULL;
5477 edge pe = NULL;
5478 tree scalar_dest = gimple_assign_lhs (stmt_info->stmt);
5479 tree vec_dest;
5480 gimple *inc;
5481 tree ptr;
5482 tree data_ref;
5483 basic_block new_bb;
5484 tree msq_init = NULL_TREE;
5485 tree new_temp;
5486 gphi *phi_stmt;
5487 tree msq = NULL_TREE;
5488 gimple_seq stmts = NULL;
5489 bool compute_in_loop = false;
5490 bool nested_in_vect_loop = false;
5491 class loop *containing_loop = (gimple_bb (stmt_info->stmt))->loop_father;
5492 class loop *loop_for_initial_load = NULL;
5493
5494 if (loop_vinfo)
5495 {
5496 loop = LOOP_VINFO_LOOP (loop_vinfo);
5497 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info);
5498 }
5499
5500 gcc_assert (alignment_support_scheme == dr_explicit_realign
5501 || alignment_support_scheme == dr_explicit_realign_optimized);
5502
5503 /* We need to generate three things:
5504 1. the misalignment computation
5505 2. the extra vector load (for the optimized realignment scheme).
5506 3. the phi node for the two vectors from which the realignment is
5507 done (for the optimized realignment scheme). */
5508
5509 /* 1. Determine where to generate the misalignment computation.
5510
5511 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
5512 calculation will be generated by this function, outside the loop (in the
5513 preheader). Otherwise, INIT_ADDR had already been computed for us by the
5514 caller, inside the loop.
5515
5516 Background: If the misalignment remains fixed throughout the iterations of
5517 the loop, then both realignment schemes are applicable, and also the
5518 misalignment computation can be done outside LOOP. This is because we are
5519 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
5520 are a multiple of VS (the Vector Size), and therefore the misalignment in
5521 different vectorized LOOP iterations is always the same.
5522 The problem arises only if the memory access is in an inner-loop nested
5523 inside LOOP, which is now being vectorized using outer-loop vectorization.
5524 This is the only case when the misalignment of the memory access may not
5525 remain fixed throughout the iterations of the inner-loop (as explained in
5526 detail in vect_supportable_dr_alignment). In this case, not only is the
5527 optimized realignment scheme not applicable, but also the misalignment
5528 computation (and generation of the realignment token that is passed to
5529 REALIGN_LOAD) have to be done inside the loop.
5530
5531 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
5532 or not, which in turn determines if the misalignment is computed inside
5533 the inner-loop, or outside LOOP. */
5534
5535 if (init_addr != NULL_TREE || !loop_vinfo)
5536 {
5537 compute_in_loop = true;
5538 gcc_assert (alignment_support_scheme == dr_explicit_realign);
5539 }
5540
5541
5542 /* 2. Determine where to generate the extra vector load.
5543
5544 For the optimized realignment scheme, instead of generating two vector
5545 loads in each iteration, we generate a single extra vector load in the
5546 preheader of the loop, and in each iteration reuse the result of the
5547 vector load from the previous iteration. In case the memory access is in
5548 an inner-loop nested inside LOOP, which is now being vectorized using
5549 outer-loop vectorization, we need to determine whether this initial vector
5550 load should be generated at the preheader of the inner-loop, or can be
5551 generated at the preheader of LOOP. If the memory access has no evolution
5552 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
5553 to be generated inside LOOP (in the preheader of the inner-loop). */
5554
5555 if (nested_in_vect_loop)
5556 {
5557 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
5558 bool invariant_in_outerloop =
5559 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
5560 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
5561 }
5562 else
5563 loop_for_initial_load = loop;
5564 if (at_loop)
5565 *at_loop = loop_for_initial_load;
5566
5567 if (loop_for_initial_load)
5568 pe = loop_preheader_edge (loop_for_initial_load);
5569
5570 /* 3. For the case of the optimized realignment, create the first vector
5571 load at the loop preheader. */
5572
5573 if (alignment_support_scheme == dr_explicit_realign_optimized)
5574 {
5575 /* Create msq_init = *(floor(p1)) in the loop preheader */
5576 gassign *new_stmt;
5577
5578 gcc_assert (!compute_in_loop);
5579 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5580 ptr = vect_create_data_ref_ptr (stmt_info, vectype,
5581 loop_for_initial_load, NULL_TREE,
5582 &init_addr, NULL, &inc, true);
5583 if (TREE_CODE (ptr) == SSA_NAME)
5584 new_temp = copy_ssa_name (ptr);
5585 else
5586 new_temp = make_ssa_name (TREE_TYPE (ptr));
5587 poly_uint64 align = DR_TARGET_ALIGNMENT (dr_info);
5588 tree type = TREE_TYPE (ptr);
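      /* The following statement masks the pointer down to the target
	 alignment boundary, i.e. it computes floor (ptr) as ptr & -ALIGN
	 (this relies on ALIGN being a power of two).  For instance, with
	 a 16-byte target alignment the mask 0 - 16 is ...fffffff0.  */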
5589 new_stmt = gimple_build_assign
5590 (new_temp, BIT_AND_EXPR, ptr,
5591 fold_build2 (MINUS_EXPR, type,
5592 build_int_cst (type, 0),
5593 build_int_cst (type, align)));
5594 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5595 gcc_assert (!new_bb);
5596 data_ref
5597 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
5598 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
5599 vect_copy_ref_info (data_ref, DR_REF (dr));
5600 new_stmt = gimple_build_assign (vec_dest, data_ref);
5601 new_temp = make_ssa_name (vec_dest, new_stmt);
5602 gimple_assign_set_lhs (new_stmt, new_temp);
5603 if (pe)
5604 {
5605 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5606 gcc_assert (!new_bb);
5607 }
5608 else
5609 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5610
5611 msq_init = gimple_assign_lhs (new_stmt);
5612 }
5613
5614 /* 4. Create realignment token using a target builtin, if available.
5615 It is done either inside the containing loop, or before LOOP (as
5616 determined above). */
5617
5618 if (targetm.vectorize.builtin_mask_for_load)
5619 {
5620 gcall *new_stmt;
5621 tree builtin_decl;
5622
5623 /* Compute INIT_ADDR - the initial address accessed by this memref. */
5624 if (!init_addr)
5625 {
5626 /* Generate the INIT_ADDR computation outside LOOP. */
5627 init_addr = vect_create_addr_base_for_vector_ref (stmt_info, &stmts,
5628 NULL_TREE);
5629 if (loop)
5630 {
5631 pe = loop_preheader_edge (loop);
5632 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
5633 gcc_assert (!new_bb);
5634 }
5635 else
5636 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
5637 }
5638
5639 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
5640 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
5641 vec_dest =
5642 vect_create_destination_var (scalar_dest,
5643 gimple_call_return_type (new_stmt));
5644 new_temp = make_ssa_name (vec_dest, new_stmt);
5645 gimple_call_set_lhs (new_stmt, new_temp);
5646
5647 if (compute_in_loop)
5648 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5649 else
5650 {
5651 /* Generate the misalignment computation outside LOOP. */
5652 pe = loop_preheader_edge (loop);
5653 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5654 gcc_assert (!new_bb);
5655 }
5656
5657 *realignment_token = gimple_call_lhs (new_stmt);
5658
5659 /* The result of the CALL_EXPR to this builtin is determined from
5660 the value of the parameter and no global variables are touched
5661 which makes the builtin a "const" function. Requiring the
5662 builtin to have the "const" attribute makes it unnecessary
5663 to call mark_call_clobbered. */
5664 gcc_assert (TREE_READONLY (builtin_decl));
5665 }
5666
5667 if (alignment_support_scheme == dr_explicit_realign)
5668 return msq;
5669
5670 gcc_assert (!compute_in_loop);
5671 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
5672
5673
5674 /* 5. Create msq = phi <msq_init, lsq> in loop */
5675
5676 pe = loop_preheader_edge (containing_loop);
5677 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5678 msq = make_ssa_name (vec_dest);
5679 phi_stmt = create_phi_node (msq, containing_loop->header);
5680 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
5681
5682 return msq;
5683 }
5684
5685
5686 /* Function vect_grouped_load_supported.
5687
5688 COUNT is the size of the load group (the number of statements plus the
5689 number of gaps). SINGLE_ELEMENT_P is true if there is actually
5690 only one statement, with a gap of COUNT - 1.
5691
5692 Returns true if a suitable permute exists. */
5693
5694 bool
5695 vect_grouped_load_supported (tree vectype, bool single_element_p,
5696 unsigned HOST_WIDE_INT count)
5697 {
5698 machine_mode mode = TYPE_MODE (vectype);
5699
5700 /* If this is single-element interleaving with an element distance
5701 that leaves unused vector loads around, punt - we at least create
5702 very sub-optimal code in that case (and blow up memory,
5703 see PR65518). */
5704 if (single_element_p && maybe_gt (count, TYPE_VECTOR_SUBPARTS (vectype)))
5705 {
5706 if (dump_enabled_p ())
5707 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5708 "single-element interleaving not supported "
5709 "for non-adjacent vector loads\n");
5710 return false;
5711 }
5712
5713 /* vect_permute_load_chain requires the group size to be equal to 3 or
5714 be a power of two. */
5715 if (count != 3 && exact_log2 (count) == -1)
5716 {
5717 if (dump_enabled_p ())
5718 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5719 "the size of the group of accesses"
5720 " is not a power of 2 or not equal to 3\n");
5721 return false;
5722 }
5723
5724 /* Check that the permutation is supported. */
5725 if (VECTOR_MODE_P (mode))
5726 {
5727 unsigned int i, j;
5728 if (count == 3)
5729 {
5730 unsigned int nelt;
5731 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5732 {
5733 if (dump_enabled_p ())
5734 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5735 "cannot handle groups of 3 loads for"
5736 " variable-length vectors\n");
5737 return false;
5738 }
5739
5740 vec_perm_builder sel (nelt, nelt, 1);
5741 sel.quick_grow (nelt);
5742 vec_perm_indices indices;
5743 unsigned int k;
5744 for (k = 0; k < 3; k++)
5745 {
5746 for (i = 0; i < nelt; i++)
5747 if (3 * i + k < 2 * nelt)
5748 sel[i] = 3 * i + k;
5749 else
5750 sel[i] = 0;
5751 indices.new_vector (sel, 2, nelt);
5752 if (!can_vec_perm_const_p (mode, indices))
5753 {
5754 if (dump_enabled_p ())
5755 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5756 "shuffle of 3 loads is not supported by"
5757 " target\n");
5758 return false;
5759 }
5760 for (i = 0, j = 0; i < nelt; i++)
5761 if (3 * i + k < 2 * nelt)
5762 sel[i] = i;
5763 else
5764 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5765 indices.new_vector (sel, 2, nelt);
5766 if (!can_vec_perm_const_p (mode, indices))
5767 {
5768 if (dump_enabled_p ())
5769 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5770 "shuffle of 3 loads is not supported by"
5771 " target\n");
5772 return false;
5773 }
5774 }
5775 return true;
5776 }
5777 else
5778 {
5779 /* If length is not equal to 3 then only power of 2 is supported. */
5780 gcc_assert (pow2p_hwi (count));
5781 poly_uint64 nelt = GET_MODE_NUNITS (mode);
5782
5783 /* The encoding has a single stepped pattern. */
5784 vec_perm_builder sel (nelt, 1, 3);
5785 sel.quick_grow (3);
5786 for (i = 0; i < 3; i++)
5787 sel[i] = i * 2;
5788 vec_perm_indices indices (sel, 2, nelt);
5789 if (can_vec_perm_const_p (mode, indices))
5790 {
5791 for (i = 0; i < 3; i++)
5792 sel[i] = i * 2 + 1;
5793 indices.new_vector (sel, 2, nelt);
5794 if (can_vec_perm_const_p (mode, indices))
5795 return true;
5796 }
5797 }
5798 }
5799
5800 if (dump_enabled_p ())
5801 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5802 "extract even/odd not supported by target\n");
5803 return false;
5804 }
5805
5806 /* Return TRUE if vec_{mask_}load_lanes is available for COUNT vectors of
5807 type VECTYPE. MASKED_P says whether the masked form is needed. */
5808
5809 bool
5810 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5811 bool masked_p)
5812 {
5813 if (masked_p)
5814 return vect_lanes_optab_supported_p ("vec_mask_load_lanes",
5815 vec_mask_load_lanes_optab,
5816 vectype, count);
5817 else
5818 return vect_lanes_optab_supported_p ("vec_load_lanes",
5819 vec_load_lanes_optab,
5820 vectype, count);
5821 }
5822
5823 /* Function vect_permute_load_chain.
5824
5825 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5826 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5827 the input data correctly. Return the final references for loads in
5828 RESULT_CHAIN.
5829
5830 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5831 The input is 4 vectors each containing 8 elements. We assign a number to each
5832 element, the input sequence is:
5833
5834 1st vec: 0 1 2 3 4 5 6 7
5835 2nd vec: 8 9 10 11 12 13 14 15
5836 3rd vec: 16 17 18 19 20 21 22 23
5837 4th vec: 24 25 26 27 28 29 30 31
5838
5839 The output sequence should be:
5840
5841 1st vec: 0 4 8 12 16 20 24 28
5842 2nd vec: 1 5 9 13 17 21 25 29
5843 3rd vec: 2 6 10 14 18 22 26 30
5844 4th vec: 3 7 11 15 19 23 27 31
5845
5846 i.e., the first output vector should contain the first elements of each
5847 interleaving group, etc.
5848
5849 We use extract_even/odd instructions to create such output. The input of
5850 each extract_even/odd operation is two vectors
5851 1st vec 2nd vec
5852 0 1 2 3 4 5 6 7
5853
5854 and the output is the vector of extracted even/odd elements. The output of
5855 extract_even will be: 0 2 4 6
5856 and of extract_odd: 1 3 5 7
5857
5858
5859 The permutation is done in log LENGTH stages. In each stage extract_even
5860 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5861 their order. In our example,
5862
5863 E1: extract_even (1st vec, 2nd vec)
5864 E2: extract_odd (1st vec, 2nd vec)
5865 E3: extract_even (3rd vec, 4th vec)
5866 E4: extract_odd (3rd vec, 4th vec)
5867
5868 The output for the first stage will be:
5869
5870 E1: 0 2 4 6 8 10 12 14
5871 E2: 1 3 5 7 9 11 13 15
5872 E3: 16 18 20 22 24 26 28 30
5873 E4: 17 19 21 23 25 27 29 31
5874
5875 In order to proceed and create the correct sequence for the next stage (or
5876 for the correct output, if the second stage is the last one, as in our
5877 example), we first put the output of extract_even operation and then the
5878 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5879 The input for the second stage is:
5880
5881 1st vec (E1): 0 2 4 6 8 10 12 14
5882 2nd vec (E3): 16 18 20 22 24 26 28 30
5883 3rd vec (E2): 1 3 5 7 9 11 13 15
5884 4th vec (E4): 17 19 21 23 25 27 29 31
5885
5886 The output of the second stage:
5887
5888 E1: 0 4 8 12 16 20 24 28
5889 E2: 2 6 10 14 18 22 26 30
5890 E3: 1 5 9 13 17 21 25 29
5891 E4: 3 7 11 15 19 23 27 31
5892
5893 And RESULT_CHAIN after reordering:
5894
5895 1st vec (E1): 0 4 8 12 16 20 24 28
5896 2nd vec (E3): 1 5 9 13 17 21 25 29
5897 3rd vec (E2): 2 6 10 14 18 22 26 30
5898 4th vec (E4): 3 7 11 15 19 23 27 31. */
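/* As a concrete illustration (a sketch assuming an 8-element vector mode
   such as V8HI), the masks built by the power-of-two path below are

     extract_even: { 0, 2, 4, 6, 8, 10, 12, 14 }
     extract_odd:  { 1, 3, 5, 7, 9, 11, 13, 15 }

   each applied to the concatenation of a pair of input vectors.  */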
5899
5900 static void
5901 vect_permute_load_chain (vec<tree> dr_chain,
5902 unsigned int length,
5903 stmt_vec_info stmt_info,
5904 gimple_stmt_iterator *gsi,
5905 vec<tree> *result_chain)
5906 {
5907 tree data_ref, first_vect, second_vect;
5908 tree perm_mask_even, perm_mask_odd;
5909 tree perm3_mask_low, perm3_mask_high;
5910 gimple *perm_stmt;
5911 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5912 unsigned int i, j, log_length = exact_log2 (length);
5913
5914 result_chain->quick_grow (length);
5915 memcpy (result_chain->address (), dr_chain.address (),
5916 length * sizeof (tree));
5917
5918 if (length == 3)
5919 {
5920 /* vect_grouped_load_supported ensures that this is constant. */
5921 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5922 unsigned int k;
5923
5924 vec_perm_builder sel (nelt, nelt, 1);
5925 sel.quick_grow (nelt);
5926 vec_perm_indices indices;
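      /* As an illustration (assuming nelt == 8), the masks built for
	 k == 0 are

	   perm3_mask_low:  { 0, 3, 6, 9, 12, 15, _, _ }  (_ = don't care, 0)
	   perm3_mask_high: { 0, 1, 2, 3, 4, 5, 10, 13 }

	 so the first permutation gathers every third element of
	 dr_chain[0] and dr_chain[1], and the second keeps that result
	 and completes it from dr_chain[2], yielding elements
	 0, 3, 6, 9, 12, 15, 18, 21 of the interleaved data.  */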
5927 for (k = 0; k < 3; k++)
5928 {
5929 for (i = 0; i < nelt; i++)
5930 if (3 * i + k < 2 * nelt)
5931 sel[i] = 3 * i + k;
5932 else
5933 sel[i] = 0;
5934 indices.new_vector (sel, 2, nelt);
5935 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5936
5937 for (i = 0, j = 0; i < nelt; i++)
5938 if (3 * i + k < 2 * nelt)
5939 sel[i] = i;
5940 else
5941 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5942 indices.new_vector (sel, 2, nelt);
5943 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5944
5945 first_vect = dr_chain[0];
5946 second_vect = dr_chain[1];
5947
5948 /* Create interleaving stmt (low part of):
5949 low = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
5950 ...}> */
5951 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5952 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5953 second_vect, perm3_mask_low);
5954 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5955
5956 /* Create interleaving stmt (high part of):
5957 high = VEC_PERM_EXPR <first_vect, second_vect,
5958 {0, 1, ..., nelt + ((nelt + k) % 3), nelt + ((nelt + k) % 3) + 3, ...}> */
5959 first_vect = data_ref;
5960 second_vect = dr_chain[2];
5961 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5962 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5963 second_vect, perm3_mask_high);
5964 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5965 (*result_chain)[k] = data_ref;
5966 }
5967 }
5968 else
5969 {
5970 /* If length is not equal to 3 then only power of 2 is supported. */
5971 gcc_assert (pow2p_hwi (length));
5972
5973 /* The encoding has a single stepped pattern. */
5974 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5975 vec_perm_builder sel (nelt, 1, 3);
5976 sel.quick_grow (3);
5977 for (i = 0; i < 3; ++i)
5978 sel[i] = i * 2;
5979 vec_perm_indices indices (sel, 2, nelt);
5980 perm_mask_even = vect_gen_perm_mask_checked (vectype, indices);
5981
5982 for (i = 0; i < 3; ++i)
5983 sel[i] = i * 2 + 1;
5984 indices.new_vector (sel, 2, nelt);
5985 perm_mask_odd = vect_gen_perm_mask_checked (vectype, indices);
5986
5987 for (i = 0; i < log_length; i++)
5988 {
5989 for (j = 0; j < length; j += 2)
5990 {
5991 first_vect = dr_chain[j];
5992 second_vect = dr_chain[j+1];
5993
5994 /* data_ref = permute_even (first_data_ref, second_data_ref); */
5995 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
5996 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5997 first_vect, second_vect,
5998 perm_mask_even);
5999 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6000 (*result_chain)[j/2] = data_ref;
6001
6002 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
6003 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
6004 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6005 first_vect, second_vect,
6006 perm_mask_odd);
6007 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6008 (*result_chain)[j/2+length/2] = data_ref;
6009 }
6010 memcpy (dr_chain.address (), result_chain->address (),
6011 length * sizeof (tree));
6012 }
6013 }
6014 }
6015
6016 /* Function vect_shift_permute_load_chain.
6017
6018 Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate a
6019 sequence of stmts to reorder the input data accordingly.
6020 Return the final references for loads in RESULT_CHAIN.
6021 Return true if successful, false otherwise.
6022
6023 E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
6024 The input is 3 vectors each containing 8 elements. We assign a
6025 number to each element, the input sequence is:
6026
6027 1st vec: 0 1 2 3 4 5 6 7
6028 2nd vec: 8 9 10 11 12 13 14 15
6029 3rd vec: 16 17 18 19 20 21 22 23
6030
6031 The output sequence should be:
6032
6033 1st vec: 0 3 6 9 12 15 18 21
6034 2nd vec: 1 4 7 10 13 16 19 22
6035 3rd vec: 2 5 8 11 14 17 20 23
6036
6037 We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
6038
6039 First we shuffle all 3 vectors to get correct elements order:
6040
6041 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
6042 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
6043 3rd vec: (16 19 22) (17 20 23) (18 21)
6044
6045 Next we unite and shift vector 3 times:
6046
6047 1st step:
6048 shift right by 6 the concatenation of:
6049 "1st vec" and "2nd vec"
6050 ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
6051 "2nd vec" and "3rd vec"
6052 ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
6053 "3rd vec" and "1st vec"
6054 (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
6055 | New vectors |
6056
6057 So that now new vectors are:
6058
6059 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
6060 2nd vec: (10 13) (16 19 22) (17 20 23)
6061 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
6062
6063 2nd step:
6064 shift right by 5 the concatenation of:
6065 "1st vec" and "3rd vec"
6066 ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
6067 "2nd vec" and "1st vec"
6068 (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
6069 "3rd vec" and "2nd vec"
6070 (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
6071 | New vectors |
6072
6073 So that now new vectors are:
6074
6075 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
6076 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
6077 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
6078
6079 3rd step:
6080 shift right by 5 the concatenation of:
6081 "1st vec" and "1st vec"
6082 ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
6083 shift right by 3 the concatenation of:
6084 "2nd vec" and "2nd vec"
6085 (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
6086 | New vectors |
6087
6088 So that now all vectors are READY:
6089 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
6090 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
6091 3rd vec: ( 1 4 7) (10 13) (16 19 22)
6092
6093 This algorithm is faster than the one in vect_permute_load_chain if:
6094 1. "shift of a concatenation" is faster than a general permutation.
6095 This is usually so.
6096 2. The TARGET machine can't execute vector instructions in parallel.
6097 This is because each step of the algorithm depends on the previous one.
6098 The algorithm in vect_permute_load_chain is much more parallel.
6099
6100 The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
6101 */
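/* Note that each "shift right by N of a concatenation" above is realized
   below as a two-operand VEC_PERM_EXPR whose mask is a run of consecutive
   indices; e.g. (assuming nelt == 8) the shift right by 6 in the first
   step uses the mask { 6, 7, 8, 9, 10, 11, 12, 13 }.  */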
6102
6103 static bool
6104 vect_shift_permute_load_chain (vec<tree> dr_chain,
6105 unsigned int length,
6106 stmt_vec_info stmt_info,
6107 gimple_stmt_iterator *gsi,
6108 vec<tree> *result_chain)
6109 {
6110 tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
6111 tree perm2_mask1, perm2_mask2, perm3_mask;
6112 tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
6113 gimple *perm_stmt;
6114
6115 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6116 unsigned int i;
6117 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6118
6119 unsigned HOST_WIDE_INT nelt, vf;
6120 if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nelt)
6121 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&vf))
6122 /* Not supported for variable-length vectors. */
6123 return false;
6124
6125 vec_perm_builder sel (nelt, nelt, 1);
6126 sel.quick_grow (nelt);
6127
6128 result_chain->quick_grow (length);
6129 memcpy (result_chain->address (), dr_chain.address (),
6130 length * sizeof (tree));
6131
6132 if (pow2p_hwi (length) && vf > 4)
6133 {
6134 unsigned int j, log_length = exact_log2 (length);
6135 for (i = 0; i < nelt / 2; ++i)
6136 sel[i] = i * 2;
6137 for (i = 0; i < nelt / 2; ++i)
6138 sel[nelt / 2 + i] = i * 2 + 1;
6139 vec_perm_indices indices (sel, 2, nelt);
6140 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6141 {
6142 if (dump_enabled_p ())
6143 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6144 "shuffle of 2 fields structure is not "
6145 "supported by target\n");
6146 return false;
6147 }
6148 perm2_mask1 = vect_gen_perm_mask_checked (vectype, indices);
6149
6150 for (i = 0; i < nelt / 2; ++i)
6151 sel[i] = i * 2 + 1;
6152 for (i = 0; i < nelt / 2; ++i)
6153 sel[nelt / 2 + i] = i * 2;
6154 indices.new_vector (sel, 2, nelt);
6155 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6156 {
6157 if (dump_enabled_p ())
6158 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6159 "shuffle of 2 fields structure is not "
6160 "supported by target\n");
6161 return false;
6162 }
6163 perm2_mask2 = vect_gen_perm_mask_checked (vectype, indices);
6164
6165 /* Generating permutation constant to shift all elements.
6166 For vector length 8 it is {4 5 6 7 8 9 10 11}. */
6167 for (i = 0; i < nelt; i++)
6168 sel[i] = nelt / 2 + i;
6169 indices.new_vector (sel, 2, nelt);
6170 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6171 {
6172 if (dump_enabled_p ())
6173 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6174 "shift permutation is not supported by target\n");
6175 return false;
6176 }
6177 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6178
6179 /* Generating permutation constant to select vector from 2.
6180 For vector length 8 it is {0 1 2 3 12 13 14 15}. */
6181 for (i = 0; i < nelt / 2; i++)
6182 sel[i] = i;
6183 for (i = nelt / 2; i < nelt; i++)
6184 sel[i] = nelt + i;
6185 indices.new_vector (sel, 2, nelt);
6186 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6187 {
6188 if (dump_enabled_p ())
6189 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6190 "select is not supported by target\n");
6191 return false;
6192 }
6193 select_mask = vect_gen_perm_mask_checked (vectype, indices);
6194
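      /* As an illustration (assuming nelt == 8 and a pair of input vectors
	 e0..e7 and e8..e15 of a two-field structure): perm2_mask1 reorders
	 the first vector to { e0, e2, e4, e6, e1, e3, e5, e7 }, perm2_mask2
	 reorders the second to { e9, e11, e13, e15, e8, e10, e12, e14 },
	 shift1_mask then extracts the odd field
	 { e1, e3, e5, e7, e9, e11, e13, e15 } and select_mask the even
	 field { e0, e2, e4, e6, e8, e10, e12, e14 }.  */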
6195 for (i = 0; i < log_length; i++)
6196 {
6197 for (j = 0; j < length; j += 2)
6198 {
6199 first_vect = dr_chain[j];
6200 second_vect = dr_chain[j + 1];
6201
6202 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6203 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6204 first_vect, first_vect,
6205 perm2_mask1);
6206 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6207 vect[0] = data_ref;
6208
6209 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6210 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6211 second_vect, second_vect,
6212 perm2_mask2);
6213 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6214 vect[1] = data_ref;
6215
6216 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
6217 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6218 vect[0], vect[1], shift1_mask);
6219 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6220 (*result_chain)[j/2 + length/2] = data_ref;
6221
6222 data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
6223 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6224 vect[0], vect[1], select_mask);
6225 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6226 (*result_chain)[j/2] = data_ref;
6227 }
6228 memcpy (dr_chain.address (), result_chain->address (),
6229 length * sizeof (tree));
6230 }
6231 return true;
6232 }
6233 if (length == 3 && vf > 2)
6234 {
6235 unsigned int k = 0, l = 0;
6236
6237 /* Generating permutation constant to get all elements in right order.
6238 For vector length 8 it is {0 3 6 1 4 7 2 5}. */
6239 for (i = 0; i < nelt; i++)
6240 {
6241 if (3 * k + (l % 3) >= nelt)
6242 {
6243 k = 0;
6244 l += (3 - (nelt % 3));
6245 }
6246 sel[i] = 3 * k + (l % 3);
6247 k++;
6248 }
6249 vec_perm_indices indices (sel, 2, nelt);
6250 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6251 {
6252 if (dump_enabled_p ())
6253 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6254 "shuffle of 3 fields structure is not "
6255 "supported by target\n");
6256 return false;
6257 }
6258 perm3_mask = vect_gen_perm_mask_checked (vectype, indices);
6259
6260 /* Generating permutation constant to shift all elements.
6261 For vector length 8 it is {6 7 8 9 10 11 12 13}. */
6262 for (i = 0; i < nelt; i++)
6263 sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
6264 indices.new_vector (sel, 2, nelt);
6265 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6266 {
6267 if (dump_enabled_p ())
6268 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6269 "shift permutation is not supported by target\n");
6270 return false;
6271 }
6272 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6273
6274 /* Generating permutation constant to shift all elements.
6275 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6276 for (i = 0; i < nelt; i++)
6277 sel[i] = 2 * (nelt / 3) + 1 + i;
6278 indices.new_vector (sel, 2, nelt);
6279 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6280 {
6281 if (dump_enabled_p ())
6282 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6283 "shift permutation is not supported by target\n");
6284 return false;
6285 }
6286 shift2_mask = vect_gen_perm_mask_checked (vectype, indices);
6287
6288 /* Generating permutation constant to shift all elements.
6289 For vector length 8 it is {3 4 5 6 7 8 9 10}. */
6290 for (i = 0; i < nelt; i++)
6291 sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
6292 indices.new_vector (sel, 2, nelt);
6293 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6294 {
6295 if (dump_enabled_p ())
6296 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6297 "shift permutation is not supported by target\n");
6298 return false;
6299 }
6300 shift3_mask = vect_gen_perm_mask_checked (vectype, indices);
6301
6302 /* Generating permutation constant to shift all elements.
6303 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6304 for (i = 0; i < nelt; i++)
6305 sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
6306 indices.new_vector (sel, 2, nelt);
6307 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6308 {
6309 if (dump_enabled_p ())
6310 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6311 "shift permutation is not supported by target\n");
6312 return false;
6313 }
6314 shift4_mask = vect_gen_perm_mask_checked (vectype, indices);
6315
6316 for (k = 0; k < 3; k++)
6317 {
6318 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
6319 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6320 dr_chain[k], dr_chain[k],
6321 perm3_mask);
6322 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6323 vect[k] = data_ref;
6324 }
6325
6326 for (k = 0; k < 3; k++)
6327 {
6328 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
6329 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6330 vect[k % 3], vect[(k + 1) % 3],
6331 shift1_mask);
6332 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6333 vect_shift[k] = data_ref;
6334 }
6335
6336 for (k = 0; k < 3; k++)
6337 {
6338 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
6339 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6340 vect_shift[(4 - k) % 3],
6341 vect_shift[(3 - k) % 3],
6342 shift2_mask);
6343 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6344 vect[k] = data_ref;
6345 }
6346
6347 (*result_chain)[3 - (nelt % 3)] = vect[2];
6348
6349 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
6350 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
6351 vect[0], shift3_mask);
6352 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6353 (*result_chain)[nelt % 3] = data_ref;
6354
6355 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
6356 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
6357 vect[1], shift4_mask);
6358 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6359 (*result_chain)[0] = data_ref;
6360 return true;
6361 }
6362 return false;
6363 }
6364
6365 /* Function vect_transform_grouped_load.
6366
6367 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
6368 to perform their permutation and ascribe the result vectorized statements to
6369 to perform their permutation and ascribe the resulting vectorized statements to
6370 */
6371
6372 void
6373 vect_transform_grouped_load (stmt_vec_info stmt_info, vec<tree> dr_chain,
6374 int size, gimple_stmt_iterator *gsi)
6375 {
6376 machine_mode mode;
6377 vec<tree> result_chain = vNULL;
6378
6379 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
6380 RESULT_CHAIN is the output of vect_permute_load_chain; it contains permuted
6381 vectors that are ready for vector computation. */
6382 result_chain.create (size);
6383
6384 /* If the reassociation width for the vector type is 2 or greater, the target
6385 machine can execute 2 or more vector instructions in parallel. Otherwise
6386 try to get the chain for the load group using vect_shift_permute_load_chain. */
6387 mode = TYPE_MODE (STMT_VINFO_VECTYPE (stmt_info));
6388 if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
6389 || pow2p_hwi (size)
6390 || !vect_shift_permute_load_chain (dr_chain, size, stmt_info,
6391 gsi, &result_chain))
6392 vect_permute_load_chain (dr_chain, size, stmt_info, gsi, &result_chain);
6393 vect_record_grouped_load_vectors (stmt_info, result_chain);
6394 result_chain.release ();
6395 }
6396
6397 /* RESULT_CHAIN contains the output of a group of grouped loads that were
6398 generated as part of the vectorization of STMT_INFO. Assign the statement
6399 for each vector to the associated scalar statement. */
6400
6401 void
6402 vect_record_grouped_load_vectors (stmt_vec_info stmt_info,
6403 vec<tree> result_chain)
6404 {
6405 vec_info *vinfo = stmt_info->vinfo;
6406 stmt_vec_info first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
6407 unsigned int i, gap_count;
6408 tree tmp_data_ref;
6409
6410 /* Put a permuted data-ref in the VECTORIZED_STMT field.
6411 Since we scan the chain starting from its first node, their order
6412 corresponds to the order of data-refs in RESULT_CHAIN. */
6413 stmt_vec_info next_stmt_info = first_stmt_info;
6414 gap_count = 1;
6415 FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
6416 {
6417 if (!next_stmt_info)
6418 break;
6419
6420 /* Skip the gaps. Loads created for the gaps will be removed by dead
6421 code elimination pass later. No need to check for the first stmt in
6422 the group, since it always exists.
6423 DR_GROUP_GAP is the number of steps in elements from the previous
6424 access (if there is no gap DR_GROUP_GAP is 1). We skip loads that
6425 correspond to the gaps. */
6426 if (next_stmt_info != first_stmt_info
6427 && gap_count < DR_GROUP_GAP (next_stmt_info))
6428 {
6429 gap_count++;
6430 continue;
6431 }
6432
6433 /* ??? The following needs cleanup after the removal of
6434 DR_GROUP_SAME_DR_STMT. */
6435 if (next_stmt_info)
6436 {
6437 stmt_vec_info new_stmt_info = vinfo->lookup_def (tmp_data_ref);
6438 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
6439 copies, and we put the new vector statement in the first available
6440 RELATED_STMT. */
6441 if (!STMT_VINFO_VEC_STMT (next_stmt_info))
6442 STMT_VINFO_VEC_STMT (next_stmt_info) = new_stmt_info;
6443 else
6444 {
6445 stmt_vec_info prev_stmt_info
6446 = STMT_VINFO_VEC_STMT (next_stmt_info);
6447 stmt_vec_info rel_stmt_info
6448 = STMT_VINFO_RELATED_STMT (prev_stmt_info);
6449 while (rel_stmt_info)
6450 {
6451 prev_stmt_info = rel_stmt_info;
6452 rel_stmt_info = STMT_VINFO_RELATED_STMT (rel_stmt_info);
6453 }
6454
6455 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt_info;
6456 }
6457
6458 next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
6459 gap_count = 1;
6460 }
6461 }
6462 }
6463
6464 /* Function vect_can_force_dr_alignment_p.
6465
6466 Returns whether the alignment of a DECL can be forced to be aligned
6467 on an ALIGNMENT-bit boundary. */
6468
6469 bool
6470 vect_can_force_dr_alignment_p (const_tree decl, poly_uint64 alignment)
6471 {
6472 if (!VAR_P (decl))
6473 return false;
6474
6475 if (decl_in_symtab_p (decl)
6476 && !symtab_node::get (decl)->can_increase_alignment_p ())
6477 return false;
6478
6479 if (TREE_STATIC (decl))
6480 return (known_le (alignment,
6481 (unsigned HOST_WIDE_INT) MAX_OFILE_ALIGNMENT));
6482 else
6483 return (known_le (alignment, (unsigned HOST_WIDE_INT) MAX_STACK_ALIGNMENT));
6484 }
6485
6486
6487 /* Return whether the data reference DR_INFO is supported with respect to its
6488 alignment.
6489 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
6490 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even if
6491 alignment. */
6492
6493 enum dr_alignment_support
6494 vect_supportable_dr_alignment (dr_vec_info *dr_info,
6495 bool check_aligned_accesses)
6496 {
6497 data_reference *dr = dr_info->dr;
6498 stmt_vec_info stmt_info = dr_info->stmt;
6499 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6500 machine_mode mode = TYPE_MODE (vectype);
6501 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6502 class loop *vect_loop = NULL;
6503 bool nested_in_vect_loop = false;
6504
6505 if (aligned_access_p (dr_info) && !check_aligned_accesses)
6506 return dr_aligned;
6507
6508 /* For now assume all conditional loads/stores support unaligned
6509 access without any special code. */
6510 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
6511 if (gimple_call_internal_p (stmt)
6512 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
6513 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
6514 return dr_unaligned_supported;
6515
6516 if (loop_vinfo)
6517 {
6518 vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
6519 nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt_info);
6520 }
6521
6522 /* Possibly unaligned access. */
6523
6524 /* We can choose between using the implicit realignment scheme (generating
6525 a misaligned_move stmt) and the explicit realignment scheme (generating
6526 aligned loads with a REALIGN_LOAD). There are two variants to the
6527 explicit realignment scheme: optimized, and unoptimized.
6528 We can optimize the realignment only if the step between consecutive
6529 vector loads is equal to the vector size. Since the vector memory
6530 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
6531 is guaranteed that the misalignment amount remains the same throughout the
6532 execution of the vectorized loop. Therefore, we can create the
6533 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
6534 at the loop preheader.
6535
6536 However, in the case of outer-loop vectorization, when vectorizing a
6537 memory access in the inner-loop nested within the LOOP that is now being
6538 vectorized, while it is guaranteed that the misalignment of the
6539 vectorized memory access will remain the same in different outer-loop
6540 iterations, it is *not* guaranteed that it will remain the same throughout
6541 the execution of the inner-loop. This is because the inner-loop advances
6542 with the original scalar step (and not in steps of VS). If the inner-loop
6543 step happens to be a multiple of VS, then the misalignment remains fixed
6544 and we can use the optimized realignment scheme. For example:
6545
6546 for (i=0; i<N; i++)
6547 for (j=0; j<M; j++)
6548 s += a[i+j];
6549
6550 When vectorizing the i-loop in the above example, the step between
6551 consecutive vector loads is 1, and so the misalignment does not remain
6552 fixed across the execution of the inner-loop, and the realignment cannot
6553 be optimized (as illustrated in the following pseudo vectorized loop):
6554
6555 for (i=0; i<N; i+=4)
6556 for (j=0; j<M; j++){
6557 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
6558 // when j is {0,1,2,3,4,5,6,7,...} respectively.
6559 // (assuming that we start from an aligned address).
6560 }
6561
6562 We therefore have to use the unoptimized realignment scheme:
6563
6564 for (i=0; i<N; i+=4)
6565 for (j=k; j<M; j+=4)
6566 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
6567 // that the misalignment of the initial address is
6568 // 0).
6569
6570 The loop can then be vectorized as follows:
6571
6572 for (k=0; k<4; k++){
6573 rt = get_realignment_token (&vp[k]);
6574 for (i=0; i<N; i+=4){
6575 v1 = vp[i+k];
6576 for (j=k; j<M; j+=4){
6577 v2 = vp[i+j+VS-1];
6578 va = REALIGN_LOAD <v1,v2,rt>;
6579 vs += va;
6580 v1 = v2;
6581 }
6582 }
6583 } */
6584
6585 if (DR_IS_READ (dr))
6586 {
6587 bool is_packed = false;
6588 tree type = (TREE_TYPE (DR_REF (dr)));
6589
6590 if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
6591 && (!targetm.vectorize.builtin_mask_for_load
6592 || targetm.vectorize.builtin_mask_for_load ()))
6593 {
6594 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6595
6596 /* If we are doing SLP then the accesses need not have the
6597 same alignment; instead it depends on the SLP group size. */
6598 if (loop_vinfo
6599 && STMT_SLP_TYPE (stmt_info)
6600 && !multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6601 * (DR_GROUP_SIZE
6602 (DR_GROUP_FIRST_ELEMENT (stmt_info))),
6603 TYPE_VECTOR_SUBPARTS (vectype)))
6604 ;
6605 else if (!loop_vinfo
6606 || (nested_in_vect_loop
6607 && maybe_ne (TREE_INT_CST_LOW (DR_STEP (dr)),
6608 GET_MODE_SIZE (TYPE_MODE (vectype)))))
6609 return dr_explicit_realign;
6610 else
6611 return dr_explicit_realign_optimized;
6612 }
6613 if (!known_alignment_for_access_p (dr_info))
6614 is_packed = not_size_aligned (DR_REF (dr));
6615
6616 if (targetm.vectorize.support_vector_misalignment
6617 (mode, type, DR_MISALIGNMENT (dr_info), is_packed))
6618 /* Can't software pipeline the loads, but can at least do them. */
6619 return dr_unaligned_supported;
6620 }
6621 else
6622 {
6623 bool is_packed = false;
6624 tree type = (TREE_TYPE (DR_REF (dr)));
6625
6626 if (!known_alignment_for_access_p (dr_info))
6627 is_packed = not_size_aligned (DR_REF (dr));
6628
6629 if (targetm.vectorize.support_vector_misalignment
6630 (mode, type, DR_MISALIGNMENT (dr_info), is_packed))
6631 return dr_unaligned_supported;
6632 }
6633
6634 /* Unsupported. */
6635 return dr_unaligned_unsupported;
6636 }