1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2 Copyright (C) 2003-2020 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "predict.h"
31 #include "memmodel.h"
32 #include "tm_p.h"
33 #include "ssa.h"
34 #include "optabs-tree.h"
35 #include "cgraph.h"
36 #include "dumpfile.h"
37 #include "alias.h"
38 #include "fold-const.h"
39 #include "stor-layout.h"
40 #include "tree-eh.h"
41 #include "gimplify.h"
42 #include "gimple-iterator.h"
43 #include "gimplify-me.h"
44 #include "tree-ssa-loop-ivopts.h"
45 #include "tree-ssa-loop-manip.h"
46 #include "tree-ssa-loop.h"
47 #include "cfgloop.h"
48 #include "tree-scalar-evolution.h"
49 #include "tree-vectorizer.h"
50 #include "expr.h"
51 #include "builtins.h"
52 #include "tree-cfg.h"
53 #include "tree-hash-traits.h"
54 #include "vec-perm-indices.h"
55 #include "internal-fn.h"
56
57 /* Return true if load- or store-lanes optab OPTAB is implemented for
58 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
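/* As an illustrative example (the exact modes depend on the target):
   for COUNT == 2 and a 128-bit VECTYPE, ARRAY_MODE is a 256-bit mode
   holding both vectors, and the load-lanes form corresponds to
   de-interleaving instructions such as AArch64 LD2, which fill two
   vector registers from consecutive memory in one operation.  */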
59
60 static bool
61 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
62 tree vectype, unsigned HOST_WIDE_INT count)
63 {
64 machine_mode mode, array_mode;
65 bool limit_p;
66
67 mode = TYPE_MODE (vectype);
68 if (!targetm.array_mode (mode, count).exists (&array_mode))
69 {
70 poly_uint64 bits = count * GET_MODE_BITSIZE (mode);
71 limit_p = !targetm.array_mode_supported_p (mode, count);
72 if (!int_mode_for_size (bits, limit_p).exists (&array_mode))
73 {
74 if (dump_enabled_p ())
75 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
76 "no array mode for %s[%wu]\n",
77 GET_MODE_NAME (mode), count);
78 return false;
79 }
80 }
81
82 if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
83 {
84 if (dump_enabled_p ())
85 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
86 "cannot use %s<%s><%s>\n", name,
87 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
88 return false;
89 }
90
91 if (dump_enabled_p ())
92 dump_printf_loc (MSG_NOTE, vect_location,
93 "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
94 GET_MODE_NAME (mode));
95
96 return true;
97 }
98
99
100 /* Return the smallest scalar part of STMT_INFO.
101 This is used to determine the vectype of the stmt. We generally set the
102 vectype according to the type of the result (lhs). For stmts whose
103 result-type is different than the type of the arguments (e.g., demotion,
104 promotion), vectype will be reset appropriately (later). Note that we have
105 to visit the smallest datatype in this function, because that determines the
106 VF. If the smallest datatype in the loop is present only as the rhs of a
107 promotion operation, looking only at the lhs would miss it.
108 Such a case, where a variable of this datatype does not appear in the lhs
109 anywhere in the loop, can only occur if it's an invariant: e.g.:
110 'int_x = (int) short_inv', which we'd expect to have been optimized away by
111 invariant motion. However, we cannot rely on invariant motion to always
112 take invariants out of the loop, and so in the case of promotion we also
113 have to check the rhs.
114 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
115 types. */
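/* As an illustrative example, for the promotion statement

     int_x = (int) short_inv;

   the lhs type is 4 bytes and the rhs type is 2 bytes, so this function
   returns the "short" type with *LHS_SIZE_UNIT == 4 and
   *RHS_SIZE_UNIT == 2; assuming 128-bit vectors, the short type then
   implies a VF of 8 rather than the 4 that the lhs alone would suggest.  */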
116
117 tree
118 vect_get_smallest_scalar_type (stmt_vec_info stmt_info,
119 HOST_WIDE_INT *lhs_size_unit,
120 HOST_WIDE_INT *rhs_size_unit)
121 {
122 tree scalar_type = gimple_expr_type (stmt_info->stmt);
123 HOST_WIDE_INT lhs, rhs;
124
125 /* During the analysis phase, this function is called on arbitrary
126 statements that might not have scalar results. */
127 if (!tree_fits_uhwi_p (TYPE_SIZE_UNIT (scalar_type)))
128 return scalar_type;
129
130 lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
131
132 gassign *assign = dyn_cast <gassign *> (stmt_info->stmt);
133 if (assign
134 && (gimple_assign_cast_p (assign)
135 || gimple_assign_rhs_code (assign) == DOT_PROD_EXPR
136 || gimple_assign_rhs_code (assign) == WIDEN_SUM_EXPR
137 || gimple_assign_rhs_code (assign) == WIDEN_MULT_EXPR
138 || gimple_assign_rhs_code (assign) == WIDEN_LSHIFT_EXPR
139 || gimple_assign_rhs_code (assign) == FLOAT_EXPR))
140 {
141 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (assign));
142
143 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
144 if (rhs < lhs)
145 scalar_type = rhs_type;
146 }
147 else if (gcall *call = dyn_cast <gcall *> (stmt_info->stmt))
148 {
149 unsigned int i = 0;
150 if (gimple_call_internal_p (call))
151 {
152 internal_fn ifn = gimple_call_internal_fn (call);
153 if (internal_load_fn_p (ifn) || internal_store_fn_p (ifn))
154 /* gimple_expr_type already picked the type of the loaded
155 or stored data. */
156 i = ~0U;
157 else if (internal_fn_mask_index (ifn) == 0)
158 i = 1;
159 }
160 if (i < gimple_call_num_args (call))
161 {
162 tree rhs_type = TREE_TYPE (gimple_call_arg (call, i));
163 if (tree_fits_uhwi_p (TYPE_SIZE_UNIT (rhs_type)))
164 {
165 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
166 if (rhs < lhs)
167 scalar_type = rhs_type;
168 }
169 }
170 }
171
172 *lhs_size_unit = lhs;
173 *rhs_size_unit = rhs;
174 return scalar_type;
175 }
176
177
178 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
179 tested at run-time. Return TRUE if DDR was successfully inserted.
180 Return false if versioning is not supported. */
181
182 static opt_result
183 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
184 {
185 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
186
187 if ((unsigned) param_vect_max_version_for_alias_checks == 0)
188 return opt_result::failure_at (vect_location,
189 "will not create alias checks, as"
190 " --param vect-max-version-for-alias-checks"
191 " == 0\n");
192
193 opt_result res
194 = runtime_alias_check_p (ddr, loop,
195 optimize_loop_nest_for_speed_p (loop));
196 if (!res)
197 return res;
198
199 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
200 return opt_result::success ();
201 }
202
203 /* Record that loop LOOP_VINFO needs to check that VALUE is nonzero. */
204
205 static void
206 vect_check_nonzero_value (loop_vec_info loop_vinfo, tree value)
207 {
208 vec<tree> checks = LOOP_VINFO_CHECK_NONZERO (loop_vinfo);
209 for (unsigned int i = 0; i < checks.length(); ++i)
210 if (checks[i] == value)
211 return;
212
213 if (dump_enabled_p ())
214 dump_printf_loc (MSG_NOTE, vect_location,
215 "need run-time check that %T is nonzero\n",
216 value);
217 LOOP_VINFO_CHECK_NONZERO (loop_vinfo).safe_push (value);
218 }
219
220 /* Return true if we know that the order of vectorized DR_INFO_A and
221 vectorized DR_INFO_B will be the same as the order of DR_INFO_A and
222 DR_INFO_B. At least one of the accesses is a write. */
223
224 static bool
225 vect_preserves_scalar_order_p (dr_vec_info *dr_info_a, dr_vec_info *dr_info_b)
226 {
227 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
228 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
229
230 /* Single statements are always kept in their original order. */
231 if (!STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
232 && !STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
233 return true;
234
235 /* STMT_A and STMT_B belong to overlapping groups. All loads in a
236 SLP group are emitted at the position of the last scalar load and
237 all loads in an interleaving group are emitted at the position
238 of the first scalar load.
239 Stores in a group are emitted at the position of the last scalar store.
240 Compute that position and check whether the resulting order matches
241 the current one.
242 We have not yet decided between SLP and interleaving so we have
243 to conservatively assume both. */
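/* As an illustrative example, for the scalar sequence

     a[i] = x;      <- first store of a two-element store group
     y = a[i];      <- ungrouped load
     a[i+1] = z;    <- last store of the group

   the vectorized store is emitted at the position of the last scalar
   store, i.e. after the load, whereas the scalar store of a[i] preceded
   it; for the pair (store of a[i], load of a[i]) the order is therefore
   not preserved and this function returns false.  */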
244 stmt_vec_info il_a;
245 stmt_vec_info last_a = il_a = DR_GROUP_FIRST_ELEMENT (stmtinfo_a);
246 if (last_a)
247 {
248 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (last_a); s;
249 s = DR_GROUP_NEXT_ELEMENT (s))
250 last_a = get_later_stmt (last_a, s);
251 if (!DR_IS_WRITE (STMT_VINFO_DATA_REF (stmtinfo_a)))
252 {
253 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_a); s;
254 s = DR_GROUP_NEXT_ELEMENT (s))
255 if (get_later_stmt (il_a, s) == il_a)
256 il_a = s;
257 }
258 else
259 il_a = last_a;
260 }
261 else
262 last_a = il_a = stmtinfo_a;
263 stmt_vec_info il_b;
264 stmt_vec_info last_b = il_b = DR_GROUP_FIRST_ELEMENT (stmtinfo_b);
265 if (last_b)
266 {
267 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (last_b); s;
268 s = DR_GROUP_NEXT_ELEMENT (s))
269 last_b = get_later_stmt (last_b, s);
270 if (!DR_IS_WRITE (STMT_VINFO_DATA_REF (stmtinfo_b)))
271 {
272 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_b); s;
273 s = DR_GROUP_NEXT_ELEMENT (s))
274 if (get_later_stmt (il_b, s) == il_b)
275 il_b = s;
276 }
277 else
278 il_b = last_b;
279 }
280 else
281 last_b = il_b = stmtinfo_b;
282 bool a_after_b = (get_later_stmt (stmtinfo_a, stmtinfo_b) == stmtinfo_a);
283 return (/* SLP */
284 (get_later_stmt (last_a, last_b) == last_a) == a_after_b
285 /* Interleaving */
286 && (get_later_stmt (il_a, il_b) == il_a) == a_after_b
287 /* Mixed */
288 && (get_later_stmt (il_a, last_b) == il_a) == a_after_b
289 && (get_later_stmt (last_a, il_b) == last_a) == a_after_b);
290 }
291
292 /* A subroutine of vect_analyze_data_ref_dependence. Handle
293 a DDR_COULD_BE_INDEPENDENT_P relation DDR that has a known set of dependence
294 distances. These distances are conservatively correct but they don't
295 reflect a guaranteed dependence.
296
297 Return true if this function does all the work necessary to avoid
298 an alias or false if the caller should use the dependence distances
299 to limit the vectorization factor in the usual way. LOOP_DEPTH is
300 the depth of the loop described by LOOP_VINFO and the other arguments
301 are as for vect_analyze_data_ref_dependence. */
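/* As an illustrative example (A and B are hypothetical pointer
   parameters), in

     for (i = 0; i < n; i++)
       a->x[i] = b->x[i + 4];

   the recorded distance is only meaningful under the assumption that A
   and B refer to the same object; if they do not, the accesses are
   independent, which is what the runtime alias check below can
   establish.  */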
302
303 static bool
304 vect_analyze_possibly_independent_ddr (data_dependence_relation *ddr,
305 loop_vec_info loop_vinfo,
306 int loop_depth, unsigned int *max_vf)
307 {
308 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
309 lambda_vector dist_v;
310 unsigned int i;
311 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
312 {
313 int dist = dist_v[loop_depth];
314 if (dist != 0 && !(dist > 0 && DDR_REVERSED_P (ddr)))
315 {
316 /* If the user asserted safelen >= DIST consecutive iterations
317 can be executed concurrently, assume independence.
318
319 ??? An alternative would be to add the alias check even
320 in this case, and vectorize the fallback loop with the
321 maximum VF set to safelen. However, if the user has
322 explicitly given a length, it's less likely that that
323 would be a win. */
324 if (loop->safelen >= 2 && abs_hwi (dist) <= loop->safelen)
325 {
326 if ((unsigned int) loop->safelen < *max_vf)
327 *max_vf = loop->safelen;
328 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
329 continue;
330 }
331
332 /* For dependence distances of 2 or more, we have the option
333 of limiting VF or checking for an alias at runtime.
334 Prefer to check at runtime if we can, to avoid limiting
335 the VF unnecessarily when the bases are in fact independent.
336
337 Note that the alias checks will be removed if the VF ends up
338 being small enough. */
339 dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (DDR_A (ddr));
340 dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (DDR_B (ddr));
341 return (!STMT_VINFO_GATHER_SCATTER_P (dr_info_a->stmt)
342 && !STMT_VINFO_GATHER_SCATTER_P (dr_info_b->stmt)
343 && vect_mark_for_runtime_alias_test (ddr, loop_vinfo));
344 }
345 }
346 return true;
347 }
348
349
350 /* Function vect_analyze_data_ref_dependence.
351
352 Return an opt_result failure if there (might) exist a dependence
353 between memory-reference DRA and memory-reference DRB that cannot
354 be handled. If the dependence can be checked at run time by
355 versioning for alias, record the check in LOOP_VINFO and return
356 success. Adjust *MAX_VF according to any known dependence
357 distance. */
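/* As an illustrative example, in

     for (i = 0; i < n; i++)
       a[i + 3] = a[i] + 1;

   the value stored in iteration I is read again in iteration I + 3, a
   dependence distance of 3, so at most three consecutive iterations can
   be executed as one vector and *MAX_VF is capped at 3.  */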
358
359 static opt_result
360 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
361 loop_vec_info loop_vinfo,
362 unsigned int *max_vf)
363 {
364 unsigned int i;
365 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
366 struct data_reference *dra = DDR_A (ddr);
367 struct data_reference *drb = DDR_B (ddr);
368 dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (dra);
369 dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (drb);
370 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
371 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
372 lambda_vector dist_v;
373 unsigned int loop_depth;
374
375 /* In loop analysis all data references should be vectorizable. */
376 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
377 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
378 gcc_unreachable ();
379
380 /* Independent data accesses. */
381 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
382 return opt_result::success ();
383
384 if (dra == drb
385 || (DR_IS_READ (dra) && DR_IS_READ (drb)))
386 return opt_result::success ();
387
388 /* We do not have to consider dependences between accesses that belong
389 to the same group, unless the stride could be smaller than the
390 group size. */
391 if (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
392 && (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
393 == DR_GROUP_FIRST_ELEMENT (stmtinfo_b))
394 && !STMT_VINFO_STRIDED_P (stmtinfo_a))
395 return opt_result::success ();
396
397 /* Even if we have an anti-dependence then, as the vectorized loop covers at
398 least two scalar iterations, there is always also a true dependence.
399 As the vectorizer does not re-order loads and stores we can ignore
400 the anti-dependence if TBAA can disambiguate both DRs similar to the
401 case with known negative distance anti-dependences (positive
402 distance anti-dependences would violate TBAA constraints). */
403 if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
404 || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
405 && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
406 get_alias_set (DR_REF (drb))))
407 return opt_result::success ();
408
409 /* Unknown data dependence. */
410 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
411 {
412 /* If user asserted safelen consecutive iterations can be
413 executed concurrently, assume independence. */
414 if (loop->safelen >= 2)
415 {
416 if ((unsigned int) loop->safelen < *max_vf)
417 *max_vf = loop->safelen;
418 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
419 return opt_result::success ();
420 }
421
422 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
423 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
424 return opt_result::failure_at
425 (stmtinfo_a->stmt,
426 "versioning for alias not supported for: "
427 "can't determine dependence between %T and %T\n",
428 DR_REF (dra), DR_REF (drb));
429
430 if (dump_enabled_p ())
431 dump_printf_loc (MSG_MISSED_OPTIMIZATION, stmtinfo_a->stmt,
432 "versioning for alias required: "
433 "can't determine dependence between %T and %T\n",
434 DR_REF (dra), DR_REF (drb));
435
436 /* Add to list of ddrs that need to be tested at run-time. */
437 return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
438 }
439
440 /* Known data dependence. */
441 if (DDR_NUM_DIST_VECTS (ddr) == 0)
442 {
443 /* If user asserted safelen consecutive iterations can be
444 executed concurrently, assume independence. */
445 if (loop->safelen >= 2)
446 {
447 if ((unsigned int) loop->safelen < *max_vf)
448 *max_vf = loop->safelen;
449 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
450 return opt_result::success ();
451 }
452
453 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
454 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
455 return opt_result::failure_at
456 (stmtinfo_a->stmt,
457 "versioning for alias not supported for: "
458 "bad dist vector for %T and %T\n",
459 DR_REF (dra), DR_REF (drb));
460
461 if (dump_enabled_p ())
462 dump_printf_loc (MSG_MISSED_OPTIMIZATION, stmtinfo_a->stmt,
463 "versioning for alias required: "
464 "bad dist vector for %T and %T\n",
465 DR_REF (dra), DR_REF (drb));
466 /* Add to list of ddrs that need to be tested at run-time. */
467 return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
468 }
469
470 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
471
472 if (DDR_COULD_BE_INDEPENDENT_P (ddr)
473 && vect_analyze_possibly_independent_ddr (ddr, loop_vinfo,
474 loop_depth, max_vf))
475 return opt_result::success ();
476
477 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
478 {
479 int dist = dist_v[loop_depth];
480
481 if (dump_enabled_p ())
482 dump_printf_loc (MSG_NOTE, vect_location,
483 "dependence distance = %d.\n", dist);
484
485 if (dist == 0)
486 {
487 if (dump_enabled_p ())
488 dump_printf_loc (MSG_NOTE, vect_location,
489 "dependence distance == 0 between %T and %T\n",
490 DR_REF (dra), DR_REF (drb));
491
492 /* When we perform grouped accesses and perform implicit CSE
493 by detecting equal accesses and doing disambiguation with
494 runtime alias tests like for
495 .. = a[i];
496 .. = a[i+1];
497 a[i] = ..;
498 a[i+1] = ..;
499 *p = ..;
500 .. = a[i];
501 .. = a[i+1];
502 where we will end up loading { a[i], a[i+1] } once, make
503 sure that inserting group loads before the first load and
504 stores after the last store will do the right thing.
505 Similar for groups like
506 a[i] = ...;
507 ... = a[i];
508 a[i+1] = ...;
509 where loads from the group interleave with the store. */
510 if (!vect_preserves_scalar_order_p (dr_info_a, dr_info_b))
511 return opt_result::failure_at (stmtinfo_a->stmt,
512 "READ_WRITE dependence"
513 " in interleaving.\n");
514
515 if (loop->safelen < 2)
516 {
517 tree indicator = dr_zero_step_indicator (dra);
518 if (!indicator || integer_zerop (indicator))
519 return opt_result::failure_at (stmtinfo_a->stmt,
520 "access also has a zero step\n");
521 else if (TREE_CODE (indicator) != INTEGER_CST)
522 vect_check_nonzero_value (loop_vinfo, indicator);
523 }
524 continue;
525 }
526
527 if (dist > 0 && DDR_REVERSED_P (ddr))
528 {
529 /* If DDR_REVERSED_P the order of the data-refs in DDR was
530 reversed (to make distance vector positive), and the actual
531 distance is negative. */
532 if (dump_enabled_p ())
533 dump_printf_loc (MSG_NOTE, vect_location,
534 "dependence distance negative.\n");
535 /* When doing outer loop vectorization, we need to check if there is
536 a backward dependence at the inner loop level if the dependence
537 at the outer loop is reversed. See PR81740. */
538 if (nested_in_vect_loop_p (loop, stmtinfo_a)
539 || nested_in_vect_loop_p (loop, stmtinfo_b))
540 {
541 unsigned inner_depth = index_in_loop_nest (loop->inner->num,
542 DDR_LOOP_NEST (ddr));
543 if (dist_v[inner_depth] < 0)
544 return opt_result::failure_at (stmtinfo_a->stmt,
545 "not vectorized, dependence "
546 "between data-refs %T and %T\n",
547 DR_REF (dra), DR_REF (drb));
548 }
549 /* Record a negative dependence distance to later limit the
550 amount of stmt copying / unrolling we can perform.
551 Only need to handle read-after-write dependence. */
552 if (DR_IS_READ (drb)
553 && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
554 || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
555 STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
556 continue;
557 }
558
559 unsigned int abs_dist = abs (dist);
560 if (abs_dist >= 2 && abs_dist < *max_vf)
561 {
562 /* The dependence distance requires reduction of the maximal
563 vectorization factor. */
564 *max_vf = abs_dist;
565 if (dump_enabled_p ())
566 dump_printf_loc (MSG_NOTE, vect_location,
567 "adjusting maximal vectorization factor to %i\n",
568 *max_vf);
569 }
570
571 if (abs_dist >= *max_vf)
572 {
573 /* Dependence distance does not create dependence, as far as
574 vectorization is concerned, in this case. */
575 if (dump_enabled_p ())
576 dump_printf_loc (MSG_NOTE, vect_location,
577 "dependence distance >= VF.\n");
578 continue;
579 }
580
581 return opt_result::failure_at (stmtinfo_a->stmt,
582 "not vectorized, possible dependence "
583 "between data-refs %T and %T\n",
584 DR_REF (dra), DR_REF (drb));
585 }
586
587 return opt_result::success ();
588 }
589
590 /* Function vect_analyze_data_ref_dependences.
591
592 Examine all the data references in the loop, and make sure there do not
593 exist any data dependences between them. Set *MAX_VF according to
594 the maximum vectorization factor the data dependences allow. */
595
596 opt_result
597 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo,
598 unsigned int *max_vf)
599 {
600 unsigned int i;
601 struct data_dependence_relation *ddr;
602
603 DUMP_VECT_SCOPE ("vect_analyze_data_ref_dependences");
604
605 if (!LOOP_VINFO_DDRS (loop_vinfo).exists ())
606 {
607 LOOP_VINFO_DDRS (loop_vinfo)
608 .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
609 * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
610 /* We need read-read dependences to compute
611 STMT_VINFO_SAME_ALIGN_REFS. */
612 bool res = compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
613 &LOOP_VINFO_DDRS (loop_vinfo),
614 LOOP_VINFO_LOOP_NEST (loop_vinfo),
615 true);
616 gcc_assert (res);
617 }
618
619 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
620
621 /* For epilogues we either have no aliases or alias versioning
622 was applied to the original loop. Therefore we may just get max_vf
623 using the VF of the original loop. */
624 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
625 *max_vf = LOOP_VINFO_ORIG_MAX_VECT_FACTOR (loop_vinfo);
626 else
627 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
628 {
629 opt_result res
630 = vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf);
631 if (!res)
632 return res;
633 }
634
635 return opt_result::success ();
636 }
637
638
639 /* Function vect_slp_analyze_data_ref_dependence.
640
641 Return TRUE if there (might) exist a dependence between the memory
642 references DRA and DRB of VINFO that prohibits reordering their
643 statements; return FALSE if they are known to be independent or the
644 dependence is harmless (e.g. read-read or within one interleaving chain). */
645
646 static bool
647 vect_slp_analyze_data_ref_dependence (vec_info *vinfo,
648 struct data_dependence_relation *ddr)
649 {
650 struct data_reference *dra = DDR_A (ddr);
651 struct data_reference *drb = DDR_B (ddr);
652 dr_vec_info *dr_info_a = vinfo->lookup_dr (dra);
653 dr_vec_info *dr_info_b = vinfo->lookup_dr (drb);
654
655 /* We need to check dependences of statements marked as unvectorizable
656 as well, they still can prohibit vectorization. */
657
658 /* Independent data accesses. */
659 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
660 return false;
661
662 if (dra == drb)
663 return false;
664
665 /* Read-read is OK. */
666 if (DR_IS_READ (dra) && DR_IS_READ (drb))
667 return false;
668
669 /* If dra and drb are part of the same interleaving chain consider
670 them independent. */
671 if (STMT_VINFO_GROUPED_ACCESS (dr_info_a->stmt)
672 && (DR_GROUP_FIRST_ELEMENT (dr_info_a->stmt)
673 == DR_GROUP_FIRST_ELEMENT (dr_info_b->stmt)))
674 return false;
675
676 /* Unknown data dependence. */
677 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
678 {
679 if (dump_enabled_p ())
680 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
681 "can't determine dependence between %T and %T\n",
682 DR_REF (dra), DR_REF (drb));
683 }
684 else if (dump_enabled_p ())
685 dump_printf_loc (MSG_NOTE, vect_location,
686 "determined dependence between %T and %T\n",
687 DR_REF (dra), DR_REF (drb));
688
689 return true;
690 }
691
692
693 /* Analyze dependences involved in the transform of SLP NODE. STORES
694 contain the vector of scalar stores of this instance if we are
695 disambiguating the loads. */
696
697 static bool
698 vect_slp_analyze_node_dependences (vec_info *vinfo, slp_tree node,
699 vec<stmt_vec_info> stores,
700 stmt_vec_info last_store_info)
701 {
702 /* This walks over all stmts involved in the SLP load/store done
703 in NODE verifying we can sink them up to the last stmt in the
704 group. */
705 stmt_vec_info last_access_info = vect_find_last_scalar_stmt_in_slp (node);
706 for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (node).length (); ++k)
707 {
708 stmt_vec_info access_info = SLP_TREE_SCALAR_STMTS (node)[k];
709 if (access_info == last_access_info)
710 continue;
711 data_reference *dr_a = STMT_VINFO_DATA_REF (access_info);
712 ao_ref ref;
713 bool ref_initialized_p = false;
714 for (gimple_stmt_iterator gsi = gsi_for_stmt (access_info->stmt);
715 gsi_stmt (gsi) != last_access_info->stmt; gsi_next (&gsi))
716 {
717 gimple *stmt = gsi_stmt (gsi);
718 if (! gimple_vuse (stmt)
719 || (DR_IS_READ (dr_a) && ! gimple_vdef (stmt)))
720 continue;
721
722 /* If we couldn't record a (single) data reference for this
723 stmt we have to resort to the alias oracle. */
724 stmt_vec_info stmt_info = vinfo->lookup_stmt (stmt);
725 data_reference *dr_b = STMT_VINFO_DATA_REF (stmt_info);
726 if (!dr_b)
727 {
728 /* We are moving a store or sinking a load - this means
729 we cannot use TBAA for disambiguation. */
730 if (!ref_initialized_p)
731 ao_ref_init (&ref, DR_REF (dr_a));
732 if (stmt_may_clobber_ref_p_1 (stmt, &ref, false)
733 || ref_maybe_used_by_stmt_p (stmt, &ref, false))
734 return false;
735 continue;
736 }
737
738 bool dependent = false;
739 /* If we run into a store of this same instance (we've just
740 marked those) then delay dependence checking until we run
741 into the last store because this is where it will have
742 been sunk to (and we verify if we can do that as well). */
743 if (gimple_visited_p (stmt))
744 {
745 if (stmt_info != last_store_info)
746 continue;
747 unsigned i;
748 stmt_vec_info store_info;
749 FOR_EACH_VEC_ELT (stores, i, store_info)
750 {
751 data_reference *store_dr = STMT_VINFO_DATA_REF (store_info);
752 ddr_p ddr = initialize_data_dependence_relation
753 (dr_a, store_dr, vNULL);
754 dependent
755 = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
756 free_dependence_relation (ddr);
757 if (dependent)
758 break;
759 }
760 }
761 else
762 {
763 ddr_p ddr = initialize_data_dependence_relation (dr_a,
764 dr_b, vNULL);
765 dependent = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
766 free_dependence_relation (ddr);
767 }
768 if (dependent)
769 return false;
770 }
771 }
772 return true;
773 }
774
775
776 /* Function vect_slp_analyze_instance_dependence.
777 
778 Examine all the data references of SLP instance INSTANCE in VINFO and
779 return TRUE if no data dependence prevents emitting its loads and
780 stores at the positions chosen for the vectorized statements. */
781
782 bool
783 vect_slp_analyze_instance_dependence (vec_info *vinfo, slp_instance instance)
784 {
785 DUMP_VECT_SCOPE ("vect_slp_analyze_instance_dependence");
786
787 /* The stores of this instance are at the root of the SLP tree. */
788 slp_tree store = SLP_INSTANCE_TREE (instance);
789 if (! STMT_VINFO_DATA_REF (SLP_TREE_SCALAR_STMTS (store)[0]))
790 store = NULL;
791
792 /* Verify we can sink stores to the vectorized stmt insert location. */
793 stmt_vec_info last_store_info = NULL;
794 if (store)
795 {
796 if (! vect_slp_analyze_node_dependences (vinfo, store, vNULL, NULL))
797 return false;
798
799 /* Mark stores in this instance and remember the last one. */
800 last_store_info = vect_find_last_scalar_stmt_in_slp (store);
801 for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (store).length (); ++k)
802 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, true);
803 }
804
805 bool res = true;
806
807 /* Verify we can sink loads to the vectorized stmt insert location,
808 special-casing stores of this instance. */
809 slp_tree load;
810 unsigned int i;
811 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load)
812 if (! vect_slp_analyze_node_dependences (vinfo, load,
813 store
814 ? SLP_TREE_SCALAR_STMTS (store)
815 : vNULL, last_store_info))
816 {
817 res = false;
818 break;
819 }
820
821 /* Unset the visited flag. */
822 if (store)
823 for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (store).length (); ++k)
824 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, false);
825
826 return res;
827 }
828
829 /* Record the base alignment guarantee given by DRB, which occurs
830 in STMT_INFO. */
831
832 static void
833 vect_record_base_alignment (vec_info *vinfo, stmt_vec_info stmt_info,
834 innermost_loop_behavior *drb)
835 {
836 bool existed;
837 innermost_loop_behavior *&entry
838 = vinfo->base_alignments.get_or_insert (drb->base_address, &existed);
839 if (!existed || entry->base_alignment < drb->base_alignment)
840 {
841 entry = drb;
842 if (dump_enabled_p ())
843 dump_printf_loc (MSG_NOTE, vect_location,
844 "recording new base alignment for %T\n"
845 " alignment: %d\n"
846 " misalignment: %d\n"
847 " based on: %G",
848 drb->base_address,
849 drb->base_alignment,
850 drb->base_misalignment,
851 stmt_info->stmt);
852 }
853 }
854
855 /* If the region we're going to vectorize is reached, all unconditional
856 data references occur at least once. We can therefore pool the base
857 alignment guarantees from each unconditional reference. Do this by
858 going through all the data references in VINFO and checking whether
859 the containing statement makes the reference unconditionally. If so,
860 record the alignment of the base address in VINFO so that it can be
861 used for all other references with the same base. */
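/* As an illustrative example, if the region contains an unconditional
   access *P whose analysis guarantees a 16-byte aligned base, then a
   second reference through the same base address that only executes
   conditionally can reuse that 16-byte guarantee, even though on its
   own it would not provide one.  */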
862
863 void
864 vect_record_base_alignments (vec_info *vinfo)
865 {
866 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
867 class loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
868 data_reference *dr;
869 unsigned int i;
870 FOR_EACH_VEC_ELT (vinfo->shared->datarefs, i, dr)
871 {
872 dr_vec_info *dr_info = vinfo->lookup_dr (dr);
873 stmt_vec_info stmt_info = dr_info->stmt;
874 if (!DR_IS_CONDITIONAL_IN_STMT (dr)
875 && STMT_VINFO_VECTORIZABLE (stmt_info)
876 && !STMT_VINFO_GATHER_SCATTER_P (stmt_info))
877 {
878 vect_record_base_alignment (vinfo, stmt_info, &DR_INNERMOST (dr));
879
880 /* If DR is nested in the loop that is being vectorized, we can also
881 record the alignment of the base wrt the outer loop. */
882 if (loop && nested_in_vect_loop_p (loop, stmt_info))
883 vect_record_base_alignment
884 (vinfo, stmt_info, &STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info));
885 }
886 }
887 }
888
889 /* Return the target alignment for the vectorized form of DR_INFO. */
890
891 static poly_uint64
892 vect_calculate_target_alignment (dr_vec_info *dr_info)
893 {
894 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
895 return targetm.vectorize.preferred_vector_alignment (vectype);
896 }
897
898 /* Function vect_compute_data_ref_alignment
899
900 Compute the misalignment of the data reference DR_INFO.
901
902 Output:
903 1. DR_MISALIGNMENT (DR_INFO) is defined.
904
905 FOR NOW: No analysis is actually performed. Misalignment is calculated
906 only for trivial cases. TODO. */
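/* As an illustrative example, with a 16-byte target alignment, an
   access a[i + 1] into a 16-byte-aligned array of 4-byte ints has
   base misalignment 0 and a constant offset (init) of 4, giving
   DR_MISALIGNMENT == (0 + 4) % 16 == 4 bytes.  */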
907
908 static void
909 vect_compute_data_ref_alignment (vec_info *vinfo, dr_vec_info *dr_info)
910 {
911 stmt_vec_info stmt_info = dr_info->stmt;
912 vec_base_alignments *base_alignments = &vinfo->base_alignments;
913 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
914 class loop *loop = NULL;
915 tree ref = DR_REF (dr_info->dr);
916 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
917
918 if (dump_enabled_p ())
919 dump_printf_loc (MSG_NOTE, vect_location,
920 "vect_compute_data_ref_alignment:\n");
921
922 if (loop_vinfo)
923 loop = LOOP_VINFO_LOOP (loop_vinfo);
924
925 /* Initialize misalignment to unknown. */
926 SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN);
927
928 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
929 return;
930
931 innermost_loop_behavior *drb = vect_dr_behavior (vinfo, dr_info);
932 bool step_preserves_misalignment_p;
933
934 poly_uint64 vector_alignment
935 = exact_div (vect_calculate_target_alignment (dr_info), BITS_PER_UNIT);
936 DR_TARGET_ALIGNMENT (dr_info) = vector_alignment;
937
938 /* If the main loop has peeled for alignment we have no way of knowing
939 whether the data accesses in the epilogues are aligned. We can't at
940 compile time answer the question whether we have entered the main loop or
941 not. Fixes PR 92351. */
942 if (loop_vinfo)
943 {
944 loop_vec_info orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
945 if (orig_loop_vinfo
946 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (orig_loop_vinfo) != 0)
947 return;
948 }
949
950 unsigned HOST_WIDE_INT vect_align_c;
951 if (!vector_alignment.is_constant (&vect_align_c))
952 return;
953
954 /* No step for BB vectorization. */
955 if (!loop)
956 {
957 gcc_assert (integer_zerop (drb->step));
958 step_preserves_misalignment_p = true;
959 }
960
961 /* In case the dataref is in an inner-loop of the loop that is being
962 vectorized (LOOP), we use the base and misalignment information
963 relative to the outer-loop (LOOP). This is ok only if the misalignment
964 stays the same throughout the execution of the inner-loop, which is why
965 we have to check that the stride of the dataref in the inner-loop is a
966 multiple of the vector alignment. */
967 else if (nested_in_vect_loop_p (loop, stmt_info))
968 {
969 step_preserves_misalignment_p
970 = (DR_STEP_ALIGNMENT (dr_info->dr) % vect_align_c) == 0;
971
972 if (dump_enabled_p ())
973 {
974 if (step_preserves_misalignment_p)
975 dump_printf_loc (MSG_NOTE, vect_location,
976 "inner step divides the vector alignment.\n");
977 else
978 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
979 "inner step doesn't divide the vector"
980 " alignment.\n");
981 }
982 }
983
984 /* Similarly we can only use base and misalignment information relative to
985 an innermost loop if the misalignment stays the same throughout the
986 execution of the loop. As above, this is the case if the step of the
987 dataref times the vectorization factor is a multiple of the alignment. */
988 else
989 {
990 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
991 step_preserves_misalignment_p
992 = multiple_p (DR_STEP_ALIGNMENT (dr_info->dr) * vf, vect_align_c);
993
994 if (!step_preserves_misalignment_p && dump_enabled_p ())
995 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
996 "step doesn't divide the vector alignment.\n");
997 }
998
999 unsigned int base_alignment = drb->base_alignment;
1000 unsigned int base_misalignment = drb->base_misalignment;
1001
1002 /* Calculate the maximum of the pooled base address alignment and the
1003 alignment that we can compute for DR itself. */
1004 innermost_loop_behavior **entry = base_alignments->get (drb->base_address);
1005 if (entry && base_alignment < (*entry)->base_alignment)
1006 {
1007 base_alignment = (*entry)->base_alignment;
1008 base_misalignment = (*entry)->base_misalignment;
1009 }
1010
1011 if (drb->offset_alignment < vect_align_c
1012 || !step_preserves_misalignment_p
1013 /* We need to know whether the step wrt the vectorized loop is
1014 negative when computing the starting misalignment below. */
1015 || TREE_CODE (drb->step) != INTEGER_CST)
1016 {
1017 if (dump_enabled_p ())
1018 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1019 "Unknown alignment for access: %T\n", ref);
1020 return;
1021 }
1022
1023 if (base_alignment < vect_align_c)
1024 {
1025 unsigned int max_alignment;
1026 tree base = get_base_for_alignment (drb->base_address, &max_alignment);
1027 if (max_alignment < vect_align_c
1028 || !vect_can_force_dr_alignment_p (base,
1029 vect_align_c * BITS_PER_UNIT))
1030 {
1031 if (dump_enabled_p ())
1032 dump_printf_loc (MSG_NOTE, vect_location,
1033 "can't force alignment of ref: %T\n", ref);
1034 return;
1035 }
1036
1037 /* Force the alignment of the decl.
1038 NOTE: This is the only change to the code we make during
1039 the analysis phase, before deciding to vectorize the loop. */
1040 if (dump_enabled_p ())
1041 dump_printf_loc (MSG_NOTE, vect_location,
1042 "force alignment of %T\n", ref);
1043
1044 dr_info->base_decl = base;
1045 dr_info->base_misaligned = true;
1046 base_misalignment = 0;
1047 }
1048 poly_int64 misalignment
1049 = base_misalignment + wi::to_poly_offset (drb->init).force_shwi ();
1050
1051 /* If this is a backward running DR then the first access in the larger
1052 vectype is actually N-1 elements before the address in the DR.
1053 Adjust misalign accordingly. */
1054 if (tree_int_cst_sgn (drb->step) < 0)
1055 /* PLUS because STEP is negative. */
1056 misalignment += ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
1057 * TREE_INT_CST_LOW (drb->step));
1058
1059 unsigned int const_misalignment;
1060 if (!known_misalignment (misalignment, vect_align_c, &const_misalignment))
1061 {
1062 if (dump_enabled_p ())
1063 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1064 "Non-constant misalignment for access: %T\n", ref);
1065 return;
1066 }
1067
1068 SET_DR_MISALIGNMENT (dr_info, const_misalignment);
1069
1070 if (dump_enabled_p ())
1071 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1072 "misalign = %d bytes of ref %T\n",
1073 DR_MISALIGNMENT (dr_info), ref);
1074
1075 return;
1076 }
1077
1078 /* Function vect_update_misalignment_for_peel.
1079 Sets DR_INFO's misalignment
1080 - to 0 if it has the same alignment as DR_PEEL_INFO,
1081 - to the misalignment computed using NPEEL if DR_INFO's misalignment is known,
1082 - to -1 (unknown) otherwise.
1083
1084 DR_INFO - the data reference whose misalignment is to be adjusted.
1085 DR_PEEL_INFO - the data reference whose misalignment is being made
1086 zero in the vector loop by the peel.
1087 NPEEL - the number of iterations in the peel loop if the misalignment
1088 of DR_PEEL_INFO is known at compile time. */
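/* As an illustrative example, peeling NPEEL == 3 iterations for a
   reference with DR_MISALIGNMENT 4, DR_STEP 4 and a 16-byte target
   alignment gives (4 + 3 * 4) & 15 == 0, i.e. the reference becomes
   aligned by the peel as well.  */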
1089
1090 static void
1091 vect_update_misalignment_for_peel (dr_vec_info *dr_info,
1092 dr_vec_info *dr_peel_info, int npeel)
1093 {
1094 unsigned int i;
1095 vec<dr_p> same_aligned_drs;
1096 struct data_reference *current_dr;
1097 stmt_vec_info peel_stmt_info = dr_peel_info->stmt;
1098
1099 /* It can be assumed that if dr_info has the same alignment as dr_peel,
1100 it is aligned in the vector loop. */
1101 same_aligned_drs = STMT_VINFO_SAME_ALIGN_REFS (peel_stmt_info);
1102 FOR_EACH_VEC_ELT (same_aligned_drs, i, current_dr)
1103 {
1104 if (current_dr != dr_info->dr)
1105 continue;
1106 gcc_assert (!known_alignment_for_access_p (dr_info)
1107 || !known_alignment_for_access_p (dr_peel_info)
1108 || (DR_MISALIGNMENT (dr_info)
1109 == DR_MISALIGNMENT (dr_peel_info)));
1110 SET_DR_MISALIGNMENT (dr_info, 0);
1111 return;
1112 }
1113
1114 unsigned HOST_WIDE_INT alignment;
1115 if (DR_TARGET_ALIGNMENT (dr_info).is_constant (&alignment)
1116 && known_alignment_for_access_p (dr_info)
1117 && known_alignment_for_access_p (dr_peel_info))
1118 {
1119 int misal = DR_MISALIGNMENT (dr_info);
1120 misal += npeel * TREE_INT_CST_LOW (DR_STEP (dr_info->dr));
1121 misal &= alignment - 1;
1122 SET_DR_MISALIGNMENT (dr_info, misal);
1123 return;
1124 }
1125
1126 if (dump_enabled_p ())
1127 dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment " \
1128 "to unknown (-1).\n");
1129 SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN);
1130 }
1131
1132
1133 /* Function verify_data_ref_alignment
1134
1135 Return TRUE if DR_INFO can be handled with respect to alignment. */
1136
1137 static opt_result
1138 verify_data_ref_alignment (vec_info *vinfo, dr_vec_info *dr_info)
1139 {
1140 enum dr_alignment_support supportable_dr_alignment
1141 = vect_supportable_dr_alignment (vinfo, dr_info, false);
1142 if (!supportable_dr_alignment)
1143 return opt_result::failure_at
1144 (dr_info->stmt->stmt,
1145 DR_IS_READ (dr_info->dr)
1146 ? "not vectorized: unsupported unaligned load: %T\n"
1147 : "not vectorized: unsupported unaligned store: %T\n",
1148 DR_REF (dr_info->dr));
1149
1150 if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
1151 dump_printf_loc (MSG_NOTE, vect_location,
1152 "Vectorizing an unaligned access.\n");
1153
1154 return opt_result::success ();
1155 }
1156
1157 /* Function vect_verify_datarefs_alignment
1158
1159 Return TRUE if all data references in the loop can be
1160 handled with respect to alignment. */
1161
1162 opt_result
1163 vect_verify_datarefs_alignment (loop_vec_info vinfo)
1164 {
1165 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
1166 struct data_reference *dr;
1167 unsigned int i;
1168
1169 FOR_EACH_VEC_ELT (datarefs, i, dr)
1170 {
1171 dr_vec_info *dr_info = vinfo->lookup_dr (dr);
1172 stmt_vec_info stmt_info = dr_info->stmt;
1173
1174 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1175 continue;
1176
1177 /* For interleaving, only the alignment of the first access matters. */
1178 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1179 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1180 continue;
1181
1182 /* Strided accesses perform only component accesses, alignment is
1183 irrelevant for them. */
1184 if (STMT_VINFO_STRIDED_P (stmt_info)
1185 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1186 continue;
1187
1188 opt_result res = verify_data_ref_alignment (vinfo, dr_info);
1189 if (!res)
1190 return res;
1191 }
1192
1193 return opt_result::success ();
1194 }
1195
1196 /* Given a memory reference EXP, return whether its alignment is less
1197 than its size. */
1198
1199 static bool
1200 not_size_aligned (tree exp)
1201 {
1202 if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
1203 return true;
1204
1205 return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
1206 > get_object_alignment (exp));
1207 }
1208
1209 /* Function vector_alignment_reachable_p
1210
1211 Return true if vector alignment for DR_INFO is reachable by peeling
1212 a few loop iterations. Return false otherwise. */
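/* As an illustrative example, with V8HI vectors and a misalignment of
   8 bytes (4 elements), the prologue needs 8 - 4 = 4 peeled iterations;
   that is fine for an interleaving group of size 2 but not for one of
   size 3, since the group must stay intact across the peel.  */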
1213
1214 static bool
1215 vector_alignment_reachable_p (dr_vec_info *dr_info)
1216 {
1217 stmt_vec_info stmt_info = dr_info->stmt;
1218 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1219
1220 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1221 {
1222 /* For interleaved accesses we peel only if the number of iterations in
1223 the prolog loop ({VF - misalignment}) is a multiple of the
1224 number of interleaved accesses. */
1225 int elem_size, mis_in_elements;
1226
1227 /* FORNOW: handle only known alignment. */
1228 if (!known_alignment_for_access_p (dr_info))
1229 return false;
1230
1231 poly_uint64 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1232 poly_uint64 vector_size = GET_MODE_SIZE (TYPE_MODE (vectype));
1233 elem_size = vector_element_size (vector_size, nelements);
1234 mis_in_elements = DR_MISALIGNMENT (dr_info) / elem_size;
1235
1236 if (!multiple_p (nelements - mis_in_elements, DR_GROUP_SIZE (stmt_info)))
1237 return false;
1238 }
1239
1240 /* If misalignment is known at the compile time then allow peeling
1241 only if natural alignment is reachable through peeling. */
1242 if (known_alignment_for_access_p (dr_info) && !aligned_access_p (dr_info))
1243 {
1244 HOST_WIDE_INT elmsize =
1245 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1246 if (dump_enabled_p ())
1247 {
1248 dump_printf_loc (MSG_NOTE, vect_location,
1249 "data size = %wd. misalignment = %d.\n", elmsize,
1250 DR_MISALIGNMENT (dr_info));
1251 }
1252 if (DR_MISALIGNMENT (dr_info) % elmsize)
1253 {
1254 if (dump_enabled_p ())
1255 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1256 "data size does not divide the misalignment.\n");
1257 return false;
1258 }
1259 }
1260
1261 if (!known_alignment_for_access_p (dr_info))
1262 {
1263 tree type = TREE_TYPE (DR_REF (dr_info->dr));
1264 bool is_packed = not_size_aligned (DR_REF (dr_info->dr));
1265 if (dump_enabled_p ())
1266 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1267 "Unknown misalignment, %snaturally aligned\n",
1268 is_packed ? "not " : "");
1269 return targetm.vectorize.vector_alignment_reachable (type, is_packed);
1270 }
1271
1272 return true;
1273 }
1274
1275
1276 /* Calculate the cost of the memory access represented by DR_INFO. */
1277
1278 static void
1279 vect_get_data_access_cost (vec_info *vinfo, dr_vec_info *dr_info,
1280 unsigned int *inside_cost,
1281 unsigned int *outside_cost,
1282 stmt_vector_for_cost *body_cost_vec,
1283 stmt_vector_for_cost *prologue_cost_vec)
1284 {
1285 stmt_vec_info stmt_info = dr_info->stmt;
1286 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
1287 int ncopies;
1288
1289 if (PURE_SLP_STMT (stmt_info))
1290 ncopies = 1;
1291 else
1292 ncopies = vect_get_num_copies (loop_vinfo, STMT_VINFO_VECTYPE (stmt_info));
1293
1294 if (DR_IS_READ (dr_info->dr))
1295 vect_get_load_cost (vinfo, stmt_info, ncopies, true, inside_cost,
1296 outside_cost, prologue_cost_vec, body_cost_vec, false);
1297 else
1298 vect_get_store_cost (vinfo, stmt_info, ncopies, inside_cost, body_cost_vec);
1299
1300 if (dump_enabled_p ())
1301 dump_printf_loc (MSG_NOTE, vect_location,
1302 "vect_get_data_access_cost: inside_cost = %d, "
1303 "outside_cost = %d.\n", *inside_cost, *outside_cost);
1304 }
1305
1306
1307 typedef struct _vect_peel_info
1308 {
1309 dr_vec_info *dr_info;
1310 int npeel;
1311 unsigned int count;
1312 } *vect_peel_info;
1313
1314 typedef struct _vect_peel_extended_info
1315 {
1316 vec_info *vinfo;
1317 struct _vect_peel_info peel_info;
1318 unsigned int inside_cost;
1319 unsigned int outside_cost;
1320 } *vect_peel_extended_info;
1321
1322
1323 /* Peeling hashtable helpers. */
1324
1325 struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
1326 {
1327 static inline hashval_t hash (const _vect_peel_info *);
1328 static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
1329 };
1330
1331 inline hashval_t
1332 peel_info_hasher::hash (const _vect_peel_info *peel_info)
1333 {
1334 return (hashval_t) peel_info->npeel;
1335 }
1336
1337 inline bool
1338 peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
1339 {
1340 return (a->npeel == b->npeel);
1341 }
1342
1343
1344 /* Insert DR_INFO into peeling hash table with NPEEL as key. */
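/* As an illustrative example, if three data references would all become
   aligned by peeling two iterations, the table entry keyed by NPEEL == 2
   accumulates a count of 3; with the unlimited cost model the entry with
   the highest count wins, otherwise the per-entry costs computed below
   decide.  */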
1345
1346 static void
1347 vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
1348 loop_vec_info loop_vinfo, dr_vec_info *dr_info,
1349 int npeel)
1350 {
1351 struct _vect_peel_info elem, *slot;
1352 _vect_peel_info **new_slot;
1353 bool supportable_dr_alignment
1354 = vect_supportable_dr_alignment (loop_vinfo, dr_info, true);
1355
1356 elem.npeel = npeel;
1357 slot = peeling_htab->find (&elem);
1358 if (slot)
1359 slot->count++;
1360 else
1361 {
1362 slot = XNEW (struct _vect_peel_info);
1363 slot->npeel = npeel;
1364 slot->dr_info = dr_info;
1365 slot->count = 1;
1366 new_slot = peeling_htab->find_slot (slot, INSERT);
1367 *new_slot = slot;
1368 }
1369
1370 if (!supportable_dr_alignment
1371 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1372 slot->count += VECT_MAX_COST;
1373 }
1374
1375
1376 /* Traverse peeling hash table to find peeling option that aligns maximum
1377 number of data accesses. */
1378
1379 int
1380 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1381 _vect_peel_extended_info *max)
1382 {
1383 vect_peel_info elem = *slot;
1384
1385 if (elem->count > max->peel_info.count
1386 || (elem->count == max->peel_info.count
1387 && max->peel_info.npeel > elem->npeel))
1388 {
1389 max->peel_info.npeel = elem->npeel;
1390 max->peel_info.count = elem->count;
1391 max->peel_info.dr_info = elem->dr_info;
1392 }
1393
1394 return 1;
1395 }
1396
1397 /* Get the costs of peeling NPEEL iterations for LOOP_VINFO, checking
1398 data access costs for all data refs. If UNKNOWN_MISALIGNMENT is true,
1399 we assume DR0_INFO's misalignment will be zero after peeling. */
1400
1401 static void
1402 vect_get_peeling_costs_all_drs (loop_vec_info loop_vinfo,
1403 dr_vec_info *dr0_info,
1404 unsigned int *inside_cost,
1405 unsigned int *outside_cost,
1406 stmt_vector_for_cost *body_cost_vec,
1407 stmt_vector_for_cost *prologue_cost_vec,
1408 unsigned int npeel,
1409 bool unknown_misalignment)
1410 {
1411 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1412 unsigned i;
1413 data_reference *dr;
1414
1415 FOR_EACH_VEC_ELT (datarefs, i, dr)
1416 {
1417 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1418 stmt_vec_info stmt_info = dr_info->stmt;
1419 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1420 continue;
1421
1422 /* For interleaving, only the alignment of the first access
1423 matters. */
1424 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1425 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1426 continue;
1427
1428 /* Strided accesses perform only component accesses, alignment is
1429 irrelevant for them. */
1430 if (STMT_VINFO_STRIDED_P (stmt_info)
1431 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1432 continue;
1433
1434 int save_misalignment;
1435 save_misalignment = DR_MISALIGNMENT (dr_info);
1436 if (npeel == 0)
1437 ;
1438 else if (unknown_misalignment && dr_info == dr0_info)
1439 SET_DR_MISALIGNMENT (dr_info, 0);
1440 else
1441 vect_update_misalignment_for_peel (dr_info, dr0_info, npeel);
1442 vect_get_data_access_cost (loop_vinfo, dr_info, inside_cost, outside_cost,
1443 body_cost_vec, prologue_cost_vec);
1444 SET_DR_MISALIGNMENT (dr_info, save_misalignment);
1445 }
1446 }
1447
1448 /* Traverse peeling hash table and calculate cost for each peeling option.
1449 Find the one with the lowest cost. */
1450
1451 int
1452 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1453 _vect_peel_extended_info *min)
1454 {
1455 vect_peel_info elem = *slot;
1456 int dummy;
1457 unsigned int inside_cost = 0, outside_cost = 0;
1458 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (min->vinfo);
1459 stmt_vector_for_cost prologue_cost_vec, body_cost_vec,
1460 epilogue_cost_vec;
1461
1462 prologue_cost_vec.create (2);
1463 body_cost_vec.create (2);
1464 epilogue_cost_vec.create (2);
1465
1466 vect_get_peeling_costs_all_drs (loop_vinfo, elem->dr_info, &inside_cost,
1467 &outside_cost, &body_cost_vec,
1468 &prologue_cost_vec, elem->npeel, false);
1469
1470 body_cost_vec.release ();
1471
1472 outside_cost += vect_get_known_peeling_cost
1473 (loop_vinfo, elem->npeel, &dummy,
1474 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1475 &prologue_cost_vec, &epilogue_cost_vec);
1476
1477 /* Prologue and epilogue costs are added to the target model later.
1478 These costs depend only on the scalar iteration cost, the
1479 number of peeling iterations finally chosen, and the number of
1480 misaligned statements. So discard the information found here. */
1481 prologue_cost_vec.release ();
1482 epilogue_cost_vec.release ();
1483
1484 if (inside_cost < min->inside_cost
1485 || (inside_cost == min->inside_cost
1486 && outside_cost < min->outside_cost))
1487 {
1488 min->inside_cost = inside_cost;
1489 min->outside_cost = outside_cost;
1490 min->peel_info.dr_info = elem->dr_info;
1491 min->peel_info.npeel = elem->npeel;
1492 min->peel_info.count = elem->count;
1493 }
1494
1495 return 1;
1496 }
1497
1498
1499 /* Choose best peeling option by traversing peeling hash table and either
1500 choosing an option with the lowest cost (if cost model is enabled) or the
1501 option that aligns as many accesses as possible. */
1502
1503 static struct _vect_peel_extended_info
1504 vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
1505 loop_vec_info loop_vinfo)
1506 {
1507 struct _vect_peel_extended_info res;
1508
1509 res.peel_info.dr_info = NULL;
1510 res.vinfo = loop_vinfo;
1511
1512 if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1513 {
1514 res.inside_cost = INT_MAX;
1515 res.outside_cost = INT_MAX;
1516 peeling_htab->traverse <_vect_peel_extended_info *,
1517 vect_peeling_hash_get_lowest_cost> (&res);
1518 }
1519 else
1520 {
1521 res.peel_info.count = 0;
1522 peeling_htab->traverse <_vect_peel_extended_info *,
1523 vect_peeling_hash_get_most_frequent> (&res);
1524 res.inside_cost = 0;
1525 res.outside_cost = 0;
1526 }
1527
1528 return res;
1529 }
1530
1531 /* Return true if peeling by NPEEL iterations is supported. */
1532
1533 static bool
1534 vect_peeling_supportable (loop_vec_info loop_vinfo, dr_vec_info *dr0_info,
1535 unsigned npeel)
1536 {
1537 unsigned i;
1538 struct data_reference *dr = NULL;
1539 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1540 enum dr_alignment_support supportable_dr_alignment;
1541
1542 /* Ensure that all data refs can be vectorized after the peel. */
1543 FOR_EACH_VEC_ELT (datarefs, i, dr)
1544 {
1545 int save_misalignment;
1546
1547 if (dr == dr0_info->dr)
1548 continue;
1549
1550 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1551 stmt_vec_info stmt_info = dr_info->stmt;
1552 /* For interleaving, only the alignment of the first access
1553 matters. */
1554 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1555 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1556 continue;
1557
1558 /* Strided accesses perform only component accesses, alignment is
1559 irrelevant for them. */
1560 if (STMT_VINFO_STRIDED_P (stmt_info)
1561 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1562 continue;
1563
1564 save_misalignment = DR_MISALIGNMENT (dr_info);
1565 vect_update_misalignment_for_peel (dr_info, dr0_info, npeel);
1566 supportable_dr_alignment
1567 = vect_supportable_dr_alignment (loop_vinfo, dr_info, false);
1568 SET_DR_MISALIGNMENT (dr_info, save_misalignment);
1569
1570 if (!supportable_dr_alignment)
1571 return false;
1572 }
1573
1574 return true;
1575 }
1576
1577 /* Function vect_enhance_data_refs_alignment
1578
1579 This pass will use loop versioning and loop peeling in order to enhance
1580 the alignment of data references in the loop.
1581
1582 FOR NOW: we assume that whatever versioning/peeling takes place, only the
1583 original loop is to be vectorized. Any other loops that are created by
1584 the transformations performed in this pass are not supposed to be
1585 vectorized. This restriction will be relaxed.
1586
1587 This pass will require a cost model to guide it whether to apply peeling
1588 or versioning or a combination of the two. For example, the scheme that
1589 Intel uses when given a loop with several memory accesses is as follows:
1590 choose one memory access ('p') whose alignment you want to force by doing
1591 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
1592 other accesses are not necessarily aligned, or (2) use loop versioning to
1593 generate one loop in which all accesses are aligned, and another loop in
1594 which only 'p' is necessarily aligned.
1595
1596 ("Automatic Intra-Register Vectorization for the Intel Architecture",
1597 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Xinmin Tian, International
1598 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1599
1600 Devising a cost model is the most critical aspect of this work. It will
1601 guide us on which access to peel for, whether to use loop versioning, how
1602 many versions to create, etc. The cost model will probably consist of
1603 generic considerations as well as target specific considerations (on
1604 powerpc for example, misaligned stores are more painful than misaligned
1605 loads).
1606
1607 Here are the general steps involved in alignment enhancements:
1608
1609 -- original loop, before alignment analysis:
1610 for (i=0; i<N; i++){
1611 x = q[i]; # DR_MISALIGNMENT(q) = unknown
1612 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1613 }
1614
1615 -- After vect_compute_data_refs_alignment:
1616 for (i=0; i<N; i++){
1617 x = q[i]; # DR_MISALIGNMENT(q) = 3
1618 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1619 }
1620
1621 -- Possibility 1: we do loop versioning:
1622 if (p is aligned) {
1623 for (i=0; i<N; i++){ # loop 1A
1624 x = q[i]; # DR_MISALIGNMENT(q) = 3
1625 p[i] = y; # DR_MISALIGNMENT(p) = 0
1626 }
1627 }
1628 else {
1629 for (i=0; i<N; i++){ # loop 1B
1630 x = q[i]; # DR_MISALIGNMENT(q) = 3
1631 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1632 }
1633 }
1634
1635 -- Possibility 2: we do loop peeling:
1636 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1637 x = q[i];
1638 p[i] = y;
1639 }
1640 for (i = 3; i < N; i++){ # loop 2A
1641 x = q[i]; # DR_MISALIGNMENT(q) = 0
1642 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1643 }
1644
1645 -- Possibility 3: combination of loop peeling and versioning:
1646 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1647 x = q[i];
1648 p[i] = y;
1649 }
1650 if (p is aligned) {
1651 for (i = 3; i<N; i++){ # loop 3A
1652 x = q[i]; # DR_MISALIGNMENT(q) = 0
1653 p[i] = y; # DR_MISALIGNMENT(p) = 0
1654 }
1655 }
1656 else {
1657 for (i = 3; i<N; i++){ # loop 3B
1658 x = q[i]; # DR_MISALIGNMENT(q) = 0
1659 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1660 }
1661 }
1662
1663 These loops are later passed to loop_transform to be vectorized. The
1664 vectorizer will use the alignment information to guide the transformation
1665 (whether to generate regular loads/stores, or with special handling for
1666 misalignment). */
1667
1668 opt_result
1669 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1670 {
1671 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1672 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1673 enum dr_alignment_support supportable_dr_alignment;
1674 dr_vec_info *first_store = NULL;
1675 dr_vec_info *dr0_info = NULL;
1676 struct data_reference *dr;
1677 unsigned int i, j;
1678 bool do_peeling = false;
1679 bool do_versioning = false;
1680 unsigned int npeel = 0;
1681 bool one_misalignment_known = false;
1682 bool one_misalignment_unknown = false;
1683 bool one_dr_unsupportable = false;
1684 dr_vec_info *unsupportable_dr_info = NULL;
1685 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1686 unsigned possible_npeel_number = 1;
1687 tree vectype;
1688 unsigned int mis, same_align_drs_max = 0;
1689 hash_table<peel_info_hasher> peeling_htab (1);
1690
1691 DUMP_VECT_SCOPE ("vect_enhance_data_refs_alignment");
1692
1693 /* Reset data so we can safely be called multiple times. */
1694 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1695 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
1696
1697 /* While cost model enhancements are expected in the future, the high level
1698 view of the code at this time is as follows:
1699
1700 A) If there is a misaligned access then see if peeling to align
1701 this access can make all data references satisfy
1702 vect_supportable_dr_alignment. If so, update data structures
1703 as needed and return true.
1704
1705 B) If peeling wasn't possible and there is a data reference with an
1706 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1707 then see if loop versioning checks can be used to make all data
1708 references satisfy vect_supportable_dr_alignment. If so, update
1709 data structures as needed and return true.
1710
1711 C) If neither peeling nor versioning were successful then return false if
1712 any data reference does not satisfy vect_supportable_dr_alignment.
1713
1714 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1715
1716 Note, Possibility 3 above (which is peeling and versioning together) is not
1717 being done at this time. */
1718
1719 /* (1) Peeling to force alignment. */
1720
1721 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1722 Considerations:
1723 + How many accesses will become aligned due to the peeling
1724 - How many accesses will become unaligned due to the peeling,
1725 and the cost of misaligned accesses.
1726 - The cost of peeling (the extra runtime checks, the increase
1727 in code size). */
1728
1729 FOR_EACH_VEC_ELT (datarefs, i, dr)
1730 {
1731 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1732 stmt_vec_info stmt_info = dr_info->stmt;
1733
1734 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1735 continue;
1736
1737 /* For interleaving, only the alignment of the first access
1738 matters. */
1739 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1740 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1741 continue;
1742
1743 /* For scatter-gather or invariant accesses there is nothing
1744 to enhance. */
1745 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
1746 || integer_zerop (DR_STEP (dr)))
1747 continue;
1748
1749 /* Strided accesses perform only component accesses, alignment is
1750 irrelevant for them. */
1751 if (STMT_VINFO_STRIDED_P (stmt_info)
1752 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1753 continue;
1754
1755 supportable_dr_alignment
1756 = vect_supportable_dr_alignment (loop_vinfo, dr_info, true);
1757 do_peeling = vector_alignment_reachable_p (dr_info);
1758 if (do_peeling)
1759 {
1760 if (known_alignment_for_access_p (dr_info))
1761 {
1762 unsigned int npeel_tmp = 0;
1763 bool negative = tree_int_cst_compare (DR_STEP (dr),
1764 size_zero_node) < 0;
1765
1766 vectype = STMT_VINFO_VECTYPE (stmt_info);
1767 /* If known_alignment_for_access_p then we have set
1768 DR_MISALIGNMENT, which is only done if we know it at compile
1769 time, so it is safe to assume the target alignment is
1770 constant.  */
1771 unsigned int target_align =
1772 DR_TARGET_ALIGNMENT (dr_info).to_constant ();
1773 unsigned int dr_size = vect_get_scalar_dr_size (dr_info);
1774 mis = (negative
1775 ? DR_MISALIGNMENT (dr_info)
1776 : -DR_MISALIGNMENT (dr_info));
1777 if (DR_MISALIGNMENT (dr_info) != 0)
1778 npeel_tmp = (mis & (target_align - 1)) / dr_size;
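              /* As a concrete illustration (all values assumed for the sake
                 of example): with a positive step, a 16-byte target
                 alignment, 4-byte elements and a DR_MISALIGNMENT of 4 bytes,
                 MIS is set to -4 (as an unsigned value), so (MIS & 15) is 12
                 and NPEEL_TMP becomes 12 / 4 = 3; peeling 3 scalar iterations
                 advances the access by 12 bytes onto the next 16-byte
                 boundary.  */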
1779
1780 /* For multiple types, it is possible that the bigger type access
1781 will have more than one peeling option. E.g., a loop with two
1782 types: one of size (vector size / 4), and the other one of
1783 size (vector size / 8). The vectorization factor will be 8.
1784 If both accesses are misaligned by 3, the first one needs one
1785 scalar iteration to be aligned, and the second one needs 5.
1786 But the first one will also be aligned by peeling 5 scalar
1787 iterations, and in that case both accesses will be aligned.
1788 Hence, besides the immediate peeling amount, we also want to
1789 try adding full vector sizes, as long as we do not exceed the
1790 vectorization factor.
1791 We do this automatically for the cost model, since we calculate
1792 the cost for every peeling option. */
1793 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1794 {
1795 poly_uint64 nscalars = (STMT_SLP_TYPE (stmt_info)
1796 ? vf * DR_GROUP_SIZE (stmt_info) : vf);
1797 possible_npeel_number
1798 = vect_get_num_vectors (nscalars, vectype);
1799
1800 /* NPEEL_TMP is 0 when there is no misalignment, but in that
1801 case also allow peeling NELEMENTS. */
1802 if (DR_MISALIGNMENT (dr_info) == 0)
1803 possible_npeel_number++;
1804 }
1805
1806 /* Save info about DR in the hash table. Also include peeling
1807 amounts according to the explanation above. */
1808 for (j = 0; j < possible_npeel_number; j++)
1809 {
1810 vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
1811 dr_info, npeel_tmp);
1812 npeel_tmp += target_align / dr_size;
1813 }
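              /* Continuing the assumed example above (16-byte target
                 alignment, 4-byte elements, NPEEL_TMP = 3): if the unlimited
                 cost model considers two vectors' worth of scalars, the hash
                 table receives the candidate peeling amounts 3 and
                 3 + 16/4 = 7, and the cheapest recorded candidate is picked
                 later by vect_peeling_hash_choose_best_peeling.  */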
1814
1815 one_misalignment_known = true;
1816 }
1817 else
1818 {
1819 /* If we don't know any misalignment values, we prefer
1820 peeling for the data-ref that has the maximum number of data-refs
1821 with the same alignment, unless the target prefers to align
1822 stores over loads. */
1823 unsigned same_align_drs
1824 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1825 if (!dr0_info
1826 || same_align_drs_max < same_align_drs)
1827 {
1828 same_align_drs_max = same_align_drs;
1829 dr0_info = dr_info;
1830 }
1831 /* For data-refs with the same number of related
1832 accesses prefer the one where the misalign
1833 computation will be invariant in the outermost loop. */
1834 else if (same_align_drs_max == same_align_drs)
1835 {
1836 class loop *ivloop0, *ivloop;
1837 ivloop0 = outermost_invariant_loop_for_expr
1838 (loop, DR_BASE_ADDRESS (dr0_info->dr));
1839 ivloop = outermost_invariant_loop_for_expr
1840 (loop, DR_BASE_ADDRESS (dr));
1841 if ((ivloop && !ivloop0)
1842 || (ivloop && ivloop0
1843 && flow_loop_nested_p (ivloop, ivloop0)))
1844 dr0_info = dr_info;
1845 }
1846
1847 one_misalignment_unknown = true;
1848
1849 /* Check for data refs with unsupportable alignment that
1850 can be peeled. */
1851 if (!supportable_dr_alignment)
1852 {
1853 one_dr_unsupportable = true;
1854 unsupportable_dr_info = dr_info;
1855 }
1856
1857 if (!first_store && DR_IS_WRITE (dr))
1858 first_store = dr_info;
1859 }
1860 }
1861 else
1862 {
1863 if (!aligned_access_p (dr_info))
1864 {
1865 if (dump_enabled_p ())
1866 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1867 "vector alignment may not be reachable\n");
1868 break;
1869 }
1870 }
1871 }
1872
1873 /* Check if we can possibly peel the loop. */
1874 if (!vect_can_advance_ivs_p (loop_vinfo)
1875 || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))
1876 || loop->inner)
1877 do_peeling = false;
1878
1879 struct _vect_peel_extended_info peel_for_known_alignment;
1880 struct _vect_peel_extended_info peel_for_unknown_alignment;
1881 struct _vect_peel_extended_info best_peel;
1882
1883 peel_for_unknown_alignment.inside_cost = INT_MAX;
1884 peel_for_unknown_alignment.outside_cost = INT_MAX;
1885 peel_for_unknown_alignment.peel_info.count = 0;
1886
1887 if (do_peeling
1888 && one_misalignment_unknown)
1889 {
1890 /* Check if the target prefers aligning stores over loads, i.e., if
1891 misaligned stores are more expensive than misaligned loads (taking
1892 drs with the same alignment into account). */
1893 unsigned int load_inside_cost = 0;
1894 unsigned int load_outside_cost = 0;
1895 unsigned int store_inside_cost = 0;
1896 unsigned int store_outside_cost = 0;
1897 unsigned int estimated_npeels = vect_vf_for_cost (loop_vinfo) / 2;
1898
1899 stmt_vector_for_cost dummy;
1900 dummy.create (2);
1901 vect_get_peeling_costs_all_drs (loop_vinfo, dr0_info,
1902 &load_inside_cost,
1903 &load_outside_cost,
1904 &dummy, &dummy, estimated_npeels, true);
1905 dummy.release ();
1906
1907 if (first_store)
1908 {
1909 dummy.create (2);
1910 vect_get_peeling_costs_all_drs (loop_vinfo, first_store,
1911 &store_inside_cost,
1912 &store_outside_cost,
1913 &dummy, &dummy,
1914 estimated_npeels, true);
1915 dummy.release ();
1916 }
1917 else
1918 {
1919 store_inside_cost = INT_MAX;
1920 store_outside_cost = INT_MAX;
1921 }
1922
1923 if (load_inside_cost > store_inside_cost
1924 || (load_inside_cost == store_inside_cost
1925 && load_outside_cost > store_outside_cost))
1926 {
1927 dr0_info = first_store;
1928 peel_for_unknown_alignment.inside_cost = store_inside_cost;
1929 peel_for_unknown_alignment.outside_cost = store_outside_cost;
1930 }
1931 else
1932 {
1933 peel_for_unknown_alignment.inside_cost = load_inside_cost;
1934 peel_for_unknown_alignment.outside_cost = load_outside_cost;
1935 }
1936
1937 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1938 prologue_cost_vec.create (2);
1939 epilogue_cost_vec.create (2);
1940
1941 int dummy2;
1942 peel_for_unknown_alignment.outside_cost += vect_get_known_peeling_cost
1943 (loop_vinfo, estimated_npeels, &dummy2,
1944 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1945 &prologue_cost_vec, &epilogue_cost_vec);
1946
1947 prologue_cost_vec.release ();
1948 epilogue_cost_vec.release ();
1949
1950 peel_for_unknown_alignment.peel_info.count = 1
1951 + STMT_VINFO_SAME_ALIGN_REFS (dr0_info->stmt).length ();
1952 }
1953
1954 peel_for_unknown_alignment.peel_info.npeel = 0;
1955 peel_for_unknown_alignment.peel_info.dr_info = dr0_info;
1956
1957 best_peel = peel_for_unknown_alignment;
1958
1959 peel_for_known_alignment.inside_cost = INT_MAX;
1960 peel_for_known_alignment.outside_cost = INT_MAX;
1961 peel_for_known_alignment.peel_info.count = 0;
1962 peel_for_known_alignment.peel_info.dr_info = NULL;
1963
1964 if (do_peeling && one_misalignment_known)
1965 {
1966 /* Peeling is possible, but there is no data access that would be
1967 unsupportable without alignment. So we try to choose the best possible
1968 peeling from the hash table. */
1969 peel_for_known_alignment = vect_peeling_hash_choose_best_peeling
1970 (&peeling_htab, loop_vinfo);
1971 }
1972
1973 /* Compare costs of peeling for known and unknown alignment. */
1974 if (peel_for_known_alignment.peel_info.dr_info != NULL
1975 && peel_for_unknown_alignment.inside_cost
1976 >= peel_for_known_alignment.inside_cost)
1977 {
1978 best_peel = peel_for_known_alignment;
1979
1980 /* If the best peeling for known alignment has NPEEL == 0, perform no
1981 peeling at all except if there is an unsupportable dr that we can
1982 align. */
1983 if (best_peel.peel_info.npeel == 0 && !one_dr_unsupportable)
1984 do_peeling = false;
1985 }
1986
1987 /* If there is an unsupportable data ref, prefer this over all choices so far
1988 since we'd have to discard a chosen peeling except when it accidentally
1989 aligned the unsupportable data ref. */
1990 if (one_dr_unsupportable)
1991 dr0_info = unsupportable_dr_info;
1992 else if (do_peeling)
1993 {
1994 /* Calculate the penalty for no peeling, i.e. leaving everything as-is.
1995 TODO: Use nopeel_outside_cost or get rid of it? */
1996 unsigned nopeel_inside_cost = 0;
1997 unsigned nopeel_outside_cost = 0;
1998
1999 stmt_vector_for_cost dummy;
2000 dummy.create (2);
2001 vect_get_peeling_costs_all_drs (loop_vinfo, NULL, &nopeel_inside_cost,
2002 &nopeel_outside_cost, &dummy, &dummy,
2003 0, false);
2004 dummy.release ();
2005
2006 /* Add epilogue costs. As we do not peel for alignment here, no prologue
2007 costs will be recorded. */
2008 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
2009 prologue_cost_vec.create (2);
2010 epilogue_cost_vec.create (2);
2011
2012 int dummy2;
2013 nopeel_outside_cost += vect_get_known_peeling_cost
2014 (loop_vinfo, 0, &dummy2,
2015 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
2016 &prologue_cost_vec, &epilogue_cost_vec);
2017
2018 prologue_cost_vec.release ();
2019 epilogue_cost_vec.release ();
2020
2021 npeel = best_peel.peel_info.npeel;
2022 dr0_info = best_peel.peel_info.dr_info;
2023
2024 /* If not peeling at all is no more expensive than the best peeling
2025 found so far, don't perform any peeling. */
2026 if (nopeel_inside_cost <= best_peel.inside_cost)
2027 do_peeling = false;
2028 }
2029
2030 if (do_peeling)
2031 {
2032 stmt_vec_info stmt_info = dr0_info->stmt;
2033 vectype = STMT_VINFO_VECTYPE (stmt_info);
2034
2035 if (known_alignment_for_access_p (dr0_info))
2036 {
2037 bool negative = tree_int_cst_compare (DR_STEP (dr0_info->dr),
2038 size_zero_node) < 0;
2039 if (!npeel)
2040 {
2041 /* Since it's known at compile time, compute the number of
2042 iterations in the peeled loop (the peeling factor) for use in
2043 updating DR_MISALIGNMENT values. The peeling factor is the
2044 vectorization factor minus the misalignment as an element
2045 count. */
2046 mis = (negative
2047 ? DR_MISALIGNMENT (dr0_info)
2048 : -DR_MISALIGNMENT (dr0_info));
2049 /* If known_alignment_for_access_p then we have set
2050 DR_MISALIGNMENT, which is only done if we know it at compile
2051 time, so it is safe to assume the target alignment is
2052 constant.  */
2053 unsigned int target_align =
2054 DR_TARGET_ALIGNMENT (dr0_info).to_constant ();
2055 npeel = ((mis & (target_align - 1))
2056 / vect_get_scalar_dr_size (dr0_info));
2057 }
2058
2059 /* For interleaved data access every iteration accesses all the
2060 members of the group, therefore we divide the number of iterations
2061 by the group size. */
2062 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2063 npeel /= DR_GROUP_SIZE (stmt_info);
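          /* E.g., for an assumed grouped access with DR_GROUP_SIZE 4 and a
             per-element peeling amount of 12 computed above, each scalar
             iteration touches all 4 group members, so 12 / 4 = 3 loop
             iterations are peeled.  */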
2064
2065 if (dump_enabled_p ())
2066 dump_printf_loc (MSG_NOTE, vect_location,
2067 "Try peeling by %d\n", npeel);
2068 }
2069
2070 /* Ensure that all datarefs can be vectorized after the peel. */
2071 if (!vect_peeling_supportable (loop_vinfo, dr0_info, npeel))
2072 do_peeling = false;
2073
2074 /* Check if all datarefs are supportable and log. */
2075 if (do_peeling && known_alignment_for_access_p (dr0_info) && npeel == 0)
2076 {
2077 opt_result stat = vect_verify_datarefs_alignment (loop_vinfo);
2078 if (!stat)
2079 do_peeling = false;
2080 else
2081 return stat;
2082 }
2083
2084 /* Cost model #1 - honor --param vect-max-peeling-for-alignment. */
2085 if (do_peeling)
2086 {
2087 unsigned max_allowed_peel
2088 = param_vect_max_peeling_for_alignment;
2089 if (flag_vect_cost_model == VECT_COST_MODEL_CHEAP)
2090 max_allowed_peel = 0;
2091 if (max_allowed_peel != (unsigned)-1)
2092 {
2093 unsigned max_peel = npeel;
2094 if (max_peel == 0)
2095 {
2096 poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr0_info);
2097 unsigned HOST_WIDE_INT target_align_c;
2098 if (target_align.is_constant (&target_align_c))
2099 max_peel =
2100 target_align_c / vect_get_scalar_dr_size (dr0_info) - 1;
2101 else
2102 {
2103 do_peeling = false;
2104 if (dump_enabled_p ())
2105 dump_printf_loc (MSG_NOTE, vect_location,
2106 "Disable peeling, max peels set and vector"
2107 " alignment unknown\n");
2108 }
2109 }
2110 if (max_peel > max_allowed_peel)
2111 {
2112 do_peeling = false;
2113 if (dump_enabled_p ())
2114 dump_printf_loc (MSG_NOTE, vect_location,
2115 "Disable peeling, max peels reached: %d\n", max_peel);
2116 }
2117 }
2118 }
2119
2120 /* Cost model #2 - if peeling may result in a remaining loop not
2121 iterating enough to be vectorized then do not peel. Since this
2122 is a cost heuristic rather than a correctness decision, use the
2123 most likely runtime value for variable vectorization factors. */
2124 if (do_peeling
2125 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2126 {
2127 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2128 unsigned int max_peel = npeel == 0 ? assumed_vf - 1 : npeel;
2129 if ((unsigned HOST_WIDE_INT) LOOP_VINFO_INT_NITERS (loop_vinfo)
2130 < assumed_vf + max_peel)
2131 do_peeling = false;
2132 }
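      /* For example, with an assumed vectorization factor of 4, NPEEL of 0
         (hence MAX_PEEL = 3) and a known iteration count of 6: 6 < 4 + 3,
         so the loop remaining after peeling could not fill even a single
         vector iteration and peeling is abandoned.  */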
2133
2134 if (do_peeling)
2135 {
2136 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
2137 If the misalignment of DR_i is identical to that of dr0 then set
2138 DR_MISALIGNMENT (DR_i) to zero. If the misalignments of DR_i and
2139 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
2140 by the peeling factor times the element size of DR_i (MOD the
2141 vectorization factor times the size). Otherwise, the
2142 misalignment of DR_i must be set to unknown. */
2143 FOR_EACH_VEC_ELT (datarefs, i, dr)
2144 if (dr != dr0_info->dr)
2145 {
2146 /* Strided accesses perform only component accesses, alignment
2147 is irrelevant for them. */
2148 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2149 stmt_info = dr_info->stmt;
2150 if (STMT_VINFO_STRIDED_P (stmt_info)
2151 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2152 continue;
2153
2154 vect_update_misalignment_for_peel (dr_info, dr0_info, npeel);
2155 }
2156
2157 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0_info;
2158 if (npeel)
2159 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
2160 else
2161 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2162 = DR_MISALIGNMENT (dr0_info);
2163 SET_DR_MISALIGNMENT (dr0_info, 0);
2164 if (dump_enabled_p ())
2165 {
2166 dump_printf_loc (MSG_NOTE, vect_location,
2167 "Alignment of access forced using peeling.\n");
2168 dump_printf_loc (MSG_NOTE, vect_location,
2169 "Peeling for alignment will be applied.\n");
2170 }
2171
2172 /* The inside-loop cost will be accounted for in vectorizable_load
2173 and vectorizable_store correctly with adjusted alignments.
2174 Drop the body_cst_vec on the floor here. */
2175 opt_result stat = vect_verify_datarefs_alignment (loop_vinfo);
2176 gcc_assert (stat);
2177 return stat;
2178 }
2179 }
2180
2181 /* (2) Versioning to force alignment. */
2182
2183 /* Try versioning if:
2184 1) the loop is optimized for speed and the cost model is not cheap,
2185 2) there is at least one unsupported misaligned data ref with an unknown
2186 misalignment, and
2187 3) all misaligned data refs with a known misalignment are supported, and
2188 4) the number of runtime alignment checks is within reason. */
2189
2190 do_versioning
2191 = (optimize_loop_nest_for_speed_p (loop)
2192 && !loop->inner /* FORNOW */
2193 && flag_vect_cost_model != VECT_COST_MODEL_CHEAP);
2194
2195 if (do_versioning)
2196 {
2197 FOR_EACH_VEC_ELT (datarefs, i, dr)
2198 {
2199 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2200 stmt_vec_info stmt_info = dr_info->stmt;
2201
2202 /* For interleaving, only the alignment of the first access
2203 matters. */
2204 if (aligned_access_p (dr_info)
2205 || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
2206 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info))
2207 continue;
2208
2209 if (STMT_VINFO_STRIDED_P (stmt_info))
2210 {
2211 /* Strided loads perform only component accesses, alignment is
2212 irrelevant for them. */
2213 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
2214 continue;
2215 do_versioning = false;
2216 break;
2217 }
2218
2219 supportable_dr_alignment
2220 = vect_supportable_dr_alignment (loop_vinfo, dr_info, false);
2221
2222 if (!supportable_dr_alignment)
2223 {
2224 int mask;
2225 tree vectype;
2226
2227 if (known_alignment_for_access_p (dr_info)
2228 || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
2229 >= (unsigned) param_vect_max_version_for_alignment_checks)
2230 {
2231 do_versioning = false;
2232 break;
2233 }
2234
2235 vectype = STMT_VINFO_VECTYPE (stmt_info);
2236 gcc_assert (vectype);
2237
2238 /* At present we don't support versioning for alignment
2239 with variable VF, since there's no guarantee that the
2240 VF is a power of two. We could relax this if we added
2241 a way of enforcing a power-of-two size. */
2242 unsigned HOST_WIDE_INT size;
2243 if (!GET_MODE_SIZE (TYPE_MODE (vectype)).is_constant (&size))
2244 {
2245 do_versioning = false;
2246 break;
2247 }
2248
2249 /* Forcing alignment in the first iteration is no good if
2250 we don't keep it across iterations. For now, just disable
2251 versioning in this case.
2252 ??? We could actually unroll the loop to achieve the required
2253 overall step alignment, and forcing the alignment could be
2254 done by doing some iterations of the non-vectorized loop. */
2255 if (!multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
2256 * DR_STEP_ALIGNMENT (dr),
2257 DR_TARGET_ALIGNMENT (dr_info)))
2258 {
2259 do_versioning = false;
2260 break;
2261 }
2262
2263 /* The rightmost bits of an aligned address must be zeros.
2264 Construct the mask needed for this test. For example,
2265 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
2266 mask must be 15 = 0xf. */
2267 mask = size - 1;
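                  /* Conceptually, the versioning condition emitted later
                     amounts to a check along the lines of the following
                     sketch (the pointer name is only illustrative):

                         if (((uintptr_t) p & mask) == 0)
                           ... vectorized loop, p assumed aligned ...
                         else
                           ... scalar fallback loop ...  */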
2268
2269 /* FORNOW: use the same mask to test all potentially unaligned
2270 references in the loop. */
2271 if (LOOP_VINFO_PTR_MASK (loop_vinfo)
2272 && LOOP_VINFO_PTR_MASK (loop_vinfo) != mask)
2273 {
2274 do_versioning = false;
2275 break;
2276 }
2277
2278 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
2279 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (stmt_info);
2280 }
2281 }
2282
2283 /* Versioning requires at least one misaligned data reference. */
2284 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2285 do_versioning = false;
2286 else if (!do_versioning)
2287 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
2288 }
2289
2290 if (do_versioning)
2291 {
2292 vec<stmt_vec_info> may_misalign_stmts
2293 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
2294 stmt_vec_info stmt_info;
2295
2296 /* It can now be assumed that the data references in the statements
2297 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
2298 of the loop being vectorized. */
2299 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt_info)
2300 {
2301 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
2302 SET_DR_MISALIGNMENT (dr_info, 0);
2303 if (dump_enabled_p ())
2304 dump_printf_loc (MSG_NOTE, vect_location,
2305 "Alignment of access forced using versioning.\n");
2306 }
2307
2308 if (dump_enabled_p ())
2309 dump_printf_loc (MSG_NOTE, vect_location,
2310 "Versioning for alignment will be applied.\n");
2311
2312 /* Peeling and versioning can't be done together at this time. */
2313 gcc_assert (! (do_peeling && do_versioning));
2314
2315 opt_result stat = vect_verify_datarefs_alignment (loop_vinfo);
2316 gcc_assert (stat);
2317 return stat;
2318 }
2319
2320 /* This point is reached if neither peeling nor versioning is being done. */
2321 gcc_assert (! (do_peeling || do_versioning));
2322
2323 opt_result stat = vect_verify_datarefs_alignment (loop_vinfo);
2324 return stat;
2325 }
2326
2327
2328 /* Function vect_find_same_alignment_drs.
2329
2330 Update group and alignment relations in VINFO according to the chosen
2331 vectorization factor. */
2332
2333 static void
2334 vect_find_same_alignment_drs (vec_info *vinfo, data_dependence_relation *ddr)
2335 {
2336 struct data_reference *dra = DDR_A (ddr);
2337 struct data_reference *drb = DDR_B (ddr);
2338 dr_vec_info *dr_info_a = vinfo->lookup_dr (dra);
2339 dr_vec_info *dr_info_b = vinfo->lookup_dr (drb);
2340 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
2341 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
2342
2343 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
2344 return;
2345
2346 if (dra == drb)
2347 return;
2348
2349 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
2350 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
2351 return;
2352
2353 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0)
2354 || !operand_equal_p (DR_OFFSET (dra), DR_OFFSET (drb), 0)
2355 || !operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2356 return;
2357
2358 /* Two references with distance zero have the same alignment. */
2359 poly_offset_int diff = (wi::to_poly_offset (DR_INIT (dra))
2360 - wi::to_poly_offset (DR_INIT (drb)));
2361 if (maybe_ne (diff, 0))
2362 {
2363 /* Get the wider of the two alignments. */
2364 poly_uint64 align_a =
2365 exact_div (vect_calculate_target_alignment (dr_info_a),
2366 BITS_PER_UNIT);
2367 poly_uint64 align_b =
2368 exact_div (vect_calculate_target_alignment (dr_info_b),
2369 BITS_PER_UNIT);
2370 unsigned HOST_WIDE_INT align_a_c, align_b_c;
2371 if (!align_a.is_constant (&align_a_c)
2372 || !align_b.is_constant (&align_b_c))
2373 return;
2374
2375 unsigned HOST_WIDE_INT max_align = MAX (align_a_c, align_b_c);
2376
2377 /* Require the gap to be a multiple of the larger vector alignment. */
2378 if (!multiple_p (diff, max_align))
2379 return;
2380 }
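  /* For example, assuming 4-byte elements and a 16-byte target alignment
     for both references: accesses to a[i] and a[i + 8] are 32 bytes apart,
     which is a multiple of 16, so whatever alignment the first access has,
     the second has the same one and the pair is recorded below.  */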
2381
2382 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
2383 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
2384 if (dump_enabled_p ())
2385 dump_printf_loc (MSG_NOTE, vect_location,
2386 "accesses have the same alignment: %T and %T\n",
2387 DR_REF (dra), DR_REF (drb));
2388 }
2389
2390
2391 /* Function vect_analyze_data_refs_alignment
2392
2393 Analyze the alignment of the data-references in the loop.
2394 Return FALSE if a data reference is found that cannot be vectorized. */
2395
2396 opt_result
2397 vect_analyze_data_refs_alignment (loop_vec_info vinfo)
2398 {
2399 DUMP_VECT_SCOPE ("vect_analyze_data_refs_alignment");
2400
2401 /* Mark groups of data references with the same alignment using
2402 data dependence information. */
2403 vec<ddr_p> ddrs = vinfo->shared->ddrs;
2404 struct data_dependence_relation *ddr;
2405 unsigned int i;
2406
2407 FOR_EACH_VEC_ELT (ddrs, i, ddr)
2408 vect_find_same_alignment_drs (vinfo, ddr);
2409
2410 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
2411 struct data_reference *dr;
2412
2413 vect_record_base_alignments (vinfo);
2414 FOR_EACH_VEC_ELT (datarefs, i, dr)
2415 {
2416 dr_vec_info *dr_info = vinfo->lookup_dr (dr);
2417 if (STMT_VINFO_VECTORIZABLE (dr_info->stmt))
2418 vect_compute_data_ref_alignment (vinfo, dr_info);
2419 }
2420
2421 return opt_result::success ();
2422 }
2423
2424
2425 /* Analyze alignment of DRs of stmts in NODE. */
2426
2427 static bool
2428 vect_slp_analyze_and_verify_node_alignment (vec_info *vinfo, slp_tree node)
2429 {
2430 /* We vectorize from the first scalar stmt in the node unless
2431 the node is permuted, in which case we start from the first
2432 element in the group. */
2433 stmt_vec_info first_stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2434 dr_vec_info *first_dr_info = STMT_VINFO_DR_INFO (first_stmt_info);
2435 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2436 first_stmt_info = DR_GROUP_FIRST_ELEMENT (first_stmt_info);
2437
2438 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (first_stmt_info);
2439 vect_compute_data_ref_alignment (vinfo, dr_info);
2440 /* For creating the data-ref pointer we need alignment of the
2441 first element anyway. */
2442 if (dr_info != first_dr_info)
2443 vect_compute_data_ref_alignment (vinfo, first_dr_info);
2444 if (! verify_data_ref_alignment (vinfo, dr_info))
2445 {
2446 if (dump_enabled_p ())
2447 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2448 "not vectorized: bad data alignment in basic "
2449 "block.\n");
2450 return false;
2451 }
2452
2453 return true;
2454 }
2455
2456 /* Function vect_slp_analyze_and_verify_instance_alignment
2457
2458 Analyze the alignment of the data-references in the SLP instance.
2459 Return FALSE if a data reference is found that cannot be vectorized. */
2460
2461 bool
2462 vect_slp_analyze_and_verify_instance_alignment (vec_info *vinfo,
2463 slp_instance instance)
2464 {
2465 DUMP_VECT_SCOPE ("vect_slp_analyze_and_verify_instance_alignment");
2466
2467 slp_tree node;
2468 unsigned i;
2469 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
2470 if (! vect_slp_analyze_and_verify_node_alignment (vinfo, node))
2471 return false;
2472
2473 node = SLP_INSTANCE_TREE (instance);
2474 if (STMT_VINFO_DATA_REF (SLP_TREE_SCALAR_STMTS (node)[0])
2475 && ! vect_slp_analyze_and_verify_node_alignment
2476 (vinfo, SLP_INSTANCE_TREE (instance)))
2477 return false;
2478
2479 return true;
2480 }
2481
2482
2483 /* Analyze groups of accesses: check that DR_INFO belongs to a group of
2484 accesses of legal size, step, etc. Detect gaps, single element
2485 interleaving, and other special cases. Set grouped access info.
2486 Collect groups of strided stores for further use in SLP analysis.
2487 Worker for vect_analyze_group_access. */
2488
2489 static bool
2490 vect_analyze_group_access_1 (vec_info *vinfo, dr_vec_info *dr_info)
2491 {
2492 data_reference *dr = dr_info->dr;
2493 tree step = DR_STEP (dr);
2494 tree scalar_type = TREE_TYPE (DR_REF (dr));
2495 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2496 stmt_vec_info stmt_info = dr_info->stmt;
2497 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
2498 bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo);
2499 HOST_WIDE_INT dr_step = -1;
2500 HOST_WIDE_INT groupsize, last_accessed_element = 1;
2501 bool slp_impossible = false;
2502
2503 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2504 size of the interleaving group (including gaps). */
2505 if (tree_fits_shwi_p (step))
2506 {
2507 dr_step = tree_to_shwi (step);
2508 /* Check that STEP is a multiple of type size. Otherwise there is
2509 a non-element-sized gap at the end of the group which we
2510 cannot represent in DR_GROUP_GAP or DR_GROUP_SIZE.
2511 ??? As we can handle non-constant step fine here we should
2512 simply remove uses of DR_GROUP_GAP between the last and first
2513 element and instead rely on DR_STEP. DR_GROUP_SIZE then would
2514 simply not include that gap. */
2515 if ((dr_step % type_size) != 0)
2516 {
2517 if (dump_enabled_p ())
2518 dump_printf_loc (MSG_NOTE, vect_location,
2519 "Step %T is not a multiple of the element size"
2520 " for %T\n",
2521 step, DR_REF (dr));
2522 return false;
2523 }
2524 groupsize = absu_hwi (dr_step) / type_size;
2525 }
2526 else
2527 groupsize = 0;
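  /* For instance, a constant step of 32 bytes over 4-byte elements gives a
     GROUPSIZE of 8, while a non-constant step leaves GROUPSIZE at 0 and it
     is derived from the interleaving chain further below.  */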
2528
2529 /* A non-consecutive access is possible only if it is part of an interleaving group. */
2530 if (!DR_GROUP_FIRST_ELEMENT (stmt_info))
2531 {
2532 /* Check if this DR is part of an interleaving group, and is a single
2533 element of the group that is accessed in the loop. */
2534
2535 /* Gaps are supported only for loads. STEP must be a multiple of the type
2536 size. */
2537 if (DR_IS_READ (dr)
2538 && (dr_step % type_size) == 0
2539 && groupsize > 0)
2540 {
2541 DR_GROUP_FIRST_ELEMENT (stmt_info) = stmt_info;
2542 DR_GROUP_SIZE (stmt_info) = groupsize;
2543 DR_GROUP_GAP (stmt_info) = groupsize - 1;
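          /* E.g., if only a[4*i] is read in the loop (4-byte elements, a
             step of 16 bytes), this single load forms its own group with
             DR_GROUP_SIZE 4 and DR_GROUP_GAP 3: three elements are skipped
             after it in every iteration.  */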
2544 if (dump_enabled_p ())
2545 dump_printf_loc (MSG_NOTE, vect_location,
2546 "Detected single element interleaving %T"
2547 " step %T\n",
2548 DR_REF (dr), step);
2549
2550 return true;
2551 }
2552
2553 if (dump_enabled_p ())
2554 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2555 "not consecutive access %G", stmt_info->stmt);
2556
2557 if (bb_vinfo)
2558 {
2559 /* Mark the statement as unvectorizable. */
2560 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
2561 return true;
2562 }
2563
2564 if (dump_enabled_p ())
2565 dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
2566 STMT_VINFO_STRIDED_P (stmt_info) = true;
2567 return true;
2568 }
2569
2570 if (DR_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info)
2571 {
2572 /* First stmt in the interleaving chain. Check the chain. */
2573 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2574 struct data_reference *data_ref = dr;
2575 unsigned int count = 1;
2576 tree prev_init = DR_INIT (data_ref);
2577 HOST_WIDE_INT diff, gaps = 0;
2578
2579 /* By construction, all group members have INTEGER_CST DR_INITs. */
2580 while (next)
2581 {
2582 /* We never have the same DR multiple times. */
2583 gcc_assert (tree_int_cst_compare (DR_INIT (data_ref),
2584 DR_INIT (STMT_VINFO_DATA_REF (next))) != 0);
2585
2586 data_ref = STMT_VINFO_DATA_REF (next);
2587
2588 /* All group members have the same STEP by construction. */
2589 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2590
2591 /* Check that the distance between two accesses is equal to the type
2592 size. Otherwise, we have gaps. */
2593 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2594 - TREE_INT_CST_LOW (prev_init)) / type_size;
2595 if (diff != 1)
2596 {
2597 /* FORNOW: SLP of accesses with gaps is not supported. */
2598 slp_impossible = true;
2599 if (DR_IS_WRITE (data_ref))
2600 {
2601 if (dump_enabled_p ())
2602 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2603 "interleaved store with gaps\n");
2604 return false;
2605 }
2606
2607 gaps += diff - 1;
2608 }
2609
2610 last_accessed_element += diff;
2611
2612 /* Store the gap from the previous member of the group. If there is no
2613 gap in the access, DR_GROUP_GAP is always 1. */
2614 DR_GROUP_GAP (next) = diff;
2615
2616 prev_init = DR_INIT (data_ref);
2617 next = DR_GROUP_NEXT_ELEMENT (next);
2618 /* Count the number of data-refs in the chain. */
2619 count++;
2620 }
2621
2622 if (groupsize == 0)
2623 groupsize = count + gaps;
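      /* As an example with an assumed non-constant outer step: member
         accesses at element offsets 0, 1 and 3 give DIFFs of 1 and 2, so
         COUNT = 3, GAPS = 1 and GROUPSIZE becomes 3 + 1 = 4, while
         LAST_ACCESSED_ELEMENT is 1 + 1 + 2 = 4, leaving no trailing gap.  */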
2624
2625 /* This could be UINT_MAX but as we are generating code in a very
2626 inefficient way we have to cap earlier. See PR78699 for example. */
2627 if (groupsize > 4096)
2628 {
2629 if (dump_enabled_p ())
2630 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2631 "group is too large\n");
2632 return false;
2633 }
2634
2635 /* Check that the size of the interleaving is equal to count for stores,
2636 i.e., that there are no gaps. */
2637 if (groupsize != count
2638 && !DR_IS_READ (dr))
2639 {
2640 groupsize = count;
2641 STMT_VINFO_STRIDED_P (stmt_info) = true;
2642 }
2643
2644 /* If there is a gap after the last load in the group it is the
2645 difference between the groupsize and the last accessed
2646 element.
2647 When there is no gap, this difference should be 0. */
2648 DR_GROUP_GAP (stmt_info) = groupsize - last_accessed_element;
2649
2650 DR_GROUP_SIZE (stmt_info) = groupsize;
2651 if (dump_enabled_p ())
2652 {
2653 dump_printf_loc (MSG_NOTE, vect_location,
2654 "Detected interleaving ");
2655 if (DR_IS_READ (dr))
2656 dump_printf (MSG_NOTE, "load ");
2657 else if (STMT_VINFO_STRIDED_P (stmt_info))
2658 dump_printf (MSG_NOTE, "strided store ");
2659 else
2660 dump_printf (MSG_NOTE, "store ");
2661 dump_printf (MSG_NOTE, "of size %u\n",
2662 (unsigned)groupsize);
2663 dump_printf_loc (MSG_NOTE, vect_location, "\t%G", stmt_info->stmt);
2664 next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2665 while (next)
2666 {
2667 if (DR_GROUP_GAP (next) != 1)
2668 dump_printf_loc (MSG_NOTE, vect_location,
2669 "\t<gap of %d elements>\n",
2670 DR_GROUP_GAP (next) - 1);
2671 dump_printf_loc (MSG_NOTE, vect_location, "\t%G", next->stmt);
2672 next = DR_GROUP_NEXT_ELEMENT (next);
2673 }
2674 if (DR_GROUP_GAP (stmt_info) != 0)
2675 dump_printf_loc (MSG_NOTE, vect_location,
2676 "\t<gap of %d elements>\n",
2677 DR_GROUP_GAP (stmt_info));
2678 }
2679
2680 /* SLP: create an SLP data structure for every interleaving group of
2681 stores for further analysis in vect_analyze_slp. */
2682 if (DR_IS_WRITE (dr) && !slp_impossible)
2683 {
2684 if (loop_vinfo)
2685 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt_info);
2686 if (bb_vinfo)
2687 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt_info);
2688 }
2689 }
2690
2691 return true;
2692 }
2693
2694 /* Analyze groups of accesses: check that DR_INFO belongs to a group of
2695 accesses of legal size, step, etc. Detect gaps, single element
2696 interleaving, and other special cases. Set grouped access info.
2697 Collect groups of strided stores for further use in SLP analysis. */
2698
2699 static bool
2700 vect_analyze_group_access (vec_info *vinfo, dr_vec_info *dr_info)
2701 {
2702 if (!vect_analyze_group_access_1 (vinfo, dr_info))
2703 {
2704 /* Dissolve the group if present. */
2705 stmt_vec_info stmt_info = DR_GROUP_FIRST_ELEMENT (dr_info->stmt);
2706 while (stmt_info)
2707 {
2708 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2709 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2710 DR_GROUP_NEXT_ELEMENT (stmt_info) = NULL;
2711 stmt_info = next;
2712 }
2713 return false;
2714 }
2715 return true;
2716 }
2717
2718 /* Analyze the access pattern of the data-reference DR_INFO.
2719 In case of non-consecutive accesses call vect_analyze_group_access() to
2720 analyze groups of accesses. */
2721
2722 static bool
2723 vect_analyze_data_ref_access (vec_info *vinfo, dr_vec_info *dr_info)
2724 {
2725 data_reference *dr = dr_info->dr;
2726 tree step = DR_STEP (dr);
2727 tree scalar_type = TREE_TYPE (DR_REF (dr));
2728 stmt_vec_info stmt_info = dr_info->stmt;
2729 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
2730 class loop *loop = NULL;
2731
2732 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
2733 return true;
2734
2735 if (loop_vinfo)
2736 loop = LOOP_VINFO_LOOP (loop_vinfo);
2737
2738 if (loop_vinfo && !step)
2739 {
2740 if (dump_enabled_p ())
2741 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2742 "bad data-ref access in loop\n");
2743 return false;
2744 }
2745
2746 /* Allow loads with zero step in inner-loop vectorization. */
2747 if (loop_vinfo && integer_zerop (step))
2748 {
2749 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2750 if (!nested_in_vect_loop_p (loop, stmt_info))
2751 return DR_IS_READ (dr);
2752 /* Allow references with zero step for outer loops marked
2753 with pragma omp simd only - it guarantees absence of
2754 loop-carried dependencies between inner loop iterations. */
2755 if (loop->safelen < 2)
2756 {
2757 if (dump_enabled_p ())
2758 dump_printf_loc (MSG_NOTE, vect_location,
2759 "zero step in inner loop of nest\n");
2760 return false;
2761 }
2762 }
2763
2764 if (loop && nested_in_vect_loop_p (loop, stmt_info))
2765 {
2766 /* Interleaved accesses are not yet supported within outer-loop
2767 vectorization for references in the inner-loop. */
2768 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2769
2770 /* For the rest of the analysis we use the outer-loop step. */
2771 step = STMT_VINFO_DR_STEP (stmt_info);
2772 if (integer_zerop (step))
2773 {
2774 if (dump_enabled_p ())
2775 dump_printf_loc (MSG_NOTE, vect_location,
2776 "zero step in outer loop.\n");
2777 return DR_IS_READ (dr);
2778 }
2779 }
2780
2781 /* Consecutive? */
2782 if (TREE_CODE (step) == INTEGER_CST)
2783 {
2784 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2785 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2786 || (dr_step < 0
2787 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2788 {
2789 /* Mark that it is not interleaving. */
2790 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2791 return true;
2792 }
2793 }
2794
2795 if (loop && nested_in_vect_loop_p (loop, stmt_info))
2796 {
2797 if (dump_enabled_p ())
2798 dump_printf_loc (MSG_NOTE, vect_location,
2799 "grouped access in outer loop.\n");
2800 return false;
2801 }
2802
2803
2804 /* Assume this is a DR handled by non-constant strided load case. */
2805 if (TREE_CODE (step) != INTEGER_CST)
2806 return (STMT_VINFO_STRIDED_P (stmt_info)
2807 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2808 || vect_analyze_group_access (vinfo, dr_info)));
2809
2810 /* Not a consecutive access - check if it's part of an interleaving group. */
2811 return vect_analyze_group_access (vinfo, dr_info);
2812 }
2813
2814 /* Comparison function used to sort data-references DRA and DRB into
2815 chunks suitable for grouping. */
2816
2817 static int
2818 dr_group_sort_cmp (const void *dra_, const void *drb_)
2819 {
2820 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2821 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2822 int cmp;
2823
2824 /* Stabilize sort. */
2825 if (dra == drb)
2826 return 0;
2827
2828 /* DRs in different loops never belong to the same group. */
2829 loop_p loopa = gimple_bb (DR_STMT (dra))->loop_father;
2830 loop_p loopb = gimple_bb (DR_STMT (drb))->loop_father;
2831 if (loopa != loopb)
2832 return loopa->num < loopb->num ? -1 : 1;
2833
2834 /* Ordering of DRs according to base. */
2835 cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2836 DR_BASE_ADDRESS (drb));
2837 if (cmp != 0)
2838 return cmp;
2839
2840 /* And according to DR_OFFSET. */
2841 cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2842 if (cmp != 0)
2843 return cmp;
2844
2845 /* Put reads before writes. */
2846 if (DR_IS_READ (dra) != DR_IS_READ (drb))
2847 return DR_IS_READ (dra) ? -1 : 1;
2848
2849 /* Then sort by access size. */
2850 cmp = data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2851 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2852 if (cmp != 0)
2853 return cmp;
2854
2855 /* And then by step. */
2856 cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
2857 if (cmp != 0)
2858 return cmp;
2859
2860 /* Then sort by DR_INIT. In case of identical DRs, sort by stmt UID. */
2861 cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
2862 if (cmp == 0)
2863 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2864 return cmp;
2865 }
2866
2867 /* If OP is the result of a conversion, return the unconverted value,
2868 otherwise return null. */
2869
2870 static tree
2871 strip_conversion (tree op)
2872 {
2873 if (TREE_CODE (op) != SSA_NAME)
2874 return NULL_TREE;
2875 gimple *stmt = SSA_NAME_DEF_STMT (op);
2876 if (!is_gimple_assign (stmt)
2877 || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (stmt)))
2878 return NULL_TREE;
2879 return gimple_assign_rhs1 (stmt);
2880 }
2881
2882 /* Return true if vectorizable_* routines can handle statements STMT1_INFO
2883 and STMT2_INFO being in a single group. When ALLOW_SLP_P, masked loads can
2884 be grouped in SLP mode. */
2885
2886 static bool
2887 can_group_stmts_p (stmt_vec_info stmt1_info, stmt_vec_info stmt2_info,
2888 bool allow_slp_p)
2889 {
2890 if (gimple_assign_single_p (stmt1_info->stmt))
2891 return gimple_assign_single_p (stmt2_info->stmt);
2892
2893 gcall *call1 = dyn_cast <gcall *> (stmt1_info->stmt);
2894 if (call1 && gimple_call_internal_p (call1))
2895 {
2896 /* Check for two masked loads or two masked stores. */
2897 gcall *call2 = dyn_cast <gcall *> (stmt2_info->stmt);
2898 if (!call2 || !gimple_call_internal_p (call2))
2899 return false;
2900 internal_fn ifn = gimple_call_internal_fn (call1);
2901 if (ifn != IFN_MASK_LOAD && ifn != IFN_MASK_STORE)
2902 return false;
2903 if (ifn != gimple_call_internal_fn (call2))
2904 return false;
2905
2906 /* Check that the masks are the same. Cope with casts of masks,
2907 like those created by build_mask_conversion. */
2908 tree mask1 = gimple_call_arg (call1, 2);
2909 tree mask2 = gimple_call_arg (call2, 2);
2910 if (!operand_equal_p (mask1, mask2, 0)
2911 && (ifn == IFN_MASK_STORE || !allow_slp_p))
2912 {
2913 mask1 = strip_conversion (mask1);
2914 if (!mask1)
2915 return false;
2916 mask2 = strip_conversion (mask2);
2917 if (!mask2)
2918 return false;
2919 if (!operand_equal_p (mask1, mask2, 0))
2920 return false;
2921 }
2922 return true;
2923 }
2924
2925 return false;
2926 }
2927
2928 /* Function vect_analyze_data_ref_accesses.
2929
2930 Analyze the access pattern of all the data references in the loop.
2931
2932 FORNOW: the only access pattern that is considered vectorizable is a
2933 simple step 1 (consecutive) access.
2934
2935 FORNOW: handle only arrays and pointer accesses. */
2936
2937 opt_result
2938 vect_analyze_data_ref_accesses (vec_info *vinfo)
2939 {
2940 unsigned int i;
2941 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
2942 struct data_reference *dr;
2943
2944 DUMP_VECT_SCOPE ("vect_analyze_data_ref_accesses");
2945
2946 if (datarefs.is_empty ())
2947 return opt_result::success ();
2948
2949 /* Sort the array of datarefs to make building the interleaving chains
2950 linear. Don't modify the original vector's order; it is needed for
2951 determining what dependencies are reversed. */
2952 vec<data_reference_p> datarefs_copy = datarefs.copy ();
2953 datarefs_copy.qsort (dr_group_sort_cmp);
2954 hash_set<stmt_vec_info> to_fixup;
2955
2956 /* Build the interleaving chains. */
2957 for (i = 0; i < datarefs_copy.length () - 1;)
2958 {
2959 data_reference_p dra = datarefs_copy[i];
2960 dr_vec_info *dr_info_a = vinfo->lookup_dr (dra);
2961 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
2962 stmt_vec_info lastinfo = NULL;
2963 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
2964 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a))
2965 {
2966 ++i;
2967 continue;
2968 }
2969 for (i = i + 1; i < datarefs_copy.length (); ++i)
2970 {
2971 data_reference_p drb = datarefs_copy[i];
2972 dr_vec_info *dr_info_b = vinfo->lookup_dr (drb);
2973 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
2974 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_b)
2975 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
2976 break;
2977
2978 /* ??? Imperfect sorting (non-compatible types, non-modulo
2979 accesses, same accesses) can lead to a group being artificially
2980 split here as we don't just skip over those. If it really
2981 matters we can push those to a worklist and re-iterate
2982 over them. Then we can just skip ahead to the next DR here. */
2983
2984 /* DRs in different loops should not be put into the same
2985 interleaving group. */
2986 if (gimple_bb (DR_STMT (dra))->loop_father
2987 != gimple_bb (DR_STMT (drb))->loop_father)
2988 break;
2989
2990 /* Check that the data-refs have the same first location (except init)
2991 and that they are both either stores or loads (not a load and a store,
2992 nor incompatible masked accesses). */
2993 if (DR_IS_READ (dra) != DR_IS_READ (drb)
2994 || data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2995 DR_BASE_ADDRESS (drb)) != 0
2996 || data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb)) != 0
2997 || !can_group_stmts_p (stmtinfo_a, stmtinfo_b, true))
2998 break;
2999
3000 /* Check that the data-refs have the same constant size. */
3001 tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
3002 tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
3003 if (!tree_fits_uhwi_p (sza)
3004 || !tree_fits_uhwi_p (szb)
3005 || !tree_int_cst_equal (sza, szb))
3006 break;
3007
3008 /* Check that the data-refs have the same step. */
3009 if (data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb)) != 0)
3010 break;
3011
3012 /* Check the types are compatible.
3013 ??? We don't distinguish this during sorting. */
3014 if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
3015 TREE_TYPE (DR_REF (drb))))
3016 break;
3017
3018 /* Check that the DR_INITs are compile-time constants. */
3019 if (TREE_CODE (DR_INIT (dra)) != INTEGER_CST
3020 || TREE_CODE (DR_INIT (drb)) != INTEGER_CST)
3021 break;
3022
3023 /* Different .GOMP_SIMD_LANE calls still give the same lane,
3024 just hold extra information. */
3025 if (STMT_VINFO_SIMD_LANE_ACCESS_P (stmtinfo_a)
3026 && STMT_VINFO_SIMD_LANE_ACCESS_P (stmtinfo_b)
3027 && data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb)) == 0)
3028 break;
3029
3030 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
3031 HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
3032 HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
3033 HOST_WIDE_INT init_prev
3034 = TREE_INT_CST_LOW (DR_INIT (datarefs_copy[i-1]));
3035 gcc_assert (init_a <= init_b
3036 && init_a <= init_prev
3037 && init_prev <= init_b);
3038
3039 /* Do not place the same access in the interleaving chain twice. */
3040 if (init_b == init_prev)
3041 {
3042 gcc_assert (gimple_uid (DR_STMT (datarefs_copy[i-1]))
3043 < gimple_uid (DR_STMT (drb)));
3044 /* Simply link in duplicates and fix up the chain below. */
3045 }
3046 else
3047 {
3048 /* If init_b == init_a + the size of the type * k, we have an
3049 interleaving, and DRA is accessed before DRB. */
3050 HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
3051 if (type_size_a == 0
3052 || (init_b - init_a) % type_size_a != 0)
3053 break;
3054
3055 /* If we have a store, the accesses must be adjacent. This splits
3056 groups into chunks we support (we don't support vectorization
3057 of stores with gaps). */
3058 if (!DR_IS_READ (dra) && init_b - init_prev != type_size_a)
3059 break;
3060
3061 /* If the step (when constant and nonzero) is greater than the
3062 difference between the data-refs' inits, this splits groups into
3063 suitable sizes. */
3064 if (tree_fits_shwi_p (DR_STEP (dra)))
3065 {
3066 HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra));
3067 if (step != 0 && step <= (init_b - init_a))
3068 break;
3069 }
3070 }
3071
3072 if (dump_enabled_p ())
3073 dump_printf_loc (MSG_NOTE, vect_location,
3074 DR_IS_READ (dra)
3075 ? "Detected interleaving load %T and %T\n"
3076 : "Detected interleaving store %T and %T\n",
3077 DR_REF (dra), DR_REF (drb));
3078
3079 /* Link the found element into the group list. */
3080 if (!DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
3081 {
3082 DR_GROUP_FIRST_ELEMENT (stmtinfo_a) = stmtinfo_a;
3083 lastinfo = stmtinfo_a;
3084 }
3085 DR_GROUP_FIRST_ELEMENT (stmtinfo_b) = stmtinfo_a;
3086 DR_GROUP_NEXT_ELEMENT (lastinfo) = stmtinfo_b;
3087 lastinfo = stmtinfo_b;
3088
3089 STMT_VINFO_SLP_VECT_ONLY (stmtinfo_a)
3090 = !can_group_stmts_p (stmtinfo_a, stmtinfo_b, false);
3091
3092 if (dump_enabled_p () && STMT_VINFO_SLP_VECT_ONLY (stmtinfo_a))
3093 dump_printf_loc (MSG_NOTE, vect_location,
3094 "Load suitable for SLP vectorization only.\n");
3095
3096 if (init_b == init_prev
3097 && !to_fixup.add (DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
3098 && dump_enabled_p ())
3099 dump_printf_loc (MSG_NOTE, vect_location,
3100 "Queuing group with duplicate access for fixup\n");
3101 }
3102 }
3103
3104 /* Fix up groups with duplicate entries by splitting them. */
3105 while (1)
3106 {
3107 hash_set<stmt_vec_info>::iterator it = to_fixup.begin ();
3108 if (!(it != to_fixup.end ()))
3109 break;
3110 stmt_vec_info grp = *it;
3111 to_fixup.remove (grp);
3112
3113 /* Find the earliest duplicate group member. */
3114 unsigned first_duplicate = -1u;
3115 stmt_vec_info next, g = grp;
3116 while ((next = DR_GROUP_NEXT_ELEMENT (g)))
3117 {
3118 if (tree_int_cst_equal (DR_INIT (STMT_VINFO_DR_INFO (next)->dr),
3119 DR_INIT (STMT_VINFO_DR_INFO (g)->dr))
3120 && gimple_uid (STMT_VINFO_STMT (next)) < first_duplicate)
3121 first_duplicate = gimple_uid (STMT_VINFO_STMT (next));
3122 g = next;
3123 }
3124 if (first_duplicate == -1U)
3125 continue;
3126
3127 /* Then move all stmts after the first duplicate to a new group.
3128 Note this is a heuristic, but one with the property that the
3129 group GRP itself is fixed up completely. */
3130 g = grp;
3131 stmt_vec_info newgroup = NULL, ng = grp;
3132 while ((next = DR_GROUP_NEXT_ELEMENT (g)))
3133 {
3134 if (gimple_uid (STMT_VINFO_STMT (next)) >= first_duplicate)
3135 {
3136 DR_GROUP_NEXT_ELEMENT (g) = DR_GROUP_NEXT_ELEMENT (next);
3137 if (!newgroup)
3138 newgroup = next;
3139 else
3140 DR_GROUP_NEXT_ELEMENT (ng) = next;
3141 ng = next;
3142 DR_GROUP_FIRST_ELEMENT (ng) = newgroup;
3143 }
3144 else
3145 g = DR_GROUP_NEXT_ELEMENT (g);
3146 }
3147 DR_GROUP_NEXT_ELEMENT (ng) = NULL;
3148
3149 /* Fix up the new group, which may still contain duplicates. */
3150 to_fixup.add (newgroup);
3151 }
3152
3153 FOR_EACH_VEC_ELT (datarefs_copy, i, dr)
3154 {
3155 dr_vec_info *dr_info = vinfo->lookup_dr (dr);
3156 if (STMT_VINFO_VECTORIZABLE (dr_info->stmt)
3157 && !vect_analyze_data_ref_access (vinfo, dr_info))
3158 {
3159 if (dump_enabled_p ())
3160 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3161 "not vectorized: complicated access pattern.\n");
3162
3163 if (is_a <bb_vec_info> (vinfo))
3164 {
3165 /* Mark the statement as not vectorizable. */
3166 STMT_VINFO_VECTORIZABLE (dr_info->stmt) = false;
3167 continue;
3168 }
3169 else
3170 {
3171 datarefs_copy.release ();
3172 return opt_result::failure_at (dr_info->stmt->stmt,
3173 "not vectorized:"
3174 " complicated access pattern.\n");
3175 }
3176 }
3177 }
3178
3179 datarefs_copy.release ();
3180 return opt_result::success ();
3181 }
3182
3183 /* Function vect_vfa_segment_size.
3184
3185 Input:
3186 DR_INFO: The data reference.
3187 LENGTH_FACTOR: segment length to consider.
3188
3189 Return a value suitable for the dr_with_seg_len::seg_len field.
3190 This is the "distance travelled" by the pointer from the first
3191 iteration in the segment to the last. Note that it does not include
3192 the size of the access; in effect it only describes the first byte. */
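/* As a minimal sketch with assumed values: a 4-byte DR_STEP and a
   LENGTH_FACTOR (vectorization factor) of 8 give a segment length of
   4 * (8 - 1) = 28 bytes; the bytes touched by the accesses themselves are
   added separately via the access size.  */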
3193
3194 static tree
3195 vect_vfa_segment_size (dr_vec_info *dr_info, tree length_factor)
3196 {
3197 length_factor = size_binop (MINUS_EXPR,
3198 fold_convert (sizetype, length_factor),
3199 size_one_node);
3200 return size_binop (MULT_EXPR, fold_convert (sizetype, DR_STEP (dr_info->dr)),
3201 length_factor);
3202 }
3203
3204 /* Return a value that, when added to abs (vect_vfa_segment_size (DR_INFO)),
3205 gives the worst-case number of bytes covered by the segment. */
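/* For example (assumed values): a group of 4-byte scalars with
   DR_GROUP_SIZE 4 and DR_GROUP_GAP 1 yields an access size of
   4 * (4 - 1) = 12 bytes, so together with a 28-byte segment length the
   segment spans at most 40 bytes.  */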
3206
3207 static unsigned HOST_WIDE_INT
3208 vect_vfa_access_size (vec_info *vinfo, dr_vec_info *dr_info)
3209 {
3210 stmt_vec_info stmt_vinfo = dr_info->stmt;
3211 tree ref_type = TREE_TYPE (DR_REF (dr_info->dr));
3212 unsigned HOST_WIDE_INT ref_size = tree_to_uhwi (TYPE_SIZE_UNIT (ref_type));
3213 unsigned HOST_WIDE_INT access_size = ref_size;
3214 if (DR_GROUP_FIRST_ELEMENT (stmt_vinfo))
3215 {
3216 gcc_assert (DR_GROUP_FIRST_ELEMENT (stmt_vinfo) == stmt_vinfo);
3217 access_size *= DR_GROUP_SIZE (stmt_vinfo) - DR_GROUP_GAP (stmt_vinfo);
3218 }
3219 if (STMT_VINFO_VEC_STMT (stmt_vinfo)
3220 && (vect_supportable_dr_alignment (vinfo, dr_info, false)
3221 == dr_explicit_realign_optimized))
3222 {
3223 /* We might access a full vector's worth. */
3224 tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
3225 access_size += tree_to_uhwi (TYPE_SIZE_UNIT (vectype)) - ref_size;
3226 }
3227 return access_size;
3228 }
3229
3230 /* Get the minimum alignment for all the scalar accesses that DR_INFO
3231 describes. */
3232
3233 static unsigned int
3234 vect_vfa_align (dr_vec_info *dr_info)
3235 {
3236 return TYPE_ALIGN_UNIT (TREE_TYPE (DR_REF (dr_info->dr)));
3237 }
3238
3239 /* Function vect_compile_time_alias.
3240
3241 Given data references A and B with equal base and offset, see whether
3242 the alias relation can be decided at compilation time. Return 1 if
3243 it can and the references alias, 0 if it can and the references do
3244 not alias, and -1 if we cannot decide at compile time. SEGMENT_LENGTH_A,
3245 SEGMENT_LENGTH_B, ACCESS_SIZE_A and ACCESS_SIZE_B are the equivalent
3246 of dr_with_seg_len::{seg_len,access_size} for A and B. */
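/* As an illustrative example with assumed values: if both steps are
   positive, the segment lengths are 28 bytes, the access sizes are 4 bytes
   and DR_INIT gives offsets 0 and 64, then the ranges [0, 32) and [64, 96)
   cannot overlap and 0 is returned; with offsets 0 and 16 the ranges
   [0, 32) and [16, 48) are known to overlap and 1 is returned.  */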
3247
3248 static int
3249 vect_compile_time_alias (dr_vec_info *a, dr_vec_info *b,
3250 tree segment_length_a, tree segment_length_b,
3251 unsigned HOST_WIDE_INT access_size_a,
3252 unsigned HOST_WIDE_INT access_size_b)
3253 {
3254 poly_offset_int offset_a = wi::to_poly_offset (DR_INIT (a->dr));
3255 poly_offset_int offset_b = wi::to_poly_offset (DR_INIT (b->dr));
3256 poly_uint64 const_length_a;
3257 poly_uint64 const_length_b;
3258
3259 /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
3260 bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
3261 [a, a+12) */
3262 if (tree_int_cst_compare (DR_STEP (a->dr), size_zero_node) < 0)
3263 {
3264 const_length_a = (-wi::to_poly_wide (segment_length_a)).force_uhwi ();
3265 offset_a -= const_length_a;
3266 }
3267 else
3268 const_length_a = tree_to_poly_uint64 (segment_length_a);
3269 if (tree_int_cst_compare (DR_STEP (b->dr), size_zero_node) < 0)
3270 {
3271 const_length_b = (-wi::to_poly_wide (segment_length_b)).force_uhwi ();
3272 offset_b -= const_length_b;
3273 }
3274 else
3275 const_length_b = tree_to_poly_uint64 (segment_length_b);
3276
3277 const_length_a += access_size_a;
3278 const_length_b += access_size_b;
3279
3280 if (ranges_known_overlap_p (offset_a, const_length_a,
3281 offset_b, const_length_b))
3282 return 1;
3283
3284 if (!ranges_maybe_overlap_p (offset_a, const_length_a,
3285 offset_b, const_length_b))
3286 return 0;
3287
3288 return -1;
3289 }
3290
3291 /* Return true if the minimum nonzero dependence distance for loop LOOP_DEPTH
3292 in DDR is >= VF. */
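/* E.g., for an assumed VF of 4, a distance vector entry of 8 at LOOP_DEPTH
   satisfies the check (8 >= 4), whereas an entry of 2 would make the
   function return false.  */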
3293
3294 static bool
3295 dependence_distance_ge_vf (data_dependence_relation *ddr,
3296 unsigned int loop_depth, poly_uint64 vf)
3297 {
3298 if (DDR_ARE_DEPENDENT (ddr) != NULL_TREE
3299 || DDR_NUM_DIST_VECTS (ddr) == 0)
3300 return false;
3301
3302 /* If the dependence is exact, we should have limited the VF instead. */
3303 gcc_checking_assert (DDR_COULD_BE_INDEPENDENT_P (ddr));
3304
3305 unsigned int i;
3306 lambda_vector dist_v;
3307 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
3308 {
3309 HOST_WIDE_INT dist = dist_v[loop_depth];
3310 if (dist != 0
3311 && !(dist > 0 && DDR_REVERSED_P (ddr))
3312 && maybe_lt ((unsigned HOST_WIDE_INT) abs_hwi (dist), vf))
3313 return false;
3314 }
3315
3316 if (dump_enabled_p ())
3317 dump_printf_loc (MSG_NOTE, vect_location,
3318 "dependence distance between %T and %T is >= VF\n",
3319 DR_REF (DDR_A (ddr)), DR_REF (DDR_B (ddr)));
3320
3321 return true;
3322 }
3323
3324 /* Dump LOWER_BOUND using flags DUMP_KIND. Dumps are known to be enabled. */
3325
3326 static void
3327 dump_lower_bound (dump_flags_t dump_kind, const vec_lower_bound &lower_bound)
3328 {
3329 dump_printf (dump_kind, "%s (%T) >= ",
3330 lower_bound.unsigned_p ? "unsigned" : "abs",
3331 lower_bound.expr);
3332 dump_dec (dump_kind, lower_bound.min_value);
3333 }
3334
3335 /* Record that the vectorized loop requires the vec_lower_bound described
3336 by EXPR, UNSIGNED_P and MIN_VALUE. */
3337
3338 static void
3339 vect_check_lower_bound (loop_vec_info loop_vinfo, tree expr, bool unsigned_p,
3340 poly_uint64 min_value)
3341 {
3342 vec<vec_lower_bound> lower_bounds = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
3343 for (unsigned int i = 0; i < lower_bounds.length (); ++i)
3344 if (operand_equal_p (lower_bounds[i].expr, expr, 0))
3345 {
3346 unsigned_p &= lower_bounds[i].unsigned_p;
3347 min_value = upper_bound (lower_bounds[i].min_value, min_value);
3348 if (lower_bounds[i].unsigned_p != unsigned_p
3349 || maybe_lt (lower_bounds[i].min_value, min_value))
3350 {
3351 lower_bounds[i].unsigned_p = unsigned_p;
3352 lower_bounds[i].min_value = min_value;
3353 if (dump_enabled_p ())
3354 {
3355 dump_printf_loc (MSG_NOTE, vect_location,
3356 "updating run-time check to ");
3357 dump_lower_bound (MSG_NOTE, lower_bounds[i]);
3358 dump_printf (MSG_NOTE, "\n");
3359 }
3360 }
3361 return;
3362 }
3363
3364 vec_lower_bound lower_bound (expr, unsigned_p, min_value);
3365 if (dump_enabled_p ())
3366 {
3367 dump_printf_loc (MSG_NOTE, vect_location, "need a run-time check that ");
3368 dump_lower_bound (MSG_NOTE, lower_bound);
3369 dump_printf (MSG_NOTE, "\n");
3370 }
3371 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).safe_push (lower_bound);
3372 }
3373
3374 /* Return true if it's unlikely that the step of the vectorized form of DR_INFO
3375 will span fewer than GAP bytes. */
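/* Rough illustration: with an estimated vectorization factor of 4,
   a group size of 2 and 4-byte scalar accesses, one vector iteration
   is expected to span about 4 * 2 * 4 = 32 bytes, so any GAP of up to
   32 bytes is treated as small.  */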
3376
3377 static bool
3378 vect_small_gap_p (loop_vec_info loop_vinfo, dr_vec_info *dr_info,
3379 poly_int64 gap)
3380 {
3381 stmt_vec_info stmt_info = dr_info->stmt;
3382 HOST_WIDE_INT count
3383 = estimated_poly_value (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
3384 if (DR_GROUP_FIRST_ELEMENT (stmt_info))
3385 count *= DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (stmt_info));
3386 return (estimated_poly_value (gap)
3387 <= count * vect_get_scalar_dr_size (dr_info));
3388 }
3389
3390 /* Return true if we know that there is no alias between DR_INFO_A and
3391 DR_INFO_B when abs (DR_STEP (DR_INFO_A->dr)) >= N for some N.
3392 When returning true, set *LOWER_BOUND_OUT to this N. */
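/* For illustration only: for two 4-byte accesses a[i * stride] and
   a[i * stride + 4] (equal base, offset and variable step), DR_INIT
   differs by 16 bytes, so one scalar iteration spans 16 + 4 = 20
   bytes; the function then returns true with *LOWER_BOUND_OUT set to
   20, meaning there is no alias whenever abs (DR_STEP) >= 20.  */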
3393
3394 static bool
3395 vectorizable_with_step_bound_p (dr_vec_info *dr_info_a, dr_vec_info *dr_info_b,
3396 poly_uint64 *lower_bound_out)
3397 {
3398 /* Check that there is a constant gap of known sign between DR_A
3399 and DR_B. */
3400 data_reference *dr_a = dr_info_a->dr;
3401 data_reference *dr_b = dr_info_b->dr;
3402 poly_int64 init_a, init_b;
3403 if (!operand_equal_p (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b), 0)
3404 || !operand_equal_p (DR_OFFSET (dr_a), DR_OFFSET (dr_b), 0)
3405 || !operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0)
3406 || !poly_int_tree_p (DR_INIT (dr_a), &init_a)
3407 || !poly_int_tree_p (DR_INIT (dr_b), &init_b)
3408 || !ordered_p (init_a, init_b))
3409 return false;
3410
3411 /* Sort DR_A and DR_B by the address they access. */
3412 if (maybe_lt (init_b, init_a))
3413 {
3414 std::swap (init_a, init_b);
3415 std::swap (dr_info_a, dr_info_b);
3416 std::swap (dr_a, dr_b);
3417 }
3418
3419 /* If the two accesses could be dependent within a scalar iteration,
3420 make sure that we'd retain their order. */
3421 if (maybe_gt (init_a + vect_get_scalar_dr_size (dr_info_a), init_b)
3422 && !vect_preserves_scalar_order_p (dr_info_a, dr_info_b))
3423 return false;
3424
3425 /* There is no alias if abs (DR_STEP) is greater than or equal to
3426 the bytes spanned by the combination of the two accesses. */
3427 *lower_bound_out = init_b + vect_get_scalar_dr_size (dr_info_b) - init_a;
3428 return true;
3429 }
3430
3431 /* Function vect_prune_runtime_alias_test_list.
3432
3433 Prune a list of ddrs to be tested at run-time by versioning for alias.
3434 Merge several alias checks into one if possible.
3435 Return FALSE if the resulting list of ddrs is longer than allowed by
3436 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
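/* Each check that survives the pruning below is, conceptually, a test
   that the two segments do not overlap, roughly of the form

     addr_a + seg_len_a <= addr_b || addr_b + seg_len_b <= addr_a

   with the access sizes and alignments folded in; this is only a
   sketch, the exact expression is built later when the loop is
   versioned for alias.  */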
3437
3438 opt_result
3439 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
3440 {
3441 typedef pair_hash <tree_operand_hash, tree_operand_hash> tree_pair_hash;
3442 hash_set <tree_pair_hash> compared_objects;
3443
3444 vec<ddr_p> may_alias_ddrs = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
3445 vec<dr_with_seg_len_pair_t> &comp_alias_ddrs
3446 = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3447 vec<vec_object_pair> &check_unequal_addrs
3448 = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3449 poly_uint64 vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3450 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
3451
3452 ddr_p ddr;
3453 unsigned int i;
3454 tree length_factor;
3455
3456 DUMP_VECT_SCOPE ("vect_prune_runtime_alias_test_list");
3457
3458 /* Step values are irrelevant for aliasing if the number of vector
3459 iterations is equal to the number of scalar iterations (which can
3460 happen for fully-SLP loops). */
3461 bool ignore_step_p = known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U);
3462
3463 if (!ignore_step_p)
3464 {
3465 /* Convert the checks for nonzero steps into bound tests. */
3466 tree value;
3467 FOR_EACH_VEC_ELT (LOOP_VINFO_CHECK_NONZERO (loop_vinfo), i, value)
3468 vect_check_lower_bound (loop_vinfo, value, true, 1);
3469 }
3470
3471 if (may_alias_ddrs.is_empty ())
3472 return opt_result::success ();
3473
3474 comp_alias_ddrs.create (may_alias_ddrs.length ());
3475
3476 unsigned int loop_depth
3477 = index_in_loop_nest (LOOP_VINFO_LOOP (loop_vinfo)->num,
3478 LOOP_VINFO_LOOP_NEST (loop_vinfo));
3479
3480 /* First, we collect all data ref pairs for aliasing checks. */
3481 FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
3482 {
3483 poly_uint64 lower_bound;
3484 tree segment_length_a, segment_length_b;
3485 unsigned HOST_WIDE_INT access_size_a, access_size_b;
3486 unsigned int align_a, align_b;
3487
3488 /* Ignore the alias if the VF we chose ended up being no greater
3489 than the dependence distance. */
3490 if (dependence_distance_ge_vf (ddr, loop_depth, vect_factor))
3491 continue;
3492
3493 if (DDR_OBJECT_A (ddr))
3494 {
3495 vec_object_pair new_pair (DDR_OBJECT_A (ddr), DDR_OBJECT_B (ddr));
3496 if (!compared_objects.add (new_pair))
3497 {
3498 if (dump_enabled_p ())
3499 dump_printf_loc (MSG_NOTE, vect_location,
3500 "checking that %T and %T"
3501 " have different addresses\n",
3502 new_pair.first, new_pair.second);
3503 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).safe_push (new_pair);
3504 }
3505 continue;
3506 }
3507
3508 dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (DDR_A (ddr));
3509 stmt_vec_info stmt_info_a = dr_info_a->stmt;
3510
3511 dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (DDR_B (ddr));
3512 stmt_vec_info stmt_info_b = dr_info_b->stmt;
3513
3514 bool preserves_scalar_order_p
3515 = vect_preserves_scalar_order_p (dr_info_a, dr_info_b);
3516
3517 /* Skip the pair if inter-iteration dependencies are irrelevant
3518 and intra-iteration dependencies are guaranteed to be honored. */
3519 if (ignore_step_p
3520 && (preserves_scalar_order_p
3521 || vectorizable_with_step_bound_p (dr_info_a, dr_info_b,
3522 &lower_bound)))
3523 {
3524 if (dump_enabled_p ())
3525 dump_printf_loc (MSG_NOTE, vect_location,
3526 "no need for alias check between "
3527 "%T and %T when VF is 1\n",
3528 DR_REF (dr_info_a->dr), DR_REF (dr_info_b->dr));
3529 continue;
3530 }
3531
3532 /* See whether we can handle the alias using a bounds check on
3533 the step, and whether that's likely to be the best approach.
3534 (It might not be, for example, if the minimum step is much larger
3535 than the number of bytes handled by one vector iteration.) */
3536 if (!ignore_step_p
3537 && TREE_CODE (DR_STEP (dr_info_a->dr)) != INTEGER_CST
3538 && vectorizable_with_step_bound_p (dr_info_a, dr_info_b,
3539 &lower_bound)
3540 && (vect_small_gap_p (loop_vinfo, dr_info_a, lower_bound)
3541 || vect_small_gap_p (loop_vinfo, dr_info_b, lower_bound)))
3542 {
3543 bool unsigned_p = dr_known_forward_stride_p (dr_info_a->dr);
3544 if (dump_enabled_p ())
3545 {
3546 dump_printf_loc (MSG_NOTE, vect_location, "no alias between "
3547 "%T and %T when the step %T is outside ",
3548 DR_REF (dr_info_a->dr),
3549 DR_REF (dr_info_b->dr),
3550 DR_STEP (dr_info_a->dr));
3551 if (unsigned_p)
3552 dump_printf (MSG_NOTE, "[0");
3553 else
3554 {
3555 dump_printf (MSG_NOTE, "(");
3556 dump_dec (MSG_NOTE, poly_int64 (-lower_bound));
3557 }
3558 dump_printf (MSG_NOTE, ", ");
3559 dump_dec (MSG_NOTE, lower_bound);
3560 dump_printf (MSG_NOTE, ")\n");
3561 }
3562 vect_check_lower_bound (loop_vinfo, DR_STEP (dr_info_a->dr),
3563 unsigned_p, lower_bound);
3564 continue;
3565 }
3566
3567 stmt_vec_info dr_group_first_a = DR_GROUP_FIRST_ELEMENT (stmt_info_a);
3568 if (dr_group_first_a)
3569 {
3570 stmt_info_a = dr_group_first_a;
3571 dr_info_a = STMT_VINFO_DR_INFO (stmt_info_a);
3572 }
3573
3574 stmt_vec_info dr_group_first_b = DR_GROUP_FIRST_ELEMENT (stmt_info_b);
3575 if (dr_group_first_b)
3576 {
3577 stmt_info_b = dr_group_first_b;
3578 dr_info_b = STMT_VINFO_DR_INFO (stmt_info_b);
3579 }
3580
3581 if (ignore_step_p)
3582 {
3583 segment_length_a = size_zero_node;
3584 segment_length_b = size_zero_node;
3585 }
3586 else
3587 {
3588 if (!operand_equal_p (DR_STEP (dr_info_a->dr),
3589 DR_STEP (dr_info_b->dr), 0))
3590 length_factor = scalar_loop_iters;
3591 else
3592 length_factor = size_int (vect_factor);
3593 segment_length_a = vect_vfa_segment_size (dr_info_a, length_factor);
3594 segment_length_b = vect_vfa_segment_size (dr_info_b, length_factor);
3595 }
3596 access_size_a = vect_vfa_access_size (loop_vinfo, dr_info_a);
3597 access_size_b = vect_vfa_access_size (loop_vinfo, dr_info_b);
3598 align_a = vect_vfa_align (dr_info_a);
3599 align_b = vect_vfa_align (dr_info_b);
3600
3601 /* See whether the alias is known at compilation time. */
3602 if (operand_equal_p (DR_BASE_ADDRESS (dr_info_a->dr),
3603 DR_BASE_ADDRESS (dr_info_b->dr), 0)
3604 && operand_equal_p (DR_OFFSET (dr_info_a->dr),
3605 DR_OFFSET (dr_info_b->dr), 0)
3606 && TREE_CODE (DR_STEP (dr_info_a->dr)) == INTEGER_CST
3607 && TREE_CODE (DR_STEP (dr_info_b->dr)) == INTEGER_CST
3608 && poly_int_tree_p (segment_length_a)
3609 && poly_int_tree_p (segment_length_b))
3610 {
3611 int res = vect_compile_time_alias (dr_info_a, dr_info_b,
3612 segment_length_a,
3613 segment_length_b,
3614 access_size_a,
3615 access_size_b);
3616 if (res >= 0 && dump_enabled_p ())
3617 {
3618 dump_printf_loc (MSG_NOTE, vect_location,
3619 "can tell at compile time that %T and %T",
3620 DR_REF (dr_info_a->dr), DR_REF (dr_info_b->dr));
3621 if (res == 0)
3622 dump_printf (MSG_NOTE, " do not alias\n");
3623 else
3624 dump_printf (MSG_NOTE, " alias\n");
3625 }
3626
3627 if (res == 0)
3628 continue;
3629
3630 if (res == 1)
3631 return opt_result::failure_at (stmt_info_b->stmt,
3632 "not vectorized:"
3633 " compilation time alias: %G%G",
3634 stmt_info_a->stmt,
3635 stmt_info_b->stmt);
3636 }
3637
3638 dr_with_seg_len dr_a (dr_info_a->dr, segment_length_a,
3639 access_size_a, align_a);
3640 dr_with_seg_len dr_b (dr_info_b->dr, segment_length_b,
3641 access_size_b, align_b);
3642 /* Canonicalize the order to be the one that's needed for accurate
3643 RAW, WAR and WAW flags, in cases where the data references are
3644 well-ordered. The order doesn't really matter otherwise,
3645 but we might as well be consistent. */
3646 if (get_later_stmt (stmt_info_a, stmt_info_b) == stmt_info_a)
3647 std::swap (dr_a, dr_b);
3648
3649 dr_with_seg_len_pair_t dr_with_seg_len_pair
3650 (dr_a, dr_b, (preserves_scalar_order_p
3651 ? dr_with_seg_len_pair_t::WELL_ORDERED
3652 : dr_with_seg_len_pair_t::REORDERED));
3653
3654 comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
3655 }
3656
3657 prune_runtime_alias_test_list (&comp_alias_ddrs, vect_factor);
3658
3659 unsigned int count = (comp_alias_ddrs.length ()
3660 + check_unequal_addrs.length ());
3661
3662 if (dump_enabled_p ())
3663 dump_printf_loc (MSG_NOTE, vect_location,
3664 "improved number of alias checks from %d to %d\n",
3665 may_alias_ddrs.length (), count);
3666 unsigned limit = param_vect_max_version_for_alias_checks;
3667 if (flag_simd_cost_model == VECT_COST_MODEL_CHEAP)
3668 limit = param_vect_max_version_for_alias_checks * 6 / 10;
3669 if (count > limit)
3670 return opt_result::failure_at
3671 (vect_location,
3672 "number of versioning for alias run-time tests exceeds %d "
3673 "(--param vect-max-version-for-alias-checks)\n", limit);
3674
3675 return opt_result::success ();
3676 }
3677
3678 /* Check whether we can use an internal function for a gather load
3679 or scatter store. READ_P is true for loads and false for stores.
3680 MASKED_P is true if the load or store is conditional. MEMORY_TYPE is
3681 the type of the memory elements being loaded or stored. OFFSET_TYPE
3682 is the type of the offset that is being applied to the invariant
3683 base address. SCALE is the amount by which the offset should
3684 be multiplied *after* it has been converted to address width.
3685
3686 Return true if the function is supported, storing the function id in
3687 *IFN_OUT and the vector type for the offset in *OFFSET_VECTYPE_OUT. */
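/* For example: starting from a 16-bit OFFSET_TYPE, the loop below
   first asks the target about a gather/scatter with a 16-bit offset
   vector; if that is not supported it retries with a 32-bit and then
   a 64-bit offset type, and gives up once the offset type is at least
   as wide as both a pointer and the vector elements.  */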
3688
3689 bool
3690 vect_gather_scatter_fn_p (vec_info *vinfo, bool read_p, bool masked_p,
3691 tree vectype, tree memory_type, tree offset_type,
3692 int scale, internal_fn *ifn_out,
3693 tree *offset_vectype_out)
3694 {
3695 unsigned int memory_bits = tree_to_uhwi (TYPE_SIZE (memory_type));
3696 unsigned int element_bits = vector_element_bits (vectype);
3697 if (element_bits != memory_bits)
3698 /* For now the vector elements must be the same width as the
3699 memory elements. */
3700 return false;
3701
3702 /* Work out which function we need. */
3703 internal_fn ifn;
3704 if (read_p)
3705 ifn = masked_p ? IFN_MASK_GATHER_LOAD : IFN_GATHER_LOAD;
3706 else
3707 ifn = masked_p ? IFN_MASK_SCATTER_STORE : IFN_SCATTER_STORE;
3708
3709 for (;;)
3710 {
3711 tree offset_vectype = get_vectype_for_scalar_type (vinfo, offset_type);
3712 if (!offset_vectype)
3713 return false;
3714
3715 /* Test whether the target supports this combination. */
3716 if (internal_gather_scatter_fn_supported_p (ifn, vectype, memory_type,
3717 offset_vectype, scale))
3718 {
3719 *ifn_out = ifn;
3720 *offset_vectype_out = offset_vectype;
3721 return true;
3722 }
3723
3724 if (TYPE_PRECISION (offset_type) >= POINTER_SIZE
3725 && TYPE_PRECISION (offset_type) >= element_bits)
3726 return false;
3727
3728 offset_type = build_nonstandard_integer_type
3729 (TYPE_PRECISION (offset_type) * 2, TYPE_UNSIGNED (offset_type));
3730 }
3731 }
3732
3733 /* STMT_INFO is a call to an internal gather load or scatter store function.
3734 Describe the operation in INFO. */
3735
3736 static void
3737 vect_describe_gather_scatter_call (stmt_vec_info stmt_info,
3738 gather_scatter_info *info)
3739 {
3740 gcall *call = as_a <gcall *> (stmt_info->stmt);
3741 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3742 data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3743
3744 info->ifn = gimple_call_internal_fn (call);
3745 info->decl = NULL_TREE;
3746 info->base = gimple_call_arg (call, 0);
3747 info->offset = gimple_call_arg (call, 1);
3748 info->offset_dt = vect_unknown_def_type;
3749 info->offset_vectype = NULL_TREE;
3750 info->scale = TREE_INT_CST_LOW (gimple_call_arg (call, 2));
3751 info->element_type = TREE_TYPE (vectype);
3752 info->memory_type = TREE_TYPE (DR_REF (dr));
3753 }
3754
3755 /* Return true if a non-affine read or write in STMT_INFO is suitable for a
3756 gather load or scatter store. Describe the operation in *INFO if so. */
3757
3758 bool
3759 vect_check_gather_scatter (stmt_vec_info stmt_info, loop_vec_info loop_vinfo,
3760 gather_scatter_info *info)
3761 {
3762 HOST_WIDE_INT scale = 1;
3763 poly_int64 pbitpos, pbitsize;
3764 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3765 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3766 tree offtype = NULL_TREE;
3767 tree decl = NULL_TREE, base, off;
3768 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3769 tree memory_type = TREE_TYPE (DR_REF (dr));
3770 machine_mode pmode;
3771 int punsignedp, reversep, pvolatilep = 0;
3772 internal_fn ifn;
3773 tree offset_vectype;
3774 bool masked_p = false;
3775
3776 /* See whether this is already a call to a gather/scatter internal function.
3777 If not, see whether it's a masked load or store. */
3778 gcall *call = dyn_cast <gcall *> (stmt_info->stmt);
3779 if (call && gimple_call_internal_p (call))
3780 {
3781 ifn = gimple_call_internal_fn (call);
3782 if (internal_gather_scatter_fn_p (ifn))
3783 {
3784 vect_describe_gather_scatter_call (stmt_info, info);
3785 return true;
3786 }
3787 masked_p = (ifn == IFN_MASK_LOAD || ifn == IFN_MASK_STORE);
3788 }
3789
3790 /* True if we should aim to use internal functions rather than
3791 built-in functions. */
3792 bool use_ifn_p = (DR_IS_READ (dr)
3793 ? supports_vec_gather_load_p ()
3794 : supports_vec_scatter_store_p ());
3795
3796 base = DR_REF (dr);
3797 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF;
3798 see if we can use the def stmt of the address. */
3799 if (masked_p
3800 && TREE_CODE (base) == MEM_REF
3801 && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
3802 && integer_zerop (TREE_OPERAND (base, 1))
3803 && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
3804 {
3805 gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
3806 if (is_gimple_assign (def_stmt)
3807 && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
3808 base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
3809 }
3810
3811 /* The gather and scatter builtins need address of the form
3812 loop_invariant + vector * {1, 2, 4, 8}
3813 or
3814 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3815 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3816 of loop invariants/SSA_NAMEs defined in the loop, with casts,
3817 multiplications and additions in it. To get a vector, we need
3818 a single SSA_NAME that will be defined in the loop and will
3819 contain everything that is not loop invariant and that can be
3820 vectorized. The following code attempts to find such a preexisting
3821 SSA_NAME OFF and put the loop invariants into a tree BASE
3822 that can be gimplified before the loop. */
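/* For instance (illustrative only, assuming the target supports the
   scale): for a gather-like load a[b[i]] from an int array, the code
   below ends up with BASE = &a, OFF = the SSA_NAME holding b[i]
   (defined in the loop) and SCALE = 4, matching the
   loop_invariant + vector * scale form described above.  */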
3823 base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
3824 &punsignedp, &reversep, &pvolatilep);
3825 if (reversep)
3826 return false;
3827
3828 poly_int64 pbytepos = exact_div (pbitpos, BITS_PER_UNIT);
3829
3830 if (TREE_CODE (base) == MEM_REF)
3831 {
3832 if (!integer_zerop (TREE_OPERAND (base, 1)))
3833 {
3834 if (off == NULL_TREE)
3835 off = wide_int_to_tree (sizetype, mem_ref_offset (base));
3836 else
3837 off = size_binop (PLUS_EXPR, off,
3838 fold_convert (sizetype, TREE_OPERAND (base, 1)));
3839 }
3840 base = TREE_OPERAND (base, 0);
3841 }
3842 else
3843 base = build_fold_addr_expr (base);
3844
3845 if (off == NULL_TREE)
3846 off = size_zero_node;
3847
3848 /* If BASE is not loop invariant, then if OFF is 0 we start with just
3849 the constant offset in the loop-invariant BASE and continue with BASE
3850 as OFF; otherwise give up.
3851 We could handle that case by gimplifying the addition of BASE + OFF
3852 into some SSA_NAME and using that as OFF, but for now punt. */
3853 if (!expr_invariant_in_loop_p (loop, base))
3854 {
3855 if (!integer_zerop (off))
3856 return false;
3857 off = base;
3858 base = size_int (pbytepos);
3859 }
3860 /* Otherwise put base + constant offset into the loop invariant BASE
3861 and continue with OFF. */
3862 else
3863 {
3864 base = fold_convert (sizetype, base);
3865 base = size_binop (PLUS_EXPR, base, size_int (pbytepos));
3866 }
3867
3868 /* OFF at this point may be either a SSA_NAME or some tree expression
3869 from get_inner_reference. Try to peel off loop invariants from it
3870 into BASE as long as possible. */
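/* E.g. (illustrative only, assuming the target accepts the resulting
   scale): if OFF is ((sizetype) j_3 + 4) * 8, the MULT_EXPR case
   records SCALE = 8, the PLUS_EXPR case moves the invariant 4 (scaled
   to 32 bytes) into BASE, and the conversion is then stripped,
   leaving OFF = j_3 with SCALE = 8.  */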
3871 STRIP_NOPS (off);
3872 while (offtype == NULL_TREE)
3873 {
3874 enum tree_code code;
3875 tree op0, op1, add = NULL_TREE;
3876
3877 if (TREE_CODE (off) == SSA_NAME)
3878 {
3879 gimple *def_stmt = SSA_NAME_DEF_STMT (off);
3880
3881 if (expr_invariant_in_loop_p (loop, off))
3882 return false;
3883
3884 if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
3885 break;
3886
3887 op0 = gimple_assign_rhs1 (def_stmt);
3888 code = gimple_assign_rhs_code (def_stmt);
3889 op1 = gimple_assign_rhs2 (def_stmt);
3890 }
3891 else
3892 {
3893 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
3894 return false;
3895 code = TREE_CODE (off);
3896 extract_ops_from_tree (off, &code, &op0, &op1);
3897 }
3898 switch (code)
3899 {
3900 case POINTER_PLUS_EXPR:
3901 case PLUS_EXPR:
3902 if (expr_invariant_in_loop_p (loop, op0))
3903 {
3904 add = op0;
3905 off = op1;
3906 do_add:
3907 add = fold_convert (sizetype, add);
3908 if (scale != 1)
3909 add = size_binop (MULT_EXPR, add, size_int (scale));
3910 base = size_binop (PLUS_EXPR, base, add);
3911 continue;
3912 }
3913 if (expr_invariant_in_loop_p (loop, op1))
3914 {
3915 add = op1;
3916 off = op0;
3917 goto do_add;
3918 }
3919 break;
3920 case MINUS_EXPR:
3921 if (expr_invariant_in_loop_p (loop, op1))
3922 {
3923 add = fold_convert (sizetype, op1);
3924 add = size_binop (MINUS_EXPR, size_zero_node, add);
3925 off = op0;
3926 goto do_add;
3927 }
3928 break;
3929 case MULT_EXPR:
3930 if (scale == 1 && tree_fits_shwi_p (op1))
3931 {
3932 int new_scale = tree_to_shwi (op1);
3933 /* Only treat this as a scaling operation if the target
3934 supports it for at least some offset type. */
3935 if (use_ifn_p
3936 && !vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
3937 masked_p, vectype, memory_type,
3938 signed_char_type_node,
3939 new_scale, &ifn,
3940 &offset_vectype)
3941 && !vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
3942 masked_p, vectype, memory_type,
3943 unsigned_char_type_node,
3944 new_scale, &ifn,
3945 &offset_vectype))
3946 break;
3947 scale = new_scale;
3948 off = op0;
3949 continue;
3950 }
3951 break;
3952 case SSA_NAME:
3953 off = op0;
3954 continue;
3955 CASE_CONVERT:
3956 if (!POINTER_TYPE_P (TREE_TYPE (op0))
3957 && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3958 break;
3959
3960 /* Don't include the conversion if the target is happy with
3961 the current offset type. */
3962 if (use_ifn_p
3963 && vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
3964 masked_p, vectype, memory_type,
3965 TREE_TYPE (off), scale, &ifn,
3966 &offset_vectype))
3967 break;
3968
3969 if (TYPE_PRECISION (TREE_TYPE (op0))
3970 == TYPE_PRECISION (TREE_TYPE (off)))
3971 {
3972 off = op0;
3973 continue;
3974 }
3975
3976 if (TYPE_PRECISION (TREE_TYPE (op0))
3977 < TYPE_PRECISION (TREE_TYPE (off)))
3978 {
3979 off = op0;
3980 offtype = TREE_TYPE (off);
3981 STRIP_NOPS (off);
3982 continue;
3983 }
3984 break;
3985 default:
3986 break;
3987 }
3988 break;
3989 }
3990
3991 /* If at the end OFF still isn't a SSA_NAME or isn't
3992 defined in the loop, punt. */
3993 if (TREE_CODE (off) != SSA_NAME
3994 || expr_invariant_in_loop_p (loop, off))
3995 return false;
3996
3997 if (offtype == NULL_TREE)
3998 offtype = TREE_TYPE (off);
3999
4000 if (use_ifn_p)
4001 {
4002 if (!vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr), masked_p,
4003 vectype, memory_type, offtype, scale,
4004 &ifn, &offset_vectype))
4005 return false;
4006 }
4007 else
4008 {
4009 if (DR_IS_READ (dr))
4010 {
4011 if (targetm.vectorize.builtin_gather)
4012 decl = targetm.vectorize.builtin_gather (vectype, offtype, scale);
4013 }
4014 else
4015 {
4016 if (targetm.vectorize.builtin_scatter)
4017 decl = targetm.vectorize.builtin_scatter (vectype, offtype, scale);
4018 }
4019
4020 if (!decl)
4021 return false;
4022
4023 ifn = IFN_LAST;
4024 /* The offset vector type will be read from DECL when needed. */
4025 offset_vectype = NULL_TREE;
4026 }
4027
4028 info->ifn = ifn;
4029 info->decl = decl;
4030 info->base = base;
4031 info->offset = off;
4032 info->offset_dt = vect_unknown_def_type;
4033 info->offset_vectype = offset_vectype;
4034 info->scale = scale;
4035 info->element_type = TREE_TYPE (vectype);
4036 info->memory_type = memory_type;
4037 return true;
4038 }
4039
4040 /* Find the data references in STMT, analyze them with respect to LOOP and
4041 append them to DATAREFS. Return false if datarefs in this stmt cannot
4042 be handled. */
4043
4044 opt_result
4045 vect_find_stmt_data_reference (loop_p loop, gimple *stmt,
4046 vec<data_reference_p> *datarefs)
4047 {
4048 /* We can ignore clobbers for dataref analysis - they are removed during
4049 loop vectorization and BB vectorization checks dependences with a
4050 stmt walk. */
4051 if (gimple_clobber_p (stmt))
4052 return opt_result::success ();
4053
4054 if (gimple_has_volatile_ops (stmt))
4055 return opt_result::failure_at (stmt, "not vectorized: volatile type: %G",
4056 stmt);
4057
4058 if (stmt_can_throw_internal (cfun, stmt))
4059 return opt_result::failure_at (stmt,
4060 "not vectorized:"
4061 " statement can throw an exception: %G",
4062 stmt);
4063
4064 auto_vec<data_reference_p, 2> refs;
4065 opt_result res = find_data_references_in_stmt (loop, stmt, &refs);
4066 if (!res)
4067 return res;
4068
4069 if (refs.is_empty ())
4070 return opt_result::success ();
4071
4072 if (refs.length () > 1)
4073 return opt_result::failure_at (stmt,
4074 "not vectorized:"
4075 " more than one data ref in stmt: %G", stmt);
4076
4077 if (gcall *call = dyn_cast <gcall *> (stmt))
4078 if (!gimple_call_internal_p (call)
4079 || (gimple_call_internal_fn (call) != IFN_MASK_LOAD
4080 && gimple_call_internal_fn (call) != IFN_MASK_STORE))
4081 return opt_result::failure_at (stmt,
4082 "not vectorized: dr in a call %G", stmt);
4083
4084 data_reference_p dr = refs.pop ();
4085 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
4086 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
4087 return opt_result::failure_at (stmt,
4088 "not vectorized:"
4089 " statement is bitfield access %G", stmt);
4090
4091 if (DR_BASE_ADDRESS (dr)
4092 && TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
4093 return opt_result::failure_at (stmt,
4094 "not vectorized:"
4095 " base addr of dr is a constant\n");
4096
4097 /* Check whether this may be a SIMD lane access and adjust the
4098 DR to make it easier for us to handle it. */
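/* Roughly (illustrative only): for an access like array[off_7] inside
   an "omp simd" loop, where off_7 is ultimately defined by a call to
   the internal function GOMP_SIMD_LANE for loop->simduid, the DR is
   rewritten below to use &array as the base, a zero offset and the
   element size as the step, and is marked as a simd-lane access.  */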
4099 if (loop
4100 && loop->simduid
4101 && (!DR_BASE_ADDRESS (dr)
4102 || !DR_OFFSET (dr)
4103 || !DR_INIT (dr)
4104 || !DR_STEP (dr)))
4105 {
4106 struct data_reference *newdr
4107 = create_data_ref (NULL, loop_containing_stmt (stmt), DR_REF (dr), stmt,
4108 DR_IS_READ (dr), DR_IS_CONDITIONAL_IN_STMT (dr));
4109 if (DR_BASE_ADDRESS (newdr)
4110 && DR_OFFSET (newdr)
4111 && DR_INIT (newdr)
4112 && DR_STEP (newdr)
4113 && TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
4114 && integer_zerop (DR_STEP (newdr)))
4115 {
4116 tree base_address = DR_BASE_ADDRESS (newdr);
4117 tree off = DR_OFFSET (newdr);
4118 tree step = ssize_int (1);
4119 if (integer_zerop (off)
4120 && TREE_CODE (base_address) == POINTER_PLUS_EXPR)
4121 {
4122 off = TREE_OPERAND (base_address, 1);
4123 base_address = TREE_OPERAND (base_address, 0);
4124 }
4125 STRIP_NOPS (off);
4126 if (TREE_CODE (off) == MULT_EXPR
4127 && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
4128 {
4129 step = TREE_OPERAND (off, 1);
4130 off = TREE_OPERAND (off, 0);
4131 STRIP_NOPS (off);
4132 }
4133 if (CONVERT_EXPR_P (off)
4134 && (TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off, 0)))
4135 < TYPE_PRECISION (TREE_TYPE (off))))
4136 off = TREE_OPERAND (off, 0);
4137 if (TREE_CODE (off) == SSA_NAME)
4138 {
4139 gimple *def = SSA_NAME_DEF_STMT (off);
4140 /* Look through widening conversion. */
4141 if (is_gimple_assign (def)
4142 && CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (def)))
4143 {
4144 tree rhs1 = gimple_assign_rhs1 (def);
4145 if (TREE_CODE (rhs1) == SSA_NAME
4146 && INTEGRAL_TYPE_P (TREE_TYPE (rhs1))
4147 && (TYPE_PRECISION (TREE_TYPE (off))
4148 > TYPE_PRECISION (TREE_TYPE (rhs1))))
4149 def = SSA_NAME_DEF_STMT (rhs1);
4150 }
4151 if (is_gimple_call (def)
4152 && gimple_call_internal_p (def)
4153 && (gimple_call_internal_fn (def) == IFN_GOMP_SIMD_LANE))
4154 {
4155 tree arg = gimple_call_arg (def, 0);
4156 tree reft = TREE_TYPE (DR_REF (newdr));
4157 gcc_assert (TREE_CODE (arg) == SSA_NAME);
4158 arg = SSA_NAME_VAR (arg);
4159 if (arg == loop->simduid
4160 /* For now. */
4161 && tree_int_cst_equal (TYPE_SIZE_UNIT (reft), step))
4162 {
4163 DR_BASE_ADDRESS (newdr) = base_address;
4164 DR_OFFSET (newdr) = ssize_int (0);
4165 DR_STEP (newdr) = step;
4166 DR_OFFSET_ALIGNMENT (newdr) = BIGGEST_ALIGNMENT;
4167 DR_STEP_ALIGNMENT (newdr) = highest_pow2_factor (step);
4168 /* Mark as simd-lane access. */
4169 tree arg2 = gimple_call_arg (def, 1);
4170 newdr->aux = (void *) (-1 - tree_to_uhwi (arg2));
4171 free_data_ref (dr);
4172 datarefs->safe_push (newdr);
4173 return opt_result::success ();
4174 }
4175 }
4176 }
4177 }
4178 free_data_ref (newdr);
4179 }
4180
4181 datarefs->safe_push (dr);
4182 return opt_result::success ();
4183 }
4184
4185 /* Function vect_analyze_data_refs.
4186
4187 Find all the data references in the loop or basic block.
4188
4189 The general structure of the analysis of data refs in the vectorizer is as
4190 follows:
4191 1- vect_analyze_data_refs(loop/bb): call
4192 compute_data_dependences_for_loop/bb to find and analyze all data-refs
4193 in the loop/bb and their dependences.
4194 2- vect_analyze_data_ref_dependences(): apply dependence testing using ddrs.
4195 3- vect_analyze_data_refs_alignment(): check that ref_stmt.alignment is ok.
4196 4- vect_analyze_data_ref_accesses(): check that ref_stmt.step is ok.
4197
4198 */
4199
4200 opt_result
4201 vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf, bool *fatal)
4202 {
4203 class loop *loop = NULL;
4204 unsigned int i;
4205 struct data_reference *dr;
4206 tree scalar_type;
4207
4208 DUMP_VECT_SCOPE ("vect_analyze_data_refs");
4209
4210 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
4211 loop = LOOP_VINFO_LOOP (loop_vinfo);
4212
4213 /* Go through the data-refs, check that the analysis succeeded. Update
4214 pointer from stmt_vec_info struct to DR and vectype. */
4215
4216 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
4217 FOR_EACH_VEC_ELT (datarefs, i, dr)
4218 {
4219 enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
4220 poly_uint64 vf;
4221
4222 gcc_assert (DR_REF (dr));
4223 stmt_vec_info stmt_info = vinfo->lookup_stmt (DR_STMT (dr));
4224 gcc_assert (!stmt_info->dr_aux.dr);
4225 stmt_info->dr_aux.dr = dr;
4226 stmt_info->dr_aux.stmt = stmt_info;
4227
4228 /* Check that analysis of the data-ref succeeded. */
4229 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
4230 || !DR_STEP (dr))
4231 {
4232 bool maybe_gather
4233 = DR_IS_READ (dr)
4234 && !TREE_THIS_VOLATILE (DR_REF (dr))
4235 && (targetm.vectorize.builtin_gather != NULL
4236 || supports_vec_gather_load_p ());
4237 bool maybe_scatter
4238 = DR_IS_WRITE (dr)
4239 && !TREE_THIS_VOLATILE (DR_REF (dr))
4240 && (targetm.vectorize.builtin_scatter != NULL
4241 || supports_vec_scatter_store_p ());
4242
4243 /* If the target supports vector gather loads or scatter stores,
4244 see whether they can be used. */
4245 if (is_a <loop_vec_info> (vinfo)
4246 && !nested_in_vect_loop_p (loop, stmt_info))
4247 {
4248 if (maybe_gather || maybe_scatter)
4249 {
4250 if (maybe_gather)
4251 gatherscatter = GATHER;
4252 else
4253 gatherscatter = SCATTER;
4254 }
4255 }
4256
4257 if (gatherscatter == SG_NONE)
4258 {
4259 if (dump_enabled_p ())
4260 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4261 "not vectorized: data ref analysis "
4262 "failed %G", stmt_info->stmt);
4263 if (is_a <bb_vec_info> (vinfo))
4264 {
4265 /* In BB vectorization the ref can still participate
4266 in dependence analysis, we just can't vectorize it. */
4267 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4268 continue;
4269 }
4270 return opt_result::failure_at (stmt_info->stmt,
4271 "not vectorized:"
4272 " data ref analysis failed: %G",
4273 stmt_info->stmt);
4274 }
4275 }
4276
4277 /* See if this was detected as SIMD lane access. */
4278 if (dr->aux == (void *)-1
4279 || dr->aux == (void *)-2
4280 || dr->aux == (void *)-3
4281 || dr->aux == (void *)-4)
4282 {
4283 if (nested_in_vect_loop_p (loop, stmt_info))
4284 return opt_result::failure_at (stmt_info->stmt,
4285 "not vectorized:"
4286 " data ref analysis failed: %G",
4287 stmt_info->stmt);
4288 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info)
4289 = -(uintptr_t) dr->aux;
4290 }
4291
4292 tree base = get_base_address (DR_REF (dr));
4293 if (base && VAR_P (base) && DECL_NONALIASED (base))
4294 {
4295 if (dump_enabled_p ())
4296 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4297 "not vectorized: base object not addressable "
4298 "for stmt: %G", stmt_info->stmt);
4299 if (is_a <bb_vec_info> (vinfo))
4300 {
4301 /* In BB vectorization the ref can still participate
4302 in dependence analysis, we just can't vectorize it. */
4303 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4304 continue;
4305 }
4306 return opt_result::failure_at (stmt_info->stmt,
4307 "not vectorized: base object not"
4308 " addressable for stmt: %G",
4309 stmt_info->stmt);
4310 }
4311
4312 if (is_a <loop_vec_info> (vinfo)
4313 && DR_STEP (dr)
4314 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
4315 {
4316 if (nested_in_vect_loop_p (loop, stmt_info))
4317 return opt_result::failure_at (stmt_info->stmt,
4318 "not vectorized: "
4319 "not suitable for strided load %G",
4320 stmt_info->stmt);
4321 STMT_VINFO_STRIDED_P (stmt_info) = true;
4322 }
4323
4324 /* Update DR field in stmt_vec_info struct. */
4325
4326 /* If the dataref is in an inner-loop of the loop that is considered
4327 for vectorization, we also want to analyze the access relative to
4328 the outer-loop (DR contains information only relative to the
4329 inner-most enclosing loop). We do that by building a reference to the
4330 first location accessed by the inner-loop, and analyze it relative to
4331 the outer-loop. */
4332 if (loop && nested_in_vect_loop_p (loop, stmt_info))
4333 {
4334 /* Build a reference to the first location accessed by the
4335 inner loop: *(BASE + INIT + OFFSET). By construction,
4336 this address must be invariant in the inner loop, so we
4337 can consider it as being used in the outer loop. */
4338 tree base = unshare_expr (DR_BASE_ADDRESS (dr));
4339 tree offset = unshare_expr (DR_OFFSET (dr));
4340 tree init = unshare_expr (DR_INIT (dr));
4341 tree init_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset),
4342 init, offset);
4343 tree init_addr = fold_build_pointer_plus (base, init_offset);
4344 tree init_ref = build_fold_indirect_ref (init_addr);
4345
4346 if (dump_enabled_p ())
4347 dump_printf_loc (MSG_NOTE, vect_location,
4348 "analyze in outer loop: %T\n", init_ref);
4349
4350 opt_result res
4351 = dr_analyze_innermost (&STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info),
4352 init_ref, loop, stmt_info->stmt);
4353 if (!res)
4354 /* dr_analyze_innermost already explained the failure. */
4355 return res;
4356
4357 if (dump_enabled_p ())
4358 dump_printf_loc (MSG_NOTE, vect_location,
4359 "\touter base_address: %T\n"
4360 "\touter offset from base address: %T\n"
4361 "\touter constant offset from base address: %T\n"
4362 "\touter step: %T\n"
4363 "\touter base alignment: %d\n\n"
4364 "\touter base misalignment: %d\n"
4365 "\touter offset alignment: %d\n"
4366 "\touter step alignment: %d\n",
4367 STMT_VINFO_DR_BASE_ADDRESS (stmt_info),
4368 STMT_VINFO_DR_OFFSET (stmt_info),
4369 STMT_VINFO_DR_INIT (stmt_info),
4370 STMT_VINFO_DR_STEP (stmt_info),
4371 STMT_VINFO_DR_BASE_ALIGNMENT (stmt_info),
4372 STMT_VINFO_DR_BASE_MISALIGNMENT (stmt_info),
4373 STMT_VINFO_DR_OFFSET_ALIGNMENT (stmt_info),
4374 STMT_VINFO_DR_STEP_ALIGNMENT (stmt_info));
4375 }
4376
4377 /* Set vectype for STMT. */
4378 scalar_type = TREE_TYPE (DR_REF (dr));
4379 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type);
4380 if (!vectype)
4381 {
4382 if (dump_enabled_p ())
4383 {
4384 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4385 "not vectorized: no vectype for stmt: %G",
4386 stmt_info->stmt);
4387 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
4388 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
4389 scalar_type);
4390 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
4391 }
4392
4393 if (is_a <bb_vec_info> (vinfo))
4394 {
4395 /* No vector type is fine, the ref can still participate
4396 in dependence analysis, we just can't vectorize it. */
4397 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4398 continue;
4399 }
4400 if (fatal)
4401 *fatal = false;
4402 return opt_result::failure_at (stmt_info->stmt,
4403 "not vectorized:"
4404 " no vectype for stmt: %G"
4405 " scalar_type: %T\n",
4406 stmt_info->stmt, scalar_type);
4407 }
4408 else
4409 {
4410 if (dump_enabled_p ())
4411 dump_printf_loc (MSG_NOTE, vect_location,
4412 "got vectype for stmt: %G%T\n",
4413 stmt_info->stmt, vectype);
4414 }
4415
4416 /* Adjust the minimal vectorization factor according to the
4417 vector type. */
4418 vf = TYPE_VECTOR_SUBPARTS (vectype);
4419 *min_vf = upper_bound (*min_vf, vf);
4420
4421 /* Leave the BB vectorizer to pick the vector type later, based on
4422 the final dataref group size and SLP node size. */
4423 if (is_a <loop_vec_info> (vinfo))
4424 STMT_VINFO_VECTYPE (stmt_info) = vectype;
4425
4426 if (gatherscatter != SG_NONE)
4427 {
4428 gather_scatter_info gs_info;
4429 if (!vect_check_gather_scatter (stmt_info,
4430 as_a <loop_vec_info> (vinfo),
4431 &gs_info)
4432 || !get_vectype_for_scalar_type (vinfo,
4433 TREE_TYPE (gs_info.offset)))
4434 {
4435 if (fatal)
4436 *fatal = false;
4437 return opt_result::failure_at
4438 (stmt_info->stmt,
4439 (gatherscatter == GATHER)
4440 ? "not vectorized: not suitable for gather load %G"
4441 : "not vectorized: not suitable for scatter store %G",
4442 stmt_info->stmt);
4443 }
4444 STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
4445 }
4446 }
4447
4448 /* We used to stop processing and prune the list here. Verify we no
4449 longer need to. */
4450 gcc_assert (i == datarefs.length ());
4451
4452 return opt_result::success ();
4453 }
4454
4455
4456 /* Function vect_get_new_vect_var.
4457
4458 Returns a new variable. The name of the variable is formed from the
4459 prefix "vect", "stmp", "mask" or "vectp" (depending on the value of
4460 VAR_KIND), followed by "_" and NAME if NAME is provided, or just the
4461 prefix otherwise. */
4462
4463 tree
4464 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
4465 {
4466 const char *prefix;
4467 tree new_vect_var;
4468
4469 switch (var_kind)
4470 {
4471 case vect_simple_var:
4472 prefix = "vect";
4473 break;
4474 case vect_scalar_var:
4475 prefix = "stmp";
4476 break;
4477 case vect_mask_var:
4478 prefix = "mask";
4479 break;
4480 case vect_pointer_var:
4481 prefix = "vectp";
4482 break;
4483 default:
4484 gcc_unreachable ();
4485 }
4486
4487 if (name)
4488 {
4489 char* tmp = concat (prefix, "_", name, NULL);
4490 new_vect_var = create_tmp_reg (type, tmp);
4491 free (tmp);
4492 }
4493 else
4494 new_vect_var = create_tmp_reg (type, prefix);
4495
4496 return new_vect_var;
4497 }
4498
4499 /* Like vect_get_new_vect_var but return an SSA name. */
4500
4501 tree
4502 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
4503 {
4504 const char *prefix;
4505 tree new_vect_var;
4506
4507 switch (var_kind)
4508 {
4509 case vect_simple_var:
4510 prefix = "vect";
4511 break;
4512 case vect_scalar_var:
4513 prefix = "stmp";
4514 break;
4515 case vect_pointer_var:
4516 prefix = "vectp";
4517 break;
4518 default:
4519 gcc_unreachable ();
4520 }
4521
4522 if (name)
4523 {
4524 char* tmp = concat (prefix, "_", name, NULL);
4525 new_vect_var = make_temp_ssa_name (type, NULL, tmp);
4526 free (tmp);
4527 }
4528 else
4529 new_vect_var = make_temp_ssa_name (type, NULL, prefix);
4530
4531 return new_vect_var;
4532 }
4533
4534 /* Duplicate ptr info and set alignment/misalignment on NAME from DR_INFO. */
4535
4536 static void
4537 vect_duplicate_ssa_name_ptr_info (tree name, dr_vec_info *dr_info)
4538 {
4539 duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr_info->dr));
4540 int misalign = DR_MISALIGNMENT (dr_info);
4541 if (misalign == DR_MISALIGNMENT_UNKNOWN)
4542 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
4543 else
4544 set_ptr_info_alignment (SSA_NAME_PTR_INFO (name),
4545 known_alignment (DR_TARGET_ALIGNMENT (dr_info)),
4546 misalign);
4547 }
4548
4549 /* Function vect_create_addr_base_for_vector_ref.
4550
4551 Create an expression that computes the address of the first memory location
4552 that will be accessed for a data reference.
4553
4554 Input:
4555 STMT_INFO: The statement containing the data reference.
4556 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
4557 OFFSET: Optional. If supplied, it is added to the initial address.
4558 LOOP: Specify relative to which loop-nest the address should be computed.
4559 For example, when the dataref is in an inner-loop nested in an
4560 outer-loop that is now being vectorized, LOOP can be either the
4561 outer-loop, or the inner-loop. The first memory location accessed
4562 by the following dataref ('in' points to short):
4563
4564 for (i=0; i<N; i++)
4565 for (j=0; j<M; j++)
4566 s += in[i+j]
4567
4568 is as follows:
4569 if LOOP=i_loop: &in (relative to i_loop)
4570 if LOOP=j_loop: &in+i*2B (relative to j_loop)
4571 BYTE_OFFSET: Optional, defaulted to NULL. If supplied, it is added to the
4572 initial address. Unlike OFFSET, which is the number of elements to
4573 be added, BYTE_OFFSET is measured in bytes.
4574
4575 Output:
4576 1. Return an SSA_NAME whose value is the address of the memory location of
4577 the first vector of the data reference.
4578 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
4579 these statement(s) which define the returned SSA_NAME.
4580
4581 FORNOW: We are only handling array accesses with step 1. */
4582
4583 tree
4584 vect_create_addr_base_for_vector_ref (vec_info *vinfo, stmt_vec_info stmt_info,
4585 gimple_seq *new_stmt_list,
4586 tree offset,
4587 tree byte_offset)
4588 {
4589 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
4590 struct data_reference *dr = dr_info->dr;
4591 const char *base_name;
4592 tree addr_base;
4593 tree dest;
4594 gimple_seq seq = NULL;
4595 tree vect_ptr_type;
4596 tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
4597 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
4598 innermost_loop_behavior *drb = vect_dr_behavior (vinfo, dr_info);
4599
4600 tree data_ref_base = unshare_expr (drb->base_address);
4601 tree base_offset = unshare_expr (get_dr_vinfo_offset (vinfo, dr_info, true));
4602 tree init = unshare_expr (drb->init);
4603
4604 if (loop_vinfo)
4605 base_name = get_name (data_ref_base);
4606 else
4607 {
4608 base_offset = ssize_int (0);
4609 init = ssize_int (0);
4610 base_name = get_name (DR_REF (dr));
4611 }
4612
4613 /* Create base_offset.  */
4614 base_offset = size_binop (PLUS_EXPR,
4615 fold_convert (sizetype, base_offset),
4616 fold_convert (sizetype, init));
4617
4618 if (offset)
4619 {
4620 offset = fold_build2 (MULT_EXPR, sizetype,
4621 fold_convert (sizetype, offset), step);
4622 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4623 base_offset, offset);
4624 }
4625 if (byte_offset)
4626 {
4627 byte_offset = fold_convert (sizetype, byte_offset);
4628 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4629 base_offset, byte_offset);
4630 }
4631
4632 /* base + base_offset */
4633 if (loop_vinfo)
4634 addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
4635 else
4636 {
4637 addr_base = build1 (ADDR_EXPR,
4638 build_pointer_type (TREE_TYPE (DR_REF (dr))),
4639 unshare_expr (DR_REF (dr)));
4640 }
4641
4642 vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
4643 dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
4644 addr_base = force_gimple_operand (addr_base, &seq, true, dest);
4645 gimple_seq_add_seq (new_stmt_list, seq);
4646
4647 if (DR_PTR_INFO (dr)
4648 && TREE_CODE (addr_base) == SSA_NAME
4649 && !SSA_NAME_PTR_INFO (addr_base))
4650 {
4651 vect_duplicate_ssa_name_ptr_info (addr_base, dr_info);
4652 if (offset || byte_offset)
4653 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
4654 }
4655
4656 if (dump_enabled_p ())
4657 dump_printf_loc (MSG_NOTE, vect_location, "created %T\n", addr_base);
4658
4659 return addr_base;
4660 }
4661
4662
4663 /* Function vect_create_data_ref_ptr.
4664
4665 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
4666 location accessed in the loop by STMT_INFO, along with the def-use update
4667 chain to appropriately advance the pointer through the loop iterations.
4668 Also set aliasing information for the pointer. This pointer is used by
4669 the callers to this function to create a memory reference expression for
4670 vector load/store access.
4671
4672 Input:
4673 1. STMT_INFO: a stmt that references memory. Expected to be of the form
4674 GIMPLE_ASSIGN <name, data-ref> or
4675 GIMPLE_ASSIGN <data-ref, name>.
4676 2. AGGR_TYPE: the type of the reference, which should be either a vector
4677 or an array.
4678 3. AT_LOOP: the loop where the vector memref is to be created.
4679 4. OFFSET (optional): an offset to be added to the initial address accessed
4680 by the data-ref in STMT_INFO.
4681 5. BSI: location where the new stmts are to be placed if there is no loop
4682 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
4683 pointing to the initial address.
4684 7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
4685 to the initial address accessed by the data-ref in STMT_INFO. This is
4686 similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
4687 in bytes.
4688 8. IV_STEP (optional, defaults to NULL): the amount that should be added
4689 to the IV during each iteration of the loop. NULL says to move
4690 by one copy of AGGR_TYPE up or down, depending on the step of the
4691 data reference.
4692
4693 Output:
4694 1. Declare a new ptr to vector_type, and have it point to the base of the
4695 data reference (the initial address accessed by the data reference).
4696 For example, for vector of type V8HI, the following code is generated:
4697
4698 v8hi *ap;
4699 ap = (v8hi *)initial_address;
4700
4701 if OFFSET is not supplied:
4702 initial_address = &a[init];
4703 if OFFSET is supplied:
4704 initial_address = &a[init + OFFSET];
4705 if BYTE_OFFSET is supplied:
4706 initial_address = &a[init] + BYTE_OFFSET;
4707
4708 Return the initial_address in INITIAL_ADDRESS.
4709
4710 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
4711 update the pointer in each iteration of the loop.
4712
4713 Return the increment stmt that updates the pointer in PTR_INCR.
4714
4715 3. Return the pointer. */
4716
4717 tree
4718 vect_create_data_ref_ptr (vec_info *vinfo, stmt_vec_info stmt_info,
4719 tree aggr_type, class loop *at_loop, tree offset,
4720 tree *initial_address, gimple_stmt_iterator *gsi,
4721 gimple **ptr_incr, bool only_init,
4722 tree byte_offset, tree iv_step)
4723 {
4724 const char *base_name;
4725 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
4726 class loop *loop = NULL;
4727 bool nested_in_vect_loop = false;
4728 class loop *containing_loop = NULL;
4729 tree aggr_ptr_type;
4730 tree aggr_ptr;
4731 tree new_temp;
4732 gimple_seq new_stmt_list = NULL;
4733 edge pe = NULL;
4734 basic_block new_bb;
4735 tree aggr_ptr_init;
4736 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
4737 struct data_reference *dr = dr_info->dr;
4738 tree aptr;
4739 gimple_stmt_iterator incr_gsi;
4740 bool insert_after;
4741 tree indx_before_incr, indx_after_incr;
4742 gimple *incr;
4743 bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo);
4744
4745 gcc_assert (iv_step != NULL_TREE
4746 || TREE_CODE (aggr_type) == ARRAY_TYPE
4747 || TREE_CODE (aggr_type) == VECTOR_TYPE);
4748
4749 if (loop_vinfo)
4750 {
4751 loop = LOOP_VINFO_LOOP (loop_vinfo);
4752 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info);
4753 containing_loop = (gimple_bb (stmt_info->stmt))->loop_father;
4754 pe = loop_preheader_edge (loop);
4755 }
4756 else
4757 {
4758 gcc_assert (bb_vinfo);
4759 only_init = true;
4760 *ptr_incr = NULL;
4761 }
4762
4763 /* Create an expression for the first address accessed by this load
4764 in LOOP. */
4765 base_name = get_name (DR_BASE_ADDRESS (dr));
4766
4767 if (dump_enabled_p ())
4768 {
4769 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4770 dump_printf_loc (MSG_NOTE, vect_location,
4771 "create %s-pointer variable to type: %T",
4772 get_tree_code_name (TREE_CODE (aggr_type)),
4773 aggr_type);
4774 if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4775 dump_printf (MSG_NOTE, " vectorizing an array ref: ");
4776 else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4777 dump_printf (MSG_NOTE, " vectorizing a vector ref: ");
4778 else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4779 dump_printf (MSG_NOTE, " vectorizing a record based array ref: ");
4780 else
4781 dump_printf (MSG_NOTE, " vectorizing a pointer ref: ");
4782 dump_printf (MSG_NOTE, "%T\n", DR_BASE_OBJECT (dr));
4783 }
4784
4785 /* (1) Create the new aggregate-pointer variable.
4786 Vector and array types inherit the alias set of their component
4787 type by default, so we need to use a ref-all pointer if the alias
4788 set of the data reference does not conflict with that of the created
4789 aggregate type (e.g. because the reference is not addressable). */
4790 bool need_ref_all = false;
4791 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4792 get_alias_set (DR_REF (dr))))
4793 need_ref_all = true;
4794 /* Likewise for any of the data references in the stmt group. */
4795 else if (DR_GROUP_SIZE (stmt_info) > 1)
4796 {
4797 stmt_vec_info sinfo = DR_GROUP_FIRST_ELEMENT (stmt_info);
4798 do
4799 {
4800 struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
4801 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4802 get_alias_set (DR_REF (sdr))))
4803 {
4804 need_ref_all = true;
4805 break;
4806 }
4807 sinfo = DR_GROUP_NEXT_ELEMENT (sinfo);
4808 }
4809 while (sinfo);
4810 }
4811 aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
4812 need_ref_all);
4813 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
4814
4815
4816 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4817 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4818 def-use update cycles for the pointer: one relative to the outer-loop
4819 (LOOP), which is what steps (3) and (4) below do. The other is relative
4820 to the inner-loop (which is the inner-most loop containing the dataref),
4821 and this is done by step (5) below.
4822
4823 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4824 inner-most loop, and so steps (3),(4) work the same, and step (5) is
4825 redundant. Steps (3),(4) create the following:
4826
4827 vp0 = &base_addr;
4828 LOOP: vp1 = phi(vp0,vp2)
4829 ...
4830 ...
4831 vp2 = vp1 + step
4832 goto LOOP
4833
4834 If there is an inner-loop nested in loop, then step (5) will also be
4835 applied, and an additional update in the inner-loop will be created:
4836
4837 vp0 = &base_addr;
4838 LOOP: vp1 = phi(vp0,vp2)
4839 ...
4840 inner: vp3 = phi(vp1,vp4)
4841 vp4 = vp3 + inner_step
4842 if () goto inner
4843 ...
4844 vp2 = vp1 + step
4845 if () goto LOOP */
4846
4847 /* (2) Calculate the initial address of the aggregate-pointer, and set
4848 the aggregate-pointer to point to it before the loop. */
4849
4850 /* Create: (&(base[init_val+offset]+byte_offset) in the loop preheader. */
4851
4852 new_temp = vect_create_addr_base_for_vector_ref (vinfo,
4853 stmt_info, &new_stmt_list,
4854 offset, byte_offset);
4855 if (new_stmt_list)
4856 {
4857 if (pe)
4858 {
4859 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
4860 gcc_assert (!new_bb);
4861 }
4862 else
4863 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
4864 }
4865
4866 *initial_address = new_temp;
4867 aggr_ptr_init = new_temp;
4868
4869 /* (3) Handle the updating of the aggregate-pointer inside the loop.
4870 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4871 inner-loop nested in LOOP (during outer-loop vectorization). */
4872
4873 /* No update in loop is required. */
4874 if (only_init && (!loop_vinfo || at_loop == loop))
4875 aptr = aggr_ptr_init;
4876 else
4877 {
4878 /* Accesses to invariant addresses should be handled specially
4879 by the caller. */
4880 tree step = vect_dr_behavior (vinfo, dr_info)->step;
4881 gcc_assert (!integer_zerop (step));
4882
4883 if (iv_step == NULL_TREE)
4884 {
4885 /* The step of the aggregate pointer is the type size,
4886 negated for downward accesses. */
4887 iv_step = TYPE_SIZE_UNIT (aggr_type);
4888 if (tree_int_cst_sgn (step) == -1)
4889 iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
4890 }
4891
4892 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4893
4894 create_iv (aggr_ptr_init,
4895 fold_convert (aggr_ptr_type, iv_step),
4896 aggr_ptr, loop, &incr_gsi, insert_after,
4897 &indx_before_incr, &indx_after_incr);
4898 incr = gsi_stmt (incr_gsi);
4899 loop_vinfo->add_stmt (incr);
4900
4901 /* Copy the points-to information if it exists. */
4902 if (DR_PTR_INFO (dr))
4903 {
4904 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr_info);
4905 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr_info);
4906 }
4907 if (ptr_incr)
4908 *ptr_incr = incr;
4909
4910 aptr = indx_before_incr;
4911 }
4912
4913 if (!nested_in_vect_loop || only_init)
4914 return aptr;
4915
4916
4917 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4918 nested in LOOP, if it exists. */
4919
4920 gcc_assert (nested_in_vect_loop);
4921 if (!only_init)
4922 {
4923 standard_iv_increment_position (containing_loop, &incr_gsi,
4924 &insert_after);
4925 create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
4926 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
4927 &indx_after_incr);
4928 incr = gsi_stmt (incr_gsi);
4929 loop_vinfo->add_stmt (incr);
4930
4931 /* Copy the points-to information if it exists. */
4932 if (DR_PTR_INFO (dr))
4933 {
4934 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr_info);
4935 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr_info);
4936 }
4937 if (ptr_incr)
4938 *ptr_incr = incr;
4939
4940 return indx_before_incr;
4941 }
4942 else
4943 gcc_unreachable ();
4944 }
4945
4946
4947 /* Function bump_vector_ptr
4948
4949 Increment a pointer (to a vector type) by vector-size. If requested,
4950 i.e. if PTR_INCR is given, then also connect the new increment stmt
4951 to the existing def-use update-chain of the pointer, by modifying
4952 the PTR_INCR as illustrated below:
4953
4954 The pointer def-use update-chain before this function:
4955 DATAREF_PTR = phi (p_0, p_2)
4956 ....
4957 PTR_INCR: p_2 = DATAREF_PTR + step
4958
4959 The pointer def-use update-chain after this function:
4960 DATAREF_PTR = phi (p_0, p_2)
4961 ....
4962 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4963 ....
4964 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
4965
4966 Input:
4967 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4968 in the loop.
4969 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4970 the loop. The increment amount across iterations is expected
4971 to be vector_size.
4972 BSI - location where the new update stmt is to be placed.
4973 STMT_INFO - the original scalar memory-access stmt that is being vectorized.
4974 BUMP - optional. The offset by which to bump the pointer. If not given,
4975 the offset is assumed to be vector_size.
4976
4977 Output: Return NEW_DATAREF_PTR as illustrated above.
4978
4979 */
4980
4981 tree
4982 bump_vector_ptr (vec_info *vinfo,
4983 tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
4984 stmt_vec_info stmt_info, tree bump)
4985 {
4986 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4987 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4988 tree update = TYPE_SIZE_UNIT (vectype);
4989 gassign *incr_stmt;
4990 ssa_op_iter iter;
4991 use_operand_p use_p;
4992 tree new_dataref_ptr;
4993
4994 if (bump)
4995 update = bump;
4996
4997 if (TREE_CODE (dataref_ptr) == SSA_NAME)
4998 new_dataref_ptr = copy_ssa_name (dataref_ptr);
4999 else
5000 new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
5001 incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
5002 dataref_ptr, update);
5003 vect_finish_stmt_generation (vinfo, stmt_info, incr_stmt, gsi);
5004
5005 /* Copy the points-to information if it exists. */
5006 if (DR_PTR_INFO (dr))
5007 {
5008 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
5009 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
5010 }
5011
5012 if (!ptr_incr)
5013 return new_dataref_ptr;
5014
5015 /* Update the vector-pointer's cross-iteration increment. */
5016 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
5017 {
5018 tree use = USE_FROM_PTR (use_p);
5019
5020 if (use == dataref_ptr)
5021 SET_USE (use_p, new_dataref_ptr);
5022 else
5023 gcc_assert (operand_equal_p (use, update, 0));
5024 }
5025
5026 return new_dataref_ptr;
5027 }
5028
5029
5030 /* Copy memory reference info such as base/clique from the SRC reference
5031 to the DEST MEM_REF. */
5032
5033 void
5034 vect_copy_ref_info (tree dest, tree src)
5035 {
5036 if (TREE_CODE (dest) != MEM_REF)
5037 return;
5038
5039 tree src_base = src;
5040 while (handled_component_p (src_base))
5041 src_base = TREE_OPERAND (src_base, 0);
5042 if (TREE_CODE (src_base) != MEM_REF
5043 && TREE_CODE (src_base) != TARGET_MEM_REF)
5044 return;
5045
5046 MR_DEPENDENCE_CLIQUE (dest) = MR_DEPENDENCE_CLIQUE (src_base);
5047 MR_DEPENDENCE_BASE (dest) = MR_DEPENDENCE_BASE (src_base);
5048 }
5049
5050
5051 /* Function vect_create_destination_var.
5052
5053 Create a new temporary of type VECTYPE. */
5054
5055 tree
5056 vect_create_destination_var (tree scalar_dest, tree vectype)
5057 {
5058 tree vec_dest;
5059 const char *name;
5060 char *new_name;
5061 tree type;
5062 enum vect_var_kind kind;
5063
5064 kind = vectype
5065 ? VECTOR_BOOLEAN_TYPE_P (vectype)
5066 ? vect_mask_var
5067 : vect_simple_var
5068 : vect_scalar_var;
5069 type = vectype ? vectype : TREE_TYPE (scalar_dest);
5070
5071 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
5072
5073 name = get_name (scalar_dest);
5074 if (name)
5075 new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
5076 else
5077 new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
5078 vec_dest = vect_get_new_vect_var (type, kind, new_name);
5079 free (new_name);
5080
5081 return vec_dest;
5082 }
5083
5084 /* Function vect_grouped_store_supported.
5085
5086 Returns TRUE if interleave high and interleave low permutations
5087 are supported, and FALSE otherwise. */
5088
5089 bool
5090 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
5091 {
5092 machine_mode mode = TYPE_MODE (vectype);
5093
5094 /* vect_permute_store_chain requires the group size to be equal to 3 or
5095 be a power of two. */
5096 if (count != 3 && exact_log2 (count) == -1)
5097 {
5098 if (dump_enabled_p ())
5099 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5100 "the size of the group of accesses"
5101 " is not a power of 2 or not eqaul to 3\n");
5102 return false;
5103 }
5104
5105 /* Check that the permutation is supported. */
5106 if (VECTOR_MODE_P (mode))
5107 {
5108 unsigned int i;
5109 if (count == 3)
5110 {
5111 unsigned int j0 = 0, j1 = 0, j2 = 0;
5112 unsigned int i, j;
5113
5114 unsigned int nelt;
5115 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5116 {
5117 if (dump_enabled_p ())
5118 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5119 "cannot handle groups of 3 stores for"
5120 " variable-length vectors\n");
5121 return false;
5122 }
5123
5124 vec_perm_builder sel (nelt, nelt, 1);
5125 sel.quick_grow (nelt);
5126 vec_perm_indices indices;
5127 for (j = 0; j < 3; j++)
5128 {
5129 int nelt0 = ((3 - j) * nelt) % 3;
5130 int nelt1 = ((3 - j) * nelt + 1) % 3;
5131 int nelt2 = ((3 - j) * nelt + 2) % 3;
5132 for (i = 0; i < nelt; i++)
5133 {
5134 if (3 * i + nelt0 < nelt)
5135 sel[3 * i + nelt0] = j0++;
5136 if (3 * i + nelt1 < nelt)
5137 sel[3 * i + nelt1] = nelt + j1++;
5138 if (3 * i + nelt2 < nelt)
5139 sel[3 * i + nelt2] = 0;
5140 }
5141 indices.new_vector (sel, 2, nelt);
5142 if (!can_vec_perm_const_p (mode, indices))
5143 {
5144 if (dump_enabled_p ())
5145 dump_printf (MSG_MISSED_OPTIMIZATION,
5146 "permutation op not supported by target.\n");
5147 return false;
5148 }
5149
5150 for (i = 0; i < nelt; i++)
5151 {
5152 if (3 * i + nelt0 < nelt)
5153 sel[3 * i + nelt0] = 3 * i + nelt0;
5154 if (3 * i + nelt1 < nelt)
5155 sel[3 * i + nelt1] = 3 * i + nelt1;
5156 if (3 * i + nelt2 < nelt)
5157 sel[3 * i + nelt2] = nelt + j2++;
5158 }
5159 indices.new_vector (sel, 2, nelt);
5160 if (!can_vec_perm_const_p (mode, indices))
5161 {
5162 if (dump_enabled_p ())
5163 dump_printf (MSG_MISSED_OPTIMIZATION,
5164 "permutation op not supported by target.\n");
5165 return false;
5166 }
5167 }
5168 return true;
5169 }
5170 else
5171 {
5172 /* If length is not equal to 3 then only power of 2 is supported. */
5173 gcc_assert (pow2p_hwi (count));
5174 poly_uint64 nelt = GET_MODE_NUNITS (mode);
5175
5176 /* The encoding has 2 interleaved stepped patterns. */
5177 vec_perm_builder sel (nelt, 2, 3);
5178 sel.quick_grow (6);
5179 for (i = 0; i < 3; i++)
5180 {
5181 sel[i * 2] = i;
5182 sel[i * 2 + 1] = i + nelt;
5183 }
5184 vec_perm_indices indices (sel, 2, nelt);
5185 if (can_vec_perm_const_p (mode, indices))
5186 {
5187 for (i = 0; i < 6; i++)
5188 sel[i] += exact_div (nelt, 2);
5189 indices.new_vector (sel, 2, nelt);
5190 if (can_vec_perm_const_p (mode, indices))
5191 return true;
5192 }
5193 }
5194 }
5195
5196 if (dump_enabled_p ())
5197 dump_printf (MSG_MISSED_OPTIMIZATION,
5198 "permutation op not supported by target.\n");
5199 return false;
5200 }
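
/* Illustrative sketch only, in plain C rather than GCC internals: the
   power-of-two path above encodes the interleave-high selector with two
   stepped patterns of three elements each ({0, nelt, 1, nelt + 1, 2,
   nelt + 2}); conceptually vec_perm_indices extends those patterns into
   the full selector built below.  Adding NELT / 2 to every element then
   yields the interleave-low selector that is checked second.  */

static void
model_interleave_high_sel (unsigned int nelt, unsigned int *sel)
{
  /* Even positions step through the first vector, odd positions through
     the second; both patterns advance by one element per pair.  */
  for (unsigned int i = 0; i < nelt / 2; i++)
    {
      sel[2 * i] = i;
      sel[2 * i + 1] = nelt + i;
    }
}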
5201
5202
5203 /* Return TRUE if vec_{mask_}store_lanes is available for COUNT vectors of
5204 type VECTYPE. MASKED_P says whether the masked form is needed. */
5205
5206 bool
5207 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5208 bool masked_p)
5209 {
5210 if (masked_p)
5211 return vect_lanes_optab_supported_p ("vec_mask_store_lanes",
5212 vec_mask_store_lanes_optab,
5213 vectype, count);
5214 else
5215 return vect_lanes_optab_supported_p ("vec_store_lanes",
5216 vec_store_lanes_optab,
5217 vectype, count);
5218 }
5219
5220
5221 /* Function vect_permute_store_chain.
5222
5223 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
5224 a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
5225 the data correctly for the stores. Return the final references for stores
5226 in RESULT_CHAIN.
5227
5228 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5229 The input is 4 vectors each containing 8 elements. We assign a number to
5230 each element, the input sequence is:
5231
5232 1st vec: 0 1 2 3 4 5 6 7
5233 2nd vec: 8 9 10 11 12 13 14 15
5234 3rd vec: 16 17 18 19 20 21 22 23
5235 4th vec: 24 25 26 27 28 29 30 31
5236
5237 The output sequence should be:
5238
5239 1st vec: 0 8 16 24 1 9 17 25
5240 2nd vec: 2 10 18 26 3 11 19 27
5241 3rd vec: 4 12 20 28 5 13 21 29
5242 4th vec: 6 14 22 30 7 15 23 31
5243
5244 i.e., we interleave the contents of the four vectors in their order.
5245
5246 We use interleave_high/low instructions to create such output. The input of
5247 each interleave_high/low operation is two vectors:
5248 1st vec 2nd vec
5249 0 1 2 3 4 5 6 7
5250 the even elements of the result vector are obtained left-to-right from the
5251 high/low elements of the first vector. The odd elements of the result are
5252 obtained left-to-right from the high/low elements of the second vector.
5253 The output of interleave_high will be: 0 4 1 5
5254 and of interleave_low: 2 6 3 7
5255
5256
5257 The permutation is done in log LENGTH stages. In each stage interleave_high
5258 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
5259 where the first argument is taken from the first half of DR_CHAIN and the
5260 second argument from its second half.
5261 In our example,
5262
5263 I1: interleave_high (1st vec, 3rd vec)
5264 I2: interleave_low (1st vec, 3rd vec)
5265 I3: interleave_high (2nd vec, 4th vec)
5266 I4: interleave_low (2nd vec, 4th vec)
5267
5268 The output for the first stage is:
5269
5270 I1: 0 16 1 17 2 18 3 19
5271 I2: 4 20 5 21 6 22 7 23
5272 I3: 8 24 9 25 10 26 11 27
5273 I4: 12 28 13 29 14 30 15 31
5274
5275 The output of the second stage, i.e. the final result is:
5276
5277 I1: 0 8 16 24 1 9 17 25
5278 I2: 2 10 18 26 3 11 19 27
5279 I3: 4 12 20 28 5 13 21 29
5280 I4: 6 14 22 30 7 15 23 31. */
5281
5282 void
5283 vect_permute_store_chain (vec_info *vinfo, vec<tree> dr_chain,
5284 unsigned int length,
5285 stmt_vec_info stmt_info,
5286 gimple_stmt_iterator *gsi,
5287 vec<tree> *result_chain)
5288 {
5289 tree vect1, vect2, high, low;
5290 gimple *perm_stmt;
5291 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5292 tree perm_mask_low, perm_mask_high;
5293 tree data_ref;
5294 tree perm3_mask_low, perm3_mask_high;
5295 unsigned int i, j, n, log_length = exact_log2 (length);
5296
5297 result_chain->quick_grow (length);
5298 memcpy (result_chain->address (), dr_chain.address (),
5299 length * sizeof (tree));
5300
5301 if (length == 3)
5302 {
5303 /* vect_grouped_store_supported ensures that this is constant. */
5304 unsigned int nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5305 unsigned int j0 = 0, j1 = 0, j2 = 0;
5306
5307 vec_perm_builder sel (nelt, nelt, 1);
5308 sel.quick_grow (nelt);
5309 vec_perm_indices indices;
5310 for (j = 0; j < 3; j++)
5311 {
5312 int nelt0 = ((3 - j) * nelt) % 3;
5313 int nelt1 = ((3 - j) * nelt + 1) % 3;
5314 int nelt2 = ((3 - j) * nelt + 2) % 3;
5315
5316 for (i = 0; i < nelt; i++)
5317 {
5318 if (3 * i + nelt0 < nelt)
5319 sel[3 * i + nelt0] = j0++;
5320 if (3 * i + nelt1 < nelt)
5321 sel[3 * i + nelt1] = nelt + j1++;
5322 if (3 * i + nelt2 < nelt)
5323 sel[3 * i + nelt2] = 0;
5324 }
5325 indices.new_vector (sel, 2, nelt);
5326 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5327
5328 for (i = 0; i < nelt; i++)
5329 {
5330 if (3 * i + nelt0 < nelt)
5331 sel[3 * i + nelt0] = 3 * i + nelt0;
5332 if (3 * i + nelt1 < nelt)
5333 sel[3 * i + nelt1] = 3 * i + nelt1;
5334 if (3 * i + nelt2 < nelt)
5335 sel[3 * i + nelt2] = nelt + j2++;
5336 }
5337 indices.new_vector (sel, 2, nelt);
5338 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5339
5340 vect1 = dr_chain[0];
5341 vect2 = dr_chain[1];
5342
5343 /* Create interleaving stmt:
5344 low = VEC_PERM_EXPR <vect1, vect2,
5345 {j, nelt, *, j + 1, nelt + j + 1, *,
5346 j + 2, nelt + j + 2, *, ...}> */
5347 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5348 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5349 vect2, perm3_mask_low);
5350 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5351
5352 vect1 = data_ref;
5353 vect2 = dr_chain[2];
5354 /* Create interleaving stmt:
5355 low = VEC_PERM_EXPR <vect1, vect2,
5356 {0, 1, nelt + j, 3, 4, nelt + j + 1,
5357 6, 7, nelt + j + 2, ...}> */
5358 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5359 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5360 vect2, perm3_mask_high);
5361 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5362 (*result_chain)[j] = data_ref;
5363 }
5364 }
5365 else
5366 {
5367 /* If length is not equal to 3 then only power of 2 is supported. */
5368 gcc_assert (pow2p_hwi (length));
5369
5370 /* The encoding has 2 interleaved stepped patterns. */
5371 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5372 vec_perm_builder sel (nelt, 2, 3);
5373 sel.quick_grow (6);
5374 for (i = 0; i < 3; i++)
5375 {
5376 sel[i * 2] = i;
5377 sel[i * 2 + 1] = i + nelt;
5378 }
5379 vec_perm_indices indices (sel, 2, nelt);
5380 perm_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5381
5382 for (i = 0; i < 6; i++)
5383 sel[i] += exact_div (nelt, 2);
5384 indices.new_vector (sel, 2, nelt);
5385 perm_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5386
5387 for (i = 0, n = log_length; i < n; i++)
5388 {
5389 for (j = 0; j < length/2; j++)
5390 {
5391 vect1 = dr_chain[j];
5392 vect2 = dr_chain[j+length/2];
5393
5394 /* Create interleaving stmt:
5395 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
5396 ...}> */
5397 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
5398 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
5399 vect2, perm_mask_high);
5400 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5401 (*result_chain)[2*j] = high;
5402
5403 /* Create interleaving stmt:
5404 low = VEC_PERM_EXPR <vect1, vect2,
5405 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
5406 ...}> */
5407 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
5408 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
5409 vect2, perm_mask_low);
5410 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5411 (*result_chain)[2*j+1] = low;
5412 }
5413 memcpy (dr_chain.address (), result_chain->address (),
5414 length * sizeof (tree));
5415 }
5416 }
5417 }
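
/* Plain-C reference model of the power-of-two path above (illustration
   only, not used by GCC): apply the same log2 (LENGTH) interleave
   high/low passes to arrays of element indices.  For LENGTH == 4 and
   NELT == 8 it reproduces the output documented in the comment:
   0 8 16 24 1 9 17 25 / 2 10 18 26 3 11 19 27 / 4 12 20 28 5 13 21 29 /
   6 14 22 30 7 15 23 31.  The fixed buffer bound is an assumption made
   only to keep the sketch self-contained.  */

static void
model_permute_store_chain (unsigned int *chain, unsigned int length,
			   unsigned int nelt)
{
  unsigned int result[256];	/* assumes length * nelt <= 256 */

  for (unsigned int stage = 1; stage < length; stage *= 2)
    {
      for (unsigned int j = 0; j < length / 2; j++)
	{
	  unsigned int *v1 = chain + j * nelt;
	  unsigned int *v2 = chain + (j + length / 2) * nelt;
	  unsigned int *high = result + 2 * j * nelt;
	  unsigned int *low = result + (2 * j + 1) * nelt;
	  for (unsigned int i = 0; i < nelt / 2; i++)
	    {
	      high[2 * i] = v1[i];		 /* {0, nelt, 1, nelt+1, ...}  */
	      high[2 * i + 1] = v2[i];
	      low[2 * i] = v1[nelt / 2 + i];	 /* {nelt/2, nelt*3/2, ...}    */
	      low[2 * i + 1] = v2[nelt / 2 + i];
	    }
	}
      memcpy (chain, result, length * nelt * sizeof (unsigned int));
    }
}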
5418
5419 /* Function vect_setup_realignment
5420
5421 This function is called when vectorizing an unaligned load using
5422 the dr_explicit_realign[_optimized] scheme.
5423 This function generates the following code at the loop prolog:
5424
5425 p = initial_addr;
5426 x msq_init = *(floor(p)); # prolog load
5427 realignment_token = call target_builtin;
5428 loop:
5429 x msq = phi (msq_init, ---)
5430
5431 The stmts marked with x are generated only for the case of
5432 dr_explicit_realign_optimized.
5433
5434 The code above sets up a new (vector) pointer, pointing to the first
5435 location accessed by STMT_INFO, and a "floor-aligned" load using that
5436 pointer. It also generates code to compute the "realignment-token"
5437 (if the relevant target hook was defined), and creates a phi-node at the
5438 loop-header bb whose arguments are the result of the prolog-load (created
5439 by this function) and the result of a load that takes place in the loop
5440 (to be created by the caller to this function).
5441
5442 For the case of dr_explicit_realign_optimized:
5443 The caller to this function uses the phi-result (msq) to create the
5444 realignment code inside the loop, and sets up the missing phi argument,
5445 as follows:
5446 loop:
5447 msq = phi (msq_init, lsq)
5448 lsq = *(floor(p')); # load in loop
5449 result = realign_load (msq, lsq, realignment_token);
5450
5451 For the case of dr_explicit_realign:
5452 loop:
5453 msq = *(floor(p)); # load in loop
5454 p' = p + (VS-1);
5455 lsq = *(floor(p')); # load in loop
5456 result = realign_load (msq, lsq, realignment_token);
5457
5458 Input:
5459 STMT_INFO - (scalar) load stmt to be vectorized. This load accesses
5460 a memory location that may be unaligned.
5461 BSI - place where new code is to be inserted.
5462 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
5463 is used.
5464
5465 Output:
5466 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
5467 target hook, if defined.
5468 Return value - the result of the loop-header phi node. */
5469
5470 tree
5471 vect_setup_realignment (vec_info *vinfo, stmt_vec_info stmt_info,
5472 gimple_stmt_iterator *gsi, tree *realignment_token,
5473 enum dr_alignment_support alignment_support_scheme,
5474 tree init_addr,
5475 class loop **at_loop)
5476 {
5477 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5478 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
5479 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
5480 struct data_reference *dr = dr_info->dr;
5481 class loop *loop = NULL;
5482 edge pe = NULL;
5483 tree scalar_dest = gimple_assign_lhs (stmt_info->stmt);
5484 tree vec_dest;
5485 gimple *inc;
5486 tree ptr;
5487 tree data_ref;
5488 basic_block new_bb;
5489 tree msq_init = NULL_TREE;
5490 tree new_temp;
5491 gphi *phi_stmt;
5492 tree msq = NULL_TREE;
5493 gimple_seq stmts = NULL;
5494 bool compute_in_loop = false;
5495 bool nested_in_vect_loop = false;
5496 class loop *containing_loop = (gimple_bb (stmt_info->stmt))->loop_father;
5497 class loop *loop_for_initial_load = NULL;
5498
5499 if (loop_vinfo)
5500 {
5501 loop = LOOP_VINFO_LOOP (loop_vinfo);
5502 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info);
5503 }
5504
5505 gcc_assert (alignment_support_scheme == dr_explicit_realign
5506 || alignment_support_scheme == dr_explicit_realign_optimized);
5507
5508 /* We need to generate three things:
5509 1. the misalignment computation
5510 2. the extra vector load (for the optimized realignment scheme).
5511 3. the phi node for the two vectors from which the realignment is
5512 done (for the optimized realignment scheme). */
5513
5514 /* 1. Determine where to generate the misalignment computation.
5515
5516 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
5517 calculation will be generated by this function, outside the loop (in the
5518 preheader). Otherwise, INIT_ADDR had already been computed for us by the
5519 caller, inside the loop.
5520
5521 Background: If the misalignment remains fixed throughout the iterations of
5522 the loop, then both realignment schemes are applicable, and also the
5523 misalignment computation can be done outside LOOP. This is because we are
5524 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
5525 are a multiple of VS (the Vector Size), and therefore the misalignment in
5526 different vectorized LOOP iterations is always the same.
5527 The problem arises only if the memory access is in an inner-loop nested
5528 inside LOOP, which is now being vectorized using outer-loop vectorization.
5529 This is the only case when the misalignment of the memory access may not
5530 remain fixed throughout the iterations of the inner-loop (as explained in
5531 detail in vect_supportable_dr_alignment). In this case, not only is the
5532 optimized realignment scheme not applicable, but also the misalignment
5533 computation (and generation of the realignment token that is passed to
5534 REALIGN_LOAD) have to be done inside the loop.
5535
5536 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
5537 or not, which in turn determines if the misalignment is computed inside
5538 the inner-loop, or outside LOOP. */
5539
5540 if (init_addr != NULL_TREE || !loop_vinfo)
5541 {
5542 compute_in_loop = true;
5543 gcc_assert (alignment_support_scheme == dr_explicit_realign);
5544 }
5545
5546
5547 /* 2. Determine where to generate the extra vector load.
5548
5549 For the optimized realignment scheme, instead of generating two vector
5550 loads in each iteration, we generate a single extra vector load in the
5551 preheader of the loop, and in each iteration reuse the result of the
5552 vector load from the previous iteration. In case the memory access is in
5553 an inner-loop nested inside LOOP, which is now being vectorized using
5554 outer-loop vectorization, we need to determine whether this initial vector
5555 load should be generated at the preheader of the inner-loop, or can be
5556 generated at the preheader of LOOP. If the memory access has no evolution
5557 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
5558 to be generated inside LOOP (in the preheader of the inner-loop). */
5559
5560 if (nested_in_vect_loop)
5561 {
5562 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
5563 bool invariant_in_outerloop =
5564 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
5565 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
5566 }
5567 else
5568 loop_for_initial_load = loop;
5569 if (at_loop)
5570 *at_loop = loop_for_initial_load;
5571
5572 if (loop_for_initial_load)
5573 pe = loop_preheader_edge (loop_for_initial_load);
5574
5575 /* 3. For the case of the optimized realignment, create the first vector
5576 load at the loop preheader. */
5577
5578 if (alignment_support_scheme == dr_explicit_realign_optimized)
5579 {
5580 /* Create msq_init = *(floor(p1)) in the loop preheader */
5581 gassign *new_stmt;
5582
5583 gcc_assert (!compute_in_loop);
5584 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5585 ptr = vect_create_data_ref_ptr (vinfo, stmt_info, vectype,
5586 loop_for_initial_load, NULL_TREE,
5587 &init_addr, NULL, &inc, true);
5588 if (TREE_CODE (ptr) == SSA_NAME)
5589 new_temp = copy_ssa_name (ptr);
5590 else
5591 new_temp = make_ssa_name (TREE_TYPE (ptr));
5592 poly_uint64 align = DR_TARGET_ALIGNMENT (dr_info);
5593 tree type = TREE_TYPE (ptr);
5594 new_stmt = gimple_build_assign
5595 (new_temp, BIT_AND_EXPR, ptr,
5596 fold_build2 (MINUS_EXPR, type,
5597 build_int_cst (type, 0),
5598 build_int_cst (type, align)));
5599 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5600 gcc_assert (!new_bb);
5601 data_ref
5602 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
5603 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
5604 vect_copy_ref_info (data_ref, DR_REF (dr));
5605 new_stmt = gimple_build_assign (vec_dest, data_ref);
5606 new_temp = make_ssa_name (vec_dest, new_stmt);
5607 gimple_assign_set_lhs (new_stmt, new_temp);
5608 if (pe)
5609 {
5610 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5611 gcc_assert (!new_bb);
5612 }
5613 else
5614 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5615
5616 msq_init = gimple_assign_lhs (new_stmt);
5617 }
5618
5619 /* 4. Create realignment token using a target builtin, if available.
5620 It is done either inside the containing loop, or before LOOP (as
5621 determined above). */
5622
5623 if (targetm.vectorize.builtin_mask_for_load)
5624 {
5625 gcall *new_stmt;
5626 tree builtin_decl;
5627
5628 /* Compute INIT_ADDR - the initial address accessed by this memref. */
5629 if (!init_addr)
5630 {
5631 /* Generate the INIT_ADDR computation outside LOOP. */
5632 init_addr = vect_create_addr_base_for_vector_ref (vinfo,
5633 stmt_info, &stmts,
5634 NULL_TREE);
5635 if (loop)
5636 {
5637 pe = loop_preheader_edge (loop);
5638 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
5639 gcc_assert (!new_bb);
5640 }
5641 else
5642 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
5643 }
5644
5645 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
5646 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
5647 vec_dest =
5648 vect_create_destination_var (scalar_dest,
5649 gimple_call_return_type (new_stmt));
5650 new_temp = make_ssa_name (vec_dest, new_stmt);
5651 gimple_call_set_lhs (new_stmt, new_temp);
5652
5653 if (compute_in_loop)
5654 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5655 else
5656 {
5657 /* Generate the misalignment computation outside LOOP. */
5658 pe = loop_preheader_edge (loop);
5659 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5660 gcc_assert (!new_bb);
5661 }
5662
5663 *realignment_token = gimple_call_lhs (new_stmt);
5664
5665 /* The result of the CALL_EXPR to this builtin is determined from
5666 the value of the parameter and no global variables are touched
5667 which makes the builtin a "const" function. Requiring the
5668 builtin to have the "const" attribute makes it unnecessary
5669 to call mark_call_clobbered. */
5670 gcc_assert (TREE_READONLY (builtin_decl));
5671 }
5672
5673 if (alignment_support_scheme == dr_explicit_realign)
5674 return msq;
5675
5676 gcc_assert (!compute_in_loop);
5677 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
5678
5679
5680 /* 5. Create msq = phi <msq_init, lsq> in loop */
5681
5682 pe = loop_preheader_edge (containing_loop);
5683 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5684 msq = make_ssa_name (vec_dest);
5685 phi_stmt = create_phi_node (msq, containing_loop->header);
5686 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
5687
5688 return msq;
5689 }
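
/* Plain-C model of the value REALIGN_LOAD <msq, lsq, token> is expected to
   produce (illustration only; the precise semantics come from the target's
   realignment token): the NELT elements that start at the original,
   possibly unaligned address, taken from the concatenation of the two
   floor-aligned loads MSQ and LSQ.  MISALIGN is the misalignment counted
   in elements.  */

static void
model_realign_load (const unsigned int *msq, const unsigned int *lsq,
		    unsigned int nelt, unsigned int misalign,
		    unsigned int *result)
{
  for (unsigned int i = 0; i < nelt; i++)
    result[i] = (misalign + i < nelt
		 ? msq[misalign + i]
		 : lsq[misalign + i - nelt]);
}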
5690
5691
5692 /* Function vect_grouped_load_supported.
5693
5694 COUNT is the size of the load group (the number of statements plus the
5695 number of gaps). SINGLE_ELEMENT_P is true if there is actually
5696 only one statement, with a gap of COUNT - 1.
5697
5698 Returns true if a suitable permute exists. */
5699
5700 bool
5701 vect_grouped_load_supported (tree vectype, bool single_element_p,
5702 unsigned HOST_WIDE_INT count)
5703 {
5704 machine_mode mode = TYPE_MODE (vectype);
5705
5706 /* If this is single-element interleaving with an element distance
5707 that leaves unused vector loads around, punt - we would at least
5708 create very sub-optimal code in that case (and blow up memory;
5709 see PR65518). */
5710 if (single_element_p && maybe_gt (count, TYPE_VECTOR_SUBPARTS (vectype)))
5711 {
5712 if (dump_enabled_p ())
5713 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5714 "single-element interleaving not supported "
5715 "for not adjacent vector loads\n");
5716 return false;
5717 }
5718
5719 /* vect_permute_load_chain requires the group size to be equal to 3 or
5720 be a power of two. */
5721 if (count != 3 && exact_log2 (count) == -1)
5722 {
5723 if (dump_enabled_p ())
5724 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5725 "the size of the group of accesses"
5726 " is not a power of 2 or not equal to 3\n");
5727 return false;
5728 }
5729
5730 /* Check that the permutation is supported. */
5731 if (VECTOR_MODE_P (mode))
5732 {
5733 unsigned int i, j;
5734 if (count == 3)
5735 {
5736 unsigned int nelt;
5737 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5738 {
5739 if (dump_enabled_p ())
5740 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5741 "cannot handle groups of 3 loads for"
5742 " variable-length vectors\n");
5743 return false;
5744 }
5745
5746 vec_perm_builder sel (nelt, nelt, 1);
5747 sel.quick_grow (nelt);
5748 vec_perm_indices indices;
5749 unsigned int k;
5750 for (k = 0; k < 3; k++)
5751 {
5752 for (i = 0; i < nelt; i++)
5753 if (3 * i + k < 2 * nelt)
5754 sel[i] = 3 * i + k;
5755 else
5756 sel[i] = 0;
5757 indices.new_vector (sel, 2, nelt);
5758 if (!can_vec_perm_const_p (mode, indices))
5759 {
5760 if (dump_enabled_p ())
5761 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5762 "shuffle of 3 loads is not supported by"
5763 " target\n");
5764 return false;
5765 }
5766 for (i = 0, j = 0; i < nelt; i++)
5767 if (3 * i + k < 2 * nelt)
5768 sel[i] = i;
5769 else
5770 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5771 indices.new_vector (sel, 2, nelt);
5772 if (!can_vec_perm_const_p (mode, indices))
5773 {
5774 if (dump_enabled_p ())
5775 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5776 "shuffle of 3 loads is not supported by"
5777 " target\n");
5778 return false;
5779 }
5780 }
5781 return true;
5782 }
5783 else
5784 {
5785 /* If length is not equal to 3 then only power of 2 is supported. */
5786 gcc_assert (pow2p_hwi (count));
5787 poly_uint64 nelt = GET_MODE_NUNITS (mode);
5788
5789 /* The encoding has a single stepped pattern. */
5790 vec_perm_builder sel (nelt, 1, 3);
5791 sel.quick_grow (3);
5792 for (i = 0; i < 3; i++)
5793 sel[i] = i * 2;
5794 vec_perm_indices indices (sel, 2, nelt);
5795 if (can_vec_perm_const_p (mode, indices))
5796 {
5797 for (i = 0; i < 3; i++)
5798 sel[i] = i * 2 + 1;
5799 indices.new_vector (sel, 2, nelt);
5800 if (can_vec_perm_const_p (mode, indices))
5801 return true;
5802 }
5803 }
5804 }
5805
5806 if (dump_enabled_p ())
5807 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5808 "extract even/odd not supported by target\n");
5809 return false;
5810 }
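
/* Illustration only (plain C, not GCC internals): the two selectors built
   above for COUNT == 3 combine so that the first VEC_PERM gathers the
   elements 3 * i + K that live in the first two vectors and the second
   VEC_PERM fills the remaining slots from the third vector.  The net
   effect, for each K in {0, 1, 2}, is the gather below over the
   concatenation of the three loaded vectors.  */

static void
model_gather_every_third (const unsigned int *concat /* 3 * nelt elements */,
			  unsigned int nelt, unsigned int k,
			  unsigned int *result)
{
  for (unsigned int i = 0; i < nelt; i++)
    result[i] = concat[3 * i + k];
}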
5811
5812 /* Return TRUE if vec_{mask_}load_lanes is available for COUNT vectors of
5813 type VECTYPE. MASKED_P says whether the masked form is needed. */
5814
5815 bool
5816 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5817 bool masked_p)
5818 {
5819 if (masked_p)
5820 return vect_lanes_optab_supported_p ("vec_mask_load_lanes",
5821 vec_mask_load_lanes_optab,
5822 vectype, count);
5823 else
5824 return vect_lanes_optab_supported_p ("vec_load_lanes",
5825 vec_load_lanes_optab,
5826 vectype, count);
5827 }
5828
5829 /* Function vect_permute_load_chain.
5830
5831 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5832 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5833 the input data correctly. Return the final references for loads in
5834 RESULT_CHAIN.
5835
5836 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5837 The input is 4 vectors each containing 8 elements. We assign a number to each
5838 element, the input sequence is:
5839
5840 1st vec: 0 1 2 3 4 5 6 7
5841 2nd vec: 8 9 10 11 12 13 14 15
5842 3rd vec: 16 17 18 19 20 21 22 23
5843 4th vec: 24 25 26 27 28 29 30 31
5844
5845 The output sequence should be:
5846
5847 1st vec: 0 4 8 12 16 20 24 28
5848 2nd vec: 1 5 9 13 17 21 25 29
5849 3rd vec: 2 6 10 14 18 22 26 30
5850 4th vec: 3 7 11 15 19 23 27 31
5851
5852 i.e., the first output vector should contain the first elements of each
5853 interleaving group, etc.
5854
5855 We use extract_even/odd instructions to create such output. The input of
5856 each extract_even/odd operation is two vectors
5857 1st vec 2nd vec
5858 0 1 2 3 4 5 6 7
5859
5860 and the output is the vector of extracted even/odd elements. The output of
5861 extract_even will be: 0 2 4 6
5862 and of extract_odd: 1 3 5 7
5863
5864
5865 The permutation is done in log LENGTH stages. In each stage extract_even
5866 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5867 their order. In our example,
5868
5869 E1: extract_even (1st vec, 2nd vec)
5870 E2: extract_odd (1st vec, 2nd vec)
5871 E3: extract_even (3rd vec, 4th vec)
5872 E4: extract_odd (3rd vec, 4th vec)
5873
5874 The output for the first stage will be:
5875
5876 E1: 0 2 4 6 8 10 12 14
5877 E2: 1 3 5 7 9 11 13 15
5878 E3: 16 18 20 22 24 26 28 30
5879 E4: 17 19 21 23 25 27 29 31
5880
5881 In order to proceed and create the correct sequence for the next stage (or
5882 for the correct output, if the second stage is the last one, as in our
5883 example), we first put the output of extract_even operation and then the
5884 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5885 The input for the second stage is:
5886
5887 1st vec (E1): 0 2 4 6 8 10 12 14
5888 2nd vec (E3): 16 18 20 22 24 26 28 30
5889 3rd vec (E2): 1 3 5 7 9 11 13 15
5890 4th vec (E4): 17 19 21 23 25 27 29 31
5891
5892 The output of the second stage:
5893
5894 E1: 0 4 8 12 16 20 24 28
5895 E2: 2 6 10 14 18 22 26 30
5896 E3: 1 5 9 13 17 21 25 29
5897 E4: 3 7 11 15 19 23 27 31
5898
5899 And RESULT_CHAIN after reordering:
5900
5901 1st vec (E1): 0 4 8 12 16 20 24 28
5902 2nd vec (E3): 1 5 9 13 17 21 25 29
5903 3rd vec (E2): 2 6 10 14 18 22 26 30
5904 4th vec (E4): 3 7 11 15 19 23 27 31. */
5905
5906 static void
5907 vect_permute_load_chain (vec_info *vinfo, vec<tree> dr_chain,
5908 unsigned int length,
5909 stmt_vec_info stmt_info,
5910 gimple_stmt_iterator *gsi,
5911 vec<tree> *result_chain)
5912 {
5913 tree data_ref, first_vect, second_vect;
5914 tree perm_mask_even, perm_mask_odd;
5915 tree perm3_mask_low, perm3_mask_high;
5916 gimple *perm_stmt;
5917 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5918 unsigned int i, j, log_length = exact_log2 (length);
5919
5920 result_chain->quick_grow (length);
5921 memcpy (result_chain->address (), dr_chain.address (),
5922 length * sizeof (tree));
5923
5924 if (length == 3)
5925 {
5926 /* vect_grouped_load_supported ensures that this is constant. */
5927 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5928 unsigned int k;
5929
5930 vec_perm_builder sel (nelt, nelt, 1);
5931 sel.quick_grow (nelt);
5932 vec_perm_indices indices;
5933 for (k = 0; k < 3; k++)
5934 {
5935 for (i = 0; i < nelt; i++)
5936 if (3 * i + k < 2 * nelt)
5937 sel[i] = 3 * i + k;
5938 else
5939 sel[i] = 0;
5940 indices.new_vector (sel, 2, nelt);
5941 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5942
5943 for (i = 0, j = 0; i < nelt; i++)
5944 if (3 * i + k < 2 * nelt)
5945 sel[i] = i;
5946 else
5947 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5948 indices.new_vector (sel, 2, nelt);
5949 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5950
5951 first_vect = dr_chain[0];
5952 second_vect = dr_chain[1];
5953
5954 /* Create interleaving stmt (low part of):
5955 low = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
5956 ...}> */
5957 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5958 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5959 second_vect, perm3_mask_low);
5960 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5961
5962 /* Create interleaving stmt (high part of):
5963 high = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
5964 ...}> */
5965 first_vect = data_ref;
5966 second_vect = dr_chain[2];
5967 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5968 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5969 second_vect, perm3_mask_high);
5970 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5971 (*result_chain)[k] = data_ref;
5972 }
5973 }
5974 else
5975 {
5976 /* If length is not equal to 3 then only power of 2 is supported. */
5977 gcc_assert (pow2p_hwi (length));
5978
5979 /* The encoding has a single stepped pattern. */
5980 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5981 vec_perm_builder sel (nelt, 1, 3);
5982 sel.quick_grow (3);
5983 for (i = 0; i < 3; ++i)
5984 sel[i] = i * 2;
5985 vec_perm_indices indices (sel, 2, nelt);
5986 perm_mask_even = vect_gen_perm_mask_checked (vectype, indices);
5987
5988 for (i = 0; i < 3; ++i)
5989 sel[i] = i * 2 + 1;
5990 indices.new_vector (sel, 2, nelt);
5991 perm_mask_odd = vect_gen_perm_mask_checked (vectype, indices);
5992
5993 for (i = 0; i < log_length; i++)
5994 {
5995 for (j = 0; j < length; j += 2)
5996 {
5997 first_vect = dr_chain[j];
5998 second_vect = dr_chain[j+1];
5999
6000 /* data_ref = permute_even (first_data_ref, second_data_ref); */
6001 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
6002 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6003 first_vect, second_vect,
6004 perm_mask_even);
6005 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6006 (*result_chain)[j/2] = data_ref;
6007
6008 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
6009 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
6010 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6011 first_vect, second_vect,
6012 perm_mask_odd);
6013 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6014 (*result_chain)[j/2+length/2] = data_ref;
6015 }
6016 memcpy (dr_chain.address (), result_chain->address (),
6017 length * sizeof (tree));
6018 }
6019 }
6020 }
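
/* Plain-C reference model of the power-of-two path above (illustration
   only): apply the same log2 (LENGTH) extract-even/odd passes to arrays
   of element indices, storing the even outputs in the first half of the
   chain and the odd outputs in the second half, exactly as the comment
   describes.  For LENGTH == 4 and NELT == 8 the result is
   0 4 8 ... 28 / 1 5 9 ... 29 / 2 6 10 ... 30 / 3 7 11 ... 31.  The
   fixed buffer bound is an assumption made only to keep the sketch
   self-contained.  */

static void
model_permute_load_chain (unsigned int *chain, unsigned int length,
			  unsigned int nelt)
{
  unsigned int result[256];	/* assumes length * nelt <= 256 */

  for (unsigned int stage = 1; stage < length; stage *= 2)
    {
      for (unsigned int j = 0; j < length; j += 2)
	{
	  unsigned int *v1 = chain + j * nelt;
	  unsigned int *v2 = chain + (j + 1) * nelt;
	  unsigned int *even = result + (j / 2) * nelt;
	  unsigned int *odd = result + (j / 2 + length / 2) * nelt;
	  for (unsigned int i = 0; i < nelt / 2; i++)
	    {
	      even[i] = v1[2 * i];		  /* {0, 2, 4, ...}  */
	      even[nelt / 2 + i] = v2[2 * i];
	      odd[i] = v1[2 * i + 1];		  /* {1, 3, 5, ...}  */
	      odd[nelt / 2 + i] = v2[2 * i + 1];
	    }
	}
      memcpy (chain, result, length * nelt * sizeof (unsigned int));
    }
}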
6021
6022 /* Function vect_shift_permute_load_chain.
6023
6024 Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate a
6025 sequence of stmts to reorder the input data accordingly.
6026 Return the final references for loads in RESULT_CHAIN.
6027 Return true if it succeeded, false otherwise.
6028
6029 E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
6030 The input is 3 vectors each containing 8 elements. We assign a
6031 number to each element, the input sequence is:
6032
6033 1st vec: 0 1 2 3 4 5 6 7
6034 2nd vec: 8 9 10 11 12 13 14 15
6035 3rd vec: 16 17 18 19 20 21 22 23
6036
6037 The output sequence should be:
6038
6039 1st vec: 0 3 6 9 12 15 18 21
6040 2nd vec: 1 4 7 10 13 16 19 22
6041 3rd vec: 2 5 8 11 14 17 20 23
6042
6043 We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
6044
6045 First we shuffle all 3 vectors to get correct elements order:
6046
6047 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
6048 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
6049 3rd vec: (16 19 22) (17 20 23) (18 21)
6050
6051 Next we unite and shift vector 3 times:
6052
6053 1st step:
6054 shift right by 6 the concatenation of:
6055 "1st vec" and "2nd vec"
6056 ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
6057 "2nd vec" and "3rd vec"
6058 ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
6059 "3rd vec" and "1st vec"
6060 (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
6061 | New vectors |
6062
6063 So that now new vectors are:
6064
6065 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
6066 2nd vec: (10 13) (16 19 22) (17 20 23)
6067 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
6068
6069 2nd step:
6070 shift right by 5 the concatenation of:
6071 "1st vec" and "3rd vec"
6072 ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
6073 "2nd vec" and "1st vec"
6074 (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
6075 "3rd vec" and "2nd vec"
6076 (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
6077 | New vectors |
6078
6079 So that now new vectors are:
6080
6081 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
6082 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
6083 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
6084
6085 3rd step:
6086 shift right by 5 the concatenation of:
6087 "1st vec" and "1st vec"
6088 ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
6089 shift right by 3 the concatenation of:
6090 "2nd vec" and "2nd vec"
6091 (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
6092 | New vectors |
6093
6094 So that now all vectors are READY:
6095 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
6096 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
6097 3rd vec: ( 1 4 7) (10 13) (16 19 22)
6098
6099 This algorithm is faster than the one in vect_permute_load_chain if:
6100 1. "shift of a concatenation" is faster than general permutation.
6101 This is usually so.
6102 2. The TARGET machine can't execute vector instructions in parallel.
6103 This is because each step of the algorithm depends on the previous
6104 one. The algorithm in vect_permute_load_chain is much more parallel.
6105
6106 The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
6107 */
6108
6109 static bool
6110 vect_shift_permute_load_chain (vec_info *vinfo, vec<tree> dr_chain,
6111 unsigned int length,
6112 stmt_vec_info stmt_info,
6113 gimple_stmt_iterator *gsi,
6114 vec<tree> *result_chain)
6115 {
6116 tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
6117 tree perm2_mask1, perm2_mask2, perm3_mask;
6118 tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
6119 gimple *perm_stmt;
6120
6121 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6122 unsigned int i;
6123 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
6124
6125 unsigned HOST_WIDE_INT nelt, vf;
6126 if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nelt)
6127 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&vf))
6128 /* Not supported for variable-length vectors. */
6129 return false;
6130
6131 vec_perm_builder sel (nelt, nelt, 1);
6132 sel.quick_grow (nelt);
6133
6134 result_chain->quick_grow (length);
6135 memcpy (result_chain->address (), dr_chain.address (),
6136 length * sizeof (tree));
6137
6138 if (pow2p_hwi (length) && vf > 4)
6139 {
6140 unsigned int j, log_length = exact_log2 (length);
6141 for (i = 0; i < nelt / 2; ++i)
6142 sel[i] = i * 2;
6143 for (i = 0; i < nelt / 2; ++i)
6144 sel[nelt / 2 + i] = i * 2 + 1;
6145 vec_perm_indices indices (sel, 2, nelt);
6146 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6147 {
6148 if (dump_enabled_p ())
6149 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6150 "shuffle of 2 fields structure is not \
6151 supported by target\n");
6152 return false;
6153 }
6154 perm2_mask1 = vect_gen_perm_mask_checked (vectype, indices);
6155
6156 for (i = 0; i < nelt / 2; ++i)
6157 sel[i] = i * 2 + 1;
6158 for (i = 0; i < nelt / 2; ++i)
6159 sel[nelt / 2 + i] = i * 2;
6160 indices.new_vector (sel, 2, nelt);
6161 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6162 {
6163 if (dump_enabled_p ())
6164 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6165 "shuffle of 2 fields structure is not \
6166 supported by target\n");
6167 return false;
6168 }
6169 perm2_mask2 = vect_gen_perm_mask_checked (vectype, indices);
6170
6171 /* Generating permutation constant to shift all elements.
6172 For vector length 8 it is {4 5 6 7 8 9 10 11}. */
6173 for (i = 0; i < nelt; i++)
6174 sel[i] = nelt / 2 + i;
6175 indices.new_vector (sel, 2, nelt);
6176 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6177 {
6178 if (dump_enabled_p ())
6179 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6180 "shift permutation is not supported by target\n");
6181 return false;
6182 }
6183 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6184
6185 /* Generating permutation constant to select vector from 2.
6186 For vector length 8 it is {0 1 2 3 12 13 14 15}. */
6187 for (i = 0; i < nelt / 2; i++)
6188 sel[i] = i;
6189 for (i = nelt / 2; i < nelt; i++)
6190 sel[i] = nelt + i;
6191 indices.new_vector (sel, 2, nelt);
6192 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6193 {
6194 if (dump_enabled_p ())
6195 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6196 "select is not supported by target\n");
6197 return false;
6198 }
6199 select_mask = vect_gen_perm_mask_checked (vectype, indices);
6200
6201 for (i = 0; i < log_length; i++)
6202 {
6203 for (j = 0; j < length; j += 2)
6204 {
6205 first_vect = dr_chain[j];
6206 second_vect = dr_chain[j + 1];
6207
6208 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6209 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6210 first_vect, first_vect,
6211 perm2_mask1);
6212 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6213 vect[0] = data_ref;
6214
6215 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6216 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6217 second_vect, second_vect,
6218 perm2_mask2);
6219 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6220 vect[1] = data_ref;
6221
6222 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
6223 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6224 vect[0], vect[1], shift1_mask);
6225 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6226 (*result_chain)[j/2 + length/2] = data_ref;
6227
6228 data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
6229 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6230 vect[0], vect[1], select_mask);
6231 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6232 (*result_chain)[j/2] = data_ref;
6233 }
6234 memcpy (dr_chain.address (), result_chain->address (),
6235 length * sizeof (tree));
6236 }
6237 return true;
6238 }
6239 if (length == 3 && vf > 2)
6240 {
6241 unsigned int k = 0, l = 0;
6242
6243 /* Generating permutation constant to get all elements in right order.
6244 For vector length 8 it is {0 3 6 1 4 7 2 5}. */
6245 for (i = 0; i < nelt; i++)
6246 {
6247 if (3 * k + (l % 3) >= nelt)
6248 {
6249 k = 0;
6250 l += (3 - (nelt % 3));
6251 }
6252 sel[i] = 3 * k + (l % 3);
6253 k++;
6254 }
6255 vec_perm_indices indices (sel, 2, nelt);
6256 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6257 {
6258 if (dump_enabled_p ())
6259 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6260 "shuffle of 3 fields structure is not \
6261 supported by target\n");
6262 return false;
6263 }
6264 perm3_mask = vect_gen_perm_mask_checked (vectype, indices);
6265
6266 /* Generating permutation constant to shift all elements.
6267 For vector length 8 it is {6 7 8 9 10 11 12 13}. */
6268 for (i = 0; i < nelt; i++)
6269 sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
6270 indices.new_vector (sel, 2, nelt);
6271 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6272 {
6273 if (dump_enabled_p ())
6274 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6275 "shift permutation is not supported by target\n");
6276 return false;
6277 }
6278 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6279
6280 /* Generating permutation constant to shift all elements.
6281 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6282 for (i = 0; i < nelt; i++)
6283 sel[i] = 2 * (nelt / 3) + 1 + i;
6284 indices.new_vector (sel, 2, nelt);
6285 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6286 {
6287 if (dump_enabled_p ())
6288 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6289 "shift permutation is not supported by target\n");
6290 return false;
6291 }
6292 shift2_mask = vect_gen_perm_mask_checked (vectype, indices);
6293
6294 /* Generating permutation constant to shift all elements.
6295 For vector length 8 it is {3 4 5 6 7 8 9 10}. */
6296 for (i = 0; i < nelt; i++)
6297 sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
6298 indices.new_vector (sel, 2, nelt);
6299 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6300 {
6301 if (dump_enabled_p ())
6302 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6303 "shift permutation is not supported by target\n");
6304 return false;
6305 }
6306 shift3_mask = vect_gen_perm_mask_checked (vectype, indices);
6307
6308 /* Generating permutation constant to shift all elements.
6309 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6310 for (i = 0; i < nelt; i++)
6311 sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
6312 indices.new_vector (sel, 2, nelt);
6313 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6314 {
6315 if (dump_enabled_p ())
6316 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6317 "shift permutation is not supported by target\n");
6318 return false;
6319 }
6320 shift4_mask = vect_gen_perm_mask_checked (vectype, indices);
6321
6322 for (k = 0; k < 3; k++)
6323 {
6324 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
6325 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6326 dr_chain[k], dr_chain[k],
6327 perm3_mask);
6328 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6329 vect[k] = data_ref;
6330 }
6331
6332 for (k = 0; k < 3; k++)
6333 {
6334 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
6335 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6336 vect[k % 3], vect[(k + 1) % 3],
6337 shift1_mask);
6338 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6339 vect_shift[k] = data_ref;
6340 }
6341
6342 for (k = 0; k < 3; k++)
6343 {
6344 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
6345 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6346 vect_shift[(4 - k) % 3],
6347 vect_shift[(3 - k) % 3],
6348 shift2_mask);
6349 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6350 vect[k] = data_ref;
6351 }
6352
6353 (*result_chain)[3 - (nelt % 3)] = vect[2];
6354
6355 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
6356 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
6357 vect[0], shift3_mask);
6358 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6359 (*result_chain)[nelt % 3] = data_ref;
6360
6361 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
6362 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
6363 vect[1], shift4_mask);
6364 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6365 (*result_chain)[0] = data_ref;
6366 return true;
6367 }
6368 return false;
6369 }
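
/* Plain-C illustration (not GCC internals) of the "shift of a
   concatenation" building block used repeatedly above:
   VEC_PERM_EXPR <v0, v1, {S, S + 1, ..., S + NELT - 1}> selects the NELT
   consecutive elements starting at offset S of the concatenation v0|v1.
   shift1_mask, shift2_mask, shift3_mask and shift4_mask are all of this
   form, differing only in S.  */

static void
model_shift_of_concat (const unsigned int *v0, const unsigned int *v1,
		       unsigned int nelt, unsigned int s,
		       unsigned int *result)
{
  for (unsigned int i = 0; i < nelt; i++)
    {
      unsigned int idx = s + i;		/* index into the concatenation  */
      result[i] = idx < nelt ? v0[idx] : v1[idx - nelt];
    }
}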
6370
6371 /* Function vect_transform_grouped_load.
6372
6373 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
6374 to perform their permutation and ascribe the result vectorized statements to
6375 the scalar statements.
6376 */
6377
6378 void
6379 vect_transform_grouped_load (vec_info *vinfo, stmt_vec_info stmt_info,
6380 vec<tree> dr_chain,
6381 int size, gimple_stmt_iterator *gsi)
6382 {
6383 machine_mode mode;
6384 vec<tree> result_chain = vNULL;
6385
6386 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
6387 RESULT_CHAIN is the output of vect_permute_load_chain: it contains the
6388 permuted vectors, which are ready for vector computation. */
6389 result_chain.create (size);
6390
6391 /* If the reassociation width for the vector type is 2 or greater, the target
6392 machine can execute 2 or more vector instructions in parallel.  Otherwise
6393 try to get the chain for the load group using vect_shift_permute_load_chain. */
6394 mode = TYPE_MODE (STMT_VINFO_VECTYPE (stmt_info));
6395 if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
6396 || pow2p_hwi (size)
6397 || !vect_shift_permute_load_chain (vinfo, dr_chain, size, stmt_info,
6398 gsi, &result_chain))
6399 vect_permute_load_chain (vinfo, dr_chain,
6400 size, stmt_info, gsi, &result_chain);
6401 vect_record_grouped_load_vectors (vinfo, stmt_info, result_chain);
6402 result_chain.release ();
6403 }
6404
6405 /* RESULT_CHAIN contains the output of a group of grouped loads that were
6406 generated as part of the vectorization of STMT_INFO. Assign the statement
6407 for each vector to the associated scalar statement. */
6408
6409 void
6410 vect_record_grouped_load_vectors (vec_info *vinfo, stmt_vec_info stmt_info,
6411 vec<tree> result_chain)
6412 {
6413 stmt_vec_info first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
6414 unsigned int i, gap_count;
6415 tree tmp_data_ref;
6416
6417 /* Put a permuted data-ref in the VECTORIZED_STMT field.
6418 Since we scan the chain starting from its first node, their order
6419 corresponds to the order of data-refs in RESULT_CHAIN. */
6420 stmt_vec_info next_stmt_info = first_stmt_info;
6421 gap_count = 1;
6422 FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
6423 {
6424 if (!next_stmt_info)
6425 break;
6426
6427 /* Skip the gaps. Loads created for the gaps will be removed by dead
6428 code elimination pass later. No need to check for the first stmt in
6429 the group, since it always exists.
6430 DR_GROUP_GAP is the number of steps in elements from the previous
6431 access (if there is no gap DR_GROUP_GAP is 1). We skip loads that
6432 correspond to the gaps. */
6433 if (next_stmt_info != first_stmt_info
6434 && gap_count < DR_GROUP_GAP (next_stmt_info))
6435 {
6436 gap_count++;
6437 continue;
6438 }
6439
6440 /* ??? The following needs cleanup after the removal of
6441 DR_GROUP_SAME_DR_STMT. */
6442 if (next_stmt_info)
6443 {
6444 stmt_vec_info new_stmt_info = vinfo->lookup_def (tmp_data_ref);
6445 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
6446 copies, and we put the new vector statement in the first available
6447 RELATED_STMT. */
6448 if (!STMT_VINFO_VEC_STMT (next_stmt_info))
6449 STMT_VINFO_VEC_STMT (next_stmt_info) = new_stmt_info;
6450 else
6451 {
6452 stmt_vec_info prev_stmt_info
6453 = STMT_VINFO_VEC_STMT (next_stmt_info);
6454 stmt_vec_info rel_stmt_info
6455 = STMT_VINFO_RELATED_STMT (prev_stmt_info);
6456 while (rel_stmt_info)
6457 {
6458 prev_stmt_info = rel_stmt_info;
6459 rel_stmt_info = STMT_VINFO_RELATED_STMT (rel_stmt_info);
6460 }
6461
6462 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt_info;
6463 }
6464
6465 next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
6466 gap_count = 1;
6467 }
6468 }
6469 }
6470
6471 /* Function vect_can_force_dr_alignment_p.
6472
6473 Returns whether DECL can be forced to be aligned on an ALIGNMENT-bit
6474 boundary. */
6475
6476 bool
6477 vect_can_force_dr_alignment_p (const_tree decl, poly_uint64 alignment)
6478 {
6479 if (!VAR_P (decl))
6480 return false;
6481
6482 if (decl_in_symtab_p (decl)
6483 && !symtab_node::get (decl)->can_increase_alignment_p ())
6484 return false;
6485
6486 if (TREE_STATIC (decl))
6487 return (known_le (alignment,
6488 (unsigned HOST_WIDE_INT) MAX_OFILE_ALIGNMENT));
6489 else
6490 return (known_le (alignment, (unsigned HOST_WIDE_INT) MAX_STACK_ALIGNMENT));
6491 }
6492
6493
6494 /* Return whether the data reference DR_INFO is supported with respect to its
6495 alignment.
6496 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
6497 if it is aligned, i.e., check if it is possible to vectorize it with a
6498 different alignment. */
6499
6500 enum dr_alignment_support
6501 vect_supportable_dr_alignment (vec_info *vinfo, dr_vec_info *dr_info,
6502 bool check_aligned_accesses)
6503 {
6504 data_reference *dr = dr_info->dr;
6505 stmt_vec_info stmt_info = dr_info->stmt;
6506 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6507 machine_mode mode = TYPE_MODE (vectype);
6508 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
6509 class loop *vect_loop = NULL;
6510 bool nested_in_vect_loop = false;
6511
6512 if (aligned_access_p (dr_info) && !check_aligned_accesses)
6513 return dr_aligned;
6514
6515 /* For now assume all conditional loads/stores support unaligned
6516 access without any special code. */
6517 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
6518 if (gimple_call_internal_p (stmt)
6519 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
6520 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
6521 return dr_unaligned_supported;
6522
6523 if (loop_vinfo)
6524 {
6525 vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
6526 nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt_info);
6527 }
6528
6529 /* Possibly unaligned access. */
6530
6531 /* We can choose between using the implicit realignment scheme (generating
6532 a misaligned_move stmt) and the explicit realignment scheme (generating
6533 aligned loads with a REALIGN_LOAD). There are two variants to the
6534 explicit realignment scheme: optimized, and unoptimized.
6535 We can optimize the realignment only if the step between consecutive
6536 vector loads is equal to the vector size. Since the vector memory
6537 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
6538 is guaranteed that the misalignment amount remains the same throughout the
6539 execution of the vectorized loop. Therefore, we can create the
6540 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
6541 at the loop preheader.
6542
6543 However, in the case of outer-loop vectorization, when vectorizing a
6544 memory access in the inner-loop nested within the LOOP that is now being
6545 vectorized, while it is guaranteed that the misalignment of the
6546 vectorized memory access will remain the same in different outer-loop
6547 iterations, it is *not* guaranteed that is will remain the same throughout
6548 the execution of the inner-loop. This is because the inner-loop advances
6549 with the original scalar step (and not in steps of VS). If the inner-loop
6550 step happens to be a multiple of VS, then the misalignment remains fixed
6551 and we can use the optimized realignment scheme. For example:
6552
6553 for (i=0; i<N; i++)
6554 for (j=0; j<M; j++)
6555 s += a[i+j];
6556
6557 When vectorizing the i-loop in the above example, the step between
6558 consecutive vector loads is 1, and so the misalignment does not remain
6559 fixed across the execution of the inner-loop, and the realignment cannot
6560 be optimized (as illustrated in the following pseudo vectorized loop):
6561
6562 for (i=0; i<N; i+=4)
6563 for (j=0; j<M; j++){
6564 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
6565 // when j is {0,1,2,3,4,5,6,7,...} respectively.
6566 // (assuming that we start from an aligned address).
6567 }
6568
6569 We therefore have to use the unoptimized realignment scheme:
6570
6571 for (i=0; i<N; i+=4)
6572 for (j=k; j<M; j+=4)
6573 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
6574 // that the misalignment of the initial address is
6575 // 0).
6576
6577 The loop can then be vectorized as follows:
6578
6579 for (k=0; k<4; k++){
6580 rt = get_realignment_token (&vp[k]);
6581 for (i=0; i<N; i+=4){
6582 v1 = vp[i+k];
6583 for (j=k; j<M; j+=4){
6584 v2 = vp[i+j+VS-1];
6585 va = REALIGN_LOAD <v1,v2,rt>;
6586 vs += va;
6587 v1 = v2;
6588 }
6589 }
6590 } */
6591
6592 if (DR_IS_READ (dr))
6593 {
6594 bool is_packed = false;
6595 tree type = (TREE_TYPE (DR_REF (dr)));
6596
6597 if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
6598 && (!targetm.vectorize.builtin_mask_for_load
6599 || targetm.vectorize.builtin_mask_for_load ()))
6600 {
6601 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6602
6603 /* If we are doing SLP then the accesses need not have the
6604 same alignment; instead it depends on the SLP group size. */
6605 if (loop_vinfo
6606 && STMT_SLP_TYPE (stmt_info)
6607 && !multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6608 * (DR_GROUP_SIZE
6609 (DR_GROUP_FIRST_ELEMENT (stmt_info))),
6610 TYPE_VECTOR_SUBPARTS (vectype)))
6611 ;
6612 else if (!loop_vinfo
6613 || (nested_in_vect_loop
6614 && maybe_ne (TREE_INT_CST_LOW (DR_STEP (dr)),
6615 GET_MODE_SIZE (TYPE_MODE (vectype)))))
6616 return dr_explicit_realign;
6617 else
6618 return dr_explicit_realign_optimized;
6619 }
6620 if (!known_alignment_for_access_p (dr_info))
6621 is_packed = not_size_aligned (DR_REF (dr));
6622
6623 if (targetm.vectorize.support_vector_misalignment
6624 (mode, type, DR_MISALIGNMENT (dr_info), is_packed))
6625 /* Can't software pipeline the loads, but can at least do them. */
6626 return dr_unaligned_supported;
6627 }
6628 else
6629 {
6630 bool is_packed = false;
6631 tree type = (TREE_TYPE (DR_REF (dr)));
6632
6633 if (!known_alignment_for_access_p (dr_info))
6634 is_packed = not_size_aligned (DR_REF (dr));
6635
6636 if (targetm.vectorize.support_vector_misalignment
6637 (mode, type, DR_MISALIGNMENT (dr_info), is_packed))
6638 return dr_unaligned_supported;
6639 }
6640
6641 /* Unsupported. */
6642 return dr_unaligned_unsupported;
6643 }