]> git.ipfire.org Git - thirdparty/gcc.git/blob - gcc/tree-vect-data-refs.c
x86: Remove "%!" before ret
[thirdparty/gcc.git] / gcc / tree-vect-data-refs.c
1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2 Copyright (C) 2003-2021 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "predict.h"
31 #include "memmodel.h"
32 #include "tm_p.h"
33 #include "ssa.h"
34 #include "optabs-tree.h"
35 #include "cgraph.h"
36 #include "dumpfile.h"
37 #include "alias.h"
38 #include "fold-const.h"
39 #include "stor-layout.h"
40 #include "tree-eh.h"
41 #include "gimplify.h"
42 #include "gimple-iterator.h"
43 #include "gimplify-me.h"
44 #include "tree-ssa-loop-ivopts.h"
45 #include "tree-ssa-loop-manip.h"
46 #include "tree-ssa-loop.h"
47 #include "cfgloop.h"
48 #include "tree-scalar-evolution.h"
49 #include "tree-vectorizer.h"
50 #include "expr.h"
51 #include "builtins.h"
52 #include "tree-cfg.h"
53 #include "tree-hash-traits.h"
54 #include "vec-perm-indices.h"
55 #include "internal-fn.h"
56 #include "gimple-fold.h"
57
58 /* Return true if load- or store-lanes optab OPTAB is implemented for
59 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
60
61 static bool
62 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
63 tree vectype, unsigned HOST_WIDE_INT count)
64 {
65 machine_mode mode, array_mode;
66 bool limit_p;
67
68 mode = TYPE_MODE (vectype);
69 if (!targetm.array_mode (mode, count).exists (&array_mode))
70 {
71 poly_uint64 bits = count * GET_MODE_BITSIZE (mode);
72 limit_p = !targetm.array_mode_supported_p (mode, count);
73 if (!int_mode_for_size (bits, limit_p).exists (&array_mode))
74 {
75 if (dump_enabled_p ())
76 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
77 "no array mode for %s[%wu]\n",
78 GET_MODE_NAME (mode), count);
79 return false;
80 }
81 }
82
83 if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
84 {
85 if (dump_enabled_p ())
86 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
87 "cannot use %s<%s><%s>\n", name,
88 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
89 return false;
90 }
91
92 if (dump_enabled_p ())
93 dump_printf_loc (MSG_NOTE, vect_location,
94 "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
95 GET_MODE_NAME (mode));
96
97 return true;
98 }
99
100
101 /* Return the smallest scalar part of STMT_INFO.
102 This is used to determine the vectype of the stmt. We generally set the
103 vectype according to the type of the result (lhs). For stmts whose
104 result-type is different than the type of the arguments (e.g., demotion,
105 promotion), vectype will be reset appropriately (later). Note that we have
106 to visit the smallest datatype in this function, because that determines the
107 VF. If the smallest datatype in the loop is present only as the rhs of a
108 promotion operation - we'd miss it.
109 Such a case, where a variable of this datatype does not appear in the lhs
110 anywhere in the loop, can only occur if it's an invariant: e.g.:
111 'int_x = (int) short_inv', which we'd expect to have been optimized away by
112 invariant motion. However, we cannot rely on invariant motion to always
113 take invariants out of the loop, and so in the case of promotion we also
114 have to check the rhs.
115 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
116 types. */
117
118 tree
119 vect_get_smallest_scalar_type (stmt_vec_info stmt_info, tree scalar_type)
120 {
121 HOST_WIDE_INT lhs, rhs;
122
123 /* During the analysis phase, this function is called on arbitrary
124 statements that might not have scalar results. */
125 if (!tree_fits_uhwi_p (TYPE_SIZE_UNIT (scalar_type)))
126 return scalar_type;
127
128 lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
129
130 gassign *assign = dyn_cast <gassign *> (stmt_info->stmt);
131 if (assign)
132 {
133 scalar_type = TREE_TYPE (gimple_assign_lhs (assign));
134 if (gimple_assign_cast_p (assign)
135 || gimple_assign_rhs_code (assign) == DOT_PROD_EXPR
136 || gimple_assign_rhs_code (assign) == WIDEN_SUM_EXPR
137 || gimple_assign_rhs_code (assign) == WIDEN_MULT_EXPR
138 || gimple_assign_rhs_code (assign) == WIDEN_LSHIFT_EXPR
139 || gimple_assign_rhs_code (assign) == WIDEN_PLUS_EXPR
140 || gimple_assign_rhs_code (assign) == WIDEN_MINUS_EXPR
141 || gimple_assign_rhs_code (assign) == FLOAT_EXPR)
142 {
143 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (assign));
144
145 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
146 if (rhs < lhs)
147 scalar_type = rhs_type;
148 }
149 }
150 else if (gcall *call = dyn_cast <gcall *> (stmt_info->stmt))
151 {
152 unsigned int i = 0;
153 if (gimple_call_internal_p (call))
154 {
155 internal_fn ifn = gimple_call_internal_fn (call);
156 if (internal_load_fn_p (ifn))
157 /* For loads the LHS type does the trick. */
158 i = ~0U;
159 else if (internal_store_fn_p (ifn))
160 {
161 /* For stores use the tyep of the stored value. */
162 i = internal_fn_stored_value_index (ifn);
163 scalar_type = TREE_TYPE (gimple_call_arg (call, i));
164 i = ~0U;
165 }
166 else if (internal_fn_mask_index (ifn) == 0)
167 i = 1;
168 }
169 if (i < gimple_call_num_args (call))
170 {
171 tree rhs_type = TREE_TYPE (gimple_call_arg (call, i));
172 if (tree_fits_uhwi_p (TYPE_SIZE_UNIT (rhs_type)))
173 {
174 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
175 if (rhs < lhs)
176 scalar_type = rhs_type;
177 }
178 }
179 }
180
181 return scalar_type;
182 }
183
184
185 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
186 tested at run-time. Return TRUE if DDR was successfully inserted.
187 Return false if versioning is not supported. */
188
189 static opt_result
190 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
191 {
192 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
193
194 if ((unsigned) param_vect_max_version_for_alias_checks == 0)
195 return opt_result::failure_at (vect_location,
196 "will not create alias checks, as"
197 " --param vect-max-version-for-alias-checks"
198 " == 0\n");
199
200 opt_result res
201 = runtime_alias_check_p (ddr, loop,
202 optimize_loop_nest_for_speed_p (loop));
203 if (!res)
204 return res;
205
206 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
207 return opt_result::success ();
208 }
209
210 /* Record that loop LOOP_VINFO needs to check that VALUE is nonzero. */
211
212 static void
213 vect_check_nonzero_value (loop_vec_info loop_vinfo, tree value)
214 {
215 const vec<tree> &checks = LOOP_VINFO_CHECK_NONZERO (loop_vinfo);
216 for (unsigned int i = 0; i < checks.length(); ++i)
217 if (checks[i] == value)
218 return;
219
220 if (dump_enabled_p ())
221 dump_printf_loc (MSG_NOTE, vect_location,
222 "need run-time check that %T is nonzero\n",
223 value);
224 LOOP_VINFO_CHECK_NONZERO (loop_vinfo).safe_push (value);
225 }
226
227 /* Return true if we know that the order of vectorized DR_INFO_A and
228 vectorized DR_INFO_B will be the same as the order of DR_INFO_A and
229 DR_INFO_B. At least one of the accesses is a write. */
230
231 static bool
232 vect_preserves_scalar_order_p (dr_vec_info *dr_info_a, dr_vec_info *dr_info_b)
233 {
234 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
235 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
236
237 /* Single statements are always kept in their original order. */
238 if (!STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
239 && !STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
240 return true;
241
242 /* STMT_A and STMT_B belong to overlapping groups. All loads are
243 emitted at the position of the first scalar load.
244 Stores in a group are emitted at the position of the last scalar store.
245 Compute that position and check whether the resulting order matches
246 the current one. */
247 stmt_vec_info il_a = DR_GROUP_FIRST_ELEMENT (stmtinfo_a);
248 if (il_a)
249 {
250 if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmtinfo_a)))
251 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_a); s;
252 s = DR_GROUP_NEXT_ELEMENT (s))
253 il_a = get_later_stmt (il_a, s);
254 else /* DR_IS_READ */
255 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_a); s;
256 s = DR_GROUP_NEXT_ELEMENT (s))
257 if (get_later_stmt (il_a, s) == il_a)
258 il_a = s;
259 }
260 else
261 il_a = stmtinfo_a;
262 stmt_vec_info il_b = DR_GROUP_FIRST_ELEMENT (stmtinfo_b);
263 if (il_b)
264 {
265 if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmtinfo_b)))
266 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_b); s;
267 s = DR_GROUP_NEXT_ELEMENT (s))
268 il_b = get_later_stmt (il_b, s);
269 else /* DR_IS_READ */
270 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_b); s;
271 s = DR_GROUP_NEXT_ELEMENT (s))
272 if (get_later_stmt (il_b, s) == il_b)
273 il_b = s;
274 }
275 else
276 il_b = stmtinfo_b;
277 bool a_after_b = (get_later_stmt (stmtinfo_a, stmtinfo_b) == stmtinfo_a);
278 return (get_later_stmt (il_a, il_b) == il_a) == a_after_b;
279 }
280
281 /* A subroutine of vect_analyze_data_ref_dependence. Handle
282 DDR_COULD_BE_INDEPENDENT_P ddr DDR that has a known set of dependence
283 distances. These distances are conservatively correct but they don't
284 reflect a guaranteed dependence.
285
286 Return true if this function does all the work necessary to avoid
287 an alias or false if the caller should use the dependence distances
288 to limit the vectorization factor in the usual way. LOOP_DEPTH is
289 the depth of the loop described by LOOP_VINFO and the other arguments
290 are as for vect_analyze_data_ref_dependence. */
291
292 static bool
293 vect_analyze_possibly_independent_ddr (data_dependence_relation *ddr,
294 loop_vec_info loop_vinfo,
295 int loop_depth, unsigned int *max_vf)
296 {
297 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
298 for (lambda_vector &dist_v : DDR_DIST_VECTS (ddr))
299 {
300 int dist = dist_v[loop_depth];
301 if (dist != 0 && !(dist > 0 && DDR_REVERSED_P (ddr)))
302 {
303 /* If the user asserted safelen >= DIST consecutive iterations
304 can be executed concurrently, assume independence.
305
306 ??? An alternative would be to add the alias check even
307 in this case, and vectorize the fallback loop with the
308 maximum VF set to safelen. However, if the user has
309 explicitly given a length, it's less likely that that
310 would be a win. */
311 if (loop->safelen >= 2 && abs_hwi (dist) <= loop->safelen)
312 {
313 if ((unsigned int) loop->safelen < *max_vf)
314 *max_vf = loop->safelen;
315 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
316 continue;
317 }
318
319 /* For dependence distances of 2 or more, we have the option
320 of limiting VF or checking for an alias at runtime.
321 Prefer to check at runtime if we can, to avoid limiting
322 the VF unnecessarily when the bases are in fact independent.
323
324 Note that the alias checks will be removed if the VF ends up
325 being small enough. */
326 dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (DDR_A (ddr));
327 dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (DDR_B (ddr));
328 return (!STMT_VINFO_GATHER_SCATTER_P (dr_info_a->stmt)
329 && !STMT_VINFO_GATHER_SCATTER_P (dr_info_b->stmt)
330 && vect_mark_for_runtime_alias_test (ddr, loop_vinfo));
331 }
332 }
333 return true;
334 }
335
336
337 /* Function vect_analyze_data_ref_dependence.
338
339 FIXME: I needed to change the sense of the returned flag.
340
341 Return FALSE if there (might) exist a dependence between a memory-reference
342 DRA and a memory-reference DRB. When versioning for alias may check a
343 dependence at run-time, return TRUE. Adjust *MAX_VF according to
344 the data dependence. */
345
346 static opt_result
347 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
348 loop_vec_info loop_vinfo,
349 unsigned int *max_vf)
350 {
351 unsigned int i;
352 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
353 struct data_reference *dra = DDR_A (ddr);
354 struct data_reference *drb = DDR_B (ddr);
355 dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (dra);
356 dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (drb);
357 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
358 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
359 lambda_vector dist_v;
360 unsigned int loop_depth;
361
362 /* In loop analysis all data references should be vectorizable. */
363 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
364 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
365 gcc_unreachable ();
366
367 /* Independent data accesses. */
368 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
369 return opt_result::success ();
370
371 if (dra == drb
372 || (DR_IS_READ (dra) && DR_IS_READ (drb)))
373 return opt_result::success ();
374
375 /* We do not have to consider dependences between accesses that belong
376 to the same group, unless the stride could be smaller than the
377 group size. */
378 if (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
379 && (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
380 == DR_GROUP_FIRST_ELEMENT (stmtinfo_b))
381 && !STMT_VINFO_STRIDED_P (stmtinfo_a))
382 return opt_result::success ();
383
384 /* Even if we have an anti-dependence then, as the vectorized loop covers at
385 least two scalar iterations, there is always also a true dependence.
386 As the vectorizer does not re-order loads and stores we can ignore
387 the anti-dependence if TBAA can disambiguate both DRs similar to the
388 case with known negative distance anti-dependences (positive
389 distance anti-dependences would violate TBAA constraints). */
390 if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
391 || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
392 && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
393 get_alias_set (DR_REF (drb))))
394 return opt_result::success ();
395
396 /* Unknown data dependence. */
397 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
398 {
399 /* If user asserted safelen consecutive iterations can be
400 executed concurrently, assume independence. */
401 if (loop->safelen >= 2)
402 {
403 if ((unsigned int) loop->safelen < *max_vf)
404 *max_vf = loop->safelen;
405 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
406 return opt_result::success ();
407 }
408
409 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
410 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
411 return opt_result::failure_at
412 (stmtinfo_a->stmt,
413 "versioning for alias not supported for: "
414 "can't determine dependence between %T and %T\n",
415 DR_REF (dra), DR_REF (drb));
416
417 if (dump_enabled_p ())
418 dump_printf_loc (MSG_MISSED_OPTIMIZATION, stmtinfo_a->stmt,
419 "versioning for alias required: "
420 "can't determine dependence between %T and %T\n",
421 DR_REF (dra), DR_REF (drb));
422
423 /* Add to list of ddrs that need to be tested at run-time. */
424 return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
425 }
426
427 /* Known data dependence. */
428 if (DDR_NUM_DIST_VECTS (ddr) == 0)
429 {
430 /* If user asserted safelen consecutive iterations can be
431 executed concurrently, assume independence. */
432 if (loop->safelen >= 2)
433 {
434 if ((unsigned int) loop->safelen < *max_vf)
435 *max_vf = loop->safelen;
436 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
437 return opt_result::success ();
438 }
439
440 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
441 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
442 return opt_result::failure_at
443 (stmtinfo_a->stmt,
444 "versioning for alias not supported for: "
445 "bad dist vector for %T and %T\n",
446 DR_REF (dra), DR_REF (drb));
447
448 if (dump_enabled_p ())
449 dump_printf_loc (MSG_MISSED_OPTIMIZATION, stmtinfo_a->stmt,
450 "versioning for alias required: "
451 "bad dist vector for %T and %T\n",
452 DR_REF (dra), DR_REF (drb));
453 /* Add to list of ddrs that need to be tested at run-time. */
454 return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
455 }
456
457 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
458
459 if (DDR_COULD_BE_INDEPENDENT_P (ddr)
460 && vect_analyze_possibly_independent_ddr (ddr, loop_vinfo,
461 loop_depth, max_vf))
462 return opt_result::success ();
463
464 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
465 {
466 int dist = dist_v[loop_depth];
467
468 if (dump_enabled_p ())
469 dump_printf_loc (MSG_NOTE, vect_location,
470 "dependence distance = %d.\n", dist);
471
472 if (dist == 0)
473 {
474 if (dump_enabled_p ())
475 dump_printf_loc (MSG_NOTE, vect_location,
476 "dependence distance == 0 between %T and %T\n",
477 DR_REF (dra), DR_REF (drb));
478
479 /* When we perform grouped accesses and perform implicit CSE
480 by detecting equal accesses and doing disambiguation with
481 runtime alias tests like for
482 .. = a[i];
483 .. = a[i+1];
484 a[i] = ..;
485 a[i+1] = ..;
486 *p = ..;
487 .. = a[i];
488 .. = a[i+1];
489 where we will end up loading { a[i], a[i+1] } once, make
490 sure that inserting group loads before the first load and
491 stores after the last store will do the right thing.
492 Similar for groups like
493 a[i] = ...;
494 ... = a[i];
495 a[i+1] = ...;
496 where loads from the group interleave with the store. */
497 if (!vect_preserves_scalar_order_p (dr_info_a, dr_info_b))
498 return opt_result::failure_at (stmtinfo_a->stmt,
499 "READ_WRITE dependence"
500 " in interleaving.\n");
501
502 if (loop->safelen < 2)
503 {
504 tree indicator = dr_zero_step_indicator (dra);
505 if (!indicator || integer_zerop (indicator))
506 return opt_result::failure_at (stmtinfo_a->stmt,
507 "access also has a zero step\n");
508 else if (TREE_CODE (indicator) != INTEGER_CST)
509 vect_check_nonzero_value (loop_vinfo, indicator);
510 }
511 continue;
512 }
513
514 if (dist > 0 && DDR_REVERSED_P (ddr))
515 {
516 /* If DDR_REVERSED_P the order of the data-refs in DDR was
517 reversed (to make distance vector positive), and the actual
518 distance is negative. */
519 if (dump_enabled_p ())
520 dump_printf_loc (MSG_NOTE, vect_location,
521 "dependence distance negative.\n");
522 /* When doing outer loop vectorization, we need to check if there is
523 a backward dependence at the inner loop level if the dependence
524 at the outer loop is reversed. See PR81740. */
525 if (nested_in_vect_loop_p (loop, stmtinfo_a)
526 || nested_in_vect_loop_p (loop, stmtinfo_b))
527 {
528 unsigned inner_depth = index_in_loop_nest (loop->inner->num,
529 DDR_LOOP_NEST (ddr));
530 if (dist_v[inner_depth] < 0)
531 return opt_result::failure_at (stmtinfo_a->stmt,
532 "not vectorized, dependence "
533 "between data-refs %T and %T\n",
534 DR_REF (dra), DR_REF (drb));
535 }
536 /* Record a negative dependence distance to later limit the
537 amount of stmt copying / unrolling we can perform.
538 Only need to handle read-after-write dependence. */
539 if (DR_IS_READ (drb)
540 && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
541 || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
542 STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
543 continue;
544 }
545
546 unsigned int abs_dist = abs (dist);
547 if (abs_dist >= 2 && abs_dist < *max_vf)
548 {
549 /* The dependence distance requires reduction of the maximal
550 vectorization factor. */
551 *max_vf = abs_dist;
552 if (dump_enabled_p ())
553 dump_printf_loc (MSG_NOTE, vect_location,
554 "adjusting maximal vectorization factor to %i\n",
555 *max_vf);
556 }
557
558 if (abs_dist >= *max_vf)
559 {
560 /* Dependence distance does not create dependence, as far as
561 vectorization is concerned, in this case. */
562 if (dump_enabled_p ())
563 dump_printf_loc (MSG_NOTE, vect_location,
564 "dependence distance >= VF.\n");
565 continue;
566 }
567
568 return opt_result::failure_at (stmtinfo_a->stmt,
569 "not vectorized, possible dependence "
570 "between data-refs %T and %T\n",
571 DR_REF (dra), DR_REF (drb));
572 }
573
574 return opt_result::success ();
575 }
576
577 /* Function vect_analyze_data_ref_dependences.
578
579 Examine all the data references in the loop, and make sure there do not
580 exist any data dependences between them. Set *MAX_VF according to
581 the maximum vectorization factor the data dependences allow. */
582
583 opt_result
584 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo,
585 unsigned int *max_vf)
586 {
587 unsigned int i;
588 struct data_dependence_relation *ddr;
589
590 DUMP_VECT_SCOPE ("vect_analyze_data_ref_dependences");
591
592 if (!LOOP_VINFO_DDRS (loop_vinfo).exists ())
593 {
594 LOOP_VINFO_DDRS (loop_vinfo)
595 .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
596 * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
597 /* We do not need read-read dependences. */
598 bool res = compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
599 &LOOP_VINFO_DDRS (loop_vinfo),
600 LOOP_VINFO_LOOP_NEST (loop_vinfo),
601 false);
602 gcc_assert (res);
603 }
604
605 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
606
607 /* For epilogues we either have no aliases or alias versioning
608 was applied to original loop. Therefore we may just get max_vf
609 using VF of original loop. */
610 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
611 *max_vf = LOOP_VINFO_ORIG_MAX_VECT_FACTOR (loop_vinfo);
612 else
613 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
614 {
615 opt_result res
616 = vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf);
617 if (!res)
618 return res;
619 }
620
621 return opt_result::success ();
622 }
623
624
625 /* Function vect_slp_analyze_data_ref_dependence.
626
627 Return TRUE if there (might) exist a dependence between a memory-reference
628 DRA and a memory-reference DRB for VINFO. When versioning for alias
629 may check a dependence at run-time, return FALSE. Adjust *MAX_VF
630 according to the data dependence. */
631
632 static bool
633 vect_slp_analyze_data_ref_dependence (vec_info *vinfo,
634 struct data_dependence_relation *ddr)
635 {
636 struct data_reference *dra = DDR_A (ddr);
637 struct data_reference *drb = DDR_B (ddr);
638 dr_vec_info *dr_info_a = vinfo->lookup_dr (dra);
639 dr_vec_info *dr_info_b = vinfo->lookup_dr (drb);
640
641 /* We need to check dependences of statements marked as unvectorizable
642 as well, they still can prohibit vectorization. */
643
644 /* Independent data accesses. */
645 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
646 return false;
647
648 if (dra == drb)
649 return false;
650
651 /* Read-read is OK. */
652 if (DR_IS_READ (dra) && DR_IS_READ (drb))
653 return false;
654
655 /* If dra and drb are part of the same interleaving chain consider
656 them independent. */
657 if (STMT_VINFO_GROUPED_ACCESS (dr_info_a->stmt)
658 && (DR_GROUP_FIRST_ELEMENT (dr_info_a->stmt)
659 == DR_GROUP_FIRST_ELEMENT (dr_info_b->stmt)))
660 return false;
661
662 /* Unknown data dependence. */
663 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
664 {
665 if (dump_enabled_p ())
666 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
667 "can't determine dependence between %T and %T\n",
668 DR_REF (dra), DR_REF (drb));
669 }
670 else if (dump_enabled_p ())
671 dump_printf_loc (MSG_NOTE, vect_location,
672 "determined dependence between %T and %T\n",
673 DR_REF (dra), DR_REF (drb));
674
675 return true;
676 }
677
678
679 /* Analyze dependences involved in the transform of SLP NODE. STORES
680 contain the vector of scalar stores of this instance if we are
681 disambiguating the loads. */
682
683 static bool
684 vect_slp_analyze_node_dependences (vec_info *vinfo, slp_tree node,
685 vec<stmt_vec_info> stores,
686 stmt_vec_info last_store_info)
687 {
688 /* This walks over all stmts involved in the SLP load/store done
689 in NODE verifying we can sink them up to the last stmt in the
690 group. */
691 if (DR_IS_WRITE (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (node))))
692 {
693 stmt_vec_info last_access_info = vect_find_last_scalar_stmt_in_slp (node);
694 for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (node).length (); ++k)
695 {
696 stmt_vec_info access_info
697 = vect_orig_stmt (SLP_TREE_SCALAR_STMTS (node)[k]);
698 if (access_info == last_access_info)
699 continue;
700 data_reference *dr_a = STMT_VINFO_DATA_REF (access_info);
701 ao_ref ref;
702 bool ref_initialized_p = false;
703 for (gimple_stmt_iterator gsi = gsi_for_stmt (access_info->stmt);
704 gsi_stmt (gsi) != last_access_info->stmt; gsi_next (&gsi))
705 {
706 gimple *stmt = gsi_stmt (gsi);
707 if (! gimple_vuse (stmt))
708 continue;
709
710 /* If we couldn't record a (single) data reference for this
711 stmt we have to resort to the alias oracle. */
712 stmt_vec_info stmt_info = vinfo->lookup_stmt (stmt);
713 data_reference *dr_b = STMT_VINFO_DATA_REF (stmt_info);
714 if (!dr_b)
715 {
716 /* We are moving a store - this means
717 we cannot use TBAA for disambiguation. */
718 if (!ref_initialized_p)
719 ao_ref_init (&ref, DR_REF (dr_a));
720 if (stmt_may_clobber_ref_p_1 (stmt, &ref, false)
721 || ref_maybe_used_by_stmt_p (stmt, &ref, false))
722 return false;
723 continue;
724 }
725
726 bool dependent = false;
727 /* If we run into a store of this same instance (we've just
728 marked those) then delay dependence checking until we run
729 into the last store because this is where it will have
730 been sunk to (and we verify if we can do that as well). */
731 if (gimple_visited_p (stmt))
732 {
733 if (stmt_info != last_store_info)
734 continue;
735
736 for (stmt_vec_info &store_info : stores)
737 {
738 data_reference *store_dr
739 = STMT_VINFO_DATA_REF (store_info);
740 ddr_p ddr = initialize_data_dependence_relation
741 (dr_a, store_dr, vNULL);
742 dependent
743 = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
744 free_dependence_relation (ddr);
745 if (dependent)
746 break;
747 }
748 }
749 else
750 {
751 ddr_p ddr = initialize_data_dependence_relation (dr_a,
752 dr_b, vNULL);
753 dependent = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
754 free_dependence_relation (ddr);
755 }
756 if (dependent)
757 return false;
758 }
759 }
760 }
761 else /* DR_IS_READ */
762 {
763 stmt_vec_info first_access_info
764 = vect_find_first_scalar_stmt_in_slp (node);
765 for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (node).length (); ++k)
766 {
767 stmt_vec_info access_info
768 = vect_orig_stmt (SLP_TREE_SCALAR_STMTS (node)[k]);
769 if (access_info == first_access_info)
770 continue;
771 data_reference *dr_a = STMT_VINFO_DATA_REF (access_info);
772 ao_ref ref;
773 bool ref_initialized_p = false;
774 for (gimple_stmt_iterator gsi = gsi_for_stmt (access_info->stmt);
775 gsi_stmt (gsi) != first_access_info->stmt; gsi_prev (&gsi))
776 {
777 gimple *stmt = gsi_stmt (gsi);
778 if (! gimple_vdef (stmt))
779 continue;
780
781 /* If we couldn't record a (single) data reference for this
782 stmt we have to resort to the alias oracle. */
783 stmt_vec_info stmt_info = vinfo->lookup_stmt (stmt);
784 data_reference *dr_b = STMT_VINFO_DATA_REF (stmt_info);
785
786 /* We are hoisting a load - this means we can use
787 TBAA for disambiguation. */
788 if (!ref_initialized_p)
789 ao_ref_init (&ref, DR_REF (dr_a));
790 if (stmt_may_clobber_ref_p_1 (stmt, &ref, true))
791 {
792 if (!dr_b)
793 return false;
794 /* Resort to dependence checking below. */
795 }
796 else
797 /* No dependence. */
798 continue;
799
800 bool dependent = false;
801 /* If we run into a store of this same instance (we've just
802 marked those) then delay dependence checking until we run
803 into the last store because this is where it will have
804 been sunk to (and we verify if we can do that as well). */
805 if (gimple_visited_p (stmt))
806 {
807 if (stmt_info != last_store_info)
808 continue;
809
810 for (stmt_vec_info &store_info : stores)
811 {
812 data_reference *store_dr
813 = STMT_VINFO_DATA_REF (store_info);
814 ddr_p ddr = initialize_data_dependence_relation
815 (dr_a, store_dr, vNULL);
816 dependent
817 = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
818 free_dependence_relation (ddr);
819 if (dependent)
820 break;
821 }
822 }
823 else
824 {
825 ddr_p ddr = initialize_data_dependence_relation (dr_a,
826 dr_b, vNULL);
827 dependent = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
828 free_dependence_relation (ddr);
829 }
830 if (dependent)
831 return false;
832 }
833 }
834 }
835 return true;
836 }
837
838
839 /* Function vect_analyze_data_ref_dependences.
840
841 Examine all the data references in the basic-block, and make sure there
842 do not exist any data dependences between them. Set *MAX_VF according to
843 the maximum vectorization factor the data dependences allow. */
844
845 bool
846 vect_slp_analyze_instance_dependence (vec_info *vinfo, slp_instance instance)
847 {
848 DUMP_VECT_SCOPE ("vect_slp_analyze_instance_dependence");
849
850 /* The stores of this instance are at the root of the SLP tree. */
851 slp_tree store = NULL;
852 if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_store)
853 store = SLP_INSTANCE_TREE (instance);
854
855 /* Verify we can sink stores to the vectorized stmt insert location. */
856 stmt_vec_info last_store_info = NULL;
857 if (store)
858 {
859 if (! vect_slp_analyze_node_dependences (vinfo, store, vNULL, NULL))
860 return false;
861
862 /* Mark stores in this instance and remember the last one. */
863 last_store_info = vect_find_last_scalar_stmt_in_slp (store);
864 for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (store).length (); ++k)
865 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, true);
866 }
867
868 bool res = true;
869
870 /* Verify we can sink loads to the vectorized stmt insert location,
871 special-casing stores of this instance. */
872 for (slp_tree &load : SLP_INSTANCE_LOADS (instance))
873 if (! vect_slp_analyze_node_dependences (vinfo, load,
874 store
875 ? SLP_TREE_SCALAR_STMTS (store)
876 : vNULL, last_store_info))
877 {
878 res = false;
879 break;
880 }
881
882 /* Unset the visited flag. */
883 if (store)
884 for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (store).length (); ++k)
885 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, false);
886
887 return res;
888 }
889
890 /* Return the misalignment of DR_INFO accessed in VECTYPE with OFFSET
891 applied. */
892
893 int
894 dr_misalignment (dr_vec_info *dr_info, tree vectype, poly_int64 offset)
895 {
896 HOST_WIDE_INT diff = 0;
897 /* Alignment is only analyzed for the first element of a DR group,
898 use that but adjust misalignment by the offset of the access. */
899 if (STMT_VINFO_GROUPED_ACCESS (dr_info->stmt))
900 {
901 dr_vec_info *first_dr
902 = STMT_VINFO_DR_INFO (DR_GROUP_FIRST_ELEMENT (dr_info->stmt));
903 /* vect_analyze_data_ref_accesses guarantees that DR_INIT are
904 INTEGER_CSTs and the first element in the group has the lowest
905 address. */
906 diff = (TREE_INT_CST_LOW (DR_INIT (dr_info->dr))
907 - TREE_INT_CST_LOW (DR_INIT (first_dr->dr)));
908 gcc_assert (diff >= 0);
909 dr_info = first_dr;
910 }
911
912 int misalign = dr_info->misalignment;
913 gcc_assert (misalign != DR_MISALIGNMENT_UNINITIALIZED);
914 if (misalign == DR_MISALIGNMENT_UNKNOWN)
915 return misalign;
916
917 /* If the access is only aligned for a vector type with smaller alignment
918 requirement the access has unknown misalignment. */
919 if (maybe_lt (dr_info->target_alignment * BITS_PER_UNIT,
920 targetm.vectorize.preferred_vector_alignment (vectype)))
921 return DR_MISALIGNMENT_UNKNOWN;
922
923 /* Apply the offset from the DR group start and the externally supplied
924 offset which can for example result from a negative stride access. */
925 poly_int64 misalignment = misalign + diff + offset;
926
927 /* vect_compute_data_ref_alignment will have ensured that target_alignment
928 is constant and otherwise set misalign to DR_MISALIGNMENT_UNKNOWN. */
929 unsigned HOST_WIDE_INT target_alignment_c
930 = dr_info->target_alignment.to_constant ();
931 if (!known_misalignment (misalignment, target_alignment_c, &misalign))
932 return DR_MISALIGNMENT_UNKNOWN;
933 return misalign;
934 }
935
936 /* Record the base alignment guarantee given by DRB, which occurs
937 in STMT_INFO. */
938
939 static void
940 vect_record_base_alignment (vec_info *vinfo, stmt_vec_info stmt_info,
941 innermost_loop_behavior *drb)
942 {
943 bool existed;
944 std::pair<stmt_vec_info, innermost_loop_behavior *> &entry
945 = vinfo->base_alignments.get_or_insert (drb->base_address, &existed);
946 if (!existed || entry.second->base_alignment < drb->base_alignment)
947 {
948 entry = std::make_pair (stmt_info, drb);
949 if (dump_enabled_p ())
950 dump_printf_loc (MSG_NOTE, vect_location,
951 "recording new base alignment for %T\n"
952 " alignment: %d\n"
953 " misalignment: %d\n"
954 " based on: %G",
955 drb->base_address,
956 drb->base_alignment,
957 drb->base_misalignment,
958 stmt_info->stmt);
959 }
960 }
961
962 /* If the region we're going to vectorize is reached, all unconditional
963 data references occur at least once. We can therefore pool the base
964 alignment guarantees from each unconditional reference. Do this by
965 going through all the data references in VINFO and checking whether
966 the containing statement makes the reference unconditionally. If so,
967 record the alignment of the base address in VINFO so that it can be
968 used for all other references with the same base. */
969
970 void
971 vect_record_base_alignments (vec_info *vinfo)
972 {
973 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
974 class loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
975 for (data_reference *dr : vinfo->shared->datarefs)
976 {
977 dr_vec_info *dr_info = vinfo->lookup_dr (dr);
978 stmt_vec_info stmt_info = dr_info->stmt;
979 if (!DR_IS_CONDITIONAL_IN_STMT (dr)
980 && STMT_VINFO_VECTORIZABLE (stmt_info)
981 && !STMT_VINFO_GATHER_SCATTER_P (stmt_info))
982 {
983 vect_record_base_alignment (vinfo, stmt_info, &DR_INNERMOST (dr));
984
985 /* If DR is nested in the loop that is being vectorized, we can also
986 record the alignment of the base wrt the outer loop. */
987 if (loop && nested_in_vect_loop_p (loop, stmt_info))
988 vect_record_base_alignment
989 (vinfo, stmt_info, &STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info));
990 }
991 }
992 }
993
994 /* Function vect_compute_data_ref_alignment
995
996 Compute the misalignment of the data reference DR_INFO when vectorizing
997 with VECTYPE.
998
999 Output:
1000 1. initialized misalignment info for DR_INFO
1001
1002 FOR NOW: No analysis is actually performed. Misalignment is calculated
1003 only for trivial cases. TODO. */
1004
1005 static void
1006 vect_compute_data_ref_alignment (vec_info *vinfo, dr_vec_info *dr_info,
1007 tree vectype)
1008 {
1009 stmt_vec_info stmt_info = dr_info->stmt;
1010 vec_base_alignments *base_alignments = &vinfo->base_alignments;
1011 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
1012 class loop *loop = NULL;
1013 tree ref = DR_REF (dr_info->dr);
1014
1015 if (dump_enabled_p ())
1016 dump_printf_loc (MSG_NOTE, vect_location,
1017 "vect_compute_data_ref_alignment:\n");
1018
1019 if (loop_vinfo)
1020 loop = LOOP_VINFO_LOOP (loop_vinfo);
1021
1022 /* Initialize misalignment to unknown. */
1023 SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN);
1024
1025 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
1026 return;
1027
1028 innermost_loop_behavior *drb = vect_dr_behavior (vinfo, dr_info);
1029 bool step_preserves_misalignment_p;
1030
1031 poly_uint64 vector_alignment
1032 = exact_div (targetm.vectorize.preferred_vector_alignment (vectype),
1033 BITS_PER_UNIT);
1034 SET_DR_TARGET_ALIGNMENT (dr_info, vector_alignment);
1035
1036 /* If the main loop has peeled for alignment we have no way of knowing
1037 whether the data accesses in the epilogues are aligned. We can't at
1038 compile time answer the question whether we have entered the main loop or
1039 not. Fixes PR 92351. */
1040 if (loop_vinfo)
1041 {
1042 loop_vec_info orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
1043 if (orig_loop_vinfo
1044 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (orig_loop_vinfo) != 0)
1045 return;
1046 }
1047
1048 unsigned HOST_WIDE_INT vect_align_c;
1049 if (!vector_alignment.is_constant (&vect_align_c))
1050 return;
1051
1052 /* No step for BB vectorization. */
1053 if (!loop)
1054 {
1055 gcc_assert (integer_zerop (drb->step));
1056 step_preserves_misalignment_p = true;
1057 }
1058
1059 /* In case the dataref is in an inner-loop of the loop that is being
1060 vectorized (LOOP), we use the base and misalignment information
1061 relative to the outer-loop (LOOP). This is ok only if the misalignment
1062 stays the same throughout the execution of the inner-loop, which is why
1063 we have to check that the stride of the dataref in the inner-loop evenly
1064 divides by the vector alignment. */
1065 else if (nested_in_vect_loop_p (loop, stmt_info))
1066 {
1067 step_preserves_misalignment_p
1068 = (DR_STEP_ALIGNMENT (dr_info->dr) % vect_align_c) == 0;
1069
1070 if (dump_enabled_p ())
1071 {
1072 if (step_preserves_misalignment_p)
1073 dump_printf_loc (MSG_NOTE, vect_location,
1074 "inner step divides the vector alignment.\n");
1075 else
1076 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1077 "inner step doesn't divide the vector"
1078 " alignment.\n");
1079 }
1080 }
1081
1082 /* Similarly we can only use base and misalignment information relative to
1083 an innermost loop if the misalignment stays the same throughout the
1084 execution of the loop. As above, this is the case if the stride of
1085 the dataref evenly divides by the alignment. */
1086 else
1087 {
1088 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1089 step_preserves_misalignment_p
1090 = multiple_p (DR_STEP_ALIGNMENT (dr_info->dr) * vf, vect_align_c);
1091
1092 if (!step_preserves_misalignment_p && dump_enabled_p ())
1093 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1094 "step doesn't divide the vector alignment.\n");
1095 }
1096
1097 unsigned int base_alignment = drb->base_alignment;
1098 unsigned int base_misalignment = drb->base_misalignment;
1099
1100 /* Calculate the maximum of the pooled base address alignment and the
1101 alignment that we can compute for DR itself. */
1102 std::pair<stmt_vec_info, innermost_loop_behavior *> *entry
1103 = base_alignments->get (drb->base_address);
1104 if (entry
1105 && base_alignment < (*entry).second->base_alignment
1106 && (loop_vinfo
1107 || (dominated_by_p (CDI_DOMINATORS, gimple_bb (stmt_info->stmt),
1108 gimple_bb (entry->first->stmt))
1109 && (gimple_bb (stmt_info->stmt) != gimple_bb (entry->first->stmt)
1110 || (entry->first->dr_aux.group <= dr_info->group)))))
1111 {
1112 base_alignment = entry->second->base_alignment;
1113 base_misalignment = entry->second->base_misalignment;
1114 }
1115
1116 if (drb->offset_alignment < vect_align_c
1117 || !step_preserves_misalignment_p
1118 /* We need to know whether the step wrt the vectorized loop is
1119 negative when computing the starting misalignment below. */
1120 || TREE_CODE (drb->step) != INTEGER_CST)
1121 {
1122 if (dump_enabled_p ())
1123 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1124 "Unknown alignment for access: %T\n", ref);
1125 return;
1126 }
1127
1128 if (base_alignment < vect_align_c)
1129 {
1130 unsigned int max_alignment;
1131 tree base = get_base_for_alignment (drb->base_address, &max_alignment);
1132 if (max_alignment < vect_align_c
1133 || !vect_can_force_dr_alignment_p (base,
1134 vect_align_c * BITS_PER_UNIT))
1135 {
1136 if (dump_enabled_p ())
1137 dump_printf_loc (MSG_NOTE, vect_location,
1138 "can't force alignment of ref: %T\n", ref);
1139 return;
1140 }
1141
1142 /* Force the alignment of the decl.
1143 NOTE: This is the only change to the code we make during
1144 the analysis phase, before deciding to vectorize the loop. */
1145 if (dump_enabled_p ())
1146 dump_printf_loc (MSG_NOTE, vect_location,
1147 "force alignment of %T\n", ref);
1148
1149 dr_info->base_decl = base;
1150 dr_info->base_misaligned = true;
1151 base_misalignment = 0;
1152 }
1153 poly_int64 misalignment
1154 = base_misalignment + wi::to_poly_offset (drb->init).force_shwi ();
1155
1156 unsigned int const_misalignment;
1157 if (!known_misalignment (misalignment, vect_align_c, &const_misalignment))
1158 {
1159 if (dump_enabled_p ())
1160 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1161 "Non-constant misalignment for access: %T\n", ref);
1162 return;
1163 }
1164
1165 SET_DR_MISALIGNMENT (dr_info, const_misalignment);
1166
1167 if (dump_enabled_p ())
1168 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1169 "misalign = %d bytes of ref %T\n",
1170 const_misalignment, ref);
1171
1172 return;
1173 }
1174
1175 /* Return whether DR_INFO, which is related to DR_PEEL_INFO in
1176 that it only differs in DR_INIT, is aligned if DR_PEEL_INFO
1177 is made aligned via peeling. */
1178
1179 static bool
1180 vect_dr_aligned_if_related_peeled_dr_is (dr_vec_info *dr_info,
1181 dr_vec_info *dr_peel_info)
1182 {
1183 if (multiple_p (DR_TARGET_ALIGNMENT (dr_peel_info),
1184 DR_TARGET_ALIGNMENT (dr_info)))
1185 {
1186 poly_offset_int diff
1187 = (wi::to_poly_offset (DR_INIT (dr_peel_info->dr))
1188 - wi::to_poly_offset (DR_INIT (dr_info->dr)));
1189 if (known_eq (diff, 0)
1190 || multiple_p (diff, DR_TARGET_ALIGNMENT (dr_info)))
1191 return true;
1192 }
1193 return false;
1194 }
1195
1196 /* Return whether DR_INFO is aligned if DR_PEEL_INFO is made
1197 aligned via peeling. */
1198
1199 static bool
1200 vect_dr_aligned_if_peeled_dr_is (dr_vec_info *dr_info,
1201 dr_vec_info *dr_peel_info)
1202 {
1203 if (!operand_equal_p (DR_BASE_ADDRESS (dr_info->dr),
1204 DR_BASE_ADDRESS (dr_peel_info->dr), 0)
1205 || !operand_equal_p (DR_OFFSET (dr_info->dr),
1206 DR_OFFSET (dr_peel_info->dr), 0)
1207 || !operand_equal_p (DR_STEP (dr_info->dr),
1208 DR_STEP (dr_peel_info->dr), 0))
1209 return false;
1210
1211 return vect_dr_aligned_if_related_peeled_dr_is (dr_info, dr_peel_info);
1212 }
1213
1214 /* Compute the value for dr_info->misalign so that the access appears
1215 aligned. This is used by peeling to compensate for dr_misalignment
1216 applying the offset for negative step. */
1217
1218 int
1219 vect_dr_misalign_for_aligned_access (dr_vec_info *dr_info)
1220 {
1221 if (tree_int_cst_sgn (DR_STEP (dr_info->dr)) >= 0)
1222 return 0;
1223
1224 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
1225 poly_int64 misalignment
1226 = ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
1227 * TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))));
1228
1229 unsigned HOST_WIDE_INT target_alignment_c;
1230 int misalign;
1231 if (!dr_info->target_alignment.is_constant (&target_alignment_c)
1232 || !known_misalignment (misalignment, target_alignment_c, &misalign))
1233 return DR_MISALIGNMENT_UNKNOWN;
1234 return misalign;
1235 }
1236
1237 /* Function vect_update_misalignment_for_peel.
1238 Sets DR_INFO's misalignment
1239 - to 0 if it has the same alignment as DR_PEEL_INFO,
1240 - to the misalignment computed using NPEEL if DR_INFO's salignment is known,
1241 - to -1 (unknown) otherwise.
1242
1243 DR_INFO - the data reference whose misalignment is to be adjusted.
1244 DR_PEEL_INFO - the data reference whose misalignment is being made
1245 zero in the vector loop by the peel.
1246 NPEEL - the number of iterations in the peel loop if the misalignment
1247 of DR_PEEL_INFO is known at compile time. */
1248
1249 static void
1250 vect_update_misalignment_for_peel (dr_vec_info *dr_info,
1251 dr_vec_info *dr_peel_info, int npeel)
1252 {
1253 /* If dr_info is aligned of dr_peel_info is, then mark it so. */
1254 if (vect_dr_aligned_if_peeled_dr_is (dr_info, dr_peel_info))
1255 {
1256 SET_DR_MISALIGNMENT (dr_info,
1257 vect_dr_misalign_for_aligned_access (dr_peel_info));
1258 return;
1259 }
1260
1261 unsigned HOST_WIDE_INT alignment;
1262 if (DR_TARGET_ALIGNMENT (dr_info).is_constant (&alignment)
1263 && known_alignment_for_access_p (dr_info,
1264 STMT_VINFO_VECTYPE (dr_info->stmt))
1265 && known_alignment_for_access_p (dr_peel_info,
1266 STMT_VINFO_VECTYPE (dr_peel_info->stmt)))
1267 {
1268 int misal = dr_info->misalignment;
1269 misal += npeel * TREE_INT_CST_LOW (DR_STEP (dr_info->dr));
1270 misal &= alignment - 1;
1271 set_dr_misalignment (dr_info, misal);
1272 return;
1273 }
1274
1275 if (dump_enabled_p ())
1276 dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment " \
1277 "to unknown (-1).\n");
1278 SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN);
1279 }
1280
1281 /* Return true if alignment is relevant for DR_INFO. */
1282
1283 static bool
1284 vect_relevant_for_alignment_p (dr_vec_info *dr_info)
1285 {
1286 stmt_vec_info stmt_info = dr_info->stmt;
1287
1288 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1289 return false;
1290
1291 /* For interleaving, only the alignment of the first access matters. */
1292 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1293 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1294 return false;
1295
1296 /* Scatter-gather and invariant accesses continue to address individual
1297 scalars, so vector-level alignment is irrelevant. */
1298 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
1299 || integer_zerop (DR_STEP (dr_info->dr)))
1300 return false;
1301
1302 /* Strided accesses perform only component accesses, alignment is
1303 irrelevant for them. */
1304 if (STMT_VINFO_STRIDED_P (stmt_info)
1305 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1306 return false;
1307
1308 return true;
1309 }
1310
1311 /* Given an memory reference EXP return whether its alignment is less
1312 than its size. */
1313
1314 static bool
1315 not_size_aligned (tree exp)
1316 {
1317 if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
1318 return true;
1319
1320 return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
1321 > get_object_alignment (exp));
1322 }
1323
1324 /* Function vector_alignment_reachable_p
1325
1326 Return true if vector alignment for DR_INFO is reachable by peeling
1327 a few loop iterations. Return false otherwise. */
1328
1329 static bool
1330 vector_alignment_reachable_p (dr_vec_info *dr_info)
1331 {
1332 stmt_vec_info stmt_info = dr_info->stmt;
1333 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1334
1335 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1336 {
1337 /* For interleaved access we peel only if number of iterations in
1338 the prolog loop ({VF - misalignment}), is a multiple of the
1339 number of the interleaved accesses. */
1340 int elem_size, mis_in_elements;
1341
1342 /* FORNOW: handle only known alignment. */
1343 if (!known_alignment_for_access_p (dr_info, vectype))
1344 return false;
1345
1346 poly_uint64 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1347 poly_uint64 vector_size = GET_MODE_SIZE (TYPE_MODE (vectype));
1348 elem_size = vector_element_size (vector_size, nelements);
1349 mis_in_elements = dr_misalignment (dr_info, vectype) / elem_size;
1350
1351 if (!multiple_p (nelements - mis_in_elements, DR_GROUP_SIZE (stmt_info)))
1352 return false;
1353 }
1354
1355 /* If misalignment is known at the compile time then allow peeling
1356 only if natural alignment is reachable through peeling. */
1357 if (known_alignment_for_access_p (dr_info, vectype)
1358 && !aligned_access_p (dr_info, vectype))
1359 {
1360 HOST_WIDE_INT elmsize =
1361 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1362 if (dump_enabled_p ())
1363 {
1364 dump_printf_loc (MSG_NOTE, vect_location,
1365 "data size = %wd. misalignment = %d.\n", elmsize,
1366 dr_misalignment (dr_info, vectype));
1367 }
1368 if (dr_misalignment (dr_info, vectype) % elmsize)
1369 {
1370 if (dump_enabled_p ())
1371 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1372 "data size does not divide the misalignment.\n");
1373 return false;
1374 }
1375 }
1376
1377 if (!known_alignment_for_access_p (dr_info, vectype))
1378 {
1379 tree type = TREE_TYPE (DR_REF (dr_info->dr));
1380 bool is_packed = not_size_aligned (DR_REF (dr_info->dr));
1381 if (dump_enabled_p ())
1382 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1383 "Unknown misalignment, %snaturally aligned\n",
1384 is_packed ? "not " : "");
1385 return targetm.vectorize.vector_alignment_reachable (type, is_packed);
1386 }
1387
1388 return true;
1389 }
1390
1391
1392 /* Calculate the cost of the memory access represented by DR_INFO. */
1393
1394 static void
1395 vect_get_data_access_cost (vec_info *vinfo, dr_vec_info *dr_info,
1396 dr_alignment_support alignment_support_scheme,
1397 int misalignment,
1398 unsigned int *inside_cost,
1399 unsigned int *outside_cost,
1400 stmt_vector_for_cost *body_cost_vec,
1401 stmt_vector_for_cost *prologue_cost_vec)
1402 {
1403 stmt_vec_info stmt_info = dr_info->stmt;
1404 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
1405 int ncopies;
1406
1407 if (PURE_SLP_STMT (stmt_info))
1408 ncopies = 1;
1409 else
1410 ncopies = vect_get_num_copies (loop_vinfo, STMT_VINFO_VECTYPE (stmt_info));
1411
1412 if (DR_IS_READ (dr_info->dr))
1413 vect_get_load_cost (vinfo, stmt_info, ncopies, alignment_support_scheme,
1414 misalignment, true, inside_cost,
1415 outside_cost, prologue_cost_vec, body_cost_vec, false);
1416 else
1417 vect_get_store_cost (vinfo,stmt_info, ncopies, alignment_support_scheme,
1418 misalignment, inside_cost, body_cost_vec);
1419
1420 if (dump_enabled_p ())
1421 dump_printf_loc (MSG_NOTE, vect_location,
1422 "vect_get_data_access_cost: inside_cost = %d, "
1423 "outside_cost = %d.\n", *inside_cost, *outside_cost);
1424 }
1425
1426
1427 typedef struct _vect_peel_info
1428 {
1429 dr_vec_info *dr_info;
1430 int npeel;
1431 unsigned int count;
1432 } *vect_peel_info;
1433
1434 typedef struct _vect_peel_extended_info
1435 {
1436 vec_info *vinfo;
1437 struct _vect_peel_info peel_info;
1438 unsigned int inside_cost;
1439 unsigned int outside_cost;
1440 } *vect_peel_extended_info;
1441
1442
1443 /* Peeling hashtable helpers. */
1444
1445 struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
1446 {
1447 static inline hashval_t hash (const _vect_peel_info *);
1448 static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
1449 };
1450
1451 inline hashval_t
1452 peel_info_hasher::hash (const _vect_peel_info *peel_info)
1453 {
1454 return (hashval_t) peel_info->npeel;
1455 }
1456
1457 inline bool
1458 peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
1459 {
1460 return (a->npeel == b->npeel);
1461 }
1462
1463
1464 /* Insert DR_INFO into peeling hash table with NPEEL as key. */
1465
1466 static void
1467 vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
1468 loop_vec_info loop_vinfo, dr_vec_info *dr_info,
1469 int npeel, bool supportable_if_not_aligned)
1470 {
1471 struct _vect_peel_info elem, *slot;
1472 _vect_peel_info **new_slot;
1473
1474 elem.npeel = npeel;
1475 slot = peeling_htab->find (&elem);
1476 if (slot)
1477 slot->count++;
1478 else
1479 {
1480 slot = XNEW (struct _vect_peel_info);
1481 slot->npeel = npeel;
1482 slot->dr_info = dr_info;
1483 slot->count = 1;
1484 new_slot = peeling_htab->find_slot (slot, INSERT);
1485 *new_slot = slot;
1486 }
1487
1488 /* If this DR is not supported with unknown misalignment then bias
1489 this slot when the cost model is disabled. */
1490 if (!supportable_if_not_aligned
1491 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1492 slot->count += VECT_MAX_COST;
1493 }
1494
1495
1496 /* Traverse peeling hash table to find peeling option that aligns maximum
1497 number of data accesses. */
1498
1499 int
1500 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1501 _vect_peel_extended_info *max)
1502 {
1503 vect_peel_info elem = *slot;
1504
1505 if (elem->count > max->peel_info.count
1506 || (elem->count == max->peel_info.count
1507 && max->peel_info.npeel > elem->npeel))
1508 {
1509 max->peel_info.npeel = elem->npeel;
1510 max->peel_info.count = elem->count;
1511 max->peel_info.dr_info = elem->dr_info;
1512 }
1513
1514 return 1;
1515 }
1516
1517 /* Get the costs of peeling NPEEL iterations for LOOP_VINFO, checking
1518 data access costs for all data refs. If UNKNOWN_MISALIGNMENT is true,
1519 npeel is computed at runtime but DR0_INFO's misalignment will be zero
1520 after peeling. */
1521
1522 static void
1523 vect_get_peeling_costs_all_drs (loop_vec_info loop_vinfo,
1524 dr_vec_info *dr0_info,
1525 unsigned int *inside_cost,
1526 unsigned int *outside_cost,
1527 stmt_vector_for_cost *body_cost_vec,
1528 stmt_vector_for_cost *prologue_cost_vec,
1529 unsigned int npeel)
1530 {
1531 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1532
1533 bool dr0_alignment_known_p
1534 = (dr0_info
1535 && known_alignment_for_access_p (dr0_info,
1536 STMT_VINFO_VECTYPE (dr0_info->stmt)));
1537
1538 for (data_reference *dr : datarefs)
1539 {
1540 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1541 if (!vect_relevant_for_alignment_p (dr_info))
1542 continue;
1543
1544 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
1545 dr_alignment_support alignment_support_scheme;
1546 int misalignment;
1547 unsigned HOST_WIDE_INT alignment;
1548
1549 bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr),
1550 size_zero_node) < 0;
1551 poly_int64 off = 0;
1552 if (negative)
1553 off = ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
1554 * -TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))));
1555
1556 if (npeel == 0)
1557 misalignment = dr_misalignment (dr_info, vectype, off);
1558 else if (dr_info == dr0_info
1559 || vect_dr_aligned_if_peeled_dr_is (dr_info, dr0_info))
1560 misalignment = 0;
1561 else if (!dr0_alignment_known_p
1562 || !known_alignment_for_access_p (dr_info, vectype)
1563 || !DR_TARGET_ALIGNMENT (dr_info).is_constant (&alignment))
1564 misalignment = DR_MISALIGNMENT_UNKNOWN;
1565 else
1566 {
1567 misalignment = dr_misalignment (dr_info, vectype, off);
1568 misalignment += npeel * TREE_INT_CST_LOW (DR_STEP (dr_info->dr));
1569 misalignment &= alignment - 1;
1570 }
1571 alignment_support_scheme
1572 = vect_supportable_dr_alignment (loop_vinfo, dr_info, vectype,
1573 misalignment);
1574
1575 vect_get_data_access_cost (loop_vinfo, dr_info,
1576 alignment_support_scheme, misalignment,
1577 inside_cost, outside_cost,
1578 body_cost_vec, prologue_cost_vec);
1579 }
1580 }
1581
1582 /* Traverse peeling hash table and calculate cost for each peeling option.
1583 Find the one with the lowest cost. */
1584
1585 int
1586 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1587 _vect_peel_extended_info *min)
1588 {
1589 vect_peel_info elem = *slot;
1590 int dummy;
1591 unsigned int inside_cost = 0, outside_cost = 0;
1592 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (min->vinfo);
1593 stmt_vector_for_cost prologue_cost_vec, body_cost_vec,
1594 epilogue_cost_vec;
1595
1596 prologue_cost_vec.create (2);
1597 body_cost_vec.create (2);
1598 epilogue_cost_vec.create (2);
1599
1600 vect_get_peeling_costs_all_drs (loop_vinfo, elem->dr_info, &inside_cost,
1601 &outside_cost, &body_cost_vec,
1602 &prologue_cost_vec, elem->npeel);
1603
1604 body_cost_vec.release ();
1605
1606 outside_cost += vect_get_known_peeling_cost
1607 (loop_vinfo, elem->npeel, &dummy,
1608 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1609 &prologue_cost_vec, &epilogue_cost_vec);
1610
1611 /* Prologue and epilogue costs are added to the target model later.
1612 These costs depend only on the scalar iteration cost, the
1613 number of peeling iterations finally chosen, and the number of
1614 misaligned statements. So discard the information found here. */
1615 prologue_cost_vec.release ();
1616 epilogue_cost_vec.release ();
1617
1618 if (inside_cost < min->inside_cost
1619 || (inside_cost == min->inside_cost
1620 && outside_cost < min->outside_cost))
1621 {
1622 min->inside_cost = inside_cost;
1623 min->outside_cost = outside_cost;
1624 min->peel_info.dr_info = elem->dr_info;
1625 min->peel_info.npeel = elem->npeel;
1626 min->peel_info.count = elem->count;
1627 }
1628
1629 return 1;
1630 }
1631
1632
1633 /* Choose best peeling option by traversing peeling hash table and either
1634 choosing an option with the lowest cost (if cost model is enabled) or the
1635 option that aligns as many accesses as possible. */
1636
1637 static struct _vect_peel_extended_info
1638 vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
1639 loop_vec_info loop_vinfo)
1640 {
1641 struct _vect_peel_extended_info res;
1642
1643 res.peel_info.dr_info = NULL;
1644 res.vinfo = loop_vinfo;
1645
1646 if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1647 {
1648 res.inside_cost = INT_MAX;
1649 res.outside_cost = INT_MAX;
1650 peeling_htab->traverse <_vect_peel_extended_info *,
1651 vect_peeling_hash_get_lowest_cost> (&res);
1652 }
1653 else
1654 {
1655 res.peel_info.count = 0;
1656 peeling_htab->traverse <_vect_peel_extended_info *,
1657 vect_peeling_hash_get_most_frequent> (&res);
1658 res.inside_cost = 0;
1659 res.outside_cost = 0;
1660 }
1661
1662 return res;
1663 }
1664
1665 /* Return true if the new peeling NPEEL is supported. */
1666
1667 static bool
1668 vect_peeling_supportable (loop_vec_info loop_vinfo, dr_vec_info *dr0_info,
1669 unsigned npeel)
1670 {
1671 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1672 enum dr_alignment_support supportable_dr_alignment;
1673
1674 bool dr0_alignment_known_p
1675 = known_alignment_for_access_p (dr0_info,
1676 STMT_VINFO_VECTYPE (dr0_info->stmt));
1677
1678 /* Ensure that all data refs can be vectorized after the peel. */
1679 for (data_reference *dr : datarefs)
1680 {
1681 if (dr == dr0_info->dr)
1682 continue;
1683
1684 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1685 if (!vect_relevant_for_alignment_p (dr_info)
1686 || vect_dr_aligned_if_peeled_dr_is (dr_info, dr0_info))
1687 continue;
1688
1689 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
1690 int misalignment;
1691 unsigned HOST_WIDE_INT alignment;
1692 if (!dr0_alignment_known_p
1693 || !known_alignment_for_access_p (dr_info, vectype)
1694 || !DR_TARGET_ALIGNMENT (dr_info).is_constant (&alignment))
1695 misalignment = DR_MISALIGNMENT_UNKNOWN;
1696 else
1697 {
1698 misalignment = dr_misalignment (dr_info, vectype);
1699 misalignment += npeel * TREE_INT_CST_LOW (DR_STEP (dr_info->dr));
1700 misalignment &= alignment - 1;
1701 }
1702 supportable_dr_alignment
1703 = vect_supportable_dr_alignment (loop_vinfo, dr_info, vectype,
1704 misalignment);
1705 if (supportable_dr_alignment == dr_unaligned_unsupported)
1706 return false;
1707 }
1708
1709 return true;
1710 }
1711
1712 /* Compare two data-references DRA and DRB to group them into chunks
1713 with related alignment. */
1714
1715 static int
1716 dr_align_group_sort_cmp (const void *dra_, const void *drb_)
1717 {
1718 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
1719 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
1720 int cmp;
1721
1722 /* Stabilize sort. */
1723 if (dra == drb)
1724 return 0;
1725
1726 /* Ordering of DRs according to base. */
1727 cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
1728 DR_BASE_ADDRESS (drb));
1729 if (cmp != 0)
1730 return cmp;
1731
1732 /* And according to DR_OFFSET. */
1733 cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
1734 if (cmp != 0)
1735 return cmp;
1736
1737 /* And after step. */
1738 cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
1739 if (cmp != 0)
1740 return cmp;
1741
1742 /* Then sort after DR_INIT. In case of identical DRs sort after stmt UID. */
1743 cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
1744 if (cmp == 0)
1745 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
1746 return cmp;
1747 }
1748
1749 /* Function vect_enhance_data_refs_alignment
1750
1751 This pass will use loop versioning and loop peeling in order to enhance
1752 the alignment of data references in the loop.
1753
1754 FOR NOW: we assume that whatever versioning/peeling takes place, only the
1755 original loop is to be vectorized. Any other loops that are created by
1756 the transformations performed in this pass - are not supposed to be
1757 vectorized. This restriction will be relaxed.
1758
1759 This pass will require a cost model to guide it whether to apply peeling
1760 or versioning or a combination of the two. For example, the scheme that
1761 Intel uses when given a loop with several memory accesses, is as follows:
1762 choose one memory access ('p') which alignment you want to force by doing
1763 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
1764 other accesses are not necessarily aligned, or (2) use loop versioning to
1765 generate one loop in which all accesses are aligned, and another loop in
1766 which only 'p' is necessarily aligned.
1767
1768 ("Automatic Intra-Register Vectorization for the Intel Architecture",
1769 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Xinmin Tian, International
1770 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1771
1772 Devising a cost model is the most critical aspect of this work. It will
1773 guide us on which access to peel for, whether to use loop versioning, how
1774 many versions to create, etc. The cost model will probably consist of
1775 generic considerations as well as target specific considerations (on
1776 powerpc for example, misaligned stores are more painful than misaligned
1777 loads).
1778
1779 Here are the general steps involved in alignment enhancements:
1780
1781 -- original loop, before alignment analysis:
1782 for (i=0; i<N; i++){
1783 x = q[i]; # DR_MISALIGNMENT(q) = unknown
1784 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1785 }
1786
1787 -- After vect_compute_data_refs_alignment:
1788 for (i=0; i<N; i++){
1789 x = q[i]; # DR_MISALIGNMENT(q) = 3
1790 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1791 }
1792
1793 -- Possibility 1: we do loop versioning:
1794 if (p is aligned) {
1795 for (i=0; i<N; i++){ # loop 1A
1796 x = q[i]; # DR_MISALIGNMENT(q) = 3
1797 p[i] = y; # DR_MISALIGNMENT(p) = 0
1798 }
1799 }
1800 else {
1801 for (i=0; i<N; i++){ # loop 1B
1802 x = q[i]; # DR_MISALIGNMENT(q) = 3
1803 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1804 }
1805 }
1806
1807 -- Possibility 2: we do loop peeling:
1808 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1809 x = q[i];
1810 p[i] = y;
1811 }
1812 for (i = 3; i < N; i++){ # loop 2A
1813 x = q[i]; # DR_MISALIGNMENT(q) = 0
1814 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1815 }
1816
1817 -- Possibility 3: combination of loop peeling and versioning:
1818 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1819 x = q[i];
1820 p[i] = y;
1821 }
1822 if (p is aligned) {
1823 for (i = 3; i<N; i++){ # loop 3A
1824 x = q[i]; # DR_MISALIGNMENT(q) = 0
1825 p[i] = y; # DR_MISALIGNMENT(p) = 0
1826 }
1827 }
1828 else {
1829 for (i = 3; i<N; i++){ # loop 3B
1830 x = q[i]; # DR_MISALIGNMENT(q) = 0
1831 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1832 }
1833 }
1834
1835 These loops are later passed to loop_transform to be vectorized. The
1836 vectorizer will use the alignment information to guide the transformation
1837 (whether to generate regular loads/stores, or with special handling for
1838 misalignment). */
1839
1840 opt_result
1841 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1842 {
/* Overview: decide whether the loop of LOOP_VINFO should be peeled or
   versioned for alignment and update the recorded misalignment of its
   data references to match.  Every return statement in this function
   yields opt_result::success ().  */
1843 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1844 dr_vec_info *first_store = NULL;
1845 dr_vec_info *dr0_info = NULL;
1846 struct data_reference *dr;
1847 unsigned int i;
1848 bool do_peeling = false;
1849 bool do_versioning = false;
1850 unsigned int npeel = 0;
1851 bool one_misalignment_known = false;
1852 bool one_misalignment_unknown = false;
1853 bool one_dr_unsupportable = false;
1854 dr_vec_info *unsupportable_dr_info = NULL;
1855 unsigned int dr0_same_align_drs = 0, first_store_same_align_drs = 0;
1856 hash_table<peel_info_hasher> peeling_htab (1);
1857
1858 DUMP_VECT_SCOPE ("vect_enhance_data_refs_alignment");
1859
1860 /* Reset data so we can safely be called multiple times. */
1861 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1862 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
1863
1864 if (LOOP_VINFO_DATAREFS (loop_vinfo).is_empty ())
1865 return opt_result::success ();
1866
1867 /* Sort the vector of datarefs so DRs that have the same or dependent
1868 alignment are next to each other. */
1869 auto_vec<data_reference_p> datarefs
1870 = LOOP_VINFO_DATAREFS (loop_vinfo).copy ();
1871 datarefs.qsort (dr_align_group_sort_cmp);
1872
1873 /* Compute the number of DRs that become aligned when we peel
1874 a dataref so it becomes aligned. */
1875 auto_vec<unsigned> n_same_align_refs (datarefs.length ());
1876 n_same_align_refs.quick_grow_cleared (datarefs.length ());
1877 unsigned i0;
/* Skip leading datarefs that have no DR_BASE_ADDRESS; I0 then marks the
   start of the current subgroup in the sorted vector.  */
1878 for (i0 = 0; i0 < datarefs.length (); ++i0)
1879 if (DR_BASE_ADDRESS (datarefs[i0]))
1880 break;
/* I deliberately runs one past the last element so that the final
   subgroup is processed by the I == datarefs.length () test below.  */
1881 for (i = i0 + 1; i <= datarefs.length (); ++i)
1882 {
1883 if (i == datarefs.length ()
1884 || !operand_equal_p (DR_BASE_ADDRESS (datarefs[i0]),
1885 DR_BASE_ADDRESS (datarefs[i]), 0)
1886 || !operand_equal_p (DR_OFFSET (datarefs[i0]),
1887 DR_OFFSET (datarefs[i]), 0)
1888 || !operand_equal_p (DR_STEP (datarefs[i0]),
1889 DR_STEP (datarefs[i]), 0))
1890 {
1891 /* The subgroup [i0, i-1] now only differs in DR_INIT and
1892 possibly DR_TARGET_ALIGNMENT. Still the whole subgroup
1893 will get known misalignment if we align one of the refs
1894 with the largest DR_TARGET_ALIGNMENT. */
1895 for (unsigned j = i0; j < i; ++j)
1896 {
1897 dr_vec_info *dr_infoj = loop_vinfo->lookup_dr (datarefs[j]);
1898 for (unsigned k = i0; k < i; ++k)
1899 {
1900 if (k == j)
1901 continue;
1902 dr_vec_info *dr_infok = loop_vinfo->lookup_dr (datarefs[k]);
1903 if (vect_dr_aligned_if_related_peeled_dr_is (dr_infok,
1904 dr_infoj))
1905 n_same_align_refs[j]++;
1906 }
1907 }
1908 i0 = i;
1909 }
1910 }
1911
1912 /* While cost model enhancements are expected in the future, the high level
1913 view of the code at this time is as follows:
1914
1915 A) If there is a misaligned access then see if peeling to align
1916 this access can make all data references satisfy
1917 vect_supportable_dr_alignment. If so, update data structures
1918 as needed and return true.
1919
1920 B) If peeling wasn't possible and there is a data reference with an
1921 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1922 then see if loop versioning checks can be used to make all data
1923 references satisfy vect_supportable_dr_alignment. If so, update
1924 data structures as needed and return true.
1925
1926 C) If neither peeling nor versioning were successful then return false if
1927 any data reference does not satisfy vect_supportable_dr_alignment.
1928
1929 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1930
1931 Note, Possibility 3 above (which is peeling and versioning together) is not
1932 being done at this time. */
1933
1934 /* (1) Peeling to force alignment. */
1935
1936 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1937 Considerations:
1938 + How many accesses will become aligned due to the peeling
1939 - How many accesses will become unaligned due to the peeling,
1940 and the cost of misaligned accesses.
1941 - The cost of peeling (the extra runtime checks, the increase
1942 in code size). */
1943
1944 FOR_EACH_VEC_ELT (datarefs, i, dr)
1945 {
1946 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1947 if (!vect_relevant_for_alignment_p (dr_info))
1948 continue;
1949
1950 stmt_vec_info stmt_info = dr_info->stmt;
1951 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1952 do_peeling = vector_alignment_reachable_p (dr_info);
1953 if (do_peeling)
1954 {
1955 if (known_alignment_for_access_p (dr_info, vectype))
1956 {
1957 unsigned int npeel_tmp = 0;
1958 bool negative = tree_int_cst_compare (DR_STEP (dr),
1959 size_zero_node) < 0;
1960
1961 /* If known_alignment_for_access_p then we have set
1962 DR_MISALIGNMENT which is only done if we know it at compiler
1963 time, so it is safe to assume target alignment is constant.
1964 */
1965 unsigned int target_align =
1966 DR_TARGET_ALIGNMENT (dr_info).to_constant ();
1967 unsigned HOST_WIDE_INT dr_size = vect_get_scalar_dr_size (dr_info);
1968 poly_int64 off = 0;
1969 if (negative)
1970 off = (TYPE_VECTOR_SUBPARTS (vectype) - 1) * -dr_size;
1971 unsigned int mis = dr_misalignment (dr_info, vectype, off);
/* For a non-negative step the value is negated (unsigned wrap-around);
   the mask below then reduces it to the byte distance up to the next
   aligned address, assuming TARGET_ALIGN is a power of two.  */
1972 mis = negative ? mis : -mis;
1973 if (mis != 0)
1974 npeel_tmp = (mis & (target_align - 1)) / dr_size;
1975
1976 /* For multiple types, it is possible that the bigger type access
1977 will have more than one peeling option. E.g., a loop with two
1978 types: one of size (vector size / 4), and the other one of
1979 size (vector size / 8). Vectorization factor will be 8. If both
1980 accesses are misaligned by 3, the first one needs one scalar
1981 iteration to be aligned, and the second one needs 5. But the
1982 first one will be aligned also by peeling 5 scalar
1983 iterations, and in that case both accesses will be aligned.
1984 Hence, except for the immediate peeling amount, we also want
1985 to try to add full vector size, while we don't exceed
1986 vectorization factor.
1987 We do this automatically for cost model, since we calculate
1988 cost for every peeling option. */
1989 poly_uint64 nscalars = npeel_tmp;
1990 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1991 {
1992 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1993 nscalars = (STMT_SLP_TYPE (stmt_info)
1994 ? vf * DR_GROUP_SIZE (stmt_info) : vf);
1995 }
1996
1997 /* Save info about DR in the hash table. Also include peeling
1998 amounts according to the explanation above. Indicate
1999 the alignment status when the ref is not aligned.
2000 ??? Rather than using unknown alignment here we should
2001 prune all entries from the peeling hashtable which cause
2002 DRs to be not supported. */
2003 bool supportable_if_not_aligned
2004 = vect_supportable_dr_alignment
2005 (loop_vinfo, dr_info, vectype, DR_MISALIGNMENT_UNKNOWN);
2006 while (known_le (npeel_tmp, nscalars))
2007 {
2008 vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
2009 dr_info, npeel_tmp,
2010 supportable_if_not_aligned);
2011 npeel_tmp += MAX (1, target_align / dr_size);
2012 }
2013
2014 one_misalignment_known = true;
2015 }
2016 else
2017 {
2018 /* If we don't know any misalignment values, we prefer
2019 peeling for data-ref that has the maximum number of data-refs
2020 with the same alignment, unless the target prefers to align
2021 stores over load. */
2022 unsigned same_align_drs = n_same_align_refs[i];
2023 if (!dr0_info
2024 || dr0_same_align_drs < same_align_drs)
2025 {
2026 dr0_same_align_drs = same_align_drs;
2027 dr0_info = dr_info;
2028 }
2029 /* For data-refs with the same number of related
2030 accesses prefer the one where the misalign
2031 computation will be invariant in the outermost loop. */
2032 else if (dr0_same_align_drs == same_align_drs)
2033 {
2034 class loop *ivloop0, *ivloop;
2035 ivloop0 = outermost_invariant_loop_for_expr
2036 (loop, DR_BASE_ADDRESS (dr0_info->dr));
2037 ivloop = outermost_invariant_loop_for_expr
2038 (loop, DR_BASE_ADDRESS (dr));
2039 if ((ivloop && !ivloop0)
2040 || (ivloop && ivloop0
2041 && flow_loop_nested_p (ivloop, ivloop0)))
2042 dr0_info = dr_info;
2043 }
2044
2045 one_misalignment_unknown = true;
2046
2047 /* Check for data refs with unsupportable alignment that
2048 can be peeled. */
2049 enum dr_alignment_support supportable_dr_alignment
2050 = vect_supportable_dr_alignment (loop_vinfo, dr_info, vectype,
2051 DR_MISALIGNMENT_UNKNOWN);
2052 if (supportable_dr_alignment == dr_unaligned_unsupported)
2053 {
2054 one_dr_unsupportable = true;
2055 unsupportable_dr_info = dr_info;
2056 }
2057
2058 if (!first_store && DR_IS_WRITE (dr))
2059 {
2060 first_store = dr_info;
2061 first_store_same_align_drs = same_align_drs;
2062 }
2063 }
2064 }
2065 else
2066 {
2067 if (!aligned_access_p (dr_info, vectype))
2068 {
2069 if (dump_enabled_p ())
2070 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2071 "vector alignment may not be reachable\n");
/* DO_PEELING is false on this path, so the loop is left with peeling
   disabled and the remaining datarefs are not examined.  */
2072 break;
2073 }
2074 }
2075 }
2076
2077 /* Check if we can possibly peel the loop. */
2078 if (!vect_can_advance_ivs_p (loop_vinfo)
2079 || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))
2080 || loop->inner)
2081 do_peeling = false;
2082
2083 struct _vect_peel_extended_info peel_for_known_alignment;
2084 struct _vect_peel_extended_info peel_for_unknown_alignment;
2085 struct _vect_peel_extended_info best_peel;
2086
2087 peel_for_unknown_alignment.inside_cost = INT_MAX;
2088 peel_for_unknown_alignment.outside_cost = INT_MAX;
2089 peel_for_unknown_alignment.peel_info.count = 0;
2090
2091 if (do_peeling
2092 && one_misalignment_unknown)
2093 {
2094 /* Check if the target requires to prefer stores over loads, i.e., if
2095 misaligned stores are more expensive than misaligned loads (taking
2096 drs with same alignment into account). */
2097 unsigned int load_inside_cost = 0;
2098 unsigned int load_outside_cost = 0;
2099 unsigned int store_inside_cost = 0;
2100 unsigned int store_outside_cost = 0;
2101 unsigned int estimated_npeels = vect_vf_for_cost (loop_vinfo) / 2;
2102
2103 stmt_vector_for_cost dummy;
2104 dummy.create (2);
2105 vect_get_peeling_costs_all_drs (loop_vinfo, dr0_info,
2106 &load_inside_cost,
2107 &load_outside_cost,
2108 &dummy, &dummy, estimated_npeels);
2109 dummy.release ();
2110
2111 if (first_store)
2112 {
2113 dummy.create (2);
2114 vect_get_peeling_costs_all_drs (loop_vinfo, first_store,
2115 &store_inside_cost,
2116 &store_outside_cost,
2117 &dummy, &dummy,
2118 estimated_npeels);
2119 dummy.release ();
2120 }
2121 else
2122 {
2123 store_inside_cost = INT_MAX;
2124 store_outside_cost = INT_MAX;
2125 }
2126
2127 if (load_inside_cost > store_inside_cost
2128 || (load_inside_cost == store_inside_cost
2129 && load_outside_cost > store_outside_cost))
2130 {
2131 dr0_info = first_store;
2132 dr0_same_align_drs = first_store_same_align_drs;
2133 peel_for_unknown_alignment.inside_cost = store_inside_cost;
2134 peel_for_unknown_alignment.outside_cost = store_outside_cost;
2135 }
2136 else
2137 {
2138 peel_for_unknown_alignment.inside_cost = load_inside_cost;
2139 peel_for_unknown_alignment.outside_cost = load_outside_cost;
2140 }
2141
2142 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
2143 prologue_cost_vec.create (2);
2144 epilogue_cost_vec.create (2);
2145
2146 int dummy2;
2147 peel_for_unknown_alignment.outside_cost += vect_get_known_peeling_cost
2148 (loop_vinfo, estimated_npeels, &dummy2,
2149 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
2150 &prologue_cost_vec, &epilogue_cost_vec);
2151
2152 prologue_cost_vec.release ();
2153 epilogue_cost_vec.release ();
2154
2155 peel_for_unknown_alignment.peel_info.count = dr0_same_align_drs + 1;
2156 }
2157
2158 peel_for_unknown_alignment.peel_info.npeel = 0;
2159 peel_for_unknown_alignment.peel_info.dr_info = dr0_info;
2160
/* Start from the unknown-alignment candidate; it is replaced below when
   a known-alignment candidate exists whose inside cost is not higher.  */
2161 best_peel = peel_for_unknown_alignment;
2162
2163 peel_for_known_alignment.inside_cost = INT_MAX;
2164 peel_for_known_alignment.outside_cost = INT_MAX;
2165 peel_for_known_alignment.peel_info.count = 0;
2166 peel_for_known_alignment.peel_info.dr_info = NULL;
2167
2168 if (do_peeling && one_misalignment_known)
2169 {
2170 /* Peeling is possible, but there is no data access that is not supported
2171 unless aligned. So we try to choose the best possible peeling from
2172 the hash table. */
2173 peel_for_known_alignment = vect_peeling_hash_choose_best_peeling
2174 (&peeling_htab, loop_vinfo);
2175 }
2176
2177 /* Compare costs of peeling for known and unknown alignment. */
2178 if (peel_for_known_alignment.peel_info.dr_info != NULL
2179 && peel_for_unknown_alignment.inside_cost
2180 >= peel_for_known_alignment.inside_cost)
2181 {
2182 best_peel = peel_for_known_alignment;
2183
2184 /* If the best peeling for known alignment has NPEEL == 0, perform no
2185 peeling at all except if there is an unsupportable dr that we can
2186 align. */
2187 if (best_peel.peel_info.npeel == 0 && !one_dr_unsupportable)
2188 do_peeling = false;
2189 }
2190
2191 /* If there is an unsupportable data ref, prefer this over all choices so far
2192 since we'd have to discard a chosen peeling except when it accidentally
2193 aligned the unsupportable data ref. */
2194 if (one_dr_unsupportable)
2195 dr0_info = unsupportable_dr_info;
2196 else if (do_peeling)
2197 {
2198 /* Calculate the penalty for no peeling, i.e. leaving everything as-is.
2199 TODO: Use nopeel_outside_cost or get rid of it? */
2200 unsigned nopeel_inside_cost = 0;
2201 unsigned nopeel_outside_cost = 0;
2202
2203 stmt_vector_for_cost dummy;
2204 dummy.create (2);
2205 vect_get_peeling_costs_all_drs (loop_vinfo, NULL, &nopeel_inside_cost,
2206 &nopeel_outside_cost, &dummy, &dummy, 0);
2207 dummy.release ();
2208
2209 /* Add epilogue costs. As we do not peel for alignment here, no prologue
2210 costs will be recorded. */
2211 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
2212 prologue_cost_vec.create (2);
2213 epilogue_cost_vec.create (2);
2214
2215 int dummy2;
2216 nopeel_outside_cost += vect_get_known_peeling_cost
2217 (loop_vinfo, 0, &dummy2,
2218 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
2219 &prologue_cost_vec, &epilogue_cost_vec);
2220
2221 prologue_cost_vec.release ();
2222 epilogue_cost_vec.release ();
2223
2224 npeel = best_peel.peel_info.npeel;
2225 dr0_info = best_peel.peel_info.dr_info;
2226
2227 /* If no peeling is not more expensive than the best peeling we
2228 have so far, don't perform any peeling. */
2229 if (nopeel_inside_cost <= best_peel.inside_cost)
2230 do_peeling = false;
2231 }
2232
2233 if (do_peeling)
2234 {
2235 stmt_vec_info stmt_info = dr0_info->stmt;
2236 if (known_alignment_for_access_p (dr0_info,
2237 STMT_VINFO_VECTYPE (stmt_info)))
2238 {
2239 bool negative = tree_int_cst_compare (DR_STEP (dr0_info->dr),
2240 size_zero_node) < 0;
2241 if (!npeel)
2242 {
2243 /* Since it's known at compile time, compute the number of
2244 iterations in the peeled loop (the peeling factor) for use in
2245 updating DR_MISALIGNMENT values. The peeling factor is the
2246 vectorization factor minus the misalignment as an element
2247 count. */
2248 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2249 poly_int64 off = 0;
2250 if (negative)
2251 off = ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
2252 * -TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))));
2253 unsigned int mis
2254 = dr_misalignment (dr0_info, vectype, off);
2255 mis = negative ? mis : -mis;
2256 /* If known_alignment_for_access_p then we have set
2257 DR_MISALIGNMENT which is only done if we know it at compiler
2258 time, so it is safe to assume target alignment is constant.
2259 */
2260 unsigned int target_align =
2261 DR_TARGET_ALIGNMENT (dr0_info).to_constant ();
2262 npeel = ((mis & (target_align - 1))
2263 / vect_get_scalar_dr_size (dr0_info));
2264 }
2265
2266 /* For interleaved data access every iteration accesses all the
2267 members of the group, therefore we divide the number of iterations
2268 by the group size. */
2269 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2270 npeel /= DR_GROUP_SIZE (stmt_info);
2271
2272 if (dump_enabled_p ())
2273 dump_printf_loc (MSG_NOTE, vect_location,
2274 "Try peeling by %d\n", npeel);
2275 }
2276
2277 /* Ensure that all datarefs can be vectorized after the peel. */
2278 if (!vect_peeling_supportable (loop_vinfo, dr0_info, npeel))
2279 do_peeling = false;
2280
2281 /* Check if all datarefs are supportable and log. */
2282 if (do_peeling
2283 && npeel == 0
2284 && known_alignment_for_access_p (dr0_info,
2285 STMT_VINFO_VECTYPE (stmt_info)))
2286 return opt_result::success ();
2287
2288 /* Cost model #1 - honor --param vect-max-peeling-for-alignment. */
2289 if (do_peeling)
2290 {
2291 unsigned max_allowed_peel
2292 = param_vect_max_peeling_for_alignment;
2293 if (loop_cost_model (loop) <= VECT_COST_MODEL_CHEAP)
2294 max_allowed_peel = 0;
2295 if (max_allowed_peel != (unsigned)-1)
2296 {
2297 unsigned max_peel = npeel;
2298 if (max_peel == 0)
2299 {
2300 poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr0_info);
2301 unsigned HOST_WIDE_INT target_align_c;
2302 if (target_align.is_constant (&target_align_c))
2303 max_peel =
2304 target_align_c / vect_get_scalar_dr_size (dr0_info) - 1;
2305 else
2306 {
2307 do_peeling = false;
2308 if (dump_enabled_p ())
2309 dump_printf_loc (MSG_NOTE, vect_location,
2310 "Disable peeling, max peels set and vector"
2311 " alignment unknown\n");
2312 }
2313 }
2314 if (max_peel > max_allowed_peel)
2315 {
2316 do_peeling = false;
2317 if (dump_enabled_p ())
2318 dump_printf_loc (MSG_NOTE, vect_location,
2319 "Disable peeling, max peels reached: %d\n", max_peel);
2320 }
2321 }
2322 }
2323
2324 /* Cost model #2 - if peeling may result in a remaining loop not
2325 iterating enough to be vectorized then do not peel. Since this
2326 is a cost heuristic rather than a correctness decision, use the
2327 most likely runtime value for variable vectorization factors. */
2328 if (do_peeling
2329 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2330 {
2331 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2332 unsigned int max_peel = npeel == 0 ? assumed_vf - 1 : npeel;
2333 if ((unsigned HOST_WIDE_INT) LOOP_VINFO_INT_NITERS (loop_vinfo)
2334 < assumed_vf + max_peel)
2335 do_peeling = false;
2336 }
2337
2338 if (do_peeling)
2339 {
2340 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
2341 If the misalignment of DR_i is identical to that of dr0 then set
2342 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
2343 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
2344 by the peeling factor times the element size of DR_i (MOD the
2345 vectorization factor times the size). Otherwise, the
2346 misalignment of DR_i must be set to unknown. */
2347 FOR_EACH_VEC_ELT (datarefs, i, dr)
2348 if (dr != dr0_info->dr)
2349 {
2350 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2351 if (!vect_relevant_for_alignment_p (dr_info))
2352 continue;
2353
2354 vect_update_misalignment_for_peel (dr_info, dr0_info, npeel);
2355 }
2356
2357 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0_info;
/* NOTE(review): NPEEL == 0 with peeling enabled is recorded as -1 --
   presumably meaning the peel count is only known at run time; confirm
   against the users of LOOP_VINFO_PEELING_FOR_ALIGNMENT.  */
2358 if (npeel)
2359 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
2360 else
2361 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = -1;
2362 SET_DR_MISALIGNMENT (dr0_info,
2363 vect_dr_misalign_for_aligned_access (dr0_info));
2364 if (dump_enabled_p ())
2365 {
2366 dump_printf_loc (MSG_NOTE, vect_location,
2367 "Alignment of access forced using peeling.\n");
2368 dump_printf_loc (MSG_NOTE, vect_location,
2369 "Peeling for alignment will be applied.\n");
2370 }
2371
2372 /* The inside-loop cost will be accounted for in vectorizable_load
2373 and vectorizable_store correctly with adjusted alignments.
2374 Drop the body_cst_vec on the floor here. */
2375 return opt_result::success ();
2376 }
2377 }
2378
2379 /* (2) Versioning to force alignment. */
2380
2381 /* Try versioning if:
2382 1) optimize loop for speed and the cost-model is not cheap
2383 2) there is at least one unsupported misaligned data ref with an unknown
2384 misalignment, and
2385 3) all misaligned data refs with a known misalignment are supported, and
2386 4) the number of runtime alignment checks is within reason. */
2387
2388 do_versioning
2389 = (optimize_loop_nest_for_speed_p (loop)
2390 && !loop->inner /* FORNOW */
2391 && loop_cost_model (loop) > VECT_COST_MODEL_CHEAP);
2392
2393 if (do_versioning)
2394 {
2395 FOR_EACH_VEC_ELT (datarefs, i, dr)
2396 {
2397 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2398 if (!vect_relevant_for_alignment_p (dr_info))
2399 continue;
2400
2401 stmt_vec_info stmt_info = dr_info->stmt;
2402 if (STMT_VINFO_STRIDED_P (stmt_info))
2403 {
2404 do_versioning = false;
2405 break;
2406 }
2407
2408 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2409 bool negative = tree_int_cst_compare (DR_STEP (dr),
2410 size_zero_node) < 0;
2411 poly_int64 off = 0;
2412 if (negative)
2413 off = ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
2414 * -TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))));
2415 int misalignment;
2416 if ((misalignment = dr_misalignment (dr_info, vectype, off)) == 0)
2417 continue;
2418
2419 enum dr_alignment_support supportable_dr_alignment
2420 = vect_supportable_dr_alignment (loop_vinfo, dr_info, vectype,
2421 misalignment);
2422 if (supportable_dr_alignment == dr_unaligned_unsupported)
2423 {
/* Conditions 3) and 4) of the list above: give up on an unsupported
   ref whose misalignment is known, or once the number of recorded
   checks reaches the param limit.  */
2424 if (misalignment != DR_MISALIGNMENT_UNKNOWN
2425 || (LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
2426 >= (unsigned) param_vect_max_version_for_alignment_checks))
2427 {
2428 do_versioning = false;
2429 break;
2430 }
2431
2432 /* At present we don't support versioning for alignment
2433 with variable VF, since there's no guarantee that the
2434 VF is a power of two. We could relax this if we added
2435 a way of enforcing a power-of-two size. */
2436 unsigned HOST_WIDE_INT size;
2437 if (!GET_MODE_SIZE (TYPE_MODE (vectype)).is_constant (&size))
2438 {
2439 do_versioning = false;
2440 break;
2441 }
2442
2443 /* Forcing alignment in the first iteration is no good if
2444 we don't keep it across iterations. For now, just disable
2445 versioning in this case.
2446 ?? We could actually unroll the loop to achieve the required
2447 overall step alignment, and forcing the alignment could be
2448 done by doing some iterations of the non-vectorized loop. */
2449 if (!multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
2450 * DR_STEP_ALIGNMENT (dr),
2451 DR_TARGET_ALIGNMENT (dr_info)))
2452 {
2453 do_versioning = false;
2454 break;
2455 }
2456
2457 /* The rightmost bits of an aligned address must be zeros.
2458 Construct the mask needed for this test. For example,
2459 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
2460 mask must be 15 = 0xf. */
2461 int mask = size - 1;
2462
2463 /* FORNOW: use the same mask to test all potentially unaligned
2464 references in the loop. */
2465 if (LOOP_VINFO_PTR_MASK (loop_vinfo)
2466 && LOOP_VINFO_PTR_MASK (loop_vinfo) != mask)
2467 {
2468 do_versioning = false;
2469 break;
2470 }
2471
2472 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
2473 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (stmt_info);
2474 }
2475 }
2476
2477 /* Versioning requires at least one misaligned data reference. */
2478 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2479 do_versioning = false;
2480 else if (!do_versioning)
2481 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
2482 }
2483
2484 if (do_versioning)
2485 {
2486 const vec<stmt_vec_info> &may_misalign_stmts
2487 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
2488 stmt_vec_info stmt_info;
2489
2490 /* It can now be assumed that the data references in the statements
2491 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
2492 of the loop being vectorized. */
2493 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt_info)
2494 {
2495 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
2496 SET_DR_MISALIGNMENT (dr_info,
2497 vect_dr_misalign_for_aligned_access (dr_info));
2498 if (dump_enabled_p ())
2499 dump_printf_loc (MSG_NOTE, vect_location,
2500 "Alignment of access forced using versioning.\n");
2501 }
2502
2503 if (dump_enabled_p ())
2504 dump_printf_loc (MSG_NOTE, vect_location,
2505 "Versioning for alignment will be applied.\n");
2506
2507 /* Peeling and versioning can't be done together at this time. */
2508 gcc_assert (! (do_peeling && do_versioning));
2509
2510 return opt_result::success ();
2511 }
2512
2513 /* This point is reached if neither peeling nor versioning is being done. */
2514 gcc_assert (! (do_peeling || do_versioning));
2515
2516 return opt_result::success ();
2517 }
2518
2519
2520 /* Function vect_analyze_data_refs_alignment
2521
2522 Analyze the alignment of the data-references in the loop.
2523 Return FALSE if a data reference is found that cannot be vectorized. */
2524
2525 opt_result
2526 vect_analyze_data_refs_alignment (loop_vec_info loop_vinfo)
2527 {
2528 DUMP_VECT_SCOPE ("vect_analyze_data_refs_alignment");
2529
2530 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2531 struct data_reference *dr;
2532 unsigned int i;
2533
2534 vect_record_base_alignments (loop_vinfo);
2535 FOR_EACH_VEC_ELT (datarefs, i, dr)
2536 {
2537 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2538 if (STMT_VINFO_VECTORIZABLE (dr_info->stmt))
2539 {
2540 if (STMT_VINFO_GROUPED_ACCESS (dr_info->stmt)
2541 && DR_GROUP_FIRST_ELEMENT (dr_info->stmt) != dr_info->stmt)
2542 continue;
2543 vect_compute_data_ref_alignment (loop_vinfo, dr_info,
2544 STMT_VINFO_VECTYPE (dr_info->stmt));
2545 }
2546 }
2547
2548 return opt_result::success ();
2549 }
2550
2551
2552 /* Analyze alignment of DRs of stmts in NODE. */
2553
2554 static bool
2555 vect_slp_analyze_node_alignment (vec_info *vinfo, slp_tree node)
2556 {
2557 /* Alignment is maintained in the first element of the group. */
2558 stmt_vec_info first_stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2559 first_stmt_info = DR_GROUP_FIRST_ELEMENT (first_stmt_info);
2560 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (first_stmt_info);
2561 tree vectype = SLP_TREE_VECTYPE (node);
2562 poly_uint64 vector_alignment
2563 = exact_div (targetm.vectorize.preferred_vector_alignment (vectype),
2564 BITS_PER_UNIT);
2565 if (dr_info->misalignment == DR_MISALIGNMENT_UNINITIALIZED)
2566 vect_compute_data_ref_alignment (vinfo, dr_info, SLP_TREE_VECTYPE (node));
2567 /* Re-analyze alignment when we're facing a vectorization with a bigger
2568 alignment requirement. */
2569 else if (known_lt (dr_info->target_alignment, vector_alignment))
2570 {
2571 poly_uint64 old_target_alignment = dr_info->target_alignment;
2572 int old_misalignment = dr_info->misalignment;
2573 vect_compute_data_ref_alignment (vinfo, dr_info, SLP_TREE_VECTYPE (node));
2574 /* But keep knowledge about a smaller alignment. */
2575 if (old_misalignment != DR_MISALIGNMENT_UNKNOWN
2576 && dr_info->misalignment == DR_MISALIGNMENT_UNKNOWN)
2577 {
2578 dr_info->target_alignment = old_target_alignment;
2579 dr_info->misalignment = old_misalignment;
2580 }
2581 }
2582 /* When we ever face unordered target alignments the first one wins in terms
2583 of analyzing and the other will become unknown in dr_misalignment. */
2584 return true;
2585 }
2586
2587 /* Function vect_slp_analyze_instance_alignment
2588
2589 Analyze the alignment of the data-references in the SLP instance.
2590 Return FALSE if a data reference is found that cannot be vectorized. */
2591
2592 bool
2593 vect_slp_analyze_instance_alignment (vec_info *vinfo,
2594 slp_instance instance)
2595 {
2596 DUMP_VECT_SCOPE ("vect_slp_analyze_instance_alignment");
2597
2598 slp_tree node;
2599 unsigned i;
2600 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
2601 if (! vect_slp_analyze_node_alignment (vinfo, node))
2602 return false;
2603
2604 if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_store
2605 && ! vect_slp_analyze_node_alignment
2606 (vinfo, SLP_INSTANCE_TREE (instance)))
2607 return false;
2608
2609 return true;
2610 }
2611
2612
2613 /* Analyze groups of accesses: check that DR_INFO belongs to a group of
2614 accesses of legal size, step, etc. Detect gaps, single element
2615 interleaving, and other special cases. Set grouped access info.
2616 Collect groups of strided stores for further use in SLP analysis.
2617 Worker for vect_analyze_group_access. */
2618
2619 static bool
2620 vect_analyze_group_access_1 (vec_info *vinfo, dr_vec_info *dr_info)
2621 {
2622 data_reference *dr = dr_info->dr;
2623 tree step = DR_STEP (dr);
2624 tree scalar_type = TREE_TYPE (DR_REF (dr));
2625 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2626 stmt_vec_info stmt_info = dr_info->stmt;
2627 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
2628 bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo);
2629 HOST_WIDE_INT dr_step = -1;
2630 HOST_WIDE_INT groupsize, last_accessed_element = 1;
2631 bool slp_impossible = false;
2632
2633 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2634 size of the interleaving group (including gaps). */
2635 if (tree_fits_shwi_p (step))
2636 {
2637 dr_step = tree_to_shwi (step);
2638 /* Check that STEP is a multiple of type size. Otherwise there is
2639 a non-element-sized gap at the end of the group which we
2640 cannot represent in DR_GROUP_GAP or DR_GROUP_SIZE.
2641 ??? As we can handle non-constant step fine here we should
2642 simply remove uses of DR_GROUP_GAP between the last and first
2643 element and instead rely on DR_STEP. DR_GROUP_SIZE then would
2644 simply not include that gap. */
2645 if ((dr_step % type_size) != 0)
2646 {
2647 if (dump_enabled_p ())
2648 dump_printf_loc (MSG_NOTE, vect_location,
2649 "Step %T is not a multiple of the element size"
2650 " for %T\n",
2651 step, DR_REF (dr));
2652 return false;
2653 }
2654 groupsize = absu_hwi (dr_step) / type_size;
2655 }
2656 else
2657 groupsize = 0;
2658
2659 /* Not consecutive access is possible only if it is a part of interleaving. */
2660 if (!DR_GROUP_FIRST_ELEMENT (stmt_info))
2661 {
2662 /* Check if it this DR is a part of interleaving, and is a single
2663 element of the group that is accessed in the loop. */
2664
2665 /* Gaps are supported only for loads. STEP must be a multiple of the type
2666 size. */
2667 if (DR_IS_READ (dr)
2668 && (dr_step % type_size) == 0
2669 && groupsize > 0
2670 /* This could be UINT_MAX but as we are generating code in a very
2671 inefficient way we have to cap earlier.
2672 See PR91403 for example. */
2673 && groupsize <= 4096)
2674 {
2675 DR_GROUP_FIRST_ELEMENT (stmt_info) = stmt_info;
2676 DR_GROUP_SIZE (stmt_info) = groupsize;
2677 DR_GROUP_GAP (stmt_info) = groupsize - 1;
2678 if (dump_enabled_p ())
2679 dump_printf_loc (MSG_NOTE, vect_location,
2680 "Detected single element interleaving %T"
2681 " step %T\n",
2682 DR_REF (dr), step);
2683
2684 return true;
2685 }
2686
2687 if (dump_enabled_p ())
2688 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2689 "not consecutive access %G", stmt_info->stmt);
2690
2691 if (bb_vinfo)
2692 {
2693 /* Mark the statement as unvectorizable. */
2694 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
2695 return true;
2696 }
2697
2698 if (dump_enabled_p ())
2699 dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
2700 STMT_VINFO_STRIDED_P (stmt_info) = true;
2701 return true;
2702 }
2703
2704 if (DR_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info)
2705 {
2706 /* First stmt in the interleaving chain. Check the chain. */
2707 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2708 struct data_reference *data_ref = dr;
2709 unsigned int count = 1;
2710 tree prev_init = DR_INIT (data_ref);
2711 HOST_WIDE_INT diff, gaps = 0;
2712
2713 /* By construction, all group members have INTEGER_CST DR_INITs. */
2714 while (next)
2715 {
2716 /* We never have the same DR multiple times. */
2717 gcc_assert (tree_int_cst_compare (DR_INIT (data_ref),
2718 DR_INIT (STMT_VINFO_DATA_REF (next))) != 0);
2719
2720 data_ref = STMT_VINFO_DATA_REF (next);
2721
2722 /* All group members have the same STEP by construction. */
2723 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2724
2725 /* Check that the distance between two accesses is equal to the type
2726 size. Otherwise, we have gaps. */
2727 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2728 - TREE_INT_CST_LOW (prev_init)) / type_size;
2729 if (diff != 1)
2730 {
2731 /* FORNOW: SLP of accesses with gaps is not supported. */
2732 slp_impossible = true;
2733 if (DR_IS_WRITE (data_ref))
2734 {
2735 if (dump_enabled_p ())
2736 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2737 "interleaved store with gaps\n");
2738 return false;
2739 }
2740
2741 gaps += diff - 1;
2742 }
2743
2744 last_accessed_element += diff;
2745
2746 /* Store the gap from the previous member of the group. If there is no
2747 gap in the access, DR_GROUP_GAP is always 1. */
2748 DR_GROUP_GAP (next) = diff;
2749
2750 prev_init = DR_INIT (data_ref);
2751 next = DR_GROUP_NEXT_ELEMENT (next);
2752 /* Count the number of data-refs in the chain. */
2753 count++;
2754 }
2755
2756 if (groupsize == 0)
2757 groupsize = count + gaps;
2758
2759 /* This could be UINT_MAX but as we are generating code in a very
2760 inefficient way we have to cap earlier. See PR78699 for example. */
2761 if (groupsize > 4096)
2762 {
2763 if (dump_enabled_p ())
2764 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2765 "group is too large\n");
2766 return false;
2767 }
2768
2769 /* Check that the size of the interleaving is equal to count for stores,
2770 i.e., that there are no gaps. */
2771 if (groupsize != count
2772 && !DR_IS_READ (dr))
2773 {
2774 groupsize = count;
2775 STMT_VINFO_STRIDED_P (stmt_info) = true;
2776 }
2777
2778 /* If there is a gap after the last load in the group it is the
2779 difference between the groupsize and the last accessed
2780 element.
2781 When there is no gap, this difference should be 0. */
2782 DR_GROUP_GAP (stmt_info) = groupsize - last_accessed_element;
2783
2784 DR_GROUP_SIZE (stmt_info) = groupsize;
2785 if (dump_enabled_p ())
2786 {
2787 dump_printf_loc (MSG_NOTE, vect_location,
2788 "Detected interleaving ");
2789 if (DR_IS_READ (dr))
2790 dump_printf (MSG_NOTE, "load ");
2791 else if (STMT_VINFO_STRIDED_P (stmt_info))
2792 dump_printf (MSG_NOTE, "strided store ");
2793 else
2794 dump_printf (MSG_NOTE, "store ");
2795 dump_printf (MSG_NOTE, "of size %u\n",
2796 (unsigned)groupsize);
2797 dump_printf_loc (MSG_NOTE, vect_location, "\t%G", stmt_info->stmt);
2798 next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2799 while (next)
2800 {
2801 if (DR_GROUP_GAP (next) != 1)
2802 dump_printf_loc (MSG_NOTE, vect_location,
2803 "\t<gap of %d elements>\n",
2804 DR_GROUP_GAP (next) - 1);
2805 dump_printf_loc (MSG_NOTE, vect_location, "\t%G", next->stmt);
2806 next = DR_GROUP_NEXT_ELEMENT (next);
2807 }
2808 if (DR_GROUP_GAP (stmt_info) != 0)
2809 dump_printf_loc (MSG_NOTE, vect_location,
2810 "\t<gap of %d elements>\n",
2811 DR_GROUP_GAP (stmt_info));
2812 }
2813
2814 /* SLP: create an SLP data structure for every interleaving group of
2815 stores for further analysis in vect_analyse_slp. */
2816 if (DR_IS_WRITE (dr) && !slp_impossible)
2817 {
2818 if (loop_vinfo)
2819 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt_info);
2820 if (bb_vinfo)
2821 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt_info);
2822 }
2823 }
2824
2825 return true;
2826 }
2827
2828 /* Analyze groups of accesses: check that DR_INFO belongs to a group of
2829 accesses of legal size, step, etc. Detect gaps, single element
2830 interleaving, and other special cases. Set grouped access info.
2831 Collect groups of strided stores for further use in SLP analysis. */
2832
2833 static bool
2834 vect_analyze_group_access (vec_info *vinfo, dr_vec_info *dr_info)
2835 {
2836 if (!vect_analyze_group_access_1 (vinfo, dr_info))
2837 {
2838 /* Dissolve the group if present. */
2839 stmt_vec_info stmt_info = DR_GROUP_FIRST_ELEMENT (dr_info->stmt);
2840 while (stmt_info)
2841 {
2842 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2843 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2844 DR_GROUP_NEXT_ELEMENT (stmt_info) = NULL;
2845 stmt_info = next;
2846 }
2847 return false;
2848 }
2849 return true;
2850 }
2851
2852 /* Analyze the access pattern of the data-reference DR_INFO.
2853 In case of non-consecutive accesses call vect_analyze_group_access() to
2854 analyze groups of accesses. */
2855
2856 static bool
2857 vect_analyze_data_ref_access (vec_info *vinfo, dr_vec_info *dr_info)
2858 {
2859 data_reference *dr = dr_info->dr;
2860 tree step = DR_STEP (dr);
2861 tree scalar_type = TREE_TYPE (DR_REF (dr));
2862 stmt_vec_info stmt_info = dr_info->stmt;
2863 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
2864 class loop *loop = NULL;
2865
2866 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
2867 return true;
2868
2869 if (loop_vinfo)
2870 loop = LOOP_VINFO_LOOP (loop_vinfo);
2871
2872 if (loop_vinfo && !step)
2873 {
2874 if (dump_enabled_p ())
2875 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2876 "bad data-ref access in loop\n");
2877 return false;
2878 }
2879
2880 /* Allow loads with zero step in inner-loop vectorization. */
2881 if (loop_vinfo && integer_zerop (step))
2882 {
2883 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2884 if (!nested_in_vect_loop_p (loop, stmt_info))
2885 return DR_IS_READ (dr);
2886 /* Allow references with zero step for outer loops marked
2887 with pragma omp simd only - it guarantees absence of
2888 loop-carried dependencies between inner loop iterations. */
2889 if (loop->safelen < 2)
2890 {
2891 if (dump_enabled_p ())
2892 dump_printf_loc (MSG_NOTE, vect_location,
2893 "zero step in inner loop of nest\n");
2894 return false;
2895 }
2896 }
2897
2898 if (loop && nested_in_vect_loop_p (loop, stmt_info))
2899 {
2900 /* Interleaved accesses are not yet supported within outer-loop
2901 vectorization for references in the inner-loop. */
2902 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2903
2904 /* For the rest of the analysis we use the outer-loop step. */
2905 step = STMT_VINFO_DR_STEP (stmt_info);
2906 if (integer_zerop (step))
2907 {
2908 if (dump_enabled_p ())
2909 dump_printf_loc (MSG_NOTE, vect_location,
2910 "zero step in outer loop.\n");
2911 return DR_IS_READ (dr);
2912 }
2913 }
2914
2915 /* Consecutive? */
2916 if (TREE_CODE (step) == INTEGER_CST)
2917 {
2918 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2919 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2920 || (dr_step < 0
2921 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2922 {
2923 /* Mark that it is not interleaving. */
2924 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2925 return true;
2926 }
2927 }
2928
2929 if (loop && nested_in_vect_loop_p (loop, stmt_info))
2930 {
2931 if (dump_enabled_p ())
2932 dump_printf_loc (MSG_NOTE, vect_location,
2933 "grouped access in outer loop.\n");
2934 return false;
2935 }
2936
2937
2938 /* Assume this is a DR handled by non-constant strided load case. */
2939 if (TREE_CODE (step) != INTEGER_CST)
2940 return (STMT_VINFO_STRIDED_P (stmt_info)
2941 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2942 || vect_analyze_group_access (vinfo, dr_info)));
2943
2944 /* Not consecutive access - check if it's a part of interleaving group. */
2945 return vect_analyze_group_access (vinfo, dr_info);
2946 }
2947
2948 /* Compare two data-references DRA and DRB to group them into chunks
2949 suitable for grouping. */
2950
2951 static int
2952 dr_group_sort_cmp (const void *dra_, const void *drb_)
2953 {
2954 dr_vec_info *dra_info = *(dr_vec_info **)const_cast<void *>(dra_);
2955 dr_vec_info *drb_info = *(dr_vec_info **)const_cast<void *>(drb_);
2956 data_reference_p dra = dra_info->dr;
2957 data_reference_p drb = drb_info->dr;
2958 int cmp;
2959
2960 /* Stabilize sort. */
2961 if (dra == drb)
2962 return 0;
2963
2964 /* Different group IDs lead never belong to the same group. */
2965 if (dra_info->group != drb_info->group)
2966 return dra_info->group < drb_info->group ? -1 : 1;
2967
2968 /* Ordering of DRs according to base. */
2969 cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2970 DR_BASE_ADDRESS (drb));
2971 if (cmp != 0)
2972 return cmp;
2973
2974 /* And according to DR_OFFSET. */
2975 cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2976 if (cmp != 0)
2977 return cmp;
2978
2979 /* Put reads before writes. */
2980 if (DR_IS_READ (dra) != DR_IS_READ (drb))
2981 return DR_IS_READ (dra) ? -1 : 1;
2982
2983 /* Then sort after access size. */
2984 cmp = data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2985 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2986 if (cmp != 0)
2987 return cmp;
2988
2989 /* And after step. */
2990 cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
2991 if (cmp != 0)
2992 return cmp;
2993
2994 /* Then sort after DR_INIT. In case of identical DRs sort after stmt UID. */
2995 cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
2996 if (cmp == 0)
2997 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2998 return cmp;
2999 }
3000
3001 /* If OP is the result of a conversion, return the unconverted value,
3002 otherwise return null. */
3003
3004 static tree
3005 strip_conversion (tree op)
3006 {
3007 if (TREE_CODE (op) != SSA_NAME)
3008 return NULL_TREE;
3009 gimple *stmt = SSA_NAME_DEF_STMT (op);
3010 if (!is_gimple_assign (stmt)
3011 || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (stmt)))
3012 return NULL_TREE;
3013 return gimple_assign_rhs1 (stmt);
3014 }
3015
3016 /* Return true if vectorizable_* routines can handle statements STMT1_INFO
3017 and STMT2_INFO being in a single group. When ALLOW_SLP_P, masked loads can
3018 be grouped in SLP mode. */
3019
3020 static bool
3021 can_group_stmts_p (stmt_vec_info stmt1_info, stmt_vec_info stmt2_info,
3022 bool allow_slp_p)
3023 {
3024 if (gimple_assign_single_p (stmt1_info->stmt))
3025 return gimple_assign_single_p (stmt2_info->stmt);
3026
3027 gcall *call1 = dyn_cast <gcall *> (stmt1_info->stmt);
3028 if (call1 && gimple_call_internal_p (call1))
3029 {
3030 /* Check for two masked loads or two masked stores. */
3031 gcall *call2 = dyn_cast <gcall *> (stmt2_info->stmt);
3032 if (!call2 || !gimple_call_internal_p (call2))
3033 return false;
3034 internal_fn ifn = gimple_call_internal_fn (call1);
3035 if (ifn != IFN_MASK_LOAD && ifn != IFN_MASK_STORE)
3036 return false;
3037 if (ifn != gimple_call_internal_fn (call2))
3038 return false;
3039
3040 /* Check that the masks are the same. Cope with casts of masks,
3041 like those created by build_mask_conversion. */
3042 tree mask1 = gimple_call_arg (call1, 2);
3043 tree mask2 = gimple_call_arg (call2, 2);
3044 if (!operand_equal_p (mask1, mask2, 0)
3045 && (ifn == IFN_MASK_STORE || !allow_slp_p))
3046 {
3047 mask1 = strip_conversion (mask1);
3048 if (!mask1)
3049 return false;
3050 mask2 = strip_conversion (mask2);
3051 if (!mask2)
3052 return false;
3053 if (!operand_equal_p (mask1, mask2, 0))
3054 return false;
3055 }
3056 return true;
3057 }
3058
3059 return false;
3060 }
3061
3062 /* Function vect_analyze_data_ref_accesses.
3063
3064 Analyze the access pattern of all the data references in the loop.
3065
3066 FORNOW: the only access pattern that is considered vectorizable is a
3067 simple step 1 (consecutive) access.
3068
3069 FORNOW: handle only arrays and pointer accesses. */
3070
3071 opt_result
3072 vect_analyze_data_ref_accesses (vec_info *vinfo,
3073 vec<int> *dataref_groups)
3074 {
3075 unsigned int i;
3076 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
3077
3078 DUMP_VECT_SCOPE ("vect_analyze_data_ref_accesses");
3079
3080 if (datarefs.is_empty ())
3081 return opt_result::success ();
3082
3083 /* Sort the array of datarefs to make building the interleaving chains
3084 linear. Don't modify the original vector's order, it is needed for
3085 determining what dependencies are reversed. */
3086 vec<dr_vec_info *> datarefs_copy;
3087 datarefs_copy.create (datarefs.length ());
3088 for (unsigned i = 0; i < datarefs.length (); i++)
3089 {
3090 dr_vec_info *dr_info = vinfo->lookup_dr (datarefs[i]);
3091 /* If the caller computed DR grouping use that, otherwise group by
3092 basic blocks. */
3093 if (dataref_groups)
3094 dr_info->group = (*dataref_groups)[i];
3095 else
3096 dr_info->group = gimple_bb (DR_STMT (datarefs[i]))->index;
3097 datarefs_copy.quick_push (dr_info);
3098 }
3099 datarefs_copy.qsort (dr_group_sort_cmp);
3100 hash_set<stmt_vec_info> to_fixup;
3101
3102 /* Build the interleaving chains. */
3103 for (i = 0; i < datarefs_copy.length () - 1;)
3104 {
3105 dr_vec_info *dr_info_a = datarefs_copy[i];
3106 data_reference_p dra = dr_info_a->dr;
3107 int dra_group_id = dr_info_a->group;
3108 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
3109 stmt_vec_info lastinfo = NULL;
3110 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
3111 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a))
3112 {
3113 ++i;
3114 continue;
3115 }
3116 for (i = i + 1; i < datarefs_copy.length (); ++i)
3117 {
3118 dr_vec_info *dr_info_b = datarefs_copy[i];
3119 data_reference_p drb = dr_info_b->dr;
3120 int drb_group_id = dr_info_b->group;
3121 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
3122 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_b)
3123 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
3124 break;
3125
3126 /* ??? Imperfect sorting (non-compatible types, non-modulo
3127 accesses, same accesses) can lead to a group to be artificially
3128 split here as we don't just skip over those. If it really
3129 matters we can push those to a worklist and re-iterate
3130 over them. The we can just skip ahead to the next DR here. */
3131
3132 /* DRs in a different DR group should not be put into the same
3133 interleaving group. */
3134 if (dra_group_id != drb_group_id)
3135 break;
3136
3137 /* Check that the data-refs have same first location (except init)
3138 and they are both either store or load (not load and store,
3139 not masked loads or stores). */
3140 if (DR_IS_READ (dra) != DR_IS_READ (drb)
3141 || data_ref_compare_tree (DR_BASE_ADDRESS (dra),
3142 DR_BASE_ADDRESS (drb)) != 0
3143 || data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb)) != 0
3144 || !can_group_stmts_p (stmtinfo_a, stmtinfo_b, true))
3145 break;
3146
3147 /* Check that the data-refs have the same constant size. */
3148 tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
3149 tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
3150 if (!tree_fits_uhwi_p (sza)
3151 || !tree_fits_uhwi_p (szb)
3152 || !tree_int_cst_equal (sza, szb))
3153 break;
3154
3155 /* Check that the data-refs have the same step. */
3156 if (data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb)) != 0)
3157 break;
3158
3159 /* Check the types are compatible.
3160 ??? We don't distinguish this during sorting. */
3161 if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
3162 TREE_TYPE (DR_REF (drb))))
3163 break;
3164
3165 /* Check that the DR_INITs are compile-time constants. */
3166 if (TREE_CODE (DR_INIT (dra)) != INTEGER_CST
3167 || TREE_CODE (DR_INIT (drb)) != INTEGER_CST)
3168 break;
3169
3170 /* Different .GOMP_SIMD_LANE calls still give the same lane,
3171 just hold extra information. */
3172 if (STMT_VINFO_SIMD_LANE_ACCESS_P (stmtinfo_a)
3173 && STMT_VINFO_SIMD_LANE_ACCESS_P (stmtinfo_b)
3174 && data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb)) == 0)
3175 break;
3176
3177 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
3178 HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
3179 HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
3180 HOST_WIDE_INT init_prev
3181 = TREE_INT_CST_LOW (DR_INIT (datarefs_copy[i-1]->dr));
3182 gcc_assert (init_a <= init_b
3183 && init_a <= init_prev
3184 && init_prev <= init_b);
3185
3186 /* Do not place the same access in the interleaving chain twice. */
3187 if (init_b == init_prev)
3188 {
3189 gcc_assert (gimple_uid (DR_STMT (datarefs_copy[i-1]->dr))
3190 < gimple_uid (DR_STMT (drb)));
3191 /* Simply link in duplicates and fix up the chain below. */
3192 }
3193 else
3194 {
3195 /* If init_b == init_a + the size of the type * k, we have an
3196 interleaving, and DRA is accessed before DRB. */
3197 HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
3198 if (type_size_a == 0
3199 || (init_b - init_a) % type_size_a != 0)
3200 break;
3201
3202 /* If we have a store, the accesses are adjacent. This splits
3203 groups into chunks we support (we don't support vectorization
3204 of stores with gaps). */
3205 if (!DR_IS_READ (dra) && init_b - init_prev != type_size_a)
3206 break;
3207
3208 /* If the step (if not zero or non-constant) is smaller than the
3209 difference between data-refs' inits this splits groups into
3210 suitable sizes. */
3211 if (tree_fits_shwi_p (DR_STEP (dra)))
3212 {
3213 unsigned HOST_WIDE_INT step
3214 = absu_hwi (tree_to_shwi (DR_STEP (dra)));
3215 if (step != 0
3216 && step <= (unsigned HOST_WIDE_INT)(init_b - init_a))
3217 break;
3218 }
3219 }
3220
3221 if (dump_enabled_p ())
3222 dump_printf_loc (MSG_NOTE, vect_location,
3223 DR_IS_READ (dra)
3224 ? "Detected interleaving load %T and %T\n"
3225 : "Detected interleaving store %T and %T\n",
3226 DR_REF (dra), DR_REF (drb));
3227
3228 /* Link the found element into the group list. */
3229 if (!DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
3230 {
3231 DR_GROUP_FIRST_ELEMENT (stmtinfo_a) = stmtinfo_a;
3232 lastinfo = stmtinfo_a;
3233 }
3234 DR_GROUP_FIRST_ELEMENT (stmtinfo_b) = stmtinfo_a;
3235 DR_GROUP_NEXT_ELEMENT (lastinfo) = stmtinfo_b;
3236 lastinfo = stmtinfo_b;
3237
3238 STMT_VINFO_SLP_VECT_ONLY (stmtinfo_a)
3239 = !can_group_stmts_p (stmtinfo_a, stmtinfo_b, false);
3240
3241 if (dump_enabled_p () && STMT_VINFO_SLP_VECT_ONLY (stmtinfo_a))
3242 dump_printf_loc (MSG_NOTE, vect_location,
3243 "Load suitable for SLP vectorization only.\n");
3244
3245 if (init_b == init_prev
3246 && !to_fixup.add (DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
3247 && dump_enabled_p ())
3248 dump_printf_loc (MSG_NOTE, vect_location,
3249 "Queuing group with duplicate access for fixup\n");
3250 }
3251 }
3252
3253 /* Fixup groups with duplicate entries by splitting it. */
3254 while (1)
3255 {
3256 hash_set<stmt_vec_info>::iterator it = to_fixup.begin ();
3257 if (!(it != to_fixup.end ()))
3258 break;
3259 stmt_vec_info grp = *it;
3260 to_fixup.remove (grp);
3261
3262 /* Find the earliest duplicate group member. */
3263 unsigned first_duplicate = -1u;
3264 stmt_vec_info next, g = grp;
3265 while ((next = DR_GROUP_NEXT_ELEMENT (g)))
3266 {
3267 if (tree_int_cst_equal (DR_INIT (STMT_VINFO_DR_INFO (next)->dr),
3268 DR_INIT (STMT_VINFO_DR_INFO (g)->dr))
3269 && gimple_uid (STMT_VINFO_STMT (next)) < first_duplicate)
3270 first_duplicate = gimple_uid (STMT_VINFO_STMT (next));
3271 g = next;
3272 }
3273 if (first_duplicate == -1U)
3274 continue;
3275
3276 /* Then move all stmts after the first duplicate to a new group.
3277 Note this is a heuristic but one with the property that *it
3278 is fixed up completely. */
3279 g = grp;
3280 stmt_vec_info newgroup = NULL, ng = grp;
3281 while ((next = DR_GROUP_NEXT_ELEMENT (g)))
3282 {
3283 if (gimple_uid (STMT_VINFO_STMT (next)) >= first_duplicate)
3284 {
3285 DR_GROUP_NEXT_ELEMENT (g) = DR_GROUP_NEXT_ELEMENT (next);
3286 if (!newgroup)
3287 newgroup = next;
3288 else
3289 DR_GROUP_NEXT_ELEMENT (ng) = next;
3290 ng = next;
3291 DR_GROUP_FIRST_ELEMENT (ng) = newgroup;
3292 }
3293 else
3294 g = DR_GROUP_NEXT_ELEMENT (g);
3295 }
3296 DR_GROUP_NEXT_ELEMENT (ng) = NULL;
3297
3298 /* Fixup the new group which still may contain duplicates. */
3299 to_fixup.add (newgroup);
3300 }
3301
3302 dr_vec_info *dr_info;
3303 FOR_EACH_VEC_ELT (datarefs_copy, i, dr_info)
3304 {
3305 if (STMT_VINFO_VECTORIZABLE (dr_info->stmt)
3306 && !vect_analyze_data_ref_access (vinfo, dr_info))
3307 {
3308 if (dump_enabled_p ())
3309 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3310 "not vectorized: complicated access pattern.\n");
3311
3312 if (is_a <bb_vec_info> (vinfo))
3313 {
3314 /* Mark the statement as not vectorizable. */
3315 STMT_VINFO_VECTORIZABLE (dr_info->stmt) = false;
3316 continue;
3317 }
3318 else
3319 {
3320 datarefs_copy.release ();
3321 return opt_result::failure_at (dr_info->stmt->stmt,
3322 "not vectorized:"
3323 " complicated access pattern.\n");
3324 }
3325 }
3326 }
3327
3328 datarefs_copy.release ();
3329 return opt_result::success ();
3330 }
3331
3332 /* Function vect_vfa_segment_size.
3333
3334 Input:
3335 DR_INFO: The data reference.
3336 LENGTH_FACTOR: segment length to consider.
3337
3338 Return a value suitable for the dr_with_seg_len::seg_len field.
3339 This is the "distance travelled" by the pointer from the first
3340 iteration in the segment to the last. Note that it does not include
3341 the size of the access; in effect it only describes the first byte. */
3342
3343 static tree
3344 vect_vfa_segment_size (dr_vec_info *dr_info, tree length_factor)
3345 {
3346 length_factor = size_binop (MINUS_EXPR,
3347 fold_convert (sizetype, length_factor),
3348 size_one_node);
3349 return size_binop (MULT_EXPR, fold_convert (sizetype, DR_STEP (dr_info->dr)),
3350 length_factor);
3351 }
3352
3353 /* Return a value that, when added to abs (vect_vfa_segment_size (DR_INFO)),
3354 gives the worst-case number of bytes covered by the segment. */
3355
3356 static unsigned HOST_WIDE_INT
3357 vect_vfa_access_size (vec_info *vinfo, dr_vec_info *dr_info)
3358 {
3359 stmt_vec_info stmt_vinfo = dr_info->stmt;
3360 tree ref_type = TREE_TYPE (DR_REF (dr_info->dr));
3361 unsigned HOST_WIDE_INT ref_size = tree_to_uhwi (TYPE_SIZE_UNIT (ref_type));
3362 unsigned HOST_WIDE_INT access_size = ref_size;
3363 if (DR_GROUP_FIRST_ELEMENT (stmt_vinfo))
3364 {
3365 gcc_assert (DR_GROUP_FIRST_ELEMENT (stmt_vinfo) == stmt_vinfo);
3366 access_size *= DR_GROUP_SIZE (stmt_vinfo) - DR_GROUP_GAP (stmt_vinfo);
3367 }
3368 tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
3369 int misalignment;
3370 if (STMT_VINFO_VEC_STMTS (stmt_vinfo).exists ()
3371 && ((misalignment = dr_misalignment (dr_info, vectype)), true)
3372 && (vect_supportable_dr_alignment (vinfo, dr_info, vectype, misalignment)
3373 == dr_explicit_realign_optimized))
3374 {
3375 /* We might access a full vector's worth. */
3376 access_size += tree_to_uhwi (TYPE_SIZE_UNIT (vectype)) - ref_size;
3377 }
3378 return access_size;
3379 }
3380
3381 /* Get the minimum alignment for all the scalar accesses that DR_INFO
3382 describes. */
3383
3384 static unsigned int
3385 vect_vfa_align (dr_vec_info *dr_info)
3386 {
3387 return dr_alignment (dr_info->dr);
3388 }
3389
3390 /* Function vect_no_alias_p.
3391
3392 Given data references A and B with equal base and offset, see whether
3393 the alias relation can be decided at compilation time. Return 1 if
3394 it can and the references alias, 0 if it can and the references do
3395 not alias, and -1 if we cannot decide at compile time. SEGMENT_LENGTH_A,
3396 SEGMENT_LENGTH_B, ACCESS_SIZE_A and ACCESS_SIZE_B are the equivalent
3397 of dr_with_seg_len::{seg_len,access_size} for A and B. */
3398
3399 static int
3400 vect_compile_time_alias (dr_vec_info *a, dr_vec_info *b,
3401 tree segment_length_a, tree segment_length_b,
3402 unsigned HOST_WIDE_INT access_size_a,
3403 unsigned HOST_WIDE_INT access_size_b)
3404 {
3405 poly_offset_int offset_a = wi::to_poly_offset (DR_INIT (a->dr));
3406 poly_offset_int offset_b = wi::to_poly_offset (DR_INIT (b->dr));
3407 poly_uint64 const_length_a;
3408 poly_uint64 const_length_b;
3409
3410 /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
3411 bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
3412 [a, a+12) */
3413 if (tree_int_cst_compare (DR_STEP (a->dr), size_zero_node) < 0)
3414 {
3415 const_length_a = (-wi::to_poly_wide (segment_length_a)).force_uhwi ();
3416 offset_a -= const_length_a;
3417 }
3418 else
3419 const_length_a = tree_to_poly_uint64 (segment_length_a);
3420 if (tree_int_cst_compare (DR_STEP (b->dr), size_zero_node) < 0)
3421 {
3422 const_length_b = (-wi::to_poly_wide (segment_length_b)).force_uhwi ();
3423 offset_b -= const_length_b;
3424 }
3425 else
3426 const_length_b = tree_to_poly_uint64 (segment_length_b);
3427
3428 const_length_a += access_size_a;
3429 const_length_b += access_size_b;
3430
3431 if (ranges_known_overlap_p (offset_a, const_length_a,
3432 offset_b, const_length_b))
3433 return 1;
3434
3435 if (!ranges_maybe_overlap_p (offset_a, const_length_a,
3436 offset_b, const_length_b))
3437 return 0;
3438
3439 return -1;
3440 }
3441
3442 /* Return true if the minimum nonzero dependence distance for loop LOOP_DEPTH
3443 in DDR is >= VF. */
3444
3445 static bool
3446 dependence_distance_ge_vf (data_dependence_relation *ddr,
3447 unsigned int loop_depth, poly_uint64 vf)
3448 {
3449 if (DDR_ARE_DEPENDENT (ddr) != NULL_TREE
3450 || DDR_NUM_DIST_VECTS (ddr) == 0)
3451 return false;
3452
3453 /* If the dependence is exact, we should have limited the VF instead. */
3454 gcc_checking_assert (DDR_COULD_BE_INDEPENDENT_P (ddr));
3455
3456 unsigned int i;
3457 lambda_vector dist_v;
3458 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
3459 {
3460 HOST_WIDE_INT dist = dist_v[loop_depth];
3461 if (dist != 0
3462 && !(dist > 0 && DDR_REVERSED_P (ddr))
3463 && maybe_lt ((unsigned HOST_WIDE_INT) abs_hwi (dist), vf))
3464 return false;
3465 }
3466
3467 if (dump_enabled_p ())
3468 dump_printf_loc (MSG_NOTE, vect_location,
3469 "dependence distance between %T and %T is >= VF\n",
3470 DR_REF (DDR_A (ddr)), DR_REF (DDR_B (ddr)));
3471
3472 return true;
3473 }
3474
3475 /* Dump LOWER_BOUND using flags DUMP_KIND. Dumps are known to be enabled. */
3476
3477 static void
3478 dump_lower_bound (dump_flags_t dump_kind, const vec_lower_bound &lower_bound)
3479 {
3480 dump_printf (dump_kind, "%s (%T) >= ",
3481 lower_bound.unsigned_p ? "unsigned" : "abs",
3482 lower_bound.expr);
3483 dump_dec (dump_kind, lower_bound.min_value);
3484 }
3485
3486 /* Record that the vectorized loop requires the vec_lower_bound described
3487 by EXPR, UNSIGNED_P and MIN_VALUE. */
3488
3489 static void
3490 vect_check_lower_bound (loop_vec_info loop_vinfo, tree expr, bool unsigned_p,
3491 poly_uint64 min_value)
3492 {
3493 vec<vec_lower_bound> &lower_bounds
3494 = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
3495 for (unsigned int i = 0; i < lower_bounds.length (); ++i)
3496 if (operand_equal_p (lower_bounds[i].expr, expr, 0))
3497 {
3498 unsigned_p &= lower_bounds[i].unsigned_p;
3499 min_value = upper_bound (lower_bounds[i].min_value, min_value);
3500 if (lower_bounds[i].unsigned_p != unsigned_p
3501 || maybe_lt (lower_bounds[i].min_value, min_value))
3502 {
3503 lower_bounds[i].unsigned_p = unsigned_p;
3504 lower_bounds[i].min_value = min_value;
3505 if (dump_enabled_p ())
3506 {
3507 dump_printf_loc (MSG_NOTE, vect_location,
3508 "updating run-time check to ");
3509 dump_lower_bound (MSG_NOTE, lower_bounds[i]);
3510 dump_printf (MSG_NOTE, "\n");
3511 }
3512 }
3513 return;
3514 }
3515
3516 vec_lower_bound lower_bound (expr, unsigned_p, min_value);
3517 if (dump_enabled_p ())
3518 {
3519 dump_printf_loc (MSG_NOTE, vect_location, "need a run-time check that ");
3520 dump_lower_bound (MSG_NOTE, lower_bound);
3521 dump_printf (MSG_NOTE, "\n");
3522 }
3523 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).safe_push (lower_bound);
3524 }
3525
3526 /* Return true if it's unlikely that the step of the vectorized form of DR_INFO
3527 will span fewer than GAP bytes. */
3528
3529 static bool
3530 vect_small_gap_p (loop_vec_info loop_vinfo, dr_vec_info *dr_info,
3531 poly_int64 gap)
3532 {
3533 stmt_vec_info stmt_info = dr_info->stmt;
3534 HOST_WIDE_INT count
3535 = estimated_poly_value (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
3536 if (DR_GROUP_FIRST_ELEMENT (stmt_info))
3537 count *= DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (stmt_info));
3538 return (estimated_poly_value (gap)
3539 <= count * vect_get_scalar_dr_size (dr_info));
3540 }
3541
3542 /* Return true if we know that there is no alias between DR_INFO_A and
3543 DR_INFO_B when abs (DR_STEP (DR_INFO_A->dr)) >= N for some N.
3544 When returning true, set *LOWER_BOUND_OUT to this N. */
3545
3546 static bool
3547 vectorizable_with_step_bound_p (dr_vec_info *dr_info_a, dr_vec_info *dr_info_b,
3548 poly_uint64 *lower_bound_out)
3549 {
3550 /* Check that there is a constant gap of known sign between DR_A
3551 and DR_B. */
3552 data_reference *dr_a = dr_info_a->dr;
3553 data_reference *dr_b = dr_info_b->dr;
3554 poly_int64 init_a, init_b;
3555 if (!operand_equal_p (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b), 0)
3556 || !operand_equal_p (DR_OFFSET (dr_a), DR_OFFSET (dr_b), 0)
3557 || !operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0)
3558 || !poly_int_tree_p (DR_INIT (dr_a), &init_a)
3559 || !poly_int_tree_p (DR_INIT (dr_b), &init_b)
3560 || !ordered_p (init_a, init_b))
3561 return false;
3562
3563 /* Sort DR_A and DR_B by the address they access. */
3564 if (maybe_lt (init_b, init_a))
3565 {
3566 std::swap (init_a, init_b);
3567 std::swap (dr_info_a, dr_info_b);
3568 std::swap (dr_a, dr_b);
3569 }
3570
3571 /* If the two accesses could be dependent within a scalar iteration,
3572 make sure that we'd retain their order. */
3573 if (maybe_gt (init_a + vect_get_scalar_dr_size (dr_info_a), init_b)
3574 && !vect_preserves_scalar_order_p (dr_info_a, dr_info_b))
3575 return false;
3576
3577 /* There is no alias if abs (DR_STEP) is greater than or equal to
3578 the bytes spanned by the combination of the two accesses. */
3579 *lower_bound_out = init_b + vect_get_scalar_dr_size (dr_info_b) - init_a;
3580 return true;
3581 }
3582
3583 /* Function vect_prune_runtime_alias_test_list.
3584
3585 Prune a list of ddrs to be tested at run-time by versioning for alias.
3586 Merge several alias checks into one if possible.
3587 Return FALSE if resulting list of ddrs is longer than allowed by
3588 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
3589
3590 opt_result
3591 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
3592 {
3593 typedef pair_hash <tree_operand_hash, tree_operand_hash> tree_pair_hash;
3594 hash_set <tree_pair_hash> compared_objects;
3595
3596 const vec<ddr_p> &may_alias_ddrs = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
3597 vec<dr_with_seg_len_pair_t> &comp_alias_ddrs
3598 = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3599 const vec<vec_object_pair> &check_unequal_addrs
3600 = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3601 poly_uint64 vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3602 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
3603
3604 ddr_p ddr;
3605 unsigned int i;
3606 tree length_factor;
3607
3608 DUMP_VECT_SCOPE ("vect_prune_runtime_alias_test_list");
3609
3610 /* Step values are irrelevant for aliasing if the number of vector
3611 iterations is equal to the number of scalar iterations (which can
3612 happen for fully-SLP loops). */
3613 bool vf_one_p = known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U);
3614
3615 if (!vf_one_p)
3616 {
3617 /* Convert the checks for nonzero steps into bound tests. */
3618 tree value;
3619 FOR_EACH_VEC_ELT (LOOP_VINFO_CHECK_NONZERO (loop_vinfo), i, value)
3620 vect_check_lower_bound (loop_vinfo, value, true, 1);
3621 }
3622
3623 if (may_alias_ddrs.is_empty ())
3624 return opt_result::success ();
3625
3626 comp_alias_ddrs.create (may_alias_ddrs.length ());
3627
3628 unsigned int loop_depth
3629 = index_in_loop_nest (LOOP_VINFO_LOOP (loop_vinfo)->num,
3630 LOOP_VINFO_LOOP_NEST (loop_vinfo));
3631
3632 /* First, we collect all data ref pairs for aliasing checks. */
3633 FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
3634 {
3635 poly_uint64 lower_bound;
3636 tree segment_length_a, segment_length_b;
3637 unsigned HOST_WIDE_INT access_size_a, access_size_b;
3638 unsigned int align_a, align_b;
3639
3640 /* Ignore the alias if the VF we chose ended up being no greater
3641 than the dependence distance. */
3642 if (dependence_distance_ge_vf (ddr, loop_depth, vect_factor))
3643 continue;
3644
3645 if (DDR_OBJECT_A (ddr))
3646 {
3647 vec_object_pair new_pair (DDR_OBJECT_A (ddr), DDR_OBJECT_B (ddr));
3648 if (!compared_objects.add (new_pair))
3649 {
3650 if (dump_enabled_p ())
3651 dump_printf_loc (MSG_NOTE, vect_location,
3652 "checking that %T and %T"
3653 " have different addresses\n",
3654 new_pair.first, new_pair.second);
3655 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).safe_push (new_pair);
3656 }
3657 continue;
3658 }
3659
3660 dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (DDR_A (ddr));
3661 stmt_vec_info stmt_info_a = dr_info_a->stmt;
3662
3663 dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (DDR_B (ddr));
3664 stmt_vec_info stmt_info_b = dr_info_b->stmt;
3665
3666 bool preserves_scalar_order_p
3667 = vect_preserves_scalar_order_p (dr_info_a, dr_info_b);
3668 bool ignore_step_p
3669 = (vf_one_p
3670 && (preserves_scalar_order_p
3671 || operand_equal_p (DR_STEP (dr_info_a->dr),
3672 DR_STEP (dr_info_b->dr))));
3673
3674 /* Skip the pair if inter-iteration dependencies are irrelevant
3675 and intra-iteration dependencies are guaranteed to be honored. */
3676 if (ignore_step_p
3677 && (preserves_scalar_order_p
3678 || vectorizable_with_step_bound_p (dr_info_a, dr_info_b,
3679 &lower_bound)))
3680 {
3681 if (dump_enabled_p ())
3682 dump_printf_loc (MSG_NOTE, vect_location,
3683 "no need for alias check between "
3684 "%T and %T when VF is 1\n",
3685 DR_REF (dr_info_a->dr), DR_REF (dr_info_b->dr));
3686 continue;
3687 }
3688
3689 /* See whether we can handle the alias using a bounds check on
3690 the step, and whether that's likely to be the best approach.
3691 (It might not be, for example, if the minimum step is much larger
3692 than the number of bytes handled by one vector iteration.) */
3693 if (!ignore_step_p
3694 && TREE_CODE (DR_STEP (dr_info_a->dr)) != INTEGER_CST
3695 && vectorizable_with_step_bound_p (dr_info_a, dr_info_b,
3696 &lower_bound)
3697 && (vect_small_gap_p (loop_vinfo, dr_info_a, lower_bound)
3698 || vect_small_gap_p (loop_vinfo, dr_info_b, lower_bound)))
3699 {
3700 bool unsigned_p = dr_known_forward_stride_p (dr_info_a->dr);
3701 if (dump_enabled_p ())
3702 {
3703 dump_printf_loc (MSG_NOTE, vect_location, "no alias between "
3704 "%T and %T when the step %T is outside ",
3705 DR_REF (dr_info_a->dr),
3706 DR_REF (dr_info_b->dr),
3707 DR_STEP (dr_info_a->dr));
3708 if (unsigned_p)
3709 dump_printf (MSG_NOTE, "[0");
3710 else
3711 {
3712 dump_printf (MSG_NOTE, "(");
3713 dump_dec (MSG_NOTE, poly_int64 (-lower_bound));
3714 }
3715 dump_printf (MSG_NOTE, ", ");
3716 dump_dec (MSG_NOTE, lower_bound);
3717 dump_printf (MSG_NOTE, ")\n");
3718 }
3719 vect_check_lower_bound (loop_vinfo, DR_STEP (dr_info_a->dr),
3720 unsigned_p, lower_bound);
3721 continue;
3722 }
3723
3724 stmt_vec_info dr_group_first_a = DR_GROUP_FIRST_ELEMENT (stmt_info_a);
3725 if (dr_group_first_a)
3726 {
3727 stmt_info_a = dr_group_first_a;
3728 dr_info_a = STMT_VINFO_DR_INFO (stmt_info_a);
3729 }
3730
3731 stmt_vec_info dr_group_first_b = DR_GROUP_FIRST_ELEMENT (stmt_info_b);
3732 if (dr_group_first_b)
3733 {
3734 stmt_info_b = dr_group_first_b;
3735 dr_info_b = STMT_VINFO_DR_INFO (stmt_info_b);
3736 }
3737
3738 if (ignore_step_p)
3739 {
3740 segment_length_a = size_zero_node;
3741 segment_length_b = size_zero_node;
3742 }
3743 else
3744 {
3745 if (!operand_equal_p (DR_STEP (dr_info_a->dr),
3746 DR_STEP (dr_info_b->dr), 0))
3747 length_factor = scalar_loop_iters;
3748 else
3749 length_factor = size_int (vect_factor);
3750 segment_length_a = vect_vfa_segment_size (dr_info_a, length_factor);
3751 segment_length_b = vect_vfa_segment_size (dr_info_b, length_factor);
3752 }
3753 access_size_a = vect_vfa_access_size (loop_vinfo, dr_info_a);
3754 access_size_b = vect_vfa_access_size (loop_vinfo, dr_info_b);
3755 align_a = vect_vfa_align (dr_info_a);
3756 align_b = vect_vfa_align (dr_info_b);
3757
3758 /* See whether the alias is known at compilation time. */
3759 if (operand_equal_p (DR_BASE_ADDRESS (dr_info_a->dr),
3760 DR_BASE_ADDRESS (dr_info_b->dr), 0)
3761 && operand_equal_p (DR_OFFSET (dr_info_a->dr),
3762 DR_OFFSET (dr_info_b->dr), 0)
3763 && TREE_CODE (DR_STEP (dr_info_a->dr)) == INTEGER_CST
3764 && TREE_CODE (DR_STEP (dr_info_b->dr)) == INTEGER_CST
3765 && poly_int_tree_p (segment_length_a)
3766 && poly_int_tree_p (segment_length_b))
3767 {
3768 int res = vect_compile_time_alias (dr_info_a, dr_info_b,
3769 segment_length_a,
3770 segment_length_b,
3771 access_size_a,
3772 access_size_b);
3773 if (res >= 0 && dump_enabled_p ())
3774 {
3775 dump_printf_loc (MSG_NOTE, vect_location,
3776 "can tell at compile time that %T and %T",
3777 DR_REF (dr_info_a->dr), DR_REF (dr_info_b->dr));
3778 if (res == 0)
3779 dump_printf (MSG_NOTE, " do not alias\n");
3780 else
3781 dump_printf (MSG_NOTE, " alias\n");
3782 }
3783
3784 if (res == 0)
3785 continue;
3786
3787 if (res == 1)
3788 return opt_result::failure_at (stmt_info_b->stmt,
3789 "not vectorized:"
3790 " compilation time alias: %G%G",
3791 stmt_info_a->stmt,
3792 stmt_info_b->stmt);
3793 }
3794
3795 dr_with_seg_len dr_a (dr_info_a->dr, segment_length_a,
3796 access_size_a, align_a);
3797 dr_with_seg_len dr_b (dr_info_b->dr, segment_length_b,
3798 access_size_b, align_b);
3799 /* Canonicalize the order to be the one that's needed for accurate
3800 RAW, WAR and WAW flags, in cases where the data references are
3801 well-ordered. The order doesn't really matter otherwise,
3802 but we might as well be consistent. */
3803 if (get_later_stmt (stmt_info_a, stmt_info_b) == stmt_info_a)
3804 std::swap (dr_a, dr_b);
3805
3806 dr_with_seg_len_pair_t dr_with_seg_len_pair
3807 (dr_a, dr_b, (preserves_scalar_order_p
3808 ? dr_with_seg_len_pair_t::WELL_ORDERED
3809 : dr_with_seg_len_pair_t::REORDERED));
3810
3811 comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
3812 }
3813
3814 prune_runtime_alias_test_list (&comp_alias_ddrs, vect_factor);
3815
3816 unsigned int count = (comp_alias_ddrs.length ()
3817 + check_unequal_addrs.length ());
3818
3819 if (count
3820 && (loop_cost_model (LOOP_VINFO_LOOP (loop_vinfo))
3821 == VECT_COST_MODEL_VERY_CHEAP))
3822 return opt_result::failure_at
3823 (vect_location, "would need a runtime alias check\n");
3824
3825 if (dump_enabled_p ())
3826 dump_printf_loc (MSG_NOTE, vect_location,
3827 "improved number of alias checks from %d to %d\n",
3828 may_alias_ddrs.length (), count);
3829 unsigned limit = param_vect_max_version_for_alias_checks;
3830 if (loop_cost_model (LOOP_VINFO_LOOP (loop_vinfo)) == VECT_COST_MODEL_CHEAP)
3831 limit = param_vect_max_version_for_alias_checks * 6 / 10;
3832 if (count > limit)
3833 return opt_result::failure_at
3834 (vect_location,
3835 "number of versioning for alias run-time tests exceeds %d "
3836 "(--param vect-max-version-for-alias-checks)\n", limit);
3837
3838 return opt_result::success ();
3839 }
3840
3841 /* Check whether we can use an internal function for a gather load
3842 or scatter store. READ_P is true for loads and false for stores.
3843 MASKED_P is true if the load or store is conditional. MEMORY_TYPE is
3844 the type of the memory elements being loaded or stored. OFFSET_TYPE
3845 is the type of the offset that is being applied to the invariant
3846 base address. SCALE is the amount by which the offset should
3847 be multiplied *after* it has been converted to address width.
3848
3849 Return true if the function is supported, storing the function id in
3850 *IFN_OUT and the vector type for the offset in *OFFSET_VECTYPE_OUT. */
3851
3852 bool
3853 vect_gather_scatter_fn_p (vec_info *vinfo, bool read_p, bool masked_p,
3854 tree vectype, tree memory_type, tree offset_type,
3855 int scale, internal_fn *ifn_out,
3856 tree *offset_vectype_out)
3857 {
3858 unsigned int memory_bits = tree_to_uhwi (TYPE_SIZE (memory_type));
3859 unsigned int element_bits = vector_element_bits (vectype);
3860 if (element_bits != memory_bits)
3861 /* For now the vector elements must be the same width as the
3862 memory elements. */
3863 return false;
3864
3865 /* Work out which function we need. */
3866 internal_fn ifn, alt_ifn;
3867 if (read_p)
3868 {
3869 ifn = masked_p ? IFN_MASK_GATHER_LOAD : IFN_GATHER_LOAD;
3870 alt_ifn = IFN_MASK_GATHER_LOAD;
3871 }
3872 else
3873 {
3874 ifn = masked_p ? IFN_MASK_SCATTER_STORE : IFN_SCATTER_STORE;
3875 alt_ifn = IFN_MASK_SCATTER_STORE;
3876 }
3877
3878 for (;;)
3879 {
3880 tree offset_vectype = get_vectype_for_scalar_type (vinfo, offset_type);
3881 if (!offset_vectype)
3882 return false;
3883
3884 /* Test whether the target supports this combination. */
3885 if (internal_gather_scatter_fn_supported_p (ifn, vectype, memory_type,
3886 offset_vectype, scale))
3887 {
3888 *ifn_out = ifn;
3889 *offset_vectype_out = offset_vectype;
3890 return true;
3891 }
3892 else if (!masked_p
3893 && internal_gather_scatter_fn_supported_p (alt_ifn, vectype,
3894 memory_type,
3895 offset_vectype,
3896 scale))
3897 {
3898 *ifn_out = alt_ifn;
3899 *offset_vectype_out = offset_vectype;
3900 return true;
3901 }
3902
3903 if (TYPE_PRECISION (offset_type) >= POINTER_SIZE
3904 && TYPE_PRECISION (offset_type) >= element_bits)
3905 return false;
3906
3907 offset_type = build_nonstandard_integer_type
3908 (TYPE_PRECISION (offset_type) * 2, TYPE_UNSIGNED (offset_type));
3909 }
3910 }
3911
3912 /* STMT_INFO is a call to an internal gather load or scatter store function.
3913 Describe the operation in INFO. */
3914
3915 static void
3916 vect_describe_gather_scatter_call (stmt_vec_info stmt_info,
3917 gather_scatter_info *info)
3918 {
3919 gcall *call = as_a <gcall *> (stmt_info->stmt);
3920 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3921 data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3922
3923 info->ifn = gimple_call_internal_fn (call);
3924 info->decl = NULL_TREE;
3925 info->base = gimple_call_arg (call, 0);
3926 info->offset = gimple_call_arg (call, 1);
3927 info->offset_dt = vect_unknown_def_type;
3928 info->offset_vectype = NULL_TREE;
3929 info->scale = TREE_INT_CST_LOW (gimple_call_arg (call, 2));
3930 info->element_type = TREE_TYPE (vectype);
3931 info->memory_type = TREE_TYPE (DR_REF (dr));
3932 }
3933
3934 /* Return true if a non-affine read or write in STMT_INFO is suitable for a
3935 gather load or scatter store. Describe the operation in *INFO if so. */
3936
3937 bool
3938 vect_check_gather_scatter (stmt_vec_info stmt_info, loop_vec_info loop_vinfo,
3939 gather_scatter_info *info)
3940 {
3941 HOST_WIDE_INT scale = 1;
3942 poly_int64 pbitpos, pbitsize;
3943 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3944 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3945 tree offtype = NULL_TREE;
3946 tree decl = NULL_TREE, base, off;
3947 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3948 tree memory_type = TREE_TYPE (DR_REF (dr));
3949 machine_mode pmode;
3950 int punsignedp, reversep, pvolatilep = 0;
3951 internal_fn ifn;
3952 tree offset_vectype;
3953 bool masked_p = false;
3954
3955 /* See whether this is already a call to a gather/scatter internal function.
3956 If not, see whether it's a masked load or store. */
3957 gcall *call = dyn_cast <gcall *> (stmt_info->stmt);
3958 if (call && gimple_call_internal_p (call))
3959 {
3960 ifn = gimple_call_internal_fn (call);
3961 if (internal_gather_scatter_fn_p (ifn))
3962 {
3963 vect_describe_gather_scatter_call (stmt_info, info);
3964 return true;
3965 }
3966 masked_p = (ifn == IFN_MASK_LOAD || ifn == IFN_MASK_STORE);
3967 }
3968
3969 /* True if we should aim to use internal functions rather than
3970 built-in functions. */
3971 bool use_ifn_p = (DR_IS_READ (dr)
3972 ? supports_vec_gather_load_p ()
3973 : supports_vec_scatter_store_p ());
3974
3975 base = DR_REF (dr);
3976 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
3977 see if we can use the def stmt of the address. */
3978 if (masked_p
3979 && TREE_CODE (base) == MEM_REF
3980 && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
3981 && integer_zerop (TREE_OPERAND (base, 1))
3982 && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
3983 {
3984 gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
3985 if (is_gimple_assign (def_stmt)
3986 && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
3987 base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
3988 }
3989
3990 /* The gather and scatter builtins need address of the form
3991 loop_invariant + vector * {1, 2, 4, 8}
3992 or
3993 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3994 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3995 of loop invariants/SSA_NAMEs defined in the loop, with casts,
3996 multiplications and additions in it. To get a vector, we need
3997 a single SSA_NAME that will be defined in the loop and will
3998 contain everything that is not loop invariant and that can be
3999 vectorized. The following code attempts to find such a preexistng
4000 SSA_NAME OFF and put the loop invariants into a tree BASE
4001 that can be gimplified before the loop. */
4002 base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
4003 &punsignedp, &reversep, &pvolatilep);
4004 if (reversep)
4005 return false;
4006
4007 poly_int64 pbytepos = exact_div (pbitpos, BITS_PER_UNIT);
4008
4009 if (TREE_CODE (base) == MEM_REF)
4010 {
4011 if (!integer_zerop (TREE_OPERAND (base, 1)))
4012 {
4013 if (off == NULL_TREE)
4014 off = wide_int_to_tree (sizetype, mem_ref_offset (base));
4015 else
4016 off = size_binop (PLUS_EXPR, off,
4017 fold_convert (sizetype, TREE_OPERAND (base, 1)));
4018 }
4019 base = TREE_OPERAND (base, 0);
4020 }
4021 else
4022 base = build_fold_addr_expr (base);
4023
4024 if (off == NULL_TREE)
4025 off = size_zero_node;
4026
4027 /* If base is not loop invariant, either off is 0, then we start with just
4028 the constant offset in the loop invariant BASE and continue with base
4029 as OFF, otherwise give up.
4030 We could handle that case by gimplifying the addition of base + off
4031 into some SSA_NAME and use that as off, but for now punt. */
4032 if (!expr_invariant_in_loop_p (loop, base))
4033 {
4034 if (!integer_zerop (off))
4035 return false;
4036 off = base;
4037 base = size_int (pbytepos);
4038 }
4039 /* Otherwise put base + constant offset into the loop invariant BASE
4040 and continue with OFF. */
4041 else
4042 {
4043 base = fold_convert (sizetype, base);
4044 base = size_binop (PLUS_EXPR, base, size_int (pbytepos));
4045 }
4046
4047 /* OFF at this point may be either a SSA_NAME or some tree expression
4048 from get_inner_reference. Try to peel off loop invariants from it
4049 into BASE as long as possible. */
4050 STRIP_NOPS (off);
4051 while (offtype == NULL_TREE)
4052 {
4053 enum tree_code code;
4054 tree op0, op1, add = NULL_TREE;
4055
4056 if (TREE_CODE (off) == SSA_NAME)
4057 {
4058 gimple *def_stmt = SSA_NAME_DEF_STMT (off);
4059
4060 if (expr_invariant_in_loop_p (loop, off))
4061 return false;
4062
4063 if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
4064 break;
4065
4066 op0 = gimple_assign_rhs1 (def_stmt);
4067 code = gimple_assign_rhs_code (def_stmt);
4068 op1 = gimple_assign_rhs2 (def_stmt);
4069 }
4070 else
4071 {
4072 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
4073 return false;
4074 code = TREE_CODE (off);
4075 extract_ops_from_tree (off, &code, &op0, &op1);
4076 }
4077 switch (code)
4078 {
4079 case POINTER_PLUS_EXPR:
4080 case PLUS_EXPR:
4081 if (expr_invariant_in_loop_p (loop, op0))
4082 {
4083 add = op0;
4084 off = op1;
4085 do_add:
4086 add = fold_convert (sizetype, add);
4087 if (scale != 1)
4088 add = size_binop (MULT_EXPR, add, size_int (scale));
4089 base = size_binop (PLUS_EXPR, base, add);
4090 continue;
4091 }
4092 if (expr_invariant_in_loop_p (loop, op1))
4093 {
4094 add = op1;
4095 off = op0;
4096 goto do_add;
4097 }
4098 break;
4099 case MINUS_EXPR:
4100 if (expr_invariant_in_loop_p (loop, op1))
4101 {
4102 add = fold_convert (sizetype, op1);
4103 add = size_binop (MINUS_EXPR, size_zero_node, add);
4104 off = op0;
4105 goto do_add;
4106 }
4107 break;
4108 case MULT_EXPR:
4109 if (scale == 1 && tree_fits_shwi_p (op1))
4110 {
4111 int new_scale = tree_to_shwi (op1);
4112 /* Only treat this as a scaling operation if the target
4113 supports it for at least some offset type. */
4114 if (use_ifn_p
4115 && !vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
4116 masked_p, vectype, memory_type,
4117 signed_char_type_node,
4118 new_scale, &ifn,
4119 &offset_vectype)
4120 && !vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
4121 masked_p, vectype, memory_type,
4122 unsigned_char_type_node,
4123 new_scale, &ifn,
4124 &offset_vectype))
4125 break;
4126 scale = new_scale;
4127 off = op0;
4128 continue;
4129 }
4130 break;
4131 case SSA_NAME:
4132 off = op0;
4133 continue;
4134 CASE_CONVERT:
4135 if (!POINTER_TYPE_P (TREE_TYPE (op0))
4136 && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
4137 break;
4138
4139 /* Don't include the conversion if the target is happy with
4140 the current offset type. */
4141 if (use_ifn_p
4142 && vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
4143 masked_p, vectype, memory_type,
4144 TREE_TYPE (off), scale, &ifn,
4145 &offset_vectype))
4146 break;
4147
4148 if (TYPE_PRECISION (TREE_TYPE (op0))
4149 == TYPE_PRECISION (TREE_TYPE (off)))
4150 {
4151 off = op0;
4152 continue;
4153 }
4154
4155 /* Include the conversion if it is widening and we're using
4156 the IFN path or the target can handle the converted from
4157 offset or the current size is not already the same as the
4158 data vector element size. */
4159 if ((TYPE_PRECISION (TREE_TYPE (op0))
4160 < TYPE_PRECISION (TREE_TYPE (off)))
4161 && (use_ifn_p
4162 || (DR_IS_READ (dr)
4163 ? (targetm.vectorize.builtin_gather
4164 && targetm.vectorize.builtin_gather (vectype,
4165 TREE_TYPE (op0),
4166 scale))
4167 : (targetm.vectorize.builtin_scatter
4168 && targetm.vectorize.builtin_scatter (vectype,
4169 TREE_TYPE (op0),
4170 scale)))
4171 || !operand_equal_p (TYPE_SIZE (TREE_TYPE (off)),
4172 TYPE_SIZE (TREE_TYPE (vectype)), 0)))
4173 {
4174 off = op0;
4175 offtype = TREE_TYPE (off);
4176 STRIP_NOPS (off);
4177 continue;
4178 }
4179 break;
4180 default:
4181 break;
4182 }
4183 break;
4184 }
4185
4186 /* If at the end OFF still isn't a SSA_NAME or isn't
4187 defined in the loop, punt. */
4188 if (TREE_CODE (off) != SSA_NAME
4189 || expr_invariant_in_loop_p (loop, off))
4190 return false;
4191
4192 if (offtype == NULL_TREE)
4193 offtype = TREE_TYPE (off);
4194
4195 if (use_ifn_p)
4196 {
4197 if (!vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr), masked_p,
4198 vectype, memory_type, offtype, scale,
4199 &ifn, &offset_vectype))
4200 ifn = IFN_LAST;
4201 decl = NULL_TREE;
4202 }
4203 else
4204 {
4205 if (DR_IS_READ (dr))
4206 {
4207 if (targetm.vectorize.builtin_gather)
4208 decl = targetm.vectorize.builtin_gather (vectype, offtype, scale);
4209 }
4210 else
4211 {
4212 if (targetm.vectorize.builtin_scatter)
4213 decl = targetm.vectorize.builtin_scatter (vectype, offtype, scale);
4214 }
4215 ifn = IFN_LAST;
4216 /* The offset vector type will be read from DECL when needed. */
4217 offset_vectype = NULL_TREE;
4218 }
4219
4220 info->ifn = ifn;
4221 info->decl = decl;
4222 info->base = base;
4223 info->offset = off;
4224 info->offset_dt = vect_unknown_def_type;
4225 info->offset_vectype = offset_vectype;
4226 info->scale = scale;
4227 info->element_type = TREE_TYPE (vectype);
4228 info->memory_type = memory_type;
4229 return true;
4230 }
4231
4232 /* Find the data references in STMT, analyze them with respect to LOOP and
4233 append them to DATAREFS. Return false if datarefs in this stmt cannot
4234 be handled. */
4235
4236 opt_result
4237 vect_find_stmt_data_reference (loop_p loop, gimple *stmt,
4238 vec<data_reference_p> *datarefs,
4239 vec<int> *dataref_groups, int group_id)
4240 {
4241 /* We can ignore clobbers for dataref analysis - they are removed during
4242 loop vectorization and BB vectorization checks dependences with a
4243 stmt walk. */
4244 if (gimple_clobber_p (stmt))
4245 return opt_result::success ();
4246
4247 if (gimple_has_volatile_ops (stmt))
4248 return opt_result::failure_at (stmt, "not vectorized: volatile type: %G",
4249 stmt);
4250
4251 if (stmt_can_throw_internal (cfun, stmt))
4252 return opt_result::failure_at (stmt,
4253 "not vectorized:"
4254 " statement can throw an exception: %G",
4255 stmt);
4256
4257 auto_vec<data_reference_p, 2> refs;
4258 opt_result res = find_data_references_in_stmt (loop, stmt, &refs);
4259 if (!res)
4260 return res;
4261
4262 if (refs.is_empty ())
4263 return opt_result::success ();
4264
4265 if (refs.length () > 1)
4266 {
4267 while (!refs.is_empty ())
4268 free_data_ref (refs.pop ());
4269 return opt_result::failure_at (stmt,
4270 "not vectorized: more than one "
4271 "data ref in stmt: %G", stmt);
4272 }
4273
4274 data_reference_p dr = refs.pop ();
4275 if (gcall *call = dyn_cast <gcall *> (stmt))
4276 if (!gimple_call_internal_p (call)
4277 || (gimple_call_internal_fn (call) != IFN_MASK_LOAD
4278 && gimple_call_internal_fn (call) != IFN_MASK_STORE))
4279 {
4280 free_data_ref (dr);
4281 return opt_result::failure_at (stmt,
4282 "not vectorized: dr in a call %G", stmt);
4283 }
4284
4285 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
4286 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
4287 {
4288 free_data_ref (dr);
4289 return opt_result::failure_at (stmt,
4290 "not vectorized:"
4291 " statement is bitfield access %G", stmt);
4292 }
4293
4294 if (DR_BASE_ADDRESS (dr)
4295 && TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
4296 {
4297 free_data_ref (dr);
4298 return opt_result::failure_at (stmt,
4299 "not vectorized:"
4300 " base addr of dr is a constant\n");
4301 }
4302
4303 /* Check whether this may be a SIMD lane access and adjust the
4304 DR to make it easier for us to handle it. */
4305 if (loop
4306 && loop->simduid
4307 && (!DR_BASE_ADDRESS (dr)
4308 || !DR_OFFSET (dr)
4309 || !DR_INIT (dr)
4310 || !DR_STEP (dr)))
4311 {
4312 struct data_reference *newdr
4313 = create_data_ref (NULL, loop_containing_stmt (stmt), DR_REF (dr), stmt,
4314 DR_IS_READ (dr), DR_IS_CONDITIONAL_IN_STMT (dr));
4315 if (DR_BASE_ADDRESS (newdr)
4316 && DR_OFFSET (newdr)
4317 && DR_INIT (newdr)
4318 && DR_STEP (newdr)
4319 && TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
4320 && integer_zerop (DR_STEP (newdr)))
4321 {
4322 tree base_address = DR_BASE_ADDRESS (newdr);
4323 tree off = DR_OFFSET (newdr);
4324 tree step = ssize_int (1);
4325 if (integer_zerop (off)
4326 && TREE_CODE (base_address) == POINTER_PLUS_EXPR)
4327 {
4328 off = TREE_OPERAND (base_address, 1);
4329 base_address = TREE_OPERAND (base_address, 0);
4330 }
4331 STRIP_NOPS (off);
4332 if (TREE_CODE (off) == MULT_EXPR
4333 && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
4334 {
4335 step = TREE_OPERAND (off, 1);
4336 off = TREE_OPERAND (off, 0);
4337 STRIP_NOPS (off);
4338 }
4339 if (CONVERT_EXPR_P (off)
4340 && (TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off, 0)))
4341 < TYPE_PRECISION (TREE_TYPE (off))))
4342 off = TREE_OPERAND (off, 0);
4343 if (TREE_CODE (off) == SSA_NAME)
4344 {
4345 gimple *def = SSA_NAME_DEF_STMT (off);
4346 /* Look through widening conversion. */
4347 if (is_gimple_assign (def)
4348 && CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (def)))
4349 {
4350 tree rhs1 = gimple_assign_rhs1 (def);
4351 if (TREE_CODE (rhs1) == SSA_NAME
4352 && INTEGRAL_TYPE_P (TREE_TYPE (rhs1))
4353 && (TYPE_PRECISION (TREE_TYPE (off))
4354 > TYPE_PRECISION (TREE_TYPE (rhs1))))
4355 def = SSA_NAME_DEF_STMT (rhs1);
4356 }
4357 if (is_gimple_call (def)
4358 && gimple_call_internal_p (def)
4359 && (gimple_call_internal_fn (def) == IFN_GOMP_SIMD_LANE))
4360 {
4361 tree arg = gimple_call_arg (def, 0);
4362 tree reft = TREE_TYPE (DR_REF (newdr));
4363 gcc_assert (TREE_CODE (arg) == SSA_NAME);
4364 arg = SSA_NAME_VAR (arg);
4365 if (arg == loop->simduid
4366 /* For now. */
4367 && tree_int_cst_equal (TYPE_SIZE_UNIT (reft), step))
4368 {
4369 DR_BASE_ADDRESS (newdr) = base_address;
4370 DR_OFFSET (newdr) = ssize_int (0);
4371 DR_STEP (newdr) = step;
4372 DR_OFFSET_ALIGNMENT (newdr) = BIGGEST_ALIGNMENT;
4373 DR_STEP_ALIGNMENT (newdr) = highest_pow2_factor (step);
4374 /* Mark as simd-lane access. */
4375 tree arg2 = gimple_call_arg (def, 1);
4376 newdr->aux = (void *) (-1 - tree_to_uhwi (arg2));
4377 free_data_ref (dr);
4378 datarefs->safe_push (newdr);
4379 if (dataref_groups)
4380 dataref_groups->safe_push (group_id);
4381 return opt_result::success ();
4382 }
4383 }
4384 }
4385 }
4386 free_data_ref (newdr);
4387 }
4388
4389 datarefs->safe_push (dr);
4390 if (dataref_groups)
4391 dataref_groups->safe_push (group_id);
4392 return opt_result::success ();
4393 }
4394
4395 /* Function vect_analyze_data_refs.
4396
4397 Find all the data references in the loop or basic block.
4398
4399 The general structure of the analysis of data refs in the vectorizer is as
4400 follows:
4401 1- vect_analyze_data_refs(loop/bb): call
4402 compute_data_dependences_for_loop/bb to find and analyze all data-refs
4403 in the loop/bb and their dependences.
4404 2- vect_analyze_dependences(): apply dependence testing using ddrs.
4405 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
4406 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
4407
4408 */
4409
4410 opt_result
4411 vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf, bool *fatal)
4412 {
4413 class loop *loop = NULL;
4414 unsigned int i;
4415 struct data_reference *dr;
4416 tree scalar_type;
4417
4418 DUMP_VECT_SCOPE ("vect_analyze_data_refs");
4419
4420 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
4421 loop = LOOP_VINFO_LOOP (loop_vinfo);
4422
4423 /* Go through the data-refs, check that the analysis succeeded. Update
4424 pointer from stmt_vec_info struct to DR and vectype. */
4425
4426 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
4427 FOR_EACH_VEC_ELT (datarefs, i, dr)
4428 {
4429 enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
4430 poly_uint64 vf;
4431
4432 gcc_assert (DR_REF (dr));
4433 stmt_vec_info stmt_info = vinfo->lookup_stmt (DR_STMT (dr));
4434 gcc_assert (!stmt_info->dr_aux.dr);
4435 stmt_info->dr_aux.dr = dr;
4436 stmt_info->dr_aux.stmt = stmt_info;
4437
4438 /* Check that analysis of the data-ref succeeded. */
4439 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
4440 || !DR_STEP (dr))
4441 {
4442 bool maybe_gather
4443 = DR_IS_READ (dr)
4444 && !TREE_THIS_VOLATILE (DR_REF (dr));
4445 bool maybe_scatter
4446 = DR_IS_WRITE (dr)
4447 && !TREE_THIS_VOLATILE (DR_REF (dr))
4448 && (targetm.vectorize.builtin_scatter != NULL
4449 || supports_vec_scatter_store_p ());
4450
4451 /* If target supports vector gather loads or scatter stores,
4452 see if they can't be used. */
4453 if (is_a <loop_vec_info> (vinfo)
4454 && !nested_in_vect_loop_p (loop, stmt_info))
4455 {
4456 if (maybe_gather || maybe_scatter)
4457 {
4458 if (maybe_gather)
4459 gatherscatter = GATHER;
4460 else
4461 gatherscatter = SCATTER;
4462 }
4463 }
4464
4465 if (gatherscatter == SG_NONE)
4466 {
4467 if (dump_enabled_p ())
4468 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4469 "not vectorized: data ref analysis "
4470 "failed %G", stmt_info->stmt);
4471 if (is_a <bb_vec_info> (vinfo))
4472 {
4473 /* In BB vectorization the ref can still participate
4474 in dependence analysis, we just can't vectorize it. */
4475 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4476 continue;
4477 }
4478 return opt_result::failure_at (stmt_info->stmt,
4479 "not vectorized:"
4480 " data ref analysis failed: %G",
4481 stmt_info->stmt);
4482 }
4483 }
4484
4485 /* See if this was detected as SIMD lane access. */
4486 if (dr->aux == (void *)-1
4487 || dr->aux == (void *)-2
4488 || dr->aux == (void *)-3
4489 || dr->aux == (void *)-4)
4490 {
4491 if (nested_in_vect_loop_p (loop, stmt_info))
4492 return opt_result::failure_at (stmt_info->stmt,
4493 "not vectorized:"
4494 " data ref analysis failed: %G",
4495 stmt_info->stmt);
4496 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info)
4497 = -(uintptr_t) dr->aux;
4498 }
4499
4500 tree base = get_base_address (DR_REF (dr));
4501 if (base && VAR_P (base) && DECL_NONALIASED (base))
4502 {
4503 if (dump_enabled_p ())
4504 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4505 "not vectorized: base object not addressable "
4506 "for stmt: %G", stmt_info->stmt);
4507 if (is_a <bb_vec_info> (vinfo))
4508 {
4509 /* In BB vectorization the ref can still participate
4510 in dependence analysis, we just can't vectorize it. */
4511 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4512 continue;
4513 }
4514 return opt_result::failure_at (stmt_info->stmt,
4515 "not vectorized: base object not"
4516 " addressable for stmt: %G",
4517 stmt_info->stmt);
4518 }
4519
4520 if (is_a <loop_vec_info> (vinfo)
4521 && DR_STEP (dr)
4522 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
4523 {
4524 if (nested_in_vect_loop_p (loop, stmt_info))
4525 return opt_result::failure_at (stmt_info->stmt,
4526 "not vectorized: "
4527 "not suitable for strided load %G",
4528 stmt_info->stmt);
4529 STMT_VINFO_STRIDED_P (stmt_info) = true;
4530 }
4531
4532 /* Update DR field in stmt_vec_info struct. */
4533
4534 /* If the dataref is in an inner-loop of the loop that is considered for
4535 for vectorization, we also want to analyze the access relative to
4536 the outer-loop (DR contains information only relative to the
4537 inner-most enclosing loop). We do that by building a reference to the
4538 first location accessed by the inner-loop, and analyze it relative to
4539 the outer-loop. */
4540 if (loop && nested_in_vect_loop_p (loop, stmt_info))
4541 {
4542 /* Build a reference to the first location accessed by the
4543 inner loop: *(BASE + INIT + OFFSET). By construction,
4544 this address must be invariant in the inner loop, so we
4545 can consider it as being used in the outer loop. */
4546 tree base = unshare_expr (DR_BASE_ADDRESS (dr));
4547 tree offset = unshare_expr (DR_OFFSET (dr));
4548 tree init = unshare_expr (DR_INIT (dr));
4549 tree init_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset),
4550 init, offset);
4551 tree init_addr = fold_build_pointer_plus (base, init_offset);
4552 tree init_ref = build_fold_indirect_ref (init_addr);
4553
4554 if (dump_enabled_p ())
4555 dump_printf_loc (MSG_NOTE, vect_location,
4556 "analyze in outer loop: %T\n", init_ref);
4557
4558 opt_result res
4559 = dr_analyze_innermost (&STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info),
4560 init_ref, loop, stmt_info->stmt);
4561 if (!res)
4562 /* dr_analyze_innermost already explained the failure. */
4563 return res;
4564
4565 if (dump_enabled_p ())
4566 dump_printf_loc (MSG_NOTE, vect_location,
4567 "\touter base_address: %T\n"
4568 "\touter offset from base address: %T\n"
4569 "\touter constant offset from base address: %T\n"
4570 "\touter step: %T\n"
4571 "\touter base alignment: %d\n\n"
4572 "\touter base misalignment: %d\n"
4573 "\touter offset alignment: %d\n"
4574 "\touter step alignment: %d\n",
4575 STMT_VINFO_DR_BASE_ADDRESS (stmt_info),
4576 STMT_VINFO_DR_OFFSET (stmt_info),
4577 STMT_VINFO_DR_INIT (stmt_info),
4578 STMT_VINFO_DR_STEP (stmt_info),
4579 STMT_VINFO_DR_BASE_ALIGNMENT (stmt_info),
4580 STMT_VINFO_DR_BASE_MISALIGNMENT (stmt_info),
4581 STMT_VINFO_DR_OFFSET_ALIGNMENT (stmt_info),
4582 STMT_VINFO_DR_STEP_ALIGNMENT (stmt_info));
4583 }
4584
4585 /* Set vectype for STMT. */
4586 scalar_type = TREE_TYPE (DR_REF (dr));
4587 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type);
4588 if (!vectype)
4589 {
4590 if (dump_enabled_p ())
4591 {
4592 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4593 "not vectorized: no vectype for stmt: %G",
4594 stmt_info->stmt);
4595 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
4596 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
4597 scalar_type);
4598 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
4599 }
4600
4601 if (is_a <bb_vec_info> (vinfo))
4602 {
4603 /* No vector type is fine, the ref can still participate
4604 in dependence analysis, we just can't vectorize it. */
4605 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4606 continue;
4607 }
4608 if (fatal)
4609 *fatal = false;
4610 return opt_result::failure_at (stmt_info->stmt,
4611 "not vectorized:"
4612 " no vectype for stmt: %G"
4613 " scalar_type: %T\n",
4614 stmt_info->stmt, scalar_type);
4615 }
4616 else
4617 {
4618 if (dump_enabled_p ())
4619 dump_printf_loc (MSG_NOTE, vect_location,
4620 "got vectype for stmt: %G%T\n",
4621 stmt_info->stmt, vectype);
4622 }
4623
4624 /* Adjust the minimal vectorization factor according to the
4625 vector type. */
4626 vf = TYPE_VECTOR_SUBPARTS (vectype);
4627 *min_vf = upper_bound (*min_vf, vf);
4628
4629 /* Leave the BB vectorizer to pick the vector type later, based on
4630 the final dataref group size and SLP node size. */
4631 if (is_a <loop_vec_info> (vinfo))
4632 STMT_VINFO_VECTYPE (stmt_info) = vectype;
4633
4634 if (gatherscatter != SG_NONE)
4635 {
4636 gather_scatter_info gs_info;
4637 if (!vect_check_gather_scatter (stmt_info,
4638 as_a <loop_vec_info> (vinfo),
4639 &gs_info)
4640 || !get_vectype_for_scalar_type (vinfo,
4641 TREE_TYPE (gs_info.offset)))
4642 {
4643 if (fatal)
4644 *fatal = false;
4645 return opt_result::failure_at
4646 (stmt_info->stmt,
4647 (gatherscatter == GATHER)
4648 ? "not vectorized: not suitable for gather load %G"
4649 : "not vectorized: not suitable for scatter store %G",
4650 stmt_info->stmt);
4651 }
4652 STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
4653 }
4654 }
4655
4656 /* We used to stop processing and prune the list here. Verify we no
4657 longer need to. */
4658 gcc_assert (i == datarefs.length ());
4659
4660 return opt_result::success ();
4661 }
4662
4663
4664 /* Function vect_get_new_vect_var.
4665
4666 Returns a name for a new variable. The current naming scheme appends the
4667 prefix "vect_" or "vect_p" (depending on the value of VAR_KIND) to
4668 the name of vectorizer generated variables, and appends that to NAME if
4669 provided. */
4670
4671 tree
4672 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
4673 {
4674 const char *prefix;
4675 tree new_vect_var;
4676
4677 switch (var_kind)
4678 {
4679 case vect_simple_var:
4680 prefix = "vect";
4681 break;
4682 case vect_scalar_var:
4683 prefix = "stmp";
4684 break;
4685 case vect_mask_var:
4686 prefix = "mask";
4687 break;
4688 case vect_pointer_var:
4689 prefix = "vectp";
4690 break;
4691 default:
4692 gcc_unreachable ();
4693 }
4694
4695 if (name)
4696 {
4697 char* tmp = concat (prefix, "_", name, NULL);
4698 new_vect_var = create_tmp_reg (type, tmp);
4699 free (tmp);
4700 }
4701 else
4702 new_vect_var = create_tmp_reg (type, prefix);
4703
4704 return new_vect_var;
4705 }
4706
4707 /* Like vect_get_new_vect_var but return an SSA name. */
4708
4709 tree
4710 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
4711 {
4712 const char *prefix;
4713 tree new_vect_var;
4714
4715 switch (var_kind)
4716 {
4717 case vect_simple_var:
4718 prefix = "vect";
4719 break;
4720 case vect_scalar_var:
4721 prefix = "stmp";
4722 break;
4723 case vect_pointer_var:
4724 prefix = "vectp";
4725 break;
4726 default:
4727 gcc_unreachable ();
4728 }
4729
4730 if (name)
4731 {
4732 char* tmp = concat (prefix, "_", name, NULL);
4733 new_vect_var = make_temp_ssa_name (type, NULL, tmp);
4734 free (tmp);
4735 }
4736 else
4737 new_vect_var = make_temp_ssa_name (type, NULL, prefix);
4738
4739 return new_vect_var;
4740 }
4741
4742 /* Duplicate points-to info on NAME from DR_INFO. */
4743
4744 static void
4745 vect_duplicate_ssa_name_ptr_info (tree name, dr_vec_info *dr_info)
4746 {
4747 duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr_info->dr));
4748 /* DR_PTR_INFO is for a base SSA name, not including constant or
4749 variable offsets in the ref so its alignment info does not apply. */
4750 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
4751 }
4752
4753 /* Function vect_create_addr_base_for_vector_ref.
4754
4755 Create an expression that computes the address of the first memory location
4756 that will be accessed for a data reference.
4757
4758 Input:
4759 STMT_INFO: The statement containing the data reference.
4760 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
4761 OFFSET: Optional. If supplied, it is be added to the initial address.
4762 LOOP: Specify relative to which loop-nest should the address be computed.
4763 For example, when the dataref is in an inner-loop nested in an
4764 outer-loop that is now being vectorized, LOOP can be either the
4765 outer-loop, or the inner-loop. The first memory location accessed
4766 by the following dataref ('in' points to short):
4767
4768 for (i=0; i<N; i++)
4769 for (j=0; j<M; j++)
4770 s += in[i+j]
4771
4772 is as follows:
4773 if LOOP=i_loop: &in (relative to i_loop)
4774 if LOOP=j_loop: &in+i*2B (relative to j_loop)
4775
4776 Output:
4777 1. Return an SSA_NAME whose value is the address of the memory location of
4778 the first vector of the data reference.
4779 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
4780 these statement(s) which define the returned SSA_NAME.
4781
4782 FORNOW: We are only handling array accesses with step 1. */
4783
4784 tree
4785 vect_create_addr_base_for_vector_ref (vec_info *vinfo, stmt_vec_info stmt_info,
4786 gimple_seq *new_stmt_list,
4787 tree offset)
4788 {
4789 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
4790 struct data_reference *dr = dr_info->dr;
4791 const char *base_name;
4792 tree addr_base;
4793 tree dest;
4794 gimple_seq seq = NULL;
4795 tree vect_ptr_type;
4796 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
4797 innermost_loop_behavior *drb = vect_dr_behavior (vinfo, dr_info);
4798
4799 tree data_ref_base = unshare_expr (drb->base_address);
4800 tree base_offset = unshare_expr (get_dr_vinfo_offset (vinfo, dr_info, true));
4801 tree init = unshare_expr (drb->init);
4802
4803 if (loop_vinfo)
4804 base_name = get_name (data_ref_base);
4805 else
4806 {
4807 base_offset = ssize_int (0);
4808 init = ssize_int (0);
4809 base_name = get_name (DR_REF (dr));
4810 }
4811
4812 /* Create base_offset */
4813 base_offset = size_binop (PLUS_EXPR,
4814 fold_convert (sizetype, base_offset),
4815 fold_convert (sizetype, init));
4816
4817 if (offset)
4818 {
4819 offset = fold_convert (sizetype, offset);
4820 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4821 base_offset, offset);
4822 }
4823
4824 /* base + base_offset */
4825 if (loop_vinfo)
4826 addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
4827 else
4828 {
4829 addr_base = build1 (ADDR_EXPR,
4830 build_pointer_type (TREE_TYPE (DR_REF (dr))),
4831 unshare_expr (DR_REF (dr)));
4832 }
4833
4834 vect_ptr_type = build_pointer_type (TREE_TYPE (DR_REF (dr)));
4835 dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
4836 addr_base = force_gimple_operand (addr_base, &seq, true, dest);
4837 gimple_seq_add_seq (new_stmt_list, seq);
4838
4839 if (DR_PTR_INFO (dr)
4840 && TREE_CODE (addr_base) == SSA_NAME
4841 /* We should only duplicate pointer info to newly created SSA names. */
4842 && SSA_NAME_VAR (addr_base) == dest)
4843 {
4844 gcc_assert (!SSA_NAME_PTR_INFO (addr_base));
4845 vect_duplicate_ssa_name_ptr_info (addr_base, dr_info);
4846 }
4847
4848 if (dump_enabled_p ())
4849 dump_printf_loc (MSG_NOTE, vect_location, "created %T\n", addr_base);
4850
4851 return addr_base;
4852 }
4853
4854
4855 /* Function vect_create_data_ref_ptr.
4856
4857 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
4858 location accessed in the loop by STMT_INFO, along with the def-use update
4859 chain to appropriately advance the pointer through the loop iterations.
4860 Also set aliasing information for the pointer. This pointer is used by
4861 the callers to this function to create a memory reference expression for
4862 vector load/store access.
4863
4864 Input:
4865 1. STMT_INFO: a stmt that references memory. Expected to be of the form
4866 GIMPLE_ASSIGN <name, data-ref> or
4867 GIMPLE_ASSIGN <data-ref, name>.
4868 2. AGGR_TYPE: the type of the reference, which should be either a vector
4869 or an array.
4870 3. AT_LOOP: the loop where the vector memref is to be created.
4871 4. OFFSET (optional): a byte offset to be added to the initial address
4872 accessed by the data-ref in STMT_INFO.
4873 5. BSI: location where the new stmts are to be placed if there is no loop
4874 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
4875 pointing to the initial address.
4876 8. IV_STEP (optional, defaults to NULL): the amount that should be added
4877 to the IV during each iteration of the loop. NULL says to move
4878 by one copy of AGGR_TYPE up or down, depending on the step of the
4879 data reference.
4880
4881 Output:
4882 1. Declare a new ptr to vector_type, and have it point to the base of the
4883 data reference (initial addressed accessed by the data reference).
4884 For example, for vector of type V8HI, the following code is generated:
4885
4886 v8hi *ap;
4887 ap = (v8hi *)initial_address;
4888
4889 if OFFSET is not supplied:
4890 initial_address = &a[init];
4891 if OFFSET is supplied:
4892 initial_address = &a[init] + OFFSET;
4893 if BYTE_OFFSET is supplied:
4894 initial_address = &a[init] + BYTE_OFFSET;
4895
4896 Return the initial_address in INITIAL_ADDRESS.
4897
4898 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
4899 update the pointer in each iteration of the loop.
4900
4901 Return the increment stmt that updates the pointer in PTR_INCR.
4902
4903 3. Return the pointer. */
4904
4905 tree
4906 vect_create_data_ref_ptr (vec_info *vinfo, stmt_vec_info stmt_info,
4907 tree aggr_type, class loop *at_loop, tree offset,
4908 tree *initial_address, gimple_stmt_iterator *gsi,
4909 gimple **ptr_incr, bool only_init,
4910 tree iv_step)
4911 {
4912 const char *base_name;
4913 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
4914 class loop *loop = NULL;
4915 bool nested_in_vect_loop = false;
4916 class loop *containing_loop = NULL;
4917 tree aggr_ptr_type;
4918 tree aggr_ptr;
4919 tree new_temp;
4920 gimple_seq new_stmt_list = NULL;
4921 edge pe = NULL;
4922 basic_block new_bb;
4923 tree aggr_ptr_init;
4924 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
4925 struct data_reference *dr = dr_info->dr;
4926 tree aptr;
4927 gimple_stmt_iterator incr_gsi;
4928 bool insert_after;
4929 tree indx_before_incr, indx_after_incr;
4930 gimple *incr;
4931 bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo);
4932
4933 gcc_assert (iv_step != NULL_TREE
4934 || TREE_CODE (aggr_type) == ARRAY_TYPE
4935 || TREE_CODE (aggr_type) == VECTOR_TYPE);
4936
4937 if (loop_vinfo)
4938 {
4939 loop = LOOP_VINFO_LOOP (loop_vinfo);
4940 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info);
4941 containing_loop = (gimple_bb (stmt_info->stmt))->loop_father;
4942 pe = loop_preheader_edge (loop);
4943 }
4944 else
4945 {
4946 gcc_assert (bb_vinfo);
4947 only_init = true;
4948 *ptr_incr = NULL;
4949 }
4950
4951 /* Create an expression for the first address accessed by this load
4952 in LOOP. */
4953 base_name = get_name (DR_BASE_ADDRESS (dr));
4954
4955 if (dump_enabled_p ())
4956 {
4957 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4958 dump_printf_loc (MSG_NOTE, vect_location,
4959 "create %s-pointer variable to type: %T",
4960 get_tree_code_name (TREE_CODE (aggr_type)),
4961 aggr_type);
4962 if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4963 dump_printf (MSG_NOTE, " vectorizing an array ref: ");
4964 else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4965 dump_printf (MSG_NOTE, " vectorizing a vector ref: ");
4966 else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4967 dump_printf (MSG_NOTE, " vectorizing a record based array ref: ");
4968 else
4969 dump_printf (MSG_NOTE, " vectorizing a pointer ref: ");
4970 dump_printf (MSG_NOTE, "%T\n", DR_BASE_OBJECT (dr));
4971 }
4972
4973 /* (1) Create the new aggregate-pointer variable.
4974 Vector and array types inherit the alias set of their component
4975 type by default so we need to use a ref-all pointer if the data
4976 reference does not conflict with the created aggregated data
4977 reference because it is not addressable. */
4978 bool need_ref_all = false;
4979 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4980 get_alias_set (DR_REF (dr))))
4981 need_ref_all = true;
4982 /* Likewise for any of the data references in the stmt group. */
4983 else if (DR_GROUP_SIZE (stmt_info) > 1)
4984 {
4985 stmt_vec_info sinfo = DR_GROUP_FIRST_ELEMENT (stmt_info);
4986 do
4987 {
4988 struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
4989 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4990 get_alias_set (DR_REF (sdr))))
4991 {
4992 need_ref_all = true;
4993 break;
4994 }
4995 sinfo = DR_GROUP_NEXT_ELEMENT (sinfo);
4996 }
4997 while (sinfo);
4998 }
4999 aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
5000 need_ref_all);
5001 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
5002
5003
5004 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
5005 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
5006 def-use update cycles for the pointer: one relative to the outer-loop
5007 (LOOP), which is what steps (3) and (4) below do. The other is relative
5008 to the inner-loop (which is the inner-most loop containing the dataref),
5009 and this is done be step (5) below.
5010
5011 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
5012 inner-most loop, and so steps (3),(4) work the same, and step (5) is
5013 redundant. Steps (3),(4) create the following:
5014
5015 vp0 = &base_addr;
5016 LOOP: vp1 = phi(vp0,vp2)
5017 ...
5018 ...
5019 vp2 = vp1 + step
5020 goto LOOP
5021
5022 If there is an inner-loop nested in loop, then step (5) will also be
5023 applied, and an additional update in the inner-loop will be created:
5024
5025 vp0 = &base_addr;
5026 LOOP: vp1 = phi(vp0,vp2)
5027 ...
5028 inner: vp3 = phi(vp1,vp4)
5029 vp4 = vp3 + inner_step
5030 if () goto inner
5031 ...
5032 vp2 = vp1 + step
5033 if () goto LOOP */
5034
5035 /* (2) Calculate the initial address of the aggregate-pointer, and set
5036 the aggregate-pointer to point to it before the loop. */
5037
5038 /* Create: (&(base[init_val]+offset) in the loop preheader. */
5039
5040 new_temp = vect_create_addr_base_for_vector_ref (vinfo,
5041 stmt_info, &new_stmt_list,
5042 offset);
5043 if (new_stmt_list)
5044 {
5045 if (pe)
5046 {
5047 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
5048 gcc_assert (!new_bb);
5049 }
5050 else
5051 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
5052 }
5053
5054 *initial_address = new_temp;
5055 aggr_ptr_init = new_temp;
5056
5057 /* (3) Handle the updating of the aggregate-pointer inside the loop.
5058 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
5059 inner-loop nested in LOOP (during outer-loop vectorization). */
5060
5061 /* No update in loop is required. */
5062 if (only_init && (!loop_vinfo || at_loop == loop))
5063 aptr = aggr_ptr_init;
5064 else
5065 {
5066 /* Accesses to invariant addresses should be handled specially
5067 by the caller. */
5068 tree step = vect_dr_behavior (vinfo, dr_info)->step;
5069 gcc_assert (!integer_zerop (step));
5070
5071 if (iv_step == NULL_TREE)
5072 {
5073 /* The step of the aggregate pointer is the type size,
5074 negated for downward accesses. */
5075 iv_step = TYPE_SIZE_UNIT (aggr_type);
5076 if (tree_int_cst_sgn (step) == -1)
5077 iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
5078 }
5079
5080 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
5081
5082 create_iv (aggr_ptr_init,
5083 fold_convert (aggr_ptr_type, iv_step),
5084 aggr_ptr, loop, &incr_gsi, insert_after,
5085 &indx_before_incr, &indx_after_incr);
5086 incr = gsi_stmt (incr_gsi);
5087
5088 /* Copy the points-to information if it exists. */
5089 if (DR_PTR_INFO (dr))
5090 {
5091 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr_info);
5092 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr_info);
5093 }
5094 if (ptr_incr)
5095 *ptr_incr = incr;
5096
5097 aptr = indx_before_incr;
5098 }
5099
5100 if (!nested_in_vect_loop || only_init)
5101 return aptr;
5102
5103
5104 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
5105 nested in LOOP, if exists. */
5106
5107 gcc_assert (nested_in_vect_loop);
5108 if (!only_init)
5109 {
5110 standard_iv_increment_position (containing_loop, &incr_gsi,
5111 &insert_after);
5112 create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
5113 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
5114 &indx_after_incr);
5115 incr = gsi_stmt (incr_gsi);
5116
5117 /* Copy the points-to information if it exists. */
5118 if (DR_PTR_INFO (dr))
5119 {
5120 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr_info);
5121 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr_info);
5122 }
5123 if (ptr_incr)
5124 *ptr_incr = incr;
5125
5126 return indx_before_incr;
5127 }
5128 else
5129 gcc_unreachable ();
5130 }
5131
5132
5133 /* Function bump_vector_ptr
5134
5135 Increment a pointer (to a vector type) by vector-size. If requested,
5136 i.e. if PTR-INCR is given, then also connect the new increment stmt
5137 to the existing def-use update-chain of the pointer, by modifying
5138 the PTR_INCR as illustrated below:
5139
5140 The pointer def-use update-chain before this function:
5141 DATAREF_PTR = phi (p_0, p_2)
5142 ....
5143 PTR_INCR: p_2 = DATAREF_PTR + step
5144
5145 The pointer def-use update-chain after this function:
5146 DATAREF_PTR = phi (p_0, p_2)
5147 ....
5148 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
5149 ....
5150 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
5151
5152 Input:
5153 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
5154 in the loop.
5155 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
5156 the loop. The increment amount across iterations is expected
5157 to be vector_size.
5158 BSI - location where the new update stmt is to be placed.
5159 STMT_INFO - the original scalar memory-access stmt that is being vectorized.
5160 BUMP - optional. The offset by which to bump the pointer. If not given,
5161 the offset is assumed to be vector_size.
5162
5163 Output: Return NEW_DATAREF_PTR as illustrated above.
5164
5165 */
5166
5167 tree
5168 bump_vector_ptr (vec_info *vinfo,
5169 tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
5170 stmt_vec_info stmt_info, tree bump)
5171 {
5172 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
5173 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5174 tree update = TYPE_SIZE_UNIT (vectype);
5175 gimple *incr_stmt;
5176 ssa_op_iter iter;
5177 use_operand_p use_p;
5178 tree new_dataref_ptr;
5179
5180 if (bump)
5181 update = bump;
5182
5183 if (TREE_CODE (dataref_ptr) == SSA_NAME)
5184 new_dataref_ptr = copy_ssa_name (dataref_ptr);
5185 else
5186 new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
5187 incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
5188 dataref_ptr, update);
5189 vect_finish_stmt_generation (vinfo, stmt_info, incr_stmt, gsi);
5190 /* Fold the increment, avoiding excessive chains use-def chains of
5191 those, leading to compile-time issues for passes until the next
5192 forwprop pass which would do this as well. */
5193 gimple_stmt_iterator fold_gsi = gsi_for_stmt (incr_stmt);
5194 if (fold_stmt (&fold_gsi, follow_all_ssa_edges))
5195 {
5196 incr_stmt = gsi_stmt (fold_gsi);
5197 update_stmt (incr_stmt);
5198 }
5199
5200 /* Copy the points-to information if it exists. */
5201 if (DR_PTR_INFO (dr))
5202 {
5203 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
5204 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
5205 }
5206
5207 if (!ptr_incr)
5208 return new_dataref_ptr;
5209
5210 /* Update the vector-pointer's cross-iteration increment. */
5211 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
5212 {
5213 tree use = USE_FROM_PTR (use_p);
5214
5215 if (use == dataref_ptr)
5216 SET_USE (use_p, new_dataref_ptr);
5217 else
5218 gcc_assert (operand_equal_p (use, update, 0));
5219 }
5220
5221 return new_dataref_ptr;
5222 }
5223
5224
5225 /* Copy memory reference info such as base/clique from the SRC reference
5226 to the DEST MEM_REF. */
5227
5228 void
5229 vect_copy_ref_info (tree dest, tree src)
5230 {
5231 if (TREE_CODE (dest) != MEM_REF)
5232 return;
5233
5234 tree src_base = src;
5235 while (handled_component_p (src_base))
5236 src_base = TREE_OPERAND (src_base, 0);
5237 if (TREE_CODE (src_base) != MEM_REF
5238 && TREE_CODE (src_base) != TARGET_MEM_REF)
5239 return;
5240
5241 MR_DEPENDENCE_CLIQUE (dest) = MR_DEPENDENCE_CLIQUE (src_base);
5242 MR_DEPENDENCE_BASE (dest) = MR_DEPENDENCE_BASE (src_base);
5243 }
5244
5245
5246 /* Function vect_create_destination_var.
5247
5248 Create a new temporary of type VECTYPE. */
5249
5250 tree
5251 vect_create_destination_var (tree scalar_dest, tree vectype)
5252 {
5253 tree vec_dest;
5254 const char *name;
5255 char *new_name;
5256 tree type;
5257 enum vect_var_kind kind;
5258
5259 kind = vectype
5260 ? VECTOR_BOOLEAN_TYPE_P (vectype)
5261 ? vect_mask_var
5262 : vect_simple_var
5263 : vect_scalar_var;
5264 type = vectype ? vectype : TREE_TYPE (scalar_dest);
5265
5266 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
5267
5268 name = get_name (scalar_dest);
5269 if (name)
5270 new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
5271 else
5272 new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
5273 vec_dest = vect_get_new_vect_var (type, kind, new_name);
5274 free (new_name);
5275
5276 return vec_dest;
5277 }
5278
5279 /* Function vect_grouped_store_supported.
5280
5281 Returns TRUE if interleave high and interleave low permutations
5282 are supported, and FALSE otherwise. */
5283
5284 bool
5285 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
5286 {
5287 machine_mode mode = TYPE_MODE (vectype);
5288
5289 /* vect_permute_store_chain requires the group size to be equal to 3 or
5290 be a power of two. */
5291 if (count != 3 && exact_log2 (count) == -1)
5292 {
5293 if (dump_enabled_p ())
5294 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5295 "the size of the group of accesses"
5296 " is not a power of 2 or not eqaul to 3\n");
5297 return false;
5298 }
5299
5300 /* Check that the permutation is supported. */
5301 if (VECTOR_MODE_P (mode))
5302 {
5303 unsigned int i;
5304 if (count == 3)
5305 {
5306 unsigned int j0 = 0, j1 = 0, j2 = 0;
5307 unsigned int i, j;
5308
5309 unsigned int nelt;
5310 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5311 {
5312 if (dump_enabled_p ())
5313 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5314 "cannot handle groups of 3 stores for"
5315 " variable-length vectors\n");
5316 return false;
5317 }
5318
5319 vec_perm_builder sel (nelt, nelt, 1);
5320 sel.quick_grow (nelt);
5321 vec_perm_indices indices;
5322 for (j = 0; j < 3; j++)
5323 {
5324 int nelt0 = ((3 - j) * nelt) % 3;
5325 int nelt1 = ((3 - j) * nelt + 1) % 3;
5326 int nelt2 = ((3 - j) * nelt + 2) % 3;
5327 for (i = 0; i < nelt; i++)
5328 {
5329 if (3 * i + nelt0 < nelt)
5330 sel[3 * i + nelt0] = j0++;
5331 if (3 * i + nelt1 < nelt)
5332 sel[3 * i + nelt1] = nelt + j1++;
5333 if (3 * i + nelt2 < nelt)
5334 sel[3 * i + nelt2] = 0;
5335 }
5336 indices.new_vector (sel, 2, nelt);
5337 if (!can_vec_perm_const_p (mode, indices))
5338 {
5339 if (dump_enabled_p ())
5340 dump_printf (MSG_MISSED_OPTIMIZATION,
5341 "permutation op not supported by target.\n");
5342 return false;
5343 }
5344
5345 for (i = 0; i < nelt; i++)
5346 {
5347 if (3 * i + nelt0 < nelt)
5348 sel[3 * i + nelt0] = 3 * i + nelt0;
5349 if (3 * i + nelt1 < nelt)
5350 sel[3 * i + nelt1] = 3 * i + nelt1;
5351 if (3 * i + nelt2 < nelt)
5352 sel[3 * i + nelt2] = nelt + j2++;
5353 }
5354 indices.new_vector (sel, 2, nelt);
5355 if (!can_vec_perm_const_p (mode, indices))
5356 {
5357 if (dump_enabled_p ())
5358 dump_printf (MSG_MISSED_OPTIMIZATION,
5359 "permutation op not supported by target.\n");
5360 return false;
5361 }
5362 }
5363 return true;
5364 }
5365 else
5366 {
5367 /* If length is not equal to 3 then only power of 2 is supported. */
5368 gcc_assert (pow2p_hwi (count));
5369 poly_uint64 nelt = GET_MODE_NUNITS (mode);
5370
5371 /* The encoding has 2 interleaved stepped patterns. */
5372 vec_perm_builder sel (nelt, 2, 3);
5373 sel.quick_grow (6);
5374 for (i = 0; i < 3; i++)
5375 {
5376 sel[i * 2] = i;
5377 sel[i * 2 + 1] = i + nelt;
5378 }
5379 vec_perm_indices indices (sel, 2, nelt);
5380 if (can_vec_perm_const_p (mode, indices))
5381 {
5382 for (i = 0; i < 6; i++)
5383 sel[i] += exact_div (nelt, 2);
5384 indices.new_vector (sel, 2, nelt);
5385 if (can_vec_perm_const_p (mode, indices))
5386 return true;
5387 }
5388 }
5389 }
5390
5391 if (dump_enabled_p ())
5392 dump_printf (MSG_MISSED_OPTIMIZATION,
5393 "permutation op not supported by target.\n");
5394 return false;
5395 }
5396
5397
5398 /* Return TRUE if vec_{mask_}store_lanes is available for COUNT vectors of
5399 type VECTYPE. MASKED_P says whether the masked form is needed. */
5400
5401 bool
5402 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5403 bool masked_p)
5404 {
5405 if (masked_p)
5406 return vect_lanes_optab_supported_p ("vec_mask_store_lanes",
5407 vec_mask_store_lanes_optab,
5408 vectype, count);
5409 else
5410 return vect_lanes_optab_supported_p ("vec_store_lanes",
5411 vec_store_lanes_optab,
5412 vectype, count);
5413 }
5414
5415
5416 /* Function vect_permute_store_chain.
5417
5418 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
5419 a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
5420 the data correctly for the stores. Return the final references for stores
5421 in RESULT_CHAIN.
5422
5423 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5424 The input is 4 vectors each containing 8 elements. We assign a number to
5425 each element, the input sequence is:
5426
5427 1st vec: 0 1 2 3 4 5 6 7
5428 2nd vec: 8 9 10 11 12 13 14 15
5429 3rd vec: 16 17 18 19 20 21 22 23
5430 4th vec: 24 25 26 27 28 29 30 31
5431
5432 The output sequence should be:
5433
5434 1st vec: 0 8 16 24 1 9 17 25
5435 2nd vec: 2 10 18 26 3 11 19 27
5436 3rd vec: 4 12 20 28 5 13 21 30
5437 4th vec: 6 14 22 30 7 15 23 31
5438
5439 i.e., we interleave the contents of the four vectors in their order.
5440
5441 We use interleave_high/low instructions to create such output. The input of
5442 each interleave_high/low operation is two vectors:
5443 1st vec 2nd vec
5444 0 1 2 3 4 5 6 7
5445 the even elements of the result vector are obtained left-to-right from the
5446 high/low elements of the first vector. The odd elements of the result are
5447 obtained left-to-right from the high/low elements of the second vector.
5448 The output of interleave_high will be: 0 4 1 5
5449 and of interleave_low: 2 6 3 7
5450
5451
5452 The permutation is done in log LENGTH stages. In each stage interleave_high
5453 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
5454 where the first argument is taken from the first half of DR_CHAIN and the
5455 second argument from it's second half.
5456 In our example,
5457
5458 I1: interleave_high (1st vec, 3rd vec)
5459 I2: interleave_low (1st vec, 3rd vec)
5460 I3: interleave_high (2nd vec, 4th vec)
5461 I4: interleave_low (2nd vec, 4th vec)
5462
5463 The output for the first stage is:
5464
5465 I1: 0 16 1 17 2 18 3 19
5466 I2: 4 20 5 21 6 22 7 23
5467 I3: 8 24 9 25 10 26 11 27
5468 I4: 12 28 13 29 14 30 15 31
5469
5470 The output of the second stage, i.e. the final result is:
5471
5472 I1: 0 8 16 24 1 9 17 25
5473 I2: 2 10 18 26 3 11 19 27
5474 I3: 4 12 20 28 5 13 21 30
5475 I4: 6 14 22 30 7 15 23 31. */
5476
5477 void
5478 vect_permute_store_chain (vec_info *vinfo, vec<tree> &dr_chain,
5479 unsigned int length,
5480 stmt_vec_info stmt_info,
5481 gimple_stmt_iterator *gsi,
5482 vec<tree> *result_chain)
5483 {
5484 tree vect1, vect2, high, low;
5485 gimple *perm_stmt;
5486 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5487 tree perm_mask_low, perm_mask_high;
5488 tree data_ref;
5489 tree perm3_mask_low, perm3_mask_high;
5490 unsigned int i, j, n, log_length = exact_log2 (length);
5491
5492 result_chain->quick_grow (length);
5493 memcpy (result_chain->address (), dr_chain.address (),
5494 length * sizeof (tree));
5495
5496 if (length == 3)
5497 {
5498 /* vect_grouped_store_supported ensures that this is constant. */
5499 unsigned int nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5500 unsigned int j0 = 0, j1 = 0, j2 = 0;
5501
5502 vec_perm_builder sel (nelt, nelt, 1);
5503 sel.quick_grow (nelt);
5504 vec_perm_indices indices;
5505 for (j = 0; j < 3; j++)
5506 {
5507 int nelt0 = ((3 - j) * nelt) % 3;
5508 int nelt1 = ((3 - j) * nelt + 1) % 3;
5509 int nelt2 = ((3 - j) * nelt + 2) % 3;
5510
5511 for (i = 0; i < nelt; i++)
5512 {
5513 if (3 * i + nelt0 < nelt)
5514 sel[3 * i + nelt0] = j0++;
5515 if (3 * i + nelt1 < nelt)
5516 sel[3 * i + nelt1] = nelt + j1++;
5517 if (3 * i + nelt2 < nelt)
5518 sel[3 * i + nelt2] = 0;
5519 }
5520 indices.new_vector (sel, 2, nelt);
5521 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5522
5523 for (i = 0; i < nelt; i++)
5524 {
5525 if (3 * i + nelt0 < nelt)
5526 sel[3 * i + nelt0] = 3 * i + nelt0;
5527 if (3 * i + nelt1 < nelt)
5528 sel[3 * i + nelt1] = 3 * i + nelt1;
5529 if (3 * i + nelt2 < nelt)
5530 sel[3 * i + nelt2] = nelt + j2++;
5531 }
5532 indices.new_vector (sel, 2, nelt);
5533 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5534
5535 vect1 = dr_chain[0];
5536 vect2 = dr_chain[1];
5537
5538 /* Create interleaving stmt:
5539 low = VEC_PERM_EXPR <vect1, vect2,
5540 {j, nelt, *, j + 1, nelt + j + 1, *,
5541 j + 2, nelt + j + 2, *, ...}> */
5542 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5543 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5544 vect2, perm3_mask_low);
5545 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5546
5547 vect1 = data_ref;
5548 vect2 = dr_chain[2];
5549 /* Create interleaving stmt:
5550 low = VEC_PERM_EXPR <vect1, vect2,
5551 {0, 1, nelt + j, 3, 4, nelt + j + 1,
5552 6, 7, nelt + j + 2, ...}> */
5553 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5554 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5555 vect2, perm3_mask_high);
5556 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5557 (*result_chain)[j] = data_ref;
5558 }
5559 }
5560 else
5561 {
5562 /* If length is not equal to 3 then only power of 2 is supported. */
5563 gcc_assert (pow2p_hwi (length));
5564
5565 /* The encoding has 2 interleaved stepped patterns. */
5566 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5567 vec_perm_builder sel (nelt, 2, 3);
5568 sel.quick_grow (6);
5569 for (i = 0; i < 3; i++)
5570 {
5571 sel[i * 2] = i;
5572 sel[i * 2 + 1] = i + nelt;
5573 }
5574 vec_perm_indices indices (sel, 2, nelt);
5575 perm_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5576
5577 for (i = 0; i < 6; i++)
5578 sel[i] += exact_div (nelt, 2);
5579 indices.new_vector (sel, 2, nelt);
5580 perm_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5581
5582 for (i = 0, n = log_length; i < n; i++)
5583 {
5584 for (j = 0; j < length/2; j++)
5585 {
5586 vect1 = dr_chain[j];
5587 vect2 = dr_chain[j+length/2];
5588
5589 /* Create interleaving stmt:
5590 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
5591 ...}> */
5592 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
5593 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
5594 vect2, perm_mask_high);
5595 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5596 (*result_chain)[2*j] = high;
5597
5598 /* Create interleaving stmt:
5599 low = VEC_PERM_EXPR <vect1, vect2,
5600 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
5601 ...}> */
5602 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
5603 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
5604 vect2, perm_mask_low);
5605 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5606 (*result_chain)[2*j+1] = low;
5607 }
5608 memcpy (dr_chain.address (), result_chain->address (),
5609 length * sizeof (tree));
5610 }
5611 }
5612 }
5613
5614 /* Function vect_setup_realignment
5615
5616 This function is called when vectorizing an unaligned load using
5617 the dr_explicit_realign[_optimized] scheme.
5618 This function generates the following code at the loop prolog:
5619
5620 p = initial_addr;
5621 x msq_init = *(floor(p)); # prolog load
5622 realignment_token = call target_builtin;
5623 loop:
5624 x msq = phi (msq_init, ---)
5625
5626 The stmts marked with x are generated only for the case of
5627 dr_explicit_realign_optimized.
5628
5629 The code above sets up a new (vector) pointer, pointing to the first
5630 location accessed by STMT_INFO, and a "floor-aligned" load using that
5631 pointer. It also generates code to compute the "realignment-token"
5632 (if the relevant target hook was defined), and creates a phi-node at the
5633 loop-header bb whose arguments are the result of the prolog-load (created
5634 by this function) and the result of a load that takes place in the loop
5635 (to be created by the caller to this function).
5636
5637 For the case of dr_explicit_realign_optimized:
5638 The caller to this function uses the phi-result (msq) to create the
5639 realignment code inside the loop, and sets up the missing phi argument,
5640 as follows:
5641 loop:
5642 msq = phi (msq_init, lsq)
5643 lsq = *(floor(p')); # load in loop
5644 result = realign_load (msq, lsq, realignment_token);
5645
5646 For the case of dr_explicit_realign:
5647 loop:
5648 msq = *(floor(p)); # load in loop
5649 p' = p + (VS-1);
5650 lsq = *(floor(p')); # load in loop
5651 result = realign_load (msq, lsq, realignment_token);
5652
5653 Input:
5654 STMT_INFO - (scalar) load stmt to be vectorized. This load accesses
5655 a memory location that may be unaligned.
5656 GSI - place where new code is to be inserted.
5657 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
5658 is used.
5659
5660 Output:
5661 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
5662 target hook, if defined.
5663 Return value - the result of the loop-header phi node. */
5664
5665 tree
5666 vect_setup_realignment (vec_info *vinfo, stmt_vec_info stmt_info,
5667 gimple_stmt_iterator *gsi, tree *realignment_token,
5668 enum dr_alignment_support alignment_support_scheme,
5669 tree init_addr,
5670 class loop **at_loop)
5671 {
5672 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5673 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
5674 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
5675 struct data_reference *dr = dr_info->dr;
5676 class loop *loop = NULL;
5677 edge pe = NULL;
5678 tree scalar_dest = gimple_assign_lhs (stmt_info->stmt);
5679 tree vec_dest;
5680 gimple *inc;
5681 tree ptr;
5682 tree data_ref;
5683 basic_block new_bb;
5684 tree msq_init = NULL_TREE;
5685 tree new_temp;
5686 gphi *phi_stmt;
5687 tree msq = NULL_TREE;
5688 gimple_seq stmts = NULL;
5689 bool compute_in_loop = false;
5690 bool nested_in_vect_loop = false;
5691 class loop *containing_loop = (gimple_bb (stmt_info->stmt))->loop_father;
5692 class loop *loop_for_initial_load = NULL;
5693
5694 if (loop_vinfo)
5695 {
5696 loop = LOOP_VINFO_LOOP (loop_vinfo);
5697 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info);
5698 }
5699
5700 gcc_assert (alignment_support_scheme == dr_explicit_realign
5701 || alignment_support_scheme == dr_explicit_realign_optimized);
5702
5703 /* We need to generate three things:
5704 1. the misalignment computation
5705 2. the extra vector load (for the optimized realignment scheme).
5706 3. the phi node for the two vectors from which the realignment is
5707 done (for the optimized realignment scheme). */
5708
5709 /* 1. Determine where to generate the misalignment computation.
5710
5711 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
5712 calculation will be generated by this function, outside the loop (in the
5713 preheader). Otherwise, INIT_ADDR had already been computed for us by the
5714 caller, inside the loop.
5715
5716 Background: If the misalignment remains fixed throughout the iterations of
5717 the loop, then both realignment schemes are applicable, and also the
5718 misalignment computation can be done outside LOOP. This is because we are
5719 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
5720 are a multiple of VS (the Vector Size), and therefore the misalignment in
5721 different vectorized LOOP iterations is always the same.
5722 The problem arises only if the memory access is in an inner-loop nested
5723 inside LOOP, which is now being vectorized using outer-loop vectorization.
5724 This is the only case when the misalignment of the memory access may not
5725 remain fixed throughout the iterations of the inner-loop (as explained in
5726 detail in vect_supportable_dr_alignment). In this case, not only is the
5727 optimized realignment scheme not applicable, but also the misalignment
5728 computation (and generation of the realignment token that is passed to
5729 REALIGN_LOAD) have to be done inside the loop.
5730
5731 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
5732 or not, which in turn determines if the misalignment is computed inside
5733 the inner-loop, or outside LOOP. */
5734
5735 if (init_addr != NULL_TREE || !loop_vinfo)
5736 {
5737 compute_in_loop = true;
5738 gcc_assert (alignment_support_scheme == dr_explicit_realign);
5739 }
5740
5741
5742 /* 2. Determine where to generate the extra vector load.
5743
5744 For the optimized realignment scheme, instead of generating two vector
5745 loads in each iteration, we generate a single extra vector load in the
5746 preheader of the loop, and in each iteration reuse the result of the
5747 vector load from the previous iteration. In case the memory access is in
5748 an inner-loop nested inside LOOP, which is now being vectorized using
5749 outer-loop vectorization, we need to determine whether this initial vector
5750 load should be generated at the preheader of the inner-loop, or can be
5751 generated at the preheader of LOOP. If the memory access has no evolution
5752 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
5753 to be generated inside LOOP (in the preheader of the inner-loop). */
5754
5755 if (nested_in_vect_loop)
5756 {
5757 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
5758 bool invariant_in_outerloop =
5759 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
5760 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
5761 }
5762 else
5763 loop_for_initial_load = loop;
5764 if (at_loop)
5765 *at_loop = loop_for_initial_load;
5766
5767 if (loop_for_initial_load)
5768 pe = loop_preheader_edge (loop_for_initial_load);
5769
5770 /* 3. For the case of the optimized realignment, create the first vector
5771 load at the loop preheader. */
5772
5773 if (alignment_support_scheme == dr_explicit_realign_optimized)
5774 {
5775 /* Create msq_init = *(floor(p1)) in the loop preheader */
5776 gassign *new_stmt;
5777
5778 gcc_assert (!compute_in_loop);
5779 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5780 ptr = vect_create_data_ref_ptr (vinfo, stmt_info, vectype,
5781 loop_for_initial_load, NULL_TREE,
5782 &init_addr, NULL, &inc, true);
5783 if (TREE_CODE (ptr) == SSA_NAME)
5784 new_temp = copy_ssa_name (ptr);
5785 else
5786 new_temp = make_ssa_name (TREE_TYPE (ptr));
5787 poly_uint64 align = DR_TARGET_ALIGNMENT (dr_info);
5788 tree type = TREE_TYPE (ptr);
5789 new_stmt = gimple_build_assign
5790 (new_temp, BIT_AND_EXPR, ptr,
5791 fold_build2 (MINUS_EXPR, type,
5792 build_int_cst (type, 0),
5793 build_int_cst (type, align)));
5794 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5795 gcc_assert (!new_bb);
5796 data_ref
5797 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
5798 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
5799 vect_copy_ref_info (data_ref, DR_REF (dr));
5800 new_stmt = gimple_build_assign (vec_dest, data_ref);
5801 new_temp = make_ssa_name (vec_dest, new_stmt);
5802 gimple_assign_set_lhs (new_stmt, new_temp);
5803 if (pe)
5804 {
5805 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5806 gcc_assert (!new_bb);
5807 }
5808 else
5809 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5810
5811 msq_init = gimple_assign_lhs (new_stmt);
5812 }
5813
5814 /* 4. Create realignment token using a target builtin, if available.
5815 It is done either inside the containing loop, or before LOOP (as
5816 determined above). */
5817
5818 if (targetm.vectorize.builtin_mask_for_load)
5819 {
5820 gcall *new_stmt;
5821 tree builtin_decl;
5822
5823 /* Compute INIT_ADDR - the initial addressed accessed by this memref. */
5824 if (!init_addr)
5825 {
5826 /* Generate the INIT_ADDR computation outside LOOP. */
5827 init_addr = vect_create_addr_base_for_vector_ref (vinfo,
5828 stmt_info, &stmts,
5829 NULL_TREE);
5830 if (loop)
5831 {
5832 pe = loop_preheader_edge (loop);
5833 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
5834 gcc_assert (!new_bb);
5835 }
5836 else
5837 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
5838 }
5839
5840 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
5841 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
5842 vec_dest =
5843 vect_create_destination_var (scalar_dest,
5844 gimple_call_return_type (new_stmt));
5845 new_temp = make_ssa_name (vec_dest, new_stmt);
5846 gimple_call_set_lhs (new_stmt, new_temp);
5847
5848 if (compute_in_loop)
5849 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5850 else
5851 {
5852 /* Generate the misalignment computation outside LOOP. */
5853 pe = loop_preheader_edge (loop);
5854 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5855 gcc_assert (!new_bb);
5856 }
5857
5858 *realignment_token = gimple_call_lhs (new_stmt);
5859
5860 /* The result of the CALL_EXPR to this builtin is determined from
5861 the value of the parameter and no global variables are touched
5862 which makes the builtin a "const" function. Requiring the
5863 builtin to have the "const" attribute makes it unnecessary
5864 to call mark_call_clobbered. */
5865 gcc_assert (TREE_READONLY (builtin_decl));
5866 }
5867
5868 if (alignment_support_scheme == dr_explicit_realign)
5869 return msq;
5870
5871 gcc_assert (!compute_in_loop);
5872 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
5873
5874
5875 /* 5. Create msq = phi <msq_init, lsq> in loop */
5876
5877 pe = loop_preheader_edge (containing_loop);
5878 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5879 msq = make_ssa_name (vec_dest);
5880 phi_stmt = create_phi_node (msq, containing_loop->header);
5881 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
5882
5883 return msq;
5884 }
5885
5886
5887 /* Function vect_grouped_load_supported.
5888
5889 COUNT is the size of the load group (the number of statements plus the
5890 number of gaps). SINGLE_ELEMENT_P is true if there is actually
5891 only one statement, with a gap of COUNT - 1.
5892
5893 Returns true if a suitable permute exists. */
5894
5895 bool
5896 vect_grouped_load_supported (tree vectype, bool single_element_p,
5897 unsigned HOST_WIDE_INT count)
5898 {
5899 machine_mode mode = TYPE_MODE (vectype);
5900
5901 /* If this is single-element interleaving with an element distance
5902 that leaves unused vector loads around punt - we at least create
5903 very sub-optimal code in that case (and blow up memory,
5904 see PR65518). */
5905 if (single_element_p && maybe_gt (count, TYPE_VECTOR_SUBPARTS (vectype)))
5906 {
5907 if (dump_enabled_p ())
5908 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5909 "single-element interleaving not supported "
5910 "for not adjacent vector loads\n");
5911 return false;
5912 }
5913
5914 /* vect_permute_load_chain requires the group size to be equal to 3 or
5915 be a power of two. */
5916 if (count != 3 && exact_log2 (count) == -1)
5917 {
5918 if (dump_enabled_p ())
5919 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5920 "the size of the group of accesses"
5921 " is not a power of 2 or not equal to 3\n");
5922 return false;
5923 }
5924
5925 /* Check that the permutation is supported. */
5926 if (VECTOR_MODE_P (mode))
5927 {
5928 unsigned int i, j;
5929 if (count == 3)
5930 {
5931 unsigned int nelt;
5932 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5933 {
5934 if (dump_enabled_p ())
5935 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5936 "cannot handle groups of 3 loads for"
5937 " variable-length vectors\n");
5938 return false;
5939 }
5940
5941 vec_perm_builder sel (nelt, nelt, 1);
5942 sel.quick_grow (nelt);
5943 vec_perm_indices indices;
5944 unsigned int k;
5945 for (k = 0; k < 3; k++)
5946 {
5947 for (i = 0; i < nelt; i++)
5948 if (3 * i + k < 2 * nelt)
5949 sel[i] = 3 * i + k;
5950 else
5951 sel[i] = 0;
5952 indices.new_vector (sel, 2, nelt);
5953 if (!can_vec_perm_const_p (mode, indices))
5954 {
5955 if (dump_enabled_p ())
5956 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5957 "shuffle of 3 loads is not supported by"
5958 " target\n");
5959 return false;
5960 }
5961 for (i = 0, j = 0; i < nelt; i++)
5962 if (3 * i + k < 2 * nelt)
5963 sel[i] = i;
5964 else
5965 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5966 indices.new_vector (sel, 2, nelt);
5967 if (!can_vec_perm_const_p (mode, indices))
5968 {
5969 if (dump_enabled_p ())
5970 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5971 "shuffle of 3 loads is not supported by"
5972 " target\n");
5973 return false;
5974 }
5975 }
5976 return true;
5977 }
5978 else
5979 {
5980 /* If length is not equal to 3 then only power of 2 is supported. */
5981 gcc_assert (pow2p_hwi (count));
5982 poly_uint64 nelt = GET_MODE_NUNITS (mode);
5983
5984 /* The encoding has a single stepped pattern. */
5985 vec_perm_builder sel (nelt, 1, 3);
5986 sel.quick_grow (3);
5987 for (i = 0; i < 3; i++)
5988 sel[i] = i * 2;
5989 vec_perm_indices indices (sel, 2, nelt);
5990 if (can_vec_perm_const_p (mode, indices))
5991 {
5992 for (i = 0; i < 3; i++)
5993 sel[i] = i * 2 + 1;
5994 indices.new_vector (sel, 2, nelt);
5995 if (can_vec_perm_const_p (mode, indices))
5996 return true;
5997 }
5998 }
5999 }
6000
6001 if (dump_enabled_p ())
6002 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6003 "extract even/odd not supported by target\n");
6004 return false;
6005 }
6006
6007 /* Return TRUE if vec_{masked_}load_lanes is available for COUNT vectors of
6008 type VECTYPE. MASKED_P says whether the masked form is needed. */
6009
6010 bool
6011 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
6012 bool masked_p)
6013 {
6014 if (masked_p)
6015 return vect_lanes_optab_supported_p ("vec_mask_load_lanes",
6016 vec_mask_load_lanes_optab,
6017 vectype, count);
6018 else
6019 return vect_lanes_optab_supported_p ("vec_load_lanes",
6020 vec_load_lanes_optab,
6021 vectype, count);
6022 }
6023
6024 /* Function vect_permute_load_chain.
6025
6026 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
6027 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
6028 the input data correctly. Return the final references for loads in
6029 RESULT_CHAIN.
6030
6031 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
6032 The input is 4 vectors each containing 8 elements. We assign a number to each
6033 element, the input sequence is:
6034
6035 1st vec: 0 1 2 3 4 5 6 7
6036 2nd vec: 8 9 10 11 12 13 14 15
6037 3rd vec: 16 17 18 19 20 21 22 23
6038 4th vec: 24 25 26 27 28 29 30 31
6039
6040 The output sequence should be:
6041
6042 1st vec: 0 4 8 12 16 20 24 28
6043 2nd vec: 1 5 9 13 17 21 25 29
6044 3rd vec: 2 6 10 14 18 22 26 30
6045 4th vec: 3 7 11 15 19 23 27 31
6046
6047 i.e., the first output vector should contain the first elements of each
6048 interleaving group, etc.
6049
6050 We use extract_even/odd instructions to create such output. The input of
6051 each extract_even/odd operation is two vectors
6052 1st vec 2nd vec
6053 0 1 2 3 4 5 6 7
6054
6055 and the output is the vector of extracted even/odd elements. The output of
6056 extract_even will be: 0 2 4 6
6057 and of extract_odd: 1 3 5 7
6058
6059
6060 The permutation is done in log LENGTH stages. In each stage extract_even
6061 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
6062 their order. In our example,
6063
6064 E1: extract_even (1st vec, 2nd vec)
6065 E2: extract_odd (1st vec, 2nd vec)
6066 E3: extract_even (3rd vec, 4th vec)
6067 E4: extract_odd (3rd vec, 4th vec)
6068
6069 The output for the first stage will be:
6070
6071 E1: 0 2 4 6 8 10 12 14
6072 E2: 1 3 5 7 9 11 13 15
6073 E3: 16 18 20 22 24 26 28 30
6074 E4: 17 19 21 23 25 27 29 31
6075
6076 In order to proceed and create the correct sequence for the next stage (or
6077 for the correct output, if the second stage is the last one, as in our
6078 example), we first put the output of extract_even operation and then the
6079 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
6080 The input for the second stage is:
6081
6082 1st vec (E1): 0 2 4 6 8 10 12 14
6083 2nd vec (E3): 16 18 20 22 24 26 28 30
6084 3rd vec (E2): 1 3 5 7 9 11 13 15
6085 4th vec (E4): 17 19 21 23 25 27 29 31
6086
6087 The output of the second stage:
6088
6089 E1: 0 4 8 12 16 20 24 28
6090 E2: 2 6 10 14 18 22 26 30
6091 E3: 1 5 9 13 17 21 25 29
6092 E4: 3 7 11 15 19 23 27 31
6093
6094 And RESULT_CHAIN after reordering:
6095
6096 1st vec (E1): 0 4 8 12 16 20 24 28
6097 2nd vec (E3): 1 5 9 13 17 21 25 29
6098 3rd vec (E2): 2 6 10 14 18 22 26 30
6099 4th vec (E4): 3 7 11 15 19 23 27 31. */
6100
6101 static void
6102 vect_permute_load_chain (vec_info *vinfo, vec<tree> dr_chain,
6103 unsigned int length,
6104 stmt_vec_info stmt_info,
6105 gimple_stmt_iterator *gsi,
6106 vec<tree> *result_chain)
6107 {
6108 tree data_ref, first_vect, second_vect;
6109 tree perm_mask_even, perm_mask_odd;
6110 tree perm3_mask_low, perm3_mask_high;
6111 gimple *perm_stmt;
6112 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6113 unsigned int i, j, log_length = exact_log2 (length);
6114
6115 result_chain->quick_grow (length);
6116 memcpy (result_chain->address (), dr_chain.address (),
6117 length * sizeof (tree));
6118
6119 if (length == 3)
6120 {
6121 /* vect_grouped_load_supported ensures that this is constant. */
6122 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
6123 unsigned int k;
6124
6125 vec_perm_builder sel (nelt, nelt, 1);
6126 sel.quick_grow (nelt);
6127 vec_perm_indices indices;
6128 for (k = 0; k < 3; k++)
6129 {
6130 for (i = 0; i < nelt; i++)
6131 if (3 * i + k < 2 * nelt)
6132 sel[i] = 3 * i + k;
6133 else
6134 sel[i] = 0;
6135 indices.new_vector (sel, 2, nelt);
6136 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
6137
6138 for (i = 0, j = 0; i < nelt; i++)
6139 if (3 * i + k < 2 * nelt)
6140 sel[i] = i;
6141 else
6142 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
6143 indices.new_vector (sel, 2, nelt);
6144 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
6145
6146 first_vect = dr_chain[0];
6147 second_vect = dr_chain[1];
6148
6149 /* Create interleaving stmt (low part of):
6150 low = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
6151 ...}> */
6152 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
6153 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
6154 second_vect, perm3_mask_low);
6155 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6156
6157 /* Create interleaving stmt (high part of):
6158 high = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
6159 ...}> */
6160 first_vect = data_ref;
6161 second_vect = dr_chain[2];
6162 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
6163 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
6164 second_vect, perm3_mask_high);
6165 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6166 (*result_chain)[k] = data_ref;
6167 }
6168 }
6169 else
6170 {
6171 /* If length is not equal to 3 then only power of 2 is supported. */
6172 gcc_assert (pow2p_hwi (length));
6173
6174 /* The encoding has a single stepped pattern. */
6175 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
6176 vec_perm_builder sel (nelt, 1, 3);
6177 sel.quick_grow (3);
6178 for (i = 0; i < 3; ++i)
6179 sel[i] = i * 2;
6180 vec_perm_indices indices (sel, 2, nelt);
6181 perm_mask_even = vect_gen_perm_mask_checked (vectype, indices);
6182
6183 for (i = 0; i < 3; ++i)
6184 sel[i] = i * 2 + 1;
6185 indices.new_vector (sel, 2, nelt);
6186 perm_mask_odd = vect_gen_perm_mask_checked (vectype, indices);
6187
6188 for (i = 0; i < log_length; i++)
6189 {
6190 for (j = 0; j < length; j += 2)
6191 {
6192 first_vect = dr_chain[j];
6193 second_vect = dr_chain[j+1];
6194
6195 /* data_ref = permute_even (first_data_ref, second_data_ref); */
6196 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
6197 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6198 first_vect, second_vect,
6199 perm_mask_even);
6200 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6201 (*result_chain)[j/2] = data_ref;
6202
6203 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
6204 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
6205 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6206 first_vect, second_vect,
6207 perm_mask_odd);
6208 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6209 (*result_chain)[j/2+length/2] = data_ref;
6210 }
6211 memcpy (dr_chain.address (), result_chain->address (),
6212 length * sizeof (tree));
6213 }
6214 }
6215 }
6216
6217 /* Function vect_shift_permute_load_chain.
6218
6219 Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
6220 sequence of stmts to reorder the input data accordingly.
6221 Return the final references for loads in RESULT_CHAIN.
6222 Return true if successful, false otherwise.
6223
6224 E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
6225 The input is 3 vectors each containing 8 elements. We assign a
6226 number to each element, the input sequence is:
6227
6228 1st vec: 0 1 2 3 4 5 6 7
6229 2nd vec: 8 9 10 11 12 13 14 15
6230 3rd vec: 16 17 18 19 20 21 22 23
6231
6232 The output sequence should be:
6233
6234 1st vec: 0 3 6 9 12 15 18 21
6235 2nd vec: 1 4 7 10 13 16 19 22
6236 3rd vec: 2 5 8 11 14 17 20 23
6237
6238 We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
6239
6240 First we shuffle all 3 vectors to get correct elements order:
6241
6242 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
6243 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
6244 3rd vec: (16 19 22) (17 20 23) (18 21)
6245
6246 Next we unite and shift vector 3 times:
6247
6248 1st step:
6249 shift right by 6 the concatenation of:
6250 "1st vec" and "2nd vec"
6251 ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
6252 "2nd vec" and "3rd vec"
6253 ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
6254 "3rd vec" and "1st vec"
6255 (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
6256 | New vectors |
6257
6258 So that now new vectors are:
6259
6260 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
6261 2nd vec: (10 13) (16 19 22) (17 20 23)
6262 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
6263
6264 2nd step:
6265 shift right by 5 the concatenation of:
6266 "1st vec" and "3rd vec"
6267 ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
6268 "2nd vec" and "1st vec"
6269 (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
6270 "3rd vec" and "2nd vec"
6271 (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
6272 | New vectors |
6273
6274 So that now new vectors are:
6275
6276 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
6277 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
6278 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
6279
6280 3rd step:
6281 shift right by 5 the concatenation of:
6282 "1st vec" and "1st vec"
6283 ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
6284 shift right by 3 the concatenation of:
6285 "2nd vec" and "2nd vec"
6286 (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
6287 | New vectors |
6288
6289 So that now all vectors are READY:
6290 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
6291 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
6292 3rd vec: ( 1 4 7) (10 13) (16 19 22)
6293
6294 This algorithm is faster than one in vect_permute_load_chain if:
6295 1. "shift of a concatenation" is faster than general permutation.
6296 This is usually so.
6297 2. The TARGET machine can't execute vector instructions in parallel.
6298 This is because each step of the algorithm depends on previous.
6299 The algorithm in vect_permute_load_chain is much more parallel.
6300
6301 The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
6302 */
6303
6304 static bool
6305 vect_shift_permute_load_chain (vec_info *vinfo, vec<tree> dr_chain,
6306 unsigned int length,
6307 stmt_vec_info stmt_info,
6308 gimple_stmt_iterator *gsi,
6309 vec<tree> *result_chain)
6310 {
6311 tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
6312 tree perm2_mask1, perm2_mask2, perm3_mask;
6313 tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
6314 gimple *perm_stmt;
6315
6316 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6317 unsigned int i;
6318 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
6319
6320 unsigned HOST_WIDE_INT nelt, vf;
6321 if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nelt)
6322 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&vf))
6323 /* Not supported for variable-length vectors. */
6324 return false;
6325
6326 vec_perm_builder sel (nelt, nelt, 1);
6327 sel.quick_grow (nelt);
6328
6329 result_chain->quick_grow (length);
6330 memcpy (result_chain->address (), dr_chain.address (),
6331 length * sizeof (tree));
6332
6333 if (pow2p_hwi (length) && vf > 4)
6334 {
6335 unsigned int j, log_length = exact_log2 (length);
6336 for (i = 0; i < nelt / 2; ++i)
6337 sel[i] = i * 2;
6338 for (i = 0; i < nelt / 2; ++i)
6339 sel[nelt / 2 + i] = i * 2 + 1;
6340 vec_perm_indices indices (sel, 2, nelt);
6341 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6342 {
6343 if (dump_enabled_p ())
6344 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6345 "shuffle of 2 fields structure is not \
6346 supported by target\n");
6347 return false;
6348 }
6349 perm2_mask1 = vect_gen_perm_mask_checked (vectype, indices);
6350
6351 for (i = 0; i < nelt / 2; ++i)
6352 sel[i] = i * 2 + 1;
6353 for (i = 0; i < nelt / 2; ++i)
6354 sel[nelt / 2 + i] = i * 2;
6355 indices.new_vector (sel, 2, nelt);
6356 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6357 {
6358 if (dump_enabled_p ())
6359 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6360 "shuffle of 2 fields structure is not \
6361 supported by target\n");
6362 return false;
6363 }
6364 perm2_mask2 = vect_gen_perm_mask_checked (vectype, indices);
6365
6366 /* Generating permutation constant to shift all elements.
6367 For vector length 8 it is {4 5 6 7 8 9 10 11}. */
6368 for (i = 0; i < nelt; i++)
6369 sel[i] = nelt / 2 + i;
6370 indices.new_vector (sel, 2, nelt);
6371 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6372 {
6373 if (dump_enabled_p ())
6374 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6375 "shift permutation is not supported by target\n");
6376 return false;
6377 }
6378 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6379
6380 /* Generating permutation constant to select vector from 2.
6381 For vector length 8 it is {0 1 2 3 12 13 14 15}. */
6382 for (i = 0; i < nelt / 2; i++)
6383 sel[i] = i;
6384 for (i = nelt / 2; i < nelt; i++)
6385 sel[i] = nelt + i;
6386 indices.new_vector (sel, 2, nelt);
6387 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6388 {
6389 if (dump_enabled_p ())
6390 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6391 "select is not supported by target\n");
6392 return false;
6393 }
6394 select_mask = vect_gen_perm_mask_checked (vectype, indices);
6395
6396 for (i = 0; i < log_length; i++)
6397 {
6398 for (j = 0; j < length; j += 2)
6399 {
6400 first_vect = dr_chain[j];
6401 second_vect = dr_chain[j + 1];
6402
6403 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6404 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6405 first_vect, first_vect,
6406 perm2_mask1);
6407 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6408 vect[0] = data_ref;
6409
6410 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6411 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6412 second_vect, second_vect,
6413 perm2_mask2);
6414 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6415 vect[1] = data_ref;
6416
6417 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
6418 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6419 vect[0], vect[1], shift1_mask);
6420 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6421 (*result_chain)[j/2 + length/2] = data_ref;
6422
6423 data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
6424 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6425 vect[0], vect[1], select_mask);
6426 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6427 (*result_chain)[j/2] = data_ref;
6428 }
6429 memcpy (dr_chain.address (), result_chain->address (),
6430 length * sizeof (tree));
6431 }
6432 return true;
6433 }
6434 if (length == 3 && vf > 2)
6435 {
6436 unsigned int k = 0, l = 0;
6437
6438 /* Generating permutation constant to get all elements in rigth order.
6439 For vector length 8 it is {0 3 6 1 4 7 2 5}. */
6440 for (i = 0; i < nelt; i++)
6441 {
6442 if (3 * k + (l % 3) >= nelt)
6443 {
6444 k = 0;
6445 l += (3 - (nelt % 3));
6446 }
6447 sel[i] = 3 * k + (l % 3);
6448 k++;
6449 }
6450 vec_perm_indices indices (sel, 2, nelt);
6451 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6452 {
6453 if (dump_enabled_p ())
6454 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6455 "shuffle of 3 fields structure is not \
6456 supported by target\n");
6457 return false;
6458 }
6459 perm3_mask = vect_gen_perm_mask_checked (vectype, indices);
6460
6461 /* Generating permutation constant to shift all elements.
6462 For vector length 8 it is {6 7 8 9 10 11 12 13}. */
6463 for (i = 0; i < nelt; i++)
6464 sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
6465 indices.new_vector (sel, 2, nelt);
6466 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6467 {
6468 if (dump_enabled_p ())
6469 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6470 "shift permutation is not supported by target\n");
6471 return false;
6472 }
6473 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6474
6475 /* Generating permutation constant to shift all elements.
6476 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6477 for (i = 0; i < nelt; i++)
6478 sel[i] = 2 * (nelt / 3) + 1 + i;
6479 indices.new_vector (sel, 2, nelt);
6480 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6481 {
6482 if (dump_enabled_p ())
6483 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6484 "shift permutation is not supported by target\n");
6485 return false;
6486 }
6487 shift2_mask = vect_gen_perm_mask_checked (vectype, indices);
6488
6489 /* Generating permutation constant to shift all elements.
6490 For vector length 8 it is {3 4 5 6 7 8 9 10}. */
6491 for (i = 0; i < nelt; i++)
6492 sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
6493 indices.new_vector (sel, 2, nelt);
6494 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6495 {
6496 if (dump_enabled_p ())
6497 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6498 "shift permutation is not supported by target\n");
6499 return false;
6500 }
6501 shift3_mask = vect_gen_perm_mask_checked (vectype, indices);
6502
6503 /* Generating permutation constant to shift all elements.
6504 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6505 for (i = 0; i < nelt; i++)
6506 sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
6507 indices.new_vector (sel, 2, nelt);
6508 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6509 {
6510 if (dump_enabled_p ())
6511 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6512 "shift permutation is not supported by target\n");
6513 return false;
6514 }
6515 shift4_mask = vect_gen_perm_mask_checked (vectype, indices);
6516
6517 for (k = 0; k < 3; k++)
6518 {
6519 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
6520 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6521 dr_chain[k], dr_chain[k],
6522 perm3_mask);
6523 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6524 vect[k] = data_ref;
6525 }
6526
6527 for (k = 0; k < 3; k++)
6528 {
6529 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
6530 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6531 vect[k % 3], vect[(k + 1) % 3],
6532 shift1_mask);
6533 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6534 vect_shift[k] = data_ref;
6535 }
6536
6537 for (k = 0; k < 3; k++)
6538 {
6539 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
6540 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6541 vect_shift[(4 - k) % 3],
6542 vect_shift[(3 - k) % 3],
6543 shift2_mask);
6544 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6545 vect[k] = data_ref;
6546 }
6547
6548 (*result_chain)[3 - (nelt % 3)] = vect[2];
6549
6550 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
6551 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
6552 vect[0], shift3_mask);
6553 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6554 (*result_chain)[nelt % 3] = data_ref;
6555
6556 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
6557 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
6558 vect[1], shift4_mask);
6559 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6560 (*result_chain)[0] = data_ref;
6561 return true;
6562 }
6563 return false;
6564 }
6565
6566 /* Function vect_transform_grouped_load.
6567
6568 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
6569 to perform their permutation and ascribe the result vectorized statements to
6570 the scalar statements.
6571 */
6572
6573 void
6574 vect_transform_grouped_load (vec_info *vinfo, stmt_vec_info stmt_info,
6575 vec<tree> dr_chain,
6576 int size, gimple_stmt_iterator *gsi)
6577 {
6578 machine_mode mode;
6579 vec<tree> result_chain = vNULL;
6580
6581 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
6582 RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
6583 vectors, that are ready for vector computation. */
6584 result_chain.create (size);
6585
6586 /* If reassociation width for vector type is 2 or greater target machine can
6587 execute 2 or more vector instructions in parallel. Otherwise try to
6588 get chain for loads group using vect_shift_permute_load_chain. */
6589 mode = TYPE_MODE (STMT_VINFO_VECTYPE (stmt_info));
6590 if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
6591 || pow2p_hwi (size)
6592 || !vect_shift_permute_load_chain (vinfo, dr_chain, size, stmt_info,
6593 gsi, &result_chain))
6594 vect_permute_load_chain (vinfo, dr_chain,
6595 size, stmt_info, gsi, &result_chain);
6596 vect_record_grouped_load_vectors (vinfo, stmt_info, result_chain);
6597 result_chain.release ();
6598 }
6599
6600 /* RESULT_CHAIN contains the output of a group of grouped loads that were
6601 generated as part of the vectorization of STMT_INFO. Assign the statement
6602 for each vector to the associated scalar statement. */
6603
6604 void
6605 vect_record_grouped_load_vectors (vec_info *, stmt_vec_info stmt_info,
6606 vec<tree> result_chain)
6607 {
6608 stmt_vec_info first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
6609 unsigned int i, gap_count;
6610 tree tmp_data_ref;
6611
6612 /* Put a permuted data-ref in the VECTORIZED_STMT field.
6613 Since we scan the chain starting from it's first node, their order
6614 corresponds the order of data-refs in RESULT_CHAIN. */
6615 stmt_vec_info next_stmt_info = first_stmt_info;
6616 gap_count = 1;
6617 FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
6618 {
6619 if (!next_stmt_info)
6620 break;
6621
6622 /* Skip the gaps. Loads created for the gaps will be removed by dead
6623 code elimination pass later. No need to check for the first stmt in
6624 the group, since it always exists.
6625 DR_GROUP_GAP is the number of steps in elements from the previous
6626 access (if there is no gap DR_GROUP_GAP is 1). We skip loads that
6627 correspond to the gaps. */
6628 if (next_stmt_info != first_stmt_info
6629 && gap_count < DR_GROUP_GAP (next_stmt_info))
6630 {
6631 gap_count++;
6632 continue;
6633 }
6634
6635 /* ??? The following needs cleanup after the removal of
6636 DR_GROUP_SAME_DR_STMT. */
6637 if (next_stmt_info)
6638 {
6639 gimple *new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
6640 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
6641 copies, and we put the new vector statement last. */
6642 STMT_VINFO_VEC_STMTS (next_stmt_info).safe_push (new_stmt);
6643
6644 next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
6645 gap_count = 1;
6646 }
6647 }
6648 }
6649
6650 /* Function vect_force_dr_alignment_p.
6651
6652 Returns whether the alignment of a DECL can be forced to be aligned
6653 on ALIGNMENT bit boundary. */
6654
6655 bool
6656 vect_can_force_dr_alignment_p (const_tree decl, poly_uint64 alignment)
6657 {
6658 if (!VAR_P (decl))
6659 return false;
6660
6661 if (decl_in_symtab_p (decl)
6662 && !symtab_node::get (decl)->can_increase_alignment_p ())
6663 return false;
6664
6665 if (TREE_STATIC (decl))
6666 return (known_le (alignment,
6667 (unsigned HOST_WIDE_INT) MAX_OFILE_ALIGNMENT));
6668 else
6669 return (known_le (alignment, (unsigned HOST_WIDE_INT) MAX_STACK_ALIGNMENT));
6670 }
6671
6672 /* Return whether the data reference DR_INFO is supported with respect to its
6673 alignment.
6674 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
6675 it is aligned, i.e., check if it is possible to vectorize it with different
6676 alignment. */
6677
6678 enum dr_alignment_support
6679 vect_supportable_dr_alignment (vec_info *vinfo, dr_vec_info *dr_info,
6680 tree vectype, int misalignment)
6681 {
6682 data_reference *dr = dr_info->dr;
6683 stmt_vec_info stmt_info = dr_info->stmt;
6684 machine_mode mode = TYPE_MODE (vectype);
6685 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
6686 class loop *vect_loop = NULL;
6687 bool nested_in_vect_loop = false;
6688
6689 if (misalignment == 0)
6690 return dr_aligned;
6691
6692 /* For now assume all conditional loads/stores support unaligned
6693 access without any special code. */
6694 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
6695 if (gimple_call_internal_p (stmt)
6696 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
6697 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
6698 return dr_unaligned_supported;
6699
6700 if (loop_vinfo)
6701 {
6702 vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
6703 nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt_info);
6704 }
6705
6706 /* Possibly unaligned access. */
6707
6708 /* We can choose between using the implicit realignment scheme (generating
6709 a misaligned_move stmt) and the explicit realignment scheme (generating
6710 aligned loads with a REALIGN_LOAD). There are two variants to the
6711 explicit realignment scheme: optimized, and unoptimized.
6712 We can optimize the realignment only if the step between consecutive
6713 vector loads is equal to the vector size. Since the vector memory
6714 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
6715 is guaranteed that the misalignment amount remains the same throughout the
6716 execution of the vectorized loop. Therefore, we can create the
6717 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
6718 at the loop preheader.
6719
6720 However, in the case of outer-loop vectorization, when vectorizing a
6721 memory access in the inner-loop nested within the LOOP that is now being
6722 vectorized, while it is guaranteed that the misalignment of the
6723 vectorized memory access will remain the same in different outer-loop
6724 iterations, it is *not* guaranteed that is will remain the same throughout
6725 the execution of the inner-loop. This is because the inner-loop advances
6726 with the original scalar step (and not in steps of VS). If the inner-loop
6727 step happens to be a multiple of VS, then the misalignment remains fixed
6728 and we can use the optimized realignment scheme. For example:
6729
6730 for (i=0; i<N; i++)
6731 for (j=0; j<M; j++)
6732 s += a[i+j];
6733
6734 When vectorizing the i-loop in the above example, the step between
6735 consecutive vector loads is 1, and so the misalignment does not remain
6736 fixed across the execution of the inner-loop, and the realignment cannot
6737 be optimized (as illustrated in the following pseudo vectorized loop):
6738
6739 for (i=0; i<N; i+=4)
6740 for (j=0; j<M; j++){
6741 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
6742 // when j is {0,1,2,3,4,5,6,7,...} respectively.
6743 // (assuming that we start from an aligned address).
6744 }
6745
6746 We therefore have to use the unoptimized realignment scheme:
6747
6748 for (i=0; i<N; i+=4)
6749 for (j=k; j<M; j+=4)
6750 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
6751 // that the misalignment of the initial address is
6752 // 0).
6753
6754 The loop can then be vectorized as follows:
6755
6756 for (k=0; k<4; k++){
6757 rt = get_realignment_token (&vp[k]);
6758 for (i=0; i<N; i+=4){
6759 v1 = vp[i+k];
6760 for (j=k; j<M; j+=4){
6761 v2 = vp[i+j+VS-1];
6762 va = REALIGN_LOAD <v1,v2,rt>;
6763 vs += va;
6764 v1 = v2;
6765 }
6766 }
6767 } */
6768
6769 if (DR_IS_READ (dr))
6770 {
6771 if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
6772 && (!targetm.vectorize.builtin_mask_for_load
6773 || targetm.vectorize.builtin_mask_for_load ()))
6774 {
6775 /* If we are doing SLP then the accesses need not have the
6776 same alignment, instead it depends on the SLP group size. */
6777 if (loop_vinfo
6778 && STMT_SLP_TYPE (stmt_info)
6779 && !multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6780 * (DR_GROUP_SIZE
6781 (DR_GROUP_FIRST_ELEMENT (stmt_info))),
6782 TYPE_VECTOR_SUBPARTS (vectype)))
6783 ;
6784 else if (!loop_vinfo
6785 || (nested_in_vect_loop
6786 && maybe_ne (TREE_INT_CST_LOW (DR_STEP (dr)),
6787 GET_MODE_SIZE (TYPE_MODE (vectype)))))
6788 return dr_explicit_realign;
6789 else
6790 return dr_explicit_realign_optimized;
6791 }
6792 }
6793
6794 bool is_packed = false;
6795 tree type = TREE_TYPE (DR_REF (dr));
6796 if (misalignment == DR_MISALIGNMENT_UNKNOWN)
6797 is_packed = not_size_aligned (DR_REF (dr));
6798 if (targetm.vectorize.support_vector_misalignment (mode, type, misalignment,
6799 is_packed))
6800 return dr_unaligned_supported;
6801
6802 /* Unsupported. */
6803 return dr_unaligned_unsupported;
6804 }