]> git.ipfire.org Git - thirdparty/gcc.git/blob - gcc/tree-vect-data-refs.c
Update copyright years.
[thirdparty/gcc.git] / gcc / tree-vect-data-refs.c
1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2 Copyright (C) 2003-2022 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "predict.h"
31 #include "memmodel.h"
32 #include "tm_p.h"
33 #include "ssa.h"
34 #include "optabs-tree.h"
35 #include "cgraph.h"
36 #include "dumpfile.h"
37 #include "alias.h"
38 #include "fold-const.h"
39 #include "stor-layout.h"
40 #include "tree-eh.h"
41 #include "gimplify.h"
42 #include "gimple-iterator.h"
43 #include "gimplify-me.h"
44 #include "tree-ssa-loop-ivopts.h"
45 #include "tree-ssa-loop-manip.h"
46 #include "tree-ssa-loop.h"
47 #include "cfgloop.h"
48 #include "tree-scalar-evolution.h"
49 #include "tree-vectorizer.h"
50 #include "expr.h"
51 #include "builtins.h"
52 #include "tree-cfg.h"
53 #include "tree-hash-traits.h"
54 #include "vec-perm-indices.h"
55 #include "internal-fn.h"
56 #include "gimple-fold.h"
57
58 /* Return true if load- or store-lanes optab OPTAB is implemented for
59 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
60
61 static bool
62 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
63 tree vectype, unsigned HOST_WIDE_INT count)
64 {
65 machine_mode mode, array_mode;
66 bool limit_p;
67
68 mode = TYPE_MODE (vectype);
69 if (!targetm.array_mode (mode, count).exists (&array_mode))
70 {
71 poly_uint64 bits = count * GET_MODE_BITSIZE (mode);
72 limit_p = !targetm.array_mode_supported_p (mode, count);
73 if (!int_mode_for_size (bits, limit_p).exists (&array_mode))
74 {
75 if (dump_enabled_p ())
76 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
77 "no array mode for %s[%wu]\n",
78 GET_MODE_NAME (mode), count);
79 return false;
80 }
81 }
82
83 if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
84 {
85 if (dump_enabled_p ())
86 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
87 "cannot use %s<%s><%s>\n", name,
88 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
89 return false;
90 }
91
92 if (dump_enabled_p ())
93 dump_printf_loc (MSG_NOTE, vect_location,
94 "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
95 GET_MODE_NAME (mode));
96
97 return true;
98 }
99
100
101 /* Return the smallest scalar part of STMT_INFO.
102 This is used to determine the vectype of the stmt. We generally set the
103 vectype according to the type of the result (lhs). For stmts whose
104 result-type is different than the type of the arguments (e.g., demotion,
105 promotion), vectype will be reset appropriately (later). Note that we have
106 to visit the smallest datatype in this function, because that determines the
107 VF. If the smallest datatype in the loop is present only as the rhs of a
108 promotion operation - we'd miss it.
109 Such a case, where a variable of this datatype does not appear in the lhs
110 anywhere in the loop, can only occur if it's an invariant: e.g.:
111 'int_x = (int) short_inv', which we'd expect to have been optimized away by
112 invariant motion. However, we cannot rely on invariant motion to always
113 take invariants out of the loop, and so in the case of promotion we also
114 have to check the rhs.
115 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
116 types. */
117
118 tree
119 vect_get_smallest_scalar_type (stmt_vec_info stmt_info, tree scalar_type)
120 {
121 HOST_WIDE_INT lhs, rhs;
122
123 /* During the analysis phase, this function is called on arbitrary
124 statements that might not have scalar results. */
125 if (!tree_fits_uhwi_p (TYPE_SIZE_UNIT (scalar_type)))
126 return scalar_type;
127
128 lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
129
130 gassign *assign = dyn_cast <gassign *> (stmt_info->stmt);
131 if (assign)
132 {
133 scalar_type = TREE_TYPE (gimple_assign_lhs (assign));
134 if (gimple_assign_cast_p (assign)
135 || gimple_assign_rhs_code (assign) == DOT_PROD_EXPR
136 || gimple_assign_rhs_code (assign) == WIDEN_SUM_EXPR
137 || gimple_assign_rhs_code (assign) == WIDEN_MULT_EXPR
138 || gimple_assign_rhs_code (assign) == WIDEN_LSHIFT_EXPR
139 || gimple_assign_rhs_code (assign) == WIDEN_PLUS_EXPR
140 || gimple_assign_rhs_code (assign) == WIDEN_MINUS_EXPR
141 || gimple_assign_rhs_code (assign) == FLOAT_EXPR)
142 {
143 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (assign));
144
145 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
146 if (rhs < lhs)
147 scalar_type = rhs_type;
148 }
149 }
150 else if (gcall *call = dyn_cast <gcall *> (stmt_info->stmt))
151 {
152 unsigned int i = 0;
153 if (gimple_call_internal_p (call))
154 {
155 internal_fn ifn = gimple_call_internal_fn (call);
156 if (internal_load_fn_p (ifn))
157 /* For loads the LHS type does the trick. */
158 i = ~0U;
159 else if (internal_store_fn_p (ifn))
160 {
161 /* For stores use the tyep of the stored value. */
162 i = internal_fn_stored_value_index (ifn);
163 scalar_type = TREE_TYPE (gimple_call_arg (call, i));
164 i = ~0U;
165 }
166 else if (internal_fn_mask_index (ifn) == 0)
167 i = 1;
168 }
169 if (i < gimple_call_num_args (call))
170 {
171 tree rhs_type = TREE_TYPE (gimple_call_arg (call, i));
172 if (tree_fits_uhwi_p (TYPE_SIZE_UNIT (rhs_type)))
173 {
174 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
175 if (rhs < lhs)
176 scalar_type = rhs_type;
177 }
178 }
179 }
180
181 return scalar_type;
182 }
183
184
185 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
186 tested at run-time. Return TRUE if DDR was successfully inserted.
187 Return false if versioning is not supported. */
188
189 static opt_result
190 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
191 {
192 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
193
194 if ((unsigned) param_vect_max_version_for_alias_checks == 0)
195 return opt_result::failure_at (vect_location,
196 "will not create alias checks, as"
197 " --param vect-max-version-for-alias-checks"
198 " == 0\n");
199
200 opt_result res
201 = runtime_alias_check_p (ddr, loop,
202 optimize_loop_nest_for_speed_p (loop));
203 if (!res)
204 return res;
205
206 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
207 return opt_result::success ();
208 }
209
210 /* Record that loop LOOP_VINFO needs to check that VALUE is nonzero. */
211
212 static void
213 vect_check_nonzero_value (loop_vec_info loop_vinfo, tree value)
214 {
215 const vec<tree> &checks = LOOP_VINFO_CHECK_NONZERO (loop_vinfo);
216 for (unsigned int i = 0; i < checks.length(); ++i)
217 if (checks[i] == value)
218 return;
219
220 if (dump_enabled_p ())
221 dump_printf_loc (MSG_NOTE, vect_location,
222 "need run-time check that %T is nonzero\n",
223 value);
224 LOOP_VINFO_CHECK_NONZERO (loop_vinfo).safe_push (value);
225 }
226
227 /* Return true if we know that the order of vectorized DR_INFO_A and
228 vectorized DR_INFO_B will be the same as the order of DR_INFO_A and
229 DR_INFO_B. At least one of the accesses is a write. */
230
231 static bool
232 vect_preserves_scalar_order_p (dr_vec_info *dr_info_a, dr_vec_info *dr_info_b)
233 {
234 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
235 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
236
237 /* Single statements are always kept in their original order. */
238 if (!STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
239 && !STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
240 return true;
241
242 /* STMT_A and STMT_B belong to overlapping groups. All loads are
243 emitted at the position of the first scalar load.
244 Stores in a group are emitted at the position of the last scalar store.
245 Compute that position and check whether the resulting order matches
246 the current one. */
247 stmt_vec_info il_a = DR_GROUP_FIRST_ELEMENT (stmtinfo_a);
248 if (il_a)
249 {
250 if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmtinfo_a)))
251 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_a); s;
252 s = DR_GROUP_NEXT_ELEMENT (s))
253 il_a = get_later_stmt (il_a, s);
254 else /* DR_IS_READ */
255 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_a); s;
256 s = DR_GROUP_NEXT_ELEMENT (s))
257 if (get_later_stmt (il_a, s) == il_a)
258 il_a = s;
259 }
260 else
261 il_a = stmtinfo_a;
262 stmt_vec_info il_b = DR_GROUP_FIRST_ELEMENT (stmtinfo_b);
263 if (il_b)
264 {
265 if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmtinfo_b)))
266 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_b); s;
267 s = DR_GROUP_NEXT_ELEMENT (s))
268 il_b = get_later_stmt (il_b, s);
269 else /* DR_IS_READ */
270 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_b); s;
271 s = DR_GROUP_NEXT_ELEMENT (s))
272 if (get_later_stmt (il_b, s) == il_b)
273 il_b = s;
274 }
275 else
276 il_b = stmtinfo_b;
277 bool a_after_b = (get_later_stmt (stmtinfo_a, stmtinfo_b) == stmtinfo_a);
278 return (get_later_stmt (il_a, il_b) == il_a) == a_after_b;
279 }
280
281 /* A subroutine of vect_analyze_data_ref_dependence. Handle
282 DDR_COULD_BE_INDEPENDENT_P ddr DDR that has a known set of dependence
283 distances. These distances are conservatively correct but they don't
284 reflect a guaranteed dependence.
285
286 Return true if this function does all the work necessary to avoid
287 an alias or false if the caller should use the dependence distances
288 to limit the vectorization factor in the usual way. LOOP_DEPTH is
289 the depth of the loop described by LOOP_VINFO and the other arguments
290 are as for vect_analyze_data_ref_dependence. */
291
292 static bool
293 vect_analyze_possibly_independent_ddr (data_dependence_relation *ddr,
294 loop_vec_info loop_vinfo,
295 int loop_depth, unsigned int *max_vf)
296 {
297 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
298 for (lambda_vector &dist_v : DDR_DIST_VECTS (ddr))
299 {
300 int dist = dist_v[loop_depth];
301 if (dist != 0 && !(dist > 0 && DDR_REVERSED_P (ddr)))
302 {
303 /* If the user asserted safelen >= DIST consecutive iterations
304 can be executed concurrently, assume independence.
305
306 ??? An alternative would be to add the alias check even
307 in this case, and vectorize the fallback loop with the
308 maximum VF set to safelen. However, if the user has
309 explicitly given a length, it's less likely that that
310 would be a win. */
311 if (loop->safelen >= 2 && abs_hwi (dist) <= loop->safelen)
312 {
313 if ((unsigned int) loop->safelen < *max_vf)
314 *max_vf = loop->safelen;
315 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
316 continue;
317 }
318
319 /* For dependence distances of 2 or more, we have the option
320 of limiting VF or checking for an alias at runtime.
321 Prefer to check at runtime if we can, to avoid limiting
322 the VF unnecessarily when the bases are in fact independent.
323
324 Note that the alias checks will be removed if the VF ends up
325 being small enough. */
326 dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (DDR_A (ddr));
327 dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (DDR_B (ddr));
328 return (!STMT_VINFO_GATHER_SCATTER_P (dr_info_a->stmt)
329 && !STMT_VINFO_GATHER_SCATTER_P (dr_info_b->stmt)
330 && vect_mark_for_runtime_alias_test (ddr, loop_vinfo));
331 }
332 }
333 return true;
334 }
335
336
337 /* Function vect_analyze_data_ref_dependence.
338
339 FIXME: I needed to change the sense of the returned flag.
340
341 Return FALSE if there (might) exist a dependence between a memory-reference
342 DRA and a memory-reference DRB. When versioning for alias may check a
343 dependence at run-time, return TRUE. Adjust *MAX_VF according to
344 the data dependence. */
345
346 static opt_result
347 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
348 loop_vec_info loop_vinfo,
349 unsigned int *max_vf)
350 {
351 unsigned int i;
352 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
353 struct data_reference *dra = DDR_A (ddr);
354 struct data_reference *drb = DDR_B (ddr);
355 dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (dra);
356 dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (drb);
357 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
358 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
359 lambda_vector dist_v;
360 unsigned int loop_depth;
361
362 /* If user asserted safelen consecutive iterations can be
363 executed concurrently, assume independence. */
364 auto apply_safelen = [&]()
365 {
366 if (loop->safelen >= 2)
367 {
368 if ((unsigned int) loop->safelen < *max_vf)
369 *max_vf = loop->safelen;
370 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
371 return true;
372 }
373 return false;
374 };
375
376 /* In loop analysis all data references should be vectorizable. */
377 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
378 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
379 gcc_unreachable ();
380
381 /* Independent data accesses. */
382 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
383 return opt_result::success ();
384
385 if (dra == drb
386 || (DR_IS_READ (dra) && DR_IS_READ (drb)))
387 return opt_result::success ();
388
389 /* We do not have to consider dependences between accesses that belong
390 to the same group, unless the stride could be smaller than the
391 group size. */
392 if (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
393 && (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
394 == DR_GROUP_FIRST_ELEMENT (stmtinfo_b))
395 && !STMT_VINFO_STRIDED_P (stmtinfo_a))
396 return opt_result::success ();
397
398 /* Even if we have an anti-dependence then, as the vectorized loop covers at
399 least two scalar iterations, there is always also a true dependence.
400 As the vectorizer does not re-order loads and stores we can ignore
401 the anti-dependence if TBAA can disambiguate both DRs similar to the
402 case with known negative distance anti-dependences (positive
403 distance anti-dependences would violate TBAA constraints). */
404 if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
405 || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
406 && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
407 get_alias_set (DR_REF (drb))))
408 return opt_result::success ();
409
410 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
411 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
412 {
413 if (apply_safelen ())
414 return opt_result::success ();
415
416 return opt_result::failure_at
417 (stmtinfo_a->stmt,
418 "possible alias involving gather/scatter between %T and %T\n",
419 DR_REF (dra), DR_REF (drb));
420 }
421
422 /* Unknown data dependence. */
423 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
424 {
425 if (apply_safelen ())
426 return opt_result::success ();
427
428 if (dump_enabled_p ())
429 dump_printf_loc (MSG_MISSED_OPTIMIZATION, stmtinfo_a->stmt,
430 "versioning for alias required: "
431 "can't determine dependence between %T and %T\n",
432 DR_REF (dra), DR_REF (drb));
433
434 /* Add to list of ddrs that need to be tested at run-time. */
435 return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
436 }
437
438 /* Known data dependence. */
439 if (DDR_NUM_DIST_VECTS (ddr) == 0)
440 {
441 if (apply_safelen ())
442 return opt_result::success ();
443
444 if (dump_enabled_p ())
445 dump_printf_loc (MSG_MISSED_OPTIMIZATION, stmtinfo_a->stmt,
446 "versioning for alias required: "
447 "bad dist vector for %T and %T\n",
448 DR_REF (dra), DR_REF (drb));
449 /* Add to list of ddrs that need to be tested at run-time. */
450 return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
451 }
452
453 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
454
455 if (DDR_COULD_BE_INDEPENDENT_P (ddr)
456 && vect_analyze_possibly_independent_ddr (ddr, loop_vinfo,
457 loop_depth, max_vf))
458 return opt_result::success ();
459
460 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
461 {
462 int dist = dist_v[loop_depth];
463
464 if (dump_enabled_p ())
465 dump_printf_loc (MSG_NOTE, vect_location,
466 "dependence distance = %d.\n", dist);
467
468 if (dist == 0)
469 {
470 if (dump_enabled_p ())
471 dump_printf_loc (MSG_NOTE, vect_location,
472 "dependence distance == 0 between %T and %T\n",
473 DR_REF (dra), DR_REF (drb));
474
475 /* When we perform grouped accesses and perform implicit CSE
476 by detecting equal accesses and doing disambiguation with
477 runtime alias tests like for
478 .. = a[i];
479 .. = a[i+1];
480 a[i] = ..;
481 a[i+1] = ..;
482 *p = ..;
483 .. = a[i];
484 .. = a[i+1];
485 where we will end up loading { a[i], a[i+1] } once, make
486 sure that inserting group loads before the first load and
487 stores after the last store will do the right thing.
488 Similar for groups like
489 a[i] = ...;
490 ... = a[i];
491 a[i+1] = ...;
492 where loads from the group interleave with the store. */
493 if (!vect_preserves_scalar_order_p (dr_info_a, dr_info_b))
494 return opt_result::failure_at (stmtinfo_a->stmt,
495 "READ_WRITE dependence"
496 " in interleaving.\n");
497
498 if (loop->safelen < 2)
499 {
500 tree indicator = dr_zero_step_indicator (dra);
501 if (!indicator || integer_zerop (indicator))
502 return opt_result::failure_at (stmtinfo_a->stmt,
503 "access also has a zero step\n");
504 else if (TREE_CODE (indicator) != INTEGER_CST)
505 vect_check_nonzero_value (loop_vinfo, indicator);
506 }
507 continue;
508 }
509
510 if (dist > 0 && DDR_REVERSED_P (ddr))
511 {
512 /* If DDR_REVERSED_P the order of the data-refs in DDR was
513 reversed (to make distance vector positive), and the actual
514 distance is negative. */
515 if (dump_enabled_p ())
516 dump_printf_loc (MSG_NOTE, vect_location,
517 "dependence distance negative.\n");
518 /* When doing outer loop vectorization, we need to check if there is
519 a backward dependence at the inner loop level if the dependence
520 at the outer loop is reversed. See PR81740. */
521 if (nested_in_vect_loop_p (loop, stmtinfo_a)
522 || nested_in_vect_loop_p (loop, stmtinfo_b))
523 {
524 unsigned inner_depth = index_in_loop_nest (loop->inner->num,
525 DDR_LOOP_NEST (ddr));
526 if (dist_v[inner_depth] < 0)
527 return opt_result::failure_at (stmtinfo_a->stmt,
528 "not vectorized, dependence "
529 "between data-refs %T and %T\n",
530 DR_REF (dra), DR_REF (drb));
531 }
532 /* Record a negative dependence distance to later limit the
533 amount of stmt copying / unrolling we can perform.
534 Only need to handle read-after-write dependence. */
535 if (DR_IS_READ (drb)
536 && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
537 || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
538 STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
539 continue;
540 }
541
542 unsigned int abs_dist = abs (dist);
543 if (abs_dist >= 2 && abs_dist < *max_vf)
544 {
545 /* The dependence distance requires reduction of the maximal
546 vectorization factor. */
547 *max_vf = abs_dist;
548 if (dump_enabled_p ())
549 dump_printf_loc (MSG_NOTE, vect_location,
550 "adjusting maximal vectorization factor to %i\n",
551 *max_vf);
552 }
553
554 if (abs_dist >= *max_vf)
555 {
556 /* Dependence distance does not create dependence, as far as
557 vectorization is concerned, in this case. */
558 if (dump_enabled_p ())
559 dump_printf_loc (MSG_NOTE, vect_location,
560 "dependence distance >= VF.\n");
561 continue;
562 }
563
564 return opt_result::failure_at (stmtinfo_a->stmt,
565 "not vectorized, possible dependence "
566 "between data-refs %T and %T\n",
567 DR_REF (dra), DR_REF (drb));
568 }
569
570 return opt_result::success ();
571 }
572
573 /* Function vect_analyze_data_ref_dependences.
574
575 Examine all the data references in the loop, and make sure there do not
576 exist any data dependences between them. Set *MAX_VF according to
577 the maximum vectorization factor the data dependences allow. */
578
579 opt_result
580 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo,
581 unsigned int *max_vf)
582 {
583 unsigned int i;
584 struct data_dependence_relation *ddr;
585
586 DUMP_VECT_SCOPE ("vect_analyze_data_ref_dependences");
587
588 if (!LOOP_VINFO_DDRS (loop_vinfo).exists ())
589 {
590 LOOP_VINFO_DDRS (loop_vinfo)
591 .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
592 * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
593 /* We do not need read-read dependences. */
594 bool res = compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
595 &LOOP_VINFO_DDRS (loop_vinfo),
596 LOOP_VINFO_LOOP_NEST (loop_vinfo),
597 false);
598 gcc_assert (res);
599 }
600
601 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
602
603 /* For epilogues we either have no aliases or alias versioning
604 was applied to original loop. Therefore we may just get max_vf
605 using VF of original loop. */
606 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
607 *max_vf = LOOP_VINFO_ORIG_MAX_VECT_FACTOR (loop_vinfo);
608 else
609 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
610 {
611 opt_result res
612 = vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf);
613 if (!res)
614 return res;
615 }
616
617 return opt_result::success ();
618 }
619
620
621 /* Function vect_slp_analyze_data_ref_dependence.
622
623 Return TRUE if there (might) exist a dependence between a memory-reference
624 DRA and a memory-reference DRB for VINFO. When versioning for alias
625 may check a dependence at run-time, return FALSE. Adjust *MAX_VF
626 according to the data dependence. */
627
628 static bool
629 vect_slp_analyze_data_ref_dependence (vec_info *vinfo,
630 struct data_dependence_relation *ddr)
631 {
632 struct data_reference *dra = DDR_A (ddr);
633 struct data_reference *drb = DDR_B (ddr);
634 dr_vec_info *dr_info_a = vinfo->lookup_dr (dra);
635 dr_vec_info *dr_info_b = vinfo->lookup_dr (drb);
636
637 /* We need to check dependences of statements marked as unvectorizable
638 as well, they still can prohibit vectorization. */
639
640 /* Independent data accesses. */
641 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
642 return false;
643
644 if (dra == drb)
645 return false;
646
647 /* Read-read is OK. */
648 if (DR_IS_READ (dra) && DR_IS_READ (drb))
649 return false;
650
651 /* If dra and drb are part of the same interleaving chain consider
652 them independent. */
653 if (STMT_VINFO_GROUPED_ACCESS (dr_info_a->stmt)
654 && (DR_GROUP_FIRST_ELEMENT (dr_info_a->stmt)
655 == DR_GROUP_FIRST_ELEMENT (dr_info_b->stmt)))
656 return false;
657
658 /* Unknown data dependence. */
659 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
660 {
661 if (dump_enabled_p ())
662 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
663 "can't determine dependence between %T and %T\n",
664 DR_REF (dra), DR_REF (drb));
665 }
666 else if (dump_enabled_p ())
667 dump_printf_loc (MSG_NOTE, vect_location,
668 "determined dependence between %T and %T\n",
669 DR_REF (dra), DR_REF (drb));
670
671 return true;
672 }
673
674
675 /* Analyze dependences involved in the transform of SLP NODE. STORES
676 contain the vector of scalar stores of this instance if we are
677 disambiguating the loads. */
678
679 static bool
680 vect_slp_analyze_node_dependences (vec_info *vinfo, slp_tree node,
681 vec<stmt_vec_info> stores,
682 stmt_vec_info last_store_info)
683 {
684 /* This walks over all stmts involved in the SLP load/store done
685 in NODE verifying we can sink them up to the last stmt in the
686 group. */
687 if (DR_IS_WRITE (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (node))))
688 {
689 stmt_vec_info last_access_info = vect_find_last_scalar_stmt_in_slp (node);
690 for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (node).length (); ++k)
691 {
692 stmt_vec_info access_info
693 = vect_orig_stmt (SLP_TREE_SCALAR_STMTS (node)[k]);
694 if (access_info == last_access_info)
695 continue;
696 data_reference *dr_a = STMT_VINFO_DATA_REF (access_info);
697 ao_ref ref;
698 bool ref_initialized_p = false;
699 for (gimple_stmt_iterator gsi = gsi_for_stmt (access_info->stmt);
700 gsi_stmt (gsi) != last_access_info->stmt; gsi_next (&gsi))
701 {
702 gimple *stmt = gsi_stmt (gsi);
703 if (! gimple_vuse (stmt))
704 continue;
705
706 /* If we couldn't record a (single) data reference for this
707 stmt we have to resort to the alias oracle. */
708 stmt_vec_info stmt_info = vinfo->lookup_stmt (stmt);
709 data_reference *dr_b = STMT_VINFO_DATA_REF (stmt_info);
710 if (!dr_b)
711 {
712 /* We are moving a store - this means
713 we cannot use TBAA for disambiguation. */
714 if (!ref_initialized_p)
715 ao_ref_init (&ref, DR_REF (dr_a));
716 if (stmt_may_clobber_ref_p_1 (stmt, &ref, false)
717 || ref_maybe_used_by_stmt_p (stmt, &ref, false))
718 return false;
719 continue;
720 }
721
722 bool dependent = false;
723 /* If we run into a store of this same instance (we've just
724 marked those) then delay dependence checking until we run
725 into the last store because this is where it will have
726 been sunk to (and we verify if we can do that as well). */
727 if (gimple_visited_p (stmt))
728 {
729 if (stmt_info != last_store_info)
730 continue;
731
732 for (stmt_vec_info &store_info : stores)
733 {
734 data_reference *store_dr
735 = STMT_VINFO_DATA_REF (store_info);
736 ddr_p ddr = initialize_data_dependence_relation
737 (dr_a, store_dr, vNULL);
738 dependent
739 = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
740 free_dependence_relation (ddr);
741 if (dependent)
742 break;
743 }
744 }
745 else
746 {
747 ddr_p ddr = initialize_data_dependence_relation (dr_a,
748 dr_b, vNULL);
749 dependent = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
750 free_dependence_relation (ddr);
751 }
752 if (dependent)
753 return false;
754 }
755 }
756 }
757 else /* DR_IS_READ */
758 {
759 stmt_vec_info first_access_info
760 = vect_find_first_scalar_stmt_in_slp (node);
761 for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (node).length (); ++k)
762 {
763 stmt_vec_info access_info
764 = vect_orig_stmt (SLP_TREE_SCALAR_STMTS (node)[k]);
765 if (access_info == first_access_info)
766 continue;
767 data_reference *dr_a = STMT_VINFO_DATA_REF (access_info);
768 ao_ref ref;
769 bool ref_initialized_p = false;
770 for (gimple_stmt_iterator gsi = gsi_for_stmt (access_info->stmt);
771 gsi_stmt (gsi) != first_access_info->stmt; gsi_prev (&gsi))
772 {
773 gimple *stmt = gsi_stmt (gsi);
774 if (! gimple_vdef (stmt))
775 continue;
776
777 /* If we couldn't record a (single) data reference for this
778 stmt we have to resort to the alias oracle. */
779 stmt_vec_info stmt_info = vinfo->lookup_stmt (stmt);
780 data_reference *dr_b = STMT_VINFO_DATA_REF (stmt_info);
781
782 /* We are hoisting a load - this means we can use
783 TBAA for disambiguation. */
784 if (!ref_initialized_p)
785 ao_ref_init (&ref, DR_REF (dr_a));
786 if (stmt_may_clobber_ref_p_1 (stmt, &ref, true))
787 {
788 if (!dr_b)
789 return false;
790 /* Resort to dependence checking below. */
791 }
792 else
793 /* No dependence. */
794 continue;
795
796 bool dependent = false;
797 /* If we run into a store of this same instance (we've just
798 marked those) then delay dependence checking until we run
799 into the last store because this is where it will have
800 been sunk to (and we verify if we can do that as well). */
801 if (gimple_visited_p (stmt))
802 {
803 if (stmt_info != last_store_info)
804 continue;
805
806 for (stmt_vec_info &store_info : stores)
807 {
808 data_reference *store_dr
809 = STMT_VINFO_DATA_REF (store_info);
810 ddr_p ddr = initialize_data_dependence_relation
811 (dr_a, store_dr, vNULL);
812 dependent
813 = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
814 free_dependence_relation (ddr);
815 if (dependent)
816 break;
817 }
818 }
819 else
820 {
821 ddr_p ddr = initialize_data_dependence_relation (dr_a,
822 dr_b, vNULL);
823 dependent = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
824 free_dependence_relation (ddr);
825 }
826 if (dependent)
827 return false;
828 }
829 }
830 }
831 return true;
832 }
833
834
835 /* Function vect_analyze_data_ref_dependences.
836
837 Examine all the data references in the basic-block, and make sure there
838 do not exist any data dependences between them. Set *MAX_VF according to
839 the maximum vectorization factor the data dependences allow. */
840
841 bool
842 vect_slp_analyze_instance_dependence (vec_info *vinfo, slp_instance instance)
843 {
844 DUMP_VECT_SCOPE ("vect_slp_analyze_instance_dependence");
845
846 /* The stores of this instance are at the root of the SLP tree. */
847 slp_tree store = NULL;
848 if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_store)
849 store = SLP_INSTANCE_TREE (instance);
850
851 /* Verify we can sink stores to the vectorized stmt insert location. */
852 stmt_vec_info last_store_info = NULL;
853 if (store)
854 {
855 if (! vect_slp_analyze_node_dependences (vinfo, store, vNULL, NULL))
856 return false;
857
858 /* Mark stores in this instance and remember the last one. */
859 last_store_info = vect_find_last_scalar_stmt_in_slp (store);
860 for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (store).length (); ++k)
861 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, true);
862 }
863
864 bool res = true;
865
866 /* Verify we can sink loads to the vectorized stmt insert location,
867 special-casing stores of this instance. */
868 for (slp_tree &load : SLP_INSTANCE_LOADS (instance))
869 if (! vect_slp_analyze_node_dependences (vinfo, load,
870 store
871 ? SLP_TREE_SCALAR_STMTS (store)
872 : vNULL, last_store_info))
873 {
874 res = false;
875 break;
876 }
877
878 /* Unset the visited flag. */
879 if (store)
880 for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (store).length (); ++k)
881 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, false);
882
883 return res;
884 }
885
886 /* Return the misalignment of DR_INFO accessed in VECTYPE with OFFSET
887 applied. */
888
889 int
890 dr_misalignment (dr_vec_info *dr_info, tree vectype, poly_int64 offset)
891 {
892 HOST_WIDE_INT diff = 0;
893 /* Alignment is only analyzed for the first element of a DR group,
894 use that but adjust misalignment by the offset of the access. */
895 if (STMT_VINFO_GROUPED_ACCESS (dr_info->stmt))
896 {
897 dr_vec_info *first_dr
898 = STMT_VINFO_DR_INFO (DR_GROUP_FIRST_ELEMENT (dr_info->stmt));
899 /* vect_analyze_data_ref_accesses guarantees that DR_INIT are
900 INTEGER_CSTs and the first element in the group has the lowest
901 address. */
902 diff = (TREE_INT_CST_LOW (DR_INIT (dr_info->dr))
903 - TREE_INT_CST_LOW (DR_INIT (first_dr->dr)));
904 gcc_assert (diff >= 0);
905 dr_info = first_dr;
906 }
907
908 int misalign = dr_info->misalignment;
909 gcc_assert (misalign != DR_MISALIGNMENT_UNINITIALIZED);
910 if (misalign == DR_MISALIGNMENT_UNKNOWN)
911 return misalign;
912
913 /* If the access is only aligned for a vector type with smaller alignment
914 requirement the access has unknown misalignment. */
915 if (maybe_lt (dr_info->target_alignment * BITS_PER_UNIT,
916 targetm.vectorize.preferred_vector_alignment (vectype)))
917 return DR_MISALIGNMENT_UNKNOWN;
918
919 /* Apply the offset from the DR group start and the externally supplied
920 offset which can for example result from a negative stride access. */
921 poly_int64 misalignment = misalign + diff + offset;
922
923 /* vect_compute_data_ref_alignment will have ensured that target_alignment
924 is constant and otherwise set misalign to DR_MISALIGNMENT_UNKNOWN. */
925 unsigned HOST_WIDE_INT target_alignment_c
926 = dr_info->target_alignment.to_constant ();
927 if (!known_misalignment (misalignment, target_alignment_c, &misalign))
928 return DR_MISALIGNMENT_UNKNOWN;
929 return misalign;
930 }
931
932 /* Record the base alignment guarantee given by DRB, which occurs
933 in STMT_INFO. */
934
935 static void
936 vect_record_base_alignment (vec_info *vinfo, stmt_vec_info stmt_info,
937 innermost_loop_behavior *drb)
938 {
939 bool existed;
940 std::pair<stmt_vec_info, innermost_loop_behavior *> &entry
941 = vinfo->base_alignments.get_or_insert (drb->base_address, &existed);
942 if (!existed || entry.second->base_alignment < drb->base_alignment)
943 {
944 entry = std::make_pair (stmt_info, drb);
945 if (dump_enabled_p ())
946 dump_printf_loc (MSG_NOTE, vect_location,
947 "recording new base alignment for %T\n"
948 " alignment: %d\n"
949 " misalignment: %d\n"
950 " based on: %G",
951 drb->base_address,
952 drb->base_alignment,
953 drb->base_misalignment,
954 stmt_info->stmt);
955 }
956 }
957
958 /* If the region we're going to vectorize is reached, all unconditional
959 data references occur at least once. We can therefore pool the base
960 alignment guarantees from each unconditional reference. Do this by
961 going through all the data references in VINFO and checking whether
962 the containing statement makes the reference unconditionally. If so,
963 record the alignment of the base address in VINFO so that it can be
964 used for all other references with the same base. */
965
966 void
967 vect_record_base_alignments (vec_info *vinfo)
968 {
969 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
970 class loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
971 for (data_reference *dr : vinfo->shared->datarefs)
972 {
973 dr_vec_info *dr_info = vinfo->lookup_dr (dr);
974 stmt_vec_info stmt_info = dr_info->stmt;
975 if (!DR_IS_CONDITIONAL_IN_STMT (dr)
976 && STMT_VINFO_VECTORIZABLE (stmt_info)
977 && !STMT_VINFO_GATHER_SCATTER_P (stmt_info))
978 {
979 vect_record_base_alignment (vinfo, stmt_info, &DR_INNERMOST (dr));
980
981 /* If DR is nested in the loop that is being vectorized, we can also
982 record the alignment of the base wrt the outer loop. */
983 if (loop && nested_in_vect_loop_p (loop, stmt_info))
984 vect_record_base_alignment
985 (vinfo, stmt_info, &STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info));
986 }
987 }
988 }
989
990 /* Function vect_compute_data_ref_alignment
991
992 Compute the misalignment of the data reference DR_INFO when vectorizing
993 with VECTYPE.
994
995 Output:
996 1. initialized misalignment info for DR_INFO
997
998 FOR NOW: No analysis is actually performed. Misalignment is calculated
999 only for trivial cases. TODO. */
1000
1001 static void
1002 vect_compute_data_ref_alignment (vec_info *vinfo, dr_vec_info *dr_info,
1003 tree vectype)
1004 {
1005 stmt_vec_info stmt_info = dr_info->stmt;
1006 vec_base_alignments *base_alignments = &vinfo->base_alignments;
1007 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
1008 class loop *loop = NULL;
1009 tree ref = DR_REF (dr_info->dr);
1010
1011 if (dump_enabled_p ())
1012 dump_printf_loc (MSG_NOTE, vect_location,
1013 "vect_compute_data_ref_alignment:\n");
1014
1015 if (loop_vinfo)
1016 loop = LOOP_VINFO_LOOP (loop_vinfo);
1017
1018 /* Initialize misalignment to unknown. */
1019 SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN);
1020
1021 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
1022 return;
1023
1024 innermost_loop_behavior *drb = vect_dr_behavior (vinfo, dr_info);
1025 bool step_preserves_misalignment_p;
1026
1027 poly_uint64 vector_alignment
1028 = exact_div (targetm.vectorize.preferred_vector_alignment (vectype),
1029 BITS_PER_UNIT);
1030 SET_DR_TARGET_ALIGNMENT (dr_info, vector_alignment);
1031
1032 /* If the main loop has peeled for alignment we have no way of knowing
1033 whether the data accesses in the epilogues are aligned. We can't at
1034 compile time answer the question whether we have entered the main loop or
1035 not. Fixes PR 92351. */
1036 if (loop_vinfo)
1037 {
1038 loop_vec_info orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
1039 if (orig_loop_vinfo
1040 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (orig_loop_vinfo) != 0)
1041 return;
1042 }
1043
1044 unsigned HOST_WIDE_INT vect_align_c;
1045 if (!vector_alignment.is_constant (&vect_align_c))
1046 return;
1047
1048 /* No step for BB vectorization. */
1049 if (!loop)
1050 {
1051 gcc_assert (integer_zerop (drb->step));
1052 step_preserves_misalignment_p = true;
1053 }
1054
1055 /* In case the dataref is in an inner-loop of the loop that is being
1056 vectorized (LOOP), we use the base and misalignment information
1057 relative to the outer-loop (LOOP). This is ok only if the misalignment
1058 stays the same throughout the execution of the inner-loop, which is why
1059 we have to check that the stride of the dataref in the inner-loop evenly
1060 divides by the vector alignment. */
1061 else if (nested_in_vect_loop_p (loop, stmt_info))
1062 {
1063 step_preserves_misalignment_p
1064 = (DR_STEP_ALIGNMENT (dr_info->dr) % vect_align_c) == 0;
1065
1066 if (dump_enabled_p ())
1067 {
1068 if (step_preserves_misalignment_p)
1069 dump_printf_loc (MSG_NOTE, vect_location,
1070 "inner step divides the vector alignment.\n");
1071 else
1072 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1073 "inner step doesn't divide the vector"
1074 " alignment.\n");
1075 }
1076 }
1077
1078 /* Similarly we can only use base and misalignment information relative to
1079 an innermost loop if the misalignment stays the same throughout the
1080 execution of the loop. As above, this is the case if the stride of
1081 the dataref evenly divides by the alignment. */
1082 else
1083 {
1084 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1085 step_preserves_misalignment_p
1086 = multiple_p (DR_STEP_ALIGNMENT (dr_info->dr) * vf, vect_align_c);
1087
1088 if (!step_preserves_misalignment_p && dump_enabled_p ())
1089 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1090 "step doesn't divide the vector alignment.\n");
1091 }
1092
1093 unsigned int base_alignment = drb->base_alignment;
1094 unsigned int base_misalignment = drb->base_misalignment;
1095
1096 /* Calculate the maximum of the pooled base address alignment and the
1097 alignment that we can compute for DR itself. */
1098 std::pair<stmt_vec_info, innermost_loop_behavior *> *entry
1099 = base_alignments->get (drb->base_address);
1100 if (entry
1101 && base_alignment < (*entry).second->base_alignment
1102 && (loop_vinfo
1103 || (dominated_by_p (CDI_DOMINATORS, gimple_bb (stmt_info->stmt),
1104 gimple_bb (entry->first->stmt))
1105 && (gimple_bb (stmt_info->stmt) != gimple_bb (entry->first->stmt)
1106 || (entry->first->dr_aux.group <= dr_info->group)))))
1107 {
1108 base_alignment = entry->second->base_alignment;
1109 base_misalignment = entry->second->base_misalignment;
1110 }
1111
1112 if (drb->offset_alignment < vect_align_c
1113 || !step_preserves_misalignment_p
1114 /* We need to know whether the step wrt the vectorized loop is
1115 negative when computing the starting misalignment below. */
1116 || TREE_CODE (drb->step) != INTEGER_CST)
1117 {
1118 if (dump_enabled_p ())
1119 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1120 "Unknown alignment for access: %T\n", ref);
1121 return;
1122 }
1123
1124 if (base_alignment < vect_align_c)
1125 {
1126 unsigned int max_alignment;
1127 tree base = get_base_for_alignment (drb->base_address, &max_alignment);
1128 if (max_alignment < vect_align_c
1129 || !vect_can_force_dr_alignment_p (base,
1130 vect_align_c * BITS_PER_UNIT))
1131 {
1132 if (dump_enabled_p ())
1133 dump_printf_loc (MSG_NOTE, vect_location,
1134 "can't force alignment of ref: %T\n", ref);
1135 return;
1136 }
1137
1138 /* Force the alignment of the decl.
1139 NOTE: This is the only change to the code we make during
1140 the analysis phase, before deciding to vectorize the loop. */
1141 if (dump_enabled_p ())
1142 dump_printf_loc (MSG_NOTE, vect_location,
1143 "force alignment of %T\n", ref);
1144
1145 dr_info->base_decl = base;
1146 dr_info->base_misaligned = true;
1147 base_misalignment = 0;
1148 }
1149 poly_int64 misalignment
1150 = base_misalignment + wi::to_poly_offset (drb->init).force_shwi ();
1151
1152 unsigned int const_misalignment;
1153 if (!known_misalignment (misalignment, vect_align_c, &const_misalignment))
1154 {
1155 if (dump_enabled_p ())
1156 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1157 "Non-constant misalignment for access: %T\n", ref);
1158 return;
1159 }
1160
1161 SET_DR_MISALIGNMENT (dr_info, const_misalignment);
1162
1163 if (dump_enabled_p ())
1164 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1165 "misalign = %d bytes of ref %T\n",
1166 const_misalignment, ref);
1167
1168 return;
1169 }
1170
1171 /* Return whether DR_INFO, which is related to DR_PEEL_INFO in
1172 that it only differs in DR_INIT, is aligned if DR_PEEL_INFO
1173 is made aligned via peeling. */
1174
1175 static bool
1176 vect_dr_aligned_if_related_peeled_dr_is (dr_vec_info *dr_info,
1177 dr_vec_info *dr_peel_info)
1178 {
1179 if (multiple_p (DR_TARGET_ALIGNMENT (dr_peel_info),
1180 DR_TARGET_ALIGNMENT (dr_info)))
1181 {
1182 poly_offset_int diff
1183 = (wi::to_poly_offset (DR_INIT (dr_peel_info->dr))
1184 - wi::to_poly_offset (DR_INIT (dr_info->dr)));
1185 if (known_eq (diff, 0)
1186 || multiple_p (diff, DR_TARGET_ALIGNMENT (dr_info)))
1187 return true;
1188 }
1189 return false;
1190 }
1191
1192 /* Return whether DR_INFO is aligned if DR_PEEL_INFO is made
1193 aligned via peeling. */
1194
1195 static bool
1196 vect_dr_aligned_if_peeled_dr_is (dr_vec_info *dr_info,
1197 dr_vec_info *dr_peel_info)
1198 {
1199 if (!operand_equal_p (DR_BASE_ADDRESS (dr_info->dr),
1200 DR_BASE_ADDRESS (dr_peel_info->dr), 0)
1201 || !operand_equal_p (DR_OFFSET (dr_info->dr),
1202 DR_OFFSET (dr_peel_info->dr), 0)
1203 || !operand_equal_p (DR_STEP (dr_info->dr),
1204 DR_STEP (dr_peel_info->dr), 0))
1205 return false;
1206
1207 return vect_dr_aligned_if_related_peeled_dr_is (dr_info, dr_peel_info);
1208 }
1209
1210 /* Compute the value for dr_info->misalign so that the access appears
1211 aligned. This is used by peeling to compensate for dr_misalignment
1212 applying the offset for negative step. */
1213
1214 int
1215 vect_dr_misalign_for_aligned_access (dr_vec_info *dr_info)
1216 {
1217 if (tree_int_cst_sgn (DR_STEP (dr_info->dr)) >= 0)
1218 return 0;
1219
1220 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
1221 poly_int64 misalignment
1222 = ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
1223 * TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))));
1224
1225 unsigned HOST_WIDE_INT target_alignment_c;
1226 int misalign;
1227 if (!dr_info->target_alignment.is_constant (&target_alignment_c)
1228 || !known_misalignment (misalignment, target_alignment_c, &misalign))
1229 return DR_MISALIGNMENT_UNKNOWN;
1230 return misalign;
1231 }
1232
1233 /* Function vect_update_misalignment_for_peel.
1234 Sets DR_INFO's misalignment
1235 - to 0 if it has the same alignment as DR_PEEL_INFO,
1236 - to the misalignment computed using NPEEL if DR_INFO's salignment is known,
1237 - to -1 (unknown) otherwise.
1238
1239 DR_INFO - the data reference whose misalignment is to be adjusted.
1240 DR_PEEL_INFO - the data reference whose misalignment is being made
1241 zero in the vector loop by the peel.
1242 NPEEL - the number of iterations in the peel loop if the misalignment
1243 of DR_PEEL_INFO is known at compile time. */
1244
1245 static void
1246 vect_update_misalignment_for_peel (dr_vec_info *dr_info,
1247 dr_vec_info *dr_peel_info, int npeel)
1248 {
1249 /* If dr_info is aligned of dr_peel_info is, then mark it so. */
1250 if (vect_dr_aligned_if_peeled_dr_is (dr_info, dr_peel_info))
1251 {
1252 SET_DR_MISALIGNMENT (dr_info,
1253 vect_dr_misalign_for_aligned_access (dr_peel_info));
1254 return;
1255 }
1256
1257 unsigned HOST_WIDE_INT alignment;
1258 if (DR_TARGET_ALIGNMENT (dr_info).is_constant (&alignment)
1259 && known_alignment_for_access_p (dr_info,
1260 STMT_VINFO_VECTYPE (dr_info->stmt))
1261 && known_alignment_for_access_p (dr_peel_info,
1262 STMT_VINFO_VECTYPE (dr_peel_info->stmt)))
1263 {
1264 int misal = dr_info->misalignment;
1265 misal += npeel * TREE_INT_CST_LOW (DR_STEP (dr_info->dr));
1266 misal &= alignment - 1;
1267 set_dr_misalignment (dr_info, misal);
1268 return;
1269 }
1270
1271 if (dump_enabled_p ())
1272 dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment " \
1273 "to unknown (-1).\n");
1274 SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN);
1275 }
1276
1277 /* Return true if alignment is relevant for DR_INFO. */
1278
1279 static bool
1280 vect_relevant_for_alignment_p (dr_vec_info *dr_info)
1281 {
1282 stmt_vec_info stmt_info = dr_info->stmt;
1283
1284 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1285 return false;
1286
1287 /* For interleaving, only the alignment of the first access matters. */
1288 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1289 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1290 return false;
1291
1292 /* Scatter-gather and invariant accesses continue to address individual
1293 scalars, so vector-level alignment is irrelevant. */
1294 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
1295 || integer_zerop (DR_STEP (dr_info->dr)))
1296 return false;
1297
1298 /* Strided accesses perform only component accesses, alignment is
1299 irrelevant for them. */
1300 if (STMT_VINFO_STRIDED_P (stmt_info)
1301 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1302 return false;
1303
1304 return true;
1305 }
1306
1307 /* Given an memory reference EXP return whether its alignment is less
1308 than its size. */
1309
1310 static bool
1311 not_size_aligned (tree exp)
1312 {
1313 if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
1314 return true;
1315
1316 return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
1317 > get_object_alignment (exp));
1318 }
1319
1320 /* Function vector_alignment_reachable_p
1321
1322 Return true if vector alignment for DR_INFO is reachable by peeling
1323 a few loop iterations. Return false otherwise. */
1324
1325 static bool
1326 vector_alignment_reachable_p (dr_vec_info *dr_info)
1327 {
1328 stmt_vec_info stmt_info = dr_info->stmt;
1329 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1330
1331 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1332 {
1333 /* For interleaved access we peel only if number of iterations in
1334 the prolog loop ({VF - misalignment}), is a multiple of the
1335 number of the interleaved accesses. */
1336 int elem_size, mis_in_elements;
1337
1338 /* FORNOW: handle only known alignment. */
1339 if (!known_alignment_for_access_p (dr_info, vectype))
1340 return false;
1341
1342 poly_uint64 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1343 poly_uint64 vector_size = GET_MODE_SIZE (TYPE_MODE (vectype));
1344 elem_size = vector_element_size (vector_size, nelements);
1345 mis_in_elements = dr_misalignment (dr_info, vectype) / elem_size;
1346
1347 if (!multiple_p (nelements - mis_in_elements, DR_GROUP_SIZE (stmt_info)))
1348 return false;
1349 }
1350
1351 /* If misalignment is known at the compile time then allow peeling
1352 only if natural alignment is reachable through peeling. */
1353 if (known_alignment_for_access_p (dr_info, vectype)
1354 && !aligned_access_p (dr_info, vectype))
1355 {
1356 HOST_WIDE_INT elmsize =
1357 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1358 if (dump_enabled_p ())
1359 {
1360 dump_printf_loc (MSG_NOTE, vect_location,
1361 "data size = %wd. misalignment = %d.\n", elmsize,
1362 dr_misalignment (dr_info, vectype));
1363 }
1364 if (dr_misalignment (dr_info, vectype) % elmsize)
1365 {
1366 if (dump_enabled_p ())
1367 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1368 "data size does not divide the misalignment.\n");
1369 return false;
1370 }
1371 }
1372
1373 if (!known_alignment_for_access_p (dr_info, vectype))
1374 {
1375 tree type = TREE_TYPE (DR_REF (dr_info->dr));
1376 bool is_packed = not_size_aligned (DR_REF (dr_info->dr));
1377 if (dump_enabled_p ())
1378 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1379 "Unknown misalignment, %snaturally aligned\n",
1380 is_packed ? "not " : "");
1381 return targetm.vectorize.vector_alignment_reachable (type, is_packed);
1382 }
1383
1384 return true;
1385 }
1386
1387
1388 /* Calculate the cost of the memory access represented by DR_INFO. */
1389
1390 static void
1391 vect_get_data_access_cost (vec_info *vinfo, dr_vec_info *dr_info,
1392 dr_alignment_support alignment_support_scheme,
1393 int misalignment,
1394 unsigned int *inside_cost,
1395 unsigned int *outside_cost,
1396 stmt_vector_for_cost *body_cost_vec,
1397 stmt_vector_for_cost *prologue_cost_vec)
1398 {
1399 stmt_vec_info stmt_info = dr_info->stmt;
1400 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
1401 int ncopies;
1402
1403 if (PURE_SLP_STMT (stmt_info))
1404 ncopies = 1;
1405 else
1406 ncopies = vect_get_num_copies (loop_vinfo, STMT_VINFO_VECTYPE (stmt_info));
1407
1408 if (DR_IS_READ (dr_info->dr))
1409 vect_get_load_cost (vinfo, stmt_info, ncopies, alignment_support_scheme,
1410 misalignment, true, inside_cost,
1411 outside_cost, prologue_cost_vec, body_cost_vec, false);
1412 else
1413 vect_get_store_cost (vinfo,stmt_info, ncopies, alignment_support_scheme,
1414 misalignment, inside_cost, body_cost_vec);
1415
1416 if (dump_enabled_p ())
1417 dump_printf_loc (MSG_NOTE, vect_location,
1418 "vect_get_data_access_cost: inside_cost = %d, "
1419 "outside_cost = %d.\n", *inside_cost, *outside_cost);
1420 }
1421
1422
1423 typedef struct _vect_peel_info
1424 {
1425 dr_vec_info *dr_info;
1426 int npeel;
1427 unsigned int count;
1428 } *vect_peel_info;
1429
1430 typedef struct _vect_peel_extended_info
1431 {
1432 vec_info *vinfo;
1433 struct _vect_peel_info peel_info;
1434 unsigned int inside_cost;
1435 unsigned int outside_cost;
1436 } *vect_peel_extended_info;
1437
1438
1439 /* Peeling hashtable helpers. */
1440
1441 struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
1442 {
1443 static inline hashval_t hash (const _vect_peel_info *);
1444 static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
1445 };
1446
1447 inline hashval_t
1448 peel_info_hasher::hash (const _vect_peel_info *peel_info)
1449 {
1450 return (hashval_t) peel_info->npeel;
1451 }
1452
1453 inline bool
1454 peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
1455 {
1456 return (a->npeel == b->npeel);
1457 }
1458
1459
1460 /* Insert DR_INFO into peeling hash table with NPEEL as key. */
1461
1462 static void
1463 vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
1464 loop_vec_info loop_vinfo, dr_vec_info *dr_info,
1465 int npeel, bool supportable_if_not_aligned)
1466 {
1467 struct _vect_peel_info elem, *slot;
1468 _vect_peel_info **new_slot;
1469
1470 elem.npeel = npeel;
1471 slot = peeling_htab->find (&elem);
1472 if (slot)
1473 slot->count++;
1474 else
1475 {
1476 slot = XNEW (struct _vect_peel_info);
1477 slot->npeel = npeel;
1478 slot->dr_info = dr_info;
1479 slot->count = 1;
1480 new_slot = peeling_htab->find_slot (slot, INSERT);
1481 *new_slot = slot;
1482 }
1483
1484 /* If this DR is not supported with unknown misalignment then bias
1485 this slot when the cost model is disabled. */
1486 if (!supportable_if_not_aligned
1487 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1488 slot->count += VECT_MAX_COST;
1489 }
1490
1491
1492 /* Traverse peeling hash table to find peeling option that aligns maximum
1493 number of data accesses. */
1494
1495 int
1496 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1497 _vect_peel_extended_info *max)
1498 {
1499 vect_peel_info elem = *slot;
1500
1501 if (elem->count > max->peel_info.count
1502 || (elem->count == max->peel_info.count
1503 && max->peel_info.npeel > elem->npeel))
1504 {
1505 max->peel_info.npeel = elem->npeel;
1506 max->peel_info.count = elem->count;
1507 max->peel_info.dr_info = elem->dr_info;
1508 }
1509
1510 return 1;
1511 }
1512
1513 /* Get the costs of peeling NPEEL iterations for LOOP_VINFO, checking
1514 data access costs for all data refs. If UNKNOWN_MISALIGNMENT is true,
1515 npeel is computed at runtime but DR0_INFO's misalignment will be zero
1516 after peeling. */
1517
1518 static void
1519 vect_get_peeling_costs_all_drs (loop_vec_info loop_vinfo,
1520 dr_vec_info *dr0_info,
1521 unsigned int *inside_cost,
1522 unsigned int *outside_cost,
1523 stmt_vector_for_cost *body_cost_vec,
1524 stmt_vector_for_cost *prologue_cost_vec,
1525 unsigned int npeel)
1526 {
1527 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1528
1529 bool dr0_alignment_known_p
1530 = (dr0_info
1531 && known_alignment_for_access_p (dr0_info,
1532 STMT_VINFO_VECTYPE (dr0_info->stmt)));
1533
1534 for (data_reference *dr : datarefs)
1535 {
1536 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1537 if (!vect_relevant_for_alignment_p (dr_info))
1538 continue;
1539
1540 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
1541 dr_alignment_support alignment_support_scheme;
1542 int misalignment;
1543 unsigned HOST_WIDE_INT alignment;
1544
1545 bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr),
1546 size_zero_node) < 0;
1547 poly_int64 off = 0;
1548 if (negative)
1549 off = ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
1550 * -TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))));
1551
1552 if (npeel == 0)
1553 misalignment = dr_misalignment (dr_info, vectype, off);
1554 else if (dr_info == dr0_info
1555 || vect_dr_aligned_if_peeled_dr_is (dr_info, dr0_info))
1556 misalignment = 0;
1557 else if (!dr0_alignment_known_p
1558 || !known_alignment_for_access_p (dr_info, vectype)
1559 || !DR_TARGET_ALIGNMENT (dr_info).is_constant (&alignment))
1560 misalignment = DR_MISALIGNMENT_UNKNOWN;
1561 else
1562 {
1563 misalignment = dr_misalignment (dr_info, vectype, off);
1564 misalignment += npeel * TREE_INT_CST_LOW (DR_STEP (dr_info->dr));
1565 misalignment &= alignment - 1;
1566 }
1567 alignment_support_scheme
1568 = vect_supportable_dr_alignment (loop_vinfo, dr_info, vectype,
1569 misalignment);
1570
1571 vect_get_data_access_cost (loop_vinfo, dr_info,
1572 alignment_support_scheme, misalignment,
1573 inside_cost, outside_cost,
1574 body_cost_vec, prologue_cost_vec);
1575 }
1576 }
1577
1578 /* Traverse peeling hash table and calculate cost for each peeling option.
1579 Find the one with the lowest cost. */
1580
1581 int
1582 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1583 _vect_peel_extended_info *min)
1584 {
1585 vect_peel_info elem = *slot;
1586 int dummy;
1587 unsigned int inside_cost = 0, outside_cost = 0;
1588 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (min->vinfo);
1589 stmt_vector_for_cost prologue_cost_vec, body_cost_vec,
1590 epilogue_cost_vec;
1591
1592 prologue_cost_vec.create (2);
1593 body_cost_vec.create (2);
1594 epilogue_cost_vec.create (2);
1595
1596 vect_get_peeling_costs_all_drs (loop_vinfo, elem->dr_info, &inside_cost,
1597 &outside_cost, &body_cost_vec,
1598 &prologue_cost_vec, elem->npeel);
1599
1600 body_cost_vec.release ();
1601
1602 outside_cost += vect_get_known_peeling_cost
1603 (loop_vinfo, elem->npeel, &dummy,
1604 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1605 &prologue_cost_vec, &epilogue_cost_vec);
1606
1607 /* Prologue and epilogue costs are added to the target model later.
1608 These costs depend only on the scalar iteration cost, the
1609 number of peeling iterations finally chosen, and the number of
1610 misaligned statements. So discard the information found here. */
1611 prologue_cost_vec.release ();
1612 epilogue_cost_vec.release ();
1613
1614 if (inside_cost < min->inside_cost
1615 || (inside_cost == min->inside_cost
1616 && outside_cost < min->outside_cost))
1617 {
1618 min->inside_cost = inside_cost;
1619 min->outside_cost = outside_cost;
1620 min->peel_info.dr_info = elem->dr_info;
1621 min->peel_info.npeel = elem->npeel;
1622 min->peel_info.count = elem->count;
1623 }
1624
1625 return 1;
1626 }
1627
1628
1629 /* Choose best peeling option by traversing peeling hash table and either
1630 choosing an option with the lowest cost (if cost model is enabled) or the
1631 option that aligns as many accesses as possible. */
1632
1633 static struct _vect_peel_extended_info
1634 vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
1635 loop_vec_info loop_vinfo)
1636 {
1637 struct _vect_peel_extended_info res;
1638
1639 res.peel_info.dr_info = NULL;
1640 res.vinfo = loop_vinfo;
1641
1642 if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1643 {
1644 res.inside_cost = INT_MAX;
1645 res.outside_cost = INT_MAX;
1646 peeling_htab->traverse <_vect_peel_extended_info *,
1647 vect_peeling_hash_get_lowest_cost> (&res);
1648 }
1649 else
1650 {
1651 res.peel_info.count = 0;
1652 peeling_htab->traverse <_vect_peel_extended_info *,
1653 vect_peeling_hash_get_most_frequent> (&res);
1654 res.inside_cost = 0;
1655 res.outside_cost = 0;
1656 }
1657
1658 return res;
1659 }
1660
1661 /* Return true if the new peeling NPEEL is supported. */
1662
1663 static bool
1664 vect_peeling_supportable (loop_vec_info loop_vinfo, dr_vec_info *dr0_info,
1665 unsigned npeel)
1666 {
1667 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1668 enum dr_alignment_support supportable_dr_alignment;
1669
1670 bool dr0_alignment_known_p
1671 = known_alignment_for_access_p (dr0_info,
1672 STMT_VINFO_VECTYPE (dr0_info->stmt));
1673
1674 /* Ensure that all data refs can be vectorized after the peel. */
1675 for (data_reference *dr : datarefs)
1676 {
1677 if (dr == dr0_info->dr)
1678 continue;
1679
1680 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1681 if (!vect_relevant_for_alignment_p (dr_info)
1682 || vect_dr_aligned_if_peeled_dr_is (dr_info, dr0_info))
1683 continue;
1684
1685 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
1686 int misalignment;
1687 unsigned HOST_WIDE_INT alignment;
1688 if (!dr0_alignment_known_p
1689 || !known_alignment_for_access_p (dr_info, vectype)
1690 || !DR_TARGET_ALIGNMENT (dr_info).is_constant (&alignment))
1691 misalignment = DR_MISALIGNMENT_UNKNOWN;
1692 else
1693 {
1694 misalignment = dr_misalignment (dr_info, vectype);
1695 misalignment += npeel * TREE_INT_CST_LOW (DR_STEP (dr_info->dr));
1696 misalignment &= alignment - 1;
1697 }
1698 supportable_dr_alignment
1699 = vect_supportable_dr_alignment (loop_vinfo, dr_info, vectype,
1700 misalignment);
1701 if (supportable_dr_alignment == dr_unaligned_unsupported)
1702 return false;
1703 }
1704
1705 return true;
1706 }
1707
1708 /* Compare two data-references DRA and DRB to group them into chunks
1709 with related alignment. */
1710
1711 static int
1712 dr_align_group_sort_cmp (const void *dra_, const void *drb_)
1713 {
1714 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
1715 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
1716 int cmp;
1717
1718 /* Stabilize sort. */
1719 if (dra == drb)
1720 return 0;
1721
1722 /* Ordering of DRs according to base. */
1723 cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
1724 DR_BASE_ADDRESS (drb));
1725 if (cmp != 0)
1726 return cmp;
1727
1728 /* And according to DR_OFFSET. */
1729 cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
1730 if (cmp != 0)
1731 return cmp;
1732
1733 /* And after step. */
1734 cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
1735 if (cmp != 0)
1736 return cmp;
1737
1738 /* Then sort after DR_INIT. In case of identical DRs sort after stmt UID. */
1739 cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
1740 if (cmp == 0)
1741 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
1742 return cmp;
1743 }
1744
1745 /* Function vect_enhance_data_refs_alignment
1746
1747 This pass will use loop versioning and loop peeling in order to enhance
1748 the alignment of data references in the loop.
1749
1750 FOR NOW: we assume that whatever versioning/peeling takes place, only the
1751 original loop is to be vectorized. Any other loops that are created by
1752 the transformations performed in this pass - are not supposed to be
1753 vectorized. This restriction will be relaxed.
1754
1755 This pass will require a cost model to guide it whether to apply peeling
1756 or versioning or a combination of the two. For example, the scheme that
1757 Intel uses when given a loop with several memory accesses, is as follows:
1758 choose one memory access ('p') whose alignment you want to force by doing
1759 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
1760 other accesses are not necessarily aligned, or (2) use loop versioning to
1761 generate one loop in which all accesses are aligned, and another loop in
1762 which only 'p' is necessarily aligned.
1763
1764 ("Automatic Intra-Register Vectorization for the Intel Architecture",
1765 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Xinmin Tian, International
1766 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1767
1768 Devising a cost model is the most critical aspect of this work. It will
1769 guide us on which access to peel for, whether to use loop versioning, how
1770 many versions to create, etc. The cost model will probably consist of
1771 generic considerations as well as target specific considerations (on
1772 powerpc for example, misaligned stores are more painful than misaligned
1773 loads).
1774
1775 Here are the general steps involved in alignment enhancements:
1776
1777 -- original loop, before alignment analysis:
1778 for (i=0; i<N; i++){
1779 x = q[i]; # DR_MISALIGNMENT(q) = unknown
1780 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1781 }
1782
1783 -- After vect_compute_data_refs_alignment:
1784 for (i=0; i<N; i++){
1785 x = q[i]; # DR_MISALIGNMENT(q) = 3
1786 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1787 }
1788
1789 -- Possibility 1: we do loop versioning:
1790 if (p is aligned) {
1791 for (i=0; i<N; i++){ # loop 1A
1792 x = q[i]; # DR_MISALIGNMENT(q) = 3
1793 p[i] = y; # DR_MISALIGNMENT(p) = 0
1794 }
1795 }
1796 else {
1797 for (i=0; i<N; i++){ # loop 1B
1798 x = q[i]; # DR_MISALIGNMENT(q) = 3
1799 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1800 }
1801 }
1802
1803 -- Possibility 2: we do loop peeling:
1804 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1805 x = q[i];
1806 p[i] = y;
1807 }
1808 for (i = 3; i < N; i++){ # loop 2A
1809 x = q[i]; # DR_MISALIGNMENT(q) = 0
1810 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1811 }
1812
1813 -- Possibility 3: combination of loop peeling and versioning:
1814 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1815 x = q[i];
1816 p[i] = y;
1817 }
1818 if (p is aligned) {
1819 for (i = 3; i<N; i++){ # loop 3A
1820 x = q[i]; # DR_MISALIGNMENT(q) = 0
1821 p[i] = y; # DR_MISALIGNMENT(p) = 0
1822 }
1823 }
1824 else {
1825 for (i = 3; i<N; i++){ # loop 3B
1826 x = q[i]; # DR_MISALIGNMENT(q) = 0
1827 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1828 }
1829 }
1830
1831 These loops are later passed to loop_transform to be vectorized. The
1832 vectorizer will use the alignment information to guide the transformation
1833 (whether to generate regular loads/stores, or with special handling for
1834 misalignment). */
1835
/* Decide whether to peel or to version the loop of LOOP_VINFO (or to do
   neither) in order to improve the alignment of its data references, and
   record that decision in LOOP_VINFO.  Every return statement in this
   function returns opt_result::success ().  */
1836 opt_result
1837 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1838 {
1839 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1840 dr_vec_info *first_store = NULL;
1841 dr_vec_info *dr0_info = NULL;
1842 struct data_reference *dr;
1843 unsigned int i;
1844 bool do_peeling = false;
1845 bool do_versioning = false;
1846 unsigned int npeel = 0;
1847 bool one_misalignment_known = false;
1848 bool one_misalignment_unknown = false;
1849 bool one_dr_unsupportable = false;
1850 dr_vec_info *unsupportable_dr_info = NULL;
1851 unsigned int dr0_same_align_drs = 0, first_store_same_align_drs = 0;
1852 hash_table<peel_info_hasher> peeling_htab (1);
1853
1854 DUMP_VECT_SCOPE ("vect_enhance_data_refs_alignment");
1855
1856 /* Reset data so we can safely be called multiple times. */
1857 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1858 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
1859
1860 if (LOOP_VINFO_DATAREFS (loop_vinfo).is_empty ())
1861 return opt_result::success ();
1862
1863 /* Sort the vector of datarefs so DRs that have the same or dependent
1864 alignment are next to each other. */
1865 auto_vec<data_reference_p> datarefs
1866 = LOOP_VINFO_DATAREFS (loop_vinfo).copy ();
1867 datarefs.qsort (dr_align_group_sort_cmp);
1868
1869 /* Compute the number of DRs that become aligned when we peel
1870 a dataref so it becomes aligned. */
1871 auto_vec<unsigned> n_same_align_refs (datarefs.length ());
1872 n_same_align_refs.quick_grow_cleared (datarefs.length ());
/* Skip leading datarefs that have no DR_BASE_ADDRESS; I0 then indexes
   the first dataref of the first subgroup examined below.  */
1873 unsigned i0;
1874 for (i0 = 0; i0 < datarefs.length (); ++i0)
1875 if (DR_BASE_ADDRESS (datarefs[i0]))
1876 break;
/* I deliberately runs one past the last index so that the final subgroup
   is also closed by the I == datarefs.length () test.  */
1877 for (i = i0 + 1; i <= datarefs.length (); ++i)
1878 {
1879 if (i == datarefs.length ()
1880 || !operand_equal_p (DR_BASE_ADDRESS (datarefs[i0]),
1881 DR_BASE_ADDRESS (datarefs[i]), 0)
1882 || !operand_equal_p (DR_OFFSET (datarefs[i0]),
1883 DR_OFFSET (datarefs[i]), 0)
1884 || !operand_equal_p (DR_STEP (datarefs[i0]),
1885 DR_STEP (datarefs[i]), 0))
1886 {
1887 /* The subgroup [i0, i-1] now only differs in DR_INIT and
1888 possibly DR_TARGET_ALIGNMENT. Still the whole subgroup
1889 will get known misalignment if we align one of the refs
1890 with the largest DR_TARGET_ALIGNMENT. */
1891 for (unsigned j = i0; j < i; ++j)
1892 {
1893 dr_vec_info *dr_infoj = loop_vinfo->lookup_dr (datarefs[j]);
1894 for (unsigned k = i0; k < i; ++k)
1895 {
1896 if (k == j)
1897 continue;
1898 dr_vec_info *dr_infok = loop_vinfo->lookup_dr (datarefs[k]);
1899 if (vect_dr_aligned_if_related_peeled_dr_is (dr_infok,
1900 dr_infoj))
1901 n_same_align_refs[j]++;
1902 }
1903 }
1904 i0 = i;
1905 }
1906 }
1907
1908 /* While cost model enhancements are expected in the future, the high level
1909 view of the code at this time is as follows:
1910
1911 A) If there is a misaligned access then see if peeling to align
1912 this access can make all data references satisfy
1913 vect_supportable_dr_alignment. If so, update data structures
1914 as needed and return true.
1915
1916 B) If peeling wasn't possible and there is a data reference with an
1917 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1918 then see if loop versioning checks can be used to make all data
1919 references satisfy vect_supportable_dr_alignment. If so, update
1920 data structures as needed and return true.
1921
1922 C) If neither peeling nor versioning were successful then return false if
1923 any data reference does not satisfy vect_supportable_dr_alignment.
1924
1925 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1926
1927 Note, Possibility 3 above (which is peeling and versioning together) is not
1928 being done at this time. */
1929
1930 /* (1) Peeling to force alignment. */
1931
1932 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1933 Considerations:
1934 + How many accesses will become aligned due to the peeling
1935 - How many accesses will become unaligned due to the peeling,
1936 and the cost of misaligned accesses.
1937 - The cost of peeling (the extra runtime checks, the increase
1938 in code size). */
1939
1940 FOR_EACH_VEC_ELT (datarefs, i, dr)
1941 {
1942 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1943 if (!vect_relevant_for_alignment_p (dr_info))
1944 continue;
1945
1946 stmt_vec_info stmt_info = dr_info->stmt;
1947 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1948 do_peeling = vector_alignment_reachable_p (dr_info);
1949 if (do_peeling)
1950 {
1951 if (known_alignment_for_access_p (dr_info, vectype))
1952 {
1953 unsigned int npeel_tmp = 0;
1954 bool negative = tree_int_cst_compare (DR_STEP (dr),
1955 size_zero_node) < 0;
1956
1957 /* If known_alignment_for_access_p then we have set
1958 DR_MISALIGNMENT which is only done if we know it at compile
1959 time, so it is safe to assume target alignment is constant.
1960 */
1961 unsigned int target_align =
1962 DR_TARGET_ALIGNMENT (dr_info).to_constant ();
1963 unsigned HOST_WIDE_INT dr_size = vect_get_scalar_dr_size (dr_info);
1964 poly_int64 off = 0;
1965 if (negative)
1966 off = (TYPE_VECTOR_SUBPARTS (vectype) - 1) * -dr_size;
1967 unsigned int mis = dr_misalignment (dr_info, vectype, off);
/* For a non-negative step MIS is negated; assuming TARGET_ALIGN is a
   power of two, the masking below then yields the byte distance up to
   the next aligned address, which is converted to a scalar count.  */
1968 mis = negative ? mis : -mis;
1969 if (mis != 0)
1970 npeel_tmp = (mis & (target_align - 1)) / dr_size;
1971
1972 /* For multiple types, it is possible that the bigger type access
1973 will have more than one peeling option. E.g., a loop with two
1974 types: one of size (vector size / 4), and the other one of
1975 size (vector size / 8). Vectorization factor will be 8. If both
1976 accesses are misaligned by 3, the first one needs one scalar
1977 iteration to be aligned, and the second one needs 5. But the
1978 first one will be aligned also by peeling 5 scalar
1979 iterations, and in that case both accesses will be aligned.
1980 Hence, except for the immediate peeling amount, we also want
1981 to try to add full vector size, while we don't exceed
1982 vectorization factor.
1983 We do this automatically for cost model, since we calculate
1984 cost for every peeling option. */
1985 poly_uint64 nscalars = npeel_tmp;
1986 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1987 {
1988 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1989 nscalars = (STMT_SLP_TYPE (stmt_info)
1990 ? vf * DR_GROUP_SIZE (stmt_info) : vf);
1991 }
1992
1993 /* Save info about DR in the hash table. Also include peeling
1994 amounts according to the explanation above. Indicate
1995 the alignment status when the ref is not aligned.
1996 ??? Rather than using unknown alignment here we should
1997 prune all entries from the peeling hashtable which cause
1998 DRs to be not supported. */
1999 bool supportable_if_not_aligned
2000 = vect_supportable_dr_alignment
2001 (loop_vinfo, dr_info, vectype, DR_MISALIGNMENT_UNKNOWN);
2002 while (known_le (npeel_tmp, nscalars))
2003 {
2004 vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
2005 dr_info, npeel_tmp,
2006 supportable_if_not_aligned);
2007 npeel_tmp += MAX (1, target_align / dr_size);
2008 }
2009
2010 one_misalignment_known = true;
2011 }
2012 else
2013 {
2014 /* If we don't know any misalignment values, we prefer
2015 peeling for data-ref that has the maximum number of data-refs
2016 with the same alignment, unless the target prefers to align
2017 stores over load. */
2018 unsigned same_align_drs = n_same_align_refs[i];
2019 if (!dr0_info
2020 || dr0_same_align_drs < same_align_drs)
2021 {
2022 dr0_same_align_drs = same_align_drs;
2023 dr0_info = dr_info;
2024 }
2025 /* For data-refs with the same number of related
2026 accesses prefer the one where the misalign
2027 computation will be invariant in the outermost loop. */
2028 else if (dr0_same_align_drs == same_align_drs)
2029 {
2030 class loop *ivloop0, *ivloop;
2031 ivloop0 = outermost_invariant_loop_for_expr
2032 (loop, DR_BASE_ADDRESS (dr0_info->dr));
2033 ivloop = outermost_invariant_loop_for_expr
2034 (loop, DR_BASE_ADDRESS (dr));
2035 if ((ivloop && !ivloop0)
2036 || (ivloop && ivloop0
2037 && flow_loop_nested_p (ivloop, ivloop0)))
2038 dr0_info = dr_info;
2039 }
2040
2041 one_misalignment_unknown = true;
2042
2043 /* Check for data refs with unsupportable alignment that
2044 can be peeled. */
2045 enum dr_alignment_support supportable_dr_alignment
2046 = vect_supportable_dr_alignment (loop_vinfo, dr_info, vectype,
2047 DR_MISALIGNMENT_UNKNOWN);
2048 if (supportable_dr_alignment == dr_unaligned_unsupported)
2049 {
2050 one_dr_unsupportable = true;
2051 unsupportable_dr_info = dr_info;
2052 }
2053
2054 if (!first_store && DR_IS_WRITE (dr))
2055 {
2056 first_store = dr_info;
2057 first_store_same_align_drs = same_align_drs;
2058 }
2059 }
2060 }
2061 else
2062 {
2063 if (!aligned_access_p (dr_info, vectype))
2064 {
2065 if (dump_enabled_p ())
2066 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2067 "vector alignment may not be reachable\n");
2068 break;
2069 }
2070 }
2071 }
2072
/* DO_PEELING at this point holds the value computed for the last relevant
   data reference visited by the loop above.  */
2073 /* Check if we can possibly peel the loop. */
2074 if (!vect_can_advance_ivs_p (loop_vinfo)
2075 || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))
2076 || loop->inner)
2077 do_peeling = false;
2078
2079 struct _vect_peel_extended_info peel_for_known_alignment;
2080 struct _vect_peel_extended_info peel_for_unknown_alignment;
2081 struct _vect_peel_extended_info best_peel;
2082
2083 peel_for_unknown_alignment.inside_cost = INT_MAX;
2084 peel_for_unknown_alignment.outside_cost = INT_MAX;
2085 peel_for_unknown_alignment.peel_info.count = 0;
2086
2087 if (do_peeling
2088 && one_misalignment_unknown)
2089 {
2090 /* Check if the target requires to prefer stores over loads, i.e., if
2091 misaligned stores are more expensive than misaligned loads (taking
2092 drs with same alignment into account). */
2093 unsigned int load_inside_cost = 0;
2094 unsigned int load_outside_cost = 0;
2095 unsigned int store_inside_cost = 0;
2096 unsigned int store_outside_cost = 0;
2097 unsigned int estimated_npeels = vect_vf_for_cost (loop_vinfo) / 2;
2098
2099 stmt_vector_for_cost dummy;
2100 dummy.create (2);
2101 vect_get_peeling_costs_all_drs (loop_vinfo, dr0_info,
2102 &load_inside_cost,
2103 &load_outside_cost,
2104 &dummy, &dummy, estimated_npeels);
2105 dummy.release ();
2106
2107 if (first_store)
2108 {
2109 dummy.create (2);
2110 vect_get_peeling_costs_all_drs (loop_vinfo, first_store,
2111 &store_inside_cost,
2112 &store_outside_cost,
2113 &dummy, &dummy,
2114 estimated_npeels);
2115 dummy.release ();
2116 }
2117 else
2118 {
2119 store_inside_cost = INT_MAX;
2120 store_outside_cost = INT_MAX;
2121 }
2122
2123 if (load_inside_cost > store_inside_cost
2124 || (load_inside_cost == store_inside_cost
2125 && load_outside_cost > store_outside_cost))
2126 {
2127 dr0_info = first_store;
2128 dr0_same_align_drs = first_store_same_align_drs;
2129 peel_for_unknown_alignment.inside_cost = store_inside_cost;
2130 peel_for_unknown_alignment.outside_cost = store_outside_cost;
2131 }
2132 else
2133 {
2134 peel_for_unknown_alignment.inside_cost = load_inside_cost;
2135 peel_for_unknown_alignment.outside_cost = load_outside_cost;
2136 }
2137
2138 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
2139 prologue_cost_vec.create (2);
2140 epilogue_cost_vec.create (2);
2141
2142 int dummy2;
2143 peel_for_unknown_alignment.outside_cost += vect_get_known_peeling_cost
2144 (loop_vinfo, estimated_npeels, &dummy2,
2145 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
2146 &prologue_cost_vec, &epilogue_cost_vec);
2147
2148 prologue_cost_vec.release ();
2149 epilogue_cost_vec.release ();
2150
2151 peel_for_unknown_alignment.peel_info.count = dr0_same_align_drs + 1;
2152 }
2153
2154 peel_for_unknown_alignment.peel_info.npeel = 0;
2155 peel_for_unknown_alignment.peel_info.dr_info = dr0_info;
2156
2157 best_peel = peel_for_unknown_alignment;
2158
2159 peel_for_known_alignment.inside_cost = INT_MAX;
2160 peel_for_known_alignment.outside_cost = INT_MAX;
2161 peel_for_known_alignment.peel_info.count = 0;
2162 peel_for_known_alignment.peel_info.dr_info = NULL;
2163
2164 if (do_peeling && one_misalignment_known)
2165 {
2166 /* Peeling is possible, but there is no data access that is not supported
2167 unless aligned. So we try to choose the best possible peeling from
2168 the hash table. */
2169 peel_for_known_alignment = vect_peeling_hash_choose_best_peeling
2170 (&peeling_htab, loop_vinfo);
2171 }
2172
2173 /* Compare costs of peeling for known and unknown alignment. */
2174 if (peel_for_known_alignment.peel_info.dr_info != NULL
2175 && peel_for_unknown_alignment.inside_cost
2176 >= peel_for_known_alignment.inside_cost)
2177 {
2178 best_peel = peel_for_known_alignment;
2179
2180 /* If the best peeling for known alignment has NPEEL == 0, perform no
2181 peeling at all except if there is an unsupportable dr that we can
2182 align. */
2183 if (best_peel.peel_info.npeel == 0 && !one_dr_unsupportable)
2184 do_peeling = false;
2185 }
2186
2187 /* If there is an unsupportable data ref, prefer this over all choices so far
2188 since we'd have to discard a chosen peeling except when it accidentally
2189 aligned the unsupportable data ref. */
2190 if (one_dr_unsupportable)
2191 dr0_info = unsupportable_dr_info;
2192 else if (do_peeling)
2193 {
2194 /* Calculate the penalty for no peeling, i.e. leaving everything as-is.
2195 TODO: Use nopeel_outside_cost or get rid of it? */
2196 unsigned nopeel_inside_cost = 0;
2197 unsigned nopeel_outside_cost = 0;
2198
2199 stmt_vector_for_cost dummy;
2200 dummy.create (2);
2201 vect_get_peeling_costs_all_drs (loop_vinfo, NULL, &nopeel_inside_cost,
2202 &nopeel_outside_cost, &dummy, &dummy, 0);
2203 dummy.release ();
2204
2205 /* Add epilogue costs. As we do not peel for alignment here, no prologue
2206 costs will be recorded. */
2207 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
2208 prologue_cost_vec.create (2);
2209 epilogue_cost_vec.create (2);
2210
2211 int dummy2;
2212 nopeel_outside_cost += vect_get_known_peeling_cost
2213 (loop_vinfo, 0, &dummy2,
2214 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
2215 &prologue_cost_vec, &epilogue_cost_vec);
2216
2217 prologue_cost_vec.release ();
2218 epilogue_cost_vec.release ();
2219
2220 npeel = best_peel.peel_info.npeel;
2221 dr0_info = best_peel.peel_info.dr_info;
2222
2223 /* If no peeling is not more expensive than the best peeling we
2224 have so far, don't perform any peeling. */
2225 if (nopeel_inside_cost <= best_peel.inside_cost)
2226 do_peeling = false;
2227 }
2228
2229 if (do_peeling)
2230 {
2231 stmt_vec_info stmt_info = dr0_info->stmt;
2232 if (known_alignment_for_access_p (dr0_info,
2233 STMT_VINFO_VECTYPE (stmt_info)))
2234 {
2235 bool negative = tree_int_cst_compare (DR_STEP (dr0_info->dr),
2236 size_zero_node) < 0;
2237 if (!npeel)
2238 {
2239 /* Since it's known at compile time, compute the number of
2240 iterations in the peeled loop (the peeling factor) for use in
2241 updating DR_MISALIGNMENT values. The peeling factor is the
2242 vectorization factor minus the misalignment as an element
2243 count. */
2244 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2245 poly_int64 off = 0;
2246 if (negative)
2247 off = ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
2248 * -TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))));
2249 unsigned int mis
2250 = dr_misalignment (dr0_info, vectype, off);
2251 mis = negative ? mis : -mis;
2252 /* If known_alignment_for_access_p then we have set
2253 DR_MISALIGNMENT which is only done if we know it at compile
2254 time, so it is safe to assume target alignment is constant.
2255 */
2256 unsigned int target_align =
2257 DR_TARGET_ALIGNMENT (dr0_info).to_constant ();
2258 npeel = ((mis & (target_align - 1))
2259 / vect_get_scalar_dr_size (dr0_info));
2260 }
2261
2262 /* For interleaved data access every iteration accesses all the
2263 members of the group, therefore we divide the number of iterations
2264 by the group size. */
2265 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2266 npeel /= DR_GROUP_SIZE (stmt_info);
2267
2268 if (dump_enabled_p ())
2269 dump_printf_loc (MSG_NOTE, vect_location,
2270 "Try peeling by %d\n", npeel);
2271 }
2272
2273 /* Ensure that all datarefs can be vectorized after the peel. */
2274 if (!vect_peeling_supportable (loop_vinfo, dr0_info, npeel))
2275 do_peeling = false;
2276
2277 /* Check if all datarefs are supportable and log. */
2278 if (do_peeling
2279 && npeel == 0
2280 && known_alignment_for_access_p (dr0_info,
2281 STMT_VINFO_VECTYPE (stmt_info)))
2282 return opt_result::success ();
2283
2284 /* Cost model #1 - honor --param vect-max-peeling-for-alignment. */
2285 if (do_peeling)
2286 {
2287 unsigned max_allowed_peel
2288 = param_vect_max_peeling_for_alignment;
2289 if (loop_cost_model (loop) <= VECT_COST_MODEL_CHEAP)
2290 max_allowed_peel = 0;
2291 if (max_allowed_peel != (unsigned)-1)
2292 {
2293 unsigned max_peel = npeel;
2294 if (max_peel == 0)
2295 {
2296 poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr0_info);
2297 unsigned HOST_WIDE_INT target_align_c;
2298 if (target_align.is_constant (&target_align_c))
2299 max_peel =
2300 target_align_c / vect_get_scalar_dr_size (dr0_info) - 1;
2301 else
2302 {
2303 do_peeling = false;
2304 if (dump_enabled_p ())
2305 dump_printf_loc (MSG_NOTE, vect_location,
2306 "Disable peeling, max peels set and vector"
2307 " alignment unknown\n");
2308 }
2309 }
2310 if (max_peel > max_allowed_peel)
2311 {
2312 do_peeling = false;
2313 if (dump_enabled_p ())
2314 dump_printf_loc (MSG_NOTE, vect_location,
2315 "Disable peeling, max peels reached: %d\n", max_peel);
2316 }
2317 }
2318 }
2319
2320 /* Cost model #2 - if peeling may result in a remaining loop not
2321 iterating enough to be vectorized then do not peel. Since this
2322 is a cost heuristic rather than a correctness decision, use the
2323 most likely runtime value for variable vectorization factors. */
2324 if (do_peeling
2325 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2326 {
2327 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2328 unsigned int max_peel = npeel == 0 ? assumed_vf - 1 : npeel;
2329 if ((unsigned HOST_WIDE_INT) LOOP_VINFO_INT_NITERS (loop_vinfo)
2330 < assumed_vf + max_peel)
2331 do_peeling = false;
2332 }
2333
2334 if (do_peeling)
2335 {
2336 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
2337 If the misalignment of DR_i is identical to that of dr0 then set
2338 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
2339 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
2340 by the peeling factor times the element size of DR_i (MOD the
2341 vectorization factor times the size). Otherwise, the
2342 misalignment of DR_i must be set to unknown. */
2343 FOR_EACH_VEC_ELT (datarefs, i, dr)
2344 if (dr != dr0_info->dr)
2345 {
2346 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2347 if (!vect_relevant_for_alignment_p (dr_info))
2348 continue;
2349
2350 vect_update_misalignment_for_peel (dr_info, dr0_info, npeel);
2351 }
2352
2353 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0_info;
/* NPEEL can only be zero here when the misalignment of DR0 is not known
   at compile time, because the known case with NPEEL == 0 already
   returned above; -1 is recorded for that situation.  */
2354 if (npeel)
2355 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
2356 else
2357 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = -1;
2358 SET_DR_MISALIGNMENT (dr0_info,
2359 vect_dr_misalign_for_aligned_access (dr0_info));
2360 if (dump_enabled_p ())
2361 {
2362 dump_printf_loc (MSG_NOTE, vect_location,
2363 "Alignment of access forced using peeling.\n");
2364 dump_printf_loc (MSG_NOTE, vect_location,
2365 "Peeling for alignment will be applied.\n");
2366 }
2367
2368 /* The inside-loop cost will be accounted for in vectorizable_load
2369 and vectorizable_store correctly with adjusted alignments.
2370 Drop the body_cst_vec on the floor here. */
2371 return opt_result::success ();
2372 }
2373 }
2374
2375 /* (2) Versioning to force alignment. */
2376
2377 /* Try versioning if:
2378 1) optimize loop for speed and the cost-model is not cheap
2379 2) there is at least one unsupported misaligned data ref with an unknown
2380 misalignment, and
2381 3) all misaligned data refs with a known misalignment are supported, and
2382 4) the number of runtime alignment checks is within reason. */
2383
2384 do_versioning
2385 = (optimize_loop_nest_for_speed_p (loop)
2386 && !loop->inner /* FORNOW */
2387 && loop_cost_model (loop) > VECT_COST_MODEL_CHEAP);
2388
2389 if (do_versioning)
2390 {
2391 FOR_EACH_VEC_ELT (datarefs, i, dr)
2392 {
2393 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2394 if (!vect_relevant_for_alignment_p (dr_info))
2395 continue;
2396
2397 stmt_vec_info stmt_info = dr_info->stmt;
2398 if (STMT_VINFO_STRIDED_P (stmt_info))
2399 {
2400 do_versioning = false;
2401 break;
2402 }
2403
2404 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2405 bool negative = tree_int_cst_compare (DR_STEP (dr),
2406 size_zero_node) < 0;
2407 poly_int64 off = 0;
2408 if (negative)
2409 off = ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
2410 * -TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))));
2411 int misalignment;
2412 if ((misalignment = dr_misalignment (dr_info, vectype, off)) == 0)
2413 continue;
2414
2415 enum dr_alignment_support supportable_dr_alignment
2416 = vect_supportable_dr_alignment (loop_vinfo, dr_info, vectype,
2417 misalignment);
2418 if (supportable_dr_alignment == dr_unaligned_unsupported)
2419 {
/* Give up on versioning when the unsupported misalignment is a known
   value, or when the limit on runtime alignment checks is reached.  */
2420 if (misalignment != DR_MISALIGNMENT_UNKNOWN
2421 || (LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
2422 >= (unsigned) param_vect_max_version_for_alignment_checks))
2423 {
2424 do_versioning = false;
2425 break;
2426 }
2427
2428 /* At present we don't support versioning for alignment
2429 with variable VF, since there's no guarantee that the
2430 VF is a power of two. We could relax this if we added
2431 a way of enforcing a power-of-two size. */
2432 unsigned HOST_WIDE_INT size;
2433 if (!GET_MODE_SIZE (TYPE_MODE (vectype)).is_constant (&size))
2434 {
2435 do_versioning = false;
2436 break;
2437 }
2438
2439 /* Forcing alignment in the first iteration is no good if
2440 we don't keep it across iterations. For now, just disable
2441 versioning in this case.
2442 ?? We could actually unroll the loop to achieve the required
2443 overall step alignment, and forcing the alignment could be
2444 done by doing some iterations of the non-vectorized loop. */
2445 if (!multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
2446 * DR_STEP_ALIGNMENT (dr),
2447 DR_TARGET_ALIGNMENT (dr_info)))
2448 {
2449 do_versioning = false;
2450 break;
2451 }
2452
2453 /* The rightmost bits of an aligned address must be zeros.
2454 Construct the mask needed for this test. For example,
2455 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
2456 mask must be 15 = 0xf. */
2457 int mask = size - 1;
2458
2459 /* FORNOW: use the same mask to test all potentially unaligned
2460 references in the loop. */
2461 if (LOOP_VINFO_PTR_MASK (loop_vinfo)
2462 && LOOP_VINFO_PTR_MASK (loop_vinfo) != mask)
2463 {
2464 do_versioning = false;
2465 break;
2466 }
2467
2468 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
2469 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (stmt_info);
2470 }
2471 }
2472
2473 /* Versioning requires at least one misaligned data reference. */
2474 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2475 do_versioning = false;
/* If versioning was abandoned part-way, discard the statements that were
   already collected for runtime alignment checks.  */
2476 else if (!do_versioning)
2477 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
2478 }
2479
2480 if (do_versioning)
2481 {
2482 const vec<stmt_vec_info> &may_misalign_stmts
2483 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
2484 stmt_vec_info stmt_info;
2485
2486 /* It can now be assumed that the data references in the statements
2487 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
2488 of the loop being vectorized. */
2489 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt_info)
2490 {
2491 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
2492 SET_DR_MISALIGNMENT (dr_info,
2493 vect_dr_misalign_for_aligned_access (dr_info));
2494 if (dump_enabled_p ())
2495 dump_printf_loc (MSG_NOTE, vect_location,
2496 "Alignment of access forced using versioning.\n");
2497 }
2498
2499 if (dump_enabled_p ())
2500 dump_printf_loc (MSG_NOTE, vect_location,
2501 "Versioning for alignment will be applied.\n");
2502
2503 /* Peeling and versioning can't be done together at this time. */
2504 gcc_assert (! (do_peeling && do_versioning));
2505
2506 return opt_result::success ();
2507 }
2508
2509 /* This point is reached if neither peeling nor versioning is being done. */
2510 gcc_assert (! (do_peeling || do_versioning));
2511
2512 return opt_result::success ();
2513 }
2514
2515
2516 /* Function vect_analyze_data_refs_alignment
2517
2518 Analyze the alignment of the data-references in the loop.
2519 Return FALSE if a data reference is found that cannot be vectorized. */
2520
2521 opt_result
2522 vect_analyze_data_refs_alignment (loop_vec_info loop_vinfo)
2523 {
2524 DUMP_VECT_SCOPE ("vect_analyze_data_refs_alignment");
2525
2526 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2527 struct data_reference *dr;
2528 unsigned int i;
2529
2530 vect_record_base_alignments (loop_vinfo);
2531 FOR_EACH_VEC_ELT (datarefs, i, dr)
2532 {
2533 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2534 if (STMT_VINFO_VECTORIZABLE (dr_info->stmt))
2535 {
2536 if (STMT_VINFO_GROUPED_ACCESS (dr_info->stmt)
2537 && DR_GROUP_FIRST_ELEMENT (dr_info->stmt) != dr_info->stmt)
2538 continue;
2539 vect_compute_data_ref_alignment (loop_vinfo, dr_info,
2540 STMT_VINFO_VECTYPE (dr_info->stmt));
2541 }
2542 }
2543
2544 return opt_result::success ();
2545 }
2546
2547
2548 /* Analyze alignment of DRs of stmts in NODE. */
2549
2550 static bool
2551 vect_slp_analyze_node_alignment (vec_info *vinfo, slp_tree node)
2552 {
2553 /* Alignment is maintained in the first element of the group. */
2554 stmt_vec_info first_stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2555 first_stmt_info = DR_GROUP_FIRST_ELEMENT (first_stmt_info);
2556 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (first_stmt_info);
2557 tree vectype = SLP_TREE_VECTYPE (node);
2558 poly_uint64 vector_alignment
2559 = exact_div (targetm.vectorize.preferred_vector_alignment (vectype),
2560 BITS_PER_UNIT);
2561 if (dr_info->misalignment == DR_MISALIGNMENT_UNINITIALIZED)
2562 vect_compute_data_ref_alignment (vinfo, dr_info, SLP_TREE_VECTYPE (node));
2563 /* Re-analyze alignment when we're facing a vectorization with a bigger
2564 alignment requirement. */
2565 else if (known_lt (dr_info->target_alignment, vector_alignment))
2566 {
2567 poly_uint64 old_target_alignment = dr_info->target_alignment;
2568 int old_misalignment = dr_info->misalignment;
2569 vect_compute_data_ref_alignment (vinfo, dr_info, SLP_TREE_VECTYPE (node));
2570 /* But keep knowledge about a smaller alignment. */
2571 if (old_misalignment != DR_MISALIGNMENT_UNKNOWN
2572 && dr_info->misalignment == DR_MISALIGNMENT_UNKNOWN)
2573 {
2574 dr_info->target_alignment = old_target_alignment;
2575 dr_info->misalignment = old_misalignment;
2576 }
2577 }
2578 /* When we ever face unordered target alignments the first one wins in terms
2579 of analyzing and the other will become unknown in dr_misalignment. */
2580 return true;
2581 }
2582
2583 /* Function vect_slp_analyze_instance_alignment
2584
2585 Analyze the alignment of the data-references in the SLP instance.
2586 Return FALSE if a data reference is found that cannot be vectorized. */
2587
2588 bool
2589 vect_slp_analyze_instance_alignment (vec_info *vinfo,
2590 slp_instance instance)
2591 {
2592 DUMP_VECT_SCOPE ("vect_slp_analyze_instance_alignment");
2593
2594 slp_tree node;
2595 unsigned i;
2596 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
2597 if (! vect_slp_analyze_node_alignment (vinfo, node))
2598 return false;
2599
2600 if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_store
2601 && ! vect_slp_analyze_node_alignment
2602 (vinfo, SLP_INSTANCE_TREE (instance)))
2603 return false;
2604
2605 return true;
2606 }
2607
2608
2609 /* Analyze groups of accesses: check that DR_INFO belongs to a group of
2610 accesses of legal size, step, etc. Detect gaps, single element
2611 interleaving, and other special cases. Set grouped access info.
2612 Collect groups of strided stores for further use in SLP analysis.
2613 Worker for vect_analyze_group_access. */
2614
2615 static bool
2616 vect_analyze_group_access_1 (vec_info *vinfo, dr_vec_info *dr_info)
2617 {
2618 data_reference *dr = dr_info->dr;
2619 tree step = DR_STEP (dr);
2620 tree scalar_type = TREE_TYPE (DR_REF (dr));
2621 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2622 stmt_vec_info stmt_info = dr_info->stmt;
2623 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
2624 bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo);
2625 HOST_WIDE_INT dr_step = -1;
2626 HOST_WIDE_INT groupsize, last_accessed_element = 1;
2627 bool slp_impossible = false;
2628
2629 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2630 size of the interleaving group (including gaps). */
2631 if (tree_fits_shwi_p (step))
2632 {
2633 dr_step = tree_to_shwi (step);
2634 /* Check that STEP is a multiple of type size. Otherwise there is
2635 a non-element-sized gap at the end of the group which we
2636 cannot represent in DR_GROUP_GAP or DR_GROUP_SIZE.
2637 ??? As we can handle non-constant step fine here we should
2638 simply remove uses of DR_GROUP_GAP between the last and first
2639 element and instead rely on DR_STEP. DR_GROUP_SIZE then would
2640 simply not include that gap. */
2641 if ((dr_step % type_size) != 0)
2642 {
2643 if (dump_enabled_p ())
2644 dump_printf_loc (MSG_NOTE, vect_location,
2645 "Step %T is not a multiple of the element size"
2646 " for %T\n",
2647 step, DR_REF (dr));
2648 return false;
2649 }
2650 groupsize = absu_hwi (dr_step) / type_size;
2651 }
2652 else
2653 groupsize = 0;
2654
2655 /* Not consecutive access is possible only if it is a part of interleaving. */
2656 if (!DR_GROUP_FIRST_ELEMENT (stmt_info))
2657 {
2658 /* Check if it this DR is a part of interleaving, and is a single
2659 element of the group that is accessed in the loop. */
2660
2661 /* Gaps are supported only for loads. STEP must be a multiple of the type
2662 size. */
2663 if (DR_IS_READ (dr)
2664 && (dr_step % type_size) == 0
2665 && groupsize > 0
2666 /* This could be UINT_MAX but as we are generating code in a very
2667 inefficient way we have to cap earlier.
2668 See PR91403 for example. */
2669 && groupsize <= 4096)
2670 {
2671 DR_GROUP_FIRST_ELEMENT (stmt_info) = stmt_info;
2672 DR_GROUP_SIZE (stmt_info) = groupsize;
2673 DR_GROUP_GAP (stmt_info) = groupsize - 1;
2674 if (dump_enabled_p ())
2675 dump_printf_loc (MSG_NOTE, vect_location,
2676 "Detected single element interleaving %T"
2677 " step %T\n",
2678 DR_REF (dr), step);
2679
2680 return true;
2681 }
2682
2683 if (dump_enabled_p ())
2684 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2685 "not consecutive access %G", stmt_info->stmt);
2686
2687 if (bb_vinfo)
2688 {
2689 /* Mark the statement as unvectorizable. */
2690 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
2691 return true;
2692 }
2693
2694 if (dump_enabled_p ())
2695 dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
2696 STMT_VINFO_STRIDED_P (stmt_info) = true;
2697 return true;
2698 }
2699
2700 if (DR_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info)
2701 {
2702 /* First stmt in the interleaving chain. Check the chain. */
2703 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2704 struct data_reference *data_ref = dr;
2705 unsigned int count = 1;
2706 tree prev_init = DR_INIT (data_ref);
2707 HOST_WIDE_INT diff, gaps = 0;
2708
2709 /* By construction, all group members have INTEGER_CST DR_INITs. */
2710 while (next)
2711 {
2712 /* We never have the same DR multiple times. */
2713 gcc_assert (tree_int_cst_compare (DR_INIT (data_ref),
2714 DR_INIT (STMT_VINFO_DATA_REF (next))) != 0);
2715
2716 data_ref = STMT_VINFO_DATA_REF (next);
2717
2718 /* All group members have the same STEP by construction. */
2719 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2720
2721 /* Check that the distance between two accesses is equal to the type
2722 size. Otherwise, we have gaps. */
2723 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2724 - TREE_INT_CST_LOW (prev_init)) / type_size;
2725 if (diff != 1)
2726 {
2727 /* FORNOW: SLP of accesses with gaps is not supported. */
2728 slp_impossible = true;
2729 if (DR_IS_WRITE (data_ref))
2730 {
2731 if (dump_enabled_p ())
2732 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2733 "interleaved store with gaps\n");
2734 return false;
2735 }
2736
2737 gaps += diff - 1;
2738 }
2739
2740 last_accessed_element += diff;
2741
2742 /* Store the gap from the previous member of the group. If there is no
2743 gap in the access, DR_GROUP_GAP is always 1. */
2744 DR_GROUP_GAP (next) = diff;
2745
2746 prev_init = DR_INIT (data_ref);
2747 next = DR_GROUP_NEXT_ELEMENT (next);
2748 /* Count the number of data-refs in the chain. */
2749 count++;
2750 }
2751
2752 if (groupsize == 0)
2753 groupsize = count + gaps;
2754
2755 /* This could be UINT_MAX but as we are generating code in a very
2756 inefficient way we have to cap earlier. See PR78699 for example. */
2757 if (groupsize > 4096)
2758 {
2759 if (dump_enabled_p ())
2760 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2761 "group is too large\n");
2762 return false;
2763 }
2764
2765 /* Check that the size of the interleaving is equal to count for stores,
2766 i.e., that there are no gaps. */
2767 if (groupsize != count
2768 && !DR_IS_READ (dr))
2769 {
2770 groupsize = count;
2771 STMT_VINFO_STRIDED_P (stmt_info) = true;
2772 }
2773
2774 /* If there is a gap after the last load in the group it is the
2775 difference between the groupsize and the last accessed
2776 element.
2777 When there is no gap, this difference should be 0. */
2778 DR_GROUP_GAP (stmt_info) = groupsize - last_accessed_element;
2779
2780 DR_GROUP_SIZE (stmt_info) = groupsize;
2781 if (dump_enabled_p ())
2782 {
2783 dump_printf_loc (MSG_NOTE, vect_location,
2784 "Detected interleaving ");
2785 if (DR_IS_READ (dr))
2786 dump_printf (MSG_NOTE, "load ");
2787 else if (STMT_VINFO_STRIDED_P (stmt_info))
2788 dump_printf (MSG_NOTE, "strided store ");
2789 else
2790 dump_printf (MSG_NOTE, "store ");
2791 dump_printf (MSG_NOTE, "of size %u\n",
2792 (unsigned)groupsize);
2793 dump_printf_loc (MSG_NOTE, vect_location, "\t%G", stmt_info->stmt);
2794 next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2795 while (next)
2796 {
2797 if (DR_GROUP_GAP (next) != 1)
2798 dump_printf_loc (MSG_NOTE, vect_location,
2799 "\t<gap of %d elements>\n",
2800 DR_GROUP_GAP (next) - 1);
2801 dump_printf_loc (MSG_NOTE, vect_location, "\t%G", next->stmt);
2802 next = DR_GROUP_NEXT_ELEMENT (next);
2803 }
2804 if (DR_GROUP_GAP (stmt_info) != 0)
2805 dump_printf_loc (MSG_NOTE, vect_location,
2806 "\t<gap of %d elements>\n",
2807 DR_GROUP_GAP (stmt_info));
2808 }
2809
2810 /* SLP: create an SLP data structure for every interleaving group of
2811 stores for further analysis in vect_analyse_slp. */
2812 if (DR_IS_WRITE (dr) && !slp_impossible)
2813 {
2814 if (loop_vinfo)
2815 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt_info);
2816 if (bb_vinfo)
2817 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt_info);
2818 }
2819 }
2820
2821 return true;
2822 }
2823
2824 /* Analyze groups of accesses: check that DR_INFO belongs to a group of
2825 accesses of legal size, step, etc. Detect gaps, single element
2826 interleaving, and other special cases. Set grouped access info.
2827 Collect groups of strided stores for further use in SLP analysis. */
2828
2829 static bool
2830 vect_analyze_group_access (vec_info *vinfo, dr_vec_info *dr_info)
2831 {
2832 if (!vect_analyze_group_access_1 (vinfo, dr_info))
2833 {
2834 /* Dissolve the group if present. */
2835 stmt_vec_info stmt_info = DR_GROUP_FIRST_ELEMENT (dr_info->stmt);
2836 while (stmt_info)
2837 {
2838 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2839 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2840 DR_GROUP_NEXT_ELEMENT (stmt_info) = NULL;
2841 stmt_info = next;
2842 }
2843 return false;
2844 }
2845 return true;
2846 }
2847
2848 /* Analyze the access pattern of the data-reference DR_INFO.
2849 In case of non-consecutive accesses call vect_analyze_group_access() to
2850 analyze groups of accesses. */
2851
2852 static bool
2853 vect_analyze_data_ref_access (vec_info *vinfo, dr_vec_info *dr_info)
2854 {
2855 data_reference *dr = dr_info->dr;
2856 tree step = DR_STEP (dr);
2857 tree scalar_type = TREE_TYPE (DR_REF (dr));
2858 stmt_vec_info stmt_info = dr_info->stmt;
2859 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
2860 class loop *loop = NULL;
2861
2862 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
2863 return true;
2864
2865 if (loop_vinfo)
2866 loop = LOOP_VINFO_LOOP (loop_vinfo);
2867
2868 if (loop_vinfo && !step)
2869 {
2870 if (dump_enabled_p ())
2871 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2872 "bad data-ref access in loop\n");
2873 return false;
2874 }
2875
2876 /* Allow loads with zero step in inner-loop vectorization. */
2877 if (loop_vinfo && integer_zerop (step))
2878 {
2879 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2880 if (!nested_in_vect_loop_p (loop, stmt_info))
2881 return DR_IS_READ (dr);
2882 /* Allow references with zero step for outer loops marked
2883 with pragma omp simd only - it guarantees absence of
2884 loop-carried dependencies between inner loop iterations. */
2885 if (loop->safelen < 2)
2886 {
2887 if (dump_enabled_p ())
2888 dump_printf_loc (MSG_NOTE, vect_location,
2889 "zero step in inner loop of nest\n");
2890 return false;
2891 }
2892 }
2893
2894 if (loop && nested_in_vect_loop_p (loop, stmt_info))
2895 {
2896 /* Interleaved accesses are not yet supported within outer-loop
2897 vectorization for references in the inner-loop. */
2898 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2899
2900 /* For the rest of the analysis we use the outer-loop step. */
2901 step = STMT_VINFO_DR_STEP (stmt_info);
2902 if (integer_zerop (step))
2903 {
2904 if (dump_enabled_p ())
2905 dump_printf_loc (MSG_NOTE, vect_location,
2906 "zero step in outer loop.\n");
2907 return DR_IS_READ (dr);
2908 }
2909 }
2910
2911 /* Consecutive? */
2912 if (TREE_CODE (step) == INTEGER_CST)
2913 {
2914 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2915 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2916 || (dr_step < 0
2917 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2918 {
2919 /* Mark that it is not interleaving. */
2920 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2921 return true;
2922 }
2923 }
2924
2925 if (loop && nested_in_vect_loop_p (loop, stmt_info))
2926 {
2927 if (dump_enabled_p ())
2928 dump_printf_loc (MSG_NOTE, vect_location,
2929 "grouped access in outer loop.\n");
2930 return false;
2931 }
2932
2933
2934 /* Assume this is a DR handled by non-constant strided load case. */
2935 if (TREE_CODE (step) != INTEGER_CST)
2936 return (STMT_VINFO_STRIDED_P (stmt_info)
2937 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2938 || vect_analyze_group_access (vinfo, dr_info)));
2939
2940 /* Not consecutive access - check if it's a part of interleaving group. */
2941 return vect_analyze_group_access (vinfo, dr_info);
2942 }
2943
2944 /* Compare two data-references DRA and DRB to group them into chunks
2945 suitable for grouping. */
2946
2947 static int
2948 dr_group_sort_cmp (const void *dra_, const void *drb_)
2949 {
2950 dr_vec_info *dra_info = *(dr_vec_info **)const_cast<void *>(dra_);
2951 dr_vec_info *drb_info = *(dr_vec_info **)const_cast<void *>(drb_);
2952 data_reference_p dra = dra_info->dr;
2953 data_reference_p drb = drb_info->dr;
2954 int cmp;
2955
2956 /* Stabilize sort. */
2957 if (dra == drb)
2958 return 0;
2959
2960 /* Different group IDs lead never belong to the same group. */
2961 if (dra_info->group != drb_info->group)
2962 return dra_info->group < drb_info->group ? -1 : 1;
2963
2964 /* Ordering of DRs according to base. */
2965 cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2966 DR_BASE_ADDRESS (drb));
2967 if (cmp != 0)
2968 return cmp;
2969
2970 /* And according to DR_OFFSET. */
2971 cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2972 if (cmp != 0)
2973 return cmp;
2974
2975 /* Put reads before writes. */
2976 if (DR_IS_READ (dra) != DR_IS_READ (drb))
2977 return DR_IS_READ (dra) ? -1 : 1;
2978
2979 /* Then sort after access size. */
2980 cmp = data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2981 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2982 if (cmp != 0)
2983 return cmp;
2984
2985 /* And after step. */
2986 cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
2987 if (cmp != 0)
2988 return cmp;
2989
2990 /* Then sort after DR_INIT. In case of identical DRs sort after stmt UID. */
2991 cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
2992 if (cmp == 0)
2993 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2994 return cmp;
2995 }
2996
2997 /* If OP is the result of a conversion, return the unconverted value,
2998 otherwise return null. */
2999
3000 static tree
3001 strip_conversion (tree op)
3002 {
3003 if (TREE_CODE (op) != SSA_NAME)
3004 return NULL_TREE;
3005 gimple *stmt = SSA_NAME_DEF_STMT (op);
3006 if (!is_gimple_assign (stmt)
3007 || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (stmt)))
3008 return NULL_TREE;
3009 return gimple_assign_rhs1 (stmt);
3010 }
3011
3012 /* Return true if vectorizable_* routines can handle statements STMT1_INFO
3013 and STMT2_INFO being in a single group. When ALLOW_SLP_P, masked loads can
3014 be grouped in SLP mode. */
3015
3016 static bool
3017 can_group_stmts_p (stmt_vec_info stmt1_info, stmt_vec_info stmt2_info,
3018 bool allow_slp_p)
3019 {
3020 if (gimple_assign_single_p (stmt1_info->stmt))
3021 return gimple_assign_single_p (stmt2_info->stmt);
3022
3023 gcall *call1 = dyn_cast <gcall *> (stmt1_info->stmt);
3024 if (call1 && gimple_call_internal_p (call1))
3025 {
3026 /* Check for two masked loads or two masked stores. */
3027 gcall *call2 = dyn_cast <gcall *> (stmt2_info->stmt);
3028 if (!call2 || !gimple_call_internal_p (call2))
3029 return false;
3030 internal_fn ifn = gimple_call_internal_fn (call1);
3031 if (ifn != IFN_MASK_LOAD && ifn != IFN_MASK_STORE)
3032 return false;
3033 if (ifn != gimple_call_internal_fn (call2))
3034 return false;
3035
3036 /* Check that the masks are the same. Cope with casts of masks,
3037 like those created by build_mask_conversion. */
3038 tree mask1 = gimple_call_arg (call1, 2);
3039 tree mask2 = gimple_call_arg (call2, 2);
3040 if (!operand_equal_p (mask1, mask2, 0)
3041 && (ifn == IFN_MASK_STORE || !allow_slp_p))
3042 {
3043 mask1 = strip_conversion (mask1);
3044 if (!mask1)
3045 return false;
3046 mask2 = strip_conversion (mask2);
3047 if (!mask2)
3048 return false;
3049 if (!operand_equal_p (mask1, mask2, 0))
3050 return false;
3051 }
3052 return true;
3053 }
3054
3055 return false;
3056 }
3057
3058 /* Function vect_analyze_data_ref_accesses.
3059
3060 Analyze the access pattern of all the data references in the loop.
3061
3062 FORNOW: the only access pattern that is considered vectorizable is a
3063 simple step 1 (consecutive) access.
3064
3065 FORNOW: handle only arrays and pointer accesses. */
3066
3067 opt_result
3068 vect_analyze_data_ref_accesses (vec_info *vinfo,
3069 vec<int> *dataref_groups)
3070 {
3071 unsigned int i;
3072 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
3073
3074 DUMP_VECT_SCOPE ("vect_analyze_data_ref_accesses");
3075
3076 if (datarefs.is_empty ())
3077 return opt_result::success ();
3078
3079 /* Sort the array of datarefs to make building the interleaving chains
3080 linear. Don't modify the original vector's order, it is needed for
3081 determining what dependencies are reversed. */
3082 vec<dr_vec_info *> datarefs_copy;
3083 datarefs_copy.create (datarefs.length ());
3084 for (unsigned i = 0; i < datarefs.length (); i++)
3085 {
3086 dr_vec_info *dr_info = vinfo->lookup_dr (datarefs[i]);
3087 /* If the caller computed DR grouping use that, otherwise group by
3088 basic blocks. */
3089 if (dataref_groups)
3090 dr_info->group = (*dataref_groups)[i];
3091 else
3092 dr_info->group = gimple_bb (DR_STMT (datarefs[i]))->index;
3093 datarefs_copy.quick_push (dr_info);
3094 }
3095 datarefs_copy.qsort (dr_group_sort_cmp);
3096 hash_set<stmt_vec_info> to_fixup;
3097
3098 /* Build the interleaving chains. */
3099 for (i = 0; i < datarefs_copy.length () - 1;)
3100 {
3101 dr_vec_info *dr_info_a = datarefs_copy[i];
3102 data_reference_p dra = dr_info_a->dr;
3103 int dra_group_id = dr_info_a->group;
3104 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
3105 stmt_vec_info lastinfo = NULL;
3106 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
3107 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a))
3108 {
3109 ++i;
3110 continue;
3111 }
3112 for (i = i + 1; i < datarefs_copy.length (); ++i)
3113 {
3114 dr_vec_info *dr_info_b = datarefs_copy[i];
3115 data_reference_p drb = dr_info_b->dr;
3116 int drb_group_id = dr_info_b->group;
3117 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
3118 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_b)
3119 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
3120 break;
3121
3122 /* ??? Imperfect sorting (non-compatible types, non-modulo
3123 accesses, same accesses) can lead to a group to be artificially
3124 split here as we don't just skip over those. If it really
3125 matters we can push those to a worklist and re-iterate
3126 over them. The we can just skip ahead to the next DR here. */
3127
3128 /* DRs in a different DR group should not be put into the same
3129 interleaving group. */
3130 if (dra_group_id != drb_group_id)
3131 break;
3132
3133 /* Check that the data-refs have same first location (except init)
3134 and they are both either store or load (not load and store,
3135 not masked loads or stores). */
3136 if (DR_IS_READ (dra) != DR_IS_READ (drb)
3137 || data_ref_compare_tree (DR_BASE_ADDRESS (dra),
3138 DR_BASE_ADDRESS (drb)) != 0
3139 || data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb)) != 0
3140 || !can_group_stmts_p (stmtinfo_a, stmtinfo_b, true))
3141 break;
3142
3143 /* Check that the data-refs have the same constant size. */
3144 tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
3145 tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
3146 if (!tree_fits_uhwi_p (sza)
3147 || !tree_fits_uhwi_p (szb)
3148 || !tree_int_cst_equal (sza, szb))
3149 break;
3150
3151 /* Check that the data-refs have the same step. */
3152 if (data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb)) != 0)
3153 break;
3154
3155 /* Check the types are compatible.
3156 ??? We don't distinguish this during sorting. */
3157 if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
3158 TREE_TYPE (DR_REF (drb))))
3159 break;
3160
3161 /* Check that the DR_INITs are compile-time constants. */
3162 if (TREE_CODE (DR_INIT (dra)) != INTEGER_CST
3163 || TREE_CODE (DR_INIT (drb)) != INTEGER_CST)
3164 break;
3165
3166 /* Different .GOMP_SIMD_LANE calls still give the same lane,
3167 just hold extra information. */
3168 if (STMT_VINFO_SIMD_LANE_ACCESS_P (stmtinfo_a)
3169 && STMT_VINFO_SIMD_LANE_ACCESS_P (stmtinfo_b)
3170 && data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb)) == 0)
3171 break;
3172
3173 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
3174 HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
3175 HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
3176 HOST_WIDE_INT init_prev
3177 = TREE_INT_CST_LOW (DR_INIT (datarefs_copy[i-1]->dr));
3178 gcc_assert (init_a <= init_b
3179 && init_a <= init_prev
3180 && init_prev <= init_b);
3181
3182 /* Do not place the same access in the interleaving chain twice. */
3183 if (init_b == init_prev)
3184 {
3185 gcc_assert (gimple_uid (DR_STMT (datarefs_copy[i-1]->dr))
3186 < gimple_uid (DR_STMT (drb)));
3187 /* Simply link in duplicates and fix up the chain below. */
3188 }
3189 else
3190 {
3191 /* If init_b == init_a + the size of the type * k, we have an
3192 interleaving, and DRA is accessed before DRB. */
3193 HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
3194 if (type_size_a == 0
3195 || (init_b - init_a) % type_size_a != 0)
3196 break;
3197
3198 /* If we have a store, the accesses are adjacent. This splits
3199 groups into chunks we support (we don't support vectorization
3200 of stores with gaps). */
3201 if (!DR_IS_READ (dra) && init_b - init_prev != type_size_a)
3202 break;
3203
3204 /* If the step (if not zero or non-constant) is smaller than the
3205 difference between data-refs' inits this splits groups into
3206 suitable sizes. */
3207 if (tree_fits_shwi_p (DR_STEP (dra)))
3208 {
3209 unsigned HOST_WIDE_INT step
3210 = absu_hwi (tree_to_shwi (DR_STEP (dra)));
3211 if (step != 0
3212 && step <= (unsigned HOST_WIDE_INT)(init_b - init_a))
3213 break;
3214 }
3215 }
3216
3217 if (dump_enabled_p ())
3218 dump_printf_loc (MSG_NOTE, vect_location,
3219 DR_IS_READ (dra)
3220 ? "Detected interleaving load %T and %T\n"
3221 : "Detected interleaving store %T and %T\n",
3222 DR_REF (dra), DR_REF (drb));
3223
3224 /* Link the found element into the group list. */
3225 if (!DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
3226 {
3227 DR_GROUP_FIRST_ELEMENT (stmtinfo_a) = stmtinfo_a;
3228 lastinfo = stmtinfo_a;
3229 }
3230 DR_GROUP_FIRST_ELEMENT (stmtinfo_b) = stmtinfo_a;
3231 DR_GROUP_NEXT_ELEMENT (lastinfo) = stmtinfo_b;
3232 lastinfo = stmtinfo_b;
3233
3234 STMT_VINFO_SLP_VECT_ONLY (stmtinfo_a)
3235 = !can_group_stmts_p (stmtinfo_a, stmtinfo_b, false);
3236
3237 if (dump_enabled_p () && STMT_VINFO_SLP_VECT_ONLY (stmtinfo_a))
3238 dump_printf_loc (MSG_NOTE, vect_location,
3239 "Load suitable for SLP vectorization only.\n");
3240
3241 if (init_b == init_prev
3242 && !to_fixup.add (DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
3243 && dump_enabled_p ())
3244 dump_printf_loc (MSG_NOTE, vect_location,
3245 "Queuing group with duplicate access for fixup\n");
3246 }
3247 }
3248
3249 /* Fixup groups with duplicate entries by splitting it. */
3250 while (1)
3251 {
3252 hash_set<stmt_vec_info>::iterator it = to_fixup.begin ();
3253 if (!(it != to_fixup.end ()))
3254 break;
3255 stmt_vec_info grp = *it;
3256 to_fixup.remove (grp);
3257
3258 /* Find the earliest duplicate group member. */
3259 unsigned first_duplicate = -1u;
3260 stmt_vec_info next, g = grp;
3261 while ((next = DR_GROUP_NEXT_ELEMENT (g)))
3262 {
3263 if (tree_int_cst_equal (DR_INIT (STMT_VINFO_DR_INFO (next)->dr),
3264 DR_INIT (STMT_VINFO_DR_INFO (g)->dr))
3265 && gimple_uid (STMT_VINFO_STMT (next)) < first_duplicate)
3266 first_duplicate = gimple_uid (STMT_VINFO_STMT (next));
3267 g = next;
3268 }
3269 if (first_duplicate == -1U)
3270 continue;
3271
3272 /* Then move all stmts after the first duplicate to a new group.
3273 Note this is a heuristic but one with the property that *it
3274 is fixed up completely. */
3275 g = grp;
3276 stmt_vec_info newgroup = NULL, ng = grp;
3277 while ((next = DR_GROUP_NEXT_ELEMENT (g)))
3278 {
3279 if (gimple_uid (STMT_VINFO_STMT (next)) >= first_duplicate)
3280 {
3281 DR_GROUP_NEXT_ELEMENT (g) = DR_GROUP_NEXT_ELEMENT (next);
3282 if (!newgroup)
3283 newgroup = next;
3284 else
3285 DR_GROUP_NEXT_ELEMENT (ng) = next;
3286 ng = next;
3287 DR_GROUP_FIRST_ELEMENT (ng) = newgroup;
3288 }
3289 else
3290 g = DR_GROUP_NEXT_ELEMENT (g);
3291 }
3292 DR_GROUP_NEXT_ELEMENT (ng) = NULL;
3293
3294 /* Fixup the new group which still may contain duplicates. */
3295 to_fixup.add (newgroup);
3296 }
3297
3298 dr_vec_info *dr_info;
3299 FOR_EACH_VEC_ELT (datarefs_copy, i, dr_info)
3300 {
3301 if (STMT_VINFO_VECTORIZABLE (dr_info->stmt)
3302 && !vect_analyze_data_ref_access (vinfo, dr_info))
3303 {
3304 if (dump_enabled_p ())
3305 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3306 "not vectorized: complicated access pattern.\n");
3307
3308 if (is_a <bb_vec_info> (vinfo))
3309 {
3310 /* Mark the statement as not vectorizable. */
3311 STMT_VINFO_VECTORIZABLE (dr_info->stmt) = false;
3312 continue;
3313 }
3314 else
3315 {
3316 datarefs_copy.release ();
3317 return opt_result::failure_at (dr_info->stmt->stmt,
3318 "not vectorized:"
3319 " complicated access pattern.\n");
3320 }
3321 }
3322 }
3323
3324 datarefs_copy.release ();
3325 return opt_result::success ();
3326 }
3327
3328 /* Function vect_vfa_segment_size.
3329
3330 Input:
3331 DR_INFO: The data reference.
3332 LENGTH_FACTOR: segment length to consider.
3333
3334 Return a value suitable for the dr_with_seg_len::seg_len field.
3335 This is the "distance travelled" by the pointer from the first
3336 iteration in the segment to the last. Note that it does not include
3337 the size of the access; in effect it only describes the first byte. */
3338
3339 static tree
3340 vect_vfa_segment_size (dr_vec_info *dr_info, tree length_factor)
3341 {
3342 length_factor = size_binop (MINUS_EXPR,
3343 fold_convert (sizetype, length_factor),
3344 size_one_node);
3345 return size_binop (MULT_EXPR, fold_convert (sizetype, DR_STEP (dr_info->dr)),
3346 length_factor);
3347 }
3348
3349 /* Return a value that, when added to abs (vect_vfa_segment_size (DR_INFO)),
3350 gives the worst-case number of bytes covered by the segment. */
3351
3352 static unsigned HOST_WIDE_INT
3353 vect_vfa_access_size (vec_info *vinfo, dr_vec_info *dr_info)
3354 {
3355 stmt_vec_info stmt_vinfo = dr_info->stmt;
3356 tree ref_type = TREE_TYPE (DR_REF (dr_info->dr));
3357 unsigned HOST_WIDE_INT ref_size = tree_to_uhwi (TYPE_SIZE_UNIT (ref_type));
3358 unsigned HOST_WIDE_INT access_size = ref_size;
3359 if (DR_GROUP_FIRST_ELEMENT (stmt_vinfo))
3360 {
3361 gcc_assert (DR_GROUP_FIRST_ELEMENT (stmt_vinfo) == stmt_vinfo);
3362 access_size *= DR_GROUP_SIZE (stmt_vinfo) - DR_GROUP_GAP (stmt_vinfo);
3363 }
3364 tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
3365 int misalignment;
3366 if (STMT_VINFO_VEC_STMTS (stmt_vinfo).exists ()
3367 && ((misalignment = dr_misalignment (dr_info, vectype)), true)
3368 && (vect_supportable_dr_alignment (vinfo, dr_info, vectype, misalignment)
3369 == dr_explicit_realign_optimized))
3370 {
3371 /* We might access a full vector's worth. */
3372 access_size += tree_to_uhwi (TYPE_SIZE_UNIT (vectype)) - ref_size;
3373 }
3374 return access_size;
3375 }
3376
3377 /* Get the minimum alignment for all the scalar accesses that DR_INFO
3378 describes. */
3379
3380 static unsigned int
3381 vect_vfa_align (dr_vec_info *dr_info)
3382 {
3383 return dr_alignment (dr_info->dr);
3384 }
3385
3386 /* Function vect_no_alias_p.
3387
3388 Given data references A and B with equal base and offset, see whether
3389 the alias relation can be decided at compilation time. Return 1 if
3390 it can and the references alias, 0 if it can and the references do
3391 not alias, and -1 if we cannot decide at compile time. SEGMENT_LENGTH_A,
3392 SEGMENT_LENGTH_B, ACCESS_SIZE_A and ACCESS_SIZE_B are the equivalent
3393 of dr_with_seg_len::{seg_len,access_size} for A and B. */
3394
3395 static int
3396 vect_compile_time_alias (dr_vec_info *a, dr_vec_info *b,
3397 tree segment_length_a, tree segment_length_b,
3398 unsigned HOST_WIDE_INT access_size_a,
3399 unsigned HOST_WIDE_INT access_size_b)
3400 {
3401 poly_offset_int offset_a = wi::to_poly_offset (DR_INIT (a->dr));
3402 poly_offset_int offset_b = wi::to_poly_offset (DR_INIT (b->dr));
3403 poly_uint64 const_length_a;
3404 poly_uint64 const_length_b;
3405
3406 /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
3407 bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
3408 [a, a+12) */
3409 if (tree_int_cst_compare (DR_STEP (a->dr), size_zero_node) < 0)
3410 {
3411 const_length_a = (-wi::to_poly_wide (segment_length_a)).force_uhwi ();
3412 offset_a -= const_length_a;
3413 }
3414 else
3415 const_length_a = tree_to_poly_uint64 (segment_length_a);
3416 if (tree_int_cst_compare (DR_STEP (b->dr), size_zero_node) < 0)
3417 {
3418 const_length_b = (-wi::to_poly_wide (segment_length_b)).force_uhwi ();
3419 offset_b -= const_length_b;
3420 }
3421 else
3422 const_length_b = tree_to_poly_uint64 (segment_length_b);
3423
3424 const_length_a += access_size_a;
3425 const_length_b += access_size_b;
3426
3427 if (ranges_known_overlap_p (offset_a, const_length_a,
3428 offset_b, const_length_b))
3429 return 1;
3430
3431 if (!ranges_maybe_overlap_p (offset_a, const_length_a,
3432 offset_b, const_length_b))
3433 return 0;
3434
3435 return -1;
3436 }
3437
3438 /* Return true if the minimum nonzero dependence distance for loop LOOP_DEPTH
3439 in DDR is >= VF. */
3440
3441 static bool
3442 dependence_distance_ge_vf (data_dependence_relation *ddr,
3443 unsigned int loop_depth, poly_uint64 vf)
3444 {
3445 if (DDR_ARE_DEPENDENT (ddr) != NULL_TREE
3446 || DDR_NUM_DIST_VECTS (ddr) == 0)
3447 return false;
3448
3449 /* If the dependence is exact, we should have limited the VF instead. */
3450 gcc_checking_assert (DDR_COULD_BE_INDEPENDENT_P (ddr));
3451
3452 unsigned int i;
3453 lambda_vector dist_v;
3454 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
3455 {
3456 HOST_WIDE_INT dist = dist_v[loop_depth];
3457 if (dist != 0
3458 && !(dist > 0 && DDR_REVERSED_P (ddr))
3459 && maybe_lt ((unsigned HOST_WIDE_INT) abs_hwi (dist), vf))
3460 return false;
3461 }
3462
3463 if (dump_enabled_p ())
3464 dump_printf_loc (MSG_NOTE, vect_location,
3465 "dependence distance between %T and %T is >= VF\n",
3466 DR_REF (DDR_A (ddr)), DR_REF (DDR_B (ddr)));
3467
3468 return true;
3469 }
3470
3471 /* Dump LOWER_BOUND using flags DUMP_KIND. Dumps are known to be enabled. */
3472
3473 static void
3474 dump_lower_bound (dump_flags_t dump_kind, const vec_lower_bound &lower_bound)
3475 {
3476 dump_printf (dump_kind, "%s (%T) >= ",
3477 lower_bound.unsigned_p ? "unsigned" : "abs",
3478 lower_bound.expr);
3479 dump_dec (dump_kind, lower_bound.min_value);
3480 }
3481
3482 /* Record that the vectorized loop requires the vec_lower_bound described
3483 by EXPR, UNSIGNED_P and MIN_VALUE. */
3484
3485 static void
3486 vect_check_lower_bound (loop_vec_info loop_vinfo, tree expr, bool unsigned_p,
3487 poly_uint64 min_value)
3488 {
3489 vec<vec_lower_bound> &lower_bounds
3490 = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
3491 for (unsigned int i = 0; i < lower_bounds.length (); ++i)
3492 if (operand_equal_p (lower_bounds[i].expr, expr, 0))
3493 {
3494 unsigned_p &= lower_bounds[i].unsigned_p;
3495 min_value = upper_bound (lower_bounds[i].min_value, min_value);
3496 if (lower_bounds[i].unsigned_p != unsigned_p
3497 || maybe_lt (lower_bounds[i].min_value, min_value))
3498 {
3499 lower_bounds[i].unsigned_p = unsigned_p;
3500 lower_bounds[i].min_value = min_value;
3501 if (dump_enabled_p ())
3502 {
3503 dump_printf_loc (MSG_NOTE, vect_location,
3504 "updating run-time check to ");
3505 dump_lower_bound (MSG_NOTE, lower_bounds[i]);
3506 dump_printf (MSG_NOTE, "\n");
3507 }
3508 }
3509 return;
3510 }
3511
3512 vec_lower_bound lower_bound (expr, unsigned_p, min_value);
3513 if (dump_enabled_p ())
3514 {
3515 dump_printf_loc (MSG_NOTE, vect_location, "need a run-time check that ");
3516 dump_lower_bound (MSG_NOTE, lower_bound);
3517 dump_printf (MSG_NOTE, "\n");
3518 }
3519 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).safe_push (lower_bound);
3520 }
3521
3522 /* Return true if it's unlikely that the step of the vectorized form of DR_INFO
3523 will span fewer than GAP bytes. */
3524
3525 static bool
3526 vect_small_gap_p (loop_vec_info loop_vinfo, dr_vec_info *dr_info,
3527 poly_int64 gap)
3528 {
3529 stmt_vec_info stmt_info = dr_info->stmt;
3530 HOST_WIDE_INT count
3531 = estimated_poly_value (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
3532 if (DR_GROUP_FIRST_ELEMENT (stmt_info))
3533 count *= DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (stmt_info));
3534 return (estimated_poly_value (gap)
3535 <= count * vect_get_scalar_dr_size (dr_info));
3536 }
3537
3538 /* Return true if we know that there is no alias between DR_INFO_A and
3539 DR_INFO_B when abs (DR_STEP (DR_INFO_A->dr)) >= N for some N.
3540 When returning true, set *LOWER_BOUND_OUT to this N. */
3541
3542 static bool
3543 vectorizable_with_step_bound_p (dr_vec_info *dr_info_a, dr_vec_info *dr_info_b,
3544 poly_uint64 *lower_bound_out)
3545 {
3546 /* Check that there is a constant gap of known sign between DR_A
3547 and DR_B. */
3548 data_reference *dr_a = dr_info_a->dr;
3549 data_reference *dr_b = dr_info_b->dr;
3550 poly_int64 init_a, init_b;
3551 if (!operand_equal_p (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b), 0)
3552 || !operand_equal_p (DR_OFFSET (dr_a), DR_OFFSET (dr_b), 0)
3553 || !operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0)
3554 || !poly_int_tree_p (DR_INIT (dr_a), &init_a)
3555 || !poly_int_tree_p (DR_INIT (dr_b), &init_b)
3556 || !ordered_p (init_a, init_b))
3557 return false;
3558
3559 /* Sort DR_A and DR_B by the address they access. */
3560 if (maybe_lt (init_b, init_a))
3561 {
3562 std::swap (init_a, init_b);
3563 std::swap (dr_info_a, dr_info_b);
3564 std::swap (dr_a, dr_b);
3565 }
3566
3567 /* If the two accesses could be dependent within a scalar iteration,
3568 make sure that we'd retain their order. */
3569 if (maybe_gt (init_a + vect_get_scalar_dr_size (dr_info_a), init_b)
3570 && !vect_preserves_scalar_order_p (dr_info_a, dr_info_b))
3571 return false;
3572
3573 /* There is no alias if abs (DR_STEP) is greater than or equal to
3574 the bytes spanned by the combination of the two accesses. */
3575 *lower_bound_out = init_b + vect_get_scalar_dr_size (dr_info_b) - init_a;
3576 return true;
3577 }
3578
3579 /* Function vect_prune_runtime_alias_test_list.
3580
3581 Prune a list of ddrs to be tested at run-time by versioning for alias.
3582 Merge several alias checks into one if possible.
3583 Return FALSE if resulting list of ddrs is longer then allowed by
3584 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
3585
3586 opt_result
3587 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
3588 {
3589 typedef pair_hash <tree_operand_hash, tree_operand_hash> tree_pair_hash;
3590 hash_set <tree_pair_hash> compared_objects;
3591
3592 const vec<ddr_p> &may_alias_ddrs = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
3593 vec<dr_with_seg_len_pair_t> &comp_alias_ddrs
3594 = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3595 const vec<vec_object_pair> &check_unequal_addrs
3596 = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3597 poly_uint64 vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3598 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
3599
3600 ddr_p ddr;
3601 unsigned int i;
3602 tree length_factor;
3603
3604 DUMP_VECT_SCOPE ("vect_prune_runtime_alias_test_list");
3605
3606 /* Step values are irrelevant for aliasing if the number of vector
3607 iterations is equal to the number of scalar iterations (which can
3608 happen for fully-SLP loops). */
3609 bool vf_one_p = known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U);
3610
3611 if (!vf_one_p)
3612 {
3613 /* Convert the checks for nonzero steps into bound tests. */
3614 tree value;
3615 FOR_EACH_VEC_ELT (LOOP_VINFO_CHECK_NONZERO (loop_vinfo), i, value)
3616 vect_check_lower_bound (loop_vinfo, value, true, 1);
3617 }
3618
3619 if (may_alias_ddrs.is_empty ())
3620 return opt_result::success ();
3621
3622 comp_alias_ddrs.create (may_alias_ddrs.length ());
3623
3624 unsigned int loop_depth
3625 = index_in_loop_nest (LOOP_VINFO_LOOP (loop_vinfo)->num,
3626 LOOP_VINFO_LOOP_NEST (loop_vinfo));
3627
3628 /* First, we collect all data ref pairs for aliasing checks. */
3629 FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
3630 {
3631 poly_uint64 lower_bound;
3632 tree segment_length_a, segment_length_b;
3633 unsigned HOST_WIDE_INT access_size_a, access_size_b;
3634 unsigned int align_a, align_b;
3635
3636 /* Ignore the alias if the VF we chose ended up being no greater
3637 than the dependence distance. */
3638 if (dependence_distance_ge_vf (ddr, loop_depth, vect_factor))
3639 continue;
3640
3641 if (DDR_OBJECT_A (ddr))
3642 {
3643 vec_object_pair new_pair (DDR_OBJECT_A (ddr), DDR_OBJECT_B (ddr));
3644 if (!compared_objects.add (new_pair))
3645 {
3646 if (dump_enabled_p ())
3647 dump_printf_loc (MSG_NOTE, vect_location,
3648 "checking that %T and %T"
3649 " have different addresses\n",
3650 new_pair.first, new_pair.second);
3651 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).safe_push (new_pair);
3652 }
3653 continue;
3654 }
3655
3656 dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (DDR_A (ddr));
3657 stmt_vec_info stmt_info_a = dr_info_a->stmt;
3658
3659 dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (DDR_B (ddr));
3660 stmt_vec_info stmt_info_b = dr_info_b->stmt;
3661
3662 bool preserves_scalar_order_p
3663 = vect_preserves_scalar_order_p (dr_info_a, dr_info_b);
3664 bool ignore_step_p
3665 = (vf_one_p
3666 && (preserves_scalar_order_p
3667 || operand_equal_p (DR_STEP (dr_info_a->dr),
3668 DR_STEP (dr_info_b->dr))));
3669
3670 /* Skip the pair if inter-iteration dependencies are irrelevant
3671 and intra-iteration dependencies are guaranteed to be honored. */
3672 if (ignore_step_p
3673 && (preserves_scalar_order_p
3674 || vectorizable_with_step_bound_p (dr_info_a, dr_info_b,
3675 &lower_bound)))
3676 {
3677 if (dump_enabled_p ())
3678 dump_printf_loc (MSG_NOTE, vect_location,
3679 "no need for alias check between "
3680 "%T and %T when VF is 1\n",
3681 DR_REF (dr_info_a->dr), DR_REF (dr_info_b->dr));
3682 continue;
3683 }
3684
3685 /* See whether we can handle the alias using a bounds check on
3686 the step, and whether that's likely to be the best approach.
3687 (It might not be, for example, if the minimum step is much larger
3688 than the number of bytes handled by one vector iteration.) */
3689 if (!ignore_step_p
3690 && TREE_CODE (DR_STEP (dr_info_a->dr)) != INTEGER_CST
3691 && vectorizable_with_step_bound_p (dr_info_a, dr_info_b,
3692 &lower_bound)
3693 && (vect_small_gap_p (loop_vinfo, dr_info_a, lower_bound)
3694 || vect_small_gap_p (loop_vinfo, dr_info_b, lower_bound)))
3695 {
3696 bool unsigned_p = dr_known_forward_stride_p (dr_info_a->dr);
3697 if (dump_enabled_p ())
3698 {
3699 dump_printf_loc (MSG_NOTE, vect_location, "no alias between "
3700 "%T and %T when the step %T is outside ",
3701 DR_REF (dr_info_a->dr),
3702 DR_REF (dr_info_b->dr),
3703 DR_STEP (dr_info_a->dr));
3704 if (unsigned_p)
3705 dump_printf (MSG_NOTE, "[0");
3706 else
3707 {
3708 dump_printf (MSG_NOTE, "(");
3709 dump_dec (MSG_NOTE, poly_int64 (-lower_bound));
3710 }
3711 dump_printf (MSG_NOTE, ", ");
3712 dump_dec (MSG_NOTE, lower_bound);
3713 dump_printf (MSG_NOTE, ")\n");
3714 }
3715 vect_check_lower_bound (loop_vinfo, DR_STEP (dr_info_a->dr),
3716 unsigned_p, lower_bound);
3717 continue;
3718 }
3719
3720 stmt_vec_info dr_group_first_a = DR_GROUP_FIRST_ELEMENT (stmt_info_a);
3721 if (dr_group_first_a)
3722 {
3723 stmt_info_a = dr_group_first_a;
3724 dr_info_a = STMT_VINFO_DR_INFO (stmt_info_a);
3725 }
3726
3727 stmt_vec_info dr_group_first_b = DR_GROUP_FIRST_ELEMENT (stmt_info_b);
3728 if (dr_group_first_b)
3729 {
3730 stmt_info_b = dr_group_first_b;
3731 dr_info_b = STMT_VINFO_DR_INFO (stmt_info_b);
3732 }
3733
3734 if (ignore_step_p)
3735 {
3736 segment_length_a = size_zero_node;
3737 segment_length_b = size_zero_node;
3738 }
3739 else
3740 {
3741 if (!operand_equal_p (DR_STEP (dr_info_a->dr),
3742 DR_STEP (dr_info_b->dr), 0))
3743 length_factor = scalar_loop_iters;
3744 else
3745 length_factor = size_int (vect_factor);
3746 segment_length_a = vect_vfa_segment_size (dr_info_a, length_factor);
3747 segment_length_b = vect_vfa_segment_size (dr_info_b, length_factor);
3748 }
3749 access_size_a = vect_vfa_access_size (loop_vinfo, dr_info_a);
3750 access_size_b = vect_vfa_access_size (loop_vinfo, dr_info_b);
3751 align_a = vect_vfa_align (dr_info_a);
3752 align_b = vect_vfa_align (dr_info_b);
3753
3754 /* See whether the alias is known at compilation time. */
3755 if (operand_equal_p (DR_BASE_ADDRESS (dr_info_a->dr),
3756 DR_BASE_ADDRESS (dr_info_b->dr), 0)
3757 && operand_equal_p (DR_OFFSET (dr_info_a->dr),
3758 DR_OFFSET (dr_info_b->dr), 0)
3759 && TREE_CODE (DR_STEP (dr_info_a->dr)) == INTEGER_CST
3760 && TREE_CODE (DR_STEP (dr_info_b->dr)) == INTEGER_CST
3761 && poly_int_tree_p (segment_length_a)
3762 && poly_int_tree_p (segment_length_b))
3763 {
3764 int res = vect_compile_time_alias (dr_info_a, dr_info_b,
3765 segment_length_a,
3766 segment_length_b,
3767 access_size_a,
3768 access_size_b);
3769 if (res >= 0 && dump_enabled_p ())
3770 {
3771 dump_printf_loc (MSG_NOTE, vect_location,
3772 "can tell at compile time that %T and %T",
3773 DR_REF (dr_info_a->dr), DR_REF (dr_info_b->dr));
3774 if (res == 0)
3775 dump_printf (MSG_NOTE, " do not alias\n");
3776 else
3777 dump_printf (MSG_NOTE, " alias\n");
3778 }
3779
3780 if (res == 0)
3781 continue;
3782
3783 if (res == 1)
3784 return opt_result::failure_at (stmt_info_b->stmt,
3785 "not vectorized:"
3786 " compilation time alias: %G%G",
3787 stmt_info_a->stmt,
3788 stmt_info_b->stmt);
3789 }
3790
3791 dr_with_seg_len dr_a (dr_info_a->dr, segment_length_a,
3792 access_size_a, align_a);
3793 dr_with_seg_len dr_b (dr_info_b->dr, segment_length_b,
3794 access_size_b, align_b);
3795 /* Canonicalize the order to be the one that's needed for accurate
3796 RAW, WAR and WAW flags, in cases where the data references are
3797 well-ordered. The order doesn't really matter otherwise,
3798 but we might as well be consistent. */
3799 if (get_later_stmt (stmt_info_a, stmt_info_b) == stmt_info_a)
3800 std::swap (dr_a, dr_b);
3801
3802 dr_with_seg_len_pair_t dr_with_seg_len_pair
3803 (dr_a, dr_b, (preserves_scalar_order_p
3804 ? dr_with_seg_len_pair_t::WELL_ORDERED
3805 : dr_with_seg_len_pair_t::REORDERED));
3806
3807 comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
3808 }
3809
3810 prune_runtime_alias_test_list (&comp_alias_ddrs, vect_factor);
3811
3812 unsigned int count = (comp_alias_ddrs.length ()
3813 + check_unequal_addrs.length ());
3814
3815 if (count
3816 && (loop_cost_model (LOOP_VINFO_LOOP (loop_vinfo))
3817 == VECT_COST_MODEL_VERY_CHEAP))
3818 return opt_result::failure_at
3819 (vect_location, "would need a runtime alias check\n");
3820
3821 if (dump_enabled_p ())
3822 dump_printf_loc (MSG_NOTE, vect_location,
3823 "improved number of alias checks from %d to %d\n",
3824 may_alias_ddrs.length (), count);
3825 unsigned limit = param_vect_max_version_for_alias_checks;
3826 if (loop_cost_model (LOOP_VINFO_LOOP (loop_vinfo)) == VECT_COST_MODEL_CHEAP)
3827 limit = param_vect_max_version_for_alias_checks * 6 / 10;
3828 if (count > limit)
3829 return opt_result::failure_at
3830 (vect_location,
3831 "number of versioning for alias run-time tests exceeds %d "
3832 "(--param vect-max-version-for-alias-checks)\n", limit);
3833
3834 return opt_result::success ();
3835 }
3836
3837 /* Check whether we can use an internal function for a gather load
3838 or scatter store. READ_P is true for loads and false for stores.
3839 MASKED_P is true if the load or store is conditional. MEMORY_TYPE is
3840 the type of the memory elements being loaded or stored. OFFSET_TYPE
3841 is the type of the offset that is being applied to the invariant
3842 base address. SCALE is the amount by which the offset should
3843 be multiplied *after* it has been converted to address width.
3844
3845 Return true if the function is supported, storing the function id in
3846 *IFN_OUT and the vector type for the offset in *OFFSET_VECTYPE_OUT. */
3847
3848 bool
3849 vect_gather_scatter_fn_p (vec_info *vinfo, bool read_p, bool masked_p,
3850 tree vectype, tree memory_type, tree offset_type,
3851 int scale, internal_fn *ifn_out,
3852 tree *offset_vectype_out)
3853 {
3854 unsigned int memory_bits = tree_to_uhwi (TYPE_SIZE (memory_type));
3855 unsigned int element_bits = vector_element_bits (vectype);
3856 if (element_bits != memory_bits)
3857 /* For now the vector elements must be the same width as the
3858 memory elements. */
3859 return false;
3860
3861 /* Work out which function we need. */
3862 internal_fn ifn, alt_ifn;
3863 if (read_p)
3864 {
3865 ifn = masked_p ? IFN_MASK_GATHER_LOAD : IFN_GATHER_LOAD;
3866 alt_ifn = IFN_MASK_GATHER_LOAD;
3867 }
3868 else
3869 {
3870 ifn = masked_p ? IFN_MASK_SCATTER_STORE : IFN_SCATTER_STORE;
3871 alt_ifn = IFN_MASK_SCATTER_STORE;
3872 }
3873
3874 for (;;)
3875 {
3876 tree offset_vectype = get_vectype_for_scalar_type (vinfo, offset_type);
3877 if (!offset_vectype)
3878 return false;
3879
3880 /* Test whether the target supports this combination. */
3881 if (internal_gather_scatter_fn_supported_p (ifn, vectype, memory_type,
3882 offset_vectype, scale))
3883 {
3884 *ifn_out = ifn;
3885 *offset_vectype_out = offset_vectype;
3886 return true;
3887 }
3888 else if (!masked_p
3889 && internal_gather_scatter_fn_supported_p (alt_ifn, vectype,
3890 memory_type,
3891 offset_vectype,
3892 scale))
3893 {
3894 *ifn_out = alt_ifn;
3895 *offset_vectype_out = offset_vectype;
3896 return true;
3897 }
3898
3899 if (TYPE_PRECISION (offset_type) >= POINTER_SIZE
3900 && TYPE_PRECISION (offset_type) >= element_bits)
3901 return false;
3902
3903 offset_type = build_nonstandard_integer_type
3904 (TYPE_PRECISION (offset_type) * 2, TYPE_UNSIGNED (offset_type));
3905 }
3906 }
3907
3908 /* STMT_INFO is a call to an internal gather load or scatter store function.
3909 Describe the operation in INFO. */
3910
3911 static void
3912 vect_describe_gather_scatter_call (stmt_vec_info stmt_info,
3913 gather_scatter_info *info)
3914 {
3915 gcall *call = as_a <gcall *> (stmt_info->stmt);
3916 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3917 data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3918
3919 info->ifn = gimple_call_internal_fn (call);
3920 info->decl = NULL_TREE;
3921 info->base = gimple_call_arg (call, 0);
3922 info->offset = gimple_call_arg (call, 1);
3923 info->offset_dt = vect_unknown_def_type;
3924 info->offset_vectype = NULL_TREE;
3925 info->scale = TREE_INT_CST_LOW (gimple_call_arg (call, 2));
3926 info->element_type = TREE_TYPE (vectype);
3927 info->memory_type = TREE_TYPE (DR_REF (dr));
3928 }
3929
3930 /* Return true if a non-affine read or write in STMT_INFO is suitable for a
3931 gather load or scatter store. Describe the operation in *INFO if so.

LOOP_VINFO supplies the loop being vectorized; loop invariance of the
pieces of the address is tested against LOOP_VINFO_LOOP (LOOP_VINFO).

If STMT_INFO is already a call to a gather/scatter internal function,
*INFO is simply filled in from the call. Otherwise the address is split
into a loop-invariant sizetype expression (INFO->base), an SSA_NAME that
is not invariant in the loop (INFO->offset) and a multiplier for that
offset (INFO->scale).

Note that a true return does not by itself guarantee that the target
offers a matching operation: INFO->ifn is IFN_LAST when no internal
function was found, and INFO->decl stays NULL_TREE on the internal
function path or when the target has no gather/scatter builtin hook.
Callers have to inspect those fields. */
3932
3933 bool
3934 vect_check_gather_scatter (stmt_vec_info stmt_info, loop_vec_info loop_vinfo,
3935 gather_scatter_info *info)
3936 {
3937 HOST_WIDE_INT scale = 1;
3938 poly_int64 pbitpos, pbitsize;
3939 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3940 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
/* OFFTYPE stays NULL_TREE until a widening conversion has been peeled
from OFF; setting it is also what terminates the peeling loop below. */
3941 tree offtype = NULL_TREE;
3942 tree decl = NULL_TREE, base, off;
3943 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3944 tree memory_type = TREE_TYPE (DR_REF (dr));
3945 machine_mode pmode;
3946 int punsignedp, reversep, pvolatilep = 0;
3947 internal_fn ifn;
3948 tree offset_vectype;
3949 bool masked_p = false;
3950
3951 /* See whether this is already a call to a gather/scatter internal function.
3952 If not, see whether it's a masked load or store. */
3953 gcall *call = dyn_cast <gcall *> (stmt_info->stmt);
3954 if (call && gimple_call_internal_p (call))
3955 {
3956 ifn = gimple_call_internal_fn (call);
3957 if (internal_gather_scatter_fn_p (ifn))
3958 {
3959 vect_describe_gather_scatter_call (stmt_info, info);
3960 return true;
3961 }
3962 masked_p = (ifn == IFN_MASK_LOAD || ifn == IFN_MASK_STORE);
3963 }
3964
3965 /* True if we should aim to use internal functions rather than
3966 built-in functions. */
3967 bool use_ifn_p = (DR_IS_READ (dr)
3968 ? supports_vec_gather_load_p (TYPE_MODE (vectype))
3969 : supports_vec_scatter_store_p (TYPE_MODE (vectype)));
3970
3971 base = DR_REF (dr);
3972 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
3973 see if we can use the def stmt of the address. */
3974 if (masked_p
3975 && TREE_CODE (base) == MEM_REF
3976 && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
3977 && integer_zerop (TREE_OPERAND (base, 1))
3978 && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
3979 {
3980 gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
3981 if (is_gimple_assign (def_stmt)
3982 && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
3983 base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
3984 }
3985
3986 /* The gather and scatter builtins need address of the form
3987 loop_invariant + vector * {1, 2, 4, 8}
3988 or
3989 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3990 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3991 of loop invariants/SSA_NAMEs defined in the loop, with casts,
3992 multiplications and additions in it. To get a vector, we need
3993 a single SSA_NAME that will be defined in the loop and will
3994 contain everything that is not loop invariant and that can be
3995 vectorized. The following code attempts to find such a preexisting
3996 SSA_NAME OFF and put the loop invariants into a tree BASE
3997 that can be gimplified before the loop. */
3998 base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
3999 &punsignedp, &reversep, &pvolatilep);
/* Accesses with reverse storage order are not handled. */
4000 if (reversep)
4001 return false;
4002
4003 poly_int64 pbytepos = exact_div (pbitpos, BITS_PER_UNIT);
4004
/* Reduce BASE to an address: for a MEM_REF take the pointer operand and
move any nonzero constant offset of the MEM_REF into OFF; for anything
else take the address of the object. */
4005 if (TREE_CODE (base) == MEM_REF)
4006 {
4007 if (!integer_zerop (TREE_OPERAND (base, 1)))
4008 {
4009 if (off == NULL_TREE)
4010 off = wide_int_to_tree (sizetype, mem_ref_offset (base));
4011 else
4012 off = size_binop (PLUS_EXPR, off,
4013 fold_convert (sizetype, TREE_OPERAND (base, 1)));
4014 }
4015 base = TREE_OPERAND (base, 0);
4016 }
4017 else
4018 base = build_fold_addr_expr (base);
4019
4020 if (off == NULL_TREE)
4021 off = size_zero_node;
4022
4023 /* If base is not loop invariant, either off is 0, then we start with just
4024 the constant offset in the loop invariant BASE and continue with base
4025 as OFF, otherwise give up.
4026 We could handle that case by gimplifying the addition of base + off
4027 into some SSA_NAME and use that as off, but for now punt. */
4028 if (!expr_invariant_in_loop_p (loop, base))
4029 {
4030 if (!integer_zerop (off))
4031 return false;
4032 off = base;
4033 base = size_int (pbytepos);
4034 }
4035 /* Otherwise put base + constant offset into the loop invariant BASE
4036 and continue with OFF. */
4037 else
4038 {
4039 base = fold_convert (sizetype, base);
4040 base = size_binop (PLUS_EXPR, base, size_int (pbytepos));
4041 }
4042
4043 /* OFF at this point may be either a SSA_NAME or some tree expression
4044 from get_inner_reference. Try to peel off loop invariants from it
4045 into BASE as long as possible. */
4046 STRIP_NOPS (off);
/* Each iteration looks at the operation that computes OFF. A case that
manages to peel something updates OFF (and BASE, SCALE or OFFTYPE) and
uses `continue'; every other path reaches the `break' after the switch
and ends the loop with OFF as it stands. */
4047 while (offtype == NULL_TREE)
4048 {
4049 enum tree_code code;
4050 tree op0, op1, add = NULL_TREE;
4051
4052 if (TREE_CODE (off) == SSA_NAME)
4053 {
4054 gimple *def_stmt = SSA_NAME_DEF_STMT (off);
4055
/* Everything left in OFF is loop invariant, so there is no
varying offset to vectorize. */
4056 if (expr_invariant_in_loop_p (loop, off))
4057 return false;
4058
4059 if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
4060 break;
4061
4062 op0 = gimple_assign_rhs1 (def_stmt);
4063 code = gimple_assign_rhs_code (def_stmt);
4064 op1 = gimple_assign_rhs2 (def_stmt);
4065 }
4066 else
4067 {
4068 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
4069 return false;
4070 code = TREE_CODE (off);
4071 extract_ops_from_tree (off, &code, &op0, &op1);
4072 }
4073 switch (code)
4074 {
4075 case POINTER_PLUS_EXPR:
4076 case PLUS_EXPR:
4077 if (expr_invariant_in_loop_p (loop, op0))
4078 {
4079 add = op0;
4080 off = op1;
/* Shared tail, also entered by `goto' from the second PLUS
alternative and from the MINUS case: fold the invariant ADD,
multiplied by SCALE if a scale has already been peeled, into BASE
and carry on with the remaining OFF. */
4081 do_add:
4082 add = fold_convert (sizetype, add);
4083 if (scale != 1)
4084 add = size_binop (MULT_EXPR, add, size_int (scale));
4085 base = size_binop (PLUS_EXPR, base, add);
4086 continue;
4087 }
4088 if (expr_invariant_in_loop_p (loop, op1))
4089 {
4090 add = op1;
4091 off = op0;
4092 goto do_add;
4093 }
4094 break;
4095 case MINUS_EXPR:
/* OFF - INV is handled as OFF + (0 - INV). Only an invariant
subtrahend is peeled. */
4096 if (expr_invariant_in_loop_p (loop, op1))
4097 {
4098 add = fold_convert (sizetype, op1);
4099 add = size_binop (MINUS_EXPR, size_zero_node, add);
4100 off = op0;
4101 goto do_add;
4102 }
4103 break;
4104 case MULT_EXPR:
/* At most one multiplication is turned into a scale: this is only
tried while SCALE is still 1. */
4105 if (scale == 1 && tree_fits_shwi_p (op1))
4106 {
4107 int new_scale = tree_to_shwi (op1);
4108 /* Only treat this as a scaling operation if the target
4109 supports it for at least some offset type. */
4110 if (use_ifn_p
4111 && !vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
4112 masked_p, vectype, memory_type,
4113 signed_char_type_node,
4114 new_scale, &ifn,
4115 &offset_vectype)
4116 && !vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
4117 masked_p, vectype, memory_type,
4118 unsigned_char_type_node,
4119 new_scale, &ifn,
4120 &offset_vectype))
4121 break;
4122 scale = new_scale;
4123 off = op0;
4124 continue;
4125 }
4126 break;
4127 case SSA_NAME:
/* A plain copy of another SSA_NAME: look through it. */
4128 off = op0;
4129 continue;
4130 CASE_CONVERT:
4131 if (!POINTER_TYPE_P (TREE_TYPE (op0))
4132 && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
4133 break;
4134
4135 /* Don't include the conversion if the target is happy with
4136 the current offset type. */
4137 if (use_ifn_p
4138 && !POINTER_TYPE_P (TREE_TYPE (off))
4139 && vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
4140 masked_p, vectype, memory_type,
4141 TREE_TYPE (off), scale, &ifn,
4142 &offset_vectype))
4143 break;
4144
/* A conversion that keeps the precision is simply skipped. */
4145 if (TYPE_PRECISION (TREE_TYPE (op0))
4146 == TYPE_PRECISION (TREE_TYPE (off)))
4147 {
4148 off = op0;
4149 continue;
4150 }
4151
4152 /* Include the conversion if it is widening and we're using
4153 the IFN path or the target can handle the converted from
4154 offset or the current size is not already the same as the
4155 data vector element size. */
4156 if ((TYPE_PRECISION (TREE_TYPE (op0))
4157 < TYPE_PRECISION (TREE_TYPE (off)))
4158 && (use_ifn_p
4159 || (DR_IS_READ (dr)
4160 ? (targetm.vectorize.builtin_gather
4161 && targetm.vectorize.builtin_gather (vectype,
4162 TREE_TYPE (op0),
4163 scale))
4164 : (targetm.vectorize.builtin_scatter
4165 && targetm.vectorize.builtin_scatter (vectype,
4166 TREE_TYPE (op0),
4167 scale)))
4168 || !operand_equal_p (TYPE_SIZE (TREE_TYPE (off)),
4169 TYPE_SIZE (TREE_TYPE (vectype)), 0)))
4170 {
/* Record the narrower type before stripping further no-op
conversions; a non-null OFFTYPE ends the loop. */
4171 off = op0;
4172 offtype = TREE_TYPE (off);
4173 STRIP_NOPS (off);
4174 continue;
4175 }
4176 break;
4177 default:
4178 break;
4179 }
/* Reached by every case that did not `continue': stop peeling. */
4180 break;
4181 }
4182
4183 /* If at the end OFF still isn't a SSA_NAME or isn't
4184 defined in the loop, punt. */
4185 if (TREE_CODE (off) != SSA_NAME
4186 || expr_invariant_in_loop_p (loop, off))
4187 return false;
4188
4189 if (offtype == NULL_TREE)
4190 offtype = TREE_TYPE (off);
4191
4192 if (use_ifn_p)
4193 {
/* A missing internal function is recorded as IFN_LAST rather than
reported as a failure here. */
4194 if (!vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr), masked_p,
4195 vectype, memory_type, offtype, scale,
4196 &ifn, &offset_vectype))
4197 ifn = IFN_LAST;
4198 decl = NULL_TREE;
4199 }
4200 else
4201 {
/* Builtin path: DECL keeps its initial NULL_TREE when the target
does not define the relevant hook. */
4202 if (DR_IS_READ (dr))
4203 {
4204 if (targetm.vectorize.builtin_gather)
4205 decl = targetm.vectorize.builtin_gather (vectype, offtype, scale);
4206 }
4207 else
4208 {
4209 if (targetm.vectorize.builtin_scatter)
4210 decl = targetm.vectorize.builtin_scatter (vectype, offtype, scale);
4211 }
4212 ifn = IFN_LAST;
4213 /* The offset vector type will be read from DECL when needed. */
4214 offset_vectype = NULL_TREE;
4215 }
4216
4217 info->ifn = ifn;
4218 info->decl = decl;
4219 info->base = base;
4220 info->offset = off;
4221 info->offset_dt = vect_unknown_def_type;
4222 info->offset_vectype = offset_vectype;
4223 info->scale = scale;
4224 info->element_type = TREE_TYPE (vectype);
4225 info->memory_type = memory_type;
4226 return true;
4227 }
4228
4229 /* Find the data references in STMT, analyze them with respect to LOOP and
4230 append them to DATAREFS. Return false if datarefs in this stmt cannot
4231 be handled. */
4232
4233 opt_result
4234 vect_find_stmt_data_reference (loop_p loop, gimple *stmt,
4235 vec<data_reference_p> *datarefs,
4236 vec<int> *dataref_groups, int group_id)
4237 {
4238 /* We can ignore clobbers for dataref analysis - they are removed during
4239 loop vectorization and BB vectorization checks dependences with a
4240 stmt walk. */
4241 if (gimple_clobber_p (stmt))
4242 return opt_result::success ();
4243
4244 if (gimple_has_volatile_ops (stmt))
4245 return opt_result::failure_at (stmt, "not vectorized: volatile type: %G",
4246 stmt);
4247
4248 if (stmt_can_throw_internal (cfun, stmt))
4249 return opt_result::failure_at (stmt,
4250 "not vectorized:"
4251 " statement can throw an exception: %G",
4252 stmt);
4253
4254 auto_vec<data_reference_p, 2> refs;
4255 opt_result res = find_data_references_in_stmt (loop, stmt, &refs);
4256 if (!res)
4257 return res;
4258
4259 if (refs.is_empty ())
4260 return opt_result::success ();
4261
4262 if (refs.length () > 1)
4263 {
4264 while (!refs.is_empty ())
4265 free_data_ref (refs.pop ());
4266 return opt_result::failure_at (stmt,
4267 "not vectorized: more than one "
4268 "data ref in stmt: %G", stmt);
4269 }
4270
4271 data_reference_p dr = refs.pop ();
4272 if (gcall *call = dyn_cast <gcall *> (stmt))
4273 if (!gimple_call_internal_p (call)
4274 || (gimple_call_internal_fn (call) != IFN_MASK_LOAD
4275 && gimple_call_internal_fn (call) != IFN_MASK_STORE))
4276 {
4277 free_data_ref (dr);
4278 return opt_result::failure_at (stmt,
4279 "not vectorized: dr in a call %G", stmt);
4280 }
4281
4282 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
4283 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
4284 {
4285 free_data_ref (dr);
4286 return opt_result::failure_at (stmt,
4287 "not vectorized:"
4288 " statement is bitfield access %G", stmt);
4289 }
4290
4291 if (DR_BASE_ADDRESS (dr)
4292 && TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
4293 {
4294 free_data_ref (dr);
4295 return opt_result::failure_at (stmt,
4296 "not vectorized:"
4297 " base addr of dr is a constant\n");
4298 }
4299
4300 /* Check whether this may be a SIMD lane access and adjust the
4301 DR to make it easier for us to handle it. */
4302 if (loop
4303 && loop->simduid
4304 && (!DR_BASE_ADDRESS (dr)
4305 || !DR_OFFSET (dr)
4306 || !DR_INIT (dr)
4307 || !DR_STEP (dr)))
4308 {
4309 struct data_reference *newdr
4310 = create_data_ref (NULL, loop_containing_stmt (stmt), DR_REF (dr), stmt,
4311 DR_IS_READ (dr), DR_IS_CONDITIONAL_IN_STMT (dr));
4312 if (DR_BASE_ADDRESS (newdr)
4313 && DR_OFFSET (newdr)
4314 && DR_INIT (newdr)
4315 && DR_STEP (newdr)
4316 && TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
4317 && integer_zerop (DR_STEP (newdr)))
4318 {
4319 tree base_address = DR_BASE_ADDRESS (newdr);
4320 tree off = DR_OFFSET (newdr);
4321 tree step = ssize_int (1);
4322 if (integer_zerop (off)
4323 && TREE_CODE (base_address) == POINTER_PLUS_EXPR)
4324 {
4325 off = TREE_OPERAND (base_address, 1);
4326 base_address = TREE_OPERAND (base_address, 0);
4327 }
4328 STRIP_NOPS (off);
4329 if (TREE_CODE (off) == MULT_EXPR
4330 && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
4331 {
4332 step = TREE_OPERAND (off, 1);
4333 off = TREE_OPERAND (off, 0);
4334 STRIP_NOPS (off);
4335 }
4336 if (CONVERT_EXPR_P (off)
4337 && (TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off, 0)))
4338 < TYPE_PRECISION (TREE_TYPE (off))))
4339 off = TREE_OPERAND (off, 0);
4340 if (TREE_CODE (off) == SSA_NAME)
4341 {
4342 gimple *def = SSA_NAME_DEF_STMT (off);
4343 /* Look through widening conversion. */
4344 if (is_gimple_assign (def)
4345 && CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (def)))
4346 {
4347 tree rhs1 = gimple_assign_rhs1 (def);
4348 if (TREE_CODE (rhs1) == SSA_NAME
4349 && INTEGRAL_TYPE_P (TREE_TYPE (rhs1))
4350 && (TYPE_PRECISION (TREE_TYPE (off))
4351 > TYPE_PRECISION (TREE_TYPE (rhs1))))
4352 def = SSA_NAME_DEF_STMT (rhs1);
4353 }
4354 if (is_gimple_call (def)
4355 && gimple_call_internal_p (def)
4356 && (gimple_call_internal_fn (def) == IFN_GOMP_SIMD_LANE))
4357 {
4358 tree arg = gimple_call_arg (def, 0);
4359 tree reft = TREE_TYPE (DR_REF (newdr));
4360 gcc_assert (TREE_CODE (arg) == SSA_NAME);
4361 arg = SSA_NAME_VAR (arg);
4362 if (arg == loop->simduid
4363 /* For now. */
4364 && tree_int_cst_equal (TYPE_SIZE_UNIT (reft), step))
4365 {
4366 DR_BASE_ADDRESS (newdr) = base_address;
4367 DR_OFFSET (newdr) = ssize_int (0);
4368 DR_STEP (newdr) = step;
4369 DR_OFFSET_ALIGNMENT (newdr) = BIGGEST_ALIGNMENT;
4370 DR_STEP_ALIGNMENT (newdr) = highest_pow2_factor (step);
4371 /* Mark as simd-lane access. */
4372 tree arg2 = gimple_call_arg (def, 1);
4373 newdr->aux = (void *) (-1 - tree_to_uhwi (arg2));
4374 free_data_ref (dr);
4375 datarefs->safe_push (newdr);
4376 if (dataref_groups)
4377 dataref_groups->safe_push (group_id);
4378 return opt_result::success ();
4379 }
4380 }
4381 }
4382 }
4383 free_data_ref (newdr);
4384 }
4385
4386 datarefs->safe_push (dr);
4387 if (dataref_groups)
4388 dataref_groups->safe_push (group_id);
4389 return opt_result::success ();
4390 }
4391
4392 /* Function vect_analyze_data_refs.
4393
4394 Find all the data references in the loop or basic block.
4395
4396 The general structure of the analysis of data refs in the vectorizer is as
4397 follows:
4398 1- vect_analyze_data_refs(loop/bb): call
4399 compute_data_dependences_for_loop/bb to find and analyze all data-refs
4400 in the loop/bb and their dependences.
4401 2- vect_analyze_dependences(): apply dependence testing using ddrs.
4402 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
4403 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
4404

VINFO is the loop or basic-block vectorization info; the data references
walked are those in VINFO->shared->datarefs. Each one is attached to the
stmt_vec_info of its statement.

*MIN_VF is raised so that it is at least the number of subparts of every
vector type chosen here.

If FATAL is nonnull, *FATAL is set to false when the failure is a missing
vector type or an access that is not suitable for a gather load or
scatter store.

Return a success, or a failure describing the first statement that cannot
be handled. In BB vectorization a failed data-ref analysis, a
DECL_NONALIASED base object or a missing vector type only mark the
statement as not vectorizable and processing continues.

4405 */
4406
4407 opt_result
4408 vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf, bool *fatal)
4409 {
/* LOOP stays NULL for BB vectorization. */
4410 class loop *loop = NULL;
4411 unsigned int i;
4412 struct data_reference *dr;
4413 tree scalar_type;
4414
4415 DUMP_VECT_SCOPE ("vect_analyze_data_refs");
4416
4417 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
4418 loop = LOOP_VINFO_LOOP (loop_vinfo);
4419
4420 /* Go through the data-refs, check that the analysis succeeded. Update
4421 pointer from stmt_vec_info struct to DR and vectype. */
4422
4423 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
4424 FOR_EACH_VEC_ELT (datarefs, i, dr)
4425 {
4426 enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
4427 poly_uint64 vf;
4428
4429 gcc_assert (DR_REF (dr));
4430 stmt_vec_info stmt_info = vinfo->lookup_stmt (DR_STMT (dr));
/* A statement is expected not to have a data reference attached yet. */
4431 gcc_assert (!stmt_info->dr_aux.dr);
4432 stmt_info->dr_aux.dr = dr;
4433 stmt_info->dr_aux.stmt = stmt_info;
4434
4435 /* Check that analysis of the data-ref succeeded. */
4436 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
4437 || !DR_STEP (dr))
4438 {
/* A non-volatile read is always a gather candidate here; a
non-volatile write is a scatter candidate only if the target has a
scatter builtin or supports scatter stores. */
4439 bool maybe_gather
4440 = DR_IS_READ (dr)
4441 && !TREE_THIS_VOLATILE (DR_REF (dr));
4442 bool maybe_scatter
4443 = DR_IS_WRITE (dr)
4444 && !TREE_THIS_VOLATILE (DR_REF (dr))
4445 && (targetm.vectorize.builtin_scatter != NULL
4446 || supports_vec_scatter_store_p ());
4447
4448 /* If target supports vector gather loads or scatter stores,
4449 see if they can't be used. */
4450 if (is_a <loop_vec_info> (vinfo)
4451 && !nested_in_vect_loop_p (loop, stmt_info))
4452 {
4453 if (maybe_gather || maybe_scatter)
4454 {
4455 if (maybe_gather)
4456 gatherscatter = GATHER;
4457 else
4458 gatherscatter = SCATTER;
4459 }
4460 }
4461
4462 if (gatherscatter == SG_NONE)
4463 {
4464 if (dump_enabled_p ())
4465 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4466 "not vectorized: data ref analysis "
4467 "failed %G", stmt_info->stmt);
4468 if (is_a <bb_vec_info> (vinfo))
4469 {
4470 /* In BB vectorization the ref can still participate
4471 in dependence analysis, we just can't vectorize it. */
4472 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4473 continue;
4474 }
4475 return opt_result::failure_at (stmt_info->stmt,
4476 "not vectorized:"
4477 " data ref analysis failed: %G",
4478 stmt_info->stmt);
4479 }
4480 }
4481
4482 /* See if this was detected as SIMD lane access. */
/* vect_find_stmt_data_reference in this file stores -1 - N in AUX for
such accesses; negating it below records N + 1, i.e. a value from 1
to 4 for the AUX values tested here. */
4483 if (dr->aux == (void *)-1
4484 || dr->aux == (void *)-2
4485 || dr->aux == (void *)-3
4486 || dr->aux == (void *)-4)
4487 {
4488 if (nested_in_vect_loop_p (loop, stmt_info))
4489 return opt_result::failure_at (stmt_info->stmt,
4490 "not vectorized:"
4491 " data ref analysis failed: %G",
4492 stmt_info->stmt);
4493 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info)
4494 = -(uintptr_t) dr->aux;
4495 }
4496
4497 tree base = get_base_address (DR_REF (dr));
4498 if (base && VAR_P (base) && DECL_NONALIASED (base))
4499 {
4500 if (dump_enabled_p ())
4501 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4502 "not vectorized: base object not addressable "
4503 "for stmt: %G", stmt_info->stmt);
4504 if (is_a <bb_vec_info> (vinfo))
4505 {
4506 /* In BB vectorization the ref can still participate
4507 in dependence analysis, we just can't vectorize it. */
4508 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4509 continue;
4510 }
4511 return opt_result::failure_at (stmt_info->stmt,
4512 "not vectorized: base object not"
4513 " addressable for stmt: %G",
4514 stmt_info->stmt);
4515 }
4516
/* A step that is known but not a compile-time constant makes this a
strided access, which is rejected inside a nested loop. */
4517 if (is_a <loop_vec_info> (vinfo)
4518 && DR_STEP (dr)
4519 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
4520 {
4521 if (nested_in_vect_loop_p (loop, stmt_info))
4522 return opt_result::failure_at (stmt_info->stmt,
4523 "not vectorized: "
4524 "not suitable for strided load %G",
4525 stmt_info->stmt);
4526 STMT_VINFO_STRIDED_P (stmt_info) = true;
4527 }
4528
4529 /* Update DR field in stmt_vec_info struct. */
4530
4531 /* If the dataref is in an inner-loop of the loop that is considered
4532 for vectorization, we also want to analyze the access relative to
4533 the outer-loop (DR contains information only relative to the
4534 inner-most enclosing loop). We do that by building a reference to the
4535 first location accessed by the inner-loop, and analyze it relative to
4536 the outer-loop. */
4537 if (loop && nested_in_vect_loop_p (loop, stmt_info))
4538 {
4539 /* Build a reference to the first location accessed by the
4540 inner loop: *(BASE + INIT + OFFSET). By construction,
4541 this address must be invariant in the inner loop, so we
4542 can consider it as being used in the outer loop. */
/* Note: this BASE shadows the BASE declared earlier in the loop
body. */
4543 tree base = unshare_expr (DR_BASE_ADDRESS (dr));
4544 tree offset = unshare_expr (DR_OFFSET (dr));
4545 tree init = unshare_expr (DR_INIT (dr));
4546 tree init_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset),
4547 init, offset);
4548 tree init_addr = fold_build_pointer_plus (base, init_offset);
4549 tree init_ref = build_fold_indirect_ref (init_addr);
4550
4551 if (dump_enabled_p ())
4552 dump_printf_loc (MSG_NOTE, vect_location,
4553 "analyze in outer loop: %T\n", init_ref);
4554
4555 opt_result res
4556 = dr_analyze_innermost (&STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info),
4557 init_ref, loop, stmt_info->stmt);
4558 if (!res)
4559 /* dr_analyze_innermost already explained the failure. */
4560 return res;
4561
/* NOTE(review): the doubled newline after "outer base alignment"
in the dump text below looks unintentional -- confirm before
changing, since it alters dump output. */
4562 if (dump_enabled_p ())
4563 dump_printf_loc (MSG_NOTE, vect_location,
4564 "\touter base_address: %T\n"
4565 "\touter offset from base address: %T\n"
4566 "\touter constant offset from base address: %T\n"
4567 "\touter step: %T\n"
4568 "\touter base alignment: %d\n\n"
4569 "\touter base misalignment: %d\n"
4570 "\touter offset alignment: %d\n"
4571 "\touter step alignment: %d\n",
4572 STMT_VINFO_DR_BASE_ADDRESS (stmt_info),
4573 STMT_VINFO_DR_OFFSET (stmt_info),
4574 STMT_VINFO_DR_INIT (stmt_info),
4575 STMT_VINFO_DR_STEP (stmt_info),
4576 STMT_VINFO_DR_BASE_ALIGNMENT (stmt_info),
4577 STMT_VINFO_DR_BASE_MISALIGNMENT (stmt_info),
4578 STMT_VINFO_DR_OFFSET_ALIGNMENT (stmt_info),
4579 STMT_VINFO_DR_STEP_ALIGNMENT (stmt_info));
4580 }
4581
4582 /* Set vectype for STMT. */
4583 scalar_type = TREE_TYPE (DR_REF (dr));
4584 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type);
4585 if (!vectype)
4586 {
4587 if (dump_enabled_p ())
4588 {
4589 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4590 "not vectorized: no vectype for stmt: %G",
4591 stmt_info->stmt);
4592 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
4593 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
4594 scalar_type);
4595 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
4596 }
4597
4598 if (is_a <bb_vec_info> (vinfo))
4599 {
4600 /* No vector type is fine, the ref can still participate
4601 in dependence analysis, we just can't vectorize it. */
4602 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4603 continue;
4604 }
4605 if (fatal)
4606 *fatal = false;
4607 return opt_result::failure_at (stmt_info->stmt,
4608 "not vectorized:"
4609 " no vectype for stmt: %G"
4610 " scalar_type: %T\n",
4611 stmt_info->stmt, scalar_type);
4612 }
4613 else
4614 {
4615 if (dump_enabled_p ())
4616 dump_printf_loc (MSG_NOTE, vect_location,
4617 "got vectype for stmt: %G%T\n",
4618 stmt_info->stmt, vectype);
4619 }
4620
4621 /* Adjust the minimal vectorization factor according to the
4622 vector type. */
4623 vf = TYPE_VECTOR_SUBPARTS (vectype);
4624 *min_vf = upper_bound (*min_vf, vf);
4625
4626 /* Leave the BB vectorizer to pick the vector type later, based on
4627 the final dataref group size and SLP node size. */
4628 if (is_a <loop_vec_info> (vinfo))
4629 STMT_VINFO_VECTYPE (stmt_info) = vectype;
4630
/* For gather/scatter candidates the access has to be describable by
vect_check_gather_scatter and its offset needs a vector type. */
4631 if (gatherscatter != SG_NONE)
4632 {
4633 gather_scatter_info gs_info;
4634 if (!vect_check_gather_scatter (stmt_info,
4635 as_a <loop_vec_info> (vinfo),
4636 &gs_info)
4637 || !get_vectype_for_scalar_type (vinfo,
4638 TREE_TYPE (gs_info.offset)))
4639 {
4640 if (fatal)
4641 *fatal = false;
4642 return opt_result::failure_at
4643 (stmt_info->stmt,
4644 (gatherscatter == GATHER)
4645 ? "not vectorized: not suitable for gather load %G"
4646 : "not vectorized: not suitable for scatter store %G",
4647 stmt_info->stmt);
4648 }
/* Stores the enumerator itself (GATHER or SCATTER, both nonzero). */
4649 STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
4650 }
4651 }
4652
4653 /* We used to stop processing and prune the list here. Verify we no
4654 longer need to. */
4655 gcc_assert (i == datarefs.length ());
4656
4657 return opt_result::success ();
4658 }
4659
4660
4661 /* Function vect_get_new_vect_var.
4662
4663 Returns a name for a new variable. The current naming scheme appends the
4664 prefix "vect_" or "vect_p" (depending on the value of VAR_KIND) to
4665 the name of vectorizer generated variables, and appends that to NAME if
4666 provided. */
4667
4668 tree
4669 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
4670 {
4671 const char *prefix;
4672 tree new_vect_var;
4673
4674 switch (var_kind)
4675 {
4676 case vect_simple_var:
4677 prefix = "vect";
4678 break;
4679 case vect_scalar_var:
4680 prefix = "stmp";
4681 break;
4682 case vect_mask_var:
4683 prefix = "mask";
4684 break;
4685 case vect_pointer_var:
4686 prefix = "vectp";
4687 break;
4688 default:
4689 gcc_unreachable ();
4690 }
4691
4692 if (name)
4693 {
4694 char* tmp = concat (prefix, "_", name, NULL);
4695 new_vect_var = create_tmp_reg (type, tmp);
4696 free (tmp);
4697 }
4698 else
4699 new_vect_var = create_tmp_reg (type, prefix);
4700
4701 return new_vect_var;
4702 }
4703
4704 /* Like vect_get_new_vect_var but return an SSA name. */
4705
4706 tree
4707 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
4708 {
4709 const char *prefix;
4710 tree new_vect_var;
4711
4712 switch (var_kind)
4713 {
4714 case vect_simple_var:
4715 prefix = "vect";
4716 break;
4717 case vect_scalar_var:
4718 prefix = "stmp";
4719 break;
4720 case vect_pointer_var:
4721 prefix = "vectp";
4722 break;
4723 default:
4724 gcc_unreachable ();
4725 }
4726
4727 if (name)
4728 {
4729 char* tmp = concat (prefix, "_", name, NULL);
4730 new_vect_var = make_temp_ssa_name (type, NULL, tmp);
4731 free (tmp);
4732 }
4733 else
4734 new_vect_var = make_temp_ssa_name (type, NULL, prefix);
4735
4736 return new_vect_var;
4737 }
4738
4739 /* Duplicate points-to info on NAME from DR_INFO. */
4740
4741 static void
4742 vect_duplicate_ssa_name_ptr_info (tree name, dr_vec_info *dr_info)
4743 {
4744 duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr_info->dr));
4745 /* DR_PTR_INFO is for a base SSA name, not including constant or
4746 variable offsets in the ref so its alignment info does not apply. */
4747 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
4748 }
4749
4750 /* Function vect_create_addr_base_for_vector_ref.
4751
4752 Create an expression that computes the address of the first memory location
4753 that will be accessed for a data reference.
4754
4755 Input:
4756 STMT_INFO: The statement containing the data reference.
4757 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
4758 OFFSET: Optional. If supplied, it is be added to the initial address.
4759 LOOP: Specify relative to which loop-nest should the address be computed.
4760 For example, when the dataref is in an inner-loop nested in an
4761 outer-loop that is now being vectorized, LOOP can be either the
4762 outer-loop, or the inner-loop. The first memory location accessed
4763 by the following dataref ('in' points to short):
4764
4765 for (i=0; i<N; i++)
4766 for (j=0; j<M; j++)
4767 s += in[i+j]
4768
4769 is as follows:
4770 if LOOP=i_loop: &in (relative to i_loop)
4771 if LOOP=j_loop: &in+i*2B (relative to j_loop)
4772
4773 Output:
4774 1. Return an SSA_NAME whose value is the address of the memory location of
4775 the first vector of the data reference.
4776 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
4777 these statement(s) which define the returned SSA_NAME.
4778
4779 FORNOW: We are only handling array accesses with step 1. */
4780
4781 tree
4782 vect_create_addr_base_for_vector_ref (vec_info *vinfo, stmt_vec_info stmt_info,
4783 gimple_seq *new_stmt_list,
4784 tree offset)
4785 {
4786 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
4787 struct data_reference *dr = dr_info->dr;
4788 const char *base_name;
4789 tree addr_base;
4790 tree dest;
4791 gimple_seq seq = NULL;
4792 tree vect_ptr_type;
4793 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
4794 innermost_loop_behavior *drb = vect_dr_behavior (vinfo, dr_info);
4795
4796 tree data_ref_base = unshare_expr (drb->base_address);
4797 tree base_offset = unshare_expr (get_dr_vinfo_offset (vinfo, dr_info, true));
4798 tree init = unshare_expr (drb->init);
4799
4800 if (loop_vinfo)
4801 base_name = get_name (data_ref_base);
4802 else
4803 {
4804 base_offset = ssize_int (0);
4805 init = ssize_int (0);
4806 base_name = get_name (DR_REF (dr));
4807 }
4808
4809 /* Create base_offset */
4810 base_offset = size_binop (PLUS_EXPR,
4811 fold_convert (sizetype, base_offset),
4812 fold_convert (sizetype, init));
4813
4814 if (offset)
4815 {
4816 offset = fold_convert (sizetype, offset);
4817 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4818 base_offset, offset);
4819 }
4820
4821 /* base + base_offset */
4822 if (loop_vinfo)
4823 addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
4824 else
4825 {
4826 addr_base = build1 (ADDR_EXPR,
4827 build_pointer_type (TREE_TYPE (DR_REF (dr))),
4828 unshare_expr (DR_REF (dr)));
4829 }
4830
4831 vect_ptr_type = build_pointer_type (TREE_TYPE (DR_REF (dr)));
4832 dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
4833 addr_base = force_gimple_operand (addr_base, &seq, true, dest);
4834 gimple_seq_add_seq (new_stmt_list, seq);
4835
4836 if (DR_PTR_INFO (dr)
4837 && TREE_CODE (addr_base) == SSA_NAME
4838 /* We should only duplicate pointer info to newly created SSA names. */
4839 && SSA_NAME_VAR (addr_base) == dest)
4840 {
4841 gcc_assert (!SSA_NAME_PTR_INFO (addr_base));
4842 vect_duplicate_ssa_name_ptr_info (addr_base, dr_info);
4843 }
4844
4845 if (dump_enabled_p ())
4846 dump_printf_loc (MSG_NOTE, vect_location, "created %T\n", addr_base);
4847
4848 return addr_base;
4849 }
4850
4851
4852 /* Function vect_create_data_ref_ptr.
4853
4854 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
4855 location accessed in the loop by STMT_INFO, along with the def-use update
4856 chain to appropriately advance the pointer through the loop iterations.
4857 Also set aliasing information for the pointer. This pointer is used by
4858 the callers to this function to create a memory reference expression for
4859 vector load/store access.
4860
4861 Input:
4862 1. STMT_INFO: a stmt that references memory. Expected to be of the form
4863 GIMPLE_ASSIGN <name, data-ref> or
4864 GIMPLE_ASSIGN <data-ref, name>.
4865 2. AGGR_TYPE: the type of the reference, which should be either a vector
4866 or an array.
4867 3. AT_LOOP: the loop where the vector memref is to be created.
4868 4. OFFSET (optional): a byte offset to be added to the initial address
4869 accessed by the data-ref in STMT_INFO.
4870 5. BSI: location where the new stmts are to be placed if there is no loop
4871 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
4872 pointing to the initial address.
4873 8. IV_STEP (optional, defaults to NULL): the amount that should be added
4874 to the IV during each iteration of the loop. NULL says to move
4875 by one copy of AGGR_TYPE up or down, depending on the step of the
4876 data reference.
4877
4878 Output:
4879 1. Declare a new ptr to vector_type, and have it point to the base of the
4880 data reference (initial addressed accessed by the data reference).
4881 For example, for vector of type V8HI, the following code is generated:
4882
4883 v8hi *ap;
4884 ap = (v8hi *)initial_address;
4885
4886 if OFFSET is not supplied:
4887 initial_address = &a[init];
4888 if OFFSET is supplied:
4889 initial_address = &a[init] + OFFSET;
4890 if BYTE_OFFSET is supplied:
4891 initial_address = &a[init] + BYTE_OFFSET;
4892
4893 Return the initial_address in INITIAL_ADDRESS.
4894
4895 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
4896 update the pointer in each iteration of the loop.
4897
4898 Return the increment stmt that updates the pointer in PTR_INCR.
4899
4900 3. Return the pointer. */
4901
4902 tree
4903 vect_create_data_ref_ptr (vec_info *vinfo, stmt_vec_info stmt_info,
4904 tree aggr_type, class loop *at_loop, tree offset,
4905 tree *initial_address, gimple_stmt_iterator *gsi,
4906 gimple **ptr_incr, bool only_init,
4907 tree iv_step)
4908 {
4909 const char *base_name;
4910 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
4911 class loop *loop = NULL;
4912 bool nested_in_vect_loop = false;
4913 class loop *containing_loop = NULL;
4914 tree aggr_ptr_type;
4915 tree aggr_ptr;
4916 tree new_temp;
4917 gimple_seq new_stmt_list = NULL;
4918 edge pe = NULL;
4919 basic_block new_bb;
4920 tree aggr_ptr_init;
4921 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
4922 struct data_reference *dr = dr_info->dr;
4923 tree aptr;
4924 gimple_stmt_iterator incr_gsi;
4925 bool insert_after;
4926 tree indx_before_incr, indx_after_incr;
4927 gimple *incr;
4928 bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo);
4929
4930 gcc_assert (iv_step != NULL_TREE
4931 || TREE_CODE (aggr_type) == ARRAY_TYPE
4932 || TREE_CODE (aggr_type) == VECTOR_TYPE);
4933
4934 if (loop_vinfo)
4935 {
4936 loop = LOOP_VINFO_LOOP (loop_vinfo);
4937 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info);
4938 containing_loop = (gimple_bb (stmt_info->stmt))->loop_father;
4939 pe = loop_preheader_edge (loop);
4940 }
4941 else
4942 {
4943 gcc_assert (bb_vinfo);
4944 only_init = true;
4945 *ptr_incr = NULL;
4946 }
4947
4948 /* Create an expression for the first address accessed by this load
4949 in LOOP. */
4950 base_name = get_name (DR_BASE_ADDRESS (dr));
4951
4952 if (dump_enabled_p ())
4953 {
4954 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4955 dump_printf_loc (MSG_NOTE, vect_location,
4956 "create %s-pointer variable to type: %T",
4957 get_tree_code_name (TREE_CODE (aggr_type)),
4958 aggr_type);
4959 if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4960 dump_printf (MSG_NOTE, " vectorizing an array ref: ");
4961 else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4962 dump_printf (MSG_NOTE, " vectorizing a vector ref: ");
4963 else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4964 dump_printf (MSG_NOTE, " vectorizing a record based array ref: ");
4965 else
4966 dump_printf (MSG_NOTE, " vectorizing a pointer ref: ");
4967 dump_printf (MSG_NOTE, "%T\n", DR_BASE_OBJECT (dr));
4968 }
4969
4970 /* (1) Create the new aggregate-pointer variable.
4971 Vector and array types inherit the alias set of their component
4972 type by default so we need to use a ref-all pointer if the data
4973 reference does not conflict with the created aggregated data
4974 reference because it is not addressable. */
4975 bool need_ref_all = false;
4976 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4977 get_alias_set (DR_REF (dr))))
4978 need_ref_all = true;
4979 /* Likewise for any of the data references in the stmt group. */
4980 else if (DR_GROUP_SIZE (stmt_info) > 1)
4981 {
4982 stmt_vec_info sinfo = DR_GROUP_FIRST_ELEMENT (stmt_info);
4983 do
4984 {
4985 struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
4986 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4987 get_alias_set (DR_REF (sdr))))
4988 {
4989 need_ref_all = true;
4990 break;
4991 }
4992 sinfo = DR_GROUP_NEXT_ELEMENT (sinfo);
4993 }
4994 while (sinfo);
4995 }
4996 aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
4997 need_ref_all);
4998 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
4999
5000
5001 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
5002 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
5003 def-use update cycles for the pointer: one relative to the outer-loop
5004 (LOOP), which is what steps (3) and (4) below do. The other is relative
5005 to the inner-loop (which is the inner-most loop containing the dataref),
5006 and this is done be step (5) below.
5007
5008 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
5009 inner-most loop, and so steps (3),(4) work the same, and step (5) is
5010 redundant. Steps (3),(4) create the following:
5011
5012 vp0 = &base_addr;
5013 LOOP: vp1 = phi(vp0,vp2)
5014 ...
5015 ...
5016 vp2 = vp1 + step
5017 goto LOOP
5018
5019 If there is an inner-loop nested in loop, then step (5) will also be
5020 applied, and an additional update in the inner-loop will be created:
5021
5022 vp0 = &base_addr;
5023 LOOP: vp1 = phi(vp0,vp2)
5024 ...
5025 inner: vp3 = phi(vp1,vp4)
5026 vp4 = vp3 + inner_step
5027 if () goto inner
5028 ...
5029 vp2 = vp1 + step
5030 if () goto LOOP */
5031
5032 /* (2) Calculate the initial address of the aggregate-pointer, and set
5033 the aggregate-pointer to point to it before the loop. */
5034
5035 /* Create: (&(base[init_val]+offset) in the loop preheader. */
5036
5037 new_temp = vect_create_addr_base_for_vector_ref (vinfo,
5038 stmt_info, &new_stmt_list,
5039 offset);
5040 if (new_stmt_list)
5041 {
5042 if (pe)
5043 {
5044 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
5045 gcc_assert (!new_bb);
5046 }
5047 else
5048 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
5049 }
5050
5051 *initial_address = new_temp;
5052 aggr_ptr_init = new_temp;
5053
5054 /* (3) Handle the updating of the aggregate-pointer inside the loop.
5055 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
5056 inner-loop nested in LOOP (during outer-loop vectorization). */
5057
5058 /* No update in loop is required. */
5059 if (only_init && (!loop_vinfo || at_loop == loop))
5060 aptr = aggr_ptr_init;
5061 else
5062 {
5063 /* Accesses to invariant addresses should be handled specially
5064 by the caller. */
5065 tree step = vect_dr_behavior (vinfo, dr_info)->step;
5066 gcc_assert (!integer_zerop (step));
5067
5068 if (iv_step == NULL_TREE)
5069 {
5070 /* The step of the aggregate pointer is the type size,
5071 negated for downward accesses. */
5072 iv_step = TYPE_SIZE_UNIT (aggr_type);
5073 if (tree_int_cst_sgn (step) == -1)
5074 iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
5075 }
5076
5077 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
5078
5079 create_iv (aggr_ptr_init,
5080 fold_convert (aggr_ptr_type, iv_step),
5081 aggr_ptr, loop, &incr_gsi, insert_after,
5082 &indx_before_incr, &indx_after_incr);
5083 incr = gsi_stmt (incr_gsi);
5084
5085 /* Copy the points-to information if it exists. */
5086 if (DR_PTR_INFO (dr))
5087 {
5088 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr_info);
5089 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr_info);
5090 }
5091 if (ptr_incr)
5092 *ptr_incr = incr;
5093
5094 aptr = indx_before_incr;
5095 }
5096
5097 if (!nested_in_vect_loop || only_init)
5098 return aptr;
5099
5100
5101 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
5102 nested in LOOP, if exists. */
5103
5104 gcc_assert (nested_in_vect_loop);
5105 if (!only_init)
5106 {
5107 standard_iv_increment_position (containing_loop, &incr_gsi,
5108 &insert_after);
5109 create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
5110 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
5111 &indx_after_incr);
5112 incr = gsi_stmt (incr_gsi);
5113
5114 /* Copy the points-to information if it exists. */
5115 if (DR_PTR_INFO (dr))
5116 {
5117 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr_info);
5118 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr_info);
5119 }
5120 if (ptr_incr)
5121 *ptr_incr = incr;
5122
5123 return indx_before_incr;
5124 }
5125 else
5126 gcc_unreachable ();
5127 }
5128
5129
5130 /* Function bump_vector_ptr
5131
5132 Increment a pointer (to a vector type) by vector-size. If requested,
5133 i.e. if PTR-INCR is given, then also connect the new increment stmt
5134 to the existing def-use update-chain of the pointer, by modifying
5135 the PTR_INCR as illustrated below:
5136
5137 The pointer def-use update-chain before this function:
5138 DATAREF_PTR = phi (p_0, p_2)
5139 ....
5140 PTR_INCR: p_2 = DATAREF_PTR + step
5141
5142 The pointer def-use update-chain after this function:
5143 DATAREF_PTR = phi (p_0, p_2)
5144 ....
5145 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
5146 ....
5147 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
5148
5149 Input:
5150 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
5151 in the loop.
5152 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
5153 the loop. The increment amount across iterations is expected
5154 to be vector_size.
5155 BSI - location where the new update stmt is to be placed.
5156 STMT_INFO - the original scalar memory-access stmt that is being vectorized.
5157 BUMP - optional. The offset by which to bump the pointer. If not given,
5158 the offset is assumed to be vector_size.
5159
5160 Output: Return NEW_DATAREF_PTR as illustrated above.
5161
5162 */
5163
5164 tree
5165 bump_vector_ptr (vec_info *vinfo,
5166 tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
5167 stmt_vec_info stmt_info, tree bump)
5168 {
5169 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
5170 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5171 tree update = TYPE_SIZE_UNIT (vectype);
5172 gimple *incr_stmt;
5173 ssa_op_iter iter;
5174 use_operand_p use_p;
5175 tree new_dataref_ptr;
5176
5177 if (bump)
5178 update = bump;
5179
5180 if (TREE_CODE (dataref_ptr) == SSA_NAME)
5181 new_dataref_ptr = copy_ssa_name (dataref_ptr);
5182 else
5183 new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
5184 incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
5185 dataref_ptr, update);
5186 vect_finish_stmt_generation (vinfo, stmt_info, incr_stmt, gsi);
5187 /* Fold the increment, avoiding excessive chains use-def chains of
5188 those, leading to compile-time issues for passes until the next
5189 forwprop pass which would do this as well. */
5190 gimple_stmt_iterator fold_gsi = gsi_for_stmt (incr_stmt);
5191 if (fold_stmt (&fold_gsi, follow_all_ssa_edges))
5192 {
5193 incr_stmt = gsi_stmt (fold_gsi);
5194 update_stmt (incr_stmt);
5195 }
5196
5197 /* Copy the points-to information if it exists. */
5198 if (DR_PTR_INFO (dr))
5199 {
5200 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
5201 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
5202 }
5203
5204 if (!ptr_incr)
5205 return new_dataref_ptr;
5206
5207 /* Update the vector-pointer's cross-iteration increment. */
5208 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
5209 {
5210 tree use = USE_FROM_PTR (use_p);
5211
5212 if (use == dataref_ptr)
5213 SET_USE (use_p, new_dataref_ptr);
5214 else
5215 gcc_assert (operand_equal_p (use, update, 0));
5216 }
5217
5218 return new_dataref_ptr;
5219 }
5220
5221
5222 /* Copy memory reference info such as base/clique from the SRC reference
5223 to the DEST MEM_REF. */
5224
5225 void
5226 vect_copy_ref_info (tree dest, tree src)
5227 {
5228 if (TREE_CODE (dest) != MEM_REF)
5229 return;
5230
5231 tree src_base = src;
5232 while (handled_component_p (src_base))
5233 src_base = TREE_OPERAND (src_base, 0);
5234 if (TREE_CODE (src_base) != MEM_REF
5235 && TREE_CODE (src_base) != TARGET_MEM_REF)
5236 return;
5237
5238 MR_DEPENDENCE_CLIQUE (dest) = MR_DEPENDENCE_CLIQUE (src_base);
5239 MR_DEPENDENCE_BASE (dest) = MR_DEPENDENCE_BASE (src_base);
5240 }
5241
5242
5243 /* Function vect_create_destination_var.
5244
5245 Create a new temporary of type VECTYPE. */
5246
5247 tree
5248 vect_create_destination_var (tree scalar_dest, tree vectype)
5249 {
5250 tree vec_dest;
5251 const char *name;
5252 char *new_name;
5253 tree type;
5254 enum vect_var_kind kind;
5255
5256 kind = vectype
5257 ? VECTOR_BOOLEAN_TYPE_P (vectype)
5258 ? vect_mask_var
5259 : vect_simple_var
5260 : vect_scalar_var;
5261 type = vectype ? vectype : TREE_TYPE (scalar_dest);
5262
5263 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
5264
5265 name = get_name (scalar_dest);
5266 if (name)
5267 new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
5268 else
5269 new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
5270 vec_dest = vect_get_new_vect_var (type, kind, new_name);
5271 free (new_name);
5272
5273 return vec_dest;
5274 }
5275
5276 /* Function vect_grouped_store_supported.
5277
5278 Returns TRUE if interleave high and interleave low permutations
5279 are supported, and FALSE otherwise. */
5280
5281 bool
5282 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
5283 {
5284 machine_mode mode = TYPE_MODE (vectype);
5285
5286 /* vect_permute_store_chain requires the group size to be equal to 3 or
5287 be a power of two. */
5288 if (count != 3 && exact_log2 (count) == -1)
5289 {
5290 if (dump_enabled_p ())
5291 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5292 "the size of the group of accesses"
5293 " is not a power of 2 or not eqaul to 3\n");
5294 return false;
5295 }
5296
5297 /* Check that the permutation is supported. */
5298 if (VECTOR_MODE_P (mode))
5299 {
5300 unsigned int i;
5301 if (count == 3)
5302 {
5303 unsigned int j0 = 0, j1 = 0, j2 = 0;
5304 unsigned int i, j;
5305
5306 unsigned int nelt;
5307 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5308 {
5309 if (dump_enabled_p ())
5310 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5311 "cannot handle groups of 3 stores for"
5312 " variable-length vectors\n");
5313 return false;
5314 }
5315
5316 vec_perm_builder sel (nelt, nelt, 1);
5317 sel.quick_grow (nelt);
5318 vec_perm_indices indices;
5319 for (j = 0; j < 3; j++)
5320 {
5321 int nelt0 = ((3 - j) * nelt) % 3;
5322 int nelt1 = ((3 - j) * nelt + 1) % 3;
5323 int nelt2 = ((3 - j) * nelt + 2) % 3;
5324 for (i = 0; i < nelt; i++)
5325 {
5326 if (3 * i + nelt0 < nelt)
5327 sel[3 * i + nelt0] = j0++;
5328 if (3 * i + nelt1 < nelt)
5329 sel[3 * i + nelt1] = nelt + j1++;
5330 if (3 * i + nelt2 < nelt)
5331 sel[3 * i + nelt2] = 0;
5332 }
5333 indices.new_vector (sel, 2, nelt);
5334 if (!can_vec_perm_const_p (mode, indices))
5335 {
5336 if (dump_enabled_p ())
5337 dump_printf (MSG_MISSED_OPTIMIZATION,
5338 "permutation op not supported by target.\n");
5339 return false;
5340 }
5341
5342 for (i = 0; i < nelt; i++)
5343 {
5344 if (3 * i + nelt0 < nelt)
5345 sel[3 * i + nelt0] = 3 * i + nelt0;
5346 if (3 * i + nelt1 < nelt)
5347 sel[3 * i + nelt1] = 3 * i + nelt1;
5348 if (3 * i + nelt2 < nelt)
5349 sel[3 * i + nelt2] = nelt + j2++;
5350 }
5351 indices.new_vector (sel, 2, nelt);
5352 if (!can_vec_perm_const_p (mode, indices))
5353 {
5354 if (dump_enabled_p ())
5355 dump_printf (MSG_MISSED_OPTIMIZATION,
5356 "permutation op not supported by target.\n");
5357 return false;
5358 }
5359 }
5360 return true;
5361 }
5362 else
5363 {
5364 /* If length is not equal to 3 then only power of 2 is supported. */
5365 gcc_assert (pow2p_hwi (count));
5366 poly_uint64 nelt = GET_MODE_NUNITS (mode);
5367
5368 /* The encoding has 2 interleaved stepped patterns. */
5369 vec_perm_builder sel (nelt, 2, 3);
5370 sel.quick_grow (6);
5371 for (i = 0; i < 3; i++)
5372 {
5373 sel[i * 2] = i;
5374 sel[i * 2 + 1] = i + nelt;
5375 }
5376 vec_perm_indices indices (sel, 2, nelt);
5377 if (can_vec_perm_const_p (mode, indices))
5378 {
5379 for (i = 0; i < 6; i++)
5380 sel[i] += exact_div (nelt, 2);
5381 indices.new_vector (sel, 2, nelt);
5382 if (can_vec_perm_const_p (mode, indices))
5383 return true;
5384 }
5385 }
5386 }
5387
5388 if (dump_enabled_p ())
5389 dump_printf (MSG_MISSED_OPTIMIZATION,
5390 "permutation op not supported by target.\n");
5391 return false;
5392 }
5393
5394
5395 /* Return TRUE if vec_{mask_}store_lanes is available for COUNT vectors of
5396 type VECTYPE. MASKED_P says whether the masked form is needed. */
5397
5398 bool
5399 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5400 bool masked_p)
5401 {
5402 if (masked_p)
5403 return vect_lanes_optab_supported_p ("vec_mask_store_lanes",
5404 vec_mask_store_lanes_optab,
5405 vectype, count);
5406 else
5407 return vect_lanes_optab_supported_p ("vec_store_lanes",
5408 vec_store_lanes_optab,
5409 vectype, count);
5410 }
5411
5412
5413 /* Function vect_permute_store_chain.
5414
5415 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
5416 a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
5417 the data correctly for the stores. Return the final references for stores
5418 in RESULT_CHAIN.
5419
5420 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5421 The input is 4 vectors each containing 8 elements. We assign a number to
5422 each element, the input sequence is:
5423
5424 1st vec: 0 1 2 3 4 5 6 7
5425 2nd vec: 8 9 10 11 12 13 14 15
5426 3rd vec: 16 17 18 19 20 21 22 23
5427 4th vec: 24 25 26 27 28 29 30 31
5428
5429 The output sequence should be:
5430
5431 1st vec: 0 8 16 24 1 9 17 25
5432 2nd vec: 2 10 18 26 3 11 19 27
5433 3rd vec: 4 12 20 28 5 13 21 30
5434 4th vec: 6 14 22 30 7 15 23 31
5435
5436 i.e., we interleave the contents of the four vectors in their order.
5437
5438 We use interleave_high/low instructions to create such output. The input of
5439 each interleave_high/low operation is two vectors:
5440 1st vec 2nd vec
5441 0 1 2 3 4 5 6 7
5442 the even elements of the result vector are obtained left-to-right from the
5443 high/low elements of the first vector. The odd elements of the result are
5444 obtained left-to-right from the high/low elements of the second vector.
5445 The output of interleave_high will be: 0 4 1 5
5446 and of interleave_low: 2 6 3 7
5447
5448
5449 The permutation is done in log LENGTH stages. In each stage interleave_high
5450 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
5451 where the first argument is taken from the first half of DR_CHAIN and the
5452 second argument from it's second half.
5453 In our example,
5454
5455 I1: interleave_high (1st vec, 3rd vec)
5456 I2: interleave_low (1st vec, 3rd vec)
5457 I3: interleave_high (2nd vec, 4th vec)
5458 I4: interleave_low (2nd vec, 4th vec)
5459
5460 The output for the first stage is:
5461
5462 I1: 0 16 1 17 2 18 3 19
5463 I2: 4 20 5 21 6 22 7 23
5464 I3: 8 24 9 25 10 26 11 27
5465 I4: 12 28 13 29 14 30 15 31
5466
5467 The output of the second stage, i.e. the final result is:
5468
5469 I1: 0 8 16 24 1 9 17 25
5470 I2: 2 10 18 26 3 11 19 27
5471 I3: 4 12 20 28 5 13 21 30
5472 I4: 6 14 22 30 7 15 23 31. */
5473
5474 void
5475 vect_permute_store_chain (vec_info *vinfo, vec<tree> &dr_chain,
5476 unsigned int length,
5477 stmt_vec_info stmt_info,
5478 gimple_stmt_iterator *gsi,
5479 vec<tree> *result_chain)
5480 {
5481 tree vect1, vect2, high, low;
5482 gimple *perm_stmt;
5483 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5484 tree perm_mask_low, perm_mask_high;
5485 tree data_ref;
5486 tree perm3_mask_low, perm3_mask_high;
5487 unsigned int i, j, n, log_length = exact_log2 (length);
5488
5489 result_chain->quick_grow (length);
5490 memcpy (result_chain->address (), dr_chain.address (),
5491 length * sizeof (tree));
5492
5493 if (length == 3)
5494 {
5495 /* vect_grouped_store_supported ensures that this is constant. */
5496 unsigned int nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5497 unsigned int j0 = 0, j1 = 0, j2 = 0;
5498
5499 vec_perm_builder sel (nelt, nelt, 1);
5500 sel.quick_grow (nelt);
5501 vec_perm_indices indices;
5502 for (j = 0; j < 3; j++)
5503 {
5504 int nelt0 = ((3 - j) * nelt) % 3;
5505 int nelt1 = ((3 - j) * nelt + 1) % 3;
5506 int nelt2 = ((3 - j) * nelt + 2) % 3;
5507
5508 for (i = 0; i < nelt; i++)
5509 {
5510 if (3 * i + nelt0 < nelt)
5511 sel[3 * i + nelt0] = j0++;
5512 if (3 * i + nelt1 < nelt)
5513 sel[3 * i + nelt1] = nelt + j1++;
5514 if (3 * i + nelt2 < nelt)
5515 sel[3 * i + nelt2] = 0;
5516 }
5517 indices.new_vector (sel, 2, nelt);
5518 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5519
5520 for (i = 0; i < nelt; i++)
5521 {
5522 if (3 * i + nelt0 < nelt)
5523 sel[3 * i + nelt0] = 3 * i + nelt0;
5524 if (3 * i + nelt1 < nelt)
5525 sel[3 * i + nelt1] = 3 * i + nelt1;
5526 if (3 * i + nelt2 < nelt)
5527 sel[3 * i + nelt2] = nelt + j2++;
5528 }
5529 indices.new_vector (sel, 2, nelt);
5530 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5531
5532 vect1 = dr_chain[0];
5533 vect2 = dr_chain[1];
5534
5535 /* Create interleaving stmt:
5536 low = VEC_PERM_EXPR <vect1, vect2,
5537 {j, nelt, *, j + 1, nelt + j + 1, *,
5538 j + 2, nelt + j + 2, *, ...}> */
5539 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5540 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5541 vect2, perm3_mask_low);
5542 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5543
5544 vect1 = data_ref;
5545 vect2 = dr_chain[2];
5546 /* Create interleaving stmt:
5547 low = VEC_PERM_EXPR <vect1, vect2,
5548 {0, 1, nelt + j, 3, 4, nelt + j + 1,
5549 6, 7, nelt + j + 2, ...}> */
5550 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5551 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5552 vect2, perm3_mask_high);
5553 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5554 (*result_chain)[j] = data_ref;
5555 }
5556 }
5557 else
5558 {
5559 /* If length is not equal to 3 then only power of 2 is supported. */
5560 gcc_assert (pow2p_hwi (length));
5561
5562 /* The encoding has 2 interleaved stepped patterns. */
5563 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5564 vec_perm_builder sel (nelt, 2, 3);
5565 sel.quick_grow (6);
5566 for (i = 0; i < 3; i++)
5567 {
5568 sel[i * 2] = i;
5569 sel[i * 2 + 1] = i + nelt;
5570 }
5571 vec_perm_indices indices (sel, 2, nelt);
5572 perm_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5573
5574 for (i = 0; i < 6; i++)
5575 sel[i] += exact_div (nelt, 2);
5576 indices.new_vector (sel, 2, nelt);
5577 perm_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5578
5579 for (i = 0, n = log_length; i < n; i++)
5580 {
5581 for (j = 0; j < length/2; j++)
5582 {
5583 vect1 = dr_chain[j];
5584 vect2 = dr_chain[j+length/2];
5585
5586 /* Create interleaving stmt:
5587 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
5588 ...}> */
5589 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
5590 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
5591 vect2, perm_mask_high);
5592 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5593 (*result_chain)[2*j] = high;
5594
5595 /* Create interleaving stmt:
5596 low = VEC_PERM_EXPR <vect1, vect2,
5597 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
5598 ...}> */
5599 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
5600 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
5601 vect2, perm_mask_low);
5602 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5603 (*result_chain)[2*j+1] = low;
5604 }
5605 memcpy (dr_chain.address (), result_chain->address (),
5606 length * sizeof (tree));
5607 }
5608 }
5609 }
5610
5611 /* Function vect_setup_realignment
5612
5613 This function is called when vectorizing an unaligned load using
5614 the dr_explicit_realign[_optimized] scheme.
5615 This function generates the following code at the loop prolog:
5616
5617 p = initial_addr;
5618 x msq_init = *(floor(p)); # prolog load
5619 realignment_token = call target_builtin;
5620 loop:
5621 x msq = phi (msq_init, ---)
5622
5623 The stmts marked with x are generated only for the case of
5624 dr_explicit_realign_optimized.
5625
5626 The code above sets up a new (vector) pointer, pointing to the first
5627 location accessed by STMT_INFO, and a "floor-aligned" load using that
5628 pointer. It also generates code to compute the "realignment-token"
5629 (if the relevant target hook was defined), and creates a phi-node at the
5630 loop-header bb whose arguments are the result of the prolog-load (created
5631 by this function) and the result of a load that takes place in the loop
5632 (to be created by the caller to this function).
5633
5634 For the case of dr_explicit_realign_optimized:
5635 The caller to this function uses the phi-result (msq) to create the
5636 realignment code inside the loop, and sets up the missing phi argument,
5637 as follows:
5638 loop:
5639 msq = phi (msq_init, lsq)
5640 lsq = *(floor(p')); # load in loop
5641 result = realign_load (msq, lsq, realignment_token);
5642
5643 For the case of dr_explicit_realign:
5644 loop:
5645 msq = *(floor(p)); # load in loop
5646 p' = p + (VS-1);
5647 lsq = *(floor(p')); # load in loop
5648 result = realign_load (msq, lsq, realignment_token);
5649
5650 Input:
5651 STMT_INFO - (scalar) load stmt to be vectorized. This load accesses
5652 a memory location that may be unaligned.
   GSI - place where new code is to be inserted.
5654 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
5655 is used.
5656
5657 Output:
5658 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
5659 target hook, if defined.
5660 Return value - the result of the loop-header phi node. */
5661
5662 tree
5663 vect_setup_realignment (vec_info *vinfo, stmt_vec_info stmt_info,
5664 gimple_stmt_iterator *gsi, tree *realignment_token,
5665 enum dr_alignment_support alignment_support_scheme,
5666 tree init_addr,
5667 class loop **at_loop)
5668 {
5669 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5670 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
5671 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
5672 struct data_reference *dr = dr_info->dr;
5673 class loop *loop = NULL;
5674 edge pe = NULL;
5675 tree scalar_dest = gimple_assign_lhs (stmt_info->stmt);
5676 tree vec_dest;
5677 gimple *inc;
5678 tree ptr;
5679 tree data_ref;
5680 basic_block new_bb;
5681 tree msq_init = NULL_TREE;
5682 tree new_temp;
5683 gphi *phi_stmt;
5684 tree msq = NULL_TREE;
5685 gimple_seq stmts = NULL;
5686 bool compute_in_loop = false;
5687 bool nested_in_vect_loop = false;
5688 class loop *containing_loop = (gimple_bb (stmt_info->stmt))->loop_father;
5689 class loop *loop_for_initial_load = NULL;
5690
5691 if (loop_vinfo)
5692 {
5693 loop = LOOP_VINFO_LOOP (loop_vinfo);
5694 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info);
5695 }
5696
5697 gcc_assert (alignment_support_scheme == dr_explicit_realign
5698 || alignment_support_scheme == dr_explicit_realign_optimized);
5699
5700 /* We need to generate three things:
5701 1. the misalignment computation
5702 2. the extra vector load (for the optimized realignment scheme).
5703 3. the phi node for the two vectors from which the realignment is
5704 done (for the optimized realignment scheme). */
5705
5706 /* 1. Determine where to generate the misalignment computation.
5707
5708 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
5709 calculation will be generated by this function, outside the loop (in the
5710 preheader). Otherwise, INIT_ADDR had already been computed for us by the
5711 caller, inside the loop.
5712
5713 Background: If the misalignment remains fixed throughout the iterations of
5714 the loop, then both realignment schemes are applicable, and also the
5715 misalignment computation can be done outside LOOP. This is because we are
5716 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
5717 are a multiple of VS (the Vector Size), and therefore the misalignment in
5718 different vectorized LOOP iterations is always the same.
5719 The problem arises only if the memory access is in an inner-loop nested
5720 inside LOOP, which is now being vectorized using outer-loop vectorization.
5721 This is the only case when the misalignment of the memory access may not
5722 remain fixed throughout the iterations of the inner-loop (as explained in
5723 detail in vect_supportable_dr_alignment). In this case, not only is the
5724 optimized realignment scheme not applicable, but also the misalignment
5725 computation (and generation of the realignment token that is passed to
5726 REALIGN_LOAD) have to be done inside the loop.
5727
5728 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
5729 or not, which in turn determines if the misalignment is computed inside
5730 the inner-loop, or outside LOOP. */
5731
5732 if (init_addr != NULL_TREE || !loop_vinfo)
5733 {
5734 compute_in_loop = true;
5735 gcc_assert (alignment_support_scheme == dr_explicit_realign);
5736 }
5737
5738
5739 /* 2. Determine where to generate the extra vector load.
5740
5741 For the optimized realignment scheme, instead of generating two vector
5742 loads in each iteration, we generate a single extra vector load in the
5743 preheader of the loop, and in each iteration reuse the result of the
5744 vector load from the previous iteration. In case the memory access is in
5745 an inner-loop nested inside LOOP, which is now being vectorized using
5746 outer-loop vectorization, we need to determine whether this initial vector
5747 load should be generated at the preheader of the inner-loop, or can be
5748 generated at the preheader of LOOP. If the memory access has no evolution
5749 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
5750 to be generated inside LOOP (in the preheader of the inner-loop). */
5751
5752 if (nested_in_vect_loop)
5753 {
5754 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
5755 bool invariant_in_outerloop =
5756 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
5757 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
5758 }
5759 else
5760 loop_for_initial_load = loop;
5761 if (at_loop)
5762 *at_loop = loop_for_initial_load;
5763
5764 if (loop_for_initial_load)
5765 pe = loop_preheader_edge (loop_for_initial_load);
5766
5767 /* 3. For the case of the optimized realignment, create the first vector
5768 load at the loop preheader. */
5769
5770 if (alignment_support_scheme == dr_explicit_realign_optimized)
5771 {
5772 /* Create msq_init = *(floor(p1)) in the loop preheader */
5773 gassign *new_stmt;
5774
5775 gcc_assert (!compute_in_loop);
5776 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5777 ptr = vect_create_data_ref_ptr (vinfo, stmt_info, vectype,
5778 loop_for_initial_load, NULL_TREE,
5779 &init_addr, NULL, &inc, true);
5780 if (TREE_CODE (ptr) == SSA_NAME)
5781 new_temp = copy_ssa_name (ptr);
5782 else
5783 new_temp = make_ssa_name (TREE_TYPE (ptr));
5784 poly_uint64 align = DR_TARGET_ALIGNMENT (dr_info);
5785 tree type = TREE_TYPE (ptr);
5786 new_stmt = gimple_build_assign
5787 (new_temp, BIT_AND_EXPR, ptr,
5788 fold_build2 (MINUS_EXPR, type,
5789 build_int_cst (type, 0),
5790 build_int_cst (type, align)));
5791 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5792 gcc_assert (!new_bb);
5793 data_ref
5794 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
5795 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
5796 vect_copy_ref_info (data_ref, DR_REF (dr));
5797 new_stmt = gimple_build_assign (vec_dest, data_ref);
5798 new_temp = make_ssa_name (vec_dest, new_stmt);
5799 gimple_assign_set_lhs (new_stmt, new_temp);
5800 if (pe)
5801 {
5802 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5803 gcc_assert (!new_bb);
5804 }
5805 else
5806 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5807
5808 msq_init = gimple_assign_lhs (new_stmt);
5809 }
5810
5811 /* 4. Create realignment token using a target builtin, if available.
5812 It is done either inside the containing loop, or before LOOP (as
5813 determined above). */
5814
5815 if (targetm.vectorize.builtin_mask_for_load)
5816 {
5817 gcall *new_stmt;
5818 tree builtin_decl;
5819
5820 /* Compute INIT_ADDR - the initial addressed accessed by this memref. */
5821 if (!init_addr)
5822 {
5823 /* Generate the INIT_ADDR computation outside LOOP. */
5824 init_addr = vect_create_addr_base_for_vector_ref (vinfo,
5825 stmt_info, &stmts,
5826 NULL_TREE);
5827 if (loop)
5828 {
5829 pe = loop_preheader_edge (loop);
5830 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
5831 gcc_assert (!new_bb);
5832 }
5833 else
5834 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
5835 }
5836
5837 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
5838 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
5839 vec_dest =
5840 vect_create_destination_var (scalar_dest,
5841 gimple_call_return_type (new_stmt));
5842 new_temp = make_ssa_name (vec_dest, new_stmt);
5843 gimple_call_set_lhs (new_stmt, new_temp);
5844
5845 if (compute_in_loop)
5846 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5847 else
5848 {
5849 /* Generate the misalignment computation outside LOOP. */
5850 pe = loop_preheader_edge (loop);
5851 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5852 gcc_assert (!new_bb);
5853 }
5854
5855 *realignment_token = gimple_call_lhs (new_stmt);
5856
5857 /* The result of the CALL_EXPR to this builtin is determined from
5858 the value of the parameter and no global variables are touched
5859 which makes the builtin a "const" function. Requiring the
5860 builtin to have the "const" attribute makes it unnecessary
5861 to call mark_call_clobbered. */
5862 gcc_assert (TREE_READONLY (builtin_decl));
5863 }
5864
5865 if (alignment_support_scheme == dr_explicit_realign)
5866 return msq;
5867
5868 gcc_assert (!compute_in_loop);
5869 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
5870
5871
5872 /* 5. Create msq = phi <msq_init, lsq> in loop */
5873
5874 pe = loop_preheader_edge (containing_loop);
5875 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5876 msq = make_ssa_name (vec_dest);
5877 phi_stmt = create_phi_node (msq, containing_loop->header);
5878 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
5879
5880 return msq;
5881 }
5882
5883
5884 /* Function vect_grouped_load_supported.
5885
5886 COUNT is the size of the load group (the number of statements plus the
5887 number of gaps). SINGLE_ELEMENT_P is true if there is actually
5888 only one statement, with a gap of COUNT - 1.
5889
5890 Returns true if a suitable permute exists. */
5891
5892 bool
5893 vect_grouped_load_supported (tree vectype, bool single_element_p,
5894 unsigned HOST_WIDE_INT count)
5895 {
5896 machine_mode mode = TYPE_MODE (vectype);
5897
5898 /* If this is single-element interleaving with an element distance
5899 that leaves unused vector loads around punt - we at least create
5900 very sub-optimal code in that case (and blow up memory,
5901 see PR65518). */
5902 if (single_element_p && maybe_gt (count, TYPE_VECTOR_SUBPARTS (vectype)))
5903 {
5904 if (dump_enabled_p ())
5905 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5906 "single-element interleaving not supported "
5907 "for not adjacent vector loads\n");
5908 return false;
5909 }
5910
5911 /* vect_permute_load_chain requires the group size to be equal to 3 or
5912 be a power of two. */
5913 if (count != 3 && exact_log2 (count) == -1)
5914 {
5915 if (dump_enabled_p ())
5916 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5917 "the size of the group of accesses"
5918 " is not a power of 2 or not equal to 3\n");
5919 return false;
5920 }
5921
5922 /* Check that the permutation is supported. */
5923 if (VECTOR_MODE_P (mode))
5924 {
5925 unsigned int i, j;
5926 if (count == 3)
5927 {
5928 unsigned int nelt;
5929 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5930 {
5931 if (dump_enabled_p ())
5932 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5933 "cannot handle groups of 3 loads for"
5934 " variable-length vectors\n");
5935 return false;
5936 }
5937
5938 vec_perm_builder sel (nelt, nelt, 1);
5939 sel.quick_grow (nelt);
5940 vec_perm_indices indices;
5941 unsigned int k;
5942 for (k = 0; k < 3; k++)
5943 {
5944 for (i = 0; i < nelt; i++)
5945 if (3 * i + k < 2 * nelt)
5946 sel[i] = 3 * i + k;
5947 else
5948 sel[i] = 0;
5949 indices.new_vector (sel, 2, nelt);
5950 if (!can_vec_perm_const_p (mode, indices))
5951 {
5952 if (dump_enabled_p ())
5953 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5954 "shuffle of 3 loads is not supported by"
5955 " target\n");
5956 return false;
5957 }
5958 for (i = 0, j = 0; i < nelt; i++)
5959 if (3 * i + k < 2 * nelt)
5960 sel[i] = i;
5961 else
5962 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5963 indices.new_vector (sel, 2, nelt);
5964 if (!can_vec_perm_const_p (mode, indices))
5965 {
5966 if (dump_enabled_p ())
5967 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5968 "shuffle of 3 loads is not supported by"
5969 " target\n");
5970 return false;
5971 }
5972 }
5973 return true;
5974 }
5975 else
5976 {
5977 /* If length is not equal to 3 then only power of 2 is supported. */
5978 gcc_assert (pow2p_hwi (count));
5979 poly_uint64 nelt = GET_MODE_NUNITS (mode);
5980
5981 /* The encoding has a single stepped pattern. */
5982 vec_perm_builder sel (nelt, 1, 3);
5983 sel.quick_grow (3);
5984 for (i = 0; i < 3; i++)
5985 sel[i] = i * 2;
5986 vec_perm_indices indices (sel, 2, nelt);
5987 if (can_vec_perm_const_p (mode, indices))
5988 {
5989 for (i = 0; i < 3; i++)
5990 sel[i] = i * 2 + 1;
5991 indices.new_vector (sel, 2, nelt);
5992 if (can_vec_perm_const_p (mode, indices))
5993 return true;
5994 }
5995 }
5996 }
5997
5998 if (dump_enabled_p ())
5999 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6000 "extract even/odd not supported by target\n");
6001 return false;
6002 }
6003
6004 /* Return TRUE if vec_{masked_}load_lanes is available for COUNT vectors of
6005 type VECTYPE. MASKED_P says whether the masked form is needed. */
6006
6007 bool
6008 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
6009 bool masked_p)
6010 {
6011 if (masked_p)
6012 return vect_lanes_optab_supported_p ("vec_mask_load_lanes",
6013 vec_mask_load_lanes_optab,
6014 vectype, count);
6015 else
6016 return vect_lanes_optab_supported_p ("vec_load_lanes",
6017 vec_load_lanes_optab,
6018 vectype, count);
6019 }
6020
6021 /* Function vect_permute_load_chain.
6022
6023 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
6024 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
6025 the input data correctly. Return the final references for loads in
6026 RESULT_CHAIN.
6027
6028 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
6029 The input is 4 vectors each containing 8 elements. We assign a number to each
6030 element, the input sequence is:
6031
6032 1st vec: 0 1 2 3 4 5 6 7
6033 2nd vec: 8 9 10 11 12 13 14 15
6034 3rd vec: 16 17 18 19 20 21 22 23
6035 4th vec: 24 25 26 27 28 29 30 31
6036
6037 The output sequence should be:
6038
6039 1st vec: 0 4 8 12 16 20 24 28
6040 2nd vec: 1 5 9 13 17 21 25 29
6041 3rd vec: 2 6 10 14 18 22 26 30
6042 4th vec: 3 7 11 15 19 23 27 31
6043
6044 i.e., the first output vector should contain the first elements of each
6045 interleaving group, etc.
6046
6047 We use extract_even/odd instructions to create such output. The input of
6048 each extract_even/odd operation is two vectors
6049 1st vec 2nd vec
6050 0 1 2 3 4 5 6 7
6051
6052 and the output is the vector of extracted even/odd elements. The output of
6053 extract_even will be: 0 2 4 6
6054 and of extract_odd: 1 3 5 7
6055
6056
6057 The permutation is done in log LENGTH stages. In each stage extract_even
6058 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
6059 their order. In our example,
6060
6061 E1: extract_even (1st vec, 2nd vec)
6062 E2: extract_odd (1st vec, 2nd vec)
6063 E3: extract_even (3rd vec, 4th vec)
6064 E4: extract_odd (3rd vec, 4th vec)
6065
6066 The output for the first stage will be:
6067
6068 E1: 0 2 4 6 8 10 12 14
6069 E2: 1 3 5 7 9 11 13 15
6070 E3: 16 18 20 22 24 26 28 30
6071 E4: 17 19 21 23 25 27 29 31
6072
6073 In order to proceed and create the correct sequence for the next stage (or
6074 for the correct output, if the second stage is the last one, as in our
6075 example), we first put the output of extract_even operation and then the
6076 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
6077 The input for the second stage is:
6078
6079 1st vec (E1): 0 2 4 6 8 10 12 14
6080 2nd vec (E3): 16 18 20 22 24 26 28 30
6081 3rd vec (E2): 1 3 5 7 9 11 13 15
6082 4th vec (E4): 17 19 21 23 25 27 29 31
6083
6084 The output of the second stage:
6085
6086 E1: 0 4 8 12 16 20 24 28
6087 E2: 2 6 10 14 18 22 26 30
6088 E3: 1 5 9 13 17 21 25 29
6089 E4: 3 7 11 15 19 23 27 31
6090
6091 And RESULT_CHAIN after reordering:
6092
6093 1st vec (E1): 0 4 8 12 16 20 24 28
6094 2nd vec (E3): 1 5 9 13 17 21 25 29
6095 3rd vec (E2): 2 6 10 14 18 22 26 30
6096 4th vec (E4): 3 7 11 15 19 23 27 31. */
6097
6098 static void
6099 vect_permute_load_chain (vec_info *vinfo, vec<tree> dr_chain,
6100 unsigned int length,
6101 stmt_vec_info stmt_info,
6102 gimple_stmt_iterator *gsi,
6103 vec<tree> *result_chain)
6104 {
6105 tree data_ref, first_vect, second_vect;
6106 tree perm_mask_even, perm_mask_odd;
6107 tree perm3_mask_low, perm3_mask_high;
6108 gimple *perm_stmt;
6109 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6110 unsigned int i, j, log_length = exact_log2 (length);
6111
6112 result_chain->quick_grow (length);
6113 memcpy (result_chain->address (), dr_chain.address (),
6114 length * sizeof (tree));
6115
6116 if (length == 3)
6117 {
6118 /* vect_grouped_load_supported ensures that this is constant. */
6119 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
6120 unsigned int k;
6121
6122 vec_perm_builder sel (nelt, nelt, 1);
6123 sel.quick_grow (nelt);
6124 vec_perm_indices indices;
6125 for (k = 0; k < 3; k++)
6126 {
6127 for (i = 0; i < nelt; i++)
6128 if (3 * i + k < 2 * nelt)
6129 sel[i] = 3 * i + k;
6130 else
6131 sel[i] = 0;
6132 indices.new_vector (sel, 2, nelt);
6133 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
6134
6135 for (i = 0, j = 0; i < nelt; i++)
6136 if (3 * i + k < 2 * nelt)
6137 sel[i] = i;
6138 else
6139 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
6140 indices.new_vector (sel, 2, nelt);
6141 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
6142
6143 first_vect = dr_chain[0];
6144 second_vect = dr_chain[1];
6145
6146 /* Create interleaving stmt (low part of):
6147 low = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
6148 ...}> */
6149 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
6150 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
6151 second_vect, perm3_mask_low);
6152 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6153
6154 /* Create interleaving stmt (high part of):
6155 high = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
6156 ...}> */
6157 first_vect = data_ref;
6158 second_vect = dr_chain[2];
6159 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
6160 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
6161 second_vect, perm3_mask_high);
6162 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6163 (*result_chain)[k] = data_ref;
6164 }
6165 }
6166 else
6167 {
6168 /* If length is not equal to 3 then only power of 2 is supported. */
6169 gcc_assert (pow2p_hwi (length));
6170
6171 /* The encoding has a single stepped pattern. */
6172 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
6173 vec_perm_builder sel (nelt, 1, 3);
6174 sel.quick_grow (3);
6175 for (i = 0; i < 3; ++i)
6176 sel[i] = i * 2;
6177 vec_perm_indices indices (sel, 2, nelt);
6178 perm_mask_even = vect_gen_perm_mask_checked (vectype, indices);
6179
6180 for (i = 0; i < 3; ++i)
6181 sel[i] = i * 2 + 1;
6182 indices.new_vector (sel, 2, nelt);
6183 perm_mask_odd = vect_gen_perm_mask_checked (vectype, indices);
6184
6185 for (i = 0; i < log_length; i++)
6186 {
6187 for (j = 0; j < length; j += 2)
6188 {
6189 first_vect = dr_chain[j];
6190 second_vect = dr_chain[j+1];
6191
6192 /* data_ref = permute_even (first_data_ref, second_data_ref); */
6193 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
6194 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6195 first_vect, second_vect,
6196 perm_mask_even);
6197 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6198 (*result_chain)[j/2] = data_ref;
6199
6200 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
6201 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
6202 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6203 first_vect, second_vect,
6204 perm_mask_odd);
6205 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6206 (*result_chain)[j/2+length/2] = data_ref;
6207 }
6208 memcpy (dr_chain.address (), result_chain->address (),
6209 length * sizeof (tree));
6210 }
6211 }
6212 }
6213
6214 /* Function vect_shift_permute_load_chain.
6215
   Given a chain of loads in DR_CHAIN of LENGTH that is a power of 2 or
   equal to 3, generate sequence of stmts to reorder the input data
   accordingly.  Return the final references for loads in RESULT_CHAIN.
   Return true if successful, false otherwise.
6220
6221 E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
6222 The input is 3 vectors each containing 8 elements. We assign a
6223 number to each element, the input sequence is:
6224
6225 1st vec: 0 1 2 3 4 5 6 7
6226 2nd vec: 8 9 10 11 12 13 14 15
6227 3rd vec: 16 17 18 19 20 21 22 23
6228
6229 The output sequence should be:
6230
6231 1st vec: 0 3 6 9 12 15 18 21
6232 2nd vec: 1 4 7 10 13 16 19 22
6233 3rd vec: 2 5 8 11 14 17 20 23
6234
6235 We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
6236
6237 First we shuffle all 3 vectors to get correct elements order:
6238
6239 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
6240 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
6241 3rd vec: (16 19 22) (17 20 23) (18 21)
6242
6243 Next we unite and shift vector 3 times:
6244
6245 1st step:
6246 shift right by 6 the concatenation of:
6247 "1st vec" and "2nd vec"
6248 ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
6249 "2nd vec" and "3rd vec"
6250 ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
6251 "3rd vec" and "1st vec"
6252 (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
6253 | New vectors |
6254
6255 So that now new vectors are:
6256
6257 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
6258 2nd vec: (10 13) (16 19 22) (17 20 23)
6259 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
6260
6261 2nd step:
6262 shift right by 5 the concatenation of:
6263 "1st vec" and "3rd vec"
6264 ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
6265 "2nd vec" and "1st vec"
6266 (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
6267 "3rd vec" and "2nd vec"
6268 (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
6269 | New vectors |
6270
6271 So that now new vectors are:
6272
6273 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
6274 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
6275 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
6276
6277 3rd step:
6278 shift right by 5 the concatenation of:
6279 "1st vec" and "1st vec"
6280 ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
6281 shift right by 3 the concatenation of:
6282 "2nd vec" and "2nd vec"
6283 (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
6284 | New vectors |
6285
6286 So that now all vectors are READY:
6287 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
6288 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
6289 3rd vec: ( 1 4 7) (10 13) (16 19 22)
6290
6291 This algorithm is faster than one in vect_permute_load_chain if:
   1. "shift of a concatenation" is faster than general permutation.
6293 This is usually so.
6294 2. The TARGET machine can't execute vector instructions in parallel.
6295 This is because each step of the algorithm depends on previous.
6296 The algorithm in vect_permute_load_chain is much more parallel.
6297
6298 The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
6299 */
6300
6301 static bool
6302 vect_shift_permute_load_chain (vec_info *vinfo, vec<tree> dr_chain,
6303 unsigned int length,
6304 stmt_vec_info stmt_info,
6305 gimple_stmt_iterator *gsi,
6306 vec<tree> *result_chain)
6307 {
6308 tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
6309 tree perm2_mask1, perm2_mask2, perm3_mask;
6310 tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
6311 gimple *perm_stmt;
6312
6313 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6314 unsigned int i;
6315 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
6316
6317 unsigned HOST_WIDE_INT nelt, vf;
6318 if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nelt)
6319 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&vf))
6320 /* Not supported for variable-length vectors. */
6321 return false;
6322
6323 vec_perm_builder sel (nelt, nelt, 1);
6324 sel.quick_grow (nelt);
6325
6326 result_chain->quick_grow (length);
6327 memcpy (result_chain->address (), dr_chain.address (),
6328 length * sizeof (tree));
6329
6330 if (pow2p_hwi (length) && vf > 4)
6331 {
6332 unsigned int j, log_length = exact_log2 (length);
6333 for (i = 0; i < nelt / 2; ++i)
6334 sel[i] = i * 2;
6335 for (i = 0; i < nelt / 2; ++i)
6336 sel[nelt / 2 + i] = i * 2 + 1;
6337 vec_perm_indices indices (sel, 2, nelt);
6338 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6339 {
6340 if (dump_enabled_p ())
6341 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6342 "shuffle of 2 fields structure is not \
6343 supported by target\n");
6344 return false;
6345 }
6346 perm2_mask1 = vect_gen_perm_mask_checked (vectype, indices);
6347
6348 for (i = 0; i < nelt / 2; ++i)
6349 sel[i] = i * 2 + 1;
6350 for (i = 0; i < nelt / 2; ++i)
6351 sel[nelt / 2 + i] = i * 2;
6352 indices.new_vector (sel, 2, nelt);
6353 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6354 {
6355 if (dump_enabled_p ())
6356 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6357 "shuffle of 2 fields structure is not \
6358 supported by target\n");
6359 return false;
6360 }
6361 perm2_mask2 = vect_gen_perm_mask_checked (vectype, indices);
6362
6363 /* Generating permutation constant to shift all elements.
6364 For vector length 8 it is {4 5 6 7 8 9 10 11}. */
6365 for (i = 0; i < nelt; i++)
6366 sel[i] = nelt / 2 + i;
6367 indices.new_vector (sel, 2, nelt);
6368 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6369 {
6370 if (dump_enabled_p ())
6371 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6372 "shift permutation is not supported by target\n");
6373 return false;
6374 }
6375 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6376
6377 /* Generating permutation constant to select vector from 2.
6378 For vector length 8 it is {0 1 2 3 12 13 14 15}. */
6379 for (i = 0; i < nelt / 2; i++)
6380 sel[i] = i;
6381 for (i = nelt / 2; i < nelt; i++)
6382 sel[i] = nelt + i;
6383 indices.new_vector (sel, 2, nelt);
6384 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6385 {
6386 if (dump_enabled_p ())
6387 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6388 "select is not supported by target\n");
6389 return false;
6390 }
6391 select_mask = vect_gen_perm_mask_checked (vectype, indices);
6392
6393 for (i = 0; i < log_length; i++)
6394 {
6395 for (j = 0; j < length; j += 2)
6396 {
6397 first_vect = dr_chain[j];
6398 second_vect = dr_chain[j + 1];
6399
6400 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6401 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6402 first_vect, first_vect,
6403 perm2_mask1);
6404 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6405 vect[0] = data_ref;
6406
6407 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6408 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6409 second_vect, second_vect,
6410 perm2_mask2);
6411 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6412 vect[1] = data_ref;
6413
6414 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
6415 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6416 vect[0], vect[1], shift1_mask);
6417 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6418 (*result_chain)[j/2 + length/2] = data_ref;
6419
6420 data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
6421 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6422 vect[0], vect[1], select_mask);
6423 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6424 (*result_chain)[j/2] = data_ref;
6425 }
6426 memcpy (dr_chain.address (), result_chain->address (),
6427 length * sizeof (tree));
6428 }
6429 return true;
6430 }
6431 if (length == 3 && vf > 2)
6432 {
6433 unsigned int k = 0, l = 0;
6434
6435 /* Generating permutation constant to get all elements in rigth order.
6436 For vector length 8 it is {0 3 6 1 4 7 2 5}. */
6437 for (i = 0; i < nelt; i++)
6438 {
6439 if (3 * k + (l % 3) >= nelt)
6440 {
6441 k = 0;
6442 l += (3 - (nelt % 3));
6443 }
6444 sel[i] = 3 * k + (l % 3);
6445 k++;
6446 }
6447 vec_perm_indices indices (sel, 2, nelt);
6448 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6449 {
6450 if (dump_enabled_p ())
6451 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6452 "shuffle of 3 fields structure is not \
6453 supported by target\n");
6454 return false;
6455 }
6456 perm3_mask = vect_gen_perm_mask_checked (vectype, indices);
6457
6458 /* Generating permutation constant to shift all elements.
6459 For vector length 8 it is {6 7 8 9 10 11 12 13}. */
6460 for (i = 0; i < nelt; i++)
6461 sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
6462 indices.new_vector (sel, 2, nelt);
6463 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6464 {
6465 if (dump_enabled_p ())
6466 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6467 "shift permutation is not supported by target\n");
6468 return false;
6469 }
6470 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6471
6472 /* Generating permutation constant to shift all elements.
6473 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6474 for (i = 0; i < nelt; i++)
6475 sel[i] = 2 * (nelt / 3) + 1 + i;
6476 indices.new_vector (sel, 2, nelt);
6477 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6478 {
6479 if (dump_enabled_p ())
6480 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6481 "shift permutation is not supported by target\n");
6482 return false;
6483 }
6484 shift2_mask = vect_gen_perm_mask_checked (vectype, indices);
6485
6486 /* Generating permutation constant to shift all elements.
6487 For vector length 8 it is {3 4 5 6 7 8 9 10}. */
6488 for (i = 0; i < nelt; i++)
6489 sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
6490 indices.new_vector (sel, 2, nelt);
6491 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6492 {
6493 if (dump_enabled_p ())
6494 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6495 "shift permutation is not supported by target\n");
6496 return false;
6497 }
6498 shift3_mask = vect_gen_perm_mask_checked (vectype, indices);
6499
6500 /* Generating permutation constant to shift all elements.
6501 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6502 for (i = 0; i < nelt; i++)
6503 sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
6504 indices.new_vector (sel, 2, nelt);
6505 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6506 {
6507 if (dump_enabled_p ())
6508 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6509 "shift permutation is not supported by target\n");
6510 return false;
6511 }
6512 shift4_mask = vect_gen_perm_mask_checked (vectype, indices);
6513
6514 for (k = 0; k < 3; k++)
6515 {
6516 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
6517 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6518 dr_chain[k], dr_chain[k],
6519 perm3_mask);
6520 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6521 vect[k] = data_ref;
6522 }
6523
6524 for (k = 0; k < 3; k++)
6525 {
6526 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
6527 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6528 vect[k % 3], vect[(k + 1) % 3],
6529 shift1_mask);
6530 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6531 vect_shift[k] = data_ref;
6532 }
6533
6534 for (k = 0; k < 3; k++)
6535 {
6536 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
6537 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6538 vect_shift[(4 - k) % 3],
6539 vect_shift[(3 - k) % 3],
6540 shift2_mask);
6541 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6542 vect[k] = data_ref;
6543 }
6544
6545 (*result_chain)[3 - (nelt % 3)] = vect[2];
6546
6547 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
6548 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
6549 vect[0], shift3_mask);
6550 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6551 (*result_chain)[nelt % 3] = data_ref;
6552
6553 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
6554 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
6555 vect[1], shift4_mask);
6556 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6557 (*result_chain)[0] = data_ref;
6558 return true;
6559 }
6560 return false;
6561 }
6562
6563 /* Function vect_transform_grouped_load.
6564
6565 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
6566 to perform their permutation and ascribe the result vectorized statements to
6567 the scalar statements.
6568 */
6569
6570 void
6571 vect_transform_grouped_load (vec_info *vinfo, stmt_vec_info stmt_info,
6572 vec<tree> dr_chain,
6573 int size, gimple_stmt_iterator *gsi)
6574 {
6575 machine_mode mode;
6576 vec<tree> result_chain = vNULL;
6577
6578 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
6579 RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
6580 vectors, that are ready for vector computation. */
6581 result_chain.create (size);
6582
6583 /* If reassociation width for vector type is 2 or greater target machine can
6584 execute 2 or more vector instructions in parallel. Otherwise try to
6585 get chain for loads group using vect_shift_permute_load_chain. */
6586 mode = TYPE_MODE (STMT_VINFO_VECTYPE (stmt_info));
6587 if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
6588 || pow2p_hwi (size)
6589 || !vect_shift_permute_load_chain (vinfo, dr_chain, size, stmt_info,
6590 gsi, &result_chain))
6591 vect_permute_load_chain (vinfo, dr_chain,
6592 size, stmt_info, gsi, &result_chain);
6593 vect_record_grouped_load_vectors (vinfo, stmt_info, result_chain);
6594 result_chain.release ();
6595 }
6596
6597 /* RESULT_CHAIN contains the output of a group of grouped loads that were
6598 generated as part of the vectorization of STMT_INFO. Assign the statement
6599 for each vector to the associated scalar statement. */
6600
6601 void
6602 vect_record_grouped_load_vectors (vec_info *, stmt_vec_info stmt_info,
6603 vec<tree> result_chain)
6604 {
6605 stmt_vec_info first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
6606 unsigned int i, gap_count;
6607 tree tmp_data_ref;
6608
6609 /* Put a permuted data-ref in the VECTORIZED_STMT field.
6610 Since we scan the chain starting from it's first node, their order
6611 corresponds the order of data-refs in RESULT_CHAIN. */
6612 stmt_vec_info next_stmt_info = first_stmt_info;
6613 gap_count = 1;
6614 FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
6615 {
6616 if (!next_stmt_info)
6617 break;
6618
6619 /* Skip the gaps. Loads created for the gaps will be removed by dead
6620 code elimination pass later. No need to check for the first stmt in
6621 the group, since it always exists.
6622 DR_GROUP_GAP is the number of steps in elements from the previous
6623 access (if there is no gap DR_GROUP_GAP is 1). We skip loads that
6624 correspond to the gaps. */
6625 if (next_stmt_info != first_stmt_info
6626 && gap_count < DR_GROUP_GAP (next_stmt_info))
6627 {
6628 gap_count++;
6629 continue;
6630 }
6631
6632 /* ??? The following needs cleanup after the removal of
6633 DR_GROUP_SAME_DR_STMT. */
6634 if (next_stmt_info)
6635 {
6636 gimple *new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
6637 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
6638 copies, and we put the new vector statement last. */
6639 STMT_VINFO_VEC_STMTS (next_stmt_info).safe_push (new_stmt);
6640
6641 next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
6642 gap_count = 1;
6643 }
6644 }
6645 }
6646
6647 /* Function vect_force_dr_alignment_p.
6648
6649 Returns whether the alignment of a DECL can be forced to be aligned
6650 on ALIGNMENT bit boundary. */
6651
6652 bool
6653 vect_can_force_dr_alignment_p (const_tree decl, poly_uint64 alignment)
6654 {
6655 if (!VAR_P (decl))
6656 return false;
6657
6658 if (decl_in_symtab_p (decl)
6659 && !symtab_node::get (decl)->can_increase_alignment_p ())
6660 return false;
6661
6662 if (TREE_STATIC (decl))
6663 return (known_le (alignment,
6664 (unsigned HOST_WIDE_INT) MAX_OFILE_ALIGNMENT));
6665 else
6666 return (known_le (alignment, (unsigned HOST_WIDE_INT) MAX_STACK_ALIGNMENT));
6667 }
6668
6669 /* Return whether the data reference DR_INFO is supported with respect to its
6670 alignment.
6671 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
6672 it is aligned, i.e., check if it is possible to vectorize it with different
6673 alignment. */
6674
6675 enum dr_alignment_support
6676 vect_supportable_dr_alignment (vec_info *vinfo, dr_vec_info *dr_info,
6677 tree vectype, int misalignment)
6678 {
6679 data_reference *dr = dr_info->dr;
6680 stmt_vec_info stmt_info = dr_info->stmt;
6681 machine_mode mode = TYPE_MODE (vectype);
6682 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
6683 class loop *vect_loop = NULL;
6684 bool nested_in_vect_loop = false;
6685
6686 if (misalignment == 0)
6687 return dr_aligned;
6688
6689 /* For now assume all conditional loads/stores support unaligned
6690 access without any special code. */
6691 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
6692 if (gimple_call_internal_p (stmt)
6693 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
6694 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
6695 return dr_unaligned_supported;
6696
6697 if (loop_vinfo)
6698 {
6699 vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
6700 nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt_info);
6701 }
6702
6703 /* Possibly unaligned access. */
6704
6705 /* We can choose between using the implicit realignment scheme (generating
6706 a misaligned_move stmt) and the explicit realignment scheme (generating
6707 aligned loads with a REALIGN_LOAD). There are two variants to the
6708 explicit realignment scheme: optimized, and unoptimized.
6709 We can optimize the realignment only if the step between consecutive
6710 vector loads is equal to the vector size. Since the vector memory
6711 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
6712 is guaranteed that the misalignment amount remains the same throughout the
6713 execution of the vectorized loop. Therefore, we can create the
6714 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
6715 at the loop preheader.
6716
6717 However, in the case of outer-loop vectorization, when vectorizing a
6718 memory access in the inner-loop nested within the LOOP that is now being
6719 vectorized, while it is guaranteed that the misalignment of the
6720 vectorized memory access will remain the same in different outer-loop
6721 iterations, it is *not* guaranteed that is will remain the same throughout
6722 the execution of the inner-loop. This is because the inner-loop advances
6723 with the original scalar step (and not in steps of VS). If the inner-loop
6724 step happens to be a multiple of VS, then the misalignment remains fixed
6725 and we can use the optimized realignment scheme. For example:
6726
6727 for (i=0; i<N; i++)
6728 for (j=0; j<M; j++)
6729 s += a[i+j];
6730
6731 When vectorizing the i-loop in the above example, the step between
6732 consecutive vector loads is 1, and so the misalignment does not remain
6733 fixed across the execution of the inner-loop, and the realignment cannot
6734 be optimized (as illustrated in the following pseudo vectorized loop):
6735
6736 for (i=0; i<N; i+=4)
6737 for (j=0; j<M; j++){
6738 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
6739 // when j is {0,1,2,3,4,5,6,7,...} respectively.
6740 // (assuming that we start from an aligned address).
6741 }
6742
6743 We therefore have to use the unoptimized realignment scheme:
6744
6745 for (i=0; i<N; i+=4)
6746 for (j=k; j<M; j+=4)
6747 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
6748 // that the misalignment of the initial address is
6749 // 0).
6750
6751 The loop can then be vectorized as follows:
6752
6753 for (k=0; k<4; k++){
6754 rt = get_realignment_token (&vp[k]);
6755 for (i=0; i<N; i+=4){
6756 v1 = vp[i+k];
6757 for (j=k; j<M; j+=4){
6758 v2 = vp[i+j+VS-1];
6759 va = REALIGN_LOAD <v1,v2,rt>;
6760 vs += va;
6761 v1 = v2;
6762 }
6763 }
6764 } */
6765
6766 if (DR_IS_READ (dr))
6767 {
6768 if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
6769 && (!targetm.vectorize.builtin_mask_for_load
6770 || targetm.vectorize.builtin_mask_for_load ()))
6771 {
6772 /* If we are doing SLP then the accesses need not have the
6773 same alignment, instead it depends on the SLP group size. */
6774 if (loop_vinfo
6775 && STMT_SLP_TYPE (stmt_info)
6776 && !multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6777 * (DR_GROUP_SIZE
6778 (DR_GROUP_FIRST_ELEMENT (stmt_info))),
6779 TYPE_VECTOR_SUBPARTS (vectype)))
6780 ;
6781 else if (!loop_vinfo
6782 || (nested_in_vect_loop
6783 && maybe_ne (TREE_INT_CST_LOW (DR_STEP (dr)),
6784 GET_MODE_SIZE (TYPE_MODE (vectype)))))
6785 return dr_explicit_realign;
6786 else
6787 return dr_explicit_realign_optimized;
6788 }
6789 }
6790
6791 bool is_packed = false;
6792 tree type = TREE_TYPE (DR_REF (dr));
6793 if (misalignment == DR_MISALIGNMENT_UNKNOWN)
6794 is_packed = not_size_aligned (DR_REF (dr));
6795 if (targetm.vectorize.support_vector_misalignment (mode, type, misalignment,
6796 is_packed))
6797 return dr_unaligned_supported;
6798
6799 /* Unsupported. */
6800 return dr_unaligned_unsupported;
6801 }