1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2 Copyright (C) 2003-2018 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "predict.h"
31 #include "memmodel.h"
32 #include "tm_p.h"
33 #include "ssa.h"
34 #include "optabs-tree.h"
35 #include "cgraph.h"
36 #include "dumpfile.h"
37 #include "alias.h"
38 #include "fold-const.h"
39 #include "stor-layout.h"
40 #include "tree-eh.h"
41 #include "gimplify.h"
42 #include "gimple-iterator.h"
43 #include "gimplify-me.h"
44 #include "tree-ssa-loop-ivopts.h"
45 #include "tree-ssa-loop-manip.h"
46 #include "tree-ssa-loop.h"
47 #include "cfgloop.h"
48 #include "tree-scalar-evolution.h"
49 #include "tree-vectorizer.h"
50 #include "expr.h"
51 #include "builtins.h"
52 #include "params.h"
53 #include "tree-cfg.h"
54 #include "tree-hash-traits.h"
55 #include "vec-perm-indices.h"
56
57 /* Return true if load- or store-lanes optab OPTAB is implemented for
58 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
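/* For example, on a target whose vectors use 128-bit V4SImode and with
   COUNT == 2, the two vectors together need a 256-bit integer array mode;
   if no such integer mode exists, or OPTAB has no handler for that
   (array mode, vector mode) pair, load/store-lanes cannot be used.  */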
59
60 static bool
61 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
62 tree vectype, unsigned HOST_WIDE_INT count)
63 {
64 machine_mode mode;
65 scalar_int_mode array_mode;
66 bool limit_p;
67
68 mode = TYPE_MODE (vectype);
69 limit_p = !targetm.array_mode_supported_p (mode, count);
70 if (!int_mode_for_size (count * GET_MODE_BITSIZE (mode),
71 limit_p).exists (&array_mode))
72 {
73 if (dump_enabled_p ())
74 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
75 "no array mode for %s[" HOST_WIDE_INT_PRINT_DEC "]\n",
76 GET_MODE_NAME (mode), count);
77 return false;
78 }
79
80 if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
81 {
82 if (dump_enabled_p ())
83 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
84 "cannot use %s<%s><%s>\n", name,
85 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
86 return false;
87 }
88
89 if (dump_enabled_p ())
90 dump_printf_loc (MSG_NOTE, vect_location,
91 "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
92 GET_MODE_NAME (mode));
93
94 return true;
95 }
96
97
98 /* Return the smallest scalar part of STMT.
99 This is used to determine the vectype of the stmt. We generally set the
100 vectype according to the type of the result (lhs). For stmts whose
101 result-type is different than the type of the arguments (e.g., demotion,
102 promotion), vectype will be reset appropriately (later). Note that we have
103 to visit the smallest datatype in this function, because that determines the
104 VF. If the smallest datatype in the loop is present only as the rhs of a
105    promotion operation, we'd miss it.
106 Such a case, where a variable of this datatype does not appear in the lhs
107 anywhere in the loop, can only occur if it's an invariant: e.g.:
108 'int_x = (int) short_inv', which we'd expect to have been optimized away by
109 invariant motion. However, we cannot rely on invariant motion to always
110 take invariants out of the loop, and so in the case of promotion we also
111 have to check the rhs.
112 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
113 types. */
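/* For example, for a widening statement "int_x = (int) short_y" the result
   type is int (4 bytes) but the smallest scalar type involved is short
   (2 bytes), so we return the short type and set *LHS_SIZE_UNIT to 4 and
   *RHS_SIZE_UNIT to 2.  */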
114
115 tree
116 vect_get_smallest_scalar_type (gimple *stmt, HOST_WIDE_INT *lhs_size_unit,
117 HOST_WIDE_INT *rhs_size_unit)
118 {
119 tree scalar_type = gimple_expr_type (stmt);
120 HOST_WIDE_INT lhs, rhs;
121
122 /* During the analysis phase, this function is called on arbitrary
123 statements that might not have scalar results. */
124 if (!tree_fits_uhwi_p (TYPE_SIZE_UNIT (scalar_type)))
125 return scalar_type;
126
127 lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
128
129 if (is_gimple_assign (stmt)
130 && (gimple_assign_cast_p (stmt)
131 || gimple_assign_rhs_code (stmt) == WIDEN_MULT_EXPR
132 || gimple_assign_rhs_code (stmt) == WIDEN_LSHIFT_EXPR
133 || gimple_assign_rhs_code (stmt) == FLOAT_EXPR))
134 {
135 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
136
137 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
138 if (rhs < lhs)
139 scalar_type = rhs_type;
140 }
141
142 *lhs_size_unit = lhs;
143 *rhs_size_unit = rhs;
144 return scalar_type;
145 }
146
147
148 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
149 tested at run-time. Return TRUE if DDR was successfully inserted.
150 Return false if versioning is not supported. */
151
152 static bool
153 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
154 {
155 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
156
157 if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
158 return false;
159
160 if (!runtime_alias_check_p (ddr, loop,
161 optimize_loop_nest_for_speed_p (loop)))
162 return false;
163
164 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
165 return true;
166 }
167
168
169 /* A subroutine of vect_analyze_data_ref_dependence. Handle
170 DDR_COULD_BE_INDEPENDENT_P ddr DDR that has a known set of dependence
171 distances. These distances are conservatively correct but they don't
172 reflect a guaranteed dependence.
173
174 Return true if this function does all the work necessary to avoid
175 an alias or false if the caller should use the dependence distances
176 to limit the vectorization factor in the usual way. LOOP_DEPTH is
177 the depth of the loop described by LOOP_VINFO and the other arguments
178 are as for vect_analyze_data_ref_dependence. */
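/* For instance, with "#pragma omp simd safelen(8)" and a conservative
   distance of 4, the loop below assumes independence for that distance
   but clamps *MAX_VF to at most 8; for known distances not covered by a
   safelen guarantee it prefers a runtime alias check over limiting
   *MAX_VF.  */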
179
180 static bool
181 vect_analyze_possibly_independent_ddr (data_dependence_relation *ddr,
182 loop_vec_info loop_vinfo,
183 int loop_depth, unsigned int *max_vf)
184 {
185 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
186 lambda_vector dist_v;
187 unsigned int i;
188 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
189 {
190 int dist = dist_v[loop_depth];
191 if (dist != 0 && !(dist > 0 && DDR_REVERSED_P (ddr)))
192 {
193 /* If the user asserted safelen >= DIST consecutive iterations
194 can be executed concurrently, assume independence.
195
196 ??? An alternative would be to add the alias check even
197 in this case, and vectorize the fallback loop with the
198 maximum VF set to safelen. However, if the user has
199 explicitly given a length, it's less likely that that
200 would be a win. */
201 if (loop->safelen >= 2 && abs_hwi (dist) <= loop->safelen)
202 {
203 if ((unsigned int) loop->safelen < *max_vf)
204 *max_vf = loop->safelen;
205 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
206 continue;
207 }
208
209 /* For dependence distances of 2 or more, we have the option
210 of limiting VF or checking for an alias at runtime.
211 Prefer to check at runtime if we can, to avoid limiting
212 the VF unnecessarily when the bases are in fact independent.
213
214 Note that the alias checks will be removed if the VF ends up
215 being small enough. */
216 return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
217 }
218 }
219 return true;
220 }
221
222
223 /* Function vect_analyze_data_ref_dependence.
224
225    Return TRUE if there (might) exist a dependence between a memory-reference
226    DRA and a memory-reference DRB.  If the dependence can instead be checked
227    at run time by versioning for alias, return FALSE.  Adjust *MAX_VF
228    according to the data dependence.  */
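/* For example, "for (i = 0; i < n; i++) a[i + 3] = a[i] + 1;" has a known
   dependence distance of 3, so the analysis below clamps *MAX_VF to 3
   instead of reporting a dependence.  */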
229
230 static bool
231 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
232 loop_vec_info loop_vinfo,
233 unsigned int *max_vf)
234 {
235 unsigned int i;
236 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
237 struct data_reference *dra = DDR_A (ddr);
238 struct data_reference *drb = DDR_B (ddr);
239 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
240 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
241 lambda_vector dist_v;
242 unsigned int loop_depth;
243
244 /* In loop analysis all data references should be vectorizable. */
245 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
246 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
247 gcc_unreachable ();
248
249 /* Independent data accesses. */
250 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
251 return false;
252
253 if (dra == drb
254 || (DR_IS_READ (dra) && DR_IS_READ (drb)))
255 return false;
256
257 /* We do not have to consider dependences between accesses that belong
258 to the same group. */
259 if (GROUP_FIRST_ELEMENT (stmtinfo_a)
260 && GROUP_FIRST_ELEMENT (stmtinfo_a) == GROUP_FIRST_ELEMENT (stmtinfo_b))
261 return false;
262
263 /* Even if we have an anti-dependence then, as the vectorized loop covers at
264 least two scalar iterations, there is always also a true dependence.
265 As the vectorizer does not re-order loads and stores we can ignore
266 the anti-dependence if TBAA can disambiguate both DRs similar to the
267 case with known negative distance anti-dependences (positive
268 distance anti-dependences would violate TBAA constraints). */
269 if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
270 || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
271 && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
272 get_alias_set (DR_REF (drb))))
273 return false;
274
275 /* Unknown data dependence. */
276 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
277 {
278 /* If user asserted safelen consecutive iterations can be
279 executed concurrently, assume independence. */
280 if (loop->safelen >= 2)
281 {
282 if ((unsigned int) loop->safelen < *max_vf)
283 *max_vf = loop->safelen;
284 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
285 return false;
286 }
287
288 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
289 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
290 {
291 if (dump_enabled_p ())
292 {
293 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
294 "versioning for alias not supported for: "
295 "can't determine dependence between ");
296 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
297 DR_REF (dra));
298 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
299 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
300 DR_REF (drb));
301 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
302 }
303 return true;
304 }
305
306 if (dump_enabled_p ())
307 {
308 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
309 "versioning for alias required: "
310 "can't determine dependence between ");
311 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
312 DR_REF (dra));
313 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
314 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
315 DR_REF (drb));
316 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
317 }
318
319 /* Add to list of ddrs that need to be tested at run-time. */
320 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
321 }
322
323 /* Known data dependence. */
324 if (DDR_NUM_DIST_VECTS (ddr) == 0)
325 {
326 /* If user asserted safelen consecutive iterations can be
327 executed concurrently, assume independence. */
328 if (loop->safelen >= 2)
329 {
330 if ((unsigned int) loop->safelen < *max_vf)
331 *max_vf = loop->safelen;
332 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
333 return false;
334 }
335
336 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
337 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
338 {
339 if (dump_enabled_p ())
340 {
341 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
342 "versioning for alias not supported for: "
343 "bad dist vector for ");
344 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
345 DR_REF (dra));
346 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
347 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
348 DR_REF (drb));
349 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
350 }
351 return true;
352 }
353
354 if (dump_enabled_p ())
355 {
356 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
357 "versioning for alias required: "
358 "bad dist vector for ");
359 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
360 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
361 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
362 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
363 }
364 /* Add to list of ddrs that need to be tested at run-time. */
365 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
366 }
367
368 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
369
370 if (DDR_COULD_BE_INDEPENDENT_P (ddr)
371 && vect_analyze_possibly_independent_ddr (ddr, loop_vinfo,
372 loop_depth, max_vf))
373 return false;
374
375 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
376 {
377 int dist = dist_v[loop_depth];
378
379 if (dump_enabled_p ())
380 dump_printf_loc (MSG_NOTE, vect_location,
381 "dependence distance = %d.\n", dist);
382
383 if (dist == 0)
384 {
385 if (dump_enabled_p ())
386 {
387 dump_printf_loc (MSG_NOTE, vect_location,
388 "dependence distance == 0 between ");
389 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
390 dump_printf (MSG_NOTE, " and ");
391 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
392 	      dump_printf (MSG_NOTE, "\n");
393 }
394
395 /* When we perform grouped accesses and perform implicit CSE
396 by detecting equal accesses and doing disambiguation with
397 runtime alias tests like for
398 .. = a[i];
399 .. = a[i+1];
400 a[i] = ..;
401 a[i+1] = ..;
402 *p = ..;
403 .. = a[i];
404 .. = a[i+1];
405 where we will end up loading { a[i], a[i+1] } once, make
406 sure that inserting group loads before the first load and
407 stores after the last store will do the right thing.
408 Similar for groups like
409 a[i] = ...;
410 ... = a[i];
411 a[i+1] = ...;
412 where loads from the group interleave with the store. */
413 if (STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
414 || STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
415 {
416 gimple *earlier_stmt;
417 earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
418 if (DR_IS_WRITE
419 (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
420 {
421 if (dump_enabled_p ())
422 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
423 "READ_WRITE dependence in interleaving."
424 "\n");
425 return true;
426 }
427 }
428
429 continue;
430 }
431
432 if (dist > 0 && DDR_REVERSED_P (ddr))
433 {
434 /* If DDR_REVERSED_P the order of the data-refs in DDR was
435 reversed (to make distance vector positive), and the actual
436 distance is negative. */
437 if (dump_enabled_p ())
438 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
439 "dependence distance negative.\n");
440 /* Record a negative dependence distance to later limit the
441 amount of stmt copying / unrolling we can perform.
442 Only need to handle read-after-write dependence. */
443 if (DR_IS_READ (drb)
444 && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
445 || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
446 STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
447 continue;
448 }
449
450 unsigned int abs_dist = abs (dist);
451 if (abs_dist >= 2 && abs_dist < *max_vf)
452 {
453 /* The dependence distance requires reduction of the maximal
454 vectorization factor. */
455 *max_vf = abs (dist);
456 if (dump_enabled_p ())
457 dump_printf_loc (MSG_NOTE, vect_location,
458 "adjusting maximal vectorization factor to %i\n",
459 *max_vf);
460 }
461
462 if (abs_dist >= *max_vf)
463 {
464 /* Dependence distance does not create dependence, as far as
465 vectorization is concerned, in this case. */
466 if (dump_enabled_p ())
467 dump_printf_loc (MSG_NOTE, vect_location,
468 "dependence distance >= VF.\n");
469 continue;
470 }
471
472 if (dump_enabled_p ())
473 {
474 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
475 "not vectorized, possible dependence "
476 "between data-refs ");
477 	  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
478 	  dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
479 	  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
480 	  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
481 }
482
483 return true;
484 }
485
486 return false;
487 }
488
489 /* Function vect_analyze_data_ref_dependences.
490
491 Examine all the data references in the loop, and make sure there do not
492 exist any data dependences between them. Set *MAX_VF according to
493 the maximum vectorization factor the data dependences allow. */
494
495 bool
496 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo,
497 unsigned int *max_vf)
498 {
499 unsigned int i;
500 struct data_dependence_relation *ddr;
501
502 if (dump_enabled_p ())
503 dump_printf_loc (MSG_NOTE, vect_location,
504 "=== vect_analyze_data_ref_dependences ===\n");
505
506 LOOP_VINFO_DDRS (loop_vinfo)
507 .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
508 * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
509 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
510 /* We need read-read dependences to compute STMT_VINFO_SAME_ALIGN_REFS. */
511 if (!compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
512 &LOOP_VINFO_DDRS (loop_vinfo),
513 LOOP_VINFO_LOOP_NEST (loop_vinfo), true))
514 return false;
515
516 /* For epilogues we either have no aliases or alias versioning
517 was applied to original loop. Therefore we may just get max_vf
518 using VF of original loop. */
519 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
520 *max_vf = LOOP_VINFO_ORIG_MAX_VECT_FACTOR (loop_vinfo);
521 else
522 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
523 if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf))
524 return false;
525
526 return true;
527 }
528
529
530 /* Function vect_slp_analyze_data_ref_dependence.
531
532 Return TRUE if there (might) exist a dependence between a memory-reference
533    DRA and a memory-reference DRB in DDR.  Return FALSE if the accesses
534    are known to be independent (e.g. both are reads, or they belong to
535    the same interleaving chain).  */
536
537 static bool
538 vect_slp_analyze_data_ref_dependence (struct data_dependence_relation *ddr)
539 {
540 struct data_reference *dra = DDR_A (ddr);
541 struct data_reference *drb = DDR_B (ddr);
542
543 /* We need to check dependences of statements marked as unvectorizable
544 as well, they still can prohibit vectorization. */
545
546 /* Independent data accesses. */
547 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
548 return false;
549
550 if (dra == drb)
551 return false;
552
553 /* Read-read is OK. */
554 if (DR_IS_READ (dra) && DR_IS_READ (drb))
555 return false;
556
557 /* If dra and drb are part of the same interleaving chain consider
558 them independent. */
559 if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (DR_STMT (dra)))
560 && (GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dra)))
561 == GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (drb)))))
562 return false;
563
564 /* Unknown data dependence. */
565 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
566 {
567 if (dump_enabled_p ())
568 {
569 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
570 "can't determine dependence between ");
571 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
572 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
573 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
574 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
575 }
576 }
577 else if (dump_enabled_p ())
578 {
579 dump_printf_loc (MSG_NOTE, vect_location,
580 "determined dependence between ");
581 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
582 dump_printf (MSG_NOTE, " and ");
583 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
584 dump_printf (MSG_NOTE, "\n");
585 }
586
587 return true;
588 }
589
590
591 /* Analyze dependences involved in the transform of SLP NODE.  STORES
592    contains the scalar stores of this instance and LAST_STORE their last
593    statement, used when disambiguating the loads.  */
594
595 static bool
596 vect_slp_analyze_node_dependences (slp_instance instance, slp_tree node,
597 vec<gimple *> stores, gimple *last_store)
598 {
599 /* This walks over all stmts involved in the SLP load/store done
600 in NODE verifying we can sink them up to the last stmt in the
601 group. */
602 gimple *last_access = vect_find_last_scalar_stmt_in_slp (node);
603 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
604 {
605 gimple *access = SLP_TREE_SCALAR_STMTS (node)[k];
606 if (access == last_access)
607 continue;
608 data_reference *dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (access));
609 for (gimple_stmt_iterator gsi = gsi_for_stmt (access);
610 gsi_stmt (gsi) != last_access; gsi_next (&gsi))
611 {
612 gimple *stmt = gsi_stmt (gsi);
613 if (! gimple_vuse (stmt)
614 || (DR_IS_READ (dr_a) && ! gimple_vdef (stmt)))
615 continue;
616
617 /* If we couldn't record a (single) data reference for this
618 stmt we have to give up. */
619 /* ??? Here and below if dependence analysis fails we can resort
620 to the alias oracle which can handle more kinds of stmts. */
621 data_reference *dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
622 if (!dr_b)
623 return false;
624
625 bool dependent = false;
626 /* If we run into a store of this same instance (we've just
627 marked those) then delay dependence checking until we run
628 into the last store because this is where it will have
629 been sunk to (and we verify if we can do that as well). */
630 if (gimple_visited_p (stmt))
631 {
632 if (stmt != last_store)
633 continue;
634 unsigned i;
635 gimple *store;
636 FOR_EACH_VEC_ELT (stores, i, store)
637 {
638 data_reference *store_dr
639 = STMT_VINFO_DATA_REF (vinfo_for_stmt (store));
640 ddr_p ddr = initialize_data_dependence_relation
641 (dr_a, store_dr, vNULL);
642 dependent = vect_slp_analyze_data_ref_dependence (ddr);
643 free_dependence_relation (ddr);
644 if (dependent)
645 break;
646 }
647 }
648 else
649 {
650 ddr_p ddr = initialize_data_dependence_relation (dr_a,
651 dr_b, vNULL);
652 dependent = vect_slp_analyze_data_ref_dependence (ddr);
653 free_dependence_relation (ddr);
654 }
655 if (dependent)
656 return false;
657 }
658 }
659 return true;
660 }
661
662
663 /* Function vect_slp_analyze_instance_dependence.
664 
665    Examine all the data references in the SLP instance INSTANCE and make
666    sure there are no data dependences that would prevent sinking its loads
667    and stores to the vectorized statement insert locations.  */
668
669 bool
670 vect_slp_analyze_instance_dependence (slp_instance instance)
671 {
672 if (dump_enabled_p ())
673 dump_printf_loc (MSG_NOTE, vect_location,
674 "=== vect_slp_analyze_instance_dependence ===\n");
675
676 /* The stores of this instance are at the root of the SLP tree. */
677 slp_tree store = SLP_INSTANCE_TREE (instance);
678 if (! STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (store)[0])))
679 store = NULL;
680
681 /* Verify we can sink stores to the vectorized stmt insert location. */
682 gimple *last_store = NULL;
683 if (store)
684 {
685 if (! vect_slp_analyze_node_dependences (instance, store, vNULL, NULL))
686 return false;
687
688 /* Mark stores in this instance and remember the last one. */
689 last_store = vect_find_last_scalar_stmt_in_slp (store);
690 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
691 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], true);
692 }
693
694 bool res = true;
695
696 /* Verify we can sink loads to the vectorized stmt insert location,
697 special-casing stores of this instance. */
698 slp_tree load;
699 unsigned int i;
700 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load)
701 if (! vect_slp_analyze_node_dependences (instance, load,
702 store
703 ? SLP_TREE_SCALAR_STMTS (store)
704 : vNULL, last_store))
705 {
706 res = false;
707 break;
708 }
709
710 /* Unset the visited flag. */
711 if (store)
712 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
713 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], false);
714
715 return res;
716 }
717
718 /* Record in VINFO the base alignment guarantee given by DRB. STMT is
719 the statement that contains DRB, which is useful for recording in the
720 dump file. */
721
722 static void
723 vect_record_base_alignment (vec_info *vinfo, gimple *stmt,
724 innermost_loop_behavior *drb)
725 {
726 bool existed;
727 innermost_loop_behavior *&entry
728 = vinfo->base_alignments.get_or_insert (drb->base_address, &existed);
729 if (!existed || entry->base_alignment < drb->base_alignment)
730 {
731 entry = drb;
732 if (dump_enabled_p ())
733 {
734 dump_printf_loc (MSG_NOTE, vect_location,
735 "recording new base alignment for ");
736 dump_generic_expr (MSG_NOTE, TDF_SLIM, drb->base_address);
737 dump_printf (MSG_NOTE, "\n");
738 dump_printf_loc (MSG_NOTE, vect_location,
739 " alignment: %d\n", drb->base_alignment);
740 dump_printf_loc (MSG_NOTE, vect_location,
741 " misalignment: %d\n", drb->base_misalignment);
742 dump_printf_loc (MSG_NOTE, vect_location,
743 " based on: ");
744 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
745 }
746 }
747 }
748
749 /* If the region we're going to vectorize is reached, all unconditional
750 data references occur at least once. We can therefore pool the base
751 alignment guarantees from each unconditional reference. Do this by
752 going through all the data references in VINFO and checking whether
753 the containing statement makes the reference unconditionally. If so,
754 record the alignment of the base address in VINFO so that it can be
755 used for all other references with the same base. */
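/* For example, if the loop unconditionally dereferences a pointer P that is
   known to be 32-byte aligned, another reference based on P (even one that
   is only conditionally executed) can reuse that 32-byte guarantee.  */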
756
757 void
758 vect_record_base_alignments (vec_info *vinfo)
759 {
760 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
761 struct loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
762 data_reference *dr;
763 unsigned int i;
764 FOR_EACH_VEC_ELT (vinfo->datarefs, i, dr)
765 if (!DR_IS_CONDITIONAL_IN_STMT (dr))
766 {
767 gimple *stmt = DR_STMT (dr);
768 vect_record_base_alignment (vinfo, stmt, &DR_INNERMOST (dr));
769
770 /* If DR is nested in the loop that is being vectorized, we can also
771 record the alignment of the base wrt the outer loop. */
772 if (loop && nested_in_vect_loop_p (loop, stmt))
773 {
774 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
775 vect_record_base_alignment
776 (vinfo, stmt, &STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info));
777 }
778 }
779 }
780
781 /* Return the target alignment for the vectorized form of DR. */
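/* Typically this is the natural alignment of the vector type (e.g. 128 bits
   for a 16-byte vector), but the targetm.vectorize.preferred_vector_alignment
   hook lets targets request a different value.  */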
782
783 static unsigned int
784 vect_calculate_target_alignment (struct data_reference *dr)
785 {
786 gimple *stmt = DR_STMT (dr);
787 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
788 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
789 return targetm.vectorize.preferred_vector_alignment (vectype);
790 }
791
792 /* Function vect_compute_data_ref_alignment
793
794 Compute the misalignment of the data reference DR.
795
796 Output:
797 1. If during the misalignment computation it is found that the data reference
798 cannot be vectorized then false is returned.
799 2. DR_MISALIGNMENT (DR) is defined.
800
801 FOR NOW: No analysis is actually performed. Misalignment is calculated
802 only for trivial cases. TODO. */
803
804 bool
805 vect_compute_data_ref_alignment (struct data_reference *dr)
806 {
807 gimple *stmt = DR_STMT (dr);
808 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
809 vec_base_alignments *base_alignments = &stmt_info->vinfo->base_alignments;
810 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
811 struct loop *loop = NULL;
812 tree ref = DR_REF (dr);
813 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
814
815 if (dump_enabled_p ())
816 dump_printf_loc (MSG_NOTE, vect_location,
817 "vect_compute_data_ref_alignment:\n");
818
819 if (loop_vinfo)
820 loop = LOOP_VINFO_LOOP (loop_vinfo);
821
822 /* Initialize misalignment to unknown. */
823 SET_DR_MISALIGNMENT (dr, DR_MISALIGNMENT_UNKNOWN);
824
825 innermost_loop_behavior *drb = vect_dr_behavior (dr);
826 bool step_preserves_misalignment_p;
827
828 unsigned HOST_WIDE_INT vector_alignment
829 = vect_calculate_target_alignment (dr) / BITS_PER_UNIT;
830 DR_TARGET_ALIGNMENT (dr) = vector_alignment;
831
832 /* No step for BB vectorization. */
833 if (!loop)
834 {
835 gcc_assert (integer_zerop (drb->step));
836 step_preserves_misalignment_p = true;
837 }
838
839 /* In case the dataref is in an inner-loop of the loop that is being
840 vectorized (LOOP), we use the base and misalignment information
841 relative to the outer-loop (LOOP). This is ok only if the misalignment
842 stays the same throughout the execution of the inner-loop, which is why
843    we have to check that the stride of the dataref in the inner-loop is a
844    multiple of the vector alignment.  */
845 else if (nested_in_vect_loop_p (loop, stmt))
846 {
847 step_preserves_misalignment_p
848 = (DR_STEP_ALIGNMENT (dr) % vector_alignment) == 0;
849
850 if (dump_enabled_p ())
851 {
852 if (step_preserves_misalignment_p)
853 dump_printf_loc (MSG_NOTE, vect_location,
854 "inner step divides the vector alignment.\n");
855 else
856 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
857 "inner step doesn't divide the vector"
858 " alignment.\n");
859 }
860 }
861
862 /* Similarly we can only use base and misalignment information relative to
863 an innermost loop if the misalignment stays the same throughout the
864    execution of the loop.  As above, this is the case if the stride of the
865    dataref times the vectorization factor is a multiple of the alignment.  */
866 else
867 {
868 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
869 step_preserves_misalignment_p
870 = multiple_p (DR_STEP_ALIGNMENT (dr) * vf, vector_alignment);
871
872 if (!step_preserves_misalignment_p && dump_enabled_p ())
873 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
874 "step doesn't divide the vector alignment.\n");
875 }
876
877 unsigned int base_alignment = drb->base_alignment;
878 unsigned int base_misalignment = drb->base_misalignment;
879
880 /* Calculate the maximum of the pooled base address alignment and the
881 alignment that we can compute for DR itself. */
882 innermost_loop_behavior **entry = base_alignments->get (drb->base_address);
883 if (entry && base_alignment < (*entry)->base_alignment)
884 {
885 base_alignment = (*entry)->base_alignment;
886 base_misalignment = (*entry)->base_misalignment;
887 }
888
889 if (drb->offset_alignment < vector_alignment
890 || !step_preserves_misalignment_p
891 /* We need to know whether the step wrt the vectorized loop is
892 negative when computing the starting misalignment below. */
893 || TREE_CODE (drb->step) != INTEGER_CST)
894 {
895 if (dump_enabled_p ())
896 {
897 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
898 "Unknown alignment for access: ");
899 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
900 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
901 }
902 return true;
903 }
904
905 if (base_alignment < vector_alignment)
906 {
907 tree base = drb->base_address;
908 if (TREE_CODE (base) == ADDR_EXPR)
909 base = TREE_OPERAND (base, 0);
910 if (!vect_can_force_dr_alignment_p (base,
911 vector_alignment * BITS_PER_UNIT))
912 {
913 if (dump_enabled_p ())
914 {
915 dump_printf_loc (MSG_NOTE, vect_location,
916 "can't force alignment of ref: ");
917 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
918 dump_printf (MSG_NOTE, "\n");
919 }
920 return true;
921 }
922
923 if (DECL_USER_ALIGN (base))
924 {
925 if (dump_enabled_p ())
926 {
927 dump_printf_loc (MSG_NOTE, vect_location,
928 "not forcing alignment of user-aligned "
929 "variable: ");
930 dump_generic_expr (MSG_NOTE, TDF_SLIM, base);
931 dump_printf (MSG_NOTE, "\n");
932 }
933 return true;
934 }
935
936 /* Force the alignment of the decl.
937 NOTE: This is the only change to the code we make during
938 the analysis phase, before deciding to vectorize the loop. */
939 if (dump_enabled_p ())
940 {
941 dump_printf_loc (MSG_NOTE, vect_location, "force alignment of ");
942 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
943 dump_printf (MSG_NOTE, "\n");
944 }
945
946 DR_VECT_AUX (dr)->base_decl = base;
947 DR_VECT_AUX (dr)->base_misaligned = true;
948 base_misalignment = 0;
949 }
950 poly_int64 misalignment
951 = base_misalignment + wi::to_poly_offset (drb->init).force_shwi ();
952
953   /* If this is a backward running DR then the first access in the
954      vector type is actually N-1 elements before the address in the DR.
955      Adjust the misalignment accordingly.  */
956 if (tree_int_cst_sgn (drb->step) < 0)
957 /* PLUS because STEP is negative. */
958 misalignment += ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
959 * TREE_INT_CST_LOW (drb->step));
960
961 unsigned int const_misalignment;
962 if (!known_misalignment (misalignment, vector_alignment,
963 &const_misalignment))
964 {
965 if (dump_enabled_p ())
966 {
967 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
968 "Non-constant misalignment for access: ");
969 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
970 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
971 }
972 return true;
973 }
974
975 SET_DR_MISALIGNMENT (dr, const_misalignment);
976
977 if (dump_enabled_p ())
978 {
979 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
980 "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr));
981 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
982 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
983 }
984
985 return true;
986 }
987
988 /* Function vect_update_misalignment_for_peel.
989 Sets DR's misalignment
990 - to 0 if it has the same alignment as DR_PEEL,
991    - to the misalignment computed using NPEEL if DR's alignment is known,
992 - to -1 (unknown) otherwise.
993
994 DR - the data reference whose misalignment is to be adjusted.
995 DR_PEEL - the data reference whose misalignment is being made
996 zero in the vector loop by the peel.
997 NPEEL - the number of iterations in the peel loop if the misalignment
998 of DR_PEEL is known at compile time. */
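/* For example, with a forward access of 4-byte elements, DR_TARGET_ALIGNMENT
   16 and DR_MISALIGNMENT (DR) == 8, peeling NPEEL == 2 iterations gives
   (8 + 2 * 4) & 15 == 0, so DR becomes aligned in the vector loop.  */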
999
1000 static void
1001 vect_update_misalignment_for_peel (struct data_reference *dr,
1002 struct data_reference *dr_peel, int npeel)
1003 {
1004 unsigned int i;
1005 vec<dr_p> same_aligned_drs;
1006 struct data_reference *current_dr;
1007 int dr_size = vect_get_scalar_dr_size (dr);
1008 int dr_peel_size = vect_get_scalar_dr_size (dr_peel);
1009 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
1010 stmt_vec_info peel_stmt_info = vinfo_for_stmt (DR_STMT (dr_peel));
1011
1012 /* For interleaved data accesses the step in the loop must be multiplied by
1013 the size of the interleaving group. */
1014 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1015 dr_size *= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)));
1016 if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info))
1017 dr_peel_size *= GROUP_SIZE (peel_stmt_info);
1018
1019 /* It can be assumed that the data refs with the same alignment as dr_peel
1020 are aligned in the vector loop. */
1021 same_aligned_drs
1022 = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (DR_STMT (dr_peel)));
1023 FOR_EACH_VEC_ELT (same_aligned_drs, i, current_dr)
1024 {
1025 if (current_dr != dr)
1026 continue;
1027 gcc_assert (!known_alignment_for_access_p (dr)
1028 || !known_alignment_for_access_p (dr_peel)
1029 || (DR_MISALIGNMENT (dr) / dr_size
1030 == DR_MISALIGNMENT (dr_peel) / dr_peel_size));
1031 SET_DR_MISALIGNMENT (dr, 0);
1032 return;
1033 }
1034
1035 if (known_alignment_for_access_p (dr)
1036 && known_alignment_for_access_p (dr_peel))
1037 {
1038 bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
1039 int misal = DR_MISALIGNMENT (dr);
1040 misal += negative ? -npeel * dr_size : npeel * dr_size;
1041 misal &= DR_TARGET_ALIGNMENT (dr) - 1;
1042 SET_DR_MISALIGNMENT (dr, misal);
1043 return;
1044 }
1045
1046 if (dump_enabled_p ())
1047 dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment " \
1048 "to unknown (-1).\n");
1049 SET_DR_MISALIGNMENT (dr, DR_MISALIGNMENT_UNKNOWN);
1050 }
1051
1052
1053 /* Function verify_data_ref_alignment
1054
1055 Return TRUE if DR can be handled with respect to alignment. */
1056
1057 static bool
1058 verify_data_ref_alignment (data_reference_p dr)
1059 {
1060 enum dr_alignment_support supportable_dr_alignment
1061 = vect_supportable_dr_alignment (dr, false);
1062 if (!supportable_dr_alignment)
1063 {
1064 if (dump_enabled_p ())
1065 {
1066 if (DR_IS_READ (dr))
1067 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1068 "not vectorized: unsupported unaligned load.");
1069 else
1070 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1071 "not vectorized: unsupported unaligned "
1072 "store.");
1073
1074 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
1075 DR_REF (dr));
1076 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1077 }
1078 return false;
1079 }
1080
1081 if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
1082 dump_printf_loc (MSG_NOTE, vect_location,
1083 "Vectorizing an unaligned access.\n");
1084
1085 return true;
1086 }
1087
1088 /* Function vect_verify_datarefs_alignment
1089
1090 Return TRUE if all data references in the loop can be
1091 handled with respect to alignment. */
1092
1093 bool
1094 vect_verify_datarefs_alignment (loop_vec_info vinfo)
1095 {
1096 vec<data_reference_p> datarefs = vinfo->datarefs;
1097 struct data_reference *dr;
1098 unsigned int i;
1099
1100 FOR_EACH_VEC_ELT (datarefs, i, dr)
1101 {
1102 gimple *stmt = DR_STMT (dr);
1103 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1104
1105 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1106 continue;
1107
1108 /* For interleaving, only the alignment of the first access matters. */
1109 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1110 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1111 continue;
1112
1113 /* Strided accesses perform only component accesses, alignment is
1114 irrelevant for them. */
1115 if (STMT_VINFO_STRIDED_P (stmt_info)
1116 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1117 continue;
1118
1119 if (! verify_data_ref_alignment (dr))
1120 return false;
1121 }
1122
1123 return true;
1124 }
1125
1126 /* Given a memory reference EXP, return whether its alignment is less
1127 than its size. */
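/* For example, a 4-byte int field of a packed structure may only be
   guaranteed 1-byte alignment, in which case it is not size-aligned.  */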
1128
1129 static bool
1130 not_size_aligned (tree exp)
1131 {
1132 if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
1133 return true;
1134
1135 return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
1136 > get_object_alignment (exp));
1137 }
1138
1139 /* Function vector_alignment_reachable_p
1140
1141 Return true if vector alignment for DR is reachable by peeling
1142 a few loop iterations. Return false otherwise. */
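/* For example, for a group of 2 interleaved accesses in a 4-element vector
   with a misalignment of 1 element, peeling would have to skip 3 elements,
   which is not a multiple of the group size, so alignment is not
   reachable.  */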
1143
1144 static bool
1145 vector_alignment_reachable_p (struct data_reference *dr)
1146 {
1147 gimple *stmt = DR_STMT (dr);
1148 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1149 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1150
1151 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1152 {
1153       /* For interleaved accesses we peel only if the number of iterations
1154          in the prolog loop ({VF - misalignment}) is a multiple of the
1155          number of interleaved accesses.  */
1156 int elem_size, mis_in_elements;
1157
1158 /* FORNOW: handle only known alignment. */
1159 if (!known_alignment_for_access_p (dr))
1160 return false;
1161
1162 poly_uint64 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1163 poly_uint64 vector_size = GET_MODE_SIZE (TYPE_MODE (vectype));
1164 elem_size = vector_element_size (vector_size, nelements);
1165 mis_in_elements = DR_MISALIGNMENT (dr) / elem_size;
1166
1167 if (!multiple_p (nelements - mis_in_elements, GROUP_SIZE (stmt_info)))
1168 return false;
1169 }
1170
1171   /* If the misalignment is known at compile time then allow peeling
1172 only if natural alignment is reachable through peeling. */
1173 if (known_alignment_for_access_p (dr) && !aligned_access_p (dr))
1174 {
1175 HOST_WIDE_INT elmsize =
1176 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1177 if (dump_enabled_p ())
1178 {
1179 dump_printf_loc (MSG_NOTE, vect_location,
1180 "data size =" HOST_WIDE_INT_PRINT_DEC, elmsize);
1181 dump_printf (MSG_NOTE,
1182 ". misalignment = %d.\n", DR_MISALIGNMENT (dr));
1183 }
1184 if (DR_MISALIGNMENT (dr) % elmsize)
1185 {
1186 if (dump_enabled_p ())
1187 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1188 "data size does not divide the misalignment.\n");
1189 return false;
1190 }
1191 }
1192
1193 if (!known_alignment_for_access_p (dr))
1194 {
1195 tree type = TREE_TYPE (DR_REF (dr));
1196 bool is_packed = not_size_aligned (DR_REF (dr));
1197 if (dump_enabled_p ())
1198 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1199 "Unknown misalignment, %snaturally aligned\n",
1200 is_packed ? "not " : "");
1201 return targetm.vectorize.vector_alignment_reachable (type, is_packed);
1202 }
1203
1204 return true;
1205 }
1206
1207
1208 /* Calculate the cost of the memory access represented by DR. */
1209
1210 static void
1211 vect_get_data_access_cost (struct data_reference *dr,
1212 unsigned int *inside_cost,
1213 unsigned int *outside_cost,
1214 stmt_vector_for_cost *body_cost_vec)
1215 {
1216 gimple *stmt = DR_STMT (dr);
1217 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1218 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1219 int ncopies;
1220
1221 if (PURE_SLP_STMT (stmt_info))
1222 ncopies = 1;
1223 else
1224 ncopies = vect_get_num_copies (loop_vinfo, STMT_VINFO_VECTYPE (stmt_info));
1225
1226 if (DR_IS_READ (dr))
1227 vect_get_load_cost (dr, ncopies, true, inside_cost, outside_cost,
1228 NULL, body_cost_vec, false);
1229 else
1230 vect_get_store_cost (dr, ncopies, inside_cost, body_cost_vec);
1231
1232 if (dump_enabled_p ())
1233 dump_printf_loc (MSG_NOTE, vect_location,
1234 "vect_get_data_access_cost: inside_cost = %d, "
1235 "outside_cost = %d.\n", *inside_cost, *outside_cost);
1236 }
1237
1238
1239 typedef struct _vect_peel_info
1240 {
1241 struct data_reference *dr;
1242 int npeel;
1243 unsigned int count;
1244 } *vect_peel_info;
1245
1246 typedef struct _vect_peel_extended_info
1247 {
1248 struct _vect_peel_info peel_info;
1249 unsigned int inside_cost;
1250 unsigned int outside_cost;
1251 } *vect_peel_extended_info;
1252
1253
1254 /* Peeling hashtable helpers. */
1255
1256 struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
1257 {
1258 static inline hashval_t hash (const _vect_peel_info *);
1259 static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
1260 };
1261
1262 inline hashval_t
1263 peel_info_hasher::hash (const _vect_peel_info *peel_info)
1264 {
1265 return (hashval_t) peel_info->npeel;
1266 }
1267
1268 inline bool
1269 peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
1270 {
1271 return (a->npeel == b->npeel);
1272 }
1273
1274
1275 /* Insert DR into peeling hash table with NPEEL as key. */
1276
1277 static void
1278 vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
1279 loop_vec_info loop_vinfo, struct data_reference *dr,
1280 int npeel)
1281 {
1282 struct _vect_peel_info elem, *slot;
1283 _vect_peel_info **new_slot;
1284 bool supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1285
1286 elem.npeel = npeel;
1287 slot = peeling_htab->find (&elem);
1288 if (slot)
1289 slot->count++;
1290 else
1291 {
1292 slot = XNEW (struct _vect_peel_info);
1293 slot->npeel = npeel;
1294 slot->dr = dr;
1295 slot->count = 1;
1296 new_slot = peeling_htab->find_slot (slot, INSERT);
1297 *new_slot = slot;
1298 }
1299
1300 if (!supportable_dr_alignment
1301 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1302 slot->count += VECT_MAX_COST;
1303 }
1304
1305
1306 /* Traverse peeling hash table to find peeling option that aligns maximum
1307 number of data accesses. */
1308
1309 int
1310 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1311 _vect_peel_extended_info *max)
1312 {
1313 vect_peel_info elem = *slot;
1314
1315 if (elem->count > max->peel_info.count
1316 || (elem->count == max->peel_info.count
1317 && max->peel_info.npeel > elem->npeel))
1318 {
1319 max->peel_info.npeel = elem->npeel;
1320 max->peel_info.count = elem->count;
1321 max->peel_info.dr = elem->dr;
1322 }
1323
1324 return 1;
1325 }
1326
1327 /* Get the costs of peeling NPEEL iterations, checking data access costs
1328 for all data refs. If UNKNOWN_MISALIGNMENT is true, we assume DR0's
1329 misalignment will be zero after peeling. */
1330
1331 static void
1332 vect_get_peeling_costs_all_drs (vec<data_reference_p> datarefs,
1333 struct data_reference *dr0,
1334 unsigned int *inside_cost,
1335 unsigned int *outside_cost,
1336 stmt_vector_for_cost *body_cost_vec,
1337 unsigned int npeel,
1338 bool unknown_misalignment)
1339 {
1340 unsigned i;
1341 data_reference *dr;
1342
1343 FOR_EACH_VEC_ELT (datarefs, i, dr)
1344 {
1345 gimple *stmt = DR_STMT (dr);
1346 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1347 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1348 continue;
1349
1350 /* For interleaving, only the alignment of the first access
1351 matters. */
1352 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1353 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1354 continue;
1355
1356 /* Strided accesses perform only component accesses, alignment is
1357 irrelevant for them. */
1358 if (STMT_VINFO_STRIDED_P (stmt_info)
1359 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1360 continue;
1361
1362 int save_misalignment;
1363 save_misalignment = DR_MISALIGNMENT (dr);
1364 if (npeel == 0)
1365 ;
1366 else if (unknown_misalignment && dr == dr0)
1367 SET_DR_MISALIGNMENT (dr, 0);
1368 else
1369 vect_update_misalignment_for_peel (dr, dr0, npeel);
1370 vect_get_data_access_cost (dr, inside_cost, outside_cost,
1371 body_cost_vec);
1372 SET_DR_MISALIGNMENT (dr, save_misalignment);
1373 }
1374 }
1375
1376 /* Traverse peeling hash table and calculate cost for each peeling option.
1377 Find the one with the lowest cost. */
1378
1379 int
1380 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1381 _vect_peel_extended_info *min)
1382 {
1383 vect_peel_info elem = *slot;
1384 int dummy;
1385 unsigned int inside_cost = 0, outside_cost = 0;
1386 gimple *stmt = DR_STMT (elem->dr);
1387 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1388 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1389 stmt_vector_for_cost prologue_cost_vec, body_cost_vec,
1390 epilogue_cost_vec;
1391
1392 prologue_cost_vec.create (2);
1393 body_cost_vec.create (2);
1394 epilogue_cost_vec.create (2);
1395
1396 vect_get_peeling_costs_all_drs (LOOP_VINFO_DATAREFS (loop_vinfo),
1397 elem->dr, &inside_cost, &outside_cost,
1398 &body_cost_vec, elem->npeel, false);
1399
1400 body_cost_vec.release ();
1401
1402 outside_cost += vect_get_known_peeling_cost
1403 (loop_vinfo, elem->npeel, &dummy,
1404 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1405 &prologue_cost_vec, &epilogue_cost_vec);
1406
1407 /* Prologue and epilogue costs are added to the target model later.
1408 These costs depend only on the scalar iteration cost, the
1409 number of peeling iterations finally chosen, and the number of
1410 misaligned statements. So discard the information found here. */
1411 prologue_cost_vec.release ();
1412 epilogue_cost_vec.release ();
1413
1414 if (inside_cost < min->inside_cost
1415 || (inside_cost == min->inside_cost
1416 && outside_cost < min->outside_cost))
1417 {
1418 min->inside_cost = inside_cost;
1419 min->outside_cost = outside_cost;
1420 min->peel_info.dr = elem->dr;
1421 min->peel_info.npeel = elem->npeel;
1422 min->peel_info.count = elem->count;
1423 }
1424
1425 return 1;
1426 }
1427
1428
1429 /* Choose best peeling option by traversing peeling hash table and either
1430 choosing an option with the lowest cost (if cost model is enabled) or the
1431 option that aligns as many accesses as possible. */
1432
1433 static struct _vect_peel_extended_info
1434 vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
1435 loop_vec_info loop_vinfo)
1436 {
1437 struct _vect_peel_extended_info res;
1438
1439 res.peel_info.dr = NULL;
1440
1441 if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1442 {
1443 res.inside_cost = INT_MAX;
1444 res.outside_cost = INT_MAX;
1445 peeling_htab->traverse <_vect_peel_extended_info *,
1446 vect_peeling_hash_get_lowest_cost> (&res);
1447 }
1448 else
1449 {
1450 res.peel_info.count = 0;
1451 peeling_htab->traverse <_vect_peel_extended_info *,
1452 vect_peeling_hash_get_most_frequent> (&res);
1453 res.inside_cost = 0;
1454 res.outside_cost = 0;
1455 }
1456
1457 return res;
1458 }
1459
1460 /* Return true if the new peeling NPEEL is supported. */
1461
1462 static bool
1463 vect_peeling_supportable (loop_vec_info loop_vinfo, struct data_reference *dr0,
1464 unsigned npeel)
1465 {
1466 unsigned i;
1467 struct data_reference *dr = NULL;
1468 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1469 gimple *stmt;
1470 stmt_vec_info stmt_info;
1471 enum dr_alignment_support supportable_dr_alignment;
1472
1473 /* Ensure that all data refs can be vectorized after the peel. */
1474 FOR_EACH_VEC_ELT (datarefs, i, dr)
1475 {
1476 int save_misalignment;
1477
1478 if (dr == dr0)
1479 continue;
1480
1481 stmt = DR_STMT (dr);
1482 stmt_info = vinfo_for_stmt (stmt);
1483 /* For interleaving, only the alignment of the first access
1484 matters. */
1485 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1486 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1487 continue;
1488
1489 /* Strided accesses perform only component accesses, alignment is
1490 irrelevant for them. */
1491 if (STMT_VINFO_STRIDED_P (stmt_info)
1492 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1493 continue;
1494
1495 save_misalignment = DR_MISALIGNMENT (dr);
1496 vect_update_misalignment_for_peel (dr, dr0, npeel);
1497 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1498 SET_DR_MISALIGNMENT (dr, save_misalignment);
1499
1500 if (!supportable_dr_alignment)
1501 return false;
1502 }
1503
1504 return true;
1505 }
1506
1507 /* Function vect_enhance_data_refs_alignment
1508
1509 This pass will use loop versioning and loop peeling in order to enhance
1510 the alignment of data references in the loop.
1511
1512 FOR NOW: we assume that whatever versioning/peeling takes place, only the
1513 original loop is to be vectorized. Any other loops that are created by
1514    the transformations performed in this pass are not supposed to be
1515    vectorized.  This restriction will be relaxed.
1516
1517 This pass will require a cost model to guide it whether to apply peeling
1518 or versioning or a combination of the two. For example, the scheme that
1519    Intel uses when given a loop with several memory accesses is as follows:
1520    choose one memory access ('p') whose alignment you want to force by doing
1521 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
1522 other accesses are not necessarily aligned, or (2) use loop versioning to
1523 generate one loop in which all accesses are aligned, and another loop in
1524 which only 'p' is necessarily aligned.
1525
1526 ("Automatic Intra-Register Vectorization for the Intel Architecture",
1527    Aart J.C. Bik, Milind Girkar, Paul M. Grey and Xinmin Tian, International
1528 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1529
1530 Devising a cost model is the most critical aspect of this work. It will
1531 guide us on which access to peel for, whether to use loop versioning, how
1532 many versions to create, etc. The cost model will probably consist of
1533 generic considerations as well as target specific considerations (on
1534 powerpc for example, misaligned stores are more painful than misaligned
1535 loads).
1536
1537 Here are the general steps involved in alignment enhancements:
1538
1539 -- original loop, before alignment analysis:
1540 for (i=0; i<N; i++){
1541 x = q[i]; # DR_MISALIGNMENT(q) = unknown
1542 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1543 }
1544
1545 -- After vect_compute_data_refs_alignment:
1546 for (i=0; i<N; i++){
1547 x = q[i]; # DR_MISALIGNMENT(q) = 3
1548 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1549 }
1550
1551 -- Possibility 1: we do loop versioning:
1552 if (p is aligned) {
1553 for (i=0; i<N; i++){ # loop 1A
1554 x = q[i]; # DR_MISALIGNMENT(q) = 3
1555 p[i] = y; # DR_MISALIGNMENT(p) = 0
1556 }
1557 }
1558 else {
1559 for (i=0; i<N; i++){ # loop 1B
1560 x = q[i]; # DR_MISALIGNMENT(q) = 3
1561 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1562 }
1563 }
1564
1565 -- Possibility 2: we do loop peeling:
1566 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1567 x = q[i];
1568 p[i] = y;
1569 }
1570 for (i = 3; i < N; i++){ # loop 2A
1571 x = q[i]; # DR_MISALIGNMENT(q) = 0
1572 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1573 }
1574
1575 -- Possibility 3: combination of loop peeling and versioning:
1576 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1577 x = q[i];
1578 p[i] = y;
1579 }
1580 if (p is aligned) {
1581 for (i = 3; i<N; i++){ # loop 3A
1582 x = q[i]; # DR_MISALIGNMENT(q) = 0
1583 p[i] = y; # DR_MISALIGNMENT(p) = 0
1584 }
1585 }
1586 else {
1587 for (i = 3; i<N; i++){ # loop 3B
1588 x = q[i]; # DR_MISALIGNMENT(q) = 0
1589 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1590 }
1591 }
1592
1593 These loops are later passed to loop_transform to be vectorized. The
1594 vectorizer will use the alignment information to guide the transformation
1595 (whether to generate regular loads/stores, or with special handling for
1596 misalignment). */
1597
1598 bool
1599 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1600 {
1601 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1602 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1603 enum dr_alignment_support supportable_dr_alignment;
1604 struct data_reference *dr0 = NULL, *first_store = NULL;
1605 struct data_reference *dr;
1606 unsigned int i, j;
1607 bool do_peeling = false;
1608 bool do_versioning = false;
1609 bool stat;
1610 gimple *stmt;
1611 stmt_vec_info stmt_info;
1612 unsigned int npeel = 0;
1613 bool one_misalignment_known = false;
1614 bool one_misalignment_unknown = false;
1615 bool one_dr_unsupportable = false;
1616 struct data_reference *unsupportable_dr = NULL;
1617 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1618 unsigned possible_npeel_number = 1;
1619 tree vectype;
1620 unsigned int mis, same_align_drs_max = 0;
1621 hash_table<peel_info_hasher> peeling_htab (1);
1622
1623 if (dump_enabled_p ())
1624 dump_printf_loc (MSG_NOTE, vect_location,
1625 "=== vect_enhance_data_refs_alignment ===\n");
1626
1627 /* Reset data so we can safely be called multiple times. */
1628 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1629 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
1630
1631 /* While cost model enhancements are expected in the future, the high level
1632 view of the code at this time is as follows:
1633
1634 A) If there is a misaligned access then see if peeling to align
1635 this access can make all data references satisfy
1636 vect_supportable_dr_alignment. If so, update data structures
1637 as needed and return true.
1638
1639 B) If peeling wasn't possible and there is a data reference with an
1640 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1641 then see if loop versioning checks can be used to make all data
1642 references satisfy vect_supportable_dr_alignment. If so, update
1643 data structures as needed and return true.
1644
1645 C) If neither peeling nor versioning were successful then return false if
1646 any data reference does not satisfy vect_supportable_dr_alignment.
1647
1648 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1649
1650 Note, Possibility 3 above (which is peeling and versioning together) is not
1651 being done at this time. */
1652
1653 /* (1) Peeling to force alignment. */
1654
1655 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1656 Considerations:
1657 + How many accesses will become aligned due to the peeling
1658 - How many accesses will become unaligned due to the peeling,
1659 and the cost of misaligned accesses.
1660 - The cost of peeling (the extra runtime checks, the increase
1661 in code size). */
1662
1663 FOR_EACH_VEC_ELT (datarefs, i, dr)
1664 {
1665 stmt = DR_STMT (dr);
1666 stmt_info = vinfo_for_stmt (stmt);
1667
1668 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1669 continue;
1670
1671 /* For interleaving, only the alignment of the first access
1672 matters. */
1673 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1674 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1675 continue;
1676
1677 /* For invariant accesses there is nothing to enhance. */
1678 if (integer_zerop (DR_STEP (dr)))
1679 continue;
1680
1681 /* Strided accesses perform only component accesses, alignment is
1682 irrelevant for them. */
1683 if (STMT_VINFO_STRIDED_P (stmt_info)
1684 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1685 continue;
1686
1687 supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1688 do_peeling = vector_alignment_reachable_p (dr);
1689 if (do_peeling)
1690 {
1691 if (known_alignment_for_access_p (dr))
1692 {
1693 unsigned int npeel_tmp = 0;
1694 bool negative = tree_int_cst_compare (DR_STEP (dr),
1695 size_zero_node) < 0;
1696
1697 vectype = STMT_VINFO_VECTYPE (stmt_info);
1698 unsigned int target_align = DR_TARGET_ALIGNMENT (dr);
1699 unsigned int dr_size = vect_get_scalar_dr_size (dr);
1700 mis = (negative ? DR_MISALIGNMENT (dr) : -DR_MISALIGNMENT (dr));
1701 if (DR_MISALIGNMENT (dr) != 0)
1702 npeel_tmp = (mis & (target_align - 1)) / dr_size;
1703
1704 /* For multiple types, it is possible that the bigger type access
1705 will have more than one peeling option. E.g., a loop with two
1706 types: one of size (vector size / 4), and the other one of
1707 size (vector size / 8). The vectorization factor will be 8. If both
1708 accesses are misaligned by 3, the first one needs one scalar
1709 iteration to be aligned, and the second one needs 5. But the
1710 first one will be aligned also by peeling 5 scalar
1711 iterations, and in that case both accesses will be aligned.
1712 Hence, in addition to the immediate peeling amount, we also want
1713 to try adding a full vector's worth of elements, as long as we
1714 don't exceed the vectorization factor.
1715 We do this automatically for the cost model, since we calculate
1716 the cost for every peeling option. */
1717 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1718 {
1719 poly_uint64 nscalars = (STMT_SLP_TYPE (stmt_info)
1720 ? vf * GROUP_SIZE (stmt_info) : vf);
1721 possible_npeel_number
1722 = vect_get_num_vectors (nscalars, vectype);
1723
1724 /* NPEEL_TMP is 0 when there is no misalignment, but in that case
1725 also allow the extra option of peeling NELEMENTS. */
1726 if (DR_MISALIGNMENT (dr) == 0)
1727 possible_npeel_number++;
1728 }
1729
1730 /* Save info about DR in the hash table. Also include peeling
1731 amounts according to the explanation above. */
1732 for (j = 0; j < possible_npeel_number; j++)
1733 {
1734 vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
1735 dr, npeel_tmp);
1736 npeel_tmp += target_align / dr_size;
1737 }
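/* Illustrative example of the options recorded above (values assumed,
   not taken from the surrounding code): with DR_TARGET_ALIGNMENT == 16,
   a 4-byte scalar type (so target_align / dr_size == 4), a positive step,
   DR_MISALIGNMENT == 4, VF == 8 and a 4-element vectype, the initial
   candidate is npeel_tmp = ((-4) & 15) / 4 == 3 and possible_npeel_number
   is 8 / 4 == 2 under the unlimited cost model, so the peeling options
   3 and 7 are recorded for the cost model to choose from.  */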
1738
1739 one_misalignment_known = true;
1740 }
1741 else
1742 {
1743 /* If we don't know any misalignment values, we prefer
1744 peeling for the data-ref that has the maximum number of data-refs
1745 with the same alignment, unless the target prefers to align
1746 stores over loads. */
1747 unsigned same_align_drs
1748 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1749 if (!dr0
1750 || same_align_drs_max < same_align_drs)
1751 {
1752 same_align_drs_max = same_align_drs;
1753 dr0 = dr;
1754 }
1755 /* For data-refs with the same number of related
1756 accesses prefer the one where the misalign
1757 computation will be invariant in the outermost loop. */
1758 else if (same_align_drs_max == same_align_drs)
1759 {
1760 struct loop *ivloop0, *ivloop;
1761 ivloop0 = outermost_invariant_loop_for_expr
1762 (loop, DR_BASE_ADDRESS (dr0));
1763 ivloop = outermost_invariant_loop_for_expr
1764 (loop, DR_BASE_ADDRESS (dr));
1765 if ((ivloop && !ivloop0)
1766 || (ivloop && ivloop0
1767 && flow_loop_nested_p (ivloop, ivloop0)))
1768 dr0 = dr;
1769 }
1770
1771 one_misalignment_unknown = true;
1772
1773 /* Check for data refs with unsupportable alignment that
1774 can be peeled. */
1775 if (!supportable_dr_alignment)
1776 {
1777 one_dr_unsupportable = true;
1778 unsupportable_dr = dr;
1779 }
1780
1781 if (!first_store && DR_IS_WRITE (dr))
1782 first_store = dr;
1783 }
1784 }
1785 else
1786 {
1787 if (!aligned_access_p (dr))
1788 {
1789 if (dump_enabled_p ())
1790 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1791 "vector alignment may not be reachable\n");
1792 break;
1793 }
1794 }
1795 }
1796
1797 /* Check if we can possibly peel the loop. */
1798 if (!vect_can_advance_ivs_p (loop_vinfo)
1799 || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))
1800 || loop->inner)
1801 do_peeling = false;
1802
1803 struct _vect_peel_extended_info peel_for_known_alignment;
1804 struct _vect_peel_extended_info peel_for_unknown_alignment;
1805 struct _vect_peel_extended_info best_peel;
1806
1807 peel_for_unknown_alignment.inside_cost = INT_MAX;
1808 peel_for_unknown_alignment.outside_cost = INT_MAX;
1809 peel_for_unknown_alignment.peel_info.count = 0;
1810
1811 if (do_peeling
1812 && one_misalignment_unknown)
1813 {
1814 /* Check if the target prefers aligning stores over loads, i.e., if
1815 misaligned stores are more expensive than misaligned loads (taking
1816 drs with the same alignment into account). */
1817 unsigned int load_inside_cost = 0;
1818 unsigned int load_outside_cost = 0;
1819 unsigned int store_inside_cost = 0;
1820 unsigned int store_outside_cost = 0;
1821 unsigned int estimated_npeels = vect_vf_for_cost (loop_vinfo) / 2;
1822
1823 stmt_vector_for_cost dummy;
1824 dummy.create (2);
1825 vect_get_peeling_costs_all_drs (datarefs, dr0,
1826 &load_inside_cost,
1827 &load_outside_cost,
1828 &dummy, estimated_npeels, true);
1829 dummy.release ();
1830
1831 if (first_store)
1832 {
1833 dummy.create (2);
1834 vect_get_peeling_costs_all_drs (datarefs, first_store,
1835 &store_inside_cost,
1836 &store_outside_cost,
1837 &dummy, estimated_npeels, true);
1838 dummy.release ();
1839 }
1840 else
1841 {
1842 store_inside_cost = INT_MAX;
1843 store_outside_cost = INT_MAX;
1844 }
1845
1846 if (load_inside_cost > store_inside_cost
1847 || (load_inside_cost == store_inside_cost
1848 && load_outside_cost > store_outside_cost))
1849 {
1850 dr0 = first_store;
1851 peel_for_unknown_alignment.inside_cost = store_inside_cost;
1852 peel_for_unknown_alignment.outside_cost = store_outside_cost;
1853 }
1854 else
1855 {
1856 peel_for_unknown_alignment.inside_cost = load_inside_cost;
1857 peel_for_unknown_alignment.outside_cost = load_outside_cost;
1858 }
1859
1860 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1861 prologue_cost_vec.create (2);
1862 epilogue_cost_vec.create (2);
1863
1864 int dummy2;
1865 peel_for_unknown_alignment.outside_cost += vect_get_known_peeling_cost
1866 (loop_vinfo, estimated_npeels, &dummy2,
1867 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1868 &prologue_cost_vec, &epilogue_cost_vec);
1869
1870 prologue_cost_vec.release ();
1871 epilogue_cost_vec.release ();
1872
1873 peel_for_unknown_alignment.peel_info.count = 1
1874 + STMT_VINFO_SAME_ALIGN_REFS
1875 (vinfo_for_stmt (DR_STMT (dr0))).length ();
1876 }
1877
1878 peel_for_unknown_alignment.peel_info.npeel = 0;
1879 peel_for_unknown_alignment.peel_info.dr = dr0;
1880
1881 best_peel = peel_for_unknown_alignment;
1882
1883 peel_for_known_alignment.inside_cost = INT_MAX;
1884 peel_for_known_alignment.outside_cost = INT_MAX;
1885 peel_for_known_alignment.peel_info.count = 0;
1886 peel_for_known_alignment.peel_info.dr = NULL;
1887
1888 if (do_peeling && one_misalignment_known)
1889 {
1890 /* Peeling is possible, but no data access strictly requires alignment
1891 in order to be supported. So we try to choose the best possible peeling
1892 from the hash table. */
1893 peel_for_known_alignment = vect_peeling_hash_choose_best_peeling
1894 (&peeling_htab, loop_vinfo);
1895 }
1896
1897 /* Compare costs of peeling for known and unknown alignment. */
1898 if (peel_for_known_alignment.peel_info.dr != NULL
1899 && peel_for_unknown_alignment.inside_cost
1900 >= peel_for_known_alignment.inside_cost)
1901 {
1902 best_peel = peel_for_known_alignment;
1903
1904 /* If the best peeling for known alignment has NPEEL == 0, perform no
1905 peeling at all except if there is an unsupportable dr that we can
1906 align. */
1907 if (best_peel.peel_info.npeel == 0 && !one_dr_unsupportable)
1908 do_peeling = false;
1909 }
1910
1911 /* If there is an unsupportable data ref, prefer peeling for it over all
1912 choices so far, since any other chosen peeling would have to be discarded
1913 unless it happened to align the unsupportable data ref as well. */
1914 if (one_dr_unsupportable)
1915 dr0 = unsupportable_dr;
1916 else if (do_peeling)
1917 {
1918 /* Calculate the penalty for no peeling, i.e. leaving everything as-is.
1919 TODO: Use nopeel_outside_cost or get rid of it? */
1920 unsigned nopeel_inside_cost = 0;
1921 unsigned nopeel_outside_cost = 0;
1922
1923 stmt_vector_for_cost dummy;
1924 dummy.create (2);
1925 vect_get_peeling_costs_all_drs (datarefs, NULL, &nopeel_inside_cost,
1926 &nopeel_outside_cost, &dummy, 0, false);
1927 dummy.release ();
1928
1929 /* Add epilogue costs. As we do not peel for alignment here, no prologue
1930 costs will be recorded. */
1931 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1932 prologue_cost_vec.create (2);
1933 epilogue_cost_vec.create (2);
1934
1935 int dummy2;
1936 nopeel_outside_cost += vect_get_known_peeling_cost
1937 (loop_vinfo, 0, &dummy2,
1938 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1939 &prologue_cost_vec, &epilogue_cost_vec);
1940
1941 prologue_cost_vec.release ();
1942 epilogue_cost_vec.release ();
1943
1944 npeel = best_peel.peel_info.npeel;
1945 dr0 = best_peel.peel_info.dr;
1946
1947 /* If not peeling at all is no more expensive than the best peeling we
1948 have found so far, don't perform any peeling. */
1949 if (nopeel_inside_cost <= best_peel.inside_cost)
1950 do_peeling = false;
1951 }
1952
1953 if (do_peeling)
1954 {
1955 stmt = DR_STMT (dr0);
1956 stmt_info = vinfo_for_stmt (stmt);
1957 vectype = STMT_VINFO_VECTYPE (stmt_info);
1958
1959 if (known_alignment_for_access_p (dr0))
1960 {
1961 bool negative = tree_int_cst_compare (DR_STEP (dr0),
1962 size_zero_node) < 0;
1963 if (!npeel)
1964 {
1965 /* Since it's known at compile time, compute the number of
1966 iterations in the peeled loop (the peeling factor) for use in
1967 updating DR_MISALIGNMENT values. The peeling factor is the
1968 vectorization factor minus the misalignment as an element
1969 count. */
1970 mis = negative ? DR_MISALIGNMENT (dr0) : -DR_MISALIGNMENT (dr0);
1971 unsigned int target_align = DR_TARGET_ALIGNMENT (dr0);
1972 npeel = ((mis & (target_align - 1))
1973 / vect_get_scalar_dr_size (dr0));
1974 }
1975
1976 /* For interleaved data access every iteration accesses all the
1977 members of the group, therefore we divide the number of iterations
1978 by the group size. */
1979 stmt_info = vinfo_for_stmt (DR_STMT (dr0));
1980 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1981 npeel /= GROUP_SIZE (stmt_info);
1982
1983 if (dump_enabled_p ())
1984 dump_printf_loc (MSG_NOTE, vect_location,
1985 "Try peeling by %d\n", npeel);
1986 }
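/* Worked instance of the formula above (numbers assumed for illustration):
   with DR_TARGET_ALIGNMENT == 16, 4-byte elements, a positive step and
   DR_MISALIGNMENT (dr0) == 8, mis == -8 and npeel == ((-8) & 15) / 4 == 2,
   i.e. two peeled scalar iterations make dr0 16-byte aligned.  For a
   grouped access with GROUP_SIZE == 2 each iteration covers both group
   members, so npeel is further divided down to 1.  */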
1987
1988 /* Ensure that all datarefs can be vectorized after the peel. */
1989 if (!vect_peeling_supportable (loop_vinfo, dr0, npeel))
1990 do_peeling = false;
1991
1992 /* Check if all datarefs are supportable and log. */
1993 if (do_peeling && known_alignment_for_access_p (dr0) && npeel == 0)
1994 {
1995 stat = vect_verify_datarefs_alignment (loop_vinfo);
1996 if (!stat)
1997 do_peeling = false;
1998 else
1999 return stat;
2000 }
2001
2002 /* Cost model #1 - honor --param vect-max-peeling-for-alignment. */
2003 if (do_peeling)
2004 {
2005 unsigned max_allowed_peel
2006 = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT);
2007 if (max_allowed_peel != (unsigned)-1)
2008 {
2009 unsigned max_peel = npeel;
2010 if (max_peel == 0)
2011 {
2012 unsigned int target_align = DR_TARGET_ALIGNMENT (dr0);
2013 max_peel = target_align / vect_get_scalar_dr_size (dr0) - 1;
2014 }
2015 if (max_peel > max_allowed_peel)
2016 {
2017 do_peeling = false;
2018 if (dump_enabled_p ())
2019 dump_printf_loc (MSG_NOTE, vect_location,
2020 "Disable peeling, max peels reached: %d\n", max_peel);
2021 }
2022 }
2023 }
2024
2025 /* Cost model #2 - if peeling may result in a remaining loop not
2026 iterating enough to be vectorized then do not peel. Since this
2027 is a cost heuristic rather than a correctness decision, use the
2028 most likely runtime value for variable vectorization factors. */
2029 if (do_peeling
2030 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2031 {
2032 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2033 unsigned int max_peel = npeel == 0 ? assumed_vf - 1 : npeel;
2034 if ((unsigned HOST_WIDE_INT) LOOP_VINFO_INT_NITERS (loop_vinfo)
2035 < assumed_vf + max_peel)
2036 do_peeling = false;
2037 }
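/* For instance (illustrative numbers): with an assumed VF of 8 and
   npeel == 0, max_peel is 7, so a loop known to run 20 iterations keeps
   do_peeling (20 >= 8 + 7), whereas a loop of 12 iterations does not
   (12 < 15), since too few iterations would remain for the vector loop.  */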
2038
2039 if (do_peeling)
2040 {
2041 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
2042 If the misalignment of DR_i is identical to that of dr0 then set
2043 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
2044 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
2045 by the peeling factor times the element size of DR_i (MOD the
2046 vectorization factor times the size). Otherwise, the
2047 misalignment of DR_i must be set to unknown. */
2048 FOR_EACH_VEC_ELT (datarefs, i, dr)
2049 if (dr != dr0)
2050 {
2051 /* Strided accesses perform only component accesses, alignment
2052 is irrelevant for them. */
2053 stmt_info = vinfo_for_stmt (DR_STMT (dr));
2054 if (STMT_VINFO_STRIDED_P (stmt_info)
2055 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2056 continue;
2057
2058 vect_update_misalignment_for_peel (dr, dr0, npeel);
2059 }
2060
2061 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0;
2062 if (npeel)
2063 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
2064 else
2065 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2066 = DR_MISALIGNMENT (dr0);
2067 SET_DR_MISALIGNMENT (dr0, 0);
2068 if (dump_enabled_p ())
2069 {
2070 dump_printf_loc (MSG_NOTE, vect_location,
2071 "Alignment of access forced using peeling.\n");
2072 dump_printf_loc (MSG_NOTE, vect_location,
2073 "Peeling for alignment will be applied.\n");
2074 }
2075
2076 /* The inside-loop cost will be accounted for in vectorizable_load
2077 and vectorizable_store correctly with adjusted alignments.
2078 Drop the body_cst_vec on the floor here. */
2079 stat = vect_verify_datarefs_alignment (loop_vinfo);
2080 gcc_assert (stat);
2081 return stat;
2082 }
2083 }
2084
2085 /* (2) Versioning to force alignment. */
2086
2087 /* Try versioning if:
2088 1) optimize loop for speed
2089 2) there is at least one unsupported misaligned data ref with an unknown
2090 misalignment, and
2091 3) all misaligned data refs with a known misalignment are supported, and
2092 4) the number of runtime alignment checks is within reason. */
2093
2094 do_versioning =
2095 optimize_loop_nest_for_speed_p (loop)
2096 && (!loop->inner); /* FORNOW */
2097
2098 if (do_versioning)
2099 {
2100 FOR_EACH_VEC_ELT (datarefs, i, dr)
2101 {
2102 stmt = DR_STMT (dr);
2103 stmt_info = vinfo_for_stmt (stmt);
2104
2105 /* For interleaving, only the alignment of the first access
2106 matters. */
2107 if (aligned_access_p (dr)
2108 || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
2109 && GROUP_FIRST_ELEMENT (stmt_info) != stmt))
2110 continue;
2111
2112 if (STMT_VINFO_STRIDED_P (stmt_info))
2113 {
2114 /* Strided loads perform only component accesses, alignment is
2115 irrelevant for them. */
2116 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
2117 continue;
2118 do_versioning = false;
2119 break;
2120 }
2121
2122 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
2123
2124 if (!supportable_dr_alignment)
2125 {
2126 gimple *stmt;
2127 int mask;
2128 tree vectype;
2129
2130 if (known_alignment_for_access_p (dr)
2131 || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
2132 >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
2133 {
2134 do_versioning = false;
2135 break;
2136 }
2137
2138 stmt = DR_STMT (dr);
2139 vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
2140 gcc_assert (vectype);
2141
2142 /* The rightmost bits of an aligned address must be zeros.
2143 Construct the mask needed for this test. For example,
2144 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
2145 mask must be 15 = 0xf. */
2146 mask = GET_MODE_SIZE (TYPE_MODE (vectype)) - 1;
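/* Sketch of the runtime test this mask feeds into (illustrative only;
   the actual GIMPLE is emitted later when the loop gets versioned): the
   addresses of all may-misalign refs are OR-ed together and tested
   against the mask, conceptually

       if ((((uintptr_t) p | (uintptr_t) q) & 15) == 0)
         ... vectorized, alignment-assuming version ...
       else
         ... original scalar loop ...  */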
2147
2148 /* FORNOW: use the same mask to test all potentially unaligned
2149 references in the loop. The vectorizer currently supports
2150 a single vector size, see the reference to
2151 GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
2152 vectorization factor is computed. */
2153 gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
2154 || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
2155 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
2156 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (
2157 DR_STMT (dr));
2158 }
2159 }
2160
2161 /* Versioning requires at least one misaligned data reference. */
2162 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2163 do_versioning = false;
2164 else if (!do_versioning)
2165 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
2166 }
2167
2168 if (do_versioning)
2169 {
2170 vec<gimple *> may_misalign_stmts
2171 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
2172 gimple *stmt;
2173
2174 /* It can now be assumed that the data references in the statements
2175 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
2176 of the loop being vectorized. */
2177 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt)
2178 {
2179 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2180 dr = STMT_VINFO_DATA_REF (stmt_info);
2181 SET_DR_MISALIGNMENT (dr, 0);
2182 if (dump_enabled_p ())
2183 dump_printf_loc (MSG_NOTE, vect_location,
2184 "Alignment of access forced using versioning.\n");
2185 }
2186
2187 if (dump_enabled_p ())
2188 dump_printf_loc (MSG_NOTE, vect_location,
2189 "Versioning for alignment will be applied.\n");
2190
2191 /* Peeling and versioning can't be done together at this time. */
2192 gcc_assert (! (do_peeling && do_versioning));
2193
2194 stat = vect_verify_datarefs_alignment (loop_vinfo);
2195 gcc_assert (stat);
2196 return stat;
2197 }
2198
2199 /* This point is reached if neither peeling nor versioning is being done. */
2200 gcc_assert (! (do_peeling || do_versioning));
2201
2202 stat = vect_verify_datarefs_alignment (loop_vinfo);
2203 return stat;
2204 }
2205
2206
2207 /* Function vect_find_same_alignment_drs.
2208
2209 Update group and alignment relations according to the chosen
2210 vectorization factor. */
2211
2212 static void
2213 vect_find_same_alignment_drs (struct data_dependence_relation *ddr)
2214 {
2215 struct data_reference *dra = DDR_A (ddr);
2216 struct data_reference *drb = DDR_B (ddr);
2217 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2218 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2219
2220 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
2221 return;
2222
2223 if (dra == drb)
2224 return;
2225
2226 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0)
2227 || !operand_equal_p (DR_OFFSET (dra), DR_OFFSET (drb), 0)
2228 || !operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2229 return;
2230
2231 /* Two references with distance zero have the same alignment. */
2232 offset_int diff = (wi::to_offset (DR_INIT (dra))
2233 - wi::to_offset (DR_INIT (drb)));
2234 if (diff != 0)
2235 {
2236 /* Get the wider of the two alignments. */
2237 unsigned int align_a = (vect_calculate_target_alignment (dra)
2238 / BITS_PER_UNIT);
2239 unsigned int align_b = (vect_calculate_target_alignment (drb)
2240 / BITS_PER_UNIT);
2241 unsigned int max_align = MAX (align_a, align_b);
2242
2243 /* Require the gap to be a multiple of the larger vector alignment. */
2244 if (!wi::multiple_of_p (diff, max_align, SIGNED))
2245 return;
2246 }
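/* E.g. (illustrative): two refs off the same base with DR_INIT 0 and 32
   and a 16-byte target alignment on both sides pass the check above
   (32 is a multiple of 16) and are recorded below as having the same
   alignment; with inits 0 and 8 the gap is not a multiple of 16 and
   nothing is recorded.  */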
2247
2248 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
2249 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
2250 if (dump_enabled_p ())
2251 {
2252 dump_printf_loc (MSG_NOTE, vect_location,
2253 "accesses have the same alignment: ");
2254 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2255 dump_printf (MSG_NOTE, " and ");
2256 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2257 dump_printf (MSG_NOTE, "\n");
2258 }
2259 }
2260
2261
2262 /* Function vect_analyze_data_refs_alignment
2263
2264 Analyze the alignment of the data-references in the loop.
2265 Return FALSE if a data reference is found that cannot be vectorized. */
2266
2267 bool
2268 vect_analyze_data_refs_alignment (loop_vec_info vinfo)
2269 {
2270 if (dump_enabled_p ())
2271 dump_printf_loc (MSG_NOTE, vect_location,
2272 "=== vect_analyze_data_refs_alignment ===\n");
2273
2274 /* Mark groups of data references with same alignment using
2275 data dependence information. */
2276 vec<ddr_p> ddrs = vinfo->ddrs;
2277 struct data_dependence_relation *ddr;
2278 unsigned int i;
2279
2280 FOR_EACH_VEC_ELT (ddrs, i, ddr)
2281 vect_find_same_alignment_drs (ddr);
2282
2283 vec<data_reference_p> datarefs = vinfo->datarefs;
2284 struct data_reference *dr;
2285
2286 vect_record_base_alignments (vinfo);
2287 FOR_EACH_VEC_ELT (datarefs, i, dr)
2288 {
2289 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
2290 if (STMT_VINFO_VECTORIZABLE (stmt_info)
2291 && !vect_compute_data_ref_alignment (dr))
2292 {
2293 /* Strided accesses perform only component accesses, misalignment
2294 information is irrelevant for them. */
2295 if (STMT_VINFO_STRIDED_P (stmt_info)
2296 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2297 continue;
2298
2299 if (dump_enabled_p ())
2300 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2301 "not vectorized: can't calculate alignment "
2302 "for data ref.\n");
2303
2304 return false;
2305 }
2306 }
2307
2308 return true;
2309 }
2310
2311
2312 /* Analyze alignment of DRs of stmts in NODE. */
2313
2314 static bool
2315 vect_slp_analyze_and_verify_node_alignment (slp_tree node)
2316 {
2317 /* We vectorize from the first scalar stmt in the node unless
2318 the node is permuted in which case we start from the first
2319 element in the group. */
2320 gimple *first_stmt = SLP_TREE_SCALAR_STMTS (node)[0];
2321 data_reference_p first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2322 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2323 first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (first_stmt));
2324
2325 data_reference_p dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2326 if (! vect_compute_data_ref_alignment (dr)
2327 /* For creating the data-ref pointer we need alignment of the
2328 first element anyway. */
2329 || (dr != first_dr
2330 && ! vect_compute_data_ref_alignment (first_dr))
2331 || ! verify_data_ref_alignment (dr))
2332 {
2333 if (dump_enabled_p ())
2334 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2335 "not vectorized: bad data alignment in basic "
2336 "block.\n");
2337 return false;
2338 }
2339
2340 return true;
2341 }
2342
2343 /* Function vect_slp_analyze_and_verify_instance_alignment
2344
2345 Analyze the alignment of the data-references in the SLP instance.
2346 Return FALSE if a data reference is found that cannot be vectorized. */
2347
2348 bool
2349 vect_slp_analyze_and_verify_instance_alignment (slp_instance instance)
2350 {
2351 if (dump_enabled_p ())
2352 dump_printf_loc (MSG_NOTE, vect_location,
2353 "=== vect_slp_analyze_and_verify_instance_alignment ===\n");
2354
2355 slp_tree node;
2356 unsigned i;
2357 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
2358 if (! vect_slp_analyze_and_verify_node_alignment (node))
2359 return false;
2360
2361 node = SLP_INSTANCE_TREE (instance);
2362 if (STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]))
2363 && ! vect_slp_analyze_and_verify_node_alignment
2364 (SLP_INSTANCE_TREE (instance)))
2365 return false;
2366
2367 return true;
2368 }
2369
2370
2371 /* Analyze groups of accesses: check that DR belongs to a group of
2372 accesses of legal size, step, etc. Detect gaps, single element
2373 interleaving, and other special cases. Set grouped access info.
2374 Collect groups of strided stores for further use in SLP analysis.
2375 Worker for vect_analyze_group_access. */
2376
2377 static bool
2378 vect_analyze_group_access_1 (struct data_reference *dr)
2379 {
2380 tree step = DR_STEP (dr);
2381 tree scalar_type = TREE_TYPE (DR_REF (dr));
2382 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2383 gimple *stmt = DR_STMT (dr);
2384 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2385 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2386 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2387 HOST_WIDE_INT dr_step = -1;
2388 HOST_WIDE_INT groupsize, last_accessed_element = 1;
2389 bool slp_impossible = false;
2390
2391 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2392 size of the interleaving group (including gaps). */
2393 if (tree_fits_shwi_p (step))
2394 {
2395 dr_step = tree_to_shwi (step);
2396 /* Check that STEP is a multiple of type size. Otherwise there is
2397 a non-element-sized gap at the end of the group which we
2398 cannot represent in GROUP_GAP or GROUP_SIZE.
2399 ??? As we can handle non-constant step fine here we should
2400 simply remove uses of GROUP_GAP between the last and first
2401 element and instead rely on DR_STEP. GROUP_SIZE then would
2402 simply not include that gap. */
2403 if ((dr_step % type_size) != 0)
2404 {
2405 if (dump_enabled_p ())
2406 {
2407 dump_printf_loc (MSG_NOTE, vect_location,
2408 "Step ");
2409 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2410 dump_printf (MSG_NOTE,
2411 " is not a multiple of the element size for ");
2412 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2413 dump_printf (MSG_NOTE, "\n");
2414 }
2415 return false;
2416 }
2417 groupsize = absu_hwi (dr_step) / type_size;
2418 }
2419 else
2420 groupsize = 0;
2421
2422 /* A non-consecutive access is possible only if it is part of an interleaving group. */
2423 if (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
2424 {
2425 /* Check if this DR is a part of interleaving, and is a single
2426 element of the group that is accessed in the loop. */
2427
2428 /* Gaps are supported only for loads. STEP must be a multiple of the type
2429 size. The size of the group must be a power of 2. */
2430 if (DR_IS_READ (dr)
2431 && (dr_step % type_size) == 0
2432 && groupsize > 0
2433 && pow2p_hwi (groupsize))
2434 {
2435 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = stmt;
2436 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2437 GROUP_GAP (stmt_info) = groupsize - 1;
2438 if (dump_enabled_p ())
2439 {
2440 dump_printf_loc (MSG_NOTE, vect_location,
2441 "Detected single element interleaving ");
2442 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2443 dump_printf (MSG_NOTE, " step ");
2444 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2445 dump_printf (MSG_NOTE, "\n");
2446 }
2447
2448 return true;
2449 }
2450
2451 if (dump_enabled_p ())
2452 {
2453 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2454 "not consecutive access ");
2455 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2456 }
2457
2458 if (bb_vinfo)
2459 {
2460 /* Mark the statement as unvectorizable. */
2461 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2462 return true;
2463 }
2464
2465 dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
2466 STMT_VINFO_STRIDED_P (stmt_info) = true;
2467 return true;
2468 }
2469
2470 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt)
2471 {
2472 /* First stmt in the interleaving chain. Check the chain. */
2473 gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
2474 struct data_reference *data_ref = dr;
2475 unsigned int count = 1;
2476 tree prev_init = DR_INIT (data_ref);
2477 gimple *prev = stmt;
2478 HOST_WIDE_INT diff, gaps = 0;
2479
2480 while (next)
2481 {
2482 /* Skip same data-refs. In case two or more stmts share a
2483 data-ref (supported only for loads), we vectorize only the first
2484 stmt, and the rest get their vectorized loads from the first
2485 one. */
2486 if (!tree_int_cst_compare (DR_INIT (data_ref),
2487 DR_INIT (STMT_VINFO_DATA_REF (
2488 vinfo_for_stmt (next)))))
2489 {
2490 if (DR_IS_WRITE (data_ref))
2491 {
2492 if (dump_enabled_p ())
2493 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2494 "Two store stmts share the same dr.\n");
2495 return false;
2496 }
2497
2498 if (dump_enabled_p ())
2499 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2500 "Two or more load stmts share the same dr.\n");
2501
2502 /* For loads, use the same data-ref load. */
2503 GROUP_SAME_DR_STMT (vinfo_for_stmt (next)) = prev;
2504
2505 prev = next;
2506 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2507 continue;
2508 }
2509
2510 prev = next;
2511 data_ref = STMT_VINFO_DATA_REF (vinfo_for_stmt (next));
2512
2513 /* All group members have the same STEP by construction. */
2514 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2515
2516 /* Check that the distance between two accesses is equal to the type
2517 size. Otherwise, we have gaps. */
2518 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2519 - TREE_INT_CST_LOW (prev_init)) / type_size;
2520 if (diff != 1)
2521 {
2522 /* FORNOW: SLP of accesses with gaps is not supported. */
2523 slp_impossible = true;
2524 if (DR_IS_WRITE (data_ref))
2525 {
2526 if (dump_enabled_p ())
2527 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2528 "interleaved store with gaps\n");
2529 return false;
2530 }
2531
2532 gaps += diff - 1;
2533 }
2534
2535 last_accessed_element += diff;
2536
2537 /* Store the gap from the previous member of the group. If there is no
2538 gap in the access, GROUP_GAP is always 1. */
2539 GROUP_GAP (vinfo_for_stmt (next)) = diff;
2540
2541 prev_init = DR_INIT (data_ref);
2542 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2543 /* Count the number of data-refs in the chain. */
2544 count++;
2545 }
2546
2547 if (groupsize == 0)
2548 groupsize = count + gaps;
2549
2550 /* This could be UINT_MAX but as we are generating code in a very
2551 inefficient way we have to cap earlier. See PR78699 for example. */
2552 if (groupsize > 4096)
2553 {
2554 if (dump_enabled_p ())
2555 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2556 "group is too large\n");
2557 return false;
2558 }
2559
2560 /* Check that the size of the interleaving is equal to count for stores,
2561 i.e., that there are no gaps. */
2562 if (groupsize != count
2563 && !DR_IS_READ (dr))
2564 {
2565 if (dump_enabled_p ())
2566 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2567 "interleaved store with gaps\n");
2568 return false;
2569 }
2570
2571 /* If there is a gap after the last load in the group it is the
2572 difference between the groupsize and the last accessed
2573 element.
2574 When there is no gap, this difference should be 0. */
2575 GROUP_GAP (vinfo_for_stmt (stmt)) = groupsize - last_accessed_element;
2576
2577 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2578 if (dump_enabled_p ())
2579 {
2580 dump_printf_loc (MSG_NOTE, vect_location,
2581 "Detected interleaving ");
2582 if (DR_IS_READ (dr))
2583 dump_printf (MSG_NOTE, "load ");
2584 else
2585 dump_printf (MSG_NOTE, "store ");
2586 dump_printf (MSG_NOTE, "of size %u starting with ",
2587 (unsigned)groupsize);
2588 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
2589 if (GROUP_GAP (vinfo_for_stmt (stmt)) != 0)
2590 dump_printf_loc (MSG_NOTE, vect_location,
2591 "There is a gap of %u elements after the group\n",
2592 GROUP_GAP (vinfo_for_stmt (stmt)));
2593 }
2594
2595 /* SLP: create an SLP data structure for every interleaving group of
2596 stores for further analysis in vect_analyze_slp. */
2597 if (DR_IS_WRITE (dr) && !slp_impossible)
2598 {
2599 if (loop_vinfo)
2600 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt);
2601 if (bb_vinfo)
2602 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt);
2603 }
2604 }
2605
2606 return true;
2607 }
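/* Worked example for the analysis above (layout assumed for illustration):
   a load group over a 4-byte type with DR_STEP == 16 has groupsize == 4.
   If the chain accesses byte offsets 0, 4 and 12, the element distances
   are 1 and 2, so GROUP_GAP of the last member is 2, last_accessed_element
   ends up as 4 and GROUP_GAP of the first member is 4 - 4 == 0.  The same
   layout in a store group is rejected as an interleaved store with gaps.  */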
2608
2609 /* Analyze groups of accesses: check that DR belongs to a group of
2610 accesses of legal size, step, etc. Detect gaps, single element
2611 interleaving, and other special cases. Set grouped access info.
2612 Collect groups of strided stores for further use in SLP analysis. */
2613
2614 static bool
2615 vect_analyze_group_access (struct data_reference *dr)
2616 {
2617 if (!vect_analyze_group_access_1 (dr))
2618 {
2619 /* Dissolve the group if present. */
2620 gimple *next;
2621 gimple *stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dr)));
2622 while (stmt)
2623 {
2624 stmt_vec_info vinfo = vinfo_for_stmt (stmt);
2625 next = GROUP_NEXT_ELEMENT (vinfo);
2626 GROUP_FIRST_ELEMENT (vinfo) = NULL;
2627 GROUP_NEXT_ELEMENT (vinfo) = NULL;
2628 stmt = next;
2629 }
2630 return false;
2631 }
2632 return true;
2633 }
2634
2635 /* Analyze the access pattern of the data-reference DR.
2636 In case of non-consecutive accesses call vect_analyze_group_access() to
2637 analyze groups of accesses. */
2638
2639 static bool
2640 vect_analyze_data_ref_access (struct data_reference *dr)
2641 {
2642 tree step = DR_STEP (dr);
2643 tree scalar_type = TREE_TYPE (DR_REF (dr));
2644 gimple *stmt = DR_STMT (dr);
2645 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2646 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2647 struct loop *loop = NULL;
2648
2649 if (loop_vinfo)
2650 loop = LOOP_VINFO_LOOP (loop_vinfo);
2651
2652 if (loop_vinfo && !step)
2653 {
2654 if (dump_enabled_p ())
2655 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2656 "bad data-ref access in loop\n");
2657 return false;
2658 }
2659
2660 /* Allow loads with zero step in inner-loop vectorization. */
2661 if (loop_vinfo && integer_zerop (step))
2662 {
2663 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2664 if (!nested_in_vect_loop_p (loop, stmt))
2665 return DR_IS_READ (dr);
2666 /* Allow references with zero step for outer loops marked
2667 with pragma omp simd only - it guarantees absence of
2668 loop-carried dependencies between inner loop iterations. */
2669 if (!loop->force_vectorize)
2670 {
2671 if (dump_enabled_p ())
2672 dump_printf_loc (MSG_NOTE, vect_location,
2673 "zero step in inner loop of nest\n");
2674 return false;
2675 }
2676 }
2677
2678 if (loop && nested_in_vect_loop_p (loop, stmt))
2679 {
2680 /* Interleaved accesses are not yet supported within outer-loop
2681 vectorization for references in the inner-loop. */
2682 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2683
2684 /* For the rest of the analysis we use the outer-loop step. */
2685 step = STMT_VINFO_DR_STEP (stmt_info);
2686 if (integer_zerop (step))
2687 {
2688 if (dump_enabled_p ())
2689 dump_printf_loc (MSG_NOTE, vect_location,
2690 "zero step in outer loop.\n");
2691 return DR_IS_READ (dr);
2692 }
2693 }
2694
2695 /* Consecutive? */
2696 if (TREE_CODE (step) == INTEGER_CST)
2697 {
2698 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2699 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2700 || (dr_step < 0
2701 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2702 {
2703 /* Mark that it is not interleaving. */
2704 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2705 return true;
2706 }
2707 }
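/* E.g. (illustrative): for a 4-byte scalar type a DR_STEP of 4 (or -4,
   a reverse access) is consecutive and handled above, while a DR_STEP of
   8 -- every other element -- falls through to the group/strided analysis
   below.  */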
2708
2709 if (loop && nested_in_vect_loop_p (loop, stmt))
2710 {
2711 if (dump_enabled_p ())
2712 dump_printf_loc (MSG_NOTE, vect_location,
2713 "grouped access in outer loop.\n");
2714 return false;
2715 }
2716
2717
2718 /* Assume this is a DR handled by the non-constant strided load case. */
2719 if (TREE_CODE (step) != INTEGER_CST)
2720 return (STMT_VINFO_STRIDED_P (stmt_info)
2721 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2722 || vect_analyze_group_access (dr)));
2723
2724 /* Not consecutive access - check if it's a part of interleaving group. */
2725 return vect_analyze_group_access (dr);
2726 }
2727
2728 /* Compare two data-references DRA and DRB, sorting them so that
2729 data-refs that can be grouped end up next to each other. */
2730
2731 static int
2732 dr_group_sort_cmp (const void *dra_, const void *drb_)
2733 {
2734 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2735 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2736 int cmp;
2737
2738 /* Stabilize sort. */
2739 if (dra == drb)
2740 return 0;
2741
2742 /* DRs in different loops never belong to the same group. */
2743 loop_p loopa = gimple_bb (DR_STMT (dra))->loop_father;
2744 loop_p loopb = gimple_bb (DR_STMT (drb))->loop_father;
2745 if (loopa != loopb)
2746 return loopa->num < loopb->num ? -1 : 1;
2747
2748 /* Ordering of DRs according to base. */
2749 cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2750 DR_BASE_ADDRESS (drb));
2751 if (cmp != 0)
2752 return cmp;
2753
2754 /* And according to DR_OFFSET. */
2755 cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2756 if (cmp != 0)
2757 return cmp;
2758
2759 /* Put reads before writes. */
2760 if (DR_IS_READ (dra) != DR_IS_READ (drb))
2761 return DR_IS_READ (dra) ? -1 : 1;
2762
2763 /* Then sort by access size. */
2764 cmp = data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2765 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2766 if (cmp != 0)
2767 return cmp;
2768
2769 /* And by step. */
2770 cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
2771 if (cmp != 0)
2772 return cmp;
2773
2774 /* Then sort by DR_INIT. In case of identical DRs sort by stmt UID. */
2775 cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
2776 if (cmp == 0)
2777 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2778 return cmp;
2779 }
2780
2781 /* Function vect_analyze_data_ref_accesses.
2782
2783 Analyze the access pattern of all the data references in the loop.
2784
2785 FORNOW: the only access pattern that is considered vectorizable is a
2786 simple step 1 (consecutive) access.
2787
2788 FORNOW: handle only arrays and pointer accesses. */
2789
2790 bool
2791 vect_analyze_data_ref_accesses (vec_info *vinfo)
2792 {
2793 unsigned int i;
2794 vec<data_reference_p> datarefs = vinfo->datarefs;
2795 struct data_reference *dr;
2796
2797 if (dump_enabled_p ())
2798 dump_printf_loc (MSG_NOTE, vect_location,
2799 "=== vect_analyze_data_ref_accesses ===\n");
2800
2801 if (datarefs.is_empty ())
2802 return true;
2803
2804 /* Sort the array of datarefs to make building the interleaving chains
2805 linear. Don't modify the original vector's order, it is needed for
2806 determining what dependencies are reversed. */
2807 vec<data_reference_p> datarefs_copy = datarefs.copy ();
2808 datarefs_copy.qsort (dr_group_sort_cmp);
2809
2810 /* Build the interleaving chains. */
2811 for (i = 0; i < datarefs_copy.length () - 1;)
2812 {
2813 data_reference_p dra = datarefs_copy[i];
2814 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2815 stmt_vec_info lastinfo = NULL;
2816 if (! STMT_VINFO_VECTORIZABLE (stmtinfo_a))
2817 {
2818 ++i;
2819 continue;
2820 }
2821 for (i = i + 1; i < datarefs_copy.length (); ++i)
2822 {
2823 data_reference_p drb = datarefs_copy[i];
2824 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2825 if (! STMT_VINFO_VECTORIZABLE (stmtinfo_b))
2826 break;
2827
2828 /* ??? Imperfect sorting (non-compatible types, non-modulo
2829 accesses, same accesses) can lead to a group being artificially
2830 split here, as we don't just skip over those. If it really
2831 matters we can push those to a worklist and re-iterate
2832 over them. Then we can just skip ahead to the next DR here. */
2833
2834 /* DRs in a different loop should not be put into the same
2835 interleaving group. */
2836 if (gimple_bb (DR_STMT (dra))->loop_father
2837 != gimple_bb (DR_STMT (drb))->loop_father)
2838 break;
2839
2840 /* Check that the data-refs have same first location (except init)
2841 and they are both either store or load (not load and store,
2842 not masked loads or stores). */
2843 if (DR_IS_READ (dra) != DR_IS_READ (drb)
2844 || data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2845 DR_BASE_ADDRESS (drb)) != 0
2846 || data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb)) != 0
2847 || !gimple_assign_single_p (DR_STMT (dra))
2848 || !gimple_assign_single_p (DR_STMT (drb)))
2849 break;
2850
2851 /* Check that the data-refs have the same constant size. */
2852 tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
2853 tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
2854 if (!tree_fits_uhwi_p (sza)
2855 || !tree_fits_uhwi_p (szb)
2856 || !tree_int_cst_equal (sza, szb))
2857 break;
2858
2859 /* Check that the data-refs have the same step. */
2860 if (data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb)) != 0)
2861 break;
2862
2863 /* Check the types are compatible.
2864 ??? We don't distinguish this during sorting. */
2865 if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
2866 TREE_TYPE (DR_REF (drb))))
2867 break;
2868
2869 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
2870 HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
2871 HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
2872 HOST_WIDE_INT init_prev
2873 = TREE_INT_CST_LOW (DR_INIT (datarefs_copy[i-1]));
2874 gcc_assert (init_a <= init_b
2875 && init_a <= init_prev
2876 && init_prev <= init_b);
2877
2878 /* Do not place the same access in the interleaving chain twice. */
2879 if (init_b == init_prev)
2880 {
2881 gcc_assert (gimple_uid (DR_STMT (datarefs_copy[i-1]))
2882 < gimple_uid (DR_STMT (drb)));
2883 /* ??? For now we simply "drop" the later reference which is
2884 otherwise the same rather than finishing off this group.
2885 In the end we'd want to re-process duplicates forming
2886 multiple groups from the refs, likely by just collecting
2887 all candidates (including duplicates and split points
2888 below) in a vector and then process them together. */
2889 continue;
2890 }
2891
2892 /* If init_b == init_a + the size of the type * k, we have an
2893 interleaving, and DRA is accessed before DRB. */
2894 HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
2895 if (type_size_a == 0
2896 || (init_b - init_a) % type_size_a != 0)
2897 break;
2898
2899 /* For stores, require that the accesses be adjacent. This splits
2900 groups into chunks we support (we don't support vectorization
2901 of stores with gaps). */
2902 if (!DR_IS_READ (dra) && init_b - init_prev != type_size_a)
2903 break;
2904
2905 /* If the (constant, nonzero) step is no greater than the difference
2906 between the data-refs' inits, the refs cannot be members of the
2907 same group, so split groups here. */
2908 if (tree_fits_shwi_p (DR_STEP (dra)))
2909 {
2910 HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra));
2911 if (step != 0 && step <= (init_b - init_a))
2912 break;
2913 }
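/* At this point DRA and DRB will be linked into one group.  Illustrative
   example (assumed layout): for stores to p[0], p[1] and p[3] of a 4-byte
   type the inits are 0, 4 and 12; p[1] joins p[0]'s group because
   4 - 0 equals the type size, but p[3] is cut off by the store check
   above since 12 - 4 != 4 (stores with gaps are not vectorized).  */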
2914
2915 if (dump_enabled_p ())
2916 {
2917 dump_printf_loc (MSG_NOTE, vect_location,
2918 "Detected interleaving ");
2919 if (DR_IS_READ (dra))
2920 dump_printf (MSG_NOTE, "load ");
2921 else
2922 dump_printf (MSG_NOTE, "store ");
2923 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2924 dump_printf (MSG_NOTE, " and ");
2925 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2926 dump_printf (MSG_NOTE, "\n");
2927 }
2928
2929 /* Link the found element into the group list. */
2930 if (!GROUP_FIRST_ELEMENT (stmtinfo_a))
2931 {
2932 GROUP_FIRST_ELEMENT (stmtinfo_a) = DR_STMT (dra);
2933 lastinfo = stmtinfo_a;
2934 }
2935 GROUP_FIRST_ELEMENT (stmtinfo_b) = DR_STMT (dra);
2936 GROUP_NEXT_ELEMENT (lastinfo) = DR_STMT (drb);
2937 lastinfo = stmtinfo_b;
2938 }
2939 }
2940
2941 FOR_EACH_VEC_ELT (datarefs_copy, i, dr)
2942 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
2943 && !vect_analyze_data_ref_access (dr))
2944 {
2945 if (dump_enabled_p ())
2946 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2947 "not vectorized: complicated access pattern.\n");
2948
2949 if (is_a <bb_vec_info> (vinfo))
2950 {
2951 /* Mark the statement as not vectorizable. */
2952 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2953 continue;
2954 }
2955 else
2956 {
2957 datarefs_copy.release ();
2958 return false;
2959 }
2960 }
2961
2962 datarefs_copy.release ();
2963 return true;
2964 }
2965
2966 /* Function vect_vfa_segment_size.
2967
2968 Create an expression that computes the size of the segment
2969 that will be accessed for a data reference. The function takes into
2970 account that realignment loads may access one more vector.
2971
2972 Input:
2973 DR: The data reference.
2974 LENGTH_FACTOR: segment length to consider.
2975
2976 Return an expression whose value is the size of segment which will be
2977 accessed by DR. */
2978
2979 static tree
2980 vect_vfa_segment_size (struct data_reference *dr, tree length_factor)
2981 {
2982 tree segment_length;
2983
2984 if (integer_zerop (DR_STEP (dr)))
2985 segment_length = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
2986 else
2987 segment_length = size_binop (MULT_EXPR,
2988 fold_convert (sizetype, DR_STEP (dr)),
2989 fold_convert (sizetype, length_factor));
2990
2991 if (vect_supportable_dr_alignment (dr, false)
2992 == dr_explicit_realign_optimized)
2993 {
2994 tree vector_size = TYPE_SIZE_UNIT
2995 (STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr))));
2996
2997 segment_length = size_binop (PLUS_EXPR, segment_length, vector_size);
2998 }
2999 return segment_length;
3000 }
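/* For example (illustrative numbers): a data reference with DR_STEP == 4
   and LENGTH_FACTOR == 8 gets a 32-byte segment, an invariant reference
   (zero step) just covers its own TYPE_SIZE_UNIT, and an access using
   dr_explicit_realign_optimized gets one extra vector size added on
   top.  */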
3001
3002 /* Function vect_compile_time_alias.
3003
3004 Given data references A and B with equal base and offset, see whether
3005 the alias relation can be decided at compilation time. Return 1 if
3006 it can and the references alias, 0 if it can and the references do
3007 not alias, and -1 if we cannot decide at compile time. SEGMENT_LENGTH_A
3008 and SEGMENT_LENGTH_B are the memory lengths accessed by A and B
3009 respectively. */
3010
3011 static int
3012 vect_compile_time_alias (struct data_reference *a, struct data_reference *b,
3013 tree segment_length_a, tree segment_length_b)
3014 {
3015 poly_offset_int offset_a = wi::to_poly_offset (DR_INIT (a));
3016 poly_offset_int offset_b = wi::to_poly_offset (DR_INIT (b));
3017 poly_uint64 const_length_a;
3018 poly_uint64 const_length_b;
3019
3020 /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
3021 bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
3022 [a, a+12) */
3023 if (tree_int_cst_compare (DR_STEP (a), size_zero_node) < 0)
3024 {
3025 const_length_a = (-wi::to_poly_wide (segment_length_a)).force_uhwi ();
3026 offset_a = (offset_a + vect_get_scalar_dr_size (a)) - const_length_a;
3027 }
3028 else
3029 const_length_a = tree_to_poly_uint64 (segment_length_a);
3030 if (tree_int_cst_compare (DR_STEP (b), size_zero_node) < 0)
3031 {
3032 const_length_b = (-wi::to_poly_wide (segment_length_b)).force_uhwi ();
3033 offset_b = (offset_b + vect_get_scalar_dr_size (b)) - const_length_b;
3034 }
3035 else
3036 const_length_b = tree_to_poly_uint64 (segment_length_b);
3037
3038 if (ranges_known_overlap_p (offset_a, const_length_a,
3039 offset_b, const_length_b))
3040 return 1;
3041
3042 if (!ranges_maybe_overlap_p (offset_a, const_length_a,
3043 offset_b, const_length_b))
3044 return 0;
3045
3046 return -1;
3047 }
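/* Illustrative use of the range test above (offsets assumed): two
   positive-step references with DR_INIT 0 and 16 and 16-byte segment
   lengths cover [0, 16) and [16, 32), so the function returns 0 (no
   alias); if the second init were 8, the ranges [0, 16) and [8, 24)
   overlap and it returns 1, which the caller treats as a compile-time
   proven alias.  */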
3048
3049 /* Return true if the minimum nonzero dependence distance for loop LOOP_DEPTH
3050 in DDR is >= VF. */
3051
3052 static bool
3053 dependence_distance_ge_vf (data_dependence_relation *ddr,
3054 unsigned int loop_depth, poly_uint64 vf)
3055 {
3056 if (DDR_ARE_DEPENDENT (ddr) != NULL_TREE
3057 || DDR_NUM_DIST_VECTS (ddr) == 0)
3058 return false;
3059
3060 /* If the dependence is exact, we should have limited the VF instead. */
3061 gcc_checking_assert (DDR_COULD_BE_INDEPENDENT_P (ddr));
3062
3063 unsigned int i;
3064 lambda_vector dist_v;
3065 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
3066 {
3067 HOST_WIDE_INT dist = dist_v[loop_depth];
3068 if (dist != 0
3069 && !(dist > 0 && DDR_REVERSED_P (ddr))
3070 && maybe_lt ((unsigned HOST_WIDE_INT) abs_hwi (dist), vf))
3071 return false;
3072 }
3073
3074 if (dump_enabled_p ())
3075 {
3076 dump_printf_loc (MSG_NOTE, vect_location,
3077 "dependence distance between ");
3078 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr)));
3079 dump_printf (MSG_NOTE, " and ");
3080 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr)));
3081 dump_printf (MSG_NOTE, " is >= VF\n");
3082 }
3083
3084 return true;
3085 }
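/* E.g. (illustrative): a DDR whose only distance-vector entry at
   LOOP_DEPTH is 4 passes this test for VF <= 4, so its runtime alias
   check can be dropped, but fails it for VF == 8, where scalar
   iterations 0 and 4 would land in the same vector iteration.  */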
3086
3087 /* Function vect_prune_runtime_alias_test_list.
3088
3089 Prune a list of ddrs to be tested at run-time by versioning for alias.
3090 Merge several alias checks into one if possible.
3091 Return FALSE if the resulting list of ddrs is longer than allowed by
3092 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
3093
3094 bool
3095 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
3096 {
3097 typedef pair_hash <tree_operand_hash, tree_operand_hash> tree_pair_hash;
3098 hash_set <tree_pair_hash> compared_objects;
3099
3100 vec<ddr_p> may_alias_ddrs = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
3101 vec<dr_with_seg_len_pair_t> &comp_alias_ddrs
3102 = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3103 vec<vec_object_pair> &check_unequal_addrs
3104 = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3105 poly_uint64 vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3106 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
3107
3108 ddr_p ddr;
3109 unsigned int i;
3110 tree length_factor;
3111
3112 if (dump_enabled_p ())
3113 dump_printf_loc (MSG_NOTE, vect_location,
3114 "=== vect_prune_runtime_alias_test_list ===\n");
3115
3116 if (may_alias_ddrs.is_empty ())
3117 return true;
3118
3119 comp_alias_ddrs.create (may_alias_ddrs.length ());
3120
3121 unsigned int loop_depth
3122 = index_in_loop_nest (LOOP_VINFO_LOOP (loop_vinfo)->num,
3123 LOOP_VINFO_LOOP_NEST (loop_vinfo));
3124
3125 /* First, we collect all data ref pairs for aliasing checks. */
3126 FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
3127 {
3128 int comp_res;
3129 struct data_reference *dr_a, *dr_b;
3130 gimple *dr_group_first_a, *dr_group_first_b;
3131 tree segment_length_a, segment_length_b;
3132 gimple *stmt_a, *stmt_b;
3133
3134 /* Ignore the alias if the VF we chose ended up being no greater
3135 than the dependence distance. */
3136 if (dependence_distance_ge_vf (ddr, loop_depth, vect_factor))
3137 continue;
3138
3139 if (DDR_OBJECT_A (ddr))
3140 {
3141 vec_object_pair new_pair (DDR_OBJECT_A (ddr), DDR_OBJECT_B (ddr));
3142 if (!compared_objects.add (new_pair))
3143 {
3144 if (dump_enabled_p ())
3145 {
3146 dump_printf_loc (MSG_NOTE, vect_location, "checking that ");
3147 dump_generic_expr (MSG_NOTE, TDF_SLIM, new_pair.first);
3148 dump_printf (MSG_NOTE, " and ");
3149 dump_generic_expr (MSG_NOTE, TDF_SLIM, new_pair.second);
3150 dump_printf (MSG_NOTE, " have different addresses\n");
3151 }
3152 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).safe_push (new_pair);
3153 }
3154 continue;
3155 }
3156
3157 dr_a = DDR_A (ddr);
3158 stmt_a = DR_STMT (DDR_A (ddr));
3159 dr_group_first_a = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_a));
3160 if (dr_group_first_a)
3161 {
3162 stmt_a = dr_group_first_a;
3163 dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a));
3164 }
3165
3166 dr_b = DDR_B (ddr);
3167 stmt_b = DR_STMT (DDR_B (ddr));
3168 dr_group_first_b = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_b));
3169 if (dr_group_first_b)
3170 {
3171 stmt_b = dr_group_first_b;
3172 dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b));
3173 }
3174
3175 if (!operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0))
3176 length_factor = scalar_loop_iters;
3177 else
3178 length_factor = size_int (vect_factor);
3179 segment_length_a = vect_vfa_segment_size (dr_a, length_factor);
3180 segment_length_b = vect_vfa_segment_size (dr_b, length_factor);
3181
3182 comp_res = data_ref_compare_tree (DR_BASE_ADDRESS (dr_a),
3183 DR_BASE_ADDRESS (dr_b));
3184 if (comp_res == 0)
3185 comp_res = data_ref_compare_tree (DR_OFFSET (dr_a),
3186 DR_OFFSET (dr_b));
3187
3188 /* See whether the alias is known at compilation time. */
3189 if (comp_res == 0
3190 && TREE_CODE (DR_STEP (dr_a)) == INTEGER_CST
3191 && TREE_CODE (DR_STEP (dr_b)) == INTEGER_CST
3192 && poly_int_tree_p (segment_length_a)
3193 && poly_int_tree_p (segment_length_b))
3194 {
3195 int res = vect_compile_time_alias (dr_a, dr_b,
3196 segment_length_a,
3197 segment_length_b);
3198 if (res == 0)
3199 continue;
3200
3201 if (res == 1)
3202 {
3203 if (dump_enabled_p ())
3204 dump_printf_loc (MSG_NOTE, vect_location,
3205 "not vectorized: compilation time alias.\n");
3206 return false;
3207 }
3208 }
3209
3210 dr_with_seg_len_pair_t dr_with_seg_len_pair
3211 (dr_with_seg_len (dr_a, segment_length_a),
3212 dr_with_seg_len (dr_b, segment_length_b));
3213
3214 /* Canonicalize pairs by sorting the two DR members. */
3215 if (comp_res > 0)
3216 std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
3217
3218 comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
3219 }
3220
3221 prune_runtime_alias_test_list (&comp_alias_ddrs, vect_factor);
3222
3223 unsigned int count = (comp_alias_ddrs.length ()
3224 + check_unequal_addrs.length ());
3225 dump_printf_loc (MSG_NOTE, vect_location,
3226 "improved number of alias checks from %d to %d\n",
3227 may_alias_ddrs.length (), count);
3228 if ((int) count > PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
3229 {
3230 if (dump_enabled_p ())
3231 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3232 "number of versioning for alias "
3233 "run-time tests exceeds %d "
3234 "(--param vect-max-version-for-alias-checks)\n",
3235 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
3236 return false;
3237 }
3238
3239 return true;
3240 }
3241
3242 /* Return true if a non-affine read or write in STMT is suitable for a
3243 gather load or scatter store. Describe the operation in *INFO if so. */
3244
3245 bool
3246 vect_check_gather_scatter (gimple *stmt, loop_vec_info loop_vinfo,
3247 gather_scatter_info *info)
3248 {
3249 HOST_WIDE_INT scale = 1;
3250 poly_int64 pbitpos, pbitsize;
3251 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3252 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3253 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3254 tree offtype = NULL_TREE;
3255 tree decl, base, off;
3256 machine_mode pmode;
3257 int punsignedp, reversep, pvolatilep = 0;
3258
3259 base = DR_REF (dr);
3260 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
3261 see if we can use the def stmt of the address. */
3262 if (is_gimple_call (stmt)
3263 && gimple_call_internal_p (stmt)
3264 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
3265 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
3266 && TREE_CODE (base) == MEM_REF
3267 && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
3268 && integer_zerop (TREE_OPERAND (base, 1))
3269 && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
3270 {
3271 gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
3272 if (is_gimple_assign (def_stmt)
3273 && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
3274 base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
3275 }
3276
3277 /* The gather and scatter builtins need an address of the form
3278 loop_invariant + vector * {1, 2, 4, 8}
3279 or
3280 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3281 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3282 of loop invariants/SSA_NAMEs defined in the loop, with casts,
3283 multiplications and additions in it. To get a vector, we need
3284 a single SSA_NAME that will be defined in the loop and will
3285 contain everything that is not loop invariant and that can be
3286 vectorized. The following code attempts to find such a preexisting
3287 SSA_NAME OFF and put the loop invariants into a tree BASE
3288 that can be gimplified before the loop. */
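/* For instance (illustrative, not derived from the code below): for a
   gather-like read  x = a[idx[i]]  with 4-byte elements the decomposition
   typically ends up with BASE == &a plus any constant byte offset, OFF
   being the SSA_NAME holding idx[i] (possibly after a widening
   conversion), and SCALE == 4, matching the addressing form above.  */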
3289 base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
3290 &punsignedp, &reversep, &pvolatilep);
3291 gcc_assert (base && !reversep);
3292 poly_int64 pbytepos = exact_div (pbitpos, BITS_PER_UNIT);
3293
3294 if (TREE_CODE (base) == MEM_REF)
3295 {
3296 if (!integer_zerop (TREE_OPERAND (base, 1)))
3297 {
3298 if (off == NULL_TREE)
3299 off = wide_int_to_tree (sizetype, mem_ref_offset (base));
3300 else
3301 off = size_binop (PLUS_EXPR, off,
3302 fold_convert (sizetype, TREE_OPERAND (base, 1)));
3303 }
3304 base = TREE_OPERAND (base, 0);
3305 }
3306 else
3307 base = build_fold_addr_expr (base);
3308
3309 if (off == NULL_TREE)
3310 off = size_zero_node;
3311
3312 /* If base is not loop invariant, then either off is 0, in which case we
3313 start with just the constant offset in the loop-invariant BASE and
3314 continue with base as OFF, or we give up.
3315 We could handle the latter case by gimplifying the addition of base + off
3316 into some SSA_NAME and using that as off, but for now punt. */
3317 if (!expr_invariant_in_loop_p (loop, base))
3318 {
3319 if (!integer_zerop (off))
3320 return false;
3321 off = base;
3322 base = size_int (pbytepos);
3323 }
3324 /* Otherwise put base + constant offset into the loop invariant BASE
3325 and continue with OFF. */
3326 else
3327 {
3328 base = fold_convert (sizetype, base);
3329 base = size_binop (PLUS_EXPR, base, size_int (pbytepos));
3330 }
3331
3332 /* OFF at this point may be either a SSA_NAME or some tree expression
3333 from get_inner_reference. Try to peel off loop invariants from it
3334 into BASE as long as possible. */
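  /* For example (purely illustrative): if OFF at this point is
     ((sizetype) n_10 + 16) * 8, with n_10 defined inside the loop, the
     loop below first records SCALE = 8 (the MULT_EXPR case), then folds
     the invariant 16 * 8 = 128 into BASE (the PLUS_EXPR case), strips the
     conversion, and is left with OFF = n_10.  */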
3335 STRIP_NOPS (off);
3336 while (offtype == NULL_TREE)
3337 {
3338 enum tree_code code;
3339 tree op0, op1, add = NULL_TREE;
3340
3341 if (TREE_CODE (off) == SSA_NAME)
3342 {
3343 gimple *def_stmt = SSA_NAME_DEF_STMT (off);
3344
3345 if (expr_invariant_in_loop_p (loop, off))
3346 return false;
3347
3348 if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
3349 break;
3350
3351 op0 = gimple_assign_rhs1 (def_stmt);
3352 code = gimple_assign_rhs_code (def_stmt);
3353 op1 = gimple_assign_rhs2 (def_stmt);
3354 }
3355 else
3356 {
3357 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
3358 return false;
3359 code = TREE_CODE (off);
3360 extract_ops_from_tree (off, &code, &op0, &op1);
3361 }
3362 switch (code)
3363 {
3364 case POINTER_PLUS_EXPR:
3365 case PLUS_EXPR:
3366 if (expr_invariant_in_loop_p (loop, op0))
3367 {
3368 add = op0;
3369 off = op1;
3370 do_add:
3371 add = fold_convert (sizetype, add);
3372 if (scale != 1)
3373 add = size_binop (MULT_EXPR, add, size_int (scale));
3374 base = size_binop (PLUS_EXPR, base, add);
3375 continue;
3376 }
3377 if (expr_invariant_in_loop_p (loop, op1))
3378 {
3379 add = op1;
3380 off = op0;
3381 goto do_add;
3382 }
3383 break;
3384 case MINUS_EXPR:
3385 if (expr_invariant_in_loop_p (loop, op1))
3386 {
3387 add = fold_convert (sizetype, op1);
3388 add = size_binop (MINUS_EXPR, size_zero_node, add);
3389 off = op0;
3390 goto do_add;
3391 }
3392 break;
3393 case MULT_EXPR:
3394 if (scale == 1 && tree_fits_shwi_p (op1))
3395 {
3396 scale = tree_to_shwi (op1);
3397 off = op0;
3398 continue;
3399 }
3400 break;
3401 case SSA_NAME:
3402 off = op0;
3403 continue;
3404 CASE_CONVERT:
3405 if (!POINTER_TYPE_P (TREE_TYPE (op0))
3406 && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3407 break;
3408 if (TYPE_PRECISION (TREE_TYPE (op0))
3409 == TYPE_PRECISION (TREE_TYPE (off)))
3410 {
3411 off = op0;
3412 continue;
3413 }
3414 if (TYPE_PRECISION (TREE_TYPE (op0))
3415 < TYPE_PRECISION (TREE_TYPE (off)))
3416 {
3417 off = op0;
3418 offtype = TREE_TYPE (off);
3419 STRIP_NOPS (off);
3420 continue;
3421 }
3422 break;
3423 default:
3424 break;
3425 }
3426 break;
3427 }
3428
3429 /* If at the end OFF still isn't a SSA_NAME or isn't
3430 defined in the loop, punt. */
3431 if (TREE_CODE (off) != SSA_NAME
3432 || expr_invariant_in_loop_p (loop, off))
3433 return false;
3434
3435 if (offtype == NULL_TREE)
3436 offtype = TREE_TYPE (off);
3437
3438 if (DR_IS_READ (dr))
3439 decl = targetm.vectorize.builtin_gather (STMT_VINFO_VECTYPE (stmt_info),
3440 offtype, scale);
3441 else
3442 decl = targetm.vectorize.builtin_scatter (STMT_VINFO_VECTYPE (stmt_info),
3443 offtype, scale);
3444
3445 if (decl == NULL_TREE)
3446 return false;
3447
3448 info->decl = decl;
3449 info->base = base;
3450 info->offset = off;
3451 info->offset_dt = vect_unknown_def_type;
3452 info->offset_vectype = NULL_TREE;
3453 info->scale = scale;
3454 return true;
3455 }
3456
3457 /* Function vect_analyze_data_refs.
3458
3459 Find all the data references in the loop or basic block.
3460
3461 The general structure of the analysis of data refs in the vectorizer is as
3462 follows:
3463 1- vect_analyze_data_refs(loop/bb): call
3464 compute_data_dependences_for_loop/bb to find and analyze all data-refs
3465 in the loop/bb and their dependences.
3466 2- vect_analyze_data_ref_dependences(): apply dependence testing using ddrs.
3467 3- vect_analyze_data_refs_alignment(): check that ref_stmt.alignment is ok.
3468 4- vect_analyze_data_ref_accesses(): check that ref_stmt.step is ok.
3469
3470 */
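/* As a hypothetical illustration of the decomposition checked below: for a
   data reference like

     int *a;
     for (i = 0; i < n; i++)
       ... = a[i + 4];

   the data-ref analysis would typically record DR_BASE_ADDRESS = a,
   DR_OFFSET = 0, DR_INIT = 16 (the constant byte offset) and DR_STEP = 4
   (bytes advanced per iteration).  */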
3471
3472 bool
3473 vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf)
3474 {
3475 struct loop *loop = NULL;
3476 unsigned int i;
3477 struct data_reference *dr;
3478 tree scalar_type;
3479
3480 if (dump_enabled_p ())
3481 dump_printf_loc (MSG_NOTE, vect_location,
3482 "=== vect_analyze_data_refs ===\n");
3483
3484 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
3485 loop = LOOP_VINFO_LOOP (loop_vinfo);
3486
3487 /* Go through the data-refs, check that the analysis succeeded. Update
3488 pointer from stmt_vec_info struct to DR and vectype. */
3489
3490 vec<data_reference_p> datarefs = vinfo->datarefs;
3491 FOR_EACH_VEC_ELT (datarefs, i, dr)
3492 {
3493 gimple *stmt;
3494 stmt_vec_info stmt_info;
3495 tree base, offset, init;
3496 enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
3497 bool simd_lane_access = false;
3498 poly_uint64 vf;
3499
3500 again:
3501 if (!dr || !DR_REF (dr))
3502 {
3503 if (dump_enabled_p ())
3504 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3505 "not vectorized: unhandled data-ref\n");
3506 return false;
3507 }
3508
3509 stmt = DR_STMT (dr);
3510 stmt_info = vinfo_for_stmt (stmt);
3511
3512 /* Discard clobbers from the dataref vector. We will remove
3513 clobber stmts during vectorization. */
3514 if (gimple_clobber_p (stmt))
3515 {
3516 free_data_ref (dr);
3517 if (i == datarefs.length () - 1)
3518 {
3519 datarefs.pop ();
3520 break;
3521 }
3522 datarefs.ordered_remove (i);
3523 dr = datarefs[i];
3524 goto again;
3525 }
3526
3527 /* Check that analysis of the data-ref succeeded. */
3528 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
3529 || !DR_STEP (dr))
3530 {
3531 bool maybe_gather
3532 = DR_IS_READ (dr)
3533 && !TREE_THIS_VOLATILE (DR_REF (dr))
3534 && targetm.vectorize.builtin_gather != NULL;
3535 bool maybe_scatter
3536 = DR_IS_WRITE (dr)
3537 && !TREE_THIS_VOLATILE (DR_REF (dr))
3538 && targetm.vectorize.builtin_scatter != NULL;
3539 bool maybe_simd_lane_access
3540 = is_a <loop_vec_info> (vinfo) && loop->simduid;
3541
3542 /* If the target supports vector gather loads or scatter stores, or if
3543 this might be a SIMD lane access, see whether they can be used. */
3544 if (is_a <loop_vec_info> (vinfo)
3545 && (maybe_gather || maybe_scatter || maybe_simd_lane_access)
3546 && !nested_in_vect_loop_p (loop, stmt))
3547 {
3548 struct data_reference *newdr
3549 = create_data_ref (NULL, loop_containing_stmt (stmt),
3550 DR_REF (dr), stmt, !maybe_scatter,
3551 DR_IS_CONDITIONAL_IN_STMT (dr));
3552 gcc_assert (newdr != NULL && DR_REF (newdr));
3553 if (DR_BASE_ADDRESS (newdr)
3554 && DR_OFFSET (newdr)
3555 && DR_INIT (newdr)
3556 && DR_STEP (newdr)
3557 && integer_zerop (DR_STEP (newdr)))
3558 {
3559 if (maybe_simd_lane_access)
3560 {
3561 tree off = DR_OFFSET (newdr);
3562 STRIP_NOPS (off);
3563 if (TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
3564 && TREE_CODE (off) == MULT_EXPR
3565 && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
3566 {
3567 tree step = TREE_OPERAND (off, 1);
3568 off = TREE_OPERAND (off, 0);
3569 STRIP_NOPS (off);
3570 if (CONVERT_EXPR_P (off)
3571 && TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off,
3572 0)))
3573 < TYPE_PRECISION (TREE_TYPE (off)))
3574 off = TREE_OPERAND (off, 0);
3575 if (TREE_CODE (off) == SSA_NAME)
3576 {
3577 gimple *def = SSA_NAME_DEF_STMT (off);
3578 tree reft = TREE_TYPE (DR_REF (newdr));
3579 if (is_gimple_call (def)
3580 && gimple_call_internal_p (def)
3581 && (gimple_call_internal_fn (def)
3582 == IFN_GOMP_SIMD_LANE))
3583 {
3584 tree arg = gimple_call_arg (def, 0);
3585 gcc_assert (TREE_CODE (arg) == SSA_NAME);
3586 arg = SSA_NAME_VAR (arg);
3587 if (arg == loop->simduid
3588 /* For now. */
3589 && tree_int_cst_equal
3590 (TYPE_SIZE_UNIT (reft),
3591 step))
3592 {
3593 DR_OFFSET (newdr) = ssize_int (0);
3594 DR_STEP (newdr) = step;
3595 DR_OFFSET_ALIGNMENT (newdr)
3596 = BIGGEST_ALIGNMENT;
3597 DR_STEP_ALIGNMENT (newdr)
3598 = highest_pow2_factor (step);
3599 dr = newdr;
3600 simd_lane_access = true;
3601 }
3602 }
3603 }
3604 }
3605 }
3606 if (!simd_lane_access && (maybe_gather || maybe_scatter))
3607 {
3608 dr = newdr;
3609 if (maybe_gather)
3610 gatherscatter = GATHER;
3611 else
3612 gatherscatter = SCATTER;
3613 }
3614 }
3615 if (gatherscatter == SG_NONE && !simd_lane_access)
3616 free_data_ref (newdr);
3617 }
3618
3619 if (gatherscatter == SG_NONE && !simd_lane_access)
3620 {
3621 if (dump_enabled_p ())
3622 {
3623 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3624 "not vectorized: data ref analysis "
3625 "failed ");
3626 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3627 }
3628
3629 if (is_a <bb_vec_info> (vinfo))
3630 break;
3631
3632 return false;
3633 }
3634 }
3635
3636 if (TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
3637 {
3638 if (dump_enabled_p ())
3639 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3640 "not vectorized: base addr of dr is a "
3641 "constant\n");
3642
3643 if (is_a <bb_vec_info> (vinfo))
3644 break;
3645
3646 if (gatherscatter != SG_NONE || simd_lane_access)
3647 free_data_ref (dr);
3648 return false;
3649 }
3650
3651 if (TREE_THIS_VOLATILE (DR_REF (dr)))
3652 {
3653 if (dump_enabled_p ())
3654 {
3655 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3656 "not vectorized: volatile type ");
3657 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3658 }
3659
3660 if (is_a <bb_vec_info> (vinfo))
3661 break;
3662
3663 return false;
3664 }
3665
3666 if (stmt_can_throw_internal (stmt))
3667 {
3668 if (dump_enabled_p ())
3669 {
3670 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3671 "not vectorized: statement can throw an "
3672 "exception ");
3673 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3674 }
3675
3676 if (is_a <bb_vec_info> (vinfo))
3677 break;
3678
3679 if (gatherscatter != SG_NONE || simd_lane_access)
3680 free_data_ref (dr);
3681 return false;
3682 }
3683
3684 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
3685 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
3686 {
3687 if (dump_enabled_p ())
3688 {
3689 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3690 "not vectorized: statement is bitfield "
3691 "access ");
3692 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3693 }
3694
3695 if (is_a <bb_vec_info> (vinfo))
3696 break;
3697
3698 if (gatherscatter != SG_NONE || simd_lane_access)
3699 free_data_ref (dr);
3700 return false;
3701 }
3702
3703 base = unshare_expr (DR_BASE_ADDRESS (dr));
3704 offset = unshare_expr (DR_OFFSET (dr));
3705 init = unshare_expr (DR_INIT (dr));
3706
3707 if (is_gimple_call (stmt)
3708 && (!gimple_call_internal_p (stmt)
3709 || (gimple_call_internal_fn (stmt) != IFN_MASK_LOAD
3710 && gimple_call_internal_fn (stmt) != IFN_MASK_STORE)))
3711 {
3712 if (dump_enabled_p ())
3713 {
3714 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3715 "not vectorized: dr in a call ");
3716 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3717 }
3718
3719 if (is_a <bb_vec_info> (vinfo))
3720 break;
3721
3722 if (gatherscatter != SG_NONE || simd_lane_access)
3723 free_data_ref (dr);
3724 return false;
3725 }
3726
3727 /* Update DR field in stmt_vec_info struct. */
3728
3729 /* If the dataref is in an inner-loop of the loop that is considered
3730 for vectorization, we also want to analyze the access relative to
3731 the outer-loop (DR contains information only relative to the
3732 inner-most enclosing loop). We do that by building a reference to the
3733 first location accessed by the inner-loop, and analyze it relative to
3734 the outer-loop. */
3735 if (loop && nested_in_vect_loop_p (loop, stmt))
3736 {
3737 /* Build a reference to the first location accessed by the
3738 inner loop: *(BASE + INIT + OFFSET). By construction,
3739 this address must be invariant in the inner loop, so we
3740 can consider it as being used in the outer loop. */
3741 tree init_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset),
3742 init, offset);
3743 tree init_addr = fold_build_pointer_plus (base, init_offset);
3744 tree init_ref = build_fold_indirect_ref (init_addr);
3745
3746 if (dump_enabled_p ())
3747 {
3748 dump_printf_loc (MSG_NOTE, vect_location,
3749 "analyze in outer loop: ");
3750 dump_generic_expr (MSG_NOTE, TDF_SLIM, init_ref);
3751 dump_printf (MSG_NOTE, "\n");
3752 }
3753
3754 if (!dr_analyze_innermost (&STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info),
3755 init_ref, loop))
3756 /* dr_analyze_innermost already explained the failure. */
3757 return false;
3758
3759 if (dump_enabled_p ())
3760 {
3761 dump_printf_loc (MSG_NOTE, vect_location,
3762 "\touter base_address: ");
3763 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3764 STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
3765 dump_printf (MSG_NOTE, "\n\touter offset from base address: ");
3766 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3767 STMT_VINFO_DR_OFFSET (stmt_info));
3768 dump_printf (MSG_NOTE,
3769 "\n\touter constant offset from base address: ");
3770 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3771 STMT_VINFO_DR_INIT (stmt_info));
3772 dump_printf (MSG_NOTE, "\n\touter step: ");
3773 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3774 STMT_VINFO_DR_STEP (stmt_info));
3775 dump_printf (MSG_NOTE, "\n\touter base alignment: %d\n",
3776 STMT_VINFO_DR_BASE_ALIGNMENT (stmt_info));
3777 dump_printf (MSG_NOTE, "\n\touter base misalignment: %d\n",
3778 STMT_VINFO_DR_BASE_MISALIGNMENT (stmt_info));
3779 dump_printf (MSG_NOTE, "\n\touter offset alignment: %d\n",
3780 STMT_VINFO_DR_OFFSET_ALIGNMENT (stmt_info));
3781 dump_printf (MSG_NOTE, "\n\touter step alignment: %d\n",
3782 STMT_VINFO_DR_STEP_ALIGNMENT (stmt_info));
3783 }
3784 }
3785
3786 if (STMT_VINFO_DATA_REF (stmt_info))
3787 {
3788 if (dump_enabled_p ())
3789 {
3790 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3791 "not vectorized: more than one data ref "
3792 "in stmt: ");
3793 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3794 }
3795
3796 if (is_a <bb_vec_info> (vinfo))
3797 break;
3798
3799 if (gatherscatter != SG_NONE || simd_lane_access)
3800 free_data_ref (dr);
3801 return false;
3802 }
3803
3804 STMT_VINFO_DATA_REF (stmt_info) = dr;
3805 if (simd_lane_access)
3806 {
3807 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) = true;
3808 free_data_ref (datarefs[i]);
3809 datarefs[i] = dr;
3810 }
3811
3812 if (TREE_CODE (DR_BASE_ADDRESS (dr)) == ADDR_EXPR
3813 && VAR_P (TREE_OPERAND (DR_BASE_ADDRESS (dr), 0))
3814 && DECL_NONALIASED (TREE_OPERAND (DR_BASE_ADDRESS (dr), 0)))
3815 {
3816 if (dump_enabled_p ())
3817 {
3818 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3819 "not vectorized: base object not addressable "
3820 "for stmt: ");
3821 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3822 }
3823 if (is_a <bb_vec_info> (vinfo))
3824 {
3825 /* In BB vectorization the ref can still participate
3826 in dependence analysis, we just can't vectorize it. */
3827 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
3828 continue;
3829 }
3830 return false;
3831 }
3832
3833 /* Set vectype for STMT. */
3834 scalar_type = TREE_TYPE (DR_REF (dr));
3835 STMT_VINFO_VECTYPE (stmt_info)
3836 = get_vectype_for_scalar_type (scalar_type);
3837 if (!STMT_VINFO_VECTYPE (stmt_info))
3838 {
3839 if (dump_enabled_p ())
3840 {
3841 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3842 "not vectorized: no vectype for stmt: ");
3843 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3844 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
3845 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
3846 scalar_type);
3847 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3848 }
3849
3850 if (is_a <bb_vec_info> (vinfo))
3851 {
3852 /* No vector type is fine, the ref can still participate
3853 in dependence analysis, we just can't vectorize it. */
3854 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
3855 continue;
3856 }
3857
3858 if (gatherscatter != SG_NONE || simd_lane_access)
3859 {
3860 STMT_VINFO_DATA_REF (stmt_info) = NULL;
3861 if (gatherscatter != SG_NONE)
3862 free_data_ref (dr);
3863 }
3864 return false;
3865 }
3866 else
3867 {
3868 if (dump_enabled_p ())
3869 {
3870 dump_printf_loc (MSG_NOTE, vect_location,
3871 "got vectype for stmt: ");
3872 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
3873 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3874 STMT_VINFO_VECTYPE (stmt_info));
3875 dump_printf (MSG_NOTE, "\n");
3876 }
3877 }
3878
3879 /* Adjust the minimal vectorization factor according to the
3880 vector type. */
3881 vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
3882 *min_vf = upper_bound (*min_vf, vf);
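      /* For instance (hypothetically), on a target with 128-bit vectors a
         loop accessing both a double[] (V2DF, 2 subparts) and a float[]
         (V4SF, 4 subparts) ends up with *min_vf == 4 here.  */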
3883
3884 if (gatherscatter != SG_NONE)
3885 {
3886 gather_scatter_info gs_info;
3887 if (!vect_check_gather_scatter (stmt, as_a <loop_vec_info> (vinfo),
3888 &gs_info)
3889 || !get_vectype_for_scalar_type (TREE_TYPE (gs_info.offset)))
3890 {
3891 STMT_VINFO_DATA_REF (stmt_info) = NULL;
3892 free_data_ref (dr);
3893 if (dump_enabled_p ())
3894 {
3895 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3896 (gatherscatter == GATHER) ?
3897 "not vectorized: not suitable for gather "
3898 "load " :
3899 "not vectorized: not suitable for scatter "
3900 "store ");
3901 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3902 }
3903 return false;
3904 }
3905
3906 free_data_ref (datarefs[i]);
3907 datarefs[i] = dr;
3908 STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
3909 }
3910
3911 else if (is_a <loop_vec_info> (vinfo)
3912 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
3913 {
3914 if (nested_in_vect_loop_p (loop, stmt))
3915 {
3916 if (dump_enabled_p ())
3917 {
3918 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3919 "not vectorized: not suitable for strided "
3920 "load ");
3921 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3922 }
3923 return false;
3924 }
3925 STMT_VINFO_STRIDED_P (stmt_info) = true;
3926 }
3927 }
3928
3929 /* If we stopped analysis at the first dataref we could not analyze
3930 when trying to vectorize a basic-block, mark the rest of the datarefs
3931 as not vectorizable and truncate the vector of datarefs. That
3932 avoids spending useless time in analyzing their dependence. */
3933 if (i != datarefs.length ())
3934 {
3935 gcc_assert (is_a <bb_vec_info> (vinfo));
3936 for (unsigned j = i; j < datarefs.length (); ++j)
3937 {
3938 data_reference_p dr = datarefs[j];
3939 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
3940 free_data_ref (dr);
3941 }
3942 datarefs.truncate (i);
3943 }
3944
3945 return true;
3946 }
3947
3948
3949 /* Function vect_get_new_vect_var.
3950
3951 Returns a new temporary variable. The current naming scheme prepends
3952 the prefix "vect_", "stmp_", "mask_" or "vectp_" (depending on the value
3953 of VAR_KIND) to NAME, if NAME is provided, and otherwise uses the bare
3954 prefix as the name. */
3955
3956 tree
3957 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
3958 {
3959 const char *prefix;
3960 tree new_vect_var;
3961
3962 switch (var_kind)
3963 {
3964 case vect_simple_var:
3965 prefix = "vect";
3966 break;
3967 case vect_scalar_var:
3968 prefix = "stmp";
3969 break;
3970 case vect_mask_var:
3971 prefix = "mask";
3972 break;
3973 case vect_pointer_var:
3974 prefix = "vectp";
3975 break;
3976 default:
3977 gcc_unreachable ();
3978 }
3979
3980 if (name)
3981 {
3982 char* tmp = concat (prefix, "_", name, NULL);
3983 new_vect_var = create_tmp_reg (type, tmp);
3984 free (tmp);
3985 }
3986 else
3987 new_vect_var = create_tmp_reg (type, prefix);
3988
3989 return new_vect_var;
3990 }
3991
3992 /* Like vect_get_new_vect_var but return an SSA name. */
3993
3994 tree
3995 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
3996 {
3997 const char *prefix;
3998 tree new_vect_var;
3999
4000 switch (var_kind)
4001 {
4002 case vect_simple_var:
4003 prefix = "vect";
4004 break;
4005 case vect_scalar_var:
4006 prefix = "stmp";
4007 break;
4008 case vect_pointer_var:
4009 prefix = "vectp";
4010 break;
4011 default:
4012 gcc_unreachable ();
4013 }
4014
4015 if (name)
4016 {
4017 char* tmp = concat (prefix, "_", name, NULL);
4018 new_vect_var = make_temp_ssa_name (type, NULL, tmp);
4019 free (tmp);
4020 }
4021 else
4022 new_vect_var = make_temp_ssa_name (type, NULL, prefix);
4023
4024 return new_vect_var;
4025 }
4026
4027 /* Duplicate ptr info and set alignment/misalignment on NAME from DR. */
4028
4029 static void
4030 vect_duplicate_ssa_name_ptr_info (tree name, data_reference *dr)
4031 {
4032 duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr));
4033 int misalign = DR_MISALIGNMENT (dr);
4034 if (misalign == DR_MISALIGNMENT_UNKNOWN)
4035 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
4036 else
4037 set_ptr_info_alignment (SSA_NAME_PTR_INFO (name),
4038 DR_TARGET_ALIGNMENT (dr), misalign);
4039 }
4040
4041 /* Function vect_create_addr_base_for_vector_ref.
4042
4043 Create an expression that computes the address of the first memory location
4044 that will be accessed for a data reference.
4045
4046 Input:
4047 STMT: The statement containing the data reference.
4048 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
4049 OFFSET: Optional. If supplied, it is added to the initial address.
4050 LOOP: Specify relative to which loop-nest should the address be computed.
4051 For example, when the dataref is in an inner-loop nested in an
4052 outer-loop that is now being vectorized, LOOP can be either the
4053 outer-loop, or the inner-loop. The first memory location accessed
4054 by the following dataref ('in' points to short):
4055
4056 for (i=0; i<N; i++)
4057 for (j=0; j<M; j++)
4058 s += in[i+j]
4059
4060 is as follows:
4061 if LOOP=i_loop: &in (relative to i_loop)
4062 if LOOP=j_loop: &in+i*2B (relative to j_loop)
4063 BYTE_OFFSET: Optional, defaulted to NULL. If supplied, it is added to the
4064 initial address. Unlike OFFSET, which is number of elements to
4065 be added, BYTE_OFFSET is measured in bytes.
4066
4067 Output:
4068 1. Return an SSA_NAME whose value is the address of the memory location of
4069 the first vector of the data reference.
4070 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
4071 these statement(s) which define the returned SSA_NAME.
4072
4073 FORNOW: We are only handling array accesses with step 1. */
4074
4075 tree
4076 vect_create_addr_base_for_vector_ref (gimple *stmt,
4077 gimple_seq *new_stmt_list,
4078 tree offset,
4079 tree byte_offset)
4080 {
4081 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4082 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4083 const char *base_name;
4084 tree addr_base;
4085 tree dest;
4086 gimple_seq seq = NULL;
4087 tree vect_ptr_type;
4088 tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
4089 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4090 innermost_loop_behavior *drb = vect_dr_behavior (dr);
4091
4092 tree data_ref_base = unshare_expr (drb->base_address);
4093 tree base_offset = unshare_expr (drb->offset);
4094 tree init = unshare_expr (drb->init);
4095
4096 if (loop_vinfo)
4097 base_name = get_name (data_ref_base);
4098 else
4099 {
4100 base_offset = ssize_int (0);
4101 init = ssize_int (0);
4102 base_name = get_name (DR_REF (dr));
4103 }
4104
4105 /* Create base_offset */
4106 base_offset = size_binop (PLUS_EXPR,
4107 fold_convert (sizetype, base_offset),
4108 fold_convert (sizetype, init));
4109
4110 if (offset)
4111 {
4112 offset = fold_build2 (MULT_EXPR, sizetype,
4113 fold_convert (sizetype, offset), step);
4114 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4115 base_offset, offset);
4116 }
4117 if (byte_offset)
4118 {
4119 byte_offset = fold_convert (sizetype, byte_offset);
4120 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4121 base_offset, byte_offset);
4122 }
4123
4124 /* base + base_offset */
4125 if (loop_vinfo)
4126 addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
4127 else
4128 {
4129 addr_base = build1 (ADDR_EXPR,
4130 build_pointer_type (TREE_TYPE (DR_REF (dr))),
4131 unshare_expr (DR_REF (dr)));
4132 }
4133
4134 vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
4135 dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
4136 addr_base = force_gimple_operand (addr_base, &seq, true, dest);
4137 gimple_seq_add_seq (new_stmt_list, seq);
4138
4139 if (DR_PTR_INFO (dr)
4140 && TREE_CODE (addr_base) == SSA_NAME
4141 && !SSA_NAME_PTR_INFO (addr_base))
4142 {
4143 vect_duplicate_ssa_name_ptr_info (addr_base, dr);
4144 if (offset || byte_offset)
4145 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
4146 }
4147
4148 if (dump_enabled_p ())
4149 {
4150 dump_printf_loc (MSG_NOTE, vect_location, "created ");
4151 dump_generic_expr (MSG_NOTE, TDF_SLIM, addr_base);
4152 dump_printf (MSG_NOTE, "\n");
4153 }
4154
4155 return addr_base;
4156 }
4157
4158
4159 /* Function vect_create_data_ref_ptr.
4160
4161 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
4162 location accessed in the loop by STMT, along with the def-use update
4163 chain to appropriately advance the pointer through the loop iterations.
4164 Also set aliasing information for the pointer. This pointer is used by
4165 the callers to this function to create a memory reference expression for
4166 vector load/store access.
4167
4168 Input:
4169 1. STMT: a stmt that references memory. Expected to be of the form
4170 GIMPLE_ASSIGN <name, data-ref> or
4171 GIMPLE_ASSIGN <data-ref, name>.
4172 2. AGGR_TYPE: the type of the reference, which should be either a vector
4173 or an array.
4174 3. AT_LOOP: the loop where the vector memref is to be created.
4175 4. OFFSET (optional): an offset to be added to the initial address accessed
4176 by the data-ref in STMT.
4177 5. BSI: location where the new stmts are to be placed if there is no loop
4178 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
4179 pointing to the initial address.
4180 7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
4181 to the initial address accessed by the data-ref in STMT. This is
4182 similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
4183 in bytes.
4184
4185 Output:
4186 1. Declare a new ptr to vector_type, and have it point to the base of the
4187 data reference (the initial address accessed by the data reference).
4188 For example, for vector of type V8HI, the following code is generated:
4189
4190 v8hi *ap;
4191 ap = (v8hi *)initial_address;
4192
4193 if OFFSET is not supplied:
4194 initial_address = &a[init];
4195 if OFFSET is supplied:
4196 initial_address = &a[init + OFFSET];
4197 if BYTE_OFFSET is supplied:
4198 initial_address = &a[init] + BYTE_OFFSET;
4199
4200 Return the initial_address in INITIAL_ADDRESS.
4201
4202 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
4203 update the pointer in each iteration of the loop.
4204
4205 Return the increment stmt that updates the pointer in PTR_INCR.
4206
4207 3. Set INV_P to true if the access pattern of the data reference in the
4208 vectorized loop is invariant. Set it to false otherwise.
4209
4210 4. Return the pointer. */
4211
4212 tree
4213 vect_create_data_ref_ptr (gimple *stmt, tree aggr_type, struct loop *at_loop,
4214 tree offset, tree *initial_address,
4215 gimple_stmt_iterator *gsi, gimple **ptr_incr,
4216 bool only_init, bool *inv_p, tree byte_offset)
4217 {
4218 const char *base_name;
4219 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4220 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4221 struct loop *loop = NULL;
4222 bool nested_in_vect_loop = false;
4223 struct loop *containing_loop = NULL;
4224 tree aggr_ptr_type;
4225 tree aggr_ptr;
4226 tree new_temp;
4227 gimple_seq new_stmt_list = NULL;
4228 edge pe = NULL;
4229 basic_block new_bb;
4230 tree aggr_ptr_init;
4231 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4232 tree aptr;
4233 gimple_stmt_iterator incr_gsi;
4234 bool insert_after;
4235 tree indx_before_incr, indx_after_incr;
4236 gimple *incr;
4237 tree step;
4238 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
4239
4240 gcc_assert (TREE_CODE (aggr_type) == ARRAY_TYPE
4241 || TREE_CODE (aggr_type) == VECTOR_TYPE);
4242
4243 if (loop_vinfo)
4244 {
4245 loop = LOOP_VINFO_LOOP (loop_vinfo);
4246 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4247 containing_loop = (gimple_bb (stmt))->loop_father;
4248 pe = loop_preheader_edge (loop);
4249 }
4250 else
4251 {
4252 gcc_assert (bb_vinfo);
4253 only_init = true;
4254 *ptr_incr = NULL;
4255 }
4256
4257 /* Check the step (evolution) of the load in LOOP, and record
4258 whether it's invariant. */
4259 step = vect_dr_behavior (dr)->step;
4260 if (integer_zerop (step))
4261 *inv_p = true;
4262 else
4263 *inv_p = false;
4264
4265 /* Create an expression for the first address accessed by this load
4266 in LOOP. */
4267 base_name = get_name (DR_BASE_ADDRESS (dr));
4268
4269 if (dump_enabled_p ())
4270 {
4271 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4272 dump_printf_loc (MSG_NOTE, vect_location,
4273 "create %s-pointer variable to type: ",
4274 get_tree_code_name (TREE_CODE (aggr_type)));
4275 dump_generic_expr (MSG_NOTE, TDF_SLIM, aggr_type);
4276 if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4277 dump_printf (MSG_NOTE, " vectorizing an array ref: ");
4278 else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4279 dump_printf (MSG_NOTE, " vectorizing a vector ref: ");
4280 else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4281 dump_printf (MSG_NOTE, " vectorizing a record based array ref: ");
4282 else
4283 dump_printf (MSG_NOTE, " vectorizing a pointer ref: ");
4284 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_BASE_OBJECT (dr));
4285 dump_printf (MSG_NOTE, "\n");
4286 }
4287
4288 /* (1) Create the new aggregate-pointer variable.
4289 Vector and array types inherit the alias set of their component
4290 type by default so we need to use a ref-all pointer if the data
4291 reference does not conflict with the created aggregated data
4292 reference because it is not addressable. */
4293 bool need_ref_all = false;
4294 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4295 get_alias_set (DR_REF (dr))))
4296 need_ref_all = true;
4297 /* Likewise for any of the data references in the stmt group. */
4298 else if (STMT_VINFO_GROUP_SIZE (stmt_info) > 1)
4299 {
4300 gimple *orig_stmt = STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info);
4301 do
4302 {
4303 stmt_vec_info sinfo = vinfo_for_stmt (orig_stmt);
4304 struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
4305 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4306 get_alias_set (DR_REF (sdr))))
4307 {
4308 need_ref_all = true;
4309 break;
4310 }
4311 orig_stmt = STMT_VINFO_GROUP_NEXT_ELEMENT (sinfo);
4312 }
4313 while (orig_stmt);
4314 }
4315 aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
4316 need_ref_all);
4317 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
4318
4319
4320 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4321 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4322 def-use update cycles for the pointer: one relative to the outer-loop
4323 (LOOP), which is what steps (3) and (4) below do. The other is relative
4324 to the inner-loop (which is the inner-most loop containing the dataref),
4325 and this is done by step (5) below.
4326
4327 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4328 inner-most loop, and so steps (3),(4) work the same, and step (5) is
4329 redundant. Steps (3),(4) create the following:
4330
4331 vp0 = &base_addr;
4332 LOOP: vp1 = phi(vp0,vp2)
4333 ...
4334 ...
4335 vp2 = vp1 + step
4336 goto LOOP
4337
4338 If there is an inner-loop nested in loop, then step (5) will also be
4339 applied, and an additional update in the inner-loop will be created:
4340
4341 vp0 = &base_addr;
4342 LOOP: vp1 = phi(vp0,vp2)
4343 ...
4344 inner: vp3 = phi(vp1,vp4)
4345 vp4 = vp3 + inner_step
4346 if () goto inner
4347 ...
4348 vp2 = vp1 + step
4349 if () goto LOOP */
4350
4351 /* (2) Calculate the initial address of the aggregate-pointer, and set
4352 the aggregate-pointer to point to it before the loop. */
4353
4354 /* Create: &(base[init_val + offset]) + byte_offset in the loop preheader. */
4355
4356 new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
4357 offset, byte_offset);
4358 if (new_stmt_list)
4359 {
4360 if (pe)
4361 {
4362 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
4363 gcc_assert (!new_bb);
4364 }
4365 else
4366 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
4367 }
4368
4369 *initial_address = new_temp;
4370 aggr_ptr_init = new_temp;
4371
4372 /* (3) Handle the updating of the aggregate-pointer inside the loop.
4373 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4374 inner-loop nested in LOOP (during outer-loop vectorization). */
4375
4376 /* No update in loop is required. */
4377 if (only_init && (!loop_vinfo || at_loop == loop))
4378 aptr = aggr_ptr_init;
4379 else
4380 {
4381 /* The step of the aggregate pointer is the type size. */
4382 tree iv_step = TYPE_SIZE_UNIT (aggr_type);
4383 /* One exception to the above is when the scalar step of the load in
4384 LOOP is zero. In this case the step here is also zero. */
4385 if (*inv_p)
4386 iv_step = size_zero_node;
4387 else if (tree_int_cst_sgn (step) == -1)
4388 iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
4389
4390 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4391
4392 create_iv (aggr_ptr_init,
4393 fold_convert (aggr_ptr_type, iv_step),
4394 aggr_ptr, loop, &incr_gsi, insert_after,
4395 &indx_before_incr, &indx_after_incr);
4396 incr = gsi_stmt (incr_gsi);
4397 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4398
4399 /* Copy the points-to information if it exists. */
4400 if (DR_PTR_INFO (dr))
4401 {
4402 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr);
4403 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr);
4404 }
4405 if (ptr_incr)
4406 *ptr_incr = incr;
4407
4408 aptr = indx_before_incr;
4409 }
4410
4411 if (!nested_in_vect_loop || only_init)
4412 return aptr;
4413
4414
4415 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4416 nested in LOOP, if it exists. */
4417
4418 gcc_assert (nested_in_vect_loop);
4419 if (!only_init)
4420 {
4421 standard_iv_increment_position (containing_loop, &incr_gsi,
4422 &insert_after);
4423 create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
4424 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
4425 &indx_after_incr);
4426 incr = gsi_stmt (incr_gsi);
4427 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4428
4429 /* Copy the points-to information if it exists. */
4430 if (DR_PTR_INFO (dr))
4431 {
4432 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr);
4433 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr);
4434 }
4435 if (ptr_incr)
4436 *ptr_incr = incr;
4437
4438 return indx_before_incr;
4439 }
4440 else
4441 gcc_unreachable ();
4442 }
4443
4444
4445 /* Function bump_vector_ptr
4446
4447 Increment a pointer (to a vector type) by vector-size. If requested,
4448 i.e. if PTR_INCR is given, then also connect the new increment stmt
4449 to the existing def-use update-chain of the pointer, by modifying
4450 the PTR_INCR as illustrated below:
4451
4452 The pointer def-use update-chain before this function:
4453 DATAREF_PTR = phi (p_0, p_2)
4454 ....
4455 PTR_INCR: p_2 = DATAREF_PTR + step
4456
4457 The pointer def-use update-chain after this function:
4458 DATAREF_PTR = phi (p_0, p_2)
4459 ....
4460 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4461 ....
4462 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
4463
4464 Input:
4465 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4466 in the loop.
4467 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4468 the loop. The increment amount across iterations is expected
4469 to be vector_size.
4470 BSI - location where the new update stmt is to be placed.
4471 STMT - the original scalar memory-access stmt that is being vectorized.
4472 BUMP - optional. The offset by which to bump the pointer. If not given,
4473 the offset is assumed to be vector_size.
4474
4475 Output: Return NEW_DATAREF_PTR as illustrated above.
4476
4477 */
4478
4479 tree
4480 bump_vector_ptr (tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
4481 gimple *stmt, tree bump)
4482 {
4483 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4484 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4485 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4486 tree update = TYPE_SIZE_UNIT (vectype);
4487 gassign *incr_stmt;
4488 ssa_op_iter iter;
4489 use_operand_p use_p;
4490 tree new_dataref_ptr;
4491
4492 if (bump)
4493 update = bump;
4494
4495 if (TREE_CODE (dataref_ptr) == SSA_NAME)
4496 new_dataref_ptr = copy_ssa_name (dataref_ptr);
4497 else
4498 new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
4499 incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
4500 dataref_ptr, update);
4501 vect_finish_stmt_generation (stmt, incr_stmt, gsi);
4502
4503 /* Copy the points-to information if it exists. */
4504 if (DR_PTR_INFO (dr))
4505 {
4506 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
4507 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
4508 }
4509
4510 if (!ptr_incr)
4511 return new_dataref_ptr;
4512
4513 /* Update the vector-pointer's cross-iteration increment. */
4514 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
4515 {
4516 tree use = USE_FROM_PTR (use_p);
4517
4518 if (use == dataref_ptr)
4519 SET_USE (use_p, new_dataref_ptr);
4520 else
4521 gcc_assert (tree_int_cst_compare (use, update) == 0);
4522 }
4523
4524 return new_dataref_ptr;
4525 }
4526
4527
4528 /* Function vect_create_destination_var.
4529
4530 Create a new temporary of type VECTYPE. */
4531
4532 tree
4533 vect_create_destination_var (tree scalar_dest, tree vectype)
4534 {
4535 tree vec_dest;
4536 const char *name;
4537 char *new_name;
4538 tree type;
4539 enum vect_var_kind kind;
4540
4541 kind = vectype
4542 ? VECTOR_BOOLEAN_TYPE_P (vectype)
4543 ? vect_mask_var
4544 : vect_simple_var
4545 : vect_scalar_var;
4546 type = vectype ? vectype : TREE_TYPE (scalar_dest);
4547
4548 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
4549
4550 name = get_name (scalar_dest);
4551 if (name)
4552 new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
4553 else
4554 new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
4555 vec_dest = vect_get_new_vect_var (type, kind, new_name);
4556 free (new_name);
4557
4558 return vec_dest;
4559 }
4560
4561 /* Function vect_grouped_store_supported.
4562
4563 Returns TRUE if interleave high and interleave low permutations
4564 are supported, and FALSE otherwise. */
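/* As a hypothetical example, a group of two interleaved stores arises from

     for (i = 0; i < n; i++)
       {
         a[2 * i] = x[i];
         a[2 * i + 1] = y[i];
       }

   in which case COUNT is 2 and the permutations checked here are what
   reorder the vectorized X and Y values into A's interleaved layout.  */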
4565
4566 bool
4567 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
4568 {
4569 machine_mode mode = TYPE_MODE (vectype);
4570
4571 /* vect_permute_store_chain requires the group size to be equal to 3 or
4572 be a power of two. */
4573 if (count != 3 && exact_log2 (count) == -1)
4574 {
4575 if (dump_enabled_p ())
4576 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4577 "the size of the group of accesses"
4578 " is not a power of 2 or not equal to 3\n");
4579 return false;
4580 }
4581
4582 /* Check that the permutation is supported. */
4583 if (VECTOR_MODE_P (mode))
4584 {
4585 unsigned int i, nelt = GET_MODE_NUNITS (mode);
4586 if (count == 3)
4587 {
4588 unsigned int j0 = 0, j1 = 0, j2 = 0;
4589 unsigned int i, j;
4590
4591 vec_perm_builder sel (nelt, nelt, 1);
4592 sel.quick_grow (nelt);
4593 vec_perm_indices indices;
4594 for (j = 0; j < 3; j++)
4595 {
4596 int nelt0 = ((3 - j) * nelt) % 3;
4597 int nelt1 = ((3 - j) * nelt + 1) % 3;
4598 int nelt2 = ((3 - j) * nelt + 2) % 3;
4599 for (i = 0; i < nelt; i++)
4600 {
4601 if (3 * i + nelt0 < nelt)
4602 sel[3 * i + nelt0] = j0++;
4603 if (3 * i + nelt1 < nelt)
4604 sel[3 * i + nelt1] = nelt + j1++;
4605 if (3 * i + nelt2 < nelt)
4606 sel[3 * i + nelt2] = 0;
4607 }
4608 indices.new_vector (sel, 2, nelt);
4609 if (!can_vec_perm_const_p (mode, indices))
4610 {
4611 if (dump_enabled_p ())
4612 dump_printf (MSG_MISSED_OPTIMIZATION,
4613 "permutation op not supported by target.\n");
4614 return false;
4615 }
4616
4617 for (i = 0; i < nelt; i++)
4618 {
4619 if (3 * i + nelt0 < nelt)
4620 sel[3 * i + nelt0] = 3 * i + nelt0;
4621 if (3 * i + nelt1 < nelt)
4622 sel[3 * i + nelt1] = 3 * i + nelt1;
4623 if (3 * i + nelt2 < nelt)
4624 sel[3 * i + nelt2] = nelt + j2++;
4625 }
4626 indices.new_vector (sel, 2, nelt);
4627 if (!can_vec_perm_const_p (mode, indices))
4628 {
4629 if (dump_enabled_p ())
4630 dump_printf (MSG_MISSED_OPTIMIZATION,
4631 "permutation op not supported by target.\n");
4632 return false;
4633 }
4634 }
4635 return true;
4636 }
4637 else
4638 {
4639 /* If count is not equal to 3, then only a power of 2 is supported. */
4640 gcc_assert (pow2p_hwi (count));
4641
4642 /* The encoding has 2 interleaved stepped patterns. */
4643 vec_perm_builder sel (nelt, 2, 3);
4644 sel.quick_grow (6);
4645 for (i = 0; i < 3; i++)
4646 {
4647 sel[i * 2] = i;
4648 sel[i * 2 + 1] = i + nelt;
4649 }
4650 vec_perm_indices indices (sel, 2, nelt);
4651 if (can_vec_perm_const_p (mode, indices))
4652 {
4653 for (i = 0; i < 6; i++)
4654 sel[i] += nelt / 2;
4655 indices.new_vector (sel, 2, nelt);
4656 if (can_vec_perm_const_p (mode, indices))
4657 return true;
4658 }
4659 }
4660 }
4661
4662 if (dump_enabled_p ())
4663 dump_printf (MSG_MISSED_OPTIMIZATION,
4664 "permutation op not supported by target.\n");
4665 return false;
4666 }
4667
4668
4669 /* Return TRUE if vec_store_lanes is available for COUNT vectors of
4670 type VECTYPE. */
4671
4672 bool
4673 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
4674 {
4675 return vect_lanes_optab_supported_p ("vec_store_lanes",
4676 vec_store_lanes_optab,
4677 vectype, count);
4678 }
4679
4680
4681 /* Function vect_permute_store_chain.
4682
4683 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
4684 a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
4685 the data correctly for the stores. Return the final references for stores
4686 in RESULT_CHAIN.
4687
4688 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
4689 The input is 4 vectors each containing 8 elements. We assign a number to
4690 each element, the input sequence is:
4691
4692 1st vec: 0 1 2 3 4 5 6 7
4693 2nd vec: 8 9 10 11 12 13 14 15
4694 3rd vec: 16 17 18 19 20 21 22 23
4695 4th vec: 24 25 26 27 28 29 30 31
4696
4697 The output sequence should be:
4698
4699 1st vec: 0 8 16 24 1 9 17 25
4700 2nd vec: 2 10 18 26 3 11 19 27
4701 3rd vec: 4 12 20 28 5 13 21 29
4702 4th vec: 6 14 22 30 7 15 23 31
4703
4704 i.e., we interleave the contents of the four vectors in their order.
4705
4706 We use interleave_high/low instructions to create such output. The input of
4707 each interleave_high/low operation is two vectors:
4708 1st vec 2nd vec
4709 0 1 2 3 4 5 6 7
4710 the even elements of the result vector are obtained left-to-right from the
4711 high/low elements of the first vector. The odd elements of the result are
4712 obtained left-to-right from the high/low elements of the second vector.
4713 The output of interleave_high will be: 0 4 1 5
4714 and of interleave_low: 2 6 3 7
4715
4716
4717 The permutation is done in log LENGTH stages. In each stage interleave_high
4718 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
4719 where the first argument is taken from the first half of DR_CHAIN and the
4720 second argument from its second half.
4721 In our example,
4722
4723 I1: interleave_high (1st vec, 3rd vec)
4724 I2: interleave_low (1st vec, 3rd vec)
4725 I3: interleave_high (2nd vec, 4th vec)
4726 I4: interleave_low (2nd vec, 4th vec)
4727
4728 The output for the first stage is:
4729
4730 I1: 0 16 1 17 2 18 3 19
4731 I2: 4 20 5 21 6 22 7 23
4732 I3: 8 24 9 25 10 26 11 27
4733 I4: 12 28 13 29 14 30 15 31
4734
4735 The output of the second stage, i.e. the final result is:
4736
4737 I1: 0 8 16 24 1 9 17 25
4738 I2: 2 10 18 26 3 11 19 27
4739 I3: 4 12 20 28 5 13 21 29
4740 I4: 6 14 22 30 7 15 23 31. */
4741
4742 void
4743 vect_permute_store_chain (vec<tree> dr_chain,
4744 unsigned int length,
4745 gimple *stmt,
4746 gimple_stmt_iterator *gsi,
4747 vec<tree> *result_chain)
4748 {
4749 tree vect1, vect2, high, low;
4750 gimple *perm_stmt;
4751 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
4752 tree perm_mask_low, perm_mask_high;
4753 tree data_ref;
4754 tree perm3_mask_low, perm3_mask_high;
4755 unsigned int i, n, log_length = exact_log2 (length);
4756 unsigned int j, nelt = TYPE_VECTOR_SUBPARTS (vectype);
4757
4758 result_chain->quick_grow (length);
4759 memcpy (result_chain->address (), dr_chain.address (),
4760 length * sizeof (tree));
4761
4762 if (length == 3)
4763 {
4764 unsigned int j0 = 0, j1 = 0, j2 = 0;
4765
4766 vec_perm_builder sel (nelt, nelt, 1);
4767 sel.quick_grow (nelt);
4768 vec_perm_indices indices;
4769 for (j = 0; j < 3; j++)
4770 {
4771 int nelt0 = ((3 - j) * nelt) % 3;
4772 int nelt1 = ((3 - j) * nelt + 1) % 3;
4773 int nelt2 = ((3 - j) * nelt + 2) % 3;
4774
4775 for (i = 0; i < nelt; i++)
4776 {
4777 if (3 * i + nelt0 < nelt)
4778 sel[3 * i + nelt0] = j0++;
4779 if (3 * i + nelt1 < nelt)
4780 sel[3 * i + nelt1] = nelt + j1++;
4781 if (3 * i + nelt2 < nelt)
4782 sel[3 * i + nelt2] = 0;
4783 }
4784 indices.new_vector (sel, 2, nelt);
4785 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
4786
4787 for (i = 0; i < nelt; i++)
4788 {
4789 if (3 * i + nelt0 < nelt)
4790 sel[3 * i + nelt0] = 3 * i + nelt0;
4791 if (3 * i + nelt1 < nelt)
4792 sel[3 * i + nelt1] = 3 * i + nelt1;
4793 if (3 * i + nelt2 < nelt)
4794 sel[3 * i + nelt2] = nelt + j2++;
4795 }
4796 indices.new_vector (sel, 2, nelt);
4797 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
4798
4799 vect1 = dr_chain[0];
4800 vect2 = dr_chain[1];
4801
4802 /* Create interleaving stmt:
4803 low = VEC_PERM_EXPR <vect1, vect2,
4804 {j, nelt, *, j + 1, nelt + j + 1, *,
4805 j + 2, nelt + j + 2, *, ...}> */
4806 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
4807 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
4808 vect2, perm3_mask_low);
4809 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4810
4811 vect1 = data_ref;
4812 vect2 = dr_chain[2];
4813 /* Create interleaving stmt:
4814 low = VEC_PERM_EXPR <vect1, vect2,
4815 {0, 1, nelt + j, 3, 4, nelt + j + 1,
4816 6, 7, nelt + j + 2, ...}> */
4817 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
4818 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
4819 vect2, perm3_mask_high);
4820 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4821 (*result_chain)[j] = data_ref;
4822 }
4823 }
4824 else
4825 {
4826 /* If length is not equal to 3 then only power of 2 is supported. */
4827 gcc_assert (pow2p_hwi (length));
4828
4829 /* The encoding has 2 interleaved stepped patterns. */
4830 vec_perm_builder sel (nelt, 2, 3);
4831 sel.quick_grow (6);
4832 for (i = 0; i < 3; i++)
4833 {
4834 sel[i * 2] = i;
4835 sel[i * 2 + 1] = i + nelt;
4836 }
4837 vec_perm_indices indices (sel, 2, nelt);
4838 perm_mask_high = vect_gen_perm_mask_checked (vectype, indices);
4839
4840 for (i = 0; i < 6; i++)
4841 sel[i] += nelt / 2;
4842 indices.new_vector (sel, 2, nelt);
4843 perm_mask_low = vect_gen_perm_mask_checked (vectype, indices);
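      /* As a concrete illustration, for nelt == 8 the two selectors expand
         to {0, 8, 1, 9, 2, 10, 3, 11} for perm_mask_high and
         {4, 12, 5, 13, 6, 14, 7, 15} for perm_mask_low.  */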
4844
4845 for (i = 0, n = log_length; i < n; i++)
4846 {
4847 for (j = 0; j < length/2; j++)
4848 {
4849 vect1 = dr_chain[j];
4850 vect2 = dr_chain[j+length/2];
4851
4852 /* Create interleaving stmt:
4853 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
4854 ...}> */
4855 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
4856 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
4857 vect2, perm_mask_high);
4858 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4859 (*result_chain)[2*j] = high;
4860
4861 /* Create interleaving stmt:
4862 low = VEC_PERM_EXPR <vect1, vect2,
4863 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
4864 ...}> */
4865 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
4866 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
4867 vect2, perm_mask_low);
4868 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4869 (*result_chain)[2*j+1] = low;
4870 }
4871 memcpy (dr_chain.address (), result_chain->address (),
4872 length * sizeof (tree));
4873 }
4874 }
4875 }
4876
4877 /* Function vect_setup_realignment
4878
4879 This function is called when vectorizing an unaligned load using
4880 the dr_explicit_realign[_optimized] scheme.
4881 This function generates the following code at the loop prolog:
4882
4883 p = initial_addr;
4884 x msq_init = *(floor(p)); # prolog load
4885 realignment_token = call target_builtin;
4886 loop:
4887 x msq = phi (msq_init, ---)
4888
4889 The stmts marked with x are generated only for the case of
4890 dr_explicit_realign_optimized.
4891
4892 The code above sets up a new (vector) pointer, pointing to the first
4893 location accessed by STMT, and a "floor-aligned" load using that pointer.
4894 It also generates code to compute the "realignment-token" (if the relevant
4895 target hook was defined), and creates a phi-node at the loop-header bb
4896 whose arguments are the result of the prolog-load (created by this
4897 function) and the result of a load that takes place in the loop (to be
4898 created by the caller to this function).
4899
4900 For the case of dr_explicit_realign_optimized:
4901 The caller to this function uses the phi-result (msq) to create the
4902 realignment code inside the loop, and sets up the missing phi argument,
4903 as follows:
4904 loop:
4905 msq = phi (msq_init, lsq)
4906 lsq = *(floor(p')); # load in loop
4907 result = realign_load (msq, lsq, realignment_token);
4908
4909 For the case of dr_explicit_realign:
4910 loop:
4911 msq = *(floor(p)); # load in loop
4912 p' = p + (VS-1);
4913 lsq = *(floor(p')); # load in loop
4914 result = realign_load (msq, lsq, realignment_token);
4915
4916 Input:
4917 STMT - (scalar) load stmt to be vectorized. This load accesses
4918 a memory location that may be unaligned.
4919 BSI - place where new code is to be inserted.
4920 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
4921 is used.
4922
4923 Output:
4924 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
4925 target hook, if defined.
4926 Return value - the result of the loop-header phi node. */
4927
4928 tree
4929 vect_setup_realignment (gimple *stmt, gimple_stmt_iterator *gsi,
4930 tree *realignment_token,
4931 enum dr_alignment_support alignment_support_scheme,
4932 tree init_addr,
4933 struct loop **at_loop)
4934 {
4935 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4936 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4937 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4938 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4939 struct loop *loop = NULL;
4940 edge pe = NULL;
4941 tree scalar_dest = gimple_assign_lhs (stmt);
4942 tree vec_dest;
4943 gimple *inc;
4944 tree ptr;
4945 tree data_ref;
4946 basic_block new_bb;
4947 tree msq_init = NULL_TREE;
4948 tree new_temp;
4949 gphi *phi_stmt;
4950 tree msq = NULL_TREE;
4951 gimple_seq stmts = NULL;
4952 bool inv_p;
4953 bool compute_in_loop = false;
4954 bool nested_in_vect_loop = false;
4955 struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
4956 struct loop *loop_for_initial_load = NULL;
4957
4958 if (loop_vinfo)
4959 {
4960 loop = LOOP_VINFO_LOOP (loop_vinfo);
4961 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4962 }
4963
4964 gcc_assert (alignment_support_scheme == dr_explicit_realign
4965 || alignment_support_scheme == dr_explicit_realign_optimized);
4966
4967 /* We need to generate three things:
4968 1. the misalignment computation
4969 2. the extra vector load (for the optimized realignment scheme).
4970 3. the phi node for the two vectors from which the realignment is
4971 done (for the optimized realignment scheme). */
4972
4973 /* 1. Determine where to generate the misalignment computation.
4974
4975 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
4976 calculation will be generated by this function, outside the loop (in the
4977 preheader). Otherwise, INIT_ADDR had already been computed for us by the
4978 caller, inside the loop.
4979
4980 Background: If the misalignment remains fixed throughout the iterations of
4981 the loop, then both realignment schemes are applicable, and also the
4982 misalignment computation can be done outside LOOP. This is because we are
4983 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
4984 are a multiple of VS (the Vector Size), and therefore the misalignment in
4985 different vectorized LOOP iterations is always the same.
4986 The problem arises only if the memory access is in an inner-loop nested
4987 inside LOOP, which is now being vectorized using outer-loop vectorization.
4988 This is the only case when the misalignment of the memory access may not
4989 remain fixed throughout the iterations of the inner-loop (as explained in
4990 detail in vect_supportable_dr_alignment). In this case, not only is the
4991 optimized realignment scheme not applicable, but also the misalignment
4992 computation (and generation of the realignment token that is passed to
4993 REALIGN_LOAD) have to be done inside the loop.
4994
4995 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
4996 or not, which in turn determines if the misalignment is computed inside
4997 the inner-loop, or outside LOOP. */
4998
4999 if (init_addr != NULL_TREE || !loop_vinfo)
5000 {
5001 compute_in_loop = true;
5002 gcc_assert (alignment_support_scheme == dr_explicit_realign);
5003 }
5004
5005
5006 /* 2. Determine where to generate the extra vector load.
5007
5008 For the optimized realignment scheme, instead of generating two vector
5009 loads in each iteration, we generate a single extra vector load in the
5010 preheader of the loop, and in each iteration reuse the result of the
5011 vector load from the previous iteration. In case the memory access is in
5012 an inner-loop nested inside LOOP, which is now being vectorized using
5013 outer-loop vectorization, we need to determine whether this initial vector
5014 load should be generated at the preheader of the inner-loop, or can be
5015 generated at the preheader of LOOP. If the memory access has no evolution
5016 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
5017 to be generated inside LOOP (in the preheader of the inner-loop). */
5018
5019 if (nested_in_vect_loop)
5020 {
5021 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
5022 bool invariant_in_outerloop =
5023 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
5024 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
5025 }
5026 else
5027 loop_for_initial_load = loop;
5028 if (at_loop)
5029 *at_loop = loop_for_initial_load;
5030
5031 if (loop_for_initial_load)
5032 pe = loop_preheader_edge (loop_for_initial_load);
5033
5034 /* 3. For the case of the optimized realignment, create the first vector
5035 load at the loop preheader. */
5036
5037 if (alignment_support_scheme == dr_explicit_realign_optimized)
5038 {
5039 /* Create msq_init = *(floor(p1)) in the loop preheader */
5040 gassign *new_stmt;
5041
5042 gcc_assert (!compute_in_loop);
5043 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5044 ptr = vect_create_data_ref_ptr (stmt, vectype, loop_for_initial_load,
5045 NULL_TREE, &init_addr, NULL, &inc,
5046 true, &inv_p);
5047 if (TREE_CODE (ptr) == SSA_NAME)
5048 new_temp = copy_ssa_name (ptr);
5049 else
5050 new_temp = make_ssa_name (TREE_TYPE (ptr));
5051 unsigned int align = DR_TARGET_ALIGNMENT (dr);
5052 new_stmt = gimple_build_assign
5053 (new_temp, BIT_AND_EXPR, ptr,
5054 build_int_cst (TREE_TYPE (ptr), -(HOST_WIDE_INT) align));
5055 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5056 gcc_assert (!new_bb);
5057 data_ref
5058 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
5059 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
5060 new_stmt = gimple_build_assign (vec_dest, data_ref);
5061 new_temp = make_ssa_name (vec_dest, new_stmt);
5062 gimple_assign_set_lhs (new_stmt, new_temp);
5063 if (pe)
5064 {
5065 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5066 gcc_assert (!new_bb);
5067 }
5068 else
5069 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5070
5071 msq_init = gimple_assign_lhs (new_stmt);
5072 }
5073
5074 /* 4. Create realignment token using a target builtin, if available.
5075 It is done either inside the containing loop, or before LOOP (as
5076 determined above). */
5077
5078 if (targetm.vectorize.builtin_mask_for_load)
5079 {
5080 gcall *new_stmt;
5081 tree builtin_decl;
5082
5083 /* Compute INIT_ADDR - the initial address accessed by this memref. */
5084 if (!init_addr)
5085 {
5086 /* Generate the INIT_ADDR computation outside LOOP. */
5087 init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
5088 NULL_TREE);
5089 if (loop)
5090 {
5091 pe = loop_preheader_edge (loop);
5092 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
5093 gcc_assert (!new_bb);
5094 }
5095 else
5096 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
5097 }
5098
5099 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
5100 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
5101 vec_dest =
5102 vect_create_destination_var (scalar_dest,
5103 gimple_call_return_type (new_stmt));
5104 new_temp = make_ssa_name (vec_dest, new_stmt);
5105 gimple_call_set_lhs (new_stmt, new_temp);
5106
5107 if (compute_in_loop)
5108 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5109 else
5110 {
5111 /* Generate the misalignment computation outside LOOP. */
5112 pe = loop_preheader_edge (loop);
5113 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5114 gcc_assert (!new_bb);
5115 }
5116
5117 *realignment_token = gimple_call_lhs (new_stmt);
5118
5119 /* The result of the CALL_EXPR to this builtin is determined from
5120 the value of the parameter, and no global variables are touched,
5121 which makes the builtin a "const" function. Requiring the
5122 builtin to have the "const" attribute makes it unnecessary
5123 to call mark_call_clobbered. */
5124 gcc_assert (TREE_READONLY (builtin_decl));
5125 }
5126
5127 if (alignment_support_scheme == dr_explicit_realign)
5128 return msq;
5129
5130 gcc_assert (!compute_in_loop);
5131 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
5132
5133
5134 /* 5. Create msq = phi <msq_init, lsq> in loop */
5135
5136 pe = loop_preheader_edge (containing_loop);
5137 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5138 msq = make_ssa_name (vec_dest);
5139 phi_stmt = create_phi_node (msq, containing_loop->header);
5140 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
5141
5142 return msq;
5143 }
5144
5145
5146 /* Function vect_grouped_load_supported.
5147
5148 COUNT is the size of the load group (the number of statements plus the
5149 number of gaps). SINGLE_ELEMENT_P is true if there is actually
5150 only one statement, with a gap of COUNT - 1.
5151
5152 Returns true if a suitable permute exists. */
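/* For example (illustrative only): a group loading a[4*i], a[4*i+1] and
   a[4*i+3] has three statements plus one gap, so COUNT is 4 and
   SINGLE_ELEMENT_P is false; a group loading only a[4*i] from a stride-4
   access has one statement and a gap of 3, so COUNT is 4 and
   SINGLE_ELEMENT_P is true.  */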
5153
5154 bool
5155 vect_grouped_load_supported (tree vectype, bool single_element_p,
5156 unsigned HOST_WIDE_INT count)
5157 {
5158 machine_mode mode = TYPE_MODE (vectype);
5159
5160 /* If this is single-element interleaving with an element distance
5161 that leaves unused vector loads around, punt: we would at least create
5162 very sub-optimal code in that case (and blow up memory,
5163 see PR65518). */
5164 if (single_element_p && count > TYPE_VECTOR_SUBPARTS (vectype))
5165 {
5166 if (dump_enabled_p ())
5167 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5168 "single-element interleaving not supported "
5169 "for non-adjacent vector loads\n");
5170 return false;
5171 }
5172
5173 /* vect_permute_load_chain requires the group size to be equal to 3 or
5174 be a power of two. */
5175 if (count != 3 && exact_log2 (count) == -1)
5176 {
5177 if (dump_enabled_p ())
5178 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5179 "the size of the group of accesses"
5180 " is not a power of 2 or not equal to 3\n");
5181 return false;
5182 }
5183
5184 /* Check that the permutation is supported. */
5185 if (VECTOR_MODE_P (mode))
5186 {
5187 unsigned int i, j, nelt = GET_MODE_NUNITS (mode);
5188
5189 if (count == 3)
5190 {
5191 vec_perm_builder sel (nelt, nelt, 1);
5192 sel.quick_grow (nelt);
5193 vec_perm_indices indices;
5194 unsigned int k;
5195 for (k = 0; k < 3; k++)
5196 {
5197 for (i = 0; i < nelt; i++)
5198 if (3 * i + k < 2 * nelt)
5199 sel[i] = 3 * i + k;
5200 else
5201 sel[i] = 0;
5202 indices.new_vector (sel, 2, nelt);
5203 if (!can_vec_perm_const_p (mode, indices))
5204 {
5205 if (dump_enabled_p ())
5206 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5207 "shuffle of 3 loads is not supported by"
5208 " target\n");
5209 return false;
5210 }
5211 for (i = 0, j = 0; i < nelt; i++)
5212 if (3 * i + k < 2 * nelt)
5213 sel[i] = i;
5214 else
5215 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5216 indices.new_vector (sel, 2, nelt);
5217 if (!can_vec_perm_const_p (mode, indices))
5218 {
5219 if (dump_enabled_p ())
5220 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5221 "shuffle of 3 loads is not supported by"
5222 " target\n");
5223 return false;
5224 }
5225 }
5226 return true;
5227 }
5228 else
5229 {
5230 /* If length is not equal to 3 then only a power of 2 is supported. */
5231 gcc_assert (pow2p_hwi (count));
5232
5233 /* The encoding has a single stepped pattern. */
5234 vec_perm_builder sel (nelt, 1, 3);
5235 sel.quick_grow (3);
5236 for (i = 0; i < 3; i++)
5237 sel[i] = i * 2;
5238 vec_perm_indices indices (sel, 2, nelt);
5239 if (can_vec_perm_const_p (mode, indices))
5240 {
5241 for (i = 0; i < 3; i++)
5242 sel[i] = i * 2 + 1;
5243 indices.new_vector (sel, 2, nelt);
5244 if (can_vec_perm_const_p (mode, indices))
5245 return true;
5246 }
5247 }
5248 }
5249
5250 if (dump_enabled_p ())
5251 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5252 "extract even/odd not supported by target\n");
5253 return false;
5254 }
5255
5256 /* Return TRUE if vec_load_lanes is available for COUNT vectors of
5257 type VECTYPE. */
5258
5259 bool
5260 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
5261 {
5262 return vect_lanes_optab_supported_p ("vec_load_lanes",
5263 vec_load_lanes_optab,
5264 vectype, count);
5265 }
5266
5267 /* Function vect_permute_load_chain.
5268
5269 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5270 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5271 the input data correctly. Return the final references for loads in
5272 RESULT_CHAIN.
5273
5274 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5275 The input is 4 vectors each containing 8 elements. We assign a number to each
5276 element; the input sequence is:
5277
5278 1st vec: 0 1 2 3 4 5 6 7
5279 2nd vec: 8 9 10 11 12 13 14 15
5280 3rd vec: 16 17 18 19 20 21 22 23
5281 4th vec: 24 25 26 27 28 29 30 31
5282
5283 The output sequence should be:
5284
5285 1st vec: 0 4 8 12 16 20 24 28
5286 2nd vec: 1 5 9 13 17 21 25 29
5287 3rd vec: 2 6 10 14 18 22 26 30
5288 4th vec: 3 7 11 15 19 23 27 31
5289
5290 i.e., the first output vector should contain the first elements of each
5291 interleaving group, etc.
5292
5293 We use extract_even/odd instructions to create such output. The input of
5294 each extract_even/odd operation is two vectors
5295 1st vec 2nd vec
5296 0 1 2 3 4 5 6 7
5297
5298 and the output is the vector of extracted even/odd elements. The output of
5299 extract_even will be: 0 2 4 6
5300 and of extract_odd: 1 3 5 7
5301
5302
5303 The permutation is done in log LENGTH stages. In each stage extract_even
5304 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5305 their order. In our example,
5306
5307 E1: extract_even (1st vec, 2nd vec)
5308 E2: extract_odd (1st vec, 2nd vec)
5309 E3: extract_even (3rd vec, 4th vec)
5310 E4: extract_odd (3rd vec, 4th vec)
5311
5312 The output for the first stage will be:
5313
5314 E1: 0 2 4 6 8 10 12 14
5315 E2: 1 3 5 7 9 11 13 15
5316 E3: 16 18 20 22 24 26 28 30
5317 E4: 17 19 21 23 25 27 29 31
5318
5319 In order to proceed and create the correct sequence for the next stage (or
5320 for the correct output, if the second stage is the last one, as in our
5321 example), we first put the output of extract_even operation and then the
5322 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5323 The input for the second stage is:
5324
5325 1st vec (E1): 0 2 4 6 8 10 12 14
5326 2nd vec (E3): 16 18 20 22 24 26 28 30
5327 3rd vec (E2): 1 3 5 7 9 11 13 15
5328 4th vec (E4): 17 19 21 23 25 27 29 31
5329
5330 The output of the second stage:
5331
5332 E1: 0 4 8 12 16 20 24 28
5333 E2: 2 6 10 14 18 22 26 30
5334 E3: 1 5 9 13 17 21 25 29
5335 E4: 3 7 11 15 19 23 27 31
5336
5337 And RESULT_CHAIN after reordering:
5338
5339 1st vec (E1): 0 4 8 12 16 20 24 28
5340 2nd vec (E3): 1 5 9 13 17 21 25 29
5341 3rd vec (E2): 2 6 10 14 18 22 26 30
5342 4th vec (E4): 3 7 11 15 19 23 27 31. */
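/* The LENGTH == 3 branch below is not covered by the example above; as an
   illustration only (vector of 8 elements, k == 0): the first VEC_PERM_EXPR
   uses selector {0, 3, 6, 9, 12, 15, 0, 0} on dr_chain[0] = {0..7} and
   dr_chain[1] = {8..15} (the last two lanes are don't-care), and the second
   uses selector {0, 1, 2, 3, 4, 5, 10, 13} on that result and
   dr_chain[2] = {16..23}, giving the first output vector
   0 3 6 9 12 15 18 21; the k == 1 and k == 2 iterations produce the
   remaining two output vectors analogously.  */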
5343
5344 static void
5345 vect_permute_load_chain (vec<tree> dr_chain,
5346 unsigned int length,
5347 gimple *stmt,
5348 gimple_stmt_iterator *gsi,
5349 vec<tree> *result_chain)
5350 {
5351 tree data_ref, first_vect, second_vect;
5352 tree perm_mask_even, perm_mask_odd;
5353 tree perm3_mask_low, perm3_mask_high;
5354 gimple *perm_stmt;
5355 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5356 unsigned int i, j, log_length = exact_log2 (length);
5357 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
5358
5359 result_chain->quick_grow (length);
5360 memcpy (result_chain->address (), dr_chain.address (),
5361 length * sizeof (tree));
5362
5363 if (length == 3)
5364 {
5365 unsigned int k;
5366
5367 vec_perm_builder sel (nelt, nelt, 1);
5368 sel.quick_grow (nelt);
5369 vec_perm_indices indices;
5370 for (k = 0; k < 3; k++)
5371 {
5372 for (i = 0; i < nelt; i++)
5373 if (3 * i + k < 2 * nelt)
5374 sel[i] = 3 * i + k;
5375 else
5376 sel[i] = 0;
5377 indices.new_vector (sel, 2, nelt);
5378 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5379
5380 for (i = 0, j = 0; i < nelt; i++)
5381 if (3 * i + k < 2 * nelt)
5382 sel[i] = i;
5383 else
5384 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5385 indices.new_vector (sel, 2, nelt);
5386 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5387
5388 first_vect = dr_chain[0];
5389 second_vect = dr_chain[1];
5390
5391 /* Create interleaving stmt (low part of):
5392 low = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
5393 ...}> */
5394 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5395 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5396 second_vect, perm3_mask_low);
5397 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5398
5399 /* Create interleaving stmt (high part of):
5400 high = VEC_PERM_EXPR <low, dr_chain[2], perm3_mask_high>, which keeps
5401 the elements gathered so far and fills the tail from dr_chain[2]. */
5402 first_vect = data_ref;
5403 second_vect = dr_chain[2];
5404 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5405 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5406 second_vect, perm3_mask_high);
5407 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5408 (*result_chain)[k] = data_ref;
5409 }
5410 }
5411 else
5412 {
5413 /* If length is not equal to 3 then only a power of 2 is supported. */
5414 gcc_assert (pow2p_hwi (length));
5415
5416 /* The encoding has a single stepped pattern. */
5417 vec_perm_builder sel (nelt, 1, 3);
5418 sel.quick_grow (3);
5419 for (i = 0; i < 3; ++i)
5420 sel[i] = i * 2;
5421 vec_perm_indices indices (sel, 2, nelt);
5422 perm_mask_even = vect_gen_perm_mask_checked (vectype, indices);
5423
5424 for (i = 0; i < 3; ++i)
5425 sel[i] = i * 2 + 1;
5426 indices.new_vector (sel, 2, nelt);
5427 perm_mask_odd = vect_gen_perm_mask_checked (vectype, indices);
5428
5429 for (i = 0; i < log_length; i++)
5430 {
5431 for (j = 0; j < length; j += 2)
5432 {
5433 first_vect = dr_chain[j];
5434 second_vect = dr_chain[j+1];
5435
5436 /* data_ref = permute_even (first_data_ref, second_data_ref); */
5437 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
5438 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5439 first_vect, second_vect,
5440 perm_mask_even);
5441 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5442 (*result_chain)[j/2] = data_ref;
5443
5444 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
5445 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
5446 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5447 first_vect, second_vect,
5448 perm_mask_odd);
5449 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5450 (*result_chain)[j/2+length/2] = data_ref;
5451 }
5452 memcpy (dr_chain.address (), result_chain->address (),
5453 length * sizeof (tree));
5454 }
5455 }
5456 }
5457
5458 /* Function vect_shift_permute_load_chain.
5459
5460 Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
5461 a sequence of stmts to reorder the input data accordingly.
5462 Return the final references for loads in RESULT_CHAIN.
5463 Return true if successful, false otherwise.
5464
5465 E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
5466 The input is 3 vectors each containing 8 elements. We assign a
5467 number to each element; the input sequence is:
5468
5469 1st vec: 0 1 2 3 4 5 6 7
5470 2nd vec: 8 9 10 11 12 13 14 15
5471 3rd vec: 16 17 18 19 20 21 22 23
5472
5473 The output sequence should be:
5474
5475 1st vec: 0 3 6 9 12 15 18 21
5476 2nd vec: 1 4 7 10 13 16 19 22
5477 3rd vec: 2 5 8 11 14 17 20 23
5478
5479 We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
5480
5481 First we shuffle all 3 vectors to get correct elements order:
5482
5483 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
5484 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
5485 3rd vec: (16 19 22) (17 20 23) (18 21)
5486
5487 Next we concatenate and shift the vectors 3 times:
5488
5489 1st step:
5490 shift right by 6 the concatenation of:
5491 "1st vec" and "2nd vec"
5492 ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
5493 "2nd vec" and "3rd vec"
5494 ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
5495 "3rd vec" and "1st vec"
5496 (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
5497 | New vectors |
5498
5499 So that now new vectors are:
5500
5501 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
5502 2nd vec: (10 13) (16 19 22) (17 20 23)
5503 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
5504
5505 2nd step:
5506 shift right by 5 the concatenation of:
5507 "1st vec" and "3rd vec"
5508 ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
5509 "2nd vec" and "1st vec"
5510 (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
5511 "3rd vec" and "2nd vec"
5512 (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
5513 | New vectors |
5514
5515 So that now new vectors are:
5516
5517 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
5518 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
5519 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
5520
5521 3rd step:
5522 shift right by 5 the concatenation of:
5523 "1st vec" and "1st vec"
5524 ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
5525 shift right by 3 the concatenation of:
5526 "2nd vec" and "2nd vec"
5527 (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
5528 | New vectors |
5529
5530 So that now all vectors are READY:
5531 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
5532 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
5533 3rd vec: ( 1 4 7) (10 13) (16 19 22)
5534
5535 This algorithm is faster than the one in vect_permute_load_chain if:
5536 1. "shift of a concatenation" is faster than general permutation.
5537 This is usually so.
5538 2. The TARGET machine can't execute vector instructions in parallel.
5539 This is because each step of the algorithm depends on the previous one.
5540 The algorithm in vect_permute_load_chain is much more parallel.
5541
5542 The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
5543 */
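/* A further note (an informal sketch of how the "shifts" above map onto
   gimple, not an additional transformation): a "shift right by S elements
   of the concatenation of A and B" is emitted as a single
   VEC_PERM_EXPR <A, B, {S, S+1, ..., S+nelt-1}>, i.e. an nelt-wide window
   into the 2*nelt-element concatenation.  For vector length 8 the first
   step above (shift right by 6) therefore uses the selector
   {6 7 8 9 10 11 12 13}, which is shift1_mask in the LENGTH == 3 branch
   below.  */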
5544
5545 static bool
5546 vect_shift_permute_load_chain (vec<tree> dr_chain,
5547 unsigned int length,
5548 gimple *stmt,
5549 gimple_stmt_iterator *gsi,
5550 vec<tree> *result_chain)
5551 {
5552 tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
5553 tree perm2_mask1, perm2_mask2, perm3_mask;
5554 tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
5555 gimple *perm_stmt;
5556
5557 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5558 unsigned int i;
5559 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
5560 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5561 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5562
5563 unsigned HOST_WIDE_INT vf;
5564 if (!LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&vf))
5565 /* Not supported for variable-length vectors. */
5566 return false;
5567
5568 vec_perm_builder sel (nelt, nelt, 1);
5569 sel.quick_grow (nelt);
5570
5571 result_chain->quick_grow (length);
5572 memcpy (result_chain->address (), dr_chain.address (),
5573 length * sizeof (tree));
5574
5575 if (pow2p_hwi (length) && vf > 4)
5576 {
5577 unsigned int j, log_length = exact_log2 (length);
5578 for (i = 0; i < nelt / 2; ++i)
5579 sel[i] = i * 2;
5580 for (i = 0; i < nelt / 2; ++i)
5581 sel[nelt / 2 + i] = i * 2 + 1;
5582 vec_perm_indices indices (sel, 2, nelt);
5583 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
5584 {
5585 if (dump_enabled_p ())
5586 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5587 "shuffle of 2 fields structure is not "
5588 "supported by target\n");
5589 return false;
5590 }
5591 perm2_mask1 = vect_gen_perm_mask_checked (vectype, indices);
5592
5593 for (i = 0; i < nelt / 2; ++i)
5594 sel[i] = i * 2 + 1;
5595 for (i = 0; i < nelt / 2; ++i)
5596 sel[nelt / 2 + i] = i * 2;
5597 indices.new_vector (sel, 2, nelt);
5598 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
5599 {
5600 if (dump_enabled_p ())
5601 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5602 "shuffle of 2 fields structure is not "
5603 "supported by target\n");
5604 return false;
5605 }
5606 perm2_mask2 = vect_gen_perm_mask_checked (vectype, indices);
5607
5608 /* Generating permutation constant to shift all elements.
5609 For vector length 8 it is {4 5 6 7 8 9 10 11}. */
5610 for (i = 0; i < nelt; i++)
5611 sel[i] = nelt / 2 + i;
5612 indices.new_vector (sel, 2, nelt);
5613 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
5614 {
5615 if (dump_enabled_p ())
5616 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5617 "shift permutation is not supported by target\n");
5618 return false;
5619 }
5620 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
5621
5622 /* Generating permutation constant to select a vector from 2 vectors.
5623 For vector length 8 it is {0 1 2 3 12 13 14 15}. */
5624 for (i = 0; i < nelt / 2; i++)
5625 sel[i] = i;
5626 for (i = nelt / 2; i < nelt; i++)
5627 sel[i] = nelt + i;
5628 indices.new_vector (sel, 2, nelt);
5629 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
5630 {
5631 if (dump_enabled_p ())
5632 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5633 "select is not supported by target\n");
5634 return false;
5635 }
5636 select_mask = vect_gen_perm_mask_checked (vectype, indices);
5637
5638 for (i = 0; i < log_length; i++)
5639 {
5640 for (j = 0; j < length; j += 2)
5641 {
5642 first_vect = dr_chain[j];
5643 second_vect = dr_chain[j + 1];
5644
5645 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
5646 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5647 first_vect, first_vect,
5648 perm2_mask1);
5649 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5650 vect[0] = data_ref;
5651
5652 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
5653 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5654 second_vect, second_vect,
5655 perm2_mask2);
5656 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5657 vect[1] = data_ref;
5658
5659 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
5660 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5661 vect[0], vect[1], shift1_mask);
5662 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5663 (*result_chain)[j/2 + length/2] = data_ref;
5664
5665 data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
5666 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5667 vect[0], vect[1], select_mask);
5668 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5669 (*result_chain)[j/2] = data_ref;
5670 }
5671 memcpy (dr_chain.address (), result_chain->address (),
5672 length * sizeof (tree));
5673 }
5674 return true;
5675 }
5676 if (length == 3 && vf > 2)
5677 {
5678 unsigned int k = 0, l = 0;
5679
5680 /* Generating permutation constant to get all elements in the right order.
5681 For vector length 8 it is {0 3 6 1 4 7 2 5}. */
5682 for (i = 0; i < nelt; i++)
5683 {
5684 if (3 * k + (l % 3) >= nelt)
5685 {
5686 k = 0;
5687 l += (3 - (nelt % 3));
5688 }
5689 sel[i] = 3 * k + (l % 3);
5690 k++;
5691 }
5692 vec_perm_indices indices (sel, 2, nelt);
5693 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
5694 {
5695 if (dump_enabled_p ())
5696 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5697 "shuffle of 3 fields structure is not "
5698 "supported by target\n");
5699 return false;
5700 }
5701 perm3_mask = vect_gen_perm_mask_checked (vectype, indices);
5702
5703 /* Generating permutation constant to shift all elements.
5704 For vector length 8 it is {6 7 8 9 10 11 12 13}. */
5705 for (i = 0; i < nelt; i++)
5706 sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
5707 indices.new_vector (sel, 2, nelt);
5708 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
5709 {
5710 if (dump_enabled_p ())
5711 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5712 "shift permutation is not supported by target\n");
5713 return false;
5714 }
5715 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
5716
5717 /* Generating permutation constant to shift all elements.
5718 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
5719 for (i = 0; i < nelt; i++)
5720 sel[i] = 2 * (nelt / 3) + 1 + i;
5721 indices.new_vector (sel, 2, nelt);
5722 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
5723 {
5724 if (dump_enabled_p ())
5725 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5726 "shift permutation is not supported by target\n");
5727 return false;
5728 }
5729 shift2_mask = vect_gen_perm_mask_checked (vectype, indices);
5730
5731 /* Generating permutation constant to shift all elements.
5732 For vector length 8 it is {3 4 5 6 7 8 9 10}. */
5733 for (i = 0; i < nelt; i++)
5734 sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
5735 indices.new_vector (sel, 2, nelt);
5736 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
5737 {
5738 if (dump_enabled_p ())
5739 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5740 "shift permutation is not supported by target\n");
5741 return false;
5742 }
5743 shift3_mask = vect_gen_perm_mask_checked (vectype, indices);
5744
5745 /* Generating permutation constant to shift all elements.
5746 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
5747 for (i = 0; i < nelt; i++)
5748 sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
5749 indices.new_vector (sel, 2, nelt);
5750 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
5751 {
5752 if (dump_enabled_p ())
5753 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5754 "shift permutation is not supported by target\n");
5755 return false;
5756 }
5757 shift4_mask = vect_gen_perm_mask_checked (vectype, indices);
5758
5759 for (k = 0; k < 3; k++)
5760 {
5761 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
5762 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5763 dr_chain[k], dr_chain[k],
5764 perm3_mask);
5765 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5766 vect[k] = data_ref;
5767 }
5768
5769 for (k = 0; k < 3; k++)
5770 {
5771 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
5772 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5773 vect[k % 3], vect[(k + 1) % 3],
5774 shift1_mask);
5775 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5776 vect_shift[k] = data_ref;
5777 }
5778
5779 for (k = 0; k < 3; k++)
5780 {
5781 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
5782 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5783 vect_shift[(4 - k) % 3],
5784 vect_shift[(3 - k) % 3],
5785 shift2_mask);
5786 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5787 vect[k] = data_ref;
5788 }
5789
5790 (*result_chain)[3 - (nelt % 3)] = vect[2];
5791
5792 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
5793 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
5794 vect[0], shift3_mask);
5795 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5796 (*result_chain)[nelt % 3] = data_ref;
5797
5798 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
5799 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
5800 vect[1], shift4_mask);
5801 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5802 (*result_chain)[0] = data_ref;
5803 return true;
5804 }
5805 return false;
5806 }
5807
5808 /* Function vect_transform_grouped_load.
5809
5810 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
5811 to perform their permutation and ascribe the resulting vectorized statements to
5812 the scalar statements.
5813 */
5814
5815 void
5816 vect_transform_grouped_load (gimple *stmt, vec<tree> dr_chain, int size,
5817 gimple_stmt_iterator *gsi)
5818 {
5819 machine_mode mode;
5820 vec<tree> result_chain = vNULL;
5821
5822 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
5823 RESULT_CHAIN is the output of vect_permute_load_chain; it contains permuted
5824 vectors that are ready for vector computation. */
5825 result_chain.create (size);
5826
5827 /* If the reassociation width for the vector type is 2 or greater, the target
5828 machine can execute 2 or more vector instructions in parallel. Otherwise try
5829 to get the chain for the load group using vect_shift_permute_load_chain. */
5830 mode = TYPE_MODE (STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)));
5831 if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
5832 || pow2p_hwi (size)
5833 || !vect_shift_permute_load_chain (dr_chain, size, stmt,
5834 gsi, &result_chain))
5835 vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain);
5836 vect_record_grouped_load_vectors (stmt, result_chain);
5837 result_chain.release ();
5838 }
5839
5840 /* RESULT_CHAIN contains the output of a group of grouped loads that were
5841 generated as part of the vectorization of STMT. Assign the statement
5842 for each vector to the associated scalar statement. */
5843
5844 void
5845 vect_record_grouped_load_vectors (gimple *stmt, vec<tree> result_chain)
5846 {
5847 gimple *first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
5848 gimple *next_stmt, *new_stmt;
5849 unsigned int i, gap_count;
5850 tree tmp_data_ref;
5851
5852 /* Put a permuted data-ref in the VECTORIZED_STMT field.
5853 Since we scan the chain starting from its first node, their order
5854 corresponds to the order of data-refs in RESULT_CHAIN. */
5855 next_stmt = first_stmt;
5856 gap_count = 1;
5857 FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
5858 {
5859 if (!next_stmt)
5860 break;
5861
5862 /* Skip the gaps. Loads created for the gaps will be removed by the dead
5863 code elimination pass later. No need to check for the first stmt in
5864 the group, since it always exists.
5865 GROUP_GAP is the number of steps in elements from the previous
5866 access (if there is no gap GROUP_GAP is 1). We skip loads that
5867 correspond to the gaps. */
5868 if (next_stmt != first_stmt
5869 && gap_count < GROUP_GAP (vinfo_for_stmt (next_stmt)))
5870 {
5871 gap_count++;
5872 continue;
5873 }
5874
5875 while (next_stmt)
5876 {
5877 new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
5878 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
5879 copies, and we put the new vector statement in the first available
5880 RELATED_STMT. */
5881 if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
5882 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
5883 else
5884 {
5885 if (!GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5886 {
5887 gimple *prev_stmt =
5888 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
5889 gimple *rel_stmt =
5890 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt));
5891 while (rel_stmt)
5892 {
5893 prev_stmt = rel_stmt;
5894 rel_stmt =
5895 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
5896 }
5897
5898 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) =
5899 new_stmt;
5900 }
5901 }
5902
5903 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
5904 gap_count = 1;
5905 /* If NEXT_STMT accesses the same DR as the previous statement,
5906 put the same TMP_DATA_REF as its vectorized statement; otherwise
5907 get the next data-ref from RESULT_CHAIN. */
5908 if (!next_stmt || !GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5909 break;
5910 }
5911 }
5912 }
5913
5914 /* Function vect_can_force_dr_alignment_p.
5915
5916 Returns whether the alignment of a DECL can be forced to be aligned
5917 on an ALIGNMENT-bit boundary. */
5918
5919 bool
5920 vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
5921 {
5922 if (!VAR_P (decl))
5923 return false;
5924
5925 if (decl_in_symtab_p (decl)
5926 && !symtab_node::get (decl)->can_increase_alignment_p ())
5927 return false;
5928
5929 if (TREE_STATIC (decl))
5930 return (alignment <= MAX_OFILE_ALIGNMENT);
5931 else
5932 return (alignment <= MAX_STACK_ALIGNMENT);
5933 }
5934
5935
5936 /* Return whether the data reference DR is supported with respect to its
5937 alignment.
5938 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
5939 if it is aligned, i.e., check if it is possible to vectorize it with different
5940 alignment. */
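/* In outline (an informal summary of the decision below, not a new rule):

     aligned access (and !CHECK_ALIGNED_ACCESSES)  -> dr_aligned
     IFN_MASK_LOAD / IFN_MASK_STORE                -> dr_unaligned_supported
     read, vec_realign_load available              -> dr_explicit_realign or
                                                      dr_explicit_realign_optimized
     target supports the misaligned access         -> dr_unaligned_supported
     otherwise                                     -> dr_unaligned_unsupported  */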
5941
5942 enum dr_alignment_support
5943 vect_supportable_dr_alignment (struct data_reference *dr,
5944 bool check_aligned_accesses)
5945 {
5946 gimple *stmt = DR_STMT (dr);
5947 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5948 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5949 machine_mode mode = TYPE_MODE (vectype);
5950 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5951 struct loop *vect_loop = NULL;
5952 bool nested_in_vect_loop = false;
5953
5954 if (aligned_access_p (dr) && !check_aligned_accesses)
5955 return dr_aligned;
5956
5957 /* For now assume all conditional loads/stores support unaligned
5958 access without any special code. */
5959 if (is_gimple_call (stmt)
5960 && gimple_call_internal_p (stmt)
5961 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
5962 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
5963 return dr_unaligned_supported;
5964
5965 if (loop_vinfo)
5966 {
5967 vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
5968 nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
5969 }
5970
5971 /* Possibly unaligned access. */
5972
5973 /* We can choose between using the implicit realignment scheme (generating
5974 a misaligned_move stmt) and the explicit realignment scheme (generating
5975 aligned loads with a REALIGN_LOAD). There are two variants of the
5976 explicit realignment scheme: optimized and unoptimized.
5977 We can optimize the realignment only if the step between consecutive
5978 vector loads is equal to the vector size. Since the vector memory
5979 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
5980 is guaranteed that the misalignment amount remains the same throughout the
5981 execution of the vectorized loop. Therefore, we can create the
5982 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
5983 at the loop preheader.
5984
5985 However, in the case of outer-loop vectorization, when vectorizing a
5986 memory access in the inner-loop nested within the LOOP that is now being
5987 vectorized, while it is guaranteed that the misalignment of the
5988 vectorized memory access will remain the same in different outer-loop
5989 iterations, it is *not* guaranteed that it will remain the same throughout
5990 the execution of the inner-loop. This is because the inner-loop advances
5991 with the original scalar step (and not in steps of VS). If the inner-loop
5992 step happens to be a multiple of VS, then the misalignment remains fixed
5993 and we can use the optimized realignment scheme. For example:
5994
5995 for (i=0; i<N; i++)
5996 for (j=0; j<M; j++)
5997 s += a[i+j];
5998
5999 When vectorizing the i-loop in the above example, the step between
6000 consecutive vector loads is 1, and so the misalignment does not remain
6001 fixed across the execution of the inner-loop, and the realignment cannot
6002 be optimized (as illustrated in the following pseudo vectorized loop):
6003
6004 for (i=0; i<N; i+=4)
6005 for (j=0; j<M; j++){
6006 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
6007 // when j is {0,1,2,3,4,5,6,7,...} respectively.
6008 // (assuming that we start from an aligned address).
6009 }
6010
6011 We therefore have to use the unoptimized realignment scheme:
6012
6013 for (i=0; i<N; i+=4)
6014 for (j=k; j<M; j+=4)
6015 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
6016 // that the misalignment of the initial address is
6017 // 0).
6018
6019 The loop can then be vectorized as follows:
6020
6021 for (k=0; k<4; k++){
6022 rt = get_realignment_token (&vp[k]);
6023 for (i=0; i<N; i+=4){
6024 v1 = vp[i+k];
6025 for (j=k; j<M; j+=4){
6026 v2 = vp[i+j+VS-1];
6027 va = REALIGN_LOAD <v1,v2,rt>;
6028 vs += va;
6029 v1 = v2;
6030 }
6031 }
6032 } */
6033
6034 if (DR_IS_READ (dr))
6035 {
6036 bool is_packed = false;
6037 tree type = (TREE_TYPE (DR_REF (dr)));
6038
6039 if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
6040 && (!targetm.vectorize.builtin_mask_for_load
6041 || targetm.vectorize.builtin_mask_for_load ()))
6042 {
6043 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6044
6045 /* If we are doing SLP then the accesses need not have the
6046 same alignment; instead it depends on the SLP group size. */
6047 if (loop_vinfo
6048 && STMT_SLP_TYPE (stmt_info)
6049 && !multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6050 * GROUP_SIZE (vinfo_for_stmt
6051 (GROUP_FIRST_ELEMENT (stmt_info))),
6052 TYPE_VECTOR_SUBPARTS (vectype)))
6053 ;
6054 else if (!loop_vinfo
6055 || (nested_in_vect_loop
6056 && (TREE_INT_CST_LOW (DR_STEP (dr))
6057 != GET_MODE_SIZE (TYPE_MODE (vectype)))))
6058 return dr_explicit_realign;
6059 else
6060 return dr_explicit_realign_optimized;
6061 }
6062 if (!known_alignment_for_access_p (dr))
6063 is_packed = not_size_aligned (DR_REF (dr));
6064
6065 if (targetm.vectorize.support_vector_misalignment
6066 (mode, type, DR_MISALIGNMENT (dr), is_packed))
6067 /* Can't software pipeline the loads, but can at least do them. */
6068 return dr_unaligned_supported;
6069 }
6070 else
6071 {
6072 bool is_packed = false;
6073 tree type = (TREE_TYPE (DR_REF (dr)));
6074
6075 if (!known_alignment_for_access_p (dr))
6076 is_packed = not_size_aligned (DR_REF (dr));
6077
6078 if (targetm.vectorize.support_vector_misalignment
6079 (mode, type, DR_MISALIGNMENT (dr), is_packed))
6080 return dr_unaligned_supported;
6081 }
6082
6083 /* Unsupported. */
6084 return dr_unaligned_unsupported;
6085 }