1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2 Copyright (C) 2003-2016 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "predict.h"
31 #include "tm_p.h"
32 #include "ssa.h"
33 #include "optabs-tree.h"
34 #include "cgraph.h"
35 #include "dumpfile.h"
36 #include "alias.h"
37 #include "fold-const.h"
38 #include "stor-layout.h"
39 #include "tree-eh.h"
40 #include "gimplify.h"
41 #include "gimple-iterator.h"
42 #include "gimplify-me.h"
43 #include "tree-ssa-loop-ivopts.h"
44 #include "tree-ssa-loop-manip.h"
45 #include "tree-ssa-loop.h"
46 #include "cfgloop.h"
47 #include "tree-scalar-evolution.h"
48 #include "tree-vectorizer.h"
49 #include "expr.h"
50 #include "builtins.h"
51 #include "params.h"
52
53 /* Return true if load- or store-lanes optab OPTAB is implemented for
54 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
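/* Editorial example (illustrative only, assuming a target with 128-bit
   V4SI vectors): for COUNT == 2 we ask for an integer mode of
   2 * 128 = 256 bits and then check whether the target implements the
   vec_load_lanes / vec_store_lanes optab converting between that array
   mode and V4SImode; if either the array mode or the optab handler is
   missing, load/store-lanes cannot be used for the group.  */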
55
56 static bool
57 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
58 tree vectype, unsigned HOST_WIDE_INT count)
59 {
60 machine_mode mode, array_mode;
61 bool limit_p;
62
63 mode = TYPE_MODE (vectype);
64 limit_p = !targetm.array_mode_supported_p (mode, count);
65 array_mode = mode_for_size (count * GET_MODE_BITSIZE (mode),
66 MODE_INT, limit_p);
67
68 if (array_mode == BLKmode)
69 {
70 if (dump_enabled_p ())
71 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
72 "no array mode for %s[" HOST_WIDE_INT_PRINT_DEC "]\n",
73 GET_MODE_NAME (mode), count);
74 return false;
75 }
76
77 if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
78 {
79 if (dump_enabled_p ())
80 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
81 "cannot use %s<%s><%s>\n", name,
82 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
83 return false;
84 }
85
86 if (dump_enabled_p ())
87 dump_printf_loc (MSG_NOTE, vect_location,
88 "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
89 GET_MODE_NAME (mode));
90
91 return true;
92 }
93
94
95 /* Return the smallest scalar type used in STMT.
96 This is used to determine the vectype of the stmt. We generally set the
97 vectype according to the type of the result (lhs). For stmts whose
98 result-type differs from the type of the arguments (e.g., demotion or
99 promotion), vectype will be reset appropriately (later). Note that we have
100 to visit the smallest datatype in this function, because that determines the
101 VF. If the smallest datatype in the loop is present only as the rhs of a
102 promotion operation, we would otherwise miss it.
103 Such a case, where a variable of this datatype does not appear in the lhs
104 anywhere in the loop, can only occur if it's an invariant: e.g.:
105 'int_x = (int) short_inv', which we'd expect to have been optimized away by
106 invariant motion. However, we cannot rely on invariant motion to always
107 take invariants out of the loop, and so in the case of promotion we also
108 have to check the rhs.
109 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
110 types. */
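/* Editorial example (not part of the original comment): for a promotion
   such as 'int_a = (int) short_b' the expression type is int (4 bytes)
   but the rhs type is short (2 bytes), so the short type is returned;
   the vectorization factor has to be derived from the smaller element.  */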
111
112 tree
113 vect_get_smallest_scalar_type (gimple *stmt, HOST_WIDE_INT *lhs_size_unit,
114 HOST_WIDE_INT *rhs_size_unit)
115 {
116 tree scalar_type = gimple_expr_type (stmt);
117 HOST_WIDE_INT lhs, rhs;
118
119 lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
120
121 if (is_gimple_assign (stmt)
122 && (gimple_assign_cast_p (stmt)
123 || gimple_assign_rhs_code (stmt) == WIDEN_MULT_EXPR
124 || gimple_assign_rhs_code (stmt) == WIDEN_LSHIFT_EXPR
125 || gimple_assign_rhs_code (stmt) == FLOAT_EXPR))
126 {
127 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
128
129 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
130 if (rhs < lhs)
131 scalar_type = rhs_type;
132 }
133
134 *lhs_size_unit = lhs;
135 *rhs_size_unit = rhs;
136 return scalar_type;
137 }
138
139
140 /* Insert DDR into the LOOP_VINFO list of ddrs that may alias and need to be
141 tested at run-time. Return TRUE if DDR was successfully inserted.
142 Return FALSE if versioning is not supported. */
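/* Editorial sketch of the run-time check this leads to (simplified):
   for two data-refs A and B with constant step, the vectorized loop is
   guarded by a segment-overlap test roughly of the form
     addr_A + segment_len_A <= addr_B || addr_B + segment_len_B <= addr_A
   and the scalar loop is used when the test fails.  */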
143
144 static bool
145 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
146 {
147 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
148
149 if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
150 return false;
151
152 if (dump_enabled_p ())
153 {
154 dump_printf_loc (MSG_NOTE, vect_location,
155 "mark for run-time aliasing test between ");
156 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr)));
157 dump_printf (MSG_NOTE, " and ");
158 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr)));
159 dump_printf (MSG_NOTE, "\n");
160 }
161
162 if (optimize_loop_nest_for_size_p (loop))
163 {
164 if (dump_enabled_p ())
165 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
166 "versioning not supported when optimizing"
167 " for size.\n");
168 return false;
169 }
170
171 /* FORNOW: We don't support versioning with outer-loop vectorization. */
172 if (loop->inner)
173 {
174 if (dump_enabled_p ())
175 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
176 "versioning not yet supported for outer-loops.\n");
177 return false;
178 }
179
180 /* FORNOW: We don't support creating runtime alias tests for non-constant
181 step. */
182 if (TREE_CODE (DR_STEP (DDR_A (ddr))) != INTEGER_CST
183 || TREE_CODE (DR_STEP (DDR_B (ddr))) != INTEGER_CST)
184 {
185 if (dump_enabled_p ())
186 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
187 "versioning not yet supported for non-constant "
188 "step\n");
189 return false;
190 }
191
192 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
193 return true;
194 }
195
196
197 /* Function vect_analyze_data_ref_dependence.
198
199 Return TRUE if there (might) exist a dependence between a memory-reference
200 DRA and a memory-reference DRB. When the dependence can instead be
201 checked at run-time by versioning for alias, return FALSE. Adjust *MAX_VF
202 according to the data dependence. */
203
204 static bool
205 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
206 loop_vec_info loop_vinfo, int *max_vf)
207 {
208 unsigned int i;
209 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
210 struct data_reference *dra = DDR_A (ddr);
211 struct data_reference *drb = DDR_B (ddr);
212 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
213 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
214 lambda_vector dist_v;
215 unsigned int loop_depth;
216
217 /* In loop analysis all data references should be vectorizable. */
218 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
219 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
220 gcc_unreachable ();
221
222 /* Independent data accesses. */
223 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
224 return false;
225
226 if (dra == drb
227 || (DR_IS_READ (dra) && DR_IS_READ (drb)))
228 return false;
229
230 /* We do not have to consider dependences between accesses that belong
231 to the same group. */
232 if (GROUP_FIRST_ELEMENT (stmtinfo_a)
233 && GROUP_FIRST_ELEMENT (stmtinfo_a) == GROUP_FIRST_ELEMENT (stmtinfo_b))
234 return false;
235
236 /* Even if we have an anti-dependence then, as the vectorized loop covers at
237 least two scalar iterations, there is always also a true dependence.
238 As the vectorizer does not re-order loads and stores we can ignore
239 the anti-dependence if TBAA can disambiguate both DRs similar to the
240 case with known negative distance anti-dependences (positive
241 distance anti-dependences would violate TBAA constraints). */
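  /* Editorial example: a read through an 'int *' and a write through a
     'float *' have non-conflicting alias sets, so even if the dependence
     analyzer reports an anti-dependence between them it can be ignored
     here.  */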
242 if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
243 || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
244 && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
245 get_alias_set (DR_REF (drb))))
246 return false;
247
248 /* Unknown data dependence. */
249 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
250 {
251 /* If user asserted safelen consecutive iterations can be
252 executed concurrently, assume independence. */
253 if (loop->safelen >= 2)
254 {
255 if (loop->safelen < *max_vf)
256 *max_vf = loop->safelen;
257 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
258 return false;
259 }
260
261 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
262 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
263 {
264 if (dump_enabled_p ())
265 {
266 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
267 "versioning for alias not supported for: "
268 "can't determine dependence between ");
269 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
270 DR_REF (dra));
271 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
272 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
273 DR_REF (drb));
274 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
275 }
276 return true;
277 }
278
279 if (dump_enabled_p ())
280 {
281 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
282 "versioning for alias required: "
283 "can't determine dependence between ");
284 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
285 DR_REF (dra));
286 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
287 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
288 DR_REF (drb));
289 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
290 }
291
292 /* Add to list of ddrs that need to be tested at run-time. */
293 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
294 }
295
296 /* Known data dependence. */
297 if (DDR_NUM_DIST_VECTS (ddr) == 0)
298 {
299 /* If user asserted safelen consecutive iterations can be
300 executed concurrently, assume independence. */
301 if (loop->safelen >= 2)
302 {
303 if (loop->safelen < *max_vf)
304 *max_vf = loop->safelen;
305 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
306 return false;
307 }
308
309 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
310 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
311 {
312 if (dump_enabled_p ())
313 {
314 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
315 "versioning for alias not supported for: "
316 "bad dist vector for ");
317 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
318 DR_REF (dra));
319 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
320 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
321 DR_REF (drb));
322 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
323 }
324 return true;
325 }
326
327 if (dump_enabled_p ())
328 {
329 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
330 "versioning for alias required: "
331 "bad dist vector for ");
332 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
333 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
334 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
335 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
336 }
337 /* Add to list of ddrs that need to be tested at run-time. */
338 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
339 }
340
341 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
342 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
343 {
344 int dist = dist_v[loop_depth];
345
346 if (dump_enabled_p ())
347 dump_printf_loc (MSG_NOTE, vect_location,
348 "dependence distance = %d.\n", dist);
349
350 if (dist == 0)
351 {
352 if (dump_enabled_p ())
353 {
354 dump_printf_loc (MSG_NOTE, vect_location,
355 "dependence distance == 0 between ");
356 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
357 dump_printf (MSG_NOTE, " and ");
358 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
359 dump_printf (MSG_NOTE, "\n");
360 }
361
362 /* When we perform grouped accesses and perform implicit CSE
363 by detecting equal accesses and doing disambiguation with
364 runtime alias tests like for
365 .. = a[i];
366 .. = a[i+1];
367 a[i] = ..;
368 a[i+1] = ..;
369 *p = ..;
370 .. = a[i];
371 .. = a[i+1];
372 where we will end up loading { a[i], a[i+1] } once, make
373 sure that inserting group loads before the first load and
374 stores after the last store will do the right thing.
375 Similar for groups like
376 a[i] = ...;
377 ... = a[i];
378 a[i+1] = ...;
379 where loads from the group interleave with the store. */
380 if (STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
381 || STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
382 {
383 gimple *earlier_stmt;
384 earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
385 if (DR_IS_WRITE
386 (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
387 {
388 if (dump_enabled_p ())
389 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
390 "READ_WRITE dependence in interleaving."
391 "\n");
392 return true;
393 }
394 }
395
396 continue;
397 }
398
399 if (dist > 0 && DDR_REVERSED_P (ddr))
400 {
401 /* If DDR_REVERSED_P the order of the data-refs in DDR was
402 reversed (to make distance vector positive), and the actual
403 distance is negative. */
404 if (dump_enabled_p ())
405 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
406 "dependence distance negative.\n");
407 /* Record a negative dependence distance to later limit the
408 amount of stmt copying / unrolling we can perform.
409 Only need to handle read-after-write dependence. */
410 if (DR_IS_READ (drb)
411 && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
412 || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
413 STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
414 continue;
415 }
416
417 if (abs (dist) >= 2
418 && abs (dist) < *max_vf)
419 {
420 /* The dependence distance requires reduction of the maximal
421 vectorization factor. */
422 *max_vf = abs (dist);
423 if (dump_enabled_p ())
424 dump_printf_loc (MSG_NOTE, vect_location,
425 "adjusting maximal vectorization factor to %i\n",
426 *max_vf);
427 }
428
429 if (abs (dist) >= *max_vf)
430 {
431 /* Dependence distance does not create dependence, as far as
432 vectorization is concerned, in this case. */
433 if (dump_enabled_p ())
434 dump_printf_loc (MSG_NOTE, vect_location,
435 "dependence distance >= VF.\n");
436 continue;
437 }
438
439 if (dump_enabled_p ())
440 {
441 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
442 "not vectorized, possible dependence "
443 "between data-refs ");
444 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
445 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
446 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
447 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
448 }
449
450 return true;
451 }
452
453 return false;
454 }
455
456 /* Function vect_analyze_data_ref_dependences.
457
458 Examine all the data references in the loop, and make sure there do not
459 exist any data dependences between them. Set *MAX_VF according to
460 the maximum vectorization factor the data dependences allow. */
461
462 bool
463 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo, int *max_vf)
464 {
465 unsigned int i;
466 struct data_dependence_relation *ddr;
467
468 if (dump_enabled_p ())
469 dump_printf_loc (MSG_NOTE, vect_location,
470 "=== vect_analyze_data_ref_dependences ===\n");
471
472 LOOP_VINFO_DDRS (loop_vinfo)
473 .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
474 * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
475 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
476 /* We need read-read dependences to compute STMT_VINFO_SAME_ALIGN_REFS. */
477 if (!compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
478 &LOOP_VINFO_DDRS (loop_vinfo),
479 LOOP_VINFO_LOOP_NEST (loop_vinfo), true))
480 return false;
481
482 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
483 if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf))
484 return false;
485
486 return true;
487 }
488
489
490 /* Function vect_slp_analyze_data_ref_dependence.
491
492 Return TRUE if there (might) exist a dependence between a memory-reference
493 DRA and a memory-reference DRB in DDR. Unlike the loop variant there is
494 no run-time versioning for alias here, so an unknown dependence simply
495 makes the check fail. */
496
497 static bool
498 vect_slp_analyze_data_ref_dependence (struct data_dependence_relation *ddr)
499 {
500 struct data_reference *dra = DDR_A (ddr);
501 struct data_reference *drb = DDR_B (ddr);
502
503 /* We need to check dependences of statements marked as unvectorizable
504 as well; they can still prohibit vectorization. */
505
506 /* Independent data accesses. */
507 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
508 return false;
509
510 if (dra == drb)
511 return false;
512
513 /* Read-read is OK. */
514 if (DR_IS_READ (dra) && DR_IS_READ (drb))
515 return false;
516
517 /* If dra and drb are part of the same interleaving chain consider
518 them independent. */
519 if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (DR_STMT (dra)))
520 && (GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dra)))
521 == GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (drb)))))
522 return false;
523
524 /* Unknown data dependence. */
525 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
526 {
527 if (dump_enabled_p ())
528 {
529 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
530 "can't determine dependence between ");
531 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
532 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
533 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
534 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
535 }
536 }
537 else if (dump_enabled_p ())
538 {
539 dump_printf_loc (MSG_NOTE, vect_location,
540 "determined dependence between ");
541 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
542 dump_printf (MSG_NOTE, " and ");
543 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
544 dump_printf (MSG_NOTE, "\n");
545 }
546
547 return true;
548 }
549
550
551 /* Analyze dependences involved in the transform of SLP NODE. STORES
552 contains the vector of scalar stores of this instance when we are
553 disambiguating the loads, and LAST_STORE is the store they are sunk to. */
554
555 static bool
556 vect_slp_analyze_node_dependences (slp_instance instance, slp_tree node,
557 vec<gimple *> stores, gimple *last_store)
558 {
559 /* This walks over all stmts involved in the SLP load/store done
560 in NODE verifying we can sink them up to the last stmt in the
561 group. */
562 gimple *last_access = vect_find_last_scalar_stmt_in_slp (node);
563 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
564 {
565 gimple *access = SLP_TREE_SCALAR_STMTS (node)[k];
566 if (access == last_access)
567 continue;
568 data_reference *dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (access));
569 for (gimple_stmt_iterator gsi = gsi_for_stmt (access);
570 gsi_stmt (gsi) != last_access; gsi_next (&gsi))
571 {
572 gimple *stmt = gsi_stmt (gsi);
573 if (! gimple_vuse (stmt)
574 || (DR_IS_READ (dr_a) && ! gimple_vdef (stmt)))
575 continue;
576
577 /* If we couldn't record a (single) data reference for this
578 stmt we have to give up. */
579 /* ??? Here and below if dependence analysis fails we can resort
580 to the alias oracle which can handle more kinds of stmts. */
581 data_reference *dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
582 if (!dr_b)
583 return false;
584
585 /* If we run into a store of this same instance (we've just
586 marked those) then delay dependence checking until we run
587 into the last store because this is where it will have
588 been sunk to (and we verify if we can do that as well). */
589 if (gimple_visited_p (stmt))
590 {
591 if (stmt != last_store)
592 continue;
593 unsigned i;
594 gimple *store;
595 FOR_EACH_VEC_ELT (stores, i, store)
596 {
597 data_reference *store_dr
598 = STMT_VINFO_DATA_REF (vinfo_for_stmt (store));
599 ddr_p ddr = initialize_data_dependence_relation
600 (dr_a, store_dr, vNULL);
601 if (vect_slp_analyze_data_ref_dependence (ddr))
602 {
603 free_dependence_relation (ddr);
604 return false;
605 }
606 free_dependence_relation (ddr);
607 }
608 }
609
610 ddr_p ddr = initialize_data_dependence_relation (dr_a, dr_b, vNULL);
611 if (vect_slp_analyze_data_ref_dependence (ddr))
612 {
613 free_dependence_relation (ddr);
614 return false;
615 }
616 free_dependence_relation (ddr);
617 }
618 }
619 return true;
620 }
621
622
623 /* Function vect_slp_analyze_instance_dependence.
624
625 Examine the data references of SLP instance INSTANCE and make sure no
626 data dependence prevents sinking its loads and stores to the place the
627 vectorized statements will be emitted. */
628
629 bool
630 vect_slp_analyze_instance_dependence (slp_instance instance)
631 {
632 if (dump_enabled_p ())
633 dump_printf_loc (MSG_NOTE, vect_location,
634 "=== vect_slp_analyze_instance_dependence ===\n");
635
636 /* The stores of this instance are at the root of the SLP tree. */
637 slp_tree store = SLP_INSTANCE_TREE (instance);
638 if (! STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (store)[0])))
639 store = NULL;
640
641 /* Verify we can sink stores to the vectorized stmt insert location. */
642 gimple *last_store = NULL;
643 if (store)
644 {
645 if (! vect_slp_analyze_node_dependences (instance, store, vNULL, NULL))
646 return false;
647
648 /* Mark stores in this instance and remember the last one. */
649 last_store = vect_find_last_scalar_stmt_in_slp (store);
650 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
651 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], true);
652 }
653
654 bool res = true;
655
656 /* Verify we can sink loads to the vectorized stmt insert location,
657 special-casing stores of this instance. */
658 slp_tree load;
659 unsigned int i;
660 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load)
661 if (! vect_slp_analyze_node_dependences (instance, load,
662 store
663 ? SLP_TREE_SCALAR_STMTS (store)
664 : vNULL, last_store))
665 {
666 res = false;
667 break;
668 }
669
670 /* Unset the visited flag. */
671 if (store)
672 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
673 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], false);
674
675 return res;
676 }
677
678 /* Function vect_compute_data_ref_alignment
679
680 Compute the misalignment of the data reference DR.
681
682 Output:
683 1. If during the misalignment computation it is found that the data reference
684 cannot be vectorized then false is returned.
685 2. DR_MISALIGNMENT (DR) is defined.
686
687 FOR NOW: No analysis is actually performed. Misalignment is calculated
688 only for trivial cases. TODO. */
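/* Editorial sketch of the trivial case handled below (assumed numbers):
   with a 16-byte vector type, a base address known to be aligned to at
   least 16 bytes (DR_ALIGNED_TO >= 16) and a constant DR_INIT of 4, the
   recorded misalignment is 4 mod 16 == 4; if DR_ALIGNED_TO is smaller
   than the vector alignment or no constant offset is known, the
   misalignment is left at -1 (unknown).  */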
689
690 bool
691 vect_compute_data_ref_alignment (struct data_reference *dr)
692 {
693 gimple *stmt = DR_STMT (dr);
694 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
695 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
696 struct loop *loop = NULL;
697 tree ref = DR_REF (dr);
698 tree vectype;
699 tree base, base_addr;
700 tree misalign = NULL_TREE;
701 tree aligned_to;
702 tree step;
703 unsigned HOST_WIDE_INT alignment;
704
705 if (dump_enabled_p ())
706 dump_printf_loc (MSG_NOTE, vect_location,
707 "vect_compute_data_ref_alignment:\n");
708
709 if (loop_vinfo)
710 loop = LOOP_VINFO_LOOP (loop_vinfo);
711
712 /* Initialize misalignment to unknown. */
713 SET_DR_MISALIGNMENT (dr, -1);
714
715 if (tree_fits_shwi_p (DR_STEP (dr)))
716 misalign = DR_INIT (dr);
717 aligned_to = DR_ALIGNED_TO (dr);
718 base_addr = DR_BASE_ADDRESS (dr);
719 vectype = STMT_VINFO_VECTYPE (stmt_info);
720
721 /* In case the dataref is in an inner-loop of the loop that is being
722 vectorized (LOOP), we use the base and misalignment information
723 relative to the outer-loop (LOOP). This is ok only if the misalignment
724 stays the same throughout the execution of the inner-loop, which is why
725 we have to check that the stride of the dataref in the inner-loop is a
726 multiple of the vector size. */
727 if (loop && nested_in_vect_loop_p (loop, stmt))
728 {
729 tree step = DR_STEP (dr);
730
731 if (tree_fits_shwi_p (step)
732 && tree_to_shwi (step) % GET_MODE_SIZE (TYPE_MODE (vectype)) == 0)
733 {
734 if (dump_enabled_p ())
735 dump_printf_loc (MSG_NOTE, vect_location,
736 "inner step divides the vector-size.\n");
737 misalign = STMT_VINFO_DR_INIT (stmt_info);
738 aligned_to = STMT_VINFO_DR_ALIGNED_TO (stmt_info);
739 base_addr = STMT_VINFO_DR_BASE_ADDRESS (stmt_info);
740 }
741 else
742 {
743 if (dump_enabled_p ())
744 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
745 "inner step doesn't divide the vector-size.\n");
746 misalign = NULL_TREE;
747 }
748 }
749
750 /* Similarly we can only use base and misalignment information relative to
751 an innermost loop if the misalignment stays the same throughout the
752 execution of the loop. This is the case if the stride of the dataref
753 times the vectorization factor is a multiple of the vector size. */
754 else
755 {
756 tree step = DR_STEP (dr);
757 unsigned vf = loop ? LOOP_VINFO_VECT_FACTOR (loop_vinfo) : 1;
758
759 if (tree_fits_shwi_p (step)
760 && ((tree_to_shwi (step) * vf)
761 % GET_MODE_SIZE (TYPE_MODE (vectype)) != 0))
762 {
763 if (dump_enabled_p ())
764 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
765 "step doesn't divide the vector-size.\n");
766 misalign = NULL_TREE;
767 }
768 }
769
770 /* To look at alignment of the base we have to preserve an inner MEM_REF
771 as that carries alignment information of the actual access. */
772 base = ref;
773 while (handled_component_p (base))
774 base = TREE_OPERAND (base, 0);
775 if (TREE_CODE (base) == MEM_REF)
776 base = build2 (MEM_REF, TREE_TYPE (base), base_addr,
777 build_int_cst (TREE_TYPE (TREE_OPERAND (base, 1)), 0));
778 unsigned int base_alignment = get_object_alignment (base);
779
780 if (base_alignment >= TYPE_ALIGN (TREE_TYPE (vectype)))
781 DR_VECT_AUX (dr)->base_element_aligned = true;
782
783 alignment = TYPE_ALIGN_UNIT (vectype);
784
785 if ((compare_tree_int (aligned_to, alignment) < 0)
786 || !misalign)
787 {
788 if (dump_enabled_p ())
789 {
790 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
791 "Unknown alignment for access: ");
792 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
793 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
794 }
795 return true;
796 }
797
798 if (base_alignment < TYPE_ALIGN (vectype))
799 {
800 /* Strip an inner MEM_REF to a bare decl if possible. */
801 if (TREE_CODE (base) == MEM_REF
802 && integer_zerop (TREE_OPERAND (base, 1))
803 && TREE_CODE (TREE_OPERAND (base, 0)) == ADDR_EXPR)
804 base = TREE_OPERAND (TREE_OPERAND (base, 0), 0);
805
806 if (!vect_can_force_dr_alignment_p (base, TYPE_ALIGN (vectype)))
807 {
808 if (dump_enabled_p ())
809 {
810 dump_printf_loc (MSG_NOTE, vect_location,
811 "can't force alignment of ref: ");
812 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
813 dump_printf (MSG_NOTE, "\n");
814 }
815 return true;
816 }
817
818 /* Force the alignment of the decl.
819 NOTE: This is the only change to the code we make during
820 the analysis phase, before deciding to vectorize the loop. */
821 if (dump_enabled_p ())
822 {
823 dump_printf_loc (MSG_NOTE, vect_location, "force alignment of ");
824 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
825 dump_printf (MSG_NOTE, "\n");
826 }
827
828 DR_VECT_AUX (dr)->base_decl = base;
829 DR_VECT_AUX (dr)->base_misaligned = true;
830 DR_VECT_AUX (dr)->base_element_aligned = true;
831 }
832
833 if (loop && nested_in_vect_loop_p (loop, stmt))
834 step = STMT_VINFO_DR_STEP (stmt_info);
835 else
836 step = DR_STEP (dr);
837 /* If this is a backward running DR then the first access in the larger
838 vectype is actually N-1 elements before the address in the DR.
839 Adjust misalign accordingly. */
840 if (tree_int_cst_sgn (step) < 0)
841 {
842 tree offset = ssize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1);
843 /* DR_STEP(dr) is the same as -TYPE_SIZE of the scalar type,
844 otherwise we wouldn't be here. */
845 offset = fold_build2 (MULT_EXPR, ssizetype, offset, step);
846 /* PLUS because STEP was negative. */
847 misalign = size_binop (PLUS_EXPR, misalign, offset);
848 }
849
850 SET_DR_MISALIGNMENT (dr,
851 wi::mod_floor (misalign, alignment, SIGNED).to_uhwi ());
852
853 if (dump_enabled_p ())
854 {
855 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
856 "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr));
857 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
858 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
859 }
860
861 return true;
862 }
863
864
865 /* Function vect_update_misalignment_for_peel
866
867 DR - the data reference whose misalignment is to be adjusted.
868 DR_PEEL - the data reference whose misalignment is being made
869 zero in the vector loop by the peel.
870 NPEEL - the number of iterations in the peel loop if the misalignment
871 of DR_PEEL is known at compile time. */
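/* Editorial example (assuming 16-byte vectors and 4-byte elements): if
   DR_PEEL has misalignment 8, peeling NPEEL = 2 iterations makes it
   aligned; a DR with the same step and known misalignment 12 is then
   updated to (12 + 2 * 4) & 15 == 4, while a DR whose misalignment is
   unknown stays unknown (-1).  */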
872
873 static void
874 vect_update_misalignment_for_peel (struct data_reference *dr,
875 struct data_reference *dr_peel, int npeel)
876 {
877 unsigned int i;
878 vec<dr_p> same_align_drs;
879 struct data_reference *current_dr;
880 int dr_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
881 int dr_peel_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr_peel))));
882 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
883 stmt_vec_info peel_stmt_info = vinfo_for_stmt (DR_STMT (dr_peel));
884
885 /* For interleaved data accesses the step in the loop must be multiplied by
886 the size of the interleaving group. */
887 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
888 dr_size *= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)));
889 if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info))
890 dr_peel_size *= GROUP_SIZE (peel_stmt_info);
891
892 /* It can be assumed that the data refs with the same alignment as dr_peel
893 are aligned in the vector loop. */
894 same_align_drs
895 = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (DR_STMT (dr_peel)));
896 FOR_EACH_VEC_ELT (same_align_drs, i, current_dr)
897 {
898 if (current_dr != dr)
899 continue;
900 gcc_assert (DR_MISALIGNMENT (dr) / dr_size ==
901 DR_MISALIGNMENT (dr_peel) / dr_peel_size);
902 SET_DR_MISALIGNMENT (dr, 0);
903 return;
904 }
905
906 if (known_alignment_for_access_p (dr)
907 && known_alignment_for_access_p (dr_peel))
908 {
909 bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
910 int misal = DR_MISALIGNMENT (dr);
911 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
912 misal += negative ? -npeel * dr_size : npeel * dr_size;
913 misal &= (TYPE_ALIGN (vectype) / BITS_PER_UNIT) - 1;
914 SET_DR_MISALIGNMENT (dr, misal);
915 return;
916 }
917
918 if (dump_enabled_p ())
919 dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment to -1.\n");
920 SET_DR_MISALIGNMENT (dr, -1);
921 }
922
923
924 /* Function verify_data_ref_alignment
925
926 Return TRUE if DR can be handled with respect to alignment. */
927
928 static bool
929 verify_data_ref_alignment (data_reference_p dr)
930 {
931 enum dr_alignment_support supportable_dr_alignment
932 = vect_supportable_dr_alignment (dr, false);
933 if (!supportable_dr_alignment)
934 {
935 if (dump_enabled_p ())
936 {
937 if (DR_IS_READ (dr))
938 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
939 "not vectorized: unsupported unaligned load.");
940 else
941 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
942 "not vectorized: unsupported unaligned "
943 "store.");
944
945 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
946 DR_REF (dr));
947 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
948 }
949 return false;
950 }
951
952 if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
953 dump_printf_loc (MSG_NOTE, vect_location,
954 "Vectorizing an unaligned access.\n");
955
956 return true;
957 }
958
959 /* Function vect_verify_datarefs_alignment
960
961 Return TRUE if all data references in the loop can be
962 handled with respect to alignment. */
963
964 bool
965 vect_verify_datarefs_alignment (loop_vec_info vinfo)
966 {
967 vec<data_reference_p> datarefs = vinfo->datarefs;
968 struct data_reference *dr;
969 unsigned int i;
970
971 FOR_EACH_VEC_ELT (datarefs, i, dr)
972 {
973 gimple *stmt = DR_STMT (dr);
974 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
975
976 if (!STMT_VINFO_RELEVANT_P (stmt_info))
977 continue;
978
979 /* For interleaving, only the alignment of the first access matters. */
980 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
981 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
982 continue;
983
984 /* Strided accesses perform only component accesses, alignment is
985 irrelevant for them. */
986 if (STMT_VINFO_STRIDED_P (stmt_info)
987 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
988 continue;
989
990 if (! verify_data_ref_alignment (dr))
991 return false;
992 }
993
994 return true;
995 }
996
997 /* Given a memory reference EXP, return whether its alignment is less
998 than its size. */
999
1000 static bool
1001 not_size_aligned (tree exp)
1002 {
1003 if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
1004 return true;
1005
1006 return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
1007 > get_object_alignment (exp));
1008 }
1009
1010 /* Function vector_alignment_reachable_p
1011
1012 Return true if vector alignment for DR is reachable by peeling
1013 a few loop iterations. Return false otherwise. */
1014
1015 static bool
1016 vector_alignment_reachable_p (struct data_reference *dr)
1017 {
1018 gimple *stmt = DR_STMT (dr);
1019 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1020 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1021
1022 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1023 {
1024 /* For interleaved access we peel only if the number of iterations in
1025 the prologue loop ({VF - misalignment}) is a multiple of the
1026 number of interleaved accesses. */
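      /* Editorial example: with V4SI (4 elements), a known misalignment of
         one element and GROUP_SIZE 2, the prologue would need 3 scalar
         iterations; 3 is not a multiple of 2, so peeling cannot align this
         interleaved access and we give up below.  */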
1027 int elem_size, mis_in_elements;
1028 int nelements = TYPE_VECTOR_SUBPARTS (vectype);
1029
1030 /* FORNOW: handle only known alignment. */
1031 if (!known_alignment_for_access_p (dr))
1032 return false;
1033
1034 elem_size = GET_MODE_SIZE (TYPE_MODE (vectype)) / nelements;
1035 mis_in_elements = DR_MISALIGNMENT (dr) / elem_size;
1036
1037 if ((nelements - mis_in_elements) % GROUP_SIZE (stmt_info))
1038 return false;
1039 }
1040
1041 /* If misalignment is known at compile time then allow peeling
1042 only if natural alignment is reachable through peeling. */
1043 if (known_alignment_for_access_p (dr) && !aligned_access_p (dr))
1044 {
1045 HOST_WIDE_INT elmsize =
1046 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1047 if (dump_enabled_p ())
1048 {
1049 dump_printf_loc (MSG_NOTE, vect_location,
1050 "data size =" HOST_WIDE_INT_PRINT_DEC, elmsize);
1051 dump_printf (MSG_NOTE,
1052 ". misalignment = %d.\n", DR_MISALIGNMENT (dr));
1053 }
1054 if (DR_MISALIGNMENT (dr) % elmsize)
1055 {
1056 if (dump_enabled_p ())
1057 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1058 "data size does not divide the misalignment.\n");
1059 return false;
1060 }
1061 }
1062
1063 if (!known_alignment_for_access_p (dr))
1064 {
1065 tree type = TREE_TYPE (DR_REF (dr));
1066 bool is_packed = not_size_aligned (DR_REF (dr));
1067 if (dump_enabled_p ())
1068 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1069 "Unknown misalignment, is_packed = %d\n",is_packed);
1070 if ((TYPE_USER_ALIGN (type) && !is_packed)
1071 || targetm.vectorize.vector_alignment_reachable (type, is_packed))
1072 return true;
1073 else
1074 return false;
1075 }
1076
1077 return true;
1078 }
1079
1080
1081 /* Calculate the cost of the memory access represented by DR. */
1082
1083 static void
1084 vect_get_data_access_cost (struct data_reference *dr,
1085 unsigned int *inside_cost,
1086 unsigned int *outside_cost,
1087 stmt_vector_for_cost *body_cost_vec)
1088 {
1089 gimple *stmt = DR_STMT (dr);
1090 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1091 int nunits = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
1092 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1093 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1094 int ncopies = vf / nunits;
1095
1096 if (DR_IS_READ (dr))
1097 vect_get_load_cost (dr, ncopies, true, inside_cost, outside_cost,
1098 NULL, body_cost_vec, false);
1099 else
1100 vect_get_store_cost (dr, ncopies, inside_cost, body_cost_vec);
1101
1102 if (dump_enabled_p ())
1103 dump_printf_loc (MSG_NOTE, vect_location,
1104 "vect_get_data_access_cost: inside_cost = %d, "
1105 "outside_cost = %d.\n", *inside_cost, *outside_cost);
1106 }
1107
1108
1109 typedef struct _vect_peel_info
1110 {
1111 int npeel;
1112 struct data_reference *dr;
1113 unsigned int count;
1114 } *vect_peel_info;
1115
1116 typedef struct _vect_peel_extended_info
1117 {
1118 struct _vect_peel_info peel_info;
1119 unsigned int inside_cost;
1120 unsigned int outside_cost;
1121 stmt_vector_for_cost body_cost_vec;
1122 } *vect_peel_extended_info;
1123
1124
1125 /* Peeling hashtable helpers. */
1126
1127 struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
1128 {
1129 static inline hashval_t hash (const _vect_peel_info *);
1130 static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
1131 };
1132
1133 inline hashval_t
1134 peel_info_hasher::hash (const _vect_peel_info *peel_info)
1135 {
1136 return (hashval_t) peel_info->npeel;
1137 }
1138
1139 inline bool
1140 peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
1141 {
1142 return (a->npeel == b->npeel);
1143 }
1144
1145
1146 /* Insert DR into peeling hash table with NPEEL as key. */
1147
1148 static void
1149 vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
1150 loop_vec_info loop_vinfo, struct data_reference *dr,
1151 int npeel)
1152 {
1153 struct _vect_peel_info elem, *slot;
1154 _vect_peel_info **new_slot;
1155 bool supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1156
1157 elem.npeel = npeel;
1158 slot = peeling_htab->find (&elem);
1159 if (slot)
1160 slot->count++;
1161 else
1162 {
1163 slot = XNEW (struct _vect_peel_info);
1164 slot->npeel = npeel;
1165 slot->dr = dr;
1166 slot->count = 1;
1167 new_slot = peeling_htab->find_slot (slot, INSERT);
1168 *new_slot = slot;
1169 }
1170
1171 if (!supportable_dr_alignment
1172 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1173 slot->count += VECT_MAX_COST;
1174 }
1175
1176
1177 /* Traverse the peeling hash table to find the peeling option that aligns
1178 the maximum number of data accesses. */
1179
1180 int
1181 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1182 _vect_peel_extended_info *max)
1183 {
1184 vect_peel_info elem = *slot;
1185
1186 if (elem->count > max->peel_info.count
1187 || (elem->count == max->peel_info.count
1188 && max->peel_info.npeel > elem->npeel))
1189 {
1190 max->peel_info.npeel = elem->npeel;
1191 max->peel_info.count = elem->count;
1192 max->peel_info.dr = elem->dr;
1193 }
1194
1195 return 1;
1196 }
1197
1198
1199 /* Traverse peeling hash table and calculate cost for each peeling option.
1200 Find the one with the lowest cost. */
1201
1202 int
1203 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1204 _vect_peel_extended_info *min)
1205 {
1206 vect_peel_info elem = *slot;
1207 int save_misalignment, dummy;
1208 unsigned int inside_cost = 0, outside_cost = 0, i;
1209 gimple *stmt = DR_STMT (elem->dr);
1210 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1211 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1212 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1213 struct data_reference *dr;
1214 stmt_vector_for_cost prologue_cost_vec, body_cost_vec, epilogue_cost_vec;
1215
1216 prologue_cost_vec.create (2);
1217 body_cost_vec.create (2);
1218 epilogue_cost_vec.create (2);
1219
1220 FOR_EACH_VEC_ELT (datarefs, i, dr)
1221 {
1222 stmt = DR_STMT (dr);
1223 stmt_info = vinfo_for_stmt (stmt);
1224 /* For interleaving, only the alignment of the first access
1225 matters. */
1226 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1227 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1228 continue;
1229
1230 /* Strided accesses perform only component accesses, alignment is
1231 irrelevant for them. */
1232 if (STMT_VINFO_STRIDED_P (stmt_info)
1233 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1234 continue;
1235
1236 save_misalignment = DR_MISALIGNMENT (dr);
1237 vect_update_misalignment_for_peel (dr, elem->dr, elem->npeel);
1238 vect_get_data_access_cost (dr, &inside_cost, &outside_cost,
1239 &body_cost_vec);
1240 SET_DR_MISALIGNMENT (dr, save_misalignment);
1241 }
1242
1243 outside_cost += vect_get_known_peeling_cost
1244 (loop_vinfo, elem->npeel, &dummy,
1245 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1246 &prologue_cost_vec, &epilogue_cost_vec);
1247
1248 /* Prologue and epilogue costs are added to the target model later.
1249 These costs depend only on the scalar iteration cost, the
1250 number of peeling iterations finally chosen, and the number of
1251 misaligned statements. So discard the information found here. */
1252 prologue_cost_vec.release ();
1253 epilogue_cost_vec.release ();
1254
1255 if (inside_cost < min->inside_cost
1256 || (inside_cost == min->inside_cost && outside_cost < min->outside_cost))
1257 {
1258 min->inside_cost = inside_cost;
1259 min->outside_cost = outside_cost;
1260 min->body_cost_vec.release ();
1261 min->body_cost_vec = body_cost_vec;
1262 min->peel_info.dr = elem->dr;
1263 min->peel_info.npeel = elem->npeel;
1264 }
1265 else
1266 body_cost_vec.release ();
1267
1268 return 1;
1269 }
1270
1271
1272 /* Choose best peeling option by traversing peeling hash table and either
1273 choosing an option with the lowest cost (if cost model is enabled) or the
1274 option that aligns as many accesses as possible. */
1275
1276 static struct data_reference *
1277 vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
1278 loop_vec_info loop_vinfo,
1279 unsigned int *npeel,
1280 stmt_vector_for_cost *body_cost_vec)
1281 {
1282 struct _vect_peel_extended_info res;
1283
1284 res.peel_info.dr = NULL;
1285 res.body_cost_vec = stmt_vector_for_cost ();
1286
1287 if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1288 {
1289 res.inside_cost = INT_MAX;
1290 res.outside_cost = INT_MAX;
1291 peeling_htab->traverse <_vect_peel_extended_info *,
1292 vect_peeling_hash_get_lowest_cost> (&res);
1293 }
1294 else
1295 {
1296 res.peel_info.count = 0;
1297 peeling_htab->traverse <_vect_peel_extended_info *,
1298 vect_peeling_hash_get_most_frequent> (&res);
1299 }
1300
1301 *npeel = res.peel_info.npeel;
1302 *body_cost_vec = res.body_cost_vec;
1303 return res.peel_info.dr;
1304 }
1305
1306
1307 /* Function vect_enhance_data_refs_alignment
1308
1309 This pass will use loop versioning and loop peeling in order to enhance
1310 the alignment of data references in the loop.
1311
1312 FOR NOW: we assume that whatever versioning/peeling takes place, only the
1313 original loop is to be vectorized. Any other loops that are created by
1314 the transformations performed in this pass are not supposed to be
1315 vectorized. This restriction will be relaxed.
1316
1317 This pass will require a cost model to guide it on whether to apply peeling
1318 or versioning or a combination of the two. For example, the scheme that
1319 Intel uses when given a loop with several memory accesses is as follows:
1320 choose one memory access ('p') whose alignment you want to force by doing
1321 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
1322 other accesses are not necessarily aligned, or (2) use loop versioning to
1323 generate one loop in which all accesses are aligned, and another loop in
1324 which only 'p' is necessarily aligned.
1325
1326 ("Automatic Intra-Register Vectorization for the Intel Architecture",
1327 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Xinmin Tian, International
1328 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1329
1330 Devising a cost model is the most critical aspect of this work. It will
1331 guide us on which access to peel for, whether to use loop versioning, how
1332 many versions to create, etc. The cost model will probably consist of
1333 generic considerations as well as target specific considerations (on
1334 powerpc for example, misaligned stores are more painful than misaligned
1335 loads).
1336
1337 Here are the general steps involved in alignment enhancements:
1338
1339 -- original loop, before alignment analysis:
1340 for (i=0; i<N; i++){
1341 x = q[i]; # DR_MISALIGNMENT(q) = unknown
1342 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1343 }
1344
1345 -- After vect_compute_data_refs_alignment:
1346 for (i=0; i<N; i++){
1347 x = q[i]; # DR_MISALIGNMENT(q) = 3
1348 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1349 }
1350
1351 -- Possibility 1: we do loop versioning:
1352 if (p is aligned) {
1353 for (i=0; i<N; i++){ # loop 1A
1354 x = q[i]; # DR_MISALIGNMENT(q) = 3
1355 p[i] = y; # DR_MISALIGNMENT(p) = 0
1356 }
1357 }
1358 else {
1359 for (i=0; i<N; i++){ # loop 1B
1360 x = q[i]; # DR_MISALIGNMENT(q) = 3
1361 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1362 }
1363 }
1364
1365 -- Possibility 2: we do loop peeling:
1366 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1367 x = q[i];
1368 p[i] = y;
1369 }
1370 for (i = 3; i < N; i++){ # loop 2A
1371 x = q[i]; # DR_MISALIGNMENT(q) = 0
1372 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1373 }
1374
1375 -- Possibility 3: combination of loop peeling and versioning:
1376 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1377 x = q[i];
1378 p[i] = y;
1379 }
1380 if (p is aligned) {
1381 for (i = 3; i<N; i++){ # loop 3A
1382 x = q[i]; # DR_MISALIGNMENT(q) = 0
1383 p[i] = y; # DR_MISALIGNMENT(p) = 0
1384 }
1385 }
1386 else {
1387 for (i = 3; i<N; i++){ # loop 3B
1388 x = q[i]; # DR_MISALIGNMENT(q) = 0
1389 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1390 }
1391 }
1392
1393 These loops are later passed to loop_transform to be vectorized. The
1394 vectorizer will use the alignment information to guide the transformation
1395 (whether to generate regular loads/stores, or with special handling for
1396 misalignment). */
1397
1398 bool
1399 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1400 {
1401 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1402 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1403 enum dr_alignment_support supportable_dr_alignment;
1404 struct data_reference *dr0 = NULL, *first_store = NULL;
1405 struct data_reference *dr;
1406 unsigned int i, j;
1407 bool do_peeling = false;
1408 bool do_versioning = false;
1409 bool stat;
1410 gimple *stmt;
1411 stmt_vec_info stmt_info;
1412 unsigned int npeel = 0;
1413 bool all_misalignments_unknown = true;
1414 unsigned int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1415 unsigned possible_npeel_number = 1;
1416 tree vectype;
1417 unsigned int nelements, mis, same_align_drs_max = 0;
1418 stmt_vector_for_cost body_cost_vec = stmt_vector_for_cost ();
1419 hash_table<peel_info_hasher> peeling_htab (1);
1420
1421 if (dump_enabled_p ())
1422 dump_printf_loc (MSG_NOTE, vect_location,
1423 "=== vect_enhance_data_refs_alignment ===\n");
1424
1425 /* Reset data so we can safely be called multiple times. */
1426 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1427 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
1428
1429 /* While cost model enhancements are expected in the future, the high level
1430 view of the code at this time is as follows:
1431
1432 A) If there is a misaligned access then see if peeling to align
1433 this access can make all data references satisfy
1434 vect_supportable_dr_alignment. If so, update data structures
1435 as needed and return true.
1436
1437 B) If peeling wasn't possible and there is a data reference with an
1438 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1439 then see if loop versioning checks can be used to make all data
1440 references satisfy vect_supportable_dr_alignment. If so, update
1441 data structures as needed and return true.
1442
1443 C) If neither peeling nor versioning were successful then return false if
1444 any data reference does not satisfy vect_supportable_dr_alignment.
1445
1446 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1447
1448 Note, Possibility 3 above (which is peeling and versioning together) is not
1449 being done at this time. */
1450
1451 /* (1) Peeling to force alignment. */
1452
1453 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1454 Considerations:
1455 + How many accesses will become aligned due to the peeling
1456 - How many accesses will become unaligned due to the peeling,
1457 and the cost of misaligned accesses.
1458 - The cost of peeling (the extra runtime checks, the increase
1459 in code size). */
1460
1461 FOR_EACH_VEC_ELT (datarefs, i, dr)
1462 {
1463 stmt = DR_STMT (dr);
1464 stmt_info = vinfo_for_stmt (stmt);
1465
1466 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1467 continue;
1468
1469 /* For interleaving, only the alignment of the first access
1470 matters. */
1471 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1472 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1473 continue;
1474
1475 /* For invariant accesses there is nothing to enhance. */
1476 if (integer_zerop (DR_STEP (dr)))
1477 continue;
1478
1479 /* Strided accesses perform only component accesses, alignment is
1480 irrelevant for them. */
1481 if (STMT_VINFO_STRIDED_P (stmt_info)
1482 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1483 continue;
1484
1485 supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1486 do_peeling = vector_alignment_reachable_p (dr);
1487 if (do_peeling)
1488 {
1489 if (known_alignment_for_access_p (dr))
1490 {
1491 unsigned int npeel_tmp;
1492 bool negative = tree_int_cst_compare (DR_STEP (dr),
1493 size_zero_node) < 0;
1494
1495 /* Save info about DR in the hash table. */
1496 vectype = STMT_VINFO_VECTYPE (stmt_info);
1497 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1498 mis = DR_MISALIGNMENT (dr) / GET_MODE_SIZE (TYPE_MODE (
1499 TREE_TYPE (DR_REF (dr))));
1500 npeel_tmp = (negative
1501 ? (mis - nelements) : (nelements - mis))
1502 & (nelements - 1);
1503
1504 /* For multiple types, it is possible that the bigger type access
1505 will have more than one peeling option. E.g., a loop with two
1506 types: one of size (vector size / 4), and the other one of
1507 size (vector size / 8). The vectorization factor will be 8. If both
1508 accesses are misaligned by 3, the first one needs one scalar
1509 iteration to be aligned, and the second one needs 5. But the
1510 first one will also be aligned by peeling 5 scalar
1511 iterations, and in that case both accesses will be aligned.
1512 Hence, except for the immediate peeling amount, we also want
1513 to try adding a full vector size, as long as we don't exceed
1514 the vectorization factor.
1515 We do this automatically when using the cost model, since we
1516 calculate the cost for every peeling option. */
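              /* Editorial note spelling out the example above: with
                 nelements 4 and 8 and both accesses misaligned by 3
                 elements, the immediate peeling amounts are 1 and 5;
                 the loop below also records 5 (= 1 + 4) for the
                 4-element (larger type) access, so a single npeel of 5
                 aligns both.  */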
1517 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1518 {
1519 if (STMT_SLP_TYPE (stmt_info))
1520 possible_npeel_number
1521 = (vf * GROUP_SIZE (stmt_info)) / nelements;
1522 else
1523 possible_npeel_number = vf / nelements;
1524 }
1525
1526 /* Handle the aligned case. We may decide to align some other
1527 access, making DR unaligned. */
1528 if (DR_MISALIGNMENT (dr) == 0)
1529 {
1530 npeel_tmp = 0;
1531 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1532 possible_npeel_number++;
1533 }
1534
1535 for (j = 0; j < possible_npeel_number; j++)
1536 {
1537 vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
1538 dr, npeel_tmp);
1539 npeel_tmp += nelements;
1540 }
1541
1542 all_misalignments_unknown = false;
1543 /* Data-ref that was chosen for the case that all the
1544 misalignments are unknown is not relevant anymore, since we
1545 have a data-ref with known alignment. */
1546 dr0 = NULL;
1547 }
1548 else
1549 {
1550 /* If we don't know any misalignment values, we prefer
1551 peeling for the data-ref that has the maximum number of data-refs
1552 with the same alignment, unless the target prefers to align
1553 stores over loads. */
1554 if (all_misalignments_unknown)
1555 {
1556 unsigned same_align_drs
1557 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1558 if (!dr0
1559 || same_align_drs_max < same_align_drs)
1560 {
1561 same_align_drs_max = same_align_drs;
1562 dr0 = dr;
1563 }
1564 /* For data-refs with the same number of related
1565 accesses prefer the one where the misalign
1566 computation will be invariant in the outermost loop. */
1567 else if (same_align_drs_max == same_align_drs)
1568 {
1569 struct loop *ivloop0, *ivloop;
1570 ivloop0 = outermost_invariant_loop_for_expr
1571 (loop, DR_BASE_ADDRESS (dr0));
1572 ivloop = outermost_invariant_loop_for_expr
1573 (loop, DR_BASE_ADDRESS (dr));
1574 if ((ivloop && !ivloop0)
1575 || (ivloop && ivloop0
1576 && flow_loop_nested_p (ivloop, ivloop0)))
1577 dr0 = dr;
1578 }
1579
1580 if (!first_store && DR_IS_WRITE (dr))
1581 first_store = dr;
1582 }
1583
1584 /* If there are both known and unknown misaligned accesses in the
1585 loop, we choose peeling amount according to the known
1586 accesses. */
1587 if (!supportable_dr_alignment)
1588 {
1589 dr0 = dr;
1590 if (!first_store && DR_IS_WRITE (dr))
1591 first_store = dr;
1592 }
1593 }
1594 }
1595 else
1596 {
1597 if (!aligned_access_p (dr))
1598 {
1599 if (dump_enabled_p ())
1600 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1601 "vector alignment may not be reachable\n");
1602 break;
1603 }
1604 }
1605 }
1606
1607 /* Check if we can possibly peel the loop. */
1608 if (!vect_can_advance_ivs_p (loop_vinfo)
1609 || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))
1610 || loop->inner)
1611 do_peeling = false;
1612
1613 if (do_peeling
1614 && all_misalignments_unknown
1615 && vect_supportable_dr_alignment (dr0, false))
1616 {
1617 /* Check if the target prefers aligning stores over loads, i.e., if
1618 misaligned stores are more expensive than misaligned loads (taking
1619 drs with the same alignment into account). */
1620 if (first_store && DR_IS_READ (dr0))
1621 {
1622 unsigned int load_inside_cost = 0, load_outside_cost = 0;
1623 unsigned int store_inside_cost = 0, store_outside_cost = 0;
1624 unsigned int load_inside_penalty = 0, load_outside_penalty = 0;
1625 unsigned int store_inside_penalty = 0, store_outside_penalty = 0;
1626 stmt_vector_for_cost dummy;
1627 dummy.create (2);
1628
1629 vect_get_data_access_cost (dr0, &load_inside_cost, &load_outside_cost,
1630 &dummy);
1631 vect_get_data_access_cost (first_store, &store_inside_cost,
1632 &store_outside_cost, &dummy);
1633
1634 dummy.release ();
1635
1636 /* Calculate the penalty for leaving FIRST_STORE unaligned (by
1637 aligning the load DR0). */
1638 load_inside_penalty = store_inside_cost;
1639 load_outside_penalty = store_outside_cost;
1640 for (i = 0;
1641 STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1642 DR_STMT (first_store))).iterate (i, &dr);
1643 i++)
1644 if (DR_IS_READ (dr))
1645 {
1646 load_inside_penalty += load_inside_cost;
1647 load_outside_penalty += load_outside_cost;
1648 }
1649 else
1650 {
1651 load_inside_penalty += store_inside_cost;
1652 load_outside_penalty += store_outside_cost;
1653 }
1654
1655 /* Calculate the penalty for leaving DR0 unaligned (by
1656 aligning the FIRST_STORE). */
1657 store_inside_penalty = load_inside_cost;
1658 store_outside_penalty = load_outside_cost;
1659 for (i = 0;
1660 STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1661 DR_STMT (dr0))).iterate (i, &dr);
1662 i++)
1663 if (DR_IS_READ (dr))
1664 {
1665 store_inside_penalty += load_inside_cost;
1666 store_outside_penalty += load_outside_cost;
1667 }
1668 else
1669 {
1670 store_inside_penalty += store_inside_cost;
1671 store_outside_penalty += store_outside_cost;
1672 }
1673
1674 if (load_inside_penalty > store_inside_penalty
1675 || (load_inside_penalty == store_inside_penalty
1676 && load_outside_penalty > store_outside_penalty))
1677 dr0 = first_store;
1678 }
1679
1680 /* In case there are only loads with different unknown misalignments, use
1681 peeling only if it may help to align other accesses in the loop or
1682 if it may help improve load bandwidth when we'd end up using
1683 unaligned loads. */
1684 tree dr0_vt = STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr0)));
1685 if (!first_store
1686 && !STMT_VINFO_SAME_ALIGN_REFS (
1687 vinfo_for_stmt (DR_STMT (dr0))).length ()
1688 && (vect_supportable_dr_alignment (dr0, false)
1689 != dr_unaligned_supported
1690 || (builtin_vectorization_cost (vector_load, dr0_vt, 0)
1691 == builtin_vectorization_cost (unaligned_load, dr0_vt, -1))))
1692 do_peeling = false;
1693 }
1694
1695 if (do_peeling && !dr0)
1696 {
1697 /* Peeling is possible, but there is no data access that is supported
1698 only when aligned. So we try to choose the best possible peeling. */
1699
1700 /* We should get here only if there are drs with known misalignment. */
1701 gcc_assert (!all_misalignments_unknown);
1702
1703 /* Choose the best peeling from the hash table. */
1704 dr0 = vect_peeling_hash_choose_best_peeling (&peeling_htab,
1705 loop_vinfo, &npeel,
1706 &body_cost_vec);
1707 if (!dr0 || !npeel)
1708 do_peeling = false;
1709 }
1710
1711 if (do_peeling)
1712 {
1713 stmt = DR_STMT (dr0);
1714 stmt_info = vinfo_for_stmt (stmt);
1715 vectype = STMT_VINFO_VECTYPE (stmt_info);
1716 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1717
1718 if (known_alignment_for_access_p (dr0))
1719 {
1720 bool negative = tree_int_cst_compare (DR_STEP (dr0),
1721 size_zero_node) < 0;
1722 if (!npeel)
1723 {
1724 /* Since it's known at compile time, compute the number of
1725 iterations in the peeled loop (the peeling factor) for use in
1726 updating DR_MISALIGNMENT values. The peeling factor is the
1727 vectorization factor minus the misalignment as an element
1728 count. */
1729 mis = DR_MISALIGNMENT (dr0);
1730 mis /= GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr0))));
1731 npeel = ((negative ? mis - nelements : nelements - mis)
1732 & (nelements - 1));
1733 }
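/* Illustrative example of the computation above (hedged, not from the
   original source): with a V4SI vectype NELEMENTS is 4; if
   DR_MISALIGNMENT (dr0) is 8 bytes and the element size is 4, then
   mis = 2 elements and, for a positive step,
   npeel = (4 - 2) & 3 = 2 peeled iterations.  */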
1734
1735 /* For interleaved data access every iteration accesses all the
1736 members of the group, therefore we divide the number of iterations
1737 by the group size. */
1738 stmt_info = vinfo_for_stmt (DR_STMT (dr0));
1739 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1740 npeel /= GROUP_SIZE (stmt_info);
1741
1742 if (dump_enabled_p ())
1743 dump_printf_loc (MSG_NOTE, vect_location,
1744 "Try peeling by %d\n", npeel);
1745 }
1746
1747 /* Ensure that all data refs can be vectorized after the peel. */
1748 FOR_EACH_VEC_ELT (datarefs, i, dr)
1749 {
1750 int save_misalignment;
1751
1752 if (dr == dr0)
1753 continue;
1754
1755 stmt = DR_STMT (dr);
1756 stmt_info = vinfo_for_stmt (stmt);
1757 /* For interleaving, only the alignment of the first access
1758 matters. */
1759 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1760 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1761 continue;
1762
1763 /* Strided accesses perform only component accesses, alignment is
1764 irrelevant for them. */
1765 if (STMT_VINFO_STRIDED_P (stmt_info)
1766 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1767 continue;
1768
1769 save_misalignment = DR_MISALIGNMENT (dr);
1770 vect_update_misalignment_for_peel (dr, dr0, npeel);
1771 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1772 SET_DR_MISALIGNMENT (dr, save_misalignment);
1773
1774 if (!supportable_dr_alignment)
1775 {
1776 do_peeling = false;
1777 break;
1778 }
1779 }
1780
1781 if (do_peeling && known_alignment_for_access_p (dr0) && npeel == 0)
1782 {
1783 stat = vect_verify_datarefs_alignment (loop_vinfo);
1784 if (!stat)
1785 do_peeling = false;
1786 else
1787 {
1788 body_cost_vec.release ();
1789 return stat;
1790 }
1791 }
1792
1793 /* Cost model #1 - honor --param vect-max-peeling-for-alignment. */
1794 if (do_peeling)
1795 {
1796 unsigned max_allowed_peel
1797 = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT);
1798 if (max_allowed_peel != (unsigned)-1)
1799 {
1800 unsigned max_peel = npeel;
1801 if (max_peel == 0)
1802 {
1803 gimple *dr_stmt = DR_STMT (dr0);
1804 stmt_vec_info vinfo = vinfo_for_stmt (dr_stmt);
1805 tree vtype = STMT_VINFO_VECTYPE (vinfo);
1806 max_peel = TYPE_VECTOR_SUBPARTS (vtype) - 1;
1807 }
1808 if (max_peel > max_allowed_peel)
1809 {
1810 do_peeling = false;
1811 if (dump_enabled_p ())
1812 dump_printf_loc (MSG_NOTE, vect_location,
1813 "Disable peeling, max peels reached: %d\n", max_peel);
1814 }
1815 }
1816 }
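/* For example (illustrative): with a V4SI vectype and npeel unknown at
   compile time, max_peel is 3, so --param vect-max-peeling-for-alignment=2
   would disable peeling here.  */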
1817
1818 /* Cost model #2 - if peeling may result in a remaining loop not
1819 iterating enough to be vectorized then do not peel. */
1820 if (do_peeling
1821 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1822 {
1823 unsigned max_peel
1824 = npeel == 0 ? LOOP_VINFO_VECT_FACTOR (loop_vinfo) - 1 : npeel;
1825 if (LOOP_VINFO_INT_NITERS (loop_vinfo)
1826 < LOOP_VINFO_VECT_FACTOR (loop_vinfo) + max_peel)
1827 do_peeling = false;
1828 }
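/* For example (illustrative): with a vectorization factor of 4 and npeel
   unknown, max_peel is 3, so a loop known to run fewer than 4 + 3 = 7
   iterations is not peeled.  */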
1829
1830 if (do_peeling)
1831 {
1832 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
1833 If the misalignment of DR_i is identical to that of dr0 then set
1834 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
1835 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
1836 by the peeling factor times the element size of DR_i (MOD the
1837 vectorization factor times the size). Otherwise, the
1838 misalignment of DR_i must be set to unknown. */
1839 FOR_EACH_VEC_ELT (datarefs, i, dr)
1840 if (dr != dr0)
1841 {
1842 /* Strided accesses perform only component accesses, alignment
1843 is irrelevant for them. */
1844 stmt_info = vinfo_for_stmt (DR_STMT (dr));
1845 if (STMT_VINFO_STRIDED_P (stmt_info)
1846 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1847 continue;
1848
1849 vect_update_misalignment_for_peel (dr, dr0, npeel);
1850 }
1851
1852 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0;
1853 if (npeel)
1854 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
1855 else
1856 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1857 = DR_MISALIGNMENT (dr0);
1858 SET_DR_MISALIGNMENT (dr0, 0);
1859 if (dump_enabled_p ())
1860 {
1861 dump_printf_loc (MSG_NOTE, vect_location,
1862 "Alignment of access forced using peeling.\n");
1863 dump_printf_loc (MSG_NOTE, vect_location,
1864 "Peeling for alignment will be applied.\n");
1865 }
1866 /* The inside-loop cost will be accounted for in vectorizable_load
1867 and vectorizable_store correctly with adjusted alignments.
1868 Drop the body_cost_vec on the floor here. */
1869 body_cost_vec.release ();
1870
1871 stat = vect_verify_datarefs_alignment (loop_vinfo);
1872 gcc_assert (stat);
1873 return stat;
1874 }
1875 }
1876
1877 body_cost_vec.release ();
1878
1879 /* (2) Versioning to force alignment. */
1880
1881 /* Try versioning if:
1882 1) optimize loop for speed
1883 2) there is at least one unsupported misaligned data ref with an unknown
1884 misalignment, and
1885 3) all misaligned data refs with a known misalignment are supported, and
1886 4) the number of runtime alignment checks is within reason. */
1887
1888 do_versioning =
1889 optimize_loop_nest_for_speed_p (loop)
1890 && (!loop->inner); /* FORNOW */
1891
1892 if (do_versioning)
1893 {
1894 FOR_EACH_VEC_ELT (datarefs, i, dr)
1895 {
1896 stmt = DR_STMT (dr);
1897 stmt_info = vinfo_for_stmt (stmt);
1898
1899 /* For interleaving, only the alignment of the first access
1900 matters. */
1901 if (aligned_access_p (dr)
1902 || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1903 && GROUP_FIRST_ELEMENT (stmt_info) != stmt))
1904 continue;
1905
1906 if (STMT_VINFO_STRIDED_P (stmt_info))
1907 {
1908 /* Strided loads perform only component accesses, alignment is
1909 irrelevant for them. */
1910 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
1911 continue;
1912 do_versioning = false;
1913 break;
1914 }
1915
1916 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1917
1918 if (!supportable_dr_alignment)
1919 {
1920 gimple *stmt;
1921 int mask;
1922 tree vectype;
1923
1924 if (known_alignment_for_access_p (dr)
1925 || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
1926 >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
1927 {
1928 do_versioning = false;
1929 break;
1930 }
1931
1932 stmt = DR_STMT (dr);
1933 vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
1934 gcc_assert (vectype);
1935
1936 /* The rightmost bits of an aligned address must be zeros.
1937 Construct the mask needed for this test. For example,
1938 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
1939 mask must be 15 = 0xf. */
1940 mask = GET_MODE_SIZE (TYPE_MODE (vectype)) - 1;
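/* Illustrative sketch of the runtime test this mask feeds (built later,
   e.g. by vect_create_cond_for_align_checks): the addresses of all
   may-misalign refs are OR-ed together and the aligned loop version is
   taken only if ((addr_1 | addr_2 | ...) & mask) == 0.  */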
1941
1942 /* FORNOW: use the same mask to test all potentially unaligned
1943 references in the loop. The vectorizer currently supports
1944 a single vector size, see the reference to
1945 GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
1946 vectorization factor is computed. */
1947 gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
1948 || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
1949 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
1950 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (
1951 DR_STMT (dr));
1952 }
1953 }
1954
1955 /* Versioning requires at least one misaligned data reference. */
1956 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
1957 do_versioning = false;
1958 else if (!do_versioning)
1959 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1960 }
1961
1962 if (do_versioning)
1963 {
1964 vec<gimple *> may_misalign_stmts
1965 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
1966 gimple *stmt;
1967
1968 /* It can now be assumed that the data references in the statements
1969 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
1970 of the loop being vectorized. */
1971 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt)
1972 {
1973 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1974 dr = STMT_VINFO_DATA_REF (stmt_info);
1975 SET_DR_MISALIGNMENT (dr, 0);
1976 if (dump_enabled_p ())
1977 dump_printf_loc (MSG_NOTE, vect_location,
1978 "Alignment of access forced using versioning.\n");
1979 }
1980
1981 if (dump_enabled_p ())
1982 dump_printf_loc (MSG_NOTE, vect_location,
1983 "Versioning for alignment will be applied.\n");
1984
1985 /* Peeling and versioning can't be done together at this time. */
1986 gcc_assert (! (do_peeling && do_versioning));
1987
1988 stat = vect_verify_datarefs_alignment (loop_vinfo);
1989 gcc_assert (stat);
1990 return stat;
1991 }
1992
1993 /* This point is reached if neither peeling nor versioning is being done. */
1994 gcc_assert (! (do_peeling || do_versioning));
1995
1996 stat = vect_verify_datarefs_alignment (loop_vinfo);
1997 return stat;
1998 }
1999
2000
2001 /* Function vect_find_same_alignment_drs.
2002
2003 Update group and alignment relations according to the chosen
2004 vectorization factor. */
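/* For example (illustrative): with a vectorization factor of 4, int
   accesses a[i] and a[i+4] have dependence distance 4; since 4 % 4 == 0
   and the element sizes match, both refs are recorded as having the same
   alignment.  */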
2005
2006 static void
2007 vect_find_same_alignment_drs (struct data_dependence_relation *ddr,
2008 loop_vec_info loop_vinfo)
2009 {
2010 unsigned int i;
2011 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2012 int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2013 struct data_reference *dra = DDR_A (ddr);
2014 struct data_reference *drb = DDR_B (ddr);
2015 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2016 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2017 int dra_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dra))));
2018 int drb_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (drb))));
2019 lambda_vector dist_v;
2020 unsigned int loop_depth;
2021
2022 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
2023 return;
2024
2025 if (dra == drb)
2026 return;
2027
2028 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
2029 return;
2030
2031 /* Loop-based vectorization and known data dependence. */
2032 if (DDR_NUM_DIST_VECTS (ddr) == 0)
2033 return;
2034
2035 /* Data-dependence analysis reports a distance vector of zero
2036 for data-references that overlap only in the first iteration
2037 but have steps of different sign (see PR45764).
2038 So as a sanity check require equal DR_STEP. */
2039 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2040 return;
2041
2042 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
2043 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
2044 {
2045 int dist = dist_v[loop_depth];
2046
2047 if (dump_enabled_p ())
2048 dump_printf_loc (MSG_NOTE, vect_location,
2049 "dependence distance = %d.\n", dist);
2050
2051 /* Same loop iteration. */
2052 if (dist == 0
2053 || (dist % vectorization_factor == 0 && dra_size == drb_size))
2054 {
2055 /* Two references with distance zero have the same alignment. */
2056 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
2057 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
2058 if (dump_enabled_p ())
2059 {
2060 dump_printf_loc (MSG_NOTE, vect_location,
2061 "accesses have the same alignment.\n");
2062 dump_printf (MSG_NOTE,
2063 "dependence distance modulo vf == 0 between ");
2064 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2065 dump_printf (MSG_NOTE, " and ");
2066 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2067 dump_printf (MSG_NOTE, "\n");
2068 }
2069 }
2070 }
2071 }
2072
2073
2074 /* Function vect_analyze_data_refs_alignment
2075
2076 Analyze the alignment of the data-references in the loop.
2077 Return FALSE if a data reference is found that cannot be vectorized. */
2078
2079 bool
2080 vect_analyze_data_refs_alignment (loop_vec_info vinfo)
2081 {
2082 if (dump_enabled_p ())
2083 dump_printf_loc (MSG_NOTE, vect_location,
2084 "=== vect_analyze_data_refs_alignment ===\n");
2085
2086 /* Mark groups of data references with same alignment using
2087 data dependence information. */
2088 vec<ddr_p> ddrs = vinfo->ddrs;
2089 struct data_dependence_relation *ddr;
2090 unsigned int i;
2091
2092 FOR_EACH_VEC_ELT (ddrs, i, ddr)
2093 vect_find_same_alignment_drs (ddr, vinfo);
2094
2095 vec<data_reference_p> datarefs = vinfo->datarefs;
2096 struct data_reference *dr;
2097
2098 FOR_EACH_VEC_ELT (datarefs, i, dr)
2099 {
2100 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
2101 if (STMT_VINFO_VECTORIZABLE (stmt_info)
2102 && !vect_compute_data_ref_alignment (dr))
2103 {
2104 /* Strided accesses perform only component accesses, misalignment
2105 information is irrelevant for them. */
2106 if (STMT_VINFO_STRIDED_P (stmt_info)
2107 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2108 continue;
2109
2110 if (dump_enabled_p ())
2111 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2112 "not vectorized: can't calculate alignment "
2113 "for data ref.\n");
2114
2115 return false;
2116 }
2117 }
2118
2119 return true;
2120 }
2121
2122
2123 /* Analyze alignment of DRs of stmts in NODE. */
2124
2125 static bool
2126 vect_slp_analyze_and_verify_node_alignment (slp_tree node)
2127 {
2128 /* We vectorize from the first scalar stmt in the node unless
2129 the node is permuted, in which case we start from the first
2130 element in the group. */
2131 gimple *first_stmt = SLP_TREE_SCALAR_STMTS (node)[0];
2132 data_reference_p first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2133 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2134 first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (first_stmt));
2135
2136 data_reference_p dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2137 if (! vect_compute_data_ref_alignment (dr)
2138 /* For creating the data-ref pointer we need alignment of the
2139 first element anyway. */
2140 || (dr != first_dr
2141 && ! vect_compute_data_ref_alignment (first_dr))
2142 || ! verify_data_ref_alignment (dr))
2143 {
2144 if (dump_enabled_p ())
2145 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2146 "not vectorized: bad data alignment in basic "
2147 "block.\n");
2148 return false;
2149 }
2150
2151 return true;
2152 }
2153
2154 /* Function vect_slp_analyze_instance_alignment
2155
2156 Analyze the alignment of the data-references in the SLP instance.
2157 Return FALSE if a data reference is found that cannot be vectorized. */
2158
2159 bool
2160 vect_slp_analyze_and_verify_instance_alignment (slp_instance instance)
2161 {
2162 if (dump_enabled_p ())
2163 dump_printf_loc (MSG_NOTE, vect_location,
2164 "=== vect_slp_analyze_and_verify_instance_alignment ===\n");
2165
2166 slp_tree node;
2167 unsigned i;
2168 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
2169 if (! vect_slp_analyze_and_verify_node_alignment (node))
2170 return false;
2171
2172 node = SLP_INSTANCE_TREE (instance);
2173 if (STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]))
2174 && ! vect_slp_analyze_and_verify_node_alignment
2175 (SLP_INSTANCE_TREE (instance)))
2176 return false;
2177
2178 return true;
2179 }
2180
2181
2182 /* Analyze groups of accesses: check that DR belongs to a group of
2183 accesses of legal size, step, etc. Detect gaps, single element
2184 interleaving, and other special cases. Set grouped access info.
2185 Collect groups of strided stores for further use in SLP analysis.
2186 Worker for vect_analyze_group_access. */
2187
2188 static bool
2189 vect_analyze_group_access_1 (struct data_reference *dr)
2190 {
2191 tree step = DR_STEP (dr);
2192 tree scalar_type = TREE_TYPE (DR_REF (dr));
2193 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2194 gimple *stmt = DR_STMT (dr);
2195 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2196 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2197 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2198 HOST_WIDE_INT dr_step = -1;
2199 HOST_WIDE_INT groupsize, last_accessed_element = 1;
2200 bool slp_impossible = false;
2201
2202 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2203 size of the interleaving group (including gaps). */
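/* For example (illustrative): two int loads a[4*i] and a[4*i + 2] both
   have STEP 16 and a type size of 4, so GROUPSIZE is 4 even though only
   two of the four slots are accessed; the remaining slots are gaps.  */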
2204 if (tree_fits_shwi_p (step))
2205 {
2206 dr_step = tree_to_shwi (step);
2207 /* Check that STEP is a multiple of type size. Otherwise there is
2208 a non-element-sized gap at the end of the group which we
2209 cannot represent in GROUP_GAP or GROUP_SIZE.
2210 ??? As we can handle non-constant step fine here we should
2211 simply remove uses of GROUP_GAP between the last and first
2212 element and instead rely on DR_STEP. GROUP_SIZE then would
2213 simply not include that gap. */
2214 if ((dr_step % type_size) != 0)
2215 {
2216 if (dump_enabled_p ())
2217 {
2218 dump_printf_loc (MSG_NOTE, vect_location,
2219 "Step ");
2220 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2221 dump_printf (MSG_NOTE,
2222 " is not a multiple of the element size for ");
2223 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2224 dump_printf (MSG_NOTE, "\n");
2225 }
2226 return false;
2227 }
2228 groupsize = absu_hwi (dr_step) / type_size;
2229 }
2230 else
2231 groupsize = 0;
2232
2233 /* A non-consecutive access is possible only if it is part of an interleaving group. */
2234 if (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
2235 {
2236 /* Check if this DR is part of an interleaving group, and is a single
2237 element of the group that is accessed in the loop. */
2238
2239 /* Gaps are supported only for loads. STEP must be a multiple of the type
2240 size. The size of the group must be a power of 2. */
2241 if (DR_IS_READ (dr)
2242 && (dr_step % type_size) == 0
2243 && groupsize > 0
2244 && exact_log2 (groupsize) != -1)
2245 {
2246 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = stmt;
2247 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2248 GROUP_GAP (stmt_info) = groupsize - 1;
2249 if (dump_enabled_p ())
2250 {
2251 dump_printf_loc (MSG_NOTE, vect_location,
2252 "Detected single element interleaving ");
2253 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2254 dump_printf (MSG_NOTE, " step ");
2255 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2256 dump_printf (MSG_NOTE, "\n");
2257 }
2258
2259 return true;
2260 }
2261
2262 if (dump_enabled_p ())
2263 {
2264 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2265 "not consecutive access ");
2266 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2267 }
2268
2269 if (bb_vinfo)
2270 {
2271 /* Mark the statement as unvectorizable. */
2272 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2273 return true;
2274 }
2275
2276 dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
2277 STMT_VINFO_STRIDED_P (stmt_info) = true;
2278 return true;
2279 }
2280
2281 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt)
2282 {
2283 /* First stmt in the interleaving chain. Check the chain. */
2284 gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
2285 struct data_reference *data_ref = dr;
2286 unsigned int count = 1;
2287 tree prev_init = DR_INIT (data_ref);
2288 gimple *prev = stmt;
2289 HOST_WIDE_INT diff, gaps = 0;
2290
2291 while (next)
2292 {
2293 /* Skip same data-refs. If two or more stmts share a
2294 data-ref (supported only for loads), we vectorize only the first
2295 stmt, and the rest get their vectorized loads from the first
2296 one. */
2297 if (!tree_int_cst_compare (DR_INIT (data_ref),
2298 DR_INIT (STMT_VINFO_DATA_REF (
2299 vinfo_for_stmt (next)))))
2300 {
2301 if (DR_IS_WRITE (data_ref))
2302 {
2303 if (dump_enabled_p ())
2304 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2305 "Two store stmts share the same dr.\n");
2306 return false;
2307 }
2308
2309 if (dump_enabled_p ())
2310 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2311 "Two or more load stmts share the same dr.\n");
2312
2313 /* For load use the same data-ref load. */
2314 GROUP_SAME_DR_STMT (vinfo_for_stmt (next)) = prev;
2315
2316 prev = next;
2317 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2318 continue;
2319 }
2320
2321 prev = next;
2322 data_ref = STMT_VINFO_DATA_REF (vinfo_for_stmt (next));
2323
2324 /* All group members have the same STEP by construction. */
2325 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2326
2327 /* Check that the distance between two accesses is equal to the type
2328 size. Otherwise, we have gaps. */
2329 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2330 - TREE_INT_CST_LOW (prev_init)) / type_size;
2331 if (diff != 1)
2332 {
2333 /* FORNOW: SLP of accesses with gaps is not supported. */
2334 slp_impossible = true;
2335 if (DR_IS_WRITE (data_ref))
2336 {
2337 if (dump_enabled_p ())
2338 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2339 "interleaved store with gaps\n");
2340 return false;
2341 }
2342
2343 gaps += diff - 1;
2344 }
2345
2346 last_accessed_element += diff;
2347
2348 /* Store the gap from the previous member of the group. If there is no
2349 gap in the access, GROUP_GAP is always 1. */
2350 GROUP_GAP (vinfo_for_stmt (next)) = diff;
2351
2352 prev_init = DR_INIT (data_ref);
2353 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2354 /* Count the number of data-refs in the chain. */
2355 count++;
2356 }
2357
2358 if (groupsize == 0)
2359 groupsize = count + gaps;
2360
2361 if (groupsize > UINT_MAX)
2362 {
2363 if (dump_enabled_p ())
2364 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2365 "group is too large\n");
2366 return false;
2367 }
2368
2369 /* Check that the size of the interleaving is equal to count for stores,
2370 i.e., that there are no gaps. */
2371 if (groupsize != count
2372 && !DR_IS_READ (dr))
2373 {
2374 if (dump_enabled_p ())
2375 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2376 "interleaved store with gaps\n");
2377 return false;
2378 }
2379
2380 /* If there is a gap after the last load in the group it is the
2381 difference between the groupsize and the last accessed
2382 element.
2383 When there is no gap, this difference should be 0. */
2384 GROUP_GAP (vinfo_for_stmt (stmt)) = groupsize - last_accessed_element;
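/* For example (illustrative): in a group of GROUPSIZE 4 where only
   elements 0 and 1 are loaded, LAST_ACCESSED_ELEMENT is 2 and the
   trailing gap recorded above is 4 - 2 = 2.  */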
2385
2386 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2387 if (dump_enabled_p ())
2388 {
2389 dump_printf_loc (MSG_NOTE, vect_location,
2390 "Detected interleaving ");
2391 if (DR_IS_READ (dr))
2392 dump_printf (MSG_NOTE, "load ");
2393 else
2394 dump_printf (MSG_NOTE, "store ");
2395 dump_printf (MSG_NOTE, "of size %u starting with ",
2396 (unsigned)groupsize);
2397 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
2398 if (GROUP_GAP (vinfo_for_stmt (stmt)) != 0)
2399 dump_printf_loc (MSG_NOTE, vect_location,
2400 "There is a gap of %u elements after the group\n",
2401 GROUP_GAP (vinfo_for_stmt (stmt)));
2402 }
2403
2404 /* SLP: create an SLP data structure for every interleaving group of
2405 stores for further analysis in vect_analyze_slp. */
2406 if (DR_IS_WRITE (dr) && !slp_impossible)
2407 {
2408 if (loop_vinfo)
2409 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt);
2410 if (bb_vinfo)
2411 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt);
2412 }
2413 }
2414
2415 return true;
2416 }
2417
2418 /* Analyze groups of accesses: check that DR belongs to a group of
2419 accesses of legal size, step, etc. Detect gaps, single element
2420 interleaving, and other special cases. Set grouped access info.
2421 Collect groups of strided stores for further use in SLP analysis. */
2422
2423 static bool
2424 vect_analyze_group_access (struct data_reference *dr)
2425 {
2426 if (!vect_analyze_group_access_1 (dr))
2427 {
2428 /* Dissolve the group if present. */
2429 gimple *next;
2430 gimple *stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dr)));
2431 while (stmt)
2432 {
2433 stmt_vec_info vinfo = vinfo_for_stmt (stmt);
2434 next = GROUP_NEXT_ELEMENT (vinfo);
2435 GROUP_FIRST_ELEMENT (vinfo) = NULL;
2436 GROUP_NEXT_ELEMENT (vinfo) = NULL;
2437 stmt = next;
2438 }
2439 return false;
2440 }
2441 return true;
2442 }
2443
2444 /* Analyze the access pattern of the data-reference DR.
2445 In case of non-consecutive accesses call vect_analyze_group_access() to
2446 analyze groups of accesses. */
2447
2448 static bool
2449 vect_analyze_data_ref_access (struct data_reference *dr)
2450 {
2451 tree step = DR_STEP (dr);
2452 tree scalar_type = TREE_TYPE (DR_REF (dr));
2453 gimple *stmt = DR_STMT (dr);
2454 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2455 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2456 struct loop *loop = NULL;
2457
2458 if (loop_vinfo)
2459 loop = LOOP_VINFO_LOOP (loop_vinfo);
2460
2461 if (loop_vinfo && !step)
2462 {
2463 if (dump_enabled_p ())
2464 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2465 "bad data-ref access in loop\n");
2466 return false;
2467 }
2468
2469 /* Allow loads with zero step in inner-loop vectorization. */
2470 if (loop_vinfo && integer_zerop (step))
2471 {
2472 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2473 if (!nested_in_vect_loop_p (loop, stmt))
2474 return DR_IS_READ (dr);
2475 /* Allow references with zero step for outer loops marked
2476 with pragma omp simd only - it guarantees absence of
2477 loop-carried dependencies between inner loop iterations. */
2478 if (!loop->force_vectorize)
2479 {
2480 if (dump_enabled_p ())
2481 dump_printf_loc (MSG_NOTE, vect_location,
2482 "zero step in inner loop of nest\n");
2483 return false;
2484 }
2485 }
2486
2487 if (loop && nested_in_vect_loop_p (loop, stmt))
2488 {
2489 /* Interleaved accesses are not yet supported within outer-loop
2490 vectorization for references in the inner-loop. */
2491 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2492
2493 /* For the rest of the analysis we use the outer-loop step. */
2494 step = STMT_VINFO_DR_STEP (stmt_info);
2495 if (integer_zerop (step))
2496 {
2497 if (dump_enabled_p ())
2498 dump_printf_loc (MSG_NOTE, vect_location,
2499 "zero step in outer loop.\n");
2500 return DR_IS_READ (dr);
2501 }
2502 }
2503
2504 /* Consecutive? */
2505 if (TREE_CODE (step) == INTEGER_CST)
2506 {
2507 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2508 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2509 || (dr_step < 0
2510 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2511 {
2512 /* Mark that it is not interleaving. */
2513 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2514 return true;
2515 }
2516 }
2517
2518 if (loop && nested_in_vect_loop_p (loop, stmt))
2519 {
2520 if (dump_enabled_p ())
2521 dump_printf_loc (MSG_NOTE, vect_location,
2522 "grouped access in outer loop.\n");
2523 return false;
2524 }
2525
2526
2527 /* Assume this is a DR handled by non-constant strided load case. */
2528 if (TREE_CODE (step) != INTEGER_CST)
2529 return (STMT_VINFO_STRIDED_P (stmt_info)
2530 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2531 || vect_analyze_group_access (dr)));
2532
2533 /* Not a consecutive access - check if it is part of an interleaving group. */
2534 return vect_analyze_group_access (dr);
2535 }
2536
2537
2538
2539 /* A helper function used in the comparator function to sort data
2540 references. T1 and T2 are two trees to be compared. The function
2541 returns -1, 0, or 1. */
2542
2543 static int
2544 compare_tree (tree t1, tree t2)
2545 {
2546 int i, cmp;
2547 enum tree_code code;
2548 char tclass;
2549
2550 if (t1 == t2)
2551 return 0;
2552 if (t1 == NULL)
2553 return -1;
2554 if (t2 == NULL)
2555 return 1;
2556
2557 STRIP_NOPS (t1);
2558 STRIP_NOPS (t2);
2559
2560 if (TREE_CODE (t1) != TREE_CODE (t2))
2561 return TREE_CODE (t1) < TREE_CODE (t2) ? -1 : 1;
2562
2563 code = TREE_CODE (t1);
2564 switch (code)
2565 {
2566 /* For const values, we can just use hash values for comparisons. */
2567 case INTEGER_CST:
2568 case REAL_CST:
2569 case FIXED_CST:
2570 case STRING_CST:
2571 case COMPLEX_CST:
2572 case VECTOR_CST:
2573 {
2574 hashval_t h1 = iterative_hash_expr (t1, 0);
2575 hashval_t h2 = iterative_hash_expr (t2, 0);
2576 if (h1 != h2)
2577 return h1 < h2 ? -1 : 1;
2578 break;
2579 }
2580
2581 case SSA_NAME:
2582 cmp = compare_tree (SSA_NAME_VAR (t1), SSA_NAME_VAR (t2));
2583 if (cmp != 0)
2584 return cmp;
2585
2586 if (SSA_NAME_VERSION (t1) != SSA_NAME_VERSION (t2))
2587 return SSA_NAME_VERSION (t1) < SSA_NAME_VERSION (t2) ? -1 : 1;
2588 break;
2589
2590 default:
2591 tclass = TREE_CODE_CLASS (code);
2592
2593 /* For var-decl, we could compare their UIDs. */
2594 if (tclass == tcc_declaration)
2595 {
2596 if (DECL_UID (t1) != DECL_UID (t2))
2597 return DECL_UID (t1) < DECL_UID (t2) ? -1 : 1;
2598 break;
2599 }
2600
2601 /* For expressions with operands, compare their operands recursively. */
2602 for (i = TREE_OPERAND_LENGTH (t1) - 1; i >= 0; --i)
2603 {
2604 cmp = compare_tree (TREE_OPERAND (t1, i), TREE_OPERAND (t2, i));
2605 if (cmp != 0)
2606 return cmp;
2607 }
2608 }
2609
2610 return 0;
2611 }
2612
2613
2614 /* Compare two data-references DRA and DRB to sort them into chunks
2615 suitable for grouping. */
2616
2617 static int
2618 dr_group_sort_cmp (const void *dra_, const void *drb_)
2619 {
2620 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2621 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2622 int cmp;
2623
2624 /* Stabilize sort. */
2625 if (dra == drb)
2626 return 0;
2627
2628 /* DRs in different loops never belong to the same group. */
2629 loop_p loopa = gimple_bb (DR_STMT (dra))->loop_father;
2630 loop_p loopb = gimple_bb (DR_STMT (drb))->loop_father;
2631 if (loopa != loopb)
2632 return loopa->num < loopb->num ? -1 : 1;
2633
2634 /* Ordering of DRs according to base. */
2635 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0))
2636 {
2637 cmp = compare_tree (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb));
2638 if (cmp != 0)
2639 return cmp;
2640 }
2641
2642 /* And according to DR_OFFSET. */
2643 if (!dr_equal_offsets_p (dra, drb))
2644 {
2645 cmp = compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2646 if (cmp != 0)
2647 return cmp;
2648 }
2649
2650 /* Put reads before writes. */
2651 if (DR_IS_READ (dra) != DR_IS_READ (drb))
2652 return DR_IS_READ (dra) ? -1 : 1;
2653
2654 /* Then sort by access size. */
2655 if (!operand_equal_p (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2656 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))), 0))
2657 {
2658 cmp = compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2659 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2660 if (cmp != 0)
2661 return cmp;
2662 }
2663
2664 /* And by step. */
2665 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2666 {
2667 cmp = compare_tree (DR_STEP (dra), DR_STEP (drb));
2668 if (cmp != 0)
2669 return cmp;
2670 }
2671
2672 /* Then sort by DR_INIT. In case of identical DRs sort by stmt UID. */
2673 cmp = tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb));
2674 if (cmp == 0)
2675 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2676 return cmp;
2677 }
2678
2679 /* Function vect_analyze_data_ref_accesses.
2680
2681 Analyze the access pattern of all the data references in the loop.
2682
2683 FORNOW: the only access pattern that is considered vectorizable is a
2684 simple step 1 (consecutive) access.
2685
2686 FORNOW: handle only arrays and pointer accesses. */
2687
2688 bool
2689 vect_analyze_data_ref_accesses (vec_info *vinfo)
2690 {
2691 unsigned int i;
2692 vec<data_reference_p> datarefs = vinfo->datarefs;
2693 struct data_reference *dr;
2694
2695 if (dump_enabled_p ())
2696 dump_printf_loc (MSG_NOTE, vect_location,
2697 "=== vect_analyze_data_ref_accesses ===\n");
2698
2699 if (datarefs.is_empty ())
2700 return true;
2701
2702 /* Sort the array of datarefs to make building the interleaving chains
2703 linear. Don't modify the original vector's order; it is needed for
2704 determining what dependencies are reversed. */
2705 vec<data_reference_p> datarefs_copy = datarefs.copy ();
2706 datarefs_copy.qsort (dr_group_sort_cmp);
2707
2708 /* Build the interleaving chains. */
2709 for (i = 0; i < datarefs_copy.length () - 1;)
2710 {
2711 data_reference_p dra = datarefs_copy[i];
2712 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2713 stmt_vec_info lastinfo = NULL;
2714 for (i = i + 1; i < datarefs_copy.length (); ++i)
2715 {
2716 data_reference_p drb = datarefs_copy[i];
2717 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2718
2719 /* ??? Imperfect sorting (non-compatible types, non-modulo
2720 accesses, same accesses) can lead to a group being artificially
2721 split here as we don't just skip over those. If it really
2722 matters we can push those to a worklist and re-iterate
2723 over them. Then we can just skip ahead to the next DR here. */
2724
2725 /* DRs in a different loop should not be put into the same
2726 interleaving group. */
2727 if (gimple_bb (DR_STMT (dra))->loop_father
2728 != gimple_bb (DR_STMT (drb))->loop_father)
2729 break;
2730
2731 /* Check that the data-refs have same first location (except init)
2732 and they are both either store or load (not load and store,
2733 not masked loads or stores). */
2734 if (DR_IS_READ (dra) != DR_IS_READ (drb)
2735 || !operand_equal_p (DR_BASE_ADDRESS (dra),
2736 DR_BASE_ADDRESS (drb), 0)
2737 || !dr_equal_offsets_p (dra, drb)
2738 || !gimple_assign_single_p (DR_STMT (dra))
2739 || !gimple_assign_single_p (DR_STMT (drb)))
2740 break;
2741
2742 /* Check that the data-refs have the same constant size. */
2743 tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
2744 tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
2745 if (!tree_fits_uhwi_p (sza)
2746 || !tree_fits_uhwi_p (szb)
2747 || !tree_int_cst_equal (sza, szb))
2748 break;
2749
2750 /* Check that the data-refs have the same step. */
2751 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2752 break;
2753
2754 /* Do not place the same access in the interleaving chain twice. */
2755 if (tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb)) == 0)
2756 break;
2757
2758 /* Check the types are compatible.
2759 ??? We don't distinguish this during sorting. */
2760 if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
2761 TREE_TYPE (DR_REF (drb))))
2762 break;
2763
2764 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
2765 HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
2766 HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
2767 gcc_assert (init_a <= init_b);
2768
2769 /* If init_b == init_a + the size of the type * k, we have an
2770 interleaving, and DRA is accessed before DRB. */
2771 HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
2772 if (type_size_a == 0
2773 || (init_b - init_a) % type_size_a != 0)
2774 break;
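/* For example (illustrative): int refs with init_a = 0 and init_b = 8
   and type_size_a = 4 pass this check ((8 - 0) % 4 == 0), so DRB may
   join DRA's interleaving group two elements further on.  */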
2775
2776 /* If we have a store, the accesses are adjacent. This splits
2777 groups into chunks we support (we don't support vectorization
2778 of stores with gaps). */
2779 if (!DR_IS_READ (dra)
2780 && (init_b - (HOST_WIDE_INT) TREE_INT_CST_LOW
2781 (DR_INIT (datarefs_copy[i-1]))
2782 != type_size_a))
2783 break;
2784
2785 /* If the step (when constant and nonzero) is not greater than the
2786 difference between the data-refs' inits, split the group here;
2787 this keeps the groups at suitable sizes. */
2788 if (tree_fits_shwi_p (DR_STEP (dra)))
2789 {
2790 HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra));
2791 if (step != 0 && step <= (init_b - init_a))
2792 break;
2793 }
2794
2795 if (dump_enabled_p ())
2796 {
2797 dump_printf_loc (MSG_NOTE, vect_location,
2798 "Detected interleaving ");
2799 if (DR_IS_READ (dra))
2800 dump_printf (MSG_NOTE, "load ");
2801 else
2802 dump_printf (MSG_NOTE, "store ");
2803 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2804 dump_printf (MSG_NOTE, " and ");
2805 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2806 dump_printf (MSG_NOTE, "\n");
2807 }
2808
2809 /* Link the found element into the group list. */
2810 if (!GROUP_FIRST_ELEMENT (stmtinfo_a))
2811 {
2812 GROUP_FIRST_ELEMENT (stmtinfo_a) = DR_STMT (dra);
2813 lastinfo = stmtinfo_a;
2814 }
2815 GROUP_FIRST_ELEMENT (stmtinfo_b) = DR_STMT (dra);
2816 GROUP_NEXT_ELEMENT (lastinfo) = DR_STMT (drb);
2817 lastinfo = stmtinfo_b;
2818 }
2819 }
2820
2821 FOR_EACH_VEC_ELT (datarefs_copy, i, dr)
2822 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
2823 && !vect_analyze_data_ref_access (dr))
2824 {
2825 if (dump_enabled_p ())
2826 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2827 "not vectorized: complicated access pattern.\n");
2828
2829 if (is_a <bb_vec_info> (vinfo))
2830 {
2831 /* Mark the statement as not vectorizable. */
2832 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2833 continue;
2834 }
2835 else
2836 {
2837 datarefs_copy.release ();
2838 return false;
2839 }
2840 }
2841
2842 datarefs_copy.release ();
2843 return true;
2844 }
2845
2846
2847 /* Operator == between two dr_with_seg_len objects.
2848
2849 This equality operator is used to make sure two data refs
2850 are the same one so that we will consider to combine the
2851 aliasing checks of those two pairs of data dependent data
2852 refs. */
2853
2854 static bool
2855 operator == (const dr_with_seg_len& d1,
2856 const dr_with_seg_len& d2)
2857 {
2858 return operand_equal_p (DR_BASE_ADDRESS (d1.dr),
2859 DR_BASE_ADDRESS (d2.dr), 0)
2860 && compare_tree (DR_OFFSET (d1.dr), DR_OFFSET (d2.dr)) == 0
2861 && compare_tree (DR_INIT (d1.dr), DR_INIT (d2.dr)) == 0
2862 && compare_tree (d1.seg_len, d2.seg_len) == 0;
2863 }
2864
2865 /* Function comp_dr_with_seg_len_pair.
2866
2867 Comparison function for sorting objects of dr_with_seg_len_pair_t
2868 so that we can combine aliasing checks in one scan. */
2869
2870 static int
2871 comp_dr_with_seg_len_pair (const void *pa_, const void *pb_)
2872 {
2873 const dr_with_seg_len_pair_t* pa = (const dr_with_seg_len_pair_t *) pa_;
2874 const dr_with_seg_len_pair_t* pb = (const dr_with_seg_len_pair_t *) pb_;
2875 const dr_with_seg_len &a1 = pa->first, &a2 = pa->second;
2876 const dr_with_seg_len &b1 = pb->first, &b2 = pb->second;
2877
2878 /* For DR pairs (a, b) and (c, d), we only consider merging the alias checks
2879 if a and c have the same base address and step, and b and d have the same
2880 address and step. Therefore, if either a&c or b&d don't have the same address
2881 and step, we don't care about the order of those two pairs after sorting. */
2882 int comp_res;
2883
2884 if ((comp_res = compare_tree (DR_BASE_ADDRESS (a1.dr),
2885 DR_BASE_ADDRESS (b1.dr))) != 0)
2886 return comp_res;
2887 if ((comp_res = compare_tree (DR_BASE_ADDRESS (a2.dr),
2888 DR_BASE_ADDRESS (b2.dr))) != 0)
2889 return comp_res;
2890 if ((comp_res = compare_tree (DR_STEP (a1.dr), DR_STEP (b1.dr))) != 0)
2891 return comp_res;
2892 if ((comp_res = compare_tree (DR_STEP (a2.dr), DR_STEP (b2.dr))) != 0)
2893 return comp_res;
2894 if ((comp_res = compare_tree (DR_OFFSET (a1.dr), DR_OFFSET (b1.dr))) != 0)
2895 return comp_res;
2896 if ((comp_res = compare_tree (DR_INIT (a1.dr), DR_INIT (b1.dr))) != 0)
2897 return comp_res;
2898 if ((comp_res = compare_tree (DR_OFFSET (a2.dr), DR_OFFSET (b2.dr))) != 0)
2899 return comp_res;
2900 if ((comp_res = compare_tree (DR_INIT (a2.dr), DR_INIT (b2.dr))) != 0)
2901 return comp_res;
2902
2903 return 0;
2904 }
2905
2906 /* Function vect_vfa_segment_size.
2907
2908 Create an expression that computes the size of the segment
2909 that will be accessed for a data reference. The function takes into
2910 account that realignment loads may access one more vector.
2911
2912 Input:
2913 DR: The data reference.
2914 LENGTH_FACTOR: segment length to consider.
2915
2916 Return an expression whose value is the size of segment which will be
2917 accessed by DR. */
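/* For example (illustrative): an int access with DR_STEP 4 and
   LENGTH_FACTOR equal to a vectorization factor of 4 gets a segment
   length of 16 bytes; a dr_explicit_realign_optimized access gets one
   extra vector size added on top of that.  */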
2918
2919 static tree
2920 vect_vfa_segment_size (struct data_reference *dr, tree length_factor)
2921 {
2922 tree segment_length;
2923
2924 if (integer_zerop (DR_STEP (dr)))
2925 segment_length = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
2926 else
2927 segment_length = size_binop (MULT_EXPR,
2928 fold_convert (sizetype, DR_STEP (dr)),
2929 fold_convert (sizetype, length_factor));
2930
2931 if (vect_supportable_dr_alignment (dr, false)
2932 == dr_explicit_realign_optimized)
2933 {
2934 tree vector_size = TYPE_SIZE_UNIT
2935 (STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr))));
2936
2937 segment_length = size_binop (PLUS_EXPR, segment_length, vector_size);
2938 }
2939 return segment_length;
2940 }
2941
2942 /* Function vect_no_alias_p.
2943
2944 Given data references A and B with equal base and offset, whose alias
2945 relation can be decided at compilation time, return TRUE if they do
2946 not alias each other; return FALSE otherwise. SEGMENT_LENGTH_A
2947 and SEGMENT_LENGTH_B are the memory lengths accessed by A and B
2948 respectively. */
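/* For example (illustrative): two int refs with DR_INIT 0 and 16 and
   segment lengths of 16 bytes each do not overlap, because seg_a_max (16)
   is not greater than seg_b_min (16), so the pair needs no runtime
   alias check.  */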
2949
2950 static bool
2951 vect_no_alias_p (struct data_reference *a, struct data_reference *b,
2952 tree segment_length_a, tree segment_length_b)
2953 {
2954 gcc_assert (TREE_CODE (DR_INIT (a)) == INTEGER_CST
2955 && TREE_CODE (DR_INIT (b)) == INTEGER_CST);
2956 if (tree_int_cst_equal (DR_INIT (a), DR_INIT (b)))
2957 return false;
2958
2959 tree seg_a_min = DR_INIT (a);
2960 tree seg_a_max = fold_build2 (PLUS_EXPR, TREE_TYPE (seg_a_min),
2961 seg_a_min, segment_length_a);
2962 /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
2963 bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
2964 [a, a+12) */
2965 if (tree_int_cst_compare (DR_STEP (a), size_zero_node) < 0)
2966 {
2967 tree unit_size = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (a)));
2968 seg_a_min = fold_build2 (PLUS_EXPR, TREE_TYPE (seg_a_max),
2969 seg_a_max, unit_size);
2970 seg_a_max = fold_build2 (PLUS_EXPR, TREE_TYPE (DR_INIT (a)),
2971 DR_INIT (a), unit_size);
2972 }
2973 tree seg_b_min = DR_INIT (b);
2974 tree seg_b_max = fold_build2 (PLUS_EXPR, TREE_TYPE (seg_b_min),
2975 seg_b_min, segment_length_b);
2976 if (tree_int_cst_compare (DR_STEP (b), size_zero_node) < 0)
2977 {
2978 tree unit_size = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (b)));
2979 seg_b_min = fold_build2 (PLUS_EXPR, TREE_TYPE (seg_b_max),
2980 seg_b_max, unit_size);
2981 seg_b_max = fold_build2 (PLUS_EXPR, TREE_TYPE (DR_INIT (b)),
2982 DR_INIT (b), unit_size);
2983 }
2984
2985 if (tree_int_cst_le (seg_a_max, seg_b_min)
2986 || tree_int_cst_le (seg_b_max, seg_a_min))
2987 return true;
2988
2989 return false;
2990 }
2991
2992 /* Function vect_prune_runtime_alias_test_list.
2993
2994 Prune a list of ddrs to be tested at run-time by versioning for alias.
2995 Merge several alias checks into one if possible.
2996 Return FALSE if the resulting list of ddrs is longer than allowed by
2997 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
2998
2999 bool
3000 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
3001 {
3002 vec<ddr_p> may_alias_ddrs =
3003 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
3004 vec<dr_with_seg_len_pair_t>& comp_alias_ddrs =
3005 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3006 int vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3007 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
3008
3009 ddr_p ddr;
3010 unsigned int i;
3011 tree length_factor;
3012
3013 if (dump_enabled_p ())
3014 dump_printf_loc (MSG_NOTE, vect_location,
3015 "=== vect_prune_runtime_alias_test_list ===\n");
3016
3017 if (may_alias_ddrs.is_empty ())
3018 return true;
3019
3020 /* Basically, for each pair of dependent data refs store_ptr_0
3021 and load_ptr_0, we create an expression:
3022
3023 ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
3024 || (load_ptr_0 + load_segment_length_0) <= store_ptr_0))
3025
3026 for aliasing checks. However, in some cases we can decrease
3027 the number of checks by combining two checks into one. For
3028 example, suppose we have another pair of data refs store_ptr_0
3029 and load_ptr_1, and if the following condition is satisfied:
3030
3031 load_ptr_0 < load_ptr_1 &&
3032 load_ptr_1 - load_ptr_0 - load_segment_length_0 < store_segment_length_0
3033
3034 (this condition means, in each iteration of vectorized loop,
3035 the accessed memory of store_ptr_0 cannot be between the memory
3036 of load_ptr_0 and load_ptr_1.)
3037
3038 we can then use only the following expression to finish the
3039 aliasing checks between store_ptr_0 & load_ptr_0 and
3040 store_ptr_0 & load_ptr_1:
3041
3042 ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
3043 || (load_ptr_1 + load_segment_length_1 <= store_ptr_0))
3044
3045 Note that we only consider the case where load_ptr_0 and load_ptr_1
3046 have the same base address. */
3047
3048 comp_alias_ddrs.create (may_alias_ddrs.length ());
3049
3050 /* First, we collect all data ref pairs for aliasing checks. */
3051 FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
3052 {
3053 int comp_res;
3054 struct data_reference *dr_a, *dr_b;
3055 gimple *dr_group_first_a, *dr_group_first_b;
3056 tree segment_length_a, segment_length_b;
3057 gimple *stmt_a, *stmt_b;
3058
3059 dr_a = DDR_A (ddr);
3060 stmt_a = DR_STMT (DDR_A (ddr));
3061 dr_group_first_a = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_a));
3062 if (dr_group_first_a)
3063 {
3064 stmt_a = dr_group_first_a;
3065 dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a));
3066 }
3067
3068 dr_b = DDR_B (ddr);
3069 stmt_b = DR_STMT (DDR_B (ddr));
3070 dr_group_first_b = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_b));
3071 if (dr_group_first_b)
3072 {
3073 stmt_b = dr_group_first_b;
3074 dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b));
3075 }
3076
3077 if (!operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0))
3078 length_factor = scalar_loop_iters;
3079 else
3080 length_factor = size_int (vect_factor);
3081 segment_length_a = vect_vfa_segment_size (dr_a, length_factor);
3082 segment_length_b = vect_vfa_segment_size (dr_b, length_factor);
3083
3084 comp_res = compare_tree (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b));
3085 if (comp_res == 0)
3086 comp_res = compare_tree (DR_OFFSET (dr_a), DR_OFFSET (dr_b));
3087
3088 /* Alias is known at compilation time. */
3089 if (comp_res == 0
3090 && TREE_CODE (DR_STEP (dr_a)) == INTEGER_CST
3091 && TREE_CODE (DR_STEP (dr_b)) == INTEGER_CST
3092 && TREE_CODE (segment_length_a) == INTEGER_CST
3093 && TREE_CODE (segment_length_b) == INTEGER_CST)
3094 {
3095 if (vect_no_alias_p (dr_a, dr_b, segment_length_a, segment_length_b))
3096 continue;
3097
3098 if (dump_enabled_p ())
3099 dump_printf_loc (MSG_NOTE, vect_location,
3100 "not vectorized: compilation time alias.\n");
3101
3102 return false;
3103 }
3104
3105 dr_with_seg_len_pair_t dr_with_seg_len_pair
3106 (dr_with_seg_len (dr_a, segment_length_a),
3107 dr_with_seg_len (dr_b, segment_length_b));
3108
3109 /* Canonicalize pairs by sorting the two DR members. */
3110 if (comp_res > 0)
3111 std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
3112
3113 comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
3114 }
3115
3116 /* Second, we sort the collected data ref pairs so that we can scan
3117 them once to combine all possible aliasing checks. */
3118 comp_alias_ddrs.qsort (comp_dr_with_seg_len_pair);
3119
3120 /* Third, we scan the sorted dr pairs and check if we can combine
3121 alias checks of two neighboring dr pairs. */
3122 for (size_t i = 1; i < comp_alias_ddrs.length (); ++i)
3123 {
3124 /* Deal with two ddrs (dr_a1, dr_b1) and (dr_a2, dr_b2). */
3125 dr_with_seg_len *dr_a1 = &comp_alias_ddrs[i-1].first,
3126 *dr_b1 = &comp_alias_ddrs[i-1].second,
3127 *dr_a2 = &comp_alias_ddrs[i].first,
3128 *dr_b2 = &comp_alias_ddrs[i].second;
3129
3130 /* Remove duplicate data ref pairs. */
3131 if (*dr_a1 == *dr_a2 && *dr_b1 == *dr_b2)
3132 {
3133 if (dump_enabled_p ())
3134 {
3135 dump_printf_loc (MSG_NOTE, vect_location,
3136 "found equal ranges ");
3137 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3138 DR_REF (dr_a1->dr));
3139 dump_printf (MSG_NOTE, ", ");
3140 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3141 DR_REF (dr_b1->dr));
3142 dump_printf (MSG_NOTE, " and ");
3143 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3144 DR_REF (dr_a2->dr));
3145 dump_printf (MSG_NOTE, ", ");
3146 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3147 DR_REF (dr_b2->dr));
3148 dump_printf (MSG_NOTE, "\n");
3149 }
3150
3151 comp_alias_ddrs.ordered_remove (i--);
3152 continue;
3153 }
3154
3155 if (*dr_a1 == *dr_a2 || *dr_b1 == *dr_b2)
3156 {
3157 /* We consider the case that DR_B1 and DR_B2 are the same memref,
3158 and DR_A1 and DR_A2 are two consecutive memrefs. */
3159 if (*dr_a1 == *dr_a2)
3160 {
3161 std::swap (dr_a1, dr_b1);
3162 std::swap (dr_a2, dr_b2);
3163 }
3164
3165 if (!operand_equal_p (DR_BASE_ADDRESS (dr_a1->dr),
3166 DR_BASE_ADDRESS (dr_a2->dr), 0)
3167 || !operand_equal_p (DR_OFFSET (dr_a1->dr),
3168 DR_OFFSET (dr_a2->dr), 0)
3169 || !tree_fits_shwi_p (DR_INIT (dr_a1->dr))
3170 || !tree_fits_shwi_p (DR_INIT (dr_a2->dr)))
3171 continue;
3172
3173 /* Make sure dr_a1 starts left of dr_a2. */
3174 if (tree_int_cst_lt (DR_INIT (dr_a2->dr), DR_INIT (dr_a1->dr)))
3175 std::swap (*dr_a1, *dr_a2);
3176
3177 bool do_remove = false;
3178 unsigned HOST_WIDE_INT diff
3179 = (tree_to_shwi (DR_INIT (dr_a2->dr))
3180 - tree_to_shwi (DR_INIT (dr_a1->dr)));
3181
3182 /* If the left segment does not extend beyond the start of the
3183 right segment, the new segment length is that of the right
3184 plus the segment distance. */
3185 if (tree_fits_uhwi_p (dr_a1->seg_len)
3186 && compare_tree_int (dr_a1->seg_len, diff) <= 0)
3187 {
3188 dr_a1->seg_len = size_binop (PLUS_EXPR, dr_a2->seg_len,
3189 size_int (diff));
3190 do_remove = true;
3191 }
3192 /* Generally the new segment length is the maximum of the
3193 left segment size and the right segment size plus the distance.
3194 ??? We can also build tree MAX_EXPR here but it's not clear this
3195 is profitable. */
3196 else if (tree_fits_uhwi_p (dr_a1->seg_len)
3197 && tree_fits_uhwi_p (dr_a2->seg_len))
3198 {
3199 unsigned HOST_WIDE_INT seg_len_a1 = tree_to_uhwi (dr_a1->seg_len);
3200 unsigned HOST_WIDE_INT seg_len_a2 = tree_to_uhwi (dr_a2->seg_len);
3201 dr_a1->seg_len = size_int (MAX (seg_len_a1, diff + seg_len_a2));
3202 do_remove = true;
3203 }
3204 /* Now we check if the following condition is satisfied:
3205
3206 DIFF - SEGMENT_LENGTH_A < SEGMENT_LENGTH_B
3207
3208 where DIFF = DR_A2_INIT - DR_A1_INIT. However,
3209 SEGMENT_LENGTH_A or SEGMENT_LENGTH_B may not be constant so we
3210 have to make a best estimation. We can get the minimum value
3211 of SEGMENT_LENGTH_B as a constant, represented by MIN_SEG_LEN_B,
3212 then either of the following two conditions can guarantee the
3213 one above:
3214
3215 1: DIFF <= MIN_SEG_LEN_B
3216 2: DIFF - SEGMENT_LENGTH_A < MIN_SEG_LEN_B */
3217 else
3218 {
3219 unsigned HOST_WIDE_INT min_seg_len_b
3220 = (tree_fits_uhwi_p (dr_b1->seg_len)
3221 ? tree_to_uhwi (dr_b1->seg_len)
3222 : vect_factor);
3223
3224 if (diff <= min_seg_len_b
3225 || (tree_fits_uhwi_p (dr_a1->seg_len)
3226 && diff - tree_to_uhwi (dr_a1->seg_len) < min_seg_len_b))
3227 {
3228 dr_a1->seg_len = size_binop (PLUS_EXPR,
3229 dr_a2->seg_len, size_int (diff));
3230 do_remove = true;
3231 }
3232 }
3233
3234 if (do_remove)
3235 {
3236 if (dump_enabled_p ())
3237 {
3238 dump_printf_loc (MSG_NOTE, vect_location,
3239 "merging ranges for ");
3240 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a1->dr));
3241 dump_printf (MSG_NOTE, ", ");
3242 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b1->dr));
3243 dump_printf (MSG_NOTE, " and ");
3244 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a2->dr));
3245 dump_printf (MSG_NOTE, ", ");
3246 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b2->dr));
3247 dump_printf (MSG_NOTE, "\n");
3248 }
3249 comp_alias_ddrs.ordered_remove (i--);
3250 }
3251 }
3252 }
3253
3254 dump_printf_loc (MSG_NOTE, vect_location,
3255 "improved number of alias checks from %d to %d\n",
3256 may_alias_ddrs.length (), comp_alias_ddrs.length ());
3257 if ((int) comp_alias_ddrs.length () >
3258 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
3259 {
3260 if (dump_enabled_p ())
3261 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3262 "number of versioning for alias "
3263 "run-time tests exceeds %d "
3264 "(--param vect-max-version-for-alias-checks)\n",
3265 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
3266 return false;
3267 }
3268
3269 /* All alias checks have been resolved at compilation time. */
3270 if (!comp_alias_ddrs.length ())
3271 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).truncate (0);
3272
3273 return true;
3274 }
3275
3276 /* Return true if a non-affine read or write in STMT is suitable for a
3277 gather load or scatter store. Describe the operation in *INFO if so. */
3278
3279 bool
3280 vect_check_gather_scatter (gimple *stmt, loop_vec_info loop_vinfo,
3281 gather_scatter_info *info)
3282 {
3283 HOST_WIDE_INT scale = 1, pbitpos, pbitsize;
3284 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3285 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3286 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3287 tree offtype = NULL_TREE;
3288 tree decl, base, off;
3289 machine_mode pmode;
3290 int punsignedp, reversep, pvolatilep = 0;
3291
3292 base = DR_REF (dr);
3293 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
3294 see if we can use the def stmt of the address. */
3295 if (is_gimple_call (stmt)
3296 && gimple_call_internal_p (stmt)
3297 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
3298 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
3299 && TREE_CODE (base) == MEM_REF
3300 && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
3301 && integer_zerop (TREE_OPERAND (base, 1))
3302 && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
3303 {
3304 gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
3305 if (is_gimple_assign (def_stmt)
3306 && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
3307 base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
3308 }
3309
3310 /* The gather and scatter builtins need address of the form
3311 loop_invariant + vector * {1, 2, 4, 8}
3312 or
3313 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3314 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3315 of loop invariants/SSA_NAMEs defined in the loop, with casts,
3316 multiplications and additions in it. To get a vector, we need
3317 a single SSA_NAME that will be defined in the loop and will
3318 contain everything that is not loop invariant and that can be
3319 vectorized. The following code attempts to find such a preexisting
3320 SSA_NAME OFF and put the loop invariants into a tree BASE
3321 that can be gimplified before the loop. */
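/* For example (illustrative): for a loop reading y[idx[i]] with int
   elements, BASE ends up as &y plus any constant byte offset, OFF as the
   SSA_NAME holding idx[i] (possibly sign-extended), and SCALE as 4.  */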
3322 base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
3323 &punsignedp, &reversep, &pvolatilep);
3324 gcc_assert (base && (pbitpos % BITS_PER_UNIT) == 0 && !reversep);
3325
3326 if (TREE_CODE (base) == MEM_REF)
3327 {
3328 if (!integer_zerop (TREE_OPERAND (base, 1)))
3329 {
3330 if (off == NULL_TREE)
3331 {
3332 offset_int moff = mem_ref_offset (base);
3333 off = wide_int_to_tree (sizetype, moff);
3334 }
3335 else
3336 off = size_binop (PLUS_EXPR, off,
3337 fold_convert (sizetype, TREE_OPERAND (base, 1)));
3338 }
3339 base = TREE_OPERAND (base, 0);
3340 }
3341 else
3342 base = build_fold_addr_expr (base);
3343
3344 if (off == NULL_TREE)
3345 off = size_zero_node;
3346
3347 /* If base is not loop invariant, then if off is 0 we start with just
3348 the constant offset in the loop invariant BASE and continue with base
3349 as OFF; otherwise give up.
3350 We could handle that case by gimplifying the addition of base + off
3351 into some SSA_NAME and use that as off, but for now punt. */
3352 if (!expr_invariant_in_loop_p (loop, base))
3353 {
3354 if (!integer_zerop (off))
3355 return false;
3356 off = base;
3357 base = size_int (pbitpos / BITS_PER_UNIT);
3358 }
3359 /* Otherwise put base + constant offset into the loop invariant BASE
3360 and continue with OFF. */
3361 else
3362 {
3363 base = fold_convert (sizetype, base);
3364 base = size_binop (PLUS_EXPR, base, size_int (pbitpos / BITS_PER_UNIT));
3365 }
3366
3367 /* OFF at this point may be either a SSA_NAME or some tree expression
3368 from get_inner_reference. Try to peel off loop invariants from it
3369 into BASE as long as possible. */
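  /* For instance (illustration only): if OFF is invar_3 + n_7 with invar_3
     loop invariant, the PLUS_EXPR case below folds invar_3 (multiplied by
     the current SCALE) into BASE and continues the walk with n_7, while a
     MULT_EXPR by an integer constant instead becomes the SCALE itself.  */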
3370 STRIP_NOPS (off);
3371 while (offtype == NULL_TREE)
3372 {
3373 enum tree_code code;
3374 tree op0, op1, add = NULL_TREE;
3375
3376 if (TREE_CODE (off) == SSA_NAME)
3377 {
3378 gimple *def_stmt = SSA_NAME_DEF_STMT (off);
3379
3380 if (expr_invariant_in_loop_p (loop, off))
3381 return false;
3382
3383 if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
3384 break;
3385
3386 op0 = gimple_assign_rhs1 (def_stmt);
3387 code = gimple_assign_rhs_code (def_stmt);
3388 op1 = gimple_assign_rhs2 (def_stmt);
3389 }
3390 else
3391 {
3392 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
3393 return false;
3394 code = TREE_CODE (off);
3395 extract_ops_from_tree (off, &code, &op0, &op1);
3396 }
3397 switch (code)
3398 {
3399 case POINTER_PLUS_EXPR:
3400 case PLUS_EXPR:
3401 if (expr_invariant_in_loop_p (loop, op0))
3402 {
3403 add = op0;
3404 off = op1;
3405 do_add:
3406 add = fold_convert (sizetype, add);
3407 if (scale != 1)
3408 add = size_binop (MULT_EXPR, add, size_int (scale));
3409 base = size_binop (PLUS_EXPR, base, add);
3410 continue;
3411 }
3412 if (expr_invariant_in_loop_p (loop, op1))
3413 {
3414 add = op1;
3415 off = op0;
3416 goto do_add;
3417 }
3418 break;
3419 case MINUS_EXPR:
3420 if (expr_invariant_in_loop_p (loop, op1))
3421 {
3422 add = fold_convert (sizetype, op1);
3423 add = size_binop (MINUS_EXPR, size_zero_node, add);
3424 off = op0;
3425 goto do_add;
3426 }
3427 break;
3428 case MULT_EXPR:
3429 if (scale == 1 && tree_fits_shwi_p (op1))
3430 {
3431 scale = tree_to_shwi (op1);
3432 off = op0;
3433 continue;
3434 }
3435 break;
3436 case SSA_NAME:
3437 off = op0;
3438 continue;
3439 CASE_CONVERT:
3440 if (!POINTER_TYPE_P (TREE_TYPE (op0))
3441 && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3442 break;
3443 if (TYPE_PRECISION (TREE_TYPE (op0))
3444 == TYPE_PRECISION (TREE_TYPE (off)))
3445 {
3446 off = op0;
3447 continue;
3448 }
3449 if (TYPE_PRECISION (TREE_TYPE (op0))
3450 < TYPE_PRECISION (TREE_TYPE (off)))
3451 {
3452 off = op0;
3453 offtype = TREE_TYPE (off);
3454 STRIP_NOPS (off);
3455 continue;
3456 }
3457 break;
3458 default:
3459 break;
3460 }
3461 break;
3462 }
3463
3464 /* If at the end OFF still isn't a SSA_NAME or isn't
3465 defined in the loop, punt. */
3466 if (TREE_CODE (off) != SSA_NAME
3467 || expr_invariant_in_loop_p (loop, off))
3468 return false;
3469
3470 if (offtype == NULL_TREE)
3471 offtype = TREE_TYPE (off);
3472
3473 if (DR_IS_READ (dr))
3474 decl = targetm.vectorize.builtin_gather (STMT_VINFO_VECTYPE (stmt_info),
3475 offtype, scale);
3476 else
3477 decl = targetm.vectorize.builtin_scatter (STMT_VINFO_VECTYPE (stmt_info),
3478 offtype, scale);
3479
3480 if (decl == NULL_TREE)
3481 return false;
3482
3483 info->decl = decl;
3484 info->base = base;
3485 info->offset = off;
3486 info->offset_dt = vect_unknown_def_type;
3487 info->offset_vectype = NULL_TREE;
3488 info->scale = scale;
3489 return true;
3490 }
3491
3492 /* Function vect_analyze_data_refs.
3493
3494 Find all the data references in the loop or basic block.
3495
3496 The general structure of the analysis of data refs in the vectorizer is as
3497 follows:
3498 1- vect_analyze_data_refs(loop/bb): call
3499 compute_data_dependences_for_loop/bb to find and analyze all data-refs
3500 in the loop/bb and their dependences.
3501 2- vect_analyze_dependences(): apply dependence testing using ddrs.
3502 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
3503 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
3504
3505 */
3506
3507 bool
3508 vect_analyze_data_refs (vec_info *vinfo, int *min_vf)
3509 {
3510 struct loop *loop = NULL;
3511 unsigned int i;
3512 struct data_reference *dr;
3513 tree scalar_type;
3514
3515 if (dump_enabled_p ())
3516 dump_printf_loc (MSG_NOTE, vect_location,
3517 "=== vect_analyze_data_refs ===\n");
3518
3519 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
3520 loop = LOOP_VINFO_LOOP (loop_vinfo);
3521
3522 /* Go through the data-refs, check that the analysis succeeded. Update
3523 pointer from stmt_vec_info struct to DR and vectype. */
3524
3525 vec<data_reference_p> datarefs = vinfo->datarefs;
3526 FOR_EACH_VEC_ELT (datarefs, i, dr)
3527 {
3528 gimple *stmt;
3529 stmt_vec_info stmt_info;
3530 tree base, offset, init;
3531 enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
3532 bool simd_lane_access = false;
3533 int vf;
3534
3535 again:
3536 if (!dr || !DR_REF (dr))
3537 {
3538 if (dump_enabled_p ())
3539 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3540 "not vectorized: unhandled data-ref\n");
3541 return false;
3542 }
3543
3544 stmt = DR_STMT (dr);
3545 stmt_info = vinfo_for_stmt (stmt);
3546
3547 /* Discard clobbers from the dataref vector. We will remove
3548 clobber stmts during vectorization. */
3549 if (gimple_clobber_p (stmt))
3550 {
3551 free_data_ref (dr);
3552 if (i == datarefs.length () - 1)
3553 {
3554 datarefs.pop ();
3555 break;
3556 }
3557 datarefs.ordered_remove (i);
3558 dr = datarefs[i];
3559 goto again;
3560 }
3561
3562 /* Check that analysis of the data-ref succeeded. */
3563 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
3564 || !DR_STEP (dr))
3565 {
3566 bool maybe_gather
3567 = DR_IS_READ (dr)
3568 && !TREE_THIS_VOLATILE (DR_REF (dr))
3569 && targetm.vectorize.builtin_gather != NULL;
3570 bool maybe_scatter
3571 = DR_IS_WRITE (dr)
3572 && !TREE_THIS_VOLATILE (DR_REF (dr))
3573 && targetm.vectorize.builtin_scatter != NULL;
3574 bool maybe_simd_lane_access
3575 = is_a <loop_vec_info> (vinfo) && loop->simduid;
3576
3577         /* If the target supports vector gather loads or scatter stores,
3578            or if this might be a SIMD lane access, see if we can use them.  */
3579 if (is_a <loop_vec_info> (vinfo)
3580 && (maybe_gather || maybe_scatter || maybe_simd_lane_access)
3581 && !nested_in_vect_loop_p (loop, stmt))
3582 {
3583 struct data_reference *newdr
3584 = create_data_ref (NULL, loop_containing_stmt (stmt),
3585 DR_REF (dr), stmt, maybe_scatter ? false : true);
3586 gcc_assert (newdr != NULL && DR_REF (newdr));
3587 if (DR_BASE_ADDRESS (newdr)
3588 && DR_OFFSET (newdr)
3589 && DR_INIT (newdr)
3590 && DR_STEP (newdr)
3591 && integer_zerop (DR_STEP (newdr)))
3592 {
3593 if (maybe_simd_lane_access)
3594 {
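                  /* The offset of a SIMD lane access is expected to look like
                     .GOMP_SIMD_LANE (simduid) * element_size, which is how
                     accesses to simd-lane-private arrays created for OpenMP
                     simd loops appear; the checks below verify that shape.  */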
3595 tree off = DR_OFFSET (newdr);
3596 STRIP_NOPS (off);
3597 if (TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
3598 && TREE_CODE (off) == MULT_EXPR
3599 && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
3600 {
3601 tree step = TREE_OPERAND (off, 1);
3602 off = TREE_OPERAND (off, 0);
3603 STRIP_NOPS (off);
3604 if (CONVERT_EXPR_P (off)
3605 && TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off,
3606 0)))
3607 < TYPE_PRECISION (TREE_TYPE (off)))
3608 off = TREE_OPERAND (off, 0);
3609 if (TREE_CODE (off) == SSA_NAME)
3610 {
3611 gimple *def = SSA_NAME_DEF_STMT (off);
3612 tree reft = TREE_TYPE (DR_REF (newdr));
3613 if (is_gimple_call (def)
3614 && gimple_call_internal_p (def)
3615 && (gimple_call_internal_fn (def)
3616 == IFN_GOMP_SIMD_LANE))
3617 {
3618 tree arg = gimple_call_arg (def, 0);
3619 gcc_assert (TREE_CODE (arg) == SSA_NAME);
3620 arg = SSA_NAME_VAR (arg);
3621 if (arg == loop->simduid
3622 /* For now. */
3623 && tree_int_cst_equal
3624 (TYPE_SIZE_UNIT (reft),
3625 step))
3626 {
3627 DR_OFFSET (newdr) = ssize_int (0);
3628 DR_STEP (newdr) = step;
3629 DR_ALIGNED_TO (newdr)
3630 = size_int (BIGGEST_ALIGNMENT);
3631 dr = newdr;
3632 simd_lane_access = true;
3633 }
3634 }
3635 }
3636 }
3637 }
3638 if (!simd_lane_access && (maybe_gather || maybe_scatter))
3639 {
3640 dr = newdr;
3641 if (maybe_gather)
3642 gatherscatter = GATHER;
3643 else
3644 gatherscatter = SCATTER;
3645 }
3646 }
3647 if (gatherscatter == SG_NONE && !simd_lane_access)
3648 free_data_ref (newdr);
3649 }
3650
3651 if (gatherscatter == SG_NONE && !simd_lane_access)
3652 {
3653 if (dump_enabled_p ())
3654 {
3655 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3656 "not vectorized: data ref analysis "
3657 "failed ");
3658 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3659 }
3660
3661 if (is_a <bb_vec_info> (vinfo))
3662 break;
3663
3664 return false;
3665 }
3666 }
3667
3668 if (TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
3669 {
3670 if (dump_enabled_p ())
3671 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3672 "not vectorized: base addr of dr is a "
3673 "constant\n");
3674
3675 if (is_a <bb_vec_info> (vinfo))
3676 break;
3677
3678 if (gatherscatter != SG_NONE || simd_lane_access)
3679 free_data_ref (dr);
3680 return false;
3681 }
3682
3683 if (TREE_THIS_VOLATILE (DR_REF (dr)))
3684 {
3685 if (dump_enabled_p ())
3686 {
3687 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3688 "not vectorized: volatile type ");
3689 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3690 }
3691
3692 if (is_a <bb_vec_info> (vinfo))
3693 break;
3694
3695 return false;
3696 }
3697
3698 if (stmt_can_throw_internal (stmt))
3699 {
3700 if (dump_enabled_p ())
3701 {
3702 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3703 "not vectorized: statement can throw an "
3704 "exception ");
3705 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3706 }
3707
3708 if (is_a <bb_vec_info> (vinfo))
3709 break;
3710
3711 if (gatherscatter != SG_NONE || simd_lane_access)
3712 free_data_ref (dr);
3713 return false;
3714 }
3715
3716 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
3717 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
3718 {
3719 if (dump_enabled_p ())
3720 {
3721 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3722 "not vectorized: statement is bitfield "
3723 "access ");
3724 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3725 }
3726
3727 if (is_a <bb_vec_info> (vinfo))
3728 break;
3729
3730 if (gatherscatter != SG_NONE || simd_lane_access)
3731 free_data_ref (dr);
3732 return false;
3733 }
3734
3735 base = unshare_expr (DR_BASE_ADDRESS (dr));
3736 offset = unshare_expr (DR_OFFSET (dr));
3737 init = unshare_expr (DR_INIT (dr));
3738
3739 if (is_gimple_call (stmt)
3740 && (!gimple_call_internal_p (stmt)
3741 || (gimple_call_internal_fn (stmt) != IFN_MASK_LOAD
3742 && gimple_call_internal_fn (stmt) != IFN_MASK_STORE)))
3743 {
3744 if (dump_enabled_p ())
3745 {
3746 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3747 "not vectorized: dr in a call ");
3748 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3749 }
3750
3751 if (is_a <bb_vec_info> (vinfo))
3752 break;
3753
3754 if (gatherscatter != SG_NONE || simd_lane_access)
3755 free_data_ref (dr);
3756 return false;
3757 }
3758
3759 /* Update DR field in stmt_vec_info struct. */
3760
3761       /* If the dataref is in an inner-loop of the loop that is considered
3762          for vectorization, we also want to analyze the access relative to
3763 the outer-loop (DR contains information only relative to the
3764 inner-most enclosing loop). We do that by building a reference to the
3765 first location accessed by the inner-loop, and analyze it relative to
3766 the outer-loop. */
3767 if (loop && nested_in_vect_loop_p (loop, stmt))
3768 {
3769 tree outer_step, outer_base, outer_init;
3770 HOST_WIDE_INT pbitsize, pbitpos;
3771 tree poffset;
3772 machine_mode pmode;
3773 int punsignedp, preversep, pvolatilep;
3774 affine_iv base_iv, offset_iv;
3775 tree dinit;
3776
3777 /* Build a reference to the first location accessed by the
3778 inner-loop: *(BASE+INIT). (The first location is actually
3779 BASE+INIT+OFFSET, but we add OFFSET separately later). */
3780 tree inner_base = build_fold_indirect_ref
3781 (fold_build_pointer_plus (base, init));
3782
3783 if (dump_enabled_p ())
3784 {
3785 dump_printf_loc (MSG_NOTE, vect_location,
3786 "analyze in outer-loop: ");
3787 dump_generic_expr (MSG_NOTE, TDF_SLIM, inner_base);
3788 dump_printf (MSG_NOTE, "\n");
3789 }
3790
3791 outer_base = get_inner_reference (inner_base, &pbitsize, &pbitpos,
3792 &poffset, &pmode, &punsignedp,
3793 &preversep, &pvolatilep);
3794 gcc_assert (outer_base != NULL_TREE);
3795
3796 if (pbitpos % BITS_PER_UNIT != 0)
3797 {
3798 if (dump_enabled_p ())
3799 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3800 "failed: bit offset alignment.\n");
3801 return false;
3802 }
3803
3804 if (preversep)
3805 {
3806 if (dump_enabled_p ())
3807 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3808 "failed: reverse storage order.\n");
3809 return false;
3810 }
3811
3812 outer_base = build_fold_addr_expr (outer_base);
3813 if (!simple_iv (loop, loop_containing_stmt (stmt), outer_base,
3814 &base_iv, false))
3815 {
3816 if (dump_enabled_p ())
3817 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3818 "failed: evolution of base is not affine.\n");
3819 return false;
3820 }
3821
3822 if (offset)
3823 {
3824 if (poffset)
3825 poffset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset), offset,
3826 poffset);
3827 else
3828 poffset = offset;
3829 }
3830
3831 if (!poffset)
3832 {
3833 offset_iv.base = ssize_int (0);
3834 offset_iv.step = ssize_int (0);
3835 }
3836 else if (!simple_iv (loop, loop_containing_stmt (stmt), poffset,
3837 &offset_iv, false))
3838 {
3839 if (dump_enabled_p ())
3840 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3841 "evolution of offset is not affine.\n");
3842 return false;
3843 }
3844
3845 outer_init = ssize_int (pbitpos / BITS_PER_UNIT);
3846 split_constant_offset (base_iv.base, &base_iv.base, &dinit);
3847 outer_init = size_binop (PLUS_EXPR, outer_init, dinit);
3848 split_constant_offset (offset_iv.base, &offset_iv.base, &dinit);
3849 outer_init = size_binop (PLUS_EXPR, outer_init, dinit);
3850
3851 outer_step = size_binop (PLUS_EXPR,
3852 fold_convert (ssizetype, base_iv.step),
3853 fold_convert (ssizetype, offset_iv.step));
3854
3855 STMT_VINFO_DR_STEP (stmt_info) = outer_step;
3856 /* FIXME: Use canonicalize_base_object_address (base_iv.base); */
3857 STMT_VINFO_DR_BASE_ADDRESS (stmt_info) = base_iv.base;
3858 STMT_VINFO_DR_INIT (stmt_info) = outer_init;
3859 STMT_VINFO_DR_OFFSET (stmt_info) =
3860 fold_convert (ssizetype, offset_iv.base);
3861 STMT_VINFO_DR_ALIGNED_TO (stmt_info) =
3862 size_int (highest_pow2_factor (offset_iv.base));
3863
3864 if (dump_enabled_p ())
3865 {
3866 dump_printf_loc (MSG_NOTE, vect_location,
3867 "\touter base_address: ");
3868 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3869 STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
3870 dump_printf (MSG_NOTE, "\n\touter offset from base address: ");
3871 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3872 STMT_VINFO_DR_OFFSET (stmt_info));
3873 dump_printf (MSG_NOTE,
3874 "\n\touter constant offset from base address: ");
3875 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3876 STMT_VINFO_DR_INIT (stmt_info));
3877 dump_printf (MSG_NOTE, "\n\touter step: ");
3878 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3879 STMT_VINFO_DR_STEP (stmt_info));
3880 dump_printf (MSG_NOTE, "\n\touter aligned to: ");
3881 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3882 STMT_VINFO_DR_ALIGNED_TO (stmt_info));
3883 dump_printf (MSG_NOTE, "\n");
3884 }
3885 }
3886
3887 if (STMT_VINFO_DATA_REF (stmt_info))
3888 {
3889 if (dump_enabled_p ())
3890 {
3891 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3892 "not vectorized: more than one data ref "
3893 "in stmt: ");
3894 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3895 }
3896
3897 if (is_a <bb_vec_info> (vinfo))
3898 break;
3899
3900 if (gatherscatter != SG_NONE || simd_lane_access)
3901 free_data_ref (dr);
3902 return false;
3903 }
3904
3905 STMT_VINFO_DATA_REF (stmt_info) = dr;
3906 if (simd_lane_access)
3907 {
3908 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) = true;
3909 free_data_ref (datarefs[i]);
3910 datarefs[i] = dr;
3911 }
3912
3913 /* Set vectype for STMT. */
3914 scalar_type = TREE_TYPE (DR_REF (dr));
3915 STMT_VINFO_VECTYPE (stmt_info)
3916 = get_vectype_for_scalar_type (scalar_type);
3917 if (!STMT_VINFO_VECTYPE (stmt_info))
3918 {
3919 if (dump_enabled_p ())
3920 {
3921 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3922 "not vectorized: no vectype for stmt: ");
3923 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3924 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
3925 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
3926 scalar_type);
3927 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3928 }
3929
3930 if (is_a <bb_vec_info> (vinfo))
3931 {
3932 /* No vector type is fine, the ref can still participate
3933 in dependence analysis, we just can't vectorize it. */
3934 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
3935 continue;
3936 }
3937
3938 if (gatherscatter != SG_NONE || simd_lane_access)
3939 {
3940 STMT_VINFO_DATA_REF (stmt_info) = NULL;
3941 if (gatherscatter != SG_NONE)
3942 free_data_ref (dr);
3943 }
3944 return false;
3945 }
3946 else
3947 {
3948 if (dump_enabled_p ())
3949 {
3950 dump_printf_loc (MSG_NOTE, vect_location,
3951 "got vectype for stmt: ");
3952 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
3953 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3954 STMT_VINFO_VECTYPE (stmt_info));
3955 dump_printf (MSG_NOTE, "\n");
3956 }
3957 }
3958
3959 /* Adjust the minimal vectorization factor according to the
3960 vector type. */
3961 vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
3962 if (vf > *min_vf)
3963 *min_vf = vf;
3964
3965 if (gatherscatter != SG_NONE)
3966 {
3967 gather_scatter_info gs_info;
3968 if (!vect_check_gather_scatter (stmt, as_a <loop_vec_info> (vinfo),
3969 &gs_info)
3970 || !get_vectype_for_scalar_type (TREE_TYPE (gs_info.offset)))
3971 {
3972 STMT_VINFO_DATA_REF (stmt_info) = NULL;
3973 free_data_ref (dr);
3974 if (dump_enabled_p ())
3975 {
3976 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3977 (gatherscatter == GATHER) ?
3978 "not vectorized: not suitable for gather "
3979 "load " :
3980 "not vectorized: not suitable for scatter "
3981 "store ");
3982 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3983 }
3984 return false;
3985 }
3986
3987 free_data_ref (datarefs[i]);
3988 datarefs[i] = dr;
3989 STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
3990 }
3991
3992 else if (is_a <loop_vec_info> (vinfo)
3993 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
3994 {
3995 if (nested_in_vect_loop_p (loop, stmt))
3996 {
3997 if (dump_enabled_p ())
3998 {
3999 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4000 "not vectorized: not suitable for strided "
4001 "load ");
4002 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4003 }
4004 return false;
4005 }
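            /* DR_STEP is not a compile-time constant here, e.g. for
               a[i * stride] with a stride only known at run time; record the
               access as strided rather than giving up.  */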
4006 STMT_VINFO_STRIDED_P (stmt_info) = true;
4007 }
4008 }
4009
4010 /* If we stopped analysis at the first dataref we could not analyze
4011 when trying to vectorize a basic-block mark the rest of the datarefs
4012 as not vectorizable and truncate the vector of datarefs. That
4013 avoids spending useless time in analyzing their dependence. */
4014 if (i != datarefs.length ())
4015 {
4016 gcc_assert (is_a <bb_vec_info> (vinfo));
4017 for (unsigned j = i; j < datarefs.length (); ++j)
4018 {
4019 data_reference_p dr = datarefs[j];
4020 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
4021 free_data_ref (dr);
4022 }
4023 datarefs.truncate (i);
4024 }
4025
4026 return true;
4027 }
4028
4029
4030 /* Function vect_get_new_vect_var.
4031
4032    Create and return a new temporary variable.  The current naming scheme
4033    prepends the prefix "vect", "stmp", "mask" or "vectp" (depending on the
4034    value of VAR_KIND) and an underscore to NAME when NAME is provided, and
4035    otherwise uses the prefix alone.  */
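/* For instance (illustration only): with VAR_KIND == vect_pointer_var and
   NAME == "a", the new temporary's name starts with "vectp_a"; with a NULL
   NAME it is just the prefix.  */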
4036
4037 tree
4038 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
4039 {
4040 const char *prefix;
4041 tree new_vect_var;
4042
4043 switch (var_kind)
4044 {
4045 case vect_simple_var:
4046 prefix = "vect";
4047 break;
4048 case vect_scalar_var:
4049 prefix = "stmp";
4050 break;
4051 case vect_mask_var:
4052 prefix = "mask";
4053 break;
4054 case vect_pointer_var:
4055 prefix = "vectp";
4056 break;
4057 default:
4058 gcc_unreachable ();
4059 }
4060
4061 if (name)
4062 {
4063 char* tmp = concat (prefix, "_", name, NULL);
4064 new_vect_var = create_tmp_reg (type, tmp);
4065 free (tmp);
4066 }
4067 else
4068 new_vect_var = create_tmp_reg (type, prefix);
4069
4070 return new_vect_var;
4071 }
4072
4073 /* Like vect_get_new_vect_var but return an SSA name. */
4074
4075 tree
4076 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
4077 {
4078 const char *prefix;
4079 tree new_vect_var;
4080
4081 switch (var_kind)
4082 {
4083 case vect_simple_var:
4084 prefix = "vect";
4085 break;
4086 case vect_scalar_var:
4087 prefix = "stmp";
4088 break;
4089 case vect_pointer_var:
4090 prefix = "vectp";
4091 break;
4092 default:
4093 gcc_unreachable ();
4094 }
4095
4096 if (name)
4097 {
4098 char* tmp = concat (prefix, "_", name, NULL);
4099 new_vect_var = make_temp_ssa_name (type, NULL, tmp);
4100 free (tmp);
4101 }
4102 else
4103 new_vect_var = make_temp_ssa_name (type, NULL, prefix);
4104
4105 return new_vect_var;
4106 }
4107
4108 /* Duplicate ptr info and set alignment/misalignment on NAME from DR.  */
4109
4110 static void
4111 vect_duplicate_ssa_name_ptr_info (tree name, data_reference *dr,
4112 stmt_vec_info stmt_info)
4113 {
4114 duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr));
4115 unsigned int align = TYPE_ALIGN_UNIT (STMT_VINFO_VECTYPE (stmt_info));
4116 int misalign = DR_MISALIGNMENT (dr);
4117 if (misalign == -1)
4118 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
4119 else
4120 set_ptr_info_alignment (SSA_NAME_PTR_INFO (name), align, misalign);
4121 }
4122
4123 /* Function vect_create_addr_base_for_vector_ref.
4124
4125 Create an expression that computes the address of the first memory location
4126 that will be accessed for a data reference.
4127
4128 Input:
4129 STMT: The statement containing the data reference.
4130 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
4131    OFFSET: Optional.  If supplied, it is added to the initial address.
4132    LOOP: Specify relative to which loop-nest the address should be computed.
4133 For example, when the dataref is in an inner-loop nested in an
4134 outer-loop that is now being vectorized, LOOP can be either the
4135 outer-loop, or the inner-loop. The first memory location accessed
4136 by the following dataref ('in' points to short):
4137
4138 for (i=0; i<N; i++)
4139 for (j=0; j<M; j++)
4140 s += in[i+j]
4141
4142 is as follows:
4143 if LOOP=i_loop: &in (relative to i_loop)
4144 if LOOP=j_loop: &in+i*2B (relative to j_loop)
4145 BYTE_OFFSET: Optional, defaulted to NULL. If supplied, it is added to the
4146                initial address.  Unlike OFFSET, which is the number of elements to
4147 be added, BYTE_OFFSET is measured in bytes.
4148
4149 Output:
4150 1. Return an SSA_NAME whose value is the address of the memory location of
4151 the first vector of the data reference.
4152 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
4153 these statement(s) which define the returned SSA_NAME.
4154
4155 FORNOW: We are only handling array accesses with step 1. */
4156
4157 tree
4158 vect_create_addr_base_for_vector_ref (gimple *stmt,
4159 gimple_seq *new_stmt_list,
4160 tree offset,
4161 struct loop *loop,
4162 tree byte_offset)
4163 {
4164 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4165 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4166 tree data_ref_base;
4167 const char *base_name;
4168 tree addr_base;
4169 tree dest;
4170 gimple_seq seq = NULL;
4171 tree base_offset;
4172 tree init;
4173 tree vect_ptr_type;
4174 tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
4175 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4176
4177 if (loop_vinfo && loop && loop != (gimple_bb (stmt))->loop_father)
4178 {
4179 struct loop *outer_loop = LOOP_VINFO_LOOP (loop_vinfo);
4180
4181 gcc_assert (nested_in_vect_loop_p (outer_loop, stmt));
4182
4183 data_ref_base = unshare_expr (STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
4184 base_offset = unshare_expr (STMT_VINFO_DR_OFFSET (stmt_info));
4185 init = unshare_expr (STMT_VINFO_DR_INIT (stmt_info));
4186 }
4187 else
4188 {
4189 data_ref_base = unshare_expr (DR_BASE_ADDRESS (dr));
4190 base_offset = unshare_expr (DR_OFFSET (dr));
4191 init = unshare_expr (DR_INIT (dr));
4192 }
4193
4194 if (loop_vinfo)
4195 base_name = get_name (data_ref_base);
4196 else
4197 {
4198 base_offset = ssize_int (0);
4199 init = ssize_int (0);
4200 base_name = get_name (DR_REF (dr));
4201 }
4202
4203 /* Create base_offset */
4204 base_offset = size_binop (PLUS_EXPR,
4205 fold_convert (sizetype, base_offset),
4206 fold_convert (sizetype, init));
4207
4208 if (offset)
4209 {
4210 offset = fold_build2 (MULT_EXPR, sizetype,
4211 fold_convert (sizetype, offset), step);
4212 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4213 base_offset, offset);
4214 }
4215 if (byte_offset)
4216 {
4217 byte_offset = fold_convert (sizetype, byte_offset);
4218 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4219 base_offset, byte_offset);
4220 }
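  /* BASE_OFFSET now holds base_offset + init from above, plus OFFSET scaled
     by the element size and/or BYTE_OFFSET when those arguments were given.  */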
4221
4222 /* base + base_offset */
4223 if (loop_vinfo)
4224 addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
4225 else
4226 {
4227 addr_base = build1 (ADDR_EXPR,
4228 build_pointer_type (TREE_TYPE (DR_REF (dr))),
4229 unshare_expr (DR_REF (dr)));
4230 }
4231
4232 vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
4233 dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
4234 addr_base = force_gimple_operand (addr_base, &seq, true, dest);
4235 gimple_seq_add_seq (new_stmt_list, seq);
4236
4237 if (DR_PTR_INFO (dr)
4238 && TREE_CODE (addr_base) == SSA_NAME
4239 && !SSA_NAME_PTR_INFO (addr_base))
4240 {
4241 vect_duplicate_ssa_name_ptr_info (addr_base, dr, stmt_info);
4242 if (offset || byte_offset)
4243 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
4244 }
4245
4246 if (dump_enabled_p ())
4247 {
4248 dump_printf_loc (MSG_NOTE, vect_location, "created ");
4249 dump_generic_expr (MSG_NOTE, TDF_SLIM, addr_base);
4250 dump_printf (MSG_NOTE, "\n");
4251 }
4252
4253 return addr_base;
4254 }
4255
4256
4257 /* Function vect_create_data_ref_ptr.
4258
4259 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
4260 location accessed in the loop by STMT, along with the def-use update
4261 chain to appropriately advance the pointer through the loop iterations.
4262 Also set aliasing information for the pointer. This pointer is used by
4263 the callers to this function to create a memory reference expression for
4264 vector load/store access.
4265
4266 Input:
4267 1. STMT: a stmt that references memory. Expected to be of the form
4268 GIMPLE_ASSIGN <name, data-ref> or
4269 GIMPLE_ASSIGN <data-ref, name>.
4270 2. AGGR_TYPE: the type of the reference, which should be either a vector
4271 or an array.
4272 3. AT_LOOP: the loop where the vector memref is to be created.
4273 4. OFFSET (optional): an offset to be added to the initial address accessed
4274 by the data-ref in STMT.
4275 5. BSI: location where the new stmts are to be placed if there is no loop
4276 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
4277 pointing to the initial address.
4278 7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
4279 to the initial address accessed by the data-ref in STMT. This is
4280 similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
4281 in bytes.
4282
4283 Output:
4284 1. Declare a new ptr to vector_type, and have it point to the base of the
4285       data reference (initial address accessed by the data reference).
4286 For example, for vector of type V8HI, the following code is generated:
4287
4288 v8hi *ap;
4289 ap = (v8hi *)initial_address;
4290
4291 if OFFSET is not supplied:
4292 initial_address = &a[init];
4293 if OFFSET is supplied:
4294 initial_address = &a[init + OFFSET];
4295 if BYTE_OFFSET is supplied:
4296 initial_address = &a[init] + BYTE_OFFSET;
4297
4298 Return the initial_address in INITIAL_ADDRESS.
4299
4300 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
4301 update the pointer in each iteration of the loop.
4302
4303 Return the increment stmt that updates the pointer in PTR_INCR.
4304
4305 3. Set INV_P to true if the access pattern of the data reference in the
4306 vectorized loop is invariant. Set it to false otherwise.
4307
4308 4. Return the pointer. */
4309
4310 tree
4311 vect_create_data_ref_ptr (gimple *stmt, tree aggr_type, struct loop *at_loop,
4312 tree offset, tree *initial_address,
4313 gimple_stmt_iterator *gsi, gimple **ptr_incr,
4314 bool only_init, bool *inv_p, tree byte_offset)
4315 {
4316 const char *base_name;
4317 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4318 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4319 struct loop *loop = NULL;
4320 bool nested_in_vect_loop = false;
4321 struct loop *containing_loop = NULL;
4322 tree aggr_ptr_type;
4323 tree aggr_ptr;
4324 tree new_temp;
4325 gimple_seq new_stmt_list = NULL;
4326 edge pe = NULL;
4327 basic_block new_bb;
4328 tree aggr_ptr_init;
4329 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4330 tree aptr;
4331 gimple_stmt_iterator incr_gsi;
4332 bool insert_after;
4333 tree indx_before_incr, indx_after_incr;
4334 gimple *incr;
4335 tree step;
4336 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
4337
4338 gcc_assert (TREE_CODE (aggr_type) == ARRAY_TYPE
4339 || TREE_CODE (aggr_type) == VECTOR_TYPE);
4340
4341 if (loop_vinfo)
4342 {
4343 loop = LOOP_VINFO_LOOP (loop_vinfo);
4344 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4345 containing_loop = (gimple_bb (stmt))->loop_father;
4346 pe = loop_preheader_edge (loop);
4347 }
4348 else
4349 {
4350 gcc_assert (bb_vinfo);
4351 only_init = true;
4352 *ptr_incr = NULL;
4353 }
4354
4355 /* Check the step (evolution) of the load in LOOP, and record
4356 whether it's invariant. */
4357 if (nested_in_vect_loop)
4358 step = STMT_VINFO_DR_STEP (stmt_info);
4359 else
4360 step = DR_STEP (STMT_VINFO_DATA_REF (stmt_info));
4361
4362 if (integer_zerop (step))
4363 *inv_p = true;
4364 else
4365 *inv_p = false;
4366
4367 /* Create an expression for the first address accessed by this load
4368 in LOOP. */
4369 base_name = get_name (DR_BASE_ADDRESS (dr));
4370
4371 if (dump_enabled_p ())
4372 {
4373 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4374 dump_printf_loc (MSG_NOTE, vect_location,
4375 "create %s-pointer variable to type: ",
4376 get_tree_code_name (TREE_CODE (aggr_type)));
4377 dump_generic_expr (MSG_NOTE, TDF_SLIM, aggr_type);
4378 if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4379 dump_printf (MSG_NOTE, " vectorizing an array ref: ");
4380 else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4381 dump_printf (MSG_NOTE, " vectorizing a vector ref: ");
4382 else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4383 dump_printf (MSG_NOTE, " vectorizing a record based array ref: ");
4384 else
4385 dump_printf (MSG_NOTE, " vectorizing a pointer ref: ");
4386 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_BASE_OBJECT (dr));
4387 dump_printf (MSG_NOTE, "\n");
4388 }
4389
4390 /* (1) Create the new aggregate-pointer variable.
4391 Vector and array types inherit the alias set of their component
4392 type by default so we need to use a ref-all pointer if the data
4393 reference does not conflict with the created aggregated data
4394 reference because it is not addressable. */
4395 bool need_ref_all = false;
4396 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4397 get_alias_set (DR_REF (dr))))
4398 need_ref_all = true;
4399 /* Likewise for any of the data references in the stmt group. */
4400 else if (STMT_VINFO_GROUP_SIZE (stmt_info) > 1)
4401 {
4402 gimple *orig_stmt = STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info);
4403 do
4404 {
4405 stmt_vec_info sinfo = vinfo_for_stmt (orig_stmt);
4406 struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
4407 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4408 get_alias_set (DR_REF (sdr))))
4409 {
4410 need_ref_all = true;
4411 break;
4412 }
4413 orig_stmt = STMT_VINFO_GROUP_NEXT_ELEMENT (sinfo);
4414 }
4415 while (orig_stmt);
4416 }
4417 aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
4418 need_ref_all);
4419 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
4420
4421
4422 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4423 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4424 def-use update cycles for the pointer: one relative to the outer-loop
4425 (LOOP), which is what steps (3) and (4) below do. The other is relative
4426 to the inner-loop (which is the inner-most loop containing the dataref),
4427      and this is done by step (5) below.
4428
4429 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4430 inner-most loop, and so steps (3),(4) work the same, and step (5) is
4431 redundant. Steps (3),(4) create the following:
4432
4433 vp0 = &base_addr;
4434 LOOP: vp1 = phi(vp0,vp2)
4435 ...
4436 ...
4437 vp2 = vp1 + step
4438 goto LOOP
4439
4440 If there is an inner-loop nested in loop, then step (5) will also be
4441 applied, and an additional update in the inner-loop will be created:
4442
4443 vp0 = &base_addr;
4444 LOOP: vp1 = phi(vp0,vp2)
4445 ...
4446 inner: vp3 = phi(vp1,vp4)
4447 vp4 = vp3 + inner_step
4448 if () goto inner
4449 ...
4450 vp2 = vp1 + step
4451 if () goto LOOP */
4452
4453 /* (2) Calculate the initial address of the aggregate-pointer, and set
4454 the aggregate-pointer to point to it before the loop. */
4455
4456   /* Create: &(base[init_val+offset]) + byte_offset in the loop preheader.  */
4457
4458 new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
4459 offset, loop, byte_offset);
4460 if (new_stmt_list)
4461 {
4462 if (pe)
4463 {
4464 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
4465 gcc_assert (!new_bb);
4466 }
4467 else
4468 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
4469 }
4470
4471 *initial_address = new_temp;
4472 aggr_ptr_init = new_temp;
4473
4474 /* (3) Handle the updating of the aggregate-pointer inside the loop.
4475 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4476 inner-loop nested in LOOP (during outer-loop vectorization). */
4477
4478 /* No update in loop is required. */
4479 if (only_init && (!loop_vinfo || at_loop == loop))
4480 aptr = aggr_ptr_init;
4481 else
4482 {
4483 /* The step of the aggregate pointer is the type size. */
4484 tree iv_step = TYPE_SIZE_UNIT (aggr_type);
4485 /* One exception to the above is when the scalar step of the load in
4486 LOOP is zero. In this case the step here is also zero. */
4487 if (*inv_p)
4488 iv_step = size_zero_node;
4489 else if (tree_int_cst_sgn (step) == -1)
4490 iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
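      /* A negative scalar step means the access walks backwards through
         memory, so the pointer IV is decremented by the aggregate size each
         iteration instead of incremented.  */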
4491
4492 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4493
4494 create_iv (aggr_ptr_init,
4495 fold_convert (aggr_ptr_type, iv_step),
4496 aggr_ptr, loop, &incr_gsi, insert_after,
4497 &indx_before_incr, &indx_after_incr);
4498 incr = gsi_stmt (incr_gsi);
4499 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4500
4501 /* Copy the points-to information if it exists. */
4502 if (DR_PTR_INFO (dr))
4503 {
4504 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr, stmt_info);
4505 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr, stmt_info);
4506 }
4507 if (ptr_incr)
4508 *ptr_incr = incr;
4509
4510 aptr = indx_before_incr;
4511 }
4512
4513 if (!nested_in_vect_loop || only_init)
4514 return aptr;
4515
4516
4517 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4518      nested in LOOP, if it exists.  */
4519
4520 gcc_assert (nested_in_vect_loop);
4521 if (!only_init)
4522 {
4523 standard_iv_increment_position (containing_loop, &incr_gsi,
4524 &insert_after);
4525 create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
4526 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
4527 &indx_after_incr);
4528 incr = gsi_stmt (incr_gsi);
4529 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4530
4531 /* Copy the points-to information if it exists. */
4532 if (DR_PTR_INFO (dr))
4533 {
4534 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr, stmt_info);
4535 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr, stmt_info);
4536 }
4537 if (ptr_incr)
4538 *ptr_incr = incr;
4539
4540 return indx_before_incr;
4541 }
4542 else
4543 gcc_unreachable ();
4544 }
4545
4546
4547 /* Function bump_vector_ptr
4548
4549 Increment a pointer (to a vector type) by vector-size. If requested,
4550 i.e. if PTR-INCR is given, then also connect the new increment stmt
4551 to the existing def-use update-chain of the pointer, by modifying
4552 the PTR_INCR as illustrated below:
4553
4554 The pointer def-use update-chain before this function:
4555 DATAREF_PTR = phi (p_0, p_2)
4556 ....
4557 PTR_INCR: p_2 = DATAREF_PTR + step
4558
4559 The pointer def-use update-chain after this function:
4560 DATAREF_PTR = phi (p_0, p_2)
4561 ....
4562 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4563 ....
4564 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
4565
4566 Input:
4567 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4568 in the loop.
4569 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4570 the loop. The increment amount across iterations is expected
4571 to be vector_size.
4572 BSI - location where the new update stmt is to be placed.
4573 STMT - the original scalar memory-access stmt that is being vectorized.
4574 BUMP - optional. The offset by which to bump the pointer. If not given,
4575 the offset is assumed to be vector_size.
4576
4577 Output: Return NEW_DATAREF_PTR as illustrated above.
4578
4579 */
4580
4581 tree
4582 bump_vector_ptr (tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
4583 gimple *stmt, tree bump)
4584 {
4585 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4586 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4587 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4588 tree update = TYPE_SIZE_UNIT (vectype);
4589 gassign *incr_stmt;
4590 ssa_op_iter iter;
4591 use_operand_p use_p;
4592 tree new_dataref_ptr;
4593
4594 if (bump)
4595 update = bump;
4596
4597 if (TREE_CODE (dataref_ptr) == SSA_NAME)
4598 new_dataref_ptr = copy_ssa_name (dataref_ptr);
4599 else
4600 new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
4601 incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
4602 dataref_ptr, update);
4603 vect_finish_stmt_generation (stmt, incr_stmt, gsi);
4604
4605 /* Copy the points-to information if it exists. */
4606 if (DR_PTR_INFO (dr))
4607 {
4608 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
4609 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
4610 }
4611
4612 if (!ptr_incr)
4613 return new_dataref_ptr;
4614
4615 /* Update the vector-pointer's cross-iteration increment. */
4616 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
4617 {
4618 tree use = USE_FROM_PTR (use_p);
4619
4620 if (use == dataref_ptr)
4621 SET_USE (use_p, new_dataref_ptr);
4622 else
4623 gcc_assert (tree_int_cst_compare (use, update) == 0);
4624 }
4625
4626 return new_dataref_ptr;
4627 }
4628
4629
4630 /* Function vect_create_destination_var.
4631
4632 Create a new temporary of type VECTYPE. */
4633
4634 tree
4635 vect_create_destination_var (tree scalar_dest, tree vectype)
4636 {
4637 tree vec_dest;
4638 const char *name;
4639 char *new_name;
4640 tree type;
4641 enum vect_var_kind kind;
4642
4643 kind = vectype
4644 ? VECTOR_BOOLEAN_TYPE_P (vectype)
4645 ? vect_mask_var
4646 : vect_simple_var
4647 : vect_scalar_var;
4648 type = vectype ? vectype : TREE_TYPE (scalar_dest);
4649
4650 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
4651
4652 name = get_name (scalar_dest);
4653 if (name)
4654 new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
4655 else
4656 new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
4657 vec_dest = vect_get_new_vect_var (type, kind, new_name);
4658 free (new_name);
4659
4660 return vec_dest;
4661 }
4662
4663 /* Function vect_grouped_store_supported.
4664
4665 Returns TRUE if interleave high and interleave low permutations
4666 are supported, and FALSE otherwise. */
4667
4668 bool
4669 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
4670 {
4671 machine_mode mode = TYPE_MODE (vectype);
4672
4673 /* vect_permute_store_chain requires the group size to be equal to 3 or
4674 be a power of two. */
4675 if (count != 3 && exact_log2 (count) == -1)
4676 {
4677 if (dump_enabled_p ())
4678 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4679 "the size of the group of accesses"
4680 " is not a power of 2 or not eqaul to 3\n");
4681 return false;
4682 }
4683
4684 /* Check that the permutation is supported. */
4685 if (VECTOR_MODE_P (mode))
4686 {
4687 unsigned int i, nelt = GET_MODE_NUNITS (mode);
4688 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4689
4690 if (count == 3)
4691 {
4692 unsigned int j0 = 0, j1 = 0, j2 = 0;
4693 unsigned int i, j;
4694
4695 for (j = 0; j < 3; j++)
4696 {
4697 int nelt0 = ((3 - j) * nelt) % 3;
4698 int nelt1 = ((3 - j) * nelt + 1) % 3;
4699 int nelt2 = ((3 - j) * nelt + 2) % 3;
4700 for (i = 0; i < nelt; i++)
4701 {
4702 if (3 * i + nelt0 < nelt)
4703 sel[3 * i + nelt0] = j0++;
4704 if (3 * i + nelt1 < nelt)
4705 sel[3 * i + nelt1] = nelt + j1++;
4706 if (3 * i + nelt2 < nelt)
4707 sel[3 * i + nelt2] = 0;
4708 }
4709 if (!can_vec_perm_p (mode, false, sel))
4710 {
4711 if (dump_enabled_p ())
4712 dump_printf (MSG_MISSED_OPTIMIZATION,
4713 "permutaion op not supported by target.\n");
4714 return false;
4715 }
4716
4717 for (i = 0; i < nelt; i++)
4718 {
4719 if (3 * i + nelt0 < nelt)
4720 sel[3 * i + nelt0] = 3 * i + nelt0;
4721 if (3 * i + nelt1 < nelt)
4722 sel[3 * i + nelt1] = 3 * i + nelt1;
4723 if (3 * i + nelt2 < nelt)
4724 sel[3 * i + nelt2] = nelt + j2++;
4725 }
4726 if (!can_vec_perm_p (mode, false, sel))
4727 {
4728 if (dump_enabled_p ())
4729 dump_printf (MSG_MISSED_OPTIMIZATION,
4730 "permutaion op not supported by target.\n");
4731 return false;
4732 }
4733 }
4734 return true;
4735 }
4736 else
4737 {
4738 /* If length is not equal to 3 then only power of 2 is supported. */
4739 gcc_assert (exact_log2 (count) != -1);
4740
4741 for (i = 0; i < nelt / 2; i++)
4742 {
4743 sel[i * 2] = i;
4744 sel[i * 2 + 1] = i + nelt;
4745 }
4746 if (can_vec_perm_p (mode, false, sel))
4747 {
4748 for (i = 0; i < nelt; i++)
4749 sel[i] += nelt / 2;
4750 if (can_vec_perm_p (mode, false, sel))
4751 return true;
4752 }
4753 }
4754 }
4755
4756 if (dump_enabled_p ())
4757 dump_printf (MSG_MISSED_OPTIMIZATION,
4758 "permutaion op not supported by target.\n");
4759 return false;
4760 }
4761
4762
4763 /* Return TRUE if vec_store_lanes is available for COUNT vectors of
4764 type VECTYPE. */
4765
4766 bool
4767 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
4768 {
4769 return vect_lanes_optab_supported_p ("vec_store_lanes",
4770 vec_store_lanes_optab,
4771 vectype, count);
4772 }
4773
4774
4775 /* Function vect_permute_store_chain.
4776
4777 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
4778 a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
4779 the data correctly for the stores. Return the final references for stores
4780 in RESULT_CHAIN.
4781
4782 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
4783 The input is 4 vectors each containing 8 elements. We assign a number to
4784 each element, the input sequence is:
4785
4786 1st vec: 0 1 2 3 4 5 6 7
4787 2nd vec: 8 9 10 11 12 13 14 15
4788 3rd vec: 16 17 18 19 20 21 22 23
4789 4th vec: 24 25 26 27 28 29 30 31
4790
4791 The output sequence should be:
4792
4793 1st vec: 0 8 16 24 1 9 17 25
4794 2nd vec: 2 10 18 26 3 11 19 27
4795    3rd vec:   4 12 20 28  5 13 21 29
4796 4th vec: 6 14 22 30 7 15 23 31
4797
4798 i.e., we interleave the contents of the four vectors in their order.
4799
4800 We use interleave_high/low instructions to create such output. The input of
4801 each interleave_high/low operation is two vectors:
4802 1st vec 2nd vec
4803 0 1 2 3 4 5 6 7
4804 the even elements of the result vector are obtained left-to-right from the
4805 high/low elements of the first vector. The odd elements of the result are
4806 obtained left-to-right from the high/low elements of the second vector.
4807 The output of interleave_high will be: 0 4 1 5
4808 and of interleave_low: 2 6 3 7
4809
4810
4811 The permutation is done in log LENGTH stages. In each stage interleave_high
4812 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
4813 where the first argument is taken from the first half of DR_CHAIN and the
4814    second argument from its second half.
4815 In our example,
4816
4817 I1: interleave_high (1st vec, 3rd vec)
4818 I2: interleave_low (1st vec, 3rd vec)
4819 I3: interleave_high (2nd vec, 4th vec)
4820 I4: interleave_low (2nd vec, 4th vec)
4821
4822 The output for the first stage is:
4823
4824 I1: 0 16 1 17 2 18 3 19
4825 I2: 4 20 5 21 6 22 7 23
4826 I3: 8 24 9 25 10 26 11 27
4827 I4: 12 28 13 29 14 30 15 31
4828
4829 The output of the second stage, i.e. the final result is:
4830
4831 I1: 0 8 16 24 1 9 17 25
4832 I2: 2 10 18 26 3 11 19 27
4833    I3:  4 12 20 28  5 13 21 29
4834 I4: 6 14 22 30 7 15 23 31. */
4835
4836 void
4837 vect_permute_store_chain (vec<tree> dr_chain,
4838 unsigned int length,
4839 gimple *stmt,
4840 gimple_stmt_iterator *gsi,
4841 vec<tree> *result_chain)
4842 {
4843 tree vect1, vect2, high, low;
4844 gimple *perm_stmt;
4845 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
4846 tree perm_mask_low, perm_mask_high;
4847 tree data_ref;
4848 tree perm3_mask_low, perm3_mask_high;
4849 unsigned int i, n, log_length = exact_log2 (length);
4850 unsigned int j, nelt = TYPE_VECTOR_SUBPARTS (vectype);
4851 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4852
4853 result_chain->quick_grow (length);
4854 memcpy (result_chain->address (), dr_chain.address (),
4855 length * sizeof (tree));
4856
4857 if (length == 3)
4858 {
4859 unsigned int j0 = 0, j1 = 0, j2 = 0;
4860
4861 for (j = 0; j < 3; j++)
4862 {
4863 int nelt0 = ((3 - j) * nelt) % 3;
4864 int nelt1 = ((3 - j) * nelt + 1) % 3;
4865 int nelt2 = ((3 - j) * nelt + 2) % 3;
4866
4867 for (i = 0; i < nelt; i++)
4868 {
4869 if (3 * i + nelt0 < nelt)
4870 sel[3 * i + nelt0] = j0++;
4871 if (3 * i + nelt1 < nelt)
4872 sel[3 * i + nelt1] = nelt + j1++;
4873 if (3 * i + nelt2 < nelt)
4874 sel[3 * i + nelt2] = 0;
4875 }
4876 perm3_mask_low = vect_gen_perm_mask_checked (vectype, sel);
4877
4878 for (i = 0; i < nelt; i++)
4879 {
4880 if (3 * i + nelt0 < nelt)
4881 sel[3 * i + nelt0] = 3 * i + nelt0;
4882 if (3 * i + nelt1 < nelt)
4883 sel[3 * i + nelt1] = 3 * i + nelt1;
4884 if (3 * i + nelt2 < nelt)
4885 sel[3 * i + nelt2] = nelt + j2++;
4886 }
4887 perm3_mask_high = vect_gen_perm_mask_checked (vectype, sel);
4888
4889 vect1 = dr_chain[0];
4890 vect2 = dr_chain[1];
4891
4892 /* Create interleaving stmt:
4893 low = VEC_PERM_EXPR <vect1, vect2,
4894 {j, nelt, *, j + 1, nelt + j + 1, *,
4895 j + 2, nelt + j + 2, *, ...}> */
4896 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
4897 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
4898 vect2, perm3_mask_low);
4899 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4900
4901 vect1 = data_ref;
4902 vect2 = dr_chain[2];
4903 /* Create interleaving stmt:
4904 low = VEC_PERM_EXPR <vect1, vect2,
4905 {0, 1, nelt + j, 3, 4, nelt + j + 1,
4906 6, 7, nelt + j + 2, ...}> */
4907 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
4908 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
4909 vect2, perm3_mask_high);
4910 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4911 (*result_chain)[j] = data_ref;
4912 }
4913 }
4914 else
4915 {
4916 /* If length is not equal to 3 then only power of 2 is supported. */
4917 gcc_assert (exact_log2 (length) != -1);
4918
4919 for (i = 0, n = nelt / 2; i < n; i++)
4920 {
4921 sel[i * 2] = i;
4922 sel[i * 2 + 1] = i + nelt;
4923 }
4924 perm_mask_high = vect_gen_perm_mask_checked (vectype, sel);
4925
4926 for (i = 0; i < nelt; i++)
4927 sel[i] += nelt / 2;
4928 perm_mask_low = vect_gen_perm_mask_checked (vectype, sel);
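      /* For the V8HI example in the function comment (nelt == 8) these are
         {0, 8, 1, 9, 2, 10, 3, 11} for PERM_MASK_HIGH and
         {4, 12, 5, 13, 6, 14, 7, 15} for PERM_MASK_LOW.  */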
4929
4930 for (i = 0, n = log_length; i < n; i++)
4931 {
4932 for (j = 0; j < length/2; j++)
4933 {
4934 vect1 = dr_chain[j];
4935 vect2 = dr_chain[j+length/2];
4936
4937 /* Create interleaving stmt:
4938 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
4939 ...}> */
4940 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
4941 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
4942 vect2, perm_mask_high);
4943 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4944 (*result_chain)[2*j] = high;
4945
4946 /* Create interleaving stmt:
4947 low = VEC_PERM_EXPR <vect1, vect2,
4948 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
4949 ...}> */
4950 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
4951 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
4952 vect2, perm_mask_low);
4953 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4954 (*result_chain)[2*j+1] = low;
4955 }
4956 memcpy (dr_chain.address (), result_chain->address (),
4957 length * sizeof (tree));
4958 }
4959 }
4960 }
4961
4962 /* Function vect_setup_realignment
4963
4964 This function is called when vectorizing an unaligned load using
4965 the dr_explicit_realign[_optimized] scheme.
4966 This function generates the following code at the loop prolog:
4967
4968 p = initial_addr;
4969 x msq_init = *(floor(p)); # prolog load
4970 realignment_token = call target_builtin;
4971 loop:
4972 x msq = phi (msq_init, ---)
4973
4974 The stmts marked with x are generated only for the case of
4975 dr_explicit_realign_optimized.
4976
4977 The code above sets up a new (vector) pointer, pointing to the first
4978 location accessed by STMT, and a "floor-aligned" load using that pointer.
4979 It also generates code to compute the "realignment-token" (if the relevant
4980 target hook was defined), and creates a phi-node at the loop-header bb
4981 whose arguments are the result of the prolog-load (created by this
4982 function) and the result of a load that takes place in the loop (to be
4983 created by the caller to this function).
4984
4985 For the case of dr_explicit_realign_optimized:
4986 The caller to this function uses the phi-result (msq) to create the
4987 realignment code inside the loop, and sets up the missing phi argument,
4988 as follows:
4989 loop:
4990 msq = phi (msq_init, lsq)
4991 lsq = *(floor(p')); # load in loop
4992 result = realign_load (msq, lsq, realignment_token);
4993
4994 For the case of dr_explicit_realign:
4995 loop:
4996 msq = *(floor(p)); # load in loop
4997 p' = p + (VS-1);
4998 lsq = *(floor(p')); # load in loop
4999 result = realign_load (msq, lsq, realignment_token);
5000
5001 Input:
5002 STMT - (scalar) load stmt to be vectorized. This load accesses
5003 a memory location that may be unaligned.
5004 BSI - place where new code is to be inserted.
5005 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
5006 is used.
5007
5008 Output:
5009 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
5010 target hook, if defined.
5011 Return value - the result of the loop-header phi node. */
5012
5013 tree
5014 vect_setup_realignment (gimple *stmt, gimple_stmt_iterator *gsi,
5015 tree *realignment_token,
5016 enum dr_alignment_support alignment_support_scheme,
5017 tree init_addr,
5018 struct loop **at_loop)
5019 {
5020 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5021 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5022 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5023 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
5024 struct loop *loop = NULL;
5025 edge pe = NULL;
5026 tree scalar_dest = gimple_assign_lhs (stmt);
5027 tree vec_dest;
5028 gimple *inc;
5029 tree ptr;
5030 tree data_ref;
5031 basic_block new_bb;
5032 tree msq_init = NULL_TREE;
5033 tree new_temp;
5034 gphi *phi_stmt;
5035 tree msq = NULL_TREE;
5036 gimple_seq stmts = NULL;
5037 bool inv_p;
5038 bool compute_in_loop = false;
5039 bool nested_in_vect_loop = false;
5040 struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
5041 struct loop *loop_for_initial_load = NULL;
5042
5043 if (loop_vinfo)
5044 {
5045 loop = LOOP_VINFO_LOOP (loop_vinfo);
5046 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
5047 }
5048
5049 gcc_assert (alignment_support_scheme == dr_explicit_realign
5050 || alignment_support_scheme == dr_explicit_realign_optimized);
5051
5052 /* We need to generate three things:
5053 1. the misalignment computation
5054 2. the extra vector load (for the optimized realignment scheme).
5055 3. the phi node for the two vectors from which the realignment is
5056 done (for the optimized realignment scheme). */
5057
5058 /* 1. Determine where to generate the misalignment computation.
5059
5060 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
5061 calculation will be generated by this function, outside the loop (in the
5062 preheader). Otherwise, INIT_ADDR had already been computed for us by the
5063 caller, inside the loop.
5064
5065 Background: If the misalignment remains fixed throughout the iterations of
5066 the loop, then both realignment schemes are applicable, and also the
5067 misalignment computation can be done outside LOOP. This is because we are
5068 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
5069 are a multiple of VS (the Vector Size), and therefore the misalignment in
5070 different vectorized LOOP iterations is always the same.
5071 The problem arises only if the memory access is in an inner-loop nested
5072 inside LOOP, which is now being vectorized using outer-loop vectorization.
5073 This is the only case when the misalignment of the memory access may not
5074 remain fixed throughout the iterations of the inner-loop (as explained in
5075 detail in vect_supportable_dr_alignment). In this case, not only is the
5076 optimized realignment scheme not applicable, but also the misalignment
5077 computation (and generation of the realignment token that is passed to
5078 REALIGN_LOAD) have to be done inside the loop.
5079
5080 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
5081 or not, which in turn determines if the misalignment is computed inside
5082 the inner-loop, or outside LOOP. */
5083
5084 if (init_addr != NULL_TREE || !loop_vinfo)
5085 {
5086 compute_in_loop = true;
5087 gcc_assert (alignment_support_scheme == dr_explicit_realign);
5088 }
5089
5090
5091 /* 2. Determine where to generate the extra vector load.
5092
5093 For the optimized realignment scheme, instead of generating two vector
5094 loads in each iteration, we generate a single extra vector load in the
5095 preheader of the loop, and in each iteration reuse the result of the
5096 vector load from the previous iteration. In case the memory access is in
5097 an inner-loop nested inside LOOP, which is now being vectorized using
5098 outer-loop vectorization, we need to determine whether this initial vector
5099 load should be generated at the preheader of the inner-loop, or can be
5100 generated at the preheader of LOOP. If the memory access has no evolution
5101 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
5102 to be generated inside LOOP (in the preheader of the inner-loop). */
5103
5104 if (nested_in_vect_loop)
5105 {
5106 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
5107 bool invariant_in_outerloop =
5108 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
5109 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
5110 }
5111 else
5112 loop_for_initial_load = loop;
5113 if (at_loop)
5114 *at_loop = loop_for_initial_load;
5115
5116 if (loop_for_initial_load)
5117 pe = loop_preheader_edge (loop_for_initial_load);
5118
5119 /* 3. For the case of the optimized realignment, create the first vector
5120 load at the loop preheader. */
5121
5122 if (alignment_support_scheme == dr_explicit_realign_optimized)
5123 {
5124 /* Create msq_init = *(floor(p1)) in the loop preheader */
5125 gassign *new_stmt;
5126
5127 gcc_assert (!compute_in_loop);
5128 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5129 ptr = vect_create_data_ref_ptr (stmt, vectype, loop_for_initial_load,
5130 NULL_TREE, &init_addr, NULL, &inc,
5131 true, &inv_p);
5132 if (TREE_CODE (ptr) == SSA_NAME)
5133 new_temp = copy_ssa_name (ptr);
5134 else
5135 new_temp = make_ssa_name (TREE_TYPE (ptr));
5136 new_stmt = gimple_build_assign
5137 (new_temp, BIT_AND_EXPR, ptr,
5138 build_int_cst (TREE_TYPE (ptr),
5139 -(HOST_WIDE_INT)TYPE_ALIGN_UNIT (vectype)));
5140 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5141 gcc_assert (!new_bb);
5142 data_ref
5143 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
5144 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
5145 new_stmt = gimple_build_assign (vec_dest, data_ref);
5146 new_temp = make_ssa_name (vec_dest, new_stmt);
5147 gimple_assign_set_lhs (new_stmt, new_temp);
5148 if (pe)
5149 {
5150 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5151 gcc_assert (!new_bb);
5152 }
5153 else
5154 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5155
5156 msq_init = gimple_assign_lhs (new_stmt);
5157 }
5158
5159 /* 4. Create realignment token using a target builtin, if available.
5160 It is done either inside the containing loop, or before LOOP (as
5161 determined above). */
5162
5163 if (targetm.vectorize.builtin_mask_for_load)
5164 {
5165 gcall *new_stmt;
5166 tree builtin_decl;
5167
5168 /* Compute INIT_ADDR - the initial address accessed by this memref. */
5169 if (!init_addr)
5170 {
5171 /* Generate the INIT_ADDR computation outside LOOP. */
5172 init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
5173 NULL_TREE, loop);
5174 if (loop)
5175 {
5176 pe = loop_preheader_edge (loop);
5177 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
5178 gcc_assert (!new_bb);
5179 }
5180 else
5181 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
5182 }
5183
5184 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
5185 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
5186 vec_dest =
5187 vect_create_destination_var (scalar_dest,
5188 gimple_call_return_type (new_stmt));
5189 new_temp = make_ssa_name (vec_dest, new_stmt);
5190 gimple_call_set_lhs (new_stmt, new_temp);
5191
5192 if (compute_in_loop)
5193 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5194 else
5195 {
5196 /* Generate the misalignment computation outside LOOP. */
5197 pe = loop_preheader_edge (loop);
5198 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5199 gcc_assert (!new_bb);
5200 }
5201
5202 *realignment_token = gimple_call_lhs (new_stmt);
5203
5204 /* The result of the CALL_EXPR to this builtin is determined from
5205 the value of the parameter and no global variables are touched
5206 which makes the builtin a "const" function. Requiring the
5207 builtin to have the "const" attribute makes it unnecessary
5208 to call mark_call_clobbered. */
5209 gcc_assert (TREE_READONLY (builtin_decl));
5210 }
5211
5212 if (alignment_support_scheme == dr_explicit_realign)
5213 return msq;
5214
5215 gcc_assert (!compute_in_loop);
5216 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
5217
5218
5219 /* 5. Create msq = phi <msq_init, lsq> in loop */
5220
5221 pe = loop_preheader_edge (containing_loop);
5222 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5223 msq = make_ssa_name (vec_dest);
5224 phi_stmt = create_phi_node (msq, containing_loop->header);
5225 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
5226
5227 return msq;
5228 }
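
/* A minimal standalone model of the address flooring used above to build
   msq_init = *(floor(p1)): AND-ing the pointer value with the negated
   vector alignment (a power of two) clears the low bits and rounds the
   address down to the previous aligned boundary.  The model and the
   16-byte alignment in the example below are illustrative assumptions,
   not part of the vectorizer API.  */

#include <stdint.h>

static inline uintptr_t
model_floor_address (uintptr_t addr, uintptr_t vec_align_unit)
{
  /* Mirrors BIT_AND_EXPR (ptr, -TYPE_ALIGN_UNIT (vectype)).  */
  return addr & -vec_align_unit;
}

/* For example, model_floor_address (0x1009, 16) yields 0x1000, the aligned
   address from which the preheader vector load (msq_init) is issued.  */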
5229
5230
5231 /* Function vect_grouped_load_supported.
5232
5233 COUNT is the size of the load group (the number of statements plus the
5234 number of gaps). SINGLE_ELEMENT_P is true if there is actually
5235 only one statement, with a gap of COUNT - 1.
5236
5237 Returns true if a suitable permute exists. */
5238
5239 bool
5240 vect_grouped_load_supported (tree vectype, bool single_element_p,
5241 unsigned HOST_WIDE_INT count)
5242 {
5243 machine_mode mode = TYPE_MODE (vectype);
5244
5245 /* If this is single-element interleaving with an element distance
5246 that leaves unused vector loads around, punt - we would at least create
5247 very sub-optimal code in that case (and blow up memory,
5248 see PR65518). */
5249 if (single_element_p && count > TYPE_VECTOR_SUBPARTS (vectype))
5250 {
5251 if (dump_enabled_p ())
5252 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5253 "single-element interleaving not supported "
5254 "for not adjacent vector loads\n");
5255 return false;
5256 }
5257
5258 /* vect_permute_load_chain requires the group size to be equal to 3 or
5259 be a power of two. */
5260 if (count != 3 && exact_log2 (count) == -1)
5261 {
5262 if (dump_enabled_p ())
5263 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5264 "the size of the group of accesses"
5265 " is not a power of 2 or not equal to 3\n");
5266 return false;
5267 }
5268
5269 /* Check that the permutation is supported. */
5270 if (VECTOR_MODE_P (mode))
5271 {
5272 unsigned int i, j, nelt = GET_MODE_NUNITS (mode);
5273 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5274
5275 if (count == 3)
5276 {
5277 unsigned int k;
5278 for (k = 0; k < 3; k++)
5279 {
5280 for (i = 0; i < nelt; i++)
5281 if (3 * i + k < 2 * nelt)
5282 sel[i] = 3 * i + k;
5283 else
5284 sel[i] = 0;
5285 if (!can_vec_perm_p (mode, false, sel))
5286 {
5287 if (dump_enabled_p ())
5288 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5289 "shuffle of 3 loads is not supported by"
5290 " target\n");
5291 return false;
5292 }
5293 for (i = 0, j = 0; i < nelt; i++)
5294 if (3 * i + k < 2 * nelt)
5295 sel[i] = i;
5296 else
5297 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5298 if (!can_vec_perm_p (mode, false, sel))
5299 {
5300 if (dump_enabled_p ())
5301 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5302 "shuffle of 3 loads is not supported by"
5303 " target\n");
5304 return false;
5305 }
5306 }
5307 return true;
5308 }
5309 else
5310 {
5311 /* If length is not equal to 3 then only a power of 2 is supported. */
5312 gcc_assert (exact_log2 (count) != -1);
5313 for (i = 0; i < nelt; i++)
5314 sel[i] = i * 2;
5315 if (can_vec_perm_p (mode, false, sel))
5316 {
5317 for (i = 0; i < nelt; i++)
5318 sel[i] = i * 2 + 1;
5319 if (can_vec_perm_p (mode, false, sel))
5320 return true;
5321 }
5322 }
5323 }
5324
5325 if (dump_enabled_p ())
5326 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5327 "extract even/odd not supported by target\n");
5328 return false;
5329 }
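
/* A standalone sketch of the selector vectors whose support is queried
   above: SEL3 receives the two selectors per K used for a group of 3,
   SELEO the extract-even/extract-odd selectors used for power-of-two
   groups.  The fixed bound of 64 elements and the array-of-arrays
   interface are assumptions of this model only.  */

static void
model_grouped_load_selectors (unsigned nelt,
			      unsigned char sel3[3][2][64],
			      unsigned char seleo[2][64])
{
  for (unsigned k = 0; k < 3; k++)
    {
      unsigned j = 0;
      /* First shuffle: pick every third element starting at K from the
	 concatenation of the first two vectors.  */
      for (unsigned i = 0; i < nelt; i++)
	sel3[k][0][i] = 3 * i + k < 2 * nelt ? 3 * i + k : 0;
      /* Second shuffle: keep what was already picked and fill the tail
	 from the third vector.  */
      for (unsigned i = 0; i < nelt; i++)
	sel3[k][1][i] = (3 * i + k < 2 * nelt
			 ? i : nelt + ((nelt + k) % 3) + 3 * j++);
    }

  for (unsigned i = 0; i < nelt; i++)
    {
      seleo[0][i] = i * 2;	/* extract even: 0 2 4 6 ...  */
      seleo[1][i] = i * 2 + 1;	/* extract odd:  1 3 5 7 ...  */
    }
}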
5330
5331 /* Return TRUE if vec_load_lanes is available for COUNT vectors of
5332 type VECTYPE. */
5333
5334 bool
5335 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
5336 {
5337 return vect_lanes_optab_supported_p ("vec_load_lanes",
5338 vec_load_lanes_optab,
5339 vectype, count);
5340 }
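
/* A scalar sketch of what a vec_load_lanes operation provides, assuming
   COUNT == 2 and 4-element vectors: a single operation that loads COUNT
   interleaved structures and de-interleaves them into COUNT vectors,
   which is what the optab query above tests for.  The function and its
   fixed sizes are illustrative only.  */

static void
model_load_2_lanes (const int *src, int lane0[4], int lane1[4])
{
  for (unsigned i = 0; i < 4; i++)
    {
      lane0[i] = src[2 * i];		/* field 0 of each structure  */
      lane1[i] = src[2 * i + 1];	/* field 1 of each structure  */
    }
}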
5341
5342 /* Function vect_permute_load_chain.
5343
5344 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5345 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5346 the input data correctly. Return the final references for loads in
5347 RESULT_CHAIN.
5348
5349 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5350 The input is 4 vectors each containing 8 elements. We assign a number to each
5351 element, the input sequence is:
5352
5353 1st vec: 0 1 2 3 4 5 6 7
5354 2nd vec: 8 9 10 11 12 13 14 15
5355 3rd vec: 16 17 18 19 20 21 22 23
5356 4th vec: 24 25 26 27 28 29 30 31
5357
5358 The output sequence should be:
5359
5360 1st vec: 0 4 8 12 16 20 24 28
5361 2nd vec: 1 5 9 13 17 21 25 29
5362 3rd vec: 2 6 10 14 18 22 26 30
5363 4th vec: 3 7 11 15 19 23 27 31
5364
5365 i.e., the first output vector should contain the first elements of each
5366 interleaving group, etc.
5367
5368 We use extract_even/odd instructions to create such output. The input of
5369 each extract_even/odd operation is two vectors
5370 1st vec 2nd vec
5371 0 1 2 3 4 5 6 7
5372
5373 and the output is the vector of extracted even/odd elements. The output of
5374 extract_even will be: 0 2 4 6
5375 and of extract_odd: 1 3 5 7
5376
5377
5378 The permutation is done in log LENGTH stages. In each stage extract_even
5379 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5380 their order. In our example,
5381
5382 E1: extract_even (1st vec, 2nd vec)
5383 E2: extract_odd (1st vec, 2nd vec)
5384 E3: extract_even (3rd vec, 4th vec)
5385 E4: extract_odd (3rd vec, 4th vec)
5386
5387 The output for the first stage will be:
5388
5389 E1: 0 2 4 6 8 10 12 14
5390 E2: 1 3 5 7 9 11 13 15
5391 E3: 16 18 20 22 24 26 28 30
5392 E4: 17 19 21 23 25 27 29 31
5393
5394 In order to proceed and create the correct sequence for the next stage (or
5395 for the correct output, if the second stage is the last one, as in our
5396 example), we first put the output of the extract_even operation and then the
5397 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5398 The input for the second stage is:
5399
5400 1st vec (E1): 0 2 4 6 8 10 12 14
5401 2nd vec (E3): 16 18 20 22 24 26 28 30
5402 3rd vec (E2): 1 3 5 7 9 11 13 15
5403 4th vec (E4): 17 19 21 23 25 27 29 31
5404
5405 The output of the second stage:
5406
5407 E1: 0 4 8 12 16 20 24 28
5408 E2: 2 6 10 14 18 22 26 30
5409 E3: 1 5 9 13 17 21 25 29
5410 E4: 3 7 11 15 19 23 27 31
5411
5412 And RESULT_CHAIN after reordering:
5413
5414 1st vec (E1): 0 4 8 12 16 20 24 28
5415 2nd vec (E3): 1 5 9 13 17 21 25 29
5416 3rd vec (E2): 2 6 10 14 18 22 26 30
5417 4th vec (E4): 3 7 11 15 19 23 27 31. */
5418
5419 static void
5420 vect_permute_load_chain (vec<tree> dr_chain,
5421 unsigned int length,
5422 gimple *stmt,
5423 gimple_stmt_iterator *gsi,
5424 vec<tree> *result_chain)
5425 {
5426 tree data_ref, first_vect, second_vect;
5427 tree perm_mask_even, perm_mask_odd;
5428 tree perm3_mask_low, perm3_mask_high;
5429 gimple *perm_stmt;
5430 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5431 unsigned int i, j, log_length = exact_log2 (length);
5432 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
5433 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5434
5435 result_chain->quick_grow (length);
5436 memcpy (result_chain->address (), dr_chain.address (),
5437 length * sizeof (tree));
5438
5439 if (length == 3)
5440 {
5441 unsigned int k;
5442
5443 for (k = 0; k < 3; k++)
5444 {
5445 for (i = 0; i < nelt; i++)
5446 if (3 * i + k < 2 * nelt)
5447 sel[i] = 3 * i + k;
5448 else
5449 sel[i] = 0;
5450 perm3_mask_low = vect_gen_perm_mask_checked (vectype, sel);
5451
5452 for (i = 0, j = 0; i < nelt; i++)
5453 if (3 * i + k < 2 * nelt)
5454 sel[i] = i;
5455 else
5456 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5457
5458 perm3_mask_high = vect_gen_perm_mask_checked (vectype, sel);
5459
5460 first_vect = dr_chain[0];
5461 second_vect = dr_chain[1];
5462
5463 /* Create interleaving stmt (low part of):
5464 low = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
5465 ...}> */
5466 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5467 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5468 second_vect, perm3_mask_low);
5469 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5470
5471 /* Create interleaving stmt (high part of):
5472 high = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
5473 ...}> */
5474 first_vect = data_ref;
5475 second_vect = dr_chain[2];
5476 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5477 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5478 second_vect, perm3_mask_high);
5479 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5480 (*result_chain)[k] = data_ref;
5481 }
5482 }
5483 else
5484 {
5485 /* If length is not equal to 3 then only a power of 2 is supported. */
5486 gcc_assert (exact_log2 (length) != -1);
5487
5488 for (i = 0; i < nelt; ++i)
5489 sel[i] = i * 2;
5490 perm_mask_even = vect_gen_perm_mask_checked (vectype, sel);
5491
5492 for (i = 0; i < nelt; ++i)
5493 sel[i] = i * 2 + 1;
5494 perm_mask_odd = vect_gen_perm_mask_checked (vectype, sel);
5495
5496 for (i = 0; i < log_length; i++)
5497 {
5498 for (j = 0; j < length; j += 2)
5499 {
5500 first_vect = dr_chain[j];
5501 second_vect = dr_chain[j+1];
5502
5503 /* data_ref = permute_even (first_data_ref, second_data_ref); */
5504 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
5505 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5506 first_vect, second_vect,
5507 perm_mask_even);
5508 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5509 (*result_chain)[j/2] = data_ref;
5510
5511 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
5512 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
5513 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5514 first_vect, second_vect,
5515 perm_mask_odd);
5516 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5517 (*result_chain)[j/2+length/2] = data_ref;
5518 }
5519 memcpy (dr_chain.address (), result_chain->address (),
5520 length * sizeof (tree));
5521 }
5522 }
5523 }
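
/* A standalone scalar model of the extract-even/odd stages above, sized
   for the 4-vector, 8-element example in the comment; integer element
   values stand in for the SSA names in DR_CHAIN.  Running it on
   chain[v][i] = v * 8 + i produces the documented RESULT_CHAIN order
   0 4 ... 28 / 1 5 ... 29 / 2 6 ... 30 / 3 7 ... 31.  The fixed sizes
   are assumptions of the model only.  */

#define MODEL_NELT 8
#define MODEL_LEN 4

static void
model_permute_load_chain (int chain[MODEL_LEN][MODEL_NELT])
{
  int result[MODEL_LEN][MODEL_NELT];

  /* log2 (MODEL_LEN) == 2 stages of extract-even/extract-odd.  */
  for (int stage = 0; stage < 2; stage++)
    {
      for (int j = 0; j < MODEL_LEN; j += 2)
	for (int i = 0; i < MODEL_NELT; i++)
	  {
	    /* Even and odd elements of the concatenation
	       chain[j] | chain[j + 1].  */
	    int e = 2 * i, o = 2 * i + 1;
	    result[j / 2][i]
	      = e < MODEL_NELT ? chain[j][e] : chain[j + 1][e - MODEL_NELT];
	    result[j / 2 + MODEL_LEN / 2][i]
	      = o < MODEL_NELT ? chain[j][o] : chain[j + 1][o - MODEL_NELT];
	  }
      for (int v = 0; v < MODEL_LEN; v++)
	for (int i = 0; i < MODEL_NELT; i++)
	  chain[v][i] = result[v][i];
    }
}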
5524
5525 /* Function vect_shift_permute_load_chain.
5526
5527 Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
5528 a sequence of stmts to reorder the input data accordingly.
5529 Return the final references for loads in RESULT_CHAIN.
5530 Return true if successful, false otherwise.
5531
5532 E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
5533 The input is 3 vectors each containing 8 elements. We assign a
5534 number to each element, the input sequence is:
5535
5536 1st vec: 0 1 2 3 4 5 6 7
5537 2nd vec: 8 9 10 11 12 13 14 15
5538 3rd vec: 16 17 18 19 20 21 22 23
5539
5540 The output sequence should be:
5541
5542 1st vec: 0 3 6 9 12 15 18 21
5543 2nd vec: 1 4 7 10 13 16 19 22
5544 3rd vec: 2 5 8 11 14 17 20 23
5545
5546 We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
5547
5548 First we shuffle all 3 vectors to get correct elements order:
5549
5550 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
5551 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
5552 3rd vec: (16 19 22) (17 20 23) (18 21)
5553
5554 Next we unite and shift the vectors 3 times:
5555
5556 1st step:
5557 shift right by 6 the concatenation of:
5558 "1st vec" and "2nd vec"
5559 ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
5560 "2nd vec" and "3rd vec"
5561 ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
5562 "3rd vec" and "1st vec"
5563 (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
5564 | New vectors |
5565
5566 So that now new vectors are:
5567
5568 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
5569 2nd vec: (10 13) (16 19 22) (17 20 23)
5570 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
5571
5572 2nd step:
5573 shift right by 5 the concatenation of:
5574 "1st vec" and "3rd vec"
5575 ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
5576 "2nd vec" and "1st vec"
5577 (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
5578 "3rd vec" and "2nd vec"
5579 (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
5580 | New vectors |
5581
5582 So that now new vectors are:
5583
5584 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
5585 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
5586 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
5587
5588 3rd step:
5589 shift right by 5 the concatenation of:
5590 "1st vec" and "1st vec"
5591 ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
5592 shift right by 3 the concatenation of:
5593 "2nd vec" and "2nd vec"
5594 (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
5595 | New vectors |
5596
5597 So that now all vectors are READY:
5598 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
5599 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
5600 3rd vec: ( 1 4 7) (10 13) (16 19 22)
5601
5602 This algorithm is faster than the one in vect_permute_load_chain if:
5603 1. "shift of a concatenation" is faster than general permutation.
5604 This is usually so.
5605 2. The TARGET machine can't execute vector instructions in parallel.
5606 This is because each step of the algorithm depends on the previous one.
5607 The algorithm in vect_permute_load_chain is much more parallel.
5608
5609 The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
5610 */
5611
5612 static bool
5613 vect_shift_permute_load_chain (vec<tree> dr_chain,
5614 unsigned int length,
5615 gimple *stmt,
5616 gimple_stmt_iterator *gsi,
5617 vec<tree> *result_chain)
5618 {
5619 tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
5620 tree perm2_mask1, perm2_mask2, perm3_mask;
5621 tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
5622 gimple *perm_stmt;
5623
5624 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5625 unsigned int i;
5626 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
5627 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5628 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5629 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5630
5631 result_chain->quick_grow (length);
5632 memcpy (result_chain->address (), dr_chain.address (),
5633 length * sizeof (tree));
5634
5635 if (exact_log2 (length) != -1 && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 4)
5636 {
5637 unsigned int j, log_length = exact_log2 (length);
5638 for (i = 0; i < nelt / 2; ++i)
5639 sel[i] = i * 2;
5640 for (i = 0; i < nelt / 2; ++i)
5641 sel[nelt / 2 + i] = i * 2 + 1;
5642 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5643 {
5644 if (dump_enabled_p ())
5645 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5646 "shuffle of 2 fields structure is not \
5647 supported by target\n");
5648 return false;
5649 }
5650 perm2_mask1 = vect_gen_perm_mask_checked (vectype, sel);
5651
5652 for (i = 0; i < nelt / 2; ++i)
5653 sel[i] = i * 2 + 1;
5654 for (i = 0; i < nelt / 2; ++i)
5655 sel[nelt / 2 + i] = i * 2;
5656 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5657 {
5658 if (dump_enabled_p ())
5659 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5660 "shuffle of 2 fields structure is not \
5661 supported by target\n");
5662 return false;
5663 }
5664 perm2_mask2 = vect_gen_perm_mask_checked (vectype, sel);
5665
5666 /* Generating permutation constant to shift all elements.
5667 For vector length 8 it is {4 5 6 7 8 9 10 11}. */
5668 for (i = 0; i < nelt; i++)
5669 sel[i] = nelt / 2 + i;
5670 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5671 {
5672 if (dump_enabled_p ())
5673 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5674 "shift permutation is not supported by target\n");
5675 return false;
5676 }
5677 shift1_mask = vect_gen_perm_mask_checked (vectype, sel);
5678
5679 /* Generating permutation constant to select vector from 2.
5680 For vector length 8 it is {0 1 2 3 12 13 14 15}. */
5681 for (i = 0; i < nelt / 2; i++)
5682 sel[i] = i;
5683 for (i = nelt / 2; i < nelt; i++)
5684 sel[i] = nelt + i;
5685 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5686 {
5687 if (dump_enabled_p ())
5688 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5689 "select is not supported by target\n");
5690 return false;
5691 }
5692 select_mask = vect_gen_perm_mask_checked (vectype, sel);
5693
5694 for (i = 0; i < log_length; i++)
5695 {
5696 for (j = 0; j < length; j += 2)
5697 {
5698 first_vect = dr_chain[j];
5699 second_vect = dr_chain[j + 1];
5700
5701 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
5702 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5703 first_vect, first_vect,
5704 perm2_mask1);
5705 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5706 vect[0] = data_ref;
5707
5708 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
5709 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5710 second_vect, second_vect,
5711 perm2_mask2);
5712 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5713 vect[1] = data_ref;
5714
5715 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
5716 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5717 vect[0], vect[1], shift1_mask);
5718 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5719 (*result_chain)[j/2 + length/2] = data_ref;
5720
5721 data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
5722 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5723 vect[0], vect[1], select_mask);
5724 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5725 (*result_chain)[j/2] = data_ref;
5726 }
5727 memcpy (dr_chain.address (), result_chain->address (),
5728 length * sizeof (tree));
5729 }
5730 return true;
5731 }
5732 if (length == 3 && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 2)
5733 {
5734 unsigned int k = 0, l = 0;
5735
5736 /* Generating permutation constant to get all elements in the right order.
5737 For vector length 8 it is {0 3 6 1 4 7 2 5}. */
5738 for (i = 0; i < nelt; i++)
5739 {
5740 if (3 * k + (l % 3) >= nelt)
5741 {
5742 k = 0;
5743 l += (3 - (nelt % 3));
5744 }
5745 sel[i] = 3 * k + (l % 3);
5746 k++;
5747 }
5748 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5749 {
5750 if (dump_enabled_p ())
5751 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5752 "shuffle of 3 fields structure is not \
5753 supported by target\n");
5754 return false;
5755 }
5756 perm3_mask = vect_gen_perm_mask_checked (vectype, sel);
5757
5758 /* Generating permutation constant to shift all elements.
5759 For vector length 8 it is {6 7 8 9 10 11 12 13}. */
5760 for (i = 0; i < nelt; i++)
5761 sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
5762 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5763 {
5764 if (dump_enabled_p ())
5765 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5766 "shift permutation is not supported by target\n");
5767 return false;
5768 }
5769 shift1_mask = vect_gen_perm_mask_checked (vectype, sel);
5770
5771 /* Generating permutation constant to shift all elements.
5772 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
5773 for (i = 0; i < nelt; i++)
5774 sel[i] = 2 * (nelt / 3) + 1 + i;
5775 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5776 {
5777 if (dump_enabled_p ())
5778 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5779 "shift permutation is not supported by target\n");
5780 return false;
5781 }
5782 shift2_mask = vect_gen_perm_mask_checked (vectype, sel);
5783
5784 /* Generating permutation constant to shift all elements.
5785 For vector length 8 it is {3 4 5 6 7 8 9 10}. */
5786 for (i = 0; i < nelt; i++)
5787 sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
5788 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5789 {
5790 if (dump_enabled_p ())
5791 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5792 "shift permutation is not supported by target\n");
5793 return false;
5794 }
5795 shift3_mask = vect_gen_perm_mask_checked (vectype, sel);
5796
5797 /* Generating permutation constant to shift all elements.
5798 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
5799 for (i = 0; i < nelt; i++)
5800 sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
5801 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5802 {
5803 if (dump_enabled_p ())
5804 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5805 "shift permutation is not supported by target\n");
5806 return false;
5807 }
5808 shift4_mask = vect_gen_perm_mask_checked (vectype, sel);
5809
5810 for (k = 0; k < 3; k++)
5811 {
5812 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
5813 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5814 dr_chain[k], dr_chain[k],
5815 perm3_mask);
5816 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5817 vect[k] = data_ref;
5818 }
5819
5820 for (k = 0; k < 3; k++)
5821 {
5822 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
5823 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5824 vect[k % 3], vect[(k + 1) % 3],
5825 shift1_mask);
5826 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5827 vect_shift[k] = data_ref;
5828 }
5829
5830 for (k = 0; k < 3; k++)
5831 {
5832 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
5833 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5834 vect_shift[(4 - k) % 3],
5835 vect_shift[(3 - k) % 3],
5836 shift2_mask);
5837 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5838 vect[k] = data_ref;
5839 }
5840
5841 (*result_chain)[3 - (nelt % 3)] = vect[2];
5842
5843 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
5844 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
5845 vect[0], shift3_mask);
5846 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5847 (*result_chain)[nelt % 3] = data_ref;
5848
5849 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
5850 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
5851 vect[1], shift4_mask);
5852 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5853 (*result_chain)[0] = data_ref;
5854 return true;
5855 }
5856 return false;
5857 }
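
/* A standalone sketch of the five selector vectors built above for the
   LENGTH == 3 path, assuming an 8-element vector (nelt == 8):
   perm3  {0 3 6 1 4 7 2 5}, shift1 {6..13}, shift2 {5..12},
   shift3 {3..10} and shift4 {5..12}.  Only the sel[] contents are
   recomputed here; the VEC_PERM_EXPRs themselves are not modelled.  */

static void
model_shift3_selectors (unsigned nelt, unsigned char sel[5][64])
{
  unsigned i, k = 0, l = 0;

  for (i = 0; i < nelt; i++)
    {
      if (3 * k + (l % 3) >= nelt)
	{
	  k = 0;
	  l += 3 - (nelt % 3);
	}
      sel[0][i] = 3 * k + (l % 3);			/* perm3  */
      k++;
    }
  for (i = 0; i < nelt; i++)
    {
      sel[1][i] = 2 * (nelt / 3) + (nelt % 3) + i;	/* shift1  */
      sel[2][i] = 2 * (nelt / 3) + 1 + i;		/* shift2  */
      sel[3][i] = (nelt / 3) + (nelt % 3) / 2 + i;	/* shift3  */
      sel[4][i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;	/* shift4  */
    }
}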
5858
5859 /* Function vect_transform_grouped_load.
5860
5861 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
5862 to perform their permutation and ascribe the result vectorized statements to
5863 the scalar statements.
5864 */
5865
5866 void
5867 vect_transform_grouped_load (gimple *stmt, vec<tree> dr_chain, int size,
5868 gimple_stmt_iterator *gsi)
5869 {
5870 machine_mode mode;
5871 vec<tree> result_chain = vNULL;
5872
5873 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
5874 RESULT_CHAIN is the output of vect_permute_load_chain; it contains permuted
5875 vectors that are ready for vector computation. */
5876 result_chain.create (size);
5877
5878 /* If the reassociation width for the vector type is 2 or greater, the target
5879 machine can execute 2 or more vector instructions in parallel. Otherwise try
5880 to get the chain for the load group using vect_shift_permute_load_chain. */
5881 mode = TYPE_MODE (STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)));
5882 if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
5883 || exact_log2 (size) != -1
5884 || !vect_shift_permute_load_chain (dr_chain, size, stmt,
5885 gsi, &result_chain))
5886 vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain);
5887 vect_record_grouped_load_vectors (stmt, result_chain);
5888 result_chain.release ();
5889 }
5890
5891 /* RESULT_CHAIN contains the output of a group of grouped loads that were
5892 generated as part of the vectorization of STMT. Assign the statement
5893 for each vector to the associated scalar statement. */
5894
5895 void
5896 vect_record_grouped_load_vectors (gimple *stmt, vec<tree> result_chain)
5897 {
5898 gimple *first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
5899 gimple *next_stmt, *new_stmt;
5900 unsigned int i, gap_count;
5901 tree tmp_data_ref;
5902
5903 /* Put a permuted data-ref in the VECTORIZED_STMT field.
5904 Since we scan the chain starting from its first node, their order
5905 corresponds to the order of data-refs in RESULT_CHAIN. */
5906 next_stmt = first_stmt;
5907 gap_count = 1;
5908 FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
5909 {
5910 if (!next_stmt)
5911 break;
5912
5913 /* Skip the gaps. Loads created for the gaps will be removed by the dead
5914 code elimination pass later. No need to check for the first stmt in
5915 the group, since it always exists.
5916 GROUP_GAP is the number of steps in elements from the previous
5917 access (if there is no gap GROUP_GAP is 1). We skip loads that
5918 correspond to the gaps. */
5919 if (next_stmt != first_stmt
5920 && gap_count < GROUP_GAP (vinfo_for_stmt (next_stmt)))
5921 {
5922 gap_count++;
5923 continue;
5924 }
5925
5926 while (next_stmt)
5927 {
5928 new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
5929 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
5930 copies, and we put the new vector statement in the first available
5931 RELATED_STMT. */
5932 if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
5933 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
5934 else
5935 {
5936 if (!GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5937 {
5938 gimple *prev_stmt =
5939 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
5940 gimple *rel_stmt =
5941 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt));
5942 while (rel_stmt)
5943 {
5944 prev_stmt = rel_stmt;
5945 rel_stmt =
5946 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
5947 }
5948
5949 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) =
5950 new_stmt;
5951 }
5952 }
5953
5954 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
5955 gap_count = 1;
5956 /* If NEXT_STMT accesses the same DR as the previous statement,
5957 put the same TMP_DATA_REF as its vectorized statement; otherwise
5958 get the next data-ref from RESULT_CHAIN. */
5959 if (!next_stmt || !GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5960 break;
5961 }
5962 }
5963 }
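
/* A standalone sketch of the GROUP_GAP-driven skipping above, ignoring
   the GROUP_SAME_DR_STMT case: GROUP_GAP[S] is the distance in elements
   from the previous access, so GROUP_GAP[S] - 1 entries of the result
   chain are skipped before statement S is given its vectorized
   statement.  The array-based interface is an assumption of the model
   only.  */

static void
model_assign_grouped_loads (const unsigned *group_gap, unsigned nstmts,
			    unsigned nvects, int *vect_for_stmt)
{
  unsigned s = 0, gap_count = 1;

  for (unsigned v = 0; v < nvects && s < nstmts; v++)
    {
      /* Entries that correspond to gaps are skipped; the loads created
	 for them are removed later as dead code.  */
      if (s != 0 && gap_count < group_gap[s])
	{
	  gap_count++;
	  continue;
	}
      vect_for_stmt[s++] = v;
      gap_count = 1;
    }
}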
5964
5965 /* Function vect_can_force_dr_alignment_p.
5966
5967 Returns whether the alignment of a DECL can be forced to be aligned
5968 on an ALIGNMENT-bit boundary. */
5969
5970 bool
5971 vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
5972 {
5973 if (TREE_CODE (decl) != VAR_DECL)
5974 return false;
5975
5976 if (decl_in_symtab_p (decl)
5977 && !symtab_node::get (decl)->can_increase_alignment_p ())
5978 return false;
5979
5980 if (TREE_STATIC (decl))
5981 return (alignment <= MAX_OFILE_ALIGNMENT);
5982 else
5983 return (alignment <= MAX_STACK_ALIGNMENT);
5984 }
5985
5986
5987 /* Return whether the data reference DR is supported with respect to its
5988 alignment.
5989 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
5990 if it is aligned, i.e., check if it is possible to vectorize it with a
5991 different alignment. */
5992
5993 enum dr_alignment_support
5994 vect_supportable_dr_alignment (struct data_reference *dr,
5995 bool check_aligned_accesses)
5996 {
5997 gimple *stmt = DR_STMT (dr);
5998 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5999 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6000 machine_mode mode = TYPE_MODE (vectype);
6001 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6002 struct loop *vect_loop = NULL;
6003 bool nested_in_vect_loop = false;
6004
6005 if (aligned_access_p (dr) && !check_aligned_accesses)
6006 return dr_aligned;
6007
6008 /* For now assume all conditional loads/stores support unaligned
6009 access without any special code. */
6010 if (is_gimple_call (stmt)
6011 && gimple_call_internal_p (stmt)
6012 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
6013 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
6014 return dr_unaligned_supported;
6015
6016 if (loop_vinfo)
6017 {
6018 vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
6019 nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
6020 }
6021
6022 /* Possibly unaligned access. */
6023
6024 /* We can choose between using the implicit realignment scheme (generating
6025 a misaligned_move stmt) and the explicit realignment scheme (generating
6026 aligned loads with a REALIGN_LOAD). There are two variants to the
6027 explicit realignment scheme: optimized, and unoptimized.
6028 We can optimize the realignment only if the step between consecutive
6029 vector loads is equal to the vector size. Since the vector memory
6030 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
6031 is guaranteed that the misalignment amount remains the same throughout the
6032 execution of the vectorized loop. Therefore, we can create the
6033 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
6034 at the loop preheader.
6035
6036 However, in the case of outer-loop vectorization, when vectorizing a
6037 memory access in the inner-loop nested within the LOOP that is now being
6038 vectorized, while it is guaranteed that the misalignment of the
6039 vectorized memory access will remain the same in different outer-loop
6040 iterations, it is *not* guaranteed that it will remain the same throughout
6041 the execution of the inner-loop. This is because the inner-loop advances
6042 with the original scalar step (and not in steps of VS). If the inner-loop
6043 step happens to be a multiple of VS, then the misalignment remains fixed
6044 and we can use the optimized realignment scheme. For example:
6045
6046 for (i=0; i<N; i++)
6047 for (j=0; j<M; j++)
6048 s += a[i+j];
6049
6050 When vectorizing the i-loop in the above example, the step between
6051 consecutive vector loads is 1, and so the misalignment does not remain
6052 fixed across the execution of the inner-loop, and the realignment cannot
6053 be optimized (as illustrated in the following pseudo vectorized loop):
6054
6055 for (i=0; i<N; i+=4)
6056 for (j=0; j<M; j++){
6057 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
6058 // when j is {0,1,2,3,4,5,6,7,...} respectively.
6059 // (assuming that we start from an aligned address).
6060 }
6061
6062 We therefore have to use the unoptimized realignment scheme:
6063
6064 for (i=0; i<N; i+=4)
6065 for (j=k; j<M; j+=4)
6066 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
6067 // that the misalignment of the initial address is
6068 // 0).
6069
6070 The loop can then be vectorized as follows:
6071
6072 for (k=0; k<4; k++){
6073 rt = get_realignment_token (&vp[k]);
6074 for (i=0; i<N; i+=4){
6075 v1 = vp[i+k];
6076 for (j=k; j<M; j+=4){
6077 v2 = vp[i+j+VS-1];
6078 va = REALIGN_LOAD <v1,v2,rt>;
6079 vs += va;
6080 v1 = v2;
6081 }
6082 }
6083 } */
6084
6085 if (DR_IS_READ (dr))
6086 {
6087 bool is_packed = false;
6088 tree type = (TREE_TYPE (DR_REF (dr)));
6089
6090 if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
6091 && (!targetm.vectorize.builtin_mask_for_load
6092 || targetm.vectorize.builtin_mask_for_load ()))
6093 {
6094 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6095
6096 /* If we are doing SLP then the accesses need not have the
6097 same alignment; instead it depends on the SLP group size. */
6098 if (loop_vinfo
6099 && STMT_SLP_TYPE (stmt_info)
6100 && (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6101 * GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)))
6102 % TYPE_VECTOR_SUBPARTS (vectype) != 0))
6103 ;
6104 else if (!loop_vinfo
6105 || (nested_in_vect_loop
6106 && (TREE_INT_CST_LOW (DR_STEP (dr))
6107 != GET_MODE_SIZE (TYPE_MODE (vectype)))))
6108 return dr_explicit_realign;
6109 else
6110 return dr_explicit_realign_optimized;
6111 }
6112 if (!known_alignment_for_access_p (dr))
6113 is_packed = not_size_aligned (DR_REF (dr));
6114
6115 if ((TYPE_USER_ALIGN (type) && !is_packed)
6116 || targetm.vectorize.
6117 support_vector_misalignment (mode, type,
6118 DR_MISALIGNMENT (dr), is_packed))
6119 /* Can't software pipeline the loads, but can at least do them. */
6120 return dr_unaligned_supported;
6121 }
6122 else
6123 {
6124 bool is_packed = false;
6125 tree type = (TREE_TYPE (DR_REF (dr)));
6126
6127 if (!known_alignment_for_access_p (dr))
6128 is_packed = not_size_aligned (DR_REF (dr));
6129
6130 if ((TYPE_USER_ALIGN (type) && !is_packed)
6131 || targetm.vectorize.
6132 support_vector_misalignment (mode, type,
6133 DR_MISALIGNMENT (dr), is_packed))
6134 return dr_unaligned_supported;
6135 }
6136
6137 /* Unsupported. */
6138 return dr_unaligned_unsupported;
6139 }
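
/* A standalone scalar model of the REALIGN_LOAD used in the pseudo code
   above: given two consecutive aligned vectors V1 and V2 that together
   cover the misaligned access, and a realignment token reduced here to
   the element misalignment OFF (0 <= OFF < VS), the result is elements
   OFF .. OFF + VS - 1 of the concatenation V1 | V2.  VS == 4 and the
   integer element type are assumptions of the model only.  */

#define MODEL_VS 4

static void
model_realign_load (const int v1[MODEL_VS], const int v2[MODEL_VS],
		    unsigned off, int out[MODEL_VS])
{
  for (unsigned i = 0; i < MODEL_VS; i++)
    out[i] = off + i < MODEL_VS ? v1[off + i] : v2[off + i - MODEL_VS];
}

/* E.g. with v1 = {a0, a1, a2, a3}, v2 = {a4, a5, a6, a7} and off == 1
   the result is {a1, a2, a3, a4}: the misaligned vector starting one
   element past the aligned address, as in va = REALIGN_LOAD <v1,v2,rt>.  */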