]> git.ipfire.org Git - thirdparty/gcc.git/blob - gcc/tree-vect-loop-manip.c
2019-06-13 Richard Biener <rguenther@suse.de>
[thirdparty/gcc.git] / gcc / tree-vect-loop-manip.c
1 /* Vectorizer Specific Loop Manipulations
2 Copyright (C) 2003-2019 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "tree.h"
27 #include "gimple.h"
28 #include "cfghooks.h"
29 #include "tree-pass.h"
30 #include "ssa.h"
31 #include "fold-const.h"
32 #include "cfganal.h"
33 #include "gimplify.h"
34 #include "gimple-iterator.h"
35 #include "gimplify-me.h"
36 #include "tree-cfg.h"
37 #include "tree-ssa-loop-manip.h"
38 #include "tree-into-ssa.h"
39 #include "tree-ssa.h"
40 #include "cfgloop.h"
41 #include "tree-scalar-evolution.h"
42 #include "tree-vectorizer.h"
43 #include "tree-ssa-loop-ivopts.h"
44 #include "gimple-fold.h"
45 #include "tree-ssa-loop-niter.h"
46 #include "internal-fn.h"
47 #include "stor-layout.h"
48 #include "optabs-query.h"
49 #include "vec-perm-indices.h"
50
51 /*************************************************************************
52 Simple Loop Peeling Utilities
53
54 Utilities to support loop peeling for vectorization purposes.
55 *************************************************************************/
56
57
58 /* Renames the use *OP_P. */
59
60 static void
61 rename_use_op (use_operand_p op_p)
62 {
63 tree new_name;
64
65 if (TREE_CODE (USE_FROM_PTR (op_p)) != SSA_NAME)
66 return;
67
68 new_name = get_current_def (USE_FROM_PTR (op_p));
69
70 /* Something defined outside of the loop. */
71 if (!new_name)
72 return;
73
74 /* An ordinary ssa name defined in the loop. */
75
76 SET_USE (op_p, new_name);
77 }
78
79
80 /* Renames the variables in basic block BB. Allow renaming of PHI arguments
81 on edges incoming from outer-block header if RENAME_FROM_OUTER_LOOP is
82 true. */
83
84 static void
85 rename_variables_in_bb (basic_block bb, bool rename_from_outer_loop)
86 {
87 gimple *stmt;
88 use_operand_p use_p;
89 ssa_op_iter iter;
90 edge e;
91 edge_iterator ei;
92 struct loop *loop = bb->loop_father;
93 struct loop *outer_loop = NULL;
94
95 if (rename_from_outer_loop)
96 {
97 gcc_assert (loop);
98 outer_loop = loop_outer (loop);
99 }
100
101 for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
102 gsi_next (&gsi))
103 {
104 stmt = gsi_stmt (gsi);
105 FOR_EACH_SSA_USE_OPERAND (use_p, stmt, iter, SSA_OP_ALL_USES)
106 rename_use_op (use_p);
107 }
108
109 FOR_EACH_EDGE (e, ei, bb->preds)
110 {
111 if (!flow_bb_inside_loop_p (loop, e->src))
112 {
113 if (!rename_from_outer_loop)
114 continue;
115 if (e->src != outer_loop->header)
116 {
117 if (outer_loop->inner->next)
118 {
119 /* If outer_loop has 2 inner loops, allow there to
120 be an extra basic block which decides which of the
121 two loops to use using LOOP_VECTORIZED. */
122 if (!single_pred_p (e->src)
123 || single_pred (e->src) != outer_loop->header)
124 continue;
125 }
126 }
127 }
128 for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
129 gsi_next (&gsi))
130 rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (gsi.phi (), e));
131 }
132 }
133
134
135 struct adjust_info
136 {
137 tree from, to;
138 basic_block bb;
139 };
140
141 /* A stack of values to be adjusted in debug stmts. We have to
142 process them LIFO, so that the closest substitution applies. If we
143 processed them FIFO, without the stack, we might substitute uses
144 with a PHI DEF that would soon become non-dominant, and when we got
145 to the suitable one, it wouldn't have anything to substitute any
146 more. */
147 static vec<adjust_info, va_heap> adjust_vec;
148
149 /* Adjust any debug stmts that referenced AI->from values to use the
150 loop-closed AI->to, if the references are dominated by AI->bb and
151 not by the definition of AI->from. */
152
153 static void
154 adjust_debug_stmts_now (adjust_info *ai)
155 {
156 basic_block bbphi = ai->bb;
157 tree orig_def = ai->from;
158 tree new_def = ai->to;
159 imm_use_iterator imm_iter;
160 gimple *stmt;
161 basic_block bbdef = gimple_bb (SSA_NAME_DEF_STMT (orig_def));
162
163 gcc_assert (dom_info_available_p (CDI_DOMINATORS));
164
165 /* Adjust any debug stmts that held onto non-loop-closed
166 references. */
167 FOR_EACH_IMM_USE_STMT (stmt, imm_iter, orig_def)
168 {
169 use_operand_p use_p;
170 basic_block bbuse;
171
172 if (!is_gimple_debug (stmt))
173 continue;
174
175 gcc_assert (gimple_debug_bind_p (stmt));
176
177 bbuse = gimple_bb (stmt);
178
179 if ((bbuse == bbphi
180 || dominated_by_p (CDI_DOMINATORS, bbuse, bbphi))
181 && !(bbuse == bbdef
182 || dominated_by_p (CDI_DOMINATORS, bbuse, bbdef)))
183 {
184 if (new_def)
185 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
186 SET_USE (use_p, new_def);
187 else
188 {
189 gimple_debug_bind_reset_value (stmt);
190 update_stmt (stmt);
191 }
192 }
193 }
194 }
195
196 /* Adjust debug stmts as scheduled before. */
197
198 static void
199 adjust_vec_debug_stmts (void)
200 {
201 if (!MAY_HAVE_DEBUG_BIND_STMTS)
202 return;
203
204 gcc_assert (adjust_vec.exists ());
205
206 while (!adjust_vec.is_empty ())
207 {
208 adjust_debug_stmts_now (&adjust_vec.last ());
209 adjust_vec.pop ();
210 }
211 }
212
213 /* Adjust any debug stmts that referenced FROM values to use the
214 loop-closed TO, if the references are dominated by BB and not by
215 the definition of FROM. If adjust_vec is non-NULL, adjustments
216 will be postponed until adjust_vec_debug_stmts is called. */
217
218 static void
219 adjust_debug_stmts (tree from, tree to, basic_block bb)
220 {
221 adjust_info ai;
222
223 if (MAY_HAVE_DEBUG_BIND_STMTS
224 && TREE_CODE (from) == SSA_NAME
225 && ! SSA_NAME_IS_DEFAULT_DEF (from)
226 && ! virtual_operand_p (from))
227 {
228 ai.from = from;
229 ai.to = to;
230 ai.bb = bb;
231
232 if (adjust_vec.exists ())
233 adjust_vec.safe_push (ai);
234 else
235 adjust_debug_stmts_now (&ai);
236 }
237 }
238
239 /* Change E's phi arg in UPDATE_PHI to NEW_DEF, and record information
240 to adjust any debug stmts that referenced the old phi arg,
241 presumably non-loop-closed references left over from other
242 transformations. */
243
244 static void
245 adjust_phi_and_debug_stmts (gimple *update_phi, edge e, tree new_def)
246 {
247 tree orig_def = PHI_ARG_DEF_FROM_EDGE (update_phi, e);
248
249 SET_PHI_ARG_DEF (update_phi, e->dest_idx, new_def);
250
251 if (MAY_HAVE_DEBUG_BIND_STMTS)
252 adjust_debug_stmts (orig_def, PHI_RESULT (update_phi),
253 gimple_bb (update_phi));
254 }
255
256 /* Define one loop mask MASK from loop LOOP. INIT_MASK is the value that
257 the mask should have during the first iteration and NEXT_MASK is the
258 value that it should have on subsequent iterations. */
259
260 static void
261 vect_set_loop_mask (struct loop *loop, tree mask, tree init_mask,
262 tree next_mask)
263 {
264 gphi *phi = create_phi_node (mask, loop->header);
265 add_phi_arg (phi, init_mask, loop_preheader_edge (loop), UNKNOWN_LOCATION);
266 add_phi_arg (phi, next_mask, loop_latch_edge (loop), UNKNOWN_LOCATION);
267 }
268
269 /* Add SEQ to the end of LOOP's preheader block. */
270
271 static void
272 add_preheader_seq (struct loop *loop, gimple_seq seq)
273 {
274 if (seq)
275 {
276 edge pe = loop_preheader_edge (loop);
277 basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
278 gcc_assert (!new_bb);
279 }
280 }
281
282 /* Add SEQ to the beginning of LOOP's header block. */
283
284 static void
285 add_header_seq (struct loop *loop, gimple_seq seq)
286 {
287 if (seq)
288 {
289 gimple_stmt_iterator gsi = gsi_after_labels (loop->header);
290 gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
291 }
292 }
293
294 /* Return true if the target can interleave elements of two vectors.
295 OFFSET is 0 if the first half of the vectors should be interleaved
296 or 1 if the second half should. When returning true, store the
297 associated permutation in INDICES. */
298
299 static bool
300 interleave_supported_p (vec_perm_indices *indices, tree vectype,
301 unsigned int offset)
302 {
303 poly_uint64 nelts = TYPE_VECTOR_SUBPARTS (vectype);
304 poly_uint64 base = exact_div (nelts, 2) * offset;
305 vec_perm_builder sel (nelts, 2, 3);
306 for (unsigned int i = 0; i < 3; ++i)
307 {
308 sel.quick_push (base + i);
309 sel.quick_push (base + i + nelts);
310 }
311 indices->new_vector (sel, 2, nelts);
312 return can_vec_perm_const_p (TYPE_MODE (vectype), *indices);
313 }
314
315 /* Try to use permutes to define the masks in DEST_RGM using the masks
316 in SRC_RGM, given that the former has twice as many masks as the
317 latter. Return true on success, adding any new statements to SEQ. */
318
319 static bool
320 vect_maybe_permute_loop_masks (gimple_seq *seq, rgroup_masks *dest_rgm,
321 rgroup_masks *src_rgm)
322 {
323 tree src_masktype = src_rgm->mask_type;
324 tree dest_masktype = dest_rgm->mask_type;
325 machine_mode src_mode = TYPE_MODE (src_masktype);
326 if (dest_rgm->max_nscalars_per_iter <= src_rgm->max_nscalars_per_iter
327 && optab_handler (vec_unpacku_hi_optab, src_mode) != CODE_FOR_nothing
328 && optab_handler (vec_unpacku_lo_optab, src_mode) != CODE_FOR_nothing)
329 {
330 /* Unpacking the source masks gives at least as many mask bits as
331 we need. We can then VIEW_CONVERT any excess bits away. */
332 tree unpack_masktype = vect_halve_mask_nunits (src_masktype);
333 for (unsigned int i = 0; i < dest_rgm->masks.length (); ++i)
334 {
335 tree src = src_rgm->masks[i / 2];
336 tree dest = dest_rgm->masks[i];
337 tree_code code = ((i & 1) == (BYTES_BIG_ENDIAN ? 0 : 1)
338 ? VEC_UNPACK_HI_EXPR
339 : VEC_UNPACK_LO_EXPR);
340 gassign *stmt;
341 if (dest_masktype == unpack_masktype)
342 stmt = gimple_build_assign (dest, code, src);
343 else
344 {
345 tree temp = make_ssa_name (unpack_masktype);
346 stmt = gimple_build_assign (temp, code, src);
347 gimple_seq_add_stmt (seq, stmt);
348 stmt = gimple_build_assign (dest, VIEW_CONVERT_EXPR,
349 build1 (VIEW_CONVERT_EXPR,
350 dest_masktype, temp));
351 }
352 gimple_seq_add_stmt (seq, stmt);
353 }
354 return true;
355 }
356 vec_perm_indices indices[2];
357 if (dest_masktype == src_masktype
358 && interleave_supported_p (&indices[0], src_masktype, 0)
359 && interleave_supported_p (&indices[1], src_masktype, 1))
360 {
361 /* The destination requires twice as many mask bits as the source, so
362 we can use interleaving permutes to double up the number of bits. */
363 tree masks[2];
364 for (unsigned int i = 0; i < 2; ++i)
365 masks[i] = vect_gen_perm_mask_checked (src_masktype, indices[i]);
366 for (unsigned int i = 0; i < dest_rgm->masks.length (); ++i)
367 {
368 tree src = src_rgm->masks[i / 2];
369 tree dest = dest_rgm->masks[i];
370 gimple *stmt = gimple_build_assign (dest, VEC_PERM_EXPR,
371 src, src, masks[i & 1]);
372 gimple_seq_add_stmt (seq, stmt);
373 }
374 return true;
375 }
376 return false;
377 }
378
379 /* Helper for vect_set_loop_condition_masked. Generate definitions for
380 all the masks in RGM and return a mask that is nonzero when the loop
381 needs to iterate. Add any new preheader statements to PREHEADER_SEQ.
382 Use LOOP_COND_GSI to insert code before the exit gcond.
383
384 RGM belongs to loop LOOP. The loop originally iterated NITERS
385 times and has been vectorized according to LOOP_VINFO. Each iteration
386 of the vectorized loop handles VF iterations of the scalar loop.
387
388 If NITERS_SKIP is nonnull, the first iteration of the vectorized loop
389 starts with NITERS_SKIP dummy iterations of the scalar loop before
390 the real work starts. The mask elements for these dummy iterations
391 must be 0, to ensure that the extra iterations do not have an effect.
392
393 It is known that:
394
395 NITERS * RGM->max_nscalars_per_iter
396
397 does not overflow. However, MIGHT_WRAP_P says whether an induction
398 variable that starts at 0 and has step:
399
400 VF * RGM->max_nscalars_per_iter
401
402 might overflow before hitting a value above:
403
404 (NITERS + NITERS_SKIP) * RGM->max_nscalars_per_iter
405
406 This means that we cannot guarantee that such an induction variable
407 would ever hit a value that produces a set of all-false masks for RGM. */
408
409 static tree
410 vect_set_loop_masks_directly (struct loop *loop, loop_vec_info loop_vinfo,
411 gimple_seq *preheader_seq,
412 gimple_stmt_iterator loop_cond_gsi,
413 rgroup_masks *rgm, tree vf,
414 tree niters, tree niters_skip,
415 bool might_wrap_p)
416 {
417 tree compare_type = LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo);
418 tree iv_type = LOOP_VINFO_MASK_IV_TYPE (loop_vinfo);
419 tree mask_type = rgm->mask_type;
420 unsigned int nscalars_per_iter = rgm->max_nscalars_per_iter;
421 poly_uint64 nscalars_per_mask = TYPE_VECTOR_SUBPARTS (mask_type);
422
423 /* Calculate the maximum number of scalar values that the rgroup
424 handles in total, the number that it handles for each iteration
425 of the vector loop, and the number that it should skip during the
426 first iteration of the vector loop. */
427 tree nscalars_total = niters;
428 tree nscalars_step = vf;
429 tree nscalars_skip = niters_skip;
430 if (nscalars_per_iter != 1)
431 {
432 /* We checked before choosing to use a fully-masked loop that these
433 multiplications don't overflow. */
434 tree factor = build_int_cst (compare_type, nscalars_per_iter);
435 nscalars_total = gimple_build (preheader_seq, MULT_EXPR, compare_type,
436 nscalars_total, factor);
437 nscalars_step = gimple_build (preheader_seq, MULT_EXPR, compare_type,
438 nscalars_step, factor);
439 if (nscalars_skip)
440 nscalars_skip = gimple_build (preheader_seq, MULT_EXPR, compare_type,
441 nscalars_skip, factor);
442 }
443
444 /* Create an induction variable that counts the number of scalars
445 processed. */
446 tree index_before_incr, index_after_incr;
447 gimple_stmt_iterator incr_gsi;
448 bool insert_after;
449 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
450
451 tree zero_index = build_int_cst (iv_type, 0);
452 tree step = build_int_cst (iv_type,
453 LOOP_VINFO_VECT_FACTOR (loop_vinfo));
454 /* Create IV of iv_type. */
455 create_iv (zero_index, step, NULL_TREE, loop, &incr_gsi,
456 insert_after, &index_before_incr, &index_after_incr);
457
458 zero_index = build_int_cst (compare_type, 0);
459 tree test_index, test_limit, first_limit;
460 gimple_stmt_iterator *test_gsi;
461 if (might_wrap_p)
462 {
463 /* In principle the loop should stop iterating once the incremented
464 IV reaches a value greater than or equal to:
465
466 NSCALARS_TOTAL +[infinite-prec] NSCALARS_SKIP
467
468 However, there's no guarantee that this addition doesn't overflow
469 the comparison type, or that the IV hits a value above it before
470 wrapping around. We therefore adjust the limit down by one
471 IV step:
472
473 (NSCALARS_TOTAL +[infinite-prec] NSCALARS_SKIP)
474 -[infinite-prec] NSCALARS_STEP
475
476 and compare the IV against this limit _before_ incrementing it.
477 Since the comparison type is unsigned, we actually want the
478 subtraction to saturate at zero:
479
480 (NSCALARS_TOTAL +[infinite-prec] NSCALARS_SKIP)
481 -[sat] NSCALARS_STEP
482
483 And since NSCALARS_SKIP < NSCALARS_STEP, we can reassociate this as:
484
485 NSCALARS_TOTAL -[sat] (NSCALARS_STEP - NSCALARS_SKIP)
486
487 where the rightmost subtraction can be done directly in
488 COMPARE_TYPE. */
489 test_index = index_before_incr;
490 tree adjust = nscalars_step;
491 if (nscalars_skip)
492 adjust = gimple_build (preheader_seq, MINUS_EXPR, compare_type,
493 adjust, nscalars_skip);
494 test_limit = gimple_build (preheader_seq, MAX_EXPR, compare_type,
495 nscalars_total, adjust);
496 test_limit = gimple_build (preheader_seq, MINUS_EXPR, compare_type,
497 test_limit, adjust);
498 test_gsi = &incr_gsi;
499
500 /* Get a safe limit for the first iteration. */
501 if (nscalars_skip)
502 {
503 /* The first vector iteration can handle at most NSCALARS_STEP
504 scalars. NSCALARS_STEP <= CONST_LIMIT, and adding
505 NSCALARS_SKIP to that cannot overflow. */
506 tree const_limit = build_int_cst (compare_type,
507 LOOP_VINFO_VECT_FACTOR (loop_vinfo)
508 * nscalars_per_iter);
509 first_limit = gimple_build (preheader_seq, MIN_EXPR, compare_type,
510 nscalars_total, const_limit);
511 first_limit = gimple_build (preheader_seq, PLUS_EXPR, compare_type,
512 first_limit, nscalars_skip);
513 }
514 else
515 /* For the first iteration it doesn't matter whether the IV hits
516 a value above NSCALARS_TOTAL. That only matters for the latch
517 condition. */
518 first_limit = nscalars_total;
519 }
520 else
521 {
522 /* Test the incremented IV, which will always hit a value above
523 the bound before wrapping. */
524 test_index = index_after_incr;
525 test_limit = nscalars_total;
526 if (nscalars_skip)
527 test_limit = gimple_build (preheader_seq, PLUS_EXPR, compare_type,
528 test_limit, nscalars_skip);
529 test_gsi = &loop_cond_gsi;
530
531 first_limit = test_limit;
532 }
533
534 /* Provide a definition of each mask in the group. */
535 tree next_mask = NULL_TREE;
536 tree mask;
537 unsigned int i;
538 gimple_seq test_seq = NULL;
539 test_index = gimple_convert (&test_seq, compare_type, test_index);
540 gsi_insert_seq_before (test_gsi, test_seq, GSI_SAME_STMT);
541
542 FOR_EACH_VEC_ELT_REVERSE (rgm->masks, i, mask)
543 {
544 /* Previous masks will cover BIAS scalars. This mask covers the
545 next batch. */
546 poly_uint64 bias = nscalars_per_mask * i;
547 tree bias_tree = build_int_cst (compare_type, bias);
548 gimple *tmp_stmt;
549
550 /* See whether the first iteration of the vector loop is known
551 to have a full mask. */
552 poly_uint64 const_limit;
553 bool first_iteration_full
554 = (poly_int_tree_p (first_limit, &const_limit)
555 && known_ge (const_limit, (i + 1) * nscalars_per_mask));
556
557 /* Rather than have a new IV that starts at BIAS and goes up to
558 TEST_LIMIT, prefer to use the same 0-based IV for each mask
559 and adjust the bound down by BIAS. */
560 tree this_test_limit = test_limit;
561 if (i != 0)
562 {
563 this_test_limit = gimple_build (preheader_seq, MAX_EXPR,
564 compare_type, this_test_limit,
565 bias_tree);
566 this_test_limit = gimple_build (preheader_seq, MINUS_EXPR,
567 compare_type, this_test_limit,
568 bias_tree);
569 }
570
571 /* Create the initial mask. First include all scalars that
572 are within the loop limit. */
573 tree init_mask = NULL_TREE;
574 if (!first_iteration_full)
575 {
576 tree start, end;
577 if (first_limit == test_limit)
578 {
579 /* Use a natural test between zero (the initial IV value)
580 and the loop limit. The "else" block would be valid too,
581 but this choice can avoid the need to load BIAS_TREE into
582 a register. */
583 start = zero_index;
584 end = this_test_limit;
585 }
586 else
587 {
588 /* FIRST_LIMIT is the maximum number of scalars handled by the
589 first iteration of the vector loop. Test the portion
590 associated with this mask. */
591 start = bias_tree;
592 end = first_limit;
593 }
594
595 init_mask = make_temp_ssa_name (mask_type, NULL, "max_mask");
596 tmp_stmt = vect_gen_while (init_mask, start, end);
597 gimple_seq_add_stmt (preheader_seq, tmp_stmt);
598 }
599
600 /* Now AND out the bits that are within the number of skipped
601 scalars. */
602 poly_uint64 const_skip;
603 if (nscalars_skip
604 && !(poly_int_tree_p (nscalars_skip, &const_skip)
605 && known_le (const_skip, bias)))
606 {
607 tree unskipped_mask = vect_gen_while_not (preheader_seq, mask_type,
608 bias_tree, nscalars_skip);
609 if (init_mask)
610 init_mask = gimple_build (preheader_seq, BIT_AND_EXPR, mask_type,
611 init_mask, unskipped_mask);
612 else
613 init_mask = unskipped_mask;
614 }
615
616 if (!init_mask)
617 /* First iteration is full. */
618 init_mask = build_minus_one_cst (mask_type);
619
620 /* Get the mask value for the next iteration of the loop. */
621 next_mask = make_temp_ssa_name (mask_type, NULL, "next_mask");
622 gcall *call = vect_gen_while (next_mask, test_index, this_test_limit);
623 gsi_insert_before (test_gsi, call, GSI_SAME_STMT);
624
625 vect_set_loop_mask (loop, mask, init_mask, next_mask);
626 }
627 return next_mask;
628 }
629
630 /* Make LOOP iterate NITERS times using masking and WHILE_ULT calls.
631 LOOP_VINFO describes the vectorization of LOOP. NITERS is the
632 number of iterations of the original scalar loop that should be
633 handled by the vector loop. NITERS_MAYBE_ZERO and FINAL_IV are
634 as for vect_set_loop_condition.
635
636 Insert the branch-back condition before LOOP_COND_GSI and return the
637 final gcond. */
638
639 static gcond *
640 vect_set_loop_condition_masked (struct loop *loop, loop_vec_info loop_vinfo,
641 tree niters, tree final_iv,
642 bool niters_maybe_zero,
643 gimple_stmt_iterator loop_cond_gsi)
644 {
645 gimple_seq preheader_seq = NULL;
646 gimple_seq header_seq = NULL;
647
648 tree compare_type = LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo);
649 unsigned int compare_precision = TYPE_PRECISION (compare_type);
650 tree orig_niters = niters;
651
652 /* Type of the initial value of NITERS. */
653 tree ni_actual_type = TREE_TYPE (niters);
654 unsigned int ni_actual_precision = TYPE_PRECISION (ni_actual_type);
655 tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
656
657 /* Convert NITERS to the same size as the compare. */
658 if (compare_precision > ni_actual_precision
659 && niters_maybe_zero)
660 {
661 /* We know that there is always at least one iteration, so if the
662 count is zero then it must have wrapped. Cope with this by
663 subtracting 1 before the conversion and adding 1 to the result. */
664 gcc_assert (TYPE_UNSIGNED (ni_actual_type));
665 niters = gimple_build (&preheader_seq, PLUS_EXPR, ni_actual_type,
666 niters, build_minus_one_cst (ni_actual_type));
667 niters = gimple_convert (&preheader_seq, compare_type, niters);
668 niters = gimple_build (&preheader_seq, PLUS_EXPR, compare_type,
669 niters, build_one_cst (compare_type));
670 }
671 else
672 niters = gimple_convert (&preheader_seq, compare_type, niters);
673
674 widest_int iv_limit = vect_iv_limit_for_full_masking (loop_vinfo);
675 /* Get the vectorization factor in tree form. */
676 tree vf = build_int_cst (compare_type,
677 LOOP_VINFO_VECT_FACTOR (loop_vinfo));
678
679 /* Iterate over all the rgroups and fill in their masks. We could use
680 the first mask from any rgroup for the loop condition; here we
681 arbitrarily pick the last. */
682 tree test_mask = NULL_TREE;
683 rgroup_masks *rgm;
684 unsigned int i;
685 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
686 FOR_EACH_VEC_ELT (*masks, i, rgm)
687 if (!rgm->masks.is_empty ())
688 {
689 /* First try using permutes. This adds a single vector
690 instruction to the loop for each mask, but needs no extra
691 loop invariants or IVs. */
692 unsigned int nmasks = i + 1;
693 if ((nmasks & 1) == 0)
694 {
695 rgroup_masks *half_rgm = &(*masks)[nmasks / 2 - 1];
696 if (!half_rgm->masks.is_empty ()
697 && vect_maybe_permute_loop_masks (&header_seq, rgm, half_rgm))
698 continue;
699 }
700
701 /* See whether zero-based IV would ever generate all-false masks
702 before wrapping around. */
703 bool might_wrap_p
704 = (iv_limit == -1
705 || (wi::min_precision (iv_limit * rgm->max_nscalars_per_iter,
706 UNSIGNED)
707 > compare_precision));
708
709 /* Set up all masks for this group. */
710 test_mask = vect_set_loop_masks_directly (loop, loop_vinfo,
711 &preheader_seq,
712 loop_cond_gsi, rgm, vf,
713 niters, niters_skip,
714 might_wrap_p);
715 }
716
717 /* Emit all accumulated statements. */
718 add_preheader_seq (loop, preheader_seq);
719 add_header_seq (loop, header_seq);
720
721 /* Get a boolean result that tells us whether to iterate. */
722 edge exit_edge = single_exit (loop);
723 tree_code code = (exit_edge->flags & EDGE_TRUE_VALUE) ? EQ_EXPR : NE_EXPR;
724 tree zero_mask = build_zero_cst (TREE_TYPE (test_mask));
725 gcond *cond_stmt = gimple_build_cond (code, test_mask, zero_mask,
726 NULL_TREE, NULL_TREE);
727 gsi_insert_before (&loop_cond_gsi, cond_stmt, GSI_SAME_STMT);
728
729 /* The loop iterates (NITERS - 1) / VF + 1 times.
730 Subtract one from this to get the latch count. */
731 tree step = build_int_cst (compare_type,
732 LOOP_VINFO_VECT_FACTOR (loop_vinfo));
733 tree niters_minus_one = fold_build2 (PLUS_EXPR, compare_type, niters,
734 build_minus_one_cst (compare_type));
735 loop->nb_iterations = fold_build2 (TRUNC_DIV_EXPR, compare_type,
736 niters_minus_one, step);
737
738 if (final_iv)
739 {
740 gassign *assign = gimple_build_assign (final_iv, orig_niters);
741 gsi_insert_on_edge_immediate (single_exit (loop), assign);
742 }
743
744 return cond_stmt;
745 }
746
747 /* Like vect_set_loop_condition, but handle the case in which there
748 are no loop masks. */
749
750 static gcond *
751 vect_set_loop_condition_unmasked (struct loop *loop, tree niters,
752 tree step, tree final_iv,
753 bool niters_maybe_zero,
754 gimple_stmt_iterator loop_cond_gsi)
755 {
756 tree indx_before_incr, indx_after_incr;
757 gcond *cond_stmt;
758 gcond *orig_cond;
759 edge pe = loop_preheader_edge (loop);
760 edge exit_edge = single_exit (loop);
761 gimple_stmt_iterator incr_gsi;
762 bool insert_after;
763 enum tree_code code;
764 tree niters_type = TREE_TYPE (niters);
765
766 orig_cond = get_loop_exit_condition (loop);
767 gcc_assert (orig_cond);
768 loop_cond_gsi = gsi_for_stmt (orig_cond);
769
770 tree init, limit;
771 if (!niters_maybe_zero && integer_onep (step))
772 {
773 /* In this case we can use a simple 0-based IV:
774
775 A:
776 x = 0;
777 do
778 {
779 ...
780 x += 1;
781 }
782 while (x < NITERS); */
783 code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GE_EXPR : LT_EXPR;
784 init = build_zero_cst (niters_type);
785 limit = niters;
786 }
787 else
788 {
789 /* The following works for all values of NITERS except 0:
790
791 B:
792 x = 0;
793 do
794 {
795 ...
796 x += STEP;
797 }
798 while (x <= NITERS - STEP);
799
800 so that the loop continues to iterate if x + STEP - 1 < NITERS
801 but stops if x + STEP - 1 >= NITERS.
802
803 However, if NITERS is zero, x never hits a value above NITERS - STEP
804 before wrapping around. There are two obvious ways of dealing with
805 this:
806
807 - start at STEP - 1 and compare x before incrementing it
808 - start at -1 and compare x after incrementing it
809
810 The latter is simpler and is what we use. The loop in this case
811 looks like:
812
813 C:
814 x = -1;
815 do
816 {
817 ...
818 x += STEP;
819 }
820 while (x < NITERS - STEP);
821
822 In both cases the loop limit is NITERS - STEP. */
823 gimple_seq seq = NULL;
824 limit = force_gimple_operand (niters, &seq, true, NULL_TREE);
825 limit = gimple_build (&seq, MINUS_EXPR, TREE_TYPE (limit), limit, step);
826 if (seq)
827 {
828 basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
829 gcc_assert (!new_bb);
830 }
831 if (niters_maybe_zero)
832 {
833 /* Case C. */
834 code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GE_EXPR : LT_EXPR;
835 init = build_all_ones_cst (niters_type);
836 }
837 else
838 {
839 /* Case B. */
840 code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GT_EXPR : LE_EXPR;
841 init = build_zero_cst (niters_type);
842 }
843 }
844
845 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
846 create_iv (init, step, NULL_TREE, loop,
847 &incr_gsi, insert_after, &indx_before_incr, &indx_after_incr);
848 indx_after_incr = force_gimple_operand_gsi (&loop_cond_gsi, indx_after_incr,
849 true, NULL_TREE, true,
850 GSI_SAME_STMT);
851 limit = force_gimple_operand_gsi (&loop_cond_gsi, limit, true, NULL_TREE,
852 true, GSI_SAME_STMT);
853
854 cond_stmt = gimple_build_cond (code, indx_after_incr, limit, NULL_TREE,
855 NULL_TREE);
856
857 gsi_insert_before (&loop_cond_gsi, cond_stmt, GSI_SAME_STMT);
858
859 /* Record the number of latch iterations. */
860 if (limit == niters)
861 /* Case A: the loop iterates NITERS times. Subtract one to get the
862 latch count. */
863 loop->nb_iterations = fold_build2 (MINUS_EXPR, niters_type, niters,
864 build_int_cst (niters_type, 1));
865 else
866 /* Case B or C: the loop iterates (NITERS - STEP) / STEP + 1 times.
867 Subtract one from this to get the latch count. */
868 loop->nb_iterations = fold_build2 (TRUNC_DIV_EXPR, niters_type,
869 limit, step);
870
871 if (final_iv)
872 {
873 gassign *assign = gimple_build_assign (final_iv, MINUS_EXPR,
874 indx_after_incr, init);
875 gsi_insert_on_edge_immediate (single_exit (loop), assign);
876 }
877
878 return cond_stmt;
879 }
880
881 /* If we're using fully-masked loops, make LOOP iterate:
882
883 N == (NITERS - 1) / STEP + 1
884
885 times. When NITERS is zero, this is equivalent to making the loop
886 execute (1 << M) / STEP times, where M is the precision of NITERS.
887 NITERS_MAYBE_ZERO is true if this last case might occur.
888
889 If we're not using fully-masked loops, make LOOP iterate:
890
891 N == (NITERS - STEP) / STEP + 1
892
893 times, where NITERS is known to be outside the range [1, STEP - 1].
894 This is equivalent to making the loop execute NITERS / STEP times
895 when NITERS is nonzero and (1 << M) / STEP times otherwise.
896 NITERS_MAYBE_ZERO again indicates whether this last case might occur.
897
898 If FINAL_IV is nonnull, it is an SSA name that should be set to
899 N * STEP on exit from the loop.
900
901 Assumption: the exit-condition of LOOP is the last stmt in the loop. */
902
903 void
904 vect_set_loop_condition (struct loop *loop, loop_vec_info loop_vinfo,
905 tree niters, tree step, tree final_iv,
906 bool niters_maybe_zero)
907 {
908 gcond *cond_stmt;
909 gcond *orig_cond = get_loop_exit_condition (loop);
910 gimple_stmt_iterator loop_cond_gsi = gsi_for_stmt (orig_cond);
911
912 if (loop_vinfo && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
913 cond_stmt = vect_set_loop_condition_masked (loop, loop_vinfo, niters,
914 final_iv, niters_maybe_zero,
915 loop_cond_gsi);
916 else
917 cond_stmt = vect_set_loop_condition_unmasked (loop, niters, step,
918 final_iv, niters_maybe_zero,
919 loop_cond_gsi);
920
921 /* Remove old loop exit test. */
922 stmt_vec_info orig_cond_info;
923 if (loop_vinfo
924 && (orig_cond_info = loop_vinfo->lookup_stmt (orig_cond)))
925 loop_vinfo->remove_stmt (orig_cond_info);
926 else
927 gsi_remove (&loop_cond_gsi, true);
928
929 if (dump_enabled_p ())
930 dump_printf_loc (MSG_NOTE, vect_location, "New loop exit condition: %G",
931 cond_stmt);
932 }
933
934 /* Helper routine of slpeel_tree_duplicate_loop_to_edge_cfg.
935 For all PHI arguments in FROM->dest and TO->dest from those
936 edges ensure that TO->dest PHI arguments have current_def
937 to that in from. */
938
939 static void
940 slpeel_duplicate_current_defs_from_edges (edge from, edge to)
941 {
942 gimple_stmt_iterator gsi_from, gsi_to;
943
944 for (gsi_from = gsi_start_phis (from->dest),
945 gsi_to = gsi_start_phis (to->dest);
946 !gsi_end_p (gsi_from) && !gsi_end_p (gsi_to);)
947 {
948 gimple *from_phi = gsi_stmt (gsi_from);
949 gimple *to_phi = gsi_stmt (gsi_to);
950 tree from_arg = PHI_ARG_DEF_FROM_EDGE (from_phi, from);
951 tree to_arg = PHI_ARG_DEF_FROM_EDGE (to_phi, to);
952 if (virtual_operand_p (from_arg))
953 {
954 gsi_next (&gsi_from);
955 continue;
956 }
957 if (virtual_operand_p (to_arg))
958 {
959 gsi_next (&gsi_to);
960 continue;
961 }
962 if (TREE_CODE (from_arg) != SSA_NAME)
963 gcc_assert (operand_equal_p (from_arg, to_arg, 0));
964 else if (TREE_CODE (to_arg) == SSA_NAME
965 && from_arg != to_arg)
966 {
967 if (get_current_def (to_arg) == NULL_TREE)
968 {
969 gcc_assert (types_compatible_p (TREE_TYPE (to_arg),
970 TREE_TYPE (get_current_def
971 (from_arg))));
972 set_current_def (to_arg, get_current_def (from_arg));
973 }
974 }
975 gsi_next (&gsi_from);
976 gsi_next (&gsi_to);
977 }
978
979 gphi *from_phi = get_virtual_phi (from->dest);
980 gphi *to_phi = get_virtual_phi (to->dest);
981 if (from_phi)
982 set_current_def (PHI_ARG_DEF_FROM_EDGE (to_phi, to),
983 get_current_def (PHI_ARG_DEF_FROM_EDGE (from_phi, from)));
984 }
985
986
987 /* Given LOOP this function generates a new copy of it and puts it
988 on E which is either the entry or exit of LOOP. If SCALAR_LOOP is
989 non-NULL, assume LOOP and SCALAR_LOOP are equivalent and copy the
990 basic blocks from SCALAR_LOOP instead of LOOP, but to either the
991 entry or exit of LOOP. */
992
993 struct loop *
994 slpeel_tree_duplicate_loop_to_edge_cfg (struct loop *loop,
995 struct loop *scalar_loop, edge e)
996 {
997 struct loop *new_loop;
998 basic_block *new_bbs, *bbs, *pbbs;
999 bool at_exit;
1000 bool was_imm_dom;
1001 basic_block exit_dest;
1002 edge exit, new_exit;
1003 bool duplicate_outer_loop = false;
1004
1005 exit = single_exit (loop);
1006 at_exit = (e == exit);
1007 if (!at_exit && e != loop_preheader_edge (loop))
1008 return NULL;
1009
1010 if (scalar_loop == NULL)
1011 scalar_loop = loop;
1012
1013 bbs = XNEWVEC (basic_block, scalar_loop->num_nodes + 1);
1014 pbbs = bbs + 1;
1015 get_loop_body_with_size (scalar_loop, pbbs, scalar_loop->num_nodes);
1016 /* Allow duplication of outer loops. */
1017 if (scalar_loop->inner)
1018 duplicate_outer_loop = true;
1019 /* Check whether duplication is possible. */
1020 if (!can_copy_bbs_p (pbbs, scalar_loop->num_nodes))
1021 {
1022 free (bbs);
1023 return NULL;
1024 }
1025
1026 /* Generate new loop structure. */
1027 new_loop = duplicate_loop (scalar_loop, loop_outer (scalar_loop));
1028 duplicate_subloops (scalar_loop, new_loop);
1029
1030 exit_dest = exit->dest;
1031 was_imm_dom = (get_immediate_dominator (CDI_DOMINATORS,
1032 exit_dest) == loop->header ?
1033 true : false);
1034
1035 /* Also copy the pre-header, this avoids jumping through hoops to
1036 duplicate the loop entry PHI arguments. Create an empty
1037 pre-header unconditionally for this. */
1038 basic_block preheader = split_edge (loop_preheader_edge (scalar_loop));
1039 edge entry_e = single_pred_edge (preheader);
1040 bbs[0] = preheader;
1041 new_bbs = XNEWVEC (basic_block, scalar_loop->num_nodes + 1);
1042
1043 exit = single_exit (scalar_loop);
1044 copy_bbs (bbs, scalar_loop->num_nodes + 1, new_bbs,
1045 &exit, 1, &new_exit, NULL,
1046 at_exit ? loop->latch : e->src, true);
1047 exit = single_exit (loop);
1048 basic_block new_preheader = new_bbs[0];
1049
1050 add_phi_args_after_copy (new_bbs, scalar_loop->num_nodes + 1, NULL);
1051
1052 if (scalar_loop != loop)
1053 {
1054 /* If we copied from SCALAR_LOOP rather than LOOP, SSA_NAMEs from
1055 SCALAR_LOOP will have current_def set to SSA_NAMEs in the new_loop,
1056 but LOOP will not. slpeel_update_phi_nodes_for_guard{1,2} expects
1057 the LOOP SSA_NAMEs (on the exit edge and edge from latch to
1058 header) to have current_def set, so copy them over. */
1059 slpeel_duplicate_current_defs_from_edges (single_exit (scalar_loop),
1060 exit);
1061 slpeel_duplicate_current_defs_from_edges (EDGE_SUCC (scalar_loop->latch,
1062 0),
1063 EDGE_SUCC (loop->latch, 0));
1064 }
1065
1066 if (at_exit) /* Add the loop copy at exit. */
1067 {
1068 if (scalar_loop != loop)
1069 {
1070 gphi_iterator gsi;
1071 new_exit = redirect_edge_and_branch (new_exit, exit_dest);
1072
1073 for (gsi = gsi_start_phis (exit_dest); !gsi_end_p (gsi);
1074 gsi_next (&gsi))
1075 {
1076 gphi *phi = gsi.phi ();
1077 tree orig_arg = PHI_ARG_DEF_FROM_EDGE (phi, e);
1078 location_t orig_locus
1079 = gimple_phi_arg_location_from_edge (phi, e);
1080
1081 add_phi_arg (phi, orig_arg, new_exit, orig_locus);
1082 }
1083 }
1084 redirect_edge_and_branch_force (e, new_preheader);
1085 flush_pending_stmts (e);
1086 set_immediate_dominator (CDI_DOMINATORS, new_preheader, e->src);
1087 if (was_imm_dom || duplicate_outer_loop)
1088 set_immediate_dominator (CDI_DOMINATORS, exit_dest, new_exit->src);
1089
1090 /* And remove the non-necessary forwarder again. Keep the other
1091 one so we have a proper pre-header for the loop at the exit edge. */
1092 redirect_edge_pred (single_succ_edge (preheader),
1093 single_pred (preheader));
1094 delete_basic_block (preheader);
1095 set_immediate_dominator (CDI_DOMINATORS, scalar_loop->header,
1096 loop_preheader_edge (scalar_loop)->src);
1097 }
1098 else /* Add the copy at entry. */
1099 {
1100 if (scalar_loop != loop)
1101 {
1102 /* Remove the non-necessary forwarder of scalar_loop again. */
1103 redirect_edge_pred (single_succ_edge (preheader),
1104 single_pred (preheader));
1105 delete_basic_block (preheader);
1106 set_immediate_dominator (CDI_DOMINATORS, scalar_loop->header,
1107 loop_preheader_edge (scalar_loop)->src);
1108 preheader = split_edge (loop_preheader_edge (loop));
1109 entry_e = single_pred_edge (preheader);
1110 }
1111
1112 redirect_edge_and_branch_force (entry_e, new_preheader);
1113 flush_pending_stmts (entry_e);
1114 set_immediate_dominator (CDI_DOMINATORS, new_preheader, entry_e->src);
1115
1116 redirect_edge_and_branch_force (new_exit, preheader);
1117 flush_pending_stmts (new_exit);
1118 set_immediate_dominator (CDI_DOMINATORS, preheader, new_exit->src);
1119
1120 /* And remove the non-necessary forwarder again. Keep the other
1121 one so we have a proper pre-header for the loop at the exit edge. */
1122 redirect_edge_pred (single_succ_edge (new_preheader),
1123 single_pred (new_preheader));
1124 delete_basic_block (new_preheader);
1125 set_immediate_dominator (CDI_DOMINATORS, new_loop->header,
1126 loop_preheader_edge (new_loop)->src);
1127 }
1128
1129 /* Skip new preheader since it's deleted if copy loop is added at entry. */
1130 for (unsigned i = (at_exit ? 0 : 1); i < scalar_loop->num_nodes + 1; i++)
1131 rename_variables_in_bb (new_bbs[i], duplicate_outer_loop);
1132
1133 if (scalar_loop != loop)
1134 {
1135 /* Update new_loop->header PHIs, so that on the preheader
1136 edge they are the ones from loop rather than scalar_loop. */
1137 gphi_iterator gsi_orig, gsi_new;
1138 edge orig_e = loop_preheader_edge (loop);
1139 edge new_e = loop_preheader_edge (new_loop);
1140
1141 for (gsi_orig = gsi_start_phis (loop->header),
1142 gsi_new = gsi_start_phis (new_loop->header);
1143 !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_new);
1144 gsi_next (&gsi_orig), gsi_next (&gsi_new))
1145 {
1146 gphi *orig_phi = gsi_orig.phi ();
1147 gphi *new_phi = gsi_new.phi ();
1148 tree orig_arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, orig_e);
1149 location_t orig_locus
1150 = gimple_phi_arg_location_from_edge (orig_phi, orig_e);
1151
1152 add_phi_arg (new_phi, orig_arg, new_e, orig_locus);
1153 }
1154 }
1155
1156 free (new_bbs);
1157 free (bbs);
1158
1159 checking_verify_dominators (CDI_DOMINATORS);
1160
1161 return new_loop;
1162 }
1163
1164
1165 /* Given the condition expression COND, put it as the last statement of
1166 GUARD_BB; set both edges' probability; set dominator of GUARD_TO to
1167 DOM_BB; return the skip edge. GUARD_TO is the target basic block to
1168 skip the loop. PROBABILITY is the skip edge's probability. Mark the
1169 new edge as irreducible if IRREDUCIBLE_P is true. */
1170
1171 static edge
1172 slpeel_add_loop_guard (basic_block guard_bb, tree cond,
1173 basic_block guard_to, basic_block dom_bb,
1174 profile_probability probability, bool irreducible_p)
1175 {
1176 gimple_stmt_iterator gsi;
1177 edge new_e, enter_e;
1178 gcond *cond_stmt;
1179 gimple_seq gimplify_stmt_list = NULL;
1180
1181 enter_e = EDGE_SUCC (guard_bb, 0);
1182 enter_e->flags &= ~EDGE_FALLTHRU;
1183 enter_e->flags |= EDGE_FALSE_VALUE;
1184 gsi = gsi_last_bb (guard_bb);
1185
1186 cond = force_gimple_operand_1 (cond, &gimplify_stmt_list, is_gimple_condexpr,
1187 NULL_TREE);
1188 if (gimplify_stmt_list)
1189 gsi_insert_seq_after (&gsi, gimplify_stmt_list, GSI_NEW_STMT);
1190
1191 cond_stmt = gimple_build_cond_from_tree (cond, NULL_TREE, NULL_TREE);
1192 gsi = gsi_last_bb (guard_bb);
1193 gsi_insert_after (&gsi, cond_stmt, GSI_NEW_STMT);
1194
1195 /* Add new edge to connect guard block to the merge/loop-exit block. */
1196 new_e = make_edge (guard_bb, guard_to, EDGE_TRUE_VALUE);
1197
1198 new_e->probability = probability;
1199 if (irreducible_p)
1200 new_e->flags |= EDGE_IRREDUCIBLE_LOOP;
1201
1202 enter_e->probability = probability.invert ();
1203 set_immediate_dominator (CDI_DOMINATORS, guard_to, dom_bb);
1204
1205 /* Split enter_e to preserve LOOPS_HAVE_PREHEADERS. */
1206 if (enter_e->dest->loop_father->header == enter_e->dest)
1207 split_edge (enter_e);
1208
1209 return new_e;
1210 }
1211
1212
1213 /* This function verifies that the following restrictions apply to LOOP:
1214 (1) it consists of exactly 2 basic blocks - header, and an empty latch
1215 for innermost loop and 5 basic blocks for outer-loop.
1216 (2) it is single entry, single exit
1217 (3) its exit condition is the last stmt in the header
1218 (4) E is the entry/exit edge of LOOP.
1219 */
1220
1221 bool
1222 slpeel_can_duplicate_loop_p (const struct loop *loop, const_edge e)
1223 {
1224 edge exit_e = single_exit (loop);
1225 edge entry_e = loop_preheader_edge (loop);
1226 gcond *orig_cond = get_loop_exit_condition (loop);
1227 gimple_stmt_iterator loop_exit_gsi = gsi_last_bb (exit_e->src);
1228 unsigned int num_bb = loop->inner? 5 : 2;
1229
1230 /* All loops have an outer scope; the only case loop->outer is NULL is for
1231 the function itself. */
1232 if (!loop_outer (loop)
1233 || loop->num_nodes != num_bb
1234 || !empty_block_p (loop->latch)
1235 || !single_exit (loop)
1236 /* Verify that new loop exit condition can be trivially modified. */
1237 || (!orig_cond || orig_cond != gsi_stmt (loop_exit_gsi))
1238 || (e != exit_e && e != entry_e))
1239 return false;
1240
1241 return true;
1242 }
1243
1244 /* If the loop has a virtual PHI, but exit bb doesn't, create a virtual PHI
1245 in the exit bb and rename all the uses after the loop. This simplifies
1246 the *guard[12] routines, which assume loop closed SSA form for all PHIs
1247 (but normally loop closed SSA form doesn't require virtual PHIs to be
1248 in the same form). Doing this early simplifies the checking what
1249 uses should be renamed. */
1250
1251 static void
1252 create_lcssa_for_virtual_phi (struct loop *loop)
1253 {
1254 gphi_iterator gsi;
1255 edge exit_e = single_exit (loop);
1256
1257 for (gsi = gsi_start_phis (loop->header); !gsi_end_p (gsi); gsi_next (&gsi))
1258 if (virtual_operand_p (gimple_phi_result (gsi_stmt (gsi))))
1259 {
1260 gphi *phi = gsi.phi ();
1261 for (gsi = gsi_start_phis (exit_e->dest);
1262 !gsi_end_p (gsi); gsi_next (&gsi))
1263 if (virtual_operand_p (gimple_phi_result (gsi_stmt (gsi))))
1264 break;
1265 if (gsi_end_p (gsi))
1266 {
1267 tree new_vop = copy_ssa_name (PHI_RESULT (phi));
1268 gphi *new_phi = create_phi_node (new_vop, exit_e->dest);
1269 tree vop = PHI_ARG_DEF_FROM_EDGE (phi, EDGE_SUCC (loop->latch, 0));
1270 imm_use_iterator imm_iter;
1271 gimple *stmt;
1272 use_operand_p use_p;
1273
1274 SSA_NAME_OCCURS_IN_ABNORMAL_PHI (new_vop)
1275 = SSA_NAME_OCCURS_IN_ABNORMAL_PHI (vop);
1276 add_phi_arg (new_phi, vop, exit_e, UNKNOWN_LOCATION);
1277 gimple_phi_set_result (new_phi, new_vop);
1278 FOR_EACH_IMM_USE_STMT (stmt, imm_iter, vop)
1279 if (stmt != new_phi
1280 && !flow_bb_inside_loop_p (loop, gimple_bb (stmt)))
1281 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
1282 SET_USE (use_p, new_vop);
1283 }
1284 break;
1285 }
1286
1287 }
1288
1289 /* Function vect_get_loop_location.
1290
1291 Extract the location of the loop in the source code.
1292 If the loop is not well formed for vectorization, an estimated
1293 location is calculated.
1294 Return the loop location if succeed and NULL if not. */
1295
1296 dump_user_location_t
1297 find_loop_location (struct loop *loop)
1298 {
1299 gimple *stmt = NULL;
1300 basic_block bb;
1301 gimple_stmt_iterator si;
1302
1303 if (!loop)
1304 return dump_user_location_t ();
1305
1306 stmt = get_loop_exit_condition (loop);
1307
1308 if (stmt
1309 && LOCATION_LOCUS (gimple_location (stmt)) > BUILTINS_LOCATION)
1310 return stmt;
1311
1312 /* If we got here the loop is probably not "well formed",
1313 try to estimate the loop location */
1314
1315 if (!loop->header)
1316 return dump_user_location_t ();
1317
1318 bb = loop->header;
1319
1320 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1321 {
1322 stmt = gsi_stmt (si);
1323 if (LOCATION_LOCUS (gimple_location (stmt)) > BUILTINS_LOCATION)
1324 return stmt;
1325 }
1326
1327 return dump_user_location_t ();
1328 }
1329
1330 /* Return true if the phi described by STMT_INFO defines an IV of the
1331 loop to be vectorized. */
1332
1333 static bool
1334 iv_phi_p (stmt_vec_info stmt_info)
1335 {
1336 gphi *phi = as_a <gphi *> (stmt_info->stmt);
1337 if (virtual_operand_p (PHI_RESULT (phi)))
1338 return false;
1339
1340 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1341 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
1342 return false;
1343
1344 return true;
1345 }
1346
1347 /* Function vect_can_advance_ivs_p
1348
1349 In case the number of iterations that LOOP iterates is unknown at compile
1350 time, an epilog loop will be generated, and the loop induction variables
1351 (IVs) will be "advanced" to the value they are supposed to take just before
1352 the epilog loop. Here we check that the access function of the loop IVs
1353 and the expression that represents the loop bound are simple enough.
1354 These restrictions will be relaxed in the future. */
1355
1356 bool
1357 vect_can_advance_ivs_p (loop_vec_info loop_vinfo)
1358 {
1359 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1360 basic_block bb = loop->header;
1361 gphi_iterator gsi;
1362
1363 /* Analyze phi functions of the loop header. */
1364
1365 if (dump_enabled_p ())
1366 dump_printf_loc (MSG_NOTE, vect_location, "vect_can_advance_ivs_p:\n");
1367 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
1368 {
1369 tree evolution_part;
1370
1371 gphi *phi = gsi.phi ();
1372 stmt_vec_info phi_info = loop_vinfo->lookup_stmt (phi);
1373 if (dump_enabled_p ())
1374 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
1375 phi_info->stmt);
1376
1377 /* Skip virtual phi's. The data dependences that are associated with
1378 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.
1379
1380 Skip reduction phis. */
1381 if (!iv_phi_p (phi_info))
1382 {
1383 if (dump_enabled_p ())
1384 dump_printf_loc (MSG_NOTE, vect_location,
1385 "reduc or virtual phi. skip.\n");
1386 continue;
1387 }
1388
1389 /* Analyze the evolution function. */
1390
1391 evolution_part = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
1392 if (evolution_part == NULL_TREE)
1393 {
1394 if (dump_enabled_p ())
1395 dump_printf (MSG_MISSED_OPTIMIZATION,
1396 "No access function or evolution.\n");
1397 return false;
1398 }
1399
1400 /* FORNOW: We do not transform initial conditions of IVs
1401 which evolution functions are not invariants in the loop. */
1402
1403 if (!expr_invariant_in_loop_p (loop, evolution_part))
1404 {
1405 if (dump_enabled_p ())
1406 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1407 "evolution not invariant in loop.\n");
1408 return false;
1409 }
1410
1411 /* FORNOW: We do not transform initial conditions of IVs
1412 which evolution functions are a polynomial of degree >= 2. */
1413
1414 if (tree_is_chrec (evolution_part))
1415 {
1416 if (dump_enabled_p ())
1417 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1418 "evolution is chrec.\n");
1419 return false;
1420 }
1421 }
1422
1423 return true;
1424 }
1425
1426
1427 /* Function vect_update_ivs_after_vectorizer.
1428
1429 "Advance" the induction variables of LOOP to the value they should take
1430 after the execution of LOOP. This is currently necessary because the
1431 vectorizer does not handle induction variables that are used after the
1432 loop. Such a situation occurs when the last iterations of LOOP are
1433 peeled, because:
1434 1. We introduced new uses after LOOP for IVs that were not originally used
1435 after LOOP: the IVs of LOOP are now used by an epilog loop.
1436 2. LOOP is going to be vectorized; this means that it will iterate N/VF
1437 times, whereas the loop IVs should be bumped N times.
1438
1439 Input:
1440 - LOOP - a loop that is going to be vectorized. The last few iterations
1441 of LOOP were peeled.
1442 - NITERS - the number of iterations that LOOP executes (before it is
1443 vectorized). i.e, the number of times the ivs should be bumped.
1444 - UPDATE_E - a successor edge of LOOP->exit that is on the (only) path
1445 coming out from LOOP on which there are uses of the LOOP ivs
1446 (this is the path from LOOP->exit to epilog_loop->preheader).
1447
1448 The new definitions of the ivs are placed in LOOP->exit.
1449 The phi args associated with the edge UPDATE_E in the bb
1450 UPDATE_E->dest are updated accordingly.
1451
1452 Assumption 1: Like the rest of the vectorizer, this function assumes
1453 a single loop exit that has a single predecessor.
1454
1455 Assumption 2: The phi nodes in the LOOP header and in update_bb are
1456 organized in the same order.
1457
1458 Assumption 3: The access function of the ivs is simple enough (see
1459 vect_can_advance_ivs_p). This assumption will be relaxed in the future.
1460
1461 Assumption 4: Exactly one of the successors of LOOP exit-bb is on a path
1462 coming out of LOOP on which the ivs of LOOP are used (this is the path
1463 that leads to the epilog loop; other paths skip the epilog loop). This
1464 path starts with the edge UPDATE_E, and its destination (denoted update_bb)
1465 needs to have its phis updated.
1466 */
1467
1468 static void
1469 vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo,
1470 tree niters, edge update_e)
1471 {
1472 gphi_iterator gsi, gsi1;
1473 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1474 basic_block update_bb = update_e->dest;
1475 basic_block exit_bb = single_exit (loop)->dest;
1476
1477 /* Make sure there exists a single-predecessor exit bb: */
1478 gcc_assert (single_pred_p (exit_bb));
1479 gcc_assert (single_succ_edge (exit_bb) == update_e);
1480
1481 for (gsi = gsi_start_phis (loop->header), gsi1 = gsi_start_phis (update_bb);
1482 !gsi_end_p (gsi) && !gsi_end_p (gsi1);
1483 gsi_next (&gsi), gsi_next (&gsi1))
1484 {
1485 tree init_expr;
1486 tree step_expr, off;
1487 tree type;
1488 tree var, ni, ni_name;
1489 gimple_stmt_iterator last_gsi;
1490
1491 gphi *phi = gsi.phi ();
1492 gphi *phi1 = gsi1.phi ();
1493 stmt_vec_info phi_info = loop_vinfo->lookup_stmt (phi);
1494 if (dump_enabled_p ())
1495 dump_printf_loc (MSG_NOTE, vect_location,
1496 "vect_update_ivs_after_vectorizer: phi: %G", phi);
1497
1498 /* Skip reduction and virtual phis. */
1499 if (!iv_phi_p (phi_info))
1500 {
1501 if (dump_enabled_p ())
1502 dump_printf_loc (MSG_NOTE, vect_location,
1503 "reduc or virtual phi. skip.\n");
1504 continue;
1505 }
1506
1507 type = TREE_TYPE (gimple_phi_result (phi));
1508 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
1509 step_expr = unshare_expr (step_expr);
1510
1511 /* FORNOW: We do not support IVs whose evolution function is a polynomial
1512 of degree >= 2 or exponential. */
1513 gcc_assert (!tree_is_chrec (step_expr));
1514
1515 init_expr = PHI_ARG_DEF_FROM_EDGE (phi, loop_preheader_edge (loop));
1516
1517 off = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
1518 fold_convert (TREE_TYPE (step_expr), niters),
1519 step_expr);
1520 if (POINTER_TYPE_P (type))
1521 ni = fold_build_pointer_plus (init_expr, off);
1522 else
1523 ni = fold_build2 (PLUS_EXPR, type,
1524 init_expr, fold_convert (type, off));
1525
1526 var = create_tmp_var (type, "tmp");
1527
1528 last_gsi = gsi_last_bb (exit_bb);
1529 gimple_seq new_stmts = NULL;
1530 ni_name = force_gimple_operand (ni, &new_stmts, false, var);
1531 /* Exit_bb shouldn't be empty. */
1532 if (!gsi_end_p (last_gsi))
1533 gsi_insert_seq_after (&last_gsi, new_stmts, GSI_SAME_STMT);
1534 else
1535 gsi_insert_seq_before (&last_gsi, new_stmts, GSI_SAME_STMT);
1536
1537 /* Fix phi expressions in the successor bb. */
1538 adjust_phi_and_debug_stmts (phi1, update_e, ni_name);
1539 }
1540 }
1541
1542 /* Return a gimple value containing the misalignment (measured in vector
1543 elements) for the loop described by LOOP_VINFO, i.e. how many elements
1544 it is away from a perfectly aligned address. Add any new statements
1545 to SEQ. */
1546
1547 static tree
1548 get_misalign_in_elems (gimple **seq, loop_vec_info loop_vinfo)
1549 {
1550 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
1551 stmt_vec_info stmt_info = dr_info->stmt;
1552 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1553
1554 poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr_info);
1555 unsigned HOST_WIDE_INT target_align_c;
1556 tree target_align_minus_1;
1557
1558 bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr),
1559 size_zero_node) < 0;
1560 tree offset = (negative
1561 ? size_int (-TYPE_VECTOR_SUBPARTS (vectype) + 1)
1562 : size_zero_node);
1563 tree start_addr = vect_create_addr_base_for_vector_ref (stmt_info, seq,
1564 offset);
1565 tree type = unsigned_type_for (TREE_TYPE (start_addr));
1566 if (target_align.is_constant (&target_align_c))
1567 target_align_minus_1 = build_int_cst (type, target_align_c - 1);
1568 else
1569 {
1570 tree vla = build_int_cst (type, target_align);
1571 tree vla_align = fold_build2 (BIT_AND_EXPR, type, vla,
1572 fold_build2 (MINUS_EXPR, type,
1573 build_int_cst (type, 0), vla));
1574 target_align_minus_1 = fold_build2 (MINUS_EXPR, type, vla_align,
1575 build_int_cst (type, 1));
1576 }
1577
1578 HOST_WIDE_INT elem_size
1579 = int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1580 tree elem_size_log = build_int_cst (type, exact_log2 (elem_size));
1581
1582 /* Create: misalign_in_bytes = addr & (target_align - 1). */
1583 tree int_start_addr = fold_convert (type, start_addr);
1584 tree misalign_in_bytes = fold_build2 (BIT_AND_EXPR, type, int_start_addr,
1585 target_align_minus_1);
1586
1587 /* Create: misalign_in_elems = misalign_in_bytes / element_size. */
1588 tree misalign_in_elems = fold_build2 (RSHIFT_EXPR, type, misalign_in_bytes,
1589 elem_size_log);
1590
1591 return misalign_in_elems;
1592 }
1593
1594 /* Function vect_gen_prolog_loop_niters
1595
1596 Generate the number of iterations which should be peeled as prolog for the
1597 loop represented by LOOP_VINFO. It is calculated as the misalignment of
1598 DR - the data reference recorded in LOOP_VINFO_UNALIGNED_DR (LOOP_VINFO).
1599 As a result, after the execution of this loop, the data reference DR will
1600 refer to an aligned location. The following computation is generated:
1601
1602 If the misalignment of DR is known at compile time:
1603 addr_mis = int mis = DR_MISALIGNMENT (dr);
1604 Else, compute address misalignment in bytes:
1605 addr_mis = addr & (target_align - 1)
1606
1607 prolog_niters = ((VF - addr_mis/elem_size)&(VF-1))/step
1608
1609 (elem_size = element type size; an element is the scalar element whose type
1610 is the inner type of the vectype)
1611
1612 The computations will be emitted at the end of BB. We also compute and
1613 store upper bound (included) of the result in BOUND.
1614
1615 When the step of the data-ref in the loop is not 1 (as in interleaved data
1616 and SLP), the number of iterations of the prolog must be divided by the step
1617 (which is equal to the size of interleaved group).
1618
1619 The above formulas assume that VF == number of elements in the vector. This
1620 may not hold when there are multiple-types in the loop.
1621 In this case, for some data-references in the loop the VF does not represent
1622 the number of elements that fit in the vector. Therefore, instead of VF we
1623 use TYPE_VECTOR_SUBPARTS. */
1624
1625 static tree
1626 vect_gen_prolog_loop_niters (loop_vec_info loop_vinfo,
1627 basic_block bb, int *bound)
1628 {
1629 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
1630 tree var;
1631 tree niters_type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
1632 gimple_seq stmts = NULL, new_stmts = NULL;
1633 tree iters, iters_name;
1634 stmt_vec_info stmt_info = dr_info->stmt;
1635 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1636 poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr_info);
1637
1638 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
1639 {
1640 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1641
1642 if (dump_enabled_p ())
1643 dump_printf_loc (MSG_NOTE, vect_location,
1644 "known peeling = %d.\n", npeel);
1645
1646 iters = build_int_cst (niters_type, npeel);
1647 *bound = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1648 }
1649 else
1650 {
1651 tree misalign_in_elems = get_misalign_in_elems (&stmts, loop_vinfo);
1652 tree type = TREE_TYPE (misalign_in_elems);
1653 HOST_WIDE_INT elem_size
1654 = int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1655 /* We only do prolog peeling if the target alignment is known at compile
1656 time. */
1657 poly_uint64 align_in_elems =
1658 exact_div (target_align, elem_size);
1659 tree align_in_elems_minus_1 =
1660 build_int_cst (type, align_in_elems - 1);
1661 tree align_in_elems_tree = build_int_cst (type, align_in_elems);
1662
1663 /* Create: (niters_type) ((align_in_elems - misalign_in_elems)
1664 & (align_in_elems - 1)). */
1665 bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr),
1666 size_zero_node) < 0;
1667 if (negative)
1668 iters = fold_build2 (MINUS_EXPR, type, misalign_in_elems,
1669 align_in_elems_tree);
1670 else
1671 iters = fold_build2 (MINUS_EXPR, type, align_in_elems_tree,
1672 misalign_in_elems);
1673 iters = fold_build2 (BIT_AND_EXPR, type, iters, align_in_elems_minus_1);
1674 iters = fold_convert (niters_type, iters);
1675 unsigned HOST_WIDE_INT align_in_elems_c;
1676 if (align_in_elems.is_constant (&align_in_elems_c))
1677 *bound = align_in_elems_c - 1;
1678 else
1679 *bound = -1;
1680 }
1681
1682 if (dump_enabled_p ())
1683 dump_printf_loc (MSG_NOTE, vect_location,
1684 "niters for prolog loop: %T\n", iters);
1685
1686 var = create_tmp_var (niters_type, "prolog_loop_niters");
1687 iters_name = force_gimple_operand (iters, &new_stmts, false, var);
1688
1689 if (new_stmts)
1690 gimple_seq_add_seq (&stmts, new_stmts);
1691 if (stmts)
1692 {
1693 gcc_assert (single_succ_p (bb));
1694 gimple_stmt_iterator gsi = gsi_last_bb (bb);
1695 if (gsi_end_p (gsi))
1696 gsi_insert_seq_before (&gsi, stmts, GSI_SAME_STMT);
1697 else
1698 gsi_insert_seq_after (&gsi, stmts, GSI_SAME_STMT);
1699 }
1700 return iters_name;
1701 }
1702
1703
1704 /* Function vect_update_init_of_dr
1705
1706 If CODE is PLUS, the vector loop starts NITERS iterations after the
1707 scalar one, otherwise CODE is MINUS and the vector loop starts NITERS
1708 iterations before the scalar one (using masking to skip inactive
1709 elements). This function updates the information recorded in DR to
1710 account for the difference. Specifically, it updates the OFFSET
1711 field of DR. */
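/* For instance (purely as an illustration): with CODE == PLUS_EXPR,
   NITERS == 3 and DR_STEP == 4 bytes, DR_OFFSET is increased by
   3 * 4 == 12 bytes, so that DR describes the first access made by the
   vector loop.  */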
1712
1713 static void
1714 vect_update_init_of_dr (struct data_reference *dr, tree niters, tree_code code)
1715 {
1716 tree offset = DR_OFFSET (dr);
1717
1718 niters = fold_build2 (MULT_EXPR, sizetype,
1719 fold_convert (sizetype, niters),
1720 fold_convert (sizetype, DR_STEP (dr)));
1721 offset = fold_build2 (code, sizetype,
1722 fold_convert (sizetype, offset), niters);
1723 DR_OFFSET (dr) = offset;
1724 }
1725
1726
1727 /* Function vect_update_inits_of_drs
1728
1729 Apply vect_update_init_of_dr to all accesses in LOOP_VINFO.
1730 CODE and NITERS are as for vect_update_init_of_dr.  */
1731
1732 static void
1733 vect_update_inits_of_drs (loop_vec_info loop_vinfo, tree niters,
1734 tree_code code)
1735 {
1736 unsigned int i;
1737 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1738 struct data_reference *dr;
1739
1740 DUMP_VECT_SCOPE ("vect_update_inits_of_dr");
1741
1742 /* Adjust niters to sizetype and insert stmts on loop preheader edge. */
1743 if (!types_compatible_p (sizetype, TREE_TYPE (niters)))
1744 {
1745 gimple_seq seq;
1746 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1747 tree var = create_tmp_var (sizetype, "prolog_loop_adjusted_niters");
1748
1749 niters = fold_convert (sizetype, niters);
1750 niters = force_gimple_operand (niters, &seq, false, var);
1751 if (seq)
1752 {
1753 basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
1754 gcc_assert (!new_bb);
1755 }
1756 }
1757
1758 FOR_EACH_VEC_ELT (datarefs, i, dr)
1759 {
1760 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1761 if (!STMT_VINFO_GATHER_SCATTER_P (dr_info->stmt))
1762 vect_update_init_of_dr (dr, niters, code);
1763 }
1764 }
1765
1766 /* For the information recorded in LOOP_VINFO prepare the loop for peeling
1767 by masking. This involves calculating the number of iterations to
1768 be peeled and then aligning all memory references appropriately. */
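/* As an illustration: with VF == 8 and a known peeling amount of 3, the
   misalignment is 8 - 3 == 5 elements; the data references are moved back
   by 5 elements and the first 5 lanes of the first iteration are skipped
   via the loop mask.  */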
1769
1770 void
1771 vect_prepare_for_masked_peels (loop_vec_info loop_vinfo)
1772 {
1773 tree misalign_in_elems;
1774 tree type = LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo);
1775
1776 gcc_assert (vect_use_loop_mask_for_alignment_p (loop_vinfo));
1777
1778 /* From the information recorded in LOOP_VINFO get the number of iterations
1779 that need to be skipped via masking. */
1780 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
1781 {
1782 poly_int64 misalign = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
1783 - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo));
1784 misalign_in_elems = build_int_cst (type, misalign);
1785 }
1786 else
1787 {
1788 gimple_seq seq1 = NULL, seq2 = NULL;
1789 misalign_in_elems = get_misalign_in_elems (&seq1, loop_vinfo);
1790 misalign_in_elems = fold_convert (type, misalign_in_elems);
1791 misalign_in_elems = force_gimple_operand (misalign_in_elems,
1792 &seq2, true, NULL_TREE);
1793 gimple_seq_add_seq (&seq1, seq2);
1794 if (seq1)
1795 {
1796 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1797 basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq1);
1798 gcc_assert (!new_bb);
1799 }
1800 }
1801
1802 if (dump_enabled_p ())
1803 dump_printf_loc (MSG_NOTE, vect_location,
1804 "misalignment for fully-masked loop: %T\n",
1805 misalign_in_elems);
1806
1807 LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo) = misalign_in_elems;
1808
1809 vect_update_inits_of_drs (loop_vinfo, misalign_in_elems, MINUS_EXPR);
1810 }
1811
1812 /* This function builds ni_name = number of iterations.  Statements
1813 are emitted on the loop preheader edge.  If NEW_VAR_P is not NULL, it is
1814 set to true when a new SSA name is generated.  */
1815
1816 tree
1817 vect_build_loop_niters (loop_vec_info loop_vinfo, bool *new_var_p)
1818 {
1819 tree ni = unshare_expr (LOOP_VINFO_NITERS (loop_vinfo));
1820 if (TREE_CODE (ni) == INTEGER_CST)
1821 return ni;
1822 else
1823 {
1824 tree ni_name, var;
1825 gimple_seq stmts = NULL;
1826 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1827
1828 var = create_tmp_var (TREE_TYPE (ni), "niters");
1829 ni_name = force_gimple_operand (ni, &stmts, false, var);
1830 if (stmts)
1831 {
1832 gsi_insert_seq_on_edge_immediate (pe, stmts);
1833 if (new_var_p != NULL)
1834 *new_var_p = true;
1835 }
1836
1837 return ni_name;
1838 }
1839 }
1840
1841 /* Calculate the number of iterations above which the vectorized loop will be
1842 preferred over the scalar loop.  NITERS_PROLOG is the number of iterations
1843 of the prolog loop.  If it is an integer constant, the value is also passed
1844 in INT_NITERS_PROLOG.  BOUND_PROLOG is the upper bound (inclusive) of the
1845 number of iterations of the prolog loop. BOUND_EPILOG is the corresponding
1846 value for the epilog loop. If CHECK_PROFITABILITY is true, TH is the
1847 threshold below which the scalar (rather than vectorized) loop will be
1848 executed. This function stores the upper bound (inclusive) of the result
1849 in BOUND_SCALAR. */
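/* For instance (purely as an illustration): with BOUND_PROLOG == 7,
   BOUND_EPILOG == 4 and no profitability check, *BOUND_SCALAR becomes 11
   and the returned tree is niters_prolog + 4.  */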
1850
1851 static tree
1852 vect_gen_scalar_loop_niters (tree niters_prolog, int int_niters_prolog,
1853 int bound_prolog, poly_int64 bound_epilog, int th,
1854 poly_uint64 *bound_scalar,
1855 bool check_profitability)
1856 {
1857 tree type = TREE_TYPE (niters_prolog);
1858 tree niters = fold_build2 (PLUS_EXPR, type, niters_prolog,
1859 build_int_cst (type, bound_epilog));
1860
1861 *bound_scalar = bound_prolog + bound_epilog;
1862 if (check_profitability)
1863 {
1864 /* TH indicates the minimum niters of vectorized loop, while we
1865 compute the maximum niters of scalar loop. */
1866 th--;
1867 /* Peeling a constant number of times.  */
1868 if (int_niters_prolog >= 0)
1869 {
1870 *bound_scalar = upper_bound (int_niters_prolog + bound_epilog, th);
1871 return build_int_cst (type, *bound_scalar);
1872 }
1873 /* Peeling an unknown number of times. Note that both BOUND_PROLOG
1874 and BOUND_EPILOG are inclusive upper bounds. */
1875 if (known_ge (th, bound_prolog + bound_epilog))
1876 {
1877 *bound_scalar = th;
1878 return build_int_cst (type, th);
1879 }
1880 /* Need to do runtime comparison. */
1881 else if (maybe_gt (th, bound_epilog))
1882 {
1883 *bound_scalar = upper_bound (*bound_scalar, th);
1884 return fold_build2 (MAX_EXPR, type,
1885 build_int_cst (type, th), niters);
1886 }
1887 }
1888 return niters;
1889 }
1890
1891 /* NITERS is the number of times that the original scalar loop executes
1892 after peeling. Work out the maximum number of iterations N that can
1893 be handled by the vectorized form of the loop and then either:
1894
1895 a) set *STEP_VECTOR_PTR to the vectorization factor and generate:
1896
1897 niters_vector = N
1898
1899 b) set *STEP_VECTOR_PTR to one and generate:
1900
1901 niters_vector = N / vf
1902
1903 In both cases, store niters_vector in *NITERS_VECTOR_PTR and add
1904 any new statements on the loop preheader edge. NITERS_NO_OVERFLOW
1905 is true if NITERS doesn't overflow (i.e. if NITERS is always nonzero). */
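/* For instance (purely as an illustration): with NITERS == 17, a constant
   VF of 4, no peeling for gaps and no possible overflow, case (b) applies:
   niters_vector == 17 >> 2 == 4 and *STEP_VECTOR_PTR == 1, leaving one
   scalar iteration for the epilogue.  */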
1906
1907 void
1908 vect_gen_vector_loop_niters (loop_vec_info loop_vinfo, tree niters,
1909 tree *niters_vector_ptr, tree *step_vector_ptr,
1910 bool niters_no_overflow)
1911 {
1912 tree ni_minus_gap, var;
1913 tree niters_vector, step_vector, type = TREE_TYPE (niters);
1914 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1915 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1916 tree log_vf = NULL_TREE;
1917
1918 /* If epilogue loop is required because of data accesses with gaps, we
1919 subtract one iteration from the total number of iterations here for
1920 correct calculation of RATIO (the number of vector iterations).  */
1921 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
1922 {
1923 ni_minus_gap = fold_build2 (MINUS_EXPR, type, niters,
1924 build_one_cst (type));
1925 if (!is_gimple_val (ni_minus_gap))
1926 {
1927 var = create_tmp_var (type, "ni_gap");
1928 gimple *stmts = NULL;
1929 ni_minus_gap = force_gimple_operand (ni_minus_gap, &stmts,
1930 true, var);
1931 gsi_insert_seq_on_edge_immediate (pe, stmts);
1932 }
1933 }
1934 else
1935 ni_minus_gap = niters;
1936
1937 unsigned HOST_WIDE_INT const_vf;
1938 if (vf.is_constant (&const_vf)
1939 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
1940 {
1941 /* Create: niters >> log2(vf) */
1942 /* If it's known that niters == number of latch executions + 1 doesn't
1943 overflow, we can generate niters >> log2(vf); otherwise we generate
1944 (niters - vf) >> log2(vf) + 1 by using the fact that we know ratio
1945 will be at least one. */
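/* Illustration (with a deliberately tiny 8-bit niters type and vf == 4):
   if the true iteration count is 256, NITERS wraps to 0 and
   ((0 - 4) >> 2) + 1 == (252 >> 2) + 1 == 64, which is the correct
   number of vector iterations.  */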
1946 log_vf = build_int_cst (type, exact_log2 (const_vf));
1947 if (niters_no_overflow)
1948 niters_vector = fold_build2 (RSHIFT_EXPR, type, ni_minus_gap, log_vf);
1949 else
1950 niters_vector
1951 = fold_build2 (PLUS_EXPR, type,
1952 fold_build2 (RSHIFT_EXPR, type,
1953 fold_build2 (MINUS_EXPR, type,
1954 ni_minus_gap,
1955 build_int_cst (type, vf)),
1956 log_vf),
1957 build_int_cst (type, 1));
1958 step_vector = build_one_cst (type);
1959 }
1960 else
1961 {
1962 niters_vector = ni_minus_gap;
1963 step_vector = build_int_cst (type, vf);
1964 }
1965
1966 if (!is_gimple_val (niters_vector))
1967 {
1968 var = create_tmp_var (type, "bnd");
1969 gimple_seq stmts = NULL;
1970 niters_vector = force_gimple_operand (niters_vector, &stmts, true, var);
1971 gsi_insert_seq_on_edge_immediate (pe, stmts);
1972 /* The peeling algorithm guarantees that the vector loop bound is at least
1973 ONE, so we set range information to make the niters analyzer's life easier.  */
1974 if (stmts != NULL && log_vf)
1975 set_range_info (niters_vector, VR_RANGE,
1976 wi::to_wide (build_int_cst (type, 1)),
1977 wi::to_wide (fold_build2 (RSHIFT_EXPR, type,
1978 TYPE_MAX_VALUE (type),
1979 log_vf)));
1980 }
1981 *niters_vector_ptr = niters_vector;
1982 *step_vector_ptr = step_vector;
1983
1984 return;
1985 }
1986
1987 /* Given NITERS_VECTOR which is the number of iterations for vectorized
1988 loop specified by LOOP_VINFO after vectorization, compute the number
1989 of iterations before vectorization (niters_vector * vf) and store it
1990 to NITERS_VECTOR_MULT_VF_PTR. */
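/* E.g. (purely as an illustration): with NITERS_VECTOR == 4 and a constant
   VF of 8, the result is 4 << 3 == 32 scalar iterations.  */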
1991
1992 static void
1993 vect_gen_vector_loop_niters_mult_vf (loop_vec_info loop_vinfo,
1994 tree niters_vector,
1995 tree *niters_vector_mult_vf_ptr)
1996 {
1997 /* We should be using a step_vector of VF if VF is variable. */
1998 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo).to_constant ();
1999 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2000 tree type = TREE_TYPE (niters_vector);
2001 tree log_vf = build_int_cst (type, exact_log2 (vf));
2002 basic_block exit_bb = single_exit (loop)->dest;
2003
2004 gcc_assert (niters_vector_mult_vf_ptr != NULL);
2005 tree niters_vector_mult_vf = fold_build2 (LSHIFT_EXPR, type,
2006 niters_vector, log_vf);
2007 if (!is_gimple_val (niters_vector_mult_vf))
2008 {
2009 tree var = create_tmp_var (type, "niters_vector_mult_vf");
2010 gimple_seq stmts = NULL;
2011 niters_vector_mult_vf = force_gimple_operand (niters_vector_mult_vf,
2012 &stmts, true, var);
2013 gimple_stmt_iterator gsi = gsi_start_bb (exit_bb);
2014 gsi_insert_seq_before (&gsi, stmts, GSI_SAME_STMT);
2015 }
2016 *niters_vector_mult_vf_ptr = niters_vector_mult_vf;
2017 }
2018
2019 /* Function slpeel_tree_duplicate_loop_to_edge_cfg duplicates FIRST/SECOND
2020 from SECOND/FIRST and puts it at the original loop's preheader/exit
2021 edge; the two loops are arranged as below:
2022
2023 preheader_a:
2024 first_loop:
2025 header_a:
2026 i_1 = PHI<i_0, i_2>;
2027 ...
2028 i_2 = i_1 + 1;
2029 if (cond_a)
2030 goto latch_a;
2031 else
2032 goto between_bb;
2033 latch_a:
2034 goto header_a;
2035
2036 between_bb:
2037 ;; i_x = PHI<i_2>; ;; LCSSA phi node to be created for FIRST,
2038
2039 second_loop:
2040 header_b:
2041 i_3 = PHI<i_0, i_4>; ;; Use of i_0 to be replaced with i_x,
2042 or with i_2 if no LCSSA phi is created
2043 under condition of CREATE_LCSSA_FOR_IV_PHIS.
2044 ...
2045 i_4 = i_3 + 1;
2046 if (cond_b)
2047 goto latch_b;
2048 else
2049 goto exit_bb;
2050 latch_b:
2051 goto header_b;
2052
2053 exit_bb:
2054
2055 This function creates loop-closed SSA for the first loop and updates the
2056 second loop's PHI nodes by replacing the argument on the incoming edge with
2057 the result of the newly created lcssa PHI nodes.  If CREATE_LCSSA_FOR_IV_PHIS
2058 is false, loop-closed SSA PHIs will only be created for non-iv PHIs of
2059 the first loop.
2060
2061 This function assumes the exit bb of the first loop is the preheader bb of
2062 the second loop, i.e., between_bb in the example code.  With the PHIs
2063 updated, the second loop will execute the remaining iterations of the first.  */
2064
2065 static void
2066 slpeel_update_phi_nodes_for_loops (loop_vec_info loop_vinfo,
2067 struct loop *first, struct loop *second,
2068 bool create_lcssa_for_iv_phis)
2069 {
2070 gphi_iterator gsi_update, gsi_orig;
2071 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2072
2073 edge first_latch_e = EDGE_SUCC (first->latch, 0);
2074 edge second_preheader_e = loop_preheader_edge (second);
2075 basic_block between_bb = single_exit (first)->dest;
2076
2077 gcc_assert (between_bb == second_preheader_e->src);
2078 gcc_assert (single_pred_p (between_bb) && single_succ_p (between_bb));
2079 /* Either the first loop or the second is the loop to be vectorized. */
2080 gcc_assert (loop == first || loop == second);
2081
2082 for (gsi_orig = gsi_start_phis (first->header),
2083 gsi_update = gsi_start_phis (second->header);
2084 !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_update);
2085 gsi_next (&gsi_orig), gsi_next (&gsi_update))
2086 {
2087 gphi *orig_phi = gsi_orig.phi ();
2088 gphi *update_phi = gsi_update.phi ();
2089
2090 tree arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, first_latch_e);
2091 /* Generate lcssa PHI node for the first loop. */
2092 gphi *vect_phi = (loop == first) ? orig_phi : update_phi;
2093 stmt_vec_info vect_phi_info = loop_vinfo->lookup_stmt (vect_phi);
2094 if (create_lcssa_for_iv_phis || !iv_phi_p (vect_phi_info))
2095 {
2096 tree new_res = copy_ssa_name (PHI_RESULT (orig_phi));
2097 gphi *lcssa_phi = create_phi_node (new_res, between_bb);
2098 add_phi_arg (lcssa_phi, arg, single_exit (first), UNKNOWN_LOCATION);
2099 arg = new_res;
2100 }
2101
2102 /* Update PHI node in the second loop by replacing arg on the loop's
2103 incoming edge. */
2104 adjust_phi_and_debug_stmts (update_phi, second_preheader_e, arg);
2105 }
2106 }
2107
2108 /* Function slpeel_add_loop_guard adds guard skipping from the beginning
2109 of SKIP_LOOP to the beginning of UPDATE_LOOP. GUARD_EDGE and MERGE_EDGE
2110 are two pred edges of the merge point before UPDATE_LOOP. The two loops
2111 appear like below:
2112
2113 guard_bb:
2114 if (cond)
2115 goto merge_bb;
2116 else
2117 goto skip_loop;
2118
2119 skip_loop:
2120 header_a:
2121 i_1 = PHI<i_0, i_2>;
2122 ...
2123 i_2 = i_1 + 1;
2124 if (cond_a)
2125 goto latch_a;
2126 else
2127 goto exit_a;
2128 latch_a:
2129 goto header_a;
2130
2131 exit_a:
2132 i_5 = PHI<i_2>;
2133
2134 merge_bb:
2135 ;; PHI (i_x = PHI<i_0, i_5>) to be created at merge point.
2136
2137 update_loop:
2138 header_b:
2139 i_3 = PHI<i_5, i_4>; ;; Use of i_5 to be replaced with i_x.
2140 ...
2141 i_4 = i_3 + 1;
2142 if (cond_b)
2143 goto latch_b;
2144 else
2145 goto exit_bb;
2146 latch_b:
2147 goto header_b;
2148
2149 exit_bb:
2150
2151 This function creates PHI nodes at merge_bb and replaces the use of i_5
2152 in the update_loop's PHI node with the result of the new PHI node.  */
2153
2154 static void
2155 slpeel_update_phi_nodes_for_guard1 (struct loop *skip_loop,
2156 struct loop *update_loop,
2157 edge guard_edge, edge merge_edge)
2158 {
2159 location_t merge_loc, guard_loc;
2160 edge orig_e = loop_preheader_edge (skip_loop);
2161 edge update_e = loop_preheader_edge (update_loop);
2162 gphi_iterator gsi_orig, gsi_update;
2163
2164 for ((gsi_orig = gsi_start_phis (skip_loop->header),
2165 gsi_update = gsi_start_phis (update_loop->header));
2166 !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_update);
2167 gsi_next (&gsi_orig), gsi_next (&gsi_update))
2168 {
2169 gphi *orig_phi = gsi_orig.phi ();
2170 gphi *update_phi = gsi_update.phi ();
2171
2172 /* Generate new phi node at merge bb of the guard. */
2173 tree new_res = copy_ssa_name (PHI_RESULT (orig_phi));
2174 gphi *new_phi = create_phi_node (new_res, guard_edge->dest);
2175
2176 /* Merge bb has two incoming edges: GUARD_EDGE and MERGE_EDGE. Set the
2177 args in NEW_PHI for these edges. */
2178 tree merge_arg = PHI_ARG_DEF_FROM_EDGE (update_phi, update_e);
2179 tree guard_arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, orig_e);
2180 merge_loc = gimple_phi_arg_location_from_edge (update_phi, update_e);
2181 guard_loc = gimple_phi_arg_location_from_edge (orig_phi, orig_e);
2182 add_phi_arg (new_phi, merge_arg, merge_edge, merge_loc);
2183 add_phi_arg (new_phi, guard_arg, guard_edge, guard_loc);
2184
2185 /* Update phi in UPDATE_PHI. */
2186 adjust_phi_and_debug_stmts (update_phi, update_e, new_res);
2187 }
2188 }
2189
2190 /* LCSSA_PHI is an lcssa phi of the EPILOG loop which is copied from LOOP;
2191 this function searches for the corresponding lcssa phi node in the exit
2192 bb of LOOP. If it is found, return the phi result; otherwise return
2193 NULL. */
2194
2195 static tree
2196 find_guard_arg (struct loop *loop, struct loop *epilog ATTRIBUTE_UNUSED,
2197 gphi *lcssa_phi)
2198 {
2199 gphi_iterator gsi;
2200 edge e = single_exit (loop);
2201
2202 gcc_assert (single_pred_p (e->dest));
2203 for (gsi = gsi_start_phis (e->dest); !gsi_end_p (gsi); gsi_next (&gsi))
2204 {
2205 gphi *phi = gsi.phi ();
2206 if (operand_equal_p (PHI_ARG_DEF (phi, 0),
2207 PHI_ARG_DEF (lcssa_phi, 0), 0))
2208 return PHI_RESULT (phi);
2209 }
2210 return NULL_TREE;
2211 }
2212
2213 /* LOOP and EPILOG are two consecutive loops in CFG and EPILOG is copied
2214 from LOOP. Function slpeel_add_loop_guard adds guard skipping from a
2215 point between the two loops to the end of EPILOG. Edges GUARD_EDGE
2216 and MERGE_EDGE are the two pred edges of merge_bb at the end of EPILOG.
2217 The CFG looks like:
2218
2219 loop:
2220 header_a:
2221 i_1 = PHI<i_0, i_2>;
2222 ...
2223 i_2 = i_1 + 1;
2224 if (cond_a)
2225 goto latch_a;
2226 else
2227 goto exit_a;
2228 latch_a:
2229 goto header_a;
2230
2231 exit_a:
2232
2233 guard_bb:
2234 if (cond)
2235 goto merge_bb;
2236 else
2237 goto epilog_loop;
2238
2239 ;; fall_through_bb
2240
2241 epilog_loop:
2242 header_b:
2243 i_3 = PHI<i_2, i_4>;
2244 ...
2245 i_4 = i_3 + 1;
2246 if (cond_b)
2247 goto latch_b;
2248 else
2249 goto merge_bb;
2250 latch_b:
2251 goto header_b;
2252
2253 merge_bb:
2254 ; PHI node (i_y = PHI<i_2, i_4>) to be created at merge point.
2255
2256 exit_bb:
2257 i_x = PHI<i_4>; ;Use of i_4 to be replaced with i_y in merge_bb.
2258
2259 For each name used outside EPILOG (i.e. for each name that has an lcssa
2260 phi in exit_bb) we create a new PHI in merge_bb.  The new PHI has two
2261 args corresponding to GUARD_EDGE and MERGE_EDGE. Arg for MERGE_EDGE is
2262 the arg of the original PHI in exit_bb, arg for GUARD_EDGE is defined
2263 by LOOP and is found in the exit bb of LOOP. Arg of the original PHI
2264 in exit_bb will also be updated. */
2265
2266 static void
2267 slpeel_update_phi_nodes_for_guard2 (struct loop *loop, struct loop *epilog,
2268 edge guard_edge, edge merge_edge)
2269 {
2270 gphi_iterator gsi;
2271 basic_block merge_bb = guard_edge->dest;
2272
2273 gcc_assert (single_succ_p (merge_bb));
2274 edge e = single_succ_edge (merge_bb);
2275 basic_block exit_bb = e->dest;
2276 gcc_assert (single_pred_p (exit_bb));
2277 gcc_assert (single_pred (exit_bb) == single_exit (epilog)->dest);
2278
2279 for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi); gsi_next (&gsi))
2280 {
2281 gphi *update_phi = gsi.phi ();
2282 tree old_arg = PHI_ARG_DEF (update_phi, 0);
2283 /* This loop-closed-phi actually doesn't represent a use out of the
2284 loop - the phi arg is a constant. */
2285 if (TREE_CODE (old_arg) != SSA_NAME)
2286 continue;
2287
2288 tree merge_arg = get_current_def (old_arg);
2289 if (!merge_arg)
2290 merge_arg = old_arg;
2291
2292 tree guard_arg = find_guard_arg (loop, epilog, update_phi);
2293 /* If the var is live after loop but not a reduction, we simply
2294 use the old arg. */
2295 if (!guard_arg)
2296 guard_arg = old_arg;
2297
2298 /* Create new phi node in MERGE_BB: */
2299 tree new_res = copy_ssa_name (PHI_RESULT (update_phi));
2300 gphi *merge_phi = create_phi_node (new_res, merge_bb);
2301
2302 /* MERGE_BB has two incoming edges: GUARD_EDGE and MERGE_EDGE.  Set
2303 the two PHI args in merge_phi for these edges.  */
2304 add_phi_arg (merge_phi, merge_arg, merge_edge, UNKNOWN_LOCATION);
2305 add_phi_arg (merge_phi, guard_arg, guard_edge, UNKNOWN_LOCATION);
2306
2307 /* Update the original phi in exit_bb. */
2308 adjust_phi_and_debug_stmts (update_phi, e, new_res);
2309 }
2310 }
2311
2312 /* The EPILOG loop is duplicated from the original loop for vectorizing;
2313 the args of its loop-closed SSA PHIs need to be updated.  */
2314
2315 static void
2316 slpeel_update_phi_nodes_for_lcssa (struct loop *epilog)
2317 {
2318 gphi_iterator gsi;
2319 basic_block exit_bb = single_exit (epilog)->dest;
2320
2321 gcc_assert (single_pred_p (exit_bb));
2322 edge e = EDGE_PRED (exit_bb, 0);
2323 for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi); gsi_next (&gsi))
2324 rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (gsi.phi (), e));
2325 }
2326
2327 /* Function vect_do_peeling.
2328
2329 Input:
2330 - LOOP_VINFO: Represent a loop to be vectorized, which looks like:
2331
2332 preheader:
2333 LOOP:
2334 header_bb:
2335 loop_body
2336 if (exit_loop_cond) goto exit_bb
2337 else goto header_bb
2338 exit_bb:
2339
2340 - NITERS: The number of iterations of the loop.
2341 - NITERSM1: The number of iterations of the loop's latch.
2342 - NITERS_NO_OVERFLOW: No overflow in computing NITERS.
2343 - TH, CHECK_PROFITABILITY: Threshold of niters to vectorize loop if
2344 CHECK_PROFITABILITY is true.
2345 Output:
2346 - *NITERS_VECTOR and *STEP_VECTOR describe how the main loop should
2347 iterate after vectorization; see vect_set_loop_condition for details.
2348 - *NITERS_VECTOR_MULT_VF_VAR is either null or an SSA name that
2349 should be set to the number of scalar iterations handled by the
2350 vector loop. The SSA name is only used on exit from the loop.
2351
2352 This function peels prolog and epilog from the loop, adds guards skipping
2353 PROLOG and EPILOG for various conditions. As a result, the changed CFG
2354 would look like:
2355
2356 guard_bb_1:
2357 if (prefer_scalar_loop) goto merge_bb_1
2358 else goto guard_bb_2
2359
2360 guard_bb_2:
2361 if (skip_prolog) goto merge_bb_2
2362 else goto prolog_preheader
2363
2364 prolog_preheader:
2365 PROLOG:
2366 prolog_header_bb:
2367 prolog_body
2368 if (exit_prolog_cond) goto prolog_exit_bb
2369 else goto prolog_header_bb
2370 prolog_exit_bb:
2371
2372 merge_bb_2:
2373
2374 vector_preheader:
2375 VECTOR LOOP:
2376 vector_header_bb:
2377 vector_body
2378 if (exit_vector_cond) goto vector_exit_bb
2379 else goto vector_header_bb
2380 vector_exit_bb:
2381
2382 guard_bb_3:
2383 if (skip_epilog) goto merge_bb_3
2384 else goto epilog_preheader
2385
2386 merge_bb_1:
2387
2388 epilog_preheader:
2389 EPILOG:
2390 epilog_header_bb:
2391 epilog_body
2392 if (exit_epilog_cond) goto merge_bb_3
2393 else goto epilog_header_bb
2394
2395 merge_bb_3:
2396
2397 Note this function peels the prolog and epilog, and adds the corresponding
2398 guards, only when necessary.
2399 Returns created epilogue or NULL.
2400
2401 TODO: Guard for prefer_scalar_loop should be emitted along with
2402 versioning conditions if loop versioning is needed. */
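/* A purely illustrative example: with VF == 8, a prolog of 3 iterations and
   NITERS == 100, the prolog handles 3 scalar iterations, the vector loop
   handles 96 of the remaining 97 iterations (12 vector iterations), and the
   epilog handles the final iteration.  */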
2403
2404
2405 struct loop *
2406 vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
2407 tree *niters_vector, tree *step_vector,
2408 tree *niters_vector_mult_vf_var, int th,
2409 bool check_profitability, bool niters_no_overflow)
2410 {
2411 edge e, guard_e;
2412 tree type = TREE_TYPE (niters), guard_cond;
2413 basic_block guard_bb, guard_to;
2414 profile_probability prob_prolog, prob_vector, prob_epilog;
2415 int estimated_vf;
2416 int prolog_peeling = 0;
2417 /* We currently do not support prolog peeling if the target alignment is not
2418 known at compile time. 'vect_gen_prolog_loop_niters' depends on the
2419 target alignment being constant. */
2420 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2421 if (dr_info && !DR_TARGET_ALIGNMENT (dr_info).is_constant ())
2422 return NULL;
2423
2424 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2425 prolog_peeling = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2426
2427 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2428 poly_uint64 bound_epilog = 0;
2429 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
2430 && LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2431 bound_epilog += vf - 1;
2432 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2433 bound_epilog += 1;
2434 bool epilog_peeling = maybe_ne (bound_epilog, 0U);
2435 poly_uint64 bound_scalar = bound_epilog;
2436
2437 if (!prolog_peeling && !epilog_peeling)
2438 return NULL;
2439
2440 prob_vector = profile_probability::guessed_always ().apply_scale (9, 10);
2441 estimated_vf = vect_vf_for_cost (loop_vinfo);
2442 if (estimated_vf == 2)
2443 estimated_vf = 3;
2444 prob_prolog = prob_epilog = profile_probability::guessed_always ()
2445 .apply_scale (estimated_vf - 1, estimated_vf);
2446
2447 struct loop *prolog, *epilog = NULL, *loop = LOOP_VINFO_LOOP (loop_vinfo);
2448 struct loop *first_loop = loop;
2449 bool irred_flag = loop_preheader_edge (loop)->flags & EDGE_IRREDUCIBLE_LOOP;
2450 create_lcssa_for_virtual_phi (loop);
2451 update_ssa (TODO_update_ssa_only_virtuals);
2452
2453 if (MAY_HAVE_DEBUG_BIND_STMTS)
2454 {
2455 gcc_assert (!adjust_vec.exists ());
2456 adjust_vec.create (32);
2457 }
2458 initialize_original_copy_tables ();
2459
2460 /* Record the anchor bb at which the guard should be placed if the scalar
2461 loop might be preferred. */
2462 basic_block anchor = loop_preheader_edge (loop)->src;
2463
2464 /* Generate the number of iterations for the prolog loop. We do this here
2465 so that we can also get the upper bound on the number of iterations. */
2466 tree niters_prolog;
2467 int bound_prolog = 0;
2468 if (prolog_peeling)
2469 niters_prolog = vect_gen_prolog_loop_niters (loop_vinfo, anchor,
2470 &bound_prolog);
2471 else
2472 niters_prolog = build_int_cst (type, 0);
2473
2474 /* Prolog loop may be skipped. */
2475 bool skip_prolog = (prolog_peeling != 0);
2476 /* Skip to epilog if scalar loop may be preferred. It's only needed
2477 when we peel for epilog loop and when it hasn't been checked with
2478 loop versioning. */
2479 bool skip_vector = (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2480 ? maybe_lt (LOOP_VINFO_INT_NITERS (loop_vinfo),
2481 bound_prolog + bound_epilog)
2482 : !LOOP_REQUIRES_VERSIONING (loop_vinfo));
2483 /* Epilog loop must be executed if the number of iterations for epilog
2484 loop is known at compile time, otherwise we need to add a check at
2485 the end of vector loop and skip to the end of epilog loop. */
2486 bool skip_epilog = (prolog_peeling < 0
2487 || !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2488 || !vf.is_constant ());
2489 /* PEELING_FOR_GAPS is special because epilog loop must be executed. */
2490 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2491 skip_epilog = false;
2492
2493 if (skip_vector)
2494 {
2495 split_edge (loop_preheader_edge (loop));
2496
2497 /* Due to the order in which we peel prolog and epilog, we first
2498 propagate probability to the whole loop. The purpose is to
2499 avoid adjusting probabilities of both prolog and vector loops
2500 separately. Note in this case, the probability of epilog loop
2501 needs to be scaled back later. */
2502 basic_block bb_before_loop = loop_preheader_edge (loop)->src;
2503 if (prob_vector.initialized_p ())
2504 {
2505 scale_bbs_frequencies (&bb_before_loop, 1, prob_vector);
2506 scale_loop_profile (loop, prob_vector, 0);
2507 }
2508 }
2509
2510 dump_user_location_t loop_loc = find_loop_location (loop);
2511 struct loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
2512 if (prolog_peeling)
2513 {
2514 e = loop_preheader_edge (loop);
2515 if (!slpeel_can_duplicate_loop_p (loop, e))
2516 {
2517 dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2518 "loop can't be duplicated to preheader edge.\n");
2519 gcc_unreachable ();
2520 }
2521 /* Peel prolog and put it on preheader edge of loop. */
2522 prolog = slpeel_tree_duplicate_loop_to_edge_cfg (loop, scalar_loop, e);
2523 if (!prolog)
2524 {
2525 dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2526 "slpeel_tree_duplicate_loop_to_edge_cfg failed.\n");
2527 gcc_unreachable ();
2528 }
2529 prolog->force_vectorize = false;
2530 slpeel_update_phi_nodes_for_loops (loop_vinfo, prolog, loop, true);
2531 first_loop = prolog;
2532 reset_original_copy_tables ();
2533
2534 /* Update the number of iterations for prolog loop. */
2535 tree step_prolog = build_one_cst (TREE_TYPE (niters_prolog));
2536 vect_set_loop_condition (prolog, NULL, niters_prolog,
2537 step_prolog, NULL_TREE, false);
2538
2539 /* Skip the prolog loop. */
2540 if (skip_prolog)
2541 {
2542 guard_cond = fold_build2 (EQ_EXPR, boolean_type_node,
2543 niters_prolog, build_int_cst (type, 0));
2544 guard_bb = loop_preheader_edge (prolog)->src;
2545 basic_block bb_after_prolog = loop_preheader_edge (loop)->src;
2546 guard_to = split_edge (loop_preheader_edge (loop));
2547 guard_e = slpeel_add_loop_guard (guard_bb, guard_cond,
2548 guard_to, guard_bb,
2549 prob_prolog.invert (),
2550 irred_flag);
2551 e = EDGE_PRED (guard_to, 0);
2552 e = (e != guard_e ? e : EDGE_PRED (guard_to, 1));
2553 slpeel_update_phi_nodes_for_guard1 (prolog, loop, guard_e, e);
2554
2555 scale_bbs_frequencies (&bb_after_prolog, 1, prob_prolog);
2556 scale_loop_profile (prolog, prob_prolog, bound_prolog);
2557 }
2558 /* Update init address of DRs. */
2559 vect_update_inits_of_drs (loop_vinfo, niters_prolog, PLUS_EXPR);
2560 /* Update niters for vector loop. */
2561 LOOP_VINFO_NITERS (loop_vinfo)
2562 = fold_build2 (MINUS_EXPR, type, niters, niters_prolog);
2563 LOOP_VINFO_NITERSM1 (loop_vinfo)
2564 = fold_build2 (MINUS_EXPR, type,
2565 LOOP_VINFO_NITERSM1 (loop_vinfo), niters_prolog);
2566 bool new_var_p = false;
2567 niters = vect_build_loop_niters (loop_vinfo, &new_var_p);
2568 /* It's guaranteed that vector loop bound before vectorization is at
2569 least VF, so set range information for newly generated var. */
2570 if (new_var_p)
2571 set_range_info (niters, VR_RANGE,
2572 wi::to_wide (build_int_cst (type, vf)),
2573 wi::to_wide (TYPE_MAX_VALUE (type)));
2574
2575 /* Prolog iterates at most bound_prolog times, latch iterates at
2576 most bound_prolog - 1 times. */
2577 record_niter_bound (prolog, bound_prolog - 1, false, true);
2578 delete_update_ssa ();
2579 adjust_vec_debug_stmts ();
2580 scev_reset ();
2581 }
2582
2583 if (epilog_peeling)
2584 {
2585 e = single_exit (loop);
2586 if (!slpeel_can_duplicate_loop_p (loop, e))
2587 {
2588 dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2589 "loop can't be duplicated to exit edge.\n");
2590 gcc_unreachable ();
2591 }
2592 /* Peel epilog and put it on exit edge of loop. */
2593 epilog = slpeel_tree_duplicate_loop_to_edge_cfg (loop, scalar_loop, e);
2594 if (!epilog)
2595 {
2596 dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2597 "slpeel_tree_duplicate_loop_to_edge_cfg failed.\n");
2598 gcc_unreachable ();
2599 }
2600 epilog->force_vectorize = false;
2601 slpeel_update_phi_nodes_for_loops (loop_vinfo, loop, epilog, false);
2602
2603 /* The scalar version of the loop may be preferred.  In this case, add a
2604 guard and skip to the epilog.  Note this only happens when the number of
2605 iterations of the loop is unknown at compile time, otherwise this
2606 won't be vectorized.  */
2607 if (skip_vector)
2608 {
2609 /* Additional epilogue iteration is peeled if gap exists. */
2610 tree t = vect_gen_scalar_loop_niters (niters_prolog, prolog_peeling,
2611 bound_prolog, bound_epilog,
2612 th, &bound_scalar,
2613 check_profitability);
2614 /* Build guard against NITERSM1 since NITERS may overflow. */
2615 guard_cond = fold_build2 (LT_EXPR, boolean_type_node, nitersm1, t);
2616 guard_bb = anchor;
2617 guard_to = split_edge (loop_preheader_edge (epilog));
2618 guard_e = slpeel_add_loop_guard (guard_bb, guard_cond,
2619 guard_to, guard_bb,
2620 prob_vector.invert (),
2621 irred_flag);
2622 e = EDGE_PRED (guard_to, 0);
2623 e = (e != guard_e ? e : EDGE_PRED (guard_to, 1));
2624 slpeel_update_phi_nodes_for_guard1 (first_loop, epilog, guard_e, e);
2625
2626 /* Simply propagate profile info from guard_bb to guard_to which is
2627 a merge point of control flow. */
2628 guard_to->count = guard_bb->count;
2629
2630 /* Scale probability of epilog loop back.
2631 FIXME: We should avoid scaling down and back up. Profile may
2632 get lost if we scale down to 0. */
2633 basic_block *bbs = get_loop_body (epilog);
2634 for (unsigned int i = 0; i < epilog->num_nodes; i++)
2635 bbs[i]->count = bbs[i]->count.apply_scale
2636 (bbs[i]->count,
2637 bbs[i]->count.apply_probability
2638 (prob_vector));
2639 free (bbs);
2640 }
2641
2642 basic_block bb_before_epilog = loop_preheader_edge (epilog)->src;
2643 tree niters_vector_mult_vf;
2644 /* If the loop is peeled a non-zero constant number of times, niters now
2645 refers to orig_niters - prolog_peeling; it won't overflow even if
2646 orig_niters overflows.  */
2647 niters_no_overflow |= (prolog_peeling > 0);
2648 vect_gen_vector_loop_niters (loop_vinfo, niters,
2649 niters_vector, step_vector,
2650 niters_no_overflow);
2651 if (!integer_onep (*step_vector))
2652 {
2653 /* On exit from the loop we will have an easy way of calculating
2654 NITERS_VECTOR / STEP * STEP. Install a dummy definition
2655 until then. */
2656 niters_vector_mult_vf = make_ssa_name (TREE_TYPE (*niters_vector));
2657 SSA_NAME_DEF_STMT (niters_vector_mult_vf) = gimple_build_nop ();
2658 *niters_vector_mult_vf_var = niters_vector_mult_vf;
2659 }
2660 else
2661 vect_gen_vector_loop_niters_mult_vf (loop_vinfo, *niters_vector,
2662 &niters_vector_mult_vf);
2663 /* Update IVs of original loop as if they were advanced by
2664 niters_vector_mult_vf steps. */
2665 gcc_checking_assert (vect_can_advance_ivs_p (loop_vinfo));
2666 edge update_e = skip_vector ? e : loop_preheader_edge (epilog);
2667 vect_update_ivs_after_vectorizer (loop_vinfo, niters_vector_mult_vf,
2668 update_e);
2669
2670 if (skip_epilog)
2671 {
2672 guard_cond = fold_build2 (EQ_EXPR, boolean_type_node,
2673 niters, niters_vector_mult_vf);
2674 guard_bb = single_exit (loop)->dest;
2675 guard_to = split_edge (single_exit (epilog));
2676 guard_e = slpeel_add_loop_guard (guard_bb, guard_cond, guard_to,
2677 skip_vector ? anchor : guard_bb,
2678 prob_epilog.invert (),
2679 irred_flag);
2680 slpeel_update_phi_nodes_for_guard2 (loop, epilog, guard_e,
2681 single_exit (epilog));
2682 /* Only need to handle basic block before epilog loop if it's not
2683 the guard_bb, which is the case when skip_vector is true. */
2684 if (guard_bb != bb_before_epilog)
2685 {
2686 prob_epilog = prob_vector * prob_epilog + prob_vector.invert ();
2687
2688 scale_bbs_frequencies (&bb_before_epilog, 1, prob_epilog);
2689 }
2690 scale_loop_profile (epilog, prob_epilog, 0);
2691 }
2692 else
2693 slpeel_update_phi_nodes_for_lcssa (epilog);
2694
2695 unsigned HOST_WIDE_INT bound;
2696 if (bound_scalar.is_constant (&bound))
2697 {
2698 gcc_assert (bound != 0);
2699 /* -1 to convert loop iterations to latch iterations. */
2700 record_niter_bound (epilog, bound - 1, false, true);
2701 }
2702
2703 delete_update_ssa ();
2704 adjust_vec_debug_stmts ();
2705 scev_reset ();
2706 }
2707 adjust_vec.release ();
2708 free_original_copy_tables ();
2709
2710 return epilog;
2711 }
2712
2713 /* Function vect_create_cond_for_niters_checks.
2714
2715 Create a conditional expression that represents the run-time checks for
2716 loop's niter. The loop is guaranteed to terminate if the run-time
2717 checks hold.
2718
2719 Input:
2720 COND_EXPR - input conditional expression.  New conditions will be chained
2721 with logical AND operation.  If it is NULL, the new condition
2722 is stored in it directly.
2723 LOOP_VINFO - field LOOP_VINFO_NITERS_ASSUMPTIONS contains the niter
2724 assumptions to be checked.
2725
2726 Output:
2727 COND_EXPR - conditional expression.
2728
2729 The returned COND_EXPR is the conditional expression to be used in the
2730 if statement that controls which version of the loop gets executed at
2731 runtime. */
2732
2733 static void
2734 vect_create_cond_for_niters_checks (loop_vec_info loop_vinfo, tree *cond_expr)
2735 {
2736 tree part_cond_expr = LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo);
2737
2738 if (*cond_expr)
2739 *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
2740 *cond_expr, part_cond_expr);
2741 else
2742 *cond_expr = part_cond_expr;
2743 }
2744
2745 /* Set *COND_EXPR to a tree that is true when both the original *COND_EXPR
2746 and PART_COND_EXPR are true. Treat a null *COND_EXPR as "true". */
2747
2748 static void
2749 chain_cond_expr (tree *cond_expr, tree part_cond_expr)
2750 {
2751 if (*cond_expr)
2752 *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
2753 *cond_expr, part_cond_expr);
2754 else
2755 *cond_expr = part_cond_expr;
2756 }
2757
2758 /* Function vect_create_cond_for_align_checks.
2759
2760 Create a conditional expression that represents the alignment checks for
2761 all of data references (array element references) whose alignment must be
2762 checked at runtime.
2763
2764 Input:
2765 COND_EXPR - input conditional expression. New conditions will be chained
2766 with logical AND operation.
2767 LOOP_VINFO - two fields of the loop information are used.
2768 LOOP_VINFO_PTR_MASK is the mask used to check the alignment.
2769 LOOP_VINFO_MAY_MISALIGN_STMTS contains the refs to be checked.
2770
2771 Output:
2772 COND_EXPR_STMT_LIST - statements needed to construct the conditional
2773 expression.
2774 The returned value is the conditional expression to be used in the if
2775 statement that controls which version of the loop gets executed at runtime.
2776
2777 The algorithm makes two assumptions:
2778 1) The number of bytes "n" in a vector is a power of 2.
2779 2) An address "a" is aligned if a%n is zero and that this
2780 test can be done as a&(n-1) == 0. For example, for 16
2781 byte vectors the test is a&0xf == 0. */
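/* For instance (purely as an illustration): for 16-byte vectors and two data
   references whose first-vector addresses are 0x1000 and 0x2008, the check
   computes (0x1000 | 0x2008) & 0xf == 0x8, which is nonzero, so the scalar
   version of the loop is executed.  */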
2782
2783 static void
2784 vect_create_cond_for_align_checks (loop_vec_info loop_vinfo,
2785 tree *cond_expr,
2786 gimple_seq *cond_expr_stmt_list)
2787 {
2788 vec<stmt_vec_info> may_misalign_stmts
2789 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
2790 stmt_vec_info stmt_info;
2791 int mask = LOOP_VINFO_PTR_MASK (loop_vinfo);
2792 tree mask_cst;
2793 unsigned int i;
2794 tree int_ptrsize_type;
2795 char tmp_name[20];
2796 tree or_tmp_name = NULL_TREE;
2797 tree and_tmp_name;
2798 gimple *and_stmt;
2799 tree ptrsize_zero;
2800 tree part_cond_expr;
2801
2802 /* Check that mask is one less than a power of 2, i.e., mask is
2803 all zeros followed by all ones. */
2804 gcc_assert ((mask != 0) && ((mask & (mask+1)) == 0));
2805
2806 int_ptrsize_type = signed_type_for (ptr_type_node);
2807
2808 /* Create expression (mask & (dr_1 | ... | dr_n)) where dr_i is the address
2809 of the first vector of the i'th data reference. */
2810
2811 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt_info)
2812 {
2813 gimple_seq new_stmt_list = NULL;
2814 tree addr_base;
2815 tree addr_tmp_name;
2816 tree new_or_tmp_name;
2817 gimple *addr_stmt, *or_stmt;
2818 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2819 bool negative = tree_int_cst_compare
2820 (DR_STEP (STMT_VINFO_DATA_REF (stmt_info)), size_zero_node) < 0;
2821 tree offset = negative
2822 ? size_int (-TYPE_VECTOR_SUBPARTS (vectype) + 1) : size_zero_node;
2823
2824 /* create: addr_tmp = (int)(address_of_first_vector) */
2825 addr_base =
2826 vect_create_addr_base_for_vector_ref (stmt_info, &new_stmt_list,
2827 offset);
2828 if (new_stmt_list != NULL)
2829 gimple_seq_add_seq (cond_expr_stmt_list, new_stmt_list);
2830
2831 sprintf (tmp_name, "addr2int%d", i);
2832 addr_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, tmp_name);
2833 addr_stmt = gimple_build_assign (addr_tmp_name, NOP_EXPR, addr_base);
2834 gimple_seq_add_stmt (cond_expr_stmt_list, addr_stmt);
2835
2836 /* The addresses are ORed together.  */
2837
2838 if (or_tmp_name != NULL_TREE)
2839 {
2840 /* create: or_tmp = or_tmp | addr_tmp */
2841 sprintf (tmp_name, "orptrs%d", i);
2842 new_or_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, tmp_name);
2843 or_stmt = gimple_build_assign (new_or_tmp_name, BIT_IOR_EXPR,
2844 or_tmp_name, addr_tmp_name);
2845 gimple_seq_add_stmt (cond_expr_stmt_list, or_stmt);
2846 or_tmp_name = new_or_tmp_name;
2847 }
2848 else
2849 or_tmp_name = addr_tmp_name;
2850
2851 } /* end for i */
2852
2853 mask_cst = build_int_cst (int_ptrsize_type, mask);
2854
2855 /* create: and_tmp = or_tmp & mask */
2856 and_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, "andmask");
2857
2858 and_stmt = gimple_build_assign (and_tmp_name, BIT_AND_EXPR,
2859 or_tmp_name, mask_cst);
2860 gimple_seq_add_stmt (cond_expr_stmt_list, and_stmt);
2861
2862 /* Make and_tmp the left operand of the conditional test against zero.
2863 If and_tmp has a nonzero bit then some address is unaligned.  */
2864 ptrsize_zero = build_int_cst (int_ptrsize_type, 0);
2865 part_cond_expr = fold_build2 (EQ_EXPR, boolean_type_node,
2866 and_tmp_name, ptrsize_zero);
2867 chain_cond_expr (cond_expr, part_cond_expr);
2868 }
2869
2870 /* If LOOP_VINFO_CHECK_UNEQUAL_ADDRS contains <A1, B1>, ..., <An, Bn>,
2871 create a tree representation of: (&A1 != &B1) && ... && (&An != &Bn).
2872 Set *COND_EXPR to a tree that is true when both the original *COND_EXPR
2873 and this new condition are true. Treat a null *COND_EXPR as "true". */
2874
2875 static void
2876 vect_create_cond_for_unequal_addrs (loop_vec_info loop_vinfo, tree *cond_expr)
2877 {
2878 vec<vec_object_pair> pairs = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
2879 unsigned int i;
2880 vec_object_pair *pair;
2881 FOR_EACH_VEC_ELT (pairs, i, pair)
2882 {
2883 tree addr1 = build_fold_addr_expr (pair->first);
2884 tree addr2 = build_fold_addr_expr (pair->second);
2885 tree part_cond_expr = fold_build2 (NE_EXPR, boolean_type_node,
2886 addr1, addr2);
2887 chain_cond_expr (cond_expr, part_cond_expr);
2888 }
2889 }
2890
2891 /* Create an expression that is true when all lower-bound conditions for
2892 the vectorized loop are met. Chain this condition with *COND_EXPR. */
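/* For a signed EXPR the comparison is done in the unsigned type; e.g.
   (purely as an illustration) with min_value == 4 the generated test is
   (unsigned) EXPR + 3 >= 7, which, in the absence of wrap-around, is
   equivalent to EXPR >= 4.  */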
2893
2894 static void
2895 vect_create_cond_for_lower_bounds (loop_vec_info loop_vinfo, tree *cond_expr)
2896 {
2897 vec<vec_lower_bound> lower_bounds = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
2898 for (unsigned int i = 0; i < lower_bounds.length (); ++i)
2899 {
2900 tree expr = lower_bounds[i].expr;
2901 tree type = unsigned_type_for (TREE_TYPE (expr));
2902 expr = fold_convert (type, expr);
2903 poly_uint64 bound = lower_bounds[i].min_value;
2904 if (!lower_bounds[i].unsigned_p)
2905 {
2906 expr = fold_build2 (PLUS_EXPR, type, expr,
2907 build_int_cstu (type, bound - 1));
2908 bound += bound - 1;
2909 }
2910 tree part_cond_expr = fold_build2 (GE_EXPR, boolean_type_node, expr,
2911 build_int_cstu (type, bound));
2912 chain_cond_expr (cond_expr, part_cond_expr);
2913 }
2914 }
2915
2916 /* Function vect_create_cond_for_alias_checks.
2917
2918 Create a conditional expression that represents the run-time checks for
2919 overlapping of address ranges represented by a list of data references
2920 relations passed as input.
2921
2922 Input:
2923 COND_EXPR - input conditional expression. New conditions will be chained
2924 with logical AND operation.  If it is NULL, the new condition
2925 is stored in it directly.
2926 LOOP_VINFO - field LOOP_VINFO_MAY_ALIAS_STMTS contains the list of ddrs
2927 to be checked.
2928
2929 Output:
2930 COND_EXPR - conditional expression.
2931
2932 The returned COND_EXPR is the conditional expression to be used in the if
2933 statement that controls which version of the loop gets executed at runtime.
2934 */
2935
2936 void
2937 vect_create_cond_for_alias_checks (loop_vec_info loop_vinfo, tree * cond_expr)
2938 {
2939 vec<dr_with_seg_len_pair_t> comp_alias_ddrs =
2940 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
2941
2942 if (comp_alias_ddrs.is_empty ())
2943 return;
2944
2945 create_runtime_alias_checks (LOOP_VINFO_LOOP (loop_vinfo),
2946 &comp_alias_ddrs, cond_expr);
2947 if (dump_enabled_p ())
2948 dump_printf_loc (MSG_NOTE, vect_location,
2949 "created %u versioning for alias checks.\n",
2950 comp_alias_ddrs.length ());
2951 }
2952
2953
2954 /* Function vect_loop_versioning.
2955
2956 If the loop has data references that may or may not be aligned and/or
2957 has data reference relations whose independence was not proven, then
2958 two versions of the loop need to be generated, one which is vectorized
2959 and one which isn't.  A test is then generated to control which of the
2960 loops is executed.  The test checks for the alignment of all of the
2961 data references that may or may not be aligned.  An additional
2962 sequence of runtime tests is generated for each pair of DDRs whose
2963 independence was not proven.  The vectorized version of the loop is
2964 executed only if both the alias and alignment tests pass.
2965
2966 The test generated to check which version of loop is executed
2967 is modified to also check for profitability as indicated by the
2968 cost model threshold TH.
2969
2970 The versioning precondition(s) are placed in *COND_EXPR and
2971 *COND_EXPR_STMT_LIST. */
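/* Schematically (purely as an illustration), the generated precondition may
   look like:

     if (niters >= th
         && ((addr_a | addr_b) & align_mask) == 0
         && &x != &y)
       <vectorized loop>
     else
       <scalar loop>

   where each conjunct is present only when the corresponding kind of
   versioning is required.  */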
2972
2973 struct loop *
2974 vect_loop_versioning (loop_vec_info loop_vinfo,
2975 unsigned int th, bool check_profitability,
2976 poly_uint64 versioning_threshold)
2977 {
2978 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *nloop;
2979 struct loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
2980 basic_block condition_bb;
2981 gphi_iterator gsi;
2982 gimple_stmt_iterator cond_exp_gsi;
2983 basic_block merge_bb;
2984 basic_block new_exit_bb;
2985 edge new_exit_e, e;
2986 gphi *orig_phi, *new_phi;
2987 tree cond_expr = NULL_TREE;
2988 gimple_seq cond_expr_stmt_list = NULL;
2989 tree arg;
2990 profile_probability prob = profile_probability::likely ();
2991 gimple_seq gimplify_stmt_list = NULL;
2992 tree scalar_loop_iters = LOOP_VINFO_NITERSM1 (loop_vinfo);
2993 bool version_align = LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo);
2994 bool version_alias = LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo);
2995 bool version_niter = LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo);
2996 tree version_simd_if_cond
2997 = LOOP_REQUIRES_VERSIONING_FOR_SIMD_IF_COND (loop_vinfo);
2998
2999 if (check_profitability)
3000 cond_expr = fold_build2 (GE_EXPR, boolean_type_node, scalar_loop_iters,
3001 build_int_cst (TREE_TYPE (scalar_loop_iters),
3002 th - 1));
3003 if (maybe_ne (versioning_threshold, 0U))
3004 {
3005 tree expr = fold_build2 (GE_EXPR, boolean_type_node, scalar_loop_iters,
3006 build_int_cst (TREE_TYPE (scalar_loop_iters),
3007 versioning_threshold - 1));
3008 if (cond_expr)
3009 cond_expr = fold_build2 (BIT_AND_EXPR, boolean_type_node,
3010 expr, cond_expr);
3011 else
3012 cond_expr = expr;
3013 }
3014
3015 if (version_niter)
3016 vect_create_cond_for_niters_checks (loop_vinfo, &cond_expr);
3017
3018 if (cond_expr)
3019 cond_expr = force_gimple_operand_1 (unshare_expr (cond_expr),
3020 &cond_expr_stmt_list,
3021 is_gimple_condexpr, NULL_TREE);
3022
3023 if (version_align)
3024 vect_create_cond_for_align_checks (loop_vinfo, &cond_expr,
3025 &cond_expr_stmt_list);
3026
3027 if (version_alias)
3028 {
3029 vect_create_cond_for_unequal_addrs (loop_vinfo, &cond_expr);
3030 vect_create_cond_for_lower_bounds (loop_vinfo, &cond_expr);
3031 vect_create_cond_for_alias_checks (loop_vinfo, &cond_expr);
3032 }
3033
3034 if (version_simd_if_cond)
3035 {
3036 gcc_assert (dom_info_available_p (CDI_DOMINATORS));
3037 if (flag_checking)
3038 if (basic_block bb
3039 = gimple_bb (SSA_NAME_DEF_STMT (version_simd_if_cond)))
3040 gcc_assert (bb != loop->header
3041 && dominated_by_p (CDI_DOMINATORS, loop->header, bb)
3042 && (scalar_loop == NULL
3043 || (bb != scalar_loop->header
3044 && dominated_by_p (CDI_DOMINATORS,
3045 scalar_loop->header, bb))));
3046 tree zero = build_zero_cst (TREE_TYPE (version_simd_if_cond));
3047 tree c = fold_build2 (NE_EXPR, boolean_type_node,
3048 version_simd_if_cond, zero);
3049 if (cond_expr)
3050 cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
3051 c, cond_expr);
3052 else
3053 cond_expr = c;
3054 if (dump_enabled_p ())
3055 dump_printf_loc (MSG_NOTE, vect_location,
3056 "created versioning for simd if condition check.\n");
3057 }
3058
3059 cond_expr = force_gimple_operand_1 (unshare_expr (cond_expr),
3060 &gimplify_stmt_list,
3061 is_gimple_condexpr, NULL_TREE);
3062 gimple_seq_add_seq (&cond_expr_stmt_list, gimplify_stmt_list);
3063
3064 /* Compute the outermost loop cond_expr and cond_expr_stmt_list are
3065 invariant in. */
3066 struct loop *outermost = outermost_invariant_loop_for_expr (loop, cond_expr);
3067 for (gimple_stmt_iterator gsi = gsi_start (cond_expr_stmt_list);
3068 !gsi_end_p (gsi); gsi_next (&gsi))
3069 {
3070 gimple *stmt = gsi_stmt (gsi);
3071 update_stmt (stmt);
3072 ssa_op_iter iter;
3073 use_operand_p use_p;
3074 basic_block def_bb;
3075 FOR_EACH_SSA_USE_OPERAND (use_p, stmt, iter, SSA_OP_USE)
3076 if ((def_bb = gimple_bb (SSA_NAME_DEF_STMT (USE_FROM_PTR (use_p))))
3077 && flow_bb_inside_loop_p (outermost, def_bb))
3078 outermost = superloop_at_depth (loop, bb_loop_depth (def_bb) + 1);
3079 }
3080
3081 /* Search for the outermost loop we can version. Avoid versioning of
3082 non-perfect nests but allow if-conversion versioned loops inside. */
3083 struct loop *loop_to_version = loop;
3084 if (flow_loop_nested_p (outermost, loop))
3085 {
3086 if (dump_enabled_p ())
3087 dump_printf_loc (MSG_NOTE, vect_location,
3088 "trying to apply versioning to outer loop %d\n",
3089 outermost->num);
3090 if (outermost->num == 0)
3091 outermost = superloop_at_depth (loop, 1);
3092 /* And avoid applying versioning on non-perfect nests. */
3093 while (loop_to_version != outermost
3094 && single_exit (loop_outer (loop_to_version))
3095 && (!loop_outer (loop_to_version)->inner->next
3096 || vect_loop_vectorized_call (loop_to_version))
3097 && (!loop_outer (loop_to_version)->inner->next
3098 || !loop_outer (loop_to_version)->inner->next->next))
3099 loop_to_version = loop_outer (loop_to_version);
3100 }
3101
3102 /* Apply versioning. If there is already a scalar version created by
3103 if-conversion re-use that. */
3104 gcond *cond;
3105 if (gimple *call = vect_loop_vectorized_call (loop_to_version, &cond))
3106 {
3107 gcc_assert (scalar_loop);
3108 condition_bb = gimple_bb (cond);
3109 gimple_cond_set_condition_from_tree (cond, cond_expr);
3110 update_stmt (cond);
3111
3112 if (cond_expr_stmt_list)
3113 {
3114 cond_exp_gsi = gsi_for_stmt (call);
3115 gsi_insert_seq_before (&cond_exp_gsi, cond_expr_stmt_list,
3116 GSI_SAME_STMT);
3117 }
3118
3119 /* ??? if-conversion uses profile_probability::always () but
3120 prob below is profile_probability::likely (). */
3121 nloop = scalar_loop;
3122 if (dump_enabled_p ())
3123 dump_printf_loc (MSG_NOTE, vect_location,
3124 "reusing %sloop version created by if conversion\n",
3125 loop_to_version != loop ? "outer " : "");
3126 }
3127 else
3128 {
3129 if (loop_to_version != loop
3130 && dump_enabled_p ())
3131 dump_printf_loc (MSG_NOTE, vect_location,
3132 "applying loop versioning to outer loop %d\n",
3133 loop_to_version->num);
3134
3135 initialize_original_copy_tables ();
3136 nloop = loop_version (loop_to_version, cond_expr, &condition_bb,
3137 prob, prob.invert (), prob, prob.invert (), true);
3138 gcc_assert (nloop);
3139 nloop = get_loop_copy (loop);
3140
3141 /* Kill off IFN_LOOP_VECTORIZED_CALL in the copy; nobody will
3142 reap those otherwise; they also refer to the original
3143 loops. */
3144 struct loop *l = loop;
3145 while (gimple *call = vect_loop_vectorized_call (l))
3146 {
3147 call = SSA_NAME_DEF_STMT (get_current_def (gimple_call_lhs (call)));
3148 fold_loop_internal_call (call, boolean_false_node);
3149 l = loop_outer (l);
3150 }
3151 free_original_copy_tables ();
3152
3153 if (cond_expr_stmt_list)
3154 {
3155 cond_exp_gsi = gsi_last_bb (condition_bb);
3156 gsi_insert_seq_before (&cond_exp_gsi, cond_expr_stmt_list,
3157 GSI_SAME_STMT);
3158 }
3159
3160 /* Loop versioning violates an assumption we try to maintain during
3161 vectorization - that the loop exit block has a single predecessor.
3162 After versioning, the exit block of both loop versions is the same
3163 basic block (i.e. it has two predecessors). Just in order to simplify
3164 following transformations in the vectorizer, we fix this situation
3165 here by adding a new (empty) block on the exit-edge of the loop,
3166 with the proper loop-exit phis to maintain loop-closed-form.
3167 If loop versioning wasn't done from loop, but scalar_loop instead,
3168 merge_bb will already have just a single predecessor.  */
3169
3170 merge_bb = single_exit (loop_to_version)->dest;
3171 if (EDGE_COUNT (merge_bb->preds) >= 2)
3172 {
3173 gcc_assert (EDGE_COUNT (merge_bb->preds) >= 2);
3174 new_exit_bb = split_edge (single_exit (loop_to_version));
3175 new_exit_e = single_exit (loop_to_version);
3176 e = EDGE_SUCC (new_exit_bb, 0);
3177
3178 for (gsi = gsi_start_phis (merge_bb); !gsi_end_p (gsi);
3179 gsi_next (&gsi))
3180 {
3181 tree new_res;
3182 orig_phi = gsi.phi ();
3183 new_res = copy_ssa_name (PHI_RESULT (orig_phi));
3184 new_phi = create_phi_node (new_res, new_exit_bb);
3185 arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, e);
3186 add_phi_arg (new_phi, arg, new_exit_e,
3187 gimple_phi_arg_location_from_edge (orig_phi, e));
3188 adjust_phi_and_debug_stmts (orig_phi, e, PHI_RESULT (new_phi));
3189 }
3190 }
3191
3192 update_ssa (TODO_update_ssa);
3193 }
3194
3195 if (version_niter)
3196 {
3197 /* The versioned loop could be infinite, so we need to clear existing
3198 niter information, which is copied from the original loop.  */
3199 gcc_assert (loop_constraint_set_p (loop, LOOP_C_FINITE));
3200 vect_free_loop_info_assumptions (nloop);
3201 /* And set constraint LOOP_C_INFINITE for niter analyzer. */
3202 loop_constraint_set (loop, LOOP_C_INFINITE);
3203 }
3204
3205 if (LOCATION_LOCUS (vect_location.get_location_t ()) != UNKNOWN_LOCATION
3206 && dump_enabled_p ())
3207 {
3208 if (version_alias)
3209 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS | MSG_PRIORITY_USER_FACING,
3210 vect_location,
3211 "loop versioned for vectorization because of "
3212 "possible aliasing\n");
3213 if (version_align)
3214 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS | MSG_PRIORITY_USER_FACING,
3215 vect_location,
3216 "loop versioned for vectorization to enhance "
3217 "alignment\n");
3218
3219 }
3220
3221 return nloop;
3222 }