gcc/tree-vect-loop-manip.c
1 /* Vectorizer Specific Loop Manipulations
2 Copyright (C) 2003-2020 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "tree.h"
27 #include "gimple.h"
28 #include "cfghooks.h"
29 #include "tree-pass.h"
30 #include "ssa.h"
31 #include "fold-const.h"
32 #include "cfganal.h"
33 #include "gimplify.h"
34 #include "gimple-iterator.h"
35 #include "gimplify-me.h"
36 #include "tree-cfg.h"
37 #include "tree-ssa-loop-manip.h"
38 #include "tree-into-ssa.h"
39 #include "tree-ssa.h"
40 #include "cfgloop.h"
41 #include "tree-scalar-evolution.h"
42 #include "tree-vectorizer.h"
43 #include "tree-ssa-loop-ivopts.h"
44 #include "gimple-fold.h"
45 #include "tree-ssa-loop-niter.h"
46 #include "internal-fn.h"
47 #include "stor-layout.h"
48 #include "optabs-query.h"
49 #include "vec-perm-indices.h"
50 #include "insn-config.h"
51 #include "rtl.h"
52 #include "recog.h"
53
54 /*************************************************************************
55 Simple Loop Peeling Utilities
56
57 Utilities to support loop peeling for vectorization purposes.
58 *************************************************************************/
59
60
61 /* Renames the use *OP_P. */
62
63 static void
64 rename_use_op (use_operand_p op_p)
65 {
66 tree new_name;
67
68 if (TREE_CODE (USE_FROM_PTR (op_p)) != SSA_NAME)
69 return;
70
71 new_name = get_current_def (USE_FROM_PTR (op_p));
72
73 /* Something defined outside of the loop. */
74 if (!new_name)
75 return;
76
77 /* An ordinary ssa name defined in the loop. */
78
79 SET_USE (op_p, new_name);
80 }
81
82
83 /* Renames the variables in basic block BB. Allow renaming of PHI arguments
84 on edges incoming from the outer-loop header if RENAME_FROM_OUTER_LOOP is
85 true. */
86
87 static void
88 rename_variables_in_bb (basic_block bb, bool rename_from_outer_loop)
89 {
90 gimple *stmt;
91 use_operand_p use_p;
92 ssa_op_iter iter;
93 edge e;
94 edge_iterator ei;
95 class loop *loop = bb->loop_father;
96 class loop *outer_loop = NULL;
97
98 if (rename_from_outer_loop)
99 {
100 gcc_assert (loop);
101 outer_loop = loop_outer (loop);
102 }
103
104 for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
105 gsi_next (&gsi))
106 {
107 stmt = gsi_stmt (gsi);
108 FOR_EACH_SSA_USE_OPERAND (use_p, stmt, iter, SSA_OP_ALL_USES)
109 rename_use_op (use_p);
110 }
111
112 FOR_EACH_EDGE (e, ei, bb->preds)
113 {
114 if (!flow_bb_inside_loop_p (loop, e->src))
115 {
116 if (!rename_from_outer_loop)
117 continue;
118 if (e->src != outer_loop->header)
119 {
120 if (outer_loop->inner->next)
121 {
122 /* If outer_loop has 2 inner loops, allow there to
123 be an extra basic block which decides which of the
124 two loops to use using LOOP_VECTORIZED. */
125 if (!single_pred_p (e->src)
126 || single_pred (e->src) != outer_loop->header)
127 continue;
128 }
129 }
130 }
131 for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
132 gsi_next (&gsi))
133 rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (gsi.phi (), e));
134 }
135 }
136
137
138 struct adjust_info
139 {
140 tree from, to;
141 basic_block bb;
142 };
143
144 /* A stack of values to be adjusted in debug stmts. We have to
145 process them LIFO, so that the closest substitution applies. If we
146 processed them FIFO, without the stack, we might substitute uses
147 with a PHI DEF that would soon become non-dominant, and when we got
148 to the suitable one, it wouldn't have anything to substitute any
149 more. */
150 static vec<adjust_info, va_heap> adjust_vec;
151
152 /* Adjust any debug stmts that referenced AI->from values to use the
153 loop-closed AI->to, if the references are dominated by AI->bb and
154 not by the definition of AI->from. */
155
156 static void
157 adjust_debug_stmts_now (adjust_info *ai)
158 {
159 basic_block bbphi = ai->bb;
160 tree orig_def = ai->from;
161 tree new_def = ai->to;
162 imm_use_iterator imm_iter;
163 gimple *stmt;
164 basic_block bbdef = gimple_bb (SSA_NAME_DEF_STMT (orig_def));
165
166 gcc_assert (dom_info_available_p (CDI_DOMINATORS));
167
168 /* Adjust any debug stmts that held onto non-loop-closed
169 references. */
170 FOR_EACH_IMM_USE_STMT (stmt, imm_iter, orig_def)
171 {
172 use_operand_p use_p;
173 basic_block bbuse;
174
175 if (!is_gimple_debug (stmt))
176 continue;
177
178 gcc_assert (gimple_debug_bind_p (stmt));
179
180 bbuse = gimple_bb (stmt);
181
182 if ((bbuse == bbphi
183 || dominated_by_p (CDI_DOMINATORS, bbuse, bbphi))
184 && !(bbuse == bbdef
185 || dominated_by_p (CDI_DOMINATORS, bbuse, bbdef)))
186 {
187 if (new_def)
188 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
189 SET_USE (use_p, new_def);
190 else
191 {
192 gimple_debug_bind_reset_value (stmt);
193 update_stmt (stmt);
194 }
195 }
196 }
197 }
198
199 /* Adjust debug stmts as scheduled before. */
200
201 static void
202 adjust_vec_debug_stmts (void)
203 {
204 if (!MAY_HAVE_DEBUG_BIND_STMTS)
205 return;
206
207 gcc_assert (adjust_vec.exists ());
208
209 while (!adjust_vec.is_empty ())
210 {
211 adjust_debug_stmts_now (&adjust_vec.last ());
212 adjust_vec.pop ();
213 }
214 }
215
216 /* Adjust any debug stmts that referenced FROM values to use the
217 loop-closed TO, if the references are dominated by BB and not by
218 the definition of FROM. If adjust_vec is non-NULL, adjustments
219 will be postponed until adjust_vec_debug_stmts is called. */
220
221 static void
222 adjust_debug_stmts (tree from, tree to, basic_block bb)
223 {
224 adjust_info ai;
225
226 if (MAY_HAVE_DEBUG_BIND_STMTS
227 && TREE_CODE (from) == SSA_NAME
228 && ! SSA_NAME_IS_DEFAULT_DEF (from)
229 && ! virtual_operand_p (from))
230 {
231 ai.from = from;
232 ai.to = to;
233 ai.bb = bb;
234
235 if (adjust_vec.exists ())
236 adjust_vec.safe_push (ai);
237 else
238 adjust_debug_stmts_now (&ai);
239 }
240 }
241
242 /* Change E's phi arg in UPDATE_PHI to NEW_DEF, and record information
243 to adjust any debug stmts that referenced the old phi arg,
244 presumably non-loop-closed references left over from other
245 transformations. */
246
247 static void
248 adjust_phi_and_debug_stmts (gimple *update_phi, edge e, tree new_def)
249 {
250 tree orig_def = PHI_ARG_DEF_FROM_EDGE (update_phi, e);
251
252 SET_PHI_ARG_DEF (update_phi, e->dest_idx, new_def);
253
254 if (MAY_HAVE_DEBUG_BIND_STMTS)
255 adjust_debug_stmts (orig_def, PHI_RESULT (update_phi),
256 gimple_bb (update_phi));
257 }
258
259 /* Define one loop mask MASK from loop LOOP. INIT_MASK is the value that
260 the mask should have during the first iteration and NEXT_MASK is the
261 value that it should have on subsequent iterations. */
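/* Illustrative sketch (not literal vectorizer output): the effect is a
single PHI in the loop header such as

mask = PHI <init_mask (preheader), next_mask (latch)>

so the first vector iteration uses INIT_MASK and every subsequent
iteration uses the NEXT_MASK produced by the previous one.  */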
262
263 static void
264 vect_set_loop_mask (class loop *loop, tree mask, tree init_mask,
265 tree next_mask)
266 {
267 gphi *phi = create_phi_node (mask, loop->header);
268 add_phi_arg (phi, init_mask, loop_preheader_edge (loop), UNKNOWN_LOCATION);
269 add_phi_arg (phi, next_mask, loop_latch_edge (loop), UNKNOWN_LOCATION);
270 }
271
272 /* Add SEQ to the end of LOOP's preheader block. */
273
274 static void
275 add_preheader_seq (class loop *loop, gimple_seq seq)
276 {
277 if (seq)
278 {
279 edge pe = loop_preheader_edge (loop);
280 basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
281 gcc_assert (!new_bb);
282 }
283 }
284
285 /* Add SEQ to the beginning of LOOP's header block. */
286
287 static void
288 add_header_seq (class loop *loop, gimple_seq seq)
289 {
290 if (seq)
291 {
292 gimple_stmt_iterator gsi = gsi_after_labels (loop->header);
293 gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
294 }
295 }
296
297 /* Return true if the target can interleave elements of two vectors.
298 OFFSET is 0 if the first half of the vectors should be interleaved
299 or 1 if the second half should. When returning true, store the
300 associated permutation in INDICES. */
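/* Worked example (hypothetical 8-element vectors): with NELTS == 8 and
OFFSET == 0 the base is 0 and the selector pushed below starts as

{ 0, 8, 1, 9, 2, 10, ... }

i.e. elements of the two inputs alternate; vec_perm_builder encodes this
as 2 interleaved series of 3 explicit elements each.  With OFFSET == 1
the base is 4 and the selector starts { 4, 12, 5, 13, 6, 14, ... }.  */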
301
302 static bool
303 interleave_supported_p (vec_perm_indices *indices, tree vectype,
304 unsigned int offset)
305 {
306 poly_uint64 nelts = TYPE_VECTOR_SUBPARTS (vectype);
307 poly_uint64 base = exact_div (nelts, 2) * offset;
308 vec_perm_builder sel (nelts, 2, 3);
309 for (unsigned int i = 0; i < 3; ++i)
310 {
311 sel.quick_push (base + i);
312 sel.quick_push (base + i + nelts);
313 }
314 indices->new_vector (sel, 2, nelts);
315 return can_vec_perm_const_p (TYPE_MODE (vectype), *indices);
316 }
317
318 /* Try to use permutes to define the masks in DEST_RGM using the masks
319 in SRC_RGM, given that the former has twice as many masks as the
320 latter. Return true on success, adding any new statements to SEQ. */
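/* Illustrative example (assumed rgroup layout, not tied to a particular
target): if SRC_RGM has 2 masks each covering 16 scalars and DEST_RGM
has 4 masks each covering 8 scalars, then dest masks 0 and 1 are both
derived from src mask 0 and dest masks 2 and 3 from src mask 1, either
by VEC_UNPACK_LO/HI_EXPR or by the two interleaving VEC_PERM_EXPRs that
this function builds.  */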
321
322 static bool
323 vect_maybe_permute_loop_masks (gimple_seq *seq, rgroup_masks *dest_rgm,
324 rgroup_masks *src_rgm)
325 {
326 tree src_masktype = src_rgm->mask_type;
327 tree dest_masktype = dest_rgm->mask_type;
328 machine_mode src_mode = TYPE_MODE (src_masktype);
329 insn_code icode1, icode2;
330 if (dest_rgm->max_nscalars_per_iter <= src_rgm->max_nscalars_per_iter
331 && (icode1 = optab_handler (vec_unpacku_hi_optab,
332 src_mode)) != CODE_FOR_nothing
333 && (icode2 = optab_handler (vec_unpacku_lo_optab,
334 src_mode)) != CODE_FOR_nothing)
335 {
336 /* Unpacking the source masks gives at least as many mask bits as
337 we need. We can then VIEW_CONVERT any excess bits away. */
338 machine_mode dest_mode = insn_data[icode1].operand[0].mode;
339 gcc_assert (dest_mode == insn_data[icode2].operand[0].mode);
340 tree unpack_masktype = vect_halve_mask_nunits (src_masktype, dest_mode);
341 for (unsigned int i = 0; i < dest_rgm->masks.length (); ++i)
342 {
343 tree src = src_rgm->masks[i / 2];
344 tree dest = dest_rgm->masks[i];
345 tree_code code = ((i & 1) == (BYTES_BIG_ENDIAN ? 0 : 1)
346 ? VEC_UNPACK_HI_EXPR
347 : VEC_UNPACK_LO_EXPR);
348 gassign *stmt;
349 if (dest_masktype == unpack_masktype)
350 stmt = gimple_build_assign (dest, code, src);
351 else
352 {
353 tree temp = make_ssa_name (unpack_masktype);
354 stmt = gimple_build_assign (temp, code, src);
355 gimple_seq_add_stmt (seq, stmt);
356 stmt = gimple_build_assign (dest, VIEW_CONVERT_EXPR,
357 build1 (VIEW_CONVERT_EXPR,
358 dest_masktype, temp));
359 }
360 gimple_seq_add_stmt (seq, stmt);
361 }
362 return true;
363 }
364 vec_perm_indices indices[2];
365 if (dest_masktype == src_masktype
366 && interleave_supported_p (&indices[0], src_masktype, 0)
367 && interleave_supported_p (&indices[1], src_masktype, 1))
368 {
369 /* The destination requires twice as many mask bits as the source, so
370 we can use interleaving permutes to double up the number of bits. */
371 tree masks[2];
372 for (unsigned int i = 0; i < 2; ++i)
373 masks[i] = vect_gen_perm_mask_checked (src_masktype, indices[i]);
374 for (unsigned int i = 0; i < dest_rgm->masks.length (); ++i)
375 {
376 tree src = src_rgm->masks[i / 2];
377 tree dest = dest_rgm->masks[i];
378 gimple *stmt = gimple_build_assign (dest, VEC_PERM_EXPR,
379 src, src, masks[i & 1]);
380 gimple_seq_add_stmt (seq, stmt);
381 }
382 return true;
383 }
384 return false;
385 }
386
387 /* Helper for vect_set_loop_condition_masked. Generate definitions for
388 all the masks in RGM and return a mask that is nonzero when the loop
389 needs to iterate. Add any new preheader statements to PREHEADER_SEQ.
390 Use LOOP_COND_GSI to insert code before the exit gcond.
391
392 RGM belongs to loop LOOP. The loop originally iterated NITERS
393 times and has been vectorized according to LOOP_VINFO.
394
395 If NITERS_SKIP is nonnull, the first iteration of the vectorized loop
396 starts with NITERS_SKIP dummy iterations of the scalar loop before
397 the real work starts. The mask elements for these dummy iterations
398 must be 0, to ensure that the extra iterations do not have an effect.
399
400 It is known that:
401
402 NITERS * RGM->max_nscalars_per_iter
403
404 does not overflow. However, MIGHT_WRAP_P says whether an induction
405 variable that starts at 0 and has step:
406
407 VF * RGM->max_nscalars_per_iter
408
409 might overflow before hitting a value above:
410
411 (NITERS + NITERS_SKIP) * RGM->max_nscalars_per_iter
412
413 This means that we cannot guarantee that such an induction variable
414 would ever hit a value that produces a set of all-false masks for RGM. */
415
416 static tree
417 vect_set_loop_masks_directly (class loop *loop, loop_vec_info loop_vinfo,
418 gimple_seq *preheader_seq,
419 gimple_stmt_iterator loop_cond_gsi,
420 rgroup_masks *rgm, tree niters, tree niters_skip,
421 bool might_wrap_p)
422 {
423 tree compare_type = LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo);
424 tree iv_type = LOOP_VINFO_MASK_IV_TYPE (loop_vinfo);
425 tree mask_type = rgm->mask_type;
426 unsigned int nscalars_per_iter = rgm->max_nscalars_per_iter;
427 poly_uint64 nscalars_per_mask = TYPE_VECTOR_SUBPARTS (mask_type);
428 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
429
430 /* Calculate the maximum number of scalar values that the rgroup
431 handles in total, the number that it handles for each iteration
432 of the vector loop, and the number that it should skip during the
433 first iteration of the vector loop. */
434 tree nscalars_total = niters;
435 tree nscalars_step = build_int_cst (iv_type, vf);
436 tree nscalars_skip = niters_skip;
437 if (nscalars_per_iter != 1)
438 {
439 /* We checked before choosing to use a fully-masked loop that these
440 multiplications don't overflow. */
441 tree compare_factor = build_int_cst (compare_type, nscalars_per_iter);
442 tree iv_factor = build_int_cst (iv_type, nscalars_per_iter);
443 nscalars_total = gimple_build (preheader_seq, MULT_EXPR, compare_type,
444 nscalars_total, compare_factor);
445 nscalars_step = gimple_build (preheader_seq, MULT_EXPR, iv_type,
446 nscalars_step, iv_factor);
447 if (nscalars_skip)
448 nscalars_skip = gimple_build (preheader_seq, MULT_EXPR, compare_type,
449 nscalars_skip, compare_factor);
450 }
451
452 /* Create an induction variable that counts the number of scalars
453 processed. */
454 tree index_before_incr, index_after_incr;
455 gimple_stmt_iterator incr_gsi;
456 bool insert_after;
457 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
458 create_iv (build_int_cst (iv_type, 0), nscalars_step, NULL_TREE, loop,
459 &incr_gsi, insert_after, &index_before_incr, &index_after_incr);
460
461 tree zero_index = build_int_cst (compare_type, 0);
462 tree test_index, test_limit, first_limit;
463 gimple_stmt_iterator *test_gsi;
464 if (might_wrap_p)
465 {
466 /* In principle the loop should stop iterating once the incremented
467 IV reaches a value greater than or equal to:
468
469 NSCALARS_TOTAL +[infinite-prec] NSCALARS_SKIP
470
471 However, there's no guarantee that this addition doesn't overflow
472 the comparison type, or that the IV hits a value above it before
473 wrapping around. We therefore adjust the limit down by one
474 IV step:
475
476 (NSCALARS_TOTAL +[infinite-prec] NSCALARS_SKIP)
477 -[infinite-prec] NSCALARS_STEP
478
479 and compare the IV against this limit _before_ incrementing it.
480 Since the comparison type is unsigned, we actually want the
481 subtraction to saturate at zero:
482
483 (NSCALARS_TOTAL +[infinite-prec] NSCALARS_SKIP)
484 -[sat] NSCALARS_STEP
485
486 And since NSCALARS_SKIP < NSCALARS_STEP, we can reassociate this as:
487
488 NSCALARS_TOTAL -[sat] (NSCALARS_STEP - NSCALARS_SKIP)
489
490 where the rightmost subtraction can be done directly in
491 COMPARE_TYPE. */
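/* Numeric sketch with made-up values: for a 16-bit COMPARE_TYPE with
NSCALARS_TOTAL == 65530, NSCALARS_SKIP == 2 and NSCALARS_STEP == 8,
the infinite-precision limit would be (65530 + 2) - 8 == 65524; the
MAX_EXPR/MINUS_EXPR pair built below computes
MAX (65530, 8 - 2) - (8 - 2) == 65524 without overflowing.  */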
492 test_index = index_before_incr;
493 tree adjust = gimple_convert (preheader_seq, compare_type,
494 nscalars_step);
495 if (nscalars_skip)
496 adjust = gimple_build (preheader_seq, MINUS_EXPR, compare_type,
497 adjust, nscalars_skip);
498 test_limit = gimple_build (preheader_seq, MAX_EXPR, compare_type,
499 nscalars_total, adjust);
500 test_limit = gimple_build (preheader_seq, MINUS_EXPR, compare_type,
501 test_limit, adjust);
502 test_gsi = &incr_gsi;
503
504 /* Get a safe limit for the first iteration. */
505 if (nscalars_skip)
506 {
507 /* The first vector iteration can handle at most NSCALARS_STEP
508 scalars. NSCALARS_STEP <= CONST_LIMIT, and adding
509 NSCALARS_SKIP to that cannot overflow. */
510 tree const_limit = build_int_cst (compare_type,
511 LOOP_VINFO_VECT_FACTOR (loop_vinfo)
512 * nscalars_per_iter);
513 first_limit = gimple_build (preheader_seq, MIN_EXPR, compare_type,
514 nscalars_total, const_limit);
515 first_limit = gimple_build (preheader_seq, PLUS_EXPR, compare_type,
516 first_limit, nscalars_skip);
517 }
518 else
519 /* For the first iteration it doesn't matter whether the IV hits
520 a value above NSCALARS_TOTAL. That only matters for the latch
521 condition. */
522 first_limit = nscalars_total;
523 }
524 else
525 {
526 /* Test the incremented IV, which will always hit a value above
527 the bound before wrapping. */
528 test_index = index_after_incr;
529 test_limit = nscalars_total;
530 if (nscalars_skip)
531 test_limit = gimple_build (preheader_seq, PLUS_EXPR, compare_type,
532 test_limit, nscalars_skip);
533 test_gsi = &loop_cond_gsi;
534
535 first_limit = test_limit;
536 }
537
538 /* Convert the IV value to the comparison type (either a no-op or
539 a demotion). */
540 gimple_seq test_seq = NULL;
541 test_index = gimple_convert (&test_seq, compare_type, test_index);
542 gsi_insert_seq_before (test_gsi, test_seq, GSI_SAME_STMT);
543
544 /* Provide a definition of each mask in the group. */
545 tree next_mask = NULL_TREE;
546 tree mask;
547 unsigned int i;
548 FOR_EACH_VEC_ELT_REVERSE (rgm->masks, i, mask)
549 {
550 /* Previous masks will cover BIAS scalars. This mask covers the
551 next batch. */
552 poly_uint64 bias = nscalars_per_mask * i;
553 tree bias_tree = build_int_cst (compare_type, bias);
554 gimple *tmp_stmt;
555
556 /* See whether the first iteration of the vector loop is known
557 to have a full mask. */
558 poly_uint64 const_limit;
559 bool first_iteration_full
560 = (poly_int_tree_p (first_limit, &const_limit)
561 && known_ge (const_limit, (i + 1) * nscalars_per_mask));
562
563 /* Rather than have a new IV that starts at BIAS and goes up to
564 TEST_LIMIT, prefer to use the same 0-based IV for each mask
565 and adjust the bound down by BIAS. */
566 tree this_test_limit = test_limit;
567 if (i != 0)
568 {
569 this_test_limit = gimple_build (preheader_seq, MAX_EXPR,
570 compare_type, this_test_limit,
571 bias_tree);
572 this_test_limit = gimple_build (preheader_seq, MINUS_EXPR,
573 compare_type, this_test_limit,
574 bias_tree);
575 }
576
577 /* Create the initial mask. First include all scalars that
578 are within the loop limit. */
579 tree init_mask = NULL_TREE;
580 if (!first_iteration_full)
581 {
582 tree start, end;
583 if (first_limit == test_limit)
584 {
585 /* Use a natural test between zero (the initial IV value)
586 and the loop limit. The "else" block would be valid too,
587 but this choice can avoid the need to load BIAS_TREE into
588 a register. */
589 start = zero_index;
590 end = this_test_limit;
591 }
592 else
593 {
594 /* FIRST_LIMIT is the maximum number of scalars handled by the
595 first iteration of the vector loop. Test the portion
596 associated with this mask. */
597 start = bias_tree;
598 end = first_limit;
599 }
600
601 init_mask = make_temp_ssa_name (mask_type, NULL, "max_mask");
602 tmp_stmt = vect_gen_while (init_mask, start, end);
603 gimple_seq_add_stmt (preheader_seq, tmp_stmt);
604 }
605
606 /* Now AND out the bits that are within the number of skipped
607 scalars. */
608 poly_uint64 const_skip;
609 if (nscalars_skip
610 && !(poly_int_tree_p (nscalars_skip, &const_skip)
611 && known_le (const_skip, bias)))
612 {
613 tree unskipped_mask = vect_gen_while_not (preheader_seq, mask_type,
614 bias_tree, nscalars_skip);
615 if (init_mask)
616 init_mask = gimple_build (preheader_seq, BIT_AND_EXPR, mask_type,
617 init_mask, unskipped_mask);
618 else
619 init_mask = unskipped_mask;
620 }
621
622 if (!init_mask)
623 /* First iteration is full. */
624 init_mask = build_minus_one_cst (mask_type);
625
626 /* Get the mask value for the next iteration of the loop. */
627 next_mask = make_temp_ssa_name (mask_type, NULL, "next_mask");
628 gcall *call = vect_gen_while (next_mask, test_index, this_test_limit);
629 gsi_insert_before (test_gsi, call, GSI_SAME_STMT);
630
631 vect_set_loop_mask (loop, mask, init_mask, next_mask);
632 }
633 return next_mask;
634 }
635
636 /* Make LOOP iterate NITERS times using masking and WHILE_ULT calls.
637 LOOP_VINFO describes the vectorization of LOOP. NITERS is the
638 number of iterations of the original scalar loop that should be
639 handled by the vector loop. NITERS_MAYBE_ZERO and FINAL_IV are
640 as for vect_set_loop_condition.
641
642 Insert the branch-back condition before LOOP_COND_GSI and return the
643 final gcond. */
644
645 static gcond *
646 vect_set_loop_condition_masked (class loop *loop, loop_vec_info loop_vinfo,
647 tree niters, tree final_iv,
648 bool niters_maybe_zero,
649 gimple_stmt_iterator loop_cond_gsi)
650 {
651 gimple_seq preheader_seq = NULL;
652 gimple_seq header_seq = NULL;
653
654 tree compare_type = LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo);
655 unsigned int compare_precision = TYPE_PRECISION (compare_type);
656 tree orig_niters = niters;
657
658 /* Type of the initial value of NITERS. */
659 tree ni_actual_type = TREE_TYPE (niters);
660 unsigned int ni_actual_precision = TYPE_PRECISION (ni_actual_type);
661 tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
662
663 /* Convert NITERS to the same size as the compare. */
664 if (compare_precision > ni_actual_precision
665 && niters_maybe_zero)
666 {
667 /* We know that there is always at least one iteration, so if the
668 count is zero then it must have wrapped. Cope with this by
669 subtracting 1 before the conversion and adding 1 to the result. */
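/* For example (hypothetical types): if NITERS has an unsigned 8-bit
type and is 0 because the true count of 256 wrapped, then 0 - 1 == 255
in that type, widening to a 32-bit compare type keeps 255, and adding
1 back gives the intended 256.  */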
670 gcc_assert (TYPE_UNSIGNED (ni_actual_type));
671 niters = gimple_build (&preheader_seq, PLUS_EXPR, ni_actual_type,
672 niters, build_minus_one_cst (ni_actual_type));
673 niters = gimple_convert (&preheader_seq, compare_type, niters);
674 niters = gimple_build (&preheader_seq, PLUS_EXPR, compare_type,
675 niters, build_one_cst (compare_type));
676 }
677 else
678 niters = gimple_convert (&preheader_seq, compare_type, niters);
679
680 widest_int iv_limit = vect_iv_limit_for_full_masking (loop_vinfo);
681
682 /* Iterate over all the rgroups and fill in their masks. We could use
683 the first mask from any rgroup for the loop condition; here we
684 arbitrarily pick the last. */
685 tree test_mask = NULL_TREE;
686 rgroup_masks *rgm;
687 unsigned int i;
688 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
689 FOR_EACH_VEC_ELT (*masks, i, rgm)
690 if (!rgm->masks.is_empty ())
691 {
692 /* First try using permutes. This adds a single vector
693 instruction to the loop for each mask, but needs no extra
694 loop invariants or IVs. */
695 unsigned int nmasks = i + 1;
696 if ((nmasks & 1) == 0)
697 {
698 rgroup_masks *half_rgm = &(*masks)[nmasks / 2 - 1];
699 if (!half_rgm->masks.is_empty ()
700 && vect_maybe_permute_loop_masks (&header_seq, rgm, half_rgm))
701 continue;
702 }
703
704 /* See whether zero-based IV would ever generate all-false masks
705 before wrapping around. */
706 bool might_wrap_p
707 = (iv_limit == -1
708 || (wi::min_precision (iv_limit * rgm->max_nscalars_per_iter,
709 UNSIGNED)
710 > compare_precision));
711
712 /* Set up all masks for this group. */
713 test_mask = vect_set_loop_masks_directly (loop, loop_vinfo,
714 &preheader_seq,
715 loop_cond_gsi, rgm,
716 niters, niters_skip,
717 might_wrap_p);
718 }
719
720 /* Emit all accumulated statements. */
721 add_preheader_seq (loop, preheader_seq);
722 add_header_seq (loop, header_seq);
723
724 /* Get a boolean result that tells us whether to iterate. */
725 edge exit_edge = single_exit (loop);
726 tree_code code = (exit_edge->flags & EDGE_TRUE_VALUE) ? EQ_EXPR : NE_EXPR;
727 tree zero_mask = build_zero_cst (TREE_TYPE (test_mask));
728 gcond *cond_stmt = gimple_build_cond (code, test_mask, zero_mask,
729 NULL_TREE, NULL_TREE);
730 gsi_insert_before (&loop_cond_gsi, cond_stmt, GSI_SAME_STMT);
731
732 /* The loop iterates (NITERS - 1) / VF + 1 times.
733 Subtract one from this to get the latch count. */
734 tree step = build_int_cst (compare_type,
735 LOOP_VINFO_VECT_FACTOR (loop_vinfo));
736 tree niters_minus_one = fold_build2 (PLUS_EXPR, compare_type, niters,
737 build_minus_one_cst (compare_type));
738 loop->nb_iterations = fold_build2 (TRUNC_DIV_EXPR, compare_type,
739 niters_minus_one, step);
740
741 if (final_iv)
742 {
743 gassign *assign = gimple_build_assign (final_iv, orig_niters);
744 gsi_insert_on_edge_immediate (single_exit (loop), assign);
745 }
746
747 return cond_stmt;
748 }
749
750 /* Like vect_set_loop_condition, but handle the case in which there
751 are no loop masks. */
752
753 static gcond *
754 vect_set_loop_condition_unmasked (class loop *loop, tree niters,
755 tree step, tree final_iv,
756 bool niters_maybe_zero,
757 gimple_stmt_iterator loop_cond_gsi)
758 {
759 tree indx_before_incr, indx_after_incr;
760 gcond *cond_stmt;
761 gcond *orig_cond;
762 edge pe = loop_preheader_edge (loop);
763 edge exit_edge = single_exit (loop);
764 gimple_stmt_iterator incr_gsi;
765 bool insert_after;
766 enum tree_code code;
767 tree niters_type = TREE_TYPE (niters);
768
769 orig_cond = get_loop_exit_condition (loop);
770 gcc_assert (orig_cond);
771 loop_cond_gsi = gsi_for_stmt (orig_cond);
772
773 tree init, limit;
774 if (!niters_maybe_zero && integer_onep (step))
775 {
776 /* In this case we can use a simple 0-based IV:
777
778 A:
779 x = 0;
780 do
781 {
782 ...
783 x += 1;
784 }
785 while (x < NITERS); */
786 code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GE_EXPR : LT_EXPR;
787 init = build_zero_cst (niters_type);
788 limit = niters;
789 }
790 else
791 {
792 /* The following works for all values of NITERS except 0:
793
794 B:
795 x = 0;
796 do
797 {
798 ...
799 x += STEP;
800 }
801 while (x <= NITERS - STEP);
802
803 so that the loop continues to iterate if x + STEP - 1 < NITERS
804 but stops if x + STEP - 1 >= NITERS.
805
806 However, if NITERS is zero, x never hits a value above NITERS - STEP
807 before wrapping around. There are two obvious ways of dealing with
808 this:
809
810 - start at STEP - 1 and compare x before incrementing it
811 - start at -1 and compare x after incrementing it
812
813 The latter is simpler and is what we use. The loop in this case
814 looks like:
815
816 C:
817 x = -1;
818 do
819 {
820 ...
821 x += STEP;
822 }
823 while (x < NITERS - STEP);
824
825 In both cases the loop limit is NITERS - STEP. */
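/* Worked example with illustrative numbers: for a 32-bit unsigned
NITERS and STEP == 4, case B with NITERS == 12 uses limit 8 and runs
the body for x == 0, 4 and 8, i.e. 12 / 4 == 3 times.  Case C with
NITERS == 0 uses limit 0xfffffffc and starts at x == 0xffffffff, so
the body runs (1 << 32) / 4 times before x reaches the limit.  */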
826 gimple_seq seq = NULL;
827 limit = force_gimple_operand (niters, &seq, true, NULL_TREE);
828 limit = gimple_build (&seq, MINUS_EXPR, TREE_TYPE (limit), limit, step);
829 if (seq)
830 {
831 basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
832 gcc_assert (!new_bb);
833 }
834 if (niters_maybe_zero)
835 {
836 /* Case C. */
837 code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GE_EXPR : LT_EXPR;
838 init = build_all_ones_cst (niters_type);
839 }
840 else
841 {
842 /* Case B. */
843 code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GT_EXPR : LE_EXPR;
844 init = build_zero_cst (niters_type);
845 }
846 }
847
848 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
849 create_iv (init, step, NULL_TREE, loop,
850 &incr_gsi, insert_after, &indx_before_incr, &indx_after_incr);
851 indx_after_incr = force_gimple_operand_gsi (&loop_cond_gsi, indx_after_incr,
852 true, NULL_TREE, true,
853 GSI_SAME_STMT);
854 limit = force_gimple_operand_gsi (&loop_cond_gsi, limit, true, NULL_TREE,
855 true, GSI_SAME_STMT);
856
857 cond_stmt = gimple_build_cond (code, indx_after_incr, limit, NULL_TREE,
858 NULL_TREE);
859
860 gsi_insert_before (&loop_cond_gsi, cond_stmt, GSI_SAME_STMT);
861
862 /* Record the number of latch iterations. */
863 if (limit == niters)
864 /* Case A: the loop iterates NITERS times. Subtract one to get the
865 latch count. */
866 loop->nb_iterations = fold_build2 (MINUS_EXPR, niters_type, niters,
867 build_int_cst (niters_type, 1));
868 else
869 /* Case B or C: the loop iterates (NITERS - STEP) / STEP + 1 times.
870 Subtract one from this to get the latch count. */
871 loop->nb_iterations = fold_build2 (TRUNC_DIV_EXPR, niters_type,
872 limit, step);
873
874 if (final_iv)
875 {
876 gassign *assign = gimple_build_assign (final_iv, MINUS_EXPR,
877 indx_after_incr, init);
878 gsi_insert_on_edge_immediate (single_exit (loop), assign);
879 }
880
881 return cond_stmt;
882 }
883
884 /* If we're using fully-masked loops, make LOOP iterate:
885
886 N == (NITERS - 1) / STEP + 1
887
888 times. When NITERS is zero, this is equivalent to making the loop
889 execute (1 << M) / STEP times, where M is the precision of NITERS.
890 NITERS_MAYBE_ZERO is true if this last case might occur.
891
892 If we're not using fully-masked loops, make LOOP iterate:
893
894 N == (NITERS - STEP) / STEP + 1
895
896 times, where NITERS is known to be outside the range [1, STEP - 1].
897 This is equivalent to making the loop execute NITERS / STEP times
898 when NITERS is nonzero and (1 << M) / STEP times otherwise.
899 NITERS_MAYBE_ZERO again indicates whether this last case might occur.
900
901 If FINAL_IV is nonnull, it is an SSA name that should be set to
902 N * STEP on exit from the loop.
903
904 Assumption: the exit-condition of LOOP is the last stmt in the loop. */
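/* Illustrative numbers: with STEP == 4, a fully-masked loop with
NITERS == 10 executes (10 - 1) / 4 + 1 == 3 vector iterations, the
last under a partial mask; without masking NITERS is known to be 0 or
>= STEP, so e.g. NITERS == 12 gives (12 - 4) / 4 + 1 == 3 full
iterations.  */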
905
906 void
907 vect_set_loop_condition (class loop *loop, loop_vec_info loop_vinfo,
908 tree niters, tree step, tree final_iv,
909 bool niters_maybe_zero)
910 {
911 gcond *cond_stmt;
912 gcond *orig_cond = get_loop_exit_condition (loop);
913 gimple_stmt_iterator loop_cond_gsi = gsi_for_stmt (orig_cond);
914
915 if (loop_vinfo && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
916 cond_stmt = vect_set_loop_condition_masked (loop, loop_vinfo, niters,
917 final_iv, niters_maybe_zero,
918 loop_cond_gsi);
919 else
920 cond_stmt = vect_set_loop_condition_unmasked (loop, niters, step,
921 final_iv, niters_maybe_zero,
922 loop_cond_gsi);
923
924 /* Remove old loop exit test. */
925 stmt_vec_info orig_cond_info;
926 if (loop_vinfo
927 && (orig_cond_info = loop_vinfo->lookup_stmt (orig_cond)))
928 loop_vinfo->remove_stmt (orig_cond_info);
929 else
930 gsi_remove (&loop_cond_gsi, true);
931
932 if (dump_enabled_p ())
933 dump_printf_loc (MSG_NOTE, vect_location, "New loop exit condition: %G",
934 cond_stmt);
935 }
936
937 /* Helper routine of slpeel_tree_duplicate_loop_to_edge_cfg.
938 For all PHI arguments in FROM->dest and TO->dest coming from those
939 edges, ensure that the current_def of each TO->dest PHI argument is
940 set to that of the corresponding FROM->dest argument. */
941
942 static void
943 slpeel_duplicate_current_defs_from_edges (edge from, edge to)
944 {
945 gimple_stmt_iterator gsi_from, gsi_to;
946
947 for (gsi_from = gsi_start_phis (from->dest),
948 gsi_to = gsi_start_phis (to->dest);
949 !gsi_end_p (gsi_from) && !gsi_end_p (gsi_to);)
950 {
951 gimple *from_phi = gsi_stmt (gsi_from);
952 gimple *to_phi = gsi_stmt (gsi_to);
953 tree from_arg = PHI_ARG_DEF_FROM_EDGE (from_phi, from);
954 tree to_arg = PHI_ARG_DEF_FROM_EDGE (to_phi, to);
955 if (virtual_operand_p (from_arg))
956 {
957 gsi_next (&gsi_from);
958 continue;
959 }
960 if (virtual_operand_p (to_arg))
961 {
962 gsi_next (&gsi_to);
963 continue;
964 }
965 if (TREE_CODE (from_arg) != SSA_NAME)
966 gcc_assert (operand_equal_p (from_arg, to_arg, 0));
967 else if (TREE_CODE (to_arg) == SSA_NAME
968 && from_arg != to_arg)
969 {
970 if (get_current_def (to_arg) == NULL_TREE)
971 {
972 gcc_assert (types_compatible_p (TREE_TYPE (to_arg),
973 TREE_TYPE (get_current_def
974 (from_arg))));
975 set_current_def (to_arg, get_current_def (from_arg));
976 }
977 }
978 gsi_next (&gsi_from);
979 gsi_next (&gsi_to);
980 }
981
982 gphi *from_phi = get_virtual_phi (from->dest);
983 gphi *to_phi = get_virtual_phi (to->dest);
984 if (from_phi)
985 set_current_def (PHI_ARG_DEF_FROM_EDGE (to_phi, to),
986 get_current_def (PHI_ARG_DEF_FROM_EDGE (from_phi, from)));
987 }
988
989
990 /* Given LOOP this function generates a new copy of it and puts it
991 on E which is either the entry or exit of LOOP. If SCALAR_LOOP is
992 non-NULL, assume LOOP and SCALAR_LOOP are equivalent and copy the
993 basic blocks from SCALAR_LOOP instead of LOOP, but to either the
994 entry or exit of LOOP. */
995
996 class loop *
997 slpeel_tree_duplicate_loop_to_edge_cfg (class loop *loop,
998 class loop *scalar_loop, edge e)
999 {
1000 class loop *new_loop;
1001 basic_block *new_bbs, *bbs, *pbbs;
1002 bool at_exit;
1003 bool was_imm_dom;
1004 basic_block exit_dest;
1005 edge exit, new_exit;
1006 bool duplicate_outer_loop = false;
1007
1008 exit = single_exit (loop);
1009 at_exit = (e == exit);
1010 if (!at_exit && e != loop_preheader_edge (loop))
1011 return NULL;
1012
1013 if (scalar_loop == NULL)
1014 scalar_loop = loop;
1015
1016 bbs = XNEWVEC (basic_block, scalar_loop->num_nodes + 1);
1017 pbbs = bbs + 1;
1018 get_loop_body_with_size (scalar_loop, pbbs, scalar_loop->num_nodes);
1019 /* Allow duplication of outer loops. */
1020 if (scalar_loop->inner)
1021 duplicate_outer_loop = true;
1022 /* Check whether duplication is possible. */
1023 if (!can_copy_bbs_p (pbbs, scalar_loop->num_nodes))
1024 {
1025 free (bbs);
1026 return NULL;
1027 }
1028
1029 /* Generate new loop structure. */
1030 new_loop = duplicate_loop (scalar_loop, loop_outer (scalar_loop));
1031 duplicate_subloops (scalar_loop, new_loop);
1032
1033 exit_dest = exit->dest;
1034 was_imm_dom = (get_immediate_dominator (CDI_DOMINATORS,
1035 exit_dest) == loop->header ?
1036 true : false);
1037
1038 /* Also copy the pre-header, this avoids jumping through hoops to
1039 duplicate the loop entry PHI arguments. Create an empty
1040 pre-header unconditionally for this. */
1041 basic_block preheader = split_edge (loop_preheader_edge (scalar_loop));
1042 edge entry_e = single_pred_edge (preheader);
1043 bbs[0] = preheader;
1044 new_bbs = XNEWVEC (basic_block, scalar_loop->num_nodes + 1);
1045
1046 exit = single_exit (scalar_loop);
1047 copy_bbs (bbs, scalar_loop->num_nodes + 1, new_bbs,
1048 &exit, 1, &new_exit, NULL,
1049 at_exit ? loop->latch : e->src, true);
1050 exit = single_exit (loop);
1051 basic_block new_preheader = new_bbs[0];
1052
1053 add_phi_args_after_copy (new_bbs, scalar_loop->num_nodes + 1, NULL);
1054
1055 if (scalar_loop != loop)
1056 {
1057 /* If we copied from SCALAR_LOOP rather than LOOP, SSA_NAMEs from
1058 SCALAR_LOOP will have current_def set to SSA_NAMEs in the new_loop,
1059 but LOOP will not. slpeel_update_phi_nodes_for_guard{1,2} expects
1060 the LOOP SSA_NAMEs (on the exit edge and edge from latch to
1061 header) to have current_def set, so copy them over. */
1062 slpeel_duplicate_current_defs_from_edges (single_exit (scalar_loop),
1063 exit);
1064 slpeel_duplicate_current_defs_from_edges (EDGE_SUCC (scalar_loop->latch,
1065 0),
1066 EDGE_SUCC (loop->latch, 0));
1067 }
1068
1069 if (at_exit) /* Add the loop copy at exit. */
1070 {
1071 if (scalar_loop != loop)
1072 {
1073 gphi_iterator gsi;
1074 new_exit = redirect_edge_and_branch (new_exit, exit_dest);
1075
1076 for (gsi = gsi_start_phis (exit_dest); !gsi_end_p (gsi);
1077 gsi_next (&gsi))
1078 {
1079 gphi *phi = gsi.phi ();
1080 tree orig_arg = PHI_ARG_DEF_FROM_EDGE (phi, e);
1081 location_t orig_locus
1082 = gimple_phi_arg_location_from_edge (phi, e);
1083
1084 add_phi_arg (phi, orig_arg, new_exit, orig_locus);
1085 }
1086 }
1087 redirect_edge_and_branch_force (e, new_preheader);
1088 flush_pending_stmts (e);
1089 set_immediate_dominator (CDI_DOMINATORS, new_preheader, e->src);
1090 if (was_imm_dom || duplicate_outer_loop)
1091 set_immediate_dominator (CDI_DOMINATORS, exit_dest, new_exit->src);
1092
1093 /* And remove the non-necessary forwarder again. Keep the other
1094 one so we have a proper pre-header for the loop at the exit edge. */
1095 redirect_edge_pred (single_succ_edge (preheader),
1096 single_pred (preheader));
1097 delete_basic_block (preheader);
1098 set_immediate_dominator (CDI_DOMINATORS, scalar_loop->header,
1099 loop_preheader_edge (scalar_loop)->src);
1100 }
1101 else /* Add the copy at entry. */
1102 {
1103 if (scalar_loop != loop)
1104 {
1105 /* Remove the non-necessary forwarder of scalar_loop again. */
1106 redirect_edge_pred (single_succ_edge (preheader),
1107 single_pred (preheader));
1108 delete_basic_block (preheader);
1109 set_immediate_dominator (CDI_DOMINATORS, scalar_loop->header,
1110 loop_preheader_edge (scalar_loop)->src);
1111 preheader = split_edge (loop_preheader_edge (loop));
1112 entry_e = single_pred_edge (preheader);
1113 }
1114
1115 redirect_edge_and_branch_force (entry_e, new_preheader);
1116 flush_pending_stmts (entry_e);
1117 set_immediate_dominator (CDI_DOMINATORS, new_preheader, entry_e->src);
1118
1119 redirect_edge_and_branch_force (new_exit, preheader);
1120 flush_pending_stmts (new_exit);
1121 set_immediate_dominator (CDI_DOMINATORS, preheader, new_exit->src);
1122
1123 /* And remove the non-necessary forwarder again. Keep the other
1124 one so we have a proper pre-header for the loop at the exit edge. */
1125 redirect_edge_pred (single_succ_edge (new_preheader),
1126 single_pred (new_preheader));
1127 delete_basic_block (new_preheader);
1128 set_immediate_dominator (CDI_DOMINATORS, new_loop->header,
1129 loop_preheader_edge (new_loop)->src);
1130 }
1131
1132 /* Skip new preheader since it's deleted if copy loop is added at entry. */
1133 for (unsigned i = (at_exit ? 0 : 1); i < scalar_loop->num_nodes + 1; i++)
1134 rename_variables_in_bb (new_bbs[i], duplicate_outer_loop);
1135
1136 if (scalar_loop != loop)
1137 {
1138 /* Update new_loop->header PHIs, so that on the preheader
1139 edge they are the ones from loop rather than scalar_loop. */
1140 gphi_iterator gsi_orig, gsi_new;
1141 edge orig_e = loop_preheader_edge (loop);
1142 edge new_e = loop_preheader_edge (new_loop);
1143
1144 for (gsi_orig = gsi_start_phis (loop->header),
1145 gsi_new = gsi_start_phis (new_loop->header);
1146 !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_new);
1147 gsi_next (&gsi_orig), gsi_next (&gsi_new))
1148 {
1149 gphi *orig_phi = gsi_orig.phi ();
1150 gphi *new_phi = gsi_new.phi ();
1151 tree orig_arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, orig_e);
1152 location_t orig_locus
1153 = gimple_phi_arg_location_from_edge (orig_phi, orig_e);
1154
1155 add_phi_arg (new_phi, orig_arg, new_e, orig_locus);
1156 }
1157 }
1158
1159 free (new_bbs);
1160 free (bbs);
1161
1162 checking_verify_dominators (CDI_DOMINATORS);
1163
1164 return new_loop;
1165 }
1166
1167
1168 /* Given the condition expression COND, put it as the last statement of
1169 GUARD_BB; set both edges' probability; set dominator of GUARD_TO to
1170 DOM_BB; return the skip edge. GUARD_TO is the target basic block to
1171 skip the loop. PROBABILITY is the skip edge's probability. Mark the
1172 new edge as irreducible if IRREDUCIBLE_P is true. */
1173
1174 static edge
1175 slpeel_add_loop_guard (basic_block guard_bb, tree cond,
1176 basic_block guard_to, basic_block dom_bb,
1177 profile_probability probability, bool irreducible_p)
1178 {
1179 gimple_stmt_iterator gsi;
1180 edge new_e, enter_e;
1181 gcond *cond_stmt;
1182 gimple_seq gimplify_stmt_list = NULL;
1183
1184 enter_e = EDGE_SUCC (guard_bb, 0);
1185 enter_e->flags &= ~EDGE_FALLTHRU;
1186 enter_e->flags |= EDGE_FALSE_VALUE;
1187 gsi = gsi_last_bb (guard_bb);
1188
1189 cond = force_gimple_operand_1 (cond, &gimplify_stmt_list, is_gimple_condexpr,
1190 NULL_TREE);
1191 if (gimplify_stmt_list)
1192 gsi_insert_seq_after (&gsi, gimplify_stmt_list, GSI_NEW_STMT);
1193
1194 cond_stmt = gimple_build_cond_from_tree (cond, NULL_TREE, NULL_TREE);
1195 gsi = gsi_last_bb (guard_bb);
1196 gsi_insert_after (&gsi, cond_stmt, GSI_NEW_STMT);
1197
1198 /* Add new edge to connect guard block to the merge/loop-exit block. */
1199 new_e = make_edge (guard_bb, guard_to, EDGE_TRUE_VALUE);
1200
1201 new_e->probability = probability;
1202 if (irreducible_p)
1203 new_e->flags |= EDGE_IRREDUCIBLE_LOOP;
1204
1205 enter_e->probability = probability.invert ();
1206 set_immediate_dominator (CDI_DOMINATORS, guard_to, dom_bb);
1207
1208 /* Split enter_e to preserve LOOPS_HAVE_PREHEADERS. */
1209 if (enter_e->dest->loop_father->header == enter_e->dest)
1210 split_edge (enter_e);
1211
1212 return new_e;
1213 }
1214
1215
1216 /* This function verifies that the following restrictions apply to LOOP:
1217 (1) it consists of exactly 2 basic blocks - a header and an empty latch -
1218 if it is an innermost loop, or of 5 basic blocks if it is an outer loop.
1219 (2) it is single entry, single exit
1220 (3) its exit condition is the last stmt in the header
1221 (4) E is the entry/exit edge of LOOP.
1222 */
1223
1224 bool
1225 slpeel_can_duplicate_loop_p (const class loop *loop, const_edge e)
1226 {
1227 edge exit_e = single_exit (loop);
1228 edge entry_e = loop_preheader_edge (loop);
1229 gcond *orig_cond = get_loop_exit_condition (loop);
1230 gimple_stmt_iterator loop_exit_gsi = gsi_last_bb (exit_e->src);
1231 unsigned int num_bb = loop->inner? 5 : 2;
1232
1233 /* All loops have an outer scope; the only case loop->outer is NULL is for
1234 the function itself. */
1235 if (!loop_outer (loop)
1236 || loop->num_nodes != num_bb
1237 || !empty_block_p (loop->latch)
1238 || !single_exit (loop)
1239 /* Verify that new loop exit condition can be trivially modified. */
1240 || (!orig_cond || orig_cond != gsi_stmt (loop_exit_gsi))
1241 || (e != exit_e && e != entry_e))
1242 return false;
1243
1244 return true;
1245 }
1246
1247 /* If the loop has a virtual PHI, but exit bb doesn't, create a virtual PHI
1248 in the exit bb and rename all the uses after the loop. This simplifies
1249 the *guard[12] routines, which assume loop closed SSA form for all PHIs
1250 (but normally loop closed SSA form doesn't require virtual PHIs to be
1251 in the same form). Doing this early simplifies checking which
1252 uses should be renamed.
1253
1254 If we create a new phi after the loop, return the definition that
1255 applies on entry to the loop, otherwise return null. */
1256
1257 static tree
1258 create_lcssa_for_virtual_phi (class loop *loop)
1259 {
1260 gphi_iterator gsi;
1261 edge exit_e = single_exit (loop);
1262
1263 for (gsi = gsi_start_phis (loop->header); !gsi_end_p (gsi); gsi_next (&gsi))
1264 if (virtual_operand_p (gimple_phi_result (gsi_stmt (gsi))))
1265 {
1266 gphi *phi = gsi.phi ();
1267 for (gsi = gsi_start_phis (exit_e->dest);
1268 !gsi_end_p (gsi); gsi_next (&gsi))
1269 if (virtual_operand_p (gimple_phi_result (gsi_stmt (gsi))))
1270 break;
1271 if (gsi_end_p (gsi))
1272 {
1273 tree new_vop = copy_ssa_name (PHI_RESULT (phi));
1274 gphi *new_phi = create_phi_node (new_vop, exit_e->dest);
1275 tree vop = PHI_ARG_DEF_FROM_EDGE (phi, EDGE_SUCC (loop->latch, 0));
1276 imm_use_iterator imm_iter;
1277 gimple *stmt;
1278 use_operand_p use_p;
1279
1280 SSA_NAME_OCCURS_IN_ABNORMAL_PHI (new_vop)
1281 = SSA_NAME_OCCURS_IN_ABNORMAL_PHI (vop);
1282 add_phi_arg (new_phi, vop, exit_e, UNKNOWN_LOCATION);
1283 gimple_phi_set_result (new_phi, new_vop);
1284 FOR_EACH_IMM_USE_STMT (stmt, imm_iter, vop)
1285 if (stmt != new_phi
1286 && !flow_bb_inside_loop_p (loop, gimple_bb (stmt)))
1287 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
1288 SET_USE (use_p, new_vop);
1289
1290 return PHI_ARG_DEF_FROM_EDGE (phi, loop_preheader_edge (loop));
1291 }
1292 break;
1293 }
1294 return NULL_TREE;
1295 }
1296
1297 /* Function vect_get_loop_location.
1298
1299 Extract the location of the loop in the source code.
1300 If the loop is not well formed for vectorization, an estimated
1301 location is calculated.
1302 Return the loop location if found and an empty location otherwise. */
1303
1304 dump_user_location_t
1305 find_loop_location (class loop *loop)
1306 {
1307 gimple *stmt = NULL;
1308 basic_block bb;
1309 gimple_stmt_iterator si;
1310
1311 if (!loop)
1312 return dump_user_location_t ();
1313
1314 stmt = get_loop_exit_condition (loop);
1315
1316 if (stmt
1317 && LOCATION_LOCUS (gimple_location (stmt)) > BUILTINS_LOCATION)
1318 return stmt;
1319
1320 /* If we got here the loop is probably not "well formed",
1321 try to estimate the loop location */
1322
1323 if (!loop->header)
1324 return dump_user_location_t ();
1325
1326 bb = loop->header;
1327
1328 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1329 {
1330 stmt = gsi_stmt (si);
1331 if (LOCATION_LOCUS (gimple_location (stmt)) > BUILTINS_LOCATION)
1332 return stmt;
1333 }
1334
1335 return dump_user_location_t ();
1336 }
1337
1338 /* Return true if the phi described by STMT_INFO defines an IV of the
1339 loop to be vectorized. */
1340
1341 static bool
1342 iv_phi_p (stmt_vec_info stmt_info)
1343 {
1344 gphi *phi = as_a <gphi *> (stmt_info->stmt);
1345 if (virtual_operand_p (PHI_RESULT (phi)))
1346 return false;
1347
1348 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1349 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
1350 return false;
1351
1352 return true;
1353 }
1354
1355 /* Function vect_can_advance_ivs_p
1356
1357 In case the number of iterations that LOOP iterates is unknown at compile
1358 time, an epilog loop will be generated, and the loop induction variables
1359 (IVs) will be "advanced" to the value they are supposed to take just before
1360 the epilog loop. Here we check that the access function of the loop IVs
1361 and the expression that represents the loop bound are simple enough.
1362 These restrictions will be relaxed in the future. */
1363
1364 bool
1365 vect_can_advance_ivs_p (loop_vec_info loop_vinfo)
1366 {
1367 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1368 basic_block bb = loop->header;
1369 gphi_iterator gsi;
1370
1371 /* Analyze phi functions of the loop header. */
1372
1373 if (dump_enabled_p ())
1374 dump_printf_loc (MSG_NOTE, vect_location, "vect_can_advance_ivs_p:\n");
1375 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
1376 {
1377 tree evolution_part;
1378
1379 gphi *phi = gsi.phi ();
1380 stmt_vec_info phi_info = loop_vinfo->lookup_stmt (phi);
1381 if (dump_enabled_p ())
1382 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
1383 phi_info->stmt);
1384
1385 /* Skip virtual phi's. The data dependences that are associated with
1386 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.
1387
1388 Skip reduction phis. */
1389 if (!iv_phi_p (phi_info))
1390 {
1391 if (dump_enabled_p ())
1392 dump_printf_loc (MSG_NOTE, vect_location,
1393 "reduc or virtual phi. skip.\n");
1394 continue;
1395 }
1396
1397 /* Analyze the evolution function. */
1398
1399 evolution_part = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
1400 if (evolution_part == NULL_TREE)
1401 {
1402 if (dump_enabled_p ())
1403 dump_printf (MSG_MISSED_OPTIMIZATION,
1404 "No access function or evolution.\n");
1405 return false;
1406 }
1407
1408 /* FORNOW: We do not transform initial conditions of IVs
1409 whose evolution functions are not invariant in the loop. */
1410
1411 if (!expr_invariant_in_loop_p (loop, evolution_part))
1412 {
1413 if (dump_enabled_p ())
1414 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1415 "evolution not invariant in loop.\n");
1416 return false;
1417 }
1418
1419 /* FORNOW: We do not transform initial conditions of IVs
1420 whose evolution function is a polynomial of degree >= 2. */
1421
1422 if (tree_is_chrec (evolution_part))
1423 {
1424 if (dump_enabled_p ())
1425 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1426 "evolution is chrec.\n");
1427 return false;
1428 }
1429 }
1430
1431 return true;
1432 }
1433
1434
1435 /* Function vect_update_ivs_after_vectorizer.
1436
1437 "Advance" the induction variables of LOOP to the value they should take
1438 after the execution of LOOP. This is currently necessary because the
1439 vectorizer does not handle induction variables that are used after the
1440 loop. Such a situation occurs when the last iterations of LOOP are
1441 peeled, because:
1442 1. We introduced new uses after LOOP for IVs that were not originally used
1443 after LOOP: the IVs of LOOP are now used by an epilog loop.
1444 2. LOOP is going to be vectorized; this means that it will iterate N/VF
1445 times, whereas the loop IVs should be bumped N times.
1446
1447 Input:
1448 - LOOP - a loop that is going to be vectorized. The last few iterations
1449 of LOOP were peeled.
1450 - NITERS - the number of iterations that LOOP executes (before it is
1451 vectorized). i.e, the number of times the ivs should be bumped.
1452 - UPDATE_E - a successor edge of LOOP->exit that is on the (only) path
1453 coming out from LOOP on which there are uses of the LOOP ivs
1454 (this is the path from LOOP->exit to epilog_loop->preheader).
1455
1456 The new definitions of the ivs are placed in LOOP->exit.
1457 The phi args associated with the edge UPDATE_E in the bb
1458 UPDATE_E->dest are updated accordingly.
1459
1460 Assumption 1: Like the rest of the vectorizer, this function assumes
1461 a single loop exit that has a single predecessor.
1462
1463 Assumption 2: The phi nodes in the LOOP header and in update_bb are
1464 organized in the same order.
1465
1466 Assumption 3: The access function of the ivs is simple enough (see
1467 vect_can_advance_ivs_p). This assumption will be relaxed in the future.
1468
1469 Assumption 4: Exactly one of the successors of LOOP exit-bb is on a path
1470 coming out of LOOP on which the ivs of LOOP are used (this is the path
1471 that leads to the epilog loop; other paths skip the epilog loop). This
1472 path starts with the edge UPDATE_E, and its destination (denoted update_bb)
1473 needs to have its phis updated.
1474 */
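/* Small example (made-up IV): for an IV with init_expr == 0 and
step_expr == 1 in a loop whose vector part covers NITERS == 100 scalar
iterations, the code below builds ni == 0 + 100 * 1 == 100 and uses it
as the PHI argument on UPDATE_E, so the epilog loop resumes the IV at
100.  */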
1475
1476 static void
1477 vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo,
1478 tree niters, edge update_e)
1479 {
1480 gphi_iterator gsi, gsi1;
1481 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1482 basic_block update_bb = update_e->dest;
1483 basic_block exit_bb = single_exit (loop)->dest;
1484
1485 /* Make sure there exists a single-predecessor exit bb: */
1486 gcc_assert (single_pred_p (exit_bb));
1487 gcc_assert (single_succ_edge (exit_bb) == update_e);
1488
1489 for (gsi = gsi_start_phis (loop->header), gsi1 = gsi_start_phis (update_bb);
1490 !gsi_end_p (gsi) && !gsi_end_p (gsi1);
1491 gsi_next (&gsi), gsi_next (&gsi1))
1492 {
1493 tree init_expr;
1494 tree step_expr, off;
1495 tree type;
1496 tree var, ni, ni_name;
1497 gimple_stmt_iterator last_gsi;
1498
1499 gphi *phi = gsi.phi ();
1500 gphi *phi1 = gsi1.phi ();
1501 stmt_vec_info phi_info = loop_vinfo->lookup_stmt (phi);
1502 if (dump_enabled_p ())
1503 dump_printf_loc (MSG_NOTE, vect_location,
1504 "vect_update_ivs_after_vectorizer: phi: %G", phi);
1505
1506 /* Skip reduction and virtual phis. */
1507 if (!iv_phi_p (phi_info))
1508 {
1509 if (dump_enabled_p ())
1510 dump_printf_loc (MSG_NOTE, vect_location,
1511 "reduc or virtual phi. skip.\n");
1512 continue;
1513 }
1514
1515 type = TREE_TYPE (gimple_phi_result (phi));
1516 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
1517 step_expr = unshare_expr (step_expr);
1518
1519 /* FORNOW: We do not support IVs whose evolution function is a polynomial
1520 of degree >= 2 or exponential. */
1521 gcc_assert (!tree_is_chrec (step_expr));
1522
1523 init_expr = PHI_ARG_DEF_FROM_EDGE (phi, loop_preheader_edge (loop));
1524
1525 off = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
1526 fold_convert (TREE_TYPE (step_expr), niters),
1527 step_expr);
1528 if (POINTER_TYPE_P (type))
1529 ni = fold_build_pointer_plus (init_expr, off);
1530 else
1531 ni = fold_build2 (PLUS_EXPR, type,
1532 init_expr, fold_convert (type, off));
1533
1534 var = create_tmp_var (type, "tmp");
1535
1536 last_gsi = gsi_last_bb (exit_bb);
1537 gimple_seq new_stmts = NULL;
1538 ni_name = force_gimple_operand (ni, &new_stmts, false, var);
1539 /* Exit_bb shouldn't be empty. */
1540 if (!gsi_end_p (last_gsi))
1541 gsi_insert_seq_after (&last_gsi, new_stmts, GSI_SAME_STMT);
1542 else
1543 gsi_insert_seq_before (&last_gsi, new_stmts, GSI_SAME_STMT);
1544
1545 /* Fix phi expressions in the successor bb. */
1546 adjust_phi_and_debug_stmts (phi1, update_e, ni_name);
1547 }
1548 }
1549
1550 /* Return a gimple value containing the misalignment (measured in vector
1551 elements) for the loop described by LOOP_VINFO, i.e. how many elements
1552 it is away from a perfectly aligned address. Add any new statements
1553 to SEQ. */
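/* Worked example (hypothetical address): with a 16-byte target
alignment and 4-byte elements, a start address of 0x1008 gives
misalign_in_bytes == 0x1008 & 15 == 8 and therefore
misalign_in_elems == 8 >> 2 == 2.  */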
1554
1555 static tree
1556 get_misalign_in_elems (gimple **seq, loop_vec_info loop_vinfo)
1557 {
1558 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
1559 stmt_vec_info stmt_info = dr_info->stmt;
1560 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1561
1562 poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr_info);
1563 unsigned HOST_WIDE_INT target_align_c;
1564 tree target_align_minus_1;
1565
1566 bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr),
1567 size_zero_node) < 0;
1568 tree offset = (negative
1569 ? size_int (-TYPE_VECTOR_SUBPARTS (vectype) + 1)
1570 : size_zero_node);
1571 tree start_addr = vect_create_addr_base_for_vector_ref (loop_vinfo,
1572 stmt_info, seq,
1573 offset);
1574 tree type = unsigned_type_for (TREE_TYPE (start_addr));
1575 if (target_align.is_constant (&target_align_c))
1576 target_align_minus_1 = build_int_cst (type, target_align_c - 1);
1577 else
1578 {
1579 tree vla = build_int_cst (type, target_align);
1580 tree vla_align = fold_build2 (BIT_AND_EXPR, type, vla,
1581 fold_build2 (MINUS_EXPR, type,
1582 build_int_cst (type, 0), vla));
1583 target_align_minus_1 = fold_build2 (MINUS_EXPR, type, vla_align,
1584 build_int_cst (type, 1));
1585 }
1586
1587 HOST_WIDE_INT elem_size
1588 = int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1589 tree elem_size_log = build_int_cst (type, exact_log2 (elem_size));
1590
1591 /* Create: misalign_in_bytes = addr & (target_align - 1). */
1592 tree int_start_addr = fold_convert (type, start_addr);
1593 tree misalign_in_bytes = fold_build2 (BIT_AND_EXPR, type, int_start_addr,
1594 target_align_minus_1);
1595
1596 /* Create: misalign_in_elems = misalign_in_bytes / element_size. */
1597 tree misalign_in_elems = fold_build2 (RSHIFT_EXPR, type, misalign_in_bytes,
1598 elem_size_log);
1599
1600 return misalign_in_elems;
1601 }
1602
1603 /* Function vect_gen_prolog_loop_niters
1604
1605 Generate the number of iterations which should be peeled as prolog for the
1606 loop represented by LOOP_VINFO. It is calculated as the misalignment of
1607 DR - the data reference recorded in LOOP_VINFO_UNALIGNED_DR (LOOP_VINFO).
1608 As a result, after the execution of this loop, the data reference DR will
1609 refer to an aligned location. The following computation is generated:
1610
1611 If the misalignment of DR is known at compile time:
1612 addr_mis = int mis = DR_MISALIGNMENT (dr);
1613 Else, compute address misalignment in bytes:
1614 addr_mis = addr & (target_align - 1)
1615
1616 prolog_niters = ((VF - addr_mis/elem_size)&(VF-1))/step
1617
1618 (elem_size = element type size; an element is the scalar element whose type
1619 is the inner type of the vectype)
1620
1621 The computations will be emitted at the end of BB. We also compute and
1622 store upper bound (included) of the result in BOUND.
1623
1624 When the step of the data-ref in the loop is not 1 (as in interleaved data
1625 and SLP), the number of iterations of the prolog must be divided by the step
1626 (which is equal to the size of interleaved group).
1627
1628 The above formulas assume that VF == number of elements in the vector. This
1629 may not hold when there are multiple-types in the loop.
1630 In this case, for some data-references in the loop the VF does not represent
1631 the number of elements that fit in the vector. Therefore, instead of VF we
1632 use TYPE_VECTOR_SUBPARTS. */
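/* Worked example with illustrative values: for a 16-byte target
alignment and 4-byte elements (4 elements per aligned block), a
misalignment of 2 elements and step 1, the prolog peels
(4 - 2) & (4 - 1) == 2 scalar iterations before DR becomes aligned.  */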
1633
1634 static tree
1635 vect_gen_prolog_loop_niters (loop_vec_info loop_vinfo,
1636 basic_block bb, int *bound)
1637 {
1638 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
1639 tree var;
1640 tree niters_type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
1641 gimple_seq stmts = NULL, new_stmts = NULL;
1642 tree iters, iters_name;
1643 stmt_vec_info stmt_info = dr_info->stmt;
1644 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1645 poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr_info);
1646
1647 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
1648 {
1649 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1650
1651 if (dump_enabled_p ())
1652 dump_printf_loc (MSG_NOTE, vect_location,
1653 "known peeling = %d.\n", npeel);
1654
1655 iters = build_int_cst (niters_type, npeel);
1656 *bound = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1657 }
1658 else
1659 {
1660 tree misalign_in_elems = get_misalign_in_elems (&stmts, loop_vinfo);
1661 tree type = TREE_TYPE (misalign_in_elems);
1662 HOST_WIDE_INT elem_size
1663 = int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1664 /* We only do prolog peeling if the target alignment is known at compile
1665 time. */
1666 poly_uint64 align_in_elems =
1667 exact_div (target_align, elem_size);
1668 tree align_in_elems_minus_1 =
1669 build_int_cst (type, align_in_elems - 1);
1670 tree align_in_elems_tree = build_int_cst (type, align_in_elems);
1671
1672 /* Create: (niters_type) ((align_in_elems - misalign_in_elems)
1673 & (align_in_elems - 1)). */
1674 bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr),
1675 size_zero_node) < 0;
1676 if (negative)
1677 iters = fold_build2 (MINUS_EXPR, type, misalign_in_elems,
1678 align_in_elems_tree);
1679 else
1680 iters = fold_build2 (MINUS_EXPR, type, align_in_elems_tree,
1681 misalign_in_elems);
1682 iters = fold_build2 (BIT_AND_EXPR, type, iters, align_in_elems_minus_1);
1683 iters = fold_convert (niters_type, iters);
1684 unsigned HOST_WIDE_INT align_in_elems_c;
1685 if (align_in_elems.is_constant (&align_in_elems_c))
1686 *bound = align_in_elems_c - 1;
1687 else
1688 *bound = -1;
1689 }
1690
1691 if (dump_enabled_p ())
1692 dump_printf_loc (MSG_NOTE, vect_location,
1693 "niters for prolog loop: %T\n", iters);
1694
1695 var = create_tmp_var (niters_type, "prolog_loop_niters");
1696 iters_name = force_gimple_operand (iters, &new_stmts, false, var);
1697
1698 if (new_stmts)
1699 gimple_seq_add_seq (&stmts, new_stmts);
1700 if (stmts)
1701 {
1702 gcc_assert (single_succ_p (bb));
1703 gimple_stmt_iterator gsi = gsi_last_bb (bb);
1704 if (gsi_end_p (gsi))
1705 gsi_insert_seq_before (&gsi, stmts, GSI_SAME_STMT);
1706 else
1707 gsi_insert_seq_after (&gsi, stmts, GSI_SAME_STMT);
1708 }
1709 return iters_name;
1710 }
1711
1712
1713 /* Function vect_update_init_of_dr
1714
1715 If CODE is PLUS, the vector loop starts NITERS iterations after the
1716 scalar one, otherwise CODE is MINUS and the vector loop starts NITERS
1717 iterations before the scalar one (using masking to skip inactive
1718 elements). This function updates the information recorded in DR to
1719 account for the difference. Specifically, it updates the OFFSET
1720 field of DR_INFO. */
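/* Illustrative example (assumed values): with NITERS == 3 and DR_STEP == 8
   bytes, the recorded OFFSET is adjusted by 3 * 8 = 24 bytes: added for
   PLUS (the vector loop starts after the peeled prolog) or subtracted for
   MINUS (the vector loop starts before the scalar one and masks the
   skipped elements).  */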
1721
1722 static void
1723 vect_update_init_of_dr (dr_vec_info *dr_info, tree niters, tree_code code)
1724 {
1725 struct data_reference *dr = dr_info->dr;
1726 tree offset = dr_info->offset;
1727 if (!offset)
1728 offset = build_zero_cst (sizetype);
1729
1730 niters = fold_build2 (MULT_EXPR, sizetype,
1731 fold_convert (sizetype, niters),
1732 fold_convert (sizetype, DR_STEP (dr)));
1733 offset = fold_build2 (code, sizetype,
1734 fold_convert (sizetype, offset), niters);
1735 dr_info->offset = offset;
1736 }
1737
1738
1739 /* Function vect_update_inits_of_drs
1740
1741 Apply vect_update_init_of_dr to all accesses in LOOP_VINFO.
1742 CODE and NITERS are as for vect_update_init_of_dr.  */
1743
1744 void
1745 vect_update_inits_of_drs (loop_vec_info loop_vinfo, tree niters,
1746 tree_code code)
1747 {
1748 unsigned int i;
1749 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1750 struct data_reference *dr;
1751
1752 DUMP_VECT_SCOPE ("vect_update_inits_of_dr");
1753
1754 /* Adjust niters to sizetype.  We used to insert the stmts on the loop
1755 preheader here, but since we might use these niters to update the
1756 epilogue's niters and data references we can't insert them here, as
1757 this definition might not always dominate its uses.
1758 if (!types_compatible_p (sizetype, TREE_TYPE (niters)))
1759 niters = fold_convert (sizetype, niters);
1760
1761 FOR_EACH_VEC_ELT (datarefs, i, dr)
1762 {
1763 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1764 if (!STMT_VINFO_GATHER_SCATTER_P (dr_info->stmt))
1765 vect_update_init_of_dr (dr_info, niters, code);
1766 }
1767 }
1768
1769 /* For the information recorded in LOOP_VINFO prepare the loop for peeling
1770 by masking. This involves calculating the number of iterations to
1771 be peeled and then aligning all memory references appropriately. */
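/* Illustrative example, assuming the target alignment equals the vector
   size: with VF == 8 elements and LOOP_VINFO_PEELING_FOR_ALIGNMENT == 3,
   the misalignment is 8 - 3 = 5 elements.  Instead of peeling 3 scalar
   iterations, the first masked vector iteration starts 5 elements earlier
   (the DR offsets are moved back by the MINUS_EXPR update below) and its
   first 5 lanes are inactive.  */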
1772
1773 void
1774 vect_prepare_for_masked_peels (loop_vec_info loop_vinfo)
1775 {
1776 tree misalign_in_elems;
1777 tree type = LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo);
1778
1779 gcc_assert (vect_use_loop_mask_for_alignment_p (loop_vinfo));
1780
1781 /* From the information recorded in LOOP_VINFO get the number of iterations
1782 that need to be skipped via masking. */
1783 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
1784 {
1785 poly_int64 misalign = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
1786 - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo));
1787 misalign_in_elems = build_int_cst (type, misalign);
1788 }
1789 else
1790 {
1791 gimple_seq seq1 = NULL, seq2 = NULL;
1792 misalign_in_elems = get_misalign_in_elems (&seq1, loop_vinfo);
1793 misalign_in_elems = fold_convert (type, misalign_in_elems);
1794 misalign_in_elems = force_gimple_operand (misalign_in_elems,
1795 &seq2, true, NULL_TREE);
1796 gimple_seq_add_seq (&seq1, seq2);
1797 if (seq1)
1798 {
1799 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1800 basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq1);
1801 gcc_assert (!new_bb);
1802 }
1803 }
1804
1805 if (dump_enabled_p ())
1806 dump_printf_loc (MSG_NOTE, vect_location,
1807 "misalignment for fully-masked loop: %T\n",
1808 misalign_in_elems);
1809
1810 LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo) = misalign_in_elems;
1811
1812 vect_update_inits_of_drs (loop_vinfo, misalign_in_elems, MINUS_EXPR);
1813 }
1814
1815 /* This function builds ni_name = number of iterations. Statements
1816 are emitted on the loop preheader edge. If NEW_VAR_P is not NULL, set
1817 it to TRUE if a new ssa_var is generated.  */
1818
1819 tree
1820 vect_build_loop_niters (loop_vec_info loop_vinfo, bool *new_var_p)
1821 {
1822 tree ni = unshare_expr (LOOP_VINFO_NITERS (loop_vinfo));
1823 if (TREE_CODE (ni) == INTEGER_CST)
1824 return ni;
1825 else
1826 {
1827 tree ni_name, var;
1828 gimple_seq stmts = NULL;
1829 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1830
1831 var = create_tmp_var (TREE_TYPE (ni), "niters");
1832 ni_name = force_gimple_operand (ni, &stmts, false, var);
1833 if (stmts)
1834 {
1835 gsi_insert_seq_on_edge_immediate (pe, stmts);
1836 if (new_var_p != NULL)
1837 *new_var_p = true;
1838 }
1839
1840 return ni_name;
1841 }
1842 }
1843
1844 /* Calculate the number of iterations above which the vectorized loop will be
1845 preferred to the scalar loop.  NITERS_PROLOG is the number of iterations
1846 of the prolog loop.  If it's an integer constant, the value is also passed
1847 in INT_NITERS_PROLOG. BOUND_PROLOG is the upper bound (inclusive) of the
1848 number of iterations of the prolog loop. BOUND_EPILOG is the corresponding
1849 value for the epilog loop. If CHECK_PROFITABILITY is true, TH is the
1850 threshold below which the scalar (rather than vectorized) loop will be
1851 executed. This function stores the upper bound (inclusive) of the result
1852 in BOUND_SCALAR. */
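/* A worked example with assumed values: NITERS_PROLOG is the constant 2
   (so INT_NITERS_PROLOG == 2), BOUND_EPILOG is 3, TH is 10 and
   CHECK_PROFITABILITY is true.  Peeling alone accounts for at most
   2 + 3 = 5 scalar iterations, but the profitability check may also send
   up to TH - 1 = 9 iterations to the scalar loop, so *BOUND_SCALAR is set
   to 9 and the constant 9 is returned.  */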
1853
1854 static tree
1855 vect_gen_scalar_loop_niters (tree niters_prolog, int int_niters_prolog,
1856 int bound_prolog, poly_int64 bound_epilog, int th,
1857 poly_uint64 *bound_scalar,
1858 bool check_profitability)
1859 {
1860 tree type = TREE_TYPE (niters_prolog);
1861 tree niters = fold_build2 (PLUS_EXPR, type, niters_prolog,
1862 build_int_cst (type, bound_epilog));
1863
1864 *bound_scalar = bound_prolog + bound_epilog;
1865 if (check_profitability)
1866 {
1867 /* TH indicates the minimum niters of vectorized loop, while we
1868 compute the maximum niters of scalar loop. */
1869 th--;
1870 /* Peeling for constant times. */
1871 if (int_niters_prolog >= 0)
1872 {
1873 *bound_scalar = upper_bound (int_niters_prolog + bound_epilog, th);
1874 return build_int_cst (type, *bound_scalar);
1875 }
1876 /* Peeling an unknown number of times. Note that both BOUND_PROLOG
1877 and BOUND_EPILOG are inclusive upper bounds. */
1878 if (known_ge (th, bound_prolog + bound_epilog))
1879 {
1880 *bound_scalar = th;
1881 return build_int_cst (type, th);
1882 }
1883 /* Need to do runtime comparison. */
1884 else if (maybe_gt (th, bound_epilog))
1885 {
1886 *bound_scalar = upper_bound (*bound_scalar, th);
1887 return fold_build2 (MAX_EXPR, type,
1888 build_int_cst (type, th), niters);
1889 }
1890 }
1891 return niters;
1892 }
1893
1894 /* NITERS is the number of times that the original scalar loop executes
1895 after peeling. Work out the maximum number of iterations N that can
1896 be handled by the vectorized form of the loop and then either:
1897
1898 a) set *STEP_VECTOR_PTR to the vectorization factor and generate:
1899
1900 niters_vector = N
1901
1902 b) set *STEP_VECTOR_PTR to one and generate:
1903
1904 niters_vector = N / vf
1905
1906 In both cases, store niters_vector in *NITERS_VECTOR_PTR and add
1907 any new statements on the loop preheader edge. NITERS_NO_OVERFLOW
1908 is true if NITERS doesn't overflow (i.e. if NITERS is always nonzero). */
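/* For example (illustrative constants): with NITERS == 17, a constant
   VF == 4, no peeling for gaps and no loop masking, case (b) applies and
   produces either 17 >> 2 == 4 when NITERS cannot overflow, or
   ((17 - 4) >> 2) + 1 == 4 otherwise; both give 4 vector iterations with
   *STEP_VECTOR_PTR == 1.  With a variable VF or a fully-masked loop,
   case (a) is used instead: niters_vector == 17 and *STEP_VECTOR_PTR is
   the (possibly variable) VF.  */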
1909
1910 void
1911 vect_gen_vector_loop_niters (loop_vec_info loop_vinfo, tree niters,
1912 tree *niters_vector_ptr, tree *step_vector_ptr,
1913 bool niters_no_overflow)
1914 {
1915 tree ni_minus_gap, var;
1916 tree niters_vector, step_vector, type = TREE_TYPE (niters);
1917 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1918 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1919 tree log_vf = NULL_TREE;
1920
1921 /* If epilogue loop is required because of data accesses with gaps, we
1922 subtract one iteration from the total number of iterations here for
1923 correct calculation of RATIO. */
1924 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
1925 {
1926 ni_minus_gap = fold_build2 (MINUS_EXPR, type, niters,
1927 build_one_cst (type));
1928 if (!is_gimple_val (ni_minus_gap))
1929 {
1930 var = create_tmp_var (type, "ni_gap");
1931 gimple *stmts = NULL;
1932 ni_minus_gap = force_gimple_operand (ni_minus_gap, &stmts,
1933 true, var);
1934 gsi_insert_seq_on_edge_immediate (pe, stmts);
1935 }
1936 }
1937 else
1938 ni_minus_gap = niters;
1939
1940 unsigned HOST_WIDE_INT const_vf;
1941 if (vf.is_constant (&const_vf)
1942 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
1943 {
1944 /* Create: niters >> log2(vf) */
1945 /* If it's known that niters == number of latch executions + 1 doesn't
1946 overflow, we can generate niters >> log2(vf); otherwise we generate
1947 (niters - vf) >> log2(vf) + 1 by using the fact that we know ratio
1948 will be at least one. */
1949 log_vf = build_int_cst (type, exact_log2 (const_vf));
1950 if (niters_no_overflow)
1951 niters_vector = fold_build2 (RSHIFT_EXPR, type, ni_minus_gap, log_vf);
1952 else
1953 niters_vector
1954 = fold_build2 (PLUS_EXPR, type,
1955 fold_build2 (RSHIFT_EXPR, type,
1956 fold_build2 (MINUS_EXPR, type,
1957 ni_minus_gap,
1958 build_int_cst (type, vf)),
1959 log_vf),
1960 build_int_cst (type, 1));
1961 step_vector = build_one_cst (type);
1962 }
1963 else
1964 {
1965 niters_vector = ni_minus_gap;
1966 step_vector = build_int_cst (type, vf);
1967 }
1968
1969 if (!is_gimple_val (niters_vector))
1970 {
1971 var = create_tmp_var (type, "bnd");
1972 gimple_seq stmts = NULL;
1973 niters_vector = force_gimple_operand (niters_vector, &stmts, true, var);
1974 gsi_insert_seq_on_edge_immediate (pe, stmts);
1975 /* Peeling algorithm guarantees that vector loop bound is at least ONE,
1976 we set range information to make niters analyzer's life easier. */
1977 if (stmts != NULL && log_vf)
1978 set_range_info (niters_vector, VR_RANGE,
1979 wi::to_wide (build_int_cst (type, 1)),
1980 wi::to_wide (fold_build2 (RSHIFT_EXPR, type,
1981 TYPE_MAX_VALUE (type),
1982 log_vf)));
1983 }
1984 *niters_vector_ptr = niters_vector;
1985 *step_vector_ptr = step_vector;
1986
1987 return;
1988 }
1989
1990 /* Given NITERS_VECTOR which is the number of iterations for vectorized
1991 loop specified by LOOP_VINFO after vectorization, compute the number
1992 of iterations before vectorization (niters_vector * vf) and store it
1993 to NITERS_VECTOR_MULT_VF_PTR. */
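/* E.g. (illustrative values): niters_vector == 13 with a constant VF of 4
   gives 13 << 2 == 52 scalar iterations handled by the vector loop.  */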
1994
1995 static void
1996 vect_gen_vector_loop_niters_mult_vf (loop_vec_info loop_vinfo,
1997 tree niters_vector,
1998 tree *niters_vector_mult_vf_ptr)
1999 {
2000 /* We should be using a step_vector of VF if VF is variable. */
2001 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo).to_constant ();
2002 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2003 tree type = TREE_TYPE (niters_vector);
2004 tree log_vf = build_int_cst (type, exact_log2 (vf));
2005 basic_block exit_bb = single_exit (loop)->dest;
2006
2007 gcc_assert (niters_vector_mult_vf_ptr != NULL);
2008 tree niters_vector_mult_vf = fold_build2 (LSHIFT_EXPR, type,
2009 niters_vector, log_vf);
2010 if (!is_gimple_val (niters_vector_mult_vf))
2011 {
2012 tree var = create_tmp_var (type, "niters_vector_mult_vf");
2013 gimple_seq stmts = NULL;
2014 niters_vector_mult_vf = force_gimple_operand (niters_vector_mult_vf,
2015 &stmts, true, var);
2016 gimple_stmt_iterator gsi = gsi_start_bb (exit_bb);
2017 gsi_insert_seq_before (&gsi, stmts, GSI_SAME_STMT);
2018 }
2019 *niters_vector_mult_vf_ptr = niters_vector_mult_vf;
2020 }
2021
2022 /* LCSSA_PHI is an lcssa phi of the EPILOG loop, which is copied from LOOP.
2023 This function searches for the corresponding lcssa phi node in the exit
2024 bb of LOOP.  If it is found, return the phi result; otherwise return
2025 NULL. */
2026
2027 static tree
2028 find_guard_arg (class loop *loop, class loop *epilog ATTRIBUTE_UNUSED,
2029 gphi *lcssa_phi)
2030 {
2031 gphi_iterator gsi;
2032 edge e = single_exit (loop);
2033
2034 gcc_assert (single_pred_p (e->dest));
2035 for (gsi = gsi_start_phis (e->dest); !gsi_end_p (gsi); gsi_next (&gsi))
2036 {
2037 gphi *phi = gsi.phi ();
2038 if (operand_equal_p (PHI_ARG_DEF (phi, 0),
2039 PHI_ARG_DEF (lcssa_phi, 0), 0))
2040 return PHI_RESULT (phi);
2041 }
2042 return NULL_TREE;
2043 }
2044
2045 /* Function slpeel_tree_duplicate_loop_to_edge_cfg duplicates FIRST/SECOND
2046 from SECOND/FIRST and puts it at the original loop's preheader/exit
2047 edge; the two loops are arranged as below:
2048
2049 preheader_a:
2050 first_loop:
2051 header_a:
2052 i_1 = PHI<i_0, i_2>;
2053 ...
2054 i_2 = i_1 + 1;
2055 if (cond_a)
2056 goto latch_a;
2057 else
2058 goto between_bb;
2059 latch_a:
2060 goto header_a;
2061
2062 between_bb:
2063 ;; i_x = PHI<i_2>; ;; LCSSA phi node to be created for FIRST,
2064
2065 second_loop:
2066 header_b:
2067 i_3 = PHI<i_0, i_4>; ;; Use of i_0 to be replaced with i_x,
2068 or with i_2 if no LCSSA phi is created
2069 under condition of CREATE_LCSSA_FOR_IV_PHIS.
2070 ...
2071 i_4 = i_3 + 1;
2072 if (cond_b)
2073 goto latch_b;
2074 else
2075 goto exit_bb;
2076 latch_b:
2077 goto header_b;
2078
2079 exit_bb:
2080
2081 This function creates loop-closed SSA for the first loop and updates the
2082 second loop's PHI nodes by replacing the argument on the incoming edge with
2083 the result of the newly created lcssa PHI nodes.  If CREATE_LCSSA_FOR_IV_PHIS
2084 is false, loop-closed ssa phis will only be created for non-iv phis of
2085 the first loop.
2086
2087 This function assumes the exit bb of the first loop is the preheader bb of
2088 the second loop, i.e., between_bb in the example code.  With the PHIs
2089 updated, the second loop will execute the remaining iterations of the first.  */
2090
2091 static void
2092 slpeel_update_phi_nodes_for_loops (loop_vec_info loop_vinfo,
2093 class loop *first, class loop *second,
2094 bool create_lcssa_for_iv_phis)
2095 {
2096 gphi_iterator gsi_update, gsi_orig;
2097 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2098
2099 edge first_latch_e = EDGE_SUCC (first->latch, 0);
2100 edge second_preheader_e = loop_preheader_edge (second);
2101 basic_block between_bb = single_exit (first)->dest;
2102
2103 gcc_assert (between_bb == second_preheader_e->src);
2104 gcc_assert (single_pred_p (between_bb) && single_succ_p (between_bb));
2105 /* Either the first loop or the second is the loop to be vectorized. */
2106 gcc_assert (loop == first || loop == second);
2107
2108 for (gsi_orig = gsi_start_phis (first->header),
2109 gsi_update = gsi_start_phis (second->header);
2110 !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_update);
2111 gsi_next (&gsi_orig), gsi_next (&gsi_update))
2112 {
2113 gphi *orig_phi = gsi_orig.phi ();
2114 gphi *update_phi = gsi_update.phi ();
2115
2116 tree arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, first_latch_e);
2117 /* Generate lcssa PHI node for the first loop. */
2118 gphi *vect_phi = (loop == first) ? orig_phi : update_phi;
2119 stmt_vec_info vect_phi_info = loop_vinfo->lookup_stmt (vect_phi);
2120 if (create_lcssa_for_iv_phis || !iv_phi_p (vect_phi_info))
2121 {
2122 tree new_res = copy_ssa_name (PHI_RESULT (orig_phi));
2123 gphi *lcssa_phi = create_phi_node (new_res, between_bb);
2124 add_phi_arg (lcssa_phi, arg, single_exit (first), UNKNOWN_LOCATION);
2125 arg = new_res;
2126 }
2127
2128 /* Update PHI node in the second loop by replacing arg on the loop's
2129 incoming edge. */
2130 adjust_phi_and_debug_stmts (update_phi, second_preheader_e, arg);
2131 }
2132
2133 /* For epilogue peeling we have to make sure to copy all LC PHIs
2134 for correct vectorization of live stmts. */
2135 if (loop == first)
2136 {
2137 basic_block orig_exit = single_exit (second)->dest;
2138 for (gsi_orig = gsi_start_phis (orig_exit);
2139 !gsi_end_p (gsi_orig); gsi_next (&gsi_orig))
2140 {
2141 gphi *orig_phi = gsi_orig.phi ();
2142 tree orig_arg = PHI_ARG_DEF (orig_phi, 0);
2143 if (TREE_CODE (orig_arg) != SSA_NAME || virtual_operand_p (orig_arg))
2144 continue;
2145
2146 /* Already created in the above loop. */
2147 if (find_guard_arg (first, second, orig_phi))
2148 continue;
2149
2150 tree new_res = copy_ssa_name (orig_arg);
2151 gphi *lcphi = create_phi_node (new_res, between_bb);
2152 add_phi_arg (lcphi, orig_arg, single_exit (first), UNKNOWN_LOCATION);
2153 }
2154 }
2155 }
2156
2157 /* Function slpeel_add_loop_guard adds guard skipping from the beginning
2158 of SKIP_LOOP to the beginning of UPDATE_LOOP. GUARD_EDGE and MERGE_EDGE
2159 are two pred edges of the merge point before UPDATE_LOOP. The two loops
2160 appear like below:
2161
2162 guard_bb:
2163 if (cond)
2164 goto merge_bb;
2165 else
2166 goto skip_loop;
2167
2168 skip_loop:
2169 header_a:
2170 i_1 = PHI<i_0, i_2>;
2171 ...
2172 i_2 = i_1 + 1;
2173 if (cond_a)
2174 goto latch_a;
2175 else
2176 goto exit_a;
2177 latch_a:
2178 goto header_a;
2179
2180 exit_a:
2181 i_5 = PHI<i_2>;
2182
2183 merge_bb:
2184 ;; PHI (i_x = PHI<i_0, i_5>) to be created at merge point.
2185
2186 update_loop:
2187 header_b:
2188 i_3 = PHI<i_5, i_4>; ;; Use of i_5 to be replaced with i_x.
2189 ...
2190 i_4 = i_3 + 1;
2191 if (cond_b)
2192 goto latch_b;
2193 else
2194 goto exit_bb;
2195 latch_b:
2196 goto header_b;
2197
2198 exit_bb:
2199
2200 This function creates PHI nodes at merge_bb and replaces the use of i_5
2201 in the update_loop's PHI node with the newly created PHI result.  */
2202
2203 static void
2204 slpeel_update_phi_nodes_for_guard1 (class loop *skip_loop,
2205 class loop *update_loop,
2206 edge guard_edge, edge merge_edge)
2207 {
2208 location_t merge_loc, guard_loc;
2209 edge orig_e = loop_preheader_edge (skip_loop);
2210 edge update_e = loop_preheader_edge (update_loop);
2211 gphi_iterator gsi_orig, gsi_update;
2212
2213 for ((gsi_orig = gsi_start_phis (skip_loop->header),
2214 gsi_update = gsi_start_phis (update_loop->header));
2215 !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_update);
2216 gsi_next (&gsi_orig), gsi_next (&gsi_update))
2217 {
2218 gphi *orig_phi = gsi_orig.phi ();
2219 gphi *update_phi = gsi_update.phi ();
2220
2221 /* Generate new phi node at merge bb of the guard. */
2222 tree new_res = copy_ssa_name (PHI_RESULT (orig_phi));
2223 gphi *new_phi = create_phi_node (new_res, guard_edge->dest);
2224
2225 /* Merge bb has two incoming edges: GUARD_EDGE and MERGE_EDGE. Set the
2226 args in NEW_PHI for these edges. */
2227 tree merge_arg = PHI_ARG_DEF_FROM_EDGE (update_phi, update_e);
2228 tree guard_arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, orig_e);
2229 merge_loc = gimple_phi_arg_location_from_edge (update_phi, update_e);
2230 guard_loc = gimple_phi_arg_location_from_edge (orig_phi, orig_e);
2231 add_phi_arg (new_phi, merge_arg, merge_edge, merge_loc);
2232 add_phi_arg (new_phi, guard_arg, guard_edge, guard_loc);
2233
2234 /* Update phi in UPDATE_PHI. */
2235 adjust_phi_and_debug_stmts (update_phi, update_e, new_res);
2236 }
2237 }
2238
2239 /* LOOP and EPILOG are two consecutive loops in CFG and EPILOG is copied
2240 from LOOP. Function slpeel_add_loop_guard adds guard skipping from a
2241 point between the two loops to the end of EPILOG. Edges GUARD_EDGE
2242 and MERGE_EDGE are the two pred edges of merge_bb at the end of EPILOG.
2243 The CFG looks like:
2244
2245 loop:
2246 header_a:
2247 i_1 = PHI<i_0, i_2>;
2248 ...
2249 i_2 = i_1 + 1;
2250 if (cond_a)
2251 goto latch_a;
2252 else
2253 goto exit_a;
2254 latch_a:
2255 goto header_a;
2256
2257 exit_a:
2258
2259 guard_bb:
2260 if (cond)
2261 goto merge_bb;
2262 else
2263 goto epilog_loop;
2264
2265 ;; fall_through_bb
2266
2267 epilog_loop:
2268 header_b:
2269 i_3 = PHI<i_2, i_4>;
2270 ...
2271 i_4 = i_3 + 1;
2272 if (cond_b)
2273 goto latch_b;
2274 else
2275 goto merge_bb;
2276 latch_b:
2277 goto header_b;
2278
2279 merge_bb:
2280 ; PHI node (i_y = PHI<i_2, i_4>) to be created at merge point.
2281
2282 exit_bb:
2283 i_x = PHI<i_4>; ;Use of i_4 to be replaced with i_y in merge_bb.
2284
2285 For each name used outside EPILOG (i.e. for each name that has an lcssa
2286 phi in exit_bb) we create a new PHI in merge_bb.  The new PHI has two
2287 args corresponding to GUARD_EDGE and MERGE_EDGE.  The arg for MERGE_EDGE is
2288 the arg of the original PHI in exit_bb; the arg for GUARD_EDGE is defined
2289 by LOOP and is found in the exit bb of LOOP.  The arg of the original PHI
2290 in exit_bb will also be updated.  */
2291
2292 static void
2293 slpeel_update_phi_nodes_for_guard2 (class loop *loop, class loop *epilog,
2294 edge guard_edge, edge merge_edge)
2295 {
2296 gphi_iterator gsi;
2297 basic_block merge_bb = guard_edge->dest;
2298
2299 gcc_assert (single_succ_p (merge_bb));
2300 edge e = single_succ_edge (merge_bb);
2301 basic_block exit_bb = e->dest;
2302 gcc_assert (single_pred_p (exit_bb));
2303 gcc_assert (single_pred (exit_bb) == single_exit (epilog)->dest);
2304
2305 for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi); gsi_next (&gsi))
2306 {
2307 gphi *update_phi = gsi.phi ();
2308 tree old_arg = PHI_ARG_DEF (update_phi, 0);
2309
2310 tree merge_arg = NULL_TREE;
2311
2312 /* If the old argument is a SSA_NAME use its current_def. */
2313 if (TREE_CODE (old_arg) == SSA_NAME)
2314 merge_arg = get_current_def (old_arg);
2315 /* If it's a constant or doesn't have a current_def, just use the old
2316 argument. */
2317 if (!merge_arg)
2318 merge_arg = old_arg;
2319
2320 tree guard_arg = find_guard_arg (loop, epilog, update_phi);
2321 /* If the var is live after loop but not a reduction, we simply
2322 use the old arg. */
2323 if (!guard_arg)
2324 guard_arg = old_arg;
2325
2326 /* Create new phi node in MERGE_BB: */
2327 tree new_res = copy_ssa_name (PHI_RESULT (update_phi));
2328 gphi *merge_phi = create_phi_node (new_res, merge_bb);
2329
2330 /* MERGE_BB has two incoming edges: GUARD_EDGE and MERGE_EDGE, Set
2331 the two PHI args in merge_phi for these edges. */
2332 add_phi_arg (merge_phi, merge_arg, merge_edge, UNKNOWN_LOCATION);
2333 add_phi_arg (merge_phi, guard_arg, guard_edge, UNKNOWN_LOCATION);
2334
2335 /* Update the original phi in exit_bb. */
2336 adjust_phi_and_debug_stmts (update_phi, e, new_res);
2337 }
2338 }
2339
2340 /* The EPILOG loop is duplicated from the original loop for vectorizing;
2341 the arg of its loop-closed ssa PHI needs to be updated.  */
2342
2343 static void
2344 slpeel_update_phi_nodes_for_lcssa (class loop *epilog)
2345 {
2346 gphi_iterator gsi;
2347 basic_block exit_bb = single_exit (epilog)->dest;
2348
2349 gcc_assert (single_pred_p (exit_bb));
2350 edge e = EDGE_PRED (exit_bb, 0);
2351 for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi); gsi_next (&gsi))
2352 rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (gsi.phi (), e));
2353 }
2354
2355 /* Function vect_do_peeling.
2356
2357 Input:
2358 - LOOP_VINFO: Represent a loop to be vectorized, which looks like:
2359
2360 preheader:
2361 LOOP:
2362 header_bb:
2363 loop_body
2364 if (exit_loop_cond) goto exit_bb
2365 else goto header_bb
2366 exit_bb:
2367
2368 - NITERS: The number of iterations of the loop.
2369 - NITERSM1: The number of iterations of the loop's latch.
2370 - NITERS_NO_OVERFLOW: No overflow in computing NITERS.
2371 - TH, CHECK_PROFITABILITY: Threshold of niters to vectorize loop if
2372 CHECK_PROFITABILITY is true.
2373 Output:
2374 - *NITERS_VECTOR and *STEP_VECTOR describe how the main loop should
2375 iterate after vectorization; see vect_set_loop_condition for details.
2376 - *NITERS_VECTOR_MULT_VF_VAR is either null or an SSA name that
2377 should be set to the number of scalar iterations handled by the
2378 vector loop. The SSA name is only used on exit from the loop.
2379
2380 This function peels prolog and epilog from the loop, adds guards skipping
2381 PROLOG and EPILOG for various conditions. As a result, the changed CFG
2382 would look like:
2383
2384 guard_bb_1:
2385 if (prefer_scalar_loop) goto merge_bb_1
2386 else goto guard_bb_2
2387
2388 guard_bb_2:
2389 if (skip_prolog) goto merge_bb_2
2390 else goto prolog_preheader
2391
2392 prolog_preheader:
2393 PROLOG:
2394 prolog_header_bb:
2395 prolog_body
2396 if (exit_prolog_cond) goto prolog_exit_bb
2397 else goto prolog_header_bb
2398 prolog_exit_bb:
2399
2400 merge_bb_2:
2401
2402 vector_preheader:
2403 VECTOR LOOP:
2404 vector_header_bb:
2405 vector_body
2406 if (exit_vector_cond) goto vector_exit_bb
2407 else goto vector_header_bb
2408 vector_exit_bb:
2409
2410 guard_bb_3:
2411 if (skip_epilog) goto merge_bb_3
2412 else goto epilog_preheader
2413
2414 merge_bb_1:
2415
2416 epilog_preheader:
2417 EPILOG:
2418 epilog_header_bb:
2419 epilog_body
2420 if (exit_epilog_cond) goto merge_bb_3
2421 else goto epilog_header_bb
2422
2423 merge_bb_3:
2424
2425 Note this function peels the prolog and epilog, and adds the guards,
2426 only if necessary.
2427 This function returns the epilogue loop if a decision was made to vectorize
2428 it, otherwise NULL.
2429
2430 The analysis resulting in this epilogue loop's loop_vec_info was performed
2431 in the same vect_analyze_loop call as the main loop's. At that time
2432 vect_analyze_loop constructs a list of accepted loop_vec_info's for lower
2433 vectorization factors than the main loop. This list is stored in the main
2434 loop's loop_vec_info in the 'epilogue_vinfos' member.  Every time we decide to
2435 vectorize the epilogue loop for a lower vectorization factor, the
2436 loop_vec_info sitting at the top of the epilogue_vinfos list is removed,
2437 updated and linked to the epilogue loop. This is later used to vectorize
2438 the epilogue. The reason the loop_vec_info needs updating is that it was
2439 constructed based on the original main loop, and the epilogue loop is a
2440 copy of this loop, so all links pointing to statements in the original loop
2441 need updating. Furthermore, these loop_vec_infos share the
2442 data_reference's records, which will also need to be updated.
2443
2444 TODO: Guard for prefer_scalar_loop should be emitted along with
2445 versioning conditions if loop versioning is needed. */
2446
2447
2448 class loop *
2449 vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
2450 tree *niters_vector, tree *step_vector,
2451 tree *niters_vector_mult_vf_var, int th,
2452 bool check_profitability, bool niters_no_overflow,
2453 tree *advance)
2454 {
2455 edge e, guard_e;
2456 tree type = TREE_TYPE (niters), guard_cond;
2457 basic_block guard_bb, guard_to;
2458 profile_probability prob_prolog, prob_vector, prob_epilog;
2459 int estimated_vf;
2460 int prolog_peeling = 0;
2461 bool vect_epilogues = loop_vinfo->epilogue_vinfos.length () > 0;
2462 /* We currently do not support prolog peeling if the target alignment is not
2463 known at compile time. 'vect_gen_prolog_loop_niters' depends on the
2464 target alignment being constant. */
2465 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2466 if (dr_info && !DR_TARGET_ALIGNMENT (dr_info).is_constant ())
2467 return NULL;
2468
2469 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2470 prolog_peeling = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2471
2472 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2473 poly_uint64 bound_epilog = 0;
2474 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
2475 && LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2476 bound_epilog += vf - 1;
2477 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2478 bound_epilog += 1;
2479 bool epilog_peeling = maybe_ne (bound_epilog, 0U);
2480 poly_uint64 bound_scalar = bound_epilog;
2481
2482 if (!prolog_peeling && !epilog_peeling)
2483 return NULL;
2484
2485 prob_vector = profile_probability::guessed_always ().apply_scale (9, 10);
2486 estimated_vf = vect_vf_for_cost (loop_vinfo);
2487 if (estimated_vf == 2)
2488 estimated_vf = 3;
2489 prob_prolog = prob_epilog = profile_probability::guessed_always ()
2490 .apply_scale (estimated_vf - 1, estimated_vf);
2491
2492 class loop *prolog, *epilog = NULL, *loop = LOOP_VINFO_LOOP (loop_vinfo);
2493 class loop *first_loop = loop;
2494 bool irred_flag = loop_preheader_edge (loop)->flags & EDGE_IRREDUCIBLE_LOOP;
2495
2496 /* We might have a queued need to update virtual SSA form. As we
2497 delete the update SSA machinery below after doing a regular
2498 incremental SSA update during loop copying, make sure we don't
2499 lose that fact.
2500 ??? Needing to update virtual SSA form by renaming is unfortunate
2501 but not all of the vectorizer code inserting new loads / stores
2502 properly assigns virtual operands to those statements. */
2503 update_ssa (TODO_update_ssa_only_virtuals);
2504
2505 create_lcssa_for_virtual_phi (loop);
2506
2507 /* If we're vectorizing an epilogue loop, the update_ssa above will
2508 have ensured that the virtual operand is in SSA form throughout the
2509 vectorized main loop. Normally it is possible to trace the updated
2510 vector-stmt vdefs back to scalar-stmt vdefs and vector-stmt vuses
2511 back to scalar-stmt vuses, meaning that the effect of the SSA update
2512 remains local to the main loop. However, there are rare cases in
2513 which the vectorized loop has vdefs even when the original scalar
2514 loop didn't. For example, vectorizing a load with IFN_LOAD_LANES
2515 introduces clobbers of the temporary vector array, which in turn
2516 needs new vdefs. If the scalar loop doesn't write to memory, these
2517 new vdefs will be the only ones in the vector loop.
2518
2519 In that case, update_ssa will have added a new virtual phi to the
2520 main loop, which previously didn't need one. Ensure that we (locally)
2521 maintain LCSSA form for the virtual operand, just as we would have
2522 done if the virtual phi had existed from the outset. This makes it
2523 easier to duplicate the scalar epilogue loop below. */
2524 tree vop_to_rename = NULL_TREE;
2525 if (loop_vec_info orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo))
2526 {
2527 class loop *orig_loop = LOOP_VINFO_LOOP (orig_loop_vinfo);
2528 vop_to_rename = create_lcssa_for_virtual_phi (orig_loop);
2529 }
2530
2531 if (MAY_HAVE_DEBUG_BIND_STMTS)
2532 {
2533 gcc_assert (!adjust_vec.exists ());
2534 adjust_vec.create (32);
2535 }
2536 initialize_original_copy_tables ();
2537
2538 /* Record the anchor bb at which the guard should be placed if the scalar
2539 loop might be preferred. */
2540 basic_block anchor = loop_preheader_edge (loop)->src;
2541
2542 /* Generate the number of iterations for the prolog loop. We do this here
2543 so that we can also get the upper bound on the number of iterations. */
2544 tree niters_prolog;
2545 int bound_prolog = 0;
2546 if (prolog_peeling)
2547 niters_prolog = vect_gen_prolog_loop_niters (loop_vinfo, anchor,
2548 &bound_prolog);
2549 else
2550 niters_prolog = build_int_cst (type, 0);
2551
2552 loop_vec_info epilogue_vinfo = NULL;
2553 if (vect_epilogues)
2554 {
2555 epilogue_vinfo = loop_vinfo->epilogue_vinfos[0];
2556 loop_vinfo->epilogue_vinfos.ordered_remove (0);
2557 }
2558
2559 tree niters_vector_mult_vf = NULL_TREE;
2560 /* Save NITERS before the loop, as this may be changed by the prologue.  */
2561 tree before_loop_niters = LOOP_VINFO_NITERS (loop_vinfo);
2562 edge update_e = NULL, skip_e = NULL;
2563 unsigned int lowest_vf = constant_lower_bound (vf);
2564 /* If we know the number of scalar iterations for the main loop we should
2565 check whether after the main loop there are enough iterations left over
2566 for the epilogue. */
2567 if (vect_epilogues
2568 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2569 && prolog_peeling >= 0
2570 && known_eq (vf, lowest_vf))
2571 {
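/* Illustrative example (assumed values): with 100 known scalar iterations,
   no prolog peeling, no gaps, a main-loop VF of 8 and a candidate epilogue
   vector mode of half the size (ratio 2, epilogue VF 4), eiters = 100 % 8
   = 4, which satisfies 4 >= 8 / 2, so this epilogue loop_vec_info is kept;
   with 98 iterations only 2 would remain and the candidate would be
   discarded in favour of the next smaller one, if any.  */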
2572 unsigned HOST_WIDE_INT eiters
2573 = (LOOP_VINFO_INT_NITERS (loop_vinfo)
2574 - LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo));
2575
2576 eiters -= prolog_peeling;
2577 eiters
2578 = eiters % lowest_vf + LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo);
2579
2580 unsigned int ratio;
2581 unsigned int epilogue_gaps
2582 = LOOP_VINFO_PEELING_FOR_GAPS (epilogue_vinfo);
2583 while (!(constant_multiple_p
2584 (GET_MODE_SIZE (loop_vinfo->vector_mode),
2585 GET_MODE_SIZE (epilogue_vinfo->vector_mode), &ratio)
2586 && eiters >= lowest_vf / ratio + epilogue_gaps))
2587 {
2588 delete epilogue_vinfo;
2589 epilogue_vinfo = NULL;
2590 if (loop_vinfo->epilogue_vinfos.length () == 0)
2591 {
2592 vect_epilogues = false;
2593 break;
2594 }
2595 epilogue_vinfo = loop_vinfo->epilogue_vinfos[0];
2596 loop_vinfo->epilogue_vinfos.ordered_remove (0);
2597 epilogue_gaps = LOOP_VINFO_PEELING_FOR_GAPS (epilogue_vinfo);
2598 }
2599 }
2600 /* Prolog loop may be skipped. */
2601 bool skip_prolog = (prolog_peeling != 0);
2602 /* Skip to the epilog when there are not enough iterations to enter
2603 the vectorized loop.  If true we should perform runtime checks on
2604 NITERS to decide whether to skip the current vectorized loop.  If
2605 we know the number of scalar iterations, we only add such a runtime
2606 check when that number may be smaller than the number of iterations
2607 required to enter this loop; for this we use the upper bounds on
2608 the prolog and epilog peeling.  When we don't know the number of
2609 iterations and don't require versioning, the skip is not necessary
2610 because we have asserted that there are enough scalar iterations
2611 to enter the main loop.  When we are versioning then we only
2612 add such a skip if we have chosen to vectorize the
2613 epilogue.  */
2614 bool skip_vector = (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2615 ? maybe_lt (LOOP_VINFO_INT_NITERS (loop_vinfo),
2616 bound_prolog + bound_epilog)
2617 : (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
2618 || vect_epilogues));
2619 /* Epilog loop must be executed if the number of iterations for epilog
2620 loop is known at compile time, otherwise we need to add a check at
2621 the end of vector loop and skip to the end of epilog loop. */
2622 bool skip_epilog = (prolog_peeling < 0
2623 || !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2624 || !vf.is_constant ());
2625 /* PEELING_FOR_GAPS is special because epilog loop must be executed. */
2626 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2627 skip_epilog = false;
2628
2629 if (skip_vector)
2630 {
2631 split_edge (loop_preheader_edge (loop));
2632
2633 /* Due to the order in which we peel prolog and epilog, we first
2634 propagate probability to the whole loop. The purpose is to
2635 avoid adjusting probabilities of both prolog and vector loops
2636 separately. Note in this case, the probability of epilog loop
2637 needs to be scaled back later. */
2638 basic_block bb_before_loop = loop_preheader_edge (loop)->src;
2639 if (prob_vector.initialized_p ())
2640 {
2641 scale_bbs_frequencies (&bb_before_loop, 1, prob_vector);
2642 scale_loop_profile (loop, prob_vector, 0);
2643 }
2644 }
2645
2646 dump_user_location_t loop_loc = find_loop_location (loop);
2647 class loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
2648 if (vect_epilogues)
2649 /* Make sure to set the epilogue's epilogue scalar loop, such that we can
2650 use the original scalar loop as remaining epilogue if necessary. */
2651 LOOP_VINFO_SCALAR_LOOP (epilogue_vinfo)
2652 = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
2653
2654 if (prolog_peeling)
2655 {
2656 e = loop_preheader_edge (loop);
2657 if (!slpeel_can_duplicate_loop_p (loop, e))
2658 {
2659 dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2660 "loop can't be duplicated to preheader edge.\n");
2661 gcc_unreachable ();
2662 }
2663 /* Peel prolog and put it on preheader edge of loop. */
2664 prolog = slpeel_tree_duplicate_loop_to_edge_cfg (loop, scalar_loop, e);
2665 if (!prolog)
2666 {
2667 dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2668 "slpeel_tree_duplicate_loop_to_edge_cfg failed.\n");
2669 gcc_unreachable ();
2670 }
2671 prolog->force_vectorize = false;
2672 slpeel_update_phi_nodes_for_loops (loop_vinfo, prolog, loop, true);
2673 first_loop = prolog;
2674 reset_original_copy_tables ();
2675
2676 /* Update the number of iterations for prolog loop. */
2677 tree step_prolog = build_one_cst (TREE_TYPE (niters_prolog));
2678 vect_set_loop_condition (prolog, NULL, niters_prolog,
2679 step_prolog, NULL_TREE, false);
2680
2681 /* Skip the prolog loop. */
2682 if (skip_prolog)
2683 {
2684 guard_cond = fold_build2 (EQ_EXPR, boolean_type_node,
2685 niters_prolog, build_int_cst (type, 0));
2686 guard_bb = loop_preheader_edge (prolog)->src;
2687 basic_block bb_after_prolog = loop_preheader_edge (loop)->src;
2688 guard_to = split_edge (loop_preheader_edge (loop));
2689 guard_e = slpeel_add_loop_guard (guard_bb, guard_cond,
2690 guard_to, guard_bb,
2691 prob_prolog.invert (),
2692 irred_flag);
2693 e = EDGE_PRED (guard_to, 0);
2694 e = (e != guard_e ? e : EDGE_PRED (guard_to, 1));
2695 slpeel_update_phi_nodes_for_guard1 (prolog, loop, guard_e, e);
2696
2697 scale_bbs_frequencies (&bb_after_prolog, 1, prob_prolog);
2698 scale_loop_profile (prolog, prob_prolog, bound_prolog);
2699 }
2700
2701 /* Update init address of DRs. */
2702 vect_update_inits_of_drs (loop_vinfo, niters_prolog, PLUS_EXPR);
2703 /* Update niters for vector loop. */
2704 LOOP_VINFO_NITERS (loop_vinfo)
2705 = fold_build2 (MINUS_EXPR, type, niters, niters_prolog);
2706 LOOP_VINFO_NITERSM1 (loop_vinfo)
2707 = fold_build2 (MINUS_EXPR, type,
2708 LOOP_VINFO_NITERSM1 (loop_vinfo), niters_prolog);
2709 bool new_var_p = false;
2710 niters = vect_build_loop_niters (loop_vinfo, &new_var_p);
2711 /* It's guaranteed that vector loop bound before vectorization is at
2712 least VF, so set range information for newly generated var. */
2713 if (new_var_p)
2714 set_range_info (niters, VR_RANGE,
2715 wi::to_wide (build_int_cst (type, vf)),
2716 wi::to_wide (TYPE_MAX_VALUE (type)));
2717
2718 /* Prolog iterates at most bound_prolog times, latch iterates at
2719 most bound_prolog - 1 times. */
2720 record_niter_bound (prolog, bound_prolog - 1, false, true);
2721 delete_update_ssa ();
2722 adjust_vec_debug_stmts ();
2723 scev_reset ();
2724 }
2725
2726 if (epilog_peeling)
2727 {
2728 e = single_exit (loop);
2729 if (!slpeel_can_duplicate_loop_p (loop, e))
2730 {
2731 dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2732 "loop can't be duplicated to exit edge.\n");
2733 gcc_unreachable ();
2734 }
2735 /* Peel epilog and put it on exit edge of loop. If we are vectorizing
2736 said epilog then we should use a copy of the main loop as a starting
2737 point. This loop may have already had some preliminary transformations
2738 to allow for more optimal vectorization, for example if-conversion.
2739 If we are not vectorizing the epilog then we should use the scalar loop
2740 as the transformations mentioned above make less or no sense when not
2741 vectorizing. */
2742 epilog = vect_epilogues ? get_loop_copy (loop) : scalar_loop;
2743 if (vop_to_rename)
2744 {
2745 /* Vectorizing the main loop can sometimes introduce a vdef to
2746 a loop that previously didn't have one; see the comment above
2747 the definition of VOP_TO_RENAME for details. The definition
2748 D that holds on E will then be different from the definition
2749 VOP_TO_RENAME that holds during SCALAR_LOOP, so we need to
2750 rename VOP_TO_RENAME to D when copying the loop.
2751
2752 The virtual operand is in LCSSA form for the main loop,
2753 and no stmt between the main loop and E needs a vdef,
2754 so we know that D is provided by a phi rather than by a
2755 vdef on a normal gimple stmt. */
2756 basic_block vdef_bb = e->src;
2757 gphi *vphi;
2758 while (!(vphi = get_virtual_phi (vdef_bb)))
2759 vdef_bb = get_immediate_dominator (CDI_DOMINATORS, vdef_bb);
2760 gcc_assert (vop_to_rename != gimple_phi_result (vphi));
2761 set_current_def (vop_to_rename, gimple_phi_result (vphi));
2762 }
2763 epilog = slpeel_tree_duplicate_loop_to_edge_cfg (loop, epilog, e);
2764 if (!epilog)
2765 {
2766 dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2767 "slpeel_tree_duplicate_loop_to_edge_cfg failed.\n");
2768 gcc_unreachable ();
2769 }
2770 epilog->force_vectorize = false;
2771 slpeel_update_phi_nodes_for_loops (loop_vinfo, loop, epilog, false);
2772
2773 /* The scalar version of the loop may be preferred.  In this case, add a
2774 guard and skip to the epilog.  Note this only happens when the number
2775 of iterations of the loop is unknown at compile time; otherwise the
2776 loop won't be vectorized.  */
2777 if (skip_vector)
2778 {
2779 /* Additional epilogue iteration is peeled if gap exists. */
2780 tree t = vect_gen_scalar_loop_niters (niters_prolog, prolog_peeling,
2781 bound_prolog, bound_epilog,
2782 th, &bound_scalar,
2783 check_profitability);
2784 /* Build guard against NITERSM1 since NITERS may overflow. */
2785 guard_cond = fold_build2 (LT_EXPR, boolean_type_node, nitersm1, t);
2786 guard_bb = anchor;
2787 guard_to = split_edge (loop_preheader_edge (epilog));
2788 guard_e = slpeel_add_loop_guard (guard_bb, guard_cond,
2789 guard_to, guard_bb,
2790 prob_vector.invert (),
2791 irred_flag);
2792 skip_e = guard_e;
2793 e = EDGE_PRED (guard_to, 0);
2794 e = (e != guard_e ? e : EDGE_PRED (guard_to, 1));
2795 slpeel_update_phi_nodes_for_guard1 (first_loop, epilog, guard_e, e);
2796
2797 /* Simply propagate profile info from guard_bb to guard_to which is
2798 a merge point of control flow. */
2799 guard_to->count = guard_bb->count;
2800
2801 /* Scale probability of epilog loop back.
2802 FIXME: We should avoid scaling down and back up. Profile may
2803 get lost if we scale down to 0. */
2804 basic_block *bbs = get_loop_body (epilog);
2805 for (unsigned int i = 0; i < epilog->num_nodes; i++)
2806 bbs[i]->count = bbs[i]->count.apply_scale
2807 (bbs[i]->count,
2808 bbs[i]->count.apply_probability
2809 (prob_vector));
2810 free (bbs);
2811 }
2812
2813 basic_block bb_before_epilog = loop_preheader_edge (epilog)->src;
2814 /* If the loop is peeled a non-zero constant number of times, niters now
2815 refers to orig_niters - prolog_peeling; it won't overflow even if
2816 orig_niters overflows.
2817 niters_no_overflow |= (prolog_peeling > 0);
2818 vect_gen_vector_loop_niters (loop_vinfo, niters,
2819 niters_vector, step_vector,
2820 niters_no_overflow);
2821 if (!integer_onep (*step_vector))
2822 {
2823 /* On exit from the loop we will have an easy way of calculating
2824 NITERS_VECTOR / STEP * STEP. Install a dummy definition
2825 until then. */
2826 niters_vector_mult_vf = make_ssa_name (TREE_TYPE (*niters_vector));
2827 SSA_NAME_DEF_STMT (niters_vector_mult_vf) = gimple_build_nop ();
2828 *niters_vector_mult_vf_var = niters_vector_mult_vf;
2829 }
2830 else
2831 vect_gen_vector_loop_niters_mult_vf (loop_vinfo, *niters_vector,
2832 &niters_vector_mult_vf);
2833 /* Update IVs of original loop as if they were advanced by
2834 niters_vector_mult_vf steps. */
2835 gcc_checking_assert (vect_can_advance_ivs_p (loop_vinfo));
2836 update_e = skip_vector ? e : loop_preheader_edge (epilog);
2837 vect_update_ivs_after_vectorizer (loop_vinfo, niters_vector_mult_vf,
2838 update_e);
2839
2840 if (skip_epilog)
2841 {
2842 guard_cond = fold_build2 (EQ_EXPR, boolean_type_node,
2843 niters, niters_vector_mult_vf);
2844 guard_bb = single_exit (loop)->dest;
2845 guard_to = split_edge (single_exit (epilog));
2846 guard_e = slpeel_add_loop_guard (guard_bb, guard_cond, guard_to,
2847 skip_vector ? anchor : guard_bb,
2848 prob_epilog.invert (),
2849 irred_flag);
2850 slpeel_update_phi_nodes_for_guard2 (loop, epilog, guard_e,
2851 single_exit (epilog));
2852 /* Only need to handle basic block before epilog loop if it's not
2853 the guard_bb, which is the case when skip_vector is true. */
2854 if (guard_bb != bb_before_epilog)
2855 {
2856 prob_epilog = prob_vector * prob_epilog + prob_vector.invert ();
2857
2858 scale_bbs_frequencies (&bb_before_epilog, 1, prob_epilog);
2859 }
2860 scale_loop_profile (epilog, prob_epilog, 0);
2861 }
2862 else
2863 slpeel_update_phi_nodes_for_lcssa (epilog);
2864
2865 unsigned HOST_WIDE_INT bound;
2866 if (bound_scalar.is_constant (&bound))
2867 {
2868 gcc_assert (bound != 0);
2869 /* -1 to convert loop iterations to latch iterations. */
2870 record_niter_bound (epilog, bound - 1, false, true);
2871 }
2872
2873 delete_update_ssa ();
2874 adjust_vec_debug_stmts ();
2875 scev_reset ();
2876 }
2877
2878 if (vect_epilogues)
2879 {
2880 epilog->aux = epilogue_vinfo;
2881 LOOP_VINFO_LOOP (epilogue_vinfo) = epilog;
2882
2883 loop_constraint_clear (epilog, LOOP_C_INFINITE);
2884
2885 /* We now must calculate the number of NITERS performed by the previous
2886 loop and EPILOGUE_NITERS to be performed by the epilogue. */
2887 tree niters = fold_build2 (PLUS_EXPR, TREE_TYPE (niters_vector_mult_vf),
2888 niters_prolog, niters_vector_mult_vf);
2889
2890 /* If skip_vector we may have skipped the previous loop, so we insert a
2891 phi-node to determine whether we come from the previous vectorized loop
2892 via the update_e edge or from the skip_vector basic block via the
2893 skip_e edge.  */
2894 if (skip_vector)
2895 {
2896 gcc_assert (update_e != NULL && skip_e != NULL);
2897 gphi *new_phi = create_phi_node (make_ssa_name (TREE_TYPE (niters)),
2898 update_e->dest);
2899 tree new_ssa = make_ssa_name (TREE_TYPE (niters));
2900 gimple *stmt = gimple_build_assign (new_ssa, niters);
2901 gimple_stmt_iterator gsi;
2902 if (TREE_CODE (niters_vector_mult_vf) == SSA_NAME
2903 && SSA_NAME_DEF_STMT (niters_vector_mult_vf)->bb != NULL)
2904 {
2905 gsi = gsi_for_stmt (SSA_NAME_DEF_STMT (niters_vector_mult_vf));
2906 gsi_insert_after (&gsi, stmt, GSI_NEW_STMT);
2907 }
2908 else
2909 {
2910 gsi = gsi_last_bb (update_e->src);
2911 gsi_insert_before (&gsi, stmt, GSI_NEW_STMT);
2912 }
2913
2914 niters = new_ssa;
2915 add_phi_arg (new_phi, niters, update_e, UNKNOWN_LOCATION);
2916 add_phi_arg (new_phi, build_zero_cst (TREE_TYPE (niters)), skip_e,
2917 UNKNOWN_LOCATION);
2918 niters = PHI_RESULT (new_phi);
2919 }
2920
2921 /* Subtract the number of iterations performed by the vectorized loop
2922 from the number of total iterations. */
2923 tree epilogue_niters = fold_build2 (MINUS_EXPR, TREE_TYPE (niters),
2924 before_loop_niters,
2925 niters);
2926
2927 LOOP_VINFO_NITERS (epilogue_vinfo) = epilogue_niters;
2928 LOOP_VINFO_NITERSM1 (epilogue_vinfo)
2929 = fold_build2 (MINUS_EXPR, TREE_TYPE (epilogue_niters),
2930 epilogue_niters,
2931 build_one_cst (TREE_TYPE (epilogue_niters)));
2932
2933 /* Set ADVANCE to the number of iterations performed by the previous
2934 loop and its prologue. */
2935 *advance = niters;
2936
2937 /* Redo the peeling for niter analysis as the NITERs and alignment
2938 may have been updated to take the main loop into account. */
2939 determine_peel_for_niter (epilogue_vinfo);
2940 }
2941
2942 adjust_vec.release ();
2943 free_original_copy_tables ();
2944
2945 return vect_epilogues ? epilog : NULL;
2946 }
2947
2948 /* Function vect_create_cond_for_niters_checks.
2949
2950 Create a conditional expression that represents the run-time checks for
2951 loop's niter. The loop is guaranteed to terminate if the run-time
2952 checks hold.
2953
2954 Input:
2955 COND_EXPR - input conditional expression.  The new condition will be
2956 chained with it using a logical AND operation; if it is NULL,
2957 the new condition is used on its own.
2958 LOOP_VINFO - field LOOP_VINFO_NITERS_ASSUMPTIONS contains the niter
2959 assumptions to be checked.
2960
2961 Output:
2962 COND_EXPR - conditional expression.
2963
2964 The returned COND_EXPR is the conditional expression to be used in the
2965 if statement that controls which version of the loop gets executed at
2966 runtime. */
2967
2968 static void
2969 vect_create_cond_for_niters_checks (loop_vec_info loop_vinfo, tree *cond_expr)
2970 {
2971 tree part_cond_expr = LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo);
2972
2973 if (*cond_expr)
2974 *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
2975 *cond_expr, part_cond_expr);
2976 else
2977 *cond_expr = part_cond_expr;
2978 }
2979
2980 /* Set *COND_EXPR to a tree that is true when both the original *COND_EXPR
2981 and PART_COND_EXPR are true. Treat a null *COND_EXPR as "true". */
2982
2983 static void
2984 chain_cond_expr (tree *cond_expr, tree part_cond_expr)
2985 {
2986 if (*cond_expr)
2987 *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
2988 *cond_expr, part_cond_expr);
2989 else
2990 *cond_expr = part_cond_expr;
2991 }
2992
2993 /* Function vect_create_cond_for_align_checks.
2994
2995 Create a conditional expression that represents the alignment checks for
2996 all of data references (array element references) whose alignment must be
2997 checked at runtime.
2998
2999 Input:
3000 COND_EXPR - input conditional expression. New conditions will be chained
3001 with logical AND operation.
3002 LOOP_VINFO - two fields of the loop information are used.
3003 LOOP_VINFO_PTR_MASK is the mask used to check the alignment.
3004 LOOP_VINFO_MAY_MISALIGN_STMTS contains the refs to be checked.
3005
3006 Output:
3007 COND_EXPR_STMT_LIST - statements needed to construct the conditional
3008 expression.
3009 The returned value is the conditional expression to be used in the if
3010 statement that controls which version of the loop gets executed at runtime.
3011
3012 The algorithm makes two assumptions:
3013 1) The number of bytes "n" in a vector is a power of 2.
3014 2) An address "a" is aligned if a%n is zero and this
3015 test can be done as a&(n-1) == 0. For example, for 16
3016 byte vectors the test is a&0xf == 0. */
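/* Worked example with illustrative addresses: for two data references and
   mask == 0xf, start addresses 0x1000 and 0x2008 give
   or_tmp = 0x1000 | 0x2008 = 0x3008 and and_tmp = 0x3008 & 0xf = 0x8,
   which is nonzero, so the scalar version is taken; if the second address
   were 0x2000 the result would be 0 and the vectorized loop would run.  */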
3017
3018 static void
3019 vect_create_cond_for_align_checks (loop_vec_info loop_vinfo,
3020 tree *cond_expr,
3021 gimple_seq *cond_expr_stmt_list)
3022 {
3023 vec<stmt_vec_info> may_misalign_stmts
3024 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
3025 stmt_vec_info stmt_info;
3026 int mask = LOOP_VINFO_PTR_MASK (loop_vinfo);
3027 tree mask_cst;
3028 unsigned int i;
3029 tree int_ptrsize_type;
3030 char tmp_name[20];
3031 tree or_tmp_name = NULL_TREE;
3032 tree and_tmp_name;
3033 gimple *and_stmt;
3034 tree ptrsize_zero;
3035 tree part_cond_expr;
3036
3037 /* Check that mask is one less than a power of 2, i.e., mask is
3038 all zeros followed by all ones. */
3039 gcc_assert ((mask != 0) && ((mask & (mask+1)) == 0));
3040
3041 int_ptrsize_type = signed_type_for (ptr_type_node);
3042
3043 /* Create expression (mask & (dr_1 | ... | dr_n)) where dr_i is the address
3044 of the first vector of the i'th data reference. */
3045
3046 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt_info)
3047 {
3048 gimple_seq new_stmt_list = NULL;
3049 tree addr_base;
3050 tree addr_tmp_name;
3051 tree new_or_tmp_name;
3052 gimple *addr_stmt, *or_stmt;
3053 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3054 bool negative = tree_int_cst_compare
3055 (DR_STEP (STMT_VINFO_DATA_REF (stmt_info)), size_zero_node) < 0;
3056 tree offset = negative
3057 ? size_int (-TYPE_VECTOR_SUBPARTS (vectype) + 1) : size_zero_node;
3058
3059 /* create: addr_tmp = (int)(address_of_first_vector) */
3060 addr_base =
3061 vect_create_addr_base_for_vector_ref (loop_vinfo,
3062 stmt_info, &new_stmt_list,
3063 offset);
3064 if (new_stmt_list != NULL)
3065 gimple_seq_add_seq (cond_expr_stmt_list, new_stmt_list);
3066
3067 sprintf (tmp_name, "addr2int%d", i);
3068 addr_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, tmp_name);
3069 addr_stmt = gimple_build_assign (addr_tmp_name, NOP_EXPR, addr_base);
3070 gimple_seq_add_stmt (cond_expr_stmt_list, addr_stmt);
3071
3072 /* The addresses are ORed together.  */
3073
3074 if (or_tmp_name != NULL_TREE)
3075 {
3076 /* create: or_tmp = or_tmp | addr_tmp */
3077 sprintf (tmp_name, "orptrs%d", i);
3078 new_or_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, tmp_name);
3079 or_stmt = gimple_build_assign (new_or_tmp_name, BIT_IOR_EXPR,
3080 or_tmp_name, addr_tmp_name);
3081 gimple_seq_add_stmt (cond_expr_stmt_list, or_stmt);
3082 or_tmp_name = new_or_tmp_name;
3083 }
3084 else
3085 or_tmp_name = addr_tmp_name;
3086
3087 } /* end for i */
3088
3089 mask_cst = build_int_cst (int_ptrsize_type, mask);
3090
3091 /* create: and_tmp = or_tmp & mask */
3092 and_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, "andmask");
3093
3094 and_stmt = gimple_build_assign (and_tmp_name, BIT_AND_EXPR,
3095 or_tmp_name, mask_cst);
3096 gimple_seq_add_stmt (cond_expr_stmt_list, and_stmt);
3097
3098 /* Make and_tmp the left operand of the conditional test against zero.
3099 If and_tmp has a nonzero bit then some address is unaligned.  */
3100 ptrsize_zero = build_int_cst (int_ptrsize_type, 0);
3101 part_cond_expr = fold_build2 (EQ_EXPR, boolean_type_node,
3102 and_tmp_name, ptrsize_zero);
3103 chain_cond_expr (cond_expr, part_cond_expr);
3104 }
3105
3106 /* If LOOP_VINFO_CHECK_UNEQUAL_ADDRS contains <A1, B1>, ..., <An, Bn>,
3107 create a tree representation of: (&A1 != &B1) && ... && (&An != &Bn).
3108 Set *COND_EXPR to a tree that is true when both the original *COND_EXPR
3109 and this new condition are true. Treat a null *COND_EXPR as "true". */
3110
3111 static void
3112 vect_create_cond_for_unequal_addrs (loop_vec_info loop_vinfo, tree *cond_expr)
3113 {
3114 vec<vec_object_pair> pairs = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3115 unsigned int i;
3116 vec_object_pair *pair;
3117 FOR_EACH_VEC_ELT (pairs, i, pair)
3118 {
3119 tree addr1 = build_fold_addr_expr (pair->first);
3120 tree addr2 = build_fold_addr_expr (pair->second);
3121 tree part_cond_expr = fold_build2 (NE_EXPR, boolean_type_node,
3122 addr1, addr2);
3123 chain_cond_expr (cond_expr, part_cond_expr);
3124 }
3125 }
3126
3127 /* Create an expression that is true when all lower-bound conditions for
3128 the vectorized loop are met. Chain this condition with *COND_EXPR. */
3129
3130 static void
3131 vect_create_cond_for_lower_bounds (loop_vec_info loop_vinfo, tree *cond_expr)
3132 {
3133 vec<vec_lower_bound> lower_bounds = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
3134 for (unsigned int i = 0; i < lower_bounds.length (); ++i)
3135 {
3136 tree expr = lower_bounds[i].expr;
3137 tree type = unsigned_type_for (TREE_TYPE (expr));
3138 expr = fold_convert (type, expr);
3139 poly_uint64 bound = lower_bounds[i].min_value;
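/* Descriptive note: for a signed EXPR the generated check is done in
   unsigned arithmetic by biasing, i.e. (unsigned) expr + (bound - 1)
   >= 2 * bound - 1, which holds exactly when expr <= -bound or
   expr >= bound, i.e. |expr| >= bound.  E.g. with bound == 4: expr == 4
   gives 7 >= 7 (pass), expr == -4 wraps to a huge value (pass), and
   expr == 3 gives 6 < 7 (fail).  */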
3140 if (!lower_bounds[i].unsigned_p)
3141 {
3142 expr = fold_build2 (PLUS_EXPR, type, expr,
3143 build_int_cstu (type, bound - 1));
3144 bound += bound - 1;
3145 }
3146 tree part_cond_expr = fold_build2 (GE_EXPR, boolean_type_node, expr,
3147 build_int_cstu (type, bound));
3148 chain_cond_expr (cond_expr, part_cond_expr);
3149 }
3150 }
3151
3152 /* Function vect_create_cond_for_alias_checks.
3153
3154 Create a conditional expression that represents the run-time checks for
3155 overlapping of address ranges represented by a list of data references
3156 relations passed as input.
3157
3158 Input:
3159 COND_EXPR - input conditional expression. New conditions will be chained
3160 with logical AND operation; if it is NULL, the new
3161 conditions are used on their own.
3162 LOOP_VINFO - field LOOP_VINFO_COMP_ALIAS_DDRS contains the list of ddr
3163 pairs to be checked.
3164
3165 Output:
3166 COND_EXPR - conditional expression.
3167
3168 The returned COND_EXPR is the conditional expression to be used in the if
3169 statement that controls which version of the loop gets executed at runtime.
3170 */
3171
3172 void
3173 vect_create_cond_for_alias_checks (loop_vec_info loop_vinfo, tree * cond_expr)
3174 {
3175 vec<dr_with_seg_len_pair_t> comp_alias_ddrs =
3176 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3177
3178 if (comp_alias_ddrs.is_empty ())
3179 return;
3180
3181 create_runtime_alias_checks (LOOP_VINFO_LOOP (loop_vinfo),
3182 &comp_alias_ddrs, cond_expr);
3183 if (dump_enabled_p ())
3184 dump_printf_loc (MSG_NOTE, vect_location,
3185 "created %u versioning for alias checks.\n",
3186 comp_alias_ddrs.length ());
3187 }
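/* Illustrative sketch only: create_runtime_alias_checks emits, for each
   pair of potentially aliasing references, a test that the accessed
   segments do not overlap.  For byte ranges [a, a + len_a) and
   [b, b + len_b) that is conceptually the check below (a simplified
   stand-alone form; the real code also copes with segment lengths that
   depend on the runtime vectorization factor and with negative steps).  */

static inline bool
example_segments_disjoint_p (uintptr_t a, uintptr_t len_a,
			     uintptr_t b, uintptr_t len_b)
{
  /* No overlap if one segment ends at or before the start of the other.  */
  return a + len_a <= b || b + len_b <= a;
}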
3188
3189
3190 /* Function vect_loop_versioning.
3191
3192 If the loop has data references that may or may not be aligned and/or
3193 has data reference relations whose independence was not proven, then
3194 two versions of the loop need to be generated, one which is vectorized
3195 and one which isn't. A test is then generated to control which of the
3196 loops is executed. The test checks the alignment of all of the
3197 data references that may or may not be aligned. An additional
3198 sequence of runtime tests is generated for each pair of DDRs whose
3199 independence was not proven. The vectorized version of the loop is
3200 executed only if both the alias and the alignment tests pass.
3201
3202 The test generated to check which version of the loop is executed
3203 is modified to also check for profitability as indicated by the
3204 cost model threshold TH.
3205
3206 The versioning precondition(s) are placed in *COND_EXPR and
3207 *COND_EXPR_STMT_LIST. */
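/* Illustrative sketch only: conceptually the transformation produces

       if (versioning_cond)   <-- *COND_EXPR, preceded by the statements
	 vectorized_loop;         in *COND_EXPR_STMT_LIST
       else
	 scalar_loop;

   where versioning_cond ANDs together the profitability, niter,
   alignment, alias and simd-if checks built below.  */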
3208
3209 class loop *
3210 vect_loop_versioning (loop_vec_info loop_vinfo,
3211 gimple *loop_vectorized_call)
3212 {
3213 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *nloop;
3214 class loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
3215 basic_block condition_bb;
3216 gphi_iterator gsi;
3217 gimple_stmt_iterator cond_exp_gsi;
3218 basic_block merge_bb;
3219 basic_block new_exit_bb;
3220 edge new_exit_e, e;
3221 gphi *orig_phi, *new_phi;
3222 tree cond_expr = NULL_TREE;
3223 gimple_seq cond_expr_stmt_list = NULL;
3224 tree arg;
3225 profile_probability prob = profile_probability::likely ();
3226 gimple_seq gimplify_stmt_list = NULL;
3227 tree scalar_loop_iters = LOOP_VINFO_NITERSM1 (loop_vinfo);
3228 bool version_align = LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo);
3229 bool version_alias = LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo);
3230 bool version_niter = LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo);
3231 poly_uint64 versioning_threshold
3232 = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
3233 tree version_simd_if_cond
3234 = LOOP_REQUIRES_VERSIONING_FOR_SIMD_IF_COND (loop_vinfo);
3235 unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
3236
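  /* Note: scalar_loop_iters is LOOP_VINFO_NITERSM1, i.e. the iteration
     count minus one, so the GE comparisons against TH - 1 and
     versioning_threshold - 1 below implement niters >= TH and
     niters >= versioning_threshold.  */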
3237 if (vect_apply_runtime_profitability_check_p (loop_vinfo)
3238 && !ordered_p (th, versioning_threshold))
3239 cond_expr = fold_build2 (GE_EXPR, boolean_type_node, scalar_loop_iters,
3240 build_int_cst (TREE_TYPE (scalar_loop_iters),
3241 th - 1));
3242 if (maybe_ne (versioning_threshold, 0U))
3243 {
3244 tree expr = fold_build2 (GE_EXPR, boolean_type_node, scalar_loop_iters,
3245 build_int_cst (TREE_TYPE (scalar_loop_iters),
3246 versioning_threshold - 1));
3247 if (cond_expr)
3248 cond_expr = fold_build2 (BIT_AND_EXPR, boolean_type_node,
3249 expr, cond_expr);
3250 else
3251 cond_expr = expr;
3252 }
3253
3254 if (version_niter)
3255 vect_create_cond_for_niters_checks (loop_vinfo, &cond_expr);
3256
3257 if (cond_expr)
3258 cond_expr = force_gimple_operand_1 (unshare_expr (cond_expr),
3259 &cond_expr_stmt_list,
3260 is_gimple_condexpr, NULL_TREE);
3261
3262 if (version_align)
3263 vect_create_cond_for_align_checks (loop_vinfo, &cond_expr,
3264 &cond_expr_stmt_list);
3265
3266 if (version_alias)
3267 {
3268 vect_create_cond_for_unequal_addrs (loop_vinfo, &cond_expr);
3269 vect_create_cond_for_lower_bounds (loop_vinfo, &cond_expr);
3270 vect_create_cond_for_alias_checks (loop_vinfo, &cond_expr);
3271 }
3272
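  /* version_simd_if_cond typically originates from the `if' clause of an
     OpenMP simd construct: the vectorized version may only run when that
     condition evaluates to nonzero, so it is ANDed into the versioning
     test here.  */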
3273 if (version_simd_if_cond)
3274 {
3275 gcc_assert (dom_info_available_p (CDI_DOMINATORS));
3276 if (flag_checking)
3277 if (basic_block bb
3278 = gimple_bb (SSA_NAME_DEF_STMT (version_simd_if_cond)))
3279 gcc_assert (bb != loop->header
3280 && dominated_by_p (CDI_DOMINATORS, loop->header, bb)
3281 && (scalar_loop == NULL
3282 || (bb != scalar_loop->header
3283 && dominated_by_p (CDI_DOMINATORS,
3284 scalar_loop->header, bb))));
3285 tree zero = build_zero_cst (TREE_TYPE (version_simd_if_cond));
3286 tree c = fold_build2 (NE_EXPR, boolean_type_node,
3287 version_simd_if_cond, zero);
3288 if (cond_expr)
3289 cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
3290 c, cond_expr);
3291 else
3292 cond_expr = c;
3293 if (dump_enabled_p ())
3294 dump_printf_loc (MSG_NOTE, vect_location,
3295 "created versioning for simd if condition check.\n");
3296 }
3297
3298 cond_expr = force_gimple_operand_1 (unshare_expr (cond_expr),
3299 &gimplify_stmt_list,
3300 is_gimple_condexpr, NULL_TREE);
3301 gimple_seq_add_seq (&cond_expr_stmt_list, gimplify_stmt_list);
3302
3303 /* Compute the outermost loop that cond_expr and cond_expr_stmt_list are
3304 invariant in. */
3305 class loop *outermost = outermost_invariant_loop_for_expr (loop, cond_expr);
3306 for (gimple_stmt_iterator gsi = gsi_start (cond_expr_stmt_list);
3307 !gsi_end_p (gsi); gsi_next (&gsi))
3308 {
3309 gimple *stmt = gsi_stmt (gsi);
3310 update_stmt (stmt);
3311 ssa_op_iter iter;
3312 use_operand_p use_p;
3313 basic_block def_bb;
3314 FOR_EACH_SSA_USE_OPERAND (use_p, stmt, iter, SSA_OP_USE)
3315 if ((def_bb = gimple_bb (SSA_NAME_DEF_STMT (USE_FROM_PTR (use_p))))
3316 && flow_bb_inside_loop_p (outermost, def_bb))
3317 outermost = superloop_at_depth (loop, bb_loop_depth (def_bb) + 1);
3318 }
3319
3320 /* Search for the outermost loop we can version. Avoid versioning of
3321 non-perfect nests, but allow if-conversion-versioned loops inside. */
3322 class loop *loop_to_version = loop;
3323 if (flow_loop_nested_p (outermost, loop))
3324 {
3325 if (dump_enabled_p ())
3326 dump_printf_loc (MSG_NOTE, vect_location,
3327 "trying to apply versioning to outer loop %d\n",
3328 outermost->num);
3329 if (outermost->num == 0)
3330 outermost = superloop_at_depth (loop, 1);
3331 /* And avoid applying versioning on non-perfect nests. */
3332 while (loop_to_version != outermost
3333 && single_exit (loop_outer (loop_to_version))
3334 && (!loop_outer (loop_to_version)->inner->next
3335 || vect_loop_vectorized_call (loop_to_version))
3336 && (!loop_outer (loop_to_version)->inner->next
3337 || !loop_outer (loop_to_version)->inner->next->next))
3338 loop_to_version = loop_outer (loop_to_version);
3339 }
3340
3341 /* Apply versioning. If there is already a scalar version created by
3342 if-conversion, re-use that. Note we cannot re-use the copy of
3343 an if-converted outer loop when vectorizing the inner loop only. */
3344 gcond *cond;
3345 if ((!loop_to_version->inner || loop == loop_to_version)
3346 && loop_vectorized_call)
3347 {
3348 gcc_assert (scalar_loop);
3349 condition_bb = gimple_bb (loop_vectorized_call);
3350 cond = as_a <gcond *> (last_stmt (condition_bb));
3351 gimple_cond_set_condition_from_tree (cond, cond_expr);
3352 update_stmt (cond);
3353
3354 if (cond_expr_stmt_list)
3355 {
3356 cond_exp_gsi = gsi_for_stmt (loop_vectorized_call);
3357 gsi_insert_seq_before (&cond_exp_gsi, cond_expr_stmt_list,
3358 GSI_SAME_STMT);
3359 }
3360
3361 /* If-conversion uses profile_probability::always () for both paths,
3362 so reset the path probabilities appropriately. */
3363 edge te, fe;
3364 extract_true_false_edges_from_block (condition_bb, &te, &fe);
3365 te->probability = prob;
3366 fe->probability = prob.invert ();
3367 /* We can scale the loop counts immediately but have to postpone
3368 scaling the scalar loop because we re-use it during peeling. */
3369 scale_loop_frequencies (loop_to_version, te->probability);
3370 LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo) = fe->probability;
3371
3372 nloop = scalar_loop;
3373 if (dump_enabled_p ())
3374 dump_printf_loc (MSG_NOTE, vect_location,
3375 "reusing %sloop version created by if conversion\n",
3376 loop_to_version != loop ? "outer " : "");
3377 }
3378 else
3379 {
3380 if (loop_to_version != loop
3381 && dump_enabled_p ())
3382 dump_printf_loc (MSG_NOTE, vect_location,
3383 "applying loop versioning to outer loop %d\n",
3384 loop_to_version->num);
3385
3386 initialize_original_copy_tables ();
3387 nloop = loop_version (loop_to_version, cond_expr, &condition_bb,
3388 prob, prob.invert (), prob, prob.invert (), true);
3389 gcc_assert (nloop);
3390 nloop = get_loop_copy (loop);
3391
3392 /* Kill off IFN_LOOP_VECTORIZED calls in the copy; nobody will
3393 reap those otherwise, and they also refer to the original
3394 loops. */
3395 class loop *l = loop;
3396 while (gimple *call = vect_loop_vectorized_call (l))
3397 {
3398 call = SSA_NAME_DEF_STMT (get_current_def (gimple_call_lhs (call)));
3399 fold_loop_internal_call (call, boolean_false_node);
3400 l = loop_outer (l);
3401 }
3402 free_original_copy_tables ();
3403
3404 if (cond_expr_stmt_list)
3405 {
3406 cond_exp_gsi = gsi_last_bb (condition_bb);
3407 gsi_insert_seq_before (&cond_exp_gsi, cond_expr_stmt_list,
3408 GSI_SAME_STMT);
3409 }
3410
3411 /* Loop versioning violates an assumption we try to maintain during
3412 vectorization - that the loop exit block has a single predecessor.
3413 After versioning, the exit block of both loop versions is the same
3414 basic block (i.e. it has two predecessors). To simplify the
3415 following transformations in the vectorizer, we fix this situation
3416 here by adding a new (empty) block on the exit edge of the loop,
3417 with the proper loop-exit phis to maintain loop-closed form.
3418 If loop versioning wasn't done from loop, but from scalar_loop instead,
3419 merge_bb will already have just a single predecessor. */
3420
3421 merge_bb = single_exit (loop_to_version)->dest;
3422 if (EDGE_COUNT (merge_bb->preds) >= 2)
3423 {
3424 gcc_assert (EDGE_COUNT (merge_bb->preds) >= 2);
3425 new_exit_bb = split_edge (single_exit (loop_to_version));
3426 new_exit_e = single_exit (loop_to_version);
3427 e = EDGE_SUCC (new_exit_bb, 0);
3428
3429 for (gsi = gsi_start_phis (merge_bb); !gsi_end_p (gsi);
3430 gsi_next (&gsi))
3431 {
3432 tree new_res;
3433 orig_phi = gsi.phi ();
3434 new_res = copy_ssa_name (PHI_RESULT (orig_phi));
3435 new_phi = create_phi_node (new_res, new_exit_bb);
3436 arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, e);
3437 add_phi_arg (new_phi, arg, new_exit_e,
3438 gimple_phi_arg_location_from_edge (orig_phi, e));
3439 adjust_phi_and_debug_stmts (orig_phi, e, PHI_RESULT (new_phi));
3440 }
3441 }
3442
3443 update_ssa (TODO_update_ssa);
3444 }
3445
3446 if (version_niter)
3447 {
3448 /* The versioned loop could be infinite, so we need to clear the existing
3449 niter information, which is copied from the original loop. */
3450 gcc_assert (loop_constraint_set_p (loop, LOOP_C_FINITE));
3451 vect_free_loop_info_assumptions (nloop);
3452 /* And set constraint LOOP_C_INFINITE for the niter analyzer. */
3453 loop_constraint_set (loop, LOOP_C_INFINITE);
3454 }
3455
3456 if (LOCATION_LOCUS (vect_location.get_location_t ()) != UNKNOWN_LOCATION
3457 && dump_enabled_p ())
3458 {
3459 if (version_alias)
3460 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS | MSG_PRIORITY_USER_FACING,
3461 vect_location,
3462 "loop versioned for vectorization because of "
3463 "possible aliasing\n");
3464 if (version_align)
3465 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS | MSG_PRIORITY_USER_FACING,
3466 vect_location,
3467 "loop versioned for vectorization to enhance "
3468 "alignment\n");
3469
3470 }
3471
3472 return nloop;
3473 }