1 /* Vectorizer Specific Loop Manipulations
2 Copyright (C) 2003-2018 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "tree.h"
27 #include "gimple.h"
28 #include "cfghooks.h"
29 #include "tree-pass.h"
30 #include "ssa.h"
31 #include "fold-const.h"
32 #include "cfganal.h"
33 #include "gimplify.h"
34 #include "gimple-iterator.h"
35 #include "gimplify-me.h"
36 #include "tree-cfg.h"
37 #include "tree-ssa-loop-manip.h"
38 #include "tree-into-ssa.h"
39 #include "tree-ssa.h"
40 #include "cfgloop.h"
41 #include "tree-scalar-evolution.h"
42 #include "tree-vectorizer.h"
43 #include "tree-ssa-loop-ivopts.h"
44 #include "gimple-fold.h"
45 #include "tree-ssa-loop-niter.h"
46 #include "internal-fn.h"
47 #include "stor-layout.h"
48 #include "optabs-query.h"
49 #include "vec-perm-indices.h"
50
51 /*************************************************************************
52 Simple Loop Peeling Utilities
53
54 Utilities to support loop peeling for vectorization purposes.
55 *************************************************************************/
56
57
58 /* Renames the use *OP_P. */
59
60 static void
61 rename_use_op (use_operand_p op_p)
62 {
63 tree new_name;
64
65 if (TREE_CODE (USE_FROM_PTR (op_p)) != SSA_NAME)
66 return;
67
68 new_name = get_current_def (USE_FROM_PTR (op_p));
69
70 /* Something defined outside of the loop. */
71 if (!new_name)
72 return;
73
74 /* An ordinary ssa name defined in the loop. */
75
76 SET_USE (op_p, new_name);
77 }
78
79
80 /* Renames the variables in basic block BB. Allow renaming of PHI arguments
81 on edges incoming from outer-block header if RENAME_FROM_OUTER_LOOP is
82 true. */
83
84 static void
85 rename_variables_in_bb (basic_block bb, bool rename_from_outer_loop)
86 {
87 gimple *stmt;
88 use_operand_p use_p;
89 ssa_op_iter iter;
90 edge e;
91 edge_iterator ei;
92 struct loop *loop = bb->loop_father;
93 struct loop *outer_loop = NULL;
94
95 if (rename_from_outer_loop)
96 {
97 gcc_assert (loop);
98 outer_loop = loop_outer (loop);
99 }
100
101 for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
102 gsi_next (&gsi))
103 {
104 stmt = gsi_stmt (gsi);
105 FOR_EACH_SSA_USE_OPERAND (use_p, stmt, iter, SSA_OP_ALL_USES)
106 rename_use_op (use_p);
107 }
108
109 FOR_EACH_EDGE (e, ei, bb->preds)
110 {
111 if (!flow_bb_inside_loop_p (loop, e->src))
112 {
113 if (!rename_from_outer_loop)
114 continue;
115 if (e->src != outer_loop->header)
116 {
117 if (outer_loop->inner->next)
118 {
119 /* If outer_loop has 2 inner loops, allow there to
120 be an extra basic block which decides which of the
121 two loops to use, based on LOOP_VECTORIZED. */
122 if (!single_pred_p (e->src)
123 || single_pred (e->src) != outer_loop->header)
124 continue;
125 }
126 }
127 }
128 for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
129 gsi_next (&gsi))
130 rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (gsi.phi (), e));
131 }
132 }
133
134
135 struct adjust_info
136 {
137 tree from, to;
138 basic_block bb;
139 };
140
141 /* A stack of values to be adjusted in debug stmts. We have to
142 process them LIFO, so that the closest substitution applies. If we
143 processed them FIFO, without the stack, we might substitute uses
144 with a PHI DEF that would soon become non-dominant, and when we got
145 to the suitable one, it wouldn't have anything to substitute any
146 more. */
147 static vec<adjust_info, va_heap> adjust_vec;
148
149 /* Adjust any debug stmts that referenced AI->from values to use the
150 loop-closed AI->to, if the references are dominated by AI->bb and
151 not by the definition of AI->from. */
152
153 static void
154 adjust_debug_stmts_now (adjust_info *ai)
155 {
156 basic_block bbphi = ai->bb;
157 tree orig_def = ai->from;
158 tree new_def = ai->to;
159 imm_use_iterator imm_iter;
160 gimple *stmt;
161 basic_block bbdef = gimple_bb (SSA_NAME_DEF_STMT (orig_def));
162
163 gcc_assert (dom_info_available_p (CDI_DOMINATORS));
164
165 /* Adjust any debug stmts that held onto non-loop-closed
166 references. */
167 FOR_EACH_IMM_USE_STMT (stmt, imm_iter, orig_def)
168 {
169 use_operand_p use_p;
170 basic_block bbuse;
171
172 if (!is_gimple_debug (stmt))
173 continue;
174
175 gcc_assert (gimple_debug_bind_p (stmt));
176
177 bbuse = gimple_bb (stmt);
178
179 if ((bbuse == bbphi
180 || dominated_by_p (CDI_DOMINATORS, bbuse, bbphi))
181 && !(bbuse == bbdef
182 || dominated_by_p (CDI_DOMINATORS, bbuse, bbdef)))
183 {
184 if (new_def)
185 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
186 SET_USE (use_p, new_def);
187 else
188 {
189 gimple_debug_bind_reset_value (stmt);
190 update_stmt (stmt);
191 }
192 }
193 }
194 }
195
196 /* Adjust debug stmts as scheduled before. */
197
198 static void
199 adjust_vec_debug_stmts (void)
200 {
201 if (!MAY_HAVE_DEBUG_BIND_STMTS)
202 return;
203
204 gcc_assert (adjust_vec.exists ());
205
206 while (!adjust_vec.is_empty ())
207 {
208 adjust_debug_stmts_now (&adjust_vec.last ());
209 adjust_vec.pop ();
210 }
211 }
212
213 /* Adjust any debug stmts that referenced FROM values to use the
214 loop-closed TO, if the references are dominated by BB and not by
215 the definition of FROM. If adjust_vec is non-NULL, adjustments
216 will be postponed until adjust_vec_debug_stmts is called. */
217
218 static void
219 adjust_debug_stmts (tree from, tree to, basic_block bb)
220 {
221 adjust_info ai;
222
223 if (MAY_HAVE_DEBUG_BIND_STMTS
224 && TREE_CODE (from) == SSA_NAME
225 && ! SSA_NAME_IS_DEFAULT_DEF (from)
226 && ! virtual_operand_p (from))
227 {
228 ai.from = from;
229 ai.to = to;
230 ai.bb = bb;
231
232 if (adjust_vec.exists ())
233 adjust_vec.safe_push (ai);
234 else
235 adjust_debug_stmts_now (&ai);
236 }
237 }
238
239 /* Change E's phi arg in UPDATE_PHI to NEW_DEF, and record information
240 to adjust any debug stmts that referenced the old phi arg,
241 presumably non-loop-closed references left over from other
242 transformations. */
243
244 static void
245 adjust_phi_and_debug_stmts (gimple *update_phi, edge e, tree new_def)
246 {
247 tree orig_def = PHI_ARG_DEF_FROM_EDGE (update_phi, e);
248
249 SET_PHI_ARG_DEF (update_phi, e->dest_idx, new_def);
250
251 if (MAY_HAVE_DEBUG_BIND_STMTS)
252 adjust_debug_stmts (orig_def, PHI_RESULT (update_phi),
253 gimple_bb (update_phi));
254 }
255
256 /* Define one loop mask MASK from loop LOOP. INIT_MASK is the value that
257 the mask should have during the first iteration and NEXT_MASK is the
258 value that it should have on subsequent iterations. */
259
260 static void
261 vect_set_loop_mask (struct loop *loop, tree mask, tree init_mask,
262 tree next_mask)
263 {
264 gphi *phi = create_phi_node (mask, loop->header);
265 add_phi_arg (phi, init_mask, loop_preheader_edge (loop), UNKNOWN_LOCATION);
266 add_phi_arg (phi, next_mask, loop_latch_edge (loop), UNKNOWN_LOCATION);
267 }
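/* As an illustration (SSA names purely hypothetical), for a mask type such
   as vector(8) <signed-boolean:1> the PHI created above has the form:

       loop_mask_5 = PHI <init_mask_3(preheader), next_mask_8(latch)>

   so the first vector iteration uses INIT_MASK and every subsequent
   iteration uses the NEXT_MASK computed at the end of the previous one.  */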
268
269 /* Add SEQ to the end of LOOP's preheader block. */
270
271 static void
272 add_preheader_seq (struct loop *loop, gimple_seq seq)
273 {
274 if (seq)
275 {
276 edge pe = loop_preheader_edge (loop);
277 basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
278 gcc_assert (!new_bb);
279 }
280 }
281
282 /* Add SEQ to the beginning of LOOP's header block. */
283
284 static void
285 add_header_seq (struct loop *loop, gimple_seq seq)
286 {
287 if (seq)
288 {
289 gimple_stmt_iterator gsi = gsi_after_labels (loop->header);
290 gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
291 }
292 }
293
294 /* Return true if the target can interleave elements of two vectors.
295 OFFSET is 0 if the first half of the vectors should be interleaved
296 or 1 if the second half should. When returning true, store the
297 associated permutation in INDICES. */
298
299 static bool
300 interleave_supported_p (vec_perm_indices *indices, tree vectype,
301 unsigned int offset)
302 {
303 poly_uint64 nelts = TYPE_VECTOR_SUBPARTS (vectype);
304 poly_uint64 base = exact_div (nelts, 2) * offset;
305 vec_perm_builder sel (nelts, 2, 3);
306 for (unsigned int i = 0; i < 3; ++i)
307 {
308 sel.quick_push (base + i);
309 sel.quick_push (base + i + nelts);
310 }
311 indices->new_vector (sel, 2, nelts);
312 return can_vec_perm_const_p (TYPE_MODE (vectype), *indices);
313 }
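/* For example, assuming an 8-element vector type and OFFSET 0, the stepped
   selector built above expands to { 0, 8, 1, 9, 2, 10, 3, 11 }, which
   interleaves the low halves of the two inputs; with OFFSET 1 it expands
   to { 4, 12, 5, 13, 6, 14, 7, 15 }, interleaving the high halves.  */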
314
315 /* Try to use permutes to define the masks in DEST_RGM using the masks
316 in SRC_RGM, given that the former has twice as many masks as the
317 latter. Return true on success, adding any new statements to SEQ. */
318
319 static bool
320 vect_maybe_permute_loop_masks (gimple_seq *seq, rgroup_masks *dest_rgm,
321 rgroup_masks *src_rgm)
322 {
323 tree src_masktype = src_rgm->mask_type;
324 tree dest_masktype = dest_rgm->mask_type;
325 machine_mode src_mode = TYPE_MODE (src_masktype);
326 if (dest_rgm->max_nscalars_per_iter <= src_rgm->max_nscalars_per_iter
327 && optab_handler (vec_unpacku_hi_optab, src_mode) != CODE_FOR_nothing
328 && optab_handler (vec_unpacku_lo_optab, src_mode) != CODE_FOR_nothing)
329 {
330 /* Unpacking the source masks gives at least as many mask bits as
331 we need. We can then VIEW_CONVERT any excess bits away. */
332 tree unpack_masktype = vect_halve_mask_nunits (src_masktype);
333 for (unsigned int i = 0; i < dest_rgm->masks.length (); ++i)
334 {
335 tree src = src_rgm->masks[i / 2];
336 tree dest = dest_rgm->masks[i];
337 tree_code code = (i & 1 ? VEC_UNPACK_HI_EXPR
338 : VEC_UNPACK_LO_EXPR);
339 gassign *stmt;
340 if (dest_masktype == unpack_masktype)
341 stmt = gimple_build_assign (dest, code, src);
342 else
343 {
344 tree temp = make_ssa_name (unpack_masktype);
345 stmt = gimple_build_assign (temp, code, src);
346 gimple_seq_add_stmt (seq, stmt);
347 stmt = gimple_build_assign (dest, VIEW_CONVERT_EXPR,
348 build1 (VIEW_CONVERT_EXPR,
349 dest_masktype, temp));
350 }
351 gimple_seq_add_stmt (seq, stmt);
352 }
353 return true;
354 }
355 vec_perm_indices indices[2];
356 if (dest_masktype == src_masktype
357 && interleave_supported_p (&indices[0], src_masktype, 0)
358 && interleave_supported_p (&indices[1], src_masktype, 1))
359 {
360 /* The destination requires twice as many mask bits as the source, so
361 we can use interleaving permutes to double up the number of bits. */
362 tree masks[2];
363 for (unsigned int i = 0; i < 2; ++i)
364 masks[i] = vect_gen_perm_mask_checked (src_masktype, indices[i]);
365 for (unsigned int i = 0; i < dest_rgm->masks.length (); ++i)
366 {
367 tree src = src_rgm->masks[i / 2];
368 tree dest = dest_rgm->masks[i];
369 gimple *stmt = gimple_build_assign (dest, VEC_PERM_EXPR,
370 src, src, masks[i & 1]);
371 gimple_seq_add_stmt (seq, stmt);
372 }
373 return true;
374 }
375 return false;
376 }
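/* A small sketch of the two strategies above, assuming DEST_RGM has masks
   { d0, d1 } and SRC_RGM has the single mask { s0 }:

     - with the unpack optabs:
         d0 = VEC_UNPACK_LO_EXPR <s0>;
         d1 = VEC_UNPACK_HI_EXPR <s0>;
       (going through a temporary and a VIEW_CONVERT_EXPR if the unpacked
       type is not DEST_RGM's mask type), or

     - with interleaving permutes:
         d0 = VEC_PERM_EXPR <s0, s0, interleave-lo-selector>;
         d1 = VEC_PERM_EXPR <s0, s0, interleave-hi-selector>;

   Either way each destination mask I is derived from source mask I / 2.  */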
377
378 /* Helper for vect_set_loop_condition_masked. Generate definitions for
379 all the masks in RGM and return a mask that is nonzero when the loop
380 needs to iterate. Add any new preheader statements to PREHEADER_SEQ.
381 Use LOOP_COND_GSI to insert code before the exit gcond.
382
383 RGM belongs to loop LOOP. The loop originally iterated NITERS
384 times and has been vectorized according to LOOP_VINFO. Each iteration
385 of the vectorized loop handles VF iterations of the scalar loop.
386
387 It is known that:
388
389 NITERS * RGM->max_nscalars_per_iter
390
391 does not overflow. However, MIGHT_WRAP_P says whether an induction
392 variable that starts at 0 and has step:
393
394 VF * RGM->max_nscalars_per_iter
395
396 might overflow before hitting a value above:
397
398 NITERS * RGM->max_nscalars_per_iter
399
400 This means that we cannot guarantee that such an induction variable
401 would ever hit a value that produces a set of all-false masks for RGM. */
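/* For example, suppose the comparison type has 16 bits,
   VF * RGM->max_nscalars_per_iter is 16 and
   NITERS * RGM->max_nscalars_per_iter can be as high as 65530.  A 16-bit IV
   that starts at 0 and steps by 16 takes the values 0, 16, ..., 65520 and
   then wraps back to 0, so it never reaches a value in [65521, 65535]; an
   exit test against 65530 would therefore never trigger.  This is the
   situation MIGHT_WRAP_P describes, and it is handled below by lowering the
   limit by one step and testing the IV before incrementing it.  */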
402
403 static tree
404 vect_set_loop_masks_directly (struct loop *loop, loop_vec_info loop_vinfo,
405 gimple_seq *preheader_seq,
406 gimple_stmt_iterator loop_cond_gsi,
407 rgroup_masks *rgm, tree vf,
408 tree niters, bool might_wrap_p)
409 {
410 tree compare_type = LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo);
411 tree mask_type = rgm->mask_type;
412 unsigned int nscalars_per_iter = rgm->max_nscalars_per_iter;
413 poly_uint64 nscalars_per_mask = TYPE_VECTOR_SUBPARTS (mask_type);
414
415 /* Calculate the maximum number of scalar values that the rgroup
416 handles in total and the number that it handles for each iteration
417 of the vector loop. */
418 tree nscalars_total = niters;
419 tree nscalars_step = vf;
420 if (nscalars_per_iter != 1)
421 {
422 /* We checked before choosing to use a fully-masked loop that these
423 multiplications don't overflow. */
424 tree factor = build_int_cst (compare_type, nscalars_per_iter);
425 nscalars_total = gimple_build (preheader_seq, MULT_EXPR, compare_type,
426 nscalars_total, factor);
427 nscalars_step = gimple_build (preheader_seq, MULT_EXPR, compare_type,
428 nscalars_step, factor);
429 }
430
431 /* Create an induction variable that counts the number of scalars
432 processed. */
433 tree index_before_incr, index_after_incr;
434 gimple_stmt_iterator incr_gsi;
435 bool insert_after;
436 tree zero_index = build_int_cst (compare_type, 0);
437 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
438 create_iv (zero_index, nscalars_step, NULL_TREE, loop, &incr_gsi,
439 insert_after, &index_before_incr, &index_after_incr);
440
441 tree test_index, test_limit;
442 gimple_stmt_iterator *test_gsi;
443 if (might_wrap_p)
444 {
445 /* In principle the loop should stop iterating once the incremented
446 IV reaches a value greater than or equal to NSCALARS_TOTAL.
447 However, there's no guarantee that the IV hits a value above
448 this value before wrapping around. We therefore adjust the
449 limit down by one IV step:
450
451 NSCALARS_TOTAL -[infinite-prec] NSCALARS_STEP
452
453 and compare the IV against this limit _before_ incrementing it.
454 Since the comparison type is unsigned, we actually want the
455 subtraction to saturate at zero:
456
457 NSCALARS_TOTAL -[sat] NSCALARS_STEP. */
458 test_index = index_before_incr;
459 test_limit = gimple_build (preheader_seq, MAX_EXPR, compare_type,
460 nscalars_total, nscalars_step);
461 test_limit = gimple_build (preheader_seq, MINUS_EXPR, compare_type,
462 test_limit, nscalars_step);
463 test_gsi = &incr_gsi;
464 }
465 else
466 {
467 /* Test the incremented IV, which will always hit a value above
468 the bound before wrapping. */
469 test_index = index_after_incr;
470 test_limit = nscalars_total;
471 test_gsi = &loop_cond_gsi;
472 }
473
474 /* Provide a definition of each mask in the group. */
475 tree next_mask = NULL_TREE;
476 tree mask;
477 unsigned int i;
478 FOR_EACH_VEC_ELT_REVERSE (rgm->masks, i, mask)
479 {
480 /* Previous masks will cover BIAS scalars. This mask covers the
481 next batch. */
482 poly_uint64 bias = nscalars_per_mask * i;
483 tree bias_tree = build_int_cst (compare_type, bias);
484 gimple *tmp_stmt;
485
486 /* See whether the first iteration of the vector loop is known
487 to have a full mask. */
488 poly_uint64 const_limit;
489 bool first_iteration_full
490 = (poly_int_tree_p (nscalars_total, &const_limit)
491 && known_ge (const_limit, (i + 1) * nscalars_per_mask));
492
493 /* Rather than have a new IV that starts at BIAS and goes up to
494 TEST_LIMIT, prefer to use the same 0-based IV for each mask
495 and adjust the bound down by BIAS. */
496 tree this_test_limit = test_limit;
497 if (i != 0)
498 {
499 this_test_limit = gimple_build (preheader_seq, MAX_EXPR,
500 compare_type, this_test_limit,
501 bias_tree);
502 this_test_limit = gimple_build (preheader_seq, MINUS_EXPR,
503 compare_type, this_test_limit,
504 bias_tree);
505 }
506
507 /* Create the initial mask. */
508 tree init_mask = NULL_TREE;
509 if (!first_iteration_full)
510 {
511 tree start, end;
512 if (nscalars_total == test_limit)
513 {
514 /* Use a natural test between zero (the initial IV value)
515 and the loop limit. The "else" block would be valid too,
516 but this choice can avoid the need to load BIAS_TREE into
517 a register. */
518 start = zero_index;
519 end = this_test_limit;
520 }
521 else
522 {
523 start = bias_tree;
524 end = nscalars_total;
525 }
526
527 init_mask = make_temp_ssa_name (mask_type, NULL, "max_mask");
528 tmp_stmt = vect_gen_while (init_mask, start, end);
529 gimple_seq_add_stmt (preheader_seq, tmp_stmt);
530 }
531
532 if (!init_mask)
533 /* First iteration is full. */
534 init_mask = build_minus_one_cst (mask_type);
535
536 /* Get the mask value for the next iteration of the loop. */
537 next_mask = make_temp_ssa_name (mask_type, NULL, "next_mask");
538 gcall *call = vect_gen_while (next_mask, test_index, this_test_limit);
539 gsi_insert_before (test_gsi, call, GSI_SAME_STMT);
540
541 vect_set_loop_mask (loop, mask, init_mask, next_mask);
542 }
543 return next_mask;
544 }
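/* A gimple sketch of the code built above for a single-mask rgroup when
   MIGHT_WRAP_P is false (SSA names purely illustrative):

     preheader:
       max_mask_6 = .WHILE_ULT (0, nscalars_total_4);   <-- INIT_MASK, unless
                                                            the first iteration
                                                            is known to be full
     loop:
       loop_mask_7 = PHI <max_mask_6(preheader), next_mask_9(latch)>;
       ivtmp_1 = PHI <0(preheader), ivtmp_2(latch)>;
       ...
       ivtmp_2 = ivtmp_1 + nscalars_step_5;
       next_mask_9 = .WHILE_ULT (ivtmp_2, nscalars_total_4);

   The caller then compares the returned mask (NEXT_MASK) against an
   all-false mask to decide whether to branch back to the loop header.  */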
545
546 /* Make LOOP iterate NITERS times using masking and WHILE_ULT calls.
547 LOOP_VINFO describes the vectorization of LOOP. NITERS is the
548 number of iterations of the original scalar loop. NITERS_MAYBE_ZERO
549 and FINAL_IV are as for vect_set_loop_condition.
550
551 Insert the branch-back condition before LOOP_COND_GSI and return the
552 final gcond. */
553
554 static gcond *
555 vect_set_loop_condition_masked (struct loop *loop, loop_vec_info loop_vinfo,
556 tree niters, tree final_iv,
557 bool niters_maybe_zero,
558 gimple_stmt_iterator loop_cond_gsi)
559 {
560 gimple_seq preheader_seq = NULL;
561 gimple_seq header_seq = NULL;
562
563 tree compare_type = LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo);
564 unsigned int compare_precision = TYPE_PRECISION (compare_type);
565 unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);
566 tree orig_niters = niters;
567
568 /* Type of the initial value of NITERS. */
569 tree ni_actual_type = TREE_TYPE (niters);
570 unsigned int ni_actual_precision = TYPE_PRECISION (ni_actual_type);
571
572 /* Convert NITERS to the same size as the compare. */
573 if (compare_precision > ni_actual_precision
574 && niters_maybe_zero)
575 {
576 /* We know that there is always at least one iteration, so if the
577 count is zero then it must have wrapped. Cope with this by
578 subtracting 1 before the conversion and adding 1 to the result. */
579 gcc_assert (TYPE_UNSIGNED (ni_actual_type));
580 niters = gimple_build (&preheader_seq, PLUS_EXPR, ni_actual_type,
581 niters, build_minus_one_cst (ni_actual_type));
582 niters = gimple_convert (&preheader_seq, compare_type, niters);
583 niters = gimple_build (&preheader_seq, PLUS_EXPR, compare_type,
584 niters, build_one_cst (compare_type));
585 }
586 else
587 niters = gimple_convert (&preheader_seq, compare_type, niters);
588
589 /* Now calculate the value that the induction variable must be able
590 to hit in order to ensure that we end the loop with an all-false mask.
591 This involves adding the maximum number of inactive trailing scalar
592 iterations. */
593 widest_int iv_limit;
594 bool known_max_iters = max_loop_iterations (loop, &iv_limit);
595 if (known_max_iters)
596 {
597 /* IV_LIMIT is the maximum number of latch iterations, which is also
598 the maximum in-range IV value. Round this value down to the previous
599 vector alignment boundary and then add an extra full iteration. */
600 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
601 iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
602 }
603
604 /* Get the vectorization factor in tree form. */
605 tree vf = build_int_cst (compare_type,
606 LOOP_VINFO_VECT_FACTOR (loop_vinfo));
607
608 /* Iterate over all the rgroups and fill in their masks. We could use
609 the first mask from any rgroup for the loop condition; here we
610 arbitrarily pick the last. */
611 tree test_mask = NULL_TREE;
612 rgroup_masks *rgm;
613 unsigned int i;
614 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
615 FOR_EACH_VEC_ELT (*masks, i, rgm)
616 if (!rgm->masks.is_empty ())
617 {
618 /* First try using permutes. This adds a single vector
619 instruction to the loop for each mask, but needs no extra
620 loop invariants or IVs. */
621 unsigned int nmasks = i + 1;
622 if ((nmasks & 1) == 0)
623 {
624 rgroup_masks *half_rgm = &(*masks)[nmasks / 2 - 1];
625 if (!half_rgm->masks.is_empty ()
626 && vect_maybe_permute_loop_masks (&header_seq, rgm, half_rgm))
627 continue;
628 }
629
630 /* See whether zero-based IV would ever generate all-false masks
631 before wrapping around. */
632 bool might_wrap_p
633 = (!known_max_iters
634 || (wi::min_precision (iv_limit * rgm->max_nscalars_per_iter,
635 UNSIGNED)
636 > compare_precision));
637
638 /* Set up all masks for this group. */
639 test_mask = vect_set_loop_masks_directly (loop, loop_vinfo,
640 &preheader_seq,
641 loop_cond_gsi, rgm, vf,
642 niters, might_wrap_p);
643 }
644
645 /* Emit all accumulated statements. */
646 add_preheader_seq (loop, preheader_seq);
647 add_header_seq (loop, header_seq);
648
649 /* Get a boolean result that tells us whether to iterate. */
650 edge exit_edge = single_exit (loop);
651 tree_code code = (exit_edge->flags & EDGE_TRUE_VALUE) ? EQ_EXPR : NE_EXPR;
652 tree zero_mask = build_zero_cst (TREE_TYPE (test_mask));
653 gcond *cond_stmt = gimple_build_cond (code, test_mask, zero_mask,
654 NULL_TREE, NULL_TREE);
655 gsi_insert_before (&loop_cond_gsi, cond_stmt, GSI_SAME_STMT);
656
657 /* The loop iterates (NITERS - 1) / VF + 1 times.
658 Subtract one from this to get the latch count. */
659 tree step = build_int_cst (compare_type,
660 LOOP_VINFO_VECT_FACTOR (loop_vinfo));
661 tree niters_minus_one = fold_build2 (PLUS_EXPR, compare_type, niters,
662 build_minus_one_cst (compare_type));
663 loop->nb_iterations = fold_build2 (TRUNC_DIV_EXPR, compare_type,
664 niters_minus_one, step);
665
666 if (final_iv)
667 {
668 gassign *assign = gimple_build_assign (final_iv, orig_niters);
669 gsi_insert_on_edge_immediate (single_exit (loop), assign);
670 }
671
672 return cond_stmt;
673 }
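/* Worked example of the latch count set above: with NITERS = 10 and a
   vectorization factor of 4, the masked loop runs (10 - 1) / 4 + 1 = 3
   vector iterations and loop->nb_iterations is (10 - 1) / 4 = 2 latch
   iterations.  */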
674
675 /* Like vect_set_loop_condition, but handle the case in which there
676 are no loop masks. */
677
678 static gcond *
679 vect_set_loop_condition_unmasked (struct loop *loop, tree niters,
680 tree step, tree final_iv,
681 bool niters_maybe_zero,
682 gimple_stmt_iterator loop_cond_gsi)
683 {
684 tree indx_before_incr, indx_after_incr;
685 gcond *cond_stmt;
686 gcond *orig_cond;
687 edge pe = loop_preheader_edge (loop);
688 edge exit_edge = single_exit (loop);
689 gimple_stmt_iterator incr_gsi;
690 bool insert_after;
691 enum tree_code code;
692 tree niters_type = TREE_TYPE (niters);
693
694 orig_cond = get_loop_exit_condition (loop);
695 gcc_assert (orig_cond);
696 loop_cond_gsi = gsi_for_stmt (orig_cond);
697
698 tree init, limit;
699 if (!niters_maybe_zero && integer_onep (step))
700 {
701 /* In this case we can use a simple 0-based IV:
702
703 A:
704 x = 0;
705 do
706 {
707 ...
708 x += 1;
709 }
710 while (x < NITERS); */
711 code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GE_EXPR : LT_EXPR;
712 init = build_zero_cst (niters_type);
713 limit = niters;
714 }
715 else
716 {
717 /* The following works for all values of NITERS except 0:
718
719 B:
720 x = 0;
721 do
722 {
723 ...
724 x += STEP;
725 }
726 while (x <= NITERS - STEP);
727
728 so that the loop continues to iterate if x + STEP - 1 < NITERS
729 but stops if x + STEP - 1 >= NITERS.
730
731 However, if NITERS is zero, x never hits a value above NITERS - STEP
732 before wrapping around. There are two obvious ways of dealing with
733 this:
734
735 - start at STEP - 1 and compare x before incrementing it
736 - start at -1 and compare x after incrementing it
737
738 The latter is simpler and is what we use. The loop in this case
739 looks like:
740
741 C:
742 x = -1;
743 do
744 {
745 ...
746 x += STEP;
747 }
748 while (x < NITERS - STEP);
749
750 In both cases the loop limit is NITERS - STEP. */
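      /* As a concrete illustration of case C: assume an 8-bit unsigned
         NITERS that has wrapped to 0 (i.e. 256 scalar iterations) and
         STEP = 4.  The limit is NITERS - STEP = 252, x starts at -1 (255),
         and after each increment x takes the values 3, 7, ..., 251, 255,
         giving 256 / 4 = 64 iterations before "x < 252" first fails.  */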
751 gimple_seq seq = NULL;
752 limit = force_gimple_operand (niters, &seq, true, NULL_TREE);
753 limit = gimple_build (&seq, MINUS_EXPR, TREE_TYPE (limit), limit, step);
754 if (seq)
755 {
756 basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
757 gcc_assert (!new_bb);
758 }
759 if (niters_maybe_zero)
760 {
761 /* Case C. */
762 code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GE_EXPR : LT_EXPR;
763 init = build_all_ones_cst (niters_type);
764 }
765 else
766 {
767 /* Case B. */
768 code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GT_EXPR : LE_EXPR;
769 init = build_zero_cst (niters_type);
770 }
771 }
772
773 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
774 create_iv (init, step, NULL_TREE, loop,
775 &incr_gsi, insert_after, &indx_before_incr, &indx_after_incr);
776 indx_after_incr = force_gimple_operand_gsi (&loop_cond_gsi, indx_after_incr,
777 true, NULL_TREE, true,
778 GSI_SAME_STMT);
779 limit = force_gimple_operand_gsi (&loop_cond_gsi, limit, true, NULL_TREE,
780 true, GSI_SAME_STMT);
781
782 cond_stmt = gimple_build_cond (code, indx_after_incr, limit, NULL_TREE,
783 NULL_TREE);
784
785 gsi_insert_before (&loop_cond_gsi, cond_stmt, GSI_SAME_STMT);
786
787 /* Record the number of latch iterations. */
788 if (limit == niters)
789 /* Case A: the loop iterates NITERS times. Subtract one to get the
790 latch count. */
791 loop->nb_iterations = fold_build2 (MINUS_EXPR, niters_type, niters,
792 build_int_cst (niters_type, 1));
793 else
794 /* Case B or C: the loop iterates (NITERS - STEP) / STEP + 1 times.
795 Subtract one from this to get the latch count. */
796 loop->nb_iterations = fold_build2 (TRUNC_DIV_EXPR, niters_type,
797 limit, step);
798
799 if (final_iv)
800 {
801 gassign *assign = gimple_build_assign (final_iv, MINUS_EXPR,
802 indx_after_incr, init);
803 gsi_insert_on_edge_immediate (single_exit (loop), assign);
804 }
805
806 return cond_stmt;
807 }
808
809 /* If we're using fully-masked loops, make LOOP iterate:
810
811 N == (NITERS - 1) / STEP + 1
812
813 times. When NITERS is zero, this is equivalent to making the loop
814 execute (1 << M) / STEP times, where M is the precision of NITERS.
815 NITERS_MAYBE_ZERO is true if this last case might occur.
816
817 If we're not using fully-masked loops, make LOOP iterate:
818
819 N == (NITERS - STEP) / STEP + 1
820
821 times, where NITERS is known to be outside the range [1, STEP - 1].
822 This is equivalent to making the loop execute NITERS / STEP times
823 when NITERS is nonzero and (1 << M) / STEP times otherwise.
824 NITERS_MAYBE_ZERO again indicates whether this last case might occur.
825
826 If FINAL_IV is nonnull, it is an SSA name that should be set to
827 N * STEP on exit from the loop.
828
829 Assumption: the exit-condition of LOOP is the last stmt in the loop. */
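/* For example, with NITERS = 10 and STEP = 4: a fully-masked loop iterates
   (10 - 1) / 4 + 1 = 3 times, handling the 2-iteration tail with a partial
   mask, whereas a loop that is not fully masked iterates 10 / 4 = 2 times
   and leaves the remaining scalar iterations to a peeled epilogue.  */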
830
831 void
832 vect_set_loop_condition (struct loop *loop, loop_vec_info loop_vinfo,
833 tree niters, tree step, tree final_iv,
834 bool niters_maybe_zero)
835 {
836 gcond *cond_stmt;
837 gcond *orig_cond = get_loop_exit_condition (loop);
838 gimple_stmt_iterator loop_cond_gsi = gsi_for_stmt (orig_cond);
839
840 if (loop_vinfo && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
841 cond_stmt = vect_set_loop_condition_masked (loop, loop_vinfo, niters,
842 final_iv, niters_maybe_zero,
843 loop_cond_gsi);
844 else
845 cond_stmt = vect_set_loop_condition_unmasked (loop, niters, step,
846 final_iv, niters_maybe_zero,
847 loop_cond_gsi);
848
849 /* Remove old loop exit test. */
850 gsi_remove (&loop_cond_gsi, true);
851 free_stmt_vec_info (orig_cond);
852
853 if (dump_enabled_p ())
854 {
855 dump_printf_loc (MSG_NOTE, vect_location, "New loop exit condition: ");
856 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, cond_stmt, 0);
857 }
858 }
859
860 /* Helper routine of slpeel_tree_duplicate_loop_to_edge_cfg.
861 For all PHI arguments in FROM->dest and TO->dest coming from those
862 edges, ensure that each TO->dest PHI argument has its current_def
863 set to the current_def of the corresponding FROM->dest argument. */
864
865 static void
866 slpeel_duplicate_current_defs_from_edges (edge from, edge to)
867 {
868 gimple_stmt_iterator gsi_from, gsi_to;
869
870 for (gsi_from = gsi_start_phis (from->dest),
871 gsi_to = gsi_start_phis (to->dest);
872 !gsi_end_p (gsi_from) && !gsi_end_p (gsi_to);)
873 {
874 gimple *from_phi = gsi_stmt (gsi_from);
875 gimple *to_phi = gsi_stmt (gsi_to);
876 tree from_arg = PHI_ARG_DEF_FROM_EDGE (from_phi, from);
877 tree to_arg = PHI_ARG_DEF_FROM_EDGE (to_phi, to);
878 if (virtual_operand_p (from_arg))
879 {
880 gsi_next (&gsi_from);
881 continue;
882 }
883 if (virtual_operand_p (to_arg))
884 {
885 gsi_next (&gsi_to);
886 continue;
887 }
888 if (TREE_CODE (from_arg) != SSA_NAME)
889 gcc_assert (operand_equal_p (from_arg, to_arg, 0));
890 else
891 {
892 if (get_current_def (to_arg) == NULL_TREE)
893 set_current_def (to_arg, get_current_def (from_arg));
894 }
895 gsi_next (&gsi_from);
896 gsi_next (&gsi_to);
897 }
898
899 gphi *from_phi = get_virtual_phi (from->dest);
900 gphi *to_phi = get_virtual_phi (to->dest);
901 if (from_phi)
902 set_current_def (PHI_ARG_DEF_FROM_EDGE (to_phi, to),
903 get_current_def (PHI_ARG_DEF_FROM_EDGE (from_phi, from)));
904 }
905
906
907 /* Given LOOP this function generates a new copy of it and puts it
908 on E which is either the entry or exit of LOOP. If SCALAR_LOOP is
909 non-NULL, assume LOOP and SCALAR_LOOP are equivalent and copy the
910 basic blocks from SCALAR_LOOP instead of LOOP, but to either the
911 entry or exit of LOOP. */
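/* A sketch of the two supported cases: peeling an epilogue passes
   E = single_exit (LOOP), giving

     preheader -> LOOP -> new preheader -> copy of LOOP -> old exit block

   while peeling a prologue passes E = loop_preheader_edge (LOOP), giving

     preheader -> copy of LOOP -> new preheader -> LOOP -> exit block.  */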
912
913 struct loop *
914 slpeel_tree_duplicate_loop_to_edge_cfg (struct loop *loop,
915 struct loop *scalar_loop, edge e)
916 {
917 struct loop *new_loop;
918 basic_block *new_bbs, *bbs, *pbbs;
919 bool at_exit;
920 bool was_imm_dom;
921 basic_block exit_dest;
922 edge exit, new_exit;
923 bool duplicate_outer_loop = false;
924
925 exit = single_exit (loop);
926 at_exit = (e == exit);
927 if (!at_exit && e != loop_preheader_edge (loop))
928 return NULL;
929
930 if (scalar_loop == NULL)
931 scalar_loop = loop;
932
933 bbs = XNEWVEC (basic_block, scalar_loop->num_nodes + 1);
934 pbbs = bbs + 1;
935 get_loop_body_with_size (scalar_loop, pbbs, scalar_loop->num_nodes);
936 /* Allow duplication of outer loops. */
937 if (scalar_loop->inner)
938 duplicate_outer_loop = true;
939 /* Check whether duplication is possible. */
940 if (!can_copy_bbs_p (pbbs, scalar_loop->num_nodes))
941 {
942 free (bbs);
943 return NULL;
944 }
945
946 /* Generate new loop structure. */
947 new_loop = duplicate_loop (scalar_loop, loop_outer (scalar_loop));
948 duplicate_subloops (scalar_loop, new_loop);
949
950 exit_dest = exit->dest;
951 was_imm_dom = (get_immediate_dominator (CDI_DOMINATORS,
952 exit_dest) == loop->header ?
953 true : false);
954
955 /* Also copy the pre-header, this avoids jumping through hoops to
956 duplicate the loop entry PHI arguments. Create an empty
957 pre-header unconditionally for this. */
958 basic_block preheader = split_edge (loop_preheader_edge (scalar_loop));
959 edge entry_e = single_pred_edge (preheader);
960 bbs[0] = preheader;
961 new_bbs = XNEWVEC (basic_block, scalar_loop->num_nodes + 1);
962
963 exit = single_exit (scalar_loop);
964 copy_bbs (bbs, scalar_loop->num_nodes + 1, new_bbs,
965 &exit, 1, &new_exit, NULL,
966 at_exit ? loop->latch : e->src, true);
967 exit = single_exit (loop);
968 basic_block new_preheader = new_bbs[0];
969
970 add_phi_args_after_copy (new_bbs, scalar_loop->num_nodes + 1, NULL);
971
972 if (scalar_loop != loop)
973 {
974 /* If we copied from SCALAR_LOOP rather than LOOP, SSA_NAMEs from
975 SCALAR_LOOP will have current_def set to SSA_NAMEs in the new_loop,
976 but LOOP will not. slpeel_update_phi_nodes_for_guard{1,2} expects
977 the LOOP SSA_NAMEs (on the exit edge and edge from latch to
978 header) to have current_def set, so copy them over. */
979 slpeel_duplicate_current_defs_from_edges (single_exit (scalar_loop),
980 exit);
981 slpeel_duplicate_current_defs_from_edges (EDGE_SUCC (scalar_loop->latch,
982 0),
983 EDGE_SUCC (loop->latch, 0));
984 }
985
986 if (at_exit) /* Add the loop copy at exit. */
987 {
988 if (scalar_loop != loop)
989 {
990 gphi_iterator gsi;
991 new_exit = redirect_edge_and_branch (new_exit, exit_dest);
992
993 for (gsi = gsi_start_phis (exit_dest); !gsi_end_p (gsi);
994 gsi_next (&gsi))
995 {
996 gphi *phi = gsi.phi ();
997 tree orig_arg = PHI_ARG_DEF_FROM_EDGE (phi, e);
998 location_t orig_locus
999 = gimple_phi_arg_location_from_edge (phi, e);
1000
1001 add_phi_arg (phi, orig_arg, new_exit, orig_locus);
1002 }
1003 }
1004 redirect_edge_and_branch_force (e, new_preheader);
1005 flush_pending_stmts (e);
1006 set_immediate_dominator (CDI_DOMINATORS, new_preheader, e->src);
1007 if (was_imm_dom || duplicate_outer_loop)
1008 set_immediate_dominator (CDI_DOMINATORS, exit_dest, new_exit->src);
1009
1010 /* And remove the non-necessary forwarder again. Keep the other
1011 one so we have a proper pre-header for the loop at the exit edge. */
1012 redirect_edge_pred (single_succ_edge (preheader),
1013 single_pred (preheader));
1014 delete_basic_block (preheader);
1015 set_immediate_dominator (CDI_DOMINATORS, scalar_loop->header,
1016 loop_preheader_edge (scalar_loop)->src);
1017 }
1018 else /* Add the copy at entry. */
1019 {
1020 if (scalar_loop != loop)
1021 {
1022 /* Remove the non-necessary forwarder of scalar_loop again. */
1023 redirect_edge_pred (single_succ_edge (preheader),
1024 single_pred (preheader));
1025 delete_basic_block (preheader);
1026 set_immediate_dominator (CDI_DOMINATORS, scalar_loop->header,
1027 loop_preheader_edge (scalar_loop)->src);
1028 preheader = split_edge (loop_preheader_edge (loop));
1029 entry_e = single_pred_edge (preheader);
1030 }
1031
1032 redirect_edge_and_branch_force (entry_e, new_preheader);
1033 flush_pending_stmts (entry_e);
1034 set_immediate_dominator (CDI_DOMINATORS, new_preheader, entry_e->src);
1035
1036 redirect_edge_and_branch_force (new_exit, preheader);
1037 flush_pending_stmts (new_exit);
1038 set_immediate_dominator (CDI_DOMINATORS, preheader, new_exit->src);
1039
1040 /* And remove the non-necessary forwarder again. Keep the other
1041 one so we have a proper pre-header for the loop at the exit edge. */
1042 redirect_edge_pred (single_succ_edge (new_preheader),
1043 single_pred (new_preheader));
1044 delete_basic_block (new_preheader);
1045 set_immediate_dominator (CDI_DOMINATORS, new_loop->header,
1046 loop_preheader_edge (new_loop)->src);
1047 }
1048
1049 /* Skip new preheader since it's deleted if copy loop is added at entry. */
1050 for (unsigned i = (at_exit ? 0 : 1); i < scalar_loop->num_nodes + 1; i++)
1051 rename_variables_in_bb (new_bbs[i], duplicate_outer_loop);
1052
1053 if (scalar_loop != loop)
1054 {
1055 /* Update new_loop->header PHIs, so that on the preheader
1056 edge they are the ones from loop rather than scalar_loop. */
1057 gphi_iterator gsi_orig, gsi_new;
1058 edge orig_e = loop_preheader_edge (loop);
1059 edge new_e = loop_preheader_edge (new_loop);
1060
1061 for (gsi_orig = gsi_start_phis (loop->header),
1062 gsi_new = gsi_start_phis (new_loop->header);
1063 !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_new);
1064 gsi_next (&gsi_orig), gsi_next (&gsi_new))
1065 {
1066 gphi *orig_phi = gsi_orig.phi ();
1067 gphi *new_phi = gsi_new.phi ();
1068 tree orig_arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, orig_e);
1069 location_t orig_locus
1070 = gimple_phi_arg_location_from_edge (orig_phi, orig_e);
1071
1072 add_phi_arg (new_phi, orig_arg, new_e, orig_locus);
1073 }
1074 }
1075
1076 free (new_bbs);
1077 free (bbs);
1078
1079 checking_verify_dominators (CDI_DOMINATORS);
1080
1081 return new_loop;
1082 }
1083
1084
1085 /* Given the condition expression COND, put it as the last statement of
1086 GUARD_BB; set both edges' probability; set dominator of GUARD_TO to
1087 DOM_BB; return the skip edge. GUARD_TO is the target basic block to
1088 skip the loop. PROBABILITY is the skip edge's probability. Mark the
1089 new edge as irreducible if IRREDUCIBLE_P is true. */
1090
1091 static edge
1092 slpeel_add_loop_guard (basic_block guard_bb, tree cond,
1093 basic_block guard_to, basic_block dom_bb,
1094 profile_probability probability, bool irreducible_p)
1095 {
1096 gimple_stmt_iterator gsi;
1097 edge new_e, enter_e;
1098 gcond *cond_stmt;
1099 gimple_seq gimplify_stmt_list = NULL;
1100
1101 enter_e = EDGE_SUCC (guard_bb, 0);
1102 enter_e->flags &= ~EDGE_FALLTHRU;
1103 enter_e->flags |= EDGE_FALSE_VALUE;
1104 gsi = gsi_last_bb (guard_bb);
1105
1106 cond = force_gimple_operand_1 (cond, &gimplify_stmt_list, is_gimple_condexpr,
1107 NULL_TREE);
1108 if (gimplify_stmt_list)
1109 gsi_insert_seq_after (&gsi, gimplify_stmt_list, GSI_NEW_STMT);
1110
1111 cond_stmt = gimple_build_cond_from_tree (cond, NULL_TREE, NULL_TREE);
1112 gsi = gsi_last_bb (guard_bb);
1113 gsi_insert_after (&gsi, cond_stmt, GSI_NEW_STMT);
1114
1115 /* Add new edge to connect guard block to the merge/loop-exit block. */
1116 new_e = make_edge (guard_bb, guard_to, EDGE_TRUE_VALUE);
1117
1118 new_e->probability = probability;
1119 if (irreducible_p)
1120 new_e->flags |= EDGE_IRREDUCIBLE_LOOP;
1121
1122 enter_e->probability = probability.invert ();
1123 set_immediate_dominator (CDI_DOMINATORS, guard_to, dom_bb);
1124
1125 /* Split enter_e to preserve LOOPS_HAVE_PREHEADERS. */
1126 if (enter_e->dest->loop_father->header == enter_e->dest)
1127 split_edge (enter_e);
1128
1129 return new_e;
1130 }
1131
1132
1133 /* This function verifies that the following restrictions apply to LOOP:
1134 (1) it consists of exactly 2 basic blocks (header and an empty latch)
1135 for an innermost loop, and of 5 basic blocks for an outer loop.
1136 (2) it is single entry, single exit
1137 (3) its exit condition is the last stmt in the header
1138 (4) E is the entry/exit edge of LOOP.
1139 */
1140
1141 bool
1142 slpeel_can_duplicate_loop_p (const struct loop *loop, const_edge e)
1143 {
1144 edge exit_e = single_exit (loop);
1145 edge entry_e = loop_preheader_edge (loop);
1146 gcond *orig_cond = get_loop_exit_condition (loop);
1147 gimple_stmt_iterator loop_exit_gsi = gsi_last_bb (exit_e->src);
1148 unsigned int num_bb = loop->inner? 5 : 2;
1149
1150 /* All loops have an outer scope; the only case in which loop->outer is
1151 NULL is the root loop that represents the function itself. */
1152 if (!loop_outer (loop)
1153 || loop->num_nodes != num_bb
1154 || !empty_block_p (loop->latch)
1155 || !single_exit (loop)
1156 /* Verify that new loop exit condition can be trivially modified. */
1157 || (!orig_cond || orig_cond != gsi_stmt (loop_exit_gsi))
1158 || (e != exit_e && e != entry_e))
1159 return false;
1160
1161 return true;
1162 }
1163
1164 /* If the loop has a virtual PHI, but exit bb doesn't, create a virtual PHI
1165 in the exit bb and rename all the uses after the loop. This simplifies
1166 the *guard[12] routines, which assume loop closed SSA form for all PHIs
1167 (but normally loop closed SSA form doesn't require virtual PHIs to be
1168 in the same form). Doing this early simplifies checking which
1169 uses should be renamed. */
1170
1171 static void
1172 create_lcssa_for_virtual_phi (struct loop *loop)
1173 {
1174 gphi_iterator gsi;
1175 edge exit_e = single_exit (loop);
1176
1177 for (gsi = gsi_start_phis (loop->header); !gsi_end_p (gsi); gsi_next (&gsi))
1178 if (virtual_operand_p (gimple_phi_result (gsi_stmt (gsi))))
1179 {
1180 gphi *phi = gsi.phi ();
1181 for (gsi = gsi_start_phis (exit_e->dest);
1182 !gsi_end_p (gsi); gsi_next (&gsi))
1183 if (virtual_operand_p (gimple_phi_result (gsi_stmt (gsi))))
1184 break;
1185 if (gsi_end_p (gsi))
1186 {
1187 tree new_vop = copy_ssa_name (PHI_RESULT (phi));
1188 gphi *new_phi = create_phi_node (new_vop, exit_e->dest);
1189 tree vop = PHI_ARG_DEF_FROM_EDGE (phi, EDGE_SUCC (loop->latch, 0));
1190 imm_use_iterator imm_iter;
1191 gimple *stmt;
1192 use_operand_p use_p;
1193
1194 SSA_NAME_OCCURS_IN_ABNORMAL_PHI (new_vop)
1195 = SSA_NAME_OCCURS_IN_ABNORMAL_PHI (vop);
1196 add_phi_arg (new_phi, vop, exit_e, UNKNOWN_LOCATION);
1197 gimple_phi_set_result (new_phi, new_vop);
1198 FOR_EACH_IMM_USE_STMT (stmt, imm_iter, vop)
1199 if (stmt != new_phi
1200 && !flow_bb_inside_loop_p (loop, gimple_bb (stmt)))
1201 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
1202 SET_USE (use_p, new_vop);
1203 }
1204 break;
1205 }
1206
1207 }
1208
1209 /* Function vect_get_loop_location.
1210
1211 Extract the location of the loop in the source code.
1212 If the loop is not well formed for vectorization, an estimated
1213 location is calculated.
1214 Return the loop location if it can be determined and UNKNOWN_LOCATION otherwise. */
1215
1216 source_location
1217 find_loop_location (struct loop *loop)
1218 {
1219 gimple *stmt = NULL;
1220 basic_block bb;
1221 gimple_stmt_iterator si;
1222
1223 if (!loop)
1224 return UNKNOWN_LOCATION;
1225
1226 stmt = get_loop_exit_condition (loop);
1227
1228 if (stmt
1229 && LOCATION_LOCUS (gimple_location (stmt)) > BUILTINS_LOCATION)
1230 return gimple_location (stmt);
1231
1232 /* If we got here the loop is probably not "well formed",
1233 try to estimate the loop location. */
1234
1235 if (!loop->header)
1236 return UNKNOWN_LOCATION;
1237
1238 bb = loop->header;
1239
1240 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1241 {
1242 stmt = gsi_stmt (si);
1243 if (LOCATION_LOCUS (gimple_location (stmt)) > BUILTINS_LOCATION)
1244 return gimple_location (stmt);
1245 }
1246
1247 return UNKNOWN_LOCATION;
1248 }
1249
1250 /* Return true if PHI defines an IV of the loop to be vectorized. */
1251
1252 static bool
1253 iv_phi_p (gphi *phi)
1254 {
1255 if (virtual_operand_p (PHI_RESULT (phi)))
1256 return false;
1257
1258 stmt_vec_info stmt_info = vinfo_for_stmt (phi);
1259 gcc_assert (stmt_info != NULL);
1260 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1261 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
1262 return false;
1263
1264 return true;
1265 }
1266
1267 /* Function vect_can_advance_ivs_p
1268
1269 In case the number of iterations that LOOP iterates is unknown at compile
1270 time, an epilog loop will be generated, and the loop induction variables
1271 (IVs) will be "advanced" to the value they are supposed to take just before
1272 the epilog loop. Here we check that the access function of the loop IVs
1273 and the expression that represents the loop bound are simple enough.
1274 These restrictions will be relaxed in the future. */
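/* For example, an IV whose phi evolution part is a loop-invariant step such
   as 4 or an invariant SSA name is accepted by the checks below, whereas an
   IV whose step itself varies from iteration to iteration (a polynomial of
   degree two or more, e.g. x += i where i is another IV, so the evolution
   part is again a CHREC) is rejected.  */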
1275
1276 bool
1277 vect_can_advance_ivs_p (loop_vec_info loop_vinfo)
1278 {
1279 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1280 basic_block bb = loop->header;
1281 gphi_iterator gsi;
1282
1283 /* Analyze phi functions of the loop header. */
1284
1285 if (dump_enabled_p ())
1286 dump_printf_loc (MSG_NOTE, vect_location, "vect_can_advance_ivs_p:\n");
1287 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
1288 {
1289 tree evolution_part;
1290
1291 gphi *phi = gsi.phi ();
1292 if (dump_enabled_p ())
1293 {
1294 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
1295 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
1296 }
1297
1298 /* Skip virtual phi's. The data dependences that are associated with
1299 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.
1300
1301 Skip reduction phis. */
1302 if (!iv_phi_p (phi))
1303 {
1304 if (dump_enabled_p ())
1305 dump_printf_loc (MSG_NOTE, vect_location,
1306 "reduc or virtual phi. skip.\n");
1307 continue;
1308 }
1309
1310 /* Analyze the evolution function. */
1311
1312 evolution_part
1313 = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (vinfo_for_stmt (phi));
1314 if (evolution_part == NULL_TREE)
1315 {
1316 if (dump_enabled_p ())
1317 dump_printf (MSG_MISSED_OPTIMIZATION,
1318 "No access function or evolution.\n");
1319 return false;
1320 }
1321
1322 /* FORNOW: We do not transform initial conditions of IVs
1323 which evolution functions are not invariants in the loop. */
1324
1325 if (!expr_invariant_in_loop_p (loop, evolution_part))
1326 {
1327 if (dump_enabled_p ())
1328 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1329 "evolution not invariant in loop.\n");
1330 return false;
1331 }
1332
1333 /* FORNOW: We do not transform initial conditions of IVs
1334 which evolution functions are a polynomial of degree >= 2. */
1335
1336 if (tree_is_chrec (evolution_part))
1337 {
1338 if (dump_enabled_p ())
1339 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1340 "evolution is chrec.\n");
1341 return false;
1342 }
1343 }
1344
1345 return true;
1346 }
1347
1348
1349 /* Function vect_update_ivs_after_vectorizer.
1350
1351 "Advance" the induction variables of LOOP to the value they should take
1352 after the execution of LOOP. This is currently necessary because the
1353 vectorizer does not handle induction variables that are used after the
1354 loop. Such a situation occurs when the last iterations of LOOP are
1355 peeled, because:
1356 1. We introduced new uses after LOOP for IVs that were not originally used
1357 after LOOP: the IVs of LOOP are now used by an epilog loop.
1358 2. LOOP is going to be vectorized; this means that it will iterate N/VF
1359 times, whereas the loop IVs should be bumped N times.
1360
1361 Input:
1362 - LOOP - a loop that is going to be vectorized. The last few iterations
1363 of LOOP were peeled.
1364 - NITERS - the number of iterations that LOOP executes (before it is
1365 vectorized), i.e., the number of times the ivs should be bumped.
1366 - UPDATE_E - a successor edge of LOOP->exit that is on the (only) path
1367 coming out from LOOP on which there are uses of the LOOP ivs
1368 (this is the path from LOOP->exit to epilog_loop->preheader).
1369
1370 The new definitions of the ivs are placed in LOOP->exit.
1371 The phi args associated with the edge UPDATE_E in the bb
1372 UPDATE_E->dest are updated accordingly.
1373
1374 Assumption 1: Like the rest of the vectorizer, this function assumes
1375 a single loop exit that has a single predecessor.
1376
1377 Assumption 2: The phi nodes in the LOOP header and in update_bb are
1378 organized in the same order.
1379
1380 Assumption 3: The access function of the ivs is simple enough (see
1381 vect_can_advance_ivs_p). This assumption will be relaxed in the future.
1382
1383 Assumption 4: Exactly one of the successors of LOOP exit-bb is on a path
1384 coming out of LOOP on which the ivs of LOOP are used (this is the path
1385 that leads to the epilog loop; other paths skip the epilog loop). This
1386 path starts with the edge UPDATE_E, and its destination (denoted update_bb)
1387 needs to have its phis updated.
1388 */
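/* For example (values purely illustrative), for an IV with initial value
   init_7 and step 4, and NITERS = niters_9 scalar iterations, the code
   below inserts

     ni = init_7 + niters_9 * 4

   (or an equivalent POINTER_PLUS_EXPR for pointer IVs) at the end of the
   loop's exit block and makes the UPDATE_E argument of the matching phi
   in UPDATE_BB use that value.  */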
1389
1390 static void
1391 vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo,
1392 tree niters, edge update_e)
1393 {
1394 gphi_iterator gsi, gsi1;
1395 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1396 basic_block update_bb = update_e->dest;
1397 basic_block exit_bb = single_exit (loop)->dest;
1398
1399 /* Make sure there exists a single-predecessor exit bb: */
1400 gcc_assert (single_pred_p (exit_bb));
1401 gcc_assert (single_succ_edge (exit_bb) == update_e);
1402
1403 for (gsi = gsi_start_phis (loop->header), gsi1 = gsi_start_phis (update_bb);
1404 !gsi_end_p (gsi) && !gsi_end_p (gsi1);
1405 gsi_next (&gsi), gsi_next (&gsi1))
1406 {
1407 tree init_expr;
1408 tree step_expr, off;
1409 tree type;
1410 tree var, ni, ni_name;
1411 gimple_stmt_iterator last_gsi;
1412
1413 gphi *phi = gsi.phi ();
1414 gphi *phi1 = gsi1.phi ();
1415 if (dump_enabled_p ())
1416 {
1417 dump_printf_loc (MSG_NOTE, vect_location,
1418 "vect_update_ivs_after_vectorizer: phi: ");
1419 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
1420 }
1421
1422 /* Skip reduction and virtual phis. */
1423 if (!iv_phi_p (phi))
1424 {
1425 if (dump_enabled_p ())
1426 dump_printf_loc (MSG_NOTE, vect_location,
1427 "reduc or virtual phi. skip.\n");
1428 continue;
1429 }
1430
1431 type = TREE_TYPE (gimple_phi_result (phi));
1432 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (vinfo_for_stmt (phi));
1433 step_expr = unshare_expr (step_expr);
1434
1435 /* FORNOW: We do not support IVs whose evolution function is a polynomial
1436 of degree >= 2 or exponential. */
1437 gcc_assert (!tree_is_chrec (step_expr));
1438
1439 init_expr = PHI_ARG_DEF_FROM_EDGE (phi, loop_preheader_edge (loop));
1440
1441 off = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
1442 fold_convert (TREE_TYPE (step_expr), niters),
1443 step_expr);
1444 if (POINTER_TYPE_P (type))
1445 ni = fold_build_pointer_plus (init_expr, off);
1446 else
1447 ni = fold_build2 (PLUS_EXPR, type,
1448 init_expr, fold_convert (type, off));
1449
1450 var = create_tmp_var (type, "tmp");
1451
1452 last_gsi = gsi_last_bb (exit_bb);
1453 gimple_seq new_stmts = NULL;
1454 ni_name = force_gimple_operand (ni, &new_stmts, false, var);
1455 /* Exit_bb shouldn't be empty. */
1456 if (!gsi_end_p (last_gsi))
1457 gsi_insert_seq_after (&last_gsi, new_stmts, GSI_SAME_STMT);
1458 else
1459 gsi_insert_seq_before (&last_gsi, new_stmts, GSI_SAME_STMT);
1460
1461 /* Fix phi expressions in the successor bb. */
1462 adjust_phi_and_debug_stmts (phi1, update_e, ni_name);
1463 }
1464 }
1465
1466 /* Function vect_gen_prolog_loop_niters
1467
1468 Generate the number of iterations which should be peeled as prolog for the
1469 loop represented by LOOP_VINFO. It is calculated as the misalignment of
1470 DR - the data reference recorded in LOOP_VINFO_UNALIGNED_DR (LOOP_VINFO).
1471 As a result, after the execution of this loop, the data reference DR will
1472 refer to an aligned location. The following computation is generated:
1473
1474 If the misalignment of DR is known at compile time:
1475 addr_mis = int mis = DR_MISALIGNMENT (dr);
1476 Else, compute address misalignment in bytes:
1477 addr_mis = addr & (vectype_align - 1)
1478
1479 prolog_niters = ((VF - addr_mis/elem_size)&(VF-1))/step
1480
1481 (elem_size = element type size; an element is the scalar element whose type
1482 is the inner type of the vectype)
1483
1484 The computations will be emitted at the end of BB. We also compute and
1485 store upper bound (included) of the result in BOUND.
1486
1487 When the step of the data-ref in the loop is not 1 (as in interleaved data
1488 and SLP), the number of iterations of the prolog must be divided by the step
1489 (which is equal to the size of interleaved group).
1490
1491 The above formulas assume that VF == number of elements in the vector. This
1492 may not hold when there are multiple-types in the loop.
1493 In this case, for some data-references in the loop the VF does not represent
1494 the number of elements that fit in the vector. Therefore, instead of VF we
1495 use TYPE_VECTOR_SUBPARTS. */
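/* A worked example of the formula above, assuming a 4-element vector of
   4-byte ints with a 16-byte target alignment and an access whose address
   is misaligned by 8 bytes: addr_mis / elem_size = 2, so

     prolog_niters = ((4 - 2) & (4 - 1)) / 1 = 2

   scalar iterations are peeled, after which the access is 16-byte
   aligned.  */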
1496
1497 static tree
1498 vect_gen_prolog_loop_niters (loop_vec_info loop_vinfo,
1499 basic_block bb, int *bound)
1500 {
1501 struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
1502 tree var;
1503 tree niters_type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
1504 gimple_seq stmts = NULL, new_stmts = NULL;
1505 tree iters, iters_name;
1506 gimple *dr_stmt = DR_STMT (dr);
1507 stmt_vec_info stmt_info = vinfo_for_stmt (dr_stmt);
1508 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1509 unsigned int target_align = DR_TARGET_ALIGNMENT (dr);
1510
1511 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
1512 {
1513 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1514
1515 if (dump_enabled_p ())
1516 dump_printf_loc (MSG_NOTE, vect_location,
1517 "known peeling = %d.\n", npeel);
1518
1519 iters = build_int_cst (niters_type, npeel);
1520 *bound = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1521 }
1522 else
1523 {
1524 bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
1525 tree offset = negative
1526 ? size_int (-TYPE_VECTOR_SUBPARTS (vectype) + 1) : size_zero_node;
1527 tree start_addr = vect_create_addr_base_for_vector_ref (dr_stmt,
1528 &stmts, offset);
1529 tree type = unsigned_type_for (TREE_TYPE (start_addr));
1530 tree target_align_minus_1 = build_int_cst (type, target_align - 1);
1531 HOST_WIDE_INT elem_size
1532 = int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1533 tree elem_size_log = build_int_cst (type, exact_log2 (elem_size));
1534 HOST_WIDE_INT align_in_elems = target_align / elem_size;
1535 tree align_in_elems_minus_1 = build_int_cst (type, align_in_elems - 1);
1536 tree align_in_elems_tree = build_int_cst (type, align_in_elems);
1537 tree misalign_in_bytes;
1538 tree misalign_in_elems;
1539
1540 /* Create: misalign_in_bytes = addr & (target_align - 1). */
1541 misalign_in_bytes
1542 = fold_build2 (BIT_AND_EXPR, type, fold_convert (type, start_addr),
1543 target_align_minus_1);
1544
1545 /* Create: misalign_in_elems = misalign_in_bytes / element_size. */
1546 misalign_in_elems
1547 = fold_build2 (RSHIFT_EXPR, type, misalign_in_bytes, elem_size_log);
1548
1549 /* Create: (niters_type) ((align_in_elems - misalign_in_elems)
1550 & (align_in_elems - 1)). */
1551 if (negative)
1552 iters = fold_build2 (MINUS_EXPR, type, misalign_in_elems,
1553 align_in_elems_tree);
1554 else
1555 iters = fold_build2 (MINUS_EXPR, type, align_in_elems_tree,
1556 misalign_in_elems);
1557 iters = fold_build2 (BIT_AND_EXPR, type, iters, align_in_elems_minus_1);
1558 iters = fold_convert (niters_type, iters);
1559 *bound = align_in_elems - 1;
1560 }
1561
1562 if (dump_enabled_p ())
1563 {
1564 dump_printf_loc (MSG_NOTE, vect_location,
1565 "niters for prolog loop: ");
1566 dump_generic_expr (MSG_NOTE, TDF_SLIM, iters);
1567 dump_printf (MSG_NOTE, "\n");
1568 }
1569
1570 var = create_tmp_var (niters_type, "prolog_loop_niters");
1571 iters_name = force_gimple_operand (iters, &new_stmts, false, var);
1572
1573 if (new_stmts)
1574 gimple_seq_add_seq (&stmts, new_stmts);
1575 if (stmts)
1576 {
1577 gcc_assert (single_succ_p (bb));
1578 gimple_stmt_iterator gsi = gsi_last_bb (bb);
1579 if (gsi_end_p (gsi))
1580 gsi_insert_seq_before (&gsi, stmts, GSI_SAME_STMT);
1581 else
1582 gsi_insert_seq_after (&gsi, stmts, GSI_SAME_STMT);
1583 }
1584 return iters_name;
1585 }
1586
1587
1588 /* Function vect_update_init_of_dr
1589
1590 NITERS iterations were peeled from LOOP. DR represents a data reference
1591 in LOOP. This function updates the information recorded in DR to
1592 account for the fact that the first NITERS iterations had already been
1593 executed. Specifically, it updates the OFFSET field of DR. */
1594
1595 static void
1596 vect_update_init_of_dr (struct data_reference *dr, tree niters)
1597 {
1598 tree offset = DR_OFFSET (dr);
1599
1600 niters = fold_build2 (MULT_EXPR, sizetype,
1601 fold_convert (sizetype, niters),
1602 fold_convert (sizetype, DR_STEP (dr)));
1603 offset = fold_build2 (PLUS_EXPR, sizetype,
1604 fold_convert (sizetype, offset), niters);
1605 DR_OFFSET (dr) = offset;
1606 }
1607
1608
1609 /* Function vect_update_inits_of_drs
1610
1611 NITERS iterations were peeled from the loop represented by LOOP_VINFO.
1612 This function updates the information recorded for the data references in
1613 the loop to account for the fact that the first NITERS iterations had
1614 already been executed. Specifically, it updates the initial_condition of
1615 the access_function of all the data_references in the loop. */
1616
1617 static void
1618 vect_update_inits_of_drs (loop_vec_info loop_vinfo, tree niters)
1619 {
1620 unsigned int i;
1621 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1622 struct data_reference *dr;
1623
1624 if (dump_enabled_p ())
1625 dump_printf_loc (MSG_NOTE, vect_location,
1626 "=== vect_update_inits_of_dr ===\n");
1627
1628 /* Adjust niters to sizetype and insert stmts on loop preheader edge. */
1629 if (!types_compatible_p (sizetype, TREE_TYPE (niters)))
1630 {
1631 gimple_seq seq;
1632 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1633 tree var = create_tmp_var (sizetype, "prolog_loop_adjusted_niters");
1634
1635 niters = fold_convert (sizetype, niters);
1636 niters = force_gimple_operand (niters, &seq, false, var);
1637 if (seq)
1638 {
1639 basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
1640 gcc_assert (!new_bb);
1641 }
1642 }
1643
1644 FOR_EACH_VEC_ELT (datarefs, i, dr)
1645 vect_update_init_of_dr (dr, niters);
1646 }
1647
1648
1649 /* This function builds ni_name = number of iterations. Statements
1650 are emitted on the loop preheader edge. If NEW_VAR_P is not NULL, set
1651 *NEW_VAR_P to true if a new ssa_var is generated. */
1652
1653 tree
1654 vect_build_loop_niters (loop_vec_info loop_vinfo, bool *new_var_p)
1655 {
1656 tree ni = unshare_expr (LOOP_VINFO_NITERS (loop_vinfo));
1657 if (TREE_CODE (ni) == INTEGER_CST)
1658 return ni;
1659 else
1660 {
1661 tree ni_name, var;
1662 gimple_seq stmts = NULL;
1663 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1664
1665 var = create_tmp_var (TREE_TYPE (ni), "niters");
1666 ni_name = force_gimple_operand (ni, &stmts, false, var);
1667 if (stmts)
1668 {
1669 gsi_insert_seq_on_edge_immediate (pe, stmts);
1670 if (new_var_p != NULL)
1671 *new_var_p = true;
1672 }
1673
1674 return ni_name;
1675 }
1676 }
1677
1678 /* Calculate the number of iterations above which the vectorized loop will
1679 be preferred over the scalar loop.  NITERS_PROLOG is the number of
1680 iterations of the prolog loop.  If it is an integer constant, its value
1681 is also passed in INT_NITERS_PROLOG.  BOUND_PROLOG is the (inclusive)
1682 upper bound on the number of iterations of the prolog loop.  VFM1 is the
1683 vectorization factor minus one.  If CHECK_PROFITABILITY is true, TH is
1684 the threshold below which the scalar (rather than vectorized) loop will
1685 be executed.  This function stores the (inclusive) upper bound of the result in BOUND_SCALAR.  */
1686
1687 static tree
1688 vect_gen_scalar_loop_niters (tree niters_prolog, int int_niters_prolog,
1689 int bound_prolog, poly_int64 vfm1, int th,
1690 poly_uint64 *bound_scalar,
1691 bool check_profitability)
1692 {
1693 tree type = TREE_TYPE (niters_prolog);
1694 tree niters = fold_build2 (PLUS_EXPR, type, niters_prolog,
1695 build_int_cst (type, vfm1));
1696
1697 *bound_scalar = vfm1 + bound_prolog;
1698 if (check_profitability)
1699 {
1700 /* TH indicates the minimum niters of the vectorized loop, while we
1701 compute the maximum niters of the scalar loop.  */
1702 th--;
1703 /* Peeling a constant number of times.  */
1704 if (int_niters_prolog >= 0)
1705 {
1706 *bound_scalar = upper_bound (int_niters_prolog + vfm1, th);
1707 return build_int_cst (type, *bound_scalar);
1708 }
1709 /* Peeling an unknown number of times.  Note BOUND_PROLOG is the upper
1710 bound (inclusive) on the niters of the prolog loop.  */
1711 if (known_ge (th, vfm1 + bound_prolog))
1712 {
1713 *bound_scalar = th;
1714 return build_int_cst (type, th);
1715 }
1716 /* Need to do runtime comparison. */
1717 else if (maybe_gt (th, vfm1))
1718 {
1719 *bound_scalar = upper_bound (*bound_scalar, th);
1720 return fold_build2 (MAX_EXPR, type,
1721 build_int_cst (type, th), niters);
1722 }
1723 }
1724 return niters;
1725 }
1726
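/* Illustrative sketch only (not part of GCC): roughly the bound computed by
   vect_gen_scalar_loop_niters above, for the simple case in which the prolog
   iteration count is a known constant.  The scalar loop is taken when the
   iteration count is at most this bound: either too few iterations remain
   for a single vector iteration (NITERS_PROLOG + VF - 1), or the count is
   below the profitability threshold TH.  Names are hypothetical.  */

static unsigned int
example_scalar_loop_bound (unsigned int niters_prolog, unsigned int vf,
                           unsigned int th, int check_profitability)
{
  unsigned int bound = niters_prolog + (vf - 1);

  /* TH is the minimum profitable niters for the vectorized loop, so the
     scalar loop covers at most TH - 1 iterations.  */
  if (check_profitability && th > bound + 1)
    bound = th - 1;
  return bound;
}
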
1727 /* NITERS is the number of times that the original scalar loop executes
1728 after peeling. Work out the maximum number of iterations N that can
1729 be handled by the vectorized form of the loop and then either:
1730
1731 a) set *STEP_VECTOR_PTR to the vectorization factor and generate:
1732
1733 niters_vector = N
1734
1735 b) set *STEP_VECTOR_PTR to one and generate:
1736
1737 niters_vector = N / vf
1738
1739 In both cases, store niters_vector in *NITERS_VECTOR_PTR and add
1740 any new statements on the loop preheader edge. NITERS_NO_OVERFLOW
1741 is true if NITERS doesn't overflow (i.e. if NITERS is always nonzero). */
1742
1743 void
1744 vect_gen_vector_loop_niters (loop_vec_info loop_vinfo, tree niters,
1745 tree *niters_vector_ptr, tree *step_vector_ptr,
1746 bool niters_no_overflow)
1747 {
1748 tree ni_minus_gap, var;
1749 tree niters_vector, step_vector, type = TREE_TYPE (niters);
1750 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1751 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1752 tree log_vf = NULL_TREE;
1753
1754 /* If an epilogue loop is required because of data accesses with gaps, we
1755 subtract one iteration from the total number of iterations here for
1756 correct calculation of the number of vector iterations.  */
1757 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
1758 {
1759 ni_minus_gap = fold_build2 (MINUS_EXPR, type, niters,
1760 build_one_cst (type));
1761 if (!is_gimple_val (ni_minus_gap))
1762 {
1763 var = create_tmp_var (type, "ni_gap");
1764 gimple *stmts = NULL;
1765 ni_minus_gap = force_gimple_operand (ni_minus_gap, &stmts,
1766 true, var);
1767 gsi_insert_seq_on_edge_immediate (pe, stmts);
1768 }
1769 }
1770 else
1771 ni_minus_gap = niters;
1772
1773 unsigned HOST_WIDE_INT const_vf;
1774 if (vf.is_constant (&const_vf)
1775 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
1776 {
1777 /* Create: niters >> log2(vf) */
1778 /* If it's known that niters == number of latch executions + 1 doesn't
1779 overflow, we can generate niters >> log2(vf); otherwise we generate
1780 (niters - vf) >> log2(vf) + 1 by using the fact that we know ratio
1781 will be at least one. */
1782 log_vf = build_int_cst (type, exact_log2 (const_vf));
1783 if (niters_no_overflow)
1784 niters_vector = fold_build2 (RSHIFT_EXPR, type, ni_minus_gap, log_vf);
1785 else
1786 niters_vector
1787 = fold_build2 (PLUS_EXPR, type,
1788 fold_build2 (RSHIFT_EXPR, type,
1789 fold_build2 (MINUS_EXPR, type,
1790 ni_minus_gap,
1791 build_int_cst (type, vf)),
1792 log_vf),
1793 build_int_cst (type, 1));
1794 step_vector = build_one_cst (type);
1795 }
1796 else
1797 {
1798 niters_vector = ni_minus_gap;
1799 step_vector = build_int_cst (type, vf);
1800 }
1801
1802 if (!is_gimple_val (niters_vector))
1803 {
1804 var = create_tmp_var (type, "bnd");
1805 gimple_seq stmts = NULL;
1806 niters_vector = force_gimple_operand (niters_vector, &stmts, true, var);
1807 gsi_insert_seq_on_edge_immediate (pe, stmts);
1808 /* The peeling algorithm guarantees that the vector loop bound is at least
1809 ONE; we set range information to make the niters analyzer's life easier.  */
1810 if (stmts != NULL && log_vf)
1811 set_range_info (niters_vector, VR_RANGE,
1812 wi::to_wide (build_int_cst (type, 1)),
1813 wi::to_wide (fold_build2 (RSHIFT_EXPR, type,
1814 TYPE_MAX_VALUE (type),
1815 log_vf)));
1816 }
1817 *niters_vector_ptr = niters_vector;
1818 *step_vector_ptr = step_vector;
1819
1820 return;
1821 }
1822
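/* Illustrative sketch only (not part of GCC): the two formulas used by
   vect_gen_vector_loop_niters above when the vectorization factor VF is a
   known power of two and the loop is not fully masked.  If NITERS cannot
   wrap around zero we can simply shift; otherwise we use the fact that at
   least one vector iteration will run and add it back explicitly.  Names
   are hypothetical.  */

static unsigned int
example_vector_loop_niters (unsigned int niters, unsigned int log_vf,
                            int niters_no_overflow)
{
  unsigned int vf = 1u << log_vf;

  if (niters_no_overflow)
    return niters >> log_vf;                    /* niters / vf */
  /* Equal to the above when no wrap occurs, and still correct in unsigned
     arithmetic if NITERS wrapped to zero.  */
  return ((niters - vf) >> log_vf) + 1;
}
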
1823 /* Given NITERS_VECTOR which is the number of iterations for vectorized
1824 loop specified by LOOP_VINFO after vectorization, compute the number
1825 of iterations before vectorization (niters_vector * vf) and store it
1826 to NITERS_VECTOR_MULT_VF_PTR. */
1827
1828 static void
1829 vect_gen_vector_loop_niters_mult_vf (loop_vec_info loop_vinfo,
1830 tree niters_vector,
1831 tree *niters_vector_mult_vf_ptr)
1832 {
1833 /* We should be using a step_vector of VF if VF is variable. */
1834 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo).to_constant ();
1835 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1836 tree type = TREE_TYPE (niters_vector);
1837 tree log_vf = build_int_cst (type, exact_log2 (vf));
1838 basic_block exit_bb = single_exit (loop)->dest;
1839
1840 gcc_assert (niters_vector_mult_vf_ptr != NULL);
1841 tree niters_vector_mult_vf = fold_build2 (LSHIFT_EXPR, type,
1842 niters_vector, log_vf);
1843 if (!is_gimple_val (niters_vector_mult_vf))
1844 {
1845 tree var = create_tmp_var (type, "niters_vector_mult_vf");
1846 gimple_seq stmts = NULL;
1847 niters_vector_mult_vf = force_gimple_operand (niters_vector_mult_vf,
1848 &stmts, true, var);
1849 gimple_stmt_iterator gsi = gsi_start_bb (exit_bb);
1850 gsi_insert_seq_before (&gsi, stmts, GSI_SAME_STMT);
1851 }
1852 *niters_vector_mult_vf_ptr = niters_vector_mult_vf;
1853 }
1854
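/* Illustrative sketch only (not part of GCC): the value installed by
   vect_gen_vector_loop_niters_mult_vf above, i.e. the number of scalar
   iterations covered by the vector loop, recovered from the vector
   iteration count by shifting by log2 (VF).  Names are hypothetical.  */

static unsigned int
example_niters_vector_mult_vf (unsigned int niters_vector, unsigned int log_vf)
{
  return niters_vector << log_vf;               /* niters_vector * vf */
}
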
1855 /* Function slpeel_tree_duplicate_loop_to_edge_cfg duplicates FIRST/SECOND
1856 from SECOND/FIRST and puts it at the original loop's preheader/exit
1857 edge; the two loops are arranged as below:
1858
1859 preheader_a:
1860 first_loop:
1861 header_a:
1862 i_1 = PHI<i_0, i_2>;
1863 ...
1864 i_2 = i_1 + 1;
1865 if (cond_a)
1866 goto latch_a;
1867 else
1868 goto between_bb;
1869 latch_a:
1870 goto header_a;
1871
1872 between_bb:
1873 ;; i_x = PHI<i_2>; ;; LCSSA phi node to be created for FIRST,
1874
1875 second_loop:
1876 header_b:
1877 i_3 = PHI<i_0, i_4>; ;; Use of i_0 to be replaced with i_x,
1878 or with i_2 if no LCSSA phi is created
1879 under condition of CREATE_LCSSA_FOR_IV_PHIS.
1880 ...
1881 i_4 = i_3 + 1;
1882 if (cond_b)
1883 goto latch_b;
1884 else
1885 goto exit_bb;
1886 latch_b:
1887 goto header_b;
1888
1889 exit_bb:
1890
1891 This function creates loop-closed SSA for the first loop and updates the
1892 second loop's PHI nodes by replacing the argument on the incoming edge
1893 with the result of the newly created lcssa PHI nodes.  If
1894 CREATE_LCSSA_FOR_IV_PHIS is false, loop-closed ssa phis will only be
1895 created for non-iv phis of the first loop.
1896 
1897 This function assumes the exit bb of the first loop is the preheader bb
1898 of the second loop, i.e., between_bb in the example code.  With the PHIs
1899 updated, the second loop will execute the remaining iterations of the first.  */
1900
1901 static void
1902 slpeel_update_phi_nodes_for_loops (loop_vec_info loop_vinfo,
1903 struct loop *first, struct loop *second,
1904 bool create_lcssa_for_iv_phis)
1905 {
1906 gphi_iterator gsi_update, gsi_orig;
1907 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1908
1909 edge first_latch_e = EDGE_SUCC (first->latch, 0);
1910 edge second_preheader_e = loop_preheader_edge (second);
1911 basic_block between_bb = single_exit (first)->dest;
1912
1913 gcc_assert (between_bb == second_preheader_e->src);
1914 gcc_assert (single_pred_p (between_bb) && single_succ_p (between_bb));
1915 /* Either the first loop or the second is the loop to be vectorized. */
1916 gcc_assert (loop == first || loop == second);
1917
1918 for (gsi_orig = gsi_start_phis (first->header),
1919 gsi_update = gsi_start_phis (second->header);
1920 !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_update);
1921 gsi_next (&gsi_orig), gsi_next (&gsi_update))
1922 {
1923 gphi *orig_phi = gsi_orig.phi ();
1924 gphi *update_phi = gsi_update.phi ();
1925
1926 tree arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, first_latch_e);
1927 /* Generate lcssa PHI node for the first loop. */
1928 gphi *vect_phi = (loop == first) ? orig_phi : update_phi;
1929 if (create_lcssa_for_iv_phis || !iv_phi_p (vect_phi))
1930 {
1931 tree new_res = copy_ssa_name (PHI_RESULT (orig_phi));
1932 gphi *lcssa_phi = create_phi_node (new_res, between_bb);
1933 add_phi_arg (lcssa_phi, arg, single_exit (first), UNKNOWN_LOCATION);
1934 arg = new_res;
1935 }
1936
1937 /* Update PHI node in the second loop by replacing arg on the loop's
1938 incoming edge. */
1939 adjust_phi_and_debug_stmts (update_phi, second_preheader_e, arg);
1940 }
1941 }
1942
1943 /* Function slpeel_add_loop_guard adds a guard skipping from the beginning
1944 of SKIP_LOOP to the beginning of UPDATE_LOOP.  GUARD_EDGE and MERGE_EDGE
1945 are the two pred edges of the merge point before UPDATE_LOOP.  The two loops
1946 appear like below:
1947
1948 guard_bb:
1949 if (cond)
1950 goto merge_bb;
1951 else
1952 goto skip_loop;
1953
1954 skip_loop:
1955 header_a:
1956 i_1 = PHI<i_0, i_2>;
1957 ...
1958 i_2 = i_1 + 1;
1959 if (cond_a)
1960 goto latch_a;
1961 else
1962 goto exit_a;
1963 latch_a:
1964 goto header_a;
1965
1966 exit_a:
1967 i_5 = PHI<i_2>;
1968
1969 merge_bb:
1970 ;; PHI (i_x = PHI<i_0, i_5>) to be created at merge point.
1971
1972 update_loop:
1973 header_b:
1974 i_3 = PHI<i_5, i_4>; ;; Use of i_5 to be replaced with i_x.
1975 ...
1976 i_4 = i_3 + 1;
1977 if (cond_b)
1978 goto latch_b;
1979 else
1980 goto exit_bb;
1981 latch_b:
1982 goto header_b;
1983
1984 exit_bb:
1985
1986 This function creates PHI nodes at merge_bb and replaces the use of i_5
1987 in the update_loop's PHI node with the result of the new PHI node.  */
1988
1989 static void
1990 slpeel_update_phi_nodes_for_guard1 (struct loop *skip_loop,
1991 struct loop *update_loop,
1992 edge guard_edge, edge merge_edge)
1993 {
1994 source_location merge_loc, guard_loc;
1995 edge orig_e = loop_preheader_edge (skip_loop);
1996 edge update_e = loop_preheader_edge (update_loop);
1997 gphi_iterator gsi_orig, gsi_update;
1998
1999 for ((gsi_orig = gsi_start_phis (skip_loop->header),
2000 gsi_update = gsi_start_phis (update_loop->header));
2001 !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_update);
2002 gsi_next (&gsi_orig), gsi_next (&gsi_update))
2003 {
2004 gphi *orig_phi = gsi_orig.phi ();
2005 gphi *update_phi = gsi_update.phi ();
2006
2007 /* Generate new phi node at merge bb of the guard. */
2008 tree new_res = copy_ssa_name (PHI_RESULT (orig_phi));
2009 gphi *new_phi = create_phi_node (new_res, guard_edge->dest);
2010
2011 /* Merge bb has two incoming edges: GUARD_EDGE and MERGE_EDGE. Set the
2012 args in NEW_PHI for these edges. */
2013 tree merge_arg = PHI_ARG_DEF_FROM_EDGE (update_phi, update_e);
2014 tree guard_arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, orig_e);
2015 merge_loc = gimple_phi_arg_location_from_edge (update_phi, update_e);
2016 guard_loc = gimple_phi_arg_location_from_edge (orig_phi, orig_e);
2017 add_phi_arg (new_phi, merge_arg, merge_edge, merge_loc);
2018 add_phi_arg (new_phi, guard_arg, guard_edge, guard_loc);
2019
2020 /* Update phi in UPDATE_PHI. */
2021 adjust_phi_and_debug_stmts (update_phi, update_e, new_res);
2022 }
2023 }
2024
2025 /* LCSSA_PHI is an lcssa phi of the EPILOG loop, which is copied from LOOP.
2026 This function searches for the corresponding lcssa phi node in the exit
2027 bb of LOOP.  If it is found, return the phi result; otherwise return
2028 NULL. */
2029
2030 static tree
2031 find_guard_arg (struct loop *loop, struct loop *epilog ATTRIBUTE_UNUSED,
2032 gphi *lcssa_phi)
2033 {
2034 gphi_iterator gsi;
2035 edge e = single_exit (loop);
2036
2037 gcc_assert (single_pred_p (e->dest));
2038 for (gsi = gsi_start_phis (e->dest); !gsi_end_p (gsi); gsi_next (&gsi))
2039 {
2040 gphi *phi = gsi.phi ();
2041 if (operand_equal_p (PHI_ARG_DEF (phi, 0),
2042 PHI_ARG_DEF (lcssa_phi, 0), 0))
2043 return PHI_RESULT (phi);
2044 }
2045 return NULL_TREE;
2046 }
2047
2048 /* LOOP and EPILOG are two consecutive loops in the CFG and EPILOG is copied
2049 from LOOP.  Function slpeel_add_loop_guard adds a guard skipping from a
2050 point between the two loops to the end of EPILOG. Edges GUARD_EDGE
2051 and MERGE_EDGE are the two pred edges of merge_bb at the end of EPILOG.
2052 The CFG looks like:
2053
2054 loop:
2055 header_a:
2056 i_1 = PHI<i_0, i_2>;
2057 ...
2058 i_2 = i_1 + 1;
2059 if (cond_a)
2060 goto latch_a;
2061 else
2062 goto exit_a;
2063 latch_a:
2064 goto header_a;
2065
2066 exit_a:
2067
2068 guard_bb:
2069 if (cond)
2070 goto merge_bb;
2071 else
2072 goto epilog_loop;
2073
2074 ;; fall_through_bb
2075
2076 epilog_loop:
2077 header_b:
2078 i_3 = PHI<i_2, i_4>;
2079 ...
2080 i_4 = i_3 + 1;
2081 if (cond_b)
2082 goto latch_b;
2083 else
2084 goto merge_bb;
2085 latch_b:
2086 goto header_b;
2087
2088 merge_bb:
2089 ; PHI node (i_y = PHI<i_2, i_4>) to be created at merge point.
2090
2091 exit_bb:
2092 i_x = PHI<i_4>; ;Use of i_4 to be replaced with i_y in merge_bb.
2093
2094 For each name used outside EPILOG (i.e. for each name that has an lcssa
2095 phi in exit_bb) we create a new PHI in merge_bb.  The new PHI has two
2096 args corresponding to GUARD_EDGE and MERGE_EDGE.  The arg for MERGE_EDGE
2097 is the arg of the original PHI in exit_bb, while the arg for GUARD_EDGE
2098 is defined by LOOP and is found in the exit bb of LOOP.  The arg of the
2099 original PHI in exit_bb will also be updated.  */
2100
2101 static void
2102 slpeel_update_phi_nodes_for_guard2 (struct loop *loop, struct loop *epilog,
2103 edge guard_edge, edge merge_edge)
2104 {
2105 gphi_iterator gsi;
2106 basic_block merge_bb = guard_edge->dest;
2107
2108 gcc_assert (single_succ_p (merge_bb));
2109 edge e = single_succ_edge (merge_bb);
2110 basic_block exit_bb = e->dest;
2111 gcc_assert (single_pred_p (exit_bb));
2112 gcc_assert (single_pred (exit_bb) == single_exit (epilog)->dest);
2113
2114 for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi); gsi_next (&gsi))
2115 {
2116 gphi *update_phi = gsi.phi ();
2117 tree old_arg = PHI_ARG_DEF (update_phi, 0);
2118 /* This loop-closed-phi actually doesn't represent a use out of the
2119 loop - the phi arg is a constant. */
2120 if (TREE_CODE (old_arg) != SSA_NAME)
2121 continue;
2122
2123 tree merge_arg = get_current_def (old_arg);
2124 if (!merge_arg)
2125 merge_arg = old_arg;
2126
2127 tree guard_arg = find_guard_arg (loop, epilog, update_phi);
2128 /* If the var is live after loop but not a reduction, we simply
2129 use the old arg. */
2130 if (!guard_arg)
2131 guard_arg = old_arg;
2132
2133 /* Create new phi node in MERGE_BB: */
2134 tree new_res = copy_ssa_name (PHI_RESULT (update_phi));
2135 gphi *merge_phi = create_phi_node (new_res, merge_bb);
2136
2137 /* MERGE_BB has two incoming edges: GUARD_EDGE and MERGE_EDGE.  Set
2138 the two PHI args in merge_phi for these edges.  */
2139 add_phi_arg (merge_phi, merge_arg, merge_edge, UNKNOWN_LOCATION);
2140 add_phi_arg (merge_phi, guard_arg, guard_edge, UNKNOWN_LOCATION);
2141
2142 /* Update the original phi in exit_bb. */
2143 adjust_phi_and_debug_stmts (update_phi, e, new_res);
2144 }
2145 }
2146
2147 /* The EPILOG loop is duplicated from the original loop for vectorizing;
2148 the arg of its loop-closed ssa PHI needs to be updated.  */
2149
2150 static void
2151 slpeel_update_phi_nodes_for_lcssa (struct loop *epilog)
2152 {
2153 gphi_iterator gsi;
2154 basic_block exit_bb = single_exit (epilog)->dest;
2155
2156 gcc_assert (single_pred_p (exit_bb));
2157 edge e = EDGE_PRED (exit_bb, 0);
2158 for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi); gsi_next (&gsi))
2159 rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (gsi.phi (), e));
2160 }
2161
2162 /* Function vect_do_peeling.
2163
2164 Input:
2165 - LOOP_VINFO: Represent a loop to be vectorized, which looks like:
2166
2167 preheader:
2168 LOOP:
2169 header_bb:
2170 loop_body
2171 if (exit_loop_cond) goto exit_bb
2172 else goto header_bb
2173 exit_bb:
2174
2175 - NITERS: The number of iterations of the loop.
2176 - NITERSM1: The number of iterations of the loop's latch.
2177 - NITERS_NO_OVERFLOW: No overflow in computing NITERS.
2178 - TH, CHECK_PROFITABILITY: Threshold of niters to vectorize loop if
2179 CHECK_PROFITABILITY is true.
2180 Output:
2181 - *NITERS_VECTOR and *STEP_VECTOR describe how the main loop should
2182 iterate after vectorization; see vect_set_loop_condition for details.
2183 - *NITERS_VECTOR_MULT_VF_VAR is either null or an SSA name that
2184 should be set to the number of scalar iterations handled by the
2185 vector loop. The SSA name is only used on exit from the loop.
2186
2187 This function peels prolog and epilog from the loop, adds guards skipping
2188 PROLOG and EPILOG for various conditions. As a result, the changed CFG
2189 would look like:
2190
2191 guard_bb_1:
2192 if (prefer_scalar_loop) goto merge_bb_1
2193 else goto guard_bb_2
2194
2195 guard_bb_2:
2196 if (skip_prolog) goto merge_bb_2
2197 else goto prolog_preheader
2198
2199 prolog_preheader:
2200 PROLOG:
2201 prolog_header_bb:
2202 prolog_body
2203 if (exit_prolog_cond) goto prolog_exit_bb
2204 else goto prolog_header_bb
2205 prolog_exit_bb:
2206
2207 merge_bb_2:
2208
2209 vector_preheader:
2210 VECTOR LOOP:
2211 vector_header_bb:
2212 vector_body
2213 if (exit_vector_cond) goto vector_exit_bb
2214 else goto vector_header_bb
2215 vector_exit_bb:
2216
2217 guard_bb_3:
2218 if (skip_epilog) goto merge_bb_3
2219 else goto epilog_preheader
2220
2221 merge_bb_1:
2222
2223 epilog_preheader:
2224 EPILOG:
2225 epilog_header_bb:
2226 epilog_body
2227 if (exit_epilog_cond) goto merge_bb_3
2228 else goto epilog_header_bb
2229
2230 merge_bb_3:
2231
2232 Note this function peels the prolog and epilog, and adds the guards,
2233 only when necessary.
2234 Returns the created epilogue or NULL.
2235
2236 TODO: Guard for prefer_scalar_loop should be emitted along with
2237 versioning conditions if loop versioning is needed. */
2238
2239
2240 struct loop *
2241 vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
2242 tree *niters_vector, tree *step_vector,
2243 tree *niters_vector_mult_vf_var, int th,
2244 bool check_profitability, bool niters_no_overflow)
2245 {
2246 edge e, guard_e;
2247 tree type = TREE_TYPE (niters), guard_cond;
2248 basic_block guard_bb, guard_to;
2249 profile_probability prob_prolog, prob_vector, prob_epilog;
2250 int bound_prolog = 0;
2251 poly_uint64 bound_scalar = 0;
2252 int estimated_vf;
2253 int prolog_peeling = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2254 bool epilog_peeling = (LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
2255 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo));
2256
2257 if (!prolog_peeling && !epilog_peeling)
2258 return NULL;
2259
2260 prob_vector = profile_probability::guessed_always ().apply_scale (9, 10);
2261 estimated_vf = vect_vf_for_cost (loop_vinfo);
2262 if (estimated_vf == 2)
2263 estimated_vf = 3;
2264 prob_prolog = prob_epilog = profile_probability::guessed_always ()
2265 .apply_scale (estimated_vf - 1, estimated_vf);
2266 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2267
2268 struct loop *prolog, *epilog = NULL, *loop = LOOP_VINFO_LOOP (loop_vinfo);
2269 struct loop *first_loop = loop;
2270 bool irred_flag = loop_preheader_edge (loop)->flags & EDGE_IRREDUCIBLE_LOOP;
2271 create_lcssa_for_virtual_phi (loop);
2272 update_ssa (TODO_update_ssa_only_virtuals);
2273
2274 if (MAY_HAVE_DEBUG_BIND_STMTS)
2275 {
2276 gcc_assert (!adjust_vec.exists ());
2277 adjust_vec.create (32);
2278 }
2279 initialize_original_copy_tables ();
2280
2281 /* Prolog loop may be skipped. */
2282 bool skip_prolog = (prolog_peeling != 0);
2283 /* Skip to epilog if scalar loop may be preferred. It's only needed
2284 when we peel for epilog loop and when it hasn't been checked with
2285 loop versioning. */
2286 bool skip_vector = ((!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2287 && !LOOP_REQUIRES_VERSIONING (loop_vinfo))
2288 || !vf.is_constant ());
2289 /* Epilog loop must be executed if the number of iterations for epilog
2290 loop is known at compile time, otherwise we need to add a check at
2291 the end of vector loop and skip to the end of epilog loop. */
2292 bool skip_epilog = (prolog_peeling < 0
2293 || !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2294 || !vf.is_constant ());
2295 /* PEELING_FOR_GAPS is special because epilog loop must be executed. */
2296 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2297 skip_epilog = false;
2298
2299 /* Record the anchor bb at which guard should be placed if scalar loop
2300 may be preferred. */
2301 basic_block anchor = loop_preheader_edge (loop)->src;
2302 if (skip_vector)
2303 {
2304 split_edge (loop_preheader_edge (loop));
2305
2306 /* Due to the order in which we peel prolog and epilog, we first
2307 propagate probability to the whole loop. The purpose is to
2308 avoid adjusting probabilities of both prolog and vector loops
2309 separately. Note in this case, the probability of epilog loop
2310 needs to be scaled back later. */
2311 basic_block bb_before_loop = loop_preheader_edge (loop)->src;
2312 if (prob_vector.initialized_p ())
2313 {
2314 scale_bbs_frequencies (&bb_before_loop, 1, prob_vector);
2315 scale_loop_profile (loop, prob_vector, 0);
2316 }
2317 }
2318
2319 tree niters_prolog = build_int_cst (type, 0);
2320 source_location loop_loc = find_loop_location (loop);
2321 struct loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
2322 if (prolog_peeling)
2323 {
2324 e = loop_preheader_edge (loop);
2325 if (!slpeel_can_duplicate_loop_p (loop, e))
2326 {
2327 dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2328 "loop can't be duplicated to preheader edge.\n");
2329 gcc_unreachable ();
2330 }
2331 /* Peel prolog and put it on preheader edge of loop. */
2332 prolog = slpeel_tree_duplicate_loop_to_edge_cfg (loop, scalar_loop, e);
2333 if (!prolog)
2334 {
2335 dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2336 "slpeel_tree_duplicate_loop_to_edge_cfg failed.\n");
2337 gcc_unreachable ();
2338 }
2339 slpeel_update_phi_nodes_for_loops (loop_vinfo, prolog, loop, true);
2340 first_loop = prolog;
2341 reset_original_copy_tables ();
2342
2343 /* Generate and update the number of iterations for prolog loop. */
2344 niters_prolog = vect_gen_prolog_loop_niters (loop_vinfo, anchor,
2345 &bound_prolog);
2346 tree step_prolog = build_one_cst (TREE_TYPE (niters_prolog));
2347 vect_set_loop_condition (prolog, NULL, niters_prolog,
2348 step_prolog, NULL_TREE, false);
2349
2350 /* Skip the prolog loop. */
2351 if (skip_prolog)
2352 {
2353 guard_cond = fold_build2 (EQ_EXPR, boolean_type_node,
2354 niters_prolog, build_int_cst (type, 0));
2355 guard_bb = loop_preheader_edge (prolog)->src;
2356 basic_block bb_after_prolog = loop_preheader_edge (loop)->src;
2357 guard_to = split_edge (loop_preheader_edge (loop));
2358 guard_e = slpeel_add_loop_guard (guard_bb, guard_cond,
2359 guard_to, guard_bb,
2360 prob_prolog.invert (),
2361 irred_flag);
2362 e = EDGE_PRED (guard_to, 0);
2363 e = (e != guard_e ? e : EDGE_PRED (guard_to, 1));
2364 slpeel_update_phi_nodes_for_guard1 (prolog, loop, guard_e, e);
2365
2366 scale_bbs_frequencies (&bb_after_prolog, 1, prob_prolog);
2367 scale_loop_profile (prolog, prob_prolog, bound_prolog);
2368 }
2369 /* Update init address of DRs. */
2370 vect_update_inits_of_drs (loop_vinfo, niters_prolog);
2371 /* Update niters for vector loop. */
2372 LOOP_VINFO_NITERS (loop_vinfo)
2373 = fold_build2 (MINUS_EXPR, type, niters, niters_prolog);
2374 LOOP_VINFO_NITERSM1 (loop_vinfo)
2375 = fold_build2 (MINUS_EXPR, type,
2376 LOOP_VINFO_NITERSM1 (loop_vinfo), niters_prolog);
2377 bool new_var_p = false;
2378 niters = vect_build_loop_niters (loop_vinfo, &new_var_p);
2379 /* It's guaranteed that vector loop bound before vectorization is at
2380 least VF, so set range information for newly generated var. */
2381 if (new_var_p)
2382 set_range_info (niters, VR_RANGE,
2383 wi::to_wide (build_int_cst (type, vf)),
2384 wi::to_wide (TYPE_MAX_VALUE (type)));
2385
2386 /* Prolog iterates at most bound_prolog times, latch iterates at
2387 most bound_prolog - 1 times. */
2388 record_niter_bound (prolog, bound_prolog - 1, false, true);
2389 delete_update_ssa ();
2390 adjust_vec_debug_stmts ();
2391 scev_reset ();
2392 }
2393
2394 if (epilog_peeling)
2395 {
2396 e = single_exit (loop);
2397 if (!slpeel_can_duplicate_loop_p (loop, e))
2398 {
2399 dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2400 "loop can't be duplicated to exit edge.\n");
2401 gcc_unreachable ();
2402 }
2403 /* Peel epilog and put it on exit edge of loop. */
2404 epilog = slpeel_tree_duplicate_loop_to_edge_cfg (loop, scalar_loop, e);
2405 if (!epilog)
2406 {
2407 dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2408 "slpeel_tree_duplicate_loop_to_edge_cfg failed.\n");
2409 gcc_unreachable ();
2410 }
2411 slpeel_update_phi_nodes_for_loops (loop_vinfo, loop, epilog, false);
2412
2413 /* The scalar version of the loop may be preferred.  In this case, add
2414 a guard and skip to the epilog.  Note this only happens when the number
2415 of iterations of the loop is unknown at compile time; otherwise this
2416 won't be vectorized.  */
2417 if (skip_vector)
2418 {
2419 /* Additional epilogue iteration is peeled if gap exists. */
2420 bool peel_for_gaps = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo);
2421 tree t = vect_gen_scalar_loop_niters (niters_prolog, prolog_peeling,
2422 bound_prolog,
2423 peel_for_gaps ? vf : vf - 1,
2424 th, &bound_scalar,
2425 check_profitability);
2426 /* Build guard against NITERSM1 since NITERS may overflow. */
2427 guard_cond = fold_build2 (LT_EXPR, boolean_type_node, nitersm1, t);
2428 guard_bb = anchor;
2429 guard_to = split_edge (loop_preheader_edge (epilog));
2430 guard_e = slpeel_add_loop_guard (guard_bb, guard_cond,
2431 guard_to, guard_bb,
2432 prob_vector.invert (),
2433 irred_flag);
2434 e = EDGE_PRED (guard_to, 0);
2435 e = (e != guard_e ? e : EDGE_PRED (guard_to, 1));
2436 slpeel_update_phi_nodes_for_guard1 (first_loop, epilog, guard_e, e);
2437
2438 /* Simply propagate profile info from guard_bb to guard_to which is
2439 a merge point of control flow. */
2440 guard_to->count = guard_bb->count;
2441
2442 /* Scale probability of epilog loop back.
2443 FIXME: We should avoid scaling down and back up. Profile may
2444 get lost if we scale down to 0. */
2445 basic_block *bbs = get_loop_body (epilog);
2446 for (unsigned int i = 0; i < epilog->num_nodes; i++)
2447 bbs[i]->count = bbs[i]->count.apply_scale
2448 (bbs[i]->count,
2449 bbs[i]->count.apply_probability
2450 (prob_vector));
2451 free (bbs);
2452 }
2453
2454 basic_block bb_before_epilog = loop_preheader_edge (epilog)->src;
2455 tree niters_vector_mult_vf;
2456 /* If the loop is peeled a non-zero constant number of times, niters now
2457 refers to orig_niters - prolog_peeling; it won't overflow even if
2458 orig_niters overflows.  */
2459 niters_no_overflow |= (prolog_peeling > 0);
2460 vect_gen_vector_loop_niters (loop_vinfo, niters,
2461 niters_vector, step_vector,
2462 niters_no_overflow);
2463 if (!integer_onep (*step_vector))
2464 {
2465 /* On exit from the loop we will have an easy way of calculating
2466 NITERS_VECTOR / STEP * STEP.  Install a dummy definition
2467 until then.  */
2468 niters_vector_mult_vf = make_ssa_name (TREE_TYPE (*niters_vector));
2469 SSA_NAME_DEF_STMT (niters_vector_mult_vf) = gimple_build_nop ();
2470 *niters_vector_mult_vf_var = niters_vector_mult_vf;
2471 }
2472 else
2473 vect_gen_vector_loop_niters_mult_vf (loop_vinfo, *niters_vector,
2474 &niters_vector_mult_vf);
2475 /* Update IVs of original loop as if they were advanced by
2476 niters_vector_mult_vf steps. */
2477 gcc_checking_assert (vect_can_advance_ivs_p (loop_vinfo));
2478 edge update_e = skip_vector ? e : loop_preheader_edge (epilog);
2479 vect_update_ivs_after_vectorizer (loop_vinfo, niters_vector_mult_vf,
2480 update_e);
2481
2482 if (skip_epilog)
2483 {
2484 guard_cond = fold_build2 (EQ_EXPR, boolean_type_node,
2485 niters, niters_vector_mult_vf);
2486 guard_bb = single_exit (loop)->dest;
2487 guard_to = split_edge (single_exit (epilog));
2488 guard_e = slpeel_add_loop_guard (guard_bb, guard_cond, guard_to,
2489 skip_vector ? anchor : guard_bb,
2490 prob_epilog.invert (),
2491 irred_flag);
2492 slpeel_update_phi_nodes_for_guard2 (loop, epilog, guard_e,
2493 single_exit (epilog));
2494 /* Only need to handle basic block before epilog loop if it's not
2495 the guard_bb, which is the case when skip_vector is true. */
2496 if (guard_bb != bb_before_epilog)
2497 {
2498 prob_epilog = prob_vector * prob_epilog + prob_vector.invert ();
2499
2500 scale_bbs_frequencies (&bb_before_epilog, 1, prob_epilog);
2501 }
2502 scale_loop_profile (epilog, prob_epilog, 0);
2503 }
2504 else
2505 slpeel_update_phi_nodes_for_lcssa (epilog);
2506
2507 unsigned HOST_WIDE_INT bound1, bound2;
2508 if (vf.is_constant (&bound1) && bound_scalar.is_constant (&bound2))
2509 {
2510 bound1 -= LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 2;
2511 if (bound2)
2512 /* We share epilog loop with scalar version loop. */
2513 bound1 = MAX (bound1, bound2 - 1);
2514 record_niter_bound (epilog, bound1, false, true);
2515 }
2516
2517 delete_update_ssa ();
2518 adjust_vec_debug_stmts ();
2519 scev_reset ();
2520 }
2521 adjust_vec.release ();
2522 free_original_copy_tables ();
2523
2524 return epilog;
2525 }
2526
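/* Illustrative sketch only (not part of GCC): roughly the run-time shape
   that vect_do_peeling above gives a simple loop such as
   "for (i = 0; i < n; i++) a[i] += 1", assuming VF == 4 and that both a
   prolog (for alignment) and an epilog (for leftover iterations) are
   peeled.  The guards and probabilities from the CFG sketch above are
   omitted, and the "vector" body is shown as four scalar updates; all
   names are hypothetical.  */

static void
example_peeled_loop_shape (int *a, unsigned int n, unsigned int prolog_niters)
{
  unsigned int i = 0;
  unsigned int vec_end;

  /* PROLOG: a few scalar iterations so the vector loop starts aligned.  */
  for (; i < prolog_niters && i < n; i++)
    a[i] += 1;

  /* VECTOR LOOP: handles 4 elements per iteration.  */
  vec_end = i + ((n - i) / 4) * 4;
  for (; i < vec_end; i += 4)
    {
      a[i] += 1;
      a[i + 1] += 1;
      a[i + 2] += 1;
      a[i + 3] += 1;
    }

  /* EPILOG: scalar loop for the remaining iterations.  */
  for (; i < n; i++)
    a[i] += 1;
}
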
2527 /* Function vect_create_cond_for_niters_checks.
2528
2529 Create a conditional expression that represents the run-time checks for
2530 the loop's niter.  The loop is guaranteed to terminate if the run-time
2531 checks hold.
2532
2533 Input:
2534 COND_EXPR - input conditional expression.  New conditions will be chained
2535 with logical AND operation.
2536 LOOP_VINFO - field LOOP_VINFO_NITERS_ASSUMPTIONS contains the
2537 assumptions on the number of iterations that need to be
2538 checked at run time.
2539
2540 Output:
2541 COND_EXPR - conditional expression.
2542
2543 The returned COND_EXPR is the conditional expression to be used in the
2544 if statement that controls which version of the loop gets executed at
2545 runtime. */
2546
2547 static void
2548 vect_create_cond_for_niters_checks (loop_vec_info loop_vinfo, tree *cond_expr)
2549 {
2550 tree part_cond_expr = LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo);
2551
2552 if (*cond_expr)
2553 *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
2554 *cond_expr, part_cond_expr);
2555 else
2556 *cond_expr = part_cond_expr;
2557 }
2558
2559 /* Set *COND_EXPR to a tree that is true when both the original *COND_EXPR
2560 and PART_COND_EXPR are true. Treat a null *COND_EXPR as "true". */
2561
2562 static void
2563 chain_cond_expr (tree *cond_expr, tree part_cond_expr)
2564 {
2565 if (*cond_expr)
2566 *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
2567 *cond_expr, part_cond_expr);
2568 else
2569 *cond_expr = part_cond_expr;
2570 }
2571
2572 /* Function vect_create_cond_for_align_checks.
2573
2574 Create a conditional expression that represents the alignment checks for
2575 all of the data references (array element references) whose alignment
2576 must be checked at runtime.
2577
2578 Input:
2579 COND_EXPR - input conditional expression. New conditions will be chained
2580 with logical AND operation.
2581 LOOP_VINFO - two fields of the loop information are used.
2582 LOOP_VINFO_PTR_MASK is the mask used to check the alignment.
2583 LOOP_VINFO_MAY_MISALIGN_STMTS contains the refs to be checked.
2584
2585 Output:
2586 COND_EXPR_STMT_LIST - statements needed to construct the conditional
2587 expression.
2588 The returned value is the conditional expression to be used in the if
2589 statement that controls which version of the loop gets executed at runtime.
2590
2591 The algorithm makes two assumptions:
2592 1) The number of bytes "n" in a vector is a power of 2.
2593 2) An address "a" is aligned if a%n is zero, and this
2594 test can be done as a&(n-1) == 0.  For example, for 16
2595 byte vectors the test is a&0xf == 0.  */
2596
2597 static void
2598 vect_create_cond_for_align_checks (loop_vec_info loop_vinfo,
2599 tree *cond_expr,
2600 gimple_seq *cond_expr_stmt_list)
2601 {
2602 vec<gimple *> may_misalign_stmts
2603 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
2604 gimple *ref_stmt;
2605 int mask = LOOP_VINFO_PTR_MASK (loop_vinfo);
2606 tree mask_cst;
2607 unsigned int i;
2608 tree int_ptrsize_type;
2609 char tmp_name[20];
2610 tree or_tmp_name = NULL_TREE;
2611 tree and_tmp_name;
2612 gimple *and_stmt;
2613 tree ptrsize_zero;
2614 tree part_cond_expr;
2615
2616 /* Check that mask is one less than a power of 2, i.e., mask is
2617 all zeros followed by all ones. */
2618 gcc_assert ((mask != 0) && ((mask & (mask+1)) == 0));
2619
2620 int_ptrsize_type = signed_type_for (ptr_type_node);
2621
2622 /* Create expression (mask & (dr_1 | ... | dr_n)) where dr_i is the address
2623 of the first vector of the i'th data reference.  */
2624
2625 FOR_EACH_VEC_ELT (may_misalign_stmts, i, ref_stmt)
2626 {
2627 gimple_seq new_stmt_list = NULL;
2628 tree addr_base;
2629 tree addr_tmp_name;
2630 tree new_or_tmp_name;
2631 gimple *addr_stmt, *or_stmt;
2632 stmt_vec_info stmt_vinfo = vinfo_for_stmt (ref_stmt);
2633 tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
2634 bool negative = tree_int_cst_compare
2635 (DR_STEP (STMT_VINFO_DATA_REF (stmt_vinfo)), size_zero_node) < 0;
2636 tree offset = negative
2637 ? size_int (-TYPE_VECTOR_SUBPARTS (vectype) + 1) : size_zero_node;
2638
2639 /* create: addr_tmp = (int)(address_of_first_vector) */
2640 addr_base =
2641 vect_create_addr_base_for_vector_ref (ref_stmt, &new_stmt_list,
2642 offset);
2643 if (new_stmt_list != NULL)
2644 gimple_seq_add_seq (cond_expr_stmt_list, new_stmt_list);
2645
2646 sprintf (tmp_name, "addr2int%d", i);
2647 addr_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, tmp_name);
2648 addr_stmt = gimple_build_assign (addr_tmp_name, NOP_EXPR, addr_base);
2649 gimple_seq_add_stmt (cond_expr_stmt_list, addr_stmt);
2650
2651 /* The addresses are ORed together.  */
2652
2653 if (or_tmp_name != NULL_TREE)
2654 {
2655 /* create: or_tmp = or_tmp | addr_tmp */
2656 sprintf (tmp_name, "orptrs%d", i);
2657 new_or_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, tmp_name);
2658 or_stmt = gimple_build_assign (new_or_tmp_name, BIT_IOR_EXPR,
2659 or_tmp_name, addr_tmp_name);
2660 gimple_seq_add_stmt (cond_expr_stmt_list, or_stmt);
2661 or_tmp_name = new_or_tmp_name;
2662 }
2663 else
2664 or_tmp_name = addr_tmp_name;
2665
2666 } /* end for i */
2667
2668 mask_cst = build_int_cst (int_ptrsize_type, mask);
2669
2670 /* create: and_tmp = or_tmp & mask */
2671 and_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, "andmask");
2672
2673 and_stmt = gimple_build_assign (and_tmp_name, BIT_AND_EXPR,
2674 or_tmp_name, mask_cst);
2675 gimple_seq_add_stmt (cond_expr_stmt_list, and_stmt);
2676
2677 /* Make and_tmp the left operand of the conditional test against zero.
2678 If and_tmp has a nonzero bit then some address is unaligned.  */
2679 ptrsize_zero = build_int_cst (int_ptrsize_type, 0);
2680 part_cond_expr = fold_build2 (EQ_EXPR, boolean_type_node,
2681 and_tmp_name, ptrsize_zero);
2682 chain_cond_expr (cond_expr, part_cond_expr);
2683 }
2684
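/* Illustrative sketch only (not part of GCC): the check built by
   vect_create_cond_for_align_checks above, in plain arithmetic.  The
   addresses of all data references that might be misaligned are ORed
   together and masked; the result is zero only if every address is
   aligned to N bytes, where N (and hence MASK == N - 1) is a power of
   two.  Addresses are passed here as integers and all names are
   hypothetical.  */

static int
example_all_addresses_aligned (const unsigned long long *addrs,
                               unsigned int count, unsigned long long mask)
{
  unsigned long long or_tmp = 0;
  unsigned int i;

  for (i = 0; i < count; i++)
    or_tmp |= addrs[i];

  return (or_tmp & mask) == 0;
}
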
2685 /* If LOOP_VINFO_CHECK_UNEQUAL_ADDRS contains <A1, B1>, ..., <An, Bn>,
2686 create a tree representation of: (&A1 != &B1) && ... && (&An != &Bn).
2687 Set *COND_EXPR to a tree that is true when both the original *COND_EXPR
2688 and this new condition are true. Treat a null *COND_EXPR as "true". */
2689
2690 static void
2691 vect_create_cond_for_unequal_addrs (loop_vec_info loop_vinfo, tree *cond_expr)
2692 {
2693 vec<vec_object_pair> pairs = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
2694 unsigned int i;
2695 vec_object_pair *pair;
2696 FOR_EACH_VEC_ELT (pairs, i, pair)
2697 {
2698 tree addr1 = build_fold_addr_expr (pair->first);
2699 tree addr2 = build_fold_addr_expr (pair->second);
2700 tree part_cond_expr = fold_build2 (NE_EXPR, boolean_type_node,
2701 addr1, addr2);
2702 chain_cond_expr (cond_expr, part_cond_expr);
2703 }
2704 }
2705
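/* Illustrative sketch only (not part of GCC): the condition built by
   vect_create_cond_for_unequal_addrs above, written for flat arrays of
   address pairs passed as integers.  The check succeeds only if every
   recorded pair refers to distinct addresses, i.e. &A_i != &B_i for all
   i.  All names are hypothetical.  */

static int
example_all_addresses_unequal (const unsigned long long *first_addrs,
                               const unsigned long long *second_addrs,
                               unsigned int count)
{
  unsigned int i;

  for (i = 0; i < count; i++)
    if (first_addrs[i] == second_addrs[i])
      return 0;
  return 1;
}
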
2706 /* Function vect_create_cond_for_alias_checks.
2707
2708 Create a conditional expression that represents the run-time checks for
2709 overlapping of address ranges represented by a list of data reference
2710 relations passed as input.
2711
2712 Input:
2713 COND_EXPR - input conditional expression. New conditions will be chained
2714 with logical AND operation. If it is NULL, then the function
2715 is used to return the number of alias checks.
2716 LOOP_VINFO - field LOOP_VINFO_MAY_ALIAS_STMTS contains the list of ddrs
2717 to be checked.
2718
2719 Output:
2720 COND_EXPR - conditional expression.
2721
2722 The returned COND_EXPR is the conditional expression to be used in the if
2723 statement that controls which version of the loop gets executed at runtime.
2724 */
2725
2726 void
2727 vect_create_cond_for_alias_checks (loop_vec_info loop_vinfo, tree * cond_expr)
2728 {
2729 vec<dr_with_seg_len_pair_t> comp_alias_ddrs =
2730 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
2731
2732 if (comp_alias_ddrs.is_empty ())
2733 return;
2734
2735 create_runtime_alias_checks (LOOP_VINFO_LOOP (loop_vinfo),
2736 &comp_alias_ddrs, cond_expr);
2737 if (dump_enabled_p ())
2738 dump_printf_loc (MSG_NOTE, vect_location,
2739 "created %u versioning for alias checks.\n",
2740 comp_alias_ddrs.length ());
2741 }
2742
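/* Illustrative sketch only (not part of GCC): one common form of the
   run-time alias test described above.  Two accessed segments
   [A, A + LEN_A) and [B, B + LEN_B) are independent if one ends before
   the other begins.  The checks actually emitted by
   create_runtime_alias_checks may take a different form; the addresses
   here are passed as integers and all names are hypothetical.  */

static int
example_segments_do_not_overlap (unsigned long long a, unsigned long long len_a,
                                 unsigned long long b, unsigned long long len_b)
{
  return a + len_a <= b || b + len_b <= a;
}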
2743
2744 /* Function vect_loop_versioning.
2745
2746 If the loop has data references that may or may not be aligned and/or
2747 has data reference relations whose independence was not proven, then
2748 two versions of the loop need to be generated, one which is vectorized
2749 and one which isn't.  A test is then generated to control which of the
2750 loops is executed.  The test checks for the alignment of all of the
2751 data references that may or may not be aligned.  An additional
2752 sequence of runtime tests is generated for each pair of DDRs whose
2753 independence was not proven.  The vectorized version of the loop is
2754 executed only if both the alias and alignment tests pass.
2755
2756 The test generated to check which version of loop is executed
2757 is modified to also check for profitability as indicated by the
2758 cost model threshold TH.
2759
2760 The versioning precondition(s) are placed in *COND_EXPR and
2761 *COND_EXPR_STMT_LIST. */
2762
2763 void
2764 vect_loop_versioning (loop_vec_info loop_vinfo,
2765 unsigned int th, bool check_profitability,
2766 poly_uint64 versioning_threshold)
2767 {
2768 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *nloop;
2769 struct loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
2770 basic_block condition_bb;
2771 gphi_iterator gsi;
2772 gimple_stmt_iterator cond_exp_gsi;
2773 basic_block merge_bb;
2774 basic_block new_exit_bb;
2775 edge new_exit_e, e;
2776 gphi *orig_phi, *new_phi;
2777 tree cond_expr = NULL_TREE;
2778 gimple_seq cond_expr_stmt_list = NULL;
2779 tree arg;
2780 profile_probability prob = profile_probability::likely ();
2781 gimple_seq gimplify_stmt_list = NULL;
2782 tree scalar_loop_iters = LOOP_VINFO_NITERSM1 (loop_vinfo);
2783 bool version_align = LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo);
2784 bool version_alias = LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo);
2785 bool version_niter = LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo);
2786
2787 if (check_profitability)
2788 cond_expr = fold_build2 (GE_EXPR, boolean_type_node, scalar_loop_iters,
2789 build_int_cst (TREE_TYPE (scalar_loop_iters),
2790 th - 1));
2791 if (maybe_ne (versioning_threshold, 0U))
2792 {
2793 tree expr = fold_build2 (GE_EXPR, boolean_type_node, scalar_loop_iters,
2794 build_int_cst (TREE_TYPE (scalar_loop_iters),
2795 versioning_threshold - 1));
2796 if (cond_expr)
2797 cond_expr = fold_build2 (BIT_AND_EXPR, boolean_type_node,
2798 expr, cond_expr);
2799 else
2800 cond_expr = expr;
2801 }
2802
2803 if (version_niter)
2804 vect_create_cond_for_niters_checks (loop_vinfo, &cond_expr);
2805
2806 if (cond_expr)
2807 cond_expr = force_gimple_operand_1 (cond_expr, &cond_expr_stmt_list,
2808 is_gimple_condexpr, NULL_TREE);
2809
2810 if (version_align)
2811 vect_create_cond_for_align_checks (loop_vinfo, &cond_expr,
2812 &cond_expr_stmt_list);
2813
2814 if (version_alias)
2815 {
2816 vect_create_cond_for_unequal_addrs (loop_vinfo, &cond_expr);
2817 vect_create_cond_for_alias_checks (loop_vinfo, &cond_expr);
2818 }
2819
2820 cond_expr = force_gimple_operand_1 (cond_expr, &gimplify_stmt_list,
2821 is_gimple_condexpr, NULL_TREE);
2822 gimple_seq_add_seq (&cond_expr_stmt_list, gimplify_stmt_list);
2823
2824 initialize_original_copy_tables ();
2825 if (scalar_loop)
2826 {
2827 edge scalar_e;
2828 basic_block preheader, scalar_preheader;
2829
2830 /* We don't want to scale SCALAR_LOOP's frequencies; we need to
2831 scale LOOP's frequencies instead.  */
2832 nloop = loop_version (scalar_loop, cond_expr, &condition_bb,
2833 prob, prob.invert (), prob, prob.invert (), true);
2834 scale_loop_frequencies (loop, prob);
2835 /* CONDITION_BB was created above SCALAR_LOOP's preheader,
2836 while we need to move it above LOOP's preheader. */
2837 e = loop_preheader_edge (loop);
2838 scalar_e = loop_preheader_edge (scalar_loop);
2839 gcc_assert (empty_block_p (e->src)
2840 && single_pred_p (e->src));
2841 gcc_assert (empty_block_p (scalar_e->src)
2842 && single_pred_p (scalar_e->src));
2843 gcc_assert (single_pred_p (condition_bb));
2844 preheader = e->src;
2845 scalar_preheader = scalar_e->src;
2846 scalar_e = find_edge (condition_bb, scalar_preheader);
2847 e = single_pred_edge (preheader);
2848 redirect_edge_and_branch_force (single_pred_edge (condition_bb),
2849 scalar_preheader);
2850 redirect_edge_and_branch_force (scalar_e, preheader);
2851 redirect_edge_and_branch_force (e, condition_bb);
2852 set_immediate_dominator (CDI_DOMINATORS, condition_bb,
2853 single_pred (condition_bb));
2854 set_immediate_dominator (CDI_DOMINATORS, scalar_preheader,
2855 single_pred (scalar_preheader));
2856 set_immediate_dominator (CDI_DOMINATORS, preheader,
2857 condition_bb);
2858 }
2859 else
2860 nloop = loop_version (loop, cond_expr, &condition_bb,
2861 prob, prob.invert (), prob, prob.invert (), true);
2862
2863 if (version_niter)
2864 {
2865 /* The versioned loop could be infinite; we need to clear the existing
2866 niter information, which is copied from the original loop.  */
2867 gcc_assert (loop_constraint_set_p (loop, LOOP_C_FINITE));
2868 vect_free_loop_info_assumptions (nloop);
2869 /* And set constraint LOOP_C_INFINITE for niter analyzer. */
2870 loop_constraint_set (loop, LOOP_C_INFINITE);
2871 }
2872
2873 if (LOCATION_LOCUS (vect_location) != UNKNOWN_LOCATION
2874 && dump_enabled_p ())
2875 {
2876 if (version_alias)
2877 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
2878 "loop versioned for vectorization because of "
2879 "possible aliasing\n");
2880 if (version_align)
2881 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
2882 "loop versioned for vectorization to enhance "
2883 "alignment\n");
2884
2885 }
2886 free_original_copy_tables ();
2887
2888 /* Loop versioning violates an assumption we try to maintain during
2889 vectorization - that the loop exit block has a single predecessor.
2890 After versioning, the exit block of both loop versions is the same
2891 basic block (i.e. it has two predecessors).  In order to simplify the
2892 following transformations in the vectorizer, we fix this situation
2893 here by adding a new (empty) block on the exit-edge of the loop,
2894 with the proper loop-exit phis to maintain loop-closed-form.
2895 If loop versioning wasn't done from loop, but from scalar_loop instead,
2896 merge_bb will already have just a single predecessor.  */
2897
2898 merge_bb = single_exit (loop)->dest;
2899 if (scalar_loop == NULL || EDGE_COUNT (merge_bb->preds) >= 2)
2900 {
2901 gcc_assert (EDGE_COUNT (merge_bb->preds) >= 2);
2902 new_exit_bb = split_edge (single_exit (loop));
2903 new_exit_e = single_exit (loop);
2904 e = EDGE_SUCC (new_exit_bb, 0);
2905
2906 for (gsi = gsi_start_phis (merge_bb); !gsi_end_p (gsi); gsi_next (&gsi))
2907 {
2908 tree new_res;
2909 orig_phi = gsi.phi ();
2910 new_res = copy_ssa_name (PHI_RESULT (orig_phi));
2911 new_phi = create_phi_node (new_res, new_exit_bb);
2912 arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, e);
2913 add_phi_arg (new_phi, arg, new_exit_e,
2914 gimple_phi_arg_location_from_edge (orig_phi, e));
2915 adjust_phi_and_debug_stmts (orig_phi, e, PHI_RESULT (new_phi));
2916 }
2917 }
2918
2919 /* End loop-exit-fixes after versioning. */
2920
2921 if (cond_expr_stmt_list)
2922 {
2923 cond_exp_gsi = gsi_last_bb (condition_bb);
2924 gsi_insert_seq_before (&cond_exp_gsi, cond_expr_stmt_list,
2925 GSI_SAME_STMT);
2926 }
2927 update_ssa (TODO_update_ssa);
2928 }
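
/* Illustrative sketch only (not part of GCC): roughly the run-time shape
   produced by vect_loop_versioning above for a simple copy loop.  The
   single CHECKS_HOLD flag stands for the chained profitability, niter,
   alignment and alias conditions built in COND_EXPR; the "vectorized"
   body is shown as a scalar loop for brevity.  All names are
   hypothetical.  */

static void
example_versioned_loop_shape (int *dst, const int *src, unsigned int n,
                              int checks_hold)
{
  unsigned int i;

  if (checks_hold)
    {
      /* Version selected when all run-time checks pass; this is the copy
         that the vectorizer transforms.  */
      for (i = 0; i < n; i++)
        dst[i] = src[i];
    }
  else
    {
      /* Fallback scalar version, left untouched.  */
      for (i = 0; i < n; i++)
        dst[i] = src[i];
    }
}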