gcc/tree-vect-loop-manip.c
/* Vectorizer Specific Loop Manipulations
   Copyright (C) 2003-2021 Free Software Foundation, Inc.
   Contributed by Dorit Naishlos <dorit@il.ibm.com>
   and Ira Rosen <irar@il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */

#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "tree.h"
#include "gimple.h"
#include "cfghooks.h"
#include "tree-pass.h"
#include "ssa.h"
#include "fold-const.h"
#include "cfganal.h"
#include "gimplify.h"
#include "gimple-iterator.h"
#include "gimplify-me.h"
#include "tree-cfg.h"
#include "tree-ssa-loop-manip.h"
#include "tree-into-ssa.h"
#include "tree-ssa.h"
#include "cfgloop.h"
#include "tree-scalar-evolution.h"
#include "tree-vectorizer.h"
#include "tree-ssa-loop-ivopts.h"
#include "gimple-fold.h"
#include "tree-ssa-loop-niter.h"
#include "internal-fn.h"
#include "stor-layout.h"
#include "optabs-query.h"
#include "vec-perm-indices.h"
#include "insn-config.h"
#include "rtl.h"
#include "recog.h"

/*************************************************************************
  Simple Loop Peeling Utilities

  Utilities to support loop peeling for vectorization purposes.
 *************************************************************************/


/* Renames the use *OP_P.  */

static void
rename_use_op (use_operand_p op_p)
{
  tree new_name;

  if (TREE_CODE (USE_FROM_PTR (op_p)) != SSA_NAME)
    return;

  new_name = get_current_def (USE_FROM_PTR (op_p));

  /* Something defined outside of the loop.  */
  if (!new_name)
    return;

  /* An ordinary ssa name defined in the loop.  */

  SET_USE (op_p, new_name);
}

/* Renames the variables in basic block BB.  Allow renaming of PHI arguments
   on edges incoming from outer-block header if RENAME_FROM_OUTER_LOOP is
   true.  */

static void
rename_variables_in_bb (basic_block bb, bool rename_from_outer_loop)
{
  gimple *stmt;
  use_operand_p use_p;
  ssa_op_iter iter;
  edge e;
  edge_iterator ei;
  class loop *loop = bb->loop_father;
  class loop *outer_loop = NULL;

  if (rename_from_outer_loop)
    {
      gcc_assert (loop);
      outer_loop = loop_outer (loop);
    }

  for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
       gsi_next (&gsi))
    {
      stmt = gsi_stmt (gsi);
      FOR_EACH_SSA_USE_OPERAND (use_p, stmt, iter, SSA_OP_ALL_USES)
	rename_use_op (use_p);
    }

  FOR_EACH_EDGE (e, ei, bb->preds)
    {
      if (!flow_bb_inside_loop_p (loop, e->src))
	{
	  if (!rename_from_outer_loop)
	    continue;
	  if (e->src != outer_loop->header)
	    {
	      if (outer_loop->inner->next)
		{
		  /* If outer_loop has 2 inner loops, allow there to
		     be an extra basic block which decides which of the
		     two loops to use using LOOP_VECTORIZED.  */
		  if (!single_pred_p (e->src)
		      || single_pred (e->src) != outer_loop->header)
		    continue;
		}
	    }
	}
      for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
	   gsi_next (&gsi))
	rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (gsi.phi (), e));
    }
}

struct adjust_info
{
  tree from, to;
  basic_block bb;
};

/* A stack of values to be adjusted in debug stmts.  We have to
   process them LIFO, so that the closest substitution applies.  If we
   processed them FIFO, without the stack, we might substitute uses
   with a PHI DEF that would soon become non-dominant, and when we got
   to the suitable one, it wouldn't have anything to substitute any
   more.  */
static vec<adjust_info, va_heap> adjust_vec;

/* Adjust any debug stmts that referenced AI->from values to use the
   loop-closed AI->to, if the references are dominated by AI->bb and
   not by the definition of AI->from.  */

static void
adjust_debug_stmts_now (adjust_info *ai)
{
  basic_block bbphi = ai->bb;
  tree orig_def = ai->from;
  tree new_def = ai->to;
  imm_use_iterator imm_iter;
  gimple *stmt;
  basic_block bbdef = gimple_bb (SSA_NAME_DEF_STMT (orig_def));

  gcc_assert (dom_info_available_p (CDI_DOMINATORS));

  /* Adjust any debug stmts that held onto non-loop-closed
     references.  */
  FOR_EACH_IMM_USE_STMT (stmt, imm_iter, orig_def)
    {
      use_operand_p use_p;
      basic_block bbuse;

      if (!is_gimple_debug (stmt))
	continue;

      gcc_assert (gimple_debug_bind_p (stmt));

      bbuse = gimple_bb (stmt);

      if ((bbuse == bbphi
	   || dominated_by_p (CDI_DOMINATORS, bbuse, bbphi))
	  && !(bbuse == bbdef
	       || dominated_by_p (CDI_DOMINATORS, bbuse, bbdef)))
	{
	  if (new_def)
	    FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
	      SET_USE (use_p, new_def);
	  else
	    {
	      gimple_debug_bind_reset_value (stmt);
	      update_stmt (stmt);
	    }
	}
    }
}

/* Adjust debug stmts as scheduled before.  */

static void
adjust_vec_debug_stmts (void)
{
  if (!MAY_HAVE_DEBUG_BIND_STMTS)
    return;

  gcc_assert (adjust_vec.exists ());

  while (!adjust_vec.is_empty ())
    {
      adjust_debug_stmts_now (&adjust_vec.last ());
      adjust_vec.pop ();
    }
}

/* Adjust any debug stmts that referenced FROM values to use the
   loop-closed TO, if the references are dominated by BB and not by
   the definition of FROM.  If adjust_vec is non-NULL, adjustments
   will be postponed until adjust_vec_debug_stmts is called.  */

static void
adjust_debug_stmts (tree from, tree to, basic_block bb)
{
  adjust_info ai;

  if (MAY_HAVE_DEBUG_BIND_STMTS
      && TREE_CODE (from) == SSA_NAME
      && ! SSA_NAME_IS_DEFAULT_DEF (from)
      && ! virtual_operand_p (from))
    {
      ai.from = from;
      ai.to = to;
      ai.bb = bb;

      if (adjust_vec.exists ())
	adjust_vec.safe_push (ai);
      else
	adjust_debug_stmts_now (&ai);
    }
}

/* Change E's phi arg in UPDATE_PHI to NEW_DEF, and record information
   to adjust any debug stmts that referenced the old phi arg,
   presumably non-loop-closed references left over from other
   transformations.  */

static void
adjust_phi_and_debug_stmts (gimple *update_phi, edge e, tree new_def)
{
  tree orig_def = PHI_ARG_DEF_FROM_EDGE (update_phi, e);

  SET_PHI_ARG_DEF (update_phi, e->dest_idx, new_def);

  if (MAY_HAVE_DEBUG_BIND_STMTS)
    adjust_debug_stmts (orig_def, PHI_RESULT (update_phi),
			gimple_bb (update_phi));
}

/* Define one loop rgroup control CTRL from loop LOOP.  INIT_CTRL is the value
   that the control should have during the first iteration and NEXT_CTRL is the
   value that it should have on subsequent iterations.  */

static void
vect_set_loop_control (class loop *loop, tree ctrl, tree init_ctrl,
		       tree next_ctrl)
{
  gphi *phi = create_phi_node (ctrl, loop->header);
  add_phi_arg (phi, init_ctrl, loop_preheader_edge (loop), UNKNOWN_LOCATION);
  add_phi_arg (phi, next_ctrl, loop_latch_edge (loop), UNKNOWN_LOCATION);
}

/* Add SEQ to the end of LOOP's preheader block.  */

static void
add_preheader_seq (class loop *loop, gimple_seq seq)
{
  if (seq)
    {
      edge pe = loop_preheader_edge (loop);
      basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
      gcc_assert (!new_bb);
    }
}

/* Add SEQ to the beginning of LOOP's header block.  */

static void
add_header_seq (class loop *loop, gimple_seq seq)
{
  if (seq)
    {
      gimple_stmt_iterator gsi = gsi_after_labels (loop->header);
      gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
    }
}

/* Return true if the target can interleave elements of two vectors.
   OFFSET is 0 if the first half of the vectors should be interleaved
   or 1 if the second half should.  When returning true, store the
   associated permutation in INDICES.  */

static bool
interleave_supported_p (vec_perm_indices *indices, tree vectype,
			unsigned int offset)
{
  poly_uint64 nelts = TYPE_VECTOR_SUBPARTS (vectype);
  poly_uint64 base = exact_div (nelts, 2) * offset;
  vec_perm_builder sel (nelts, 2, 3);
  for (unsigned int i = 0; i < 3; ++i)
    {
      sel.quick_push (base + i);
      sel.quick_push (base + i + nelts);
    }
  indices->new_vector (sel, 2, nelts);
  return can_vec_perm_const_p (TYPE_MODE (vectype), *indices);
}
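
/* Worked example (added for illustration; the vector width is assumed):
   for vectors with eight elements, OFFSET 0 encodes the selector
   { 0, 8, 1, 9, 2, 10, 3, 11 } and OFFSET 1 encodes
   { 4, 12, 5, 13, 6, 14, 7, 15 }, i.e. the low and high interleaves of
   two 8-element vectors.  The values follow from BASE == nelts / 2 * OFFSET
   and the two-pattern encoding pushed above.  */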

/* Try to use permutes to define the masks in DEST_RGM using the masks
   in SRC_RGM, given that the former has twice as many masks as the
   latter.  Return true on success, adding any new statements to SEQ.  */

static bool
vect_maybe_permute_loop_masks (gimple_seq *seq, rgroup_controls *dest_rgm,
			       rgroup_controls *src_rgm)
{
  tree src_masktype = src_rgm->type;
  tree dest_masktype = dest_rgm->type;
  machine_mode src_mode = TYPE_MODE (src_masktype);
  insn_code icode1, icode2;
  if (dest_rgm->max_nscalars_per_iter <= src_rgm->max_nscalars_per_iter
      && (icode1 = optab_handler (vec_unpacku_hi_optab,
				  src_mode)) != CODE_FOR_nothing
      && (icode2 = optab_handler (vec_unpacku_lo_optab,
				  src_mode)) != CODE_FOR_nothing)
    {
      /* Unpacking the source masks gives at least as many mask bits as
	 we need.  We can then VIEW_CONVERT any excess bits away.  */
      machine_mode dest_mode = insn_data[icode1].operand[0].mode;
      gcc_assert (dest_mode == insn_data[icode2].operand[0].mode);
      tree unpack_masktype = vect_halve_mask_nunits (src_masktype, dest_mode);
      for (unsigned int i = 0; i < dest_rgm->controls.length (); ++i)
	{
	  tree src = src_rgm->controls[i / 2];
	  tree dest = dest_rgm->controls[i];
	  tree_code code = ((i & 1) == (BYTES_BIG_ENDIAN ? 0 : 1)
			    ? VEC_UNPACK_HI_EXPR
			    : VEC_UNPACK_LO_EXPR);
	  gassign *stmt;
	  if (dest_masktype == unpack_masktype)
	    stmt = gimple_build_assign (dest, code, src);
	  else
	    {
	      tree temp = make_ssa_name (unpack_masktype);
	      stmt = gimple_build_assign (temp, code, src);
	      gimple_seq_add_stmt (seq, stmt);
	      stmt = gimple_build_assign (dest, VIEW_CONVERT_EXPR,
					  build1 (VIEW_CONVERT_EXPR,
						  dest_masktype, temp));
	    }
	  gimple_seq_add_stmt (seq, stmt);
	}
      return true;
    }
  vec_perm_indices indices[2];
  if (dest_masktype == src_masktype
      && interleave_supported_p (&indices[0], src_masktype, 0)
      && interleave_supported_p (&indices[1], src_masktype, 1))
    {
      /* The destination requires twice as many mask bits as the source, so
	 we can use interleaving permutes to double up the number of bits.  */
      tree masks[2];
      for (unsigned int i = 0; i < 2; ++i)
	masks[i] = vect_gen_perm_mask_checked (src_masktype, indices[i]);
      for (unsigned int i = 0; i < dest_rgm->controls.length (); ++i)
	{
	  tree src = src_rgm->controls[i / 2];
	  tree dest = dest_rgm->controls[i];
	  gimple *stmt = gimple_build_assign (dest, VEC_PERM_EXPR,
					      src, src, masks[i & 1]);
	  gimple_seq_add_stmt (seq, stmt);
	}
      return true;
    }
  return false;
}
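
/* Sketch of the interleave case above (example values are illustrative,
   not from the original source): if SRC_RGM has one 8-bit mask SRC and
   DEST_RGM needs two masks of the same type for the same scalar
   iterations, the permutes produce

     dest[0] = VEC_PERM <src, src, { 0, 8, 1, 9, 2, 10, 3, 11 }>
	     = { s0, s0, s1, s1, s2, s2, s3, s3 }
     dest[1] = VEC_PERM <src, src, { 4, 12, 5, 13, 6, 14, 7, 15 }>
	     = { s4, s4, s5, s5, s6, s6, s7, s7 }

   so each source mask bit is duplicated into the two destination bits
   that control the same scalar iteration.  */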

/* Helper for vect_set_loop_condition_partial_vectors.  Generate definitions
   for all the rgroup controls in RGC and return a control that is nonzero
   when the loop needs to iterate.  Add any new preheader statements to
   PREHEADER_SEQ.  Use LOOP_COND_GSI to insert code before the exit gcond.

   RGC belongs to loop LOOP.  The loop originally iterated NITERS
   times and has been vectorized according to LOOP_VINFO.

   If NITERS_SKIP is nonnull, the first iteration of the vectorized loop
   starts with NITERS_SKIP dummy iterations of the scalar loop before
   the real work starts.  The mask elements for these dummy iterations
   must be 0, to ensure that the extra iterations do not have an effect.

   It is known that:

     NITERS * RGC->max_nscalars_per_iter * RGC->factor

   does not overflow.  However, MIGHT_WRAP_P says whether an induction
   variable that starts at 0 and has step:

     VF * RGC->max_nscalars_per_iter * RGC->factor

   might overflow before hitting a value above:

     (NITERS + NITERS_SKIP) * RGC->max_nscalars_per_iter * RGC->factor

   This means that we cannot guarantee that such an induction variable
   would ever hit a value that produces a set of all-false masks or zero
   lengths for RGC.

   Note: the cost of the code generated by this function is modeled
   by vect_estimate_min_profitable_iters, so changes here may need
   corresponding changes there.  */
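
/* Worked illustration of MIGHT_WRAP_P (added; the numbers are assumed,
   not from the original): with a 16-bit IV type, a step of 16, and a
   target value (NITERS + NITERS_SKIP) * nscalars * factor == 65532,
   the IV visits 0, 16, ..., 65520 and then wraps back to 0 without
   ever reaching a value >= 65532, so MIGHT_WRAP_P must be true for
   that rgroup.  */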

static tree
vect_set_loop_controls_directly (class loop *loop, loop_vec_info loop_vinfo,
				 gimple_seq *preheader_seq,
				 gimple_stmt_iterator loop_cond_gsi,
				 rgroup_controls *rgc, tree niters,
				 tree niters_skip, bool might_wrap_p)
{
  tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
  tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
  bool use_masks_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);

  tree ctrl_type = rgc->type;
  unsigned int nitems_per_iter = rgc->max_nscalars_per_iter * rgc->factor;
  poly_uint64 nitems_per_ctrl = TYPE_VECTOR_SUBPARTS (ctrl_type) * rgc->factor;
  poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
  tree length_limit = NULL_TREE;
  /* For length, we need length_limit to ensure length in range.  */
  if (!use_masks_p)
    length_limit = build_int_cst (compare_type, nitems_per_ctrl);

  /* Calculate the maximum number of item values that the rgroup
     handles in total, the number that it handles for each iteration
     of the vector loop, and the number that it should skip during the
     first iteration of the vector loop.  */
  tree nitems_total = niters;
  tree nitems_step = build_int_cst (iv_type, vf);
  tree nitems_skip = niters_skip;
  if (nitems_per_iter != 1)
    {
      /* We checked before setting LOOP_VINFO_USING_PARTIAL_VECTORS_P that
	 these multiplications don't overflow.  */
      tree compare_factor = build_int_cst (compare_type, nitems_per_iter);
      tree iv_factor = build_int_cst (iv_type, nitems_per_iter);
      nitems_total = gimple_build (preheader_seq, MULT_EXPR, compare_type,
				   nitems_total, compare_factor);
      nitems_step = gimple_build (preheader_seq, MULT_EXPR, iv_type,
				  nitems_step, iv_factor);
      if (nitems_skip)
	nitems_skip = gimple_build (preheader_seq, MULT_EXPR, compare_type,
				    nitems_skip, compare_factor);
    }

  /* Create an induction variable that counts the number of items
     processed.  */
  tree index_before_incr, index_after_incr;
  gimple_stmt_iterator incr_gsi;
  bool insert_after;
  standard_iv_increment_position (loop, &incr_gsi, &insert_after);
  create_iv (build_int_cst (iv_type, 0), nitems_step, NULL_TREE, loop,
	     &incr_gsi, insert_after, &index_before_incr, &index_after_incr);

  tree zero_index = build_int_cst (compare_type, 0);
  tree test_index, test_limit, first_limit;
  gimple_stmt_iterator *test_gsi;
  if (might_wrap_p)
    {
      /* In principle the loop should stop iterating once the incremented
	 IV reaches a value greater than or equal to:

	   NITEMS_TOTAL +[infinite-prec] NITEMS_SKIP

	 However, there's no guarantee that this addition doesn't overflow
	 the comparison type, or that the IV hits a value above it before
	 wrapping around.  We therefore adjust the limit down by one
	 IV step:

	   (NITEMS_TOTAL +[infinite-prec] NITEMS_SKIP)
	   -[infinite-prec] NITEMS_STEP

	 and compare the IV against this limit _before_ incrementing it.
	 Since the comparison type is unsigned, we actually want the
	 subtraction to saturate at zero:

	   (NITEMS_TOTAL +[infinite-prec] NITEMS_SKIP)
	   -[sat] NITEMS_STEP

	 And since NITEMS_SKIP < NITEMS_STEP, we can reassociate this as:

	   NITEMS_TOTAL -[sat] (NITEMS_STEP - NITEMS_SKIP)

	 where the rightmost subtraction can be done directly in
	 COMPARE_TYPE.  */
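      /* Worked example (added for illustration; values assumed):
	 NITEMS_TOTAL == 2, NITEMS_STEP == 4 and NITEMS_SKIP == 1 give a
	 limit of 2 -[sat] (4 - 1) == 0, so the pre-increment IV value 0
	 already produces an all-false "next" control and the vector loop
	 body executes exactly once, as required.  */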
      test_index = index_before_incr;
      tree adjust = gimple_convert (preheader_seq, compare_type,
				    nitems_step);
      if (nitems_skip)
	adjust = gimple_build (preheader_seq, MINUS_EXPR, compare_type,
			       adjust, nitems_skip);
      test_limit = gimple_build (preheader_seq, MAX_EXPR, compare_type,
				 nitems_total, adjust);
      test_limit = gimple_build (preheader_seq, MINUS_EXPR, compare_type,
				 test_limit, adjust);
      test_gsi = &incr_gsi;

      /* Get a safe limit for the first iteration.  */
      if (nitems_skip)
	{
	  /* The first vector iteration can handle at most NITEMS_STEP
	     items.  NITEMS_STEP <= CONST_LIMIT, and adding
	     NITEMS_SKIP to that cannot overflow.  */
	  tree const_limit = build_int_cst (compare_type,
					    LOOP_VINFO_VECT_FACTOR (loop_vinfo)
					    * nitems_per_iter);
	  first_limit = gimple_build (preheader_seq, MIN_EXPR, compare_type,
				      nitems_total, const_limit);
	  first_limit = gimple_build (preheader_seq, PLUS_EXPR, compare_type,
				      first_limit, nitems_skip);
	}
      else
	/* For the first iteration it doesn't matter whether the IV hits
	   a value above NITEMS_TOTAL.  That only matters for the latch
	   condition.  */
	first_limit = nitems_total;
    }
  else
    {
      /* Test the incremented IV, which will always hit a value above
	 the bound before wrapping.  */
      test_index = index_after_incr;
      test_limit = nitems_total;
      if (nitems_skip)
	test_limit = gimple_build (preheader_seq, PLUS_EXPR, compare_type,
				   test_limit, nitems_skip);
      test_gsi = &loop_cond_gsi;

      first_limit = test_limit;
    }

  /* Convert the IV value to the comparison type (either a no-op or
     a demotion).  */
  gimple_seq test_seq = NULL;
  test_index = gimple_convert (&test_seq, compare_type, test_index);
  gsi_insert_seq_before (test_gsi, test_seq, GSI_SAME_STMT);

  /* Provide a definition of each control in the group.  */
  tree next_ctrl = NULL_TREE;
  tree ctrl;
  unsigned int i;
  FOR_EACH_VEC_ELT_REVERSE (rgc->controls, i, ctrl)
    {
      /* Previous controls will cover BIAS items.  This control covers the
	 next batch.  */
      poly_uint64 bias = nitems_per_ctrl * i;
      tree bias_tree = build_int_cst (compare_type, bias);

      /* See whether the first iteration of the vector loop is known
	 to have a full control.  */
      poly_uint64 const_limit;
      bool first_iteration_full
	= (poly_int_tree_p (first_limit, &const_limit)
	   && known_ge (const_limit, (i + 1) * nitems_per_ctrl));

      /* Rather than have a new IV that starts at BIAS and goes up to
	 TEST_LIMIT, prefer to use the same 0-based IV for each control
	 and adjust the bound down by BIAS.  */
      tree this_test_limit = test_limit;
      if (i != 0)
	{
	  this_test_limit = gimple_build (preheader_seq, MAX_EXPR,
					  compare_type, this_test_limit,
					  bias_tree);
	  this_test_limit = gimple_build (preheader_seq, MINUS_EXPR,
					  compare_type, this_test_limit,
					  bias_tree);
	}

      /* Create the initial control.  First include all items that
	 are within the loop limit.  */
      tree init_ctrl = NULL_TREE;
      if (!first_iteration_full)
	{
	  tree start, end;
	  if (first_limit == test_limit)
	    {
	      /* Use a natural test between zero (the initial IV value)
		 and the loop limit.  The "else" block would be valid too,
		 but this choice can avoid the need to load BIAS_TREE into
		 a register.  */
	      start = zero_index;
	      end = this_test_limit;
	    }
	  else
	    {
	      /* FIRST_LIMIT is the maximum number of items handled by the
		 first iteration of the vector loop.  Test the portion
		 associated with this control.  */
	      start = bias_tree;
	      end = first_limit;
	    }

	  if (use_masks_p)
	    init_ctrl = vect_gen_while (preheader_seq, ctrl_type,
					start, end, "max_mask");
	  else
	    {
	      init_ctrl = make_temp_ssa_name (compare_type, NULL, "max_len");
	      gimple_seq seq = vect_gen_len (init_ctrl, start,
					     end, length_limit);
	      gimple_seq_add_seq (preheader_seq, seq);
	    }
	}

      /* Now AND out the bits that are within the number of skipped
	 items.  */
      poly_uint64 const_skip;
      if (nitems_skip
	  && !(poly_int_tree_p (nitems_skip, &const_skip)
	       && known_le (const_skip, bias)))
	{
	  gcc_assert (use_masks_p);
	  tree unskipped_mask = vect_gen_while_not (preheader_seq, ctrl_type,
						    bias_tree, nitems_skip);
	  if (init_ctrl)
	    init_ctrl = gimple_build (preheader_seq, BIT_AND_EXPR, ctrl_type,
				      init_ctrl, unskipped_mask);
	  else
	    init_ctrl = unskipped_mask;
	}

      if (!init_ctrl)
	{
	  /* First iteration is full.  */
	  if (use_masks_p)
	    init_ctrl = build_minus_one_cst (ctrl_type);
	  else
	    init_ctrl = length_limit;
	}

      /* Get the control value for the next iteration of the loop.  */
      if (use_masks_p)
	{
	  gimple_seq stmts = NULL;
	  next_ctrl = vect_gen_while (&stmts, ctrl_type, test_index,
				      this_test_limit, "next_mask");
	  gsi_insert_seq_before (test_gsi, stmts, GSI_SAME_STMT);
	}
      else
	{
	  next_ctrl = make_temp_ssa_name (compare_type, NULL, "next_len");
	  gimple_seq seq = vect_gen_len (next_ctrl, test_index,
					 this_test_limit, length_limit);
	  gsi_insert_seq_before (test_gsi, seq, GSI_SAME_STMT);
	}

      vect_set_loop_control (loop, ctrl, init_ctrl, next_ctrl);
    }
  return next_ctrl;
}

/* Set up the iteration condition and rgroup controls for LOOP, given
   that LOOP_VINFO_USING_PARTIAL_VECTORS_P is true for the vectorized
   loop.  LOOP_VINFO describes the vectorization of LOOP.  NITERS is
   the number of iterations of the original scalar loop that should be
   handled by the vector loop.  NITERS_MAYBE_ZERO and FINAL_IV are as
   for vect_set_loop_condition.

   Insert the branch-back condition before LOOP_COND_GSI and return the
   final gcond.  */

static gcond *
vect_set_loop_condition_partial_vectors (class loop *loop,
					 loop_vec_info loop_vinfo, tree niters,
					 tree final_iv, bool niters_maybe_zero,
					 gimple_stmt_iterator loop_cond_gsi)
{
  gimple_seq preheader_seq = NULL;
  gimple_seq header_seq = NULL;

  bool use_masks_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
  tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
  unsigned int compare_precision = TYPE_PRECISION (compare_type);
  tree orig_niters = niters;

  /* Type of the initial value of NITERS.  */
  tree ni_actual_type = TREE_TYPE (niters);
  unsigned int ni_actual_precision = TYPE_PRECISION (ni_actual_type);
  tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);

  /* Convert NITERS to the same size as the compare.  */
  if (compare_precision > ni_actual_precision
      && niters_maybe_zero)
    {
      /* We know that there is always at least one iteration, so if the
	 count is zero then it must have wrapped.  Cope with this by
	 subtracting 1 before the conversion and adding 1 to the result.  */
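      /* E.g. (illustration added; the widths are assumed): with an 8-bit
	 NITERS and a 32-bit COMPARE_TYPE, NITERS == 0 stands for 256
	 iterations; computing 0 - 1 == 255 in the narrow type, converting,
	 and then adding 1 yields 256 as intended.  */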
      gcc_assert (TYPE_UNSIGNED (ni_actual_type));
      niters = gimple_build (&preheader_seq, PLUS_EXPR, ni_actual_type,
			     niters, build_minus_one_cst (ni_actual_type));
      niters = gimple_convert (&preheader_seq, compare_type, niters);
      niters = gimple_build (&preheader_seq, PLUS_EXPR, compare_type,
			     niters, build_one_cst (compare_type));
    }
  else
    niters = gimple_convert (&preheader_seq, compare_type, niters);

  /* Iterate over all the rgroups and fill in their controls.  We could use
     the first control from any rgroup for the loop condition; here we
     arbitrarily pick the last.  */
  tree test_ctrl = NULL_TREE;
  rgroup_controls *rgc;
  unsigned int i;
  auto_vec<rgroup_controls> *controls = use_masks_p
					  ? &LOOP_VINFO_MASKS (loop_vinfo)
					  : &LOOP_VINFO_LENS (loop_vinfo);
  FOR_EACH_VEC_ELT (*controls, i, rgc)
    if (!rgc->controls.is_empty ())
      {
	/* First try using permutes.  This adds a single vector
	   instruction to the loop for each mask, but needs no extra
	   loop invariants or IVs.  */
	unsigned int nmasks = i + 1;
	if (use_masks_p && (nmasks & 1) == 0)
	  {
	    rgroup_controls *half_rgc = &(*controls)[nmasks / 2 - 1];
	    if (!half_rgc->controls.is_empty ()
		&& vect_maybe_permute_loop_masks (&header_seq, rgc, half_rgc))
	      continue;
	  }

	/* See whether zero-based IV would ever generate all-false masks
	   or zero length before wrapping around.  */
	bool might_wrap_p = vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc);

	/* Set up all controls for this group.  */
	test_ctrl = vect_set_loop_controls_directly (loop, loop_vinfo,
						     &preheader_seq,
						     loop_cond_gsi, rgc,
						     niters, niters_skip,
						     might_wrap_p);
      }

  /* Emit all accumulated statements.  */
  add_preheader_seq (loop, preheader_seq);
  add_header_seq (loop, header_seq);

  /* Get a boolean result that tells us whether to iterate.  */
  edge exit_edge = single_exit (loop);
  tree_code code = (exit_edge->flags & EDGE_TRUE_VALUE) ? EQ_EXPR : NE_EXPR;
  tree zero_ctrl = build_zero_cst (TREE_TYPE (test_ctrl));
  gcond *cond_stmt = gimple_build_cond (code, test_ctrl, zero_ctrl,
					NULL_TREE, NULL_TREE);
  gsi_insert_before (&loop_cond_gsi, cond_stmt, GSI_SAME_STMT);

  /* The loop iterates (NITERS - 1) / VF + 1 times.
     Subtract one from this to get the latch count.  */
  tree step = build_int_cst (compare_type,
			     LOOP_VINFO_VECT_FACTOR (loop_vinfo));
  tree niters_minus_one = fold_build2 (PLUS_EXPR, compare_type, niters,
				       build_minus_one_cst (compare_type));
  loop->nb_iterations = fold_build2 (TRUNC_DIV_EXPR, compare_type,
				     niters_minus_one, step);

  if (final_iv)
    {
      gassign *assign = gimple_build_assign (final_iv, orig_niters);
      gsi_insert_on_edge_immediate (single_exit (loop), assign);
    }

  return cond_stmt;
}

/* Like vect_set_loop_condition, but handle the case in which the vector
   loop handles exactly VF scalars per iteration.  */

static gcond *
vect_set_loop_condition_normal (class loop *loop, tree niters, tree step,
				tree final_iv, bool niters_maybe_zero,
				gimple_stmt_iterator loop_cond_gsi)
{
  tree indx_before_incr, indx_after_incr;
  gcond *cond_stmt;
  gcond *orig_cond;
  edge pe = loop_preheader_edge (loop);
  edge exit_edge = single_exit (loop);
  gimple_stmt_iterator incr_gsi;
  bool insert_after;
  enum tree_code code;
  tree niters_type = TREE_TYPE (niters);

  orig_cond = get_loop_exit_condition (loop);
  gcc_assert (orig_cond);
  loop_cond_gsi = gsi_for_stmt (orig_cond);

  tree init, limit;
  if (!niters_maybe_zero && integer_onep (step))
    {
      /* In this case we can use a simple 0-based IV:

	 A:
	   x = 0;
	   do
	     {
	       ...
	       x += 1;
	     }
	   while (x < NITERS);  */
      code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GE_EXPR : LT_EXPR;
      init = build_zero_cst (niters_type);
      limit = niters;
    }
  else
    {
      /* The following works for all values of NITERS except 0:

	 B:
	   x = 0;
	   do
	     {
	       ...
	       x += STEP;
	     }
	   while (x <= NITERS - STEP);

	 so that the loop continues to iterate if x + STEP - 1 < NITERS
	 but stops if x + STEP - 1 >= NITERS.

	 However, if NITERS is zero, x never hits a value above NITERS - STEP
	 before wrapping around.  There are two obvious ways of dealing with
	 this:

	 - start at STEP - 1 and compare x before incrementing it
	 - start at -1 and compare x after incrementing it

	 The latter is simpler and is what we use.  The loop in this case
	 looks like:

	 C:
	   x = -1;
	   do
	     {
	       ...
	       x += STEP;
	     }
	   while (x < NITERS - STEP);

	 In both cases the loop limit is NITERS - STEP.  */
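      /* Worked example (added for illustration; values assumed):
	 NITERS == 10 and STEP == 4 give a limit of 6, so case B runs with
	 x == 0 and x == 4 and stops once x reaches 8, i.e.
	 (10 - 4) / 4 + 1 == 2 iterations, leaving the remaining two
	 scalars to a peeled epilogue.  */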
      gimple_seq seq = NULL;
      limit = force_gimple_operand (niters, &seq, true, NULL_TREE);
      limit = gimple_build (&seq, MINUS_EXPR, TREE_TYPE (limit), limit, step);
      if (seq)
	{
	  basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
	  gcc_assert (!new_bb);
	}
      if (niters_maybe_zero)
	{
	  /* Case C.  */
	  code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GE_EXPR : LT_EXPR;
	  init = build_all_ones_cst (niters_type);
	}
      else
	{
	  /* Case B.  */
	  code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GT_EXPR : LE_EXPR;
	  init = build_zero_cst (niters_type);
	}
    }

  standard_iv_increment_position (loop, &incr_gsi, &insert_after);
  create_iv (init, step, NULL_TREE, loop,
	     &incr_gsi, insert_after, &indx_before_incr, &indx_after_incr);
  indx_after_incr = force_gimple_operand_gsi (&loop_cond_gsi, indx_after_incr,
					      true, NULL_TREE, true,
					      GSI_SAME_STMT);
  limit = force_gimple_operand_gsi (&loop_cond_gsi, limit, true, NULL_TREE,
				    true, GSI_SAME_STMT);

  cond_stmt = gimple_build_cond (code, indx_after_incr, limit, NULL_TREE,
				 NULL_TREE);

  gsi_insert_before (&loop_cond_gsi, cond_stmt, GSI_SAME_STMT);

  /* Record the number of latch iterations.  */
  if (limit == niters)
    /* Case A: the loop iterates NITERS times.  Subtract one to get the
       latch count.  */
    loop->nb_iterations = fold_build2 (MINUS_EXPR, niters_type, niters,
				       build_int_cst (niters_type, 1));
  else
    /* Case B or C: the loop iterates (NITERS - STEP) / STEP + 1 times.
       Subtract one from this to get the latch count.  */
    loop->nb_iterations = fold_build2 (TRUNC_DIV_EXPR, niters_type,
				       limit, step);

  if (final_iv)
    {
      gassign *assign = gimple_build_assign (final_iv, MINUS_EXPR,
					     indx_after_incr, init);
      gsi_insert_on_edge_immediate (single_exit (loop), assign);
    }

  return cond_stmt;
}

/* If we're using fully-masked loops, make LOOP iterate:

      N == (NITERS - 1) / STEP + 1

   times.  When NITERS is zero, this is equivalent to making the loop
   execute (1 << M) / STEP times, where M is the precision of NITERS.
   NITERS_MAYBE_ZERO is true if this last case might occur.

   If we're not using fully-masked loops, make LOOP iterate:

      N == (NITERS - STEP) / STEP + 1

   times, where NITERS is known to be outside the range [1, STEP - 1].
   This is equivalent to making the loop execute NITERS / STEP times
   when NITERS is nonzero and (1 << M) / STEP times otherwise.
   NITERS_MAYBE_ZERO again indicates whether this last case might occur.

   If FINAL_IV is nonnull, it is an SSA name that should be set to
   N * STEP on exit from the loop.

   Assumption: the exit-condition of LOOP is the last stmt in the loop.  */
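
/* For instance (worked example added; the numbers are assumed): with
   NITERS == 10 and STEP == 4, a fully-masked loop iterates
   (10 - 1) / 4 + 1 == 3 times and handles all ten scalars itself,
   whereas an unmasked loop iterates (10 - 4) / 4 + 1 == 2 times and
   leaves the remaining two scalars to a peeled epilogue.  */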

void
vect_set_loop_condition (class loop *loop, loop_vec_info loop_vinfo,
			 tree niters, tree step, tree final_iv,
			 bool niters_maybe_zero)
{
  gcond *cond_stmt;
  gcond *orig_cond = get_loop_exit_condition (loop);
  gimple_stmt_iterator loop_cond_gsi = gsi_for_stmt (orig_cond);

  if (loop_vinfo && LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
    cond_stmt = vect_set_loop_condition_partial_vectors (loop, loop_vinfo,
							 niters, final_iv,
							 niters_maybe_zero,
							 loop_cond_gsi);
  else
    cond_stmt = vect_set_loop_condition_normal (loop, niters, step, final_iv,
						niters_maybe_zero,
						loop_cond_gsi);

  /* Remove old loop exit test.  */
  stmt_vec_info orig_cond_info;
  if (loop_vinfo
      && (orig_cond_info = loop_vinfo->lookup_stmt (orig_cond)))
    loop_vinfo->remove_stmt (orig_cond_info);
  else
    gsi_remove (&loop_cond_gsi, true);

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "New loop exit condition: %G",
		     cond_stmt);
}

/* Helper routine of slpeel_tree_duplicate_loop_to_edge_cfg.
   For all PHI arguments in FROM->dest and TO->dest coming from those
   edges, ensure that the current_def of each TO->dest PHI argument is
   set to the corresponding value in FROM.  */

static void
slpeel_duplicate_current_defs_from_edges (edge from, edge to)
{
  gimple_stmt_iterator gsi_from, gsi_to;

  for (gsi_from = gsi_start_phis (from->dest),
       gsi_to = gsi_start_phis (to->dest);
       !gsi_end_p (gsi_from) && !gsi_end_p (gsi_to);)
    {
      gimple *from_phi = gsi_stmt (gsi_from);
      gimple *to_phi = gsi_stmt (gsi_to);
      tree from_arg = PHI_ARG_DEF_FROM_EDGE (from_phi, from);
      tree to_arg = PHI_ARG_DEF_FROM_EDGE (to_phi, to);
      if (virtual_operand_p (from_arg))
	{
	  gsi_next (&gsi_from);
	  continue;
	}
      if (virtual_operand_p (to_arg))
	{
	  gsi_next (&gsi_to);
	  continue;
	}
      if (TREE_CODE (from_arg) != SSA_NAME)
	gcc_assert (operand_equal_p (from_arg, to_arg, 0));
      else if (TREE_CODE (to_arg) == SSA_NAME
	       && from_arg != to_arg)
	{
	  if (get_current_def (to_arg) == NULL_TREE)
	    {
	      gcc_assert (types_compatible_p (TREE_TYPE (to_arg),
					      TREE_TYPE (get_current_def
							   (from_arg))));
	      set_current_def (to_arg, get_current_def (from_arg));
	    }
	}
      gsi_next (&gsi_from);
      gsi_next (&gsi_to);
    }

  gphi *from_phi = get_virtual_phi (from->dest);
  gphi *to_phi = get_virtual_phi (to->dest);
  if (from_phi)
    set_current_def (PHI_ARG_DEF_FROM_EDGE (to_phi, to),
		     get_current_def (PHI_ARG_DEF_FROM_EDGE (from_phi, from)));
}


/* Given LOOP, this function generates a new copy of it and puts it
   on E, which is either the entry or exit of LOOP.  If SCALAR_LOOP is
   non-NULL, assume LOOP and SCALAR_LOOP are equivalent and copy the
   basic blocks from SCALAR_LOOP instead of LOOP, but to either the
   entry or exit of LOOP.  */

class loop *
slpeel_tree_duplicate_loop_to_edge_cfg (class loop *loop,
					class loop *scalar_loop, edge e)
{
  class loop *new_loop;
  basic_block *new_bbs, *bbs, *pbbs;
  bool at_exit;
  bool was_imm_dom;
  basic_block exit_dest;
  edge exit, new_exit;
  bool duplicate_outer_loop = false;

  exit = single_exit (loop);
  at_exit = (e == exit);
  if (!at_exit && e != loop_preheader_edge (loop))
    return NULL;

  if (scalar_loop == NULL)
    scalar_loop = loop;

  bbs = XNEWVEC (basic_block, scalar_loop->num_nodes + 1);
  pbbs = bbs + 1;
  get_loop_body_with_size (scalar_loop, pbbs, scalar_loop->num_nodes);
  /* Allow duplication of outer loops.  */
  if (scalar_loop->inner)
    duplicate_outer_loop = true;
  /* Check whether duplication is possible.  */
  if (!can_copy_bbs_p (pbbs, scalar_loop->num_nodes))
    {
      free (bbs);
      return NULL;
    }

  /* Generate new loop structure.  */
  new_loop = duplicate_loop (scalar_loop, loop_outer (scalar_loop));
  duplicate_subloops (scalar_loop, new_loop);

  exit_dest = exit->dest;
  was_imm_dom = (get_immediate_dominator (CDI_DOMINATORS,
					  exit_dest) == loop->header ?
		 true : false);

  /* Also copy the pre-header, this avoids jumping through hoops to
     duplicate the loop entry PHI arguments.  Create an empty
     pre-header unconditionally for this.  */
  basic_block preheader = split_edge (loop_preheader_edge (scalar_loop));
  edge entry_e = single_pred_edge (preheader);
  bbs[0] = preheader;
  new_bbs = XNEWVEC (basic_block, scalar_loop->num_nodes + 1);

  exit = single_exit (scalar_loop);
  copy_bbs (bbs, scalar_loop->num_nodes + 1, new_bbs,
	    &exit, 1, &new_exit, NULL,
	    at_exit ? loop->latch : e->src, true);
  exit = single_exit (loop);
  basic_block new_preheader = new_bbs[0];

  /* Before installing PHI arguments make sure that the edges
     into them match that of the scalar loop we analyzed.  This
     makes sure the SLP tree matches up between the main vectorized
     loop and the epilogue vectorized copies.  */
  if (single_succ_edge (preheader)->dest_idx
      != single_succ_edge (new_bbs[0])->dest_idx)
    {
      basic_block swap_bb = new_bbs[1];
      gcc_assert (EDGE_COUNT (swap_bb->preds) == 2);
      std::swap (EDGE_PRED (swap_bb, 0), EDGE_PRED (swap_bb, 1));
      EDGE_PRED (swap_bb, 0)->dest_idx = 0;
      EDGE_PRED (swap_bb, 1)->dest_idx = 1;
    }
  if (duplicate_outer_loop)
    {
      class loop *new_inner_loop = get_loop_copy (scalar_loop->inner);
      if (loop_preheader_edge (scalar_loop)->dest_idx
	  != loop_preheader_edge (new_inner_loop)->dest_idx)
	{
	  basic_block swap_bb = new_inner_loop->header;
	  gcc_assert (EDGE_COUNT (swap_bb->preds) == 2);
	  std::swap (EDGE_PRED (swap_bb, 0), EDGE_PRED (swap_bb, 1));
	  EDGE_PRED (swap_bb, 0)->dest_idx = 0;
	  EDGE_PRED (swap_bb, 1)->dest_idx = 1;
	}
    }

  add_phi_args_after_copy (new_bbs, scalar_loop->num_nodes + 1, NULL);

  /* Skip new preheader since it's deleted if copy loop is added at entry.  */
  for (unsigned i = (at_exit ? 0 : 1); i < scalar_loop->num_nodes + 1; i++)
    rename_variables_in_bb (new_bbs[i], duplicate_outer_loop);

  if (scalar_loop != loop)
    {
      /* If we copied from SCALAR_LOOP rather than LOOP, SSA_NAMEs from
	 SCALAR_LOOP will have current_def set to SSA_NAMEs in the new_loop,
	 but LOOP will not.  slpeel_update_phi_nodes_for_guard{1,2} expects
	 the LOOP SSA_NAMEs (on the exit edge and edge from latch to
	 header) to have current_def set, so copy them over.  */
      slpeel_duplicate_current_defs_from_edges (single_exit (scalar_loop),
						exit);
      slpeel_duplicate_current_defs_from_edges (EDGE_SUCC (scalar_loop->latch,
							   0),
						EDGE_SUCC (loop->latch, 0));
    }

  if (at_exit) /* Add the loop copy at exit.  */
    {
      if (scalar_loop != loop)
	{
	  gphi_iterator gsi;
	  new_exit = redirect_edge_and_branch (new_exit, exit_dest);

	  for (gsi = gsi_start_phis (exit_dest); !gsi_end_p (gsi);
	       gsi_next (&gsi))
	    {
	      gphi *phi = gsi.phi ();
	      tree orig_arg = PHI_ARG_DEF_FROM_EDGE (phi, e);
	      location_t orig_locus
		= gimple_phi_arg_location_from_edge (phi, e);

	      add_phi_arg (phi, orig_arg, new_exit, orig_locus);
	    }
	}
      redirect_edge_and_branch_force (e, new_preheader);
      flush_pending_stmts (e);
      set_immediate_dominator (CDI_DOMINATORS, new_preheader, e->src);
      if (was_imm_dom || duplicate_outer_loop)
	set_immediate_dominator (CDI_DOMINATORS, exit_dest, new_exit->src);

      /* And remove the non-necessary forwarder again.  Keep the other
	 one so we have a proper pre-header for the loop at the exit edge.  */
      redirect_edge_pred (single_succ_edge (preheader),
			  single_pred (preheader));
      delete_basic_block (preheader);
      set_immediate_dominator (CDI_DOMINATORS, scalar_loop->header,
			       loop_preheader_edge (scalar_loop)->src);
    }
  else /* Add the copy at entry.  */
    {
      if (scalar_loop != loop)
	{
	  /* Remove the non-necessary forwarder of scalar_loop again.  */
	  redirect_edge_pred (single_succ_edge (preheader),
			      single_pred (preheader));
	  delete_basic_block (preheader);
	  set_immediate_dominator (CDI_DOMINATORS, scalar_loop->header,
				   loop_preheader_edge (scalar_loop)->src);
	  preheader = split_edge (loop_preheader_edge (loop));
	  entry_e = single_pred_edge (preheader);
	}

      redirect_edge_and_branch_force (entry_e, new_preheader);
      flush_pending_stmts (entry_e);
      set_immediate_dominator (CDI_DOMINATORS, new_preheader, entry_e->src);

      redirect_edge_and_branch_force (new_exit, preheader);
      flush_pending_stmts (new_exit);
      set_immediate_dominator (CDI_DOMINATORS, preheader, new_exit->src);

      /* And remove the non-necessary forwarder again.  Keep the other
	 one so we have a proper pre-header for the loop at the exit edge.  */
      redirect_edge_pred (single_succ_edge (new_preheader),
			  single_pred (new_preheader));
      delete_basic_block (new_preheader);
      set_immediate_dominator (CDI_DOMINATORS, new_loop->header,
			       loop_preheader_edge (new_loop)->src);
    }

  if (scalar_loop != loop)
    {
      /* Update new_loop->header PHIs, so that on the preheader
	 edge they are the ones from loop rather than scalar_loop.  */
      gphi_iterator gsi_orig, gsi_new;
      edge orig_e = loop_preheader_edge (loop);
      edge new_e = loop_preheader_edge (new_loop);

      for (gsi_orig = gsi_start_phis (loop->header),
	   gsi_new = gsi_start_phis (new_loop->header);
	   !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_new);
	   gsi_next (&gsi_orig), gsi_next (&gsi_new))
	{
	  gphi *orig_phi = gsi_orig.phi ();
	  gphi *new_phi = gsi_new.phi ();
	  tree orig_arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, orig_e);
	  location_t orig_locus
	    = gimple_phi_arg_location_from_edge (orig_phi, orig_e);

	  add_phi_arg (new_phi, orig_arg, new_e, orig_locus);
	}
    }

  free (new_bbs);
  free (bbs);

  checking_verify_dominators (CDI_DOMINATORS);

  return new_loop;
}


/* Given the condition expression COND, put it as the last statement of
   GUARD_BB; set both edges' probability; set dominator of GUARD_TO to
   DOM_BB; return the skip edge.  GUARD_TO is the target basic block to
   skip the loop.  PROBABILITY is the skip edge's probability.  Mark the
   new edge as irreducible if IRREDUCIBLE_P is true.  */

static edge
slpeel_add_loop_guard (basic_block guard_bb, tree cond,
		       basic_block guard_to, basic_block dom_bb,
		       profile_probability probability, bool irreducible_p)
{
  gimple_stmt_iterator gsi;
  edge new_e, enter_e;
  gcond *cond_stmt;
  gimple_seq gimplify_stmt_list = NULL;

  enter_e = EDGE_SUCC (guard_bb, 0);
  enter_e->flags &= ~EDGE_FALLTHRU;
  enter_e->flags |= EDGE_FALSE_VALUE;
  gsi = gsi_last_bb (guard_bb);

  cond = force_gimple_operand_1 (cond, &gimplify_stmt_list, is_gimple_condexpr,
				 NULL_TREE);
  if (gimplify_stmt_list)
    gsi_insert_seq_after (&gsi, gimplify_stmt_list, GSI_NEW_STMT);

  cond_stmt = gimple_build_cond_from_tree (cond, NULL_TREE, NULL_TREE);
  gsi = gsi_last_bb (guard_bb);
  gsi_insert_after (&gsi, cond_stmt, GSI_NEW_STMT);

  /* Add new edge to connect guard block to the merge/loop-exit block.  */
  new_e = make_edge (guard_bb, guard_to, EDGE_TRUE_VALUE);

  new_e->probability = probability;
  if (irreducible_p)
    new_e->flags |= EDGE_IRREDUCIBLE_LOOP;

  enter_e->probability = probability.invert ();
  set_immediate_dominator (CDI_DOMINATORS, guard_to, dom_bb);

  /* Split enter_e to preserve LOOPS_HAVE_PREHEADERS.  */
  if (enter_e->dest->loop_father->header == enter_e->dest)
    split_edge (enter_e);

  return new_e;
}


/* This function verifies that the following restrictions hold for LOOP:
   (1) it consists of exactly 2 basic blocks - the header and an empty
       latch - for an innermost loop, or of exactly 5 basic blocks for
       an outer loop;
   (2) it is single entry, single exit;
   (3) its exit condition is the last stmt in the header;
   (4) E is the entry/exit edge of LOOP.  */

bool
slpeel_can_duplicate_loop_p (const class loop *loop, const_edge e)
{
  edge exit_e = single_exit (loop);
  edge entry_e = loop_preheader_edge (loop);
  gcond *orig_cond = get_loop_exit_condition (loop);
  gimple_stmt_iterator loop_exit_gsi = gsi_last_bb (exit_e->src);
  unsigned int num_bb = loop->inner ? 5 : 2;

  /* All loops have an outer scope; the only case loop->outer is NULL is for
     the function itself.  */
  if (!loop_outer (loop)
      || loop->num_nodes != num_bb
      || !empty_block_p (loop->latch)
      || !single_exit (loop)
      /* Verify that new loop exit condition can be trivially modified.  */
      || (!orig_cond || orig_cond != gsi_stmt (loop_exit_gsi))
      || (e != exit_e && e != entry_e))
    return false;

  return true;
}

/* If the loop has a virtual PHI, but its exit bb doesn't, create a virtual
   PHI in the exit bb and rename all the uses after the loop.  This
   simplifies the *guard[12] routines, which assume loop-closed SSA form
   for all PHIs (but normally loop-closed SSA form doesn't require virtual
   PHIs to be in the same form).  Doing this early simplifies checking
   which uses should be renamed.

   If we create a new phi after the loop, return the definition that
   applies on entry to the loop, otherwise return null.  */

static tree
create_lcssa_for_virtual_phi (class loop *loop)
{
  gphi_iterator gsi;
  edge exit_e = single_exit (loop);

  for (gsi = gsi_start_phis (loop->header); !gsi_end_p (gsi); gsi_next (&gsi))
    if (virtual_operand_p (gimple_phi_result (gsi_stmt (gsi))))
      {
	gphi *phi = gsi.phi ();
	for (gsi = gsi_start_phis (exit_e->dest);
	     !gsi_end_p (gsi); gsi_next (&gsi))
	  if (virtual_operand_p (gimple_phi_result (gsi_stmt (gsi))))
	    break;
	if (gsi_end_p (gsi))
	  {
	    tree new_vop = copy_ssa_name (PHI_RESULT (phi));
	    gphi *new_phi = create_phi_node (new_vop, exit_e->dest);
	    tree vop = PHI_ARG_DEF_FROM_EDGE (phi, EDGE_SUCC (loop->latch, 0));
	    imm_use_iterator imm_iter;
	    gimple *stmt;
	    use_operand_p use_p;

	    SSA_NAME_OCCURS_IN_ABNORMAL_PHI (new_vop)
	      = SSA_NAME_OCCURS_IN_ABNORMAL_PHI (vop);
	    add_phi_arg (new_phi, vop, exit_e, UNKNOWN_LOCATION);
	    gimple_phi_set_result (new_phi, new_vop);
	    FOR_EACH_IMM_USE_STMT (stmt, imm_iter, vop)
	      if (stmt != new_phi
		  && !flow_bb_inside_loop_p (loop, gimple_bb (stmt)))
		FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
		  SET_USE (use_p, new_vop);

	    return PHI_ARG_DEF_FROM_EDGE (phi, loop_preheader_edge (loop));
	  }
	break;
      }
  return NULL_TREE;
}

/* Function find_loop_location.

   Extract the location of the loop in the source code.
   If the loop is not well formed for vectorization, an estimated
   location is calculated.
   Return the loop location if it succeeds and an unknown location
   if not.  */

dump_user_location_t
find_loop_location (class loop *loop)
{
  gimple *stmt = NULL;
  basic_block bb;
  gimple_stmt_iterator si;

  if (!loop)
    return dump_user_location_t ();

  stmt = get_loop_exit_condition (loop);

  if (stmt
      && LOCATION_LOCUS (gimple_location (stmt)) > BUILTINS_LOCATION)
    return stmt;

  /* If we got here the loop is probably not "well formed";
     try to estimate the loop location.  */

  if (!loop->header)
    return dump_user_location_t ();

  bb = loop->header;

  for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
    {
      stmt = gsi_stmt (si);
      if (LOCATION_LOCUS (gimple_location (stmt)) > BUILTINS_LOCATION)
	return stmt;
    }

  return dump_user_location_t ();
}

/* Return true if the phi described by STMT_INFO defines an IV of the
   loop to be vectorized.  */

static bool
iv_phi_p (stmt_vec_info stmt_info)
{
  gphi *phi = as_a <gphi *> (stmt_info->stmt);
  if (virtual_operand_p (PHI_RESULT (phi)))
    return false;

  if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
      || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
    return false;

  return true;
}

/* Function vect_can_advance_ivs_p

   In case the number of iterations that LOOP iterates is unknown at compile
   time, an epilog loop will be generated, and the loop induction variables
   (IVs) will be "advanced" to the value they are supposed to take just before
   the epilog loop.  Here we check that the access function of the loop IVs
   and the expression that represents the loop bound are simple enough.
   These restrictions will be relaxed in the future.  */

bool
vect_can_advance_ivs_p (loop_vec_info loop_vinfo)
{
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  basic_block bb = loop->header;
  gphi_iterator gsi;

  /* Analyze phi functions of the loop header.  */

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "vect_can_advance_ivs_p:\n");
  for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
    {
      tree evolution_part;

      gphi *phi = gsi.phi ();
      stmt_vec_info phi_info = loop_vinfo->lookup_stmt (phi);
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
			 phi_info->stmt);

      /* Skip virtual phi's.  The data dependences that are associated with
	 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.

	 Skip reduction phis.  */
      if (!iv_phi_p (phi_info))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "reduc or virtual phi. skip.\n");
	  continue;
	}

      /* Analyze the evolution function.  */

      evolution_part = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
      if (evolution_part == NULL_TREE)
	{
	  if (dump_enabled_p ())
	    dump_printf (MSG_MISSED_OPTIMIZATION,
			 "No access function or evolution.\n");
	  return false;
	}

      /* FORNOW: We do not transform initial conditions of IVs
	 whose evolution functions are not invariant in the loop.  */

      if (!expr_invariant_in_loop_p (loop, evolution_part))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "evolution not invariant in loop.\n");
	  return false;
	}

      /* FORNOW: We do not transform initial conditions of IVs
	 whose evolution function is a polynomial of degree >= 2.  */

      if (tree_is_chrec (evolution_part))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "evolution is chrec.\n");
	  return false;
	}
    }

  return true;
}


/* Function vect_update_ivs_after_vectorizer.

   "Advance" the induction variables of LOOP to the value they should take
   after the execution of LOOP.  This is currently necessary because the
   vectorizer does not handle induction variables that are used after the
   loop.  Such a situation occurs when the last iterations of LOOP are
   peeled, because:
   1. We introduced new uses after LOOP for IVs that were not originally used
      after LOOP: the IVs of LOOP are now used by an epilog loop.
   2. LOOP is going to be vectorized; this means that it will iterate N/VF
      times, whereas the loop IVs should be bumped N times.

   Input:
   - LOOP - a loop that is going to be vectorized.  The last few iterations
     of LOOP were peeled.
   - NITERS - the number of iterations that LOOP executes (before it is
     vectorized), i.e., the number of times the ivs should be bumped.
   - UPDATE_E - a successor edge of LOOP->exit that is on the (only) path
     coming out from LOOP on which there are uses of the LOOP ivs
     (this is the path from LOOP->exit to epilog_loop->preheader).

   The new definitions of the ivs are placed in LOOP->exit.
   The phi args associated with the edge UPDATE_E in the bb
   UPDATE_E->dest are updated accordingly.

   Assumption 1: Like the rest of the vectorizer, this function assumes
   a single loop exit that has a single predecessor.

   Assumption 2: The phi nodes in the LOOP header and in update_bb are
   organized in the same order.

   Assumption 3: The access function of the ivs is simple enough (see
   vect_can_advance_ivs_p).  This assumption will be relaxed in the future.

   Assumption 4: Exactly one of the successors of LOOP exit-bb is on a path
   coming out of LOOP on which the ivs of LOOP are used (this is the path
   that leads to the epilog loop; other paths skip the epilog loop).  This
   path starts with the edge UPDATE_E, and its destination (denoted update_bb)
   needs to have its phis updated.  */
1534
1535static void
a5e3d614
BC
1536vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo,
1537 tree niters, edge update_e)
ebfd146a 1538{
538dd0b7 1539 gphi_iterator gsi, gsi1;
99b1c316 1540 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
ebfd146a 1541 basic_block update_bb = update_e->dest;
a5e3d614 1542 basic_block exit_bb = single_exit (loop)->dest;
ebfd146a
IR
1543
1544 /* Make sure there exists a single-predecessor exit bb: */
1545 gcc_assert (single_pred_p (exit_bb));
a5e3d614 1546 gcc_assert (single_succ_edge (exit_bb) == update_e);
ebfd146a
IR
1547
1548 for (gsi = gsi_start_phis (loop->header), gsi1 = gsi_start_phis (update_bb);
1549 !gsi_end_p (gsi) && !gsi_end_p (gsi1);
1550 gsi_next (&gsi), gsi_next (&gsi1))
1551 {
ebfd146a 1552 tree init_expr;
550918ca
RG
1553 tree step_expr, off;
1554 tree type;
ebfd146a
IR
1555 tree var, ni, ni_name;
1556 gimple_stmt_iterator last_gsi;
1557
a5e3d614
BC
1558 gphi *phi = gsi.phi ();
1559 gphi *phi1 = gsi1.phi ();
91987857 1560 stmt_vec_info phi_info = loop_vinfo->lookup_stmt (phi);
73fbfcad 1561 if (dump_enabled_p ())
3c2a8ed0
DM
1562 dump_printf_loc (MSG_NOTE, vect_location,
1563 "vect_update_ivs_after_vectorizer: phi: %G", phi);
ebfd146a 1564
a5e3d614 1565 /* Skip reduction and virtual phis. */
82570274 1566 if (!iv_phi_p (phi_info))
ebfd146a 1567 {
73fbfcad 1568 if (dump_enabled_p ())
a5e3d614
BC
1569 dump_printf_loc (MSG_NOTE, vect_location,
1570 "reduc or virtual phi. skip.\n");
ebfd146a
IR
1571 continue;
1572 }
1573
0ac168a1 1574 type = TREE_TYPE (gimple_phi_result (phi));
91987857 1575 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
0ac168a1 1576 step_expr = unshare_expr (step_expr);
b8698a0f 1577
ebfd146a
IR
1578 /* FORNOW: We do not support IVs whose evolution function is a polynomial
1579 of degree >= 2 or exponential. */
0ac168a1 1580 gcc_assert (!tree_is_chrec (step_expr));
ebfd146a 1581
0ac168a1 1582 init_expr = PHI_ARG_DEF_FROM_EDGE (phi, loop_preheader_edge (loop));
ebfd146a 1583
550918ca
RG
1584 off = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
1585 fold_convert (TREE_TYPE (step_expr), niters),
1586 step_expr);
0ac168a1 1587 if (POINTER_TYPE_P (type))
5d49b6a7 1588 ni = fold_build_pointer_plus (init_expr, off);
ebfd146a 1589 else
0ac168a1
RG
1590 ni = fold_build2 (PLUS_EXPR, type,
1591 init_expr, fold_convert (type, off));
ebfd146a 1592
0ac168a1 1593 var = create_tmp_var (type, "tmp");
ebfd146a
IR
1594
1595 last_gsi = gsi_last_bb (exit_bb);
a5e3d614
BC
1596 gimple_seq new_stmts = NULL;
1597 ni_name = force_gimple_operand (ni, &new_stmts, false, var);
1598 /* Exit_bb shouldn't be empty. */
1599 if (!gsi_end_p (last_gsi))
1600 gsi_insert_seq_after (&last_gsi, new_stmts, GSI_SAME_STMT);
1601 else
1602 gsi_insert_seq_before (&last_gsi, new_stmts, GSI_SAME_STMT);
b8698a0f 1603
ebfd146a 1604 /* Fix phi expressions in the successor bb. */
684f25f4 1605 adjust_phi_and_debug_stmts (phi1, update_e, ni_name);
ebfd146a
IR
1606 }
1607}
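/* Illustrative sketch (not part of the source): the trees built above
   compute, for each IV, the value it would have after NITERS scalar
   iterations:

     ni = init + (type) niters * step

   e.g. an IV with init 0 and step 4 advanced by niters = 10 leaves the
   loop with value 40, and that value is what the epilog loop's PHI
   argument on UPDATE_E must receive. Pointer-typed IVs use
   fold_build_pointer_plus for the addition instead of PLUS_EXPR. */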
1608
535e7c11
RS
1609/* Return a gimple value containing the misalignment (measured in vector
1610 elements) for the loop described by LOOP_VINFO, i.e. how many elements
1611 it is away from a perfectly aligned address. Add any new statements
1612 to SEQ. */
1613
1614static tree
1615get_misalign_in_elems (gimple **seq, loop_vec_info loop_vinfo)
1616{
1e5e6ff5 1617 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
89fa689a 1618 stmt_vec_info stmt_info = dr_info->stmt;
535e7c11
RS
1619 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1620
ca31798e
AV
1621 poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr_info);
1622 unsigned HOST_WIDE_INT target_align_c;
1623 tree target_align_minus_1;
535e7c11 1624
89fa689a
RS
1625 bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr),
1626 size_zero_node) < 0;
535e7c11
RS
1627 tree offset = (negative
1628 ? size_int (-TYPE_VECTOR_SUBPARTS (vectype) + 1)
1629 : size_zero_node);
308bc496
RB
1630 tree start_addr = vect_create_addr_base_for_vector_ref (loop_vinfo,
1631 stmt_info, seq,
535e7c11
RS
1632 offset);
1633 tree type = unsigned_type_for (TREE_TYPE (start_addr));
ca31798e
AV
1634 if (target_align.is_constant (&target_align_c))
1635 target_align_minus_1 = build_int_cst (type, target_align_c - 1);
1636 else
1637 {
1638 tree vla = build_int_cst (type, target_align);
1639 tree vla_align = fold_build2 (BIT_AND_EXPR, type, vla,
1640 fold_build2 (MINUS_EXPR, type,
1641 build_int_cst (type, 0), vla));
1642 target_align_minus_1 = fold_build2 (MINUS_EXPR, type, vla_align,
1643 build_int_cst (type, 1));
1644 }
1645
535e7c11
RS
1646 HOST_WIDE_INT elem_size
1647 = int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1648 tree elem_size_log = build_int_cst (type, exact_log2 (elem_size));
1649
1650 /* Create: misalign_in_bytes = addr & (target_align - 1). */
1651 tree int_start_addr = fold_convert (type, start_addr);
1652 tree misalign_in_bytes = fold_build2 (BIT_AND_EXPR, type, int_start_addr,
1653 target_align_minus_1);
1654
1655 /* Create: misalign_in_elems = misalign_in_bytes / element_size. */
1656 tree misalign_in_elems = fold_build2 (RSHIFT_EXPR, type, misalign_in_bytes,
1657 elem_size_log);
1658
1659 return misalign_in_elems;
1660}
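/* Worked example (illustrative): with a target alignment of 16 bytes,
   4-byte elements and start_addr == 0x1004:

     misalign_in_bytes = 0x1004 & (16 - 1) = 4
     misalign_in_elems = 4 >> log2 (4) = 1

   i.e. the first access is one vector element past a 16-byte boundary. */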
1661
a5e3d614 1662/* Function vect_gen_prolog_loop_niters
d9157f15 1663
a5e3d614
BC
1664 Generate the number of iterations which should be peeled as prolog for the
1665 loop represented by LOOP_VINFO. It is calculated as the misalignment of
1666 DR - the data reference recorded in LOOP_VINFO_UNALIGNED_DR (LOOP_VINFO).
1667 As a result, after the execution of this loop, the data reference DR will
1668 refer to an aligned location. The following computation is generated:
ebfd146a
IR
1669
1670 If the misalignment of DR is known at compile time:
1671 addr_mis = DR_MISALIGNMENT (dr);
1672 Else, compute address misalignment in bytes:
535e7c11 1673 addr_mis = addr & (target_align - 1)
ebfd146a 1674
a5e3d614 1675 prolog_niters = ((VF - addr_mis/elem_size) & (VF-1)) / step
ebfd146a
IR
1676
1677 (elem_size = element type size; an element is the scalar element whose type
1678 is the inner type of the vectype)
1679
a5e3d614 1680 The computations will be emitted at the end of BB. We also compute and
cbb22e61 1681 store the upper bound (inclusive) of the result in BOUND.
a5e3d614 1682
ebfd146a
IR
1683 When the step of the data-ref in the loop is not 1 (as in interleaved data
1684 and SLP), the number of iterations of the prolog must be divided by the step
1685 (which is equal to the size of the interleaved group).
1686
1687 The above formulas assume that VF == number of elements in the vector. This
1688 may not hold when there are multiple types in the loop.
1689 In this case, for some data-references in the loop the VF does not represent
1690 the number of elements that fit in the vector. Therefore, instead of VF we
1691 use TYPE_VECTOR_SUBPARTS. */
1692
b8698a0f 1693static tree
a5e3d614
BC
1694vect_gen_prolog_loop_niters (loop_vec_info loop_vinfo,
1695 basic_block bb, int *bound)
ebfd146a 1696{
1e5e6ff5 1697 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
ebfd146a 1698 tree var;
a5e3d614
BC
1699 tree niters_type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
1700 gimple_seq stmts = NULL, new_stmts = NULL;
ebfd146a 1701 tree iters, iters_name;
89fa689a 1702 stmt_vec_info stmt_info = dr_info->stmt;
ebfd146a 1703 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
ca31798e 1704 poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr_info);
ebfd146a 1705
15e693cc 1706 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
ebfd146a 1707 {
15e693cc 1708 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
ebfd146a 1709
73fbfcad 1710 if (dump_enabled_p ())
ccb3ad87 1711 dump_printf_loc (MSG_NOTE, vect_location,
e645e942 1712 "known peeling = %d.\n", npeel);
ebfd146a 1713
720f5239 1714 iters = build_int_cst (niters_type, npeel);
cbb22e61 1715 *bound = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
ebfd146a
IR
1716 }
1717 else
1718 {
535e7c11
RS
1719 tree misalign_in_elems = get_misalign_in_elems (&stmts, loop_vinfo);
1720 tree type = TREE_TYPE (misalign_in_elems);
f702e7d4
RS
1721 HOST_WIDE_INT elem_size
1722 = int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
ca31798e
AV
1723 /* We only do prolog peeling if the target alignment is known at compile
1724 time. */
1725 poly_uint64 align_in_elems
1726 = exact_div (target_align, elem_size);
1727 tree align_in_elems_minus_1
1728 = build_int_cst (type, align_in_elems - 1);
f702e7d4 1729 tree align_in_elems_tree = build_int_cst (type, align_in_elems);
f702e7d4
RS
1730
1731 /* Create: (niters_type) ((align_in_elems - misalign_in_elems)
1732 & (align_in_elems - 1)). */
89fa689a
RS
1733 bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr),
1734 size_zero_node) < 0;
d8ba5b19 1735 if (negative)
f702e7d4
RS
1736 iters = fold_build2 (MINUS_EXPR, type, misalign_in_elems,
1737 align_in_elems_tree);
d8ba5b19 1738 else
f702e7d4
RS
1739 iters = fold_build2 (MINUS_EXPR, type, align_in_elems_tree,
1740 misalign_in_elems);
1741 iters = fold_build2 (BIT_AND_EXPR, type, iters, align_in_elems_minus_1);
ebfd146a 1742 iters = fold_convert (niters_type, iters);
ca31798e
AV
1743 unsigned HOST_WIDE_INT align_in_elems_c;
1744 if (align_in_elems.is_constant (&align_in_elems_c))
1745 *bound = align_in_elems_c - 1;
1746 else
1747 *bound = -1;
ebfd146a
IR
1748 }
1749
73fbfcad 1750 if (dump_enabled_p ())
3c2a8ed0
DM
1751 dump_printf_loc (MSG_NOTE, vect_location,
1752 "niters for prolog loop: %T\n", iters);
ebfd146a
IR
1753
1754 var = create_tmp_var (niters_type, "prolog_loop_niters");
a5e3d614 1755 iters_name = force_gimple_operand (iters, &new_stmts, false, var);
ebfd146a 1756
a5e3d614
BC
1757 if (new_stmts)
1758 gimple_seq_add_seq (&stmts, new_stmts);
ebfd146a
IR
1759 if (stmts)
1760 {
a5e3d614
BC
1761 gcc_assert (single_succ_p (bb));
1762 gimple_stmt_iterator gsi = gsi_last_bb (bb);
1763 if (gsi_end_p (gsi))
1764 gsi_insert_seq_before (&gsi, stmts, GSI_SAME_STMT);
1765 else
1766 gsi_insert_seq_after (&gsi, stmts, GSI_SAME_STMT);
ebfd146a 1767 }
b8698a0f 1768 return iters_name;
ebfd146a
IR
1769}
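/* A minimal standalone sketch of the variable-misalignment case above,
   assuming a step of 1 (hypothetical helper, not part of the source):

     static unsigned
     prolog_niters (unsigned misalign_in_elems, unsigned align_in_elems)
     {
       /* (align - misalign) & (align - 1): scalar iterations needed
	  before the access becomes aligned; 0 if already aligned.  */
       return (align_in_elems - misalign_in_elems) & (align_in_elems - 1);
     }

   With align_in_elems = 4 and misalign_in_elems = 1 this yields 3, i.e.
   three iterations are peeled into the prolog loop. */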
1770
1771
1772/* Function vect_update_init_of_dr
1773
535e7c11
RS
1774 If CODE is PLUS, the vector loop starts NITERS iterations after the
1775 scalar one, otherwise CODE is MINUS and the vector loop starts NITERS
1776 iterations before the scalar one (using masking to skip inactive
1777 elements). This function updates the information recorded in DR to
1778 account for the difference. Specifically, it updates the OFFSET
67723321 1779 field of DR_INFO. */
ebfd146a
IR
1780
1781static void
67723321 1782vect_update_init_of_dr (dr_vec_info *dr_info, tree niters, tree_code code)
ebfd146a 1783{
67723321
AV
1784 struct data_reference *dr = dr_info->dr;
1785 tree offset = dr_info->offset;
1786 if (!offset)
1787 offset = build_zero_cst (sizetype);
b8698a0f 1788
ebfd146a
IR
1789 niters = fold_build2 (MULT_EXPR, sizetype,
1790 fold_convert (sizetype, niters),
1791 fold_convert (sizetype, DR_STEP (dr)));
535e7c11 1792 offset = fold_build2 (code, sizetype,
587aa063 1793 fold_convert (sizetype, offset), niters);
67723321 1794 dr_info->offset = offset;
ebfd146a
IR
1795}
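/* For example (illustrative): peeling NITERS = 3 prolog iterations for a
   DR with DR_STEP = 8 adds 3 * 8 = 24 bytes to DR_INFO's offset when CODE
   is PLUS_EXPR, so the vector loop starts where the prolog left off;
   with MINUS_EXPR the start is instead moved 24 bytes back, matching
   peeling via masking. */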
1796
1797
1798/* Function vect_update_inits_of_drs
1799
535e7c11
RS
1800 Apply vect_update_init_of_dr to all accesses in LOOP_VINFO.
1801 CODE and NITERS are as for vect_update_init_of_dr. */
ebfd146a 1802
97c14603 1803void
535e7c11
RS
1804vect_update_inits_of_drs (loop_vec_info loop_vinfo, tree niters,
1805 tree_code code)
ebfd146a
IR
1806{
1807 unsigned int i;
9771b263 1808 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
ebfd146a 1809 struct data_reference *dr;
a5e3d614 1810
adac3a68 1811 DUMP_VECT_SCOPE ("vect_update_inits_of_drs");
a5e3d614 1812
97c14603
AV
1813 /* Adjust niters to sizetype. We used to insert the stmts on loop preheader
1814 here, but since we might use these niters to update the epilogues niters
1815 and data references we can't insert them here as this definition might not
1816 always dominate its uses. */
a5e3d614 1817 if (!types_compatible_p (sizetype, TREE_TYPE (niters)))
97c14603 1818 niters = fold_convert (sizetype, niters);
ebfd146a 1819
9771b263 1820 FOR_EACH_VEC_ELT (datarefs, i, dr)
5fa23466 1821 {
f5ae2856
RS
1822 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1823 if (!STMT_VINFO_GATHER_SCATTER_P (dr_info->stmt))
67723321 1824 vect_update_init_of_dr (dr_info, niters, code);
5fa23466 1825 }
ebfd146a
IR
1826}
1827
535e7c11
RS
1828/* For the information recorded in LOOP_VINFO prepare the loop for peeling
1829 by masking. This involves calculating the number of iterations to
1830 be peeled and then aligning all memory references appropriately. */
1831
1832void
1833vect_prepare_for_masked_peels (loop_vec_info loop_vinfo)
1834{
1835 tree misalign_in_elems;
04f0546e 1836 tree type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
535e7c11
RS
1837
1838 gcc_assert (vect_use_loop_mask_for_alignment_p (loop_vinfo));
1839
1840 /* From the information recorded in LOOP_VINFO get the number of iterations
1841 that need to be skipped via masking. */
1842 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
1843 {
1844 poly_int64 misalign = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
1845 - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo));
1846 misalign_in_elems = build_int_cst (type, misalign);
1847 }
1848 else
1849 {
1850 gimple_seq seq1 = NULL, seq2 = NULL;
1851 misalign_in_elems = get_misalign_in_elems (&seq1, loop_vinfo);
1852 misalign_in_elems = fold_convert (type, misalign_in_elems);
1853 misalign_in_elems = force_gimple_operand (misalign_in_elems,
1854 &seq2, true, NULL_TREE);
1855 gimple_seq_add_seq (&seq1, seq2);
1856 if (seq1)
1857 {
1858 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1859 basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq1);
1860 gcc_assert (!new_bb);
1861 }
1862 }
1863
1864 if (dump_enabled_p ())
3c2a8ed0
DM
1865 dump_printf_loc (MSG_NOTE, vect_location,
1866 "misalignment for fully-masked loop: %T\n",
1867 misalign_in_elems);
535e7c11
RS
1868
1869 LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo) = misalign_in_elems;
1870
1871 vect_update_inits_of_drs (loop_vinfo, misalign_in_elems, MINUS_EXPR);
1872}
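/* E.g. (illustrative): with VF = 8 and PEELING_FOR_ALIGNMENT = 3 the
   constant case above stores 8 - 3 = 5 in MASK_SKIP_NITERS: the loop
   starts from the aligned base and the first vector iteration masks
   off its first five elements, instead of running a scalar prolog. */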
ebfd146a 1873
a5e3d614 1874/* This function builds ni_name = number of iterations. Statements
7078979b
BC
1875 are emitted on the loop preheader edge. If NEW_VAR_P is not NULL, set
1876 it to TRUE if a new SSA variable was generated. */
a5e3d614
BC
1877
1878tree
7078979b 1879vect_build_loop_niters (loop_vec_info loop_vinfo, bool *new_var_p)
a5e3d614
BC
1880{
1881 tree ni = unshare_expr (LOOP_VINFO_NITERS (loop_vinfo));
1882 if (TREE_CODE (ni) == INTEGER_CST)
1883 return ni;
1884 else
1885 {
1886 tree ni_name, var;
1887 gimple_seq stmts = NULL;
1888 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1889
1890 var = create_tmp_var (TREE_TYPE (ni), "niters");
1891 ni_name = force_gimple_operand (ni, &stmts, false, var);
1892 if (stmts)
7078979b
BC
1893 {
1894 gsi_insert_seq_on_edge_immediate (pe, stmts);
1895 if (new_var_p != NULL)
1896 *new_var_p = true;
1897 }
a5e3d614
BC
1898
1899 return ni_name;
1900 }
1901}
1902
cbb22e61
BC
1903 /* Calculate the number of iterations above which the vectorized loop will be
1904 preferred over the scalar loop. NITERS_PROLOG is the number of iterations
1905 of the prolog loop. If it is an integer constant, the value is also passed
d1d20a49
RS
1906 in INT_NITERS_PROLOG. BOUND_PROLOG is the upper bound (inclusive) of the
1907 number of iterations of the prolog loop. BOUND_EPILOG is the corresponding
1908 value for the epilog loop. If CHECK_PROFITABILITY is true, TH is the
1909 threshold below which the scalar (rather than vectorized) loop will be
1910 executed. This function stores the upper bound (inclusive) of the result
1911 in BOUND_SCALAR. */
a5e3d614
BC
1912
1913static tree
1914vect_gen_scalar_loop_niters (tree niters_prolog, int int_niters_prolog,
d1d20a49 1915 int bound_prolog, poly_int64 bound_epilog, int th,
d9f21f6a
RS
1916 poly_uint64 *bound_scalar,
1917 bool check_profitability)
a5e3d614
BC
1918{
1919 tree type = TREE_TYPE (niters_prolog);
1920 tree niters = fold_build2 (PLUS_EXPR, type, niters_prolog,
d1d20a49 1921 build_int_cst (type, bound_epilog));
a5e3d614 1922
d1d20a49 1923 *bound_scalar = bound_prolog + bound_epilog;
a5e3d614
BC
1924 if (check_profitability)
1925 {
cbb22e61
BC
1926 /* TH indicates the minimum niters of vectorized loop, while we
1927 compute the maximum niters of scalar loop. */
1928 th--;
a5e3d614
BC
1929 /* Peeling for constant times. */
1930 if (int_niters_prolog >= 0)
1931 {
d1d20a49 1932 *bound_scalar = upper_bound (int_niters_prolog + bound_epilog, th);
cbb22e61 1933 return build_int_cst (type, *bound_scalar);
a5e3d614 1934 }
d1d20a49
RS
1935 /* Peeling an unknown number of times. Note that both BOUND_PROLOG
1936 and BOUND_EPILOG are inclusive upper bounds. */
1937 if (known_ge (th, bound_prolog + bound_epilog))
a5e3d614 1938 {
cbb22e61 1939 *bound_scalar = th;
a5e3d614
BC
1940 return build_int_cst (type, th);
1941 }
d9f21f6a 1942 /* Need to do runtime comparison. */
d1d20a49 1943 else if (maybe_gt (th, bound_epilog))
d9f21f6a
RS
1944 {
1945 *bound_scalar = upper_bound (*bound_scalar, th);
1946 return fold_build2 (MAX_EXPR, type,
1947 build_int_cst (type, th), niters);
1948 }
a5e3d614
BC
1949 }
1950 return niters;
1951}
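/* Worked example (illustrative): th = 8 is first decremented to 7, the
   maximum number of iterations the scalar loop may execute. With
   bound_prolog = 3 and bound_epilog = 3, but the prolog count only known
   at run time, 7 >= 3 + 3 holds, so the constant 7 is returned and
   *bound_scalar = 7. Had th been 5 (decremented to 4), then with
   4 > bound_epilog = 3 the code would instead emit
   MAX_EXPR (4, niters_prolog + 3). */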
1952
0f26839a
RS
1953/* NITERS is the number of times that the original scalar loop executes
1954 after peeling. Work out the maximum number of iterations N that can
1955 be handled by the vectorized form of the loop and then either:
ebfd146a 1956
0f26839a 1957 a) set *STEP_VECTOR_PTR to the vectorization factor and generate:
d9157f15 1958
0f26839a
RS
1959 niters_vector = N
1960
1961 b) set *STEP_VECTOR_PTR to one and generate:
1962
1963 niters_vector = N / vf
1964
1965 In both cases, store niters_vector in *NITERS_VECTOR_PTR and add
1966 any new statements on the loop preheader edge. NITERS_NO_OVERFLOW
1967 is true if NITERS doesn't overflow (i.e. if NITERS is always nonzero). */
ebfd146a
IR
1968
1969void
a5e3d614 1970vect_gen_vector_loop_niters (loop_vec_info loop_vinfo, tree niters,
0f26839a
RS
1971 tree *niters_vector_ptr, tree *step_vector_ptr,
1972 bool niters_no_overflow)
ebfd146a 1973{
a5e3d614 1974 tree ni_minus_gap, var;
0f26839a 1975 tree niters_vector, step_vector, type = TREE_TYPE (niters);
d9f21f6a 1976 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
a5e3d614 1977 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
0f26839a 1978 tree log_vf = NULL_TREE;
a5e3d614
BC
1979
1980 /* If epilogue loop is required because of data accesses with gaps, we
1981 subtract one iteration from the total number of iterations here for
1982 correct calculation of RATIO. */
1983 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
1984 {
7078979b
BC
1985 ni_minus_gap = fold_build2 (MINUS_EXPR, type, niters,
1986 build_one_cst (type));
a5e3d614
BC
1987 if (!is_gimple_val (ni_minus_gap))
1988 {
7078979b 1989 var = create_tmp_var (type, "ni_gap");
a5e3d614
BC
1990 gimple *stmts = NULL;
1991 ni_minus_gap = force_gimple_operand (ni_minus_gap, &stmts,
1992 true, var);
1993 gsi_insert_seq_on_edge_immediate (pe, stmts);
1994 }
1995 }
1996 else
1997 ni_minus_gap = niters;
1998
d9f21f6a 1999 unsigned HOST_WIDE_INT const_vf;
7cfb4d93 2000 if (vf.is_constant (&const_vf)
b3372425 2001 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
0f26839a
RS
2002 {
2003 /* Create: niters >> log2(vf) */
2004 /* If it's known that niters == number of latch executions + 1 doesn't
2005 overflow, we can generate niters >> log2(vf); otherwise we generate
2006 (niters - vf) >> log2(vf) + 1 by using the fact that we know ratio
2007 will be at least one. */
d9f21f6a 2008 log_vf = build_int_cst (type, exact_log2 (const_vf));
0f26839a
RS
2009 if (niters_no_overflow)
2010 niters_vector = fold_build2 (RSHIFT_EXPR, type, ni_minus_gap, log_vf);
2011 else
2012 niters_vector
2013 = fold_build2 (PLUS_EXPR, type,
2014 fold_build2 (RSHIFT_EXPR, type,
2015 fold_build2 (MINUS_EXPR, type,
2016 ni_minus_gap,
2017 build_int_cst (type, vf)),
2018 log_vf),
2019 build_int_cst (type, 1));
2020 step_vector = build_one_cst (type);
2021 }
a5e3d614 2022 else
0f26839a
RS
2023 {
2024 niters_vector = ni_minus_gap;
2025 step_vector = build_int_cst (type, vf);
2026 }
a5e3d614
BC
2027
2028 if (!is_gimple_val (niters_vector))
2029 {
7078979b
BC
2030 var = create_tmp_var (type, "bnd");
2031 gimple_seq stmts = NULL;
a5e3d614
BC
2032 niters_vector = force_gimple_operand (niters_vector, &stmts, true, var);
2033 gsi_insert_seq_on_edge_immediate (pe, stmts);
7078979b 2034 /* Peeling algorithm guarantees that vector loop bound is at least ONE,
cdcbef3c
RB
2035 so we set range information to make the niters analyzer's life easier.
2036 Note the number of latch iteration value can be TYPE_MAX_VALUE so
2037 we have to represent the vector niter TYPE_MAX_VALUE + 1 >> log_vf. */
0f26839a 2038 if (stmts != NULL && log_vf)
cdcbef3c
RB
2039 {
2040 if (niters_no_overflow)
2041 set_range_info (niters_vector, VR_RANGE,
2042 wi::one (TYPE_PRECISION (type)),
2043 wi::rshift (wi::max_value (TYPE_PRECISION (type),
2044 TYPE_SIGN (type)),
2045 exact_log2 (const_vf),
2046 TYPE_SIGN (type)));
2047 /* For VF == 1 the vector IV might also overflow so we cannot
2048 assert a minimum value of 1. */
2049 else if (const_vf > 1)
2050 set_range_info (niters_vector, VR_RANGE,
2051 wi::one (TYPE_PRECISION (type)),
2052 wi::rshift (wi::max_value (TYPE_PRECISION (type),
2053 TYPE_SIGN (type))
2054 - (const_vf - 1),
2055 exact_log2 (const_vf), TYPE_SIGN (type))
2056 + 1);
2057 }
a5e3d614
BC
2058 }
2059 *niters_vector_ptr = niters_vector;
0f26839a 2060 *step_vector_ptr = step_vector;
a5e3d614
BC
2061
2062 return;
2063}
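/* Worked example (illustrative): 32-bit type, const_vf = 4, no partial
   vectors. If NITERS cannot overflow the code above emits

     niters_vector = niters >> 2;

   otherwise it emits the overflow-safe form

     niters_vector = ((niters - 4) >> 2) + 1;

   For the wrapped value niters = 0 (i.e. 2^32 scalar iterations) the
   first form would give 0, while the second gives
   ((0 - 4) >> 2) + 1 = 0x3fffffff + 1 = 0x40000000 = 2^32 / 4. */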
2064
2065/* Given NITERS_VECTOR which is the number of iterations for vectorized
2066 loop specified by LOOP_VINFO after vectorization, compute the number
2067 of iterations before vectorization (niters_vector * vf) and store it
2068 in NITERS_VECTOR_MULT_VF_PTR. */
2069
2070static void
2071vect_gen_vector_loop_niters_mult_vf (loop_vec_info loop_vinfo,
2072 tree niters_vector,
2073 tree *niters_vector_mult_vf_ptr)
2074{
d9f21f6a
RS
2075 /* We should be using a step_vector of VF if VF is variable. */
2076 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo).to_constant ();
99b1c316 2077 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
a5e3d614
BC
2078 tree type = TREE_TYPE (niters_vector);
2079 tree log_vf = build_int_cst (type, exact_log2 (vf));
2080 basic_block exit_bb = single_exit (loop)->dest;
ebfd146a 2081
a5e3d614
BC
2082 gcc_assert (niters_vector_mult_vf_ptr != NULL);
2083 tree niters_vector_mult_vf = fold_build2 (LSHIFT_EXPR, type,
2084 niters_vector, log_vf);
2085 if (!is_gimple_val (niters_vector_mult_vf))
2086 {
2087 tree var = create_tmp_var (type, "niters_vector_mult_vf");
2088 gimple_seq stmts = NULL;
2089 niters_vector_mult_vf = force_gimple_operand (niters_vector_mult_vf,
2090 &stmts, true, var);
2091 gimple_stmt_iterator gsi = gsi_start_bb (exit_bb);
2092 gsi_insert_seq_before (&gsi, stmts, GSI_SAME_STMT);
2093 }
2094 *niters_vector_mult_vf_ptr = niters_vector_mult_vf;
2095}
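/* E.g. (illustrative): with vf = 8 and niters_vector = 5, the LSHIFT_EXPR
   above computes 5 << 3 = 40, the number of scalar iterations covered by
   the vector loop; the IVs are then advanced by 40 and the epilog loop
   handles the remainder. */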
2096
b81f2daf
RB
2097/* LCSSA_PHI is a lcssa phi of EPILOG loop which is copied from LOOP,
2098 this function searches for the corresponding lcssa phi node in exit
2099 bb of LOOP. If it is found, return the phi result; otherwise return
2100 NULL. */
2101
2102static tree
2103find_guard_arg (class loop *loop, class loop *epilog ATTRIBUTE_UNUSED,
2104 gphi *lcssa_phi)
2105{
2106 gphi_iterator gsi;
2107 edge e = single_exit (loop);
2108
2109 gcc_assert (single_pred_p (e->dest));
2110 for (gsi = gsi_start_phis (e->dest); !gsi_end_p (gsi); gsi_next (&gsi))
2111 {
2112 gphi *phi = gsi.phi ();
2113 if (operand_equal_p (PHI_ARG_DEF (phi, 0),
2114 PHI_ARG_DEF (lcssa_phi, 0), 0))
2115 return PHI_RESULT (phi);
2116 }
2117 return NULL_TREE;
2118}
2119
a5e3d614
BC
2120 /* Function slpeel_tree_duplicate_loop_to_edge_cfg duplicates FIRST/SECOND
2121 from SECOND/FIRST and puts it at the original loop's preheader/exit
2122 edge; the two loops are arranged as below:
2123
2124 preheader_a:
2125 first_loop:
2126 header_a:
2127 i_1 = PHI<i_0, i_2>;
2128 ...
2129 i_2 = i_1 + 1;
2130 if (cond_a)
2131 goto latch_a;
2132 else
2133 goto between_bb;
2134 latch_a:
2135 goto header_a;
2136
2137 between_bb:
2138 ;; i_x = PHI<i_2>; ;; LCSSA phi node to be created for FIRST,
2139
2140 second_loop:
2141 header_b:
2142 i_3 = PHI<i_0, i_4>; ;; Use of i_0 to be replaced with i_x,
2143 or with i_2 if no LCSSA phi is created
2144 under condition of CREATE_LCSSA_FOR_IV_PHIS.
2145 ...
2146 i_4 = i_3 + 1;
2147 if (cond_b)
2148 goto latch_b;
2149 else
2150 goto exit_bb;
2151 latch_b:
2152 goto header_b;
2153
2154 exit_bb:
2155
2156 This function creates loop closed SSA for the first loop; update the
2157 second loop's PHI nodes by replacing argument on incoming edge with the
2158 result of newly created lcssa PHI nodes. IF CREATE_LCSSA_FOR_IV_PHIS
2159 is false, Loop closed ssa phis will only be created for non-iv phis for
2160 the first loop.
2161
2162 This function assumes exit bb of the first loop is preheader bb of the
2163 second loop, i.e, between_bb in the example code. With PHIs updated,
2164 the second loop will execute the remaining iterations of the first. */
ebfd146a 2165
a5e3d614
BC
2166static void
2167slpeel_update_phi_nodes_for_loops (loop_vec_info loop_vinfo,
99b1c316 2168 class loop *first, class loop *second,
a5e3d614
BC
2169 bool create_lcssa_for_iv_phis)
2170{
2171 gphi_iterator gsi_update, gsi_orig;
99b1c316 2172 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
a5e3d614
BC
2173
2174 edge first_latch_e = EDGE_SUCC (first->latch, 0);
2175 edge second_preheader_e = loop_preheader_edge (second);
2176 basic_block between_bb = single_exit (first)->dest;
2177
2178 gcc_assert (between_bb == second_preheader_e->src);
2179 gcc_assert (single_pred_p (between_bb) && single_succ_p (between_bb));
2180 /* Either the first loop or the second is the loop to be vectorized. */
2181 gcc_assert (loop == first || loop == second);
2182
2183 for (gsi_orig = gsi_start_phis (first->header),
2184 gsi_update = gsi_start_phis (second->header);
2185 !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_update);
2186 gsi_next (&gsi_orig), gsi_next (&gsi_update))
2187 {
2188 gphi *orig_phi = gsi_orig.phi ();
2189 gphi *update_phi = gsi_update.phi ();
2190
2191 tree arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, first_latch_e);
2192 /* Generate lcssa PHI node for the first loop. */
2193 gphi *vect_phi = (loop == first) ? orig_phi : update_phi;
82570274
RS
2194 stmt_vec_info vect_phi_info = loop_vinfo->lookup_stmt (vect_phi);
2195 if (create_lcssa_for_iv_phis || !iv_phi_p (vect_phi_info))
a5e3d614
BC
2196 {
2197 tree new_res = copy_ssa_name (PHI_RESULT (orig_phi));
2198 gphi *lcssa_phi = create_phi_node (new_res, between_bb);
2199 add_phi_arg (lcssa_phi, arg, single_exit (first), UNKNOWN_LOCATION);
2200 arg = new_res;
2201 }
2202
2203 /* Update PHI node in the second loop by replacing arg on the loop's
2204 incoming edge. */
2205 adjust_phi_and_debug_stmts (update_phi, second_preheader_e, arg);
2206 }
b81f2daf
RB
2207
2208 /* For epilogue peeling we have to make sure to copy all LC PHIs
2209 for correct vectorization of live stmts. */
2210 if (loop == first)
2211 {
2212 basic_block orig_exit = single_exit (second)->dest;
2213 for (gsi_orig = gsi_start_phis (orig_exit);
2214 !gsi_end_p (gsi_orig); gsi_next (&gsi_orig))
2215 {
2216 gphi *orig_phi = gsi_orig.phi ();
2217 tree orig_arg = PHI_ARG_DEF (orig_phi, 0);
2218 if (TREE_CODE (orig_arg) != SSA_NAME || virtual_operand_p (orig_arg))
2219 continue;
2220
2221 /* Already created in the above loop. */
2222 if (find_guard_arg (first, second, orig_phi))
2223 continue;
2224
2225 tree new_res = copy_ssa_name (orig_arg);
2226 gphi *lcphi = create_phi_node (new_res, between_bb);
2227 add_phi_arg (lcphi, orig_arg, single_exit (first), UNKNOWN_LOCATION);
2228 }
2229 }
a5e3d614
BC
2230}
2231
2232/* Function slpeel_add_loop_guard adds guard skipping from the beginning
2233 of SKIP_LOOP to the beginning of UPDATE_LOOP. GUARD_EDGE and MERGE_EDGE
2234 are two pred edges of the merge point before UPDATE_LOOP. The two loops
2235 appear like below:
2236
2237 guard_bb:
2238 if (cond)
2239 goto merge_bb;
2240 else
2241 goto skip_loop;
2242
2243 skip_loop:
2244 header_a:
2245 i_1 = PHI<i_0, i_2>;
2246 ...
2247 i_2 = i_1 + 1;
2248 if (cond_a)
2249 goto latch_a;
2250 else
2251 goto exit_a;
2252 latch_a:
2253 goto header_a;
2254
2255 exit_a:
2256 i_5 = PHI<i_2>;
2257
2258 merge_bb:
2259 ;; PHI (i_x = PHI<i_0, i_5>) to be created at merge point.
2260
2261 update_loop:
2262 header_b:
2263 i_3 = PHI<i_5, i_4>; ;; Use of i_5 to be replaced with i_x.
2264 ...
2265 i_4 = i_3 + 1;
2266 if (cond_b)
2267 goto latch_b;
2268 else
2269 goto exit_bb;
2270 latch_b:
2271 goto header_b;
2272
2273 exit_bb:
2274
2275 This function creates PHI nodes at merge_bb and replaces the use of i_5
2276 in the update_loop's PHI node with the new PHI result. */
2277
2278static void
99b1c316
MS
2279slpeel_update_phi_nodes_for_guard1 (class loop *skip_loop,
2280 class loop *update_loop,
a5e3d614
BC
2281 edge guard_edge, edge merge_edge)
2282{
620e594b 2283 location_t merge_loc, guard_loc;
a5e3d614
BC
2284 edge orig_e = loop_preheader_edge (skip_loop);
2285 edge update_e = loop_preheader_edge (update_loop);
2286 gphi_iterator gsi_orig, gsi_update;
2287
2288 for ((gsi_orig = gsi_start_phis (skip_loop->header),
2289 gsi_update = gsi_start_phis (update_loop->header));
2290 !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_update);
2291 gsi_next (&gsi_orig), gsi_next (&gsi_update))
2292 {
2293 gphi *orig_phi = gsi_orig.phi ();
2294 gphi *update_phi = gsi_update.phi ();
2295
2296 /* Generate new phi node at merge bb of the guard. */
2297 tree new_res = copy_ssa_name (PHI_RESULT (orig_phi));
2298 gphi *new_phi = create_phi_node (new_res, guard_edge->dest);
2299
2300 /* Merge bb has two incoming edges: GUARD_EDGE and MERGE_EDGE. Set the
2301 args in NEW_PHI for these edges. */
2302 tree merge_arg = PHI_ARG_DEF_FROM_EDGE (update_phi, update_e);
2303 tree guard_arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, orig_e);
2304 merge_loc = gimple_phi_arg_location_from_edge (update_phi, update_e);
2305 guard_loc = gimple_phi_arg_location_from_edge (orig_phi, orig_e);
2306 add_phi_arg (new_phi, merge_arg, merge_edge, merge_loc);
2307 add_phi_arg (new_phi, guard_arg, guard_edge, guard_loc);
2308
2309 /* Update phi in UPDATE_PHI. */
2310 adjust_phi_and_debug_stmts (update_phi, update_e, new_res);
2311 }
2312}
2313
a5e3d614
BC
2314/* LOOP and EPILOG are two consecutive loops in CFG and EPILOG is copied
2315 from LOOP. Function slpeel_add_loop_guard adds guard skipping from a
2316 point between the two loops to the end of EPILOG. Edges GUARD_EDGE
2317 and MERGE_EDGE are the two pred edges of merge_bb at the end of EPILOG.
2318 The CFG looks like:
2319
2320 loop:
2321 header_a:
2322 i_1 = PHI<i_0, i_2>;
2323 ...
2324 i_2 = i_1 + 1;
2325 if (cond_a)
2326 goto latch_a;
2327 else
2328 goto exit_a;
2329 latch_a:
2330 goto header_a;
2331
2332 exit_a:
2333
2334 guard_bb:
2335 if (cond)
2336 goto merge_bb;
2337 else
2338 goto epilog_loop;
2339
2340 ;; fall_through_bb
2341
2342 epilog_loop:
2343 header_b:
2344 i_3 = PHI<i_2, i_4>;
2345 ...
2346 i_4 = i_3 + 1;
2347 if (cond_b)
2348 goto latch_b;
2349 else
2350 goto merge_bb;
2351 latch_b:
2352 goto header_b;
2353
2354 merge_bb:
2355 ; PHI node (i_y = PHI<i_2, i_4>) to be created at merge point.
2356
2357 exit_bb:
2358 i_x = PHI<i_4>; ;Use of i_4 to be replaced with i_y in merge_bb.
2359
2360 For each name used outside EPILOG (i.e. for each name that has an lcssa
2361 phi in exit_bb) we create a new PHI in merge_bb. The new PHI has two
2362 args corresponding to GUARD_EDGE and MERGE_EDGE. Arg for MERGE_EDGE is
2363 the arg of the original PHI in exit_bb, arg for GUARD_EDGE is defined
2364 by LOOP and is found in the exit bb of LOOP. Arg of the original PHI
2365 in exit_bb will also be updated. */
2366
2367static void
99b1c316 2368slpeel_update_phi_nodes_for_guard2 (class loop *loop, class loop *epilog,
a5e3d614
BC
2369 edge guard_edge, edge merge_edge)
2370{
2371 gphi_iterator gsi;
2372 basic_block merge_bb = guard_edge->dest;
2373
2374 gcc_assert (single_succ_p (merge_bb));
2375 edge e = single_succ_edge (merge_bb);
2376 basic_block exit_bb = e->dest;
2377 gcc_assert (single_pred_p (exit_bb));
2378 gcc_assert (single_pred (exit_bb) == single_exit (epilog)->dest);
2379
2380 for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi); gsi_next (&gsi))
2381 {
2382 gphi *update_phi = gsi.phi ();
2383 tree old_arg = PHI_ARG_DEF (update_phi, 0);
a5e3d614 2384
2e7a4f57
AV
2385 tree merge_arg = NULL_TREE;
2386
2387 /* If the old argument is a SSA_NAME use its current_def. */
2388 if (TREE_CODE (old_arg) == SSA_NAME)
2389 merge_arg = get_current_def (old_arg);
2390 /* If it's a constant or doesn't have a current_def, just use the old
2391 argument. */
a5e3d614
BC
2392 if (!merge_arg)
2393 merge_arg = old_arg;
2394
2395 tree guard_arg = find_guard_arg (loop, epilog, update_phi);
2396 /* If the var is live after loop but not a reduction, we simply
2397 use the old arg. */
2398 if (!guard_arg)
2399 guard_arg = old_arg;
2400
2401 /* Create new phi node in MERGE_BB: */
2402 tree new_res = copy_ssa_name (PHI_RESULT (update_phi));
2403 gphi *merge_phi = create_phi_node (new_res, merge_bb);
2404
2405 /* MERGE_BB has two incoming edges: GUARD_EDGE and MERGE_EDGE, Set
2406 the two PHI args in merge_phi for these edges. */
2407 add_phi_arg (merge_phi, merge_arg, merge_edge, UNKNOWN_LOCATION);
2408 add_phi_arg (merge_phi, guard_arg, guard_edge, UNKNOWN_LOCATION);
2409
2410 /* Update the original phi in exit_bb. */
2411 adjust_phi_and_debug_stmts (update_phi, e, new_res);
2412 }
2413}
2414
2415/* EPILOG loop is duplicated from the original loop for vectorizing,
2416 the arg of its loop closed ssa PHI needs to be updated. */
2417
2418static void
99b1c316 2419slpeel_update_phi_nodes_for_lcssa (class loop *epilog)
a5e3d614
BC
2420{
2421 gphi_iterator gsi;
2422 basic_block exit_bb = single_exit (epilog)->dest;
2423
2424 gcc_assert (single_pred_p (exit_bb));
2425 edge e = EDGE_PRED (exit_bb, 0);
2426 for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi); gsi_next (&gsi))
2427 rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (gsi.phi (), e));
2428}
2429
4452a766
RS
2430/* EPILOGUE_VINFO is an epilogue loop that we now know would need to
2431 iterate exactly CONST_NITERS times. Make a final decision about
2432 whether the epilogue loop should be used, returning true if so. */
2433
2434static bool
2435vect_update_epilogue_niters (loop_vec_info epilogue_vinfo,
2436 unsigned HOST_WIDE_INT const_niters)
2437{
2438 /* Avoid wrap-around when computing const_niters - 1. Also reject
2439 using an epilogue loop for a single scalar iteration, even if
2440 we could in principle implement that using partial vectors. */
2441 unsigned int gap_niters = LOOP_VINFO_PEELING_FOR_GAPS (epilogue_vinfo);
2442 if (const_niters <= gap_niters + 1)
2443 return false;
2444
2445 /* Install the number of iterations. */
2446 tree niters_type = TREE_TYPE (LOOP_VINFO_NITERS (epilogue_vinfo));
2447 tree niters_tree = build_int_cst (niters_type, const_niters);
2448 tree nitersm1_tree = build_int_cst (niters_type, const_niters - 1);
2449
2450 LOOP_VINFO_NITERS (epilogue_vinfo) = niters_tree;
2451 LOOP_VINFO_NITERSM1 (epilogue_vinfo) = nitersm1_tree;
2452
2453 /* Decide what to do if the number of epilogue iterations is not
2454 a multiple of the epilogue loop's vectorization factor. */
2455 return vect_determine_partial_vectors_and_peeling (epilogue_vinfo, true);
2456}
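/* E.g. (illustrative): with PEELING_FOR_GAPS = 1, const_niters = 2 fails
   the check above (2 <= 1 + 1) and the epilogue is rejected, since it
   would cover only a single usable scalar iteration; const_niters = 6
   passes and installs NITERS = 6, NITERSM1 = 5 before the partial-vector
   decision is made. */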
2457
1583b8bf
RS
2458/* LOOP_VINFO is an epilogue loop whose corresponding main loop can be skipped.
2459 Return a value that equals:
2460
2461 - MAIN_LOOP_VALUE when LOOP_VINFO is entered from the main loop and
2462 - SKIP_VALUE when the main loop is skipped. */
2463
2464tree
2465vect_get_main_loop_result (loop_vec_info loop_vinfo, tree main_loop_value,
2466 tree skip_value)
2467{
2468 gcc_assert (loop_vinfo->main_loop_edge);
2469
2470 tree phi_result = make_ssa_name (TREE_TYPE (main_loop_value));
2471 basic_block bb = loop_vinfo->main_loop_edge->dest;
2472 gphi *new_phi = create_phi_node (phi_result, bb);
2473 add_phi_arg (new_phi, main_loop_value, loop_vinfo->main_loop_edge,
2474 UNKNOWN_LOCATION);
2475 add_phi_arg (new_phi, skip_value,
2476 loop_vinfo->skip_main_loop_edge, UNKNOWN_LOCATION);
2477 return phi_result;
2478}
2479
a5e3d614
BC
2480/* Function vect_do_peeling.
2481
2482 Input:
2483 - LOOP_VINFO: Represent a loop to be vectorized, which looks like:
2484
2485 preheader:
2486 LOOP:
2487 header_bb:
2488 loop_body
2489 if (exit_loop_cond) goto exit_bb
2490 else goto header_bb
2491 exit_bb:
2492
2493 - NITERS: The number of iterations of the loop.
2494 - NITERSM1: The number of iterations of the loop's latch.
2495 - NITERS_NO_OVERFLOW: No overflow in computing NITERS.
2496 - TH, CHECK_PROFITABILITY: Threshold of niters to vectorize loop if
2497 CHECK_PROFITABILITY is true.
2498 Output:
0f26839a 2499 - *NITERS_VECTOR and *STEP_VECTOR describe how the main loop should
7cfb4d93 2500 iterate after vectorization; see vect_set_loop_condition for details.
0f26839a
RS
2501 - *NITERS_VECTOR_MULT_VF_VAR is either null or an SSA name that
2502 should be set to the number of scalar iterations handled by the
2503 vector loop. The SSA name is only used on exit from the loop.
a5e3d614
BC
2504
2505 This function peels prolog and epilog from the loop, adds guards skipping
2506 PROLOG and EPILOG for various conditions. As a result, the changed CFG
2507 would look like:
2508
2509 guard_bb_1:
2510 if (prefer_scalar_loop) goto merge_bb_1
2511 else goto guard_bb_2
2512
2513 guard_bb_2:
2514 if (skip_prolog) goto merge_bb_2
2515 else goto prolog_preheader
2516
2517 prolog_preheader:
2518 PROLOG:
2519 prolog_header_bb:
2520 prolog_body
2521 if (exit_prolog_cond) goto prolog_exit_bb
2522 else goto prolog_header_bb
2523 prolog_exit_bb:
2524
2525 merge_bb_2:
2526
2527 vector_preheader:
2528 VECTOR LOOP:
2529 vector_header_bb:
2530 vector_body
2531 if (exit_vector_cond) goto vector_exit_bb
2532 else goto vector_header_bb
2533 vector_exit_bb:
2534
2535 guard_bb_3:
2536 if (skip_epilog) goto merge_bb_3
2537 else goto epilog_preheader
2538
2539 merge_bb_1:
2540
2541 epilog_preheader:
2542 EPILOG:
2543 epilog_header_bb:
2544 epilog_body
2545 if (exit_epilog_cond) goto merge_bb_3
2546 else goto epilog_header_bb
2547
2548 merge_bb_3:
2549
2550 Note this function peels prolog and epilog only if it's necessary,
2551 as well as guards.
97c14603
AV
2552 This function returns the epilogue loop if a decision was made to vectorize
2553 it, otherwise NULL.
2554
2555 The analysis resulting in this epilogue loop's loop_vec_info was performed
2556 in the same vect_analyze_loop call as the main loop's. At that time
2557 vect_analyze_loop constructs a list of accepted loop_vec_info's for lower
2558 vectorization factors than the main loop. This list is stored in the main
2559 loop's loop_vec_info in the 'epilogue_vinfos' member. Every time we decide to
2560 vectorize the epilogue loop for a lower vectorization factor, the
2561 loop_vec_info sitting at the top of the epilogue_vinfos list is removed,
2562 updated and linked to the epilogue loop. This is later used to vectorize
2563 the epilogue. The reason the loop_vec_info needs updating is that it was
2564 constructed based on the original main loop, and the epilogue loop is a
2565 copy of this loop, so all links pointing to statements in the original loop
2566 need updating. Furthermore, these loop_vec_infos share the
2567 data_reference's records, which will also need to be updated.
a5e3d614
BC
2568
2569 TODO: Guard for prefer_scalar_loop should be emitted along with
2570 versioning conditions if loop versioning is needed. */
2571
598eaaa2 2572
99b1c316 2573class loop *
a5e3d614 2574vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
0f26839a
RS
2575 tree *niters_vector, tree *step_vector,
2576 tree *niters_vector_mult_vf_var, int th,
97c14603 2577 bool check_profitability, bool niters_no_overflow,
67723321 2578 tree *advance)
a5e3d614
BC
2579{
2580 edge e, guard_e;
2581 tree type = TREE_TYPE (niters), guard_cond;
2582 basic_block guard_bb, guard_to;
357067f2 2583 profile_probability prob_prolog, prob_vector, prob_epilog;
d9f21f6a 2584 int estimated_vf;
535e7c11 2585 int prolog_peeling = 0;
97c14603 2586 bool vect_epilogues = loop_vinfo->epilogue_vinfos.length () > 0;
4452a766 2587 bool vect_epilogues_updated_niters = false;
ca31798e
AV
2588 /* We currently do not support prolog peeling if the target alignment is not
2589 known at compile time. 'vect_gen_prolog_loop_niters' depends on the
2590 target alignment being constant. */
2591 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2592 if (dr_info && !DR_TARGET_ALIGNMENT (dr_info).is_constant ())
2593 return NULL;
2594
535e7c11
RS
2595 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2596 prolog_peeling = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
d1d20a49
RS
2597
2598 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2599 poly_uint64 bound_epilog = 0;
b3372425 2600 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
d1d20a49
RS
2601 && LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2602 bound_epilog += vf - 1;
2603 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2604 bound_epilog += 1;
2605 bool epilog_peeling = maybe_ne (bound_epilog, 0U);
2606 poly_uint64 bound_scalar = bound_epilog;
a5e3d614
BC
2607
2608 if (!prolog_peeling && !epilog_peeling)
598eaaa2 2609 return NULL;
a5e3d614 2610
27c14056
RB
2611 /* Before doing any peeling make sure to reset debug binds outside of
2612 the loop referring to defs not in LC SSA. */
2613 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2614 for (unsigned i = 0; i < loop->num_nodes; ++i)
2615 {
2616 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2617 imm_use_iterator ui;
2618 gimple *use_stmt;
2619 for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
2620 gsi_next (&gsi))
2621 {
2622 FOR_EACH_IMM_USE_STMT (use_stmt, ui, gimple_phi_result (gsi.phi ()))
2623 if (gimple_debug_bind_p (use_stmt)
2624 && loop != gimple_bb (use_stmt)->loop_father
2625 && !flow_loop_nested_p (loop,
2626 gimple_bb (use_stmt)->loop_father))
2627 {
2628 gimple_debug_bind_reset_value (use_stmt);
2629 update_stmt (use_stmt);
2630 }
2631 }
2632 for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
2633 gsi_next (&gsi))
2634 {
2635 ssa_op_iter op_iter;
2636 def_operand_p def_p;
2637 FOR_EACH_SSA_DEF_OPERAND (def_p, gsi_stmt (gsi), op_iter, SSA_OP_DEF)
2638 FOR_EACH_IMM_USE_STMT (use_stmt, ui, DEF_FROM_PTR (def_p))
2639 if (gimple_debug_bind_p (use_stmt)
2640 && loop != gimple_bb (use_stmt)->loop_father
2641 && !flow_loop_nested_p (loop,
2642 gimple_bb (use_stmt)->loop_father))
2643 {
2644 gimple_debug_bind_reset_value (use_stmt);
2645 update_stmt (use_stmt);
2646 }
2647 }
2648 }
2649
357067f2 2650 prob_vector = profile_probability::guessed_always ().apply_scale (9, 10);
d9f21f6a
RS
2651 estimated_vf = vect_vf_for_cost (loop_vinfo);
2652 if (estimated_vf == 2)
2653 estimated_vf = 3;
357067f2 2654 prob_prolog = prob_epilog = profile_probability::guessed_always ()
d9f21f6a 2655 .apply_scale (estimated_vf - 1, estimated_vf);
a5e3d614 2656
27c14056 2657 class loop *prolog, *epilog = NULL;
99b1c316 2658 class loop *first_loop = loop;
3e907b90 2659 bool irred_flag = loop_preheader_edge (loop)->flags & EDGE_IRREDUCIBLE_LOOP;
e3969868
RS
2660
2661 /* We might have a queued need to update virtual SSA form. As we
2662 delete the update SSA machinery below after doing a regular
2663 incremental SSA update during loop copying make sure we don't
2664 lose that fact.
2665 ??? Needing to update virtual SSA form by renaming is unfortunate
2666 but not all of the vectorizer code inserting new loads / stores
2667 properly assigns virtual operands to those statements. */
a5e3d614
BC
2668 update_ssa (TODO_update_ssa_only_virtuals);
2669
e3969868
RS
2670 create_lcssa_for_virtual_phi (loop);
2671
2672 /* If we're vectorizing an epilogue loop, the update_ssa above will
2673 have ensured that the virtual operand is in SSA form throughout the
2674 vectorized main loop. Normally it is possible to trace the updated
2675 vector-stmt vdefs back to scalar-stmt vdefs and vector-stmt vuses
2676 back to scalar-stmt vuses, meaning that the effect of the SSA update
2677 remains local to the main loop. However, there are rare cases in
2678 which the vectorized loop has vdefs even when the original scalar
2679 loop didn't. For example, vectorizing a load with IFN_LOAD_LANES
2680 introduces clobbers of the temporary vector array, which in turn
2681 needs new vdefs. If the scalar loop doesn't write to memory, these
2682 new vdefs will be the only ones in the vector loop.
2683
2684 In that case, update_ssa will have added a new virtual phi to the
2685 main loop, which previously didn't need one. Ensure that we (locally)
2686 maintain LCSSA form for the virtual operand, just as we would have
2687 done if the virtual phi had existed from the outset. This makes it
2688 easier to duplicate the scalar epilogue loop below. */
2689 tree vop_to_rename = NULL_TREE;
2690 if (loop_vec_info orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo))
2691 {
2692 class loop *orig_loop = LOOP_VINFO_LOOP (orig_loop_vinfo);
2693 vop_to_rename = create_lcssa_for_virtual_phi (orig_loop);
2694 }
2695
36f52e8f 2696 if (MAY_HAVE_DEBUG_BIND_STMTS)
a5e3d614
BC
2697 {
2698 gcc_assert (!adjust_vec.exists ());
2699 adjust_vec.create (32);
2700 }
ebfd146a
IR
2701 initialize_original_copy_tables ();
2702
d1d20a49
RS
2703 /* Record the anchor bb at which the guard should be placed if the scalar
2704 loop might be preferred. */
2705 basic_block anchor = loop_preheader_edge (loop)->src;
2706
2707 /* Generate the number of iterations for the prolog loop. We do this here
2708 so that we can also get the upper bound on the number of iterations. */
2709 tree niters_prolog;
2710 int bound_prolog = 0;
2711 if (prolog_peeling)
2712 niters_prolog = vect_gen_prolog_loop_niters (loop_vinfo, anchor,
97c14603 2713 &bound_prolog);
d1d20a49
RS
2714 else
2715 niters_prolog = build_int_cst (type, 0);
2716
97c14603
AV
2717 loop_vec_info epilogue_vinfo = NULL;
2718 if (vect_epilogues)
2719 {
2720 epilogue_vinfo = loop_vinfo->epilogue_vinfos[0];
2721 loop_vinfo->epilogue_vinfos.ordered_remove (0);
2722 }
2723
2724 tree niters_vector_mult_vf = NULL_TREE;
2725 /* Save NITERS before the loop, as it may be changed by the prologue. */
2726 tree before_loop_niters = LOOP_VINFO_NITERS (loop_vinfo);
2727 edge update_e = NULL, skip_e = NULL;
2728 unsigned int lowest_vf = constant_lower_bound (vf);
2729 /* If we know the number of scalar iterations for the main loop we should
2730 check whether after the main loop there are enough iterations left over
2731 for the epilogue. */
2732 if (vect_epilogues
2733 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2734 && prolog_peeling >= 0
4452a766 2735 && known_eq (vf, lowest_vf))
97c14603
AV
2736 {
2737 unsigned HOST_WIDE_INT eiters
2738 = (LOOP_VINFO_INT_NITERS (loop_vinfo)
2739 - LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo));
2740
2741 eiters -= prolog_peeling;
2742 eiters
2743 = eiters % lowest_vf + LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo);
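 /* Worked example (illustrative): LOOP_VINFO_INT_NITERS = 23,
 prolog_peeling = 3, lowest_vf = 8, no peeling for gaps:
 eiters = (23 - 0 - 3) % 8 + 0 = 4, so the epilogue loop would
 have to handle four scalar iterations. */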
2744
4452a766 2745 while (!vect_update_epilogue_niters (epilogue_vinfo, eiters))
97c14603
AV
2746 {
2747 delete epilogue_vinfo;
2748 epilogue_vinfo = NULL;
2749 if (loop_vinfo->epilogue_vinfos.length () == 0)
2750 {
2751 vect_epilogues = false;
2752 break;
2753 }
2754 epilogue_vinfo = loop_vinfo->epilogue_vinfos[0];
2755 loop_vinfo->epilogue_vinfos.ordered_remove (0);
2756 }
4452a766 2757 vect_epilogues_updated_niters = true;
97c14603 2758 }
a5e3d614
BC
2759 /* Prolog loop may be skipped. */
2760 bool skip_prolog = (prolog_peeling != 0);
97c14603
AV
2761 /* Skip the vector loop entirely, going straight to the epilog, when there
2762 are not enough iterations to enter the vectorized loop. If this is
2763 possible we emit a runtime check on NITERS to decide at run time
2764 whether the vectorized loop should be skipped. If we know the number
2765 of scalar iterations, we add such a check only when that number might
2766 be smaller than the number of iterations required to enter the
2767 vectorized loop; for this comparison we use the upper bounds on the
2768 prolog and epilog peeling. When the number of iterations is unknown
2769 and no versioning is required, the check is unnecessary because we
2770 have already asserted that there are enough scalar iterations to enter
2771 the main loop. When we are versioning, we only add such a skip if we
2772 have chosen to vectorize the epilogue. */
d1d20a49
RS
2773 bool skip_vector = (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2774 ? maybe_lt (LOOP_VINFO_INT_NITERS (loop_vinfo),
2775 bound_prolog + bound_epilog)
97c14603
AV
2776 : (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
2777 || vect_epilogues));
a5e3d614
BC
2778 /* Epilog loop must be executed if the number of iterations for epilog
2779 loop is known at compile time, otherwise we need to add a check at
2780 the end of vector loop and skip to the end of epilog loop. */
2781 bool skip_epilog = (prolog_peeling < 0
d9f21f6a
RS
2782 || !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2783 || !vf.is_constant ());
a5e3d614
BC
2784 /* PEELING_FOR_GAPS is special because epilog loop must be executed. */
2785 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2786 skip_epilog = false;
2787
a5e3d614 2788 if (skip_vector)
25c99850
BC
2789 {
2790 split_edge (loop_preheader_edge (loop));
2791
2792 /* Due to the order in which we peel prolog and epilog, we first
2793 propagate probability to the whole loop. The purpose is to
2794 avoid adjusting probabilities of both prolog and vector loops
2795 separately. Note in this case, the probability of epilog loop
2796 needs to be scaled back later. */
2797 basic_block bb_before_loop = loop_preheader_edge (loop)->src;
357067f2 2798 if (prob_vector.initialized_p ())
d9f21f6a
RS
2799 {
2800 scale_bbs_frequencies (&bb_before_loop, 1, prob_vector);
2801 scale_loop_profile (loop, prob_vector, 0);
2802 }
25c99850 2803 }
a5e3d614 2804
4f5b9c80 2805 dump_user_location_t loop_loc = find_loop_location (loop);
99b1c316 2806 class loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
97c14603
AV
2807 if (vect_epilogues)
2808 /* Make sure to set the epilogue's epilogue scalar loop, such that we can
2809 use the original scalar loop as remaining epilogue if necessary. */
2810 LOOP_VINFO_SCALAR_LOOP (epilogue_vinfo)
2811 = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
2812
a5e3d614 2813 if (prolog_peeling)
5d2eb24b 2814 {
a5e3d614
BC
2815 e = loop_preheader_edge (loop);
2816 if (!slpeel_can_duplicate_loop_p (loop, e))
5d2eb24b 2817 {
a5e3d614
BC
2818 dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2819 "loop can't be duplicated to preheader edge.\n");
2820 gcc_unreachable ();
2821 }
2822 /* Peel prolog and put it on preheader edge of loop. */
2823 prolog = slpeel_tree_duplicate_loop_to_edge_cfg (loop, scalar_loop, e);
2824 if (!prolog)
2825 {
2826 dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2827 "slpeel_tree_duplicate_loop_to_edge_cfg failed.\n");
2828 gcc_unreachable ();
2829 }
03001a35 2830 prolog->force_vectorize = false;
a5e3d614
BC
2831 slpeel_update_phi_nodes_for_loops (loop_vinfo, prolog, loop, true);
2832 first_loop = prolog;
2833 reset_original_copy_tables ();
2834
d1d20a49 2835 /* Update the number of iterations for prolog loop. */
0f26839a 2836 tree step_prolog = build_one_cst (TREE_TYPE (niters_prolog));
7cfb4d93
RS
2837 vect_set_loop_condition (prolog, NULL, niters_prolog,
2838 step_prolog, NULL_TREE, false);
a5e3d614
BC
2839
2840 /* Skip the prolog loop. */
2841 if (skip_prolog)
2842 {
2843 guard_cond = fold_build2 (EQ_EXPR, boolean_type_node,
2844 niters_prolog, build_int_cst (type, 0));
2845 guard_bb = loop_preheader_edge (prolog)->src;
25c99850 2846 basic_block bb_after_prolog = loop_preheader_edge (loop)->src;
a5e3d614
BC
2847 guard_to = split_edge (loop_preheader_edge (loop));
2848 guard_e = slpeel_add_loop_guard (guard_bb, guard_cond,
2849 guard_to, guard_bb,
357067f2 2850 prob_prolog.invert (),
3e907b90 2851 irred_flag);
a5e3d614
BC
2852 e = EDGE_PRED (guard_to, 0);
2853 e = (e != guard_e ? e : EDGE_PRED (guard_to, 1));
2854 slpeel_update_phi_nodes_for_guard1 (prolog, loop, guard_e, e);
25c99850 2855
af2bbc51
JH
2856 scale_bbs_frequencies (&bb_after_prolog, 1, prob_prolog);
2857 scale_loop_profile (prolog, prob_prolog, bound_prolog);
a5e3d614 2858 }
97c14603 2859
a5e3d614 2860 /* Update init address of DRs. */
535e7c11 2861 vect_update_inits_of_drs (loop_vinfo, niters_prolog, PLUS_EXPR);
a5e3d614
BC
2862 /* Update niters for vector loop. */
2863 LOOP_VINFO_NITERS (loop_vinfo)
2864 = fold_build2 (MINUS_EXPR, type, niters, niters_prolog);
2865 LOOP_VINFO_NITERSM1 (loop_vinfo)
2866 = fold_build2 (MINUS_EXPR, type,
2867 LOOP_VINFO_NITERSM1 (loop_vinfo), niters_prolog);
7078979b
BC
2868 bool new_var_p = false;
2869 niters = vect_build_loop_niters (loop_vinfo, &new_var_p);
2870 /* It's guaranteed that vector loop bound before vectorization is at
2871 least VF, so set range information for newly generated var. */
2872 if (new_var_p)
2873 set_range_info (niters, VR_RANGE,
8e6cdc90
RS
2874 wi::to_wide (build_int_cst (type, vf)),
2875 wi::to_wide (TYPE_MAX_VALUE (type)));
a5e3d614 2876
cbb22e61
BC
2877 /* Prolog iterates at most bound_prolog times, latch iterates at
2878 most bound_prolog - 1 times. */
2879 record_niter_bound (prolog, bound_prolog - 1, false, true);
a5e3d614
BC
2880 delete_update_ssa ();
2881 adjust_vec_debug_stmts ();
2882 scev_reset ();
5d2eb24b
IR
2883 }
2884
a5e3d614
BC
2885 if (epilog_peeling)
2886 {
2887 e = single_exit (loop);
2888 if (!slpeel_can_duplicate_loop_p (loop, e))
2889 {
2890 dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2891 "loop can't be duplicated to exit edge.\n");
2892 gcc_unreachable ();
2893 }
97c14603
AV
2894 /* Peel epilog and put it on exit edge of loop. If we are vectorizing
2895 said epilog then we should use a copy of the main loop as a starting
2896 point. This loop may have already had some preliminary transformations
2897 to allow for more optimal vectorization, for example if-conversion.
2898 If we are not vectorizing the epilog then we should use the scalar loop
2899 as the transformations mentioned above make little or no sense when not
2900 vectorizing. */
2901 epilog = vect_epilogues ? get_loop_copy (loop) : scalar_loop;
e3969868
RS
2902 if (vop_to_rename)
2903 {
2904 /* Vectorizing the main loop can sometimes introduce a vdef to
2905 a loop that previously didn't have one; see the comment above
2906 the definition of VOP_TO_RENAME for details. The definition
2907 D that holds on E will then be different from the definition
2908 VOP_TO_RENAME that holds during SCALAR_LOOP, so we need to
2909 rename VOP_TO_RENAME to D when copying the loop.
2910
2911 The virtual operand is in LCSSA form for the main loop,
2912 and no stmt between the main loop and E needs a vdef,
2913 so we know that D is provided by a phi rather than by a
2914 vdef on a normal gimple stmt. */
2915 basic_block vdef_bb = e->src;
2916 gphi *vphi;
2917 while (!(vphi = get_virtual_phi (vdef_bb)))
2918 vdef_bb = get_immediate_dominator (CDI_DOMINATORS, vdef_bb);
2919 gcc_assert (vop_to_rename != gimple_phi_result (vphi));
2920 set_current_def (vop_to_rename, gimple_phi_result (vphi));
2921 }
97c14603 2922 epilog = slpeel_tree_duplicate_loop_to_edge_cfg (loop, epilog, e);
a5e3d614
BC
2923 if (!epilog)
2924 {
2925 dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2926 "slpeel_tree_duplicate_loop_to_edge_cfg failed.\n");
2927 gcc_unreachable ();
2928 }
03001a35 2929 epilog->force_vectorize = false;
a5e3d614
BC
2930 slpeel_update_phi_nodes_for_loops (loop_vinfo, loop, epilog, false);
2931
2932 /* Scalar version loop may be preferred. In this case, add guard
2933 and skip to epilog. Note this only happens when the number of
2934 iterations of loop is unknown at compile time, otherwise this
2935 won't be vectorized. */
2936 if (skip_vector)
2937 {
cbb22e61 2938 /* Additional epilogue iteration is peeled if gap exists. */
a5e3d614 2939 tree t = vect_gen_scalar_loop_niters (niters_prolog, prolog_peeling,
d1d20a49 2940 bound_prolog, bound_epilog,
cbb22e61 2941 th, &bound_scalar,
a5e3d614 2942 check_profitability);
cbb22e61
BC
2943 /* Build guard against NITERSM1 since NITERS may overflow. */
2944 guard_cond = fold_build2 (LT_EXPR, boolean_type_node, nitersm1, t);
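          /* Illustration (not from the sources): with a 32-bit unsigned IV,
             a loop that executes 2^32 times has NITERS == 0 after
             wrap-around, while NITERSM1 == 0xffffffff is still exact, so
             the comparison NITERSM1 < T stays correct.  */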
          guard_bb = anchor;
          guard_to = split_edge (loop_preheader_edge (epilog));
          guard_e = slpeel_add_loop_guard (guard_bb, guard_cond,
                                           guard_to, guard_bb,
                                           prob_vector.invert (),
                                           irred_flag);
          skip_e = guard_e;
          e = EDGE_PRED (guard_to, 0);
          e = (e != guard_e ? e : EDGE_PRED (guard_to, 1));
          slpeel_update_phi_nodes_for_guard1 (first_loop, epilog, guard_e, e);

          /* Simply propagate profile info from guard_bb to guard_to, which
             is a merge point of control flow.  */
          guard_to->count = guard_bb->count;

          /* Scale the probability of the epilog loop back.
             FIXME: We should avoid scaling down and back up.  Profile may
             get lost if we scale down to 0.  */
          basic_block *bbs = get_loop_body (epilog);
          for (unsigned int i = 0; i < epilog->num_nodes; i++)
            bbs[i]->count = bbs[i]->count.apply_scale
                                 (bbs[i]->count,
                                  bbs[i]->count.apply_probability
                                      (prob_vector));
          free (bbs);
        }

      basic_block bb_before_epilog = loop_preheader_edge (epilog)->src;
      /* If the loop is peeled for a non-zero constant number of iterations,
         niters now refers to orig_niters - prolog_peeling; it won't overflow
         even if orig_niters overflows.  */
      niters_no_overflow |= (prolog_peeling > 0);
      vect_gen_vector_loop_niters (loop_vinfo, niters,
                                   niters_vector, step_vector,
                                   niters_no_overflow);
      if (!integer_onep (*step_vector))
        {
          /* On exit from the loop we will have an easy way of calculating
             NITERS_VECTOR / STEP * STEP.  Install a dummy definition
             until then.  */
          niters_vector_mult_vf = make_ssa_name (TREE_TYPE (*niters_vector));
          SSA_NAME_DEF_STMT (niters_vector_mult_vf) = gimple_build_nop ();
          *niters_vector_mult_vf_var = niters_vector_mult_vf;
        }
      else
        vect_gen_vector_loop_niters_mult_vf (loop_vinfo, *niters_vector,
                                             &niters_vector_mult_vf);
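      /* Illustration (an assumed example, not from a testcase): with
         *step_vector == 1 and VF == 4, this computes
         niters_vector_mult_vf = niters_vector * 4, the number of scalar
         iterations covered by the vector loop.  */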
      /* Update IVs of the original loop as if they were advanced by
         niters_vector_mult_vf steps.  */
      gcc_checking_assert (vect_can_advance_ivs_p (loop_vinfo));
      update_e = skip_vector ? e : loop_preheader_edge (epilog);
      vect_update_ivs_after_vectorizer (loop_vinfo, niters_vector_mult_vf,
                                        update_e);

      if (skip_epilog)
        {
          guard_cond = fold_build2 (EQ_EXPR, boolean_type_node,
                                    niters, niters_vector_mult_vf);
          guard_bb = single_exit (loop)->dest;
          guard_to = split_edge (single_exit (epilog));
          guard_e = slpeel_add_loop_guard (guard_bb, guard_cond, guard_to,
                                           skip_vector ? anchor : guard_bb,
                                           prob_epilog.invert (),
                                           irred_flag);
          if (vect_epilogues)
            epilogue_vinfo->skip_this_loop_edge = guard_e;
          slpeel_update_phi_nodes_for_guard2 (loop, epilog, guard_e,
                                              single_exit (epilog));
          /* We only need to handle the basic block before the epilog loop
             if it's not the guard_bb, which is the case when skip_vector is
             true.  */
          if (guard_bb != bb_before_epilog)
            {
              prob_epilog = prob_vector * prob_epilog + prob_vector.invert ();

              scale_bbs_frequencies (&bb_before_epilog, 1, prob_epilog);
            }
          scale_loop_profile (epilog, prob_epilog, 0);
        }
      else
        slpeel_update_phi_nodes_for_lcssa (epilog);

      unsigned HOST_WIDE_INT bound;
      if (bound_scalar.is_constant (&bound))
        {
          gcc_assert (bound != 0);
          /* -1 to convert loop iterations to latch iterations.  */
          record_niter_bound (epilog, bound - 1, false, true);
        }

      delete_update_ssa ();
      adjust_vec_debug_stmts ();
      scev_reset ();
    }

  if (vect_epilogues)
    {
      epilog->aux = epilogue_vinfo;
      LOOP_VINFO_LOOP (epilogue_vinfo) = epilog;

      loop_constraint_clear (epilog, LOOP_C_INFINITE);

      /* We now must calculate the number of iterations performed by the
         previous loop (NITERS) and the number still to be performed by the
         epilogue (EPILOGUE_NITERS).  */
      tree niters = fold_build2 (PLUS_EXPR, TREE_TYPE (niters_vector_mult_vf),
                                 niters_prolog, niters_vector_mult_vf);

      /* If skip_vector is set we may skip the previous loop; insert a
         phi-node to determine whether we come from the previous vectorized
         loop via the update_e edge, or from the skip_vector basic block via
         the skip_e edge.  */
      if (skip_vector)
        {
          gcc_assert (update_e != NULL
                      && skip_e != NULL
                      && !vect_epilogues_updated_niters);
          gphi *new_phi = create_phi_node (make_ssa_name (TREE_TYPE (niters)),
                                           update_e->dest);
          tree new_ssa = make_ssa_name (TREE_TYPE (niters));
          gimple *stmt = gimple_build_assign (new_ssa, niters);
          gimple_stmt_iterator gsi;
          if (TREE_CODE (niters_vector_mult_vf) == SSA_NAME
              && SSA_NAME_DEF_STMT (niters_vector_mult_vf)->bb != NULL)
            {
              gsi = gsi_for_stmt (SSA_NAME_DEF_STMT (niters_vector_mult_vf));
              gsi_insert_after (&gsi, stmt, GSI_NEW_STMT);
            }
          else
            {
              gsi = gsi_last_bb (update_e->src);
              gsi_insert_before (&gsi, stmt, GSI_NEW_STMT);
            }

          niters = new_ssa;
          add_phi_arg (new_phi, niters, update_e, UNKNOWN_LOCATION);
          add_phi_arg (new_phi, build_zero_cst (TREE_TYPE (niters)), skip_e,
                       UNKNOWN_LOCATION);
          niters = PHI_RESULT (new_phi);
          epilogue_vinfo->main_loop_edge = update_e;
          epilogue_vinfo->skip_main_loop_edge = skip_e;
        }

      /* Set ADVANCE to the number of iterations performed by the previous
         loop and its prologue.  */
      *advance = niters;

      if (!vect_epilogues_updated_niters)
        {
          /* Subtract the number of iterations performed by the vectorized
             loop from the total number of iterations.  */
          tree epilogue_niters = fold_build2 (MINUS_EXPR, TREE_TYPE (niters),
                                              before_loop_niters,
                                              niters);

          LOOP_VINFO_NITERS (epilogue_vinfo) = epilogue_niters;
          LOOP_VINFO_NITERSM1 (epilogue_vinfo)
            = fold_build2 (MINUS_EXPR, TREE_TYPE (epilogue_niters),
                           epilogue_niters,
                           build_one_cst (TREE_TYPE (epilogue_niters)));

          /* Decide what to do if the number of epilogue iterations is not
             a multiple of the epilogue loop's vectorization factor.
             We should have rejected the loop during the analysis phase
             if this fails.  */
          if (!vect_determine_partial_vectors_and_peeling (epilogue_vinfo,
                                                           true))
            gcc_unreachable ();
        }
    }

  adjust_vec.release ();
  free_original_copy_tables ();

  return vect_epilogues ? epilog : NULL;
}

/* Function vect_create_cond_for_niters_checks.

   Create a conditional expression that represents the run-time checks for
   the loop's niter.  The loop is guaranteed to terminate if the run-time
   checks hold.

   Input:
   COND_EXPR  - input conditional expression.  New conditions will be chained
                with logical AND operation.  If it is NULL, then the function
                is used to build the niter check alone.
   LOOP_VINFO - field LOOP_VINFO_NITERS_ASSUMPTIONS contains the condition
                to be checked.

   Output:
   COND_EXPR - conditional expression.

   The returned COND_EXPR is the conditional expression to be used in the
   if statement that controls which version of the loop gets executed at
   runtime.  */

static void
vect_create_cond_for_niters_checks (loop_vec_info loop_vinfo, tree *cond_expr)
{
  tree part_cond_expr = LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo);

  if (*cond_expr)
    *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
                              *cond_expr, part_cond_expr);
  else
    *cond_expr = part_cond_expr;
}

/* Set *COND_EXPR to a tree that is true when both the original *COND_EXPR
   and PART_COND_EXPR are true.  Treat a null *COND_EXPR as "true".  */

static void
chain_cond_expr (tree *cond_expr, tree part_cond_expr)
{
  if (*cond_expr)
    *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
                              *cond_expr, part_cond_expr);
  else
    *cond_expr = part_cond_expr;
}
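
/* Usage illustration (not part of the sources): starting from
   *COND_EXPR == NULL, chaining condition A and then condition B yields
   first A and then A && B; a null input therefore acts as the neutral
   value "true".  */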

/* Function vect_create_cond_for_align_checks.

   Create a conditional expression that represents the alignment checks for
   all of the data references (array element references) whose alignment
   must be checked at runtime.

   Input:
   COND_EXPR  - input conditional expression.  New conditions will be chained
                with logical AND operation.
   LOOP_VINFO - two fields of the loop information are used.
                LOOP_VINFO_PTR_MASK is the mask used to check the alignment.
                LOOP_VINFO_MAY_MISALIGN_STMTS contains the refs to be checked.

   Output:
   COND_EXPR_STMT_LIST - statements needed to construct the conditional
                         expression.
   The returned value is the conditional expression to be used in the if
   statement that controls which version of the loop gets executed at
   runtime.

   The algorithm makes two assumptions:
     1) The number of bytes "n" in a vector is a power of 2.
     2) An address "a" is aligned if a%n is zero, and this test can be done
        as a&(n-1) == 0.  For example, for 16 byte vectors the test is
        a&0xf == 0.  */
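
/* For instance (an illustrative sketch; the variable names follow the
   temporaries created below): for two data references and 16-byte vectors
   (LOOP_VINFO_PTR_MASK == 0xf), the generated sequence is roughly

     addr2int0 = (signed-intptr-type) &dr_0;
     addr2int1 = (signed-intptr-type) &dr_1;
     orptrs1 = addr2int0 | addr2int1;
     andmask = orptrs1 & 0xf;
     cond = (andmask == 0);

   where cond is true iff both addresses are 16-byte aligned.  */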

static void
vect_create_cond_for_align_checks (loop_vec_info loop_vinfo,
                                   tree *cond_expr,
                                   gimple_seq *cond_expr_stmt_list)
{
  const vec<stmt_vec_info> &may_misalign_stmts
    = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
  stmt_vec_info stmt_info;
  int mask = LOOP_VINFO_PTR_MASK (loop_vinfo);
  tree mask_cst;
  unsigned int i;
  tree int_ptrsize_type;
  char tmp_name[20];
  tree or_tmp_name = NULL_TREE;
  tree and_tmp_name;
  gimple *and_stmt;
  tree ptrsize_zero;
  tree part_cond_expr;

  /* Check that mask is one less than a power of 2, i.e., mask is
     all zeros followed by all ones.  */
  gcc_assert ((mask != 0) && ((mask & (mask+1)) == 0));

  int_ptrsize_type = signed_type_for (ptr_type_node);

  /* Create expression (mask & (dr_1 | ... | dr_n)) where dr_i is the address
     of the first vector of the i'th data reference.  */

  FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt_info)
    {
      gimple_seq new_stmt_list = NULL;
      tree addr_base;
      tree addr_tmp_name;
      tree new_or_tmp_name;
      gimple *addr_stmt, *or_stmt;
      tree vectype = STMT_VINFO_VECTYPE (stmt_info);
      bool negative = tree_int_cst_compare
        (DR_STEP (STMT_VINFO_DATA_REF (stmt_info)), size_zero_node) < 0;
      tree offset = negative
        ? size_int (-TYPE_VECTOR_SUBPARTS (vectype) + 1) : size_zero_node;

      /* create: addr_tmp = (int)(address_of_first_vector) */
      addr_base =
        vect_create_addr_base_for_vector_ref (loop_vinfo,
                                              stmt_info, &new_stmt_list,
                                              offset);
      if (new_stmt_list != NULL)
        gimple_seq_add_seq (cond_expr_stmt_list, new_stmt_list);

      sprintf (tmp_name, "addr2int%d", i);
      addr_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, tmp_name);
      addr_stmt = gimple_build_assign (addr_tmp_name, NOP_EXPR, addr_base);
      gimple_seq_add_stmt (cond_expr_stmt_list, addr_stmt);

      /* The addresses are ORed together.  */

      if (or_tmp_name != NULL_TREE)
        {
          /* create: or_tmp = or_tmp | addr_tmp */
          sprintf (tmp_name, "orptrs%d", i);
          new_or_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL,
                                                tmp_name);
          or_stmt = gimple_build_assign (new_or_tmp_name, BIT_IOR_EXPR,
                                         or_tmp_name, addr_tmp_name);
          gimple_seq_add_stmt (cond_expr_stmt_list, or_stmt);
          or_tmp_name = new_or_tmp_name;
        }
      else
        or_tmp_name = addr_tmp_name;

    } /* end for i */

  mask_cst = build_int_cst (int_ptrsize_type, mask);

  /* create: and_tmp = or_tmp & mask */
  and_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, "andmask");

  and_stmt = gimple_build_assign (and_tmp_name, BIT_AND_EXPR,
                                  or_tmp_name, mask_cst);
  gimple_seq_add_stmt (cond_expr_stmt_list, and_stmt);

  /* Make and_tmp the left operand of the conditional test against zero.
     If and_tmp has a nonzero bit then some address is unaligned.  */
  ptrsize_zero = build_int_cst (int_ptrsize_type, 0);
  part_cond_expr = fold_build2 (EQ_EXPR, boolean_type_node,
                                and_tmp_name, ptrsize_zero);
  chain_cond_expr (cond_expr, part_cond_expr);
}

/* If LOOP_VINFO_CHECK_UNEQUAL_ADDRS contains <A1, B1>, ..., <An, Bn>,
   create a tree representation of: (&A1 != &B1) && ... && (&An != &Bn).
   Set *COND_EXPR to a tree that is true when both the original *COND_EXPR
   and this new condition are true.  Treat a null *COND_EXPR as "true".  */

static void
vect_create_cond_for_unequal_addrs (loop_vec_info loop_vinfo, tree *cond_expr)
{
  const vec<vec_object_pair> &pairs
    = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
  unsigned int i;
  vec_object_pair *pair;
  FOR_EACH_VEC_ELT (pairs, i, pair)
    {
      tree addr1 = build_fold_addr_expr (pair->first);
      tree addr2 = build_fold_addr_expr (pair->second);
      tree part_cond_expr = fold_build2 (NE_EXPR, boolean_type_node,
                                         addr1, addr2);
      chain_cond_expr (cond_expr, part_cond_expr);
    }
}

/* Create an expression that is true when all lower-bound conditions for
   the vectorized loop are met.  Chain this condition with *COND_EXPR.  */

static void
vect_create_cond_for_lower_bounds (loop_vec_info loop_vinfo, tree *cond_expr)
{
  const vec<vec_lower_bound> &lower_bounds
    = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
  for (unsigned int i = 0; i < lower_bounds.length (); ++i)
    {
      tree expr = lower_bounds[i].expr;
      tree type = unsigned_type_for (TREE_TYPE (expr));
      expr = fold_convert (type, expr);
      poly_uint64 bound = lower_bounds[i].min_value;
      if (!lower_bounds[i].unsigned_p)
        {
          expr = fold_build2 (PLUS_EXPR, type, expr,
                              build_int_cstu (type, bound - 1));
          bound += bound - 1;
        }
      tree part_cond_expr = fold_build2 (GE_EXPR, boolean_type_node, expr,
                                         build_int_cstu (type, bound));
      chain_cond_expr (cond_expr, part_cond_expr);
    }
}
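
/* A worked illustration of the signed case above (not from the sources):
   with a 32-bit EXPR and min_value == 4, the check becomes
   (unsigned) EXPR + 3 >= 7.  EXPR == 5 gives 8 >= 7 (pass) and EXPR == -5
   gives 0xfffffffe >= 7 (pass), while EXPR == -1 wraps to 2 < 7 and
   EXPR == 3 gives 6 < 7 (both fail), so the single unsigned comparison
   tests |EXPR| >= 4.  */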

/* Function vect_create_cond_for_alias_checks.

   Create a conditional expression that represents the run-time checks for
   overlapping of address ranges represented by a list of data reference
   relations passed as input.

   Input:
   COND_EXPR  - input conditional expression.  New conditions will be chained
                with logical AND operation.  If it is NULL, the alias checks
                form the initial condition.
   LOOP_VINFO - field LOOP_VINFO_COMP_ALIAS_DDRS contains the list of ddr
                pairs to be checked.

   Output:
   COND_EXPR - conditional expression.

   The returned COND_EXPR is the conditional expression to be used in the if
   statement that controls which version of the loop gets executed at
   runtime.  */

void
vect_create_cond_for_alias_checks (loop_vec_info loop_vinfo, tree * cond_expr)
{
  const vec<dr_with_seg_len_pair_t> &comp_alias_ddrs =
    LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);

  if (comp_alias_ddrs.is_empty ())
    return;

  create_runtime_alias_checks (LOOP_VINFO_LOOP (loop_vinfo),
                               &comp_alias_ddrs, cond_expr);
  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "created %u versioning for alias checks.\n",
                     comp_alias_ddrs.length ());
}


/* Function vect_loop_versioning.

   If the loop has data references that may or may not be aligned and/or
   has data reference relations whose independence was not proven, then
   two versions of the loop need to be generated, one which is vectorized
   and one which isn't.  A test is then generated to control which of the
   loops is executed.  The test checks for the alignment of all of the
   data references that may or may not be aligned.  An additional
   sequence of runtime tests is generated for each pair of DDRs whose
   independence was not proven.  The vectorized version of the loop is
   executed only if both the alias and alignment tests pass.

   The test generated to check which version of the loop is executed
   is modified to also check for profitability as indicated by the
   cost model threshold TH.

   The versioning precondition(s) are placed in *COND_EXPR and
   *COND_EXPR_STMT_LIST.  */

class loop *
vect_loop_versioning (loop_vec_info loop_vinfo,
                      gimple *loop_vectorized_call)
{
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *nloop;
  class loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
  basic_block condition_bb;
  gphi_iterator gsi;
  gimple_stmt_iterator cond_exp_gsi;
  basic_block merge_bb;
  basic_block new_exit_bb;
  edge new_exit_e, e;
  gphi *orig_phi, *new_phi;
  tree cond_expr = NULL_TREE;
  gimple_seq cond_expr_stmt_list = NULL;
  tree arg;
  profile_probability prob = profile_probability::likely ();
  gimple_seq gimplify_stmt_list = NULL;
  tree scalar_loop_iters = LOOP_VINFO_NITERSM1 (loop_vinfo);
  bool version_align = LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo);
  bool version_alias = LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo);
  bool version_niter = LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo);
  poly_uint64 versioning_threshold
    = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
  tree version_simd_if_cond
    = LOOP_REQUIRES_VERSIONING_FOR_SIMD_IF_COND (loop_vinfo);
  unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);

  if (vect_apply_runtime_profitability_check_p (loop_vinfo)
      && !ordered_p (th, versioning_threshold))
    cond_expr = fold_build2 (GE_EXPR, boolean_type_node, scalar_loop_iters,
                             build_int_cst (TREE_TYPE (scalar_loop_iters),
                                            th - 1));
  if (maybe_ne (versioning_threshold, 0U))
    {
      tree expr = fold_build2 (GE_EXPR, boolean_type_node, scalar_loop_iters,
                               build_int_cst (TREE_TYPE (scalar_loop_iters),
                                              versioning_threshold - 1));
      if (cond_expr)
        cond_expr = fold_build2 (BIT_AND_EXPR, boolean_type_node,
                                 expr, cond_expr);
      else
        cond_expr = expr;
    }
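
  /* Illustration (not from the sources): since scalar_loop_iters is
     NITERSM1, a threshold of e.g. 8 iterations is tested above as
     nitersm1 >= 8 - 1, i.e. the loop is only versioned for vectorization
     when it runs at least 8 times.  */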

  if (version_niter)
    vect_create_cond_for_niters_checks (loop_vinfo, &cond_expr);

  if (cond_expr)
    cond_expr = force_gimple_operand_1 (unshare_expr (cond_expr),
                                        &cond_expr_stmt_list,
                                        is_gimple_condexpr, NULL_TREE);

  if (version_align)
    vect_create_cond_for_align_checks (loop_vinfo, &cond_expr,
                                       &cond_expr_stmt_list);

  if (version_alias)
    {
      vect_create_cond_for_unequal_addrs (loop_vinfo, &cond_expr);
      vect_create_cond_for_lower_bounds (loop_vinfo, &cond_expr);
      vect_create_cond_for_alias_checks (loop_vinfo, &cond_expr);
    }

  if (version_simd_if_cond)
    {
      gcc_assert (dom_info_available_p (CDI_DOMINATORS));
      if (flag_checking)
        if (basic_block bb
            = gimple_bb (SSA_NAME_DEF_STMT (version_simd_if_cond)))
          gcc_assert (bb != loop->header
                      && dominated_by_p (CDI_DOMINATORS, loop->header, bb)
                      && (scalar_loop == NULL
                          || (bb != scalar_loop->header
                              && dominated_by_p (CDI_DOMINATORS,
                                                 scalar_loop->header, bb))));
      tree zero = build_zero_cst (TREE_TYPE (version_simd_if_cond));
      tree c = fold_build2 (NE_EXPR, boolean_type_node,
                            version_simd_if_cond, zero);
      if (cond_expr)
        cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
                                 c, cond_expr);
      else
        cond_expr = c;
      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "created versioning for simd if condition check.\n");
    }

  cond_expr = force_gimple_operand_1 (unshare_expr (cond_expr),
                                      &gimplify_stmt_list,
                                      is_gimple_condexpr, NULL_TREE);
  gimple_seq_add_seq (&cond_expr_stmt_list, gimplify_stmt_list);

  /* Compute the outermost loop cond_expr and cond_expr_stmt_list are
     invariant in.  */
  class loop *outermost = outermost_invariant_loop_for_expr (loop, cond_expr);
  for (gimple_stmt_iterator gsi = gsi_start (cond_expr_stmt_list);
       !gsi_end_p (gsi); gsi_next (&gsi))
    {
      gimple *stmt = gsi_stmt (gsi);
      update_stmt (stmt);
      ssa_op_iter iter;
      use_operand_p use_p;
      basic_block def_bb;
      FOR_EACH_SSA_USE_OPERAND (use_p, stmt, iter, SSA_OP_USE)
        if ((def_bb = gimple_bb (SSA_NAME_DEF_STMT (USE_FROM_PTR (use_p))))
            && flow_bb_inside_loop_p (outermost, def_bb))
          outermost = superloop_at_depth (loop, bb_loop_depth (def_bb) + 1);
    }

  /* Search for the outermost loop we can version.  Avoid versioning of
     non-perfect nests but allow if-conversion versioned loops inside.  */
  class loop *loop_to_version = loop;
  if (flow_loop_nested_p (outermost, loop))
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "trying to apply versioning to outer loop %d\n",
                         outermost->num);
      if (outermost->num == 0)
        outermost = superloop_at_depth (loop, 1);
      /* And avoid applying versioning on non-perfect nests.  */
      while (loop_to_version != outermost
             && single_exit (loop_outer (loop_to_version))
             && (!loop_outer (loop_to_version)->inner->next
                 || vect_loop_vectorized_call (loop_to_version))
             && (!loop_outer (loop_to_version)->inner->next
                 || !loop_outer (loop_to_version)->inner->next->next))
        loop_to_version = loop_outer (loop_to_version);
    }

  /* Apply versioning.  If there is already a scalar version created by
     if-conversion, re-use that.  Note we cannot re-use the copy of
     an if-converted outer-loop when vectorizing the inner loop only.  */
  gcond *cond;
  if ((!loop_to_version->inner || loop == loop_to_version)
      && loop_vectorized_call)
    {
      gcc_assert (scalar_loop);
      condition_bb = gimple_bb (loop_vectorized_call);
      cond = as_a <gcond *> (last_stmt (condition_bb));
      gimple_cond_set_condition_from_tree (cond, cond_expr);
      update_stmt (cond);

      if (cond_expr_stmt_list)
        {
          cond_exp_gsi = gsi_for_stmt (loop_vectorized_call);
          gsi_insert_seq_before (&cond_exp_gsi, cond_expr_stmt_list,
                                 GSI_SAME_STMT);
        }

      /* If-conversion uses profile_probability::always () for both paths;
         reset the paths' probabilities appropriately.  */
      edge te, fe;
      extract_true_false_edges_from_block (condition_bb, &te, &fe);
      te->probability = prob;
      fe->probability = prob.invert ();
      /* We can scale loop counts immediately but have to postpone
         scaling the scalar loop because we re-use it during peeling.  */
      scale_loop_frequencies (loop_to_version, te->probability);
      LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo) = fe->probability;

      nloop = scalar_loop;
      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "reusing %sloop version created by if conversion\n",
                         loop_to_version != loop ? "outer " : "");
    }
  else
    {
      if (loop_to_version != loop
          && dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "applying loop versioning to outer loop %d\n",
                         loop_to_version->num);

      initialize_original_copy_tables ();
      nloop = loop_version (loop_to_version, cond_expr, &condition_bb,
                            prob, prob.invert (), prob, prob.invert (), true);
      gcc_assert (nloop);
      nloop = get_loop_copy (loop);

      /* Kill off IFN_LOOP_VECTORIZED_CALL in the copy; nobody will
         reap those otherwise, and they also refer to the original
         loops.  */
      class loop *l = loop;
      while (gimple *call = vect_loop_vectorized_call (l))
        {
          call = SSA_NAME_DEF_STMT (get_current_def (gimple_call_lhs (call)));
          fold_loop_internal_call (call, boolean_false_node);
          l = loop_outer (l);
        }
      free_original_copy_tables ();

      if (cond_expr_stmt_list)
        {
          cond_exp_gsi = gsi_last_bb (condition_bb);
          gsi_insert_seq_before (&cond_exp_gsi, cond_expr_stmt_list,
                                 GSI_SAME_STMT);
        }

      /* Loop versioning violates an assumption we try to maintain during
         vectorization - that the loop exit block has a single predecessor.
         After versioning, the exit block of both loop versions is the same
         basic block (i.e. it has two predecessors).  Just in order to
         simplify following transformations in the vectorizer, we fix this
         situation here by adding a new (empty) block on the exit-edge of
         the loop, with the proper loop-exit phis to maintain
         loop-closed-form.  If loop versioning wasn't done from loop, but
         scalar_loop instead, merge_bb will already have just a single
         predecessor.  */

      merge_bb = single_exit (loop_to_version)->dest;
      if (EDGE_COUNT (merge_bb->preds) >= 2)
        {
          gcc_assert (EDGE_COUNT (merge_bb->preds) >= 2);
          new_exit_bb = split_edge (single_exit (loop_to_version));
          new_exit_e = single_exit (loop_to_version);
          e = EDGE_SUCC (new_exit_bb, 0);

          for (gsi = gsi_start_phis (merge_bb); !gsi_end_p (gsi);
               gsi_next (&gsi))
            {
              tree new_res;
              orig_phi = gsi.phi ();
              new_res = copy_ssa_name (PHI_RESULT (orig_phi));
              new_phi = create_phi_node (new_res, new_exit_bb);
              arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, e);
              add_phi_arg (new_phi, arg, new_exit_e,
                           gimple_phi_arg_location_from_edge (orig_phi, e));
              adjust_phi_and_debug_stmts (orig_phi, e, PHI_RESULT (new_phi));
            }
        }

      update_ssa (TODO_update_ssa);
    }

  if (version_niter)
    {
      /* The versioned loop could be infinite; we need to clear the existing
         niter information, which was copied from the original loop.  */
      gcc_assert (loop_constraint_set_p (loop, LOOP_C_FINITE));
      vect_free_loop_info_assumptions (nloop);
    }

  if (LOCATION_LOCUS (vect_location.get_location_t ()) != UNKNOWN_LOCATION
      && dump_enabled_p ())
    {
      if (version_alias)
        dump_printf_loc (MSG_OPTIMIZED_LOCATIONS | MSG_PRIORITY_USER_FACING,
                         vect_location,
                         "loop versioned for vectorization because of "
                         "possible aliasing\n");
      if (version_align)
        dump_printf_loc (MSG_OPTIMIZED_LOCATIONS | MSG_PRIORITY_USER_FACING,
                         vect_location,
                         "loop versioned for vectorization to enhance "
                         "alignment\n");
    }

  return nloop;
}