48e1416a 1/* Vectorizer Specific Loop Manipulations
8e8f6434 2 Copyright (C) 2003-2018 Free Software Foundation, Inc.
48e1416a 3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
fb85abff 4 and Ira Rosen <irar@il.ibm.com>
5
6This file is part of GCC.
7
8GCC is free software; you can redistribute it and/or modify it under
9the terms of the GNU General Public License as published by the Free
10Software Foundation; either version 3, or (at your option) any later
11version.
12
13GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14WARRANTY; without even the implied warranty of MERCHANTABILITY or
15FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16for more details.
17
18You should have received a copy of the GNU General Public License
19along with GCC; see the file COPYING3. If not see
20<http://www.gnu.org/licenses/>. */
21
22#include "config.h"
23#include "system.h"
24#include "coretypes.h"
9ef16211 25#include "backend.h"
b20a8bb4 26#include "tree.h"
9ef16211 27#include "gimple.h"
7c29e30e 28#include "cfghooks.h"
29#include "tree-pass.h"
9ef16211 30#include "ssa.h"
9ef16211 31#include "fold-const.h"
94ea8568 32#include "cfganal.h"
a8783bee 33#include "gimplify.h"
dcf1a1ec 34#include "gimple-iterator.h"
e795d6e1 35#include "gimplify-me.h"
073c1fd5 36#include "tree-cfg.h"
05d9c18a 37#include "tree-ssa-loop-manip.h"
073c1fd5 38#include "tree-into-ssa.h"
69ee5dbb 39#include "tree-ssa.h"
fb85abff 40#include "cfgloop.h"
fb85abff 41#include "tree-scalar-evolution.h"
42#include "tree-vectorizer.h"
12117f39 43#include "tree-ssa-loop-ivopts.h"
cde959e7 44#include "gimple-fold.h"
60b29a7e 45#include "tree-ssa-loop-niter.h"
46#include "internal-fn.h"
47#include "stor-layout.h"
48#include "optabs-query.h"
49#include "vec-perm-indices.h"
fb85abff 50
51/*************************************************************************
52 Simple Loop Peeling Utilities
53
54 Utilities to support loop peeling for vectorization purposes.
55 *************************************************************************/
56
57
58/* Renames the use *OP_P. */
59
60static void
61rename_use_op (use_operand_p op_p)
62{
63 tree new_name;
64
65 if (TREE_CODE (USE_FROM_PTR (op_p)) != SSA_NAME)
66 return;
67
68 new_name = get_current_def (USE_FROM_PTR (op_p));
69
70 /* Something defined outside of the loop. */
71 if (!new_name)
72 return;
73
74 /* An ordinary ssa name defined in the loop. */
75
76 SET_USE (op_p, new_name);
77}
78
79
ccd0a9f9 80/* Renames the variables in basic block BB. Allow renaming of PHI arguments
5ee742c4 81 on edges incoming from the outer-loop header if RENAME_FROM_OUTER_LOOP is
82 true. */
fb85abff 83
c9b2c569 84static void
5ee742c4 85rename_variables_in_bb (basic_block bb, bool rename_from_outer_loop)
fb85abff 86{
42acab1c 87 gimple *stmt;
fb85abff 88 use_operand_p use_p;
89 ssa_op_iter iter;
90 edge e;
91 edge_iterator ei;
92 struct loop *loop = bb->loop_father;
5ee742c4 93 struct loop *outer_loop = NULL;
94
95 if (rename_from_outer_loop)
96 {
97 gcc_assert (loop);
98 outer_loop = loop_outer (loop);
99 }
fb85abff 100
1a91d914 101 for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
102 gsi_next (&gsi))
fb85abff 103 {
104 stmt = gsi_stmt (gsi);
105 FOR_EACH_SSA_USE_OPERAND (use_p, stmt, iter, SSA_OP_ALL_USES)
106 rename_use_op (use_p);
107 }
108
c9b2c569 109 FOR_EACH_EDGE (e, ei, bb->preds)
fb85abff 110 {
ccd0a9f9 111 if (!flow_bb_inside_loop_p (loop, e->src))
112 {
113 if (!rename_from_outer_loop)
114 continue;
115 if (e->src != outer_loop->header)
116 {
117 if (outer_loop->inner->next)
118 {
119 /* If outer_loop has 2 inner loops, allow there to
120 be an extra basic block which decides which of the
 121 two loops to use, based on LOOP_VECTORIZED. */
122 if (!single_pred_p (e->src)
123 || single_pred (e->src) != outer_loop->header)
124 continue;
125 }
ccd0a9f9 126 }
127 }
1a91d914 128 for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
129 gsi_next (&gsi))
130 rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (gsi.phi (), e));
fb85abff 131 }
132}
133
134
6dc50383 135struct adjust_info
b123eaab 136{
137 tree from, to;
138 basic_block bb;
6dc50383 139};
b123eaab 140
b123eaab 141/* A stack of values to be adjusted in debug stmts. We have to
142 process them LIFO, so that the closest substitution applies. If we
143 processed them FIFO, without the stack, we might substitute uses
144 with a PHI DEF that would soon become non-dominant, and when we got
145 to the suitable one, it wouldn't have anything to substitute any
146 more. */
d70aebca 147static vec<adjust_info, va_heap> adjust_vec;
b123eaab 148
149/* Adjust any debug stmts that referenced AI->from values to use the
150 loop-closed AI->to, if the references are dominated by AI->bb and
151 not by the definition of AI->from. */
152
153static void
154adjust_debug_stmts_now (adjust_info *ai)
155{
156 basic_block bbphi = ai->bb;
157 tree orig_def = ai->from;
158 tree new_def = ai->to;
159 imm_use_iterator imm_iter;
42acab1c 160 gimple *stmt;
b123eaab 161 basic_block bbdef = gimple_bb (SSA_NAME_DEF_STMT (orig_def));
162
163 gcc_assert (dom_info_available_p (CDI_DOMINATORS));
164
165 /* Adjust any debug stmts that held onto non-loop-closed
166 references. */
167 FOR_EACH_IMM_USE_STMT (stmt, imm_iter, orig_def)
168 {
169 use_operand_p use_p;
170 basic_block bbuse;
171
172 if (!is_gimple_debug (stmt))
173 continue;
174
175 gcc_assert (gimple_debug_bind_p (stmt));
176
177 bbuse = gimple_bb (stmt);
178
179 if ((bbuse == bbphi
180 || dominated_by_p (CDI_DOMINATORS, bbuse, bbphi))
181 && !(bbuse == bbdef
182 || dominated_by_p (CDI_DOMINATORS, bbuse, bbdef)))
183 {
184 if (new_def)
185 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
186 SET_USE (use_p, new_def);
187 else
188 {
189 gimple_debug_bind_reset_value (stmt);
190 update_stmt (stmt);
191 }
192 }
193 }
194}
195
196/* Adjust debug stmts as scheduled before. */
197
198static void
199adjust_vec_debug_stmts (void)
200{
c64f38bf 201 if (!MAY_HAVE_DEBUG_BIND_STMTS)
b123eaab 202 return;
203
f1f41a6c 204 gcc_assert (adjust_vec.exists ());
b123eaab 205
f1f41a6c 206 while (!adjust_vec.is_empty ())
b123eaab 207 {
f1f41a6c 208 adjust_debug_stmts_now (&adjust_vec.last ());
209 adjust_vec.pop ();
b123eaab 210 }
b123eaab 211}
212
213/* Adjust any debug stmts that referenced FROM values to use the
214 loop-closed TO, if the references are dominated by BB and not by
215 the definition of FROM. If adjust_vec is non-NULL, adjustments
216 will be postponed until adjust_vec_debug_stmts is called. */
217
218static void
219adjust_debug_stmts (tree from, tree to, basic_block bb)
220{
221 adjust_info ai;
222
c64f38bf 223 if (MAY_HAVE_DEBUG_BIND_STMTS
0087edc6 224 && TREE_CODE (from) == SSA_NAME
2510e5cd 225 && ! SSA_NAME_IS_DEFAULT_DEF (from)
0087edc6 226 && ! virtual_operand_p (from))
b123eaab 227 {
228 ai.from = from;
229 ai.to = to;
230 ai.bb = bb;
231
f1f41a6c 232 if (adjust_vec.exists ())
233 adjust_vec.safe_push (ai);
b123eaab 234 else
235 adjust_debug_stmts_now (&ai);
236 }
237}
238
239/* Change E's phi arg in UPDATE_PHI to NEW_DEF, and record information
240 to adjust any debug stmts that referenced the old phi arg,
241 presumably non-loop-closed references left over from other
242 transformations. */
243
244static void
42acab1c 245adjust_phi_and_debug_stmts (gimple *update_phi, edge e, tree new_def)
b123eaab 246{
247 tree orig_def = PHI_ARG_DEF_FROM_EDGE (update_phi, e);
248
249 SET_PHI_ARG_DEF (update_phi, e->dest_idx, new_def);
250
c64f38bf 251 if (MAY_HAVE_DEBUG_BIND_STMTS)
b123eaab 252 adjust_debug_stmts (orig_def, PHI_RESULT (update_phi),
253 gimple_bb (update_phi));
254}
255
60b29a7e 256/* Define one loop mask MASK from loop LOOP. INIT_MASK is the value that
257 the mask should have during the first iteration and NEXT_MASK is the
258 value that it should have on subsequent iterations. */
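/* Illustrative example (added comment, not from the original sources):
   the result is simply a loop-header PHI of the form

     MASK = PHI <INIT_MASK (preheader), NEXT_MASK (latch)>

   built by the create_phi_node/add_phi_arg calls below.  */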
cde959e7 259
60b29a7e 260static void
261vect_set_loop_mask (struct loop *loop, tree mask, tree init_mask,
262 tree next_mask)
263{
264 gphi *phi = create_phi_node (mask, loop->header);
265 add_phi_arg (phi, init_mask, loop_preheader_edge (loop), UNKNOWN_LOCATION);
266 add_phi_arg (phi, next_mask, loop_latch_edge (loop), UNKNOWN_LOCATION);
267}
cde959e7 268
60b29a7e 269/* Add SEQ to the end of LOOP's preheader block. */
fb85abff 270
60b29a7e 271static void
272add_preheader_seq (struct loop *loop, gimple_seq seq)
273{
274 if (seq)
275 {
276 edge pe = loop_preheader_edge (loop);
277 basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
278 gcc_assert (!new_bb);
279 }
280}
fb85abff 281
60b29a7e 282/* Add SEQ to the beginning of LOOP's header block. */
283
284static void
285add_header_seq (struct loop *loop, gimple_seq seq)
286{
287 if (seq)
288 {
289 gimple_stmt_iterator gsi = gsi_after_labels (loop->header);
290 gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
291 }
292}
293
294/* Return true if the target can interleave elements of two vectors.
295 OFFSET is 0 if the first half of the vectors should be interleaved
296 or 1 if the second half should. When returning true, store the
297 associated permutation in INDICES. */
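/* Worked example (illustrative): for a vector type with 8 elements and
   OFFSET == 1, BASE below is 4 and the permutation selects elements
   { 4, 12, 5, 13, 6, 14, 7, 15 }, i.e. it interleaves the high halves
   of the two input vectors.  Only the first three pairs are pushed
   explicitly; vec_perm_builder extends the series to the full vector.  */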
298
299static bool
300interleave_supported_p (vec_perm_indices *indices, tree vectype,
301 unsigned int offset)
302{
303 poly_uint64 nelts = TYPE_VECTOR_SUBPARTS (vectype);
304 poly_uint64 base = exact_div (nelts, 2) * offset;
305 vec_perm_builder sel (nelts, 2, 3);
306 for (unsigned int i = 0; i < 3; ++i)
307 {
308 sel.quick_push (base + i);
309 sel.quick_push (base + i + nelts);
310 }
311 indices->new_vector (sel, 2, nelts);
312 return can_vec_perm_const_p (TYPE_MODE (vectype), *indices);
313}
314
315/* Try to use permutes to define the masks in DEST_RGM using the masks
316 in SRC_RGM, given that the former has twice as many masks as the
317 latter. Return true on success, adding any new statements to SEQ. */
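/* Illustrative example (added comment): if SRC_RGM contains the masks
   { M0, M1 } then DEST_RGM contains { D0, D1, D2, D3 }, and each Di is
   derived from M(i/2), either by unpacking one of its halves or by an
   interleaving VEC_PERM_EXPR, as implemented below.  */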
318
319static bool
320vect_maybe_permute_loop_masks (gimple_seq *seq, rgroup_masks *dest_rgm,
321 rgroup_masks *src_rgm)
322{
323 tree src_masktype = src_rgm->mask_type;
324 tree dest_masktype = dest_rgm->mask_type;
325 machine_mode src_mode = TYPE_MODE (src_masktype);
326 if (dest_rgm->max_nscalars_per_iter <= src_rgm->max_nscalars_per_iter
327 && optab_handler (vec_unpacku_hi_optab, src_mode) != CODE_FOR_nothing
328 && optab_handler (vec_unpacku_lo_optab, src_mode) != CODE_FOR_nothing)
329 {
330 /* Unpacking the source masks gives at least as many mask bits as
331 we need. We can then VIEW_CONVERT any excess bits away. */
332 tree unpack_masktype = vect_halve_mask_nunits (src_masktype);
333 for (unsigned int i = 0; i < dest_rgm->masks.length (); ++i)
334 {
335 tree src = src_rgm->masks[i / 2];
336 tree dest = dest_rgm->masks[i];
88e81b08 337 tree_code code = ((i & 1) == (BYTES_BIG_ENDIAN ? 0 : 1)
338 ? VEC_UNPACK_HI_EXPR
60b29a7e 339 : VEC_UNPACK_LO_EXPR);
340 gassign *stmt;
341 if (dest_masktype == unpack_masktype)
342 stmt = gimple_build_assign (dest, code, src);
343 else
344 {
345 tree temp = make_ssa_name (unpack_masktype);
346 stmt = gimple_build_assign (temp, code, src);
347 gimple_seq_add_stmt (seq, stmt);
348 stmt = gimple_build_assign (dest, VIEW_CONVERT_EXPR,
349 build1 (VIEW_CONVERT_EXPR,
350 dest_masktype, temp));
351 }
352 gimple_seq_add_stmt (seq, stmt);
353 }
354 return true;
355 }
356 vec_perm_indices indices[2];
357 if (dest_masktype == src_masktype
358 && interleave_supported_p (&indices[0], src_masktype, 0)
359 && interleave_supported_p (&indices[1], src_masktype, 1))
360 {
361 /* The destination requires twice as many mask bits as the source, so
362 we can use interleaving permutes to double up the number of bits. */
363 tree masks[2];
364 for (unsigned int i = 0; i < 2; ++i)
365 masks[i] = vect_gen_perm_mask_checked (src_masktype, indices[i]);
366 for (unsigned int i = 0; i < dest_rgm->masks.length (); ++i)
367 {
368 tree src = src_rgm->masks[i / 2];
369 tree dest = dest_rgm->masks[i];
370 gimple *stmt = gimple_build_assign (dest, VEC_PERM_EXPR,
371 src, src, masks[i & 1]);
372 gimple_seq_add_stmt (seq, stmt);
373 }
374 return true;
375 }
376 return false;
377}
378
379/* Helper for vect_set_loop_condition_masked. Generate definitions for
380 all the masks in RGM and return a mask that is nonzero when the loop
381 needs to iterate. Add any new preheader statements to PREHEADER_SEQ.
382 Use LOOP_COND_GSI to insert code before the exit gcond.
383
384 RGM belongs to loop LOOP. The loop originally iterated NITERS
385 times and has been vectorized according to LOOP_VINFO. Each iteration
386 of the vectorized loop handles VF iterations of the scalar loop.
387
6753a4bf 388 If NITERS_SKIP is nonnull, the first iteration of the vectorized loop
389 starts with NITERS_SKIP dummy iterations of the scalar loop before
390 the real work starts. The mask elements for these dummy iterations
391 must be 0, to ensure that the extra iterations do not have an effect.
392
60b29a7e 393 It is known that:
394
395 NITERS * RGM->max_nscalars_per_iter
396
397 does not overflow. However, MIGHT_WRAP_P says whether an induction
398 variable that starts at 0 and has step:
399
400 VF * RGM->max_nscalars_per_iter
401
402 might overflow before hitting a value above:
403
6753a4bf 404 (NITERS + NITERS_SKIP) * RGM->max_nscalars_per_iter
60b29a7e 405
406 This means that we cannot guarantee that such an induction variable
407 would ever hit a value that produces a set of all-false masks for RGM. */
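/* Illustrative example (added comment, assuming a single mask in RGM,
   RGM->max_nscalars_per_iter == 1 and no risk of the IV wrapping):
   with VF == 8 and NITERS == 20, NSCALARS_STEP below is 8 and the IV
   takes the values 0, 8, 16 and 24.  The initial mask enables all 8
   elements, the mask for the final iteration enables only elements
   16..19, and the mask computed for IV value 24 is all-false, ending
   the loop after (20 - 1) / 8 + 1 == 3 iterations.  */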
408
409static tree
410vect_set_loop_masks_directly (struct loop *loop, loop_vec_info loop_vinfo,
411 gimple_seq *preheader_seq,
412 gimple_stmt_iterator loop_cond_gsi,
413 rgroup_masks *rgm, tree vf,
6753a4bf 414 tree niters, tree niters_skip,
415 bool might_wrap_p)
60b29a7e 416{
417 tree compare_type = LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo);
418 tree mask_type = rgm->mask_type;
419 unsigned int nscalars_per_iter = rgm->max_nscalars_per_iter;
420 poly_uint64 nscalars_per_mask = TYPE_VECTOR_SUBPARTS (mask_type);
421
422 /* Calculate the maximum number of scalar values that the rgroup
6753a4bf 423 handles in total, the number that it handles for each iteration
424 of the vector loop, and the number that it should skip during the
425 first iteration of the vector loop. */
60b29a7e 426 tree nscalars_total = niters;
427 tree nscalars_step = vf;
6753a4bf 428 tree nscalars_skip = niters_skip;
60b29a7e 429 if (nscalars_per_iter != 1)
430 {
431 /* We checked before choosing to use a fully-masked loop that these
432 multiplications don't overflow. */
433 tree factor = build_int_cst (compare_type, nscalars_per_iter);
434 nscalars_total = gimple_build (preheader_seq, MULT_EXPR, compare_type,
435 nscalars_total, factor);
436 nscalars_step = gimple_build (preheader_seq, MULT_EXPR, compare_type,
437 nscalars_step, factor);
6753a4bf 438 if (nscalars_skip)
439 nscalars_skip = gimple_build (preheader_seq, MULT_EXPR, compare_type,
440 nscalars_skip, factor);
60b29a7e 441 }
442
443 /* Create an induction variable that counts the number of scalars
444 processed. */
445 tree index_before_incr, index_after_incr;
446 gimple_stmt_iterator incr_gsi;
447 bool insert_after;
448 tree zero_index = build_int_cst (compare_type, 0);
449 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
450 create_iv (zero_index, nscalars_step, NULL_TREE, loop, &incr_gsi,
451 insert_after, &index_before_incr, &index_after_incr);
452
6753a4bf 453 tree test_index, test_limit, first_limit;
60b29a7e 454 gimple_stmt_iterator *test_gsi;
455 if (might_wrap_p)
456 {
457 /* In principle the loop should stop iterating once the incremented
6753a4bf 458 IV reaches a value greater than or equal to:
459
460 NSCALARS_TOTAL +[infinite-prec] NSCALARS_SKIP
461
462 However, there's no guarantee that this addition doesn't overflow
463 the comparison type, or that the IV hits a value above it before
464 wrapping around. We therefore adjust the limit down by one
465 IV step:
60b29a7e 466
6753a4bf 467 (NSCALARS_TOTAL +[infinite-prec] NSCALARS_SKIP)
468 -[infinite-prec] NSCALARS_STEP
60b29a7e 469
470 and compare the IV against this limit _before_ incrementing it.
471 Since the comparison type is unsigned, we actually want the
472 subtraction to saturate at zero:
473
6753a4bf 474 (NSCALARS_TOTAL +[infinite-prec] NSCALARS_SKIP)
475 -[sat] NSCALARS_STEP
476
477 And since NSCALARS_SKIP < NSCALARS_STEP, we can reassociate this as:
478
479 NSCALARS_TOTAL -[sat] (NSCALARS_STEP - NSCALARS_SKIP)
480
481 where the rightmost subtraction can be done directly in
482 COMPARE_TYPE. */
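      /* For example (illustrative numbers): with NSCALARS_TOTAL == 20,
         NSCALARS_STEP == 8 and NSCALARS_SKIP == 3, ADJUST below is 5 and
         the limit becomes MAX (20, 5) - 5 == 15, which matches the
         infinite-precision value (20 + 3) - 8.  */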
60b29a7e 483 test_index = index_before_incr;
6753a4bf 484 tree adjust = nscalars_step;
485 if (nscalars_skip)
486 adjust = gimple_build (preheader_seq, MINUS_EXPR, compare_type,
487 adjust, nscalars_skip);
60b29a7e 488 test_limit = gimple_build (preheader_seq, MAX_EXPR, compare_type,
6753a4bf 489 nscalars_total, adjust);
60b29a7e 490 test_limit = gimple_build (preheader_seq, MINUS_EXPR, compare_type,
6753a4bf 491 test_limit, adjust);
60b29a7e 492 test_gsi = &incr_gsi;
6753a4bf 493
494 /* Get a safe limit for the first iteration. */
495 if (nscalars_skip)
496 {
497 /* The first vector iteration can handle at most NSCALARS_STEP
498 scalars. NSCALARS_STEP <= CONST_LIMIT, and adding
499 NSCALARS_SKIP to that cannot overflow. */
500 tree const_limit = build_int_cst (compare_type,
501 LOOP_VINFO_VECT_FACTOR (loop_vinfo)
502 * nscalars_per_iter);
503 first_limit = gimple_build (preheader_seq, MIN_EXPR, compare_type,
504 nscalars_total, const_limit);
505 first_limit = gimple_build (preheader_seq, PLUS_EXPR, compare_type,
506 first_limit, nscalars_skip);
507 }
508 else
509 /* For the first iteration it doesn't matter whether the IV hits
510 a value above NSCALARS_TOTAL. That only matters for the latch
511 condition. */
512 first_limit = nscalars_total;
60b29a7e 513 }
514 else
515 {
516 /* Test the incremented IV, which will always hit a value above
517 the bound before wrapping. */
518 test_index = index_after_incr;
519 test_limit = nscalars_total;
6753a4bf 520 if (nscalars_skip)
521 test_limit = gimple_build (preheader_seq, PLUS_EXPR, compare_type,
522 test_limit, nscalars_skip);
60b29a7e 523 test_gsi = &loop_cond_gsi;
6753a4bf 524
525 first_limit = test_limit;
60b29a7e 526 }
527
528 /* Provide a definition of each mask in the group. */
529 tree next_mask = NULL_TREE;
530 tree mask;
531 unsigned int i;
532 FOR_EACH_VEC_ELT_REVERSE (rgm->masks, i, mask)
533 {
534 /* Previous masks will cover BIAS scalars. This mask covers the
535 next batch. */
536 poly_uint64 bias = nscalars_per_mask * i;
537 tree bias_tree = build_int_cst (compare_type, bias);
538 gimple *tmp_stmt;
539
540 /* See whether the first iteration of the vector loop is known
541 to have a full mask. */
542 poly_uint64 const_limit;
543 bool first_iteration_full
6753a4bf 544 = (poly_int_tree_p (first_limit, &const_limit)
60b29a7e 545 && known_ge (const_limit, (i + 1) * nscalars_per_mask));
546
547 /* Rather than have a new IV that starts at BIAS and goes up to
548 TEST_LIMIT, prefer to use the same 0-based IV for each mask
549 and adjust the bound down by BIAS. */
550 tree this_test_limit = test_limit;
551 if (i != 0)
552 {
553 this_test_limit = gimple_build (preheader_seq, MAX_EXPR,
554 compare_type, this_test_limit,
555 bias_tree);
556 this_test_limit = gimple_build (preheader_seq, MINUS_EXPR,
557 compare_type, this_test_limit,
558 bias_tree);
559 }
560
6753a4bf 561 /* Create the initial mask. First include all scalars that
562 are within the loop limit. */
60b29a7e 563 tree init_mask = NULL_TREE;
564 if (!first_iteration_full)
565 {
566 tree start, end;
6753a4bf 567 if (first_limit == test_limit)
60b29a7e 568 {
569 /* Use a natural test between zero (the initial IV value)
570 and the loop limit. The "else" block would be valid too,
571 but this choice can avoid the need to load BIAS_TREE into
572 a register. */
573 start = zero_index;
574 end = this_test_limit;
575 }
576 else
577 {
6753a4bf 578 /* FIRST_LIMIT is the maximum number of scalars handled by the
579 first iteration of the vector loop. Test the portion
580 associated with this mask. */
60b29a7e 581 start = bias_tree;
6753a4bf 582 end = first_limit;
60b29a7e 583 }
584
585 init_mask = make_temp_ssa_name (mask_type, NULL, "max_mask");
586 tmp_stmt = vect_gen_while (init_mask, start, end);
587 gimple_seq_add_stmt (preheader_seq, tmp_stmt);
588 }
589
6753a4bf 590 /* Now AND out the bits that are within the number of skipped
591 scalars. */
592 poly_uint64 const_skip;
593 if (nscalars_skip
594 && !(poly_int_tree_p (nscalars_skip, &const_skip)
595 && known_le (const_skip, bias)))
596 {
597 tree unskipped_mask = vect_gen_while_not (preheader_seq, mask_type,
598 bias_tree, nscalars_skip);
599 if (init_mask)
600 init_mask = gimple_build (preheader_seq, BIT_AND_EXPR, mask_type,
601 init_mask, unskipped_mask);
602 else
603 init_mask = unskipped_mask;
604 }
605
60b29a7e 606 if (!init_mask)
607 /* First iteration is full. */
608 init_mask = build_minus_one_cst (mask_type);
609
610 /* Get the mask value for the next iteration of the loop. */
611 next_mask = make_temp_ssa_name (mask_type, NULL, "next_mask");
612 gcall *call = vect_gen_while (next_mask, test_index, this_test_limit);
613 gsi_insert_before (test_gsi, call, GSI_SAME_STMT);
614
615 vect_set_loop_mask (loop, mask, init_mask, next_mask);
616 }
617 return next_mask;
618}
619
620/* Make LOOP iterate NITERS times using masking and WHILE_ULT calls.
621 LOOP_VINFO describes the vectorization of LOOP. NITERS is the
53771608 622 number of iterations of the original scalar loop that should be
623 handled by the vector loop. NITERS_MAYBE_ZERO and FINAL_IV are
624 as for vect_set_loop_condition.
60b29a7e 625
626 Insert the branch-back condition before LOOP_COND_GSI and return the
627 final gcond. */
628
629static gcond *
630vect_set_loop_condition_masked (struct loop *loop, loop_vec_info loop_vinfo,
631 tree niters, tree final_iv,
632 bool niters_maybe_zero,
633 gimple_stmt_iterator loop_cond_gsi)
634{
635 gimple_seq preheader_seq = NULL;
636 gimple_seq header_seq = NULL;
637
638 tree compare_type = LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo);
639 unsigned int compare_precision = TYPE_PRECISION (compare_type);
640 unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);
641 tree orig_niters = niters;
642
643 /* Type of the initial value of NITERS. */
644 tree ni_actual_type = TREE_TYPE (niters);
645 unsigned int ni_actual_precision = TYPE_PRECISION (ni_actual_type);
646
647 /* Convert NITERS to the same size as the compare. */
648 if (compare_precision > ni_actual_precision
649 && niters_maybe_zero)
650 {
651 /* We know that there is always at least one iteration, so if the
652 count is zero then it must have wrapped. Cope with this by
653 subtracting 1 before the conversion and adding 1 to the result. */
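      /* Illustrative example: if NITERS is an 8-bit value that wrapped to
         0 (meaning 256 iterations), then 0 - 1 == 255, which converts
         losslessly to COMPARE_TYPE, and adding 1 back gives 256.  */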
654 gcc_assert (TYPE_UNSIGNED (ni_actual_type));
655 niters = gimple_build (&preheader_seq, PLUS_EXPR, ni_actual_type,
656 niters, build_minus_one_cst (ni_actual_type));
657 niters = gimple_convert (&preheader_seq, compare_type, niters);
658 niters = gimple_build (&preheader_seq, PLUS_EXPR, compare_type,
659 niters, build_one_cst (compare_type));
660 }
661 else
662 niters = gimple_convert (&preheader_seq, compare_type, niters);
663
6753a4bf 664 /* Convert skip_niters to the right type. */
665 tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
666
60b29a7e 667 /* Now calculate the value that the induction variable must be able
668 to hit in order to ensure that we end the loop with an all-false mask.
669 This involves adding the maximum number of inactive trailing scalar
670 iterations. */
671 widest_int iv_limit;
672 bool known_max_iters = max_loop_iterations (loop, &iv_limit);
673 if (known_max_iters)
674 {
6753a4bf 675 if (niters_skip)
676 {
677 /* Add the maximum number of skipped iterations to the
678 maximum iteration count. */
679 if (TREE_CODE (niters_skip) == INTEGER_CST)
680 iv_limit += wi::to_widest (niters_skip);
681 else
682 iv_limit += max_vf - 1;
683 }
60b29a7e 684 /* IV_LIMIT is the maximum number of latch iterations, which is also
685 the maximum in-range IV value. Round this value down to the previous
686 vector alignment boundary and then add an extra full iteration. */
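      /* Illustrative example: with a constant VF of 8 (so MAX_VF == 8)
         and IV_LIMIT == 20, the result is (20 & -8) + 8 == 24.  */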
687 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
688 iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
689 }
690
691 /* Get the vectorization factor in tree form. */
692 tree vf = build_int_cst (compare_type,
693 LOOP_VINFO_VECT_FACTOR (loop_vinfo));
694
695 /* Iterate over all the rgroups and fill in their masks. We could use
696 the first mask from any rgroup for the loop condition; here we
697 arbitrarily pick the last. */
698 tree test_mask = NULL_TREE;
699 rgroup_masks *rgm;
700 unsigned int i;
701 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
702 FOR_EACH_VEC_ELT (*masks, i, rgm)
703 if (!rgm->masks.is_empty ())
704 {
705 /* First try using permutes. This adds a single vector
706 instruction to the loop for each mask, but needs no extra
707 loop invariants or IVs. */
708 unsigned int nmasks = i + 1;
709 if ((nmasks & 1) == 0)
710 {
711 rgroup_masks *half_rgm = &(*masks)[nmasks / 2 - 1];
712 if (!half_rgm->masks.is_empty ()
713 && vect_maybe_permute_loop_masks (&header_seq, rgm, half_rgm))
714 continue;
715 }
716
717 /* See whether zero-based IV would ever generate all-false masks
718 before wrapping around. */
719 bool might_wrap_p
720 = (!known_max_iters
721 || (wi::min_precision (iv_limit * rgm->max_nscalars_per_iter,
722 UNSIGNED)
723 > compare_precision));
724
725 /* Set up all masks for this group. */
726 test_mask = vect_set_loop_masks_directly (loop, loop_vinfo,
727 &preheader_seq,
728 loop_cond_gsi, rgm, vf,
6753a4bf 729 niters, niters_skip,
730 might_wrap_p);
60b29a7e 731 }
732
733 /* Emit all accumulated statements. */
734 add_preheader_seq (loop, preheader_seq);
735 add_header_seq (loop, header_seq);
736
737 /* Get a boolean result that tells us whether to iterate. */
738 edge exit_edge = single_exit (loop);
739 tree_code code = (exit_edge->flags & EDGE_TRUE_VALUE) ? EQ_EXPR : NE_EXPR;
740 tree zero_mask = build_zero_cst (TREE_TYPE (test_mask));
741 gcond *cond_stmt = gimple_build_cond (code, test_mask, zero_mask,
742 NULL_TREE, NULL_TREE);
743 gsi_insert_before (&loop_cond_gsi, cond_stmt, GSI_SAME_STMT);
744
745 /* The loop iterates (NITERS - 1) / VF + 1 times.
746 Subtract one from this to get the latch count. */
747 tree step = build_int_cst (compare_type,
748 LOOP_VINFO_VECT_FACTOR (loop_vinfo));
749 tree niters_minus_one = fold_build2 (PLUS_EXPR, compare_type, niters,
750 build_minus_one_cst (compare_type));
751 loop->nb_iterations = fold_build2 (TRUNC_DIV_EXPR, compare_type,
752 niters_minus_one, step);
753
754 if (final_iv)
755 {
756 gassign *assign = gimple_build_assign (final_iv, orig_niters);
757 gsi_insert_on_edge_immediate (single_exit (loop), assign);
758 }
759
760 return cond_stmt;
761}
762
763/* Like vect_set_loop_condition, but handle the case in which there
764 are no loop masks. */
765
766static gcond *
767vect_set_loop_condition_unmasked (struct loop *loop, tree niters,
768 tree step, tree final_iv,
769 bool niters_maybe_zero,
770 gimple_stmt_iterator loop_cond_gsi)
fb85abff 771{
772 tree indx_before_incr, indx_after_incr;
1a91d914 773 gcond *cond_stmt;
774 gcond *orig_cond;
cde959e7 775 edge pe = loop_preheader_edge (loop);
fb85abff 776 edge exit_edge = single_exit (loop);
fb85abff 777 gimple_stmt_iterator incr_gsi;
778 bool insert_after;
fb85abff 779 enum tree_code code;
cde959e7 780 tree niters_type = TREE_TYPE (niters);
fb85abff 781
782 orig_cond = get_loop_exit_condition (loop);
783 gcc_assert (orig_cond);
784 loop_cond_gsi = gsi_for_stmt (orig_cond);
785
cde959e7 786 tree init, limit;
787 if (!niters_maybe_zero && integer_onep (step))
788 {
789 /* In this case we can use a simple 0-based IV:
790
791 A:
792 x = 0;
793 do
794 {
795 ...
796 x += 1;
797 }
798 while (x < NITERS); */
799 code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GE_EXPR : LT_EXPR;
800 init = build_zero_cst (niters_type);
801 limit = niters;
802 }
803 else
804 {
805 /* The following works for all values of NITERS except 0:
806
807 B:
808 x = 0;
809 do
810 {
811 ...
812 x += STEP;
813 }
814 while (x <= NITERS - STEP);
815
816 so that the loop continues to iterate if x + STEP - 1 < NITERS
817 but stops if x + STEP - 1 >= NITERS.
818
819 However, if NITERS is zero, x never hits a value above NITERS - STEP
820 before wrapping around. There are two obvious ways of dealing with
821 this:
822
823 - start at STEP - 1 and compare x before incrementing it
824 - start at -1 and compare x after incrementing it
825
826 The latter is simpler and is what we use. The loop in this case
827 looks like:
828
829 C:
830 x = -1;
831 do
832 {
833 ...
834 x += STEP;
835 }
836 while (x < NITERS - STEP);
837
838 In both cases the loop limit is NITERS - STEP. */
839 gimple_seq seq = NULL;
840 limit = force_gimple_operand (niters, &seq, true, NULL_TREE);
841 limit = gimple_build (&seq, MINUS_EXPR, TREE_TYPE (limit), limit, step);
842 if (seq)
843 {
844 basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
845 gcc_assert (!new_bb);
846 }
847 if (niters_maybe_zero)
848 {
849 /* Case C. */
850 code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GE_EXPR : LT_EXPR;
851 init = build_all_ones_cst (niters_type);
852 }
853 else
854 {
855 /* Case B. */
856 code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GT_EXPR : LE_EXPR;
857 init = build_zero_cst (niters_type);
858 }
859 }
860
fb85abff 861 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
862 create_iv (init, step, NULL_TREE, loop,
863 &incr_gsi, insert_after, &indx_before_incr, &indx_after_incr);
fb85abff 864 indx_after_incr = force_gimple_operand_gsi (&loop_cond_gsi, indx_after_incr,
865 true, NULL_TREE, true,
866 GSI_SAME_STMT);
cde959e7 867 limit = force_gimple_operand_gsi (&loop_cond_gsi, limit, true, NULL_TREE,
fb85abff 868 true, GSI_SAME_STMT);
869
cde959e7 870 cond_stmt = gimple_build_cond (code, indx_after_incr, limit, NULL_TREE,
fb85abff 871 NULL_TREE);
872
873 gsi_insert_before (&loop_cond_gsi, cond_stmt, GSI_SAME_STMT);
874
56c0f85a 875 /* Record the number of latch iterations. */
cde959e7 876 if (limit == niters)
877 /* Case A: the loop iterates NITERS times. Subtract one to get the
878 latch count. */
879 loop->nb_iterations = fold_build2 (MINUS_EXPR, niters_type, niters,
880 build_int_cst (niters_type, 1));
881 else
882 /* Case B or C: the loop iterates (NITERS - STEP) / STEP + 1 times.
883 Subtract one from this to get the latch count. */
884 loop->nb_iterations = fold_build2 (TRUNC_DIV_EXPR, niters_type,
885 limit, step);
886
887 if (final_iv)
888 {
889 gassign *assign = gimple_build_assign (final_iv, MINUS_EXPR,
890 indx_after_incr, init);
891 gsi_insert_on_edge_immediate (single_exit (loop), assign);
892 }
60b29a7e 893
894 return cond_stmt;
895}
896
897/* If we're using fully-masked loops, make LOOP iterate:
898
899 N == (NITERS - 1) / STEP + 1
900
901 times. When NITERS is zero, this is equivalent to making the loop
902 execute (1 << M) / STEP times, where M is the precision of NITERS.
903 NITERS_MAYBE_ZERO is true if this last case might occur.
904
905 If we're not using fully-masked loops, make LOOP iterate:
906
907 N == (NITERS - STEP) / STEP + 1
908
909 times, where NITERS is known to be outside the range [1, STEP - 1].
910 This is equivalent to making the loop execute NITERS / STEP times
911 when NITERS is nonzero and (1 << M) / STEP times otherwise.
912 NITERS_MAYBE_ZERO again indicates whether this last case might occur.
913
914 If FINAL_IV is nonnull, it is an SSA name that should be set to
915 N * STEP on exit from the loop.
916
917 Assumption: the exit-condition of LOOP is the last stmt in the loop. */
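/* Illustrative example (added comment): with NITERS == 20 and STEP == 8,
   a fully-masked loop iterates (20 - 1) / 8 + 1 == 3 times, whereas an
   unmasked loop iterates (20 - 8) / 8 + 1 == 2 times (i.e. 20 / 8 with
   truncating division).  */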
918
919void
920vect_set_loop_condition (struct loop *loop, loop_vec_info loop_vinfo,
921 tree niters, tree step, tree final_iv,
922 bool niters_maybe_zero)
923{
924 gcond *cond_stmt;
925 gcond *orig_cond = get_loop_exit_condition (loop);
926 gimple_stmt_iterator loop_cond_gsi = gsi_for_stmt (orig_cond);
927
928 if (loop_vinfo && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
929 cond_stmt = vect_set_loop_condition_masked (loop, loop_vinfo, niters,
930 final_iv, niters_maybe_zero,
931 loop_cond_gsi);
932 else
933 cond_stmt = vect_set_loop_condition_unmasked (loop, niters, step,
934 final_iv, niters_maybe_zero,
935 loop_cond_gsi);
936
937 /* Remove old loop exit test. */
938 gsi_remove (&loop_cond_gsi, true);
939 free_stmt_vec_info (orig_cond);
940
941 if (dump_enabled_p ())
942 {
943 dump_printf_loc (MSG_NOTE, vect_location, "New loop exit condition: ");
944 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, cond_stmt, 0);
945 }
fb85abff 946}
947
c71d3c24 948/* Helper routine of slpeel_tree_duplicate_loop_to_edge_cfg.
 949 For each pair of PHIs in FROM->dest and TO->dest, ensure that the
 950 argument on edge TO has its current_def set to the current_def of
 951 the corresponding argument on edge FROM. */
952
953static void
954slpeel_duplicate_current_defs_from_edges (edge from, edge to)
955{
956 gimple_stmt_iterator gsi_from, gsi_to;
957
958 for (gsi_from = gsi_start_phis (from->dest),
959 gsi_to = gsi_start_phis (to->dest);
e87502d6 960 !gsi_end_p (gsi_from) && !gsi_end_p (gsi_to);)
c71d3c24 961 {
42acab1c 962 gimple *from_phi = gsi_stmt (gsi_from);
963 gimple *to_phi = gsi_stmt (gsi_to);
c71d3c24 964 tree from_arg = PHI_ARG_DEF_FROM_EDGE (from_phi, from);
e3243c77 965 tree to_arg = PHI_ARG_DEF_FROM_EDGE (to_phi, to);
966 if (virtual_operand_p (from_arg))
967 {
e87502d6 968 gsi_next (&gsi_from);
969 continue;
970 }
e3243c77 971 if (virtual_operand_p (to_arg))
972 {
e87502d6 973 gsi_next (&gsi_to);
974 continue;
975 }
e3243c77 976 if (TREE_CODE (from_arg) != SSA_NAME)
977 gcc_assert (operand_equal_p (from_arg, to_arg, 0));
978 else
979 {
980 if (get_current_def (to_arg) == NULL_TREE)
981 set_current_def (to_arg, get_current_def (from_arg));
982 }
e87502d6 983 gsi_next (&gsi_from);
984 gsi_next (&gsi_to);
c71d3c24 985 }
e3243c77 986
987 gphi *from_phi = get_virtual_phi (from->dest);
988 gphi *to_phi = get_virtual_phi (to->dest);
989 if (from_phi)
990 set_current_def (PHI_ARG_DEF_FROM_EDGE (to_phi, to),
991 get_current_def (PHI_ARG_DEF_FROM_EDGE (from_phi, from)));
c71d3c24 992}
993
fb85abff 994
48e1416a 995/* Given LOOP this function generates a new copy of it and puts it
c71d3c24 996 on E which is either the entry or exit of LOOP. If SCALAR_LOOP is
997 non-NULL, assume LOOP and SCALAR_LOOP are equivalent and copy the
998 basic blocks from SCALAR_LOOP instead of LOOP, but to either the
999 entry or exit of LOOP. */
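/* Note (added comment): when E is the exit edge of LOOP, the copy is laid
   out after LOOP, as when peeling an epilogue; when E is the preheader
   edge, the copy is laid out before LOOP, as when peeling a prologue.  */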
fb85abff 1000
1001struct loop *
c71d3c24 1002slpeel_tree_duplicate_loop_to_edge_cfg (struct loop *loop,
1003 struct loop *scalar_loop, edge e)
fb85abff 1004{
1005 struct loop *new_loop;
8a61cfb8 1006 basic_block *new_bbs, *bbs, *pbbs;
fb85abff 1007 bool at_exit;
1008 bool was_imm_dom;
48e1416a 1009 basic_block exit_dest;
fb85abff 1010 edge exit, new_exit;
5ee742c4 1011 bool duplicate_outer_loop = false;
fb85abff 1012
c9b2c569 1013 exit = single_exit (loop);
1014 at_exit = (e == exit);
fb85abff 1015 if (!at_exit && e != loop_preheader_edge (loop))
1016 return NULL;
1017
c71d3c24 1018 if (scalar_loop == NULL)
1019 scalar_loop = loop;
1020
1021 bbs = XNEWVEC (basic_block, scalar_loop->num_nodes + 1);
8a61cfb8 1022 pbbs = bbs + 1;
1023 get_loop_body_with_size (scalar_loop, pbbs, scalar_loop->num_nodes);
5ee742c4 1024 /* Allow duplication of outer loops. */
1025 if (scalar_loop->inner)
1026 duplicate_outer_loop = true;
fb85abff 1027 /* Check whether duplication is possible. */
8a61cfb8 1028 if (!can_copy_bbs_p (pbbs, scalar_loop->num_nodes))
fb85abff 1029 {
1030 free (bbs);
1031 return NULL;
1032 }
1033
1034 /* Generate new loop structure. */
c71d3c24 1035 new_loop = duplicate_loop (scalar_loop, loop_outer (scalar_loop));
1036 duplicate_subloops (scalar_loop, new_loop);
fb85abff 1037
c9b2c569 1038 exit_dest = exit->dest;
48e1416a 1039 was_imm_dom = (get_immediate_dominator (CDI_DOMINATORS,
1040 exit_dest) == loop->header ?
fb85abff 1041 true : false);
1042
c9b2c569 1043 /* Also copy the pre-header; this avoids jumping through hoops to
1044 duplicate the loop entry PHI arguments. Create an empty
1045 pre-header unconditionally for this. */
c71d3c24 1046 basic_block preheader = split_edge (loop_preheader_edge (scalar_loop));
c9b2c569 1047 edge entry_e = single_pred_edge (preheader);
8a61cfb8 1048 bbs[0] = preheader;
c71d3c24 1049 new_bbs = XNEWVEC (basic_block, scalar_loop->num_nodes + 1);
fb85abff 1050
c71d3c24 1051 exit = single_exit (scalar_loop);
1052 copy_bbs (bbs, scalar_loop->num_nodes + 1, new_bbs,
fb85abff 1053 &exit, 1, &new_exit, NULL,
8a61cfb8 1054 at_exit ? loop->latch : e->src, true);
c71d3c24 1055 exit = single_exit (loop);
8a61cfb8 1056 basic_block new_preheader = new_bbs[0];
fb85abff 1057
c71d3c24 1058 add_phi_args_after_copy (new_bbs, scalar_loop->num_nodes + 1, NULL);
fb85abff 1059
c71d3c24 1060 if (scalar_loop != loop)
1061 {
1062 /* If we copied from SCALAR_LOOP rather than LOOP, SSA_NAMEs from
1063 SCALAR_LOOP will have current_def set to SSA_NAMEs in the new_loop,
1064 but LOOP will not. slpeel_update_phi_nodes_for_guard{1,2} expects
1065 the LOOP SSA_NAMEs (on the exit edge and edge from latch to
1066 header) to have current_def set, so copy them over. */
1067 slpeel_duplicate_current_defs_from_edges (single_exit (scalar_loop),
1068 exit);
1069 slpeel_duplicate_current_defs_from_edges (EDGE_SUCC (scalar_loop->latch,
1070 0),
1071 EDGE_SUCC (loop->latch, 0));
1072 }
48e1416a 1073
fb85abff 1074 if (at_exit) /* Add the loop copy at exit. */
1075 {
c71d3c24 1076 if (scalar_loop != loop)
1077 {
1a91d914 1078 gphi_iterator gsi;
c71d3c24 1079 new_exit = redirect_edge_and_branch (new_exit, exit_dest);
1080
1081 for (gsi = gsi_start_phis (exit_dest); !gsi_end_p (gsi);
1082 gsi_next (&gsi))
1083 {
1a91d914 1084 gphi *phi = gsi.phi ();
c71d3c24 1085 tree orig_arg = PHI_ARG_DEF_FROM_EDGE (phi, e);
1086 location_t orig_locus
1087 = gimple_phi_arg_location_from_edge (phi, e);
1088
1089 add_phi_arg (phi, orig_arg, new_exit, orig_locus);
1090 }
1091 }
c9b2c569 1092 redirect_edge_and_branch_force (e, new_preheader);
1093 flush_pending_stmts (e);
1094 set_immediate_dominator (CDI_DOMINATORS, new_preheader, e->src);
5ee742c4 1095 if (was_imm_dom || duplicate_outer_loop)
c71d3c24 1096 set_immediate_dominator (CDI_DOMINATORS, exit_dest, new_exit->src);
c9b2c569 1097
1098 /* And remove the non-necessary forwarder again. Keep the other
1099 one so we have a proper pre-header for the loop at the exit edge. */
c71d3c24 1100 redirect_edge_pred (single_succ_edge (preheader),
1101 single_pred (preheader));
c9b2c569 1102 delete_basic_block (preheader);
c71d3c24 1103 set_immediate_dominator (CDI_DOMINATORS, scalar_loop->header,
1104 loop_preheader_edge (scalar_loop)->src);
fb85abff 1105 }
1106 else /* Add the copy at entry. */
1107 {
c71d3c24 1108 if (scalar_loop != loop)
1109 {
1110 /* Remove the non-necessary forwarder of scalar_loop again. */
1111 redirect_edge_pred (single_succ_edge (preheader),
1112 single_pred (preheader));
1113 delete_basic_block (preheader);
1114 set_immediate_dominator (CDI_DOMINATORS, scalar_loop->header,
1115 loop_preheader_edge (scalar_loop)->src);
1116 preheader = split_edge (loop_preheader_edge (loop));
1117 entry_e = single_pred_edge (preheader);
1118 }
1119
c9b2c569 1120 redirect_edge_and_branch_force (entry_e, new_preheader);
1121 flush_pending_stmts (entry_e);
1122 set_immediate_dominator (CDI_DOMINATORS, new_preheader, entry_e->src);
1123
1124 redirect_edge_and_branch_force (new_exit, preheader);
1125 flush_pending_stmts (new_exit);
1126 set_immediate_dominator (CDI_DOMINATORS, preheader, new_exit->src);
1127
1128 /* And remove the non-necessary forwarder again. Keep the other
1129 one so we have a proper pre-header for the loop at the exit edge. */
c71d3c24 1130 redirect_edge_pred (single_succ_edge (new_preheader),
1131 single_pred (new_preheader));
c9b2c569 1132 delete_basic_block (new_preheader);
1133 set_immediate_dominator (CDI_DOMINATORS, new_loop->header,
1134 loop_preheader_edge (new_loop)->src);
fb85abff 1135 }
1136
6e429c5c 1137 /* Skip new preheader since it's deleted if copy loop is added at entry. */
1138 for (unsigned i = (at_exit ? 0 : 1); i < scalar_loop->num_nodes + 1; i++)
5ee742c4 1139 rename_variables_in_bb (new_bbs[i], duplicate_outer_loop);
c9b2c569 1140
c71d3c24 1141 if (scalar_loop != loop)
1142 {
1143 /* Update new_loop->header PHIs, so that on the preheader
1144 edge they are the ones from loop rather than scalar_loop. */
1a91d914 1145 gphi_iterator gsi_orig, gsi_new;
c71d3c24 1146 edge orig_e = loop_preheader_edge (loop);
1147 edge new_e = loop_preheader_edge (new_loop);
1148
1149 for (gsi_orig = gsi_start_phis (loop->header),
1150 gsi_new = gsi_start_phis (new_loop->header);
1151 !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_new);
1152 gsi_next (&gsi_orig), gsi_next (&gsi_new))
1153 {
1a91d914 1154 gphi *orig_phi = gsi_orig.phi ();
1155 gphi *new_phi = gsi_new.phi ();
c71d3c24 1156 tree orig_arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, orig_e);
1157 location_t orig_locus
1158 = gimple_phi_arg_location_from_edge (orig_phi, orig_e);
1159
1160 add_phi_arg (new_phi, orig_arg, new_e, orig_locus);
1161 }
1162 }
1163
fb85abff 1164 free (new_bbs);
1165 free (bbs);
1166
382ecba7 1167 checking_verify_dominators (CDI_DOMINATORS);
c9b2c569 1168
fb85abff 1169 return new_loop;
1170}
1171
1172
6c6a3430 1173/* Given the condition expression COND, put it as the last statement of
1174 GUARD_BB; set both edges' probability; set dominator of GUARD_TO to
1175 DOM_BB; return the skip edge. GUARD_TO is the target basic block to
15492f79 1176 skip the loop. PROBABILITY is the skip edge's probability. Mark the
1177 new edge as irreducible if IRREDUCIBLE_P is true. */
fb85abff 1178
1179static edge
23a3430d 1180slpeel_add_loop_guard (basic_block guard_bb, tree cond,
6c6a3430 1181 basic_block guard_to, basic_block dom_bb,
720cfc43 1182 profile_probability probability, bool irreducible_p)
fb85abff 1183{
1184 gimple_stmt_iterator gsi;
1185 edge new_e, enter_e;
1a91d914 1186 gcond *cond_stmt;
fb85abff 1187 gimple_seq gimplify_stmt_list = NULL;
1188
1189 enter_e = EDGE_SUCC (guard_bb, 0);
1190 enter_e->flags &= ~EDGE_FALLTHRU;
1191 enter_e->flags |= EDGE_FALSE_VALUE;
1192 gsi = gsi_last_bb (guard_bb);
1193
87ae3989 1194 cond = force_gimple_operand_1 (cond, &gimplify_stmt_list, is_gimple_condexpr,
1195 NULL_TREE);
23a3430d 1196 if (gimplify_stmt_list)
6c6a3430 1197 gsi_insert_seq_after (&gsi, gimplify_stmt_list, GSI_NEW_STMT);
fb85abff 1198
6c6a3430 1199 cond_stmt = gimple_build_cond_from_tree (cond, NULL_TREE, NULL_TREE);
fb85abff 1200 gsi = gsi_last_bb (guard_bb);
1201 gsi_insert_after (&gsi, cond_stmt, GSI_NEW_STMT);
1202
1203 /* Add new edge to connect guard block to the merge/loop-exit block. */
6c6a3430 1204 new_e = make_edge (guard_bb, guard_to, EDGE_TRUE_VALUE);
877584e4 1205
877584e4 1206 new_e->probability = probability;
15492f79 1207 if (irreducible_p)
1208 new_e->flags |= EDGE_IRREDUCIBLE_LOOP;
1209
720cfc43 1210 enter_e->probability = probability.invert ();
6c6a3430 1211 set_immediate_dominator (CDI_DOMINATORS, guard_to, dom_bb);
a62a4a78 1212
1213 /* Split enter_e to preserve LOOPS_HAVE_PREHEADERS. */
1214 if (enter_e->dest->loop_father->header == enter_e->dest)
1215 split_edge (enter_e);
1216
fb85abff 1217 return new_e;
1218}
1219
1220
1221/* This function verifies that the following restrictions apply to LOOP:
5ee742c4 1222 (1) an innermost loop consists of exactly 2 basic blocks - a header and
 1223 an empty latch - while an outer loop consists of 5 basic blocks.
1224 (2) it is single entry, single exit
1225 (3) its exit condition is the last stmt in the header
1226 (4) E is the entry/exit edge of LOOP.
fb85abff 1227 */
1228
1229bool
1230slpeel_can_duplicate_loop_p (const struct loop *loop, const_edge e)
1231{
1232 edge exit_e = single_exit (loop);
1233 edge entry_e = loop_preheader_edge (loop);
1a91d914 1234 gcond *orig_cond = get_loop_exit_condition (loop);
fb85abff 1235 gimple_stmt_iterator loop_exit_gsi = gsi_last_bb (exit_e->src);
5ee742c4 1236 unsigned int num_bb = loop->inner? 5 : 2;
fb85abff 1237
51eb70a9 1238 /* All loops have an outer scope; the only case loop->outer is NULL is for
1239 the function itself. */
1240 if (!loop_outer (loop)
5ee742c4 1241 || loop->num_nodes != num_bb
fb85abff 1242 || !empty_block_p (loop->latch)
1243 || !single_exit (loop)
1244 /* Verify that new loop exit condition can be trivially modified. */
1245 || (!orig_cond || orig_cond != gsi_stmt (loop_exit_gsi))
1246 || (e != exit_e && e != entry_e))
1247 return false;
1248
1249 return true;
1250}
1251
6c6a3430 1252/* If the loop has a virtual PHI, but exit bb doesn't, create a virtual PHI
1253 in the exit bb and rename all the uses after the loop. This simplifies
1254 the *guard[12] routines, which assume loop closed SSA form for all PHIs
1255 (but normally loop closed SSA form doesn't require virtual PHIs to be
 1256 in the same form). Doing this early simplifies checking which
 1257 uses should be renamed. */
fb85abff 1258
1259static void
6c6a3430 1260create_lcssa_for_virtual_phi (struct loop *loop)
fb85abff 1261{
1a91d914 1262 gphi_iterator gsi;
fb85abff 1263 edge exit_e = single_exit (loop);
48e1416a 1264
38091110 1265 for (gsi = gsi_start_phis (loop->header); !gsi_end_p (gsi); gsi_next (&gsi))
7c782c9b 1266 if (virtual_operand_p (gimple_phi_result (gsi_stmt (gsi))))
38091110 1267 {
1a91d914 1268 gphi *phi = gsi.phi ();
38091110 1269 for (gsi = gsi_start_phis (exit_e->dest);
1270 !gsi_end_p (gsi); gsi_next (&gsi))
7c782c9b 1271 if (virtual_operand_p (gimple_phi_result (gsi_stmt (gsi))))
38091110 1272 break;
1273 if (gsi_end_p (gsi))
1274 {
f9e245b2 1275 tree new_vop = copy_ssa_name (PHI_RESULT (phi));
1a91d914 1276 gphi *new_phi = create_phi_node (new_vop, exit_e->dest);
38091110 1277 tree vop = PHI_ARG_DEF_FROM_EDGE (phi, EDGE_SUCC (loop->latch, 0));
1278 imm_use_iterator imm_iter;
42acab1c 1279 gimple *stmt;
38091110 1280 use_operand_p use_p;
1281
6c99cfac 1282 SSA_NAME_OCCURS_IN_ABNORMAL_PHI (new_vop)
1283 = SSA_NAME_OCCURS_IN_ABNORMAL_PHI (vop);
60d535d2 1284 add_phi_arg (new_phi, vop, exit_e, UNKNOWN_LOCATION);
38091110 1285 gimple_phi_set_result (new_phi, new_vop);
1286 FOR_EACH_IMM_USE_STMT (stmt, imm_iter, vop)
480233e7 1287 if (stmt != new_phi
1288 && !flow_bb_inside_loop_p (loop, gimple_bb (stmt)))
38091110 1289 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
1290 SET_USE (use_p, new_vop);
1291 }
1292 break;
1293 }
fb85abff 1294
fb85abff 1295}
1296
1297/* Function vect_get_loop_location.
1298
1299 Extract the location of the loop in the source code.
1300 If the loop is not well formed for vectorization, an estimated
1301 location is calculated.
 1302 Return the loop location on success and an unknown location otherwise. */
1303
c309657f 1304dump_user_location_t
fb85abff 1305find_loop_location (struct loop *loop)
1306{
42acab1c 1307 gimple *stmt = NULL;
fb85abff 1308 basic_block bb;
1309 gimple_stmt_iterator si;
1310
1311 if (!loop)
c309657f 1312 return dump_user_location_t ();
fb85abff 1313
1314 stmt = get_loop_exit_condition (loop);
1315
b2d978a6 1316 if (stmt
1317 && LOCATION_LOCUS (gimple_location (stmt)) > BUILTINS_LOCATION)
c309657f 1318 return stmt;
fb85abff 1319
1320 /* If we got here the loop is probably not "well formed",
1321 try to estimate the loop location */
1322
1323 if (!loop->header)
c309657f 1324 return dump_user_location_t ();
fb85abff 1325
1326 bb = loop->header;
1327
1328 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1329 {
1330 stmt = gsi_stmt (si);
b2d978a6 1331 if (LOCATION_LOCUS (gimple_location (stmt)) > BUILTINS_LOCATION)
c309657f 1332 return stmt;
fb85abff 1333 }
1334
c309657f 1335 return dump_user_location_t ();
fb85abff 1336}
1337
6c6a3430 1338/* Return true if PHI defines an IV of the loop to be vectorized. */
1339
1340static bool
1341iv_phi_p (gphi *phi)
1342{
1343 if (virtual_operand_p (PHI_RESULT (phi)))
1344 return false;
1345
1346 stmt_vec_info stmt_info = vinfo_for_stmt (phi);
1347 gcc_assert (stmt_info != NULL);
1348 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1349 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
1350 return false;
1351
1352 return true;
1353}
fb85abff 1354
fb85abff 1355/* Function vect_can_advance_ivs_p
1356
48e1416a 1357 In case the number of iterations that LOOP iterates is unknown at compile
1358 time, an epilog loop will be generated, and the loop induction variables
1359 (IVs) will be "advanced" to the value they are supposed to take just before
fb85abff 1360 the epilog loop. Here we check that the access function of the loop IVs
1361 and the expression that represents the loop bound are simple enough.
1362 These restrictions will be relaxed in the future. */
1363
48e1416a 1364bool
fb85abff 1365vect_can_advance_ivs_p (loop_vec_info loop_vinfo)
1366{
1367 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1368 basic_block bb = loop->header;
1a91d914 1369 gphi_iterator gsi;
fb85abff 1370
1371 /* Analyze phi functions of the loop header. */
1372
6d8fb6cf 1373 if (dump_enabled_p ())
78bb46f5 1374 dump_printf_loc (MSG_NOTE, vect_location, "vect_can_advance_ivs_p:\n");
fb85abff 1375 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
1376 {
fb85abff 1377 tree evolution_part;
1378
6c6a3430 1379 gphi *phi = gsi.phi ();
6d8fb6cf 1380 if (dump_enabled_p ())
fb85abff 1381 {
7bd765d4 1382 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
1383 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
fb85abff 1384 }
1385
1386 /* Skip virtual phi's. The data dependences that are associated with
6c6a3430 1387 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.
fb85abff 1388
6c6a3430 1389 Skip reduction phis. */
1390 if (!iv_phi_p (phi))
fb85abff 1391 {
6d8fb6cf 1392 if (dump_enabled_p ())
6c6a3430 1393 dump_printf_loc (MSG_NOTE, vect_location,
1394 "reduc or virtual phi. skip.\n");
fb85abff 1395 continue;
1396 }
1397
fb85abff 1398 /* Analyze the evolution function. */
1399
2cd0995e 1400 evolution_part
1401 = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (vinfo_for_stmt (phi));
fb85abff 1402 if (evolution_part == NULL_TREE)
1403 {
6d8fb6cf 1404 if (dump_enabled_p ())
2cd0995e 1405 dump_printf (MSG_MISSED_OPTIMIZATION,
78bb46f5 1406 "No access function or evolution.\n");
fb85abff 1407 return false;
1408 }
48e1416a 1409
12117f39 1410 /* FORNOW: We do not transform initial conditions of IVs
1411 which evolution functions are not invariants in the loop. */
1412
1413 if (!expr_invariant_in_loop_p (loop, evolution_part))
1414 {
1415 if (dump_enabled_p ())
1416 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1417 "evolution not invariant in loop.\n");
1418 return false;
1419 }
1420
48e1416a 1421 /* FORNOW: We do not transform initial conditions of IVs
fb85abff 1422 which evolution functions are a polynomial of degree >= 2. */
1423
1424 if (tree_is_chrec (evolution_part))
12117f39 1425 {
1426 if (dump_enabled_p ())
1427 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1428 "evolution is chrec.\n");
1429 return false;
1430 }
fb85abff 1431 }
1432
1433 return true;
1434}
1435
1436
1437/* Function vect_update_ivs_after_vectorizer.
1438
1439 "Advance" the induction variables of LOOP to the value they should take
1440 after the execution of LOOP. This is currently necessary because the
1441 vectorizer does not handle induction variables that are used after the
1442 loop. Such a situation occurs when the last iterations of LOOP are
1443 peeled, because:
1444 1. We introduced new uses after LOOP for IVs that were not originally used
1445 after LOOP: the IVs of LOOP are now used by an epilog loop.
1446 2. LOOP is going to be vectorized; this means that it will iterate N/VF
1447 times, whereas the loop IVs should be bumped N times.
1448
1449 Input:
1450 - LOOP - a loop that is going to be vectorized. The last few iterations
1451 of LOOP were peeled.
1452 - NITERS - the number of iterations that LOOP executes (before it is
1453 vectorized). i.e, the number of times the ivs should be bumped.
1454 - UPDATE_E - a successor edge of LOOP->exit that is on the (only) path
1455 coming out from LOOP on which there are uses of the LOOP ivs
1456 (this is the path from LOOP->exit to epilog_loop->preheader).
1457
1458 The new definitions of the ivs are placed in LOOP->exit.
1459 The phi args associated with the edge UPDATE_E in the bb
1460 UPDATE_E->dest are updated accordingly.
1461
1462 Assumption 1: Like the rest of the vectorizer, this function assumes
1463 a single loop exit that has a single predecessor.
1464
1465 Assumption 2: The phi nodes in the LOOP header and in update_bb are
1466 organized in the same order.
1467
1468 Assumption 3: The access function of the ivs is simple enough (see
1469 vect_can_advance_ivs_p). This assumption will be relaxed in the future.
1470
1471 Assumption 4: Exactly one of the successors of LOOP exit-bb is on a path
48e1416a 1472 coming out of LOOP on which the ivs of LOOP are used (this is the path
fb85abff 1473 that leads to the epilog loop; other paths skip the epilog loop). This
1474 path starts with the edge UPDATE_E, and its destination (denoted update_bb)
1475 needs to have its phis updated.
1476 */
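/* Illustrative example (added comment): for a counter IV with initial
   value 0 and step 1, the new definition inserted in the exit block is
   simply NITERS; for a pointer IV with step 4 it is INIT_EXPR p+
   4 * NITERS, matching the computation of NI below.  */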
1477
1478static void
6c6a3430 1479vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo,
1480 tree niters, edge update_e)
fb85abff 1481{
1a91d914 1482 gphi_iterator gsi, gsi1;
6c6a3430 1483 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
fb85abff 1484 basic_block update_bb = update_e->dest;
6c6a3430 1485 basic_block exit_bb = single_exit (loop)->dest;
fb85abff 1486
1487 /* Make sure there exists a single-predecessor exit bb: */
1488 gcc_assert (single_pred_p (exit_bb));
6c6a3430 1489 gcc_assert (single_succ_edge (exit_bb) == update_e);
fb85abff 1490
1491 for (gsi = gsi_start_phis (loop->header), gsi1 = gsi_start_phis (update_bb);
1492 !gsi_end_p (gsi) && !gsi_end_p (gsi1);
1493 gsi_next (&gsi), gsi_next (&gsi1))
1494 {
fb85abff 1495 tree init_expr;
1efcacec 1496 tree step_expr, off;
1497 tree type;
fb85abff 1498 tree var, ni, ni_name;
1499 gimple_stmt_iterator last_gsi;
1500
6c6a3430 1501 gphi *phi = gsi.phi ();
1502 gphi *phi1 = gsi1.phi ();
6d8fb6cf 1503 if (dump_enabled_p ())
6c6a3430 1504 {
1505 dump_printf_loc (MSG_NOTE, vect_location,
1506 "vect_update_ivs_after_vectorizer: phi: ");
7bd765d4 1507 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
6c6a3430 1508 }
fb85abff 1509
6c6a3430 1510 /* Skip reduction and virtual phis. */
1511 if (!iv_phi_p (phi))
fb85abff 1512 {
6d8fb6cf 1513 if (dump_enabled_p ())
6c6a3430 1514 dump_printf_loc (MSG_NOTE, vect_location,
1515 "reduc or virtual phi. skip.\n");
fb85abff 1516 continue;
1517 }
1518
86faead7 1519 type = TREE_TYPE (gimple_phi_result (phi));
6c6a3430 1520 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (vinfo_for_stmt (phi));
86faead7 1521 step_expr = unshare_expr (step_expr);
48e1416a 1522
fb85abff 1523 /* FORNOW: We do not support IVs whose evolution function is a polynomial
1524 of degree >= 2 or exponential. */
86faead7 1525 gcc_assert (!tree_is_chrec (step_expr));
fb85abff 1526
86faead7 1527 init_expr = PHI_ARG_DEF_FROM_EDGE (phi, loop_preheader_edge (loop));
fb85abff 1528
1efcacec 1529 off = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
1530 fold_convert (TREE_TYPE (step_expr), niters),
1531 step_expr);
86faead7 1532 if (POINTER_TYPE_P (type))
2cc66f2a 1533 ni = fold_build_pointer_plus (init_expr, off);
fb85abff 1534 else
86faead7 1535 ni = fold_build2 (PLUS_EXPR, type,
1536 init_expr, fold_convert (type, off));
fb85abff 1537
86faead7 1538 var = create_tmp_var (type, "tmp");
fb85abff 1539
1540 last_gsi = gsi_last_bb (exit_bb);
6c6a3430 1541 gimple_seq new_stmts = NULL;
1542 ni_name = force_gimple_operand (ni, &new_stmts, false, var);
1543 /* Exit_bb shouldn't be empty. */
1544 if (!gsi_end_p (last_gsi))
1545 gsi_insert_seq_after (&last_gsi, new_stmts, GSI_SAME_STMT);
1546 else
1547 gsi_insert_seq_before (&last_gsi, new_stmts, GSI_SAME_STMT);
48e1416a 1548
fb85abff 1549 /* Fix phi expressions in the successor bb. */
b123eaab 1550 adjust_phi_and_debug_stmts (phi1, update_e, ni_name);
fb85abff 1551 }
1552}
1553
6753a4bf 1554/* Return a gimple value containing the misalignment (measured in vector
1555 elements) for the loop described by LOOP_VINFO, i.e. how many elements
1556 it is away from a perfectly aligned address. Add any new statements
1557 to SEQ. */
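/* Illustrative example (added comment): with a target alignment of 16
   bytes, 4-byte elements and a start address whose low bits are 0x8,
   the code below computes misalign_in_bytes == 8 and
   misalign_in_elems == 2.  */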
1558
1559static tree
1560get_misalign_in_elems (gimple **seq, loop_vec_info loop_vinfo)
1561{
1562 struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
1ce0a2db 1563 gimple *dr_stmt = vect_dr_stmt (dr);
6753a4bf 1564 stmt_vec_info stmt_info = vinfo_for_stmt (dr_stmt);
1565 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1566
1567 unsigned int target_align = DR_TARGET_ALIGNMENT (dr);
1568 gcc_assert (target_align != 0);
1569
1570 bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
1571 tree offset = (negative
1572 ? size_int (-TYPE_VECTOR_SUBPARTS (vectype) + 1)
1573 : size_zero_node);
1574 tree start_addr = vect_create_addr_base_for_vector_ref (dr_stmt, seq,
1575 offset);
1576 tree type = unsigned_type_for (TREE_TYPE (start_addr));
1577 tree target_align_minus_1 = build_int_cst (type, target_align - 1);
1578 HOST_WIDE_INT elem_size
1579 = int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1580 tree elem_size_log = build_int_cst (type, exact_log2 (elem_size));
1581
1582 /* Create: misalign_in_bytes = addr & (target_align - 1). */
1583 tree int_start_addr = fold_convert (type, start_addr);
1584 tree misalign_in_bytes = fold_build2 (BIT_AND_EXPR, type, int_start_addr,
1585 target_align_minus_1);
1586
1587 /* Create: misalign_in_elems = misalign_in_bytes / element_size. */
1588 tree misalign_in_elems = fold_build2 (RSHIFT_EXPR, type, misalign_in_bytes,
1589 elem_size_log);
1590
1591 return misalign_in_elems;
1592}
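
/* Illustrative sketch, not part of this file: the computation built by
   get_misalign_in_elems, written on plain integers.  TARGET_ALIGN and
   ELEM_SIZE are assumed to be powers of two; the identifiers are
   hypothetical.  */

static unsigned long
example_misalign_in_elems (unsigned long addr, unsigned long target_align,
                           unsigned long elem_size)
{
  /* misalign_in_bytes = addr & (target_align - 1).  */
  unsigned long misalign_in_bytes = addr & (target_align - 1);
  /* misalign_in_elems = misalign_in_bytes / elem_size.  */
  return misalign_in_bytes / elem_size;
}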
1593
6c6a3430 1594/* Function vect_gen_prolog_loop_niters
97fe80a6 1595
6c6a3430 1596 Generate the number of iterations which should be peeled as prolog for the
1597 loop represented by LOOP_VINFO. It is calculated as the misalignment of
1598 DR - the data reference recorded in LOOP_VINFO_UNALIGNED_DR (LOOP_VINFO).
1599 As a result, after the execution of this loop, the data reference DR will
1600 refer to an aligned location. The following computation is generated:
fb85abff 1601
1602 If the misalignment of DR is known at compile time:
1603 addr_mis = int mis = DR_MISALIGNMENT (dr);
1604 Else, compute address misalignment in bytes:
6753a4bf 1605 addr_mis = addr & (target_align - 1)
fb85abff 1606
6c6a3430 1607 prolog_niters = ((VF - addr_mis/elem_size)&(VF-1))/step
fb85abff 1608
1609 (elem_size = element type size; an element is the scalar element whose type
1610 is the inner type of the vectype)
1611
6c6a3430 1612 The computations will be emitted at the end of BB. We also compute and
82112bf2 1613   store the upper bound (inclusive) of the result in BOUND.
6c6a3430 1614
fb85abff 1615 When the step of the data-ref in the loop is not 1 (as in interleaved data
1616 and SLP), the number of iterations of the prolog must be divided by the step
1617   (which is equal to the size of the interleaved group).
1618
1619 The above formulas assume that VF == number of elements in the vector. This
1620 may not hold when there are multiple-types in the loop.
1621 In this case, for some data-references in the loop the VF does not represent
1622 the number of elements that fit in the vector. Therefore, instead of VF we
1623 use TYPE_VECTOR_SUBPARTS. */
1624
48e1416a 1625static tree
6c6a3430 1626vect_gen_prolog_loop_niters (loop_vec_info loop_vinfo,
1627 basic_block bb, int *bound)
fb85abff 1628{
1629 struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
fb85abff 1630 tree var;
6c6a3430 1631 tree niters_type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
1632 gimple_seq stmts = NULL, new_stmts = NULL;
fb85abff 1633 tree iters, iters_name;
1ce0a2db 1634 gimple *dr_stmt = vect_dr_stmt (dr);
fb85abff 1635 stmt_vec_info stmt_info = vinfo_for_stmt (dr_stmt);
1636 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
aec313e5 1637 unsigned int target_align = DR_TARGET_ALIGNMENT (dr);
fb85abff 1638
313a5120 1639 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
fb85abff 1640 {
313a5120 1641 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
fb85abff 1642
6d8fb6cf 1643 if (dump_enabled_p ())
b055bc88 1644 dump_printf_loc (MSG_NOTE, vect_location,
78bb46f5 1645 "known peeling = %d.\n", npeel);
fb85abff 1646
0822b158 1647 iters = build_int_cst (niters_type, npeel);
82112bf2 1648 *bound = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
fb85abff 1649 }
1650 else
1651 {
6753a4bf 1652 tree misalign_in_elems = get_misalign_in_elems (&stmts, loop_vinfo);
1653 tree type = TREE_TYPE (misalign_in_elems);
aec313e5 1654 HOST_WIDE_INT elem_size
1655 = int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
aec313e5 1656 HOST_WIDE_INT align_in_elems = target_align / elem_size;
1657 tree align_in_elems_minus_1 = build_int_cst (type, align_in_elems - 1);
1658 tree align_in_elems_tree = build_int_cst (type, align_in_elems);
aec313e5 1659
1660 /* Create: (niters_type) ((align_in_elems - misalign_in_elems)
1661 & (align_in_elems - 1)). */
6753a4bf 1662 bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
f1b8c740 1663 if (negative)
aec313e5 1664 iters = fold_build2 (MINUS_EXPR, type, misalign_in_elems,
1665 align_in_elems_tree);
f1b8c740 1666 else
aec313e5 1667 iters = fold_build2 (MINUS_EXPR, type, align_in_elems_tree,
1668 misalign_in_elems);
1669 iters = fold_build2 (BIT_AND_EXPR, type, iters, align_in_elems_minus_1);
fb85abff 1670 iters = fold_convert (niters_type, iters);
aec313e5 1671 *bound = align_in_elems - 1;
fb85abff 1672 }
1673
6d8fb6cf 1674 if (dump_enabled_p ())
fb85abff 1675 {
b055bc88 1676 dump_printf_loc (MSG_NOTE, vect_location,
7bd765d4 1677 "niters for prolog loop: ");
b055bc88 1678 dump_generic_expr (MSG_NOTE, TDF_SLIM, iters);
78bb46f5 1679 dump_printf (MSG_NOTE, "\n");
fb85abff 1680 }
1681
1682 var = create_tmp_var (niters_type, "prolog_loop_niters");
6c6a3430 1683 iters_name = force_gimple_operand (iters, &new_stmts, false, var);
fb85abff 1684
6c6a3430 1685 if (new_stmts)
1686 gimple_seq_add_seq (&stmts, new_stmts);
fb85abff 1687 if (stmts)
1688 {
6c6a3430 1689 gcc_assert (single_succ_p (bb));
1690 gimple_stmt_iterator gsi = gsi_last_bb (bb);
1691 if (gsi_end_p (gsi))
1692 gsi_insert_seq_before (&gsi, stmts, GSI_SAME_STMT);
1693 else
1694 gsi_insert_seq_after (&gsi, stmts, GSI_SAME_STMT);
fb85abff 1695 }
48e1416a 1696 return iters_name;
fb85abff 1697}
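
/* Illustrative sketch, not part of this file: the run-time prolog iteration
   count built in the else branch above.  ALIGN_IN_ELEMS is target_align /
   elem_size and is assumed to be a power of two; NEGATIVE corresponds to a
   data reference with a negative step.  The identifiers are hypothetical.  */

static unsigned long
example_prolog_niters (unsigned long misalign_in_elems,
                       unsigned long align_in_elems, int negative)
{
  unsigned long iters = negative
                        ? misalign_in_elems - align_in_elems
                        : align_in_elems - misalign_in_elems;
  /* Reduce modulo the alignment; the result is 0 when the access is
     already aligned.  */
  return iters & (align_in_elems - 1);
}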
1698
1699
1700/* Function vect_update_init_of_dr
1701
6753a4bf 1702 If CODE is PLUS, the vector loop starts NITERS iterations after the
1703 scalar one, otherwise CODE is MINUS and the vector loop starts NITERS
1704 iterations before the scalar one (using masking to skip inactive
1705 elements). This function updates the information recorded in DR to
1706 account for the difference. Specifically, it updates the OFFSET
1707 field of DR. */
fb85abff 1708
1709static void
6753a4bf 1710vect_update_init_of_dr (struct data_reference *dr, tree niters, tree_code code)
fb85abff 1711{
1712 tree offset = DR_OFFSET (dr);
48e1416a 1713
fb85abff 1714 niters = fold_build2 (MULT_EXPR, sizetype,
1715 fold_convert (sizetype, niters),
1716 fold_convert (sizetype, DR_STEP (dr)));
6753a4bf 1717 offset = fold_build2 (code, sizetype,
87f9ffa4 1718 fold_convert (sizetype, offset), niters);
fb85abff 1719 DR_OFFSET (dr) = offset;
1720}
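
/* Illustrative sketch, not part of this file: the offset adjustment
   performed above, on plain integers.  CODE_IS_PLUS stands for CODE being
   PLUS_EXPR; the identifiers are hypothetical.  */

static long
example_update_dr_offset (long offset, long niters, long step,
                          int code_is_plus)
{
  /* The data reference moves by NITERS * STEP units.  */
  long delta = niters * step;
  return code_is_plus ? offset + delta : offset - delta;
}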
1721
1722
1723/* Function vect_update_inits_of_drs
1724
6753a4bf 1725    Apply vect_update_init_of_dr to all accesses in LOOP_VINFO.
1726    CODE and NITERS are as for vect_update_init_of_dr.  */
fb85abff 1727
1728static void
6753a4bf 1729vect_update_inits_of_drs (loop_vec_info loop_vinfo, tree niters,
1730 tree_code code)
fb85abff 1731{
1732 unsigned int i;
f1f41a6c 1733 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
fb85abff 1734 struct data_reference *dr;
6c6a3430 1735
88f6eb8f 1736 DUMP_VECT_SCOPE ("vect_update_inits_of_dr");
6c6a3430 1737
1738 /* Adjust niters to sizetype and insert stmts on loop preheader edge. */
1739 if (!types_compatible_p (sizetype, TREE_TYPE (niters)))
1740 {
1741 gimple_seq seq;
1742 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1743 tree var = create_tmp_var (sizetype, "prolog_loop_adjusted_niters");
1744
1745 niters = fold_convert (sizetype, niters);
1746 niters = force_gimple_operand (niters, &seq, false, var);
1747 if (seq)
1748 {
1749 basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
1750 gcc_assert (!new_bb);
1751 }
1752 }
fb85abff 1753
f1f41a6c 1754 FOR_EACH_VEC_ELT (datarefs, i, dr)
fa681b45 1755 {
1756 gimple *stmt = DR_STMT (dr);
1757 if (!STMT_VINFO_GATHER_SCATTER_P (vinfo_for_stmt (stmt)))
1758 vect_update_init_of_dr (dr, niters, code);
1759 }
fb85abff 1760}
1761
6753a4bf 1762/* For the information recorded in LOOP_VINFO prepare the loop for peeling
1763 by masking. This involves calculating the number of iterations to
1764 be peeled and then aligning all memory references appropriately. */
1765
1766void
1767vect_prepare_for_masked_peels (loop_vec_info loop_vinfo)
1768{
1769 tree misalign_in_elems;
1770 tree type = LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo);
1771
1772 gcc_assert (vect_use_loop_mask_for_alignment_p (loop_vinfo));
1773
1774 /* From the information recorded in LOOP_VINFO get the number of iterations
1775 that need to be skipped via masking. */
1776 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
1777 {
1778 poly_int64 misalign = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
1779 - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo));
1780 misalign_in_elems = build_int_cst (type, misalign);
1781 }
1782 else
1783 {
1784 gimple_seq seq1 = NULL, seq2 = NULL;
1785 misalign_in_elems = get_misalign_in_elems (&seq1, loop_vinfo);
1786 misalign_in_elems = fold_convert (type, misalign_in_elems);
1787 misalign_in_elems = force_gimple_operand (misalign_in_elems,
1788 &seq2, true, NULL_TREE);
1789 gimple_seq_add_seq (&seq1, seq2);
1790 if (seq1)
1791 {
1792 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1793 basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq1);
1794 gcc_assert (!new_bb);
1795 }
1796 }
1797
1798 if (dump_enabled_p ())
1799 {
1800 dump_printf_loc (MSG_NOTE, vect_location,
1801 "misalignment for fully-masked loop: ");
1802 dump_generic_expr (MSG_NOTE, TDF_SLIM, misalign_in_elems);
1803 dump_printf (MSG_NOTE, "\n");
1804 }
1805
1806 LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo) = misalign_in_elems;
1807
1808 vect_update_inits_of_drs (loop_vinfo, misalign_in_elems, MINUS_EXPR);
1809}
fb85abff 1810
6c6a3430 1811/* This function builds ni_name = number of iterations. Statements
3a815241 1812 are emitted on the loop preheader edge. If NEW_VAR_P is not NULL, set
1813   it to TRUE if a new ssa_var is generated.  */
6c6a3430 1814
1815tree
3a815241 1816vect_build_loop_niters (loop_vec_info loop_vinfo, bool *new_var_p)
6c6a3430 1817{
1818 tree ni = unshare_expr (LOOP_VINFO_NITERS (loop_vinfo));
1819 if (TREE_CODE (ni) == INTEGER_CST)
1820 return ni;
1821 else
1822 {
1823 tree ni_name, var;
1824 gimple_seq stmts = NULL;
1825 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1826
1827 var = create_tmp_var (TREE_TYPE (ni), "niters");
1828 ni_name = force_gimple_operand (ni, &stmts, false, var);
1829 if (stmts)
3a815241 1830 {
1831 gsi_insert_seq_on_edge_immediate (pe, stmts);
1832 if (new_var_p != NULL)
1833 *new_var_p = true;
1834 }
6c6a3430 1835
1836 return ni_name;
1837 }
1838}
1839
82112bf2 1840/* Calculate the number of iterations above which the vectorized loop will be
1841   preferred over the scalar loop.  NITERS_PROLOG is the number of iterations
1842   of the prolog loop.  If it's an integer constant, the value is also passed
53771608 1843 in INT_NITERS_PROLOG. BOUND_PROLOG is the upper bound (inclusive) of the
1844 number of iterations of the prolog loop. BOUND_EPILOG is the corresponding
1845 value for the epilog loop. If CHECK_PROFITABILITY is true, TH is the
1846 threshold below which the scalar (rather than vectorized) loop will be
1847 executed. This function stores the upper bound (inclusive) of the result
1848 in BOUND_SCALAR. */
6c6a3430 1849
1850static tree
1851vect_gen_scalar_loop_niters (tree niters_prolog, int int_niters_prolog,
53771608 1852 int bound_prolog, poly_int64 bound_epilog, int th,
d75596cd 1853 poly_uint64 *bound_scalar,
1854 bool check_profitability)
6c6a3430 1855{
1856 tree type = TREE_TYPE (niters_prolog);
1857 tree niters = fold_build2 (PLUS_EXPR, type, niters_prolog,
53771608 1858 build_int_cst (type, bound_epilog));
6c6a3430 1859
53771608 1860 *bound_scalar = bound_prolog + bound_epilog;
6c6a3430 1861 if (check_profitability)
1862 {
82112bf2 1863 /* TH indicates the minimum niters of vectorized loop, while we
1864 compute the maximum niters of scalar loop. */
1865 th--;
6c6a3430 1866 /* Peeling for constant times. */
1867 if (int_niters_prolog >= 0)
1868 {
53771608 1869 *bound_scalar = upper_bound (int_niters_prolog + bound_epilog, th);
82112bf2 1870 return build_int_cst (type, *bound_scalar);
6c6a3430 1871 }
53771608 1872 /* Peeling an unknown number of times. Note that both BOUND_PROLOG
1873 and BOUND_EPILOG are inclusive upper bounds. */
1874 if (known_ge (th, bound_prolog + bound_epilog))
6c6a3430 1875 {
82112bf2 1876 *bound_scalar = th;
6c6a3430 1877 return build_int_cst (type, th);
1878 }
d75596cd 1879 /* Need to do runtime comparison. */
53771608 1880 else if (maybe_gt (th, bound_epilog))
d75596cd 1881 {
1882 *bound_scalar = upper_bound (*bound_scalar, th);
1883 return fold_build2 (MAX_EXPR, type,
1884 build_int_cst (type, th), niters);
1885 }
6c6a3430 1886 }
1887 return niters;
1888}
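
/* Illustrative sketch, not part of this file: the value produced above when
   the prolog peels a compile-time constant number of iterations and
   profitability checking is enabled.  TH is the minimum iteration count for
   which the vectorized loop is profitable; the identifiers are
   hypothetical.  */

static unsigned long
example_scalar_niters (unsigned long int_niters_prolog,
                       unsigned long bound_epilog, unsigned long th)
{
  /* TH - 1 is the maximum number of iterations the scalar loop should
     handle; the result is the larger of that and prolog + epilog.  */
  unsigned long niters = int_niters_prolog + bound_epilog;
  unsigned long max_scalar = th - 1;
  return niters > max_scalar ? niters : max_scalar;
}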
1889
cde959e7 1890/* NITERS is the number of times that the original scalar loop executes
1891 after peeling. Work out the maximum number of iterations N that can
1892 be handled by the vectorized form of the loop and then either:
fb85abff 1893
cde959e7 1894 a) set *STEP_VECTOR_PTR to the vectorization factor and generate:
97fe80a6 1895
cde959e7 1896 niters_vector = N
1897
1898 b) set *STEP_VECTOR_PTR to one and generate:
1899
1900 niters_vector = N / vf
1901
1902 In both cases, store niters_vector in *NITERS_VECTOR_PTR and add
1903 any new statements on the loop preheader edge. NITERS_NO_OVERFLOW
1904 is true if NITERS doesn't overflow (i.e. if NITERS is always nonzero). */
fb85abff 1905
1906void
6c6a3430 1907vect_gen_vector_loop_niters (loop_vec_info loop_vinfo, tree niters,
cde959e7 1908 tree *niters_vector_ptr, tree *step_vector_ptr,
1909 bool niters_no_overflow)
fb85abff 1910{
6c6a3430 1911 tree ni_minus_gap, var;
cde959e7 1912 tree niters_vector, step_vector, type = TREE_TYPE (niters);
d75596cd 1913 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
6c6a3430 1914 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
cde959e7 1915 tree log_vf = NULL_TREE;
6c6a3430 1916
1917 /* If epilogue loop is required because of data accesses with gaps, we
1918 subtract one iteration from the total number of iterations here for
1919 correct calculation of RATIO. */
1920 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
1921 {
3a815241 1922 ni_minus_gap = fold_build2 (MINUS_EXPR, type, niters,
1923 build_one_cst (type));
6c6a3430 1924 if (!is_gimple_val (ni_minus_gap))
1925 {
3a815241 1926 var = create_tmp_var (type, "ni_gap");
6c6a3430 1927 gimple *stmts = NULL;
1928 ni_minus_gap = force_gimple_operand (ni_minus_gap, &stmts,
1929 true, var);
1930 gsi_insert_seq_on_edge_immediate (pe, stmts);
1931 }
1932 }
1933 else
1934 ni_minus_gap = niters;
1935
d75596cd 1936 unsigned HOST_WIDE_INT const_vf;
60b29a7e 1937 if (vf.is_constant (&const_vf)
1938 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
cde959e7 1939 {
1940 /* Create: niters >> log2(vf) */
1941 /* If it's known that niters == number of latch executions + 1 doesn't
1942 overflow, we can generate niters >> log2(vf); otherwise we generate
1943 (niters - vf) >> log2(vf) + 1 by using the fact that we know ratio
1944 will be at least one. */
d75596cd 1945 log_vf = build_int_cst (type, exact_log2 (const_vf));
cde959e7 1946 if (niters_no_overflow)
1947 niters_vector = fold_build2 (RSHIFT_EXPR, type, ni_minus_gap, log_vf);
1948 else
1949 niters_vector
1950 = fold_build2 (PLUS_EXPR, type,
1951 fold_build2 (RSHIFT_EXPR, type,
1952 fold_build2 (MINUS_EXPR, type,
1953 ni_minus_gap,
1954 build_int_cst (type, vf)),
1955 log_vf),
1956 build_int_cst (type, 1));
1957 step_vector = build_one_cst (type);
1958 }
6c6a3430 1959 else
cde959e7 1960 {
1961 niters_vector = ni_minus_gap;
1962 step_vector = build_int_cst (type, vf);
1963 }
6c6a3430 1964
1965 if (!is_gimple_val (niters_vector))
1966 {
3a815241 1967 var = create_tmp_var (type, "bnd");
1968 gimple_seq stmts = NULL;
6c6a3430 1969 niters_vector = force_gimple_operand (niters_vector, &stmts, true, var);
1970 gsi_insert_seq_on_edge_immediate (pe, stmts);
3a815241 1971 /* Peeling algorithm guarantees that vector loop bound is at least ONE,
1972 we set range information to make niters analyzer's life easier. */
cde959e7 1973 if (stmts != NULL && log_vf)
e3d0f65c 1974 set_range_info (niters_vector, VR_RANGE,
1975 wi::to_wide (build_int_cst (type, 1)),
1976 wi::to_wide (fold_build2 (RSHIFT_EXPR, type,
1977 TYPE_MAX_VALUE (type),
1978 log_vf)));
6c6a3430 1979 }
1980 *niters_vector_ptr = niters_vector;
cde959e7 1981 *step_vector_ptr = step_vector;
6c6a3430 1982
1983 return;
1984}
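
/* Illustrative sketch, not part of this file: the two expressions built
   above for a compile-time constant VF = 1 << LOG_VF and a step of one.
   The identifiers are hypothetical.  */

static unsigned long
example_vector_niters (unsigned long niters, unsigned int log_vf,
                       int niters_no_overflow)
{
  if (niters_no_overflow)
    /* niters >> log2 (vf).  */
    return niters >> log_vf;
  /* NITERS may wrap to 0; use the fact that the result is at least 1:
     ((niters - vf) >> log2 (vf)) + 1.  */
  return ((niters - ((unsigned long) 1 << log_vf)) >> log_vf) + 1;
}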
1985
1986/* Given NITERS_VECTOR which is the number of iterations for vectorized
1987 loop specified by LOOP_VINFO after vectorization, compute the number
1988 of iterations before vectorization (niters_vector * vf) and store it
1989 to NITERS_VECTOR_MULT_VF_PTR. */
1990
1991static void
1992vect_gen_vector_loop_niters_mult_vf (loop_vec_info loop_vinfo,
1993 tree niters_vector,
1994 tree *niters_vector_mult_vf_ptr)
1995{
d75596cd 1996 /* We should be using a step_vector of VF if VF is variable. */
1997 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo).to_constant ();
fb85abff 1998 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6c6a3430 1999 tree type = TREE_TYPE (niters_vector);
2000 tree log_vf = build_int_cst (type, exact_log2 (vf));
2001 basic_block exit_bb = single_exit (loop)->dest;
fb85abff 2002
6c6a3430 2003 gcc_assert (niters_vector_mult_vf_ptr != NULL);
2004 tree niters_vector_mult_vf = fold_build2 (LSHIFT_EXPR, type,
2005 niters_vector, log_vf);
2006 if (!is_gimple_val (niters_vector_mult_vf))
2007 {
2008 tree var = create_tmp_var (type, "niters_vector_mult_vf");
2009 gimple_seq stmts = NULL;
2010 niters_vector_mult_vf = force_gimple_operand (niters_vector_mult_vf,
2011 &stmts, true, var);
2012 gimple_stmt_iterator gsi = gsi_start_bb (exit_bb);
2013 gsi_insert_seq_before (&gsi, stmts, GSI_SAME_STMT);
2014 }
2015 *niters_vector_mult_vf_ptr = niters_vector_mult_vf;
2016}
2017
2018/* Function slpeel_tree_duplicate_loop_to_edge_cfg duplicates FIRST/SECOND
2019   from SECOND/FIRST and puts it at the original loop's preheader/exit
2020   edge; the two loops are arranged as below:
2021
2022 preheader_a:
2023 first_loop:
2024 header_a:
2025 i_1 = PHI<i_0, i_2>;
2026 ...
2027 i_2 = i_1 + 1;
2028 if (cond_a)
2029 goto latch_a;
2030 else
2031 goto between_bb;
2032 latch_a:
2033 goto header_a;
2034
2035 between_bb:
2036 ;; i_x = PHI<i_2>; ;; LCSSA phi node to be created for FIRST,
2037
2038 second_loop:
2039 header_b:
2040 i_3 = PHI<i_0, i_4>; ;; Use of i_0 to be replaced with i_x,
2041 or with i_2 if no LCSSA phi is created
2042 under condition of CREATE_LCSSA_FOR_IV_PHIS.
2043 ...
2044 i_4 = i_3 + 1;
2045 if (cond_b)
2046 goto latch_b;
2047 else
2048 goto exit_bb;
2049 latch_b:
2050 goto header_b;
2051
2052 exit_bb:
2053
2054   This function creates loop-closed SSA for the first loop; it updates the
2055   second loop's PHI nodes by replacing the argument on the incoming edge with
2056   the result of the newly created lcssa PHI nodes.  If CREATE_LCSSA_FOR_IV_PHIS
2057   is false, loop-closed ssa phis will only be created for non-iv phis for
2058   the first loop.
2059
2060   This function assumes the exit bb of the first loop is the preheader bb of
2061   the second loop, i.e., between_bb in the example code.  With PHIs updated,
2062   the second loop will execute the remaining iterations of the first.  */
fb85abff 2063
6c6a3430 2064static void
2065slpeel_update_phi_nodes_for_loops (loop_vec_info loop_vinfo,
2066 struct loop *first, struct loop *second,
2067 bool create_lcssa_for_iv_phis)
2068{
2069 gphi_iterator gsi_update, gsi_orig;
2070 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2071
2072 edge first_latch_e = EDGE_SUCC (first->latch, 0);
2073 edge second_preheader_e = loop_preheader_edge (second);
2074 basic_block between_bb = single_exit (first)->dest;
2075
2076 gcc_assert (between_bb == second_preheader_e->src);
2077 gcc_assert (single_pred_p (between_bb) && single_succ_p (between_bb));
2078 /* Either the first loop or the second is the loop to be vectorized. */
2079 gcc_assert (loop == first || loop == second);
2080
2081 for (gsi_orig = gsi_start_phis (first->header),
2082 gsi_update = gsi_start_phis (second->header);
2083 !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_update);
2084 gsi_next (&gsi_orig), gsi_next (&gsi_update))
2085 {
2086 gphi *orig_phi = gsi_orig.phi ();
2087 gphi *update_phi = gsi_update.phi ();
2088
2089 tree arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, first_latch_e);
2090 /* Generate lcssa PHI node for the first loop. */
2091 gphi *vect_phi = (loop == first) ? orig_phi : update_phi;
2092 if (create_lcssa_for_iv_phis || !iv_phi_p (vect_phi))
2093 {
2094 tree new_res = copy_ssa_name (PHI_RESULT (orig_phi));
2095 gphi *lcssa_phi = create_phi_node (new_res, between_bb);
2096 add_phi_arg (lcssa_phi, arg, single_exit (first), UNKNOWN_LOCATION);
2097 arg = new_res;
2098 }
2099
2100 /* Update PHI node in the second loop by replacing arg on the loop's
2101 incoming edge. */
2102 adjust_phi_and_debug_stmts (update_phi, second_preheader_e, arg);
2103 }
2104}
2105
2106/* Function slpeel_add_loop_guard adds a guard that skips from the beginning
2107   of SKIP_LOOP to the beginning of UPDATE_LOOP.  GUARD_EDGE and MERGE_EDGE
2108   are the two pred edges of the merge point before UPDATE_LOOP.  The two loops
2109 appear like below:
2110
2111 guard_bb:
2112 if (cond)
2113 goto merge_bb;
2114 else
2115 goto skip_loop;
2116
2117 skip_loop:
2118 header_a:
2119 i_1 = PHI<i_0, i_2>;
2120 ...
2121 i_2 = i_1 + 1;
2122 if (cond_a)
2123 goto latch_a;
2124 else
2125 goto exit_a;
2126 latch_a:
2127 goto header_a;
2128
2129 exit_a:
2130 i_5 = PHI<i_2>;
2131
2132 merge_bb:
2133 ;; PHI (i_x = PHI<i_0, i_5>) to be created at merge point.
2134
2135 update_loop:
2136 header_b:
2137 i_3 = PHI<i_5, i_4>; ;; Use of i_5 to be replaced with i_x.
2138 ...
2139 i_4 = i_3 + 1;
2140 if (cond_b)
2141 goto latch_b;
2142 else
2143 goto exit_bb;
2144 latch_b:
2145 goto header_b;
2146
2147 exit_bb:
2148
2149   This function creates PHI nodes at merge_bb and replaces the use of i_5
2150   in the update_loop's PHI node with the result of the new PHI node.  */
2151
2152static void
2153slpeel_update_phi_nodes_for_guard1 (struct loop *skip_loop,
2154 struct loop *update_loop,
2155 edge guard_edge, edge merge_edge)
2156{
2157 source_location merge_loc, guard_loc;
2158 edge orig_e = loop_preheader_edge (skip_loop);
2159 edge update_e = loop_preheader_edge (update_loop);
2160 gphi_iterator gsi_orig, gsi_update;
2161
2162 for ((gsi_orig = gsi_start_phis (skip_loop->header),
2163 gsi_update = gsi_start_phis (update_loop->header));
2164 !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_update);
2165 gsi_next (&gsi_orig), gsi_next (&gsi_update))
2166 {
2167 gphi *orig_phi = gsi_orig.phi ();
2168 gphi *update_phi = gsi_update.phi ();
2169
2170 /* Generate new phi node at merge bb of the guard. */
2171 tree new_res = copy_ssa_name (PHI_RESULT (orig_phi));
2172 gphi *new_phi = create_phi_node (new_res, guard_edge->dest);
2173
2174 /* Merge bb has two incoming edges: GUARD_EDGE and MERGE_EDGE. Set the
2175 args in NEW_PHI for these edges. */
2176 tree merge_arg = PHI_ARG_DEF_FROM_EDGE (update_phi, update_e);
2177 tree guard_arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, orig_e);
2178 merge_loc = gimple_phi_arg_location_from_edge (update_phi, update_e);
2179 guard_loc = gimple_phi_arg_location_from_edge (orig_phi, orig_e);
2180 add_phi_arg (new_phi, merge_arg, merge_edge, merge_loc);
2181 add_phi_arg (new_phi, guard_arg, guard_edge, guard_loc);
2182
2183 /* Update phi in UPDATE_PHI. */
2184 adjust_phi_and_debug_stmts (update_phi, update_e, new_res);
2185 }
2186}
2187
2188/* LCSSA_PHI is an lcssa phi of the EPILOG loop, which is copied from LOOP;
2189   this function searches for the corresponding lcssa phi node in the exit
2190   bb of LOOP.  If it is found, return the phi result; otherwise return
2191   NULL.  */
2192
2193static tree
2194find_guard_arg (struct loop *loop, struct loop *epilog ATTRIBUTE_UNUSED,
2195 gphi *lcssa_phi)
2196{
2197 gphi_iterator gsi;
2198 edge e = single_exit (loop);
2199
2200 gcc_assert (single_pred_p (e->dest));
2201 for (gsi = gsi_start_phis (e->dest); !gsi_end_p (gsi); gsi_next (&gsi))
2202 {
2203 gphi *phi = gsi.phi ();
2204 if (operand_equal_p (PHI_ARG_DEF (phi, 0),
2205 PHI_ARG_DEF (lcssa_phi, 0), 0))
2206 return PHI_RESULT (phi);
2207 }
2208 return NULL_TREE;
2209}
2210
2211/* LOOP and EPILOG are two consecutive loops in CFG and EPILOG is copied
2212   from LOOP.  Function slpeel_add_loop_guard adds a guard that skips from a
2213 point between the two loops to the end of EPILOG. Edges GUARD_EDGE
2214 and MERGE_EDGE are the two pred edges of merge_bb at the end of EPILOG.
2215 The CFG looks like:
2216
2217 loop:
2218 header_a:
2219 i_1 = PHI<i_0, i_2>;
2220 ...
2221 i_2 = i_1 + 1;
2222 if (cond_a)
2223 goto latch_a;
2224 else
2225 goto exit_a;
2226 latch_a:
2227 goto header_a;
2228
2229 exit_a:
2230
2231 guard_bb:
2232 if (cond)
2233 goto merge_bb;
2234 else
2235 goto epilog_loop;
2236
2237 ;; fall_through_bb
2238
2239 epilog_loop:
2240 header_b:
2241 i_3 = PHI<i_2, i_4>;
2242 ...
2243 i_4 = i_3 + 1;
2244 if (cond_b)
2245 goto latch_b;
2246 else
2247 goto merge_bb;
2248 latch_b:
2249 goto header_b;
2250
2251 merge_bb:
2252 ; PHI node (i_y = PHI<i_2, i_4>) to be created at merge point.
2253
2254 exit_bb:
2255 i_x = PHI<i_4>; ;Use of i_4 to be replaced with i_y in merge_bb.
2256
2257   For each name used outside EPILOG (i.e. for each name that has an lcssa
2258   phi in exit_bb) we create a new PHI in merge_bb.  The new PHI has two
2259   args corresponding to GUARD_EDGE and MERGE_EDGE.  The arg for MERGE_EDGE is
2260   the arg of the original PHI in exit_bb, while the arg for GUARD_EDGE is
2261   defined by LOOP and is found in the exit bb of LOOP.  The arg of the
2262   original PHI in exit_bb is also updated.  */
2263
2264static void
2265slpeel_update_phi_nodes_for_guard2 (struct loop *loop, struct loop *epilog,
2266 edge guard_edge, edge merge_edge)
2267{
2268 gphi_iterator gsi;
2269 basic_block merge_bb = guard_edge->dest;
2270
2271 gcc_assert (single_succ_p (merge_bb));
2272 edge e = single_succ_edge (merge_bb);
2273 basic_block exit_bb = e->dest;
2274 gcc_assert (single_pred_p (exit_bb));
2275 gcc_assert (single_pred (exit_bb) == single_exit (epilog)->dest);
2276
2277 for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi); gsi_next (&gsi))
2278 {
2279 gphi *update_phi = gsi.phi ();
2280 tree old_arg = PHI_ARG_DEF (update_phi, 0);
2281 /* This loop-closed-phi actually doesn't represent a use out of the
2282 loop - the phi arg is a constant. */
2283 if (TREE_CODE (old_arg) != SSA_NAME)
2284 continue;
2285
2286 tree merge_arg = get_current_def (old_arg);
2287 if (!merge_arg)
2288 merge_arg = old_arg;
2289
2290 tree guard_arg = find_guard_arg (loop, epilog, update_phi);
2291 /* If the var is live after loop but not a reduction, we simply
2292 use the old arg. */
2293 if (!guard_arg)
2294 guard_arg = old_arg;
2295
2296 /* Create new phi node in MERGE_BB: */
2297 tree new_res = copy_ssa_name (PHI_RESULT (update_phi));
2298 gphi *merge_phi = create_phi_node (new_res, merge_bb);
2299
2300 /* MERGE_BB has two incoming edges: GUARD_EDGE and MERGE_EDGE, Set
2301 the two PHI args in merge_phi for these edges. */
2302 add_phi_arg (merge_phi, merge_arg, merge_edge, UNKNOWN_LOCATION);
2303 add_phi_arg (merge_phi, guard_arg, guard_edge, UNKNOWN_LOCATION);
2304
2305 /* Update the original phi in exit_bb. */
2306 adjust_phi_and_debug_stmts (update_phi, e, new_res);
2307 }
2308}
2309
2310/* The EPILOG loop is duplicated from the original loop being vectorized;
2311   the arg of its loop-closed ssa PHI needs to be updated.  */
2312
2313static void
2314slpeel_update_phi_nodes_for_lcssa (struct loop *epilog)
2315{
2316 gphi_iterator gsi;
2317 basic_block exit_bb = single_exit (epilog)->dest;
2318
2319 gcc_assert (single_pred_p (exit_bb));
2320 edge e = EDGE_PRED (exit_bb, 0);
2321 for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi); gsi_next (&gsi))
2322 rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (gsi.phi (), e));
2323}
2324
2325/* Function vect_do_peeling.
2326
2327 Input:
2328 - LOOP_VINFO: Represent a loop to be vectorized, which looks like:
2329
2330 preheader:
2331 LOOP:
2332 header_bb:
2333 loop_body
2334 if (exit_loop_cond) goto exit_bb
2335 else goto header_bb
2336 exit_bb:
2337
2338 - NITERS: The number of iterations of the loop.
2339 - NITERSM1: The number of iterations of the loop's latch.
2340 - NITERS_NO_OVERFLOW: No overflow in computing NITERS.
2341 - TH, CHECK_PROFITABILITY: Threshold of niters to vectorize loop if
2342 CHECK_PROFITABILITY is true.
2343 Output:
cde959e7 2344 - *NITERS_VECTOR and *STEP_VECTOR describe how the main loop should
60b29a7e 2345 iterate after vectorization; see vect_set_loop_condition for details.
cde959e7 2346 - *NITERS_VECTOR_MULT_VF_VAR is either null or an SSA name that
2347 should be set to the number of scalar iterations handled by the
2348 vector loop. The SSA name is only used on exit from the loop.
6c6a3430 2349
2350 This function peels prolog and epilog from the loop, adds guards skipping
2351 PROLOG and EPILOG for various conditions. As a result, the changed CFG
2352 would look like:
2353
2354 guard_bb_1:
2355 if (prefer_scalar_loop) goto merge_bb_1
2356 else goto guard_bb_2
2357
2358 guard_bb_2:
2359 if (skip_prolog) goto merge_bb_2
2360 else goto prolog_preheader
2361
2362 prolog_preheader:
2363 PROLOG:
2364 prolog_header_bb:
2365 prolog_body
2366 if (exit_prolog_cond) goto prolog_exit_bb
2367 else goto prolog_header_bb
2368 prolog_exit_bb:
2369
2370 merge_bb_2:
2371
2372 vector_preheader:
2373 VECTOR LOOP:
2374 vector_header_bb:
2375 vector_body
2376 if (exit_vector_cond) goto vector_exit_bb
2377 else goto vector_header_bb
2378 vector_exit_bb:
2379
2380 guard_bb_3:
2381 if (skip_epilog) goto merge_bb_3
2382 else goto epilog_preheader
2383
2384 merge_bb_1:
2385
2386 epilog_preheader:
2387 EPILOG:
2388 epilog_header_bb:
2389 epilog_body
2390 if (exit_epilog_cond) goto merge_bb_3
2391 else goto epilog_header_bb
2392
2393 merge_bb_3:
2394
2395   Note this function peels the prolog and epilog only when necessary,
2396   and likewise adds the guards only when they are needed.
5b631e09 2397 Returns created epilogue or NULL.
6c6a3430 2398
2399 TODO: Guard for prefer_scalar_loop should be emitted along with
2400 versioning conditions if loop versioning is needed. */
2401
5b631e09 2402
2403struct loop *
6c6a3430 2404vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
cde959e7 2405 tree *niters_vector, tree *step_vector,
2406 tree *niters_vector_mult_vf_var, int th,
2407 bool check_profitability, bool niters_no_overflow)
6c6a3430 2408{
2409 edge e, guard_e;
2410 tree type = TREE_TYPE (niters), guard_cond;
2411 basic_block guard_bb, guard_to;
720cfc43 2412 profile_probability prob_prolog, prob_vector, prob_epilog;
d75596cd 2413 int estimated_vf;
6753a4bf 2414 int prolog_peeling = 0;
2415 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2416 prolog_peeling = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
53771608 2417
2418 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2419 poly_uint64 bound_epilog = 0;
2420 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
2421 && LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2422 bound_epilog += vf - 1;
2423 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2424 bound_epilog += 1;
2425 bool epilog_peeling = maybe_ne (bound_epilog, 0U);
2426 poly_uint64 bound_scalar = bound_epilog;
6c6a3430 2427
2428 if (!prolog_peeling && !epilog_peeling)
5b631e09 2429 return NULL;
6c6a3430 2430
720cfc43 2431 prob_vector = profile_probability::guessed_always ().apply_scale (9, 10);
d75596cd 2432 estimated_vf = vect_vf_for_cost (loop_vinfo);
2433 if (estimated_vf == 2)
2434 estimated_vf = 3;
720cfc43 2435 prob_prolog = prob_epilog = profile_probability::guessed_always ()
d75596cd 2436 .apply_scale (estimated_vf - 1, estimated_vf);
6c6a3430 2437
5b631e09 2438 struct loop *prolog, *epilog = NULL, *loop = LOOP_VINFO_LOOP (loop_vinfo);
6c6a3430 2439 struct loop *first_loop = loop;
15492f79 2440 bool irred_flag = loop_preheader_edge (loop)->flags & EDGE_IRREDUCIBLE_LOOP;
6c6a3430 2441 create_lcssa_for_virtual_phi (loop);
2442 update_ssa (TODO_update_ssa_only_virtuals);
2443
c64f38bf 2444 if (MAY_HAVE_DEBUG_BIND_STMTS)
6c6a3430 2445 {
2446 gcc_assert (!adjust_vec.exists ());
2447 adjust_vec.create (32);
2448 }
fb85abff 2449 initialize_original_copy_tables ();
2450
53771608 2451 /* Record the anchor bb at which the guard should be placed if the scalar
2452 loop might be preferred. */
2453 basic_block anchor = loop_preheader_edge (loop)->src;
2454
2455 /* Generate the number of iterations for the prolog loop. We do this here
2456 so that we can also get the upper bound on the number of iterations. */
2457 tree niters_prolog;
2458 int bound_prolog = 0;
2459 if (prolog_peeling)
2460 niters_prolog = vect_gen_prolog_loop_niters (loop_vinfo, anchor,
2461 &bound_prolog);
2462 else
2463 niters_prolog = build_int_cst (type, 0);
2464
6c6a3430 2465 /* Prolog loop may be skipped. */
2466 bool skip_prolog = (prolog_peeling != 0);
32236f80 2467 /* Skip to epilog if scalar loop may be preferred. It's only needed
2468 when we peel for epilog loop and when it hasn't been checked with
2469 loop versioning. */
53771608 2470 bool skip_vector = (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2471 ? maybe_lt (LOOP_VINFO_INT_NITERS (loop_vinfo),
2472 bound_prolog + bound_epilog)
2473 : !LOOP_REQUIRES_VERSIONING (loop_vinfo));
6c6a3430 2474 /* Epilog loop must be executed if the number of iterations for epilog
2475 loop is known at compile time, otherwise we need to add a check at
2476 the end of vector loop and skip to the end of epilog loop. */
2477 bool skip_epilog = (prolog_peeling < 0
d75596cd 2478 || !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2479 || !vf.is_constant ());
6c6a3430 2480 /* PEELING_FOR_GAPS is special because epilog loop must be executed. */
2481 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2482 skip_epilog = false;
2483
6c6a3430 2484 if (skip_vector)
c214c858 2485 {
2486 split_edge (loop_preheader_edge (loop));
2487
2488 /* Due to the order in which we peel prolog and epilog, we first
2489 propagate probability to the whole loop. The purpose is to
2490 avoid adjusting probabilities of both prolog and vector loops
2491 separately. Note in this case, the probability of epilog loop
2492 needs to be scaled back later. */
2493 basic_block bb_before_loop = loop_preheader_edge (loop)->src;
720cfc43 2494 if (prob_vector.initialized_p ())
d75596cd 2495 {
2496 scale_bbs_frequencies (&bb_before_loop, 1, prob_vector);
2497 scale_loop_profile (loop, prob_vector, 0);
2498 }
c214c858 2499 }
6c6a3430 2500
c309657f 2501 dump_user_location_t loop_loc = find_loop_location (loop);
6c6a3430 2502 struct loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
2503 if (prolog_peeling)
2f630015 2504 {
6c6a3430 2505 e = loop_preheader_edge (loop);
2506 if (!slpeel_can_duplicate_loop_p (loop, e))
2f630015 2507 {
6c6a3430 2508 dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2509 "loop can't be duplicated to preheader edge.\n");
2510 gcc_unreachable ();
2511 }
2512 /* Peel prolog and put it on preheader edge of loop. */
2513 prolog = slpeel_tree_duplicate_loop_to_edge_cfg (loop, scalar_loop, e);
2514 if (!prolog)
2515 {
2516 dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2517 "slpeel_tree_duplicate_loop_to_edge_cfg failed.\n");
2518 gcc_unreachable ();
2519 }
2520 slpeel_update_phi_nodes_for_loops (loop_vinfo, prolog, loop, true);
2521 first_loop = prolog;
2522 reset_original_copy_tables ();
2523
53771608 2524 /* Update the number of iterations for prolog loop. */
cde959e7 2525 tree step_prolog = build_one_cst (TREE_TYPE (niters_prolog));
60b29a7e 2526 vect_set_loop_condition (prolog, NULL, niters_prolog,
2527 step_prolog, NULL_TREE, false);
6c6a3430 2528
2529 /* Skip the prolog loop. */
2530 if (skip_prolog)
2531 {
2532 guard_cond = fold_build2 (EQ_EXPR, boolean_type_node,
2533 niters_prolog, build_int_cst (type, 0));
2534 guard_bb = loop_preheader_edge (prolog)->src;
c214c858 2535 basic_block bb_after_prolog = loop_preheader_edge (loop)->src;
6c6a3430 2536 guard_to = split_edge (loop_preheader_edge (loop));
2537 guard_e = slpeel_add_loop_guard (guard_bb, guard_cond,
2538 guard_to, guard_bb,
720cfc43 2539 prob_prolog.invert (),
15492f79 2540 irred_flag);
6c6a3430 2541 e = EDGE_PRED (guard_to, 0);
2542 e = (e != guard_e ? e : EDGE_PRED (guard_to, 1));
2543 slpeel_update_phi_nodes_for_guard1 (prolog, loop, guard_e, e);
c214c858 2544
ca69b069 2545 scale_bbs_frequencies (&bb_after_prolog, 1, prob_prolog);
2546 scale_loop_profile (prolog, prob_prolog, bound_prolog);
6c6a3430 2547 }
2548 /* Update init address of DRs. */
6753a4bf 2549 vect_update_inits_of_drs (loop_vinfo, niters_prolog, PLUS_EXPR);
6c6a3430 2550 /* Update niters for vector loop. */
2551 LOOP_VINFO_NITERS (loop_vinfo)
2552 = fold_build2 (MINUS_EXPR, type, niters, niters_prolog);
2553 LOOP_VINFO_NITERSM1 (loop_vinfo)
2554 = fold_build2 (MINUS_EXPR, type,
2555 LOOP_VINFO_NITERSM1 (loop_vinfo), niters_prolog);
3a815241 2556 bool new_var_p = false;
2557 niters = vect_build_loop_niters (loop_vinfo, &new_var_p);
2558 /* It's guaranteed that vector loop bound before vectorization is at
2559 least VF, so set range information for newly generated var. */
2560 if (new_var_p)
2561 set_range_info (niters, VR_RANGE,
e3d0f65c 2562 wi::to_wide (build_int_cst (type, vf)),
2563 wi::to_wide (TYPE_MAX_VALUE (type)));
6c6a3430 2564
82112bf2 2565 /* Prolog iterates at most bound_prolog times, latch iterates at
2566 most bound_prolog - 1 times. */
2567 record_niter_bound (prolog, bound_prolog - 1, false, true);
6c6a3430 2568 delete_update_ssa ();
2569 adjust_vec_debug_stmts ();
2570 scev_reset ();
2f630015 2571 }
2572
6c6a3430 2573 if (epilog_peeling)
2574 {
2575 e = single_exit (loop);
2576 if (!slpeel_can_duplicate_loop_p (loop, e))
2577 {
2578 dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2579 "loop can't be duplicated to exit edge.\n");
2580 gcc_unreachable ();
2581 }
2582 /* Peel epilog and put it on exit edge of loop. */
2583 epilog = slpeel_tree_duplicate_loop_to_edge_cfg (loop, scalar_loop, e);
2584 if (!epilog)
2585 {
2586 dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2587 "slpeel_tree_duplicate_loop_to_edge_cfg failed.\n");
2588 gcc_unreachable ();
2589 }
2590 slpeel_update_phi_nodes_for_loops (loop_vinfo, loop, epilog, false);
2591
2592      /* The scalar version of the loop may be preferred.  In this case, add
2593         a guard and skip to the epilog.  Note this only happens when the number of
2594 iterations of loop is unknown at compile time, otherwise this
2595 won't be vectorized. */
2596 if (skip_vector)
2597 {
82112bf2 2598 /* Additional epilogue iteration is peeled if gap exists. */
6c6a3430 2599 tree t = vect_gen_scalar_loop_niters (niters_prolog, prolog_peeling,
53771608 2600 bound_prolog, bound_epilog,
82112bf2 2601 th, &bound_scalar,
6c6a3430 2602 check_profitability);
82112bf2 2603 /* Build guard against NITERSM1 since NITERS may overflow. */
2604 guard_cond = fold_build2 (LT_EXPR, boolean_type_node, nitersm1, t);
6c6a3430 2605 guard_bb = anchor;
2606 guard_to = split_edge (loop_preheader_edge (epilog));
2607 guard_e = slpeel_add_loop_guard (guard_bb, guard_cond,
2608 guard_to, guard_bb,
720cfc43 2609 prob_vector.invert (),
15492f79 2610 irred_flag);
6c6a3430 2611 e = EDGE_PRED (guard_to, 0);
2612 e = (e != guard_e ? e : EDGE_PRED (guard_to, 1));
2613 slpeel_update_phi_nodes_for_guard1 (first_loop, epilog, guard_e, e);
c214c858 2614
2615 /* Simply propagate profile info from guard_bb to guard_to which is
2616 a merge point of control flow. */
c214c858 2617 guard_to->count = guard_bb->count;
dcc86e2d 2618
ca69b069 2619 /* Scale probability of epilog loop back.
2620 FIXME: We should avoid scaling down and back up. Profile may
2621 get lost if we scale down to 0. */
12b55cc8 2622 basic_block *bbs = get_loop_body (epilog);
dcc86e2d 2623 for (unsigned int i = 0; i < epilog->num_nodes; i++)
2624 bbs[i]->count = bbs[i]->count.apply_scale
2625 (bbs[i]->count,
2626 bbs[i]->count.apply_probability
2627 (prob_vector));
ca69b069 2628 free (bbs);
6c6a3430 2629 }
fb85abff 2630
c214c858 2631 basic_block bb_before_epilog = loop_preheader_edge (epilog)->src;
6c6a3430 2632 tree niters_vector_mult_vf;
2633 /* If loop is peeled for non-zero constant times, now niters refers to
2634 orig_niters - prolog_peeling, it won't overflow even the orig_niters
2635 overflows. */
2636 niters_no_overflow |= (prolog_peeling > 0);
2637 vect_gen_vector_loop_niters (loop_vinfo, niters,
cde959e7 2638 niters_vector, step_vector,
2639 niters_no_overflow);
2640 if (!integer_onep (*step_vector))
2641 {
2642      /* On exit from the loop we will have an easy way of calculating
2643 NITERS_VECTOR / STEP * STEP. Install a dummy definition
2644 until then. */
2645 niters_vector_mult_vf = make_ssa_name (TREE_TYPE (*niters_vector));
2646 SSA_NAME_DEF_STMT (niters_vector_mult_vf) = gimple_build_nop ();
2647 *niters_vector_mult_vf_var = niters_vector_mult_vf;
2648 }
2649 else
2650 vect_gen_vector_loop_niters_mult_vf (loop_vinfo, *niters_vector,
2651 &niters_vector_mult_vf);
6c6a3430 2652 /* Update IVs of original loop as if they were advanced by
2653 niters_vector_mult_vf steps. */
2654 gcc_checking_assert (vect_can_advance_ivs_p (loop_vinfo));
2655 edge update_e = skip_vector ? e : loop_preheader_edge (epilog);
2656 vect_update_ivs_after_vectorizer (loop_vinfo, niters_vector_mult_vf,
2657 update_e);
2658
2659 if (skip_epilog)
2660 {
2661 guard_cond = fold_build2 (EQ_EXPR, boolean_type_node,
2662 niters, niters_vector_mult_vf);
2663 guard_bb = single_exit (loop)->dest;
2664 guard_to = split_edge (single_exit (epilog));
2665 guard_e = slpeel_add_loop_guard (guard_bb, guard_cond, guard_to,
2666 skip_vector ? anchor : guard_bb,
720cfc43 2667 prob_epilog.invert (),
15492f79 2668 irred_flag);
6c6a3430 2669 slpeel_update_phi_nodes_for_guard2 (loop, epilog, guard_e,
2670 single_exit (epilog));
c214c858 2671 /* Only need to handle basic block before epilog loop if it's not
2672 the guard_bb, which is the case when skip_vector is true. */
2673 if (guard_bb != bb_before_epilog)
2674 {
720cfc43 2675 prob_epilog = prob_vector * prob_epilog + prob_vector.invert ();
c214c858 2676
ca69b069 2677 scale_bbs_frequencies (&bb_before_epilog, 1, prob_epilog);
c214c858 2678 }
d75596cd 2679 scale_loop_profile (epilog, prob_epilog, 0);
6c6a3430 2680 }
2681 else
2682 slpeel_update_phi_nodes_for_lcssa (epilog);
fb85abff 2683
53771608 2684 unsigned HOST_WIDE_INT bound;
2685 if (bound_scalar.is_constant (&bound))
d75596cd 2686 {
53771608 2687 gcc_assert (bound != 0);
2688 /* -1 to convert loop iterations to latch iterations. */
2689 record_niter_bound (epilog, bound - 1, false, true);
d75596cd 2690 }
6c6a3430 2691
2692 delete_update_ssa ();
2693 adjust_vec_debug_stmts ();
2694 scev_reset ();
2695 }
2696 adjust_vec.release ();
fb85abff 2697 free_original_copy_tables ();
5b631e09 2698
2699 return epilog;
fb85abff 2700}
2701
d5e80d93 2702/* Function vect_create_cond_for_niters_checks.
2703
2704   Create a conditional expression that represents the run-time checks for
f9936b7c 2705   the loop's niter.  The loop is guaranteed to terminate if the run-time
d5e80d93 2706   checks hold.
2707
2708   Input:
2709   COND_EXPR - input conditional expression.  New conditions will be chained
2710               with logical AND operation.  If it is NULL, it is simply set
2711               to the niter check.
2712   LOOP_VINFO - field LOOP_VINFO_NITERS_ASSUMPTIONS contains the condition
2713               to be checked.
2714
2715 Output:
2716 COND_EXPR - conditional expression.
2717
2718 The returned COND_EXPR is the conditional expression to be used in the
2719 if statement that controls which version of the loop gets executed at
2720 runtime. */
2721
2722static void
2723vect_create_cond_for_niters_checks (loop_vec_info loop_vinfo, tree *cond_expr)
2724{
2725 tree part_cond_expr = LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo);
2726
2727 if (*cond_expr)
2728 *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
2729 *cond_expr, part_cond_expr);
2730 else
2731 *cond_expr = part_cond_expr;
2732}
fb85abff 2733
f68a7726 2734/* Set *COND_EXPR to a tree that is true when both the original *COND_EXPR
2735 and PART_COND_EXPR are true. Treat a null *COND_EXPR as "true". */
2736
2737static void
2738chain_cond_expr (tree *cond_expr, tree part_cond_expr)
2739{
2740 if (*cond_expr)
2741 *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
2742 *cond_expr, part_cond_expr);
2743 else
2744 *cond_expr = part_cond_expr;
2745}
2746
fb85abff 2747/* Function vect_create_cond_for_align_checks.
2748
2749 Create a conditional expression that represents the alignment checks for
2750   all of the data references (array element references) whose alignment must be
2751 checked at runtime.
2752
2753 Input:
2754 COND_EXPR - input conditional expression. New conditions will be chained
2755 with logical AND operation.
2756 LOOP_VINFO - two fields of the loop information are used.
2757 LOOP_VINFO_PTR_MASK is the mask used to check the alignment.
2758 LOOP_VINFO_MAY_MISALIGN_STMTS contains the refs to be checked.
2759
2760 Output:
2761 COND_EXPR_STMT_LIST - statements needed to construct the conditional
2762 expression.
2763 The returned value is the conditional expression to be used in the if
2764 statement that controls which version of the loop gets executed at runtime.
2765
2766 The algorithm makes two assumptions:
2767 1) The number of bytes "n" in a vector is a power of 2.
2768 2) An address "a" is aligned if a%n is zero and that this
2769 test can be done as a&(n-1) == 0. For example, for 16
2770 byte vectors the test is a&0xf == 0. */
2771
2772static void
2773vect_create_cond_for_align_checks (loop_vec_info loop_vinfo,
2774 tree *cond_expr,
2775 gimple_seq *cond_expr_stmt_list)
2776{
42acab1c 2777 vec<gimple *> may_misalign_stmts
fb85abff 2778 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
42acab1c 2779 gimple *ref_stmt;
fb85abff 2780 int mask = LOOP_VINFO_PTR_MASK (loop_vinfo);
2781 tree mask_cst;
2782 unsigned int i;
fb85abff 2783 tree int_ptrsize_type;
2784 char tmp_name[20];
2785 tree or_tmp_name = NULL_TREE;
03d37e4e 2786 tree and_tmp_name;
42acab1c 2787 gimple *and_stmt;
fb85abff 2788 tree ptrsize_zero;
2789 tree part_cond_expr;
2790
2791 /* Check that mask is one less than a power of 2, i.e., mask is
2792 all zeros followed by all ones. */
2793 gcc_assert ((mask != 0) && ((mask & (mask+1)) == 0));
2794
3cea8318 2795 int_ptrsize_type = signed_type_for (ptr_type_node);
fb85abff 2796
2797 /* Create expression (mask & (dr_1 || ... || dr_n)) where dr_i is the address
2798 of the first vector of the i'th data reference. */
2799
f1f41a6c 2800 FOR_EACH_VEC_ELT (may_misalign_stmts, i, ref_stmt)
fb85abff 2801 {
2802 gimple_seq new_stmt_list = NULL;
2803 tree addr_base;
03d37e4e 2804 tree addr_tmp_name;
2805 tree new_or_tmp_name;
42acab1c 2806 gimple *addr_stmt, *or_stmt;
f1b8c740 2807 stmt_vec_info stmt_vinfo = vinfo_for_stmt (ref_stmt);
2808 tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
2809 bool negative = tree_int_cst_compare
2810 (DR_STEP (STMT_VINFO_DATA_REF (stmt_vinfo)), size_zero_node) < 0;
2811 tree offset = negative
80b8a97a 2812 ? size_int (-TYPE_VECTOR_SUBPARTS (vectype) + 1) : size_zero_node;
fb85abff 2813
2814 /* create: addr_tmp = (int)(address_of_first_vector) */
2815 addr_base =
2816 vect_create_addr_base_for_vector_ref (ref_stmt, &new_stmt_list,
9e879814 2817 offset);
fb85abff 2818 if (new_stmt_list != NULL)
2819 gimple_seq_add_seq (cond_expr_stmt_list, new_stmt_list);
2820
03d37e4e 2821 sprintf (tmp_name, "addr2int%d", i);
2822 addr_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, tmp_name);
e9cf809e 2823 addr_stmt = gimple_build_assign (addr_tmp_name, NOP_EXPR, addr_base);
fb85abff 2824 gimple_seq_add_stmt (cond_expr_stmt_list, addr_stmt);
2825
2826      /* The addresses are ORed together.  */
2827
2828 if (or_tmp_name != NULL_TREE)
2829 {
2830 /* create: or_tmp = or_tmp | addr_tmp */
03d37e4e 2831 sprintf (tmp_name, "orptrs%d", i);
2832 new_or_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, tmp_name);
e9cf809e 2833 or_stmt = gimple_build_assign (new_or_tmp_name, BIT_IOR_EXPR,
2834 or_tmp_name, addr_tmp_name);
fb85abff 2835 gimple_seq_add_stmt (cond_expr_stmt_list, or_stmt);
2836 or_tmp_name = new_or_tmp_name;
2837 }
2838 else
2839 or_tmp_name = addr_tmp_name;
2840
2841 } /* end for i */
2842
2843 mask_cst = build_int_cst (int_ptrsize_type, mask);
2844
2845 /* create: and_tmp = or_tmp & mask */
03d37e4e 2846 and_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, "andmask");
fb85abff 2847
e9cf809e 2848 and_stmt = gimple_build_assign (and_tmp_name, BIT_AND_EXPR,
2849 or_tmp_name, mask_cst);
fb85abff 2850 gimple_seq_add_stmt (cond_expr_stmt_list, and_stmt);
2851
2852 /* Make and_tmp the left operand of the conditional test against zero.
2853      If and_tmp has a nonzero bit then some address is unaligned.  */
2854 ptrsize_zero = build_int_cst (int_ptrsize_type, 0);
2855 part_cond_expr = fold_build2 (EQ_EXPR, boolean_type_node,
2856 and_tmp_name, ptrsize_zero);
f68a7726 2857 chain_cond_expr (cond_expr, part_cond_expr);
2858}
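
/* Illustrative sketch, not part of this file: the run-time test that the
   statement sequence above builds, evaluated on plain integers.  MASK is
   LOOP_VINFO_PTR_MASK, i.e. the required alignment minus one; the
   identifiers are hypothetical.  */

static int
example_all_addrs_aligned (const unsigned long *addrs, unsigned int n,
                           unsigned long mask)
{
  unsigned long or_tmp = 0;
  for (unsigned int i = 0; i < n; i++)
    or_tmp |= addrs[i];
  /* A nonzero bit under MASK means at least one address is misaligned.  */
  return (or_tmp & mask) == 0;
}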
2859
2860/* If LOOP_VINFO_CHECK_UNEQUAL_ADDRS contains <A1, B1>, ..., <An, Bn>,
2861 create a tree representation of: (&A1 != &B1) && ... && (&An != &Bn).
2862 Set *COND_EXPR to a tree that is true when both the original *COND_EXPR
2863 and this new condition are true. Treat a null *COND_EXPR as "true". */
2864
2865static void
2866vect_create_cond_for_unequal_addrs (loop_vec_info loop_vinfo, tree *cond_expr)
2867{
2868 vec<vec_object_pair> pairs = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
2869 unsigned int i;
2870 vec_object_pair *pair;
2871 FOR_EACH_VEC_ELT (pairs, i, pair)
2872 {
2873 tree addr1 = build_fold_addr_expr (pair->first);
2874 tree addr2 = build_fold_addr_expr (pair->second);
2875 tree part_cond_expr = fold_build2 (NE_EXPR, boolean_type_node,
2876 addr1, addr2);
2877 chain_cond_expr (cond_expr, part_cond_expr);
2878 }
fb85abff 2879}
2880
e85b4a5e 2881/* Create an expression that is true when all lower-bound conditions for
2882 the vectorized loop are met. Chain this condition with *COND_EXPR. */
2883
2884static void
2885vect_create_cond_for_lower_bounds (loop_vec_info loop_vinfo, tree *cond_expr)
2886{
2887 vec<vec_lower_bound> lower_bounds = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
2888 for (unsigned int i = 0; i < lower_bounds.length (); ++i)
2889 {
2890 tree expr = lower_bounds[i].expr;
2891 tree type = unsigned_type_for (TREE_TYPE (expr));
2892 expr = fold_convert (type, expr);
2893 poly_uint64 bound = lower_bounds[i].min_value;
2894 if (!lower_bounds[i].unsigned_p)
2895 {
2896 expr = fold_build2 (PLUS_EXPR, type, expr,
2897 build_int_cstu (type, bound - 1));
2898 bound += bound - 1;
2899 }
2900 tree part_cond_expr = fold_build2 (GE_EXPR, boolean_type_node, expr,
2901 build_int_cstu (type, bound));
2902 chain_cond_expr (cond_expr, part_cond_expr);
2903 }
2904}
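
/* Illustrative sketch, not part of this file: the condition built above for
   a single signed EXPR with required minimum magnitude MIN_VALUE.  Adding
   MIN_VALUE - 1 and comparing against 2 * MIN_VALUE - 1 folds the test
   "EXPR >= MIN_VALUE || EXPR <= -MIN_VALUE" into one unsigned comparison.
   The identifiers are hypothetical.  */

static int
example_lower_bound_check (long expr, unsigned long min_value)
{
  unsigned long biased = (unsigned long) expr + (min_value - 1);
  return biased >= 2 * min_value - 1;
}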
2905
fb85abff 2906/* Function vect_create_cond_for_alias_checks.
2907
2908 Create a conditional expression that represents the run-time checks for
2909 overlapping of address ranges represented by a list of data references
2910 relations passed as input.
2911
2912 Input:
2913 COND_EXPR - input conditional expression. New conditions will be chained
8a7b0f48 2914 with logical AND operation. If it is NULL, then the function
2915 is used to return the number of alias checks.
fb85abff 2916 LOOP_VINFO - field LOOP_VINFO_MAY_ALIAS_STMTS contains the list of ddrs
2917 to be checked.
2918
2919 Output:
2920 COND_EXPR - conditional expression.
fb85abff 2921
8a7b0f48 2922 The returned COND_EXPR is the conditional expression to be used in the if
fb85abff 2923 statement that controls which version of the loop gets executed at runtime.
2924*/
2925
8a7b0f48 2926void
90d4c4af 2927vect_create_cond_for_alias_checks (loop_vec_info loop_vinfo, tree * cond_expr)
fb85abff 2928{
43d14b66 2929 vec<dr_with_seg_len_pair_t> comp_alias_ddrs =
8a7b0f48 2930 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
fb85abff 2931
8a7b0f48 2932 if (comp_alias_ddrs.is_empty ())
fb85abff 2933 return;
2934
49ce332c 2935 create_runtime_alias_checks (LOOP_VINFO_LOOP (loop_vinfo),
2936 &comp_alias_ddrs, cond_expr);
6d8fb6cf 2937 if (dump_enabled_p ())
b055bc88 2938 dump_printf_loc (MSG_NOTE, vect_location,
7bd765d4 2939 "created %u versioning for alias checks.\n",
8a7b0f48 2940 comp_alias_ddrs.length ());
fb85abff 2941}
2942
2943
2944/* Function vect_loop_versioning.
48e1416a 2945
fb85abff 2946   If the loop has data references that may or may not be aligned and/or
2947 has data reference relations whose independence was not proven then
2948 two versions of the loop need to be generated, one which is vectorized
2949 and one which isn't. A test is then generated to control which of the
2950 loops is executed. The test checks for the alignment of all of the
2951 data references that may or may not be aligned. An additional
2952   sequence of runtime tests is generated for each pair of DDRs whose
48e1416a 2953   independence was not proven.  The vectorized version of the loop is
2954   executed only if both the alias and alignment tests pass.
2955
fb85abff 2956 The test generated to check which version of loop is executed
48e1416a 2957 is modified to also check for profitability as indicated by the
97fe80a6 2958 cost model threshold TH.
23a3430d 2959
2960 The versioning precondition(s) are placed in *COND_EXPR and
2afdcbed 2961 *COND_EXPR_STMT_LIST. */
fb85abff 2962
2963void
e7430948 2964vect_loop_versioning (loop_vec_info loop_vinfo,
7456a7ea 2965 unsigned int th, bool check_profitability,
2966 poly_uint64 versioning_threshold)
fb85abff 2967{
d5e80d93 2968 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *nloop;
c71d3c24 2969 struct loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
fb85abff 2970 basic_block condition_bb;
1a91d914 2971 gphi_iterator gsi;
2972 gimple_stmt_iterator cond_exp_gsi;
fb85abff 2973 basic_block merge_bb;
2974 basic_block new_exit_bb;
2975 edge new_exit_e, e;
1a91d914 2976 gphi *orig_phi, *new_phi;
e7430948 2977 tree cond_expr = NULL_TREE;
2afdcbed 2978 gimple_seq cond_expr_stmt_list = NULL;
fb85abff 2979 tree arg;
ca69b069 2980 profile_probability prob = profile_probability::likely ();
fb85abff 2981 gimple_seq gimplify_stmt_list = NULL;
b622227e 2982 tree scalar_loop_iters = LOOP_VINFO_NITERSM1 (loop_vinfo);
6ee2edad 2983 bool version_align = LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo);
2984 bool version_alias = LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo);
d5e80d93 2985 bool version_niter = LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo);
fb85abff 2986
e7430948 2987 if (check_profitability)
ba12948e 2988 cond_expr = fold_build2 (GE_EXPR, boolean_type_node, scalar_loop_iters,
d5e80d93 2989 build_int_cst (TREE_TYPE (scalar_loop_iters),
b622227e 2990 th - 1));
7456a7ea 2991 if (maybe_ne (versioning_threshold, 0U))
2992 {
2993 tree expr = fold_build2 (GE_EXPR, boolean_type_node, scalar_loop_iters,
2994 build_int_cst (TREE_TYPE (scalar_loop_iters),
2995 versioning_threshold - 1));
2996 if (cond_expr)
2997 cond_expr = fold_build2 (BIT_AND_EXPR, boolean_type_node,
2998 expr, cond_expr);
2999 else
3000 cond_expr = expr;
3001 }
d5e80d93 3002
3003 if (version_niter)
3004 vect_create_cond_for_niters_checks (loop_vinfo, &cond_expr);
3005
3006 if (cond_expr)
3007 cond_expr = force_gimple_operand_1 (cond_expr, &cond_expr_stmt_list,
3008 is_gimple_condexpr, NULL_TREE);
fb85abff 3009
6ee2edad 3010 if (version_align)
2afdcbed 3011 vect_create_cond_for_align_checks (loop_vinfo, &cond_expr,
3012 &cond_expr_stmt_list);
fb85abff 3013
6ee2edad 3014 if (version_alias)
f68a7726 3015 {
3016 vect_create_cond_for_unequal_addrs (loop_vinfo, &cond_expr);
e85b4a5e 3017 vect_create_cond_for_lower_bounds (loop_vinfo, &cond_expr);
f68a7726 3018 vect_create_cond_for_alias_checks (loop_vinfo, &cond_expr);
3019 }
23a3430d 3020
f404501a 3021 cond_expr = force_gimple_operand_1 (unshare_expr (cond_expr),
3022 &gimplify_stmt_list,
2afdcbed 3023 is_gimple_condexpr, NULL_TREE);
3024 gimple_seq_add_seq (&cond_expr_stmt_list, gimplify_stmt_list);
fb85abff 3025
3026 initialize_original_copy_tables ();
c71d3c24 3027 if (scalar_loop)
3028 {
3029 edge scalar_e;
3030 basic_block preheader, scalar_preheader;
3031
3032      /* We don't want to scale SCALAR_LOOP's frequencies; we need to
3033 scale LOOP's frequencies instead. */
b6863ffa 3034 nloop = loop_version (scalar_loop, cond_expr, &condition_bb,
ca69b069 3035 prob, prob.invert (), prob, prob.invert (), true);
3036 scale_loop_frequencies (loop, prob);
c71d3c24 3037 /* CONDITION_BB was created above SCALAR_LOOP's preheader,
3038 while we need to move it above LOOP's preheader. */
3039 e = loop_preheader_edge (loop);
3040 scalar_e = loop_preheader_edge (scalar_loop);
3041 gcc_assert (empty_block_p (e->src)
3042 && single_pred_p (e->src));
3043 gcc_assert (empty_block_p (scalar_e->src)
3044 && single_pred_p (scalar_e->src));
3045 gcc_assert (single_pred_p (condition_bb));
3046 preheader = e->src;
3047 scalar_preheader = scalar_e->src;
3048 scalar_e = find_edge (condition_bb, scalar_preheader);
3049 e = single_pred_edge (preheader);
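      /* Rewire the CFG: the edge that entered CONDITION_BB now feeds
	 SCALAR_LOOP's preheader, CONDITION_BB's edge into the scalar
	 preheader is rerouted to LOOP's preheader, and the edge that
	 entered LOOP's preheader now enters CONDITION_BB.  Then update
	 the dominators accordingly.  */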
3050 redirect_edge_and_branch_force (single_pred_edge (condition_bb),
3051 scalar_preheader);
3052 redirect_edge_and_branch_force (scalar_e, preheader);
3053 redirect_edge_and_branch_force (e, condition_bb);
3054 set_immediate_dominator (CDI_DOMINATORS, condition_bb,
3055 single_pred (condition_bb));
3056 set_immediate_dominator (CDI_DOMINATORS, scalar_preheader,
3057 single_pred (scalar_preheader));
3058 set_immediate_dominator (CDI_DOMINATORS, preheader,
3059 condition_bb);
3060 }
3061 else
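    /* No separate scalar loop: version LOOP itself, which creates
       CONDITION_BB above LOOP's preheader where we want it.  */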
d5e80d93 3062 nloop = loop_version (loop, cond_expr, &condition_bb,
ca69b069 3063 prob, prob.invert (), prob, prob.invert (), true);
d5e80d93 3064
3065 if (version_niter)
3066 {
 3067	      /* The versioned loop could be infinite, so we need to clear the
 3068	 existing niter information, which was copied from the original loop. */
3069 gcc_assert (loop_constraint_set_p (loop, LOOP_C_FINITE));
3070 vect_free_loop_info_assumptions (nloop);
 3071	      /* And set constraint LOOP_C_INFINITE for the niter analyzer. */
3072 loop_constraint_set (loop, LOOP_C_INFINITE);
3073 }
6ee2edad 3074
c309657f 3075 if (LOCATION_LOCUS (vect_location.get_location_t ()) != UNKNOWN_LOCATION
6ee2edad 3076 && dump_enabled_p ())
3077 {
3078 if (version_alias)
3079 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
3080 "loop versioned for vectorization because of "
3081 "possible aliasing\n");
3082 if (version_align)
3083 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
3084 "loop versioned for vectorization to enhance "
3085 "alignment\n");
3087 }
9af5ce0c 3088 free_original_copy_tables ();
fb85abff 3089
48e1416a 3090 /* Loop versioning violates an assumption we try to maintain during
fb85abff 3091 vectorization - that the loop exit block has a single predecessor.
3092 After versioning, the exit block of both loop versions is the same
3093 basic block (i.e. it has two predecessors). Just in order to simplify
 3094	 the following transformations in the vectorizer, we fix this situation
3095 here by adding a new (empty) block on the exit-edge of the loop,
c71d3c24 3096 with the proper loop-exit phis to maintain loop-closed-form.
 3097	 If loop versioning was done from scalar_loop rather than from loop,
 3098	 merge_bb will already have just a single predecessor. */
48e1416a 3099
fb85abff 3100 merge_bb = single_exit (loop)->dest;
c71d3c24 3101 if (scalar_loop == NULL || EDGE_COUNT (merge_bb->preds) >= 2)
fb85abff 3102 {
c71d3c24 3103 gcc_assert (EDGE_COUNT (merge_bb->preds) >= 2);
3104 new_exit_bb = split_edge (single_exit (loop));
3105 new_exit_e = single_exit (loop);
3106 e = EDGE_SUCC (new_exit_bb, 0);
3107
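      /* For each PHI in MERGE_BB, create a corresponding PHI in
	 NEW_EXIT_BB that receives the value coming out of the loop, and
	 make the original PHI take its result instead, preserving
	 loop-closed SSA form.  */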
3108 for (gsi = gsi_start_phis (merge_bb); !gsi_end_p (gsi); gsi_next (&gsi))
3109 {
3110 tree new_res;
1a91d914 3111 orig_phi = gsi.phi ();
f9e245b2 3112 new_res = copy_ssa_name (PHI_RESULT (orig_phi));
c71d3c24 3113 new_phi = create_phi_node (new_res, new_exit_bb);
3114 arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, e);
3115 add_phi_arg (new_phi, arg, new_exit_e,
3116 gimple_phi_arg_location_from_edge (orig_phi, e));
3117 adjust_phi_and_debug_stmts (orig_phi, e, PHI_RESULT (new_phi));
3118 }
48e1416a 3119 }
fb85abff 3120
3121 /* End loop-exit-fixes after versioning. */
3122
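  /* Emit the statements that compute the versioning condition just before
     the GIMPLE_COND that ends CONDITION_BB.  */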
2afdcbed 3123 if (cond_expr_stmt_list)
fb85abff 3124 {
3125 cond_exp_gsi = gsi_last_bb (condition_bb);
2afdcbed 3126 gsi_insert_seq_before (&cond_exp_gsi, cond_expr_stmt_list,
23a3430d 3127 GSI_SAME_STMT);
fb85abff 3128 }
95e19962 3129 update_ssa (TODO_update_ssa);
fb85abff 3130}