/* Vectorizer Specific Loop Manipulations
   Copyright (C) 2003-2018 Free Software Foundation, Inc.
   Contributed by Dorit Naishlos <dorit@il.ibm.com>
   and Ira Rosen <irar@il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */

#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "tree.h"
#include "gimple.h"
#include "cfghooks.h"
#include "tree-pass.h"
#include "ssa.h"
#include "fold-const.h"
#include "cfganal.h"
#include "gimplify.h"
#include "gimple-iterator.h"
#include "gimplify-me.h"
#include "tree-cfg.h"
#include "tree-ssa-loop-manip.h"
#include "tree-into-ssa.h"
#include "tree-ssa.h"
#include "cfgloop.h"
#include "tree-scalar-evolution.h"
#include "tree-vectorizer.h"
#include "tree-ssa-loop-ivopts.h"
#include "gimple-fold.h"
#include "tree-ssa-loop-niter.h"
#include "internal-fn.h"
#include "stor-layout.h"
#include "optabs-query.h"
#include "vec-perm-indices.h"

/*************************************************************************
  Simple Loop Peeling Utilities

  Utilities to support loop peeling for vectorization purposes.
 *************************************************************************/


/* Renames the use *OP_P.  */

static void
rename_use_op (use_operand_p op_p)
{
  tree new_name;

  if (TREE_CODE (USE_FROM_PTR (op_p)) != SSA_NAME)
    return;

  new_name = get_current_def (USE_FROM_PTR (op_p));

  /* Something defined outside of the loop.  */
  if (!new_name)
    return;

  /* An ordinary ssa name defined in the loop.  */

  SET_USE (op_p, new_name);
}


/* Renames the variables in basic block BB.  Allow renaming of PHI arguments
   on edges incoming from outer-block header if RENAME_FROM_OUTER_LOOP is
   true.  */

static void
rename_variables_in_bb (basic_block bb, bool rename_from_outer_loop)
{
  gimple *stmt;
  use_operand_p use_p;
  ssa_op_iter iter;
  edge e;
  edge_iterator ei;
  struct loop *loop = bb->loop_father;
  struct loop *outer_loop = NULL;

  if (rename_from_outer_loop)
    {
      gcc_assert (loop);
      outer_loop = loop_outer (loop);
    }

  for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
       gsi_next (&gsi))
    {
      stmt = gsi_stmt (gsi);
      FOR_EACH_SSA_USE_OPERAND (use_p, stmt, iter, SSA_OP_ALL_USES)
        rename_use_op (use_p);
    }

  FOR_EACH_EDGE (e, ei, bb->preds)
    {
      if (!flow_bb_inside_loop_p (loop, e->src))
        {
          if (!rename_from_outer_loop)
            continue;
          if (e->src != outer_loop->header)
            {
              if (outer_loop->inner->next)
                {
                  /* If outer_loop has 2 inner loops, allow there to
                     be an extra basic block which decides which of the
                     two loops to use using LOOP_VECTORIZED.  */
                  if (!single_pred_p (e->src)
                      || single_pred (e->src) != outer_loop->header)
                    continue;
                }
            }
        }
      for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
           gsi_next (&gsi))
        rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (gsi.phi (), e));
    }
}


struct adjust_info
{
  tree from, to;
  basic_block bb;
};

/* A stack of values to be adjusted in debug stmts.  We have to
   process them LIFO, so that the closest substitution applies.  If we
   processed them FIFO, without the stack, we might substitute uses
   with a PHI DEF that would soon become non-dominant, and when we got
   to the suitable one, it wouldn't have anything to substitute any
   more.  */
static vec<adjust_info, va_heap> adjust_vec;

/* Adjust any debug stmts that referenced AI->from values to use the
   loop-closed AI->to, if the references are dominated by AI->bb and
   not by the definition of AI->from.  */

static void
adjust_debug_stmts_now (adjust_info *ai)
{
  basic_block bbphi = ai->bb;
  tree orig_def = ai->from;
  tree new_def = ai->to;
  imm_use_iterator imm_iter;
  gimple *stmt;
  basic_block bbdef = gimple_bb (SSA_NAME_DEF_STMT (orig_def));

  gcc_assert (dom_info_available_p (CDI_DOMINATORS));

  /* Adjust any debug stmts that held onto non-loop-closed
     references.  */
  FOR_EACH_IMM_USE_STMT (stmt, imm_iter, orig_def)
    {
      use_operand_p use_p;
      basic_block bbuse;

      if (!is_gimple_debug (stmt))
        continue;

      gcc_assert (gimple_debug_bind_p (stmt));

      bbuse = gimple_bb (stmt);

      if ((bbuse == bbphi
           || dominated_by_p (CDI_DOMINATORS, bbuse, bbphi))
          && !(bbuse == bbdef
               || dominated_by_p (CDI_DOMINATORS, bbuse, bbdef)))
        {
          if (new_def)
            FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
              SET_USE (use_p, new_def);
          else
            {
              gimple_debug_bind_reset_value (stmt);
              update_stmt (stmt);
            }
        }
    }
}

/* Adjust debug stmts as scheduled before.  */

static void
adjust_vec_debug_stmts (void)
{
  if (!MAY_HAVE_DEBUG_BIND_STMTS)
    return;

  gcc_assert (adjust_vec.exists ());

  while (!adjust_vec.is_empty ())
    {
      adjust_debug_stmts_now (&adjust_vec.last ());
      adjust_vec.pop ();
    }
}

/* Adjust any debug stmts that referenced FROM values to use the
   loop-closed TO, if the references are dominated by BB and not by
   the definition of FROM.  If adjust_vec is non-NULL, adjustments
   will be postponed until adjust_vec_debug_stmts is called.  */

static void
adjust_debug_stmts (tree from, tree to, basic_block bb)
{
  adjust_info ai;

  if (MAY_HAVE_DEBUG_BIND_STMTS
      && TREE_CODE (from) == SSA_NAME
      && ! SSA_NAME_IS_DEFAULT_DEF (from)
      && ! virtual_operand_p (from))
    {
      ai.from = from;
      ai.to = to;
      ai.bb = bb;

      if (adjust_vec.exists ())
        adjust_vec.safe_push (ai);
      else
        adjust_debug_stmts_now (&ai);
    }
}

/* Change E's phi arg in UPDATE_PHI to NEW_DEF, and record information
   to adjust any debug stmts that referenced the old phi arg,
   presumably non-loop-closed references left over from other
   transformations.  */

static void
adjust_phi_and_debug_stmts (gimple *update_phi, edge e, tree new_def)
{
  tree orig_def = PHI_ARG_DEF_FROM_EDGE (update_phi, e);

  SET_PHI_ARG_DEF (update_phi, e->dest_idx, new_def);

  if (MAY_HAVE_DEBUG_BIND_STMTS)
    adjust_debug_stmts (orig_def, PHI_RESULT (update_phi),
                        gimple_bb (update_phi));
}

/* Define one loop mask MASK from loop LOOP.  INIT_MASK is the value that
   the mask should have during the first iteration and NEXT_MASK is the
   value that it should have on subsequent iterations.  */

static void
vect_set_loop_mask (struct loop *loop, tree mask, tree init_mask,
                    tree next_mask)
{
  gphi *phi = create_phi_node (mask, loop->header);
  add_phi_arg (phi, init_mask, loop_preheader_edge (loop), UNKNOWN_LOCATION);
  add_phi_arg (phi, next_mask, loop_latch_edge (loop), UNKNOWN_LOCATION);
}

/* Add SEQ to the end of LOOP's preheader block.  */

static void
add_preheader_seq (struct loop *loop, gimple_seq seq)
{
  if (seq)
    {
      edge pe = loop_preheader_edge (loop);
      basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
      gcc_assert (!new_bb);
    }
}

/* Add SEQ to the beginning of LOOP's header block.  */

static void
add_header_seq (struct loop *loop, gimple_seq seq)
{
  if (seq)
    {
      gimple_stmt_iterator gsi = gsi_after_labels (loop->header);
      gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
    }
}

/* Return true if the target can interleave elements of two vectors.
   OFFSET is 0 if the first half of the vectors should be interleaved
   or 1 if the second half should.  When returning true, store the
   associated permutation in INDICES.  */
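/* For example, with two 8-element input vectors whose elements are
   numbered 0-7 and 8-15 respectively, OFFSET == 0 selects the low-half
   interleaving { 0, 8, 1, 9, 2, 10, ... } and OFFSET == 1 selects the
   high-half interleaving { 4, 12, 5, 13, 6, 14, ... }.  */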

static bool
interleave_supported_p (vec_perm_indices *indices, tree vectype,
                        unsigned int offset)
{
  poly_uint64 nelts = TYPE_VECTOR_SUBPARTS (vectype);
  poly_uint64 base = exact_div (nelts, 2) * offset;
  vec_perm_builder sel (nelts, 2, 3);
  for (unsigned int i = 0; i < 3; ++i)
    {
      sel.quick_push (base + i);
      sel.quick_push (base + i + nelts);
    }
  indices->new_vector (sel, 2, nelts);
  return can_vec_perm_const_p (TYPE_MODE (vectype), *indices);
}

/* Try to use permutes to define the masks in DEST_RGM using the masks
   in SRC_RGM, given that the former has twice as many masks as the
   latter.  Return true on success, adding any new statements to SEQ.  */
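/* For example, if DEST_RGM has four masks and SRC_RGM has two, DEST_RGM
   masks 0 and 1 are computed from SRC_RGM mask 0 (either as its low and
   high unpacks or as low-half and high-half self-interleaves), and
   DEST_RGM masks 2 and 3 are computed from SRC_RGM mask 1 in the same
   way.  */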

static bool
vect_maybe_permute_loop_masks (gimple_seq *seq, rgroup_masks *dest_rgm,
                               rgroup_masks *src_rgm)
{
  tree src_masktype = src_rgm->mask_type;
  tree dest_masktype = dest_rgm->mask_type;
  machine_mode src_mode = TYPE_MODE (src_masktype);
  if (dest_rgm->max_nscalars_per_iter <= src_rgm->max_nscalars_per_iter
      && optab_handler (vec_unpacku_hi_optab, src_mode) != CODE_FOR_nothing
      && optab_handler (vec_unpacku_lo_optab, src_mode) != CODE_FOR_nothing)
    {
      /* Unpacking the source masks gives at least as many mask bits as
         we need.  We can then VIEW_CONVERT any excess bits away.  */
      tree unpack_masktype = vect_halve_mask_nunits (src_masktype);
      for (unsigned int i = 0; i < dest_rgm->masks.length (); ++i)
        {
          tree src = src_rgm->masks[i / 2];
          tree dest = dest_rgm->masks[i];
          tree_code code = (i & 1 ? VEC_UNPACK_HI_EXPR
                            : VEC_UNPACK_LO_EXPR);
          gassign *stmt;
          if (dest_masktype == unpack_masktype)
            stmt = gimple_build_assign (dest, code, src);
          else
            {
              tree temp = make_ssa_name (unpack_masktype);
              stmt = gimple_build_assign (temp, code, src);
              gimple_seq_add_stmt (seq, stmt);
              stmt = gimple_build_assign (dest, VIEW_CONVERT_EXPR,
                                          build1 (VIEW_CONVERT_EXPR,
                                                  dest_masktype, temp));
            }
          gimple_seq_add_stmt (seq, stmt);
        }
      return true;
    }
  vec_perm_indices indices[2];
  if (dest_masktype == src_masktype
      && interleave_supported_p (&indices[0], src_masktype, 0)
      && interleave_supported_p (&indices[1], src_masktype, 1))
    {
      /* The destination requires twice as many mask bits as the source, so
         we can use interleaving permutes to double up the number of bits.  */
      tree masks[2];
      for (unsigned int i = 0; i < 2; ++i)
        masks[i] = vect_gen_perm_mask_checked (src_masktype, indices[i]);
      for (unsigned int i = 0; i < dest_rgm->masks.length (); ++i)
        {
          tree src = src_rgm->masks[i / 2];
          tree dest = dest_rgm->masks[i];
          gimple *stmt = gimple_build_assign (dest, VEC_PERM_EXPR,
                                              src, src, masks[i & 1]);
          gimple_seq_add_stmt (seq, stmt);
        }
      return true;
    }
  return false;
}

/* Helper for vect_set_loop_condition_masked.  Generate definitions for
   all the masks in RGM and return a mask that is nonzero when the loop
   needs to iterate.  Add any new preheader statements to PREHEADER_SEQ.
   Use LOOP_COND_GSI to insert code before the exit gcond.

   RGM belongs to loop LOOP.  The loop originally iterated NITERS
   times and has been vectorized according to LOOP_VINFO.  Each iteration
   of the vectorized loop handles VF iterations of the scalar loop.

   If NITERS_SKIP is nonnull, the first iteration of the vectorized loop
   starts with NITERS_SKIP dummy iterations of the scalar loop before
   the real work starts.  The mask elements for these dummy iterations
   must be 0, to ensure that the extra iterations do not have an effect.

   It is known that:

     NITERS * RGM->max_nscalars_per_iter

   does not overflow.  However, MIGHT_WRAP_P says whether an induction
   variable that starts at 0 and has step:

     VF * RGM->max_nscalars_per_iter

   might overflow before hitting a value above:

     (NITERS + NITERS_SKIP) * RGM->max_nscalars_per_iter

   This means that we cannot guarantee that such an induction variable
   would ever hit a value that produces a set of all-false masks for RGM.  */

static tree
vect_set_loop_masks_directly (struct loop *loop, loop_vec_info loop_vinfo,
                              gimple_seq *preheader_seq,
                              gimple_stmt_iterator loop_cond_gsi,
                              rgroup_masks *rgm, tree vf,
                              tree niters, tree niters_skip,
                              bool might_wrap_p)
{
  tree compare_type = LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo);
  tree mask_type = rgm->mask_type;
  unsigned int nscalars_per_iter = rgm->max_nscalars_per_iter;
  poly_uint64 nscalars_per_mask = TYPE_VECTOR_SUBPARTS (mask_type);

  /* Calculate the maximum number of scalar values that the rgroup
     handles in total, the number that it handles for each iteration
     of the vector loop, and the number that it should skip during the
     first iteration of the vector loop.  */
  tree nscalars_total = niters;
  tree nscalars_step = vf;
  tree nscalars_skip = niters_skip;
  if (nscalars_per_iter != 1)
    {
      /* We checked before choosing to use a fully-masked loop that these
         multiplications don't overflow.  */
      tree factor = build_int_cst (compare_type, nscalars_per_iter);
      nscalars_total = gimple_build (preheader_seq, MULT_EXPR, compare_type,
                                     nscalars_total, factor);
      nscalars_step = gimple_build (preheader_seq, MULT_EXPR, compare_type,
                                    nscalars_step, factor);
      if (nscalars_skip)
        nscalars_skip = gimple_build (preheader_seq, MULT_EXPR, compare_type,
                                      nscalars_skip, factor);
    }

  /* Create an induction variable that counts the number of scalars
     processed.  */
  tree index_before_incr, index_after_incr;
  gimple_stmt_iterator incr_gsi;
  bool insert_after;
  tree zero_index = build_int_cst (compare_type, 0);
  standard_iv_increment_position (loop, &incr_gsi, &insert_after);
  create_iv (zero_index, nscalars_step, NULL_TREE, loop, &incr_gsi,
             insert_after, &index_before_incr, &index_after_incr);

  tree test_index, test_limit, first_limit;
  gimple_stmt_iterator *test_gsi;
  if (might_wrap_p)
    {
      /* In principle the loop should stop iterating once the incremented
         IV reaches a value greater than or equal to:

           NSCALARS_TOTAL +[infinite-prec] NSCALARS_SKIP

         However, there's no guarantee that this addition doesn't overflow
         the comparison type, or that the IV hits a value above it before
         wrapping around.  We therefore adjust the limit down by one
         IV step:

           (NSCALARS_TOTAL +[infinite-prec] NSCALARS_SKIP)
           -[infinite-prec] NSCALARS_STEP

         and compare the IV against this limit _before_ incrementing it.
         Since the comparison type is unsigned, we actually want the
         subtraction to saturate at zero:

           (NSCALARS_TOTAL +[infinite-prec] NSCALARS_SKIP)
           -[sat] NSCALARS_STEP

         And since NSCALARS_SKIP < NSCALARS_STEP, we can reassociate this as:

           NSCALARS_TOTAL -[sat] (NSCALARS_STEP - NSCALARS_SKIP)

         where the rightmost subtraction can be done directly in
         COMPARE_TYPE.  */
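      /* For example, with NSCALARS_TOTAL == 100, NSCALARS_SKIP == 3 and
         NSCALARS_STEP == 16, the limit is 100 - (16 - 3) == 87.  The loop
         keeps iterating while the pre-increment IV (0, 16, 32, ...) is
         below 87, giving the 7 iterations needed to cover all 103 lane
         positions (3 dummy ones plus 100 real scalars).  */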
      test_index = index_before_incr;
      tree adjust = nscalars_step;
      if (nscalars_skip)
        adjust = gimple_build (preheader_seq, MINUS_EXPR, compare_type,
                               adjust, nscalars_skip);
      test_limit = gimple_build (preheader_seq, MAX_EXPR, compare_type,
                                 nscalars_total, adjust);
      test_limit = gimple_build (preheader_seq, MINUS_EXPR, compare_type,
                                 test_limit, adjust);
      test_gsi = &incr_gsi;

      /* Get a safe limit for the first iteration.  */
      if (nscalars_skip)
        {
          /* The first vector iteration can handle at most NSCALARS_STEP
             scalars.  NSCALARS_STEP <= CONST_LIMIT, and adding
             NSCALARS_SKIP to that cannot overflow.  */
          tree const_limit = build_int_cst (compare_type,
                                            LOOP_VINFO_VECT_FACTOR (loop_vinfo)
                                            * nscalars_per_iter);
          first_limit = gimple_build (preheader_seq, MIN_EXPR, compare_type,
                                      nscalars_total, const_limit);
          first_limit = gimple_build (preheader_seq, PLUS_EXPR, compare_type,
                                      first_limit, nscalars_skip);
        }
      else
        /* For the first iteration it doesn't matter whether the IV hits
           a value above NSCALARS_TOTAL.  That only matters for the latch
           condition.  */
        first_limit = nscalars_total;
    }
  else
    {
      /* Test the incremented IV, which will always hit a value above
         the bound before wrapping.  */
      test_index = index_after_incr;
      test_limit = nscalars_total;
      if (nscalars_skip)
        test_limit = gimple_build (preheader_seq, PLUS_EXPR, compare_type,
                                   test_limit, nscalars_skip);
      test_gsi = &loop_cond_gsi;

      first_limit = test_limit;
    }

  /* Provide a definition of each mask in the group.  */
  tree next_mask = NULL_TREE;
  tree mask;
  unsigned int i;
  FOR_EACH_VEC_ELT_REVERSE (rgm->masks, i, mask)
    {
      /* Previous masks will cover BIAS scalars.  This mask covers the
         next batch.  */
      poly_uint64 bias = nscalars_per_mask * i;
      tree bias_tree = build_int_cst (compare_type, bias);
      gimple *tmp_stmt;

      /* See whether the first iteration of the vector loop is known
         to have a full mask.  */
      poly_uint64 const_limit;
      bool first_iteration_full
        = (poly_int_tree_p (first_limit, &const_limit)
           && known_ge (const_limit, (i + 1) * nscalars_per_mask));

      /* Rather than have a new IV that starts at BIAS and goes up to
         TEST_LIMIT, prefer to use the same 0-based IV for each mask
         and adjust the bound down by BIAS.  */
      tree this_test_limit = test_limit;
      if (i != 0)
        {
          this_test_limit = gimple_build (preheader_seq, MAX_EXPR,
                                          compare_type, this_test_limit,
                                          bias_tree);
          this_test_limit = gimple_build (preheader_seq, MINUS_EXPR,
                                          compare_type, this_test_limit,
                                          bias_tree);
        }

      /* Create the initial mask.  First include all scalars that
         are within the loop limit.  */
      tree init_mask = NULL_TREE;
      if (!first_iteration_full)
        {
          tree start, end;
          if (first_limit == test_limit)
            {
              /* Use a natural test between zero (the initial IV value)
                 and the loop limit.  The "else" block would be valid too,
                 but this choice can avoid the need to load BIAS_TREE into
                 a register.  */
              start = zero_index;
              end = this_test_limit;
            }
          else
            {
              /* FIRST_LIMIT is the maximum number of scalars handled by the
                 first iteration of the vector loop.  Test the portion
                 associated with this mask.  */
              start = bias_tree;
              end = first_limit;
            }

          init_mask = make_temp_ssa_name (mask_type, NULL, "max_mask");
          tmp_stmt = vect_gen_while (init_mask, start, end);
          gimple_seq_add_stmt (preheader_seq, tmp_stmt);
        }

      /* Now AND out the bits that are within the number of skipped
         scalars.  */
      poly_uint64 const_skip;
      if (nscalars_skip
          && !(poly_int_tree_p (nscalars_skip, &const_skip)
               && known_le (const_skip, bias)))
        {
          tree unskipped_mask = vect_gen_while_not (preheader_seq, mask_type,
                                                    bias_tree, nscalars_skip);
          if (init_mask)
            init_mask = gimple_build (preheader_seq, BIT_AND_EXPR, mask_type,
                                      init_mask, unskipped_mask);
          else
            init_mask = unskipped_mask;
        }

      if (!init_mask)
        /* First iteration is full.  */
        init_mask = build_minus_one_cst (mask_type);

      /* Get the mask value for the next iteration of the loop.  */
      next_mask = make_temp_ssa_name (mask_type, NULL, "next_mask");
      gcall *call = vect_gen_while (next_mask, test_index, this_test_limit);
      gsi_insert_before (test_gsi, call, GSI_SAME_STMT);

      vect_set_loop_mask (loop, mask, init_mask, next_mask);
    }
  return next_mask;
}

/* Make LOOP iterate NITERS times using masking and WHILE_ULT calls.
   LOOP_VINFO describes the vectorization of LOOP.  NITERS is the
   number of iterations of the original scalar loop.  NITERS_MAYBE_ZERO
   and FINAL_IV are as for vect_set_loop_condition.

   Insert the branch-back condition before LOOP_COND_GSI and return the
   final gcond.  */

static gcond *
vect_set_loop_condition_masked (struct loop *loop, loop_vec_info loop_vinfo,
                                tree niters, tree final_iv,
                                bool niters_maybe_zero,
                                gimple_stmt_iterator loop_cond_gsi)
{
  gimple_seq preheader_seq = NULL;
  gimple_seq header_seq = NULL;

  tree compare_type = LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo);
  unsigned int compare_precision = TYPE_PRECISION (compare_type);
  unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);
  tree orig_niters = niters;

  /* Type of the initial value of NITERS.  */
  tree ni_actual_type = TREE_TYPE (niters);
  unsigned int ni_actual_precision = TYPE_PRECISION (ni_actual_type);

  /* Convert NITERS to the same size as the compare.  */
  if (compare_precision > ni_actual_precision
      && niters_maybe_zero)
    {
      /* We know that there is always at least one iteration, so if the
         count is zero then it must have wrapped.  Cope with this by
         subtracting 1 before the conversion and adding 1 to the result.  */
      gcc_assert (TYPE_UNSIGNED (ni_actual_type));
      niters = gimple_build (&preheader_seq, PLUS_EXPR, ni_actual_type,
                             niters, build_minus_one_cst (ni_actual_type));
      niters = gimple_convert (&preheader_seq, compare_type, niters);
      niters = gimple_build (&preheader_seq, PLUS_EXPR, compare_type,
                             niters, build_one_cst (compare_type));
    }
  else
    niters = gimple_convert (&preheader_seq, compare_type, niters);

  /* Convert skip_niters to the right type.  */
  tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);

  /* Now calculate the value that the induction variable must be able
     to hit in order to ensure that we end the loop with an all-false mask.
     This involves adding the maximum number of inactive trailing scalar
     iterations.  */
  widest_int iv_limit;
  bool known_max_iters = max_loop_iterations (loop, &iv_limit);
  if (known_max_iters)
    {
      if (niters_skip)
        {
          /* Add the maximum number of skipped iterations to the
             maximum iteration count.  */
          if (TREE_CODE (niters_skip) == INTEGER_CST)
            iv_limit += wi::to_widest (niters_skip);
          else
            iv_limit += max_vf - 1;
        }
      /* IV_LIMIT is the maximum number of latch iterations, which is also
         the maximum in-range IV value.  Round this value down to the previous
         vector alignment boundary and then add an extra full iteration.  */
      poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
      iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
    }

  /* Get the vectorization factor in tree form.  */
  tree vf = build_int_cst (compare_type,
                           LOOP_VINFO_VECT_FACTOR (loop_vinfo));

  /* Iterate over all the rgroups and fill in their masks.  We could use
     the first mask from any rgroup for the loop condition; here we
     arbitrarily pick the last.  */
  tree test_mask = NULL_TREE;
  rgroup_masks *rgm;
  unsigned int i;
  vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
  FOR_EACH_VEC_ELT (*masks, i, rgm)
    if (!rgm->masks.is_empty ())
      {
        /* First try using permutes.  This adds a single vector
           instruction to the loop for each mask, but needs no extra
           loop invariants or IVs.  */
        unsigned int nmasks = i + 1;
        if ((nmasks & 1) == 0)
          {
            rgroup_masks *half_rgm = &(*masks)[nmasks / 2 - 1];
            if (!half_rgm->masks.is_empty ()
                && vect_maybe_permute_loop_masks (&header_seq, rgm, half_rgm))
              continue;
          }

        /* See whether zero-based IV would ever generate all-false masks
           before wrapping around.  */
        bool might_wrap_p
          = (!known_max_iters
             || (wi::min_precision (iv_limit * rgm->max_nscalars_per_iter,
                                    UNSIGNED)
                 > compare_precision));

        /* Set up all masks for this group.  */
        test_mask = vect_set_loop_masks_directly (loop, loop_vinfo,
                                                  &preheader_seq,
                                                  loop_cond_gsi, rgm, vf,
                                                  niters, niters_skip,
                                                  might_wrap_p);
      }

  /* Emit all accumulated statements.  */
  add_preheader_seq (loop, preheader_seq);
  add_header_seq (loop, header_seq);

  /* Get a boolean result that tells us whether to iterate.  */
  edge exit_edge = single_exit (loop);
  tree_code code = (exit_edge->flags & EDGE_TRUE_VALUE) ? EQ_EXPR : NE_EXPR;
  tree zero_mask = build_zero_cst (TREE_TYPE (test_mask));
  gcond *cond_stmt = gimple_build_cond (code, test_mask, zero_mask,
                                        NULL_TREE, NULL_TREE);
  gsi_insert_before (&loop_cond_gsi, cond_stmt, GSI_SAME_STMT);

  /* The loop iterates (NITERS - 1) / VF + 1 times.
     Subtract one from this to get the latch count.  */
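  /* For example, NITERS == 100 and VF == 16 give (100 - 1) / 16 + 1 == 7
     vector iterations (the last one with a partial mask), so the latch
     executes 6 times.  */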
  tree step = build_int_cst (compare_type,
                             LOOP_VINFO_VECT_FACTOR (loop_vinfo));
  tree niters_minus_one = fold_build2 (PLUS_EXPR, compare_type, niters,
                                       build_minus_one_cst (compare_type));
  loop->nb_iterations = fold_build2 (TRUNC_DIV_EXPR, compare_type,
                                     niters_minus_one, step);

  if (final_iv)
    {
      gassign *assign = gimple_build_assign (final_iv, orig_niters);
      gsi_insert_on_edge_immediate (single_exit (loop), assign);
    }

  return cond_stmt;
}

/* Like vect_set_loop_condition, but handle the case in which there
   are no loop masks.  */

static gcond *
vect_set_loop_condition_unmasked (struct loop *loop, tree niters,
                                  tree step, tree final_iv,
                                  bool niters_maybe_zero,
                                  gimple_stmt_iterator loop_cond_gsi)
{
  tree indx_before_incr, indx_after_incr;
  gcond *cond_stmt;
  gcond *orig_cond;
  edge pe = loop_preheader_edge (loop);
  edge exit_edge = single_exit (loop);
  gimple_stmt_iterator incr_gsi;
  bool insert_after;
  enum tree_code code;
  tree niters_type = TREE_TYPE (niters);

  orig_cond = get_loop_exit_condition (loop);
  gcc_assert (orig_cond);
  loop_cond_gsi = gsi_for_stmt (orig_cond);

  tree init, limit;
  if (!niters_maybe_zero && integer_onep (step))
    {
      /* In this case we can use a simple 0-based IV:

         A:
           x = 0;
           do
             {
               ...
               x += 1;
             }
           while (x < NITERS);  */
      code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GE_EXPR : LT_EXPR;
      init = build_zero_cst (niters_type);
      limit = niters;
    }
  else
    {
      /* The following works for all values of NITERS except 0:

         B:
           x = 0;
           do
             {
               ...
               x += STEP;
             }
           while (x <= NITERS - STEP);

         so that the loop continues to iterate if x + STEP - 1 < NITERS
         but stops if x + STEP - 1 >= NITERS.

         However, if NITERS is zero, x never hits a value above NITERS - STEP
         before wrapping around.  There are two obvious ways of dealing with
         this:

         - start at STEP - 1 and compare x before incrementing it
         - start at -1 and compare x after incrementing it

         The latter is simpler and is what we use.  The loop in this case
         looks like:

         C:
           x = -1;
           do
             {
               ...
               x += STEP;
             }
           while (x < NITERS - STEP);

         In both cases the loop limit is NITERS - STEP.  */
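      /* For example, with STEP == 4 and NITERS == 10 the limit is 6:
         case B runs the body for x == 0 and x == 4 and stops once the
         incremented value 8 exceeds 6, giving (10 - 4) / 4 + 1 == 2
         iterations.  */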
      gimple_seq seq = NULL;
      limit = force_gimple_operand (niters, &seq, true, NULL_TREE);
      limit = gimple_build (&seq, MINUS_EXPR, TREE_TYPE (limit), limit, step);
      if (seq)
        {
          basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
          gcc_assert (!new_bb);
        }
      if (niters_maybe_zero)
        {
          /* Case C.  */
          code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GE_EXPR : LT_EXPR;
          init = build_all_ones_cst (niters_type);
        }
      else
        {
          /* Case B.  */
          code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GT_EXPR : LE_EXPR;
          init = build_zero_cst (niters_type);
        }
    }

  standard_iv_increment_position (loop, &incr_gsi, &insert_after);
  create_iv (init, step, NULL_TREE, loop,
             &incr_gsi, insert_after, &indx_before_incr, &indx_after_incr);
  indx_after_incr = force_gimple_operand_gsi (&loop_cond_gsi, indx_after_incr,
                                              true, NULL_TREE, true,
                                              GSI_SAME_STMT);
  limit = force_gimple_operand_gsi (&loop_cond_gsi, limit, true, NULL_TREE,
                                    true, GSI_SAME_STMT);

  cond_stmt = gimple_build_cond (code, indx_after_incr, limit, NULL_TREE,
                                 NULL_TREE);

  gsi_insert_before (&loop_cond_gsi, cond_stmt, GSI_SAME_STMT);

  /* Record the number of latch iterations.  */
  if (limit == niters)
    /* Case A: the loop iterates NITERS times.  Subtract one to get the
       latch count.  */
    loop->nb_iterations = fold_build2 (MINUS_EXPR, niters_type, niters,
                                       build_int_cst (niters_type, 1));
  else
    /* Case B or C: the loop iterates (NITERS - STEP) / STEP + 1 times.
       Subtract one from this to get the latch count.  */
    loop->nb_iterations = fold_build2 (TRUNC_DIV_EXPR, niters_type,
                                       limit, step);

  if (final_iv)
    {
      gassign *assign = gimple_build_assign (final_iv, MINUS_EXPR,
                                             indx_after_incr, init);
      gsi_insert_on_edge_immediate (single_exit (loop), assign);
    }

  return cond_stmt;
}

/* If we're using fully-masked loops, make LOOP iterate:

      N == (NITERS - 1) / STEP + 1

   times.  When NITERS is zero, this is equivalent to making the loop
   execute (1 << M) / STEP times, where M is the precision of NITERS.
   NITERS_MAYBE_ZERO is true if this last case might occur.

   If we're not using fully-masked loops, make LOOP iterate:

      N == (NITERS - STEP) / STEP + 1

   times, where NITERS is known to be outside the range [1, STEP - 1].
   This is equivalent to making the loop execute NITERS / STEP times
   when NITERS is nonzero and (1 << M) / STEP times otherwise.
   NITERS_MAYBE_ZERO again indicates whether this last case might occur.

   If FINAL_IV is nonnull, it is an SSA name that should be set to
   N * STEP on exit from the loop.

   Assumption: the exit-condition of LOOP is the last stmt in the loop.  */
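/* For example, NITERS == 10 and STEP == 4 give N == (10 - 1) / 4 + 1 == 3
   for a fully-masked loop (the final iteration runs with a partial mask)
   and N == (10 - 4) / 4 + 1 == 2 otherwise, with the remaining scalar
   iterations left to be handled elsewhere (e.g. by an epilog loop).  */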

void
vect_set_loop_condition (struct loop *loop, loop_vec_info loop_vinfo,
                         tree niters, tree step, tree final_iv,
                         bool niters_maybe_zero)
{
  gcond *cond_stmt;
  gcond *orig_cond = get_loop_exit_condition (loop);
  gimple_stmt_iterator loop_cond_gsi = gsi_for_stmt (orig_cond);

  if (loop_vinfo && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
    cond_stmt = vect_set_loop_condition_masked (loop, loop_vinfo, niters,
                                                final_iv, niters_maybe_zero,
                                                loop_cond_gsi);
  else
    cond_stmt = vect_set_loop_condition_unmasked (loop, niters, step,
                                                  final_iv, niters_maybe_zero,
                                                  loop_cond_gsi);

  /* Remove old loop exit test.  */
  gsi_remove (&loop_cond_gsi, true);
  free_stmt_vec_info (orig_cond);

  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location, "New loop exit condition: ");
      dump_gimple_stmt (MSG_NOTE, TDF_SLIM, cond_stmt, 0);
    }
}

/* Helper routine of slpeel_tree_duplicate_loop_to_edge_cfg.
   For the PHI arguments in FROM->dest and TO->dest that come from those
   edges, ensure that each TO->dest argument has its current_def set to
   the current_def of the corresponding FROM->dest argument.  */

static void
slpeel_duplicate_current_defs_from_edges (edge from, edge to)
{
  gimple_stmt_iterator gsi_from, gsi_to;

  for (gsi_from = gsi_start_phis (from->dest),
       gsi_to = gsi_start_phis (to->dest);
       !gsi_end_p (gsi_from) && !gsi_end_p (gsi_to);)
    {
      gimple *from_phi = gsi_stmt (gsi_from);
      gimple *to_phi = gsi_stmt (gsi_to);
      tree from_arg = PHI_ARG_DEF_FROM_EDGE (from_phi, from);
      tree to_arg = PHI_ARG_DEF_FROM_EDGE (to_phi, to);
      if (virtual_operand_p (from_arg))
        {
          gsi_next (&gsi_from);
          continue;
        }
      if (virtual_operand_p (to_arg))
        {
          gsi_next (&gsi_to);
          continue;
        }
      if (TREE_CODE (from_arg) != SSA_NAME)
        gcc_assert (operand_equal_p (from_arg, to_arg, 0));
      else
        {
          if (get_current_def (to_arg) == NULL_TREE)
            set_current_def (to_arg, get_current_def (from_arg));
        }
      gsi_next (&gsi_from);
      gsi_next (&gsi_to);
    }

  gphi *from_phi = get_virtual_phi (from->dest);
  gphi *to_phi = get_virtual_phi (to->dest);
  if (from_phi)
    set_current_def (PHI_ARG_DEF_FROM_EDGE (to_phi, to),
                     get_current_def (PHI_ARG_DEF_FROM_EDGE (from_phi, from)));
}


/* Given LOOP this function generates a new copy of it and puts it
   on E which is either the entry or exit of LOOP.  If SCALAR_LOOP is
   non-NULL, assume LOOP and SCALAR_LOOP are equivalent and copy the
   basic blocks from SCALAR_LOOP instead of LOOP, but to either the
   entry or exit of LOOP.  */

struct loop *
slpeel_tree_duplicate_loop_to_edge_cfg (struct loop *loop,
                                        struct loop *scalar_loop, edge e)
{
  struct loop *new_loop;
  basic_block *new_bbs, *bbs, *pbbs;
  bool at_exit;
  bool was_imm_dom;
  basic_block exit_dest;
  edge exit, new_exit;
  bool duplicate_outer_loop = false;

  exit = single_exit (loop);
  at_exit = (e == exit);
  if (!at_exit && e != loop_preheader_edge (loop))
    return NULL;

  if (scalar_loop == NULL)
    scalar_loop = loop;

  bbs = XNEWVEC (basic_block, scalar_loop->num_nodes + 1);
  pbbs = bbs + 1;
  get_loop_body_with_size (scalar_loop, pbbs, scalar_loop->num_nodes);
  /* Allow duplication of outer loops.  */
  if (scalar_loop->inner)
    duplicate_outer_loop = true;
  /* Check whether duplication is possible.  */
  if (!can_copy_bbs_p (pbbs, scalar_loop->num_nodes))
    {
      free (bbs);
      return NULL;
    }

  /* Generate new loop structure.  */
  new_loop = duplicate_loop (scalar_loop, loop_outer (scalar_loop));
  duplicate_subloops (scalar_loop, new_loop);

  exit_dest = exit->dest;
  was_imm_dom = (get_immediate_dominator (CDI_DOMINATORS,
                                          exit_dest) == loop->header ?
                 true : false);

  /* Also copy the pre-header, this avoids jumping through hoops to
     duplicate the loop entry PHI arguments.  Create an empty
     pre-header unconditionally for this.  */
  basic_block preheader = split_edge (loop_preheader_edge (scalar_loop));
  edge entry_e = single_pred_edge (preheader);
  bbs[0] = preheader;
  new_bbs = XNEWVEC (basic_block, scalar_loop->num_nodes + 1);

  exit = single_exit (scalar_loop);
  copy_bbs (bbs, scalar_loop->num_nodes + 1, new_bbs,
            &exit, 1, &new_exit, NULL,
            at_exit ? loop->latch : e->src, true);
  exit = single_exit (loop);
  basic_block new_preheader = new_bbs[0];

  add_phi_args_after_copy (new_bbs, scalar_loop->num_nodes + 1, NULL);

  if (scalar_loop != loop)
    {
      /* If we copied from SCALAR_LOOP rather than LOOP, SSA_NAMEs from
         SCALAR_LOOP will have current_def set to SSA_NAMEs in the new_loop,
         but LOOP will not.  slpeel_update_phi_nodes_for_guard{1,2} expects
         the LOOP SSA_NAMEs (on the exit edge and edge from latch to
         header) to have current_def set, so copy them over.  */
      slpeel_duplicate_current_defs_from_edges (single_exit (scalar_loop),
                                                exit);
      slpeel_duplicate_current_defs_from_edges (EDGE_SUCC (scalar_loop->latch,
                                                           0),
                                                EDGE_SUCC (loop->latch, 0));
    }

ebfd146a
IR
1072 if (at_exit) /* Add the loop copy at exit. */
1073 {
5ce9450f
JJ
1074 if (scalar_loop != loop)
1075 {
538dd0b7 1076 gphi_iterator gsi;
5ce9450f
JJ
1077 new_exit = redirect_edge_and_branch (new_exit, exit_dest);
1078
1079 for (gsi = gsi_start_phis (exit_dest); !gsi_end_p (gsi);
1080 gsi_next (&gsi))
1081 {
538dd0b7 1082 gphi *phi = gsi.phi ();
5ce9450f
JJ
1083 tree orig_arg = PHI_ARG_DEF_FROM_EDGE (phi, e);
1084 location_t orig_locus
1085 = gimple_phi_arg_location_from_edge (phi, e);
1086
1087 add_phi_arg (phi, orig_arg, new_exit, orig_locus);
1088 }
1089 }
2cfc56b9
RB
1090 redirect_edge_and_branch_force (e, new_preheader);
1091 flush_pending_stmts (e);
1092 set_immediate_dominator (CDI_DOMINATORS, new_preheader, e->src);
a6c51a12 1093 if (was_imm_dom || duplicate_outer_loop)
5ce9450f 1094 set_immediate_dominator (CDI_DOMINATORS, exit_dest, new_exit->src);
2cfc56b9
RB
1095
1096 /* And remove the non-necessary forwarder again. Keep the other
1097 one so we have a proper pre-header for the loop at the exit edge. */
5ce9450f
JJ
1098 redirect_edge_pred (single_succ_edge (preheader),
1099 single_pred (preheader));
2cfc56b9 1100 delete_basic_block (preheader);
5ce9450f
JJ
1101 set_immediate_dominator (CDI_DOMINATORS, scalar_loop->header,
1102 loop_preheader_edge (scalar_loop)->src);
ebfd146a
IR
1103 }
1104 else /* Add the copy at entry. */
1105 {
5ce9450f
JJ
1106 if (scalar_loop != loop)
1107 {
1108 /* Remove the non-necessary forwarder of scalar_loop again. */
1109 redirect_edge_pred (single_succ_edge (preheader),
1110 single_pred (preheader));
1111 delete_basic_block (preheader);
1112 set_immediate_dominator (CDI_DOMINATORS, scalar_loop->header,
1113 loop_preheader_edge (scalar_loop)->src);
1114 preheader = split_edge (loop_preheader_edge (loop));
1115 entry_e = single_pred_edge (preheader);
1116 }
1117
2cfc56b9
RB
1118 redirect_edge_and_branch_force (entry_e, new_preheader);
1119 flush_pending_stmts (entry_e);
1120 set_immediate_dominator (CDI_DOMINATORS, new_preheader, entry_e->src);
1121
1122 redirect_edge_and_branch_force (new_exit, preheader);
1123 flush_pending_stmts (new_exit);
1124 set_immediate_dominator (CDI_DOMINATORS, preheader, new_exit->src);
1125
1126 /* And remove the non-necessary forwarder again. Keep the other
1127 one so we have a proper pre-header for the loop at the exit edge. */
5ce9450f
JJ
1128 redirect_edge_pred (single_succ_edge (new_preheader),
1129 single_pred (new_preheader));
2cfc56b9
RB
1130 delete_basic_block (new_preheader);
1131 set_immediate_dominator (CDI_DOMINATORS, new_loop->header,
1132 loop_preheader_edge (new_loop)->src);
ebfd146a
IR
1133 }
1134
166b8799
BC
1135 /* Skip new preheader since it's deleted if copy loop is added at entry. */
1136 for (unsigned i = (at_exit ? 0 : 1); i < scalar_loop->num_nodes + 1; i++)
a6c51a12 1137 rename_variables_in_bb (new_bbs[i], duplicate_outer_loop);
2cfc56b9 1138
5ce9450f
JJ
1139 if (scalar_loop != loop)
1140 {
1141 /* Update new_loop->header PHIs, so that on the preheader
1142 edge they are the ones from loop rather than scalar_loop. */
538dd0b7 1143 gphi_iterator gsi_orig, gsi_new;
5ce9450f
JJ
1144 edge orig_e = loop_preheader_edge (loop);
1145 edge new_e = loop_preheader_edge (new_loop);
1146
1147 for (gsi_orig = gsi_start_phis (loop->header),
1148 gsi_new = gsi_start_phis (new_loop->header);
1149 !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_new);
1150 gsi_next (&gsi_orig), gsi_next (&gsi_new))
1151 {
538dd0b7
DM
1152 gphi *orig_phi = gsi_orig.phi ();
1153 gphi *new_phi = gsi_new.phi ();
5ce9450f
JJ
1154 tree orig_arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, orig_e);
1155 location_t orig_locus
1156 = gimple_phi_arg_location_from_edge (orig_phi, orig_e);
1157
1158 add_phi_arg (new_phi, orig_arg, new_e, orig_locus);
1159 }
1160 }
1161
ebfd146a
IR
1162 free (new_bbs);
1163 free (bbs);
1164
b2b29377 1165 checking_verify_dominators (CDI_DOMINATORS);
2cfc56b9 1166
ebfd146a
IR
1167 return new_loop;
1168}


/* Given the condition expression COND, put it as the last statement of
   GUARD_BB; set both edges' probability; set dominator of GUARD_TO to
   DOM_BB; return the skip edge.  GUARD_TO is the target basic block to
   skip the loop.  PROBABILITY is the skip edge's probability.  Mark the
   new edge as irreducible if IRREDUCIBLE_P is true.  */

static edge
slpeel_add_loop_guard (basic_block guard_bb, tree cond,
                       basic_block guard_to, basic_block dom_bb,
                       profile_probability probability, bool irreducible_p)
{
  gimple_stmt_iterator gsi;
  edge new_e, enter_e;
  gcond *cond_stmt;
  gimple_seq gimplify_stmt_list = NULL;

  enter_e = EDGE_SUCC (guard_bb, 0);
  enter_e->flags &= ~EDGE_FALLTHRU;
  enter_e->flags |= EDGE_FALSE_VALUE;
  gsi = gsi_last_bb (guard_bb);

  cond = force_gimple_operand_1 (cond, &gimplify_stmt_list, is_gimple_condexpr,
                                 NULL_TREE);
  if (gimplify_stmt_list)
    gsi_insert_seq_after (&gsi, gimplify_stmt_list, GSI_NEW_STMT);

  cond_stmt = gimple_build_cond_from_tree (cond, NULL_TREE, NULL_TREE);
  gsi = gsi_last_bb (guard_bb);
  gsi_insert_after (&gsi, cond_stmt, GSI_NEW_STMT);

  /* Add new edge to connect guard block to the merge/loop-exit block.  */
  new_e = make_edge (guard_bb, guard_to, EDGE_TRUE_VALUE);

  new_e->probability = probability;
  if (irreducible_p)
    new_e->flags |= EDGE_IRREDUCIBLE_LOOP;

  enter_e->probability = probability.invert ();
  set_immediate_dominator (CDI_DOMINATORS, guard_to, dom_bb);

  /* Split enter_e to preserve LOOPS_HAVE_PREHEADERS.  */
  if (enter_e->dest->loop_father->header == enter_e->dest)
    split_edge (enter_e);

  return new_e;
}


/* This function verifies that the following restrictions apply to LOOP:
   (1) it consists of exactly 2 basic blocks - a header and an empty latch -
       for an innermost loop, and of exactly 5 basic blocks for an outer loop.
   (2) it is single entry, single exit
   (3) its exit condition is the last stmt in the header
   (4) E is the entry/exit edge of LOOP.
 */

bool
slpeel_can_duplicate_loop_p (const struct loop *loop, const_edge e)
{
  edge exit_e = single_exit (loop);
  edge entry_e = loop_preheader_edge (loop);
  gcond *orig_cond = get_loop_exit_condition (loop);
  gimple_stmt_iterator loop_exit_gsi = gsi_last_bb (exit_e->src);
  unsigned int num_bb = loop->inner ? 5 : 2;

  /* All loops have an outer scope; the only case in which loop->outer is
     NULL is for the function itself.  */
  if (!loop_outer (loop)
      || loop->num_nodes != num_bb
      || !empty_block_p (loop->latch)
      || !single_exit (loop)
      /* Verify that the new loop exit condition can be trivially modified.  */
      || (!orig_cond || orig_cond != gsi_stmt (loop_exit_gsi))
      || (e != exit_e && e != entry_e))
    return false;

  return true;
}

/* If the loop has a virtual PHI, but exit bb doesn't, create a virtual PHI
   in the exit bb and rename all the uses after the loop.  This simplifies
   the *guard[12] routines, which assume loop closed SSA form for all PHIs
   (but normally loop closed SSA form doesn't require virtual PHIs to be
   in the same form).  Doing this early simplifies checking which uses
   should be renamed.  */

static void
create_lcssa_for_virtual_phi (struct loop *loop)
{
  gphi_iterator gsi;
  edge exit_e = single_exit (loop);

  for (gsi = gsi_start_phis (loop->header); !gsi_end_p (gsi); gsi_next (&gsi))
    if (virtual_operand_p (gimple_phi_result (gsi_stmt (gsi))))
      {
        gphi *phi = gsi.phi ();
        for (gsi = gsi_start_phis (exit_e->dest);
             !gsi_end_p (gsi); gsi_next (&gsi))
          if (virtual_operand_p (gimple_phi_result (gsi_stmt (gsi))))
            break;
        if (gsi_end_p (gsi))
          {
            tree new_vop = copy_ssa_name (PHI_RESULT (phi));
            gphi *new_phi = create_phi_node (new_vop, exit_e->dest);
            tree vop = PHI_ARG_DEF_FROM_EDGE (phi, EDGE_SUCC (loop->latch, 0));
            imm_use_iterator imm_iter;
            gimple *stmt;
            use_operand_p use_p;

            SSA_NAME_OCCURS_IN_ABNORMAL_PHI (new_vop)
              = SSA_NAME_OCCURS_IN_ABNORMAL_PHI (vop);
            add_phi_arg (new_phi, vop, exit_e, UNKNOWN_LOCATION);
            gimple_phi_set_result (new_phi, new_vop);
            FOR_EACH_IMM_USE_STMT (stmt, imm_iter, vop)
              if (stmt != new_phi
                  && !flow_bb_inside_loop_p (loop, gimple_bb (stmt)))
                FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
                  SET_USE (use_p, new_vop);
          }
        break;
      }

}

/* Function find_loop_location.

   Extract the location of the loop in the source code.
   If the loop is not well formed for vectorization, an estimated
   location is calculated.
   Return the loop location if successful and UNKNOWN_LOCATION otherwise.  */

source_location
find_loop_location (struct loop *loop)
{
  gimple *stmt = NULL;
  basic_block bb;
  gimple_stmt_iterator si;

  if (!loop)
    return UNKNOWN_LOCATION;

  stmt = get_loop_exit_condition (loop);

  if (stmt
      && LOCATION_LOCUS (gimple_location (stmt)) > BUILTINS_LOCATION)
    return gimple_location (stmt);

  /* If we got here the loop is probably not "well formed",
     try to estimate the loop location.  */

  if (!loop->header)
    return UNKNOWN_LOCATION;

  bb = loop->header;

  for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
    {
      stmt = gsi_stmt (si);
      if (LOCATION_LOCUS (gimple_location (stmt)) > BUILTINS_LOCATION)
        return gimple_location (stmt);
    }

  return UNKNOWN_LOCATION;
}

/* Return true if PHI defines an IV of the loop to be vectorized.  */

static bool
iv_phi_p (gphi *phi)
{
  if (virtual_operand_p (PHI_RESULT (phi)))
    return false;

  stmt_vec_info stmt_info = vinfo_for_stmt (phi);
  gcc_assert (stmt_info != NULL);
  if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
      || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
    return false;

  return true;
}

/* Function vect_can_advance_ivs_p

   In case the number of iterations that LOOP iterates is unknown at compile
   time, an epilog loop will be generated, and the loop induction variables
   (IVs) will be "advanced" to the value they are supposed to take just before
   the epilog loop.  Here we check that the access function of the loop IVs
   and the expression that represents the loop bound are simple enough.
   These restrictions will be relaxed in the future.  */

bool
vect_can_advance_ivs_p (loop_vec_info loop_vinfo)
{
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  basic_block bb = loop->header;
  gphi_iterator gsi;

  /* Analyze phi functions of the loop header.  */

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "vect_can_advance_ivs_p:\n");
  for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
    {
      tree evolution_part;

      gphi *phi = gsi.phi ();
      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
          dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
        }

      /* Skip virtual phi's.  The data dependences that are associated with
         virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.

         Skip reduction phis.  */
      if (!iv_phi_p (phi))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "reduc or virtual phi. skip.\n");
          continue;
        }

      /* Analyze the evolution function.  */

      evolution_part
        = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (vinfo_for_stmt (phi));
      if (evolution_part == NULL_TREE)
        {
          if (dump_enabled_p ())
            dump_printf (MSG_MISSED_OPTIMIZATION,
                         "No access function or evolution.\n");
          return false;
        }

      /* FORNOW: We do not transform initial conditions of IVs
         whose evolution functions are not invariant in the loop.  */

      if (!expr_invariant_in_loop_p (loop, evolution_part))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "evolution not invariant in loop.\n");
          return false;
        }

      /* FORNOW: We do not transform initial conditions of IVs
         whose evolution function is a polynomial of degree >= 2.  */

      if (tree_is_chrec (evolution_part))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "evolution is chrec.\n");
          return false;
        }
    }

  return true;
}


/* Function vect_update_ivs_after_vectorizer.

   "Advance" the induction variables of LOOP to the value they should take
   after the execution of LOOP.  This is currently necessary because the
   vectorizer does not handle induction variables that are used after the
   loop.  Such a situation occurs when the last iterations of LOOP are
   peeled, because:
   1. We introduced new uses after LOOP for IVs that were not originally used
      after LOOP: the IVs of LOOP are now used by an epilog loop.
   2. LOOP is going to be vectorized; this means that it will iterate N/VF
      times, whereas the loop IVs should be bumped N times.

   Input:
   - LOOP - a loop that is going to be vectorized.  The last few iterations
     of LOOP were peeled.
   - NITERS - the number of iterations that LOOP executes (before it is
     vectorized), i.e., the number of times the ivs should be bumped.
   - UPDATE_E - a successor edge of LOOP->exit that is on the (only) path
     coming out from LOOP on which there are uses of the LOOP ivs
     (this is the path from LOOP->exit to epilog_loop->preheader).

   The new definitions of the ivs are placed in LOOP->exit.
   The phi args associated with the edge UPDATE_E in the bb
   UPDATE_E->dest are updated accordingly.

   Assumption 1: Like the rest of the vectorizer, this function assumes
   a single loop exit that has a single predecessor.

   Assumption 2: The phi nodes in the LOOP header and in update_bb are
   organized in the same order.

   Assumption 3: The access function of the ivs is simple enough (see
   vect_can_advance_ivs_p).  This assumption will be relaxed in the future.

   Assumption 4: Exactly one of the successors of LOOP exit-bb is on a path
   coming out of LOOP on which the ivs of LOOP are used (this is the path
   that leads to the epilog loop; other paths skip the epilog loop).  This
   path starts with the edge UPDATE_E, and its destination (denoted update_bb)
   needs to have its phis updated.
 */
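/* For example, an IV with initial value INIT and step S is given the value
   INIT + NITERS * S on the exit edge, which is the value the epilog loop
   expects as its starting value.  */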

static void
vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo,
                                  tree niters, edge update_e)
{
  gphi_iterator gsi, gsi1;
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  basic_block update_bb = update_e->dest;
  basic_block exit_bb = single_exit (loop)->dest;

  /* Make sure there exists a single-predecessor exit bb:  */
  gcc_assert (single_pred_p (exit_bb));
  gcc_assert (single_succ_edge (exit_bb) == update_e);

  for (gsi = gsi_start_phis (loop->header), gsi1 = gsi_start_phis (update_bb);
       !gsi_end_p (gsi) && !gsi_end_p (gsi1);
       gsi_next (&gsi), gsi_next (&gsi1))
    {
      tree init_expr;
      tree step_expr, off;
      tree type;
      tree var, ni, ni_name;
      gimple_stmt_iterator last_gsi;

      gphi *phi = gsi.phi ();
      gphi *phi1 = gsi1.phi ();
      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_NOTE, vect_location,
                           "vect_update_ivs_after_vectorizer: phi: ");
          dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
        }

      /* Skip reduction and virtual phis.  */
      if (!iv_phi_p (phi))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "reduc or virtual phi. skip.\n");
          continue;
        }

      type = TREE_TYPE (gimple_phi_result (phi));
      step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (vinfo_for_stmt (phi));
      step_expr = unshare_expr (step_expr);

      /* FORNOW: We do not support IVs whose evolution function is a polynomial
         of degree >= 2 or exponential.  */
      gcc_assert (!tree_is_chrec (step_expr));

      init_expr = PHI_ARG_DEF_FROM_EDGE (phi, loop_preheader_edge (loop));

      off = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
                         fold_convert (TREE_TYPE (step_expr), niters),
                         step_expr);
      if (POINTER_TYPE_P (type))
        ni = fold_build_pointer_plus (init_expr, off);
      else
        ni = fold_build2 (PLUS_EXPR, type,
                          init_expr, fold_convert (type, off));

      var = create_tmp_var (type, "tmp");

      last_gsi = gsi_last_bb (exit_bb);
      gimple_seq new_stmts = NULL;
      ni_name = force_gimple_operand (ni, &new_stmts, false, var);
      /* Exit_bb shouldn't be empty.  */
      if (!gsi_end_p (last_gsi))
        gsi_insert_seq_after (&last_gsi, new_stmts, GSI_SAME_STMT);
      else
        gsi_insert_seq_before (&last_gsi, new_stmts, GSI_SAME_STMT);

      /* Fix phi expressions in the successor bb.  */
      adjust_phi_and_debug_stmts (phi1, update_e, ni_name);
    }
}

/* Return a gimple value containing the misalignment (measured in vector
   elements) for the loop described by LOOP_VINFO, i.e. how many elements
   it is away from a perfectly aligned address.  Add any new statements
   to SEQ.  */
1556
1557static tree
1558get_misalign_in_elems (gimple **seq, loop_vec_info loop_vinfo)
1559{
1560 struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
1561 gimple *dr_stmt = DR_STMT (dr);
1562 stmt_vec_info stmt_info = vinfo_for_stmt (dr_stmt);
1563 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1564
1565 unsigned int target_align = DR_TARGET_ALIGNMENT (dr);
1566 gcc_assert (target_align != 0);
1567
1568 bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
1569 tree offset = (negative
1570 ? size_int (-TYPE_VECTOR_SUBPARTS (vectype) + 1)
1571 : size_zero_node);
1572 tree start_addr = vect_create_addr_base_for_vector_ref (dr_stmt, seq,
1573 offset);
1574 tree type = unsigned_type_for (TREE_TYPE (start_addr));
1575 tree target_align_minus_1 = build_int_cst (type, target_align - 1);
1576 HOST_WIDE_INT elem_size
1577 = int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1578 tree elem_size_log = build_int_cst (type, exact_log2 (elem_size));
1579
1580 /* Create: misalign_in_bytes = addr & (target_align - 1). */
1581 tree int_start_addr = fold_convert (type, start_addr);
1582 tree misalign_in_bytes = fold_build2 (BIT_AND_EXPR, type, int_start_addr,
1583 target_align_minus_1);
1584
1585 /* Create: misalign_in_elems = misalign_in_bytes / element_size. */
1586 tree misalign_in_elems = fold_build2 (RSHIFT_EXPR, type, misalign_in_bytes,
1587 elem_size_log);
1588
1589 return misalign_in_elems;
1590}
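
/* Illustrative sketch (not part of the vectorizer): the same computation as
   get_misalign_in_elems on plain integers, assuming that both the target
   alignment and the element size are powers of two.  For example, with a
   16-byte target alignment, 4-byte elements and an address ending in 0x8,
   the misalignment is 8 bytes, i.e. 2 elements.  */

static inline unsigned HOST_WIDE_INT
misalign_in_elems_sketch (unsigned HOST_WIDE_INT addr,
			  unsigned HOST_WIDE_INT target_align,
			  unsigned HOST_WIDE_INT elem_size)
{
  /* misalign_in_bytes = addr & (target_align - 1).  */
  unsigned HOST_WIDE_INT misalign_in_bytes = addr & (target_align - 1);
  /* misalign_in_elems = misalign_in_bytes >> log2 (elem_size).  */
  return misalign_in_bytes >> exact_log2 (elem_size);
}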
1591
a5e3d614 1592/* Function vect_gen_prolog_loop_niters
d9157f15 1593
a5e3d614
BC
1594 Generate the number of iterations which should be peeled as prolog for the
1595 loop represented by LOOP_VINFO. It is calculated as the misalignment of
1596 DR - the data reference recorded in LOOP_VINFO_UNALIGNED_DR (LOOP_VINFO).
1597 As a result, after the execution of this loop, the data reference DR will
1598 refer to an aligned location. The following computation is generated:
ebfd146a
IR
1599
1600 If the misalignment of DR is known at compile time:
 1601       addr_mis = DR_MISALIGNMENT (dr);
1602 Else, compute address misalignment in bytes:
535e7c11 1603 addr_mis = addr & (target_align - 1)
ebfd146a 1604
a5e3d614 1605 prolog_niters = ((VF - addr_mis/elem_size)&(VF-1))/step
ebfd146a
IR
1606
1607 (elem_size = element type size; an element is the scalar element whose type
1608 is the inner type of the vectype)
1609
a5e3d614 1610 The computations will be emitted at the end of BB. We also compute and
cbb22e61 1611    store the upper bound (included) of the result in BOUND.
a5e3d614 1612
ebfd146a
IR
1613 When the step of the data-ref in the loop is not 1 (as in interleaved data
1614 and SLP), the number of iterations of the prolog must be divided by the step
 1615    (which is equal to the size of the interleaved group).
1616
1617 The above formulas assume that VF == number of elements in the vector. This
1618 may not hold when there are multiple-types in the loop.
1619 In this case, for some data-references in the loop the VF does not represent
1620 the number of elements that fit in the vector. Therefore, instead of VF we
1621 use TYPE_VECTOR_SUBPARTS. */
1622
b8698a0f 1623static tree
a5e3d614
BC
1624vect_gen_prolog_loop_niters (loop_vec_info loop_vinfo,
1625 basic_block bb, int *bound)
ebfd146a
IR
1626{
1627 struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
ebfd146a 1628 tree var;
a5e3d614
BC
1629 tree niters_type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
1630 gimple_seq stmts = NULL, new_stmts = NULL;
ebfd146a 1631 tree iters, iters_name;
355fe088 1632 gimple *dr_stmt = DR_STMT (dr);
ebfd146a
IR
1633 stmt_vec_info stmt_info = vinfo_for_stmt (dr_stmt);
1634 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
f702e7d4 1635 unsigned int target_align = DR_TARGET_ALIGNMENT (dr);
ebfd146a 1636
15e693cc 1637 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
ebfd146a 1638 {
15e693cc 1639 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
ebfd146a 1640
73fbfcad 1641 if (dump_enabled_p ())
ccb3ad87 1642 dump_printf_loc (MSG_NOTE, vect_location,
e645e942 1643 "known peeling = %d.\n", npeel);
ebfd146a 1644
720f5239 1645 iters = build_int_cst (niters_type, npeel);
cbb22e61 1646 *bound = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
ebfd146a
IR
1647 }
1648 else
1649 {
535e7c11
RS
1650 tree misalign_in_elems = get_misalign_in_elems (&stmts, loop_vinfo);
1651 tree type = TREE_TYPE (misalign_in_elems);
f702e7d4
RS
1652 HOST_WIDE_INT elem_size
1653 = int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
f702e7d4
RS
1654 HOST_WIDE_INT align_in_elems = target_align / elem_size;
1655 tree align_in_elems_minus_1 = build_int_cst (type, align_in_elems - 1);
1656 tree align_in_elems_tree = build_int_cst (type, align_in_elems);
f702e7d4
RS
1657
1658 /* Create: (niters_type) ((align_in_elems - misalign_in_elems)
1659 & (align_in_elems - 1)). */
535e7c11 1660 bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
d8ba5b19 1661 if (negative)
f702e7d4
RS
1662 iters = fold_build2 (MINUS_EXPR, type, misalign_in_elems,
1663 align_in_elems_tree);
d8ba5b19 1664 else
f702e7d4
RS
1665 iters = fold_build2 (MINUS_EXPR, type, align_in_elems_tree,
1666 misalign_in_elems);
1667 iters = fold_build2 (BIT_AND_EXPR, type, iters, align_in_elems_minus_1);
ebfd146a 1668 iters = fold_convert (niters_type, iters);
f702e7d4 1669 *bound = align_in_elems - 1;
ebfd146a
IR
1670 }
1671
73fbfcad 1672 if (dump_enabled_p ())
ebfd146a 1673 {
ccb3ad87 1674 dump_printf_loc (MSG_NOTE, vect_location,
78c60e3d 1675 "niters for prolog loop: ");
ccb3ad87 1676 dump_generic_expr (MSG_NOTE, TDF_SLIM, iters);
e645e942 1677 dump_printf (MSG_NOTE, "\n");
ebfd146a
IR
1678 }
1679
1680 var = create_tmp_var (niters_type, "prolog_loop_niters");
a5e3d614 1681 iters_name = force_gimple_operand (iters, &new_stmts, false, var);
ebfd146a 1682
a5e3d614
BC
1683 if (new_stmts)
1684 gimple_seq_add_seq (&stmts, new_stmts);
ebfd146a
IR
1685 if (stmts)
1686 {
a5e3d614
BC
1687 gcc_assert (single_succ_p (bb));
1688 gimple_stmt_iterator gsi = gsi_last_bb (bb);
1689 if (gsi_end_p (gsi))
1690 gsi_insert_seq_before (&gsi, stmts, GSI_SAME_STMT);
1691 else
1692 gsi_insert_seq_after (&gsi, stmts, GSI_SAME_STMT);
ebfd146a 1693 }
b8698a0f 1694 return iters_name;
ebfd146a
IR
1695}
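
/* Worked example for the computation above (illustrative numbers only).
   Take a 16-byte target alignment and 4-byte elements, so align_in_elems = 4
   and *BOUND = 3.  If misalign_in_elems turns out to be 3 at runtime, the
   prolog peels (4 - 3) & 3 = 1 iteration; if the access is already aligned
   (misalign_in_elems = 0), it peels (4 - 0) & 3 = 0 iterations.  With a
   negative step the subtraction is reversed, giving (3 - 4) & 3 = 3
   iterations for the same misalignment.  */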
1696
1697
1698/* Function vect_update_init_of_dr
1699
535e7c11
RS
1700 If CODE is PLUS, the vector loop starts NITERS iterations after the
1701 scalar one, otherwise CODE is MINUS and the vector loop starts NITERS
1702 iterations before the scalar one (using masking to skip inactive
1703 elements). This function updates the information recorded in DR to
1704 account for the difference. Specifically, it updates the OFFSET
1705 field of DR. */
ebfd146a
IR
1706
1707static void
535e7c11 1708vect_update_init_of_dr (struct data_reference *dr, tree niters, tree_code code)
ebfd146a
IR
1709{
1710 tree offset = DR_OFFSET (dr);
b8698a0f 1711
ebfd146a
IR
1712 niters = fold_build2 (MULT_EXPR, sizetype,
1713 fold_convert (sizetype, niters),
1714 fold_convert (sizetype, DR_STEP (dr)));
535e7c11 1715 offset = fold_build2 (code, sizetype,
587aa063 1716 fold_convert (sizetype, offset), niters);
ebfd146a
IR
1717 DR_OFFSET (dr) = offset;
1718}
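
/* Small usage illustration (hypothetical numbers): if the prolog peels
   NITERS = 3 scalar iterations and DR_STEP is 8 bytes, then with CODE ==
   PLUS_EXPR the DR's OFFSET grows by 3 * 8 = 24 bytes; with CODE ==
   MINUS_EXPR (the masked-peeling case) it shrinks by 24 bytes instead.  */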
1719
1720
1721/* Function vect_update_inits_of_drs
1722
535e7c11
RS
 1723    Apply vect_update_init_of_dr to all accesses in LOOP_VINFO.
 1724    CODE and NITERS are as for vect_update_init_of_dr.  */
ebfd146a
IR
1725
1726static void
535e7c11
RS
1727vect_update_inits_of_drs (loop_vec_info loop_vinfo, tree niters,
1728 tree_code code)
ebfd146a
IR
1729{
1730 unsigned int i;
9771b263 1731 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
ebfd146a 1732 struct data_reference *dr;
a5e3d614
BC
1733
1734 if (dump_enabled_p ())
ccb3ad87 1735 dump_printf_loc (MSG_NOTE, vect_location,
a5e3d614
BC
1736 "=== vect_update_inits_of_dr ===\n");
1737
1738 /* Adjust niters to sizetype and insert stmts on loop preheader edge. */
1739 if (!types_compatible_p (sizetype, TREE_TYPE (niters)))
1740 {
1741 gimple_seq seq;
1742 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1743 tree var = create_tmp_var (sizetype, "prolog_loop_adjusted_niters");
1744
1745 niters = fold_convert (sizetype, niters);
1746 niters = force_gimple_operand (niters, &seq, false, var);
1747 if (seq)
1748 {
1749 basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
1750 gcc_assert (!new_bb);
1751 }
1752 }
ebfd146a 1753
9771b263 1754 FOR_EACH_VEC_ELT (datarefs, i, dr)
535e7c11 1755 vect_update_init_of_dr (dr, niters, code);
ebfd146a
IR
1756}
1757
535e7c11
RS
1758/* For the information recorded in LOOP_VINFO prepare the loop for peeling
1759 by masking. This involves calculating the number of iterations to
1760 be peeled and then aligning all memory references appropriately. */
1761
1762void
1763vect_prepare_for_masked_peels (loop_vec_info loop_vinfo)
1764{
1765 tree misalign_in_elems;
1766 tree type = LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo);
1767
1768 gcc_assert (vect_use_loop_mask_for_alignment_p (loop_vinfo));
1769
1770 /* From the information recorded in LOOP_VINFO get the number of iterations
1771 that need to be skipped via masking. */
1772 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
1773 {
1774 poly_int64 misalign = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
1775 - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo));
1776 misalign_in_elems = build_int_cst (type, misalign);
1777 }
1778 else
1779 {
1780 gimple_seq seq1 = NULL, seq2 = NULL;
1781 misalign_in_elems = get_misalign_in_elems (&seq1, loop_vinfo);
1782 misalign_in_elems = fold_convert (type, misalign_in_elems);
1783 misalign_in_elems = force_gimple_operand (misalign_in_elems,
1784 &seq2, true, NULL_TREE);
1785 gimple_seq_add_seq (&seq1, seq2);
1786 if (seq1)
1787 {
1788 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1789 basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq1);
1790 gcc_assert (!new_bb);
1791 }
1792 }
1793
1794 if (dump_enabled_p ())
1795 {
1796 dump_printf_loc (MSG_NOTE, vect_location,
1797 "misalignment for fully-masked loop: ");
1798 dump_generic_expr (MSG_NOTE, TDF_SLIM, misalign_in_elems);
1799 dump_printf (MSG_NOTE, "\n");
1800 }
1801
1802 LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo) = misalign_in_elems;
1803
1804 vect_update_inits_of_drs (loop_vinfo, misalign_in_elems, MINUS_EXPR);
1805}
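
/* Worked example (illustrative): with VF = 8 and
   LOOP_VINFO_PEELING_FOR_ALIGNMENT = 3, the fully-masked loop must treat the
   first 8 - 3 = 5 elements of the first vector iteration as inactive, so
   LOOP_VINFO_MASK_SKIP_NITERS is set to 5 and every DR's offset is moved
   back by 5 iterations via vect_update_inits_of_drs (..., MINUS_EXPR).  */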
ebfd146a 1806
a5e3d614 1807/* This function builds ni_name = number of iterations. Statements
7078979b
BC
1808 are emitted on the loop preheader edge. If NEW_VAR_P is not NULL, set
 1809    it to TRUE if a new ssa_var is generated.  */
a5e3d614
BC
1810
1811tree
7078979b 1812vect_build_loop_niters (loop_vec_info loop_vinfo, bool *new_var_p)
a5e3d614
BC
1813{
1814 tree ni = unshare_expr (LOOP_VINFO_NITERS (loop_vinfo));
1815 if (TREE_CODE (ni) == INTEGER_CST)
1816 return ni;
1817 else
1818 {
1819 tree ni_name, var;
1820 gimple_seq stmts = NULL;
1821 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1822
1823 var = create_tmp_var (TREE_TYPE (ni), "niters");
1824 ni_name = force_gimple_operand (ni, &stmts, false, var);
1825 if (stmts)
7078979b
BC
1826 {
1827 gsi_insert_seq_on_edge_immediate (pe, stmts);
1828 if (new_var_p != NULL)
1829 *new_var_p = true;
1830 }
a5e3d614
BC
1831
1832 return ni_name;
1833 }
1834}
1835
cbb22e61
BC
 1836/* Calculate the number of iterations above which the vectorized loop will be
 1837   preferred over the scalar loop.  NITERS_PROLOG is the number of iterations
1838 of prolog loop. If it's integer const, the integer number is also passed
1839 in INT_NITERS_PROLOG. BOUND_PROLOG is the upper bound (included) of
1840 number of iterations of prolog loop. VFM1 is vector factor minus one.
1841 If CHECK_PROFITABILITY is true, TH is the threshold below which scalar
1842 (rather than vectorized) loop will be executed. This function stores
1843 upper bound (included) of the result in BOUND_SCALAR. */
a5e3d614
BC
1844
1845static tree
1846vect_gen_scalar_loop_niters (tree niters_prolog, int int_niters_prolog,
d9f21f6a
RS
1847 int bound_prolog, poly_int64 vfm1, int th,
1848 poly_uint64 *bound_scalar,
1849 bool check_profitability)
a5e3d614
BC
1850{
1851 tree type = TREE_TYPE (niters_prolog);
1852 tree niters = fold_build2 (PLUS_EXPR, type, niters_prolog,
cbb22e61 1853 build_int_cst (type, vfm1));
a5e3d614 1854
cbb22e61 1855 *bound_scalar = vfm1 + bound_prolog;
a5e3d614
BC
1856 if (check_profitability)
1857 {
cbb22e61
BC
1858 /* TH indicates the minimum niters of vectorized loop, while we
1859 compute the maximum niters of scalar loop. */
1860 th--;
a5e3d614
BC
1861 /* Peeling for constant times. */
1862 if (int_niters_prolog >= 0)
1863 {
d9f21f6a 1864 *bound_scalar = upper_bound (int_niters_prolog + vfm1, th);
cbb22e61 1865 return build_int_cst (type, *bound_scalar);
a5e3d614 1866 }
cbb22e61
BC
1867 /* Peeling for unknown times. Note BOUND_PROLOG is the upper
 1868	 bound (included) of niters of the prolog loop.  */
d9f21f6a 1869 if (known_ge (th, vfm1 + bound_prolog))
a5e3d614 1870 {
cbb22e61 1871 *bound_scalar = th;
a5e3d614
BC
1872 return build_int_cst (type, th);
1873 }
d9f21f6a
RS
1874 /* Need to do runtime comparison. */
1875 else if (maybe_gt (th, vfm1))
1876 {
1877 *bound_scalar = upper_bound (*bound_scalar, th);
1878 return fold_build2 (MAX_EXPR, type,
1879 build_int_cst (type, th), niters);
1880 }
a5e3d614
BC
1881 }
1882 return niters;
1883}
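
/* Worked example (illustrative): with VF = 4 (VFM1 = 3), BOUND_PROLOG = 3,
   CHECK_PROFITABILITY true and TH = 5 (so the internal threshold becomes 4):
   when the prolog count is unknown, the guard value becomes
   MAX (4, niters_prolog + 3) and *BOUND_SCALAR = 3 + 3 = 6; when the prolog
   count is a known constant, say 2, the result is simply
   MAX (2 + 3, 4) = 5, which is also stored in *BOUND_SCALAR.  */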
1884
0f26839a
RS
1885/* NITERS is the number of times that the original scalar loop executes
1886 after peeling. Work out the maximum number of iterations N that can
1887 be handled by the vectorized form of the loop and then either:
ebfd146a 1888
0f26839a 1889 a) set *STEP_VECTOR_PTR to the vectorization factor and generate:
d9157f15 1890
0f26839a
RS
1891 niters_vector = N
1892
1893 b) set *STEP_VECTOR_PTR to one and generate:
1894
1895 niters_vector = N / vf
1896
1897 In both cases, store niters_vector in *NITERS_VECTOR_PTR and add
1898 any new statements on the loop preheader edge. NITERS_NO_OVERFLOW
1899 is true if NITERS doesn't overflow (i.e. if NITERS is always nonzero). */
ebfd146a
IR
1900
1901void
a5e3d614 1902vect_gen_vector_loop_niters (loop_vec_info loop_vinfo, tree niters,
0f26839a
RS
1903 tree *niters_vector_ptr, tree *step_vector_ptr,
1904 bool niters_no_overflow)
ebfd146a 1905{
a5e3d614 1906 tree ni_minus_gap, var;
0f26839a 1907 tree niters_vector, step_vector, type = TREE_TYPE (niters);
d9f21f6a 1908 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
a5e3d614 1909 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
0f26839a 1910 tree log_vf = NULL_TREE;
a5e3d614
BC
1911
1912 /* If epilogue loop is required because of data accesses with gaps, we
1913 subtract one iteration from the total number of iterations here for
 1914     correct calculation of the number of vector iterations.  */
1915 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
1916 {
7078979b
BC
1917 ni_minus_gap = fold_build2 (MINUS_EXPR, type, niters,
1918 build_one_cst (type));
a5e3d614
BC
1919 if (!is_gimple_val (ni_minus_gap))
1920 {
7078979b 1921 var = create_tmp_var (type, "ni_gap");
a5e3d614
BC
1922 gimple *stmts = NULL;
1923 ni_minus_gap = force_gimple_operand (ni_minus_gap, &stmts,
1924 true, var);
1925 gsi_insert_seq_on_edge_immediate (pe, stmts);
1926 }
1927 }
1928 else
1929 ni_minus_gap = niters;
1930
d9f21f6a 1931 unsigned HOST_WIDE_INT const_vf;
7cfb4d93
RS
1932 if (vf.is_constant (&const_vf)
1933 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
0f26839a
RS
1934 {
1935 /* Create: niters >> log2(vf) */
1936 /* If it's known that niters == number of latch executions + 1 doesn't
1937 overflow, we can generate niters >> log2(vf); otherwise we generate
1938 (niters - vf) >> log2(vf) + 1 by using the fact that we know ratio
1939 will be at least one. */
d9f21f6a 1940 log_vf = build_int_cst (type, exact_log2 (const_vf));
0f26839a
RS
1941 if (niters_no_overflow)
1942 niters_vector = fold_build2 (RSHIFT_EXPR, type, ni_minus_gap, log_vf);
1943 else
1944 niters_vector
1945 = fold_build2 (PLUS_EXPR, type,
1946 fold_build2 (RSHIFT_EXPR, type,
1947 fold_build2 (MINUS_EXPR, type,
1948 ni_minus_gap,
1949 build_int_cst (type, vf)),
1950 log_vf),
1951 build_int_cst (type, 1));
1952 step_vector = build_one_cst (type);
1953 }
a5e3d614 1954 else
0f26839a
RS
1955 {
1956 niters_vector = ni_minus_gap;
1957 step_vector = build_int_cst (type, vf);
1958 }
a5e3d614
BC
1959
1960 if (!is_gimple_val (niters_vector))
1961 {
7078979b
BC
1962 var = create_tmp_var (type, "bnd");
1963 gimple_seq stmts = NULL;
a5e3d614
BC
1964 niters_vector = force_gimple_operand (niters_vector, &stmts, true, var);
1965 gsi_insert_seq_on_edge_immediate (pe, stmts);
7078979b
BC
1966 /* Peeling algorithm guarantees that vector loop bound is at least ONE,
 1967	 so we set range information to make the niters analyzer's life easier.  */
0f26839a 1968 if (stmts != NULL && log_vf)
8e6cdc90
RS
1969 set_range_info (niters_vector, VR_RANGE,
1970 wi::to_wide (build_int_cst (type, 1)),
1971 wi::to_wide (fold_build2 (RSHIFT_EXPR, type,
1972 TYPE_MAX_VALUE (type),
1973 log_vf)));
a5e3d614
BC
1974 }
1975 *niters_vector_ptr = niters_vector;
0f26839a 1976 *step_vector_ptr = step_vector;
a5e3d614
BC
1977
1978 return;
1979}
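
/* Worked example (illustrative): NITERS = 17, VF = 4, no peeling for gaps,
   not fully masked.  If NITERS_NO_OVERFLOW, niters_vector = 17 >> 2 = 4 and
   *STEP_VECTOR_PTR = 1; otherwise ((17 - 4) >> 2) + 1 = 4 as well.  Either
   way the vector loop covers 16 scalar iterations and one is left for the
   epilogue.  With a variable VF or a fully-masked loop, the function instead
   sets niters_vector = 17 and *STEP_VECTOR_PTR = VF.  */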
1980
1981/* Given NITERS_VECTOR which is the number of iterations for vectorized
1982 loop specified by LOOP_VINFO after vectorization, compute the number
1983 of iterations before vectorization (niters_vector * vf) and store it
1984 to NITERS_VECTOR_MULT_VF_PTR. */
1985
1986static void
1987vect_gen_vector_loop_niters_mult_vf (loop_vec_info loop_vinfo,
1988 tree niters_vector,
1989 tree *niters_vector_mult_vf_ptr)
1990{
d9f21f6a
RS
1991 /* We should be using a step_vector of VF if VF is variable. */
1992 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo).to_constant ();
ebfd146a 1993 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
a5e3d614
BC
1994 tree type = TREE_TYPE (niters_vector);
1995 tree log_vf = build_int_cst (type, exact_log2 (vf));
1996 basic_block exit_bb = single_exit (loop)->dest;
ebfd146a 1997
a5e3d614
BC
1998 gcc_assert (niters_vector_mult_vf_ptr != NULL);
1999 tree niters_vector_mult_vf = fold_build2 (LSHIFT_EXPR, type,
2000 niters_vector, log_vf);
2001 if (!is_gimple_val (niters_vector_mult_vf))
2002 {
2003 tree var = create_tmp_var (type, "niters_vector_mult_vf");
2004 gimple_seq stmts = NULL;
2005 niters_vector_mult_vf = force_gimple_operand (niters_vector_mult_vf,
2006 &stmts, true, var);
2007 gimple_stmt_iterator gsi = gsi_start_bb (exit_bb);
2008 gsi_insert_seq_before (&gsi, stmts, GSI_SAME_STMT);
2009 }
2010 *niters_vector_mult_vf_ptr = niters_vector_mult_vf;
2011}
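
/* Continuing the example above (illustrative): with niters_vector = 4 and
   VF = 4, the value stored through NITERS_VECTOR_MULT_VF_PTR is
   4 << 2 = 16, i.e. the number of scalar iterations handled by the
   vector loop.  */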
2012
 2013/* Function slpeel_tree_duplicate_loop_to_edge_cfg duplicates FIRST/SECOND
2014 from SECOND/FIRST and puts it at the original loop's preheader/exit
2015 edge, the two loops are arranged as below:
2016
2017 preheader_a:
2018 first_loop:
2019 header_a:
2020 i_1 = PHI<i_0, i_2>;
2021 ...
2022 i_2 = i_1 + 1;
2023 if (cond_a)
2024 goto latch_a;
2025 else
2026 goto between_bb;
2027 latch_a:
2028 goto header_a;
2029
2030 between_bb:
2031 ;; i_x = PHI<i_2>; ;; LCSSA phi node to be created for FIRST,
2032
2033 second_loop:
2034 header_b:
2035 i_3 = PHI<i_0, i_4>; ;; Use of i_0 to be replaced with i_x,
2036 or with i_2 if no LCSSA phi is created
2037 under condition of CREATE_LCSSA_FOR_IV_PHIS.
2038 ...
2039 i_4 = i_3 + 1;
2040 if (cond_b)
2041 goto latch_b;
2042 else
2043 goto exit_bb;
2044 latch_b:
2045 goto header_b;
2046
2047 exit_bb:
2048
 2049   This function creates loop-closed SSA for the first loop; it updates the
 2050   second loop's PHI nodes by replacing the argument on the incoming edge with
 2051   the result of the newly created lcssa PHI nodes.  If CREATE_LCSSA_FOR_IV_PHIS
 2052   is false, loop-closed SSA phis will only be created for non-iv phis for
 2053   the first loop.
2054
2055 This function assumes exit bb of the first loop is preheader bb of the
2056 second loop, i.e, between_bb in the example code. With PHIs updated,
2057 the second loop will execute rest iterations of the first. */
ebfd146a 2058
a5e3d614
BC
2059static void
2060slpeel_update_phi_nodes_for_loops (loop_vec_info loop_vinfo,
2061 struct loop *first, struct loop *second,
2062 bool create_lcssa_for_iv_phis)
2063{
2064 gphi_iterator gsi_update, gsi_orig;
2065 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2066
2067 edge first_latch_e = EDGE_SUCC (first->latch, 0);
2068 edge second_preheader_e = loop_preheader_edge (second);
2069 basic_block between_bb = single_exit (first)->dest;
2070
2071 gcc_assert (between_bb == second_preheader_e->src);
2072 gcc_assert (single_pred_p (between_bb) && single_succ_p (between_bb));
2073 /* Either the first loop or the second is the loop to be vectorized. */
2074 gcc_assert (loop == first || loop == second);
2075
2076 for (gsi_orig = gsi_start_phis (first->header),
2077 gsi_update = gsi_start_phis (second->header);
2078 !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_update);
2079 gsi_next (&gsi_orig), gsi_next (&gsi_update))
2080 {
2081 gphi *orig_phi = gsi_orig.phi ();
2082 gphi *update_phi = gsi_update.phi ();
2083
2084 tree arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, first_latch_e);
2085 /* Generate lcssa PHI node for the first loop. */
2086 gphi *vect_phi = (loop == first) ? orig_phi : update_phi;
2087 if (create_lcssa_for_iv_phis || !iv_phi_p (vect_phi))
2088 {
2089 tree new_res = copy_ssa_name (PHI_RESULT (orig_phi));
2090 gphi *lcssa_phi = create_phi_node (new_res, between_bb);
2091 add_phi_arg (lcssa_phi, arg, single_exit (first), UNKNOWN_LOCATION);
2092 arg = new_res;
2093 }
2094
2095 /* Update PHI node in the second loop by replacing arg on the loop's
2096 incoming edge. */
2097 adjust_phi_and_debug_stmts (update_phi, second_preheader_e, arg);
2098 }
2099}
2100
2101/* Function slpeel_add_loop_guard adds guard skipping from the beginning
2102 of SKIP_LOOP to the beginning of UPDATE_LOOP. GUARD_EDGE and MERGE_EDGE
2103 are two pred edges of the merge point before UPDATE_LOOP. The two loops
2104 appear like below:
2105
2106 guard_bb:
2107 if (cond)
2108 goto merge_bb;
2109 else
2110 goto skip_loop;
2111
2112 skip_loop:
2113 header_a:
2114 i_1 = PHI<i_0, i_2>;
2115 ...
2116 i_2 = i_1 + 1;
2117 if (cond_a)
2118 goto latch_a;
2119 else
2120 goto exit_a;
2121 latch_a:
2122 goto header_a;
2123
2124 exit_a:
2125 i_5 = PHI<i_2>;
2126
2127 merge_bb:
2128 ;; PHI (i_x = PHI<i_0, i_5>) to be created at merge point.
2129
2130 update_loop:
2131 header_b:
2132 i_3 = PHI<i_5, i_4>; ;; Use of i_5 to be replaced with i_x.
2133 ...
2134 i_4 = i_3 + 1;
2135 if (cond_b)
2136 goto latch_b;
2137 else
2138 goto exit_bb;
2139 latch_b:
2140 goto header_b;
2141
2142 exit_bb:
2143
2144 This function creates PHI nodes at merge_bb and replaces the use of i_5
 2145   in the update_loop's PHI node with the new PHI's result.  */
2146
2147static void
2148slpeel_update_phi_nodes_for_guard1 (struct loop *skip_loop,
2149 struct loop *update_loop,
2150 edge guard_edge, edge merge_edge)
2151{
2152 source_location merge_loc, guard_loc;
2153 edge orig_e = loop_preheader_edge (skip_loop);
2154 edge update_e = loop_preheader_edge (update_loop);
2155 gphi_iterator gsi_orig, gsi_update;
2156
2157 for ((gsi_orig = gsi_start_phis (skip_loop->header),
2158 gsi_update = gsi_start_phis (update_loop->header));
2159 !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_update);
2160 gsi_next (&gsi_orig), gsi_next (&gsi_update))
2161 {
2162 gphi *orig_phi = gsi_orig.phi ();
2163 gphi *update_phi = gsi_update.phi ();
2164
2165 /* Generate new phi node at merge bb of the guard. */
2166 tree new_res = copy_ssa_name (PHI_RESULT (orig_phi));
2167 gphi *new_phi = create_phi_node (new_res, guard_edge->dest);
2168
2169 /* Merge bb has two incoming edges: GUARD_EDGE and MERGE_EDGE. Set the
2170 args in NEW_PHI for these edges. */
2171 tree merge_arg = PHI_ARG_DEF_FROM_EDGE (update_phi, update_e);
2172 tree guard_arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, orig_e);
2173 merge_loc = gimple_phi_arg_location_from_edge (update_phi, update_e);
2174 guard_loc = gimple_phi_arg_location_from_edge (orig_phi, orig_e);
2175 add_phi_arg (new_phi, merge_arg, merge_edge, merge_loc);
2176 add_phi_arg (new_phi, guard_arg, guard_edge, guard_loc);
2177
2178 /* Update phi in UPDATE_PHI. */
2179 adjust_phi_and_debug_stmts (update_phi, update_e, new_res);
2180 }
2181}
2182
2183/* LCSSA_PHI is a lcssa phi of EPILOG loop which is copied from LOOP,
2184 this function searches for the corresponding lcssa phi node in exit
2185 bb of LOOP. If it is found, return the phi result; otherwise return
2186 NULL. */
2187
2188static tree
2189find_guard_arg (struct loop *loop, struct loop *epilog ATTRIBUTE_UNUSED,
2190 gphi *lcssa_phi)
2191{
2192 gphi_iterator gsi;
2193 edge e = single_exit (loop);
2194
2195 gcc_assert (single_pred_p (e->dest));
2196 for (gsi = gsi_start_phis (e->dest); !gsi_end_p (gsi); gsi_next (&gsi))
2197 {
2198 gphi *phi = gsi.phi ();
2199 if (operand_equal_p (PHI_ARG_DEF (phi, 0),
2200 PHI_ARG_DEF (lcssa_phi, 0), 0))
2201 return PHI_RESULT (phi);
2202 }
2203 return NULL_TREE;
2204}
2205
2206/* LOOP and EPILOG are two consecutive loops in CFG and EPILOG is copied
2207 from LOOP. Function slpeel_add_loop_guard adds guard skipping from a
2208 point between the two loops to the end of EPILOG. Edges GUARD_EDGE
2209 and MERGE_EDGE are the two pred edges of merge_bb at the end of EPILOG.
2210 The CFG looks like:
2211
2212 loop:
2213 header_a:
2214 i_1 = PHI<i_0, i_2>;
2215 ...
2216 i_2 = i_1 + 1;
2217 if (cond_a)
2218 goto latch_a;
2219 else
2220 goto exit_a;
2221 latch_a:
2222 goto header_a;
2223
2224 exit_a:
2225
2226 guard_bb:
2227 if (cond)
2228 goto merge_bb;
2229 else
2230 goto epilog_loop;
2231
2232 ;; fall_through_bb
2233
2234 epilog_loop:
2235 header_b:
2236 i_3 = PHI<i_2, i_4>;
2237 ...
2238 i_4 = i_3 + 1;
2239 if (cond_b)
2240 goto latch_b;
2241 else
2242 goto merge_bb;
2243 latch_b:
2244 goto header_b;
2245
2246 merge_bb:
2247 ; PHI node (i_y = PHI<i_2, i_4>) to be created at merge point.
2248
2249 exit_bb:
2250 i_x = PHI<i_4>; ;Use of i_4 to be replaced with i_y in merge_bb.
2251
 2252   For each name used outside EPILOG (i.e. for each name that has an lcssa
2253 phi in exit_bb) we create a new PHI in merge_bb. The new PHI has two
2254 args corresponding to GUARD_EDGE and MERGE_EDGE. Arg for MERGE_EDGE is
2255 the arg of the original PHI in exit_bb, arg for GUARD_EDGE is defined
2256 by LOOP and is found in the exit bb of LOOP. Arg of the original PHI
2257 in exit_bb will also be updated. */
2258
2259static void
2260slpeel_update_phi_nodes_for_guard2 (struct loop *loop, struct loop *epilog,
2261 edge guard_edge, edge merge_edge)
2262{
2263 gphi_iterator gsi;
2264 basic_block merge_bb = guard_edge->dest;
2265
2266 gcc_assert (single_succ_p (merge_bb));
2267 edge e = single_succ_edge (merge_bb);
2268 basic_block exit_bb = e->dest;
2269 gcc_assert (single_pred_p (exit_bb));
2270 gcc_assert (single_pred (exit_bb) == single_exit (epilog)->dest);
2271
2272 for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi); gsi_next (&gsi))
2273 {
2274 gphi *update_phi = gsi.phi ();
2275 tree old_arg = PHI_ARG_DEF (update_phi, 0);
2276 /* This loop-closed-phi actually doesn't represent a use out of the
2277 loop - the phi arg is a constant. */
2278 if (TREE_CODE (old_arg) != SSA_NAME)
2279 continue;
2280
2281 tree merge_arg = get_current_def (old_arg);
2282 if (!merge_arg)
2283 merge_arg = old_arg;
2284
2285 tree guard_arg = find_guard_arg (loop, epilog, update_phi);
2286 /* If the var is live after loop but not a reduction, we simply
2287 use the old arg. */
2288 if (!guard_arg)
2289 guard_arg = old_arg;
2290
2291 /* Create new phi node in MERGE_BB: */
2292 tree new_res = copy_ssa_name (PHI_RESULT (update_phi));
2293 gphi *merge_phi = create_phi_node (new_res, merge_bb);
2294
2295 /* MERGE_BB has two incoming edges: GUARD_EDGE and MERGE_EDGE, Set
2296 the two PHI args in merge_phi for these edges. */
2297 add_phi_arg (merge_phi, merge_arg, merge_edge, UNKNOWN_LOCATION);
2298 add_phi_arg (merge_phi, guard_arg, guard_edge, UNKNOWN_LOCATION);
2299
2300 /* Update the original phi in exit_bb. */
2301 adjust_phi_and_debug_stmts (update_phi, e, new_res);
2302 }
2303}
2304
 2305/* EPILOG loop is duplicated from the original loop for vectorizing;
 2306   the arg of its loop-closed SSA PHI needs to be updated.  */
2307
2308static void
2309slpeel_update_phi_nodes_for_lcssa (struct loop *epilog)
2310{
2311 gphi_iterator gsi;
2312 basic_block exit_bb = single_exit (epilog)->dest;
2313
2314 gcc_assert (single_pred_p (exit_bb));
2315 edge e = EDGE_PRED (exit_bb, 0);
2316 for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi); gsi_next (&gsi))
2317 rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (gsi.phi (), e));
2318}
2319
2320/* Function vect_do_peeling.
2321
2322 Input:
2323 - LOOP_VINFO: Represent a loop to be vectorized, which looks like:
2324
2325 preheader:
2326 LOOP:
2327 header_bb:
2328 loop_body
2329 if (exit_loop_cond) goto exit_bb
2330 else goto header_bb
2331 exit_bb:
2332
2333 - NITERS: The number of iterations of the loop.
2334 - NITERSM1: The number of iterations of the loop's latch.
2335 - NITERS_NO_OVERFLOW: No overflow in computing NITERS.
2336 - TH, CHECK_PROFITABILITY: Threshold of niters to vectorize loop if
2337 CHECK_PROFITABILITY is true.
2338 Output:
0f26839a 2339 - *NITERS_VECTOR and *STEP_VECTOR describe how the main loop should
7cfb4d93 2340 iterate after vectorization; see vect_set_loop_condition for details.
0f26839a
RS
2341 - *NITERS_VECTOR_MULT_VF_VAR is either null or an SSA name that
2342 should be set to the number of scalar iterations handled by the
2343 vector loop. The SSA name is only used on exit from the loop.
a5e3d614
BC
2344
2345 This function peels prolog and epilog from the loop, adds guards skipping
2346 PROLOG and EPILOG for various conditions. As a result, the changed CFG
2347 would look like:
2348
2349 guard_bb_1:
2350 if (prefer_scalar_loop) goto merge_bb_1
2351 else goto guard_bb_2
2352
2353 guard_bb_2:
2354 if (skip_prolog) goto merge_bb_2
2355 else goto prolog_preheader
2356
2357 prolog_preheader:
2358 PROLOG:
2359 prolog_header_bb:
2360 prolog_body
2361 if (exit_prolog_cond) goto prolog_exit_bb
2362 else goto prolog_header_bb
2363 prolog_exit_bb:
2364
2365 merge_bb_2:
2366
2367 vector_preheader:
2368 VECTOR LOOP:
2369 vector_header_bb:
2370 vector_body
2371 if (exit_vector_cond) goto vector_exit_bb
2372 else goto vector_header_bb
2373 vector_exit_bb:
2374
2375 guard_bb_3:
2376 if (skip_epilog) goto merge_bb_3
2377 else goto epilog_preheader
2378
2379 merge_bb_1:
2380
2381 epilog_preheader:
2382 EPILOG:
2383 epilog_header_bb:
2384 epilog_body
2385 if (exit_epilog_cond) goto merge_bb_3
2386 else goto epilog_header_bb
2387
2388 merge_bb_3:
2389
 2390   Note this function peels the prolog and epilog, and adds the guards,
 2391   only when necessary.
598eaaa2 2392 Returns created epilogue or NULL.
a5e3d614
BC
2393
2394 TODO: Guard for prefer_scalar_loop should be emitted along with
2395 versioning conditions if loop versioning is needed. */
2396
598eaaa2
YR
2397
2398struct loop *
a5e3d614 2399vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
0f26839a
RS
2400 tree *niters_vector, tree *step_vector,
2401 tree *niters_vector_mult_vf_var, int th,
2402 bool check_profitability, bool niters_no_overflow)
a5e3d614
BC
2403{
2404 edge e, guard_e;
2405 tree type = TREE_TYPE (niters), guard_cond;
2406 basic_block guard_bb, guard_to;
357067f2 2407 profile_probability prob_prolog, prob_vector, prob_epilog;
d9f21f6a
RS
2408 int bound_prolog = 0;
2409 poly_uint64 bound_scalar = 0;
2410 int estimated_vf;
535e7c11
RS
2411 int prolog_peeling = 0;
2412 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2413 prolog_peeling = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
a5e3d614
BC
2414 bool epilog_peeling = (LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
2415 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo));
2416
2417 if (!prolog_peeling && !epilog_peeling)
598eaaa2 2418 return NULL;
a5e3d614 2419
357067f2 2420 prob_vector = profile_probability::guessed_always ().apply_scale (9, 10);
d9f21f6a
RS
2421 estimated_vf = vect_vf_for_cost (loop_vinfo);
2422 if (estimated_vf == 2)
2423 estimated_vf = 3;
357067f2 2424 prob_prolog = prob_epilog = profile_probability::guessed_always ()
d9f21f6a
RS
2425 .apply_scale (estimated_vf - 1, estimated_vf);
2426 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
a5e3d614 2427
598eaaa2 2428 struct loop *prolog, *epilog = NULL, *loop = LOOP_VINFO_LOOP (loop_vinfo);
a5e3d614 2429 struct loop *first_loop = loop;
3e907b90 2430 bool irred_flag = loop_preheader_edge (loop)->flags & EDGE_IRREDUCIBLE_LOOP;
a5e3d614
BC
2431 create_lcssa_for_virtual_phi (loop);
2432 update_ssa (TODO_update_ssa_only_virtuals);
2433
36f52e8f 2434 if (MAY_HAVE_DEBUG_BIND_STMTS)
a5e3d614
BC
2435 {
2436 gcc_assert (!adjust_vec.exists ());
2437 adjust_vec.create (32);
2438 }
ebfd146a
IR
2439 initialize_original_copy_tables ();
2440
a5e3d614
BC
2441 /* Prolog loop may be skipped. */
2442 bool skip_prolog = (prolog_peeling != 0);
704c28ee
BC
2443 /* Skip to epilog if scalar loop may be preferred. It's only needed
2444 when we peel for epilog loop and when it hasn't been checked with
2445 loop versioning. */
d9f21f6a
RS
2446 bool skip_vector = ((!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2447 && !LOOP_REQUIRES_VERSIONING (loop_vinfo))
2448 || !vf.is_constant ());
a5e3d614
BC
2449 /* Epilog loop must be executed if the number of iterations for epilog
2450 loop is known at compile time, otherwise we need to add a check at
2451 the end of vector loop and skip to the end of epilog loop. */
2452 bool skip_epilog = (prolog_peeling < 0
d9f21f6a
RS
2453 || !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2454 || !vf.is_constant ());
a5e3d614
BC
2455 /* PEELING_FOR_GAPS is special because epilog loop must be executed. */
2456 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2457 skip_epilog = false;
2458
2459 /* Record the anchor bb at which guard should be placed if scalar loop
2460 may be preferred. */
2461 basic_block anchor = loop_preheader_edge (loop)->src;
2462 if (skip_vector)
25c99850
BC
2463 {
2464 split_edge (loop_preheader_edge (loop));
2465
2466 /* Due to the order in which we peel prolog and epilog, we first
2467 propagate probability to the whole loop. The purpose is to
2468 avoid adjusting probabilities of both prolog and vector loops
2469 separately. Note in this case, the probability of epilog loop
2470 needs to be scaled back later. */
2471 basic_block bb_before_loop = loop_preheader_edge (loop)->src;
357067f2 2472 if (prob_vector.initialized_p ())
d9f21f6a
RS
2473 {
2474 scale_bbs_frequencies (&bb_before_loop, 1, prob_vector);
2475 scale_loop_profile (loop, prob_vector, 0);
2476 }
25c99850 2477 }
a5e3d614
BC
2478
2479 tree niters_prolog = build_int_cst (type, 0);
2480 source_location loop_loc = find_loop_location (loop);
2481 struct loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
2482 if (prolog_peeling)
5d2eb24b 2483 {
a5e3d614
BC
2484 e = loop_preheader_edge (loop);
2485 if (!slpeel_can_duplicate_loop_p (loop, e))
5d2eb24b 2486 {
a5e3d614
BC
2487 dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2488 "loop can't be duplicated to preheader edge.\n");
2489 gcc_unreachable ();
2490 }
2491 /* Peel prolog and put it on preheader edge of loop. */
2492 prolog = slpeel_tree_duplicate_loop_to_edge_cfg (loop, scalar_loop, e);
2493 if (!prolog)
2494 {
2495 dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2496 "slpeel_tree_duplicate_loop_to_edge_cfg failed.\n");
2497 gcc_unreachable ();
2498 }
2499 slpeel_update_phi_nodes_for_loops (loop_vinfo, prolog, loop, true);
2500 first_loop = prolog;
2501 reset_original_copy_tables ();
2502
2503 /* Generate and update the number of iterations for prolog loop. */
2504 niters_prolog = vect_gen_prolog_loop_niters (loop_vinfo, anchor,
2505 &bound_prolog);
0f26839a 2506 tree step_prolog = build_one_cst (TREE_TYPE (niters_prolog));
7cfb4d93
RS
2507 vect_set_loop_condition (prolog, NULL, niters_prolog,
2508 step_prolog, NULL_TREE, false);
a5e3d614
BC
2509
2510 /* Skip the prolog loop. */
2511 if (skip_prolog)
2512 {
2513 guard_cond = fold_build2 (EQ_EXPR, boolean_type_node,
2514 niters_prolog, build_int_cst (type, 0));
2515 guard_bb = loop_preheader_edge (prolog)->src;
25c99850 2516 basic_block bb_after_prolog = loop_preheader_edge (loop)->src;
a5e3d614
BC
2517 guard_to = split_edge (loop_preheader_edge (loop));
2518 guard_e = slpeel_add_loop_guard (guard_bb, guard_cond,
2519 guard_to, guard_bb,
357067f2 2520 prob_prolog.invert (),
3e907b90 2521 irred_flag);
a5e3d614
BC
2522 e = EDGE_PRED (guard_to, 0);
2523 e = (e != guard_e ? e : EDGE_PRED (guard_to, 1));
2524 slpeel_update_phi_nodes_for_guard1 (prolog, loop, guard_e, e);
25c99850 2525
af2bbc51
JH
2526 scale_bbs_frequencies (&bb_after_prolog, 1, prob_prolog);
2527 scale_loop_profile (prolog, prob_prolog, bound_prolog);
a5e3d614
BC
2528 }
2529 /* Update init address of DRs. */
535e7c11 2530 vect_update_inits_of_drs (loop_vinfo, niters_prolog, PLUS_EXPR);
a5e3d614
BC
2531 /* Update niters for vector loop. */
2532 LOOP_VINFO_NITERS (loop_vinfo)
2533 = fold_build2 (MINUS_EXPR, type, niters, niters_prolog);
2534 LOOP_VINFO_NITERSM1 (loop_vinfo)
2535 = fold_build2 (MINUS_EXPR, type,
2536 LOOP_VINFO_NITERSM1 (loop_vinfo), niters_prolog);
7078979b
BC
2537 bool new_var_p = false;
2538 niters = vect_build_loop_niters (loop_vinfo, &new_var_p);
2539 /* It's guaranteed that vector loop bound before vectorization is at
2540 least VF, so set range information for newly generated var. */
2541 if (new_var_p)
2542 set_range_info (niters, VR_RANGE,
8e6cdc90
RS
2543 wi::to_wide (build_int_cst (type, vf)),
2544 wi::to_wide (TYPE_MAX_VALUE (type)));
a5e3d614 2545
cbb22e61
BC
2546 /* Prolog iterates at most bound_prolog times, latch iterates at
2547 most bound_prolog - 1 times. */
2548 record_niter_bound (prolog, bound_prolog - 1, false, true);
a5e3d614
BC
2549 delete_update_ssa ();
2550 adjust_vec_debug_stmts ();
2551 scev_reset ();
5d2eb24b
IR
2552 }
2553
a5e3d614
BC
2554 if (epilog_peeling)
2555 {
2556 e = single_exit (loop);
2557 if (!slpeel_can_duplicate_loop_p (loop, e))
2558 {
2559 dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2560 "loop can't be duplicated to exit edge.\n");
2561 gcc_unreachable ();
2562 }
2563 /* Peel epilog and put it on exit edge of loop. */
2564 epilog = slpeel_tree_duplicate_loop_to_edge_cfg (loop, scalar_loop, e);
2565 if (!epilog)
2566 {
2567 dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2568 "slpeel_tree_duplicate_loop_to_edge_cfg failed.\n");
2569 gcc_unreachable ();
2570 }
2571 slpeel_update_phi_nodes_for_loops (loop_vinfo, loop, epilog, false);
2572
2573 /* Scalar version loop may be preferred. In this case, add guard
2574 and skip to epilog. Note this only happens when the number of
2575 iterations of loop is unknown at compile time, otherwise this
2576 won't be vectorized. */
2577 if (skip_vector)
2578 {
cbb22e61
BC
2579 /* Additional epilogue iteration is peeled if gap exists. */
2580 bool peel_for_gaps = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo);
a5e3d614 2581 tree t = vect_gen_scalar_loop_niters (niters_prolog, prolog_peeling,
cbb22e61
BC
2582 bound_prolog,
2583 peel_for_gaps ? vf : vf - 1,
2584 th, &bound_scalar,
a5e3d614 2585 check_profitability);
cbb22e61
BC
2586 /* Build guard against NITERSM1 since NITERS may overflow. */
2587 guard_cond = fold_build2 (LT_EXPR, boolean_type_node, nitersm1, t);
a5e3d614
BC
2588 guard_bb = anchor;
2589 guard_to = split_edge (loop_preheader_edge (epilog));
2590 guard_e = slpeel_add_loop_guard (guard_bb, guard_cond,
2591 guard_to, guard_bb,
357067f2 2592 prob_vector.invert (),
3e907b90 2593 irred_flag);
a5e3d614
BC
2594 e = EDGE_PRED (guard_to, 0);
2595 e = (e != guard_e ? e : EDGE_PRED (guard_to, 1));
2596 slpeel_update_phi_nodes_for_guard1 (first_loop, epilog, guard_e, e);
25c99850
BC
2597
2598 /* Simply propagate profile info from guard_bb to guard_to which is
2599 a merge point of control flow. */
25c99850 2600 guard_to->count = guard_bb->count;
00fa28d1 2601
af2bbc51
JH
2602 /* Scale probability of epilog loop back.
2603 FIXME: We should avoid scaling down and back up. Profile may
2604 get lost if we scale down to 0. */
10ea2672 2605 basic_block *bbs = get_loop_body (epilog);
00fa28d1
JH
2606 for (unsigned int i = 0; i < epilog->num_nodes; i++)
2607 bbs[i]->count = bbs[i]->count.apply_scale
2608 (bbs[i]->count,
2609 bbs[i]->count.apply_probability
2610 (prob_vector));
af2bbc51 2611 free (bbs);
a5e3d614 2612 }
ebfd146a 2613
25c99850 2614 basic_block bb_before_epilog = loop_preheader_edge (epilog)->src;
a5e3d614
BC
2615 tree niters_vector_mult_vf;
2616 /* If loop is peeled for non-zero constant times, now niters refers to
 2617	 orig_niters - prolog_peeling; it won't overflow even if orig_niters
2618 overflows. */
2619 niters_no_overflow |= (prolog_peeling > 0);
2620 vect_gen_vector_loop_niters (loop_vinfo, niters,
0f26839a
RS
2621 niters_vector, step_vector,
2622 niters_no_overflow);
2623 if (!integer_onep (*step_vector))
2624 {
 2625	  /* On exit from the loop we will have an easy way of calculating
2626 NITERS_VECTOR / STEP * STEP. Install a dummy definition
2627 until then. */
2628 niters_vector_mult_vf = make_ssa_name (TREE_TYPE (*niters_vector));
2629 SSA_NAME_DEF_STMT (niters_vector_mult_vf) = gimple_build_nop ();
2630 *niters_vector_mult_vf_var = niters_vector_mult_vf;
2631 }
2632 else
2633 vect_gen_vector_loop_niters_mult_vf (loop_vinfo, *niters_vector,
2634 &niters_vector_mult_vf);
a5e3d614
BC
2635 /* Update IVs of original loop as if they were advanced by
2636 niters_vector_mult_vf steps. */
2637 gcc_checking_assert (vect_can_advance_ivs_p (loop_vinfo));
2638 edge update_e = skip_vector ? e : loop_preheader_edge (epilog);
2639 vect_update_ivs_after_vectorizer (loop_vinfo, niters_vector_mult_vf,
2640 update_e);
2641
2642 if (skip_epilog)
2643 {
2644 guard_cond = fold_build2 (EQ_EXPR, boolean_type_node,
2645 niters, niters_vector_mult_vf);
2646 guard_bb = single_exit (loop)->dest;
2647 guard_to = split_edge (single_exit (epilog));
2648 guard_e = slpeel_add_loop_guard (guard_bb, guard_cond, guard_to,
2649 skip_vector ? anchor : guard_bb,
357067f2 2650 prob_epilog.invert (),
3e907b90 2651 irred_flag);
a5e3d614
BC
2652 slpeel_update_phi_nodes_for_guard2 (loop, epilog, guard_e,
2653 single_exit (epilog));
25c99850
BC
2654 /* Only need to handle basic block before epilog loop if it's not
2655 the guard_bb, which is the case when skip_vector is true. */
2656 if (guard_bb != bb_before_epilog)
2657 {
357067f2 2658 prob_epilog = prob_vector * prob_epilog + prob_vector.invert ();
25c99850 2659
af2bbc51 2660 scale_bbs_frequencies (&bb_before_epilog, 1, prob_epilog);
25c99850 2661 }
d9f21f6a 2662 scale_loop_profile (epilog, prob_epilog, 0);
a5e3d614
BC
2663 }
2664 else
2665 slpeel_update_phi_nodes_for_lcssa (epilog);
ebfd146a 2666
d9f21f6a
RS
2667 unsigned HOST_WIDE_INT bound1, bound2;
2668 if (vf.is_constant (&bound1) && bound_scalar.is_constant (&bound2))
2669 {
2670 bound1 -= LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 2;
2671 if (bound2)
2672 /* We share epilog loop with scalar version loop. */
2673 bound1 = MAX (bound1, bound2 - 1);
2674 record_niter_bound (epilog, bound1, false, true);
2675 }
a5e3d614
BC
2676
2677 delete_update_ssa ();
2678 adjust_vec_debug_stmts ();
2679 scev_reset ();
2680 }
2681 adjust_vec.release ();
ebfd146a 2682 free_original_copy_tables ();
598eaaa2
YR
2683
2684 return epilog;
ebfd146a
IR
2685}
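
/* End-to-end example of the peeling arithmetic (illustrative numbers only):
   take VF = 4, PROLOG_PEELING = 2 and an original NITERS of 23.  The prolog
   runs 2 iterations, NITERS is rewritten to 21 for the vector loop,
   vect_gen_vector_loop_niters yields niters_vector = 5 (20 scalar
   iterations), niters_vector_mult_vf = 20, and the epilogue handles the
   remaining iteration.  The prolog is skipped when niters_prolog == 0 and
   the epilogue is skipped when niters == niters_vector_mult_vf.  */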
2686
01d32b2b
BC
2687/* Function vect_create_cond_for_niters_checks.
2688
2689 Create a conditional expression that represents the run-time checks for
 2690   the loop's niter.  The loop is guaranteed to terminate if the run-time
2691 checks hold.
2692
2693 Input:
 2694   COND_EXPR  - input conditional expression.  New conditions will be chained
 2695                with logical AND operation.  If it is NULL, then the
 2696                niter condition is returned on its own.
 2697   LOOP_VINFO - field LOOP_VINFO_NITERS_ASSUMPTIONS contains the niter
 2698                assumptions to be checked.
2699
2700 Output:
2701 COND_EXPR - conditional expression.
2702
2703 The returned COND_EXPR is the conditional expression to be used in the
2704 if statement that controls which version of the loop gets executed at
2705 runtime. */
2706
2707static void
2708vect_create_cond_for_niters_checks (loop_vec_info loop_vinfo, tree *cond_expr)
2709{
2710 tree part_cond_expr = LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo);
2711
2712 if (*cond_expr)
2713 *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
2714 *cond_expr, part_cond_expr);
2715 else
2716 *cond_expr = part_cond_expr;
2717}
ebfd146a 2718
9adee305
RS
2719/* Set *COND_EXPR to a tree that is true when both the original *COND_EXPR
2720 and PART_COND_EXPR are true. Treat a null *COND_EXPR as "true". */
2721
2722static void
2723chain_cond_expr (tree *cond_expr, tree part_cond_expr)
2724{
2725 if (*cond_expr)
2726 *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
2727 *cond_expr, part_cond_expr);
2728 else
2729 *cond_expr = part_cond_expr;
2730}
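
/* Usage illustration (hypothetical trees A and B): starting from a null
   condition, chain_cond_expr (&cond, A) yields cond == A, and a subsequent
   chain_cond_expr (&cond, B) yields cond == (A && B), i.e. a
   TRUTH_AND_EXPR of the two parts.  */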
2731
ebfd146a
IR
2732/* Function vect_create_cond_for_align_checks.
2733
2734 Create a conditional expression that represents the alignment checks for
2735 all of data references (array element references) whose alignment must be
2736 checked at runtime.
2737
2738 Input:
2739 COND_EXPR - input conditional expression. New conditions will be chained
2740 with logical AND operation.
2741 LOOP_VINFO - two fields of the loop information are used.
2742 LOOP_VINFO_PTR_MASK is the mask used to check the alignment.
2743 LOOP_VINFO_MAY_MISALIGN_STMTS contains the refs to be checked.
2744
2745 Output:
2746 COND_EXPR_STMT_LIST - statements needed to construct the conditional
2747 expression.
2748 The returned value is the conditional expression to be used in the if
2749 statement that controls which version of the loop gets executed at runtime.
2750
2751 The algorithm makes two assumptions:
2752 1) The number of bytes "n" in a vector is a power of 2.
 2753   2) An address "a" is aligned if a%n is zero, and this
2754 test can be done as a&(n-1) == 0. For example, for 16
2755 byte vectors the test is a&0xf == 0. */
2756
2757static void
2758vect_create_cond_for_align_checks (loop_vec_info loop_vinfo,
2759 tree *cond_expr,
2760 gimple_seq *cond_expr_stmt_list)
2761{
355fe088 2762 vec<gimple *> may_misalign_stmts
ebfd146a 2763 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
355fe088 2764 gimple *ref_stmt;
ebfd146a
IR
2765 int mask = LOOP_VINFO_PTR_MASK (loop_vinfo);
2766 tree mask_cst;
2767 unsigned int i;
ebfd146a
IR
2768 tree int_ptrsize_type;
2769 char tmp_name[20];
2770 tree or_tmp_name = NULL_TREE;
83d5977e 2771 tree and_tmp_name;
355fe088 2772 gimple *and_stmt;
ebfd146a
IR
2773 tree ptrsize_zero;
2774 tree part_cond_expr;
2775
2776 /* Check that mask is one less than a power of 2, i.e., mask is
2777 all zeros followed by all ones. */
2778 gcc_assert ((mask != 0) && ((mask & (mask+1)) == 0));
2779
96f9265a 2780 int_ptrsize_type = signed_type_for (ptr_type_node);
ebfd146a
IR
2781
 2782  /* Create expression (mask & (dr_1 | ... | dr_n)) where dr_i is the address
2783 of the first vector of the i'th data reference. */
2784
9771b263 2785 FOR_EACH_VEC_ELT (may_misalign_stmts, i, ref_stmt)
ebfd146a
IR
2786 {
2787 gimple_seq new_stmt_list = NULL;
2788 tree addr_base;
83d5977e
RG
2789 tree addr_tmp_name;
2790 tree new_or_tmp_name;
355fe088 2791 gimple *addr_stmt, *or_stmt;
d8ba5b19
RG
2792 stmt_vec_info stmt_vinfo = vinfo_for_stmt (ref_stmt);
2793 tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
2794 bool negative = tree_int_cst_compare
2795 (DR_STEP (STMT_VINFO_DATA_REF (stmt_vinfo)), size_zero_node) < 0;
2796 tree offset = negative
aad83b7c 2797 ? size_int (-TYPE_VECTOR_SUBPARTS (vectype) + 1) : size_zero_node;
ebfd146a
IR
2798
2799 /* create: addr_tmp = (int)(address_of_first_vector) */
2800 addr_base =
2801 vect_create_addr_base_for_vector_ref (ref_stmt, &new_stmt_list,
3f5e8a76 2802 offset);
ebfd146a
IR
2803 if (new_stmt_list != NULL)
2804 gimple_seq_add_seq (cond_expr_stmt_list, new_stmt_list);
2805
83d5977e
RG
2806 sprintf (tmp_name, "addr2int%d", i);
2807 addr_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, tmp_name);
0d0e4a03 2808 addr_stmt = gimple_build_assign (addr_tmp_name, NOP_EXPR, addr_base);
ebfd146a
IR
2809 gimple_seq_add_stmt (cond_expr_stmt_list, addr_stmt);
2810
 2811      /* The addresses are ORed together.  */
2812
2813 if (or_tmp_name != NULL_TREE)
2814 {
2815 /* create: or_tmp = or_tmp | addr_tmp */
83d5977e
RG
2816 sprintf (tmp_name, "orptrs%d", i);
2817 new_or_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, tmp_name);
0d0e4a03
JJ
2818 or_stmt = gimple_build_assign (new_or_tmp_name, BIT_IOR_EXPR,
2819 or_tmp_name, addr_tmp_name);
ebfd146a
IR
2820 gimple_seq_add_stmt (cond_expr_stmt_list, or_stmt);
2821 or_tmp_name = new_or_tmp_name;
2822 }
2823 else
2824 or_tmp_name = addr_tmp_name;
2825
2826 } /* end for i */
2827
2828 mask_cst = build_int_cst (int_ptrsize_type, mask);
2829
2830 /* create: and_tmp = or_tmp & mask */
83d5977e 2831 and_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, "andmask");
ebfd146a 2832
0d0e4a03
JJ
2833 and_stmt = gimple_build_assign (and_tmp_name, BIT_AND_EXPR,
2834 or_tmp_name, mask_cst);
ebfd146a
IR
2835 gimple_seq_add_stmt (cond_expr_stmt_list, and_stmt);
2836
2837 /* Make and_tmp the left operand of the conditional test against zero.
2838 if and_tmp has a nonzero bit then some address is unaligned. */
2839 ptrsize_zero = build_int_cst (int_ptrsize_type, 0);
2840 part_cond_expr = fold_build2 (EQ_EXPR, boolean_type_node,
2841 and_tmp_name, ptrsize_zero);
9adee305
RS
2842 chain_cond_expr (cond_expr, part_cond_expr);
2843}
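
/* Worked example (illustrative): for two data references whose first-vector
   addresses are a1 and a2, with 16-byte vectors, LOOP_VINFO_PTR_MASK is 15
   and the emitted check is ((a1 | a2) & 15) == 0, so the vectorized version
   is entered only when both addresses are 16-byte aligned.  */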
2844
2845/* If LOOP_VINFO_CHECK_UNEQUAL_ADDRS contains <A1, B1>, ..., <An, Bn>,
2846 create a tree representation of: (&A1 != &B1) && ... && (&An != &Bn).
2847 Set *COND_EXPR to a tree that is true when both the original *COND_EXPR
2848 and this new condition are true. Treat a null *COND_EXPR as "true". */
2849
2850static void
2851vect_create_cond_for_unequal_addrs (loop_vec_info loop_vinfo, tree *cond_expr)
2852{
2853 vec<vec_object_pair> pairs = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
2854 unsigned int i;
2855 vec_object_pair *pair;
2856 FOR_EACH_VEC_ELT (pairs, i, pair)
2857 {
2858 tree addr1 = build_fold_addr_expr (pair->first);
2859 tree addr2 = build_fold_addr_expr (pair->second);
2860 tree part_cond_expr = fold_build2 (NE_EXPR, boolean_type_node,
2861 addr1, addr2);
2862 chain_cond_expr (cond_expr, part_cond_expr);
2863 }
ebfd146a
IR
2864}
2865
ebfd146a
IR
2866/* Function vect_create_cond_for_alias_checks.
2867
2868 Create a conditional expression that represents the run-time checks for
2869 overlapping of address ranges represented by a list of data references
2870 relations passed as input.
2871
2872 Input:
2873 COND_EXPR - input conditional expression. New conditions will be chained
a05a89fa
CH
2874 with logical AND operation. If it is NULL, then the function
2875 is used to return the number of alias checks.
ebfd146a
IR
2876 LOOP_VINFO - field LOOP_VINFO_MAY_ALIAS_STMTS contains the list of ddrs
2877 to be checked.
2878
2879 Output:
2880 COND_EXPR - conditional expression.
ebfd146a 2881
a05a89fa 2882 The returned COND_EXPR is the conditional expression to be used in the if
ebfd146a
IR
2883 statement that controls which version of the loop gets executed at runtime.
2884*/
2885
a05a89fa 2886void
4bdd44c4 2887vect_create_cond_for_alias_checks (loop_vec_info loop_vinfo, tree * cond_expr)
ebfd146a 2888{
93bdc3ed 2889 vec<dr_with_seg_len_pair_t> comp_alias_ddrs =
a05a89fa 2890 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
ebfd146a 2891
a05a89fa 2892 if (comp_alias_ddrs.is_empty ())
ebfd146a
IR
2893 return;
2894
9cbd2d97
BC
2895 create_runtime_alias_checks (LOOP_VINFO_LOOP (loop_vinfo),
2896 &comp_alias_ddrs, cond_expr);
73fbfcad 2897 if (dump_enabled_p ())
ccb3ad87 2898 dump_printf_loc (MSG_NOTE, vect_location,
78c60e3d 2899 "created %u versioning for alias checks.\n",
a05a89fa 2900 comp_alias_ddrs.length ());
ebfd146a
IR
2901}
2902
2903
2904/* Function vect_loop_versioning.
b8698a0f 2905
ebfd146a
IR
 2906   If the loop has data references that may or may not be aligned and/or
2907 has data reference relations whose independence was not proven then
2908 two versions of the loop need to be generated, one which is vectorized
2909 and one which isn't. A test is then generated to control which of the
2910 loops is executed. The test checks for the alignment of all of the
2911 data references that may or may not be aligned. An additional
 2912   sequence of runtime tests is generated for each pair of DDRs whose
b8698a0f
L
 2913   independence was not proven.  The vectorized version of the loop is
2914 executed only if both alias and alignment tests are passed.
2915
ebfd146a 2916 The test generated to check which version of loop is executed
b8698a0f 2917 is modified to also check for profitability as indicated by the
d9157f15 2918 cost model threshold TH.
86290011
RG
2919
2920 The versioning precondition(s) are placed in *COND_EXPR and
d68d56b5 2921 *COND_EXPR_STMT_LIST. */
ebfd146a
IR
2922
2923void
368117e8 2924vect_loop_versioning (loop_vec_info loop_vinfo,
a696bc4f
RS
2925 unsigned int th, bool check_profitability,
2926 poly_uint64 versioning_threshold)
ebfd146a 2927{
01d32b2b 2928 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *nloop;
5ce9450f 2929 struct loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
ebfd146a 2930 basic_block condition_bb;
538dd0b7
DM
2931 gphi_iterator gsi;
2932 gimple_stmt_iterator cond_exp_gsi;
ebfd146a
IR
2933 basic_block merge_bb;
2934 basic_block new_exit_bb;
2935 edge new_exit_e, e;
538dd0b7 2936 gphi *orig_phi, *new_phi;
368117e8 2937 tree cond_expr = NULL_TREE;
d68d56b5 2938 gimple_seq cond_expr_stmt_list = NULL;
ebfd146a 2939 tree arg;
af2bbc51 2940 profile_probability prob = profile_probability::likely ();
ebfd146a 2941 gimple_seq gimplify_stmt_list = NULL;
fdd29374 2942 tree scalar_loop_iters = LOOP_VINFO_NITERSM1 (loop_vinfo);
9cc1fb4b
XDL
2943 bool version_align = LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo);
2944 bool version_alias = LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo);
01d32b2b 2945 bool version_niter = LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo);
ebfd146a 2946
368117e8 2947 if (check_profitability)
80be3333 2948 cond_expr = fold_build2 (GE_EXPR, boolean_type_node, scalar_loop_iters,
01d32b2b 2949 build_int_cst (TREE_TYPE (scalar_loop_iters),
fdd29374 2950 th - 1));
a696bc4f
RS
2951 if (maybe_ne (versioning_threshold, 0U))
2952 {
2953 tree expr = fold_build2 (GE_EXPR, boolean_type_node, scalar_loop_iters,
2954 build_int_cst (TREE_TYPE (scalar_loop_iters),
2955 versioning_threshold - 1));
2956 if (cond_expr)
2957 cond_expr = fold_build2 (BIT_AND_EXPR, boolean_type_node,
2958 expr, cond_expr);
2959 else
2960 cond_expr = expr;
2961 }
01d32b2b
BC
2962
2963 if (version_niter)
2964 vect_create_cond_for_niters_checks (loop_vinfo, &cond_expr);
2965
2966 if (cond_expr)
2967 cond_expr = force_gimple_operand_1 (cond_expr, &cond_expr_stmt_list,
2968 is_gimple_condexpr, NULL_TREE);
ebfd146a 2969
9cc1fb4b 2970 if (version_align)
d68d56b5
RG
2971 vect_create_cond_for_align_checks (loop_vinfo, &cond_expr,
2972 &cond_expr_stmt_list);
ebfd146a 2973
9cc1fb4b 2974 if (version_alias)
9adee305
RS
2975 {
2976 vect_create_cond_for_unequal_addrs (loop_vinfo, &cond_expr);
2977 vect_create_cond_for_alias_checks (loop_vinfo, &cond_expr);
2978 }
86290011 2979
d68d56b5
RG
2980 cond_expr = force_gimple_operand_1 (cond_expr, &gimplify_stmt_list,
2981 is_gimple_condexpr, NULL_TREE);
2982 gimple_seq_add_seq (&cond_expr_stmt_list, gimplify_stmt_list);
ebfd146a
IR
2983
2984 initialize_original_copy_tables ();
5ce9450f
JJ
2985 if (scalar_loop)
2986 {
2987 edge scalar_e;
2988 basic_block preheader, scalar_preheader;
2989
2990 /* We don't want to scale SCALAR_LOOP's frequencies, we need to
2991 scale LOOP's frequencies instead. */
5d3ebb71 2992 nloop = loop_version (scalar_loop, cond_expr, &condition_bb,
af2bbc51
JH
2993 prob, prob.invert (), prob, prob.invert (), true);
2994 scale_loop_frequencies (loop, prob);
5ce9450f
JJ
2995 /* CONDITION_BB was created above SCALAR_LOOP's preheader,
2996 while we need to move it above LOOP's preheader. */
2997 e = loop_preheader_edge (loop);
2998 scalar_e = loop_preheader_edge (scalar_loop);
2999 gcc_assert (empty_block_p (e->src)
3000 && single_pred_p (e->src));
3001 gcc_assert (empty_block_p (scalar_e->src)
3002 && single_pred_p (scalar_e->src));
3003 gcc_assert (single_pred_p (condition_bb));
3004 preheader = e->src;
3005 scalar_preheader = scalar_e->src;
3006 scalar_e = find_edge (condition_bb, scalar_preheader);
3007 e = single_pred_edge (preheader);
3008 redirect_edge_and_branch_force (single_pred_edge (condition_bb),
3009 scalar_preheader);
3010 redirect_edge_and_branch_force (scalar_e, preheader);
3011 redirect_edge_and_branch_force (e, condition_bb);
3012 set_immediate_dominator (CDI_DOMINATORS, condition_bb,
3013 single_pred (condition_bb));
3014 set_immediate_dominator (CDI_DOMINATORS, scalar_preheader,
3015 single_pred (scalar_preheader));
3016 set_immediate_dominator (CDI_DOMINATORS, preheader,
3017 condition_bb);
3018 }
3019 else
01d32b2b 3020 nloop = loop_version (loop, cond_expr, &condition_bb,
af2bbc51 3021 prob, prob.invert (), prob, prob.invert (), true);
01d32b2b
BC
3022
3023 if (version_niter)
3024 {
3025 /* The versioned loop could be infinite, we need to clear existing
3026 niter information which is copied from the original loop. */
3027 gcc_assert (loop_constraint_set_p (loop, LOOP_C_FINITE));
3028 vect_free_loop_info_assumptions (nloop);
3029 /* And set constraint LOOP_C_INFINITE for niter analyzer. */
3030 loop_constraint_set (loop, LOOP_C_INFINITE);
3031 }
9cc1fb4b 3032
b05e0233 3033 if (LOCATION_LOCUS (vect_location) != UNKNOWN_LOCATION
9cc1fb4b
XDL
3034 && dump_enabled_p ())
3035 {
3036 if (version_alias)
3037 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
3038 "loop versioned for vectorization because of "
3039 "possible aliasing\n");
3040 if (version_align)
3041 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
3042 "loop versioned for vectorization to enhance "
3043 "alignment\n");
3044
3045 }
c3284718 3046 free_original_copy_tables ();
ebfd146a 3047
b8698a0f 3048 /* Loop versioning violates an assumption we try to maintain during
ebfd146a
IR
3049 vectorization - that the loop exit block has a single predecessor.
3050 After versioning, the exit block of both loop versions is the same
3051 basic block (i.e. it has two predecessors). Just in order to simplify
3052 following transformations in the vectorizer, we fix this situation
3053 here by adding a new (empty) block on the exit-edge of the loop,
5ce9450f
JJ
3054 with the proper loop-exit phis to maintain loop-closed-form.
3055 If loop versioning wasn't done from loop, but scalar_loop instead,
3056 merge_bb will have already just a single successor. */
b8698a0f 3057
ebfd146a 3058 merge_bb = single_exit (loop)->dest;
5ce9450f 3059 if (scalar_loop == NULL || EDGE_COUNT (merge_bb->preds) >= 2)
ebfd146a 3060 {
5ce9450f
JJ
3061 gcc_assert (EDGE_COUNT (merge_bb->preds) >= 2);
3062 new_exit_bb = split_edge (single_exit (loop));
3063 new_exit_e = single_exit (loop);
3064 e = EDGE_SUCC (new_exit_bb, 0);
3065
3066 for (gsi = gsi_start_phis (merge_bb); !gsi_end_p (gsi); gsi_next (&gsi))
3067 {
3068 tree new_res;
538dd0b7 3069 orig_phi = gsi.phi ();
b731b390 3070 new_res = copy_ssa_name (PHI_RESULT (orig_phi));
5ce9450f
JJ
3071 new_phi = create_phi_node (new_res, new_exit_bb);
3072 arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, e);
3073 add_phi_arg (new_phi, arg, new_exit_e,
3074 gimple_phi_arg_location_from_edge (orig_phi, e));
3075 adjust_phi_and_debug_stmts (orig_phi, e, PHI_RESULT (new_phi));
3076 }
b8698a0f 3077 }
ebfd146a
IR
3078
3079 /* End loop-exit-fixes after versioning. */
3080
d68d56b5 3081 if (cond_expr_stmt_list)
ebfd146a
IR
3082 {
3083 cond_exp_gsi = gsi_last_bb (condition_bb);
d68d56b5 3084 gsi_insert_seq_before (&cond_exp_gsi, cond_expr_stmt_list,
86290011 3085 GSI_SAME_STMT);
ebfd146a 3086 }
90eb75f2 3087 update_ssa (TODO_update_ssa);
ebfd146a 3088}