gcc/tree-vect-loop-manip.c
/* Vectorizer Specific Loop Manipulations
   Copyright (C) 2003-2019 Free Software Foundation, Inc.
   Contributed by Dorit Naishlos <dorit@il.ibm.com>
   and Ira Rosen <irar@il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */

#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "tree.h"
#include "gimple.h"
#include "cfghooks.h"
#include "tree-pass.h"
#include "ssa.h"
#include "fold-const.h"
#include "cfganal.h"
#include "gimplify.h"
#include "gimple-iterator.h"
#include "gimplify-me.h"
#include "tree-cfg.h"
#include "tree-ssa-loop-manip.h"
#include "tree-into-ssa.h"
#include "tree-ssa.h"
#include "cfgloop.h"
#include "tree-scalar-evolution.h"
#include "tree-vectorizer.h"
#include "tree-ssa-loop-ivopts.h"
#include "gimple-fold.h"
#include "tree-ssa-loop-niter.h"
#include "internal-fn.h"
#include "stor-layout.h"
#include "optabs-query.h"
#include "vec-perm-indices.h"

/*************************************************************************
  Simple Loop Peeling Utilities

  Utilities to support loop peeling for vectorization purposes.
 *************************************************************************/


/* Renames the use *OP_P.  */

static void
rename_use_op (use_operand_p op_p)
{
  tree new_name;

  if (TREE_CODE (USE_FROM_PTR (op_p)) != SSA_NAME)
    return;

  new_name = get_current_def (USE_FROM_PTR (op_p));

  /* Something defined outside of the loop.  */
  if (!new_name)
    return;

  /* An ordinary ssa name defined in the loop.  */

  SET_USE (op_p, new_name);
}


/* Renames the variables in basic block BB.  Allow renaming of PHI arguments
   on edges incoming from outer-block header if RENAME_FROM_OUTER_LOOP is
   true.  */

static void
rename_variables_in_bb (basic_block bb, bool rename_from_outer_loop)
{
  gimple *stmt;
  use_operand_p use_p;
  ssa_op_iter iter;
  edge e;
  edge_iterator ei;
  class loop *loop = bb->loop_father;
  class loop *outer_loop = NULL;

  if (rename_from_outer_loop)
    {
      gcc_assert (loop);
      outer_loop = loop_outer (loop);
    }

  for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
       gsi_next (&gsi))
    {
      stmt = gsi_stmt (gsi);
      FOR_EACH_SSA_USE_OPERAND (use_p, stmt, iter, SSA_OP_ALL_USES)
        rename_use_op (use_p);
    }

  FOR_EACH_EDGE (e, ei, bb->preds)
    {
      if (!flow_bb_inside_loop_p (loop, e->src))
        {
          if (!rename_from_outer_loop)
            continue;
          if (e->src != outer_loop->header)
            {
              if (outer_loop->inner->next)
                {
                  /* If outer_loop has 2 inner loops, allow there to
                     be an extra basic block which decides which of the
                     two loops to use using LOOP_VECTORIZED.  */
                  if (!single_pred_p (e->src)
                      || single_pred (e->src) != outer_loop->header)
                    continue;
                }
            }
        }
      for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
           gsi_next (&gsi))
        rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (gsi.phi (), e));
    }
}


struct adjust_info
{
  tree from, to;
  basic_block bb;
};

/* A stack of values to be adjusted in debug stmts.  We have to
   process them LIFO, so that the closest substitution applies.  If we
   processed them FIFO, without the stack, we might substitute uses
   with a PHI DEF that would soon become non-dominant, and when we got
   to the suitable one, it wouldn't have anything to substitute any
   more.  */
static vec<adjust_info, va_heap> adjust_vec;

/* Adjust any debug stmts that referenced AI->from values to use the
   loop-closed AI->to, if the references are dominated by AI->bb and
   not by the definition of AI->from.  */

static void
adjust_debug_stmts_now (adjust_info *ai)
{
  basic_block bbphi = ai->bb;
  tree orig_def = ai->from;
  tree new_def = ai->to;
  imm_use_iterator imm_iter;
  gimple *stmt;
  basic_block bbdef = gimple_bb (SSA_NAME_DEF_STMT (orig_def));

  gcc_assert (dom_info_available_p (CDI_DOMINATORS));

  /* Adjust any debug stmts that held onto non-loop-closed
     references.  */
  FOR_EACH_IMM_USE_STMT (stmt, imm_iter, orig_def)
    {
      use_operand_p use_p;
      basic_block bbuse;

      if (!is_gimple_debug (stmt))
        continue;

      gcc_assert (gimple_debug_bind_p (stmt));

      bbuse = gimple_bb (stmt);

      if ((bbuse == bbphi
           || dominated_by_p (CDI_DOMINATORS, bbuse, bbphi))
          && !(bbuse == bbdef
               || dominated_by_p (CDI_DOMINATORS, bbuse, bbdef)))
        {
          if (new_def)
            FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
              SET_USE (use_p, new_def);
          else
            {
              gimple_debug_bind_reset_value (stmt);
              update_stmt (stmt);
            }
        }
    }
}

/* Adjust debug stmts as scheduled before.  */

static void
adjust_vec_debug_stmts (void)
{
  if (!MAY_HAVE_DEBUG_BIND_STMTS)
    return;

  gcc_assert (adjust_vec.exists ());

  while (!adjust_vec.is_empty ())
    {
      adjust_debug_stmts_now (&adjust_vec.last ());
      adjust_vec.pop ();
    }
}

/* Adjust any debug stmts that referenced FROM values to use the
   loop-closed TO, if the references are dominated by BB and not by
   the definition of FROM.  If adjust_vec is non-NULL, adjustments
   will be postponed until adjust_vec_debug_stmts is called.  */

static void
adjust_debug_stmts (tree from, tree to, basic_block bb)
{
  adjust_info ai;

  if (MAY_HAVE_DEBUG_BIND_STMTS
      && TREE_CODE (from) == SSA_NAME
      && ! SSA_NAME_IS_DEFAULT_DEF (from)
      && ! virtual_operand_p (from))
    {
      ai.from = from;
      ai.to = to;
      ai.bb = bb;

      if (adjust_vec.exists ())
        adjust_vec.safe_push (ai);
      else
        adjust_debug_stmts_now (&ai);
    }
}

/* Change E's phi arg in UPDATE_PHI to NEW_DEF, and record information
   to adjust any debug stmts that referenced the old phi arg,
   presumably non-loop-closed references left over from other
   transformations.  */

static void
adjust_phi_and_debug_stmts (gimple *update_phi, edge e, tree new_def)
{
  tree orig_def = PHI_ARG_DEF_FROM_EDGE (update_phi, e);

  SET_PHI_ARG_DEF (update_phi, e->dest_idx, new_def);

  if (MAY_HAVE_DEBUG_BIND_STMTS)
    adjust_debug_stmts (orig_def, PHI_RESULT (update_phi),
                        gimple_bb (update_phi));
}

/* Define one loop mask MASK from loop LOOP.  INIT_MASK is the value that
   the mask should have during the first iteration and NEXT_MASK is the
   value that it should have on subsequent iterations.  */

static void
vect_set_loop_mask (class loop *loop, tree mask, tree init_mask,
                    tree next_mask)
{
  gphi *phi = create_phi_node (mask, loop->header);
  add_phi_arg (phi, init_mask, loop_preheader_edge (loop), UNKNOWN_LOCATION);
  add_phi_arg (phi, next_mask, loop_latch_edge (loop), UNKNOWN_LOCATION);
}
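
/* Illustration only (hypothetical SSA names, not part of the original
   sources): the PHI created above gives the mask the usual loop-carried
   form

       mask_7 = PHI <init_mask_5(preheader), next_mask_6(latch)>

   so the mask starts out as INIT_MASK and is replaced by NEXT_MASK on
   every subsequent iteration.  */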

/* Add SEQ to the end of LOOP's preheader block.  */

static void
add_preheader_seq (class loop *loop, gimple_seq seq)
{
  if (seq)
    {
      edge pe = loop_preheader_edge (loop);
      basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
      gcc_assert (!new_bb);
    }
}

/* Add SEQ to the beginning of LOOP's header block.  */

static void
add_header_seq (class loop *loop, gimple_seq seq)
{
  if (seq)
    {
      gimple_stmt_iterator gsi = gsi_after_labels (loop->header);
      gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
    }
}

/* Return true if the target can interleave elements of two vectors.
   OFFSET is 0 if the first half of the vectors should be interleaved
   or 1 if the second half should.  When returning true, store the
   associated permutation in INDICES.  */

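/* Illustration only (hypothetical vector type, not part of the original
   sources): for a vector with eight elements and OFFSET == 1 the base
   index is 4, so the encoded selector begins

       { 4, 12, 5, 13, 6, 14, ... }

   which interleaves the high halves of the two inputs; OFFSET == 0 would
   instead give { 0, 8, 1, 9, 2, 10, ... }.  */
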
static bool
interleave_supported_p (vec_perm_indices *indices, tree vectype,
                        unsigned int offset)
{
  poly_uint64 nelts = TYPE_VECTOR_SUBPARTS (vectype);
  poly_uint64 base = exact_div (nelts, 2) * offset;
  vec_perm_builder sel (nelts, 2, 3);
  for (unsigned int i = 0; i < 3; ++i)
    {
      sel.quick_push (base + i);
      sel.quick_push (base + i + nelts);
    }
  indices->new_vector (sel, 2, nelts);
  return can_vec_perm_const_p (TYPE_MODE (vectype), *indices);
}

/* Try to use permutes to define the masks in DEST_RGM using the masks
   in SRC_RGM, given that the former has twice as many masks as the
   latter.  Return true on success, adding any new statements to SEQ.  */

static bool
vect_maybe_permute_loop_masks (loop_vec_info loop_vinfo, gimple_seq *seq,
                               rgroup_masks *dest_rgm,
                               rgroup_masks *src_rgm)
{
  tree src_masktype = src_rgm->mask_type;
  tree dest_masktype = dest_rgm->mask_type;
  machine_mode src_mode = TYPE_MODE (src_masktype);
  if (dest_rgm->max_nscalars_per_iter <= src_rgm->max_nscalars_per_iter
      && optab_handler (vec_unpacku_hi_optab, src_mode) != CODE_FOR_nothing
      && optab_handler (vec_unpacku_lo_optab, src_mode) != CODE_FOR_nothing)
    {
      /* Unpacking the source masks gives at least as many mask bits as
         we need.  We can then VIEW_CONVERT any excess bits away.  */
      tree unpack_masktype = vect_halve_mask_nunits (loop_vinfo, src_masktype);
      for (unsigned int i = 0; i < dest_rgm->masks.length (); ++i)
        {
          tree src = src_rgm->masks[i / 2];
          tree dest = dest_rgm->masks[i];
          tree_code code = ((i & 1) == (BYTES_BIG_ENDIAN ? 0 : 1)
                            ? VEC_UNPACK_HI_EXPR
                            : VEC_UNPACK_LO_EXPR);
          gassign *stmt;
          if (dest_masktype == unpack_masktype)
            stmt = gimple_build_assign (dest, code, src);
          else
            {
              tree temp = make_ssa_name (unpack_masktype);
              stmt = gimple_build_assign (temp, code, src);
              gimple_seq_add_stmt (seq, stmt);
              stmt = gimple_build_assign (dest, VIEW_CONVERT_EXPR,
                                          build1 (VIEW_CONVERT_EXPR,
                                                  dest_masktype, temp));
            }
          gimple_seq_add_stmt (seq, stmt);
        }
      return true;
    }
  vec_perm_indices indices[2];
  if (dest_masktype == src_masktype
      && interleave_supported_p (&indices[0], src_masktype, 0)
      && interleave_supported_p (&indices[1], src_masktype, 1))
    {
      /* The destination requires twice as many mask bits as the source, so
         we can use interleaving permutes to double up the number of bits.  */
      tree masks[2];
      for (unsigned int i = 0; i < 2; ++i)
        masks[i] = vect_gen_perm_mask_checked (src_masktype, indices[i]);
      for (unsigned int i = 0; i < dest_rgm->masks.length (); ++i)
        {
          tree src = src_rgm->masks[i / 2];
          tree dest = dest_rgm->masks[i];
          gimple *stmt = gimple_build_assign (dest, VEC_PERM_EXPR,
                                              src, src, masks[i & 1]);
          gimple_seq_add_stmt (seq, stmt);
        }
      return true;
    }
  return false;
}

/* Helper for vect_set_loop_condition_masked.  Generate definitions for
   all the masks in RGM and return a mask that is nonzero when the loop
   needs to iterate.  Add any new preheader statements to PREHEADER_SEQ.
   Use LOOP_COND_GSI to insert code before the exit gcond.

   RGM belongs to loop LOOP.  The loop originally iterated NITERS
   times and has been vectorized according to LOOP_VINFO.

   If NITERS_SKIP is nonnull, the first iteration of the vectorized loop
   starts with NITERS_SKIP dummy iterations of the scalar loop before
   the real work starts.  The mask elements for these dummy iterations
   must be 0, to ensure that the extra iterations do not have an effect.

   It is known that:

     NITERS * RGM->max_nscalars_per_iter

   does not overflow.  However, MIGHT_WRAP_P says whether an induction
   variable that starts at 0 and has step:

     VF * RGM->max_nscalars_per_iter

   might overflow before hitting a value above:

     (NITERS + NITERS_SKIP) * RGM->max_nscalars_per_iter

   This means that we cannot guarantee that such an induction variable
   would ever hit a value that produces a set of all-false masks for RGM.  */

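/* Illustration only (hypothetical numbers, not part of the original
   sources): with VF == 8 and RGM->max_nscalars_per_iter == 2 the IV
   created below advances by 16 scalars per vector iteration; for
   NITERS == 1000 and NITERS_SKIP == 3 the code computes
   nscalars_total == 2000, nscalars_step == 16 and nscalars_skip == 6.  */
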
static tree
vect_set_loop_masks_directly (class loop *loop, loop_vec_info loop_vinfo,
                              gimple_seq *preheader_seq,
                              gimple_stmt_iterator loop_cond_gsi,
                              rgroup_masks *rgm, tree niters, tree niters_skip,
                              bool might_wrap_p)
{
  tree compare_type = LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo);
  tree iv_type = LOOP_VINFO_MASK_IV_TYPE (loop_vinfo);
  tree mask_type = rgm->mask_type;
  unsigned int nscalars_per_iter = rgm->max_nscalars_per_iter;
  poly_uint64 nscalars_per_mask = TYPE_VECTOR_SUBPARTS (mask_type);
  poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);

  /* Calculate the maximum number of scalar values that the rgroup
     handles in total, the number that it handles for each iteration
     of the vector loop, and the number that it should skip during the
     first iteration of the vector loop.  */
  tree nscalars_total = niters;
  tree nscalars_step = build_int_cst (iv_type, vf);
  tree nscalars_skip = niters_skip;
  if (nscalars_per_iter != 1)
    {
      /* We checked before choosing to use a fully-masked loop that these
         multiplications don't overflow.  */
      tree compare_factor = build_int_cst (compare_type, nscalars_per_iter);
      tree iv_factor = build_int_cst (iv_type, nscalars_per_iter);
      nscalars_total = gimple_build (preheader_seq, MULT_EXPR, compare_type,
                                     nscalars_total, compare_factor);
      nscalars_step = gimple_build (preheader_seq, MULT_EXPR, iv_type,
                                    nscalars_step, iv_factor);
      if (nscalars_skip)
        nscalars_skip = gimple_build (preheader_seq, MULT_EXPR, compare_type,
                                      nscalars_skip, compare_factor);
    }

  /* Create an induction variable that counts the number of scalars
     processed.  */
  tree index_before_incr, index_after_incr;
  gimple_stmt_iterator incr_gsi;
  bool insert_after;
  standard_iv_increment_position (loop, &incr_gsi, &insert_after);
  create_iv (build_int_cst (iv_type, 0), nscalars_step, NULL_TREE, loop,
             &incr_gsi, insert_after, &index_before_incr, &index_after_incr);

  tree zero_index = build_int_cst (compare_type, 0);
  tree test_index, test_limit, first_limit;
  gimple_stmt_iterator *test_gsi;
  if (might_wrap_p)
    {
      /* In principle the loop should stop iterating once the incremented
         IV reaches a value greater than or equal to:

           NSCALARS_TOTAL +[infinite-prec] NSCALARS_SKIP

         However, there's no guarantee that this addition doesn't overflow
         the comparison type, or that the IV hits a value above it before
         wrapping around.  We therefore adjust the limit down by one
         IV step:

           (NSCALARS_TOTAL +[infinite-prec] NSCALARS_SKIP)
           -[infinite-prec] NSCALARS_STEP

         and compare the IV against this limit _before_ incrementing it.
         Since the comparison type is unsigned, we actually want the
         subtraction to saturate at zero:

           (NSCALARS_TOTAL +[infinite-prec] NSCALARS_SKIP)
           -[sat] NSCALARS_STEP

         And since NSCALARS_SKIP < NSCALARS_STEP, we can reassociate this as:

           NSCALARS_TOTAL -[sat] (NSCALARS_STEP - NSCALARS_SKIP)

         where the rightmost subtraction can be done directly in
         COMPARE_TYPE.  */
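      /* Illustration only (hypothetical numbers, continuing the example
         above): with NSCALARS_TOTAL == 2000, NSCALARS_STEP == 16 and
         NSCALARS_SKIP == 6 the adjustment below is 16 - 6 == 10, so the
         latch limit becomes MAX (2000, 10) - 10 == 1990, which the
         pre-increment IV (0, 16, 32, ...) can be compared against in
         COMPARE_TYPE without ever having to reach 2006.  */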
      test_index = index_before_incr;
      tree adjust = gimple_convert (preheader_seq, compare_type,
                                    nscalars_step);
      if (nscalars_skip)
        adjust = gimple_build (preheader_seq, MINUS_EXPR, compare_type,
                               adjust, nscalars_skip);
      test_limit = gimple_build (preheader_seq, MAX_EXPR, compare_type,
                                 nscalars_total, adjust);
      test_limit = gimple_build (preheader_seq, MINUS_EXPR, compare_type,
                                 test_limit, adjust);
      test_gsi = &incr_gsi;

      /* Get a safe limit for the first iteration.  */
      if (nscalars_skip)
        {
          /* The first vector iteration can handle at most NSCALARS_STEP
             scalars.  NSCALARS_STEP <= CONST_LIMIT, and adding
             NSCALARS_SKIP to that cannot overflow.  */
          tree const_limit = build_int_cst (compare_type,
                                            LOOP_VINFO_VECT_FACTOR (loop_vinfo)
                                            * nscalars_per_iter);
          first_limit = gimple_build (preheader_seq, MIN_EXPR, compare_type,
                                      nscalars_total, const_limit);
          first_limit = gimple_build (preheader_seq, PLUS_EXPR, compare_type,
                                      first_limit, nscalars_skip);
        }
      else
        /* For the first iteration it doesn't matter whether the IV hits
           a value above NSCALARS_TOTAL.  That only matters for the latch
           condition.  */
        first_limit = nscalars_total;
    }
  else
    {
      /* Test the incremented IV, which will always hit a value above
         the bound before wrapping.  */
      test_index = index_after_incr;
      test_limit = nscalars_total;
      if (nscalars_skip)
        test_limit = gimple_build (preheader_seq, PLUS_EXPR, compare_type,
                                   test_limit, nscalars_skip);
      test_gsi = &loop_cond_gsi;

      first_limit = test_limit;
    }

  /* Convert the IV value to the comparison type (either a no-op or
     a demotion).  */
  gimple_seq test_seq = NULL;
  test_index = gimple_convert (&test_seq, compare_type, test_index);
  gsi_insert_seq_before (test_gsi, test_seq, GSI_SAME_STMT);

  /* Provide a definition of each mask in the group.  */
  tree next_mask = NULL_TREE;
  tree mask;
  unsigned int i;
  FOR_EACH_VEC_ELT_REVERSE (rgm->masks, i, mask)
    {
      /* Previous masks will cover BIAS scalars.  This mask covers the
         next batch.  */
      poly_uint64 bias = nscalars_per_mask * i;
      tree bias_tree = build_int_cst (compare_type, bias);
      gimple *tmp_stmt;

      /* See whether the first iteration of the vector loop is known
         to have a full mask.  */
      poly_uint64 const_limit;
      bool first_iteration_full
        = (poly_int_tree_p (first_limit, &const_limit)
           && known_ge (const_limit, (i + 1) * nscalars_per_mask));

      /* Rather than have a new IV that starts at BIAS and goes up to
         TEST_LIMIT, prefer to use the same 0-based IV for each mask
         and adjust the bound down by BIAS.  */
      tree this_test_limit = test_limit;
      if (i != 0)
        {
          this_test_limit = gimple_build (preheader_seq, MAX_EXPR,
                                          compare_type, this_test_limit,
                                          bias_tree);
          this_test_limit = gimple_build (preheader_seq, MINUS_EXPR,
                                          compare_type, this_test_limit,
                                          bias_tree);
        }

      /* Create the initial mask.  First include all scalars that
         are within the loop limit.  */
      tree init_mask = NULL_TREE;
      if (!first_iteration_full)
        {
          tree start, end;
          if (first_limit == test_limit)
            {
              /* Use a natural test between zero (the initial IV value)
                 and the loop limit.  The "else" block would be valid too,
                 but this choice can avoid the need to load BIAS_TREE into
                 a register.  */
              start = zero_index;
              end = this_test_limit;
            }
          else
            {
              /* FIRST_LIMIT is the maximum number of scalars handled by the
                 first iteration of the vector loop.  Test the portion
                 associated with this mask.  */
              start = bias_tree;
              end = first_limit;
            }

          init_mask = make_temp_ssa_name (mask_type, NULL, "max_mask");
          tmp_stmt = vect_gen_while (init_mask, start, end);
          gimple_seq_add_stmt (preheader_seq, tmp_stmt);
        }

      /* Now AND out the bits that are within the number of skipped
         scalars.  */
      poly_uint64 const_skip;
      if (nscalars_skip
          && !(poly_int_tree_p (nscalars_skip, &const_skip)
               && known_le (const_skip, bias)))
        {
          tree unskipped_mask = vect_gen_while_not (preheader_seq, mask_type,
                                                    bias_tree, nscalars_skip);
          if (init_mask)
            init_mask = gimple_build (preheader_seq, BIT_AND_EXPR, mask_type,
                                      init_mask, unskipped_mask);
          else
            init_mask = unskipped_mask;
        }

      if (!init_mask)
        /* First iteration is full.  */
        init_mask = build_minus_one_cst (mask_type);

      /* Get the mask value for the next iteration of the loop.  */
      next_mask = make_temp_ssa_name (mask_type, NULL, "next_mask");
      gcall *call = vect_gen_while (next_mask, test_index, this_test_limit);
      gsi_insert_before (test_gsi, call, GSI_SAME_STMT);

      vect_set_loop_mask (loop, mask, init_mask, next_mask);
    }
  return next_mask;
}

/* Make LOOP iterate NITERS times using masking and WHILE_ULT calls.
   LOOP_VINFO describes the vectorization of LOOP.  NITERS is the
   number of iterations of the original scalar loop that should be
   handled by the vector loop.  NITERS_MAYBE_ZERO and FINAL_IV are
   as for vect_set_loop_condition.

   Insert the branch-back condition before LOOP_COND_GSI and return the
   final gcond.  */

static gcond *
vect_set_loop_condition_masked (class loop *loop, loop_vec_info loop_vinfo,
                                tree niters, tree final_iv,
                                bool niters_maybe_zero,
                                gimple_stmt_iterator loop_cond_gsi)
{
  gimple_seq preheader_seq = NULL;
  gimple_seq header_seq = NULL;

  tree compare_type = LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo);
  unsigned int compare_precision = TYPE_PRECISION (compare_type);
  tree orig_niters = niters;

  /* Type of the initial value of NITERS.  */
  tree ni_actual_type = TREE_TYPE (niters);
  unsigned int ni_actual_precision = TYPE_PRECISION (ni_actual_type);
  tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);

  /* Convert NITERS to the same size as the compare.  */
  if (compare_precision > ni_actual_precision
      && niters_maybe_zero)
    {
      /* We know that there is always at least one iteration, so if the
         count is zero then it must have wrapped.  Cope with this by
         subtracting 1 before the conversion and adding 1 to the result.  */
      gcc_assert (TYPE_UNSIGNED (ni_actual_type));
      niters = gimple_build (&preheader_seq, PLUS_EXPR, ni_actual_type,
                             niters, build_minus_one_cst (ni_actual_type));
      niters = gimple_convert (&preheader_seq, compare_type, niters);
      niters = gimple_build (&preheader_seq, PLUS_EXPR, compare_type,
                             niters, build_one_cst (compare_type));
    }
  else
    niters = gimple_convert (&preheader_seq, compare_type, niters);

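  /* Worked example for the conversion above (hypothetical types and
     values, not from the original sources): if NITERS has an unsigned
     8-bit type and wrapped to 0 because the scalar loop really runs 256
     times, while COMPARE_TYPE is 16 bits wide, then 0 - 1 gives 255,
     the conversion keeps 255, and adding 1 back yields 256; converting
     first would have produced 0.  */
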
  widest_int iv_limit = vect_iv_limit_for_full_masking (loop_vinfo);

  /* Iterate over all the rgroups and fill in their masks.  We could use
     the first mask from any rgroup for the loop condition; here we
     arbitrarily pick the last.  */
  tree test_mask = NULL_TREE;
  rgroup_masks *rgm;
  unsigned int i;
  vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
  FOR_EACH_VEC_ELT (*masks, i, rgm)
    if (!rgm->masks.is_empty ())
      {
        /* First try using permutes.  This adds a single vector
           instruction to the loop for each mask, but needs no extra
           loop invariants or IVs.  */
        unsigned int nmasks = i + 1;
        if ((nmasks & 1) == 0)
          {
            rgroup_masks *half_rgm = &(*masks)[nmasks / 2 - 1];
            if (!half_rgm->masks.is_empty ()
                && vect_maybe_permute_loop_masks (loop_vinfo, &header_seq,
                                                  rgm, half_rgm))
              continue;
          }

        /* See whether zero-based IV would ever generate all-false masks
           before wrapping around.  */
        bool might_wrap_p
          = (iv_limit == -1
             || (wi::min_precision (iv_limit * rgm->max_nscalars_per_iter,
                                    UNSIGNED)
                 > compare_precision));

        /* Set up all masks for this group.  */
        test_mask = vect_set_loop_masks_directly (loop, loop_vinfo,
                                                  &preheader_seq,
                                                  loop_cond_gsi, rgm,
                                                  niters, niters_skip,
                                                  might_wrap_p);
      }

  /* Emit all accumulated statements.  */
  add_preheader_seq (loop, preheader_seq);
  add_header_seq (loop, header_seq);

  /* Get a boolean result that tells us whether to iterate.  */
  edge exit_edge = single_exit (loop);
  tree_code code = (exit_edge->flags & EDGE_TRUE_VALUE) ? EQ_EXPR : NE_EXPR;
  tree zero_mask = build_zero_cst (TREE_TYPE (test_mask));
  gcond *cond_stmt = gimple_build_cond (code, test_mask, zero_mask,
                                        NULL_TREE, NULL_TREE);
  gsi_insert_before (&loop_cond_gsi, cond_stmt, GSI_SAME_STMT);

  /* The loop iterates (NITERS - 1) / VF + 1 times.
     Subtract one from this to get the latch count.  */
  tree step = build_int_cst (compare_type,
                             LOOP_VINFO_VECT_FACTOR (loop_vinfo));
  tree niters_minus_one = fold_build2 (PLUS_EXPR, compare_type, niters,
                                       build_minus_one_cst (compare_type));
  loop->nb_iterations = fold_build2 (TRUNC_DIV_EXPR, compare_type,
                                     niters_minus_one, step);

  if (final_iv)
    {
      gassign *assign = gimple_build_assign (final_iv, orig_niters);
      gsi_insert_on_edge_immediate (single_exit (loop), assign);
    }

  return cond_stmt;
}

/* Like vect_set_loop_condition, but handle the case in which there
   are no loop masks.  */

static gcond *
vect_set_loop_condition_unmasked (class loop *loop, tree niters,
                                  tree step, tree final_iv,
                                  bool niters_maybe_zero,
                                  gimple_stmt_iterator loop_cond_gsi)
{
  tree indx_before_incr, indx_after_incr;
  gcond *cond_stmt;
  gcond *orig_cond;
  edge pe = loop_preheader_edge (loop);
  edge exit_edge = single_exit (loop);
  gimple_stmt_iterator incr_gsi;
  bool insert_after;
  enum tree_code code;
  tree niters_type = TREE_TYPE (niters);

  orig_cond = get_loop_exit_condition (loop);
  gcc_assert (orig_cond);
  loop_cond_gsi = gsi_for_stmt (orig_cond);

  tree init, limit;
  if (!niters_maybe_zero && integer_onep (step))
    {
      /* In this case we can use a simple 0-based IV:

         A:
           x = 0;
           do
             {
               ...
               x += 1;
             }
           while (x < NITERS);  */
      code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GE_EXPR : LT_EXPR;
      init = build_zero_cst (niters_type);
      limit = niters;
    }
  else
    {
      /* The following works for all values of NITERS except 0:

         B:
           x = 0;
           do
             {
               ...
               x += STEP;
             }
           while (x <= NITERS - STEP);

         so that the loop continues to iterate if x + STEP - 1 < NITERS
         but stops if x + STEP - 1 >= NITERS.

         However, if NITERS is zero, x never hits a value above NITERS - STEP
         before wrapping around.  There are two obvious ways of dealing with
         this:

         - start at STEP - 1 and compare x before incrementing it
         - start at -1 and compare x after incrementing it

         The latter is simpler and is what we use.  The loop in this case
         looks like:

         C:
           x = -1;
           do
             {
               ...
               x += STEP;
             }
           while (x < NITERS - STEP);

         In both cases the loop limit is NITERS - STEP.  */
      gimple_seq seq = NULL;
      limit = force_gimple_operand (niters, &seq, true, NULL_TREE);
      limit = gimple_build (&seq, MINUS_EXPR, TREE_TYPE (limit), limit, step);
      if (seq)
        {
          basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
          gcc_assert (!new_bb);
        }
      if (niters_maybe_zero)
        {
          /* Case C.  */
          code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GE_EXPR : LT_EXPR;
          init = build_all_ones_cst (niters_type);
        }
      else
        {
          /* Case B.  */
          code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GT_EXPR : LE_EXPR;
          init = build_zero_cst (niters_type);
        }
    }

  standard_iv_increment_position (loop, &incr_gsi, &insert_after);
  create_iv (init, step, NULL_TREE, loop,
             &incr_gsi, insert_after, &indx_before_incr, &indx_after_incr);
  indx_after_incr = force_gimple_operand_gsi (&loop_cond_gsi, indx_after_incr,
                                              true, NULL_TREE, true,
                                              GSI_SAME_STMT);
  limit = force_gimple_operand_gsi (&loop_cond_gsi, limit, true, NULL_TREE,
                                    true, GSI_SAME_STMT);

  cond_stmt = gimple_build_cond (code, indx_after_incr, limit, NULL_TREE,
                                 NULL_TREE);

  gsi_insert_before (&loop_cond_gsi, cond_stmt, GSI_SAME_STMT);

  /* Record the number of latch iterations.  */
  if (limit == niters)
    /* Case A: the loop iterates NITERS times.  Subtract one to get the
       latch count.  */
    loop->nb_iterations = fold_build2 (MINUS_EXPR, niters_type, niters,
                                       build_int_cst (niters_type, 1));
  else
    /* Case B or C: the loop iterates (NITERS - STEP) / STEP + 1 times.
       Subtract one from this to get the latch count.  */
    loop->nb_iterations = fold_build2 (TRUNC_DIV_EXPR, niters_type,
                                       limit, step);

  if (final_iv)
    {
      gassign *assign = gimple_build_assign (final_iv, MINUS_EXPR,
                                             indx_after_incr, init);
      gsi_insert_on_edge_immediate (single_exit (loop), assign);
    }

  return cond_stmt;
}

/* If we're using fully-masked loops, make LOOP iterate:

      N == (NITERS - 1) / STEP + 1

   times.  When NITERS is zero, this is equivalent to making the loop
   execute (1 << M) / STEP times, where M is the precision of NITERS.
   NITERS_MAYBE_ZERO is true if this last case might occur.

   If we're not using fully-masked loops, make LOOP iterate:

      N == (NITERS - STEP) / STEP + 1

   times, where NITERS is known to be outside the range [1, STEP - 1].
   This is equivalent to making the loop execute NITERS / STEP times
   when NITERS is nonzero and (1 << M) / STEP times otherwise.
   NITERS_MAYBE_ZERO again indicates whether this last case might occur.

   If FINAL_IV is nonnull, it is an SSA name that should be set to
   N * STEP on exit from the loop.

   Assumption: the exit-condition of LOOP is the last stmt in the loop.  */

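/* Worked example for the two formulas above (hypothetical numbers, not
   from the original sources): with NITERS == 10 and STEP == 4, a
   fully-masked loop runs N == (10 - 1) / 4 + 1 == 3 times, the last
   iteration having only two active lanes; without masking, where NITERS
   is known to be outside [1, 3], N == (10 - 4) / 4 + 1 == 2, i.e.
   NITERS / STEP rounded down.  */
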
void
vect_set_loop_condition (class loop *loop, loop_vec_info loop_vinfo,
                         tree niters, tree step, tree final_iv,
                         bool niters_maybe_zero)
{
  gcond *cond_stmt;
  gcond *orig_cond = get_loop_exit_condition (loop);
  gimple_stmt_iterator loop_cond_gsi = gsi_for_stmt (orig_cond);

  if (loop_vinfo && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
    cond_stmt = vect_set_loop_condition_masked (loop, loop_vinfo, niters,
                                                final_iv, niters_maybe_zero,
                                                loop_cond_gsi);
  else
    cond_stmt = vect_set_loop_condition_unmasked (loop, niters, step,
                                                  final_iv, niters_maybe_zero,
                                                  loop_cond_gsi);

  /* Remove old loop exit test.  */
  stmt_vec_info orig_cond_info;
  if (loop_vinfo
      && (orig_cond_info = loop_vinfo->lookup_stmt (orig_cond)))
    loop_vinfo->remove_stmt (orig_cond_info);
  else
    gsi_remove (&loop_cond_gsi, true);

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "New loop exit condition: %G",
                     cond_stmt);
}

/* Helper routine of slpeel_tree_duplicate_loop_to_edge_cfg.
   For all PHI arguments in FROM->dest and TO->dest from those
   edges ensure that TO->dest PHI arguments have current_def
   to that in from.  */

static void
slpeel_duplicate_current_defs_from_edges (edge from, edge to)
{
  gimple_stmt_iterator gsi_from, gsi_to;

  for (gsi_from = gsi_start_phis (from->dest),
       gsi_to = gsi_start_phis (to->dest);
       !gsi_end_p (gsi_from) && !gsi_end_p (gsi_to);)
    {
      gimple *from_phi = gsi_stmt (gsi_from);
      gimple *to_phi = gsi_stmt (gsi_to);
      tree from_arg = PHI_ARG_DEF_FROM_EDGE (from_phi, from);
      tree to_arg = PHI_ARG_DEF_FROM_EDGE (to_phi, to);
      if (virtual_operand_p (from_arg))
        {
          gsi_next (&gsi_from);
          continue;
        }
      if (virtual_operand_p (to_arg))
        {
          gsi_next (&gsi_to);
          continue;
        }
      if (TREE_CODE (from_arg) != SSA_NAME)
        gcc_assert (operand_equal_p (from_arg, to_arg, 0));
      else if (TREE_CODE (to_arg) == SSA_NAME
               && from_arg != to_arg)
        {
          if (get_current_def (to_arg) == NULL_TREE)
            {
              gcc_assert (types_compatible_p (TREE_TYPE (to_arg),
                                              TREE_TYPE (get_current_def
                                                           (from_arg))));
              set_current_def (to_arg, get_current_def (from_arg));
            }
        }
      gsi_next (&gsi_from);
      gsi_next (&gsi_to);
    }

  gphi *from_phi = get_virtual_phi (from->dest);
  gphi *to_phi = get_virtual_phi (to->dest);
  if (from_phi)
    set_current_def (PHI_ARG_DEF_FROM_EDGE (to_phi, to),
                     get_current_def (PHI_ARG_DEF_FROM_EDGE (from_phi, from)));
}

/* Given LOOP this function generates a new copy of it and puts it
   on E which is either the entry or exit of LOOP.  If SCALAR_LOOP is
   non-NULL, assume LOOP and SCALAR_LOOP are equivalent and copy the
   basic blocks from SCALAR_LOOP instead of LOOP, but to either the
   entry or exit of LOOP.  */

class loop *
slpeel_tree_duplicate_loop_to_edge_cfg (class loop *loop,
                                        class loop *scalar_loop, edge e)
{
  class loop *new_loop;
  basic_block *new_bbs, *bbs, *pbbs;
  bool at_exit;
  bool was_imm_dom;
  basic_block exit_dest;
  edge exit, new_exit;
  bool duplicate_outer_loop = false;

  exit = single_exit (loop);
  at_exit = (e == exit);
  if (!at_exit && e != loop_preheader_edge (loop))
    return NULL;

  if (scalar_loop == NULL)
    scalar_loop = loop;

  bbs = XNEWVEC (basic_block, scalar_loop->num_nodes + 1);
  pbbs = bbs + 1;
  get_loop_body_with_size (scalar_loop, pbbs, scalar_loop->num_nodes);
  /* Allow duplication of outer loops.  */
  if (scalar_loop->inner)
    duplicate_outer_loop = true;
  /* Check whether duplication is possible.  */
  if (!can_copy_bbs_p (pbbs, scalar_loop->num_nodes))
    {
      free (bbs);
      return NULL;
    }

  /* Generate new loop structure.  */
  new_loop = duplicate_loop (scalar_loop, loop_outer (scalar_loop));
  duplicate_subloops (scalar_loop, new_loop);

  exit_dest = exit->dest;
  was_imm_dom = (get_immediate_dominator (CDI_DOMINATORS,
                                          exit_dest) == loop->header ?
                 true : false);

  /* Also copy the pre-header, this avoids jumping through hoops to
     duplicate the loop entry PHI arguments.  Create an empty
     pre-header unconditionally for this.  */
  basic_block preheader = split_edge (loop_preheader_edge (scalar_loop));
  edge entry_e = single_pred_edge (preheader);
  bbs[0] = preheader;
  new_bbs = XNEWVEC (basic_block, scalar_loop->num_nodes + 1);

  exit = single_exit (scalar_loop);
  copy_bbs (bbs, scalar_loop->num_nodes + 1, new_bbs,
            &exit, 1, &new_exit, NULL,
            at_exit ? loop->latch : e->src, true);
  exit = single_exit (loop);
  basic_block new_preheader = new_bbs[0];

  add_phi_args_after_copy (new_bbs, scalar_loop->num_nodes + 1, NULL);

  if (scalar_loop != loop)
    {
      /* If we copied from SCALAR_LOOP rather than LOOP, SSA_NAMEs from
         SCALAR_LOOP will have current_def set to SSA_NAMEs in the new_loop,
         but LOOP will not.  slpeel_update_phi_nodes_for_guard{1,2} expects
         the LOOP SSA_NAMEs (on the exit edge and edge from latch to
         header) to have current_def set, so copy them over.  */
      slpeel_duplicate_current_defs_from_edges (single_exit (scalar_loop),
                                                exit);
      slpeel_duplicate_current_defs_from_edges (EDGE_SUCC (scalar_loop->latch,
                                                           0),
                                                EDGE_SUCC (loop->latch, 0));
    }

  if (at_exit) /* Add the loop copy at exit.  */
    {
      if (scalar_loop != loop)
        {
          gphi_iterator gsi;
          new_exit = redirect_edge_and_branch (new_exit, exit_dest);

          for (gsi = gsi_start_phis (exit_dest); !gsi_end_p (gsi);
               gsi_next (&gsi))
            {
              gphi *phi = gsi.phi ();
              tree orig_arg = PHI_ARG_DEF_FROM_EDGE (phi, e);
              location_t orig_locus
                = gimple_phi_arg_location_from_edge (phi, e);

              add_phi_arg (phi, orig_arg, new_exit, orig_locus);
            }
        }
      redirect_edge_and_branch_force (e, new_preheader);
      flush_pending_stmts (e);
      set_immediate_dominator (CDI_DOMINATORS, new_preheader, e->src);
      if (was_imm_dom || duplicate_outer_loop)
        set_immediate_dominator (CDI_DOMINATORS, exit_dest, new_exit->src);

      /* And remove the non-necessary forwarder again.  Keep the other
         one so we have a proper pre-header for the loop at the exit edge.  */
      redirect_edge_pred (single_succ_edge (preheader),
                          single_pred (preheader));
      delete_basic_block (preheader);
      set_immediate_dominator (CDI_DOMINATORS, scalar_loop->header,
                               loop_preheader_edge (scalar_loop)->src);
    }
  else /* Add the copy at entry.  */
    {
      if (scalar_loop != loop)
        {
          /* Remove the non-necessary forwarder of scalar_loop again.  */
          redirect_edge_pred (single_succ_edge (preheader),
                              single_pred (preheader));
          delete_basic_block (preheader);
          set_immediate_dominator (CDI_DOMINATORS, scalar_loop->header,
                                   loop_preheader_edge (scalar_loop)->src);
          preheader = split_edge (loop_preheader_edge (loop));
          entry_e = single_pred_edge (preheader);
        }

      redirect_edge_and_branch_force (entry_e, new_preheader);
      flush_pending_stmts (entry_e);
      set_immediate_dominator (CDI_DOMINATORS, new_preheader, entry_e->src);

      redirect_edge_and_branch_force (new_exit, preheader);
      flush_pending_stmts (new_exit);
      set_immediate_dominator (CDI_DOMINATORS, preheader, new_exit->src);

      /* And remove the non-necessary forwarder again.  Keep the other
         one so we have a proper pre-header for the loop at the exit edge.  */
      redirect_edge_pred (single_succ_edge (new_preheader),
                          single_pred (new_preheader));
      delete_basic_block (new_preheader);
      set_immediate_dominator (CDI_DOMINATORS, new_loop->header,
                               loop_preheader_edge (new_loop)->src);
    }

  /* Skip new preheader since it's deleted if copy loop is added at entry.  */
  for (unsigned i = (at_exit ? 0 : 1); i < scalar_loop->num_nodes + 1; i++)
    rename_variables_in_bb (new_bbs[i], duplicate_outer_loop);

  if (scalar_loop != loop)
    {
      /* Update new_loop->header PHIs, so that on the preheader
         edge they are the ones from loop rather than scalar_loop.  */
      gphi_iterator gsi_orig, gsi_new;
      edge orig_e = loop_preheader_edge (loop);
      edge new_e = loop_preheader_edge (new_loop);

      for (gsi_orig = gsi_start_phis (loop->header),
           gsi_new = gsi_start_phis (new_loop->header);
           !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_new);
           gsi_next (&gsi_orig), gsi_next (&gsi_new))
        {
          gphi *orig_phi = gsi_orig.phi ();
          gphi *new_phi = gsi_new.phi ();
          tree orig_arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, orig_e);
          location_t orig_locus
            = gimple_phi_arg_location_from_edge (orig_phi, orig_e);

          add_phi_arg (new_phi, orig_arg, new_e, orig_locus);
        }
    }

  free (new_bbs);
  free (bbs);

  checking_verify_dominators (CDI_DOMINATORS);

  return new_loop;
}


/* Given the condition expression COND, put it as the last statement of
   GUARD_BB; set both edges' probability; set dominator of GUARD_TO to
   DOM_BB; return the skip edge.  GUARD_TO is the target basic block to
   skip the loop.  PROBABILITY is the skip edge's probability.  Mark the
   new edge as irreducible if IRREDUCIBLE_P is true.  */

static edge
slpeel_add_loop_guard (basic_block guard_bb, tree cond,
                       basic_block guard_to, basic_block dom_bb,
                       profile_probability probability, bool irreducible_p)
{
  gimple_stmt_iterator gsi;
  edge new_e, enter_e;
  gcond *cond_stmt;
  gimple_seq gimplify_stmt_list = NULL;

  enter_e = EDGE_SUCC (guard_bb, 0);
  enter_e->flags &= ~EDGE_FALLTHRU;
  enter_e->flags |= EDGE_FALSE_VALUE;
  gsi = gsi_last_bb (guard_bb);

  cond = force_gimple_operand_1 (cond, &gimplify_stmt_list, is_gimple_condexpr,
                                 NULL_TREE);
  if (gimplify_stmt_list)
    gsi_insert_seq_after (&gsi, gimplify_stmt_list, GSI_NEW_STMT);

  cond_stmt = gimple_build_cond_from_tree (cond, NULL_TREE, NULL_TREE);
  gsi = gsi_last_bb (guard_bb);
  gsi_insert_after (&gsi, cond_stmt, GSI_NEW_STMT);

  /* Add new edge to connect guard block to the merge/loop-exit block.  */
  new_e = make_edge (guard_bb, guard_to, EDGE_TRUE_VALUE);

  new_e->probability = probability;
  if (irreducible_p)
    new_e->flags |= EDGE_IRREDUCIBLE_LOOP;

  enter_e->probability = probability.invert ();
  set_immediate_dominator (CDI_DOMINATORS, guard_to, dom_bb);

  /* Split enter_e to preserve LOOPS_HAVE_PREHEADERS.  */
  if (enter_e->dest->loop_father->header == enter_e->dest)
    split_edge (enter_e);

  return new_e;
}

/* This function verifies that the following restrictions apply to LOOP:
   (1) it consists of exactly 2 basic blocks - header, and an empty latch
       for innermost loop and 5 basic blocks for outer-loop.
   (2) it is single entry, single exit
   (3) its exit condition is the last stmt in the header
   (4) E is the entry/exit edge of LOOP.
 */

bool
slpeel_can_duplicate_loop_p (const class loop *loop, const_edge e)
{
  edge exit_e = single_exit (loop);
  edge entry_e = loop_preheader_edge (loop);
  gcond *orig_cond = get_loop_exit_condition (loop);
  gimple_stmt_iterator loop_exit_gsi = gsi_last_bb (exit_e->src);
  unsigned int num_bb = loop->inner? 5 : 2;

  /* All loops have an outer scope; the only case loop->outer is NULL is for
     the function itself.  */
  if (!loop_outer (loop)
      || loop->num_nodes != num_bb
      || !empty_block_p (loop->latch)
      || !single_exit (loop)
      /* Verify that new loop exit condition can be trivially modified.  */
      || (!orig_cond || orig_cond != gsi_stmt (loop_exit_gsi))
      || (e != exit_e && e != entry_e))
    return false;

  return true;
}

/* If the loop has a virtual PHI, but exit bb doesn't, create a virtual PHI
   in the exit bb and rename all the uses after the loop.  This simplifies
   the *guard[12] routines, which assume loop closed SSA form for all PHIs
   (but normally loop closed SSA form doesn't require virtual PHIs to be
   in the same form).  Doing this early simplifies the checking what
   uses should be renamed.  */

static void
create_lcssa_for_virtual_phi (class loop *loop)
{
  gphi_iterator gsi;
  edge exit_e = single_exit (loop);

  for (gsi = gsi_start_phis (loop->header); !gsi_end_p (gsi); gsi_next (&gsi))
    if (virtual_operand_p (gimple_phi_result (gsi_stmt (gsi))))
      {
        gphi *phi = gsi.phi ();
        for (gsi = gsi_start_phis (exit_e->dest);
             !gsi_end_p (gsi); gsi_next (&gsi))
          if (virtual_operand_p (gimple_phi_result (gsi_stmt (gsi))))
            break;
        if (gsi_end_p (gsi))
          {
            tree new_vop = copy_ssa_name (PHI_RESULT (phi));
            gphi *new_phi = create_phi_node (new_vop, exit_e->dest);
            tree vop = PHI_ARG_DEF_FROM_EDGE (phi, EDGE_SUCC (loop->latch, 0));
            imm_use_iterator imm_iter;
            gimple *stmt;
            use_operand_p use_p;

            SSA_NAME_OCCURS_IN_ABNORMAL_PHI (new_vop)
              = SSA_NAME_OCCURS_IN_ABNORMAL_PHI (vop);
            add_phi_arg (new_phi, vop, exit_e, UNKNOWN_LOCATION);
            gimple_phi_set_result (new_phi, new_vop);
            FOR_EACH_IMM_USE_STMT (stmt, imm_iter, vop)
              if (stmt != new_phi
                  && !flow_bb_inside_loop_p (loop, gimple_bb (stmt)))
                FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
                  SET_USE (use_p, new_vop);
          }
        break;
      }

}

/* Function vect_get_loop_location.

   Extract the location of the loop in the source code.
   If the loop is not well formed for vectorization, an estimated
   location is calculated.
   Return the loop location if succeed and NULL if not.  */

dump_user_location_t
find_loop_location (class loop *loop)
{
  gimple *stmt = NULL;
  basic_block bb;
  gimple_stmt_iterator si;

  if (!loop)
    return dump_user_location_t ();

  stmt = get_loop_exit_condition (loop);

  if (stmt
      && LOCATION_LOCUS (gimple_location (stmt)) > BUILTINS_LOCATION)
    return stmt;

  /* If we got here the loop is probably not "well formed",
     try to estimate the loop location */

  if (!loop->header)
    return dump_user_location_t ();

  bb = loop->header;

  for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
    {
      stmt = gsi_stmt (si);
      if (LOCATION_LOCUS (gimple_location (stmt)) > BUILTINS_LOCATION)
        return stmt;
    }

  return dump_user_location_t ();
}

/* Return true if the phi described by STMT_INFO defines an IV of the
   loop to be vectorized.  */

static bool
iv_phi_p (stmt_vec_info stmt_info)
{
  gphi *phi = as_a <gphi *> (stmt_info->stmt);
  if (virtual_operand_p (PHI_RESULT (phi)))
    return false;

  if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
      || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
    return false;

  return true;
}

/* Function vect_can_advance_ivs_p

   In case the number of iterations that LOOP iterates is unknown at compile
   time, an epilog loop will be generated, and the loop induction variables
   (IVs) will be "advanced" to the value they are supposed to take just before
   the epilog loop.  Here we check that the access function of the loop IVs
   and the expression that represents the loop bound are simple enough.
   These restrictions will be relaxed in the future.  */

bool
vect_can_advance_ivs_p (loop_vec_info loop_vinfo)
{
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  basic_block bb = loop->header;
  gphi_iterator gsi;

  /* Analyze phi functions of the loop header.  */

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "vect_can_advance_ivs_p:\n");
  for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
    {
      tree evolution_part;

      gphi *phi = gsi.phi ();
      stmt_vec_info phi_info = loop_vinfo->lookup_stmt (phi);
      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
                         phi_info->stmt);

      /* Skip virtual phi's.  The data dependences that are associated with
         virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.

         Skip reduction phis.  */
      if (!iv_phi_p (phi_info))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "reduc or virtual phi. skip.\n");
          continue;
        }

      /* Analyze the evolution function.  */

      evolution_part = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
      if (evolution_part == NULL_TREE)
        {
          if (dump_enabled_p ())
            dump_printf (MSG_MISSED_OPTIMIZATION,
                         "No access function or evolution.\n");
          return false;
        }

      /* FORNOW: We do not transform initial conditions of IVs
         which evolution functions are not invariants in the loop.  */

      if (!expr_invariant_in_loop_p (loop, evolution_part))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "evolution not invariant in loop.\n");
          return false;
        }

      /* FORNOW: We do not transform initial conditions of IVs
         which evolution functions are a polynomial of degree >= 2.  */

      if (tree_is_chrec (evolution_part))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "evolution is chrec.\n");
          return false;
        }
    }

  return true;
}


1424/* Function vect_update_ivs_after_vectorizer.
1425
1426 "Advance" the induction variables of LOOP to the value they should take
1427 after the execution of LOOP. This is currently necessary because the
1428 vectorizer does not handle induction variables that are used after the
1429 loop. Such a situation occurs when the last iterations of LOOP are
1430 peeled, because:
1431 1. We introduced new uses after LOOP for IVs that were not originally used
1432 after LOOP: the IVs of LOOP are now used by an epilog loop.
1433 2. LOOP is going to be vectorized; this means that it will iterate N/VF
1434 times, whereas the loop IVs should be bumped N times.
1435
1436 Input:
1437 - LOOP - a loop that is going to be vectorized. The last few iterations
1438 of LOOP were peeled.
1439 - NITERS - the number of iterations that LOOP executes (before it is
1440 vectorized). i.e, the number of times the ivs should be bumped.
1441 - UPDATE_E - a successor edge of LOOP->exit that is on the (only) path
1442 coming out from LOOP on which there are uses of the LOOP ivs
1443 (this is the path from LOOP->exit to epilog_loop->preheader).
1444
1445 The new definitions of the ivs are placed in LOOP->exit.
1446 The phi args associated with the edge UPDATE_E in the bb
1447 UPDATE_E->dest are updated accordingly.
1448
1449 Assumption 1: Like the rest of the vectorizer, this function assumes
1450 a single loop exit that has a single predecessor.
1451
1452 Assumption 2: The phi nodes in the LOOP header and in update_bb are
1453 organized in the same order.
1454
1455 Assumption 3: The access function of the ivs is simple enough (see
1456 vect_can_advance_ivs_p). This assumption will be relaxed in the future.
1457
1458 Assumption 4: Exactly one of the successors of LOOP exit-bb is on a path
b8698a0f 1459 coming out of LOOP on which the ivs of LOOP are used (this is the path
ebfd146a
IR
1460 that leads to the epilog loop; other paths skip the epilog loop). This
1461 path starts with the edge UPDATE_E, and its destination (denoted update_bb)
1462 needs to have its phis updated.
1463 */
1464
1465static void
a5e3d614
BC
1466vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo,
1467 tree niters, edge update_e)
ebfd146a 1468{
538dd0b7 1469 gphi_iterator gsi, gsi1;
99b1c316 1470 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
ebfd146a 1471 basic_block update_bb = update_e->dest;
a5e3d614 1472 basic_block exit_bb = single_exit (loop)->dest;
ebfd146a
IR
1473
1474 /* Make sure there exists a single-predecessor exit bb: */
1475 gcc_assert (single_pred_p (exit_bb));
a5e3d614 1476 gcc_assert (single_succ_edge (exit_bb) == update_e);
ebfd146a
IR
1477
1478 for (gsi = gsi_start_phis (loop->header), gsi1 = gsi_start_phis (update_bb);
1479 !gsi_end_p (gsi) && !gsi_end_p (gsi1);
1480 gsi_next (&gsi), gsi_next (&gsi1))
1481 {
ebfd146a 1482 tree init_expr;
550918ca
RG
1483 tree step_expr, off;
1484 tree type;
ebfd146a
IR
1485 tree var, ni, ni_name;
1486 gimple_stmt_iterator last_gsi;
1487
a5e3d614
BC
1488 gphi *phi = gsi.phi ();
1489 gphi *phi1 = gsi1.phi ();
91987857 1490 stmt_vec_info phi_info = loop_vinfo->lookup_stmt (phi);
73fbfcad 1491 if (dump_enabled_p ())
3c2a8ed0
DM
1492 dump_printf_loc (MSG_NOTE, vect_location,
1493 "vect_update_ivs_after_vectorizer: phi: %G", phi);
ebfd146a 1494
a5e3d614 1495 /* Skip reduction and virtual phis. */
82570274 1496 if (!iv_phi_p (phi_info))
ebfd146a 1497 {
73fbfcad 1498 if (dump_enabled_p ())
a5e3d614
BC
1499 dump_printf_loc (MSG_NOTE, vect_location,
1500 "reduc or virtual phi. skip.\n");
ebfd146a
IR
1501 continue;
1502 }
1503
0ac168a1 1504 type = TREE_TYPE (gimple_phi_result (phi));
91987857 1505 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
0ac168a1 1506 step_expr = unshare_expr (step_expr);
b8698a0f 1507
ebfd146a
IR
1508 /* FORNOW: We do not support IVs whose evolution function is a polynomial
1509 of degree >= 2 or exponential. */
0ac168a1 1510 gcc_assert (!tree_is_chrec (step_expr));
ebfd146a 1511
0ac168a1 1512 init_expr = PHI_ARG_DEF_FROM_EDGE (phi, loop_preheader_edge (loop));
ebfd146a 1513
550918ca
RG
1514 off = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
1515 fold_convert (TREE_TYPE (step_expr), niters),
1516 step_expr);
0ac168a1 1517 if (POINTER_TYPE_P (type))
5d49b6a7 1518 ni = fold_build_pointer_plus (init_expr, off);
ebfd146a 1519 else
0ac168a1
RG
1520 ni = fold_build2 (PLUS_EXPR, type,
1521 init_expr, fold_convert (type, off));
ebfd146a 1522
0ac168a1 1523 var = create_tmp_var (type, "tmp");
ebfd146a
IR
1524
1525 last_gsi = gsi_last_bb (exit_bb);
a5e3d614
BC
1526 gimple_seq new_stmts = NULL;
1527 ni_name = force_gimple_operand (ni, &new_stmts, false, var);
1528 /* Exit_bb shouldn't be empty. */
1529 if (!gsi_end_p (last_gsi))
1530 gsi_insert_seq_after (&last_gsi, new_stmts, GSI_SAME_STMT);
1531 else
1532 gsi_insert_seq_before (&last_gsi, new_stmts, GSI_SAME_STMT);
b8698a0f 1533
ebfd146a 1534 /* Fix phi expressions in the successor bb. */
684f25f4 1535 adjust_phi_and_debug_stmts (phi1, update_e, ni_name);
ebfd146a
IR
1536 }
1537}
1538
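/* Illustrative sketch (added for exposition; not part of the vectorizer):
   the value computed above for each induction variable, written on plain
   integers instead of GIMPLE trees.  The helper name advance_iv_example is
   made up for the example; the real code builds the equivalent expression
   with fold_build2 / fold_build_pointer_plus and inserts it in EXIT_BB.  */

static inline long
advance_iv_example (long init, long step, long niters)
{
  /* ni = init + niters * step: the IV value after the vector loop has
     executed NITERS scalar iterations.  */
  return init + niters * step;
}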
535e7c11
RS
1539/* Return a gimple value containing the misalignment (measured in vector
1540 elements) for the loop described by LOOP_VINFO, i.e. how many elements
1541 it is away from a perfectly aligned address. Add any new statements
1542 to SEQ. */
1543
1544static tree
1545get_misalign_in_elems (gimple **seq, loop_vec_info loop_vinfo)
1546{
1e5e6ff5 1547 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
89fa689a 1548 stmt_vec_info stmt_info = dr_info->stmt;
535e7c11
RS
1549 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1550
ca31798e
AV
1551 poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr_info);
1552 unsigned HOST_WIDE_INT target_align_c;
1553 tree target_align_minus_1;
535e7c11 1554
89fa689a
RS
1555 bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr),
1556 size_zero_node) < 0;
535e7c11
RS
1557 tree offset = (negative
1558 ? size_int (-TYPE_VECTOR_SUBPARTS (vectype) + 1)
1559 : size_zero_node);
78e02b3b 1560 tree start_addr = vect_create_addr_base_for_vector_ref (stmt_info, seq,
535e7c11
RS
1561 offset);
1562 tree type = unsigned_type_for (TREE_TYPE (start_addr));
ca31798e
AV
1563 if (target_align.is_constant (&target_align_c))
1564 target_align_minus_1 = build_int_cst (type, target_align_c - 1);
1565 else
1566 {
1567 tree vla = build_int_cst (type, target_align);
1568 tree vla_align = fold_build2 (BIT_AND_EXPR, type, vla,
1569 fold_build2 (MINUS_EXPR, type,
1570 build_int_cst (type, 0), vla));
1571 target_align_minus_1 = fold_build2 (MINUS_EXPR, type, vla_align,
1572 build_int_cst (type, 1));
1573 }
1574
535e7c11
RS
1575 HOST_WIDE_INT elem_size
1576 = int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1577 tree elem_size_log = build_int_cst (type, exact_log2 (elem_size));
1578
1579 /* Create: misalign_in_bytes = addr & (target_align - 1). */
1580 tree int_start_addr = fold_convert (type, start_addr);
1581 tree misalign_in_bytes = fold_build2 (BIT_AND_EXPR, type, int_start_addr,
1582 target_align_minus_1);
1583
1584 /* Create: misalign_in_elems = misalign_in_bytes / element_size. */
1585 tree misalign_in_elems = fold_build2 (RSHIFT_EXPR, type, misalign_in_bytes,
1586 elem_size_log);
1587
1588 return misalign_in_elems;
1589}
1590
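/* Illustrative sketch (added for exposition; not part of the vectorizer):
   the same misalignment computation on plain integers, assuming the target
   alignment is a compile-time constant and the element size is a power of
   two.  E.g. addr = 0x1008, target_align = 16, elem_size = 4 gives
   (0x1008 & 15) / 4 = 2 elements.  */

static inline unsigned long
misalign_in_elems_example (unsigned long addr, unsigned long target_align,
			   unsigned long elem_size)
{
  unsigned long misalign_in_bytes = addr & (target_align - 1);
  return misalign_in_bytes >> __builtin_ctzl (elem_size);
}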
a5e3d614 1591/* Function vect_gen_prolog_loop_niters
d9157f15 1592
a5e3d614
BC
1593 Generate the number of iterations which should be peeled as prolog for the
1594 loop represented by LOOP_VINFO. It is calculated as the misalignment of
1595 DR - the data reference recorded in LOOP_VINFO_UNALIGNED_DR (LOOP_VINFO).
1596 As a result, after the execution of this loop, the data reference DR will
1597 refer to an aligned location. The following computation is generated:
ebfd146a
IR
1598
1599 If the misalignment of DR is known at compile time:
1600 addr_mis = int mis = DR_MISALIGNMENT (dr);
1601 Else, compute address misalignment in bytes:
535e7c11 1602 addr_mis = addr & (target_align - 1)
ebfd146a 1603
a5e3d614 1604 prolog_niters = ((VF - addr_mis/elem_size)&(VF-1))/step
ebfd146a
IR
1605
1606 (elem_size = element type size; an element is the scalar element whose type
1607 is the inner type of the vectype)
1608
a5e3d614 1609 The computations will be emitted at the end of BB. We also compute and
cbb22e61 1610 store the upper bound (inclusive) of the result in BOUND.
a5e3d614 1611
ebfd146a
IR
1612 When the step of the data-ref in the loop is not 1 (as in interleaved data
1613 and SLP), the number of iterations of the prolog must be divided by the step
1614 (which is equal to the size of interleaved group).
1615
1616 The above formulas assume that VF == number of elements in the vector. This
1617 may not hold when there are multiple-types in the loop.
1618 In this case, for some data-references in the loop the VF does not represent
1619 the number of elements that fit in the vector. Therefore, instead of VF we
1620 use TYPE_VECTOR_SUBPARTS. */
1621
b8698a0f 1622static tree
a5e3d614
BC
1623vect_gen_prolog_loop_niters (loop_vec_info loop_vinfo,
1624 basic_block bb, int *bound)
ebfd146a 1625{
1e5e6ff5 1626 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
ebfd146a 1627 tree var;
a5e3d614
BC
1628 tree niters_type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
1629 gimple_seq stmts = NULL, new_stmts = NULL;
ebfd146a 1630 tree iters, iters_name;
89fa689a 1631 stmt_vec_info stmt_info = dr_info->stmt;
ebfd146a 1632 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
ca31798e 1633 poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr_info);
ebfd146a 1634
15e693cc 1635 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
ebfd146a 1636 {
15e693cc 1637 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
ebfd146a 1638
73fbfcad 1639 if (dump_enabled_p ())
ccb3ad87 1640 dump_printf_loc (MSG_NOTE, vect_location,
e645e942 1641 "known peeling = %d.\n", npeel);
ebfd146a 1642
720f5239 1643 iters = build_int_cst (niters_type, npeel);
cbb22e61 1644 *bound = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
ebfd146a
IR
1645 }
1646 else
1647 {
535e7c11
RS
1648 tree misalign_in_elems = get_misalign_in_elems (&stmts, loop_vinfo);
1649 tree type = TREE_TYPE (misalign_in_elems);
f702e7d4
RS
1650 HOST_WIDE_INT elem_size
1651 = int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
ca31798e
AV
1652 /* We only do prolog peeling if the target alignment is known at compile
1653 time. */
1654 poly_uint64 align_in_elems =
1655 exact_div (target_align, elem_size);
1656 tree align_in_elems_minus_1 =
1657 build_int_cst (type, align_in_elems - 1);
f702e7d4 1658 tree align_in_elems_tree = build_int_cst (type, align_in_elems);
f702e7d4
RS
1659
1660 /* Create: (niters_type) ((align_in_elems - misalign_in_elems)
1661 & (align_in_elems - 1)). */
89fa689a
RS
1662 bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr),
1663 size_zero_node) < 0;
d8ba5b19 1664 if (negative)
f702e7d4
RS
1665 iters = fold_build2 (MINUS_EXPR, type, misalign_in_elems,
1666 align_in_elems_tree);
d8ba5b19 1667 else
f702e7d4
RS
1668 iters = fold_build2 (MINUS_EXPR, type, align_in_elems_tree,
1669 misalign_in_elems);
1670 iters = fold_build2 (BIT_AND_EXPR, type, iters, align_in_elems_minus_1);
ebfd146a 1671 iters = fold_convert (niters_type, iters);
ca31798e
AV
1672 unsigned HOST_WIDE_INT align_in_elems_c;
1673 if (align_in_elems.is_constant (&align_in_elems_c))
1674 *bound = align_in_elems_c - 1;
1675 else
1676 *bound = -1;
ebfd146a
IR
1677 }
1678
73fbfcad 1679 if (dump_enabled_p ())
3c2a8ed0
DM
1680 dump_printf_loc (MSG_NOTE, vect_location,
1681 "niters for prolog loop: %T\n", iters);
ebfd146a
IR
1682
1683 var = create_tmp_var (niters_type, "prolog_loop_niters");
a5e3d614 1684 iters_name = force_gimple_operand (iters, &new_stmts, false, var);
ebfd146a 1685
a5e3d614
BC
1686 if (new_stmts)
1687 gimple_seq_add_seq (&stmts, new_stmts);
ebfd146a
IR
1688 if (stmts)
1689 {
a5e3d614
BC
1690 gcc_assert (single_succ_p (bb));
1691 gimple_stmt_iterator gsi = gsi_last_bb (bb);
1692 if (gsi_end_p (gsi))
1693 gsi_insert_seq_before (&gsi, stmts, GSI_SAME_STMT);
1694 else
1695 gsi_insert_seq_after (&gsi, stmts, GSI_SAME_STMT);
ebfd146a 1696 }
b8698a0f 1697 return iters_name;
ebfd146a
IR
1698}
1699
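/* Illustrative sketch (added for exposition; not part of the vectorizer):
   the prolog iteration count computed above, on plain integers, ignoring
   the division by the group step for interleaved accesses.  With
   align_in_elems = 8 and misalign_in_elems = 3 (forward step) this yields
   (8 - 3) & 7 = 5 peeled iterations; an already aligned access yields 0.  */

static inline unsigned long
prolog_niters_example (unsigned long align_in_elems,
		       unsigned long misalign_in_elems, int negative_step)
{
  unsigned long iters = negative_step
			? misalign_in_elems - align_in_elems
			: align_in_elems - misalign_in_elems;
  return iters & (align_in_elems - 1);
}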
1700
1701/* Function vect_update_init_of_dr
1702
535e7c11
RS
1703 If CODE is PLUS, the vector loop starts NITERS iterations after the
1704 scalar one, otherwise CODE is MINUS and the vector loop starts NITERS
1705 iterations before the scalar one (using masking to skip inactive
1706 elements). This function updates the information recorded in DR to
1707 account for the difference. Specifically, it updates the OFFSET
1708 field of DR. */
ebfd146a
IR
1709
1710static void
535e7c11 1711vect_update_init_of_dr (struct data_reference *dr, tree niters, tree_code code)
ebfd146a
IR
1712{
1713 tree offset = DR_OFFSET (dr);
b8698a0f 1714
ebfd146a
IR
1715 niters = fold_build2 (MULT_EXPR, sizetype,
1716 fold_convert (sizetype, niters),
1717 fold_convert (sizetype, DR_STEP (dr)));
535e7c11 1718 offset = fold_build2 (code, sizetype,
587aa063 1719 fold_convert (sizetype, offset), niters);
ebfd146a
IR
1720 DR_OFFSET (dr) = offset;
1721}
1722
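/* For example (illustration only): if the prolog peels NITERS = 3 iterations
   of a reference with DR_STEP = 4 bytes, the vector loop's first access
   starts 3 * 4 = 12 bytes later, so with CODE == PLUS_EXPR the new
   DR_OFFSET is offset + 12; with MINUS_EXPR (masked peeling) it is
   offset - 12.  */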
1723
1724/* Function vect_update_inits_of_drs
1725
535e7c11
RS
 1726 Apply vect_update_init_of_dr to all accesses in LOOP_VINFO.
 1727 CODE and NITERS are as for vect_update_init_of_dr. */
ebfd146a 1728
97c14603 1729void
535e7c11
RS
1730vect_update_inits_of_drs (loop_vec_info loop_vinfo, tree niters,
1731 tree_code code)
ebfd146a
IR
1732{
1733 unsigned int i;
9771b263 1734 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
ebfd146a 1735 struct data_reference *dr;
a5e3d614 1736
adac3a68 1737 DUMP_VECT_SCOPE ("vect_update_inits_of_dr");
a5e3d614 1738
97c14603
AV
1739 /* Adjust niters to sizetype. We used to insert the stmts on loop preheader
1740 here, but since we might use these niters to update the epilogues niters
1741 and data references we can't insert them here as this definition might not
1742 always dominate its uses. */
a5e3d614 1743 if (!types_compatible_p (sizetype, TREE_TYPE (niters)))
97c14603 1744 niters = fold_convert (sizetype, niters);
ebfd146a 1745
9771b263 1746 FOR_EACH_VEC_ELT (datarefs, i, dr)
5fa23466 1747 {
f5ae2856
RS
1748 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1749 if (!STMT_VINFO_GATHER_SCATTER_P (dr_info->stmt))
5fa23466
RB
1750 vect_update_init_of_dr (dr, niters, code);
1751 }
ebfd146a
IR
1752}
1753
535e7c11
RS
1754/* For the information recorded in LOOP_VINFO prepare the loop for peeling
1755 by masking. This involves calculating the number of iterations to
1756 be peeled and then aligning all memory references appropriately. */
1757
1758void
1759vect_prepare_for_masked_peels (loop_vec_info loop_vinfo)
1760{
1761 tree misalign_in_elems;
1762 tree type = LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo);
1763
1764 gcc_assert (vect_use_loop_mask_for_alignment_p (loop_vinfo));
1765
1766 /* From the information recorded in LOOP_VINFO get the number of iterations
1767 that need to be skipped via masking. */
1768 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
1769 {
1770 poly_int64 misalign = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
1771 - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo));
1772 misalign_in_elems = build_int_cst (type, misalign);
1773 }
1774 else
1775 {
1776 gimple_seq seq1 = NULL, seq2 = NULL;
1777 misalign_in_elems = get_misalign_in_elems (&seq1, loop_vinfo);
1778 misalign_in_elems = fold_convert (type, misalign_in_elems);
1779 misalign_in_elems = force_gimple_operand (misalign_in_elems,
1780 &seq2, true, NULL_TREE);
1781 gimple_seq_add_seq (&seq1, seq2);
1782 if (seq1)
1783 {
1784 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1785 basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq1);
1786 gcc_assert (!new_bb);
1787 }
1788 }
1789
1790 if (dump_enabled_p ())
3c2a8ed0
DM
1791 dump_printf_loc (MSG_NOTE, vect_location,
1792 "misalignment for fully-masked loop: %T\n",
1793 misalign_in_elems);
535e7c11
RS
1794
1795 LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo) = misalign_in_elems;
1796
1797 vect_update_inits_of_drs (loop_vinfo, misalign_in_elems, MINUS_EXPR);
1798}
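/* For example (illustration only): with VF = 8 and
   LOOP_VINFO_PEELING_FOR_ALIGNMENT = 3, the skip count is 8 - 3 = 5, so the
   fully-masked loop starts 5 elements before the original start and masks
   off those 5 inactive elements in its first iteration.  */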
ebfd146a 1799
a5e3d614 1800/* This function builds ni_name = number of iterations. Statements
7078979b
BC
1801 are emitted on the loop preheader edge. If NEW_VAR_P is not NULL, set
 1802 it to TRUE if a new ssa_var is generated. */
a5e3d614
BC
1803
1804tree
7078979b 1805vect_build_loop_niters (loop_vec_info loop_vinfo, bool *new_var_p)
a5e3d614
BC
1806{
1807 tree ni = unshare_expr (LOOP_VINFO_NITERS (loop_vinfo));
1808 if (TREE_CODE (ni) == INTEGER_CST)
1809 return ni;
1810 else
1811 {
1812 tree ni_name, var;
1813 gimple_seq stmts = NULL;
1814 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1815
1816 var = create_tmp_var (TREE_TYPE (ni), "niters");
1817 ni_name = force_gimple_operand (ni, &stmts, false, var);
1818 if (stmts)
7078979b
BC
1819 {
1820 gsi_insert_seq_on_edge_immediate (pe, stmts);
1821 if (new_var_p != NULL)
1822 *new_var_p = true;
1823 }
a5e3d614
BC
1824
1825 return ni_name;
1826 }
1827}
1828
cbb22e61
BC
 1829/* Calculate the number of iterations above which the vectorized loop will be
 1830 preferred to the scalar loop.  NITERS_PROLOG is the number of iterations
 1831 of the prolog loop.  If it is an integer constant, that value is also passed
d1d20a49
RS
1832 in INT_NITERS_PROLOG. BOUND_PROLOG is the upper bound (inclusive) of the
1833 number of iterations of the prolog loop. BOUND_EPILOG is the corresponding
1834 value for the epilog loop. If CHECK_PROFITABILITY is true, TH is the
1835 threshold below which the scalar (rather than vectorized) loop will be
1836 executed. This function stores the upper bound (inclusive) of the result
1837 in BOUND_SCALAR. */
a5e3d614
BC
1838
1839static tree
1840vect_gen_scalar_loop_niters (tree niters_prolog, int int_niters_prolog,
d1d20a49 1841 int bound_prolog, poly_int64 bound_epilog, int th,
d9f21f6a
RS
1842 poly_uint64 *bound_scalar,
1843 bool check_profitability)
a5e3d614
BC
1844{
1845 tree type = TREE_TYPE (niters_prolog);
1846 tree niters = fold_build2 (PLUS_EXPR, type, niters_prolog,
d1d20a49 1847 build_int_cst (type, bound_epilog));
a5e3d614 1848
d1d20a49 1849 *bound_scalar = bound_prolog + bound_epilog;
a5e3d614
BC
1850 if (check_profitability)
1851 {
cbb22e61
BC
1852 /* TH indicates the minimum niters of vectorized loop, while we
1853 compute the maximum niters of scalar loop. */
1854 th--;
a5e3d614
BC
1855 /* Peeling for constant times. */
1856 if (int_niters_prolog >= 0)
1857 {
d1d20a49 1858 *bound_scalar = upper_bound (int_niters_prolog + bound_epilog, th);
cbb22e61 1859 return build_int_cst (type, *bound_scalar);
a5e3d614 1860 }
d1d20a49
RS
1861 /* Peeling an unknown number of times. Note that both BOUND_PROLOG
1862 and BOUND_EPILOG are inclusive upper bounds. */
1863 if (known_ge (th, bound_prolog + bound_epilog))
a5e3d614 1864 {
cbb22e61 1865 *bound_scalar = th;
a5e3d614
BC
1866 return build_int_cst (type, th);
1867 }
d9f21f6a 1868 /* Need to do runtime comparison. */
d1d20a49 1869 else if (maybe_gt (th, bound_epilog))
d9f21f6a
RS
1870 {
1871 *bound_scalar = upper_bound (*bound_scalar, th);
1872 return fold_build2 (MAX_EXPR, type,
1873 build_int_cst (type, th), niters);
1874 }
a5e3d614
BC
1875 }
1876 return niters;
1877}
1878
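/* Illustrative sketch (added for exposition; not part of the vectorizer):
   the runtime clamp above on plain integers, for the case where the prolog
   count is only known at run time and TH (after the adjustment in the
   function above) exceeds BOUND_EPILOG: the returned guard value is
   MAX (th, niters_prolog + bound_epilog).  */

static inline unsigned long
scalar_niters_example (unsigned long niters_prolog,
		       unsigned long bound_epilog, unsigned long th)
{
  unsigned long niters = niters_prolog + bound_epilog;
  return niters > th ? niters : th;	/* MAX_EXPR (th, niters)  */
}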
0f26839a
RS
1879/* NITERS is the number of times that the original scalar loop executes
1880 after peeling. Work out the maximum number of iterations N that can
1881 be handled by the vectorized form of the loop and then either:
ebfd146a 1882
0f26839a 1883 a) set *STEP_VECTOR_PTR to the vectorization factor and generate:
d9157f15 1884
0f26839a
RS
1885 niters_vector = N
1886
1887 b) set *STEP_VECTOR_PTR to one and generate:
1888
1889 niters_vector = N / vf
1890
1891 In both cases, store niters_vector in *NITERS_VECTOR_PTR and add
1892 any new statements on the loop preheader edge. NITERS_NO_OVERFLOW
1893 is true if NITERS doesn't overflow (i.e. if NITERS is always nonzero). */
ebfd146a
IR
1894
1895void
a5e3d614 1896vect_gen_vector_loop_niters (loop_vec_info loop_vinfo, tree niters,
0f26839a
RS
1897 tree *niters_vector_ptr, tree *step_vector_ptr,
1898 bool niters_no_overflow)
ebfd146a 1899{
a5e3d614 1900 tree ni_minus_gap, var;
0f26839a 1901 tree niters_vector, step_vector, type = TREE_TYPE (niters);
d9f21f6a 1902 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
a5e3d614 1903 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
0f26839a 1904 tree log_vf = NULL_TREE;
a5e3d614
BC
1905
1906 /* If epilogue loop is required because of data accesses with gaps, we
1907 subtract one iteration from the total number of iterations here for
1908 correct calculation of RATIO. */
1909 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
1910 {
7078979b
BC
1911 ni_minus_gap = fold_build2 (MINUS_EXPR, type, niters,
1912 build_one_cst (type));
a5e3d614
BC
1913 if (!is_gimple_val (ni_minus_gap))
1914 {
7078979b 1915 var = create_tmp_var (type, "ni_gap");
a5e3d614
BC
1916 gimple *stmts = NULL;
1917 ni_minus_gap = force_gimple_operand (ni_minus_gap, &stmts,
1918 true, var);
1919 gsi_insert_seq_on_edge_immediate (pe, stmts);
1920 }
1921 }
1922 else
1923 ni_minus_gap = niters;
1924
d9f21f6a 1925 unsigned HOST_WIDE_INT const_vf;
7cfb4d93
RS
1926 if (vf.is_constant (&const_vf)
1927 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
0f26839a
RS
1928 {
1929 /* Create: niters >> log2(vf) */
1930 /* If it's known that niters == number of latch executions + 1 doesn't
1931 overflow, we can generate niters >> log2(vf); otherwise we generate
1932 (niters - vf) >> log2(vf) + 1 by using the fact that we know ratio
1933 will be at least one. */
d9f21f6a 1934 log_vf = build_int_cst (type, exact_log2 (const_vf));
0f26839a
RS
1935 if (niters_no_overflow)
1936 niters_vector = fold_build2 (RSHIFT_EXPR, type, ni_minus_gap, log_vf);
1937 else
1938 niters_vector
1939 = fold_build2 (PLUS_EXPR, type,
1940 fold_build2 (RSHIFT_EXPR, type,
1941 fold_build2 (MINUS_EXPR, type,
1942 ni_minus_gap,
1943 build_int_cst (type, vf)),
1944 log_vf),
1945 build_int_cst (type, 1));
1946 step_vector = build_one_cst (type);
1947 }
a5e3d614 1948 else
0f26839a
RS
1949 {
1950 niters_vector = ni_minus_gap;
1951 step_vector = build_int_cst (type, vf);
1952 }
a5e3d614
BC
1953
1954 if (!is_gimple_val (niters_vector))
1955 {
7078979b
BC
1956 var = create_tmp_var (type, "bnd");
1957 gimple_seq stmts = NULL;
a5e3d614
BC
1958 niters_vector = force_gimple_operand (niters_vector, &stmts, true, var);
1959 gsi_insert_seq_on_edge_immediate (pe, stmts);
7078979b
BC
1960 /* Peeling algorithm guarantees that vector loop bound is at least ONE,
1961 we set range information to make niters analyzer's life easier. */
0f26839a 1962 if (stmts != NULL && log_vf)
8e6cdc90
RS
1963 set_range_info (niters_vector, VR_RANGE,
1964 wi::to_wide (build_int_cst (type, 1)),
1965 wi::to_wide (fold_build2 (RSHIFT_EXPR, type,
1966 TYPE_MAX_VALUE (type),
1967 log_vf)));
a5e3d614
BC
1968 }
1969 *niters_vector_ptr = niters_vector;
0f26839a 1970 *step_vector_ptr = step_vector;
a5e3d614
BC
1971
1972 return;
1973}
1974
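/* Illustrative sketch (added for exposition; not part of the vectorizer):
   the two forms of the vector iteration count above, on plain integers,
   assuming a constant power-of-two VF.  E.g. niters = 17, vf = 4 gives 4
   vector iterations either way; the second form is needed when NITERS may
   have wrapped to 0, in which case a plain shift would give 0.  */

static inline unsigned long
vector_niters_example (unsigned long niters, unsigned long vf,
		       int niters_no_overflow)
{
  int log_vf = __builtin_ctzl (vf);
  if (niters_no_overflow)
    return niters >> log_vf;
  return ((niters - vf) >> log_vf) + 1;
}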
1975/* Given NITERS_VECTOR which is the number of iterations for vectorized
1976 loop specified by LOOP_VINFO after vectorization, compute the number
1977 of iterations before vectorization (niters_vector * vf) and store it
1978 to NITERS_VECTOR_MULT_VF_PTR. */
1979
1980static void
1981vect_gen_vector_loop_niters_mult_vf (loop_vec_info loop_vinfo,
1982 tree niters_vector,
1983 tree *niters_vector_mult_vf_ptr)
1984{
d9f21f6a
RS
1985 /* We should be using a step_vector of VF if VF is variable. */
1986 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo).to_constant ();
99b1c316 1987 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
a5e3d614
BC
1988 tree type = TREE_TYPE (niters_vector);
1989 tree log_vf = build_int_cst (type, exact_log2 (vf));
1990 basic_block exit_bb = single_exit (loop)->dest;
ebfd146a 1991
a5e3d614
BC
1992 gcc_assert (niters_vector_mult_vf_ptr != NULL);
1993 tree niters_vector_mult_vf = fold_build2 (LSHIFT_EXPR, type,
1994 niters_vector, log_vf);
1995 if (!is_gimple_val (niters_vector_mult_vf))
1996 {
1997 tree var = create_tmp_var (type, "niters_vector_mult_vf");
1998 gimple_seq stmts = NULL;
1999 niters_vector_mult_vf = force_gimple_operand (niters_vector_mult_vf,
2000 &stmts, true, var);
2001 gimple_stmt_iterator gsi = gsi_start_bb (exit_bb);
2002 gsi_insert_seq_before (&gsi, stmts, GSI_SAME_STMT);
2003 }
2004 *niters_vector_mult_vf_ptr = niters_vector_mult_vf;
2005}
2006
b81f2daf
RB
2007/* LCSSA_PHI is a lcssa phi of EPILOG loop which is copied from LOOP,
2008 this function searches for the corresponding lcssa phi node in exit
2009 bb of LOOP. If it is found, return the phi result; otherwise return
2010 NULL. */
2011
2012static tree
2013find_guard_arg (class loop *loop, class loop *epilog ATTRIBUTE_UNUSED,
2014 gphi *lcssa_phi)
2015{
2016 gphi_iterator gsi;
2017 edge e = single_exit (loop);
2018
2019 gcc_assert (single_pred_p (e->dest));
2020 for (gsi = gsi_start_phis (e->dest); !gsi_end_p (gsi); gsi_next (&gsi))
2021 {
2022 gphi *phi = gsi.phi ();
2023 if (operand_equal_p (PHI_ARG_DEF (phi, 0),
2024 PHI_ARG_DEF (lcssa_phi, 0), 0))
2025 return PHI_RESULT (phi);
2026 }
2027 return NULL_TREE;
2028}
2029
a5e3d614
BC
 2030/* Function slpeel_tree_duplicate_loop_to_edge_cfg duplicates FIRST/SECOND
 2031 from SECOND/FIRST and puts the copy at the original loop's preheader/exit
 2032 edge; the two loops are arranged as below:
2033
2034 preheader_a:
2035 first_loop:
2036 header_a:
2037 i_1 = PHI<i_0, i_2>;
2038 ...
2039 i_2 = i_1 + 1;
2040 if (cond_a)
2041 goto latch_a;
2042 else
2043 goto between_bb;
2044 latch_a:
2045 goto header_a;
2046
2047 between_bb:
2048 ;; i_x = PHI<i_2>; ;; LCSSA phi node to be created for FIRST,
2049
2050 second_loop:
2051 header_b:
2052 i_3 = PHI<i_0, i_4>; ;; Use of i_0 to be replaced with i_x,
2053 or with i_2 if no LCSSA phi is created
2054 under condition of CREATE_LCSSA_FOR_IV_PHIS.
2055 ...
2056 i_4 = i_3 + 1;
2057 if (cond_b)
2058 goto latch_b;
2059 else
2060 goto exit_bb;
2061 latch_b:
2062 goto header_b;
2063
2064 exit_bb:
2065
 2066 This function creates loop-closed SSA for the first loop; it updates the
 2067 second loop's PHI nodes by replacing the argument on the incoming edge with
 2068 the result of the newly created lcssa PHI node.  If CREATE_LCSSA_FOR_IV_PHIS
 2069 is false, loop-closed SSA PHIs will only be created for the non-IV PHIs of
 2070 the first loop.
2071
2072 This function assumes exit bb of the first loop is preheader bb of the
 2073 second loop, i.e., between_bb in the example code.  With the PHIs updated,
 2074 the second loop will execute the remaining iterations of the first. */
ebfd146a 2075
a5e3d614
BC
2076static void
2077slpeel_update_phi_nodes_for_loops (loop_vec_info loop_vinfo,
99b1c316 2078 class loop *first, class loop *second,
a5e3d614
BC
2079 bool create_lcssa_for_iv_phis)
2080{
2081 gphi_iterator gsi_update, gsi_orig;
99b1c316 2082 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
a5e3d614
BC
2083
2084 edge first_latch_e = EDGE_SUCC (first->latch, 0);
2085 edge second_preheader_e = loop_preheader_edge (second);
2086 basic_block between_bb = single_exit (first)->dest;
2087
2088 gcc_assert (between_bb == second_preheader_e->src);
2089 gcc_assert (single_pred_p (between_bb) && single_succ_p (between_bb));
2090 /* Either the first loop or the second is the loop to be vectorized. */
2091 gcc_assert (loop == first || loop == second);
2092
2093 for (gsi_orig = gsi_start_phis (first->header),
2094 gsi_update = gsi_start_phis (second->header);
2095 !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_update);
2096 gsi_next (&gsi_orig), gsi_next (&gsi_update))
2097 {
2098 gphi *orig_phi = gsi_orig.phi ();
2099 gphi *update_phi = gsi_update.phi ();
2100
2101 tree arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, first_latch_e);
2102 /* Generate lcssa PHI node for the first loop. */
2103 gphi *vect_phi = (loop == first) ? orig_phi : update_phi;
82570274
RS
2104 stmt_vec_info vect_phi_info = loop_vinfo->lookup_stmt (vect_phi);
2105 if (create_lcssa_for_iv_phis || !iv_phi_p (vect_phi_info))
a5e3d614
BC
2106 {
2107 tree new_res = copy_ssa_name (PHI_RESULT (orig_phi));
2108 gphi *lcssa_phi = create_phi_node (new_res, between_bb);
2109 add_phi_arg (lcssa_phi, arg, single_exit (first), UNKNOWN_LOCATION);
2110 arg = new_res;
2111 }
2112
2113 /* Update PHI node in the second loop by replacing arg on the loop's
2114 incoming edge. */
2115 adjust_phi_and_debug_stmts (update_phi, second_preheader_e, arg);
2116 }
b81f2daf
RB
2117
2118 /* For epilogue peeling we have to make sure to copy all LC PHIs
2119 for correct vectorization of live stmts. */
2120 if (loop == first)
2121 {
2122 basic_block orig_exit = single_exit (second)->dest;
2123 for (gsi_orig = gsi_start_phis (orig_exit);
2124 !gsi_end_p (gsi_orig); gsi_next (&gsi_orig))
2125 {
2126 gphi *orig_phi = gsi_orig.phi ();
2127 tree orig_arg = PHI_ARG_DEF (orig_phi, 0);
2128 if (TREE_CODE (orig_arg) != SSA_NAME || virtual_operand_p (orig_arg))
2129 continue;
2130
2131 /* Already created in the above loop. */
2132 if (find_guard_arg (first, second, orig_phi))
2133 continue;
2134
2135 tree new_res = copy_ssa_name (orig_arg);
2136 gphi *lcphi = create_phi_node (new_res, between_bb);
2137 add_phi_arg (lcphi, orig_arg, single_exit (first), UNKNOWN_LOCATION);
2138 }
2139 }
a5e3d614
BC
2140}
2141
2142/* Function slpeel_add_loop_guard adds guard skipping from the beginning
2143 of SKIP_LOOP to the beginning of UPDATE_LOOP. GUARD_EDGE and MERGE_EDGE
2144 are two pred edges of the merge point before UPDATE_LOOP. The two loops
2145 appear like below:
2146
2147 guard_bb:
2148 if (cond)
2149 goto merge_bb;
2150 else
2151 goto skip_loop;
2152
2153 skip_loop:
2154 header_a:
2155 i_1 = PHI<i_0, i_2>;
2156 ...
2157 i_2 = i_1 + 1;
2158 if (cond_a)
2159 goto latch_a;
2160 else
2161 goto exit_a;
2162 latch_a:
2163 goto header_a;
2164
2165 exit_a:
2166 i_5 = PHI<i_2>;
2167
2168 merge_bb:
2169 ;; PHI (i_x = PHI<i_0, i_5>) to be created at merge point.
2170
2171 update_loop:
2172 header_b:
2173 i_3 = PHI<i_5, i_4>; ;; Use of i_5 to be replaced with i_x.
2174 ...
2175 i_4 = i_3 + 1;
2176 if (cond_b)
2177 goto latch_b;
2178 else
2179 goto exit_bb;
2180 latch_b:
2181 goto header_b;
2182
2183 exit_bb:
2184
2185 This function creates PHI nodes at merge_bb and replaces the use of i_5
 2186 in the update_loop's PHI node with the new PHI result. */
2187
2188static void
99b1c316
MS
2189slpeel_update_phi_nodes_for_guard1 (class loop *skip_loop,
2190 class loop *update_loop,
a5e3d614
BC
2191 edge guard_edge, edge merge_edge)
2192{
620e594b 2193 location_t merge_loc, guard_loc;
a5e3d614
BC
2194 edge orig_e = loop_preheader_edge (skip_loop);
2195 edge update_e = loop_preheader_edge (update_loop);
2196 gphi_iterator gsi_orig, gsi_update;
2197
2198 for ((gsi_orig = gsi_start_phis (skip_loop->header),
2199 gsi_update = gsi_start_phis (update_loop->header));
2200 !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_update);
2201 gsi_next (&gsi_orig), gsi_next (&gsi_update))
2202 {
2203 gphi *orig_phi = gsi_orig.phi ();
2204 gphi *update_phi = gsi_update.phi ();
2205
2206 /* Generate new phi node at merge bb of the guard. */
2207 tree new_res = copy_ssa_name (PHI_RESULT (orig_phi));
2208 gphi *new_phi = create_phi_node (new_res, guard_edge->dest);
2209
2210 /* Merge bb has two incoming edges: GUARD_EDGE and MERGE_EDGE. Set the
2211 args in NEW_PHI for these edges. */
2212 tree merge_arg = PHI_ARG_DEF_FROM_EDGE (update_phi, update_e);
2213 tree guard_arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, orig_e);
2214 merge_loc = gimple_phi_arg_location_from_edge (update_phi, update_e);
2215 guard_loc = gimple_phi_arg_location_from_edge (orig_phi, orig_e);
2216 add_phi_arg (new_phi, merge_arg, merge_edge, merge_loc);
2217 add_phi_arg (new_phi, guard_arg, guard_edge, guard_loc);
2218
2219 /* Update phi in UPDATE_PHI. */
2220 adjust_phi_and_debug_stmts (update_phi, update_e, new_res);
2221 }
2222}
2223
a5e3d614
BC
2224/* LOOP and EPILOG are two consecutive loops in CFG and EPILOG is copied
2225 from LOOP. Function slpeel_add_loop_guard adds guard skipping from a
2226 point between the two loops to the end of EPILOG. Edges GUARD_EDGE
2227 and MERGE_EDGE are the two pred edges of merge_bb at the end of EPILOG.
2228 The CFG looks like:
2229
2230 loop:
2231 header_a:
2232 i_1 = PHI<i_0, i_2>;
2233 ...
2234 i_2 = i_1 + 1;
2235 if (cond_a)
2236 goto latch_a;
2237 else
2238 goto exit_a;
2239 latch_a:
2240 goto header_a;
2241
2242 exit_a:
2243
2244 guard_bb:
2245 if (cond)
2246 goto merge_bb;
2247 else
2248 goto epilog_loop;
2249
2250 ;; fall_through_bb
2251
2252 epilog_loop:
2253 header_b:
2254 i_3 = PHI<i_2, i_4>;
2255 ...
2256 i_4 = i_3 + 1;
2257 if (cond_b)
2258 goto latch_b;
2259 else
2260 goto merge_bb;
2261 latch_b:
2262 goto header_b;
2263
2264 merge_bb:
2265 ; PHI node (i_y = PHI<i_2, i_4>) to be created at merge point.
2266
2267 exit_bb:
2268 i_x = PHI<i_4>; ;Use of i_4 to be replaced with i_y in merge_bb.
2269
 2270 For each name used outside EPILOG (i.e. for each name that has an lcssa
2271 phi in exit_bb) we create a new PHI in merge_bb. The new PHI has two
2272 args corresponding to GUARD_EDGE and MERGE_EDGE. Arg for MERGE_EDGE is
2273 the arg of the original PHI in exit_bb, arg for GUARD_EDGE is defined
2274 by LOOP and is found in the exit bb of LOOP. Arg of the original PHI
2275 in exit_bb will also be updated. */
2276
2277static void
99b1c316 2278slpeel_update_phi_nodes_for_guard2 (class loop *loop, class loop *epilog,
a5e3d614
BC
2279 edge guard_edge, edge merge_edge)
2280{
2281 gphi_iterator gsi;
2282 basic_block merge_bb = guard_edge->dest;
2283
2284 gcc_assert (single_succ_p (merge_bb));
2285 edge e = single_succ_edge (merge_bb);
2286 basic_block exit_bb = e->dest;
2287 gcc_assert (single_pred_p (exit_bb));
2288 gcc_assert (single_pred (exit_bb) == single_exit (epilog)->dest);
2289
2290 for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi); gsi_next (&gsi))
2291 {
2292 gphi *update_phi = gsi.phi ();
2293 tree old_arg = PHI_ARG_DEF (update_phi, 0);
a5e3d614 2294
2e7a4f57
AV
2295 tree merge_arg = NULL_TREE;
2296
2297 /* If the old argument is a SSA_NAME use its current_def. */
2298 if (TREE_CODE (old_arg) == SSA_NAME)
2299 merge_arg = get_current_def (old_arg);
2300 /* If it's a constant or doesn't have a current_def, just use the old
2301 argument. */
a5e3d614
BC
2302 if (!merge_arg)
2303 merge_arg = old_arg;
2304
2305 tree guard_arg = find_guard_arg (loop, epilog, update_phi);
2306 /* If the var is live after loop but not a reduction, we simply
2307 use the old arg. */
2308 if (!guard_arg)
2309 guard_arg = old_arg;
2310
2311 /* Create new phi node in MERGE_BB: */
2312 tree new_res = copy_ssa_name (PHI_RESULT (update_phi));
2313 gphi *merge_phi = create_phi_node (new_res, merge_bb);
2314
 2315 /* MERGE_BB has two incoming edges: GUARD_EDGE and MERGE_EDGE.  Set
2316 the two PHI args in merge_phi for these edges. */
2317 add_phi_arg (merge_phi, merge_arg, merge_edge, UNKNOWN_LOCATION);
2318 add_phi_arg (merge_phi, guard_arg, guard_edge, UNKNOWN_LOCATION);
2319
2320 /* Update the original phi in exit_bb. */
2321 adjust_phi_and_debug_stmts (update_phi, e, new_res);
2322 }
2323}
2324
2325/* EPILOG loop is duplicated from the original loop for vectorizing,
2326 the arg of its loop closed ssa PHI needs to be updated. */
2327
2328static void
99b1c316 2329slpeel_update_phi_nodes_for_lcssa (class loop *epilog)
a5e3d614
BC
2330{
2331 gphi_iterator gsi;
2332 basic_block exit_bb = single_exit (epilog)->dest;
2333
2334 gcc_assert (single_pred_p (exit_bb));
2335 edge e = EDGE_PRED (exit_bb, 0);
2336 for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi); gsi_next (&gsi))
2337 rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (gsi.phi (), e));
2338}
2339
2340/* Function vect_do_peeling.
2341
2342 Input:
2343 - LOOP_VINFO: Represent a loop to be vectorized, which looks like:
2344
2345 preheader:
2346 LOOP:
2347 header_bb:
2348 loop_body
2349 if (exit_loop_cond) goto exit_bb
2350 else goto header_bb
2351 exit_bb:
2352
2353 - NITERS: The number of iterations of the loop.
2354 - NITERSM1: The number of iterations of the loop's latch.
2355 - NITERS_NO_OVERFLOW: No overflow in computing NITERS.
2356 - TH, CHECK_PROFITABILITY: Threshold of niters to vectorize loop if
2357 CHECK_PROFITABILITY is true.
2358 Output:
0f26839a 2359 - *NITERS_VECTOR and *STEP_VECTOR describe how the main loop should
7cfb4d93 2360 iterate after vectorization; see vect_set_loop_condition for details.
0f26839a
RS
2361 - *NITERS_VECTOR_MULT_VF_VAR is either null or an SSA name that
2362 should be set to the number of scalar iterations handled by the
2363 vector loop. The SSA name is only used on exit from the loop.
a5e3d614
BC
2364
2365 This function peels prolog and epilog from the loop, adds guards skipping
2366 PROLOG and EPILOG for various conditions. As a result, the changed CFG
2367 would look like:
2368
2369 guard_bb_1:
2370 if (prefer_scalar_loop) goto merge_bb_1
2371 else goto guard_bb_2
2372
2373 guard_bb_2:
2374 if (skip_prolog) goto merge_bb_2
2375 else goto prolog_preheader
2376
2377 prolog_preheader:
2378 PROLOG:
2379 prolog_header_bb:
2380 prolog_body
2381 if (exit_prolog_cond) goto prolog_exit_bb
2382 else goto prolog_header_bb
2383 prolog_exit_bb:
2384
2385 merge_bb_2:
2386
2387 vector_preheader:
2388 VECTOR LOOP:
2389 vector_header_bb:
2390 vector_body
2391 if (exit_vector_cond) goto vector_exit_bb
2392 else goto vector_header_bb
2393 vector_exit_bb:
2394
2395 guard_bb_3:
2396 if (skip_epilog) goto merge_bb_3
2397 else goto epilog_preheader
2398
2399 merge_bb_1:
2400
2401 epilog_preheader:
2402 EPILOG:
2403 epilog_header_bb:
2404 epilog_body
2405 if (exit_epilog_cond) goto merge_bb_3
2406 else goto epilog_header_bb
2407
2408 merge_bb_3:
2409
 2410 Note this function peels the prolog and epilog, and adds the guards,
 2411 only when necessary.
97c14603
AV
2412 This function returns the epilogue loop if a decision was made to vectorize
2413 it, otherwise NULL.
2414
2415 The analysis resulting in this epilogue loop's loop_vec_info was performed
2416 in the same vect_analyze_loop call as the main loop's. At that time
2417 vect_analyze_loop constructs a list of accepted loop_vec_info's for lower
2418 vectorization factors than the main loop. This list is stored in the main
 2419 loop's loop_vec_info in the 'epilogue_vinfos' member. Every time we decide to
2420 vectorize the epilogue loop for a lower vectorization factor, the
2421 loop_vec_info sitting at the top of the epilogue_vinfos list is removed,
2422 updated and linked to the epilogue loop. This is later used to vectorize
2423 the epilogue. The reason the loop_vec_info needs updating is that it was
2424 constructed based on the original main loop, and the epilogue loop is a
2425 copy of this loop, so all links pointing to statements in the original loop
2426 need updating. Furthermore, these loop_vec_infos share the
2427 data_reference's records, which will also need to be updated.
a5e3d614
BC
2428
2429 TODO: Guard for prefer_scalar_loop should be emitted along with
2430 versioning conditions if loop versioning is needed. */
2431
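/* Illustration (added for exposition; not part of the original comment):
   after peeling, the original scalar iteration count decomposes as

     niters = niters_prolog + niters_vector * vf + niters_epilog

   e.g. with niters = 103, a prolog of 3 iterations peeled for alignment
   and vf = 8, the vector loop runs (103 - 3) / 8 = 12 times and the epilog
   handles the remaining (103 - 3) % 8 = 4 iterations.  */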
598eaaa2 2432
99b1c316 2433class loop *
a5e3d614 2434vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
0f26839a
RS
2435 tree *niters_vector, tree *step_vector,
2436 tree *niters_vector_mult_vf_var, int th,
97c14603
AV
2437 bool check_profitability, bool niters_no_overflow,
2438 tree *advance, drs_init_vec &orig_drs_init)
a5e3d614
BC
2439{
2440 edge e, guard_e;
2441 tree type = TREE_TYPE (niters), guard_cond;
2442 basic_block guard_bb, guard_to;
357067f2 2443 profile_probability prob_prolog, prob_vector, prob_epilog;
d9f21f6a 2444 int estimated_vf;
535e7c11 2445 int prolog_peeling = 0;
97c14603 2446 bool vect_epilogues = loop_vinfo->epilogue_vinfos.length () > 0;
ca31798e
AV
2447 /* We currently do not support prolog peeling if the target alignment is not
2448 known at compile time. 'vect_gen_prolog_loop_niters' depends on the
2449 target alignment being constant. */
2450 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2451 if (dr_info && !DR_TARGET_ALIGNMENT (dr_info).is_constant ())
2452 return NULL;
2453
535e7c11
RS
2454 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2455 prolog_peeling = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
d1d20a49
RS
2456
2457 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2458 poly_uint64 bound_epilog = 0;
2459 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
2460 && LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2461 bound_epilog += vf - 1;
2462 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2463 bound_epilog += 1;
2464 bool epilog_peeling = maybe_ne (bound_epilog, 0U);
2465 poly_uint64 bound_scalar = bound_epilog;
a5e3d614
BC
2466
2467 if (!prolog_peeling && !epilog_peeling)
598eaaa2 2468 return NULL;
a5e3d614 2469
357067f2 2470 prob_vector = profile_probability::guessed_always ().apply_scale (9, 10);
d9f21f6a
RS
2471 estimated_vf = vect_vf_for_cost (loop_vinfo);
2472 if (estimated_vf == 2)
2473 estimated_vf = 3;
357067f2 2474 prob_prolog = prob_epilog = profile_probability::guessed_always ()
d9f21f6a 2475 .apply_scale (estimated_vf - 1, estimated_vf);
a5e3d614 2476
99b1c316
MS
2477 class loop *prolog, *epilog = NULL, *loop = LOOP_VINFO_LOOP (loop_vinfo);
2478 class loop *first_loop = loop;
3e907b90 2479 bool irred_flag = loop_preheader_edge (loop)->flags & EDGE_IRREDUCIBLE_LOOP;
a5e3d614
BC
2480 create_lcssa_for_virtual_phi (loop);
2481 update_ssa (TODO_update_ssa_only_virtuals);
2482
36f52e8f 2483 if (MAY_HAVE_DEBUG_BIND_STMTS)
a5e3d614
BC
2484 {
2485 gcc_assert (!adjust_vec.exists ());
2486 adjust_vec.create (32);
2487 }
ebfd146a
IR
2488 initialize_original_copy_tables ();
2489
d1d20a49
RS
2490 /* Record the anchor bb at which the guard should be placed if the scalar
2491 loop might be preferred. */
2492 basic_block anchor = loop_preheader_edge (loop)->src;
2493
2494 /* Generate the number of iterations for the prolog loop. We do this here
2495 so that we can also get the upper bound on the number of iterations. */
2496 tree niters_prolog;
2497 int bound_prolog = 0;
2498 if (prolog_peeling)
2499 niters_prolog = vect_gen_prolog_loop_niters (loop_vinfo, anchor,
97c14603 2500 &bound_prolog);
d1d20a49
RS
2501 else
2502 niters_prolog = build_int_cst (type, 0);
2503
97c14603
AV
2504 loop_vec_info epilogue_vinfo = NULL;
2505 if (vect_epilogues)
2506 {
2507 epilogue_vinfo = loop_vinfo->epilogue_vinfos[0];
2508 loop_vinfo->epilogue_vinfos.ordered_remove (0);
2509 }
2510
2511 tree niters_vector_mult_vf = NULL_TREE;
2512 /* Saving NITERs before the loop, as this may be changed by prologue. */
2513 tree before_loop_niters = LOOP_VINFO_NITERS (loop_vinfo);
2514 edge update_e = NULL, skip_e = NULL;
2515 unsigned int lowest_vf = constant_lower_bound (vf);
2516 /* If we know the number of scalar iterations for the main loop we should
2517 check whether after the main loop there are enough iterations left over
2518 for the epilogue. */
2519 if (vect_epilogues
2520 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2521 && prolog_peeling >= 0
2522 && known_eq (vf, lowest_vf))
2523 {
2524 unsigned HOST_WIDE_INT eiters
2525 = (LOOP_VINFO_INT_NITERS (loop_vinfo)
2526 - LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo));
2527
2528 eiters -= prolog_peeling;
2529 eiters
2530 = eiters % lowest_vf + LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo);
2531
2532 unsigned int ratio;
87b47251
AV
2533 unsigned int epilogue_gaps
2534 = LOOP_VINFO_PEELING_FOR_GAPS (epilogue_vinfo);
97c14603
AV
2535 while (!(constant_multiple_p (loop_vinfo->vector_size,
2536 epilogue_vinfo->vector_size, &ratio)
87b47251 2537 && eiters >= lowest_vf / ratio + epilogue_gaps))
97c14603
AV
2538 {
2539 delete epilogue_vinfo;
2540 epilogue_vinfo = NULL;
2541 if (loop_vinfo->epilogue_vinfos.length () == 0)
2542 {
2543 vect_epilogues = false;
2544 break;
2545 }
2546 epilogue_vinfo = loop_vinfo->epilogue_vinfos[0];
2547 loop_vinfo->epilogue_vinfos.ordered_remove (0);
87b47251 2548 epilogue_gaps = LOOP_VINFO_PEELING_FOR_GAPS (epilogue_vinfo);
97c14603
AV
2549 }
2550 }
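  /* Worked example (illustration only): with a main-loop vf of 8,
     prolog_peeling = 2, no gaps and 27 known scalar iterations,
     eiters = (27 - 2) % 8 = 1; a candidate epilogue whose vectors are half
     the size (ratio 2, so it needs at least 8 / 2 = 4 iterations) is
     therefore rejected and the next smaller candidate, if any, is tried.  */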
a5e3d614
BC
2551 /* Prolog loop may be skipped. */
2552 bool skip_prolog = (prolog_peeling != 0);
97c14603
AV
 2553 /* Decide whether the vector loop may have to be skipped because there are
 2554 not enough iterations to enter it.  If so, we perform runtime checks on
 2555 NITERS and branch around the vectorized loop.  When the number of scalar
 2556 iterations is known at compile time, we only add such a check if that
 2557 number may be smaller than the number of iterations required to enter
 2558 the vector loop; for this comparison we use the upper bounds on the
 2559 prolog and epilog peeling.  When we don't know the number of iterations
 2560 and don't require versioning, it is because we have asserted that there
 2561 are enough scalar iterations to enter the main loop, so this skip is not
 2562 necessary.  When we are versioning, we only add such a skip if we have
 2563 chosen to vectorize the epilogue.  The flag below reflects this
 2564 decision. */
d1d20a49
RS
2565 bool skip_vector = (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2566 ? maybe_lt (LOOP_VINFO_INT_NITERS (loop_vinfo),
2567 bound_prolog + bound_epilog)
97c14603
AV
2568 : (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
2569 || vect_epilogues));
a5e3d614
BC
2570 /* Epilog loop must be executed if the number of iterations for epilog
2571 loop is known at compile time, otherwise we need to add a check at
2572 the end of vector loop and skip to the end of epilog loop. */
2573 bool skip_epilog = (prolog_peeling < 0
d9f21f6a
RS
2574 || !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2575 || !vf.is_constant ());
a5e3d614
BC
2576 /* PEELING_FOR_GAPS is special because epilog loop must be executed. */
2577 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2578 skip_epilog = false;
2579
a5e3d614 2580 if (skip_vector)
25c99850
BC
2581 {
2582 split_edge (loop_preheader_edge (loop));
2583
2584 /* Due to the order in which we peel prolog and epilog, we first
2585 propagate probability to the whole loop. The purpose is to
2586 avoid adjusting probabilities of both prolog and vector loops
2587 separately. Note in this case, the probability of epilog loop
2588 needs to be scaled back later. */
2589 basic_block bb_before_loop = loop_preheader_edge (loop)->src;
357067f2 2590 if (prob_vector.initialized_p ())
d9f21f6a
RS
2591 {
2592 scale_bbs_frequencies (&bb_before_loop, 1, prob_vector);
2593 scale_loop_profile (loop, prob_vector, 0);
2594 }
25c99850 2595 }
a5e3d614 2596
4f5b9c80 2597 dump_user_location_t loop_loc = find_loop_location (loop);
99b1c316 2598 class loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
97c14603
AV
2599 if (vect_epilogues)
2600 /* Make sure to set the epilogue's epilogue scalar loop, such that we can
2601 use the original scalar loop as remaining epilogue if necessary. */
2602 LOOP_VINFO_SCALAR_LOOP (epilogue_vinfo)
2603 = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
2604
a5e3d614 2605 if (prolog_peeling)
5d2eb24b 2606 {
a5e3d614
BC
2607 e = loop_preheader_edge (loop);
2608 if (!slpeel_can_duplicate_loop_p (loop, e))
5d2eb24b 2609 {
a5e3d614
BC
2610 dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2611 "loop can't be duplicated to preheader edge.\n");
2612 gcc_unreachable ();
2613 }
2614 /* Peel prolog and put it on preheader edge of loop. */
2615 prolog = slpeel_tree_duplicate_loop_to_edge_cfg (loop, scalar_loop, e);
2616 if (!prolog)
2617 {
2618 dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2619 "slpeel_tree_duplicate_loop_to_edge_cfg failed.\n");
2620 gcc_unreachable ();
2621 }
03001a35 2622 prolog->force_vectorize = false;
a5e3d614
BC
2623 slpeel_update_phi_nodes_for_loops (loop_vinfo, prolog, loop, true);
2624 first_loop = prolog;
2625 reset_original_copy_tables ();
2626
d1d20a49 2627 /* Update the number of iterations for prolog loop. */
0f26839a 2628 tree step_prolog = build_one_cst (TREE_TYPE (niters_prolog));
7cfb4d93
RS
2629 vect_set_loop_condition (prolog, NULL, niters_prolog,
2630 step_prolog, NULL_TREE, false);
a5e3d614
BC
2631
2632 /* Skip the prolog loop. */
2633 if (skip_prolog)
2634 {
2635 guard_cond = fold_build2 (EQ_EXPR, boolean_type_node,
2636 niters_prolog, build_int_cst (type, 0));
2637 guard_bb = loop_preheader_edge (prolog)->src;
25c99850 2638 basic_block bb_after_prolog = loop_preheader_edge (loop)->src;
a5e3d614
BC
2639 guard_to = split_edge (loop_preheader_edge (loop));
2640 guard_e = slpeel_add_loop_guard (guard_bb, guard_cond,
2641 guard_to, guard_bb,
357067f2 2642 prob_prolog.invert (),
3e907b90 2643 irred_flag);
a5e3d614
BC
2644 e = EDGE_PRED (guard_to, 0);
2645 e = (e != guard_e ? e : EDGE_PRED (guard_to, 1));
2646 slpeel_update_phi_nodes_for_guard1 (prolog, loop, guard_e, e);
25c99850 2647
af2bbc51
JH
2648 scale_bbs_frequencies (&bb_after_prolog, 1, prob_prolog);
2649 scale_loop_profile (prolog, prob_prolog, bound_prolog);
a5e3d614 2650 }
97c14603
AV
2651
2652 /* Save original inits for each data_reference before advancing them with
2653 NITERS_PROLOG. */
2654 unsigned int i;
2655 struct data_reference *dr;
2656 vec<data_reference_p> datarefs = loop_vinfo->shared->datarefs;
2657 FOR_EACH_VEC_ELT (datarefs, i, dr)
2658 orig_drs_init.safe_push (std::make_pair (dr, DR_OFFSET (dr)));
2659
a5e3d614 2660 /* Update init address of DRs. */
535e7c11 2661 vect_update_inits_of_drs (loop_vinfo, niters_prolog, PLUS_EXPR);
a5e3d614
BC
2662 /* Update niters for vector loop. */
2663 LOOP_VINFO_NITERS (loop_vinfo)
2664 = fold_build2 (MINUS_EXPR, type, niters, niters_prolog);
2665 LOOP_VINFO_NITERSM1 (loop_vinfo)
2666 = fold_build2 (MINUS_EXPR, type,
2667 LOOP_VINFO_NITERSM1 (loop_vinfo), niters_prolog);
7078979b
BC
2668 bool new_var_p = false;
2669 niters = vect_build_loop_niters (loop_vinfo, &new_var_p);
2670 /* It's guaranteed that vector loop bound before vectorization is at
2671 least VF, so set range information for newly generated var. */
2672 if (new_var_p)
2673 set_range_info (niters, VR_RANGE,
8e6cdc90
RS
2674 wi::to_wide (build_int_cst (type, vf)),
2675 wi::to_wide (TYPE_MAX_VALUE (type)));
a5e3d614 2676
cbb22e61
BC
2677 /* Prolog iterates at most bound_prolog times, latch iterates at
2678 most bound_prolog - 1 times. */
2679 record_niter_bound (prolog, bound_prolog - 1, false, true);
a5e3d614
BC
2680 delete_update_ssa ();
2681 adjust_vec_debug_stmts ();
2682 scev_reset ();
5d2eb24b
IR
2683 }
2684
a5e3d614
BC
2685 if (epilog_peeling)
2686 {
2687 e = single_exit (loop);
2688 if (!slpeel_can_duplicate_loop_p (loop, e))
2689 {
2690 dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2691 "loop can't be duplicated to exit edge.\n");
2692 gcc_unreachable ();
2693 }
97c14603
AV
2694 /* Peel epilog and put it on exit edge of loop. If we are vectorizing
2695 said epilog then we should use a copy of the main loop as a starting
2696 point. This loop may have already had some preliminary transformations
2697 to allow for more optimal vectorization, for example if-conversion.
2698 If we are not vectorizing the epilog then we should use the scalar loop
2699 as the transformations mentioned above make less or no sense when not
2700 vectorizing. */
2701 epilog = vect_epilogues ? get_loop_copy (loop) : scalar_loop;
2702 epilog = slpeel_tree_duplicate_loop_to_edge_cfg (loop, epilog, e);
a5e3d614
BC
2703 if (!epilog)
2704 {
2705 dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2706 "slpeel_tree_duplicate_loop_to_edge_cfg failed.\n");
2707 gcc_unreachable ();
2708 }
03001a35 2709 epilog->force_vectorize = false;
a5e3d614
BC
2710 slpeel_update_phi_nodes_for_loops (loop_vinfo, loop, epilog, false);
2711
2712 /* Scalar version loop may be preferred. In this case, add guard
2713 and skip to epilog. Note this only happens when the number of
2714 iterations of loop is unknown at compile time, otherwise this
2715 won't be vectorized. */
2716 if (skip_vector)
2717 {
cbb22e61 2718 /* Additional epilogue iteration is peeled if gap exists. */
a5e3d614 2719 tree t = vect_gen_scalar_loop_niters (niters_prolog, prolog_peeling,
d1d20a49 2720 bound_prolog, bound_epilog,
cbb22e61 2721 th, &bound_scalar,
a5e3d614 2722 check_profitability);
cbb22e61
BC
2723 /* Build guard against NITERSM1 since NITERS may overflow. */
2724 guard_cond = fold_build2 (LT_EXPR, boolean_type_node, nitersm1, t);
a5e3d614
BC
2725 guard_bb = anchor;
2726 guard_to = split_edge (loop_preheader_edge (epilog));
2727 guard_e = slpeel_add_loop_guard (guard_bb, guard_cond,
2728 guard_to, guard_bb,
357067f2 2729 prob_vector.invert (),
3e907b90 2730 irred_flag);
97c14603 2731 skip_e = guard_e;
a5e3d614
BC
2732 e = EDGE_PRED (guard_to, 0);
2733 e = (e != guard_e ? e : EDGE_PRED (guard_to, 1));
2734 slpeel_update_phi_nodes_for_guard1 (first_loop, epilog, guard_e, e);
25c99850
BC
2735
2736 /* Simply propagate profile info from guard_bb to guard_to which is
2737 a merge point of control flow. */
25c99850 2738 guard_to->count = guard_bb->count;
00fa28d1 2739
af2bbc51
JH
2740 /* Scale probability of epilog loop back.
2741 FIXME: We should avoid scaling down and back up. Profile may
2742 get lost if we scale down to 0. */
10ea2672 2743 basic_block *bbs = get_loop_body (epilog);
00fa28d1
JH
2744 for (unsigned int i = 0; i < epilog->num_nodes; i++)
2745 bbs[i]->count = bbs[i]->count.apply_scale
2746 (bbs[i]->count,
2747 bbs[i]->count.apply_probability
2748 (prob_vector));
af2bbc51 2749 free (bbs);
a5e3d614 2750 }
ebfd146a 2751
25c99850 2752 basic_block bb_before_epilog = loop_preheader_edge (epilog)->src;
a5e3d614
BC
 2753 /* If the loop is peeled for a non-zero constant number of times, niters now
 2754 refers to orig_niters - prolog_peeling; it won't overflow even if
 2755 orig_niters overflows. */
2756 niters_no_overflow |= (prolog_peeling > 0);
2757 vect_gen_vector_loop_niters (loop_vinfo, niters,
0f26839a
RS
2758 niters_vector, step_vector,
2759 niters_no_overflow);
2760 if (!integer_onep (*step_vector))
2761 {
 2762 /* On exit from the loop we will have an easy way of calculating
2763 NITERS_VECTOR / STEP * STEP. Install a dummy definition
2764 until then. */
2765 niters_vector_mult_vf = make_ssa_name (TREE_TYPE (*niters_vector));
2766 SSA_NAME_DEF_STMT (niters_vector_mult_vf) = gimple_build_nop ();
2767 *niters_vector_mult_vf_var = niters_vector_mult_vf;
2768 }
2769 else
2770 vect_gen_vector_loop_niters_mult_vf (loop_vinfo, *niters_vector,
2771 &niters_vector_mult_vf);
a5e3d614
BC
2772 /* Update IVs of original loop as if they were advanced by
2773 niters_vector_mult_vf steps. */
2774 gcc_checking_assert (vect_can_advance_ivs_p (loop_vinfo));
97c14603 2775 update_e = skip_vector ? e : loop_preheader_edge (epilog);
a5e3d614
BC
2776 vect_update_ivs_after_vectorizer (loop_vinfo, niters_vector_mult_vf,
2777 update_e);
2778
2779 if (skip_epilog)
2780 {
2781 guard_cond = fold_build2 (EQ_EXPR, boolean_type_node,
2782 niters, niters_vector_mult_vf);
2783 guard_bb = single_exit (loop)->dest;
2784 guard_to = split_edge (single_exit (epilog));
2785 guard_e = slpeel_add_loop_guard (guard_bb, guard_cond, guard_to,
2786 skip_vector ? anchor : guard_bb,
357067f2 2787 prob_epilog.invert (),
3e907b90 2788 irred_flag);
a5e3d614
BC
2789 slpeel_update_phi_nodes_for_guard2 (loop, epilog, guard_e,
2790 single_exit (epilog));
25c99850
BC
2791 /* Only need to handle basic block before epilog loop if it's not
2792 the guard_bb, which is the case when skip_vector is true. */
2793 if (guard_bb != bb_before_epilog)
2794 {
357067f2 2795 prob_epilog = prob_vector * prob_epilog + prob_vector.invert ();
25c99850 2796
af2bbc51 2797 scale_bbs_frequencies (&bb_before_epilog, 1, prob_epilog);
25c99850 2798 }
d9f21f6a 2799 scale_loop_profile (epilog, prob_epilog, 0);
a5e3d614
BC
2800 }
2801 else
2802 slpeel_update_phi_nodes_for_lcssa (epilog);
ebfd146a 2803
d1d20a49
RS
2804 unsigned HOST_WIDE_INT bound;
2805 if (bound_scalar.is_constant (&bound))
d9f21f6a 2806 {
d1d20a49
RS
2807 gcc_assert (bound != 0);
2808 /* -1 to convert loop iterations to latch iterations. */
2809 record_niter_bound (epilog, bound - 1, false, true);
d9f21f6a 2810 }
a5e3d614
BC
2811
2812 delete_update_ssa ();
2813 adjust_vec_debug_stmts ();
2814 scev_reset ();
2815 }
97c14603
AV
2816
2817 if (vect_epilogues)
2818 {
2819 epilog->aux = epilogue_vinfo;
2820 LOOP_VINFO_LOOP (epilogue_vinfo) = epilog;
2821
2822 loop_constraint_clear (epilog, LOOP_C_INFINITE);
2823
 2824 /* We now must calculate the number of iterations NITERS performed by the
 2825 previous loop and its prolog, and EPILOGUE_NITERS to be performed by the epilogue. */
2826 tree niters = fold_build2 (PLUS_EXPR, TREE_TYPE (niters_vector_mult_vf),
2827 niters_prolog, niters_vector_mult_vf);
2828
 2829 /* If skip_vector is set, the previous loop may have been skipped, so we
 2830 insert a phi-node to determine whether we are coming from the previous
 2831 vectorized loop via the update_e edge or from the skip_vector basic
 2832 block via the skip_e edge. */
2833 if (skip_vector)
2834 {
2835 gcc_assert (update_e != NULL && skip_e != NULL);
2836 gphi *new_phi = create_phi_node (make_ssa_name (TREE_TYPE (niters)),
2837 update_e->dest);
2838 tree new_ssa = make_ssa_name (TREE_TYPE (niters));
2839 gimple *stmt = gimple_build_assign (new_ssa, niters);
2840 gimple_stmt_iterator gsi;
2841 if (TREE_CODE (niters_vector_mult_vf) == SSA_NAME
2842 && SSA_NAME_DEF_STMT (niters_vector_mult_vf)->bb != NULL)
2843 {
2844 gsi = gsi_for_stmt (SSA_NAME_DEF_STMT (niters_vector_mult_vf));
2845 gsi_insert_after (&gsi, stmt, GSI_NEW_STMT);
2846 }
2847 else
2848 {
2849 gsi = gsi_last_bb (update_e->src);
2850 gsi_insert_before (&gsi, stmt, GSI_NEW_STMT);
2851 }
2852
2853 niters = new_ssa;
2854 add_phi_arg (new_phi, niters, update_e, UNKNOWN_LOCATION);
2855 add_phi_arg (new_phi, build_zero_cst (TREE_TYPE (niters)), skip_e,
2856 UNKNOWN_LOCATION);
2857 niters = PHI_RESULT (new_phi);
2858 }
2859
2860 /* Subtract the number of iterations performed by the vectorized loop
2861 from the number of total iterations. */
2862 tree epilogue_niters = fold_build2 (MINUS_EXPR, TREE_TYPE (niters),
2863 before_loop_niters,
2864 niters);
2865
2866 LOOP_VINFO_NITERS (epilogue_vinfo) = epilogue_niters;
2867 LOOP_VINFO_NITERSM1 (epilogue_vinfo)
2868 = fold_build2 (MINUS_EXPR, TREE_TYPE (epilogue_niters),
2869 epilogue_niters,
2870 build_one_cst (TREE_TYPE (epilogue_niters)));
2871
2872 /* Set ADVANCE to the number of iterations performed by the previous
2873 loop and its prologue. */
2874 *advance = niters;
2875
2876 /* Redo the peeling for niter analysis as the NITERs and alignment
2877 may have been updated to take the main loop into account. */
2878 determine_peel_for_niter (epilogue_vinfo);
2879 }
2880
a5e3d614 2881 adjust_vec.release ();
ebfd146a 2882 free_original_copy_tables ();
598eaaa2 2883
97c14603 2884 return vect_epilogues ? epilog : NULL;
ebfd146a
IR
2885}
2886
01d32b2b
BC
2887/* Function vect_create_cond_for_niters_checks.
2888
2889 Create a conditional expression that represents the run-time checks for
ce811fc4 2890 the loop's niter. The loop is guaranteed to terminate if the run-time
01d32b2b
BC
2891 checks hold.
2892
2893 Input:
 2894 COND_EXPR - input conditional expression.  The niter check will be chained
 2895 with it using a logical AND operation; if it is NULL, the
 2896 niter check is used on its own.
 2897 LOOP_VINFO - field LOOP_VINFO_NITERS_ASSUMPTIONS contains the niter
 2898 assumptions to be checked.
2899
2900 Output:
2901 COND_EXPR - conditional expression.
2902
2903 The returned COND_EXPR is the conditional expression to be used in the
2904 if statement that controls which version of the loop gets executed at
2905 runtime. */
2906
2907static void
2908vect_create_cond_for_niters_checks (loop_vec_info loop_vinfo, tree *cond_expr)
2909{
2910 tree part_cond_expr = LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo);
2911
2912 if (*cond_expr)
2913 *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
2914 *cond_expr, part_cond_expr);
2915 else
2916 *cond_expr = part_cond_expr;
2917}
ebfd146a 2918
9adee305
RS
2919/* Set *COND_EXPR to a tree that is true when both the original *COND_EXPR
2920 and PART_COND_EXPR are true. Treat a null *COND_EXPR as "true". */
2921
2922static void
2923chain_cond_expr (tree *cond_expr, tree part_cond_expr)
2924{
2925 if (*cond_expr)
2926 *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
2927 *cond_expr, part_cond_expr);
2928 else
2929 *cond_expr = part_cond_expr;
2930}
2931
ebfd146a
IR
2932/* Function vect_create_cond_for_align_checks.
2933
2934 Create a conditional expression that represents the alignment checks for
2935 all of data references (array element references) whose alignment must be
2936 checked at runtime.
2937
2938 Input:
2939 COND_EXPR - input conditional expression. New conditions will be chained
2940 with logical AND operation.
2941 LOOP_VINFO - two fields of the loop information are used.
2942 LOOP_VINFO_PTR_MASK is the mask used to check the alignment.
2943 LOOP_VINFO_MAY_MISALIGN_STMTS contains the refs to be checked.
2944
2945 Output:
2946 COND_EXPR_STMT_LIST - statements needed to construct the conditional
2947 expression.
2948 The returned value is the conditional expression to be used in the if
2949 statement that controls which version of the loop gets executed at runtime.
2950
2951 The algorithm makes two assumptions:
2952 1) The number of bytes "n" in a vector is a power of 2.
2953 2) An address "a" is aligned if a%n is zero and that this
2954 test can be done as a&(n-1) == 0. For example, for 16
2955 byte vectors the test is a&0xf == 0. */
2956
2957static void
2958vect_create_cond_for_align_checks (loop_vec_info loop_vinfo,
2959 tree *cond_expr,
2960 gimple_seq *cond_expr_stmt_list)
2961{
7bcbf2d8 2962 vec<stmt_vec_info> may_misalign_stmts
ebfd146a 2963 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
7bcbf2d8 2964 stmt_vec_info stmt_info;
ebfd146a
IR
2965 int mask = LOOP_VINFO_PTR_MASK (loop_vinfo);
2966 tree mask_cst;
2967 unsigned int i;
ebfd146a
IR
2968 tree int_ptrsize_type;
2969 char tmp_name[20];
2970 tree or_tmp_name = NULL_TREE;
83d5977e 2971 tree and_tmp_name;
355fe088 2972 gimple *and_stmt;
ebfd146a
IR
2973 tree ptrsize_zero;
2974 tree part_cond_expr;
2975
2976 /* Check that mask is one less than a power of 2, i.e., mask is
2977 all zeros followed by all ones. */
2978 gcc_assert ((mask != 0) && ((mask & (mask+1)) == 0));
2979
96f9265a 2980 int_ptrsize_type = signed_type_for (ptr_type_node);
ebfd146a
IR
2981
2982 /* Create expression (mask & (dr_1 | ... | dr_n)) where dr_i is the address
2983 of the first vector of the i'th data reference. */
2984
7bcbf2d8 2985 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt_info)
ebfd146a
IR
2986 {
2987 gimple_seq new_stmt_list = NULL;
2988 tree addr_base;
83d5977e
RG
2989 tree addr_tmp_name;
2990 tree new_or_tmp_name;
355fe088 2991 gimple *addr_stmt, *or_stmt;
7bcbf2d8 2992 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
d8ba5b19 2993 bool negative = tree_int_cst_compare
7bcbf2d8 2994 (DR_STEP (STMT_VINFO_DATA_REF (stmt_info)), size_zero_node) < 0;
d8ba5b19 2995 tree offset = negative
aad83b7c 2996 ? size_int (-TYPE_VECTOR_SUBPARTS (vectype) + 1) : size_zero_node;
ebfd146a
IR
2997
2998 /* create: addr_tmp = (int)(address_of_first_vector) */
2999 addr_base =
7bcbf2d8 3000 vect_create_addr_base_for_vector_ref (stmt_info, &new_stmt_list,
3f5e8a76 3001 offset);
ebfd146a
IR
3002 if (new_stmt_list != NULL)
3003 gimple_seq_add_seq (cond_expr_stmt_list, new_stmt_list);
3004
83d5977e
RG
3005 sprintf (tmp_name, "addr2int%d", i);
3006 addr_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, tmp_name);
0d0e4a03 3007 addr_stmt = gimple_build_assign (addr_tmp_name, NOP_EXPR, addr_base);
ebfd146a
IR
3008 gimple_seq_add_stmt (cond_expr_stmt_list, addr_stmt);
3009
3010 /* The addresses are ORed together. */
3011
3012 if (or_tmp_name != NULL_TREE)
3013 {
3014 /* create: or_tmp = or_tmp | addr_tmp */
83d5977e
RG
3015 sprintf (tmp_name, "orptrs%d", i);
3016 new_or_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, tmp_name);
0d0e4a03
JJ
3017 or_stmt = gimple_build_assign (new_or_tmp_name, BIT_IOR_EXPR,
3018 or_tmp_name, addr_tmp_name);
ebfd146a
IR
3019 gimple_seq_add_stmt (cond_expr_stmt_list, or_stmt);
3020 or_tmp_name = new_or_tmp_name;
3021 }
3022 else
3023 or_tmp_name = addr_tmp_name;
3024
3025 } /* end for i */
3026
3027 mask_cst = build_int_cst (int_ptrsize_type, mask);
3028
3029 /* create: and_tmp = or_tmp & mask */
83d5977e 3030 and_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, "andmask");
ebfd146a 3031
0d0e4a03
JJ
3032 and_stmt = gimple_build_assign (and_tmp_name, BIT_AND_EXPR,
3033 or_tmp_name, mask_cst);
ebfd146a
IR
3034 gimple_seq_add_stmt (cond_expr_stmt_list, and_stmt);
3035
3036 /* Make and_tmp the left operand of the conditional test against zero.
3037 If and_tmp has a nonzero bit then some address is unaligned. */
3038 ptrsize_zero = build_int_cst (int_ptrsize_type, 0);
3039 part_cond_expr = fold_build2 (EQ_EXPR, boolean_type_node,
3040 and_tmp_name, ptrsize_zero);
9adee305
RS
3041 chain_cond_expr (cond_expr, part_cond_expr);
3042}
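/* Editorial note (not part of the original source): the runtime condition
   emitted above is equivalent to the plain C sketch below, shown here for
   two hypothetical addresses and 16-byte vectors (mask = 0xf).  The helper
   name is illustrative only.  */

#include <stdint.h>	/* for uintptr_t in this sketch */

static int
all_addrs_aligned_sketch (const void *addr1, const void *addr2)
{
  /* OR the addresses together and test the low bits against the mask;
     a zero result means no low bit is set, i.e. every address is
     16-byte aligned.  */
  uintptr_t or_tmp = (uintptr_t) addr1 | (uintptr_t) addr2;
  return (or_tmp & (uintptr_t) 0xf) == 0;
}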
3043
3044/* If LOOP_VINFO_CHECK_UNEQUAL_ADDRS contains <A1, B1>, ..., <An, Bn>,
3045 create a tree representation of: (&A1 != &B1) && ... && (&An != &Bn).
3046 Set *COND_EXPR to a tree that is true when both the original *COND_EXPR
3047 and this new condition are true. Treat a null *COND_EXPR as "true". */
3048
3049static void
3050vect_create_cond_for_unequal_addrs (loop_vec_info loop_vinfo, tree *cond_expr)
3051{
3052 vec<vec_object_pair> pairs = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3053 unsigned int i;
3054 vec_object_pair *pair;
3055 FOR_EACH_VEC_ELT (pairs, i, pair)
3056 {
3057 tree addr1 = build_fold_addr_expr (pair->first);
3058 tree addr2 = build_fold_addr_expr (pair->second);
3059 tree part_cond_expr = fold_build2 (NE_EXPR, boolean_type_node,
3060 addr1, addr2);
3061 chain_cond_expr (cond_expr, part_cond_expr);
3062 }
ebfd146a
IR
3063}
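/* Editorial note (not part of the original source): for two recorded pairs
   <A1, B1> and <A2, B2> the condition built above is simply

       &A1 != &B1 && &A2 != &B2

   so the vectorized version only runs when none of the paired objects turn
   out to be the same object at runtime.  */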
3064
a57776a1
RS
3065/* Create an expression that is true when all lower-bound conditions for
3066 the vectorized loop are met. Chain this condition with *COND_EXPR. */
3067
3068static void
3069vect_create_cond_for_lower_bounds (loop_vec_info loop_vinfo, tree *cond_expr)
3070{
3071 vec<vec_lower_bound> lower_bounds = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
3072 for (unsigned int i = 0; i < lower_bounds.length (); ++i)
3073 {
3074 tree expr = lower_bounds[i].expr;
3075 tree type = unsigned_type_for (TREE_TYPE (expr));
3076 expr = fold_convert (type, expr);
3077 poly_uint64 bound = lower_bounds[i].min_value;
3078 if (!lower_bounds[i].unsigned_p)
3079 {
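	  /* EXPR may be negative here; the adjusted test below,
	     (unsigned) EXPR + (BOUND - 1) >= 2 * BOUND - 1,
	     holds exactly when |EXPR| >= BOUND (for in-range values),
	     because negative EXPRs wrap to the top of the unsigned range.  */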
3080 expr = fold_build2 (PLUS_EXPR, type, expr,
3081 build_int_cstu (type, bound - 1));
3082 bound += bound - 1;
3083 }
3084 tree part_cond_expr = fold_build2 (GE_EXPR, boolean_type_node, expr,
3085 build_int_cstu (type, bound));
3086 chain_cond_expr (cond_expr, part_cond_expr);
3087 }
3088}
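/* Editorial note (not part of the original source): a standalone sketch of
   the signed lower-bound test built above.  The names are illustrative and
   BOUND is assumed small enough that 2 * BOUND - 1 does not overflow.  */

static int
abs_value_ge_bound_sketch (long value, unsigned long bound)
{
  /* Equivalent to |value| >= bound: values in (-bound, bound) fail, all
     others pass, because negative values wrap to the top of the unsigned
     range.  */
  return (unsigned long) value + (bound - 1) >= 2 * bound - 1;
}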
3089
ebfd146a
IR
3090/* Function vect_create_cond_for_alias_checks.
3091
3092 Create a conditional expression that represents the run-time checks for
3093 overlapping of address ranges represented by a list of data references
3094 relations passed as input.
3095
3096 Input:
3097 COND_EXPR - input conditional expression. New conditions will be chained
a05a89fa
CH
3098 with logical AND operation. If it is NULL, the alias
3099 checks become the whole condition.
ebfd146a
IR
3100 LOOP_VINFO - field LOOP_VINFO_MAY_ALIAS_STMTS contains the list of ddrs
3101 to be checked.
3102
3103 Output:
3104 COND_EXPR - conditional expression.
ebfd146a 3105
a05a89fa 3106 The returned COND_EXPR is the conditional expression to be used in the if
ebfd146a
IR
3107 statement that controls which version of the loop gets executed at runtime.
3108*/
3109
a05a89fa 3110void
4bdd44c4 3111vect_create_cond_for_alias_checks (loop_vec_info loop_vinfo, tree * cond_expr)
ebfd146a 3112{
93bdc3ed 3113 vec<dr_with_seg_len_pair_t> comp_alias_ddrs =
a05a89fa 3114 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
ebfd146a 3115
a05a89fa 3116 if (comp_alias_ddrs.is_empty ())
ebfd146a
IR
3117 return;
3118
9cbd2d97
BC
3119 create_runtime_alias_checks (LOOP_VINFO_LOOP (loop_vinfo),
3120 &comp_alias_ddrs, cond_expr);
73fbfcad 3121 if (dump_enabled_p ())
ccb3ad87 3122 dump_printf_loc (MSG_NOTE, vect_location,
78c60e3d 3123 "created %u versioning for alias checks.\n",
a05a89fa 3124 comp_alias_ddrs.length ());
ebfd146a
IR
3125}
3126
3127
3128/* Function vect_loop_versioning.
b8698a0f 3129
ebfd146a
IR
3130 If the loop has data references that may or may not be aligned and/or
3131 has data reference relations whose independence was not proven, then
3132 two versions of the loop need to be generated, one which is vectorized
3133 and one which isn't. A test is then generated to control which of the
3134 loops is executed. The test checks for the alignment of all of the
3135 data references that may or may not be aligned. An additional
3136 sequence of runtime tests is generated for each pair of DDRs whose
b8698a0f
L
3137 independence was not proven. The vectorized version of the loop is
3138 executed only if both the alias and the alignment tests pass.
3139
ebfd146a 3140 The test generated to check which version of the loop is executed
b8698a0f 3141 is modified to also check for profitability as indicated by the
d9157f15 3142 cost model threshold TH.
86290011
RG
3143
3144 The versioning precondition(s) are placed in *COND_EXPR and
d68d56b5 3145 *COND_EXPR_STMT_LIST. */
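/* Editorial note (not part of the original source): conceptually, the
   transformation below produces

       if (niters >= threshold
	   && alignment checks && alias checks && niter assumptions ...)
	 <vectorized loop>
       else
	 <scalar loop>

   where the individual checks are built by the vect_create_cond_for_*
   helpers above and chained together with logical ANDs.  */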
ebfd146a 3146
99b1c316 3147class loop *
a421fe9e 3148vect_loop_versioning (loop_vec_info loop_vinfo)
ebfd146a 3149{
99b1c316
MS
3150 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *nloop;
3151 class loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
ebfd146a 3152 basic_block condition_bb;
538dd0b7
DM
3153 gphi_iterator gsi;
3154 gimple_stmt_iterator cond_exp_gsi;
ebfd146a
IR
3155 basic_block merge_bb;
3156 basic_block new_exit_bb;
3157 edge new_exit_e, e;
538dd0b7 3158 gphi *orig_phi, *new_phi;
368117e8 3159 tree cond_expr = NULL_TREE;
d68d56b5 3160 gimple_seq cond_expr_stmt_list = NULL;
ebfd146a 3161 tree arg;
af2bbc51 3162 profile_probability prob = profile_probability::likely ();
ebfd146a 3163 gimple_seq gimplify_stmt_list = NULL;
fdd29374 3164 tree scalar_loop_iters = LOOP_VINFO_NITERSM1 (loop_vinfo);
9cc1fb4b
XDL
3165 bool version_align = LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo);
3166 bool version_alias = LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo);
01d32b2b 3167 bool version_niter = LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo);
a421fe9e
AV
3168 poly_uint64 versioning_threshold
3169 = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
4e65deef
JJ
3170 tree version_simd_if_cond
3171 = LOOP_REQUIRES_VERSIONING_FOR_SIMD_IF_COND (loop_vinfo);
a421fe9e 3172 unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
ebfd146a 3173
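  /* Editorial note (not part of the original source): SCALAR_LOOP_ITERS is
     LOOP_VINFO_NITERSM1, i.e. the iteration count minus one, which is why
     the checks below compare against TH - 1 and VERSIONING_THRESHOLD - 1
     rather than TH and VERSIONING_THRESHOLD.  */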
6eed64b9 3174 if (vect_apply_runtime_profitability_check_p (loop_vinfo)
a421fe9e 3175 && !ordered_p (th, versioning_threshold))
80be3333 3176 cond_expr = fold_build2 (GE_EXPR, boolean_type_node, scalar_loop_iters,
01d32b2b 3177 build_int_cst (TREE_TYPE (scalar_loop_iters),
fdd29374 3178 th - 1));
a696bc4f
RS
3179 if (maybe_ne (versioning_threshold, 0U))
3180 {
3181 tree expr = fold_build2 (GE_EXPR, boolean_type_node, scalar_loop_iters,
3182 build_int_cst (TREE_TYPE (scalar_loop_iters),
3183 versioning_threshold - 1));
3184 if (cond_expr)
3185 cond_expr = fold_build2 (BIT_AND_EXPR, boolean_type_node,
3186 expr, cond_expr);
3187 else
3188 cond_expr = expr;
3189 }
01d32b2b
BC
3190
3191 if (version_niter)
3192 vect_create_cond_for_niters_checks (loop_vinfo, &cond_expr);
3193
3194 if (cond_expr)
2778a719
RB
3195 cond_expr = force_gimple_operand_1 (unshare_expr (cond_expr),
3196 &cond_expr_stmt_list,
01d32b2b 3197 is_gimple_condexpr, NULL_TREE);
ebfd146a 3198
9cc1fb4b 3199 if (version_align)
d68d56b5
RG
3200 vect_create_cond_for_align_checks (loop_vinfo, &cond_expr,
3201 &cond_expr_stmt_list);
ebfd146a 3202
9cc1fb4b 3203 if (version_alias)
9adee305
RS
3204 {
3205 vect_create_cond_for_unequal_addrs (loop_vinfo, &cond_expr);
a57776a1 3206 vect_create_cond_for_lower_bounds (loop_vinfo, &cond_expr);
9adee305
RS
3207 vect_create_cond_for_alias_checks (loop_vinfo, &cond_expr);
3208 }
86290011 3209
4e65deef
JJ
3210 if (version_simd_if_cond)
3211 {
3212 gcc_assert (dom_info_available_p (CDI_DOMINATORS));
3213 if (flag_checking)
3214 if (basic_block bb
3215 = gimple_bb (SSA_NAME_DEF_STMT (version_simd_if_cond)))
3216 gcc_assert (bb != loop->header
3217 && dominated_by_p (CDI_DOMINATORS, loop->header, bb)
3218 && (scalar_loop == NULL
3219 || (bb != scalar_loop->header
3220 && dominated_by_p (CDI_DOMINATORS,
3221 scalar_loop->header, bb))));
3222 tree zero = build_zero_cst (TREE_TYPE (version_simd_if_cond));
3223 tree c = fold_build2 (NE_EXPR, boolean_type_node,
3224 version_simd_if_cond, zero);
3225 if (cond_expr)
3226 cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
3227 c, cond_expr);
3228 else
3229 cond_expr = c;
3230 if (dump_enabled_p ())
3231 dump_printf_loc (MSG_NOTE, vect_location,
3232 "created versioning for simd if condition check.\n");
3233 }
3234
b210f45f
RB
3235 cond_expr = force_gimple_operand_1 (unshare_expr (cond_expr),
3236 &gimplify_stmt_list,
d68d56b5
RG
3237 is_gimple_condexpr, NULL_TREE);
3238 gimple_seq_add_seq (&cond_expr_stmt_list, gimplify_stmt_list);
ebfd146a 3239
2778a719
RB
3240 /* Compute the outermost loop that cond_expr and cond_expr_stmt_list are
3241 invariant in. */
99b1c316 3242 class loop *outermost = outermost_invariant_loop_for_expr (loop, cond_expr);
2778a719
RB
3243 for (gimple_stmt_iterator gsi = gsi_start (cond_expr_stmt_list);
3244 !gsi_end_p (gsi); gsi_next (&gsi))
5ce9450f 3245 {
2778a719
RB
3246 gimple *stmt = gsi_stmt (gsi);
3247 update_stmt (stmt);
3248 ssa_op_iter iter;
3249 use_operand_p use_p;
3250 basic_block def_bb;
3251 FOR_EACH_SSA_USE_OPERAND (use_p, stmt, iter, SSA_OP_USE)
3252 if ((def_bb = gimple_bb (SSA_NAME_DEF_STMT (USE_FROM_PTR (use_p))))
3253 && flow_bb_inside_loop_p (outermost, def_bb))
3254 outermost = superloop_at_depth (loop, bb_loop_depth (def_bb) + 1);
3255 }
5ce9450f 3256
2778a719
RB
3257 /* Search for the outermost loop we can version. Avoid versioning of
3258 non-perfect nests but allow if-conversion versioned loops inside. */
99b1c316 3259 class loop *loop_to_version = loop;
2778a719
RB
3260 if (flow_loop_nested_p (outermost, loop))
3261 {
3262 if (dump_enabled_p ())
3263 dump_printf_loc (MSG_NOTE, vect_location,
3264 "trying to apply versioning to outer loop %d\n",
3265 outermost->num);
3266 if (outermost->num == 0)
3267 outermost = superloop_at_depth (loop, 1);
3268 /* And avoid applying versioning on non-perfect nests. */
3269 while (loop_to_version != outermost
3270 && single_exit (loop_outer (loop_to_version))
3271 && (!loop_outer (loop_to_version)->inner->next
3272 || vect_loop_vectorized_call (loop_to_version))
3273 && (!loop_outer (loop_to_version)->inner->next
3274 || !loop_outer (loop_to_version)->inner->next->next))
3275 loop_to_version = loop_outer (loop_to_version);
3276 }
3277
3278 /* Apply versioning. If there is already a scalar version created by
b614fca2
RB
3279 if-conversion, re-use that. Note we cannot re-use the copy of
3280 an if-converted outer-loop when vectorizing the inner loop only. */
2778a719 3281 gcond *cond;
b614fca2
RB
3282 gimple *call;
3283 if ((!loop_to_version->inner || loop == loop_to_version)
3284 && (call = vect_loop_vectorized_call (loop_to_version, &cond)))
2778a719
RB
3285 {
3286 gcc_assert (scalar_loop);
3287 condition_bb = gimple_bb (cond);
3288 gimple_cond_set_condition_from_tree (cond, cond_expr);
3289 update_stmt (cond);
3290
3291 if (cond_expr_stmt_list)
3292 {
3293 cond_exp_gsi = gsi_for_stmt (call);
3294 gsi_insert_seq_before (&cond_exp_gsi, cond_expr_stmt_list,
3295 GSI_SAME_STMT);
3296 }
3297
5cee3239
RB
3298 /* if-conversion uses profile_probability::always () for both paths,
3299 reset both edge probabilities appropriately. */
3300 edge te, fe;
3301 extract_true_false_edges_from_block (condition_bb, &te, &fe);
3302 te->probability = prob;
3303 fe->probability = prob.invert ();
3304 /* We can scale the loop's profile counts immediately but have to postpone
3305 scaling the scalar loop because we re-use it during peeling. */
3306 scale_loop_frequencies (loop_to_version, te->probability);
3307 LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo) = fe->probability;
3308
2778a719
RB
3309 nloop = scalar_loop;
3310 if (dump_enabled_p ())
3311 dump_printf_loc (MSG_NOTE, vect_location,
3312 "reusing %sloop version created by if conversion\n",
3313 loop_to_version != loop ? "outer " : "");
5ce9450f
JJ
3314 }
3315 else
2778a719
RB
3316 {
3317 if (loop_to_version != loop
3318 && dump_enabled_p ())
3319 dump_printf_loc (MSG_NOTE, vect_location,
3320 "applying loop versioning to outer loop %d\n",
3321 loop_to_version->num);
3322
3323 initialize_original_copy_tables ();
3324 nloop = loop_version (loop_to_version, cond_expr, &condition_bb,
3325 prob, prob.invert (), prob, prob.invert (), true);
3326 gcc_assert (nloop);
3327 nloop = get_loop_copy (loop);
3328
3329 /* Kill off the IFN_LOOP_VECTORIZED calls in the copy; nobody will
3330 reap those otherwise, and they also refer to the original
3331 loops. */
99b1c316 3332 class loop *l = loop;
2778a719
RB
3333 while (gimple *call = vect_loop_vectorized_call (l))
3334 {
3335 call = SSA_NAME_DEF_STMT (get_current_def (gimple_call_lhs (call)));
3336 fold_loop_internal_call (call, boolean_false_node);
3337 l = loop_outer (l);
3338 }
3339 free_original_copy_tables ();
3340
3341 if (cond_expr_stmt_list)
3342 {
3343 cond_exp_gsi = gsi_last_bb (condition_bb);
3344 gsi_insert_seq_before (&cond_exp_gsi, cond_expr_stmt_list,
3345 GSI_SAME_STMT);
3346 }
3347
3348 /* Loop versioning violates an assumption we try to maintain during
3349 vectorization - that the loop exit block has a single predecessor.
3350 After versioning, the exit block of both loop versions is the same
3351 basic block (i.e. it has two predecessors). To simplify the
3352 following transformations in the vectorizer, we fix this situation
3353 here by adding a new (empty) block on the exit-edge of the loop,
3354 with the proper loop-exit phis to maintain loop-closed-form.
3355 If loop versioning wasn't done from loop, but from scalar_loop instead,
3356 merge_bb will already have just a single predecessor. */
3357
3358 merge_bb = single_exit (loop_to_version)->dest;
3359 if (EDGE_COUNT (merge_bb->preds) >= 2)
3360 {
3361 gcc_assert (EDGE_COUNT (merge_bb->preds) >= 2);
3362 new_exit_bb = split_edge (single_exit (loop_to_version));
3363 new_exit_e = single_exit (loop_to_version);
3364 e = EDGE_SUCC (new_exit_bb, 0);
3365
3366 for (gsi = gsi_start_phis (merge_bb); !gsi_end_p (gsi);
3367 gsi_next (&gsi))
3368 {
3369 tree new_res;
3370 orig_phi = gsi.phi ();
3371 new_res = copy_ssa_name (PHI_RESULT (orig_phi));
3372 new_phi = create_phi_node (new_res, new_exit_bb);
3373 arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, e);
3374 add_phi_arg (new_phi, arg, new_exit_e,
3375 gimple_phi_arg_location_from_edge (orig_phi, e));
3376 adjust_phi_and_debug_stmts (orig_phi, e, PHI_RESULT (new_phi));
3377 }
3378 }
3379
3380 update_ssa (TODO_update_ssa);
3381 }
01d32b2b
BC
3382
3383 if (version_niter)
3384 {
3385 /* The versioned loop could be infinite, so we need to clear the existing
3386 niter information, which was copied from the original loop. */
3387 gcc_assert (loop_constraint_set_p (loop, LOOP_C_FINITE));
3388 vect_free_loop_info_assumptions (nloop);
3389 /* And set constraint LOOP_C_INFINITE for niter analyzer. */
3390 loop_constraint_set (loop, LOOP_C_INFINITE);
3391 }
9cc1fb4b 3392
4f5b9c80 3393 if (LOCATION_LOCUS (vect_location.get_location_t ()) != UNKNOWN_LOCATION
9cc1fb4b
XDL
3394 && dump_enabled_p ())
3395 {
3396 if (version_alias)
7db960c5
DM
3397 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS | MSG_PRIORITY_USER_FACING,
3398 vect_location,
9cc1fb4b
XDL
3399 "loop versioned for vectorization because of "
3400 "possible aliasing\n");
3401 if (version_align)
7db960c5
DM
3402 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS | MSG_PRIORITY_USER_FACING,
3403 vect_location,
9cc1fb4b
XDL
3404 "loop versioned for vectorization to enhance "
3405 "alignment\n");
3406
3407 }
03001a35
RB
3408
3409 return nloop;
ebfd146a 3410}