/* Vectorizer Specific Loop Manipulations
   Copyright (C) 2003-2020 Free Software Foundation, Inc.
   Contributed by Dorit Naishlos <dorit@il.ibm.com>
   and Ira Rosen <irar@il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */

#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "tree.h"
#include "gimple.h"
#include "cfghooks.h"
#include "tree-pass.h"
#include "ssa.h"
#include "fold-const.h"
#include "cfganal.h"
#include "gimplify.h"
#include "gimple-iterator.h"
#include "gimplify-me.h"
#include "tree-cfg.h"
#include "tree-ssa-loop-manip.h"
#include "tree-into-ssa.h"
#include "tree-ssa.h"
#include "cfgloop.h"
#include "tree-scalar-evolution.h"
#include "tree-vectorizer.h"
#include "tree-ssa-loop-ivopts.h"
#include "gimple-fold.h"
#include "tree-ssa-loop-niter.h"
#include "internal-fn.h"
#include "stor-layout.h"
#include "optabs-query.h"
#include "vec-perm-indices.h"
#include "insn-config.h"
#include "rtl.h"
#include "recog.h"

/*************************************************************************
  Simple Loop Peeling Utilities

  Utilities to support loop peeling for vectorization purposes.
 *************************************************************************/


/* Renames the use *OP_P.  */

static void
rename_use_op (use_operand_p op_p)
{
  tree new_name;

  if (TREE_CODE (USE_FROM_PTR (op_p)) != SSA_NAME)
    return;

  new_name = get_current_def (USE_FROM_PTR (op_p));

  /* Something defined outside of the loop.  */
  if (!new_name)
    return;

  /* An ordinary ssa name defined in the loop.  */

  SET_USE (op_p, new_name);
}


/* Renames the variables in basic block BB.  Allow renaming of PHI arguments
   on edges incoming from outer-block header if RENAME_FROM_OUTER_LOOP is
   true.  */

static void
rename_variables_in_bb (basic_block bb, bool rename_from_outer_loop)
{
  gimple *stmt;
  use_operand_p use_p;
  ssa_op_iter iter;
  edge e;
  edge_iterator ei;
  class loop *loop = bb->loop_father;
  class loop *outer_loop = NULL;

  if (rename_from_outer_loop)
    {
      gcc_assert (loop);
      outer_loop = loop_outer (loop);
    }

  for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
       gsi_next (&gsi))
    {
      stmt = gsi_stmt (gsi);
      FOR_EACH_SSA_USE_OPERAND (use_p, stmt, iter, SSA_OP_ALL_USES)
        rename_use_op (use_p);
    }

  FOR_EACH_EDGE (e, ei, bb->preds)
    {
      if (!flow_bb_inside_loop_p (loop, e->src))
        {
          if (!rename_from_outer_loop)
            continue;
          if (e->src != outer_loop->header)
            {
              if (outer_loop->inner->next)
                {
                  /* If outer_loop has 2 inner loops, allow there to
                     be an extra basic block which decides which of the
                     two loops to use using LOOP_VECTORIZED.  */
                  if (!single_pred_p (e->src)
                      || single_pred (e->src) != outer_loop->header)
                    continue;
                }
            }
        }
      for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
           gsi_next (&gsi))
        rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (gsi.phi (), e));
    }
}


struct adjust_info
{
  tree from, to;
  basic_block bb;
};

/* A stack of values to be adjusted in debug stmts.  We have to
   process them LIFO, so that the closest substitution applies.  If we
   processed them FIFO, without the stack, we might substitute uses
   with a PHI DEF that would soon become non-dominant, and when we got
   to the suitable one, it wouldn't have anything to substitute any
   more.  */
static vec<adjust_info, va_heap> adjust_vec;

/* Adjust any debug stmts that referenced AI->from values to use the
   loop-closed AI->to, if the references are dominated by AI->bb and
   not by the definition of AI->from.  */

static void
adjust_debug_stmts_now (adjust_info *ai)
{
  basic_block bbphi = ai->bb;
  tree orig_def = ai->from;
  tree new_def = ai->to;
  imm_use_iterator imm_iter;
  gimple *stmt;
  basic_block bbdef = gimple_bb (SSA_NAME_DEF_STMT (orig_def));

  gcc_assert (dom_info_available_p (CDI_DOMINATORS));

  /* Adjust any debug stmts that held onto non-loop-closed
     references.  */
  FOR_EACH_IMM_USE_STMT (stmt, imm_iter, orig_def)
    {
      use_operand_p use_p;
      basic_block bbuse;

      if (!is_gimple_debug (stmt))
        continue;

      gcc_assert (gimple_debug_bind_p (stmt));

      bbuse = gimple_bb (stmt);

      if ((bbuse == bbphi
           || dominated_by_p (CDI_DOMINATORS, bbuse, bbphi))
          && !(bbuse == bbdef
               || dominated_by_p (CDI_DOMINATORS, bbuse, bbdef)))
        {
          if (new_def)
            FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
              SET_USE (use_p, new_def);
          else
            {
              gimple_debug_bind_reset_value (stmt);
              update_stmt (stmt);
            }
        }
    }
}

/* Adjust debug stmts as scheduled before.  */

static void
adjust_vec_debug_stmts (void)
{
  if (!MAY_HAVE_DEBUG_BIND_STMTS)
    return;

  gcc_assert (adjust_vec.exists ());

  while (!adjust_vec.is_empty ())
    {
      adjust_debug_stmts_now (&adjust_vec.last ());
      adjust_vec.pop ();
    }
}

/* Adjust any debug stmts that referenced FROM values to use the
   loop-closed TO, if the references are dominated by BB and not by
   the definition of FROM.  If adjust_vec is non-NULL, adjustments
   will be postponed until adjust_vec_debug_stmts is called.  */

static void
adjust_debug_stmts (tree from, tree to, basic_block bb)
{
  adjust_info ai;

  if (MAY_HAVE_DEBUG_BIND_STMTS
      && TREE_CODE (from) == SSA_NAME
      && ! SSA_NAME_IS_DEFAULT_DEF (from)
      && ! virtual_operand_p (from))
    {
      ai.from = from;
      ai.to = to;
      ai.bb = bb;

      if (adjust_vec.exists ())
        adjust_vec.safe_push (ai);
      else
        adjust_debug_stmts_now (&ai);
    }
}

/* Change E's phi arg in UPDATE_PHI to NEW_DEF, and record information
   to adjust any debug stmts that referenced the old phi arg,
   presumably non-loop-closed references left over from other
   transformations.  */

static void
adjust_phi_and_debug_stmts (gimple *update_phi, edge e, tree new_def)
{
  tree orig_def = PHI_ARG_DEF_FROM_EDGE (update_phi, e);

  SET_PHI_ARG_DEF (update_phi, e->dest_idx, new_def);

  if (MAY_HAVE_DEBUG_BIND_STMTS)
    adjust_debug_stmts (orig_def, PHI_RESULT (update_phi),
                        gimple_bb (update_phi));
}

/* Define one loop mask MASK from loop LOOP.  INIT_MASK is the value that
   the mask should have during the first iteration and NEXT_MASK is the
   value that it should have on subsequent iterations.  */

static void
vect_set_loop_mask (class loop *loop, tree mask, tree init_mask,
                    tree next_mask)
{
  gphi *phi = create_phi_node (mask, loop->header);
  add_phi_arg (phi, init_mask, loop_preheader_edge (loop), UNKNOWN_LOCATION);
  add_phi_arg (phi, next_mask, loop_latch_edge (loop), UNKNOWN_LOCATION);
}

/* Add SEQ to the end of LOOP's preheader block.  */

static void
add_preheader_seq (class loop *loop, gimple_seq seq)
{
  if (seq)
    {
      edge pe = loop_preheader_edge (loop);
      basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
      gcc_assert (!new_bb);
    }
}

/* Add SEQ to the beginning of LOOP's header block.  */

static void
add_header_seq (class loop *loop, gimple_seq seq)
{
  if (seq)
    {
      gimple_stmt_iterator gsi = gsi_after_labels (loop->header);
      gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
    }
}

/* Return true if the target can interleave elements of two vectors.
   OFFSET is 0 if the first half of the vectors should be interleaved
   or 1 if the second half should.  When returning true, store the
   associated permutation in INDICES.  */
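
/* For a vector type with four elements, for example, OFFSET 0 asks for
   the permutation { 0, 4, 1, 5 } and OFFSET 1 for { 2, 6, 3, 7 }, i.e.
   the low or high halves of the two inputs interleaved.  */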

static bool
interleave_supported_p (vec_perm_indices *indices, tree vectype,
                        unsigned int offset)
{
  poly_uint64 nelts = TYPE_VECTOR_SUBPARTS (vectype);
  poly_uint64 base = exact_div (nelts, 2) * offset;
  vec_perm_builder sel (nelts, 2, 3);
  for (unsigned int i = 0; i < 3; ++i)
    {
      sel.quick_push (base + i);
      sel.quick_push (base + i + nelts);
    }
  indices->new_vector (sel, 2, nelts);
  return can_vec_perm_const_p (TYPE_MODE (vectype), *indices);
}

/* Try to use permutes to define the masks in DEST_RGM using the masks
   in SRC_RGM, given that the former has twice as many masks as the
   latter.  Return true on success, adding any new statements to SEQ.  */

static bool
vect_maybe_permute_loop_masks (gimple_seq *seq, rgroup_masks *dest_rgm,
                               rgroup_masks *src_rgm)
{
  tree src_masktype = src_rgm->mask_type;
  tree dest_masktype = dest_rgm->mask_type;
  machine_mode src_mode = TYPE_MODE (src_masktype);
  insn_code icode1, icode2;
  if (dest_rgm->max_nscalars_per_iter <= src_rgm->max_nscalars_per_iter
      && (icode1 = optab_handler (vec_unpacku_hi_optab,
                                  src_mode)) != CODE_FOR_nothing
      && (icode2 = optab_handler (vec_unpacku_lo_optab,
                                  src_mode)) != CODE_FOR_nothing)
    {
      /* Unpacking the source masks gives at least as many mask bits as
         we need.  We can then VIEW_CONVERT any excess bits away.  */
      machine_mode dest_mode = insn_data[icode1].operand[0].mode;
      gcc_assert (dest_mode == insn_data[icode2].operand[0].mode);
      tree unpack_masktype = vect_halve_mask_nunits (src_masktype, dest_mode);
      for (unsigned int i = 0; i < dest_rgm->masks.length (); ++i)
        {
          tree src = src_rgm->masks[i / 2];
          tree dest = dest_rgm->masks[i];
          tree_code code = ((i & 1) == (BYTES_BIG_ENDIAN ? 0 : 1)
                            ? VEC_UNPACK_HI_EXPR
                            : VEC_UNPACK_LO_EXPR);
          gassign *stmt;
          if (dest_masktype == unpack_masktype)
            stmt = gimple_build_assign (dest, code, src);
          else
            {
              tree temp = make_ssa_name (unpack_masktype);
              stmt = gimple_build_assign (temp, code, src);
              gimple_seq_add_stmt (seq, stmt);
              stmt = gimple_build_assign (dest, VIEW_CONVERT_EXPR,
                                          build1 (VIEW_CONVERT_EXPR,
                                                  dest_masktype, temp));
            }
          gimple_seq_add_stmt (seq, stmt);
        }
      return true;
    }
  vec_perm_indices indices[2];
  if (dest_masktype == src_masktype
      && interleave_supported_p (&indices[0], src_masktype, 0)
      && interleave_supported_p (&indices[1], src_masktype, 1))
    {
      /* The destination requires twice as many mask bits as the source, so
         we can use interleaving permutes to double up the number of bits.  */
      tree masks[2];
      for (unsigned int i = 0; i < 2; ++i)
        masks[i] = vect_gen_perm_mask_checked (src_masktype, indices[i]);
      for (unsigned int i = 0; i < dest_rgm->masks.length (); ++i)
        {
          tree src = src_rgm->masks[i / 2];
          tree dest = dest_rgm->masks[i];
          gimple *stmt = gimple_build_assign (dest, VEC_PERM_EXPR,
                                              src, src, masks[i & 1]);
          gimple_seq_add_stmt (seq, stmt);
        }
      return true;
    }
  return false;
}

/* Helper for vect_set_loop_condition_masked.  Generate definitions for
   all the masks in RGM and return a mask that is nonzero when the loop
   needs to iterate.  Add any new preheader statements to PREHEADER_SEQ.
   Use LOOP_COND_GSI to insert code before the exit gcond.

   RGM belongs to loop LOOP.  The loop originally iterated NITERS
   times and has been vectorized according to LOOP_VINFO.

   If NITERS_SKIP is nonnull, the first iteration of the vectorized loop
   starts with NITERS_SKIP dummy iterations of the scalar loop before
   the real work starts.  The mask elements for these dummy iterations
   must be 0, to ensure that the extra iterations do not have an effect.

   It is known that:

     NITERS * RGM->max_nscalars_per_iter

   does not overflow.  However, MIGHT_WRAP_P says whether an induction
   variable that starts at 0 and has step:

     VF * RGM->max_nscalars_per_iter

   might overflow before hitting a value above:

     (NITERS + NITERS_SKIP) * RGM->max_nscalars_per_iter

   This means that we cannot guarantee that such an induction variable
   would ever hit a value that produces a set of all-false masks for RGM.  */
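
/* As a concrete example of the quantities involved: with a vectorization
   factor of 16, an rgroup whose masks control two scalars per scalar
   iteration and NITERS == 100, the code below computes
   NSCALARS_TOTAL = 200 and steps the IV by NSCALARS_STEP = 32 on each
   iteration of the vector loop.  */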

static tree
vect_set_loop_masks_directly (class loop *loop, loop_vec_info loop_vinfo,
                              gimple_seq *preheader_seq,
                              gimple_stmt_iterator loop_cond_gsi,
                              rgroup_masks *rgm, tree niters, tree niters_skip,
                              bool might_wrap_p)
{
  tree compare_type = LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo);
  tree iv_type = LOOP_VINFO_MASK_IV_TYPE (loop_vinfo);
  tree mask_type = rgm->mask_type;
  unsigned int nscalars_per_iter = rgm->max_nscalars_per_iter;
  poly_uint64 nscalars_per_mask = TYPE_VECTOR_SUBPARTS (mask_type);
  poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);

  /* Calculate the maximum number of scalar values that the rgroup
     handles in total, the number that it handles for each iteration
     of the vector loop, and the number that it should skip during the
     first iteration of the vector loop.  */
  tree nscalars_total = niters;
  tree nscalars_step = build_int_cst (iv_type, vf);
  tree nscalars_skip = niters_skip;
  if (nscalars_per_iter != 1)
    {
      /* We checked before choosing to use a fully-masked loop that these
         multiplications don't overflow.  */
      tree compare_factor = build_int_cst (compare_type, nscalars_per_iter);
      tree iv_factor = build_int_cst (iv_type, nscalars_per_iter);
      nscalars_total = gimple_build (preheader_seq, MULT_EXPR, compare_type,
                                     nscalars_total, compare_factor);
      nscalars_step = gimple_build (preheader_seq, MULT_EXPR, iv_type,
                                    nscalars_step, iv_factor);
      if (nscalars_skip)
        nscalars_skip = gimple_build (preheader_seq, MULT_EXPR, compare_type,
                                      nscalars_skip, compare_factor);
    }

  /* Create an induction variable that counts the number of scalars
     processed.  */
  tree index_before_incr, index_after_incr;
  gimple_stmt_iterator incr_gsi;
  bool insert_after;
  standard_iv_increment_position (loop, &incr_gsi, &insert_after);
  create_iv (build_int_cst (iv_type, 0), nscalars_step, NULL_TREE, loop,
             &incr_gsi, insert_after, &index_before_incr, &index_after_incr);

  tree zero_index = build_int_cst (compare_type, 0);
  tree test_index, test_limit, first_limit;
  gimple_stmt_iterator *test_gsi;
  if (might_wrap_p)
    {
      /* In principle the loop should stop iterating once the incremented
         IV reaches a value greater than or equal to:

           NSCALARS_TOTAL +[infinite-prec] NSCALARS_SKIP

         However, there's no guarantee that this addition doesn't overflow
         the comparison type, or that the IV hits a value above it before
         wrapping around.  We therefore adjust the limit down by one
         IV step:

           (NSCALARS_TOTAL +[infinite-prec] NSCALARS_SKIP)
           -[infinite-prec] NSCALARS_STEP

         and compare the IV against this limit _before_ incrementing it.
         Since the comparison type is unsigned, we actually want the
         subtraction to saturate at zero:

           (NSCALARS_TOTAL +[infinite-prec] NSCALARS_SKIP)
           -[sat] NSCALARS_STEP

         And since NSCALARS_SKIP < NSCALARS_STEP, we can reassociate this as:

           NSCALARS_TOTAL -[sat] (NSCALARS_STEP - NSCALARS_SKIP)

         where the rightmost subtraction can be done directly in
         COMPARE_TYPE.  */
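      /* For instance, with NSCALARS_TOTAL == 200, NSCALARS_STEP == 32 and
         no skipped scalars, this computes MAX (200, 32) - 32 == 168:
         testing the un-incremented IV against 168 is equivalent to testing
         the incremented IV against 200, without risking overflow.  */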
      test_index = index_before_incr;
      tree adjust = gimple_convert (preheader_seq, compare_type,
                                    nscalars_step);
      if (nscalars_skip)
        adjust = gimple_build (preheader_seq, MINUS_EXPR, compare_type,
                               adjust, nscalars_skip);
      test_limit = gimple_build (preheader_seq, MAX_EXPR, compare_type,
                                 nscalars_total, adjust);
      test_limit = gimple_build (preheader_seq, MINUS_EXPR, compare_type,
                                 test_limit, adjust);
      test_gsi = &incr_gsi;

      /* Get a safe limit for the first iteration.  */
      if (nscalars_skip)
        {
          /* The first vector iteration can handle at most NSCALARS_STEP
             scalars.  NSCALARS_STEP <= CONST_LIMIT, and adding
             NSCALARS_SKIP to that cannot overflow.  */
          tree const_limit = build_int_cst (compare_type,
                                            LOOP_VINFO_VECT_FACTOR (loop_vinfo)
                                            * nscalars_per_iter);
          first_limit = gimple_build (preheader_seq, MIN_EXPR, compare_type,
                                      nscalars_total, const_limit);
          first_limit = gimple_build (preheader_seq, PLUS_EXPR, compare_type,
                                      first_limit, nscalars_skip);
        }
      else
        /* For the first iteration it doesn't matter whether the IV hits
           a value above NSCALARS_TOTAL.  That only matters for the latch
           condition.  */
        first_limit = nscalars_total;
    }
  else
    {
      /* Test the incremented IV, which will always hit a value above
         the bound before wrapping.  */
      test_index = index_after_incr;
      test_limit = nscalars_total;
      if (nscalars_skip)
        test_limit = gimple_build (preheader_seq, PLUS_EXPR, compare_type,
                                   test_limit, nscalars_skip);
      test_gsi = &loop_cond_gsi;

      first_limit = test_limit;
    }

  /* Convert the IV value to the comparison type (either a no-op or
     a demotion).  */
  gimple_seq test_seq = NULL;
  test_index = gimple_convert (&test_seq, compare_type, test_index);
  gsi_insert_seq_before (test_gsi, test_seq, GSI_SAME_STMT);

  /* Provide a definition of each mask in the group.  */
  tree next_mask = NULL_TREE;
  tree mask;
  unsigned int i;
  FOR_EACH_VEC_ELT_REVERSE (rgm->masks, i, mask)
    {
      /* Previous masks will cover BIAS scalars.  This mask covers the
         next batch.  */
      poly_uint64 bias = nscalars_per_mask * i;
      tree bias_tree = build_int_cst (compare_type, bias);
      gimple *tmp_stmt;

      /* See whether the first iteration of the vector loop is known
         to have a full mask.  */
      poly_uint64 const_limit;
      bool first_iteration_full
        = (poly_int_tree_p (first_limit, &const_limit)
           && known_ge (const_limit, (i + 1) * nscalars_per_mask));

      /* Rather than have a new IV that starts at BIAS and goes up to
         TEST_LIMIT, prefer to use the same 0-based IV for each mask
         and adjust the bound down by BIAS.  */
      tree this_test_limit = test_limit;
      if (i != 0)
        {
          this_test_limit = gimple_build (preheader_seq, MAX_EXPR,
                                          compare_type, this_test_limit,
                                          bias_tree);
          this_test_limit = gimple_build (preheader_seq, MINUS_EXPR,
                                          compare_type, this_test_limit,
                                          bias_tree);
        }

      /* Create the initial mask.  First include all scalars that
         are within the loop limit.  */
      tree init_mask = NULL_TREE;
      if (!first_iteration_full)
        {
          tree start, end;
          if (first_limit == test_limit)
            {
              /* Use a natural test between zero (the initial IV value)
                 and the loop limit.  The "else" block would be valid too,
                 but this choice can avoid the need to load BIAS_TREE into
                 a register.  */
              start = zero_index;
              end = this_test_limit;
            }
          else
            {
              /* FIRST_LIMIT is the maximum number of scalars handled by the
                 first iteration of the vector loop.  Test the portion
                 associated with this mask.  */
              start = bias_tree;
              end = first_limit;
            }

          init_mask = make_temp_ssa_name (mask_type, NULL, "max_mask");
          tmp_stmt = vect_gen_while (init_mask, start, end);
          gimple_seq_add_stmt (preheader_seq, tmp_stmt);
        }

      /* Now AND out the bits that are within the number of skipped
         scalars.  */
      poly_uint64 const_skip;
      if (nscalars_skip
          && !(poly_int_tree_p (nscalars_skip, &const_skip)
               && known_le (const_skip, bias)))
        {
          tree unskipped_mask = vect_gen_while_not (preheader_seq, mask_type,
                                                    bias_tree, nscalars_skip);
          if (init_mask)
            init_mask = gimple_build (preheader_seq, BIT_AND_EXPR, mask_type,
                                      init_mask, unskipped_mask);
          else
            init_mask = unskipped_mask;
        }

      if (!init_mask)
        /* First iteration is full.  */
        init_mask = build_minus_one_cst (mask_type);

      /* Get the mask value for the next iteration of the loop.  */
      next_mask = make_temp_ssa_name (mask_type, NULL, "next_mask");
      gcall *call = vect_gen_while (next_mask, test_index, this_test_limit);
      gsi_insert_before (test_gsi, call, GSI_SAME_STMT);

      vect_set_loop_mask (loop, mask, init_mask, next_mask);
    }
  return next_mask;
}

/* Make LOOP iterate NITERS times using masking and WHILE_ULT calls.
   LOOP_VINFO describes the vectorization of LOOP.  NITERS is the
   number of iterations of the original scalar loop that should be
   handled by the vector loop.  NITERS_MAYBE_ZERO and FINAL_IV are
   as for vect_set_loop_condition.

   Insert the branch-back condition before LOOP_COND_GSI and return the
   final gcond.  */

static gcond *
vect_set_loop_condition_masked (class loop *loop, loop_vec_info loop_vinfo,
                                tree niters, tree final_iv,
                                bool niters_maybe_zero,
                                gimple_stmt_iterator loop_cond_gsi)
{
  gimple_seq preheader_seq = NULL;
  gimple_seq header_seq = NULL;

  tree compare_type = LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo);
  unsigned int compare_precision = TYPE_PRECISION (compare_type);
  tree orig_niters = niters;

  /* Type of the initial value of NITERS.  */
  tree ni_actual_type = TREE_TYPE (niters);
  unsigned int ni_actual_precision = TYPE_PRECISION (ni_actual_type);
  tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);

  /* Convert NITERS to the same size as the compare.  */
  if (compare_precision > ni_actual_precision
      && niters_maybe_zero)
    {
      /* We know that there is always at least one iteration, so if the
         count is zero then it must have wrapped.  Cope with this by
         subtracting 1 before the conversion and adding 1 to the result.  */
      gcc_assert (TYPE_UNSIGNED (ni_actual_type));
      niters = gimple_build (&preheader_seq, PLUS_EXPR, ni_actual_type,
                             niters, build_minus_one_cst (ni_actual_type));
      niters = gimple_convert (&preheader_seq, compare_type, niters);
      niters = gimple_build (&preheader_seq, PLUS_EXPR, compare_type,
                             niters, build_one_cst (compare_type));
    }
  else
    niters = gimple_convert (&preheader_seq, compare_type, niters);

  widest_int iv_limit = vect_iv_limit_for_full_masking (loop_vinfo);

  /* Iterate over all the rgroups and fill in their masks.  We could use
     the first mask from any rgroup for the loop condition; here we
     arbitrarily pick the last.  */
  tree test_mask = NULL_TREE;
  rgroup_masks *rgm;
  unsigned int i;
  vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
  FOR_EACH_VEC_ELT (*masks, i, rgm)
    if (!rgm->masks.is_empty ())
      {
        /* First try using permutes.  This adds a single vector
           instruction to the loop for each mask, but needs no extra
           loop invariants or IVs.  */
        unsigned int nmasks = i + 1;
        if ((nmasks & 1) == 0)
          {
            rgroup_masks *half_rgm = &(*masks)[nmasks / 2 - 1];
            if (!half_rgm->masks.is_empty ()
                && vect_maybe_permute_loop_masks (&header_seq, rgm, half_rgm))
              continue;
          }

        /* See whether zero-based IV would ever generate all-false masks
           before wrapping around.  */
        bool might_wrap_p
          = (iv_limit == -1
             || (wi::min_precision (iv_limit * rgm->max_nscalars_per_iter,
                                    UNSIGNED)
                 > compare_precision));

        /* Set up all masks for this group.  */
        test_mask = vect_set_loop_masks_directly (loop, loop_vinfo,
                                                  &preheader_seq,
                                                  loop_cond_gsi, rgm,
                                                  niters, niters_skip,
                                                  might_wrap_p);
      }

  /* Emit all accumulated statements.  */
  add_preheader_seq (loop, preheader_seq);
  add_header_seq (loop, header_seq);

  /* Get a boolean result that tells us whether to iterate.  */
  edge exit_edge = single_exit (loop);
  tree_code code = (exit_edge->flags & EDGE_TRUE_VALUE) ? EQ_EXPR : NE_EXPR;
  tree zero_mask = build_zero_cst (TREE_TYPE (test_mask));
  gcond *cond_stmt = gimple_build_cond (code, test_mask, zero_mask,
                                        NULL_TREE, NULL_TREE);
  gsi_insert_before (&loop_cond_gsi, cond_stmt, GSI_SAME_STMT);

  /* The loop iterates (NITERS - 1) / VF + 1 times.
     Subtract one from this to get the latch count.  */
  tree step = build_int_cst (compare_type,
                             LOOP_VINFO_VECT_FACTOR (loop_vinfo));
  tree niters_minus_one = fold_build2 (PLUS_EXPR, compare_type, niters,
                                       build_minus_one_cst (compare_type));
  loop->nb_iterations = fold_build2 (TRUNC_DIV_EXPR, compare_type,
                                     niters_minus_one, step);

  if (final_iv)
    {
      gassign *assign = gimple_build_assign (final_iv, orig_niters);
      gsi_insert_on_edge_immediate (single_exit (loop), assign);
    }

  return cond_stmt;
}

/* Like vect_set_loop_condition, but handle the case in which there
   are no loop masks.  */

static gcond *
vect_set_loop_condition_unmasked (class loop *loop, tree niters,
                                  tree step, tree final_iv,
                                  bool niters_maybe_zero,
                                  gimple_stmt_iterator loop_cond_gsi)
{
  tree indx_before_incr, indx_after_incr;
  gcond *cond_stmt;
  gcond *orig_cond;
  edge pe = loop_preheader_edge (loop);
  edge exit_edge = single_exit (loop);
  gimple_stmt_iterator incr_gsi;
  bool insert_after;
  enum tree_code code;
  tree niters_type = TREE_TYPE (niters);

  orig_cond = get_loop_exit_condition (loop);
  gcc_assert (orig_cond);
  loop_cond_gsi = gsi_for_stmt (orig_cond);

  tree init, limit;
  if (!niters_maybe_zero && integer_onep (step))
    {
      /* In this case we can use a simple 0-based IV:

         A:
           x = 0;
           do
             {
               ...
               x += 1;
             }
           while (x < NITERS);  */
      code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GE_EXPR : LT_EXPR;
      init = build_zero_cst (niters_type);
      limit = niters;
    }
  else
    {
      /* The following works for all values of NITERS except 0:

         B:
           x = 0;
           do
             {
               ...
               x += STEP;
             }
           while (x <= NITERS - STEP);

         so that the loop continues to iterate if x + STEP - 1 < NITERS
         but stops if x + STEP - 1 >= NITERS.

         However, if NITERS is zero, x never hits a value above NITERS - STEP
         before wrapping around.  There are two obvious ways of dealing with
         this:

         - start at STEP - 1 and compare x before incrementing it
         - start at -1 and compare x after incrementing it

         The latter is simpler and is what we use.  The loop in this case
         looks like:

         C:
           x = -1;
           do
             {
               ...
               x += STEP;
             }
           while (x < NITERS - STEP);

         In both cases the loop limit is NITERS - STEP.  */
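      /* For example, with STEP == 4 and NITERS == 12, both forms compare
         against the limit 12 - 4 == 8 and execute the loop body three
         times, as required.  */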
      gimple_seq seq = NULL;
      limit = force_gimple_operand (niters, &seq, true, NULL_TREE);
      limit = gimple_build (&seq, MINUS_EXPR, TREE_TYPE (limit), limit, step);
      if (seq)
        {
          basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
          gcc_assert (!new_bb);
        }
      if (niters_maybe_zero)
        {
          /* Case C.  */
          code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GE_EXPR : LT_EXPR;
          init = build_all_ones_cst (niters_type);
        }
      else
        {
          /* Case B.  */
          code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GT_EXPR : LE_EXPR;
          init = build_zero_cst (niters_type);
        }
    }

  standard_iv_increment_position (loop, &incr_gsi, &insert_after);
  create_iv (init, step, NULL_TREE, loop,
             &incr_gsi, insert_after, &indx_before_incr, &indx_after_incr);
  indx_after_incr = force_gimple_operand_gsi (&loop_cond_gsi, indx_after_incr,
                                              true, NULL_TREE, true,
                                              GSI_SAME_STMT);
  limit = force_gimple_operand_gsi (&loop_cond_gsi, limit, true, NULL_TREE,
                                    true, GSI_SAME_STMT);

  cond_stmt = gimple_build_cond (code, indx_after_incr, limit, NULL_TREE,
                                 NULL_TREE);

  gsi_insert_before (&loop_cond_gsi, cond_stmt, GSI_SAME_STMT);

  /* Record the number of latch iterations.  */
  if (limit == niters)
    /* Case A: the loop iterates NITERS times.  Subtract one to get the
       latch count.  */
    loop->nb_iterations = fold_build2 (MINUS_EXPR, niters_type, niters,
                                       build_int_cst (niters_type, 1));
  else
    /* Case B or C: the loop iterates (NITERS - STEP) / STEP + 1 times.
       Subtract one from this to get the latch count.  */
    loop->nb_iterations = fold_build2 (TRUNC_DIV_EXPR, niters_type,
                                       limit, step);

  if (final_iv)
    {
      gassign *assign = gimple_build_assign (final_iv, MINUS_EXPR,
                                             indx_after_incr, init);
      gsi_insert_on_edge_immediate (single_exit (loop), assign);
    }

  return cond_stmt;
}

/* If we're using fully-masked loops, make LOOP iterate:

      N == (NITERS - 1) / STEP + 1

   times.  When NITERS is zero, this is equivalent to making the loop
   execute (1 << M) / STEP times, where M is the precision of NITERS.
   NITERS_MAYBE_ZERO is true if this last case might occur.

   If we're not using fully-masked loops, make LOOP iterate:

      N == (NITERS - STEP) / STEP + 1

   times, where NITERS is known to be outside the range [1, STEP - 1].
   This is equivalent to making the loop execute NITERS / STEP times
   when NITERS is nonzero and (1 << M) / STEP times otherwise.
   NITERS_MAYBE_ZERO again indicates whether this last case might occur.

   If FINAL_IV is nonnull, it is an SSA name that should be set to
   N * STEP on exit from the loop.

   Assumption: the exit-condition of LOOP is the last stmt in the loop.  */
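
/* For example, with NITERS == 10 and STEP == 4, a fully-masked loop
   iterates (10 - 1) / 4 + 1 == 3 times, while an unmasked loop iterates
   (10 - 4) / 4 + 1 == 2 times (i.e. 10 / 4, rounded down).  */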
905
906void
99b1c316 907vect_set_loop_condition (class loop *loop, loop_vec_info loop_vinfo,
7cfb4d93
RS
908 tree niters, tree step, tree final_iv,
909 bool niters_maybe_zero)
910{
911 gcond *cond_stmt;
912 gcond *orig_cond = get_loop_exit_condition (loop);
913 gimple_stmt_iterator loop_cond_gsi = gsi_for_stmt (orig_cond);
914
915 if (loop_vinfo && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
916 cond_stmt = vect_set_loop_condition_masked (loop, loop_vinfo, niters,
917 final_iv, niters_maybe_zero,
918 loop_cond_gsi);
919 else
920 cond_stmt = vect_set_loop_condition_unmasked (loop, niters, step,
921 final_iv, niters_maybe_zero,
922 loop_cond_gsi);
923
924 /* Remove old loop exit test. */
b5b56c2a
RS
925 stmt_vec_info orig_cond_info;
926 if (loop_vinfo
927 && (orig_cond_info = loop_vinfo->lookup_stmt (orig_cond)))
928 loop_vinfo->remove_stmt (orig_cond_info);
929 else
930 gsi_remove (&loop_cond_gsi, true);
7cfb4d93
RS
931
932 if (dump_enabled_p ())
3c2a8ed0
DM
933 dump_printf_loc (MSG_NOTE, vect_location, "New loop exit condition: %G",
934 cond_stmt);
ebfd146a
IR
935}
936
5ce9450f
JJ
937/* Helper routine of slpeel_tree_duplicate_loop_to_edge_cfg.
938 For all PHI arguments in FROM->dest and TO->dest from those
939 edges ensure that TO->dest PHI arguments have current_def
940 to that in from. */
941
942static void
943slpeel_duplicate_current_defs_from_edges (edge from, edge to)
944{
945 gimple_stmt_iterator gsi_from, gsi_to;
946
947 for (gsi_from = gsi_start_phis (from->dest),
948 gsi_to = gsi_start_phis (to->dest);
14ba8d6d 949 !gsi_end_p (gsi_from) && !gsi_end_p (gsi_to);)
5ce9450f 950 {
355fe088
TS
951 gimple *from_phi = gsi_stmt (gsi_from);
952 gimple *to_phi = gsi_stmt (gsi_to);
5ce9450f 953 tree from_arg = PHI_ARG_DEF_FROM_EDGE (from_phi, from);
1a5da5b6
RB
954 tree to_arg = PHI_ARG_DEF_FROM_EDGE (to_phi, to);
955 if (virtual_operand_p (from_arg))
956 {
14ba8d6d
RB
957 gsi_next (&gsi_from);
958 continue;
959 }
1a5da5b6
RB
960 if (virtual_operand_p (to_arg))
961 {
14ba8d6d
RB
962 gsi_next (&gsi_to);
963 continue;
964 }
1a5da5b6
RB
965 if (TREE_CODE (from_arg) != SSA_NAME)
966 gcc_assert (operand_equal_p (from_arg, to_arg, 0));
d8564d45
RB
967 else if (TREE_CODE (to_arg) == SSA_NAME
968 && from_arg != to_arg)
1a5da5b6
RB
969 {
970 if (get_current_def (to_arg) == NULL_TREE)
733441e2
RB
971 {
972 gcc_assert (types_compatible_p (TREE_TYPE (to_arg),
973 TREE_TYPE (get_current_def
974 (from_arg))));
975 set_current_def (to_arg, get_current_def (from_arg));
976 }
1a5da5b6 977 }
14ba8d6d
RB
978 gsi_next (&gsi_from);
979 gsi_next (&gsi_to);
5ce9450f 980 }
1a5da5b6
RB
981
982 gphi *from_phi = get_virtual_phi (from->dest);
983 gphi *to_phi = get_virtual_phi (to->dest);
984 if (from_phi)
985 set_current_def (PHI_ARG_DEF_FROM_EDGE (to_phi, to),
986 get_current_def (PHI_ARG_DEF_FROM_EDGE (from_phi, from)));
5ce9450f
JJ
987}
988
ebfd146a 989
b8698a0f 990/* Given LOOP this function generates a new copy of it and puts it
5ce9450f
JJ
991 on E which is either the entry or exit of LOOP. If SCALAR_LOOP is
992 non-NULL, assume LOOP and SCALAR_LOOP are equivalent and copy the
993 basic blocks from SCALAR_LOOP instead of LOOP, but to either the
994 entry or exit of LOOP. */
ebfd146a 995
99b1c316
MS
996class loop *
997slpeel_tree_duplicate_loop_to_edge_cfg (class loop *loop,
998 class loop *scalar_loop, edge e)
ebfd146a 999{
99b1c316 1000 class loop *new_loop;
3884da6f 1001 basic_block *new_bbs, *bbs, *pbbs;
ebfd146a
IR
1002 bool at_exit;
1003 bool was_imm_dom;
b8698a0f 1004 basic_block exit_dest;
ebfd146a 1005 edge exit, new_exit;
a6c51a12 1006 bool duplicate_outer_loop = false;
ebfd146a 1007
2cfc56b9
RB
1008 exit = single_exit (loop);
1009 at_exit = (e == exit);
ebfd146a
IR
1010 if (!at_exit && e != loop_preheader_edge (loop))
1011 return NULL;
1012
5ce9450f
JJ
1013 if (scalar_loop == NULL)
1014 scalar_loop = loop;
1015
1016 bbs = XNEWVEC (basic_block, scalar_loop->num_nodes + 1);
3884da6f
BC
1017 pbbs = bbs + 1;
1018 get_loop_body_with_size (scalar_loop, pbbs, scalar_loop->num_nodes);
a6c51a12
YR
1019 /* Allow duplication of outer loops. */
1020 if (scalar_loop->inner)
1021 duplicate_outer_loop = true;
ebfd146a 1022 /* Check whether duplication is possible. */
3884da6f 1023 if (!can_copy_bbs_p (pbbs, scalar_loop->num_nodes))
ebfd146a
IR
1024 {
1025 free (bbs);
1026 return NULL;
1027 }
1028
1029 /* Generate new loop structure. */
5ce9450f
JJ
1030 new_loop = duplicate_loop (scalar_loop, loop_outer (scalar_loop));
1031 duplicate_subloops (scalar_loop, new_loop);
ebfd146a 1032
2cfc56b9 1033 exit_dest = exit->dest;
b8698a0f
L
1034 was_imm_dom = (get_immediate_dominator (CDI_DOMINATORS,
1035 exit_dest) == loop->header ?
ebfd146a
IR
1036 true : false);
1037
2cfc56b9
RB
1038 /* Also copy the pre-header, this avoids jumping through hoops to
1039 duplicate the loop entry PHI arguments. Create an empty
1040 pre-header unconditionally for this. */
5ce9450f 1041 basic_block preheader = split_edge (loop_preheader_edge (scalar_loop));
2cfc56b9 1042 edge entry_e = single_pred_edge (preheader);
3884da6f 1043 bbs[0] = preheader;
5ce9450f 1044 new_bbs = XNEWVEC (basic_block, scalar_loop->num_nodes + 1);
ebfd146a 1045
5ce9450f
JJ
1046 exit = single_exit (scalar_loop);
1047 copy_bbs (bbs, scalar_loop->num_nodes + 1, new_bbs,
ebfd146a 1048 &exit, 1, &new_exit, NULL,
3884da6f 1049 at_exit ? loop->latch : e->src, true);
5ce9450f 1050 exit = single_exit (loop);
3884da6f 1051 basic_block new_preheader = new_bbs[0];
ebfd146a 1052
5ce9450f
JJ
1053 add_phi_args_after_copy (new_bbs, scalar_loop->num_nodes + 1, NULL);
1054
1055 if (scalar_loop != loop)
1056 {
1057 /* If we copied from SCALAR_LOOP rather than LOOP, SSA_NAMEs from
1058 SCALAR_LOOP will have current_def set to SSA_NAMEs in the new_loop,
1059 but LOOP will not. slpeel_update_phi_nodes_for_guard{1,2} expects
1060 the LOOP SSA_NAMEs (on the exit edge and edge from latch to
1061 header) to have current_def set, so copy them over. */
1062 slpeel_duplicate_current_defs_from_edges (single_exit (scalar_loop),
1063 exit);
1064 slpeel_duplicate_current_defs_from_edges (EDGE_SUCC (scalar_loop->latch,
1065 0),
1066 EDGE_SUCC (loop->latch, 0));
1067 }
b8698a0f 1068
ebfd146a
IR
1069 if (at_exit) /* Add the loop copy at exit. */
1070 {
5ce9450f
JJ
1071 if (scalar_loop != loop)
1072 {
538dd0b7 1073 gphi_iterator gsi;
5ce9450f
JJ
1074 new_exit = redirect_edge_and_branch (new_exit, exit_dest);
1075
1076 for (gsi = gsi_start_phis (exit_dest); !gsi_end_p (gsi);
1077 gsi_next (&gsi))
1078 {
538dd0b7 1079 gphi *phi = gsi.phi ();
5ce9450f
JJ
1080 tree orig_arg = PHI_ARG_DEF_FROM_EDGE (phi, e);
1081 location_t orig_locus
1082 = gimple_phi_arg_location_from_edge (phi, e);
1083
1084 add_phi_arg (phi, orig_arg, new_exit, orig_locus);
1085 }
1086 }
2cfc56b9
RB
1087 redirect_edge_and_branch_force (e, new_preheader);
1088 flush_pending_stmts (e);
1089 set_immediate_dominator (CDI_DOMINATORS, new_preheader, e->src);
a6c51a12 1090 if (was_imm_dom || duplicate_outer_loop)
5ce9450f 1091 set_immediate_dominator (CDI_DOMINATORS, exit_dest, new_exit->src);
2cfc56b9
RB
1092
1093 /* And remove the non-necessary forwarder again. Keep the other
1094 one so we have a proper pre-header for the loop at the exit edge. */
5ce9450f
JJ
1095 redirect_edge_pred (single_succ_edge (preheader),
1096 single_pred (preheader));
2cfc56b9 1097 delete_basic_block (preheader);
5ce9450f
JJ
1098 set_immediate_dominator (CDI_DOMINATORS, scalar_loop->header,
1099 loop_preheader_edge (scalar_loop)->src);
ebfd146a
IR
1100 }
1101 else /* Add the copy at entry. */
1102 {
5ce9450f
JJ
1103 if (scalar_loop != loop)
1104 {
1105 /* Remove the non-necessary forwarder of scalar_loop again. */
1106 redirect_edge_pred (single_succ_edge (preheader),
1107 single_pred (preheader));
1108 delete_basic_block (preheader);
1109 set_immediate_dominator (CDI_DOMINATORS, scalar_loop->header,
1110 loop_preheader_edge (scalar_loop)->src);
1111 preheader = split_edge (loop_preheader_edge (loop));
1112 entry_e = single_pred_edge (preheader);
1113 }
1114
2cfc56b9
RB
1115 redirect_edge_and_branch_force (entry_e, new_preheader);
1116 flush_pending_stmts (entry_e);
1117 set_immediate_dominator (CDI_DOMINATORS, new_preheader, entry_e->src);
1118
1119 redirect_edge_and_branch_force (new_exit, preheader);
1120 flush_pending_stmts (new_exit);
1121 set_immediate_dominator (CDI_DOMINATORS, preheader, new_exit->src);
1122
1123 /* And remove the non-necessary forwarder again. Keep the other
1124 one so we have a proper pre-header for the loop at the exit edge. */
5ce9450f
JJ
1125 redirect_edge_pred (single_succ_edge (new_preheader),
1126 single_pred (new_preheader));
2cfc56b9
RB
1127 delete_basic_block (new_preheader);
1128 set_immediate_dominator (CDI_DOMINATORS, new_loop->header,
1129 loop_preheader_edge (new_loop)->src);
ebfd146a
IR
1130 }
1131
166b8799
BC
1132 /* Skip new preheader since it's deleted if copy loop is added at entry. */
1133 for (unsigned i = (at_exit ? 0 : 1); i < scalar_loop->num_nodes + 1; i++)
a6c51a12 1134 rename_variables_in_bb (new_bbs[i], duplicate_outer_loop);
2cfc56b9 1135
5ce9450f
JJ
1136 if (scalar_loop != loop)
1137 {
1138 /* Update new_loop->header PHIs, so that on the preheader
1139 edge they are the ones from loop rather than scalar_loop. */
538dd0b7 1140 gphi_iterator gsi_orig, gsi_new;
5ce9450f
JJ
1141 edge orig_e = loop_preheader_edge (loop);
1142 edge new_e = loop_preheader_edge (new_loop);
1143
1144 for (gsi_orig = gsi_start_phis (loop->header),
1145 gsi_new = gsi_start_phis (new_loop->header);
1146 !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_new);
1147 gsi_next (&gsi_orig), gsi_next (&gsi_new))
1148 {
538dd0b7
DM
1149 gphi *orig_phi = gsi_orig.phi ();
1150 gphi *new_phi = gsi_new.phi ();
5ce9450f
JJ
1151 tree orig_arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, orig_e);
1152 location_t orig_locus
1153 = gimple_phi_arg_location_from_edge (orig_phi, orig_e);
1154
1155 add_phi_arg (new_phi, orig_arg, new_e, orig_locus);
1156 }
1157 }
1158
ebfd146a
IR
1159 free (new_bbs);
1160 free (bbs);
1161
b2b29377 1162 checking_verify_dominators (CDI_DOMINATORS);
2cfc56b9 1163
ebfd146a
IR
1164 return new_loop;
1165}
1166
1167
a5e3d614
BC
1168/* Given the condition expression COND, put it as the last statement of
1169 GUARD_BB; set both edges' probability; set dominator of GUARD_TO to
1170 DOM_BB; return the skip edge. GUARD_TO is the target basic block to
3e907b90
BC
1171 skip the loop. PROBABILITY is the skip edge's probability. Mark the
1172 new edge as irreducible if IRREDUCIBLE_P is true. */
ebfd146a
IR
1173
1174static edge
86290011 1175slpeel_add_loop_guard (basic_block guard_bb, tree cond,
a5e3d614 1176 basic_block guard_to, basic_block dom_bb,
357067f2 1177 profile_probability probability, bool irreducible_p)
ebfd146a
IR
1178{
1179 gimple_stmt_iterator gsi;
1180 edge new_e, enter_e;
538dd0b7 1181 gcond *cond_stmt;
ebfd146a
IR
1182 gimple_seq gimplify_stmt_list = NULL;
1183
1184 enter_e = EDGE_SUCC (guard_bb, 0);
1185 enter_e->flags &= ~EDGE_FALLTHRU;
1186 enter_e->flags |= EDGE_FALSE_VALUE;
1187 gsi = gsi_last_bb (guard_bb);
1188
f7a06a98
RG
1189 cond = force_gimple_operand_1 (cond, &gimplify_stmt_list, is_gimple_condexpr,
1190 NULL_TREE);
86290011 1191 if (gimplify_stmt_list)
a5e3d614 1192 gsi_insert_seq_after (&gsi, gimplify_stmt_list, GSI_NEW_STMT);
ebfd146a 1193
a5e3d614 1194 cond_stmt = gimple_build_cond_from_tree (cond, NULL_TREE, NULL_TREE);
ebfd146a
IR
1195 gsi = gsi_last_bb (guard_bb);
1196 gsi_insert_after (&gsi, cond_stmt, GSI_NEW_STMT);
1197
1198 /* Add new edge to connect guard block to the merge/loop-exit block. */
a5e3d614 1199 new_e = make_edge (guard_bb, guard_to, EDGE_TRUE_VALUE);
e78410bf 1200
e78410bf 1201 new_e->probability = probability;
3e907b90
BC
1202 if (irreducible_p)
1203 new_e->flags |= EDGE_IRREDUCIBLE_LOOP;
1204
357067f2 1205 enter_e->probability = probability.invert ();
a5e3d614 1206 set_immediate_dominator (CDI_DOMINATORS, guard_to, dom_bb);
5281a167
RB
1207
1208 /* Split enter_e to preserve LOOPS_HAVE_PREHEADERS. */
1209 if (enter_e->dest->loop_father->header == enter_e->dest)
1210 split_edge (enter_e);
1211
ebfd146a
IR
1212 return new_e;
1213}
1214
1215
1216/* This function verifies that the following restrictions apply to LOOP:
a6c51a12
YR
1217 (1) it consists of exactly 2 basic blocks - header, and an empty latch
1218 for innermost loop and 5 basic blocks for outer-loop.
1219 (2) it is single entry, single exit
1220 (3) its exit condition is the last stmt in the header
1221 (4) E is the entry/exit edge of LOOP.
ebfd146a
IR
1222 */
1223
1224bool
99b1c316 1225slpeel_can_duplicate_loop_p (const class loop *loop, const_edge e)
ebfd146a
IR
1226{
1227 edge exit_e = single_exit (loop);
1228 edge entry_e = loop_preheader_edge (loop);
538dd0b7 1229 gcond *orig_cond = get_loop_exit_condition (loop);
ebfd146a 1230 gimple_stmt_iterator loop_exit_gsi = gsi_last_bb (exit_e->src);
a6c51a12 1231 unsigned int num_bb = loop->inner? 5 : 2;
ebfd146a 1232
62fdbf29
BC
1233 /* All loops have an outer scope; the only case loop->outer is NULL is for
1234 the function itself. */
1235 if (!loop_outer (loop)
a6c51a12 1236 || loop->num_nodes != num_bb
ebfd146a
IR
1237 || !empty_block_p (loop->latch)
1238 || !single_exit (loop)
1239 /* Verify that new loop exit condition can be trivially modified. */
1240 || (!orig_cond || orig_cond != gsi_stmt (loop_exit_gsi))
1241 || (e != exit_e && e != entry_e))
1242 return false;
1243
1244 return true;
1245}
1246
a5e3d614
BC
1247/* If the loop has a virtual PHI, but exit bb doesn't, create a virtual PHI
1248 in the exit bb and rename all the uses after the loop. This simplifies
1249 the *guard[12] routines, which assume loop closed SSA form for all PHIs
1250 (but normally loop closed SSA form doesn't require virtual PHIs to be
1251 in the same form). Doing this early simplifies the checking what
e3969868 1252 uses should be renamed.
ebfd146a 1253
e3969868
RS
1254 If we create a new phi after the loop, return the definition that
1255 applies on entry to the loop, otherwise return null. */
1256
1257static tree
99b1c316 1258create_lcssa_for_virtual_phi (class loop *loop)
ebfd146a 1259{
538dd0b7 1260 gphi_iterator gsi;
ebfd146a 1261 edge exit_e = single_exit (loop);
b8698a0f 1262
e20f6b4b 1263 for (gsi = gsi_start_phis (loop->header); !gsi_end_p (gsi); gsi_next (&gsi))
ea057359 1264 if (virtual_operand_p (gimple_phi_result (gsi_stmt (gsi))))
e20f6b4b 1265 {
538dd0b7 1266 gphi *phi = gsi.phi ();
e20f6b4b
JJ
1267 for (gsi = gsi_start_phis (exit_e->dest);
1268 !gsi_end_p (gsi); gsi_next (&gsi))
ea057359 1269 if (virtual_operand_p (gimple_phi_result (gsi_stmt (gsi))))
e20f6b4b
JJ
1270 break;
1271 if (gsi_end_p (gsi))
1272 {
b731b390 1273 tree new_vop = copy_ssa_name (PHI_RESULT (phi));
538dd0b7 1274 gphi *new_phi = create_phi_node (new_vop, exit_e->dest);
e20f6b4b
JJ
1275 tree vop = PHI_ARG_DEF_FROM_EDGE (phi, EDGE_SUCC (loop->latch, 0));
1276 imm_use_iterator imm_iter;
355fe088 1277 gimple *stmt;
e20f6b4b
JJ
1278 use_operand_p use_p;
1279
15b6695a
RB
1280 SSA_NAME_OCCURS_IN_ABNORMAL_PHI (new_vop)
1281 = SSA_NAME_OCCURS_IN_ABNORMAL_PHI (vop);
9e227d60 1282 add_phi_arg (new_phi, vop, exit_e, UNKNOWN_LOCATION);
e20f6b4b
JJ
1283 gimple_phi_set_result (new_phi, new_vop);
1284 FOR_EACH_IMM_USE_STMT (stmt, imm_iter, vop)
b5fd0440
YR
1285 if (stmt != new_phi
1286 && !flow_bb_inside_loop_p (loop, gimple_bb (stmt)))
e20f6b4b
JJ
1287 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
1288 SET_USE (use_p, new_vop);
e3969868
RS
1289
1290 return PHI_ARG_DEF_FROM_EDGE (phi, loop_preheader_edge (loop));
e20f6b4b
JJ
1291 }
1292 break;
1293 }
e3969868 1294 return NULL_TREE;
ebfd146a
IR
1295}
1296
1297/* Function vect_get_loop_location.
1298
1299 Extract the location of the loop in the source code.
1300 If the loop is not well formed for vectorization, an estimated
1301 location is calculated.
1302 Return the loop location if succeed and NULL if not. */
1303
4f5b9c80 1304dump_user_location_t
99b1c316 1305find_loop_location (class loop *loop)
ebfd146a 1306{
355fe088 1307 gimple *stmt = NULL;
ebfd146a
IR
1308 basic_block bb;
1309 gimple_stmt_iterator si;
1310
1311 if (!loop)
4f5b9c80 1312 return dump_user_location_t ();
ebfd146a
IR
1313
1314 stmt = get_loop_exit_condition (loop);
1315
502498d5
JJ
1316 if (stmt
1317 && LOCATION_LOCUS (gimple_location (stmt)) > BUILTINS_LOCATION)
4f5b9c80 1318 return stmt;
ebfd146a
IR
1319
1320 /* If we got here the loop is probably not "well formed",
1321 try to estimate the loop location */
1322
1323 if (!loop->header)
4f5b9c80 1324 return dump_user_location_t ();
ebfd146a
IR
1325
1326 bb = loop->header;
1327
1328 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1329 {
1330 stmt = gsi_stmt (si);
502498d5 1331 if (LOCATION_LOCUS (gimple_location (stmt)) > BUILTINS_LOCATION)
4f5b9c80 1332 return stmt;
ebfd146a
IR
1333 }
1334
4f5b9c80 1335 return dump_user_location_t ();
ebfd146a
IR
1336}
1337
82570274
RS
1338/* Return true if the phi described by STMT_INFO defines an IV of the
1339 loop to be vectorized. */
a5e3d614
BC
1340
1341static bool
82570274 1342iv_phi_p (stmt_vec_info stmt_info)
a5e3d614 1343{
82570274 1344 gphi *phi = as_a <gphi *> (stmt_info->stmt);
a5e3d614
BC
1345 if (virtual_operand_p (PHI_RESULT (phi)))
1346 return false;
1347
a5e3d614
BC
1348 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1349 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
1350 return false;
1351
1352 return true;
1353}
ebfd146a 1354
ebfd146a
IR
1355/* Function vect_can_advance_ivs_p
1356
b8698a0f
L
1357 In case the number of iterations that LOOP iterates is unknown at compile
1358 time, an epilog loop will be generated, and the loop induction variables
1359 (IVs) will be "advanced" to the value they are supposed to take just before
ebfd146a
IR
1360 the epilog loop. Here we check that the access function of the loop IVs
1361 and the expression that represents the loop bound are simple enough.
1362 These restrictions will be relaxed in the future. */
1363
b8698a0f 1364bool
ebfd146a
IR
1365vect_can_advance_ivs_p (loop_vec_info loop_vinfo)
1366{
99b1c316 1367 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
ebfd146a 1368 basic_block bb = loop->header;
538dd0b7 1369 gphi_iterator gsi;
ebfd146a
IR
1370
1371 /* Analyze phi functions of the loop header. */
1372
73fbfcad 1373 if (dump_enabled_p ())
e645e942 1374 dump_printf_loc (MSG_NOTE, vect_location, "vect_can_advance_ivs_p:\n");
ebfd146a
IR
1375 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
1376 {
ebfd146a
IR
1377 tree evolution_part;
1378
a5e3d614 1379 gphi *phi = gsi.phi ();
91987857 1380 stmt_vec_info phi_info = loop_vinfo->lookup_stmt (phi);
73fbfcad 1381 if (dump_enabled_p ())
3c2a8ed0
DM
1382 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
1383 phi_info->stmt);
ebfd146a
IR
1384
1385 /* Skip virtual phi's. The data dependences that are associated with
a5e3d614 1386 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.
ebfd146a 1387
a5e3d614 1388 Skip reduction phis. */
82570274 1389 if (!iv_phi_p (phi_info))
ebfd146a 1390 {
73fbfcad 1391 if (dump_enabled_p ())
a5e3d614
BC
1392 dump_printf_loc (MSG_NOTE, vect_location,
1393 "reduc or virtual phi. skip.\n");
ebfd146a
IR
1394 continue;
1395 }
1396
ebfd146a
IR
1397 /* Analyze the evolution function. */
1398
91987857 1399 evolution_part = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
ebfd146a
IR
1400 if (evolution_part == NULL_TREE)
1401 {
73fbfcad 1402 if (dump_enabled_p ())
afb119be 1403 dump_printf (MSG_MISSED_OPTIMIZATION,
e645e942 1404 "No access function or evolution.\n");
ebfd146a
IR
1405 return false;
1406 }
b8698a0f 1407
2a93954e
AH
1408 /* FORNOW: We do not transform initial conditions of IVs
1409 which evolution functions are not invariants in the loop. */
1410
1411 if (!expr_invariant_in_loop_p (loop, evolution_part))
1412 {
1413 if (dump_enabled_p ())
1414 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1415 "evolution not invariant in loop.\n");
1416 return false;
1417 }
1418
b8698a0f 1419 /* FORNOW: We do not transform initial conditions of IVs
ebfd146a
IR
1420 which evolution functions are a polynomial of degree >= 2. */
1421
1422 if (tree_is_chrec (evolution_part))
2a93954e
AH
1423 {
1424 if (dump_enabled_p ())
1425 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1426 "evolution is chrec.\n");
1427 return false;
1428 }
ebfd146a
IR
1429 }
1430
1431 return true;
1432}
1433
1434
1435/* Function vect_update_ivs_after_vectorizer.
1436
1437 "Advance" the induction variables of LOOP to the value they should take
1438 after the execution of LOOP. This is currently necessary because the
1439 vectorizer does not handle induction variables that are used after the
1440 loop. Such a situation occurs when the last iterations of LOOP are
1441 peeled, because:
1442 1. We introduced new uses after LOOP for IVs that were not originally used
1443 after LOOP: the IVs of LOOP are now used by an epilog loop.
1444 2. LOOP is going to be vectorized; this means that it will iterate N/VF
1445 times, whereas the loop IVs should be bumped N times.
1446
1447 Input:
1448 - LOOP - a loop that is going to be vectorized. The last few iterations
1449 of LOOP were peeled.
1450 - NITERS - the number of iterations that LOOP executes (before it is
1451 vectorized). i.e, the number of times the ivs should be bumped.
1452 - UPDATE_E - a successor edge of LOOP->exit that is on the (only) path
1453 coming out from LOOP on which there are uses of the LOOP ivs
1454 (this is the path from LOOP->exit to epilog_loop->preheader).
1455
1456 The new definitions of the ivs are placed in LOOP->exit.
1457 The phi args associated with the edge UPDATE_E in the bb
1458 UPDATE_E->dest are updated accordingly.
1459
1460 Assumption 1: Like the rest of the vectorizer, this function assumes
1461 a single loop exit that has a single predecessor.
1462
1463 Assumption 2: The phi nodes in the LOOP header and in update_bb are
1464 organized in the same order.
1465
1466 Assumption 3: The access function of the ivs is simple enough (see
1467 vect_can_advance_ivs_p). This assumption will be relaxed in the future.
1468
1469 Assumption 4: Exactly one of the successors of LOOP exit-bb is on a path
b8698a0f 1470 coming out of LOOP on which the ivs of LOOP are used (this is the path
ebfd146a
IR
1471 that leads to the epilog loop; other paths skip the epilog loop). This
1472 path starts with the edge UPDATE_E, and its destination (denoted update_bb)
1473 needs to have its phis updated.
1474 */
1475
1476static void
a5e3d614
BC
1477vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo,
1478 tree niters, edge update_e)
ebfd146a 1479{
538dd0b7 1480 gphi_iterator gsi, gsi1;
99b1c316 1481 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
ebfd146a 1482 basic_block update_bb = update_e->dest;
a5e3d614 1483 basic_block exit_bb = single_exit (loop)->dest;
ebfd146a
IR
1484
1485 /* Make sure there exists a single-predecessor exit bb: */
1486 gcc_assert (single_pred_p (exit_bb));
a5e3d614 1487 gcc_assert (single_succ_edge (exit_bb) == update_e);
ebfd146a
IR
1488
1489 for (gsi = gsi_start_phis (loop->header), gsi1 = gsi_start_phis (update_bb);
1490 !gsi_end_p (gsi) && !gsi_end_p (gsi1);
1491 gsi_next (&gsi), gsi_next (&gsi1))
1492 {
ebfd146a 1493 tree init_expr;
550918ca
RG
1494 tree step_expr, off;
1495 tree type;
ebfd146a
IR
1496 tree var, ni, ni_name;
1497 gimple_stmt_iterator last_gsi;
1498
a5e3d614
BC
1499 gphi *phi = gsi.phi ();
1500 gphi *phi1 = gsi1.phi ();
91987857 1501 stmt_vec_info phi_info = loop_vinfo->lookup_stmt (phi);
73fbfcad 1502 if (dump_enabled_p ())
3c2a8ed0
DM
1503 dump_printf_loc (MSG_NOTE, vect_location,
1504 "vect_update_ivs_after_vectorizer: phi: %G", phi);
ebfd146a 1505
a5e3d614 1506 /* Skip reduction and virtual phis. */
82570274 1507 if (!iv_phi_p (phi_info))
ebfd146a 1508 {
73fbfcad 1509 if (dump_enabled_p ())
a5e3d614
BC
1510 dump_printf_loc (MSG_NOTE, vect_location,
1511 "reduc or virtual phi. skip.\n");
ebfd146a
IR
1512 continue;
1513 }
1514
0ac168a1 1515 type = TREE_TYPE (gimple_phi_result (phi));
91987857 1516 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
0ac168a1 1517 step_expr = unshare_expr (step_expr);
b8698a0f 1518
ebfd146a
IR
1519 /* FORNOW: We do not support IVs whose evolution function is a polynomial
1520 of degree >= 2 or exponential. */
0ac168a1 1521 gcc_assert (!tree_is_chrec (step_expr));
ebfd146a 1522
0ac168a1 1523 init_expr = PHI_ARG_DEF_FROM_EDGE (phi, loop_preheader_edge (loop));
ebfd146a 1524
550918ca
RG
1525 off = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
1526 fold_convert (TREE_TYPE (step_expr), niters),
1527 step_expr);
0ac168a1 1528 if (POINTER_TYPE_P (type))
5d49b6a7 1529 ni = fold_build_pointer_plus (init_expr, off);
ebfd146a 1530 else
0ac168a1
RG
1531 ni = fold_build2 (PLUS_EXPR, type,
1532 init_expr, fold_convert (type, off));
ebfd146a 1533
0ac168a1 1534 var = create_tmp_var (type, "tmp");
ebfd146a
IR
1535
1536 last_gsi = gsi_last_bb (exit_bb);
a5e3d614
BC
1537 gimple_seq new_stmts = NULL;
1538 ni_name = force_gimple_operand (ni, &new_stmts, false, var);
1539 /* Exit_bb shouldn't be empty. */
1540 if (!gsi_end_p (last_gsi))
1541 gsi_insert_seq_after (&last_gsi, new_stmts, GSI_SAME_STMT);
1542 else
1543 gsi_insert_seq_before (&last_gsi, new_stmts, GSI_SAME_STMT);
b8698a0f 1544
ebfd146a 1545 /* Fix phi expressions in the successor bb. */
684f25f4 1546 adjust_phi_and_debug_stmts (phi1, update_e, ni_name);
ebfd146a
IR
1547 }
1548}
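/* A worked example with values assumed for illustration: for an IV with
   init_expr = 0 and step_expr = 1, and NITERS = 96 scalar iterations
   handled by the vector loop, the expression built in exit_bb is
   ni = 0 + 96 * 1 = 96, and the epilog's PHI argument on UPDATE_E is
   replaced by the SSA name holding that value, so the epilog resumes the
   IV where the vector loop stopped.  */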
1549
535e7c11
RS
1550/* Return a gimple value containing the misalignment (measured in vector
1551 elements) for the loop described by LOOP_VINFO, i.e. how many elements
1552 it is away from a perfectly aligned address. Add any new statements
1553 to SEQ. */
1554
1555static tree
1556get_misalign_in_elems (gimple **seq, loop_vec_info loop_vinfo)
1557{
1e5e6ff5 1558 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
89fa689a 1559 stmt_vec_info stmt_info = dr_info->stmt;
535e7c11
RS
1560 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1561
ca31798e
AV
1562 poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr_info);
1563 unsigned HOST_WIDE_INT target_align_c;
1564 tree target_align_minus_1;
535e7c11 1565
89fa689a
RS
1566 bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr),
1567 size_zero_node) < 0;
535e7c11
RS
1568 tree offset = (negative
1569 ? size_int (-TYPE_VECTOR_SUBPARTS (vectype) + 1)
1570 : size_zero_node);
78e02b3b 1571 tree start_addr = vect_create_addr_base_for_vector_ref (stmt_info, seq,
535e7c11
RS
1572 offset);
1573 tree type = unsigned_type_for (TREE_TYPE (start_addr));
ca31798e
AV
1574 if (target_align.is_constant (&target_align_c))
1575 target_align_minus_1 = build_int_cst (type, target_align_c - 1);
1576 else
1577 {
1578 tree vla = build_int_cst (type, target_align);
1579 tree vla_align = fold_build2 (BIT_AND_EXPR, type, vla,
1580 fold_build2 (MINUS_EXPR, type,
1581 build_int_cst (type, 0), vla));
1582 target_align_minus_1 = fold_build2 (MINUS_EXPR, type, vla_align,
1583 build_int_cst (type, 1));
1584 }
1585
535e7c11
RS
1586 HOST_WIDE_INT elem_size
1587 = int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1588 tree elem_size_log = build_int_cst (type, exact_log2 (elem_size));
1589
1590 /* Create: misalign_in_bytes = addr & (target_align - 1). */
1591 tree int_start_addr = fold_convert (type, start_addr);
1592 tree misalign_in_bytes = fold_build2 (BIT_AND_EXPR, type, int_start_addr,
1593 target_align_minus_1);
1594
1595 /* Create: misalign_in_elems = misalign_in_bytes / element_size. */
1596 tree misalign_in_elems = fold_build2 (RSHIFT_EXPR, type, misalign_in_bytes,
1597 elem_size_log);
1598
1599 return misalign_in_elems;
1600}
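/* A worked example with values assumed for illustration: with a 16-byte
   target alignment, 4-byte elements and a start address for which
   addr % 16 == 12, the computation above gives
     misalign_in_bytes = addr & 15 = 12
     misalign_in_elems = 12 >> log2(4) = 3
   i.e. the access is three elements past the last aligned boundary.  */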
1601
a5e3d614 1602/* Function vect_gen_prolog_loop_niters
d9157f15 1603
a5e3d614
BC
1604 Generate the number of iterations which should be peeled as prolog for the
1605 loop represented by LOOP_VINFO. It is calculated as the misalignment of
1606 DR - the data reference recorded in LOOP_VINFO_UNALIGNED_DR (LOOP_VINFO).
1607 As a result, after the execution of this loop, the data reference DR will
1608 refer to an aligned location. The following computation is generated:
ebfd146a
IR
1609
1610 If the misalignment of DR is known at compile time:
1611 addr_mis = int mis = DR_MISALIGNMENT (dr);
1612 Else, compute address misalignment in bytes:
535e7c11 1613 addr_mis = addr & (target_align - 1)
ebfd146a 1614
a5e3d614 1615 prolog_niters = ((VF - addr_mis/elem_size)&(VF-1))/step
ebfd146a
IR
1616
1617 (elem_size = element type size; an element is the scalar element whose type
1618 is the inner type of the vectype)
1619
a5e3d614 1620 The computations will be emitted at the end of BB. We also compute and
cbb22e61 1621 store the upper bound (inclusive) of the result in BOUND.
a5e3d614 1622
ebfd146a
IR
1623 When the step of the data-ref in the loop is not 1 (as in interleaved data
1624 and SLP), the number of iterations of the prolog must be divided by the step
1625 (which is equal to the size of the interleaved group).
1626
1627 The above formulas assume that VF == number of elements in the vector. This
1628 may not hold when there are multiple-types in the loop.
1629 In this case, for some data-references in the loop the VF does not represent
1630 the number of elements that fit in the vector. Therefore, instead of VF we
1631 use TYPE_VECTOR_SUBPARTS. */
1632
b8698a0f 1633static tree
a5e3d614
BC
1634vect_gen_prolog_loop_niters (loop_vec_info loop_vinfo,
1635 basic_block bb, int *bound)
ebfd146a 1636{
1e5e6ff5 1637 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
ebfd146a 1638 tree var;
a5e3d614
BC
1639 tree niters_type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
1640 gimple_seq stmts = NULL, new_stmts = NULL;
ebfd146a 1641 tree iters, iters_name;
89fa689a 1642 stmt_vec_info stmt_info = dr_info->stmt;
ebfd146a 1643 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
ca31798e 1644 poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr_info);
ebfd146a 1645
15e693cc 1646 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
ebfd146a 1647 {
15e693cc 1648 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
ebfd146a 1649
73fbfcad 1650 if (dump_enabled_p ())
ccb3ad87 1651 dump_printf_loc (MSG_NOTE, vect_location,
e645e942 1652 "known peeling = %d.\n", npeel);
ebfd146a 1653
720f5239 1654 iters = build_int_cst (niters_type, npeel);
cbb22e61 1655 *bound = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
ebfd146a
IR
1656 }
1657 else
1658 {
535e7c11
RS
1659 tree misalign_in_elems = get_misalign_in_elems (&stmts, loop_vinfo);
1660 tree type = TREE_TYPE (misalign_in_elems);
f702e7d4
RS
1661 HOST_WIDE_INT elem_size
1662 = int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
ca31798e
AV
1663 /* We only do prolog peeling if the target alignment is known at compile
1664 time. */
1665 poly_uint64 align_in_elems =
1666 exact_div (target_align, elem_size);
1667 tree align_in_elems_minus_1 =
1668 build_int_cst (type, align_in_elems - 1);
f702e7d4 1669 tree align_in_elems_tree = build_int_cst (type, align_in_elems);
f702e7d4
RS
1670
1671 /* Create: (niters_type) ((align_in_elems - misalign_in_elems)
1672 & (align_in_elems - 1)). */
89fa689a
RS
1673 bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr),
1674 size_zero_node) < 0;
d8ba5b19 1675 if (negative)
f702e7d4
RS
1676 iters = fold_build2 (MINUS_EXPR, type, misalign_in_elems,
1677 align_in_elems_tree);
d8ba5b19 1678 else
f702e7d4
RS
1679 iters = fold_build2 (MINUS_EXPR, type, align_in_elems_tree,
1680 misalign_in_elems);
1681 iters = fold_build2 (BIT_AND_EXPR, type, iters, align_in_elems_minus_1);
ebfd146a 1682 iters = fold_convert (niters_type, iters);
ca31798e
AV
1683 unsigned HOST_WIDE_INT align_in_elems_c;
1684 if (align_in_elems.is_constant (&align_in_elems_c))
1685 *bound = align_in_elems_c - 1;
1686 else
1687 *bound = -1;
ebfd146a
IR
1688 }
1689
73fbfcad 1690 if (dump_enabled_p ())
3c2a8ed0
DM
1691 dump_printf_loc (MSG_NOTE, vect_location,
1692 "niters for prolog loop: %T\n", iters);
ebfd146a
IR
1693
1694 var = create_tmp_var (niters_type, "prolog_loop_niters");
a5e3d614 1695 iters_name = force_gimple_operand (iters, &new_stmts, false, var);
ebfd146a 1696
a5e3d614
BC
1697 if (new_stmts)
1698 gimple_seq_add_seq (&stmts, new_stmts);
ebfd146a
IR
1699 if (stmts)
1700 {
a5e3d614
BC
1701 gcc_assert (single_succ_p (bb));
1702 gimple_stmt_iterator gsi = gsi_last_bb (bb);
1703 if (gsi_end_p (gsi))
1704 gsi_insert_seq_before (&gsi, stmts, GSI_SAME_STMT);
1705 else
1706 gsi_insert_seq_after (&gsi, stmts, GSI_SAME_STMT);
ebfd146a 1707 }
b8698a0f 1708 return iters_name;
ebfd146a
IR
1709}
1710
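/* A worked example with values assumed for illustration: with a 16-byte
   target alignment, 4-byte elements and step 1, align_in_elems is
   16 / 4 = 4.  If the runtime misalignment is 3 elements, the generated
   expression evaluates to
     prolog_niters = (4 - 3) & (4 - 1) = 1
   so one scalar iteration is peeled before the access becomes aligned;
   a misalignment of 0 gives (4 - 0) & 3 = 0 and no peeling.  */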
1711
1712/* Function vect_update_init_of_dr
1713
535e7c11
RS
1714 If CODE is PLUS, the vector loop starts NITERS iterations after the
1715 scalar one, otherwise CODE is MINUS and the vector loop starts NITERS
1716 iterations before the scalar one (using masking to skip inactive
1717 elements). This function updates the information recorded in DR to
1718 account for the difference. Specifically, it updates the OFFSET
1719 field of DR. */
ebfd146a
IR
1720
1721static void
535e7c11 1722vect_update_init_of_dr (struct data_reference *dr, tree niters, tree_code code)
ebfd146a
IR
1723{
1724 tree offset = DR_OFFSET (dr);
b8698a0f 1725
ebfd146a
IR
1726 niters = fold_build2 (MULT_EXPR, sizetype,
1727 fold_convert (sizetype, niters),
1728 fold_convert (sizetype, DR_STEP (dr)));
535e7c11 1729 offset = fold_build2 (code, sizetype,
587aa063 1730 fold_convert (sizetype, offset), niters);
ebfd146a
IR
1731 DR_OFFSET (dr) = offset;
1732}
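/* A worked example with values assumed for illustration: if NITERS is 2
   (two peeled prolog iterations) and DR_STEP is 8 bytes, the offset of DR
   is advanced by 2 * 8 = 16 bytes when CODE is PLUS, or moved back by
   16 bytes when CODE is MINUS (the masked-peeling case).  */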
1733
1734
1735/* Function vect_update_inits_of_drs
1736
535e7c11
RS
1737 Apply vect_update_init_of_dr to all accesses in LOOP_VINFO.
1738 CODE and NITERS are as for vect_update_init_of_dr. */
ebfd146a 1739
97c14603 1740void
535e7c11
RS
1741vect_update_inits_of_drs (loop_vec_info loop_vinfo, tree niters,
1742 tree_code code)
ebfd146a
IR
1743{
1744 unsigned int i;
9771b263 1745 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
ebfd146a 1746 struct data_reference *dr;
a5e3d614 1747
adac3a68 1748 DUMP_VECT_SCOPE ("vect_update_inits_of_dr");
a5e3d614 1749
97c14603
AV
1750 /* Adjust niters to sizetype. We used to insert the stmts on loop preheader
1751 here, but since we might use these niters to update the epilogues niters
1752 and data references we can't insert them here as this definition might not
1753 always dominate its uses. */
a5e3d614 1754 if (!types_compatible_p (sizetype, TREE_TYPE (niters)))
97c14603 1755 niters = fold_convert (sizetype, niters);
ebfd146a 1756
9771b263 1757 FOR_EACH_VEC_ELT (datarefs, i, dr)
5fa23466 1758 {
f5ae2856
RS
1759 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1760 if (!STMT_VINFO_GATHER_SCATTER_P (dr_info->stmt))
5fa23466
RB
1761 vect_update_init_of_dr (dr, niters, code);
1762 }
ebfd146a
IR
1763}
1764
535e7c11
RS
1765/* For the information recorded in LOOP_VINFO prepare the loop for peeling
1766 by masking. This involves calculating the number of iterations to
1767 be peeled and then aligning all memory references appropriately. */
1768
1769void
1770vect_prepare_for_masked_peels (loop_vec_info loop_vinfo)
1771{
1772 tree misalign_in_elems;
1773 tree type = LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo);
1774
1775 gcc_assert (vect_use_loop_mask_for_alignment_p (loop_vinfo));
1776
1777 /* From the information recorded in LOOP_VINFO get the number of iterations
1778 that need to be skipped via masking. */
1779 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
1780 {
1781 poly_int64 misalign = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
1782 - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo));
1783 misalign_in_elems = build_int_cst (type, misalign);
1784 }
1785 else
1786 {
1787 gimple_seq seq1 = NULL, seq2 = NULL;
1788 misalign_in_elems = get_misalign_in_elems (&seq1, loop_vinfo);
1789 misalign_in_elems = fold_convert (type, misalign_in_elems);
1790 misalign_in_elems = force_gimple_operand (misalign_in_elems,
1791 &seq2, true, NULL_TREE);
1792 gimple_seq_add_seq (&seq1, seq2);
1793 if (seq1)
1794 {
1795 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1796 basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq1);
1797 gcc_assert (!new_bb);
1798 }
1799 }
1800
1801 if (dump_enabled_p ())
3c2a8ed0
DM
1802 dump_printf_loc (MSG_NOTE, vect_location,
1803 "misalignment for fully-masked loop: %T\n",
1804 misalign_in_elems);
535e7c11
RS
1805
1806 LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo) = misalign_in_elems;
1807
1808 vect_update_inits_of_drs (loop_vinfo, misalign_in_elems, MINUS_EXPR);
1809}
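/* A worked example with values assumed for illustration: with VF = 8 and
   PEELING_FOR_ALIGNMENT = 3, the number of iterations to skip by masking
   is 8 - 3 = 5; MASK_SKIP_NITERS is set to 5 and every data reference is
   moved back by 5 iterations, so the first (masked) vector iteration
   activates only its last 3 lanes, i.e. exactly the iterations a scalar
   prolog would otherwise have peeled.  */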
ebfd146a 1810
a5e3d614 1811/* This function builds ni_name = number of iterations. Statements
7078979b
BC
1812 are emitted on the loop preheader edge. If NEW_VAR_P is not NULL, set
1813 it to TRUE if a new ssa_var is generated. */
a5e3d614
BC
1814
1815tree
7078979b 1816vect_build_loop_niters (loop_vec_info loop_vinfo, bool *new_var_p)
a5e3d614
BC
1817{
1818 tree ni = unshare_expr (LOOP_VINFO_NITERS (loop_vinfo));
1819 if (TREE_CODE (ni) == INTEGER_CST)
1820 return ni;
1821 else
1822 {
1823 tree ni_name, var;
1824 gimple_seq stmts = NULL;
1825 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1826
1827 var = create_tmp_var (TREE_TYPE (ni), "niters");
1828 ni_name = force_gimple_operand (ni, &stmts, false, var);
1829 if (stmts)
7078979b
BC
1830 {
1831 gsi_insert_seq_on_edge_immediate (pe, stmts);
1832 if (new_var_p != NULL)
1833 *new_var_p = true;
1834 }
a5e3d614
BC
1835
1836 return ni_name;
1837 }
1838}
1839
cbb22e61
BC
1840/* Calculate the number of iterations above which the vectorized loop will be
1841 preferred over the scalar loop. NITERS_PROLOG is the number of iterations
1842 of the prolog loop. If it's an integer constant, the value is also passed
d1d20a49
RS
1843 in INT_NITERS_PROLOG. BOUND_PROLOG is the upper bound (inclusive) of the
1844 number of iterations of the prolog loop. BOUND_EPILOG is the corresponding
1845 value for the epilog loop. If CHECK_PROFITABILITY is true, TH is the
1846 threshold below which the scalar (rather than vectorized) loop will be
1847 executed. This function stores the upper bound (inclusive) of the result
1848 in BOUND_SCALAR. */
a5e3d614
BC
1849
1850static tree
1851vect_gen_scalar_loop_niters (tree niters_prolog, int int_niters_prolog,
d1d20a49 1852 int bound_prolog, poly_int64 bound_epilog, int th,
d9f21f6a
RS
1853 poly_uint64 *bound_scalar,
1854 bool check_profitability)
a5e3d614
BC
1855{
1856 tree type = TREE_TYPE (niters_prolog);
1857 tree niters = fold_build2 (PLUS_EXPR, type, niters_prolog,
d1d20a49 1858 build_int_cst (type, bound_epilog));
a5e3d614 1859
d1d20a49 1860 *bound_scalar = bound_prolog + bound_epilog;
a5e3d614
BC
1861 if (check_profitability)
1862 {
cbb22e61
BC
1863 /* TH indicates the minimum niters of vectorized loop, while we
1864 compute the maximum niters of scalar loop. */
1865 th--;
a5e3d614
BC
1866 /* Peeling for constant times. */
1867 if (int_niters_prolog >= 0)
1868 {
d1d20a49 1869 *bound_scalar = upper_bound (int_niters_prolog + bound_epilog, th);
cbb22e61 1870 return build_int_cst (type, *bound_scalar);
a5e3d614 1871 }
d1d20a49
RS
1872 /* Peeling an unknown number of times. Note that both BOUND_PROLOG
1873 and BOUND_EPILOG are inclusive upper bounds. */
1874 if (known_ge (th, bound_prolog + bound_epilog))
a5e3d614 1875 {
cbb22e61 1876 *bound_scalar = th;
a5e3d614
BC
1877 return build_int_cst (type, th);
1878 }
d9f21f6a 1879 /* Need to do runtime comparison. */
d1d20a49 1880 else if (maybe_gt (th, bound_epilog))
d9f21f6a
RS
1881 {
1882 *bound_scalar = upper_bound (*bound_scalar, th);
1883 return fold_build2 (MAX_EXPR, type,
1884 build_int_cst (type, th), niters);
1885 }
a5e3d614
BC
1886 }
1887 return niters;
1888}
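/* A worked example with values assumed for illustration: with
   BOUND_PROLOG = 3, BOUND_EPILOG = 3, CHECK_PROFITABILITY true and
   TH = 10, TH is first decremented to 9; if the prolog count is not a
   compile-time constant, 9 >= 3 + 3 holds, so the scalar path can never
   need more than 9 iterations: *BOUND_SCALAR is set to 9 and the constant
   9 is returned.  Without profitability checking the result is simply
   niters_prolog + BOUND_EPILOG.  */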
1889
0f26839a
RS
1890/* NITERS is the number of times that the original scalar loop executes
1891 after peeling. Work out the maximum number of iterations N that can
1892 be handled by the vectorized form of the loop and then either:
ebfd146a 1893
0f26839a 1894 a) set *STEP_VECTOR_PTR to the vectorization factor and generate:
d9157f15 1895
0f26839a
RS
1896 niters_vector = N
1897
1898 b) set *STEP_VECTOR_PTR to one and generate:
1899
1900 niters_vector = N / vf
1901
1902 In both cases, store niters_vector in *NITERS_VECTOR_PTR and add
1903 any new statements on the loop preheader edge. NITERS_NO_OVERFLOW
1904 is true if NITERS doesn't overflow (i.e. if NITERS is always nonzero). */
ebfd146a
IR
1905
1906void
a5e3d614 1907vect_gen_vector_loop_niters (loop_vec_info loop_vinfo, tree niters,
0f26839a
RS
1908 tree *niters_vector_ptr, tree *step_vector_ptr,
1909 bool niters_no_overflow)
ebfd146a 1910{
a5e3d614 1911 tree ni_minus_gap, var;
0f26839a 1912 tree niters_vector, step_vector, type = TREE_TYPE (niters);
d9f21f6a 1913 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
a5e3d614 1914 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
0f26839a 1915 tree log_vf = NULL_TREE;
a5e3d614
BC
1916
1917 /* If epilogue loop is required because of data accesses with gaps, we
1918 subtract one iteration from the total number of iterations here for
1919 correct calculation of RATIO. */
1920 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
1921 {
7078979b
BC
1922 ni_minus_gap = fold_build2 (MINUS_EXPR, type, niters,
1923 build_one_cst (type));
a5e3d614
BC
1924 if (!is_gimple_val (ni_minus_gap))
1925 {
7078979b 1926 var = create_tmp_var (type, "ni_gap");
a5e3d614
BC
1927 gimple *stmts = NULL;
1928 ni_minus_gap = force_gimple_operand (ni_minus_gap, &stmts,
1929 true, var);
1930 gsi_insert_seq_on_edge_immediate (pe, stmts);
1931 }
1932 }
1933 else
1934 ni_minus_gap = niters;
1935
d9f21f6a 1936 unsigned HOST_WIDE_INT const_vf;
7cfb4d93
RS
1937 if (vf.is_constant (&const_vf)
1938 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
0f26839a
RS
1939 {
1940 /* Create: niters >> log2(vf) */
1941 /* If it's known that niters == number of latch executions + 1 doesn't
1942 overflow, we can generate niters >> log2(vf); otherwise we generate
1943 (niters - vf) >> log2(vf) + 1 by using the fact that we know ratio
1944 will be at least one. */
d9f21f6a 1945 log_vf = build_int_cst (type, exact_log2 (const_vf));
0f26839a
RS
1946 if (niters_no_overflow)
1947 niters_vector = fold_build2 (RSHIFT_EXPR, type, ni_minus_gap, log_vf);
1948 else
1949 niters_vector
1950 = fold_build2 (PLUS_EXPR, type,
1951 fold_build2 (RSHIFT_EXPR, type,
1952 fold_build2 (MINUS_EXPR, type,
1953 ni_minus_gap,
1954 build_int_cst (type, vf)),
1955 log_vf),
1956 build_int_cst (type, 1));
1957 step_vector = build_one_cst (type);
1958 }
a5e3d614 1959 else
0f26839a
RS
1960 {
1961 niters_vector = ni_minus_gap;
1962 step_vector = build_int_cst (type, vf);
1963 }
a5e3d614
BC
1964
1965 if (!is_gimple_val (niters_vector))
1966 {
7078979b
BC
1967 var = create_tmp_var (type, "bnd");
1968 gimple_seq stmts = NULL;
a5e3d614
BC
1969 niters_vector = force_gimple_operand (niters_vector, &stmts, true, var);
1970 gsi_insert_seq_on_edge_immediate (pe, stmts);
7078979b
BC
1971 /* Peeling algorithm guarantees that vector loop bound is at least ONE,
1972 we set range information to make niters analyzer's life easier. */
0f26839a 1973 if (stmts != NULL && log_vf)
8e6cdc90
RS
1974 set_range_info (niters_vector, VR_RANGE,
1975 wi::to_wide (build_int_cst (type, 1)),
1976 wi::to_wide (fold_build2 (RSHIFT_EXPR, type,
1977 TYPE_MAX_VALUE (type),
1978 log_vf)));
a5e3d614
BC
1979 }
1980 *niters_vector_ptr = niters_vector;
0f26839a 1981 *step_vector_ptr = step_vector;
a5e3d614
BC
1982
1983 return;
1984}
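/* A worked example with values assumed for illustration: with a constant
   VF = 4, no peeling for gaps and NITERS = 10 known not to overflow, the
   code above generates niters_vector = 10 >> log2(4) = 2, i.e. two vector
   iterations covering 8 scalar ones, with STEP_VECTOR = 1.  If overflow
   cannot be ruled out, the equivalent form ((10 - 4) >> 2) + 1 = 2 is used
   instead, relying on the guarantee that at least one vector iteration
   executes.  */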
1985
1986/* Given NITERS_VECTOR which is the number of iterations for vectorized
1987 loop specified by LOOP_VINFO after vectorization, compute the number
1988 of iterations before vectorization (niters_vector * vf) and store it
1989 to NITERS_VECTOR_MULT_VF_PTR. */
1990
1991static void
1992vect_gen_vector_loop_niters_mult_vf (loop_vec_info loop_vinfo,
1993 tree niters_vector,
1994 tree *niters_vector_mult_vf_ptr)
1995{
d9f21f6a
RS
1996 /* We should be using a step_vector of VF if VF is variable. */
1997 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo).to_constant ();
99b1c316 1998 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
a5e3d614
BC
1999 tree type = TREE_TYPE (niters_vector);
2000 tree log_vf = build_int_cst (type, exact_log2 (vf));
2001 basic_block exit_bb = single_exit (loop)->dest;
ebfd146a 2002
a5e3d614
BC
2003 gcc_assert (niters_vector_mult_vf_ptr != NULL);
2004 tree niters_vector_mult_vf = fold_build2 (LSHIFT_EXPR, type,
2005 niters_vector, log_vf);
2006 if (!is_gimple_val (niters_vector_mult_vf))
2007 {
2008 tree var = create_tmp_var (type, "niters_vector_mult_vf");
2009 gimple_seq stmts = NULL;
2010 niters_vector_mult_vf = force_gimple_operand (niters_vector_mult_vf,
2011 &stmts, true, var);
2012 gimple_stmt_iterator gsi = gsi_start_bb (exit_bb);
2013 gsi_insert_seq_before (&gsi, stmts, GSI_SAME_STMT);
2014 }
2015 *niters_vector_mult_vf_ptr = niters_vector_mult_vf;
2016}
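/* A worked example with values assumed for illustration: with
   NITERS_VECTOR = 2 and VF = 4, the value computed in exit_bb is
   2 << log2(4) = 8, the number of scalar iterations consumed by the
   vector loop; vect_update_ivs_after_vectorizer later uses this value to
   advance the IVs for the epilog.  */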
2017
b81f2daf
RB
2018/* LCSSA_PHI is a lcssa phi of EPILOG loop which is copied from LOOP,
2019 this function searches for the corresponding lcssa phi node in exit
2020 bb of LOOP. If it is found, return the phi result; otherwise return
2021 NULL. */
2022
2023static tree
2024find_guard_arg (class loop *loop, class loop *epilog ATTRIBUTE_UNUSED,
2025 gphi *lcssa_phi)
2026{
2027 gphi_iterator gsi;
2028 edge e = single_exit (loop);
2029
2030 gcc_assert (single_pred_p (e->dest));
2031 for (gsi = gsi_start_phis (e->dest); !gsi_end_p (gsi); gsi_next (&gsi))
2032 {
2033 gphi *phi = gsi.phi ();
2034 if (operand_equal_p (PHI_ARG_DEF (phi, 0),
2035 PHI_ARG_DEF (lcssa_phi, 0), 0))
2036 return PHI_RESULT (phi);
2037 }
2038 return NULL_TREE;
2039}
2040
a5e3d614
BC
2041/* Function slpeel_tree_duplicate_loop_to_edge_cfg duplicates FIRST/SECOND
2042 from SECOND/FIRST and puts it at the original loop's preheader/exit
2043 edge; the two loops are arranged as below:
2044
2045 preheader_a:
2046 first_loop:
2047 header_a:
2048 i_1 = PHI<i_0, i_2>;
2049 ...
2050 i_2 = i_1 + 1;
2051 if (cond_a)
2052 goto latch_a;
2053 else
2054 goto between_bb;
2055 latch_a:
2056 goto header_a;
2057
2058 between_bb:
2059 ;; i_x = PHI<i_2>; ;; LCSSA phi node to be created for FIRST,
2060
2061 second_loop:
2062 header_b:
2063 i_3 = PHI<i_0, i_4>; ;; Use of i_0 to be replaced with i_x,
2064 or with i_2 if no LCSSA phi is created
2065 under condition of CREATE_LCSSA_FOR_IV_PHIS.
2066 ...
2067 i_4 = i_3 + 1;
2068 if (cond_b)
2069 goto latch_b;
2070 else
2071 goto exit_bb;
2072 latch_b:
2073 goto header_b;
2074
2075 exit_bb:
2076
2077 This function creates loop-closed SSA for the first loop and updates the
2078 second loop's PHI nodes by replacing the argument on the incoming edge
2079 with the result of the newly created lcssa PHI nodes. If
2080 CREATE_LCSSA_FOR_IV_PHIS is false, loop-closed SSA PHIs will only be
2081 created for non-IV PHIs of the first loop.
2082
2083 This function assumes the exit bb of the first loop is the preheader bb
2084 of the second loop, i.e., between_bb in the example code. With the PHIs
2085 updated, the second loop executes the remaining iterations of the first. */
ebfd146a 2086
a5e3d614
BC
2087static void
2088slpeel_update_phi_nodes_for_loops (loop_vec_info loop_vinfo,
99b1c316 2089 class loop *first, class loop *second,
a5e3d614
BC
2090 bool create_lcssa_for_iv_phis)
2091{
2092 gphi_iterator gsi_update, gsi_orig;
99b1c316 2093 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
a5e3d614
BC
2094
2095 edge first_latch_e = EDGE_SUCC (first->latch, 0);
2096 edge second_preheader_e = loop_preheader_edge (second);
2097 basic_block between_bb = single_exit (first)->dest;
2098
2099 gcc_assert (between_bb == second_preheader_e->src);
2100 gcc_assert (single_pred_p (between_bb) && single_succ_p (between_bb));
2101 /* Either the first loop or the second is the loop to be vectorized. */
2102 gcc_assert (loop == first || loop == second);
2103
2104 for (gsi_orig = gsi_start_phis (first->header),
2105 gsi_update = gsi_start_phis (second->header);
2106 !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_update);
2107 gsi_next (&gsi_orig), gsi_next (&gsi_update))
2108 {
2109 gphi *orig_phi = gsi_orig.phi ();
2110 gphi *update_phi = gsi_update.phi ();
2111
2112 tree arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, first_latch_e);
2113 /* Generate lcssa PHI node for the first loop. */
2114 gphi *vect_phi = (loop == first) ? orig_phi : update_phi;
82570274
RS
2115 stmt_vec_info vect_phi_info = loop_vinfo->lookup_stmt (vect_phi);
2116 if (create_lcssa_for_iv_phis || !iv_phi_p (vect_phi_info))
a5e3d614
BC
2117 {
2118 tree new_res = copy_ssa_name (PHI_RESULT (orig_phi));
2119 gphi *lcssa_phi = create_phi_node (new_res, between_bb);
2120 add_phi_arg (lcssa_phi, arg, single_exit (first), UNKNOWN_LOCATION);
2121 arg = new_res;
2122 }
2123
2124 /* Update PHI node in the second loop by replacing arg on the loop's
2125 incoming edge. */
2126 adjust_phi_and_debug_stmts (update_phi, second_preheader_e, arg);
2127 }
b81f2daf
RB
2128
2129 /* For epilogue peeling we have to make sure to copy all LC PHIs
2130 for correct vectorization of live stmts. */
2131 if (loop == first)
2132 {
2133 basic_block orig_exit = single_exit (second)->dest;
2134 for (gsi_orig = gsi_start_phis (orig_exit);
2135 !gsi_end_p (gsi_orig); gsi_next (&gsi_orig))
2136 {
2137 gphi *orig_phi = gsi_orig.phi ();
2138 tree orig_arg = PHI_ARG_DEF (orig_phi, 0);
2139 if (TREE_CODE (orig_arg) != SSA_NAME || virtual_operand_p (orig_arg))
2140 continue;
2141
2142 /* Already created in the above loop. */
2143 if (find_guard_arg (first, second, orig_phi))
2144 continue;
2145
2146 tree new_res = copy_ssa_name (orig_arg);
2147 gphi *lcphi = create_phi_node (new_res, between_bb);
2148 add_phi_arg (lcphi, orig_arg, single_exit (first), UNKNOWN_LOCATION);
2149 }
2150 }
a5e3d614
BC
2151}
2152
2153/* Function slpeel_add_loop_guard adds guard skipping from the beginning
2154 of SKIP_LOOP to the beginning of UPDATE_LOOP. GUARD_EDGE and MERGE_EDGE
2155 are two pred edges of the merge point before UPDATE_LOOP. The two loops
2156 appear like below:
2157
2158 guard_bb:
2159 if (cond)
2160 goto merge_bb;
2161 else
2162 goto skip_loop;
2163
2164 skip_loop:
2165 header_a:
2166 i_1 = PHI<i_0, i_2>;
2167 ...
2168 i_2 = i_1 + 1;
2169 if (cond_a)
2170 goto latch_a;
2171 else
2172 goto exit_a;
2173 latch_a:
2174 goto header_a;
2175
2176 exit_a:
2177 i_5 = PHI<i_2>;
2178
2179 merge_bb:
2180 ;; PHI (i_x = PHI<i_0, i_5>) to be created at merge point.
2181
2182 update_loop:
2183 header_b:
2184 i_3 = PHI<i_5, i_4>; ;; Use of i_5 to be replaced with i_x.
2185 ...
2186 i_4 = i_3 + 1;
2187 if (cond_b)
2188 goto latch_b;
2189 else
2190 goto exit_bb;
2191 latch_b:
2192 goto header_b;
2193
2194 exit_bb:
2195
2196 This function creates PHI nodes at merge_bb and replaces the use of i_5
2197 in the update_loop's PHI node with the new PHI result. */
2198
2199static void
99b1c316
MS
2200slpeel_update_phi_nodes_for_guard1 (class loop *skip_loop,
2201 class loop *update_loop,
a5e3d614
BC
2202 edge guard_edge, edge merge_edge)
2203{
620e594b 2204 location_t merge_loc, guard_loc;
a5e3d614
BC
2205 edge orig_e = loop_preheader_edge (skip_loop);
2206 edge update_e = loop_preheader_edge (update_loop);
2207 gphi_iterator gsi_orig, gsi_update;
2208
2209 for ((gsi_orig = gsi_start_phis (skip_loop->header),
2210 gsi_update = gsi_start_phis (update_loop->header));
2211 !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_update);
2212 gsi_next (&gsi_orig), gsi_next (&gsi_update))
2213 {
2214 gphi *orig_phi = gsi_orig.phi ();
2215 gphi *update_phi = gsi_update.phi ();
2216
2217 /* Generate new phi node at merge bb of the guard. */
2218 tree new_res = copy_ssa_name (PHI_RESULT (orig_phi));
2219 gphi *new_phi = create_phi_node (new_res, guard_edge->dest);
2220
2221 /* Merge bb has two incoming edges: GUARD_EDGE and MERGE_EDGE. Set the
2222 args in NEW_PHI for these edges. */
2223 tree merge_arg = PHI_ARG_DEF_FROM_EDGE (update_phi, update_e);
2224 tree guard_arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, orig_e);
2225 merge_loc = gimple_phi_arg_location_from_edge (update_phi, update_e);
2226 guard_loc = gimple_phi_arg_location_from_edge (orig_phi, orig_e);
2227 add_phi_arg (new_phi, merge_arg, merge_edge, merge_loc);
2228 add_phi_arg (new_phi, guard_arg, guard_edge, guard_loc);
2229
2230 /* Update phi in UPDATE_PHI. */
2231 adjust_phi_and_debug_stmts (update_phi, update_e, new_res);
2232 }
2233}
2234
a5e3d614
BC
2235/* LOOP and EPILOG are two consecutive loops in CFG and EPILOG is copied
2236 from LOOP. Function slpeel_add_loop_guard adds guard skipping from a
2237 point between the two loops to the end of EPILOG. Edges GUARD_EDGE
2238 and MERGE_EDGE are the two pred edges of merge_bb at the end of EPILOG.
2239 The CFG looks like:
2240
2241 loop:
2242 header_a:
2243 i_1 = PHI<i_0, i_2>;
2244 ...
2245 i_2 = i_1 + 1;
2246 if (cond_a)
2247 goto latch_a;
2248 else
2249 goto exit_a;
2250 latch_a:
2251 goto header_a;
2252
2253 exit_a:
2254
2255 guard_bb:
2256 if (cond)
2257 goto merge_bb;
2258 else
2259 goto epilog_loop;
2260
2261 ;; fall_through_bb
2262
2263 epilog_loop:
2264 header_b:
2265 i_3 = PHI<i_2, i_4>;
2266 ...
2267 i_4 = i_3 + 1;
2268 if (cond_b)
2269 goto latch_b;
2270 else
2271 goto merge_bb;
2272 latch_b:
2273 goto header_b;
2274
2275 merge_bb:
2276 ; PHI node (i_y = PHI<i_2, i_4>) to be created at merge point.
2277
2278 exit_bb:
2279 i_x = PHI<i_4>; ;Use of i_4 to be replaced with i_y in merge_bb.
2280
2281 For each name used outside EPILOG (i.e. for each name that has an lcssa
2282 phi in exit_bb) we create a new PHI in merge_bb. The new PHI has two
2283 args corresponding to GUARD_EDGE and MERGE_EDGE. Arg for MERGE_EDGE is
2284 the arg of the original PHI in exit_bb, arg for GUARD_EDGE is defined
2285 by LOOP and is found in the exit bb of LOOP. Arg of the original PHI
2286 in exit_bb will also be updated. */
2287
2288static void
99b1c316 2289slpeel_update_phi_nodes_for_guard2 (class loop *loop, class loop *epilog,
a5e3d614
BC
2290 edge guard_edge, edge merge_edge)
2291{
2292 gphi_iterator gsi;
2293 basic_block merge_bb = guard_edge->dest;
2294
2295 gcc_assert (single_succ_p (merge_bb));
2296 edge e = single_succ_edge (merge_bb);
2297 basic_block exit_bb = e->dest;
2298 gcc_assert (single_pred_p (exit_bb));
2299 gcc_assert (single_pred (exit_bb) == single_exit (epilog)->dest);
2300
2301 for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi); gsi_next (&gsi))
2302 {
2303 gphi *update_phi = gsi.phi ();
2304 tree old_arg = PHI_ARG_DEF (update_phi, 0);
a5e3d614 2305
2e7a4f57
AV
2306 tree merge_arg = NULL_TREE;
2307
2308 /* If the old argument is a SSA_NAME use its current_def. */
2309 if (TREE_CODE (old_arg) == SSA_NAME)
2310 merge_arg = get_current_def (old_arg);
2311 /* If it's a constant or doesn't have a current_def, just use the old
2312 argument. */
a5e3d614
BC
2313 if (!merge_arg)
2314 merge_arg = old_arg;
2315
2316 tree guard_arg = find_guard_arg (loop, epilog, update_phi);
2317 /* If the var is live after loop but not a reduction, we simply
2318 use the old arg. */
2319 if (!guard_arg)
2320 guard_arg = old_arg;
2321
2322 /* Create new phi node in MERGE_BB: */
2323 tree new_res = copy_ssa_name (PHI_RESULT (update_phi));
2324 gphi *merge_phi = create_phi_node (new_res, merge_bb);
2325
2326 /* MERGE_BB has two incoming edges: GUARD_EDGE and MERGE_EDGE, Set
2327 the two PHI args in merge_phi for these edges. */
2328 add_phi_arg (merge_phi, merge_arg, merge_edge, UNKNOWN_LOCATION);
2329 add_phi_arg (merge_phi, guard_arg, guard_edge, UNKNOWN_LOCATION);
2330
2331 /* Update the original phi in exit_bb. */
2332 adjust_phi_and_debug_stmts (update_phi, e, new_res);
2333 }
2334}
2335
2336/* EPILOG loop is duplicated from the original loop for vectorizing,
2337 the arg of its loop closed ssa PHI needs to be updated. */
2338
2339static void
99b1c316 2340slpeel_update_phi_nodes_for_lcssa (class loop *epilog)
a5e3d614
BC
2341{
2342 gphi_iterator gsi;
2343 basic_block exit_bb = single_exit (epilog)->dest;
2344
2345 gcc_assert (single_pred_p (exit_bb));
2346 edge e = EDGE_PRED (exit_bb, 0);
2347 for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi); gsi_next (&gsi))
2348 rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (gsi.phi (), e));
2349}
2350
2351/* Function vect_do_peeling.
2352
2353 Input:
2354 - LOOP_VINFO: Represent a loop to be vectorized, which looks like:
2355
2356 preheader:
2357 LOOP:
2358 header_bb:
2359 loop_body
2360 if (exit_loop_cond) goto exit_bb
2361 else goto header_bb
2362 exit_bb:
2363
2364 - NITERS: The number of iterations of the loop.
2365 - NITERSM1: The number of iterations of the loop's latch.
2366 - NITERS_NO_OVERFLOW: No overflow in computing NITERS.
2367 - TH, CHECK_PROFITABILITY: Threshold of niters to vectorize loop if
2368 CHECK_PROFITABILITY is true.
2369 Output:
0f26839a 2370 - *NITERS_VECTOR and *STEP_VECTOR describe how the main loop should
7cfb4d93 2371 iterate after vectorization; see vect_set_loop_condition for details.
0f26839a
RS
2372 - *NITERS_VECTOR_MULT_VF_VAR is either null or an SSA name that
2373 should be set to the number of scalar iterations handled by the
2374 vector loop. The SSA name is only used on exit from the loop.
a5e3d614
BC
2375
2376 This function peels prolog and epilog from the loop, adds guards skipping
2377 PROLOG and EPILOG for various conditions. As a result, the changed CFG
2378 would look like:
2379
2380 guard_bb_1:
2381 if (prefer_scalar_loop) goto merge_bb_1
2382 else goto guard_bb_2
2383
2384 guard_bb_2:
2385 if (skip_prolog) goto merge_bb_2
2386 else goto prolog_preheader
2387
2388 prolog_preheader:
2389 PROLOG:
2390 prolog_header_bb:
2391 prolog_body
2392 if (exit_prolog_cond) goto prolog_exit_bb
2393 else goto prolog_header_bb
2394 prolog_exit_bb:
2395
2396 merge_bb_2:
2397
2398 vector_preheader:
2399 VECTOR LOOP:
2400 vector_header_bb:
2401 vector_body
2402 if (exit_vector_cond) goto vector_exit_bb
2403 else goto vector_header_bb
2404 vector_exit_bb:
2405
2406 guard_bb_3:
2407 if (skip_epilog) goto merge_bb_3
2408 else goto epilog_preheader
2409
2410 merge_bb_1:
2411
2412 epilog_preheader:
2413 EPILOG:
2414 epilog_header_bb:
2415 epilog_body
2416 if (exit_epilog_cond) goto merge_bb_3
2417 else goto epilog_header_bb
2418
2419 merge_bb_3:
2420
2421 Note this function peels prolog and epilog only if it's necessary,
2422 as well as guards.
97c14603
AV
2423 This function returns the epilogue loop if a decision was made to vectorize
2424 it, otherwise NULL.
2425
2426 The analysis resulting in this epilogue loop's loop_vec_info was performed
2427 in the same vect_analyze_loop call as the main loop's. At that time
2428 vect_analyze_loop constructs a list of accepted loop_vec_info's for lower
2429 vectorization factors than the main loop. This list is stored in the main
2430 loop's loop_vec_info in the 'epilogue_vinfos' member. Every time we decide to
2431 vectorize the epilogue loop for a lower vectorization factor, the
2432 loop_vec_info sitting at the top of the epilogue_vinfos list is removed,
2433 updated and linked to the epilogue loop. This is later used to vectorize
2434 the epilogue. The reason the loop_vec_info needs updating is that it was
2435 constructed based on the original main loop, and the epilogue loop is a
2436 copy of this loop, so all links pointing to statements in the original loop
2437 need updating. Furthermore, these loop_vec_infos share the
2438 data_reference records, which will also need to be updated.
a5e3d614
BC
2439
2440 TODO: Guard for prefer_scalar_loop should be emitted along with
2441 versioning conditions if loop versioning is needed. */
2442
598eaaa2 2443
99b1c316 2444class loop *
a5e3d614 2445vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
0f26839a
RS
2446 tree *niters_vector, tree *step_vector,
2447 tree *niters_vector_mult_vf_var, int th,
97c14603
AV
2448 bool check_profitability, bool niters_no_overflow,
2449 tree *advance, drs_init_vec &orig_drs_init)
a5e3d614
BC
2450{
2451 edge e, guard_e;
2452 tree type = TREE_TYPE (niters), guard_cond;
2453 basic_block guard_bb, guard_to;
357067f2 2454 profile_probability prob_prolog, prob_vector, prob_epilog;
d9f21f6a 2455 int estimated_vf;
535e7c11 2456 int prolog_peeling = 0;
97c14603 2457 bool vect_epilogues = loop_vinfo->epilogue_vinfos.length () > 0;
ca31798e
AV
2458 /* We currently do not support prolog peeling if the target alignment is not
2459 known at compile time. 'vect_gen_prolog_loop_niters' depends on the
2460 target alignment being constant. */
2461 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2462 if (dr_info && !DR_TARGET_ALIGNMENT (dr_info).is_constant ())
2463 return NULL;
2464
535e7c11
RS
2465 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2466 prolog_peeling = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
d1d20a49
RS
2467
2468 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2469 poly_uint64 bound_epilog = 0;
2470 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
2471 && LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2472 bound_epilog += vf - 1;
2473 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2474 bound_epilog += 1;
2475 bool epilog_peeling = maybe_ne (bound_epilog, 0U);
2476 poly_uint64 bound_scalar = bound_epilog;
a5e3d614
BC
2477
2478 if (!prolog_peeling && !epilog_peeling)
598eaaa2 2479 return NULL;
a5e3d614 2480
357067f2 2481 prob_vector = profile_probability::guessed_always ().apply_scale (9, 10);
d9f21f6a
RS
2482 estimated_vf = vect_vf_for_cost (loop_vinfo);
2483 if (estimated_vf == 2)
2484 estimated_vf = 3;
357067f2 2485 prob_prolog = prob_epilog = profile_probability::guessed_always ()
d9f21f6a 2486 .apply_scale (estimated_vf - 1, estimated_vf);
a5e3d614 2487
99b1c316
MS
2488 class loop *prolog, *epilog = NULL, *loop = LOOP_VINFO_LOOP (loop_vinfo);
2489 class loop *first_loop = loop;
3e907b90 2490 bool irred_flag = loop_preheader_edge (loop)->flags & EDGE_IRREDUCIBLE_LOOP;
e3969868
RS
2491
2492 /* We might have a queued need to update virtual SSA form. As we
2493 delete the update SSA machinery below after doing a regular
2494 incremental SSA update during loop copying make sure we don't
2495 lose that fact.
2496 ??? Needing to update virtual SSA form by renaming is unfortunate
2497 but not all of the vectorizer code inserting new loads / stores
2498 properly assigns virtual operands to those statements. */
a5e3d614
BC
2499 update_ssa (TODO_update_ssa_only_virtuals);
2500
e3969868
RS
2501 create_lcssa_for_virtual_phi (loop);
2502
2503 /* If we're vectorizing an epilogue loop, the update_ssa above will
2504 have ensured that the virtual operand is in SSA form throughout the
2505 vectorized main loop. Normally it is possible to trace the updated
2506 vector-stmt vdefs back to scalar-stmt vdefs and vector-stmt vuses
2507 back to scalar-stmt vuses, meaning that the effect of the SSA update
2508 remains local to the main loop. However, there are rare cases in
2509 which the vectorized loop has vdefs even when the original scalar
2510 loop didn't. For example, vectorizing a load with IFN_LOAD_LANES
2511 introduces clobbers of the temporary vector array, which in turn
2512 needs new vdefs. If the scalar loop doesn't write to memory, these
2513 new vdefs will be the only ones in the vector loop.
2514
2515 In that case, update_ssa will have added a new virtual phi to the
2516 main loop, which previously didn't need one. Ensure that we (locally)
2517 maintain LCSSA form for the virtual operand, just as we would have
2518 done if the virtual phi had existed from the outset. This makes it
2519 easier to duplicate the scalar epilogue loop below. */
2520 tree vop_to_rename = NULL_TREE;
2521 if (loop_vec_info orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo))
2522 {
2523 class loop *orig_loop = LOOP_VINFO_LOOP (orig_loop_vinfo);
2524 vop_to_rename = create_lcssa_for_virtual_phi (orig_loop);
2525 }
2526
36f52e8f 2527 if (MAY_HAVE_DEBUG_BIND_STMTS)
a5e3d614
BC
2528 {
2529 gcc_assert (!adjust_vec.exists ());
2530 adjust_vec.create (32);
2531 }
ebfd146a
IR
2532 initialize_original_copy_tables ();
2533
d1d20a49
RS
2534 /* Record the anchor bb at which the guard should be placed if the scalar
2535 loop might be preferred. */
2536 basic_block anchor = loop_preheader_edge (loop)->src;
2537
2538 /* Generate the number of iterations for the prolog loop. We do this here
2539 so that we can also get the upper bound on the number of iterations. */
2540 tree niters_prolog;
2541 int bound_prolog = 0;
2542 if (prolog_peeling)
2543 niters_prolog = vect_gen_prolog_loop_niters (loop_vinfo, anchor,
97c14603 2544 &bound_prolog);
d1d20a49
RS
2545 else
2546 niters_prolog = build_int_cst (type, 0);
2547
97c14603
AV
2548 loop_vec_info epilogue_vinfo = NULL;
2549 if (vect_epilogues)
2550 {
2551 epilogue_vinfo = loop_vinfo->epilogue_vinfos[0];
2552 loop_vinfo->epilogue_vinfos.ordered_remove (0);
2553 }
2554
2555 tree niters_vector_mult_vf = NULL_TREE;
2556 /* Saving NITERs before the loop, as this may be changed by prologue. */
2557 tree before_loop_niters = LOOP_VINFO_NITERS (loop_vinfo);
2558 edge update_e = NULL, skip_e = NULL;
2559 unsigned int lowest_vf = constant_lower_bound (vf);
2560 /* If we know the number of scalar iterations for the main loop we should
2561 check whether after the main loop there are enough iterations left over
2562 for the epilogue. */
2563 if (vect_epilogues
2564 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2565 && prolog_peeling >= 0
2566 && known_eq (vf, lowest_vf))
2567 {
2568 unsigned HOST_WIDE_INT eiters
2569 = (LOOP_VINFO_INT_NITERS (loop_vinfo)
2570 - LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo));
2571
2572 eiters -= prolog_peeling;
2573 eiters
2574 = eiters % lowest_vf + LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo);
2575
2576 unsigned int ratio;
87b47251
AV
2577 unsigned int epilogue_gaps
2578 = LOOP_VINFO_PEELING_FOR_GAPS (epilogue_vinfo);
1c84a2d2
RS
2579 while (!(constant_multiple_p
2580 (GET_MODE_SIZE (loop_vinfo->vector_mode),
2581 GET_MODE_SIZE (epilogue_vinfo->vector_mode), &ratio)
87b47251 2582 && eiters >= lowest_vf / ratio + epilogue_gaps))
97c14603
AV
2583 {
2584 delete epilogue_vinfo;
2585 epilogue_vinfo = NULL;
2586 if (loop_vinfo->epilogue_vinfos.length () == 0)
2587 {
2588 vect_epilogues = false;
2589 break;
2590 }
2591 epilogue_vinfo = loop_vinfo->epilogue_vinfos[0];
2592 loop_vinfo->epilogue_vinfos.ordered_remove (0);
87b47251 2593 epilogue_gaps = LOOP_VINFO_PEELING_FOR_GAPS (epilogue_vinfo);
97c14603
AV
2594 }
2595 }
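  /* A worked example with values assumed for illustration: with
     NITERS = 103, PROLOG_PEELING = 3, a main-loop VF of 8 and no peeling
     for gaps, eiters above becomes (103 - 3) % 8 = 4; if the candidate
     epilogue uses vectors half the size of the main loop's (ratio = 2),
     it needs at least 8 / 2 = 4 iterations, so that candidate is kept;
     a smaller remainder would discard it and try the next one.  */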
a5e3d614
BC
2596 /* Prolog loop may be skipped. */
2597 bool skip_prolog = (prolog_peeling != 0);
97c14603
AV
2598 /* Skip straight to the epilog when there are not enough iterations to
2599 enter the vectorized loop. If true we should perform runtime checks
2600 on the NITERS to decide whether to skip the current vectorized loop.
2601 If we know the number of scalar iterations, we only add such a
2602 runtime check when that number may be smaller than the number of
2603 iterations required to enter this loop; for this we use the upper
2604 bounds on the prolog and epilog peeling. When we don't know the
2605 number of iterations and don't require versioning, the skip is not
2606 necessary because we have already asserted that there are enough
2607 scalar iterations to enter the main loop. When we are versioning,
2608 we only add such a skip if we have chosen to vectorize the
2609 epilogue. */
d1d20a49
RS
2610 bool skip_vector = (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2611 ? maybe_lt (LOOP_VINFO_INT_NITERS (loop_vinfo),
2612 bound_prolog + bound_epilog)
97c14603
AV
2613 : (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
2614 || vect_epilogues));
a5e3d614
BC
2615 /* Epilog loop must be executed if the number of iterations for epilog
2616 loop is known at compile time, otherwise we need to add a check at
2617 the end of vector loop and skip to the end of epilog loop. */
2618 bool skip_epilog = (prolog_peeling < 0
d9f21f6a
RS
2619 || !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2620 || !vf.is_constant ());
a5e3d614
BC
2621 /* PEELING_FOR_GAPS is special because epilog loop must be executed. */
2622 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2623 skip_epilog = false;
2624
a5e3d614 2625 if (skip_vector)
25c99850
BC
2626 {
2627 split_edge (loop_preheader_edge (loop));
2628
2629 /* Due to the order in which we peel prolog and epilog, we first
2630 propagate probability to the whole loop. The purpose is to
2631 avoid adjusting probabilities of both prolog and vector loops
2632 separately. Note in this case, the probability of epilog loop
2633 needs to be scaled back later. */
2634 basic_block bb_before_loop = loop_preheader_edge (loop)->src;
357067f2 2635 if (prob_vector.initialized_p ())
d9f21f6a
RS
2636 {
2637 scale_bbs_frequencies (&bb_before_loop, 1, prob_vector);
2638 scale_loop_profile (loop, prob_vector, 0);
2639 }
25c99850 2640 }
a5e3d614 2641
4f5b9c80 2642 dump_user_location_t loop_loc = find_loop_location (loop);
99b1c316 2643 class loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
97c14603
AV
2644 if (vect_epilogues)
2645 /* Make sure to set the epilogue's epilogue scalar loop, such that we can
2646 use the original scalar loop as remaining epilogue if necessary. */
2647 LOOP_VINFO_SCALAR_LOOP (epilogue_vinfo)
2648 = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
2649
a5e3d614 2650 if (prolog_peeling)
5d2eb24b 2651 {
a5e3d614
BC
2652 e = loop_preheader_edge (loop);
2653 if (!slpeel_can_duplicate_loop_p (loop, e))
5d2eb24b 2654 {
a5e3d614
BC
2655 dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2656 "loop can't be duplicated to preheader edge.\n");
2657 gcc_unreachable ();
2658 }
2659 /* Peel prolog and put it on preheader edge of loop. */
2660 prolog = slpeel_tree_duplicate_loop_to_edge_cfg (loop, scalar_loop, e);
2661 if (!prolog)
2662 {
2663 dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2664 "slpeel_tree_duplicate_loop_to_edge_cfg failed.\n");
2665 gcc_unreachable ();
2666 }
03001a35 2667 prolog->force_vectorize = false;
a5e3d614
BC
2668 slpeel_update_phi_nodes_for_loops (loop_vinfo, prolog, loop, true);
2669 first_loop = prolog;
2670 reset_original_copy_tables ();
2671
d1d20a49 2672 /* Update the number of iterations for prolog loop. */
0f26839a 2673 tree step_prolog = build_one_cst (TREE_TYPE (niters_prolog));
7cfb4d93
RS
2674 vect_set_loop_condition (prolog, NULL, niters_prolog,
2675 step_prolog, NULL_TREE, false);
a5e3d614
BC
2676
2677 /* Skip the prolog loop. */
2678 if (skip_prolog)
2679 {
2680 guard_cond = fold_build2 (EQ_EXPR, boolean_type_node,
2681 niters_prolog, build_int_cst (type, 0));
2682 guard_bb = loop_preheader_edge (prolog)->src;
25c99850 2683 basic_block bb_after_prolog = loop_preheader_edge (loop)->src;
a5e3d614
BC
2684 guard_to = split_edge (loop_preheader_edge (loop));
2685 guard_e = slpeel_add_loop_guard (guard_bb, guard_cond,
2686 guard_to, guard_bb,
357067f2 2687 prob_prolog.invert (),
3e907b90 2688 irred_flag);
a5e3d614
BC
2689 e = EDGE_PRED (guard_to, 0);
2690 e = (e != guard_e ? e : EDGE_PRED (guard_to, 1));
2691 slpeel_update_phi_nodes_for_guard1 (prolog, loop, guard_e, e);
25c99850 2692
af2bbc51
JH
2693 scale_bbs_frequencies (&bb_after_prolog, 1, prob_prolog);
2694 scale_loop_profile (prolog, prob_prolog, bound_prolog);
a5e3d614 2695 }
97c14603
AV
2696
2697 /* Save original inits for each data_reference before advancing them with
2698 NITERS_PROLOG. */
2699 unsigned int i;
2700 struct data_reference *dr;
2701 vec<data_reference_p> datarefs = loop_vinfo->shared->datarefs;
2702 FOR_EACH_VEC_ELT (datarefs, i, dr)
2703 orig_drs_init.safe_push (std::make_pair (dr, DR_OFFSET (dr)));
2704
a5e3d614 2705 /* Update init address of DRs. */
535e7c11 2706 vect_update_inits_of_drs (loop_vinfo, niters_prolog, PLUS_EXPR);
a5e3d614
BC
2707 /* Update niters for vector loop. */
2708 LOOP_VINFO_NITERS (loop_vinfo)
2709 = fold_build2 (MINUS_EXPR, type, niters, niters_prolog);
2710 LOOP_VINFO_NITERSM1 (loop_vinfo)
2711 = fold_build2 (MINUS_EXPR, type,
2712 LOOP_VINFO_NITERSM1 (loop_vinfo), niters_prolog);
7078979b
BC
2713 bool new_var_p = false;
2714 niters = vect_build_loop_niters (loop_vinfo, &new_var_p);
2715 /* It's guaranteed that vector loop bound before vectorization is at
2716 least VF, so set range information for newly generated var. */
2717 if (new_var_p)
2718 set_range_info (niters, VR_RANGE,
8e6cdc90
RS
2719 wi::to_wide (build_int_cst (type, vf)),
2720 wi::to_wide (TYPE_MAX_VALUE (type)));
a5e3d614 2721
cbb22e61
BC
2722 /* Prolog iterates at most bound_prolog times, latch iterates at
2723 most bound_prolog - 1 times. */
2724 record_niter_bound (prolog, bound_prolog - 1, false, true);
a5e3d614
BC
2725 delete_update_ssa ();
2726 adjust_vec_debug_stmts ();
2727 scev_reset ();
5d2eb24b
IR
2728 }
2729
a5e3d614
BC
2730 if (epilog_peeling)
2731 {
2732 e = single_exit (loop);
2733 if (!slpeel_can_duplicate_loop_p (loop, e))
2734 {
2735 dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2736 "loop can't be duplicated to exit edge.\n");
2737 gcc_unreachable ();
2738 }
97c14603
AV
2739 /* Peel epilog and put it on exit edge of loop. If we are vectorizing
2740 said epilog then we should use a copy of the main loop as a starting
2741 point. This loop may have already had some preliminary transformations
2742 to allow for more optimal vectorization, for example if-conversion.
2743 If we are not vectorizing the epilog then we should use the scalar loop
2744 as the transformations mentioned above make less or no sense when not
2745 vectorizing. */
2746 epilog = vect_epilogues ? get_loop_copy (loop) : scalar_loop;
e3969868
RS
2747 if (vop_to_rename)
2748 {
2749 /* Vectorizing the main loop can sometimes introduce a vdef to
2750 a loop that previously didn't have one; see the comment above
2751 the definition of VOP_TO_RENAME for details. The definition
2752 D that holds on E will then be different from the definition
2753 VOP_TO_RENAME that holds during SCALAR_LOOP, so we need to
2754 rename VOP_TO_RENAME to D when copying the loop.
2755
2756 The virtual operand is in LCSSA form for the main loop,
2757 and no stmt between the main loop and E needs a vdef,
2758 so we know that D is provided by a phi rather than by a
2759 vdef on a normal gimple stmt. */
2760 basic_block vdef_bb = e->src;
2761 gphi *vphi;
2762 while (!(vphi = get_virtual_phi (vdef_bb)))
2763 vdef_bb = get_immediate_dominator (CDI_DOMINATORS, vdef_bb);
2764 gcc_assert (vop_to_rename != gimple_phi_result (vphi));
2765 set_current_def (vop_to_rename, gimple_phi_result (vphi));
2766 }
97c14603 2767 epilog = slpeel_tree_duplicate_loop_to_edge_cfg (loop, epilog, e);
a5e3d614
BC
2768 if (!epilog)
2769 {
2770 dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2771 "slpeel_tree_duplicate_loop_to_edge_cfg failed.\n");
2772 gcc_unreachable ();
2773 }
03001a35 2774 epilog->force_vectorize = false;
a5e3d614
BC
2775 slpeel_update_phi_nodes_for_loops (loop_vinfo, loop, epilog, false);
2776
2777 /* Scalar version loop may be preferred. In this case, add guard
2778 and skip to epilog. Note this only happens when the number of
2779 iterations of loop is unknown at compile time, otherwise this
2780 won't be vectorized. */
2781 if (skip_vector)
2782 {
cbb22e61 2783 /* Additional epilogue iteration is peeled if gap exists. */
a5e3d614 2784 tree t = vect_gen_scalar_loop_niters (niters_prolog, prolog_peeling,
d1d20a49 2785 bound_prolog, bound_epilog,
cbb22e61 2786 th, &bound_scalar,
a5e3d614 2787 check_profitability);
cbb22e61
BC
2788 /* Build guard against NITERSM1 since NITERS may overflow. */
2789 guard_cond = fold_build2 (LT_EXPR, boolean_type_node, nitersm1, t);
a5e3d614
BC
2790 guard_bb = anchor;
2791 guard_to = split_edge (loop_preheader_edge (epilog));
2792 guard_e = slpeel_add_loop_guard (guard_bb, guard_cond,
2793 guard_to, guard_bb,
357067f2 2794 prob_vector.invert (),
3e907b90 2795 irred_flag);
97c14603 2796 skip_e = guard_e;
a5e3d614
BC
2797 e = EDGE_PRED (guard_to, 0);
2798 e = (e != guard_e ? e : EDGE_PRED (guard_to, 1));
2799 slpeel_update_phi_nodes_for_guard1 (first_loop, epilog, guard_e, e);
25c99850
BC
2800
2801 /* Simply propagate profile info from guard_bb to guard_to which is
2802 a merge point of control flow. */
25c99850 2803 guard_to->count = guard_bb->count;
00fa28d1 2804
af2bbc51
JH
2805 /* Scale probability of epilog loop back.
2806 FIXME: We should avoid scaling down and back up. Profile may
2807 get lost if we scale down to 0. */
10ea2672 2808 basic_block *bbs = get_loop_body (epilog);
00fa28d1
JH
2809 for (unsigned int i = 0; i < epilog->num_nodes; i++)
2810 bbs[i]->count = bbs[i]->count.apply_scale
2811 (bbs[i]->count,
2812 bbs[i]->count.apply_probability
2813 (prob_vector));
af2bbc51 2814 free (bbs);
a5e3d614 2815 }
ebfd146a 2816
25c99850 2817 basic_block bb_before_epilog = loop_preheader_edge (epilog)->src;
a5e3d614
BC
2818 /* If loop is peeled for non-zero constant times, now niters refers to
2819 orig_niters - prolog_peeling, it won't overflow even the orig_niters
2820 overflows. */
2821 niters_no_overflow |= (prolog_peeling > 0);
2822 vect_gen_vector_loop_niters (loop_vinfo, niters,
0f26839a
RS
2823 niters_vector, step_vector,
2824 niters_no_overflow);
2825 if (!integer_onep (*step_vector))
2826 {
2827 /* On exit from the loop we will have an easy way of calculating
2828 NITERS_VECTOR / STEP * STEP. Install a dummy definition
2829 until then. */
2830 niters_vector_mult_vf = make_ssa_name (TREE_TYPE (*niters_vector));
2831 SSA_NAME_DEF_STMT (niters_vector_mult_vf) = gimple_build_nop ();
2832 *niters_vector_mult_vf_var = niters_vector_mult_vf;
2833 }
2834 else
2835 vect_gen_vector_loop_niters_mult_vf (loop_vinfo, *niters_vector,
2836 &niters_vector_mult_vf);
a5e3d614
BC
2837 /* Update IVs of original loop as if they were advanced by
2838 niters_vector_mult_vf steps. */
2839 gcc_checking_assert (vect_can_advance_ivs_p (loop_vinfo));
97c14603 2840 update_e = skip_vector ? e : loop_preheader_edge (epilog);
a5e3d614
BC
2841 vect_update_ivs_after_vectorizer (loop_vinfo, niters_vector_mult_vf,
2842 update_e);
2843
2844 if (skip_epilog)
2845 {
2846 guard_cond = fold_build2 (EQ_EXPR, boolean_type_node,
2847 niters, niters_vector_mult_vf);
2848 guard_bb = single_exit (loop)->dest;
2849 guard_to = split_edge (single_exit (epilog));
2850 guard_e = slpeel_add_loop_guard (guard_bb, guard_cond, guard_to,
2851 skip_vector ? anchor : guard_bb,
357067f2 2852 prob_epilog.invert (),
3e907b90 2853 irred_flag);
a5e3d614
BC
2854 slpeel_update_phi_nodes_for_guard2 (loop, epilog, guard_e,
2855 single_exit (epilog));
25c99850
BC
2856 /* Only need to handle basic block before epilog loop if it's not
2857 the guard_bb, which is the case when skip_vector is true. */
2858 if (guard_bb != bb_before_epilog)
2859 {
357067f2 2860 prob_epilog = prob_vector * prob_epilog + prob_vector.invert ();
25c99850 2861
af2bbc51 2862 scale_bbs_frequencies (&bb_before_epilog, 1, prob_epilog);
25c99850 2863 }
d9f21f6a 2864 scale_loop_profile (epilog, prob_epilog, 0);
a5e3d614
BC
2865 }
2866 else
2867 slpeel_update_phi_nodes_for_lcssa (epilog);
ebfd146a 2868
d1d20a49
RS
2869 unsigned HOST_WIDE_INT bound;
2870 if (bound_scalar.is_constant (&bound))
d9f21f6a 2871 {
d1d20a49
RS
2872 gcc_assert (bound != 0);
2873 /* -1 to convert loop iterations to latch iterations. */
2874 record_niter_bound (epilog, bound - 1, false, true);
d9f21f6a 2875 }
a5e3d614
BC
2876
2877 delete_update_ssa ();
2878 adjust_vec_debug_stmts ();
2879 scev_reset ();
2880 }
97c14603
AV
2881
2882 if (vect_epilogues)
2883 {
2884 epilog->aux = epilogue_vinfo;
2885 LOOP_VINFO_LOOP (epilogue_vinfo) = epilog;
2886
2887 loop_constraint_clear (epilog, LOOP_C_INFINITE);
2888
2889 /* We now must calculate the number of iterations performed by the previous
2890 loop (NITERS) and the number to be performed by the epilogue (EPILOGUE_NITERS). */
2891 tree niters = fold_build2 (PLUS_EXPR, TREE_TYPE (niters_vector_mult_vf),
2892 niters_prolog, niters_vector_mult_vf);
2893
2894 /* If skip_vector we may skip the previous loop, we insert a phi-node to
2895 determine whether we are coming from the previous vectorized loop
2896 using the update_e edge or the skip_vector basic block using the
2897 skip_e edge. */
2898 if (skip_vector)
2899 {
2900 gcc_assert (update_e != NULL && skip_e != NULL);
2901 gphi *new_phi = create_phi_node (make_ssa_name (TREE_TYPE (niters)),
2902 update_e->dest);
2903 tree new_ssa = make_ssa_name (TREE_TYPE (niters));
2904 gimple *stmt = gimple_build_assign (new_ssa, niters);
2905 gimple_stmt_iterator gsi;
2906 if (TREE_CODE (niters_vector_mult_vf) == SSA_NAME
2907 && SSA_NAME_DEF_STMT (niters_vector_mult_vf)->bb != NULL)
2908 {
2909 gsi = gsi_for_stmt (SSA_NAME_DEF_STMT (niters_vector_mult_vf));
2910 gsi_insert_after (&gsi, stmt, GSI_NEW_STMT);
2911 }
2912 else
2913 {
2914 gsi = gsi_last_bb (update_e->src);
2915 gsi_insert_before (&gsi, stmt, GSI_NEW_STMT);
2916 }
2917
2918 niters = new_ssa;
2919 add_phi_arg (new_phi, niters, update_e, UNKNOWN_LOCATION);
2920 add_phi_arg (new_phi, build_zero_cst (TREE_TYPE (niters)), skip_e,
2921 UNKNOWN_LOCATION);
2922 niters = PHI_RESULT (new_phi);
2923 }
2924
2925 /* Subtract the number of iterations performed by the vectorized loop
2926 from the number of total iterations. */
2927 tree epilogue_niters = fold_build2 (MINUS_EXPR, TREE_TYPE (niters),
2928 before_loop_niters,
2929 niters);
2930
2931 LOOP_VINFO_NITERS (epilogue_vinfo) = epilogue_niters;
2932 LOOP_VINFO_NITERSM1 (epilogue_vinfo)
2933 = fold_build2 (MINUS_EXPR, TREE_TYPE (epilogue_niters),
2934 epilogue_niters,
2935 build_one_cst (TREE_TYPE (epilogue_niters)));
2936
2937 /* Set ADVANCE to the number of iterations performed by the previous
2938 loop and its prologue. */
2939 *advance = niters;
2940
2941 /* Redo the peeling for niter analysis as the NITERs and alignment
2942 may have been updated to take the main loop into account. */
2943 determine_peel_for_niter (epilogue_vinfo);
2944 }
2945
a5e3d614 2946 adjust_vec.release ();
ebfd146a 2947 free_original_copy_tables ();
598eaaa2 2948
97c14603 2949 return vect_epilogues ? epilog : NULL;
ebfd146a
IR
2950}
2951
01d32b2b
BC
2952/* Function vect_create_cond_for_niters_checks.
2953
2954 Create a conditional expression that represents the run-time checks for
ce811fc4 2955 loop's niter. The loop is guaranteed to terminate if the run-time
01d32b2b
BC
2956 checks hold.
2957
2958 Input:
2959   COND_EXPR - input conditional expression.  New conditions will be chained
2960               with logical AND operation.  If it is NULL, then the niter
2961               assumptions become the condition on their own.
2962   LOOP_VINFO - field LOOP_VINFO_NITERS_ASSUMPTIONS contains the assumptions
2963		  under which the niter information is valid.
2964
2965 Output:
2966 COND_EXPR - conditional expression.
2967
2968 The returned COND_EXPR is the conditional expression to be used in the
2969 if statement that controls which version of the loop gets executed at
2970 runtime. */
2971
2972static void
2973vect_create_cond_for_niters_checks (loop_vec_info loop_vinfo, tree *cond_expr)
2974{
2975 tree part_cond_expr = LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo);
2976
2977 if (*cond_expr)
2978 *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
2979 *cond_expr, part_cond_expr);
2980 else
2981 *cond_expr = part_cond_expr;
2982}
ebfd146a 2983
9adee305
RS
2984/* Set *COND_EXPR to a tree that is true when both the original *COND_EXPR
2985 and PART_COND_EXPR are true. Treat a null *COND_EXPR as "true". */
2986
2987static void
2988chain_cond_expr (tree *cond_expr, tree part_cond_expr)
2989{
2990 if (*cond_expr)
2991 *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
2992 *cond_expr, part_cond_expr);
2993 else
2994 *cond_expr = part_cond_expr;
2995}
2996
ebfd146a
IR
2997/* Function vect_create_cond_for_align_checks.
2998
2999 Create a conditional expression that represents the alignment checks for
3000   all of the data references (array element references) whose alignment must be
3001 checked at runtime.
3002
3003 Input:
3004 COND_EXPR - input conditional expression. New conditions will be chained
3005 with logical AND operation.
3006 LOOP_VINFO - two fields of the loop information are used.
3007 LOOP_VINFO_PTR_MASK is the mask used to check the alignment.
3008 LOOP_VINFO_MAY_MISALIGN_STMTS contains the refs to be checked.
3009
3010 Output:
3011 COND_EXPR_STMT_LIST - statements needed to construct the conditional
3012 expression.
3013 The returned value is the conditional expression to be used in the if
3014 statement that controls which version of the loop gets executed at runtime.
3015
3016 The algorithm makes two assumptions:
3017 1) The number of bytes "n" in a vector is a power of 2.
3018      2) An address "a" is aligned if a%n is zero, so this
3019         test can be done as a&(n-1) == 0.  For example, for
3020         16-byte vectors the test is a&0xf == 0.  */
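/* Editorial example (not part of the original source): a standalone C sketch
   of the runtime test this function builds, assuming two data references and
   16-byte vectors (LOOP_VINFO_PTR_MASK == 0xf); if extracted it needs
   <stdint.h> for uintptr_t:

     static int
     both_addrs_aligned_p (const void *a, const void *b)
     {
       uintptr_t or_tmp = (uintptr_t) a | (uintptr_t) b;
       // A nonzero low bit means at least one address is unaligned.
       return (or_tmp & 0xf) == 0;
     }
*/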
3021
3022static void
3023vect_create_cond_for_align_checks (loop_vec_info loop_vinfo,
3024 tree *cond_expr,
3025 gimple_seq *cond_expr_stmt_list)
3026{
7bcbf2d8 3027 vec<stmt_vec_info> may_misalign_stmts
ebfd146a 3028 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
7bcbf2d8 3029 stmt_vec_info stmt_info;
ebfd146a
IR
3030 int mask = LOOP_VINFO_PTR_MASK (loop_vinfo);
3031 tree mask_cst;
3032 unsigned int i;
ebfd146a
IR
3033 tree int_ptrsize_type;
3034 char tmp_name[20];
3035 tree or_tmp_name = NULL_TREE;
83d5977e 3036 tree and_tmp_name;
355fe088 3037 gimple *and_stmt;
ebfd146a
IR
3038 tree ptrsize_zero;
3039 tree part_cond_expr;
3040
3041 /* Check that mask is one less than a power of 2, i.e., mask is
3042 all zeros followed by all ones. */
3043 gcc_assert ((mask != 0) && ((mask & (mask+1)) == 0));
3044
96f9265a 3045 int_ptrsize_type = signed_type_for (ptr_type_node);
ebfd146a
IR
3046
3047  /* Create expression (mask & (dr_1 | ... | dr_n)) where dr_i is the address
3048 of the first vector of the i'th data reference. */
3049
7bcbf2d8 3050 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt_info)
ebfd146a
IR
3051 {
3052 gimple_seq new_stmt_list = NULL;
3053 tree addr_base;
83d5977e
RG
3054 tree addr_tmp_name;
3055 tree new_or_tmp_name;
355fe088 3056 gimple *addr_stmt, *or_stmt;
7bcbf2d8 3057 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
d8ba5b19 3058 bool negative = tree_int_cst_compare
7bcbf2d8 3059 (DR_STEP (STMT_VINFO_DATA_REF (stmt_info)), size_zero_node) < 0;
d8ba5b19 3060 tree offset = negative
aad83b7c 3061 ? size_int (-TYPE_VECTOR_SUBPARTS (vectype) + 1) : size_zero_node;
ebfd146a
IR
3062
3063 /* create: addr_tmp = (int)(address_of_first_vector) */
3064 addr_base =
7bcbf2d8 3065 vect_create_addr_base_for_vector_ref (stmt_info, &new_stmt_list,
3f5e8a76 3066 offset);
ebfd146a
IR
3067 if (new_stmt_list != NULL)
3068 gimple_seq_add_seq (cond_expr_stmt_list, new_stmt_list);
3069
83d5977e
RG
3070 sprintf (tmp_name, "addr2int%d", i);
3071 addr_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, tmp_name);
0d0e4a03 3072 addr_stmt = gimple_build_assign (addr_tmp_name, NOP_EXPR, addr_base);
ebfd146a
IR
3073 gimple_seq_add_stmt (cond_expr_stmt_list, addr_stmt);
3074
3075      /* The addresses are ORed together.  */
3076
3077 if (or_tmp_name != NULL_TREE)
3078 {
3079 /* create: or_tmp = or_tmp | addr_tmp */
83d5977e
RG
3080 sprintf (tmp_name, "orptrs%d", i);
3081 new_or_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, tmp_name);
0d0e4a03
JJ
3082 or_stmt = gimple_build_assign (new_or_tmp_name, BIT_IOR_EXPR,
3083 or_tmp_name, addr_tmp_name);
ebfd146a
IR
3084 gimple_seq_add_stmt (cond_expr_stmt_list, or_stmt);
3085 or_tmp_name = new_or_tmp_name;
3086 }
3087 else
3088 or_tmp_name = addr_tmp_name;
3089
3090 } /* end for i */
3091
3092 mask_cst = build_int_cst (int_ptrsize_type, mask);
3093
3094 /* create: and_tmp = or_tmp & mask */
83d5977e 3095 and_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, "andmask");
ebfd146a 3096
0d0e4a03
JJ
3097 and_stmt = gimple_build_assign (and_tmp_name, BIT_AND_EXPR,
3098 or_tmp_name, mask_cst);
ebfd146a
IR
3099 gimple_seq_add_stmt (cond_expr_stmt_list, and_stmt);
3100
3101 /* Make and_tmp the left operand of the conditional test against zero.
3102     If and_tmp has a nonzero bit then some address is unaligned.  */
3103 ptrsize_zero = build_int_cst (int_ptrsize_type, 0);
3104 part_cond_expr = fold_build2 (EQ_EXPR, boolean_type_node,
3105 and_tmp_name, ptrsize_zero);
9adee305
RS
3106 chain_cond_expr (cond_expr, part_cond_expr);
3107}
3108
3109/* If LOOP_VINFO_CHECK_UNEQUAL_ADDRS contains <A1, B1>, ..., <An, Bn>,
3110 create a tree representation of: (&A1 != &B1) && ... && (&An != &Bn).
3111 Set *COND_EXPR to a tree that is true when both the original *COND_EXPR
3112 and this new condition are true. Treat a null *COND_EXPR as "true". */
3113
3114static void
3115vect_create_cond_for_unequal_addrs (loop_vec_info loop_vinfo, tree *cond_expr)
3116{
3117 vec<vec_object_pair> pairs = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3118 unsigned int i;
3119 vec_object_pair *pair;
3120 FOR_EACH_VEC_ELT (pairs, i, pair)
3121 {
3122 tree addr1 = build_fold_addr_expr (pair->first);
3123 tree addr2 = build_fold_addr_expr (pair->second);
3124 tree part_cond_expr = fold_build2 (NE_EXPR, boolean_type_node,
3125 addr1, addr2);
3126 chain_cond_expr (cond_expr, part_cond_expr);
3127 }
ebfd146a
IR
3128}
3129
a57776a1
RS
3130/* Create an expression that is true when all lower-bound conditions for
3131 the vectorized loop are met. Chain this condition with *COND_EXPR. */
3132
3133static void
3134vect_create_cond_for_lower_bounds (loop_vec_info loop_vinfo, tree *cond_expr)
3135{
3136 vec<vec_lower_bound> lower_bounds = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
3137 for (unsigned int i = 0; i < lower_bounds.length (); ++i)
3138 {
3139 tree expr = lower_bounds[i].expr;
3140 tree type = unsigned_type_for (TREE_TYPE (expr));
3141 expr = fold_convert (type, expr);
3142 poly_uint64 bound = lower_bounds[i].min_value;
3143 if (!lower_bounds[i].unsigned_p)
3144 {
3145 expr = fold_build2 (PLUS_EXPR, type, expr,
3146 build_int_cstu (type, bound - 1));
3147 bound += bound - 1;
3148 }
3149 tree part_cond_expr = fold_build2 (GE_EXPR, boolean_type_node, expr,
3150 build_int_cstu (type, bound));
3151 chain_cond_expr (cond_expr, part_cond_expr);
3152 }
3153}
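/* Editorial example (not part of the original source), assuming a signed
   EXPR with MIN_VALUE == 4: the check built above becomes
     (unsigned) EXPR + 3 >= 7,
   which by modular arithmetic holds exactly when EXPR >= 4 or EXPR <= -4,
   i.e. when |EXPR| >= 4.  For instance EXPR == -1 wraps to 2 and fails,
   while EXPR == -4 wraps to UINT_MAX and passes.  */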
3154
ebfd146a
IR
3155/* Function vect_create_cond_for_alias_checks.
3156
3157 Create a conditional expression that represents the run-time checks for
3158   overlapping of address ranges represented by a list of data reference
3159 relations passed as input.
3160
3161 Input:
3162 COND_EXPR - input conditional expression. New conditions will be chained
a05a89fa
CH
3163               with logical AND operation.  If it is NULL, then the alias
3164               checks become the condition on their own.
ebfd146a
IR
3165   LOOP_VINFO - field LOOP_VINFO_COMP_ALIAS_DDRS contains the list of ddrs
3166 to be checked.
3167
3168 Output:
3169 COND_EXPR - conditional expression.
ebfd146a 3170
a05a89fa 3171 The returned COND_EXPR is the conditional expression to be used in the if
ebfd146a
IR
3172 statement that controls which version of the loop gets executed at runtime.
3173*/
3174
a05a89fa 3175void
4bdd44c4 3176vect_create_cond_for_alias_checks (loop_vec_info loop_vinfo, tree * cond_expr)
ebfd146a 3177{
93bdc3ed 3178 vec<dr_with_seg_len_pair_t> comp_alias_ddrs =
a05a89fa 3179 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
ebfd146a 3180
a05a89fa 3181 if (comp_alias_ddrs.is_empty ())
ebfd146a
IR
3182 return;
3183
9cbd2d97
BC
3184 create_runtime_alias_checks (LOOP_VINFO_LOOP (loop_vinfo),
3185 &comp_alias_ddrs, cond_expr);
73fbfcad 3186 if (dump_enabled_p ())
ccb3ad87 3187 dump_printf_loc (MSG_NOTE, vect_location,
78c60e3d 3188 "created %u versioning for alias checks.\n",
a05a89fa 3189 comp_alias_ddrs.length ());
ebfd146a
IR
3190}
3191
3192
3193/* Function vect_loop_versioning.
b8698a0f 3194
ebfd146a
IR
3195   If the loop has data references that may or may not be aligned and/or
3196   has data reference relations whose independence was not proven, then
3197 two versions of the loop need to be generated, one which is vectorized
3198 and one which isn't. A test is then generated to control which of the
3199 loops is executed. The test checks for the alignment of all of the
3200 data references that may or may not be aligned. An additional
3201   sequence of runtime tests is generated for each pair of DDRs whose
b8698a0f
L
3202   independence was not proven.  The vectorized version of the loop is
3203   executed only if both the alias and alignment tests pass.
3204
ebfd146a 3205   The test generated to check which version of the loop is executed
b8698a0f 3206 is modified to also check for profitability as indicated by the
d9157f15 3207 cost model threshold TH.
86290011
RG
3208
3209 The versioning precondition(s) are placed in *COND_EXPR and
d68d56b5 3210 *COND_EXPR_STMT_LIST. */
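/* Editorial sketch (not part of the original source): at the source level
   the transformation performed below is roughly

     if (runtime_checks_ok)        // alignment/alias/niters/cost checks
       for (i = 0; i < n; i++)     // version that will be vectorized
	 a[i] = b[i] + c[i];
     else
       for (i = 0; i < n; i++)     // original scalar version
	 a[i] = b[i] + c[i];

   where runtime_checks_ok stands for the COND_EXPR assembled from the
   vect_create_cond_for_* helpers above.  */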
ebfd146a 3211
99b1c316 3212class loop *
a421fe9e 3213vect_loop_versioning (loop_vec_info loop_vinfo)
ebfd146a 3214{
99b1c316
MS
3215 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *nloop;
3216 class loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
ebfd146a 3217 basic_block condition_bb;
538dd0b7
DM
3218 gphi_iterator gsi;
3219 gimple_stmt_iterator cond_exp_gsi;
ebfd146a
IR
3220 basic_block merge_bb;
3221 basic_block new_exit_bb;
3222 edge new_exit_e, e;
538dd0b7 3223 gphi *orig_phi, *new_phi;
368117e8 3224 tree cond_expr = NULL_TREE;
d68d56b5 3225 gimple_seq cond_expr_stmt_list = NULL;
ebfd146a 3226 tree arg;
af2bbc51 3227 profile_probability prob = profile_probability::likely ();
ebfd146a 3228 gimple_seq gimplify_stmt_list = NULL;
fdd29374 3229 tree scalar_loop_iters = LOOP_VINFO_NITERSM1 (loop_vinfo);
9cc1fb4b
XDL
3230 bool version_align = LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo);
3231 bool version_alias = LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo);
01d32b2b 3232 bool version_niter = LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo);
a421fe9e
AV
3233 poly_uint64 versioning_threshold
3234 = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
4e65deef
JJ
3235 tree version_simd_if_cond
3236 = LOOP_REQUIRES_VERSIONING_FOR_SIMD_IF_COND (loop_vinfo);
a421fe9e 3237 unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
ebfd146a 3238
6eed64b9 3239 if (vect_apply_runtime_profitability_check_p (loop_vinfo)
a421fe9e 3240 && !ordered_p (th, versioning_threshold))
80be3333 3241 cond_expr = fold_build2 (GE_EXPR, boolean_type_node, scalar_loop_iters,
01d32b2b 3242 build_int_cst (TREE_TYPE (scalar_loop_iters),
fdd29374 3243 th - 1));
a696bc4f
RS
3244 if (maybe_ne (versioning_threshold, 0U))
3245 {
3246 tree expr = fold_build2 (GE_EXPR, boolean_type_node, scalar_loop_iters,
3247 build_int_cst (TREE_TYPE (scalar_loop_iters),
3248 versioning_threshold - 1));
3249 if (cond_expr)
3250 cond_expr = fold_build2 (BIT_AND_EXPR, boolean_type_node,
3251 expr, cond_expr);
3252 else
3253 cond_expr = expr;
3254 }
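  /* Editorial note (not part of the original source): SCALAR_LOOP_ITERS here
     is NITERSM1, so the tests against TH - 1 and VERSIONING_THRESHOLD - 1
     above are equivalent to NITERS >= TH and NITERS >= VERSIONING_THRESHOLD;
     e.g. with TH == 4 the vectorized path is taken only when the loop runs
     at least 4 iterations.  */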
01d32b2b
BC
3255
3256 if (version_niter)
3257 vect_create_cond_for_niters_checks (loop_vinfo, &cond_expr);
3258
3259 if (cond_expr)
2778a719
RB
3260 cond_expr = force_gimple_operand_1 (unshare_expr (cond_expr),
3261 &cond_expr_stmt_list,
01d32b2b 3262 is_gimple_condexpr, NULL_TREE);
ebfd146a 3263
9cc1fb4b 3264 if (version_align)
d68d56b5
RG
3265 vect_create_cond_for_align_checks (loop_vinfo, &cond_expr,
3266 &cond_expr_stmt_list);
ebfd146a 3267
9cc1fb4b 3268 if (version_alias)
9adee305
RS
3269 {
3270 vect_create_cond_for_unequal_addrs (loop_vinfo, &cond_expr);
a57776a1 3271 vect_create_cond_for_lower_bounds (loop_vinfo, &cond_expr);
9adee305
RS
3272 vect_create_cond_for_alias_checks (loop_vinfo, &cond_expr);
3273 }
86290011 3274
4e65deef
JJ
3275 if (version_simd_if_cond)
3276 {
3277 gcc_assert (dom_info_available_p (CDI_DOMINATORS));
3278 if (flag_checking)
3279 if (basic_block bb
3280 = gimple_bb (SSA_NAME_DEF_STMT (version_simd_if_cond)))
3281 gcc_assert (bb != loop->header
3282 && dominated_by_p (CDI_DOMINATORS, loop->header, bb)
3283 && (scalar_loop == NULL
3284 || (bb != scalar_loop->header
3285 && dominated_by_p (CDI_DOMINATORS,
3286 scalar_loop->header, bb))));
3287 tree zero = build_zero_cst (TREE_TYPE (version_simd_if_cond));
3288 tree c = fold_build2 (NE_EXPR, boolean_type_node,
3289 version_simd_if_cond, zero);
3290 if (cond_expr)
3291 cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
3292 c, cond_expr);
3293 else
3294 cond_expr = c;
3295 if (dump_enabled_p ())
3296 dump_printf_loc (MSG_NOTE, vect_location,
3297 "created versioning for simd if condition check.\n");
3298 }
3299
b210f45f
RB
3300 cond_expr = force_gimple_operand_1 (unshare_expr (cond_expr),
3301 &gimplify_stmt_list,
d68d56b5
RG
3302 is_gimple_condexpr, NULL_TREE);
3303 gimple_seq_add_seq (&cond_expr_stmt_list, gimplify_stmt_list);
ebfd146a 3304
2778a719
RB
3305 /* Compute the outermost loop cond_expr and cond_expr_stmt_list are
3306 invariant in. */
99b1c316 3307 class loop *outermost = outermost_invariant_loop_for_expr (loop, cond_expr);
2778a719
RB
3308 for (gimple_stmt_iterator gsi = gsi_start (cond_expr_stmt_list);
3309 !gsi_end_p (gsi); gsi_next (&gsi))
5ce9450f 3310 {
2778a719
RB
3311 gimple *stmt = gsi_stmt (gsi);
3312 update_stmt (stmt);
3313 ssa_op_iter iter;
3314 use_operand_p use_p;
3315 basic_block def_bb;
3316 FOR_EACH_SSA_USE_OPERAND (use_p, stmt, iter, SSA_OP_USE)
3317 if ((def_bb = gimple_bb (SSA_NAME_DEF_STMT (USE_FROM_PTR (use_p))))
3318 && flow_bb_inside_loop_p (outermost, def_bb))
3319 outermost = superloop_at_depth (loop, bb_loop_depth (def_bb) + 1);
3320 }
5ce9450f 3321
2778a719
RB
3322 /* Search for the outermost loop we can version. Avoid versioning of
3323 non-perfect nests but allow if-conversion versioned loops inside. */
99b1c316 3324 class loop *loop_to_version = loop;
2778a719
RB
3325 if (flow_loop_nested_p (outermost, loop))
3326 {
3327 if (dump_enabled_p ())
3328 dump_printf_loc (MSG_NOTE, vect_location,
3329 "trying to apply versioning to outer loop %d\n",
3330 outermost->num);
3331 if (outermost->num == 0)
3332 outermost = superloop_at_depth (loop, 1);
3333 /* And avoid applying versioning on non-perfect nests. */
3334 while (loop_to_version != outermost
3335 && single_exit (loop_outer (loop_to_version))
3336 && (!loop_outer (loop_to_version)->inner->next
3337 || vect_loop_vectorized_call (loop_to_version))
3338 && (!loop_outer (loop_to_version)->inner->next
3339 || !loop_outer (loop_to_version)->inner->next->next))
3340 loop_to_version = loop_outer (loop_to_version);
3341 }
3342
3343 /* Apply versioning. If there is already a scalar version created by
b614fca2
RB
3344 if-conversion re-use that. Note we cannot re-use the copy of
3345 an if-converted outer-loop when vectorizing the inner loop only. */
2778a719 3346 gcond *cond;
b614fca2
RB
3347 gimple *call;
3348 if ((!loop_to_version->inner || loop == loop_to_version)
3349 && (call = vect_loop_vectorized_call (loop_to_version, &cond)))
2778a719
RB
3350 {
3351 gcc_assert (scalar_loop);
3352 condition_bb = gimple_bb (cond);
3353 gimple_cond_set_condition_from_tree (cond, cond_expr);
3354 update_stmt (cond);
3355
3356 if (cond_expr_stmt_list)
3357 {
3358 cond_exp_gsi = gsi_for_stmt (call);
3359 gsi_insert_seq_before (&cond_exp_gsi, cond_expr_stmt_list,
3360 GSI_SAME_STMT);
3361 }
3362
5cee3239
RB
3363 /* if-conversion uses profile_probability::always () for both paths,
3364	 reset the path probabilities appropriately.  */
3365 edge te, fe;
3366 extract_true_false_edges_from_block (condition_bb, &te, &fe);
3367 te->probability = prob;
3368 fe->probability = prob.invert ();
3369      /* We can scale loop counts immediately but have to postpone
3370 scaling the scalar loop because we re-use it during peeling. */
3371 scale_loop_frequencies (loop_to_version, te->probability);
3372 LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo) = fe->probability;
3373
2778a719
RB
3374 nloop = scalar_loop;
3375 if (dump_enabled_p ())
3376 dump_printf_loc (MSG_NOTE, vect_location,
3377 "reusing %sloop version created by if conversion\n",
3378 loop_to_version != loop ? "outer " : "");
5ce9450f
JJ
3379 }
3380 else
2778a719
RB
3381 {
3382 if (loop_to_version != loop
3383 && dump_enabled_p ())
3384 dump_printf_loc (MSG_NOTE, vect_location,
3385 "applying loop versioning to outer loop %d\n",
3386 loop_to_version->num);
3387
3388 initialize_original_copy_tables ();
3389 nloop = loop_version (loop_to_version, cond_expr, &condition_bb,
3390 prob, prob.invert (), prob, prob.invert (), true);
3391 gcc_assert (nloop);
3392 nloop = get_loop_copy (loop);
3393
3394      /* Kill off the IFN_LOOP_VECTORIZED calls in the copy, nobody will
3395 reap those otherwise; they also refer to the original
3396 loops. */
99b1c316 3397 class loop *l = loop;
2778a719
RB
3398 while (gimple *call = vect_loop_vectorized_call (l))
3399 {
3400 call = SSA_NAME_DEF_STMT (get_current_def (gimple_call_lhs (call)));
3401 fold_loop_internal_call (call, boolean_false_node);
3402 l = loop_outer (l);
3403 }
3404 free_original_copy_tables ();
3405
3406 if (cond_expr_stmt_list)
3407 {
3408 cond_exp_gsi = gsi_last_bb (condition_bb);
3409 gsi_insert_seq_before (&cond_exp_gsi, cond_expr_stmt_list,
3410 GSI_SAME_STMT);
3411 }
3412
3413 /* Loop versioning violates an assumption we try to maintain during
3414 vectorization - that the loop exit block has a single predecessor.
3415 After versioning, the exit block of both loop versions is the same
3416 basic block (i.e. it has two predecessors). Just in order to simplify
3417 following transformations in the vectorizer, we fix this situation
3418 here by adding a new (empty) block on the exit-edge of the loop,
3419 with the proper loop-exit phis to maintain loop-closed-form.
3420 If loop versioning wasn't done from loop, but scalar_loop instead,
3421	 merge_bb will already have just a single predecessor.  */
3422
3423 merge_bb = single_exit (loop_to_version)->dest;
3424 if (EDGE_COUNT (merge_bb->preds) >= 2)
3425 {
3426 gcc_assert (EDGE_COUNT (merge_bb->preds) >= 2);
3427 new_exit_bb = split_edge (single_exit (loop_to_version));
3428 new_exit_e = single_exit (loop_to_version);
3429 e = EDGE_SUCC (new_exit_bb, 0);
3430
3431 for (gsi = gsi_start_phis (merge_bb); !gsi_end_p (gsi);
3432 gsi_next (&gsi))
3433 {
3434 tree new_res;
3435 orig_phi = gsi.phi ();
3436 new_res = copy_ssa_name (PHI_RESULT (orig_phi));
3437 new_phi = create_phi_node (new_res, new_exit_bb);
3438 arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, e);
3439 add_phi_arg (new_phi, arg, new_exit_e,
3440 gimple_phi_arg_location_from_edge (orig_phi, e));
3441 adjust_phi_and_debug_stmts (orig_phi, e, PHI_RESULT (new_phi));
3442 }
3443 }
3444
3445 update_ssa (TODO_update_ssa);
3446 }
01d32b2b
BC
3447
3448 if (version_niter)
3449 {
3450 /* The versioned loop could be infinite, we need to clear existing
3451 niter information which is copied from the original loop. */
3452 gcc_assert (loop_constraint_set_p (loop, LOOP_C_FINITE));
3453 vect_free_loop_info_assumptions (nloop);
3454 /* And set constraint LOOP_C_INFINITE for niter analyzer. */
3455 loop_constraint_set (loop, LOOP_C_INFINITE);
3456 }
9cc1fb4b 3457
4f5b9c80 3458 if (LOCATION_LOCUS (vect_location.get_location_t ()) != UNKNOWN_LOCATION
9cc1fb4b
XDL
3459 && dump_enabled_p ())
3460 {
3461 if (version_alias)
7db960c5
DM
3462 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS | MSG_PRIORITY_USER_FACING,
3463 vect_location,
9cc1fb4b
XDL
3464 "loop versioned for vectorization because of "
3465 "possible aliasing\n");
3466 if (version_align)
7db960c5
DM
3467 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS | MSG_PRIORITY_USER_FACING,
3468 vect_location,
9cc1fb4b
XDL
3469 "loop versioned for vectorization to enhance "
3470 "alignment\n");
3471
3472 }
03001a35
RB
3473
3474 return nloop;
ebfd146a 3475}