fb85abff 1/* Loop Vectorization
fbd26352 2 Copyright (C) 2003-2019 Free Software Foundation, Inc.
48e1416a 3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
fb85abff 4 Ira Rosen <irar@il.ibm.com>
5
6This file is part of GCC.
7
8GCC is free software; you can redistribute it and/or modify it under
9the terms of the GNU General Public License as published by the Free
10Software Foundation; either version 3, or (at your option) any later
11version.
12
13GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14WARRANTY; without even the implied warranty of MERCHANTABILITY or
15FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16for more details.
17
18You should have received a copy of the GNU General Public License
19along with GCC; see the file COPYING3. If not see
20<http://www.gnu.org/licenses/>. */
21
22#include "config.h"
23#include "system.h"
24#include "coretypes.h"
9ef16211 25#include "backend.h"
7c29e30e 26#include "target.h"
27#include "rtl.h"
fb85abff 28#include "tree.h"
9ef16211 29#include "gimple.h"
7c29e30e 30#include "cfghooks.h"
31#include "tree-pass.h"
9ef16211 32#include "ssa.h"
7c29e30e 33#include "optabs-tree.h"
7c29e30e 34#include "diagnostic-core.h"
b20a8bb4 35#include "fold-const.h"
9ed99284 36#include "stor-layout.h"
94ea8568 37#include "cfganal.h"
a8783bee 38#include "gimplify.h"
dcf1a1ec 39#include "gimple-iterator.h"
e795d6e1 40#include "gimplify-me.h"
05d9c18a 41#include "tree-ssa-loop-ivopts.h"
42#include "tree-ssa-loop-manip.h"
43#include "tree-ssa-loop-niter.h"
d5e80d93 44#include "tree-ssa-loop.h"
fb85abff 45#include "cfgloop.h"
fb85abff 46#include "params.h"
fb85abff 47#include "tree-scalar-evolution.h"
48#include "tree-vectorizer.h"
23ffec42 49#include "gimple-fold.h"
0a08c1bc 50#include "cgraph.h"
75aae5b4 51#include "tree-cfg.h"
5b631e09 52#include "tree-if-conv.h"
e53664fa 53#include "internal-fn.h"
6a8c2cbc 54#include "tree-vector-builder.h"
d37760c5 55#include "vec-perm-indices.h"
04936b7c 56#include "tree-eh.h"
fb85abff 57
58/* Loop Vectorization Pass.
59
48e1416a 60 This pass tries to vectorize loops.
fb85abff 61
62 For example, the vectorizer transforms the following simple loop:
63
64 short a[N]; short b[N]; short c[N]; int i;
65
66 for (i=0; i<N; i++){
67 a[i] = b[i] + c[i];
68 }
69
 70 as if it were manually vectorized by rewriting the source code into:
71
72 typedef int __attribute__((mode(V8HI))) v8hi;
73 short a[N]; short b[N]; short c[N]; int i;
74 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
75 v8hi va, vb, vc;
76
77 for (i=0; i<N/8; i++){
78 vb = pb[i];
79 vc = pc[i];
80 va = vb + vc;
81 pa[i] = va;
82 }
83
84 The main entry to this pass is vectorize_loops(), in which
85 the vectorizer applies a set of analyses on a given set of loops,
86 followed by the actual vectorization transformation for the loops that
 87 have successfully passed the analysis phase.
88 Throughout this pass we make a distinction between two types of
89 data: scalars (which are represented by SSA_NAMES), and memory references
282bf14c 90 ("data-refs"). These two types of data require different handling both
fb85abff 91 during analysis and transformation. The types of data-refs that the
 92 vectorizer currently supports are ARRAY_REFS whose base is an array DECL
93 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
94 accesses are required to have a simple (consecutive) access pattern.
95
96 Analysis phase:
97 ===============
98 The driver for the analysis phase is vect_analyze_loop().
99 It applies a set of analyses, some of which rely on the scalar evolution
100 analyzer (scev) developed by Sebastian Pop.
101
102 During the analysis phase the vectorizer records some information
103 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
104 loop, as well as general information about the loop as a whole, which is
105 recorded in a "loop_vec_info" struct attached to each loop.
106
107 Transformation phase:
108 =====================
109 The loop transformation phase scans all the stmts in the loop, and
110 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
282bf14c 111 the loop that needs to be vectorized. It inserts the vector code sequence
fb85abff 112 just before the scalar stmt S, and records a pointer to the vector code
113 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
282bf14c 114 attached to S). This pointer will be used for the vectorization of following
fb85abff 115 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
116 otherwise, we rely on dead code elimination for removing it.
117
118 For example, say stmt S1 was vectorized into stmt VS1:
119
120 VS1: vb = px[i];
121 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
122 S2: a = b;
123
124 To vectorize stmt S2, the vectorizer first finds the stmt that defines
125 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
282bf14c 126 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
fb85abff 127 resulting sequence would be:
128
129 VS1: vb = px[i];
130 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
131 VS2: va = vb;
132 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
133
134 Operands that are not SSA_NAMEs, are data-refs that appear in
135 load/store operations (like 'x[i]' in S1), and are handled differently.
136
137 Target modeling:
138 =================
139 Currently the only target specific information that is used is the
2101edf2 140 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
 141 Targets that can support different sizes of vectors will, for now, need
282bf14c 142 to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
2101edf2 143 flexibility will be added in the future.
fb85abff 144
145 Since we only vectorize operations which vector form can be
146 expressed using existing tree codes, to verify that an operation is
147 supported, the vectorizer checks the relevant optab at the relevant
282bf14c 148 machine_mode (e.g, optab_handler (add_optab, V8HImode)). If
fb85abff 149 the value found is CODE_FOR_nothing, then there's no target support, and
150 we can't vectorize the stmt.
151
152 For additional information on this project see:
153 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
154*/
155
5938768b 156static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
157
187ee2a2 158/* Subroutine of vect_determine_vf_for_stmt that handles only one
159 statement. VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
160 may already be set for general statements (not just data refs). */
161
ed9370cc 162static opt_result
187ee2a2 163vect_determine_vf_for_stmt_1 (stmt_vec_info stmt_info,
164 bool vectype_maybe_set_p,
165 poly_uint64 *vf,
166 vec<stmt_vec_info > *mask_producers)
167{
168 gimple *stmt = stmt_info->stmt;
169
170 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
171 && !STMT_VINFO_LIVE_P (stmt_info))
172 || gimple_clobber_p (stmt))
173 {
174 if (dump_enabled_p ())
175 dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
ed9370cc 176 return opt_result::success ();
187ee2a2 177 }
178
179 tree stmt_vectype, nunits_vectype;
ed9370cc 180 opt_result res = vect_get_vector_types_for_stmt (stmt_info, &stmt_vectype,
181 &nunits_vectype);
182 if (!res)
183 return res;
187ee2a2 184
185 if (stmt_vectype)
186 {
187 if (STMT_VINFO_VECTYPE (stmt_info))
 188 /* The only case when a vectype has already been set is for stmts
189 that contain a data ref, or for "pattern-stmts" (stmts generated
190 by the vectorizer to represent/replace a certain idiom). */
191 gcc_assert ((STMT_VINFO_DATA_REF (stmt_info)
192 || vectype_maybe_set_p)
193 && STMT_VINFO_VECTYPE (stmt_info) == stmt_vectype);
194 else if (stmt_vectype == boolean_type_node)
195 mask_producers->safe_push (stmt_info);
196 else
197 STMT_VINFO_VECTYPE (stmt_info) = stmt_vectype;
198 }
199
200 if (nunits_vectype)
201 vect_update_max_nunits (vf, nunits_vectype);
202
ed9370cc 203 return opt_result::success ();
187ee2a2 204}
205
206/* Subroutine of vect_determine_vectorization_factor. Set the vector
207 types of STMT_INFO and all attached pattern statements and update
208 the vectorization factor VF accordingly. If some of the statements
209 produce a mask result whose vector type can only be calculated later,
 210 add them to MASK_PRODUCERS. Return an opt_result::success () if
 211 nothing prevented vectorization, or a failure result otherwise. */
212
ed9370cc 213static opt_result
187ee2a2 214vect_determine_vf_for_stmt (stmt_vec_info stmt_info, poly_uint64 *vf,
215 vec<stmt_vec_info > *mask_producers)
216{
03c0d666 217 vec_info *vinfo = stmt_info->vinfo;
187ee2a2 218 if (dump_enabled_p ())
a4e972e3 219 dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: %G",
220 stmt_info->stmt);
ed9370cc 221 opt_result res
222 = vect_determine_vf_for_stmt_1 (stmt_info, false, vf, mask_producers);
223 if (!res)
224 return res;
187ee2a2 225
226 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
227 && STMT_VINFO_RELATED_STMT (stmt_info))
228 {
da611310 229 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
aebdbd31 230 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
187ee2a2 231
232 /* If a pattern statement has def stmts, analyze them too. */
187ee2a2 233 for (gimple_stmt_iterator si = gsi_start (pattern_def_seq);
234 !gsi_end_p (si); gsi_next (&si))
235 {
03c0d666 236 stmt_vec_info def_stmt_info = vinfo->lookup_stmt (gsi_stmt (si));
187ee2a2 237 if (dump_enabled_p ())
a4e972e3 238 dump_printf_loc (MSG_NOTE, vect_location,
239 "==> examining pattern def stmt: %G",
240 def_stmt_info->stmt);
ed9370cc 243 res = vect_determine_vf_for_stmt_1 (def_stmt_info, true,
244 vf, mask_producers);
245 if (!res)
246 return res;
187ee2a2 247 }
248
249 if (dump_enabled_p ())
a4e972e3 250 dump_printf_loc (MSG_NOTE, vect_location,
251 "==> examining pattern statement: %G",
252 stmt_info->stmt);
ed9370cc 253 res = vect_determine_vf_for_stmt_1 (stmt_info, true, vf, mask_producers);
254 if (!res)
255 return res;
187ee2a2 256 }
257
ed9370cc 258 return opt_result::success ();
187ee2a2 259}
260
fb85abff 261/* Function vect_determine_vectorization_factor
262
282bf14c 263 Determine the vectorization factor (VF). VF is the number of data elements
fb85abff 264 that are operated upon in parallel in a single iteration of the vectorized
282bf14c 265 loop. For example, when vectorizing a loop that operates on 4byte elements,
fb85abff 266 on a target with vector size (VS) 16byte, the VF is set to 4, since 4
267 elements can fit in a single vector register.
268
269 We currently support vectorization of loops in which all types operated upon
282bf14c 270 are of the same size. Therefore this function currently sets VF according to
fb85abff 271 the size of the types operated upon, and fails if there are multiple sizes
272 in the loop.
273
274 VF is also the factor by which the loop iterations are strip-mined, e.g.:
275 original loop:
276 for (i=0; i<N; i++){
277 a[i] = b[i] + c[i];
278 }
279
280 vectorized loop:
281 for (i=0; i<N; i+=VF){
282 a[i:VF] = b[i:VF] + c[i:VF];
283 }
284*/
285
ed9370cc 286static opt_result
fb85abff 287vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
288{
2e966e2a 289 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
fb85abff 290 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
dab48979 291 unsigned nbbs = loop->num_nodes;
d75596cd 292 poly_uint64 vectorization_factor = 1;
e4c9e0a5 293 tree scalar_type = NULL_TREE;
1a91d914 294 gphi *phi;
fb85abff 295 tree vectype;
fb85abff 296 stmt_vec_info stmt_info;
dab48979 297 unsigned i;
dab48979 298 auto_vec<stmt_vec_info> mask_producers;
fb85abff 299
88f6eb8f 300 DUMP_VECT_SCOPE ("vect_determine_vectorization_factor");
fb85abff 301
302 for (i = 0; i < nbbs; i++)
303 {
304 basic_block bb = bbs[i];
305
1a91d914 306 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
307 gsi_next (&si))
fb85abff 308 {
1a91d914 309 phi = si.phi ();
03c0d666 310 stmt_info = loop_vinfo->lookup_stmt (phi);
6d8fb6cf 311 if (dump_enabled_p ())
a4e972e3 312 dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: %G",
313 phi);
fb85abff 314
315 gcc_assert (stmt_info);
316
abce4377 317 if (STMT_VINFO_RELEVANT_P (stmt_info)
318 || STMT_VINFO_LIVE_P (stmt_info))
fb85abff 319 {
320 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
321 scalar_type = TREE_TYPE (PHI_RESULT (phi));
322
6d8fb6cf 323 if (dump_enabled_p ())
a4e972e3 324 dump_printf_loc (MSG_NOTE, vect_location,
325 "get vectype for scalar type: %T\n",
326 scalar_type);
fb85abff 327
328 vectype = get_vectype_for_scalar_type (scalar_type);
329 if (!vectype)
ed9370cc 330 return opt_result::failure_at (phi,
331 "not vectorized: unsupported "
332 "data-type %T\n",
333 scalar_type);
fb85abff 334 STMT_VINFO_VECTYPE (stmt_info) = vectype;
335
6d8fb6cf 336 if (dump_enabled_p ())
a4e972e3 337 dump_printf_loc (MSG_NOTE, vect_location, "vectype: %T\n",
338 vectype);
fb85abff 339
6d8fb6cf 340 if (dump_enabled_p ())
f08ee65f 341 {
342 dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
343 dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
344 dump_printf (MSG_NOTE, "\n");
345 }
fb85abff 346
d75596cd 347 vect_update_max_nunits (&vectorization_factor, vectype);
fb85abff 348 }
349 }
350
187ee2a2 351 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
352 gsi_next (&si))
353 {
03c0d666 354 stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
ed9370cc 355 opt_result res
356 = vect_determine_vf_for_stmt (stmt_info, &vectorization_factor,
357 &mask_producers);
358 if (!res)
359 return res;
fb85abff 360 }
361 }
362
 363 /* TODO: Analyze cost. Decide if worthwhile to vectorize. */
6d8fb6cf 364 if (dump_enabled_p ())
d75596cd 365 {
366 dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
367 dump_dec (MSG_NOTE, vectorization_factor);
368 dump_printf (MSG_NOTE, "\n");
369 }
370
371 if (known_le (vectorization_factor, 1U))
ed9370cc 372 return opt_result::failure_at (vect_location,
373 "not vectorized: unsupported data-type\n");
fb85abff 374 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
375
dab48979 376 for (i = 0; i < mask_producers.length (); i++)
377 {
187ee2a2 378 stmt_info = mask_producers[i];
ed9370cc 379 opt_tree mask_type = vect_get_mask_type_for_stmt (stmt_info);
dab48979 380 if (!mask_type)
ed9370cc 381 return opt_result::propagate_failure (mask_type);
187ee2a2 382 STMT_VINFO_VECTYPE (stmt_info) = mask_type;
dab48979 383 }
384
ed9370cc 385 return opt_result::success ();
fb85abff 386}
387
388
389/* Function vect_is_simple_iv_evolution.
390
391 FORNOW: A simple evolution of an induction variables in the loop is
bb0d2509 392 considered a polynomial evolution. */
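/* For illustration: given the canonical counted loop

     for (i = 0; i < n; i++)
       a[i] = b[i];

   the scalar evolution of 'i' in loop 1 is the chrec {0, +, 1}_1, so this
   function treats it as simple and returns *INIT == 0 and *STEP == 1.
   A second-degree evolution such as {0, +, {0, +, 1}_1}_1, where the step
   itself varies, is rejected.  */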
fb85abff 393
394static bool
395vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
396 tree * step)
397{
398 tree init_expr;
399 tree step_expr;
400 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
bb0d2509 401 basic_block bb;
fb85abff 402
403 /* When there is no evolution in this loop, the evolution function
404 is not "simple". */
405 if (evolution_part == NULL_TREE)
406 return false;
407
408 /* When the evolution is a polynomial of degree >= 2
409 the evolution function is not "simple". */
410 if (tree_is_chrec (evolution_part))
411 return false;
412
413 step_expr = evolution_part;
414 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
415
6d8fb6cf 416 if (dump_enabled_p ())
a4e972e3 417 dump_printf_loc (MSG_NOTE, vect_location, "step: %T, init: %T\n",
418 step_expr, init_expr);
fb85abff 419
420 *init = init_expr;
421 *step = step_expr;
422
bb0d2509 423 if (TREE_CODE (step_expr) != INTEGER_CST
424 && (TREE_CODE (step_expr) != SSA_NAME
425 || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
1d62df1c 426 && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
427 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
428 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
429 || !flag_associative_math)))
430 && (TREE_CODE (step_expr) != REAL_CST
431 || !flag_associative_math))
fb85abff 432 {
6d8fb6cf 433 if (dump_enabled_p ())
7bd765d4 434 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
78bb46f5 435 "step unknown.\n");
fb85abff 436 return false;
437 }
438
439 return true;
440}
441
8073a327 442/* Return true if PHI, described by STMT_INFO, is the inner PHI in
443 what we are assuming is a double reduction. For example, given
444 a structure like this:
445
446 outer1:
447 x_1 = PHI <x_4(outer2), ...>;
448 ...
449
450 inner:
451 x_2 = PHI <x_1(outer1), ...>;
452 ...
453 x_3 = ...;
454 ...
455
456 outer2:
457 x_4 = PHI <x_3(inner)>;
458 ...
459
460 outer loop analysis would treat x_1 as a double reduction phi and
461 this function would then return true for x_2. */
462
463static bool
464vect_inner_phi_in_double_reduction_p (stmt_vec_info stmt_info, gphi *phi)
465{
466 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
467 use_operand_p use_p;
468 ssa_op_iter op_iter;
469 FOR_EACH_PHI_ARG (use_p, phi, op_iter, SSA_OP_USE)
470 if (stmt_vec_info def_info = loop_vinfo->lookup_def (USE_FROM_PTR (use_p)))
471 if (STMT_VINFO_DEF_TYPE (def_info) == vect_double_reduction_def)
472 return true;
473 return false;
474}
475
fb85abff 476/* Function vect_analyze_scalar_cycles_1.
477
478 Examine the cross iteration def-use cycles of scalar variables
282bf14c 479 in LOOP. LOOP_VINFO represents the loop that is now being
fb85abff 480 considered for vectorization (can be LOOP, or an outer-loop
481 enclosing LOOP). */
482
483static void
2e966e2a 484vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, class loop *loop)
fb85abff 485{
486 basic_block bb = loop->header;
bb0d2509 487 tree init, step;
d19d572a 488 auto_vec<stmt_vec_info, 64> worklist;
1a91d914 489 gphi_iterator gsi;
7aa0d350 490 bool double_reduc;
fb85abff 491
88f6eb8f 492 DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");
fb85abff 493
282bf14c 494 /* First - identify all inductions. Reduction detection assumes that all the
48e1416a 495 inductions have been identified; therefore, this order must not be
ade2ac53 496 changed. */
fb85abff 497 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
498 {
1a91d914 499 gphi *phi = gsi.phi ();
fb85abff 500 tree access_fn = NULL;
501 tree def = PHI_RESULT (phi);
03c0d666 502 stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (phi);
fb85abff 503
6d8fb6cf 504 if (dump_enabled_p ())
a4e972e3 505 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi);
fb85abff 506
282bf14c 507 /* Skip virtual phi's. The data dependences that are associated with
fb85abff 508 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
7c782c9b 509 if (virtual_operand_p (def))
fb85abff 510 continue;
511
512 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
513
514 /* Analyze the evolution function. */
515 access_fn = analyze_scalar_evolution (loop, def);
acf5dbc0 516 if (access_fn)
fb85abff 517 {
58280b1f 518 STRIP_NOPS (access_fn);
6d8fb6cf 519 if (dump_enabled_p ())
a4e972e3 520 dump_printf_loc (MSG_NOTE, vect_location,
521 "Access function of PHI: %T\n", access_fn);
559260b3 522 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
523 = initial_condition_in_loop_num (access_fn, loop->num);
58280b1f 524 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
525 = evolution_part_in_loop_num (access_fn, loop->num);
fb85abff 526 }
527
528 if (!access_fn
8073a327 529 || vect_inner_phi_in_double_reduction_p (stmt_vinfo, phi)
bb0d2509 530 || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
531 || (LOOP_VINFO_LOOP (loop_vinfo) != loop
532 && TREE_CODE (step) != INTEGER_CST))
fb85abff 533 {
a73182ff 534 worklist.safe_push (stmt_vinfo);
fb85abff 535 continue;
536 }
537
559260b3 538 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
539 != NULL_TREE);
86faead7 540 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
541
6d8fb6cf 542 if (dump_enabled_p ())
78bb46f5 543 dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
fb85abff 544 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
545 }
546
547
ade2ac53 548 /* Second - identify all reductions and nested cycles. */
f1f41a6c 549 while (worklist.length () > 0)
fb85abff 550 {
d19d572a 551 stmt_vec_info stmt_vinfo = worklist.pop ();
552 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
fb85abff 553 tree def = PHI_RESULT (phi);
fb85abff 554
6d8fb6cf 555 if (dump_enabled_p ())
a4e972e3 556 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi);
fb85abff 557
7c782c9b 558 gcc_assert (!virtual_operand_p (def)
559 && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
fb85abff 560
f4649a92 561 stmt_vec_info reduc_stmt_info
562 = vect_force_simple_reduction (loop_vinfo, stmt_vinfo,
563 &double_reduc, false);
564 if (reduc_stmt_info)
fb85abff 565 {
7aa0d350 566 if (double_reduc)
ade2ac53 567 {
6d8fb6cf 568 if (dump_enabled_p ())
7bd765d4 569 dump_printf_loc (MSG_NOTE, vect_location,
78bb46f5 570 "Detected double reduction.\n");
ade2ac53 571
7aa0d350 572 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
f4649a92 573 STMT_VINFO_DEF_TYPE (reduc_stmt_info)
574 = vect_double_reduction_def;
ade2ac53 575 }
48e1416a 576 else
ade2ac53 577 {
119a8852 578 if (loop != LOOP_VINFO_LOOP (loop_vinfo))
7aa0d350 579 {
6d8fb6cf 580 if (dump_enabled_p ())
7bd765d4 581 dump_printf_loc (MSG_NOTE, vect_location,
78bb46f5 582 "Detected vectorizable nested cycle.\n");
ade2ac53 583
7aa0d350 584 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
f4649a92 585 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_nested_cycle;
7aa0d350 586 }
587 else
588 {
6d8fb6cf 589 if (dump_enabled_p ())
7bd765d4 590 dump_printf_loc (MSG_NOTE, vect_location,
78bb46f5 591 "Detected reduction.\n");
7aa0d350 592
593 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
f4649a92 594 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_reduction_def;
eefa05c8 595 /* Store the reduction cycles for possible vectorization in
c640fbe7 596 loop-aware SLP if it was not detected as reduction
597 chain. */
f4649a92 598 if (! REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info))
599 LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push
600 (reduc_stmt_info);
7aa0d350 601 }
ade2ac53 602 }
fb85abff 603 }
604 else
6d8fb6cf 605 if (dump_enabled_p ())
7bd765d4 606 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
78bb46f5 607 "Unknown def-use cycle pattern.\n");
fb85abff 608 }
fb85abff 609}
610
611
612/* Function vect_analyze_scalar_cycles.
613
614 Examine the cross iteration def-use cycles of scalar variables, by
282bf14c 615 analyzing the loop-header PHIs of scalar variables. Classify each
fb85abff 616 cycle as one of the following: invariant, induction, reduction, unknown.
 617 We do that for the loop represented by LOOP_VINFO, and also for its
 618 inner-loop, if it exists.
619 Examples for scalar cycles:
620
621 Example1: reduction:
622
623 loop1:
624 for (i=0; i<N; i++)
625 sum += a[i];
626
627 Example2: induction:
628
629 loop2:
630 for (i=0; i<N; i++)
631 a[i] = i; */
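/* A further (hypothetical) example in the spirit of the two above:
   when the outer loop of

   loop3:
   for (i=0; i<N; i++)
     for (j=0; j<M; j++)
       sum += a[i][j];

   is considered for vectorization, the outer-loop header PHI of 'sum' is
   expected to be classified as a double reduction and the inner-loop
   header PHI of 'sum' as the corresponding inner phi (see
   vect_inner_phi_in_double_reduction_p above).  */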
632
633static void
634vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
635{
2e966e2a 636 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
fb85abff 637
638 vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
639
640 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
 641 Reductions in such an inner-loop therefore have different properties than
642 the reductions in the nest that gets vectorized:
643 1. When vectorized, they are executed in the same order as in the original
644 scalar loop, so we can't change the order of computation when
645 vectorizing them.
48e1416a 646 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
fb85abff 647 current checks are too strict. */
648
649 if (loop->inner)
650 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
651}
652
ecc42a77 653/* Transfer group and reduction information from STMT_INFO to its
654 pattern stmt. */
34563054 655
656static void
ecc42a77 657vect_fixup_reduc_chain (stmt_vec_info stmt_info)
34563054 658{
aebdbd31 659 stmt_vec_info firstp = STMT_VINFO_RELATED_STMT (stmt_info);
660 stmt_vec_info stmtp;
661 gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (firstp)
662 && REDUC_GROUP_FIRST_ELEMENT (stmt_info));
663 REDUC_GROUP_SIZE (firstp) = REDUC_GROUP_SIZE (stmt_info);
34563054 664 do
665 {
cd24aa3c 666 stmtp = STMT_VINFO_RELATED_STMT (stmt_info);
aebdbd31 667 REDUC_GROUP_FIRST_ELEMENT (stmtp) = firstp;
cd24aa3c 668 stmt_info = REDUC_GROUP_NEXT_ELEMENT (stmt_info);
669 if (stmt_info)
aebdbd31 670 REDUC_GROUP_NEXT_ELEMENT (stmtp)
cd24aa3c 671 = STMT_VINFO_RELATED_STMT (stmt_info);
34563054 672 }
cd24aa3c 673 while (stmt_info);
aebdbd31 674 STMT_VINFO_DEF_TYPE (stmtp) = vect_reduction_def;
34563054 675}
676
677/* Fixup scalar cycles that now have their stmts detected as patterns. */
678
679static void
680vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
681{
14dca1d8 682 stmt_vec_info first;
34563054 683 unsigned i;
684
685 FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
14dca1d8 686 if (STMT_VINFO_IN_PATTERN_P (first))
34563054 687 {
14dca1d8 688 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
3ff1b153 689 while (next)
690 {
cd24aa3c 691 if (! STMT_VINFO_IN_PATTERN_P (next))
3ff1b153 692 break;
cd24aa3c 693 next = REDUC_GROUP_NEXT_ELEMENT (next);
3ff1b153 694 }
 695 /* If not all stmts in the chain are patterns, try to handle
696 the chain without patterns. */
697 if (! next)
698 {
699 vect_fixup_reduc_chain (first);
700 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
14dca1d8 701 = STMT_VINFO_RELATED_STMT (first);
3ff1b153 702 }
34563054 703 }
704}
313a5120 705
fb85abff 706/* Function vect_get_loop_niters.
707
313a5120 708 Determine how many iterations the loop executes and place it
796f6cba 709 in NUMBER_OF_ITERATIONS. Place the number of latch iterations
d5e80d93 710 in NUMBER_OF_ITERATIONSM1. Place the condition under which the
711 niter information holds in ASSUMPTIONS.
313a5120 712
fb85abff 713 Return the loop exit condition. */
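/* For illustration: for a loop whose body runs n times (n > 0), the latch
   edge is taken n - 1 times, so *NUMBER_OF_ITERATIONSM1 is n - 1 and
   *NUMBER_OF_ITERATIONS, the number of header executions, is n.  */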
714
1a91d914 715
716static gcond *
2e966e2a 717vect_get_loop_niters (class loop *loop, tree *assumptions,
d5e80d93 718 tree *number_of_iterations, tree *number_of_iterationsm1)
fb85abff 719{
d5e80d93 720 edge exit = single_exit (loop);
2e966e2a 721 class tree_niter_desc niter_desc;
d5e80d93 722 tree niter_assumptions, niter, may_be_zero;
723 gcond *cond = get_loop_exit_condition (loop);
724
725 *assumptions = boolean_true_node;
726 *number_of_iterationsm1 = chrec_dont_know;
727 *number_of_iterations = chrec_dont_know;
88f6eb8f 728 DUMP_VECT_SCOPE ("get_loop_niters");
fb85abff 729
d5e80d93 730 if (!exit)
731 return cond;
732
d5e80d93 733 may_be_zero = NULL_TREE;
d5e80d93 734 if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
735 || chrec_contains_undetermined (niter_desc.niter))
736 return cond;
737
738 niter_assumptions = niter_desc.assumptions;
739 may_be_zero = niter_desc.may_be_zero;
740 niter = niter_desc.niter;
741
742 if (may_be_zero && integer_zerop (may_be_zero))
743 may_be_zero = NULL_TREE;
744
745 if (may_be_zero)
746 {
747 if (COMPARISON_CLASS_P (may_be_zero))
748 {
 749 /* Try to combine may_be_zero with assumptions; this can simplify
750 computation of niter expression. */
751 if (niter_assumptions && !integer_nonzerop (niter_assumptions))
752 niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
753 niter_assumptions,
754 fold_build1 (TRUTH_NOT_EXPR,
755 boolean_type_node,
756 may_be_zero));
757 else
758 niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
04936b7c 759 build_int_cst (TREE_TYPE (niter), 0),
760 rewrite_to_non_trapping_overflow (niter));
d5e80d93 761
762 may_be_zero = NULL_TREE;
763 }
764 else if (integer_nonzerop (may_be_zero))
765 {
766 *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
767 *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
768 return cond;
769 }
770 else
771 return cond;
772 }
773
774 *assumptions = niter_assumptions;
775 *number_of_iterationsm1 = niter;
796f6cba 776
313a5120 777 /* We want the number of loop header executions which is the number
778 of latch executions plus one.
779 ??? For UINT_MAX latch executions this number overflows to zero
780 for loops like do { n++; } while (n != 0); */
d5e80d93 781 if (niter && !chrec_contains_undetermined (niter))
782 niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter),
783 build_int_cst (TREE_TYPE (niter), 1));
784 *number_of_iterations = niter;
fb85abff 785
d5e80d93 786 return cond;
fb85abff 787}
788
fb85abff 789/* Function bb_in_loop_p
790
791 Used as predicate for dfs order traversal of the loop bbs. */
792
793static bool
794bb_in_loop_p (const_basic_block bb, const void *data)
795{
2e966e2a 796 const class loop *const loop = (const class loop *)data;
fb85abff 797 if (flow_bb_inside_loop_p (loop, bb))
798 return true;
799 return false;
800}
801
802
e15e8a2a 803/* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
804 stmt_vec_info structs for all the stmts in LOOP_IN. */
805
2e966e2a 806_loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
a99aba41 807 : vec_info (vec_info::loop, init_cost (loop_in), shared),
e15e8a2a 808 loop (loop_in),
809 bbs (XCNEWVEC (basic_block, loop->num_nodes)),
810 num_itersm1 (NULL_TREE),
811 num_iters (NULL_TREE),
812 num_iters_unchanged (NULL_TREE),
813 num_iters_assumptions (NULL_TREE),
814 th (0),
7456a7ea 815 versioning_threshold (0),
e15e8a2a 816 vectorization_factor (0),
4a85c0b1 817 max_vectorization_factor (0),
6753a4bf 818 mask_skip_niters (NULL_TREE),
60b29a7e 819 mask_compare_type (NULL_TREE),
1d86b8dc 820 simd_if_cond (NULL_TREE),
e15e8a2a 821 unaligned_dr (NULL),
822 peeling_for_alignment (0),
823 ptr_mask (0),
f404501a 824 ivexpr_map (NULL),
da008d72 825 scan_map (NULL),
e15e8a2a 826 slp_unrolling_factor (1),
827 single_scalar_iteration_cost (0),
828 vectorizable (false),
60b29a7e 829 can_fully_mask_p (true),
830 fully_masked_p (false),
e15e8a2a 831 peeling_for_gaps (false),
832 peeling_for_niter (false),
833 operands_swapped (false),
834 no_data_dependencies (false),
835 has_mask_store (false),
e3b3a12f 836 scalar_loop_scaling (profile_probability::uninitialized ()),
e15e8a2a 837 scalar_loop (NULL),
838 orig_loop_info (NULL)
fb85abff 839{
2482dbe8 840 /* CHECKME: We want to visit all BBs before their successors (except for
841 latch blocks, for which this assertion wouldn't hold). In the simple
 842 case of the loop forms we allow, a dfs order of the BBs would be the same
843 as reversed postorder traversal, so we are safe. */
844
845 unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
846 bbs, loop->num_nodes, loop);
847 gcc_assert (nbbs == loop->num_nodes);
848
849 for (unsigned int i = 0; i < nbbs; i++)
fb85abff 850 {
2482dbe8 851 basic_block bb = bbs[i];
e15e8a2a 852 gimple_stmt_iterator si;
fb85abff 853
3702cf13 854 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
855 {
856 gimple *phi = gsi_stmt (si);
857 gimple_set_uid (phi, 0);
04b2391d 858 add_stmt (phi);
3702cf13 859 }
fb85abff 860
3702cf13 861 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
862 {
863 gimple *stmt = gsi_stmt (si);
864 gimple_set_uid (stmt, 0);
04b2391d 865 add_stmt (stmt);
da008d72 866 /* If the .GOMP_SIMD_LANE call for the current loop has 3 arguments,
 867 the third argument is the #pragma omp simd if (x) condition: when 0,
1d86b8dc 868 the loop shouldn't be vectorized; when a non-zero constant, it should
 869 be vectorized normally; otherwise the loop is versioned, with the
 870 vectorized copy used if the condition is non-zero at runtime. */
871 if (loop_in->simduid
872 && is_gimple_call (stmt)
873 && gimple_call_internal_p (stmt)
874 && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
da008d72 875 && gimple_call_num_args (stmt) >= 3
1d86b8dc 876 && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
877 && (loop_in->simduid
878 == SSA_NAME_VAR (gimple_call_arg (stmt, 0))))
879 {
da008d72 880 tree arg = gimple_call_arg (stmt, 2);
1d86b8dc 881 if (integer_zerop (arg) || TREE_CODE (arg) == SSA_NAME)
882 simd_if_cond = arg;
883 else
884 gcc_assert (integer_nonzerop (arg));
885 }
3702cf13 886 }
fb85abff 887 }
fb85abff 888}
889
60b29a7e 890/* Free all levels of MASKS. */
891
892void
893release_vec_loop_masks (vec_loop_masks *masks)
894{
895 rgroup_masks *rgm;
896 unsigned int i;
897 FOR_EACH_VEC_ELT (*masks, i, rgm)
898 rgm->masks.release ();
899 masks->release ();
900}
fb85abff 901
e15e8a2a 902/* Free all memory used by the _loop_vec_info, as well as all the
903 stmt_vec_info structs of all the stmts in the loop. */
fb85abff 904
e15e8a2a 905_loop_vec_info::~_loop_vec_info ()
fb85abff 906{
fb85abff 907 int nbbs;
908 gimple_stmt_iterator si;
909 int j;
fb85abff 910
e15e8a2a 911 nbbs = loop->num_nodes;
fb85abff 912 for (j = 0; j < nbbs; j++)
913 {
914 basic_block bb = bbs[j];
fb85abff 915 for (si = gsi_start_bb (bb); !gsi_end_p (si); )
916 {
42acab1c 917 gimple *stmt = gsi_stmt (si);
ba69439f 918
919 /* We may have broken canonical form by moving a constant
920 into RHS1 of a commutative op. Fix such occurrences. */
e15e8a2a 921 if (operands_swapped && is_gimple_assign (stmt))
ba69439f 922 {
923 enum tree_code code = gimple_assign_rhs_code (stmt);
924
925 if ((code == PLUS_EXPR
926 || code == POINTER_PLUS_EXPR
927 || code == MULT_EXPR)
928 && CONSTANT_CLASS_P (gimple_assign_rhs1 (stmt)))
8f6fa493 929 swap_ssa_operands (stmt,
930 gimple_assign_rhs1_ptr (stmt),
931 gimple_assign_rhs2_ptr (stmt));
bbb60482 932 else if (code == COND_EXPR
933 && CONSTANT_CLASS_P (gimple_assign_rhs2 (stmt)))
934 {
935 tree cond_expr = gimple_assign_rhs1 (stmt);
936 enum tree_code cond_code = TREE_CODE (cond_expr);
937
938 if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
939 {
940 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr,
941 0));
942 cond_code = invert_tree_comparison (cond_code,
943 honor_nans);
944 if (cond_code != ERROR_MARK)
945 {
946 TREE_SET_CODE (cond_expr, cond_code);
947 swap_ssa_operands (stmt,
948 gimple_assign_rhs2_ptr (stmt),
949 gimple_assign_rhs3_ptr (stmt));
950 }
951 }
952 }
ba69439f 953 }
fb85abff 954 gsi_next (&si);
955 }
956 }
957
e15e8a2a 958 free (bbs);
f68a7726 959
60b29a7e 960 release_vec_loop_masks (&masks);
f404501a 961 delete ivexpr_map;
da008d72 962 delete scan_map;
60b29a7e 963
fb85abff 964 loop->aux = NULL;
965}
966
f404501a 967/* Return an invariant or register for EXPR and emit necessary
968 computations in the LOOP_VINFO loop preheader. */
969
970tree
971cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
972{
973 if (is_gimple_reg (expr)
974 || is_gimple_min_invariant (expr))
975 return expr;
976
977 if (! loop_vinfo->ivexpr_map)
978 loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
979 tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
980 if (! cached)
981 {
982 gimple_seq stmts = NULL;
983 cached = force_gimple_operand (unshare_expr (expr),
984 &stmts, true, NULL_TREE);
985 if (stmts)
986 {
987 edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
988 gsi_insert_seq_on_edge_immediate (e, stmts);
989 }
990 }
991 return cached;
992}
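/* For illustration, a hypothetical caller that needs a loop-invariant
   expression EXPR available as a register would do:

     tree t = cse_and_gimplify_to_preheader (loop_vinfo, expr);

   T is then either EXPR itself (if it was already a register or an
   invariant) or an SSA name computed on the preheader edge; repeated
   calls with an operand-equal EXPR reuse the cached name.  */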
993
60b29a7e 994/* Return true if we can use CMP_TYPE as the comparison type to produce
995 all masks required to mask LOOP_VINFO. */
996
997static bool
998can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
999{
1000 rgroup_masks *rgm;
1001 unsigned int i;
1002 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
1003 if (rgm->mask_type != NULL_TREE
1004 && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
1005 cmp_type, rgm->mask_type,
1006 OPTIMIZE_FOR_SPEED))
1007 return false;
1008 return true;
1009}
1010
 1011/* Calculate the maximum number of scalars per iteration over all the
 1012 rgroups in LOOP_VINFO. */
1013
1014static unsigned int
1015vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
1016{
1017 unsigned int res = 1;
1018 unsigned int i;
1019 rgroup_masks *rgm;
1020 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
1021 res = MAX (res, rgm->max_nscalars_per_iter);
1022 return res;
1023}
1024
1025/* Each statement in LOOP_VINFO can be masked where necessary. Check
1026 whether we can actually generate the masks required. Return true if so,
 1027 storing the preferred comparison type in LOOP_VINFO_MASK_COMPARE_TYPE
 and the scalar IV type in LOOP_VINFO_MASK_IV_TYPE. */
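/* For illustration, a fully-masked loop can be thought of as executing

     for (i = 0; i < niters; i += VF)
       {
	 mask = WHILE_ULT (i, niters);   (lane L active iff i + L < niters)
	 ... loads, stores and live-out computations predicated on mask ...
       }

   so that no scalar epilogue is needed.  The checks below make sure the
   target can produce such masks, via IFN_WHILE_ULT, for every mask type
   recorded in LOOP_VINFO_MASKS.  */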
1028
1029static bool
1030vect_verify_full_masking (loop_vec_info loop_vinfo)
1031{
2e966e2a 1032 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
60b29a7e 1033 unsigned int min_ni_width;
ef871d99 1034 unsigned int max_nscalars_per_iter
1035 = vect_get_max_nscalars_per_iter (loop_vinfo);
60b29a7e 1036
1ae144e9 1037 /* Use a normal loop if there are no statements that need masking.
1038 This only happens in rare degenerate cases: it means that the loop
1039 has no loads, no stores, and no live-out values. */
1040 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1041 return false;
1042
60b29a7e 1043 /* Get the maximum number of iterations that is representable
1044 in the counter type. */
1045 tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
1046 widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
1047
1048 /* Get a more refined estimate for the number of iterations. */
1049 widest_int max_back_edges;
1050 if (max_loop_iterations (loop, &max_back_edges))
1051 max_ni = wi::smin (max_ni, max_back_edges + 1);
1052
1053 /* Account for rgroup masks, in which each bit is replicated N times. */
ef871d99 1054 max_ni *= max_nscalars_per_iter;
60b29a7e 1055
1056 /* Work out how many bits we need to represent the limit. */
1057 min_ni_width = wi::min_precision (max_ni, UNSIGNED);
1058
1059 /* Find a scalar mode for which WHILE_ULT is supported. */
1060 opt_scalar_int_mode cmp_mode_iter;
1061 tree cmp_type = NULL_TREE;
ef871d99 1062 tree iv_type = NULL_TREE;
1063 widest_int iv_limit = vect_iv_limit_for_full_masking (loop_vinfo);
8ac88374 1064 unsigned int iv_precision = UINT_MAX;
ef871d99 1065
1066 if (iv_limit != -1)
1067 iv_precision = wi::min_precision (iv_limit * max_nscalars_per_iter,
1068 UNSIGNED);
1069
60b29a7e 1070 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1071 {
1072 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1073 if (cmp_bits >= min_ni_width
1074 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1075 {
1076 tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1077 if (this_type
1078 && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1079 {
1080 /* Although we could stop as soon as we find a valid mode,
ef871d99 1081 there are at least two reasons why that's not always the
1082 best choice:
1083
1084 - An IV that's Pmode or wider is more likely to be reusable
8ac88374 1085 in address calculations than an IV that's narrower than
1086 Pmode.
ef871d99 1087
1088 - Doing the comparison in IV_PRECISION or wider allows
8ac88374 1089 a natural 0-based IV, whereas using a narrower comparison
1090 type requires mitigations against wrap-around.
ef871d99 1091
1092 Conversely, if the IV limit is variable, doing the comparison
1093 in a wider type than the original type can introduce
1094 unnecessary extensions, so picking the widest valid mode
1095 is not always a good choice either.
1096
1097 Here we prefer the first IV type that's Pmode or wider,
1098 and the first comparison type that's IV_PRECISION or wider.
1099 (The comparison type must be no wider than the IV type,
1100 to avoid extensions in the vector loop.)
1101
1102 ??? We might want to try continuing beyond Pmode for ILP32
1103 targets if CMP_BITS < IV_PRECISION. */
1104 iv_type = this_type;
1105 if (!cmp_type || iv_precision > TYPE_PRECISION (cmp_type))
1106 cmp_type = this_type;
60b29a7e 1107 if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1108 break;
1109 }
1110 }
1111 }
1112
1113 if (!cmp_type)
1114 return false;
1115
1116 LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo) = cmp_type;
ef871d99 1117 LOOP_VINFO_MASK_IV_TYPE (loop_vinfo) = iv_type;
60b29a7e 1118 return true;
1119}
fb85abff 1120
2a9a3444 1121/* Calculate the cost of one scalar iteration of the loop. */
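/* For illustration: a scalar body containing one load, one arithmetic
   statement and one store contributes one scalar_load, one scalar_stmt
   and one scalar_store cost per iteration; statements that belong to an
   inner loop are additionally weighted by an assumed 50 iterations (see
   the FIXME below).  */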
1122static void
00ecf4da 1123vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
2a9a3444 1124{
2e966e2a 1125 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2a9a3444 1126 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
2538bd2e 1127 int nbbs = loop->num_nodes, factor;
2a9a3444 1128 int innerloop_iters, i;
1129
524665d0 1130 DUMP_VECT_SCOPE ("vect_compute_single_scalar_iteration_cost");
1131
2538bd2e 1132 /* Gather costs for statements in the scalar loop. */
2a9a3444 1133
1134 /* FORNOW. */
1135 innerloop_iters = 1;
1136 if (loop->inner)
1137 innerloop_iters = 50; /* FIXME */
1138
1139 for (i = 0; i < nbbs; i++)
1140 {
1141 gimple_stmt_iterator si;
1142 basic_block bb = bbs[i];
1143
1144 if (bb->loop_father == loop->inner)
1145 factor = innerloop_iters;
1146 else
1147 factor = 1;
1148
1149 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1150 {
42acab1c 1151 gimple *stmt = gsi_stmt (si);
03c0d666 1152 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
2a9a3444 1153
1154 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1155 continue;
1156
1157 /* Skip stmts that are not vectorized inside the loop. */
1d5511dd 1158 stmt_vec_info vstmt_info = vect_stmt_to_vectorize (stmt_info);
1159 if (!STMT_VINFO_RELEVANT_P (vstmt_info)
1160 && (!STMT_VINFO_LIVE_P (vstmt_info)
1161 || !VECTORIZABLE_CYCLE_DEF
1162 (STMT_VINFO_DEF_TYPE (vstmt_info))))
2a9a3444 1163 continue;
1164
1165 vect_cost_for_stmt kind;
a1b0b75c 1166 if (STMT_VINFO_DATA_REF (stmt_info))
2a9a3444 1167 {
a1b0b75c 1168 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
2a9a3444 1169 kind = scalar_load;
1170 else
1171 kind = scalar_store;
1172 }
1173 else
1174 kind = scalar_stmt;
1175
2538bd2e 1176 record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1177 factor, kind, stmt_info, 0, vect_prologue);
2a9a3444 1178 }
1179 }
2538bd2e 1180
1181 /* Now accumulate cost. */
1182 void *target_cost_data = init_cost (loop);
1183 stmt_info_for_cost *si;
1184 int j;
1185 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1186 j, si)
1aeaa139 1187 (void) add_stmt_cost (target_cost_data, si->count,
1188 si->kind, si->stmt_info, si->misalign,
1189 vect_body);
2538bd2e 1190 unsigned dummy, body_cost = 0;
1191 finish_cost (target_cost_data, &dummy, &body_cost, &dummy);
1192 destroy_cost_data (target_cost_data);
1193 LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo) = body_cost;
2a9a3444 1194}
1195
1196
3702cf13 1197/* Function vect_analyze_loop_form_1.
fb85abff 1198
1199 Verify that certain CFG restrictions hold, including:
1200 - the loop has a pre-header
1201 - the loop has a single entry and exit
d5e80d93 1202 - the loop exit condition is simple enough
 1203 - the number of iterations can be analyzed, i.e., a countable loop. The
1204 niter could be analyzed under some assumptions. */
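/* For illustration, a (hypothetical) loop such as

     for (i = 0; i < n; i++)
       if (a[i] == key)
	 break;

   fails these checks: the early break gives the loop two exits and more
   than two basic blocks, so it is rejected below.  */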
fb85abff 1205
ed9370cc 1206opt_result
2e966e2a 1207vect_analyze_loop_form_1 (class loop *loop, gcond **loop_cond,
d5e80d93 1208 tree *assumptions, tree *number_of_iterationsm1,
3702cf13 1209 tree *number_of_iterations, gcond **inner_loop_cond)
fb85abff 1210{
88f6eb8f 1211 DUMP_VECT_SCOPE ("vect_analyze_loop_form");
fb85abff 1212
1213 /* Different restrictions apply when we are considering an inner-most loop,
48e1416a 1214 vs. an outer (nested) loop.
fb85abff 1215 (FORNOW. May want to relax some of these restrictions in the future). */
1216
1217 if (!loop->inner)
1218 {
48e1416a 1219 /* Inner-most loop. We currently require that the number of BBs is
1220 exactly 2 (the header and latch). Vectorizable inner-most loops
fb85abff 1221 look like this:
1222
1223 (pre-header)
1224 |
1225 header <--------+
1226 | | |
1227 | +--> latch --+
1228 |
1229 (exit-bb) */
1230
1231 if (loop->num_nodes != 2)
ed9370cc 1232 return opt_result::failure_at (vect_location,
1233 "not vectorized:"
1234 " control flow in loop.\n");
fb85abff 1235
1236 if (empty_block_p (loop->header))
ed9370cc 1237 return opt_result::failure_at (vect_location,
1238 "not vectorized: empty loop.\n");
fb85abff 1239 }
1240 else
1241 {
2e966e2a 1242 class loop *innerloop = loop->inner;
f018d957 1243 edge entryedge;
fb85abff 1244
1245 /* Nested loop. We currently require that the loop is doubly-nested,
48e1416a 1246 contains a single inner loop, and the number of BBs is exactly 5.
fb85abff 1247 Vectorizable outer-loops look like this:
1248
1249 (pre-header)
1250 |
1251 header <---+
1252 | |
1253 inner-loop |
1254 | |
1255 tail ------+
48e1416a 1256 |
fb85abff 1257 (exit-bb)
1258
1259 The inner-loop has the properties expected of inner-most loops
1260 as described above. */
1261
1262 if ((loop->inner)->inner || (loop->inner)->next)
ed9370cc 1263 return opt_result::failure_at (vect_location,
1264 "not vectorized:"
1265 " multiple nested loops.\n");
fb85abff 1266
48e1416a 1267 if (loop->num_nodes != 5)
ed9370cc 1268 return opt_result::failure_at (vect_location,
1269 "not vectorized:"
1270 " control flow in loop.\n");
fb85abff 1271
3702cf13 1272 entryedge = loop_preheader_edge (innerloop);
fb85abff 1273 if (entryedge->src != loop->header
1274 || !single_exit (innerloop)
3702cf13 1275 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
ed9370cc 1276 return opt_result::failure_at (vect_location,
1277 "not vectorized:"
1278 " unsupported outerloop form.\n");
3702cf13 1279
1280 /* Analyze the inner-loop. */
d5e80d93 1281 tree inner_niterm1, inner_niter, inner_assumptions;
ed9370cc 1282 opt_result res
1283 = vect_analyze_loop_form_1 (loop->inner, inner_loop_cond,
1284 &inner_assumptions, &inner_niterm1,
1285 &inner_niter, NULL);
1286 if (!res)
3702cf13 1287 {
1288 if (dump_enabled_p ())
ed9370cc 1289 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3702cf13 1290 "not vectorized: Bad inner loop.\n");
ed9370cc 1291 return res;
3702cf13 1292 }
1293
ed9370cc 1294 /* Don't support analyzing niter under assumptions for the inner
1295 loop. */
1296 if (!integer_onep (inner_assumptions))
1297 return opt_result::failure_at (vect_location,
1298 "not vectorized: Bad inner loop.\n");
1299
3702cf13 1300 if (!expr_invariant_in_loop_p (loop, inner_niter))
ed9370cc 1301 return opt_result::failure_at (vect_location,
1302 "not vectorized: inner-loop count not"
1303 " invariant.\n");
fb85abff 1304
6d8fb6cf 1305 if (dump_enabled_p ())
7bd765d4 1306 dump_printf_loc (MSG_NOTE, vect_location,
78bb46f5 1307 "Considering outer-loop vectorization.\n");
fb85abff 1308 }
48e1416a 1309
ed9370cc 1310 if (!single_exit (loop))
1311 return opt_result::failure_at (vect_location,
1312 "not vectorized: multiple exits.\n");
1313 if (EDGE_COUNT (loop->header->preds) != 2)
1314 return opt_result::failure_at (vect_location,
1315 "not vectorized:"
1316 " too many incoming edges.\n");
fb85abff 1317
1318 /* We assume that the loop exit condition is at the end of the loop. i.e,
1319 that the loop is represented as a do-while (with a proper if-guard
1320 before the loop if needed), where the loop header contains all the
1321 executable statements, and the latch is empty. */
1322 if (!empty_block_p (loop->latch)
3c18ea71 1323 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
ed9370cc 1324 return opt_result::failure_at (vect_location,
1325 "not vectorized: latch block not empty.\n");
fb85abff 1326
19961a78 1327 /* Make sure the exit is not abnormal. */
1328 edge e = single_exit (loop);
1329 if (e->flags & EDGE_ABNORMAL)
ed9370cc 1330 return opt_result::failure_at (vect_location,
1331 "not vectorized:"
1332 " abnormal loop exit edge.\n");
fb85abff 1333
d5e80d93 1334 *loop_cond = vect_get_loop_niters (loop, assumptions, number_of_iterations,
3702cf13 1335 number_of_iterationsm1);
1336 if (!*loop_cond)
ed9370cc 1337 return opt_result::failure_at
1338 (vect_location,
1339 "not vectorized: complicated exit condition.\n");
48e1416a 1340
d5e80d93 1341 if (integer_zerop (*assumptions)
1342 || !*number_of_iterations
3702cf13 1343 || chrec_contains_undetermined (*number_of_iterations))
ed9370cc 1344 return opt_result::failure_at
1345 (*loop_cond,
1346 "not vectorized: number of iterations cannot be computed.\n");
fb85abff 1347
3702cf13 1348 if (integer_zerop (*number_of_iterations))
ed9370cc 1349 return opt_result::failure_at
1350 (*loop_cond,
1351 "not vectorized: number of iterations = 0.\n");
fb85abff 1352
ed9370cc 1353 return opt_result::success ();
3702cf13 1354}
1355
1356/* Analyze LOOP form and return a loop_vec_info if it is of suitable form. */
1357
ed9370cc 1358opt_loop_vec_info
2e966e2a 1359vect_analyze_loop_form (class loop *loop, vec_info_shared *shared)
3702cf13 1360{
d5e80d93 1361 tree assumptions, number_of_iterations, number_of_iterationsm1;
3702cf13 1362 gcond *loop_cond, *inner_loop_cond = NULL;
1363
ed9370cc 1364 opt_result res
1365 = vect_analyze_loop_form_1 (loop, &loop_cond,
1366 &assumptions, &number_of_iterationsm1,
1367 &number_of_iterations, &inner_loop_cond);
1368 if (!res)
1369 return opt_loop_vec_info::propagate_failure (res);
3702cf13 1370
a99aba41 1371 loop_vec_info loop_vinfo = new _loop_vec_info (loop, shared);
796f6cba 1372 LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
313a5120 1373 LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1374 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
d5e80d93 1375 if (!integer_onep (assumptions))
1376 {
 1377 /* We consider vectorizing this loop by versioning it under
1378 some assumptions. In order to do this, we need to clear
1379 existing information computed by scev and niter analyzer. */
1380 scev_reset_htab ();
46480a95 1381 free_numbers_of_iterations_estimates (loop);
d5e80d93 1382 /* Also set flag for this loop so that following scev and niter
1383 analysis are done under the assumptions. */
1384 loop_constraint_set (loop, LOOP_C_FINITE);
1385 /* Also record the assumptions for versioning. */
1386 LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = assumptions;
1387 }
313a5120 1388
1389 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
fb85abff 1390 {
6d8fb6cf 1391 if (dump_enabled_p ())
fb85abff 1392 {
7bd765d4 1393 dump_printf_loc (MSG_NOTE, vect_location,
1394 "Symbolic number of iterations is ");
1395 dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
78bb46f5 1396 dump_printf (MSG_NOTE, "\n");
fb85abff 1397 }
1398 }
fb85abff 1399
03c0d666 1400 stmt_vec_info loop_cond_info = loop_vinfo->lookup_stmt (loop_cond);
1401 STMT_VINFO_TYPE (loop_cond_info) = loop_exit_ctrl_vec_info_type;
3702cf13 1402 if (inner_loop_cond)
03c0d666 1403 {
1404 stmt_vec_info inner_loop_cond_info
1405 = loop_vinfo->lookup_stmt (inner_loop_cond);
1406 STMT_VINFO_TYPE (inner_loop_cond_info) = loop_exit_ctrl_vec_info_type;
1407 }
fb85abff 1408
1409 gcc_assert (!loop->aux);
1410 loop->aux = loop_vinfo;
ed9370cc 1411 return opt_loop_vec_info::success (loop_vinfo);
fb85abff 1412}
1413
3702cf13 1414
1415
5cb834f3 1416/* Scan the loop stmts and, depending on whether there are any (non-)SLP
 1417 statements, update the vectorization factor. */
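/* For illustration: if loop-based analysis chose a vectorization factor of
   4 but the SLP instances need an unrolling factor of 8, the two are
   combined with force_common_multiple below, giving an updated factor
   of 8.  */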
1418
1419static void
1420vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1421{
2e966e2a 1422 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5cb834f3 1423 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1424 int nbbs = loop->num_nodes;
d75596cd 1425 poly_uint64 vectorization_factor;
5cb834f3 1426 int i;
1427
88f6eb8f 1428 DUMP_VECT_SCOPE ("vect_update_vf_for_slp");
5cb834f3 1429
1430 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
d75596cd 1431 gcc_assert (known_ne (vectorization_factor, 0U));
5cb834f3 1432
1433 /* If all the stmts in the loop can be SLPed, we perform only SLP, and
 1434 the vectorization factor of the loop is the unrolling factor required by
 1435 the SLP instances. If that unrolling factor is 1, we say that we
 1436 perform pure SLP on the loop - cross-iteration parallelism is not
1437 exploited. */
1438 bool only_slp_in_loop = true;
1439 for (i = 0; i < nbbs; i++)
1440 {
1441 basic_block bb = bbs[i];
1442 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1443 gsi_next (&si))
1444 {
03c0d666 1445 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
0b7ea3a9 1446 stmt_info = vect_stmt_to_vectorize (stmt_info);
5cb834f3 1447 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1448 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1449 && !PURE_SLP_STMT (stmt_info))
1450 /* STMT needs both SLP and loop-based vectorization. */
1451 only_slp_in_loop = false;
1452 }
1453 }
1454
1455 if (only_slp_in_loop)
5cc7beaa 1456 {
91f42adc 1457 if (dump_enabled_p ())
1458 dump_printf_loc (MSG_NOTE, vect_location,
1459 "Loop contains only SLP stmts\n");
5cc7beaa 1460 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1461 }
5cb834f3 1462 else
5cc7beaa 1463 {
91f42adc 1464 if (dump_enabled_p ())
1465 dump_printf_loc (MSG_NOTE, vect_location,
1466 "Loop contains SLP and non-SLP stmts\n");
d75596cd 1467 /* Both the vectorization factor and unroll factor have the form
1468 current_vector_size * X for some rational X, so they must have
1469 a common multiple. */
5cc7beaa 1470 vectorization_factor
d75596cd 1471 = force_common_multiple (vectorization_factor,
5cc7beaa 1472 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1473 }
5cb834f3 1474
1475 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1476 if (dump_enabled_p ())
d75596cd 1477 {
1478 dump_printf_loc (MSG_NOTE, vect_location,
1479 "Updating vectorization factor to ");
1480 dump_dec (MSG_NOTE, vectorization_factor);
1481 dump_printf (MSG_NOTE, ".\n");
1482 }
5cb834f3 1483}
f083cd24 1484
1ae144e9 1485/* Return true if STMT_INFO describes a double reduction phi and if
1486 the other phi in the reduction is also relevant for vectorization.
1487 This rejects cases such as:
1488
1489 outer1:
1490 x_1 = PHI <x_3(outer2), ...>;
1491 ...
1492
1493 inner:
1494 x_2 = ...;
1495 ...
1496
1497 outer2:
1498 x_3 = PHI <x_2(inner)>;
1499
1500 if nothing in x_2 or elsewhere makes x_1 relevant. */
1501
1502static bool
1503vect_active_double_reduction_p (stmt_vec_info stmt_info)
1504{
1505 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
1506 return false;
1507
04eefad5 1508 return STMT_VINFO_RELEVANT_P (STMT_VINFO_REDUC_DEF (stmt_info));
1ae144e9 1509}
1510
f083cd24 1511/* Function vect_analyze_loop_operations.
1512
1513 Scan the loop stmts and make sure they are all vectorizable. */
1514
ed9370cc 1515static opt_result
5cb834f3 1516vect_analyze_loop_operations (loop_vec_info loop_vinfo)
f083cd24 1517{
2e966e2a 1518 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
f083cd24 1519 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1520 int nbbs = loop->num_nodes;
f083cd24 1521 int i;
f083cd24 1522 stmt_vec_info stmt_info;
1523 bool need_to_vectorize = false;
5cb834f3 1524 bool ok;
f083cd24 1525
88f6eb8f 1526 DUMP_VECT_SCOPE ("vect_analyze_loop_operations");
f083cd24 1527
1425cbaa 1528 auto_vec<stmt_info_for_cost> cost_vec;
c863e35b 1529
f083cd24 1530 for (i = 0; i < nbbs; i++)
1531 {
1532 basic_block bb = bbs[i];
1533
1a91d914 1534 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1535 gsi_next (&si))
f083cd24 1536 {
1a91d914 1537 gphi *phi = si.phi ();
f083cd24 1538 ok = true;
1539
03c0d666 1540 stmt_info = loop_vinfo->lookup_stmt (phi);
6d8fb6cf 1541 if (dump_enabled_p ())
a4e972e3 1542 dump_printf_loc (MSG_NOTE, vect_location, "examining phi: %G", phi);
8f0567ca 1543 if (virtual_operand_p (gimple_phi_result (phi)))
1544 continue;
f083cd24 1545
8bdf488e 1546 /* Inner-loop loop-closed exit phi in outer-loop vectorization
1547 (i.e., a phi in the tail of the outer-loop). */
f083cd24 1548 if (! is_loop_header_bb_p (bb))
1549 {
8bdf488e 1550 /* FORNOW: we currently don't support the case that these phis
7aa0d350 1551 are not used in the outerloop (unless it is double reduction,
48e1416a 1552 i.e., this phi is vect_reduction_def), cause this case
7aa0d350 1553 requires to actually do something here. */
d2a7c9b9 1554 if (STMT_VINFO_LIVE_P (stmt_info)
1ae144e9 1555 && !vect_active_double_reduction_p (stmt_info))
ed9370cc 1556 return opt_result::failure_at (phi,
1557 "Unsupported loop-closed phi"
1558 " in outer-loop.\n");
8bdf488e 1559
1560 /* If PHI is used in the outer loop, we check that its operand
1561 is defined in the inner loop. */
1562 if (STMT_VINFO_RELEVANT_P (stmt_info))
1563 {
1564 tree phi_op;
8bdf488e 1565
1566 if (gimple_phi_num_args (phi) != 1)
ed9370cc 1567 return opt_result::failure_at (phi, "unsupported phi");
8bdf488e 1568
1569 phi_op = PHI_ARG_DEF (phi, 0);
9cfd4e76 1570 stmt_vec_info op_def_info = loop_vinfo->lookup_def (phi_op);
1571 if (!op_def_info)
ed9370cc 1572 return opt_result::failure_at (phi, "unsupported phi");
8bdf488e 1573
9cfd4e76 1574 if (STMT_VINFO_RELEVANT (op_def_info) != vect_used_in_outer
1575 && (STMT_VINFO_RELEVANT (op_def_info)
1576 != vect_used_in_outer_by_reduction))
ed9370cc 1577 return opt_result::failure_at (phi, "unsupported phi");
8bdf488e 1578 }
1579
f083cd24 1580 continue;
1581 }
1582
1583 gcc_assert (stmt_info);
1584
6f710392 1585 if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1586 || STMT_VINFO_LIVE_P (stmt_info))
f083cd24 1587 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
ed9370cc 1588 /* A scalar-dependence cycle that we don't support. */
1589 return opt_result::failure_at (phi,
1590 "not vectorized:"
1591 " scalar dependence cycle.\n");
f083cd24 1592
1593 if (STMT_VINFO_RELEVANT_P (stmt_info))
1594 {
1595 need_to_vectorize = true;
5cc7beaa 1596 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
1597 && ! PURE_SLP_STMT (stmt_info))
a73182ff 1598 ok = vectorizable_induction (stmt_info, NULL, NULL, NULL,
1599 &cost_vec);
44b24fa0 1600 else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1601 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
1602 && ! PURE_SLP_STMT (stmt_info))
a73182ff 1603 ok = vectorizable_reduction (stmt_info, NULL, NULL, NULL, NULL,
c863e35b 1604 &cost_vec);
f083cd24 1605 }
1606
7feaafa2 1607 /* SLP PHIs are tested by vect_slp_analyze_node_operations. */
1608 if (ok
1609 && STMT_VINFO_LIVE_P (stmt_info)
1610 && !PURE_SLP_STMT (stmt_info))
a73182ff 1611 ok = vectorizable_live_operation (stmt_info, NULL, NULL, -1, NULL,
c863e35b 1612 &cost_vec);
6f710392 1613
f083cd24 1614 if (!ok)
ed9370cc 1615 return opt_result::failure_at (phi,
1616 "not vectorized: relevant phi not "
1617 "supported: %G",
1618 static_cast <gimple *> (phi));
f083cd24 1619 }
1620
1a91d914 1621 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1622 gsi_next (&si))
f083cd24 1623 {
42acab1c 1624 gimple *stmt = gsi_stmt (si);
ed9370cc 1625 if (!gimple_clobber_p (stmt))
1626 {
1627 opt_result res
1628 = vect_analyze_stmt (loop_vinfo->lookup_stmt (stmt),
819b1150 1629 &need_to_vectorize,
ed9370cc 1630 NULL, NULL, &cost_vec);
1631 if (!res)
1632 return res;
1633 }
48e1416a 1634 }
f083cd24 1635 } /* bbs */
1636
c863e35b 1637 add_stmt_costs (loop_vinfo->target_cost_data, &cost_vec);
c863e35b 1638
f083cd24 1639  /* All operations in the loop are either irrelevant (they deal with loop
 1640     control, or are dead), or only used outside the loop and can be moved
 1641     out of the loop (e.g. invariants, inductions).  The loop can be
 1642     optimized away by scalar optimizations.  We're better off not
 1643     touching this loop.  */
1644 if (!need_to_vectorize)
1645 {
6d8fb6cf 1646 if (dump_enabled_p ())
7bd765d4 1647 dump_printf_loc (MSG_NOTE, vect_location,
78bb46f5 1648 "All the computation can be taken out of the loop.\n");
ed9370cc 1649 return opt_result::failure_at
1650 (vect_location,
1651 "not vectorized: redundant loop. no profit to vectorize.\n");
f083cd24 1652 }
1653
ed9370cc 1654 return opt_result::success ();
f083cd24 1655}
1656
2ea449b3 1657/* Analyze the cost of the loop described by LOOP_VINFO. Decide if it
1658 is worthwhile to vectorize. Return 1 if definitely yes, 0 if
1659 definitely no, or -1 if it's worth retrying. */
1660
1661static int
1662vect_analyze_loop_costing (loop_vec_info loop_vinfo)
1663{
2e966e2a 1664 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2ea449b3 1665 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1666
1667 /* Only fully-masked loops can have iteration counts less than the
1668 vectorization factor. */
1669 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
1670 {
1671 HOST_WIDE_INT max_niter;
1672
1673 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1674 max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
1675 else
1676 max_niter = max_stmt_executions_int (loop);
1677
1678 if (max_niter != -1
1679 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
1680 {
1681 if (dump_enabled_p ())
1682 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1683 "not vectorized: iteration count smaller than "
1684 "vectorization factor.\n");
1685 return 0;
1686 }
1687 }
1688
1689 int min_profitable_iters, min_profitable_estimate;
1690 vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1691 &min_profitable_estimate);
1692
1693 if (min_profitable_iters < 0)
1694 {
1695 if (dump_enabled_p ())
1696 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1697 "not vectorized: vectorization not profitable.\n");
1698 if (dump_enabled_p ())
1699 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1700 "not vectorized: vector version will never be "
1701 "profitable.\n");
1702 return -1;
1703 }
1704
1705 int min_scalar_loop_bound = (PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
1706 * assumed_vf);
1707
 1708  /* Use the cost model only if it is more conservative than the
 1709     user-specified threshold.  */
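  /* A worked illustration (a sketch; the numbers are hypothetical, not
     defaults): with --param min-vect-loop-bound=2 and an assumed VF of 4,
     min_scalar_loop_bound is 2 * 4 = 8; if the target cost model reported
     min_profitable_iters of 10, the threshold below becomes MAX (8, 10) = 10.  */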
1710 unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
1711 min_profitable_iters);
1712
1713 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
1714
1715 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1716 && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
1717 {
1718 if (dump_enabled_p ())
1719 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1720 "not vectorized: vectorization not profitable.\n");
1721 if (dump_enabled_p ())
1722 dump_printf_loc (MSG_NOTE, vect_location,
1723 "not vectorized: iteration count smaller than user "
1724 "specified loop bound parameter or minimum profitable "
1725 "iterations (whichever is more conservative).\n");
1726 return 0;
1727 }
1728
1729 HOST_WIDE_INT estimated_niter = estimated_stmt_executions_int (loop);
1730 if (estimated_niter == -1)
1731 estimated_niter = likely_max_stmt_executions_int (loop);
1732 if (estimated_niter != -1
1733 && ((unsigned HOST_WIDE_INT) estimated_niter
1734 < MAX (th, (unsigned) min_profitable_estimate)))
1735 {
1736 if (dump_enabled_p ())
1737 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1738 "not vectorized: estimated iteration count too "
1739 "small.\n");
1740 if (dump_enabled_p ())
1741 dump_printf_loc (MSG_NOTE, vect_location,
1742 "not vectorized: estimated iteration count smaller "
1743 "than specified loop bound parameter or minimum "
1744 "profitable iterations (whichever is more "
1745 "conservative).\n");
1746 return -1;
1747 }
1748
1749 return 1;
1750}
1751
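/* Gather the data references in LOOP, whose body is given by the basic
   blocks BBS, into *DATAREFS and count the scanned statements in *N_STMTS.
   Fail if a statement contains a data reference that cannot be analyzed,
   unless it is a call in a loop with a safelen clause that can still be
   handled via simd clones, and fail fatally if the number of data
   references exceeds the loop-max-datarefs-for-datadeps limit.  */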
ed9370cc 1752static opt_result
ed9d8730 1753vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
1754 vec<data_reference_p> *datarefs,
1755 unsigned int *n_stmts)
fb85abff 1756{
ed9d8730 1757 *n_stmts = 0;
0a08c1bc 1758 for (unsigned i = 0; i < loop->num_nodes; i++)
1759 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
1760 !gsi_end_p (gsi); gsi_next (&gsi))
1761 {
1762 gimple *stmt = gsi_stmt (gsi);
1763 if (is_gimple_debug (stmt))
1764 continue;
ed9d8730 1765 ++(*n_stmts);
ed9370cc 1766 opt_result res = vect_find_stmt_data_reference (loop, stmt, datarefs);
1767 if (!res)
0a08c1bc 1768 {
1769 if (is_gimple_call (stmt) && loop->safelen)
1770 {
1771 tree fndecl = gimple_call_fndecl (stmt), op;
1772 if (fndecl != NULL_TREE)
1773 {
1774 cgraph_node *node = cgraph_node::get (fndecl);
1775 if (node != NULL && node->simd_clones != NULL)
1776 {
1777 unsigned int j, n = gimple_call_num_args (stmt);
1778 for (j = 0; j < n; j++)
1779 {
1780 op = gimple_call_arg (stmt, j);
1781 if (DECL_P (op)
1782 || (REFERENCE_CLASS_P (op)
1783 && get_base_address (op)))
1784 break;
1785 }
1786 op = gimple_call_lhs (stmt);
1787 /* Ignore #pragma omp declare simd functions
1788 if they don't have data references in the
1789 call stmt itself. */
1790 if (j == n
1791 && !(op
1792 && (DECL_P (op)
1793 || (REFERENCE_CLASS_P (op)
1794 && get_base_address (op)))))
1795 continue;
1796 }
1797 }
1798 }
ed9370cc 1799 return res;
0a08c1bc 1800 }
03ad9f74 1801 /* If dependence analysis will give up due to the limit on the
1802 number of datarefs stop here and fail fatally. */
1803 if (datarefs->length ()
1804 > (unsigned)PARAM_VALUE (PARAM_LOOP_MAX_DATAREFS_FOR_DATADEPS))
ed9370cc 1805 return opt_result::failure_at (stmt, "exceeded param "
1806 "loop-max-datarefs-for-datadeps\n");
0a08c1bc 1807 }
ed9370cc 1808 return opt_result::success ();
ed9d8730 1809}
1810
f92474f8 1811/* Look for SLP-only access groups and turn each individual access into its own
1812 group. */
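/* For instance (an illustrative sketch): a grouped access of size 4 that
   was marked as usable only under SLP is split into four singleton groups,
   each becoming its own DR_GROUP_FIRST_ELEMENT with DR_GROUP_SIZE 1 and
   DR_GROUP_GAP 3, so that loop vectorization can handle the accesses
   individually.  */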
1813static void
1814vect_dissolve_slp_only_groups (loop_vec_info loop_vinfo)
1815{
1816 unsigned int i;
1817 struct data_reference *dr;
1818
1819 DUMP_VECT_SCOPE ("vect_dissolve_slp_only_groups");
1820
1821 vec<data_reference_p> datarefs = loop_vinfo->shared->datarefs;
1822 FOR_EACH_VEC_ELT (datarefs, i, dr)
1823 {
1824 gcc_assert (DR_REF (dr));
1825 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (DR_STMT (dr));
1826
1827 /* Check if the load is a part of an interleaving chain. */
1828 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1829 {
1830 stmt_vec_info first_element = DR_GROUP_FIRST_ELEMENT (stmt_info);
1831 unsigned int group_size = DR_GROUP_SIZE (first_element);
1832
 1833	  /* Check if this is an SLP-only group.  */
1834 if (!STMT_SLP_TYPE (stmt_info)
1835 && STMT_VINFO_SLP_VECT_ONLY (first_element))
1836 {
1837 /* Dissolve the group. */
1838 STMT_VINFO_SLP_VECT_ONLY (first_element) = false;
1839
1840 stmt_vec_info vinfo = first_element;
1841 while (vinfo)
1842 {
1843 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (vinfo);
1844 DR_GROUP_FIRST_ELEMENT (vinfo) = vinfo;
1845 DR_GROUP_NEXT_ELEMENT (vinfo) = NULL;
1846 DR_GROUP_SIZE (vinfo) = 1;
1847 DR_GROUP_GAP (vinfo) = group_size - 1;
1848 vinfo = next;
1849 }
1850 }
1851 }
1852 }
1853}
1854
ed9d8730 1855/* Function vect_analyze_loop_2.
1856
1857 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1858 for it. The different analyses will record information in the
1859 loop_vec_info struct. */
ed9370cc 1860static opt_result
a99aba41 1861vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal, unsigned *n_stmts)
ed9d8730 1862{
ed9370cc 1863 opt_result ok = opt_result::success ();
ed9d8730 1864 int res;
1865 unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
1866 poly_uint64 min_vf = 2;
1867
1868 /* The first group of checks is independent of the vector size. */
1869 fatal = true;
1870
1d86b8dc 1871 if (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)
1872 && integer_zerop (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)))
1873 return opt_result::failure_at (vect_location,
1874 "not vectorized: simd if(0)\n");
1875
ed9d8730 1876 /* Find all data references in the loop (which correspond to vdefs/vuses)
1877 and analyze their evolution in the loop. */
1878
1879 loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
ed9d8730 1880
1881 /* Gather the data references and count stmts in the loop. */
a99aba41 1882 if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ())
ed9d8730 1883 {
ed9370cc 1884 opt_result res
1885 = vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo),
1886 &LOOP_VINFO_DATAREFS (loop_vinfo),
1887 n_stmts);
1888 if (!res)
a99aba41 1889 {
1890 if (dump_enabled_p ())
1891 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1892 "not vectorized: loop contains function "
1893 "calls or data references that cannot "
1894 "be analyzed\n");
ed9370cc 1895 return res;
a99aba41 1896 }
1897 loop_vinfo->shared->save_datarefs ();
ed9d8730 1898 }
a99aba41 1899 else
1900 loop_vinfo->shared->check_datarefs ();
0a08c1bc 1901
1902 /* Analyze the data references and also adjust the minimal
1903 vectorization factor according to the loads and stores. */
fb85abff 1904
2403338f 1905 ok = vect_analyze_data_refs (loop_vinfo, &min_vf, &fatal);
fb85abff 1906 if (!ok)
1907 {
6d8fb6cf 1908 if (dump_enabled_p ())
7bd765d4 1909 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
78bb46f5 1910 "bad data references.\n");
ed9370cc 1911 return ok;
fb85abff 1912 }
1913
bac0b1e7 1914 /* Classify all cross-iteration scalar data-flow cycles.
1915 Cross-iteration cycles caused by virtual phis are analyzed separately. */
bac0b1e7 1916 vect_analyze_scalar_cycles (loop_vinfo);
1917
e2c5c678 1918 vect_pattern_recog (loop_vinfo);
bac0b1e7 1919
34563054 1920 vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
1921
68f15e9d 1922 /* Analyze the access patterns of the data-refs in the loop (consecutive,
1923 complex, etc.). FORNOW: Only handle consecutive access pattern. */
1924
e2c5c678 1925 ok = vect_analyze_data_ref_accesses (loop_vinfo);
68f15e9d 1926 if (!ok)
1927 {
1928 if (dump_enabled_p ())
1929 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
78bb46f5 1930 "bad data access.\n");
ed9370cc 1931 return ok;
68f15e9d 1932 }
1933
fb85abff 1934 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
1935
2403338f 1936 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo, &fatal);
fb85abff 1937 if (!ok)
1938 {
6d8fb6cf 1939 if (dump_enabled_p ())
7bd765d4 1940 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
78bb46f5 1941 "unexpected pattern.\n");
ed9370cc 1942 return ok;
fb85abff 1943 }
1944
37cf30c5 1945  /* The rest of the analysis below depends on the vector size in some way; failures from this point on are not fatal.  */
1946 fatal = false;
1947
91a74fc6 1948 /* Analyze data dependences between the data-refs in the loop
1949 and adjust the maximum vectorization factor according to
1950 the dependences.
1951 FORNOW: fail at the first data dependence that we encounter. */
fb85abff 1952
68f15e9d 1953 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
ed9370cc 1954 if (!ok)
fb85abff 1955 {
6d8fb6cf 1956 if (dump_enabled_p ())
ed9370cc 1957 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1958 "bad data dependence.\n");
1959 return ok;
fb85abff 1960 }
ed9370cc 1961 if (max_vf != MAX_VECTORIZATION_FACTOR
1962 && maybe_lt (max_vf, min_vf))
1963 return opt_result::failure_at (vect_location, "bad data dependence.\n");
4a85c0b1 1964 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
fb85abff 1965
1966 ok = vect_determine_vectorization_factor (loop_vinfo);
1967 if (!ok)
1968 {
6d8fb6cf 1969 if (dump_enabled_p ())
7bd765d4 1970 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
78bb46f5 1971 "can't determine vectorization factor.\n");
ed9370cc 1972 return ok;
fb85abff 1973 }
d75596cd 1974 if (max_vf != MAX_VECTORIZATION_FACTOR
1975 && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
ed9370cc 1976 return opt_result::failure_at (vect_location, "bad data dependence.\n");
fb85abff 1977
dcf53ad6 1978 /* Compute the scalar iteration cost. */
1979 vect_compute_single_scalar_iteration_cost (loop_vinfo);
1980
d75596cd 1981 poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
dcf53ad6 1982 unsigned th;
dcf53ad6 1983
c1bee668 1984 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */
a99aba41 1985 ok = vect_analyze_slp (loop_vinfo, *n_stmts);
c1bee668 1986 if (!ok)
ed9370cc 1987 return ok;
c1bee668 1988
1989 /* If there are any SLP instances mark them as pure_slp. */
1990 bool slp = vect_make_slp_decision (loop_vinfo);
1991 if (slp)
1992 {
1993 /* Find stmts that need to be both vectorized and SLPed. */
1994 vect_detect_hybrid_slp (loop_vinfo);
1995
1996 /* Update the vectorization factor based on the SLP decision. */
1997 vect_update_vf_for_slp (loop_vinfo);
1998 }
1999
60b29a7e 2000 bool saved_can_fully_mask_p = LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo);
2001
2002 /* We don't expect to have to roll back to anything other than an empty
2003 set of rgroups. */
2004 gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
2005
dcf53ad6 2006 /* This is the point where we can re-start analysis with SLP forced off. */
2007start_over:
2008
bbd820dd 2009 /* Now the vectorization factor is final. */
d75596cd 2010 poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2011 gcc_assert (known_ne (vectorization_factor, 0U));
bbd820dd 2012
2013 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
d75596cd 2014 {
2015 dump_printf_loc (MSG_NOTE, vect_location,
2016 "vectorization_factor = ");
2017 dump_dec (MSG_NOTE, vectorization_factor);
bffe1cb4 2018 dump_printf (MSG_NOTE, ", niters = %wd\n",
d75596cd 2019 LOOP_VINFO_INT_NITERS (loop_vinfo));
2020 }
bbd820dd 2021
2022 HOST_WIDE_INT max_niter
a05d13ea 2023 = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
bbd820dd 2024
91a74fc6 2025 /* Analyze the alignment of the data-refs in the loop.
2026 Fail if a data reference is found that cannot be vectorized. */
fb85abff 2027
e2c5c678 2028 ok = vect_analyze_data_refs_alignment (loop_vinfo);
fb85abff 2029 if (!ok)
2030 {
6d8fb6cf 2031 if (dump_enabled_p ())
7bd765d4 2032 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
78bb46f5 2033 "bad data alignment.\n");
ed9370cc 2034 return ok;
fb85abff 2035 }
2036
fb85abff 2037 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2038 It is important to call pruning after vect_analyze_data_ref_accesses,
2039 since we use grouping information gathered by interleaving analysis. */
2040 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2041 if (!ok)
ed9370cc 2042 return ok;
fb85abff 2043
2d974ea1 2044 /* Do not invoke vect_enhance_data_refs_alignment for epilogue
2045 vectorization, since we do not want to add extra peeling or
2046 add versioning for alignment. */
5b631e09 2047 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
5b631e09 2048 /* This pass will decide on using loop versioning and/or loop peeling in
2049 order to enhance the alignment of data references in the loop. */
2050 ok = vect_enhance_data_refs_alignment (loop_vinfo);
2d974ea1 2051 else
2052 ok = vect_verify_datarefs_alignment (loop_vinfo);
2053 if (!ok)
ed9370cc 2054 return ok;
fb85abff 2055
c1bee668 2056 if (slp)
0822b158 2057 {
c1bee668 2058 /* Analyze operations in the SLP instances. Note this may
2059 remove unsupported SLP instances which makes the above
2060 SLP kind detection invalid. */
2061 unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
1c57101b 2062 vect_slp_analyze_operations (loop_vinfo);
c1bee668 2063 if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
ed9370cc 2064 {
2065 ok = opt_result::failure_at (vect_location,
2066 "unsupported SLP instances\n");
2067 goto again;
2068 }
0822b158 2069 }
2070
f92474f8 2071 /* Dissolve SLP-only groups. */
2072 vect_dissolve_slp_only_groups (loop_vinfo);
2073
5cb834f3 2074 /* Scan all the remaining operations in the loop that are not subject
2075 to SLP and make sure they are vectorizable. */
2076 ok = vect_analyze_loop_operations (loop_vinfo);
fb85abff 2077 if (!ok)
2078 {
6d8fb6cf 2079 if (dump_enabled_p ())
7bd765d4 2080 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
78bb46f5 2081 "bad operation or unsupported loop bound.\n");
ed9370cc 2082 return ok;
c4740c5d 2083 }
2084
60b29a7e 2085 /* Decide whether to use a fully-masked loop for this vectorization
2086 factor. */
2087 LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
2088 = (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo)
2089 && vect_verify_full_masking (loop_vinfo));
2090 if (dump_enabled_p ())
2091 {
2092 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2093 dump_printf_loc (MSG_NOTE, vect_location,
2094 "using a fully-masked loop.\n");
2095 else
2096 dump_printf_loc (MSG_NOTE, vect_location,
2097 "not using a fully-masked loop.\n");
2098 }
2099
73e363e1 2100  /* If an epilogue loop is required because of data accesses with gaps,
 2101     one additional iteration needs to be peeled.  Check if there are
 2102     enough iterations for vectorization.  */
2103 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
60b29a7e 2104 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2105 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
73e363e1 2106 {
d75596cd 2107 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
73e363e1 2108 tree scalar_niters = LOOP_VINFO_NITERSM1 (loop_vinfo);
2109
d75596cd 2110 if (known_lt (wi::to_widest (scalar_niters), vf))
ed9370cc 2111 return opt_result::failure_at (vect_location,
2112 "loop has no enough iterations to"
2113 " support peeling for gaps.\n");
73e363e1 2114 }
2115
2ea449b3 2116  /* Check that the cost of the loop makes vectorizing worthwhile.  */
2117 res = vect_analyze_loop_costing (loop_vinfo);
2118 if (res < 0)
bbd820dd 2119 {
ed9370cc 2120 ok = opt_result::failure_at (vect_location,
2121 "Loop costings may not be worthwhile.\n");
2122 goto again;
bbd820dd 2123 }
ed9370cc 2124 if (!res)
2125 return opt_result::failure_at (vect_location,
2126 "Loop costings not worthwhile.\n");
bbd820dd 2127
313a5120 2128 /* Decide whether we need to create an epilogue loop to handle
2129 remaining scalar iterations. */
d75596cd 2130 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
004a94a5 2131
d75596cd 2132 unsigned HOST_WIDE_INT const_vf;
60b29a7e 2133 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2134 /* The main loop handles all iterations. */
2135 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2136 else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
c333203a 2137 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
313a5120 2138 {
c333203a 2139 /* Work out the (constant) number of iterations that need to be
2140 peeled for reasons other than niters. */
2141 unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2142 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2143 peel_niter += 1;
2144 if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
d75596cd 2145 LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
313a5120 2146 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2147 }
2148 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
c333203a 2149 /* ??? When peeling for gaps but not alignment, we could
2150 try to check whether the (variable) niters is known to be
2151 VF * N + 1. That's something of a niche case though. */
2152 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
d75596cd 2153 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
2154 || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
2155 < (unsigned) exact_log2 (const_vf))
2156 /* In case of versioning, check if the maximum number of
2157 iterations is greater than th. If they are identical,
2158 the epilogue is unnecessary. */
d5e80d93 2159 && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
d75596cd 2160 || ((unsigned HOST_WIDE_INT) max_niter
2161 > (th / const_vf) * const_vf))))
313a5120 2162 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2163
2164 /* If an epilogue loop is required make sure we can create one. */
2165 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2166 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2167 {
2168 if (dump_enabled_p ())
2169 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2170 if (!vect_can_advance_ivs_p (loop_vinfo)
2171 || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
2172 single_exit (LOOP_VINFO_LOOP
2173 (loop_vinfo))))
2174 {
ed9370cc 2175 ok = opt_result::failure_at (vect_location,
2176 "not vectorized: can't create required "
2177 "epilog loop\n");
dcf53ad6 2178 goto again;
313a5120 2179 }
2180 }
2181
32236f80 2182  /* During peeling, we need to check if the number of loop iterations is
 2183     enough for both the peeled prolog loop and the vector loop.  This check
 2184     can be merged with the threshold check of loop versioning, so
 2185     increase the threshold for this case if necessary.  */
7456a7ea 2186 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
32236f80 2187 {
6753a4bf 2188 poly_uint64 niters_th = 0;
32236f80 2189
6753a4bf 2190 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
32236f80 2191 {
6753a4bf 2192 /* Niters for peeled prolog loop. */
2193 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2194 {
ec5bf0fb 2195 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
abc9513d 2196 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
6753a4bf 2197 niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
2198 }
2199 else
2200 niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
32236f80 2201 }
32236f80 2202
2203 /* Niters for at least one iteration of vectorized loop. */
60b29a7e 2204 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2205 niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
32236f80 2206 /* One additional iteration because of peeling for gap. */
ba12948e 2207 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
7456a7ea 2208 niters_th += 1;
2209 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
32236f80 2210 }
2211
d75596cd 2212 gcc_assert (known_eq (vectorization_factor,
2213 LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
bbd820dd 2214
dcf53ad6 2215 /* Ok to vectorize! */
ed9370cc 2216 return opt_result::success ();
dcf53ad6 2217
2218again:
ed9370cc 2219 /* Ensure that "ok" is false (with an opt_problem if dumping is enabled). */
2220 gcc_assert (!ok);
2221
dcf53ad6 2222  /* Try again with SLP forced off, but if we didn't do any SLP there is
 2223     no point in re-trying.  */
2224 if (!slp)
ed9370cc 2225 return ok;
dcf53ad6 2226
93bfa1f9 2227 /* If there are reduction chains re-trying will fail anyway. */
2228 if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
ed9370cc 2229 return ok;
93bfa1f9 2230
dcf53ad6 2231 /* Likewise if the grouped loads or stores in the SLP cannot be handled
93bfa1f9 2232 via interleaving or lane instructions. */
dcf53ad6 2233 slp_instance instance;
2234 slp_tree node;
2235 unsigned i, j;
2236 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
2237 {
2238 stmt_vec_info vinfo;
06bb64b8 2239 vinfo = SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0];
dcf53ad6 2240 if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
93bfa1f9 2241 continue;
cd24aa3c 2242 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
e1009321 2243 unsigned int size = DR_GROUP_SIZE (vinfo);
dcf53ad6 2244 tree vectype = STMT_VINFO_VECTYPE (vinfo);
2dd8e84c 2245 if (! vect_store_lanes_supported (vectype, size, false)
49b29564 2246 && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
2247 && ! vect_grouped_store_supported (vectype, size))
ed9370cc 2248 return opt_result::failure_at (vinfo->stmt,
2249 "unsupported grouped store\n");
dcf53ad6 2250 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
2251 {
06bb64b8 2252 vinfo = SLP_TREE_SCALAR_STMTS (node)[0];
cd24aa3c 2253 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
e1009321 2254 bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
2255 size = DR_GROUP_SIZE (vinfo);
dcf53ad6 2256 vectype = STMT_VINFO_VECTYPE (vinfo);
2dd8e84c 2257 if (! vect_load_lanes_supported (vectype, size, false)
bc691ae4 2258 && ! vect_grouped_load_supported (vectype, single_element_p,
2259 size))
ed9370cc 2260 return opt_result::failure_at (vinfo->stmt,
2261 "unsupported grouped load\n");
dcf53ad6 2262 }
2263 }
2264
2265 if (dump_enabled_p ())
2266 dump_printf_loc (MSG_NOTE, vect_location,
2267 "re-trying with SLP disabled\n");
2268
2269 /* Roll back state appropriately. No SLP this time. */
2270 slp = false;
 2271  /* Restore the vectorization factor as it was without SLP.  */
2272 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
2273 /* Free the SLP instances. */
2274 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
2068679d 2275 vect_free_slp_instance (instance, false);
dcf53ad6 2276 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
2277 /* Reset SLP type to loop_vect on all stmts. */
2278 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2279 {
2280 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
5cc7beaa 2281 for (gimple_stmt_iterator si = gsi_start_phis (bb);
2282 !gsi_end_p (si); gsi_next (&si))
2283 {
03c0d666 2284 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
5cc7beaa 2285 STMT_SLP_TYPE (stmt_info) = loop_vect;
2286 }
dcf53ad6 2287 for (gimple_stmt_iterator si = gsi_start_bb (bb);
2288 !gsi_end_p (si); gsi_next (&si))
2289 {
03c0d666 2290 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
7819730f 2291 STMT_SLP_TYPE (stmt_info) = loop_vect;
dcf53ad6 2292 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2293 {
da611310 2294 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
aebdbd31 2295 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
7819730f 2296 STMT_SLP_TYPE (stmt_info) = loop_vect;
da611310 2297 for (gimple_stmt_iterator pi = gsi_start (pattern_def_seq);
eec2f307 2298 !gsi_end_p (pi); gsi_next (&pi))
03c0d666 2299 STMT_SLP_TYPE (loop_vinfo->lookup_stmt (gsi_stmt (pi)))
2300 = loop_vect;
dcf53ad6 2301 }
dcf53ad6 2302 }
2303 }
2304 /* Free optimized alias test DDRS. */
e85b4a5e 2305 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
dcf53ad6 2306 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
f68a7726 2307 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
dcf53ad6 2308 /* Reset target cost data. */
2309 destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
2310 LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
2311 = init_cost (LOOP_VINFO_LOOP (loop_vinfo));
60b29a7e 2312 /* Reset accumulated rgroup information. */
2313 release_vec_loop_masks (&LOOP_VINFO_MASKS (loop_vinfo));
dcf53ad6 2314 /* Reset assorted flags. */
2315 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2fed77be 2316 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
dcf53ad6 2317 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
7456a7ea 2318 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
60b29a7e 2319 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = saved_can_fully_mask_p;
dcf53ad6 2320
2321 goto start_over;
c4740c5d 2322}
2323
2324/* Function vect_analyze_loop.
2325
2326 Apply a set of analyses on LOOP, and create a loop_vec_info struct
2327 for it. The different analyses will record information in the
5b631e09 2328   loop_vec_info struct.  If ORIG_LOOP_VINFO is not NULL, the epilogue must
2329 be vectorized. */
ed9370cc 2330opt_loop_vec_info
2e966e2a 2331vect_analyze_loop (class loop *loop, loop_vec_info orig_loop_vinfo,
a99aba41 2332 vec_info_shared *shared)
c4740c5d 2333{
3106770a 2334 auto_vector_sizes vector_sizes;
c4740c5d 2335
2336 /* Autodetect first vector size we try. */
2337 current_vector_size = 0;
e7419472 2338 targetm.vectorize.autovectorize_vector_sizes (&vector_sizes,
2339 loop->simdlen != 0);
3106770a 2340 unsigned int next_size = 0;
c4740c5d 2341
88f6eb8f 2342 DUMP_VECT_SCOPE ("analyze_loop_nest");
c4740c5d 2343
2344 if (loop_outer (loop)
2345 && loop_vec_info_for_loop (loop_outer (loop))
2346 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
ed9370cc 2347 return opt_loop_vec_info::failure_at (vect_location,
2348 "outer-loop already vectorized.\n");
fb85abff 2349
a99aba41 2350 if (!find_loop_nest (loop, &shared->loop_nest))
ed9370cc 2351 return opt_loop_vec_info::failure_at
2352 (vect_location,
2353 "not vectorized: loop nest containing two or more consecutive inner"
2354 " loops cannot be vectorized\n");
a99aba41 2355
5602144c 2356 unsigned n_stmts = 0;
3106770a 2357 poly_uint64 autodetected_vector_size = 0;
e7419472 2358 opt_loop_vec_info first_loop_vinfo = opt_loop_vec_info::success (NULL);
2359 poly_uint64 first_vector_size = 0;
c4740c5d 2360 while (1)
2361 {
2362 /* Check the CFG characteristics of the loop (nesting, entry/exit). */
ed9370cc 2363 opt_loop_vec_info loop_vinfo
2364 = vect_analyze_loop_form (loop, shared);
c4740c5d 2365 if (!loop_vinfo)
2366 {
6d8fb6cf 2367 if (dump_enabled_p ())
7bd765d4 2368 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
78bb46f5 2369 "bad loop form.\n");
e7419472 2370 gcc_checking_assert (first_loop_vinfo == NULL);
ed9370cc 2371 return loop_vinfo;
c4740c5d 2372 }
fb85abff 2373
37cf30c5 2374 bool fatal = false;
5b631e09 2375
2376 if (orig_loop_vinfo)
2377 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = orig_loop_vinfo;
2378
ed9370cc 2379 opt_result res = vect_analyze_loop_2 (loop_vinfo, fatal, &n_stmts);
2380 if (res)
c4740c5d 2381 {
2382 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
2383
e7419472 2384 if (loop->simdlen
2385 && maybe_ne (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
2386 (unsigned HOST_WIDE_INT) loop->simdlen))
2387 {
2388 if (first_loop_vinfo == NULL)
2389 {
2390 first_loop_vinfo = loop_vinfo;
2391 first_vector_size = current_vector_size;
2392 loop->aux = NULL;
2393 }
2394 else
2395 delete loop_vinfo;
2396 }
2397 else
2398 {
2399 delete first_loop_vinfo;
2400 return loop_vinfo;
2401 }
c4740c5d 2402 }
e7419472 2403 else
2404 delete loop_vinfo;
c4740c5d 2405
3106770a 2406 if (next_size == 0)
2407 autodetected_vector_size = current_vector_size;
2408
2409 if (next_size < vector_sizes.length ()
2410 && known_eq (vector_sizes[next_size], autodetected_vector_size))
2411 next_size += 1;
2412
e7419472 2413 if (fatal)
2414 {
2415 gcc_checking_assert (first_loop_vinfo == NULL);
2416 return opt_loop_vec_info::propagate_failure (res);
2417 }
2418
2419 if (next_size == vector_sizes.length ()
3106770a 2420 || known_eq (current_vector_size, 0U))
e7419472 2421 {
2422 if (first_loop_vinfo)
2423 {
2424 current_vector_size = first_vector_size;
2425 loop->aux = (loop_vec_info) first_loop_vinfo;
2426 if (dump_enabled_p ())
2427 {
2428 dump_printf_loc (MSG_NOTE, vect_location,
2429 "***** Choosing vector size ");
2430 dump_dec (MSG_NOTE, current_vector_size);
2431 dump_printf (MSG_NOTE, "\n");
2432 }
2433 return first_loop_vinfo;
2434 }
2435 else
2436 return opt_loop_vec_info::propagate_failure (res);
2437 }
c4740c5d 2438
2439 /* Try the next biggest vector size. */
3106770a 2440 current_vector_size = vector_sizes[next_size++];
6d8fb6cf 2441 if (dump_enabled_p ())
3106770a 2442 {
2443 dump_printf_loc (MSG_NOTE, vect_location,
2444 "***** Re-trying analysis with "
2445 "vector size ");
2446 dump_dec (MSG_NOTE, current_vector_size);
2447 dump_printf (MSG_NOTE, "\n");
2448 }
c4740c5d 2449 }
fb85abff 2450}
2451
d77809a4 2452/* Return true if there is an in-order reduction function for CODE, storing
2453 it in *REDUC_FN if so. */
2454
2455static bool
2456fold_left_reduction_fn (tree_code code, internal_fn *reduc_fn)
2457{
2458 switch (code)
2459 {
2460 case PLUS_EXPR:
2461 *reduc_fn = IFN_FOLD_LEFT_PLUS;
2462 return true;
2463
2464 default:
2465 return false;
2466 }
2467}
fb85abff 2468
e53664fa 2469/* Function reduction_fn_for_scalar_code
fb85abff 2470
2471 Input:
 2472   CODE - tree_code of a reduction operation.
2473
2474 Output:
e53664fa 2475 REDUC_FN - the corresponding internal function to be used to reduce the
2476 vector of partial results into a single scalar result, or IFN_LAST
7ba68b18 2477 if the operation is a supported reduction operation, but does not have
e53664fa 2478 such an internal function.
fb85abff 2479
7aa0d350 2480   Return FALSE if CODE currently cannot be vectorized as a reduction.  */
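/* For example (an illustrative sketch): for a loop computing "sum += a[i]"
   the scalar CODE is PLUS_EXPR and REDUC_FN becomes IFN_REDUC_PLUS, which
   reduces the vector of partial sums to a single scalar in the loop
   epilogue.  MULT_EXPR is still an accepted reduction but yields IFN_LAST,
   so the epilogue reduction is open-coded instead.  */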
fb85abff 2481
2482static bool
e53664fa 2483reduction_fn_for_scalar_code (enum tree_code code, internal_fn *reduc_fn)
fb85abff 2484{
2485 switch (code)
7aa0d350 2486 {
2487 case MAX_EXPR:
e53664fa 2488 *reduc_fn = IFN_REDUC_MAX;
7aa0d350 2489 return true;
fb85abff 2490
7aa0d350 2491 case MIN_EXPR:
e53664fa 2492 *reduc_fn = IFN_REDUC_MIN;
7aa0d350 2493 return true;
fb85abff 2494
7aa0d350 2495 case PLUS_EXPR:
e53664fa 2496 *reduc_fn = IFN_REDUC_PLUS;
7aa0d350 2497 return true;
fb85abff 2498
216934f9 2499 case BIT_AND_EXPR:
2500 *reduc_fn = IFN_REDUC_AND;
2501 return true;
2502
7aa0d350 2503 case BIT_IOR_EXPR:
216934f9 2504 *reduc_fn = IFN_REDUC_IOR;
2505 return true;
2506
7aa0d350 2507 case BIT_XOR_EXPR:
216934f9 2508 *reduc_fn = IFN_REDUC_XOR;
2509 return true;
2510
2511 case MULT_EXPR:
2512 case MINUS_EXPR:
e53664fa 2513 *reduc_fn = IFN_LAST;
7aa0d350 2514 return true;
2515
2516 default:
2517 return false;
2518 }
fb85abff 2519}
2520
633af029 2521/* If there is a neutral value X such that SLP reduction NODE would not
2522 be affected by the introduction of additional X elements, return that X,
2523 otherwise return null. CODE is the code of the reduction. REDUC_CHAIN
2524 is true if the SLP statements perform a single reduction, false if each
2525 statement performs an independent reduction. */
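/* As an illustration: a neutral value X is one for which "X op Y == Y",
   so padding the reduction with additional X elements leaves the result
   unchanged -- 0 for the addition-like codes, 1 for MULT_EXPR, and an
   all-ones constant for BIT_AND_EXPR.  */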
2526
2527static tree
2528neutral_op_for_slp_reduction (slp_tree slp_node, tree_code code,
2529 bool reduc_chain)
2530{
06bb64b8 2531 vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
2532 stmt_vec_info stmt_vinfo = stmts[0];
633af029 2533 tree vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
2534 tree scalar_type = TREE_TYPE (vector_type);
2e966e2a 2535 class loop *loop = gimple_bb (stmt_vinfo->stmt)->loop_father;
633af029 2536 gcc_assert (loop);
2537
2538 switch (code)
2539 {
2540 case WIDEN_SUM_EXPR:
2541 case DOT_PROD_EXPR:
2542 case SAD_EXPR:
2543 case PLUS_EXPR:
2544 case MINUS_EXPR:
2545 case BIT_IOR_EXPR:
2546 case BIT_XOR_EXPR:
2547 return build_zero_cst (scalar_type);
2548
2549 case MULT_EXPR:
2550 return build_one_cst (scalar_type);
2551
2552 case BIT_AND_EXPR:
2553 return build_all_ones_cst (scalar_type);
2554
2555 case MAX_EXPR:
2556 case MIN_EXPR:
2557 /* For MIN/MAX the initial values are neutral. A reduction chain
2558 has only a single initial value, so that value is neutral for
2559 all statements. */
2560 if (reduc_chain)
06bb64b8 2561 return PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt,
2562 loop_preheader_edge (loop));
633af029 2563 return NULL_TREE;
2564
2565 default:
2566 return NULL_TREE;
2567 }
2568}
fb85abff 2569
282bf14c 2570/* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
fb85abff 2571 STMT is printed with a message MSG. */
2572
2573static void
3f6e5ced 2574report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
fb85abff 2575{
a4e972e3 2576 dump_printf_loc (msg_type, vect_location, "%s%G", msg, stmt);
fb85abff 2577}
2578
9cfd4e76 2579/* DEF_STMT_INFO occurs in a loop that contains a potential reduction
2580 operation. Return true if the results of DEF_STMT_INFO are something
2581 that can be accumulated by such a reduction. */
6340aaa8 2582
2583static bool
9cfd4e76 2584vect_valid_reduction_input_p (stmt_vec_info def_stmt_info)
6340aaa8 2585{
9cfd4e76 2586 return (is_gimple_assign (def_stmt_info->stmt)
2587 || is_gimple_call (def_stmt_info->stmt)
6340aaa8 2588 || STMT_VINFO_DEF_TYPE (def_stmt_info) == vect_induction_def
9cfd4e76 2589 || (gimple_code (def_stmt_info->stmt) == GIMPLE_PHI
6340aaa8 2590 && STMT_VINFO_DEF_TYPE (def_stmt_info) == vect_internal_def
9cfd4e76 2591 && !is_loop_header_bb_p (gimple_bb (def_stmt_info->stmt))));
6340aaa8 2592}
fb85abff 2593
39a5d6b1 2594/* Detect SLP reduction of the form:
2595
2596 #a1 = phi <a5, a0>
2597 a2 = operation (a1)
2598 a3 = operation (a2)
2599 a4 = operation (a3)
2600 a5 = operation (a4)
2601
2602 #a = phi <a5>
2603
2604 PHI is the reduction phi node (#a1 = phi <a5, a0> above)
2605 FIRST_STMT is the first reduction stmt in the chain
2606 (a2 = operation (a1)).
2607
2608 Return TRUE if a reduction chain was detected. */
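/* At the source level such a chain typically comes from a manually
   unrolled accumulation (an illustrative sketch, not taken from the
   testsuite):

     for (i = 0; i < n; i += 4)
       {
         s += a[i];
         s += a[i + 1];
         s += a[i + 2];
         s += a[i + 3];
       }

   where each statement feeds its result into the next through S.  */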
2609
2610static bool
42acab1c 2611vect_is_slp_reduction (loop_vec_info loop_info, gimple *phi,
2612 gimple *first_stmt)
39a5d6b1 2613{
2e966e2a 2614 class loop *loop = (gimple_bb (phi))->loop_father;
2615 class loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
39a5d6b1 2616 enum tree_code code;
cd24aa3c 2617 gimple *loop_use_stmt = NULL;
a3d6edbf 2618 stmt_vec_info use_stmt_info;
39a5d6b1 2619 tree lhs;
2620 imm_use_iterator imm_iter;
2621 use_operand_p use_p;
6b809b99 2622 int nloop_uses, size = 0, n_out_of_loop_uses;
39a5d6b1 2623 bool found = false;
2624
2625 if (loop != vect_loop)
2626 return false;
2627
a3d6edbf 2628 auto_vec<stmt_vec_info, 8> reduc_chain;
39a5d6b1 2629 lhs = PHI_RESULT (phi);
2630 code = gimple_assign_rhs_code (first_stmt);
2631 while (1)
2632 {
2633 nloop_uses = 0;
6b809b99 2634 n_out_of_loop_uses = 0;
39a5d6b1 2635 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
2636 {
42acab1c 2637 gimple *use_stmt = USE_STMT (use_p);
0b308eee 2638 if (is_gimple_debug (use_stmt))
2639 continue;
85078181 2640
39a5d6b1 2641 /* Check if we got back to the reduction phi. */
85078181 2642 if (use_stmt == phi)
39a5d6b1 2643 {
85078181 2644 loop_use_stmt = use_stmt;
39a5d6b1 2645 found = true;
2646 break;
2647 }
2648
6b809b99 2649 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2650 {
423475a3 2651 loop_use_stmt = use_stmt;
2652 nloop_uses++;
6b809b99 2653 }
2654 else
2655 n_out_of_loop_uses++;
39a5d6b1 2656
6b809b99 2657	  /* There can be either a single use in the loop or two uses in
 2658	     phi nodes.  */
2659 if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses))
2660 return false;
39a5d6b1 2661 }
2662
2663 if (found)
2664 break;
2665
85078181 2666 /* We reached a statement with no loop uses. */
2667 if (nloop_uses == 0)
2668 return false;
2669
39a5d6b1 2670 /* This is a loop exit phi, and we haven't reached the reduction phi. */
85078181 2671 if (gimple_code (loop_use_stmt) == GIMPLE_PHI)
39a5d6b1 2672 return false;
2673
85078181 2674 if (!is_gimple_assign (loop_use_stmt)
2675 || code != gimple_assign_rhs_code (loop_use_stmt)
2676 || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt)))
39a5d6b1 2677 return false;
2678
2679 /* Insert USE_STMT into reduction chain. */
03c0d666 2680 use_stmt_info = loop_info->lookup_stmt (loop_use_stmt);
a3d6edbf 2681 reduc_chain.safe_push (use_stmt_info);
39a5d6b1 2682
85078181 2683 lhs = gimple_assign_lhs (loop_use_stmt);
39a5d6b1 2684 size++;
2685 }
2686
85078181 2687 if (!found || loop_use_stmt != phi || size < 2)
39a5d6b1 2688 return false;
2689
39a5d6b1 2690 /* Swap the operands, if needed, to make the reduction operand be the second
2691 operand. */
2692 lhs = PHI_RESULT (phi);
a3d6edbf 2693 for (unsigned i = 0; i < reduc_chain.length (); ++i)
39a5d6b1 2694 {
a3d6edbf 2695 gassign *next_stmt = as_a <gassign *> (reduc_chain[i]->stmt);
85078181 2696 if (gimple_assign_rhs2 (next_stmt) == lhs)
eb3a666e 2697 {
85078181 2698 tree op = gimple_assign_rhs1 (next_stmt);
9cfd4e76 2699 stmt_vec_info def_stmt_info = loop_info->lookup_def (op);
85078181 2700
2701 /* Check that the other def is either defined in the loop
2702 ("vect_internal_def"), or it's an induction (defined by a
2703 loop-header phi-node). */
9cfd4e76 2704 if (def_stmt_info
2705 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt))
2706 && vect_valid_reduction_input_p (def_stmt_info))
eb3a666e 2707 {
85078181 2708 lhs = gimple_assign_lhs (next_stmt);
85078181 2709 continue;
2710 }
2711
2712 return false;
2713 }
2714 else
2715 {
2716 tree op = gimple_assign_rhs2 (next_stmt);
9cfd4e76 2717 stmt_vec_info def_stmt_info = loop_info->lookup_def (op);
85078181 2718
2719 /* Check that the other def is either defined in the loop
2720 ("vect_internal_def"), or it's an induction (defined by a
2721 loop-header phi-node). */
9cfd4e76 2722 if (def_stmt_info
2723 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt))
2724 && vect_valid_reduction_input_p (def_stmt_info))
85078181 2725 {
6d8fb6cf 2726 if (dump_enabled_p ())
a4e972e3 2727 dump_printf_loc (MSG_NOTE, vect_location, "swapping oprnds: %G",
2728 next_stmt);
eb3a666e 2729
8f6fa493 2730 swap_ssa_operands (next_stmt,
2731 gimple_assign_rhs1_ptr (next_stmt),
2732 gimple_assign_rhs2_ptr (next_stmt));
a9696ee9 2733 update_stmt (next_stmt);
ba69439f 2734
2735 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt)))
2736 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
eb3a666e 2737 }
2738 else
85078181 2739 return false;
39a5d6b1 2740 }
2741
eb3a666e 2742 lhs = gimple_assign_lhs (next_stmt);
39a5d6b1 2743 }
2744
a3d6edbf 2745 /* Build up the actual chain. */
2746 for (unsigned i = 0; i < reduc_chain.length () - 1; ++i)
2747 {
2748 REDUC_GROUP_FIRST_ELEMENT (reduc_chain[i]) = reduc_chain[0];
2749 REDUC_GROUP_NEXT_ELEMENT (reduc_chain[i]) = reduc_chain[i+1];
2750 }
2751 REDUC_GROUP_FIRST_ELEMENT (reduc_chain.last ()) = reduc_chain[0];
2752 REDUC_GROUP_NEXT_ELEMENT (reduc_chain.last ()) = NULL;
2753
eb3a666e 2754 /* Save the chain for further analysis in SLP detection. */
a3d6edbf 2755 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (reduc_chain[0]);
2756 REDUC_GROUP_SIZE (reduc_chain[0]) = size;
eb3a666e 2757
39a5d6b1 2758 return true;
2759}
2760
d77809a4 2761/* Return true if we need an in-order reduction for operation CODE
2762 on type TYPE. NEED_WRAPPING_INTEGRAL_OVERFLOW is true if integer
2763 overflow must wrap. */
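/* For example (a sketch): a double-precision accumulation such as
   "double s = 0; for (i = 0; i < n; i++) s += a[i];" compiled without
   -fassociative-math must preserve the original evaluation order, so it
   can only be vectorized as an in-order FOLD_LEFT_REDUCTION (using
   IFN_FOLD_LEFT_PLUS where the target supports it).  */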
2764
2765static bool
2766needs_fold_left_reduction_p (tree type, tree_code code,
2767 bool need_wrapping_integral_overflow)
2768{
2769 /* CHECKME: check for !flag_finite_math_only too? */
2770 if (SCALAR_FLOAT_TYPE_P (type))
2771 switch (code)
2772 {
2773 case MIN_EXPR:
2774 case MAX_EXPR:
2775 return false;
2776
2777 default:
2778 return !flag_associative_math;
2779 }
2780
2781 if (INTEGRAL_TYPE_P (type))
2782 {
2783 if (!operation_no_trapping_overflow (type, code))
2784 return true;
2785 if (need_wrapping_integral_overflow
2786 && !TYPE_OVERFLOW_WRAPS (type)
2787 && operation_can_overflow (code))
2788 return true;
2789 return false;
2790 }
2791
2792 if (SAT_FIXED_POINT_TYPE_P (type))
2793 return true;
2794
2795 return false;
2796}
39a5d6b1 2797
5051abaf 2798/* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG and
2799 reduction operation CODE has a handled computation expression. */
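/* For instance (a sketch in GIMPLE-like form), given

     sum_1 = PHI <sum_0(preheader), sum_3(latch)>
     sum_2 = sum_1 + a_5;
     sum_3 = sum_2 + b_6;

   the walk from the latch argument SUM_3 back to the PHI result SUM_1
   visits only single-use operands and PLUS_EXPR statements, so the
   reduction path is accepted.  */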
2800
2801bool
c309657f 2802check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
2803 tree loop_arg, enum tree_code code)
5051abaf 2804{
2805 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
2806 auto_bitmap visited;
2807 tree lookfor = PHI_RESULT (phi);
2808 ssa_op_iter curri;
2809 use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
2810 while (USE_FROM_PTR (curr) != loop_arg)
2811 curr = op_iter_next_use (&curri);
2812 curri.i = curri.numops;
2813 do
2814 {
2815 path.safe_push (std::make_pair (curri, curr));
2816 tree use = USE_FROM_PTR (curr);
2817 if (use == lookfor)
2818 break;
2819 gimple *def = SSA_NAME_DEF_STMT (use);
2820 if (gimple_nop_p (def)
2821 || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
2822 {
2823pop:
2824 do
2825 {
2826 std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
2827 curri = x.first;
2828 curr = x.second;
2829 do
2830 curr = op_iter_next_use (&curri);
2831 /* Skip already visited or non-SSA operands (from iterating
2832 over PHI args). */
2833 while (curr != NULL_USE_OPERAND_P
2834 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
2835 || ! bitmap_set_bit (visited,
2836 SSA_NAME_VERSION
2837 (USE_FROM_PTR (curr)))));
2838 }
2839 while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
2840 if (curr == NULL_USE_OPERAND_P)
2841 break;
2842 }
2843 else
2844 {
2845 if (gimple_code (def) == GIMPLE_PHI)
2846 curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
2847 else
2848 curr = op_iter_init_use (&curri, def, SSA_OP_USE);
2849 while (curr != NULL_USE_OPERAND_P
2850 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
2851 || ! bitmap_set_bit (visited,
2852 SSA_NAME_VERSION
2853 (USE_FROM_PTR (curr)))))
2854 curr = op_iter_next_use (&curri);
2855 if (curr == NULL_USE_OPERAND_P)
2856 goto pop;
2857 }
2858 }
2859 while (1);
2860 if (dump_file && (dump_flags & TDF_DETAILS))
2861 {
2862 dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
2863 unsigned i;
2864 std::pair<ssa_op_iter, use_operand_p> *x;
2865 FOR_EACH_VEC_ELT (path, i, x)
a4e972e3 2866 dump_printf (MSG_NOTE, "%T ", USE_FROM_PTR (x->second));
5051abaf 2867 dump_printf (MSG_NOTE, "\n");
2868 }
2869
2870 /* Check whether the reduction path detected is valid. */
2871 bool fail = path.length () == 0;
2872 bool neg = false;
2873 for (unsigned i = 1; i < path.length (); ++i)
2874 {
2875 gimple *use_stmt = USE_STMT (path[i].second);
2876 tree op = USE_FROM_PTR (path[i].second);
2877 if (! has_single_use (op)
2878 || ! is_gimple_assign (use_stmt))
2879 {
2880 fail = true;
2881 break;
2882 }
2883 if (gimple_assign_rhs_code (use_stmt) != code)
2884 {
2885 if (code == PLUS_EXPR
2886 && gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
2887 {
2888 /* Track whether we negate the reduction value each iteration. */
2889 if (gimple_assign_rhs2 (use_stmt) == op)
2890 neg = ! neg;
2891 }
2892 else
2893 {
2894 fail = true;
2895 break;
2896 }
2897 }
2898 }
2899 return ! fail && ! neg;
2900}
2901
2902
119a8852 2903/* Function vect_is_simple_reduction
fb85abff 2904
7aa0d350 2905 (1) Detect a cross-iteration def-use cycle that represents a simple
282bf14c 2906 reduction computation. We look for the following pattern:
fb85abff 2907
2908 loop_header:
2909 a1 = phi < a0, a2 >
2910 a3 = ...
2911 a2 = operation (a3, a1)
48e1416a 2912
63048bd8 2913 or
2914
2915 a3 = ...
2916 loop_header:
2917 a1 = phi < a0, a2 >
2918 a2 = operation (a3, a1)
2919
fb85abff 2920 such that:
48e1416a 2921 1. operation is commutative and associative and it is safe to
119a8852 2922 change the order of the computation
fb85abff 2923 2. no uses for a2 in the loop (a2 is used out of the loop)
caf6df13 2924 3. no uses of a1 in the loop besides the reduction operation
2925 4. no uses of a1 outside the loop.
fb85abff 2926
caf6df13 2927 Conditions 1,4 are tested here.
48e1416a 2928 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
ade2ac53 2929
48e1416a 2930 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
119a8852 2931 nested cycles.
7aa0d350 2932
2933 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
2934 reductions:
2935
2936 a1 = phi < a0, a2 >
2937 inner loop (def of a3)
48e1416a 2938 a2 = phi < a3 >
f4a50267 2939
d09d8733 2940   (4) Detect condition expressions, i.e.:
2941 for (int i = 0; i < N; i++)
2942 if (a[i] < val)
2943 ret_val = a[i];
2944
7aa0d350 2945*/
fb85abff 2946
f4649a92 2947static stmt_vec_info
2948vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
119a8852 2949 bool *double_reduc,
ebacf0e3 2950 bool need_wrapping_integral_overflow,
2951 enum vect_reduction_type *v_reduc_type)
fb85abff 2952{
f4649a92 2953 gphi *phi = as_a <gphi *> (phi_info->stmt);
2e966e2a 2954 class loop *loop = (gimple_bb (phi))->loop_father;
2955 class loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
9129f51c 2956 bool nested_in_vect_loop = flow_loop_nested_p (vect_loop, loop);
f4649a92 2957 gimple *phi_use_stmt = NULL;
f4a50267 2958 enum tree_code orig_code, code;
0df23b96 2959 tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
fb85abff 2960 tree type;
fb85abff 2961 tree name;
2962 imm_use_iterator imm_iter;
2963 use_operand_p use_p;
7aa0d350 2964 bool phi_def;
2965
2966 *double_reduc = false;
d09d8733 2967 *v_reduc_type = TREE_CODE_REDUCTION;
fb85abff 2968
ed3fa54b 2969 tree phi_name = PHI_RESULT (phi);
75f8b7c8 2970 /* ??? If there are no uses of the PHI result the inner loop reduction
2971 won't be detected as possibly double-reduction by vectorizable_reduction
2972 because that tries to walk the PHI arg from the preheader edge which
2973 can be constant. See PR60382. */
ed3fa54b 2974 if (has_zero_uses (phi_name))
75f8b7c8 2975 return NULL;
9129f51c 2976 unsigned nphi_def_loop_uses = 0;
ed3fa54b 2977 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
fb85abff 2978 {
42acab1c 2979 gimple *use_stmt = USE_STMT (use_p);
9845d120 2980 if (is_gimple_debug (use_stmt))
2981 continue;
caf6df13 2982
2983 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2984 {
6d8fb6cf 2985 if (dump_enabled_p ())
7bd765d4 2986 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
78bb46f5 2987 "intermediate value used outside loop.\n");
caf6df13 2988
2989 return NULL;
2990 }
2991
9129f51c 2992 nphi_def_loop_uses++;
24651fb7 2993 phi_use_stmt = use_stmt;
fb85abff 2994 }
2995
3b8dc59b 2996 edge latch_e = loop_latch_edge (loop);
2997 tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
fb85abff 2998 if (TREE_CODE (loop_arg) != SSA_NAME)
2999 {
6d8fb6cf 3000 if (dump_enabled_p ())
a4e972e3 3001 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3002 "reduction: not ssa_name: %T\n", loop_arg);
fb85abff 3003 return NULL;
3004 }
3005
f4649a92 3006 stmt_vec_info def_stmt_info = loop_info->lookup_def (loop_arg);
91e68c4b 3007 if (!def_stmt_info
3008 || !flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt)))
f4649a92 3009 return NULL;
3010
3011 if (gassign *def_stmt = dyn_cast <gassign *> (def_stmt_info->stmt))
fb85abff 3012 {
731c7a45 3013 name = gimple_assign_lhs (def_stmt);
3014 phi_def = false;
fb85abff 3015 }
f4649a92 3016 else if (gphi *def_stmt = dyn_cast <gphi *> (def_stmt_info->stmt))
731c7a45 3017 {
3018 name = PHI_RESULT (def_stmt);
3019 phi_def = true;
3020 }
3021 else
fb85abff 3022 {
6d8fb6cf 3023 if (dump_enabled_p ())
a4e972e3 3024 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3025 "reduction: unhandled reduction operation: %G",
3026 def_stmt_info->stmt);
fb85abff 3027 return NULL;
3028 }
3029
9129f51c 3030 unsigned nlatch_def_loop_uses = 0;
3b8dc59b 3031 auto_vec<gphi *, 3> lcphis;
9129f51c 3032 bool inner_loop_of_double_reduc = false;
4364527a 3033 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
3034 {
3035 gimple *use_stmt = USE_STMT (use_p);
3036 if (is_gimple_debug (use_stmt))
3037 continue;
3038 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
9129f51c 3039 nlatch_def_loop_uses++;
4364527a 3040 else
9129f51c 3041 {
3042 /* We can have more than one loop-closed PHI. */
3043 lcphis.safe_push (as_a <gphi *> (use_stmt));
3044 if (nested_in_vect_loop
3045 && (STMT_VINFO_DEF_TYPE (loop_info->lookup_stmt (use_stmt))
3046 == vect_double_reduction_def))
3047 inner_loop_of_double_reduc = true;
3048 }
2fbb03c0 3049 }
3050
3051 /* If this isn't a nested cycle or if the nested cycle reduction value
 3052     is used outside of the inner loop, we cannot handle uses of the reduction
3053 value. */
9129f51c 3054 if ((!nested_in_vect_loop || inner_loop_of_double_reduc)
3055 && (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1))
2fbb03c0 3056 {
3057 if (dump_enabled_p ())
3058 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3059 "reduction used in loop.\n");
3060 return NULL;
4364527a 3061 }
fb85abff 3062
7aa0d350 3063 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
3064 defined in the inner loop. */
3065 if (phi_def)
3066 {
f4649a92 3067 gphi *def_stmt = as_a <gphi *> (def_stmt_info->stmt);
7aa0d350 3068 op1 = PHI_ARG_DEF (def_stmt, 0);
3069
3070 if (gimple_phi_num_args (def_stmt) != 1
3071 || TREE_CODE (op1) != SSA_NAME)
3072 {
6d8fb6cf 3073 if (dump_enabled_p ())
7bd765d4 3074 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
78bb46f5 3075 "unsupported phi node definition.\n");
7aa0d350 3076
3077 return NULL;
3078 }
3079
9cfd4e76 3080 gimple *def1 = SSA_NAME_DEF_STMT (op1);
149f7c8d 3081 if (gimple_bb (def1)
3082 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
7aa0d350 3083 && loop->inner
3084 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
24651fb7 3085 && is_gimple_assign (def1)
ecff729d 3086 && is_a <gphi *> (phi_use_stmt)
24651fb7 3087 && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt)))
7aa0d350 3088 {
6d8fb6cf 3089 if (dump_enabled_p ())
7bd765d4 3090 report_vect_op (MSG_NOTE, def_stmt,
3091 "detected double reduction: ");
48e1416a 3092
7aa0d350 3093 *double_reduc = true;
f4649a92 3094 return def_stmt_info;
7aa0d350 3095 }
3096
3097 return NULL;
3098 }
3099
3b8dc59b 3100  /* If we are vectorizing an inner reduction, we are executing it
 3101     in the original order only if we are not dealing with a
 3102     double reduction.  */
3103 bool check_reduction = true;
3104 if (flow_loop_nested_p (vect_loop, loop))
3105 {
3106 gphi *lcphi;
3107 unsigned i;
3108 check_reduction = false;
3109 FOR_EACH_VEC_ELT (lcphis, i, lcphi)
3110 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, gimple_phi_result (lcphi))
3111 {
3112 gimple *use_stmt = USE_STMT (use_p);
3113 if (is_gimple_debug (use_stmt))
3114 continue;
3115 if (! flow_bb_inside_loop_p (vect_loop, gimple_bb (use_stmt)))
3116 check_reduction = true;
3117 }
3118 }
3119
f4649a92 3120 gassign *def_stmt = as_a <gassign *> (def_stmt_info->stmt);
f4a50267 3121 code = orig_code = gimple_assign_rhs_code (def_stmt);
3122
2fbb03c0 3123 if (nested_in_vect_loop && !check_reduction)
3124 {
8039cace 3125      /* FIXME: Even for non-reductions, code generation is funneled
 3126	 through vectorizable_reduction for the stmt defining the
 3127	 PHI latch value.  So we have to artificially restrict ourselves
 3128	 to the supported operations.  */
3129 switch (get_gimple_rhs_class (code))
3130 {
3131 case GIMPLE_BINARY_RHS:
3132 case GIMPLE_TERNARY_RHS:
3133 break;
3134 default:
3135 /* Not supported by vectorizable_reduction. */
3136 if (dump_enabled_p ())
3137 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3138 "nested cycle: not handled operation: ");
3139 return NULL;
3140 }
2fbb03c0 3141 if (dump_enabled_p ())
3142 report_vect_op (MSG_NOTE, def_stmt, "detected nested cycle: ");
3143 return def_stmt_info;
3144 }
3145
f4a50267 3146  /* We can handle "res -= x[i]", which is non-associative, by
 3147     simply rewriting it into "res += -x[i]".  Avoid changing the
 3148     gimple instruction for the first simple tests and only do this
 3149     if we're allowed to change code at all.  */
ed3fa54b 3150 if (code == MINUS_EXPR && gimple_assign_rhs2 (def_stmt) != phi_name)
f4a50267 3151 code = PLUS_EXPR;
fb85abff 3152
c88301ad 3153 if (code == COND_EXPR)
fb85abff 3154 {
3b8dc59b 3155 if (! nested_in_vect_loop)
d09d8733 3156 *v_reduc_type = COND_REDUCTION;
0df23b96 3157
8a2caf10 3158 op3 = gimple_assign_rhs1 (def_stmt);
a18d4327 3159 if (COMPARISON_CLASS_P (op3))
3160 {
3161 op4 = TREE_OPERAND (op3, 1);
3162 op3 = TREE_OPERAND (op3, 0);
48e1416a 3163 }
ed3fa54b 3164 if (op3 == phi_name || op4 == phi_name)
3165 {
3166 if (dump_enabled_p ())
3167 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3168 "reduction: condition depends on previous"
3169 " iteration: ");
3170 return NULL;
3171 }
48e1416a 3172
8a2caf10 3173 op1 = gimple_assign_rhs2 (def_stmt);
3174 op2 = gimple_assign_rhs3 (def_stmt);
fb85abff 3175 }
3b8dc59b 3176 else if (!commutative_tree_code (code) || !associative_tree_code (code))
3177 {
3178 if (dump_enabled_p ())
3179 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3180 "reduction: not commutative/associative: ");
3181 return NULL;
3182 }
3183 else if (get_gimple_rhs_class (code) == GIMPLE_BINARY_RHS)
0df23b96 3184 {
3185 op1 = gimple_assign_rhs1 (def_stmt);
3186 op2 = gimple_assign_rhs2 (def_stmt);
3b8dc59b 3187 }
3188 else
3189 {
3190 if (dump_enabled_p ())
3191 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3192 "reduction: not handled operation: ");
3193 return NULL;
3194 }
0df23b96 3195
3b8dc59b 3196 if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
3197 {
3198 if (dump_enabled_p ())
3199 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3200 "reduction: both uses not ssa_names: ");
0df23b96 3201
3b8dc59b 3202 return NULL;
3203 }
fb85abff 3204
fb85abff 3205 type = TREE_TYPE (gimple_assign_lhs (def_stmt));
0df23b96 3206 if ((TREE_CODE (op1) == SSA_NAME
1ea6a73c 3207 && !types_compatible_p (type,TREE_TYPE (op1)))
0df23b96 3208 || (TREE_CODE (op2) == SSA_NAME
1ea6a73c 3209 && !types_compatible_p (type, TREE_TYPE (op2)))
0df23b96 3210 || (op3 && TREE_CODE (op3) == SSA_NAME
1ea6a73c 3211 && !types_compatible_p (type, TREE_TYPE (op3)))
0df23b96 3212 || (op4 && TREE_CODE (op4) == SSA_NAME
1ea6a73c 3213 && !types_compatible_p (type, TREE_TYPE (op4))))
fb85abff 3214 {
6d8fb6cf 3215 if (dump_enabled_p ())
fb85abff 3216 {
7bd765d4 3217 dump_printf_loc (MSG_NOTE, vect_location,
a4e972e3 3218 "reduction: multiple types: operation type: "
3219 "%T, operands types: %T,%T",
3220 type, TREE_TYPE (op1), TREE_TYPE (op2));
a18d4327 3221 if (op3)
a4e972e3 3222 dump_printf (MSG_NOTE, ",%T", TREE_TYPE (op3));
a18d4327 3223
3224 if (op4)
a4e972e3 3225 dump_printf (MSG_NOTE, ",%T", TREE_TYPE (op4));
78bb46f5 3226 dump_printf (MSG_NOTE, "\n");
fb85abff 3227 }
0df23b96 3228
fb85abff 3229 return NULL;
3230 }
3231
d77809a4 3232 /* Check whether it's ok to change the order of the computation.
ade2ac53 3233 Generally, when vectorizing a reduction we change the order of the
fb85abff 3234 computation. This may change the behavior of the program in some
48e1416a 3235 cases, so we need to check that this is ok. One exception is when
fb85abff 3236 vectorizing an outer-loop: the inner-loop is executed sequentially,
3237 and therefore vectorizing reductions in the inner-loop during
3238 outer-loop vectorization is safe. */
d77809a4 3239 if (check_reduction
3240 && *v_reduc_type == TREE_CODE_REDUCTION
3241 && needs_fold_left_reduction_p (type, code,
3242 need_wrapping_integral_overflow))
3243 *v_reduc_type = FOLD_LEFT_REDUCTION;
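  /* An illustrative case (not taken from this file): a float accumulation

       float s = 0.0f;
       for (int i = 0; i < n; i++)
	 s += a[i];

     compiled without -fassociative-math must keep the original
     left-to-right order, since (a0 + a1) + a2 may differ from
     a0 + (a1 + a2) under IEEE rounding; needs_fold_left_reduction_p
     reports this and we fall back to FOLD_LEFT_REDUCTION above.  */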
fb85abff 3244
ade2ac53 3245 /* Reduction is safe. We're dealing with one of the following:
fb85abff 3246 1) integer arithmetic and no trapv
ade2ac53 3247 2) floating point arithmetic, and special flags permit this optimization
3248 3) nested cycle (i.e., outer loop vectorization). */
9cfd4e76 3249 stmt_vec_info def1_info = loop_info->lookup_def (op1);
3250 stmt_vec_info def2_info = loop_info->lookup_def (op2);
3251 if (code != COND_EXPR && !def1_info && !def2_info)
fb85abff 3252 {
6d8fb6cf 3253 if (dump_enabled_p ())
7bd765d4 3254 report_vect_op (MSG_NOTE, def_stmt, "reduction: no defs for operands: ");
fb85abff 3255 return NULL;
3256 }
3257
fb85abff 3258 /* Check that one def is the reduction def, defined by PHI,
f083cd24 3259 the other def is either defined in the loop ("vect_internal_def"),
fb85abff 3260 or it's an induction (defined by a loop-header phi-node). */
3261
9cfd4e76 3262 if (def2_info
3263 && def2_info->stmt == phi
0df23b96 3264 && (code == COND_EXPR
9cfd4e76 3265 || !def1_info
91e68c4b 3266 || !flow_bb_inside_loop_p (loop, gimple_bb (def1_info->stmt))
9cfd4e76 3267 || vect_valid_reduction_input_p (def1_info)))
fb85abff 3268 {
6d8fb6cf 3269 if (dump_enabled_p ())
7bd765d4 3270 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
f4649a92 3271 return def_stmt_info;
fb85abff 3272 }
39a5d6b1 3273
9cfd4e76 3274 if (def1_info
3275 && def1_info->stmt == phi
39a5d6b1 3276 && (code == COND_EXPR
9cfd4e76 3277 || !def2_info
91e68c4b 3278 || !flow_bb_inside_loop_p (loop, gimple_bb (def2_info->stmt))
9cfd4e76 3279 || vect_valid_reduction_input_p (def2_info)))
fb85abff 3280 {
3b8dc59b 3281 if (! nested_in_vect_loop && orig_code != MINUS_EXPR)
bbb60482 3282 {
3283 /* Check if we can swap operands (just for simplicity - so that
3284 the rest of the code can assume that the reduction variable
3285 is always the last (second) argument). */
d09d8733 3286 if (code == COND_EXPR)
3287 {
bbb60482 3288 /* Swap cond_expr by inverting the condition. */
3289 tree cond_expr = gimple_assign_rhs1 (def_stmt);
3290 enum tree_code invert_code = ERROR_MARK;
3291 enum tree_code cond_code = TREE_CODE (cond_expr);
3292
3293 if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
3294 {
3295 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
3296 invert_code = invert_tree_comparison (cond_code, honor_nans);
3297 }
3298 if (invert_code != ERROR_MARK)
3299 {
3300 TREE_SET_CODE (cond_expr, invert_code);
3301 swap_ssa_operands (def_stmt,
3302 gimple_assign_rhs2_ptr (def_stmt),
3303 gimple_assign_rhs3_ptr (def_stmt));
3304 }
3305 else
3306 {
3307 if (dump_enabled_p ())
3308 report_vect_op (MSG_NOTE, def_stmt,
3309 "detected reduction: cannot swap operands "
3310 "for cond_expr");
3311 return NULL;
3312 }
d09d8733 3313 }
bbb60482 3314 else
3315 swap_ssa_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
3316 gimple_assign_rhs2_ptr (def_stmt));
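	  /* A sketch of the swap just performed, on a hypothetical statement
	     with integer operands: "r_1 = a < b ? r_0 : x_2" becomes
	     "r_1 = a >= b ? x_2 : r_0", so the reduction variable ends up in
	     the last operand; for plain binary ops the two rhs operands are
	     simply exchanged.  */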
d09d8733 3317
bbb60482 3318 if (dump_enabled_p ())
7bd765d4 3319 report_vect_op (MSG_NOTE, def_stmt,
bbb60482 3320 "detected reduction: need to swap operands: ");
ba69439f 3321
3322 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt)))
3323 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
ade2ac53 3324 }
3325 else
3326 {
6d8fb6cf 3327 if (dump_enabled_p ())
7bd765d4 3328 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
ade2ac53 3329 }
3330
f4649a92 3331 return def_stmt_info;
fb85abff 3332 }
39a5d6b1 3333
3334 /* Try to find SLP reduction chain. */
3b8dc59b 3335 if (! nested_in_vect_loop
3336 && code != COND_EXPR
6154acba 3337 && orig_code != MINUS_EXPR
d09d8733 3338 && vect_is_slp_reduction (loop_info, phi, def_stmt))
fb85abff 3339 {
6d8fb6cf 3340 if (dump_enabled_p ())
7bd765d4 3341 report_vect_op (MSG_NOTE, def_stmt,
3342 "reduction: detected reduction chain: ");
ade2ac53 3343
f4649a92 3344 return def_stmt_info;
fb85abff 3345 }
39a5d6b1 3346
6154acba 3347 /* Look for the expression computing loop_arg from loop PHI result. */
f4649a92 3348 if (check_reduction_path (vect_location, loop, phi, loop_arg, code))
3349 return def_stmt_info;
6154acba 3350
6d8fb6cf 3351 if (dump_enabled_p ())
6154acba 3352 {
3a94df0b 3353 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
6154acba 3354 "reduction: unknown pattern: ");
3355 }
3356
39a5d6b1 3357 return NULL;
fb85abff 3358}
3359
119a8852 3360/* Wrapper around vect_is_simple_reduction, which will modify code
f4a50267 3361 in-place if it enables detection of more reductions. Arguments
3362 as there. */
3363
f4649a92 3364stmt_vec_info
3365vect_force_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
119a8852 3366 bool *double_reduc,
b826233f 3367 bool need_wrapping_integral_overflow)
f4a50267 3368{
d09d8733 3369 enum vect_reduction_type v_reduc_type;
f4649a92 3370 stmt_vec_info def_info
3371 = vect_is_simple_reduction (loop_info, phi_info, double_reduc,
3372 need_wrapping_integral_overflow,
3373 &v_reduc_type);
3374 if (def_info)
119a8852 3375 {
04eefad5 3376 STMT_VINFO_REDUC_TYPE (phi_info) = v_reduc_type;
3377 STMT_VINFO_REDUC_DEF (phi_info) = def_info;
3378 STMT_VINFO_REDUC_TYPE (def_info) = v_reduc_type;
3379 STMT_VINFO_REDUC_DEF (def_info) = phi_info;
119a8852 3380 }
f4649a92 3381 return def_info;
f4a50267 3382}
fb85abff 3383
0822b158 3384/* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
3385int
3386vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
3387 int *peel_iters_epilogue,
7a66d0cf 3388 stmt_vector_for_cost *scalar_cost_vec,
f97dec81 3389 stmt_vector_for_cost *prologue_cost_vec,
3390 stmt_vector_for_cost *epilogue_cost_vec)
0822b158 3391{
f97dec81 3392 int retval = 0;
d75596cd 3393 int assumed_vf = vect_vf_for_cost (loop_vinfo);
0822b158 3394
3395 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
3396 {
d75596cd 3397 *peel_iters_epilogue = assumed_vf / 2;
6d8fb6cf 3398 if (dump_enabled_p ())
7bd765d4 3399 dump_printf_loc (MSG_NOTE, vect_location,
3400 "cost model: epilogue peel iters set to vf/2 "
78bb46f5 3401 			 "because loop iterations are unknown.\n");
0822b158 3402
 3403       /* If peeled iterations are known but the number of scalar loop
 3404 	 iterations is unknown, count a taken branch per peeled loop.  */
7a66d0cf 3405 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
f97dec81 3406 NULL, 0, vect_prologue);
55a600a6 3407 retval += record_stmt_cost (epilogue_cost_vec, 1, cond_branch_taken,
8e6f320b 3408 NULL, 0, vect_epilogue);
0822b158 3409 }
3410 else
3411 {
3412 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
3413 peel_iters_prologue = niters < peel_iters_prologue ?
3414 niters : peel_iters_prologue;
d75596cd 3415 *peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
a4ee7fac 3416       /* If we need to peel for gaps, but no epilogue peeling is otherwise
 3417 	 required, we have to peel VF iterations.  */
3418 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
d75596cd 3419 *peel_iters_epilogue = assumed_vf;
0822b158 3420 }
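  /* Worked example (illustrative numbers only): with NITERS = 100, a
     prologue peel of 3 and ASSUMED_VF = 8, the epilogue gets
     (100 - 3) % 8 = 1 iteration; had that remainder been 0 while peeling
     for gaps, a full vector's worth (8) of epilogue iterations would be
     used instead.  */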
3421
7a66d0cf 3422 stmt_info_for_cost *si;
3423 int j;
f97dec81 3424 if (peel_iters_prologue)
7a66d0cf 3425 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
1aeaa139 3426 retval += record_stmt_cost (prologue_cost_vec,
3427 si->count * peel_iters_prologue,
3428 si->kind, si->stmt_info, si->misalign,
3429 vect_prologue);
f97dec81 3430 if (*peel_iters_epilogue)
7a66d0cf 3431 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
1aeaa139 3432 retval += record_stmt_cost (epilogue_cost_vec,
3433 si->count * *peel_iters_epilogue,
3434 si->kind, si->stmt_info, si->misalign,
3435 vect_epilogue);
7a66d0cf 3436
f97dec81 3437 return retval;
0822b158 3438}
3439
fb85abff 3440/* Function vect_estimate_min_profitable_iters
3441
3442 Return the number of iterations required for the vector version of the
3443 loop to be profitable relative to the cost of the scalar version of the
97fe80a6 3444 loop.
3445
3446 *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
3447 of iterations for vectorization. -1 value means loop vectorization
3448 is not profitable. This returned value may be used for dynamic
3449 profitability check.
3450
3451 *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
3452 for static check against estimated number of iterations. */
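/* Informal usage note (an assumption about the callers, not part of the
   original comment): the static estimate is meant to be compared against the
   estimated trip count at analysis time, while the runtime threshold is
   meant for a guard of the form "if (niters >= threshold)" emitted in front
   of the vectorized loop when the decision cannot be made statically.  */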
fb85abff 3453
5938768b 3454static void
3455vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
3456 int *ret_min_profitable_niters,
3457 int *ret_min_profitable_estimate)
fb85abff 3458{
fb85abff 3459 int min_profitable_iters;
5938768b 3460 int min_profitable_estimate;
fb85abff 3461 int peel_iters_prologue;
3462 int peel_iters_epilogue;
f97dec81 3463 unsigned vec_inside_cost = 0;
fb85abff 3464 int vec_outside_cost = 0;
f97dec81 3465 unsigned vec_prologue_cost = 0;
3466 unsigned vec_epilogue_cost = 0;
fb85abff 3467 int scalar_single_iter_cost = 0;
3468 int scalar_outside_cost = 0;
d75596cd 3469 int assumed_vf = vect_vf_for_cost (loop_vinfo);
313a5120 3470 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
f97dec81 3471 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
fb85abff 3472
3473 /* Cost model disabled. */
3e398f5b 3474 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
fb85abff 3475 {
91f42adc 3476 if (dump_enabled_p ())
3477 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
5938768b 3478 *ret_min_profitable_niters = 0;
3479 *ret_min_profitable_estimate = 0;
3480 return;
fb85abff 3481 }
3482
3483 /* Requires loop versioning tests to handle misalignment. */
10095225 3484 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
fb85abff 3485 {
3486 /* FIXME: Make cost depend on complexity of individual check. */
f1f41a6c 3487 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
f97dec81 3488 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3489 vect_prologue);
91f42adc 3490 if (dump_enabled_p ())
3491 dump_printf (MSG_NOTE,
3492 "cost model: Adding cost of checks for loop "
3493 "versioning to treat misalignment.\n");
fb85abff 3494 }
3495
10095225 3496 /* Requires loop versioning with alias checks. */
3497 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
fb85abff 3498 {
3499 /* FIXME: Make cost depend on complexity of individual check. */
41500e78 3500 unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
f97dec81 3501 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3502 vect_prologue);
f68a7726 3503 len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
3504 if (len)
3505 /* Count LEN - 1 ANDs and LEN comparisons. */
3506 (void) add_stmt_cost (target_cost_data, len * 2 - 1, scalar_stmt,
3507 NULL, 0, vect_prologue);
e85b4a5e 3508 len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
3509 if (len)
3510 {
3511 /* Count LEN - 1 ANDs and LEN comparisons. */
3512 unsigned int nstmts = len * 2 - 1;
3513 /* +1 for each bias that needs adding. */
3514 for (unsigned int i = 0; i < len; ++i)
3515 if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
3516 nstmts += 1;
3517 (void) add_stmt_cost (target_cost_data, nstmts, scalar_stmt,
3518 NULL, 0, vect_prologue);
3519 }
91f42adc 3520 if (dump_enabled_p ())
3521 dump_printf (MSG_NOTE,
3522 "cost model: Adding cost of checks for loop "
3523 "versioning aliasing.\n");
fb85abff 3524 }
3525
d5e80d93 3526 /* Requires loop versioning with niter checks. */
3527 if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
3528 {
3529 /* FIXME: Make cost depend on complexity of individual check. */
3530 (void) add_stmt_cost (target_cost_data, 1, vector_stmt, NULL, 0,
3531 vect_prologue);
91f42adc 3532 if (dump_enabled_p ())
3533 dump_printf (MSG_NOTE,
3534 "cost model: Adding cost of checks for loop "
3535 "versioning niters.\n");
d5e80d93 3536 }
3537
3538 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
f97dec81 3539 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0,
3540 vect_prologue);
fb85abff 3541
3542 /* Count statements in scalar loop. Using this as scalar cost for a single
3543 iteration for now.
3544
3545 TODO: Add outer loop support.
3546
3547 TODO: Consider assigning different costs to different scalar
3548 statements. */
3549
7a66d0cf 3550 scalar_single_iter_cost
2a9a3444 3551 = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo);
0822b158 3552
fb85abff 3553 /* Add additional cost for the peeled instructions in prologue and epilogue
60b29a7e 3554 loop. (For fully-masked loops there will be no peeling.)
fb85abff 3555
3556 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
3557 at compile-time - we assume it's vf/2 (the worst would be vf-1).
3558
3559 TODO: Build an expression that represents peel_iters for prologue and
3560 epilogue to be used in a run-time test. */
3561
60b29a7e 3562 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3563 {
3564 peel_iters_prologue = 0;
3565 peel_iters_epilogue = 0;
53771608 3566
3567 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
3568 {
3569 /* We need to peel exactly one iteration. */
3570 peel_iters_epilogue += 1;
3571 stmt_info_for_cost *si;
3572 int j;
3573 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
3574 j, si)
1aeaa139 3575 (void) add_stmt_cost (target_cost_data, si->count,
3576 si->kind, si->stmt_info, si->misalign,
3577 vect_epilogue);
53771608 3578 }
60b29a7e 3579 }
3580 else if (npeel < 0)
fb85abff 3581 {
d75596cd 3582 peel_iters_prologue = assumed_vf / 2;
91f42adc 3583 if (dump_enabled_p ())
3584 dump_printf (MSG_NOTE, "cost model: "
3585 "prologue peel iters set to vf/2.\n");
fb85abff 3586
3587 /* If peeling for alignment is unknown, loop bound of main loop becomes
3588 unknown. */
d75596cd 3589 peel_iters_epilogue = assumed_vf / 2;
91f42adc 3590 if (dump_enabled_p ())
3591 dump_printf (MSG_NOTE, "cost model: "
3592 "epilogue peel iters set to vf/2 because "
3593 "peeling for alignment is unknown.\n");
fb85abff 3594
3595 /* If peeled iterations are unknown, count a taken branch and a not taken
3596 branch per peeled loop. Even if scalar loop iterations are known,
3597 vector iterations are not known since peeled prologue iterations are
3598 not known. Hence guards remain the same. */
7a66d0cf 3599 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
f97dec81 3600 NULL, 0, vect_prologue);
7a66d0cf 3601 (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
f97dec81 3602 NULL, 0, vect_prologue);
7a66d0cf 3603 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3604 NULL, 0, vect_epilogue);
3605 (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3606 NULL, 0, vect_epilogue);
3607 stmt_info_for_cost *si;
3608 int j;
2a9a3444 3609 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
7a66d0cf 3610 {
7a66d0cf 3611 (void) add_stmt_cost (target_cost_data,
3612 si->count * peel_iters_prologue,
1aeaa139 3613 si->kind, si->stmt_info, si->misalign,
7a66d0cf 3614 vect_prologue);
3615 (void) add_stmt_cost (target_cost_data,
3616 si->count * peel_iters_epilogue,
1aeaa139 3617 si->kind, si->stmt_info, si->misalign,
7a66d0cf 3618 vect_epilogue);
3619 }
fb85abff 3620 }
48e1416a 3621 else
fb85abff 3622 {
f97dec81 3623 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
3624 stmt_info_for_cost *si;
3625 int j;
3626 void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3627
f1f41a6c 3628 prologue_cost_vec.create (2);
3629 epilogue_cost_vec.create (2);
0822b158 3630 peel_iters_prologue = npeel;
f97dec81 3631
3632 (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
3633 &peel_iters_epilogue,
2a9a3444 3634 &LOOP_VINFO_SCALAR_ITERATION_COST
3635 (loop_vinfo),
f97dec81 3636 &prologue_cost_vec,
3637 &epilogue_cost_vec);
3638
f1f41a6c 3639 FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
1aeaa139 3640 (void) add_stmt_cost (data, si->count, si->kind, si->stmt_info,
3641 si->misalign, vect_prologue);
f97dec81 3642
f1f41a6c 3643 FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
1aeaa139 3644 (void) add_stmt_cost (data, si->count, si->kind, si->stmt_info,
3645 si->misalign, vect_epilogue);
f97dec81 3646
f1f41a6c 3647 prologue_cost_vec.release ();
3648 epilogue_cost_vec.release ();
fb85abff 3649 }
3650
fb85abff 3651 /* FORNOW: The scalar outside cost is incremented in one of the
3652 following ways:
3653
3654 1. The vectorizer checks for alignment and aliasing and generates
3655 a condition that allows dynamic vectorization. A cost model
3656 check is ANDED with the versioning condition. Hence scalar code
3657 path now has the added cost of the versioning check.
3658
3659 if (cost > th & versioning_check)
3660 jmp to vector code
3661
3662 Hence run-time scalar is incremented by not-taken branch cost.
3663
3664 2. The vectorizer then checks if a prologue is required. If the
3665 cost model check was not done before during versioning, it has to
3666 be done before the prologue check.
3667
3668 if (cost <= th)
3669 prologue = scalar_iters
3670 if (prologue == 0)
3671 jmp to vector code
3672 else
3673 execute prologue
3674 if (prologue == num_iters)
3675 go to exit
3676
3677 Hence the run-time scalar cost is incremented by a taken branch,
3678 plus a not-taken branch, plus a taken branch cost.
3679
3680 3. The vectorizer then checks if an epilogue is required. If the
3681 cost model check was not done before during prologue check, it
3682 has to be done with the epilogue check.
3683
3684 if (prologue == 0)
3685 jmp to vector code
3686 else
3687 execute prologue
3688 if (prologue == num_iters)
3689 go to exit
3690 vector code:
3691 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
3692 jmp to epilogue
3693
3694 Hence the run-time scalar cost should be incremented by 2 taken
3695 branches.
3696
 3697      TODO: The back end may reorder the BBs differently and reverse
3698 conditions/branch directions. Change the estimates below to
3699 something more reasonable. */
3700
 3701   /* If the number of iterations is known and we do not do versioning, we can
282bf14c 3702      decide whether to vectorize at compile time.  Hence the scalar version
fb85abff 3703      does not carry cost model guard costs.  */
3704 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
d5e80d93 3705 || LOOP_REQUIRES_VERSIONING (loop_vinfo))
fb85abff 3706 {
3707 /* Cost model check occurs at versioning. */
d5e80d93 3708 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
f4ac3f3e 3709 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
fb85abff 3710 else
3711 {
3712 /* Cost model check occurs at prologue generation. */
313a5120 3713 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
f4ac3f3e 3714 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
3715 + vect_get_stmt_cost (cond_branch_not_taken);
fb85abff 3716 /* Cost model check occurs at epilogue generation. */
3717 else
f4ac3f3e 3718 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
fb85abff 3719 }
3720 }
3721
f97dec81 3722 /* Complete the target-specific cost calculations. */
3723 finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
3724 &vec_inside_cost, &vec_epilogue_cost);
fb85abff 3725
f97dec81 3726 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
4db2b577 3727
e4eca2de 3728 if (dump_enabled_p ())
3729 {
3730 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3731 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
3732 vec_inside_cost);
3733 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
3734 vec_prologue_cost);
3735 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
3736 vec_epilogue_cost);
3737 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
3738 scalar_single_iter_cost);
3739 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
3740 scalar_outside_cost);
3741 dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
3742 vec_outside_cost);
3743 dump_printf (MSG_NOTE, " prologue iterations: %d\n",
3744 peel_iters_prologue);
3745 dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
3746 peel_iters_epilogue);
3747 }
3748
48e1416a 3749 /* Calculate number of iterations required to make the vector version
282bf14c 3750 profitable, relative to the loop bodies only. The following condition
48e1416a 3751 must hold true:
e3cfba39 3752 SIC * niters + SOC > VIC * ((niters - NPEEL) / VF) + VOC
fb85abff 3753 where
3754 SIC = scalar iteration cost, VIC = vector iteration cost,
3755 VOC = vector outside cost, VF = vectorization factor,
e3cfba39 3756 NPEEL = prologue iterations + epilogue iterations,
fb85abff 3757 SOC = scalar outside cost for run time cost model check. */
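  /* A hypothetical plug-in of numbers (not derived from any target): with
     SIC = 4, VIC = 8, VF = 4, VOC = 40, SOC = 0 and NPEEL = 0 the condition
     reads 4 * niters > 8 * (niters / 4) + 40, i.e. 2 * niters > 40, so the
     vector loop starts to win at niters = 21, which is what the code below
     computes for min_profitable_iters in the non-fully-masked path.  */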
3758
e3cfba39 3759 int saving_per_viter = (scalar_single_iter_cost * assumed_vf
3760 - vec_inside_cost);
3761 if (saving_per_viter <= 0)
3762 {
3763 if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
3764 warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd,
3765 "vectorization did not happen for a simd loop");
3766
3767 if (dump_enabled_p ())
3768 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3769 "cost model: the vector iteration cost = %d "
3770 "divided by the scalar iteration cost = %d "
3771 "is greater or equal to the vectorization factor = %d"
3772 ".\n",
3773 vec_inside_cost, scalar_single_iter_cost, assumed_vf);
3774 *ret_min_profitable_niters = -1;
3775 *ret_min_profitable_estimate = -1;
3776 return;
3777 }
3778
3779 /* ??? The "if" arm is written to handle all cases; see below for what
3780 we would do for !LOOP_VINFO_FULLY_MASKED_P. */
3781 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3782 {
3783 /* Rewriting the condition above in terms of the number of
3784 vector iterations (vniters) rather than the number of
3785 scalar iterations (niters) gives:
3786
3787 SIC * (vniters * VF + NPEEL) + SOC > VIC * vniters + VOC
3788
3789 <==> vniters * (SIC * VF - VIC) > VOC - SIC * NPEEL - SOC
3790
3791 For integer N, X and Y when X > 0:
3792
3793 N * X > Y <==> N >= (Y /[floor] X) + 1. */
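      /* A concrete instance of that identity: with X = 8 and Y = 20 we get
	 N >= 20 / 8 + 1 = 3, and indeed 3 * 8 > 20 while 2 * 8 <= 20.  */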
3794 int outside_overhead = (vec_outside_cost
3795 - scalar_single_iter_cost * peel_iters_prologue
3796 - scalar_single_iter_cost * peel_iters_epilogue
3797 - scalar_outside_cost);
3798 /* We're only interested in cases that require at least one
3799 vector iteration. */
3800 int min_vec_niters = 1;
3801 if (outside_overhead > 0)
3802 min_vec_niters = outside_overhead / saving_per_viter + 1;
3803
3804 if (dump_enabled_p ())
3805 dump_printf (MSG_NOTE, " Minimum number of vector iterations: %d\n",
3806 min_vec_niters);
3807
3808 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3809 {
3810 /* Now that we know the minimum number of vector iterations,
3811 find the minimum niters for which the scalar cost is larger:
3812
3813 SIC * niters > VIC * vniters + VOC - SOC
3814
3815 We know that the minimum niters is no more than
3816 vniters * VF + NPEEL, but it might be (and often is) less
3817 than that if a partial vector iteration is cheaper than the
3818 equivalent scalar code. */
3819 int threshold = (vec_inside_cost * min_vec_niters
3820 + vec_outside_cost
3821 - scalar_outside_cost);
3822 if (threshold <= 0)
3823 min_profitable_iters = 1;
3824 else
3825 min_profitable_iters = threshold / scalar_single_iter_cost + 1;
3826 }
3827 else
3828 /* Convert the number of vector iterations into a number of
3829 scalar iterations. */
3830 min_profitable_iters = (min_vec_niters * assumed_vf
3831 + peel_iters_prologue
3832 + peel_iters_epilogue);
3833 }
3834 else
fb85abff 3835 {
7fe29cd0 3836 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
3837 * assumed_vf
3838 - vec_inside_cost * peel_iters_prologue
3839 - vec_inside_cost * peel_iters_epilogue);
7fe29cd0 3840 if (min_profitable_iters <= 0)
ba12948e 3841 min_profitable_iters = 0;
fb85abff 3842 else
7fe29cd0 3843 {
e3cfba39 3844 min_profitable_iters /= saving_per_viter;
d75596cd 3845
3846 if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
3847 <= (((int) vec_inside_cost * min_profitable_iters)
3848 + (((int) vec_outside_cost - scalar_outside_cost)
3849 * assumed_vf)))
3850 min_profitable_iters++;
7fe29cd0 3851 }
fb85abff 3852 }
fb85abff 3853
91f42adc 3854 if (dump_enabled_p ())
3855 dump_printf (MSG_NOTE,
3856 " Calculated minimum iters for profitability: %d\n",
3857 min_profitable_iters);
fb85abff 3858
60b29a7e 3859 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
3860 && min_profitable_iters < (assumed_vf + peel_iters_prologue))
3861 /* We want the vectorized loop to execute at least once. */
d75596cd 3862 min_profitable_iters = assumed_vf + peel_iters_prologue;
fb85abff 3863
6d8fb6cf 3864 if (dump_enabled_p ())
7bd765d4 3865 dump_printf_loc (MSG_NOTE, vect_location,
78bb46f5 3866 " Runtime profitability threshold = %d\n",
3867 min_profitable_iters);
5938768b 3868
3869 *ret_min_profitable_niters = min_profitable_iters;
3870
3871 /* Calculate number of iterations required to make the vector version
3872 profitable, relative to the loop bodies only.
3873
 3874      The non-vectorized variant costs SIC * niters and must win over the
 3875      vector variant at the expected loop trip count.  The following condition must hold true:
e3cfba39 3876 SIC * niters > VIC * ((niters - NPEEL) / VF) + VOC + SOC */
5938768b 3877
3878 if (vec_outside_cost <= 0)
ba12948e 3879 min_profitable_estimate = 0;
e3cfba39 3880 else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3881 {
3882 /* This is a repeat of the code above, but with + SOC rather
3883 than - SOC. */
3884 int outside_overhead = (vec_outside_cost
3885 - scalar_single_iter_cost * peel_iters_prologue
3886 - scalar_single_iter_cost * peel_iters_epilogue
3887 + scalar_outside_cost);
3888 int min_vec_niters = 1;
3889 if (outside_overhead > 0)
3890 min_vec_niters = outside_overhead / saving_per_viter + 1;
3891
3892 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3893 {
3894 int threshold = (vec_inside_cost * min_vec_niters
3895 + vec_outside_cost
3896 + scalar_outside_cost);
3897 min_profitable_estimate = threshold / scalar_single_iter_cost + 1;
3898 }
3899 else
3900 min_profitable_estimate = (min_vec_niters * assumed_vf
3901 + peel_iters_prologue
3902 + peel_iters_epilogue);
3903 }
5938768b 3904 else
3905 {
d75596cd 3906 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
3907 * assumed_vf
5938768b 3908 - vec_inside_cost * peel_iters_prologue
3909 - vec_inside_cost * peel_iters_epilogue)
d75596cd 3910 / ((scalar_single_iter_cost * assumed_vf)
5938768b 3911 - vec_inside_cost);
3912 }
5938768b 3913 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
6d8fb6cf 3914 if (dump_enabled_p ())
5938768b 3915 dump_printf_loc (MSG_NOTE, vect_location,
ce145c33 3916 " Static estimate profitability threshold = %d\n",
3917 min_profitable_estimate);
48e1416a 3918
5938768b 3919 *ret_min_profitable_estimate = min_profitable_estimate;
fb85abff 3920}
3921
b974a688 3922/* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
282dc861 3923 vector elements (not bits) for a vector with NELT elements. */
b974a688 3924static void
282dc861 3925calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
1957c019 3926 vec_perm_builder *sel)
b974a688 3927{
c3fa7fe9 3928 /* The encoding is a single stepped pattern. Any wrap-around is handled
3929 by vec_perm_indices. */
3930 sel->new_vector (nelt, 1, 3);
3931 for (unsigned int i = 0; i < 3; i++)
1957c019 3932 sel->quick_push (i + offset);
b974a688 3933}
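/* For example (illustration only), OFFSET = 2 and NELT = 8 build the stepped
   selector {2, 3, 4}, which vec_perm_indices extends to
   {2, 3, 4, 5, 6, 7, 8, 9}; indices 8 and 9 select from the second input
   vector of the permutation, giving the wrap-around mentioned above.  */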
3934
3935/* Checks whether the target supports whole-vector shifts for vectors of mode
3936 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
3937 it supports vec_perm_const with masks for all necessary shift amounts. */
3938static bool
582adad1 3939have_whole_vector_shift (machine_mode mode)
b974a688 3940{
3941 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3942 return true;
3943
ba7efd65 3944 /* Variable-length vectors should be handled via the optab. */
3945 unsigned int nelt;
3946 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
3947 return false;
3948
1957c019 3949 vec_perm_builder sel;
3950 vec_perm_indices indices;
ba7efd65 3951 for (unsigned int i = nelt / 2; i >= 1; i /= 2)
b974a688 3952 {
282dc861 3953 calc_vec_perm_mask_for_shift (i, nelt, &sel);
1957c019 3954 indices.new_vector (sel, 2, nelt);
3955 if (!can_vec_perm_const_p (mode, indices, false))
b974a688 3956 return false;
3957 }
3958 return true;
3959}
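/* A sketch of what the loop above requires: for an 8-element vector the
   reduction epilogue shifts the whole vector by 4, then 2, then 1 elements
   (successive halving), so the target must support permutation masks for
   exactly those shift amounts unless it provides vec_shr_optab.  */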
fb85abff 3960
48e1416a 3961/* TODO: Close dependency between vect_model_*_cost and vectorizable_*
fb85abff 3962 functions. Design better to avoid maintenance issues. */
fb85abff 3963
48e1416a 3964/* Function vect_model_reduction_cost.
3965
3966 Models cost for a reduction operation, including the vector ops
fb85abff 3967 generated within the strip-mine loop, the initial definition before
3968 the loop, and the epilogue code that must be generated. */
3969
6ce96a53 3970static void
e53664fa 3971vect_model_reduction_cost (stmt_vec_info stmt_info, internal_fn reduc_fn,
c863e35b 3972 int ncopies, stmt_vector_for_cost *cost_vec)
fb85abff 3973{
3bf95150 3974 int prologue_cost = 0, epilogue_cost = 0, inside_cost;
fb85abff 3975 enum tree_code code;
3976 optab optab;
3977 tree vectype;
3754d046 3978 machine_mode mode;
fb85abff 3979 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2e966e2a 3980 class loop *loop = NULL;
95fd3578 3981
3982 if (loop_vinfo)
c863e35b 3983 loop = LOOP_VINFO_LOOP (loop_vinfo);
fb85abff 3984
d09d8733 3985 /* Condition reductions generate two reductions in the loop. */
3bf95150 3986 vect_reduction_type reduction_type
3987 = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info);
3988 if (reduction_type == COND_REDUCTION)
d09d8733 3989 ncopies *= 2;
3990
6ce96a53 3991 vectype = STMT_VINFO_VECTYPE (stmt_info);
fb85abff 3992 mode = TYPE_MODE (vectype);
4a59791f 3993 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
fb85abff 3994
aebdbd31 3995 code = gimple_assign_rhs_code (orig_stmt_info->stmt);
fb85abff 3996
d77809a4 3997 if (reduction_type == EXTRACT_LAST_REDUCTION
3998 || reduction_type == FOLD_LEFT_REDUCTION)
3bf95150 3999 {
4000 /* No extra instructions needed in the prologue. */
4001 prologue_cost = 0;
4002
d77809a4 4003 if (reduction_type == EXTRACT_LAST_REDUCTION || reduc_fn != IFN_LAST)
4004 /* Count one reduction-like operation per vector. */
c863e35b 4005 inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
4006 stmt_info, 0, vect_body);
d77809a4 4007 else
4008 {
4009 /* Use NELEMENTS extracts and NELEMENTS scalar ops. */
4010 unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
c863e35b 4011 inside_cost = record_stmt_cost (cost_vec, nelements,
4012 vec_to_scalar, stmt_info, 0,
4013 vect_body);
4014 inside_cost += record_stmt_cost (cost_vec, nelements,
4015 scalar_stmt, stmt_info, 0,
4016 vect_body);
d77809a4 4017 }
3bf95150 4018 }
4019 else
4020 {
4021 /* Add in cost for initial definition.
4022 For cond reduction we have four vectors: initial index, step,
4023 initial result of the data reduction, initial value of the index
4024 reduction. */
4025 int prologue_stmts = reduction_type == COND_REDUCTION ? 4 : 1;
c863e35b 4026 prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
4027 scalar_to_vec, stmt_info, 0,
4028 vect_prologue);
3bf95150 4029
4030 /* Cost of reduction op inside loop. */
c863e35b 4031 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
4032 stmt_info, 0, vect_body);
3bf95150 4033 }
fb85abff 4034
4035 /* Determine cost of epilogue code.
4036
4037 We have a reduction operator that will reduce the vector in one statement.
4038 Also requires scalar extract. */
4039
aebdbd31 4040 if (!loop || !nested_in_vect_loop_p (loop, orig_stmt_info))
fb85abff 4041 {
e53664fa 4042 if (reduc_fn != IFN_LAST)
f97dec81 4043 {
3bf95150 4044 if (reduction_type == COND_REDUCTION)
d09d8733 4045 {
 4046 		  /* An EQ stmt and a COND_EXPR stmt.  */
c863e35b 4047 epilogue_cost += record_stmt_cost (cost_vec, 2,
4048 vector_stmt, stmt_info, 0,
4049 vect_epilogue);
d09d8733 4050 /* Reduction of the max index and a reduction of the found
4051 values. */
c863e35b 4052 epilogue_cost += record_stmt_cost (cost_vec, 2,
4053 vec_to_scalar, stmt_info, 0,
4054 vect_epilogue);
d09d8733 4055 /* A broadcast of the max value. */
c863e35b 4056 epilogue_cost += record_stmt_cost (cost_vec, 1,
4057 scalar_to_vec, stmt_info, 0,
4058 vect_epilogue);
d09d8733 4059 }
4060 else
4061 {
c863e35b 4062 epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
4063 stmt_info, 0, vect_epilogue);
4064 epilogue_cost += record_stmt_cost (cost_vec, 1,
4065 vec_to_scalar, stmt_info, 0,
4066 vect_epilogue);
d09d8733 4067 }
f97dec81 4068 }
3bf95150 4069 else if (reduction_type == COND_REDUCTION)
c07fcd5e 4070 {
09de8b78 4071 unsigned estimated_nunits = vect_nunits_for_cost (vectype);
c07fcd5e 4072 /* Extraction of scalar elements. */
c863e35b 4073 epilogue_cost += record_stmt_cost (cost_vec,
4074 2 * estimated_nunits,
4075 vec_to_scalar, stmt_info, 0,
4076 vect_epilogue);
c07fcd5e 4077 /* Scalar max reductions via COND_EXPR / MAX_EXPR. */
c863e35b 4078 epilogue_cost += record_stmt_cost (cost_vec,
4079 2 * estimated_nunits - 3,
4080 scalar_stmt, stmt_info, 0,
4081 vect_epilogue);
c07fcd5e 4082 }
d77809a4 4083 else if (reduction_type == EXTRACT_LAST_REDUCTION
4084 || reduction_type == FOLD_LEFT_REDUCTION)
3bf95150 4085 	/* No extra instructions needed in the epilogue.  */
4086 ;
48e1416a 4087 else
fb85abff 4088 {
e913b5cd 4089 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
fb85abff 4090 tree bitsize =
aebdbd31 4091 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt_info->stmt)));
e913b5cd 4092 int element_bitsize = tree_to_uhwi (bitsize);
fb85abff 4093 int nelements = vec_size_in_bits / element_bitsize;
4094
c07fcd5e 4095 if (code == COND_EXPR)
4096 code = MAX_EXPR;
4097
fb85abff 4098 optab = optab_for_tree_code (code, vectype, optab_default);
4099
4100 /* We have a whole vector shift available. */
c07fcd5e 4101 if (optab != unknown_optab
4102 && VECTOR_MODE_P (mode)
d6bf3b14 4103 && optab_handler (optab, mode) != CODE_FOR_nothing
b974a688 4104 && have_whole_vector_shift (mode))
f97dec81 4105 {
4106 /* Final reduction via vector shifts and the reduction operator.
4107 Also requires scalar extract. */
c863e35b 4108 epilogue_cost += record_stmt_cost (cost_vec,
4109 exact_log2 (nelements) * 2,
4110 vector_stmt, stmt_info, 0,
4111 vect_epilogue);
4112 epilogue_cost += record_stmt_cost (cost_vec, 1,
4113 vec_to_scalar, stmt_info, 0,
4114 vect_epilogue);
f97dec81 4115 }
fb85abff 4116 else
f97dec81 4117 /* Use extracts and reduction op for final reduction. For N
4118 elements, we have N extracts and N-1 reduction ops. */
c863e35b 4119 epilogue_cost += record_stmt_cost (cost_vec,
4120 nelements + nelements - 1,
4121 vector_stmt, stmt_info, 0,
4122 vect_epilogue);
fb85abff 4123 }
4124 }
4125
6d8fb6cf 4126 if (dump_enabled_p ())
7bd765d4 4127 dump_printf (MSG_NOTE,
4128 "vect_model_reduction_cost: inside_cost = %d, "
78bb46f5 4129 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
7bd765d4 4130 prologue_cost, epilogue_cost);
fb85abff 4131}
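/* Rough illustration of the accounting above (assuming a plain sum reduction
   whose target provides REDUC_FN and which is not nested): one scalar_to_vec
   in the prologue for the initial value, NCOPIES vector_stmts in the loop
   body, and one vector_stmt plus one vec_to_scalar in the epilogue for the
   final reduction and the extract of the scalar result.  */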
4132
4133
4134/* Function vect_model_induction_cost.
4135
4136 Models cost for induction operations. */
4137
4138static void
c863e35b 4139vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies,
4140 stmt_vector_for_cost *cost_vec)
fb85abff 4141{
f97dec81 4142 unsigned inside_cost, prologue_cost;
4db2b577 4143
5cc7beaa 4144 if (PURE_SLP_STMT (stmt_info))
4145 return;
4146
fb85abff 4147 /* loop cost for vec_loop. */
c863e35b 4148 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
4149 stmt_info, 0, vect_body);
4db2b577 4150
fb85abff 4151 /* prologue cost for vec_init and vec_step. */
c863e35b 4152 prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
4153 stmt_info, 0, vect_prologue);
48e1416a 4154
6d8fb6cf 4155 if (dump_enabled_p ())
7bd765d4 4156 dump_printf_loc (MSG_NOTE, vect_location,
4157 "vect_model_induction_cost: inside_cost = %d, "
78bb46f5 4158 "prologue_cost = %d .\n", inside_cost, prologue_cost);
fb85abff 4159}
4160
4161
fb85abff 4162
4163/* Function get_initial_def_for_reduction
4164
4165 Input:
ecc42a77 4166 STMT_VINFO - a stmt that performs a reduction operation in the loop.
fb85abff 4167 INIT_VAL - the initial value of the reduction variable
4168
4169 Output:
4170 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
4171 of the reduction (used for adjusting the epilog - see below).
ecc42a77 4172 Return a vector variable, initialized according to the operation that
4173 STMT_VINFO performs. This vector will be used as the initial value
4174 of the vector of partial results.
fb85abff 4175
4176 Option1 (adjust in epilog): Initialize the vector as follows:
0df23b96 4177 add/bit or/xor: [0,0,...,0,0]
4178 mult/bit and: [1,1,...,1,1]
4179 min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
fb85abff 4180 and when necessary (e.g. add/mult case) let the caller know
4181 that it needs to adjust the result by init_val.
4182
4183 Option2: Initialize the vector as follows:
0df23b96 4184 add/bit or/xor: [init_val,0,0,...,0]
4185 mult/bit and: [init_val,1,1,...,1]
4186 min/max/cond_expr: [init_val,init_val,...,init_val]
fb85abff 4187 and no adjustments are needed.
4188
4189 For example, for the following code:
4190
4191 s = init_val;
4192 for (i=0;i<n;i++)
4193 s = s + a[i];
4194
ecc42a77 4195 STMT_VINFO is 's = s + a[i]', and the reduction variable is 's'.
fb85abff 4196 For a vector of 4 units, we want to return either [0,0,0,init_val],
4197 or [0,0,0,0] and let the caller know that it needs to adjust
4198 the result at the end by 'init_val'.
4199
 4200    FORNOW, we use the 'adjust in epilog' scheme (Option1) when ADJUSTMENT_DEF
7aa0d350 4201    is not NULL, because this way the initialization vector is simpler (same
 4202    element in all entries), and Option2 otherwise.
48e1416a 4203
fb85abff 4204 A cost model should help decide between these two schemes. */
4205
4206tree
ecc42a77 4207get_initial_def_for_reduction (stmt_vec_info stmt_vinfo, tree init_val,
7aa0d350 4208 tree *adjustment_def)
fb85abff 4209{
fb85abff 4210 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
2e966e2a 4211 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1efcacec 4212 tree scalar_type = TREE_TYPE (init_val);
4213 tree vectype = get_vectype_for_scalar_type (scalar_type);
a73182ff 4214 enum tree_code code = gimple_assign_rhs_code (stmt_vinfo->stmt);
fb85abff 4215 tree def_for_init;
4216 tree init_def;
7aa0d350 4217 REAL_VALUE_TYPE real_init_val = dconst0;
4218 int int_init_val = 0;
0464ea95 4219 gimple_seq stmts = NULL;
fb85abff 4220
1efcacec 4221 gcc_assert (vectype);
1efcacec 4222
4223 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
4224 || SCALAR_FLOAT_TYPE_P (scalar_type));
7aa0d350 4225
a73182ff 4226 gcc_assert (nested_in_vect_loop_p (loop, stmt_vinfo)
4227 || loop == (gimple_bb (stmt_vinfo->stmt))->loop_father);
fb85abff 4228
3bf95150 4229 vect_reduction_type reduction_type
4230 = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_vinfo);
4231
7aa0d350 4232 switch (code)
4233 {
eab42b58 4234 case WIDEN_SUM_EXPR:
4235 case DOT_PROD_EXPR:
4236 case SAD_EXPR:
4237 case PLUS_EXPR:
4238 case MINUS_EXPR:
4239 case BIT_IOR_EXPR:
4240 case BIT_XOR_EXPR:
4241 case MULT_EXPR:
4242 case BIT_AND_EXPR:
4243 {
fdf40949 4244 /* ADJUSTMENT_DEF is NULL when called from
7aa0d350 4245 	   vect_create_epilog_for_reduction to vectorize a double reduction.  */
4246 if (adjustment_def)
a890896f 4247 *adjustment_def = init_val;
7aa0d350 4248
b036fcd8 4249 if (code == MULT_EXPR)
7aa0d350 4250 {
4251 real_init_val = dconst1;
4252 int_init_val = 1;
4253 }
4254
b036fcd8 4255 if (code == BIT_AND_EXPR)
4256 int_init_val = -1;
4257
7aa0d350 4258 if (SCALAR_FLOAT_TYPE_P (scalar_type))
4259 def_for_init = build_real (scalar_type, real_init_val);
4260 else
4261 def_for_init = build_int_cst (scalar_type, int_init_val);
4262
eab42b58 4263 if (adjustment_def)
9ed1960b 4264 /* Option1: the first element is '0' or '1' as well. */
4265 init_def = gimple_build_vector_from_val (&stmts, vectype,
4266 def_for_init);
633af029 4267 else if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
4268 {
4269 /* Option2 (variable length): the first element is INIT_VAL. */
d5a19a73 4270 init_def = gimple_build_vector_from_val (&stmts, vectype,
4271 def_for_init);
4272 init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
4273 vectype, init_def, init_val);
633af029 4274 }
9ed1960b 4275 else
eab42b58 4276 {
9ed1960b 4277 /* Option2: the first element is INIT_VAL. */
db39ad9d 4278 tree_vector_builder elts (vectype, 1, 2);
9ed1960b 4279 elts.quick_push (init_val);
db39ad9d 4280 elts.quick_push (def_for_init);
4281 init_def = gimple_build_vector (&stmts, &elts);
fadf62f4 4282 }
eab42b58 4283 }
4284 break;
7aa0d350 4285
eab42b58 4286 case MIN_EXPR:
4287 case MAX_EXPR:
4288 case COND_EXPR:
4289 {
d09d8733 4290 if (adjustment_def)
7aa0d350 4291 {
d09d8733 4292 *adjustment_def = NULL_TREE;
3bf95150 4293 if (reduction_type != COND_REDUCTION
4294 && reduction_type != EXTRACT_LAST_REDUCTION)
d09d8733 4295 {
a73182ff 4296 init_def = vect_get_vec_def_for_operand (init_val, stmt_vinfo);
d09d8733 4297 break;
4298 }
4299 }
0464ea95 4300 init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
9ed1960b 4301 init_def = gimple_build_vector_from_val (&stmts, vectype, init_val);
eab42b58 4302 }
4303 break;
7aa0d350 4304
eab42b58 4305 default:
4306 gcc_unreachable ();
7aa0d350 4307 }
fb85abff 4308
9ed1960b 4309 if (stmts)
4310 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
fb85abff 4311 return init_def;
4312}
4313
6154acba 4314/* Get at the initial defs for the reduction PHIs in SLP_NODE.
633af029 4315 NUMBER_OF_VECTORS is the number of vector defs to create.
4316 If NEUTRAL_OP is nonnull, introducing extra elements of that
4317 value will not change the result. */
4f0d4cce 4318
4319static void
4320get_initial_defs_for_reduction (slp_tree slp_node,
4321 vec<tree> *vec_oprnds,
4322 unsigned int number_of_vectors,
633af029 4323 bool reduc_chain, tree neutral_op)
4f0d4cce 4324{
06bb64b8 4325 vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
4326 stmt_vec_info stmt_vinfo = stmts[0];
633af029 4327 unsigned HOST_WIDE_INT nunits;
4f0d4cce 4328 unsigned j, number_of_places_left_in_vector;
633af029 4329 tree vector_type;
efda2654 4330 unsigned int group_size = stmts.length ();
4331 unsigned int i;
2e966e2a 4332 class loop *loop;
4f0d4cce 4333
4334 vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
4f0d4cce 4335
6154acba 4336 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def);
4337
06bb64b8 4338 loop = (gimple_bb (stmt_vinfo->stmt))->loop_father;
6154acba 4339 gcc_assert (loop);
9ed1960b 4340 edge pe = loop_preheader_edge (loop);
4f0d4cce 4341
633af029 4342 gcc_assert (!reduc_chain || neutral_op);
4f0d4cce 4343
4344 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
4345 created vectors. It is greater than 1 if unrolling is performed.
4346
4347 For example, we have two scalar operands, s1 and s2 (e.g., group of
4348 strided accesses of size two), while NUNITS is four (i.e., four scalars
4349 of this type can be packed in a vector). The output vector will contain
4350 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
4351 will be 2).
4352
e1009321 4353 If REDUC_GROUP_SIZE > NUNITS, the scalars will be split into several
4354 vectors containing the operands.
4f0d4cce 4355
4356 For example, NUNITS is four as before, and the group size is 8
4357 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
4358 {s5, s6, s7, s8}. */
4359
633af029 4360 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
4361 nunits = group_size;
4362
4f0d4cce 4363 number_of_places_left_in_vector = nunits;
633af029 4364 bool constant_p = true;
db39ad9d 4365 tree_vector_builder elts (vector_type, nunits, 1);
eab42b58 4366 elts.quick_grow (nunits);
b4271aab 4367 gimple_seq ctor_seq = NULL;
efda2654 4368 for (j = 0; j < nunits * number_of_vectors; ++j)
4f0d4cce 4369 {
efda2654 4370 tree op;
4371 i = j % group_size;
4372 stmt_vinfo = stmts[i];
4f0d4cce 4373
efda2654 4374       /* Get the def before the loop.  In a reduction chain we have only one
 4375 	 initial value; otherwise we have as many as there are PHIs in the group.  */
4376 if (reduc_chain)
4377 op = j != 0 ? neutral_op : PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt, pe);
4378 else if (((vec_oprnds->length () + 1) * nunits
4379 - number_of_places_left_in_vector >= group_size)
4380 && neutral_op)
4381 op = neutral_op;
4382 else
4383 op = PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt, pe);
4f0d4cce 4384
efda2654 4385 /* Create 'vect_ = {op0,op1,...,opn}'. */
4386 number_of_places_left_in_vector--;
4387 elts[nunits - number_of_places_left_in_vector - 1] = op;
4388 if (!CONSTANT_CLASS_P (op))
4389 constant_p = false;
4390
4391 if (number_of_places_left_in_vector == 0)
4392 {
efda2654 4393 tree init;
4394 if (constant_p && !neutral_op
4395 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
4396 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
4397 /* Build the vector directly from ELTS. */
4398 init = gimple_build_vector (&ctor_seq, &elts);
4399 else if (neutral_op)
4400 {
4401 /* Build a vector of the neutral value and shift the
4402 other elements into place. */
4403 init = gimple_build_vector_from_val (&ctor_seq, vector_type,
4404 neutral_op);
4405 int k = nunits;
4406 while (k > 0 && elts[k - 1] == neutral_op)
4407 k -= 1;
4408 while (k > 0)
633af029 4409 {
efda2654 4410 k -= 1;
4411 init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
4412 vector_type, init, elts[k]);
633af029 4413 }
efda2654 4414 }
4415 else
9ed1960b 4416 {
efda2654 4417 /* First time round, duplicate ELTS to fill the
b4271aab 4418 required number of vectors. */
4419 duplicate_and_interleave (&ctor_seq, vector_type, elts,
4420 number_of_vectors, *vec_oprnds);
4421 break;
9ed1960b 4422 }
efda2654 4423 vec_oprnds->quick_push (init);
4424
4425 number_of_places_left_in_vector = nunits;
4426 elts.new_vector (vector_type, nunits, 1);
4427 elts.quick_grow (nunits);
4428 constant_p = true;
4429 }
4f0d4cce 4430 }
b4271aab 4431 if (ctor_seq != NULL)
4432 gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4f0d4cce 4433}
4434
4435
fb85abff 4436/* Function vect_create_epilog_for_reduction
48e1416a 4437
fb85abff 4438 Create code at the loop-epilog to finalize the result of a reduction
eefa05c8 4439 computation.
4440
4441 VECT_DEFS is list of vector of partial results, i.e., the lhs's of vector
4442 reduction statements.
ecc42a77 4443 STMT_INFO is the scalar reduction stmt that is being vectorized.
fb85abff 4444 NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
282bf14c 4445 number of elements that we can fit in a vectype (nunits). In this case
fb85abff 4446 we have to generate more than one vector stmt - i.e - we need to "unroll"
4447 the vector stmt by a factor VF/nunits. For more details see documentation
4448 in vectorizable_operation.
e53664fa 4449 REDUC_FN is the internal function for the epilog reduction.
eefa05c8 4450 REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
4451 computation.
4452 REDUC_INDEX is the index of the operand in the right hand side of the
ade2ac53 4453 statement that is defined by REDUCTION_PHI.
7aa0d350 4454 DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
eefa05c8 4455 SLP_NODE is an SLP node containing a group of reduction statements. The
ecc42a77 4456 first one in this group is STMT_INFO.
fdf40949 4457 INDUC_VAL is for INTEGER_INDUC_COND_REDUCTION the value to use for the case
4458 when the COND_EXPR is never true in the loop. For MAX_EXPR, it needs to
4459 be smaller than any value of the IV in the loop, for MIN_EXPR larger than
4460 any value of the IV in the loop.
4461 INDUC_CODE is the code for epilog reduction if INTEGER_INDUC_COND_REDUCTION.
633af029 4462 NEUTRAL_OP is the value given by neutral_op_for_slp_reduction; it is
4463 null if this is not an SLP reduction
fb85abff 4464
4465 This function:
eefa05c8 4466 1. Creates the reduction def-use cycles: sets the arguments for
4467 REDUCTION_PHIS:
fb85abff 4468 The loop-entry argument is the vectorized initial-value of the reduction.
eefa05c8 4469 The loop-latch argument is taken from VECT_DEFS - the vector of partial
4470 sums.
4471 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
e53664fa 4472 by calling the function specified by REDUC_FN if available, or by
fb85abff 4473 other means (whole-vector shifts or a scalar loop).
48e1416a 4474 The function also creates a new phi node at the loop exit to preserve
fb85abff 4475 loop-closed form, as illustrated below.
48e1416a 4476
fb85abff 4477 The flow at the entry to this function:
48e1416a 4478
fb85abff 4479 loop:
4480 vec_def = phi <null, null> # REDUCTION_PHI
ecc42a77 4481 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
4482 s_loop = scalar_stmt # (scalar) STMT_INFO
fb85abff 4483 loop_exit:
4484 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4485 use <s_out0>
4486 use <s_out0>
4487
4488 The above is transformed by this function into:
4489
4490 loop:
4491 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
ecc42a77 4492 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
4493 s_loop = scalar_stmt # (scalar) STMT_INFO
fb85abff 4494 loop_exit:
4495 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4496 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4497 v_out2 = reduce <v_out1>
4498 s_out3 = extract_field <v_out2, 0>
4499 s_out4 = adjust_result <s_out3>
4500 use <s_out4>
4501 use <s_out4>
4502*/
4503
4504static void
ecc42a77 4505vect_create_epilog_for_reduction (vec<tree> vect_defs,
4506 stmt_vec_info stmt_info,
f17c6474 4507 gimple *reduc_def_stmt,
e53664fa 4508 int ncopies, internal_fn reduc_fn,
dc1fb456 4509 vec<stmt_vec_info> reduction_phis,
6154acba 4510 bool double_reduc,
4511 slp_tree slp_node,
fdf40949 4512 slp_instance slp_node_instance,
633af029 4513 tree induc_val, enum tree_code induc_code,
4514 tree neutral_op)
fb85abff 4515{
fb85abff 4516 stmt_vec_info prev_phi_info;
4517 tree vectype;
3754d046 4518 machine_mode mode;
fb85abff 4519 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2e966e2a 4520 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
fb85abff 4521 basic_block exit_bb;
4522 tree scalar_dest;
4523 tree scalar_type;
42acab1c 4524 gimple *new_phi = NULL, *phi;
dc1fb456 4525 stmt_vec_info phi_info;
fb85abff 4526 gimple_stmt_iterator exit_gsi;
4527 tree vec_dest;
eefa05c8 4528 tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
42acab1c 4529 gimple *epilog_stmt = NULL;
a73182ff 4530 enum tree_code code = gimple_assign_rhs_code (stmt_info->stmt);
42acab1c 4531 gimple *exit_phi;
fb6b80a0 4532 tree bitsize;
eefa05c8 4533 tree adjustment_def = NULL;
4534 tree vec_initial_def = NULL;
c12cfa6e 4535 tree expr, def, initial_def = NULL;
eefa05c8 4536 tree orig_name, scalar_result;
b219ece3 4537 imm_use_iterator imm_iter, phi_imm_iter;
4538 use_operand_p use_p, phi_use_p;
dc1fb456 4539 gimple *use_stmt;
4540 stmt_vec_info reduction_phi_info = NULL;
fb85abff 4541 bool nested_in_vect_loop = false;
42acab1c 4542 auto_vec<gimple *> new_phis;
435515db 4543 auto_vec<stmt_vec_info> inner_phis;
fb85abff 4544 int j, i;
c2078b80 4545 auto_vec<tree> scalar_results;
47deb25f 4546 unsigned int group_size = 1, k, ratio;
c2078b80 4547 auto_vec<tree> vec_initial_defs;
42acab1c 4548 auto_vec<gimple *> phis;
39a5d6b1 4549 bool slp_reduc = false;
633af029 4550 bool direct_slp_reduc;
39a5d6b1 4551 tree new_phi_result;
435515db 4552 stmt_vec_info inner_phi = NULL;
c12cfa6e 4553 tree induction_index = NULL_TREE;
eefa05c8 4554
4555 if (slp_node)
f1f41a6c 4556 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
48e1416a 4557
a73182ff 4558 if (nested_in_vect_loop_p (loop, stmt_info))
fb85abff 4559 {
7aa0d350 4560 outer_loop = loop;
fb85abff 4561 loop = loop->inner;
4562 nested_in_vect_loop = true;
eefa05c8 4563 gcc_assert (!slp_node);
fb85abff 4564 }
48e1416a 4565
c12cfa6e 4566 vectype = STMT_VINFO_VECTYPE (stmt_info);
fb85abff 4567 gcc_assert (vectype);
4568 mode = TYPE_MODE (vectype);
4569
eefa05c8 4570 /* 1. Create the reduction def-use cycle:
4571 Set the arguments of REDUCTION_PHIS, i.e., transform
48e1416a 4572
eefa05c8 4573 loop:
4574 vec_def = phi <null, null> # REDUCTION_PHI
4575 VECT_DEF = vector_stmt # vectorized form of STMT
4576 ...
fb85abff 4577
eefa05c8 4578 into:
4579
4580 loop:
4581 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4582 VECT_DEF = vector_stmt # vectorized form of STMT
4583 ...
4584
4585 (in case of SLP, do it for all the phis). */
4586
4587 /* Get the loop-entry arguments. */
a890896f 4588 enum vect_def_type initial_def_dt = vect_unknown_def_type;
eefa05c8 4589 if (slp_node)
4f0d4cce 4590 {
4591 unsigned vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
4592 vec_initial_defs.reserve (vec_num);
6154acba 4593 get_initial_defs_for_reduction (slp_node_instance->reduc_phis,
633af029 4594 &vec_initial_defs, vec_num,
e1009321 4595 REDUC_GROUP_FIRST_ELEMENT (stmt_info),
633af029 4596 neutral_op);
4f0d4cce 4597 }
eefa05c8 4598 else
4599 {
5cc2ea45 4600 /* Get at the scalar def before the loop, that defines the initial value
4601 of the reduction variable. */
f17c6474 4602 initial_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
b4552064 4603 loop_preheader_edge (loop));
fdf40949 4604 /* Optimize: if initial_def is for REDUC_MAX smaller than the base
4605 and we can't use zero for induc_val, use initial_def. Similarly
4606 for REDUC_MIN and initial_def larger than the base. */
4607 if (TREE_CODE (initial_def) == INTEGER_CST
4608 && (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4609 == INTEGER_INDUC_COND_REDUCTION)
4610 && !integer_zerop (induc_val)
bbe863be 4611 && ((induc_code == MAX_EXPR
fdf40949 4612 && tree_int_cst_lt (initial_def, induc_val))
bbe863be 4613 || (induc_code == MIN_EXPR
fdf40949 4614 && tree_int_cst_lt (induc_val, initial_def))))
4615 induc_val = initial_def;
3dd3e23d 4616
4617 if (double_reduc)
4618 /* In case of double reduction we only create a vector variable
4619 to be put in the reduction phi node. The actual statement
4620 creation is done later in this function. */
4621 vec_initial_def = vect_create_destination_var (initial_def, vectype);
4622 else if (nested_in_vect_loop)
4623 {
4624 /* Do not use an adjustment def as that case is not supported
4625 correctly if ncopies is not one. */
4626 vect_is_simple_use (initial_def, loop_vinfo, &initial_def_dt);
a73182ff 4627 vec_initial_def = vect_get_vec_def_for_operand (initial_def,
4628 stmt_info);
3dd3e23d 4629 }
4630 else
a73182ff 4631 vec_initial_def
4632 = get_initial_def_for_reduction (stmt_info, initial_def,
4633 &adjustment_def);
a890896f 4634 vec_initial_defs.create (1);
f1f41a6c 4635 vec_initial_defs.quick_push (vec_initial_def);
eefa05c8 4636 }
4637
4638 /* Set phi nodes arguments. */
dc1fb456 4639 FOR_EACH_VEC_ELT (reduction_phis, i, phi_info)
fb85abff 4640 {
9ed1960b 4641 tree vec_init_def = vec_initial_defs[i];
4642 tree def = vect_defs[i];
eefa05c8 4643 for (j = 0; j < ncopies; j++)
4644 {
a890896f 4645 if (j != 0)
4646 {
dc1fb456 4647 phi_info = STMT_VINFO_RELATED_STMT (phi_info);
a890896f 4648 if (nested_in_vect_loop)
4649 vec_init_def
c0dd122a 4650 = vect_get_vec_def_for_stmt_copy (loop_vinfo, vec_init_def);
a890896f 4651 }
4652
b4552064 4653 /* Set the loop-entry arg of the reduction-phi. */
4654
dc1fb456 4655 gphi *phi = as_a <gphi *> (phi_info->stmt);
b4552064 4656 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4657 == INTEGER_INDUC_COND_REDUCTION)
4658 {
 4659	      /* Initialise the reduction phi to zero. This prevents non-zero
 4660	 initial values from interfering with the reduction op. */
4661 gcc_assert (ncopies == 1);
4662 gcc_assert (i == 0);
4663
4664 tree vec_init_def_type = TREE_TYPE (vec_init_def);
fdf40949 4665 tree induc_val_vec
4666 = build_vector_from_val (vec_init_def_type, induc_val);
b4552064 4667
dc1fb456 4668 add_phi_arg (phi, induc_val_vec, loop_preheader_edge (loop),
4669 UNKNOWN_LOCATION);
b4552064 4670 }
4671 else
dc1fb456 4672 add_phi_arg (phi, vec_init_def, loop_preheader_edge (loop),
4673 UNKNOWN_LOCATION);
fb85abff 4674
eefa05c8 4675 /* Set the loop-latch arg for the reduction-phi. */
4676 if (j > 0)
c0dd122a 4677 def = vect_get_vec_def_for_stmt_copy (loop_vinfo, def);
fb85abff 4678
dc1fb456 4679 add_phi_arg (phi, def, loop_latch_edge (loop), UNKNOWN_LOCATION);
fb85abff 4680
6d8fb6cf 4681 if (dump_enabled_p ())
a4e972e3 4682 dump_printf_loc (MSG_NOTE, vect_location,
4683 "transform reduction: created def-use cycle: %G%G",
4684 phi, SSA_NAME_DEF_STMT (def));
eefa05c8 4685 }
fb85abff 4686 }
4687
c12cfa6e 4688 /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
4689 which is updated with the current index of the loop for every match of
4690 the original loop's cond_expr (VEC_STMT). This results in a vector
4691 containing the last time the condition passed for that vector lane.
4692 The first match will be a 1 to allow 0 to be used for non-matching
4693 indexes. If there are no matches at all then the vector will be all
4694 zeroes. */
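      /* Worked example (illustrative only, not from the original sources):
	 with four lanes and the condition true in lanes 0 and 2 of the first
	 iteration only, the induction IV starts as {1, 2, 3, 4} and
	 INDEX_COND_EXPR ends up as {1, 0, 3, 0}; a loop with no matches at
	 all leaves it as {0, 0, 0, 0}.  */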
4695 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
4696 {
4697 tree indx_before_incr, indx_after_incr;
ce068755 4698 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
c12cfa6e 4699
435515db 4700 gimple *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info)->stmt;
c12cfa6e 4701 gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
4702
4703 int scalar_precision
98a46e07 4704 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
c12cfa6e 4705 tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
4706 tree cr_index_vector_type = build_vector_type
4707 (cr_index_scalar_type, TYPE_VECTOR_SUBPARTS (vectype));
4708
4709 /* First we create a simple vector induction variable which starts
4710 with the values {1,2,3,...} (SERIES_VECT) and increments by the
4711 vector size (STEP). */
4712
4713 /* Create a {1,2,3,...} vector. */
ce068755 4714 tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
c12cfa6e 4715
4716 /* Create a vector of the step value. */
4717 tree step = build_int_cst (cr_index_scalar_type, nunits_out);
4718 tree vec_step = build_vector_from_val (cr_index_vector_type, step);
4719
4720 /* Create an induction variable. */
4721 gimple_stmt_iterator incr_gsi;
4722 bool insert_after;
4723 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4724 create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi,
4725 insert_after, &indx_before_incr, &indx_after_incr);
4726
4727 /* Next create a new phi node vector (NEW_PHI_TREE) which starts
4728 filled with zeros (VEC_ZERO). */
4729
4730 /* Create a vector of 0s. */
4731 tree zero = build_zero_cst (cr_index_scalar_type);
4732 tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
4733
4734 /* Create a vector phi node. */
4735 tree new_phi_tree = make_ssa_name (cr_index_vector_type);
4736 new_phi = create_phi_node (new_phi_tree, loop->header);
04b2391d 4737 loop_vinfo->add_stmt (new_phi);
c12cfa6e 4738 add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
4739 loop_preheader_edge (loop), UNKNOWN_LOCATION);
4740
 4741	      /* Now take the condition from the loop's original cond_expr
4742 (VEC_STMT) and produce a new cond_expr (INDEX_COND_EXPR) which for
4743 every match uses values from the induction variable
4744 (INDEX_BEFORE_INCR) otherwise uses values from the phi node
4745 (NEW_PHI_TREE).
4746 Finally, we update the phi (NEW_PHI_TREE) to take the value of
4747 the new cond_expr (INDEX_COND_EXPR). */
4748
4749 /* Duplicate the condition from vec_stmt. */
4750 tree ccompare = unshare_expr (gimple_assign_rhs1 (vec_stmt));
4751
4752 /* Create a conditional, where the condition is taken from vec_stmt
4753 (CCOMPARE), then is the induction index (INDEX_BEFORE_INCR) and
4754 else is the phi (NEW_PHI_TREE). */
4755 tree index_cond_expr = build3 (VEC_COND_EXPR, cr_index_vector_type,
4756 ccompare, indx_before_incr,
4757 new_phi_tree);
4758 induction_index = make_ssa_name (cr_index_vector_type);
4759 gimple *index_condition = gimple_build_assign (induction_index,
4760 index_cond_expr);
4761 gsi_insert_before (&incr_gsi, index_condition, GSI_SAME_STMT);
04b2391d 4762 stmt_vec_info index_vec_info = loop_vinfo->add_stmt (index_condition);
c12cfa6e 4763 STMT_VINFO_VECTYPE (index_vec_info) = cr_index_vector_type;
c12cfa6e 4764
4765 /* Update the phi with the vec cond. */
4766 add_phi_arg (as_a <gphi *> (new_phi), induction_index,
4767 loop_latch_edge (loop), UNKNOWN_LOCATION);
4768 }
4769
eefa05c8 4770 /* 2. Create epilog code.
4771 The reduction epilog code operates across the elements of the vector
4772 of partial results computed by the vectorized loop.
4773 The reduction epilog code consists of:
fb85abff 4774
eefa05c8 4775 step 1: compute the scalar result in a vector (v_out2)
4776 step 2: extract the scalar result (s_out3) from the vector (v_out2)
4777 step 3: adjust the scalar result (s_out3) if needed.
4778
 4779	        Step 1 can be accomplished using one of the following three schemes:
e53664fa 4780 (scheme 1) using reduc_fn, if available.
fb85abff 4781 (scheme 2) using whole-vector shifts, if available.
48e1416a 4782 (scheme 3) using a scalar loop. In this case steps 1+2 above are
fb85abff 4783 combined.
48e1416a 4784
fb85abff 4785 The overall epilog code looks like this:
4786
4787 s_out0 = phi <s_loop> # original EXIT_PHI
4788 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4789 v_out2 = reduce <v_out1> # step 1
4790 s_out3 = extract_field <v_out2, 0> # step 2
4791 s_out4 = adjust_result <s_out3> # step 3
4792
4793 (step 3 is optional, and steps 1 and 2 may be combined).
eefa05c8 4794 Lastly, the uses of s_out0 are replaced by s_out4. */
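  /* As a sketch (illustrative only): for a four-lane integer sum the partial
     results v_out1 = {s0, s1, s2, s3} are reduced by steps 1 and 2 to
     s_out3 = s0 + s1 + s2 + s3, and step 3 then adds back the initial value
     of the reduction variable when it was not already folded into the
     vector initializer.  */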
fb85abff 4795
fb85abff 4796
eefa05c8 4797 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
4798 v_out1 = phi <VECT_DEF>
4799 Store them in NEW_PHIS. */
fb85abff 4800
4801 exit_bb = single_exit (loop)->dest;
fb85abff 4802 prev_phi_info = NULL;
f1f41a6c 4803 new_phis.create (vect_defs.length ());
4804 FOR_EACH_VEC_ELT (vect_defs, i, def)
fb85abff 4805 {
eefa05c8 4806 for (j = 0; j < ncopies; j++)
4807 {
f9e245b2 4808 tree new_def = copy_ssa_name (def);
874117c8 4809 phi = create_phi_node (new_def, exit_bb);
04b2391d 4810 stmt_vec_info phi_info = loop_vinfo->add_stmt (phi);
eefa05c8 4811 if (j == 0)
f1f41a6c 4812 new_phis.quick_push (phi);
eefa05c8 4813 else
4814 {
c0dd122a 4815 def = vect_get_vec_def_for_stmt_copy (loop_vinfo, def);
aebdbd31 4816 STMT_VINFO_RELATED_STMT (prev_phi_info) = phi_info;
eefa05c8 4817 }
4818
4819 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
04b2391d 4820 prev_phi_info = phi_info;
eefa05c8 4821 }
fb85abff 4822 }
ade2ac53 4823
b219ece3 4824 /* The epilogue is created for the outer-loop, i.e., for the loop being
58045f90 4825 vectorized. Create exit phis for the outer loop. */
b219ece3 4826 if (double_reduc)
4827 {
4828 loop = outer_loop;
4829 exit_bb = single_exit (loop)->dest;
f1f41a6c 4830 inner_phis.create (vect_defs.length ());
4831 FOR_EACH_VEC_ELT (new_phis, i, phi)
58045f90 4832 {
435515db 4833 stmt_vec_info phi_info = loop_vinfo->lookup_stmt (phi);
f9e245b2 4834 tree new_result = copy_ssa_name (PHI_RESULT (phi));
1a91d914 4835 gphi *outer_phi = create_phi_node (new_result, exit_bb);
58045f90 4836 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4837 PHI_RESULT (phi));
04b2391d 4838 prev_phi_info = loop_vinfo->add_stmt (outer_phi);
435515db 4839 inner_phis.quick_push (phi_info);
f1f41a6c 4840 new_phis[i] = outer_phi;
435515db 4841 while (STMT_VINFO_RELATED_STMT (phi_info))
58045f90 4842 {
435515db 4843 phi_info = STMT_VINFO_RELATED_STMT (phi_info);
4844 new_result = copy_ssa_name (PHI_RESULT (phi_info->stmt));
874117c8 4845 outer_phi = create_phi_node (new_result, exit_bb);
58045f90 4846 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
435515db 4847 PHI_RESULT (phi_info->stmt));
04b2391d 4848 stmt_vec_info outer_phi_info = loop_vinfo->add_stmt (outer_phi);
aebdbd31 4849 STMT_VINFO_RELATED_STMT (prev_phi_info) = outer_phi_info;
04b2391d 4850 prev_phi_info = outer_phi_info;
58045f90 4851 }
4852 }
b219ece3 4853 }
4854
fb85abff 4855 exit_gsi = gsi_after_labels (exit_bb);
4856
48e1416a 4857 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
e53664fa 4858 (i.e. when reduc_fn is not available) and in the final adjustment
fb85abff 4859 code (if needed). Also get the original scalar reduction variable as
48e1416a 4860 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
4861 represents a reduction pattern), the tree-code and scalar-def are
4862 taken from the original stmt that the pattern-stmt (STMT) replaces.
fb85abff 4863 Otherwise (it is a regular reduction) - the tree-code and scalar-def
48e1416a 4864 are taken from STMT. */
fb85abff 4865
4a59791f 4866 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
4867 if (orig_stmt_info != stmt_info)
fb85abff 4868 {
4869 /* Reduction pattern */
aebdbd31 4870 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
4871 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info);
fb85abff 4872 }
ade2ac53 4873
aebdbd31 4874 code = gimple_assign_rhs_code (orig_stmt_info->stmt);
eefa05c8 4875 /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
4876 partial results are added and not subtracted. */
4877 if (code == MINUS_EXPR)
4878 code = PLUS_EXPR;
4879
aebdbd31 4880 scalar_dest = gimple_assign_lhs (orig_stmt_info->stmt);
fb85abff 4881 scalar_type = TREE_TYPE (scalar_dest);
f1f41a6c 4882 scalar_results.create (group_size);
fb85abff 4883 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
4884 bitsize = TYPE_SIZE (scalar_type);
fb85abff 4885
fb85abff 4886 /* In case this is a reduction in an inner-loop while vectorizing an outer
4887 loop - we don't need to extract a single scalar result at the end of the
7aa0d350 4888	     inner-loop (unless it is a double reduction, i.e., the use of the reduction is
282bf14c 4889 outside the outer-loop). The final vector of partial results will be used
7aa0d350 4890 in the vectorized outer-loop, or reduced to a scalar result at the end of
4891 the outer-loop. */
4892 if (nested_in_vect_loop && !double_reduc)
fb85abff 4893 goto vect_finalize_reduction;
4894
39a5d6b1 4895 /* SLP reduction without reduction chain, e.g.,
4896 # a1 = phi <a2, a0>
4897 # b1 = phi <b2, b0>
4898 a2 = operation (a1)
4899 b2 = operation (b1) */
1c2fef9a 4900 slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (stmt_info));
39a5d6b1 4901
633af029 4902 /* True if we should implement SLP_REDUC using native reduction operations
4903 instead of scalar operations. */
4904 direct_slp_reduc = (reduc_fn != IFN_LAST
4905 && slp_reduc
4906 && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
4907
39a5d6b1 4908 /* In case of reduction chain, e.g.,
4909 # a1 = phi <a3, a0>
4910 a2 = operation (a1)
4911 a3 = operation (a2),
4912
4913 we may end up with more than one vector result. Here we reduce them to
4914 one vector. */
1c2fef9a 4915 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) || direct_slp_reduc)
39a5d6b1 4916 {
f1f41a6c 4917 tree first_vect = PHI_RESULT (new_phis[0]);
1a91d914 4918 gassign *new_vec_stmt = NULL;
39a5d6b1 4919 vec_dest = vect_create_destination_var (scalar_dest, vectype);
f1f41a6c 4920 for (k = 1; k < new_phis.length (); k++)
39a5d6b1 4921 {
42acab1c 4922 gimple *next_phi = new_phis[k];
39a5d6b1 4923 tree second_vect = PHI_RESULT (next_phi);
5e84534b 4924 tree tem = make_ssa_name (vec_dest, new_vec_stmt);
4925 new_vec_stmt = gimple_build_assign (tem, code,
4926 first_vect, second_vect);
39a5d6b1 4927 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
5e84534b 4928 first_vect = tem;
39a5d6b1 4929 }
4930
4931 new_phi_result = first_vect;
2f4ce795 4932 if (new_vec_stmt)
4933 {
f1f41a6c 4934 new_phis.truncate (0);
4935 new_phis.safe_push (new_vec_stmt);
2f4ce795 4936 }
39a5d6b1 4937 }
5e84534b 4938	  /* Likewise if we couldn't use a single def-use cycle. */
4939 else if (ncopies > 1)
4940 {
4941 gcc_assert (new_phis.length () == 1);
4942 tree first_vect = PHI_RESULT (new_phis[0]);
4943 gassign *new_vec_stmt = NULL;
4944 vec_dest = vect_create_destination_var (scalar_dest, vectype);
819b1150 4945 stmt_vec_info next_phi_info = loop_vinfo->lookup_stmt (new_phis[0]);
5e84534b 4946 for (int k = 1; k < ncopies; ++k)
4947 {
819b1150 4948 next_phi_info = STMT_VINFO_RELATED_STMT (next_phi_info);
4949 tree second_vect = PHI_RESULT (next_phi_info->stmt);
5e84534b 4950 tree tem = make_ssa_name (vec_dest, new_vec_stmt);
4951 new_vec_stmt = gimple_build_assign (tem, code,
4952 first_vect, second_vect);
4953 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4954 first_vect = tem;
4955 }
4956 new_phi_result = first_vect;
4957 new_phis.truncate (0);
4958 new_phis.safe_push (new_vec_stmt);
4959 }
39a5d6b1 4960 else
f1f41a6c 4961 new_phi_result = PHI_RESULT (new_phis[0]);
d09d8733 4962
c07fcd5e 4963 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
e53664fa 4964 && reduc_fn != IFN_LAST)
d09d8733 4965 {
4966 /* For condition reductions, we have a vector (NEW_PHI_RESULT) containing
4967 various data values where the condition matched and another vector
4968 (INDUCTION_INDEX) containing all the indexes of those matches. We
4969 need to extract the last matching index (which will be the index with
4970 highest value) and use this to index into the data vector.
4971 For the case where there were no matches, the data vector will contain
4972 all default values and the index vector will be all zeros. */
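      /* Illustrative example (lane values hypothetical): with
	 INDUCTION_INDEX = {0, 3, 0, 2} and NEW_PHI_RESULT = {d0, d1, d2, d3},
	 the max index is 3, the comparison below matches lane 1 only, the
	 VEC_COND yields {0, d1, 0, 0}, and the final reduction extracts d1.  */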
4973
4974 /* Get various versions of the type of the vector of indexes. */
4975 tree index_vec_type = TREE_TYPE (induction_index);
4976 gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
d09d8733 4977 tree index_scalar_type = TREE_TYPE (index_vec_type);
403a6f3c 4978 tree index_vec_cmp_type = build_same_sized_truth_vector_type
4979 (index_vec_type);
d09d8733 4980
4981 /* Get an unsigned integer version of the type of the data vector. */
3d2b0034 4982 int scalar_precision
4983 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
d09d8733 4984 tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
4985 tree vectype_unsigned = build_vector_type
4986 (scalar_type_unsigned, TYPE_VECTOR_SUBPARTS (vectype));
4987
4988 /* First we need to create a vector (ZERO_VEC) of zeros and another
4989 vector (MAX_INDEX_VEC) filled with the last matching index, which we
4990 can create using a MAX reduction and then expanding.
4991 In the case where the loop never made any matches, the max index will
4992 be zero. */
4993
4994 /* Vector of {0, 0, 0,...}. */
4995 tree zero_vec = make_ssa_name (vectype);
4996 tree zero_vec_rhs = build_zero_cst (vectype);
4997 gimple *zero_vec_stmt = gimple_build_assign (zero_vec, zero_vec_rhs);
4998 gsi_insert_before (&exit_gsi, zero_vec_stmt, GSI_SAME_STMT);
4999
5000 /* Find maximum value from the vector of found indexes. */
5001 tree max_index = make_ssa_name (index_scalar_type);
e53664fa 5002 gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5003 1, induction_index);
5004 gimple_call_set_lhs (max_index_stmt, max_index);
d09d8733 5005 gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
5006
5007 /* Vector of {max_index, max_index, max_index,...}. */
5008 tree max_index_vec = make_ssa_name (index_vec_type);
5009 tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
5010 max_index);
5011 gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
5012 max_index_vec_rhs);
5013 gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
5014
5015 /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
5016 with the vector (INDUCTION_INDEX) of found indexes, choosing values
5017 from the data vector (NEW_PHI_RESULT) for matches, 0 (ZERO_VEC)
5018 otherwise. Only one value should match, resulting in a vector
5019 (VEC_COND) with one data value and the rest zeros.
5020 In the case where the loop never made any matches, every index will
5021 match, resulting in a vector with all data values (which will all be
5022 the default value). */
5023
5024 /* Compare the max index vector to the vector of found indexes to find
5025 the position of the max value. */
403a6f3c 5026 tree vec_compare = make_ssa_name (index_vec_cmp_type);
d09d8733 5027 gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
5028 induction_index,
5029 max_index_vec);
5030 gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
5031
5032 /* Use the compare to choose either values from the data vector or
5033 zero. */
5034 tree vec_cond = make_ssa_name (vectype);
5035 gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
5036 vec_compare, new_phi_result,
5037 zero_vec);
5038 gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
5039
5040 /* Finally we need to extract the data value from the vector (VEC_COND)
 5041	     into a scalar (MATCHED_DATA_REDUC). Logically we want to do an OR
5042 reduction, but because this doesn't exist, we can use a MAX reduction
5043 instead. The data value might be signed or a float so we need to cast
5044 it first.
5045 In the case where the loop never made any matches, the data values are
5046 all identical, and so will reduce down correctly. */
5047
5048 /* Make the matched data values unsigned. */
5049 tree vec_cond_cast = make_ssa_name (vectype_unsigned);
5050 tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
5051 vec_cond);
5052 gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
5053 VIEW_CONVERT_EXPR,
5054 vec_cond_cast_rhs);
5055 gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
5056
5057 /* Reduce down to a scalar value. */
5058 tree data_reduc = make_ssa_name (scalar_type_unsigned);
e53664fa 5059 gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5060 1, vec_cond_cast);
5061 gimple_call_set_lhs (data_reduc_stmt, data_reduc);
d09d8733 5062 gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
5063
5064 /* Convert the reduced value back to the result type and set as the
5065 result. */
62ea3c0e 5066 gimple_seq stmts = NULL;
abf900f6 5067 new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
5068 data_reduc);
62ea3c0e 5069 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
d09d8733 5070 scalar_results.safe_push (new_temp);
5071 }
c07fcd5e 5072 else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
e53664fa 5073 && reduc_fn == IFN_LAST)
c07fcd5e 5074 {
e53664fa 5075 /* Condition reduction without supported IFN_REDUC_MAX. Generate
c07fcd5e 5076 idx = 0;
5077 idx_val = induction_index[0];
5078 val = data_reduc[0];
5079 for (idx = 0, val = init, i = 0; i < nelts; ++i)
5080 if (induction_index[i] > idx_val)
5081 val = data_reduc[i], idx_val = induction_index[i];
5082 return val; */
5083
5084 tree data_eltype = TREE_TYPE (TREE_TYPE (new_phi_result));
5085 tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
5086 unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
ce068755 5087 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
5088 /* Enforced by vectorizable_reduction, which ensures we have target
5089 support before allowing a conditional reduction on variable-length
5090 vectors. */
5091 unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
c07fcd5e 5092 tree idx_val = NULL_TREE, val = NULL_TREE;
5093 for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
5094 {
5095 tree old_idx_val = idx_val;
5096 tree old_val = val;
5097 idx_val = make_ssa_name (idx_eltype);
5098 epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
5099 build3 (BIT_FIELD_REF, idx_eltype,
5100 induction_index,
5101 bitsize_int (el_size),
5102 bitsize_int (off)));
5103 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5104 val = make_ssa_name (data_eltype);
5105 epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
5106 build3 (BIT_FIELD_REF,
5107 data_eltype,
5108 new_phi_result,
5109 bitsize_int (el_size),
5110 bitsize_int (off)));
5111 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5112 if (off != 0)
5113 {
5114 tree new_idx_val = idx_val;
c07fcd5e 5115 if (off != v_size - el_size)
5116 {
5117 new_idx_val = make_ssa_name (idx_eltype);
5118 epilog_stmt = gimple_build_assign (new_idx_val,
5119 MAX_EXPR, idx_val,
5120 old_idx_val);
5121 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5122 }
22eb1ed5 5123 tree new_val = make_ssa_name (data_eltype);
c07fcd5e 5124 epilog_stmt = gimple_build_assign (new_val,
5125 COND_EXPR,
5126 build2 (GT_EXPR,
5127 boolean_type_node,
5128 idx_val,
5129 old_idx_val),
5130 val, old_val);
5131 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5132 idx_val = new_idx_val;
5133 val = new_val;
5134 }
5135 }
62ea3c0e 5136 /* Convert the reduced value back to the result type and set as the
5137 result. */
5138 gimple_seq stmts = NULL;
5139 val = gimple_convert (&stmts, scalar_type, val);
5140 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
c07fcd5e 5141 scalar_results.safe_push (val);
5142 }
d09d8733 5143
fb85abff 5144 /* 2.3 Create the reduction code, using one of the three schemes described
eefa05c8 5145 above. In SLP we simply need to extract all the elements from the
5146 vector (without reducing them), so we use scalar shifts. */
e53664fa 5147 else if (reduc_fn != IFN_LAST && !slp_reduc)
fb85abff 5148 {
5149 tree tmp;
7ba68b18 5150 tree vec_elem_type;
fb85abff 5151
16ed3c2c 5152 /* Case 1: Create:
5153 v_out2 = reduc_expr <v_out1> */
fb85abff 5154
6d8fb6cf 5155 if (dump_enabled_p ())
7bd765d4 5156 dump_printf_loc (MSG_NOTE, vect_location,
78bb46f5 5157 "Reduce using direct vector reduction.\n");
fb85abff 5158
7ba68b18 5159 vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
5160 if (!useless_type_conversion_p (scalar_type, vec_elem_type))
5161 {
e53664fa 5162 tree tmp_dest
5163 = vect_create_destination_var (scalar_dest, vec_elem_type);
5164 epilog_stmt = gimple_build_call_internal (reduc_fn, 1,
5165 new_phi_result);
5166 gimple_set_lhs (epilog_stmt, tmp_dest);
7ba68b18 5167 new_temp = make_ssa_name (tmp_dest, epilog_stmt);
e53664fa 5168 gimple_set_lhs (epilog_stmt, new_temp);
7ba68b18 5169 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5170
e53664fa 5171 epilog_stmt = gimple_build_assign (new_scalar_dest, NOP_EXPR,
5172 new_temp);
7ba68b18 5173 }
5174 else
e53664fa 5175 {
5176 epilog_stmt = gimple_build_call_internal (reduc_fn, 1,
5177 new_phi_result);
5178 gimple_set_lhs (epilog_stmt, new_scalar_dest);
5179 }
b4552064 5180
7ba68b18 5181 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
e53664fa 5182 gimple_set_lhs (epilog_stmt, new_temp);
fb85abff 5183 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
b4552064 5184
fdf40949 5185 if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5186 == INTEGER_INDUC_COND_REDUCTION)
5187 && !operand_equal_p (initial_def, induc_val, 0))
b4552064 5188 {
fdf40949 5189	      /* Earlier we set the initial value to be a vector of induc_val
5190 values. Check the result and if it is induc_val then replace
5191 with the original initial value, unless induc_val is
5192 the same as initial_def already. */
5193 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5194 induc_val);
b4552064 5195
5196 tmp = make_ssa_name (new_scalar_dest);
5197 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5198 initial_def, new_temp);
5199 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5200 new_temp = tmp;
5201 }
5202
7ba68b18 5203 scalar_results.safe_push (new_temp);
fb85abff 5204 }
633af029 5205 else if (direct_slp_reduc)
5206 {
e1009321 5207 /* Here we create one vector for each of the REDUC_GROUP_SIZE results,
633af029 5208 with the elements for other SLP statements replaced with the
5209 neutral value. We can then do a normal reduction on each vector. */
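      /* For instance (illustrative only), with a group size of 2 and
	 NEW_PHI_RESULT = {a0, b0, a1, b1}, the vector built for the first
	 result is {a0, neutral, a1, neutral} and the one for the second is
	 {neutral, b0, neutral, b1}, each of which reduces normally.  */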
5210
5211 /* Enforced by vectorizable_reduction. */
5212 gcc_assert (new_phis.length () == 1);
5213 gcc_assert (pow2p_hwi (group_size));
5214
5215 slp_tree orig_phis_slp_node = slp_node_instance->reduc_phis;
06bb64b8 5216 vec<stmt_vec_info> orig_phis
5217 = SLP_TREE_SCALAR_STMTS (orig_phis_slp_node);
633af029 5218 gimple_seq seq = NULL;
5219
5220 /* Build a vector {0, 1, 2, ...}, with the same number of elements
5221 and the same element size as VECTYPE. */
5222 tree index = build_index_vector (vectype, 0, 1);
5223 tree index_type = TREE_TYPE (index);
5224 tree index_elt_type = TREE_TYPE (index_type);
5225 tree mask_type = build_same_sized_truth_vector_type (index_type);
5226
5227 /* Create a vector that, for each element, identifies which of
e1009321 5228 the REDUC_GROUP_SIZE results should use it. */
633af029 5229 tree index_mask = build_int_cst (index_elt_type, group_size - 1);
5230 index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
5231 build_vector_from_val (index_type, index_mask));
5232
5233 /* Get a neutral vector value. This is simply a splat of the neutral
5234 scalar value if we have one, otherwise the initial scalar value
5235 is itself a neutral value. */
5236 tree vector_identity = NULL_TREE;
5237 if (neutral_op)
5238 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5239 neutral_op);
5240 for (unsigned int i = 0; i < group_size; ++i)
5241 {
 5242	  /* If there's no universal neutral value, we can use the
5243 initial scalar value from the original PHI. This is used
5244 for MIN and MAX reduction, for example. */
5245 if (!neutral_op)
5246 {
5247 tree scalar_value
06bb64b8 5248 = PHI_ARG_DEF_FROM_EDGE (orig_phis[i]->stmt,
633af029 5249 loop_preheader_edge (loop));
5250 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5251 scalar_value);
5252 }
5253
5254 /* Calculate the equivalent of:
5255
5256 sel[j] = (index[j] == i);
5257
5258 which selects the elements of NEW_PHI_RESULT that should
5259 be included in the result. */
5260 tree compare_val = build_int_cst (index_elt_type, i);
5261 compare_val = build_vector_from_val (index_type, compare_val);
5262 tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
5263 index, compare_val);
5264
5265 /* Calculate the equivalent of:
5266
 5267	       vec = sel ? new_phi_result : vector_identity;
5268
5269 VEC is now suitable for a full vector reduction. */
5270 tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
5271 sel, new_phi_result, vector_identity);
5272
5273 /* Do the reduction and convert it to the appropriate type. */
d5a19a73 5274 tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
5275 TREE_TYPE (vectype), vec);
633af029 5276 scalar = gimple_convert (&seq, scalar_type, scalar);
5277 scalar_results.safe_push (scalar);
5278 }
5279 gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
5280 }
fb85abff 5281 else
5282 {
41b4a935 5283 bool reduce_with_shift;
fb85abff 5284 tree vec_temp;
5285
fdf40949 5286 /* COND reductions all do the final reduction with MAX_EXPR
5287 or MIN_EXPR. */
c07fcd5e 5288 if (code == COND_EXPR)
fdf40949 5289 {
5290 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5291 == INTEGER_INDUC_COND_REDUCTION)
5292 code = induc_code;
745ee4da 5293 else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5294 == CONST_COND_REDUCTION)
5295 code = STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info);
fdf40949 5296 else
5297 code = MAX_EXPR;
5298 }
c07fcd5e 5299
41b4a935 5300 /* See if the target wants to do the final (shift) reduction
5301 in a vector mode of smaller size and first reduce upper/lower
5302 halves against each other. */
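      /* For example (target-dependent, shown only as an illustration): a
	 256-bit V8SI accumulator may first be split into two V4SI halves
	 that are combined with CODE, after which the shift-based reduction
	 below runs on the narrower mode.  */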
5303 enum machine_mode mode1 = mode;
41b4a935 5304 unsigned sz = tree_to_uhwi (TYPE_SIZE_UNIT (vectype));
5305 unsigned sz1 = sz;
5306 if (!slp_reduc
5307 && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
5308 sz1 = GET_MODE_SIZE (mode1).to_constant ();
5309
22eb1ed5 5310 tree vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz1);
41b4a935 5311 reduce_with_shift = have_whole_vector_shift (mode1);
5312 if (!VECTOR_MODE_P (mode1))
5313 reduce_with_shift = false;
fb85abff 5314 else
41b4a935 5315 {
5316 optab optab = optab_for_tree_code (code, vectype1, optab_default);
5317 if (optab_handler (optab, mode1) == CODE_FOR_nothing)
5318 reduce_with_shift = false;
5319 }
5320
5321 /* First reduce the vector to the desired vector size we should
5322 do shift reduction on by combining upper and lower halves. */
5323 new_temp = new_phi_result;
5324 while (sz > sz1)
5325 {
5326 gcc_assert (!slp_reduc);
5327 sz /= 2;
5328 vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz);
5329
5330 /* The target has to make sure we support lowpart/highpart
5331 extraction, either via direct vector extract or through
 5332	     integer mode punning. */
5333 tree dst1, dst2;
5334 if (convert_optab_handler (vec_extract_optab,
5335 TYPE_MODE (TREE_TYPE (new_temp)),
5336 TYPE_MODE (vectype1))
5337 != CODE_FOR_nothing)
5338 {
5339 /* Extract sub-vectors directly once vec_extract becomes
5340 a conversion optab. */
5341 dst1 = make_ssa_name (vectype1);
5342 epilog_stmt
5343 = gimple_build_assign (dst1, BIT_FIELD_REF,
5344 build3 (BIT_FIELD_REF, vectype1,
5345 new_temp, TYPE_SIZE (vectype1),
5346 bitsize_int (0)));
5347 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5348 dst2 = make_ssa_name (vectype1);
5349 epilog_stmt
5350 = gimple_build_assign (dst2, BIT_FIELD_REF,
5351 build3 (BIT_FIELD_REF, vectype1,
5352 new_temp, TYPE_SIZE (vectype1),
5353 bitsize_int (sz * BITS_PER_UNIT)));
5354 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5355 }
5356 else
5357 {
 5358	      /* Extract via punning to an appropriately sized integer mode
5359 vector. */
5360 tree eltype = build_nonstandard_integer_type (sz * BITS_PER_UNIT,
5361 1);
5362 tree etype = build_vector_type (eltype, 2);
5363 gcc_assert (convert_optab_handler (vec_extract_optab,
5364 TYPE_MODE (etype),
5365 TYPE_MODE (eltype))
5366 != CODE_FOR_nothing);
5367 tree tem = make_ssa_name (etype);
5368 epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5369 build1 (VIEW_CONVERT_EXPR,
5370 etype, new_temp));
5371 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5372 new_temp = tem;
5373 tem = make_ssa_name (eltype);
5374 epilog_stmt
5375 = gimple_build_assign (tem, BIT_FIELD_REF,
5376 build3 (BIT_FIELD_REF, eltype,
5377 new_temp, TYPE_SIZE (eltype),
5378 bitsize_int (0)));
5379 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5380 dst1 = make_ssa_name (vectype1);
5381 epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5382 build1 (VIEW_CONVERT_EXPR,
5383 vectype1, tem));
5384 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5385 tem = make_ssa_name (eltype);
5386 epilog_stmt
5387 = gimple_build_assign (tem, BIT_FIELD_REF,
5388 build3 (BIT_FIELD_REF, eltype,
5389 new_temp, TYPE_SIZE (eltype),
5390 bitsize_int (sz * BITS_PER_UNIT)));
5391 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5392 dst2 = make_ssa_name (vectype1);
5393 epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5394 build1 (VIEW_CONVERT_EXPR,
5395 vectype1, tem));
5396 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5397 }
5398
5399 new_temp = make_ssa_name (vectype1);
5400 epilog_stmt = gimple_build_assign (new_temp, code, dst1, dst2);
5401 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5402 }
fb85abff 5403
b974a688 5404 if (reduce_with_shift && !slp_reduc)
41b4a935 5405 {
5406 int element_bitsize = tree_to_uhwi (bitsize);
5407 /* Enforced by vectorizable_reduction, which disallows SLP reductions
5408 for variable-length vectors and also requires direct target support
5409 for loop reductions. */
5410 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5411 int nelements = vec_size_in_bits / element_bitsize;
1957c019 5412 vec_perm_builder sel;
5413 vec_perm_indices indices;
b974a688 5414
5415 int elt_offset;
5416
41b4a935 5417 tree zero_vec = build_zero_cst (vectype1);
16ed3c2c 5418 /* Case 2: Create:
b974a688 5419 for (offset = nelements/2; offset >= 1; offset/=2)
eefa05c8 5420 {
5421 Create: va' = vec_shift <va, offset>
5422 Create: va = vop <va, va'>
5423 } */
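	  /* Illustration only: for {a, b, c, d}, shifting by 2 gives
	     {c, d, 0, 0} and adding yields {a+c, b+d, _, _}; shifting that
	     by 1 and adding leaves the full result a+b+c+d in element 0,
	     ready for the BIT_FIELD_REF extraction below.  */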
fb85abff 5424
1e937a2e 5425 tree rhs;
5426
6d8fb6cf 5427 if (dump_enabled_p ())
7bd765d4 5428 dump_printf_loc (MSG_NOTE, vect_location,
78bb46f5 5429 "Reduce using vector shifts\n");
eefa05c8 5430
41b4a935 5431 vec_dest = vect_create_destination_var (scalar_dest, vectype1);
b974a688 5432 for (elt_offset = nelements / 2;
5433 elt_offset >= 1;
5434 elt_offset /= 2)
eefa05c8 5435 {
282dc861 5436 calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
1957c019 5437 indices.new_vector (sel, 2, nelements);
41b4a935 5438 tree mask = vect_gen_perm_mask_any (vectype1, indices);
e9cf809e 5439 epilog_stmt = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
5440 new_temp, zero_vec, mask);
eefa05c8 5441 new_name = make_ssa_name (vec_dest, epilog_stmt);
5442 gimple_assign_set_lhs (epilog_stmt, new_name);
5443 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5444
e9cf809e 5445 epilog_stmt = gimple_build_assign (vec_dest, code, new_name,
5446 new_temp);
eefa05c8 5447 new_temp = make_ssa_name (vec_dest, epilog_stmt);
5448 gimple_assign_set_lhs (epilog_stmt, new_temp);
5449 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5450 }
fb85abff 5451
1e937a2e 5452 /* 2.4 Extract the final scalar result. Create:
5453 s_out3 = extract_field <v_out2, bitpos> */
5454
5455 if (dump_enabled_p ())
5456 dump_printf_loc (MSG_NOTE, vect_location,
5457 "extract scalar result\n");
5458
5459 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
5460 bitsize, bitsize_zero_node);
5461 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5462 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5463 gimple_assign_set_lhs (epilog_stmt, new_temp);
5464 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5465 scalar_results.safe_push (new_temp);
eefa05c8 5466 }
fb85abff 5467 else
5468 {
16ed3c2c 5469 /* Case 3: Create:
eefa05c8 5470 s = extract_field <v_out2, 0>
5471 for (offset = element_size;
5472 offset < vector_size;
5473 offset += element_size;)
5474 {
5475 Create: s' = extract_field <v_out2, offset>
5476 Create: s = op <s, s'> // For non SLP cases
5477 } */
fb85abff 5478
6d8fb6cf 5479 if (dump_enabled_p ())
7bd765d4 5480 dump_printf_loc (MSG_NOTE, vect_location,
78bb46f5 5481 "Reduce using scalar code.\n");
fb85abff 5482
41b4a935 5483 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5484 int element_bitsize = tree_to_uhwi (bitsize);
f1f41a6c 5485 FOR_EACH_VEC_ELT (new_phis, i, new_phi)
eefa05c8 5486 {
b974a688 5487 int bit_offset;
2f4ce795 5488 if (gimple_code (new_phi) == GIMPLE_PHI)
5489 vec_temp = PHI_RESULT (new_phi);
5490 else
5491 vec_temp = gimple_assign_lhs (new_phi);
b974a688 5492 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
fdf40949 5493 bitsize_zero_node);
eefa05c8 5494 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5495 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5496 gimple_assign_set_lhs (epilog_stmt, new_temp);
5497 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5498
 5499	      /* In SLP we don't need to apply the reduction operation, so we just
5500 collect s' values in SCALAR_RESULTS. */
39a5d6b1 5501 if (slp_reduc)
f1f41a6c 5502 scalar_results.safe_push (new_temp);
eefa05c8 5503
5504 for (bit_offset = element_bitsize;
5505 bit_offset < vec_size_in_bits;
5506 bit_offset += element_bitsize)
5507 {
5508 tree bitpos = bitsize_int (bit_offset);
5509 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
5510 bitsize, bitpos);
5511
5512 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5513 new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
5514 gimple_assign_set_lhs (epilog_stmt, new_name);
5515 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5516
39a5d6b1 5517 if (slp_reduc)
eefa05c8 5518 {
 5519	          /* In SLP we don't need to apply the reduction operation, so
5520 we just collect s' values in SCALAR_RESULTS. */
5521 new_temp = new_name;
f1f41a6c 5522 scalar_results.safe_push (new_name);
eefa05c8 5523 }
5524 else
5525 {
e9cf809e 5526 epilog_stmt = gimple_build_assign (new_scalar_dest, code,
5527 new_name, new_temp);
eefa05c8 5528 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5529 gimple_assign_set_lhs (epilog_stmt, new_temp);
5530 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5531 }
5532 }
5533 }
5534
 5535	      /* The only case where we need to reduce scalar results in SLP is
282bf14c 5536	 unrolling. If the size of SCALAR_RESULTS is greater than
e1009321 5537	 REDUC_GROUP_SIZE, we reduce them by combining elements modulo
 5538	 REDUC_GROUP_SIZE. */
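	  /* E.g. (illustrative), with REDUC_GROUP_SIZE == 2 and four scalar
	     results {s0, s1, s2, s3}, the loop below leaves
	     {s0 op s2, s1 op s3} in the first two slots.  */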
39a5d6b1 5539 if (slp_reduc)
eefa05c8 5540 {
5541 tree res, first_res, new_res;
42acab1c 5542 gimple *new_stmt;
eefa05c8 5543
5544 /* Reduce multiple scalar results in case of SLP unrolling. */
f1f41a6c 5545 for (j = group_size; scalar_results.iterate (j, &res);
eefa05c8 5546 j++)
5547 {
f1f41a6c 5548 first_res = scalar_results[j % group_size];
e9cf809e 5549 new_stmt = gimple_build_assign (new_scalar_dest, code,
5550 first_res, res);
eefa05c8 5551 new_res = make_ssa_name (new_scalar_dest, new_stmt);
5552 gimple_assign_set_lhs (new_stmt, new_res);
5553 gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
f1f41a6c 5554 scalar_results[j % group_size] = new_res;
eefa05c8 5555 }
5556 }
5557 else
5558 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
f1f41a6c 5559 scalar_results.safe_push (new_temp);
eefa05c8 5560 }
c07fcd5e 5561
fdf40949 5562 if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5563 == INTEGER_INDUC_COND_REDUCTION)
5564 && !operand_equal_p (initial_def, induc_val, 0))
c07fcd5e 5565 {
fdf40949 5566	      /* Earlier we set the initial value to be a vector of induc_val
5567 values. Check the result and if it is induc_val then replace
5568 with the original initial value, unless induc_val is
5569 the same as initial_def already. */
5570 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5571 induc_val);
c07fcd5e 5572
5573 tree tmp = make_ssa_name (new_scalar_dest);
5574 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5575 initial_def, new_temp);
5576 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5577 scalar_results[0] = tmp;
5578 }
fb85abff 5579 }
eefa05c8 5580
fb85abff 5581vect_finalize_reduction:
5582
b219ece3 5583 if (double_reduc)
5584 loop = loop->inner;
5585
fb85abff 5586 /* 2.5 Adjust the final result by the initial value of the reduction
5587 variable. (When such adjustment is not needed, then
5588 'adjustment_def' is zero). For example, if code is PLUS we create:
5589 new_temp = loop_exit_def + adjustment_def */
5590
5591 if (adjustment_def)
5592 {
39a5d6b1 5593 gcc_assert (!slp_reduc);
fb85abff 5594 if (nested_in_vect_loop)
5595 {
f1f41a6c 5596 new_phi = new_phis[0];
fb85abff 5597 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
5598 expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
5599 new_dest = vect_create_destination_var (scalar_dest, vectype);
5600 }
5601 else
5602 {
f1f41a6c 5603 new_temp = scalar_results[0];
fb85abff 5604 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
5605 expr = build2 (code, scalar_type, new_temp, adjustment_def);
5606 new_dest = vect_create_destination_var (scalar_dest, scalar_type);
5607 }
ade2ac53 5608
fb85abff 5609 epilog_stmt = gimple_build_assign (new_dest, expr);
5610 new_temp = make_ssa_name (new_dest, epilog_stmt);
5611 gimple_assign_set_lhs (epilog_stmt, new_temp);
fb85abff 5612 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
eefa05c8 5613 if (nested_in_vect_loop)
5614 {
04b2391d 5615 stmt_vec_info epilog_stmt_info = loop_vinfo->add_stmt (epilog_stmt);
5616 STMT_VINFO_RELATED_STMT (epilog_stmt_info)
03c0d666 5617 = STMT_VINFO_RELATED_STMT (loop_vinfo->lookup_stmt (new_phi));
eefa05c8 5618
5619 if (!double_reduc)
f1f41a6c 5620 scalar_results.quick_push (new_temp);
eefa05c8 5621 else
f1f41a6c 5622 scalar_results[0] = new_temp;
eefa05c8 5623 }
5624 else
f1f41a6c 5625 scalar_results[0] = new_temp;
eefa05c8 5626
f1f41a6c 5627 new_phis[0] = epilog_stmt;
fb85abff 5628 }
5629
282bf14c 5630 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
eefa05c8 5631 phis with new adjusted scalar results, i.e., replace use <s_out0>
5632 with use <s_out4>.
fb85abff 5633
eefa05c8 5634 Transform:
5635 loop_exit:
5636 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5637 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5638 v_out2 = reduce <v_out1>
5639 s_out3 = extract_field <v_out2, 0>
5640 s_out4 = adjust_result <s_out3>
5641 use <s_out0>
5642 use <s_out0>
5643
5644 into:
fb85abff 5645
eefa05c8 5646 loop_exit:
5647 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5648 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5649 v_out2 = reduce <v_out1>
5650 s_out3 = extract_field <v_out2, 0>
5651 s_out4 = adjust_result <s_out3>
47deb25f 5652 use <s_out4>
5653 use <s_out4> */
eefa05c8 5654
39a5d6b1 5655
5656 /* In SLP reduction chain we reduce vector results into one vector if
e1009321 5657 necessary, hence we set here REDUC_GROUP_SIZE to 1. SCALAR_DEST is the
5658 LHS of the last stmt in the reduction chain, since we are looking for
5659 the loop exit phi node. */
1c2fef9a 5660 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
39a5d6b1 5661 {
06bb64b8 5662 stmt_vec_info dest_stmt_info
4a59791f 5663 = vect_orig_stmt (SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1]);
06bb64b8 5664 scalar_dest = gimple_assign_lhs (dest_stmt_info->stmt);
39a5d6b1 5665 group_size = 1;
5666 }
5667
e1009321 5668 /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
 5669	     case REDUC_GROUP_SIZE is greater than the vectorization factor).
5670 Therefore, we need to match SCALAR_RESULTS with corresponding statements.
5671 The first (REDUC_GROUP_SIZE / number of new vector stmts) scalar results
5672 correspond to the first vector stmt, etc.
5673 (RATIO is equal to (REDUC_GROUP_SIZE / number of new vector stmts)). */
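  /* For example (illustrative only): with REDUC_GROUP_SIZE == 4 and two
     vector statements, RATIO is 2, so scalar results 0-1 pair with the
     first vector statement and results 2-3 with the second.  */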
f1f41a6c 5674 if (group_size > new_phis.length ())
47deb25f 5675 {
f1f41a6c 5676 ratio = group_size / new_phis.length ();
5677 gcc_assert (!(group_size % new_phis.length ()));
47deb25f 5678 }
5679 else
5680 ratio = 1;
eefa05c8 5681
819b1150 5682 stmt_vec_info epilog_stmt_info = NULL;
eefa05c8 5683 for (k = 0; k < group_size; k++)
fb85abff 5684 {
eefa05c8 5685 if (k % ratio == 0)
5686 {
819b1150 5687 epilog_stmt_info = loop_vinfo->lookup_stmt (new_phis[k / ratio]);
dc1fb456 5688 reduction_phi_info = reduction_phis[k / ratio];
58045f90 5689 if (double_reduc)
f1f41a6c 5690 inner_phi = inner_phis[k / ratio];
eefa05c8 5691 }
7aa0d350 5692
39a5d6b1 5693 if (slp_reduc)
eefa05c8 5694 {
06bb64b8 5695 stmt_vec_info scalar_stmt_info = SLP_TREE_SCALAR_STMTS (slp_node)[k];
fb85abff 5696
06bb64b8 5697 orig_stmt_info = STMT_VINFO_RELATED_STMT (scalar_stmt_info);
aebdbd31 5698 /* SLP statements can't participate in patterns. */
5699 gcc_assert (!orig_stmt_info);
06bb64b8 5700 scalar_dest = gimple_assign_lhs (scalar_stmt_info->stmt);
eefa05c8 5701 }
5702
f1f41a6c 5703 phis.create (3);
eefa05c8 5704 /* Find the loop-closed-use at the loop exit of the original scalar
282bf14c 5705 result. (The reduction result is expected to have two immediate uses -
eefa05c8 5706 one at the latch block, and one at the loop exit). */
5707 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
f898e094 5708 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p)))
5709 && !is_gimple_debug (USE_STMT (use_p)))
f1f41a6c 5710 phis.safe_push (USE_STMT (use_p));
eefa05c8 5711
1d4bc0bb 5712 /* While we expect to have found an exit_phi because of loop-closed-ssa
 5713	 form, we can end up without one if the scalar cycle is dead. */
eefa05c8 5714
f1f41a6c 5715 FOR_EACH_VEC_ELT (phis, i, exit_phi)
eefa05c8 5716 {
5717 if (outer_loop)
7aa0d350 5718 {
03c0d666 5719 stmt_vec_info exit_phi_vinfo
5720 = loop_vinfo->lookup_stmt (exit_phi);
1a91d914 5721 gphi *vect_phi;
eefa05c8 5722
0f76de8e 5723 if (double_reduc)
5724 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = inner_phi;
5725 else
819b1150 5726 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt_info;
eefa05c8 5727 if (!double_reduc
5728 || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
5729 != vect_double_reduction_def)
7aa0d350 5730 continue;
5731
eefa05c8 5732 /* Handle double reduction:
7aa0d350 5733
eefa05c8 5734 stmt1: s1 = phi <s0, s2> - double reduction phi (outer loop)
5735 stmt2: s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
5736 stmt3: s4 = use (s3) - (regular) reduc stmt (inner loop)
5737 stmt4: s2 = phi <s4> - double reduction stmt (outer loop)
7aa0d350 5738
eefa05c8 5739 At that point the regular reduction (stmt2 and stmt3) is
5740 already vectorized, as well as the exit phi node, stmt4.
5741 Here we vectorize the phi node of double reduction, stmt1, and
5742 update all relevant statements. */
7aa0d350 5743
eefa05c8 5744 /* Go through all the uses of s2 to find double reduction phi
5745 node, i.e., stmt1 above. */
5746 orig_name = PHI_RESULT (exit_phi);
5747 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
7aa0d350 5748 {
f83623cc 5749 stmt_vec_info use_stmt_vinfo;
9ed1960b 5750 tree vect_phi_init, preheader_arg, vect_phi_res;
eefa05c8 5751 basic_block bb = gimple_bb (use_stmt);
eefa05c8 5752
 5753	          /* Check that USE_STMT is really a double reduction phi
5754 node. */
5755 if (gimple_code (use_stmt) != GIMPLE_PHI
5756 || gimple_phi_num_args (use_stmt) != 2
eefa05c8 5757 || bb->loop_father != outer_loop)
5758 continue;
03c0d666 5759 use_stmt_vinfo = loop_vinfo->lookup_stmt (use_stmt);
f83623cc 5760 if (!use_stmt_vinfo
5761 || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
5762 != vect_double_reduction_def)
5763 continue;
eefa05c8 5764
5765 /* Create vector phi node for double reduction:
5766 vs1 = phi <vs0, vs2>
5767 vs1 was created previously in this function by a call to
5768 vect_get_vec_def_for_operand and is stored in
5769 vec_initial_def;
58045f90 5770 vs2 is defined by INNER_PHI, the vectorized EXIT_PHI;
eefa05c8 5771 vs0 is created here. */
5772
5773 /* Create vector phi node. */
5774 vect_phi = create_phi_node (vec_initial_def, bb);
04b2391d 5775 loop_vec_info_for_loop (outer_loop)->add_stmt (vect_phi);
eefa05c8 5776
5777 /* Create vs0 - initial def of the double reduction phi. */
5778 preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
5779 loop_preheader_edge (outer_loop));
9ed1960b 5780 vect_phi_init = get_initial_def_for_reduction
a73182ff 5781 (stmt_info, preheader_arg, NULL);
eefa05c8 5782
5783 /* Update phi node arguments with vs0 and vs2. */
5784 add_phi_arg (vect_phi, vect_phi_init,
5785 loop_preheader_edge (outer_loop),
60d535d2 5786 UNKNOWN_LOCATION);
435515db 5787 add_phi_arg (vect_phi, PHI_RESULT (inner_phi->stmt),
5788 loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
6d8fb6cf 5789 if (dump_enabled_p ())
a4e972e3 5790 dump_printf_loc (MSG_NOTE, vect_location,
5791 "created double reduction phi node: %G",
5792 vect_phi);
eefa05c8 5793
5794 vect_phi_res = PHI_RESULT (vect_phi);
5795
5796 /* Replace the use, i.e., set the correct vs1 in the regular
282bf14c 5797 reduction phi node. FORNOW, NCOPIES is always 1, so the
eefa05c8 5798 loop is redundant. */
dc1fb456 5799 stmt_vec_info use_info = reduction_phi_info;
5800 for (j = 0; j < ncopies; j++)
5801 {
5802 edge pr_edge = loop_preheader_edge (loop);
5803 SET_PHI_ARG_DEF (as_a <gphi *> (use_info->stmt),
5804 pr_edge->dest_idx, vect_phi_res);
5805 use_info = STMT_VINFO_RELATED_STMT (use_info);
5806 }
7aa0d350 5807 }
5808 }
b219ece3 5809 }
5810
f1f41a6c 5811 phis.release ();
b219ece3 5812 if (nested_in_vect_loop)
5813 {
5814 if (double_reduc)
5815 loop = outer_loop;
5816 else
5817 continue;
5818 }
5819
f1f41a6c 5820 phis.create (3);
b219ece3 5821 /* Find the loop-closed-use at the loop exit of the original scalar
282bf14c 5822 result. (The reduction result is expected to have two immediate uses,
5823 one at the latch block, and one at the loop exit). For double
b219ece3 5824 reductions we are looking for exit phis of the outer loop. */
5825 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5826 {
5827 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
f898e094 5828 {
5829 if (!is_gimple_debug (USE_STMT (use_p)))
5830 phis.safe_push (USE_STMT (use_p));
5831 }
b219ece3 5832 else
5833 {
5834 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
5835 {
5836 tree phi_res = PHI_RESULT (USE_STMT (use_p));
5837
5838 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
5839 {
5840 if (!flow_bb_inside_loop_p (loop,
f898e094 5841 gimple_bb (USE_STMT (phi_use_p)))
5842 && !is_gimple_debug (USE_STMT (phi_use_p)))
f1f41a6c 5843 phis.safe_push (USE_STMT (phi_use_p));
b219ece3 5844 }
5845 }
5846 }
5847 }
fb85abff 5848
f1f41a6c 5849 FOR_EACH_VEC_ELT (phis, i, exit_phi)
b219ece3 5850 {
eefa05c8 5851 /* Replace the uses: */
5852 orig_name = PHI_RESULT (exit_phi);
f1f41a6c 5853 scalar_result = scalar_results[k];
eefa05c8 5854 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5855 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
5856 SET_USE (use_p, scalar_result);
5857 }
5858
f1f41a6c 5859 phis.release ();
fb85abff 5860 }
6ae8a044 5861}
fb85abff 5862
d77809a4 5863/* Return a vector of type VECTYPE that is equal to the vector select
5864 operation "MASK ? VEC : IDENTITY". Insert the select statements
5865 before GSI. */
5866
5867static tree
5868merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
5869 tree vec, tree identity)
5870{
5871 tree cond = make_temp_ssa_name (vectype, NULL, "cond");
5872 gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
5873 mask, vec, identity);
5874 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5875 return cond;
5876}
5877
5878/* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
5879 order, starting with LHS. Insert the extraction statements before GSI and
5880 associate the new scalar SSA names with variable SCALAR_DEST.
5881 Return the SSA name for the result. */
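/* For example (illustrative only), for a four-element VECTOR_RHS this
   expands to the scalar chain
     t0 = LHS CODE v[0]; t1 = t0 CODE v[1]; t2 = t1 CODE v[2];
     t3 = t2 CODE v[3];
   and returns t3, preserving left-to-right evaluation order.  */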
5882
5883static tree
5884vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
5885 tree_code code, tree lhs, tree vector_rhs)
5886{
5887 tree vectype = TREE_TYPE (vector_rhs);
5888 tree scalar_type = TREE_TYPE (vectype);
5889 tree bitsize = TYPE_SIZE (scalar_type);
5890 unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
5891 unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
5892
5893 for (unsigned HOST_WIDE_INT bit_offset = 0;
5894 bit_offset < vec_size_in_bits;
5895 bit_offset += element_bitsize)
5896 {
5897 tree bitpos = bitsize_int (bit_offset);
5898 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
5899 bitsize, bitpos);
5900
5901 gassign *stmt = gimple_build_assign (scalar_dest, rhs);
5902 rhs = make_ssa_name (scalar_dest, stmt);
5903 gimple_assign_set_lhs (stmt, rhs);
5904 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
5905
5906 stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
5907 tree new_name = make_ssa_name (scalar_dest, stmt);
5908 gimple_assign_set_lhs (stmt, new_name);
5909 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
5910 lhs = new_name;
5911 }
5912 return lhs;
5913}
5914
26fb5106 5915/* Get a masked internal function equivalent to REDUC_FN. VECTYPE_IN is the
5916 type of the vector input. */
5917
5918static internal_fn
5919get_masked_reduction_fn (internal_fn reduc_fn, tree vectype_in)
5920{
5921 internal_fn mask_reduc_fn;
5922
5923 switch (reduc_fn)
5924 {
5925 case IFN_FOLD_LEFT_PLUS:
5926 mask_reduc_fn = IFN_MASK_FOLD_LEFT_PLUS;
5927 break;
5928
5929 default:
5930 return IFN_LAST;
5931 }
5932
5933 if (direct_internal_fn_supported_p (mask_reduc_fn, vectype_in,
5934 OPTIMIZE_FOR_SPEED))
5935 return mask_reduc_fn;
5936 return IFN_LAST;
5937}
5938
ecc42a77 5939/* Perform an in-order reduction (FOLD_LEFT_REDUCTION). STMT_INFO is the
d77809a4 5940 statement that sets the live-out value. REDUC_DEF_STMT is the phi
ecc42a77 5941 statement. CODE is the operation performed by STMT_INFO and OPS are
d77809a4 5942 its scalar operands. REDUC_INDEX is the index of the operand in
5943 OPS that is set by REDUC_DEF_STMT. REDUC_FN is the function that
5944 implements in-order reduction, or IFN_LAST if we should open-code it.
5945 VECTYPE_IN is the type of the vector input. MASKS specifies the masks
5946 that should be used to control the operation in a fully-masked loop. */
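/* As an illustration (not from the original sources): an in-order float sum
   of a vector {a, b, c, d} into RES computes ((((RES + a) + b) + c) + d),
   matching the scalar evaluation order that a tree-wise reduction would only
   be allowed to reassociate under -ffast-math style flags.  */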
5947
5948static bool
ecc42a77 5949vectorize_fold_left_reduction (stmt_vec_info stmt_info,
5950 gimple_stmt_iterator *gsi,
435515db 5951 stmt_vec_info *vec_stmt, slp_tree slp_node,
d77809a4 5952 gimple *reduc_def_stmt,
5953 tree_code code, internal_fn reduc_fn,
5954 tree ops[3], tree vectype_in,
5955 int reduc_index, vec_loop_masks *masks)
5956{
d77809a4 5957 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2e966e2a 5958 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
d77809a4 5959 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
585ed623 5960 stmt_vec_info new_stmt_info = NULL;
26fb5106 5961 internal_fn mask_reduc_fn = get_masked_reduction_fn (reduc_fn, vectype_in);
d77809a4 5962
5963 int ncopies;
5964 if (slp_node)
5965 ncopies = 1;
5966 else
5967 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
5968
a73182ff 5969 gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
d77809a4 5970 gcc_assert (ncopies == 1);
5971 gcc_assert (TREE_CODE_LENGTH (code) == binary_op);
5972 gcc_assert (reduc_index == (code == MINUS_EXPR ? 0 : 1));
5973 gcc_assert (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5974 == FOLD_LEFT_REDUCTION);
5975
5976 if (slp_node)
5977 gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
5978 TYPE_VECTOR_SUBPARTS (vectype_in)));
5979
5980 tree op0 = ops[1 - reduc_index];
5981
5982 int group_size = 1;
06bb64b8 5983 stmt_vec_info scalar_dest_def_info;
d77809a4 5984 auto_vec<tree> vec_oprnds0;
5985 if (slp_node)
5986 {
1425cbaa 5987 auto_vec<vec<tree> > vec_defs (2);
5988 auto_vec<tree> sops(2);
5989 sops.quick_push (ops[0]);
5990 sops.quick_push (ops[1]);
5991 vect_get_slp_defs (sops, slp_node, &vec_defs);
5992 vec_oprnds0.safe_splice (vec_defs[1 - reduc_index]);
5993 vec_defs[0].release ();
5994 vec_defs[1].release ();
d77809a4 5995 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
06bb64b8 5996 scalar_dest_def_info = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
d77809a4 5997 }
5998 else
5999 {
a73182ff 6000 tree loop_vec_def0 = vect_get_vec_def_for_operand (op0, stmt_info);
d77809a4 6001 vec_oprnds0.create (1);
6002 vec_oprnds0.quick_push (loop_vec_def0);
06bb64b8 6003 scalar_dest_def_info = stmt_info;
d77809a4 6004 }
6005
06bb64b8 6006 tree scalar_dest = gimple_assign_lhs (scalar_dest_def_info->stmt);
d77809a4 6007 tree scalar_type = TREE_TYPE (scalar_dest);
6008 tree reduc_var = gimple_phi_result (reduc_def_stmt);
6009
6010 int vec_num = vec_oprnds0.length ();
6011 gcc_assert (vec_num == 1 || slp_node);
6012 tree vec_elem_type = TREE_TYPE (vectype_out);
6013 gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
6014
6015 tree vector_identity = NULL_TREE;
6016 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
6017 vector_identity = build_zero_cst (vectype_out);
6018
6019 tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
6020 int i;
6021 tree def0;
6022 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
6023 {
585ed623 6024 gimple *new_stmt;
d77809a4 6025 tree mask = NULL_TREE;
6026 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
6027 mask = vect_get_loop_mask (gsi, masks, vec_num, vectype_in, i);
6028
6029 /* Handle MINUS by adding the negative. */
6030 if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
6031 {
6032 tree negated = make_ssa_name (vectype_out);
6033 new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
6034 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6035 def0 = negated;
6036 }
6037
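      /* Without a masked FOLD_LEFT internal function we substitute the
	 additive identity for the masked-off lanes below
	 (def0 = mask ? def0 : { 0, ... }) so that they cannot change the
	 accumulated result.  */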
26fb5106 6038 if (mask && mask_reduc_fn == IFN_LAST)
d77809a4 6039 def0 = merge_with_identity (gsi, mask, vectype_out, def0,
6040 vector_identity);
6041
6042 /* On the first iteration the input is simply the scalar phi
6043 result, and for subsequent iterations it is the output of
6044 the preceding operation. */
26fb5106 6045 if (reduc_fn != IFN_LAST || (mask && mask_reduc_fn != IFN_LAST))
d77809a4 6046 {
26fb5106 6047 if (mask && mask_reduc_fn != IFN_LAST)
6048 new_stmt = gimple_build_call_internal (mask_reduc_fn, 3, reduc_var,
6049 def0, mask);
6050 else
6051 new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var,
6052 def0);
d77809a4 6053 /* For chained SLP reductions the output of the previous reduction
6054 operation serves as the input of the next. For the final statement
6055 the output cannot be a temporary - we reuse the original
6056 scalar destination of the last statement. */
6057 if (i != vec_num - 1)
6058 {
6059 gimple_set_lhs (new_stmt, scalar_dest_var);
6060 reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
6061 gimple_set_lhs (new_stmt, reduc_var);
6062 }
6063 }
6064 else
6065 {
6066 reduc_var = vect_expand_fold_left (gsi, scalar_dest_var, code,
6067 reduc_var, def0);
6068 new_stmt = SSA_NAME_DEF_STMT (reduc_var);
6069 /* Remove the statement, so that we can use the same code paths
6070 as for statements that we've just created. */
6071 gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
f2669ca3 6072 gsi_remove (&tmp_gsi, true);
d77809a4 6073 }
6074
6075 if (i == vec_num - 1)
6076 {
6077 gimple_set_lhs (new_stmt, scalar_dest);
06bb64b8 6078 new_stmt_info = vect_finish_replace_stmt (scalar_dest_def_info,
6079 new_stmt);
d77809a4 6080 }
6081 else
06bb64b8 6082 new_stmt_info = vect_finish_stmt_generation (scalar_dest_def_info,
585ed623 6083 new_stmt, gsi);
d77809a4 6084
6085 if (slp_node)
585ed623 6086 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt_info);
d77809a4 6087 }
6088
6089 if (!slp_node)
585ed623 6090 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt_info;
d77809a4 6091
6092 return true;
6093}
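
/* As a rough illustration of the in-order scheme above (a sketch, not the
   exact IL that is generated): given the scalar loop

       double s = init;
       for (int i = 0; i < n; i++)
         s += a[i];

   each vector VA of A values is folded into the scalar accumulator in
   turn, conceptually s = FOLD_LEFT_PLUS (s, va), i.e.
   s + va[0] + va[1] + ..., instead of forming independent partial sums,
   so the floating-point result keeps the scalar association order.  */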
fb85abff 6094
b4552064 6095/* Function is_nonwrapping_integer_induction.
6096
ecc42a77 6097 Check if STMT_VINFO (which is part of loop LOOP) describes an induction
b4552064 6098 that both increments and does not cause overflow. */
6099
6100static bool
2e966e2a 6101is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, class loop *loop)
b4552064 6102{
ecc42a77 6103 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
559260b3 6104 tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
b4552064 6105 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
ecc42a77 6106 tree lhs_type = TREE_TYPE (gimple_phi_result (phi));
b4552064 6107 widest_int ni, max_loop_value, lhs_max;
30b5769f 6108 wi::overflow_type overflow = wi::OVF_NONE;
b4552064 6109
6110 /* Make sure the loop is integer based. */
6111 if (TREE_CODE (base) != INTEGER_CST
6112 || TREE_CODE (step) != INTEGER_CST)
6113 return false;
6114
b4552064 6115 /* Check that the max size of the loop will not wrap. */
6116
6117 if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
6118 return true;
6119
6120 if (! max_stmt_executions (loop, &ni))
6121 return false;
6122
6123 max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
6124 &overflow);
6125 if (overflow)
6126 return false;
6127
6128 max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
6129 TYPE_SIGN (lhs_type), &overflow);
6130 if (overflow)
6131 return false;
6132
6133 return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
6134 <= TYPE_PRECISION (lhs_type));
6135}
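
/* A small worked example of the check above (numbers are illustrative
   only): for an 8-bit unsigned IV with base 1 and step 2 in a loop that
   executes at most 100 times, max_loop_value = 1 + 2 * 100 = 201, which
   needs 8 bits, so the induction is known not to wrap.  With base 200
   the value 400 would need 9 bits and the function returns false.  */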
6136
d3a7159f 6137/* Check if masking can be supported by inserting a conditional expression.
6138 CODE is the code for the operation. COND_FN is the conditional internal
6139 function, if it exists. VECTYPE_IN is the type of the vector input. */
6140static bool
6141use_mask_by_cond_expr_p (enum tree_code code, internal_fn cond_fn,
6142 tree vectype_in)
6143{
6144 if (cond_fn != IFN_LAST
6145 && direct_internal_fn_supported_p (cond_fn, vectype_in,
6146 OPTIMIZE_FOR_SPEED))
6147 return false;
6148
6149 switch (code)
6150 {
6151 case DOT_PROD_EXPR:
2cbc1ad8 6152 case SAD_EXPR:
d3a7159f 6153 return true;
6154
6155 default:
6156 return false;
6157 }
6158}
6159
6160/* Insert a conditional expression to enable masked vectorization. CODE is the
6161 code for the operation. VOP is the array of operands. MASK is the loop
6162 mask. GSI is a statement iterator used to place the new conditional
6163 expression. */
6164static void
6165build_vect_cond_expr (enum tree_code code, tree vop[3], tree mask,
6166 gimple_stmt_iterator *gsi)
6167{
6168 switch (code)
6169 {
6170 case DOT_PROD_EXPR:
6171 {
6172 tree vectype = TREE_TYPE (vop[1]);
6173 tree zero = build_zero_cst (vectype);
6174 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
6175 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
6176 mask, vop[1], zero);
6177 gsi_insert_before (gsi, select, GSI_SAME_STMT);
6178 vop[1] = masked_op1;
6179 break;
6180 }
6181
2cbc1ad8 6182 case SAD_EXPR:
6183 {
6184 tree vectype = TREE_TYPE (vop[1]);
6185 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
6186 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
6187 mask, vop[1], vop[0]);
6188 gsi_insert_before (gsi, select, GSI_SAME_STMT);
6189 vop[1] = masked_op1;
6190 break;
6191 }
6192
d3a7159f 6193 default:
6194 gcc_unreachable ();
6195 }
6196}
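
/* Sketch of the effect of the selects above (illustrative, not literal
   output): for a masked DOT_PROD_EXPR the second operand becomes

       masked_op1 = mask ? op1 : { 0, ... };

   so inactive lanes contribute 0 * op0 to the dot product, while for
   SAD_EXPR

       masked_op1 = mask ? op1 : op0;

   makes the absolute difference of inactive lanes zero.  Either way the
   accumulator is unchanged for masked-off elements.  */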
6197
fb85abff 6198/* Function vectorizable_reduction.
6199
ecc42a77 6200 Check if STMT_INFO performs a reduction operation that can be vectorized.
6201 If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
ade2ac53 6202 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
ecc42a77 6203 Return true if STMT_INFO is vectorizable in this way.
fb85abff 6204
48e1416a 6205 This function also handles reduction idioms (patterns) that have been
ecc42a77 6206 recognized in advance during vect_pattern_recog. In this case, STMT_INFO
6207 may be of this form:
fb85abff 6208 X = pattern_expr (arg0, arg1, ..., X)
ecc42a77 6209 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
6210 sequence that had been detected and replaced by the pattern-stmt
6211 (STMT_INFO).
48e1416a 6212
d09d8733 6213 This function also handles reduction of condition expressions, for example:
6214 for (int i = 0; i < N; i++)
6215 if (a[i] < value)
6216 last = a[i];
6217 This is handled by vectorising the loop and creating an additional vector
6218 containing the loop indexes for which "a[i] < value" was true. In the
6219 function epilogue this is reduced to a single max value and then used to
6220 index into the vector of results.
6221
fb85abff 6222 In some cases of reduction patterns, the type of the reduction variable X is
ecc42a77 6223 different than the type of the other arguments of STMT_INFO.
6224 In such cases, the vectype that is used when transforming STMT_INFO into
6225 a vector stmt is different than the vectype that is used to determine the
48e1416a 6226 vectorization factor, because it consists of a different number of elements
fb85abff 6227 than the actual number of elements that are being operated upon in parallel.
6228
6229 For example, consider an accumulation of shorts into an int accumulator.
6230 On some targets it's possible to vectorize this pattern operating on 8
6231 shorts at a time (hence, the vectype for purposes of determining the
6232 vectorization factor should be V8HI); on the other hand, the vectype that
6233 is used to create the vector form is actually V4SI (the type of the result).
6234
6235 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
6236 indicates what is the actual level of parallelism (V8HI in the example), so
282bf14c 6237 that the right vectorization factor would be derived. This vectype
fb85abff 6238 corresponds to the type of arguments to the reduction stmt, and should *NOT*
282bf14c 6239 be used to create the vectorized stmt. The right vectype for the vectorized
fb85abff 6240 stmt is obtained from the type of the result X:
6241 get_vectype_for_scalar_type (TREE_TYPE (X))
6242
6243 This means that, contrary to "regular" reductions (or "regular" stmts in
6244 general), the following equation:
6245 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
6246 does *NOT* necessarily hold for reduction patterns. */
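
/* A rough sketch of the condition reduction described above (not the
   literal IL): with VF = 4 the vectorized loop maintains a vector of
   1-based element indexes alongside the data, e.g.

       vec_ind = { i + 1, i + 2, i + 3, i + 4 };
       last_ind = (va < vvalue) ? vec_ind : last_ind;

   and the epilogue reduces LAST_IND with a maximum, using the winning
   index to select the matching data element; index 0 is reserved for the
   case in which no element matched.  */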
6247
6248bool
ecc42a77 6249vectorizable_reduction (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
435515db 6250 stmt_vec_info *vec_stmt, slp_tree slp_node,
c863e35b 6251 slp_instance slp_node_instance,
6252 stmt_vector_for_cost *cost_vec)
fb85abff 6253{
6254 tree vec_dest;
6255 tree scalar_dest;
b334cbba 6256 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6257 tree vectype_in = NULL_TREE;
fb85abff 6258 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2e966e2a 6259 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
e53664fa 6260 enum tree_code code, orig_code;
6261 internal_fn reduc_fn;
3754d046 6262 machine_mode vec_mode;
fb85abff 6263 int op_type;
e53664fa 6264 optab optab;
fb85abff 6265 tree new_temp = NULL_TREE;
56fb8e9d 6266 enum vect_def_type dt, cond_reduc_dt = vect_unknown_def_type;
819b1150 6267 stmt_vec_info cond_stmt_vinfo = NULL;
fdf40949 6268 enum tree_code cond_reduc_op_code = ERROR_MARK;
fb85abff 6269 tree scalar_type;
6270 bool is_simple_use;
fb85abff 6271 int i;
b334cbba 6272 int ncopies;
fb85abff 6273 int epilog_copies;
6274 stmt_vec_info prev_stmt_info, prev_phi_info;
fb85abff 6275 bool single_defuse_cycle = false;
585ed623 6276 stmt_vec_info new_stmt_info = NULL;
fb85abff 6277 int j;
6278 tree ops[3];
44b24fa0 6279 enum vect_def_type dts[3];
ade2ac53 6280 bool nested_cycle = false, found_nested_cycle_def = false;
119a8852 6281 bool double_reduc = false;
7aa0d350 6282 basic_block def_bb;
2e966e2a 6283 class loop * def_stmt_loop;
7aa0d350 6284 tree def_arg;
c2078b80 6285 auto_vec<tree> vec_oprnds0;
6286 auto_vec<tree> vec_oprnds1;
f17c6474 6287 auto_vec<tree> vec_oprnds2;
c2078b80 6288 auto_vec<tree> vect_defs;
dc1fb456 6289 auto_vec<stmt_vec_info> phis;
eefa05c8 6290 int vec_num;
44b24fa0 6291 tree def0, tem;
d09d8733 6292 tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
834a2c29 6293 tree cond_reduc_val = NULL_TREE;
fb85abff 6294
44b24fa0 6295 /* Make sure it was already recognized as a reduction computation. */
1c2fef9a 6296 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
6297 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
44b24fa0 6298 return false;
6299
a73182ff 6300 if (nested_in_vect_loop_p (loop, stmt_info))
44b24fa0 6301 {
44b24fa0 6302 loop = loop->inner;
6303 nested_cycle = true;
6304 }
6305
43157fa0 6306 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
cd24aa3c 6307 gcc_assert (slp_node
6308 && REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info);
39a5d6b1 6309
a73182ff 6310 if (gphi *phi = dyn_cast <gphi *> (stmt_info->stmt))
ade2ac53 6311 {
5b4b7bcc 6312 tree phi_result = gimple_phi_result (phi);
44b24fa0 6313 /* Analysis is fully done on the reduction stmt invocation. */
6314 if (! vec_stmt)
6315 {
6154acba 6316 if (slp_node)
6317 slp_node_instance->reduc_phis = slp_node;
6318
44b24fa0 6319 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6320 return true;
6321 }
6322
d77809a4 6323 if (STMT_VINFO_REDUC_TYPE (stmt_info) == FOLD_LEFT_REDUCTION)
6324 /* Leave the scalar phi in place. Note that checking
6325 STMT_VINFO_VEC_REDUCTION_TYPE (as below) only works
6326 for reductions involving a single statement. */
6327 return true;
6328
04eefad5 6329 stmt_vec_info reduc_stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
0b7ea3a9 6330 reduc_stmt_info = vect_stmt_to_vectorize (reduc_stmt_info);
44b24fa0 6331
aaac0b10 6332 if (STMT_VINFO_VEC_REDUCTION_TYPE (reduc_stmt_info)
3bf95150 6333 == EXTRACT_LAST_REDUCTION)
6334 /* Leave the scalar phi in place. */
6335 return true;
6336
04eefad5 6337 gassign *reduc_stmt = as_a <gassign *> (reduc_stmt_info->stmt);
8f972a41 6338 code = gimple_assign_rhs_code (reduc_stmt);
44b24fa0 6339 for (unsigned k = 1; k < gimple_num_ops (reduc_stmt); ++k)
6340 {
6341 tree op = gimple_op (reduc_stmt, k);
5b4b7bcc 6342 if (op == phi_result)
44b24fa0 6343 continue;
8f972a41 6344 if (k == 1 && code == COND_EXPR)
6345 continue;
6346 bool is_simple_use = vect_is_simple_use (op, loop_vinfo, &dt);
6347 gcc_assert (is_simple_use);
6348 if (dt == vect_constant_def || dt == vect_external_def)
44b24fa0 6349 continue;
ce068755 6350 if (!vectype_in
6351 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6352 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (op)))))
6353 vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op));
44b24fa0 6354 break;
6355 }
687f61e6 6356 /* For a nested cycle we might end up with an operation like
6357 phi_result * phi_result. */
6358 if (!vectype_in)
6359 vectype_in = STMT_VINFO_VECTYPE (stmt_info);
44b24fa0 6360 gcc_assert (vectype_in);
6361
6362 if (slp_node)
6363 ncopies = 1;
6364 else
4eb17cb6 6365 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
44b24fa0 6366
aaac0b10 6367 stmt_vec_info use_stmt_info;
f17c6474 6368 if (ncopies > 1
aaac0b10 6369 && STMT_VINFO_RELEVANT (reduc_stmt_info) <= vect_used_only_live
6370 && (use_stmt_info = loop_vinfo->lookup_single_use (phi_result))
0b7ea3a9 6371 && vect_stmt_to_vectorize (use_stmt_info) == reduc_stmt_info)
f17c6474 6372 single_defuse_cycle = true;
6373
44b24fa0 6374 /* Create the destination vector */
6375 scalar_dest = gimple_assign_lhs (reduc_stmt);
6376 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
6377
6378 if (slp_node)
6379 /* The size vect_schedule_slp_instance computes is off for us. */
d75596cd 6380 vec_num = vect_get_num_vectors
6381 (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6382 * SLP_TREE_SCALAR_STMTS (slp_node).length (),
6383 vectype_in);
44b24fa0 6384 else
6385 vec_num = 1;
6386
6387 /* Generate the reduction PHIs upfront. */
6388 prev_phi_info = NULL;
6389 for (j = 0; j < ncopies; j++)
6390 {
6391 if (j == 0 || !single_defuse_cycle)
6392 {
6393 for (i = 0; i < vec_num; i++)
6394 {
6395 /* Create the reduction-phi that defines the reduction
6396 operand. */
6154acba 6397 gimple *new_phi = create_phi_node (vec_dest, loop->header);
04b2391d 6398 stmt_vec_info new_phi_info = loop_vinfo->add_stmt (new_phi);
44b24fa0 6399
6400 if (slp_node)
dc1fb456 6401 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi_info);
44b24fa0 6402 else
6403 {
6404 if (j == 0)
435515db 6405 STMT_VINFO_VEC_STMT (stmt_info)
6406 = *vec_stmt = new_phi_info;
44b24fa0 6407 else
aebdbd31 6408 STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi_info;
04b2391d 6409 prev_phi_info = new_phi_info;
44b24fa0 6410 }
6411 }
6412 }
6413 }
6414
6415 return true;
ade2ac53 6416 }
fb85abff 6417
fb85abff 6418 /* 1. Is vectorizable reduction? */
39a5d6b1 6419 /* Not supportable if the reduction variable is used in the loop, unless
6420 it's a reduction chain. */
6421 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
e1009321 6422 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
fb85abff 6423 return false;
6424
6425 /* Reductions that are not used even in an enclosing outer-loop,
6426 are expected to be "live" (used out of the loop). */
f083cd24 6427 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
fb85abff 6428 && !STMT_VINFO_LIVE_P (stmt_info))
6429 return false;
6430
48e1416a 6431 /* 2. Has this been recognized as a reduction pattern?
fb85abff 6432
6433 Check if STMT represents a pattern that has been recognized
6434 in earlier analysis stages. For stmts that represent a pattern,
6435 the STMT_VINFO_RELATED_STMT field records the last stmt in
6436 the original sequence that constitutes the pattern. */
6437
aebdbd31 6438 stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
6439 if (orig_stmt_info)
fb85abff 6440 {
fb85abff 6441 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
6442 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
6443 }
48e1416a 6444
282bf14c 6445 /* 3. Check the operands of the operation. The first operands are defined
fb85abff 6446 inside the loop body. The last operand is the reduction variable,
6447 which is defined by the loop-header-phi. */
6448
ecc42a77 6449 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
fb85abff 6450
09e31a48 6451 /* Flatten RHS. */
fb85abff 6452 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
6453 {
fb85abff 6454 case GIMPLE_BINARY_RHS:
6455 code = gimple_assign_rhs_code (stmt);
6456 op_type = TREE_CODE_LENGTH (code);
6457 gcc_assert (op_type == binary_op);
6458 ops[0] = gimple_assign_rhs1 (stmt);
6459 ops[1] = gimple_assign_rhs2 (stmt);
6460 break;
6461
c86930b0 6462 case GIMPLE_TERNARY_RHS:
6463 code = gimple_assign_rhs_code (stmt);
6464 op_type = TREE_CODE_LENGTH (code);
6465 gcc_assert (op_type == ternary_op);
6466 ops[0] = gimple_assign_rhs1 (stmt);
6467 ops[1] = gimple_assign_rhs2 (stmt);
6468 ops[2] = gimple_assign_rhs3 (stmt);
6469 break;
6470
fb85abff 6471 case GIMPLE_UNARY_RHS:
6472 return false;
6473
6474 default:
6475 gcc_unreachable ();
6476 }
6477
f2104a54 6478 if (code == COND_EXPR && slp_node)
6479 return false;
6480
fb85abff 6481 scalar_dest = gimple_assign_lhs (stmt);
6482 scalar_type = TREE_TYPE (scalar_dest);
48e1416a 6483 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
fb85abff 6484 && !SCALAR_FLOAT_TYPE_P (scalar_type))
6485 return false;
6486
6960a794 6487 /* Do not try to vectorize bit-precision reductions. */
654ba22c 6488 if (!type_has_mode_precision_p (scalar_type))
6960a794 6489 return false;
6490
fb85abff 6491 /* All uses but the last are expected to be defined in the loop.
282bf14c 6492 The last use is the reduction variable. In case of nested cycle this
ade2ac53 6493 assumption is not true: we use reduc_index to record the index of the
6494 reduction variable. */
bc497cf2 6495 stmt_vec_info reduc_def_info;
6496 if (orig_stmt_info)
6497 reduc_def_info = STMT_VINFO_REDUC_DEF (orig_stmt_info);
6498 else
6499 reduc_def_info = STMT_VINFO_REDUC_DEF (stmt_info);
6500 gcc_assert (reduc_def_info);
6501 gphi *reduc_def_phi = as_a <gphi *> (reduc_def_info->stmt);
6502 tree reduc_def = PHI_RESULT (reduc_def_phi);
f17c6474 6503 int reduc_index = -1;
ebacf0e3 6504 for (i = 0; i < op_type; i++)
fb85abff 6505 {
0df23b96 6506 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
6507 if (i == 0 && code == COND_EXPR)
6508 continue;
6509
bfa5bad6 6510 stmt_vec_info def_stmt_info;
6511 is_simple_use = vect_is_simple_use (ops[i], loop_vinfo, &dts[i], &tem,
6512 &def_stmt_info);
f17c6474 6513 dt = dts[i];
fb85abff 6514 gcc_assert (is_simple_use);
bc497cf2 6515 if (dt == vect_reduction_def
6516 && ops[i] == reduc_def)
f17c6474 6517 {
f17c6474 6518 reduc_index = i;
6519 continue;
6520 }
c6c093ed 6521 else if (tem)
f17c6474 6522 {
c6c093ed 6523 /* To properly compute ncopies we are interested in the widest
6524 input type in case we're looking at a widening accumulation. */
6525 if (!vectype_in
ce068755 6526 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6527 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (tem)))))
f17c6474 6528 vectype_in = tem;
6529 }
39a5d6b1 6530
f083cd24 6531 if (dt != vect_internal_def
6532 && dt != vect_external_def
fb85abff 6533 && dt != vect_constant_def
ade2ac53 6534 && dt != vect_induction_def
0df23b96 6535 && !(dt == vect_nested_cycle && nested_cycle))
fb85abff 6536 return false;
ade2ac53 6537
bc497cf2 6538 if (dt == vect_nested_cycle
6539 && ops[i] == reduc_def)
bfa5bad6 6540 {
6541 found_nested_cycle_def = true;
bfa5bad6 6542 reduc_index = i;
6543 }
b4552064 6544
56fb8e9d 6545 if (i == 1 && code == COND_EXPR)
6546 {
6547 /* Record how value of COND_EXPR is defined. */
6548 if (dt == vect_constant_def)
6549 {
6550 cond_reduc_dt = dt;
6551 cond_reduc_val = ops[i];
6552 }
fdf40949 6553 if (dt == vect_induction_def
bfa5bad6 6554 && def_stmt_info
6555 && is_nonwrapping_integer_induction (def_stmt_info, loop))
fdf40949 6556 {
6557 cond_reduc_dt = dt;
819b1150 6558 cond_stmt_vinfo = def_stmt_info;
fdf40949 6559 }
56fb8e9d 6560 }
fb85abff 6561 }
6562
fae41702 6563 if (!vectype_in)
f17c6474 6564 vectype_in = vectype_out;
de3aabcf 6565
f17c6474 6566 /* When vectorizing a reduction chain w/o SLP the reduction PHI is not
 6567 directly used in stmt. */
6568 if (reduc_index == -1)
6569 {
d77809a4 6570 if (STMT_VINFO_REDUC_TYPE (stmt_info) == FOLD_LEFT_REDUCTION)
6571 {
6572 if (dump_enabled_p ())
6573 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6574 "in-order reduction chain without SLP.\n");
6575 return false;
6576 }
f17c6474 6577 }
6578
f17c6474 6579 if (!(reduc_index == -1
6580 || dts[reduc_index] == vect_reduction_def
6581 || dts[reduc_index] == vect_nested_cycle
6582 || ((dts[reduc_index] == vect_internal_def
6583 || dts[reduc_index] == vect_external_def
6584 || dts[reduc_index] == vect_constant_def
6585 || dts[reduc_index] == vect_induction_def)
a82fc9c6 6586 && nested_cycle && found_nested_cycle_def)))
6587 {
 6588 /* For pattern-recognized stmts, orig_stmt might be a reduction,
 6589 but some helper statements for the pattern might not be, or
 6590 might be COND_EXPRs with reduction uses in the condition. */
aebdbd31 6591 gcc_assert (orig_stmt_info);
a82fc9c6 6592 return false;
6593 }
ade2ac53 6594
142c3207 6595 /* PHIs should not participate in patterns. */
6596 gcc_assert (!STMT_VINFO_RELATED_STMT (reduc_def_info));
119a8852 6597 enum vect_reduction_type v_reduc_type
6598 = STMT_VINFO_REDUC_TYPE (reduc_def_info);
04eefad5 6599 stmt_vec_info tmp = STMT_VINFO_REDUC_DEF (reduc_def_info);
559260b3 6600
56fb8e9d 6601 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = v_reduc_type;
559260b3 6602 /* If we have a condition reduction, see if we can simplify it further. */
56fb8e9d 6603 if (v_reduc_type == COND_REDUCTION)
559260b3 6604 {
29844e5f 6605 /* TODO: We can't yet handle reduction chains, since we need to treat
6606 each COND_EXPR in the chain specially, not just the last one.
6607 E.g. for:
6608
6609 x_1 = PHI <x_3, ...>
6610 x_2 = a_2 ? ... : x_1;
6611 x_3 = a_3 ? ... : x_2;
6612
6613 we're interested in the last element in x_3 for which a_2 || a_3
6614 is true, whereas the current reduction chain handling would
6615 vectorize x_2 as a normal VEC_COND_EXPR and only treat x_3
6616 as a reduction operation. */
6617 if (reduc_index == -1)
6618 {
6619 if (dump_enabled_p ())
6620 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6621 "conditional reduction chains not supported\n");
6622 return false;
6623 }
6624
6625 /* vect_is_simple_reduction ensured that operand 2 is the
6626 loop-carried operand. */
6627 gcc_assert (reduc_index == 2);
6628
3bf95150 6629 /* Loop peeling modifies the initial value of the reduction PHI, which
 6630 makes the reduction stmt that is transformed differ from the
 6631 original stmt analyzed. We need to record the reduction code for
 6632 CONST_COND_REDUCTION type reductions at the analysis stage, so that
 6633 it can be used directly at the transform stage. */
6634 if (STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MAX_EXPR
6635 || STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MIN_EXPR)
6636 {
6637 /* Also set the reduction type to CONST_COND_REDUCTION. */
6638 gcc_assert (cond_reduc_dt == vect_constant_def);
6639 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = CONST_COND_REDUCTION;
6640 }
6641 else if (direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST,
6642 vectype_in, OPTIMIZE_FOR_SPEED))
6643 {
6644 if (dump_enabled_p ())
6645 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6646 "optimizing condition reduction with"
6647 " FOLD_EXTRACT_LAST.\n");
6648 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = EXTRACT_LAST_REDUCTION;
6649 }
6650 else if (cond_reduc_dt == vect_induction_def)
56fb8e9d 6651 {
fdf40949 6652 tree base
6653 = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
6654 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
6655
6656 gcc_assert (TREE_CODE (base) == INTEGER_CST
6657 && TREE_CODE (step) == INTEGER_CST);
6658 cond_reduc_val = NULL_TREE;
6659 /* Find a suitable value, for MAX_EXPR below base, for MIN_EXPR
6660 above base; punt if base is the minimum value of the type for
6661 MAX_EXPR or maximum value of the type for MIN_EXPR for now. */
6662 if (tree_int_cst_sgn (step) == -1)
6663 {
6664 cond_reduc_op_code = MIN_EXPR;
6665 if (tree_int_cst_sgn (base) == -1)
6666 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6667 else if (tree_int_cst_lt (base,
6668 TYPE_MAX_VALUE (TREE_TYPE (base))))
6669 cond_reduc_val
6670 = int_const_binop (PLUS_EXPR, base, integer_one_node);
6671 }
6672 else
6673 {
6674 cond_reduc_op_code = MAX_EXPR;
6675 if (tree_int_cst_sgn (base) == 1)
6676 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6677 else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
6678 base))
6679 cond_reduc_val
6680 = int_const_binop (MINUS_EXPR, base, integer_one_node);
6681 }
6682 if (cond_reduc_val)
6683 {
6684 if (dump_enabled_p ())
6685 dump_printf_loc (MSG_NOTE, vect_location,
6686 "condition expression based on "
6687 "integer induction.\n");
6688 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6689 = INTEGER_INDUC_COND_REDUCTION;
6690 }
56fb8e9d 6691 }
834a2c29 6692 else if (cond_reduc_dt == vect_constant_def)
56fb8e9d 6693 {
6694 enum vect_def_type cond_initial_dt;
6695 gimple *def_stmt = SSA_NAME_DEF_STMT (ops[reduc_index]);
6696 tree cond_initial_val
6697 = PHI_ARG_DEF_FROM_EDGE (def_stmt, loop_preheader_edge (loop));
6698
6699 gcc_assert (cond_reduc_val != NULL_TREE);
bf8b3614 6700 vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt);
56fb8e9d 6701 if (cond_initial_dt == vect_constant_def
6702 && types_compatible_p (TREE_TYPE (cond_initial_val),
6703 TREE_TYPE (cond_reduc_val)))
6704 {
44b24fa0 6705 tree e = fold_binary (LE_EXPR, boolean_type_node,
56fb8e9d 6706 cond_initial_val, cond_reduc_val);
6707 if (e && (integer_onep (e) || integer_zerop (e)))
6708 {
6709 if (dump_enabled_p ())
6710 dump_printf_loc (MSG_NOTE, vect_location,
6711 "condition expression based on "
6712 "compile time constant.\n");
834a2c29 6713 /* Record reduction code at analysis stage. */
6714 STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info)
6715 = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
56fb8e9d 6716 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6717 = CONST_COND_REDUCTION;
6718 }
6719 }
6720 }
559260b3 6721 }
b4552064 6722
aebdbd31 6723 if (orig_stmt_info)
6724 gcc_assert (tmp == orig_stmt_info
04eefad5 6725 || REDUC_GROUP_FIRST_ELEMENT (tmp) == orig_stmt_info);
fb85abff 6726 else
34563054 6727 /* We changed STMT to be the first stmt in reduction chain, hence we
6728 check that in this case the first element in the chain is STMT. */
04eefad5 6729 gcc_assert (tmp == stmt_info
6730 || REDUC_GROUP_FIRST_ELEMENT (tmp) == stmt_info);
48e1416a 6731
04eefad5 6732 if (STMT_VINFO_LIVE_P (reduc_def_info))
fb85abff 6733 return false;
6734
35b1a569 6735 if (slp_node)
eefa05c8 6736 ncopies = 1;
6737 else
4eb17cb6 6738 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
b334cbba 6739
b334cbba 6740 gcc_assert (ncopies >= 1);
6741
6742 vec_mode = TYPE_MODE (vectype_in);
ce068755 6743 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
fb85abff 6744
2fbb03c0 6745 if (nested_cycle)
6746 {
6747 def_bb = gimple_bb (reduc_def_phi);
6748 def_stmt_loop = def_bb->loop_father;
6749 def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi,
6750 loop_preheader_edge (def_stmt_loop));
6751 stmt_vec_info def_arg_stmt_info = loop_vinfo->lookup_def (def_arg);
6752 if (def_arg_stmt_info
6753 && (STMT_VINFO_DEF_TYPE (def_arg_stmt_info)
6754 == vect_double_reduction_def))
6755 double_reduc = true;
6756 }
6757
98acf890 6758 vect_reduction_type reduction_type
6759 = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info);
6760 if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
6761 && ncopies > 1)
6762 {
6763 if (dump_enabled_p ())
6764 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6765 "multiple types in double reduction or condition "
6766 "reduction.\n");
6767 return false;
6768 }
6769
0df23b96 6770 if (code == COND_EXPR)
fb85abff 6771 {
d09d8733 6772 /* Only call during the analysis stage, otherwise we'll lose
98acf890 6773 STMT_VINFO_TYPE. */
a73182ff 6774 if (!vec_stmt && !vectorizable_condition (stmt_info, gsi, NULL,
98acf890 6775 true, NULL, cost_vec))
0df23b96 6776 {
6d8fb6cf 6777 if (dump_enabled_p ())
7bd765d4 6778 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
78bb46f5 6779 "unsupported condition in reduction\n");
5c6f6a61 6780 return false;
0df23b96 6781 }
fb85abff 6782 }
2fbb03c0 6783 else if (code == LSHIFT_EXPR || code == RSHIFT_EXPR
6784 || code == LROTATE_EXPR || code == RROTATE_EXPR)
fb85abff 6785 {
2fbb03c0 6786 /* Only call during the analysis stage, otherwise we'll lose
6787 STMT_VINFO_TYPE. We only support this for nested cycles
6788 without double reductions at the moment. */
6789 if (!nested_cycle
6790 || double_reduc
6791 || (!vec_stmt && !vectorizable_shift (stmt_info, gsi, NULL,
6792 NULL, cost_vec)))
2d788f29 6793 {
2d788f29 6794 if (dump_enabled_p ())
6795 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2fbb03c0 6796 "unsupported shift or rotation in reduction\n");
2d788f29 6797 return false;
6798 }
2fbb03c0 6799 }
6800 else
6801 {
6802 /* 4. Supportable by target? */
2d788f29 6803
0df23b96 6804 /* 4.1. check support for the operation in the loop */
b334cbba 6805 optab = optab_for_tree_code (code, vectype_in, optab_default);
0df23b96 6806 if (!optab)
6807 {
6d8fb6cf 6808 if (dump_enabled_p ())
7bd765d4 6809 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
78bb46f5 6810 "no optab.\n");
0df23b96 6811
6812 return false;
6813 }
6814
d6bf3b14 6815 if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
0df23b96 6816 {
6d8fb6cf 6817 if (dump_enabled_p ())
78bb46f5 6818 dump_printf (MSG_NOTE, "op not supported by target.\n");
0df23b96 6819
52acb7ae 6820 if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
fec8b6d0 6821 || !vect_worthwhile_without_simd_p (loop_vinfo, code))
0df23b96 6822 return false;
6823
6d8fb6cf 6824 if (dump_enabled_p ())
78bb46f5 6825 dump_printf (MSG_NOTE, "proceeding using word mode.\n");
0df23b96 6826 }
6827
6828 /* Worthwhile without SIMD support? */
b334cbba 6829 if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
fec8b6d0 6830 && !vect_worthwhile_without_simd_p (loop_vinfo, code))
0df23b96 6831 {
6d8fb6cf 6832 if (dump_enabled_p ())
7bd765d4 6833 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
78bb46f5 6834 "not worthwhile without SIMD support.\n");
0df23b96 6835
6836 return false;
6837 }
fb85abff 6838 }
6839
6840 /* 4.2. Check support for the epilog operation.
6841
6842 If STMT represents a reduction pattern, then the type of the
6843 reduction variable may be different than the type of the rest
6844 of the arguments. For example, consider the case of accumulation
 6845 of shorts into an int accumulator. The original code:
6846 S1: int_a = (int) short_a;
6847 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
6848
6849 was replaced with:
6850 STMT: int_acc = widen_sum <short_a, int_acc>
6851
6852 This means that:
48e1416a 6853 1. The tree-code that is used to create the vector operation in the
6854 epilog code (that reduces the partial results) is not the
6855 tree-code of STMT, but is rather the tree-code of the original
282bf14c 6856 stmt from the pattern that STMT is replacing. I.e, in the example
48e1416a 6857 above we want to use 'widen_sum' in the loop, but 'plus' in the
fb85abff 6858 epilog.
6859 2. The type (mode) we use to check available target support
48e1416a 6860 for the vector operation to be created in the *epilog*, is
6861 determined by the type of the reduction variable (in the example
d6bf3b14 6862 above we'd check this: optab_handler (plus_optab, vect_int_mode])).
fb85abff 6863 However the type (mode) we use to check available target support
6864 for the vector operation to be created *inside the loop*, is
6865 determined by the type of the other arguments to STMT (in the
d6bf3b14 6866 example we'd check this: optab_handler (widen_sum_optab,
6867 vect_short_mode)).
48e1416a 6868
6869 This is contrary to "regular" reductions, in which the types of all
6870 the arguments are the same as the type of the reduction variable.
6871 For "regular" reductions we can therefore use the same vector type
fb85abff 6872 (and also the same tree-code) when generating the epilog code and
6873 when generating the code inside the loop. */
6874
aebdbd31 6875 if (orig_stmt_info
d77809a4 6876 && (reduction_type == TREE_CODE_REDUCTION
6877 || reduction_type == FOLD_LEFT_REDUCTION))
fb85abff 6878 {
6879 /* This is a reduction pattern: get the vectype from the type of the
6880 reduction variable, and get the tree-code from orig_stmt. */
aebdbd31 6881 orig_code = gimple_assign_rhs_code (orig_stmt_info->stmt);
b334cbba 6882 gcc_assert (vectype_out);
6883 vec_mode = TYPE_MODE (vectype_out);
fb85abff 6884 }
6885 else
6886 {
6887 /* Regular reduction: use the same vectype and tree-code as used for
6888 the vector code inside the loop can be used for the epilog code. */
6889 orig_code = code;
b4552064 6890
ebacf0e3 6891 if (code == MINUS_EXPR)
6892 orig_code = PLUS_EXPR;
6893
b4552064 6894 /* For simple condition reductions, replace with the actual expression
6895 we want to base our reduction around. */
3bf95150 6896 if (reduction_type == CONST_COND_REDUCTION)
56fb8e9d 6897 {
834a2c29 6898 orig_code = STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info);
6899 gcc_assert (orig_code == MAX_EXPR || orig_code == MIN_EXPR);
56fb8e9d 6900 }
3bf95150 6901 else if (reduction_type == INTEGER_INDUC_COND_REDUCTION)
fdf40949 6902 orig_code = cond_reduc_op_code;
fb85abff 6903 }
6904
e53664fa 6905 reduc_fn = IFN_LAST;
d09d8733 6906
3bf95150 6907 if (reduction_type == TREE_CODE_REDUCTION
d77809a4 6908 || reduction_type == FOLD_LEFT_REDUCTION
3bf95150 6909 || reduction_type == INTEGER_INDUC_COND_REDUCTION
6910 || reduction_type == CONST_COND_REDUCTION)
0df23b96 6911 {
d77809a4 6912 if (reduction_type == FOLD_LEFT_REDUCTION
6913 ? fold_left_reduction_fn (orig_code, &reduc_fn)
6914 : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
d09d8733 6915 {
e53664fa 6916 if (reduc_fn != IFN_LAST
6917 && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
6918 OPTIMIZE_FOR_SPEED))
d09d8733 6919 {
ddbc17d5 6920 if (dump_enabled_p ())
6921 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6922 "reduc op not supported by target.\n");
d09d8733 6923
e53664fa 6924 reduc_fn = IFN_LAST;
d09d8733 6925 }
6926 }
6927 else
6928 {
6929 if (!nested_cycle || double_reduc)
6930 {
6931 if (dump_enabled_p ())
6932 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6933 "no reduc code for scalar code.\n");
6934
6935 return false;
6936 }
6937 }
0df23b96 6938 }
3bf95150 6939 else if (reduction_type == COND_REDUCTION)
0df23b96 6940 {
3d2b0034 6941 int scalar_precision
6942 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
d09d8733 6943 cr_index_scalar_type = make_unsigned_type (scalar_precision);
ce068755 6944 cr_index_vector_type = build_vector_type (cr_index_scalar_type,
6945 nunits_out);
0df23b96 6946
e53664fa 6947 if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
6948 OPTIMIZE_FOR_SPEED))
6949 reduc_fn = IFN_REDUC_MAX;
0df23b96 6950 }
6951
3bf95150 6952 if (reduction_type != EXTRACT_LAST_REDUCTION
331fbb0d 6953 && (!nested_cycle || double_reduc)
3bf95150 6954 && reduc_fn == IFN_LAST
6955 && !nunits_out.is_constant ())
ce068755 6956 {
6957 if (dump_enabled_p ())
6958 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6959 "missing target support for reduction on"
6960 " variable-length vectors.\n");
6961 return false;
6962 }
6963
633af029 6964 /* For SLP reductions, see if there is a neutral value we can use. */
6965 tree neutral_op = NULL_TREE;
6966 if (slp_node)
e1009321 6967 neutral_op = neutral_op_for_slp_reduction
cd24aa3c 6968 (slp_node_instance->reduc_phis, code,
a477acc5 6969 REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL);
633af029 6970
d77809a4 6971 if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
6972 {
6973 /* We can't support in-order reductions of code such as this:
6974
6975 for (int i = 0; i < n1; ++i)
6976 for (int j = 0; j < n2; ++j)
6977 l += a[j];
6978
6979 since GCC effectively transforms the loop when vectorizing:
6980
6981 for (int i = 0; i < n1 / VF; ++i)
6982 for (int j = 0; j < n2; ++j)
6983 for (int k = 0; k < VF; ++k)
6984 l += a[j];
6985
6986 which is a reassociation of the original operation. */
6987 if (dump_enabled_p ())
6988 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6989 "in-order double reduction not supported.\n");
6990
6991 return false;
6992 }
6993
6994 if (reduction_type == FOLD_LEFT_REDUCTION
6995 && slp_node
1c2fef9a 6996 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
d77809a4 6997 {
6998 /* We cannot use in-order reductions in this case because there is
6999 an implicit reassociation of the operations involved. */
7000 if (dump_enabled_p ())
7001 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7002 "in-order unchained SLP reductions not supported.\n");
7003 return false;
7004 }
7005
633af029 7006 /* For double reductions, and for SLP reductions with a neutral value,
7007 we construct a variable-length initial vector by loading a vector
7008 full of the neutral value and then shift-and-inserting the start
7009 values into the low-numbered elements. */
7010 if ((double_reduc || neutral_op)
7011 && !nunits_out.is_constant ()
7012 && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
7013 vectype_out, OPTIMIZE_FOR_SPEED))
ce068755 7014 {
ce068755 7015 if (dump_enabled_p ())
7016 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
633af029 7017 "reduction on variable-length vectors requires"
7018 " target support for a vector-shift-and-insert"
7019 " operation.\n");
ce068755 7020 return false;
7021 }
7022
633af029 7023 /* Check extra constraints for variable-length unchained SLP reductions. */
7024 if (STMT_SLP_TYPE (stmt_info)
1c2fef9a 7025 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
633af029 7026 && !nunits_out.is_constant ())
7027 {
7028 /* We checked above that we could build the initial vector when
7029 there's a neutral element value. Check here for the case in
7030 which each SLP statement has its own initial value and in which
7031 that value needs to be repeated for every instance of the
7032 statement within the initial vector. */
7033 unsigned int group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7034 scalar_mode elt_mode = SCALAR_TYPE_MODE (TREE_TYPE (vectype_out));
7035 if (!neutral_op
7036 && !can_duplicate_and_interleave_p (group_size, elt_mode))
7037 {
7038 if (dump_enabled_p ())
7039 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7040 "unsupported form of SLP reduction for"
7041 " variable-length vectors: cannot build"
7042 " initial vector.\n");
7043 return false;
7044 }
7045 /* The epilogue code relies on the number of elements being a multiple
7046 of the group size. The duplicate-and-interleave approach to setting
 7047 up the initial vector does too. */
7048 if (!multiple_p (nunits_out, group_size))
7049 {
7050 if (dump_enabled_p ())
7051 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7052 "unsupported form of SLP reduction for"
7053 " variable-length vectors: the vector size"
7054 " is not a multiple of the number of results.\n");
7055 return false;
7056 }
ce068755 7057 }
7058
f0c50415 7059 /* In case of a widening multiplication by a constant, we update the type
7060 of the constant to be the type of the other operand. We check that the
7061 constant fits the type in the pattern recognition pass. */
7062 if (code == DOT_PROD_EXPR
7063 && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1])))
7064 {
7065 if (TREE_CODE (ops[0]) == INTEGER_CST)
7066 ops[0] = fold_convert (TREE_TYPE (ops[1]), ops[0]);
7067 else if (TREE_CODE (ops[1]) == INTEGER_CST)
7068 ops[1] = fold_convert (TREE_TYPE (ops[0]), ops[1]);
7069 else
7070 {
6d8fb6cf 7071 if (dump_enabled_p ())
7bd765d4 7072 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
78bb46f5 7073 "invalid types in dot-prod\n");
f0c50415 7074
7075 return false;
7076 }
7077 }
7078
3bf95150 7079 if (reduction_type == COND_REDUCTION)
d09d8733 7080 {
7081 widest_int ni;
7082
7083 if (! max_loop_iterations (loop, &ni))
7084 {
7085 if (dump_enabled_p ())
7086 dump_printf_loc (MSG_NOTE, vect_location,
7087 "loop count not known, cannot create cond "
7088 "reduction.\n");
7089 return false;
7090 }
7091 /* Convert backedges to iterations. */
7092 ni += 1;
7093
7094 /* The additional index will be the same type as the condition. Check
7095 that the loop can fit into this less one (because we'll use up the
7096 zero slot for when there are no matches). */
7097 tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
7098 if (wi::geu_p (ni, wi::to_widest (max_index)))
7099 {
7100 if (dump_enabled_p ())
7101 dump_printf_loc (MSG_NOTE, vect_location,
7102 "loop size is greater than data size.\n");
7103 return false;
7104 }
7105 }
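
  /* For example (a rough illustration): if CR_INDEX_SCALAR_TYPE ends up
     being an 8-bit unsigned type, MAX_INDEX is 255; because index 0 is
     reserved for the "no match" case, only loops with at most 254
     iterations pass the check above.  */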
7106
fb85abff 7107 /* In case the vectorization factor (VF) is bigger than the number
7108 of elements that we can fit in a vectype (nunits), we have to generate
7109 more than one vector stmt - i.e - we need to "unroll" the
7110 vector stmt by a factor VF/nunits. For more details see documentation
7111 in vectorizable_operation. */
7112
7113 /* If the reduction is used in an outer loop we need to generate
7114 VF intermediate results, like so (e.g. for ncopies=2):
7115 r0 = phi (init, r0)
7116 r1 = phi (init, r1)
7117 r0 = x0 + r0;
7118 r1 = x1 + r1;
7119 (i.e. we generate VF results in 2 registers).
7120 In this case we have a separate def-use cycle for each copy, and therefore
7121 for each copy we get the vector def for the reduction variable from the
7122 respective phi node created for this copy.
7123
7124 Otherwise (the reduction is unused in the loop nest), we can combine
7125 together intermediate results, like so (e.g. for ncopies=2):
7126 r = phi (init, r)
7127 r = x0 + r;
7128 r = x1 + r;
7129 (i.e. we generate VF/2 results in a single register).
7130 In this case for each copy we get the vector def for the reduction variable
7131 from the vectorized reduction operation generated in the previous iteration.
fb85abff 7132
f17c6474 7133 This only works when we see both the reduction PHI and its only consumer
7134 in vectorizable_reduction and there are no intermediate stmts
7135 participating. */
aaac0b10 7136 stmt_vec_info use_stmt_info;
04eefad5 7137 tree reduc_phi_result = gimple_phi_result (reduc_def_phi);
f17c6474 7138 if (ncopies > 1
7139 && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
aaac0b10 7140 && (use_stmt_info = loop_vinfo->lookup_single_use (reduc_phi_result))
0b7ea3a9 7141 && vect_stmt_to_vectorize (use_stmt_info) == stmt_info)
fb85abff 7142 {
7143 single_defuse_cycle = true;
7144 epilog_copies = 1;
7145 }
7146 else
7147 epilog_copies = ncopies;
7148
4bde5583 7149 /* If the reduction stmt is one of the patterns that have a lane-reducing
 7150 operation embedded, we cannot handle the case of ! single_defuse_cycle. */
7151 if ((ncopies > 1
7152 && ! single_defuse_cycle)
7153 && (code == DOT_PROD_EXPR
7154 || code == WIDEN_SUM_EXPR
7155 || code == SAD_EXPR))
7156 {
7157 if (dump_enabled_p ())
7158 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7159 "multi def-use cycle not possible for lane-reducing "
7160 "reduction operation\n");
7161 return false;
7162 }
7163
88fefa8f 7164 if (slp_node)
7165 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7166 else
7167 vec_num = 1;
7168
7169 internal_fn cond_fn = get_conditional_internal_fn (code);
7170 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
d3a7159f 7171 bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);
88fefa8f 7172
4bde5583 7173 if (!vec_stmt) /* transformation not required. */
7174 {
43157fa0 7175 vect_model_reduction_cost (stmt_info, reduc_fn, ncopies, cost_vec);
88fefa8f 7176 if (loop_vinfo && LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
7177 {
d77809a4 7178 if (reduction_type != FOLD_LEFT_REDUCTION
d3a7159f 7179 && !mask_by_cond_expr
d77809a4 7180 && (cond_fn == IFN_LAST
7181 || !direct_internal_fn_supported_p (cond_fn, vectype_in,
7182 OPTIMIZE_FOR_SPEED)))
88fefa8f 7183 {
7184 if (dump_enabled_p ())
7185 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7186 "can't use a fully-masked loop because no"
7187 " conditional operation is available.\n");
7188 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7189 }
7190 else if (reduc_index == -1)
7191 {
7192 if (dump_enabled_p ())
7193 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7194 "can't use a fully-masked loop for chained"
7195 " reductions.\n");
7196 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7197 }
7198 else
7199 vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
7200 vectype_in);
7201 }
d77809a4 7202 if (dump_enabled_p ()
7203 && reduction_type == FOLD_LEFT_REDUCTION)
7204 dump_printf_loc (MSG_NOTE, vect_location,
7205 "using an in-order (fold-left) reduction.\n");
4bde5583 7206 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
7207 return true;
7208 }
7209
7210 /* Transform. */
7211
7212 if (dump_enabled_p ())
7213 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
7214
7215 /* FORNOW: Multiple types are not supported for condition. */
7216 if (code == COND_EXPR)
7217 gcc_assert (ncopies == 1);
7218
88fefa8f 7219 bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
7220
d77809a4 7221 if (reduction_type == FOLD_LEFT_REDUCTION)
7222 return vectorize_fold_left_reduction
a73182ff 7223 (stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi, code,
d77809a4 7224 reduc_fn, ops, vectype_in, reduc_index, masks);
7225
3bf95150 7226 if (reduction_type == EXTRACT_LAST_REDUCTION)
7227 {
7228 gcc_assert (!slp_node);
a73182ff 7229 return vectorizable_condition (stmt_info, gsi, vec_stmt,
98acf890 7230 true, NULL, NULL);
3bf95150 7231 }
7232
4bde5583 7233 /* Create the destination vector */
7234 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
7235
fb85abff 7236 prev_stmt_info = NULL;
7237 prev_phi_info = NULL;
88fefa8f 7238 if (!slp_node)
eefa05c8 7239 {
f1f41a6c 7240 vec_oprnds0.create (1);
f17c6474 7241 vec_oprnds1.create (1);
eefa05c8 7242 if (op_type == ternary_op)
f17c6474 7243 vec_oprnds2.create (1);
eefa05c8 7244 }
7245
f1f41a6c 7246 phis.create (vec_num);
7247 vect_defs.create (vec_num);
eefa05c8 7248 if (!slp_node)
f1f41a6c 7249 vect_defs.quick_push (NULL_TREE);
eefa05c8 7250
6154acba 7251 if (slp_node)
7252 phis.splice (SLP_TREE_VEC_STMTS (slp_node_instance->reduc_phis));
7253 else
04eefad5 7254 phis.quick_push (STMT_VINFO_VEC_STMT (reduc_def_info));
6154acba 7255
fb85abff 7256 for (j = 0; j < ncopies; j++)
7257 {
0df23b96 7258 if (code == COND_EXPR)
7259 {
eefa05c8 7260 gcc_assert (!slp_node);
a73182ff 7261 vectorizable_condition (stmt_info, gsi, vec_stmt,
98acf890 7262 true, NULL, NULL);
0df23b96 7263 break;
7264 }
2fbb03c0 7265 if (code == LSHIFT_EXPR
7266 || code == RSHIFT_EXPR)
7267 {
7268 vectorizable_shift (stmt_info, gsi, vec_stmt, slp_node, NULL);
7269 break;
7270 }
0df23b96 7271
fb85abff 7272 /* Handle uses. */
7273 if (j == 0)
7274 {
bf448dc8 7275 if (slp_node)
7276 {
7277 /* Get vec defs for all the operands except the reduction index,
1013d836 7278 ensuring the ordering of the ops in the vector is kept. */
bf448dc8 7279 auto_vec<tree, 3> slp_ops;
7280 auto_vec<vec<tree>, 3> vec_defs;
7281
f17c6474 7282 slp_ops.quick_push (ops[0]);
7283 slp_ops.quick_push (ops[1]);
bf448dc8 7284 if (op_type == ternary_op)
f17c6474 7285 slp_ops.quick_push (ops[2]);
bf448dc8 7286
4f0d4cce 7287 vect_get_slp_defs (slp_ops, slp_node, &vec_defs);
09e31a48 7288
f17c6474 7289 vec_oprnds0.safe_splice (vec_defs[0]);
7290 vec_defs[0].release ();
7291 vec_oprnds1.safe_splice (vec_defs[1]);
7292 vec_defs[1].release ();
bf448dc8 7293 if (op_type == ternary_op)
1013d836 7294 {
f17c6474 7295 vec_oprnds2.safe_splice (vec_defs[2]);
7296 vec_defs[2].release ();
1013d836 7297 }
bf448dc8 7298 }
eefa05c8 7299 else
bf448dc8 7300 {
44b24fa0 7301 vec_oprnds0.quick_push
a73182ff 7302 (vect_get_vec_def_for_operand (ops[0], stmt_info));
f17c6474 7303 vec_oprnds1.quick_push
a73182ff 7304 (vect_get_vec_def_for_operand (ops[1], stmt_info));
eefa05c8 7305 if (op_type == ternary_op)
f17c6474 7306 vec_oprnds2.quick_push
a73182ff 7307 (vect_get_vec_def_for_operand (ops[2], stmt_info));
bf448dc8 7308 }
fb85abff 7309 }
7310 else
7311 {
eefa05c8 7312 if (!slp_node)
7313 {
f17c6474 7314 gcc_assert (reduc_index != -1 || ! single_defuse_cycle);
fb85abff 7315
f17c6474 7316 if (single_defuse_cycle && reduc_index == 0)
585ed623 7317 vec_oprnds0[0] = gimple_get_lhs (new_stmt_info->stmt);
f17c6474 7318 else
7319 vec_oprnds0[0]
c0dd122a 7320 = vect_get_vec_def_for_stmt_copy (loop_vinfo,
7321 vec_oprnds0[0]);
f17c6474 7322 if (single_defuse_cycle && reduc_index == 1)
585ed623 7323 vec_oprnds1[0] = gimple_get_lhs (new_stmt_info->stmt);
f17c6474 7324 else
7325 vec_oprnds1[0]
c0dd122a 7326 = vect_get_vec_def_for_stmt_copy (loop_vinfo,
7327 vec_oprnds1[0]);
f17c6474 7328 if (op_type == ternary_op)
7329 {
7330 if (single_defuse_cycle && reduc_index == 2)
585ed623 7331 vec_oprnds2[0] = gimple_get_lhs (new_stmt_info->stmt);
f17c6474 7332 else
7333 vec_oprnds2[0]
c0dd122a 7334 = vect_get_vec_def_for_stmt_copy (loop_vinfo,
7335 vec_oprnds2[0]);
f17c6474 7336 }
7337 }
fb85abff 7338 }
7339
f1f41a6c 7340 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
ade2ac53 7341 {
f17c6474 7342 tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
d3a7159f 7343 if (masked_loop_p && !mask_by_cond_expr)
88fefa8f 7344 {
7345 /* Make sure that the reduction accumulator is vop[0]. */
7346 if (reduc_index == 1)
7347 {
7348 gcc_assert (commutative_tree_code (code));
7349 std::swap (vop[0], vop[1]);
7350 }
7351 tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
7352 vectype_in, i * ncopies + j);
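	  /* Build e.g. new_temp = .COND_ADD (mask, vop[0], vop[1], vop[0]),
	     so that masked-off lanes keep the old accumulator value.  */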
47c52435 7353 gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
7354 vop[0], vop[1],
7355 vop[0]);
88fefa8f 7356 new_temp = make_ssa_name (vec_dest, call);
7357 gimple_call_set_lhs (call, new_temp);
7358 gimple_call_set_nothrow (call, true);
a73182ff 7359 new_stmt_info
7360 = vect_finish_stmt_generation (stmt_info, call, gsi);
88fefa8f 7361 }
7362 else
7363 {
7364 if (op_type == ternary_op)
7365 vop[2] = vec_oprnds2[i];
eefa05c8 7366
d3a7159f 7367 if (masked_loop_p && mask_by_cond_expr)
7368 {
7369 tree mask = vect_get_loop_mask (gsi, masks,
7370 vec_num * ncopies,
7371 vectype_in, i * ncopies + j);
7372 build_vect_cond_expr (code, vop, mask, gsi);
7373 }
7374
585ed623 7375 gassign *new_stmt = gimple_build_assign (vec_dest, code,
7376 vop[0], vop[1], vop[2]);
50921328 7377 new_temp = make_ssa_name (vec_dest, new_stmt);
7378 gimple_assign_set_lhs (new_stmt, new_temp);
585ed623 7379 new_stmt_info
a73182ff 7380 = vect_finish_stmt_generation (stmt_info, new_stmt, gsi);
88fefa8f 7381 }
39a5d6b1 7382
eefa05c8 7383 if (slp_node)
7384 {
585ed623 7385 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt_info);
f1f41a6c 7386 vect_defs.quick_push (new_temp);
ade2ac53 7387 }
eefa05c8 7388 else
f1f41a6c 7389 vect_defs[0] = new_temp;
ade2ac53 7390 }
7391
eefa05c8 7392 if (slp_node)
7393 continue;
48e1416a 7394
fb85abff 7395 if (j == 0)
585ed623 7396 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt_info;
fb85abff 7397 else
585ed623 7398 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt_info;
0df23b96 7399
585ed623 7400 prev_stmt_info = new_stmt_info;
fb85abff 7401 }
7402
7403 /* Finalize the reduction-phi (set its arguments) and create the
7404 epilog reduction code. */
eefa05c8 7405 if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
435515db 7406 vect_defs[0] = gimple_get_lhs ((*vec_stmt)->stmt);
eefa05c8 7407
a73182ff 7408 vect_create_epilog_for_reduction (vect_defs, stmt_info, reduc_def_phi,
e53664fa 7409 epilog_copies, reduc_fn, phis,
fdf40949 7410 double_reduc, slp_node, slp_node_instance,
633af029 7411 cond_reduc_val, cond_reduc_op_code,
7412 neutral_op);
eefa05c8 7413
fb85abff 7414 return true;
7415}
7416
7417/* Function vect_min_worthwhile_factor.
7418
7419 For a loop where we could vectorize the operation indicated by CODE,
7420 return the minimum vectorization factor that makes it worthwhile
7421 to use generic vectors. */
d75596cd 7422static unsigned int
fb85abff 7423vect_min_worthwhile_factor (enum tree_code code)
7424{
7425 switch (code)
7426 {
7427 case PLUS_EXPR:
7428 case MINUS_EXPR:
7429 case NEGATE_EXPR:
7430 return 4;
7431
7432 case BIT_AND_EXPR:
7433 case BIT_IOR_EXPR:
7434 case BIT_XOR_EXPR:
7435 case BIT_NOT_EXPR:
7436 return 2;
7437
7438 default:
7439 return INT_MAX;
7440 }
7441}
7442
fec8b6d0 7443/* Return true if VINFO indicates we are doing loop vectorization and if
7444 it is worth decomposing CODE operations into scalar operations for
7445 that loop's vectorization factor. */
7446
7447bool
7448vect_worthwhile_without_simd_p (vec_info *vinfo, tree_code code)
7449{
7450 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
d75596cd 7451 unsigned HOST_WIDE_INT value;
fec8b6d0 7452 return (loop_vinfo
d75596cd 7453 && LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&value)
7454 && value >= vect_min_worthwhile_factor (code));
fec8b6d0 7455}
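
/* For instance (illustrative only): with a constant vectorization factor
   of 4, emulating a vector addition with word-mode arithmetic is deemed
   worthwhile (vect_min_worthwhile_factor (PLUS_EXPR) == 4), whereas a
   factor of 2 is not; the bitwise operations already pay off at factor 2.  */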
fb85abff 7456
7457/* Function vectorizable_induction
7458
ecc42a77 7459 Check if STMT_INFO performs an induction computation that can be vectorized.
fb85abff 7460 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
7461 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
ecc42a77 7462 Return true if STMT_INFO is vectorizable in this way. */
fb85abff 7463
03f1a648 7464bool
ecc42a77 7465vectorizable_induction (stmt_vec_info stmt_info,
03f1a648 7466 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
435515db 7467 stmt_vec_info *vec_stmt, slp_tree slp_node,
c863e35b 7468 stmt_vector_for_cost *cost_vec)
03f1a648 7469{
03f1a648 7470 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2e966e2a 7471 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
03f1a648 7472 unsigned ncopies;
7473 bool nested_in_vect_loop = false;
2e966e2a 7474 class loop *iv_loop;
03f1a648 7475 tree vec_def;
7476 edge pe = loop_preheader_edge (loop);
7477 basic_block new_bb;
7478 tree new_vec, vec_init, vec_step, t;
7479 tree new_name;
7480 gimple *new_stmt;
7481 gphi *induction_phi;
7482 tree induc_def, vec_dest;
7483 tree init_expr, step_expr;
d75596cd 7484 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
03f1a648 7485 unsigned i;
7486 tree expr;
7487 gimple_seq stmts;
7488 imm_use_iterator imm_iter;
7489 use_operand_p use_p;
7490 gimple *exit_phi;
7491 edge latch_e;
7492 tree loop_arg;
7493 gimple_stmt_iterator si;
03f1a648 7494
ecc42a77 7495 gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
7496 if (!phi)
03f1a648 7497 return false;
7498
7499 if (!STMT_VINFO_RELEVANT_P (stmt_info))
7500 return false;
7501
7502 /* Make sure it was recognized as induction computation. */
7503 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
7504 return false;
7505
fb85abff 7506 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
833ff7f4 7507 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
fb85abff 7508
5cc7beaa 7509 if (slp_node)
7510 ncopies = 1;
7511 else
4eb17cb6 7512 ncopies = vect_get_num_copies (loop_vinfo, vectype);
fb85abff 7513 gcc_assert (ncopies >= 1);
03f1a648 7514
02a2bdca 7515 /* FORNOW. These restrictions should be relaxed. */
a73182ff 7516 if (nested_in_vect_loop_p (loop, stmt_info))
fb85abff 7517 {
02a2bdca 7518 imm_use_iterator imm_iter;
7519 use_operand_p use_p;
42acab1c 7520 gimple *exit_phi;
02a2bdca 7521 edge latch_e;
7522 tree loop_arg;
7523
7524 if (ncopies > 1)
7525 {
6d8fb6cf 7526 if (dump_enabled_p ())
7bd765d4 7527 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
78bb46f5 7528 "multiple types in nested loop.\n");
02a2bdca 7529 return false;
7530 }
7531
03f1a648 7532 /* FORNOW: outer loop induction with SLP not supported. */
7533 if (STMT_SLP_TYPE (stmt_info))
7534 return false;
7535
02a2bdca 7536 exit_phi = NULL;
7537 latch_e = loop_latch_edge (loop->inner);
7538 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7539 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7540 {
42acab1c 7541 gimple *use_stmt = USE_STMT (use_p);
0b308eee 7542 if (is_gimple_debug (use_stmt))
7543 continue;
7544
7545 if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
02a2bdca 7546 {
0b308eee 7547 exit_phi = use_stmt;
02a2bdca 7548 break;
7549 }
7550 }
7551 if (exit_phi)
7552 {
03c0d666 7553 stmt_vec_info exit_phi_vinfo = loop_vinfo->lookup_stmt (exit_phi);
02a2bdca 7554 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
7555 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
7556 {
6d8fb6cf 7557 if (dump_enabled_p ())
78bb46f5 7558 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7bd765d4 7559 "inner-loop induction only used outside "
78bb46f5 7560 "of the outer vectorized loop.\n");
02a2bdca 7561 return false;
7562 }
7563 }
fb85abff 7564
03f1a648 7565 nested_in_vect_loop = true;
7566 iv_loop = loop->inner;
7567 }
7568 else
7569 iv_loop = loop;
7570 gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
fb85abff 7571
833ff7f4 7572 if (slp_node && !nunits.is_constant ())
7573 {
7574 /* The current SLP code creates the initial value element-by-element. */
7575 if (dump_enabled_p ())
7576 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7577 "SLP induction not supported for variable-length"
7578 " vectors.\n");
7579 return false;
7580 }
7581
fb85abff 7582 if (!vec_stmt) /* transformation not required. */
7583 {
7584 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
88f6eb8f 7585 DUMP_VECT_SCOPE ("vectorizable_induction");
c863e35b 7586 vect_model_induction_cost (stmt_info, ncopies, cost_vec);
fb85abff 7587 return true;
7588 }
7589
16ed3c2c 7590 /* Transform. */
fb85abff 7591
03f1a648 7592 /* Compute a vector variable, initialized with the first VF values of
7593 the induction variable. E.g., for an iv with IV_PHI='X' and
7594 evolution S, for a vector of 4 units, we want to compute:
7595 [X, X + S, X + 2*S, X + 3*S]. */
7596
6d8fb6cf 7597 if (dump_enabled_p ())
78bb46f5 7598 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
fb85abff 7599
03f1a648 7600 latch_e = loop_latch_edge (iv_loop);
7601 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7602
7603 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
7604 gcc_assert (step_expr != NULL_TREE);
7605
7606 pe = loop_preheader_edge (iv_loop);
7607 init_expr = PHI_ARG_DEF_FROM_EDGE (phi,
7608 loop_preheader_edge (iv_loop));
7609
03f1a648 7610 stmts = NULL;
4ca4c75d 7611 if (!nested_in_vect_loop)
6753a4bf 7612 {
4ca4c75d 7613 /* Convert the initial value to the desired type. */
7614 tree new_type = TREE_TYPE (vectype);
7615 init_expr = gimple_convert (&stmts, new_type, init_expr);
7616
7617 /* If we are using the loop mask to "peel" for alignment then we need
7618 to adjust the start value here. */
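	  /* Illustrative example (values are hypothetical): if the scalar IV
	     starts at X with step S and the number of skipped iterations is 2,
	     the code below rewrites the start value to X - 2*S, so that lane 2
	     - the first lane the mask keeps active - still sees X.  */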
7619 tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
7620 if (skip_niters != NULL_TREE)
7621 {
7622 if (FLOAT_TYPE_P (vectype))
7623 skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
7624 skip_niters);
7625 else
7626 skip_niters = gimple_convert (&stmts, new_type, skip_niters);
7627 tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
7628 skip_niters, step_expr);
7629 init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
7630 init_expr, skip_step);
7631 }
6753a4bf 7632 }
7633
4ca4c75d 7634 /* Convert the step to the desired type. */
7635 step_expr = gimple_convert (&stmts, TREE_TYPE (vectype), step_expr);
7636
03f1a648 7637 if (stmts)
7638 {
7639 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7640 gcc_assert (!new_bb);
7641 }
7642
7643 /* Find the first insertion point in the BB. */
ecc42a77 7644 basic_block bb = gimple_bb (phi);
03f1a648 7645 si = gsi_after_labels (bb);
7646
5cc7beaa 7647 /* For SLP induction we have to generate several IVs as for example
7648 with group size 3 we need [i, i, i, i + S] [i + S, i + S, i + 2*S, i + 2*S]
7649 [i + 2*S, i + 3*S, i + 3*S, i + 3*S]. The step is the same uniform
7650 [VF*S, VF*S, VF*S, VF*S] for all. */
7651 if (slp_node)
7652 {
833ff7f4 7653 /* Enforced above. */
7654 unsigned int const_nunits = nunits.to_constant ();
7655
5cc7beaa 7656 /* Generate [VF*S, VF*S, ... ]. */
7657 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7658 {
7659 expr = build_int_cst (integer_type_node, vf);
7660 expr = fold_convert (TREE_TYPE (step_expr), expr);
7661 }
7662 else
7663 expr = build_int_cst (TREE_TYPE (step_expr), vf);
7664 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7665 expr, step_expr);
7666 if (! CONSTANT_CLASS_P (new_name))
a73182ff 7667 new_name = vect_init_vector (stmt_info, new_name,
5cc7beaa 7668 TREE_TYPE (step_expr), NULL);
7669 new_vec = build_vector_from_val (vectype, new_name);
a73182ff 7670 vec_step = vect_init_vector (stmt_info, new_vec, vectype, NULL);
5cc7beaa 7671
7672 /* Now generate the IVs. */
7673 unsigned group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7674 unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
833ff7f4 7675 unsigned elts = const_nunits * nvects;
7676 unsigned nivs = least_common_multiple (group_size,
7677 const_nunits) / const_nunits;
5cc7beaa 7678 gcc_assert (elts % group_size == 0);
7679 tree elt = init_expr;
7680 unsigned ivn;
7681 for (ivn = 0; ivn < nivs; ++ivn)
7682 {
833ff7f4 7683 tree_vector_builder elts (vectype, const_nunits, 1);
9ed1960b 7684 stmts = NULL;
833ff7f4 7685 for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
5cc7beaa 7686 {
833ff7f4 7687 if (ivn*const_nunits + eltn >= group_size
7688 && (ivn * const_nunits + eltn) % group_size == 0)
9ed1960b 7689 elt = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (elt),
7690 elt, step_expr);
eab42b58 7691 elts.quick_push (elt);
5cc7beaa 7692 }
db39ad9d 7693 vec_init = gimple_build_vector (&stmts, &elts);
9ed1960b 7694 if (stmts)
5cc7beaa 7695 {
9ed1960b 7696 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7697 gcc_assert (!new_bb);
5cc7beaa 7698 }
5cc7beaa 7699
7700 /* Create the induction-phi that defines the induction-operand. */
7701 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7702 induction_phi = create_phi_node (vec_dest, iv_loop->header);
dc1fb456 7703 stmt_vec_info induction_phi_info
7704 = loop_vinfo->add_stmt (induction_phi);
5cc7beaa 7705 induc_def = PHI_RESULT (induction_phi);
7706
7707 /* Create the iv update inside the loop */
7708 vec_def = make_ssa_name (vec_dest);
7709 new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
7710 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
04b2391d 7711 loop_vinfo->add_stmt (new_stmt);
5cc7beaa 7712
7713 /* Set the arguments of the phi node: */
7714 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7715 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7716 UNKNOWN_LOCATION);
7717
dc1fb456 7718 SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi_info);
5cc7beaa 7719 }
7720
7721 /* Re-use IVs when we can. */
7722 if (ivn < nvects)
7723 {
7724 unsigned vfp
833ff7f4 7725 = least_common_multiple (group_size, const_nunits) / group_size;
5cc7beaa 7726 /* Generate [VF'*S, VF'*S, ... ]. */
7727 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7728 {
7729 expr = build_int_cst (integer_type_node, vfp);
7730 expr = fold_convert (TREE_TYPE (step_expr), expr);
7731 }
7732 else
7733 expr = build_int_cst (TREE_TYPE (step_expr), vfp);
7734 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7735 expr, step_expr);
7736 if (! CONSTANT_CLASS_P (new_name))
a73182ff 7737 new_name = vect_init_vector (stmt_info, new_name,
5cc7beaa 7738 TREE_TYPE (step_expr), NULL);
7739 new_vec = build_vector_from_val (vectype, new_name);
a73182ff 7740 vec_step = vect_init_vector (stmt_info, new_vec, vectype, NULL);
5cc7beaa 7741 for (; ivn < nvects; ++ivn)
7742 {
dc1fb456 7743 gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs]->stmt;
5cc7beaa 7744 tree def;
7745 if (gimple_code (iv) == GIMPLE_PHI)
7746 def = gimple_phi_result (iv);
7747 else
7748 def = gimple_assign_lhs (iv);
7749 new_stmt = gimple_build_assign (make_ssa_name (vectype),
7750 PLUS_EXPR,
7751 def, vec_step);
7752 if (gimple_code (iv) == GIMPLE_PHI)
7753 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7754 else
7755 {
7756 gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
7757 gsi_insert_after (&tgsi, new_stmt, GSI_CONTINUE_LINKING);
7758 }
dc1fb456 7759 SLP_TREE_VEC_STMTS (slp_node).quick_push
7760 (loop_vinfo->add_stmt (new_stmt));
5cc7beaa 7761 }
7762 }
7763
7764 return true;
7765 }
7766
03f1a648 7767 /* Create the vector that holds the initial_value of the induction. */
7768 if (nested_in_vect_loop)
7769 {
7770 /* iv_loop is nested in the loop to be vectorized. init_expr had already
7771 been created during vectorization of previous stmts. We obtain it
7772 from the STMT_VINFO_VEC_STMT of the defining stmt. */
a73182ff 7773 vec_init = vect_get_vec_def_for_operand (init_expr, stmt_info);
03f1a648 7774 /* If the initial value is not of proper type, convert it. */
7775 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
7776 {
7777 new_stmt
7778 = gimple_build_assign (vect_get_new_ssa_name (vectype,
7779 vect_simple_var,
7780 "vec_iv_"),
7781 VIEW_CONVERT_EXPR,
7782 build1 (VIEW_CONVERT_EXPR, vectype,
7783 vec_init));
7784 vec_init = gimple_assign_lhs (new_stmt);
7785 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
7786 new_stmt);
7787 gcc_assert (!new_bb);
04b2391d 7788 loop_vinfo->add_stmt (new_stmt);
03f1a648 7789 }
7790 }
7791 else
7792 {
03f1a648 7793 /* iv_loop is the loop to be vectorized. Create:
7794 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
7795 stmts = NULL;
7796 new_name = gimple_convert (&stmts, TREE_TYPE (vectype), init_expr);
7797
833ff7f4 7798 unsigned HOST_WIDE_INT const_nunits;
7799 if (nunits.is_constant (&const_nunits))
03f1a648 7800 {
833ff7f4 7801 tree_vector_builder elts (vectype, const_nunits, 1);
9ed1960b 7802 elts.quick_push (new_name);
833ff7f4 7803 for (i = 1; i < const_nunits; i++)
7804 {
7805 /* Create: new_name_i = new_name + step_expr */
7806 new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
7807 new_name, step_expr);
7808 elts.quick_push (new_name);
7809 }
7810 /* Create a vector from [new_name_0, new_name_1, ...,
7811 new_name_nunits-1] */
7812 vec_init = gimple_build_vector (&stmts, &elts);
03f1a648 7813 }
833ff7f4 7814 else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
7815 /* Build the initial value directly from a VEC_SERIES_EXPR. */
7816 vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, vectype,
7817 new_name, step_expr);
7818 else
7819 {
7820 /* Build:
7821 [base, base, base, ...]
7822 + (vectype) [0, 1, 2, ...] * [step, step, step, ...]. */
7823 gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
7824 gcc_assert (flag_associative_math);
7825 tree index = build_index_vector (vectype, 0, 1);
7826 tree base_vec = gimple_build_vector_from_val (&stmts, vectype,
7827 new_name);
7828 tree step_vec = gimple_build_vector_from_val (&stmts, vectype,
7829 step_expr);
7830 vec_init = gimple_build (&stmts, FLOAT_EXPR, vectype, index);
7831 vec_init = gimple_build (&stmts, MULT_EXPR, vectype,
7832 vec_init, step_vec);
7833 vec_init = gimple_build (&stmts, PLUS_EXPR, vectype,
7834 vec_init, base_vec);
7835 }
7836
03f1a648 7837 if (stmts)
7838 {
7839 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7840 gcc_assert (!new_bb);
7841 }
03f1a648 7842 }
7843
7844
7845 /* Create the vector that holds the step of the induction. */
7846 if (nested_in_vect_loop)
7847 /* iv_loop is nested in the loop to be vectorized. Generate:
7848 vec_step = [S, S, S, S] */
7849 new_name = step_expr;
7850 else
7851 {
7852 /* iv_loop is the loop to be vectorized. Generate:
7853 vec_step = [VF*S, VF*S, VF*S, VF*S] */
f3e1d2c3 7854 gimple_seq seq = NULL;
03f1a648 7855 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7856 {
7857 expr = build_int_cst (integer_type_node, vf);
f3e1d2c3 7858 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
03f1a648 7859 }
7860 else
7861 expr = build_int_cst (TREE_TYPE (step_expr), vf);
f3e1d2c3 7862 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
7863 expr, step_expr);
7864 if (seq)
7865 {
7866 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
7867 gcc_assert (!new_bb);
7868 }
03f1a648 7869 }
7870
7871 t = unshare_expr (new_name);
7872 gcc_assert (CONSTANT_CLASS_P (new_name)
7873 || TREE_CODE (new_name) == SSA_NAME);
7874 new_vec = build_vector_from_val (vectype, t);
a73182ff 7875 vec_step = vect_init_vector (stmt_info, new_vec, vectype, NULL);
03f1a648 7876
7877
7878 /* Create the following def-use cycle:
7879 loop prolog:
7880 vec_init = ...
7881 vec_step = ...
7882 loop:
7883 vec_iv = PHI <vec_init, vec_loop>
7884 ...
7885 STMT
7886 ...
7887 vec_loop = vec_iv + vec_step; */
7888
7889 /* Create the induction-phi that defines the induction-operand. */
7890 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7891 induction_phi = create_phi_node (vec_dest, iv_loop->header);
04b2391d 7892 stmt_vec_info induction_phi_info = loop_vinfo->add_stmt (induction_phi);
03f1a648 7893 induc_def = PHI_RESULT (induction_phi);
7894
7895 /* Create the iv update inside the loop */
7896 vec_def = make_ssa_name (vec_dest);
7897 new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
7898 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
04b2391d 7899 stmt_vec_info new_stmt_info = loop_vinfo->add_stmt (new_stmt);
03f1a648 7900
7901 /* Set the arguments of the phi node: */
7902 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7903 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7904 UNKNOWN_LOCATION);
7905
435515db 7906 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = induction_phi_info;
03f1a648 7907
7908 /* In case that vectorization factor (VF) is bigger than the number
7909 of elements that we can fit in a vectype (nunits), we have to generate
7910 more than one vector stmt - i.e - we need to "unroll" the
7911 vector stmt by a factor VF/nunits. For more details see documentation
7912 in vectorizable_operation. */
7913
7914 if (ncopies > 1)
7915 {
f3e1d2c3 7916 gimple_seq seq = NULL;
03f1a648 7917 stmt_vec_info prev_stmt_vinfo;
7918 /* FORNOW. This restriction should be relaxed. */
7919 gcc_assert (!nested_in_vect_loop);
7920
7921 /* Create the vector that holds the step of the induction. */
7922 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7923 {
7924 expr = build_int_cst (integer_type_node, nunits);
f3e1d2c3 7925 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
03f1a648 7926 }
7927 else
7928 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
f3e1d2c3 7929 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
7930 expr, step_expr);
7931 if (seq)
7932 {
7933 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
7934 gcc_assert (!new_bb);
7935 }
7936
03f1a648 7937 t = unshare_expr (new_name);
7938 gcc_assert (CONSTANT_CLASS_P (new_name)
7939 || TREE_CODE (new_name) == SSA_NAME);
7940 new_vec = build_vector_from_val (vectype, t);
a73182ff 7941 vec_step = vect_init_vector (stmt_info, new_vec, vectype, NULL);
03f1a648 7942
7943 vec_def = induc_def;
04b2391d 7944 prev_stmt_vinfo = induction_phi_info;
03f1a648 7945 for (i = 1; i < ncopies; i++)
7946 {
7947 /* vec_i = vec_prev + vec_step */
7948 new_stmt = gimple_build_assign (vec_dest, PLUS_EXPR,
7949 vec_def, vec_step);
7950 vec_def = make_ssa_name (vec_dest, new_stmt);
7951 gimple_assign_set_lhs (new_stmt, vec_def);
7952
7953 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
04b2391d 7954 new_stmt_info = loop_vinfo->add_stmt (new_stmt);
aebdbd31 7955 STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt_info;
04b2391d 7956 prev_stmt_vinfo = new_stmt_info;
03f1a648 7957 }
7958 }
7959
7960 if (nested_in_vect_loop)
7961 {
7962 /* Find the loop-closed exit-phi of the induction, and record
7963 the final vector of induction results: */
7964 exit_phi = NULL;
7965 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7966 {
7967 gimple *use_stmt = USE_STMT (use_p);
7968 if (is_gimple_debug (use_stmt))
7969 continue;
7970
7971 if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (use_stmt)))
7972 {
7973 exit_phi = use_stmt;
7974 break;
7975 }
7976 }
7977 if (exit_phi)
7978 {
03c0d666 7979 stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (exit_phi);
03f1a648 7980 /* FORNOW. Currently not supporting the case that an inner-loop induction
7981 is not used in the outer-loop (i.e. only outside the outer-loop). */
7982 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
7983 && !STMT_VINFO_LIVE_P (stmt_vinfo));
7984
435515db 7985 STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt_info;
03f1a648 7986 if (dump_enabled_p ())
a4e972e3 7987 dump_printf_loc (MSG_NOTE, vect_location,
7988 "vector of inductions after inner-loop:%G",
7989 new_stmt);
03f1a648 7990 }
7991 }
7992
7993
7994 if (dump_enabled_p ())
a4e972e3 7995 dump_printf_loc (MSG_NOTE, vect_location,
7996 "transform induction: created def-use cycle: %G%G",
7997 induction_phi, SSA_NAME_DEF_STMT (vec_def));
03f1a648 7998
fb85abff 7999 return true;
8000}
8001
8002/* Function vectorizable_live_operation.
8003
ecc42a77 8004 STMT_INFO computes a value that is used outside the loop. Check if
fb85abff 8005 it can be supported. */
8006
8007bool
ecc42a77 8008vectorizable_live_operation (stmt_vec_info stmt_info,
fb85abff 8009 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
75aae5b4 8010 slp_tree slp_node, int slp_index,
435515db 8011 stmt_vec_info *vec_stmt,
c863e35b 8012 stmt_vector_for_cost *)
fb85abff 8013{
fb85abff 8014 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2e966e2a 8015 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
75aae5b4 8016 imm_use_iterator imm_iter;
8017 tree lhs, lhs_type, bitsize, vec_bitsize;
8018 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
fc9fb8de 8019 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
4eb17cb6 8020 int ncopies;
75aae5b4 8021 gimple *use_stmt;
8022 auto_vec<tree> vec_oprnds;
fc9fb8de 8023 int vec_entry = 0;
8024 poly_uint64 vec_index = 0;
fb85abff 8025
8026 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
4eb17cb6 8027
fb85abff 8028 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
8029 return false;
8030
75aae5b4 8031 /* FORNOW. CHECKME. */
a73182ff 8032 if (nested_in_vect_loop_p (loop, stmt_info))
75aae5b4 8033 return false;
8034
cf573a72 8035 /* If STMT is not relevant and it is a simple assignment and its inputs are
8036 invariant then it can remain in place, unvectorized. The original last
8037 scalar value that it computes will be used. */
8038 if (!STMT_VINFO_RELEVANT_P (stmt_info))
3d483a94 8039 {
a73182ff 8040 gcc_assert (is_simple_and_all_uses_invariant (stmt_info, loop_vinfo));
75aae5b4 8041 if (dump_enabled_p ())
8042 dump_printf_loc (MSG_NOTE, vect_location,
8043 "statement is simple and uses invariant. Leaving in "
8044 "place.\n");
8045 return true;
8046 }
3d483a94 8047
7aaadbe8 8048 if (slp_node)
8049 ncopies = 1;
8050 else
8051 ncopies = vect_get_num_copies (loop_vinfo, vectype);
8052
fc9fb8de 8053 if (slp_node)
8054 {
8055 gcc_assert (slp_index >= 0);
8056
8057 int num_scalar = SLP_TREE_SCALAR_STMTS (slp_node).length ();
8058 int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8059
8060 /* Get the last occurrence of the scalar index from the concatenation of
8061 all the slp vectors. Calculate which slp vector it is and the index
8062 within. */
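      /* A worked example with made-up numbers: for num_scalar == 2,
	 num_vec == 2, nunits == 4 and slp_index == 1, the concatenation has
	 8 lanes and pos == 8 - 2 + 1 == 7, which the division below splits
	 into vec_entry == 1 and vec_index == 3.  */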
8063 poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
8064
8065 /* Calculate which vector contains the result, and which lane of
8066 that vector we need. */
8067 if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
8068 {
8069 if (dump_enabled_p ())
8070 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8071 "Cannot determine which vector holds the"
8072 " final result.\n");
8073 return false;
8074 }
8075 }
8076
75aae5b4 8077 if (!vec_stmt)
60b29a7e 8078 {
384eaff1 8079 /* No transformation required. */
60b29a7e 8080 if (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
8081 {
384eaff1 8082 if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
8083 OPTIMIZE_FOR_SPEED))
8084 {
8085 if (dump_enabled_p ())
8086 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8087 "can't use a fully-masked loop because "
8088 "the target doesn't support extract last "
8089 "reduction.\n");
8090 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
8091 }
8092 else if (slp_node)
8093 {
8094 if (dump_enabled_p ())
8095 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8096 "can't use a fully-masked loop because an "
8097 "SLP statement is live after the loop.\n");
8098 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
8099 }
8100 else if (ncopies > 1)
8101 {
8102 if (dump_enabled_p ())
8103 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8104 "can't use a fully-masked loop because"
8105 " ncopies is greater than 1.\n");
8106 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
8107 }
8108 else
8109 {
8110 gcc_assert (ncopies == 1 && !slp_node);
8111 vect_record_loop_mask (loop_vinfo,
8112 &LOOP_VINFO_MASKS (loop_vinfo),
8113 1, vectype);
8114 }
60b29a7e 8115 }
60b29a7e 8116 return true;
8117 }
3d483a94 8118
4a59791f 8119 /* Use the lhs of the original scalar statement. */
8120 gimple *stmt = vect_orig_stmt (stmt_info)->stmt;
fb85abff 8121
75aae5b4 8122 lhs = (is_a <gphi *> (stmt)) ? gimple_phi_result (stmt)
8123 : gimple_get_lhs (stmt);
8124 lhs_type = TREE_TYPE (lhs);
fb85abff 8125
aa8a4b0b 8126 bitsize = (VECTOR_BOOLEAN_TYPE_P (vectype)
8127 ? bitsize_int (TYPE_PRECISION (TREE_TYPE (vectype)))
8128 : TYPE_SIZE (TREE_TYPE (vectype)));
75aae5b4 8129 vec_bitsize = TYPE_SIZE (vectype);
fb85abff 8130
75aae5b4 8131 /* Get the vectorized lhs of STMT and the lane to use (counted in bits). */
8132 tree vec_lhs, bitstart;
8133 if (slp_node)
fb85abff 8134 {
384eaff1 8135 gcc_assert (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
8136
75aae5b4 8137 /* Get the correct slp vectorized stmt. */
dc1fb456 8138 gimple *vec_stmt = SLP_TREE_VEC_STMTS (slp_node)[vec_entry]->stmt;
7feaafa2 8139 if (gphi *phi = dyn_cast <gphi *> (vec_stmt))
8140 vec_lhs = gimple_phi_result (phi);
8141 else
8142 vec_lhs = gimple_get_lhs (vec_stmt);
75aae5b4 8143
8144 /* Get entry to use. */
f9674f3d 8145 bitstart = bitsize_int (vec_index);
75aae5b4 8146 bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
75aae5b4 8147 }
8148 else
8149 {
8150 enum vect_def_type dt = STMT_VINFO_DEF_TYPE (stmt_info);
bfa5bad6 8151 vec_lhs = vect_get_vec_def_for_operand_1 (stmt_info, dt);
384eaff1 8152 gcc_checking_assert (ncopies == 1
8153 || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
75aae5b4 8154
8155 /* For multiple copies, get the last copy. */
8156 for (int i = 1; i < ncopies; ++i)
c0dd122a 8157 vec_lhs = vect_get_vec_def_for_stmt_copy (loop_vinfo, vec_lhs);
75aae5b4 8158
8159 /* Get the last lane in the vector. */
8160 bitstart = int_const_binop (MINUS_EXPR, vec_bitsize, bitsize);
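      /* E.g. (illustrative): for a 128-bit vector of 32-bit elements this
	 yields a bit offset of 128 - 32 == 96, i.e. the last element.  */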
fb85abff 8161 }
8162
bb038f3e 8163 gimple_seq stmts = NULL;
384eaff1 8164 tree new_tree;
8165 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
8166 {
8167 /* Emit:
8168
8169 SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
8170
8171 where VEC_LHS is the vectorized live-out result and MASK is
8172 the loop mask for the final iteration. */
8173 gcc_assert (ncopies == 1 && !slp_node);
8174 tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
384eaff1 8175 tree mask = vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo),
8176 1, vectype, 0);
d5a19a73 8177 tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST,
8178 scalar_type, mask, vec_lhs);
384eaff1 8179
8180 /* Convert the extracted vector element to the required scalar type. */
8181 new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
8182 }
8183 else
8184 {
8185 tree bftype = TREE_TYPE (vectype);
8186 if (VECTOR_BOOLEAN_TYPE_P (vectype))
8187 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
8188 new_tree = build3 (BIT_FIELD_REF, bftype, vec_lhs, bitsize, bitstart);
8189 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
8190 &stmts, true, NULL_TREE);
8191 }
8192
bb038f3e 8193 if (stmts)
8194 gsi_insert_seq_on_edge_immediate (single_exit (loop), stmts);
75aae5b4 8195
87b138f0 8196 /* Replace uses of lhs with the newly computed result. If the use stmt
 8197 is a single-arg PHI, just replace all uses of the PHI result; this is
 8198 necessary because the lcssa PHI defining lhs may precede the new stmt. */
8199 use_operand_p use_p;
8200 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
8201 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
8202 && !is_gimple_debug (use_stmt))
29b68e50 8203 {
87b138f0 8204 if (gimple_code (use_stmt) == GIMPLE_PHI
8205 && gimple_phi_num_args (use_stmt) == 1)
8206 {
8207 replace_uses_by (gimple_phi_result (use_stmt), new_tree);
8208 }
8209 else
8210 {
8211 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
8212 SET_USE (use_p, new_tree);
8213 }
29b68e50 8214 update_stmt (use_stmt);
8215 }
75aae5b4 8216
fb85abff 8217 return true;
8218}
8219
ecc42a77 8220/* Kill any debug uses outside LOOP of SSA names defined in STMT_INFO. */
4c48884e 8221
8222static void
2e966e2a 8223vect_loop_kill_debug_uses (class loop *loop, stmt_vec_info stmt_info)
4c48884e 8224{
8225 ssa_op_iter op_iter;
8226 imm_use_iterator imm_iter;
8227 def_operand_p def_p;
42acab1c 8228 gimple *ustmt;
4c48884e 8229
ecc42a77 8230 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt_info->stmt, op_iter, SSA_OP_DEF)
4c48884e 8231 {
8232 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
8233 {
8234 basic_block bb;
8235
8236 if (!is_gimple_debug (ustmt))
8237 continue;
8238
8239 bb = gimple_bb (ustmt);
8240
8241 if (!flow_bb_inside_loop_p (loop, bb))
8242 {
8243 if (gimple_debug_bind_p (ustmt))
8244 {
6d8fb6cf 8245 if (dump_enabled_p ())
7bd765d4 8246 dump_printf_loc (MSG_NOTE, vect_location,
78bb46f5 8247 "killing debug use\n");
4c48884e 8248
8249 gimple_debug_bind_reset_value (ustmt);
8250 update_stmt (ustmt);
8251 }
8252 else
8253 gcc_unreachable ();
8254 }
8255 }
8256 }
8257}
8258
637a7045 8259/* Given loop represented by LOOP_VINFO, return true if computation of
8260 LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
8261 otherwise. */
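/* For example (illustrative case): if NITERSM1 is the all-ones value of a
   32-bit unsigned type, NITERS wraps to zero, so the constant check below
   fails, and unless the loop's maximum iteration count is known to be
   smaller than the type's maximum the function returns false.  */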
8262
8263static bool
8264loop_niters_no_overflow (loop_vec_info loop_vinfo)
8265{
8266 /* Constant case. */
8267 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
8268 {
8269 tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
8270 tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
8271
8272 gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
8273 gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
8274 if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
8275 return true;
8276 }
8277
8278 widest_int max;
2e966e2a 8279 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
637a7045 8280 /* Check the upper bound of loop niters. */
8281 if (get_max_loop_iterations (loop, &max))
8282 {
8283 tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
8284 signop sgn = TYPE_SIGN (type);
8285 widest_int type_max = widest_int::from (wi::max_value (type), sgn);
8286 if (max < type_max)
8287 return true;
8288 }
8289 return false;
8290}
8291
60b29a7e 8292/* Return a mask type with half the number of elements as TYPE. */
8293
8294tree
8295vect_halve_mask_nunits (tree type)
8296{
8297 poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (type), 2);
8298 return build_truth_vector_type (nunits, current_vector_size);
8299}
8300
8301/* Return a mask type with twice as many elements as TYPE. */
8302
8303tree
8304vect_double_mask_nunits (tree type)
8305{
8306 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (type) * 2;
8307 return build_truth_vector_type (nunits, current_vector_size);
8308}
8309
8310/* Record that a fully-masked version of LOOP_VINFO would need MASKS to
8311 contain a sequence of NVECTORS masks that each control a vector of type
8312 VECTYPE. */
8313
8314void
8315vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
8316 unsigned int nvectors, tree vectype)
8317{
8318 gcc_assert (nvectors != 0);
8319 if (masks->length () < nvectors)
8320 masks->safe_grow_cleared (nvectors);
8321 rgroup_masks *rgm = &(*masks)[nvectors - 1];
8322 /* The number of scalars per iteration and the number of vectors are
8323 both compile-time constants. */
8324 unsigned int nscalars_per_iter
8325 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
8326 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
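  /* For example (illustrative numbers only): an rgroup of two vectors with
     4 elements each and a vectorization factor of 4 gives
     2 * 4 / 4 == 2 scalars per scalar iteration.  */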
8327 if (rgm->max_nscalars_per_iter < nscalars_per_iter)
8328 {
8329 rgm->max_nscalars_per_iter = nscalars_per_iter;
8330 rgm->mask_type = build_same_sized_truth_vector_type (vectype);
8331 }
8332}
8333
8334/* Given a complete set of masks MASKS, extract mask number INDEX
8335 for an rgroup that operates on NVECTORS vectors of type VECTYPE,
8336 where 0 <= INDEX < NVECTORS. Insert any set-up statements before GSI.
8337
8338 See the comment above vec_loop_masks for more details about the mask
8339 arrangement. */
8340
8341tree
8342vect_get_loop_mask (gimple_stmt_iterator *gsi, vec_loop_masks *masks,
8343 unsigned int nvectors, tree vectype, unsigned int index)
8344{
8345 rgroup_masks *rgm = &(*masks)[nvectors - 1];
8346 tree mask_type = rgm->mask_type;
8347
8348 /* Populate the rgroup's mask array, if this is the first time we've
8349 used it. */
8350 if (rgm->masks.is_empty ())
8351 {
8352 rgm->masks.safe_grow_cleared (nvectors);
8353 for (unsigned int i = 0; i < nvectors; ++i)
8354 {
8355 tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
8356 /* Provide a dummy definition until the real one is available. */
8357 SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
8358 rgm->masks[i] = mask;
8359 }
8360 }
8361
8362 tree mask = rgm->masks[index];
8363 if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
8364 TYPE_VECTOR_SUBPARTS (vectype)))
8365 {
8366 /* A loop mask for data type X can be reused for data type Y
8367 if X has N times more elements than Y and if Y's elements
8368 are N times bigger than X's. In this case each sequence
8369 of N elements in the loop mask will be all-zero or all-one.
8370 We can then view-convert the mask so that each sequence of
8371 N elements is replaced by a single element. */
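      /* For instance (illustrative only): a mask computed for 8 16-bit
	 elements can serve a vector of 4 32-bit elements; each pair of mask
	 elements is uniformly zero or one, so the VIEW_CONVERT_EXPR below
	 collapses every pair into a single wider mask element.  */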
8372 gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
8373 TYPE_VECTOR_SUBPARTS (vectype)));
8374 gimple_seq seq = NULL;
8375 mask_type = build_same_sized_truth_vector_type (vectype);
8376 mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
8377 if (seq)
8378 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
8379 }
8380 return mask;
8381}
8382
12420a15 8383/* Scale profiling counters by estimation for LOOP which is vectorized
8384 by factor VF. */
8385
8386static void
2e966e2a 8387scale_profile_for_vect_loop (class loop *loop, unsigned vf)
12420a15 8388{
8389 edge preheader = loop_preheader_edge (loop);
8390 /* Reduce loop iterations by the vectorization factor. */
8391 gcov_type new_est_niter = niter_for_unrolled_loop (loop, vf);
ea5d3981 8392 profile_count freq_h = loop->header->count, freq_e = preheader->count ();
12420a15 8393
205ce1aa 8394 if (freq_h.nonzero_p ())
12420a15 8395 {
ca69b069 8396 profile_probability p;
12420a15 8397
8398 /* Avoid dropping loop body profile counter to 0 because of zero count
8399 in loop's preheader. */
205ce1aa 8400 if (!(freq_e == profile_count::zero ()))
8401 freq_e = freq_e.force_nonzero ();
ca69b069 8402 p = freq_e.apply_scale (new_est_niter + 1, 1).probability_in (freq_h);
8403 scale_loop_frequencies (loop, p);
12420a15 8404 }
8405
12420a15 8406 edge exit_e = single_exit (loop);
720cfc43 8407 exit_e->probability = profile_probability::always ()
8408 .apply_scale (1, new_est_niter + 1);
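  /* Illustrative example (hypothetical estimate): if new_est_niter == 3,
     the exit edge gets probability 1 / (3 + 1) == 1/4 and, below, the latch
     edge receives the complementary 3/4.  */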
12420a15 8409
8410 edge exit_l = single_pred_edge (loop->latch);
7ec47501 8411 profile_probability prob = exit_l->probability;
720cfc43 8412 exit_l->probability = exit_e->probability.invert ();
7ec47501 8413 if (prob.initialized_p () && exit_l->probability.initialized_p ())
8414 scale_bbs_frequencies (&loop->latch, 1, exit_l->probability / prob);
12420a15 8415}
8416
e068828a 8417/* Vectorize STMT_INFO if relevant, inserting any new instructions before GSI.
0384ddb0 8418 When vectorizing STMT_INFO as a store, set *SEEN_STORE to its
8419 stmt_vec_info. */
daec18f5 8420
8421static void
e068828a 8422vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
0384ddb0 8423 gimple_stmt_iterator *gsi, stmt_vec_info *seen_store)
daec18f5 8424{
2e966e2a 8425 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
daec18f5 8426 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
daec18f5 8427
8428 if (dump_enabled_p ())
a4e972e3 8429 dump_printf_loc (MSG_NOTE, vect_location,
8430 "------>vectorizing statement: %G", stmt_info->stmt);
daec18f5 8431
8432 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
a73182ff 8433 vect_loop_kill_debug_uses (loop, stmt_info);
daec18f5 8434
8435 if (!STMT_VINFO_RELEVANT_P (stmt_info)
8436 && !STMT_VINFO_LIVE_P (stmt_info))
8437 return;
8438
8439 if (STMT_VINFO_VECTYPE (stmt_info))
8440 {
8441 poly_uint64 nunits
8442 = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
8443 if (!STMT_SLP_TYPE (stmt_info)
8444 && maybe_ne (nunits, vf)
8445 && dump_enabled_p ())
8446 /* For SLP VF is set according to unrolling factor, and not
8447 to vector size, hence for SLP this print is not valid. */
8448 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8449 }
8450
0384ddb0 8451 /* Pure SLP statements have already been vectorized. We still need
8452 to apply loop vectorization to hybrid SLP statements. */
8453 if (PURE_SLP_STMT (stmt_info))
8454 return;
daec18f5 8455
8456 if (dump_enabled_p ())
8457 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
8458
9632f098 8459 if (vect_transform_stmt (stmt_info, gsi, NULL, NULL))
daec18f5 8460 *seen_store = stmt_info;
8461}
8462
fb85abff 8463/* Function vect_transform_loop.
8464
8465 The analysis phase has determined that the loop is vectorizable.
8466 Vectorize the loop - created vectorized stmts to replace the scalar
5b631e09 8467 stmts in the loop, and update the loop exit condition.
8468 Returns scalar epilogue loop if any. */
fb85abff 8469
2e966e2a 8470class loop *
fb85abff 8471vect_transform_loop (loop_vec_info loop_vinfo)
8472{
2e966e2a 8473 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8474 class loop *epilogue = NULL;
fb85abff 8475 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
8476 int nbbs = loop->num_nodes;
fb85abff 8477 int i;
cde959e7 8478 tree niters_vector = NULL_TREE;
8479 tree step_vector = NULL_TREE;
8480 tree niters_vector_mult_vf = NULL_TREE;
d75596cd 8481 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8482 unsigned int lowest_vf = constant_lower_bound (vf);
daec18f5 8483 gimple *stmt;
13b31e0b 8484 bool check_profitability = false;
d75596cd 8485 unsigned int th;
fb85abff 8486
88f6eb8f 8487 DUMP_VECT_SCOPE ("vec_transform_loop");
fb85abff 8488
a99aba41 8489 loop_vinfo->shared->check_datarefs ();
8490
e7430948 8491 /* Use the more conservative vectorization threshold. If the number
 8492 of iterations is constant, assume the cost check has been performed
 8493 by our caller. If the threshold makes all loops profitable that
d75596cd 8494 run at least the (estimated) vectorization factor number of times,
 8495 checking is pointless, too. */
004a94a5 8496 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
d75596cd 8497 if (th >= vect_vf_for_cost (loop_vinfo)
e7430948 8498 && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
8499 {
6d8fb6cf 8500 if (dump_enabled_p ())
7bd765d4 8501 dump_printf_loc (MSG_NOTE, vect_location,
78bb46f5 8502 "Profitability threshold is %d loop iterations.\n",
8503 th);
e7430948 8504 check_profitability = true;
8505 }
8506
19961a78 8507 /* Make sure there exists a single-predecessor exit bb. Do this before
8508 versioning. */
8509 edge e = single_exit (loop);
8510 if (! single_pred_p (e->dest))
8511 {
6bae816f 8512 split_loop_exit_edge (e, true);
19961a78 8513 if (dump_enabled_p ())
8514 dump_printf (MSG_NOTE, "split exit edge\n");
8515 }
8516
2cd0995e 8517 /* Version the loop first, if required, so the profitability check
8518 comes first. */
23a3430d 8519
d5e80d93 8520 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
e7430948 8521 {
7456a7ea 8522 poly_uint64 versioning_threshold
8523 = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
8524 if (check_profitability
8525 && ordered_p (poly_uint64 (th), versioning_threshold))
8526 {
8527 versioning_threshold = ordered_max (poly_uint64 (th),
8528 versioning_threshold);
8529 check_profitability = false;
8530 }
2e966e2a 8531 class loop *sloop
44245620 8532 = vect_loop_versioning (loop_vinfo, th, check_profitability,
8533 versioning_threshold);
8534 sloop->force_vectorize = false;
e7430948 8535 check_profitability = false;
8536 }
23a3430d 8537
19961a78 8538 /* Make sure there exists a single-predecessor exit bb also on the
8539 scalar loop copy. Do this after versioning but before peeling
8540 so CFG structure is fine for both scalar and if-converted loop
8541 to make slpeel_duplicate_current_defs_from_edges face matched
8542 loop closed PHI nodes on the exit. */
8543 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
8544 {
8545 e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo));
8546 if (! single_pred_p (e->dest))
8547 {
31665a24 8548 split_loop_exit_edge (e, true);
19961a78 8549 if (dump_enabled_p ())
8550 dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
8551 }
8552 }
8553
6c6a3430 8554 tree niters = vect_build_loop_niters (loop_vinfo);
8555 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
8556 tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
637a7045 8557 bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
cde959e7 8558 epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
8559 &step_vector, &niters_vector_mult_vf, th,
5b631e09 8560 check_profitability, niters_no_overflow);
e3b3a12f 8561 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)
8562 && LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo).initialized_p ())
8563 scale_loop_frequencies (LOOP_VINFO_SCALAR_LOOP (loop_vinfo),
8564 LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
60b29a7e 8565
6c6a3430 8566 if (niters_vector == NULL_TREE)
e7430948 8567 {
60b29a7e 8568 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
8569 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8570 && known_eq (lowest_vf, vf))
cde959e7 8571 {
8572 niters_vector
8573 = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
d75596cd 8574 LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
cde959e7 8575 step_vector = build_one_cst (TREE_TYPE (niters));
8576 }
6c6a3430 8577 else
8578 vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
cde959e7 8579 &step_vector, niters_no_overflow);
c8a2b4ff 8580 }
fb85abff 8581
8582 /* 1) Make sure the loop header has exactly two entries
8583 2) Make sure we have a preheader basic block. */
8584
8585 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
8586
8587 split_edge (loop_preheader_edge (loop));
8588
6753a4bf 8589 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8590 && vect_use_loop_mask_for_alignment_p (loop_vinfo))
8591 /* This will deal with any possible peeling. */
8592 vect_prepare_for_masked_peels (loop_vinfo);
8593
0384ddb0 8594 /* Schedule the SLP instances first, then handle loop vectorization
8595 below. */
8596 if (!loop_vinfo->slp_instances.is_empty ())
8597 {
8598 DUMP_VECT_SCOPE ("scheduling SLP instances");
8599 vect_schedule_slp (loop_vinfo);
8600 }
8601
fb85abff 8602 /* FORNOW: the vectorizer supports only loops whose body consists
48e1416a 8603 of one basic block (header + empty latch). When the vectorizer
 8604 supports more involved loop forms, the order in which the BBs are
fb85abff 8605 traversed needs to be reconsidered. */
8606
8607 for (i = 0; i < nbbs; i++)
8608 {
8609 basic_block bb = bbs[i];
8610 stmt_vec_info stmt_info;
fb85abff 8611
1a91d914 8612 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
8613 gsi_next (&si))
fb85abff 8614 {
1a91d914 8615 gphi *phi = si.phi ();
6d8fb6cf 8616 if (dump_enabled_p ())
a4e972e3 8617 dump_printf_loc (MSG_NOTE, vect_location,
8618 "------>vectorizing phi: %G", phi);
03c0d666 8619 stmt_info = loop_vinfo->lookup_stmt (phi);
fb85abff 8620 if (!stmt_info)
8621 continue;
8622
c64f38bf 8623 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
a73182ff 8624 vect_loop_kill_debug_uses (loop, stmt_info);
12e7ff4f 8625
fb85abff 8626 if (!STMT_VINFO_RELEVANT_P (stmt_info)
8627 && !STMT_VINFO_LIVE_P (stmt_info))
12e7ff4f 8628 continue;
fb85abff 8629
bb4b5e0f 8630 if (STMT_VINFO_VECTYPE (stmt_info)
d75596cd 8631 && (maybe_ne
8632 (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
6d8fb6cf 8633 && dump_enabled_p ())
78bb46f5 8634 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
fb85abff 8635
44b24fa0 8636 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
8637 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
8638 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
5cc7beaa 8639 && ! PURE_SLP_STMT (stmt_info))
fb85abff 8640 {
6d8fb6cf 8641 if (dump_enabled_p ())
78bb46f5 8642 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
9632f098 8643 vect_transform_stmt (stmt_info, NULL, NULL, NULL);
fb85abff 8644 }
8645 }
8646
1a91d914 8647 for (gimple_stmt_iterator si = gsi_start_bb (bb);
daec18f5 8648 !gsi_end_p (si);)
fb85abff 8649 {
daec18f5 8650 stmt = gsi_stmt (si);
8651 /* During vectorization remove existing clobber stmts. */
8652 if (gimple_clobber_p (stmt))
8911f4de 8653 {
daec18f5 8654 unlink_stmt_vdef (stmt);
8655 gsi_remove (&si, true);
8656 release_defs (stmt);
8911f4de 8657 }
daec18f5 8658 else
fb85abff 8659 {
03c0d666 8660 stmt_info = loop_vinfo->lookup_stmt (stmt);
12e7ff4f 8661
daec18f5 8662 /* vector stmts created in the outer-loop during vectorization of
8663 stmts in an inner-loop may not have a stmt_info, and do not
8664 need to be vectorized. */
8665 stmt_vec_info seen_store = NULL;
8666 if (stmt_info)
18937389 8667 {
daec18f5 8668 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
18937389 8669 {
daec18f5 8670 gimple *def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
8671 for (gimple_stmt_iterator subsi = gsi_start (def_seq);
8672 !gsi_end_p (subsi); gsi_next (&subsi))
e068828a 8673 {
8674 stmt_vec_info pat_stmt_info
8675 = loop_vinfo->lookup_stmt (gsi_stmt (subsi));
8676 vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
0384ddb0 8677 &si, &seen_store);
e068828a 8678 }
8679 stmt_vec_info pat_stmt_info
8680 = STMT_VINFO_RELATED_STMT (stmt_info);
8681 vect_transform_loop_stmt (loop_vinfo, pat_stmt_info, &si,
0384ddb0 8682 &seen_store);
18937389 8683 }
e068828a 8684 vect_transform_loop_stmt (loop_vinfo, stmt_info, &si,
0384ddb0 8685 &seen_store);
18937389 8686 }
f525c1af 8687 gsi_next (&si);
daec18f5 8688 if (seen_store)
fb85abff 8689 {
daec18f5 8690 if (STMT_VINFO_GROUPED_ACCESS (seen_store))
f525c1af 8691 /* Interleaving. The vectorization of the
 8692 interleaving chain was completed - free all
 8693 the stores in the chain. */
8694 vect_remove_stores (DR_GROUP_FIRST_ELEMENT (seen_store));
daec18f5 8695 else
f525c1af 8696 /* Free the attached stmt_vec_info and remove the stmt. */
8697 loop_vinfo->remove_stmt (stmt_info);
fb85abff 8698 }
512cbd67 8699 }
daec18f5 8700 }
b6a43ebc 8701
8702 /* Stub out scalar statements that must not survive vectorization.
8703 Doing this here helps with grouped statements, or statements that
8704 are involved in patterns. */
8705 for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
8706 !gsi_end_p (gsi); gsi_next (&gsi))
8707 {
8708 gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
8709 if (call && gimple_call_internal_p (call, IFN_MASK_LOAD))
8710 {
8711 tree lhs = gimple_get_lhs (call);
8712 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
8713 {
8714 tree zero = build_zero_cst (TREE_TYPE (lhs));
8715 gimple *new_stmt = gimple_build_assign (lhs, zero);
8716 gsi_replace (&gsi, new_stmt, true);
8717 }
8718 }
8719 }
fb85abff 8720 } /* BBs in loop */
8721
cde959e7 8722 /* The vectorization factor is always > 1, so if we use an IV increment of 1,
8723 a zero NITERS becomes a nonzero NITERS_VECTOR. */
8724 if (integer_onep (step_vector))
8725 niters_no_overflow = true;
60b29a7e 8726 vect_set_loop_condition (loop, loop_vinfo, niters_vector, step_vector,
8727 niters_vector_mult_vf, !niters_no_overflow);
fb85abff 8728
d75596cd 8729 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
8730 scale_profile_for_vect_loop (loop, assumed_vf);
12420a15 8731
60b29a7e 8732 /* True if the final iteration might not handle a full vector's
8733 worth of scalar iterations. */
8734 bool final_iter_may_be_partial = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
8c057503 8735 /* The minimum number of iterations performed by the epilogue. This
8736 is 1 when peeling for gaps because we always need a final scalar
8737 iteration. */
8738 int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
8739 /* +1 to convert latch counts to loop iteration counts,
8740 -min_epilogue_iters to remove iterations that cannot be performed
8741 by the vector code. */
6753a4bf 8742 int bias_for_lowest = 1 - min_epilogue_iters;
8743 int bias_for_assumed = bias_for_lowest;
8744 int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
8745 if (alignment_npeels && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
8746 {
8747 /* When the amount of peeling is known at compile time, the first
8748 iteration will have exactly alignment_npeels active elements.
8749 In the worst case it will have at least one. */
8750 int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
8751 bias_for_lowest += lowest_vf - min_first_active;
8752 bias_for_assumed += assumed_vf - min_first_active;
8753 }
8c057503 8754 /* In these calculations the "- 1" converts loop iteration counts
8755 back to latch counts. */
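  /* A worked example with made-up numbers: with lowest_vf == 4, no peeling
     for gaps and no full masking, bias_for_lowest is 1, so an upper bound of
     11 latch iterations (at most 12 loop iterations) becomes
     (11 + 1) / 4 - 1 == 2 latch iterations of the vector loop.  */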
8756 if (loop->any_upper_bound)
8757 loop->nb_iterations_upper_bound
60b29a7e 8758 = (final_iter_may_be_partial
6753a4bf 8759 ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
60b29a7e 8760 lowest_vf) - 1
6753a4bf 8761 : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
60b29a7e 8762 lowest_vf) - 1);
8c057503 8763 if (loop->any_likely_upper_bound)
8764 loop->nb_iterations_likely_upper_bound
60b29a7e 8765 = (final_iter_may_be_partial
6753a4bf 8766 ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
8767 + bias_for_lowest, lowest_vf) - 1
8768 : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
8769 + bias_for_lowest, lowest_vf) - 1);
d3f1934c 8770 if (loop->any_estimate)
8c057503 8771 loop->nb_iterations_estimate
60b29a7e 8772 = (final_iter_may_be_partial
6753a4bf 8773 ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
60b29a7e 8774 assumed_vf) - 1
6753a4bf 8775 : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
60b29a7e 8776 assumed_vf) - 1);
d3f1934c 8777
6d8fb6cf 8778 if (dump_enabled_p ())
b055bc88 8779 {
5b631e09 8780 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
8781 {
8782 dump_printf_loc (MSG_NOTE, vect_location,
8783 "LOOP VECTORIZED\n");
8784 if (loop->inner)
8785 dump_printf_loc (MSG_NOTE, vect_location,
8786 "OUTER LOOP VECTORIZED\n");
8787 dump_printf (MSG_NOTE, "\n");
8788 }
8789 else
3106770a 8790 {
8791 dump_printf_loc (MSG_NOTE, vect_location,
8792 "LOOP EPILOGUE VECTORIZED (VS=");
8793 dump_dec (MSG_NOTE, current_vector_size);
8794 dump_printf (MSG_NOTE, ")\n");
8795 }
b055bc88 8796 }
0d85be19 8797
f7289f56 8798 /* Loops vectorized with a variable factor won't benefit from
8799 unrolling/peeling. */
8800 if (!vf.is_constant ())
8801 {
8802 loop->unroll = 1;
8803 if (dump_enabled_p ())
8804 dump_printf_loc (MSG_NOTE, vect_location, "Disabling unrolling due to"
8805 " variable-length vectorization factor\n");
8806 }
0d85be19 8807 /* Free SLP instances here because otherwise stmt reference counting
8808 won't work. */
8809 slp_instance instance;
8810 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
2068679d 8811 vect_free_slp_instance (instance, true);
0d85be19 8812 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
641b1c86 8813 /* Clear the safelen field since its value is invalid after vectorization,
 8814 as the vectorized loop can have loop-carried dependencies. */
8815 loop->safelen = 0;
5b631e09 8816
8817 /* Don't vectorize epilogue for epilogue. */
8818 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
8819 epilogue = NULL;
8820
3106770a 8821 if (!PARAM_VALUE (PARAM_VECT_EPILOGUES_NOMASK))
8822 epilogue = NULL;
8823
5b631e09 8824 if (epilogue)
8825 {
3106770a 8826 auto_vector_sizes vector_sizes;
e7419472 8827 targetm.vectorize.autovectorize_vector_sizes (&vector_sizes, false);
3106770a 8828 unsigned int next_size = 0;
5b631e09 8829
65354bd9 8830 /* Note LOOP_VINFO_NITERS_KNOWN_P and LOOP_VINFO_INT_NITERS work
8831 on niters already ajusted for the iterations of the prologue. */
3106770a 8832 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3106770a 8833 && known_eq (vf, lowest_vf))
8834 {
65354bd9 8835 unsigned HOST_WIDE_INT eiters
3106770a 8836 = (LOOP_VINFO_INT_NITERS (loop_vinfo)
074dac96 8837 - LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo));
8838 eiters
8839 = eiters % lowest_vf + LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo);
3106770a 8840 epilogue->nb_iterations_upper_bound = eiters - 1;
074dac96 8841 epilogue->any_upper_bound = true;
3106770a 8842
8843 unsigned int ratio;
8844 while (next_size < vector_sizes.length ()
8845 && !(constant_multiple_p (current_vector_size,
8846 vector_sizes[next_size], &ratio)
8847 && eiters >= lowest_vf / ratio))
8848 next_size += 1;
8849 }
8850 else
8851 while (next_size < vector_sizes.length ()
8852 && maybe_lt (current_vector_size, vector_sizes[next_size]))
8853 next_size += 1;
5b631e09 8854
3106770a 8855 if (next_size == vector_sizes.length ())
8856 epilogue = NULL;
5b631e09 8857 }
8858
8859 if (epilogue)
8860 {
8861 epilogue->force_vectorize = loop->force_vectorize;
8862 epilogue->safelen = loop->safelen;
8863 epilogue->dont_vectorize = false;
8864
8865 /* We may need to if-convert epilogue to vectorize it. */
8866 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
8867 tree_if_conversion (epilogue);
8868 }
8869
8870 return epilogue;
fb85abff 8871}
cfd9ca84 8872
 8873/* The code below attempts a simple optimization - reverting
 8874 if-conversion for masked stores: if the mask of a store is zero, do
 8875 not perform it, and, where possible, skip the stored-value producers too.
8876 For example,
8877 for (i=0; i<n; i++)
8878 if (c[i])
8879 {
8880 p1[i] += 1;
8881 p2[i] = p3[i] +2;
8882 }
8883 this transformation will produce the following semi-hammock:
8884
8885 if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
8886 {
8887 vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
8888 vect__12.22_172 = vect__11.19_170 + vect_cst__171;
8889 MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
8890 vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
8891 vect__19.28_184 = vect__18.25_182 + vect_cst__183;
8892 MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
8893 }
8894*/
8895
8896void
2e966e2a 8897optimize_mask_stores (class loop *loop)
cfd9ca84 8898{
8899 basic_block *bbs = get_loop_body (loop);
8900 unsigned nbbs = loop->num_nodes;
8901 unsigned i;
8902 basic_block bb;
2e966e2a 8903 class loop *bb_loop;
cfd9ca84 8904 gimple_stmt_iterator gsi;
f64416ca 8905 gimple *stmt;
cfd9ca84 8906 auto_vec<gimple *> worklist;
72ea15e5 8907 auto_purge_vect_location sentinel;
cfd9ca84 8908
8909 vect_location = find_loop_location (loop);
8910 /* Pick up all masked stores in loop if any. */
8911 for (i = 0; i < nbbs; i++)
8912 {
8913 bb = bbs[i];
8914 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
8915 gsi_next (&gsi))
8916 {
8917 stmt = gsi_stmt (gsi);
7408cd7d 8918 if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
cfd9ca84 8919 worklist.safe_push (stmt);
8920 }
8921 }
8922
8923 free (bbs);
8924 if (worklist.is_empty ())
8925 return;
8926
8927 /* Loop has masked stores. */
8928 while (!worklist.is_empty ())
8929 {
8930 gimple *last, *last_store;
8931 edge e, efalse;
8932 tree mask;
8933 basic_block store_bb, join_bb;
8934 gimple_stmt_iterator gsi_to;
8935 tree vdef, new_vdef;
8936 gphi *phi;
8937 tree vectype;
8938 tree zero;
8939
8940 last = worklist.pop ();
8941 mask = gimple_call_arg (last, 2);
8942 bb = gimple_bb (last);
fa05ada9 8943 /* Create then_bb and the if-then structure in the CFG; then_bb belongs
 8944 to the same loop as if_bb. It can be different from LOOP when a
 8945 two-level loop nest is vectorized and mask_store belongs to the inner
 8946 one. */
cfd9ca84 8947 e = split_block (bb, last);
fa05ada9 8948 bb_loop = bb->loop_father;
8949 gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
cfd9ca84 8950 join_bb = e->dest;
8951 store_bb = create_empty_bb (bb);
fa05ada9 8952 add_bb_to_loop (store_bb, bb_loop);
cfd9ca84 8953 e->flags = EDGE_TRUE_VALUE;
8954 efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
8955 /* Put STORE_BB to likely part. */
720cfc43 8956 efalse->probability = profile_probability::unlikely ();
205ce1aa 8957 store_bb->count = efalse->count ();
67c30edd 8958 make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
cfd9ca84 8959 if (dom_info_available_p (CDI_DOMINATORS))
8960 set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
8961 if (dump_enabled_p ())
8962 dump_printf_loc (MSG_NOTE, vect_location,
8963 "Create new block %d to sink mask stores.",
8964 store_bb->index);
8965 /* Create vector comparison with boolean result. */
8966 vectype = TREE_TYPE (mask);
8967 zero = build_zero_cst (vectype);
8968 stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
8969 gsi = gsi_last_bb (bb);
8970 gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
8971 /* Create new PHI node for vdef of the last masked store:
8972 .MEM_2 = VDEF <.MEM_1>
8973 will be converted to
8974 .MEM.3 = VDEF <.MEM_1>
8975 and new PHI node will be created in join bb
8976 .MEM_2 = PHI <.MEM_1, .MEM_3>
8977 */
8978 vdef = gimple_vdef (last);
8979 new_vdef = make_ssa_name (gimple_vop (cfun), last);
8980 gimple_set_vdef (last, new_vdef);
8981 phi = create_phi_node (vdef, join_bb);
8982 add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
8983
8984 /* Put all masked stores with the same mask to STORE_BB if possible. */
8985 while (true)
8986 {
8987 gimple_stmt_iterator gsi_from;
f64416ca 8988 gimple *stmt1 = NULL;
8989
cfd9ca84 8990 /* Move masked store to STORE_BB. */
8991 last_store = last;
8992 gsi = gsi_for_stmt (last);
8993 gsi_from = gsi;
8994 /* Shift GSI to the previous stmt for further traversal. */
8995 gsi_prev (&gsi);
8996 gsi_to = gsi_start_bb (store_bb);
8997 gsi_move_before (&gsi_from, &gsi_to);
8998 /* Setup GSI_TO to the non-empty block start. */
8999 gsi_to = gsi_start_bb (store_bb);
9000 if (dump_enabled_p ())
a4e972e3 9001 dump_printf_loc (MSG_NOTE, vect_location,
9002 "Move stmt to created bb\n%G", last);
f64416ca 9003 /* Move all stored value producers if possible. */
9004 while (!gsi_end_p (gsi))
9005 {
9006 tree lhs;
9007 imm_use_iterator imm_iter;
9008 use_operand_p use_p;
9009 bool res;
cfd9ca84 9010
f64416ca 9011 /* Skip debug statements. */
9012 if (is_gimple_debug (gsi_stmt (gsi)))
1b889259 9013 {
9014 gsi_prev (&gsi);
9015 continue;
9016 }
f64416ca 9017 stmt1 = gsi_stmt (gsi);
9018 /* Do not consider statements writing to memory or having
 9019 a volatile operand. */
9020 if (gimple_vdef (stmt1)
9021 || gimple_has_volatile_ops (stmt1))
9022 break;
9023 gsi_from = gsi;
9024 gsi_prev (&gsi);
9025 lhs = gimple_get_lhs (stmt1);
9026 if (!lhs)
9027 break;
cfd9ca84 9028
f64416ca 9029 /* LHS of vectorized stmt must be SSA_NAME. */
9030 if (TREE_CODE (lhs) != SSA_NAME)
9031 break;
cfd9ca84 9032
f64416ca 9033 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
9034 {
9035 /* Remove dead scalar statement. */
9036 if (has_zero_uses (lhs))
9037 {
9038 gsi_remove (&gsi_from, true);
9039 continue;
9040 }
9041 }
9042
9043 /* Check that LHS does not have uses outside of STORE_BB. */
9044 res = true;
9045 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
9046 {
9047 gimple *use_stmt;
9048 use_stmt = USE_STMT (use_p);
9049 if (is_gimple_debug (use_stmt))
9050 continue;
9051 if (gimple_bb (use_stmt) != store_bb)
9052 {
9053 res = false;
9054 break;
9055 }
9056 }
9057 if (!res)
9058 break;
9059
9060 if (gimple_vuse (stmt1)
9061 && gimple_vuse (stmt1) != gimple_vuse (last_store))
9062 break;
9063
9064 /* Can move STMT1 to STORE_BB. */
9065 if (dump_enabled_p ())
a4e972e3 9066 dump_printf_loc (MSG_NOTE, vect_location,
9067 "Move stmt to created bb\n%G", stmt1);
f64416ca 9068 gsi_move_before (&gsi_from, &gsi_to);
9069 /* Shift GSI_TO for further insertion. */
9070 gsi_prev (&gsi_to);
9071 }
9072 /* Put other masked stores with the same mask to STORE_BB. */
9073 if (worklist.is_empty ()
9074 || gimple_call_arg (worklist.last (), 2) != mask
9075 || worklist.last () != stmt1)
9076 break;
9077 last = worklist.pop ();
cfd9ca84 9078 }
9079 add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
9080 }
9081}
ef871d99 9082
9083/* Decide whether it is possible to use a zero-based induction variable
9084 when vectorizing LOOP_VINFO with a fully-masked loop. If it is,
9085 return the value that the induction variable must be able to hold
9086 in order to ensure that the loop ends with an all-false mask.
9087 Return -1 otherwise. */
9088widest_int
9089vect_iv_limit_for_full_masking (loop_vec_info loop_vinfo)
9090{
9091 tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
2e966e2a 9092 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
ef871d99 9093 unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);
9094
9095 /* Calculate the value that the induction variable must be able
9096 to hit in order to ensure that we end the loop with an all-false mask.
9097 This involves adding the maximum number of inactive trailing scalar
9098 iterations. */
9099 widest_int iv_limit = -1;
9100 if (max_loop_iterations (loop, &iv_limit))
9101 {
9102 if (niters_skip)
9103 {
9104 /* Add the maximum number of skipped iterations to the
9105 maximum iteration count. */
9106 if (TREE_CODE (niters_skip) == INTEGER_CST)
9107 iv_limit += wi::to_widest (niters_skip);
9108 else
9109 iv_limit += max_vf - 1;
9110 }
9111 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
9112 /* Make a conservatively-correct assumption. */
9113 iv_limit += max_vf - 1;
9114
9115 /* IV_LIMIT is the maximum number of latch iterations, which is also
9116 the maximum in-range IV value. Round this value down to the previous
9117 vector alignment boundary and then add an extra full iteration. */
9118 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9119 iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
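      /* E.g. (hypothetical values): with a known VF alignment of 4 and
	 max_vf == 4, an iv_limit of 13 becomes (13 & -4) + 4 == 16.  */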
9120 }
9121 return iv_limit;
9122}
9123