gcc/tree-vect-loop.c
1 /* Loop Vectorization
2 Copyright (C) 2003-2020 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 Ira Rosen <irar@il.ibm.com>
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "cfghooks.h"
31 #include "tree-pass.h"
32 #include "ssa.h"
33 #include "optabs-tree.h"
34 #include "diagnostic-core.h"
35 #include "fold-const.h"
36 #include "stor-layout.h"
37 #include "cfganal.h"
38 #include "gimplify.h"
39 #include "gimple-iterator.h"
40 #include "gimplify-me.h"
41 #include "tree-ssa-loop-ivopts.h"
42 #include "tree-ssa-loop-manip.h"
43 #include "tree-ssa-loop-niter.h"
44 #include "tree-ssa-loop.h"
45 #include "cfgloop.h"
46 #include "tree-scalar-evolution.h"
47 #include "tree-vectorizer.h"
48 #include "gimple-fold.h"
49 #include "cgraph.h"
50 #include "tree-cfg.h"
51 #include "tree-if-conv.h"
52 #include "internal-fn.h"
53 #include "tree-vector-builder.h"
54 #include "vec-perm-indices.h"
55 #include "tree-eh.h"
56
57 /* Loop Vectorization Pass.
58
59 This pass tries to vectorize loops.
60
61 For example, the vectorizer transforms the following simple loop:
62
63 short a[N]; short b[N]; short c[N]; int i;
64
65 for (i=0; i<N; i++){
66 a[i] = b[i] + c[i];
67 }
68
69 as if it was manually vectorized by rewriting the source code into:
70
71 typedef int __attribute__((mode(V8HI))) v8hi;
72 short a[N]; short b[N]; short c[N]; int i;
73 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
74 v8hi va, vb, vc;
75
76 for (i=0; i<N/8; i++){
77 vb = pb[i];
78 vc = pc[i];
79 va = vb + vc;
80 pa[i] = va;
81 }
82
83 The main entry to this pass is vectorize_loops(), in which
84 the vectorizer applies a set of analyses on a given set of loops,
85 followed by the actual vectorization transformation for the loops that
86 had successfully passed the analysis phase.
87 Throughout this pass we make a distinction between two types of
88 data: scalars (which are represented by SSA_NAMES), and memory references
89 ("data-refs"). These two types of data require different handling both
90 during analysis and transformation. The types of data-refs that the
91 vectorizer currently supports are ARRAY_REFS whose base is an array DECL
92 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
93 accesses are required to have a simple (consecutive) access pattern.
94
95 Analysis phase:
96 ===============
97 The driver for the analysis phase is vect_analyze_loop().
98 It applies a set of analyses, some of which rely on the scalar evolution
99 analyzer (scev) developed by Sebastian Pop.
100
101 During the analysis phase the vectorizer records some information
102 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
103 loop, as well as general information about the loop as a whole, which is
104 recorded in a "loop_vec_info" struct attached to each loop.
105
106 Transformation phase:
107 =====================
108 The loop transformation phase scans all the stmts in the loop, and
109 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
110 the loop that needs to be vectorized. It inserts the vector code sequence
111 just before the scalar stmt S, and records a pointer to the vector code
112 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
113 attached to S). This pointer will be used for the vectorization of following
114 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
115 otherwise, we rely on dead code elimination for removing it.
116
117 For example, say stmt S1 was vectorized into stmt VS1:
118
119 VS1: vb = px[i];
120 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
121 S2: a = b;
122
123 To vectorize stmt S2, the vectorizer first finds the stmt that defines
124 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
125 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
126 resulting sequence would be:
127
128 VS1: vb = px[i];
129 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
130 VS2: va = vb;
131 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
132
133 Operands that are not SSA_NAMEs are data-refs that appear in
134 load/store operations (like 'x[i]' in S1), and are handled differently.
135
136 Target modeling:
137 =================
138 Currently the only target-specific information that is used is the
139 size of the vector (in bytes): "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
140 Targets that can support different vector sizes will, for now, need
141 to specify a single value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
142 flexibility will be added in the future.
143
144 Since we only vectorize operations whose vector form can be
145 expressed using existing tree codes, to verify that an operation is
146 supported, the vectorizer checks the relevant optab at the relevant
147 machine_mode (e.g., optab_handler (add_optab, V8HImode)). If
148 the value found is CODE_FOR_nothing, then there's no target support, and
149 we can't vectorize the stmt.
150
151 For additional information on this project see:
152 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
153 */
154
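/* For illustration only (not part of the pass): the "Target modeling"
   notes above boil down to a support query of roughly this shape,
   using the optab API named there:

     if (optab_handler (add_optab, V8HImode) == CODE_FOR_nothing)
       {
         /* No V8HI addition on this target, so a stmt needing it
            cannot be vectorized with this vector mode.  */
       }

   The operation and mode are just the example quoted above; the
   per-stmt checks elsewhere in the vectorizer follow this pattern.  */
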
155 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
156 static stmt_vec_info vect_is_simple_reduction (loop_vec_info, stmt_vec_info,
157 bool *, bool *);
158
159 /* Subroutine of vect_determine_vf_for_stmt that handles only one
160 statement. VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
161 may already be set for general statements (not just data refs). */
162
163 static opt_result
164 vect_determine_vf_for_stmt_1 (vec_info *vinfo, stmt_vec_info stmt_info,
165 bool vectype_maybe_set_p,
166 poly_uint64 *vf)
167 {
168 gimple *stmt = stmt_info->stmt;
169
170 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
171 && !STMT_VINFO_LIVE_P (stmt_info))
172 || gimple_clobber_p (stmt))
173 {
174 if (dump_enabled_p ())
175 dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
176 return opt_result::success ();
177 }
178
179 tree stmt_vectype, nunits_vectype;
180 opt_result res = vect_get_vector_types_for_stmt (vinfo, stmt_info,
181 &stmt_vectype,
182 &nunits_vectype);
183 if (!res)
184 return res;
185
186 if (stmt_vectype)
187 {
188 if (STMT_VINFO_VECTYPE (stmt_info))
189 /* The only case when a vectype had already been set is for stmts
190 that contain a data ref, or for "pattern-stmts" (stmts generated
191 by the vectorizer to represent/replace a certain idiom). */
192 gcc_assert ((STMT_VINFO_DATA_REF (stmt_info)
193 || vectype_maybe_set_p)
194 && STMT_VINFO_VECTYPE (stmt_info) == stmt_vectype);
195 else
196 STMT_VINFO_VECTYPE (stmt_info) = stmt_vectype;
197 }
198
199 if (nunits_vectype)
200 vect_update_max_nunits (vf, nunits_vectype);
201
202 return opt_result::success ();
203 }
204
205 /* Subroutine of vect_determine_vectorization_factor. Set the vector
206 types of STMT_INFO and all attached pattern statements and update
207 the vectorization factor VF accordingly. Return true on success
208 or false if something prevented vectorization. */
209
210 static opt_result
211 vect_determine_vf_for_stmt (vec_info *vinfo,
212 stmt_vec_info stmt_info, poly_uint64 *vf)
213 {
214 if (dump_enabled_p ())
215 dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: %G",
216 stmt_info->stmt);
217 opt_result res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, false, vf);
218 if (!res)
219 return res;
220
221 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
222 && STMT_VINFO_RELATED_STMT (stmt_info))
223 {
224 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
225 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
226
227 /* If a pattern statement has def stmts, analyze them too. */
228 for (gimple_stmt_iterator si = gsi_start (pattern_def_seq);
229 !gsi_end_p (si); gsi_next (&si))
230 {
231 stmt_vec_info def_stmt_info = vinfo->lookup_stmt (gsi_stmt (si));
232 if (dump_enabled_p ())
233 dump_printf_loc (MSG_NOTE, vect_location,
234 "==> examining pattern def stmt: %G",
235 def_stmt_info->stmt);
236 res = vect_determine_vf_for_stmt_1 (vinfo, def_stmt_info, true, vf);
237 if (!res)
238 return res;
239 }
240
241 if (dump_enabled_p ())
242 dump_printf_loc (MSG_NOTE, vect_location,
243 "==> examining pattern statement: %G",
244 stmt_info->stmt);
245 res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, true, vf);
246 if (!res)
247 return res;
248 }
249
250 return opt_result::success ();
251 }
252
253 /* Function vect_determine_vectorization_factor
254
255 Determine the vectorization factor (VF). VF is the number of data elements
256 that are operated upon in parallel in a single iteration of the vectorized
257 loop. For example, when vectorizing a loop that operates on 4-byte elements,
258 on a target with a 16-byte vector size (VS), the VF is set to 4, since 4
259 elements can fit in a single vector register.
260
261 We currently support vectorization of loops in which all types operated upon
262 are of the same size. Therefore this function currently sets VF according to
263 the size of the types operated upon, and fails if there are multiple sizes
264 in the loop.
265
266 VF is also the factor by which the loop iterations are strip-mined, e.g.:
267 original loop:
268 for (i=0; i<N; i++){
269 a[i] = b[i] + c[i];
270 }
271
272 vectorized loop:
273 for (i=0; i<N; i+=VF){
274 a[i:VF] = b[i:VF] + c[i:VF];
275 }
276 */
277
278 static opt_result
279 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
280 {
281 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
282 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
283 unsigned nbbs = loop->num_nodes;
284 poly_uint64 vectorization_factor = 1;
285 tree scalar_type = NULL_TREE;
286 gphi *phi;
287 tree vectype;
288 stmt_vec_info stmt_info;
289 unsigned i;
290
291 DUMP_VECT_SCOPE ("vect_determine_vectorization_factor");
292
293 for (i = 0; i < nbbs; i++)
294 {
295 basic_block bb = bbs[i];
296
297 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
298 gsi_next (&si))
299 {
300 phi = si.phi ();
301 stmt_info = loop_vinfo->lookup_stmt (phi);
302 if (dump_enabled_p ())
303 dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: %G",
304 phi);
305
306 gcc_assert (stmt_info);
307
308 if (STMT_VINFO_RELEVANT_P (stmt_info)
309 || STMT_VINFO_LIVE_P (stmt_info))
310 {
311 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
312 scalar_type = TREE_TYPE (PHI_RESULT (phi));
313
314 if (dump_enabled_p ())
315 dump_printf_loc (MSG_NOTE, vect_location,
316 "get vectype for scalar type: %T\n",
317 scalar_type);
318
319 vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
320 if (!vectype)
321 return opt_result::failure_at (phi,
322 "not vectorized: unsupported "
323 "data-type %T\n",
324 scalar_type);
325 STMT_VINFO_VECTYPE (stmt_info) = vectype;
326
327 if (dump_enabled_p ())
328 dump_printf_loc (MSG_NOTE, vect_location, "vectype: %T\n",
329 vectype);
330
331 if (dump_enabled_p ())
332 {
333 dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
334 dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
335 dump_printf (MSG_NOTE, "\n");
336 }
337
338 vect_update_max_nunits (&vectorization_factor, vectype);
339 }
340 }
341
342 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
343 gsi_next (&si))
344 {
345 stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
346 opt_result res
347 = vect_determine_vf_for_stmt (loop_vinfo,
348 stmt_info, &vectorization_factor);
349 if (!res)
350 return res;
351 }
352 }
353
354 /* TODO: Analyze cost. Decide if worthwhile to vectorize. */
355 if (dump_enabled_p ())
356 {
357 dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
358 dump_dec (MSG_NOTE, vectorization_factor);
359 dump_printf (MSG_NOTE, "\n");
360 }
361
362 if (known_le (vectorization_factor, 1U))
363 return opt_result::failure_at (vect_location,
364 "not vectorized: unsupported data-type\n");
365 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
366 return opt_result::success ();
367 }
368
369
370 /* Function vect_is_simple_iv_evolution.
371
372 FORNOW: A simple evolution of an induction variable in the loop is
373 considered a polynomial evolution. */
374
375 static bool
376 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
377 tree * step)
378 {
379 tree init_expr;
380 tree step_expr;
381 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
382 basic_block bb;
383
384 /* When there is no evolution in this loop, the evolution function
385 is not "simple". */
386 if (evolution_part == NULL_TREE)
387 return false;
388
389 /* When the evolution is a polynomial of degree >= 2
390 the evolution function is not "simple". */
391 if (tree_is_chrec (evolution_part))
392 return false;
393
394 step_expr = evolution_part;
395 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
396
397 if (dump_enabled_p ())
398 dump_printf_loc (MSG_NOTE, vect_location, "step: %T, init: %T\n",
399 step_expr, init_expr);
400
401 *init = init_expr;
402 *step = step_expr;
403
404 if (TREE_CODE (step_expr) != INTEGER_CST
405 && (TREE_CODE (step_expr) != SSA_NAME
406 || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
407 && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
408 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
409 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
410 || !flag_associative_math)))
411 && (TREE_CODE (step_expr) != REAL_CST
412 || !flag_associative_math))
413 {
414 if (dump_enabled_p ())
415 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
416 "step unknown.\n");
417 return false;
418 }
419
420 return true;
421 }
422
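/* Illustrative sketch (hypothetical example, not taken from this file):
   for an induction such as

     int p = p0;
     for (i = 0; i < n; i++)
       p += 4;

   scev gives p's loop-header PHI the access function {p0, +, 4}_1
   (the loop number 1 is arbitrary here), so the function above returns
   true with *INIT = p0 and *STEP = 4.  If the evolution part were
   itself a chrec, e.g. an access function like {0, +, {1, +, 2}_1}_1
   whose step varies, the tree_is_chrec check above rejects it.  */
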
423 /* Return true if PHI, described by STMT_INFO, is the inner PHI in
424 what we are assuming is a double reduction. For example, given
425 a structure like this:
426
427 outer1:
428 x_1 = PHI <x_4(outer2), ...>;
429 ...
430
431 inner:
432 x_2 = PHI <x_1(outer1), ...>;
433 ...
434 x_3 = ...;
435 ...
436
437 outer2:
438 x_4 = PHI <x_3(inner)>;
439 ...
440
441 outer loop analysis would treat x_1 as a double reduction phi and
442 this function would then return true for x_2. */
443
444 static bool
445 vect_inner_phi_in_double_reduction_p (loop_vec_info loop_vinfo, gphi *phi)
446 {
447 use_operand_p use_p;
448 ssa_op_iter op_iter;
449 FOR_EACH_PHI_ARG (use_p, phi, op_iter, SSA_OP_USE)
450 if (stmt_vec_info def_info = loop_vinfo->lookup_def (USE_FROM_PTR (use_p)))
451 if (STMT_VINFO_DEF_TYPE (def_info) == vect_double_reduction_def)
452 return true;
453 return false;
454 }
455
456 /* Function vect_analyze_scalar_cycles_1.
457
458 Examine the cross iteration def-use cycles of scalar variables
459 in LOOP. LOOP_VINFO represents the loop that is now being
460 considered for vectorization (can be LOOP, or an outer-loop
461 enclosing LOOP). */
462
463 static void
464 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, class loop *loop)
465 {
466 basic_block bb = loop->header;
467 tree init, step;
468 auto_vec<stmt_vec_info, 64> worklist;
469 gphi_iterator gsi;
470 bool double_reduc, reduc_chain;
471
472 DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");
473
474 /* First - identify all inductions. Reduction detection assumes that all the
475 inductions have been identified; therefore, this order must not be
476 changed. */
477 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
478 {
479 gphi *phi = gsi.phi ();
480 tree access_fn = NULL;
481 tree def = PHI_RESULT (phi);
482 stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (phi);
483
484 if (dump_enabled_p ())
485 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi);
486
487 /* Skip virtual phi's. The data dependences that are associated with
488 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
489 if (virtual_operand_p (def))
490 continue;
491
492 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
493
494 /* Analyze the evolution function. */
495 access_fn = analyze_scalar_evolution (loop, def);
496 if (access_fn)
497 {
498 STRIP_NOPS (access_fn);
499 if (dump_enabled_p ())
500 dump_printf_loc (MSG_NOTE, vect_location,
501 "Access function of PHI: %T\n", access_fn);
502 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
503 = initial_condition_in_loop_num (access_fn, loop->num);
504 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
505 = evolution_part_in_loop_num (access_fn, loop->num);
506 }
507
508 if (!access_fn
509 || vect_inner_phi_in_double_reduction_p (loop_vinfo, phi)
510 || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
511 || (LOOP_VINFO_LOOP (loop_vinfo) != loop
512 && TREE_CODE (step) != INTEGER_CST))
513 {
514 worklist.safe_push (stmt_vinfo);
515 continue;
516 }
517
518 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
519 != NULL_TREE);
520 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
521
522 if (dump_enabled_p ())
523 dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
524 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
525 }
526
527
528 /* Second - identify all reductions and nested cycles. */
529 while (worklist.length () > 0)
530 {
531 stmt_vec_info stmt_vinfo = worklist.pop ();
532 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
533 tree def = PHI_RESULT (phi);
534
535 if (dump_enabled_p ())
536 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi);
537
538 gcc_assert (!virtual_operand_p (def)
539 && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
540
541 stmt_vec_info reduc_stmt_info
542 = vect_is_simple_reduction (loop_vinfo, stmt_vinfo, &double_reduc,
543 &reduc_chain);
544 if (reduc_stmt_info)
545 {
546 STMT_VINFO_REDUC_DEF (stmt_vinfo) = reduc_stmt_info;
547 STMT_VINFO_REDUC_DEF (reduc_stmt_info) = stmt_vinfo;
548 if (double_reduc)
549 {
550 if (dump_enabled_p ())
551 dump_printf_loc (MSG_NOTE, vect_location,
552 "Detected double reduction.\n");
553
554 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
555 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_double_reduction_def;
556 }
557 else
558 {
559 if (loop != LOOP_VINFO_LOOP (loop_vinfo))
560 {
561 if (dump_enabled_p ())
562 dump_printf_loc (MSG_NOTE, vect_location,
563 "Detected vectorizable nested cycle.\n");
564
565 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
566 }
567 else
568 {
569 if (dump_enabled_p ())
570 dump_printf_loc (MSG_NOTE, vect_location,
571 "Detected reduction.\n");
572
573 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
574 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_reduction_def;
575 /* Store the reduction cycles for possible vectorization in
576 loop-aware SLP if it was not detected as reduction
577 chain. */
578 if (! reduc_chain)
579 LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push
580 (reduc_stmt_info);
581 }
582 }
583 }
584 else
585 if (dump_enabled_p ())
586 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
587 "Unknown def-use cycle pattern.\n");
588 }
589 }
590
591
592 /* Function vect_analyze_scalar_cycles.
593
594 Examine the cross iteration def-use cycles of scalar variables, by
595 analyzing the loop-header PHIs of scalar variables. Classify each
596 cycle as one of the following: invariant, induction, reduction, unknown.
597 We do that for the loop represented by LOOP_VINFO, and also for its
598 inner-loop, if it exists.
599 Examples for scalar cycles:
600
601 Example1: reduction:
602
603 loop1:
604 for (i=0; i<N; i++)
605 sum += a[i];
606
607 Example2: induction:
608
609 loop2:
610 for (i=0; i<N; i++)
611 a[i] = i; */
612
613 static void
614 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
615 {
616 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
617
618 vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
619
620 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
621 Reductions in such an inner-loop therefore have different properties than
622 the reductions in the nest that gets vectorized:
623 1. When vectorized, they are executed in the same order as in the original
624 scalar loop, so we can't change the order of computation when
625 vectorizing them.
626 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
627 current checks are too strict. */
628
629 if (loop->inner)
630 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
631 }
632
633 /* Transfer group and reduction information from STMT_INFO to its
634 pattern stmt. */
635
636 static void
637 vect_fixup_reduc_chain (stmt_vec_info stmt_info)
638 {
639 stmt_vec_info firstp = STMT_VINFO_RELATED_STMT (stmt_info);
640 stmt_vec_info stmtp;
641 gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (firstp)
642 && REDUC_GROUP_FIRST_ELEMENT (stmt_info));
643 REDUC_GROUP_SIZE (firstp) = REDUC_GROUP_SIZE (stmt_info);
644 do
645 {
646 stmtp = STMT_VINFO_RELATED_STMT (stmt_info);
647 gcc_checking_assert (STMT_VINFO_DEF_TYPE (stmtp)
648 == STMT_VINFO_DEF_TYPE (stmt_info));
649 REDUC_GROUP_FIRST_ELEMENT (stmtp) = firstp;
650 stmt_info = REDUC_GROUP_NEXT_ELEMENT (stmt_info);
651 if (stmt_info)
652 REDUC_GROUP_NEXT_ELEMENT (stmtp)
653 = STMT_VINFO_RELATED_STMT (stmt_info);
654 }
655 while (stmt_info);
656 }
657
658 /* Fixup scalar cycles that now have their stmts detected as patterns. */
659
660 static void
661 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
662 {
663 stmt_vec_info first;
664 unsigned i;
665
666 FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
667 if (STMT_VINFO_IN_PATTERN_P (first))
668 {
669 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
670 while (next)
671 {
672 if (! STMT_VINFO_IN_PATTERN_P (next)
673 || STMT_VINFO_REDUC_IDX (STMT_VINFO_RELATED_STMT (next)) == -1)
674 break;
675 next = REDUC_GROUP_NEXT_ELEMENT (next);
676 }
677 /* If not all stmts in the chain are patterns or if we failed
678 to update STMT_VINFO_REDUC_IDX try to handle the chain
679 without patterns. */
680 if (! next
681 && STMT_VINFO_REDUC_IDX (STMT_VINFO_RELATED_STMT (first)) != -1)
682 {
683 vect_fixup_reduc_chain (first);
684 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
685 = STMT_VINFO_RELATED_STMT (first);
686 }
687 }
688 }
689
690 /* Function vect_get_loop_niters.
691
692 Determine how many iterations the loop executes and place the count
693 in NUMBER_OF_ITERATIONS. Place the number of latch iterations
694 in NUMBER_OF_ITERATIONSM1. Place the condition under which the
695 niter information holds in ASSUMPTIONS.
696
697 Return the loop exit condition. */
698
699
700 static gcond *
701 vect_get_loop_niters (class loop *loop, tree *assumptions,
702 tree *number_of_iterations, tree *number_of_iterationsm1)
703 {
704 edge exit = single_exit (loop);
705 class tree_niter_desc niter_desc;
706 tree niter_assumptions, niter, may_be_zero;
707 gcond *cond = get_loop_exit_condition (loop);
708
709 *assumptions = boolean_true_node;
710 *number_of_iterationsm1 = chrec_dont_know;
711 *number_of_iterations = chrec_dont_know;
712 DUMP_VECT_SCOPE ("get_loop_niters");
713
714 if (!exit)
715 return cond;
716
717 may_be_zero = NULL_TREE;
718 if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
719 || chrec_contains_undetermined (niter_desc.niter))
720 return cond;
721
722 niter_assumptions = niter_desc.assumptions;
723 may_be_zero = niter_desc.may_be_zero;
724 niter = niter_desc.niter;
725
726 if (may_be_zero && integer_zerop (may_be_zero))
727 may_be_zero = NULL_TREE;
728
729 if (may_be_zero)
730 {
731 if (COMPARISON_CLASS_P (may_be_zero))
732 {
733 /* Try to combine may_be_zero with assumptions, this can simplify
734 computation of niter expression. */
735 if (niter_assumptions && !integer_nonzerop (niter_assumptions))
736 niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
737 niter_assumptions,
738 fold_build1 (TRUTH_NOT_EXPR,
739 boolean_type_node,
740 may_be_zero));
741 else
742 niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
743 build_int_cst (TREE_TYPE (niter), 0),
744 rewrite_to_non_trapping_overflow (niter));
745
746 may_be_zero = NULL_TREE;
747 }
748 else if (integer_nonzerop (may_be_zero))
749 {
750 *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
751 *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
752 return cond;
753 }
754 else
755 return cond;
756 }
757
758 *assumptions = niter_assumptions;
759 *number_of_iterationsm1 = niter;
760
761 /* We want the number of loop header executions which is the number
762 of latch executions plus one.
763 ??? For UINT_MAX latch executions this number overflows to zero
764 for loops like do { n++; } while (n != 0); */
765 if (niter && !chrec_contains_undetermined (niter))
766 niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter),
767 build_int_cst (TREE_TYPE (niter), 1));
768 *number_of_iterations = niter;
769
770 return cond;
771 }
772
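/* Worked example (illustration only): for a simple counted loop

     for (i = 0; i < n; i++)  ...

   with n > 0 the latch executes n - 1 times, so the analysis above
   yields *NUMBER_OF_ITERATIONSM1 = n - 1 and, after the PLUS_EXPR at
   the end, *NUMBER_OF_ITERATIONS = n (the number of header executions).
   When the body might not run at all, MAY_BE_ZERO is either folded
   into *ASSUMPTIONS or the count is wrapped in a COND_EXPR selecting 0,
   as the code above does.  */
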
773 /* Function bb_in_loop_p
774
775 Used as predicate for dfs order traversal of the loop bbs. */
776
777 static bool
778 bb_in_loop_p (const_basic_block bb, const void *data)
779 {
780 const class loop *const loop = (const class loop *)data;
781 if (flow_bb_inside_loop_p (loop, bb))
782 return true;
783 return false;
784 }
785
786
787 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
788 stmt_vec_info structs for all the stmts in LOOP_IN. */
789
790 _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
791 : vec_info (vec_info::loop, init_cost (loop_in), shared),
792 loop (loop_in),
793 bbs (XCNEWVEC (basic_block, loop->num_nodes)),
794 num_itersm1 (NULL_TREE),
795 num_iters (NULL_TREE),
796 num_iters_unchanged (NULL_TREE),
797 num_iters_assumptions (NULL_TREE),
798 th (0),
799 versioning_threshold (0),
800 vectorization_factor (0),
801 max_vectorization_factor (0),
802 mask_skip_niters (NULL_TREE),
803 mask_compare_type (NULL_TREE),
804 simd_if_cond (NULL_TREE),
805 unaligned_dr (NULL),
806 peeling_for_alignment (0),
807 ptr_mask (0),
808 ivexpr_map (NULL),
809 scan_map (NULL),
810 slp_unrolling_factor (1),
811 single_scalar_iteration_cost (0),
812 vec_outside_cost (0),
813 vec_inside_cost (0),
814 vectorizable (false),
815 can_fully_mask_p (true),
816 fully_masked_p (false),
817 peeling_for_gaps (false),
818 peeling_for_niter (false),
819 no_data_dependencies (false),
820 has_mask_store (false),
821 scalar_loop_scaling (profile_probability::uninitialized ()),
822 scalar_loop (NULL),
823 orig_loop_info (NULL)
824 {
825 /* CHECKME: We want to visit all BBs before their successors (except for
826 latch blocks, for which this assertion wouldn't hold). In the simple
827 case of the loop forms we allow, a dfs order of the BBs would be the same
828 as reversed postorder traversal, so we are safe. */
829
830 unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
831 bbs, loop->num_nodes, loop);
832 gcc_assert (nbbs == loop->num_nodes);
833
834 for (unsigned int i = 0; i < nbbs; i++)
835 {
836 basic_block bb = bbs[i];
837 gimple_stmt_iterator si;
838
839 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
840 {
841 gimple *phi = gsi_stmt (si);
842 gimple_set_uid (phi, 0);
843 add_stmt (phi);
844 }
845
846 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
847 {
848 gimple *stmt = gsi_stmt (si);
849 gimple_set_uid (stmt, 0);
850 add_stmt (stmt);
851 /* If the .GOMP_SIMD_LANE call for the current loop has 3 arguments, the
852 third argument is the #pragma omp simd if (x) condition: when it is 0,
853 the loop shouldn't be vectorized; when it is a non-zero constant, it
854 should be vectorized normally; otherwise the loop is versioned, with the
855 vectorized loop taken if the condition is non-zero at runtime. */
856 if (loop_in->simduid
857 && is_gimple_call (stmt)
858 && gimple_call_internal_p (stmt)
859 && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
860 && gimple_call_num_args (stmt) >= 3
861 && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
862 && (loop_in->simduid
863 == SSA_NAME_VAR (gimple_call_arg (stmt, 0))))
864 {
865 tree arg = gimple_call_arg (stmt, 2);
866 if (integer_zerop (arg) || TREE_CODE (arg) == SSA_NAME)
867 simd_if_cond = arg;
868 else
869 gcc_assert (integer_nonzerop (arg));
870 }
871 }
872 }
873
874 epilogue_vinfos.create (6);
875 }
876
877 /* Free all levels of MASKS. */
878
879 void
880 release_vec_loop_masks (vec_loop_masks *masks)
881 {
882 rgroup_masks *rgm;
883 unsigned int i;
884 FOR_EACH_VEC_ELT (*masks, i, rgm)
885 rgm->masks.release ();
886 masks->release ();
887 }
888
889 /* Free all memory used by the _loop_vec_info, as well as all the
890 stmt_vec_info structs of all the stmts in the loop. */
891
892 _loop_vec_info::~_loop_vec_info ()
893 {
894 free (bbs);
895
896 release_vec_loop_masks (&masks);
897 delete ivexpr_map;
898 delete scan_map;
899 epilogue_vinfos.release ();
900
901 loop->aux = NULL;
902 }
903
904 /* Return an invariant or register for EXPR and emit necessary
905 computations in the LOOP_VINFO loop preheader. */
906
907 tree
908 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
909 {
910 if (is_gimple_reg (expr)
911 || is_gimple_min_invariant (expr))
912 return expr;
913
914 if (! loop_vinfo->ivexpr_map)
915 loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
916 tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
917 if (! cached)
918 {
919 gimple_seq stmts = NULL;
920 cached = force_gimple_operand (unshare_expr (expr),
921 &stmts, true, NULL_TREE);
922 if (stmts)
923 {
924 edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
925 gsi_insert_seq_on_edge_immediate (e, stmts);
926 }
927 }
928 return cached;
929 }
930
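/* Usage sketch (hypothetical caller, for illustration): asking for an
   invariant expression such as

     tree seg_len = fold_build2 (MULT_EXPR, sizetype, n, size_int (4));
     tree reg = cse_and_gimplify_to_preheader (loop_vinfo, seg_len);

   returns either the expression itself (if it is already a register or
   constant) or an SSA name computed on the preheader edge; a second
   request for an equal expression returns the cached SSA name.  */
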
931 /* Return true if we can use CMP_TYPE as the comparison type to produce
932 all masks required to mask LOOP_VINFO. */
933
934 static bool
935 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
936 {
937 rgroup_masks *rgm;
938 unsigned int i;
939 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
940 if (rgm->mask_type != NULL_TREE
941 && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
942 cmp_type, rgm->mask_type,
943 OPTIMIZE_FOR_SPEED))
944 return false;
945 return true;
946 }
947
948 /* Calculate the maximum number of scalars per iteration for every
949 rgroup in LOOP_VINFO. */
950
951 static unsigned int
952 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
953 {
954 unsigned int res = 1;
955 unsigned int i;
956 rgroup_masks *rgm;
957 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
958 res = MAX (res, rgm->max_nscalars_per_iter);
959 return res;
960 }
961
962 /* Each statement in LOOP_VINFO can be masked where necessary. Check
963 whether we can actually generate the masks required. Return true if so,
964 storing the type of the scalar IV in LOOP_VINFO_MASK_COMPARE_TYPE. */
965
966 static bool
967 vect_verify_full_masking (loop_vec_info loop_vinfo)
968 {
969 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
970 unsigned int min_ni_width;
971 unsigned int max_nscalars_per_iter
972 = vect_get_max_nscalars_per_iter (loop_vinfo);
973
974 /* Use a normal loop if there are no statements that need masking.
975 This only happens in rare degenerate cases: it means that the loop
976 has no loads, no stores, and no live-out values. */
977 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
978 return false;
979
980 /* Get the maximum number of iterations that is representable
981 in the counter type. */
982 tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
983 widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
984
985 /* Get a more refined estimate for the number of iterations. */
986 widest_int max_back_edges;
987 if (max_loop_iterations (loop, &max_back_edges))
988 max_ni = wi::smin (max_ni, max_back_edges + 1);
989
990 /* Account for rgroup masks, in which each bit is replicated N times. */
991 max_ni *= max_nscalars_per_iter;
992
993 /* Work out how many bits we need to represent the limit. */
994 min_ni_width = wi::min_precision (max_ni, UNSIGNED);
995
996 /* Find a scalar mode for which WHILE_ULT is supported. */
997 opt_scalar_int_mode cmp_mode_iter;
998 tree cmp_type = NULL_TREE;
999 tree iv_type = NULL_TREE;
1000 widest_int iv_limit = vect_iv_limit_for_full_masking (loop_vinfo);
1001 unsigned int iv_precision = UINT_MAX;
1002
1003 if (iv_limit != -1)
1004 iv_precision = wi::min_precision (iv_limit * max_nscalars_per_iter,
1005 UNSIGNED);
1006
1007 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1008 {
1009 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1010 if (cmp_bits >= min_ni_width
1011 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1012 {
1013 tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1014 if (this_type
1015 && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1016 {
1017 /* Although we could stop as soon as we find a valid mode,
1018 there are at least two reasons why that's not always the
1019 best choice:
1020
1021 - An IV that's Pmode or wider is more likely to be reusable
1022 in address calculations than an IV that's narrower than
1023 Pmode.
1024
1025 - Doing the comparison in IV_PRECISION or wider allows
1026 a natural 0-based IV, whereas using a narrower comparison
1027 type requires mitigations against wrap-around.
1028
1029 Conversely, if the IV limit is variable, doing the comparison
1030 in a wider type than the original type can introduce
1031 unnecessary extensions, so picking the widest valid mode
1032 is not always a good choice either.
1033
1034 Here we prefer the first IV type that's Pmode or wider,
1035 and the first comparison type that's IV_PRECISION or wider.
1036 (The comparison type must be no wider than the IV type,
1037 to avoid extensions in the vector loop.)
1038
1039 ??? We might want to try continuing beyond Pmode for ILP32
1040 targets if CMP_BITS < IV_PRECISION. */
1041 iv_type = this_type;
1042 if (!cmp_type || iv_precision > TYPE_PRECISION (cmp_type))
1043 cmp_type = this_type;
1044 if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1045 break;
1046 }
1047 }
1048 }
1049
1050 if (!cmp_type)
1051 return false;
1052
1053 LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo) = cmp_type;
1054 LOOP_VINFO_MASK_IV_TYPE (loop_vinfo) = iv_type;
1055 return true;
1056 }
1057
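/* Conceptual sketch (not literal code from this file): with full
   masking the IV counts scalar iterations and each WHILE_ULT produces
   a mask whose lane J is active iff IV + J < the iteration limit, e.g.
   for a single rgroup with 4 scalars per vector iteration:

     vector iteration K:
       mask = { 4*K+0 < n, 4*K+1 < n, 4*K+2 < n, 4*K+3 < n }

   Hence the loop above only needs a comparison type wide enough to
   hold max_ni * max_nscalars_per_iter without wrapping, which is what
   min_ni_width expresses.  */
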
1058 /* Calculate the cost of one scalar iteration of the loop. */
1059 static void
1060 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1061 {
1062 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1063 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1064 int nbbs = loop->num_nodes, factor;
1065 int innerloop_iters, i;
1066
1067 DUMP_VECT_SCOPE ("vect_compute_single_scalar_iteration_cost");
1068
1069 /* Gather costs for statements in the scalar loop. */
1070
1071 /* FORNOW. */
1072 innerloop_iters = 1;
1073 if (loop->inner)
1074 innerloop_iters = 50; /* FIXME */
1075
1076 for (i = 0; i < nbbs; i++)
1077 {
1078 gimple_stmt_iterator si;
1079 basic_block bb = bbs[i];
1080
1081 if (bb->loop_father == loop->inner)
1082 factor = innerloop_iters;
1083 else
1084 factor = 1;
1085
1086 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1087 {
1088 gimple *stmt = gsi_stmt (si);
1089 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
1090
1091 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1092 continue;
1093
1094 /* Skip stmts that are not vectorized inside the loop. */
1095 stmt_vec_info vstmt_info = vect_stmt_to_vectorize (stmt_info);
1096 if (!STMT_VINFO_RELEVANT_P (vstmt_info)
1097 && (!STMT_VINFO_LIVE_P (vstmt_info)
1098 || !VECTORIZABLE_CYCLE_DEF
1099 (STMT_VINFO_DEF_TYPE (vstmt_info))))
1100 continue;
1101
1102 vect_cost_for_stmt kind;
1103 if (STMT_VINFO_DATA_REF (stmt_info))
1104 {
1105 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1106 kind = scalar_load;
1107 else
1108 kind = scalar_store;
1109 }
1110 else if (vect_nop_conversion_p (stmt_info))
1111 continue;
1112 else
1113 kind = scalar_stmt;
1114
1115 record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1116 factor, kind, stmt_info, 0, vect_prologue);
1117 }
1118 }
1119
1120 /* Now accumulate cost. */
1121 void *target_cost_data = init_cost (loop);
1122 stmt_info_for_cost *si;
1123 int j;
1124 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1125 j, si)
1126 (void) add_stmt_cost (loop_vinfo, target_cost_data, si->count,
1127 si->kind, si->stmt_info, si->vectype,
1128 si->misalign, vect_body);
1129 unsigned dummy, body_cost = 0;
1130 finish_cost (target_cost_data, &dummy, &body_cost, &dummy);
1131 destroy_cost_data (target_cost_data);
1132 LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo) = body_cost;
1133 }
1134
1135
1136 /* Function vect_analyze_loop_form_1.
1137
1138 Verify that certain CFG restrictions hold, including:
1139 - the loop has a pre-header
1140 - the loop has a single entry and exit
1141 - the loop exit condition is simple enough
1142 - the number of iterations can be analyzed, i.e., a countable loop. The
1143 niter could be analyzed under some assumptions. */
1144
1145 opt_result
1146 vect_analyze_loop_form_1 (class loop *loop, gcond **loop_cond,
1147 tree *assumptions, tree *number_of_iterationsm1,
1148 tree *number_of_iterations, gcond **inner_loop_cond)
1149 {
1150 DUMP_VECT_SCOPE ("vect_analyze_loop_form");
1151
1152 /* Different restrictions apply when we are considering an inner-most loop,
1153 vs. an outer (nested) loop.
1154 (FORNOW. May want to relax some of these restrictions in the future). */
1155
1156 if (!loop->inner)
1157 {
1158 /* Inner-most loop. We currently require that the number of BBs is
1159 exactly 2 (the header and latch). Vectorizable inner-most loops
1160 look like this:
1161
1162 (pre-header)
1163 |
1164 header <--------+
1165 | | |
1166 | +--> latch --+
1167 |
1168 (exit-bb) */
1169
1170 if (loop->num_nodes != 2)
1171 return opt_result::failure_at (vect_location,
1172 "not vectorized:"
1173 " control flow in loop.\n");
1174
1175 if (empty_block_p (loop->header))
1176 return opt_result::failure_at (vect_location,
1177 "not vectorized: empty loop.\n");
1178 }
1179 else
1180 {
1181 class loop *innerloop = loop->inner;
1182 edge entryedge;
1183
1184 /* Nested loop. We currently require that the loop is doubly-nested,
1185 contains a single inner loop, and the number of BBs is exactly 5.
1186 Vectorizable outer-loops look like this:
1187
1188 (pre-header)
1189 |
1190 header <---+
1191 | |
1192 inner-loop |
1193 | |
1194 tail ------+
1195 |
1196 (exit-bb)
1197
1198 The inner-loop has the properties expected of inner-most loops
1199 as described above. */
1200
1201 if ((loop->inner)->inner || (loop->inner)->next)
1202 return opt_result::failure_at (vect_location,
1203 "not vectorized:"
1204 " multiple nested loops.\n");
1205
1206 if (loop->num_nodes != 5)
1207 return opt_result::failure_at (vect_location,
1208 "not vectorized:"
1209 " control flow in loop.\n");
1210
1211 entryedge = loop_preheader_edge (innerloop);
1212 if (entryedge->src != loop->header
1213 || !single_exit (innerloop)
1214 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1215 return opt_result::failure_at (vect_location,
1216 "not vectorized:"
1217 " unsupported outerloop form.\n");
1218
1219 /* Analyze the inner-loop. */
1220 tree inner_niterm1, inner_niter, inner_assumptions;
1221 opt_result res
1222 = vect_analyze_loop_form_1 (loop->inner, inner_loop_cond,
1223 &inner_assumptions, &inner_niterm1,
1224 &inner_niter, NULL);
1225 if (!res)
1226 {
1227 if (dump_enabled_p ())
1228 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1229 "not vectorized: Bad inner loop.\n");
1230 return res;
1231 }
1232
1233 /* Don't support analyzing niter under assumptions for inner
1234 loop. */
1235 if (!integer_onep (inner_assumptions))
1236 return opt_result::failure_at (vect_location,
1237 "not vectorized: Bad inner loop.\n");
1238
1239 if (!expr_invariant_in_loop_p (loop, inner_niter))
1240 return opt_result::failure_at (vect_location,
1241 "not vectorized: inner-loop count not"
1242 " invariant.\n");
1243
1244 if (dump_enabled_p ())
1245 dump_printf_loc (MSG_NOTE, vect_location,
1246 "Considering outer-loop vectorization.\n");
1247 }
1248
1249 if (!single_exit (loop))
1250 return opt_result::failure_at (vect_location,
1251 "not vectorized: multiple exits.\n");
1252 if (EDGE_COUNT (loop->header->preds) != 2)
1253 return opt_result::failure_at (vect_location,
1254 "not vectorized:"
1255 " too many incoming edges.\n");
1256
1257 /* We assume that the loop exit condition is at the end of the loop, i.e.,
1258 that the loop is represented as a do-while (with a proper if-guard
1259 before the loop if needed), where the loop header contains all the
1260 executable statements, and the latch is empty. */
1261 if (!empty_block_p (loop->latch)
1262 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1263 return opt_result::failure_at (vect_location,
1264 "not vectorized: latch block not empty.\n");
1265
1266 /* Make sure the exit is not abnormal. */
1267 edge e = single_exit (loop);
1268 if (e->flags & EDGE_ABNORMAL)
1269 return opt_result::failure_at (vect_location,
1270 "not vectorized:"
1271 " abnormal loop exit edge.\n");
1272
1273 *loop_cond = vect_get_loop_niters (loop, assumptions, number_of_iterations,
1274 number_of_iterationsm1);
1275 if (!*loop_cond)
1276 return opt_result::failure_at
1277 (vect_location,
1278 "not vectorized: complicated exit condition.\n");
1279
1280 if (integer_zerop (*assumptions)
1281 || !*number_of_iterations
1282 || chrec_contains_undetermined (*number_of_iterations))
1283 return opt_result::failure_at
1284 (*loop_cond,
1285 "not vectorized: number of iterations cannot be computed.\n");
1286
1287 if (integer_zerop (*number_of_iterations))
1288 return opt_result::failure_at
1289 (*loop_cond,
1290 "not vectorized: number of iterations = 0.\n");
1291
1292 return opt_result::success ();
1293 }
1294
1295 /* Analyze LOOP form and return a loop_vec_info if it is of suitable form. */
1296
1297 opt_loop_vec_info
1298 vect_analyze_loop_form (class loop *loop, vec_info_shared *shared)
1299 {
1300 tree assumptions, number_of_iterations, number_of_iterationsm1;
1301 gcond *loop_cond, *inner_loop_cond = NULL;
1302
1303 opt_result res
1304 = vect_analyze_loop_form_1 (loop, &loop_cond,
1305 &assumptions, &number_of_iterationsm1,
1306 &number_of_iterations, &inner_loop_cond);
1307 if (!res)
1308 return opt_loop_vec_info::propagate_failure (res);
1309
1310 loop_vec_info loop_vinfo = new _loop_vec_info (loop, shared);
1311 LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
1312 LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1313 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1314 if (!integer_onep (assumptions))
1315 {
1316 /* We consider vectorizing this loop by versioning it under
1317 some assumptions. In order to do this, we need to clear
1318 existing information computed by scev and niter analyzer. */
1319 scev_reset_htab ();
1320 free_numbers_of_iterations_estimates (loop);
1321 /* Also set flag for this loop so that following scev and niter
1322 analysis are done under the assumptions. */
1323 loop_constraint_set (loop, LOOP_C_FINITE);
1324 /* Also record the assumptions for versioning. */
1325 LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = assumptions;
1326 }
1327
1328 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1329 {
1330 if (dump_enabled_p ())
1331 {
1332 dump_printf_loc (MSG_NOTE, vect_location,
1333 "Symbolic number of iterations is ");
1334 dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1335 dump_printf (MSG_NOTE, "\n");
1336 }
1337 }
1338
1339 stmt_vec_info loop_cond_info = loop_vinfo->lookup_stmt (loop_cond);
1340 STMT_VINFO_TYPE (loop_cond_info) = loop_exit_ctrl_vec_info_type;
1341 if (inner_loop_cond)
1342 {
1343 stmt_vec_info inner_loop_cond_info
1344 = loop_vinfo->lookup_stmt (inner_loop_cond);
1345 STMT_VINFO_TYPE (inner_loop_cond_info) = loop_exit_ctrl_vec_info_type;
1346 }
1347
1348 gcc_assert (!loop->aux);
1349 loop->aux = loop_vinfo;
1350 return opt_loop_vec_info::success (loop_vinfo);
1351 }
1352
1353
1354
1355 /* Scan the loop stmts and, depending on whether there are any (non-)SLP
1356 statements, update the vectorization factor. */
1357
1358 static void
1359 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1360 {
1361 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1362 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1363 int nbbs = loop->num_nodes;
1364 poly_uint64 vectorization_factor;
1365 int i;
1366
1367 DUMP_VECT_SCOPE ("vect_update_vf_for_slp");
1368
1369 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1370 gcc_assert (known_ne (vectorization_factor, 0U));
1371
1372 /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1373 vectorization factor of the loop is the unrolling factor required by
1374 the SLP instances. If that unrolling factor is 1, we say that we
1375 perform pure SLP on the loop; cross-iteration parallelism is not
1376 exploited. */
1377 bool only_slp_in_loop = true;
1378 for (i = 0; i < nbbs; i++)
1379 {
1380 basic_block bb = bbs[i];
1381 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1382 gsi_next (&si))
1383 {
1384 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (si.phi ());
1385 if (!stmt_info)
1386 continue;
1387 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1388 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1389 && !PURE_SLP_STMT (stmt_info))
1390 /* STMT needs both SLP and loop-based vectorization. */
1391 only_slp_in_loop = false;
1392 }
1393 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1394 gsi_next (&si))
1395 {
1396 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
1397 stmt_info = vect_stmt_to_vectorize (stmt_info);
1398 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1399 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1400 && !PURE_SLP_STMT (stmt_info))
1401 /* STMT needs both SLP and loop-based vectorization. */
1402 only_slp_in_loop = false;
1403 }
1404 }
1405
1406 if (only_slp_in_loop)
1407 {
1408 if (dump_enabled_p ())
1409 dump_printf_loc (MSG_NOTE, vect_location,
1410 "Loop contains only SLP stmts\n");
1411 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1412 }
1413 else
1414 {
1415 if (dump_enabled_p ())
1416 dump_printf_loc (MSG_NOTE, vect_location,
1417 "Loop contains SLP and non-SLP stmts\n");
1418 /* Both the vectorization factor and unroll factor have the form
1419 GET_MODE_SIZE (loop_vinfo->vector_mode) * X for some rational X,
1420 so they must have a common multiple. */
1421 vectorization_factor
1422 = force_common_multiple (vectorization_factor,
1423 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1424 }
1425
1426 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1427 if (dump_enabled_p ())
1428 {
1429 dump_printf_loc (MSG_NOTE, vect_location,
1430 "Updating vectorization factor to ");
1431 dump_dec (MSG_NOTE, vectorization_factor);
1432 dump_printf (MSG_NOTE, ".\n");
1433 }
1434 }
1435
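/* Worked example (illustration only): if the loop-based vectorization
   factor was 4 but the SLP instances require an unrolling factor of 6,
   force_common_multiple yields 12, i.e. the loop is unrolled enough
   for both the non-SLP stmts (12/4 = 3 copies) and the SLP instances
   (12/6 = 2 copies).  */
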
1436 /* Return true if STMT_INFO describes a double reduction phi and if
1437 the other phi in the reduction is also relevant for vectorization.
1438 This rejects cases such as:
1439
1440 outer1:
1441 x_1 = PHI <x_3(outer2), ...>;
1442 ...
1443
1444 inner:
1445 x_2 = ...;
1446 ...
1447
1448 outer2:
1449 x_3 = PHI <x_2(inner)>;
1450
1451 if nothing in x_2 or elsewhere makes x_1 relevant. */
1452
1453 static bool
1454 vect_active_double_reduction_p (stmt_vec_info stmt_info)
1455 {
1456 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
1457 return false;
1458
1459 return STMT_VINFO_RELEVANT_P (STMT_VINFO_REDUC_DEF (stmt_info));
1460 }
1461
1462 /* Function vect_analyze_loop_operations.
1463
1464 Scan the loop stmts and make sure they are all vectorizable. */
1465
1466 static opt_result
1467 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1468 {
1469 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1470 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1471 int nbbs = loop->num_nodes;
1472 int i;
1473 stmt_vec_info stmt_info;
1474 bool need_to_vectorize = false;
1475 bool ok;
1476
1477 DUMP_VECT_SCOPE ("vect_analyze_loop_operations");
1478
1479 auto_vec<stmt_info_for_cost> cost_vec;
1480
1481 for (i = 0; i < nbbs; i++)
1482 {
1483 basic_block bb = bbs[i];
1484
1485 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1486 gsi_next (&si))
1487 {
1488 gphi *phi = si.phi ();
1489 ok = true;
1490
1491 stmt_info = loop_vinfo->lookup_stmt (phi);
1492 if (dump_enabled_p ())
1493 dump_printf_loc (MSG_NOTE, vect_location, "examining phi: %G", phi);
1494 if (virtual_operand_p (gimple_phi_result (phi)))
1495 continue;
1496
1497 /* Inner-loop loop-closed exit phi in outer-loop vectorization
1498 (i.e., a phi in the tail of the outer-loop). */
1499 if (! is_loop_header_bb_p (bb))
1500 {
1501 /* FORNOW: we currently don't support the case that these phis
1502 are not used in the outerloop (unless it is double reduction,
1503 i.e., this phi is vect_reduction_def), because this case
1504 requires us to actually do something here. */
1505 if (STMT_VINFO_LIVE_P (stmt_info)
1506 && !vect_active_double_reduction_p (stmt_info))
1507 return opt_result::failure_at (phi,
1508 "Unsupported loop-closed phi"
1509 " in outer-loop.\n");
1510
1511 /* If PHI is used in the outer loop, we check that its operand
1512 is defined in the inner loop. */
1513 if (STMT_VINFO_RELEVANT_P (stmt_info))
1514 {
1515 tree phi_op;
1516
1517 if (gimple_phi_num_args (phi) != 1)
1518 return opt_result::failure_at (phi, "unsupported phi");
1519
1520 phi_op = PHI_ARG_DEF (phi, 0);
1521 stmt_vec_info op_def_info = loop_vinfo->lookup_def (phi_op);
1522 if (!op_def_info)
1523 return opt_result::failure_at (phi, "unsupported phi\n");
1524
1525 if (STMT_VINFO_RELEVANT (op_def_info) != vect_used_in_outer
1526 && (STMT_VINFO_RELEVANT (op_def_info)
1527 != vect_used_in_outer_by_reduction))
1528 return opt_result::failure_at (phi, "unsupported phi\n");
1529
1530 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
1531 || (STMT_VINFO_DEF_TYPE (stmt_info)
1532 == vect_double_reduction_def))
1533 && !vectorizable_lc_phi (loop_vinfo,
1534 stmt_info, NULL, NULL))
1535 return opt_result::failure_at (phi, "unsupported phi\n");
1536 }
1537
1538 continue;
1539 }
1540
1541 gcc_assert (stmt_info);
1542
1543 if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1544 || STMT_VINFO_LIVE_P (stmt_info))
1545 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1546 /* A scalar-dependence cycle that we don't support. */
1547 return opt_result::failure_at (phi,
1548 "not vectorized:"
1549 " scalar dependence cycle.\n");
1550
1551 if (STMT_VINFO_RELEVANT_P (stmt_info))
1552 {
1553 need_to_vectorize = true;
1554 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
1555 && ! PURE_SLP_STMT (stmt_info))
1556 ok = vectorizable_induction (loop_vinfo,
1557 stmt_info, NULL, NULL, NULL,
1558 &cost_vec);
1559 else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1560 || (STMT_VINFO_DEF_TYPE (stmt_info)
1561 == vect_double_reduction_def)
1562 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
1563 && ! PURE_SLP_STMT (stmt_info))
1564 ok = vectorizable_reduction (loop_vinfo,
1565 stmt_info, NULL, NULL, &cost_vec);
1566 }
1567
1568 /* SLP PHIs are tested by vect_slp_analyze_node_operations. */
1569 if (ok
1570 && STMT_VINFO_LIVE_P (stmt_info)
1571 && !PURE_SLP_STMT (stmt_info))
1572 ok = vectorizable_live_operation (loop_vinfo,
1573 stmt_info, NULL, NULL, NULL,
1574 -1, false, &cost_vec);
1575
1576 if (!ok)
1577 return opt_result::failure_at (phi,
1578 "not vectorized: relevant phi not "
1579 "supported: %G",
1580 static_cast <gimple *> (phi));
1581 }
1582
1583 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1584 gsi_next (&si))
1585 {
1586 gimple *stmt = gsi_stmt (si);
1587 if (!gimple_clobber_p (stmt))
1588 {
1589 opt_result res
1590 = vect_analyze_stmt (loop_vinfo,
1591 loop_vinfo->lookup_stmt (stmt),
1592 &need_to_vectorize,
1593 NULL, NULL, &cost_vec);
1594 if (!res)
1595 return res;
1596 }
1597 }
1598 } /* bbs */
1599
1600 add_stmt_costs (loop_vinfo, loop_vinfo->target_cost_data, &cost_vec);
1601
1602 /* All operations in the loop are either irrelevant (they deal with loop
1603 control, or are dead), or only used outside the loop and can be moved
1604 out of the loop (e.g. invariants, inductions). The loop can be
1605 optimized away by scalar optimizations. We're better off not
1606 touching this loop. */
1607 if (!need_to_vectorize)
1608 {
1609 if (dump_enabled_p ())
1610 dump_printf_loc (MSG_NOTE, vect_location,
1611 "All the computation can be taken out of the loop.\n");
1612 return opt_result::failure_at
1613 (vect_location,
1614 "not vectorized: redundant loop. no profit to vectorize.\n");
1615 }
1616
1617 return opt_result::success ();
1618 }
1619
1620 /* Analyze the cost of the loop described by LOOP_VINFO. Decide if it
1621 is worthwhile to vectorize. Return 1 if definitely yes, 0 if
1622 definitely no, or -1 if it's worth retrying. */
1623
1624 static int
1625 vect_analyze_loop_costing (loop_vec_info loop_vinfo)
1626 {
1627 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1628 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1629
1630 /* Only fully-masked loops can have iteration counts less than the
1631 vectorization factor. */
1632 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
1633 {
1634 HOST_WIDE_INT max_niter;
1635
1636 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1637 max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
1638 else
1639 max_niter = max_stmt_executions_int (loop);
1640
1641 if (max_niter != -1
1642 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
1643 {
1644 if (dump_enabled_p ())
1645 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1646 "not vectorized: iteration count smaller than "
1647 "vectorization factor.\n");
1648 return 0;
1649 }
1650 }
1651
1652 int min_profitable_iters, min_profitable_estimate;
1653 vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1654 &min_profitable_estimate);
1655
1656 if (min_profitable_iters < 0)
1657 {
1658 if (dump_enabled_p ())
1659 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1660 "not vectorized: vectorization not profitable.\n");
1661 if (dump_enabled_p ())
1662 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1663 "not vectorized: vector version will never be "
1664 "profitable.\n");
1665 return -1;
1666 }
1667
1668 int min_scalar_loop_bound = (param_min_vect_loop_bound
1669 * assumed_vf);
1670
1671 /* Use the cost model only if it is more conservative than the user-specified
1672 threshold. */
1673 unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
1674 min_profitable_iters);
1675
1676 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
1677
1678 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1679 && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
1680 {
1681 if (dump_enabled_p ())
1682 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1683 "not vectorized: vectorization not profitable.\n");
1684 if (dump_enabled_p ())
1685 dump_printf_loc (MSG_NOTE, vect_location,
1686 "not vectorized: iteration count smaller than user "
1687 "specified loop bound parameter or minimum profitable "
1688 "iterations (whichever is more conservative).\n");
1689 return 0;
1690 }
1691
1692 /* The static profitability threshold min_profitable_estimate includes
1693 the cost of having to check at runtime whether the scalar loop
1694 should be used instead. If it turns out that we don't need or want
1695 such a check, the threshold we should use for the static estimate
1696 is simply the point at which the vector loop becomes more profitable
1697 than the scalar loop. */
1698 if (min_profitable_estimate > min_profitable_iters
1699 && !LOOP_REQUIRES_VERSIONING (loop_vinfo)
1700 && !LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
1701 && !LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1702 && !vect_apply_runtime_profitability_check_p (loop_vinfo))
1703 {
1704 if (dump_enabled_p ())
1705 dump_printf_loc (MSG_NOTE, vect_location, "no need for a runtime"
1706 " choice between the scalar and vector loops\n");
1707 min_profitable_estimate = min_profitable_iters;
1708 }
1709
1710 HOST_WIDE_INT estimated_niter;
1711
1712 /* If we are vectorizing an epilogue then we know the maximum number of
1713 scalar iterations it will cover is at least one lower than the
1714 vectorization factor of the main loop. */
1715 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
1716 estimated_niter
1717 = vect_vf_for_cost (LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo)) - 1;
1718 else
1719 {
1720 estimated_niter = estimated_stmt_executions_int (loop);
1721 if (estimated_niter == -1)
1722 estimated_niter = likely_max_stmt_executions_int (loop);
1723 }
1724 if (estimated_niter != -1
1725 && ((unsigned HOST_WIDE_INT) estimated_niter
1726 < MAX (th, (unsigned) min_profitable_estimate)))
1727 {
1728 if (dump_enabled_p ())
1729 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1730 "not vectorized: estimated iteration count too "
1731 "small.\n");
1732 if (dump_enabled_p ())
1733 dump_printf_loc (MSG_NOTE, vect_location,
1734 "not vectorized: estimated iteration count smaller "
1735 "than specified loop bound parameter or minimum "
1736 "profitable iterations (whichever is more "
1737 "conservative).\n");
1738 return -1;
1739 }
1740
1741 return 1;
1742 }
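/* A worked example of the threshold logic in vect_analyze_loop_costing,
   using purely illustrative numbers (they are not taken from any
   particular target cost model):

     assumed_vf                = 4
     param_min_vect_loop_bound = 0
     min_profitable_iters      = 10  (runtime threshold from the cost model)
     min_profitable_estimate   = 14  (static threshold, including the cost
                                      of the runtime scalar/vector check)

     min_scalar_loop_bound = 0 * 4 = 0
     th = MAX (0, 10) = 10

   A loop with a known iteration count of 8 is rejected outright (8 < th,
   return 0), while a loop whose estimated iteration count is 12 fails the
   static check (12 < MAX (10, 14), assuming the estimate was not reduced
   by the runtime-check simplification above) and returns -1 so that the
   analysis can be retried, e.g. with a different vector mode.  */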
1743
1744 static opt_result
1745 vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
1746 vec<data_reference_p> *datarefs,
1747 unsigned int *n_stmts)
1748 {
1749 *n_stmts = 0;
1750 for (unsigned i = 0; i < loop->num_nodes; i++)
1751 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
1752 !gsi_end_p (gsi); gsi_next (&gsi))
1753 {
1754 gimple *stmt = gsi_stmt (gsi);
1755 if (is_gimple_debug (stmt))
1756 continue;
1757 ++(*n_stmts);
1758 opt_result res = vect_find_stmt_data_reference (loop, stmt, datarefs);
1759 if (!res)
1760 {
1761 if (is_gimple_call (stmt) && loop->safelen)
1762 {
1763 tree fndecl = gimple_call_fndecl (stmt), op;
1764 if (fndecl != NULL_TREE)
1765 {
1766 cgraph_node *node = cgraph_node::get (fndecl);
1767 if (node != NULL && node->simd_clones != NULL)
1768 {
1769 unsigned int j, n = gimple_call_num_args (stmt);
1770 for (j = 0; j < n; j++)
1771 {
1772 op = gimple_call_arg (stmt, j);
1773 if (DECL_P (op)
1774 || (REFERENCE_CLASS_P (op)
1775 && get_base_address (op)))
1776 break;
1777 }
1778 op = gimple_call_lhs (stmt);
1779 /* Ignore #pragma omp declare simd functions
1780 if they don't have data references in the
1781 call stmt itself. */
1782 if (j == n
1783 && !(op
1784 && (DECL_P (op)
1785 || (REFERENCE_CLASS_P (op)
1786 && get_base_address (op)))))
1787 continue;
1788 }
1789 }
1790 }
1791 return res;
1792 }
1793 /* If dependence analysis will give up due to the limit on the
1794 number of datarefs, stop here and fail fatally. */
1795 if (datarefs->length ()
1796 > (unsigned)param_loop_max_datarefs_for_datadeps)
1797 return opt_result::failure_at (stmt, "exceeded param "
1798 "loop-max-datarefs-for-datadeps\n");
1799 }
1800 return opt_result::success ();
1801 }
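/* An illustrative source-level example of the simd-clone special case
   above; the function and arrays are hypothetical:

     #pragma omp declare simd
     extern float f (float);

     float a[1024], b[1024];

     void
     g (void)
     {
     #pragma omp simd
       for (int i = 0; i < 1024; i++)
         a[i] = f (b[i]);
     }

   The call to f carries no data reference in the call statement itself
   (its argument and lhs are plain SSA names after gimplification), so it
   cannot be analyzed as a data reference; because the loop has a nonzero
   safelen and f has SIMD clones, that failure is ignored instead of being
   propagated.  */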
1802
1803 /* Look for SLP-only access groups and turn each individual access into its own
1804 group. */
1805 static void
1806 vect_dissolve_slp_only_groups (loop_vec_info loop_vinfo)
1807 {
1808 unsigned int i;
1809 struct data_reference *dr;
1810
1811 DUMP_VECT_SCOPE ("vect_dissolve_slp_only_groups");
1812
1813 vec<data_reference_p> datarefs = loop_vinfo->shared->datarefs;
1814 FOR_EACH_VEC_ELT (datarefs, i, dr)
1815 {
1816 gcc_assert (DR_REF (dr));
1817 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (DR_STMT (dr));
1818
1819 /* Check if the load is a part of an interleaving chain. */
1820 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1821 {
1822 stmt_vec_info first_element = DR_GROUP_FIRST_ELEMENT (stmt_info);
1823 unsigned int group_size = DR_GROUP_SIZE (first_element);
1824
1825 /* Check if this is an SLP-only group. */
1826 if (!STMT_SLP_TYPE (stmt_info)
1827 && STMT_VINFO_SLP_VECT_ONLY (first_element))
1828 {
1829 /* Dissolve the group. */
1830 STMT_VINFO_SLP_VECT_ONLY (first_element) = false;
1831
1832 stmt_vec_info vinfo = first_element;
1833 while (vinfo)
1834 {
1835 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (vinfo);
1836 DR_GROUP_FIRST_ELEMENT (vinfo) = vinfo;
1837 DR_GROUP_NEXT_ELEMENT (vinfo) = NULL;
1838 DR_GROUP_SIZE (vinfo) = 1;
1839 if (STMT_VINFO_STRIDED_P (first_element))
1840 DR_GROUP_GAP (vinfo) = 0;
1841 else
1842 DR_GROUP_GAP (vinfo) = group_size - 1;
1843 vinfo = next;
1844 }
1845 }
1846 }
1847 }
1848 }
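/* A small illustrative example of the dissolution above, with hypothetical
   accesses.  Suppose the loads a[2*i] and a[2*i+1] form a group of size 2
   that is marked STMT_VINFO_SLP_VECT_ONLY, but SLP ends up not being used
   for these statements.  After dissolution each load is its own group:

     DR_GROUP_FIRST_ELEMENT = the load itself
     DR_GROUP_NEXT_ELEMENT  = NULL
     DR_GROUP_SIZE          = 1
     DR_GROUP_GAP           = 1   (group_size - 1 in the non-strided case)

   so each access is subsequently treated as a single access with a gap
   rather than as part of an interleaving group.  */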
1849
1850
1851 /* Decides whether we need to create an epilogue loop to handle
1852 remaining scalar iterations and sets PEELING_FOR_NITERS accordingly. */
1853
1854 void
1855 determine_peel_for_niter (loop_vec_info loop_vinfo)
1856 {
1857 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
1858
1859 unsigned HOST_WIDE_INT const_vf;
1860 HOST_WIDE_INT max_niter
1861 = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1862
1863 unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
1864 if (!th && LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo))
1865 th = LOOP_VINFO_COST_MODEL_THRESHOLD (LOOP_VINFO_ORIG_LOOP_INFO
1866 (loop_vinfo));
1867
1868 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
1869 /* The main loop handles all iterations. */
1870 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
1871 else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1872 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
1873 {
1874 /* Work out the (constant) number of iterations that need to be
1875 peeled for reasons other than niters. */
1876 unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1877 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
1878 peel_niter += 1;
1879 if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
1880 LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1881 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
1882 }
1883 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1884 /* ??? When peeling for gaps but not alignment, we could
1885 try to check whether the (variable) niters is known to be
1886 VF * N + 1. That's something of a niche case though. */
1887 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1888 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
1889 || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
1890 < (unsigned) exact_log2 (const_vf))
1891 /* In case of versioning, check if the maximum number of
1892 iterations is greater than th. If they are identical,
1893 the epilogue is unnecessary. */
1894 && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
1895 || ((unsigned HOST_WIDE_INT) max_niter
1896 > (th / const_vf) * const_vf))))
1897 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
1898 }
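/* A worked example for the constant-niters case above, with illustrative
   numbers:

     LOOP_VINFO_INT_NITERS            = 100
     LOOP_VINFO_PEELING_FOR_ALIGNMENT = 3
     LOOP_VINFO_PEELING_FOR_GAPS      = true  (adds one more iteration)
     LOOP_VINFO_VECT_FACTOR           = 8

     peel_niter = 3 + 1 = 4
     100 - 4 = 96, which is a multiple of 8

   so no epilogue is needed and PEELING_FOR_NITER stays false.  With
   NITERS = 99 instead, 99 - 4 = 95 is not a multiple of 8 and
   PEELING_FOR_NITER is set to true.  */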
1899
1900
1901 /* Function vect_analyze_loop_2.
1902
1903 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1904 for it. The different analyses will record information in the
1905 loop_vec_info struct. */
1906 static opt_result
1907 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal, unsigned *n_stmts)
1908 {
1909 opt_result ok = opt_result::success ();
1910 int res;
1911 unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
1912 poly_uint64 min_vf = 2;
1913 loop_vec_info orig_loop_vinfo = NULL;
1914
1915 /* If we are dealing with an epilogue then orig_loop_vinfo points to the
1916 loop_vec_info of the first vectorized loop. */
1917 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
1918 orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
1919 else
1920 orig_loop_vinfo = loop_vinfo;
1921 gcc_assert (orig_loop_vinfo);
1922
1923 /* The first group of checks is independent of the vector size. */
1924 fatal = true;
1925
1926 if (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)
1927 && integer_zerop (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)))
1928 return opt_result::failure_at (vect_location,
1929 "not vectorized: simd if(0)\n");
1930
1931 /* Find all data references in the loop (which correspond to vdefs/vuses)
1932 and analyze their evolution in the loop. */
1933
1934 loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
1935
1936 /* Gather the data references and count stmts in the loop. */
1937 if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ())
1938 {
1939 opt_result res
1940 = vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo),
1941 &LOOP_VINFO_DATAREFS (loop_vinfo),
1942 n_stmts);
1943 if (!res)
1944 {
1945 if (dump_enabled_p ())
1946 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1947 "not vectorized: loop contains function "
1948 "calls or data references that cannot "
1949 "be analyzed\n");
1950 return res;
1951 }
1952 loop_vinfo->shared->save_datarefs ();
1953 }
1954 else
1955 loop_vinfo->shared->check_datarefs ();
1956
1957 /* Analyze the data references and also adjust the minimal
1958 vectorization factor according to the loads and stores. */
1959
1960 ok = vect_analyze_data_refs (loop_vinfo, &min_vf, &fatal);
1961 if (!ok)
1962 {
1963 if (dump_enabled_p ())
1964 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1965 "bad data references.\n");
1966 return ok;
1967 }
1968
1969 /* Classify all cross-iteration scalar data-flow cycles.
1970 Cross-iteration cycles caused by virtual phis are analyzed separately. */
1971 vect_analyze_scalar_cycles (loop_vinfo);
1972
1973 vect_pattern_recog (loop_vinfo);
1974
1975 vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
1976
1977 /* Analyze the access patterns of the data-refs in the loop (consecutive,
1978 complex, etc.). FORNOW: Only handle consecutive access pattern. */
1979
1980 ok = vect_analyze_data_ref_accesses (loop_vinfo);
1981 if (!ok)
1982 {
1983 if (dump_enabled_p ())
1984 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1985 "bad data access.\n");
1986 return ok;
1987 }
1988
1989 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
1990
1991 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo, &fatal);
1992 if (!ok)
1993 {
1994 if (dump_enabled_p ())
1995 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1996 "unexpected pattern.\n");
1997 return ok;
1998 }
1999
2000 /* The rest of the analysis below depends on the vector size in some way, so failures from here on are not fatal. */
2001 fatal = false;
2002
2003 /* Analyze data dependences between the data-refs in the loop
2004 and adjust the maximum vectorization factor according to
2005 the dependences.
2006 FORNOW: fail at the first data dependence that we encounter. */
2007
2008 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
2009 if (!ok)
2010 {
2011 if (dump_enabled_p ())
2012 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2013 "bad data dependence.\n");
2014 return ok;
2015 }
2016 if (max_vf != MAX_VECTORIZATION_FACTOR
2017 && maybe_lt (max_vf, min_vf))
2018 return opt_result::failure_at (vect_location, "bad data dependence.\n");
2019 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
2020
2021 ok = vect_determine_vectorization_factor (loop_vinfo);
2022 if (!ok)
2023 {
2024 if (dump_enabled_p ())
2025 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2026 "can't determine vectorization factor.\n");
2027 return ok;
2028 }
2029 if (max_vf != MAX_VECTORIZATION_FACTOR
2030 && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2031 return opt_result::failure_at (vect_location, "bad data dependence.\n");
2032
2033 /* Compute the scalar iteration cost. */
2034 vect_compute_single_scalar_iteration_cost (loop_vinfo);
2035
2036 poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2037
2038 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */
2039 ok = vect_analyze_slp (loop_vinfo, *n_stmts);
2040 if (!ok)
2041 return ok;
2042
2043 /* If there are any SLP instances mark them as pure_slp. */
2044 bool slp = vect_make_slp_decision (loop_vinfo);
2045 if (slp)
2046 {
2047 /* Find stmts that need to be both vectorized and SLPed. */
2048 vect_detect_hybrid_slp (loop_vinfo);
2049
2050 /* Update the vectorization factor based on the SLP decision. */
2051 vect_update_vf_for_slp (loop_vinfo);
2052
2053 /* Optimize the SLP graph with the vectorization factor fixed. */
2054 vect_optimize_slp (loop_vinfo);
2055 }
2056
2057 bool saved_can_fully_mask_p = LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo);
2058
2059 /* We don't expect to have to roll back to anything other than an empty
2060 set of rgroups. */
2061 gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
2062
2063 /* This is the point where we can re-start analysis with SLP forced off. */
2064 start_over:
2065
2066 /* Now the vectorization factor is final. */
2067 poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2068 gcc_assert (known_ne (vectorization_factor, 0U));
2069
2070 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
2071 {
2072 dump_printf_loc (MSG_NOTE, vect_location,
2073 "vectorization_factor = ");
2074 dump_dec (MSG_NOTE, vectorization_factor);
2075 dump_printf (MSG_NOTE, ", niters = %wd\n",
2076 LOOP_VINFO_INT_NITERS (loop_vinfo));
2077 }
2078
2079 /* Analyze the alignment of the data-refs in the loop.
2080 Fail if a data reference is found that cannot be vectorized. */
2081
2082 ok = vect_analyze_data_refs_alignment (loop_vinfo);
2083 if (!ok)
2084 {
2085 if (dump_enabled_p ())
2086 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2087 "bad data alignment.\n");
2088 return ok;
2089 }
2090
2091 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2092 It is important to call pruning after vect_analyze_data_ref_accesses,
2093 since we use grouping information gathered by interleaving analysis. */
2094 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2095 if (!ok)
2096 return ok;
2097
2098 /* Do not invoke vect_enhance_data_refs_alignment for epilogue
2099 vectorization, since we do not want to add extra peeling or
2100 add versioning for alignment. */
2101 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2102 /* This pass will decide on using loop versioning and/or loop peeling in
2103 order to enhance the alignment of data references in the loop. */
2104 ok = vect_enhance_data_refs_alignment (loop_vinfo);
2105 else
2106 ok = vect_verify_datarefs_alignment (loop_vinfo);
2107 if (!ok)
2108 return ok;
2109
2110 if (slp)
2111 {
2112 /* Analyze operations in the SLP instances. Note this may
2113 remove unsupported SLP instances which makes the above
2114 SLP kind detection invalid. */
2115 unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2116 vect_slp_analyze_operations (loop_vinfo);
2117 if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2118 {
2119 ok = opt_result::failure_at (vect_location,
2120 "unsupported SLP instances\n");
2121 goto again;
2122 }
2123 }
2124
2125 /* Dissolve SLP-only groups. */
2126 vect_dissolve_slp_only_groups (loop_vinfo);
2127
2128 /* Scan all the remaining operations in the loop that are not subject
2129 to SLP and make sure they are vectorizable. */
2130 ok = vect_analyze_loop_operations (loop_vinfo);
2131 if (!ok)
2132 {
2133 if (dump_enabled_p ())
2134 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2135 "bad operation or unsupported loop bound.\n");
2136 return ok;
2137 }
2138
2139 /* Decide whether to use a fully-masked loop for this vectorization
2140 factor. */
2141 LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
2142 = (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo)
2143 && vect_verify_full_masking (loop_vinfo));
2144 if (dump_enabled_p ())
2145 {
2146 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2147 dump_printf_loc (MSG_NOTE, vect_location,
2148 "using a fully-masked loop.\n");
2149 else
2150 dump_printf_loc (MSG_NOTE, vect_location,
2151 "not using a fully-masked loop.\n");
2152 }
2153
2154 /* If an epilogue loop is required because of data accesses with gaps,
2155 one additional iteration needs to be peeled. Check if there are
2156 enough iterations for vectorization. */
2157 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2158 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2159 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2160 {
2161 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2162 tree scalar_niters = LOOP_VINFO_NITERSM1 (loop_vinfo);
2163
2164 if (known_lt (wi::to_widest (scalar_niters), vf))
2165 return opt_result::failure_at (vect_location,
2166 "loop has no enough iterations to"
2167 " support peeling for gaps.\n");
2168 }
2169
2170 /* If we're vectorizing an epilogue loop, we either need a fully-masked
2171 loop or a loop that has a lower VF than the main loop. */
2172 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2173 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
2174 && maybe_ge (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
2175 LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo)))
2176 return opt_result::failure_at (vect_location,
2177 "Vectorization factor too high for"
2178 " epilogue loop.\n");
2179
2180 /* Check the costings of the loop make vectorizing worthwhile. */
2181 res = vect_analyze_loop_costing (loop_vinfo);
2182 if (res < 0)
2183 {
2184 ok = opt_result::failure_at (vect_location,
2185 "Loop costings may not be worthwhile.\n");
2186 goto again;
2187 }
2188 if (!res)
2189 return opt_result::failure_at (vect_location,
2190 "Loop costings not worthwhile.\n");
2191
2192 determine_peel_for_niter (loop_vinfo);
2193 /* If an epilogue loop is required make sure we can create one. */
2194 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2195 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2196 {
2197 if (dump_enabled_p ())
2198 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2199 if (!vect_can_advance_ivs_p (loop_vinfo)
2200 || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
2201 single_exit (LOOP_VINFO_LOOP
2202 (loop_vinfo))))
2203 {
2204 ok = opt_result::failure_at (vect_location,
2205 "not vectorized: can't create required "
2206 "epilog loop\n");
2207 goto again;
2208 }
2209 }
2210
2211 /* During peeling, we need to check if the number of loop iterations is
2212 enough for both the peeled prolog loop and the vector loop. This check
2213 can be merged along with the threshold check of loop versioning, so
2214 increase the threshold for this case if necessary.
2215
2216 If we are analyzing an epilogue we still want to check what its
2217 versioning threshold would be. If we decide to vectorize the epilogues we
2218 will want to use the lowest versioning threshold of all epilogues and main
2219 loop. This will enable us to enter a vectorized epilogue even when
2220 versioning the loop. We can't simply check whether the epilogue requires
2221 versioning though since we may have skipped some versioning checks when
2222 analyzing the epilogue. For instance, checks for alias versioning will be
2223 skipped when dealing with epilogues as we assume we already checked them
2224 for the main loop. So instead we always check the 'orig_loop_vinfo'. */
2225 if (LOOP_REQUIRES_VERSIONING (orig_loop_vinfo))
2226 {
2227 poly_uint64 niters_th = 0;
2228 unsigned int th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
2229
2230 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2231 {
2232 /* Niters for peeled prolog loop. */
2233 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2234 {
2235 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2236 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
2237 niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
2238 }
2239 else
2240 niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2241 }
2242
2243 /* Niters for at least one iteration of vectorized loop. */
2244 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2245 niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2246 /* One additional iteration because of peeling for gap. */
2247 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2248 niters_th += 1;
2249
2250 /* Use the same condition as vect_transform_loop to decide when to use
2251 the cost to determine a versioning threshold. */
2252 if (vect_apply_runtime_profitability_check_p (loop_vinfo)
2253 && ordered_p (th, niters_th))
2254 niters_th = ordered_max (poly_uint64 (th), niters_th);
2255
2256 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
2257 }
2258
2259 gcc_assert (known_eq (vectorization_factor,
2260 LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
2261
2262 /* Ok to vectorize! */
2263 return opt_result::success ();
2264
2265 again:
2266 /* Ensure that "ok" is false (with an opt_problem if dumping is enabled). */
2267 gcc_assert (!ok);
2268
2269 /* Try again with SLP forced off but if we didn't do any SLP there is
2270 no point in re-trying. */
2271 if (!slp)
2272 return ok;
2273
2274 /* If there are reduction chains re-trying will fail anyway. */
2275 if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
2276 return ok;
2277
2278 /* Likewise if the grouped loads or stores in the SLP cannot be handled
2279 via interleaving or lane instructions. */
2280 slp_instance instance;
2281 slp_tree node;
2282 unsigned i, j;
2283 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
2284 {
2285 stmt_vec_info vinfo;
2286 vinfo = SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0];
2287 if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
2288 continue;
2289 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2290 unsigned int size = DR_GROUP_SIZE (vinfo);
2291 tree vectype = STMT_VINFO_VECTYPE (vinfo);
2292 if (! vect_store_lanes_supported (vectype, size, false)
2293 && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
2294 && ! vect_grouped_store_supported (vectype, size))
2295 return opt_result::failure_at (vinfo->stmt,
2296 "unsupported grouped store\n");
2297 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
2298 {
2299 vinfo = SLP_TREE_SCALAR_STMTS (node)[0];
2300 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2301 bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
2302 size = DR_GROUP_SIZE (vinfo);
2303 vectype = STMT_VINFO_VECTYPE (vinfo);
2304 if (! vect_load_lanes_supported (vectype, size, false)
2305 && ! vect_grouped_load_supported (vectype, single_element_p,
2306 size))
2307 return opt_result::failure_at (vinfo->stmt,
2308 "unsupported grouped load\n");
2309 }
2310 }
2311
2312 if (dump_enabled_p ())
2313 dump_printf_loc (MSG_NOTE, vect_location,
2314 "re-trying with SLP disabled\n");
2315
2316 /* Roll back state appropriately. No SLP this time. */
2317 slp = false;
2318 /* Restore the vectorization factor as it was without SLP. */
2319 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
2320 /* Free the SLP instances. */
2321 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
2322 vect_free_slp_instance (instance, false);
2323 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
2324 /* Reset SLP type to loop_vect on all stmts. */
2325 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2326 {
2327 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2328 for (gimple_stmt_iterator si = gsi_start_phis (bb);
2329 !gsi_end_p (si); gsi_next (&si))
2330 {
2331 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2332 STMT_SLP_TYPE (stmt_info) = loop_vect;
2333 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
2334 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
2335 {
2336 /* vectorizable_reduction adjusts reduction stmt def-types,
2337 restore them to that of the PHI. */
2338 STMT_VINFO_DEF_TYPE (STMT_VINFO_REDUC_DEF (stmt_info))
2339 = STMT_VINFO_DEF_TYPE (stmt_info);
2340 STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize
2341 (STMT_VINFO_REDUC_DEF (stmt_info)))
2342 = STMT_VINFO_DEF_TYPE (stmt_info);
2343 }
2344 }
2345 for (gimple_stmt_iterator si = gsi_start_bb (bb);
2346 !gsi_end_p (si); gsi_next (&si))
2347 {
2348 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2349 STMT_SLP_TYPE (stmt_info) = loop_vect;
2350 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2351 {
2352 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
2353 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
2354 STMT_SLP_TYPE (stmt_info) = loop_vect;
2355 for (gimple_stmt_iterator pi = gsi_start (pattern_def_seq);
2356 !gsi_end_p (pi); gsi_next (&pi))
2357 STMT_SLP_TYPE (loop_vinfo->lookup_stmt (gsi_stmt (pi)))
2358 = loop_vect;
2359 }
2360 }
2361 }
2362 /* Free optimized alias test DDRS. */
2363 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
2364 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
2365 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
2366 /* Reset target cost data. */
2367 destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
2368 LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
2369 = init_cost (LOOP_VINFO_LOOP (loop_vinfo));
2370 /* Reset accumulated rgroup information. */
2371 release_vec_loop_masks (&LOOP_VINFO_MASKS (loop_vinfo));
2372 /* Reset assorted flags. */
2373 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2374 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
2375 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
2376 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
2377 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = saved_can_fully_mask_p;
2378
2379 goto start_over;
2380 }
2381
2382 /* Return true if vectorizing a loop using NEW_LOOP_VINFO appears
2383 to be better than vectorizing it using OLD_LOOP_VINFO. Assume that
2384 OLD_LOOP_VINFO is better unless something specifically indicates
2385 otherwise.
2386
2387 Note that this deliberately isn't a partial order. */
2388
2389 static bool
2390 vect_better_loop_vinfo_p (loop_vec_info new_loop_vinfo,
2391 loop_vec_info old_loop_vinfo)
2392 {
2393 struct loop *loop = LOOP_VINFO_LOOP (new_loop_vinfo);
2394 gcc_assert (LOOP_VINFO_LOOP (old_loop_vinfo) == loop);
2395
2396 poly_int64 new_vf = LOOP_VINFO_VECT_FACTOR (new_loop_vinfo);
2397 poly_int64 old_vf = LOOP_VINFO_VECT_FACTOR (old_loop_vinfo);
2398
2399 /* Always prefer a VF of loop->simdlen over any other VF. */
2400 if (loop->simdlen)
2401 {
2402 bool new_simdlen_p = known_eq (new_vf, loop->simdlen);
2403 bool old_simdlen_p = known_eq (old_vf, loop->simdlen);
2404 if (new_simdlen_p != old_simdlen_p)
2405 return new_simdlen_p;
2406 }
2407
2408 /* Limit the VFs to what is likely to be the maximum number of iterations,
2409 to handle cases in which at least one loop_vinfo is fully-masked. */
2410 HOST_WIDE_INT estimated_max_niter = likely_max_stmt_executions_int (loop);
2411 if (estimated_max_niter != -1)
2412 {
2413 if (known_le (estimated_max_niter, new_vf))
2414 new_vf = estimated_max_niter;
2415 if (known_le (estimated_max_niter, old_vf))
2416 old_vf = estimated_max_niter;
2417 }
2418
2419 /* Check whether the (fractional) cost per scalar iteration is lower
2420 or higher: new_inside_cost / new_vf vs. old_inside_cost / old_vf. */
2421 poly_widest_int rel_new = (new_loop_vinfo->vec_inside_cost
2422 * poly_widest_int (old_vf));
2423 poly_widest_int rel_old = (old_loop_vinfo->vec_inside_cost
2424 * poly_widest_int (new_vf));
2425 if (maybe_lt (rel_old, rel_new))
2426 {
2427 /* When old_loop_vinfo uses a variable vectorization factor,
2428 we know that it has a lower cost for at least one runtime VF.
2429 However, we don't know how likely that VF is.
2430
2431 One option would be to compare the costs for the estimated VFs.
2432 The problem is that that can put too much pressure on the cost
2433 model. E.g. if the estimated VF is also the lowest possible VF,
2434 and if old_loop_vinfo is 1 unit worse than new_loop_vinfo
2435 for the estimated VF, we'd then choose new_loop_vinfo even
2436 though (a) new_loop_vinfo might not actually be better than
2437 old_loop_vinfo for that VF and (b) it would be significantly
2438 worse at larger VFs.
2439
2440 Here we go for a hacky compromise: pick new_loop_vinfo if it is
2441 no more expensive than old_loop_vinfo even after doubling the
2442 estimated old_loop_vinfo VF. For all but trivial loops, this
2443 ensures that we only pick new_loop_vinfo if it is significantly
2444 better than old_loop_vinfo at the estimated VF. */
2445 if (rel_new.is_constant ())
2446 return false;
2447
2448 HOST_WIDE_INT new_estimated_vf = estimated_poly_value (new_vf);
2449 HOST_WIDE_INT old_estimated_vf = estimated_poly_value (old_vf);
2450 widest_int estimated_rel_new = (new_loop_vinfo->vec_inside_cost
2451 * widest_int (old_estimated_vf));
2452 widest_int estimated_rel_old = (old_loop_vinfo->vec_inside_cost
2453 * widest_int (new_estimated_vf));
2454 return estimated_rel_new * 2 <= estimated_rel_old;
2455 }
2456 if (known_lt (rel_new, rel_old))
2457 return true;
2458
2459 /* If there's nothing to choose between the loop bodies, see whether
2460 there's a difference in the prologue and epilogue costs. */
2461 if (new_loop_vinfo->vec_outside_cost != old_loop_vinfo->vec_outside_cost)
2462 return new_loop_vinfo->vec_outside_cost < old_loop_vinfo->vec_outside_cost;
2463
2464 return false;
2465 }
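/* A worked example of the cross-multiplied cost comparison above, with
   illustrative constant VFs and costs:

     new_loop_vinfo: vec_inside_cost = 20, new_vf = 8
     old_loop_vinfo: vec_inside_cost = 12, old_vf = 4

     rel_new = 20 * 4 = 80
     rel_old = 12 * 8 = 96

   rel_new < rel_old, i.e. 20/8 = 2.5 cost units per scalar iteration beats
   12/4 = 3.0, so new_loop_vinfo is preferred.  Cross-multiplying avoids
   fractional arithmetic and also works when the vectorization factors are
   non-constant poly_ints.  */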
2466
2467 /* Decide whether to replace OLD_LOOP_VINFO with NEW_LOOP_VINFO. Return
2468 true if we should. */
2469
2470 static bool
2471 vect_joust_loop_vinfos (loop_vec_info new_loop_vinfo,
2472 loop_vec_info old_loop_vinfo)
2473 {
2474 if (!vect_better_loop_vinfo_p (new_loop_vinfo, old_loop_vinfo))
2475 return false;
2476
2477 if (dump_enabled_p ())
2478 dump_printf_loc (MSG_NOTE, vect_location,
2479 "***** Preferring vector mode %s to vector mode %s\n",
2480 GET_MODE_NAME (new_loop_vinfo->vector_mode),
2481 GET_MODE_NAME (old_loop_vinfo->vector_mode));
2482 return true;
2483 }
2484
2485 /* Function vect_analyze_loop.
2486
2487 Apply a set of analyses on LOOP, and create a loop_vec_info struct
2488 for it. The different analyses will record information in the
2489 loop_vec_info struct. */
2490 opt_loop_vec_info
2491 vect_analyze_loop (class loop *loop, vec_info_shared *shared)
2492 {
2493 auto_vector_modes vector_modes;
2494
2495 /* Autodetect first vector size we try. */
2496 unsigned int autovec_flags
2497 = targetm.vectorize.autovectorize_vector_modes (&vector_modes,
2498 loop->simdlen != 0);
2499 unsigned int mode_i = 0;
2500
2501 DUMP_VECT_SCOPE ("analyze_loop_nest");
2502
2503 if (loop_outer (loop)
2504 && loop_vec_info_for_loop (loop_outer (loop))
2505 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
2506 return opt_loop_vec_info::failure_at (vect_location,
2507 "outer-loop already vectorized.\n");
2508
2509 if (!find_loop_nest (loop, &shared->loop_nest))
2510 return opt_loop_vec_info::failure_at
2511 (vect_location,
2512 "not vectorized: loop nest containing two or more consecutive inner"
2513 " loops cannot be vectorized\n");
2514
2515 unsigned n_stmts = 0;
2516 machine_mode autodetected_vector_mode = VOIDmode;
2517 opt_loop_vec_info first_loop_vinfo = opt_loop_vec_info::success (NULL);
2518 machine_mode next_vector_mode = VOIDmode;
2519 poly_uint64 lowest_th = 0;
2520 unsigned vectorized_loops = 0;
2521 bool pick_lowest_cost_p = ((autovec_flags & VECT_COMPARE_COSTS)
2522 && !unlimited_cost_model (loop));
2523
2524 bool vect_epilogues = false;
2525 opt_result res = opt_result::success ();
2526 unsigned HOST_WIDE_INT simdlen = loop->simdlen;
2527 while (1)
2528 {
2529 /* Check the CFG characteristics of the loop (nesting, entry/exit). */
2530 opt_loop_vec_info loop_vinfo = vect_analyze_loop_form (loop, shared);
2531 if (!loop_vinfo)
2532 {
2533 if (dump_enabled_p ())
2534 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2535 "bad loop form.\n");
2536 gcc_checking_assert (first_loop_vinfo == NULL);
2537 return loop_vinfo;
2538 }
2539 loop_vinfo->vector_mode = next_vector_mode;
2540
2541 bool fatal = false;
2542
2543 /* When pick_lowest_cost_p is true, we should in principle iterate
2544 over all the loop_vec_infos that LOOP_VINFO could replace and
2545 try to vectorize LOOP_VINFO under the same conditions.
2546 E.g. when trying to replace an epilogue loop, we should vectorize
2547 LOOP_VINFO as an epilogue loop with the same VF limit. When trying
2548 to replace the main loop, we should vectorize LOOP_VINFO as a main
2549 loop too.
2550
2551 However, autovectorize_vector_modes is usually sorted as follows:
2552
2553 - Modes that naturally produce lower VFs usually follow modes that
2554 naturally produce higher VFs.
2555
2556 - When modes naturally produce the same VF, maskable modes
2557 usually follow unmaskable ones, so that the maskable mode
2558 can be used to vectorize the epilogue of the unmaskable mode.
2559
2560 This order is preferred because it leads to the maximum
2561 epilogue vectorization opportunities. Targets should only use
2562 a different order if they want to make wide modes available while
2563 disparaging them relative to earlier, smaller modes. The assumption
2564 in that case is that the wider modes are more expensive in some
2565 way that isn't reflected directly in the costs.
2566
2567 There should therefore be few interesting cases in which
2568 LOOP_VINFO fails when treated as an epilogue loop, succeeds when
2569 treated as a standalone loop, and ends up being genuinely cheaper
2570 than FIRST_LOOP_VINFO. */
2571 if (vect_epilogues)
2572 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = first_loop_vinfo;
2573
2574 res = vect_analyze_loop_2 (loop_vinfo, fatal, &n_stmts);
2575 if (mode_i == 0)
2576 autodetected_vector_mode = loop_vinfo->vector_mode;
2577 if (dump_enabled_p ())
2578 {
2579 if (res)
2580 dump_printf_loc (MSG_NOTE, vect_location,
2581 "***** Analysis succeeded with vector mode %s\n",
2582 GET_MODE_NAME (loop_vinfo->vector_mode));
2583 else
2584 dump_printf_loc (MSG_NOTE, vect_location,
2585 "***** Analysis failed with vector mode %s\n",
2586 GET_MODE_NAME (loop_vinfo->vector_mode));
2587 }
2588
2589 loop->aux = NULL;
2590
2591 if (!fatal)
2592 while (mode_i < vector_modes.length ()
2593 && vect_chooses_same_modes_p (loop_vinfo, vector_modes[mode_i]))
2594 {
2595 if (dump_enabled_p ())
2596 dump_printf_loc (MSG_NOTE, vect_location,
2597 "***** The result for vector mode %s would"
2598 " be the same\n",
2599 GET_MODE_NAME (vector_modes[mode_i]));
2600 mode_i += 1;
2601 }
2602
2603 if (res)
2604 {
2605 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
2606 vectorized_loops++;
2607
2608 /* Once we hit the desired simdlen for the first time,
2609 discard any previous attempts. */
2610 if (simdlen
2611 && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), simdlen))
2612 {
2613 delete first_loop_vinfo;
2614 first_loop_vinfo = opt_loop_vec_info::success (NULL);
2615 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = NULL;
2616 simdlen = 0;
2617 }
2618 else if (pick_lowest_cost_p && first_loop_vinfo)
2619 {
2620 /* Keep trying to roll back vectorization attempts while the
2621 loop_vec_infos they produced were worse than this one. */
2622 vec<loop_vec_info> &vinfos = first_loop_vinfo->epilogue_vinfos;
2623 while (!vinfos.is_empty ()
2624 && vect_joust_loop_vinfos (loop_vinfo, vinfos.last ()))
2625 {
2626 gcc_assert (vect_epilogues);
2627 delete vinfos.pop ();
2628 }
2629 if (vinfos.is_empty ()
2630 && vect_joust_loop_vinfos (loop_vinfo, first_loop_vinfo))
2631 {
2632 delete first_loop_vinfo;
2633 first_loop_vinfo = opt_loop_vec_info::success (NULL);
2634 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = NULL;
2635 }
2636 }
2637
2638 if (first_loop_vinfo == NULL)
2639 {
2640 first_loop_vinfo = loop_vinfo;
2641 lowest_th = LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo);
2642 }
2643 else if (vect_epilogues
2644 /* For now only allow one epilogue loop. */
2645 && first_loop_vinfo->epilogue_vinfos.is_empty ())
2646 {
2647 first_loop_vinfo->epilogue_vinfos.safe_push (loop_vinfo);
2648 poly_uint64 th = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
2649 gcc_assert (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
2650 || maybe_ne (lowest_th, 0U));
2651 /* Keep track of the known smallest versioning
2652 threshold. */
2653 if (ordered_p (lowest_th, th))
2654 lowest_th = ordered_min (lowest_th, th);
2655 }
2656 else
2657 delete loop_vinfo;
2658
2659 /* Only vectorize epilogues if PARAM_VECT_EPILOGUES_NOMASK is
2660 enabled, SIMDUID is not set, it is the innermost loop and we have
2661 either already found the loop's SIMDLEN or there was no SIMDLEN to
2662 begin with.
2663 TODO: Enable epilogue vectorization for loops with SIMDUID set. */
2664 vect_epilogues = (!simdlen
2665 && loop->inner == NULL
2666 && param_vect_epilogues_nomask
2667 && LOOP_VINFO_PEELING_FOR_NITER (first_loop_vinfo)
2668 && !loop->simduid
2669 /* For now only allow one epilogue loop, but allow
2670 pick_lowest_cost_p to replace it. */
2671 && (first_loop_vinfo->epilogue_vinfos.is_empty ()
2672 || pick_lowest_cost_p));
2673
2674 /* Commit to first_loop_vinfo if we have no reason to try
2675 alternatives. */
2676 if (!simdlen && !vect_epilogues && !pick_lowest_cost_p)
2677 break;
2678 }
2679 else
2680 {
2681 delete loop_vinfo;
2682 if (fatal)
2683 {
2684 gcc_checking_assert (first_loop_vinfo == NULL);
2685 break;
2686 }
2687 }
2688
2689 if (mode_i < vector_modes.length ()
2690 && VECTOR_MODE_P (autodetected_vector_mode)
2691 && (related_vector_mode (vector_modes[mode_i],
2692 GET_MODE_INNER (autodetected_vector_mode))
2693 == autodetected_vector_mode)
2694 && (related_vector_mode (autodetected_vector_mode,
2695 GET_MODE_INNER (vector_modes[mode_i]))
2696 == vector_modes[mode_i]))
2697 {
2698 if (dump_enabled_p ())
2699 dump_printf_loc (MSG_NOTE, vect_location,
2700 "***** Skipping vector mode %s, which would"
2701 " repeat the analysis for %s\n",
2702 GET_MODE_NAME (vector_modes[mode_i]),
2703 GET_MODE_NAME (autodetected_vector_mode));
2704 mode_i += 1;
2705 }
2706
2707 if (mode_i == vector_modes.length ()
2708 || autodetected_vector_mode == VOIDmode)
2709 break;
2710
2711 /* Try the next biggest vector size. */
2712 next_vector_mode = vector_modes[mode_i++];
2713 if (dump_enabled_p ())
2714 dump_printf_loc (MSG_NOTE, vect_location,
2715 "***** Re-trying analysis with vector mode %s\n",
2716 GET_MODE_NAME (next_vector_mode));
2717 }
2718
2719 if (first_loop_vinfo)
2720 {
2721 loop->aux = (loop_vec_info) first_loop_vinfo;
2722 if (dump_enabled_p ())
2723 dump_printf_loc (MSG_NOTE, vect_location,
2724 "***** Choosing vector mode %s\n",
2725 GET_MODE_NAME (first_loop_vinfo->vector_mode));
2726 LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo) = lowest_th;
2727 return first_loop_vinfo;
2728 }
2729
2730 return opt_loop_vec_info::propagate_failure (res);
2731 }
2732
2733 /* Return true if there is an in-order reduction function for CODE, storing
2734 it in *REDUC_FN if so. */
2735
2736 static bool
2737 fold_left_reduction_fn (tree_code code, internal_fn *reduc_fn)
2738 {
2739 switch (code)
2740 {
2741 case PLUS_EXPR:
2742 *reduc_fn = IFN_FOLD_LEFT_PLUS;
2743 return true;
2744
2745 default:
2746 return false;
2747 }
2748 }
2749
2750 /* Function reduction_fn_for_scalar_code
2751
2752 Input:
2753 CODE - tree_code of a reduction operation.
2754
2755 Output:
2756 REDUC_FN - the corresponding internal function to be used to reduce the
2757 vector of partial results into a single scalar result, or IFN_LAST
2758 if the operation is a supported reduction operation, but does not have
2759 such an internal function.
2760
2761 Return FALSE if CODE currently cannot be vectorized as reduction. */
2762
2763 static bool
2764 reduction_fn_for_scalar_code (enum tree_code code, internal_fn *reduc_fn)
2765 {
2766 switch (code)
2767 {
2768 case MAX_EXPR:
2769 *reduc_fn = IFN_REDUC_MAX;
2770 return true;
2771
2772 case MIN_EXPR:
2773 *reduc_fn = IFN_REDUC_MIN;
2774 return true;
2775
2776 case PLUS_EXPR:
2777 *reduc_fn = IFN_REDUC_PLUS;
2778 return true;
2779
2780 case BIT_AND_EXPR:
2781 *reduc_fn = IFN_REDUC_AND;
2782 return true;
2783
2784 case BIT_IOR_EXPR:
2785 *reduc_fn = IFN_REDUC_IOR;
2786 return true;
2787
2788 case BIT_XOR_EXPR:
2789 *reduc_fn = IFN_REDUC_XOR;
2790 return true;
2791
2792 case MULT_EXPR:
2793 case MINUS_EXPR:
2794 *reduc_fn = IFN_LAST;
2795 return true;
2796
2797 default:
2798 return false;
2799 }
2800 }
2801
2802 /* If there is a neutral value X such that the SLP reduction SLP_NODE would not
2803 be affected by the introduction of additional X elements, return that X,
2804 otherwise return null. CODE is the code of the reduction and VECTOR_TYPE
2805 is the vector type that would hold element X. REDUC_CHAIN is true if
2806 the SLP statements perform a single reduction, false if each statement
2807 performs an independent reduction. */
2808
2809 static tree
2810 neutral_op_for_slp_reduction (slp_tree slp_node, tree vector_type,
2811 tree_code code, bool reduc_chain)
2812 {
2813 vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
2814 stmt_vec_info stmt_vinfo = stmts[0];
2815 tree scalar_type = TREE_TYPE (vector_type);
2816 class loop *loop = gimple_bb (stmt_vinfo->stmt)->loop_father;
2817 gcc_assert (loop);
2818
2819 switch (code)
2820 {
2821 case WIDEN_SUM_EXPR:
2822 case DOT_PROD_EXPR:
2823 case SAD_EXPR:
2824 case PLUS_EXPR:
2825 case MINUS_EXPR:
2826 case BIT_IOR_EXPR:
2827 case BIT_XOR_EXPR:
2828 return build_zero_cst (scalar_type);
2829
2830 case MULT_EXPR:
2831 return build_one_cst (scalar_type);
2832
2833 case BIT_AND_EXPR:
2834 return build_all_ones_cst (scalar_type);
2835
2836 case MAX_EXPR:
2837 case MIN_EXPR:
2838 /* For MIN/MAX the initial values are neutral. A reduction chain
2839 has only a single initial value, so that value is neutral for
2840 all statements. */
2841 if (reduc_chain)
2842 return PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt,
2843 loop_preheader_edge (loop));
2844 return NULL_TREE;
2845
2846 default:
2847 return NULL_TREE;
2848 }
2849 }
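/* For illustration, a hypothetical pair of independent reductions that
   could be SLP-vectorized together:

     for (int i = 0; i < n; i++)
       {
         sum0 += a[2*i];
         sum1 += a[2*i + 1];
       }

   If the chosen vector type has more lanes than there are reductions, the
   extra lanes can be filled with the neutral value returned above without
   changing either result: 0 for PLUS_EXPR, 1 for MULT_EXPR, all-ones for
   BIT_AND_EXPR.  For MIN_EXPR/MAX_EXPR no such constant exists in general,
   so the initial value is used instead, and only for reduction chains,
   which have a single initial value.  */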
2850
2851 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
2852 STMT is printed with a message MSG. */
2853
2854 static void
2855 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
2856 {
2857 dump_printf_loc (msg_type, vect_location, "%s%G", msg, stmt);
2858 }
2859
2860 /* Return true if we need an in-order reduction for operation CODE
2861 on type TYPE, i.e. if the operation cannot be reassociated
2862 without changing the result. */
2863
2864 bool
2865 needs_fold_left_reduction_p (tree type, tree_code code)
2866 {
2867 /* CHECKME: check for !flag_finite_math_only too? */
2868 if (SCALAR_FLOAT_TYPE_P (type))
2869 switch (code)
2870 {
2871 case MIN_EXPR:
2872 case MAX_EXPR:
2873 return false;
2874
2875 default:
2876 return !flag_associative_math;
2877 }
2878
2879 if (INTEGRAL_TYPE_P (type))
2880 {
2881 if (!operation_no_trapping_overflow (type, code))
2882 return true;
2883 return false;
2884 }
2885
2886 if (SAT_FIXED_POINT_TYPE_P (type))
2887 return true;
2888
2889 return false;
2890 }
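/* For example (purely illustrative):

     double s = 0.0;
     for (int i = 0; i < n; i++)
       s += x[i];

   Without -fassociative-math the additions may not be reordered, so this
   function returns true for PLUS_EXPR on double and the reduction has to
   be vectorized in-order, e.g. via IFN_FOLD_LEFT_PLUS when the target
   provides it.  The same loop with a plain int accumulator, whose addition
   cannot trap on overflow, can use an ordinary tree reduction.  */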
2891
2892 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG
2893 has a handled computation expression. Store the main reduction
2894 operation in *CODE. */
2895
2896 static bool
2897 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
2898 tree loop_arg, enum tree_code *code,
2899 vec<std::pair<ssa_op_iter, use_operand_p> > &path)
2900 {
2901 auto_bitmap visited;
2902 tree lookfor = PHI_RESULT (phi);
2903 ssa_op_iter curri;
2904 use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
2905 while (USE_FROM_PTR (curr) != loop_arg)
2906 curr = op_iter_next_use (&curri);
2907 curri.i = curri.numops;
2908 do
2909 {
2910 path.safe_push (std::make_pair (curri, curr));
2911 tree use = USE_FROM_PTR (curr);
2912 if (use == lookfor)
2913 break;
2914 gimple *def = SSA_NAME_DEF_STMT (use);
2915 if (gimple_nop_p (def)
2916 || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
2917 {
2918 pop:
2919 do
2920 {
2921 std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
2922 curri = x.first;
2923 curr = x.second;
2924 do
2925 curr = op_iter_next_use (&curri);
2926 /* Skip already visited or non-SSA operands (from iterating
2927 over PHI args). */
2928 while (curr != NULL_USE_OPERAND_P
2929 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
2930 || ! bitmap_set_bit (visited,
2931 SSA_NAME_VERSION
2932 (USE_FROM_PTR (curr)))));
2933 }
2934 while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
2935 if (curr == NULL_USE_OPERAND_P)
2936 break;
2937 }
2938 else
2939 {
2940 if (gimple_code (def) == GIMPLE_PHI)
2941 curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
2942 else
2943 curr = op_iter_init_use (&curri, def, SSA_OP_USE);
2944 while (curr != NULL_USE_OPERAND_P
2945 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
2946 || ! bitmap_set_bit (visited,
2947 SSA_NAME_VERSION
2948 (USE_FROM_PTR (curr)))))
2949 curr = op_iter_next_use (&curri);
2950 if (curr == NULL_USE_OPERAND_P)
2951 goto pop;
2952 }
2953 }
2954 while (1);
2955 if (dump_file && (dump_flags & TDF_DETAILS))
2956 {
2957 dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
2958 unsigned i;
2959 std::pair<ssa_op_iter, use_operand_p> *x;
2960 FOR_EACH_VEC_ELT (path, i, x)
2961 dump_printf (MSG_NOTE, "%T ", USE_FROM_PTR (x->second));
2962 dump_printf (MSG_NOTE, "\n");
2963 }
2964
2965 /* Check whether the reduction path detected is valid. */
2966 bool fail = path.length () == 0;
2967 bool neg = false;
2968 int sign = -1;
2969 *code = ERROR_MARK;
2970 for (unsigned i = 1; i < path.length (); ++i)
2971 {
2972 gimple *use_stmt = USE_STMT (path[i].second);
2973 tree op = USE_FROM_PTR (path[i].second);
2974 if (! is_gimple_assign (use_stmt)
2975 /* The following makes sure we can compute the operand index
2976 easily, and it mostly disallows chaining via COND_EXPR condition
2977 operands. */
2978 || (gimple_assign_rhs1_ptr (use_stmt) != path[i].second->use
2979 && (gimple_num_ops (use_stmt) <= 2
2980 || gimple_assign_rhs2_ptr (use_stmt) != path[i].second->use)
2981 && (gimple_num_ops (use_stmt) <= 3
2982 || gimple_assign_rhs3_ptr (use_stmt) != path[i].second->use)))
2983 {
2984 fail = true;
2985 break;
2986 }
2987 /* Check that the op is used in only a single stmt inside
2988 the loop. */
2989 imm_use_iterator imm_iter;
2990 gimple *op_use_stmt;
2991 unsigned cnt = 0;
2992 FOR_EACH_IMM_USE_STMT (op_use_stmt, imm_iter, op)
2993 if (!is_gimple_debug (op_use_stmt)
2994 && flow_bb_inside_loop_p (loop, gimple_bb (op_use_stmt)))
2995 {
2996 /* We want to allow x + x but not x < 1 ? x : 2. */
2997 if (is_gimple_assign (op_use_stmt)
2998 && gimple_assign_rhs_code (op_use_stmt) == COND_EXPR)
2999 {
3000 use_operand_p use_p;
3001 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
3002 cnt++;
3003 }
3004 else
3005 cnt++;
3006 }
3007 if (cnt != 1)
3008 {
3009 fail = true;
3010 break;
3011 }
3012 tree_code use_code = gimple_assign_rhs_code (use_stmt);
3013 if (use_code == MINUS_EXPR)
3014 {
3015 use_code = PLUS_EXPR;
3016 /* Track whether we negate the reduction value each iteration. */
3017 if (gimple_assign_rhs2 (use_stmt) == op)
3018 neg = ! neg;
3019 }
3020 if (CONVERT_EXPR_CODE_P (use_code)
3021 && tree_nop_conversion_p (TREE_TYPE (gimple_assign_lhs (use_stmt)),
3022 TREE_TYPE (gimple_assign_rhs1 (use_stmt))))
3023 ;
3024 else if (*code == ERROR_MARK)
3025 {
3026 *code = use_code;
3027 sign = TYPE_SIGN (TREE_TYPE (gimple_assign_lhs (use_stmt)));
3028 }
3029 else if (use_code != *code)
3030 {
3031 fail = true;
3032 break;
3033 }
3034 else if ((use_code == MIN_EXPR
3035 || use_code == MAX_EXPR)
3036 && sign != TYPE_SIGN (TREE_TYPE (gimple_assign_lhs (use_stmt))))
3037 {
3038 fail = true;
3039 break;
3040 }
3041 }
3042 return ! fail && ! neg && *code != ERROR_MARK;
3043 }
3044
3045 bool
3046 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
3047 tree loop_arg, enum tree_code code)
3048 {
3049 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
3050 enum tree_code code_;
3051 return (check_reduction_path (loc, loop, phi, loop_arg, &code_, path)
3052 && code_ == code);
3053 }
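/* Two small illustrative cases for the use-count check above.  A statement
   like

     r_next = r + r;

   is accepted: r is used twice, but both uses are on a single
   non-COND_EXPR statement, which is counted once.  A statement like

     r_next = r < 1 ? r : 2;

   is rejected: for a COND_EXPR every use of r on the statement is counted
   separately, so cnt becomes 2 and the path is refused.  */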
3054
3055
3056
3057 /* Function vect_is_simple_reduction
3058
3059 (1) Detect a cross-iteration def-use cycle that represents a simple
3060 reduction computation. We look for the following pattern:
3061
3062 loop_header:
3063 a1 = phi < a0, a2 >
3064 a3 = ...
3065 a2 = operation (a3, a1)
3066
3067 or
3068
3069 a3 = ...
3070 loop_header:
3071 a1 = phi < a0, a2 >
3072 a2 = operation (a3, a1)
3073
3074 such that:
3075 1. operation is commutative and associative and it is safe to
3076 change the order of the computation
3077 2. no uses for a2 in the loop (a2 is used out of the loop)
3078 3. no uses of a1 in the loop besides the reduction operation
3079 4. no uses of a1 outside the loop.
3080
3081 Conditions 1,4 are tested here.
3082 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
3083
3084 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
3085 nested cycles.
3086
3087 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
3088 reductions:
3089
3090 a1 = phi < a0, a2 >
3091 inner loop (def of a3)
3092 a2 = phi < a3 >
3093
3094 (4) Detect condition expressions, i.e.:
3095 for (int i = 0; i < N; i++)
3096 if (a[i] < val)
3097 ret_val = a[i];
3098
3099 */
3100
3101 static stmt_vec_info
3102 vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
3103 bool *double_reduc, bool *reduc_chain_p)
3104 {
3105 gphi *phi = as_a <gphi *> (phi_info->stmt);
3106 gimple *phi_use_stmt = NULL;
3107 imm_use_iterator imm_iter;
3108 use_operand_p use_p;
3109
3110 *double_reduc = false;
3111 *reduc_chain_p = false;
3112 STMT_VINFO_REDUC_TYPE (phi_info) = TREE_CODE_REDUCTION;
3113
3114 tree phi_name = PHI_RESULT (phi);
3115 /* ??? If there are no uses of the PHI result the inner loop reduction
3116 won't be detected as possibly double-reduction by vectorizable_reduction
3117 because that tries to walk the PHI arg from the preheader edge which
3118 can be constant. See PR60382. */
3119 if (has_zero_uses (phi_name))
3120 return NULL;
3121 class loop *loop = (gimple_bb (phi))->loop_father;
3122 unsigned nphi_def_loop_uses = 0;
3123 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
3124 {
3125 gimple *use_stmt = USE_STMT (use_p);
3126 if (is_gimple_debug (use_stmt))
3127 continue;
3128
3129 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3130 {
3131 if (dump_enabled_p ())
3132 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3133 "intermediate value used outside loop.\n");
3134
3135 return NULL;
3136 }
3137
3138 nphi_def_loop_uses++;
3139 phi_use_stmt = use_stmt;
3140 }
3141
3142 tree latch_def = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
3143 if (TREE_CODE (latch_def) != SSA_NAME)
3144 {
3145 if (dump_enabled_p ())
3146 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3147 "reduction: not ssa_name: %T\n", latch_def);
3148 return NULL;
3149 }
3150
3151 stmt_vec_info def_stmt_info = loop_info->lookup_def (latch_def);
3152 if (!def_stmt_info
3153 || !flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt)))
3154 return NULL;
3155
3156 bool nested_in_vect_loop
3157 = flow_loop_nested_p (LOOP_VINFO_LOOP (loop_info), loop);
3158 unsigned nlatch_def_loop_uses = 0;
3159 auto_vec<gphi *, 3> lcphis;
3160 bool inner_loop_of_double_reduc = false;
3161 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, latch_def)
3162 {
3163 gimple *use_stmt = USE_STMT (use_p);
3164 if (is_gimple_debug (use_stmt))
3165 continue;
3166 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3167 nlatch_def_loop_uses++;
3168 else
3169 {
3170 /* We can have more than one loop-closed PHI. */
3171 lcphis.safe_push (as_a <gphi *> (use_stmt));
3172 if (nested_in_vect_loop
3173 && (STMT_VINFO_DEF_TYPE (loop_info->lookup_stmt (use_stmt))
3174 == vect_double_reduction_def))
3175 inner_loop_of_double_reduc = true;
3176 }
3177 }
3178
3179 /* If we are vectorizing an inner reduction, we execute it in the
3180 original order only when we are not dealing with a
3181 double reduction. */
3182 if (nested_in_vect_loop && !inner_loop_of_double_reduc)
3183 {
3184 if (dump_enabled_p ())
3185 report_vect_op (MSG_NOTE, def_stmt_info->stmt,
3186 "detected nested cycle: ");
3187 return def_stmt_info;
3188 }
3189
3190 /* If this isn't a nested cycle, or if the nested cycle reduction value
3191 is used outside of the inner loop, we cannot handle uses of the reduction
3192 value. */
3193 if (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1)
3194 {
3195 if (dump_enabled_p ())
3196 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3197 "reduction used in loop.\n");
3198 return NULL;
3199 }
3200
3201 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
3202 defined in the inner loop. */
3203 if (gphi *def_stmt = dyn_cast <gphi *> (def_stmt_info->stmt))
3204 {
3205 tree op1 = PHI_ARG_DEF (def_stmt, 0);
3206 if (gimple_phi_num_args (def_stmt) != 1
3207 || TREE_CODE (op1) != SSA_NAME)
3208 {
3209 if (dump_enabled_p ())
3210 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3211 "unsupported phi node definition.\n");
3212
3213 return NULL;
3214 }
3215
3216 gimple *def1 = SSA_NAME_DEF_STMT (op1);
3217 if (gimple_bb (def1)
3218 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3219 && loop->inner
3220 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
3221 && is_gimple_assign (def1)
3222 && is_a <gphi *> (phi_use_stmt)
3223 && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt)))
3224 {
3225 if (dump_enabled_p ())
3226 report_vect_op (MSG_NOTE, def_stmt,
3227 "detected double reduction: ");
3228
3229 *double_reduc = true;
3230 return def_stmt_info;
3231 }
3232
3233 return NULL;
3234 }
3235
3236 /* Look for the expression computing latch_def from the loop PHI result. */
3237 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
3238 enum tree_code code;
3239 if (check_reduction_path (vect_location, loop, phi, latch_def, &code,
3240 path))
3241 {
3242 STMT_VINFO_REDUC_CODE (phi_info) = code;
3243 if (code == COND_EXPR && !nested_in_vect_loop)
3244 STMT_VINFO_REDUC_TYPE (phi_info) = COND_REDUCTION;
3245
3246 /* Fill in STMT_VINFO_REDUC_IDX and gather stmts for an SLP
3247 reduction chain for which the additional restriction is that
3248 all operations in the chain are the same. */
3249 auto_vec<stmt_vec_info, 8> reduc_chain;
3250 unsigned i;
3251 bool is_slp_reduc = !nested_in_vect_loop && code != COND_EXPR;
3252 for (i = path.length () - 1; i >= 1; --i)
3253 {
3254 gimple *stmt = USE_STMT (path[i].second);
3255 stmt_vec_info stmt_info = loop_info->lookup_stmt (stmt);
3256 STMT_VINFO_REDUC_IDX (stmt_info)
3257 = path[i].second->use - gimple_assign_rhs1_ptr (stmt);
3258 enum tree_code stmt_code = gimple_assign_rhs_code (stmt);
3259 bool leading_conversion = (CONVERT_EXPR_CODE_P (stmt_code)
3260 && (i == 1 || i == path.length () - 1));
3261 if ((stmt_code != code && !leading_conversion)
3262 /* We can only handle the final value in epilogue
3263 generation for reduction chains. */
3264 || (i != 1 && !has_single_use (gimple_assign_lhs (stmt))))
3265 is_slp_reduc = false;
3266 /* For reduction chains we support trailing/leading
3267 conversions. We do not store those in the actual chain. */
3268 if (leading_conversion)
3269 continue;
3270 reduc_chain.safe_push (stmt_info);
3271 }
3272 if (is_slp_reduc && reduc_chain.length () > 1)
3273 {
3274 for (unsigned i = 0; i < reduc_chain.length () - 1; ++i)
3275 {
3276 REDUC_GROUP_FIRST_ELEMENT (reduc_chain[i]) = reduc_chain[0];
3277 REDUC_GROUP_NEXT_ELEMENT (reduc_chain[i]) = reduc_chain[i+1];
3278 }
3279 REDUC_GROUP_FIRST_ELEMENT (reduc_chain.last ()) = reduc_chain[0];
3280 REDUC_GROUP_NEXT_ELEMENT (reduc_chain.last ()) = NULL;
3281
3282 /* Save the chain for further analysis in SLP detection. */
3283 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (reduc_chain[0]);
3284 REDUC_GROUP_SIZE (reduc_chain[0]) = reduc_chain.length ();
3285
3286 *reduc_chain_p = true;
3287 if (dump_enabled_p ())
3288 dump_printf_loc (MSG_NOTE, vect_location,
3289 "reduction: detected reduction chain\n");
3290 }
3291 else if (dump_enabled_p ())
3292 dump_printf_loc (MSG_NOTE, vect_location,
3293 "reduction: detected reduction\n");
3294
3295 return def_stmt_info;
3296 }
3297
3298 if (dump_enabled_p ())
3299 dump_printf_loc (MSG_NOTE, vect_location,
3300 "reduction: unknown pattern\n");
3301
3302 return NULL;
3303 }
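/* An illustrative example of a reduction chain as detected above (the
   array and bound are hypothetical):

     int sum = 0;
     for (int i = 0; i < n; i++)
       sum = sum + a[4*i] + a[4*i+1] + a[4*i+2] + a[4*i+3];

   Each iteration applies the same PLUS_EXPR several times along the path
   from the loop PHI back to itself, so the statements are linked via
   REDUC_GROUP_FIRST_ELEMENT/REDUC_GROUP_NEXT_ELEMENT and the chain head is
   recorded in LOOP_VINFO_REDUCTION_CHAINS for SLP detection.  */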
3304
3305 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
3306 int
3307 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
3308 int *peel_iters_epilogue,
3309 stmt_vector_for_cost *scalar_cost_vec,
3310 stmt_vector_for_cost *prologue_cost_vec,
3311 stmt_vector_for_cost *epilogue_cost_vec)
3312 {
3313 int retval = 0;
3314 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3315
3316 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
3317 {
3318 *peel_iters_epilogue = assumed_vf / 2;
3319 if (dump_enabled_p ())
3320 dump_printf_loc (MSG_NOTE, vect_location,
3321 "cost model: epilogue peel iters set to vf/2 "
3322 "because loop iterations are unknown .\n");
3323
3324 /* If peeled iterations are known but the number of scalar loop
3325 iterations is unknown, count a taken branch per peeled loop. */
3326 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3327 NULL, NULL_TREE, 0, vect_prologue);
3328 retval += record_stmt_cost (epilogue_cost_vec, 1, cond_branch_taken,
3329 NULL, NULL_TREE, 0, vect_epilogue);
3330 }
3331 else
3332 {
3333 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
3334 peel_iters_prologue = niters < peel_iters_prologue ?
3335 niters : peel_iters_prologue;
3336 *peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
3337 /* If we need to peel for gaps, but no peeling is required, we have to
3338 peel VF iterations. */
3339 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
3340 *peel_iters_epilogue = assumed_vf;
3341 }
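  /* Worked example with made-up numbers: niters = 100, peel_iters_prologue = 3
     and assumed_vf = 8 give *peel_iters_epilogue = (100 - 3) % 8 = 1; if
     peeling for gaps were required and the remainder were 0, the epilogue
     would instead get a full assumed_vf = 8 iterations.  */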
3342
3343 stmt_info_for_cost *si;
3344 int j;
3345 if (peel_iters_prologue)
3346 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3347 retval += record_stmt_cost (prologue_cost_vec,
3348 si->count * peel_iters_prologue,
3349 si->kind, si->stmt_info, si->misalign,
3350 vect_prologue);
3351 if (*peel_iters_epilogue)
3352 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3353 retval += record_stmt_cost (epilogue_cost_vec,
3354 si->count * *peel_iters_epilogue,
3355 si->kind, si->stmt_info, si->misalign,
3356 vect_epilogue);
3357
3358 return retval;
3359 }
3360
3361 /* Function vect_estimate_min_profitable_iters
3362
3363 Return the number of iterations required for the vector version of the
3364 loop to be profitable relative to the cost of the scalar version of the
3365 loop.
3366
3367 *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
3368 of iterations for vectorization. -1 value means loop vectorization
3369 is not profitable. This returned value may be used for dynamic
3370 profitability check.
3371
3372 *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
3373 for static check against estimated number of iterations. */
3374
3375 static void
3376 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
3377 int *ret_min_profitable_niters,
3378 int *ret_min_profitable_estimate)
3379 {
3380 int min_profitable_iters;
3381 int min_profitable_estimate;
3382 int peel_iters_prologue;
3383 int peel_iters_epilogue;
3384 unsigned vec_inside_cost = 0;
3385 int vec_outside_cost = 0;
3386 unsigned vec_prologue_cost = 0;
3387 unsigned vec_epilogue_cost = 0;
3388 int scalar_single_iter_cost = 0;
3389 int scalar_outside_cost = 0;
3390 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3391 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3392 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3393
3394 /* Cost model disabled. */
3395 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
3396 {
3397 if (dump_enabled_p ())
3398 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
3399 *ret_min_profitable_niters = 0;
3400 *ret_min_profitable_estimate = 0;
3401 return;
3402 }
3403
3404 /* Requires loop versioning tests to handle misalignment. */
3405 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
3406 {
3407 /* FIXME: Make cost depend on complexity of individual check. */
3408 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
3409 (void) add_stmt_cost (loop_vinfo, target_cost_data, len, vector_stmt,
3410 NULL, NULL_TREE, 0, vect_prologue);
3411 if (dump_enabled_p ())
3412 dump_printf (MSG_NOTE,
3413 "cost model: Adding cost of checks for loop "
3414 "versioning to treat misalignment.\n");
3415 }
3416
3417 /* Requires loop versioning with alias checks. */
3418 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3419 {
3420 /* FIXME: Make cost depend on complexity of individual check. */
3421 unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
3422 (void) add_stmt_cost (loop_vinfo, target_cost_data, len, vector_stmt,
3423 NULL, NULL_TREE, 0, vect_prologue);
3424 len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
3425 if (len)
3426 /* Count LEN - 1 ANDs and LEN comparisons. */
3427 (void) add_stmt_cost (loop_vinfo, target_cost_data, len * 2 - 1,
3428 scalar_stmt, NULL, NULL_TREE, 0, vect_prologue);
3429 len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
3430 if (len)
3431 {
3432 /* Count LEN - 1 ANDs and LEN comparisons. */
3433 unsigned int nstmts = len * 2 - 1;
3434 /* +1 for each bias that needs adding. */
3435 for (unsigned int i = 0; i < len; ++i)
3436 if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
3437 nstmts += 1;
3438 (void) add_stmt_cost (loop_vinfo, target_cost_data, nstmts,
3439 scalar_stmt, NULL, NULL_TREE, 0, vect_prologue);
3440 }
3441 if (dump_enabled_p ())
3442 dump_printf (MSG_NOTE,
3443 "cost model: Adding cost of checks for loop "
3444 "versioning aliasing.\n");
3445 }
3446
3447 /* Requires loop versioning with niter checks. */
3448 if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
3449 {
3450 /* FIXME: Make cost depend on complexity of individual check. */
3451 (void) add_stmt_cost (loop_vinfo, target_cost_data, 1, vector_stmt,
3452 NULL, NULL_TREE, 0, vect_prologue);
3453 if (dump_enabled_p ())
3454 dump_printf (MSG_NOTE,
3455 "cost model: Adding cost of checks for loop "
3456 "versioning niters.\n");
3457 }
3458
3459 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3460 (void) add_stmt_cost (loop_vinfo, target_cost_data, 1, cond_branch_taken,
3461 NULL, NULL_TREE, 0, vect_prologue);
3462
3463 /* Count statements in scalar loop. Using this as scalar cost for a single
3464 iteration for now.
3465
3466 TODO: Add outer loop support.
3467
3468 TODO: Consider assigning different costs to different scalar
3469 statements. */
3470
3471 scalar_single_iter_cost
3472 = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo);
3473
3474 /* Add additional cost for the peeled instructions in prologue and epilogue
3475 loop. (For fully-masked loops there will be no peeling.)
3476
3477 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
3478 at compile time, we assume it's vf/2 (the worst case would be vf-1).
3479
3480 TODO: Build an expression that represents peel_iters for prologue and
3481 epilogue to be used in a run-time test. */
3482
3483 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3484 {
3485 peel_iters_prologue = 0;
3486 peel_iters_epilogue = 0;
3487
3488 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
3489 {
3490 /* We need to peel exactly one iteration. */
3491 peel_iters_epilogue += 1;
3492 stmt_info_for_cost *si;
3493 int j;
3494 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
3495 j, si)
3496 (void) add_stmt_cost (loop_vinfo, target_cost_data, si->count,
3497 si->kind, si->stmt_info, si->vectype,
3498 si->misalign, vect_epilogue);
3499 }
3500
3501 /* Calculate how many masks we need to generate. */
3502 unsigned int num_masks = 0;
3503 rgroup_masks *rgm;
3504 unsigned int num_vectors_m1;
3505 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), num_vectors_m1, rgm)
3506 if (rgm->mask_type)
3507 num_masks += num_vectors_m1 + 1;
3508 gcc_assert (num_masks > 0);
3509
3510 /* In the worst case, we need to generate each mask in the prologue
3511 and in the loop body. One of the loop body mask instructions
3512 replaces the comparison in the scalar loop, and since we don't
3513 count the scalar comparison against the scalar body, we shouldn't
3514 count that vector instruction against the vector body either.
3515
3516 Sometimes we can use unpacks instead of generating prologue
3517 masks and sometimes the prologue mask will fold to a constant,
3518 so the actual prologue cost might be smaller. However, it's
3519 simpler and safer to use the worst-case cost; if this ends up
3520 being the tie-breaker between vectorizing or not, then it's
3521 probably better not to vectorize. */
3522 (void) add_stmt_cost (loop_vinfo,
3523 target_cost_data, num_masks, vector_stmt,
3524 NULL, NULL_TREE, 0, vect_prologue);
3525 (void) add_stmt_cost (loop_vinfo,
3526 target_cost_data, num_masks - 1, vector_stmt,
3527 NULL, NULL_TREE, 0, vect_body);
3528 }
3529 else if (npeel < 0)
3530 {
3531 peel_iters_prologue = assumed_vf / 2;
3532 if (dump_enabled_p ())
3533 dump_printf (MSG_NOTE, "cost model: "
3534 "prologue peel iters set to vf/2.\n");
3535
3536 /* If peeling for alignment is unknown, the loop bound of the main loop
3537 becomes unknown. */
3538 peel_iters_epilogue = assumed_vf / 2;
3539 if (dump_enabled_p ())
3540 dump_printf (MSG_NOTE, "cost model: "
3541 "epilogue peel iters set to vf/2 because "
3542 "peeling for alignment is unknown.\n");
3543
3544 /* If peeled iterations are unknown, count a taken branch and a not taken
3545 branch per peeled loop. Even if scalar loop iterations are known,
3546 vector iterations are not known since peeled prologue iterations are
3547 not known. Hence guards remain the same. */
3548 (void) add_stmt_cost (loop_vinfo, target_cost_data, 1, cond_branch_taken,
3549 NULL, NULL_TREE, 0, vect_prologue);
3550 (void) add_stmt_cost (loop_vinfo,
3551 target_cost_data, 1, cond_branch_not_taken,
3552 NULL, NULL_TREE, 0, vect_prologue);
3553 (void) add_stmt_cost (loop_vinfo, target_cost_data, 1, cond_branch_taken,
3554 NULL, NULL_TREE, 0, vect_epilogue);
3555 (void) add_stmt_cost (loop_vinfo,
3556 target_cost_data, 1, cond_branch_not_taken,
3557 NULL, NULL_TREE, 0, vect_epilogue);
3558 stmt_info_for_cost *si;
3559 int j;
3560 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
3561 {
3562 (void) add_stmt_cost (loop_vinfo, target_cost_data,
3563 si->count * peel_iters_prologue,
3564 si->kind, si->stmt_info, si->vectype,
3565 si->misalign,
3566 vect_prologue);
3567 (void) add_stmt_cost (loop_vinfo, target_cost_data,
3568 si->count * peel_iters_epilogue,
3569 si->kind, si->stmt_info, si->vectype,
3570 si->misalign,
3571 vect_epilogue);
3572 }
3573 }
3574 else
3575 {
3576 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
3577 stmt_info_for_cost *si;
3578 int j;
3579 void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3580
3581 prologue_cost_vec.create (2);
3582 epilogue_cost_vec.create (2);
3583 peel_iters_prologue = npeel;
3584
3585 (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
3586 &peel_iters_epilogue,
3587 &LOOP_VINFO_SCALAR_ITERATION_COST
3588 (loop_vinfo),
3589 &prologue_cost_vec,
3590 &epilogue_cost_vec);
3591
3592 FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
3593 (void) add_stmt_cost (loop_vinfo,
3594 data, si->count, si->kind, si->stmt_info,
3595 si->vectype, si->misalign, vect_prologue);
3596
3597 FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
3598 (void) add_stmt_cost (loop_vinfo,
3599 data, si->count, si->kind, si->stmt_info,
3600 si->vectype, si->misalign, vect_epilogue);
3601
3602 prologue_cost_vec.release ();
3603 epilogue_cost_vec.release ();
3604 }
3605
3606 /* FORNOW: The scalar outside cost is incremented in one of the
3607 following ways:
3608
3609 1. The vectorizer checks for alignment and aliasing and generates
3610 a condition that allows dynamic vectorization. A cost model
3611 check is ANDed with the versioning condition. Hence the scalar
3612 code path now has the added cost of the versioning check.
3613
3614 if (cost > th & versioning_check)
3615 jmp to vector code
3616
3617 Hence the run-time scalar cost is incremented by a not-taken branch cost.
3618
3619 2. The vectorizer then checks if a prologue is required. If the
3620 cost model check was not done before during versioning, it has to
3621 be done before the prologue check.
3622
3623 if (cost <= th)
3624 prologue = scalar_iters
3625 if (prologue == 0)
3626 jmp to vector code
3627 else
3628 execute prologue
3629 if (prologue == num_iters)
3630 go to exit
3631
3632 Hence the run-time scalar cost is incremented by a taken branch,
3633 plus a not-taken branch, plus a taken branch cost.
3634
3635 3. The vectorizer then checks if an epilogue is required. If the
3636 cost model check was not done before during prologue check, it
3637 has to be done with the epilogue check.
3638
3639 if (prologue == 0)
3640 jmp to vector code
3641 else
3642 execute prologue
3643 if (prologue == num_iters)
3644 go to exit
3645 vector code:
3646 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
3647 jmp to epilogue
3648
3649 Hence the run-time scalar cost should be incremented by 2 taken
3650 branches.
3651
3652 TODO: The back end may reorder the BBs differently and reverse
3653 conditions/branch directions. Change the estimates below to
3654 something more reasonable. */
3655
3656 /* If the number of iterations is known and we do not do versioning, we can
3657 decide whether to vectorize at compile time. Hence the scalar version
3658 does not carry cost model guard costs. */
3659 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3660 || LOOP_REQUIRES_VERSIONING (loop_vinfo))
3661 {
3662 /* Cost model check occurs at versioning. */
3663 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3664 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
3665 else
3666 {
3667 /* Cost model check occurs at prologue generation. */
3668 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3669 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
3670 + vect_get_stmt_cost (cond_branch_not_taken);
3671 /* Cost model check occurs at epilogue generation. */
3672 else
3673 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
3674 }
3675 }
3676
3677 /* Complete the target-specific cost calculations. */
3678 finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
3679 &vec_inside_cost, &vec_epilogue_cost);
3680
3681 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
3682
3683 /* Stash the costs so that we can compare two loop_vec_infos. */
3684 loop_vinfo->vec_inside_cost = vec_inside_cost;
3685 loop_vinfo->vec_outside_cost = vec_outside_cost;
3686
3687 if (dump_enabled_p ())
3688 {
3689 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3690 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
3691 vec_inside_cost);
3692 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
3693 vec_prologue_cost);
3694 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
3695 vec_epilogue_cost);
3696 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
3697 scalar_single_iter_cost);
3698 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
3699 scalar_outside_cost);
3700 dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
3701 vec_outside_cost);
3702 dump_printf (MSG_NOTE, " prologue iterations: %d\n",
3703 peel_iters_prologue);
3704 dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
3705 peel_iters_epilogue);
3706 }
3707
3708 /* Calculate number of iterations required to make the vector version
3709 profitable, relative to the loop bodies only. The following condition
3710 must hold true:
3711 SIC * niters + SOC > VIC * ((niters - NPEEL) / VF) + VOC
3712 where
3713 SIC = scalar iteration cost, VIC = vector iteration cost,
3714 VOC = vector outside cost, VF = vectorization factor,
3715 NPEEL = prologue iterations + epilogue iterations,
3716 SOC = scalar outside cost for run time cost model check. */
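  /* Worked example with purely illustrative numbers: SIC = 4, VIC = 8,
     VF = 4, NPEEL = 2, VOC = 20, SOC = 0.  The saving per vector iteration
     computed below is SIC * VF - VIC = 4 * 4 - 8 = 8, and the condition
     above first holds at niters = 8, where 4 * 8 = 32 exceeds
     8 * ((8 - 2) / 4) + 20 = 28.  */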
3717
3718 int saving_per_viter = (scalar_single_iter_cost * assumed_vf
3719 - vec_inside_cost);
3720 if (saving_per_viter <= 0)
3721 {
3722 if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
3723 warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd,
3724 "vectorization did not happen for a simd loop");
3725
3726 if (dump_enabled_p ())
3727 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3728 "cost model: the vector iteration cost = %d "
3729 "divided by the scalar iteration cost = %d "
3730 "is greater or equal to the vectorization factor = %d"
3731 ".\n",
3732 vec_inside_cost, scalar_single_iter_cost, assumed_vf);
3733 *ret_min_profitable_niters = -1;
3734 *ret_min_profitable_estimate = -1;
3735 return;
3736 }
3737
3738 /* ??? The "if" arm is written to handle all cases; see below for what
3739 we would do for !LOOP_VINFO_FULLY_MASKED_P. */
3740 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3741 {
3742 /* Rewriting the condition above in terms of the number of
3743 vector iterations (vniters) rather than the number of
3744 scalar iterations (niters) gives:
3745
3746 SIC * (vniters * VF + NPEEL) + SOC > VIC * vniters + VOC
3747
3748 <==> vniters * (SIC * VF - VIC) > VOC - SIC * NPEEL - SOC
3749
3750 For integer N, X and Y when X > 0:
3751
3752 N * X > Y <==> N >= (Y /[floor] X) + 1. */
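  /* For instance (numbers illustrative only), with X = 8 and Y = 20,
     N * 8 > 20 first holds at N = 20 /[floor] 8 + 1 = 3.  */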
3753 int outside_overhead = (vec_outside_cost
3754 - scalar_single_iter_cost * peel_iters_prologue
3755 - scalar_single_iter_cost * peel_iters_epilogue
3756 - scalar_outside_cost);
3757 /* We're only interested in cases that require at least one
3758 vector iteration. */
3759 int min_vec_niters = 1;
3760 if (outside_overhead > 0)
3761 min_vec_niters = outside_overhead / saving_per_viter + 1;
3762
3763 if (dump_enabled_p ())
3764 dump_printf (MSG_NOTE, " Minimum number of vector iterations: %d\n",
3765 min_vec_niters);
3766
3767 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3768 {
3769 /* Now that we know the minimum number of vector iterations,
3770 find the minimum niters for which the scalar cost is larger:
3771
3772 SIC * niters > VIC * vniters + VOC - SOC
3773
3774 We know that the minimum niters is no more than
3775 vniters * VF + NPEEL, but it might be (and often is) less
3776 than that if a partial vector iteration is cheaper than the
3777 equivalent scalar code. */
3778 int threshold = (vec_inside_cost * min_vec_niters
3779 + vec_outside_cost
3780 - scalar_outside_cost);
3781 if (threshold <= 0)
3782 min_profitable_iters = 1;
3783 else
3784 min_profitable_iters = threshold / scalar_single_iter_cost + 1;
3785 }
3786 else
3787 /* Convert the number of vector iterations into a number of
3788 scalar iterations. */
3789 min_profitable_iters = (min_vec_niters * assumed_vf
3790 + peel_iters_prologue
3791 + peel_iters_epilogue);
3792 }
3793 else
3794 {
3795 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
3796 * assumed_vf
3797 - vec_inside_cost * peel_iters_prologue
3798 - vec_inside_cost * peel_iters_epilogue);
3799 if (min_profitable_iters <= 0)
3800 min_profitable_iters = 0;
3801 else
3802 {
3803 min_profitable_iters /= saving_per_viter;
3804
3805 if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
3806 <= (((int) vec_inside_cost * min_profitable_iters)
3807 + (((int) vec_outside_cost - scalar_outside_cost)
3808 * assumed_vf)))
3809 min_profitable_iters++;
3810 }
3811 }
3812
3813 if (dump_enabled_p ())
3814 dump_printf (MSG_NOTE,
3815 " Calculated minimum iters for profitability: %d\n",
3816 min_profitable_iters);
3817
3818 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
3819 && min_profitable_iters < (assumed_vf + peel_iters_prologue))
3820 /* We want the vectorized loop to execute at least once. */
3821 min_profitable_iters = assumed_vf + peel_iters_prologue;
3822
3823 if (dump_enabled_p ())
3824 dump_printf_loc (MSG_NOTE, vect_location,
3825 " Runtime profitability threshold = %d\n",
3826 min_profitable_iters);
3827
3828 *ret_min_profitable_niters = min_profitable_iters;
3829
3830 /* Calculate number of iterations required to make the vector version
3831 profitable, relative to the loop bodies only.
3832
3833 The non-vectorized variant costs SIC * niters and it must win over the vector
3834 variant on the expected loop trip count. The following condition must hold true:
3835 SIC * niters > VIC * ((niters - NPEEL) / VF) + VOC + SOC */
3836
3837 if (vec_outside_cost <= 0)
3838 min_profitable_estimate = 0;
3839 else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3840 {
3841 /* This is a repeat of the code above, but with + SOC rather
3842 than - SOC. */
3843 int outside_overhead = (vec_outside_cost
3844 - scalar_single_iter_cost * peel_iters_prologue
3845 - scalar_single_iter_cost * peel_iters_epilogue
3846 + scalar_outside_cost);
3847 int min_vec_niters = 1;
3848 if (outside_overhead > 0)
3849 min_vec_niters = outside_overhead / saving_per_viter + 1;
3850
3851 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3852 {
3853 int threshold = (vec_inside_cost * min_vec_niters
3854 + vec_outside_cost
3855 + scalar_outside_cost);
3856 min_profitable_estimate = threshold / scalar_single_iter_cost + 1;
3857 }
3858 else
3859 min_profitable_estimate = (min_vec_niters * assumed_vf
3860 + peel_iters_prologue
3861 + peel_iters_epilogue);
3862 }
3863 else
3864 {
3865 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
3866 * assumed_vf
3867 - vec_inside_cost * peel_iters_prologue
3868 - vec_inside_cost * peel_iters_epilogue)
3869 / ((scalar_single_iter_cost * assumed_vf)
3870 - vec_inside_cost);
3871 }
3872 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
3873 if (dump_enabled_p ())
3874 dump_printf_loc (MSG_NOTE, vect_location,
3875 " Static estimate profitability threshold = %d\n",
3876 min_profitable_estimate);
3877
3878 *ret_min_profitable_estimate = min_profitable_estimate;
3879 }
3880
3881 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
3882 vector elements (not bits) for a vector with NELT elements. */
3883 static void
3884 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
3885 vec_perm_builder *sel)
3886 {
3887 /* The encoding is a single stepped pattern. Any wrap-around is handled
3888 by vec_perm_indices. */
3889 sel->new_vector (nelt, 1, 3);
3890 for (unsigned int i = 0; i < 3; i++)
3891 sel->quick_push (i + offset);
3892 }
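/* Illustration only: with OFFSET = 2 and NELT = 8 the three encoded elements
   are {2, 3, 4}; vec_perm_indices extends this single stepped pattern to the
   full selector {2, 3, 4, 5, 6, 7, 8, 9}, where indices >= NELT select from
   the second operand of the permutation.  */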
3893
3894 /* Checks whether the target supports whole-vector shifts for vectors of mode
3895 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
3896 it supports vec_perm_const with masks for all necessary shift amounts. */
3897 static bool
3898 have_whole_vector_shift (machine_mode mode)
3899 {
3900 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3901 return true;
3902
3903 /* Variable-length vectors should be handled via the optab. */
3904 unsigned int nelt;
3905 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
3906 return false;
3907
3908 vec_perm_builder sel;
3909 vec_perm_indices indices;
3910 for (unsigned int i = nelt / 2; i >= 1; i /= 2)
3911 {
3912 calc_vec_perm_mask_for_shift (i, nelt, &sel);
3913 indices.new_vector (sel, 2, nelt);
3914 if (!can_vec_perm_const_p (mode, indices, false))
3915 return false;
3916 }
3917 return true;
3918 }
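/* As a sketch of what the loop above checks: for an 8-element vector it
   queries the target about shift amounts 4, 2 and 1, which are the offsets
   a log2-style whole-vector-shift reduction sequence would use.  */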
3919
3920 /* TODO: There is a close dependency between the vect_model_*_cost and
3921 vectorizable_* functions. Design them better to avoid maintenance issues. */
3922
3923 /* Function vect_model_reduction_cost.
3924
3925 Models cost for a reduction operation, including the vector ops
3926 generated within the strip-mine loop, the initial definition before
3927 the loop, and the epilogue code that must be generated. */
3928
3929 static void
3930 vect_model_reduction_cost (loop_vec_info loop_vinfo,
3931 stmt_vec_info stmt_info, internal_fn reduc_fn,
3932 vect_reduction_type reduction_type,
3933 int ncopies, stmt_vector_for_cost *cost_vec)
3934 {
3935 int prologue_cost = 0, epilogue_cost = 0, inside_cost;
3936 enum tree_code code;
3937 optab optab;
3938 tree vectype;
3939 machine_mode mode;
3940 class loop *loop = NULL;
3941
3942 if (loop_vinfo)
3943 loop = LOOP_VINFO_LOOP (loop_vinfo);
3944
3945 /* Condition reductions generate two reductions in the loop. */
3946 if (reduction_type == COND_REDUCTION)
3947 ncopies *= 2;
3948
3949 vectype = STMT_VINFO_VECTYPE (stmt_info);
3950 mode = TYPE_MODE (vectype);
3951 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
3952
3953 code = gimple_assign_rhs_code (orig_stmt_info->stmt);
3954
3955 if (reduction_type == EXTRACT_LAST_REDUCTION)
3956 /* No extra instructions are needed in the prologue. The loop body
3957 operations are costed in vectorizable_condition. */
3958 inside_cost = 0;
3959 else if (reduction_type == FOLD_LEFT_REDUCTION)
3960 {
3961 /* No extra instructions needed in the prologue. */
3962 prologue_cost = 0;
3963
3964 if (reduc_fn != IFN_LAST)
3965 /* Count one reduction-like operation per vector. */
3966 inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
3967 stmt_info, 0, vect_body);
3968 else
3969 {
3970 /* Use NELEMENTS extracts and NELEMENTS scalar ops. */
3971 unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
3972 inside_cost = record_stmt_cost (cost_vec, nelements,
3973 vec_to_scalar, stmt_info, 0,
3974 vect_body);
3975 inside_cost += record_stmt_cost (cost_vec, nelements,
3976 scalar_stmt, stmt_info, 0,
3977 vect_body);
3978 }
3979 }
3980 else
3981 {
3982 /* Add in cost for initial definition.
3983 For cond reduction we have four vectors: initial index, step,
3984 initial result of the data reduction, initial value of the index
3985 reduction. */
3986 int prologue_stmts = reduction_type == COND_REDUCTION ? 4 : 1;
3987 prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
3988 scalar_to_vec, stmt_info, 0,
3989 vect_prologue);
3990
3991 /* Cost of reduction op inside loop. */
3992 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
3993 stmt_info, 0, vect_body);
3994 }
3995
3996 /* Determine cost of epilogue code.
3997
3998 We have a reduction operator that will reduce the vector in one statement.
3999 Also requires scalar extract. */
4000
4001 if (!loop || !nested_in_vect_loop_p (loop, orig_stmt_info))
4002 {
4003 if (reduc_fn != IFN_LAST)
4004 {
4005 if (reduction_type == COND_REDUCTION)
4006 {
4007 /* An EQ stmt and a COND_EXPR stmt. */
4008 epilogue_cost += record_stmt_cost (cost_vec, 2,
4009 vector_stmt, stmt_info, 0,
4010 vect_epilogue);
4011 /* Reduction of the max index and a reduction of the found
4012 values. */
4013 epilogue_cost += record_stmt_cost (cost_vec, 2,
4014 vec_to_scalar, stmt_info, 0,
4015 vect_epilogue);
4016 /* A broadcast of the max value. */
4017 epilogue_cost += record_stmt_cost (cost_vec, 1,
4018 scalar_to_vec, stmt_info, 0,
4019 vect_epilogue);
4020 }
4021 else
4022 {
4023 epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
4024 stmt_info, 0, vect_epilogue);
4025 epilogue_cost += record_stmt_cost (cost_vec, 1,
4026 vec_to_scalar, stmt_info, 0,
4027 vect_epilogue);
4028 }
4029 }
4030 else if (reduction_type == COND_REDUCTION)
4031 {
4032 unsigned estimated_nunits = vect_nunits_for_cost (vectype);
4033 /* Extraction of scalar elements. */
4034 epilogue_cost += record_stmt_cost (cost_vec,
4035 2 * estimated_nunits,
4036 vec_to_scalar, stmt_info, 0,
4037 vect_epilogue);
4038 /* Scalar max reductions via COND_EXPR / MAX_EXPR. */
4039 epilogue_cost += record_stmt_cost (cost_vec,
4040 2 * estimated_nunits - 3,
4041 scalar_stmt, stmt_info, 0,
4042 vect_epilogue);
4043 }
4044 else if (reduction_type == EXTRACT_LAST_REDUCTION
4045 || reduction_type == FOLD_LEFT_REDUCTION)
4046 /* No extra instructions needed in the epilogue. */
4047 ;
4048 else
4049 {
4050 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4051 tree bitsize =
4052 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt_info->stmt)));
4053 int element_bitsize = tree_to_uhwi (bitsize);
4054 int nelements = vec_size_in_bits / element_bitsize;
4055
4056 if (code == COND_EXPR)
4057 code = MAX_EXPR;
4058
4059 optab = optab_for_tree_code (code, vectype, optab_default);
4060
4061 /* We have a whole vector shift available. */
4062 if (optab != unknown_optab
4063 && VECTOR_MODE_P (mode)
4064 && optab_handler (optab, mode) != CODE_FOR_nothing
4065 && have_whole_vector_shift (mode))
4066 {
4067 /* Final reduction via vector shifts and the reduction operator.
4068 Also requires scalar extract. */
4069 epilogue_cost += record_stmt_cost (cost_vec,
4070 exact_log2 (nelements) * 2,
4071 vector_stmt, stmt_info, 0,
4072 vect_epilogue);
4073 epilogue_cost += record_stmt_cost (cost_vec, 1,
4074 vec_to_scalar, stmt_info, 0,
4075 vect_epilogue);
4076 }
4077 else
4078 /* Use extracts and reduction op for final reduction. For N
4079 elements, we have N extracts and N-1 reduction ops. */
4080 epilogue_cost += record_stmt_cost (cost_vec,
4081 nelements + nelements - 1,
4082 vector_stmt, stmt_info, 0,
4083 vect_epilogue);
4084 }
4085 }
4086
4087 if (dump_enabled_p ())
4088 dump_printf (MSG_NOTE,
4089 "vect_model_reduction_cost: inside_cost = %d, "
4090 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
4091 prologue_cost, epilogue_cost);
4092 }
4093
4094
4095 /* Function vect_model_induction_cost.
4096
4097 Models cost for induction operations. */
4098
4099 static void
4100 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies,
4101 stmt_vector_for_cost *cost_vec)
4102 {
4103 unsigned inside_cost, prologue_cost;
4104
4105 if (PURE_SLP_STMT (stmt_info))
4106 return;
4107
4108 /* loop cost for vec_loop. */
4109 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
4110 stmt_info, 0, vect_body);
4111
4112 /* prologue cost for vec_init and vec_step. */
4113 prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
4114 stmt_info, 0, vect_prologue);
4115
4116 if (dump_enabled_p ())
4117 dump_printf_loc (MSG_NOTE, vect_location,
4118 "vect_model_induction_cost: inside_cost = %d, "
4119 "prologue_cost = %d .\n", inside_cost, prologue_cost);
4120 }
4121
4122
4123
4124 /* Function get_initial_def_for_reduction
4125
4126 Input:
4127 STMT_VINFO - a stmt that performs a reduction operation in the loop.
4128 INIT_VAL - the initial value of the reduction variable
4129
4130 Output:
4131 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
4132 of the reduction (used for adjusting the epilog - see below).
4133 Return a vector variable, initialized according to the operation that
4134 STMT_VINFO performs. This vector will be used as the initial value
4135 of the vector of partial results.
4136
4137 Option1 (adjust in epilog): Initialize the vector as follows:
4138 add/bit or/xor: [0,0,...,0,0]
4139 mult/bit and: [1,1,...,1,1]
4140 min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
4141 and when necessary (e.g. add/mult case) let the caller know
4142 that it needs to adjust the result by init_val.
4143
4144 Option2: Initialize the vector as follows:
4145 add/bit or/xor: [init_val,0,0,...,0]
4146 mult/bit and: [init_val,1,1,...,1]
4147 min/max/cond_expr: [init_val,init_val,...,init_val]
4148 and no adjustments are needed.
4149
4150 For example, for the following code:
4151
4152 s = init_val;
4153 for (i=0;i<n;i++)
4154 s = s + a[i];
4155
4156 STMT_VINFO is 's = s + a[i]', and the reduction variable is 's'.
4157 For a vector of 4 units, we want to return either [0,0,0,init_val],
4158 or [0,0,0,0] and let the caller know that it needs to adjust
4159 the result at the end by 'init_val'.
4160
4161 FORNOW, we use the 'adjust in epilog' scheme (Option1) when ADJUSTMENT_DEF
4162 is not NULL, because the initialization vector is then simpler (the same
4163 element in all entries), and Option2 otherwise.
4164
4165 A cost model should help decide between these two schemes. */
4166
4167 static tree
4168 get_initial_def_for_reduction (loop_vec_info loop_vinfo,
4169 stmt_vec_info stmt_vinfo,
4170 enum tree_code code, tree init_val,
4171 tree *adjustment_def)
4172 {
4173 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4174 tree scalar_type = TREE_TYPE (init_val);
4175 tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
4176 tree def_for_init;
4177 tree init_def;
4178 REAL_VALUE_TYPE real_init_val = dconst0;
4179 int int_init_val = 0;
4180 gimple_seq stmts = NULL;
4181
4182 gcc_assert (vectype);
4183
4184 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
4185 || SCALAR_FLOAT_TYPE_P (scalar_type));
4186
4187 gcc_assert (nested_in_vect_loop_p (loop, stmt_vinfo)
4188 || loop == (gimple_bb (stmt_vinfo->stmt))->loop_father);
4189
4190 /* ADJUSTMENT_DEF is NULL when called from
4191 vect_create_epilog_for_reduction to vectorize double reduction. */
4192 if (adjustment_def)
4193 *adjustment_def = NULL;
4194
4195 switch (code)
4196 {
4197 case WIDEN_SUM_EXPR:
4198 case DOT_PROD_EXPR:
4199 case SAD_EXPR:
4200 case PLUS_EXPR:
4201 case MINUS_EXPR:
4202 case BIT_IOR_EXPR:
4203 case BIT_XOR_EXPR:
4204 case MULT_EXPR:
4205 case BIT_AND_EXPR:
4206 {
4207 if (code == MULT_EXPR)
4208 {
4209 real_init_val = dconst1;
4210 int_init_val = 1;
4211 }
4212
4213 if (code == BIT_AND_EXPR)
4214 int_init_val = -1;
4215
4216 if (SCALAR_FLOAT_TYPE_P (scalar_type))
4217 def_for_init = build_real (scalar_type, real_init_val);
4218 else
4219 def_for_init = build_int_cst (scalar_type, int_init_val);
4220
4221 if (adjustment_def || operand_equal_p (def_for_init, init_val, 0))
4222 {
4223 /* Option1: the first element is '0' or '1' as well. */
4224 if (!operand_equal_p (def_for_init, init_val, 0))
4225 *adjustment_def = init_val;
4226 init_def = gimple_build_vector_from_val (&stmts, vectype,
4227 def_for_init);
4228 }
4229 else if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
4230 {
4231 /* Option2 (variable length): the first element is INIT_VAL. */
4232 init_def = gimple_build_vector_from_val (&stmts, vectype,
4233 def_for_init);
4234 init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
4235 vectype, init_def, init_val);
4236 }
4237 else
4238 {
4239 /* Option2: the first element is INIT_VAL. */
4240 tree_vector_builder elts (vectype, 1, 2);
4241 elts.quick_push (init_val);
4242 elts.quick_push (def_for_init);
4243 init_def = gimple_build_vector (&stmts, &elts);
4244 }
4245 }
4246 break;
4247
4248 case MIN_EXPR:
4249 case MAX_EXPR:
4250 case COND_EXPR:
4251 {
4252 init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
4253 init_def = gimple_build_vector_from_val (&stmts, vectype, init_val);
4254 }
4255 break;
4256
4257 default:
4258 gcc_unreachable ();
4259 }
4260
4261 if (stmts)
4262 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
4263 return init_def;
4264 }
4265
4266 /* Get at the initial defs for the reduction PHIs in SLP_NODE.
4267 NUMBER_OF_VECTORS is the number of vector defs to create.
4268 If NEUTRAL_OP is nonnull, introducing extra elements of that
4269 value will not change the result. */
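/* A hedged illustration (lanes invented): for a two-statement SLP sum
   reduction with 4-lane vectors, GROUP_SIZE == 2, NUMBER_OF_VECTORS == 1 and
   NEUTRAL_OP == 0, the code below builds the single initial vector
   { init_s1, init_s2, 0, 0 }, i.e. the real initial values followed by
   neutral elements.  */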
4270
4271 static void
4272 get_initial_defs_for_reduction (vec_info *vinfo,
4273 slp_tree slp_node,
4274 vec<tree> *vec_oprnds,
4275 unsigned int number_of_vectors,
4276 bool reduc_chain, tree neutral_op)
4277 {
4278 vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
4279 stmt_vec_info stmt_vinfo = stmts[0];
4280 unsigned HOST_WIDE_INT nunits;
4281 unsigned j, number_of_places_left_in_vector;
4282 tree vector_type;
4283 unsigned int group_size = stmts.length ();
4284 unsigned int i;
4285 class loop *loop;
4286
4287 vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
4288
4289 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def);
4290
4291 loop = (gimple_bb (stmt_vinfo->stmt))->loop_father;
4292 gcc_assert (loop);
4293 edge pe = loop_preheader_edge (loop);
4294
4295 gcc_assert (!reduc_chain || neutral_op);
4296
4297 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
4298 created vectors. It is greater than 1 if unrolling is performed.
4299
4300 For example, we have two scalar operands, s1 and s2 (e.g., group of
4301 strided accesses of size two), while NUNITS is four (i.e., four scalars
4302 of this type can be packed in a vector). The output vector will contain
4303 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
4304 will be 2).
4305
4306 If REDUC_GROUP_SIZE > NUNITS, the scalars will be split into several
4307 vectors containing the operands.
4308
4309 For example, NUNITS is four as before, and the group size is 8
4310 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
4311 {s5, s6, s7, s8}. */
4312
4313 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
4314 nunits = group_size;
4315
4316 number_of_places_left_in_vector = nunits;
4317 bool constant_p = true;
4318 tree_vector_builder elts (vector_type, nunits, 1);
4319 elts.quick_grow (nunits);
4320 gimple_seq ctor_seq = NULL;
4321 for (j = 0; j < nunits * number_of_vectors; ++j)
4322 {
4323 tree op;
4324 i = j % group_size;
4325 stmt_vinfo = stmts[i];
4326
4327 /* Get the def before the loop. In a reduction chain we have only
4328 one initial value. Otherwise we have as many as there are PHIs in the group. */
4329 if (reduc_chain)
4330 op = j != 0 ? neutral_op : PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt, pe);
4331 else if (((vec_oprnds->length () + 1) * nunits
4332 - number_of_places_left_in_vector >= group_size)
4333 && neutral_op)
4334 op = neutral_op;
4335 else
4336 op = PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt, pe);
4337
4338 /* Create 'vect_ = {op0,op1,...,opn}'. */
4339 number_of_places_left_in_vector--;
4340 elts[nunits - number_of_places_left_in_vector - 1] = op;
4341 if (!CONSTANT_CLASS_P (op))
4342 constant_p = false;
4343
4344 if (number_of_places_left_in_vector == 0)
4345 {
4346 tree init;
4347 if (constant_p && !neutral_op
4348 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
4349 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
4350 /* Build the vector directly from ELTS. */
4351 init = gimple_build_vector (&ctor_seq, &elts);
4352 else if (neutral_op)
4353 {
4354 /* Build a vector of the neutral value and shift the
4355 other elements into place. */
4356 init = gimple_build_vector_from_val (&ctor_seq, vector_type,
4357 neutral_op);
4358 int k = nunits;
4359 while (k > 0 && elts[k - 1] == neutral_op)
4360 k -= 1;
4361 while (k > 0)
4362 {
4363 k -= 1;
4364 init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
4365 vector_type, init, elts[k]);
4366 }
4367 }
4368 else
4369 {
4370 /* First time round, duplicate ELTS to fill the
4371 required number of vectors. */
4372 duplicate_and_interleave (vinfo, &ctor_seq, vector_type, elts,
4373 number_of_vectors, *vec_oprnds);
4374 break;
4375 }
4376 vec_oprnds->quick_push (init);
4377
4378 number_of_places_left_in_vector = nunits;
4379 elts.new_vector (vector_type, nunits, 1);
4380 elts.quick_grow (nunits);
4381 constant_p = true;
4382 }
4383 }
4384 if (ctor_seq != NULL)
4385 gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4386 }
4387
4388 /* For a statement STMT_INFO taking part in a reduction operation return
4389 the stmt_vec_info the meta information is stored on. */
4390
4391 stmt_vec_info
4392 info_for_reduction (vec_info *vinfo, stmt_vec_info stmt_info)
4393 {
4394 stmt_info = vect_orig_stmt (stmt_info);
4395 gcc_assert (STMT_VINFO_REDUC_DEF (stmt_info));
4396 if (!is_a <gphi *> (stmt_info->stmt))
4397 stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
4398 gphi *phi = as_a <gphi *> (stmt_info->stmt);
4399 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
4400 {
4401 if (gimple_phi_num_args (phi) == 1)
4402 stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
4403 }
4404 else if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
4405 {
4406 edge pe = loop_preheader_edge (gimple_bb (phi)->loop_father);
4407 stmt_vec_info info
4408 = vinfo->lookup_def (PHI_ARG_DEF_FROM_EDGE (phi, pe));
4409 if (info && STMT_VINFO_DEF_TYPE (info) == vect_double_reduction_def)
4410 stmt_info = info;
4411 }
4412 return stmt_info;
4413 }
4414
4415 /* Function vect_create_epilog_for_reduction
4416
4417 Create code at the loop-epilog to finalize the result of a reduction
4418 computation.
4419
4420 STMT_INFO is the scalar reduction stmt that is being vectorized.
4421 SLP_NODE is an SLP node containing a group of reduction statements. The
4422 first one in this group is STMT_INFO.
4423 SLP_NODE_INSTANCE is the SLP node instance containing SLP_NODE
4424 REDUC_INDEX says which rhs operand of the STMT_INFO is the reduction phi
4425 (counting from 0)
4426
4427 This function:
4428 1. Completes the reduction def-use cycles.
4429 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
4430 by calling the function specified by REDUC_FN if available, or by
4431 other means (whole-vector shifts or a scalar loop).
4432 The function also creates a new phi node at the loop exit to preserve
4433 loop-closed form, as illustrated below.
4434
4435 The flow at the entry to this function:
4436
4437 loop:
4438 vec_def = phi <vec_init, null> # REDUCTION_PHI
4439 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
4440 s_loop = scalar_stmt # (scalar) STMT_INFO
4441 loop_exit:
4442 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4443 use <s_out0>
4444 use <s_out0>
4445
4446 The above is transformed by this function into:
4447
4448 loop:
4449 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4450 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
4451 s_loop = scalar_stmt # (scalar) STMT_INFO
4452 loop_exit:
4453 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4454 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4455 v_out2 = reduce <v_out1>
4456 s_out3 = extract_field <v_out2, 0>
4457 s_out4 = adjust_result <s_out3>
4458 use <s_out4>
4459 use <s_out4>
4460 */
4461
4462 static void
4463 vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
4464 stmt_vec_info stmt_info,
4465 slp_tree slp_node,
4466 slp_instance slp_node_instance)
4467 {
4468 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
4469 gcc_assert (reduc_info->is_reduc_info);
4470 /* For double reductions we need to get at the inner loop reduction
4471 stmt which has the meta info attached. Our stmt_info is that of the
4472 loop-closed PHI of the inner loop which we remember as
4473 def for the reduction PHI generation. */
4474 bool double_reduc = false;
4475 stmt_vec_info rdef_info = stmt_info;
4476 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
4477 {
4478 gcc_assert (!slp_node);
4479 double_reduc = true;
4480 stmt_info = loop_vinfo->lookup_def (gimple_phi_arg_def
4481 (stmt_info->stmt, 0));
4482 stmt_info = vect_stmt_to_vectorize (stmt_info);
4483 }
4484 gphi *reduc_def_stmt
4485 = as_a <gphi *> (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))->stmt);
4486 enum tree_code code = STMT_VINFO_REDUC_CODE (reduc_info);
4487 internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
4488 tree vectype;
4489 machine_mode mode;
4490 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
4491 basic_block exit_bb;
4492 tree scalar_dest;
4493 tree scalar_type;
4494 gimple *new_phi = NULL, *phi;
4495 gimple_stmt_iterator exit_gsi;
4496 tree new_temp = NULL_TREE, new_name, new_scalar_dest;
4497 gimple *epilog_stmt = NULL;
4498 gimple *exit_phi;
4499 tree bitsize;
4500 tree def;
4501 tree orig_name, scalar_result;
4502 imm_use_iterator imm_iter, phi_imm_iter;
4503 use_operand_p use_p, phi_use_p;
4504 gimple *use_stmt;
4505 bool nested_in_vect_loop = false;
4506 auto_vec<gimple *> new_phis;
4507 int j, i;
4508 auto_vec<tree> scalar_results;
4509 unsigned int group_size = 1, k;
4510 auto_vec<gimple *> phis;
4511 bool slp_reduc = false;
4512 bool direct_slp_reduc;
4513 tree new_phi_result;
4514 tree induction_index = NULL_TREE;
4515
4516 if (slp_node)
4517 group_size = SLP_TREE_LANES (slp_node);
4518
4519 if (nested_in_vect_loop_p (loop, stmt_info))
4520 {
4521 outer_loop = loop;
4522 loop = loop->inner;
4523 nested_in_vect_loop = true;
4524 gcc_assert (!slp_node);
4525 }
4526 gcc_assert (!nested_in_vect_loop || double_reduc);
4527
4528 vectype = STMT_VINFO_REDUC_VECTYPE (reduc_info);
4529 gcc_assert (vectype);
4530 mode = TYPE_MODE (vectype);
4531
4532 tree initial_def = NULL;
4533 tree induc_val = NULL_TREE;
4534 tree adjustment_def = NULL;
4535 if (slp_node)
4536 ;
4537 else
4538 {
4539 /* Get at the scalar def before the loop, that defines the initial value
4540 of the reduction variable. */
4541 initial_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
4542 loop_preheader_edge (loop));
4543 /* Optimize: for induction condition reduction, if we can't use zero
4544 for induc_val, use initial_def. */
4545 if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
4546 induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
4547 else if (double_reduc)
4548 ;
4549 else if (nested_in_vect_loop)
4550 ;
4551 else
4552 adjustment_def = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info);
4553 }
4554
4555 unsigned vec_num;
4556 int ncopies;
4557 if (slp_node)
4558 {
4559 vec_num = SLP_TREE_VEC_STMTS (slp_node_instance->reduc_phis).length ();
4560 ncopies = 1;
4561 }
4562 else
4563 {
4564 stmt_vec_info reduc_info = loop_vinfo->lookup_stmt (reduc_def_stmt);
4565 vec_num = 1;
4566 ncopies = STMT_VINFO_VEC_STMTS (reduc_info).length ();
4567 }
4568
4569 /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
4570 which is updated with the current index of the loop for every match of
4571 the original loop's cond_expr (VEC_STMT). This results in a vector
4572 containing the last time the condition passed for that vector lane.
4573 The first match will be a 1 to allow 0 to be used for non-matching
4574 indexes. If there are no matches at all then the vector will be all
4575 zeroes.
4576
4577 PR92772: This algorithm is broken for architectures that support
4578 masked vectors, but do not provide fold_extract_last. */
4579 if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
4580 {
4581 auto_vec<std::pair<tree, bool>, 2> ccompares;
4582 stmt_vec_info cond_info = STMT_VINFO_REDUC_DEF (reduc_info);
4583 cond_info = vect_stmt_to_vectorize (cond_info);
4584 while (cond_info != reduc_info)
4585 {
4586 if (gimple_assign_rhs_code (cond_info->stmt) == COND_EXPR)
4587 {
4588 gimple *vec_stmt = STMT_VINFO_VEC_STMTS (cond_info)[0];
4589 gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
4590 ccompares.safe_push
4591 (std::make_pair (unshare_expr (gimple_assign_rhs1 (vec_stmt)),
4592 STMT_VINFO_REDUC_IDX (cond_info) == 2));
4593 }
4594 cond_info
4595 = loop_vinfo->lookup_def (gimple_op (cond_info->stmt,
4596 1 + STMT_VINFO_REDUC_IDX
4597 (cond_info)));
4598 cond_info = vect_stmt_to_vectorize (cond_info);
4599 }
4600 gcc_assert (ccompares.length () != 0);
4601
4602 tree indx_before_incr, indx_after_incr;
4603 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
4604 int scalar_precision
4605 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
4606 tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
4607 tree cr_index_vector_type = get_related_vectype_for_scalar_type
4608 (TYPE_MODE (vectype), cr_index_scalar_type,
4609 TYPE_VECTOR_SUBPARTS (vectype));
4610
4611 /* First we create a simple vector induction variable which starts
4612 with the values {1,2,3,...} (SERIES_VECT) and increments by the
4613 vector size (STEP). */
4614
4615 /* Create a {1,2,3,...} vector. */
4616 tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
4617
4618 /* Create a vector of the step value. */
4619 tree step = build_int_cst (cr_index_scalar_type, nunits_out);
4620 tree vec_step = build_vector_from_val (cr_index_vector_type, step);
4621
4622 /* Create an induction variable. */
4623 gimple_stmt_iterator incr_gsi;
4624 bool insert_after;
4625 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4626 create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi,
4627 insert_after, &indx_before_incr, &indx_after_incr);
4628
4629 /* Next create a new phi node vector (NEW_PHI_TREE) which starts
4630 filled with zeros (VEC_ZERO). */
4631
4632 /* Create a vector of 0s. */
4633 tree zero = build_zero_cst (cr_index_scalar_type);
4634 tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
4635
4636 /* Create a vector phi node. */
4637 tree new_phi_tree = make_ssa_name (cr_index_vector_type);
4638 new_phi = create_phi_node (new_phi_tree, loop->header);
4639 add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
4640 loop_preheader_edge (loop), UNKNOWN_LOCATION);
4641
4642 /* Now take the condition from the loop's original cond_exprs
4643 and produce a new cond_expr (INDEX_COND_EXPR) which for
4644 every match uses values from the induction variable
4645 (INDEX_BEFORE_INCR) otherwise uses values from the phi node
4646 (NEW_PHI_TREE).
4647 Finally, we update the phi (NEW_PHI_TREE) to take the value of
4648 the new cond_expr (INDEX_COND_EXPR). */
4649 gimple_seq stmts = NULL;
4650 for (int i = ccompares.length () - 1; i != -1; --i)
4651 {
4652 tree ccompare = ccompares[i].first;
4653 if (ccompares[i].second)
4654 new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
4655 cr_index_vector_type,
4656 ccompare,
4657 indx_before_incr, new_phi_tree);
4658 else
4659 new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
4660 cr_index_vector_type,
4661 ccompare,
4662 new_phi_tree, indx_before_incr);
4663 }
4664 gsi_insert_seq_before (&incr_gsi, stmts, GSI_SAME_STMT);
4665
4666 /* Update the phi with the vec cond. */
4667 induction_index = new_phi_tree;
4668 add_phi_arg (as_a <gphi *> (new_phi), induction_index,
4669 loop_latch_edge (loop), UNKNOWN_LOCATION);
4670 }
4671
4672 /* 2. Create epilog code.
4673 The reduction epilog code operates across the elements of the vector
4674 of partial results computed by the vectorized loop.
4675 The reduction epilog code consists of:
4676
4677 step 1: compute the scalar result in a vector (v_out2)
4678 step 2: extract the scalar result (s_out3) from the vector (v_out2)
4679 step 3: adjust the scalar result (s_out3) if needed.
4680
4681 Step 1 can be accomplished using one of the following three schemes:
4682 (scheme 1) using reduc_fn, if available.
4683 (scheme 2) using whole-vector shifts, if available.
4684 (scheme 3) using a scalar loop. In this case steps 1+2 above are
4685 combined.
4686
4687 The overall epilog code looks like this:
4688
4689 s_out0 = phi <s_loop> # original EXIT_PHI
4690 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4691 v_out2 = reduce <v_out1> # step 1
4692 s_out3 = extract_field <v_out2, 0> # step 2
4693 s_out4 = adjust_result <s_out3> # step 3
4694
4695 (step 3 is optional, and steps 1 and 2 may be combined).
4696 Lastly, the uses of s_out0 are replaced by s_out4. */
4697
4698
4699 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
4700 v_out1 = phi <VECT_DEF>
4701 Store them in NEW_PHIS. */
4702 if (double_reduc)
4703 loop = outer_loop;
4704 exit_bb = single_exit (loop)->dest;
4705 new_phis.create (slp_node ? vec_num : ncopies);
4706 for (unsigned i = 0; i < vec_num; i++)
4707 {
4708 if (slp_node)
4709 def = vect_get_slp_vect_def (slp_node, i);
4710 else
4711 def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[0]);
4712 for (j = 0; j < ncopies; j++)
4713 {
4714 tree new_def = copy_ssa_name (def);
4715 phi = create_phi_node (new_def, exit_bb);
4716 if (j == 0)
4717 new_phis.quick_push (phi);
4718 else
4719 {
4720 def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[j]);
4721 new_phis.quick_push (phi);
4722 }
4723
4724 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
4725 }
4726 }
4727
4728 exit_gsi = gsi_after_labels (exit_bb);
4729
4730 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
4731 (i.e. when reduc_fn is not available) and in the final adjustment
4732 code (if needed). Also get the original scalar reduction variable as
4733 defined in the loop. In case STMT is a "pattern-stmt" (i.e. it
4734 represents a reduction pattern), the tree-code and scalar-def are
4735 taken from the original stmt that the pattern-stmt (STMT) replaces.
4736 Otherwise (it is a regular reduction) - the tree-code and scalar-def
4737 are taken from STMT. */
4738
4739 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
4740 if (orig_stmt_info != stmt_info)
4741 {
4742 /* Reduction pattern */
4743 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
4744 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info);
4745 }
4746
4747 scalar_dest = gimple_assign_lhs (orig_stmt_info->stmt);
4748 scalar_type = TREE_TYPE (scalar_dest);
4749 scalar_results.create (group_size);
4750 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
4751 bitsize = TYPE_SIZE (scalar_type);
4752
4753 /* SLP reduction without reduction chain, e.g.,
4754 # a1 = phi <a2, a0>
4755 # b1 = phi <b2, b0>
4756 a2 = operation (a1)
4757 b2 = operation (b1) */
4758 slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (stmt_info));
4759
4760 /* True if we should implement SLP_REDUC using native reduction operations
4761 instead of scalar operations. */
4762 direct_slp_reduc = (reduc_fn != IFN_LAST
4763 && slp_reduc
4764 && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
4765
4766 /* In case of reduction chain, e.g.,
4767 # a1 = phi <a3, a0>
4768 a2 = operation (a1)
4769 a3 = operation (a2),
4770
4771 we may end up with more than one vector result. Here we reduce them to
4772 one vector. */
4773 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) || direct_slp_reduc)
4774 {
4775 gimple_seq stmts = NULL;
4776 tree first_vect = PHI_RESULT (new_phis[0]);
4777 first_vect = gimple_convert (&stmts, vectype, first_vect);
4778 for (k = 1; k < new_phis.length (); k++)
4779 {
4780 gimple *next_phi = new_phis[k];
4781 tree second_vect = PHI_RESULT (next_phi);
4782 second_vect = gimple_convert (&stmts, vectype, second_vect);
4783 first_vect = gimple_build (&stmts, code, vectype,
4784 first_vect, second_vect);
4785 }
4786 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
4787
4788 new_phi_result = first_vect;
4789 new_phis.truncate (0);
4790 new_phis.safe_push (SSA_NAME_DEF_STMT (first_vect));
4791 }
4792 /* Likewise if we couldn't use a single defuse cycle. */
4793 else if (ncopies > 1)
4794 {
4795 gimple_seq stmts = NULL;
4796 tree first_vect = PHI_RESULT (new_phis[0]);
4797 first_vect = gimple_convert (&stmts, vectype, first_vect);
4798 for (int k = 1; k < ncopies; ++k)
4799 {
4800 tree second_vect = PHI_RESULT (new_phis[k]);
4801 second_vect = gimple_convert (&stmts, vectype, second_vect);
4802 first_vect = gimple_build (&stmts, code, vectype,
4803 first_vect, second_vect);
4804 }
4805 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
4806 new_phi_result = first_vect;
4807 new_phis.truncate (0);
4808 new_phis.safe_push (SSA_NAME_DEF_STMT (first_vect));
4809 }
4810 else
4811 new_phi_result = PHI_RESULT (new_phis[0]);
4812
4813 if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
4814 && reduc_fn != IFN_LAST)
4815 {
4816 /* For condition reductions, we have a vector (NEW_PHI_RESULT) containing
4817 various data values where the condition matched and another vector
4818 (INDUCTION_INDEX) containing all the indexes of those matches. We
4819 need to extract the last matching index (which will be the index with
4820 highest value) and use this to index into the data vector.
4821 For the case where there were no matches, the data vector will contain
4822 all default values and the index vector will be all zeros. */
4823
4824 /* Get various versions of the type of the vector of indexes. */
4825 tree index_vec_type = TREE_TYPE (induction_index);
4826 gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
4827 tree index_scalar_type = TREE_TYPE (index_vec_type);
4828 tree index_vec_cmp_type = truth_type_for (index_vec_type);
4829
4830 /* Get an unsigned integer version of the type of the data vector. */
4831 int scalar_precision
4832 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
4833 tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
4834 tree vectype_unsigned = build_vector_type
4835 (scalar_type_unsigned, TYPE_VECTOR_SUBPARTS (vectype));
4836
4837 /* First we need to create a vector (ZERO_VEC) of zeros and another
4838 vector (MAX_INDEX_VEC) filled with the last matching index, which we
4839 can create using a MAX reduction and then expanding.
4840 In the case where the loop never made any matches, the max index will
4841 be zero. */
4842
4843 /* Vector of {0, 0, 0,...}. */
4844 tree zero_vec = build_zero_cst (vectype);
4845
4846 gimple_seq stmts = NULL;
4847 new_phi_result = gimple_convert (&stmts, vectype, new_phi_result);
4848 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
4849
4850 /* Find maximum value from the vector of found indexes. */
4851 tree max_index = make_ssa_name (index_scalar_type);
4852 gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
4853 1, induction_index);
4854 gimple_call_set_lhs (max_index_stmt, max_index);
4855 gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
4856
4857 /* Vector of {max_index, max_index, max_index,...}. */
4858 tree max_index_vec = make_ssa_name (index_vec_type);
4859 tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
4860 max_index);
4861 gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
4862 max_index_vec_rhs);
4863 gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
4864
4865 /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
4866 with the vector (INDUCTION_INDEX) of found indexes, choosing values
4867 from the data vector (NEW_PHI_RESULT) for matches, 0 (ZERO_VEC)
4868 otherwise. Only one value should match, resulting in a vector
4869 (VEC_COND) with one data value and the rest zeros.
4870 In the case where the loop never made any matches, every index will
4871 match, resulting in a vector with all data values (which will all be
4872 the default value). */
4873
4874 /* Compare the max index vector to the vector of found indexes to find
4875 the position of the max value. */
4876 tree vec_compare = make_ssa_name (index_vec_cmp_type);
4877 gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
4878 induction_index,
4879 max_index_vec);
4880 gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
4881
4882 /* Use the compare to choose either values from the data vector or
4883 zero. */
4884 tree vec_cond = make_ssa_name (vectype);
4885 gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
4886 vec_compare, new_phi_result,
4887 zero_vec);
4888 gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
4889
4890 /* Finally we need to extract the data value from the vector (VEC_COND)
4891 into a scalar (MATCHED_DATA_REDUC). Logically we want to do an OR
4892 reduction, but because this doesn't exist, we can use a MAX reduction
4893 instead. The data value might be signed or a float so we need to cast
4894 it first.
4895 In the case where the loop never made any matches, the data values are
4896 all identical, and so will reduce down correctly. */
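/* For instance (purely illustrative), if VEC_COND ends up as {0, 0, 42, 0},
   reinterpreting the lanes as unsigned and taking IFN_REDUC_MAX yields 42,
   i.e. exactly the single matched data value.  */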
4897
4898 /* Make the matched data values unsigned. */
4899 tree vec_cond_cast = make_ssa_name (vectype_unsigned);
4900 tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
4901 vec_cond);
4902 gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
4903 VIEW_CONVERT_EXPR,
4904 vec_cond_cast_rhs);
4905 gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
4906
4907 /* Reduce down to a scalar value. */
4908 tree data_reduc = make_ssa_name (scalar_type_unsigned);
4909 gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
4910 1, vec_cond_cast);
4911 gimple_call_set_lhs (data_reduc_stmt, data_reduc);
4912 gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
4913
4914 /* Convert the reduced value back to the result type and set as the
4915 result. */
4916 stmts = NULL;
4917 new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
4918 data_reduc);
4919 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
4920 scalar_results.safe_push (new_temp);
4921 }
4922 else if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
4923 && reduc_fn == IFN_LAST)
4924 {
4925 /* Condition reduction without supported IFN_REDUC_MAX. Generate
4926 idx = 0;
4927 idx_val = induction_index[0];
4928 val = data_reduc[0];
4929 for (idx = 0, val = init, i = 0; i < nelts; ++i)
4930 if (induction_index[i] > idx_val)
4931 val = data_reduc[i], idx_val = induction_index[i];
4932 return val; */
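/* As an illustration (assuming a two-element vector), the expansion below
   amounts to:

     idx0 = induction_index[0]; v0 = data[0];
     idx1 = induction_index[1]; v1 = data[1];
     val = idx1 > idx0 ? v1 : v0;

   where "data" stands for NEW_PHI_RESULT.  */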
4933
4934 tree data_eltype = TREE_TYPE (TREE_TYPE (new_phi_result));
4935 tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
4936 unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
4937 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
4938 /* Enforced by vectorizable_reduction, which ensures we have target
4939 support before allowing a conditional reduction on variable-length
4940 vectors. */
4941 unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
4942 tree idx_val = NULL_TREE, val = NULL_TREE;
4943 for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
4944 {
4945 tree old_idx_val = idx_val;
4946 tree old_val = val;
4947 idx_val = make_ssa_name (idx_eltype);
4948 epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
4949 build3 (BIT_FIELD_REF, idx_eltype,
4950 induction_index,
4951 bitsize_int (el_size),
4952 bitsize_int (off)));
4953 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4954 val = make_ssa_name (data_eltype);
4955 epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
4956 build3 (BIT_FIELD_REF,
4957 data_eltype,
4958 new_phi_result,
4959 bitsize_int (el_size),
4960 bitsize_int (off)));
4961 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4962 if (off != 0)
4963 {
4964 tree new_idx_val = idx_val;
4965 if (off != v_size - el_size)
4966 {
4967 new_idx_val = make_ssa_name (idx_eltype);
4968 epilog_stmt = gimple_build_assign (new_idx_val,
4969 MAX_EXPR, idx_val,
4970 old_idx_val);
4971 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4972 }
4973 tree new_val = make_ssa_name (data_eltype);
4974 epilog_stmt = gimple_build_assign (new_val,
4975 COND_EXPR,
4976 build2 (GT_EXPR,
4977 boolean_type_node,
4978 idx_val,
4979 old_idx_val),
4980 val, old_val);
4981 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4982 idx_val = new_idx_val;
4983 val = new_val;
4984 }
4985 }
4986 /* Convert the reduced value back to the result type and set as the
4987 result. */
4988 gimple_seq stmts = NULL;
4989 val = gimple_convert (&stmts, scalar_type, val);
4990 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
4991 scalar_results.safe_push (val);
4992 }
4993
4994 /* 2.3 Create the reduction code, using one of the three schemes described
4995 above. In SLP we simply need to extract all the elements from the
4996 vector (without reducing them), so we use scalar shifts. */
4997 else if (reduc_fn != IFN_LAST && !slp_reduc)
4998 {
4999 tree tmp;
5000 tree vec_elem_type;
5001
5002 /* Case 1: Create:
5003 v_out2 = reduc_expr <v_out1> */
5004
5005 if (dump_enabled_p ())
5006 dump_printf_loc (MSG_NOTE, vect_location,
5007 "Reduce using direct vector reduction.\n");
5008
5009 gimple_seq stmts = NULL;
5010 new_phi_result = gimple_convert (&stmts, vectype, new_phi_result);
5011 vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
5012 new_temp = gimple_build (&stmts, as_combined_fn (reduc_fn),
5013 vec_elem_type, new_phi_result);
5014 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
5015 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5016
5017 if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5018 && induc_val)
5019 {
5020 /* Earlier we set the initial value to be a vector of induc_val
5021 values. Check the result and if it is induc_val then replace
5022 it with the original initial value, unless induc_val is
5023 already the same as initial_def. */
5024 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5025 induc_val);
5026
5027 tmp = make_ssa_name (new_scalar_dest);
5028 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5029 initial_def, new_temp);
5030 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5031 new_temp = tmp;
5032 }
5033
5034 scalar_results.safe_push (new_temp);
5035 }
5036 else if (direct_slp_reduc)
5037 {
5038 /* Here we create one vector for each of the REDUC_GROUP_SIZE results,
5039 with the elements for other SLP statements replaced with the
5040 neutral value. We can then do a normal reduction on each vector. */
5041
5042 /* Enforced by vectorizable_reduction. */
5043 gcc_assert (new_phis.length () == 1);
5044 gcc_assert (pow2p_hwi (group_size));
5045
5046 slp_tree orig_phis_slp_node = slp_node_instance->reduc_phis;
5047 vec<stmt_vec_info> orig_phis
5048 = SLP_TREE_SCALAR_STMTS (orig_phis_slp_node);
5049 gimple_seq seq = NULL;
5050
5051 /* Build a vector {0, 1, 2, ...}, with the same number of elements
5052 and the same element size as VECTYPE. */
5053 tree index = build_index_vector (vectype, 0, 1);
5054 tree index_type = TREE_TYPE (index);
5055 tree index_elt_type = TREE_TYPE (index_type);
5056 tree mask_type = truth_type_for (index_type);
5057
5058 /* Create a vector that, for each element, identifies which of
5059 the REDUC_GROUP_SIZE results should use it. */
5060 tree index_mask = build_int_cst (index_elt_type, group_size - 1);
5061 index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
5062 build_vector_from_val (index_type, index_mask));
5063
5064 /* Get a neutral vector value. This is simply a splat of the neutral
5065 scalar value if we have one, otherwise the initial scalar value
5066 is itself a neutral value. */
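/* For example (illustrative), the neutral value is 0 for PLUS_EXPR and
   1 for MULT_EXPR; MIN_EXPR/MAX_EXPR have no such universal value here,
   which is why the per-PHI initial value is used as a fallback below.  */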
5067 tree vector_identity = NULL_TREE;
5068 tree neutral_op = NULL_TREE;
5069 if (slp_node)
5070 {
5071 stmt_vec_info first = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
5072 neutral_op
5073 = neutral_op_for_slp_reduction (slp_node_instance->reduc_phis,
5074 vectype, code, first != NULL);
5075 }
5076 if (neutral_op)
5077 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5078 neutral_op);
5079 for (unsigned int i = 0; i < group_size; ++i)
5080 {
5081 /* If there's no universal neutral value, we can use the
5082 initial scalar value from the original PHI. This is used
5083 for MIN and MAX reduction, for example. */
5084 if (!neutral_op)
5085 {
5086 tree scalar_value
5087 = PHI_ARG_DEF_FROM_EDGE (orig_phis[i]->stmt,
5088 loop_preheader_edge (loop));
5089 scalar_value = gimple_convert (&seq, TREE_TYPE (vectype),
5090 scalar_value);
5091 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5092 scalar_value);
5093 }
5094
5095 /* Calculate the equivalent of:
5096
5097 sel[j] = (index[j] == i);
5098
5099 which selects the elements of NEW_PHI_RESULT that should
5100 be included in the result. */
5101 tree compare_val = build_int_cst (index_elt_type, i);
5102 compare_val = build_vector_from_val (index_type, compare_val);
5103 tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
5104 index, compare_val);
5105
5106 /* Calculate the equivalent of:
5107
5108 vec = sel ? new_phi_result : vector_identity;
5109
5110 VEC is now suitable for a full vector reduction. */
5111 tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
5112 sel, new_phi_result, vector_identity);
5113
5114 /* Do the reduction and convert it to the appropriate type. */
5115 tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
5116 TREE_TYPE (vectype), vec);
5117 scalar = gimple_convert (&seq, scalar_type, scalar);
5118 scalar_results.safe_push (scalar);
5119 }
5120 gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
5121 }
5122 else
5123 {
5124 bool reduce_with_shift;
5125 tree vec_temp;
5126
5127 gcc_assert (slp_reduc || new_phis.length () == 1);
5128
5129 /* See if the target wants to do the final (shift) reduction
5130 in a vector mode of smaller size and first reduce upper/lower
5131 halves against each other. */
5132 enum machine_mode mode1 = mode;
5133 tree stype = TREE_TYPE (vectype);
5134 unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5135 unsigned nunits1 = nunits;
5136 if ((mode1 = targetm.vectorize.split_reduction (mode)) != mode
5137 && new_phis.length () == 1)
5138 {
5139 nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
5140 /* For SLP reductions we have to make sure lanes match up, but
5141 since we're doing an individual-element final reduction, reducing
5142 the vector width here is even more important.
5143 ??? We could also separate lanes with permutes; for the common
5144 case of a power-of-two group size, odd/even extracts would work. */
5145 if (slp_reduc && nunits != nunits1)
5146 {
5147 nunits1 = least_common_multiple (nunits1, group_size);
5148 gcc_assert (exact_log2 (nunits1) != -1 && nunits1 <= nunits);
5149 }
5150 }
5151 if (!slp_reduc
5152 && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
5153 nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
5154
5155 tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
5156 stype, nunits1);
5157 reduce_with_shift = have_whole_vector_shift (mode1);
5158 if (!VECTOR_MODE_P (mode1))
5159 reduce_with_shift = false;
5160 else
5161 {
5162 optab optab = optab_for_tree_code (code, vectype1, optab_default);
5163 if (optab_handler (optab, mode1) == CODE_FOR_nothing)
5164 reduce_with_shift = false;
5165 }
5166
5167 /* First reduce the vector to the desired vector size on which we
5168 should do the shift reduction, by combining upper and lower halves. */
5169 new_temp = new_phi_result;
5170 while (nunits > nunits1)
5171 {
5172 nunits /= 2;
5173 vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
5174 stype, nunits);
5175 unsigned int bitsize = tree_to_uhwi (TYPE_SIZE (vectype1));
5176
5177 /* The target has to make sure we support lowpart/highpart
5178 extraction, either via direct vector extract or through
5179 integer mode punning. */
5180 tree dst1, dst2;
5181 if (convert_optab_handler (vec_extract_optab,
5182 TYPE_MODE (TREE_TYPE (new_temp)),
5183 TYPE_MODE (vectype1))
5184 != CODE_FOR_nothing)
5185 {
5186 /* Extract sub-vectors directly once vec_extract becomes
5187 a conversion optab. */
5188 dst1 = make_ssa_name (vectype1);
5189 epilog_stmt
5190 = gimple_build_assign (dst1, BIT_FIELD_REF,
5191 build3 (BIT_FIELD_REF, vectype1,
5192 new_temp, TYPE_SIZE (vectype1),
5193 bitsize_int (0)));
5194 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5195 dst2 = make_ssa_name (vectype1);
5196 epilog_stmt
5197 = gimple_build_assign (dst2, BIT_FIELD_REF,
5198 build3 (BIT_FIELD_REF, vectype1,
5199 new_temp, TYPE_SIZE (vectype1),
5200 bitsize_int (bitsize)));
5201 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5202 }
5203 else
5204 {
5205 /* Extract via punning to an appropriately sized integer mode
5206 vector. */
5207 tree eltype = build_nonstandard_integer_type (bitsize, 1);
5208 tree etype = build_vector_type (eltype, 2);
5209 gcc_assert (convert_optab_handler (vec_extract_optab,
5210 TYPE_MODE (etype),
5211 TYPE_MODE (eltype))
5212 != CODE_FOR_nothing);
5213 tree tem = make_ssa_name (etype);
5214 epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5215 build1 (VIEW_CONVERT_EXPR,
5216 etype, new_temp));
5217 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5218 new_temp = tem;
5219 tem = make_ssa_name (eltype);
5220 epilog_stmt
5221 = gimple_build_assign (tem, BIT_FIELD_REF,
5222 build3 (BIT_FIELD_REF, eltype,
5223 new_temp, TYPE_SIZE (eltype),
5224 bitsize_int (0)));
5225 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5226 dst1 = make_ssa_name (vectype1);
5227 epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5228 build1 (VIEW_CONVERT_EXPR,
5229 vectype1, tem));
5230 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5231 tem = make_ssa_name (eltype);
5232 epilog_stmt
5233 = gimple_build_assign (tem, BIT_FIELD_REF,
5234 build3 (BIT_FIELD_REF, eltype,
5235 new_temp, TYPE_SIZE (eltype),
5236 bitsize_int (bitsize)));
5237 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5238 dst2 = make_ssa_name (vectype1);
5239 epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5240 build1 (VIEW_CONVERT_EXPR,
5241 vectype1, tem));
5242 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5243 }
5244
5245 new_temp = make_ssa_name (vectype1);
5246 epilog_stmt = gimple_build_assign (new_temp, code, dst1, dst2);
5247 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5248 new_phis[0] = epilog_stmt;
5249 }
5250
5251 if (reduce_with_shift && !slp_reduc)
5252 {
5253 int element_bitsize = tree_to_uhwi (bitsize);
5254 /* Enforced by vectorizable_reduction, which disallows SLP reductions
5255 for variable-length vectors and also requires direct target support
5256 for loop reductions. */
5257 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5258 int nelements = vec_size_in_bits / element_bitsize;
5259 vec_perm_builder sel;
5260 vec_perm_indices indices;
5261
5262 int elt_offset;
5263
5264 tree zero_vec = build_zero_cst (vectype1);
5265 /* Case 2: Create:
5266 for (offset = nelements/2; offset >= 1; offset/=2)
5267 {
5268 Create: va' = vec_shift <va, offset>
5269 Create: va = vop <va, va'>
5270 } */
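/* Worked example (illustrative), PLUS reduction of v = {a, b, c, d}:
   shifting by two elements and adding gives a vector whose leading part
   holds {a+c, b+d}; shifting that by one element and adding leaves
   a+b+c+d in the lane that is finally extracted below.  The exact shift
   direction depends on the target's lane numbering.  */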
5271
5272 tree rhs;
5273
5274 if (dump_enabled_p ())
5275 dump_printf_loc (MSG_NOTE, vect_location,
5276 "Reduce using vector shifts\n");
5277
5278 gimple_seq stmts = NULL;
5279 new_temp = gimple_convert (&stmts, vectype1, new_temp);
5280 for (elt_offset = nelements / 2;
5281 elt_offset >= 1;
5282 elt_offset /= 2)
5283 {
5284 calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
5285 indices.new_vector (sel, 2, nelements);
5286 tree mask = vect_gen_perm_mask_any (vectype1, indices);
5287 new_name = gimple_build (&stmts, VEC_PERM_EXPR, vectype1,
5288 new_temp, zero_vec, mask);
5289 new_temp = gimple_build (&stmts, code,
5290 vectype1, new_name, new_temp);
5291 }
5292 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5293
5294 /* 2.4 Extract the final scalar result. Create:
5295 s_out3 = extract_field <v_out2, bitpos> */
5296
5297 if (dump_enabled_p ())
5298 dump_printf_loc (MSG_NOTE, vect_location,
5299 "extract scalar result\n");
5300
5301 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
5302 bitsize, bitsize_zero_node);
5303 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5304 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5305 gimple_assign_set_lhs (epilog_stmt, new_temp);
5306 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5307 scalar_results.safe_push (new_temp);
5308 }
5309 else
5310 {
5311 /* Case 3: Create:
5312 s = extract_field <v_out2, 0>
5313 for (offset = element_size;
5314 offset < vector_size;
5315 offset += element_size;)
5316 {
5317 Create: s' = extract_field <v_out2, offset>
5318 Create: s = op <s, s'> // For non SLP cases
5319 } */
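/* Worked example (illustrative), v_out2 = {a, b, c, d} with a PLUS
   reduction and no SLP:
     s = a; s = s + b; s = s + c; s = s + d;
   so s holds a+b+c+d.  With SLP the four extracted values are instead
   kept separately in SCALAR_RESULTS.  */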
5320
5321 if (dump_enabled_p ())
5322 dump_printf_loc (MSG_NOTE, vect_location,
5323 "Reduce using scalar code.\n");
5324
5325 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5326 int element_bitsize = tree_to_uhwi (bitsize);
5327 tree compute_type = TREE_TYPE (vectype);
5328 gimple_seq stmts = NULL;
5329 FOR_EACH_VEC_ELT (new_phis, i, new_phi)
5330 {
5331 int bit_offset;
5332 if (gimple_code (new_phi) == GIMPLE_PHI)
5333 vec_temp = PHI_RESULT (new_phi);
5334 else
5335 vec_temp = gimple_assign_lhs (new_phi);
5336 new_temp = gimple_build (&stmts, BIT_FIELD_REF, compute_type,
5337 vec_temp, bitsize, bitsize_zero_node);
5338
5339 /* In SLP we don't need to apply the reduction operation, so we just
5340 collect s' values in SCALAR_RESULTS. */
5341 if (slp_reduc)
5342 scalar_results.safe_push (new_temp);
5343
5344 for (bit_offset = element_bitsize;
5345 bit_offset < vec_size_in_bits;
5346 bit_offset += element_bitsize)
5347 {
5348 tree bitpos = bitsize_int (bit_offset);
5349 new_name = gimple_build (&stmts, BIT_FIELD_REF,
5350 compute_type, vec_temp,
5351 bitsize, bitpos);
5352 if (slp_reduc)
5353 {
5354 /* In SLP we don't need to apply the reduction operation, so
5355 we just collect s' values in SCALAR_RESULTS. */
5356 new_temp = new_name;
5357 scalar_results.safe_push (new_name);
5358 }
5359 else
5360 new_temp = gimple_build (&stmts, code, compute_type,
5361 new_name, new_temp);
5362 }
5363 }
5364
5365 /* The only case where we need to reduce scalar results in SLP is
5366 unrolling. If the size of SCALAR_RESULTS is greater than
5367 REDUC_GROUP_SIZE, we reduce them by combining elements modulo
5368 REDUC_GROUP_SIZE. */
5369 if (slp_reduc)
5370 {
5371 tree res, first_res, new_res;
5372
5373 /* Reduce multiple scalar results in case of SLP unrolling. */
5374 for (j = group_size; scalar_results.iterate (j, &res);
5375 j++)
5376 {
5377 first_res = scalar_results[j % group_size];
5378 new_res = gimple_build (&stmts, code, compute_type,
5379 first_res, res);
5380 scalar_results[j % group_size] = new_res;
5381 }
5382 for (k = 0; k < group_size; k++)
5383 scalar_results[k] = gimple_convert (&stmts, scalar_type,
5384 scalar_results[k]);
5385 }
5386 else
5387 {
5388 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
5389 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
5390 scalar_results.safe_push (new_temp);
5391 }
5392
5393 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5394 }
5395
5396 if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5397 && induc_val)
5398 {
5399 /* Earlier we set the initial value to be a vector of induc_val
5400 values. Check the result and if it is induc_val then replace
5401 it with the original initial value, unless induc_val is
5402 already the same as initial_def. */
5403 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5404 induc_val);
5405
5406 tree tmp = make_ssa_name (new_scalar_dest);
5407 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5408 initial_def, new_temp);
5409 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5410 scalar_results[0] = tmp;
5411 }
5412 }
5413
5414 /* 2.5 Adjust the final result by the initial value of the reduction
5415 variable. (When such adjustment is not needed, then
5416 'adjustment_def' is zero). For example, if code is PLUS we create:
5417 new_temp = loop_exit_def + adjustment_def */
5418
5419 if (adjustment_def)
5420 {
5421 gcc_assert (!slp_reduc);
5422 gimple_seq stmts = NULL;
5423 if (nested_in_vect_loop)
5424 {
5425 new_phi = new_phis[0];
5426 gcc_assert (VECTOR_TYPE_P (TREE_TYPE (adjustment_def)));
5427 adjustment_def = gimple_convert (&stmts, vectype, adjustment_def);
5428 new_temp = gimple_build (&stmts, code, vectype,
5429 PHI_RESULT (new_phi), adjustment_def);
5430 }
5431 else
5432 {
5433 new_temp = scalar_results[0];
5434 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
5435 adjustment_def = gimple_convert (&stmts, scalar_type, adjustment_def);
5436 new_temp = gimple_build (&stmts, code, scalar_type,
5437 new_temp, adjustment_def);
5438 }
5439
5440 epilog_stmt = gimple_seq_last_stmt (stmts);
5441 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5442 if (nested_in_vect_loop)
5443 {
5444 if (!double_reduc)
5445 scalar_results.quick_push (new_temp);
5446 else
5447 scalar_results[0] = new_temp;
5448 }
5449 else
5450 scalar_results[0] = new_temp;
5451
5452 new_phis[0] = epilog_stmt;
5453 }
5454
5455 if (double_reduc)
5456 loop = loop->inner;
5457
5458 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
5459 phis with new adjusted scalar results, i.e., replace use <s_out0>
5460 with use <s_out4>.
5461
5462 Transform:
5463 loop_exit:
5464 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5465 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5466 v_out2 = reduce <v_out1>
5467 s_out3 = extract_field <v_out2, 0>
5468 s_out4 = adjust_result <s_out3>
5469 use <s_out0>
5470 use <s_out0>
5471
5472 into:
5473
5474 loop_exit:
5475 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5476 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5477 v_out2 = reduce <v_out1>
5478 s_out3 = extract_field <v_out2, 0>
5479 s_out4 = adjust_result <s_out3>
5480 use <s_out4>
5481 use <s_out4> */
5482
5483
5484 /* In an SLP reduction chain we reduce the vector results into one vector if
5485 necessary, hence we set REDUC_GROUP_SIZE to 1 here. SCALAR_DEST is the
5486 LHS of the last stmt in the reduction chain, since we are looking for
5487 the loop exit phi node. */
5488 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
5489 {
5490 stmt_vec_info dest_stmt_info
5491 = vect_orig_stmt (SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1]);
5492 scalar_dest = gimple_assign_lhs (dest_stmt_info->stmt);
5493 group_size = 1;
5494 }
5495
5496 /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
5497 case REDUC_GROUP_SIZE is greater than the vectorization factor).
5498 Therefore, we need to match SCALAR_RESULTS with corresponding statements.
5499 The first (REDUC_GROUP_SIZE / number of new vector stmts) scalar results
5500 correspond to the first vector stmt, etc.
5501 (RATIO is equal to (REDUC_GROUP_SIZE / number of new vector stmts)). */
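/* For example (illustrative), with REDUC_GROUP_SIZE == 4 and two new
   vector stmts, RATIO == 2: scalar results 0 and 1 belong to the first
   vector stmt and results 2 and 3 to the second.  */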
5502 if (group_size > new_phis.length ())
5503 gcc_assert (!(group_size % new_phis.length ()));
5504
5505 for (k = 0; k < group_size; k++)
5506 {
5507 if (slp_reduc)
5508 {
5509 stmt_vec_info scalar_stmt_info = SLP_TREE_SCALAR_STMTS (slp_node)[k];
5510
5511 orig_stmt_info = STMT_VINFO_RELATED_STMT (scalar_stmt_info);
5512 /* SLP statements can't participate in patterns. */
5513 gcc_assert (!orig_stmt_info);
5514 scalar_dest = gimple_assign_lhs (scalar_stmt_info->stmt);
5515 }
5516
5517 if (nested_in_vect_loop)
5518 {
5519 if (double_reduc)
5520 loop = outer_loop;
5521 else
5522 gcc_unreachable ();
5523 }
5524
5525 phis.create (3);
5526 /* Find the loop-closed-use at the loop exit of the original scalar
5527 result. (The reduction result is expected to have two immediate uses,
5528 one at the latch block, and one at the loop exit). For double
5529 reductions we are looking for exit phis of the outer loop. */
5530 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5531 {
5532 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
5533 {
5534 if (!is_gimple_debug (USE_STMT (use_p)))
5535 phis.safe_push (USE_STMT (use_p));
5536 }
5537 else
5538 {
5539 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
5540 {
5541 tree phi_res = PHI_RESULT (USE_STMT (use_p));
5542
5543 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
5544 {
5545 if (!flow_bb_inside_loop_p (loop,
5546 gimple_bb (USE_STMT (phi_use_p)))
5547 && !is_gimple_debug (USE_STMT (phi_use_p)))
5548 phis.safe_push (USE_STMT (phi_use_p));
5549 }
5550 }
5551 }
5552 }
5553
5554 FOR_EACH_VEC_ELT (phis, i, exit_phi)
5555 {
5556 /* Replace the uses: */
5557 orig_name = PHI_RESULT (exit_phi);
5558 scalar_result = scalar_results[k];
5559 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5560 {
5561 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
5562 SET_USE (use_p, scalar_result);
5563 update_stmt (use_stmt);
5564 }
5565 }
5566
5567 phis.release ();
5568 }
5569 }
5570
5571 /* Return a vector of type VECTYPE that is equal to the vector select
5572 operation "MASK ? VEC : IDENTITY". Insert the select statements
5573 before GSI. */
5574
5575 static tree
5576 merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
5577 tree vec, tree identity)
5578 {
5579 tree cond = make_temp_ssa_name (vectype, NULL, "cond");
5580 gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
5581 mask, vec, identity);
5582 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5583 return cond;
5584 }
5585
5586 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
5587 order, starting with LHS. Insert the extraction statements before GSI and
5588 associate the new scalar SSA names with variable SCALAR_DEST.
5589 Return the SSA name for the result. */
5590
5591 static tree
5592 vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
5593 tree_code code, tree lhs, tree vector_rhs)
5594 {
5595 tree vectype = TREE_TYPE (vector_rhs);
5596 tree scalar_type = TREE_TYPE (vectype);
5597 tree bitsize = TYPE_SIZE (scalar_type);
5598 unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
5599 unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
5600
5601 for (unsigned HOST_WIDE_INT bit_offset = 0;
5602 bit_offset < vec_size_in_bits;
5603 bit_offset += element_bitsize)
5604 {
5605 tree bitpos = bitsize_int (bit_offset);
5606 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
5607 bitsize, bitpos);
5608
5609 gassign *stmt = gimple_build_assign (scalar_dest, rhs);
5610 rhs = make_ssa_name (scalar_dest, stmt);
5611 gimple_assign_set_lhs (stmt, rhs);
5612 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
5613
5614 stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
5615 tree new_name = make_ssa_name (scalar_dest, stmt);
5616 gimple_assign_set_lhs (stmt, new_name);
5617 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
5618 lhs = new_name;
5619 }
5620 return lhs;
5621 }
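/* For instance (illustrative), with CODE == PLUS_EXPR, LHS == l and
   VECTOR_RHS == {a, b, c, d} the loop above emits

     l1 = l + a; l2 = l1 + b; l3 = l2 + c; l4 = l3 + d;

   and returns l4, preserving the strict left-to-right association that
   an in-order (fold-left) reduction requires.  */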
5622
5623 /* Get a masked internal function equivalent to REDUC_FN. VECTYPE_IN is the
5624 type of the vector input. */
5625
5626 static internal_fn
5627 get_masked_reduction_fn (internal_fn reduc_fn, tree vectype_in)
5628 {
5629 internal_fn mask_reduc_fn;
5630
5631 switch (reduc_fn)
5632 {
5633 case IFN_FOLD_LEFT_PLUS:
5634 mask_reduc_fn = IFN_MASK_FOLD_LEFT_PLUS;
5635 break;
5636
5637 default:
5638 return IFN_LAST;
5639 }
5640
5641 if (direct_internal_fn_supported_p (mask_reduc_fn, vectype_in,
5642 OPTIMIZE_FOR_SPEED))
5643 return mask_reduc_fn;
5644 return IFN_LAST;
5645 }
5646
5647 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION). STMT_INFO is the
5648 statement that sets the live-out value. REDUC_DEF_STMT is the phi
5649 statement. CODE is the operation performed by STMT_INFO and OPS are
5650 its scalar operands. REDUC_INDEX is the index of the operand in
5651 OPS that is set by REDUC_DEF_STMT. REDUC_FN is the function that
5652 implements in-order reduction, or IFN_LAST if we should open-code it.
5653 VECTYPE_IN is the type of the vector input. MASKS specifies the masks
5654 that should be used to control the operation in a fully-masked loop. */
5655
5656 static bool
5657 vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
5658 stmt_vec_info stmt_info,
5659 gimple_stmt_iterator *gsi,
5660 gimple **vec_stmt, slp_tree slp_node,
5661 gimple *reduc_def_stmt,
5662 tree_code code, internal_fn reduc_fn,
5663 tree ops[3], tree vectype_in,
5664 int reduc_index, vec_loop_masks *masks)
5665 {
5666 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5667 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
5668 internal_fn mask_reduc_fn = get_masked_reduction_fn (reduc_fn, vectype_in);
5669
5670 int ncopies;
5671 if (slp_node)
5672 ncopies = 1;
5673 else
5674 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
5675
5676 gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
5677 gcc_assert (ncopies == 1);
5678 gcc_assert (TREE_CODE_LENGTH (code) == binary_op);
5679
5680 if (slp_node)
5681 gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
5682 TYPE_VECTOR_SUBPARTS (vectype_in)));
5683
5684 tree op0 = ops[1 - reduc_index];
5685
5686 int group_size = 1;
5687 stmt_vec_info scalar_dest_def_info;
5688 auto_vec<tree> vec_oprnds0;
5689 if (slp_node)
5690 {
5691 auto_vec<vec<tree> > vec_defs (2);
5692 vect_get_slp_defs (loop_vinfo, slp_node, &vec_defs);
5693 vec_oprnds0.safe_splice (vec_defs[1 - reduc_index]);
5694 vec_defs[0].release ();
5695 vec_defs[1].release ();
5696 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
5697 scalar_dest_def_info = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
5698 }
5699 else
5700 {
5701 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
5702 op0, &vec_oprnds0);
5703 scalar_dest_def_info = stmt_info;
5704 }
5705
5706 tree scalar_dest = gimple_assign_lhs (scalar_dest_def_info->stmt);
5707 tree scalar_type = TREE_TYPE (scalar_dest);
5708 tree reduc_var = gimple_phi_result (reduc_def_stmt);
5709
5710 int vec_num = vec_oprnds0.length ();
5711 gcc_assert (vec_num == 1 || slp_node);
5712 tree vec_elem_type = TREE_TYPE (vectype_out);
5713 gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
5714
5715 tree vector_identity = NULL_TREE;
5716 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
5717 vector_identity = build_zero_cst (vectype_out);
5718
5719 tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
5720 int i;
5721 tree def0;
5722 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
5723 {
5724 gimple *new_stmt;
5725 tree mask = NULL_TREE;
5726 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
5727 mask = vect_get_loop_mask (gsi, masks, vec_num, vectype_in, i);
5728
5729 /* Handle MINUS by adding the negative. */
5730 if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
5731 {
5732 tree negated = make_ssa_name (vectype_out);
5733 new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
5734 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5735 def0 = negated;
5736 }
5737
5738 if (mask && mask_reduc_fn == IFN_LAST)
5739 def0 = merge_with_identity (gsi, mask, vectype_out, def0,
5740 vector_identity);
5741
5742 /* On the first iteration the input is simply the scalar phi
5743 result, and for subsequent iterations it is the output of
5744 the preceding operation. */
5745 if (reduc_fn != IFN_LAST || (mask && mask_reduc_fn != IFN_LAST))
5746 {
5747 if (mask && mask_reduc_fn != IFN_LAST)
5748 new_stmt = gimple_build_call_internal (mask_reduc_fn, 3, reduc_var,
5749 def0, mask);
5750 else
5751 new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var,
5752 def0);
5753 /* For chained SLP reductions the output of the previous reduction
5754 operation serves as the input of the next. For the final statement
5755 the output cannot be a temporary - we reuse the original
5756 scalar destination of the last statement. */
5757 if (i != vec_num - 1)
5758 {
5759 gimple_set_lhs (new_stmt, scalar_dest_var);
5760 reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
5761 gimple_set_lhs (new_stmt, reduc_var);
5762 }
5763 }
5764 else
5765 {
5766 reduc_var = vect_expand_fold_left (gsi, scalar_dest_var, code,
5767 reduc_var, def0);
5768 new_stmt = SSA_NAME_DEF_STMT (reduc_var);
5769 /* Remove the statement, so that we can use the same code paths
5770 as for statements that we've just created. */
5771 gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
5772 gsi_remove (&tmp_gsi, true);
5773 }
5774
5775 if (i == vec_num - 1)
5776 {
5777 gimple_set_lhs (new_stmt, scalar_dest);
5778 vect_finish_replace_stmt (loop_vinfo,
5779 scalar_dest_def_info,
5780 new_stmt);
5781 }
5782 else
5783 vect_finish_stmt_generation (loop_vinfo,
5784 scalar_dest_def_info,
5785 new_stmt, gsi);
5786
5787 if (slp_node)
5788 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
5789 else
5790 {
5791 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
5792 *vec_stmt = new_stmt;
5793 }
5794 }
5795
5796 return true;
5797 }
5798
5799 /* Function is_nonwrapping_integer_induction.
5800
5801 Check if the induction described by STMT_VINFO (which is part of loop
5802 LOOP) is increasing and does not overflow. */
5803
5804 static bool
5805 is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, class loop *loop)
5806 {
5807 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
5808 tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
5809 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
5810 tree lhs_type = TREE_TYPE (gimple_phi_result (phi));
5811 widest_int ni, max_loop_value, lhs_max;
5812 wi::overflow_type overflow = wi::OVF_NONE;
5813
5814 /* Make sure the loop is integer based. */
5815 if (TREE_CODE (base) != INTEGER_CST
5816 || TREE_CODE (step) != INTEGER_CST)
5817 return false;
5818
5819 /* Check that the max size of the loop will not wrap. */
5820
5821 if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
5822 return true;
5823
5824 if (! max_stmt_executions (loop, &ni))
5825 return false;
5826
5827 max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
5828 &overflow);
5829 if (overflow)
5830 return false;
5831
5832 max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
5833 TYPE_SIGN (lhs_type), &overflow);
5834 if (overflow)
5835 return false;
5836
5837 return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
5838 <= TYPE_PRECISION (lhs_type));
5839 }
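/* For instance (illustrative), an unsigned char induction with base 100
   and step 2 in a loop executing at most 100 iterations can reach 300,
   which needs 9 bits; since that exceeds the 8-bit precision of the
   type, the function above returns false.  */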
5840
5841 /* Check if masking can be supported by inserting a conditional expression.
5842 CODE is the code for the operation. COND_FN is the conditional internal
5843 function, if it exists. VECTYPE_IN is the type of the vector input. */
5844 static bool
5845 use_mask_by_cond_expr_p (enum tree_code code, internal_fn cond_fn,
5846 tree vectype_in)
5847 {
5848 if (cond_fn != IFN_LAST
5849 && direct_internal_fn_supported_p (cond_fn, vectype_in,
5850 OPTIMIZE_FOR_SPEED))
5851 return false;
5852
5853 switch (code)
5854 {
5855 case DOT_PROD_EXPR:
5856 case SAD_EXPR:
5857 return true;
5858
5859 default:
5860 return false;
5861 }
5862 }
5863
5864 /* Insert a conditional expression to enable masked vectorization. CODE is the
5865 code for the operation. VOP is the array of operands. MASK is the loop
5866 mask. GSI is a statement iterator used to place the new conditional
5867 expression. */
5868 static void
5869 build_vect_cond_expr (enum tree_code code, tree vop[3], tree mask,
5870 gimple_stmt_iterator *gsi)
5871 {
5872 switch (code)
5873 {
5874 case DOT_PROD_EXPR:
5875 {
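/* Selecting zero for masked-off lanes of the multiplicand means each
   inactive lane contributes a zero product to the accumulator.  */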
5876 tree vectype = TREE_TYPE (vop[1]);
5877 tree zero = build_zero_cst (vectype);
5878 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
5879 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
5880 mask, vop[1], zero);
5881 gsi_insert_before (gsi, select, GSI_SAME_STMT);
5882 vop[1] = masked_op1;
5883 break;
5884 }
5885
5886 case SAD_EXPR:
5887 {
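/* Unlike DOT_PROD above, inactive lanes select VOP[0], so the absolute
   difference for those lanes is zero and they do not affect the SAD
   accumulation.  */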
5888 tree vectype = TREE_TYPE (vop[1]);
5889 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
5890 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
5891 mask, vop[1], vop[0]);
5892 gsi_insert_before (gsi, select, GSI_SAME_STMT);
5893 vop[1] = masked_op1;
5894 break;
5895 }
5896
5897 default:
5898 gcc_unreachable ();
5899 }
5900 }
5901
5902 /* Function vectorizable_reduction.
5903
5904 Check if STMT_INFO performs a reduction operation that can be vectorized.
5905 If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
5906 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
5907 Return true if STMT_INFO is vectorizable in this way.
5908
5909 This function also handles reduction idioms (patterns) that have been
5910 recognized in advance during vect_pattern_recog. In this case, STMT_INFO
5911 may be of this form:
5912 X = pattern_expr (arg0, arg1, ..., X)
5913 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
5914 sequence that had been detected and replaced by the pattern-stmt
5915 (STMT_INFO).
5916
5917 This function also handles reduction of condition expressions, for example:
5918 for (int i = 0; i < N; i++)
5919 if (a[i] < value)
5920 last = a[i];
5921 This is handled by vectorising the loop and creating an additional vector
5922 containing the loop indexes for which "a[i] < value" was true. In the
5923 function epilogue this is reduced to a single max value and then used to
5924 index into the vector of results.
5925
5926 In some cases of reduction patterns, the type of the reduction variable X is
5927 different than the type of the other arguments of STMT_INFO.
5928 In such cases, the vectype that is used when transforming STMT_INFO into
5929 a vector stmt is different than the vectype that is used to determine the
5930 vectorization factor, because it consists of a different number of elements
5931 than the actual number of elements that are being operated upon in parallel.
5932
5933 For example, consider an accumulation of shorts into an int accumulator.
5934 On some targets it's possible to vectorize this pattern operating on 8
5935 shorts at a time (hence, the vectype for purposes of determining the
5936 vectorization factor should be V8HI); on the other hand, the vectype that
5937 is used to create the vector form is actually V4SI (the type of the result).
5938
5939 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
5940 indicates the actual level of parallelism (V8HI in the example), so
5941 that the right vectorization factor is derived. This vectype
5942 corresponds to the type of arguments to the reduction stmt, and should *NOT*
5943 be used to create the vectorized stmt. The right vectype for the vectorized
5944 stmt is obtained from the type of the result X:
5945 get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
5946
5947 This means that, contrary to "regular" reductions (or "regular" stmts in
5948 general), the following equation:
5949 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
5950 does *NOT* necessarily hold for reduction patterns. */
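/* A minimal sketch of such a pattern (illustrative only):

     short s[N];
     int sum = 0;
     for (i = 0; i < N; i++)
       sum += s[i];

   may be recognized as a widening summation; STMT_VINFO_VECTYPE then
   reflects the short inputs (V8HI on a 128-bit target) while the
   vectorized statement itself produces an int vector (V4SI).  */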
5951
5952 bool
5953 vectorizable_reduction (loop_vec_info loop_vinfo,
5954 stmt_vec_info stmt_info, slp_tree slp_node,
5955 slp_instance slp_node_instance,
5956 stmt_vector_for_cost *cost_vec)
5957 {
5958 tree scalar_dest;
5959 tree vectype_in = NULL_TREE;
5960 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5961 enum vect_def_type cond_reduc_dt = vect_unknown_def_type;
5962 stmt_vec_info cond_stmt_vinfo = NULL;
5963 tree scalar_type;
5964 int i;
5965 int ncopies;
5966 bool single_defuse_cycle = false;
5967 bool nested_cycle = false;
5968 bool double_reduc = false;
5969 int vec_num;
5970 tree tem;
5971 tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
5972 tree cond_reduc_val = NULL_TREE;
5973
5974 /* Make sure it was already recognized as a reduction computation. */
5975 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
5976 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def
5977 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
5978 return false;
5979
5980 /* The stmt we store reduction analysis meta on. */
5981 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
5982 reduc_info->is_reduc_info = true;
5983
5984 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
5985 {
5986 if (is_a <gphi *> (stmt_info->stmt))
5987 /* Analysis for double-reduction is done on the outer
5988 loop PHI, nested cycles have no further restrictions. */
5989 STMT_VINFO_TYPE (stmt_info) = cycle_phi_info_type;
5990 else
5991 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
5992 return true;
5993 }
5994
5995 stmt_vec_info orig_stmt_of_analysis = stmt_info;
5996 stmt_vec_info phi_info = stmt_info;
5997 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
5998 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
5999 {
6000 if (!is_a <gphi *> (stmt_info->stmt))
6001 {
6002 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6003 return true;
6004 }
6005 if (slp_node)
6006 {
6007 slp_node_instance->reduc_phis = slp_node;
6008 /* ??? We're leaving slp_node to point to the PHIs; we only
6009 need it to get at the number of vector stmts, which wasn't
6010 yet initialized for the instance root. */
6011 }
6012 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
6013 stmt_info = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (stmt_info));
6014 else /* STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def */
6015 {
6016 use_operand_p use_p;
6017 gimple *use_stmt;
6018 bool res = single_imm_use (gimple_phi_result (stmt_info->stmt),
6019 &use_p, &use_stmt);
6020 gcc_assert (res);
6021 phi_info = loop_vinfo->lookup_stmt (use_stmt);
6022 stmt_info = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info));
6023 }
6024 }
6025
6026 /* PHIs should not participate in patterns. */
6027 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
6028 gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
6029
6030 /* Verify that following REDUC_IDX from the latch def leads us back to the PHI
6031 and compute the reduction chain length. */
6032 tree reduc_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi,
6033 loop_latch_edge (loop));
6034 unsigned reduc_chain_length = 0;
6035 bool only_slp_reduc_chain = true;
6036 stmt_info = NULL;
6037 while (reduc_def != PHI_RESULT (reduc_def_phi))
6038 {
6039 stmt_vec_info def = loop_vinfo->lookup_def (reduc_def);
6040 stmt_vec_info vdef = vect_stmt_to_vectorize (def);
6041 if (STMT_VINFO_REDUC_IDX (vdef) == -1)
6042 {
6043 if (dump_enabled_p ())
6044 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6045 "reduction chain broken by patterns.\n");
6046 return false;
6047 }
6048 if (!REDUC_GROUP_FIRST_ELEMENT (vdef))
6049 only_slp_reduc_chain = false;
6050 /* ??? For epilogue generation live members of the chain need
6051 to point back to the PHI via their original stmt for
6052 info_for_reduction to work. */
6053 if (STMT_VINFO_LIVE_P (vdef))
6054 STMT_VINFO_REDUC_DEF (def) = phi_info;
6055 gassign *assign = dyn_cast <gassign *> (vdef->stmt);
6056 if (!assign)
6057 {
6058 if (dump_enabled_p ())
6059 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6060 "reduction chain includes calls.\n");
6061 return false;
6062 }
6063 if (CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (assign)))
6064 {
6065 if (!tree_nop_conversion_p (TREE_TYPE (gimple_assign_lhs (assign)),
6066 TREE_TYPE (gimple_assign_rhs1 (assign))))
6067 {
6068 if (dump_enabled_p ())
6069 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6070 "conversion in the reduction chain.\n");
6071 return false;
6072 }
6073 }
6074 else if (!stmt_info)
6075 /* First non-conversion stmt. */
6076 stmt_info = vdef;
6077 reduc_def = gimple_op (vdef->stmt, 1 + STMT_VINFO_REDUC_IDX (vdef));
6078 reduc_chain_length++;
6079 }
6080 /* PHIs should not participate in patterns. */
6081 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
6082
6083 if (nested_in_vect_loop_p (loop, stmt_info))
6084 {
6085 loop = loop->inner;
6086 nested_cycle = true;
6087 }
6088
6089 /* STMT_VINFO_REDUC_DEF doesn't point to the first but the last
6090 element. */
6091 if (slp_node && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6092 {
6093 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (stmt_info));
6094 stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
6095 }
6096 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6097 gcc_assert (slp_node
6098 && REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info);
6099
6100 /* 1. Is vectorizable reduction? */
6101 /* Not supportable if the reduction variable is used in the loop, unless
6102 it's a reduction chain. */
6103 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
6104 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6105 return false;
6106
6107 /* Reductions that are not used even in an enclosing outer loop
6108 are expected to be "live" (used outside of the loop). */
6109 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
6110 && !STMT_VINFO_LIVE_P (stmt_info))
6111 return false;
6112
6113 /* 2. Has this been recognized as a reduction pattern?
6114
6115 Check if STMT represents a pattern that has been recognized
6116 in earlier analysis stages. For stmts that represent a pattern,
6117 the STMT_VINFO_RELATED_STMT field records the last stmt in
6118 the original sequence that constitutes the pattern. */
6119
6120 stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
6121 if (orig_stmt_info)
6122 {
6123 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
6124 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
6125 }
6126
6127 /* 3. Check the operands of the operation. The first operands are defined
6128 inside the loop body. The last operand is the reduction variable,
6129 which is defined by the loop-header-phi. */
6130
6131 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6132 STMT_VINFO_REDUC_VECTYPE (reduc_info) = vectype_out;
6133 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
6134 enum tree_code code = gimple_assign_rhs_code (stmt);
6135 bool lane_reduc_code_p
6136 = (code == DOT_PROD_EXPR || code == WIDEN_SUM_EXPR || code == SAD_EXPR);
6137 int op_type = TREE_CODE_LENGTH (code);
6138
6139 scalar_dest = gimple_assign_lhs (stmt);
6140 scalar_type = TREE_TYPE (scalar_dest);
6141 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
6142 && !SCALAR_FLOAT_TYPE_P (scalar_type))
6143 return false;
6144
6145 /* Do not try to vectorize bit-precision reductions. */
6146 if (!type_has_mode_precision_p (scalar_type))
6147 return false;
6148
6149 /* For lane-reducing ops we're reducing the number of reduction PHIs,
6150 which means the only use of the PHI result may be in the lane-reducing operation. */
6151 if (lane_reduc_code_p
6152 && reduc_chain_length != 1
6153 && !only_slp_reduc_chain)
6154 {
6155 if (dump_enabled_p ())
6156 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6157 "lane-reducing reduction with extra stmts.\n");
6158 return false;
6159 }
6160
6161 /* All operands but the last are expected to be defined in the loop.
6162 The last operand is the reduction variable. In case of a nested cycle this
6163 assumption is not true: we use reduc_index to record the index of the
6164 reduction variable. */
6165 /* ??? To get at invariant/constant uses on the SLP node we have to
6166 get to it here, slp_node is still the reduction PHI. */
6167 slp_tree slp_for_stmt_info = NULL;
6168 if (slp_node)
6169 {
6170 slp_for_stmt_info = slp_node_instance->root;
6171 /* And then there's the reduction chain with a conversion ... */
6172 if (SLP_TREE_REPRESENTATIVE (slp_for_stmt_info) != stmt_info)
6173 slp_for_stmt_info = SLP_TREE_CHILDREN (slp_for_stmt_info)[0];
6174 gcc_assert (SLP_TREE_REPRESENTATIVE (slp_for_stmt_info) == stmt_info);
6175 }
6176 slp_tree *slp_op = XALLOCAVEC (slp_tree, op_type);
6177 /* We need to skip an extra operand for COND_EXPRs with embedded
6178 comparison. */
6179 unsigned opno_adjust = 0;
6180 if (code == COND_EXPR
6181 && COMPARISON_CLASS_P (gimple_assign_rhs1 (stmt)))
6182 opno_adjust = 1;
6183 for (i = 0; i < op_type; i++)
6184 {
6185 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
6186 if (i == 0 && code == COND_EXPR)
6187 continue;
6188
6189 stmt_vec_info def_stmt_info;
6190 enum vect_def_type dt;
6191 tree op;
6192 if (!vect_is_simple_use (loop_vinfo, stmt_info, slp_for_stmt_info,
6193 i + opno_adjust, &op, &slp_op[i], &dt, &tem,
6194 &def_stmt_info))
6195 {
6196 if (dump_enabled_p ())
6197 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6198 "use not simple.\n");
6199 return false;
6200 }
6201 if (i == STMT_VINFO_REDUC_IDX (stmt_info))
6202 continue;
6203
6204 /* There should be only one cycle def in the stmt, the one
6205 leading to reduc_def. */
6206 if (VECTORIZABLE_CYCLE_DEF (dt))
6207 return false;
6208
6209 /* To properly compute ncopies we are interested in the widest
6210 non-reduction input type in case we're looking at a widening
6211 accumulation that we later handle in vect_transform_reduction. */
6212 if (lane_reduc_code_p
6213 && tem
6214 && (!vectype_in
6215 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6216 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (tem))))))
6217 vectype_in = tem;
6218
6219 if (code == COND_EXPR)
6220 {
6221 /* Record how the non-reduction-def value of COND_EXPR is defined. */
6222 if (dt == vect_constant_def)
6223 {
6224 cond_reduc_dt = dt;
6225 cond_reduc_val = op;
6226 }
6227 if (dt == vect_induction_def
6228 && def_stmt_info
6229 && is_nonwrapping_integer_induction (def_stmt_info, loop))
6230 {
6231 cond_reduc_dt = dt;
6232 cond_stmt_vinfo = def_stmt_info;
6233 }
6234 }
6235 }
6236 if (!vectype_in)
6237 vectype_in = STMT_VINFO_VECTYPE (phi_info);
6238 STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = vectype_in;
6239
6240 enum vect_reduction_type v_reduc_type = STMT_VINFO_REDUC_TYPE (phi_info);
6241 STMT_VINFO_REDUC_TYPE (reduc_info) = v_reduc_type;
6242 /* If we have a condition reduction, see if we can simplify it further. */
6243 if (v_reduc_type == COND_REDUCTION)
6244 {
6245 if (slp_node)
6246 return false;
6247
6248 /* Fail if the reduction value is used in the condition itself. */
6249 if (STMT_VINFO_REDUC_IDX (stmt_info) == 0)
6250 {
6251 if (dump_enabled_p ())
6252 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6253 "condition depends on previous iteration\n");
6254 return false;
6255 }
6256
6257 if (reduc_chain_length == 1
6258 && direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST,
6259 vectype_in, OPTIMIZE_FOR_SPEED))
6260 {
6261 if (dump_enabled_p ())
6262 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6263 "optimizing condition reduction with"
6264 " FOLD_EXTRACT_LAST.\n");
6265 STMT_VINFO_REDUC_TYPE (reduc_info) = EXTRACT_LAST_REDUCTION;
6266 }
6267 else if (cond_reduc_dt == vect_induction_def)
6268 {
6269 tree base
6270 = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
6271 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
6272
6273 gcc_assert (TREE_CODE (base) == INTEGER_CST
6274 && TREE_CODE (step) == INTEGER_CST);
6275 cond_reduc_val = NULL_TREE;
6276 enum tree_code cond_reduc_op_code = ERROR_MARK;
6277 tree res = PHI_RESULT (STMT_VINFO_STMT (cond_stmt_vinfo));
6278 if (!types_compatible_p (TREE_TYPE (res), TREE_TYPE (base)))
6279 ;
6280 /* Find a suitable value, for MAX_EXPR below base, for MIN_EXPR
6281 above base; punt if base is the minimum value of the type for
6282 MAX_EXPR or maximum value of the type for MIN_EXPR for now. */
6283 else if (tree_int_cst_sgn (step) == -1)
6284 {
6285 cond_reduc_op_code = MIN_EXPR;
6286 if (tree_int_cst_sgn (base) == -1)
6287 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6288 else if (tree_int_cst_lt (base,
6289 TYPE_MAX_VALUE (TREE_TYPE (base))))
6290 cond_reduc_val
6291 = int_const_binop (PLUS_EXPR, base, integer_one_node);
6292 }
6293 else
6294 {
6295 cond_reduc_op_code = MAX_EXPR;
6296 if (tree_int_cst_sgn (base) == 1)
6297 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6298 else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
6299 base))
6300 cond_reduc_val
6301 = int_const_binop (MINUS_EXPR, base, integer_one_node);
6302 }
6303 if (cond_reduc_val)
6304 {
6305 if (dump_enabled_p ())
6306 dump_printf_loc (MSG_NOTE, vect_location,
6307 "condition expression based on "
6308 "integer induction.\n");
6309 STMT_VINFO_REDUC_CODE (reduc_info) = cond_reduc_op_code;
6310 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info)
6311 = cond_reduc_val;
6312 STMT_VINFO_REDUC_TYPE (reduc_info) = INTEGER_INDUC_COND_REDUCTION;
6313 }
6314 }
6315 else if (cond_reduc_dt == vect_constant_def)
6316 {
6317 enum vect_def_type cond_initial_dt;
6318 tree cond_initial_val
6319 = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi, loop_preheader_edge (loop));
6320
6321 gcc_assert (cond_reduc_val != NULL_TREE);
6322 vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt);
6323 if (cond_initial_dt == vect_constant_def
6324 && types_compatible_p (TREE_TYPE (cond_initial_val),
6325 TREE_TYPE (cond_reduc_val)))
6326 {
6327 tree e = fold_binary (LE_EXPR, boolean_type_node,
6328 cond_initial_val, cond_reduc_val);
6329 if (e && (integer_onep (e) || integer_zerop (e)))
6330 {
6331 if (dump_enabled_p ())
6332 dump_printf_loc (MSG_NOTE, vect_location,
6333 "condition expression based on "
6334 "compile time constant.\n");
6335 /* Record reduction code at analysis stage. */
6336 STMT_VINFO_REDUC_CODE (reduc_info)
6337 = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
6338 STMT_VINFO_REDUC_TYPE (reduc_info) = CONST_COND_REDUCTION;
6339 }
6340 }
6341 }
6342 }
6343
6344 if (STMT_VINFO_LIVE_P (phi_info))
6345 return false;
6346
6347 if (slp_node)
6348 ncopies = 1;
6349 else
6350 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6351
6352 gcc_assert (ncopies >= 1);
6353
6354 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
6355
6356 if (nested_cycle)
6357 {
6358 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info)
6359 == vect_double_reduction_def);
6360 double_reduc = true;
6361 }
6362
6363 /* 4.2. Check support for the epilog operation.
6364
6365 If STMT represents a reduction pattern, then the type of the
6366 reduction variable may be different than the type of the rest
6367 of the arguments. For example, consider the case of accumulation
6368 of shorts into an int accumulator; the original code:
6369 S1: int_a = (int) short_a;
6370 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
6371
6372 was replaced with:
6373 STMT: int_acc = widen_sum <short_a, int_acc>
6374
6375 This means that:
6376 1. The tree-code that is used to create the vector operation in the
6377 epilog code (that reduces the partial results) is not the
6378 tree-code of STMT, but is rather the tree-code of the original
6379 stmt from the pattern that STMT is replacing. I.e, in the example
6380 above we want to use 'widen_sum' in the loop, but 'plus' in the
6381 epilog.
6382 2. The type (mode) we use to check available target support
6383 for the vector operation to be created in the *epilog*, is
6384 determined by the type of the reduction variable (in the example
6385 above we'd check this: optab_handler (plus_optab, vect_int_mode)).
6386 However the type (mode) we use to check available target support
6387 for the vector operation to be created *inside the loop*, is
6388 determined by the type of the other arguments to STMT (in the
6389 example we'd check this: optab_handler (widen_sum_optab,
6390 vect_short_mode)).
6391
6392 This is contrary to "regular" reductions, in which the types of all
6393 the arguments are the same as the type of the reduction variable.
6394 For "regular" reductions we can therefore use the same vector type
6395 (and also the same tree-code) when generating the epilog code and
6396 when generating the code inside the loop. */
6397
6398 enum tree_code orig_code = STMT_VINFO_REDUC_CODE (phi_info);
6399 STMT_VINFO_REDUC_CODE (reduc_info) = orig_code;
6400
6401 vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
6402 if (reduction_type == TREE_CODE_REDUCTION)
6403 {
6404 /* Check whether it's ok to change the order of the computation.
6405 Generally, when vectorizing a reduction we change the order of the
6406 computation. This may change the behavior of the program in some
6407 cases, so we need to check that this is ok. One exception is when
6408 vectorizing an outer-loop: the inner-loop is executed sequentially,
6409 and therefore vectorizing reductions in the inner-loop during
6410 outer-loop vectorization is safe. */
6411 if (needs_fold_left_reduction_p (scalar_type, orig_code))
6412 {
6413 /* When vectorizing a reduction chain w/o SLP the reduction PHI
6414 is not directly used in the stmt. */
6415 if (!only_slp_reduc_chain
6416 && reduc_chain_length != 1)
6417 {
6418 if (dump_enabled_p ())
6419 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6420 "in-order reduction chain without SLP.\n");
6421 return false;
6422 }
6423 STMT_VINFO_REDUC_TYPE (reduc_info)
6424 = reduction_type = FOLD_LEFT_REDUCTION;
6425 }
6426 else if (!commutative_tree_code (orig_code)
6427 || !associative_tree_code (orig_code))
6428 {
6429 if (dump_enabled_p ())
6430 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6431 "reduction: not commutative/associative");
6432 return false;
6433 }
6434 }
6435
6436 if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
6437 && ncopies > 1)
6438 {
6439 if (dump_enabled_p ())
6440 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6441 "multiple types in double reduction or condition "
6442 "reduction or fold-left reduction.\n");
6443 return false;
6444 }
6445
6446 internal_fn reduc_fn = IFN_LAST;
6447 if (reduction_type == TREE_CODE_REDUCTION
6448 || reduction_type == FOLD_LEFT_REDUCTION
6449 || reduction_type == INTEGER_INDUC_COND_REDUCTION
6450 || reduction_type == CONST_COND_REDUCTION)
6451 {
6452 if (reduction_type == FOLD_LEFT_REDUCTION
6453 ? fold_left_reduction_fn (orig_code, &reduc_fn)
6454 : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
6455 {
6456 if (reduc_fn != IFN_LAST
6457 && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
6458 OPTIMIZE_FOR_SPEED))
6459 {
6460 if (dump_enabled_p ())
6461 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6462 "reduc op not supported by target.\n");
6463
6464 reduc_fn = IFN_LAST;
6465 }
6466 }
6467 else
6468 {
6469 if (!nested_cycle || double_reduc)
6470 {
6471 if (dump_enabled_p ())
6472 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6473 "no reduc code for scalar code.\n");
6474
6475 return false;
6476 }
6477 }
6478 }
6479 else if (reduction_type == COND_REDUCTION)
6480 {
6481 int scalar_precision
6482 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
6483 cr_index_scalar_type = make_unsigned_type (scalar_precision);
6484 cr_index_vector_type = build_vector_type (cr_index_scalar_type,
6485 nunits_out);
6486
6487 if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
6488 OPTIMIZE_FOR_SPEED))
6489 reduc_fn = IFN_REDUC_MAX;
6490 }
6491 STMT_VINFO_REDUC_FN (reduc_info) = reduc_fn;
6492
6493 if (reduction_type != EXTRACT_LAST_REDUCTION
6494 && (!nested_cycle || double_reduc)
6495 && reduc_fn == IFN_LAST
6496 && !nunits_out.is_constant ())
6497 {
6498 if (dump_enabled_p ())
6499 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6500 "missing target support for reduction on"
6501 " variable-length vectors.\n");
6502 return false;
6503 }
6504
6505 /* For SLP reductions, see if there is a neutral value we can use. */
6506 tree neutral_op = NULL_TREE;
6507 if (slp_node)
6508 neutral_op = neutral_op_for_slp_reduction
6509 (slp_node_instance->reduc_phis, vectype_out, orig_code,
6510 REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL);
6511
6512 if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
6513 {
6514 /* We can't support in-order reductions of code such as this:
6515
6516 for (int i = 0; i < n1; ++i)
6517 for (int j = 0; j < n2; ++j)
6518 l += a[j];
6519
6520 since GCC effectively transforms the loop when vectorizing:
6521
6522 for (int i = 0; i < n1 / VF; ++i)
6523 for (int j = 0; j < n2; ++j)
6524 for (int k = 0; k < VF; ++k)
6525 l += a[j];
6526
6527 which is a reassociation of the original operation. */
6528 if (dump_enabled_p ())
6529 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6530 "in-order double reduction not supported.\n");
6531
6532 return false;
6533 }
6534
6535 if (reduction_type == FOLD_LEFT_REDUCTION
6536 && slp_node
6537 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6538 {
6539 /* We cannot use in-order reductions in this case because there is
6540 an implicit reassociation of the operations involved. */
6541 if (dump_enabled_p ())
6542 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6543 "in-order unchained SLP reductions not supported.\n");
6544 return false;
6545 }
6546
6547 /* For double reductions, and for SLP reductions with a neutral value,
6548 we construct a variable-length initial vector by loading a vector
6549 full of the neutral value and then shift-and-inserting the start
6550 values into the low-numbered elements. */
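  /* As a sketch, for a sum reduction with start value INIT on a
     variable-length vector the initial vector would be built as

       tmp  = { 0, 0, 0, ... }                <- splat of the neutral value
       init = IFN_VEC_SHL_INSERT (tmp, INIT)  <- { INIT, 0, 0, ... }

     which is why the shift-and-insert internal function is required
     below.  */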
6551 if ((double_reduc || neutral_op)
6552 && !nunits_out.is_constant ()
6553 && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
6554 vectype_out, OPTIMIZE_FOR_SPEED))
6555 {
6556 if (dump_enabled_p ())
6557 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6558 "reduction on variable-length vectors requires"
6559 " target support for a vector-shift-and-insert"
6560 " operation.\n");
6561 return false;
6562 }
6563
6564 /* Check extra constraints for variable-length unchained SLP reductions. */
6565 if (STMT_SLP_TYPE (stmt_info)
6566 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
6567 && !nunits_out.is_constant ())
6568 {
6569 /* We checked above that we could build the initial vector when
6570 there's a neutral element value. Check here for the case in
6571 which each SLP statement has its own initial value and in which
6572 that value needs to be repeated for every instance of the
6573 statement within the initial vector. */
6574 unsigned int group_size = SLP_TREE_LANES (slp_node);
6575 if (!neutral_op
6576 && !can_duplicate_and_interleave_p (loop_vinfo, group_size,
6577 TREE_TYPE (vectype_out)))
6578 {
6579 if (dump_enabled_p ())
6580 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6581 "unsupported form of SLP reduction for"
6582 " variable-length vectors: cannot build"
6583 " initial vector.\n");
6584 return false;
6585 }
6586 /* The epilogue code relies on the number of elements being a multiple
6587 of the group size. The duplicate-and-interleave approach to setting
6588 up the initial vector does too. */
6589 if (!multiple_p (nunits_out, group_size))
6590 {
6591 if (dump_enabled_p ())
6592 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6593 "unsupported form of SLP reduction for"
6594 " variable-length vectors: the vector size"
6595 " is not a multiple of the number of results.\n");
6596 return false;
6597 }
6598 }
6599
6600 if (reduction_type == COND_REDUCTION)
6601 {
6602 widest_int ni;
6603
6604 if (! max_loop_iterations (loop, &ni))
6605 {
6606 if (dump_enabled_p ())
6607 dump_printf_loc (MSG_NOTE, vect_location,
6608 "loop count not known, cannot create cond "
6609 "reduction.\n");
6610 return false;
6611 }
6612 /* Convert backedges to iterations. */
6613 ni += 1;
6614
6615 /* The additional index will be the same type as the condition. Check
6616 that the loop count fits into this type less one (the zero slot is
6617 reserved for the case in which there are no matches). */
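      /* Worked example (sketch): for a 16-bit scalar_type the index type
         is a 16-bit unsigned type, so max_index is 65535; a loop whose
         iteration count may reach 65535 is rejected below because index
         zero is reserved for the "no match" case.  */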
6618 tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
6619 if (wi::geu_p (ni, wi::to_widest (max_index)))
6620 {
6621 if (dump_enabled_p ())
6622 dump_printf_loc (MSG_NOTE, vect_location,
6623 "loop size is greater than data size.\n");
6624 return false;
6625 }
6626 }
6627
6628 /* In case the vectorization factor (VF) is bigger than the number
6629 of elements that we can fit in a vectype (nunits), we have to generate
6630 more than one vector stmt - i.e - we need to "unroll" the
6631 vector stmt by a factor VF/nunits. For more details see documentation
6632 in vectorizable_operation. */
6633
6634 /* If the reduction is used in an outer loop we need to generate
6635 VF intermediate results, like so (e.g. for ncopies=2):
6636 r0 = phi (init, r0)
6637 r1 = phi (init, r1)
6638 r0 = x0 + r0;
6639 r1 = x1 + r1;
6640 (i.e. we generate VF results in 2 registers).
6641 In this case we have a separate def-use cycle for each copy, and therefore
6642 for each copy we get the vector def for the reduction variable from the
6643 respective phi node created for this copy.
6644
6645 Otherwise (the reduction is unused in the loop nest), we can combine
6646 together intermediate results, like so (e.g. for ncopies=2):
6647 r = phi (init, r)
6648 r = x0 + r;
6649 r = x1 + r;
6650 (i.e. we generate VF/2 results in a single register).
6651 In this case for each copy we get the vector def for the reduction variable
6652 from the vectorized reduction operation generated in the previous iteration.
6653
6654 This only works when we see both the reduction PHI and its only consumer
6655 in vectorizable_reduction and there are no intermediate stmts
6656 participating. */
6657 if (ncopies > 1
6658 && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
6659 && reduc_chain_length == 1)
6660 single_defuse_cycle = true;
6661
6662 if (single_defuse_cycle || lane_reduc_code_p)
6663 {
6664 gcc_assert (code != COND_EXPR);
6665
6666 /* 4. Supportable by target? */
6667 bool ok = true;
6668
6669 /* 4.1. check support for the operation in the loop */
6670 optab optab = optab_for_tree_code (code, vectype_in, optab_vector);
6671 if (!optab)
6672 {
6673 if (dump_enabled_p ())
6674 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6675 "no optab.\n");
6676 ok = false;
6677 }
6678
6679 machine_mode vec_mode = TYPE_MODE (vectype_in);
6680 if (ok && optab_handler (optab, vec_mode) == CODE_FOR_nothing)
6681 {
6682 if (dump_enabled_p ())
6683 dump_printf (MSG_NOTE, "op not supported by target.\n");
6684 if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
6685 || !vect_worthwhile_without_simd_p (loop_vinfo, code))
6686 ok = false;
6687 else
6688 if (dump_enabled_p ())
6689 dump_printf (MSG_NOTE, "proceeding using word mode.\n");
6690 }
6691
6692 /* Worthwhile without SIMD support? */
6693 if (ok
6694 && !VECTOR_MODE_P (TYPE_MODE (vectype_in))
6695 && !vect_worthwhile_without_simd_p (loop_vinfo, code))
6696 {
6697 if (dump_enabled_p ())
6698 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6699 "not worthwhile without SIMD support.\n");
6700 ok = false;
6701 }
6702
6703 /* lane-reducing operations have to go through vect_transform_reduction.
6704 For the other cases try without the single cycle optimization. */
6705 if (!ok)
6706 {
6707 if (lane_reduc_code_p)
6708 return false;
6709 else
6710 single_defuse_cycle = false;
6711 }
6712 }
6713 STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info) = single_defuse_cycle;
6714
6715 /* If the reduction stmt is one of the patterns that have lane
6716 reduction embedded we cannot handle the case of ! single_defuse_cycle. */
6717 if ((ncopies > 1 && ! single_defuse_cycle)
6718 && lane_reduc_code_p)
6719 {
6720 if (dump_enabled_p ())
6721 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6722 "multi def-use cycle not possible for lane-reducing "
6723 "reduction operation\n");
6724 return false;
6725 }
6726
6727 if (slp_node
6728 && !(!single_defuse_cycle
6729 && code != DOT_PROD_EXPR
6730 && code != WIDEN_SUM_EXPR
6731 && code != SAD_EXPR
6732 && reduction_type != FOLD_LEFT_REDUCTION))
6733 for (i = 0; i < op_type; i++)
6734 if (!vect_maybe_update_slp_op_vectype (slp_op[i], vectype_in))
6735 {
6736 if (dump_enabled_p ())
6737 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6738 "incompatible vector types for invariants\n");
6739 return false;
6740 }
6741
6742 if (slp_node)
6743 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
6744 else
6745 vec_num = 1;
6746
6747 vect_model_reduction_cost (loop_vinfo, stmt_info, reduc_fn,
6748 reduction_type, ncopies, cost_vec);
6749 if (dump_enabled_p ()
6750 && reduction_type == FOLD_LEFT_REDUCTION)
6751 dump_printf_loc (MSG_NOTE, vect_location,
6752 "using an in-order (fold-left) reduction.\n");
6753 STMT_VINFO_TYPE (orig_stmt_of_analysis) = cycle_phi_info_type;
6754 /* All reductions except single defuse-cycle optimized, lane-reducing and
6755 fold-left ones go through their own vectorizable_* routines. */
6756 if (!single_defuse_cycle
6757 && code != DOT_PROD_EXPR
6758 && code != WIDEN_SUM_EXPR
6759 && code != SAD_EXPR
6760 && reduction_type != FOLD_LEFT_REDUCTION)
6761 {
6762 stmt_vec_info tem
6763 = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info));
6764 if (slp_node && REDUC_GROUP_FIRST_ELEMENT (tem))
6765 {
6766 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (tem));
6767 tem = REDUC_GROUP_FIRST_ELEMENT (tem);
6768 }
6769 STMT_VINFO_DEF_TYPE (vect_orig_stmt (tem)) = vect_internal_def;
6770 STMT_VINFO_DEF_TYPE (tem) = vect_internal_def;
6771 }
6772 else if (loop_vinfo && LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
6773 {
6774 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
6775 internal_fn cond_fn = get_conditional_internal_fn (code);
6776
6777 if (reduction_type != FOLD_LEFT_REDUCTION
6778 && !use_mask_by_cond_expr_p (code, cond_fn, vectype_in)
6779 && (cond_fn == IFN_LAST
6780 || !direct_internal_fn_supported_p (cond_fn, vectype_in,
6781 OPTIMIZE_FOR_SPEED)))
6782 {
6783 if (dump_enabled_p ())
6784 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6785 "can't use a fully-masked loop because no"
6786 " conditional operation is available.\n");
6787 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
6788 }
6789 else if (reduction_type == FOLD_LEFT_REDUCTION
6790 && reduc_fn == IFN_LAST
6791 && !expand_vec_cond_expr_p (vectype_in,
6792 truth_type_for (vectype_in),
6793 SSA_NAME))
6794 {
6795 if (dump_enabled_p ())
6796 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6797 "can't use a fully-masked loop because no"
6798 " conditional operation is available.\n");
6799 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
6800 }
6801 else
6802 vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
6803 vectype_in, NULL);
6804 }
6805 return true;
6806 }
6807
6808 /* Transform the definition stmt STMT_INFO of a reduction PHI backedge
6809 value. */
6810
6811 bool
6812 vect_transform_reduction (loop_vec_info loop_vinfo,
6813 stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
6814 gimple **vec_stmt, slp_tree slp_node)
6815 {
6816 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6817 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6818 int i;
6819 int ncopies;
6820 int vec_num;
6821
6822 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
6823 gcc_assert (reduc_info->is_reduc_info);
6824
6825 if (nested_in_vect_loop_p (loop, stmt_info))
6826 {
6827 loop = loop->inner;
6828 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info) == vect_double_reduction_def);
6829 }
6830
6831 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
6832 enum tree_code code = gimple_assign_rhs_code (stmt);
6833 int op_type = TREE_CODE_LENGTH (code);
6834
6835 /* Flatten RHS. */
6836 tree ops[3];
6837 switch (get_gimple_rhs_class (code))
6838 {
6839 case GIMPLE_TERNARY_RHS:
6840 ops[2] = gimple_assign_rhs3 (stmt);
6841 /* Fall thru. */
6842 case GIMPLE_BINARY_RHS:
6843 ops[0] = gimple_assign_rhs1 (stmt);
6844 ops[1] = gimple_assign_rhs2 (stmt);
6845 break;
6846 default:
6847 gcc_unreachable ();
6848 }
6849
6850 /* All uses but the last are expected to be defined in the loop.
6851 The last use is the reduction variable. In case of nested cycle this
6852 assumption is not true: we use reduc_index to record the index of the
6853 reduction variable. */
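  /* For example (sketch): for a lane-reducing statement such as
       sum_1 = DOT_PROD_EXPR <a_2, b_3, sum_0>
     the reduction variable sum_0 is operand 2, so reduc_index == 2,
     while for a plain  sum_1 = x_2 + sum_0  it is operand 1.  */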
6854 stmt_vec_info phi_info = STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info));
6855 gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
6856 int reduc_index = STMT_VINFO_REDUC_IDX (stmt_info);
6857 tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
6858
6859 if (slp_node)
6860 {
6861 ncopies = 1;
6862 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
6863 }
6864 else
6865 {
6866 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6867 vec_num = 1;
6868 }
6869
6870 internal_fn cond_fn = get_conditional_internal_fn (code);
6871 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
6872 bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);
6873
6874 /* Transform. */
6875 tree new_temp = NULL_TREE;
6876 auto_vec<tree> vec_oprnds0;
6877 auto_vec<tree> vec_oprnds1;
6878 auto_vec<tree> vec_oprnds2;
6879 tree def0;
6880
6881 if (dump_enabled_p ())
6882 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
6883
6884 /* FORNOW: Multiple types are not supported for condition. */
6885 if (code == COND_EXPR)
6886 gcc_assert (ncopies == 1);
6887
6888 bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
6889
6890 vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
6891 if (reduction_type == FOLD_LEFT_REDUCTION)
6892 {
6893 internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
6894 return vectorize_fold_left_reduction
6895 (loop_vinfo, stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi, code,
6896 reduc_fn, ops, vectype_in, reduc_index, masks);
6897 }
6898
6899 bool single_defuse_cycle = STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info);
6900 gcc_assert (single_defuse_cycle
6901 || code == DOT_PROD_EXPR
6902 || code == WIDEN_SUM_EXPR
6903 || code == SAD_EXPR);
6904
6905 /* Create the destination vector */
6906 tree scalar_dest = gimple_assign_lhs (stmt);
6907 tree vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
6908
6909 vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies,
6910 single_defuse_cycle && reduc_index == 0
6911 ? NULL_TREE : ops[0], &vec_oprnds0,
6912 single_defuse_cycle && reduc_index == 1
6913 ? NULL_TREE : ops[1], &vec_oprnds1,
6914 op_type == ternary_op
6915 && !(single_defuse_cycle && reduc_index == 2)
6916 ? ops[2] : NULL_TREE, &vec_oprnds2);
6917 if (single_defuse_cycle)
6918 {
6919 gcc_assert (!slp_node);
6920 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
6921 ops[reduc_index],
6922 reduc_index == 0 ? &vec_oprnds0
6923 : (reduc_index == 1 ? &vec_oprnds1
6924 : &vec_oprnds2));
6925 }
6926
6927 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
6928 {
6929 gimple *new_stmt;
6930 tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
6931 if (masked_loop_p && !mask_by_cond_expr)
6932 {
6933 /* Make sure that the reduction accumulator is vop[0]. */
6934 if (reduc_index == 1)
6935 {
6936 gcc_assert (commutative_tree_code (code));
6937 std::swap (vop[0], vop[1]);
6938 }
6939 tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
6940 vectype_in, i);
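	  /* Sketch of what is emitted here for a sum reduction:
	       new_temp = .COND_ADD (mask, accumulator, element, accumulator)
	     so active lanes add in the new element while inactive lanes
	     pass the accumulator through unchanged.  */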
6941 gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
6942 vop[0], vop[1], vop[0]);
6943 new_temp = make_ssa_name (vec_dest, call);
6944 gimple_call_set_lhs (call, new_temp);
6945 gimple_call_set_nothrow (call, true);
6946 vect_finish_stmt_generation (loop_vinfo, stmt_info, call, gsi);
6947 new_stmt = call;
6948 }
6949 else
6950 {
6951 if (op_type == ternary_op)
6952 vop[2] = vec_oprnds2[i];
6953
6954 if (masked_loop_p && mask_by_cond_expr)
6955 {
6956 tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
6957 vectype_in, i);
6958 build_vect_cond_expr (code, vop, mask, gsi);
6959 }
6960
6961 new_stmt = gimple_build_assign (vec_dest, code,
6962 vop[0], vop[1], vop[2]);
6963 new_temp = make_ssa_name (vec_dest, new_stmt);
6964 gimple_assign_set_lhs (new_stmt, new_temp);
6965 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
6966 }
6967
6968 if (slp_node)
6969 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
6970 else if (single_defuse_cycle
6971 && i < ncopies - 1)
6972 {
6973 if (reduc_index == 0)
6974 vec_oprnds0.safe_push (gimple_get_lhs (new_stmt));
6975 else if (reduc_index == 1)
6976 vec_oprnds1.safe_push (gimple_get_lhs (new_stmt));
6977 else if (reduc_index == 2)
6978 vec_oprnds2.safe_push (gimple_get_lhs (new_stmt));
6979 }
6980 else
6981 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
6982 }
6983
6984 if (!slp_node)
6985 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
6986
6987 return true;
6988 }
6989
6990 /* Transform phase of a cycle PHI. */
6991
6992 bool
6993 vect_transform_cycle_phi (loop_vec_info loop_vinfo,
6994 stmt_vec_info stmt_info, gimple **vec_stmt,
6995 slp_tree slp_node, slp_instance slp_node_instance)
6996 {
6997 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6998 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6999 int i;
7000 int ncopies;
7001 int j;
7002 bool nested_cycle = false;
7003 int vec_num;
7004
7005 if (nested_in_vect_loop_p (loop, stmt_info))
7006 {
7007 loop = loop->inner;
7008 nested_cycle = true;
7009 }
7010
7011 stmt_vec_info reduc_stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
7012 reduc_stmt_info = vect_stmt_to_vectorize (reduc_stmt_info);
7013 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
7014 gcc_assert (reduc_info->is_reduc_info);
7015
7016 if (STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION
7017 || STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION)
7018 /* Leave the scalar phi in place. */
7019 return true;
7020
7021 tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
7022 /* For a nested cycle we do not fill the above. */
7023 if (!vectype_in)
7024 vectype_in = STMT_VINFO_VECTYPE (stmt_info);
7025 gcc_assert (vectype_in);
7026
7027 if (slp_node)
7028 {
7029 /* The size vect_schedule_slp_instance computes is off for us. */
7030 vec_num = vect_get_num_vectors (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
7031 * SLP_TREE_LANES (slp_node), vectype_in);
7032 ncopies = 1;
7033 }
7034 else
7035 {
7036 vec_num = 1;
7037 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7038 }
7039
7040 /* Check whether we should use a single PHI node and accumulate
7041 vectors to one before the backedge. */
7042 if (STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info))
7043 ncopies = 1;
7044
7045 /* Create the destination vector */
7046 gphi *phi = as_a <gphi *> (stmt_info->stmt);
7047 tree vec_dest = vect_create_destination_var (gimple_phi_result (phi),
7048 vectype_out);
7049
7050 /* Get the loop-entry arguments. */
7051 tree vec_initial_def;
7052 auto_vec<tree> vec_initial_defs;
7053 if (slp_node)
7054 {
7055 vec_initial_defs.reserve (vec_num);
7056 gcc_assert (slp_node == slp_node_instance->reduc_phis);
7057 stmt_vec_info first = REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info);
7058 tree neutral_op
7059 = neutral_op_for_slp_reduction (slp_node, vectype_out,
7060 STMT_VINFO_REDUC_CODE (reduc_info),
7061 first != NULL);
7062 get_initial_defs_for_reduction (loop_vinfo, slp_node_instance->reduc_phis,
7063 &vec_initial_defs, vec_num,
7064 first != NULL, neutral_op);
7065 }
7066 else
7067 {
7068 /* Get at the scalar def before the loop, that defines the initial
7069 value of the reduction variable. */
7070 tree initial_def = PHI_ARG_DEF_FROM_EDGE (phi,
7071 loop_preheader_edge (loop));
7072 /* Optimize: if initial_def is for REDUC_MAX smaller than the base
7073 and we can't use zero for induc_val, use initial_def. Similarly
7074 for REDUC_MIN and initial_def larger than the base. */
7075 if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
7076 {
7077 tree induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
7078 if (TREE_CODE (initial_def) == INTEGER_CST
7079 && !integer_zerop (induc_val)
7080 && ((STMT_VINFO_REDUC_CODE (reduc_info) == MAX_EXPR
7081 && tree_int_cst_lt (initial_def, induc_val))
7082 || (STMT_VINFO_REDUC_CODE (reduc_info) == MIN_EXPR
7083 && tree_int_cst_lt (induc_val, initial_def))))
7084 {
7085 induc_val = initial_def;
7086 /* Communicate we used the initial_def to epilogue
7087 generation. */
7088 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info) = NULL_TREE;
7089 }
7090 vec_initial_def = build_vector_from_val (vectype_out, induc_val);
7091 vec_initial_defs.create (ncopies);
7092 for (i = 0; i < ncopies; ++i)
7093 vec_initial_defs.quick_push (vec_initial_def);
7094 }
7095 else if (nested_cycle)
7096 {
7097 /* Do not use an adjustment def as that case is not supported
7098 correctly if ncopies is not one. */
7099 vect_get_vec_defs_for_operand (loop_vinfo, reduc_stmt_info,
7100 ncopies, initial_def,
7101 &vec_initial_defs);
7102 }
7103 else
7104 {
7105 tree adjustment_def = NULL_TREE;
7106 tree *adjustment_defp = &adjustment_def;
7107 enum tree_code code = STMT_VINFO_REDUC_CODE (reduc_info);
7108 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
7109 adjustment_defp = NULL;
7110 vec_initial_def
7111 = get_initial_def_for_reduction (loop_vinfo, reduc_stmt_info, code,
7112 initial_def, adjustment_defp);
7113 STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info) = adjustment_def;
7114 vec_initial_defs.create (ncopies);
7115 for (i = 0; i < ncopies; ++i)
7116 vec_initial_defs.quick_push (vec_initial_def);
7117 }
7118 }
7119
7120 /* Generate the reduction PHIs upfront. */
7121 for (i = 0; i < vec_num; i++)
7122 {
7123 tree vec_init_def = vec_initial_defs[i];
7124 for (j = 0; j < ncopies; j++)
7125 {
7126 /* Create the reduction-phi that defines the reduction
7127 operand. */
7128 gphi *new_phi = create_phi_node (vec_dest, loop->header);
7129
7130 /* Set the loop-entry arg of the reduction-phi. */
7131 if (j != 0 && nested_cycle)
7132 vec_init_def = vec_initial_defs[j];
7133 add_phi_arg (new_phi, vec_init_def, loop_preheader_edge (loop),
7134 UNKNOWN_LOCATION);
7135
7136 /* The loop-latch arg is set in epilogue processing. */
7137
7138 if (slp_node)
7139 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi);
7140 else
7141 {
7142 if (j == 0)
7143 *vec_stmt = new_phi;
7144 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
7145 }
7146 }
7147 }
7148
7149 return true;
7150 }
7151
7152 /* Vectorizes LC PHIs. */
7153
7154 bool
7155 vectorizable_lc_phi (loop_vec_info loop_vinfo,
7156 stmt_vec_info stmt_info, gimple **vec_stmt,
7157 slp_tree slp_node)
7158 {
7159 if (!loop_vinfo
7160 || !is_a <gphi *> (stmt_info->stmt)
7161 || gimple_phi_num_args (stmt_info->stmt) != 1)
7162 return false;
7163
7164 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def
7165 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
7166 return false;
7167
7168 if (!vec_stmt) /* transformation not required. */
7169 {
7170 STMT_VINFO_TYPE (stmt_info) = lc_phi_info_type;
7171 return true;
7172 }
7173
7174 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7175 tree scalar_dest = gimple_phi_result (stmt_info->stmt);
7176 basic_block bb = gimple_bb (stmt_info->stmt);
7177 edge e = single_pred_edge (bb);
7178 tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
7179 auto_vec<tree> vec_oprnds;
7180 vect_get_vec_defs (loop_vinfo, stmt_info, slp_node,
7181 !slp_node ? vect_get_num_copies (loop_vinfo, vectype) : 1,
7182 gimple_phi_arg_def (stmt_info->stmt, 0), &vec_oprnds);
7183 for (unsigned i = 0; i < vec_oprnds.length (); i++)
7184 {
7185 /* Create the vectorized LC PHI node. */
7186 gphi *new_phi = create_phi_node (vec_dest, bb);
7187 add_phi_arg (new_phi, vec_oprnds[i], e, UNKNOWN_LOCATION);
7188 if (slp_node)
7189 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi);
7190 else
7191 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
7192 }
7193 if (!slp_node)
7194 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
7195
7196 return true;
7197 }
7198
7199
7200 /* Function vect_min_worthwhile_factor.
7201
7202 For a loop where we could vectorize the operation indicated by CODE,
7203 return the minimum vectorization factor that makes it worthwhile
7204 to use generic vectors. */
7205 static unsigned int
7206 vect_min_worthwhile_factor (enum tree_code code)
7207 {
7208 switch (code)
7209 {
7210 case PLUS_EXPR:
7211 case MINUS_EXPR:
7212 case NEGATE_EXPR:
7213 return 4;
7214
7215 case BIT_AND_EXPR:
7216 case BIT_IOR_EXPR:
7217 case BIT_XOR_EXPR:
7218 case BIT_NOT_EXPR:
7219 return 2;
7220
7221 default:
7222 return INT_MAX;
7223 }
7224 }
7225
7226 /* Return true if VINFO indicates we are doing loop vectorization and if
7227 it is worth decomposing CODE operations into scalar operations for
7228 that loop's vectorization factor. */
7229
7230 bool
7231 vect_worthwhile_without_simd_p (vec_info *vinfo, tree_code code)
7232 {
7233 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
7234 unsigned HOST_WIDE_INT value;
7235 return (loop_vinfo
7236 && LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&value)
7237 && value >= vect_min_worthwhile_factor (code));
7238 }
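/* Usage sketch:
     vect_worthwhile_without_simd_p (loop_vinfo, PLUS_EXPR)
   holds only when the vectorization factor is a compile-time constant of
   at least 4, whereas bitwise codes such as BIT_AND_EXPR already qualify
   at a factor of 2; callers use this to decide whether emulating the
   operation in word mode is still profitable.  */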
7239
7240 /* Function vectorizable_induction
7241
7242 Check if STMT_INFO performs an induction computation that can be vectorized.
7243 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
7244 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
7245 Return true if STMT_INFO is vectorizable in this way. */
7246
7247 bool
7248 vectorizable_induction (loop_vec_info loop_vinfo,
7249 stmt_vec_info stmt_info,
7250 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
7251 gimple **vec_stmt, slp_tree slp_node,
7252 stmt_vector_for_cost *cost_vec)
7253 {
7254 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7255 unsigned ncopies;
7256 bool nested_in_vect_loop = false;
7257 class loop *iv_loop;
7258 tree vec_def;
7259 edge pe = loop_preheader_edge (loop);
7260 basic_block new_bb;
7261 tree new_vec, vec_init, vec_step, t;
7262 tree new_name;
7263 gimple *new_stmt;
7264 gphi *induction_phi;
7265 tree induc_def, vec_dest;
7266 tree init_expr, step_expr;
7267 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
7268 unsigned i;
7269 tree expr;
7270 gimple_seq stmts;
7271 gimple_stmt_iterator si;
7272
7273 gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
7274 if (!phi)
7275 return false;
7276
7277 if (!STMT_VINFO_RELEVANT_P (stmt_info))
7278 return false;
7279
7280 /* Make sure it was recognized as induction computation. */
7281 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
7282 return false;
7283
7284 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7285 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7286
7287 if (slp_node)
7288 ncopies = 1;
7289 else
7290 ncopies = vect_get_num_copies (loop_vinfo, vectype);
7291 gcc_assert (ncopies >= 1);
7292
7293 /* FORNOW. These restrictions should be relaxed. */
7294 if (nested_in_vect_loop_p (loop, stmt_info))
7295 {
7296 imm_use_iterator imm_iter;
7297 use_operand_p use_p;
7298 gimple *exit_phi;
7299 edge latch_e;
7300 tree loop_arg;
7301
7302 if (ncopies > 1)
7303 {
7304 if (dump_enabled_p ())
7305 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7306 "multiple types in nested loop.\n");
7307 return false;
7308 }
7309
7310 /* FORNOW: outer loop induction with SLP not supported. */
7311 if (STMT_SLP_TYPE (stmt_info))
7312 return false;
7313
7314 exit_phi = NULL;
7315 latch_e = loop_latch_edge (loop->inner);
7316 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7317 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7318 {
7319 gimple *use_stmt = USE_STMT (use_p);
7320 if (is_gimple_debug (use_stmt))
7321 continue;
7322
7323 if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
7324 {
7325 exit_phi = use_stmt;
7326 break;
7327 }
7328 }
7329 if (exit_phi)
7330 {
7331 stmt_vec_info exit_phi_vinfo = loop_vinfo->lookup_stmt (exit_phi);
7332 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
7333 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
7334 {
7335 if (dump_enabled_p ())
7336 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7337 "inner-loop induction only used outside "
7338 "of the outer vectorized loop.\n");
7339 return false;
7340 }
7341 }
7342
7343 nested_in_vect_loop = true;
7344 iv_loop = loop->inner;
7345 }
7346 else
7347 iv_loop = loop;
7348 gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
7349
7350 if (slp_node && !nunits.is_constant ())
7351 {
7352 /* The current SLP code creates the initial value element-by-element. */
7353 if (dump_enabled_p ())
7354 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7355 "SLP induction not supported for variable-length"
7356 " vectors.\n");
7357 return false;
7358 }
7359
7360 if (!vec_stmt) /* transformation not required. */
7361 {
7362 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
7363 DUMP_VECT_SCOPE ("vectorizable_induction");
7364 vect_model_induction_cost (stmt_info, ncopies, cost_vec);
7365 return true;
7366 }
7367
7368 /* Transform. */
7369
7370 /* Compute a vector variable, initialized with the first VF values of
7371 the induction variable. E.g., for an iv with IV_PHI='X' and
7372 evolution S, for a vector of 4 units, we want to compute:
7373 [X, X + S, X + 2*S, X + 3*S]. */
7374
7375 if (dump_enabled_p ())
7376 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
7377
7378 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
7379 gcc_assert (step_expr != NULL_TREE);
7380 tree step_vectype = get_same_sized_vectype (TREE_TYPE (step_expr), vectype);
7381
7382 pe = loop_preheader_edge (iv_loop);
7383 init_expr = PHI_ARG_DEF_FROM_EDGE (phi,
7384 loop_preheader_edge (iv_loop));
7385
7386 stmts = NULL;
7387 if (!nested_in_vect_loop)
7388 {
7389 /* Convert the initial value to the IV update type. */
7390 tree new_type = TREE_TYPE (step_expr);
7391 init_expr = gimple_convert (&stmts, new_type, init_expr);
7392
7393 /* If we are using the loop mask to "peel" for alignment then we need
7394 to adjust the start value here. */
7395 tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
7396 if (skip_niters != NULL_TREE)
7397 {
7398 if (FLOAT_TYPE_P (vectype))
7399 skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
7400 skip_niters);
7401 else
7402 skip_niters = gimple_convert (&stmts, new_type, skip_niters);
7403 tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
7404 skip_niters, step_expr);
7405 init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
7406 init_expr, skip_step);
7407 }
7408 }
7409
7410 if (stmts)
7411 {
7412 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7413 gcc_assert (!new_bb);
7414 }
7415
7416 /* Find the first insertion point in the BB. */
7417 basic_block bb = gimple_bb (phi);
7418 si = gsi_after_labels (bb);
7419
7420 /* For SLP induction we have to generate several IVs as for example
7421 with group size 3 we need [i, i, i, i + S] [i + S, i + S, i + 2*S, i + 2*S]
7422 [i + 2*S, i + 3*S, i + 3*S, i + 3*S]. The step is the same uniform
7423 [VF*S, VF*S, VF*S, VF*S] for all. */
7424 if (slp_node)
7425 {
7426 /* Enforced above. */
7427 unsigned int const_nunits = nunits.to_constant ();
7428
7429 /* Generate [VF*S, VF*S, ... ]. */
7430 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7431 {
7432 expr = build_int_cst (integer_type_node, vf);
7433 expr = fold_convert (TREE_TYPE (step_expr), expr);
7434 }
7435 else
7436 expr = build_int_cst (TREE_TYPE (step_expr), vf);
7437 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7438 expr, step_expr);
7439 if (! CONSTANT_CLASS_P (new_name))
7440 new_name = vect_init_vector (loop_vinfo, stmt_info, new_name,
7441 TREE_TYPE (step_expr), NULL);
7442 new_vec = build_vector_from_val (step_vectype, new_name);
7443 vec_step = vect_init_vector (loop_vinfo, stmt_info,
7444 new_vec, step_vectype, NULL);
7445
7446 /* Now generate the IVs. */
7447 unsigned group_size = SLP_TREE_LANES (slp_node);
7448 unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7449 unsigned elts = const_nunits * nvects;
7450 /* Compute the number of distinct IVs we need. First reduce
7451 group_size if it is a multiple of const_nunits so we get
7452 one IV for a group_size of 4 but const_nunits 2. */
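          /* Worked examples (sketch): group_size == 4 with const_nunits == 2
             gives group_sizep == 2 and nivs == 1, i.e. a single IV; and
             group_size == 3 with const_nunits == 4 gives nivs == 3, matching
             the three vectors in the group-size-3 example above.  */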
7453 unsigned group_sizep = group_size;
7454 if (group_sizep % const_nunits == 0)
7455 group_sizep = group_sizep / const_nunits;
7456 unsigned nivs = least_common_multiple (group_sizep,
7457 const_nunits) / const_nunits;
7458 gcc_assert (elts % group_size == 0);
7459 tree elt = init_expr;
7460 unsigned ivn;
7461 for (ivn = 0; ivn < nivs; ++ivn)
7462 {
7463 tree_vector_builder elts (step_vectype, const_nunits, 1);
7464 stmts = NULL;
7465 for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
7466 {
7467 if (ivn*const_nunits + eltn >= group_size
7468 && (ivn * const_nunits + eltn) % group_size == 0)
7469 elt = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (elt),
7470 elt, step_expr);
7471 elts.quick_push (elt);
7472 }
7473 vec_init = gimple_build_vector (&stmts, &elts);
7474 vec_init = gimple_convert (&stmts, vectype, vec_init);
7475 if (stmts)
7476 {
7477 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7478 gcc_assert (!new_bb);
7479 }
7480
7481 /* Create the induction-phi that defines the induction-operand. */
7482 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7483 induction_phi = create_phi_node (vec_dest, iv_loop->header);
7484 induc_def = PHI_RESULT (induction_phi);
7485
7486 /* Create the iv update inside the loop */
7487 gimple_seq stmts = NULL;
7488 vec_def = gimple_convert (&stmts, step_vectype, induc_def);
7489 vec_def = gimple_build (&stmts,
7490 PLUS_EXPR, step_vectype, vec_def, vec_step);
7491 vec_def = gimple_convert (&stmts, vectype, vec_def);
7492 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
7493
7494 /* Set the arguments of the phi node: */
7495 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7496 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7497 UNKNOWN_LOCATION);
7498
7499 SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi);
7500 }
7501 /* Fill up to the number of vectors we need for the whole group. */
7502 nivs = least_common_multiple (group_size,
7503 const_nunits) / const_nunits;
7504 for (; ivn < nivs; ++ivn)
7505 SLP_TREE_VEC_STMTS (slp_node)
7506 .quick_push (SLP_TREE_VEC_STMTS (slp_node)[0]);
7507
7508 /* Re-use IVs when we can. */
7509 if (ivn < nvects)
7510 {
7511 unsigned vfp
7512 = least_common_multiple (group_size, const_nunits) / group_size;
7513 /* Generate [VF'*S, VF'*S, ... ]. */
7514 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7515 {
7516 expr = build_int_cst (integer_type_node, vfp);
7517 expr = fold_convert (TREE_TYPE (step_expr), expr);
7518 }
7519 else
7520 expr = build_int_cst (TREE_TYPE (step_expr), vfp);
7521 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7522 expr, step_expr);
7523 if (! CONSTANT_CLASS_P (new_name))
7524 new_name = vect_init_vector (loop_vinfo, stmt_info, new_name,
7525 TREE_TYPE (step_expr), NULL);
7526 new_vec = build_vector_from_val (step_vectype, new_name);
7527 vec_step = vect_init_vector (loop_vinfo, stmt_info, new_vec,
7528 step_vectype, NULL);
7529 for (; ivn < nvects; ++ivn)
7530 {
7531 gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs];
7532 tree def;
7533 if (gimple_code (iv) == GIMPLE_PHI)
7534 def = gimple_phi_result (iv);
7535 else
7536 def = gimple_assign_lhs (iv);
7537 gimple_seq stmts = NULL;
7538 def = gimple_convert (&stmts, step_vectype, def);
7539 def = gimple_build (&stmts,
7540 PLUS_EXPR, step_vectype, def, vec_step);
7541 def = gimple_convert (&stmts, vectype, def);
7542 if (gimple_code (iv) == GIMPLE_PHI)
7543 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
7544 else
7545 {
7546 gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
7547 gsi_insert_seq_after (&tgsi, stmts, GSI_CONTINUE_LINKING);
7548 }
7549 SLP_TREE_VEC_STMTS (slp_node)
7550 .quick_push (SSA_NAME_DEF_STMT (def));
7551 }
7552 }
7553
7554 return true;
7555 }
7556
7557 /* Create the vector that holds the initial_value of the induction. */
7558 if (nested_in_vect_loop)
7559 {
7560 /* iv_loop is nested in the loop to be vectorized. init_expr had already
7561 been created during vectorization of previous stmts. We obtain it
7562 from the STMT_VINFO_VEC_STMT of the defining stmt. */
7563 auto_vec<tree> vec_inits;
7564 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
7565 init_expr, &vec_inits);
7566 vec_init = vec_inits[0];
7567 /* If the initial value is not of proper type, convert it. */
7568 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
7569 {
7570 new_stmt
7571 = gimple_build_assign (vect_get_new_ssa_name (vectype,
7572 vect_simple_var,
7573 "vec_iv_"),
7574 VIEW_CONVERT_EXPR,
7575 build1 (VIEW_CONVERT_EXPR, vectype,
7576 vec_init));
7577 vec_init = gimple_assign_lhs (new_stmt);
7578 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
7579 new_stmt);
7580 gcc_assert (!new_bb);
7581 }
7582 }
7583 else
7584 {
7585 /* iv_loop is the loop to be vectorized. Create:
7586 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
7587 stmts = NULL;
7588 new_name = gimple_convert (&stmts, TREE_TYPE (step_expr), init_expr);
7589
7590 unsigned HOST_WIDE_INT const_nunits;
7591 if (nunits.is_constant (&const_nunits))
7592 {
7593 tree_vector_builder elts (step_vectype, const_nunits, 1);
7594 elts.quick_push (new_name);
7595 for (i = 1; i < const_nunits; i++)
7596 {
7597 /* Create: new_name_i = new_name + step_expr */
7598 new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
7599 new_name, step_expr);
7600 elts.quick_push (new_name);
7601 }
7602 /* Create a vector from [new_name_0, new_name_1, ...,
7603 new_name_nunits-1] */
7604 vec_init = gimple_build_vector (&stmts, &elts);
7605 }
7606 else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
7607 /* Build the initial value directly from a VEC_SERIES_EXPR. */
7608 vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, step_vectype,
7609 new_name, step_expr);
7610 else
7611 {
7612 /* Build:
7613 [base, base, base, ...]
7614 + (vectype) [0, 1, 2, ...] * [step, step, step, ...]. */
7615 gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
7616 gcc_assert (flag_associative_math);
7617 tree index = build_index_vector (step_vectype, 0, 1);
7618 tree base_vec = gimple_build_vector_from_val (&stmts, step_vectype,
7619 new_name);
7620 tree step_vec = gimple_build_vector_from_val (&stmts, step_vectype,
7621 step_expr);
7622 vec_init = gimple_build (&stmts, FLOAT_EXPR, step_vectype, index);
7623 vec_init = gimple_build (&stmts, MULT_EXPR, step_vectype,
7624 vec_init, step_vec);
7625 vec_init = gimple_build (&stmts, PLUS_EXPR, step_vectype,
7626 vec_init, base_vec);
7627 }
7628 vec_init = gimple_convert (&stmts, vectype, vec_init);
7629
7630 if (stmts)
7631 {
7632 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7633 gcc_assert (!new_bb);
7634 }
7635 }
7636
7637
7638 /* Create the vector that holds the step of the induction. */
7639 if (nested_in_vect_loop)
7640 /* iv_loop is nested in the loop to be vectorized. Generate:
7641 vec_step = [S, S, S, S] */
7642 new_name = step_expr;
7643 else
7644 {
7645 /* iv_loop is the loop to be vectorized. Generate:
7646 vec_step = [VF*S, VF*S, VF*S, VF*S] */
7647 gimple_seq seq = NULL;
7648 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7649 {
7650 expr = build_int_cst (integer_type_node, vf);
7651 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
7652 }
7653 else
7654 expr = build_int_cst (TREE_TYPE (step_expr), vf);
7655 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
7656 expr, step_expr);
7657 if (seq)
7658 {
7659 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
7660 gcc_assert (!new_bb);
7661 }
7662 }
7663
7664 t = unshare_expr (new_name);
7665 gcc_assert (CONSTANT_CLASS_P (new_name)
7666 || TREE_CODE (new_name) == SSA_NAME);
7667 new_vec = build_vector_from_val (step_vectype, t);
7668 vec_step = vect_init_vector (loop_vinfo, stmt_info,
7669 new_vec, step_vectype, NULL);
7670
7671
7672 /* Create the following def-use cycle:
7673 loop prolog:
7674 vec_init = ...
7675 vec_step = ...
7676 loop:
7677 vec_iv = PHI <vec_init, vec_loop>
7678 ...
7679 STMT
7680 ...
7681 vec_loop = vec_iv + vec_step; */
7682
7683 /* Create the induction-phi that defines the induction-operand. */
7684 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7685 induction_phi = create_phi_node (vec_dest, iv_loop->header);
7686 induc_def = PHI_RESULT (induction_phi);
7687
7688 /* Create the iv update inside the loop */
7689 stmts = NULL;
7690 vec_def = gimple_convert (&stmts, step_vectype, induc_def);
7691 vec_def = gimple_build (&stmts, PLUS_EXPR, step_vectype, vec_def, vec_step);
7692 vec_def = gimple_convert (&stmts, vectype, vec_def);
7693 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
7694 new_stmt = SSA_NAME_DEF_STMT (vec_def);
7695
7696 /* Set the arguments of the phi node: */
7697 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7698 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7699 UNKNOWN_LOCATION);
7700
7701 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (induction_phi);
7702 *vec_stmt = induction_phi;
7703
7704 /* In case the vectorization factor (VF) is bigger than the number
7705 of elements that we can fit in a vectype (nunits), we have to generate
7706 more than one vector stmt - i.e - we need to "unroll" the
7707 vector stmt by a factor VF/nunits. For more details see documentation
7708 in vectorizable_operation. */
7709
7710 if (ncopies > 1)
7711 {
7712 gimple_seq seq = NULL;
7713 /* FORNOW. This restriction should be relaxed. */
7714 gcc_assert (!nested_in_vect_loop);
7715
7716 /* Create the vector that holds the step of the induction. */
7717 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7718 {
7719 expr = build_int_cst (integer_type_node, nunits);
7720 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
7721 }
7722 else
7723 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
7724 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
7725 expr, step_expr);
7726 if (seq)
7727 {
7728 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
7729 gcc_assert (!new_bb);
7730 }
7731
7732 t = unshare_expr (new_name);
7733 gcc_assert (CONSTANT_CLASS_P (new_name)
7734 || TREE_CODE (new_name) == SSA_NAME);
7735 new_vec = build_vector_from_val (step_vectype, t);
7736 vec_step = vect_init_vector (loop_vinfo, stmt_info,
7737 new_vec, step_vectype, NULL);
7738
7739 vec_def = induc_def;
7740 for (i = 1; i < ncopies; i++)
7741 {
7742 /* vec_i = vec_prev + vec_step */
7743 gimple_seq stmts = NULL;
7744 vec_def = gimple_convert (&stmts, step_vectype, vec_def);
7745 vec_def = gimple_build (&stmts,
7746 PLUS_EXPR, step_vectype, vec_def, vec_step);
7747 vec_def = gimple_convert (&stmts, vectype, vec_def);
7748
7749 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
7750 new_stmt = SSA_NAME_DEF_STMT (vec_def);
7751 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
7752 }
7753 }
7754
7755 if (dump_enabled_p ())
7756 dump_printf_loc (MSG_NOTE, vect_location,
7757 "transform induction: created def-use cycle: %G%G",
7758 induction_phi, SSA_NAME_DEF_STMT (vec_def));
7759
7760 return true;
7761 }
7762
7763 /* Function vectorizable_live_operation.
7764
7765 STMT_INFO computes a value that is used outside the loop. Check if
7766 it can be supported. */
7767
7768 bool
7769 vectorizable_live_operation (loop_vec_info loop_vinfo,
7770 stmt_vec_info stmt_info,
7771 gimple_stmt_iterator *gsi,
7772 slp_tree slp_node, slp_instance slp_node_instance,
7773 int slp_index, bool vec_stmt_p,
7774 stmt_vector_for_cost *)
7775 {
7776 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7777 imm_use_iterator imm_iter;
7778 tree lhs, lhs_type, bitsize, vec_bitsize;
7779 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7780 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7781 int ncopies;
7782 gimple *use_stmt;
7783 auto_vec<tree> vec_oprnds;
7784 int vec_entry = 0;
7785 poly_uint64 vec_index = 0;
7786
7787 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
7788
7789 /* If a stmt of a reduction is live, vectorize it via
7790 vect_create_epilog_for_reduction. vectorizable_reduction assessed
7791 validity so just trigger the transform here. */
7792 if (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info)))
7793 {
7794 if (!vec_stmt_p)
7795 return true;
7796 if (slp_node)
7797 {
7798 /* For reduction chains the meta-info is attached to
7799 the group leader. */
7800 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7801 stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
7802 /* For SLP reductions we vectorize the epilogue for
7803 all involved stmts together. */
7804 else if (slp_index != 0)
7805 return true;
7806 else
7807 /* For SLP reductions the meta-info is attached to
7808 the representative. */
7809 stmt_info = SLP_TREE_REPRESENTATIVE (slp_node);
7810 }
7811 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
7812 gcc_assert (reduc_info->is_reduc_info);
7813 if (STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION
7814 || STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION)
7815 return true;
7816 vect_create_epilog_for_reduction (loop_vinfo, stmt_info, slp_node,
7817 slp_node_instance);
7818 return true;
7819 }
7820
7821 /* FORNOW. CHECKME. */
7822 if (nested_in_vect_loop_p (loop, stmt_info))
7823 return false;
7824
7825 /* If STMT is not relevant and it is a simple assignment and its inputs are
7826 invariant then it can remain in place, unvectorized. The original last
7827 scalar value that it computes will be used. */
7828 if (!STMT_VINFO_RELEVANT_P (stmt_info))
7829 {
7830 gcc_assert (is_simple_and_all_uses_invariant (stmt_info, loop_vinfo));
7831 if (dump_enabled_p ())
7832 dump_printf_loc (MSG_NOTE, vect_location,
7833 "statement is simple and uses invariant. Leaving in "
7834 "place.\n");
7835 return true;
7836 }
7837
7838 if (slp_node)
7839 ncopies = 1;
7840 else
7841 ncopies = vect_get_num_copies (loop_vinfo, vectype);
7842
7843 if (slp_node)
7844 {
7845 gcc_assert (slp_index >= 0);
7846
7847 int num_scalar = SLP_TREE_LANES (slp_node);
7848 int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7849
7850 /* Get the last occurrence of the scalar index from the concatenation of
7851 all the slp vectors. Calculate which slp vector it is and the index
7852 within. */
7853 poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
7854
7855 /* Calculate which vector contains the result, and which lane of
7856 that vector we need. */
7857 if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
7858 {
7859 if (dump_enabled_p ())
7860 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7861 "Cannot determine which vector holds the"
7862 " final result.\n");
7863 return false;
7864 }
7865 }
7866
7867 if (!vec_stmt_p)
7868 {
7869 /* No transformation required. */
7870 if (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
7871 {
7872 if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
7873 OPTIMIZE_FOR_SPEED))
7874 {
7875 if (dump_enabled_p ())
7876 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7877 "can't use a fully-masked loop because "
7878 "the target doesn't support extract last "
7879 "reduction.\n");
7880 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7881 }
7882 else if (slp_node)
7883 {
7884 if (dump_enabled_p ())
7885 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7886 "can't use a fully-masked loop because an "
7887 "SLP statement is live after the loop.\n");
7888 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7889 }
7890 else if (ncopies > 1)
7891 {
7892 if (dump_enabled_p ())
7893 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7894 "can't use a fully-masked loop because"
7895 " ncopies is greater than 1.\n");
7896 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7897 }
7898 else
7899 {
7900 gcc_assert (ncopies == 1 && !slp_node);
7901 vect_record_loop_mask (loop_vinfo,
7902 &LOOP_VINFO_MASKS (loop_vinfo),
7903 1, vectype, NULL);
7904 }
7905 }
7906 return true;
7907 }
7908
7909 /* Use the lhs of the original scalar statement. */
7910 gimple *stmt = vect_orig_stmt (stmt_info)->stmt;
7911
7912 lhs = (is_a <gphi *> (stmt)) ? gimple_phi_result (stmt)
7913 : gimple_get_lhs (stmt);
7914 lhs_type = TREE_TYPE (lhs);
7915
7916 bitsize = vector_element_bits_tree (vectype);
7917 vec_bitsize = TYPE_SIZE (vectype);
7918
7919 /* Get the vectorized lhs of STMT and the lane to use (counted in bits). */
7920 tree vec_lhs, bitstart;
7921 if (slp_node)
7922 {
7923 gcc_assert (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
7924
7925 /* Get the correct slp vectorized stmt. */
7926 gimple *vec_stmt = SLP_TREE_VEC_STMTS (slp_node)[vec_entry];
7927 if (gphi *phi = dyn_cast <gphi *> (vec_stmt))
7928 vec_lhs = gimple_phi_result (phi);
7929 else
7930 vec_lhs = gimple_get_lhs (vec_stmt);
7931
7932 /* Get entry to use. */
7933 bitstart = bitsize_int (vec_index);
7934 bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
7935 }
7936 else
7937 {
7938 /* For multiple copies, get the last copy. */
7939 vec_lhs = gimple_get_lhs (STMT_VINFO_VEC_STMTS (stmt_info).last ());
7940
7941 /* Get the last lane in the vector. */
7942 bitstart = int_const_binop (MINUS_EXPR, vec_bitsize, bitsize);
7943 }
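  /* Worked example (sketch): for a V4SI result (bitsize == 32,
     vec_bitsize == 128) and an SLP lane whose last occurrence is element 2
     of vector 1, we get vec_entry == 1 and bitstart == 2 * 32 == 64;
     in the non-SLP case we simply take the last lane, bitstart == 96.  */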
7944
7945 /* To ensure the VEC_LHS for lane extraction stmts satisfies the loop-closed
7946 PHI requirement, insert one phi node for it. It looks like:
7947 loop;
7948 BB:
7949 # lhs' = PHI <lhs>
7950 ==>
7951 loop;
7952 BB:
7953 # vec_lhs' = PHI <vec_lhs>
7954 new_tree = lane_extract <vec_lhs', ...>;
7955 lhs' = new_tree; */
7956
7957 basic_block exit_bb = single_exit (loop)->dest;
7958 gcc_assert (single_pred_p (exit_bb));
7959
7960 tree vec_lhs_phi = copy_ssa_name (vec_lhs);
7961 gimple *phi = create_phi_node (vec_lhs_phi, exit_bb);
7962 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, vec_lhs);
7963
7964 gimple_seq stmts = NULL;
7965 tree new_tree;
7966 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
7967 {
7968 /* Emit:
7969
7970 SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
7971
7972 where VEC_LHS is the vectorized live-out result and MASK is
7973 the loop mask for the final iteration. */
7974 gcc_assert (ncopies == 1 && !slp_node);
7975 tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
7976 tree mask = vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo), 1,
7977 vectype, 0);
7978 tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
7979 mask, vec_lhs_phi);
7980
7981 /* Convert the extracted vector element to the required scalar type. */
7982 new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
7983 }
7984 else
7985 {
7986 tree bftype = TREE_TYPE (vectype);
7987 if (VECTOR_BOOLEAN_TYPE_P (vectype))
7988 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
7989 new_tree = build3 (BIT_FIELD_REF, bftype, vec_lhs_phi, bitsize, bitstart);
7990 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
7991 &stmts, true, NULL_TREE);
7992 }
7993
7994 if (stmts)
7995 {
7996 gimple_stmt_iterator exit_gsi = gsi_after_labels (exit_bb);
7997 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
7998
7999 /* Remove existing phi from lhs and create one copy from new_tree. */
8000 tree lhs_phi = NULL_TREE;
8001 gimple_stmt_iterator gsi;
8002 for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi); gsi_next (&gsi))
8003 {
8004 gimple *phi = gsi_stmt (gsi);
8005 if (gimple_phi_arg_def (phi, 0) == lhs)
8006 {
8007 remove_phi_node (&gsi, false);
8008 lhs_phi = gimple_phi_result (phi);
8009 gimple *copy = gimple_build_assign (lhs_phi, new_tree);
8010 gsi_insert_before (&exit_gsi, copy, GSI_SAME_STMT);
8011 break;
8012 }
8013 }
8014 }
8015
8016 /* Replace use of lhs with newly computed result. If the use stmt is a
8017 single arg PHI, just replace all uses of PHI result. It's necessary
8018 because the lcssa PHI defining lhs may come before the newly inserted stmt. */
8019 use_operand_p use_p;
8020 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
8021 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
8022 && !is_gimple_debug (use_stmt))
8023 {
8024 if (gimple_code (use_stmt) == GIMPLE_PHI
8025 && gimple_phi_num_args (use_stmt) == 1)
8026 {
8027 replace_uses_by (gimple_phi_result (use_stmt), new_tree);
8028 }
8029 else
8030 {
8031 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
8032 SET_USE (use_p, new_tree);
8033 }
8034 update_stmt (use_stmt);
8035 }
8036
8037 return true;
8038 }
8039
8040 /* Kill any debug uses outside LOOP of SSA names defined in STMT_INFO. */
8041
8042 static void
8043 vect_loop_kill_debug_uses (class loop *loop, stmt_vec_info stmt_info)
8044 {
8045 ssa_op_iter op_iter;
8046 imm_use_iterator imm_iter;
8047 def_operand_p def_p;
8048 gimple *ustmt;
8049
8050 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt_info->stmt, op_iter, SSA_OP_DEF)
8051 {
8052 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
8053 {
8054 basic_block bb;
8055
8056 if (!is_gimple_debug (ustmt))
8057 continue;
8058
8059 bb = gimple_bb (ustmt);
8060
8061 if (!flow_bb_inside_loop_p (loop, bb))
8062 {
8063 if (gimple_debug_bind_p (ustmt))
8064 {
8065 if (dump_enabled_p ())
8066 dump_printf_loc (MSG_NOTE, vect_location,
8067 "killing debug use\n");
8068
8069 gimple_debug_bind_reset_value (ustmt);
8070 update_stmt (ustmt);
8071 }
8072 else
8073 gcc_unreachable ();
8074 }
8075 }
8076 }
8077 }
8078
8079 /* Given loop represented by LOOP_VINFO, return true if computation of
8080 LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
8081 otherwise. */
8082
8083 static bool
8084 loop_niters_no_overflow (loop_vec_info loop_vinfo)
8085 {
8086 /* Constant case. */
8087 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
8088 {
8089 tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
8090 tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
8091
8092 gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
8093 gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
8094 if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
8095 return true;
8096 }
8097
8098 widest_int max;
8099 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8100 /* Check the upper bound of loop niters. */
8101 if (get_max_loop_iterations (loop, &max))
8102 {
8103 tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
8104 signop sgn = TYPE_SIGN (type);
8105 widest_int type_max = widest_int::from (wi::max_value (type), sgn);
8106 if (max < type_max)
8107 return true;
8108 }
8109 return false;
8110 }
8111
8112 /* Return a mask type with half the number of elements as OLD_TYPE,
8113 given that it should have mode NEW_MODE. */
8114
8115 tree
8116 vect_halve_mask_nunits (tree old_type, machine_mode new_mode)
8117 {
8118 poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (old_type), 2);
8119 return build_truth_vector_type_for_mode (nunits, new_mode);
8120 }
8121
8122 /* Return a mask type with twice as many elements as OLD_TYPE,
8123 given that it should have mode NEW_MODE. */
8124
8125 tree
8126 vect_double_mask_nunits (tree old_type, machine_mode new_mode)
8127 {
8128 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (old_type) * 2;
8129 return build_truth_vector_type_for_mode (nunits, new_mode);
8130 }
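/* Illustrative sketch (the type and mode names are placeholders, not
   taken from the original source): given a mask type OLD with
   TYPE_VECTOR_SUBPARTS (OLD) == 16, a mode HALF_MODE suitable for
   8-element boolean vectors and a mode FULL_MODE for 16-element ones,

       tree half = vect_halve_mask_nunits (old, HALF_MODE);
       tree full = vect_double_mask_nunits (half, FULL_MODE);

   yields an 8-element and then again a 16-element mask type.  Only the
   element count is derived from the input type; the mode always comes
   from the second argument.  */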
8131
8132 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
8133 contain a sequence of NVECTORS masks that each control a vector of type
8134 VECTYPE. If SCALAR_MASK is nonnull, the fully-masked loop would AND
8135 these vector masks with the vector version of SCALAR_MASK. */
8136
8137 void
8138 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
8139 unsigned int nvectors, tree vectype, tree scalar_mask)
8140 {
8141 gcc_assert (nvectors != 0);
8142 if (masks->length () < nvectors)
8143 masks->safe_grow_cleared (nvectors);
8144 rgroup_masks *rgm = &(*masks)[nvectors - 1];
8145 /* The number of scalars per iteration and the number of vectors are
8146 both compile-time constants. */
8147 unsigned int nscalars_per_iter
8148 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
8149 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
8150
8151 if (scalar_mask)
8152 {
8153 scalar_cond_masked_key cond (scalar_mask, nvectors);
8154 loop_vinfo->scalar_cond_masked_set.add (cond);
8155 }
8156
8157 if (rgm->max_nscalars_per_iter < nscalars_per_iter)
8158 {
8159 rgm->max_nscalars_per_iter = nscalars_per_iter;
8160 rgm->mask_type = truth_type_for (vectype);
8161 }
8162 }
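/* Worked example for the computation above (the numbers are
   illustrative): with LOOP_VINFO_VECT_FACTOR == 16, an rgroup of
   NVECTORS == 2 masks controlling vectors of a VECTYPE with 8 elements
   gives

       nscalars_per_iter = 2 * 8 / 16 = 1,

   whereas 2 vectors of a 16-element VECTYPE give 2 * 16 / 16 = 2.  The
   rgroup records the maximum of these values and derives its mask type
   from the vectype that produced that maximum.  */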
8163
8164 /* Given a complete set of masks MASKS, extract mask number INDEX
8165 for an rgroup that operates on NVECTORS vectors of type VECTYPE,
8166 where 0 <= INDEX < NVECTORS. Insert any set-up statements before GSI.
8167
8168 See the comment above vec_loop_masks for more details about the mask
8169 arrangement. */
8170
8171 tree
8172 vect_get_loop_mask (gimple_stmt_iterator *gsi, vec_loop_masks *masks,
8173 unsigned int nvectors, tree vectype, unsigned int index)
8174 {
8175 rgroup_masks *rgm = &(*masks)[nvectors - 1];
8176 tree mask_type = rgm->mask_type;
8177
8178 /* Populate the rgroup's mask array, if this is the first time we've
8179 used it. */
8180 if (rgm->masks.is_empty ())
8181 {
8182 rgm->masks.safe_grow_cleared (nvectors);
8183 for (unsigned int i = 0; i < nvectors; ++i)
8184 {
8185 tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
8186 /* Provide a dummy definition until the real one is available. */
8187 SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
8188 rgm->masks[i] = mask;
8189 }
8190 }
8191
8192 tree mask = rgm->masks[index];
8193 if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
8194 TYPE_VECTOR_SUBPARTS (vectype)))
8195 {
8196 /* A loop mask for data type X can be reused for data type Y
8197 if X has N times more elements than Y and if Y's elements
8198 are N times bigger than X's. In this case each sequence
8199 of N elements in the loop mask will be all-zero or all-one.
8200 We can then view-convert the mask so that each sequence of
8201 N elements is replaced by a single element. */
8202 gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
8203 TYPE_VECTOR_SUBPARTS (vectype)));
8204 gimple_seq seq = NULL;
8205 mask_type = truth_type_for (vectype);
8206 mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
8207 if (seq)
8208 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
8209 }
8210 return mask;
8211 }
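/* Worked example for the reuse case above (the element counts are
   illustrative): if the recorded MASK_TYPE has 16 elements but VECTYPE
   has only 8 elements that are each twice as wide, then every group of
   N = 2 adjacent mask elements is known to be all-zero or all-one, so
   the 16-element mask can be VIEW_CONVERT_EXPRed to an 8-element mask
   without changing which scalar iterations are active.  */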
8212
8213 /* Scale profiling counters by estimation for LOOP which is vectorized
8214 by factor VF. */
8215
8216 static void
8217 scale_profile_for_vect_loop (class loop *loop, unsigned vf)
8218 {
8219 edge preheader = loop_preheader_edge (loop);
8220 /* Reduce loop iterations by the vectorization factor. */
8221 gcov_type new_est_niter = niter_for_unrolled_loop (loop, vf);
8222 profile_count freq_h = loop->header->count, freq_e = preheader->count ();
8223
8224 if (freq_h.nonzero_p ())
8225 {
8226 profile_probability p;
8227
8228 /* Avoid dropping loop body profile counter to 0 because of zero count
8229 in loop's preheader. */
8230 if (!(freq_e == profile_count::zero ()))
8231 freq_e = freq_e.force_nonzero ();
8232 p = freq_e.apply_scale (new_est_niter + 1, 1).probability_in (freq_h);
8233 scale_loop_frequencies (loop, p);
8234 }
8235
8236 edge exit_e = single_exit (loop);
8237 exit_e->probability = profile_probability::always ()
8238 .apply_scale (1, new_est_niter + 1);
8239
8240 edge exit_l = single_pred_edge (loop->latch);
8241 profile_probability prob = exit_l->probability;
8242 exit_l->probability = exit_e->probability.invert ();
8243 if (prob.initialized_p () && exit_l->probability.initialized_p ())
8244 scale_bbs_frequencies (&loop->latch, 1, exit_l->probability / prob);
8245 }
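/* Worked example for the scaling above (the counts are illustrative): if
   the scalar loop was expected to iterate about 100 times and VF == 4,
   niter_for_unrolled_loop yields a new estimate of roughly 25.  The exit
   edge probability then becomes 1 / (NEW_EST_NITER + 1), i.e. about
   1/26, and the body is rescaled so that the header count corresponds to
   the preheader count multiplied by NEW_EST_NITER + 1 executions.  */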
8246
8247 /* Vectorize STMT_INFO if relevant, inserting any new instructions before GSI.
8248 When vectorizing STMT_INFO as a store, set *SEEN_STORE to its
8249 stmt_vec_info. */
8250
8251 static void
8252 vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
8253 gimple_stmt_iterator *gsi, stmt_vec_info *seen_store)
8254 {
8255 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8256 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8257
8258 if (dump_enabled_p ())
8259 dump_printf_loc (MSG_NOTE, vect_location,
8260 "------>vectorizing statement: %G", stmt_info->stmt);
8261
8262 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8263 vect_loop_kill_debug_uses (loop, stmt_info);
8264
8265 if (!STMT_VINFO_RELEVANT_P (stmt_info)
8266 && !STMT_VINFO_LIVE_P (stmt_info))
8267 return;
8268
8269 if (STMT_VINFO_VECTYPE (stmt_info))
8270 {
8271 poly_uint64 nunits
8272 = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
8273 if (!STMT_SLP_TYPE (stmt_info)
8274 && maybe_ne (nunits, vf)
8275 && dump_enabled_p ())
8276 /* For SLP, VF is set according to the unrolling factor and not
8277 to the vector size, hence for SLP this print is not valid. */
8278 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8279 }
8280
8281 /* Pure SLP statements have already been vectorized. We still need
8282 to apply loop vectorization to hybrid SLP statements. */
8283 if (PURE_SLP_STMT (stmt_info))
8284 return;
8285
8286 if (dump_enabled_p ())
8287 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
8288
8289 if (vect_transform_stmt (loop_vinfo, stmt_info, gsi, NULL, NULL))
8290 *seen_store = stmt_info;
8291 }
8292
8293 /* Helper function to pass to simplify_replace_tree to enable replacing trees
8294 in the hash_map with their corresponding values. */
8295
8296 static tree
8297 find_in_mapping (tree t, void *context)
8298 {
8299 hash_map<tree,tree>* mapping = (hash_map<tree, tree>*) context;
8300
8301 tree *value = mapping->get (t);
8302 return value ? *value : t;
8303 }
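/* For illustration (the SSA names are invented): with MAPPING containing
   the single entry _5 -> _105, a call such as

       op = simplify_replace_tree (op, NULL_TREE, NULL_TREE,
                                   &find_in_mapping, &mapping, false);

   rewrites an expression like _5 + 4 into _105 + 4, while trees with no
   entry in the map, e.g. the constant 4, are returned unchanged.  */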
8304
8305 /* Update EPILOGUE's loop_vec_info. EPILOGUE was constructed as a copy of the
8306 original loop that has now been vectorized.
8307
8308 The inits of the data_references need to be advanced with the number of
8309 iterations of the main loop. This has been computed in vect_do_peeling and
8310 is stored in parameter ADVANCE. We first restore the data_references'
8311 initial offsets with the values recorded in ORIG_DRS_INIT.
8312
8313 Since the loop_vec_info of this EPILOGUE was constructed for the original
8314 loop, its stmt_vec_infos all point to the original statements. These need
8315 to be updated to point to their corresponding copies as well as the SSA_NAMES
8316 in their PATTERN_DEF_SEQs and RELATED_STMTs.
8317
8318 The data_references' connections also need to be updated: their
8319 corresponding dr_vec_infos need to be reconnected to the EPILOGUE's
8320 stmt_vec_infos, their statements need to point to their corresponding copy,
8321 and if they are gather loads or scatter stores their references need to be
8322 updated to point to the corresponding copies. Finally we set
8323 'base_misaligned' to false, as we have already peeled for alignment in the
8324 prologue of the main loop. */
8325
8326 static void
8327 update_epilogue_loop_vinfo (class loop *epilogue, tree advance)
8328 {
8329 loop_vec_info epilogue_vinfo = loop_vec_info_for_loop (epilogue);
8330 auto_vec<gimple *> stmt_worklist;
8331 hash_map<tree,tree> mapping;
8332 gimple *orig_stmt, *new_stmt;
8333 gimple_stmt_iterator epilogue_gsi;
8334 gphi_iterator epilogue_phi_gsi;
8335 stmt_vec_info stmt_vinfo = NULL, related_vinfo;
8336 basic_block *epilogue_bbs = get_loop_body (epilogue);
8337 unsigned i;
8338
8339 LOOP_VINFO_BBS (epilogue_vinfo) = epilogue_bbs;
8340
8341 /* Advance data_reference's with the number of iterations of the previous
8342 loop and its prologue. */
8343 vect_update_inits_of_drs (epilogue_vinfo, advance, PLUS_EXPR);
8344
8345
8346 /* The EPILOGUE loop is a copy of the original loop so they share the same
8347 gimple UIDs. In this loop we update the loop_vec_info of the EPILOGUE to
8348 point to the copied statements. We also create a mapping from each LHS in
8349 the original loop to the corresponding LHS in the EPILOGUE and create worklists to
8350 update the STMT_VINFO_PATTERN_DEF_SEQs and STMT_VINFO_RELATED_STMTs. */
8351 for (unsigned i = 0; i < epilogue->num_nodes; ++i)
8352 {
8353 for (epilogue_phi_gsi = gsi_start_phis (epilogue_bbs[i]);
8354 !gsi_end_p (epilogue_phi_gsi); gsi_next (&epilogue_phi_gsi))
8355 {
8356 new_stmt = epilogue_phi_gsi.phi ();
8357
8358 gcc_assert (gimple_uid (new_stmt) > 0);
8359 stmt_vinfo
8360 = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
8361
8362 orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
8363 STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
8364
8365 mapping.put (gimple_phi_result (orig_stmt),
8366 gimple_phi_result (new_stmt));
8367 /* PHI nodes cannot have patterns or related statements. */
8368 gcc_assert (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo) == NULL
8369 && STMT_VINFO_RELATED_STMT (stmt_vinfo) == NULL);
8370 }
8371
8372 for (epilogue_gsi = gsi_start_bb (epilogue_bbs[i]);
8373 !gsi_end_p (epilogue_gsi); gsi_next (&epilogue_gsi))
8374 {
8375 new_stmt = gsi_stmt (epilogue_gsi);
8376
8377 gcc_assert (gimple_uid (new_stmt) > 0);
8378 stmt_vinfo
8379 = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
8380
8381 orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
8382 STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
8383
8384 if (tree old_lhs = gimple_get_lhs (orig_stmt))
8385 mapping.put (old_lhs, gimple_get_lhs (new_stmt));
8386
8387 if (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo))
8388 {
8389 gimple_seq seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo);
8390 for (gimple_stmt_iterator gsi = gsi_start (seq);
8391 !gsi_end_p (gsi); gsi_next (&gsi))
8392 stmt_worklist.safe_push (gsi_stmt (gsi));
8393 }
8394
8395 related_vinfo = STMT_VINFO_RELATED_STMT (stmt_vinfo);
8396 if (related_vinfo != NULL && related_vinfo != stmt_vinfo)
8397 {
8398 gimple *stmt = STMT_VINFO_STMT (related_vinfo);
8399 stmt_worklist.safe_push (stmt);
8400 /* Set BB such that the assert in
8401 'get_initial_def_for_reduction' is able to determine that
8402 the BB of the related stmt is inside this loop. */
8403 gimple_set_bb (stmt,
8404 gimple_bb (new_stmt));
8405 related_vinfo = STMT_VINFO_RELATED_STMT (related_vinfo);
8406 gcc_assert (related_vinfo == NULL
8407 || related_vinfo == stmt_vinfo);
8408 }
8409 }
8410 }
8411
8412 /* The PATTERN_DEF_SEQs and RELATED_STMTs in the epilogue were constructed
8413 using the original main loop and thus need to be updated to refer to the
8414 cloned variables used in the epilogue. */
8415 for (unsigned i = 0; i < stmt_worklist.length (); ++i)
8416 {
8417 gimple *stmt = stmt_worklist[i];
8418 tree *new_op;
8419
8420 for (unsigned j = 1; j < gimple_num_ops (stmt); ++j)
8421 {
8422 tree op = gimple_op (stmt, j);
8423 if ((new_op = mapping.get(op)))
8424 gimple_set_op (stmt, j, *new_op);
8425 else
8426 {
8427 /* PR92429: The last argument of simplify_replace_tree disables
8428 folding when replacing arguments. This is required as
8429 otherwise you might end up with different statements than the
8430 ones analyzed in vect_loop_analyze, leading to different
8431 vectorization. */
8432 op = simplify_replace_tree (op, NULL_TREE, NULL_TREE,
8433 &find_in_mapping, &mapping, false);
8434 gimple_set_op (stmt, j, op);
8435 }
8436 }
8437 }
8438
8439 struct data_reference *dr;
8440 vec<data_reference_p> datarefs = epilogue_vinfo->shared->datarefs;
8441 FOR_EACH_VEC_ELT (datarefs, i, dr)
8442 {
8443 orig_stmt = DR_STMT (dr);
8444 gcc_assert (gimple_uid (orig_stmt) > 0);
8445 stmt_vinfo = epilogue_vinfo->stmt_vec_infos[gimple_uid (orig_stmt) - 1];
8446 /* Data references for gather loads and scatter stores do not use the
8447 updated offset we set using ADVANCE. Instead we have to make sure the
8448 references in the data references point to the corresponding copies of
8449 the originals in the epilogue. */
8450 if (STMT_VINFO_MEMORY_ACCESS_TYPE (vect_stmt_to_vectorize (stmt_vinfo))
8451 == VMAT_GATHER_SCATTER)
8452 {
8453 DR_REF (dr)
8454 = simplify_replace_tree (DR_REF (dr), NULL_TREE, NULL_TREE,
8455 &find_in_mapping, &mapping);
8456 DR_BASE_ADDRESS (dr)
8457 = simplify_replace_tree (DR_BASE_ADDRESS (dr), NULL_TREE, NULL_TREE,
8458 &find_in_mapping, &mapping);
8459 }
8460 DR_STMT (dr) = STMT_VINFO_STMT (stmt_vinfo);
8461 stmt_vinfo->dr_aux.stmt = stmt_vinfo;
8462 /* The vector size of the epilogue is smaller than that of the main loop,
8463 so the alignment requirement is either the same or lower. This means
8464 the dr will by definition be aligned. */
8465 STMT_VINFO_DR_INFO (stmt_vinfo)->base_misaligned = false;
8466 }
8467
8468 epilogue_vinfo->shared->datarefs_copy.release ();
8469 epilogue_vinfo->shared->save_datarefs ();
8470 }
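/* For illustration of the gather/scatter case above (the names are
   invented): if the main loop's data reference was recorded as
   MEM[base_7 + off_9] and the epilogue copies of base_7 and off_9 are
   base_107 and off_109, the replacement via find_in_mapping rewrites
   DR_REF and DR_BASE_ADDRESS in terms of base_107 and off_109, so the
   epilogue's dr_vec_info no longer refers to SSA names of the main
   loop.  */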
8471
8472 /* Function vect_transform_loop.
8473
8474 The analysis phase has determined that the loop is vectorizable.
8475 Vectorize the loop: create vectorized stmts to replace the scalar
8476 stmts in the loop, and update the loop exit condition.
8477 Returns the scalar epilogue loop, if any. */
8478
8479 class loop *
8480 vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
8481 {
8482 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8483 class loop *epilogue = NULL;
8484 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
8485 int nbbs = loop->num_nodes;
8486 int i;
8487 tree niters_vector = NULL_TREE;
8488 tree step_vector = NULL_TREE;
8489 tree niters_vector_mult_vf = NULL_TREE;
8490 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8491 unsigned int lowest_vf = constant_lower_bound (vf);
8492 gimple *stmt;
8493 bool check_profitability = false;
8494 unsigned int th;
8495
8496 DUMP_VECT_SCOPE ("vec_transform_loop");
8497
8498 loop_vinfo->shared->check_datarefs ();
8499
8500 /* Use the more conservative vectorization threshold. If the number
8501 of iterations is constant, assume the cost check has been performed
8502 by our caller. If the threshold makes all loops profitable that
8503 run at least the (estimated) vectorization factor number of times,
8504 checking is pointless, too. */
8505 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
8506 if (vect_apply_runtime_profitability_check_p (loop_vinfo))
8507 {
8508 if (dump_enabled_p ())
8509 dump_printf_loc (MSG_NOTE, vect_location,
8510 "Profitability threshold is %d loop iterations.\n",
8511 th);
8512 check_profitability = true;
8513 }
8514
8515 /* Make sure there exists a single-predecessor exit bb. Do this before
8516 versioning. */
8517 edge e = single_exit (loop);
8518 if (! single_pred_p (e->dest))
8519 {
8520 split_loop_exit_edge (e, true);
8521 if (dump_enabled_p ())
8522 dump_printf (MSG_NOTE, "split exit edge\n");
8523 }
8524
8525 /* Version the loop first, if required, so the profitability check
8526 comes first. */
8527
8528 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
8529 {
8530 class loop *sloop
8531 = vect_loop_versioning (loop_vinfo, loop_vectorized_call);
8532 sloop->force_vectorize = false;
8533 check_profitability = false;
8534 }
8535
8536 /* Make sure there exists a single-predecessor exit bb also on the
8537 scalar loop copy. Do this after versioning but before peeling
8538 so the CFG structure is fine for both the scalar and the if-converted
8539 loop, and slpeel_duplicate_current_defs_from_edges sees matched
8540 loop-closed PHI nodes on the exit. */
8541 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
8542 {
8543 e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo));
8544 if (! single_pred_p (e->dest))
8545 {
8546 split_loop_exit_edge (e, true);
8547 if (dump_enabled_p ())
8548 dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
8549 }
8550 }
8551
8552 tree niters = vect_build_loop_niters (loop_vinfo);
8553 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
8554 tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
8555 bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
8556 tree advance;
8557 drs_init_vec orig_drs_init;
8558
8559 epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
8560 &step_vector, &niters_vector_mult_vf, th,
8561 check_profitability, niters_no_overflow,
8562 &advance);
8563
8564 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)
8565 && LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo).initialized_p ())
8566 scale_loop_frequencies (LOOP_VINFO_SCALAR_LOOP (loop_vinfo),
8567 LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
8568
8569 if (niters_vector == NULL_TREE)
8570 {
8571 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
8572 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8573 && known_eq (lowest_vf, vf))
8574 {
8575 niters_vector
8576 = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
8577 LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
8578 step_vector = build_one_cst (TREE_TYPE (niters));
8579 }
8580 else
8581 vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
8582 &step_vector, niters_no_overflow);
8583 }
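  /* Worked example for the constant case above (the numbers are
     illustrative): with NITERS == 103 known at compile time, a constant
     VF == 8 and no full masking, NITERS_VECTOR becomes 103 / 8 == 12
     with STEP_VECTOR == 1, i.e. the vector loop runs 12 times and the
     remaining 103 - 12 * 8 == 7 scalar iterations are left for the
     epilogue.  */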
8584
8585 /* 1) Make sure the loop header has exactly two entries
8586 2) Make sure we have a preheader basic block. */
8587
8588 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
8589
8590 split_edge (loop_preheader_edge (loop));
8591
8592 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8593 && vect_use_loop_mask_for_alignment_p (loop_vinfo))
8594 /* This will deal with any possible peeling. */
8595 vect_prepare_for_masked_peels (loop_vinfo);
8596
8597 /* Schedule the SLP instances first, then handle loop vectorization
8598 below. */
8599 if (!loop_vinfo->slp_instances.is_empty ())
8600 {
8601 DUMP_VECT_SCOPE ("scheduling SLP instances");
8602 vect_schedule_slp (loop_vinfo);
8603 }
8604
8605 /* FORNOW: the vectorizer supports only loops whose body consists
8606 of one basic block (header + empty latch). When the vectorizer
8607 supports more involved loop forms, the order in which the BBs are
8608 traversed will need to be reconsidered. */
8609
8610 for (i = 0; i < nbbs; i++)
8611 {
8612 basic_block bb = bbs[i];
8613 stmt_vec_info stmt_info;
8614
8615 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
8616 gsi_next (&si))
8617 {
8618 gphi *phi = si.phi ();
8619 if (dump_enabled_p ())
8620 dump_printf_loc (MSG_NOTE, vect_location,
8621 "------>vectorizing phi: %G", phi);
8622 stmt_info = loop_vinfo->lookup_stmt (phi);
8623 if (!stmt_info)
8624 continue;
8625
8626 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8627 vect_loop_kill_debug_uses (loop, stmt_info);
8628
8629 if (!STMT_VINFO_RELEVANT_P (stmt_info)
8630 && !STMT_VINFO_LIVE_P (stmt_info))
8631 continue;
8632
8633 if (STMT_VINFO_VECTYPE (stmt_info)
8634 && (maybe_ne
8635 (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
8636 && dump_enabled_p ())
8637 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8638
8639 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
8640 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
8641 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
8642 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
8643 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def)
8644 && ! PURE_SLP_STMT (stmt_info))
8645 {
8646 if (dump_enabled_p ())
8647 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
8648 vect_transform_stmt (loop_vinfo, stmt_info, NULL, NULL, NULL);
8649 }
8650 }
8651
8652 for (gimple_stmt_iterator si = gsi_start_bb (bb);
8653 !gsi_end_p (si);)
8654 {
8655 stmt = gsi_stmt (si);
8656 /* During vectorization remove existing clobber stmts. */
8657 if (gimple_clobber_p (stmt))
8658 {
8659 unlink_stmt_vdef (stmt);
8660 gsi_remove (&si, true);
8661 release_defs (stmt);
8662 }
8663 else
8664 {
8665 /* Ignore vector stmts created in the outer loop. */
8666 stmt_info = loop_vinfo->lookup_stmt (stmt);
8667
8668 /* vector stmts created in the outer-loop during vectorization of
8669 stmts in an inner-loop may not have a stmt_info, and do not
8670 need to be vectorized. */
8671 stmt_vec_info seen_store = NULL;
8672 if (stmt_info)
8673 {
8674 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
8675 {
8676 gimple *def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
8677 for (gimple_stmt_iterator subsi = gsi_start (def_seq);
8678 !gsi_end_p (subsi); gsi_next (&subsi))
8679 {
8680 stmt_vec_info pat_stmt_info
8681 = loop_vinfo->lookup_stmt (gsi_stmt (subsi));
8682 vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
8683 &si, &seen_store);
8684 }
8685 stmt_vec_info pat_stmt_info
8686 = STMT_VINFO_RELATED_STMT (stmt_info);
8687 vect_transform_loop_stmt (loop_vinfo, pat_stmt_info, &si,
8688 &seen_store);
8689 }
8690 vect_transform_loop_stmt (loop_vinfo, stmt_info, &si,
8691 &seen_store);
8692 }
8693 gsi_next (&si);
8694 if (seen_store)
8695 {
8696 if (STMT_VINFO_GROUPED_ACCESS (seen_store))
8697 /* Interleaving. The vectorization of the
8698 interleaving chain was completed; free all
8699 the stores in the chain. */
8700 vect_remove_stores (loop_vinfo,
8701 DR_GROUP_FIRST_ELEMENT (seen_store));
8702 else
8703 /* Free the attached stmt_vec_info and remove the stmt. */
8704 loop_vinfo->remove_stmt (stmt_info);
8705 }
8706 }
8707 }
8708
8709 /* Stub out scalar statements that must not survive vectorization.
8710 Doing this here helps with grouped statements, or statements that
8711 are involved in patterns. */
8712 for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
8713 !gsi_end_p (gsi); gsi_next (&gsi))
8714 {
8715 gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
8716 if (call && gimple_call_internal_p (call, IFN_MASK_LOAD))
8717 {
8718 tree lhs = gimple_get_lhs (call);
8719 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
8720 {
8721 tree zero = build_zero_cst (TREE_TYPE (lhs));
8722 gimple *new_stmt = gimple_build_assign (lhs, zero);
8723 gsi_replace (&gsi, new_stmt, true);
8724 }
8725 }
8726 }
8727 } /* BBs in loop */
8728
8729 /* The vectorization factor is always > 1, so if we use an IV increment of 1,
8730 a zero NITERS becomes a nonzero NITERS_VECTOR. */
8731 if (integer_onep (step_vector))
8732 niters_no_overflow = true;
8733 vect_set_loop_condition (loop, loop_vinfo, niters_vector, step_vector,
8734 niters_vector_mult_vf, !niters_no_overflow);
8735
8736 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
8737 scale_profile_for_vect_loop (loop, assumed_vf);
8738
8739 /* True if the final iteration might not handle a full vector's
8740 worth of scalar iterations. */
8741 bool final_iter_may_be_partial = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
8742 /* The minimum number of iterations performed by the epilogue. This
8743 is 1 when peeling for gaps because we always need a final scalar
8744 iteration. */
8745 int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
8746 /* +1 to convert latch counts to loop iteration counts,
8747 -min_epilogue_iters to remove iterations that cannot be performed
8748 by the vector code. */
8749 int bias_for_lowest = 1 - min_epilogue_iters;
8750 int bias_for_assumed = bias_for_lowest;
8751 int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
8752 if (alignment_npeels && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
8753 {
8754 /* When the amount of peeling is known at compile time, the first
8755 iteration will have exactly alignment_npeels active elements.
8756 In the worst case it will have at least one. */
8757 int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
8758 bias_for_lowest += lowest_vf - min_first_active;
8759 bias_for_assumed += assumed_vf - min_first_active;
8760 }
8761 /* In these calculations the "- 1" converts loop iteration counts
8762 back to latch counts. */
8763 if (loop->any_upper_bound)
8764 loop->nb_iterations_upper_bound
8765 = (final_iter_may_be_partial
8766 ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
8767 lowest_vf) - 1
8768 : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
8769 lowest_vf) - 1);
8770 if (loop->any_likely_upper_bound)
8771 loop->nb_iterations_likely_upper_bound
8772 = (final_iter_may_be_partial
8773 ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
8774 + bias_for_lowest, lowest_vf) - 1
8775 : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
8776 + bias_for_lowest, lowest_vf) - 1);
8777 if (loop->any_estimate)
8778 loop->nb_iterations_estimate
8779 = (final_iter_may_be_partial
8780 ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
8781 assumed_vf) - 1
8782 : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
8783 assumed_vf) - 1);
8784
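  /* Worked example for the bound adjustment above (the numbers are
     illustrative): without peeling for gaps and without full masking,
     BIAS_FOR_LOWEST == 1.  If the scalar loop had an upper bound of 99
     latch iterations (100 iterations in total) and LOWEST_VF == 4, the
     vector loop gets floor ((99 + 1) / 4) - 1 == 24 latch iterations,
     i.e. 25 vector iterations.  With full masking the ceiling division
     is used instead, since the final iteration may be partial.  */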
8785 if (dump_enabled_p ())
8786 {
8787 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
8788 {
8789 dump_printf_loc (MSG_NOTE, vect_location,
8790 "LOOP VECTORIZED\n");
8791 if (loop->inner)
8792 dump_printf_loc (MSG_NOTE, vect_location,
8793 "OUTER LOOP VECTORIZED\n");
8794 dump_printf (MSG_NOTE, "\n");
8795 }
8796 else
8797 dump_printf_loc (MSG_NOTE, vect_location,
8798 "LOOP EPILOGUE VECTORIZED (MODE=%s)\n",
8799 GET_MODE_NAME (loop_vinfo->vector_mode));
8800 }
8801
8802 /* Loops vectorized with a variable factor won't benefit from
8803 unrolling/peeling. */
8804 if (!vf.is_constant ())
8805 {
8806 loop->unroll = 1;
8807 if (dump_enabled_p ())
8808 dump_printf_loc (MSG_NOTE, vect_location, "Disabling unrolling due to"
8809 " variable-length vectorization factor\n");
8810 }
8811 /* Free SLP instances here because otherwise stmt reference counting
8812 won't work. */
8813 slp_instance instance;
8814 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
8815 vect_free_slp_instance (instance, true);
8816 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
8817 /* Clear the safelen field since its value is invalid after vectorization:
8818 the vectorized loop can have loop-carried dependencies. */
8819 loop->safelen = 0;
8820
8821 if (epilogue)
8822 {
8823 update_epilogue_loop_vinfo (epilogue, advance);
8824
8825 epilogue->simduid = loop->simduid;
8826 epilogue->force_vectorize = loop->force_vectorize;
8827 epilogue->dont_vectorize = false;
8828 }
8829
8830 return epilogue;
8831 }
8832
8833 /* The code below performs a simple optimization: it reverts
8834 if-conversion for masked stores, i.e. if the mask of a store is zero,
8835 the store is not performed and, if possible, neither are the producers
8836 of the stored value. For example,
8837 for (i=0; i<n; i++)
8838 if (c[i])
8839 {
8840 p1[i] += 1;
8841 p2[i] = p3[i] +2;
8842 }
8843 this transformation will produce the following semi-hammock:
8844
8845 if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
8846 {
8847 vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
8848 vect__12.22_172 = vect__11.19_170 + vect_cst__171;
8849 MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
8850 vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
8851 vect__19.28_184 = vect__18.25_182 + vect_cst__183;
8852 MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
8853 }
8854 */
8855
8856 void
8857 optimize_mask_stores (class loop *loop)
8858 {
8859 basic_block *bbs = get_loop_body (loop);
8860 unsigned nbbs = loop->num_nodes;
8861 unsigned i;
8862 basic_block bb;
8863 class loop *bb_loop;
8864 gimple_stmt_iterator gsi;
8865 gimple *stmt;
8866 auto_vec<gimple *> worklist;
8867 auto_purge_vect_location sentinel;
8868
8869 vect_location = find_loop_location (loop);
8870 /* Pick up all masked stores in loop if any. */
8871 for (i = 0; i < nbbs; i++)
8872 {
8873 bb = bbs[i];
8874 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
8875 gsi_next (&gsi))
8876 {
8877 stmt = gsi_stmt (gsi);
8878 if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
8879 worklist.safe_push (stmt);
8880 }
8881 }
8882
8883 free (bbs);
8884 if (worklist.is_empty ())
8885 return;
8886
8887 /* Loop has masked stores. */
8888 while (!worklist.is_empty ())
8889 {
8890 gimple *last, *last_store;
8891 edge e, efalse;
8892 tree mask;
8893 basic_block store_bb, join_bb;
8894 gimple_stmt_iterator gsi_to;
8895 tree vdef, new_vdef;
8896 gphi *phi;
8897 tree vectype;
8898 tree zero;
8899
8900 last = worklist.pop ();
8901 mask = gimple_call_arg (last, 2);
8902 bb = gimple_bb (last);
8903 /* Create then_bb and the if-then structure in the CFG; then_bb belongs
8904 to the same loop as if_bb. It can differ from LOOP when a two-level
8905 loop nest is vectorized and the mask_store belongs to the inner
8906 loop. */
8907 e = split_block (bb, last);
8908 bb_loop = bb->loop_father;
8909 gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
8910 join_bb = e->dest;
8911 store_bb = create_empty_bb (bb);
8912 add_bb_to_loop (store_bb, bb_loop);
8913 e->flags = EDGE_TRUE_VALUE;
8914 efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
8915 /* Mark the edge into STORE_BB as unlikely. */
8916 efalse->probability = profile_probability::unlikely ();
8917 store_bb->count = efalse->count ();
8918 make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
8919 if (dom_info_available_p (CDI_DOMINATORS))
8920 set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
8921 if (dump_enabled_p ())
8922 dump_printf_loc (MSG_NOTE, vect_location,
8923 "Create new block %d to sink mask stores.",
8924 store_bb->index);
8925 /* Create vector comparison with boolean result. */
8926 vectype = TREE_TYPE (mask);
8927 zero = build_zero_cst (vectype);
8928 stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
8929 gsi = gsi_last_bb (bb);
8930 gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
8931 /* Create new PHI node for vdef of the last masked store:
8932 .MEM_2 = VDEF <.MEM_1>
8933 will be converted to
8934 .MEM.3 = VDEF <.MEM_1>
8935 and new PHI node will be created in join bb
8936 .MEM_2 = PHI <.MEM_1, .MEM_3>
8937 */
8938 vdef = gimple_vdef (last);
8939 new_vdef = make_ssa_name (gimple_vop (cfun), last);
8940 gimple_set_vdef (last, new_vdef);
8941 phi = create_phi_node (vdef, join_bb);
8942 add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
8943
8944 /* Put all masked stores with the same mask to STORE_BB if possible. */
8945 while (true)
8946 {
8947 gimple_stmt_iterator gsi_from;
8948 gimple *stmt1 = NULL;
8949
8950 /* Move masked store to STORE_BB. */
8951 last_store = last;
8952 gsi = gsi_for_stmt (last);
8953 gsi_from = gsi;
8954 /* Shift GSI to the previous stmt for further traversal. */
8955 gsi_prev (&gsi);
8956 gsi_to = gsi_start_bb (store_bb);
8957 gsi_move_before (&gsi_from, &gsi_to);
8958 /* Setup GSI_TO to the non-empty block start. */
8959 gsi_to = gsi_start_bb (store_bb);
8960 if (dump_enabled_p ())
8961 dump_printf_loc (MSG_NOTE, vect_location,
8962 "Move stmt to created bb\n%G", last);
8963 /* Move all stored value producers if possible. */
8964 while (!gsi_end_p (gsi))
8965 {
8966 tree lhs;
8967 imm_use_iterator imm_iter;
8968 use_operand_p use_p;
8969 bool res;
8970
8971 /* Skip debug statements. */
8972 if (is_gimple_debug (gsi_stmt (gsi)))
8973 {
8974 gsi_prev (&gsi);
8975 continue;
8976 }
8977 stmt1 = gsi_stmt (gsi);
8978 /* Do not consider statements writing to memory or having a
8979 volatile operand. */
8980 if (gimple_vdef (stmt1)
8981 || gimple_has_volatile_ops (stmt1))
8982 break;
8983 gsi_from = gsi;
8984 gsi_prev (&gsi);
8985 lhs = gimple_get_lhs (stmt1);
8986 if (!lhs)
8987 break;
8988
8989 /* LHS of vectorized stmt must be SSA_NAME. */
8990 if (TREE_CODE (lhs) != SSA_NAME)
8991 break;
8992
8993 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
8994 {
8995 /* Remove dead scalar statement. */
8996 if (has_zero_uses (lhs))
8997 {
8998 gsi_remove (&gsi_from, true);
8999 continue;
9000 }
9001 }
9002
9003 /* Check that LHS does not have uses outside of STORE_BB. */
9004 res = true;
9005 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
9006 {
9007 gimple *use_stmt;
9008 use_stmt = USE_STMT (use_p);
9009 if (is_gimple_debug (use_stmt))
9010 continue;
9011 if (gimple_bb (use_stmt) != store_bb)
9012 {
9013 res = false;
9014 break;
9015 }
9016 }
9017 if (!res)
9018 break;
9019
9020 if (gimple_vuse (stmt1)
9021 && gimple_vuse (stmt1) != gimple_vuse (last_store))
9022 break;
9023
9024 /* Can move STMT1 to STORE_BB. */
9025 if (dump_enabled_p ())
9026 dump_printf_loc (MSG_NOTE, vect_location,
9027 "Move stmt to created bb\n%G", stmt1);
9028 gsi_move_before (&gsi_from, &gsi_to);
9029 /* Shift GSI_TO for further insertion. */
9030 gsi_prev (&gsi_to);
9031 }
9032 /* Put other masked stores with the same mask to STORE_BB. */
9033 if (worklist.is_empty ()
9034 || gimple_call_arg (worklist.last (), 2) != mask
9035 || worklist.last () != stmt1)
9036 break;
9037 last = worklist.pop ();
9038 }
9039 add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
9040 }
9041 }
9042
9043 /* Decide whether it is possible to use a zero-based induction variable
9044 when vectorizing LOOP_VINFO with a fully-masked loop. If it is,
9045 return the value that the induction variable must be able to hold
9046 in order to ensure that the loop ends with an all-false mask.
9047 Return -1 otherwise. */
9048 widest_int
9049 vect_iv_limit_for_full_masking (loop_vec_info loop_vinfo)
9050 {
9051 tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
9052 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9053 unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);
9054
9055 /* Calculate the value that the induction variable must be able
9056 to hit in order to ensure that we end the loop with an all-false mask.
9057 This involves adding the maximum number of inactive trailing scalar
9058 iterations. */
9059 widest_int iv_limit = -1;
9060 if (max_loop_iterations (loop, &iv_limit))
9061 {
9062 if (niters_skip)
9063 {
9064 /* Add the maximum number of skipped iterations to the
9065 maximum iteration count. */
9066 if (TREE_CODE (niters_skip) == INTEGER_CST)
9067 iv_limit += wi::to_widest (niters_skip);
9068 else
9069 iv_limit += max_vf - 1;
9070 }
9071 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
9072 /* Make a conservatively-correct assumption. */
9073 iv_limit += max_vf - 1;
9074
9075 /* IV_LIMIT is the maximum number of latch iterations, which is also
9076 the maximum in-range IV value. Round this value down to the previous
9077 vector alignment boundary and then add an extra full iteration. */
9078 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9079 iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
9080 }
9081 return iv_limit;
9082 }
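/* Worked example for the rounding above (the numbers are illustrative):
   with a maximum of 122 latch iterations, no skipped iterations, no
   peeling for alignment, a constant VF == 16 and MAX_VF == 16, the
   computation gives IV_LIMIT = (122 & -16) + 16 = 112 + 16 = 128, i.e.
   the IV must be able to hold one full vector iteration beyond the last
   16-aligned in-range value.  */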
9083