gcc/tree-vect-loop.c
1 /* Loop Vectorization
2 Copyright (C) 2003-2019 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 Ira Rosen <irar@il.ibm.com>
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "cfghooks.h"
31 #include "tree-pass.h"
32 #include "ssa.h"
33 #include "optabs-tree.h"
34 #include "diagnostic-core.h"
35 #include "fold-const.h"
36 #include "stor-layout.h"
37 #include "cfganal.h"
38 #include "gimplify.h"
39 #include "gimple-iterator.h"
40 #include "gimplify-me.h"
41 #include "tree-ssa-loop-ivopts.h"
42 #include "tree-ssa-loop-manip.h"
43 #include "tree-ssa-loop-niter.h"
44 #include "tree-ssa-loop.h"
45 #include "cfgloop.h"
46 #include "params.h"
47 #include "tree-scalar-evolution.h"
48 #include "tree-vectorizer.h"
49 #include "gimple-fold.h"
50 #include "cgraph.h"
51 #include "tree-cfg.h"
52 #include "tree-if-conv.h"
53 #include "internal-fn.h"
54 #include "tree-vector-builder.h"
55 #include "vec-perm-indices.h"
56 #include "tree-eh.h"
57
58 /* Loop Vectorization Pass.
59
60 This pass tries to vectorize loops.
61
62 For example, the vectorizer transforms the following simple loop:
63
64 short a[N]; short b[N]; short c[N]; int i;
65
66 for (i=0; i<N; i++){
67 a[i] = b[i] + c[i];
68 }
69
70 as if it were manually vectorized by rewriting the source code into:
71
72 typedef int __attribute__((mode(V8HI))) v8hi;
73 short a[N]; short b[N]; short c[N]; int i;
74 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
75 v8hi va, vb, vc;
76
77 for (i=0; i<N/8; i++){
78 vb = pb[i];
79 vc = pc[i];
80 va = vb + vc;
81 pa[i] = va;
82 }
83
84 The main entry to this pass is vectorize_loops(), in which
85 the vectorizer applies a set of analyses on a given set of loops,
86 followed by the actual vectorization transformation for the loops that
87 had successfully passed the analysis phase.
88 Throughout this pass we make a distinction between two types of
89 data: scalars (which are represented by SSA_NAMES), and memory references
90 ("data-refs"). These two types of data require different handling both
91 during analysis and transformation. The types of data-refs that the
92 vectorizer currently supports are ARRAY_REFS whose base is an array DECL
93 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
94 accesses are required to have a simple (consecutive) access pattern.
95
96 Analysis phase:
97 ===============
98 The driver for the analysis phase is vect_analyze_loop().
99 It applies a set of analyses, some of which rely on the scalar evolution
100 analyzer (scev) developed by Sebastian Pop.
101
102 During the analysis phase the vectorizer records some information
103 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
104 loop, as well as general information about the loop as a whole, which is
105 recorded in a "loop_vec_info" struct attached to each loop.
106
107 Transformation phase:
108 =====================
109 The loop transformation phase scans all the stmts in the loop, and
110 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
111 the loop that needs to be vectorized. It inserts the vector code sequence
112 just before the scalar stmt S, and records a pointer to the vector code
113 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
114 attached to S). This pointer will be used for the vectorization of following
115 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
116 otherwise, we rely on dead code elimination for removing it.
117
118 For example, say stmt S1 was vectorized into stmt VS1:
119
120 VS1: vb = px[i];
121 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
122 S2: a = b;
123
124 To vectorize stmt S2, the vectorizer first finds the stmt that defines
125 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
126 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
127 resulting sequence would be:
128
129 VS1: vb = px[i];
130 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
131 VS2: va = vb;
132 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
133
134 Operands that are not SSA_NAMEs are data-refs that appear in
135 load/store operations (like 'x[i]' in S1), and are handled differently.
136
137 Target modeling:
138 =================
139 Currently the only target specific information that is used is the
140 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
141 Targets that can support different vector sizes will, for now, need to
142 specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
143 flexibility will be added in the future.
144
145 Since we only vectorize operations whose vector form can be
146 expressed using existing tree codes, to verify that an operation is
147 supported, the vectorizer checks the relevant optab at the relevant
148 machine_mode (e.g, optab_handler (add_optab, V8HImode)). If
149 the value found is CODE_FOR_nothing, then there's no target support, and
150 we can't vectorize the stmt.
151
152 For additional information on this project see:
153 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
154 */
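
/* Illustrative sketch only (not taken from the sources): the target support
   check described above boils down to an optab query on the statement's
   vector mode, roughly

     machine_mode vmode = TYPE_MODE (vectype);   ... e.g. V8HImode
     if (optab_handler (add_optab, vmode) == CODE_FOR_nothing)
       ... no target support, the stmt cannot be vectorized ...

   The actual checks live in the vectorizable_* analysis routines.  */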
155
156 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
157
158 /* Subroutine of vect_determine_vf_for_stmt that handles only one
159 statement. VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
160 may already be set for general statements (not just data refs). */
161
162 static opt_result
163 vect_determine_vf_for_stmt_1 (stmt_vec_info stmt_info,
164 bool vectype_maybe_set_p,
165 poly_uint64 *vf,
166 vec<stmt_vec_info > *mask_producers)
167 {
168 gimple *stmt = stmt_info->stmt;
169
170 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
171 && !STMT_VINFO_LIVE_P (stmt_info))
172 || gimple_clobber_p (stmt))
173 {
174 if (dump_enabled_p ())
175 dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
176 return opt_result::success ();
177 }
178
179 tree stmt_vectype, nunits_vectype;
180 opt_result res = vect_get_vector_types_for_stmt (stmt_info, &stmt_vectype,
181 &nunits_vectype);
182 if (!res)
183 return res;
184
185 if (stmt_vectype)
186 {
187 if (STMT_VINFO_VECTYPE (stmt_info))
188 /* The only case when a vectype has already been set is for stmts
189 that contain a data ref, or for "pattern-stmts" (stmts generated
190 by the vectorizer to represent/replace a certain idiom). */
191 gcc_assert ((STMT_VINFO_DATA_REF (stmt_info)
192 || vectype_maybe_set_p)
193 && STMT_VINFO_VECTYPE (stmt_info) == stmt_vectype);
194 else if (stmt_vectype == boolean_type_node)
195 mask_producers->safe_push (stmt_info);
196 else
197 STMT_VINFO_VECTYPE (stmt_info) = stmt_vectype;
198 }
199
200 if (nunits_vectype)
201 vect_update_max_nunits (vf, nunits_vectype);
202
203 return opt_result::success ();
204 }
205
206 /* Subroutine of vect_determine_vectorization_factor. Set the vector
207 types of STMT_INFO and all attached pattern statements and update
208 the vectorization factor VF accordingly. If some of the statements
209 produce a mask result whose vector type can only be calculated later,
210 add them to MASK_PRODUCERS. Return true on success or false if
211 something prevented vectorization. */
212
213 static opt_result
214 vect_determine_vf_for_stmt (stmt_vec_info stmt_info, poly_uint64 *vf,
215 vec<stmt_vec_info > *mask_producers)
216 {
217 vec_info *vinfo = stmt_info->vinfo;
218 if (dump_enabled_p ())
219 dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: %G",
220 stmt_info->stmt);
221 opt_result res
222 = vect_determine_vf_for_stmt_1 (stmt_info, false, vf, mask_producers);
223 if (!res)
224 return res;
225
226 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
227 && STMT_VINFO_RELATED_STMT (stmt_info))
228 {
229 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
230 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
231
232 /* If a pattern statement has def stmts, analyze them too. */
233 for (gimple_stmt_iterator si = gsi_start (pattern_def_seq);
234 !gsi_end_p (si); gsi_next (&si))
235 {
236 stmt_vec_info def_stmt_info = vinfo->lookup_stmt (gsi_stmt (si));
237 if (dump_enabled_p ())
238 dump_printf_loc (MSG_NOTE, vect_location,
239 "==> examining pattern def stmt: %G",
240 def_stmt_info->stmt);
241 res = vect_determine_vf_for_stmt_1 (def_stmt_info, true,
242 vf, mask_producers);
243 if (!res)
244 return res;
247 }
248
249 if (dump_enabled_p ())
250 dump_printf_loc (MSG_NOTE, vect_location,
251 "==> examining pattern statement: %G",
252 stmt_info->stmt);
253 res = vect_determine_vf_for_stmt_1 (stmt_info, true, vf, mask_producers);
254 if (!res)
255 return res;
256 }
257
258 return opt_result::success ();
259 }
260
261 /* Function vect_determine_vectorization_factor
262
263 Determine the vectorization factor (VF). VF is the number of data elements
264 that are operated upon in parallel in a single iteration of the vectorized
265 loop. For example, when vectorizing a loop that operates on 4-byte elements,
266 on a target with a vector size (VS) of 16 bytes, the VF is set to 4, since 4
267 elements can fit in a single vector register.
268
269 We currently support vectorization of loops in which all types operated upon
270 are of the same size. Therefore this function currently sets VF according to
271 the size of the types operated upon, and fails if there are multiple sizes
272 in the loop.
273
274 VF is also the factor by which the loop iterations are strip-mined, e.g.:
275 original loop:
276 for (i=0; i<N; i++){
277 a[i] = b[i] + c[i];
278 }
279
280 vectorized loop:
281 for (i=0; i<N; i+=VF){
282 a[i:VF] = b[i:VF] + c[i:VF];
283 }
284 */
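
/* Worked example (illustrative, not from the sources): with 16-byte vectors,
   a stmt operating on 4-byte ints gets a vectype with nunits = 4, while one
   operating on 2-byte shorts gets nunits = 8.  vect_update_max_nunits keeps
   a common multiple of all the nunits seen, so a loop containing only the
   int stmts ends up with VF = 4, and one that also contains the short stmts
   ends up with VF = 8.  */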
285
286 static opt_result
287 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
288 {
289 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
290 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
291 unsigned nbbs = loop->num_nodes;
292 poly_uint64 vectorization_factor = 1;
293 tree scalar_type = NULL_TREE;
294 gphi *phi;
295 tree vectype;
296 stmt_vec_info stmt_info;
297 unsigned i;
298 auto_vec<stmt_vec_info> mask_producers;
299
300 DUMP_VECT_SCOPE ("vect_determine_vectorization_factor");
301
302 for (i = 0; i < nbbs; i++)
303 {
304 basic_block bb = bbs[i];
305
306 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
307 gsi_next (&si))
308 {
309 phi = si.phi ();
310 stmt_info = loop_vinfo->lookup_stmt (phi);
311 if (dump_enabled_p ())
312 dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: %G",
313 phi);
314
315 gcc_assert (stmt_info);
316
317 if (STMT_VINFO_RELEVANT_P (stmt_info)
318 || STMT_VINFO_LIVE_P (stmt_info))
319 {
320 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
321 scalar_type = TREE_TYPE (PHI_RESULT (phi));
322
323 if (dump_enabled_p ())
324 dump_printf_loc (MSG_NOTE, vect_location,
325 "get vectype for scalar type: %T\n",
326 scalar_type);
327
328 vectype = get_vectype_for_scalar_type (scalar_type);
329 if (!vectype)
330 return opt_result::failure_at (phi,
331 "not vectorized: unsupported "
332 "data-type %T\n",
333 scalar_type);
334 STMT_VINFO_VECTYPE (stmt_info) = vectype;
335
336 if (dump_enabled_p ())
337 dump_printf_loc (MSG_NOTE, vect_location, "vectype: %T\n",
338 vectype);
339
340 if (dump_enabled_p ())
341 {
342 dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
343 dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
344 dump_printf (MSG_NOTE, "\n");
345 }
346
347 vect_update_max_nunits (&vectorization_factor, vectype);
348 }
349 }
350
351 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
352 gsi_next (&si))
353 {
354 stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
355 opt_result res
356 = vect_determine_vf_for_stmt (stmt_info, &vectorization_factor,
357 &mask_producers);
358 if (!res)
359 return res;
360 }
361 }
362
363 /* TODO: Analyze cost. Decide if worth while to vectorize. */
364 if (dump_enabled_p ())
365 {
366 dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
367 dump_dec (MSG_NOTE, vectorization_factor);
368 dump_printf (MSG_NOTE, "\n");
369 }
370
371 if (known_le (vectorization_factor, 1U))
372 return opt_result::failure_at (vect_location,
373 "not vectorized: unsupported data-type\n");
374 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
375
376 for (i = 0; i < mask_producers.length (); i++)
377 {
378 stmt_info = mask_producers[i];
379 opt_tree mask_type = vect_get_mask_type_for_stmt (stmt_info);
380 if (!mask_type)
381 return opt_result::propagate_failure (mask_type);
382 STMT_VINFO_VECTYPE (stmt_info) = mask_type;
383 }
384
385 return opt_result::success ();
386 }
387
388
389 /* Function vect_is_simple_iv_evolution.
390
391 FORNOW: A simple evolution of an induction variable in the loop is
392 considered a polynomial evolution. */
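
/* Example (a sketch, not part of the sources): for an induction like

     i_1 = PHI <0 (preheader), i_2 (latch)>
     i_2 = i_1 + 4;

   scev describes the access function as the chrec {0, +, 4}_n for loop
   number n, so initial_condition_in_loop_num yields 0 and
   evolution_part_in_loop_num yields 4.  A constant step like this passes
   the checks below; a step that is itself a chrec (degree >= 2) does not.  */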
393
394 static bool
395 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
396 tree * step)
397 {
398 tree init_expr;
399 tree step_expr;
400 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
401 basic_block bb;
402
403 /* When there is no evolution in this loop, the evolution function
404 is not "simple". */
405 if (evolution_part == NULL_TREE)
406 return false;
407
408 /* When the evolution is a polynomial of degree >= 2
409 the evolution function is not "simple". */
410 if (tree_is_chrec (evolution_part))
411 return false;
412
413 step_expr = evolution_part;
414 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
415
416 if (dump_enabled_p ())
417 dump_printf_loc (MSG_NOTE, vect_location, "step: %T, init: %T\n",
418 step_expr, init_expr);
419
420 *init = init_expr;
421 *step = step_expr;
422
423 if (TREE_CODE (step_expr) != INTEGER_CST
424 && (TREE_CODE (step_expr) != SSA_NAME
425 || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
426 && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
427 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
428 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
429 || !flag_associative_math)))
430 && (TREE_CODE (step_expr) != REAL_CST
431 || !flag_associative_math))
432 {
433 if (dump_enabled_p ())
434 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
435 "step unknown.\n");
436 return false;
437 }
438
439 return true;
440 }
441
442 /* Return true if PHI, described by STMT_INFO, is the inner PHI in
443 what we are assuming is a double reduction. For example, given
444 a structure like this:
445
446 outer1:
447 x_1 = PHI <x_4(outer2), ...>;
448 ...
449
450 inner:
451 x_2 = PHI <x_1(outer1), ...>;
452 ...
453 x_3 = ...;
454 ...
455
456 outer2:
457 x_4 = PHI <x_3(inner)>;
458 ...
459
460 outer loop analysis would treat x_1 as a double reduction phi and
461 this function would then return true for x_2. */
462
463 static bool
464 vect_inner_phi_in_double_reduction_p (stmt_vec_info stmt_info, gphi *phi)
465 {
466 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
467 use_operand_p use_p;
468 ssa_op_iter op_iter;
469 FOR_EACH_PHI_ARG (use_p, phi, op_iter, SSA_OP_USE)
470 if (stmt_vec_info def_info = loop_vinfo->lookup_def (USE_FROM_PTR (use_p)))
471 if (STMT_VINFO_DEF_TYPE (def_info) == vect_double_reduction_def)
472 return true;
473 return false;
474 }
475
476 /* Function vect_analyze_scalar_cycles_1.
477
478 Examine the cross iteration def-use cycles of scalar variables
479 in LOOP. LOOP_VINFO represents the loop that is now being
480 considered for vectorization (can be LOOP, or an outer-loop
481 enclosing LOOP). */
482
483 static void
484 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
485 {
486 basic_block bb = loop->header;
487 tree init, step;
488 auto_vec<stmt_vec_info, 64> worklist;
489 gphi_iterator gsi;
490 bool double_reduc;
491
492 DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");
493
494 /* First - identify all inductions. Reduction detection assumes that all the
495 inductions have been identified, therefore, this order must not be
496 changed. */
497 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
498 {
499 gphi *phi = gsi.phi ();
500 tree access_fn = NULL;
501 tree def = PHI_RESULT (phi);
502 stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (phi);
503
504 if (dump_enabled_p ())
505 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi);
506
507 /* Skip virtual phi's. The data dependences that are associated with
508 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
509 if (virtual_operand_p (def))
510 continue;
511
512 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
513
514 /* Analyze the evolution function. */
515 access_fn = analyze_scalar_evolution (loop, def);
516 if (access_fn)
517 {
518 STRIP_NOPS (access_fn);
519 if (dump_enabled_p ())
520 dump_printf_loc (MSG_NOTE, vect_location,
521 "Access function of PHI: %T\n", access_fn);
522 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
523 = initial_condition_in_loop_num (access_fn, loop->num);
524 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
525 = evolution_part_in_loop_num (access_fn, loop->num);
526 }
527
528 if (!access_fn
529 || vect_inner_phi_in_double_reduction_p (stmt_vinfo, phi)
530 || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
531 || (LOOP_VINFO_LOOP (loop_vinfo) != loop
532 && TREE_CODE (step) != INTEGER_CST))
533 {
534 worklist.safe_push (stmt_vinfo);
535 continue;
536 }
537
538 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
539 != NULL_TREE);
540 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
541
542 if (dump_enabled_p ())
543 dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
544 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
545 }
546
547
548 /* Second - identify all reductions and nested cycles. */
549 while (worklist.length () > 0)
550 {
551 stmt_vec_info stmt_vinfo = worklist.pop ();
552 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
553 tree def = PHI_RESULT (phi);
554
555 if (dump_enabled_p ())
556 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi);
557
558 gcc_assert (!virtual_operand_p (def)
559 && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
560
561 stmt_vec_info reduc_stmt_info
562 = vect_force_simple_reduction (loop_vinfo, stmt_vinfo,
563 &double_reduc, false);
564 if (reduc_stmt_info)
565 {
566 if (double_reduc)
567 {
568 if (dump_enabled_p ())
569 dump_printf_loc (MSG_NOTE, vect_location,
570 "Detected double reduction.\n");
571
572 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
573 STMT_VINFO_DEF_TYPE (reduc_stmt_info)
574 = vect_double_reduction_def;
575 }
576 else
577 {
578 if (loop != LOOP_VINFO_LOOP (loop_vinfo))
579 {
580 if (dump_enabled_p ())
581 dump_printf_loc (MSG_NOTE, vect_location,
582 "Detected vectorizable nested cycle.\n");
583
584 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
585 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_nested_cycle;
586 }
587 else
588 {
589 if (dump_enabled_p ())
590 dump_printf_loc (MSG_NOTE, vect_location,
591 "Detected reduction.\n");
592
593 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
594 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_reduction_def;
595 /* Store the reduction cycles for possible vectorization in
596 loop-aware SLP if it was not detected as reduction
597 chain. */
598 if (! REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info))
599 LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push
600 (reduc_stmt_info);
601 }
602 }
603 }
604 else
605 if (dump_enabled_p ())
606 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
607 "Unknown def-use cycle pattern.\n");
608 }
609 }
610
611
612 /* Function vect_analyze_scalar_cycles.
613
614 Examine the cross iteration def-use cycles of scalar variables, by
615 analyzing the loop-header PHIs of scalar variables. Classify each
616 cycle as one of the following: invariant, induction, reduction, unknown.
617 We do that for the loop represented by LOOP_VINFO, and also for its
618 inner-loop, if it exists.
619 Examples for scalar cycles:
620
621 Example1: reduction:
622
623 loop1:
624 for (i=0; i<N; i++)
625 sum += a[i];
626
627 Example2: induction:
628
629 loop2:
630 for (i=0; i<N; i++)
631 a[i] = i; */
632
633 static void
634 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
635 {
636 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
637
638 vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
639
640 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
641 Reductions in such an inner-loop therefore have different properties than
642 the reductions in the nest that gets vectorized:
643 1. When vectorized, they are executed in the same order as in the original
644 scalar loop, so we can't change the order of computation when
645 vectorizing them.
646 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
647 current checks are too strict. */
648
649 if (loop->inner)
650 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
651 }
652
653 /* Transfer group and reduction information from STMT_INFO to its
654 pattern stmt. */
655
656 static void
657 vect_fixup_reduc_chain (stmt_vec_info stmt_info)
658 {
659 stmt_vec_info firstp = STMT_VINFO_RELATED_STMT (stmt_info);
660 stmt_vec_info stmtp;
661 gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (firstp)
662 && REDUC_GROUP_FIRST_ELEMENT (stmt_info));
663 REDUC_GROUP_SIZE (firstp) = REDUC_GROUP_SIZE (stmt_info);
664 do
665 {
666 stmtp = STMT_VINFO_RELATED_STMT (stmt_info);
667 REDUC_GROUP_FIRST_ELEMENT (stmtp) = firstp;
668 stmt_info = REDUC_GROUP_NEXT_ELEMENT (stmt_info);
669 if (stmt_info)
670 REDUC_GROUP_NEXT_ELEMENT (stmtp)
671 = STMT_VINFO_RELATED_STMT (stmt_info);
672 }
673 while (stmt_info);
674 STMT_VINFO_DEF_TYPE (stmtp) = vect_reduction_def;
675 }
676
677 /* Fixup scalar cycles that now have their stmts detected as patterns. */
678
679 static void
680 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
681 {
682 stmt_vec_info first;
683 unsigned i;
684
685 FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
686 if (STMT_VINFO_IN_PATTERN_P (first))
687 {
688 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
689 while (next)
690 {
691 if (! STMT_VINFO_IN_PATTERN_P (next))
692 break;
693 next = REDUC_GROUP_NEXT_ELEMENT (next);
694 }
695 /* If not all stmts in the chain are patterns, try to handle
696 the chain without patterns. */
697 if (! next)
698 {
699 vect_fixup_reduc_chain (first);
700 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
701 = STMT_VINFO_RELATED_STMT (first);
702 }
703 }
704 }
705
706 /* Function vect_get_loop_niters.
707
708 Determine how many times the loop executes and place the count
709 in NUMBER_OF_ITERATIONS. Place the number of latch iterations
710 in NUMBER_OF_ITERATIONSM1. Place the condition under which the
711 niter information holds in ASSUMPTIONS.
712
713 Return the loop exit condition. */
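
/* Example (illustrative): for a loop whose body runs 16 times the latch
   executes 15 times, so NUMBER_OF_ITERATIONSM1 is 15 and
   NUMBER_OF_ITERATIONS, the number of header executions, is 16.  Any
   condition the niter analysis needs for these expressions to be valid is
   returned in ASSUMPTIONS (boolean_true_node when none is needed).  */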
714
715
716 static gcond *
717 vect_get_loop_niters (struct loop *loop, tree *assumptions,
718 tree *number_of_iterations, tree *number_of_iterationsm1)
719 {
720 edge exit = single_exit (loop);
721 struct tree_niter_desc niter_desc;
722 tree niter_assumptions, niter, may_be_zero;
723 gcond *cond = get_loop_exit_condition (loop);
724
725 *assumptions = boolean_true_node;
726 *number_of_iterationsm1 = chrec_dont_know;
727 *number_of_iterations = chrec_dont_know;
728 DUMP_VECT_SCOPE ("get_loop_niters");
729
730 if (!exit)
731 return cond;
732
733 niter = chrec_dont_know;
734 may_be_zero = NULL_TREE;
735 niter_assumptions = boolean_true_node;
736 if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
737 || chrec_contains_undetermined (niter_desc.niter))
738 return cond;
739
740 niter_assumptions = niter_desc.assumptions;
741 may_be_zero = niter_desc.may_be_zero;
742 niter = niter_desc.niter;
743
744 if (may_be_zero && integer_zerop (may_be_zero))
745 may_be_zero = NULL_TREE;
746
747 if (may_be_zero)
748 {
749 if (COMPARISON_CLASS_P (may_be_zero))
750 {
751 /* Try to combine may_be_zero with assumptions, this can simplify
752 computation of niter expression. */
753 if (niter_assumptions && !integer_nonzerop (niter_assumptions))
754 niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
755 niter_assumptions,
756 fold_build1 (TRUTH_NOT_EXPR,
757 boolean_type_node,
758 may_be_zero));
759 else
760 niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
761 build_int_cst (TREE_TYPE (niter), 0),
762 rewrite_to_non_trapping_overflow (niter));
763
764 may_be_zero = NULL_TREE;
765 }
766 else if (integer_nonzerop (may_be_zero))
767 {
768 *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
769 *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
770 return cond;
771 }
772 else
773 return cond;
774 }
775
776 *assumptions = niter_assumptions;
777 *number_of_iterationsm1 = niter;
778
779 /* We want the number of loop header executions which is the number
780 of latch executions plus one.
781 ??? For UINT_MAX latch executions this number overflows to zero
782 for loops like do { n++; } while (n != 0); */
783 if (niter && !chrec_contains_undetermined (niter))
784 niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter),
785 build_int_cst (TREE_TYPE (niter), 1));
786 *number_of_iterations = niter;
787
788 return cond;
789 }
790
791 /* Function bb_in_loop_p
792
793 Used as predicate for dfs order traversal of the loop bbs. */
794
795 static bool
796 bb_in_loop_p (const_basic_block bb, const void *data)
797 {
798 const struct loop *const loop = (const struct loop *)data;
799 if (flow_bb_inside_loop_p (loop, bb))
800 return true;
801 return false;
802 }
803
804
805 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
806 stmt_vec_info structs for all the stmts in LOOP_IN. */
807
808 _loop_vec_info::_loop_vec_info (struct loop *loop_in, vec_info_shared *shared)
809 : vec_info (vec_info::loop, init_cost (loop_in), shared),
810 loop (loop_in),
811 bbs (XCNEWVEC (basic_block, loop->num_nodes)),
812 num_itersm1 (NULL_TREE),
813 num_iters (NULL_TREE),
814 num_iters_unchanged (NULL_TREE),
815 num_iters_assumptions (NULL_TREE),
816 th (0),
817 versioning_threshold (0),
818 vectorization_factor (0),
819 max_vectorization_factor (0),
820 mask_skip_niters (NULL_TREE),
821 mask_compare_type (NULL_TREE),
822 unaligned_dr (NULL),
823 peeling_for_alignment (0),
824 ptr_mask (0),
825 ivexpr_map (NULL),
826 slp_unrolling_factor (1),
827 single_scalar_iteration_cost (0),
828 vectorizable (false),
829 can_fully_mask_p (true),
830 fully_masked_p (false),
831 peeling_for_gaps (false),
832 peeling_for_niter (false),
833 operands_swapped (false),
834 no_data_dependencies (false),
835 has_mask_store (false),
836 scalar_loop (NULL),
837 orig_loop_info (NULL)
838 {
839 /* CHECKME: We want to visit all BBs before their successors (except for
840 latch blocks, for which this assertion wouldn't hold). In the simple
841 case of the loop forms we allow, a dfs order of the BBs would be the same
842 as reversed postorder traversal, so we are safe. */
843
844 unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
845 bbs, loop->num_nodes, loop);
846 gcc_assert (nbbs == loop->num_nodes);
847
848 for (unsigned int i = 0; i < nbbs; i++)
849 {
850 basic_block bb = bbs[i];
851 gimple_stmt_iterator si;
852
853 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
854 {
855 gimple *phi = gsi_stmt (si);
856 gimple_set_uid (phi, 0);
857 add_stmt (phi);
858 }
859
860 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
861 {
862 gimple *stmt = gsi_stmt (si);
863 gimple_set_uid (stmt, 0);
864 add_stmt (stmt);
865 }
866 }
867 }
868
869 /* Free all levels of MASKS. */
870
871 void
872 release_vec_loop_masks (vec_loop_masks *masks)
873 {
874 rgroup_masks *rgm;
875 unsigned int i;
876 FOR_EACH_VEC_ELT (*masks, i, rgm)
877 rgm->masks.release ();
878 masks->release ();
879 }
880
881 /* Free all memory used by the _loop_vec_info, as well as all the
882 stmt_vec_info structs of all the stmts in the loop. */
883
884 _loop_vec_info::~_loop_vec_info ()
885 {
886 int nbbs;
887 gimple_stmt_iterator si;
888 int j;
889
890 nbbs = loop->num_nodes;
891 for (j = 0; j < nbbs; j++)
892 {
893 basic_block bb = bbs[j];
894 for (si = gsi_start_bb (bb); !gsi_end_p (si); )
895 {
896 gimple *stmt = gsi_stmt (si);
897
898 /* We may have broken canonical form by moving a constant
899 into RHS1 of a commutative op. Fix such occurrences. */
900 if (operands_swapped && is_gimple_assign (stmt))
901 {
902 enum tree_code code = gimple_assign_rhs_code (stmt);
903
904 if ((code == PLUS_EXPR
905 || code == POINTER_PLUS_EXPR
906 || code == MULT_EXPR)
907 && CONSTANT_CLASS_P (gimple_assign_rhs1 (stmt)))
908 swap_ssa_operands (stmt,
909 gimple_assign_rhs1_ptr (stmt),
910 gimple_assign_rhs2_ptr (stmt));
911 else if (code == COND_EXPR
912 && CONSTANT_CLASS_P (gimple_assign_rhs2 (stmt)))
913 {
914 tree cond_expr = gimple_assign_rhs1 (stmt);
915 enum tree_code cond_code = TREE_CODE (cond_expr);
916
917 if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
918 {
919 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr,
920 0));
921 cond_code = invert_tree_comparison (cond_code,
922 honor_nans);
923 if (cond_code != ERROR_MARK)
924 {
925 TREE_SET_CODE (cond_expr, cond_code);
926 swap_ssa_operands (stmt,
927 gimple_assign_rhs2_ptr (stmt),
928 gimple_assign_rhs3_ptr (stmt));
929 }
930 }
931 }
932 }
933 gsi_next (&si);
934 }
935 }
936
937 free (bbs);
938
939 release_vec_loop_masks (&masks);
940 delete ivexpr_map;
941
942 loop->aux = NULL;
943 }
944
945 /* Return an invariant or register for EXPR and emit necessary
946 computations in the LOOP_VINFO loop preheader. */
947
948 tree
949 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
950 {
951 if (is_gimple_reg (expr)
952 || is_gimple_min_invariant (expr))
953 return expr;
954
955 if (! loop_vinfo->ivexpr_map)
956 loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
957 tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
958 if (! cached)
959 {
960 gimple_seq stmts = NULL;
961 cached = force_gimple_operand (unshare_expr (expr),
962 &stmts, true, NULL_TREE);
963 if (stmts)
964 {
965 edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
966 gsi_insert_seq_on_edge_immediate (e, stmts);
967 }
968 }
969 return cached;
970 }
971
972 /* Return true if we can use CMP_TYPE as the comparison type to produce
973 all masks required to mask LOOP_VINFO. */
974
975 static bool
976 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
977 {
978 rgroup_masks *rgm;
979 unsigned int i;
980 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
981 if (rgm->mask_type != NULL_TREE
982 && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
983 cmp_type, rgm->mask_type,
984 OPTIMIZE_FOR_SPEED))
985 return false;
986 return true;
987 }
988
989 /* Return the maximum number of scalars per iteration over all the
990 rgroups in LOOP_VINFO. */
991
992 static unsigned int
993 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
994 {
995 unsigned int res = 1;
996 unsigned int i;
997 rgroup_masks *rgm;
998 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
999 res = MAX (res, rgm->max_nscalars_per_iter);
1000 return res;
1001 }
1002
1003 /* Each statement in LOOP_VINFO can be masked where necessary. Check
1004 whether we can actually generate the masks required. Return true if so,
1005 storing the type of the scalar IV in LOOP_VINFO_MASK_COMPARE_TYPE. */
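
/* Worked example (a sketch with made-up numbers): if the niter type is a
   32-bit unsigned int but the loop is known to take at most 999 back edges,
   max_ni below is refined to 1000.  With rgroups needing at most two scalars
   per iteration the largest value to be represented is 2000, which needs 11
   bits, so any integer mode of at least 11 bits whose WHILE_ULT pattern can
   produce all the required mask types is an acceptable comparison type.  */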
1006
1007 static bool
1008 vect_verify_full_masking (loop_vec_info loop_vinfo)
1009 {
1010 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1011 unsigned int min_ni_width;
1012
1013 /* Use a normal loop if there are no statements that need masking.
1014 This only happens in rare degenerate cases: it means that the loop
1015 has no loads, no stores, and no live-out values. */
1016 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1017 return false;
1018
1019 /* Get the maximum number of iterations that is representable
1020 in the counter type. */
1021 tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
1022 widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
1023
1024 /* Get a more refined estimate for the number of iterations. */
1025 widest_int max_back_edges;
1026 if (max_loop_iterations (loop, &max_back_edges))
1027 max_ni = wi::smin (max_ni, max_back_edges + 1);
1028
1029 /* Account for rgroup masks, in which each bit is replicated N times. */
1030 max_ni *= vect_get_max_nscalars_per_iter (loop_vinfo);
1031
1032 /* Work out how many bits we need to represent the limit. */
1033 min_ni_width = wi::min_precision (max_ni, UNSIGNED);
1034
1035 /* Find a scalar mode for which WHILE_ULT is supported. */
1036 opt_scalar_int_mode cmp_mode_iter;
1037 tree cmp_type = NULL_TREE;
1038 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1039 {
1040 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1041 if (cmp_bits >= min_ni_width
1042 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1043 {
1044 tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1045 if (this_type
1046 && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1047 {
1048 /* Although we could stop as soon as we find a valid mode,
1049 it's often better to continue until we hit Pmode, since the
1050 operands to the WHILE are more likely to be reusable in
1051 address calculations. */
1052 cmp_type = this_type;
1053 if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1054 break;
1055 }
1056 }
1057 }
1058
1059 if (!cmp_type)
1060 return false;
1061
1062 LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo) = cmp_type;
1063 return true;
1064 }
1065
1066 /* Calculate the cost of one scalar iteration of the loop. */
1067 static void
1068 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1069 {
1070 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1071 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1072 int nbbs = loop->num_nodes, factor;
1073 int innerloop_iters, i;
1074
1075 DUMP_VECT_SCOPE ("vect_compute_single_scalar_iteration_cost");
1076
1077 /* Gather costs for statements in the scalar loop. */
1078
1079 /* FORNOW. */
1080 innerloop_iters = 1;
1081 if (loop->inner)
1082 innerloop_iters = 50; /* FIXME */
1083
1084 for (i = 0; i < nbbs; i++)
1085 {
1086 gimple_stmt_iterator si;
1087 basic_block bb = bbs[i];
1088
1089 if (bb->loop_father == loop->inner)
1090 factor = innerloop_iters;
1091 else
1092 factor = 1;
1093
1094 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1095 {
1096 gimple *stmt = gsi_stmt (si);
1097 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
1098
1099 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1100 continue;
1101
1102 /* Skip stmts that are not vectorized inside the loop. */
1103 stmt_vec_info vstmt_info = vect_stmt_to_vectorize (stmt_info);
1104 if (!STMT_VINFO_RELEVANT_P (vstmt_info)
1105 && (!STMT_VINFO_LIVE_P (vstmt_info)
1106 || !VECTORIZABLE_CYCLE_DEF
1107 (STMT_VINFO_DEF_TYPE (vstmt_info))))
1108 continue;
1109
1110 vect_cost_for_stmt kind;
1111 if (STMT_VINFO_DATA_REF (stmt_info))
1112 {
1113 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1114 kind = scalar_load;
1115 else
1116 kind = scalar_store;
1117 }
1118 else
1119 kind = scalar_stmt;
1120
1121 record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1122 factor, kind, stmt_info, 0, vect_prologue);
1123 }
1124 }
1125
1126 /* Now accumulate cost. */
1127 void *target_cost_data = init_cost (loop);
1128 stmt_info_for_cost *si;
1129 int j;
1130 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1131 j, si)
1132 (void) add_stmt_cost (target_cost_data, si->count,
1133 si->kind, si->stmt_info, si->misalign,
1134 vect_body);
1135 unsigned dummy, body_cost = 0;
1136 finish_cost (target_cost_data, &dummy, &body_cost, &dummy);
1137 destroy_cost_data (target_cost_data);
1138 LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo) = body_cost;
1139 }
1140
1141
1142 /* Function vect_analyze_loop_form_1.
1143
1144 Verify that certain CFG restrictions hold, including:
1145 - the loop has a pre-header
1146 - the loop has a single entry and exit
1147 - the loop exit condition is simple enough
1148 - the number of iterations can be analyzed, i.e., a countable loop. The
1149 niter could be analyzed under some assumptions. */
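
/* For instance (illustrative only): a counted loop with a single latch and a
   single exit, such as

     for (i = 0; i < n; i++)
       a[i] = b[i] + c[i];

   satisfies these restrictions, whereas a loop with an early exit

     for (i = 0; i < n; i++)
       if (a[i] == key)
         break;

   is rejected below because it has more than one exit edge.  */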
1150
1151 opt_result
1152 vect_analyze_loop_form_1 (struct loop *loop, gcond **loop_cond,
1153 tree *assumptions, tree *number_of_iterationsm1,
1154 tree *number_of_iterations, gcond **inner_loop_cond)
1155 {
1156 DUMP_VECT_SCOPE ("vect_analyze_loop_form");
1157
1158 /* Different restrictions apply when we are considering an inner-most loop,
1159 vs. an outer (nested) loop.
1160 (FORNOW. May want to relax some of these restrictions in the future). */
1161
1162 if (!loop->inner)
1163 {
1164 /* Inner-most loop. We currently require that the number of BBs is
1165 exactly 2 (the header and latch). Vectorizable inner-most loops
1166 look like this:
1167
1168 (pre-header)
1169 |
1170 header <--------+
1171 | | |
1172 | +--> latch --+
1173 |
1174 (exit-bb) */
1175
1176 if (loop->num_nodes != 2)
1177 return opt_result::failure_at (vect_location,
1178 "not vectorized:"
1179 " control flow in loop.\n");
1180
1181 if (empty_block_p (loop->header))
1182 return opt_result::failure_at (vect_location,
1183 "not vectorized: empty loop.\n");
1184 }
1185 else
1186 {
1187 struct loop *innerloop = loop->inner;
1188 edge entryedge;
1189
1190 /* Nested loop. We currently require that the loop is doubly-nested,
1191 contains a single inner loop, and the number of BBs is exactly 5.
1192 Vectorizable outer-loops look like this:
1193
1194 (pre-header)
1195 |
1196 header <---+
1197 | |
1198 inner-loop |
1199 | |
1200 tail ------+
1201 |
1202 (exit-bb)
1203
1204 The inner-loop has the properties expected of inner-most loops
1205 as described above. */
1206
1207 if ((loop->inner)->inner || (loop->inner)->next)
1208 return opt_result::failure_at (vect_location,
1209 "not vectorized:"
1210 " multiple nested loops.\n");
1211
1212 if (loop->num_nodes != 5)
1213 return opt_result::failure_at (vect_location,
1214 "not vectorized:"
1215 " control flow in loop.\n");
1216
1217 entryedge = loop_preheader_edge (innerloop);
1218 if (entryedge->src != loop->header
1219 || !single_exit (innerloop)
1220 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1221 return opt_result::failure_at (vect_location,
1222 "not vectorized:"
1223 " unsupported outerloop form.\n");
1224
1225 /* Analyze the inner-loop. */
1226 tree inner_niterm1, inner_niter, inner_assumptions;
1227 opt_result res
1228 = vect_analyze_loop_form_1 (loop->inner, inner_loop_cond,
1229 &inner_assumptions, &inner_niterm1,
1230 &inner_niter, NULL);
1231 if (!res)
1232 {
1233 if (dump_enabled_p ())
1234 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1235 "not vectorized: Bad inner loop.\n");
1236 return res;
1237 }
1238
1239 /* Don't support analyzing niter under assumptions for inner
1240 loop. */
1241 if (!integer_onep (inner_assumptions))
1242 return opt_result::failure_at (vect_location,
1243 "not vectorized: Bad inner loop.\n");
1244
1245 if (!expr_invariant_in_loop_p (loop, inner_niter))
1246 return opt_result::failure_at (vect_location,
1247 "not vectorized: inner-loop count not"
1248 " invariant.\n");
1249
1250 if (dump_enabled_p ())
1251 dump_printf_loc (MSG_NOTE, vect_location,
1252 "Considering outer-loop vectorization.\n");
1253 }
1254
1255 if (!single_exit (loop))
1256 return opt_result::failure_at (vect_location,
1257 "not vectorized: multiple exits.\n");
1258 if (EDGE_COUNT (loop->header->preds) != 2)
1259 return opt_result::failure_at (vect_location,
1260 "not vectorized:"
1261 " too many incoming edges.\n");
1262
1263 /* We assume that the loop exit condition is at the end of the loop, i.e.,
1264 that the loop is represented as a do-while (with a proper if-guard
1265 before the loop if needed), where the loop header contains all the
1266 executable statements, and the latch is empty. */
1267 if (!empty_block_p (loop->latch)
1268 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1269 return opt_result::failure_at (vect_location,
1270 "not vectorized: latch block not empty.\n");
1271
1272 /* Make sure the exit is not abnormal. */
1273 edge e = single_exit (loop);
1274 if (e->flags & EDGE_ABNORMAL)
1275 return opt_result::failure_at (vect_location,
1276 "not vectorized:"
1277 " abnormal loop exit edge.\n");
1278
1279 *loop_cond = vect_get_loop_niters (loop, assumptions, number_of_iterations,
1280 number_of_iterationsm1);
1281 if (!*loop_cond)
1282 return opt_result::failure_at
1283 (vect_location,
1284 "not vectorized: complicated exit condition.\n");
1285
1286 if (integer_zerop (*assumptions)
1287 || !*number_of_iterations
1288 || chrec_contains_undetermined (*number_of_iterations))
1289 return opt_result::failure_at
1290 (*loop_cond,
1291 "not vectorized: number of iterations cannot be computed.\n");
1292
1293 if (integer_zerop (*number_of_iterations))
1294 return opt_result::failure_at
1295 (*loop_cond,
1296 "not vectorized: number of iterations = 0.\n");
1297
1298 return opt_result::success ();
1299 }
1300
1301 /* Analyze LOOP form and return a loop_vec_info if it is of suitable form. */
1302
1303 opt_loop_vec_info
1304 vect_analyze_loop_form (struct loop *loop, vec_info_shared *shared)
1305 {
1306 tree assumptions, number_of_iterations, number_of_iterationsm1;
1307 gcond *loop_cond, *inner_loop_cond = NULL;
1308
1309 opt_result res
1310 = vect_analyze_loop_form_1 (loop, &loop_cond,
1311 &assumptions, &number_of_iterationsm1,
1312 &number_of_iterations, &inner_loop_cond);
1313 if (!res)
1314 return opt_loop_vec_info::propagate_failure (res);
1315
1316 loop_vec_info loop_vinfo = new _loop_vec_info (loop, shared);
1317 LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
1318 LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1319 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1320 if (!integer_onep (assumptions))
1321 {
1322 /* We consider vectorizing this loop by versioning it under
1323 some assumptions. In order to do this, we need to clear
1324 existing information computed by scev and niter analyzer. */
1325 scev_reset_htab ();
1326 free_numbers_of_iterations_estimates (loop);
1327 /* Also set flag for this loop so that following scev and niter
1328 analysis are done under the assumptions. */
1329 loop_constraint_set (loop, LOOP_C_FINITE);
1330 /* Also record the assumptions for versioning. */
1331 LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = assumptions;
1332 }
1333
1334 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1335 {
1336 if (dump_enabled_p ())
1337 {
1338 dump_printf_loc (MSG_NOTE, vect_location,
1339 "Symbolic number of iterations is ");
1340 dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1341 dump_printf (MSG_NOTE, "\n");
1342 }
1343 }
1344
1345 stmt_vec_info loop_cond_info = loop_vinfo->lookup_stmt (loop_cond);
1346 STMT_VINFO_TYPE (loop_cond_info) = loop_exit_ctrl_vec_info_type;
1347 if (inner_loop_cond)
1348 {
1349 stmt_vec_info inner_loop_cond_info
1350 = loop_vinfo->lookup_stmt (inner_loop_cond);
1351 STMT_VINFO_TYPE (inner_loop_cond_info) = loop_exit_ctrl_vec_info_type;
1352 }
1353
1354 gcc_assert (!loop->aux);
1355 loop->aux = loop_vinfo;
1356 return opt_loop_vec_info::success (loop_vinfo);
1357 }
1358
1359
1360
1361 /* Scan the loop stmts and, depending on whether there are any (non-)SLP
1362 statements, update the vectorization factor. */
1363
1364 static void
1365 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1366 {
1367 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1368 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1369 int nbbs = loop->num_nodes;
1370 poly_uint64 vectorization_factor;
1371 int i;
1372
1373 DUMP_VECT_SCOPE ("vect_update_vf_for_slp");
1374
1375 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1376 gcc_assert (known_ne (vectorization_factor, 0U));
1377
1378 /* If all the stmts in the loop can be SLPed, we perform only SLP, and the
1379 vectorization factor of the loop is the unrolling factor required by
1380 the SLP instances. If that unrolling factor is 1, we say that we
1381 perform pure SLP on the loop; cross-iteration parallelism is not
1382 exploited. */
1383 bool only_slp_in_loop = true;
1384 for (i = 0; i < nbbs; i++)
1385 {
1386 basic_block bb = bbs[i];
1387 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1388 gsi_next (&si))
1389 {
1390 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
1391 stmt_info = vect_stmt_to_vectorize (stmt_info);
1392 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1393 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1394 && !PURE_SLP_STMT (stmt_info))
1395 /* STMT needs both SLP and loop-based vectorization. */
1396 only_slp_in_loop = false;
1397 }
1398 }
1399
1400 if (only_slp_in_loop)
1401 {
1402 if (dump_enabled_p ())
1403 dump_printf_loc (MSG_NOTE, vect_location,
1404 "Loop contains only SLP stmts\n");
1405 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1406 }
1407 else
1408 {
1409 if (dump_enabled_p ())
1410 dump_printf_loc (MSG_NOTE, vect_location,
1411 "Loop contains SLP and non-SLP stmts\n");
1412 /* Both the vectorization factor and unroll factor have the form
1413 current_vector_size * X for some rational X, so they must have
1414 a common multiple. */
1415 vectorization_factor
1416 = force_common_multiple (vectorization_factor,
1417 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1418 }
1419
1420 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1421 if (dump_enabled_p ())
1422 {
1423 dump_printf_loc (MSG_NOTE, vect_location,
1424 "Updating vectorization factor to ");
1425 dump_dec (MSG_NOTE, vectorization_factor);
1426 dump_printf (MSG_NOTE, ".\n");
1427 }
1428 }
1429
1430 /* Return true if STMT_INFO describes a double reduction phi and if
1431 the other phi in the reduction is also relevant for vectorization.
1432 This rejects cases such as:
1433
1434 outer1:
1435 x_1 = PHI <x_3(outer2), ...>;
1436 ...
1437
1438 inner:
1439 x_2 = ...;
1440 ...
1441
1442 outer2:
1443 x_3 = PHI <x_2(inner)>;
1444
1445 if nothing in x_2 or elsewhere makes x_1 relevant. */
1446
1447 static bool
1448 vect_active_double_reduction_p (stmt_vec_info stmt_info)
1449 {
1450 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
1451 return false;
1452
1453 return STMT_VINFO_RELEVANT_P (STMT_VINFO_REDUC_DEF (stmt_info));
1454 }
1455
1456 /* Function vect_analyze_loop_operations.
1457
1458 Scan the loop stmts and make sure they are all vectorizable. */
1459
1460 static opt_result
1461 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1462 {
1463 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1464 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1465 int nbbs = loop->num_nodes;
1466 int i;
1467 stmt_vec_info stmt_info;
1468 bool need_to_vectorize = false;
1469 bool ok;
1470
1471 DUMP_VECT_SCOPE ("vect_analyze_loop_operations");
1472
1473 auto_vec<stmt_info_for_cost> cost_vec;
1474
1475 for (i = 0; i < nbbs; i++)
1476 {
1477 basic_block bb = bbs[i];
1478
1479 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1480 gsi_next (&si))
1481 {
1482 gphi *phi = si.phi ();
1483 ok = true;
1484
1485 stmt_info = loop_vinfo->lookup_stmt (phi);
1486 if (dump_enabled_p ())
1487 dump_printf_loc (MSG_NOTE, vect_location, "examining phi: %G", phi);
1488 if (virtual_operand_p (gimple_phi_result (phi)))
1489 continue;
1490
1491 /* Inner-loop loop-closed exit phi in outer-loop vectorization
1492 (i.e., a phi in the tail of the outer-loop). */
1493 if (! is_loop_header_bb_p (bb))
1494 {
1495 /* FORNOW: we currently don't support the case that these phis
1496 are not used in the outer loop (unless it is a double reduction,
1497 i.e., this phi is vect_reduction_def), because this case
1498 would require us to actually do something here. */
1499 if (STMT_VINFO_LIVE_P (stmt_info)
1500 && !vect_active_double_reduction_p (stmt_info))
1501 return opt_result::failure_at (phi,
1502 "Unsupported loop-closed phi"
1503 " in outer-loop.\n");
1504
1505 /* If PHI is used in the outer loop, we check that its operand
1506 is defined in the inner loop. */
1507 if (STMT_VINFO_RELEVANT_P (stmt_info))
1508 {
1509 tree phi_op;
1510
1511 if (gimple_phi_num_args (phi) != 1)
1512 return opt_result::failure_at (phi, "unsupported phi");
1513
1514 phi_op = PHI_ARG_DEF (phi, 0);
1515 stmt_vec_info op_def_info = loop_vinfo->lookup_def (phi_op);
1516 if (!op_def_info)
1517 return opt_result::failure_at (phi, "unsupported phi");
1518
1519 if (STMT_VINFO_RELEVANT (op_def_info) != vect_used_in_outer
1520 && (STMT_VINFO_RELEVANT (op_def_info)
1521 != vect_used_in_outer_by_reduction))
1522 return opt_result::failure_at (phi, "unsupported phi");
1523 }
1524
1525 continue;
1526 }
1527
1528 gcc_assert (stmt_info);
1529
1530 if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1531 || STMT_VINFO_LIVE_P (stmt_info))
1532 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1533 /* A scalar-dependence cycle that we don't support. */
1534 return opt_result::failure_at (phi,
1535 "not vectorized:"
1536 " scalar dependence cycle.\n");
1537
1538 if (STMT_VINFO_RELEVANT_P (stmt_info))
1539 {
1540 need_to_vectorize = true;
1541 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
1542 && ! PURE_SLP_STMT (stmt_info))
1543 ok = vectorizable_induction (stmt_info, NULL, NULL, NULL,
1544 &cost_vec);
1545 else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1546 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
1547 && ! PURE_SLP_STMT (stmt_info))
1548 ok = vectorizable_reduction (stmt_info, NULL, NULL, NULL, NULL,
1549 &cost_vec);
1550 }
1551
1552 /* SLP PHIs are tested by vect_slp_analyze_node_operations. */
1553 if (ok
1554 && STMT_VINFO_LIVE_P (stmt_info)
1555 && !PURE_SLP_STMT (stmt_info))
1556 ok = vectorizable_live_operation (stmt_info, NULL, NULL, -1, NULL,
1557 &cost_vec);
1558
1559 if (!ok)
1560 return opt_result::failure_at (phi,
1561 "not vectorized: relevant phi not "
1562 "supported: %G",
1563 static_cast <gimple *> (phi));
1564 }
1565
1566 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1567 gsi_next (&si))
1568 {
1569 gimple *stmt = gsi_stmt (si);
1570 if (!gimple_clobber_p (stmt))
1571 {
1572 opt_result res
1573 = vect_analyze_stmt (loop_vinfo->lookup_stmt (stmt),
1574 &need_to_vectorize,
1575 NULL, NULL, &cost_vec);
1576 if (!res)
1577 return res;
1578 }
1579 }
1580 } /* bbs */
1581
1582 add_stmt_costs (loop_vinfo->target_cost_data, &cost_vec);
1583
1584 /* All operations in the loop are either irrelevant (they deal with loop
1585 control, or are dead), or are only used outside the loop and can be moved
1586 out of the loop (e.g. invariants, inductions). The loop can be
1587 optimized away by scalar optimizations. We're better off not
1588 touching this loop. */
1589 if (!need_to_vectorize)
1590 {
1591 if (dump_enabled_p ())
1592 dump_printf_loc (MSG_NOTE, vect_location,
1593 "All the computation can be taken out of the loop.\n");
1594 return opt_result::failure_at
1595 (vect_location,
1596 "not vectorized: redundant loop. no profit to vectorize.\n");
1597 }
1598
1599 return opt_result::success ();
1600 }
1601
1602 /* Analyze the cost of the loop described by LOOP_VINFO. Decide if it
1603 is worthwhile to vectorize. Return 1 if definitely yes, 0 if
1604 definitely no, or -1 if it's worth retrying. */
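
/* Worked example (illustrative, with made-up numbers): with an assumed VF of
   4 and --param min-vect-loop-bound=2, min_scalar_loop_bound is 8; if the
   cost model computes min_profitable_iters = 12, the threshold th becomes
   12, and a loop known to run fewer than 12 iterations is rejected here even
   though vectorizing it would be possible.  */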
1605
1606 static int
1607 vect_analyze_loop_costing (loop_vec_info loop_vinfo)
1608 {
1609 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1610 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1611
1612 /* Only fully-masked loops can have iteration counts less than the
1613 vectorization factor. */
1614 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
1615 {
1616 HOST_WIDE_INT max_niter;
1617
1618 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1619 max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
1620 else
1621 max_niter = max_stmt_executions_int (loop);
1622
1623 if (max_niter != -1
1624 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
1625 {
1626 if (dump_enabled_p ())
1627 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1628 "not vectorized: iteration count smaller than "
1629 "vectorization factor.\n");
1630 return 0;
1631 }
1632 }
1633
1634 int min_profitable_iters, min_profitable_estimate;
1635 vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1636 &min_profitable_estimate);
1637
1638 if (min_profitable_iters < 0)
1639 {
1640 if (dump_enabled_p ())
1641 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1642 "not vectorized: vectorization not profitable.\n");
1643 if (dump_enabled_p ())
1644 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1645 "not vectorized: vector version will never be "
1646 "profitable.\n");
1647 return -1;
1648 }
1649
1650 int min_scalar_loop_bound = (PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
1651 * assumed_vf);
1652
1653 /* Use the cost model only if it is more conservative than the user-specified
1654 threshold. */
1655 unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
1656 min_profitable_iters);
1657
1658 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
1659
1660 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1661 && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
1662 {
1663 if (dump_enabled_p ())
1664 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1665 "not vectorized: vectorization not profitable.\n");
1666 if (dump_enabled_p ())
1667 dump_printf_loc (MSG_NOTE, vect_location,
1668 "not vectorized: iteration count smaller than user "
1669 "specified loop bound parameter or minimum profitable "
1670 "iterations (whichever is more conservative).\n");
1671 return 0;
1672 }
1673
1674 HOST_WIDE_INT estimated_niter = estimated_stmt_executions_int (loop);
1675 if (estimated_niter == -1)
1676 estimated_niter = likely_max_stmt_executions_int (loop);
1677 if (estimated_niter != -1
1678 && ((unsigned HOST_WIDE_INT) estimated_niter
1679 < MAX (th, (unsigned) min_profitable_estimate)))
1680 {
1681 if (dump_enabled_p ())
1682 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1683 "not vectorized: estimated iteration count too "
1684 "small.\n");
1685 if (dump_enabled_p ())
1686 dump_printf_loc (MSG_NOTE, vect_location,
1687 "not vectorized: estimated iteration count smaller "
1688 "than specified loop bound parameter or minimum "
1689 "profitable iterations (whichever is more "
1690 "conservative).\n");
1691 return -1;
1692 }
1693
1694 return 1;
1695 }
1696
1697 static opt_result
1698 vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
1699 vec<data_reference_p> *datarefs,
1700 unsigned int *n_stmts)
1701 {
1702 *n_stmts = 0;
1703 for (unsigned i = 0; i < loop->num_nodes; i++)
1704 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
1705 !gsi_end_p (gsi); gsi_next (&gsi))
1706 {
1707 gimple *stmt = gsi_stmt (gsi);
1708 if (is_gimple_debug (stmt))
1709 continue;
1710 ++(*n_stmts);
1711 opt_result res = vect_find_stmt_data_reference (loop, stmt, datarefs);
1712 if (!res)
1713 {
1714 if (is_gimple_call (stmt) && loop->safelen)
1715 {
1716 tree fndecl = gimple_call_fndecl (stmt), op;
1717 if (fndecl != NULL_TREE)
1718 {
1719 cgraph_node *node = cgraph_node::get (fndecl);
1720 if (node != NULL && node->simd_clones != NULL)
1721 {
1722 unsigned int j, n = gimple_call_num_args (stmt);
1723 for (j = 0; j < n; j++)
1724 {
1725 op = gimple_call_arg (stmt, j);
1726 if (DECL_P (op)
1727 || (REFERENCE_CLASS_P (op)
1728 && get_base_address (op)))
1729 break;
1730 }
1731 op = gimple_call_lhs (stmt);
1732 /* Ignore #pragma omp declare simd functions
1733 if they don't have data references in the
1734 call stmt itself. */
1735 if (j == n
1736 && !(op
1737 && (DECL_P (op)
1738 || (REFERENCE_CLASS_P (op)
1739 && get_base_address (op)))))
1740 continue;
1741 }
1742 }
1743 }
1744 return res;
1745 }
1746 /* If dependence analysis will give up due to the limit on the
1747 number of datarefs, stop here and fail fatally. */
1748 if (datarefs->length ()
1749 > (unsigned)PARAM_VALUE (PARAM_LOOP_MAX_DATAREFS_FOR_DATADEPS))
1750 return opt_result::failure_at (stmt, "exceeded param "
1751 "loop-max-datarefs-for-datadeps\n");
1752 }
1753 return opt_result::success ();
1754 }
1755
1756 /* Function vect_analyze_loop_2.
1757
1758 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1759 for it. The different analyses will record information in the
1760 loop_vec_info struct. */
1761 static opt_result
1762 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal, unsigned *n_stmts)
1763 {
1764 opt_result ok = opt_result::success ();
1765 int res;
1766 unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
1767 poly_uint64 min_vf = 2;
1768
1769 /* The first group of checks is independent of the vector size. */
1770 fatal = true;
1771
1772 /* Find all data references in the loop (which correspond to vdefs/vuses)
1773 and analyze their evolution in the loop. */
1774
1775 loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
1776
1777 /* Gather the data references and count stmts in the loop. */
1778 if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ())
1779 {
1780 opt_result res
1781 = vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo),
1782 &LOOP_VINFO_DATAREFS (loop_vinfo),
1783 n_stmts);
1784 if (!res)
1785 {
1786 if (dump_enabled_p ())
1787 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1788 "not vectorized: loop contains function "
1789 "calls or data references that cannot "
1790 "be analyzed\n");
1791 return res;
1792 }
1793 loop_vinfo->shared->save_datarefs ();
1794 }
1795 else
1796 loop_vinfo->shared->check_datarefs ();
1797
1798 /* Analyze the data references and also adjust the minimal
1799 vectorization factor according to the loads and stores. */
1800
1801 ok = vect_analyze_data_refs (loop_vinfo, &min_vf);
1802 if (!ok)
1803 {
1804 if (dump_enabled_p ())
1805 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1806 "bad data references.\n");
1807 return ok;
1808 }
1809
1810 /* Classify all cross-iteration scalar data-flow cycles.
1811 Cross-iteration cycles caused by virtual phis are analyzed separately. */
1812 vect_analyze_scalar_cycles (loop_vinfo);
1813
1814 vect_pattern_recog (loop_vinfo);
1815
1816 vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
1817
1818 /* Analyze the access patterns of the data-refs in the loop (consecutive,
1819 complex, etc.). FORNOW: Only handle consecutive access pattern. */
1820
1821 ok = vect_analyze_data_ref_accesses (loop_vinfo);
1822 if (!ok)
1823 {
1824 if (dump_enabled_p ())
1825 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1826 "bad data access.\n");
1827 return ok;
1828 }
1829
1830 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
1831
1832 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
1833 if (!ok)
1834 {
1835 if (dump_enabled_p ())
1836 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1837 "unexpected pattern.\n");
1838 return ok;
1839 }
1840
1841 /* In contrast, the rest of the analysis below depends on the chosen vector size in some way. */
1842 fatal = false;
1843
1844 /* Analyze data dependences between the data-refs in the loop
1845 and adjust the maximum vectorization factor according to
1846 the dependences.
1847 FORNOW: fail at the first data dependence that we encounter. */
1848
1849 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
1850 if (!ok)
1851 {
1852 if (dump_enabled_p ())
1853 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1854 "bad data dependence.\n");
1855 return ok;
1856 }
1857 if (max_vf != MAX_VECTORIZATION_FACTOR
1858 && maybe_lt (max_vf, min_vf))
1859 return opt_result::failure_at (vect_location, "bad data dependence.\n");
1860 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
1861
1862 ok = vect_determine_vectorization_factor (loop_vinfo);
1863 if (!ok)
1864 {
1865 if (dump_enabled_p ())
1866 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1867 "can't determine vectorization factor.\n");
1868 return ok;
1869 }
1870 if (max_vf != MAX_VECTORIZATION_FACTOR
1871 && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1872 return opt_result::failure_at (vect_location, "bad data dependence.\n");
1873
1874 /* Compute the scalar iteration cost. */
1875 vect_compute_single_scalar_iteration_cost (loop_vinfo);
1876
1877 poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1878 unsigned th;
1879
1880 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */
1881 ok = vect_analyze_slp (loop_vinfo, *n_stmts);
1882 if (!ok)
1883 return ok;
1884
1885 /* If there are any SLP instances mark them as pure_slp. */
1886 bool slp = vect_make_slp_decision (loop_vinfo);
1887 if (slp)
1888 {
1889 /* Find stmts that need to be both vectorized and SLPed. */
1890 vect_detect_hybrid_slp (loop_vinfo);
1891
1892 /* Update the vectorization factor based on the SLP decision. */
1893 vect_update_vf_for_slp (loop_vinfo);
1894 }
1895
1896 bool saved_can_fully_mask_p = LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo);
1897
1898 /* We don't expect to have to roll back to anything other than an empty
1899 set of rgroups. */
1900 gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
1901
1902 /* This is the point where we can re-start analysis with SLP forced off. */
1903 start_over:
1904
1905 /* Now the vectorization factor is final. */
1906 poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1907 gcc_assert (known_ne (vectorization_factor, 0U));
1908
1909 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
1910 {
1911 dump_printf_loc (MSG_NOTE, vect_location,
1912 "vectorization_factor = ");
1913 dump_dec (MSG_NOTE, vectorization_factor);
1914 dump_printf (MSG_NOTE, ", niters = %wd\n",
1915 LOOP_VINFO_INT_NITERS (loop_vinfo));
1916 }
1917
1918 HOST_WIDE_INT max_niter
1919 = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1920
1921 /* Analyze the alignment of the data-refs in the loop.
1922 Fail if a data reference is found that cannot be vectorized. */
1923
1924 ok = vect_analyze_data_refs_alignment (loop_vinfo);
1925 if (!ok)
1926 {
1927 if (dump_enabled_p ())
1928 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1929 "bad data alignment.\n");
1930 return ok;
1931 }
1932
1933 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
1934 It is important to call pruning after vect_analyze_data_ref_accesses,
1935 since we use grouping information gathered by interleaving analysis. */
1936 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
1937 if (!ok)
1938 return ok;
1939
1940 /* Do not invoke vect_enhance_data_refs_alignment for epilogue
1941 vectorization, since we do not want to add extra peeling or
1942 add versioning for alignment. */
1943 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
1944 /* This pass will decide on using loop versioning and/or loop peeling in
1945 order to enhance the alignment of data references in the loop. */
1946 ok = vect_enhance_data_refs_alignment (loop_vinfo);
1947 else
1948 ok = vect_verify_datarefs_alignment (loop_vinfo);
1949 if (!ok)
1950 return ok;
1951
1952 if (slp)
1953 {
1954 /* Analyze operations in the SLP instances. Note this may
1955 remove unsupported SLP instances which makes the above
1956 SLP kind detection invalid. */
1957 unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
1958 vect_slp_analyze_operations (loop_vinfo);
1959 if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
1960 {
1961 ok = opt_result::failure_at (vect_location,
1962 "unsupported SLP instances\n");
1963 goto again;
1964 }
1965 }
1966
1967 /* Scan all the remaining operations in the loop that are not subject
1968 to SLP and make sure they are vectorizable. */
1969 ok = vect_analyze_loop_operations (loop_vinfo);
1970 if (!ok)
1971 {
1972 if (dump_enabled_p ())
1973 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1974 "bad operation or unsupported loop bound.\n");
1975 return ok;
1976 }
1977
1978 /* Decide whether to use a fully-masked loop for this vectorization
1979 factor. */
1980 LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
1981 = (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo)
1982 && vect_verify_full_masking (loop_vinfo));
1983 if (dump_enabled_p ())
1984 {
1985 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
1986 dump_printf_loc (MSG_NOTE, vect_location,
1987 "using a fully-masked loop.\n");
1988 else
1989 dump_printf_loc (MSG_NOTE, vect_location,
1990 "not using a fully-masked loop.\n");
1991 }
1992
1993 /* If epilog loop is required because of data accesses with gaps,
1994 one additional iteration needs to be peeled. Check if there are
1995 enough iterations for vectorization. */
1996 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1997 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1998 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
1999 {
2000 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2001 tree scalar_niters = LOOP_VINFO_NITERSM1 (loop_vinfo);
2002
2003 if (known_lt (wi::to_widest (scalar_niters), vf))
2004 return opt_result::failure_at (vect_location,
2005 "loop has no enough iterations to"
2006 " support peeling for gaps.\n");
2007 }
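/* For example, with VF = 4 and a known trip count of 4 (so NITERSM1 = 3),
   the iteration peeled for the gap would leave fewer than one full vector
   iteration, and the check above rejects vectorization.  */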
2008
2009 /* Check that the costings of the loop make vectorizing worthwhile. */
2010 res = vect_analyze_loop_costing (loop_vinfo);
2011 if (res < 0)
2012 {
2013 ok = opt_result::failure_at (vect_location,
2014 "Loop costings may not be worthwhile.\n");
2015 goto again;
2016 }
2017 if (!res)
2018 return opt_result::failure_at (vect_location,
2019 "Loop costings not worthwhile.\n");
2020
2021 /* Decide whether we need to create an epilogue loop to handle
2022 remaining scalar iterations. */
2023 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
2024
2025 unsigned HOST_WIDE_INT const_vf;
2026 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2027 /* The main loop handles all iterations. */
2028 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2029 else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2030 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
2031 {
2032 /* Work out the (constant) number of iterations that need to be
2033 peeled for reasons other than niters. */
2034 unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2035 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2036 peel_niter += 1;
2037 if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
2038 LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2039 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2040 }
2041 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2042 /* ??? When peeling for gaps but not alignment, we could
2043 try to check whether the (variable) niters is known to be
2044 VF * N + 1. That's something of a niche case though. */
2045 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2046 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
2047 || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
2048 < (unsigned) exact_log2 (const_vf))
2049 /* In case of versioning, check if the maximum number of
2050 iterations is greater than th. If they are identical,
2051 the epilogue is unnecessary. */
2052 && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
2053 || ((unsigned HOST_WIDE_INT) max_niter
2054 > (th / const_vf) * const_vf))))
2055 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
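/* Note that for a (power-of-two) constant VF, NITERS is known to be a
   multiple of VF exactly when it has at least log2 (VF) known trailing
   zero bits, which is what the tree_ctz test above checks.  */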
2056
2057 /* If an epilogue loop is required make sure we can create one. */
2058 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2059 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2060 {
2061 if (dump_enabled_p ())
2062 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2063 if (!vect_can_advance_ivs_p (loop_vinfo)
2064 || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
2065 single_exit (LOOP_VINFO_LOOP
2066 (loop_vinfo))))
2067 {
2068 ok = opt_result::failure_at (vect_location,
2069 "not vectorized: can't create required "
2070 "epilog loop\n");
2071 goto again;
2072 }
2073 }
2074
2075 /* During peeling, we need to check if the number of loop iterations is
2076 enough for both the peeled prolog loop and the vector loop. This check
2077 can be merged with the threshold check of loop versioning, so
2078 increase the threshold for this case if necessary. */
2079 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
2080 {
2081 poly_uint64 niters_th = 0;
2082
2083 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2084 {
2085 /* Niters for peeled prolog loop. */
2086 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2087 {
2088 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2089 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
2090 niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
2091 }
2092 else
2093 niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2094 }
2095
2096 /* Niters for at least one iteration of vectorized loop. */
2097 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2098 niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2099 /* One additional iteration because of peeling for gap. */
2100 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2101 niters_th += 1;
2102 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
2103 }
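/* As a rough illustration, with VF = 4, an unknown prologue peel amount
   (so VF - 1 = 3 prologue iterations are assumed, taking the unaligned
   access's vector type to also have VF elements) and peeling for gaps,
   the versioning threshold computed above is 3 + 4 + 1 = 8.  */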
2104
2105 gcc_assert (known_eq (vectorization_factor,
2106 LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
2107
2108 /* Ok to vectorize! */
2109 return opt_result::success ();
2110
2111 again:
2112 /* Ensure that "ok" is false (with an opt_problem if dumping is enabled). */
2113 gcc_assert (!ok);
2114
2115 /* Try again with SLP forced off, but if we didn't do any SLP there is
2116 no point in re-trying. */
2117 if (!slp)
2118 return ok;
2119
2120 /* If there are reduction chains re-trying will fail anyway. */
2121 if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
2122 return ok;
2123
2124 /* Likewise if the grouped loads or stores in the SLP cannot be handled
2125 via interleaving or lane instructions. */
2126 slp_instance instance;
2127 slp_tree node;
2128 unsigned i, j;
2129 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
2130 {
2131 stmt_vec_info vinfo;
2132 vinfo = SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0];
2133 if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
2134 continue;
2135 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2136 unsigned int size = DR_GROUP_SIZE (vinfo);
2137 tree vectype = STMT_VINFO_VECTYPE (vinfo);
2138 if (! vect_store_lanes_supported (vectype, size, false)
2139 && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
2140 && ! vect_grouped_store_supported (vectype, size))
2141 return opt_result::failure_at (vinfo->stmt,
2142 "unsupported grouped store\n");
2143 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
2144 {
2145 vinfo = SLP_TREE_SCALAR_STMTS (node)[0];
2146 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2147 bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
2148 size = DR_GROUP_SIZE (vinfo);
2149 vectype = STMT_VINFO_VECTYPE (vinfo);
2150 if (! vect_load_lanes_supported (vectype, size, false)
2151 && ! vect_grouped_load_supported (vectype, single_element_p,
2152 size))
2153 return opt_result::failure_at (vinfo->stmt,
2154 "unsupported grouped load\n");
2155 }
2156 }
2157
2158 if (dump_enabled_p ())
2159 dump_printf_loc (MSG_NOTE, vect_location,
2160 "re-trying with SLP disabled\n");
2161
2162 /* Roll back state appropriately. No SLP this time. */
2163 slp = false;
2164 /* Restore the vectorization factor as it was without SLP. */
2165 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
2166 /* Free the SLP instances. */
2167 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
2168 vect_free_slp_instance (instance, false);
2169 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
2170 /* Reset SLP type to loop_vect on all stmts. */
2171 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2172 {
2173 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2174 for (gimple_stmt_iterator si = gsi_start_phis (bb);
2175 !gsi_end_p (si); gsi_next (&si))
2176 {
2177 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2178 STMT_SLP_TYPE (stmt_info) = loop_vect;
2179 }
2180 for (gimple_stmt_iterator si = gsi_start_bb (bb);
2181 !gsi_end_p (si); gsi_next (&si))
2182 {
2183 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2184 STMT_SLP_TYPE (stmt_info) = loop_vect;
2185 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2186 {
2187 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
2188 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
2189 STMT_SLP_TYPE (stmt_info) = loop_vect;
2190 for (gimple_stmt_iterator pi = gsi_start (pattern_def_seq);
2191 !gsi_end_p (pi); gsi_next (&pi))
2192 STMT_SLP_TYPE (loop_vinfo->lookup_stmt (gsi_stmt (pi)))
2193 = loop_vect;
2194 }
2195 }
2196 }
2197 /* Free optimized alias test DDRS. */
2198 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
2199 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
2200 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
2201 /* Reset target cost data. */
2202 destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
2203 LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
2204 = init_cost (LOOP_VINFO_LOOP (loop_vinfo));
2205 /* Reset accumulated rgroup information. */
2206 release_vec_loop_masks (&LOOP_VINFO_MASKS (loop_vinfo));
2207 /* Reset assorted flags. */
2208 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2209 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
2210 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
2211 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
2212 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = saved_can_fully_mask_p;
2213
2214 goto start_over;
2215 }
2216
2217 /* Function vect_analyze_loop.
2218
2219 Apply a set of analyses on LOOP, and create a loop_vec_info struct
2220 for it. The different analyses will record information in the
2221 loop_vec_info struct. If ORIG_LOOP_VINFO is not NULL, then LOOP is an
2222 epilogue loop that must be vectorized. */
2223 opt_loop_vec_info
2224 vect_analyze_loop (struct loop *loop, loop_vec_info orig_loop_vinfo,
2225 vec_info_shared *shared)
2226 {
2227 auto_vector_sizes vector_sizes;
2228
2229 /* Autodetect first vector size we try. */
2230 current_vector_size = 0;
2231 targetm.vectorize.autovectorize_vector_sizes (&vector_sizes);
2232 unsigned int next_size = 0;
2233
2234 DUMP_VECT_SCOPE ("analyze_loop_nest");
2235
2236 if (loop_outer (loop)
2237 && loop_vec_info_for_loop (loop_outer (loop))
2238 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
2239 return opt_loop_vec_info::failure_at (vect_location,
2240 "outer-loop already vectorized.\n");
2241
2242 if (!find_loop_nest (loop, &shared->loop_nest))
2243 return opt_loop_vec_info::failure_at
2244 (vect_location,
2245 "not vectorized: loop nest containing two or more consecutive inner"
2246 " loops cannot be vectorized\n");
2247
2248 unsigned n_stmts = 0;
2249 poly_uint64 autodetected_vector_size = 0;
2250 while (1)
2251 {
2252 /* Check the CFG characteristics of the loop (nesting, entry/exit). */
2253 opt_loop_vec_info loop_vinfo
2254 = vect_analyze_loop_form (loop, shared);
2255 if (!loop_vinfo)
2256 {
2257 if (dump_enabled_p ())
2258 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2259 "bad loop form.\n");
2260 return loop_vinfo;
2261 }
2262
2263 bool fatal = false;
2264
2265 if (orig_loop_vinfo)
2266 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = orig_loop_vinfo;
2267
2268 opt_result res = vect_analyze_loop_2 (loop_vinfo, fatal, &n_stmts);
2269 if (res)
2270 {
2271 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
2272
2273 return loop_vinfo;
2274 }
2275
2276 delete loop_vinfo;
2277
2278 if (next_size == 0)
2279 autodetected_vector_size = current_vector_size;
2280
2281 if (next_size < vector_sizes.length ()
2282 && known_eq (vector_sizes[next_size], autodetected_vector_size))
2283 next_size += 1;
2284
2285 if (fatal
2286 || next_size == vector_sizes.length ()
2287 || known_eq (current_vector_size, 0U))
2288 return opt_loop_vec_info::propagate_failure (res);
2289
2290 /* Try the next biggest vector size. */
2291 current_vector_size = vector_sizes[next_size++];
2292 if (dump_enabled_p ())
2293 {
2294 dump_printf_loc (MSG_NOTE, vect_location,
2295 "***** Re-trying analysis with "
2296 "vector size ");
2297 dump_dec (MSG_NOTE, current_vector_size);
2298 dump_printf (MSG_NOTE, "\n");
2299 }
2300 }
2301 }
2302
2303 /* Return true if there is an in-order reduction function for CODE, storing
2304 it in *REDUC_FN if so. */
2305
2306 static bool
2307 fold_left_reduction_fn (tree_code code, internal_fn *reduc_fn)
2308 {
2309 switch (code)
2310 {
2311 case PLUS_EXPR:
2312 *reduc_fn = IFN_FOLD_LEFT_PLUS;
2313 return true;
2314
2315 default:
2316 return false;
2317 }
2318 }
2319
2320 /* Function reduction_fn_for_scalar_code
2321
2322 Input:
2323 CODE - the tree_code of a reduction operation.
2324
2325 Output:
2326 REDUC_FN - the corresponding internal function to be used to reduce the
2327 vector of partial results into a single scalar result, or IFN_LAST
2328 if the operation is a supported reduction operation, but does not have
2329 such an internal function.
2330
2331 Return FALSE if CODE currently cannot be vectorized as reduction. */
2332
2333 static bool
2334 reduction_fn_for_scalar_code (enum tree_code code, internal_fn *reduc_fn)
2335 {
2336 switch (code)
2337 {
2338 case MAX_EXPR:
2339 *reduc_fn = IFN_REDUC_MAX;
2340 return true;
2341
2342 case MIN_EXPR:
2343 *reduc_fn = IFN_REDUC_MIN;
2344 return true;
2345
2346 case PLUS_EXPR:
2347 *reduc_fn = IFN_REDUC_PLUS;
2348 return true;
2349
2350 case BIT_AND_EXPR:
2351 *reduc_fn = IFN_REDUC_AND;
2352 return true;
2353
2354 case BIT_IOR_EXPR:
2355 *reduc_fn = IFN_REDUC_IOR;
2356 return true;
2357
2358 case BIT_XOR_EXPR:
2359 *reduc_fn = IFN_REDUC_XOR;
2360 return true;
2361
2362 case MULT_EXPR:
2363 case MINUS_EXPR:
2364 *reduc_fn = IFN_LAST;
2365 return true;
2366
2367 default:
2368 return false;
2369 }
2370 }
2371
2372 /* If there is a neutral value X such that SLP reduction NODE would not
2373 be affected by the introduction of additional X elements, return that X,
2374 otherwise return null. CODE is the code of the reduction. REDUC_CHAIN
2375 is true if the SLP statements perform a single reduction, false if each
2376 statement performs an independent reduction. */
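/* For example, padding a PLUS_EXPR reduction with zero elements, or a
   BIT_AND_EXPR reduction with all-ones elements, leaves the final result
   unchanged, so those are the neutral values returned below.  */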
2377
2378 static tree
2379 neutral_op_for_slp_reduction (slp_tree slp_node, tree_code code,
2380 bool reduc_chain)
2381 {
2382 vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
2383 stmt_vec_info stmt_vinfo = stmts[0];
2384 tree vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
2385 tree scalar_type = TREE_TYPE (vector_type);
2386 struct loop *loop = gimple_bb (stmt_vinfo->stmt)->loop_father;
2387 gcc_assert (loop);
2388
2389 switch (code)
2390 {
2391 case WIDEN_SUM_EXPR:
2392 case DOT_PROD_EXPR:
2393 case SAD_EXPR:
2394 case PLUS_EXPR:
2395 case MINUS_EXPR:
2396 case BIT_IOR_EXPR:
2397 case BIT_XOR_EXPR:
2398 return build_zero_cst (scalar_type);
2399
2400 case MULT_EXPR:
2401 return build_one_cst (scalar_type);
2402
2403 case BIT_AND_EXPR:
2404 return build_all_ones_cst (scalar_type);
2405
2406 case MAX_EXPR:
2407 case MIN_EXPR:
2408 /* For MIN/MAX the initial values are neutral. A reduction chain
2409 has only a single initial value, so that value is neutral for
2410 all statements. */
2411 if (reduc_chain)
2412 return PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt,
2413 loop_preheader_edge (loop));
2414 return NULL_TREE;
2415
2416 default:
2417 return NULL_TREE;
2418 }
2419 }
2420
2421 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
2422 STMT is printed with a message MSG. */
2423
2424 static void
2425 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
2426 {
2427 dump_printf_loc (msg_type, vect_location, "%s%G", msg, stmt);
2428 }
2429
2430 /* DEF_STMT_INFO occurs in a loop that contains a potential reduction
2431 operation. Return true if the results of DEF_STMT_INFO are something
2432 that can be accumulated by such a reduction. */
2433
2434 static bool
2435 vect_valid_reduction_input_p (stmt_vec_info def_stmt_info)
2436 {
2437 return (is_gimple_assign (def_stmt_info->stmt)
2438 || is_gimple_call (def_stmt_info->stmt)
2439 || STMT_VINFO_DEF_TYPE (def_stmt_info) == vect_induction_def
2440 || (gimple_code (def_stmt_info->stmt) == GIMPLE_PHI
2441 && STMT_VINFO_DEF_TYPE (def_stmt_info) == vect_internal_def
2442 && !is_loop_header_bb_p (gimple_bb (def_stmt_info->stmt))));
2443 }
2444
2445 /* Detect SLP reduction of the form:
2446
2447 #a1 = phi <a5, a0>
2448 a2 = operation (a1)
2449 a3 = operation (a2)
2450 a4 = operation (a3)
2451 a5 = operation (a4)
2452
2453 #a = phi <a5>
2454
2455 PHI is the reduction phi node (#a1 = phi <a5, a0> above)
2456 FIRST_STMT is the first reduction stmt in the chain
2457 (a2 = operation (a1)).
2458
2459 Return TRUE if a reduction chain was detected. */
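/* In source form such a chain typically comes from manually unrolled
   accumulation, e.g. (illustrative only):

     for (i = 0; i < n; i++)
       {
         sum += a[4*i];
         sum += a[4*i + 1];
         sum += a[4*i + 2];
         sum += a[4*i + 3];
       }  */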
2460
2461 static bool
2462 vect_is_slp_reduction (loop_vec_info loop_info, gimple *phi,
2463 gimple *first_stmt)
2464 {
2465 struct loop *loop = (gimple_bb (phi))->loop_father;
2466 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2467 enum tree_code code;
2468 gimple *loop_use_stmt = NULL;
2469 stmt_vec_info use_stmt_info;
2470 tree lhs;
2471 imm_use_iterator imm_iter;
2472 use_operand_p use_p;
2473 int nloop_uses, size = 0, n_out_of_loop_uses;
2474 bool found = false;
2475
2476 if (loop != vect_loop)
2477 return false;
2478
2479 auto_vec<stmt_vec_info, 8> reduc_chain;
2480 lhs = PHI_RESULT (phi);
2481 code = gimple_assign_rhs_code (first_stmt);
2482 while (1)
2483 {
2484 nloop_uses = 0;
2485 n_out_of_loop_uses = 0;
2486 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
2487 {
2488 gimple *use_stmt = USE_STMT (use_p);
2489 if (is_gimple_debug (use_stmt))
2490 continue;
2491
2492 /* Check if we got back to the reduction phi. */
2493 if (use_stmt == phi)
2494 {
2495 loop_use_stmt = use_stmt;
2496 found = true;
2497 break;
2498 }
2499
2500 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2501 {
2502 loop_use_stmt = use_stmt;
2503 nloop_uses++;
2504 }
2505 else
2506 n_out_of_loop_uses++;
2507
2508 /* There can be either a single use in the loop or two uses in
2509 phi nodes. */
2510 if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses))
2511 return false;
2512 }
2513
2514 if (found)
2515 break;
2516
2517 /* We reached a statement with no loop uses. */
2518 if (nloop_uses == 0)
2519 return false;
2520
2521 /* This is a loop exit phi, and we haven't reached the reduction phi. */
2522 if (gimple_code (loop_use_stmt) == GIMPLE_PHI)
2523 return false;
2524
2525 if (!is_gimple_assign (loop_use_stmt)
2526 || code != gimple_assign_rhs_code (loop_use_stmt)
2527 || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt)))
2528 return false;
2529
2530 /* Insert USE_STMT into reduction chain. */
2531 use_stmt_info = loop_info->lookup_stmt (loop_use_stmt);
2532 reduc_chain.safe_push (use_stmt_info);
2533
2534 lhs = gimple_assign_lhs (loop_use_stmt);
2535 size++;
2536 }
2537
2538 if (!found || loop_use_stmt != phi || size < 2)
2539 return false;
2540
2541 /* Swap the operands, if needed, so that the reduction operand is the
2542 second operand. */
2543 lhs = PHI_RESULT (phi);
2544 for (unsigned i = 0; i < reduc_chain.length (); ++i)
2545 {
2546 gassign *next_stmt = as_a <gassign *> (reduc_chain[i]->stmt);
2547 if (gimple_assign_rhs2 (next_stmt) == lhs)
2548 {
2549 tree op = gimple_assign_rhs1 (next_stmt);
2550 stmt_vec_info def_stmt_info = loop_info->lookup_def (op);
2551
2552 /* Check that the other def is either defined in the loop
2553 ("vect_internal_def"), or it's an induction (defined by a
2554 loop-header phi-node). */
2555 if (def_stmt_info
2556 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt))
2557 && vect_valid_reduction_input_p (def_stmt_info))
2558 {
2559 lhs = gimple_assign_lhs (next_stmt);
2560 continue;
2561 }
2562
2563 return false;
2564 }
2565 else
2566 {
2567 tree op = gimple_assign_rhs2 (next_stmt);
2568 stmt_vec_info def_stmt_info = loop_info->lookup_def (op);
2569
2570 /* Check that the other def is either defined in the loop
2571 ("vect_internal_def"), or it's an induction (defined by a
2572 loop-header phi-node). */
2573 if (def_stmt_info
2574 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt))
2575 && vect_valid_reduction_input_p (def_stmt_info))
2576 {
2577 if (dump_enabled_p ())
2578 dump_printf_loc (MSG_NOTE, vect_location, "swapping oprnds: %G",
2579 next_stmt);
2580
2581 swap_ssa_operands (next_stmt,
2582 gimple_assign_rhs1_ptr (next_stmt),
2583 gimple_assign_rhs2_ptr (next_stmt));
2584 update_stmt (next_stmt);
2585
2586 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt)))
2587 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2588 }
2589 else
2590 return false;
2591 }
2592
2593 lhs = gimple_assign_lhs (next_stmt);
2594 }
2595
2596 /* Build up the actual chain. */
2597 for (unsigned i = 0; i < reduc_chain.length () - 1; ++i)
2598 {
2599 REDUC_GROUP_FIRST_ELEMENT (reduc_chain[i]) = reduc_chain[0];
2600 REDUC_GROUP_NEXT_ELEMENT (reduc_chain[i]) = reduc_chain[i+1];
2601 }
2602 REDUC_GROUP_FIRST_ELEMENT (reduc_chain.last ()) = reduc_chain[0];
2603 REDUC_GROUP_NEXT_ELEMENT (reduc_chain.last ()) = NULL;
2604
2605 /* Save the chain for further analysis in SLP detection. */
2606 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (reduc_chain[0]);
2607 REDUC_GROUP_SIZE (reduc_chain[0]) = size;
2608
2609 return true;
2610 }
2611
2612 /* Return true if we need an in-order reduction for operation CODE
2613 on type TYPE. NEED_WRAPPING_INTEGRAL_OVERFLOW is true if integer
2614 overflow must wrap. */
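/* For example, a float accumulation "s += a[i]" must be evaluated as
   ((s + a[0]) + a[1]) + ... unless -fassociative-math allows
   reassociation, so without that flag an in-order (fold-left) reduction
   is required.  */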
2615
2616 static bool
2617 needs_fold_left_reduction_p (tree type, tree_code code,
2618 bool need_wrapping_integral_overflow)
2619 {
2620 /* CHECKME: check for !flag_finite_math_only too? */
2621 if (SCALAR_FLOAT_TYPE_P (type))
2622 switch (code)
2623 {
2624 case MIN_EXPR:
2625 case MAX_EXPR:
2626 return false;
2627
2628 default:
2629 return !flag_associative_math;
2630 }
2631
2632 if (INTEGRAL_TYPE_P (type))
2633 {
2634 if (!operation_no_trapping_overflow (type, code))
2635 return true;
2636 if (need_wrapping_integral_overflow
2637 && !TYPE_OVERFLOW_WRAPS (type)
2638 && operation_can_overflow (code))
2639 return true;
2640 return false;
2641 }
2642
2643 if (SAT_FIXED_POINT_TYPE_P (type))
2644 return true;
2645
2646 return false;
2647 }
2648
2649 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG and
2650 reduction operation CODE has a handled computation expression. */
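/* This walks the definitions feeding LOOP_ARG depth-first, exploring the
   operands of each defining statement with backtracking (the PATH stack)
   and a visited bitmap, until it either reaches the PHI result again or
   runs out of in-loop definitions.  */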
2651
2652 bool
2653 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
2654 tree loop_arg, enum tree_code code)
2655 {
2656 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
2657 auto_bitmap visited;
2658 tree lookfor = PHI_RESULT (phi);
2659 ssa_op_iter curri;
2660 use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
2661 while (USE_FROM_PTR (curr) != loop_arg)
2662 curr = op_iter_next_use (&curri);
2663 curri.i = curri.numops;
2664 do
2665 {
2666 path.safe_push (std::make_pair (curri, curr));
2667 tree use = USE_FROM_PTR (curr);
2668 if (use == lookfor)
2669 break;
2670 gimple *def = SSA_NAME_DEF_STMT (use);
2671 if (gimple_nop_p (def)
2672 || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
2673 {
2674 pop:
2675 do
2676 {
2677 std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
2678 curri = x.first;
2679 curr = x.second;
2680 do
2681 curr = op_iter_next_use (&curri);
2682 /* Skip already visited or non-SSA operands (from iterating
2683 over PHI args). */
2684 while (curr != NULL_USE_OPERAND_P
2685 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
2686 || ! bitmap_set_bit (visited,
2687 SSA_NAME_VERSION
2688 (USE_FROM_PTR (curr)))));
2689 }
2690 while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
2691 if (curr == NULL_USE_OPERAND_P)
2692 break;
2693 }
2694 else
2695 {
2696 if (gimple_code (def) == GIMPLE_PHI)
2697 curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
2698 else
2699 curr = op_iter_init_use (&curri, def, SSA_OP_USE);
2700 while (curr != NULL_USE_OPERAND_P
2701 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
2702 || ! bitmap_set_bit (visited,
2703 SSA_NAME_VERSION
2704 (USE_FROM_PTR (curr)))))
2705 curr = op_iter_next_use (&curri);
2706 if (curr == NULL_USE_OPERAND_P)
2707 goto pop;
2708 }
2709 }
2710 while (1);
2711 if (dump_file && (dump_flags & TDF_DETAILS))
2712 {
2713 dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
2714 unsigned i;
2715 std::pair<ssa_op_iter, use_operand_p> *x;
2716 FOR_EACH_VEC_ELT (path, i, x)
2717 dump_printf (MSG_NOTE, "%T ", USE_FROM_PTR (x->second));
2718 dump_printf (MSG_NOTE, "\n");
2719 }
2720
2721 /* Check whether the reduction path detected is valid. */
2722 bool fail = path.length () == 0;
2723 bool neg = false;
2724 for (unsigned i = 1; i < path.length (); ++i)
2725 {
2726 gimple *use_stmt = USE_STMT (path[i].second);
2727 tree op = USE_FROM_PTR (path[i].second);
2728 if (! has_single_use (op)
2729 || ! is_gimple_assign (use_stmt))
2730 {
2731 fail = true;
2732 break;
2733 }
2734 if (gimple_assign_rhs_code (use_stmt) != code)
2735 {
2736 if (code == PLUS_EXPR
2737 && gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
2738 {
2739 /* Track whether we negate the reduction value each iteration. */
2740 if (gimple_assign_rhs2 (use_stmt) == op)
2741 neg = ! neg;
2742 }
2743 else
2744 {
2745 fail = true;
2746 break;
2747 }
2748 }
2749 }
2750 return ! fail && ! neg;
2751 }
2752
2753
2754 /* Function vect_is_simple_reduction
2755
2756 (1) Detect a cross-iteration def-use cycle that represents a simple
2757 reduction computation. We look for the following pattern:
2758
2759 loop_header:
2760 a1 = phi < a0, a2 >
2761 a3 = ...
2762 a2 = operation (a3, a1)
2763
2764 or
2765
2766 a3 = ...
2767 loop_header:
2768 a1 = phi < a0, a2 >
2769 a2 = operation (a3, a1)
2770
2771 such that:
2772 1. operation is commutative and associative and it is safe to
2773 change the order of the computation
2774 2. no uses for a2 in the loop (a2 is used out of the loop)
2775 3. no uses of a1 in the loop besides the reduction operation
2776 4. no uses of a1 outside the loop.
2777
2778 Conditions 1,4 are tested here.
2779 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
2780
2781 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
2782 nested cycles.
2783
2784 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
2785 reductions:
2786
2787 a1 = phi < a0, a2 >
2788 inner loop (def of a3)
2789 a2 = phi < a3 >
2790
2791 (4) Detect condition expressions, i.e.:
2792 for (int i = 0; i < N; i++)
2793 if (a[i] < val)
2794 ret_val = a[i];
2795
2796 */
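/* A typical source form of case (1) is, e.g.:

     for (i = 0; i < N; i++)
       sum += a[i];  */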
2797
2798 static stmt_vec_info
2799 vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
2800 bool *double_reduc,
2801 bool need_wrapping_integral_overflow,
2802 enum vect_reduction_type *v_reduc_type)
2803 {
2804 gphi *phi = as_a <gphi *> (phi_info->stmt);
2805 struct loop *loop = (gimple_bb (phi))->loop_father;
2806 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2807 bool nested_in_vect_loop = flow_loop_nested_p (vect_loop, loop);
2808 gimple *phi_use_stmt = NULL;
2809 enum tree_code orig_code, code;
2810 tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
2811 tree type;
2812 tree name;
2813 imm_use_iterator imm_iter;
2814 use_operand_p use_p;
2815 bool phi_def;
2816
2817 *double_reduc = false;
2818 *v_reduc_type = TREE_CODE_REDUCTION;
2819
2820 tree phi_name = PHI_RESULT (phi);
2821 /* ??? If there are no uses of the PHI result the inner loop reduction
2822 won't be detected as possibly double-reduction by vectorizable_reduction
2823 because that tries to walk the PHI arg from the preheader edge which
2824 can be constant. See PR60382. */
2825 if (has_zero_uses (phi_name))
2826 return NULL;
2827 unsigned nphi_def_loop_uses = 0;
2828 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
2829 {
2830 gimple *use_stmt = USE_STMT (use_p);
2831 if (is_gimple_debug (use_stmt))
2832 continue;
2833
2834 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2835 {
2836 if (dump_enabled_p ())
2837 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2838 "intermediate value used outside loop.\n");
2839
2840 return NULL;
2841 }
2842
2843 nphi_def_loop_uses++;
2844 phi_use_stmt = use_stmt;
2845 }
2846
2847 edge latch_e = loop_latch_edge (loop);
2848 tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
2849 if (TREE_CODE (loop_arg) != SSA_NAME)
2850 {
2851 if (dump_enabled_p ())
2852 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2853 "reduction: not ssa_name: %T\n", loop_arg);
2854 return NULL;
2855 }
2856
2857 stmt_vec_info def_stmt_info = loop_info->lookup_def (loop_arg);
2858 if (!def_stmt_info
2859 || !flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt)))
2860 return NULL;
2861
2862 if (gassign *def_stmt = dyn_cast <gassign *> (def_stmt_info->stmt))
2863 {
2864 name = gimple_assign_lhs (def_stmt);
2865 phi_def = false;
2866 }
2867 else if (gphi *def_stmt = dyn_cast <gphi *> (def_stmt_info->stmt))
2868 {
2869 name = PHI_RESULT (def_stmt);
2870 phi_def = true;
2871 }
2872 else
2873 {
2874 if (dump_enabled_p ())
2875 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2876 "reduction: unhandled reduction operation: %G",
2877 def_stmt_info->stmt);
2878 return NULL;
2879 }
2880
2881 unsigned nlatch_def_loop_uses = 0;
2882 auto_vec<gphi *, 3> lcphis;
2883 bool inner_loop_of_double_reduc = false;
2884 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2885 {
2886 gimple *use_stmt = USE_STMT (use_p);
2887 if (is_gimple_debug (use_stmt))
2888 continue;
2889 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2890 nlatch_def_loop_uses++;
2891 else
2892 {
2893 /* We can have more than one loop-closed PHI. */
2894 lcphis.safe_push (as_a <gphi *> (use_stmt));
2895 if (nested_in_vect_loop
2896 && (STMT_VINFO_DEF_TYPE (loop_info->lookup_stmt (use_stmt))
2897 == vect_double_reduction_def))
2898 inner_loop_of_double_reduc = true;
2899 }
2900 }
2901
2902 /* If this isn't a nested cycle or if the nested cycle reduction value
2903 is used outside of the inner loop, we cannot handle uses of the reduction
2904 value. */
2905 if ((!nested_in_vect_loop || inner_loop_of_double_reduc)
2906 && (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1))
2907 {
2908 if (dump_enabled_p ())
2909 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2910 "reduction used in loop.\n");
2911 return NULL;
2912 }
2913
2914 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
2915 defined in the inner loop. */
2916 if (phi_def)
2917 {
2918 gphi *def_stmt = as_a <gphi *> (def_stmt_info->stmt);
2919 op1 = PHI_ARG_DEF (def_stmt, 0);
2920
2921 if (gimple_phi_num_args (def_stmt) != 1
2922 || TREE_CODE (op1) != SSA_NAME)
2923 {
2924 if (dump_enabled_p ())
2925 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2926 "unsupported phi node definition.\n");
2927
2928 return NULL;
2929 }
2930
2931 gimple *def1 = SSA_NAME_DEF_STMT (op1);
2932 if (gimple_bb (def1)
2933 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2934 && loop->inner
2935 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
2936 && is_gimple_assign (def1)
2937 && is_a <gphi *> (phi_use_stmt)
2938 && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt)))
2939 {
2940 if (dump_enabled_p ())
2941 report_vect_op (MSG_NOTE, def_stmt,
2942 "detected double reduction: ");
2943
2944 *double_reduc = true;
2945 return def_stmt_info;
2946 }
2947
2948 return NULL;
2949 }
2950
2951 /* If we are vectorizing an inner reduction, we execute it
2952 in the original order only if we are not dealing with a
2953 double reduction. */
2954 bool check_reduction = true;
2955 if (flow_loop_nested_p (vect_loop, loop))
2956 {
2957 gphi *lcphi;
2958 unsigned i;
2959 check_reduction = false;
2960 FOR_EACH_VEC_ELT (lcphis, i, lcphi)
2961 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, gimple_phi_result (lcphi))
2962 {
2963 gimple *use_stmt = USE_STMT (use_p);
2964 if (is_gimple_debug (use_stmt))
2965 continue;
2966 if (! flow_bb_inside_loop_p (vect_loop, gimple_bb (use_stmt)))
2967 check_reduction = true;
2968 }
2969 }
2970
2971 gassign *def_stmt = as_a <gassign *> (def_stmt_info->stmt);
2972 code = orig_code = gimple_assign_rhs_code (def_stmt);
2973
2974 if (nested_in_vect_loop && !check_reduction)
2975 {
2976 /* FIXME: Even for non-reductions code generation is funneled
2977 through vectorizable_reduction for the stmt defining the
2978 PHI latch value. So we have to artificially restrict ourselves
2979 for the supported operations. */
2980 switch (get_gimple_rhs_class (code))
2981 {
2982 case GIMPLE_BINARY_RHS:
2983 case GIMPLE_TERNARY_RHS:
2984 break;
2985 default:
2986 /* Not supported by vectorizable_reduction. */
2987 if (dump_enabled_p ())
2988 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2989 "nested cycle: not handled operation: ");
2990 return NULL;
2991 }
2992 if (dump_enabled_p ())
2993 report_vect_op (MSG_NOTE, def_stmt, "detected nested cycle: ");
2994 return def_stmt_info;
2995 }
2996
2997 /* We can handle "res -= x[i]", which is non-associative by
2998 simply rewriting this into "res += -x[i]". Avoid changing
2999 gimple instruction for the first simple tests and only do this
3000 if we're allowed to change code at all. */
3001 if (code == MINUS_EXPR && gimple_assign_rhs2 (def_stmt) != phi_name)
3002 code = PLUS_EXPR;
3003
3004 if (code == COND_EXPR)
3005 {
3006 if (! nested_in_vect_loop)
3007 *v_reduc_type = COND_REDUCTION;
3008
3009 op3 = gimple_assign_rhs1 (def_stmt);
3010 if (COMPARISON_CLASS_P (op3))
3011 {
3012 op4 = TREE_OPERAND (op3, 1);
3013 op3 = TREE_OPERAND (op3, 0);
3014 }
3015 if (op3 == phi_name || op4 == phi_name)
3016 {
3017 if (dump_enabled_p ())
3018 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3019 "reduction: condition depends on previous"
3020 " iteration: ");
3021 return NULL;
3022 }
3023
3024 op1 = gimple_assign_rhs2 (def_stmt);
3025 op2 = gimple_assign_rhs3 (def_stmt);
3026 }
3027 else if (!commutative_tree_code (code) || !associative_tree_code (code))
3028 {
3029 if (dump_enabled_p ())
3030 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3031 "reduction: not commutative/associative: ");
3032 return NULL;
3033 }
3034 else if (get_gimple_rhs_class (code) == GIMPLE_BINARY_RHS)
3035 {
3036 op1 = gimple_assign_rhs1 (def_stmt);
3037 op2 = gimple_assign_rhs2 (def_stmt);
3038 }
3039 else
3040 {
3041 if (dump_enabled_p ())
3042 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3043 "reduction: not handled operation: ");
3044 return NULL;
3045 }
3046
3047 if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
3048 {
3049 if (dump_enabled_p ())
3050 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3051 "reduction: both uses not ssa_names: ");
3052
3053 return NULL;
3054 }
3055
3056 type = TREE_TYPE (gimple_assign_lhs (def_stmt));
3057 if ((TREE_CODE (op1) == SSA_NAME
3058 && !types_compatible_p (type,TREE_TYPE (op1)))
3059 || (TREE_CODE (op2) == SSA_NAME
3060 && !types_compatible_p (type, TREE_TYPE (op2)))
3061 || (op3 && TREE_CODE (op3) == SSA_NAME
3062 && !types_compatible_p (type, TREE_TYPE (op3)))
3063 || (op4 && TREE_CODE (op4) == SSA_NAME
3064 && !types_compatible_p (type, TREE_TYPE (op4))))
3065 {
3066 if (dump_enabled_p ())
3067 {
3068 dump_printf_loc (MSG_NOTE, vect_location,
3069 "reduction: multiple types: operation type: "
3070 "%T, operands types: %T,%T",
3071 type, TREE_TYPE (op1), TREE_TYPE (op2));
3072 if (op3)
3073 dump_printf (MSG_NOTE, ",%T", TREE_TYPE (op3));
3074
3075 if (op4)
3076 dump_printf (MSG_NOTE, ",%T", TREE_TYPE (op4));
3077 dump_printf (MSG_NOTE, "\n");
3078 }
3079
3080 return NULL;
3081 }
3082
3083 /* Check whether it's ok to change the order of the computation.
3084 Generally, when vectorizing a reduction we change the order of the
3085 computation. This may change the behavior of the program in some
3086 cases, so we need to check that this is ok. One exception is when
3087 vectorizing an outer-loop: the inner-loop is executed sequentially,
3088 and therefore vectorizing reductions in the inner-loop during
3089 outer-loop vectorization is safe. */
3090 if (check_reduction
3091 && *v_reduc_type == TREE_CODE_REDUCTION
3092 && needs_fold_left_reduction_p (type, code,
3093 need_wrapping_integral_overflow))
3094 *v_reduc_type = FOLD_LEFT_REDUCTION;
3095
3096 /* Reduction is safe. We're dealing with one of the following:
3097 1) integer arithmetic and no trapv
3098 2) floating point arithmetic, and special flags permit this optimization
3099 3) nested cycle (i.e., outer loop vectorization). */
3100 stmt_vec_info def1_info = loop_info->lookup_def (op1);
3101 stmt_vec_info def2_info = loop_info->lookup_def (op2);
3102 if (code != COND_EXPR && !def1_info && !def2_info)
3103 {
3104 if (dump_enabled_p ())
3105 report_vect_op (MSG_NOTE, def_stmt, "reduction: no defs for operands: ");
3106 return NULL;
3107 }
3108
3109 /* Check that one def is the reduction def, defined by PHI,
3110 the other def is either defined in the loop ("vect_internal_def"),
3111 or it's an induction (defined by a loop-header phi-node). */
3112
3113 if (def2_info
3114 && def2_info->stmt == phi
3115 && (code == COND_EXPR
3116 || !def1_info
3117 || !flow_bb_inside_loop_p (loop, gimple_bb (def1_info->stmt))
3118 || vect_valid_reduction_input_p (def1_info)))
3119 {
3120 if (dump_enabled_p ())
3121 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3122 return def_stmt_info;
3123 }
3124
3125 if (def1_info
3126 && def1_info->stmt == phi
3127 && (code == COND_EXPR
3128 || !def2_info
3129 || !flow_bb_inside_loop_p (loop, gimple_bb (def2_info->stmt))
3130 || vect_valid_reduction_input_p (def2_info)))
3131 {
3132 if (! nested_in_vect_loop && orig_code != MINUS_EXPR)
3133 {
3134 /* Check if we can swap operands (just for simplicity - so that
3135 the rest of the code can assume that the reduction variable
3136 is always the last (second) argument). */
3137 if (code == COND_EXPR)
3138 {
3139 /* Swap cond_expr by inverting the condition. */
3140 tree cond_expr = gimple_assign_rhs1 (def_stmt);
3141 enum tree_code invert_code = ERROR_MARK;
3142 enum tree_code cond_code = TREE_CODE (cond_expr);
3143
3144 if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
3145 {
3146 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
3147 invert_code = invert_tree_comparison (cond_code, honor_nans);
3148 }
3149 if (invert_code != ERROR_MARK)
3150 {
3151 TREE_SET_CODE (cond_expr, invert_code);
3152 swap_ssa_operands (def_stmt,
3153 gimple_assign_rhs2_ptr (def_stmt),
3154 gimple_assign_rhs3_ptr (def_stmt));
3155 }
3156 else
3157 {
3158 if (dump_enabled_p ())
3159 report_vect_op (MSG_NOTE, def_stmt,
3160 "detected reduction: cannot swap operands "
3161 "for cond_expr");
3162 return NULL;
3163 }
3164 }
3165 else
3166 swap_ssa_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
3167 gimple_assign_rhs2_ptr (def_stmt));
3168
3169 if (dump_enabled_p ())
3170 report_vect_op (MSG_NOTE, def_stmt,
3171 "detected reduction: need to swap operands: ");
3172
3173 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt)))
3174 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
3175 }
3176 else
3177 {
3178 if (dump_enabled_p ())
3179 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3180 }
3181
3182 return def_stmt_info;
3183 }
3184
3185 /* Try to find SLP reduction chain. */
3186 if (! nested_in_vect_loop
3187 && code != COND_EXPR
3188 && orig_code != MINUS_EXPR
3189 && vect_is_slp_reduction (loop_info, phi, def_stmt))
3190 {
3191 if (dump_enabled_p ())
3192 report_vect_op (MSG_NOTE, def_stmt,
3193 "reduction: detected reduction chain: ");
3194
3195 return def_stmt_info;
3196 }
3197
3198 /* Look for the expression computing loop_arg from loop PHI result. */
3199 if (check_reduction_path (vect_location, loop, phi, loop_arg, code))
3200 return def_stmt_info;
3201
3202 if (dump_enabled_p ())
3203 {
3204 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3205 "reduction: unknown pattern: ");
3206 }
3207
3208 return NULL;
3209 }
3210
3211 /* Wrapper around vect_is_simple_reduction, which will modify code
3212 in-place if it enables detection of more reductions. The arguments
3213 are the same as for vect_is_simple_reduction. */
3214
3215 stmt_vec_info
3216 vect_force_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
3217 bool *double_reduc,
3218 bool need_wrapping_integral_overflow)
3219 {
3220 enum vect_reduction_type v_reduc_type;
3221 stmt_vec_info def_info
3222 = vect_is_simple_reduction (loop_info, phi_info, double_reduc,
3223 need_wrapping_integral_overflow,
3224 &v_reduc_type);
3225 if (def_info)
3226 {
3227 STMT_VINFO_REDUC_TYPE (phi_info) = v_reduc_type;
3228 STMT_VINFO_REDUC_DEF (phi_info) = def_info;
3229 STMT_VINFO_REDUC_TYPE (def_info) = v_reduc_type;
3230 STMT_VINFO_REDUC_DEF (def_info) = phi_info;
3231 }
3232 return def_info;
3233 }
3234
3235 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
3236 int
3237 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
3238 int *peel_iters_epilogue,
3239 stmt_vector_for_cost *scalar_cost_vec,
3240 stmt_vector_for_cost *prologue_cost_vec,
3241 stmt_vector_for_cost *epilogue_cost_vec)
3242 {
3243 int retval = 0;
3244 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3245
3246 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
3247 {
3248 *peel_iters_epilogue = assumed_vf / 2;
3249 if (dump_enabled_p ())
3250 dump_printf_loc (MSG_NOTE, vect_location,
3251 "cost model: epilogue peel iters set to vf/2 "
3252 "because loop iterations are unknown .\n");
3253
3254 /* If peeled iterations are known but the number of scalar loop
3255 iterations is unknown, count a taken branch per peeled loop. */
3256 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3257 NULL, 0, vect_prologue);
3258 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3259 NULL, 0, vect_epilogue);
3260 }
3261 else
3262 {
3263 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
3264 peel_iters_prologue = niters < peel_iters_prologue ?
3265 niters : peel_iters_prologue;
3266 *peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
3267 /* If we need to peel for gaps, but no peeling is required, we have to
3268 peel VF iterations. */
3269 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
3270 *peel_iters_epilogue = assumed_vf;
3271 }
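/* For example, with NITERS = 100, PEEL_ITERS_PROLOGUE = 3 and an assumed
   VF of 8, the code above sets *PEEL_ITERS_EPILOGUE to (100 - 3) % 8 = 1.  */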
3272
3273 stmt_info_for_cost *si;
3274 int j;
3275 if (peel_iters_prologue)
3276 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3277 retval += record_stmt_cost (prologue_cost_vec,
3278 si->count * peel_iters_prologue,
3279 si->kind, si->stmt_info, si->misalign,
3280 vect_prologue);
3281 if (*peel_iters_epilogue)
3282 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3283 retval += record_stmt_cost (epilogue_cost_vec,
3284 si->count * *peel_iters_epilogue,
3285 si->kind, si->stmt_info, si->misalign,
3286 vect_epilogue);
3287
3288 return retval;
3289 }
3290
3291 /* Function vect_estimate_min_profitable_iters
3292
3293 Return the number of iterations required for the vector version of the
3294 loop to be profitable relative to the cost of the scalar version of the
3295 loop.
3296
3297 *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
3298 of iterations for vectorization. A value of -1 means loop vectorization
3299 is not profitable. This returned value may be used for a dynamic
3300 profitability check.
3301
3302 *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
3303 for static check against estimated number of iterations. */
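/* Roughly, the returned threshold is the smallest NITERS for which

     SIC * NITERS + SOC  >  VIC * ((NITERS - NPEEL) / VF) + VOC

   where SIC is the scalar iteration cost, VIC the vector iteration cost,
   VOC the vector outside (prologue/epilogue) cost, SOC the scalar cost of
   the run-time guards, and NPEEL the number of peeled iterations.  */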
3304
3305 static void
3306 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
3307 int *ret_min_profitable_niters,
3308 int *ret_min_profitable_estimate)
3309 {
3310 int min_profitable_iters;
3311 int min_profitable_estimate;
3312 int peel_iters_prologue;
3313 int peel_iters_epilogue;
3314 unsigned vec_inside_cost = 0;
3315 int vec_outside_cost = 0;
3316 unsigned vec_prologue_cost = 0;
3317 unsigned vec_epilogue_cost = 0;
3318 int scalar_single_iter_cost = 0;
3319 int scalar_outside_cost = 0;
3320 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3321 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3322 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3323
3324 /* Cost model disabled. */
3325 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
3326 {
3327 if (dump_enabled_p ())
3328 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
3329 *ret_min_profitable_niters = 0;
3330 *ret_min_profitable_estimate = 0;
3331 return;
3332 }
3333
3334 /* Requires loop versioning tests to handle misalignment. */
3335 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
3336 {
3337 /* FIXME: Make cost depend on complexity of individual check. */
3338 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
3339 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3340 vect_prologue);
3341 if (dump_enabled_p ())
3342 dump_printf (MSG_NOTE,
3343 "cost model: Adding cost of checks for loop "
3344 "versioning to treat misalignment.\n");
3345 }
3346
3347 /* Requires loop versioning with alias checks. */
3348 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3349 {
3350 /* FIXME: Make cost depend on complexity of individual check. */
3351 unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
3352 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3353 vect_prologue);
3354 len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
3355 if (len)
3356 /* Count LEN - 1 ANDs and LEN comparisons. */
3357 (void) add_stmt_cost (target_cost_data, len * 2 - 1, scalar_stmt,
3358 NULL, 0, vect_prologue);
3359 len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
3360 if (len)
3361 {
3362 /* Count LEN - 1 ANDs and LEN comparisons. */
3363 unsigned int nstmts = len * 2 - 1;
3364 /* +1 for each bias that needs adding. */
3365 for (unsigned int i = 0; i < len; ++i)
3366 if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
3367 nstmts += 1;
3368 (void) add_stmt_cost (target_cost_data, nstmts, scalar_stmt,
3369 NULL, 0, vect_prologue);
3370 }
3371 if (dump_enabled_p ())
3372 dump_printf (MSG_NOTE,
3373 "cost model: Adding cost of checks for loop "
3374 "versioning aliasing.\n");
3375 }
3376
3377 /* Requires loop versioning with niter checks. */
3378 if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
3379 {
3380 /* FIXME: Make cost depend on complexity of individual check. */
3381 (void) add_stmt_cost (target_cost_data, 1, vector_stmt, NULL, 0,
3382 vect_prologue);
3383 if (dump_enabled_p ())
3384 dump_printf (MSG_NOTE,
3385 "cost model: Adding cost of checks for loop "
3386 "versioning niters.\n");
3387 }
3388
3389 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3390 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0,
3391 vect_prologue);
3392
3393 /* Count statements in scalar loop. Using this as scalar cost for a single
3394 iteration for now.
3395
3396 TODO: Add outer loop support.
3397
3398 TODO: Consider assigning different costs to different scalar
3399 statements. */
3400
3401 scalar_single_iter_cost
3402 = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo);
3403
3404 /* Add additional cost for the peeled instructions in prologue and epilogue
3405 loop. (For fully-masked loops there will be no peeling.)
3406
3407 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
3408 at compile-time - we assume it's vf/2 (the worst would be vf-1).
3409
3410 TODO: Build an expression that represents peel_iters for prologue and
3411 epilogue to be used in a run-time test. */
3412
3413 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3414 {
3415 peel_iters_prologue = 0;
3416 peel_iters_epilogue = 0;
3417
3418 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
3419 {
3420 /* We need to peel exactly one iteration. */
3421 peel_iters_epilogue += 1;
3422 stmt_info_for_cost *si;
3423 int j;
3424 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
3425 j, si)
3426 (void) add_stmt_cost (target_cost_data, si->count,
3427 si->kind, si->stmt_info, si->misalign,
3428 vect_epilogue);
3429 }
3430 }
3431 else if (npeel < 0)
3432 {
3433 peel_iters_prologue = assumed_vf / 2;
3434 if (dump_enabled_p ())
3435 dump_printf (MSG_NOTE, "cost model: "
3436 "prologue peel iters set to vf/2.\n");
3437
3438 /* If peeling for alignment is unknown, the loop bound of the main loop
3439 becomes unknown. */
3440 peel_iters_epilogue = assumed_vf / 2;
3441 if (dump_enabled_p ())
3442 dump_printf (MSG_NOTE, "cost model: "
3443 "epilogue peel iters set to vf/2 because "
3444 "peeling for alignment is unknown.\n");
3445
3446 /* If peeled iterations are unknown, count a taken branch and a not taken
3447 branch per peeled loop. Even if scalar loop iterations are known,
3448 vector iterations are not known since peeled prologue iterations are
3449 not known. Hence guards remain the same. */
3450 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3451 NULL, 0, vect_prologue);
3452 (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3453 NULL, 0, vect_prologue);
3454 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3455 NULL, 0, vect_epilogue);
3456 (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3457 NULL, 0, vect_epilogue);
3458 stmt_info_for_cost *si;
3459 int j;
3460 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
3461 {
3462 (void) add_stmt_cost (target_cost_data,
3463 si->count * peel_iters_prologue,
3464 si->kind, si->stmt_info, si->misalign,
3465 vect_prologue);
3466 (void) add_stmt_cost (target_cost_data,
3467 si->count * peel_iters_epilogue,
3468 si->kind, si->stmt_info, si->misalign,
3469 vect_epilogue);
3470 }
3471 }
3472 else
3473 {
3474 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
3475 stmt_info_for_cost *si;
3476 int j;
3477 void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3478
3479 prologue_cost_vec.create (2);
3480 epilogue_cost_vec.create (2);
3481 peel_iters_prologue = npeel;
3482
3483 (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
3484 &peel_iters_epilogue,
3485 &LOOP_VINFO_SCALAR_ITERATION_COST
3486 (loop_vinfo),
3487 &prologue_cost_vec,
3488 &epilogue_cost_vec);
3489
3490 FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
3491 (void) add_stmt_cost (data, si->count, si->kind, si->stmt_info,
3492 si->misalign, vect_prologue);
3493
3494 FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
3495 (void) add_stmt_cost (data, si->count, si->kind, si->stmt_info,
3496 si->misalign, vect_epilogue);
3497
3498 prologue_cost_vec.release ();
3499 epilogue_cost_vec.release ();
3500 }
3501
3502 /* FORNOW: The scalar outside cost is incremented in one of the
3503 following ways:
3504
3505 1. The vectorizer checks for alignment and aliasing and generates
3506 a condition that allows dynamic vectorization. A cost model
3507 check is ANDED with the versioning condition. Hence scalar code
3508 path now has the added cost of the versioning check.
3509
3510 if (cost > th & versioning_check)
3511 jmp to vector code
3512
3513 Hence run-time scalar is incremented by not-taken branch cost.
3514
3515 2. The vectorizer then checks if a prologue is required. If the
3516 cost model check was not done before during versioning, it has to
3517 be done before the prologue check.
3518
3519 if (cost <= th)
3520 prologue = scalar_iters
3521 if (prologue == 0)
3522 jmp to vector code
3523 else
3524 execute prologue
3525 if (prologue == num_iters)
3526 go to exit
3527
3528 Hence the run-time scalar cost is incremented by a taken branch,
3529 plus a not-taken branch, plus a taken branch cost.
3530
3531 3. The vectorizer then checks if an epilogue is required. If the
3532 cost model check was not done before during prologue check, it
3533 has to be done with the epilogue check.
3534
3535 if (prologue == 0)
3536 jmp to vector code
3537 else
3538 execute prologue
3539 if (prologue == num_iters)
3540 go to exit
3541 vector code:
3542 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
3543 jmp to epilogue
3544
3545 Hence the run-time scalar cost should be incremented by 2 taken
3546 branches.
3547
3548 TODO: The back end may reorder the BBs differently and reverse
3549 conditions/branch directions. Change the estimates below to
3550 something more reasonable. */
3551
3552 /* If the number of iterations is known and we do not do versioning, we can
3553 decide whether to vectorize at compile time. Hence the scalar version
3554 does not carry cost model guard costs. */
3555 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3556 || LOOP_REQUIRES_VERSIONING (loop_vinfo))
3557 {
3558 /* Cost model check occurs at versioning. */
3559 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3560 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
3561 else
3562 {
3563 /* Cost model check occurs at prologue generation. */
3564 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3565 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
3566 + vect_get_stmt_cost (cond_branch_not_taken);
3567 /* Cost model check occurs at epilogue generation. */
3568 else
3569 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
3570 }
3571 }
3572
3573 /* Complete the target-specific cost calculations. */
3574 finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
3575 &vec_inside_cost, &vec_epilogue_cost);
3576
3577 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
3578
3579 if (dump_enabled_p ())
3580 {
3581 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3582 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
3583 vec_inside_cost);
3584 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
3585 vec_prologue_cost);
3586 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
3587 vec_epilogue_cost);
3588 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
3589 scalar_single_iter_cost);
3590 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
3591 scalar_outside_cost);
3592 dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
3593 vec_outside_cost);
3594 dump_printf (MSG_NOTE, " prologue iterations: %d\n",
3595 peel_iters_prologue);
3596 dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
3597 peel_iters_epilogue);
3598 }
3599
3600 /* Calculate number of iterations required to make the vector version
3601 profitable, relative to the loop bodies only. The following condition
3602 must hold true:
3603 SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
3604 where
3605 SIC = scalar iteration cost, VIC = vector iteration cost,
3606 VOC = vector outside cost, VF = vectorization factor,
3607 PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
3608 SOC = scalar outside cost for run time cost model check. */
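/* Worked example with assumed costs (not from any particular target):
   SIC = 4, VIC = 6, VF = 4, VOC = 20, SOC = 6, no peeling.  The code
   below computes ((20 - 6) * 4) / (4 * 4 - 6) = 56 / 10 = 5, and the
   subsequent <= check rounds this up to 6, so the loop must run for at
   least 6 scalar iterations before the vector version is cheaper.  */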
3609
3610 if ((scalar_single_iter_cost * assumed_vf) > (int) vec_inside_cost)
3611 {
3612 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
3613 * assumed_vf
3614 - vec_inside_cost * peel_iters_prologue
3615 - vec_inside_cost * peel_iters_epilogue);
3616 if (min_profitable_iters <= 0)
3617 min_profitable_iters = 0;
3618 else
3619 {
3620 min_profitable_iters /= ((scalar_single_iter_cost * assumed_vf)
3621 - vec_inside_cost);
3622
3623 if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
3624 <= (((int) vec_inside_cost * min_profitable_iters)
3625 + (((int) vec_outside_cost - scalar_outside_cost)
3626 * assumed_vf)))
3627 min_profitable_iters++;
3628 }
3629 }
3630 /* vector version will never be profitable. */
3631 else
3632 {
3633 if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
3634 warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd,
3635 "vectorization did not happen for a simd loop");
3636
3637 if (dump_enabled_p ())
3638 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3639 "cost model: the vector iteration cost = %d "
3640 "divided by the scalar iteration cost = %d "
3641 "is greater or equal to the vectorization factor = %d"
3642 ".\n",
3643 vec_inside_cost, scalar_single_iter_cost, assumed_vf);
3644 *ret_min_profitable_niters = -1;
3645 *ret_min_profitable_estimate = -1;
3646 return;
3647 }
3648
3649 if (dump_enabled_p ())
3650 dump_printf (MSG_NOTE,
3651 " Calculated minimum iters for profitability: %d\n",
3652 min_profitable_iters);
3653
3654 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
3655 && min_profitable_iters < (assumed_vf + peel_iters_prologue))
3656 /* We want the vectorized loop to execute at least once. */
3657 min_profitable_iters = assumed_vf + peel_iters_prologue;
3658
3659 if (dump_enabled_p ())
3660 dump_printf_loc (MSG_NOTE, vect_location,
3661 " Runtime profitability threshold = %d\n",
3662 min_profitable_iters);
3663
3664 *ret_min_profitable_niters = min_profitable_iters;
3665
3666 /* Calculate number of iterations required to make the vector version
3667 profitable, relative to the loop bodies only.
3668
3669 Non-vectorized variant is SIC * niters and it must win over vector
3670 variant on the expected loop trip count. The following condition must hold true:
3671 SIC * niters > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC + SOC */
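/* Continuing the assumed figures from the example above, the static
   estimate below is ((20 + 6) * 4) / (4 * 4 - 6) = 104 / 10 = 10,
   which then survives the MAX against the runtime threshold of 6.  */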
3672
3673 if (vec_outside_cost <= 0)
3674 min_profitable_estimate = 0;
3675 else
3676 {
3677 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
3678 * assumed_vf
3679 - vec_inside_cost * peel_iters_prologue
3680 - vec_inside_cost * peel_iters_epilogue)
3681 / ((scalar_single_iter_cost * assumed_vf)
3682 - vec_inside_cost);
3683 }
3684 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
3685 if (dump_enabled_p ())
3686 dump_printf_loc (MSG_NOTE, vect_location,
3687 " Static estimate profitability threshold = %d\n",
3688 min_profitable_estimate);
3689
3690 *ret_min_profitable_estimate = min_profitable_estimate;
3691 }
3692
3693 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
3694 vector elements (not bits) for a vector with NELT elements. */
3695 static void
3696 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
3697 vec_perm_builder *sel)
3698 {
3699 /* The encoding is a single stepped pattern. Any wrap-around is handled
3700 by vec_perm_indices. */
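/* For example (illustrative), with OFFSET == 2 and NELT == 8 the three
   encoded elements are {2, 3, 4}; vec_perm_indices extends this stepped
   pattern to {2, 3, ..., 9}, i.e. the last six lanes of the first input
   followed by the first two lanes of the second input -- a whole-vector
   shift down by two elements.  */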
3701 sel->new_vector (nelt, 1, 3);
3702 for (unsigned int i = 0; i < 3; i++)
3703 sel->quick_push (i + offset);
3704 }
3705
3706 /* Checks whether the target supports whole-vector shifts for vectors of mode
3707 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
3708 it supports vec_perm_const with masks for all necessary shift amounts. */
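/* For instance (illustrative), for a constant-length 8-element vector
   the loop below checks permutation masks for shifts by 4, 2 and 1
   elements, halving each time.  */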
3709 static bool
3710 have_whole_vector_shift (machine_mode mode)
3711 {
3712 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3713 return true;
3714
3715 /* Variable-length vectors should be handled via the optab. */
3716 unsigned int nelt;
3717 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
3718 return false;
3719
3720 vec_perm_builder sel;
3721 vec_perm_indices indices;
3722 for (unsigned int i = nelt / 2; i >= 1; i /= 2)
3723 {
3724 calc_vec_perm_mask_for_shift (i, nelt, &sel);
3725 indices.new_vector (sel, 2, nelt);
3726 if (!can_vec_perm_const_p (mode, indices, false))
3727 return false;
3728 }
3729 return true;
3730 }
3731
3732 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
3733 functions. Design better to avoid maintenance issues. */
3734
3735 /* Function vect_model_reduction_cost.
3736
3737 Models cost for a reduction operation, including the vector ops
3738 generated within the strip-mine loop, the initial definition before
3739 the loop, and the epilogue code that must be generated. */
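/* As a hedged illustration (assuming the target supports the reduction
   internal function): a plain sum reduction with ncopies == 1 is costed
   as one scalar_to_vec in the prologue (building the initial vector),
   one vector_stmt in the body (the vector add), and one vector_stmt
   plus one vec_to_scalar in the epilogue (the reduction and the extract
   of the scalar result).  */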
3740
3741 static void
3742 vect_model_reduction_cost (stmt_vec_info stmt_info, internal_fn reduc_fn,
3743 int ncopies, stmt_vector_for_cost *cost_vec)
3744 {
3745 int prologue_cost = 0, epilogue_cost = 0, inside_cost;
3746 enum tree_code code;
3747 optab optab;
3748 tree vectype;
3749 machine_mode mode;
3750 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3751 struct loop *loop = NULL;
3752
3753 if (loop_vinfo)
3754 loop = LOOP_VINFO_LOOP (loop_vinfo);
3755
3756 /* Condition reductions generate two reductions in the loop. */
3757 vect_reduction_type reduction_type
3758 = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info);
3759 if (reduction_type == COND_REDUCTION)
3760 ncopies *= 2;
3761
3762 vectype = STMT_VINFO_VECTYPE (stmt_info);
3763 mode = TYPE_MODE (vectype);
3764 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
3765
3766 code = gimple_assign_rhs_code (orig_stmt_info->stmt);
3767
3768 if (reduction_type == EXTRACT_LAST_REDUCTION
3769 || reduction_type == FOLD_LEFT_REDUCTION)
3770 {
3771 /* No extra instructions needed in the prologue. */
3772 prologue_cost = 0;
3773
3774 if (reduction_type == EXTRACT_LAST_REDUCTION || reduc_fn != IFN_LAST)
3775 /* Count one reduction-like operation per vector. */
3776 inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
3777 stmt_info, 0, vect_body);
3778 else
3779 {
3780 /* Use NELEMENTS extracts and NELEMENTS scalar ops. */
3781 unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
3782 inside_cost = record_stmt_cost (cost_vec, nelements,
3783 vec_to_scalar, stmt_info, 0,
3784 vect_body);
3785 inside_cost += record_stmt_cost (cost_vec, nelements,
3786 scalar_stmt, stmt_info, 0,
3787 vect_body);
3788 }
3789 }
3790 else
3791 {
3792 /* Add in cost for initial definition.
3793 For cond reduction we have four vectors: initial index, step,
3794 initial result of the data reduction, initial value of the index
3795 reduction. */
3796 int prologue_stmts = reduction_type == COND_REDUCTION ? 4 : 1;
3797 prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
3798 scalar_to_vec, stmt_info, 0,
3799 vect_prologue);
3800
3801 /* Cost of reduction op inside loop. */
3802 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
3803 stmt_info, 0, vect_body);
3804 }
3805
3806 /* Determine cost of epilogue code.
3807
3808 We have a reduction operator that will reduce the vector in one statement.
3809 Also requires scalar extract. */
3810
3811 if (!loop || !nested_in_vect_loop_p (loop, orig_stmt_info))
3812 {
3813 if (reduc_fn != IFN_LAST)
3814 {
3815 if (reduction_type == COND_REDUCTION)
3816 {
3817 /* An EQ stmt and a COND_EXPR stmt. */
3818 epilogue_cost += record_stmt_cost (cost_vec, 2,
3819 vector_stmt, stmt_info, 0,
3820 vect_epilogue);
3821 /* Reduction of the max index and a reduction of the found
3822 values. */
3823 epilogue_cost += record_stmt_cost (cost_vec, 2,
3824 vec_to_scalar, stmt_info, 0,
3825 vect_epilogue);
3826 /* A broadcast of the max value. */
3827 epilogue_cost += record_stmt_cost (cost_vec, 1,
3828 scalar_to_vec, stmt_info, 0,
3829 vect_epilogue);
3830 }
3831 else
3832 {
3833 epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
3834 stmt_info, 0, vect_epilogue);
3835 epilogue_cost += record_stmt_cost (cost_vec, 1,
3836 vec_to_scalar, stmt_info, 0,
3837 vect_epilogue);
3838 }
3839 }
3840 else if (reduction_type == COND_REDUCTION)
3841 {
3842 unsigned estimated_nunits = vect_nunits_for_cost (vectype);
3843 /* Extraction of scalar elements. */
3844 epilogue_cost += record_stmt_cost (cost_vec,
3845 2 * estimated_nunits,
3846 vec_to_scalar, stmt_info, 0,
3847 vect_epilogue);
3848 /* Scalar max reductions via COND_EXPR / MAX_EXPR. */
3849 epilogue_cost += record_stmt_cost (cost_vec,
3850 2 * estimated_nunits - 3,
3851 scalar_stmt, stmt_info, 0,
3852 vect_epilogue);
3853 }
3854 else if (reduction_type == EXTRACT_LAST_REDUCTION
3855 || reduction_type == FOLD_LEFT_REDUCTION)
3856 /* No extra instructions are needed in the epilogue. */
3857 ;
3858 else
3859 {
3860 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
3861 tree bitsize =
3862 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt_info->stmt)));
3863 int element_bitsize = tree_to_uhwi (bitsize);
3864 int nelements = vec_size_in_bits / element_bitsize;
3865
3866 if (code == COND_EXPR)
3867 code = MAX_EXPR;
3868
3869 optab = optab_for_tree_code (code, vectype, optab_default);
3870
3871 /* We have a whole vector shift available. */
3872 if (optab != unknown_optab
3873 && VECTOR_MODE_P (mode)
3874 && optab_handler (optab, mode) != CODE_FOR_nothing
3875 && have_whole_vector_shift (mode))
3876 {
3877 /* Final reduction via vector shifts and the reduction operator.
3878 Also requires scalar extract. */
3879 epilogue_cost += record_stmt_cost (cost_vec,
3880 exact_log2 (nelements) * 2,
3881 vector_stmt, stmt_info, 0,
3882 vect_epilogue);
3883 epilogue_cost += record_stmt_cost (cost_vec, 1,
3884 vec_to_scalar, stmt_info, 0,
3885 vect_epilogue);
3886 }
3887 else
3888 /* Use extracts and reduction op for final reduction. For N
3889 elements, we have N extracts and N-1 reduction ops. */
3890 epilogue_cost += record_stmt_cost (cost_vec,
3891 nelements + nelements - 1,
3892 vector_stmt, stmt_info, 0,
3893 vect_epilogue);
3894 }
3895 }
3896
3897 if (dump_enabled_p ())
3898 dump_printf (MSG_NOTE,
3899 "vect_model_reduction_cost: inside_cost = %d, "
3900 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
3901 prologue_cost, epilogue_cost);
3902 }
3903
3904
3905 /* Function vect_model_induction_cost.
3906
3907 Models cost for induction operations. */
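/* For example (illustrative): a simple counted IV vectorized with
   ncopies == 2 records two vector_stmt body costs (the vector adds)
   and two scalar_to_vec prologue costs (building the initial vector
   and the step vector).  */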
3908
3909 static void
3910 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies,
3911 stmt_vector_for_cost *cost_vec)
3912 {
3913 unsigned inside_cost, prologue_cost;
3914
3915 if (PURE_SLP_STMT (stmt_info))
3916 return;
3917
3918 /* loop cost for vec_loop. */
3919 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
3920 stmt_info, 0, vect_body);
3921
3922 /* prologue cost for vec_init and vec_step. */
3923 prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
3924 stmt_info, 0, vect_prologue);
3925
3926 if (dump_enabled_p ())
3927 dump_printf_loc (MSG_NOTE, vect_location,
3928 "vect_model_induction_cost: inside_cost = %d, "
3929 "prologue_cost = %d .\n", inside_cost, prologue_cost);
3930 }
3931
3932
3933
3934 /* Function get_initial_def_for_reduction
3935
3936 Input:
3937 STMT_VINFO - a stmt that performs a reduction operation in the loop.
3938 INIT_VAL - the initial value of the reduction variable
3939
3940 Output:
3941 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
3942 of the reduction (used for adjusting the epilog - see below).
3943 Return a vector variable, initialized according to the operation that
3944 STMT_VINFO performs. This vector will be used as the initial value
3945 of the vector of partial results.
3946
3947 Option1 (adjust in epilog): Initialize the vector as follows:
3948 add/bit or/xor: [0,0,...,0,0]
3949 mult/bit and: [1,1,...,1,1]
3950 min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
3951 and when necessary (e.g. add/mult case) let the caller know
3952 that it needs to adjust the result by init_val.
3953
3954 Option2: Initialize the vector as follows:
3955 add/bit or/xor: [init_val,0,0,...,0]
3956 mult/bit and: [init_val,1,1,...,1]
3957 min/max/cond_expr: [init_val,init_val,...,init_val]
3958 and no adjustments are needed.
3959
3960 For example, for the following code:
3961
3962 s = init_val;
3963 for (i=0;i<n;i++)
3964 s = s + a[i];
3965
3966 STMT_VINFO is 's = s + a[i]', and the reduction variable is 's'.
3967 For a vector of 4 units, we want to return either [0,0,0,init_val],
3968 or [0,0,0,0] and let the caller know that it needs to adjust
3969 the result at the end by 'init_val'.
3970
3971 FORNOW, we use Option1 (adjust in epilog) when ADJUSTMENT_DEF is not
3972 NULL, because its initialization vector is simpler (same element in
3973 all entries), and Option2 otherwise.
3974
3975 A cost model should help decide between these two schemes. */
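/* A small illustration (values assumed): for a MULT_EXPR reduction with
   init_val == 5 and a 4-element vector, Option1 builds {1,1,1,1} and
   asks the caller to adjust the final result by 5, while Option2
   builds {5,1,1,1} and needs no adjustment.  */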
3976
3977 tree
3978 get_initial_def_for_reduction (stmt_vec_info stmt_vinfo, tree init_val,
3979 tree *adjustment_def)
3980 {
3981 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
3982 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3983 tree scalar_type = TREE_TYPE (init_val);
3984 tree vectype = get_vectype_for_scalar_type (scalar_type);
3985 enum tree_code code = gimple_assign_rhs_code (stmt_vinfo->stmt);
3986 tree def_for_init;
3987 tree init_def;
3988 REAL_VALUE_TYPE real_init_val = dconst0;
3989 int int_init_val = 0;
3990 gimple_seq stmts = NULL;
3991
3992 gcc_assert (vectype);
3993
3994 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
3995 || SCALAR_FLOAT_TYPE_P (scalar_type));
3996
3997 gcc_assert (nested_in_vect_loop_p (loop, stmt_vinfo)
3998 || loop == (gimple_bb (stmt_vinfo->stmt))->loop_father);
3999
4000 vect_reduction_type reduction_type
4001 = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_vinfo);
4002
4003 switch (code)
4004 {
4005 case WIDEN_SUM_EXPR:
4006 case DOT_PROD_EXPR:
4007 case SAD_EXPR:
4008 case PLUS_EXPR:
4009 case MINUS_EXPR:
4010 case BIT_IOR_EXPR:
4011 case BIT_XOR_EXPR:
4012 case MULT_EXPR:
4013 case BIT_AND_EXPR:
4014 {
4015 /* ADJUSTMENT_DEF is NULL when called from
4016 vect_create_epilog_for_reduction to vectorize double reduction. */
4017 if (adjustment_def)
4018 *adjustment_def = init_val;
4019
4020 if (code == MULT_EXPR)
4021 {
4022 real_init_val = dconst1;
4023 int_init_val = 1;
4024 }
4025
4026 if (code == BIT_AND_EXPR)
4027 int_init_val = -1;
4028
4029 if (SCALAR_FLOAT_TYPE_P (scalar_type))
4030 def_for_init = build_real (scalar_type, real_init_val);
4031 else
4032 def_for_init = build_int_cst (scalar_type, int_init_val);
4033
4034 if (adjustment_def)
4035 /* Option1: the first element is '0' or '1' as well. */
4036 init_def = gimple_build_vector_from_val (&stmts, vectype,
4037 def_for_init);
4038 else if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
4039 {
4040 /* Option2 (variable length): the first element is INIT_VAL. */
4041 init_def = gimple_build_vector_from_val (&stmts, vectype,
4042 def_for_init);
4043 init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
4044 vectype, init_def, init_val);
4045 }
4046 else
4047 {
4048 /* Option2: the first element is INIT_VAL. */
4049 tree_vector_builder elts (vectype, 1, 2);
4050 elts.quick_push (init_val);
4051 elts.quick_push (def_for_init);
4052 init_def = gimple_build_vector (&stmts, &elts);
4053 }
4054 }
4055 break;
4056
4057 case MIN_EXPR:
4058 case MAX_EXPR:
4059 case COND_EXPR:
4060 {
4061 if (adjustment_def)
4062 {
4063 *adjustment_def = NULL_TREE;
4064 if (reduction_type != COND_REDUCTION
4065 && reduction_type != EXTRACT_LAST_REDUCTION)
4066 {
4067 init_def = vect_get_vec_def_for_operand (init_val, stmt_vinfo);
4068 break;
4069 }
4070 }
4071 init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
4072 init_def = gimple_build_vector_from_val (&stmts, vectype, init_val);
4073 }
4074 break;
4075
4076 default:
4077 gcc_unreachable ();
4078 }
4079
4080 if (stmts)
4081 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
4082 return init_def;
4083 }
4084
4085 /* Get at the initial defs for the reduction PHIs in SLP_NODE.
4086 NUMBER_OF_VECTORS is the number of vector defs to create.
4087 If NEUTRAL_OP is nonnull, introducing extra elements of that
4088 value will not change the result. */
4089
4090 static void
4091 get_initial_defs_for_reduction (slp_tree slp_node,
4092 vec<tree> *vec_oprnds,
4093 unsigned int number_of_vectors,
4094 bool reduc_chain, tree neutral_op)
4095 {
4096 vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
4097 stmt_vec_info stmt_vinfo = stmts[0];
4098 unsigned HOST_WIDE_INT nunits;
4099 unsigned j, number_of_places_left_in_vector;
4100 tree vector_type;
4101 unsigned int group_size = stmts.length ();
4102 unsigned int i;
4103 struct loop *loop;
4104
4105 vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
4106
4107 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def);
4108
4109 loop = (gimple_bb (stmt_vinfo->stmt))->loop_father;
4110 gcc_assert (loop);
4111 edge pe = loop_preheader_edge (loop);
4112
4113 gcc_assert (!reduc_chain || neutral_op);
4114
4115 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
4116 created vectors. It is greater than 1 if unrolling is performed.
4117
4118 For example, we have two scalar operands, s1 and s2 (e.g., group of
4119 strided accesses of size two), while NUNITS is four (i.e., four scalars
4120 of this type can be packed in a vector). The output vector will contain
4121 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
4122 will be 2).
4123
4124 If REDUC_GROUP_SIZE > NUNITS, the scalars will be split into several
4125 vectors containing the operands.
4126
4127 For example, NUNITS is four as before, and the group size is 8
4128 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
4129 {s5, s6, s7, s8}. */
4130
4131 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
4132 nunits = group_size;
4133
4134 number_of_places_left_in_vector = nunits;
4135 bool constant_p = true;
4136 tree_vector_builder elts (vector_type, nunits, 1);
4137 elts.quick_grow (nunits);
4138 gimple_seq ctor_seq = NULL;
4139 for (j = 0; j < nunits * number_of_vectors; ++j)
4140 {
4141 tree op;
4142 i = j % group_size;
4143 stmt_vinfo = stmts[i];
4144
4145 /* Get the def before the loop. In a reduction chain we have only
4146 one initial value; otherwise we have as many as there are PHIs in the group. */
4147 if (reduc_chain)
4148 op = j != 0 ? neutral_op : PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt, pe);
4149 else if (((vec_oprnds->length () + 1) * nunits
4150 - number_of_places_left_in_vector >= group_size)
4151 && neutral_op)
4152 op = neutral_op;
4153 else
4154 op = PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt, pe);
4155
4156 /* Create 'vect_ = {op0,op1,...,opn}'. */
4157 number_of_places_left_in_vector--;
4158 elts[nunits - number_of_places_left_in_vector - 1] = op;
4159 if (!CONSTANT_CLASS_P (op))
4160 constant_p = false;
4161
4162 if (number_of_places_left_in_vector == 0)
4163 {
4164 tree init;
4165 if (constant_p && !neutral_op
4166 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
4167 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
4168 /* Build the vector directly from ELTS. */
4169 init = gimple_build_vector (&ctor_seq, &elts);
4170 else if (neutral_op)
4171 {
4172 /* Build a vector of the neutral value and shift the
4173 other elements into place. */
4174 init = gimple_build_vector_from_val (&ctor_seq, vector_type,
4175 neutral_op);
4176 int k = nunits;
4177 while (k > 0 && elts[k - 1] == neutral_op)
4178 k -= 1;
4179 while (k > 0)
4180 {
4181 k -= 1;
4182 init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
4183 vector_type, init, elts[k]);
4184 }
4185 }
4186 else
4187 {
4188 /* First time round, duplicate ELTS to fill the
4189 required number of vectors. */
4190 duplicate_and_interleave (&ctor_seq, vector_type, elts,
4191 number_of_vectors, *vec_oprnds);
4192 break;
4193 }
4194 vec_oprnds->quick_push (init);
4195
4196 number_of_places_left_in_vector = nunits;
4197 elts.new_vector (vector_type, nunits, 1);
4198 elts.quick_grow (nunits);
4199 constant_p = true;
4200 }
4201 }
4202 if (ctor_seq != NULL)
4203 gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4204 }
4205
4206
4207 /* Function vect_create_epilog_for_reduction
4208
4209 Create code at the loop-epilog to finalize the result of a reduction
4210 computation.
4211
4212 VECT_DEFS is list of vector of partial results, i.e., the lhs's of vector
4213 reduction statements.
4214 STMT_INFO is the scalar reduction stmt that is being vectorized.
4215 NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
4216 number of elements that we can fit in a vectype (nunits). In this case
4217 we have to generate more than one vector stmt - i.e - we need to "unroll"
4218 the vector stmt by a factor VF/nunits. For more details see documentation
4219 in vectorizable_operation.
4220 REDUC_FN is the internal function for the epilog reduction.
4221 REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
4222 computation.
4223 REDUC_INDEX is the index of the operand in the right hand side of the
4224 statement that is defined by REDUCTION_PHI.
4225 DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
4226 SLP_NODE is an SLP node containing a group of reduction statements. The
4227 first one in this group is STMT_INFO.
4228 INDUC_VAL is for INTEGER_INDUC_COND_REDUCTION the value to use for the case
4229 when the COND_EXPR is never true in the loop. For MAX_EXPR, it needs to
4230 be smaller than any value of the IV in the loop, for MIN_EXPR larger than
4231 any value of the IV in the loop.
4232 INDUC_CODE is the code for epilog reduction if INTEGER_INDUC_COND_REDUCTION.
4233 NEUTRAL_OP is the value given by neutral_op_for_slp_reduction; it is
4234 null if this is not an SLP reduction.
4235
4236 This function:
4237 1. Creates the reduction def-use cycles: sets the arguments for
4238 REDUCTION_PHIS:
4239 The loop-entry argument is the vectorized initial-value of the reduction.
4240 The loop-latch argument is taken from VECT_DEFS - the vector of partial
4241 sums.
4242 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
4243 by calling the function specified by REDUC_FN if available, or by
4244 other means (whole-vector shifts or a scalar loop).
4245 The function also creates a new phi node at the loop exit to preserve
4246 loop-closed form, as illustrated below.
4247
4248 The flow at the entry to this function:
4249
4250 loop:
4251 vec_def = phi <null, null> # REDUCTION_PHI
4252 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
4253 s_loop = scalar_stmt # (scalar) STMT_INFO
4254 loop_exit:
4255 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4256 use <s_out0>
4257 use <s_out0>
4258
4259 The above is transformed by this function into:
4260
4261 loop:
4262 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4263 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
4264 s_loop = scalar_stmt # (scalar) STMT_INFO
4265 loop_exit:
4266 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4267 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4268 v_out2 = reduce <v_out1>
4269 s_out3 = extract_field <v_out2, 0>
4270 s_out4 = adjust_result <s_out3>
4271 use <s_out4>
4272 use <s_out4>
4273 */
4274
4275 static void
4276 vect_create_epilog_for_reduction (vec<tree> vect_defs,
4277 stmt_vec_info stmt_info,
4278 gimple *reduc_def_stmt,
4279 int ncopies, internal_fn reduc_fn,
4280 vec<stmt_vec_info> reduction_phis,
4281 bool double_reduc,
4282 slp_tree slp_node,
4283 slp_instance slp_node_instance,
4284 tree induc_val, enum tree_code induc_code,
4285 tree neutral_op)
4286 {
4287 stmt_vec_info prev_phi_info;
4288 tree vectype;
4289 machine_mode mode;
4290 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4291 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
4292 basic_block exit_bb;
4293 tree scalar_dest;
4294 tree scalar_type;
4295 gimple *new_phi = NULL, *phi;
4296 stmt_vec_info phi_info;
4297 gimple_stmt_iterator exit_gsi;
4298 tree vec_dest;
4299 tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
4300 gimple *epilog_stmt = NULL;
4301 enum tree_code code = gimple_assign_rhs_code (stmt_info->stmt);
4302 gimple *exit_phi;
4303 tree bitsize;
4304 tree adjustment_def = NULL;
4305 tree vec_initial_def = NULL;
4306 tree expr, def, initial_def = NULL;
4307 tree orig_name, scalar_result;
4308 imm_use_iterator imm_iter, phi_imm_iter;
4309 use_operand_p use_p, phi_use_p;
4310 gimple *use_stmt;
4311 stmt_vec_info reduction_phi_info = NULL;
4312 bool nested_in_vect_loop = false;
4313 auto_vec<gimple *> new_phis;
4314 auto_vec<stmt_vec_info> inner_phis;
4315 int j, i;
4316 auto_vec<tree> scalar_results;
4317 unsigned int group_size = 1, k, ratio;
4318 auto_vec<tree> vec_initial_defs;
4319 auto_vec<gimple *> phis;
4320 bool slp_reduc = false;
4321 bool direct_slp_reduc;
4322 tree new_phi_result;
4323 stmt_vec_info inner_phi = NULL;
4324 tree induction_index = NULL_TREE;
4325
4326 if (slp_node)
4327 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
4328
4329 if (nested_in_vect_loop_p (loop, stmt_info))
4330 {
4331 outer_loop = loop;
4332 loop = loop->inner;
4333 nested_in_vect_loop = true;
4334 gcc_assert (!slp_node);
4335 }
4336
4337 vectype = STMT_VINFO_VECTYPE (stmt_info);
4338 gcc_assert (vectype);
4339 mode = TYPE_MODE (vectype);
4340
4341 /* 1. Create the reduction def-use cycle:
4342 Set the arguments of REDUCTION_PHIS, i.e., transform
4343
4344 loop:
4345 vec_def = phi <null, null> # REDUCTION_PHI
4346 VECT_DEF = vector_stmt # vectorized form of STMT
4347 ...
4348
4349 into:
4350
4351 loop:
4352 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4353 VECT_DEF = vector_stmt # vectorized form of STMT
4354 ...
4355
4356 (in case of SLP, do it for all the phis). */
4357
4358 /* Get the loop-entry arguments. */
4359 enum vect_def_type initial_def_dt = vect_unknown_def_type;
4360 if (slp_node)
4361 {
4362 unsigned vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
4363 vec_initial_defs.reserve (vec_num);
4364 get_initial_defs_for_reduction (slp_node_instance->reduc_phis,
4365 &vec_initial_defs, vec_num,
4366 REDUC_GROUP_FIRST_ELEMENT (stmt_info),
4367 neutral_op);
4368 }
4369 else
4370 {
4371 /* Get at the scalar def before the loop, that defines the initial value
4372 of the reduction variable. */
4373 initial_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
4374 loop_preheader_edge (loop));
4375 /* Optimize: if initial_def is for REDUC_MAX smaller than the base
4376 and we can't use zero for induc_val, use initial_def. Similarly
4377 for REDUC_MIN and initial_def larger than the base. */
4378 if (TREE_CODE (initial_def) == INTEGER_CST
4379 && (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4380 == INTEGER_INDUC_COND_REDUCTION)
4381 && !integer_zerop (induc_val)
4382 && ((induc_code == MAX_EXPR
4383 && tree_int_cst_lt (initial_def, induc_val))
4384 || (induc_code == MIN_EXPR
4385 && tree_int_cst_lt (induc_val, initial_def))))
4386 induc_val = initial_def;
4387
4388 if (double_reduc)
4389 /* In case of double reduction we only create a vector variable
4390 to be put in the reduction phi node. The actual statement
4391 creation is done later in this function. */
4392 vec_initial_def = vect_create_destination_var (initial_def, vectype);
4393 else if (nested_in_vect_loop)
4394 {
4395 /* Do not use an adjustment def as that case is not supported
4396 correctly if ncopies is not one. */
4397 vect_is_simple_use (initial_def, loop_vinfo, &initial_def_dt);
4398 vec_initial_def = vect_get_vec_def_for_operand (initial_def,
4399 stmt_info);
4400 }
4401 else
4402 vec_initial_def
4403 = get_initial_def_for_reduction (stmt_info, initial_def,
4404 &adjustment_def);
4405 vec_initial_defs.create (1);
4406 vec_initial_defs.quick_push (vec_initial_def);
4407 }
4408
4409 /* Set phi nodes arguments. */
4410 FOR_EACH_VEC_ELT (reduction_phis, i, phi_info)
4411 {
4412 tree vec_init_def = vec_initial_defs[i];
4413 tree def = vect_defs[i];
4414 for (j = 0; j < ncopies; j++)
4415 {
4416 if (j != 0)
4417 {
4418 phi_info = STMT_VINFO_RELATED_STMT (phi_info);
4419 if (nested_in_vect_loop)
4420 vec_init_def
4421 = vect_get_vec_def_for_stmt_copy (loop_vinfo, vec_init_def);
4422 }
4423
4424 /* Set the loop-entry arg of the reduction-phi. */
4425
4426 gphi *phi = as_a <gphi *> (phi_info->stmt);
4427 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4428 == INTEGER_INDUC_COND_REDUCTION)
4429 {
4430 /* Initialise the reduction phi to zero. This prevents non-zero
4431 initial values from interfering with the reduction op. */
4432 gcc_assert (ncopies == 1);
4433 gcc_assert (i == 0);
4434
4435 tree vec_init_def_type = TREE_TYPE (vec_init_def);
4436 tree induc_val_vec
4437 = build_vector_from_val (vec_init_def_type, induc_val);
4438
4439 add_phi_arg (phi, induc_val_vec, loop_preheader_edge (loop),
4440 UNKNOWN_LOCATION);
4441 }
4442 else
4443 add_phi_arg (phi, vec_init_def, loop_preheader_edge (loop),
4444 UNKNOWN_LOCATION);
4445
4446 /* Set the loop-latch arg for the reduction-phi. */
4447 if (j > 0)
4448 def = vect_get_vec_def_for_stmt_copy (loop_vinfo, def);
4449
4450 add_phi_arg (phi, def, loop_latch_edge (loop), UNKNOWN_LOCATION);
4451
4452 if (dump_enabled_p ())
4453 dump_printf_loc (MSG_NOTE, vect_location,
4454 "transform reduction: created def-use cycle: %G%G",
4455 phi, SSA_NAME_DEF_STMT (def));
4456 }
4457 }
4458
4459 /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
4460 which is updated with the current index of the loop for every match of
4461 the original loop's cond_expr (VEC_STMT). This results in a vector
4462 containing the last time the condition passed for that vector lane.
4463 The first match will be a 1 to allow 0 to be used for non-matching
4464 indexes. If there are no matches at all then the vector will be all
4465 zeroes. */
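/* Illustrative example (values assumed): with four lanes and three
   vector iterations, the induction variable below takes the values
   {1,2,3,4}, {5,6,7,8} and {9,10,11,12}.  If only lane 2's condition
   holds, in the first and third iterations, the final INDEX_COND_EXPR
   vector is {0, 0, 11, 0}.  */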
4466 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
4467 {
4468 tree indx_before_incr, indx_after_incr;
4469 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
4470
4471 gimple *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info)->stmt;
4472 gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
4473
4474 int scalar_precision
4475 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
4476 tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
4477 tree cr_index_vector_type = build_vector_type
4478 (cr_index_scalar_type, TYPE_VECTOR_SUBPARTS (vectype));
4479
4480 /* First we create a simple vector induction variable which starts
4481 with the values {1,2,3,...} (SERIES_VECT) and increments by the
4482 vector size (STEP). */
4483
4484 /* Create a {1,2,3,...} vector. */
4485 tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
4486
4487 /* Create a vector of the step value. */
4488 tree step = build_int_cst (cr_index_scalar_type, nunits_out);
4489 tree vec_step = build_vector_from_val (cr_index_vector_type, step);
4490
4491 /* Create an induction variable. */
4492 gimple_stmt_iterator incr_gsi;
4493 bool insert_after;
4494 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4495 create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi,
4496 insert_after, &indx_before_incr, &indx_after_incr);
4497
4498 /* Next create a new phi node vector (NEW_PHI_TREE) which starts
4499 filled with zeros (VEC_ZERO). */
4500
4501 /* Create a vector of 0s. */
4502 tree zero = build_zero_cst (cr_index_scalar_type);
4503 tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
4504
4505 /* Create a vector phi node. */
4506 tree new_phi_tree = make_ssa_name (cr_index_vector_type);
4507 new_phi = create_phi_node (new_phi_tree, loop->header);
4508 loop_vinfo->add_stmt (new_phi);
4509 add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
4510 loop_preheader_edge (loop), UNKNOWN_LOCATION);
4511
4512 /* Now take the condition from the loop's original cond_expr
4513 (VEC_STMT) and produce a new cond_expr (INDEX_COND_EXPR) which for
4514 every match uses values from the induction variable
4515 (INDEX_BEFORE_INCR) otherwise uses values from the phi node
4516 (NEW_PHI_TREE).
4517 Finally, we update the phi (NEW_PHI_TREE) to take the value of
4518 the new cond_expr (INDEX_COND_EXPR). */
4519
4520 /* Duplicate the condition from vec_stmt. */
4521 tree ccompare = unshare_expr (gimple_assign_rhs1 (vec_stmt));
4522
4523 /* Create a conditional, where the condition is taken from vec_stmt
4524 (CCOMPARE), then is the induction index (INDEX_BEFORE_INCR) and
4525 else is the phi (NEW_PHI_TREE). */
4526 tree index_cond_expr = build3 (VEC_COND_EXPR, cr_index_vector_type,
4527 ccompare, indx_before_incr,
4528 new_phi_tree);
4529 induction_index = make_ssa_name (cr_index_vector_type);
4530 gimple *index_condition = gimple_build_assign (induction_index,
4531 index_cond_expr);
4532 gsi_insert_before (&incr_gsi, index_condition, GSI_SAME_STMT);
4533 stmt_vec_info index_vec_info = loop_vinfo->add_stmt (index_condition);
4534 STMT_VINFO_VECTYPE (index_vec_info) = cr_index_vector_type;
4535
4536 /* Update the phi with the vec cond. */
4537 add_phi_arg (as_a <gphi *> (new_phi), induction_index,
4538 loop_latch_edge (loop), UNKNOWN_LOCATION);
4539 }
4540
4541 /* 2. Create epilog code.
4542 The reduction epilog code operates across the elements of the vector
4543 of partial results computed by the vectorized loop.
4544 The reduction epilog code consists of:
4545
4546 step 1: compute the scalar result in a vector (v_out2)
4547 step 2: extract the scalar result (s_out3) from the vector (v_out2)
4548 step 3: adjust the scalar result (s_out3) if needed.
4549
4550 Step 1 can be accomplished using one of the following three schemes:
4551 (scheme 1) using reduc_fn, if available.
4552 (scheme 2) using whole-vector shifts, if available.
4553 (scheme 3) using a scalar loop. In this case steps 1+2 above are
4554 combined.
4555
4556 The overall epilog code looks like this:
4557
4558 s_out0 = phi <s_loop> # original EXIT_PHI
4559 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4560 v_out2 = reduce <v_out1> # step 1
4561 s_out3 = extract_field <v_out2, 0> # step 2
4562 s_out4 = adjust_result <s_out3> # step 3
4563
4564 (step 3 is optional, and steps 1 and 2 may be combined).
4565 Lastly, the uses of s_out0 are replaced by s_out4. */
4566
4567
4568 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
4569 v_out1 = phi <VECT_DEF>
4570 Store them in NEW_PHIS. */
4571
4572 exit_bb = single_exit (loop)->dest;
4573 prev_phi_info = NULL;
4574 new_phis.create (vect_defs.length ());
4575 FOR_EACH_VEC_ELT (vect_defs, i, def)
4576 {
4577 for (j = 0; j < ncopies; j++)
4578 {
4579 tree new_def = copy_ssa_name (def);
4580 phi = create_phi_node (new_def, exit_bb);
4581 stmt_vec_info phi_info = loop_vinfo->add_stmt (phi);
4582 if (j == 0)
4583 new_phis.quick_push (phi);
4584 else
4585 {
4586 def = vect_get_vec_def_for_stmt_copy (loop_vinfo, def);
4587 STMT_VINFO_RELATED_STMT (prev_phi_info) = phi_info;
4588 }
4589
4590 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
4591 prev_phi_info = phi_info;
4592 }
4593 }
4594
4595 /* The epilogue is created for the outer-loop, i.e., for the loop being
4596 vectorized. Create exit phis for the outer loop. */
4597 if (double_reduc)
4598 {
4599 loop = outer_loop;
4600 exit_bb = single_exit (loop)->dest;
4601 inner_phis.create (vect_defs.length ());
4602 FOR_EACH_VEC_ELT (new_phis, i, phi)
4603 {
4604 stmt_vec_info phi_info = loop_vinfo->lookup_stmt (phi);
4605 tree new_result = copy_ssa_name (PHI_RESULT (phi));
4606 gphi *outer_phi = create_phi_node (new_result, exit_bb);
4607 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4608 PHI_RESULT (phi));
4609 prev_phi_info = loop_vinfo->add_stmt (outer_phi);
4610 inner_phis.quick_push (phi_info);
4611 new_phis[i] = outer_phi;
4612 while (STMT_VINFO_RELATED_STMT (phi_info))
4613 {
4614 phi_info = STMT_VINFO_RELATED_STMT (phi_info);
4615 new_result = copy_ssa_name (PHI_RESULT (phi_info->stmt));
4616 outer_phi = create_phi_node (new_result, exit_bb);
4617 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4618 PHI_RESULT (phi_info->stmt));
4619 stmt_vec_info outer_phi_info = loop_vinfo->add_stmt (outer_phi);
4620 STMT_VINFO_RELATED_STMT (prev_phi_info) = outer_phi_info;
4621 prev_phi_info = outer_phi_info;
4622 }
4623 }
4624 }
4625
4626 exit_gsi = gsi_after_labels (exit_bb);
4627
4628 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
4629 (i.e. when reduc_fn is not available) and in the final adjustment
4630 code (if needed). Also get the original scalar reduction variable as
4631 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
4632 represents a reduction pattern), the tree-code and scalar-def are
4633 taken from the original stmt that the pattern-stmt (STMT) replaces.
4634 Otherwise (it is a regular reduction) - the tree-code and scalar-def
4635 are taken from STMT. */
4636
4637 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
4638 if (orig_stmt_info != stmt_info)
4639 {
4640 /* Reduction pattern */
4641 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
4642 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info);
4643 }
4644
4645 code = gimple_assign_rhs_code (orig_stmt_info->stmt);
4646 /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
4647 partial results are added and not subtracted. */
4648 if (code == MINUS_EXPR)
4649 code = PLUS_EXPR;
4650
4651 scalar_dest = gimple_assign_lhs (orig_stmt_info->stmt);
4652 scalar_type = TREE_TYPE (scalar_dest);
4653 scalar_results.create (group_size);
4654 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
4655 bitsize = TYPE_SIZE (scalar_type);
4656
4657 /* In case this is a reduction in an inner-loop while vectorizing an outer
4658 loop - we don't need to extract a single scalar result at the end of the
4659 inner-loop (unless it is double reduction, i.e., the use of reduction is
4660 outside the outer-loop). The final vector of partial results will be used
4661 in the vectorized outer-loop, or reduced to a scalar result at the end of
4662 the outer-loop. */
4663 if (nested_in_vect_loop && !double_reduc)
4664 goto vect_finalize_reduction;
4665
4666 /* SLP reduction without reduction chain, e.g.,
4667 # a1 = phi <a2, a0>
4668 # b1 = phi <b2, b0>
4669 a2 = operation (a1)
4670 b2 = operation (b1) */
4671 slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (stmt_info));
4672
4673 /* True if we should implement SLP_REDUC using native reduction operations
4674 instead of scalar operations. */
4675 direct_slp_reduc = (reduc_fn != IFN_LAST
4676 && slp_reduc
4677 && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
4678
4679 /* In case of reduction chain, e.g.,
4680 # a1 = phi <a3, a0>
4681 a2 = operation (a1)
4682 a3 = operation (a2),
4683
4684 we may end up with more than one vector result. Here we reduce them to
4685 one vector. */
4686 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) || direct_slp_reduc)
4687 {
4688 tree first_vect = PHI_RESULT (new_phis[0]);
4689 gassign *new_vec_stmt = NULL;
4690 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4691 for (k = 1; k < new_phis.length (); k++)
4692 {
4693 gimple *next_phi = new_phis[k];
4694 tree second_vect = PHI_RESULT (next_phi);
4695 tree tem = make_ssa_name (vec_dest, new_vec_stmt);
4696 new_vec_stmt = gimple_build_assign (tem, code,
4697 first_vect, second_vect);
4698 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4699 first_vect = tem;
4700 }
4701
4702 new_phi_result = first_vect;
4703 if (new_vec_stmt)
4704 {
4705 new_phis.truncate (0);
4706 new_phis.safe_push (new_vec_stmt);
4707 }
4708 }
4709 /* Likewise if we couldn't use a single defuse cycle. */
4710 else if (ncopies > 1)
4711 {
4712 gcc_assert (new_phis.length () == 1);
4713 tree first_vect = PHI_RESULT (new_phis[0]);
4714 gassign *new_vec_stmt = NULL;
4715 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4716 stmt_vec_info next_phi_info = loop_vinfo->lookup_stmt (new_phis[0]);
4717 for (int k = 1; k < ncopies; ++k)
4718 {
4719 next_phi_info = STMT_VINFO_RELATED_STMT (next_phi_info);
4720 tree second_vect = PHI_RESULT (next_phi_info->stmt);
4721 tree tem = make_ssa_name (vec_dest, new_vec_stmt);
4722 new_vec_stmt = gimple_build_assign (tem, code,
4723 first_vect, second_vect);
4724 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4725 first_vect = tem;
4726 }
4727 new_phi_result = first_vect;
4728 new_phis.truncate (0);
4729 new_phis.safe_push (new_vec_stmt);
4730 }
4731 else
4732 new_phi_result = PHI_RESULT (new_phis[0]);
4733
4734 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
4735 && reduc_fn != IFN_LAST)
4736 {
4737 /* For condition reductions, we have a vector (NEW_PHI_RESULT) containing
4738 various data values where the condition matched and another vector
4739 (INDUCTION_INDEX) containing all the indexes of those matches. We
4740 need to extract the last matching index (which will be the index with
4741 highest value) and use this to index into the data vector.
4742 For the case where there were no matches, the data vector will contain
4743 all default values and the index vector will be all zeros. */
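/* Illustrative example (values assumed): NEW_PHI_RESULT = {d0, d1, d2, d3}
   and INDUCTION_INDEX = {0, 7, 4, 0}.  The max index is 7, the compare
   selects {0, d1, 0, 0}, and the final unsigned MAX reduction returns
   d1.  */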
4744
4745 /* Get various versions of the type of the vector of indexes. */
4746 tree index_vec_type = TREE_TYPE (induction_index);
4747 gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
4748 tree index_scalar_type = TREE_TYPE (index_vec_type);
4749 tree index_vec_cmp_type = build_same_sized_truth_vector_type
4750 (index_vec_type);
4751
4752 /* Get an unsigned integer version of the type of the data vector. */
4753 int scalar_precision
4754 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
4755 tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
4756 tree vectype_unsigned = build_vector_type
4757 (scalar_type_unsigned, TYPE_VECTOR_SUBPARTS (vectype));
4758
4759 /* First we need to create a vector (ZERO_VEC) of zeros and another
4760 vector (MAX_INDEX_VEC) filled with the last matching index, which we
4761 can create using a MAX reduction and then expanding.
4762 In the case where the loop never made any matches, the max index will
4763 be zero. */
4764
4765 /* Vector of {0, 0, 0,...}. */
4766 tree zero_vec = make_ssa_name (vectype);
4767 tree zero_vec_rhs = build_zero_cst (vectype);
4768 gimple *zero_vec_stmt = gimple_build_assign (zero_vec, zero_vec_rhs);
4769 gsi_insert_before (&exit_gsi, zero_vec_stmt, GSI_SAME_STMT);
4770
4771 /* Find maximum value from the vector of found indexes. */
4772 tree max_index = make_ssa_name (index_scalar_type);
4773 gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
4774 1, induction_index);
4775 gimple_call_set_lhs (max_index_stmt, max_index);
4776 gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
4777
4778 /* Vector of {max_index, max_index, max_index,...}. */
4779 tree max_index_vec = make_ssa_name (index_vec_type);
4780 tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
4781 max_index);
4782 gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
4783 max_index_vec_rhs);
4784 gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
4785
4786 /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
4787 with the vector (INDUCTION_INDEX) of found indexes, choosing values
4788 from the data vector (NEW_PHI_RESULT) for matches, 0 (ZERO_VEC)
4789 otherwise. Only one value should match, resulting in a vector
4790 (VEC_COND) with one data value and the rest zeros.
4791 In the case where the loop never made any matches, every index will
4792 match, resulting in a vector with all data values (which will all be
4793 the default value). */
4794
4795 /* Compare the max index vector to the vector of found indexes to find
4796 the position of the max value. */
4797 tree vec_compare = make_ssa_name (index_vec_cmp_type);
4798 gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
4799 induction_index,
4800 max_index_vec);
4801 gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
4802
4803 /* Use the compare to choose either values from the data vector or
4804 zero. */
4805 tree vec_cond = make_ssa_name (vectype);
4806 gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
4807 vec_compare, new_phi_result,
4808 zero_vec);
4809 gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
4810
4811 /* Finally we need to extract the data value from the vector (VEC_COND)
4812 into a scalar (MATCHED_DATA_REDUC). Logically we want to do an OR
4813 reduction, but because this doesn't exist, we can use a MAX reduction
4814 instead. The data value might be signed or a float so we need to cast
4815 it first.
4816 In the case where the loop never made any matches, the data values are
4817 all identical, and so will reduce down correctly. */
4818
4819 /* Make the matched data values unsigned. */
4820 tree vec_cond_cast = make_ssa_name (vectype_unsigned);
4821 tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
4822 vec_cond);
4823 gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
4824 VIEW_CONVERT_EXPR,
4825 vec_cond_cast_rhs);
4826 gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
4827
4828 /* Reduce down to a scalar value. */
4829 tree data_reduc = make_ssa_name (scalar_type_unsigned);
4830 gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
4831 1, vec_cond_cast);
4832 gimple_call_set_lhs (data_reduc_stmt, data_reduc);
4833 gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
4834
4835 /* Convert the reduced value back to the result type and set as the
4836 result. */
4837 gimple_seq stmts = NULL;
4838 new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
4839 data_reduc);
4840 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
4841 scalar_results.safe_push (new_temp);
4842 }
4843 else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
4844 && reduc_fn == IFN_LAST)
4845 {
4846 /* Condition reduction without supported IFN_REDUC_MAX. Generate
4847 idx = 0;
4848 idx_val = induction_index[0];
4849 val = data_reduc[0];
4850 for (idx = 0, val = init, i = 0; i < nelts; ++i)
4851 if (induction_index[i] > idx_val)
4852 val = data_reduc[i], idx_val = induction_index[i];
4853 return val; */
4854
4855 tree data_eltype = TREE_TYPE (TREE_TYPE (new_phi_result));
4856 tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
4857 unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
4858 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
4859 /* Enforced by vectorizable_reduction, which ensures we have target
4860 support before allowing a conditional reduction on variable-length
4861 vectors. */
4862 unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
4863 tree idx_val = NULL_TREE, val = NULL_TREE;
4864 for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
4865 {
4866 tree old_idx_val = idx_val;
4867 tree old_val = val;
4868 idx_val = make_ssa_name (idx_eltype);
4869 epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
4870 build3 (BIT_FIELD_REF, idx_eltype,
4871 induction_index,
4872 bitsize_int (el_size),
4873 bitsize_int (off)));
4874 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4875 val = make_ssa_name (data_eltype);
4876 epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
4877 build3 (BIT_FIELD_REF,
4878 data_eltype,
4879 new_phi_result,
4880 bitsize_int (el_size),
4881 bitsize_int (off)));
4882 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4883 if (off != 0)
4884 {
4885 tree new_idx_val = idx_val;
4886 tree new_val = val;
4887 if (off != v_size - el_size)
4888 {
4889 new_idx_val = make_ssa_name (idx_eltype);
4890 epilog_stmt = gimple_build_assign (new_idx_val,
4891 MAX_EXPR, idx_val,
4892 old_idx_val);
4893 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4894 }
4895 new_val = make_ssa_name (data_eltype);
4896 epilog_stmt = gimple_build_assign (new_val,
4897 COND_EXPR,
4898 build2 (GT_EXPR,
4899 boolean_type_node,
4900 idx_val,
4901 old_idx_val),
4902 val, old_val);
4903 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4904 idx_val = new_idx_val;
4905 val = new_val;
4906 }
4907 }
4908 /* Convert the reduced value back to the result type and set as the
4909 result. */
4910 gimple_seq stmts = NULL;
4911 val = gimple_convert (&stmts, scalar_type, val);
4912 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
4913 scalar_results.safe_push (val);
4914 }
4915
4916 /* 2.3 Create the reduction code, using one of the three schemes described
4917 above. In SLP we simply need to extract all the elements from the
4918 vector (without reducing them), so we use scalar shifts. */
4919 else if (reduc_fn != IFN_LAST && !slp_reduc)
4920 {
4921 tree tmp;
4922 tree vec_elem_type;
4923
4924 /* Case 1: Create:
4925 v_out2 = reduc_expr <v_out1> */
4926
4927 if (dump_enabled_p ())
4928 dump_printf_loc (MSG_NOTE, vect_location,
4929 "Reduce using direct vector reduction.\n");
4930
4931 vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
4932 if (!useless_type_conversion_p (scalar_type, vec_elem_type))
4933 {
4934 tree tmp_dest
4935 = vect_create_destination_var (scalar_dest, vec_elem_type);
4936 epilog_stmt = gimple_build_call_internal (reduc_fn, 1,
4937 new_phi_result);
4938 gimple_set_lhs (epilog_stmt, tmp_dest);
4939 new_temp = make_ssa_name (tmp_dest, epilog_stmt);
4940 gimple_set_lhs (epilog_stmt, new_temp);
4941 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4942
4943 epilog_stmt = gimple_build_assign (new_scalar_dest, NOP_EXPR,
4944 new_temp);
4945 }
4946 else
4947 {
4948 epilog_stmt = gimple_build_call_internal (reduc_fn, 1,
4949 new_phi_result);
4950 gimple_set_lhs (epilog_stmt, new_scalar_dest);
4951 }
4952
4953 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4954 gimple_set_lhs (epilog_stmt, new_temp);
4955 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4956
4957 if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4958 == INTEGER_INDUC_COND_REDUCTION)
4959 && !operand_equal_p (initial_def, induc_val, 0))
4960 {
4961 /* Earlier we set the initial value to be a vector of induc_val
4962 values. Check the result and if it is induc_val then replace
4963 with the original initial value, unless induc_val is
4964 the same as initial_def already. */
4965 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
4966 induc_val);
4967
4968 tmp = make_ssa_name (new_scalar_dest);
4969 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
4970 initial_def, new_temp);
4971 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4972 new_temp = tmp;
4973 }
4974
4975 scalar_results.safe_push (new_temp);
4976 }
4977 else if (direct_slp_reduc)
4978 {
4979 /* Here we create one vector for each of the REDUC_GROUP_SIZE results,
4980 with the elements for other SLP statements replaced with the
4981 neutral value. We can then do a normal reduction on each vector. */
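/* Illustrative example (values assumed): with group_size == 2 and
   NEW_PHI_RESULT = {a0, b0, a1, b1}, the masked index vector below is
   {0, 1, 0, 1}, so result 0 reduces {a0, N, a1, N} and result 1
   reduces {N, b0, N, b1}, where N is the neutral value (or the
   original initial value when there is no neutral value).  */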
4982
4983 /* Enforced by vectorizable_reduction. */
4984 gcc_assert (new_phis.length () == 1);
4985 gcc_assert (pow2p_hwi (group_size));
4986
4987 slp_tree orig_phis_slp_node = slp_node_instance->reduc_phis;
4988 vec<stmt_vec_info> orig_phis
4989 = SLP_TREE_SCALAR_STMTS (orig_phis_slp_node);
4990 gimple_seq seq = NULL;
4991
4992 /* Build a vector {0, 1, 2, ...}, with the same number of elements
4993 and the same element size as VECTYPE. */
4994 tree index = build_index_vector (vectype, 0, 1);
4995 tree index_type = TREE_TYPE (index);
4996 tree index_elt_type = TREE_TYPE (index_type);
4997 tree mask_type = build_same_sized_truth_vector_type (index_type);
4998
4999 /* Create a vector that, for each element, identifies which of
5000 the REDUC_GROUP_SIZE results should use it. */
5001 tree index_mask = build_int_cst (index_elt_type, group_size - 1);
5002 index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
5003 build_vector_from_val (index_type, index_mask));
5004
5005 /* Get a neutral vector value. This is simply a splat of the neutral
5006 scalar value if we have one, otherwise the initial scalar value
5007 is itself a neutral value. */
5008 tree vector_identity = NULL_TREE;
5009 if (neutral_op)
5010 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5011 neutral_op);
5012 for (unsigned int i = 0; i < group_size; ++i)
5013 {
5014 /* If there's no universal neutral value, we can use the
5015 initial scalar value from the original PHI. This is used
5016 for MIN and MAX reductions, for example. */
5017 if (!neutral_op)
5018 {
5019 tree scalar_value
5020 = PHI_ARG_DEF_FROM_EDGE (orig_phis[i]->stmt,
5021 loop_preheader_edge (loop));
5022 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5023 scalar_value);
5024 }
5025
5026 /* Calculate the equivalent of:
5027
5028 sel[j] = (index[j] == i);
5029
5030 which selects the elements of NEW_PHI_RESULT that should
5031 be included in the result. */
5032 tree compare_val = build_int_cst (index_elt_type, i);
5033 compare_val = build_vector_from_val (index_type, compare_val);
5034 tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
5035 index, compare_val);
5036
5037 /* Calculate the equivalent of:
5038
5039 vec = sel ? new_phi_result : vector_identity;
5040
5041 VEC is now suitable for a full vector reduction. */
5042 tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
5043 sel, new_phi_result, vector_identity);
5044
5045 /* Do the reduction and convert it to the appropriate type. */
5046 tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
5047 TREE_TYPE (vectype), vec);
5048 scalar = gimple_convert (&seq, scalar_type, scalar);
5049 scalar_results.safe_push (scalar);
5050 }
5051 gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
5052 }
5053 else
5054 {
5055 bool reduce_with_shift;
5056 tree vec_temp;
5057
5058 /* COND reductions all do the final reduction with MAX_EXPR
5059 or MIN_EXPR. */
5060 if (code == COND_EXPR)
5061 {
5062 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5063 == INTEGER_INDUC_COND_REDUCTION)
5064 code = induc_code;
5065 else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5066 == CONST_COND_REDUCTION)
5067 code = STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info);
5068 else
5069 code = MAX_EXPR;
5070 }
5071
5072 /* See if the target wants to do the final (shift) reduction
5073 in a vector mode of smaller size and first reduce upper/lower
5074 halves against each other. */
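/* Illustrative sketch (target-dependent, not a literal dump): with a
   V8SI accumulator and a target whose split_reduction hook prefers
   V4SI, the loop below first emits

     lo = BIT_FIELD_REF <v_out1, 128, 0>;
     hi = BIT_FIELD_REF <v_out1, 128, 128>;
     v_out1' = lo + hi;   // combined with the reduction CODE, here PLUS

   and only then performs the shift-based or scalar reduction on the
   narrower V4SI value.  */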
5075 enum machine_mode mode1 = mode;
5076 tree vectype1 = vectype;
5077 unsigned sz = tree_to_uhwi (TYPE_SIZE_UNIT (vectype));
5078 unsigned sz1 = sz;
5079 if (!slp_reduc
5080 && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
5081 sz1 = GET_MODE_SIZE (mode1).to_constant ();
5082
5083 vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz1);
5084 reduce_with_shift = have_whole_vector_shift (mode1);
5085 if (!VECTOR_MODE_P (mode1))
5086 reduce_with_shift = false;
5087 else
5088 {
5089 optab optab = optab_for_tree_code (code, vectype1, optab_default);
5090 if (optab_handler (optab, mode1) == CODE_FOR_nothing)
5091 reduce_with_shift = false;
5092 }
5093
5094 /* First reduce the vector to the desired size we should do the
5095 shift reduction on, by combining upper and lower halves. */
5096 new_temp = new_phi_result;
5097 while (sz > sz1)
5098 {
5099 gcc_assert (!slp_reduc);
5100 sz /= 2;
5101 vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz);
5102
5103 /* The target has to make sure we support lowpart/highpart
5104 extraction, either via direct vector extract or through
5105 integer mode punning. */
5106 tree dst1, dst2;
5107 if (convert_optab_handler (vec_extract_optab,
5108 TYPE_MODE (TREE_TYPE (new_temp)),
5109 TYPE_MODE (vectype1))
5110 != CODE_FOR_nothing)
5111 {
5112 /* Extract sub-vectors directly once vec_extract becomes
5113 a conversion optab. */
5114 dst1 = make_ssa_name (vectype1);
5115 epilog_stmt
5116 = gimple_build_assign (dst1, BIT_FIELD_REF,
5117 build3 (BIT_FIELD_REF, vectype1,
5118 new_temp, TYPE_SIZE (vectype1),
5119 bitsize_int (0)));
5120 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5121 dst2 = make_ssa_name (vectype1);
5122 epilog_stmt
5123 = gimple_build_assign (dst2, BIT_FIELD_REF,
5124 build3 (BIT_FIELD_REF, vectype1,
5125 new_temp, TYPE_SIZE (vectype1),
5126 bitsize_int (sz * BITS_PER_UNIT)));
5127 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5128 }
5129 else
5130 {
5131 /* Extract via punning to appropriately sized integer mode
5132 vector. */
5133 tree eltype = build_nonstandard_integer_type (sz * BITS_PER_UNIT,
5134 1);
5135 tree etype = build_vector_type (eltype, 2);
5136 gcc_assert (convert_optab_handler (vec_extract_optab,
5137 TYPE_MODE (etype),
5138 TYPE_MODE (eltype))
5139 != CODE_FOR_nothing);
5140 tree tem = make_ssa_name (etype);
5141 epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5142 build1 (VIEW_CONVERT_EXPR,
5143 etype, new_temp));
5144 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5145 new_temp = tem;
5146 tem = make_ssa_name (eltype);
5147 epilog_stmt
5148 = gimple_build_assign (tem, BIT_FIELD_REF,
5149 build3 (BIT_FIELD_REF, eltype,
5150 new_temp, TYPE_SIZE (eltype),
5151 bitsize_int (0)));
5152 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5153 dst1 = make_ssa_name (vectype1);
5154 epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5155 build1 (VIEW_CONVERT_EXPR,
5156 vectype1, tem));
5157 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5158 tem = make_ssa_name (eltype);
5159 epilog_stmt
5160 = gimple_build_assign (tem, BIT_FIELD_REF,
5161 build3 (BIT_FIELD_REF, eltype,
5162 new_temp, TYPE_SIZE (eltype),
5163 bitsize_int (sz * BITS_PER_UNIT)));
5164 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5165 dst2 = make_ssa_name (vectype1);
5166 epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5167 build1 (VIEW_CONVERT_EXPR,
5168 vectype1, tem));
5169 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5170 }
5171
5172 new_temp = make_ssa_name (vectype1);
5173 epilog_stmt = gimple_build_assign (new_temp, code, dst1, dst2);
5174 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5175 }
5176
5177 if (reduce_with_shift && !slp_reduc)
5178 {
5179 int element_bitsize = tree_to_uhwi (bitsize);
5180 /* Enforced by vectorizable_reduction, which disallows SLP reductions
5181 for variable-length vectors and also requires direct target support
5182 for loop reductions. */
5183 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5184 int nelements = vec_size_in_bits / element_bitsize;
5185 vec_perm_builder sel;
5186 vec_perm_indices indices;
5187
5188 int elt_offset;
5189
5190 tree zero_vec = build_zero_cst (vectype1);
5191 /* Case 2: Create:
5192 for (offset = nelements/2; offset >= 1; offset/=2)
5193 {
5194 Create: va' = vec_shift <va, offset>
5195 Create: va = vop <va, va'>
5196 } */
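/* Concrete sketch, assuming four 32-bit elements, PLUS_EXPR and the
   zero vector supplying the shifted-in lanes (the permute masks come
   from calc_vec_perm_mask_for_shift):

     va' = VEC_PERM_EXPR <va, {0,0,0,0}, {2,3,4,5}>;   // shift by 2
     va  = va + va';
     va' = VEC_PERM_EXPR <va, {0,0,0,0}, {1,2,3,4}>;   // shift by 1
     va  = va + va';

   after which element 0 of VA holds the full sum and is extracted in
   step 2.4 below.  */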
5197
5198 tree rhs;
5199
5200 if (dump_enabled_p ())
5201 dump_printf_loc (MSG_NOTE, vect_location,
5202 "Reduce using vector shifts\n");
5203
5204 mode1 = TYPE_MODE (vectype1);
5205 vec_dest = vect_create_destination_var (scalar_dest, vectype1);
5206 for (elt_offset = nelements / 2;
5207 elt_offset >= 1;
5208 elt_offset /= 2)
5209 {
5210 calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
5211 indices.new_vector (sel, 2, nelements);
5212 tree mask = vect_gen_perm_mask_any (vectype1, indices);
5213 epilog_stmt = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
5214 new_temp, zero_vec, mask);
5215 new_name = make_ssa_name (vec_dest, epilog_stmt);
5216 gimple_assign_set_lhs (epilog_stmt, new_name);
5217 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5218
5219 epilog_stmt = gimple_build_assign (vec_dest, code, new_name,
5220 new_temp);
5221 new_temp = make_ssa_name (vec_dest, epilog_stmt);
5222 gimple_assign_set_lhs (epilog_stmt, new_temp);
5223 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5224 }
5225
5226 /* 2.4 Extract the final scalar result. Create:
5227 s_out3 = extract_field <v_out2, bitpos> */
5228
5229 if (dump_enabled_p ())
5230 dump_printf_loc (MSG_NOTE, vect_location,
5231 "extract scalar result\n");
5232
5233 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
5234 bitsize, bitsize_zero_node);
5235 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5236 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5237 gimple_assign_set_lhs (epilog_stmt, new_temp);
5238 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5239 scalar_results.safe_push (new_temp);
5240 }
5241 else
5242 {
5243 /* Case 3: Create:
5244 s = extract_field <v_out2, 0>
5245 for (offset = element_size;
5246 offset < vector_size;
5247 offset += element_size;)
5248 {
5249 Create: s' = extract_field <v_out2, offset>
5250 Create: s = op <s, s'> // For non SLP cases
5251 } */
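/* Sketch for a single V4SI vector and PLUS_EXPR in the non-SLP case
   (offsets are in bits):

     s  = BIT_FIELD_REF <v_out2, 32, 0>;
     s' = BIT_FIELD_REF <v_out2, 32, 32>;  s = s + s';
     s' = BIT_FIELD_REF <v_out2, 32, 64>;  s = s + s';
     s' = BIT_FIELD_REF <v_out2, 32, 96>;  s = s + s';

   For SLP the combining assignments are omitted and each extracted s'
   is pushed onto SCALAR_RESULTS instead.  */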
5252
5253 if (dump_enabled_p ())
5254 dump_printf_loc (MSG_NOTE, vect_location,
5255 "Reduce using scalar code.\n");
5256
5257 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5258 int element_bitsize = tree_to_uhwi (bitsize);
5259 FOR_EACH_VEC_ELT (new_phis, i, new_phi)
5260 {
5261 int bit_offset;
5262 if (gimple_code (new_phi) == GIMPLE_PHI)
5263 vec_temp = PHI_RESULT (new_phi);
5264 else
5265 vec_temp = gimple_assign_lhs (new_phi);
5266 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
5267 bitsize_zero_node);
5268 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5269 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5270 gimple_assign_set_lhs (epilog_stmt, new_temp);
5271 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5272
5273 /* In SLP we don't need to apply reduction operation, so we just
5274 collect s' values in SCALAR_RESULTS. */
5275 if (slp_reduc)
5276 scalar_results.safe_push (new_temp);
5277
5278 for (bit_offset = element_bitsize;
5279 bit_offset < vec_size_in_bits;
5280 bit_offset += element_bitsize)
5281 {
5282 tree bitpos = bitsize_int (bit_offset);
5283 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
5284 bitsize, bitpos);
5285
5286 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5287 new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
5288 gimple_assign_set_lhs (epilog_stmt, new_name);
5289 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5290
5291 if (slp_reduc)
5292 {
5293 /* In SLP we don't need to apply reduction operation, so
5294 we just collect s' values in SCALAR_RESULTS. */
5295 new_temp = new_name;
5296 scalar_results.safe_push (new_name);
5297 }
5298 else
5299 {
5300 epilog_stmt = gimple_build_assign (new_scalar_dest, code,
5301 new_name, new_temp);
5302 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5303 gimple_assign_set_lhs (epilog_stmt, new_temp);
5304 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5305 }
5306 }
5307 }
5308
5309 /* The only case where we need to reduce scalar results in SLP is
5310 unrolling. If the size of SCALAR_RESULTS is greater than
5311 REDUC_GROUP_SIZE, we reduce them by combining elements modulo
5312 REDUC_GROUP_SIZE. */
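/* For instance (a sketch), with REDUC_GROUP_SIZE == 2 and four scalar
   results s0..s3 coming from an unrolled SLP instance, the loop below
   leaves

     scalar_results[0] = s0 op s2;
     scalar_results[1] = s1 op s3;

   i.e. the extra results are folded back modulo the group size.  */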
5313 if (slp_reduc)
5314 {
5315 tree res, first_res, new_res;
5316 gimple *new_stmt;
5317
5318 /* Reduce multiple scalar results in case of SLP unrolling. */
5319 for (j = group_size; scalar_results.iterate (j, &res);
5320 j++)
5321 {
5322 first_res = scalar_results[j % group_size];
5323 new_stmt = gimple_build_assign (new_scalar_dest, code,
5324 first_res, res);
5325 new_res = make_ssa_name (new_scalar_dest, new_stmt);
5326 gimple_assign_set_lhs (new_stmt, new_res);
5327 gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
5328 scalar_results[j % group_size] = new_res;
5329 }
5330 }
5331 else
5332 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
5333 scalar_results.safe_push (new_temp);
5334 }
5335
5336 if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5337 == INTEGER_INDUC_COND_REDUCTION)
5338 && !operand_equal_p (initial_def, induc_val, 0))
5339 {
5340 /* Earlier we set the initial value to be a vector of induc_val
5341 values. Check the result and if it is induc_val then replace
5342 it with the original initial value, unless induc_val is
5343 the same as initial_def already. */
5344 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5345 induc_val);
5346
5347 tree tmp = make_ssa_name (new_scalar_dest);
5348 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5349 initial_def, new_temp);
5350 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5351 scalar_results[0] = tmp;
5352 }
5353 }
5354
5355 vect_finalize_reduction:
5356
5357 if (double_reduc)
5358 loop = loop->inner;
5359
5360 /* 2.5 Adjust the final result by the initial value of the reduction
5361 variable. (When such adjustment is not needed, then
5362 'adjustment_def' is zero). For example, if code is PLUS we create:
5363 new_temp = loop_exit_def + adjustment_def */
5364
5365 if (adjustment_def)
5366 {
5367 gcc_assert (!slp_reduc);
5368 if (nested_in_vect_loop)
5369 {
5370 new_phi = new_phis[0];
5371 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
5372 expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
5373 new_dest = vect_create_destination_var (scalar_dest, vectype);
5374 }
5375 else
5376 {
5377 new_temp = scalar_results[0];
5378 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
5379 expr = build2 (code, scalar_type, new_temp, adjustment_def);
5380 new_dest = vect_create_destination_var (scalar_dest, scalar_type);
5381 }
5382
5383 epilog_stmt = gimple_build_assign (new_dest, expr);
5384 new_temp = make_ssa_name (new_dest, epilog_stmt);
5385 gimple_assign_set_lhs (epilog_stmt, new_temp);
5386 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5387 if (nested_in_vect_loop)
5388 {
5389 stmt_vec_info epilog_stmt_info = loop_vinfo->add_stmt (epilog_stmt);
5390 STMT_VINFO_RELATED_STMT (epilog_stmt_info)
5391 = STMT_VINFO_RELATED_STMT (loop_vinfo->lookup_stmt (new_phi));
5392
5393 if (!double_reduc)
5394 scalar_results.quick_push (new_temp);
5395 else
5396 scalar_results[0] = new_temp;
5397 }
5398 else
5399 scalar_results[0] = new_temp;
5400
5401 new_phis[0] = epilog_stmt;
5402 }
5403
5404 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
5405 phis with new adjusted scalar results, i.e., replace use <s_out0>
5406 with use <s_out4>.
5407
5408 Transform:
5409 loop_exit:
5410 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5411 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5412 v_out2 = reduce <v_out1>
5413 s_out3 = extract_field <v_out2, 0>
5414 s_out4 = adjust_result <s_out3>
5415 use <s_out0>
5416 use <s_out0>
5417
5418 into:
5419
5420 loop_exit:
5421 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5422 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5423 v_out2 = reduce <v_out1>
5424 s_out3 = extract_field <v_out2, 0>
5425 s_out4 = adjust_result <s_out3>
5426 use <s_out4>
5427 use <s_out4> */
5428
5429
5430 /* In an SLP reduction chain we reduce vector results into one vector if
5431 necessary, hence we set REDUC_GROUP_SIZE to 1 here. SCALAR_DEST is the
5432 LHS of the last stmt in the reduction chain, since we are looking for
5433 the loop exit phi node. */
5434 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
5435 {
5436 stmt_vec_info dest_stmt_info
5437 = vect_orig_stmt (SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1]);
5438 scalar_dest = gimple_assign_lhs (dest_stmt_info->stmt);
5439 group_size = 1;
5440 }
5441
5442 /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
5443 case REDUC_GROUP_SIZE is greater than the vectorization factor).
5444 Therefore, we need to match SCALAR_RESULTS with corresponding statements.
5445 The first (REDUC_GROUP_SIZE / number of new vector stmts) scalar results
5446 correspond to the first vector stmt, etc.
5447 (RATIO is equal to (REDUC_GROUP_SIZE / number of new vector stmts)). */
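/* E.g. (a sketch) with REDUC_GROUP_SIZE == 8 and two vector statements
   in NEW_PHIS, RATIO is 4: scalar results 0-3 are matched with the
   first vector statement and scalar results 4-7 with the second.  */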
5448 if (group_size > new_phis.length ())
5449 {
5450 ratio = group_size / new_phis.length ();
5451 gcc_assert (!(group_size % new_phis.length ()));
5452 }
5453 else
5454 ratio = 1;
5455
5456 stmt_vec_info epilog_stmt_info = NULL;
5457 for (k = 0; k < group_size; k++)
5458 {
5459 if (k % ratio == 0)
5460 {
5461 epilog_stmt_info = loop_vinfo->lookup_stmt (new_phis[k / ratio]);
5462 reduction_phi_info = reduction_phis[k / ratio];
5463 if (double_reduc)
5464 inner_phi = inner_phis[k / ratio];
5465 }
5466
5467 if (slp_reduc)
5468 {
5469 stmt_vec_info scalar_stmt_info = SLP_TREE_SCALAR_STMTS (slp_node)[k];
5470
5471 orig_stmt_info = STMT_VINFO_RELATED_STMT (scalar_stmt_info);
5472 /* SLP statements can't participate in patterns. */
5473 gcc_assert (!orig_stmt_info);
5474 scalar_dest = gimple_assign_lhs (scalar_stmt_info->stmt);
5475 }
5476
5477 phis.create (3);
5478 /* Find the loop-closed-use at the loop exit of the original scalar
5479 result. (The reduction result is expected to have two immediate uses -
5480 one at the latch block, and one at the loop exit). */
5481 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5482 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p)))
5483 && !is_gimple_debug (USE_STMT (use_p)))
5484 phis.safe_push (USE_STMT (use_p));
5485
5486 /* While we expect to have found an exit_phi because of loop-closed-ssa
5487 form, we can end up without one if the scalar cycle is dead. */
5488
5489 FOR_EACH_VEC_ELT (phis, i, exit_phi)
5490 {
5491 if (outer_loop)
5492 {
5493 stmt_vec_info exit_phi_vinfo
5494 = loop_vinfo->lookup_stmt (exit_phi);
5495 gphi *vect_phi;
5496
5497 if (double_reduc)
5498 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = inner_phi;
5499 else
5500 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt_info;
5501 if (!double_reduc
5502 || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
5503 != vect_double_reduction_def)
5504 continue;
5505
5506 /* Handle double reduction:
5507
5508 stmt1: s1 = phi <s0, s2> - double reduction phi (outer loop)
5509 stmt2: s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
5510 stmt3: s4 = use (s3) - (regular) reduc stmt (inner loop)
5511 stmt4: s2 = phi <s4> - double reduction stmt (outer loop)
5512
5513 At that point the regular reduction (stmt2 and stmt3) is
5514 already vectorized, as well as the exit phi node, stmt4.
5515 Here we vectorize the phi node of double reduction, stmt1, and
5516 update all relevant statements. */
5517
5518 /* Go through all the uses of s2 to find double reduction phi
5519 node, i.e., stmt1 above. */
5520 orig_name = PHI_RESULT (exit_phi);
5521 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5522 {
5523 stmt_vec_info use_stmt_vinfo;
5524 tree vect_phi_init, preheader_arg, vect_phi_res;
5525 basic_block bb = gimple_bb (use_stmt);
5526
5527 /* Check that USE_STMT is really a double reduction phi
5528 node. */
5529 if (gimple_code (use_stmt) != GIMPLE_PHI
5530 || gimple_phi_num_args (use_stmt) != 2
5531 || bb->loop_father != outer_loop)
5532 continue;
5533 use_stmt_vinfo = loop_vinfo->lookup_stmt (use_stmt);
5534 if (!use_stmt_vinfo
5535 || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
5536 != vect_double_reduction_def)
5537 continue;
5538
5539 /* Create vector phi node for double reduction:
5540 vs1 = phi <vs0, vs2>
5541 vs1 was created previously in this function by a call to
5542 vect_get_vec_def_for_operand and is stored in
5543 vec_initial_def;
5544 vs2 is defined by INNER_PHI, the vectorized EXIT_PHI;
5545 vs0 is created here. */
5546
5547 /* Create vector phi node. */
5548 vect_phi = create_phi_node (vec_initial_def, bb);
5549 loop_vec_info_for_loop (outer_loop)->add_stmt (vect_phi);
5550
5551 /* Create vs0 - initial def of the double reduction phi. */
5552 preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
5553 loop_preheader_edge (outer_loop));
5554 vect_phi_init = get_initial_def_for_reduction
5555 (stmt_info, preheader_arg, NULL);
5556
5557 /* Update phi node arguments with vs0 and vs2. */
5558 add_phi_arg (vect_phi, vect_phi_init,
5559 loop_preheader_edge (outer_loop),
5560 UNKNOWN_LOCATION);
5561 add_phi_arg (vect_phi, PHI_RESULT (inner_phi->stmt),
5562 loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
5563 if (dump_enabled_p ())
5564 dump_printf_loc (MSG_NOTE, vect_location,
5565 "created double reduction phi node: %G",
5566 vect_phi);
5567
5568 vect_phi_res = PHI_RESULT (vect_phi);
5569
5570 /* Replace the use, i.e., set the correct vs1 in the regular
5571 reduction phi node. FORNOW, NCOPIES is always 1, so the
5572 loop is redundant. */
5573 stmt_vec_info use_info = reduction_phi_info;
5574 for (j = 0; j < ncopies; j++)
5575 {
5576 edge pr_edge = loop_preheader_edge (loop);
5577 SET_PHI_ARG_DEF (as_a <gphi *> (use_info->stmt),
5578 pr_edge->dest_idx, vect_phi_res);
5579 use_info = STMT_VINFO_RELATED_STMT (use_info);
5580 }
5581 }
5582 }
5583 }
5584
5585 phis.release ();
5586 if (nested_in_vect_loop)
5587 {
5588 if (double_reduc)
5589 loop = outer_loop;
5590 else
5591 continue;
5592 }
5593
5594 phis.create (3);
5595 /* Find the loop-closed-use at the loop exit of the original scalar
5596 result. (The reduction result is expected to have two immediate uses,
5597 one at the latch block, and one at the loop exit). For double
5598 reductions we are looking for exit phis of the outer loop. */
5599 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5600 {
5601 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
5602 {
5603 if (!is_gimple_debug (USE_STMT (use_p)))
5604 phis.safe_push (USE_STMT (use_p));
5605 }
5606 else
5607 {
5608 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
5609 {
5610 tree phi_res = PHI_RESULT (USE_STMT (use_p));
5611
5612 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
5613 {
5614 if (!flow_bb_inside_loop_p (loop,
5615 gimple_bb (USE_STMT (phi_use_p)))
5616 && !is_gimple_debug (USE_STMT (phi_use_p)))
5617 phis.safe_push (USE_STMT (phi_use_p));
5618 }
5619 }
5620 }
5621 }
5622
5623 FOR_EACH_VEC_ELT (phis, i, exit_phi)
5624 {
5625 /* Replace the uses: */
5626 orig_name = PHI_RESULT (exit_phi);
5627 scalar_result = scalar_results[k];
5628 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5629 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
5630 SET_USE (use_p, scalar_result);
5631 }
5632
5633 phis.release ();
5634 }
5635 }
5636
5637 /* Return a vector of type VECTYPE that is equal to the vector select
5638 operation "MASK ? VEC : IDENTITY". Insert the select statements
5639 before GSI. */
5640
5641 static tree
5642 merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
5643 tree vec, tree identity)
5644 {
5645 tree cond = make_temp_ssa_name (vectype, NULL, "cond");
5646 gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
5647 mask, vec, identity);
5648 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5649 return cond;
5650 }
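/* For example (illustrative only), given a loop mask M, a V4SF input V
   and a zero identity, a single statement is emitted:

     cond_1 = VEC_COND_EXPR <M, V, { 0.0, 0.0, 0.0, 0.0 }>;

   so that masked-off lanes contribute the identity value to an
   in-order reduction.  */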
5651
5652 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
5653 order, starting with LHS. Insert the extraction statements before GSI and
5654 associate the new scalar SSA names with variable SCALAR_DEST.
5655 Return the SSA name for the result. */
5656
5657 static tree
5658 vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
5659 tree_code code, tree lhs, tree vector_rhs)
5660 {
5661 tree vectype = TREE_TYPE (vector_rhs);
5662 tree scalar_type = TREE_TYPE (vectype);
5663 tree bitsize = TYPE_SIZE (scalar_type);
5664 unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
5665 unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
5666
5667 for (unsigned HOST_WIDE_INT bit_offset = 0;
5668 bit_offset < vec_size_in_bits;
5669 bit_offset += element_bitsize)
5670 {
5671 tree bitpos = bitsize_int (bit_offset);
5672 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
5673 bitsize, bitpos);
5674
5675 gassign *stmt = gimple_build_assign (scalar_dest, rhs);
5676 rhs = make_ssa_name (scalar_dest, stmt);
5677 gimple_assign_set_lhs (stmt, rhs);
5678 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
5679
5680 stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
5681 tree new_name = make_ssa_name (scalar_dest, stmt);
5682 gimple_assign_set_lhs (stmt, new_name);
5683 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
5684 lhs = new_name;
5685 }
5686 return lhs;
5687 }
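/* As an illustration (a sketch assuming a vector of four ints,
   CODE == PLUS_EXPR and incoming LHS l0), the function emits

     e0 = BIT_FIELD_REF <v, 32, 0>;    l1 = l0 + e0;
     e1 = BIT_FIELD_REF <v, 32, 32>;   l2 = l1 + e1;
     e2 = BIT_FIELD_REF <v, 32, 64>;   l3 = l2 + e2;
     e3 = BIT_FIELD_REF <v, 32, 96>;   l4 = l3 + e3;

   and returns l4, preserving the strict left-to-right evaluation
   order required for in-order reductions.  */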
5688
5689 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION). STMT_INFO is the
5690 statement that sets the live-out value. REDUC_DEF_STMT is the phi
5691 statement. CODE is the operation performed by STMT_INFO and OPS are
5692 its scalar operands. REDUC_INDEX is the index of the operand in
5693 OPS that is set by REDUC_DEF_STMT. REDUC_FN is the function that
5694 implements in-order reduction, or IFN_LAST if we should open-code it.
5695 VECTYPE_IN is the type of the vector input. MASKS specifies the masks
5696 that should be used to control the operation in a fully-masked loop. */
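/* A sketch of the fully-masked in-order case with
   REDUC_FN == IFN_FOLD_LEFT_PLUS (names illustrative):

     mask_1 = <loop mask obtained via vect_get_loop_mask>;
     vec_1  = VEC_COND_EXPR <mask_1, vec_def, { 0.0, ... }>;
     sum_2  = .FOLD_LEFT_PLUS (sum_1, vec_1);

   When REDUC_FN is IFN_LAST the call is replaced by the scalar
   sequence built by vect_expand_fold_left above; for MINUS_EXPR with
   a direct REDUC_FN the vector operand is negated first.  */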
5697
5698 static bool
5699 vectorize_fold_left_reduction (stmt_vec_info stmt_info,
5700 gimple_stmt_iterator *gsi,
5701 stmt_vec_info *vec_stmt, slp_tree slp_node,
5702 gimple *reduc_def_stmt,
5703 tree_code code, internal_fn reduc_fn,
5704 tree ops[3], tree vectype_in,
5705 int reduc_index, vec_loop_masks *masks)
5706 {
5707 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5708 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5709 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
5710 stmt_vec_info new_stmt_info = NULL;
5711
5712 int ncopies;
5713 if (slp_node)
5714 ncopies = 1;
5715 else
5716 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
5717
5718 gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
5719 gcc_assert (ncopies == 1);
5720 gcc_assert (TREE_CODE_LENGTH (code) == binary_op);
5721 gcc_assert (reduc_index == (code == MINUS_EXPR ? 0 : 1));
5722 gcc_assert (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5723 == FOLD_LEFT_REDUCTION);
5724
5725 if (slp_node)
5726 gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
5727 TYPE_VECTOR_SUBPARTS (vectype_in)));
5728
5729 tree op0 = ops[1 - reduc_index];
5730
5731 int group_size = 1;
5732 stmt_vec_info scalar_dest_def_info;
5733 auto_vec<tree> vec_oprnds0;
5734 if (slp_node)
5735 {
5736 auto_vec<vec<tree> > vec_defs (2);
5737 auto_vec<tree> sops(2);
5738 sops.quick_push (ops[0]);
5739 sops.quick_push (ops[1]);
5740 vect_get_slp_defs (sops, slp_node, &vec_defs);
5741 vec_oprnds0.safe_splice (vec_defs[1 - reduc_index]);
5742 vec_defs[0].release ();
5743 vec_defs[1].release ();
5744 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
5745 scalar_dest_def_info = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
5746 }
5747 else
5748 {
5749 tree loop_vec_def0 = vect_get_vec_def_for_operand (op0, stmt_info);
5750 vec_oprnds0.create (1);
5751 vec_oprnds0.quick_push (loop_vec_def0);
5752 scalar_dest_def_info = stmt_info;
5753 }
5754
5755 tree scalar_dest = gimple_assign_lhs (scalar_dest_def_info->stmt);
5756 tree scalar_type = TREE_TYPE (scalar_dest);
5757 tree reduc_var = gimple_phi_result (reduc_def_stmt);
5758
5759 int vec_num = vec_oprnds0.length ();
5760 gcc_assert (vec_num == 1 || slp_node);
5761 tree vec_elem_type = TREE_TYPE (vectype_out);
5762 gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
5763
5764 tree vector_identity = NULL_TREE;
5765 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
5766 vector_identity = build_zero_cst (vectype_out);
5767
5768 tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
5769 int i;
5770 tree def0;
5771 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
5772 {
5773 gimple *new_stmt;
5774 tree mask = NULL_TREE;
5775 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
5776 mask = vect_get_loop_mask (gsi, masks, vec_num, vectype_in, i);
5777
5778 /* Handle MINUS by adding the negative. */
5779 if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
5780 {
5781 tree negated = make_ssa_name (vectype_out);
5782 new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
5783 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5784 def0 = negated;
5785 }
5786
5787 if (mask)
5788 def0 = merge_with_identity (gsi, mask, vectype_out, def0,
5789 vector_identity);
5790
5791 /* On the first iteration the input is simply the scalar phi
5792 result, and for subsequent iterations it is the output of
5793 the preceding operation. */
5794 if (reduc_fn != IFN_LAST)
5795 {
5796 new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var, def0);
5797 /* For chained SLP reductions the output of the previous reduction
5798 operation serves as the input of the next. For the final statement
5799 the output cannot be a temporary - we reuse the original
5800 scalar destination of the last statement. */
5801 if (i != vec_num - 1)
5802 {
5803 gimple_set_lhs (new_stmt, scalar_dest_var);
5804 reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
5805 gimple_set_lhs (new_stmt, reduc_var);
5806 }
5807 }
5808 else
5809 {
5810 reduc_var = vect_expand_fold_left (gsi, scalar_dest_var, code,
5811 reduc_var, def0);
5812 new_stmt = SSA_NAME_DEF_STMT (reduc_var);
5813 /* Remove the statement, so that we can use the same code paths
5814 as for statements that we've just created. */
5815 gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
5816 gsi_remove (&tmp_gsi, true);
5817 }
5818
5819 if (i == vec_num - 1)
5820 {
5821 gimple_set_lhs (new_stmt, scalar_dest);
5822 new_stmt_info = vect_finish_replace_stmt (scalar_dest_def_info,
5823 new_stmt);
5824 }
5825 else
5826 new_stmt_info = vect_finish_stmt_generation (scalar_dest_def_info,
5827 new_stmt, gsi);
5828
5829 if (slp_node)
5830 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt_info);
5831 }
5832
5833 if (!slp_node)
5834 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt_info;
5835
5836 return true;
5837 }
5838
5839 /* Function is_nonwrapping_integer_induction.
5840
5841 Check if STMT_VINFO (which is part of loop LOOP) both increments and
5842 does not cause overflow. */
5843
5844 static bool
5845 is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, struct loop *loop)
5846 {
5847 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
5848 tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
5849 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
5850 tree lhs_type = TREE_TYPE (gimple_phi_result (phi));
5851 widest_int ni, max_loop_value, lhs_max;
5852 wi::overflow_type overflow = wi::OVF_NONE;
5853
5854 /* Make sure the loop is integer based. */
5855 if (TREE_CODE (base) != INTEGER_CST
5856 || TREE_CODE (step) != INTEGER_CST)
5857 return false;
5858
5859 /* Check that the max size of the loop will not wrap. */
5860
5861 if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
5862 return true;
5863
5864 if (! max_stmt_executions (loop, &ni))
5865 return false;
5866
5867 max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
5868 &overflow);
5869 if (overflow)
5870 return false;
5871
5872 max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
5873 TYPE_SIGN (lhs_type), &overflow);
5874 if (overflow)
5875 return false;
5876
5877 return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
5878 <= TYPE_PRECISION (lhs_type));
5879 }
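/* Worked example (a sketch): for a 32-bit unsigned IV with base 0,
   step 4 and at most 1000 iterations, max_loop_value is
   0 + 4 * 1000 = 4000, which needs only 12 bits of precision, well
   within the 32 bits of the type, so the induction cannot wrap.  */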
5880
5881 /* Function vectorizable_reduction.
5882
5883 Check if STMT_INFO performs a reduction operation that can be vectorized.
5884 If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
5885 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
5886 Return true if STMT_INFO is vectorizable in this way.
5887
5888 This function also handles reduction idioms (patterns) that have been
5889 recognized in advance during vect_pattern_recog. In this case, STMT_INFO
5890 may be of this form:
5891 X = pattern_expr (arg0, arg1, ..., X)
5892 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
5893 sequence that had been detected and replaced by the pattern-stmt
5894 (STMT_INFO).
5895
5896 This function also handles reduction of condition expressions, for example:
5897 for (int i = 0; i < N; i++)
5898 if (a[i] < value)
5899 last = a[i];
5900 This is handled by vectorising the loop and creating an additional vector
5901 containing the loop indexes for which "a[i] < value" was true. In the
5902 function epilogue this is reduced to a single max value and then used to
5903 index into the vector of results.
5904
5905 In some cases of reduction patterns, the type of the reduction variable X is
5906 different than the type of the other arguments of STMT_INFO.
5907 In such cases, the vectype that is used when transforming STMT_INFO into
5908 a vector stmt is different than the vectype that is used to determine the
5909 vectorization factor, because it consists of a different number of elements
5910 than the actual number of elements that are being operated upon in parallel.
5911
5912 For example, consider an accumulation of shorts into an int accumulator.
5913 On some targets it's possible to vectorize this pattern operating on 8
5914 shorts at a time (hence, the vectype for purposes of determining the
5915 vectorization factor should be V8HI); on the other hand, the vectype that
5916 is used to create the vector form is actually V4SI (the type of the result).
5917
5918 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
5919 indicates what is the actual level of parallelism (V8HI in the example), so
5920 that the right vectorization factor would be derived. This vectype
5921 corresponds to the type of arguments to the reduction stmt, and should *NOT*
5922 be used to create the vectorized stmt. The right vectype for the vectorized
5923 stmt is obtained from the type of the result X:
5924 get_vectype_for_scalar_type (TREE_TYPE (X))
5925
5926 This means that, contrary to "regular" reductions (or "regular" stmts in
5927 general), the following equation:
5928 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
5929 does *NOT* necessarily hold for reduction patterns. */
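/* For reference, a scalar source loop matching the example above
   (a sketch, not taken from any particular testcase):

     short b[N];
     int sum = 0;
     for (int i = 0; i < N; i++)
       sum += b[i];    // recognized as a widen_sum pattern

   Here STMT_VINFO_VECTYPE is V8HI (it drives the vectorization
   factor), while the vectorized statement itself produces a V4SI
   accumulator derived from TREE_TYPE (sum).  */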
5930
5931 bool
5932 vectorizable_reduction (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
5933 stmt_vec_info *vec_stmt, slp_tree slp_node,
5934 slp_instance slp_node_instance,
5935 stmt_vector_for_cost *cost_vec)
5936 {
5937 tree vec_dest;
5938 tree scalar_dest;
5939 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
5940 tree vectype_in = NULL_TREE;
5941 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5942 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5943 enum tree_code code, orig_code;
5944 internal_fn reduc_fn;
5945 machine_mode vec_mode;
5946 int op_type;
5947 optab optab;
5948 tree new_temp = NULL_TREE;
5949 enum vect_def_type dt, cond_reduc_dt = vect_unknown_def_type;
5950 stmt_vec_info cond_stmt_vinfo = NULL;
5951 enum tree_code cond_reduc_op_code = ERROR_MARK;
5952 tree scalar_type;
5953 bool is_simple_use;
5954 int i;
5955 int ncopies;
5956 int epilog_copies;
5957 stmt_vec_info prev_stmt_info, prev_phi_info;
5958 bool single_defuse_cycle = false;
5959 stmt_vec_info new_stmt_info = NULL;
5960 int j;
5961 tree ops[3];
5962 enum vect_def_type dts[3];
5963 bool nested_cycle = false, found_nested_cycle_def = false;
5964 bool double_reduc = false;
5965 basic_block def_bb;
5966 struct loop * def_stmt_loop;
5967 tree def_arg;
5968 auto_vec<tree> vec_oprnds0;
5969 auto_vec<tree> vec_oprnds1;
5970 auto_vec<tree> vec_oprnds2;
5971 auto_vec<tree> vect_defs;
5972 auto_vec<stmt_vec_info> phis;
5973 int vec_num;
5974 tree def0, tem;
5975 tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
5976 tree cond_reduc_val = NULL_TREE;
5977
5978 /* Make sure it was already recognized as a reduction computation. */
5979 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
5980 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
5981 return false;
5982
5983 if (nested_in_vect_loop_p (loop, stmt_info))
5984 {
5985 loop = loop->inner;
5986 nested_cycle = true;
5987 }
5988
5989 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
5990 gcc_assert (slp_node
5991 && REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info);
5992
5993 if (gphi *phi = dyn_cast <gphi *> (stmt_info->stmt))
5994 {
5995 tree phi_result = gimple_phi_result (phi);
5996 /* Analysis is fully done on the reduction stmt invocation. */
5997 if (! vec_stmt)
5998 {
5999 if (slp_node)
6000 slp_node_instance->reduc_phis = slp_node;
6001
6002 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6003 return true;
6004 }
6005
6006 if (STMT_VINFO_REDUC_TYPE (stmt_info) == FOLD_LEFT_REDUCTION)
6007 /* Leave the scalar phi in place. Note that checking
6008 STMT_VINFO_VEC_REDUCTION_TYPE (as below) only works
6009 for reductions involving a single statement. */
6010 return true;
6011
6012 stmt_vec_info reduc_stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
6013 reduc_stmt_info = vect_stmt_to_vectorize (reduc_stmt_info);
6014
6015 if (STMT_VINFO_VEC_REDUCTION_TYPE (reduc_stmt_info)
6016 == EXTRACT_LAST_REDUCTION)
6017 /* Leave the scalar phi in place. */
6018 return true;
6019
6020 gassign *reduc_stmt = as_a <gassign *> (reduc_stmt_info->stmt);
6021 code = gimple_assign_rhs_code (reduc_stmt);
6022 for (unsigned k = 1; k < gimple_num_ops (reduc_stmt); ++k)
6023 {
6024 tree op = gimple_op (reduc_stmt, k);
6025 if (op == phi_result)
6026 continue;
6027 if (k == 1 && code == COND_EXPR)
6028 continue;
6029 bool is_simple_use = vect_is_simple_use (op, loop_vinfo, &dt);
6030 gcc_assert (is_simple_use);
6031 if (dt == vect_constant_def || dt == vect_external_def)
6032 continue;
6033 if (!vectype_in
6034 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6035 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (op)))))
6036 vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op));
6037 break;
6038 }
6039 /* For a nested cycle we might end up with an operation like
6040 phi_result * phi_result. */
6041 if (!vectype_in)
6042 vectype_in = STMT_VINFO_VECTYPE (stmt_info);
6043 gcc_assert (vectype_in);
6044
6045 if (slp_node)
6046 ncopies = 1;
6047 else
6048 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6049
6050 stmt_vec_info use_stmt_info;
6051 if (ncopies > 1
6052 && STMT_VINFO_RELEVANT (reduc_stmt_info) <= vect_used_only_live
6053 && (use_stmt_info = loop_vinfo->lookup_single_use (phi_result))
6054 && vect_stmt_to_vectorize (use_stmt_info) == reduc_stmt_info)
6055 single_defuse_cycle = true;
6056
6057 /* Create the destination vector */
6058 scalar_dest = gimple_assign_lhs (reduc_stmt);
6059 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
6060
6061 if (slp_node)
6062 /* The size vect_schedule_slp_instance computes is off for us. */
6063 vec_num = vect_get_num_vectors
6064 (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6065 * SLP_TREE_SCALAR_STMTS (slp_node).length (),
6066 vectype_in);
6067 else
6068 vec_num = 1;
6069
6070 /* Generate the reduction PHIs upfront. */
6071 prev_phi_info = NULL;
6072 for (j = 0; j < ncopies; j++)
6073 {
6074 if (j == 0 || !single_defuse_cycle)
6075 {
6076 for (i = 0; i < vec_num; i++)
6077 {
6078 /* Create the reduction-phi that defines the reduction
6079 operand. */
6080 gimple *new_phi = create_phi_node (vec_dest, loop->header);
6081 stmt_vec_info new_phi_info = loop_vinfo->add_stmt (new_phi);
6082
6083 if (slp_node)
6084 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi_info);
6085 else
6086 {
6087 if (j == 0)
6088 STMT_VINFO_VEC_STMT (stmt_info)
6089 = *vec_stmt = new_phi_info;
6090 else
6091 STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi_info;
6092 prev_phi_info = new_phi_info;
6093 }
6094 }
6095 }
6096 }
6097
6098 return true;
6099 }
6100
6101 /* 1. Is vectorizable reduction? */
6102 /* Not supportable if the reduction variable is used in the loop, unless
6103 it's a reduction chain. */
6104 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
6105 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6106 return false;
6107
6108 /* Reductions that are not used even in an enclosing outer-loop
6109 are expected to be "live" (used out of the loop). */
6110 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
6111 && !STMT_VINFO_LIVE_P (stmt_info))
6112 return false;
6113
6114 /* 2. Has this been recognized as a reduction pattern?
6115
6116 Check if STMT represents a pattern that has been recognized
6117 in earlier analysis stages. For stmts that represent a pattern,
6118 the STMT_VINFO_RELATED_STMT field records the last stmt in
6119 the original sequence that constitutes the pattern. */
6120
6121 stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
6122 if (orig_stmt_info)
6123 {
6124 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
6125 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
6126 }
6127
6128 /* 3. Check the operands of the operation. The first operands are defined
6129 inside the loop body. The last operand is the reduction variable,
6130 which is defined by the loop-header-phi. */
6131
6132 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
6133
6134 /* Flatten RHS. */
6135 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
6136 {
6137 case GIMPLE_BINARY_RHS:
6138 code = gimple_assign_rhs_code (stmt);
6139 op_type = TREE_CODE_LENGTH (code);
6140 gcc_assert (op_type == binary_op);
6141 ops[0] = gimple_assign_rhs1 (stmt);
6142 ops[1] = gimple_assign_rhs2 (stmt);
6143 break;
6144
6145 case GIMPLE_TERNARY_RHS:
6146 code = gimple_assign_rhs_code (stmt);
6147 op_type = TREE_CODE_LENGTH (code);
6148 gcc_assert (op_type == ternary_op);
6149 ops[0] = gimple_assign_rhs1 (stmt);
6150 ops[1] = gimple_assign_rhs2 (stmt);
6151 ops[2] = gimple_assign_rhs3 (stmt);
6152 break;
6153
6154 case GIMPLE_UNARY_RHS:
6155 return false;
6156
6157 default:
6158 gcc_unreachable ();
6159 }
6160
6161 if (code == COND_EXPR && slp_node)
6162 return false;
6163
6164 scalar_dest = gimple_assign_lhs (stmt);
6165 scalar_type = TREE_TYPE (scalar_dest);
6166 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
6167 && !SCALAR_FLOAT_TYPE_P (scalar_type))
6168 return false;
6169
6170 /* Do not try to vectorize bit-precision reductions. */
6171 if (!type_has_mode_precision_p (scalar_type))
6172 return false;
6173
6174 /* All uses but the last are expected to be defined in the loop.
6175 The last use is the reduction variable. In case of a nested cycle this
6176 assumption is not true: we use reduc_index to record the index of the
6177 reduction variable. */
6178 stmt_vec_info reduc_def_info;
6179 if (orig_stmt_info)
6180 reduc_def_info = STMT_VINFO_REDUC_DEF (orig_stmt_info);
6181 else
6182 reduc_def_info = STMT_VINFO_REDUC_DEF (stmt_info);
6183 gcc_assert (reduc_def_info);
6184 gphi *reduc_def_phi = as_a <gphi *> (reduc_def_info->stmt);
6185 tree reduc_def = PHI_RESULT (reduc_def_phi);
6186 int reduc_index = -1;
6187 for (i = 0; i < op_type; i++)
6188 {
6189 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
6190 if (i == 0 && code == COND_EXPR)
6191 continue;
6192
6193 stmt_vec_info def_stmt_info;
6194 is_simple_use = vect_is_simple_use (ops[i], loop_vinfo, &dts[i], &tem,
6195 &def_stmt_info);
6196 dt = dts[i];
6197 gcc_assert (is_simple_use);
6198 if (dt == vect_reduction_def
6199 && ops[i] == reduc_def)
6200 {
6201 reduc_index = i;
6202 continue;
6203 }
6204 else if (tem)
6205 {
6206 /* To properly compute ncopies we are interested in the widest
6207 input type in case we're looking at a widening accumulation. */
6208 if (!vectype_in
6209 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6210 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (tem)))))
6211 vectype_in = tem;
6212 }
6213
6214 if (dt != vect_internal_def
6215 && dt != vect_external_def
6216 && dt != vect_constant_def
6217 && dt != vect_induction_def
6218 && !(dt == vect_nested_cycle && nested_cycle))
6219 return false;
6220
6221 if (dt == vect_nested_cycle
6222 && ops[i] == reduc_def)
6223 {
6224 found_nested_cycle_def = true;
6225 reduc_index = i;
6226 }
6227
6228 if (i == 1 && code == COND_EXPR)
6229 {
6230 /* Record how value of COND_EXPR is defined. */
6231 if (dt == vect_constant_def)
6232 {
6233 cond_reduc_dt = dt;
6234 cond_reduc_val = ops[i];
6235 }
6236 if (dt == vect_induction_def
6237 && def_stmt_info
6238 && is_nonwrapping_integer_induction (def_stmt_info, loop))
6239 {
6240 cond_reduc_dt = dt;
6241 cond_stmt_vinfo = def_stmt_info;
6242 }
6243 }
6244 }
6245
6246 if (!vectype_in)
6247 vectype_in = vectype_out;
6248
6249 /* When vectorizing a reduction chain w/o SLP the reduction PHI is not
6250 directly used in stmt. */
6251 if (reduc_index == -1)
6252 {
6253 if (STMT_VINFO_REDUC_TYPE (stmt_info) == FOLD_LEFT_REDUCTION)
6254 {
6255 if (dump_enabled_p ())
6256 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6257 "in-order reduction chain without SLP.\n");
6258 return false;
6259 }
6260 }
6261
6262 if (!(reduc_index == -1
6263 || dts[reduc_index] == vect_reduction_def
6264 || dts[reduc_index] == vect_nested_cycle
6265 || ((dts[reduc_index] == vect_internal_def
6266 || dts[reduc_index] == vect_external_def
6267 || dts[reduc_index] == vect_constant_def
6268 || dts[reduc_index] == vect_induction_def)
6269 && nested_cycle && found_nested_cycle_def)))
6270 {
6271 /* For pattern recognized stmts, orig_stmt might be a reduction,
6272 but some helper statements for the pattern might not, or
6273 might be COND_EXPRs with reduction uses in the condition. */
6274 gcc_assert (orig_stmt_info);
6275 return false;
6276 }
6277
6278 /* PHIs should not participate in patterns. */
6279 gcc_assert (!STMT_VINFO_RELATED_STMT (reduc_def_info));
6280 enum vect_reduction_type v_reduc_type
6281 = STMT_VINFO_REDUC_TYPE (reduc_def_info);
6282 stmt_vec_info tmp = STMT_VINFO_REDUC_DEF (reduc_def_info);
6283
6284 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = v_reduc_type;
6285 /* If we have a condition reduction, see if we can simplify it further. */
6286 if (v_reduc_type == COND_REDUCTION)
6287 {
6288 /* TODO: We can't yet handle reduction chains, since we need to treat
6289 each COND_EXPR in the chain specially, not just the last one.
6290 E.g. for:
6291
6292 x_1 = PHI <x_3, ...>
6293 x_2 = a_2 ? ... : x_1;
6294 x_3 = a_3 ? ... : x_2;
6295
6296 we're interested in the last element in x_3 for which a_2 || a_3
6297 is true, whereas the current reduction chain handling would
6298 vectorize x_2 as a normal VEC_COND_EXPR and only treat x_3
6299 as a reduction operation. */
6300 if (reduc_index == -1)
6301 {
6302 if (dump_enabled_p ())
6303 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6304 "conditional reduction chains not supported\n");
6305 return false;
6306 }
6307
6308 /* vect_is_simple_reduction ensured that operand 2 is the
6309 loop-carried operand. */
6310 gcc_assert (reduc_index == 2);
6311
6312 /* Loop peeling modifies the initial value of the reduction PHI, which
6313 makes the reduction stmt being transformed different from the
6314 original stmt analyzed. We need to record the reduction code for a
6315 CONST_COND_REDUCTION type reduction at the analysis stage, so that
6316 it can be used directly at the transform stage. */
6317 if (STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MAX_EXPR
6318 || STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MIN_EXPR)
6319 {
6320 /* Also set the reduction type to CONST_COND_REDUCTION. */
6321 gcc_assert (cond_reduc_dt == vect_constant_def);
6322 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = CONST_COND_REDUCTION;
6323 }
6324 else if (direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST,
6325 vectype_in, OPTIMIZE_FOR_SPEED))
6326 {
6327 if (dump_enabled_p ())
6328 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6329 "optimizing condition reduction with"
6330 " FOLD_EXTRACT_LAST.\n");
6331 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = EXTRACT_LAST_REDUCTION;
6332 }
6333 else if (cond_reduc_dt == vect_induction_def)
6334 {
6335 tree base
6336 = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
6337 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
6338
6339 gcc_assert (TREE_CODE (base) == INTEGER_CST
6340 && TREE_CODE (step) == INTEGER_CST);
6341 cond_reduc_val = NULL_TREE;
6342 /* Find a suitable value: for MAX_EXPR one below base, for MIN_EXPR
6343 one above base; for now punt if base is the minimum value of the
6344 type for MAX_EXPR or the maximum value of the type for MIN_EXPR. */
6345 if (tree_int_cst_sgn (step) == -1)
6346 {
6347 cond_reduc_op_code = MIN_EXPR;
6348 if (tree_int_cst_sgn (base) == -1)
6349 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6350 else if (tree_int_cst_lt (base,
6351 TYPE_MAX_VALUE (TREE_TYPE (base))))
6352 cond_reduc_val
6353 = int_const_binop (PLUS_EXPR, base, integer_one_node);
6354 }
6355 else
6356 {
6357 cond_reduc_op_code = MAX_EXPR;
6358 if (tree_int_cst_sgn (base) == 1)
6359 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6360 else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
6361 base))
6362 cond_reduc_val
6363 = int_const_binop (MINUS_EXPR, base, integer_one_node);
6364 }
6365 if (cond_reduc_val)
6366 {
6367 if (dump_enabled_p ())
6368 dump_printf_loc (MSG_NOTE, vect_location,
6369 "condition expression based on "
6370 "integer induction.\n");
6371 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6372 = INTEGER_INDUC_COND_REDUCTION;
6373 }
6374 }
6375 else if (cond_reduc_dt == vect_constant_def)
6376 {
6377 enum vect_def_type cond_initial_dt;
6378 gimple *def_stmt = SSA_NAME_DEF_STMT (ops[reduc_index]);
6379 tree cond_initial_val
6380 = PHI_ARG_DEF_FROM_EDGE (def_stmt, loop_preheader_edge (loop));
6381
6382 gcc_assert (cond_reduc_val != NULL_TREE);
6383 vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt);
6384 if (cond_initial_dt == vect_constant_def
6385 && types_compatible_p (TREE_TYPE (cond_initial_val),
6386 TREE_TYPE (cond_reduc_val)))
6387 {
6388 tree e = fold_binary (LE_EXPR, boolean_type_node,
6389 cond_initial_val, cond_reduc_val);
6390 if (e && (integer_onep (e) || integer_zerop (e)))
6391 {
6392 if (dump_enabled_p ())
6393 dump_printf_loc (MSG_NOTE, vect_location,
6394 "condition expression based on "
6395 "compile time constant.\n");
6396 /* Record reduction code at analysis stage. */
6397 STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info)
6398 = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
6399 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6400 = CONST_COND_REDUCTION;
6401 }
6402 }
6403 }
6404 }
6405
6406 if (orig_stmt_info)
6407 gcc_assert (tmp == orig_stmt_info
6408 || REDUC_GROUP_FIRST_ELEMENT (tmp) == orig_stmt_info);
6409 else
6410 /* We changed STMT to be the first stmt in the reduction chain, hence we
6411 check that in this case the first element in the chain is STMT. */
6412 gcc_assert (tmp == stmt_info
6413 || REDUC_GROUP_FIRST_ELEMENT (tmp) == stmt_info);
6414
6415 if (STMT_VINFO_LIVE_P (reduc_def_info))
6416 return false;
6417
6418 if (slp_node)
6419 ncopies = 1;
6420 else
6421 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6422
6423 gcc_assert (ncopies >= 1);
6424
6425 vec_mode = TYPE_MODE (vectype_in);
6426 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
6427
6428 if (nested_cycle)
6429 {
6430 def_bb = gimple_bb (reduc_def_phi);
6431 def_stmt_loop = def_bb->loop_father;
6432 def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi,
6433 loop_preheader_edge (def_stmt_loop));
6434 stmt_vec_info def_arg_stmt_info = loop_vinfo->lookup_def (def_arg);
6435 if (def_arg_stmt_info
6436 && (STMT_VINFO_DEF_TYPE (def_arg_stmt_info)
6437 == vect_double_reduction_def))
6438 double_reduc = true;
6439 }
6440
6441 vect_reduction_type reduction_type
6442 = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info);
6443 if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
6444 && ncopies > 1)
6445 {
6446 if (dump_enabled_p ())
6447 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6448 "multiple types in double reduction or condition "
6449 "reduction.\n");
6450 return false;
6451 }
6452
6453 if (code == COND_EXPR)
6454 {
6455 /* Only call during the analysis stage, otherwise we'll lose
6456 STMT_VINFO_TYPE. */
6457 if (!vec_stmt && !vectorizable_condition (stmt_info, gsi, NULL,
6458 true, NULL, cost_vec))
6459 {
6460 if (dump_enabled_p ())
6461 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6462 "unsupported condition in reduction\n");
6463 return false;
6464 }
6465 }
6466 else if (code == LSHIFT_EXPR || code == RSHIFT_EXPR
6467 || code == LROTATE_EXPR || code == RROTATE_EXPR)
6468 {
6469 /* Only call during the analysis stage, otherwise we'll lose
6470 STMT_VINFO_TYPE. We only support this for nested cycles
6471 without double reductions at the moment. */
6472 if (!nested_cycle
6473 || double_reduc
6474 || (!vec_stmt && !vectorizable_shift (stmt_info, gsi, NULL,
6475 NULL, cost_vec)))
6476 {
6477 if (dump_enabled_p ())
6478 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6479 "unsupported shift or rotation in reduction\n");
6480 return false;
6481 }
6482 }
6483 else
6484 {
6485 /* 4. Supportable by target? */
6486
6487 /* 4.1. check support for the operation in the loop */
6488 optab = optab_for_tree_code (code, vectype_in, optab_default);
6489 if (!optab)
6490 {
6491 if (dump_enabled_p ())
6492 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6493 "no optab.\n");
6494
6495 return false;
6496 }
6497
6498 if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
6499 {
6500 if (dump_enabled_p ())
6501 dump_printf (MSG_NOTE, "op not supported by target.\n");
6502
6503 if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
6504 || !vect_worthwhile_without_simd_p (loop_vinfo, code))
6505 return false;
6506
6507 if (dump_enabled_p ())
6508 dump_printf (MSG_NOTE, "proceeding using word mode.\n");
6509 }
6510
6511 /* Worthwhile without SIMD support? */
6512 if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
6513 && !vect_worthwhile_without_simd_p (loop_vinfo, code))
6514 {
6515 if (dump_enabled_p ())
6516 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6517 "not worthwhile without SIMD support.\n");
6518
6519 return false;
6520 }
6521 }
6522
6523 /* 4.2. Check support for the epilog operation.
6524
6525 If STMT represents a reduction pattern, then the type of the
6526 reduction variable may be different than the type of the rest
6527 of the arguments. For example, consider the case of accumulation
6528 of shorts into an int accumulator. The original code:
6529 S1: int_a = (int) short_a;
6530 orig_stmt-> S2: int_acc = plus <int_a, int_acc>;
6531
6532 was replaced with:
6533 STMT: int_acc = widen_sum <short_a, int_acc>
6534
6535 This means that:
6536 1. The tree-code that is used to create the vector operation in the
6537 epilog code (that reduces the partial results) is not the
6538 tree-code of STMT, but is rather the tree-code of the original
6539 stmt from the pattern that STMT is replacing. I.e, in the example
6540 above we want to use 'widen_sum' in the loop, but 'plus' in the
6541 epilog.
6542 2. The type (mode) we use to check available target support
6543 for the vector operation to be created in the *epilog*, is
6544 determined by the type of the reduction variable (in the example
6545 above we'd check this: optab_handler (plus_optab, vect_int_mode)).
6546 However the type (mode) we use to check available target support
6547 for the vector operation to be created *inside the loop*, is
6548 determined by the type of the other arguments to STMT (in the
6549 example we'd check this: optab_handler (widen_sum_optab,
6550 vect_short_mode)).
6551
6552 This is contrary to "regular" reductions, in which the types of all
6553 the arguments are the same as the type of the reduction variable.
6554 For "regular" reductions we can therefore use the same vector type
6555 (and also the same tree-code) when generating the epilog code and
6556 when generating the code inside the loop. */
6557
6558 if (orig_stmt_info
6559 && (reduction_type == TREE_CODE_REDUCTION
6560 || reduction_type == FOLD_LEFT_REDUCTION))
6561 {
6562 /* This is a reduction pattern: get the vectype from the type of the
6563 reduction variable, and get the tree-code from orig_stmt. */
6564 orig_code = gimple_assign_rhs_code (orig_stmt_info->stmt);
6565 gcc_assert (vectype_out);
6566 vec_mode = TYPE_MODE (vectype_out);
6567 }
6568 else
6569 {
6570 /* Regular reduction: the same vectype and tree-code as used for
6571 the vector code inside the loop can also be used for the epilog code. */
6572 orig_code = code;
6573
6574 if (code == MINUS_EXPR)
6575 orig_code = PLUS_EXPR;
6576
6577 /* For simple condition reductions, replace with the actual expression
6578 we want to base our reduction around. */
6579 if (reduction_type == CONST_COND_REDUCTION)
6580 {
6581 orig_code = STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info);
6582 gcc_assert (orig_code == MAX_EXPR || orig_code == MIN_EXPR);
6583 }
6584 else if (reduction_type == INTEGER_INDUC_COND_REDUCTION)
6585 orig_code = cond_reduc_op_code;
6586 }
6587
6588 reduc_fn = IFN_LAST;
6589
6590 if (reduction_type == TREE_CODE_REDUCTION
6591 || reduction_type == FOLD_LEFT_REDUCTION
6592 || reduction_type == INTEGER_INDUC_COND_REDUCTION
6593 || reduction_type == CONST_COND_REDUCTION)
6594 {
6595 if (reduction_type == FOLD_LEFT_REDUCTION
6596 ? fold_left_reduction_fn (orig_code, &reduc_fn)
6597 : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
6598 {
6599 if (reduc_fn != IFN_LAST
6600 && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
6601 OPTIMIZE_FOR_SPEED))
6602 {
6603 if (dump_enabled_p ())
6604 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6605 "reduc op not supported by target.\n");
6606
6607 reduc_fn = IFN_LAST;
6608 }
6609 }
6610 else
6611 {
6612 if (!nested_cycle || double_reduc)
6613 {
6614 if (dump_enabled_p ())
6615 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6616 "no reduc code for scalar code.\n");
6617
6618 return false;
6619 }
6620 }
6621 }
6622 else if (reduction_type == COND_REDUCTION)
6623 {
6624 int scalar_precision
6625 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
6626 cr_index_scalar_type = make_unsigned_type (scalar_precision);
6627 cr_index_vector_type = build_vector_type (cr_index_scalar_type,
6628 nunits_out);
6629
6630 if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
6631 OPTIMIZE_FOR_SPEED))
6632 reduc_fn = IFN_REDUC_MAX;
6633 }
6634
6635 if (reduction_type != EXTRACT_LAST_REDUCTION
6636 && (!nested_cycle || double_reduc)
6637 && reduc_fn == IFN_LAST
6638 && !nunits_out.is_constant ())
6639 {
6640 if (dump_enabled_p ())
6641 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6642 "missing target support for reduction on"
6643 " variable-length vectors.\n");
6644 return false;
6645 }
6646
6647 /* For SLP reductions, see if there is a neutral value we can use. */
6648 tree neutral_op = NULL_TREE;
6649 if (slp_node)
6650 neutral_op = neutral_op_for_slp_reduction
6651 (slp_node_instance->reduc_phis, code,
6652 REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL);
6653
6654 if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
6655 {
6656 /* We can't support in-order reductions of code such as this:
6657
6658 for (int i = 0; i < n1; ++i)
6659 for (int j = 0; j < n2; ++j)
6660 l += a[j];
6661
6662 since GCC effectively transforms the loop when vectorizing:
6663
6664 for (int i = 0; i < n1 / VF; ++i)
6665 for (int j = 0; j < n2; ++j)
6666 for (int k = 0; k < VF; ++k)
6667 l += a[j];
6668
6669 which is a reassociation of the original operation. */
6670 if (dump_enabled_p ())
6671 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6672 "in-order double reduction not supported.\n");
6673
6674 return false;
6675 }
6676
6677 if (reduction_type == FOLD_LEFT_REDUCTION
6678 && slp_node
6679 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6680 {
6681 /* We cannot use in-order reductions in this case because there is
6682 an implicit reassociation of the operations involved. */
6683 if (dump_enabled_p ())
6684 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6685 "in-order unchained SLP reductions not supported.\n");
6686 return false;
6687 }
6688
6689 /* For double reductions, and for SLP reductions with a neutral value,
6690 we construct a variable-length initial vector by loading a vector
6691 full of the neutral value and then shift-and-inserting the start
6692 values into the low-numbered elements. */
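     /* Illustratively, the initial vector starts out as [N, N, ..., N]
	(N being the neutral value) and each IFN_VEC_SHL_INSERT then shifts
	the elements one lane towards the high end and writes a start value
	into lane 0.  */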
6693 if ((double_reduc || neutral_op)
6694 && !nunits_out.is_constant ()
6695 && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
6696 vectype_out, OPTIMIZE_FOR_SPEED))
6697 {
6698 if (dump_enabled_p ())
6699 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6700 "reduction on variable-length vectors requires"
6701 " target support for a vector-shift-and-insert"
6702 " operation.\n");
6703 return false;
6704 }
6705
6706 /* Check extra constraints for variable-length unchained SLP reductions. */
6707 if (STMT_SLP_TYPE (stmt_info)
6708 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
6709 && !nunits_out.is_constant ())
6710 {
6711 /* We checked above that we could build the initial vector when
6712 there's a neutral element value. Check here for the case in
6713 which each SLP statement has its own initial value and in which
6714 that value needs to be repeated for every instance of the
6715 statement within the initial vector. */
6716 unsigned int group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
6717 scalar_mode elt_mode = SCALAR_TYPE_MODE (TREE_TYPE (vectype_out));
6718 if (!neutral_op
6719 && !can_duplicate_and_interleave_p (group_size, elt_mode))
6720 {
6721 if (dump_enabled_p ())
6722 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6723 "unsupported form of SLP reduction for"
6724 " variable-length vectors: cannot build"
6725 " initial vector.\n");
6726 return false;
6727 }
6728 /* The epilogue code relies on the number of elements being a multiple
6729 of the group size. The duplicate-and-interleave approach to setting
6730 up the initial vector does too. */
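	 /* An illustrative case: with an SLP group of 3 reduction
	    statements, the number of vector elements must be a multiple
	    of 3 so that every vector holds complete groups.  */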
6731 if (!multiple_p (nunits_out, group_size))
6732 {
6733 if (dump_enabled_p ())
6734 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6735 "unsupported form of SLP reduction for"
6736 " variable-length vectors: the vector size"
6737 " is not a multiple of the number of results.\n");
6738 return false;
6739 }
6740 }
6741
6742 /* In case of widening multiplication by a constant, we update the type
6743 of the constant to be the type of the other operand. We check that the
6744 constant fits the type in the pattern recognition pass. */
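  /* For illustration (hypothetical operands): in a dot-product whose
     operands are a short and the integer constant 2, the constant is
     narrowed here to the short type of the other operand.  */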
6745 if (code == DOT_PROD_EXPR
6746 && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1])))
6747 {
6748 if (TREE_CODE (ops[0]) == INTEGER_CST)
6749 ops[0] = fold_convert (TREE_TYPE (ops[1]), ops[0]);
6750 else if (TREE_CODE (ops[1]) == INTEGER_CST)
6751 ops[1] = fold_convert (TREE_TYPE (ops[0]), ops[1]);
6752 else
6753 {
6754 if (dump_enabled_p ())
6755 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6756 "invalid types in dot-prod\n");
6757
6758 return false;
6759 }
6760 }
6761
6762 if (reduction_type == COND_REDUCTION)
6763 {
6764 widest_int ni;
6765
6766 if (! max_loop_iterations (loop, &ni))
6767 {
6768 if (dump_enabled_p ())
6769 dump_printf_loc (MSG_NOTE, vect_location,
6770 "loop count not known, cannot create cond "
6771 "reduction.\n");
6772 return false;
6773 }
6774 /* Convert backedges to iterations. */
6775 ni += 1;
6776
6777 /* The additional index will be the same type as the condition. Check
6778 that the loop can fit into this less one (because we'll use up the
6779 zero slot for when there are no matches). */
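     /* Illustrative bound: with an 8-bit index type MAX_INDEX is 255, so
	the loop may run at most 254 iterations - index 0 is reserved for
	the "no match" case.  */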
6780 tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
6781 if (wi::geu_p (ni, wi::to_widest (max_index)))
6782 {
6783 if (dump_enabled_p ())
6784 dump_printf_loc (MSG_NOTE, vect_location,
6785 "loop size is greater than data size.\n");
6786 return false;
6787 }
6788 }
6789
6790 /* In case the vectorization factor (VF) is bigger than the number
6791 of elements that we can fit in a vectype (nunits), we have to generate
6792 more than one vector stmt - i.e. - we need to "unroll" the
6793 vector stmt by a factor VF/nunits. For more details see documentation
6794 in vectorizable_operation. */
6795
6796 /* If the reduction is used in an outer loop we need to generate
6797 VF intermediate results, like so (e.g. for ncopies=2):
6798 r0 = phi (init, r0)
6799 r1 = phi (init, r1)
6800 r0 = x0 + r0;
6801 r1 = x1 + r1;
6802 (i.e. we generate VF results in 2 registers).
6803 In this case we have a separate def-use cycle for each copy, and therefore
6804 for each copy we get the vector def for the reduction variable from the
6805 respective phi node created for this copy.
6806
6807 Otherwise (the reduction is unused in the loop nest), we can combine
6808 together intermediate results, like so (e.g. for ncopies=2):
6809 r = phi (init, r)
6810 r = x0 + r;
6811 r = x1 + r;
6812 (i.e. we generate VF/2 results in a single register).
6813 In this case for each copy we get the vector def for the reduction variable
6814 from the vectorized reduction operation generated in the previous iteration.
6815
6816 This only works when we see both the reduction PHI and its only consumer
6817 in vectorizable_reduction and there are no intermediate stmts
6818 participating. */
6819 stmt_vec_info use_stmt_info;
6820 tree reduc_phi_result = gimple_phi_result (reduc_def_phi);
6821 if (ncopies > 1
6822 && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
6823 && (use_stmt_info = loop_vinfo->lookup_single_use (reduc_phi_result))
6824 && vect_stmt_to_vectorize (use_stmt_info) == stmt_info)
6825 {
6826 single_defuse_cycle = true;
6827 epilog_copies = 1;
6828 }
6829 else
6830 epilog_copies = ncopies;
6831
6832 /* If the reduction stmt is one of the patterns that have a lane-reducing
6833 operation embedded, we cannot handle the case of ! single_defuse_cycle. */
6834 if ((ncopies > 1
6835 && ! single_defuse_cycle)
6836 && (code == DOT_PROD_EXPR
6837 || code == WIDEN_SUM_EXPR
6838 || code == SAD_EXPR))
6839 {
6840 if (dump_enabled_p ())
6841 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6842 "multi def-use cycle not possible for lane-reducing "
6843 "reduction operation\n");
6844 return false;
6845 }
6846
6847 if (slp_node)
6848 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
6849 else
6850 vec_num = 1;
6851
6852 internal_fn cond_fn = get_conditional_internal_fn (code);
6853 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
6854
6855 if (!vec_stmt) /* transformation not required. */
6856 {
6857 vect_model_reduction_cost (stmt_info, reduc_fn, ncopies, cost_vec);
6858 if (loop_vinfo && LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
6859 {
6860 if (reduction_type != FOLD_LEFT_REDUCTION
6861 && (cond_fn == IFN_LAST
6862 || !direct_internal_fn_supported_p (cond_fn, vectype_in,
6863 OPTIMIZE_FOR_SPEED)))
6864 {
6865 if (dump_enabled_p ())
6866 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6867 "can't use a fully-masked loop because no"
6868 " conditional operation is available.\n");
6869 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
6870 }
6871 else if (reduc_index == -1)
6872 {
6873 if (dump_enabled_p ())
6874 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6875 "can't use a fully-masked loop for chained"
6876 " reductions.\n");
6877 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
6878 }
6879 else
6880 vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
6881 vectype_in);
6882 }
6883 if (dump_enabled_p ()
6884 && reduction_type == FOLD_LEFT_REDUCTION)
6885 dump_printf_loc (MSG_NOTE, vect_location,
6886 "using an in-order (fold-left) reduction.\n");
6887 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6888 return true;
6889 }
6890
6891 /* Transform. */
6892
6893 if (dump_enabled_p ())
6894 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
6895
6896 /* FORNOW: Multiple types are not supported for condition. */
6897 if (code == COND_EXPR)
6898 gcc_assert (ncopies == 1);
6899
6900 bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
6901
6902 if (reduction_type == FOLD_LEFT_REDUCTION)
6903 return vectorize_fold_left_reduction
6904 (stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi, code,
6905 reduc_fn, ops, vectype_in, reduc_index, masks);
6906
6907 if (reduction_type == EXTRACT_LAST_REDUCTION)
6908 {
6909 gcc_assert (!slp_node);
6910 return vectorizable_condition (stmt_info, gsi, vec_stmt,
6911 true, NULL, NULL);
6912 }
6913
6914 /* Create the destination vector */
6915 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
6916
6917 prev_stmt_info = NULL;
6918 prev_phi_info = NULL;
6919 if (!slp_node)
6920 {
6921 vec_oprnds0.create (1);
6922 vec_oprnds1.create (1);
6923 if (op_type == ternary_op)
6924 vec_oprnds2.create (1);
6925 }
6926
6927 phis.create (vec_num);
6928 vect_defs.create (vec_num);
6929 if (!slp_node)
6930 vect_defs.quick_push (NULL_TREE);
6931
6932 if (slp_node)
6933 phis.splice (SLP_TREE_VEC_STMTS (slp_node_instance->reduc_phis));
6934 else
6935 phis.quick_push (STMT_VINFO_VEC_STMT (reduc_def_info));
6936
6937 for (j = 0; j < ncopies; j++)
6938 {
6939 if (code == COND_EXPR)
6940 {
6941 gcc_assert (!slp_node);
6942 vectorizable_condition (stmt_info, gsi, vec_stmt,
6943 true, NULL, NULL);
6944 break;
6945 }
6946 if (code == LSHIFT_EXPR
6947 || code == RSHIFT_EXPR)
6948 {
6949 vectorizable_shift (stmt_info, gsi, vec_stmt, slp_node, NULL);
6950 break;
6951 }
6952
6953 /* Handle uses. */
6954 if (j == 0)
6955 {
6956 if (slp_node)
6957 {
6958 /* Get vec defs for all the operands except the reduction index,
6959 ensuring the ordering of the ops in the vector is kept. */
6960 auto_vec<tree, 3> slp_ops;
6961 auto_vec<vec<tree>, 3> vec_defs;
6962
6963 slp_ops.quick_push (ops[0]);
6964 slp_ops.quick_push (ops[1]);
6965 if (op_type == ternary_op)
6966 slp_ops.quick_push (ops[2]);
6967
6968 vect_get_slp_defs (slp_ops, slp_node, &vec_defs);
6969
6970 vec_oprnds0.safe_splice (vec_defs[0]);
6971 vec_defs[0].release ();
6972 vec_oprnds1.safe_splice (vec_defs[1]);
6973 vec_defs[1].release ();
6974 if (op_type == ternary_op)
6975 {
6976 vec_oprnds2.safe_splice (vec_defs[2]);
6977 vec_defs[2].release ();
6978 }
6979 }
6980 else
6981 {
6982 vec_oprnds0.quick_push
6983 (vect_get_vec_def_for_operand (ops[0], stmt_info));
6984 vec_oprnds1.quick_push
6985 (vect_get_vec_def_for_operand (ops[1], stmt_info));
6986 if (op_type == ternary_op)
6987 vec_oprnds2.quick_push
6988 (vect_get_vec_def_for_operand (ops[2], stmt_info));
6989 }
6990 }
6991 else
6992 {
6993 if (!slp_node)
6994 {
6995 gcc_assert (reduc_index != -1 || ! single_defuse_cycle);
6996
6997 if (single_defuse_cycle && reduc_index == 0)
6998 vec_oprnds0[0] = gimple_get_lhs (new_stmt_info->stmt);
6999 else
7000 vec_oprnds0[0]
7001 = vect_get_vec_def_for_stmt_copy (loop_vinfo,
7002 vec_oprnds0[0]);
7003 if (single_defuse_cycle && reduc_index == 1)
7004 vec_oprnds1[0] = gimple_get_lhs (new_stmt_info->stmt);
7005 else
7006 vec_oprnds1[0]
7007 = vect_get_vec_def_for_stmt_copy (loop_vinfo,
7008 vec_oprnds1[0]);
7009 if (op_type == ternary_op)
7010 {
7011 if (single_defuse_cycle && reduc_index == 2)
7012 vec_oprnds2[0] = gimple_get_lhs (new_stmt_info->stmt);
7013 else
7014 vec_oprnds2[0]
7015 = vect_get_vec_def_for_stmt_copy (loop_vinfo,
7016 vec_oprnds2[0]);
7017 }
7018 }
7019 }
7020
7021 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
7022 {
7023 tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
7024 if (masked_loop_p)
7025 {
7026 /* Make sure that the reduction accumulator is vop[0]. */
7027 if (reduc_index == 1)
7028 {
7029 gcc_assert (commutative_tree_code (code));
7030 std::swap (vop[0], vop[1]);
7031 }
7032 tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
7033 vectype_in, i * ncopies + j);
7034 gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
7035 vop[0], vop[1],
7036 vop[0]);
7037 new_temp = make_ssa_name (vec_dest, call);
7038 gimple_call_set_lhs (call, new_temp);
7039 gimple_call_set_nothrow (call, true);
7040 new_stmt_info
7041 = vect_finish_stmt_generation (stmt_info, call, gsi);
7042 }
7043 else
7044 {
7045 if (op_type == ternary_op)
7046 vop[2] = vec_oprnds2[i];
7047
7048 gassign *new_stmt = gimple_build_assign (vec_dest, code,
7049 vop[0], vop[1], vop[2]);
7050 new_temp = make_ssa_name (vec_dest, new_stmt);
7051 gimple_assign_set_lhs (new_stmt, new_temp);
7052 new_stmt_info
7053 = vect_finish_stmt_generation (stmt_info, new_stmt, gsi);
7054 }
7055
7056 if (slp_node)
7057 {
7058 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt_info);
7059 vect_defs.quick_push (new_temp);
7060 }
7061 else
7062 vect_defs[0] = new_temp;
7063 }
7064
7065 if (slp_node)
7066 continue;
7067
7068 if (j == 0)
7069 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt_info;
7070 else
7071 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt_info;
7072
7073 prev_stmt_info = new_stmt_info;
7074 }
7075
7076 /* Finalize the reduction-phi (set its arguments) and create the
7077 epilog reduction code. */
7078 if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
7079 vect_defs[0] = gimple_get_lhs ((*vec_stmt)->stmt);
7080
7081 vect_create_epilog_for_reduction (vect_defs, stmt_info, reduc_def_phi,
7082 epilog_copies, reduc_fn, phis,
7083 double_reduc, slp_node, slp_node_instance,
7084 cond_reduc_val, cond_reduc_op_code,
7085 neutral_op);
7086
7087 return true;
7088 }
7089
7090 /* Function vect_min_worthwhile_factor.
7091
7092 For a loop where we could vectorize the operation indicated by CODE,
7093 return the minimum vectorization factor that makes it worthwhile
7094 to use generic vectors. */
7095 static unsigned int
7096 vect_min_worthwhile_factor (enum tree_code code)
7097 {
7098 switch (code)
7099 {
7100 case PLUS_EXPR:
7101 case MINUS_EXPR:
7102 case NEGATE_EXPR:
7103 return 4;
7104
7105 case BIT_AND_EXPR:
7106 case BIT_IOR_EXPR:
7107 case BIT_XOR_EXPR:
7108 case BIT_NOT_EXPR:
7109 return 2;
7110
7111 default:
7112 return INT_MAX;
7113 }
7114 }
7115
7116 /* Return true if VINFO indicates we are doing loop vectorization and if
7117 it is worth decomposing CODE operations into scalar operations for
7118 that loop's vectorization factor. */
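/* For illustration: vect_min_worthwhile_factor returns 4 for PLUS_EXPR,
   so with a compile-time vectorization factor of 2 this returns false,
   while with a factor of 4 or more it returns true (assuming loop
   vectorization).  */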
7119
7120 bool
7121 vect_worthwhile_without_simd_p (vec_info *vinfo, tree_code code)
7122 {
7123 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
7124 unsigned HOST_WIDE_INT value;
7125 return (loop_vinfo
7126 && LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&value)
7127 && value >= vect_min_worthwhile_factor (code));
7128 }
7129
7130 /* Function vectorizable_induction
7131
7132 Check if STMT_INFO performs an induction computation that can be vectorized.
7133 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
7134 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
7135 Return true if STMT_INFO is vectorizable in this way. */
7136
7137 bool
7138 vectorizable_induction (stmt_vec_info stmt_info,
7139 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
7140 stmt_vec_info *vec_stmt, slp_tree slp_node,
7141 stmt_vector_for_cost *cost_vec)
7142 {
7143 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
7144 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7145 unsigned ncopies;
7146 bool nested_in_vect_loop = false;
7147 struct loop *iv_loop;
7148 tree vec_def;
7149 edge pe = loop_preheader_edge (loop);
7150 basic_block new_bb;
7151 tree new_vec, vec_init, vec_step, t;
7152 tree new_name;
7153 gimple *new_stmt;
7154 gphi *induction_phi;
7155 tree induc_def, vec_dest;
7156 tree init_expr, step_expr;
7157 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
7158 unsigned i;
7159 tree expr;
7160 gimple_seq stmts;
7161 imm_use_iterator imm_iter;
7162 use_operand_p use_p;
7163 gimple *exit_phi;
7164 edge latch_e;
7165 tree loop_arg;
7166 gimple_stmt_iterator si;
7167
7168 gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
7169 if (!phi)
7170 return false;
7171
7172 if (!STMT_VINFO_RELEVANT_P (stmt_info))
7173 return false;
7174
7175 /* Make sure it was recognized as induction computation. */
7176 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
7177 return false;
7178
7179 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7180 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7181
7182 if (slp_node)
7183 ncopies = 1;
7184 else
7185 ncopies = vect_get_num_copies (loop_vinfo, vectype);
7186 gcc_assert (ncopies >= 1);
7187
7188 /* FORNOW. These restrictions should be relaxed. */
7189 if (nested_in_vect_loop_p (loop, stmt_info))
7190 {
7191 imm_use_iterator imm_iter;
7192 use_operand_p use_p;
7193 gimple *exit_phi;
7194 edge latch_e;
7195 tree loop_arg;
7196
7197 if (ncopies > 1)
7198 {
7199 if (dump_enabled_p ())
7200 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7201 "multiple types in nested loop.\n");
7202 return false;
7203 }
7204
7205 /* FORNOW: outer loop induction with SLP not supported. */
7206 if (STMT_SLP_TYPE (stmt_info))
7207 return false;
7208
7209 exit_phi = NULL;
7210 latch_e = loop_latch_edge (loop->inner);
7211 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7212 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7213 {
7214 gimple *use_stmt = USE_STMT (use_p);
7215 if (is_gimple_debug (use_stmt))
7216 continue;
7217
7218 if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
7219 {
7220 exit_phi = use_stmt;
7221 break;
7222 }
7223 }
7224 if (exit_phi)
7225 {
7226 stmt_vec_info exit_phi_vinfo = loop_vinfo->lookup_stmt (exit_phi);
7227 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
7228 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
7229 {
7230 if (dump_enabled_p ())
7231 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7232 "inner-loop induction only used outside "
7233 "of the outer vectorized loop.\n");
7234 return false;
7235 }
7236 }
7237
7238 nested_in_vect_loop = true;
7239 iv_loop = loop->inner;
7240 }
7241 else
7242 iv_loop = loop;
7243 gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
7244
7245 if (slp_node && !nunits.is_constant ())
7246 {
7247 /* The current SLP code creates the initial value element-by-element. */
7248 if (dump_enabled_p ())
7249 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7250 "SLP induction not supported for variable-length"
7251 " vectors.\n");
7252 return false;
7253 }
7254
7255 if (!vec_stmt) /* transformation not required. */
7256 {
7257 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
7258 DUMP_VECT_SCOPE ("vectorizable_induction");
7259 vect_model_induction_cost (stmt_info, ncopies, cost_vec);
7260 return true;
7261 }
7262
7263 /* Transform. */
7264
7265 /* Compute a vector variable, initialized with the first VF values of
7266 the induction variable. E.g., for an iv with IV_PHI='X' and
7267 evolution S, for a vector of 4 units, we want to compute:
7268 [X, X + S, X + 2*S, X + 3*S]. */
7269
7270 if (dump_enabled_p ())
7271 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
7272
7273 latch_e = loop_latch_edge (iv_loop);
7274 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7275
7276 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
7277 gcc_assert (step_expr != NULL_TREE);
7278
7279 pe = loop_preheader_edge (iv_loop);
7280 init_expr = PHI_ARG_DEF_FROM_EDGE (phi,
7281 loop_preheader_edge (iv_loop));
7282
7283 stmts = NULL;
7284 if (!nested_in_vect_loop)
7285 {
7286 /* Convert the initial value to the desired type. */
7287 tree new_type = TREE_TYPE (vectype);
7288 init_expr = gimple_convert (&stmts, new_type, init_expr);
7289
7290 /* If we are using the loop mask to "peel" for alignment then we need
7291 to adjust the start value here. */
7292 tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
7293 if (skip_niters != NULL_TREE)
7294 {
7295 if (FLOAT_TYPE_P (vectype))
7296 skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
7297 skip_niters);
7298 else
7299 skip_niters = gimple_convert (&stmts, new_type, skip_niters);
7300 tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
7301 skip_niters, step_expr);
7302 init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
7303 init_expr, skip_step);
7304 }
7305 }
7306
7307 /* Convert the step to the desired type. */
7308 step_expr = gimple_convert (&stmts, TREE_TYPE (vectype), step_expr);
7309
7310 if (stmts)
7311 {
7312 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7313 gcc_assert (!new_bb);
7314 }
7315
7316 /* Find the first insertion point in the BB. */
7317 basic_block bb = gimple_bb (phi);
7318 si = gsi_after_labels (bb);
7319
7320 /* For SLP induction we have to generate several IVs as for example
7321 with group size 3 we need [i, i, i, i + S] [i + S, i + S, i + 2*S, i + 2*S]
7322 [i + 2*S, i + 3*S, i + 3*S, i + 3*S]. The step is the same uniform
7323 [VF*S, VF*S, VF*S, VF*S] for all. */
7324 if (slp_node)
7325 {
7326 /* Enforced above. */
7327 unsigned int const_nunits = nunits.to_constant ();
7328
7329 /* Generate [VF*S, VF*S, ... ]. */
7330 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7331 {
7332 expr = build_int_cst (integer_type_node, vf);
7333 expr = fold_convert (TREE_TYPE (step_expr), expr);
7334 }
7335 else
7336 expr = build_int_cst (TREE_TYPE (step_expr), vf);
7337 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7338 expr, step_expr);
7339 if (! CONSTANT_CLASS_P (new_name))
7340 new_name = vect_init_vector (stmt_info, new_name,
7341 TREE_TYPE (step_expr), NULL);
7342 new_vec = build_vector_from_val (vectype, new_name);
7343 vec_step = vect_init_vector (stmt_info, new_vec, vectype, NULL);
7344
7345 /* Now generate the IVs. */
7346 unsigned group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7347 unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7348 unsigned elts = const_nunits * nvects;
7349 unsigned nivs = least_common_multiple (group_size,
7350 const_nunits) / const_nunits;
7351 gcc_assert (elts % group_size == 0);
7352 tree elt = init_expr;
7353 unsigned ivn;
7354 for (ivn = 0; ivn < nivs; ++ivn)
7355 {
7356 tree_vector_builder elts (vectype, const_nunits, 1);
7357 stmts = NULL;
7358 for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
7359 {
7360 if (ivn*const_nunits + eltn >= group_size
7361 && (ivn * const_nunits + eltn) % group_size == 0)
7362 elt = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (elt),
7363 elt, step_expr);
7364 elts.quick_push (elt);
7365 }
7366 vec_init = gimple_build_vector (&stmts, &elts);
7367 if (stmts)
7368 {
7369 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7370 gcc_assert (!new_bb);
7371 }
7372
7373 /* Create the induction-phi that defines the induction-operand. */
7374 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7375 induction_phi = create_phi_node (vec_dest, iv_loop->header);
7376 stmt_vec_info induction_phi_info
7377 = loop_vinfo->add_stmt (induction_phi);
7378 induc_def = PHI_RESULT (induction_phi);
7379
7380 /* Create the iv update inside the loop */
7381 vec_def = make_ssa_name (vec_dest);
7382 new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
7383 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7384 loop_vinfo->add_stmt (new_stmt);
7385
7386 /* Set the arguments of the phi node: */
7387 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7388 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7389 UNKNOWN_LOCATION);
7390
7391 SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi_info);
7392 }
7393
7394 /* Re-use IVs when we can. */
7395 if (ivn < nvects)
7396 {
7397 unsigned vfp
7398 = least_common_multiple (group_size, const_nunits) / group_size;
7399 /* Generate [VF'*S, VF'*S, ... ]. */
7400 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7401 {
7402 expr = build_int_cst (integer_type_node, vfp);
7403 expr = fold_convert (TREE_TYPE (step_expr), expr);
7404 }
7405 else
7406 expr = build_int_cst (TREE_TYPE (step_expr), vfp);
7407 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7408 expr, step_expr);
7409 if (! CONSTANT_CLASS_P (new_name))
7410 new_name = vect_init_vector (stmt_info, new_name,
7411 TREE_TYPE (step_expr), NULL);
7412 new_vec = build_vector_from_val (vectype, new_name);
7413 vec_step = vect_init_vector (stmt_info, new_vec, vectype, NULL);
7414 for (; ivn < nvects; ++ivn)
7415 {
7416 gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs]->stmt;
7417 tree def;
7418 if (gimple_code (iv) == GIMPLE_PHI)
7419 def = gimple_phi_result (iv);
7420 else
7421 def = gimple_assign_lhs (iv);
7422 new_stmt = gimple_build_assign (make_ssa_name (vectype),
7423 PLUS_EXPR,
7424 def, vec_step);
7425 if (gimple_code (iv) == GIMPLE_PHI)
7426 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7427 else
7428 {
7429 gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
7430 gsi_insert_after (&tgsi, new_stmt, GSI_CONTINUE_LINKING);
7431 }
7432 SLP_TREE_VEC_STMTS (slp_node).quick_push
7433 (loop_vinfo->add_stmt (new_stmt));
7434 }
7435 }
7436
7437 return true;
7438 }
7439
7440 /* Create the vector that holds the initial_value of the induction. */
7441 if (nested_in_vect_loop)
7442 {
7443 /* iv_loop is nested in the loop to be vectorized. init_expr had already
7444 been created during vectorization of previous stmts. We obtain it
7445 from the STMT_VINFO_VEC_STMT of the defining stmt. */
7446 vec_init = vect_get_vec_def_for_operand (init_expr, stmt_info);
7447 /* If the initial value is not of proper type, convert it. */
7448 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
7449 {
7450 new_stmt
7451 = gimple_build_assign (vect_get_new_ssa_name (vectype,
7452 vect_simple_var,
7453 "vec_iv_"),
7454 VIEW_CONVERT_EXPR,
7455 build1 (VIEW_CONVERT_EXPR, vectype,
7456 vec_init));
7457 vec_init = gimple_assign_lhs (new_stmt);
7458 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
7459 new_stmt);
7460 gcc_assert (!new_bb);
7461 loop_vinfo->add_stmt (new_stmt);
7462 }
7463 }
7464 else
7465 {
7466 /* iv_loop is the loop to be vectorized. Create:
7467 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
7468 stmts = NULL;
7469 new_name = gimple_convert (&stmts, TREE_TYPE (vectype), init_expr);
7470
7471 unsigned HOST_WIDE_INT const_nunits;
7472 if (nunits.is_constant (&const_nunits))
7473 {
7474 tree_vector_builder elts (vectype, const_nunits, 1);
7475 elts.quick_push (new_name);
7476 for (i = 1; i < const_nunits; i++)
7477 {
7478 /* Create: new_name_i = new_name + step_expr */
7479 new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
7480 new_name, step_expr);
7481 elts.quick_push (new_name);
7482 }
7483 /* Create a vector from [new_name_0, new_name_1, ...,
7484 new_name_nunits-1] */
7485 vec_init = gimple_build_vector (&stmts, &elts);
7486 }
7487 else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
7488 /* Build the initial value directly from a VEC_SERIES_EXPR. */
7489 vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, vectype,
7490 new_name, step_expr);
7491 else
7492 {
7493 /* Build:
7494 [base, base, base, ...]
7495 + (vectype) [0, 1, 2, ...] * [step, step, step, ...]. */
7496 gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
7497 gcc_assert (flag_associative_math);
7498 tree index = build_index_vector (vectype, 0, 1);
7499 tree base_vec = gimple_build_vector_from_val (&stmts, vectype,
7500 new_name);
7501 tree step_vec = gimple_build_vector_from_val (&stmts, vectype,
7502 step_expr);
7503 vec_init = gimple_build (&stmts, FLOAT_EXPR, vectype, index);
7504 vec_init = gimple_build (&stmts, MULT_EXPR, vectype,
7505 vec_init, step_vec);
7506 vec_init = gimple_build (&stmts, PLUS_EXPR, vectype,
7507 vec_init, base_vec);
7508 }
7509
7510 if (stmts)
7511 {
7512 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7513 gcc_assert (!new_bb);
7514 }
7515 }
7516
7517
7518 /* Create the vector that holds the step of the induction. */
7519 if (nested_in_vect_loop)
7520 /* iv_loop is nested in the loop to be vectorized. Generate:
7521 vec_step = [S, S, S, S] */
7522 new_name = step_expr;
7523 else
7524 {
7525 /* iv_loop is the loop to be vectorized. Generate:
7526 vec_step = [VF*S, VF*S, VF*S, VF*S] */
7527 gimple_seq seq = NULL;
7528 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7529 {
7530 expr = build_int_cst (integer_type_node, vf);
7531 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
7532 }
7533 else
7534 expr = build_int_cst (TREE_TYPE (step_expr), vf);
7535 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
7536 expr, step_expr);
7537 if (seq)
7538 {
7539 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
7540 gcc_assert (!new_bb);
7541 }
7542 }
7543
7544 t = unshare_expr (new_name);
7545 gcc_assert (CONSTANT_CLASS_P (new_name)
7546 || TREE_CODE (new_name) == SSA_NAME);
7547 new_vec = build_vector_from_val (vectype, t);
7548 vec_step = vect_init_vector (stmt_info, new_vec, vectype, NULL);
7549
7550
7551 /* Create the following def-use cycle:
7552 loop prolog:
7553 vec_init = ...
7554 vec_step = ...
7555 loop:
7556 vec_iv = PHI <vec_init, vec_loop>
7557 ...
7558 STMT
7559 ...
7560 vec_loop = vec_iv + vec_step; */
7561
7562 /* Create the induction-phi that defines the induction-operand. */
7563 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7564 induction_phi = create_phi_node (vec_dest, iv_loop->header);
7565 stmt_vec_info induction_phi_info = loop_vinfo->add_stmt (induction_phi);
7566 induc_def = PHI_RESULT (induction_phi);
7567
7568 /* Create the iv update inside the loop */
7569 vec_def = make_ssa_name (vec_dest);
7570 new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
7571 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7572 stmt_vec_info new_stmt_info = loop_vinfo->add_stmt (new_stmt);
7573
7574 /* Set the arguments of the phi node: */
7575 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7576 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7577 UNKNOWN_LOCATION);
7578
7579 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = induction_phi_info;
7580
7581 /* In case the vectorization factor (VF) is bigger than the number
7582 of elements that we can fit in a vectype (nunits), we have to generate
7583 more than one vector stmt - i.e. - we need to "unroll" the
7584 vector stmt by a factor VF/nunits. For more details see documentation
7585 in vectorizable_operation. */
7586
7587 if (ncopies > 1)
7588 {
7589 gimple_seq seq = NULL;
7590 stmt_vec_info prev_stmt_vinfo;
7591 /* FORNOW. This restriction should be relaxed. */
7592 gcc_assert (!nested_in_vect_loop);
7593
7594 /* Create the vector that holds the step of the induction. */
7595 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7596 {
7597 expr = build_int_cst (integer_type_node, nunits);
7598 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
7599 }
7600 else
7601 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
7602 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
7603 expr, step_expr);
7604 if (seq)
7605 {
7606 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
7607 gcc_assert (!new_bb);
7608 }
7609
7610 t = unshare_expr (new_name);
7611 gcc_assert (CONSTANT_CLASS_P (new_name)
7612 || TREE_CODE (new_name) == SSA_NAME);
7613 new_vec = build_vector_from_val (vectype, t);
7614 vec_step = vect_init_vector (stmt_info, new_vec, vectype, NULL);
7615
7616 vec_def = induc_def;
7617 prev_stmt_vinfo = induction_phi_info;
7618 for (i = 1; i < ncopies; i++)
7619 {
7620 /* vec_i = vec_prev + vec_step */
7621 new_stmt = gimple_build_assign (vec_dest, PLUS_EXPR,
7622 vec_def, vec_step);
7623 vec_def = make_ssa_name (vec_dest, new_stmt);
7624 gimple_assign_set_lhs (new_stmt, vec_def);
7625
7626 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7627 new_stmt_info = loop_vinfo->add_stmt (new_stmt);
7628 STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt_info;
7629 prev_stmt_vinfo = new_stmt_info;
7630 }
7631 }
7632
7633 if (nested_in_vect_loop)
7634 {
7635 /* Find the loop-closed exit-phi of the induction, and record
7636 the final vector of induction results: */
7637 exit_phi = NULL;
7638 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7639 {
7640 gimple *use_stmt = USE_STMT (use_p);
7641 if (is_gimple_debug (use_stmt))
7642 continue;
7643
7644 if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (use_stmt)))
7645 {
7646 exit_phi = use_stmt;
7647 break;
7648 }
7649 }
7650 if (exit_phi)
7651 {
7652 stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (exit_phi);
7653 /* FORNOW. Currently not supporting the case that an inner-loop induction
7654 is not used in the outer-loop (i.e. only outside the outer-loop). */
7655 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
7656 && !STMT_VINFO_LIVE_P (stmt_vinfo));
7657
7658 STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt_info;
7659 if (dump_enabled_p ())
7660 dump_printf_loc (MSG_NOTE, vect_location,
7661 "vector of inductions after inner-loop:%G",
7662 new_stmt);
7663 }
7664 }
7665
7666
7667 if (dump_enabled_p ())
7668 dump_printf_loc (MSG_NOTE, vect_location,
7669 "transform induction: created def-use cycle: %G%G",
7670 induction_phi, SSA_NAME_DEF_STMT (vec_def));
7671
7672 return true;
7673 }
7674
7675 /* Function vectorizable_live_operation.
7676
7677 STMT_INFO computes a value that is used outside the loop. Check if
7678 it can be supported. */
7679
7680 bool
7681 vectorizable_live_operation (stmt_vec_info stmt_info,
7682 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
7683 slp_tree slp_node, int slp_index,
7684 stmt_vec_info *vec_stmt,
7685 stmt_vector_for_cost *)
7686 {
7687 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
7688 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7689 imm_use_iterator imm_iter;
7690 tree lhs, lhs_type, bitsize, vec_bitsize;
7691 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7692 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7693 int ncopies;
7694 gimple *use_stmt;
7695 auto_vec<tree> vec_oprnds;
7696 int vec_entry = 0;
7697 poly_uint64 vec_index = 0;
7698
7699 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
7700
7701 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
7702 return false;
7703
7704 /* FORNOW. CHECKME. */
7705 if (nested_in_vect_loop_p (loop, stmt_info))
7706 return false;
7707
7708 /* If STMT is not relevant and it is a simple assignment and its inputs are
7709 invariant then it can remain in place, unvectorized. The original last
7710 scalar value that it computes will be used. */
7711 if (!STMT_VINFO_RELEVANT_P (stmt_info))
7712 {
7713 gcc_assert (is_simple_and_all_uses_invariant (stmt_info, loop_vinfo));
7714 if (dump_enabled_p ())
7715 dump_printf_loc (MSG_NOTE, vect_location,
7716 "statement is simple and uses invariant. Leaving in "
7717 "place.\n");
7718 return true;
7719 }
7720
7721 if (slp_node)
7722 ncopies = 1;
7723 else
7724 ncopies = vect_get_num_copies (loop_vinfo, vectype);
7725
7726 if (slp_node)
7727 {
7728 gcc_assert (slp_index >= 0);
7729
7730 int num_scalar = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7731 int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7732
7733 /* Get the last occurrence of the scalar index from the concatenation of
7734 all the slp vectors. Calculate which slp vector it is and the index
7735 within. */
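	 /* A worked illustration (hypothetical sizes): with 2 vectors of
	    4 lanes and 3 scalar stmts, SLP_INDEX 1 gives
	    POS = 2*4 - 3 + 1 = 6, i.e. VEC_ENTRY 1 and lane VEC_INDEX 2.  */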
7736 poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
7737
7738 /* Calculate which vector contains the result, and which lane of
7739 that vector we need. */
7740 if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
7741 {
7742 if (dump_enabled_p ())
7743 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7744 "Cannot determine which vector holds the"
7745 " final result.\n");
7746 return false;
7747 }
7748 }
7749
7750 if (!vec_stmt)
7751 {
7752 /* No transformation required. */
7753 if (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
7754 {
7755 if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
7756 OPTIMIZE_FOR_SPEED))
7757 {
7758 if (dump_enabled_p ())
7759 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7760 "can't use a fully-masked loop because "
7761 "the target doesn't support extract last "
7762 "reduction.\n");
7763 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7764 }
7765 else if (slp_node)
7766 {
7767 if (dump_enabled_p ())
7768 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7769 "can't use a fully-masked loop because an "
7770 "SLP statement is live after the loop.\n");
7771 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7772 }
7773 else if (ncopies > 1)
7774 {
7775 if (dump_enabled_p ())
7776 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7777 "can't use a fully-masked loop because"
7778 " ncopies is greater than 1.\n");
7779 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7780 }
7781 else
7782 {
7783 gcc_assert (ncopies == 1 && !slp_node);
7784 vect_record_loop_mask (loop_vinfo,
7785 &LOOP_VINFO_MASKS (loop_vinfo),
7786 1, vectype);
7787 }
7788 }
7789 return true;
7790 }
7791
7792 /* Use the lhs of the original scalar statement. */
7793 gimple *stmt = vect_orig_stmt (stmt_info)->stmt;
7794
7795 lhs = (is_a <gphi *> (stmt)) ? gimple_phi_result (stmt)
7796 : gimple_get_lhs (stmt);
7797 lhs_type = TREE_TYPE (lhs);
7798
7799 bitsize = (VECTOR_BOOLEAN_TYPE_P (vectype)
7800 ? bitsize_int (TYPE_PRECISION (TREE_TYPE (vectype)))
7801 : TYPE_SIZE (TREE_TYPE (vectype)));
7802 vec_bitsize = TYPE_SIZE (vectype);
7803
7804 /* Get the vectorized lhs of STMT and the lane to use (counted in bits). */
7805 tree vec_lhs, bitstart;
7806 if (slp_node)
7807 {
7808 gcc_assert (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
7809
7810 /* Get the correct slp vectorized stmt. */
7811 gimple *vec_stmt = SLP_TREE_VEC_STMTS (slp_node)[vec_entry]->stmt;
7812 if (gphi *phi = dyn_cast <gphi *> (vec_stmt))
7813 vec_lhs = gimple_phi_result (phi);
7814 else
7815 vec_lhs = gimple_get_lhs (vec_stmt);
7816
7817 /* Get entry to use. */
7818 bitstart = bitsize_int (vec_index);
7819 bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
7820 }
7821 else
7822 {
7823 enum vect_def_type dt = STMT_VINFO_DEF_TYPE (stmt_info);
7824 vec_lhs = vect_get_vec_def_for_operand_1 (stmt_info, dt);
7825 gcc_checking_assert (ncopies == 1
7826 || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
7827
7828 /* For multiple copies, get the last copy. */
7829 for (int i = 1; i < ncopies; ++i)
7830 vec_lhs = vect_get_vec_def_for_stmt_copy (loop_vinfo, vec_lhs);
7831
7832 /* Get the last lane in the vector. */
7833 bitstart = int_const_binop (MINUS_EXPR, vec_bitsize, bitsize);
7834 }
7835
7836 gimple_seq stmts = NULL;
7837 tree new_tree;
7838 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
7839 {
7840 /* Emit:
7841
7842 SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
7843
7844 where VEC_LHS is the vectorized live-out result and MASK is
7845 the loop mask for the final iteration. */
7846 gcc_assert (ncopies == 1 && !slp_node);
7847 tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
7848 tree mask = vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo),
7849 1, vectype, 0);
7850 tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST,
7851 scalar_type, mask, vec_lhs);
7852
7853 /* Convert the extracted vector element to the required scalar type. */
7854 new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
7855 }
7856 else
7857 {
7858 tree bftype = TREE_TYPE (vectype);
7859 if (VECTOR_BOOLEAN_TYPE_P (vectype))
7860 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
7861 new_tree = build3 (BIT_FIELD_REF, bftype, vec_lhs, bitsize, bitstart);
7862 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
7863 &stmts, true, NULL_TREE);
7864 }
7865
7866 if (stmts)
7867 gsi_insert_seq_on_edge_immediate (single_exit (loop), stmts);
7868
7869 /* Replace use of lhs with newly computed result. If the use stmt is a
7870 single arg PHI, just replace all uses of the PHI result, since the
7871 lcssa PHI defining LHS may appear before the newly inserted stmt. */
7872 use_operand_p use_p;
7873 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
7874 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
7875 && !is_gimple_debug (use_stmt))
7876 {
7877 if (gimple_code (use_stmt) == GIMPLE_PHI
7878 && gimple_phi_num_args (use_stmt) == 1)
7879 {
7880 replace_uses_by (gimple_phi_result (use_stmt), new_tree);
7881 }
7882 else
7883 {
7884 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
7885 SET_USE (use_p, new_tree);
7886 }
7887 update_stmt (use_stmt);
7888 }
7889
7890 return true;
7891 }
7892
7893 /* Kill any debug uses outside LOOP of SSA names defined in STMT_INFO. */
7894
7895 static void
7896 vect_loop_kill_debug_uses (struct loop *loop, stmt_vec_info stmt_info)
7897 {
7898 ssa_op_iter op_iter;
7899 imm_use_iterator imm_iter;
7900 def_operand_p def_p;
7901 gimple *ustmt;
7902
7903 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt_info->stmt, op_iter, SSA_OP_DEF)
7904 {
7905 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
7906 {
7907 basic_block bb;
7908
7909 if (!is_gimple_debug (ustmt))
7910 continue;
7911
7912 bb = gimple_bb (ustmt);
7913
7914 if (!flow_bb_inside_loop_p (loop, bb))
7915 {
7916 if (gimple_debug_bind_p (ustmt))
7917 {
7918 if (dump_enabled_p ())
7919 dump_printf_loc (MSG_NOTE, vect_location,
7920 "killing debug use\n");
7921
7922 gimple_debug_bind_reset_value (ustmt);
7923 update_stmt (ustmt);
7924 }
7925 else
7926 gcc_unreachable ();
7927 }
7928 }
7929 }
7930 }
7931
7932 /* Given loop represented by LOOP_VINFO, return true if computation of
7933 LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
7934 otherwise. */
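/* For example (illustrative): if NITERSM1 equals the maximum value of its
   type, then NITERSM1 + 1 wraps around to zero, so the computation of
   NITERS would overflow and we must return false.  */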
7935
7936 static bool
7937 loop_niters_no_overflow (loop_vec_info loop_vinfo)
7938 {
7939 /* Constant case. */
7940 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
7941 {
7942 tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
7943 tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
7944
7945 gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
7946 gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
7947 if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
7948 return true;
7949 }
7950
7951 widest_int max;
7952 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7953 /* Check the upper bound of loop niters. */
7954 if (get_max_loop_iterations (loop, &max))
7955 {
7956 tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
7957 signop sgn = TYPE_SIGN (type);
7958 widest_int type_max = widest_int::from (wi::max_value (type), sgn);
7959 if (max < type_max)
7960 return true;
7961 }
7962 return false;
7963 }
7964
7965 /* Return a mask type with half the number of elements as TYPE. */
7966
7967 tree
7968 vect_halve_mask_nunits (tree type)
7969 {
7970 poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (type), 2);
7971 return build_truth_vector_type (nunits, current_vector_size);
7972 }
7973
7974 /* Return a mask type with twice as many elements as TYPE. */
7975
7976 tree
7977 vect_double_mask_nunits (tree type)
7978 {
7979 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (type) * 2;
7980 return build_truth_vector_type (nunits, current_vector_size);
7981 }
7982
7983 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
7984 contain a sequence of NVECTORS masks that each control a vector of type
7985 VECTYPE. */
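/* Illustratively (hypothetical numbers): with a vectorization factor of 16,
   recording 2 mask vectors of 8 elements each gives
   nscalars_per_iter = 2 * 8 / 16 = 1 for that rgroup.  */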
7986
7987 void
7988 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
7989 unsigned int nvectors, tree vectype)
7990 {
7991 gcc_assert (nvectors != 0);
7992 if (masks->length () < nvectors)
7993 masks->safe_grow_cleared (nvectors);
7994 rgroup_masks *rgm = &(*masks)[nvectors - 1];
7995 /* The number of scalars per iteration and the number of vectors are
7996 both compile-time constants. */
7997 unsigned int nscalars_per_iter
7998 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
7999 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
8000 if (rgm->max_nscalars_per_iter < nscalars_per_iter)
8001 {
8002 rgm->max_nscalars_per_iter = nscalars_per_iter;
8003 rgm->mask_type = build_same_sized_truth_vector_type (vectype);
8004 }
8005 }
8006
8007 /* Given a complete set of masks MASKS, extract mask number INDEX
8008 for an rgroup that operates on NVECTORS vectors of type VECTYPE,
8009 where 0 <= INDEX < NVECTORS. Insert any set-up statements before GSI.
8010
8011 See the comment above vec_loop_masks for more details about the mask
8012 arrangement. */
8013
8014 tree
8015 vect_get_loop_mask (gimple_stmt_iterator *gsi, vec_loop_masks *masks,
8016 unsigned int nvectors, tree vectype, unsigned int index)
8017 {
8018 rgroup_masks *rgm = &(*masks)[nvectors - 1];
8019 tree mask_type = rgm->mask_type;
8020
8021 /* Populate the rgroup's mask array, if this is the first time we've
8022 used it. */
8023 if (rgm->masks.is_empty ())
8024 {
8025 rgm->masks.safe_grow_cleared (nvectors);
8026 for (unsigned int i = 0; i < nvectors; ++i)
8027 {
8028 tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
8029 /* Provide a dummy definition until the real one is available. */
8030 SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
8031 rgm->masks[i] = mask;
8032 }
8033 }
8034
8035 tree mask = rgm->masks[index];
8036 if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
8037 TYPE_VECTOR_SUBPARTS (vectype)))
8038 {
8039 /* A loop mask for data type X can be reused for data type Y
8040 if X has N times more elements than Y and if Y's elements
8041 are N times bigger than X's. In this case each sequence
8042 of N elements in the loop mask will be all-zero or all-one.
8043 We can then view-convert the mask so that each sequence of
8044 N elements is replaced by a single element. */
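      /* E.g. (illustrative): a mask created for 16 byte elements can be
	 reused for a vector of 8 halfwords; each adjacent pair of mask
	 elements is all-zero or all-one, and the VIEW_CONVERT_EXPR below
	 folds each pair into a single wider mask element.  */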
8045 gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
8046 TYPE_VECTOR_SUBPARTS (vectype)));
8047 gimple_seq seq = NULL;
8048 mask_type = build_same_sized_truth_vector_type (vectype);
8049 mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
8050 if (seq)
8051 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
8052 }
8053 return mask;
8054 }
8055
8056 /* Scale profiling counters by estimation for LOOP which is vectorized
8057 by factor VF. */
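/* E.g. (illustrative): after vectorizing by VF the loop is expected to run
   roughly 1/VF as many iterations, so the single exit edge below is given
   probability 1 / (new_est_niter + 1) and the latch edge the complement.  */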
8058
8059 static void
8060 scale_profile_for_vect_loop (struct loop *loop, unsigned vf)
8061 {
8062 edge preheader = loop_preheader_edge (loop);
8063 /* Reduce loop iterations by the vectorization factor. */
8064 gcov_type new_est_niter = niter_for_unrolled_loop (loop, vf);
8065 profile_count freq_h = loop->header->count, freq_e = preheader->count ();
8066
8067 if (freq_h.nonzero_p ())
8068 {
8069 profile_probability p;
8070
8071 /* Avoid dropping loop body profile counter to 0 because of zero count
8072 in loop's preheader. */
8073 if (!(freq_e == profile_count::zero ()))
8074 freq_e = freq_e.force_nonzero ();
8075 p = freq_e.apply_scale (new_est_niter + 1, 1).probability_in (freq_h);
8076 scale_loop_frequencies (loop, p);
8077 }
8078
8079 edge exit_e = single_exit (loop);
8080 exit_e->probability = profile_probability::always ()
8081 .apply_scale (1, new_est_niter + 1);
8082
8083 edge exit_l = single_pred_edge (loop->latch);
8084 profile_probability prob = exit_l->probability;
8085 exit_l->probability = exit_e->probability.invert ();
8086 if (prob.initialized_p () && exit_l->probability.initialized_p ())
8087 scale_bbs_frequencies (&loop->latch, 1, exit_l->probability / prob);
8088 }
8089
8090 /* Vectorize STMT_INFO if relevant, inserting any new instructions before GSI.
8091 When vectorizing STMT_INFO as a store, set *SEEN_STORE to its
8092 stmt_vec_info. */
8093
8094 static void
8095 vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
8096 gimple_stmt_iterator *gsi, stmt_vec_info *seen_store)
8097 {
8098 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8099 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8100
8101 if (dump_enabled_p ())
8102 dump_printf_loc (MSG_NOTE, vect_location,
8103 "------>vectorizing statement: %G", stmt_info->stmt);
8104
8105 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8106 vect_loop_kill_debug_uses (loop, stmt_info);
8107
8108 if (!STMT_VINFO_RELEVANT_P (stmt_info)
8109 && !STMT_VINFO_LIVE_P (stmt_info))
8110 return;
8111
8112 if (STMT_VINFO_VECTYPE (stmt_info))
8113 {
8114 poly_uint64 nunits
8115 = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
8116 if (!STMT_SLP_TYPE (stmt_info)
8117 && maybe_ne (nunits, vf)
8118 && dump_enabled_p ())
8119 /* For SLP the VF is set according to the unrolling factor, and not
8120 to the vector size, hence for SLP this print is not valid. */
8121 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8122 }
8123
8124 /* Pure SLP statements have already been vectorized. We still need
8125 to apply loop vectorization to hybrid SLP statements. */
8126 if (PURE_SLP_STMT (stmt_info))
8127 return;
8128
8129 if (dump_enabled_p ())
8130 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
8131
8132 if (vect_transform_stmt (stmt_info, gsi, NULL, NULL))
8133 *seen_store = stmt_info;
8134 }
8135
8136 /* Function vect_transform_loop.
8137
8138 The analysis phase has determined that the loop is vectorizable.
8139 Vectorize the loop - create vectorized stmts to replace the scalar
8140 stmts in the loop, and update the loop exit condition.
8141 Returns the scalar epilogue loop if any. */
8142
8143 struct loop *
8144 vect_transform_loop (loop_vec_info loop_vinfo)
8145 {
8146 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8147 struct loop *epilogue = NULL;
8148 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
8149 int nbbs = loop->num_nodes;
8150 int i;
8151 tree niters_vector = NULL_TREE;
8152 tree step_vector = NULL_TREE;
8153 tree niters_vector_mult_vf = NULL_TREE;
8154 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8155 unsigned int lowest_vf = constant_lower_bound (vf);
8156 gimple *stmt;
8157 bool check_profitability = false;
8158 unsigned int th;
8159
8160 DUMP_VECT_SCOPE ("vec_transform_loop");
8161
8162 loop_vinfo->shared->check_datarefs ();
8163
8164 /* Use the more conservative vectorization threshold. If the number
8165 of iterations is constant assume the cost check has been performed
8166 by our caller. If the threshold makes all loops profitable that
8167 run at least the (estimated) vectorization factor number of times
8168 checking is pointless, too. */
8169 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
8170 if (th >= vect_vf_for_cost (loop_vinfo)
8171 && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
8172 {
8173 if (dump_enabled_p ())
8174 dump_printf_loc (MSG_NOTE, vect_location,
8175 "Profitability threshold is %d loop iterations.\n",
8176 th);
8177 check_profitability = true;
8178 }
8179
8180 /* Make sure there exists a single-predecessor exit bb. Do this before
8181 versioning. */
8182 edge e = single_exit (loop);
8183 if (! single_pred_p (e->dest))
8184 {
8185 split_loop_exit_edge (e, true);
8186 if (dump_enabled_p ())
8187 dump_printf (MSG_NOTE, "split exit edge\n");
8188 }
8189
8190 /* Version the loop first, if required, so the profitability check
8191 comes first. */
8192
8193 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
8194 {
8195 poly_uint64 versioning_threshold
8196 = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
8197 if (check_profitability
8198 && ordered_p (poly_uint64 (th), versioning_threshold))
8199 {
8200 versioning_threshold = ordered_max (poly_uint64 (th),
8201 versioning_threshold);
8202 check_profitability = false;
8203 }
8204 vect_loop_versioning (loop_vinfo, th, check_profitability,
8205 versioning_threshold);
8206 check_profitability = false;
8207 }
8208
8209 /* Make sure there exists a single-predecessor exit bb also on the
8210 scalar loop copy. Do this after versioning but before peeling
8211 so CFG structure is fine for both scalar and if-converted loop
8212 to make slpeel_duplicate_current_defs_from_edges face matched
8213 loop closed PHI nodes on the exit. */
8214 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
8215 {
8216 e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo));
8217 if (! single_pred_p (e->dest))
8218 {
8219 split_loop_exit_edge (e, true);
8220 if (dump_enabled_p ())
8221 dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
8222 }
8223 }
8224
8225 tree niters = vect_build_loop_niters (loop_vinfo);
8226 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
8227 tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
8228 bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
8229 epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
8230 &step_vector, &niters_vector_mult_vf, th,
8231 check_profitability, niters_no_overflow);
8232
8233 if (niters_vector == NULL_TREE)
8234 {
8235 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
8236 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8237 && known_eq (lowest_vf, vf))
8238 {
8239 niters_vector
8240 = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
8241 LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
8242 step_vector = build_one_cst (TREE_TYPE (niters));
8243 }
8244 else
8245 vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
8246 &step_vector, niters_no_overflow);
8247 }
8248
8249 /* 1) Make sure the loop header has exactly two entries
8250 2) Make sure we have a preheader basic block. */
8251
8252 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
8253
8254 split_edge (loop_preheader_edge (loop));
8255
8256 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8257 && vect_use_loop_mask_for_alignment_p (loop_vinfo))
8258 /* This will deal with any possible peeling. */
8259 vect_prepare_for_masked_peels (loop_vinfo);
8260
8261 /* Schedule the SLP instances first, then handle loop vectorization
8262 below. */
8263 if (!loop_vinfo->slp_instances.is_empty ())
8264 {
8265 DUMP_VECT_SCOPE ("scheduling SLP instances");
8266 vect_schedule_slp (loop_vinfo);
8267 }
8268
8269 /* FORNOW: the vectorizer supports only loops whose body consists
8270 of one basic block (header + empty latch). When the vectorizer
8271 supports more involved loop forms, the order in which the BBs are
8272 traversed will need to be reconsidered. */
8273
8274 for (i = 0; i < nbbs; i++)
8275 {
8276 basic_block bb = bbs[i];
8277 stmt_vec_info stmt_info;
8278
8279 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
8280 gsi_next (&si))
8281 {
8282 gphi *phi = si.phi ();
8283 if (dump_enabled_p ())
8284 dump_printf_loc (MSG_NOTE, vect_location,
8285 "------>vectorizing phi: %G", phi);
8286 stmt_info = loop_vinfo->lookup_stmt (phi);
8287 if (!stmt_info)
8288 continue;
8289
8290 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8291 vect_loop_kill_debug_uses (loop, stmt_info);
8292
8293 if (!STMT_VINFO_RELEVANT_P (stmt_info)
8294 && !STMT_VINFO_LIVE_P (stmt_info))
8295 continue;
8296
8297 if (STMT_VINFO_VECTYPE (stmt_info)
8298 && (maybe_ne
8299 (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
8300 && dump_enabled_p ())
8301 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8302
8303 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
8304 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
8305 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
8306 && ! PURE_SLP_STMT (stmt_info))
8307 {
8308 if (dump_enabled_p ())
8309 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
8310 vect_transform_stmt (stmt_info, NULL, NULL, NULL);
8311 }
8312 }
8313
8314 for (gimple_stmt_iterator si = gsi_start_bb (bb);
8315 !gsi_end_p (si);)
8316 {
8317 stmt = gsi_stmt (si);
8318 /* During vectorization remove existing clobber stmts. */
8319 if (gimple_clobber_p (stmt))
8320 {
8321 unlink_stmt_vdef (stmt);
8322 gsi_remove (&si, true);
8323 release_defs (stmt);
8324 }
8325 else
8326 {
8327 stmt_info = loop_vinfo->lookup_stmt (stmt);
8328
8329 /* vector stmts created in the outer-loop during vectorization of
8330 stmts in an inner-loop may not have a stmt_info, and do not
8331 need to be vectorized. */
8332 stmt_vec_info seen_store = NULL;
8333 if (stmt_info)
8334 {
8335 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
8336 {
8337 gimple *def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
8338 for (gimple_stmt_iterator subsi = gsi_start (def_seq);
8339 !gsi_end_p (subsi); gsi_next (&subsi))
8340 {
8341 stmt_vec_info pat_stmt_info
8342 = loop_vinfo->lookup_stmt (gsi_stmt (subsi));
8343 vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
8344 &si, &seen_store);
8345 }
8346 stmt_vec_info pat_stmt_info
8347 = STMT_VINFO_RELATED_STMT (stmt_info);
8348 vect_transform_loop_stmt (loop_vinfo, pat_stmt_info, &si,
8349 &seen_store);
8350 }
8351 vect_transform_loop_stmt (loop_vinfo, stmt_info, &si,
8352 &seen_store);
8353 }
8354 gsi_next (&si);
8355 if (seen_store)
8356 {
8357 if (STMT_VINFO_GROUPED_ACCESS (seen_store))
8358 /* Interleaving. The vectorization of the
8359 interleaving chain was completed - free all
8360 the stores in the chain. */
8361 vect_remove_stores (DR_GROUP_FIRST_ELEMENT (seen_store));
8362 else
8363 /* Free the attached stmt_vec_info and remove the stmt. */
8364 loop_vinfo->remove_stmt (stmt_info);
8365 }
8366 }
8367 }
8368
8369 /* Stub out scalar statements that must not survive vectorization.
8370 Doing this here helps with grouped statements, or statements that
8371 are involved in patterns. */
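/* For illustration (SSA names made up): a scalar statement like
_5 = MASK_LOAD (ptr_3, 0B, mask_7);
that must not survive vectorization is replaced by
_5 = 0;
so that no scalar masked load remains in the loop body. */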
8372 for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
8373 !gsi_end_p (gsi); gsi_next (&gsi))
8374 {
8375 gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
8376 if (call && gimple_call_internal_p (call, IFN_MASK_LOAD))
8377 {
8378 tree lhs = gimple_get_lhs (call);
8379 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
8380 {
8381 tree zero = build_zero_cst (TREE_TYPE (lhs));
8382 gimple *new_stmt = gimple_build_assign (lhs, zero);
8383 gsi_replace (&gsi, new_stmt, true);
8384 }
8385 }
8386 }
8387 } /* BBs in loop */
8388
8389 /* The vectorization factor is always > 1, so if we use an IV increment of 1,
8390 a zero NITERS becomes a nonzero NITERS_VECTOR. */
8391 if (integer_onep (step_vector))
8392 niters_no_overflow = true;
8393 vect_set_loop_condition (loop, loop_vinfo, niters_vector, step_vector,
8394 niters_vector_mult_vf, !niters_no_overflow);
8395
8396 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
8397 scale_profile_for_vect_loop (loop, assumed_vf);
8398
8399 /* True if the final iteration might not handle a full vector's
8400 worth of scalar iterations. */
8401 bool final_iter_may_be_partial = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
8402 /* The minimum number of iterations performed by the epilogue. This
8403 is 1 when peeling for gaps because we always need a final scalar
8404 iteration. */
8405 int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
8406 /* +1 to convert latch counts to loop iteration counts,
8407 -min_epilogue_iters to remove iterations that cannot be performed
8408 by the vector code. */
8409 int bias_for_lowest = 1 - min_epilogue_iters;
8410 int bias_for_assumed = bias_for_lowest;
8411 int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
8412 if (alignment_npeels && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
8413 {
8414 /* When the amount of peeling is known at compile time, the first
8415 iteration will have exactly alignment_npeels active elements.
8416 In the worst case it will have at least one. */
8417 int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
8418 bias_for_lowest += lowest_vf - min_first_active;
8419 bias_for_assumed += assumed_vf - min_first_active;
8420 }
8421 /* In these calculations the "- 1" converts loop iteration counts
8422 back to latch counts. */
8423 if (loop->any_upper_bound)
8424 loop->nb_iterations_upper_bound
8425 = (final_iter_may_be_partial
8426 ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
8427 lowest_vf) - 1
8428 : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
8429 lowest_vf) - 1);
8430 if (loop->any_likely_upper_bound)
8431 loop->nb_iterations_likely_upper_bound
8432 = (final_iter_may_be_partial
8433 ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
8434 + bias_for_lowest, lowest_vf) - 1
8435 : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
8436 + bias_for_lowest, lowest_vf) - 1);
8437 if (loop->any_estimate)
8438 loop->nb_iterations_estimate
8439 = (final_iter_may_be_partial
8440 ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
8441 assumed_vf) - 1
8442 : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
8443 assumed_vf) - 1);
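/* Worked example (illustrative numbers only): with a latch-count upper
bound of 99 (100 iterations), lowest_vf == 4, no peeling for gaps and
no full masking, bias_for_lowest is 1 and the new bound becomes
floor ((99 + 1) / 4) - 1 == 24, i.e. at most 25 vector iterations. */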
8444
8445 if (dump_enabled_p ())
8446 {
8447 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
8448 {
8449 dump_printf_loc (MSG_NOTE, vect_location,
8450 "LOOP VECTORIZED\n");
8451 if (loop->inner)
8452 dump_printf_loc (MSG_NOTE, vect_location,
8453 "OUTER LOOP VECTORIZED\n");
8454 dump_printf (MSG_NOTE, "\n");
8455 }
8456 else
8457 {
8458 dump_printf_loc (MSG_NOTE, vect_location,
8459 "LOOP EPILOGUE VECTORIZED (VS=");
8460 dump_dec (MSG_NOTE, current_vector_size);
8461 dump_printf (MSG_NOTE, ")\n");
8462 }
8463 }
8464
8465 /* Loops vectorized with a variable factor won't benefit from
8466 unrolling/peeling. */
8467 if (!vf.is_constant ())
8468 {
8469 loop->unroll = 1;
8470 if (dump_enabled_p ())
8471 dump_printf_loc (MSG_NOTE, vect_location, "Disabling unrolling due to"
8472 " variable-length vectorization factor\n");
8473 }
8474 /* Free SLP instances here because otherwise stmt reference counting
8475 won't work. */
8476 slp_instance instance;
8477 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
8478 vect_free_slp_instance (instance, true);
8479 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
8480 /* Clear the safelen field since its value is invalid after vectorization:
8481 the vectorized loop can have loop-carried dependencies. */
8482 loop->safelen = 0;
8483
8484 /* Don't vectorize the epilogue of an epilogue loop. */
8485 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
8486 epilogue = NULL;
8487
8488 if (!PARAM_VALUE (PARAM_VECT_EPILOGUES_NOMASK))
8489 epilogue = NULL;
8490
8491 if (epilogue)
8492 {
8493 auto_vector_sizes vector_sizes;
8494 targetm.vectorize.autovectorize_vector_sizes (&vector_sizes);
8495 unsigned int next_size = 0;
8496
8497 /* Note LOOP_VINFO_NITERS_KNOWN_P and LOOP_VINFO_INT_NITERS work
8498 on niters already adjusted for the iterations of the prologue. */
8499 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
8500 && known_eq (vf, lowest_vf))
8501 {
8502 unsigned HOST_WIDE_INT eiters
8503 = (LOOP_VINFO_INT_NITERS (loop_vinfo)
8504 - LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo));
8505 eiters
8506 = eiters % lowest_vf + LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo);
8507 epilogue->nb_iterations_upper_bound = eiters - 1;
8508 epilogue->any_upper_bound = true;
8509
8510 unsigned int ratio;
8511 while (next_size < vector_sizes.length ()
8512 && !(constant_multiple_p (current_vector_size,
8513 vector_sizes[next_size], &ratio)
8514 && eiters >= lowest_vf / ratio))
8515 next_size += 1;
8516 }
8517 else
8518 while (next_size < vector_sizes.length ()
8519 && maybe_lt (current_vector_size, vector_sizes[next_size]))
8520 next_size += 1;
8521
8522 if (next_size == vector_sizes.length ())
8523 epilogue = NULL;
8524 }
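/* Worked example (hypothetical target and numbers): with NITERS == 100,
lowest_vf == 8 and no peeling for gaps, eiters is 100 % 8 == 4, so the
epilogue executes at most 4 iterations. A vector size that is half of
current_vector_size gives ratio == 2 and 8 / 2 == 4 <= eiters, so that
size would be chosen; if no suitable smaller size exists, EPILOGUE is
set to NULL and the epilogue is left scalar. */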
8525
8526 if (epilogue)
8527 {
8528 epilogue->force_vectorize = loop->force_vectorize;
8529 epilogue->safelen = loop->safelen;
8530 epilogue->dont_vectorize = false;
8531
8532 /* We may need to if-convert the epilogue to vectorize it. */
8533 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
8534 tree_if_conversion (epilogue);
8535 }
8536
8537 return epilogue;
8538 }
8539
8540 /* The code below performs a simple optimization - it reverts
8541 if-conversion for masked stores: if the mask of a store is all zeros,
8542 skip the store and, if possible, the producers of the stored values too.
8543 For example,
8544 for (i=0; i<n; i++)
8545 if (c[i])
8546 {
8547 p1[i] += 1;
8548 p2[i] = p3[i] +2;
8549 }
8550 this transformation will produce the following semi-hammock:
8551
8552 if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
8553 {
8554 vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
8555 vect__12.22_172 = vect__11.19_170 + vect_cst__171;
8556 MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
8557 vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
8558 vect__19.28_184 = vect__18.25_182 + vect_cst__183;
8559 MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
8560 }
8561 */
8562
8563 void
8564 optimize_mask_stores (struct loop *loop)
8565 {
8566 basic_block *bbs = get_loop_body (loop);
8567 unsigned nbbs = loop->num_nodes;
8568 unsigned i;
8569 basic_block bb;
8570 struct loop *bb_loop;
8571 gimple_stmt_iterator gsi;
8572 gimple *stmt;
8573 auto_vec<gimple *> worklist;
8574 auto_purge_vect_location sentinel;
8575
8576 vect_location = find_loop_location (loop);
8577 /* Collect all masked stores in the loop, if any. */
8578 for (i = 0; i < nbbs; i++)
8579 {
8580 bb = bbs[i];
8581 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
8582 gsi_next (&gsi))
8583 {
8584 stmt = gsi_stmt (gsi);
8585 if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
8586 worklist.safe_push (stmt);
8587 }
8588 }
8589
8590 free (bbs);
8591 if (worklist.is_empty ())
8592 return;
8593
8594 /* Loop has masked stores. */
8595 while (!worklist.is_empty ())
8596 {
8597 gimple *last, *last_store;
8598 edge e, efalse;
8599 tree mask;
8600 basic_block store_bb, join_bb;
8601 gimple_stmt_iterator gsi_to;
8602 tree vdef, new_vdef;
8603 gphi *phi;
8604 tree vectype;
8605 tree zero;
8606
8607 last = worklist.pop ();
8608 mask = gimple_call_arg (last, 2);
8609 bb = gimple_bb (last);
8610 /* Create the then-block and if-then structure in the CFG; the
8611 then-block belongs to the same loop as the if-block. That loop can
8612 be different from LOOP when a two-level loop nest is vectorized and
8613 the mask store belongs to the inner loop. */
8614 e = split_block (bb, last);
8615 bb_loop = bb->loop_father;
8616 gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
8617 join_bb = e->dest;
8618 store_bb = create_empty_bb (bb);
8619 add_bb_to_loop (store_bb, bb_loop);
8620 e->flags = EDGE_TRUE_VALUE;
8621 efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
8622 /* Give the edge to STORE_BB an unlikely probability. */
8623 efalse->probability = profile_probability::unlikely ();
8624 store_bb->count = efalse->count ();
8625 make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
8626 if (dom_info_available_p (CDI_DOMINATORS))
8627 set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
8628 if (dump_enabled_p ())
8629 dump_printf_loc (MSG_NOTE, vect_location,
8630 "Create new block %d to sink mask stores.",
8631 store_bb->index);
8632 /* Create vector comparison with boolean result. */
8633 vectype = TREE_TYPE (mask);
8634 zero = build_zero_cst (vectype);
8635 stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
8636 gsi = gsi_last_bb (bb);
8637 gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
8638 /* Create a new PHI node for the vdef of the last masked store:
8639 .MEM_2 = VDEF <.MEM_1>
8640 will be converted to
8641 .MEM_3 = VDEF <.MEM_1>
8642 and a new PHI node will be created in the join bb
8643 .MEM_2 = PHI <.MEM_1, .MEM_3>
8644 */
8645 vdef = gimple_vdef (last);
8646 new_vdef = make_ssa_name (gimple_vop (cfun), last);
8647 gimple_set_vdef (last, new_vdef);
8648 phi = create_phi_node (vdef, join_bb);
8649 add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
8650
8651 /* Move all masked stores with the same mask into STORE_BB if possible. */
8652 while (true)
8653 {
8654 gimple_stmt_iterator gsi_from;
8655 gimple *stmt1 = NULL;
8656
8657 /* Move masked store to STORE_BB. */
8658 last_store = last;
8659 gsi = gsi_for_stmt (last);
8660 gsi_from = gsi;
8661 /* Shift GSI to the previous stmt for further traversal. */
8662 gsi_prev (&gsi);
8663 gsi_to = gsi_start_bb (store_bb);
8664 gsi_move_before (&gsi_from, &gsi_to);
8665 /* Set GSI_TO to the start of the now non-empty STORE_BB. */
8666 gsi_to = gsi_start_bb (store_bb);
8667 if (dump_enabled_p ())
8668 dump_printf_loc (MSG_NOTE, vect_location,
8669 "Move stmt to created bb\n%G", last);
8670 /* Move all stored value producers if possible. */
8671 while (!gsi_end_p (gsi))
8672 {
8673 tree lhs;
8674 imm_use_iterator imm_iter;
8675 use_operand_p use_p;
8676 bool res;
8677
8678 /* Skip debug statements. */
8679 if (is_gimple_debug (gsi_stmt (gsi)))
8680 {
8681 gsi_prev (&gsi);
8682 continue;
8683 }
8684 stmt1 = gsi_stmt (gsi);
8685 /* Do not consider statements writing to memory or having
8686 a volatile operand. */
8687 if (gimple_vdef (stmt1)
8688 || gimple_has_volatile_ops (stmt1))
8689 break;
8690 gsi_from = gsi;
8691 gsi_prev (&gsi);
8692 lhs = gimple_get_lhs (stmt1);
8693 if (!lhs)
8694 break;
8695
8696 /* LHS of vectorized stmt must be SSA_NAME. */
8697 if (TREE_CODE (lhs) != SSA_NAME)
8698 break;
8699
8700 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
8701 {
8702 /* Remove dead scalar statement. */
8703 if (has_zero_uses (lhs))
8704 {
8705 gsi_remove (&gsi_from, true);
8706 continue;
8707 }
8708 }
8709
8710 /* Check that LHS does not have uses outside of STORE_BB. */
8711 res = true;
8712 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
8713 {
8714 gimple *use_stmt;
8715 use_stmt = USE_STMT (use_p);
8716 if (is_gimple_debug (use_stmt))
8717 continue;
8718 if (gimple_bb (use_stmt) != store_bb)
8719 {
8720 res = false;
8721 break;
8722 }
8723 }
8724 if (!res)
8725 break;
8726
8727 if (gimple_vuse (stmt1)
8728 && gimple_vuse (stmt1) != gimple_vuse (last_store))
8729 break;
8730
8731 /* Can move STMT1 to STORE_BB. */
8732 if (dump_enabled_p ())
8733 dump_printf_loc (MSG_NOTE, vect_location,
8734 "Move stmt to created bb\n%G", stmt1);
8735 gsi_move_before (&gsi_from, &gsi_to);
8736 /* Shift GSI_TO for further insertion. */
8737 gsi_prev (&gsi_to);
8738 }
8739 /* Move other masked stores with the same mask into STORE_BB. */
8740 if (worklist.is_empty ()
8741 || gimple_call_arg (worklist.last (), 2) != mask
8742 || worklist.last () != stmt1)
8743 break;
8744 last = worklist.pop ();
8745 }
8746 add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
8747 }
8748 }