/* SLP - Basic Block Vectorization
   Copyright (C) 2007-2019 Free Software Foundation, Inc.
   Contributed by Dorit Naishlos <dorit@il.ibm.com>
   and Ira Rosen <irar@il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */
21
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "tree-pass.h"
31 #include "ssa.h"
32 #include "optabs-tree.h"
33 #include "insn-config.h"
34 #include "recog.h" /* FIXME: for insn_data */
35 #include "params.h"
36 #include "fold-const.h"
37 #include "stor-layout.h"
38 #include "gimple-iterator.h"
39 #include "cfgloop.h"
40 #include "tree-vectorizer.h"
41 #include "langhooks.h"
42 #include "gimple-walk.h"
43 #include "dbgcnt.h"
44 #include "tree-vector-builder.h"
45 #include "vec-perm-indices.h"
46 #include "gimple-fold.h"
47 #include "internal-fn.h"
48
49
/* Recursively free the memory allocated for the SLP tree rooted at NODE.
   FINAL_P is true if we have vectorized the instance or if we have
   made a final decision not to vectorize the statements in any way.  */

static void
vect_free_slp_tree (slp_tree node, bool final_p)
{
  int i;
  slp_tree child;

  if (--node->refcnt != 0)
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_free_slp_tree (child, final_p);

  /* Don't update STMT_VINFO_NUM_SLP_USES if it isn't relevant.
     Some statements might no longer exist, after having been
     removed by vect_transform_stmt.  Updating the remaining
     statements would be redundant.  */
  if (!final_p)
    {
      stmt_vec_info stmt_info;
      FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
        {
          gcc_assert (STMT_VINFO_NUM_SLP_USES (stmt_info) > 0);
          STMT_VINFO_NUM_SLP_USES (stmt_info)--;
        }
    }

  SLP_TREE_CHILDREN (node).release ();
  SLP_TREE_SCALAR_STMTS (node).release ();
  SLP_TREE_SCALAR_OPS (node).release ();
  SLP_TREE_VEC_STMTS (node).release ();
  SLP_TREE_LOAD_PERMUTATION (node).release ();

  free (node);
}

/* Free the memory allocated for the SLP instance.  FINAL_P is true if we
   have vectorized the instance or if we have made a final decision not
   to vectorize the statements in any way.  */

void
vect_free_slp_instance (slp_instance instance, bool final_p)
{
  vect_free_slp_tree (SLP_INSTANCE_TREE (instance), final_p);
  SLP_INSTANCE_LOADS (instance).release ();
  free (instance);
}


/* Create an SLP node for SCALAR_STMTS.  */

static slp_tree
vect_create_new_slp_node (vec<stmt_vec_info> scalar_stmts)
{
  slp_tree node;
  stmt_vec_info stmt_info = scalar_stmts[0];
  unsigned int nops;

  if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
    nops = gimple_call_num_args (stmt);
  else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
    {
      nops = gimple_num_ops (stmt) - 1;
      if (gimple_assign_rhs_code (stmt) == COND_EXPR)
        nops++;
    }
  else if (is_a <gphi *> (stmt_info->stmt))
    nops = 0;
  else
    return NULL;

  node = XNEW (struct _slp_tree);
  SLP_TREE_SCALAR_STMTS (node) = scalar_stmts;
  SLP_TREE_SCALAR_OPS (node) = vNULL;
  SLP_TREE_VEC_STMTS (node).create (0);
  SLP_TREE_NUMBER_OF_VEC_STMTS (node) = 0;
  SLP_TREE_CHILDREN (node).create (nops);
  SLP_TREE_LOAD_PERMUTATION (node) = vNULL;
  SLP_TREE_TWO_OPERATORS (node) = false;
  SLP_TREE_DEF_TYPE (node) = vect_internal_def;
  node->refcnt = 1;
  node->max_nunits = 1;

  unsigned i;
  FOR_EACH_VEC_ELT (scalar_stmts, i, stmt_info)
    STMT_VINFO_NUM_SLP_USES (stmt_info)++;

  return node;
}

/* Create an SLP node for OPS.  */

static slp_tree
vect_create_new_slp_node (vec<tree> ops)
{
  slp_tree node;

  node = XNEW (struct _slp_tree);
  SLP_TREE_SCALAR_STMTS (node) = vNULL;
  SLP_TREE_SCALAR_OPS (node) = ops;
  SLP_TREE_VEC_STMTS (node).create (0);
  SLP_TREE_NUMBER_OF_VEC_STMTS (node) = 0;
  SLP_TREE_CHILDREN (node) = vNULL;
  SLP_TREE_LOAD_PERMUTATION (node) = vNULL;
  SLP_TREE_TWO_OPERATORS (node) = false;
  SLP_TREE_DEF_TYPE (node) = vect_external_def;
  node->refcnt = 1;
  node->max_nunits = 1;

  return node;
}


/* This structure is used in creation of an SLP tree.  Each instance
   corresponds to the same operand in a group of scalar stmts in an SLP
   node.  */
typedef struct _slp_oprnd_info
{
  /* Def-stmts for the operands.  */
  vec<stmt_vec_info> def_stmts;
  /* Operands.  */
  vec<tree> ops;
  /* Information about the first statement: its vector def-type, the type
     of its operand, and an indication of whether any def-stmt is a
     pattern stmt.  */
  tree first_op_type;
  enum vect_def_type first_dt;
  bool any_pattern;
} *slp_oprnd_info;


/* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
   operand.  */
static vec<slp_oprnd_info>
vect_create_oprnd_info (int nops, int group_size)
{
  int i;
  slp_oprnd_info oprnd_info;
  vec<slp_oprnd_info> oprnds_info;

  oprnds_info.create (nops);
  for (i = 0; i < nops; i++)
    {
      oprnd_info = XNEW (struct _slp_oprnd_info);
      oprnd_info->def_stmts.create (group_size);
      oprnd_info->ops.create (group_size);
      oprnd_info->first_dt = vect_uninitialized_def;
      oprnd_info->first_op_type = NULL_TREE;
      oprnd_info->any_pattern = false;
      oprnds_info.quick_push (oprnd_info);
    }

  return oprnds_info;
}


/* Free operands info.  */

static void
vect_free_oprnd_info (vec<slp_oprnd_info> &oprnds_info)
{
  int i;
  slp_oprnd_info oprnd_info;

  FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
    {
      oprnd_info->def_stmts.release ();
      oprnd_info->ops.release ();
      XDELETE (oprnd_info);
    }

  oprnds_info.release ();
}


/* Find the place of the data-ref in STMT_INFO in the interleaving chain
   that starts from FIRST_STMT_INFO.  Return -1 if the data-ref is not a part
   of the chain.  */

int
vect_get_place_in_interleaving_chain (stmt_vec_info stmt_info,
                                      stmt_vec_info first_stmt_info)
{
  stmt_vec_info next_stmt_info = first_stmt_info;
  int result = 0;

  if (first_stmt_info != DR_GROUP_FIRST_ELEMENT (stmt_info))
    return -1;

  do
    {
      if (next_stmt_info == stmt_info)
        return result;
      next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
      if (next_stmt_info)
        result += DR_GROUP_GAP (next_stmt_info);
    }
  while (next_stmt_info);

  return -1;
}

/* Check whether it is possible to load COUNT elements of type ELT_MODE
   using the method implemented by duplicate_and_interleave.  Return true
   if so, returning the number of intermediate vectors in *NVECTORS_OUT
   (if nonnull) and the type of each intermediate vector in *VECTOR_TYPE_OUT
   (if nonnull).  */

bool
can_duplicate_and_interleave_p (vec_info *vinfo, unsigned int count,
                                machine_mode elt_mode,
                                unsigned int *nvectors_out,
                                tree *vector_type_out,
                                tree *permutes)
{
  poly_int64 elt_bytes = count * GET_MODE_SIZE (elt_mode);
  poly_int64 nelts;
  unsigned int nvectors = 1;
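  /* Search for a usable configuration: start by packing all COUNT scalars
     into a single integer element, and if no vector mode with that element
     size supports the permutes we need, halve the element size and double
     the number of intermediate vectors until one does (or the element size
     can no longer be halved).  */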
  for (;;)
    {
      scalar_int_mode int_mode;
      poly_int64 elt_bits = elt_bytes * BITS_PER_UNIT;
      if (multiple_p (vinfo->vector_size, elt_bytes, &nelts)
          && int_mode_for_size (elt_bits, 0).exists (&int_mode))
        {
          tree int_type = build_nonstandard_integer_type
            (GET_MODE_BITSIZE (int_mode), 1);
          tree vector_type = build_vector_type (int_type, nelts);
          if (VECTOR_MODE_P (TYPE_MODE (vector_type)))
            {
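              /* SEL1 interleaves the low halves of the two input vectors
                 (elements 0, NELTS, 1, NELTS + 1, ...) and SEL2 the high
                 halves; each variable-length mask is encoded as two
                 interleaved series of three elements each.  */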
              vec_perm_builder sel1 (nelts, 2, 3);
              vec_perm_builder sel2 (nelts, 2, 3);
              poly_int64 half_nelts = exact_div (nelts, 2);
              for (unsigned int i = 0; i < 3; ++i)
                {
                  sel1.quick_push (i);
                  sel1.quick_push (i + nelts);
                  sel2.quick_push (half_nelts + i);
                  sel2.quick_push (half_nelts + i + nelts);
                }
              vec_perm_indices indices1 (sel1, 2, nelts);
              vec_perm_indices indices2 (sel2, 2, nelts);
              if (can_vec_perm_const_p (TYPE_MODE (vector_type), indices1)
                  && can_vec_perm_const_p (TYPE_MODE (vector_type), indices2))
                {
                  if (nvectors_out)
                    *nvectors_out = nvectors;
                  if (vector_type_out)
                    *vector_type_out = vector_type;
                  if (permutes)
                    {
                      permutes[0] = vect_gen_perm_mask_checked (vector_type,
                                                                indices1);
                      permutes[1] = vect_gen_perm_mask_checked (vector_type,
                                                                indices2);
                    }
                  return true;
                }
            }
        }
      if (!multiple_p (elt_bytes, 2, &elt_bytes))
        return false;
      nvectors *= 2;
    }
}

/* Get the defs for the rhs of STMT (collect them in OPRNDS_INFO), check that
   they are of a valid type and that they match the defs of the first stmt of
   the SLP group (stored in OPRNDS_INFO).  This function tries to match stmts
   by swapping operands of STMTS[STMT_NUM] when possible.  Non-zero *SWAP
   indicates swap is required for cond_expr stmts.  Specifically, *SWAP
   is 1 if STMT is cond and operands of comparison need to be swapped;
   *SWAP is 2 if STMT is cond and code of comparison needs to be inverted.
   If there is any operand swap in this function, *SWAP is set to non-zero
   value.
   If there was a fatal error return -1; if the error could be corrected by
   swapping operands of the parent node of this one, return 1; if everything
   is OK return 0.  */
static int
vect_get_and_check_slp_defs (vec_info *vinfo, unsigned char *swap,
                             vec<stmt_vec_info> stmts, unsigned stmt_num,
                             vec<slp_oprnd_info> *oprnds_info)
{
  stmt_vec_info stmt_info = stmts[stmt_num];
  tree oprnd;
  unsigned int i, number_of_oprnds;
  enum vect_def_type dt = vect_uninitialized_def;
  slp_oprnd_info oprnd_info;
  int first_op_idx = 1;
  unsigned int commutative_op = -1U;
  bool first_op_cond = false;
  bool first = stmt_num == 0;

  if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
    {
      number_of_oprnds = gimple_call_num_args (stmt);
      first_op_idx = 3;
      if (gimple_call_internal_p (stmt))
        {
          internal_fn ifn = gimple_call_internal_fn (stmt);
          commutative_op = first_commutative_argument (ifn);

          /* Masked load, only look at mask.  */
          if (ifn == IFN_MASK_LOAD)
            {
              number_of_oprnds = 1;
              /* Mask operand index.  */
              first_op_idx = 5;
            }
        }
    }
  else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
    {
      enum tree_code code = gimple_assign_rhs_code (stmt);
      number_of_oprnds = gimple_num_ops (stmt) - 1;
      /* Swap can only be done for cond_expr if asked to, otherwise we
         could end up with a different comparison code from the first
         stmt.  */
      if (code == COND_EXPR
          && COMPARISON_CLASS_P (gimple_assign_rhs1 (stmt)))
        {
          first_op_cond = true;
          number_of_oprnds++;
        }
      else
        commutative_op = commutative_tree_code (code) ? 0U : -1U;
    }
  else
    return -1;

  bool swapped = (*swap != 0);
  gcc_assert (!swapped || first_op_cond);
  for (i = 0; i < number_of_oprnds; i++)
    {
again:
      if (first_op_cond)
        {
          /* Map indicating how operands of cond_expr should be swapped:
             maps[0] leaves them in place, maps[1] swaps the two comparison
             operands, and maps[2] swaps the then/else arms.  */
          int maps[3][4] = {{0, 1, 2, 3}, {1, 0, 2, 3}, {0, 1, 3, 2}};
          int *map = maps[*swap];

          if (i < 2)
            oprnd = TREE_OPERAND (gimple_op (stmt_info->stmt,
                                             first_op_idx), map[i]);
          else
            oprnd = gimple_op (stmt_info->stmt, map[i]);
        }
      else
        oprnd = gimple_op (stmt_info->stmt, first_op_idx + (swapped ? !i : i));
      if (TREE_CODE (oprnd) == VIEW_CONVERT_EXPR)
        oprnd = TREE_OPERAND (oprnd, 0);

      oprnd_info = (*oprnds_info)[i];

      stmt_vec_info def_stmt_info;
      if (!vect_is_simple_use (oprnd, vinfo, &dt, &def_stmt_info))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "Build SLP failed: can't analyze def for %T\n",
                             oprnd);

          return -1;
        }

      if (def_stmt_info && is_pattern_stmt_p (def_stmt_info))
        oprnd_info->any_pattern = true;

      if (first)
        {
          /* For the swapping logic below force vect_reduction_def
             for the reduction op in an SLP reduction group.  */
          if (!STMT_VINFO_DATA_REF (stmt_info)
              && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
              && (int)i == STMT_VINFO_REDUC_IDX (stmt_info)
              && def_stmt_info)
            dt = vect_reduction_def;
          oprnd_info->first_dt = dt;
          oprnd_info->first_op_type = TREE_TYPE (oprnd);
        }
      else
        {
          /* Not first stmt of the group, check that the def-stmt/s match
             the def-stmt/s of the first stmt.  Allow different definition
             types for reduction chains: the first stmt must be a
             vect_reduction_def (a phi node), and the rest must be part of
             the reduction chain.  */
          tree type = TREE_TYPE (oprnd);
          if ((oprnd_info->first_dt != dt
               && !(oprnd_info->first_dt == vect_reduction_def
                    && !STMT_VINFO_DATA_REF (stmt_info)
                    && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
                    && def_stmt_info
                    && !STMT_VINFO_DATA_REF (def_stmt_info)
                    && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
                        == REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
               && !((oprnd_info->first_dt == vect_external_def
                     || oprnd_info->first_dt == vect_constant_def)
                    && (dt == vect_external_def
                        || dt == vect_constant_def)))
              || !types_compatible_p (oprnd_info->first_op_type, type)
              || (!STMT_VINFO_DATA_REF (stmt_info)
                  && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
                  && ((!def_stmt_info
                       || STMT_VINFO_DATA_REF (def_stmt_info)
                       || (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
                           != REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
                      != (oprnd_info->first_dt != vect_reduction_def))))
            {
              /* Try swapping operands if we got a mismatch.  */
              if (i == commutative_op && !swapped)
                {
                  if (dump_enabled_p ())
                    dump_printf_loc (MSG_NOTE, vect_location,
                                     "trying swapped operands\n");
                  swapped = true;
                  goto again;
                }

              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "Build SLP failed: different types\n");

              return 1;
            }
          if ((dt == vect_constant_def
               || dt == vect_external_def)
              && !vinfo->vector_size.is_constant ()
              && (TREE_CODE (type) == BOOLEAN_TYPE
                  || !can_duplicate_and_interleave_p (vinfo, stmts.length (),
                                                      TYPE_MODE (type))))
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "Build SLP failed: invalid type of def "
                                 "for variable-length SLP %T\n", oprnd);
              return -1;
            }
        }

      /* Check the types of the definitions.  */
      switch (dt)
        {
        case vect_external_def:
          /* Make sure to demote the overall operand to external.  */
          oprnd_info->first_dt = vect_external_def;
          /* Fallthru.  */
        case vect_constant_def:
          oprnd_info->def_stmts.quick_push (NULL);
          oprnd_info->ops.quick_push (oprnd);
          break;

        case vect_internal_def:
        case vect_reduction_def:
          if (oprnd_info->first_dt == vect_reduction_def
              && !STMT_VINFO_DATA_REF (stmt_info)
              && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
              && !STMT_VINFO_DATA_REF (def_stmt_info)
              && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
                  == REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
            {
              /* For an SLP reduction chain we want to duplicate the
                 reduction to each of the chain members.  That gets
                 us a sane SLP graph (still the stmts are not 100%
                 correct wrt the initial values).  */
              gcc_assert (!first);
              oprnd_info->def_stmts.quick_push (oprnd_info->def_stmts[0]);
              oprnd_info->ops.quick_push (oprnd_info->ops[0]);
              break;
            }
          /* Fallthru.  */
        case vect_induction_def:
          oprnd_info->def_stmts.quick_push (def_stmt_info);
          oprnd_info->ops.quick_push (oprnd);
          break;

        default:
          /* FORNOW: Not supported.  */
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "Build SLP failed: illegal type of def %T\n",
                             oprnd);

          return -1;
        }
    }

  /* Swap operands.  */
  if (swapped)
    {
      if (first_op_cond)
        {
          /* If there are already uses of this stmt in an SLP instance then
             we've committed to the operand order and can't swap it.  */
          if (STMT_VINFO_NUM_SLP_USES (stmt_info) != 0)
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "Build SLP failed: cannot swap operands of "
                                 "shared stmt %G", stmt_info->stmt);
              return -1;
            }

          /* To get rid of this swapping we have to move the stmt code
             to the SLP tree as well (and gather it here per stmt).  */
          gassign *stmt = as_a <gassign *> (stmt_info->stmt);
          tree cond = gimple_assign_rhs1 (stmt);
          enum tree_code code = TREE_CODE (cond);

          /* Swap.  */
          if (*swap == 1)
            {
              swap_ssa_operands (stmt, &TREE_OPERAND (cond, 0),
                                 &TREE_OPERAND (cond, 1));
              TREE_SET_CODE (cond, swap_tree_comparison (code));
            }
          /* Invert.  */
          else
            {
              swap_ssa_operands (stmt, gimple_assign_rhs2_ptr (stmt),
                                 gimple_assign_rhs3_ptr (stmt));
              if (STMT_VINFO_REDUC_IDX (stmt_info) == 1)
                STMT_VINFO_REDUC_IDX (stmt_info) = 2;
              else if (STMT_VINFO_REDUC_IDX (stmt_info) == 2)
                STMT_VINFO_REDUC_IDX (stmt_info) = 1;
              bool honor_nans = HONOR_NANS (TREE_OPERAND (cond, 0));
              code = invert_tree_comparison (TREE_CODE (cond), honor_nans);
              gcc_assert (code != ERROR_MARK);
              TREE_SET_CODE (cond, code);
            }
        }
      else
        {
          /* Commutative ops need not reflect swapping, ops are in
             the SLP tree.  */
        }
      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "swapped operands to match def types in %G",
                         stmt_info->stmt);
    }

  *swap = swapped;
  return 0;
}

/* Return true if call statements CALL1 and CALL2 are similar enough
   to be combined into the same SLP group.  */

static bool
compatible_calls_p (gcall *call1, gcall *call2)
{
  unsigned int nargs = gimple_call_num_args (call1);
  if (nargs != gimple_call_num_args (call2))
    return false;

  if (gimple_call_combined_fn (call1) != gimple_call_combined_fn (call2))
    return false;

  if (gimple_call_internal_p (call1))
    {
      if (!types_compatible_p (TREE_TYPE (gimple_call_lhs (call1)),
                               TREE_TYPE (gimple_call_lhs (call2))))
        return false;
      for (unsigned int i = 0; i < nargs; ++i)
        if (!types_compatible_p (TREE_TYPE (gimple_call_arg (call1, i)),
                                 TREE_TYPE (gimple_call_arg (call2, i))))
          return false;
    }
  else
    {
      if (!operand_equal_p (gimple_call_fn (call1),
                            gimple_call_fn (call2), 0))
        return false;

      if (gimple_call_fntype (call1) != gimple_call_fntype (call2))
        return false;
    }
  return true;
}

/* A subroutine of vect_build_slp_tree for checking VECTYPE, which is the
   caller's attempt to find the vector type in STMT_INFO with the narrowest
   element type.  Return true if VECTYPE is nonnull and if it is valid
   for STMT_INFO.  When returning true, update MAX_NUNITS to reflect the
   number of units in VECTYPE.  GROUP_SIZE and MAX_NUNITS are as for
   vect_build_slp_tree.  */

static bool
vect_record_max_nunits (stmt_vec_info stmt_info, unsigned int group_size,
                        tree vectype, poly_uint64 *max_nunits)
{
  if (!vectype)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "Build SLP failed: unsupported data-type in %G\n",
                         stmt_info->stmt);
      /* Fatal mismatch.  */
      return false;
    }

  /* If populating the vector type requires unrolling then fail
     before adjusting *max_nunits for basic-block vectorization.  */
  poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
  unsigned HOST_WIDE_INT const_nunits;
  if (STMT_VINFO_BB_VINFO (stmt_info)
      && (!nunits.is_constant (&const_nunits)
          || const_nunits > group_size))
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "Build SLP failed: unrolling required "
                         "in basic block SLP\n");
      /* Fatal mismatch.  */
      return false;
    }

  /* In case of multiple types we need to detect the smallest type.  */
  vect_update_max_nunits (max_nunits, vectype);
  return true;
}

/* STMTS is a group of GROUP_SIZE SLP statements in which some
   statements do the same operation as the first statement and in which
   the others do ALT_STMT_CODE.  Return true if we can take one vector
   of the first operation and one vector of the second and permute them
   to get the required result.  VECTYPE is the type of the vector that
   would be permuted.  */

static bool
vect_two_operations_perm_ok_p (vec<stmt_vec_info> stmts,
                               unsigned int group_size, tree vectype,
                               tree_code alt_stmt_code)
{
  unsigned HOST_WIDE_INT count;
  if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&count))
    return false;

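  /* Build a blend mask that picks element I from the first vector for
     stmts doing the first operation and element I + COUNT from the second
     vector for stmts using ALT_STMT_CODE.  E.g. with COUNT == 4 and a
     PLUS/MINUS group {+, -, +, -} this yields the mask {0, 5, 2, 7}.  */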
  vec_perm_builder sel (count, count, 1);
  for (unsigned int i = 0; i < count; ++i)
    {
      unsigned int elt = i;
      gassign *stmt = as_a <gassign *> (stmts[i % group_size]->stmt);
      if (gimple_assign_rhs_code (stmt) == alt_stmt_code)
        elt += count;
      sel.quick_push (elt);
    }
  vec_perm_indices indices (sel, 2, count);
  return can_vec_perm_const_p (TYPE_MODE (vectype), indices);
}

/* Check whether the scalar stmts STMTS are isomorphic, whether they
   require data permutation, and whether they involve unsupported types
   of operation.  Return true if they can be vectorized together,
   otherwise return false and indicate in *MATCHES which stmts are not
   isomorphic to the first one.  If MATCHES[0] is false then this
   indicates the comparison could not be carried out or the stmts will
   never be vectorized by SLP.

   Note COND_EXPR is possibly isomorphic to another one after swapping its
   operands.  Set SWAP[i] to 1 if stmt I is COND_EXPR and isomorphic to
   the first stmt by swapping the two operands of comparison; set SWAP[i]
   to 2 if stmt I is isomorphic to the first stmt by inverting the code
   of comparison.  Take A1 >= B1 ? X1 : Y1 as an example: it can be swapped
   to (B1 <= A1 ? X1 : Y1); or be inverted to (A1 < B1) ? Y1 : X1.  */

static bool
vect_build_slp_tree_1 (unsigned char *swap,
                       vec<stmt_vec_info> stmts, unsigned int group_size,
                       poly_uint64 *max_nunits, bool *matches,
                       bool *two_operators)
{
  unsigned int i;
  stmt_vec_info first_stmt_info = stmts[0];
  enum tree_code first_stmt_code = ERROR_MARK;
  enum tree_code alt_stmt_code = ERROR_MARK;
  enum tree_code rhs_code = ERROR_MARK;
  enum tree_code first_cond_code = ERROR_MARK;
  tree lhs;
  bool need_same_oprnds = false;
  tree vectype = NULL_TREE, first_op1 = NULL_TREE;
  optab optab;
  int icode;
  machine_mode optab_op2_mode;
  machine_mode vec_mode;
  stmt_vec_info first_load = NULL, prev_first_load = NULL;
  bool load_p = false;

  /* For every stmt in NODE find its def stmt/s.  */
  stmt_vec_info stmt_info;
  FOR_EACH_VEC_ELT (stmts, i, stmt_info)
    {
      gimple *stmt = stmt_info->stmt;
      swap[i] = 0;
      matches[i] = false;

      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location, "Build SLP for %G", stmt);

      /* Fail to vectorize statements marked as unvectorizable.  */
      if (!STMT_VINFO_VECTORIZABLE (stmt_info))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "Build SLP failed: unvectorizable statement %G",
                             stmt);
          /* Fatal mismatch.  */
          matches[0] = false;
          return false;
        }

      lhs = gimple_get_lhs (stmt);
      if (lhs == NULL_TREE)
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "Build SLP failed: not GIMPLE_ASSIGN nor "
                             "GIMPLE_CALL %G", stmt);
          /* Fatal mismatch.  */
          matches[0] = false;
          return false;
        }

      tree nunits_vectype;
      if (!vect_get_vector_types_for_stmt (stmt_info, &vectype,
                                           &nunits_vectype)
          || (nunits_vectype
              && !vect_record_max_nunits (stmt_info, group_size,
                                          nunits_vectype, max_nunits)))
        {
          /* Fatal mismatch.  */
          matches[0] = false;
          return false;
        }

      gcc_assert (vectype);

      if (gcall *call_stmt = dyn_cast <gcall *> (stmt))
        {
          rhs_code = CALL_EXPR;

          if (gimple_call_internal_p (stmt, IFN_MASK_LOAD))
            load_p = true;
          else if ((gimple_call_internal_p (call_stmt)
                    && (!vectorizable_internal_fn_p
                        (gimple_call_internal_fn (call_stmt))))
                   || gimple_call_tail_p (call_stmt)
                   || gimple_call_noreturn_p (call_stmt)
                   || !gimple_call_nothrow_p (call_stmt)
                   || gimple_call_chain (call_stmt))
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "Build SLP failed: unsupported call type %G",
                                 call_stmt);
              /* Fatal mismatch.  */
              matches[0] = false;
              return false;
            }
        }
      else
        {
          rhs_code = gimple_assign_rhs_code (stmt);
          load_p = TREE_CODE_CLASS (rhs_code) == tcc_reference;
        }

      /* Check the operation.  */
      if (i == 0)
        {
          first_stmt_code = rhs_code;

          /* Shift arguments should be equal in all the packed stmts for a
             vector shift with scalar shift operand.  */
          if (rhs_code == LSHIFT_EXPR || rhs_code == RSHIFT_EXPR
              || rhs_code == LROTATE_EXPR
              || rhs_code == RROTATE_EXPR)
            {
              if (vectype == boolean_type_node)
                {
                  if (dump_enabled_p ())
                    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                     "Build SLP failed: shift of a"
                                     " boolean.\n");
                  /* Fatal mismatch.  */
                  matches[0] = false;
                  return false;
                }

              vec_mode = TYPE_MODE (vectype);

              /* First see if we have a vector/vector shift.  */
              optab = optab_for_tree_code (rhs_code, vectype,
                                           optab_vector);

              if (!optab
                  || optab_handler (optab, vec_mode) == CODE_FOR_nothing)
                {
                  /* No vector/vector shift, try for a vector/scalar shift.  */
                  optab = optab_for_tree_code (rhs_code, vectype,
                                               optab_scalar);

                  if (!optab)
                    {
                      if (dump_enabled_p ())
                        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                         "Build SLP failed: no optab.\n");
                      /* Fatal mismatch.  */
                      matches[0] = false;
                      return false;
                    }
                  icode = (int) optab_handler (optab, vec_mode);
                  if (icode == CODE_FOR_nothing)
                    {
                      if (dump_enabled_p ())
                        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                         "Build SLP failed: "
                                         "op not supported by target.\n");
                      /* Fatal mismatch.  */
                      matches[0] = false;
                      return false;
                    }
                  optab_op2_mode = insn_data[icode].operand[2].mode;
                  if (!VECTOR_MODE_P (optab_op2_mode))
                    {
                      need_same_oprnds = true;
                      first_op1 = gimple_assign_rhs2 (stmt);
                    }
                }
            }
          else if (rhs_code == WIDEN_LSHIFT_EXPR)
            {
              need_same_oprnds = true;
              first_op1 = gimple_assign_rhs2 (stmt);
            }
        }
      else
        {
          if (first_stmt_code != rhs_code
              && alt_stmt_code == ERROR_MARK)
            alt_stmt_code = rhs_code;
          if (first_stmt_code != rhs_code
              && (first_stmt_code != IMAGPART_EXPR
                  || rhs_code != REALPART_EXPR)
              && (first_stmt_code != REALPART_EXPR
                  || rhs_code != IMAGPART_EXPR)
              /* Handle mismatches in plus/minus by computing both
                 and merging the results.  */
              && !((first_stmt_code == PLUS_EXPR
                    || first_stmt_code == MINUS_EXPR)
                   && (alt_stmt_code == PLUS_EXPR
                       || alt_stmt_code == MINUS_EXPR)
                   && rhs_code == alt_stmt_code)
              && !(STMT_VINFO_GROUPED_ACCESS (stmt_info)
                   && (first_stmt_code == ARRAY_REF
                       || first_stmt_code == BIT_FIELD_REF
                       || first_stmt_code == INDIRECT_REF
                       || first_stmt_code == COMPONENT_REF
                       || first_stmt_code == MEM_REF)))
            {
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "Build SLP failed: different operation "
                                   "in stmt %G", stmt);
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "original stmt %G", first_stmt_info->stmt);
                }
              /* Mismatch.  */
              continue;
            }

          if (need_same_oprnds
              && !operand_equal_p (first_op1, gimple_assign_rhs2 (stmt), 0))
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "Build SLP failed: different shift "
                                 "arguments in %G", stmt);
              /* Mismatch.  */
              continue;
            }

          if (!load_p && rhs_code == CALL_EXPR)
            {
              if (!compatible_calls_p (as_a <gcall *> (stmts[0]->stmt),
                                       as_a <gcall *> (stmt)))
                {
                  if (dump_enabled_p ())
                    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                     "Build SLP failed: different calls in %G",
                                     stmt);
                  /* Mismatch.  */
                  continue;
                }
            }
        }

      /* Grouped store or load.  */
      if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
        {
          if (REFERENCE_CLASS_P (lhs))
            {
              /* Store.  */
              ;
            }
          else
            {
              /* Load.  */
              first_load = DR_GROUP_FIRST_ELEMENT (stmt_info);
              if (prev_first_load)
                {
                  /* Check that there are no loads from different interleaving
                     chains in the same node.  */
                  if (prev_first_load != first_load)
                    {
                      if (dump_enabled_p ())
                        dump_printf_loc (MSG_MISSED_OPTIMIZATION,
                                         vect_location,
                                         "Build SLP failed: different "
                                         "interleaving chains in one node %G",
                                         stmt);
                      /* Mismatch.  */
                      continue;
                    }
                }
              else
                prev_first_load = first_load;
            }
        } /* Grouped access.  */
      else
        {
          if (load_p)
            {
              /* Non-grouped load.  */
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "Build SLP failed: not grouped load %G", stmt);

              /* FORNOW: Non-grouped loads are not supported.  */
              /* Fatal mismatch.  */
              matches[0] = false;
              return false;
            }

          /* Not a memory operation.  */
          if (TREE_CODE_CLASS (rhs_code) != tcc_binary
              && TREE_CODE_CLASS (rhs_code) != tcc_unary
              && TREE_CODE_CLASS (rhs_code) != tcc_expression
              && TREE_CODE_CLASS (rhs_code) != tcc_comparison
              && rhs_code != VIEW_CONVERT_EXPR
              && rhs_code != CALL_EXPR)
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "Build SLP failed: operation unsupported %G",
                                 stmt);
              /* Fatal mismatch.  */
              matches[0] = false;
              return false;
            }

          if (rhs_code == COND_EXPR)
            {
              tree cond_expr = gimple_assign_rhs1 (stmt);
              enum tree_code cond_code = TREE_CODE (cond_expr);
              enum tree_code swap_code = ERROR_MARK;
              enum tree_code invert_code = ERROR_MARK;

              if (i == 0)
                first_cond_code = TREE_CODE (cond_expr);
              else if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
                {
                  bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
                  swap_code = swap_tree_comparison (cond_code);
                  invert_code = invert_tree_comparison (cond_code, honor_nans);
                }

              if (first_cond_code == cond_code)
                ;
              /* Isomorphism can be achieved by swapping.  */
              else if (first_cond_code == swap_code)
                swap[i] = 1;
              /* Isomorphism can be achieved by inverting.  */
              else if (first_cond_code == invert_code)
                swap[i] = 2;
              else
                {
                  if (dump_enabled_p ())
                    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                     "Build SLP failed: different"
                                     " operation %G", stmt);
                  /* Mismatch.  */
                  continue;
                }
            }
        }

      matches[i] = true;
    }

  for (i = 0; i < group_size; ++i)
    if (!matches[i])
      return false;

  /* If we allowed a two-operation SLP node verify the target can cope
     with the permute we are going to use.  */
  if (alt_stmt_code != ERROR_MARK
      && TREE_CODE_CLASS (alt_stmt_code) != tcc_reference)
    {
      if (vectype == boolean_type_node
          || !vect_two_operations_perm_ok_p (stmts, group_size,
                                             vectype, alt_stmt_code))
        {
          for (i = 0; i < group_size; ++i)
            if (gimple_assign_rhs_code (stmts[i]->stmt) == alt_stmt_code)
              {
                matches[i] = false;
                if (dump_enabled_p ())
                  {
                    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                     "Build SLP failed: different operation "
                                     "in stmt %G", stmts[i]->stmt);
                    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                     "original stmt %G", first_stmt_info->stmt);
                  }
              }
          return false;
        }
      *two_operators = true;
    }

  return true;
}

/* Traits for the hash_set to record failed SLP builds for a stmt set.
   Note we never remove apart from at destruction time so we do not
   need a special value for deleted that differs from empty.  */
struct bst_traits
{
  typedef vec <stmt_vec_info> value_type;
  typedef vec <stmt_vec_info> compare_type;
  static inline hashval_t hash (value_type);
  static inline bool equal (value_type existing, value_type candidate);
  static inline bool is_empty (value_type x) { return !x.exists (); }
  static inline bool is_deleted (value_type x) { return !x.exists (); }
  static inline void mark_empty (value_type &x) { x.release (); }
  static inline void mark_deleted (value_type &x) { x.release (); }
  static inline void remove (value_type &x) { x.release (); }
};
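/* Hash a stmt group by the UIDs of its stmts, so equal groups reached from
   different parts of the SLP graph map to the same entry.  */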
inline hashval_t
bst_traits::hash (value_type x)
{
  inchash::hash h;
  for (unsigned i = 0; i < x.length (); ++i)
    h.add_int (gimple_uid (x[i]->stmt));
  return h.end ();
}
inline bool
bst_traits::equal (value_type existing, value_type candidate)
{
  if (existing.length () != candidate.length ())
    return false;
  for (unsigned i = 0; i < existing.length (); ++i)
    if (existing[i] != candidate[i])
      return false;
  return true;
}

typedef hash_map <vec <gimple *>, slp_tree,
                  simple_hashmap_traits <bst_traits, slp_tree> >
  scalar_stmts_to_slp_tree_map_t;

static slp_tree
vect_build_slp_tree_2 (vec_info *vinfo,
                       vec<stmt_vec_info> stmts, unsigned int group_size,
                       poly_uint64 *max_nunits,
                       bool *matches, unsigned *npermutes, unsigned *tree_size,
                       scalar_stmts_to_slp_tree_map_t *bst_map);

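/* Build an SLP tree for the stmt group STMTS.  This is a memoizing wrapper
   around vect_build_slp_tree_2: a result recorded in BST_MAP for the same
   stmts (including a recorded failure) is re-used, bumping its reference
   count, and any newly built tree is entered into the map.  */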
static slp_tree
vect_build_slp_tree (vec_info *vinfo,
                     vec<stmt_vec_info> stmts, unsigned int group_size,
                     poly_uint64 *max_nunits,
                     bool *matches, unsigned *npermutes, unsigned *tree_size,
                     scalar_stmts_to_slp_tree_map_t *bst_map)
{
  if (slp_tree *leader = bst_map->get (stmts))
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location, "re-using %sSLP tree %p\n",
                         *leader ? "" : "failed ", *leader);
      if (*leader)
        {
          (*leader)->refcnt++;
          vect_update_max_nunits (max_nunits, (*leader)->max_nunits);
        }
      return *leader;
    }
  poly_uint64 this_max_nunits = 1;
  slp_tree res = vect_build_slp_tree_2 (vinfo, stmts, group_size,
                                        &this_max_nunits,
                                        matches, npermutes, tree_size, bst_map);
  if (res)
    {
      res->max_nunits = this_max_nunits;
      vect_update_max_nunits (max_nunits, this_max_nunits);
      /* Keep a reference for the bst_map use.  */
      res->refcnt++;
    }
  bst_map->put (stmts.copy (), res);
  return res;
}

/* Recursively build an SLP tree starting from NODE.
   Fail (and return NULL) if def-stmts are not isomorphic, require data
   permutation or are of unsupported types of operation.  Otherwise
   return the built SLP tree; *MATCHES indicates which stmts matched
   the first one.  */

static slp_tree
vect_build_slp_tree_2 (vec_info *vinfo,
                       vec<stmt_vec_info> stmts, unsigned int group_size,
                       poly_uint64 *max_nunits,
                       bool *matches, unsigned *npermutes, unsigned *tree_size,
                       scalar_stmts_to_slp_tree_map_t *bst_map)
{
  unsigned nops, i, this_tree_size = 0;
  poly_uint64 this_max_nunits = *max_nunits;
  slp_tree node;

  matches[0] = false;

  stmt_vec_info stmt_info = stmts[0];
  if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
    nops = gimple_call_num_args (stmt);
  else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
    {
      nops = gimple_num_ops (stmt) - 1;
      if (gimple_assign_rhs_code (stmt) == COND_EXPR)
        nops++;
    }
  else if (is_a <gphi *> (stmt_info->stmt))
    nops = 0;
  else
    return NULL;

  /* If the SLP node is a PHI (induction or reduction), terminate
     the recursion.  */
  if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
    {
      tree scalar_type = TREE_TYPE (PHI_RESULT (stmt));
      tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type);
      if (!vect_record_max_nunits (stmt_info, group_size, vectype, max_nunits))
        return NULL;

      vect_def_type def_type = STMT_VINFO_DEF_TYPE (stmt_info);
      /* Induction from different IVs is not supported.  */
      if (def_type == vect_induction_def)
        {
          stmt_vec_info other_info;
          FOR_EACH_VEC_ELT (stmts, i, other_info)
            if (stmt_info != other_info)
              return NULL;
        }
      else if (def_type == vect_reduction_def
               || def_type == vect_double_reduction_def
               || def_type == vect_nested_cycle)
        {
          /* Else def types have to match.  */
          stmt_vec_info other_info;
          FOR_EACH_VEC_ELT (stmts, i, other_info)
            if (STMT_VINFO_DEF_TYPE (other_info) != def_type)
              return NULL;
        }
      else
        return NULL;
      (*tree_size)++;
      node = vect_create_new_slp_node (stmts);
      return node;
    }


  bool two_operators = false;
  unsigned char *swap = XALLOCAVEC (unsigned char, group_size);
  if (!vect_build_slp_tree_1 (swap, stmts, group_size,
                              &this_max_nunits, matches, &two_operators))
    return NULL;

  /* If the SLP node is a load, terminate the recursion unless masked.  */
  if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
      && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
    {
      if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
        {
          /* Masked load.  */
          gcc_assert (gimple_call_internal_p (stmt, IFN_MASK_LOAD));
          nops = 1;
        }
      else
        {
          *max_nunits = this_max_nunits;
          (*tree_size)++;
          node = vect_create_new_slp_node (stmts);
          return node;
        }
    }

  /* Get at the operands, verifying they are compatible.  */
  vec<slp_oprnd_info> oprnds_info = vect_create_oprnd_info (nops, group_size);
  slp_oprnd_info oprnd_info;
  FOR_EACH_VEC_ELT (stmts, i, stmt_info)
    {
      int res = vect_get_and_check_slp_defs (vinfo, &swap[i],
                                             stmts, i, &oprnds_info);
      if (res != 0)
        matches[(res == -1) ? 0 : i] = false;
      if (!matches[0])
        break;
    }
  for (i = 0; i < group_size; ++i)
    if (!matches[i])
      {
        vect_free_oprnd_info (oprnds_info);
        return NULL;
      }

  auto_vec<slp_tree, 4> children;

  stmt_info = stmts[0];

  /* Create SLP_TREE nodes for the definition node/s.  */
  FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
    {
      slp_tree child;
      unsigned old_tree_size = this_tree_size;
      unsigned int j;

      if (oprnd_info->first_dt == vect_uninitialized_def)
        {
          /* COND_EXPRs end up with one operand slot too many here if the
             condition is an SSA name rather than a comparison.  */
          gcc_assert (i == 3 && nops == 4);
          continue;
        }

      if (oprnd_info->first_dt != vect_internal_def
          && oprnd_info->first_dt != vect_reduction_def
          && oprnd_info->first_dt != vect_induction_def)
        {
          slp_tree invnode = vect_create_new_slp_node (oprnd_info->ops);
          SLP_TREE_DEF_TYPE (invnode) = oprnd_info->first_dt;
          oprnd_info->ops = vNULL;
          children.safe_push (invnode);
          continue;
        }

      if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
                                        group_size, &this_max_nunits,
                                        matches, npermutes,
                                        &this_tree_size, bst_map)) != NULL)
        {
          /* If we have all children of child built up from scalars then just
             throw that away and build this node up from scalars instead.  */
          if (is_a <bb_vec_info> (vinfo)
              && !SLP_TREE_CHILDREN (child).is_empty ()
              /* ??? Rejecting patterns this way doesn't work.  We'd have to
                 do extra work to cancel the pattern so the uses see the
                 scalar version.  */
              && !oprnd_info->any_pattern)
            {
              slp_tree grandchild;

              FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild)
                if (SLP_TREE_DEF_TYPE (grandchild) != vect_external_def)
                  break;
              if (!grandchild)
                {
                  /* Roll back.  */
                  this_tree_size = old_tree_size;
                  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild)
                    vect_free_slp_tree (grandchild, false);
                  SLP_TREE_CHILDREN (child).truncate (0);

                  if (dump_enabled_p ())
                    dump_printf_loc (MSG_NOTE, vect_location,
                                     "Building parent vector operands from "
                                     "scalars instead\n");
                  oprnd_info->def_stmts = vNULL;
                  SLP_TREE_DEF_TYPE (child) = vect_external_def;
                  SLP_TREE_SCALAR_OPS (child) = oprnd_info->ops;
                  oprnd_info->ops = vNULL;
                  ++this_tree_size;
                  children.safe_push (child);
                  continue;
                }
            }

          oprnd_info->def_stmts = vNULL;
          children.safe_push (child);
          continue;
        }

      /* If the SLP build failed fatally and we analyze a basic-block
         simply treat nodes we fail to build as externally defined
         (and thus build vectors from the scalar defs).
         The cost model will reject outright expensive cases.
         ??? This doesn't treat cases where permutation ultimately
         fails (or we don't try permutation below).  Ideally we'd
         even compute a permutation that will end up with the maximum
         SLP tree size...  */
      if (is_a <bb_vec_info> (vinfo)
          && !matches[0]
          /* ??? Rejecting patterns this way doesn't work.  We'd have to
             do extra work to cancel the pattern so the uses see the
             scalar version.  */
          && !is_pattern_stmt_p (stmt_info)
          && !oprnd_info->any_pattern)
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "Building vector operands from scalars\n");
          this_tree_size++;
          child = vect_create_new_slp_node (oprnd_info->def_stmts);
          SLP_TREE_DEF_TYPE (child) = vect_external_def;
          SLP_TREE_SCALAR_OPS (child) = oprnd_info->ops;
          children.safe_push (child);
          oprnd_info->ops = vNULL;
          oprnd_info->def_stmts = vNULL;
          continue;
        }

      /* If the SLP build for operand zero failed and operands zero
         and one can be commutated try that for the scalar stmts
         that failed the match.  */
      if (i == 0
          /* A first scalar stmt mismatch signals a fatal mismatch.  */
          && matches[0]
          /* ??? For COND_EXPRs we can swap the comparison operands
             as well as the arms under some constraints.  */
          && nops == 2
          && oprnds_info[1]->first_dt == vect_internal_def
          && is_gimple_assign (stmt_info->stmt)
          /* Swapping operands for reductions breaks assumptions later on.  */
          && STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
          && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def
          /* Do so only if the number of unsuccessful permutes was not
             more than a cut-off, as re-trying the recursive match on
             possibly each level of the tree would expose exponential
             behavior.  */
          && *npermutes < 4)
        {
          /* See whether we can swap the matching or the non-matching
             stmt operands.  */
          bool swap_not_matching = true;
          do
            {
              for (j = 0; j < group_size; ++j)
                {
                  if (matches[j] != !swap_not_matching)
                    continue;
                  stmt_vec_info stmt_info = stmts[j];
                  /* Verify if we can swap operands of this stmt.  */
                  gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt);
                  if (!stmt
                      || !commutative_tree_code (gimple_assign_rhs_code (stmt)))
                    {
                      if (!swap_not_matching)
                        goto fail;
                      swap_not_matching = false;
                      break;
                    }
                }
            }
          while (j != group_size);

          /* Swap mismatched definition stmts.  */
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "Re-trying with swapped operands of stmts ");
          for (j = 0; j < group_size; ++j)
            if (matches[j] == !swap_not_matching)
              {
                std::swap (oprnds_info[0]->def_stmts[j],
                           oprnds_info[1]->def_stmts[j]);
                std::swap (oprnds_info[0]->ops[j],
                           oprnds_info[1]->ops[j]);
                if (dump_enabled_p ())
                  dump_printf (MSG_NOTE, "%d ", j);
              }
          if (dump_enabled_p ())
            dump_printf (MSG_NOTE, "\n");
          /* And try again with scratch 'matches' ...  */
          bool *tem = XALLOCAVEC (bool, group_size);
          if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
                                            group_size, &this_max_nunits,
                                            tem, npermutes,
                                            &this_tree_size, bst_map)) != NULL)
            {
              /* If we have all children of child built up from scalars then
                 just throw that away and build this node up from scalars
                 instead.  */
              if (is_a <bb_vec_info> (vinfo)
                  && !SLP_TREE_CHILDREN (child).is_empty ()
                  /* ??? Rejecting patterns this way doesn't work.  We'd have
                     to do extra work to cancel the pattern so the uses see the
                     scalar version.  */
                  && !oprnd_info->any_pattern)
                {
                  unsigned int j;
                  slp_tree grandchild;

                  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild)
                    if (SLP_TREE_DEF_TYPE (grandchild) != vect_external_def)
                      break;
                  if (!grandchild)
                    {
                      /* Roll back.  */
                      this_tree_size = old_tree_size;
                      FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild)
                        vect_free_slp_tree (grandchild, false);
                      SLP_TREE_CHILDREN (child).truncate (0);

                      if (dump_enabled_p ())
                        dump_printf_loc (MSG_NOTE, vect_location,
                                         "Building parent vector operands from "
                                         "scalars instead\n");
                      oprnd_info->def_stmts = vNULL;
                      SLP_TREE_DEF_TYPE (child) = vect_external_def;
                      SLP_TREE_SCALAR_OPS (child) = oprnd_info->ops;
                      oprnd_info->ops = vNULL;
                      ++this_tree_size;
                      children.safe_push (child);
                      continue;
                    }
                }

              oprnd_info->def_stmts = vNULL;
              children.safe_push (child);
              continue;
            }

          ++*npermutes;
        }

    fail:
      gcc_assert (child == NULL);
      FOR_EACH_VEC_ELT (children, j, child)
        vect_free_slp_tree (child, false);
      vect_free_oprnd_info (oprnds_info);
      return NULL;
    }

  vect_free_oprnd_info (oprnds_info);

  *tree_size += this_tree_size + 1;
  *max_nunits = this_max_nunits;

  node = vect_create_new_slp_node (stmts);
  SLP_TREE_TWO_OPERATORS (node) = two_operators;
  SLP_TREE_CHILDREN (node).splice (children);
  return node;
}

/* Dump an SLP tree NODE using flags specified in DUMP_KIND.  */

static void
vect_print_slp_tree (dump_flags_t dump_kind, dump_location_t loc,
                     slp_tree node, hash_set<slp_tree> &visited)
{
  unsigned i;
  stmt_vec_info stmt_info;
  slp_tree child;
  tree op;

  if (visited.add (node))
    return;

  dump_metadata_t metadata (dump_kind, loc.get_impl_location ());
  dump_user_location_t user_loc = loc.get_user_location ();
  dump_printf_loc (metadata, user_loc, "node%s %p (max_nunits=%u)\n",
                   SLP_TREE_DEF_TYPE (node) == vect_external_def
                   ? " (external)"
                   : (SLP_TREE_DEF_TYPE (node) == vect_constant_def
                      ? " (constant)"
                      : ""), node,
                   estimated_poly_value (node->max_nunits));
  if (SLP_TREE_SCALAR_STMTS (node).exists ())
    FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
      dump_printf_loc (metadata, user_loc, "\tstmt %u %G", i, stmt_info->stmt);
  else
    {
      dump_printf_loc (metadata, user_loc, "\t{ ");
      FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
        dump_printf (metadata, "%T%s ", op,
                     i < SLP_TREE_SCALAR_OPS (node).length () - 1 ? "," : "");
      dump_printf (metadata, "}\n");
    }
  if (SLP_TREE_CHILDREN (node).is_empty ())
    return;
  dump_printf_loc (metadata, user_loc, "\tchildren");
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    dump_printf (dump_kind, " %p", (void *)child);
  dump_printf (dump_kind, "\n");
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_print_slp_tree (dump_kind, loc, child, visited);
}

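/* Wrapper that sets up the visited set used to avoid dumping shared
   subtrees more than once.  */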
static void
vect_print_slp_tree (dump_flags_t dump_kind, dump_location_t loc,
                     slp_tree node)
{
  hash_set<slp_tree> visited;
  vect_print_slp_tree (dump_kind, loc, node, visited);
}

/* Mark the tree rooted at NODE with PURE_SLP.  */

static void
vect_mark_slp_stmts (slp_tree node, hash_set<slp_tree> &visited)
{
  int i;
  stmt_vec_info stmt_info;
  slp_tree child;

  if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
    return;

  if (visited.add (node))
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
    STMT_SLP_TYPE (stmt_info) = pure_slp;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_mark_slp_stmts (child, visited);
}

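/* Wrapper that creates the visited set for the recursive worker above.  */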
static void
vect_mark_slp_stmts (slp_tree node)
{
  hash_set<slp_tree> visited;
  vect_mark_slp_stmts (node, visited);
}

/* Mark the statements of the tree rooted at NODE as relevant (vect_used).  */

static void
vect_mark_slp_stmts_relevant (slp_tree node, hash_set<slp_tree> &visited)
{
  int i;
  stmt_vec_info stmt_info;
  slp_tree child;

  if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
    return;

  if (visited.add (node))
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
    {
      gcc_assert (!STMT_VINFO_RELEVANT (stmt_info)
                  || STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope);
      STMT_VINFO_RELEVANT (stmt_info) = vect_used_in_scope;
    }

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_mark_slp_stmts_relevant (child, visited);
}

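/* Likewise, a wrapper that creates the visited set before recursing.  */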
static void
vect_mark_slp_stmts_relevant (slp_tree node)
{
  hash_set<slp_tree> visited;
  vect_mark_slp_stmts_relevant (node, visited);
}


/* Rearrange the statements of NODE according to PERMUTATION, moving the
   scalar stmt or op at position I to position PERMUTATION[I].  Children
   are rearranged first.  */

static void
vect_slp_rearrange_stmts (slp_tree node, unsigned int group_size,
                          vec<unsigned> permutation,
                          hash_set<slp_tree> &visited)
{
  unsigned int i;
  slp_tree child;

  if (visited.add (node))
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_slp_rearrange_stmts (child, group_size, permutation, visited);

  if (SLP_TREE_SCALAR_STMTS (node).exists ())
    {
      gcc_assert (group_size == SLP_TREE_SCALAR_STMTS (node).length ());
      vec<stmt_vec_info> tmp_stmts;
      tmp_stmts.create (group_size);
      tmp_stmts.quick_grow (group_size);
      stmt_vec_info stmt_info;
      FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
        tmp_stmts[permutation[i]] = stmt_info;
      SLP_TREE_SCALAR_STMTS (node).release ();
      SLP_TREE_SCALAR_STMTS (node) = tmp_stmts;
    }
  if (SLP_TREE_SCALAR_OPS (node).exists ())
    {
      gcc_assert (group_size == SLP_TREE_SCALAR_OPS (node).length ());
      vec<tree> tmp_ops;
      tmp_ops.create (group_size);
      tmp_ops.quick_grow (group_size);
      tree op;
      FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
        tmp_ops[permutation[i]] = op;
      SLP_TREE_SCALAR_OPS (node).release ();
      SLP_TREE_SCALAR_OPS (node) = tmp_ops;
    }
}


/* Attempt to reorder stmts in a reduction chain so that we don't
   require any load permutation.  Return true if that was possible,
   otherwise return false.  For example, if every load node uses the
   same permutation, we can permute the scalar stmts of all nodes once
   and drop the load permutations entirely.  */

static bool
vect_attempt_slp_rearrange_stmts (slp_instance slp_instn)
{
  unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (slp_instn);
  unsigned int i, j;
  unsigned int lidx;
  slp_tree node, load;

  /* Compare all the permutation sequences to the first one.  We know
     that at least one load is permuted.  */
  node = SLP_INSTANCE_LOADS (slp_instn)[0];
  if (!node->load_permutation.exists ())
    return false;
  for (i = 1; SLP_INSTANCE_LOADS (slp_instn).iterate (i, &load); ++i)
    {
      if (!load->load_permutation.exists ())
        return false;
      FOR_EACH_VEC_ELT (load->load_permutation, j, lidx)
        if (lidx != node->load_permutation[j])
          return false;
    }

  /* Check that the loads in the first sequence are different and there
     are no gaps between them.  */
  auto_sbitmap load_index (group_size);
  bitmap_clear (load_index);
  FOR_EACH_VEC_ELT (node->load_permutation, i, lidx)
    {
      if (lidx >= group_size)
        return false;
      if (bitmap_bit_p (load_index, lidx))
        return false;

      bitmap_set_bit (load_index, lidx);
    }
  for (i = 0; i < group_size; i++)
    if (!bitmap_bit_p (load_index, i))
      return false;

  /* This permutation is valid for reduction.  Since the order of the
     statements in the nodes is not important unless they are memory
     accesses, we can rearrange the statements in all the nodes
     according to the order of the loads.  */
  hash_set<slp_tree> visited;
  vect_slp_rearrange_stmts (SLP_INSTANCE_TREE (slp_instn), group_size,
                            node->load_permutation, visited);

  /* We are done, no actual permutations need to be generated.  */
  poly_uint64 unrolling_factor = SLP_INSTANCE_UNROLLING_FACTOR (slp_instn);
  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
    {
      stmt_vec_info first_stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
      first_stmt_info = DR_GROUP_FIRST_ELEMENT (first_stmt_info);
      /* But we have to keep those permutations that are required because
         of handling of gaps.  */
      if (known_eq (unrolling_factor, 1U)
          || (group_size == DR_GROUP_SIZE (first_stmt_info)
              && DR_GROUP_GAP (first_stmt_info) == 0))
        SLP_TREE_LOAD_PERMUTATION (node).release ();
      else
        for (j = 0; j < SLP_TREE_LOAD_PERMUTATION (node).length (); ++j)
          SLP_TREE_LOAD_PERMUTATION (node)[j] = j;
    }

  return true;
}

/* Gather loads in the SLP graph NODE and populate the INST loads array.  */

static void
vect_gather_slp_loads (slp_instance inst, slp_tree node,
                       hash_set<slp_tree> &visited)
{
  if (visited.add (node))
    return;

  if (SLP_TREE_CHILDREN (node).length () == 0)
    {
      if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
        return;
      stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
      if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
          && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
        SLP_INSTANCE_LOADS (inst).safe_push (node);
    }
  else
    {
      unsigned i;
      slp_tree child;
      FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
        vect_gather_slp_loads (inst, child, visited);
    }
}

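/* Wrapper that creates the visited set for the recursive walk above.  */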
static void
vect_gather_slp_loads (slp_instance inst, slp_tree node)
{
  hash_set<slp_tree> visited;
  vect_gather_slp_loads (inst, node, visited);
}

/* Check if the required load permutations in the SLP instance
   SLP_INSTN are supported.  */

static bool
vect_supported_load_permutation_p (slp_instance slp_instn)
{
  unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (slp_instn);
  unsigned int i, j, k, next;
  slp_tree node;

  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location, "Load permutation ");
      FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
        if (node->load_permutation.exists ())
          FOR_EACH_VEC_ELT (node->load_permutation, j, next)
            dump_printf (MSG_NOTE, "%d ", next);
        else
          for (k = 0; k < group_size; ++k)
            dump_printf (MSG_NOTE, "%d ", k);
      dump_printf (MSG_NOTE, "\n");
    }
1799
1800 /* In case of reduction every load permutation is allowed, since the order
1801 of the reduction statements is not important (as opposed to the case of
1802 grouped stores). The only condition we need to check is that all the
1803 load nodes are of the same size and have the same permutation (and then
1804 rearrange all the nodes of the SLP instance according to this
1805 permutation). */
1806
1807 /* Check that all the load nodes are of the same size. */
1808 /* ??? Can't we assert this? */
1809 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
1810 if (SLP_TREE_SCALAR_STMTS (node).length () != (unsigned) group_size)
1811 return false;
1812
1813 node = SLP_INSTANCE_TREE (slp_instn);
1814 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
1815
1816 /* Reduction (there are no data-refs in the root).
1817 In a reduction chain the order of the loads is not important. */
1818 if (!STMT_VINFO_DATA_REF (stmt_info)
1819 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
1820 vect_attempt_slp_rearrange_stmts (slp_instn);
1821
1822 /* In basic block vectorization we allow any subchain of an interleaving
1823 chain.
1824 FORNOW: not supported in loop SLP because of realignment complications. */
1825 if (STMT_VINFO_BB_VINFO (stmt_info))
1826 {
1827 /* Check whether the loads in an instance form a subchain and thus
1828 no permutation is necessary. */
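/* E.g. (illustrative): loads of b[2] and b[3] from an interleaving
chain starting at b[0] are consecutive and gap-free, so they form a
subchain and the recorded load permutation can simply be dropped. */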
1829 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
1830 {
1831 if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
1832 continue;
1833 bool subchain_p = true;
1834 stmt_vec_info next_load_info = NULL;
1835 stmt_vec_info load_info;
1836 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
1837 {
1838 if (j != 0
1839 && (next_load_info != load_info
1840 || DR_GROUP_GAP (load_info) != 1))
1841 {
1842 subchain_p = false;
1843 break;
1844 }
1845 next_load_info = DR_GROUP_NEXT_ELEMENT (load_info);
1846 }
1847 if (subchain_p)
1848 SLP_TREE_LOAD_PERMUTATION (node).release ();
1849 else
1850 {
1851 stmt_vec_info group_info = SLP_TREE_SCALAR_STMTS (node)[0];
1852 group_info = DR_GROUP_FIRST_ELEMENT (group_info);
1853 unsigned HOST_WIDE_INT nunits;
1854 unsigned k, maxk = 0;
1855 FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (node), j, k)
1856 if (k > maxk)
1857 maxk = k;
1858 /* In BB vectorization we may not actually use a loaded vector
1859 accessing elements in excess of DR_GROUP_SIZE. */
1860 tree vectype = STMT_VINFO_VECTYPE (group_info);
1861 if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nunits)
1862 || maxk >= (DR_GROUP_SIZE (group_info) & ~(nunits - 1)))
1863 {
1864 if (dump_enabled_p ())
1865 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1866 "BB vectorization with gaps at the end of "
1867 "a load is not supported\n");
1868 return false;
1869 }
1870
1871 /* Verify the permutation can be generated. */
1872 vec<tree> tem;
1873 unsigned n_perms;
1874 if (!vect_transform_slp_perm_load (node, tem, NULL,
1875 1, slp_instn, true, &n_perms))
1876 {
1877 if (dump_enabled_p ())
1878 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
1879 vect_location,
1880 "unsupported load permutation\n");
1881 return false;
1882 }
1883 }
1884 }
1885 return true;
1886 }
1887
1888 /* For loop vectorization verify we can generate the permutation. Be
1889 conservative about the vectorization factor, there are permutations
1890 that will use three vector inputs only starting from a specific factor
1891 and the vectorization factor is not yet final.
1892 ??? The SLP instance unrolling factor might not be the maximum one. */
1893 unsigned n_perms;
1894 poly_uint64 test_vf
1895 = force_common_multiple (SLP_INSTANCE_UNROLLING_FACTOR (slp_instn),
1896 LOOP_VINFO_VECT_FACTOR
1897 (STMT_VINFO_LOOP_VINFO (stmt_info)));
1898 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
1899 if (node->load_permutation.exists ()
1900 && !vect_transform_slp_perm_load (node, vNULL, NULL, test_vf,
1901 slp_instn, true, &n_perms))
1902 return false;
1903
1904 return true;
1905 }
1906
1907
1908 /* Find the last scalar stmt in SLP node NODE, looking through pattern stmts. */
1909
1910 stmt_vec_info
1911 vect_find_last_scalar_stmt_in_slp (slp_tree node)
1912 {
1913 stmt_vec_info last = NULL;
1914 stmt_vec_info stmt_vinfo;
1915
1916 for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
1917 {
1918 stmt_vinfo = vect_orig_stmt (stmt_vinfo);
1919 last = last ? get_later_stmt (stmt_vinfo, last) : stmt_vinfo;
1920 }
1921
1922 return last;
1923 }
1924
1925 /* Splits a group of stores, currently beginning at FIRST_VINFO, into
1926 two groups: one (still beginning at FIRST_VINFO) of size GROUP1_SIZE
1927 (also containing the first GROUP1_SIZE stmts, since stores are
1928 consecutive), the second containing the remainder.
1929 Return the first stmt in the second group. */
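/* For example (illustrative): splitting a group of 6 stores with
GROUP1_SIZE == 4 yields a first group { S0, S1, S2, S3 } and a second
group { S4, S5 }; the DR_GROUP_GAP of each group is bumped so that it
also skips over the elements of the other group. */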
1930
1931 static stmt_vec_info
1932 vect_split_slp_store_group (stmt_vec_info first_vinfo, unsigned group1_size)
1933 {
1934 gcc_assert (DR_GROUP_FIRST_ELEMENT (first_vinfo) == first_vinfo);
1935 gcc_assert (group1_size > 0);
1936 int group2_size = DR_GROUP_SIZE (first_vinfo) - group1_size;
1937 gcc_assert (group2_size > 0);
1938 DR_GROUP_SIZE (first_vinfo) = group1_size;
1939
1940 stmt_vec_info stmt_info = first_vinfo;
1941 for (unsigned i = group1_size; i > 1; i--)
1942 {
1943 stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info);
1944 gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
1945 }
1946 /* STMT is now the last element of the first group. */
1947 stmt_vec_info group2 = DR_GROUP_NEXT_ELEMENT (stmt_info);
1948 DR_GROUP_NEXT_ELEMENT (stmt_info) = 0;
1949
1950 DR_GROUP_SIZE (group2) = group2_size;
1951 for (stmt_info = group2; stmt_info;
1952 stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info))
1953 {
1954 DR_GROUP_FIRST_ELEMENT (stmt_info) = group2;
1955 gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
1956 }
1957
1958 /* For the second group, the DR_GROUP_GAP is that before the original group,
1959 plus skipping over the first group. */
1960 DR_GROUP_GAP (group2) = DR_GROUP_GAP (first_vinfo) + group1_size;
1961
1962 /* DR_GROUP_GAP of the first group now has to skip over the second group too. */
1963 DR_GROUP_GAP (first_vinfo) += group2_size;
1964
1965 if (dump_enabled_p ())
1966 dump_printf_loc (MSG_NOTE, vect_location, "Split group into %d and %d\n",
1967 group1_size, group2_size);
1968
1969 return group2;
1970 }
1971
1972 /* Calculate the unrolling factor for an SLP instance with GROUP_SIZE
1973 statements and a vector of NUNITS elements. */
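/* E.g. (illustrative): NUNITS == 4 and GROUP_SIZE == 6 give
common_multiple (4, 6) == 12 and thus an unrolling factor of
12 / 6 == 2, i.e. two copies of the group fill whole vectors. */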
1974
1975 static poly_uint64
1976 calculate_unrolling_factor (poly_uint64 nunits, unsigned int group_size)
1977 {
1978 return exact_div (common_multiple (nunits, group_size), group_size);
1979 }
1980
1981 /* Analyze an SLP instance starting from a group of grouped stores. Call
1982 vect_build_slp_tree to build a tree of packed stmts if possible.
1983 Return FALSE if it's impossible to SLP any stmt in the loop. */
1984
1985 static bool
1986 vect_analyze_slp_instance (vec_info *vinfo,
1987 stmt_vec_info stmt_info, unsigned max_tree_size)
1988 {
1989 slp_instance new_instance;
1990 slp_tree node;
1991 unsigned int group_size;
1992 tree vectype, scalar_type = NULL_TREE;
1993 unsigned int i;
1994 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
1995 vec<stmt_vec_info> scalar_stmts;
1996
1997 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1998 {
1999 scalar_type = TREE_TYPE (DR_REF (dr));
2000 vectype = get_vectype_for_scalar_type (vinfo, scalar_type);
2001 group_size = DR_GROUP_SIZE (stmt_info);
2002 }
2003 else if (!dr && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
2004 {
2005 gcc_assert (is_a <loop_vec_info> (vinfo));
2006 vectype = STMT_VINFO_VECTYPE (stmt_info);
2007 group_size = REDUC_GROUP_SIZE (stmt_info);
2008 }
2009 else
2010 {
2011 gcc_assert (is_a <loop_vec_info> (vinfo));
2012 vectype = STMT_VINFO_VECTYPE (stmt_info);
2013 group_size = as_a <loop_vec_info> (vinfo)->reductions.length ();
2014 }
2015
2016 if (!vectype)
2017 {
2018 if (dump_enabled_p ())
2019 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2020 "Build SLP failed: unsupported data-type %T\n",
2021 scalar_type);
2022
2023 return false;
2024 }
2025 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
2026
2027 /* Create a node (a root of the SLP tree) for the packed grouped stores. */
2028 scalar_stmts.create (group_size);
2029 stmt_vec_info next_info = stmt_info;
2030 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2031 {
2032 /* Collect the stores and store them in SLP_TREE_SCALAR_STMTS. */
2033 while (next_info)
2034 {
2035 scalar_stmts.safe_push (vect_stmt_to_vectorize (next_info));
2036 next_info = DR_GROUP_NEXT_ELEMENT (next_info);
2037 }
2038 }
2039 else if (!dr && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
2040 {
2041 /* Collect the reduction stmts and store them in
2042 SLP_TREE_SCALAR_STMTS. */
2043 while (next_info)
2044 {
2045 scalar_stmts.safe_push (vect_stmt_to_vectorize (next_info));
2046 next_info = REDUC_GROUP_NEXT_ELEMENT (next_info);
2047 }
2048 /* Mark the first element of the reduction chain as reduction to properly
2049 transform the node. In the reduction analysis phase only the last
2050 element of the chain is marked as reduction. */
2051 STMT_VINFO_DEF_TYPE (stmt_info)
2052 = STMT_VINFO_DEF_TYPE (scalar_stmts.last ());
2053 STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))
2054 = STMT_VINFO_REDUC_DEF (vect_orig_stmt (scalar_stmts.last ()));
2055 }
2056 else
2057 {
2058 /* Collect reduction statements. */
2059 vec<stmt_vec_info> reductions = as_a <loop_vec_info> (vinfo)->reductions;
2060 for (i = 0; reductions.iterate (i, &next_info); i++)
2061 scalar_stmts.safe_push (next_info);
2062 }
2063
2064 /* Build the tree for the SLP instance. */
2065 bool *matches = XALLOCAVEC (bool, group_size);
2066 unsigned npermutes = 0;
2067 scalar_stmts_to_slp_tree_map_t *bst_map
2068 = new scalar_stmts_to_slp_tree_map_t ();
2069 poly_uint64 max_nunits = nunits;
2070 unsigned tree_size = 0;
2071 node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
2072 &max_nunits, matches, &npermutes,
2073 &tree_size, bst_map);
2074 /* The map keeps a reference on SLP nodes built, release that. */
2075 for (scalar_stmts_to_slp_tree_map_t::iterator it = bst_map->begin ();
2076 it != bst_map->end (); ++it)
2077 if ((*it).second)
2078 vect_free_slp_tree ((*it).second, false);
2079 delete bst_map;
2080 if (node != NULL)
2081 {
2082 /* If this is a reduction chain with a conversion in front,
2083 amend the SLP tree with a node for that. */
2084 if (!dr
2085 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
2086 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def)
2087 {
2088 /* Get at the conversion stmt - we know it's the single use
2089 of the last stmt of the reduction chain. */
2090 gimple *tem = vect_orig_stmt (scalar_stmts[group_size - 1])->stmt;
2091 use_operand_p use_p;
2092 gimple *use_stmt;
2093 bool r = single_imm_use (gimple_assign_lhs (tem), &use_p, &use_stmt);
2094 gcc_assert (r);
2095 next_info = vinfo->lookup_stmt (use_stmt);
2096 next_info = vect_stmt_to_vectorize (next_info);
2097 scalar_stmts = vNULL;
2098 scalar_stmts.create (group_size);
2099 for (unsigned i = 0; i < group_size; ++i)
2100 scalar_stmts.quick_push (next_info);
2101 slp_tree conv = vect_create_new_slp_node (scalar_stmts);
2102 SLP_TREE_CHILDREN (conv).quick_push (node);
2103 node = conv;
2104 /* We also have to fake this conversion stmt as SLP reduction group
2105 so we don't have to mess with too much code elsewhere. */
2106 REDUC_GROUP_FIRST_ELEMENT (next_info) = next_info;
2107 REDUC_GROUP_NEXT_ELEMENT (next_info) = NULL;
2108 }
2109
2110 /* Calculate the unrolling factor based on the smallest type. */
2111 poly_uint64 unrolling_factor
2112 = calculate_unrolling_factor (max_nunits, group_size);
2113
2114 if (maybe_ne (unrolling_factor, 1U)
2115 && is_a <bb_vec_info> (vinfo))
2116 {
2117 unsigned HOST_WIDE_INT const_max_nunits;
2118 if (!max_nunits.is_constant (&const_max_nunits)
2119 || const_max_nunits > group_size)
2120 {
2121 if (dump_enabled_p ())
2122 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2123 "Build SLP failed: store group "
2124 "size not a multiple of the vector size "
2125 "in basic block SLP\n");
2126 vect_free_slp_tree (node, false);
2127 return false;
2128 }
2129 /* Fatal mismatch. */
2130 matches[group_size / const_max_nunits * const_max_nunits] = false;
2131 vect_free_slp_tree (node, false);
2132 }
2133 else
2134 {
2135 /* Create a new SLP instance. */
2136 new_instance = XNEW (class _slp_instance);
2137 SLP_INSTANCE_TREE (new_instance) = node;
2138 SLP_INSTANCE_GROUP_SIZE (new_instance) = group_size;
2139 SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor;
2140 SLP_INSTANCE_LOADS (new_instance) = vNULL;
2141 vect_gather_slp_loads (new_instance, node);
2142 if (dump_enabled_p ())
2143 dump_printf_loc (MSG_NOTE, vect_location,
2144 "SLP size %u vs. limit %u.\n",
2145 tree_size, max_tree_size);
2146
2147 /* Compute the load permutation. */
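/* E.g. (illustrative): for the SLP group
a[0] = b[1]; a[1] = b[0];
the scalar stmts of the load node are the loads of b[1] and b[0],
whose places in the interleaving chain rooted at b[0] are 1 and 0,
so the computed load permutation is { 1, 0 }. */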
2148 slp_tree load_node;
2149 bool loads_permuted = false;
2150 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (new_instance), i, load_node)
2151 {
2152 vec<unsigned> load_permutation;
2153 int j;
2154 stmt_vec_info load_info;
2155 bool this_load_permuted = false;
2156 load_permutation.create (group_size);
2157 stmt_vec_info first_stmt_info = DR_GROUP_FIRST_ELEMENT
2158 (SLP_TREE_SCALAR_STMTS (load_node)[0]);
2159 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load_info)
2160 {
2161 int load_place = vect_get_place_in_interleaving_chain
2162 (load_info, first_stmt_info);
2163 gcc_assert (load_place != -1);
2164 if (load_place != j)
2165 this_load_permuted = true;
2166 load_permutation.safe_push (load_place);
2167 }
2168 if (!this_load_permuted
2169 /* The load requires permutation when unrolling exposes
2170 a gap either because the group is larger than the SLP
2171 group-size or because there is a gap between the groups. */
2172 && (known_eq (unrolling_factor, 1U)
2173 || (group_size == DR_GROUP_SIZE (first_stmt_info)
2174 && DR_GROUP_GAP (first_stmt_info) == 0)))
2175 {
2176 load_permutation.release ();
2177 continue;
2178 }
2179 SLP_TREE_LOAD_PERMUTATION (load_node) = load_permutation;
2180 loads_permuted = true;
2181 }
2182
2183 if (loads_permuted)
2184 {
2185 if (!vect_supported_load_permutation_p (new_instance))
2186 {
2187 if (dump_enabled_p ())
2188 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2189 "Build SLP failed: unsupported load "
2190 "permutation %G", stmt_info->stmt);
2191 vect_free_slp_instance (new_instance, false);
2192 return false;
2193 }
2194 }
2195
2196 /* If the loads and stores can be handled with load/store-lanes
2197 instructions, do not generate this SLP instance. */
2198 if (is_a <loop_vec_info> (vinfo)
2199 && loads_permuted
2200 && dr && vect_store_lanes_supported (vectype, group_size, false))
2201 {
2202 slp_tree load_node;
2203 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (new_instance), i, load_node)
2204 {
2205 stmt_vec_info stmt_vinfo = DR_GROUP_FIRST_ELEMENT
2206 (SLP_TREE_SCALAR_STMTS (load_node)[0]);
2207 /* Use SLP for strided accesses (or if we can't load-lanes). */
2208 if (STMT_VINFO_STRIDED_P (stmt_vinfo)
2209 || ! vect_load_lanes_supported
2210 (STMT_VINFO_VECTYPE (stmt_vinfo),
2211 DR_GROUP_SIZE (stmt_vinfo), false))
2212 break;
2213 }
2214 if (i == SLP_INSTANCE_LOADS (new_instance).length ())
2215 {
2216 if (dump_enabled_p ())
2217 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2218 "Built SLP cancelled: can use "
2219 "load/store-lanes\n");
2220 vect_free_slp_instance (new_instance, false);
2221 return false;
2222 }
2223 }
2224
2225 vinfo->slp_instances.safe_push (new_instance);
2226
2227 if (dump_enabled_p ())
2228 {
2229 dump_printf_loc (MSG_NOTE, vect_location,
2230 "Final SLP tree for instance:\n");
2231 vect_print_slp_tree (MSG_NOTE, vect_location, node);
2232 }
2233
2234 return true;
2235 }
2236 }
2237 else
2238 {
2239 /* Failed to SLP. */
2240 /* Free the allocated memory. */
2241 scalar_stmts.release ();
2242 }
2243
2244 /* For basic block SLP, try to break the group up into multiples of the
2245 vector size. */
2246 unsigned HOST_WIDE_INT const_nunits;
2247 if (is_a <bb_vec_info> (vinfo)
2248 && STMT_VINFO_GROUPED_ACCESS (stmt_info)
2249 && DR_GROUP_FIRST_ELEMENT (stmt_info)
2250 && nunits.is_constant (&const_nunits))
2251 {
2252 /* We consider breaking the group only on VF boundaries from the existing
2253 start. */
2254 for (i = 0; i < group_size; i++)
2255 if (!matches[i]) break;
2256
2257 if (i >= const_nunits && i < group_size)
2258 {
2259 /* Split into two groups at the first vector boundary before i. */
2260 gcc_assert ((const_nunits & (const_nunits - 1)) == 0);
2261 unsigned group1_size = i & ~(const_nunits - 1);
2262
2263 stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
2264 group1_size);
2265 bool res = vect_analyze_slp_instance (vinfo, stmt_info,
2266 max_tree_size);
2267 /* If the first non-match was in the middle of a vector,
2268 skip the rest of that vector. */
2269 if (group1_size < i)
2270 {
2271 i = group1_size + const_nunits;
2272 if (i < group_size)
2273 rest = vect_split_slp_store_group (rest, const_nunits);
2274 }
2275 if (i < group_size)
2276 res |= vect_analyze_slp_instance (vinfo, rest, max_tree_size);
2277 return res;
2278 }
2279 /* Even though the first vector did not all match, we might be able to SLP
2280 (some) of the remainder. FORNOW ignore this possibility. */
2281 }
2282
2283 return false;
2284 }
2285
2286
2287 /* Check if there are stmts in the loop that can be vectorized using SLP.
2288 Build SLP trees of packed scalar stmts if SLP is possible. */
2289
2290 opt_result
2291 vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
2292 {
2293 unsigned int i;
2294 stmt_vec_info first_element;
2295
2296 DUMP_VECT_SCOPE ("vect_analyze_slp");
2297
2298 /* Find SLP sequences starting from groups of grouped stores. */
2299 FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
2300 vect_analyze_slp_instance (vinfo, first_element, max_tree_size);
2301
2302 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
2303 {
2304 if (loop_vinfo->reduction_chains.length () > 0)
2305 {
2306 /* Find SLP sequences starting from reduction chains. */
2307 FOR_EACH_VEC_ELT (loop_vinfo->reduction_chains, i, first_element)
2308 if (! vect_analyze_slp_instance (vinfo, first_element,
2309 max_tree_size))
2310 {
2311 /* Dissolve reduction chain group. */
2312 stmt_vec_info vinfo = first_element;
2313 stmt_vec_info last = NULL;
2314 while (vinfo)
2315 {
2316 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
2317 REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
2318 REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
2319 last = vinfo;
2320 vinfo = next;
2321 }
2322 STMT_VINFO_DEF_TYPE (first_element) = vect_internal_def;
2323 /* It can be still vectorized as part of an SLP reduction. */
2324 loop_vinfo->reductions.safe_push (last);
2325 }
2326 }
2327
2328 /* Find SLP sequences starting from groups of reductions. */
2329 if (loop_vinfo->reductions.length () > 1)
2330 vect_analyze_slp_instance (vinfo, loop_vinfo->reductions[0],
2331 max_tree_size);
2332 }
2333
2334 return opt_result::success ();
2335 }
2336
2337
2338 /* For each possible SLP instance decide whether to SLP it and calculate the
2339 overall unrolling factor needed to SLP the loop. Return TRUE if we decided
2340 to SLP at least one instance. */
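/* For example (illustrative): two instances with unrolling factors 2
and 4 yield an overall SLP unrolling factor of
force_common_multiple (2, 4) == 4. */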
2341
2342 bool
2343 vect_make_slp_decision (loop_vec_info loop_vinfo)
2344 {
2345 unsigned int i;
2346 poly_uint64 unrolling_factor = 1;
2347 vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
2348 slp_instance instance;
2349 int decided_to_slp = 0;
2350
2351 DUMP_VECT_SCOPE ("vect_make_slp_decision");
2352
2353 FOR_EACH_VEC_ELT (slp_instances, i, instance)
2354 {
2355 /* FORNOW: SLP if you can. */
2356 /* All unroll factors have the form vinfo->vector_size * X for some
2357 rational X, so they must have a common multiple. */
2358 unrolling_factor
2359 = force_common_multiple (unrolling_factor,
2360 SLP_INSTANCE_UNROLLING_FACTOR (instance));
2361
2362 /* Mark all the stmts that belong to INSTANCE as PURE_SLP stmts. Later we
2363 call vect_detect_hybrid_slp () to find stmts that need hybrid SLP and
2364 loop-based vectorization. Such stmts will be marked as HYBRID. */
2365 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
2366 decided_to_slp++;
2367 }
2368
2369 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo) = unrolling_factor;
2370
2371 if (decided_to_slp && dump_enabled_p ())
2372 {
2373 dump_printf_loc (MSG_NOTE, vect_location,
2374 "Decided to SLP %d instances. Unrolling factor ",
2375 decided_to_slp);
2376 dump_dec (MSG_NOTE, unrolling_factor);
2377 dump_printf (MSG_NOTE, "\n");
2378 }
2379
2380 return (decided_to_slp > 0);
2381 }
2382
2383
2384 /* Find stmts that must be both vectorized and SLPed (since they feed stmts that
2385 can't be SLPed) in the tree rooted at NODE. Mark such stmts as HYBRID. */
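/* For example (illustrative scalar code):

tmp = x + y;
a[i] = tmp;
sum += tmp;

Here tmp = x + y is pure SLP (it feeds the SLP store group through
a[i] = tmp), but its def is also used by the non-SLP reduction
sum += tmp, so the stmt has to be vectorized both ways and is
marked hybrid. */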
2386
2387 static void
2388 vect_detect_hybrid_slp_stmts (slp_tree node, unsigned i, slp_vect_type stype,
2389 hash_map<slp_tree, unsigned> &visited)
2390 {
2391 stmt_vec_info stmt_vinfo = SLP_TREE_SCALAR_STMTS (node)[i];
2392 imm_use_iterator imm_iter;
2393 gimple *use_stmt;
2394 stmt_vec_info use_vinfo;
2395 slp_tree child;
2396 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
2397 int j;
2398
2399 /* We need to union stype over the incoming graph edges but we still
2400 want to limit recursion to stay O(N+E). */
2401 bool only_edge = (++visited.get_or_insert (node) < node->refcnt);
2402
2403 /* Propagate hybrid down the SLP tree. */
2404 if (stype == hybrid)
2405 ;
2406 else if (HYBRID_SLP_STMT (stmt_vinfo))
2407 stype = hybrid;
2408 else if (!only_edge)
2409 {
2410 /* Check if a pure SLP stmt has uses in non-SLP stmts. */
2411 gcc_checking_assert (PURE_SLP_STMT (stmt_vinfo));
2412 /* If we get a pattern stmt here we have to use the LHS of the
2413 original stmt for immediate uses. */
2414 gimple *stmt = vect_orig_stmt (stmt_vinfo)->stmt;
2415 tree def;
2416 if (gimple_code (stmt) == GIMPLE_PHI)
2417 def = gimple_phi_result (stmt);
2418 else
2419 def = SINGLE_SSA_TREE_OPERAND (stmt, SSA_OP_DEF);
2420 if (def)
2421 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, def)
2422 {
2423 use_vinfo = loop_vinfo->lookup_stmt (use_stmt);
2424 if (!use_vinfo)
2425 continue;
2426 use_vinfo = vect_stmt_to_vectorize (use_vinfo);
2427 if (!STMT_SLP_TYPE (use_vinfo)
2428 && (STMT_VINFO_RELEVANT (use_vinfo)
2429 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (use_vinfo)))
2430 && !(gimple_code (use_stmt) == GIMPLE_PHI
2431 && STMT_VINFO_DEF_TYPE (use_vinfo) == vect_reduction_def))
2432 {
2433 if (dump_enabled_p ())
2434 dump_printf_loc (MSG_NOTE, vect_location, "use of SLP "
2435 "def in non-SLP stmt: %G", use_stmt);
2436 stype = hybrid;
2437 }
2438 }
2439 }
2440
2441 if (stype == hybrid
2442 && !HYBRID_SLP_STMT (stmt_vinfo))
2443 {
2444 if (dump_enabled_p ())
2445 dump_printf_loc (MSG_NOTE, vect_location, "marking hybrid: %G",
2446 stmt_vinfo->stmt);
2447 STMT_SLP_TYPE (stmt_vinfo) = hybrid;
2448 }
2449
2450 if (!only_edge)
2451 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
2452 if (SLP_TREE_DEF_TYPE (child) != vect_external_def
2453 && SLP_TREE_DEF_TYPE (child) != vect_constant_def)
2454 vect_detect_hybrid_slp_stmts (child, i, stype, visited);
2455 }
2456
2457 static void
2458 vect_detect_hybrid_slp_stmts (slp_tree node, unsigned i, slp_vect_type stype)
2459 {
2460 hash_map<slp_tree, unsigned> visited;
2461 vect_detect_hybrid_slp_stmts (node, i, stype, visited);
2462 }
2463
2464 /* Helpers for vect_detect_hybrid_slp walking pattern stmt uses. */
2465
2466 static tree
2467 vect_detect_hybrid_slp_1 (tree *tp, int *, void *data)
2468 {
2469 walk_stmt_info *wi = (walk_stmt_info *)data;
2470 loop_vec_info loop_vinfo = (loop_vec_info) wi->info;
2471
2472 if (wi->is_lhs)
2473 return NULL_TREE;
2474
2475 stmt_vec_info def_stmt_info = loop_vinfo->lookup_def (*tp);
2476 if (def_stmt_info && PURE_SLP_STMT (def_stmt_info))
2477 {
2478 if (dump_enabled_p ())
2479 dump_printf_loc (MSG_NOTE, vect_location, "marking hybrid: %G",
2480 def_stmt_info->stmt);
2481 STMT_SLP_TYPE (def_stmt_info) = hybrid;
2482 }
2483
2484 return NULL_TREE;
2485 }
2486
2487 static tree
2488 vect_detect_hybrid_slp_2 (gimple_stmt_iterator *gsi, bool *handled,
2489 walk_stmt_info *wi)
2490 {
2491 loop_vec_info loop_vinfo = (loop_vec_info) wi->info;
2492 stmt_vec_info use_vinfo = loop_vinfo->lookup_stmt (gsi_stmt (*gsi));
2493 /* If the stmt is in a SLP instance then this isn't a reason
2494 to mark use definitions in other SLP instances as hybrid. */
2495 if (! STMT_SLP_TYPE (use_vinfo)
2496 && (STMT_VINFO_RELEVANT (use_vinfo)
2497 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (use_vinfo)))
2498 && ! (gimple_code (gsi_stmt (*gsi)) == GIMPLE_PHI
2499 && STMT_VINFO_DEF_TYPE (use_vinfo) == vect_reduction_def))
2500 ;
2501 else
2502 *handled = true;
2503 return NULL_TREE;
2504 }
2505
2506 /* Find stmts that must be both vectorized and SLPed. */
2507
2508 void
2509 vect_detect_hybrid_slp (loop_vec_info loop_vinfo)
2510 {
2511 unsigned int i;
2512 vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
2513 slp_instance instance;
2514
2515 DUMP_VECT_SCOPE ("vect_detect_hybrid_slp");
2516
2517 /* First walk all pattern stmts in the loop and mark defs of uses as
2518 hybrid because immediate uses in them are not recorded. */
2519 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2520 {
2521 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2522 for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
2523 gsi_next (&gsi))
2524 {
2525 gimple *stmt = gsi_stmt (gsi);
2526 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
2527 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2528 {
2529 walk_stmt_info wi;
2530 memset (&wi, 0, sizeof (wi));
2531 wi.info = loop_vinfo;
2532 gimple_stmt_iterator gsi2
2533 = gsi_for_stmt (STMT_VINFO_RELATED_STMT (stmt_info)->stmt);
2534 walk_gimple_stmt (&gsi2, vect_detect_hybrid_slp_2,
2535 vect_detect_hybrid_slp_1, &wi);
2536 walk_gimple_seq (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info),
2537 vect_detect_hybrid_slp_2,
2538 vect_detect_hybrid_slp_1, &wi);
2539 }
2540 }
2541 }
2542
2543 /* Then walk the SLP instance trees marking stmts with uses in
2544 non-SLP stmts as hybrid, also propagating hybrid down the
2545 SLP tree, collecting the above info on-the-fly. */
2546 FOR_EACH_VEC_ELT (slp_instances, i, instance)
2547 {
2548 for (unsigned i = 0; i < SLP_INSTANCE_GROUP_SIZE (instance); ++i)
2549 vect_detect_hybrid_slp_stmts (SLP_INSTANCE_TREE (instance),
2550 i, pure_slp);
2551 }
2552 }
2553
2554
2555 /* Initialize a bb_vec_info struct for the statements between
2556 REGION_BEGIN_IN (inclusive) and REGION_END_IN (exclusive). */
2557
2558 _bb_vec_info::_bb_vec_info (gimple_stmt_iterator region_begin_in,
2559 gimple_stmt_iterator region_end_in,
2560 vec_info_shared *shared)
2561 : vec_info (vec_info::bb, init_cost (NULL), shared),
2562 bb (gsi_bb (region_begin_in)),
2563 region_begin (region_begin_in),
2564 region_end (region_end_in)
2565 {
2566 gimple_stmt_iterator gsi;
2567
2568 for (gsi = region_begin; gsi_stmt (gsi) != gsi_stmt (region_end);
2569 gsi_next (&gsi))
2570 {
2571 gimple *stmt = gsi_stmt (gsi);
2572 gimple_set_uid (stmt, 0);
2573 add_stmt (stmt);
2574 }
2575
2576 bb->aux = this;
2577 }
2578
2579
2580 /* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the
2581 stmts in the basic block. */
2582
2583 _bb_vec_info::~_bb_vec_info ()
2584 {
2585 for (gimple_stmt_iterator si = region_begin;
2586 gsi_stmt (si) != gsi_stmt (region_end); gsi_next (&si))
2587 /* Reset region marker. */
2588 gimple_set_uid (gsi_stmt (si), -1);
2589
2590 bb->aux = NULL;
2591 }
2592
2593 /* Subroutine of vect_slp_analyze_node_operations. Handle the root of NODE,
2594 given that child nodes have already been processed, and that
2595 their def types currently match their SLP node's def type. */
2596
2597 static bool
2598 vect_slp_analyze_node_operations_1 (vec_info *vinfo, slp_tree node,
2599 slp_instance node_instance,
2600 stmt_vector_for_cost *cost_vec)
2601 {
2602 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2603 gcc_assert (STMT_SLP_TYPE (stmt_info) != loop_vect);
2604
2605 /* For BB vectorization vector types are assigned here.
2606 Memory accesses already got their vector type assigned
2607 in vect_analyze_data_refs. */
2608 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2609 if (bb_vinfo
2610 && ! STMT_VINFO_DATA_REF (stmt_info))
2611 {
2612 tree vectype, nunits_vectype;
2613 if (!vect_get_vector_types_for_stmt (stmt_info, &vectype,
2614 &nunits_vectype))
2615 /* We checked this when building the node. */
2616 gcc_unreachable ();
2617 if (vectype == boolean_type_node)
2618 {
2619 vectype = vect_get_mask_type_for_stmt (stmt_info);
2620 if (!vectype)
2621 /* vect_get_mask_type_for_stmt has already explained the
2622 failure. */
2623 return false;
2624 }
2625
2626 stmt_vec_info sstmt_info;
2627 unsigned int i;
2628 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, sstmt_info)
2629 STMT_VINFO_VECTYPE (sstmt_info) = vectype;
2630 }
2631
2632 /* Calculate the number of vector statements to be created for the
2633 scalar stmts in this node. For SLP reductions it is equal to the
2634 number of vector statements in the children (which has already been
2635 calculated by the recursive call). Otherwise it is the number of
2636 scalar elements in one scalar iteration (DR_GROUP_SIZE) multiplied by
2637 VF divided by the number of elements in a vector. */
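/* E.g. (illustrative): with DR_GROUP_SIZE == 4, a vectorization
factor of 2 and a 4-lane vectype this is
vect_get_num_vectors (2 * 4, vectype) == 2 vector stmts. */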
2638 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2639 && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
2640 {
2641 for (unsigned i = 0; i < SLP_TREE_CHILDREN (node).length (); ++i)
2642 if (SLP_TREE_DEF_TYPE (SLP_TREE_CHILDREN (node)[i]) == vect_internal_def)
2643 {
2644 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
2645 = SLP_TREE_NUMBER_OF_VEC_STMTS (SLP_TREE_CHILDREN (node)[i]);
2646 break;
2647 }
2648 }
2649 else
2650 {
2651 poly_uint64 vf;
2652 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
2653 vf = loop_vinfo->vectorization_factor;
2654 else
2655 vf = 1;
2656 unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (node_instance);
2657 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2658 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
2659 = vect_get_num_vectors (vf * group_size, vectype);
2660 }
2661
2662 bool dummy;
2663 return vect_analyze_stmt (stmt_info, &dummy, node, node_instance, cost_vec);
2664 }
2665
2666 /* Analyze statements contained in SLP tree NODE after recursively analyzing
2667 the subtree. NODE_INSTANCE contains NODE and VINFO contains INSTANCE.
2668
2669 Return true if the operations are supported. */
2670
2671 static bool
2672 vect_slp_analyze_node_operations (vec_info *vinfo, slp_tree node,
2673 slp_instance node_instance,
2674 scalar_stmts_to_slp_tree_map_t *visited,
2675 scalar_stmts_to_slp_tree_map_t *lvisited,
2676 stmt_vector_for_cost *cost_vec)
2677 {
2678 int i, j;
2679 slp_tree child;
2680
2681 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2682 return true;
2683
2684 /* If we already analyzed the exact same set of scalar stmts we're done.
2685 We share the generated vector stmts for those. */
2686 slp_tree *leader;
2687 if ((leader = visited->get (SLP_TREE_SCALAR_STMTS (node)))
2688 || (leader = lvisited->get (SLP_TREE_SCALAR_STMTS (node))))
2689 {
2690 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
2691 = SLP_TREE_NUMBER_OF_VEC_STMTS (*leader);
2692 return true;
2693 }
2694
2695 /* The SLP graph is acyclic so not caching whether we failed or succeeded
2696 doesn't result in any issue since we throw away the lvisited set
2697 when we fail. */
2698 lvisited->put (SLP_TREE_SCALAR_STMTS (node).copy (), node);
2699
2700 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2701 if (!vect_slp_analyze_node_operations (vinfo, child, node_instance,
2702 visited, lvisited, cost_vec))
2703 return false;
2704
2705 /* ??? We have to catch the case late where the same first scalar stmt
2706 appears in multiple SLP children with different def types and fail.
2707 Remember the original def types first since SLP_TREE_DEF_TYPE doesn't
2708 necessarily match them when that is vect_internal_def. */
2709 auto_vec<vect_def_type, 4> dt;
2710 dt.safe_grow (SLP_TREE_CHILDREN (node).length ());
2711 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
2712 if (SLP_TREE_SCALAR_STMTS (child).length () != 0)
2713 dt[j] = STMT_VINFO_DEF_TYPE (SLP_TREE_SCALAR_STMTS (child)[0]);
2714
2715 /* Push SLP node def-type to stmt operands. */
2716 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
2717 if (SLP_TREE_DEF_TYPE (child) != vect_internal_def
2718 && SLP_TREE_SCALAR_STMTS (child).length () != 0)
2719 STMT_VINFO_DEF_TYPE (SLP_TREE_SCALAR_STMTS (child)[0])
2720 = SLP_TREE_DEF_TYPE (child);
2721
2722 /* Check everything worked out. */
2723 bool res = true;
2724 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
2725 if (SLP_TREE_SCALAR_STMTS (child).length () != 0)
2726 {
2727 if (SLP_TREE_DEF_TYPE (child) != vect_internal_def)
2728 {
2729 if (STMT_VINFO_DEF_TYPE (SLP_TREE_SCALAR_STMTS (child)[0])
2730 != SLP_TREE_DEF_TYPE (child))
2731 res = false;
2732 }
2733 else if (STMT_VINFO_DEF_TYPE (SLP_TREE_SCALAR_STMTS (child)[0])
2734 != dt[j])
2735 res = false;
2736 }
2737 if (!res && dump_enabled_p ())
2738 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2739 "not vectorized: same operand with different "
2740 "def type in stmt.\n");
2741
2742 if (res)
2743 res = vect_slp_analyze_node_operations_1 (vinfo, node, node_instance,
2744 cost_vec);
2745
2746 /* Restore def-types. */
2747 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
2748 if (SLP_TREE_SCALAR_STMTS (child).length () != 0)
2749 STMT_VINFO_DEF_TYPE (SLP_TREE_SCALAR_STMTS (child)[0]) = dt[j];
2750
2751 return res;
2752 }
2753
2754
2755 /* Analyze statements in SLP instances of VINFO. Return true if the
2756 operations are supported. */
2757
2758 bool
2759 vect_slp_analyze_operations (vec_info *vinfo)
2760 {
2761 slp_instance instance;
2762 int i;
2763
2764 DUMP_VECT_SCOPE ("vect_slp_analyze_operations");
2765
2766 scalar_stmts_to_slp_tree_map_t *visited
2767 = new scalar_stmts_to_slp_tree_map_t ();
2768 for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
2769 {
2770 scalar_stmts_to_slp_tree_map_t lvisited;
2771 stmt_vector_for_cost cost_vec;
2772 cost_vec.create (2);
2773 if (!vect_slp_analyze_node_operations (vinfo,
2774 SLP_INSTANCE_TREE (instance),
2775 instance, visited, &lvisited,
2776 &cost_vec))
2777 {
2778 slp_tree node = SLP_INSTANCE_TREE (instance);
2779 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2780 if (dump_enabled_p ())
2781 dump_printf_loc (MSG_NOTE, vect_location,
2782 "removing SLP instance operations starting from: %G",
2783 stmt_info->stmt);
2784 vect_free_slp_instance (instance, false);
2785 vinfo->slp_instances.ordered_remove (i);
2786 cost_vec.release ();
2787 }
2788 else
2789 {
2790 for (scalar_stmts_to_slp_tree_map_t::iterator x = lvisited.begin();
2791 x != lvisited.end(); ++x)
2792 visited->put ((*x).first.copy (), (*x).second);
2793 i++;
2794
2795 add_stmt_costs (vinfo->target_cost_data, &cost_vec);
2796 cost_vec.release ();
2797 }
2798 }
2799 delete visited;
2800
2801 return !vinfo->slp_instances.is_empty ();
2802 }
2803
2804
2805 /* Compute the scalar cost of the SLP node NODE and its children
2806 and record it in COST_VEC. Do not account defs that are marked in
2807 LIFE, and update LIFE according to uses of NODE. */
2808
2809 static void
2810 vect_bb_slp_scalar_cost (basic_block bb,
2811 slp_tree node, vec<bool, va_heap> *life,
2812 stmt_vector_for_cost *cost_vec,
2813 hash_set<slp_tree> &visited)
2814 {
2815 unsigned i;
2816 stmt_vec_info stmt_info;
2817 slp_tree child;
2818
2819 if (visited.add (node))
2820 return;
2821
2822 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2823 {
2824 gimple *stmt = stmt_info->stmt;
2825 vec_info *vinfo = stmt_info->vinfo;
2826 ssa_op_iter op_iter;
2827 def_operand_p def_p;
2828
2829 if ((*life)[i])
2830 continue;
2831
2832 /* If there is a non-vectorized use of the defs then the scalar
2833 stmt is kept live in which case we do not account it or any
2834 required defs in the SLP children in the scalar cost. This
2835 way we make the vectorization more costly when compared to
2836 the scalar cost. */
2837 FOR_EACH_SSA_DEF_OPERAND (def_p, stmt, op_iter, SSA_OP_DEF)
2838 {
2839 imm_use_iterator use_iter;
2840 gimple *use_stmt;
2841 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
2842 if (!is_gimple_debug (use_stmt))
2843 {
2844 stmt_vec_info use_stmt_info = vinfo->lookup_stmt (use_stmt);
2845 if (!use_stmt_info || !PURE_SLP_STMT (use_stmt_info))
2846 {
2847 (*life)[i] = true;
2848 BREAK_FROM_IMM_USE_STMT (use_iter);
2849 }
2850 }
2851 }
2852 if ((*life)[i])
2853 continue;
2854
2855 /* Count scalar stmts only once. */
2856 if (gimple_visited_p (stmt))
2857 continue;
2858 gimple_set_visited (stmt, true);
2859
2860 vect_cost_for_stmt kind;
2861 if (STMT_VINFO_DATA_REF (stmt_info))
2862 {
2863 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
2864 kind = scalar_load;
2865 else
2866 kind = scalar_store;
2867 }
2868 else
2869 kind = scalar_stmt;
2870 record_stmt_cost (cost_vec, 1, kind, stmt_info, 0, vect_body);
2871 }
2872
2873 auto_vec<bool, 20> subtree_life;
2874 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2875 {
2876 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
2877 {
2878 /* Do not directly pass LIFE to the recursive call, copy it to
2879 confine changes in the callee to the current child/subtree. */
2880 subtree_life.safe_splice (*life);
2881 vect_bb_slp_scalar_cost (bb, child, &subtree_life, cost_vec,
2882 visited);
2883 subtree_life.truncate (0);
2884 }
2885 }
2886 }
2887
2888 static void
2889 vect_bb_slp_scalar_cost (basic_block bb,
2890 slp_tree node, vec<bool, va_heap> *life,
2891 stmt_vector_for_cost *cost_vec)
2892 {
2893 hash_set<slp_tree> visited;
2894 vect_bb_slp_scalar_cost (bb, node, life, cost_vec, visited);
2895 }
2896
2897 /* Check if vectorization of the basic block is profitable. */
2898
2899 static bool
2900 vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo)
2901 {
2902 vec<slp_instance> slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
2903 slp_instance instance;
2904 int i;
2905 unsigned int vec_inside_cost = 0, vec_outside_cost = 0, scalar_cost = 0;
2906 unsigned int vec_prologue_cost = 0, vec_epilogue_cost = 0;
2907
2908 /* Calculate scalar cost. */
2909 stmt_vector_for_cost scalar_costs;
2910 scalar_costs.create (0);
2911 FOR_EACH_VEC_ELT (slp_instances, i, instance)
2912 {
2913 auto_vec<bool, 20> life;
2914 life.safe_grow_cleared (SLP_INSTANCE_GROUP_SIZE (instance));
2915 vect_bb_slp_scalar_cost (BB_VINFO_BB (bb_vinfo),
2916 SLP_INSTANCE_TREE (instance),
2917 &life, &scalar_costs);
2918 }
2919 void *target_cost_data = init_cost (NULL);
2920 add_stmt_costs (target_cost_data, &scalar_costs);
2921 scalar_costs.release ();
2922 unsigned dummy;
2923 finish_cost (target_cost_data, &dummy, &scalar_cost, &dummy);
2924 destroy_cost_data (target_cost_data);
2925
2926 /* Unset visited flag. */
2927 for (gimple_stmt_iterator gsi = bb_vinfo->region_begin;
2928 gsi_stmt (gsi) != gsi_stmt (bb_vinfo->region_end); gsi_next (&gsi))
2929 gimple_set_visited (gsi_stmt (gsi), false);
2930
2931 /* Complete the target-specific cost calculation. */
2932 finish_cost (BB_VINFO_TARGET_COST_DATA (bb_vinfo), &vec_prologue_cost,
2933 &vec_inside_cost, &vec_epilogue_cost);
2934
2935 vec_outside_cost = vec_prologue_cost + vec_epilogue_cost;
2936
2937 if (dump_enabled_p ())
2938 {
2939 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
2940 dump_printf (MSG_NOTE, " Vector inside of basic block cost: %d\n",
2941 vec_inside_cost);
2942 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n", vec_prologue_cost);
2943 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n", vec_epilogue_cost);
2944 dump_printf (MSG_NOTE, " Scalar cost of basic block: %d\n", scalar_cost);
2945 }
2946
2947 /* Vectorization is profitable if its cost is less than the cost of the
2948 scalar version. Note that we err on the vector side for equal cost because
2949 the cost estimate is otherwise quite pessimistic (constant uses are
2950 free on the scalar side but cost a load on the vector side for
2951 example). */
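/* E.g. (illustrative numbers): vec_outside_cost == 2 and
vec_inside_cost == 10 against scalar_cost == 12 is a tie, and the
block is still vectorized. */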
2952 if (vec_outside_cost + vec_inside_cost > scalar_cost)
2953 return false;
2954
2955 return true;
2956 }
2957
2958 /* Check if the region described by BB_VINFO can be vectorized, returning
2959 true if so. When returning false, set FATAL to true if the same failure
2960 would prevent vectorization at other vector sizes, false if it is still
2961 worth trying other sizes. N_STMTS is the number of statements in the
2962 region. */
2963
2964 static bool
2965 vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, bool &fatal)
2966 {
2967 DUMP_VECT_SCOPE ("vect_slp_analyze_bb");
2968
2969 slp_instance instance;
2970 int i;
2971 poly_uint64 min_vf = 2;
2972
2973 /* The first group of checks is independent of the vector size. */
2974 fatal = true;
2975
2976 /* Analyze the data references. */
2977
2978 if (!vect_analyze_data_refs (bb_vinfo, &min_vf, NULL))
2979 {
2980 if (dump_enabled_p ())
2981 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2982 "not vectorized: unhandled data-ref in basic "
2983 "block.\n");
2984 return false;
2985 }
2986
2987 if (BB_VINFO_DATAREFS (bb_vinfo).length () < 2)
2988 {
2989 if (dump_enabled_p ())
2990 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2991 "not vectorized: not enough data-refs in "
2992 "basic block.\n");
2993 return false;
2994 }
2995
2996 if (!vect_analyze_data_ref_accesses (bb_vinfo))
2997 {
2998 if (dump_enabled_p ())
2999 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3000 "not vectorized: unhandled data access in "
3001 "basic block.\n");
3002 return false;
3003 }
3004
3005 /* If there are no grouped stores in the region there is no need
3006 to continue with pattern recog as vect_analyze_slp will fail
3007 anyway. */
3008 if (bb_vinfo->grouped_stores.is_empty ())
3009 {
3010 if (dump_enabled_p ())
3011 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3012 "not vectorized: no grouped stores in "
3013 "basic block.\n");
3014 return false;
3015 }
3016
3017 /* The rest of the analysis below depends on the vector size in some way, so failures from here on are not fatal. */
3018 fatal = false;
3019
3020 vect_pattern_recog (bb_vinfo);
3021
3022 /* Check the SLP opportunities in the basic block, analyze and build SLP
3023 trees. */
3024 if (!vect_analyze_slp (bb_vinfo, n_stmts))
3025 {
3026 if (dump_enabled_p ())
3027 {
3028 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3029 "Failed to SLP the basic block.\n");
3030 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3031 "not vectorized: failed to find SLP opportunities "
3032 "in basic block.\n");
3033 }
3034 return false;
3035 }
3036
3037 vect_record_base_alignments (bb_vinfo);
3038
3039 /* Analyze and verify the alignment of data references and the
3040 dependence in the SLP instances. */
3041 for (i = 0; BB_VINFO_SLP_INSTANCES (bb_vinfo).iterate (i, &instance); )
3042 {
3043 if (! vect_slp_analyze_and_verify_instance_alignment (instance)
3044 || ! vect_slp_analyze_instance_dependence (instance))
3045 {
3046 slp_tree node = SLP_INSTANCE_TREE (instance);
3047 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
3048 if (dump_enabled_p ())
3049 dump_printf_loc (MSG_NOTE, vect_location,
3050 "removing SLP instance operations starting from: %G",
3051 stmt_info->stmt);
3052 vect_free_slp_instance (instance, false);
3053 BB_VINFO_SLP_INSTANCES (bb_vinfo).ordered_remove (i);
3054 continue;
3055 }
3056
3057 /* Mark all the statements that we want to vectorize as pure SLP and
3058 relevant. */
3059 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
3060 vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance));
3061
3062 i++;
3063 }
3064 if (! BB_VINFO_SLP_INSTANCES (bb_vinfo).length ())
3065 return false;
3066
3067 if (!vect_slp_analyze_operations (bb_vinfo))
3068 {
3069 if (dump_enabled_p ())
3070 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3071 "not vectorized: bad operation in basic block.\n");
3072 return false;
3073 }
3074
3075 /* Cost model: check if the vectorization is worthwhile. */
3076 if (!unlimited_cost_model (NULL)
3077 && !vect_bb_vectorization_profitable_p (bb_vinfo))
3078 {
3079 if (dump_enabled_p ())
3080 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3081 "not vectorized: vectorization is not "
3082 "profitable.\n");
3083 return false;
3084 }
3085
3086 if (dump_enabled_p ())
3087 dump_printf_loc (MSG_NOTE, vect_location,
3088 "Basic block will be vectorized using SLP\n");
3089 return true;
3090 }
3091
3092 /* Subroutine of vect_slp_bb. Try to vectorize the statements between
3093 REGION_BEGIN (inclusive) and REGION_END (exclusive), returning true
3094 on success. The region has N_STMTS statements and has the datarefs
3095 given by DATAREFS. */
3096
3097 static bool
3098 vect_slp_bb_region (gimple_stmt_iterator region_begin,
3099 gimple_stmt_iterator region_end,
3100 vec<data_reference_p> datarefs,
3101 unsigned int n_stmts)
3102 {
3103 bb_vec_info bb_vinfo;
3104 auto_vector_sizes vector_sizes;
3105
3106 /* Autodetect the first vector size we try. */
3107 poly_uint64 next_vector_size = 0;
3108 targetm.vectorize.autovectorize_vector_sizes (&vector_sizes, false);
3109 unsigned int next_size = 0;
3110
3111 vec_info_shared shared;
3112
3113 poly_uint64 autodetected_vector_size = 0;
3114 while (1)
3115 {
3116 bool vectorized = false;
3117 bool fatal = false;
3118 bb_vinfo = new _bb_vec_info (region_begin, region_end, &shared);
3119
3120 bool first_time_p = shared.datarefs.is_empty ();
3121 BB_VINFO_DATAREFS (bb_vinfo) = datarefs;
3122 if (first_time_p)
3123 bb_vinfo->shared->save_datarefs ();
3124 else
3125 bb_vinfo->shared->check_datarefs ();
3126 bb_vinfo->vector_size = next_vector_size;
3127
3128 if (vect_slp_analyze_bb_1 (bb_vinfo, n_stmts, fatal)
3129 && dbg_cnt (vect_slp))
3130 {
3131 if (dump_enabled_p ())
3132 dump_printf_loc (MSG_NOTE, vect_location, "SLPing BB part\n");
3133
3134 bb_vinfo->shared->check_datarefs ();
3135 vect_schedule_slp (bb_vinfo);
3136
3137 unsigned HOST_WIDE_INT bytes;
3138 if (dump_enabled_p ())
3139 {
3140 if (bb_vinfo->vector_size.is_constant (&bytes))
3141 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
3142 "basic block part vectorized using %wu byte "
3143 "vectors\n", bytes);
3144 else
3145 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
3146 "basic block part vectorized using variable "
3147 "length vectors\n");
3148 }
3149
3150 vectorized = true;
3151 }
3152
3153 if (next_size == 0)
3154 autodetected_vector_size = bb_vinfo->vector_size;
3155
3156 delete bb_vinfo;
3157
3158 if (next_size < vector_sizes.length ()
3159 && known_eq (vector_sizes[next_size], autodetected_vector_size))
3160 next_size += 1;
3161
3162 if (vectorized
3163 || next_size == vector_sizes.length ()
3164 || known_eq (autodetected_vector_size, 0U)
3165 /* If vect_slp_analyze_bb_1 signaled that analysis for all
3166 vector sizes will fail, do not bother iterating. */
3167 || fatal)
3168 return vectorized;
3169
3170 /* Try the next biggest vector size. */
3171 next_vector_size = vector_sizes[next_size++];
3172 if (dump_enabled_p ())
3173 {
3174 dump_printf_loc (MSG_NOTE, vect_location,
3175 "***** Re-trying analysis with "
3176 "vector size ");
3177 dump_dec (MSG_NOTE, next_vector_size);
3178 dump_printf (MSG_NOTE, "\n");
3179 }
3180 }
3181 }
3182
3183 /* Main entry for the BB vectorizer. Analyze and transform BB, returning
3184 true if anything in the basic block was vectorized. */
3185
3186 bool
3187 vect_slp_bb (basic_block bb)
3188 {
3189 gimple_stmt_iterator gsi;
3190 bool any_vectorized = false;
3191
3192 gsi = gsi_start_bb (bb);
3193 while (!gsi_end_p (gsi))
3194 {
3195 gimple_stmt_iterator region_begin = gsi;
3196 vec<data_reference_p> datarefs = vNULL;
3197 int insns = 0;
3198
3199 for (; !gsi_end_p (gsi); gsi_next (&gsi))
3200 {
3201 gimple *stmt = gsi_stmt (gsi);
3202 if (is_gimple_debug (stmt))
3203 continue;
3204 insns++;
3205
3206 if (gimple_location (stmt) != UNKNOWN_LOCATION)
3207 vect_location = stmt;
3208
3209 if (!vect_find_stmt_data_reference (NULL, stmt, &datarefs))
3210 break;
3211 }
3212
3213 /* Skip leading unhandled stmts. */
3214 if (gsi_stmt (region_begin) == gsi_stmt (gsi))
3215 {
3216 gsi_next (&gsi);
3217 continue;
3218 }
3219
3220 gimple_stmt_iterator region_end = gsi;
3221
3222 if (insns > PARAM_VALUE (PARAM_SLP_MAX_INSNS_IN_BB))
3223 {
3224 if (dump_enabled_p ())
3225 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3226 "not vectorized: too many instructions in "
3227 "basic block.\n");
3228 }
3229 else if (vect_slp_bb_region (region_begin, region_end, datarefs, insns))
3230 any_vectorized = true;
3231
3232 if (gsi_end_p (region_end))
3233 break;
3234
3235 /* Skip the unhandled stmt. */
3236 gsi_next (&gsi);
3237 }
3238
3239 return any_vectorized;
3240 }
3241
3242
3243 /* Return true if a constant operand of STMT_VINFO has to be vectorized using a boolean (mask) vector type. */
3244
3245 static bool
3246 vect_mask_constant_operand_p (stmt_vec_info stmt_vinfo)
3247 {
3248 enum tree_code code = gimple_expr_code (stmt_vinfo->stmt);
3249 tree op, vectype;
3250 enum vect_def_type dt;
3251
3252 /* For comparisons and COND_EXPR the vector type is chosen depending
3253 on the other, non-constant comparison operand. */
3254 if (TREE_CODE_CLASS (code) == tcc_comparison)
3255 {
3256 gassign *stmt = as_a <gassign *> (stmt_vinfo->stmt);
3257 op = gimple_assign_rhs1 (stmt);
3258
3259 if (!vect_is_simple_use (op, stmt_vinfo->vinfo, &dt, &vectype))
3260 gcc_unreachable ();
3261
3262 return !vectype || VECTOR_BOOLEAN_TYPE_P (vectype);
3263 }
3264
3265 if (code == COND_EXPR)
3266 {
3267 gassign *stmt = as_a <gassign *> (stmt_vinfo->stmt);
3268 tree cond = gimple_assign_rhs1 (stmt);
3269
3270 if (TREE_CODE (cond) == SSA_NAME)
3271 op = cond;
3272 else
3273 op = TREE_OPERAND (cond, 0);
3274
3275 if (!vect_is_simple_use (op, stmt_vinfo->vinfo, &dt, &vectype))
3276 gcc_unreachable ();
3277
3278 return !vectype || VECTOR_BOOLEAN_TYPE_P (vectype);
3279 }
3280
3281 return VECTOR_BOOLEAN_TYPE_P (STMT_VINFO_VECTYPE (stmt_vinfo));
3282 }
3283
3284 /* Build a variable-length vector in which the elements in ELTS are repeated
3285 to fill NRESULTS vectors of type VECTOR_TYPE. Store the vectors in
3286 RESULTS and add any new instructions to SEQ.
3287
3288 The approach we use is:
3289
3290 (1) Find a vector mode VM with integer elements of mode IM.
3291
3292 (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
3293 ELTS' has mode IM. This involves creating NELTS' VIEW_CONVERT_EXPRs
3294 from small vectors to IM.
3295
3296 (3) Duplicate each ELTS'[I] into a vector of mode VM.
3297
3298 (4) Use a tree of interleaving VEC_PERM_EXPRs to create VMs with the
3299 correct byte contents.
3300
3301 (5) Use VIEW_CONVERT_EXPR to cast the final VMs to the required type.
3302
3303 We try to find the largest IM for which this sequence works, in order
3304 to cut down on the number of interleaves. */
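/* For example (illustrative): with ELTS == { a, b, c, d } and a
variable-length VECTOR_TYPE this might view-convert the partial
vectors { a, b } and { c, d } into two integer scalars ab and cd,
duplicate those into { ab, ab, ... } and { cd, cd, ... }, and use a
single interleaving VEC_PERM_EXPR to produce the byte pattern
{ ab, cd, ab, cd, ... }, i.e. { a, b, c, d, a, b, c, d, ... }. */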
3305
3306 void
3307 duplicate_and_interleave (vec_info *vinfo, gimple_seq *seq, tree vector_type,
3308 vec<tree> elts, unsigned int nresults,
3309 vec<tree> &results)
3310 {
3311 unsigned int nelts = elts.length ();
3312 tree element_type = TREE_TYPE (vector_type);
3313
3314 /* (1) Find a vector mode VM with integer elements of mode IM. */
3315 unsigned int nvectors = 1;
3316 tree new_vector_type;
3317 tree permutes[2];
3318 if (!can_duplicate_and_interleave_p (vinfo, nelts, TYPE_MODE (element_type),
3319 &nvectors, &new_vector_type,
3320 permutes))
3321 gcc_unreachable ();
3322
3323 /* Get a vector type that holds ELTS[0:NELTS/NELTS']. */
3324 unsigned int partial_nelts = nelts / nvectors;
3325 tree partial_vector_type = build_vector_type (element_type, partial_nelts);
3326
3327 tree_vector_builder partial_elts;
3328 auto_vec<tree, 32> pieces (nvectors * 2);
3329 pieces.quick_grow (nvectors * 2);
3330 for (unsigned int i = 0; i < nvectors; ++i)
3331 {
3332 /* (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
3333 ELTS' has mode IM. */
3334 partial_elts.new_vector (partial_vector_type, partial_nelts, 1);
3335 for (unsigned int j = 0; j < partial_nelts; ++j)
3336 partial_elts.quick_push (elts[i * partial_nelts + j]);
3337 tree t = gimple_build_vector (seq, &partial_elts);
3338 t = gimple_build (seq, VIEW_CONVERT_EXPR,
3339 TREE_TYPE (new_vector_type), t);
3340
3341 /* (3) Duplicate each ELTS'[I] into a vector of mode VM. */
3342 pieces[i] = gimple_build_vector_from_val (seq, new_vector_type, t);
3343 }
3344
3345 /* (4) Use a tree of VEC_PERM_EXPRs to create a single VM with the
3346 correct byte contents.
3347
3348 We need to repeat the following operation log2(nvectors) times:
3349
3350 out[i * 2] = VEC_PERM_EXPR (in[i], in[i + hi_start], lo_permute);
3351 out[i * 2 + 1] = VEC_PERM_EXPR (in[i], in[i + hi_start], hi_permute);
3352
3353 However, if each input repeats every N elements and the VF is
3354 a multiple of N * 2, the HI result is the same as the LO. */
3355 unsigned int in_start = 0;
3356 unsigned int out_start = nvectors;
3357 unsigned int hi_start = nvectors / 2;
3358 /* A bound on the number of outputs needed to produce NRESULTS results
3359 in the final iteration. */
3360 unsigned int noutputs_bound = nvectors * nresults;
3361 for (unsigned int in_repeat = 1; in_repeat < nvectors; in_repeat *= 2)
3362 {
3363 noutputs_bound /= 2;
3364 unsigned int limit = MIN (noutputs_bound, nvectors);
3365 for (unsigned int i = 0; i < limit; ++i)
3366 {
3367 if ((i & 1) != 0
3368 && multiple_p (TYPE_VECTOR_SUBPARTS (new_vector_type),
3369 2 * in_repeat))
3370 {
3371 pieces[out_start + i] = pieces[out_start + i - 1];
3372 continue;
3373 }
3374
3375 tree output = make_ssa_name (new_vector_type);
3376 tree input1 = pieces[in_start + (i / 2)];
3377 tree input2 = pieces[in_start + (i / 2) + hi_start];
3378 gassign *stmt = gimple_build_assign (output, VEC_PERM_EXPR,
3379 input1, input2,
3380 permutes[i & 1]);
3381 gimple_seq_add_stmt (seq, stmt);
3382 pieces[out_start + i] = output;
3383 }
3384 std::swap (in_start, out_start);
3385 }
3386
3387 /* (5) Use VIEW_CONVERT_EXPR to cast the final VM to the required type. */
3388 results.reserve (nresults);
3389 for (unsigned int i = 0; i < nresults; ++i)
3390 if (i < nvectors)
3391 results.quick_push (gimple_build (seq, VIEW_CONVERT_EXPR, vector_type,
3392 pieces[in_start + i]));
3393 else
3394 results.quick_push (results[i - nvectors]);
3395 }
3396
3397
3398 /* For constant and loop invariant defs of SLP_NODE this function returns
3399 (vector) defs (VEC_OPRNDS) that will be used in the vectorized stmts.
3400 OP_NODE determines the node for the operand containing the scalar
3401 operands. */
3402
3403 static void
3404 vect_get_constant_vectors (slp_tree op_node, slp_tree slp_node,
3405 vec<tree> *vec_oprnds)
3406 {
3407 stmt_vec_info stmt_vinfo = SLP_TREE_SCALAR_STMTS (slp_node)[0];
3408 vec_info *vinfo = stmt_vinfo->vinfo;
3409 unsigned HOST_WIDE_INT nunits;
3410 tree vec_cst;
3411 unsigned j, number_of_places_left_in_vector;
3412 tree vector_type;
3413 tree vop;
3414 int group_size = op_node->ops.length ();
3415 unsigned int vec_num, i;
3416 unsigned number_of_copies = 1;
3417 bool constant_p;
3418 tree neutral_op = NULL;
3419 gimple_seq ctor_seq = NULL;
3420 auto_vec<tree, 16> permute_results;
3421
3422 /* ??? SLP analysis should compute the vector type for the
3423 constant / invariant and store it in the SLP node. */
3424 tree op = op_node->ops[0];
3425 /* Check if vector type is a boolean vector. */
3426 tree stmt_vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
3427 if (VECT_SCALAR_BOOLEAN_TYPE_P (TREE_TYPE (op))
3428 && vect_mask_constant_operand_p (stmt_vinfo))
3429 vector_type
3430 = build_same_sized_truth_vector_type (stmt_vectype);
3431 else
3432 vector_type = get_vectype_for_scalar_type (vinfo, TREE_TYPE (op));
3433
3434 /* ??? For lane-reducing ops we should also have the required number
3435 of vector stmts initialized rather than second-guessing here. */
3436 unsigned int number_of_vectors;
3437 if (is_gimple_assign (stmt_vinfo->stmt)
3438 && (gimple_assign_rhs_code (stmt_vinfo->stmt) == SAD_EXPR
3439 || gimple_assign_rhs_code (stmt_vinfo->stmt) == DOT_PROD_EXPR
3440 || gimple_assign_rhs_code (stmt_vinfo->stmt) == WIDEN_SUM_EXPR))
3441 number_of_vectors = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
3442 else
3443 number_of_vectors
3444 = vect_get_num_vectors (SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node)
3445 * TYPE_VECTOR_SUBPARTS (stmt_vectype),
3446 vector_type);
3447 vec_oprnds->create (number_of_vectors);
3448 auto_vec<tree> voprnds (number_of_vectors);
3449
3450 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
3451 created vectors. It is greater than 1 if unrolling is performed.
3452
3453 For example, we have two scalar operands, s1 and s2 (e.g., group of
3454 strided accesses of size two), while NUNITS is four (i.e., four scalars
3455 of this type can be packed in a vector). The output vector will contain
3456 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
3457 will be 2).
3458
3459 If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
3460 containing the operands.
3461
3462 For example, NUNITS is four as before, and the group size is 8
3463 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
3464 {s5, s6, s7, s8}. */
3465
3466 /* When using duplicate_and_interleave, we just need one element for
3467 each scalar statement. */
3468 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
3469 nunits = group_size;
3470
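/* E.g., with the {s1, s2} example above: NUNITS == 4, GROUP_SIZE == 2 and
 a single vector to create give 4 * 1 / 2 == 2 copies of each scalar,
 matching {s1, s2, s1, s2}. */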
3471 number_of_copies = nunits * number_of_vectors / group_size;
3472
3473 number_of_places_left_in_vector = nunits;
3474 constant_p = true;
3475 tree_vector_builder elts (vector_type, nunits, 1);
3476 elts.quick_grow (nunits);
3477 bool place_after_defs = false;
3478 for (j = 0; j < number_of_copies; j++)
3479 {
3480 for (i = group_size - 1; op_node->ops.iterate (i, &op); i--)
3481 {
3482 /* Create 'vect_ = {op0,op1,...,opn}'. */
3483 number_of_places_left_in_vector--;
3484 tree orig_op = op;
3485 if (!types_compatible_p (TREE_TYPE (vector_type), TREE_TYPE (op)))
3486 {
3487 if (CONSTANT_CLASS_P (op))
3488 {
3489 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
3490 {
3491 /* Can't use VIEW_CONVERT_EXPR for booleans because
3492 the scalar value and the vector element may have
3493 different sizes. */
3494 if (integer_zerop (op))
3495 op = build_int_cst (TREE_TYPE (vector_type), 0);
3496 else if (integer_onep (op))
3497 op = build_all_ones_cst (TREE_TYPE (vector_type));
3498 else
3499 gcc_unreachable ();
3500 }
3501 else
3502 op = fold_unary (VIEW_CONVERT_EXPR,
3503 TREE_TYPE (vector_type), op);
3504 gcc_assert (op && CONSTANT_CLASS_P (op));
3505 }
3506 else
3507 {
3508 tree new_temp = make_ssa_name (TREE_TYPE (vector_type));
3509 gimple *init_stmt;
3510 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
3511 {
3512 tree true_val
3513 = build_all_ones_cst (TREE_TYPE (vector_type));
3514 tree false_val
3515 = build_zero_cst (TREE_TYPE (vector_type));
3516 gcc_assert (INTEGRAL_TYPE_P (TREE_TYPE (op)));
3517 init_stmt = gimple_build_assign (new_temp, COND_EXPR,
3518 op, true_val,
3519 false_val);
3520 }
3521 else
3522 {
3523 op = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (vector_type),
3524 op);
3525 init_stmt
3526 = gimple_build_assign (new_temp, VIEW_CONVERT_EXPR,
3527 op);
3528 }
3529 gimple_seq_add_stmt (&ctor_seq, init_stmt);
3530 op = new_temp;
3531 }
3532 }
3533 elts[number_of_places_left_in_vector] = op;
3534 if (!CONSTANT_CLASS_P (op))
3535 constant_p = false;
3536 if (TREE_CODE (orig_op) == SSA_NAME
3537 && !SSA_NAME_IS_DEFAULT_DEF (orig_op)
3538 && STMT_VINFO_BB_VINFO (stmt_vinfo)
3539 && (STMT_VINFO_BB_VINFO (stmt_vinfo)->bb
3540 == gimple_bb (SSA_NAME_DEF_STMT (orig_op))))
3541 place_after_defs = true;
3542
3543 if (number_of_places_left_in_vector == 0)
3544 {
3545 if (constant_p
3546 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
3547 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
3548 vec_cst = gimple_build_vector (&ctor_seq, &elts);
3549 else
3550 {
3551 if (permute_results.is_empty ())
3552 duplicate_and_interleave (vinfo, &ctor_seq, vector_type,
3553 elts, number_of_vectors,
3554 permute_results);
3555 vec_cst = permute_results[number_of_vectors - j - 1];
3556 }
3557 tree init;
3558 gimple_stmt_iterator gsi;
3559 if (place_after_defs)
3560 {
3561 stmt_vec_info last_stmt_info
3562 = vect_find_last_scalar_stmt_in_slp (slp_node);
3563 gsi = gsi_for_stmt (last_stmt_info->stmt);
3564 init = vect_init_vector (stmt_vinfo, vec_cst, vector_type,
3565 &gsi);
3566 }
3567 else
3568 init = vect_init_vector (stmt_vinfo, vec_cst, vector_type,
3569 NULL);
3570 if (ctor_seq != NULL)
3571 {
3572 gsi = gsi_for_stmt (SSA_NAME_DEF_STMT (init));
3573 gsi_insert_seq_before (&gsi, ctor_seq, GSI_SAME_STMT);
3574 ctor_seq = NULL;
3575 }
3576 voprnds.quick_push (init);
3577 place_after_defs = false;
3578 number_of_places_left_in_vector = nunits;
3579 constant_p = true;
3580 elts.new_vector (vector_type, nunits, 1);
3581 elts.quick_grow (nunits);
3582 }
3583 }
3584 }
3585
3586 /* Since the vectors were created in reverse order, we have to reverse
3587 them here. */
3588 vec_num = voprnds.length ();
3589 for (j = vec_num; j != 0; j--)
3590 {
3591 vop = voprnds[j - 1];
3592 vec_oprnds->quick_push (vop);
3593 }
3594
3595 /* If VF is greater than the unrolling factor needed for the SLP group of
3596 stmts, the NUMBER_OF_VECTORS to be created is greater than
3597 NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
3598 to replicate the vectors. */
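/* E.g. (an assumed trace): if NUMBER_OF_VECTORS is 4 but only {v0, v1}
 were created above and there is no neutral op, the loop below pushes
 v0 and v1 again, yielding {v0, v1, v0, v1}. */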
3599 while (number_of_vectors > vec_oprnds->length ())
3600 {
3601 tree neutral_vec = NULL;
3602
3603 if (neutral_op)
3604 {
3605 if (!neutral_vec)
3606 neutral_vec = build_vector_from_val (vector_type, neutral_op);
3607
3608 vec_oprnds->quick_push (neutral_vec);
3609 }
3610 else
3611 {
3612 for (i = 0; vec_oprnds->iterate (i, &vop) && i < vec_num; i++)
3613 vec_oprnds->quick_push (vop);
3614 }
3615 }
3616 }
3617
3618
3619 /* Get the vectorized definitions from SLP_NODE, which contains the
3620 corresponding vectorized def-stmts. */
3621
3622 static void
3623 vect_get_slp_vect_defs (slp_tree slp_node, vec<tree> *vec_oprnds)
3624 {
3625 stmt_vec_info vec_def_stmt_info;
3626 unsigned int i;
3627
3628 gcc_assert (SLP_TREE_VEC_STMTS (slp_node).exists ());
3629
3630 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (slp_node), i, vec_def_stmt_info)
3631 vec_oprnds->quick_push (gimple_get_lhs (vec_def_stmt_info->stmt));
3632 }
3633
3634
3635 /* Get N vectorized definitions for SLP_NODE.
3636 If the scalar definitions are loop invariants or constants, collect them and
3637 call vect_get_constant_vectors() to create vector stmts.
3638 Otherwise, the def-stmts must already be vectorized and the vectorized stmts
3639 must be stored in the corresponding child of SLP_NODE, and we call
3640 vect_get_slp_vect_defs () to retrieve them. */
3641
3642 void
3643 vect_get_slp_defs (slp_tree slp_node, vec<vec<tree> > *vec_oprnds, unsigned n)
3644 {
3645 if (n == -1U)
3646 n = SLP_TREE_CHILDREN (slp_node).length ();
3647
3648 for (unsigned i = 0; i < n; ++i)
3649 {
3650 slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
3651
3652 vec<tree> vec_defs = vNULL;
3653
3654 /* For each operand we check if it has vectorized definitions in a child
3655 node or we need to create them (for invariants and constants). */
3656 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
3657 {
3658 vec_defs.create (SLP_TREE_NUMBER_OF_VEC_STMTS (child));
3659 vect_get_slp_vect_defs (child, &vec_defs);
3660 }
3661 else
3662 vect_get_constant_vectors (child, slp_node, &vec_defs);
3663
3664 vec_oprnds->quick_push (vec_defs);
3665 }
3666 }
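
/* A hypothetical caller sketch (the local names are illustrative
 assumptions, not code from this file):

 vec<vec<tree> > vec_defs = vNULL;
 vec_defs.create (2);
 vect_get_slp_defs (slp_node, &vec_defs, 2);
 vec<tree> op0_defs = vec_defs[0]; // one tree per vector stmt of child 0
 vec<tree> op1_defs = vec_defs[1]; // likewise for child 1

 Each inner vector holds either the vectorized defs of an internal child
 or constant/invariant vectors built by vect_get_constant_vectors. */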
3667
3668 /* Generate vector permute statements from a list of loads in DR_CHAIN.
3669 If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
3670 permute statements for the SLP node NODE of the SLP instance
3671 SLP_NODE_INSTANCE. */
3672
3673 bool
3674 vect_transform_slp_perm_load (slp_tree node, vec<tree> dr_chain,
3675 gimple_stmt_iterator *gsi, poly_uint64 vf,
3676 slp_instance slp_node_instance, bool analyze_only,
3677 unsigned *n_perms)
3678 {
3679 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
3680 vec_info *vinfo = stmt_info->vinfo;
3681 int vec_index = 0;
3682 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3683 unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (slp_node_instance);
3684 unsigned int mask_element;
3685 machine_mode mode;
3686
3687 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
3688 return false;
3689
3690 stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
3691
3692 mode = TYPE_MODE (vectype);
3693 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
3694
3695 /* Initialize the vect stmts of NODE so that the generated stmts can be
3696 inserted properly later. */
3697 if (! analyze_only)
3698 for (unsigned i = SLP_TREE_VEC_STMTS (node).length ();
3699 i < SLP_TREE_NUMBER_OF_VEC_STMTS (node); i++)
3700 SLP_TREE_VEC_STMTS (node).quick_push (NULL);
3701
3702 /* Generate permutation masks for every NODE. The number of masks for each
3703 NODE is equal to GROUP_SIZE.
3704 E.g., we have a group of three nodes with three loads from the same
3705 location in each node, and the vector size is 4. I.e., we have an
3706 a0b0c0a1b1c1... sequence and we need to create the following vectors:
3707 for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
3708 for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
3709 ...
3710
3711 The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}.
3712 The last mask is illegal since a permute operation takes only two
3713 input vectors, so mask element values cannot lie outside that range.
3714 Hence, the last mask must be converted into {2,5,5,5}.
3715 For the first two permutations we need the first and the second input
3716 vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
3717 we need the second and the third vectors: {b1,c1,a2,b2} and
3718 {c2,a3,b3,c3}. */
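
/* Relating that example to the loop below: for the a's node,
 DR_GROUP_SIZE is 3 and each load permutation entry is 0, so the
 flattened indices are i = 3 * iter + 0 = 0,0,0,3,3,3,6,6,6,9,9,9.
 With four-element vectors, vec_index = i / 4 selects the input vectors
 and mask_element = i % 4 (plus 4 whenever the second input is used)
 yields exactly {0,0,0,3} {3,3,6,6} {2,5,5,5}. */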
3719
3720 int vect_stmts_counter = 0;
3721 unsigned int index = 0;
3722 int first_vec_index = -1;
3723 int second_vec_index = -1;
3724 bool noop_p = true;
3725 *n_perms = 0;
3726
3727 vec_perm_builder mask;
3728 unsigned int nelts_to_build;
3729 unsigned int nvectors_per_build;
3730 bool repeating_p = (group_size == DR_GROUP_SIZE (stmt_info)
3731 && multiple_p (nunits, group_size));
3732 if (repeating_p)
3733 {
3734 /* A single vector contains a whole number of copies of the node, so:
3735 (a) all permutes can use the same mask; and
3736 (b) the permutes only need a single vector input. */
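/* E.g. (an assumed case): for a group of two loads with load
 permutation {1, 0} and eight-element vectors, every vector holds
 four whole copies of the group, so the single encoded mask is
 {1,0,3,2,5,4,7,6} and it serves every vector statement. */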
3737 mask.new_vector (nunits, group_size, 3);
3738 nelts_to_build = mask.encoded_nelts ();
3739 nvectors_per_build = SLP_TREE_VEC_STMTS (node).length ();
3740 }
3741 else
3742 {
3743 /* We need to construct a separate mask for each vector statement. */
3744 unsigned HOST_WIDE_INT const_nunits, const_vf;
3745 if (!nunits.is_constant (&const_nunits)
3746 || !vf.is_constant (&const_vf))
3747 return false;
3748 mask.new_vector (const_nunits, const_nunits, 1);
3749 nelts_to_build = const_vf * group_size;
3750 nvectors_per_build = 1;
3751 }
3752
3753 unsigned int count = mask.encoded_nelts ();
3754 mask.quick_grow (count);
3755 vec_perm_indices indices;
3756
3757 for (unsigned int j = 0; j < nelts_to_build; j++)
3758 {
3759 unsigned int iter_num = j / group_size;
3760 unsigned int stmt_num = j % group_size;
3761 unsigned int i = (iter_num * DR_GROUP_SIZE (stmt_info)
3762 + SLP_TREE_LOAD_PERMUTATION (node)[stmt_num]);
3763 if (repeating_p)
3764 {
3765 first_vec_index = 0;
3766 mask_element = i;
3767 }
3768 else
3769 {
3770 /* Enforced before the loop when !repeating_p. */
3771 unsigned int const_nunits = nunits.to_constant ();
3772 vec_index = i / const_nunits;
3773 mask_element = i % const_nunits;
3774 if (vec_index == first_vec_index
3775 || first_vec_index == -1)
3776 {
3777 first_vec_index = vec_index;
3778 }
3779 else if (vec_index == second_vec_index
3780 || second_vec_index == -1)
3781 {
3782 second_vec_index = vec_index;
3783 mask_element += const_nunits;
3784 }
3785 else
3786 {
3787 if (dump_enabled_p ())
3788 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3789 "permutation requires at "
3790 "least three vectors %G",
3791 stmt_info->stmt);
3792 gcc_assert (analyze_only);
3793 return false;
3794 }
3795
3796 gcc_assert (mask_element < 2 * const_nunits);
3797 }
3798
3799 if (mask_element != index)
3800 noop_p = false;
3801 mask[index++] = mask_element;
3802
3803 if (index == count && !noop_p)
3804 {
3805 indices.new_vector (mask, second_vec_index == -1 ? 1 : 2, nunits);
3806 if (!can_vec_perm_const_p (mode, indices))
3807 {
3808 if (dump_enabled_p ())
3809 {
3810 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
3811 vect_location,
3812 "unsupported vect permute { ");
3813 for (i = 0; i < count; ++i)
3814 {
3815 dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
3816 dump_printf (MSG_MISSED_OPTIMIZATION, " ");
3817 }
3818 dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
3819 }
3820 gcc_assert (analyze_only);
3821 return false;
3822 }
3823
3824 ++*n_perms;
3825 }
3826
3827 if (index == count)
3828 {
3829 if (!analyze_only)
3830 {
3831 tree mask_vec = NULL_TREE;
3832
3833 if (! noop_p)
3834 mask_vec = vect_gen_perm_mask_checked (vectype, indices);
3835
3836 if (second_vec_index == -1)
3837 second_vec_index = first_vec_index;
3838
3839 for (unsigned int ri = 0; ri < nvectors_per_build; ++ri)
3840 {
3841 /* Generate the permute statement if necessary. */
3842 tree first_vec = dr_chain[first_vec_index + ri];
3843 tree second_vec = dr_chain[second_vec_index + ri];
3844 stmt_vec_info perm_stmt_info;
3845 if (! noop_p)
3846 {
3847 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
3848 tree perm_dest
3849 = vect_create_destination_var (gimple_assign_lhs (stmt),
3850 vectype);
3851 perm_dest = make_ssa_name (perm_dest);
3852 gassign *perm_stmt
3853 = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
3854 first_vec, second_vec,
3855 mask_vec);
3856 perm_stmt_info
3857 = vect_finish_stmt_generation (stmt_info, perm_stmt,
3858 gsi);
3859 }
3860 else
3861 /* If the mask was NULL_TREE, the permutation is the
3862 identity, so reuse the existing vector definition. */
3863 perm_stmt_info = vinfo->lookup_def (first_vec);
3864
3865 /* Store the vector statement in NODE. */
3866 SLP_TREE_VEC_STMTS (node)[vect_stmts_counter++]
3867 = perm_stmt_info;
3868 }
3869 }
3870
3871 index = 0;
3872 first_vec_index = -1;
3873 second_vec_index = -1;
3874 noop_p = true;
3875 }
3876 }
3877
3878 return true;
3879 }
3880
3881 /* Vectorize SLP instance tree in postorder. */
3882
3883 static void
3884 vect_schedule_slp_instance (slp_tree node, slp_instance instance,
3885 scalar_stmts_to_slp_tree_map_t *bst_map)
3886 {
3887 gimple_stmt_iterator si;
3888 stmt_vec_info stmt_info;
3889 unsigned int group_size;
3890 tree vectype;
3891 int i, j;
3892 slp_tree child;
3893
3894 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
3895 return;
3896
3897 /* See if we have already vectorized the node in the graph of the
3898 SLP instance. */
3899 if (SLP_TREE_VEC_STMTS (node).exists ())
3900 return;
3901
3902 /* See if we have already vectorized the same set of stmts and reuse their
3903 vectorized stmts across instances. */
3904 if (slp_tree *leader = bst_map->get (SLP_TREE_SCALAR_STMTS (node)))
3905 {
3906 SLP_TREE_VEC_STMTS (node).safe_splice (SLP_TREE_VEC_STMTS (*leader));
3907 return;
3908 }
3909
3910 bst_map->put (SLP_TREE_SCALAR_STMTS (node).copy (), node);
3911 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3912 vect_schedule_slp_instance (child, instance, bst_map);
3913
3914 /* Push SLP node def-type to stmts. */
3915 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3916 if (SLP_TREE_DEF_TYPE (child) != vect_internal_def)
3917 {
3918 stmt_vec_info child_stmt_info;
3919 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (child), j, child_stmt_info)
3920 STMT_VINFO_DEF_TYPE (child_stmt_info) = SLP_TREE_DEF_TYPE (child);
3921 }
3922
3923 stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
3924
3925 /* VECTYPE is the type of the destination. */
3926 vectype = STMT_VINFO_VECTYPE (stmt_info);
3927 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
3928 group_size = SLP_INSTANCE_GROUP_SIZE (instance);
3929
3930 gcc_assert (SLP_TREE_NUMBER_OF_VEC_STMTS (node) != 0);
3931 SLP_TREE_VEC_STMTS (node).create (SLP_TREE_NUMBER_OF_VEC_STMTS (node));
3932
3933 if (dump_enabled_p ())
3934 dump_printf_loc (MSG_NOTE, vect_location,
3935 "------>vectorizing SLP node starting from: %G",
3936 stmt_info->stmt);
3937
3938 /* Vectorized stmts go before the last scalar stmt, which is where
3939 all uses are ready. */
3940 stmt_vec_info last_stmt_info = vect_find_last_scalar_stmt_in_slp (node);
3941 si = gsi_for_stmt (last_stmt_info->stmt);
3942
3943 /* Handle two-operation SLP nodes by vectorizing the group with
3944 both operations and then performing a merge. */
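/* An illustrative trace (assumed values, not from a real run): for the
 scalar group { x0 + y0, x1 - y1, x2 + y2, x3 - y3 } with four-element
 vectors, the loop below records mask = {0,1,0,1} with code0 PLUS_EXPR
 and ocode MINUS_EXPR. The group is vectorized once as a plus (v0) and
 once as a minus (v1), and the final VEC_PERM_EXPR selector {0,5,2,7}
 picks v0[0], v1[1], v0[2], v1[3]. */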
3945 if (SLP_TREE_TWO_OPERATORS (node))
3946 {
3947 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
3948 enum tree_code code0 = gimple_assign_rhs_code (stmt);
3949 enum tree_code ocode = ERROR_MARK;
3950 stmt_vec_info ostmt_info;
3951 vec_perm_builder mask (group_size, group_size, 1);
3952 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, ostmt_info)
3953 {
3954 gassign *ostmt = as_a <gassign *> (ostmt_info->stmt);
3955 if (gimple_assign_rhs_code (ostmt) != code0)
3956 {
3957 mask.quick_push (1);
3958 ocode = gimple_assign_rhs_code (ostmt);
3959 }
3960 else
3961 mask.quick_push (0);
3962 }
3963 if (ocode != ERROR_MARK)
3964 {
3965 vec<stmt_vec_info> v0;
3966 vec<stmt_vec_info> v1;
3967 unsigned j;
3968 tree tmask = NULL_TREE;
3969 vect_transform_stmt (stmt_info, &si, node, instance);
3970 v0 = SLP_TREE_VEC_STMTS (node).copy ();
3971 SLP_TREE_VEC_STMTS (node).truncate (0);
3972 gimple_assign_set_rhs_code (stmt, ocode);
3973 vect_transform_stmt (stmt_info, &si, node, instance);
3974 gimple_assign_set_rhs_code (stmt, code0);
3975 v1 = SLP_TREE_VEC_STMTS (node).copy ();
3976 SLP_TREE_VEC_STMTS (node).truncate (0);
3977 tree meltype = build_nonstandard_integer_type
3978 (GET_MODE_BITSIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype))), 1);
3979 tree mvectype = get_same_sized_vectype (meltype, vectype);
3980 unsigned k = 0, l;
3981 for (j = 0; j < v0.length (); ++j)
3982 {
3983 /* Enforced by vect_build_slp_tree, which rejects variable-length
3984 vectors for SLP_TREE_TWO_OPERATORS. */
3985 unsigned int const_nunits = nunits.to_constant ();
3986 tree_vector_builder melts (mvectype, const_nunits, 1);
3987 for (l = 0; l < const_nunits; ++l)
3988 {
3989 if (k >= group_size)
3990 k = 0;
3991 tree t = build_int_cst (meltype,
3992 mask[k++] * const_nunits + l);
3993 melts.quick_push (t);
3994 }
3995 tmask = melts.build ();
3996
3997 /* ??? Not all targets support a VEC_PERM_EXPR with a
3998 constant mask that would translate to a vec_merge RTX
3999 (with their vec_perm_const_ok). We can either not
4000 vectorize in that case or let veclower do its job.
4001 Unfortunately that isn't too great and at least for
4002 plus/minus we'd eventually like to match targets'
4003 vector addsub instructions. */
4004 gimple *vstmt;
4005 vstmt = gimple_build_assign (make_ssa_name (vectype),
4006 VEC_PERM_EXPR,
4007 gimple_assign_lhs (v0[j]->stmt),
4008 gimple_assign_lhs (v1[j]->stmt),
4009 tmask);
4010 SLP_TREE_VEC_STMTS (node).quick_push
4011 (vect_finish_stmt_generation (stmt_info, vstmt, &si));
4012 }
4013 v0.release ();
4014 v1.release ();
4015 return;
4016 }
4017 }
4018 vect_transform_stmt (stmt_info, &si, node, instance);
4019
4020 /* Restore stmt def-types. */
4021 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4022 if (SLP_TREE_DEF_TYPE (child) != vect_internal_def)
4023 {
4024 stmt_vec_info child_stmt_info;
4025 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (child), j, child_stmt_info)
4026 STMT_VINFO_DEF_TYPE (child_stmt_info) = vect_internal_def;
4027 }
4028 }
4029
4030 /* Replace scalar calls from SLP node NODE with setting of their lhs to zero.
4031 For loop vectorization this is done in vectorizable_call, but for SLP
4032 it needs to be deferred until the end of vect_schedule_slp, because multiple
4033 SLP instances may refer to the same scalar stmt. */
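
/* E.g., a vectorized scalar call such as "x_1 = __builtin_fn (a_2, b_3);"
 (the callee name is only a placeholder) is rewritten below as
 "x_1 = 0;", leaving trivially dead code for later passes to remove. */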
4034
4035 static void
4036 vect_remove_slp_scalar_calls (slp_tree node, hash_set<slp_tree> &visited)
4037 {
4038 gimple *new_stmt;
4039 gimple_stmt_iterator gsi;
4040 int i;
4041 slp_tree child;
4042 tree lhs;
4043 stmt_vec_info stmt_info;
4044
4045 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
4046 return;
4047
4048 if (visited.add (node))
4049 return;
4050
4051 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4052 vect_remove_slp_scalar_calls (child, visited);
4053
4054 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
4055 {
4056 gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt);
4057 if (!stmt || gimple_bb (stmt) == NULL)
4058 continue;
4059 if (is_pattern_stmt_p (stmt_info)
4060 || !PURE_SLP_STMT (stmt_info))
4061 continue;
4062 lhs = gimple_call_lhs (stmt);
4063 new_stmt = gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs)));
4064 gsi = gsi_for_stmt (stmt);
4065 stmt_info->vinfo->replace_stmt (&gsi, stmt_info, new_stmt);
4066 SSA_NAME_DEF_STMT (gimple_assign_lhs (new_stmt)) = new_stmt;
4067 }
4068 }
4069
4070 static void
4071 vect_remove_slp_scalar_calls (slp_tree node)
4072 {
4073 hash_set<slp_tree> visited;
4074 vect_remove_slp_scalar_calls (node, visited);
4075 }
4076
4077 /* Generate vector code for all SLP instances in the loop/basic block. */
4078
4079 void
4080 vect_schedule_slp (vec_info *vinfo)
4081 {
4082 vec<slp_instance> slp_instances;
4083 slp_instance instance;
4084 unsigned int i;
4085
4086 scalar_stmts_to_slp_tree_map_t *bst_map
4087 = new scalar_stmts_to_slp_tree_map_t ();
4088 slp_instances = vinfo->slp_instances;
4089 FOR_EACH_VEC_ELT (slp_instances, i, instance)
4090 {
4091 /* Schedule the tree of INSTANCE. */
4092 vect_schedule_slp_instance (SLP_INSTANCE_TREE (instance),
4093 instance, bst_map);
4094 if (dump_enabled_p ())
4095 dump_printf_loc (MSG_NOTE, vect_location,
4096 "vectorizing stmts using SLP.\n");
4097 }
4098 delete bst_map;
4099
4100 FOR_EACH_VEC_ELT (slp_instances, i, instance)
4101 {
4102 slp_tree root = SLP_INSTANCE_TREE (instance);
4103 stmt_vec_info store_info;
4104 unsigned int j;
4105
4106 /* Remove scalar call stmts. Do not do this for basic-block
4107 vectorization as not all uses may be vectorized.
4108 ??? Why should this be necessary? DCE should be able to
4109 remove the stmts itself.
4110 ??? For BB vectorization we can as well remove scalar
4111 stmts starting from the SLP tree root if they have no
4112 uses. */
4113 if (is_a <loop_vec_info> (vinfo))
4114 vect_remove_slp_scalar_calls (root);
4115
4116 for (j = 0; SLP_TREE_SCALAR_STMTS (root).iterate (j, &store_info)
4117 && j < SLP_INSTANCE_GROUP_SIZE (instance); j++)
4118 {
4119 if (!STMT_VINFO_DATA_REF (store_info))
4120 break;
4121
4122 store_info = vect_orig_stmt (store_info);
4123 /* Free the attached stmt_vec_info and remove the stmt. */
4124 vinfo->remove_stmt (store_info);
4125 }
4126 }
4127 }