/* SLP - Basic Block Vectorization
   Copyright (C) 2007-2019 Free Software Foundation, Inc.
   Contributed by Dorit Naishlos <dorit@il.ibm.com>
   and Ira Rosen <irar@il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */

#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "target.h"
#include "rtl.h"
#include "tree.h"
#include "gimple.h"
#include "tree-pass.h"
#include "ssa.h"
#include "optabs-tree.h"
#include "insn-config.h"
#include "recog.h"   /* FIXME: for insn_data */
#include "params.h"
#include "fold-const.h"
#include "stor-layout.h"
#include "gimple-iterator.h"
#include "cfgloop.h"
#include "tree-vectorizer.h"
#include "langhooks.h"
#include "gimple-walk.h"
#include "dbgcnt.h"
#include "tree-vector-builder.h"
#include "vec-perm-indices.h"
#include "gimple-fold.h"
#include "internal-fn.h"

/* Recursively free the memory allocated for the SLP tree rooted at NODE.
   FINAL_P is true if we have vectorized the instance or if we have
   made a final decision not to vectorize the statements in any way.  */

static void
vect_free_slp_tree (slp_tree node, bool final_p)
{
  int i;
  slp_tree child;

  if (--node->refcnt != 0)
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_free_slp_tree (child, final_p);

  /* Don't update STMT_VINFO_NUM_SLP_USES if it isn't relevant.
     Some statements might no longer exist, after having been
     removed by vect_transform_stmt.  Updating the remaining
     statements would be redundant.  */
  if (!final_p)
    {
      stmt_vec_info stmt_info;
      FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
        {
          gcc_assert (STMT_VINFO_NUM_SLP_USES (stmt_info) > 0);
          STMT_VINFO_NUM_SLP_USES (stmt_info)--;
        }
    }

  SLP_TREE_CHILDREN (node).release ();
  SLP_TREE_SCALAR_STMTS (node).release ();
  SLP_TREE_SCALAR_OPS (node).release ();
  SLP_TREE_VEC_STMTS (node).release ();
  SLP_TREE_LOAD_PERMUTATION (node).release ();

  free (node);
}

/* Free the memory allocated for the SLP instance.  FINAL_P is true if we
   have vectorized the instance or if we have made a final decision not
   to vectorize the statements in any way.  */

void
vect_free_slp_instance (slp_instance instance, bool final_p)
{
  vect_free_slp_tree (SLP_INSTANCE_TREE (instance), final_p);
  SLP_INSTANCE_LOADS (instance).release ();
  free (instance);
}

/* Create an SLP node for SCALAR_STMTS.  */

static slp_tree
vect_create_new_slp_node (vec<stmt_vec_info> scalar_stmts)
{
  slp_tree node;
  stmt_vec_info stmt_info = scalar_stmts[0];
  unsigned int nops;

  if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
    nops = gimple_call_num_args (stmt);
  else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
    {
      nops = gimple_num_ops (stmt) - 1;
      if (gimple_assign_rhs_code (stmt) == COND_EXPR)
        nops++;
    }
  else if (is_a <gphi *> (stmt_info->stmt))
    nops = 0;
  else
    return NULL;

  node = XNEW (struct _slp_tree);
  SLP_TREE_SCALAR_STMTS (node) = scalar_stmts;
  SLP_TREE_SCALAR_OPS (node) = vNULL;
  SLP_TREE_VEC_STMTS (node).create (0);
  SLP_TREE_NUMBER_OF_VEC_STMTS (node) = 0;
  SLP_TREE_CHILDREN (node).create (nops);
  SLP_TREE_LOAD_PERMUTATION (node) = vNULL;
  SLP_TREE_TWO_OPERATORS (node) = false;
  SLP_TREE_DEF_TYPE (node) = vect_internal_def;
  node->refcnt = 1;
  node->max_nunits = 1;

  unsigned i;
  FOR_EACH_VEC_ELT (scalar_stmts, i, stmt_info)
    STMT_VINFO_NUM_SLP_USES (stmt_info)++;

  return node;
}

/* Create an SLP node for OPS.  */

static slp_tree
vect_create_new_slp_node (vec<tree> ops)
{
  slp_tree node;

  node = XNEW (struct _slp_tree);
  SLP_TREE_SCALAR_STMTS (node) = vNULL;
  SLP_TREE_SCALAR_OPS (node) = ops;
  SLP_TREE_VEC_STMTS (node).create (0);
  SLP_TREE_NUMBER_OF_VEC_STMTS (node) = 0;
  SLP_TREE_CHILDREN (node) = vNULL;
  SLP_TREE_LOAD_PERMUTATION (node) = vNULL;
  SLP_TREE_TWO_OPERATORS (node) = false;
  SLP_TREE_DEF_TYPE (node) = vect_external_def;
  node->refcnt = 1;
  node->max_nunits = 1;

  return node;
}

/* This structure is used in creation of an SLP tree.  Each instance
   corresponds to the same operand in a group of scalar stmts in an SLP
   node.  */
typedef struct _slp_oprnd_info
{
  /* Def-stmts for the operands.  */
  vec<stmt_vec_info> def_stmts;
  /* Operands.  */
  vec<tree> ops;
  /* Information about the first statement, its vector def-type, type, the
     operand itself in case it's constant, and an indication if it's a pattern
     stmt.  */
  tree first_op_type;
  enum vect_def_type first_dt;
  bool first_pattern;
  bool second_pattern;
} *slp_oprnd_info;

/* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
   operand.  */
static vec<slp_oprnd_info>
vect_create_oprnd_info (int nops, int group_size)
{
  int i;
  slp_oprnd_info oprnd_info;
  vec<slp_oprnd_info> oprnds_info;

  oprnds_info.create (nops);
  for (i = 0; i < nops; i++)
    {
      oprnd_info = XNEW (struct _slp_oprnd_info);
      oprnd_info->def_stmts.create (group_size);
      oprnd_info->ops.create (group_size);
      oprnd_info->first_dt = vect_uninitialized_def;
      oprnd_info->first_op_type = NULL_TREE;
      oprnd_info->first_pattern = false;
      oprnd_info->second_pattern = false;
      oprnds_info.quick_push (oprnd_info);
    }

  return oprnds_info;
}

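/* Illustrative example (added in editing): for the two-statement group
     a[0] = b[0] + c[0];  a[1] = b[1] + c[1];
   vect_create_oprnd_info (2, 2) yields one slp_oprnd_info that will collect
   the def-stmts of { b[0], b[1] } and one for { c[0], c[1] }: one entry per
   operand position, each spanning the whole group.  */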

/* Free operands info.  */

static void
vect_free_oprnd_info (vec<slp_oprnd_info> &oprnds_info)
{
  int i;
  slp_oprnd_info oprnd_info;

  FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
    {
      oprnd_info->def_stmts.release ();
      oprnd_info->ops.release ();
      XDELETE (oprnd_info);
    }

  oprnds_info.release ();
}

/* Find the place of the data-ref in STMT_INFO in the interleaving chain
   that starts from FIRST_STMT_INFO.  Return -1 if the data-ref is not a part
   of the chain.  */

int
vect_get_place_in_interleaving_chain (stmt_vec_info stmt_info,
                                      stmt_vec_info first_stmt_info)
{
  stmt_vec_info next_stmt_info = first_stmt_info;
  int result = 0;

  if (first_stmt_info != DR_GROUP_FIRST_ELEMENT (stmt_info))
    return -1;

  do
    {
      if (next_stmt_info == stmt_info)
        return result;
      next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
      if (next_stmt_info)
        result += DR_GROUP_GAP (next_stmt_info);
    }
  while (next_stmt_info);

  return -1;
}

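/* Illustrative example (added in editing; it assumes DR_GROUP_GAP holds the
   element distance to the previous chain member): for a chain accessing
   a[0], a[1] and a[3], with DR_GROUP_GAP of 1 for a[1] and 2 for a[3],
   the place of a[3] accumulates to 0 + 1 + 2 == 3, its element index
   within the group.  */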
/* Check whether it is possible to load COUNT elements of type ELT_MODE
   using the method implemented by duplicate_and_interleave.  Return true
   if so, returning the number of intermediate vectors in *NVECTORS_OUT
   (if nonnull) and the type of each intermediate vector in *VECTOR_TYPE_OUT
   (if nonnull).  */

bool
can_duplicate_and_interleave_p (vec_info *vinfo, unsigned int count,
                                machine_mode elt_mode,
                                unsigned int *nvectors_out,
                                tree *vector_type_out,
                                tree *permutes)
{
  poly_int64 elt_bytes = count * GET_MODE_SIZE (elt_mode);
  poly_int64 nelts;
  unsigned int nvectors = 1;
  for (;;)
    {
      scalar_int_mode int_mode;
      poly_int64 elt_bits = elt_bytes * BITS_PER_UNIT;
      if (multiple_p (vinfo->vector_size, elt_bytes, &nelts)
          && int_mode_for_size (elt_bits, 0).exists (&int_mode))
        {
          tree int_type = build_nonstandard_integer_type
            (GET_MODE_BITSIZE (int_mode), 1);
          tree vector_type = build_vector_type (int_type, nelts);
          if (VECTOR_MODE_P (TYPE_MODE (vector_type)))
            {
              vec_perm_builder sel1 (nelts, 2, 3);
              vec_perm_builder sel2 (nelts, 2, 3);
              poly_int64 half_nelts = exact_div (nelts, 2);
              for (unsigned int i = 0; i < 3; ++i)
                {
                  sel1.quick_push (i);
                  sel1.quick_push (i + nelts);
                  sel2.quick_push (half_nelts + i);
                  sel2.quick_push (half_nelts + i + nelts);
                }
              vec_perm_indices indices1 (sel1, 2, nelts);
              vec_perm_indices indices2 (sel2, 2, nelts);
              if (can_vec_perm_const_p (TYPE_MODE (vector_type), indices1)
                  && can_vec_perm_const_p (TYPE_MODE (vector_type), indices2))
                {
                  if (nvectors_out)
                    *nvectors_out = nvectors;
                  if (vector_type_out)
                    *vector_type_out = vector_type;
                  if (permutes)
                    {
                      permutes[0] = vect_gen_perm_mask_checked (vector_type,
                                                                indices1);
                      permutes[1] = vect_gen_perm_mask_checked (vector_type,
                                                                indices2);
                    }
                  return true;
                }
            }
        }
      if (!multiple_p (elt_bytes, 2, &elt_bytes))
        return false;
      nvectors *= 2;
    }
}

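/* Illustrative example (added in editing): with nelts == 4 the two encoded
   permutations select elements { 0, 4, 1, 5 } and { 2, 6, 3, 7 }, i.e. the
   classic interleave-low and interleave-high of two input vectors.  */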
/* Get the defs for the rhs of STMT (collect them in OPRNDS_INFO), check that
   they are of a valid type and that they match the defs of the first stmt of
   the SLP group (stored in OPRNDS_INFO).  This function tries to match stmts
   by swapping operands of STMTS[STMT_NUM] when possible.  A nonzero incoming
   *SWAP indicates that a swap is required for cond_expr stmts: *SWAP is 1 if
   STMT is a cond and the operands of its comparison need to be swapped;
   *SWAP is 2 if STMT is a cond and the code of its comparison needs to be
   inverted.  If any operands are swapped in this function, *SWAP is set to a
   nonzero value on return.
   Return -1 on a fatal error; return 1 if the error could be corrected by
   swapping operands of this node's parent; return 0 if everything is OK.  */
static int
vect_get_and_check_slp_defs (vec_info *vinfo, unsigned char *swap,
                             vec<stmt_vec_info> stmts, unsigned stmt_num,
                             vec<slp_oprnd_info> *oprnds_info)
{
  stmt_vec_info stmt_info = stmts[stmt_num];
  tree oprnd;
  unsigned int i, number_of_oprnds;
  enum vect_def_type dt = vect_uninitialized_def;
  bool pattern = false;
  slp_oprnd_info oprnd_info;
  int first_op_idx = 1;
  unsigned int commutative_op = -1U;
  bool first_op_cond = false;
  bool first = stmt_num == 0;
  bool second = stmt_num == 1;

  if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
    {
      number_of_oprnds = gimple_call_num_args (stmt);
      first_op_idx = 3;
      if (gimple_call_internal_p (stmt))
        {
          internal_fn ifn = gimple_call_internal_fn (stmt);
          commutative_op = first_commutative_argument (ifn);

          /* Masked load, only look at mask.  */
          if (ifn == IFN_MASK_LOAD)
            {
              number_of_oprnds = 1;
              /* Mask operand index.  */
              first_op_idx = 5;
            }
        }
    }
  else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
    {
      enum tree_code code = gimple_assign_rhs_code (stmt);
      number_of_oprnds = gimple_num_ops (stmt) - 1;
      /* Swapping can only be done for cond_expr if asked to, otherwise we
         could end up with a different comparison code from the first stmt.  */
      if (code == COND_EXPR
          && COMPARISON_CLASS_P (gimple_assign_rhs1 (stmt)))
        {
          first_op_cond = true;
          number_of_oprnds++;
        }
      else
        commutative_op = commutative_tree_code (code) ? 0U : -1U;
    }
  else
    return -1;

  bool swapped = (*swap != 0);
  gcc_assert (!swapped || first_op_cond);
  for (i = 0; i < number_of_oprnds; i++)
    {
again:
      if (first_op_cond)
        {
          /* Map indicating how operands of cond_expr should be swapped.  */
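          /* For instance, maps[1] == {1, 0, 2, 3} swaps the two operands
             of the comparison, while maps[2] == {0, 1, 3, 2} swaps the
             then/else arms instead (comment added in editing).  */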
          int maps[3][4] = {{0, 1, 2, 3}, {1, 0, 2, 3}, {0, 1, 3, 2}};
          int *map = maps[*swap];

          if (i < 2)
            oprnd = TREE_OPERAND (gimple_op (stmt_info->stmt,
                                             first_op_idx), map[i]);
          else
            oprnd = gimple_op (stmt_info->stmt, map[i]);
        }
      else
        oprnd = gimple_op (stmt_info->stmt, first_op_idx + (swapped ? !i : i));
      if (TREE_CODE (oprnd) == VIEW_CONVERT_EXPR)
        oprnd = TREE_OPERAND (oprnd, 0);

      oprnd_info = (*oprnds_info)[i];

      stmt_vec_info def_stmt_info;
      if (!vect_is_simple_use (oprnd, vinfo, &dt, &def_stmt_info))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "Build SLP failed: can't analyze def for %T\n",
                             oprnd);

          return -1;
        }

      if (second)
        oprnd_info->second_pattern = pattern;

      if (first)
        {
          oprnd_info->first_dt = dt;
          oprnd_info->first_pattern = pattern;
          oprnd_info->first_op_type = TREE_TYPE (oprnd);
        }
      else
        {
          /* Not first stmt of the group, check that the def-stmt/s match
             the def-stmt/s of the first stmt.  Allow different definition
             types for reduction chains: the first stmt must be a
             vect_reduction_def (a phi node), and the rest
             vect_internal_def.  */
          tree type = TREE_TYPE (oprnd);
          if ((oprnd_info->first_dt != dt
               && !(oprnd_info->first_dt == vect_reduction_def
                    && dt == vect_internal_def)
               && !((oprnd_info->first_dt == vect_external_def
                     || oprnd_info->first_dt == vect_constant_def)
                    && (dt == vect_external_def
                        || dt == vect_constant_def)))
              || !types_compatible_p (oprnd_info->first_op_type, type))
            {
              /* Try swapping operands if we got a mismatch.  */
              if (i == commutative_op && !swapped)
                {
                  swapped = true;
                  goto again;
                }

              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "Build SLP failed: different types\n");

              return 1;
            }
          if ((dt == vect_constant_def
               || dt == vect_external_def)
              && !vinfo->vector_size.is_constant ()
              && (TREE_CODE (type) == BOOLEAN_TYPE
                  || !can_duplicate_and_interleave_p (vinfo, stmts.length (),
                                                      TYPE_MODE (type))))
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "Build SLP failed: invalid type of def "
                                 "for variable-length SLP %T\n", oprnd);
              return -1;
            }
        }

      /* Check the types of the definitions.  */
      switch (dt)
        {
        case vect_constant_def:
        case vect_external_def:
          oprnd_info->def_stmts.quick_push (NULL);
          oprnd_info->ops.quick_push (oprnd);
          break;

        case vect_reduction_def:
        case vect_induction_def:
        case vect_internal_def:
          oprnd_info->def_stmts.quick_push (def_stmt_info);
          oprnd_info->ops.quick_push (oprnd);
          break;

        default:
          /* FORNOW: Not supported.  */
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "Build SLP failed: illegal type of def %T\n",
                             oprnd);

          return -1;
        }
    }

  /* Swap operands.  */
  if (swapped)
    {
      /* If there are already uses of this stmt in an SLP instance then
         we've committed to the operand order and can't swap it.  */
      if (STMT_VINFO_NUM_SLP_USES (stmt_info) != 0)
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "Build SLP failed: cannot swap operands of "
                             "shared stmt %G", stmt_info->stmt);
          return -1;
        }

      if (first_op_cond)
        {
          /* To get rid of this swapping we have to move the stmt code
             to the SLP tree as well (and gather it here per stmt).  */
          gassign *stmt = as_a <gassign *> (stmt_info->stmt);
          tree cond = gimple_assign_rhs1 (stmt);
          enum tree_code code = TREE_CODE (cond);

          /* Swap.  */
          if (*swap == 1)
            {
              swap_ssa_operands (stmt, &TREE_OPERAND (cond, 0),
                                 &TREE_OPERAND (cond, 1));
              TREE_SET_CODE (cond, swap_tree_comparison (code));
            }
          /* Invert.  */
          else
            {
              swap_ssa_operands (stmt, gimple_assign_rhs2_ptr (stmt),
                                 gimple_assign_rhs3_ptr (stmt));
              bool honor_nans = HONOR_NANS (TREE_OPERAND (cond, 0));
              code = invert_tree_comparison (TREE_CODE (cond), honor_nans);
              gcc_assert (code != ERROR_MARK);
              TREE_SET_CODE (cond, code);
            }
        }
      else
        {
          /* Commutative ops need not reflect swapping, ops are in
             the SLP tree.  */
        }
      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "swapped operands to match def types in %G",
                         stmt_info->stmt);
    }

  *swap = swapped;
  return 0;
}

/* Return true if call statements CALL1 and CALL2 are similar enough
   to be combined into the same SLP group.  */

static bool
compatible_calls_p (gcall *call1, gcall *call2)
{
  unsigned int nargs = gimple_call_num_args (call1);
  if (nargs != gimple_call_num_args (call2))
    return false;

  if (gimple_call_combined_fn (call1) != gimple_call_combined_fn (call2))
    return false;

  if (gimple_call_internal_p (call1))
    {
      if (!types_compatible_p (TREE_TYPE (gimple_call_lhs (call1)),
                               TREE_TYPE (gimple_call_lhs (call2))))
        return false;
      for (unsigned int i = 0; i < nargs; ++i)
        if (!types_compatible_p (TREE_TYPE (gimple_call_arg (call1, i)),
                                 TREE_TYPE (gimple_call_arg (call2, i))))
          return false;
    }
  else
    {
      if (!operand_equal_p (gimple_call_fn (call1),
                            gimple_call_fn (call2), 0))
        return false;

      if (gimple_call_fntype (call1) != gimple_call_fntype (call2))
        return false;
    }
  return true;
}

/* A subroutine of vect_build_slp_tree for checking VECTYPE, which is the
   caller's attempt to find the vector type in STMT_INFO with the narrowest
   element type.  Return true if VECTYPE is nonnull and if it is valid
   for STMT_INFO.  When returning true, update MAX_NUNITS to reflect the
   number of units in VECTYPE.  GROUP_SIZE and MAX_NUNITS are as for
   vect_build_slp_tree.  */

static bool
vect_record_max_nunits (stmt_vec_info stmt_info, unsigned int group_size,
                        tree vectype, poly_uint64 *max_nunits)
{
  if (!vectype)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "Build SLP failed: unsupported data-type in %G\n",
                         stmt_info->stmt);
      /* Fatal mismatch.  */
      return false;
    }

  /* If populating the vector type requires unrolling then fail
     before adjusting *max_nunits for basic-block vectorization.  */
  poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
  unsigned HOST_WIDE_INT const_nunits;
  if (STMT_VINFO_BB_VINFO (stmt_info)
      && (!nunits.is_constant (&const_nunits)
          || const_nunits > group_size))
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "Build SLP failed: unrolling required "
                         "in basic block SLP\n");
      /* Fatal mismatch.  */
      return false;
    }

  /* In case of multiple types we need to detect the smallest type.  */
  vect_update_max_nunits (max_nunits, vectype);
  return true;
}

/* STMTS is a group of GROUP_SIZE SLP statements in which some
   statements do the same operation as the first statement and in which
   the others do ALT_STMT_CODE.  Return true if we can take one vector
   of the first operation and one vector of the second and permute them
   to get the required result.  VECTYPE is the type of the vector that
   would be permuted.  */

static bool
vect_two_operations_perm_ok_p (vec<stmt_vec_info> stmts,
                               unsigned int group_size, tree vectype,
                               tree_code alt_stmt_code)
{
  unsigned HOST_WIDE_INT count;
  if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&count))
    return false;

  vec_perm_builder sel (count, count, 1);
  for (unsigned int i = 0; i < count; ++i)
    {
      unsigned int elt = i;
      gassign *stmt = as_a <gassign *> (stmts[i % group_size]->stmt);
      if (gimple_assign_rhs_code (stmt) == alt_stmt_code)
        elt += count;
      sel.quick_push (elt);
    }
  vec_perm_indices indices (sel, 2, count);
  return can_vec_perm_const_p (TYPE_MODE (vectype), indices);
}

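/* Illustrative example (added in editing): for a group of two stmts
   { PLUS_EXPR, MINUS_EXPR } and a four-element vector type, the permutation
   built above is { 0, 5, 2, 7 }: even lanes are taken from the "plus"
   vector, odd lanes from the "minus" vector, an addsub-style blend.  */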
/* Verify whether the scalar stmts STMTS are isomorphic, do not require
   data permutation and are of supported types of operation.  Return
   true if they are, otherwise return false and indicate in *MATCHES
   which stmts are not isomorphic to the first one.  If MATCHES[0]
   is false then this indicates the comparison could not be
   carried out or the stmts will never be vectorized by SLP.

   Note COND_EXPR is possibly isomorphic to another one after swapping its
   operands.  Set SWAP[i] to 1 if stmt I is a COND_EXPR and isomorphic to
   the first stmt by swapping the two operands of its comparison; set SWAP[i]
   to 2 if stmt I is isomorphic to the first stmt by inverting the code
   of the comparison.  Take A1 >= B1 ? X1 : Y1 as an example: it can be
   swapped to B1 <= A1 ? X1 : Y1, or be inverted to A1 < B1 ? Y1 : X1.  */

static bool
vect_build_slp_tree_1 (unsigned char *swap,
                       vec<stmt_vec_info> stmts, unsigned int group_size,
                       poly_uint64 *max_nunits, bool *matches,
                       bool *two_operators)
{
  unsigned int i;
  stmt_vec_info first_stmt_info = stmts[0];
  enum tree_code first_stmt_code = ERROR_MARK;
  enum tree_code alt_stmt_code = ERROR_MARK;
  enum tree_code rhs_code = ERROR_MARK;
  enum tree_code first_cond_code = ERROR_MARK;
  tree lhs;
  bool need_same_oprnds = false;
  tree vectype = NULL_TREE, first_op1 = NULL_TREE;
  optab optab;
  int icode;
  machine_mode optab_op2_mode;
  machine_mode vec_mode;
  stmt_vec_info first_load = NULL, prev_first_load = NULL;
  bool load_p = false;

  /* For every stmt in NODE find its def stmt/s.  */
  stmt_vec_info stmt_info;
  FOR_EACH_VEC_ELT (stmts, i, stmt_info)
    {
      gimple *stmt = stmt_info->stmt;
      swap[i] = 0;
      matches[i] = false;

      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location, "Build SLP for %G", stmt);

      /* Fail to vectorize statements marked as unvectorizable.  */
      if (!STMT_VINFO_VECTORIZABLE (stmt_info))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "Build SLP failed: unvectorizable statement %G",
                             stmt);
          /* Fatal mismatch.  */
          matches[0] = false;
          return false;
        }

      lhs = gimple_get_lhs (stmt);
      if (lhs == NULL_TREE)
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "Build SLP failed: not GIMPLE_ASSIGN nor "
                             "GIMPLE_CALL %G", stmt);
          /* Fatal mismatch.  */
          matches[0] = false;
          return false;
        }

      tree nunits_vectype;
      if (!vect_get_vector_types_for_stmt (stmt_info, &vectype,
                                           &nunits_vectype)
          || (nunits_vectype
              && !vect_record_max_nunits (stmt_info, group_size,
                                          nunits_vectype, max_nunits)))
        {
          /* Fatal mismatch.  */
          matches[0] = false;
          return false;
        }

      gcc_assert (vectype);

      if (gcall *call_stmt = dyn_cast <gcall *> (stmt))
        {
          rhs_code = CALL_EXPR;

          if (gimple_call_internal_p (stmt, IFN_MASK_LOAD))
            load_p = true;
          else if ((gimple_call_internal_p (call_stmt)
                    && (!vectorizable_internal_fn_p
                        (gimple_call_internal_fn (call_stmt))))
                   || gimple_call_tail_p (call_stmt)
                   || gimple_call_noreturn_p (call_stmt)
                   || !gimple_call_nothrow_p (call_stmt)
                   || gimple_call_chain (call_stmt))
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "Build SLP failed: unsupported call type %G",
                                 call_stmt);
              /* Fatal mismatch.  */
              matches[0] = false;
              return false;
            }
        }
      else
        {
          rhs_code = gimple_assign_rhs_code (stmt);
          load_p = TREE_CODE_CLASS (rhs_code) == tcc_reference;
        }

      /* Check the operation.  */
      if (i == 0)
        {
          first_stmt_code = rhs_code;

          /* Shift arguments should be equal in all the packed stmts for a
             vector shift with scalar shift operand.  */
          if (rhs_code == LSHIFT_EXPR || rhs_code == RSHIFT_EXPR
              || rhs_code == LROTATE_EXPR
              || rhs_code == RROTATE_EXPR)
            {
              if (vectype == boolean_type_node)
                {
                  if (dump_enabled_p ())
                    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                     "Build SLP failed: shift of a"
                                     " boolean.\n");
                  /* Fatal mismatch.  */
                  matches[0] = false;
                  return false;
                }

              vec_mode = TYPE_MODE (vectype);

              /* First see if we have a vector/vector shift.  */
              optab = optab_for_tree_code (rhs_code, vectype,
                                           optab_vector);

              if (!optab
                  || optab_handler (optab, vec_mode) == CODE_FOR_nothing)
                {
                  /* No vector/vector shift, try for a vector/scalar shift.  */
                  optab = optab_for_tree_code (rhs_code, vectype,
                                               optab_scalar);

                  if (!optab)
                    {
                      if (dump_enabled_p ())
                        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                         "Build SLP failed: no optab.\n");
                      /* Fatal mismatch.  */
                      matches[0] = false;
                      return false;
                    }
                  icode = (int) optab_handler (optab, vec_mode);
                  if (icode == CODE_FOR_nothing)
                    {
                      if (dump_enabled_p ())
                        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                         "Build SLP failed: "
                                         "op not supported by target.\n");
                      /* Fatal mismatch.  */
                      matches[0] = false;
                      return false;
                    }
                  optab_op2_mode = insn_data[icode].operand[2].mode;
                  if (!VECTOR_MODE_P (optab_op2_mode))
                    {
                      need_same_oprnds = true;
                      first_op1 = gimple_assign_rhs2 (stmt);
                    }
                }
            }
          else if (rhs_code == WIDEN_LSHIFT_EXPR)
            {
              need_same_oprnds = true;
              first_op1 = gimple_assign_rhs2 (stmt);
            }
        }
      else
        {
          if (first_stmt_code != rhs_code
              && alt_stmt_code == ERROR_MARK)
            alt_stmt_code = rhs_code;
          if (first_stmt_code != rhs_code
              && (first_stmt_code != IMAGPART_EXPR
                  || rhs_code != REALPART_EXPR)
              && (first_stmt_code != REALPART_EXPR
                  || rhs_code != IMAGPART_EXPR)
              /* Handle mismatches in plus/minus by computing both
                 and merging the results.  */
              && !((first_stmt_code == PLUS_EXPR
                    || first_stmt_code == MINUS_EXPR)
                   && (alt_stmt_code == PLUS_EXPR
                       || alt_stmt_code == MINUS_EXPR)
                   && rhs_code == alt_stmt_code)
              && !(STMT_VINFO_GROUPED_ACCESS (stmt_info)
                   && (first_stmt_code == ARRAY_REF
                       || first_stmt_code == BIT_FIELD_REF
                       || first_stmt_code == INDIRECT_REF
                       || first_stmt_code == COMPONENT_REF
                       || first_stmt_code == MEM_REF)))
            {
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "Build SLP failed: different operation "
                                   "in stmt %G", stmt);
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "original stmt %G", first_stmt_info->stmt);
                }
              /* Mismatch.  */
              continue;
            }

          if (need_same_oprnds
              && !operand_equal_p (first_op1, gimple_assign_rhs2 (stmt), 0))
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "Build SLP failed: different shift "
                                 "arguments in %G", stmt);
              /* Mismatch.  */
              continue;
            }

          if (!load_p && rhs_code == CALL_EXPR)
            {
              if (!compatible_calls_p (as_a <gcall *> (stmts[0]->stmt),
                                       as_a <gcall *> (stmt)))
                {
                  if (dump_enabled_p ())
                    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                     "Build SLP failed: different calls in %G",
                                     stmt);
                  /* Mismatch.  */
                  continue;
                }
            }
        }

      /* Grouped store or load.  */
      if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
        {
          if (REFERENCE_CLASS_P (lhs))
            {
              /* Store.  */
              ;
            }
          else
            {
              /* Load.  */
              first_load = DR_GROUP_FIRST_ELEMENT (stmt_info);
              if (prev_first_load)
                {
                  /* Check that there are no loads from different interleaving
                     chains in the same node.  */
                  if (prev_first_load != first_load)
                    {
                      if (dump_enabled_p ())
                        dump_printf_loc (MSG_MISSED_OPTIMIZATION,
                                         vect_location,
                                         "Build SLP failed: different "
                                         "interleaving chains in one node %G",
                                         stmt);
                      /* Mismatch.  */
                      continue;
                    }
                }
              else
                prev_first_load = first_load;
            }
        } /* Grouped access.  */
      else
        {
          if (load_p)
            {
              /* Non-grouped load.  */
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "Build SLP failed: not grouped load %G", stmt);

              /* FORNOW: Non-grouped loads are not supported.  */
              /* Fatal mismatch.  */
              matches[0] = false;
              return false;
            }

          /* Not a memory operation.  */
          if (TREE_CODE_CLASS (rhs_code) != tcc_binary
              && TREE_CODE_CLASS (rhs_code) != tcc_unary
              && TREE_CODE_CLASS (rhs_code) != tcc_expression
              && TREE_CODE_CLASS (rhs_code) != tcc_comparison
              && rhs_code != VIEW_CONVERT_EXPR
              && rhs_code != CALL_EXPR)
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "Build SLP failed: operation unsupported %G",
                                 stmt);
              /* Fatal mismatch.  */
              matches[0] = false;
              return false;
            }

          if (rhs_code == COND_EXPR)
            {
              tree cond_expr = gimple_assign_rhs1 (stmt);
              enum tree_code cond_code = TREE_CODE (cond_expr);
              enum tree_code swap_code = ERROR_MARK;
              enum tree_code invert_code = ERROR_MARK;

              if (i == 0)
                first_cond_code = TREE_CODE (cond_expr);
              else if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
                {
                  bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
                  swap_code = swap_tree_comparison (cond_code);
                  invert_code = invert_tree_comparison (cond_code, honor_nans);
                }

              if (first_cond_code == cond_code)
                ;
              /* Isomorphic can be achieved by swapping.  */
              else if (first_cond_code == swap_code)
                swap[i] = 1;
              /* Isomorphic can be achieved by inverting.  */
              else if (first_cond_code == invert_code)
                swap[i] = 2;
              else
                {
                  if (dump_enabled_p ())
                    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                     "Build SLP failed: different"
                                     " operation %G", stmt);
                  /* Mismatch.  */
                  continue;
                }
            }
        }

      matches[i] = true;
    }

  for (i = 0; i < group_size; ++i)
    if (!matches[i])
      return false;

  /* If we allowed a two-operation SLP node verify the target can cope
     with the permute we are going to use.  */
  if (alt_stmt_code != ERROR_MARK
      && TREE_CODE_CLASS (alt_stmt_code) != tcc_reference)
    {
      if (vectype == boolean_type_node
          || !vect_two_operations_perm_ok_p (stmts, group_size,
                                             vectype, alt_stmt_code))
        {
          for (i = 0; i < group_size; ++i)
            if (gimple_assign_rhs_code (stmts[i]->stmt) == alt_stmt_code)
              {
                matches[i] = false;
                if (dump_enabled_p ())
                  {
                    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                     "Build SLP failed: different operation "
                                     "in stmt %G", stmts[i]->stmt);
                    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                     "original stmt %G", first_stmt_info->stmt);
                  }
              }
          return false;
        }
      *two_operators = true;
    }

  return true;
}

/* Traits for the hash_set to record failed SLP builds for a stmt set.
   Note we never remove apart from at destruction time so we do not
   need a special value for deleted that differs from empty.  */
struct bst_traits
{
  typedef vec <stmt_vec_info> value_type;
  typedef vec <stmt_vec_info> compare_type;
  static inline hashval_t hash (value_type);
  static inline bool equal (value_type existing, value_type candidate);
  static inline bool is_empty (value_type x) { return !x.exists (); }
  static inline bool is_deleted (value_type x) { return !x.exists (); }
  static inline void mark_empty (value_type &x) { x.release (); }
  static inline void mark_deleted (value_type &x) { x.release (); }
  static inline void remove (value_type &x) { x.release (); }
};
inline hashval_t
bst_traits::hash (value_type x)
{
  inchash::hash h;
  for (unsigned i = 0; i < x.length (); ++i)
    h.add_int (gimple_uid (x[i]->stmt));
  return h.end ();
}
inline bool
bst_traits::equal (value_type existing, value_type candidate)
{
  if (existing.length () != candidate.length ())
    return false;
  for (unsigned i = 0; i < existing.length (); ++i)
    if (existing[i] != candidate[i])
      return false;
  return true;
}

typedef hash_map <vec <gimple *>, slp_tree,
                  simple_hashmap_traits <bst_traits, slp_tree> >
  scalar_stmts_to_slp_tree_map_t;

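/* The map above caches every attempted SLP build for a given stmt vector:
   a non-NULL value is an already-built node that can be shared, a NULL
   value records a failed build, so neither is attempted again (comment
   added in editing).  */
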
static slp_tree
vect_build_slp_tree_2 (vec_info *vinfo,
                       vec<stmt_vec_info> stmts, unsigned int group_size,
                       poly_uint64 *max_nunits,
                       bool *matches, unsigned *npermutes, unsigned *tree_size,
                       scalar_stmts_to_slp_tree_map_t *bst_map);

static slp_tree
vect_build_slp_tree (vec_info *vinfo,
                     vec<stmt_vec_info> stmts, unsigned int group_size,
                     poly_uint64 *max_nunits,
                     bool *matches, unsigned *npermutes, unsigned *tree_size,
                     scalar_stmts_to_slp_tree_map_t *bst_map)
{
  if (slp_tree *leader = bst_map->get (stmts))
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location, "re-using %sSLP tree %p\n",
                         *leader ? "" : "failed ", *leader);
      if (*leader)
        {
          (*leader)->refcnt++;
          vect_update_max_nunits (max_nunits, (*leader)->max_nunits);
        }
      return *leader;
    }
  poly_uint64 this_max_nunits = 1;
  slp_tree res = vect_build_slp_tree_2 (vinfo, stmts, group_size,
                                        &this_max_nunits,
                                        matches, npermutes, tree_size, bst_map);
  if (res)
    {
      res->max_nunits = this_max_nunits;
      vect_update_max_nunits (max_nunits, this_max_nunits);
      /* Keep a reference for the bst_map use.  */
      res->refcnt++;
    }
  bst_map->put (stmts.copy (), res);
  return res;
}

/* Recursively build an SLP tree starting from NODE.
   Fail (and return NULL) if def-stmts are not isomorphic, require data
   permutation or are of unsupported types of operation.  Otherwise
   return the built SLP node.  */

static slp_tree
vect_build_slp_tree_2 (vec_info *vinfo,
                       vec<stmt_vec_info> stmts, unsigned int group_size,
                       poly_uint64 *max_nunits,
                       bool *matches, unsigned *npermutes, unsigned *tree_size,
                       scalar_stmts_to_slp_tree_map_t *bst_map)
{
  unsigned nops, i, this_tree_size = 0;
  poly_uint64 this_max_nunits = *max_nunits;
  slp_tree node;

  matches[0] = false;

  stmt_vec_info stmt_info = stmts[0];
  if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
    nops = gimple_call_num_args (stmt);
  else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
    {
      nops = gimple_num_ops (stmt) - 1;
      if (gimple_assign_rhs_code (stmt) == COND_EXPR)
        nops++;
    }
  else if (is_a <gphi *> (stmt_info->stmt))
    nops = 0;
  else
    return NULL;

  /* If the SLP node is a PHI (induction or reduction), terminate
     the recursion.  */
  if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
    {
      tree scalar_type = TREE_TYPE (PHI_RESULT (stmt));
      tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type);
      if (!vect_record_max_nunits (stmt_info, group_size, vectype, max_nunits))
        return NULL;

      vect_def_type def_type = STMT_VINFO_DEF_TYPE (stmt_info);
      /* Induction from different IVs is not supported.  */
      if (def_type == vect_induction_def)
        {
          stmt_vec_info other_info;
          FOR_EACH_VEC_ELT (stmts, i, other_info)
            if (stmt_info != other_info)
              return NULL;
        }
      else if (def_type == vect_reduction_def
               || def_type == vect_double_reduction_def
               || def_type == vect_nested_cycle)
        {
          /* Else def types have to match.  */
          stmt_vec_info other_info;
          FOR_EACH_VEC_ELT (stmts, i, other_info)
            {
              /* But for reduction chains only check on the first stmt.  */
              if (!STMT_VINFO_DATA_REF (other_info)
                  && REDUC_GROUP_FIRST_ELEMENT (other_info)
                  && REDUC_GROUP_FIRST_ELEMENT (other_info) != stmt_info)
                continue;
              if (STMT_VINFO_DEF_TYPE (other_info) != def_type)
                return NULL;
            }
        }
      else
        return NULL;
      (*tree_size)++;
      node = vect_create_new_slp_node (stmts);
      return node;
    }

  bool two_operators = false;
  unsigned char *swap = XALLOCAVEC (unsigned char, group_size);
  if (!vect_build_slp_tree_1 (swap, stmts, group_size,
                              &this_max_nunits, matches, &two_operators))
    return NULL;

  /* If the SLP node is a load, terminate the recursion unless masked.  */
  if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
      && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
    {
      if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
        {
          /* Masked load.  */
          gcc_assert (gimple_call_internal_p (stmt, IFN_MASK_LOAD));
          nops = 1;
        }
      else
        {
          *max_nunits = this_max_nunits;
          (*tree_size)++;
          node = vect_create_new_slp_node (stmts);
          return node;
        }
    }

  /* Get at the operands, verifying they are compatible.  */
  vec<slp_oprnd_info> oprnds_info = vect_create_oprnd_info (nops, group_size);
  slp_oprnd_info oprnd_info;
  FOR_EACH_VEC_ELT (stmts, i, stmt_info)
    {
      int res = vect_get_and_check_slp_defs (vinfo, &swap[i],
                                             stmts, i, &oprnds_info);
      if (res != 0)
        matches[(res == -1) ? 0 : i] = false;
      if (!matches[0])
        break;
    }
  for (i = 0; i < group_size; ++i)
    if (!matches[i])
      {
        vect_free_oprnd_info (oprnds_info);
        return NULL;
      }

  auto_vec<slp_tree, 4> children;

  stmt_info = stmts[0];

  /* Create SLP_TREE nodes for the definition node/s.  */
  FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
    {
      slp_tree child;
      unsigned old_tree_size = this_tree_size;
      unsigned int j;

      if (oprnd_info->first_dt == vect_uninitialized_def)
        {
          /* A COND_EXPR ends up with one operand slot too many if its
             condition is an SSA name.  */
          gcc_assert (i == 3 && nops == 4);
          continue;
        }

      if (oprnd_info->first_dt != vect_internal_def
          && oprnd_info->first_dt != vect_reduction_def
          && oprnd_info->first_dt != vect_induction_def)
        {
          slp_tree invnode = vect_create_new_slp_node (oprnd_info->ops);
          SLP_TREE_DEF_TYPE (invnode) = oprnd_info->first_dt;
          oprnd_info->ops = vNULL;
          children.safe_push (invnode);
          continue;
        }

      if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
                                        group_size, &this_max_nunits,
                                        matches, npermutes,
                                        &this_tree_size, bst_map)) != NULL)
        {
          /* If we have all children of child built up from scalars then just
             throw that away and build this node up from scalars instead.  */
          if (is_a <bb_vec_info> (vinfo)
              && !SLP_TREE_CHILDREN (child).is_empty ()
              /* ??? Rejecting patterns this way doesn't work.  We'd have to
                 do extra work to cancel the pattern so the uses see the
                 scalar version.  */
              && !is_pattern_stmt_p (SLP_TREE_SCALAR_STMTS (child)[0]))
            {
              slp_tree grandchild;

              FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild)
                if (SLP_TREE_DEF_TYPE (grandchild) == vect_internal_def)
                  break;
              if (!grandchild)
                {
                  /* Roll back.  */
                  this_tree_size = old_tree_size;
                  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild)
                    vect_free_slp_tree (grandchild, false);
                  SLP_TREE_CHILDREN (child).truncate (0);

                  if (dump_enabled_p ())
                    dump_printf_loc (MSG_NOTE, vect_location,
                                     "Building parent vector operands from "
                                     "scalars instead\n");
                  oprnd_info->def_stmts = vNULL;
                  SLP_TREE_DEF_TYPE (child) = vect_external_def;
                  SLP_TREE_SCALAR_OPS (child) = oprnd_info->ops;
                  oprnd_info->ops = vNULL;
                  ++this_tree_size;
                  children.safe_push (child);
                  continue;
                }
            }

          oprnd_info->def_stmts = vNULL;
          children.safe_push (child);
          continue;
        }

      /* If the SLP build failed fatally and we analyze a basic-block
         simply treat nodes we fail to build as externally defined
         (and thus build vectors from the scalar defs).
         The cost model will reject outright expensive cases.
         ??? This doesn't treat cases where permutation ultimately
         fails (or we don't try permutation below).  Ideally we'd
         even compute a permutation that will end up with the maximum
         SLP tree size...  */
      if (is_a <bb_vec_info> (vinfo)
          && !matches[0]
          /* ??? Rejecting patterns this way doesn't work.  We'd have to
             do extra work to cancel the pattern so the uses see the
             scalar version.  */
          && !is_pattern_stmt_p (stmt_info))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "Building vector operands from scalars\n");
          this_tree_size++;
          child = vect_create_new_slp_node (oprnd_info->def_stmts);
          SLP_TREE_DEF_TYPE (child) = vect_external_def;
          SLP_TREE_SCALAR_OPS (child) = oprnd_info->ops;
          children.safe_push (child);
          oprnd_info->ops = vNULL;
          oprnd_info->def_stmts = vNULL;
          continue;
        }

      /* If the SLP build for operand zero failed and operands zero
         and one can be commutated try that for the scalar stmts
         that failed the match.  */
      if (i == 0
          /* A first scalar stmt mismatch signals a fatal mismatch.  */
          && matches[0]
          /* ??? For COND_EXPRs we can swap the comparison operands
             as well as the arms under some constraints.  */
          && nops == 2
          && oprnds_info[1]->first_dt == vect_internal_def
          && is_gimple_assign (stmt_info->stmt)
          /* Swapping operands for reductions breaks assumptions later on.  */
          && STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
          && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def
          /* Do so only if the number of unsuccessful permutes was no more
             than a cut-off, as re-trying the recursive match on
             possibly each level of the tree would expose exponential
             behavior.  */
          && *npermutes < 4)
        {
          /* See whether we can swap the matching or the non-matching
             stmt operands.  */
          bool swap_not_matching = true;
          do
            {
              for (j = 0; j < group_size; ++j)
                {
                  if (matches[j] != !swap_not_matching)
                    continue;
                  stmt_vec_info stmt_info = stmts[j];
                  /* Verify if we can swap operands of this stmt.  */
                  gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt);
                  if (!stmt
                      || !commutative_tree_code (gimple_assign_rhs_code (stmt)))
                    {
                      if (!swap_not_matching)
                        goto fail;
                      swap_not_matching = false;
                      break;
                    }
                  /* Verify if we can safely swap or if we committed to a
                     specific operand order already.
                     ??? Instead of modifying GIMPLE stmts here we could
                     record whether we want to swap operands in the SLP
                     node and temporarily do that when processing it
                     (or wrap operand accessors in a helper).  */
                  else if (swap[j] != 0
                           || STMT_VINFO_NUM_SLP_USES (stmt_info))
                    {
                      if (!swap_not_matching)
                        {
                          if (dump_enabled_p ())
                            dump_printf_loc (MSG_MISSED_OPTIMIZATION,
                                             vect_location,
                                             "Build SLP failed: cannot swap "
                                             "operands of shared stmt %G",
                                             stmts[j]->stmt);
                          goto fail;
                        }
                      swap_not_matching = false;
                      break;
                    }
                }
            }
          while (j != group_size);

          /* Swap mismatched definition stmts.  */
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "Re-trying with swapped operands of stmts ");
          for (j = 0; j < group_size; ++j)
            if (matches[j] == !swap_not_matching)
              {
                std::swap (oprnds_info[0]->def_stmts[j],
                           oprnds_info[1]->def_stmts[j]);
                std::swap (oprnds_info[0]->ops[j],
                           oprnds_info[1]->ops[j]);
                if (dump_enabled_p ())
                  dump_printf (MSG_NOTE, "%d ", j);
              }
          if (dump_enabled_p ())
            dump_printf (MSG_NOTE, "\n");
          /* And try again with scratch 'matches' ...  */
          bool *tem = XALLOCAVEC (bool, group_size);
          if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
                                            group_size, &this_max_nunits,
                                            tem, npermutes,
                                            &this_tree_size, bst_map)) != NULL)
            {
              /* If we have all children of child built up from scalars then
                 just throw that away and build this node up from scalars
                 instead.  */
              if (is_a <bb_vec_info> (vinfo)
                  && !SLP_TREE_CHILDREN (child).is_empty ()
                  /* ??? Rejecting patterns this way doesn't work.  We'd have
                     to do extra work to cancel the pattern so the uses see the
                     scalar version.  */
                  && !is_pattern_stmt_p (SLP_TREE_SCALAR_STMTS (child)[0]))
                {
                  unsigned int j;
                  slp_tree grandchild;

                  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild)
                    if (SLP_TREE_DEF_TYPE (grandchild) == vect_internal_def)
                      break;
                  if (!grandchild)
                    {
                      /* Roll back.  */
                      this_tree_size = old_tree_size;
                      FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild)
                        vect_free_slp_tree (grandchild, false);
                      SLP_TREE_CHILDREN (child).truncate (0);

                      if (dump_enabled_p ())
                        dump_printf_loc (MSG_NOTE, vect_location,
                                         "Building parent vector operands from "
                                         "scalars instead\n");
                      oprnd_info->def_stmts = vNULL;
                      SLP_TREE_DEF_TYPE (child) = vect_external_def;
                      SLP_TREE_SCALAR_OPS (child) = oprnd_info->ops;
                      oprnd_info->ops = vNULL;
                      ++this_tree_size;
                      children.safe_push (child);
                      continue;
                    }
                }

              oprnd_info->def_stmts = vNULL;
              children.safe_push (child);
              continue;
            }

          ++*npermutes;
        }

fail:
      gcc_assert (child == NULL);
      FOR_EACH_VEC_ELT (children, j, child)
        vect_free_slp_tree (child, false);
      vect_free_oprnd_info (oprnds_info);
      return NULL;
    }

  vect_free_oprnd_info (oprnds_info);

  *tree_size += this_tree_size + 1;
  *max_nunits = this_max_nunits;

  node = vect_create_new_slp_node (stmts);
  SLP_TREE_TWO_OPERATORS (node) = two_operators;
  SLP_TREE_CHILDREN (node).splice (children);
  return node;
}

/* Dump an SLP tree NODE using flags specified in DUMP_KIND.  */

static void
vect_print_slp_tree (dump_flags_t dump_kind, dump_location_t loc,
                     slp_tree node, hash_set<slp_tree> &visited)
{
  int i;
  stmt_vec_info stmt_info;
  slp_tree child;

  if (visited.add (node))
    return;

  dump_metadata_t metadata (dump_kind, loc.get_impl_location ());
  dump_user_location_t user_loc = loc.get_user_location ();
  dump_printf_loc (metadata, user_loc, "node%s %p (max_nunits=%u)\n",
                   SLP_TREE_DEF_TYPE (node) != vect_internal_def
                   ? " (external)" : "", node,
                   estimated_poly_value (node->max_nunits));
  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
    dump_printf_loc (metadata, user_loc, "\tstmt %d %G", i, stmt_info->stmt);
  if (SLP_TREE_CHILDREN (node).is_empty ())
    return;
  dump_printf_loc (metadata, user_loc, "\tchildren");
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    dump_printf (dump_kind, " %p", (void *)child);
  dump_printf (dump_kind, "\n");
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_print_slp_tree (dump_kind, loc, child, visited);
}

static void
vect_print_slp_tree (dump_flags_t dump_kind, dump_location_t loc,
                     slp_tree node)
{
  hash_set<slp_tree> visited;
  vect_print_slp_tree (dump_kind, loc, node, visited);
}

/* Mark the tree rooted at NODE with PURE_SLP.  */

static void
vect_mark_slp_stmts (slp_tree node, hash_set<slp_tree> &visited)
{
  int i;
  stmt_vec_info stmt_info;
  slp_tree child;

  if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
    return;

  if (visited.add (node))
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
    STMT_SLP_TYPE (stmt_info) = pure_slp;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_mark_slp_stmts (child, visited);
}

static void
vect_mark_slp_stmts (slp_tree node)
{
  hash_set<slp_tree> visited;
  vect_mark_slp_stmts (node, visited);
}

/* Mark the statements of the tree rooted at NODE as relevant (vect_used).  */

static void
vect_mark_slp_stmts_relevant (slp_tree node, hash_set<slp_tree> &visited)
{
  int i;
  stmt_vec_info stmt_info;
  slp_tree child;

  if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
    return;

  if (visited.add (node))
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
    {
      gcc_assert (!STMT_VINFO_RELEVANT (stmt_info)
                  || STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope);
      STMT_VINFO_RELEVANT (stmt_info) = vect_used_in_scope;
    }

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_mark_slp_stmts_relevant (child, visited);
}

static void
vect_mark_slp_stmts_relevant (slp_tree node)
{
  hash_set<slp_tree> visited;
  vect_mark_slp_stmts_relevant (node, visited);
}

/* Rearrange the statements of NODE according to PERMUTATION.  */

static void
vect_slp_rearrange_stmts (slp_tree node, unsigned int group_size,
                          vec<unsigned> permutation,
                          hash_set<slp_tree> &visited)
{
  unsigned int i;
  slp_tree child;

  if (visited.add (node))
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_slp_rearrange_stmts (child, group_size, permutation, visited);

  if (SLP_TREE_SCALAR_STMTS (node).exists ())
    {
      gcc_assert (group_size == SLP_TREE_SCALAR_STMTS (node).length ());
      vec<stmt_vec_info> tmp_stmts;
      tmp_stmts.create (group_size);
      tmp_stmts.quick_grow (group_size);
      stmt_vec_info stmt_info;
      FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
        tmp_stmts[permutation[i]] = stmt_info;
      SLP_TREE_SCALAR_STMTS (node).release ();
      SLP_TREE_SCALAR_STMTS (node) = tmp_stmts;
    }
  if (SLP_TREE_SCALAR_OPS (node).exists ())
    {
      gcc_assert (group_size == SLP_TREE_SCALAR_OPS (node).length ());
      vec<tree> tmp_ops;
      tmp_ops.create (group_size);
      tmp_ops.quick_grow (group_size);
      tree op;
      FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
        tmp_ops[permutation[i]] = op;
      SLP_TREE_SCALAR_OPS (node).release ();
      SLP_TREE_SCALAR_OPS (node) = tmp_ops;
    }
}

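/* Illustrative example (added in editing): PERMUTATION[i] gives the new
   position of element i, so with permutation { 2, 0, 1 } the stmts
   { s0, s1, s2 } are rearranged to { s1, s2, s0 }.  */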

/* Attempt to reorder stmts in a reduction chain so that we don't
   require any load permutation.  Return true if that was possible,
   otherwise return false.  */

static bool
vect_attempt_slp_rearrange_stmts (slp_instance slp_instn)
{
  unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (slp_instn);
  unsigned int i, j;
  unsigned int lidx;
  slp_tree node, load;

  /* Compare all the permutation sequences to the first one.  We know
     that at least one load is permuted.  */
  node = SLP_INSTANCE_LOADS (slp_instn)[0];
  if (!node->load_permutation.exists ())
    return false;
  for (i = 1; SLP_INSTANCE_LOADS (slp_instn).iterate (i, &load); ++i)
    {
      if (!load->load_permutation.exists ())
        return false;
      FOR_EACH_VEC_ELT (load->load_permutation, j, lidx)
        if (lidx != node->load_permutation[j])
          return false;
    }

  /* Check that the loads in the first sequence are different and there
     are no gaps between them.  */
  auto_sbitmap load_index (group_size);
  bitmap_clear (load_index);
  FOR_EACH_VEC_ELT (node->load_permutation, i, lidx)
    {
      if (lidx >= group_size)
        return false;
      if (bitmap_bit_p (load_index, lidx))
        return false;

      bitmap_set_bit (load_index, lidx);
    }
  for (i = 0; i < group_size; i++)
    if (!bitmap_bit_p (load_index, i))
      return false;

  /* This permutation is valid for reduction.  Since the order of the
     statements in the nodes is not important unless they are memory
     accesses, we can rearrange the statements in all the nodes
     according to the order of the loads.  */
  hash_set<slp_tree> visited;
  vect_slp_rearrange_stmts (SLP_INSTANCE_TREE (slp_instn), group_size,
                            node->load_permutation, visited);

  /* We are done, no actual permutations need to be generated.  */
  poly_uint64 unrolling_factor = SLP_INSTANCE_UNROLLING_FACTOR (slp_instn);
  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
    {
      stmt_vec_info first_stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
      first_stmt_info = DR_GROUP_FIRST_ELEMENT (first_stmt_info);
      /* But we have to keep those permutations that are required because
         of handling of gaps.  */
      if (known_eq (unrolling_factor, 1U)
          || (group_size == DR_GROUP_SIZE (first_stmt_info)
              && DR_GROUP_GAP (first_stmt_info) == 0))
        SLP_TREE_LOAD_PERMUTATION (node).release ();
      else
        for (j = 0; j < SLP_TREE_LOAD_PERMUTATION (node).length (); ++j)
          SLP_TREE_LOAD_PERMUTATION (node)[j] = j;
    }

  return true;
}

/* Gather loads in the SLP graph NODE and populate the INST loads array.  */

static void
vect_gather_slp_loads (slp_instance inst, slp_tree node,
                       hash_set<slp_tree> &visited)
{
  if (visited.add (node))
    return;

  if (SLP_TREE_CHILDREN (node).length () == 0)
    {
      if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
        return;
      stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
      if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
          && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
        SLP_INSTANCE_LOADS (inst).safe_push (node);
    }
  else
    {
      unsigned i;
      slp_tree child;
      FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
        vect_gather_slp_loads (inst, child, visited);
    }
}

static void
vect_gather_slp_loads (slp_instance inst, slp_tree node)
{
  hash_set<slp_tree> visited;
  vect_gather_slp_loads (inst, node, visited);
}

/* Check if the required load permutations in the SLP instance
   SLP_INSTN are supported.  */

static bool
vect_supported_load_permutation_p (slp_instance slp_instn)
{
  unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (slp_instn);
  unsigned int i, j, k, next;
  slp_tree node;

  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location, "Load permutation ");
      FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
        if (node->load_permutation.exists ())
          FOR_EACH_VEC_ELT (node->load_permutation, j, next)
            dump_printf (MSG_NOTE, "%d ", next);
        else
          for (k = 0; k < group_size; ++k)
            dump_printf (MSG_NOTE, "%d ", k);
      dump_printf (MSG_NOTE, "\n");
    }

  /* In case of reduction every load permutation is allowed, since the order
     of the reduction statements is not important (as opposed to the case of
     grouped stores).  The only condition we need to check is that all the
     load nodes are of the same size and have the same permutation (and then
     rearrange all the nodes of the SLP instance according to this
     permutation).  */

  /* Check that all the load nodes are of the same size.  */
  /* ??? Can't we assert this?  */
  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
    if (SLP_TREE_SCALAR_STMTS (node).length () != (unsigned) group_size)
      return false;

  node = SLP_INSTANCE_TREE (slp_instn);
  stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];

  /* Reduction (there are no data-refs in the root).
     In reduction chain the order of the loads is not important.  */
  if (!STMT_VINFO_DATA_REF (stmt_info)
      && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
    vect_attempt_slp_rearrange_stmts (slp_instn);

  /* In basic block vectorization we allow any subchain of an interleaving
     chain.
     FORNOW: not supported in loop SLP because of realignment complications.  */
  if (STMT_VINFO_BB_VINFO (stmt_info))
    {
      /* Check whether the loads in an instance form a subchain and thus
         no permutation is necessary.  */
      FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
        {
          if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
            continue;
          bool subchain_p = true;
          stmt_vec_info next_load_info = NULL;
          stmt_vec_info load_info;
          FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
            {
              if (j != 0
                  && (next_load_info != load_info
                      || DR_GROUP_GAP (load_info) != 1))
                {
                  subchain_p = false;
                  break;
                }
              next_load_info = DR_GROUP_NEXT_ELEMENT (load_info);
            }
          if (subchain_p)
            SLP_TREE_LOAD_PERMUTATION (node).release ();
          else
            {
              stmt_vec_info group_info = SLP_TREE_SCALAR_STMTS (node)[0];
              group_info = DR_GROUP_FIRST_ELEMENT (group_info);
              unsigned HOST_WIDE_INT nunits;
              unsigned k, maxk = 0;
              FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (node), j, k)
                if (k > maxk)
                  maxk = k;
              /* In BB vectorization we may not actually use a loaded vector
                 accessing elements in excess of DR_GROUP_SIZE.  */
              tree vectype = STMT_VINFO_VECTYPE (group_info);
              if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nunits)
                  || maxk >= (DR_GROUP_SIZE (group_info) & ~(nunits - 1)))
                {
                  if (dump_enabled_p ())
                    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                     "BB vectorization with gaps at the end of "
                                     "a load is not supported\n");
                  return false;
                }

              /* Verify the permutation can be generated.  */
              vec<tree> tem;
              unsigned n_perms;
              if (!vect_transform_slp_perm_load (node, tem, NULL,
                                                 1, slp_instn, true, &n_perms))
                {
                  if (dump_enabled_p ())
                    dump_printf_loc (MSG_MISSED_OPTIMIZATION,
                                     vect_location,
                                     "unsupported load permutation\n");
                  return false;
                }
            }
        }
      return true;
    }

1862 /* For loop vectorization verify we can generate the permutation. Be
1863 conservative about the vectorization factor, there are permutations
1864 that will use three vector inputs only starting from a specific factor
1865 and the vectorization factor is not yet final.
1866 ??? The SLP instance unrolling factor might not be the maximum one. */
1867 unsigned n_perms;
1868 poly_uint64 test_vf
1869 = force_common_multiple (SLP_INSTANCE_UNROLLING_FACTOR (slp_instn),
1870 LOOP_VINFO_VECT_FACTOR
1871 (STMT_VINFO_LOOP_VINFO (stmt_info)));
1872 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
1873 if (node->load_permutation.exists ()
1874 && !vect_transform_slp_perm_load (node, vNULL, NULL, test_vf,
1875 slp_instn, true, &n_perms))
1876 return false;
1877
1878 return true;
1879 }
1880
1881
1882 /* Find the last scalar stmt in the SLP tree rooted at NODE. */
1883
1884 stmt_vec_info
1885 vect_find_last_scalar_stmt_in_slp (slp_tree node)
1886 {
1887 stmt_vec_info last = NULL;
1888 stmt_vec_info stmt_vinfo;
1889
1890 for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
1891 {
1892 stmt_vinfo = vect_orig_stmt (stmt_vinfo);
1893 last = last ? get_later_stmt (stmt_vinfo, last) : stmt_vinfo;
1894 }
1895
1896 return last;
1897 }
1898
1899 /* Splits a group of stores, currently beginning at FIRST_VINFO, into
1900 two groups: one (still beginning at FIRST_VINFO) of size GROUP1_SIZE
1901 (also containing the first GROUP1_SIZE stmts, since stores are
1902 consecutive), the second containing the remainder.
1903 Return the first stmt in the second group. */
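/* A hypothetical example: splitting a gap-free group of 8 stores with
   GROUP1_SIZE == 4 leaves the first group with elements 0-3 and its
   DR_GROUP_GAP increased by 4, and creates a second group of elements
   4-7 whose DR_GROUP_GAP is the original gap plus 4, so stepping over
   either group still covers all 8 original elements.  */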
1904
1905 static stmt_vec_info
1906 vect_split_slp_store_group (stmt_vec_info first_vinfo, unsigned group1_size)
1907 {
1908 gcc_assert (DR_GROUP_FIRST_ELEMENT (first_vinfo) == first_vinfo);
1909 gcc_assert (group1_size > 0);
1910 int group2_size = DR_GROUP_SIZE (first_vinfo) - group1_size;
1911 gcc_assert (group2_size > 0);
1912 DR_GROUP_SIZE (first_vinfo) = group1_size;
1913
1914 stmt_vec_info stmt_info = first_vinfo;
1915 for (unsigned i = group1_size; i > 1; i--)
1916 {
1917 stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info);
1918 gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
1919 }
1920 /* STMT_INFO is now the last element of the first group. */
1921 stmt_vec_info group2 = DR_GROUP_NEXT_ELEMENT (stmt_info);
1922 DR_GROUP_NEXT_ELEMENT (stmt_info) = 0;
1923
1924 DR_GROUP_SIZE (group2) = group2_size;
1925 for (stmt_info = group2; stmt_info;
1926 stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info))
1927 {
1928 DR_GROUP_FIRST_ELEMENT (stmt_info) = group2;
1929 gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
1930 }
1931
1932 /* For the second group, the DR_GROUP_GAP is that before the original group,
1933 plus skipping over the GROUP1_SIZE elements of the first group. */
1934 DR_GROUP_GAP (group2) = DR_GROUP_GAP (first_vinfo) + group1_size;
1935
1936 /* DR_GROUP_GAP of the first group now has to skip over the second group too. */
1937 DR_GROUP_GAP (first_vinfo) += group2_size;
1938
1939 if (dump_enabled_p ())
1940 dump_printf_loc (MSG_NOTE, vect_location, "Split group into %d and %d\n",
1941 group1_size, group2_size);
1942
1943 return group2;
1944 }
1945
1946 /* Calculate the unrolling factor for an SLP instance with GROUP_SIZE
1947 statements and a vector of NUNITS elements. */
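/* For example (values purely illustrative): with NUNITS == 4 and
   GROUP_SIZE == 6 the common multiple is 12, so the instance has to be
   unrolled 12 / 6 == 2 times to fill whole vectors.  */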
1948
1949 static poly_uint64
1950 calculate_unrolling_factor (poly_uint64 nunits, unsigned int group_size)
1951 {
1952 return exact_div (common_multiple (nunits, group_size), group_size);
1953 }
1954
1955 /* Analyze an SLP instance starting from a group of grouped stores. Call
1956 vect_build_slp_tree to build a tree of packed stmts if possible.
1957 Return FALSE if it's impossible to SLP any stmt in the loop. */
1958
1959 static bool
1960 vect_analyze_slp_instance (vec_info *vinfo,
1961 stmt_vec_info stmt_info, unsigned max_tree_size)
1962 {
1963 slp_instance new_instance;
1964 slp_tree node;
1965 unsigned int group_size;
1966 tree vectype, scalar_type = NULL_TREE;
1967 unsigned int i;
1968 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
1969 vec<stmt_vec_info> scalar_stmts;
1970
1971 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1972 {
1973 scalar_type = TREE_TYPE (DR_REF (dr));
1974 vectype = get_vectype_for_scalar_type (vinfo, scalar_type);
1975 group_size = DR_GROUP_SIZE (stmt_info);
1976 }
1977 else if (!dr && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
1978 {
1979 gcc_assert (is_a <loop_vec_info> (vinfo));
1980 vectype = STMT_VINFO_VECTYPE (stmt_info);
1981 group_size = REDUC_GROUP_SIZE (stmt_info);
1982 }
1983 else
1984 {
1985 gcc_assert (is_a <loop_vec_info> (vinfo));
1986 vectype = STMT_VINFO_VECTYPE (stmt_info);
1987 group_size = as_a <loop_vec_info> (vinfo)->reductions.length ();
1988 }
1989
1990 if (!vectype)
1991 {
1992 if (dump_enabled_p ())
1993 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1994 "Build SLP failed: unsupported data-type %T\n",
1995 scalar_type);
1996
1997 return false;
1998 }
1999 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
2000
2001 /* Create a node (a root of the SLP tree) for the packed grouped stores. */
2002 scalar_stmts.create (group_size);
2003 stmt_vec_info next_info = stmt_info;
2004 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2005 {
2006 /* Collect the stores and store them in SLP_TREE_SCALAR_STMTS. */
2007 while (next_info)
2008 {
2009 scalar_stmts.safe_push (vect_stmt_to_vectorize (next_info));
2010 next_info = DR_GROUP_NEXT_ELEMENT (next_info);
2011 }
2012 }
2013 else if (!dr && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
2014 {
2015 /* Collect the reduction stmts and store them in
2016 SLP_TREE_SCALAR_STMTS. */
2017 while (next_info)
2018 {
2019 scalar_stmts.safe_push (vect_stmt_to_vectorize (next_info));
2020 next_info = REDUC_GROUP_NEXT_ELEMENT (next_info);
2021 }
2022 /* Mark the first element of the reduction chain as reduction to properly
2023 transform the node. In the reduction analysis phase only the last
2024 element of the chain is marked as reduction. */
2025 STMT_VINFO_DEF_TYPE (stmt_info) = vect_reduction_def;
2026 STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))
2027 = STMT_VINFO_REDUC_DEF (vect_orig_stmt (scalar_stmts.last ()));
2028 }
2029 else
2030 {
2031 /* Collect reduction statements. */
2032 vec<stmt_vec_info> reductions = as_a <loop_vec_info> (vinfo)->reductions;
2033 for (i = 0; reductions.iterate (i, &next_info); i++)
2034 scalar_stmts.safe_push (next_info);
2035 }
2036
2037 /* Build the tree for the SLP instance. */
2038 bool *matches = XALLOCAVEC (bool, group_size);
2039 unsigned npermutes = 0;
2040 scalar_stmts_to_slp_tree_map_t *bst_map
2041 = new scalar_stmts_to_slp_tree_map_t ();
2042 poly_uint64 max_nunits = nunits;
2043 unsigned tree_size = 0;
2044 node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
2045 &max_nunits, matches, &npermutes,
2046 &tree_size, bst_map);
2047 /* The map keeps a reference on SLP nodes built, release that. */
2048 for (scalar_stmts_to_slp_tree_map_t::iterator it = bst_map->begin ();
2049 it != bst_map->end (); ++it)
2050 if ((*it).second)
2051 vect_free_slp_tree ((*it).second, false);
2052 delete bst_map;
2053 if (node != NULL)
2054 {
2055 /* Calculate the unrolling factor based on the smallest type. */
2056 poly_uint64 unrolling_factor
2057 = calculate_unrolling_factor (max_nunits, group_size);
2058
2059 if (maybe_ne (unrolling_factor, 1U)
2060 && is_a <bb_vec_info> (vinfo))
2061 {
2062 unsigned HOST_WIDE_INT const_max_nunits;
2063 if (!max_nunits.is_constant (&const_max_nunits)
2064 || const_max_nunits > group_size)
2065 {
2066 if (dump_enabled_p ())
2067 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2068 "Build SLP failed: store group "
2069 "size not a multiple of the vector size "
2070 "in basic block SLP\n");
2071 vect_free_slp_tree (node, false);
2072 return false;
2073 }
2074 /* Fatal mismatch. */
2075 matches[group_size / const_max_nunits * const_max_nunits] = false;
2076 vect_free_slp_tree (node, false);
2077 }
2078 else
2079 {
2080 /* Create a new SLP instance. */
2081 new_instance = XNEW (class _slp_instance);
2082 SLP_INSTANCE_TREE (new_instance) = node;
2083 SLP_INSTANCE_GROUP_SIZE (new_instance) = group_size;
2084 SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor;
2085 SLP_INSTANCE_LOADS (new_instance) = vNULL;
2086 vect_gather_slp_loads (new_instance, node);
2087 if (dump_enabled_p ())
2088 dump_printf_loc (MSG_NOTE, vect_location,
2089 "SLP size %u vs. limit %u.\n",
2090 tree_size, max_tree_size);
2091
2092 /* Compute the load permutation. */
2093 slp_tree load_node;
2094 bool loads_permuted = false;
2095 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (new_instance), i, load_node)
2096 {
2097 vec<unsigned> load_permutation;
2098 int j;
2099 stmt_vec_info load_info;
2100 bool this_load_permuted = false;
2101 load_permutation.create (group_size);
2102 stmt_vec_info first_stmt_info = DR_GROUP_FIRST_ELEMENT
2103 (SLP_TREE_SCALAR_STMTS (load_node)[0]);
2104 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load_info)
2105 {
2106 int load_place = vect_get_place_in_interleaving_chain
2107 (load_info, first_stmt_info);
2108 gcc_assert (load_place != -1);
2109 if (load_place != j)
2110 this_load_permuted = true;
2111 load_permutation.safe_push (load_place);
2112 }
2113 if (!this_load_permuted
2114 /* The load requires permutation when unrolling exposes
2115 a gap either because the group is larger than the SLP
2116 group-size or because there is a gap between the groups. */
2117 && (known_eq (unrolling_factor, 1U)
2118 || (group_size == DR_GROUP_SIZE (first_stmt_info)
2119 && DR_GROUP_GAP (first_stmt_info) == 0)))
2120 {
2121 load_permutation.release ();
2122 continue;
2123 }
2124 SLP_TREE_LOAD_PERMUTATION (load_node) = load_permutation;
2125 loads_permuted = true;
2126 }
2127
2128 if (loads_permuted)
2129 {
2130 if (!vect_supported_load_permutation_p (new_instance))
2131 {
2132 if (dump_enabled_p ())
2133 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2134 "Build SLP failed: unsupported load "
2135 "permutation %G", stmt_info->stmt);
2136 vect_free_slp_instance (new_instance, false);
2137 return false;
2138 }
2139 }
2140
2141 /* If the loads and stores can be handled with load/store-lanes
2142 instructions, do not generate this SLP instance. */
2143 if (is_a <loop_vec_info> (vinfo)
2144 && loads_permuted
2145 && dr && vect_store_lanes_supported (vectype, group_size, false))
2146 {
2147 slp_tree load_node;
2148 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (new_instance), i, load_node)
2149 {
2150 stmt_vec_info stmt_vinfo = DR_GROUP_FIRST_ELEMENT
2151 (SLP_TREE_SCALAR_STMTS (load_node)[0]);
2152 /* Use SLP for strided accesses (or if we can't load-lanes). */
2153 if (STMT_VINFO_STRIDED_P (stmt_vinfo)
2154 || ! vect_load_lanes_supported
2155 (STMT_VINFO_VECTYPE (stmt_vinfo),
2156 DR_GROUP_SIZE (stmt_vinfo), false))
2157 break;
2158 }
2159 if (i == SLP_INSTANCE_LOADS (new_instance).length ())
2160 {
2161 if (dump_enabled_p ())
2162 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2163 "Built SLP cancelled: can use "
2164 "load/store-lanes\n");
2165 vect_free_slp_instance (new_instance, false);
2166 return false;
2167 }
2168 }
2169
2170 vinfo->slp_instances.safe_push (new_instance);
2171
2172 if (dump_enabled_p ())
2173 {
2174 dump_printf_loc (MSG_NOTE, vect_location,
2175 "Final SLP tree for instance:\n");
2176 vect_print_slp_tree (MSG_NOTE, vect_location, node);
2177 }
2178
2179 return true;
2180 }
2181 }
2182 else
2183 {
2184 /* Failed to SLP. */
2185 /* Free the allocated memory. */
2186 scalar_stmts.release ();
2187 }
2188
2189 /* For basic block SLP, try to break the group up into multiples of the
2190 vector size. */
2191 unsigned HOST_WIDE_INT const_nunits;
2192 if (is_a <bb_vec_info> (vinfo)
2193 && STMT_VINFO_GROUPED_ACCESS (stmt_info)
2194 && DR_GROUP_FIRST_ELEMENT (stmt_info)
2195 && nunits.is_constant (&const_nunits))
2196 {
2197 /* We consider breaking the group only on VF boundaries from the existing
2198 start. */
2199 for (i = 0; i < group_size; i++)
2200 if (!matches[i]) break;
2201
2202 if (i >= const_nunits && i < group_size)
2203 {
2204 /* Split into two groups at the first vector boundary before i. */
2205 gcc_assert ((const_nunits & (const_nunits - 1)) == 0);
2206 unsigned group1_size = i & ~(const_nunits - 1);
2207
2208 stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
2209 group1_size);
2210 bool res = vect_analyze_slp_instance (vinfo, stmt_info,
2211 max_tree_size);
2212 /* If the first non-match was in the middle of a vector,
2213 skip the rest of that vector. */
2214 if (group1_size < i)
2215 {
2216 i = group1_size + const_nunits;
2217 if (i < group_size)
2218 rest = vect_split_slp_store_group (rest, const_nunits);
2219 }
2220 if (i < group_size)
2221 res |= vect_analyze_slp_instance (vinfo, rest, max_tree_size);
2222 return res;
2223 }
2224 /* Even though the first vector did not all match, we might be able to SLP
2225 (some) of the remainder. FORNOW ignore this possibility. */
2226 }
2227
2228 return false;
2229 }
2230
2231
2232 /* Check if there are stmts in the loop that can be vectorized using SLP.
2233 Build SLP trees of packed scalar stmts if SLP is possible. */
2234
2235 opt_result
2236 vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
2237 {
2238 unsigned int i;
2239 stmt_vec_info first_element;
2240
2241 DUMP_VECT_SCOPE ("vect_analyze_slp");
2242
2243 /* Find SLP sequences starting from groups of grouped stores. */
2244 FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
2245 vect_analyze_slp_instance (vinfo, first_element, max_tree_size);
2246
2247 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
2248 {
2249 if (loop_vinfo->reduction_chains.length () > 0)
2250 {
2251 /* Find SLP sequences starting from reduction chains. */
2252 FOR_EACH_VEC_ELT (loop_vinfo->reduction_chains, i, first_element)
2253 if (! vect_analyze_slp_instance (vinfo, first_element,
2254 max_tree_size))
2255 {
2256 /* Dissolve reduction chain group. */
2257 stmt_vec_info vinfo = first_element;
2258 while (vinfo)
2259 {
2260 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
2261 REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
2262 REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
2263 vinfo = next;
2264 }
2265 STMT_VINFO_DEF_TYPE (first_element) = vect_internal_def;
2266 }
2267 }
2268
2269 /* Find SLP sequences starting from groups of reductions. */
2270 if (loop_vinfo->reductions.length () > 1)
2271 vect_analyze_slp_instance (vinfo, loop_vinfo->reductions[0],
2272 max_tree_size);
2273 }
2274
2275 return opt_result::success ();
2276 }
2277
2278
2279 /* For each possible SLP instance decide whether to SLP it and calculate overall
2280 unrolling factor needed to SLP the loop. Return TRUE if decided to SLP at
2281 least one instance. */
2282
2283 bool
2284 vect_make_slp_decision (loop_vec_info loop_vinfo)
2285 {
2286 unsigned int i;
2287 poly_uint64 unrolling_factor = 1;
2288 vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
2289 slp_instance instance;
2290 int decided_to_slp = 0;
2291
2292 DUMP_VECT_SCOPE ("vect_make_slp_decision");
2293
2294 FOR_EACH_VEC_ELT (slp_instances, i, instance)
2295 {
2296 /* FORNOW: SLP if you can. */
2297 /* All unroll factors have the form vinfo->vector_size * X for some
2298 rational X, so they must have a common multiple. */
2299 unrolling_factor
2300 = force_common_multiple (unrolling_factor,
2301 SLP_INSTANCE_UNROLLING_FACTOR (instance));
2302
2303 /* Mark all the stmts that belong to INSTANCE as PURE_SLP stmts. Later we
2304 call vect_detect_hybrid_slp () to find stmts that need hybrid SLP and
2305 loop-based vectorization. Such stmts will be marked as HYBRID. */
2306 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
2307 decided_to_slp++;
2308 }
2309
2310 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo) = unrolling_factor;
2311
2312 if (decided_to_slp && dump_enabled_p ())
2313 {
2314 dump_printf_loc (MSG_NOTE, vect_location,
2315 "Decided to SLP %d instances. Unrolling factor ",
2316 decided_to_slp);
2317 dump_dec (MSG_NOTE, unrolling_factor);
2318 dump_printf (MSG_NOTE, "\n");
2319 }
2320
2321 return (decided_to_slp > 0);
2322 }
2323
2324
2325 /* Find stmts that must be both vectorized and SLPed (since they feed stmts that
2326 can't be SLPed) in the tree rooted at NODE. Mark such stmts as HYBRID. */
2327
2328 static void
2329 vect_detect_hybrid_slp_stmts (slp_tree node, unsigned i, slp_vect_type stype,
2330 hash_map<slp_tree, unsigned> &visited)
2331 {
2332 stmt_vec_info stmt_vinfo = SLP_TREE_SCALAR_STMTS (node)[i];
2333 imm_use_iterator imm_iter;
2334 gimple *use_stmt;
2335 stmt_vec_info use_vinfo;
2336 slp_tree child;
2337 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
2338 int j;
2339
2340 /* We need to union stype over the incoming graph edges but we still
2341 want to limit recursion to stay O(N+E). */
2342 bool only_edge = (++visited.get_or_insert (node) < node->refcnt);
2343
2344 /* Propagate hybrid down the SLP tree. */
2345 if (stype == hybrid)
2346 ;
2347 else if (HYBRID_SLP_STMT (stmt_vinfo))
2348 stype = hybrid;
2349 else if (!only_edge)
2350 {
2351 /* Check if a pure SLP stmt has uses in non-SLP stmts. */
2352 gcc_checking_assert (PURE_SLP_STMT (stmt_vinfo));
2353 /* If we get a pattern stmt here we have to use the LHS of the
2354 original stmt for immediate uses. */
2355 gimple *stmt = vect_orig_stmt (stmt_vinfo)->stmt;
2356 tree def;
2357 if (gimple_code (stmt) == GIMPLE_PHI)
2358 def = gimple_phi_result (stmt);
2359 else
2360 def = SINGLE_SSA_TREE_OPERAND (stmt, SSA_OP_DEF);
2361 if (def)
2362 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, def)
2363 {
2364 use_vinfo = loop_vinfo->lookup_stmt (use_stmt);
2365 if (!use_vinfo)
2366 continue;
2367 use_vinfo = vect_stmt_to_vectorize (use_vinfo);
2368 if (!STMT_SLP_TYPE (use_vinfo)
2369 && (STMT_VINFO_RELEVANT (use_vinfo)
2370 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (use_vinfo)))
2371 && !(gimple_code (use_stmt) == GIMPLE_PHI
2372 && STMT_VINFO_DEF_TYPE (use_vinfo) == vect_reduction_def))
2373 {
2374 if (dump_enabled_p ())
2375 dump_printf_loc (MSG_NOTE, vect_location, "use of SLP "
2376 "def in non-SLP stmt: %G", use_stmt);
2377 stype = hybrid;
2378 }
2379 }
2380 }
2381
2382 if (stype == hybrid
2383 && !HYBRID_SLP_STMT (stmt_vinfo))
2384 {
2385 if (dump_enabled_p ())
2386 dump_printf_loc (MSG_NOTE, vect_location, "marking hybrid: %G",
2387 stmt_vinfo->stmt);
2388 STMT_SLP_TYPE (stmt_vinfo) = hybrid;
2389 }
2390
2391 if (!only_edge)
2392 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
2393 if (SLP_TREE_DEF_TYPE (child) != vect_external_def
2394 && SLP_TREE_DEF_TYPE (child) != vect_constant_def)
2395 vect_detect_hybrid_slp_stmts (child, i, stype, visited);
2396 }
2397
2398 static void
2399 vect_detect_hybrid_slp_stmts (slp_tree node, unsigned i, slp_vect_type stype)
2400 {
2401 hash_map<slp_tree, unsigned> visited;
2402 vect_detect_hybrid_slp_stmts (node, i, stype, visited);
2403 }
2404
2405 /* Helpers for vect_detect_hybrid_slp walking pattern stmt uses. */
2406
2407 static tree
2408 vect_detect_hybrid_slp_1 (tree *tp, int *, void *data)
2409 {
2410 walk_stmt_info *wi = (walk_stmt_info *)data;
2411 loop_vec_info loop_vinfo = (loop_vec_info) wi->info;
2412
2413 if (wi->is_lhs)
2414 return NULL_TREE;
2415
2416 stmt_vec_info def_stmt_info = loop_vinfo->lookup_def (*tp);
2417 if (def_stmt_info && PURE_SLP_STMT (def_stmt_info))
2418 {
2419 if (dump_enabled_p ())
2420 dump_printf_loc (MSG_NOTE, vect_location, "marking hybrid: %G",
2421 def_stmt_info->stmt);
2422 STMT_SLP_TYPE (def_stmt_info) = hybrid;
2423 }
2424
2425 return NULL_TREE;
2426 }
2427
2428 static tree
2429 vect_detect_hybrid_slp_2 (gimple_stmt_iterator *gsi, bool *handled,
2430 walk_stmt_info *wi)
2431 {
2432 loop_vec_info loop_vinfo = (loop_vec_info) wi->info;
2433 stmt_vec_info use_vinfo = loop_vinfo->lookup_stmt (gsi_stmt (*gsi));
2434 /* If the stmt is in a SLP instance then this isn't a reason
2435 to mark use definitions in other SLP instances as hybrid. */
2436 if (! STMT_SLP_TYPE (use_vinfo)
2437 && (STMT_VINFO_RELEVANT (use_vinfo)
2438 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (use_vinfo)))
2439 && ! (gimple_code (gsi_stmt (*gsi)) == GIMPLE_PHI
2440 && STMT_VINFO_DEF_TYPE (use_vinfo) == vect_reduction_def))
2441 ;
2442 else
2443 *handled = true;
2444 return NULL_TREE;
2445 }
2446
2447 /* Find stmts that must be both vectorized and SLPed. */
2448
2449 void
2450 vect_detect_hybrid_slp (loop_vec_info loop_vinfo)
2451 {
2452 unsigned int i;
2453 vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
2454 slp_instance instance;
2455
2456 DUMP_VECT_SCOPE ("vect_detect_hybrid_slp");
2457
2458 /* First walk all pattern stmts in the loop and mark defs of uses as
2459 hybrid because immediate uses in them are not recorded. */
2460 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2461 {
2462 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2463 for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
2464 gsi_next (&gsi))
2465 {
2466 gimple *stmt = gsi_stmt (gsi);
2467 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
2468 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2469 {
2470 walk_stmt_info wi;
2471 memset (&wi, 0, sizeof (wi));
2472 wi.info = loop_vinfo;
2473 gimple_stmt_iterator gsi2
2474 = gsi_for_stmt (STMT_VINFO_RELATED_STMT (stmt_info)->stmt);
2475 walk_gimple_stmt (&gsi2, vect_detect_hybrid_slp_2,
2476 vect_detect_hybrid_slp_1, &wi);
2477 walk_gimple_seq (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info),
2478 vect_detect_hybrid_slp_2,
2479 vect_detect_hybrid_slp_1, &wi);
2480 }
2481 }
2482 }
2483
2484 /* Then walk the SLP instance trees marking stmts with uses in
2485 non-SLP stmts as hybrid, also propagating hybrid down the
2486 SLP tree, collecting the above info on-the-fly. */
2487 FOR_EACH_VEC_ELT (slp_instances, i, instance)
2488 {
2489 for (unsigned i = 0; i < SLP_INSTANCE_GROUP_SIZE (instance); ++i)
2490 vect_detect_hybrid_slp_stmts (SLP_INSTANCE_TREE (instance),
2491 i, pure_slp);
2492 }
2493 }
2494
2495
2496 /* Initialize a bb_vec_info struct for the statements between
2497 REGION_BEGIN_IN (inclusive) and REGION_END_IN (exclusive). */
2498
2499 _bb_vec_info::_bb_vec_info (gimple_stmt_iterator region_begin_in,
2500 gimple_stmt_iterator region_end_in,
2501 vec_info_shared *shared)
2502 : vec_info (vec_info::bb, init_cost (NULL), shared),
2503 bb (gsi_bb (region_begin_in)),
2504 region_begin (region_begin_in),
2505 region_end (region_end_in)
2506 {
2507 gimple_stmt_iterator gsi;
2508
2509 for (gsi = region_begin; gsi_stmt (gsi) != gsi_stmt (region_end);
2510 gsi_next (&gsi))
2511 {
2512 gimple *stmt = gsi_stmt (gsi);
2513 gimple_set_uid (stmt, 0);
2514 add_stmt (stmt);
2515 }
2516
2517 bb->aux = this;
2518 }
2519
2520
2521 /* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the
2522 stmts in the basic block. */
2523
2524 _bb_vec_info::~_bb_vec_info ()
2525 {
2526 for (gimple_stmt_iterator si = region_begin;
2527 gsi_stmt (si) != gsi_stmt (region_end); gsi_next (&si))
2528 /* Reset region marker. */
2529 gimple_set_uid (gsi_stmt (si), -1);
2530
2531 bb->aux = NULL;
2532 }
2533
2534 /* Subroutine of vect_slp_analyze_node_operations. Handle the root of NODE,
2535 given that child nodes have already been processed, and that
2536 their def types currently match their SLP node's def type. */
2537
2538 static bool
2539 vect_slp_analyze_node_operations_1 (vec_info *vinfo, slp_tree node,
2540 slp_instance node_instance,
2541 stmt_vector_for_cost *cost_vec)
2542 {
2543 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2544 gcc_assert (STMT_SLP_TYPE (stmt_info) != loop_vect);
2545
2546 /* For BB vectorization vector types are assigned here.
2547 Memory accesses already got their vector type assigned
2548 in vect_analyze_data_refs. */
2549 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2550 if (bb_vinfo
2551 && ! STMT_VINFO_DATA_REF (stmt_info))
2552 {
2553 tree vectype, nunits_vectype;
2554 if (!vect_get_vector_types_for_stmt (stmt_info, &vectype,
2555 &nunits_vectype))
2556 /* We checked this when building the node. */
2557 gcc_unreachable ();
2558 if (vectype == boolean_type_node)
2559 {
2560 vectype = vect_get_mask_type_for_stmt (stmt_info);
2561 if (!vectype)
2562 /* vect_get_mask_type_for_stmt has already explained the
2563 failure. */
2564 return false;
2565 }
2566
2567 stmt_vec_info sstmt_info;
2568 unsigned int i;
2569 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, sstmt_info)
2570 STMT_VINFO_VECTYPE (sstmt_info) = vectype;
2571 }
2572
2573 /* Calculate the number of vector statements to be created for the
2574 scalar stmts in this node. For SLP reductions it is equal to the
2575 number of vector statements in the children (which has already been
2576 calculated by the recursive call). Otherwise it is the number of
2577 scalar elements in one scalar iteration (DR_GROUP_SIZE) multiplied by
2578 VF divided by the number of elements in a vector. */
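/* For example (values purely illustrative): a group of 4 scalar stmts
   with VF == 4 and 8-element vectors yields 4 * 4 / 8 == 2 vector
   statements.  */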
2579 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2580 && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
2581 {
2582 for (unsigned i = 0; i < SLP_TREE_CHILDREN (node).length (); ++i)
2583 if (SLP_TREE_DEF_TYPE (SLP_TREE_CHILDREN (node)[i]) == vect_internal_def)
2584 {
2585 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
2586 = SLP_TREE_NUMBER_OF_VEC_STMTS (SLP_TREE_CHILDREN (node)[i]);
2587 break;
2588 }
2589 }
2590 else
2591 {
2592 poly_uint64 vf;
2593 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
2594 vf = loop_vinfo->vectorization_factor;
2595 else
2596 vf = 1;
2597 unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (node_instance);
2598 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2599 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
2600 = vect_get_num_vectors (vf * group_size, vectype);
2601 }
2602
2603 bool dummy;
2604 return vect_analyze_stmt (stmt_info, &dummy, node, node_instance, cost_vec);
2605 }
2606
2607 /* Analyze statements contained in SLP tree NODE after recursively analyzing
2608 the subtree. NODE_INSTANCE contains NODE and VINFO contains INSTANCE.
2609
2610 Return true if the operations are supported. */
2611
2612 static bool
2613 vect_slp_analyze_node_operations (vec_info *vinfo, slp_tree node,
2614 slp_instance node_instance,
2615 scalar_stmts_to_slp_tree_map_t *visited,
2616 scalar_stmts_to_slp_tree_map_t *lvisited,
2617 stmt_vector_for_cost *cost_vec)
2618 {
2619 int i, j;
2620 slp_tree child;
2621
2622 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2623 return true;
2624
2625 /* If we already analyzed the exact same set of scalar stmts we're done.
2626 We share the generated vector stmts for those. */
2627 slp_tree *leader;
2628 if ((leader = visited->get (SLP_TREE_SCALAR_STMTS (node)))
2629 || (leader = lvisited->get (SLP_TREE_SCALAR_STMTS (node))))
2630 {
2631 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
2632 = SLP_TREE_NUMBER_OF_VEC_STMTS (*leader);
2633 return true;
2634 }
2635
2636 /* The SLP graph is acyclic so not caching whether we failed or succeeded
2637 doesn't result in any issue since we throw away the lvisited set
2638 when we fail. */
2639 lvisited->put (SLP_TREE_SCALAR_STMTS (node).copy (), node);
2640
2641 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2642 if (!vect_slp_analyze_node_operations (vinfo, child, node_instance,
2643 visited, lvisited, cost_vec))
2644 return false;
2645
2646 /* ??? We have to catch, late, the case where the same first scalar stmt
2647 appears in multiple SLP children with different def types and fail. Remember
2648 original def types first since SLP_TREE_DEF_TYPE doesn't necessarily
2649 match it when that is vect_internal_def. */
2650 auto_vec<vect_def_type, 4> dt;
2651 dt.safe_grow (SLP_TREE_CHILDREN (node).length ());
2652 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
2653 if (SLP_TREE_SCALAR_STMTS (child).length () != 0)
2654 dt[j] = STMT_VINFO_DEF_TYPE (SLP_TREE_SCALAR_STMTS (child)[0]);
2655
2656 /* Push SLP node def-type to stmt operands. */
2657 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
2658 if (SLP_TREE_DEF_TYPE (child) != vect_internal_def
2659 && SLP_TREE_SCALAR_STMTS (child).length () != 0)
2660 STMT_VINFO_DEF_TYPE (SLP_TREE_SCALAR_STMTS (child)[0])
2661 = SLP_TREE_DEF_TYPE (child);
2662
2663 /* Check everything worked out. */
2664 bool res = true;
2665 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
2666 if (SLP_TREE_SCALAR_STMTS (child).length () != 0)
2667 {
2668 if (SLP_TREE_DEF_TYPE (child) != vect_internal_def)
2669 {
2670 if (STMT_VINFO_DEF_TYPE (SLP_TREE_SCALAR_STMTS (child)[0])
2671 != SLP_TREE_DEF_TYPE (child))
2672 res = false;
2673 }
2674 else if (STMT_VINFO_DEF_TYPE (SLP_TREE_SCALAR_STMTS (child)[0])
2675 != dt[j])
2676 res = false;
2677 }
2678 if (!res && dump_enabled_p ())
2679 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2680 "not vectorized: same operand with different "
2681 "def type in stmt.\n");
2682
2683 if (res)
2684 res = vect_slp_analyze_node_operations_1 (vinfo, node, node_instance,
2685 cost_vec);
2686
2687 /* Restore def-types. */
2688 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
2689 if (SLP_TREE_SCALAR_STMTS (child).length () != 0)
2690 STMT_VINFO_DEF_TYPE (SLP_TREE_SCALAR_STMTS (child)[0]) = dt[j];
2691
2692 return res;
2693 }
2694
2695
2696 /* Analyze statements in SLP instances of VINFO. Return true if the
2697 operations are supported. */
2698
2699 bool
2700 vect_slp_analyze_operations (vec_info *vinfo)
2701 {
2702 slp_instance instance;
2703 int i;
2704
2705 DUMP_VECT_SCOPE ("vect_slp_analyze_operations");
2706
2707 scalar_stmts_to_slp_tree_map_t *visited
2708 = new scalar_stmts_to_slp_tree_map_t ();
2709 for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
2710 {
2711 scalar_stmts_to_slp_tree_map_t lvisited;
2712 stmt_vector_for_cost cost_vec;
2713 cost_vec.create (2);
2714 if (!vect_slp_analyze_node_operations (vinfo,
2715 SLP_INSTANCE_TREE (instance),
2716 instance, visited, &lvisited,
2717 &cost_vec))
2718 {
2719 slp_tree node = SLP_INSTANCE_TREE (instance);
2720 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2721 if (dump_enabled_p ())
2722 dump_printf_loc (MSG_NOTE, vect_location,
2723 "removing SLP instance operations starting from: %G",
2724 stmt_info->stmt);
2725 vect_free_slp_instance (instance, false);
2726 vinfo->slp_instances.ordered_remove (i);
2727 cost_vec.release ();
2728 }
2729 else
2730 {
2731 for (scalar_stmts_to_slp_tree_map_t::iterator x = lvisited.begin();
2732 x != lvisited.end(); ++x)
2733 visited->put ((*x).first.copy (), (*x).second);
2734 i++;
2735
2736 add_stmt_costs (vinfo->target_cost_data, &cost_vec);
2737 cost_vec.release ();
2738 }
2739 }
2740 delete visited;
2741
2742 return !vinfo->slp_instances.is_empty ();
2743 }
2744
2745
2746 /* Compute the scalar cost of the SLP node NODE and its children,
2747 recording it in COST_VEC. Do not account defs that are marked in LIFE and
2748 update LIFE according to uses of NODE. */
2749
2750 static void
2751 vect_bb_slp_scalar_cost (basic_block bb,
2752 slp_tree node, vec<bool, va_heap> *life,
2753 stmt_vector_for_cost *cost_vec,
2754 hash_set<slp_tree> &visited)
2755 {
2756 unsigned i;
2757 stmt_vec_info stmt_info;
2758 slp_tree child;
2759
2760 if (visited.add (node))
2761 return;
2762
2763 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2764 {
2765 gimple *stmt = stmt_info->stmt;
2766 vec_info *vinfo = stmt_info->vinfo;
2767 ssa_op_iter op_iter;
2768 def_operand_p def_p;
2769
2770 if ((*life)[i])
2771 continue;
2772
2773 /* If there is a non-vectorized use of the defs then the scalar
2774 stmt is kept live in which case we do not account it or any
2775 required defs in the SLP children in the scalar cost. This
2776 way we make the vectorization more costly when compared to
2777 the scalar cost. */
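/* For example (illustrative): if a def computed inside the SLP tree is
   also used by a stmt outside any SLP instance, the defining scalar
   stmt has to stay after vectorization, so removing it brings no
   scalar-side saving and it is not counted below.  */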
2778 FOR_EACH_SSA_DEF_OPERAND (def_p, stmt, op_iter, SSA_OP_DEF)
2779 {
2780 imm_use_iterator use_iter;
2781 gimple *use_stmt;
2782 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
2783 if (!is_gimple_debug (use_stmt))
2784 {
2785 stmt_vec_info use_stmt_info = vinfo->lookup_stmt (use_stmt);
2786 if (!use_stmt_info || !PURE_SLP_STMT (use_stmt_info))
2787 {
2788 (*life)[i] = true;
2789 BREAK_FROM_IMM_USE_STMT (use_iter);
2790 }
2791 }
2792 }
2793 if ((*life)[i])
2794 continue;
2795
2796 /* Count scalar stmts only once. */
2797 if (gimple_visited_p (stmt))
2798 continue;
2799 gimple_set_visited (stmt, true);
2800
2801 vect_cost_for_stmt kind;
2802 if (STMT_VINFO_DATA_REF (stmt_info))
2803 {
2804 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
2805 kind = scalar_load;
2806 else
2807 kind = scalar_store;
2808 }
2809 else
2810 kind = scalar_stmt;
2811 record_stmt_cost (cost_vec, 1, kind, stmt_info, 0, vect_body);
2812 }
2813
2814 auto_vec<bool, 20> subtree_life;
2815 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2816 {
2817 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
2818 {
2819 /* Do not directly pass LIFE to the recursive call, copy it to
2820 confine changes in the callee to the current child/subtree. */
2821 subtree_life.safe_splice (*life);
2822 vect_bb_slp_scalar_cost (bb, child, &subtree_life, cost_vec,
2823 visited);
2824 subtree_life.truncate (0);
2825 }
2826 }
2827 }
2828
2829 static void
2830 vect_bb_slp_scalar_cost (basic_block bb,
2831 slp_tree node, vec<bool, va_heap> *life,
2832 stmt_vector_for_cost *cost_vec)
2833 {
2834 hash_set<slp_tree> visited;
2835 vect_bb_slp_scalar_cost (bb, node, life, cost_vec, visited);
2836 }
2837
2838 /* Check if vectorization of the basic block is profitable. */
2839
2840 static bool
2841 vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo)
2842 {
2843 vec<slp_instance> slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
2844 slp_instance instance;
2845 int i;
2846 unsigned int vec_inside_cost = 0, vec_outside_cost = 0, scalar_cost = 0;
2847 unsigned int vec_prologue_cost = 0, vec_epilogue_cost = 0;
2848
2849 /* Calculate scalar cost. */
2850 stmt_vector_for_cost scalar_costs;
2851 scalar_costs.create (0);
2852 FOR_EACH_VEC_ELT (slp_instances, i, instance)
2853 {
2854 auto_vec<bool, 20> life;
2855 life.safe_grow_cleared (SLP_INSTANCE_GROUP_SIZE (instance));
2856 vect_bb_slp_scalar_cost (BB_VINFO_BB (bb_vinfo),
2857 SLP_INSTANCE_TREE (instance),
2858 &life, &scalar_costs);
2859 }
2860 void *target_cost_data = init_cost (NULL);
2861 add_stmt_costs (target_cost_data, &scalar_costs);
2862 scalar_costs.release ();
2863 unsigned dummy;
2864 finish_cost (target_cost_data, &dummy, &scalar_cost, &dummy);
2865 destroy_cost_data (target_cost_data);
2866
2867 /* Unset visited flag. */
2868 for (gimple_stmt_iterator gsi = bb_vinfo->region_begin;
2869 gsi_stmt (gsi) != gsi_stmt (bb_vinfo->region_end); gsi_next (&gsi))
2870 gimple_set_visited (gsi_stmt (gsi), false);
2871
2872 /* Complete the target-specific cost calculation. */
2873 finish_cost (BB_VINFO_TARGET_COST_DATA (bb_vinfo), &vec_prologue_cost,
2874 &vec_inside_cost, &vec_epilogue_cost);
2875
2876 vec_outside_cost = vec_prologue_cost + vec_epilogue_cost;
2877
2878 if (dump_enabled_p ())
2879 {
2880 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
2881 dump_printf (MSG_NOTE, " Vector inside of basic block cost: %d\n",
2882 vec_inside_cost);
2883 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n", vec_prologue_cost);
2884 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n", vec_epilogue_cost);
2885 dump_printf (MSG_NOTE, " Scalar cost of basic block: %d\n", scalar_cost);
2886 }
2887
2888 /* Vectorization is profitable if its cost is no more than the cost of the
2889 scalar version. Note that we err on the vector side for equal cost because
2890 the cost estimate is otherwise quite pessimistic (constant uses are
2891 free on the scalar side but cost a load on the vector side for
2892 example). */
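/* For example (values purely illustrative): with vec_inside_cost == 6,
   vec_outside_cost == 2 and scalar_cost == 8 the two sides tie and we
   still vectorize.  */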
2893 if (vec_outside_cost + vec_inside_cost > scalar_cost)
2894 return false;
2895
2896 return true;
2897 }
2898
2899 /* Check if the region described by BB_VINFO can be vectorized, returning
2900 true if so. When returning false, set FATAL to true if the same failure
2901 would prevent vectorization at other vector sizes, false if it is still
2902 worth trying other sizes. N_STMTS is the number of statements in the
2903 region. */
2904
2905 static bool
2906 vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, bool &fatal)
2907 {
2908 DUMP_VECT_SCOPE ("vect_slp_analyze_bb");
2909
2910 slp_instance instance;
2911 int i;
2912 poly_uint64 min_vf = 2;
2913
2914 /* The first group of checks is independent of the vector size. */
2915 fatal = true;
2916
2917 /* Analyze the data references. */
2918
2919 if (!vect_analyze_data_refs (bb_vinfo, &min_vf, NULL))
2920 {
2921 if (dump_enabled_p ())
2922 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2923 "not vectorized: unhandled data-ref in basic "
2924 "block.\n");
2925 return false;
2926 }
2927
2928 if (BB_VINFO_DATAREFS (bb_vinfo).length () < 2)
2929 {
2930 if (dump_enabled_p ())
2931 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2932 "not vectorized: not enough data-refs in "
2933 "basic block.\n");
2934 return false;
2935 }
2936
2937 if (!vect_analyze_data_ref_accesses (bb_vinfo))
2938 {
2939 if (dump_enabled_p ())
2940 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2941 "not vectorized: unhandled data access in "
2942 "basic block.\n");
2943 return false;
2944 }
2945
2946 /* If there are no grouped stores in the region there is no need
2947 to continue with pattern recog as vect_analyze_slp will fail
2948 anyway. */
2949 if (bb_vinfo->grouped_stores.is_empty ())
2950 {
2951 if (dump_enabled_p ())
2952 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2953 "not vectorized: no grouped stores in "
2954 "basic block.\n");
2955 return false;
2956 }
2957
2958 /* The rest of the analysis below depends on the vector size in some way. */
2959 fatal = false;
2960
2961 vect_pattern_recog (bb_vinfo);
2962
2963 /* Check the SLP opportunities in the basic block, analyze and build SLP
2964 trees. */
2965 if (!vect_analyze_slp (bb_vinfo, n_stmts))
2966 {
2967 if (dump_enabled_p ())
2968 {
2969 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2970 "Failed to SLP the basic block.\n");
2971 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2972 "not vectorized: failed to find SLP opportunities "
2973 "in basic block.\n");
2974 }
2975 return false;
2976 }
2977
2978 vect_record_base_alignments (bb_vinfo);
2979
2980 /* Analyze and verify the alignment of data references and the
2981 dependence in the SLP instances. */
2982 for (i = 0; BB_VINFO_SLP_INSTANCES (bb_vinfo).iterate (i, &instance); )
2983 {
2984 if (! vect_slp_analyze_and_verify_instance_alignment (instance)
2985 || ! vect_slp_analyze_instance_dependence (instance))
2986 {
2987 slp_tree node = SLP_INSTANCE_TREE (instance);
2988 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2989 if (dump_enabled_p ())
2990 dump_printf_loc (MSG_NOTE, vect_location,
2991 "removing SLP instance operations starting from: %G",
2992 stmt_info->stmt);
2993 vect_free_slp_instance (instance, false);
2994 BB_VINFO_SLP_INSTANCES (bb_vinfo).ordered_remove (i);
2995 continue;
2996 }
2997
2998 /* Mark all the statements that we want to vectorize as pure SLP and
2999 relevant. */
3000 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
3001 vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance));
3002
3003 i++;
3004 }
3005 if (! BB_VINFO_SLP_INSTANCES (bb_vinfo).length ())
3006 return false;
3007
3008 if (!vect_slp_analyze_operations (bb_vinfo))
3009 {
3010 if (dump_enabled_p ())
3011 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3012 "not vectorized: bad operation in basic block.\n");
3013 return false;
3014 }
3015
3016 /* Cost model: check if the vectorization is worthwhile. */
3017 if (!unlimited_cost_model (NULL)
3018 && !vect_bb_vectorization_profitable_p (bb_vinfo))
3019 {
3020 if (dump_enabled_p ())
3021 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3022 "not vectorized: vectorization is not "
3023 "profitable.\n");
3024 return false;
3025 }
3026
3027 if (dump_enabled_p ())
3028 dump_printf_loc (MSG_NOTE, vect_location,
3029 "Basic block will be vectorized using SLP\n");
3030 return true;
3031 }
3032
3033 /* Subroutine of vect_slp_bb. Try to vectorize the statements between
3034 REGION_BEGIN (inclusive) and REGION_END (exclusive), returning true
3035 on success. The region has N_STMTS statements and has the datarefs
3036 given by DATAREFS. */
3037
3038 static bool
3039 vect_slp_bb_region (gimple_stmt_iterator region_begin,
3040 gimple_stmt_iterator region_end,
3041 vec<data_reference_p> datarefs,
3042 unsigned int n_stmts)
3043 {
3044 bb_vec_info bb_vinfo;
3045 auto_vector_sizes vector_sizes;
3046
3047 /* Autodetect first vector size we try. */
3048 poly_uint64 next_vector_size = 0;
3049 targetm.vectorize.autovectorize_vector_sizes (&vector_sizes, false);
3050 unsigned int next_size = 0;
3051
3052 vec_info_shared shared;
3053
3054 poly_uint64 autodetected_vector_size = 0;
3055 while (1)
3056 {
3057 bool vectorized = false;
3058 bool fatal = false;
3059 bb_vinfo = new _bb_vec_info (region_begin, region_end, &shared);
3060
3061 bool first_time_p = shared.datarefs.is_empty ();
3062 BB_VINFO_DATAREFS (bb_vinfo) = datarefs;
3063 if (first_time_p)
3064 bb_vinfo->shared->save_datarefs ();
3065 else
3066 bb_vinfo->shared->check_datarefs ();
3067 bb_vinfo->vector_size = next_vector_size;
3068
3069 if (vect_slp_analyze_bb_1 (bb_vinfo, n_stmts, fatal)
3070 && dbg_cnt (vect_slp))
3071 {
3072 if (dump_enabled_p ())
3073 dump_printf_loc (MSG_NOTE, vect_location, "SLPing BB part\n");
3074
3075 bb_vinfo->shared->check_datarefs ();
3076 vect_schedule_slp (bb_vinfo);
3077
3078 unsigned HOST_WIDE_INT bytes;
3079 if (dump_enabled_p ())
3080 {
3081 if (bb_vinfo->vector_size.is_constant (&bytes))
3082 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
3083 "basic block part vectorized using %wu byte "
3084 "vectors\n", bytes);
3085 else
3086 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
3087 "basic block part vectorized using variable "
3088 "length vectors\n");
3089 }
3090
3091 vectorized = true;
3092 }
3093
3094 if (next_size == 0)
3095 autodetected_vector_size = bb_vinfo->vector_size;
3096
3097 delete bb_vinfo;
3098
3099 if (next_size < vector_sizes.length ()
3100 && known_eq (vector_sizes[next_size], autodetected_vector_size))
3101 next_size += 1;
3102
3103 if (vectorized
3104 || next_size == vector_sizes.length ()
3105 || known_eq (autodetected_vector_size, 0U)
3106 /* If vect_slp_analyze_bb_1 signaled that analysis for all
3107 vector sizes will fail do not bother iterating. */
3108 || fatal)
3109 return vectorized;
3110
3111 /* Try the next biggest vector size. */
3112 next_vector_size = vector_sizes[next_size++];
3113 if (dump_enabled_p ())
3114 {
3115 dump_printf_loc (MSG_NOTE, vect_location,
3116 "***** Re-trying analysis with "
3117 "vector size ");
3118 dump_dec (MSG_NOTE, next_vector_size);
3119 dump_printf (MSG_NOTE, "\n");
3120 }
3121 }
3122 }
3123
3124 /* Main entry for the BB vectorizer. Analyze and transform BB, returns
3125 true if anything in the basic-block was vectorized. */
3126
3127 bool
3128 vect_slp_bb (basic_block bb)
3129 {
3130 gimple_stmt_iterator gsi;
3131 bool any_vectorized = false;
3132
3133 gsi = gsi_start_bb (bb);
3134 while (!gsi_end_p (gsi))
3135 {
3136 gimple_stmt_iterator region_begin = gsi;
3137 vec<data_reference_p> datarefs = vNULL;
3138 int insns = 0;
3139
3140 for (; !gsi_end_p (gsi); gsi_next (&gsi))
3141 {
3142 gimple *stmt = gsi_stmt (gsi);
3143 if (is_gimple_debug (stmt))
3144 continue;
3145 insns++;
3146
3147 if (gimple_location (stmt) != UNKNOWN_LOCATION)
3148 vect_location = stmt;
3149
3150 if (!vect_find_stmt_data_reference (NULL, stmt, &datarefs))
3151 break;
3152 }
3153
3154 /* Skip leading unhandled stmts. */
3155 if (gsi_stmt (region_begin) == gsi_stmt (gsi))
3156 {
3157 gsi_next (&gsi);
3158 continue;
3159 }
3160
3161 gimple_stmt_iterator region_end = gsi;
3162
3163 if (insns > PARAM_VALUE (PARAM_SLP_MAX_INSNS_IN_BB))
3164 {
3165 if (dump_enabled_p ())
3166 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3167 "not vectorized: too many instructions in "
3168 "basic block.\n");
3169 }
3170 else if (vect_slp_bb_region (region_begin, region_end, datarefs, insns))
3171 any_vectorized = true;
3172
3173 if (gsi_end_p (region_end))
3174 break;
3175
3176 /* Skip the unhandled stmt. */
3177 gsi_next (&gsi);
3178 }
3179
3180 return any_vectorized;
3181 }
3182
3183
3184 /* Return true if the vector type for a constant operand of STMT_VINFO should be a boolean vector. */
3185
3186 static bool
3187 vect_mask_constant_operand_p (stmt_vec_info stmt_vinfo)
3188 {
3189 enum tree_code code = gimple_expr_code (stmt_vinfo->stmt);
3190 tree op, vectype;
3191 enum vect_def_type dt;
3192
3193 /* For comparisons and COND_EXPRs the type is chosen depending
3194 on the other, non-constant comparison operand. */
3195 if (TREE_CODE_CLASS (code) == tcc_comparison)
3196 {
3197 gassign *stmt = as_a <gassign *> (stmt_vinfo->stmt);
3198 op = gimple_assign_rhs1 (stmt);
3199
3200 if (!vect_is_simple_use (op, stmt_vinfo->vinfo, &dt, &vectype))
3201 gcc_unreachable ();
3202
3203 return !vectype || VECTOR_BOOLEAN_TYPE_P (vectype);
3204 }
3205
3206 if (code == COND_EXPR)
3207 {
3208 gassign *stmt = as_a <gassign *> (stmt_vinfo->stmt);
3209 tree cond = gimple_assign_rhs1 (stmt);
3210
3211 if (TREE_CODE (cond) == SSA_NAME)
3212 op = cond;
3213 else
3214 op = TREE_OPERAND (cond, 0);
3215
3216 if (!vect_is_simple_use (op, stmt_vinfo->vinfo, &dt, &vectype))
3217 gcc_unreachable ();
3218
3219 return !vectype || VECTOR_BOOLEAN_TYPE_P (vectype);
3220 }
3221
3222 return VECTOR_BOOLEAN_TYPE_P (STMT_VINFO_VECTYPE (stmt_vinfo));
3223 }
3224
3225 /* Build a variable-length vector in which the elements in ELTS are repeated
3226 to fill NRESULTS vectors of type VECTOR_TYPE. Store the vectors in
3227 RESULTS and add any new instructions to SEQ.
3228
3229 The approach we use is:
3230
3231 (1) Find a vector mode VM with integer elements of mode IM.
3232
3233 (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
3234 ELTS' has mode IM. This involves creating NELTS' VIEW_CONVERT_EXPRs
3235 from small vectors to IM.
3236
3237 (3) Duplicate each ELTS'[I] into a vector of mode VM.
3238
3239 (4) Use a tree of interleaving VEC_PERM_EXPRs to create VMs with the
3240 correct byte contents.
3241
3242 (5) Use VIEW_CONVERT_EXPR to cast the final VMs to the required type.
3243
3244 We try to find the largest IM for which this sequence works, in order
3245 to cut down on the number of interleaves. */
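/* A hypothetical example: to fill variable-length vectors with the
   byte pattern { a, b, c, d, a, b, c, d, ... } we can use IM == SImode,
   giving NELTS' == 1: the four QImode elements are view-converted to a
   single SImode scalar, that scalar is duplicated into a VM vector, and
   one final VIEW_CONVERT_EXPR yields the required vector type with no
   interleaving at all.  With eight distinct bytes and the same IM we
   would get NELTS' == 2 and a single round of interleaves.  */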
3246
3247 void
3248 duplicate_and_interleave (vec_info *vinfo, gimple_seq *seq, tree vector_type,
3249 vec<tree> elts, unsigned int nresults,
3250 vec<tree> &results)
3251 {
3252 unsigned int nelts = elts.length ();
3253 tree element_type = TREE_TYPE (vector_type);
3254
3255 /* (1) Find a vector mode VM with integer elements of mode IM. */
3256 unsigned int nvectors = 1;
3257 tree new_vector_type;
3258 tree permutes[2];
3259 if (!can_duplicate_and_interleave_p (vinfo, nelts, TYPE_MODE (element_type),
3260 &nvectors, &new_vector_type,
3261 permutes))
3262 gcc_unreachable ();
3263
3264 /* Get a vector type that holds ELTS[0:NELTS/NELTS']. */
3265 unsigned int partial_nelts = nelts / nvectors;
3266 tree partial_vector_type = build_vector_type (element_type, partial_nelts);
3267
3268 tree_vector_builder partial_elts;
3269 auto_vec<tree, 32> pieces (nvectors * 2);
3270 pieces.quick_grow (nvectors * 2);
3271 for (unsigned int i = 0; i < nvectors; ++i)
3272 {
3273 /* (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
3274 ELTS' has mode IM. */
3275 partial_elts.new_vector (partial_vector_type, partial_nelts, 1);
3276 for (unsigned int j = 0; j < partial_nelts; ++j)
3277 partial_elts.quick_push (elts[i * partial_nelts + j]);
3278 tree t = gimple_build_vector (seq, &partial_elts);
3279 t = gimple_build (seq, VIEW_CONVERT_EXPR,
3280 TREE_TYPE (new_vector_type), t);
3281
3282 /* (3) Duplicate each ELTS'[I] into a vector of mode VM. */
3283 pieces[i] = gimple_build_vector_from_val (seq, new_vector_type, t);
3284 }
3285
3286 /* (4) Use a tree of VEC_PERM_EXPRs to create a single VM with the
3287 correct byte contents.
3288
3289 We need to repeat the following operation log2(nvectors) times:
3290
3291 out[i * 2] = VEC_PERM_EXPR (in[i], in[i + hi_start], lo_permute);
3292 out[i * 2 + 1] = VEC_PERM_EXPR (in[i], in[i + hi_start], hi_permute);
3293
3294 However, if each input repeats every N elements and the VF is
3295 a multiple of N * 2, the HI result is the same as the LO. */
3296 unsigned int in_start = 0;
3297 unsigned int out_start = nvectors;
3298 unsigned int hi_start = nvectors / 2;
3299 /* A bound on the number of outputs needed to produce NRESULTS results
3300 in the final iteration. */
3301 unsigned int noutputs_bound = nvectors * nresults;
3302 for (unsigned int in_repeat = 1; in_repeat < nvectors; in_repeat *= 2)
3303 {
3304 noutputs_bound /= 2;
3305 unsigned int limit = MIN (noutputs_bound, nvectors);
3306 for (unsigned int i = 0; i < limit; ++i)
3307 {
3308 if ((i & 1) != 0
3309 && multiple_p (TYPE_VECTOR_SUBPARTS (new_vector_type),
3310 2 * in_repeat))
3311 {
3312 pieces[out_start + i] = pieces[out_start + i - 1];
3313 continue;
3314 }
3315
3316 tree output = make_ssa_name (new_vector_type);
3317 tree input1 = pieces[in_start + (i / 2)];
3318 tree input2 = pieces[in_start + (i / 2) + hi_start];
3319 gassign *stmt = gimple_build_assign (output, VEC_PERM_EXPR,
3320 input1, input2,
3321 permutes[i & 1]);
3322 gimple_seq_add_stmt (seq, stmt);
3323 pieces[out_start + i] = output;
3324 }
3325 std::swap (in_start, out_start);
3326 }
3327
3328 /* (5) Use VIEW_CONVERT_EXPR to cast the final VM to the required type. */
3329 results.reserve (nresults);
3330 for (unsigned int i = 0; i < nresults; ++i)
3331 if (i < nvectors)
3332 results.quick_push (gimple_build (seq, VIEW_CONVERT_EXPR, vector_type,
3333 pieces[in_start + i]));
3334 else
3335 results.quick_push (results[i - nvectors]);
3336 }
3337
3338
3339 /* For constant and loop invariant defs of SLP_NODE this function returns
3340 (vector) defs (VEC_OPRNDS) that will be used in the vectorized stmts.
3341 OP_NODE determines the node for the operand containing the scalar
3342 operands. */
3343
3344 static void
3345 vect_get_constant_vectors (slp_tree op_node, slp_tree slp_node,
3346 vec<tree> *vec_oprnds)
3347 {
3348 stmt_vec_info stmt_vinfo = SLP_TREE_SCALAR_STMTS (slp_node)[0];
3349 vec_info *vinfo = stmt_vinfo->vinfo;
3350 unsigned HOST_WIDE_INT nunits;
3351 tree vec_cst;
3352 unsigned j, number_of_places_left_in_vector;
3353 tree vector_type;
3354 tree vop;
3355 int group_size = op_node->ops.length ();
3356 unsigned int vec_num, i;
3357 unsigned number_of_copies = 1;
3358 bool constant_p;
3359 tree neutral_op = NULL;
3360 gimple_seq ctor_seq = NULL;
3361 auto_vec<tree, 16> permute_results;
3362
3363 /* ??? SLP analysis should compute the vector type for the
3364 constant / invariant and store it in the SLP node. */
3365 tree op = op_node->ops[0];
3366 /* Check if vector type is a boolean vector. */
3367 tree stmt_vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
3368 if (VECT_SCALAR_BOOLEAN_TYPE_P (TREE_TYPE (op))
3369 && vect_mask_constant_operand_p (stmt_vinfo))
3370 vector_type
3371 = build_same_sized_truth_vector_type (stmt_vectype);
3372 else
3373 vector_type = get_vectype_for_scalar_type (vinfo, TREE_TYPE (op));
3374
3375 unsigned int number_of_vectors
3376 = vect_get_num_vectors (SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node)
3377 * TYPE_VECTOR_SUBPARTS (stmt_vectype),
3378 vector_type);
3379 vec_oprnds->create (number_of_vectors);
3380 auto_vec<tree> voprnds (number_of_vectors);
3381
3382 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
3383 created vectors. It is greater than 1 if unrolling is performed.
3384
3385 For example, we have two scalar operands, s1 and s2 (e.g., group of
3386 strided accesses of size two), while NUNITS is four (i.e., four scalars
3387 of this type can be packed in a vector). The output vector will contain
3388 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
3389 will be 2).
3390
3391 If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
3392 containing the operands.
3393
3394 For example, NUNITS is four as before, and the group size is 8
3395 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
3396 {s5, s6, s7, s8}. */
3397
3398 /* When using duplicate_and_interleave, we just need one element for
3399 each scalar statement. */
3400 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
3401 nunits = group_size;
3402
3403 number_of_copies = nunits * number_of_vectors / group_size;
3404
3405 number_of_places_left_in_vector = nunits;
3406 constant_p = true;
3407 tree_vector_builder elts (vector_type, nunits, 1);
3408 elts.quick_grow (nunits);
3409 bool place_after_defs = false;
3410 for (j = 0; j < number_of_copies; j++)
3411 {
3412 for (i = group_size - 1; op_node->ops.iterate (i, &op); i--)
3413 {
3414 /* Create 'vect_ = {op0,op1,...,opn}'. */
3415 number_of_places_left_in_vector--;
3416 tree orig_op = op;
3417 if (!types_compatible_p (TREE_TYPE (vector_type), TREE_TYPE (op)))
3418 {
3419 if (CONSTANT_CLASS_P (op))
3420 {
3421 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
3422 {
3423 /* Can't use VIEW_CONVERT_EXPR for booleans because
3424 of possibly different sizes of scalar value and
3425 vector element. */
3426 if (integer_zerop (op))
3427 op = build_int_cst (TREE_TYPE (vector_type), 0);
3428 else if (integer_onep (op))
3429 op = build_all_ones_cst (TREE_TYPE (vector_type));
3430 else
3431 gcc_unreachable ();
3432 }
3433 else
3434 op = fold_unary (VIEW_CONVERT_EXPR,
3435 TREE_TYPE (vector_type), op);
3436 gcc_assert (op && CONSTANT_CLASS_P (op));
3437 }
3438 else
3439 {
3440 tree new_temp = make_ssa_name (TREE_TYPE (vector_type));
3441 gimple *init_stmt;
3442 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
3443 {
3444 tree true_val
3445 = build_all_ones_cst (TREE_TYPE (vector_type));
3446 tree false_val
3447 = build_zero_cst (TREE_TYPE (vector_type));
3448 gcc_assert (INTEGRAL_TYPE_P (TREE_TYPE (op)));
3449 init_stmt = gimple_build_assign (new_temp, COND_EXPR,
3450 op, true_val,
3451 false_val);
3452 }
3453 else
3454 {
3455 op = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (vector_type),
3456 op);
3457 init_stmt
3458 = gimple_build_assign (new_temp, VIEW_CONVERT_EXPR,
3459 op);
3460 }
3461 gimple_seq_add_stmt (&ctor_seq, init_stmt);
3462 op = new_temp;
3463 }
3464 }
3465 elts[number_of_places_left_in_vector] = op;
3466 if (!CONSTANT_CLASS_P (op))
3467 constant_p = false;
3468 if (TREE_CODE (orig_op) == SSA_NAME
3469 && !SSA_NAME_IS_DEFAULT_DEF (orig_op)
3470 && STMT_VINFO_BB_VINFO (stmt_vinfo)
3471 && (STMT_VINFO_BB_VINFO (stmt_vinfo)->bb
3472 == gimple_bb (SSA_NAME_DEF_STMT (orig_op))))
3473 place_after_defs = true;
3474
3475 if (number_of_places_left_in_vector == 0)
3476 {
3477 if (constant_p
3478 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
3479 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
3480 vec_cst = gimple_build_vector (&ctor_seq, &elts);
3481 else
3482 {
3483 if (permute_results.is_empty ())
3484 duplicate_and_interleave (vinfo, &ctor_seq, vector_type,
3485 elts, number_of_vectors,
3486 permute_results);
3487 vec_cst = permute_results[number_of_vectors - j - 1];
3488 }
3489 tree init;
3490 gimple_stmt_iterator gsi;
3491 if (place_after_defs)
3492 {
3493 stmt_vec_info last_stmt_info
3494 = vect_find_last_scalar_stmt_in_slp (slp_node);
3495 gsi = gsi_for_stmt (last_stmt_info->stmt);
3496 init = vect_init_vector (stmt_vinfo, vec_cst, vector_type,
3497 &gsi);
3498 }
3499 else
3500 init = vect_init_vector (stmt_vinfo, vec_cst, vector_type,
3501 NULL);
3502 if (ctor_seq != NULL)
3503 {
3504 gsi = gsi_for_stmt (SSA_NAME_DEF_STMT (init));
3505 gsi_insert_seq_before (&gsi, ctor_seq, GSI_SAME_STMT);
3506 ctor_seq = NULL;
3507 }
3508 voprnds.quick_push (init);
3509 place_after_defs = false;
3510 number_of_places_left_in_vector = nunits;
3511 constant_p = true;
3512 elts.new_vector (vector_type, nunits, 1);
3513 elts.quick_grow (nunits);
3514 }
3515 }
3516 }
3517
3518 /* Since the vectors were created in reverse order, put them back in
3519 the expected order. */
3520 vec_num = voprnds.length ();
3521 for (j = vec_num; j != 0; j--)
3522 {
3523 vop = voprnds[j - 1];
3524 vec_oprnds->quick_push (vop);
3525 }
3526
3527 /* If VF is greater than the unrolling factor needed for the SLP group
3528 of stmts, NUMBER_OF_VECTORS is greater than the number of vectors the
3529 loop above created, and hence we have to replicate the vectors, either
3530 by pushing neutral-element vectors or by re-pushing the existing ones. */
3531 tree neutral_vec = NULL;
3532
3533 while (number_of_vectors > vec_oprnds->length ())
3534 {
3535 if (neutral_op)
3536 {
3537 if (!neutral_vec)
3538 neutral_vec = build_vector_from_val (vector_type, neutral_op);
3539
3540 vec_oprnds->quick_push (neutral_vec);
3541 }
3542 else
3543 {
3544 for (i = 0; vec_oprnds->iterate (i, &vop) && i < vec_num; i++)
3545 vec_oprnds->quick_push (vop);
3546 }
3547 }
3548 }
3549
3550
3551 /* Get the vectorized definitions from SLP_NODE, which already contains
3552 the corresponding vectorized def-stmts. */
3553
3554 static void
3555 vect_get_slp_vect_defs (slp_tree slp_node, vec<tree> *vec_oprnds)
3556 {
3557 stmt_vec_info vec_def_stmt_info;
3558 unsigned int i;
3559
3560 gcc_assert (SLP_TREE_VEC_STMTS (slp_node).exists ());
3561
3562 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (slp_node), i, vec_def_stmt_info)
3563 vec_oprnds->quick_push (gimple_get_lhs (vec_def_stmt_info->stmt));
3564 }
3565
3566
3567 /* Get N vectorized definitions for SLP_NODE.
3568 If the scalar definitions are loop invariants or constants, collect them and
3569 call vect_get_constant_vectors() to create vector stmts.
3570 Otherwise, the def-stmts must already be vectorized and the vectorized stmts
3571 must be stored in the corresponding child of SLP_NODE, and we call
3572 vect_get_slp_vect_defs () to retrieve them. */
3573
3574 void
3575 vect_get_slp_defs (slp_tree slp_node, vec<vec<tree> > *vec_oprnds, unsigned n)
3576 {
3577 if (n == -1U)
3578 n = SLP_TREE_CHILDREN (slp_node).length ();
3579
3580 for (unsigned i = 0; i < n; ++i)
3581 {
3582 slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
3583
3584 vec<tree> vec_defs = vNULL;
3585
3586 /* For each operand we check whether it has vectorized definitions in a child
3587 node or whether we need to create them (for invariants and constants). */
3588 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
3589 {
3590 vec_defs.create (SLP_TREE_NUMBER_OF_VEC_STMTS (child));
3591 vect_get_slp_vect_defs (child, &vec_defs);
3592 }
3593 else
3594 vect_get_constant_vectors (child, slp_node, &vec_defs);
3595
3596 vec_oprnds->quick_push (vec_defs);
3597 }
3598 }
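/* A minimal usage sketch (assuming the prototype defaults N to -1U, as
the check above suggests): a caller with two operand children might do

vec<vec<tree> > vec_defs = vNULL;
vec_defs.create (2);
vect_get_slp_defs (slp_node, &vec_defs);

and then read the operand vectors from vec_defs[0] and vec_defs[1].  */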
3599
3600 /* Generate vector permute statements from a list of loads in DR_CHAIN.
3601 If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
3602 permute statements for the SLP node NODE of the SLP instance
3603 SLP_NODE_INSTANCE. */
3604
3605 bool
3606 vect_transform_slp_perm_load (slp_tree node, vec<tree> dr_chain,
3607 gimple_stmt_iterator *gsi, poly_uint64 vf,
3608 slp_instance slp_node_instance, bool analyze_only,
3609 unsigned *n_perms)
3610 {
3611 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
3612 vec_info *vinfo = stmt_info->vinfo;
3613 int vec_index = 0;
3614 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3615 unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (slp_node_instance);
3616 unsigned int mask_element;
3617 machine_mode mode;
3618
3619 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
3620 return false;
3621
3622 stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
3623
3624 mode = TYPE_MODE (vectype);
3625 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
3626
3627 /* Initialize the vect stmts of NODE so that the generated stmts can be
3628 inserted at the proper positions later. */
3629 if (! analyze_only)
3630 for (unsigned i = SLP_TREE_VEC_STMTS (node).length ();
3631 i < SLP_TREE_NUMBER_OF_VEC_STMTS (node); i++)
3632 SLP_TREE_VEC_STMTS (node).quick_push (NULL);
3633
3634 /* Generate permutation masks for every NODE. The number of masks for
3635 each NODE is equal to GROUP_SIZE.
3636 E.g., consider a group of three nodes with three loads from the same
3637 location in each node, and a vector size of 4. I.e., we have an
3638 a0b0c0a1b1c1... sequence and we need to create the following vectors:
3639 for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
3640 for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
3641 ...
3642
3643 The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}.
3644 The last mask is illegal since the permute operation takes only two input
3645 vectors, so mask elements must be smaller than 2 * NUNITS (8 here).
3646 Hence, the last mask must be converted into {2,5,5,5}.
3647 For the first two permutations we need the first and the second input
3648 vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
3649 we need the second and the third vectors: {b1,c1,a2,b2} and
3650 {c2,a3,b3,c3}. */
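/* Spelling out the index arithmetic below for the a's in the example
(DR_GROUP_SIZE == 3, NUNITS == 4, load permutation {0}): the scalar
indices I are 0,0,0,3, 3,3,6,6, 6,9,9,9; VEC_INDEX = I / 4 picks the
input vector and MASK_ELEMENT = I % 4, plus 4 when the element comes
from the second input, which yields exactly {0,0,0,3}, {3,3,6,6} and
{2,5,5,5} relative to the vector pairs used.  */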
3651
3652 int vect_stmts_counter = 0;
3653 unsigned int index = 0;
3654 int first_vec_index = -1;
3655 int second_vec_index = -1;
3656 bool noop_p = true;
3657 *n_perms = 0;
3658
3659 vec_perm_builder mask;
3660 unsigned int nelts_to_build;
3661 unsigned int nvectors_per_build;
3662 bool repeating_p = (group_size == DR_GROUP_SIZE (stmt_info)
3663 && multiple_p (nunits, group_size));
3664 if (repeating_p)
3665 {
3666 /* A single vector contains a whole number of copies of the node, so:
3667 (a) all permutes can use the same mask; and
3668 (b) the permutes only need a single vector input. */
3669 mask.new_vector (nunits, group_size, 3);
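/* Encode the mask as GROUP_SIZE interleaved patterns of three elements
each; three elements per pattern are enough to describe the stepped
sequences a variable-length mask may need.  */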
3670 nelts_to_build = mask.encoded_nelts ();
3671 nvectors_per_build = SLP_TREE_VEC_STMTS (node).length ();
3672 }
3673 else
3674 {
3675 /* We need to construct a separate mask for each vector statement. */
3676 unsigned HOST_WIDE_INT const_nunits, const_vf;
3677 if (!nunits.is_constant (&const_nunits)
3678 || !vf.is_constant (&const_vf))
3679 return false;
3680 mask.new_vector (const_nunits, const_nunits, 1);
3681 nelts_to_build = const_vf * group_size;
3682 nvectors_per_build = 1;
3683 }
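/* E.g., with CONST_NUNITS == 4, GROUP_SIZE == 3 and, say, CONST_VF == 4,
this builds 4 * 3 == 12 mask elements, consumed below as three
four-element masks per node.  */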
3684
3685 unsigned int count = mask.encoded_nelts ();
3686 mask.quick_grow (count);
3687 vec_perm_indices indices;
3688
3689 for (unsigned int j = 0; j < nelts_to_build; j++)
3690 {
3691 unsigned int iter_num = j / group_size;
3692 unsigned int stmt_num = j % group_size;
3693 unsigned int i = (iter_num * DR_GROUP_SIZE (stmt_info)
3694 + SLP_TREE_LOAD_PERMUTATION (node)[stmt_num]);
3695 if (repeating_p)
3696 {
3697 first_vec_index = 0;
3698 mask_element = i;
3699 }
3700 else
3701 {
3702 /* Enforced before the loop when !repeating_p. */
3703 unsigned int const_nunits = nunits.to_constant ();
3704 vec_index = i / const_nunits;
3705 mask_element = i % const_nunits;
3706 if (vec_index == first_vec_index
3707 || first_vec_index == -1)
3708 {
3709 first_vec_index = vec_index;
3710 }
3711 else if (vec_index == second_vec_index
3712 || second_vec_index == -1)
3713 {
3714 second_vec_index = vec_index;
3715 mask_element += const_nunits;
3716 }
3717 else
3718 {
3719 if (dump_enabled_p ())
3720 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3721 "permutation requires at "
3722 "least three vectors %G",
3723 stmt_info->stmt);
3724 gcc_assert (analyze_only);
3725 return false;
3726 }
3727
3728 gcc_assert (mask_element < 2 * const_nunits);
3729 }
3730
3731 if (mask_element != index)
3732 noop_p = false;
3733 mask[index++] = mask_element;
3734
3735 if (index == count && !noop_p)
3736 {
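/* The permutation sources one input vector when SECOND_VEC_INDEX stayed
-1 and two otherwise; verify the target can do it as a constant
permute before committing.  */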
3737 indices.new_vector (mask, second_vec_index == -1 ? 1 : 2, nunits);
3738 if (!can_vec_perm_const_p (mode, indices))
3739 {
3740 if (dump_enabled_p ())
3741 {
3742 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
3743 vect_location,
3744 "unsupported vect permute { ");
3745 for (i = 0; i < count; ++i)
3746 {
3747 dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
3748 dump_printf (MSG_MISSED_OPTIMIZATION, " ");
3749 }
3750 dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
3751 }
3752 gcc_assert (analyze_only);
3753 return false;
3754 }
3755
3756 ++*n_perms;
3757 }
3758
3759 if (index == count)
3760 {
3761 if (!analyze_only)
3762 {
3763 tree mask_vec = NULL_TREE;
3764
3765 if (! noop_p)
3766 mask_vec = vect_gen_perm_mask_checked (vectype, indices);
3767
3768 if (second_vec_index == -1)
3769 second_vec_index = first_vec_index;
3770
3771 for (unsigned int ri = 0; ri < nvectors_per_build; ++ri)
3772 {
3773 /* Generate the permute statement if necessary. */
3774 tree first_vec = dr_chain[first_vec_index + ri];
3775 tree second_vec = dr_chain[second_vec_index + ri];
3776 stmt_vec_info perm_stmt_info;
3777 if (! noop_p)
3778 {
3779 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
3780 tree perm_dest
3781 = vect_create_destination_var (gimple_assign_lhs (stmt),
3782 vectype);
3783 perm_dest = make_ssa_name (perm_dest);
3784 gassign *perm_stmt
3785 = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
3786 first_vec, second_vec,
3787 mask_vec);
3788 perm_stmt_info
3789 = vect_finish_stmt_generation (stmt_info, perm_stmt,
3790 gsi);
3791 }
3792 else
3793 /* If the mask is NULL_TREE, generate the requested
3794 identity transform. */
3795 perm_stmt_info = vinfo->lookup_def (first_vec);
3796
3797 /* Store the vector statement in NODE. */
3798 SLP_TREE_VEC_STMTS (node)[vect_stmts_counter++]
3799 = perm_stmt_info;
3800 }
3801 }
3802
3803 index = 0;
3804 first_vec_index = -1;
3805 second_vec_index = -1;
3806 noop_p = true;
3807 }
3808 }
3809
3810 return true;
3811 }
3812
3813 /* Vectorize the SLP instance subtree rooted at NODE in postorder. */
3814
3815 static void
3816 vect_schedule_slp_instance (slp_tree node, slp_instance instance,
3817 scalar_stmts_to_slp_tree_map_t *bst_map)
3818 {
3819 gimple_stmt_iterator si;
3820 stmt_vec_info stmt_info;
3821 unsigned int group_size;
3822 tree vectype;
3823 int i, j;
3824 slp_tree child;
3825
3826 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
3827 return;
3828
3829 /* See if we have already vectorized the node in the graph of the
3830 SLP instance. */
3831 if (SLP_TREE_VEC_STMTS (node).exists ())
3832 return;
3833
3834 /* See if we have already vectorized the same set of stmts, and if so reuse
3835 their vectorized stmts across instances. */
3836 if (slp_tree *leader = bst_map->get (SLP_TREE_SCALAR_STMTS (node)))
3837 {
3838 SLP_TREE_VEC_STMTS (node).safe_splice (SLP_TREE_VEC_STMTS (*leader));
3839 return;
3840 }
3841
3842 bst_map->put (SLP_TREE_SCALAR_STMTS (node).copy (), node);
3843 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3844 vect_schedule_slp_instance (child, instance, bst_map);
3845
3846 /* Push SLP node def-type to stmts. */
3847 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3848 if (SLP_TREE_DEF_TYPE (child) != vect_internal_def)
3849 {
3850 stmt_vec_info child_stmt_info;
3851 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (child), j, child_stmt_info)
3852 STMT_VINFO_DEF_TYPE (child_stmt_info) = SLP_TREE_DEF_TYPE (child);
3853 }
3854
3855 stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
3856
3857 /* VECTYPE is the type of the destination. */
3858 vectype = STMT_VINFO_VECTYPE (stmt_info);
3859 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
3860 group_size = SLP_INSTANCE_GROUP_SIZE (instance);
3861
3862 gcc_assert (SLP_TREE_NUMBER_OF_VEC_STMTS (node) != 0);
3863 SLP_TREE_VEC_STMTS (node).create (SLP_TREE_NUMBER_OF_VEC_STMTS (node));
3864
3865 if (dump_enabled_p ())
3866 dump_printf_loc (MSG_NOTE, vect_location,
3867 "------>vectorizing SLP node starting from: %G",
3868 stmt_info->stmt);
3869
3870 /* Vectorized stmts go before the last scalar stmt, which is where
3871 all uses are ready. */
3872 stmt_vec_info last_stmt_info = vect_find_last_scalar_stmt_in_slp (node);
3873 si = gsi_for_stmt (last_stmt_info->stmt);
3874
3875 /* Handle two-operation SLP nodes by vectorizing the group with
3876 both operations and then performing a merge. */
3877 if (SLP_TREE_TWO_OPERATORS (node))
3878 {
3879 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
3880 enum tree_code code0 = gimple_assign_rhs_code (stmt);
3881 enum tree_code ocode = ERROR_MARK;
3882 stmt_vec_info ostmt_info;
3883 vec_perm_builder mask (group_size, group_size, 1);
3884 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, ostmt_info)
3885 {
3886 gassign *ostmt = as_a <gassign *> (ostmt_info->stmt);
3887 if (gimple_assign_rhs_code (ostmt) != code0)
3888 {
3889 mask.quick_push (1);
3890 ocode = gimple_assign_rhs_code (ostmt);
3891 }
3892 else
3893 mask.quick_push (0);
3894 }
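/* E.g., for an addsub-style group {plus, minus, plus, minus} MASK becomes
{0,1,0,1}; with CONST_NUNITS == 4 the merge mask built below is then
{0,5,2,7}, selecting even lanes from the CODE0 vector and odd lanes
from the OCODE vector.  */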
3895 if (ocode != ERROR_MARK)
3896 {
3897 vec<stmt_vec_info> v0;
3898 vec<stmt_vec_info> v1;
3899 unsigned j;
3900 tree tmask = NULL_TREE;
3901 vect_transform_stmt (stmt_info, &si, node, instance);
3902 v0 = SLP_TREE_VEC_STMTS (node).copy ();
3903 SLP_TREE_VEC_STMTS (node).truncate (0);
3904 gimple_assign_set_rhs_code (stmt, ocode);
3905 vect_transform_stmt (stmt_info, &si, node, instance);
3906 gimple_assign_set_rhs_code (stmt, code0);
3907 v1 = SLP_TREE_VEC_STMTS (node).copy ();
3908 SLP_TREE_VEC_STMTS (node).truncate (0);
3909 tree meltype = build_nonstandard_integer_type
3910 (GET_MODE_BITSIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype))), 1);
3911 tree mvectype = get_same_sized_vectype (meltype, vectype);
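/* The merge mask below must be an integer vector with the same element
size and number of elements as VECTYPE.  */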
3912 unsigned k = 0, l;
3913 for (j = 0; j < v0.length (); ++j)
3914 {
3915 /* Enforced by vect_build_slp_tree, which rejects variable-length
3916 vectors for SLP_TREE_TWO_OPERATORS. */
3917 unsigned int const_nunits = nunits.to_constant ();
3918 tree_vector_builder melts (mvectype, const_nunits, 1);
3919 for (l = 0; l < const_nunits; ++l)
3920 {
3921 if (k >= group_size)
3922 k = 0;
3923 tree t = build_int_cst (meltype,
3924 mask[k++] * const_nunits + l);
3925 melts.quick_push (t);
3926 }
3927 tmask = melts.build ();
3928
3929 /* ??? Not all targets support a VEC_PERM_EXPR with a
3930 constant mask that would translate to a vec_merge RTX
3931 (with their vec_perm_const_ok). We can either not
3932 vectorize in that case or let veclower do its job.
3933 Unfortunately that isn't too great and at least for
3934 plus/minus we'd eventually like to match targets'
3935 vector addsub instructions. */
3936 gimple *vstmt;
3937 vstmt = gimple_build_assign (make_ssa_name (vectype),
3938 VEC_PERM_EXPR,
3939 gimple_assign_lhs (v0[j]->stmt),
3940 gimple_assign_lhs (v1[j]->stmt),
3941 tmask);
3942 SLP_TREE_VEC_STMTS (node).quick_push
3943 (vect_finish_stmt_generation (stmt_info, vstmt, &si));
3944 }
3945 v0.release ();
3946 v1.release ();
3947 return;
3948 }
3949 }
3950 vect_transform_stmt (stmt_info, &si, node, instance);
3951
3952 /* Restore stmt def-types. */
3953 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3954 if (SLP_TREE_DEF_TYPE (child) != vect_internal_def)
3955 {
3956 stmt_vec_info child_stmt_info;
3957 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (child), j, child_stmt_info)
3958 STMT_VINFO_DEF_TYPE (child_stmt_info) = vect_internal_def;
3959 }
3960 }
3961
3962 /* Replace the scalar calls in SLP node NODE by setting their lhs to zero.
3963 For loop vectorization this is done in vectorizable_call, but for SLP
3964 it needs to be deferred until the end of vect_schedule_slp, because multiple
3965 SLP instances may refer to the same scalar stmt. */
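/* E.g., a scalar call lhs = foo (...) whose uses were all vectorized is
rewritten as lhs = 0, keeping the SSA web valid until the dead
assignment is cleaned up later (foo being a placeholder for any
vectorized call).  */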
3966
3967 static void
3968 vect_remove_slp_scalar_calls (slp_tree node, hash_set<slp_tree> &visited)
3969 {
3970 gimple *new_stmt;
3971 gimple_stmt_iterator gsi;
3972 int i;
3973 slp_tree child;
3974 tree lhs;
3975 stmt_vec_info stmt_info;
3976
3977 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
3978 return;
3979
3980 if (visited.add (node))
3981 return;
3982
3983 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3984 vect_remove_slp_scalar_calls (child, visited);
3985
3986 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
3987 {
3988 gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt);
3989 if (!stmt || gimple_bb (stmt) == NULL)
3990 continue;
3991 if (is_pattern_stmt_p (stmt_info)
3992 || !PURE_SLP_STMT (stmt_info))
3993 continue;
3994 lhs = gimple_call_lhs (stmt);
3995 new_stmt = gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs)));
3996 gsi = gsi_for_stmt (stmt);
3997 stmt_info->vinfo->replace_stmt (&gsi, stmt_info, new_stmt);
3998 SSA_NAME_DEF_STMT (gimple_assign_lhs (new_stmt)) = new_stmt;
3999 }
4000 }
4001
4002 static void
4003 vect_remove_slp_scalar_calls (slp_tree node)
4004 {
4005 hash_set<slp_tree> visited;
4006 vect_remove_slp_scalar_calls (node, visited);
4007 }
4008
4009 /* Generate vector code for all SLP instances in the loop/basic block. */
4010
4011 void
4012 vect_schedule_slp (vec_info *vinfo)
4013 {
4014 vec<slp_instance> slp_instances;
4015 slp_instance instance;
4016 unsigned int i;
4017
4018 scalar_stmts_to_slp_tree_map_t *bst_map
4019 = new scalar_stmts_to_slp_tree_map_t ();
4020 slp_instances = vinfo->slp_instances;
4021 FOR_EACH_VEC_ELT (slp_instances, i, instance)
4022 {
4023 /* Schedule the tree of INSTANCE. */
4024 vect_schedule_slp_instance (SLP_INSTANCE_TREE (instance),
4025 instance, bst_map);
4026 if (dump_enabled_p ())
4027 dump_printf_loc (MSG_NOTE, vect_location,
4028 "vectorizing stmts using SLP.\n");
4029 }
4030 delete bst_map;
4031
4032 FOR_EACH_VEC_ELT (slp_instances, i, instance)
4033 {
4034 slp_tree root = SLP_INSTANCE_TREE (instance);
4035 stmt_vec_info store_info;
4036 unsigned int j;
4037
4038 /* Remove scalar call stmts. Do not do this for basic-block
4039 vectorization as not all uses may be vectorized.
4040 ??? Why should this be necessary? DCE should be able to
4041 remove the stmts itself.
4042 ??? For BB vectorization we can as well remove scalar
4043 stmts starting from the SLP tree root if they have no
4044 uses. */
4045 if (is_a <loop_vec_info> (vinfo))
4046 vect_remove_slp_scalar_calls (root);
4047
4048 for (j = 0; SLP_TREE_SCALAR_STMTS (root).iterate (j, &store_info)
4049 && j < SLP_INSTANCE_GROUP_SIZE (instance); j++)
4050 {
4051 if (!STMT_VINFO_DATA_REF (store_info))
4052 break;
4053
4054 store_info = vect_orig_stmt (store_info);
4055 /* Free the attached stmt_vec_info and remove the stmt. */
4056 vinfo->remove_stmt (store_info);
4057 }
4058 }
4059 }