1 /* SLP - Basic Block Vectorization
2 Copyright (C) 2007-2020 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "tree-pass.h"
31 #include "ssa.h"
32 #include "optabs-tree.h"
33 #include "insn-config.h"
34 #include "recog.h" /* FIXME: for insn_data */
35 #include "fold-const.h"
36 #include "stor-layout.h"
37 #include "gimple-iterator.h"
38 #include "cfgloop.h"
39 #include "tree-vectorizer.h"
40 #include "langhooks.h"
41 #include "gimple-walk.h"
42 #include "dbgcnt.h"
43 #include "tree-vector-builder.h"
44 #include "vec-perm-indices.h"
45 #include "gimple-fold.h"
46 #include "internal-fn.h"
47 #include "dump-context.h"
48
49
50 /* Initialize a SLP node. */
51
52 _slp_tree::_slp_tree ()
53 {
54 SLP_TREE_SCALAR_STMTS (this) = vNULL;
55 SLP_TREE_SCALAR_OPS (this) = vNULL;
56 SLP_TREE_VEC_STMTS (this).create (0);
57 SLP_TREE_NUMBER_OF_VEC_STMTS (this) = 0;
58 SLP_TREE_CHILDREN (this) = vNULL;
59 SLP_TREE_LOAD_PERMUTATION (this) = vNULL;
60 SLP_TREE_TWO_OPERATORS (this) = false;
61 SLP_TREE_DEF_TYPE (this) = vect_uninitialized_def;
62 SLP_TREE_VECTYPE (this) = NULL_TREE;
63 this->refcnt = 1;
64 this->max_nunits = 1;
65 }
66
67 /* Tear down a SLP node. */
68
69 _slp_tree::~_slp_tree ()
70 {
71 SLP_TREE_CHILDREN (this).release ();
72 SLP_TREE_SCALAR_STMTS (this).release ();
73 SLP_TREE_SCALAR_OPS (this).release ();
74 SLP_TREE_VEC_STMTS (this).release ();
75 SLP_TREE_LOAD_PERMUTATION (this).release ();
76 }
77
78 /* Recursively free the memory allocated for the SLP tree rooted at NODE.
79 FINAL_P is true if we have vectorized the instance or if we have
80 made a final decision not to vectorize the statements in any way. */
81
82 static void
83 vect_free_slp_tree (slp_tree node, bool final_p)
84 {
85 int i;
86 slp_tree child;
87
88 if (--node->refcnt != 0)
89 return;
90
91 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
92 vect_free_slp_tree (child, final_p);
93
94 /* Don't update STMT_VINFO_NUM_SLP_USES if it isn't relevant.
95 Some statements might no longer exist, after having been
96 removed by vect_transform_stmt. Updating the remaining
97 statements would be redundant. */
98 if (!final_p)
99 {
100 stmt_vec_info stmt_info;
101 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
102 {
103 gcc_assert (STMT_VINFO_NUM_SLP_USES (stmt_info) > 0);
104 STMT_VINFO_NUM_SLP_USES (stmt_info)--;
105 }
106 }
107
108 delete node;
109 }
110
111 /* Free the memory allocated for the SLP instance. FINAL_P is true if we
112 have vectorized the instance or if we have made a final decision not
113 to vectorize the statements in any way. */
114
115 void
116 vect_free_slp_instance (slp_instance instance, bool final_p)
117 {
118 vect_free_slp_tree (SLP_INSTANCE_TREE (instance), final_p);
119 SLP_INSTANCE_LOADS (instance).release ();
120 free (instance);
121 }
122
123
124 /* Create an SLP node for SCALAR_STMTS. */
125
126 static slp_tree
127 vect_create_new_slp_node (vec<stmt_vec_info> scalar_stmts, unsigned nops)
128 {
129 slp_tree node = new _slp_tree;
130 SLP_TREE_SCALAR_STMTS (node) = scalar_stmts;
131 SLP_TREE_CHILDREN (node).create (nops);
132 SLP_TREE_DEF_TYPE (node) = vect_internal_def;
133
134 unsigned i;
135 stmt_vec_info stmt_info;
136 FOR_EACH_VEC_ELT (scalar_stmts, i, stmt_info)
137 STMT_VINFO_NUM_SLP_USES (stmt_info)++;
138
139 return node;
140 }
141
142 /* Create an SLP node for OPS. */
143
144 static slp_tree
145 vect_create_new_slp_node (vec<tree> ops)
146 {
147 slp_tree node = new _slp_tree;
148 SLP_TREE_SCALAR_OPS (node) = ops;
149 SLP_TREE_DEF_TYPE (node) = vect_external_def;
150 return node;
151 }
152
153
154 /* This structure is used in creation of an SLP tree. Each instance
155 corresponds to the same operand in a group of scalar stmts in an SLP
156 node. */
157 typedef struct _slp_oprnd_info
158 {
159 /* Def-stmts for the operands. */
160 vec<stmt_vec_info> def_stmts;
161 /* Operands. */
162 vec<tree> ops;
163 /* Information about the first statement, its vector def-type, type, the
164 operand itself in case it's constant, and an indication if it's a pattern
165 stmt. */
166 tree first_op_type;
167 enum vect_def_type first_dt;
168 bool any_pattern;
169 } *slp_oprnd_info;
170
171
172 /* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
173 operand. */
174 static vec<slp_oprnd_info>
175 vect_create_oprnd_info (int nops, int group_size)
176 {
177 int i;
178 slp_oprnd_info oprnd_info;
179 vec<slp_oprnd_info> oprnds_info;
180
181 oprnds_info.create (nops);
182 for (i = 0; i < nops; i++)
183 {
184 oprnd_info = XNEW (struct _slp_oprnd_info);
185 oprnd_info->def_stmts.create (group_size);
186 oprnd_info->ops.create (group_size);
187 oprnd_info->first_dt = vect_uninitialized_def;
188 oprnd_info->first_op_type = NULL_TREE;
189 oprnd_info->any_pattern = false;
190 oprnds_info.quick_push (oprnd_info);
191 }
192
193 return oprnds_info;
194 }
195
196
197 /* Free operands info. */
198
199 static void
200 vect_free_oprnd_info (vec<slp_oprnd_info> &oprnds_info)
201 {
202 int i;
203 slp_oprnd_info oprnd_info;
204
205 FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
206 {
207 oprnd_info->def_stmts.release ();
208 oprnd_info->ops.release ();
209 XDELETE (oprnd_info);
210 }
211
212 oprnds_info.release ();
213 }
214
215
216 /* Return true if STMTS contains a pattern statement. */
217
218 static bool
219 vect_contains_pattern_stmt_p (vec<stmt_vec_info> stmts)
220 {
221 stmt_vec_info stmt_info;
222 unsigned int i;
223 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
224 if (is_pattern_stmt_p (stmt_info))
225 return true;
226 return false;
227 }
228
229 /* Find the place of the data-ref in STMT_INFO in the interleaving chain
230 that starts from FIRST_STMT_INFO. Return -1 if the data-ref is not a part
231 of the chain. */
232
233 int
234 vect_get_place_in_interleaving_chain (stmt_vec_info stmt_info,
235 stmt_vec_info first_stmt_info)
236 {
237 stmt_vec_info next_stmt_info = first_stmt_info;
238 int result = 0;
239
240 if (first_stmt_info != DR_GROUP_FIRST_ELEMENT (stmt_info))
241 return -1;
242
243 do
244 {
245 if (next_stmt_info == stmt_info)
246 return result;
247 next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
248 if (next_stmt_info)
249 result += DR_GROUP_GAP (next_stmt_info);
250 }
251 while (next_stmt_info);
252
253 return -1;
254 }
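
/* For illustration (editor's sketch, not part of the sources): given a
   group accessing a[0], a[2], a[3] via stmts S1, S2, S3 where
   DR_GROUP_GAP (S2) == 2 and DR_GROUP_GAP (S3) == 1, the function
   returns the element index within the interleaving chain, counting
   gaps:

     place (S1) = 0
     place (S2) = 0 + DR_GROUP_GAP (S2) = 2
     place (S3) = 2 + DR_GROUP_GAP (S3) = 3  */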
255
256 /* Check whether it is possible to load COUNT elements of type ELT_TYPE
257 using the method implemented by duplicate_and_interleave. If so,
258 return true, storing the number of intermediate vectors in *NVECTORS_OUT
259 (if nonnull), the type of each intermediate vector in *VECTOR_TYPE_OUT
260 (if nonnull) and the two permute masks in PERMUTES (if nonnull). */
261
262 bool
263 can_duplicate_and_interleave_p (vec_info *vinfo, unsigned int count,
264 tree elt_type, unsigned int *nvectors_out,
265 tree *vector_type_out,
266 tree *permutes)
267 {
268 tree base_vector_type = get_vectype_for_scalar_type (vinfo, elt_type, count);
269 if (!base_vector_type || !VECTOR_MODE_P (TYPE_MODE (base_vector_type)))
270 return false;
271
272 machine_mode base_vector_mode = TYPE_MODE (base_vector_type);
273 poly_int64 elt_bytes = count * GET_MODE_UNIT_SIZE (base_vector_mode);
274 unsigned int nvectors = 1;
275 for (;;)
276 {
277 scalar_int_mode int_mode;
278 poly_int64 elt_bits = elt_bytes * BITS_PER_UNIT;
279 if (int_mode_for_size (elt_bits, 1).exists (&int_mode))
280 {
281 /* Get the natural vector type for this SLP group size. */
282 tree int_type = build_nonstandard_integer_type
283 (GET_MODE_BITSIZE (int_mode), 1);
284 tree vector_type
285 = get_vectype_for_scalar_type (vinfo, int_type, count);
286 if (vector_type
287 && VECTOR_MODE_P (TYPE_MODE (vector_type))
288 && known_eq (GET_MODE_SIZE (TYPE_MODE (vector_type)),
289 GET_MODE_SIZE (base_vector_mode)))
290 {
291 /* Try fusing consecutive sequences of COUNT / NVECTORS elements
292 together into elements of type INT_TYPE and using the result
293 to build NVECTORS vectors. */
294 poly_uint64 nelts = GET_MODE_NUNITS (TYPE_MODE (vector_type));
295 vec_perm_builder sel1 (nelts, 2, 3);
296 vec_perm_builder sel2 (nelts, 2, 3);
297 poly_int64 half_nelts = exact_div (nelts, 2);
298 for (unsigned int i = 0; i < 3; ++i)
299 {
300 sel1.quick_push (i);
301 sel1.quick_push (i + nelts);
302 sel2.quick_push (half_nelts + i);
303 sel2.quick_push (half_nelts + i + nelts);
304 }
305 vec_perm_indices indices1 (sel1, 2, nelts);
306 vec_perm_indices indices2 (sel2, 2, nelts);
307 if (can_vec_perm_const_p (TYPE_MODE (vector_type), indices1)
308 && can_vec_perm_const_p (TYPE_MODE (vector_type), indices2))
309 {
310 if (nvectors_out)
311 *nvectors_out = nvectors;
312 if (vector_type_out)
313 *vector_type_out = vector_type;
314 if (permutes)
315 {
316 permutes[0] = vect_gen_perm_mask_checked (vector_type,
317 indices1);
318 permutes[1] = vect_gen_perm_mask_checked (vector_type,
319 indices2);
320 }
321 return true;
322 }
323 }
324 }
325 if (!multiple_p (elt_bytes, 2, &elt_bytes))
326 return false;
327 nvectors *= 2;
328 }
329 }
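
/* A concrete instance (editor's sketch, assuming an 8-element vector
   mode): for nelts == 8 the two selectors built above expand to

     sel1 = { 0, 8, 1, 9, 2, 10, 3, 11 }
     sel2 = { 4, 12, 5, 13, 6, 14, 7, 15 }

   i.e. indices1 interleaves the low halves of the two input vectors
   and indices2 the high halves.  Both are encoded with 2 interleaved
   patterns of 3 elements each, so the same encoding also describes
   variable-length vectors.  */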
330
331 /* Get the defs for the rhs of STMT (collect them in OPRNDS_INFO), check that
332 they are of a valid type and that they match the defs of the first stmt of
333 the SLP group (stored in OPRNDS_INFO). This function tries to match stmts
334 by swapping operands of STMTS[STMT_NUM] when possible. Non-zero *SWAP
335 indicates swap is required for cond_expr stmts. Specifically, *SWAP
336 is 1 if STMT is cond and operands of comparison need to be swapped;
337 *SWAP is 2 if STMT is cond and code of comparison needs to be inverted.
338 If there is any operand swap in this function, *SWAP is set to a
339 non-zero value.
340 If there was a fatal error return -1; if the error could be corrected by
341 swapping operands of the parent node of this one, return 1; if everything
342 is ok return 0. */
343 static int
344 vect_get_and_check_slp_defs (vec_info *vinfo, unsigned char *swap,
345 vec<stmt_vec_info> stmts, unsigned stmt_num,
346 vec<slp_oprnd_info> *oprnds_info)
347 {
348 stmt_vec_info stmt_info = stmts[stmt_num];
349 tree oprnd;
350 unsigned int i, number_of_oprnds;
351 enum vect_def_type dt = vect_uninitialized_def;
352 slp_oprnd_info oprnd_info;
353 int first_op_idx = 1;
354 unsigned int commutative_op = -1U;
355 bool first_op_cond = false;
356 bool first = stmt_num == 0;
357
358 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
359 {
360 number_of_oprnds = gimple_call_num_args (stmt);
361 first_op_idx = 3;
362 if (gimple_call_internal_p (stmt))
363 {
364 internal_fn ifn = gimple_call_internal_fn (stmt);
365 commutative_op = first_commutative_argument (ifn);
366
367 /* Masked load, only look at mask. */
368 if (ifn == IFN_MASK_LOAD)
369 {
370 number_of_oprnds = 1;
371 /* Mask operand index. */
372 first_op_idx = 5;
373 }
374 }
375 }
376 else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
377 {
378 enum tree_code code = gimple_assign_rhs_code (stmt);
379 number_of_oprnds = gimple_num_ops (stmt) - 1;
380 /* Swap can only be done for cond_expr if asked to, otherwise we
381 could result in different comparison code to the first stmt. */
382 if (code == COND_EXPR
383 && COMPARISON_CLASS_P (gimple_assign_rhs1 (stmt)))
384 {
385 first_op_cond = true;
386 number_of_oprnds++;
387 }
388 else
389 commutative_op = commutative_tree_code (code) ? 0U : -1U;
390 }
391 else
392 return -1;
393
394 bool swapped = (*swap != 0);
395 gcc_assert (!swapped || first_op_cond);
396 for (i = 0; i < number_of_oprnds; i++)
397 {
398 again:
399 if (first_op_cond)
400 {
401 /* Map indicating how operands of cond_expr should be swapped. */
402 int maps[3][4] = {{0, 1, 2, 3}, {1, 0, 2, 3}, {0, 1, 3, 2}};
403 int *map = maps[*swap];
404
405 if (i < 2)
406 oprnd = TREE_OPERAND (gimple_op (stmt_info->stmt,
407 first_op_idx), map[i]);
408 else
409 oprnd = gimple_op (stmt_info->stmt, map[i]);
410 }
411 else
412 oprnd = gimple_op (stmt_info->stmt, first_op_idx + (swapped ? !i : i));
413 if (TREE_CODE (oprnd) == VIEW_CONVERT_EXPR)
414 oprnd = TREE_OPERAND (oprnd, 0);
415
416 oprnd_info = (*oprnds_info)[i];
417
418 stmt_vec_info def_stmt_info;
419 if (!vect_is_simple_use (oprnd, vinfo, &dt, &def_stmt_info))
420 {
421 if (dump_enabled_p ())
422 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
423 "Build SLP failed: can't analyze def for %T\n",
424 oprnd);
425
426 return -1;
427 }
428
429 if (def_stmt_info && is_pattern_stmt_p (def_stmt_info))
430 oprnd_info->any_pattern = true;
431
432 if (first)
433 {
434 /* For the swapping logic below force vect_reduction_def
435 for the reduction op in a SLP reduction group. */
436 if (!STMT_VINFO_DATA_REF (stmt_info)
437 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
438 && (int)i == STMT_VINFO_REDUC_IDX (stmt_info)
439 && def_stmt_info)
440 dt = vect_reduction_def;
441 oprnd_info->first_dt = dt;
442 oprnd_info->first_op_type = TREE_TYPE (oprnd);
443 }
444 else
445 {
446 /* Not first stmt of the group, check that the def-stmt/s match
447 the def-stmt/s of the first stmt. Allow different definition
448 types for reduction chains: the first stmt must be a
449 vect_reduction_def (a phi node), and the rest
450 must be part of the reduction chain. */
451 tree type = TREE_TYPE (oprnd);
452 if ((oprnd_info->first_dt != dt
453 && !(oprnd_info->first_dt == vect_reduction_def
454 && !STMT_VINFO_DATA_REF (stmt_info)
455 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
456 && def_stmt_info
457 && !STMT_VINFO_DATA_REF (def_stmt_info)
458 && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
459 == REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
460 && !((oprnd_info->first_dt == vect_external_def
461 || oprnd_info->first_dt == vect_constant_def)
462 && (dt == vect_external_def
463 || dt == vect_constant_def)))
464 || !types_compatible_p (oprnd_info->first_op_type, type)
465 || (!STMT_VINFO_DATA_REF (stmt_info)
466 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
467 && ((!def_stmt_info
468 || STMT_VINFO_DATA_REF (def_stmt_info)
469 || (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
470 != REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
471 != (oprnd_info->first_dt != vect_reduction_def))))
472 {
473 /* Try swapping operands if we got a mismatch. */
474 if (i == commutative_op && !swapped)
475 {
476 if (dump_enabled_p ())
477 dump_printf_loc (MSG_NOTE, vect_location,
478 "trying swapped operands\n");
479 swapped = true;
480 goto again;
481 }
482
483 if (dump_enabled_p ())
484 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
485 "Build SLP failed: different types\n");
486
487 return 1;
488 }
489 if ((dt == vect_constant_def
490 || dt == vect_external_def)
491 && !GET_MODE_SIZE (vinfo->vector_mode).is_constant ()
492 && (TREE_CODE (type) == BOOLEAN_TYPE
493 || !can_duplicate_and_interleave_p (vinfo, stmts.length (),
494 type)))
495 {
496 if (dump_enabled_p ())
497 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
498 "Build SLP failed: invalid type of def "
499 "for variable-length SLP %T\n", oprnd);
500 return -1;
501 }
502 }
503
504 /* Check the types of the definitions. */
505 switch (dt)
506 {
507 case vect_external_def:
508 /* Make sure to demote the overall operand to external. */
509 oprnd_info->first_dt = vect_external_def;
510 /* Fallthru. */
511 case vect_constant_def:
512 oprnd_info->def_stmts.quick_push (NULL);
513 oprnd_info->ops.quick_push (oprnd);
514 break;
515
516 case vect_internal_def:
517 case vect_reduction_def:
518 if (oprnd_info->first_dt == vect_reduction_def
519 && !STMT_VINFO_DATA_REF (stmt_info)
520 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
521 && !STMT_VINFO_DATA_REF (def_stmt_info)
522 && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
523 == REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
524 {
525 /* For a SLP reduction chain we want to duplicate the
526 reduction to each of the chain members. That gets
527 us a sane SLP graph (still the stmts are not 100%
528 correct wrt the initial values). */
529 gcc_assert (!first);
530 oprnd_info->def_stmts.quick_push (oprnd_info->def_stmts[0]);
531 oprnd_info->ops.quick_push (oprnd_info->ops[0]);
532 break;
533 }
534 /* Fallthru. */
535 case vect_induction_def:
536 oprnd_info->def_stmts.quick_push (def_stmt_info);
537 oprnd_info->ops.quick_push (oprnd);
538 break;
539
540 default:
541 /* FORNOW: Not supported. */
542 if (dump_enabled_p ())
543 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
544 "Build SLP failed: illegal type of def %T\n",
545 oprnd);
546
547 return -1;
548 }
549 }
550
551 /* Swap operands. */
552 if (swapped)
553 {
554 if (dump_enabled_p ())
555 dump_printf_loc (MSG_NOTE, vect_location,
556 "swapped operands to match def types in %G",
557 stmt_info->stmt);
558 }
559
560 *swap = swapped;
561 return 0;
562 }
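
/* Example of the swapping logic above (editor's sketch): for the group

     S1: x0 = a0 + b0;
     S2: x1 = b1 + a1;

   matching S2 against S1 fails on the first attempt because the def
   types of the first operands differ; since PLUS_EXPR is commutative
   (commutative_op == 0) the loop retries with S2's operands swapped
   (the goto again above) and reports the swap back via *SWAP.  */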
563
564 /* Try to assign vector type VECTYPE to STMT_INFO for BB vectorization.
565 Return true if we can, meaning that this choice doesn't conflict with
566 existing SLP nodes that use STMT_INFO. */
567
568 static bool
569 vect_update_shared_vectype (stmt_vec_info stmt_info, tree vectype)
570 {
571 tree old_vectype = STMT_VINFO_VECTYPE (stmt_info);
572 if (old_vectype && useless_type_conversion_p (vectype, old_vectype))
573 return true;
574
575 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
576 && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
577 {
578 /* We maintain the invariant that if any statement in the group is
579 used, all other members of the group have the same vector type. */
580 stmt_vec_info first_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
581 stmt_vec_info member_info = first_info;
582 for (; member_info; member_info = DR_GROUP_NEXT_ELEMENT (member_info))
583 if (STMT_VINFO_NUM_SLP_USES (member_info) > 0
584 || is_pattern_stmt_p (member_info))
585 break;
586
587 if (!member_info)
588 {
589 for (member_info = first_info; member_info;
590 member_info = DR_GROUP_NEXT_ELEMENT (member_info))
591 STMT_VINFO_VECTYPE (member_info) = vectype;
592 return true;
593 }
594 }
595 else if (STMT_VINFO_NUM_SLP_USES (stmt_info) == 0
596 && !is_pattern_stmt_p (stmt_info))
597 {
598 STMT_VINFO_VECTYPE (stmt_info) = vectype;
599 return true;
600 }
601
602 if (dump_enabled_p ())
603 {
604 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
605 "Build SLP failed: incompatible vector"
606 " types for: %G", stmt_info->stmt);
607 dump_printf_loc (MSG_NOTE, vect_location,
608 " old vector type: %T\n", old_vectype);
609 dump_printf_loc (MSG_NOTE, vect_location,
610 " new vector type: %T\n", vectype);
611 }
612 return false;
613 }
614
615 /* Try to infer and assign a vector type to all the statements in STMTS.
616 Used only for BB vectorization. */
617
618 static bool
619 vect_update_all_shared_vectypes (vec_info *vinfo, vec<stmt_vec_info> stmts)
620 {
621 tree vectype, nunits_vectype;
622 if (!vect_get_vector_types_for_stmt (vinfo, stmts[0], &vectype,
623 &nunits_vectype, stmts.length ()))
624 return false;
625
626 stmt_vec_info stmt_info;
627 unsigned int i;
628 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
629 if (!vect_update_shared_vectype (stmt_info, vectype))
630 return false;
631
632 return true;
633 }
634
635 /* Return true if call statements CALL1 and CALL2 are similar enough
636 to be combined into the same SLP group. */
637
638 static bool
639 compatible_calls_p (gcall *call1, gcall *call2)
640 {
641 unsigned int nargs = gimple_call_num_args (call1);
642 if (nargs != gimple_call_num_args (call2))
643 return false;
644
645 if (gimple_call_combined_fn (call1) != gimple_call_combined_fn (call2))
646 return false;
647
648 if (gimple_call_internal_p (call1))
649 {
650 if (!types_compatible_p (TREE_TYPE (gimple_call_lhs (call1)),
651 TREE_TYPE (gimple_call_lhs (call2))))
652 return false;
653 for (unsigned int i = 0; i < nargs; ++i)
654 if (!types_compatible_p (TREE_TYPE (gimple_call_arg (call1, i)),
655 TREE_TYPE (gimple_call_arg (call2, i))))
656 return false;
657 }
658 else
659 {
660 if (!operand_equal_p (gimple_call_fn (call1),
661 gimple_call_fn (call2), 0))
662 return false;
663
664 if (gimple_call_fntype (call1) != gimple_call_fntype (call2))
665 return false;
666 }
667 return true;
668 }
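
/* For example (editor's sketch): two calls to the same internal
   function such as .FMA (a, b, c) with compatible lhs and argument
   types are combinable, whereas calls that differ in fntype or in the
   callee -- say sqrtf vs. sqrt -- are rejected.  */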
669
670 /* A subroutine of vect_build_slp_tree for checking VECTYPE, which is the
671 caller's attempt to find the vector type in STMT_INFO with the narrowest
672 element type. Return true if VECTYPE is nonnull and if it is valid
673 for STMT_INFO. When returning true, update MAX_NUNITS to reflect the
674 number of units in VECTYPE. GROUP_SIZE and MAX_NUNITS are as for
675 vect_build_slp_tree. */
676
677 static bool
678 vect_record_max_nunits (vec_info *vinfo, stmt_vec_info stmt_info,
679 unsigned int group_size,
680 tree vectype, poly_uint64 *max_nunits)
681 {
682 if (!vectype)
683 {
684 if (dump_enabled_p ())
685 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
686 "Build SLP failed: unsupported data-type in %G\n",
687 stmt_info->stmt);
688 /* Fatal mismatch. */
689 return false;
690 }
691
692 /* If populating the vector type requires unrolling then fail
693 before adjusting *max_nunits for basic-block vectorization. */
694 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
695 unsigned HOST_WIDE_INT const_nunits;
696 if (is_a <bb_vec_info> (vinfo)
697 && (!nunits.is_constant (&const_nunits)
698 || const_nunits > group_size))
699 {
700 if (dump_enabled_p ())
701 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
702 "Build SLP failed: unrolling required "
703 "in basic block SLP\n");
704 /* Fatal mismatch. */
705 return false;
706 }
707
708 /* In case of multiple types we need to detect the smallest type. */
709 vect_update_max_nunits (max_nunits, vectype);
710 return true;
711 }
712
713 /* STMTS is a group of GROUP_SIZE SLP statements in which some
714 statements do the same operation as the first statement and in which
715 the others do ALT_STMT_CODE. Return true if we can take one vector
716 of the first operation and one vector of the second and permute them
717 to get the required result. VECTYPE is the type of the vector that
718 would be permuted. */
719
720 static bool
721 vect_two_operations_perm_ok_p (vec<stmt_vec_info> stmts,
722 unsigned int group_size, tree vectype,
723 tree_code alt_stmt_code)
724 {
725 unsigned HOST_WIDE_INT count;
726 if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&count))
727 return false;
728
729 vec_perm_builder sel (count, count, 1);
730 for (unsigned int i = 0; i < count; ++i)
731 {
732 unsigned int elt = i;
733 gassign *stmt = as_a <gassign *> (stmts[i % group_size]->stmt);
734 if (gimple_assign_rhs_code (stmt) == alt_stmt_code)
735 elt += count;
736 sel.quick_push (elt);
737 }
738 vec_perm_indices indices (sel, 2, count);
739 return can_vec_perm_const_p (TYPE_MODE (vectype), indices);
740 }
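
/* Example (editor's sketch): for the addsub-style group

     S1: x0 = a0 + b0;   S2: x1 = a1 - b1;
     S3: x2 = a2 + b2;   S4: x3 = a3 - b3;

   with a 4-element vectype, first operation PLUS_EXPR and
   ALT_STMT_CODE MINUS_EXPR, the selector built above is
   { 0, 5, 2, 7 }: even lanes come from the PLUS vector and odd lanes
   from the MINUS vector (indices offset by COUNT).  */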
741
742 /* Verify that the scalar stmts STMTS are isomorphic, do not require
743 a data permutation, and are not of unsupported operation types. Return
744 true if so, otherwise return false and indicate in *MATCHES
745 which stmts are not isomorphic to the first one. If MATCHES[0]
746 is false then this indicates the comparison could not be
747 carried out or the stmts will never be vectorized by SLP.
748
749 Note a COND_EXPR is possibly isomorphic to another one after swapping its
750 operands. Set SWAP[i] to 1 if stmt I is a COND_EXPR and isomorphic to
751 the first stmt by swapping the two operands of the comparison; set SWAP[i]
752 to 2 if stmt I is isomorphic to the first stmt by inverting the code
753 of the comparison. Take A1 >= B1 ? X1 : Y1 as an example: it can be swapped
754 to (B1 <= A1 ? X1 : Y1), or be inverted to (A1 < B1) ? Y1 : X1. */
755
756 static bool
757 vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap,
758 vec<stmt_vec_info> stmts, unsigned int group_size,
759 poly_uint64 *max_nunits, bool *matches,
760 bool *two_operators)
761 {
762 unsigned int i;
763 stmt_vec_info first_stmt_info = stmts[0];
764 enum tree_code first_stmt_code = ERROR_MARK;
765 enum tree_code alt_stmt_code = ERROR_MARK;
766 enum tree_code rhs_code = ERROR_MARK;
767 enum tree_code first_cond_code = ERROR_MARK;
768 tree lhs;
769 bool need_same_oprnds = false;
770 tree vectype = NULL_TREE, first_op1 = NULL_TREE;
771 optab optab;
772 int icode;
773 machine_mode optab_op2_mode;
774 machine_mode vec_mode;
775 stmt_vec_info first_load = NULL, prev_first_load = NULL;
776 bool load_p = false;
777
778 /* For every stmt in STMTS check whether it is isomorphic to the first. */
779 stmt_vec_info stmt_info;
780 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
781 {
782 gimple *stmt = stmt_info->stmt;
783 swap[i] = 0;
784 matches[i] = false;
785
786 if (dump_enabled_p ())
787 dump_printf_loc (MSG_NOTE, vect_location, "Build SLP for %G", stmt);
788
789 /* Fail to vectorize statements marked as unvectorizable. */
790 if (!STMT_VINFO_VECTORIZABLE (stmt_info))
791 {
792 if (dump_enabled_p ())
793 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
794 "Build SLP failed: unvectorizable statement %G",
795 stmt);
796 /* Fatal mismatch. */
797 matches[0] = false;
798 return false;
799 }
800
801 lhs = gimple_get_lhs (stmt);
802 if (lhs == NULL_TREE)
803 {
804 if (dump_enabled_p ())
805 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
806 "Build SLP failed: not GIMPLE_ASSIGN nor "
807 "GIMPLE_CALL %G", stmt);
808 /* Fatal mismatch. */
809 matches[0] = false;
810 return false;
811 }
812
813 tree nunits_vectype;
814 if (!vect_get_vector_types_for_stmt (vinfo, stmt_info, &vectype,
815 &nunits_vectype, group_size)
816 || (nunits_vectype
817 && !vect_record_max_nunits (vinfo, stmt_info, group_size,
818 nunits_vectype, max_nunits)))
819 {
820 /* Fatal mismatch. */
821 matches[0] = false;
822 return false;
823 }
824
825 gcc_assert (vectype);
826
827 if (is_a <bb_vec_info> (vinfo)
828 && !vect_update_shared_vectype (stmt_info, vectype))
829 continue;
830
831 gcall *call_stmt = dyn_cast <gcall *> (stmt);
832 if (call_stmt)
833 {
834 rhs_code = CALL_EXPR;
835
836 if (gimple_call_internal_p (stmt, IFN_MASK_LOAD))
837 load_p = true;
838 else if ((gimple_call_internal_p (call_stmt)
839 && (!vectorizable_internal_fn_p
840 (gimple_call_internal_fn (call_stmt))))
841 || gimple_call_tail_p (call_stmt)
842 || gimple_call_noreturn_p (call_stmt)
843 || !gimple_call_nothrow_p (call_stmt)
844 || gimple_call_chain (call_stmt))
845 {
846 if (dump_enabled_p ())
847 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
848 "Build SLP failed: unsupported call type %G",
849 call_stmt);
850 /* Fatal mismatch. */
851 matches[0] = false;
852 return false;
853 }
854 }
855 else
856 {
857 rhs_code = gimple_assign_rhs_code (stmt);
858 load_p = TREE_CODE_CLASS (rhs_code) == tcc_reference;
859 }
860
861 /* Check the operation. */
862 if (i == 0)
863 {
864 first_stmt_code = rhs_code;
865
866 /* Shift arguments should be equal in all the packed stmts for a
867 vector shift with a scalar shift operand. */
868 if (rhs_code == LSHIFT_EXPR || rhs_code == RSHIFT_EXPR
869 || rhs_code == LROTATE_EXPR
870 || rhs_code == RROTATE_EXPR)
871 {
872 vec_mode = TYPE_MODE (vectype);
873
874 /* First see if we have a vector/vector shift. */
875 optab = optab_for_tree_code (rhs_code, vectype,
876 optab_vector);
877
878 if (!optab
879 || optab_handler (optab, vec_mode) == CODE_FOR_nothing)
880 {
881 /* No vector/vector shift, try for a vector/scalar shift. */
882 optab = optab_for_tree_code (rhs_code, vectype,
883 optab_scalar);
884
885 if (!optab)
886 {
887 if (dump_enabled_p ())
888 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
889 "Build SLP failed: no optab.\n");
890 /* Fatal mismatch. */
891 matches[0] = false;
892 return false;
893 }
894 icode = (int) optab_handler (optab, vec_mode);
895 if (icode == CODE_FOR_nothing)
896 {
897 if (dump_enabled_p ())
898 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
899 "Build SLP failed: "
900 "op not supported by target.\n");
901 /* Fatal mismatch. */
902 matches[0] = false;
903 return false;
904 }
905 optab_op2_mode = insn_data[icode].operand[2].mode;
906 if (!VECTOR_MODE_P (optab_op2_mode))
907 {
908 need_same_oprnds = true;
909 first_op1 = gimple_assign_rhs2 (stmt);
910 }
911 }
912 }
913 else if (rhs_code == WIDEN_LSHIFT_EXPR)
914 {
915 need_same_oprnds = true;
916 first_op1 = gimple_assign_rhs2 (stmt);
917 }
918 else if (call_stmt
919 && gimple_call_internal_p (call_stmt, IFN_DIV_POW2))
920 {
921 need_same_oprnds = true;
922 first_op1 = gimple_call_arg (call_stmt, 1);
923 }
924 }
925 else
926 {
927 if (first_stmt_code != rhs_code
928 && alt_stmt_code == ERROR_MARK)
929 alt_stmt_code = rhs_code;
930 if (first_stmt_code != rhs_code
931 && (first_stmt_code != IMAGPART_EXPR
932 || rhs_code != REALPART_EXPR)
933 && (first_stmt_code != REALPART_EXPR
934 || rhs_code != IMAGPART_EXPR)
935 /* Handle mismatches in plus/minus by computing both
936 and merging the results. */
937 && !((first_stmt_code == PLUS_EXPR
938 || first_stmt_code == MINUS_EXPR)
939 && (alt_stmt_code == PLUS_EXPR
940 || alt_stmt_code == MINUS_EXPR)
941 && rhs_code == alt_stmt_code)
942 && !(STMT_VINFO_GROUPED_ACCESS (stmt_info)
943 && (first_stmt_code == ARRAY_REF
944 || first_stmt_code == BIT_FIELD_REF
945 || first_stmt_code == INDIRECT_REF
946 || first_stmt_code == COMPONENT_REF
947 || first_stmt_code == MEM_REF)))
948 {
949 if (dump_enabled_p ())
950 {
951 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
952 "Build SLP failed: different operation "
953 "in stmt %G", stmt);
954 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
955 "original stmt %G", first_stmt_info->stmt);
956 }
957 /* Mismatch. */
958 continue;
959 }
960
961 if (need_same_oprnds)
962 {
963 tree other_op1 = (call_stmt
964 ? gimple_call_arg (call_stmt, 1)
965 : gimple_assign_rhs2 (stmt));
966 if (!operand_equal_p (first_op1, other_op1, 0))
967 {
968 if (dump_enabled_p ())
969 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
970 "Build SLP failed: different shift "
971 "arguments in %G", stmt);
972 /* Mismatch. */
973 continue;
974 }
975 }
976
977 if (!load_p && rhs_code == CALL_EXPR)
978 {
979 if (!compatible_calls_p (as_a <gcall *> (stmts[0]->stmt),
980 as_a <gcall *> (stmt)))
981 {
982 if (dump_enabled_p ())
983 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
984 "Build SLP failed: different calls in %G",
985 stmt);
986 /* Mismatch. */
987 continue;
988 }
989 }
990 }
991
992 /* Grouped store or load. */
993 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
994 {
995 if (REFERENCE_CLASS_P (lhs))
996 {
997 /* Store. */
998 ;
999 }
1000 else
1001 {
1002 /* Load. */
1003 first_load = DR_GROUP_FIRST_ELEMENT (stmt_info);
1004 if (prev_first_load)
1005 {
1006 /* Check that there are no loads from different interleaving
1007 chains in the same node. */
1008 if (prev_first_load != first_load)
1009 {
1010 if (dump_enabled_p ())
1011 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
1012 vect_location,
1013 "Build SLP failed: different "
1014 "interleaving chains in one node %G",
1015 stmt);
1016 /* Mismatch. */
1017 continue;
1018 }
1019 }
1020 else
1021 prev_first_load = first_load;
1022 }
1023 } /* Grouped access. */
1024 else
1025 {
1026 if (load_p)
1027 {
1028 /* Non-grouped load. */
1029 if (dump_enabled_p ())
1030 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1031 "Build SLP failed: not grouped load %G", stmt);
1032
1033 /* FORNOW: Non-grouped loads are not supported. */
1034 /* Fatal mismatch. */
1035 matches[0] = false;
1036 return false;
1037 }
1038
1039 /* Not a memory operation. */
1040 if (TREE_CODE_CLASS (rhs_code) != tcc_binary
1041 && TREE_CODE_CLASS (rhs_code) != tcc_unary
1042 && TREE_CODE_CLASS (rhs_code) != tcc_expression
1043 && TREE_CODE_CLASS (rhs_code) != tcc_comparison
1044 && rhs_code != VIEW_CONVERT_EXPR
1045 && rhs_code != CALL_EXPR)
1046 {
1047 if (dump_enabled_p ())
1048 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1049 "Build SLP failed: operation unsupported %G",
1050 stmt);
1051 /* Fatal mismatch. */
1052 matches[0] = false;
1053 return false;
1054 }
1055
1056 if (rhs_code == COND_EXPR)
1057 {
1058 tree cond_expr = gimple_assign_rhs1 (stmt);
1059 enum tree_code cond_code = TREE_CODE (cond_expr);
1060 enum tree_code swap_code = ERROR_MARK;
1061 enum tree_code invert_code = ERROR_MARK;
1062
1063 if (i == 0)
1064 first_cond_code = TREE_CODE (cond_expr);
1065 else if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
1066 {
1067 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
1068 swap_code = swap_tree_comparison (cond_code);
1069 invert_code = invert_tree_comparison (cond_code, honor_nans);
1070 }
1071
1072 if (first_cond_code == cond_code)
1073 ;
1074 /* Isomorphism can be achieved by swapping. */
1075 else if (first_cond_code == swap_code)
1076 swap[i] = 1;
1077 /* Isomorphism can be achieved by inverting. */
1078 else if (first_cond_code == invert_code)
1079 swap[i] = 2;
1080 else
1081 {
1082 if (dump_enabled_p ())
1083 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1084 "Build SLP failed: different"
1085 " operation %G", stmt);
1086 /* Mismatch. */
1087 continue;
1088 }
1089 }
1090 }
1091
1092 matches[i] = true;
1093 }
1094
1095 for (i = 0; i < group_size; ++i)
1096 if (!matches[i])
1097 return false;
1098
1099 /* If we allowed a two-operation SLP node verify the target can cope
1100 with the permute we are going to use. */
1101 if (alt_stmt_code != ERROR_MARK
1102 && TREE_CODE_CLASS (alt_stmt_code) != tcc_reference)
1103 {
1104 if (!vect_two_operations_perm_ok_p (stmts, group_size,
1105 vectype, alt_stmt_code))
1106 {
1107 for (i = 0; i < group_size; ++i)
1108 if (gimple_assign_rhs_code (stmts[i]->stmt) == alt_stmt_code)
1109 {
1110 matches[i] = false;
1111 if (dump_enabled_p ())
1112 {
1113 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1114 "Build SLP failed: different operation "
1115 "in stmt %G", stmts[i]->stmt);
1116 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1117 "original stmt %G", first_stmt_info->stmt);
1118 }
1119 }
1120 return false;
1121 }
1122 *two_operators = true;
1123 }
1124
1125 return true;
1126 }
1127
1128 /* Traits for the hash_map used to record SLP builds for a stmt set.
1129 Note we never remove entries except at destruction time, so we do
1130 not need a special deleted value that differs from empty. */
1131 struct bst_traits
1132 {
1133 typedef vec <stmt_vec_info> value_type;
1134 typedef vec <stmt_vec_info> compare_type;
1135 static inline hashval_t hash (value_type);
1136 static inline bool equal (value_type existing, value_type candidate);
1137 static inline bool is_empty (value_type x) { return !x.exists (); }
1138 static inline bool is_deleted (value_type x) { return !x.exists (); }
1139 static const bool empty_zero_p = true;
1140 static inline void mark_empty (value_type &x) { x.release (); }
1141 static inline void mark_deleted (value_type &x) { x.release (); }
1142 static inline void remove (value_type &x) { x.release (); }
1143 };
1144 inline hashval_t
1145 bst_traits::hash (value_type x)
1146 {
1147 inchash::hash h;
1148 for (unsigned i = 0; i < x.length (); ++i)
1149 h.add_int (gimple_uid (x[i]->stmt));
1150 return h.end ();
1151 }
1152 inline bool
1153 bst_traits::equal (value_type existing, value_type candidate)
1154 {
1155 if (existing.length () != candidate.length ())
1156 return false;
1157 for (unsigned i = 0; i < existing.length (); ++i)
1158 if (existing[i] != candidate[i])
1159 return false;
1160 return true;
1161 }
1162
1163 typedef hash_map <vec <gimple *>, slp_tree,
1164 simple_hashmap_traits <bst_traits, slp_tree> >
1165 scalar_stmts_to_slp_tree_map_t;
1166
1167 static slp_tree
1168 vect_build_slp_tree_2 (vec_info *vinfo,
1169 vec<stmt_vec_info> stmts, unsigned int group_size,
1170 poly_uint64 *max_nunits,
1171 bool *matches, unsigned *npermutes, unsigned *tree_size,
1172 scalar_stmts_to_slp_tree_map_t *bst_map);
1173
1174 static slp_tree
1175 vect_build_slp_tree (vec_info *vinfo,
1176 vec<stmt_vec_info> stmts, unsigned int group_size,
1177 poly_uint64 *max_nunits,
1178 bool *matches, unsigned *npermutes, unsigned *tree_size,
1179 scalar_stmts_to_slp_tree_map_t *bst_map)
1180 {
1181 if (slp_tree *leader = bst_map->get (stmts))
1182 {
1183 if (dump_enabled_p ())
1184 dump_printf_loc (MSG_NOTE, vect_location, "re-using %sSLP tree %p\n",
1185 *leader ? "" : "failed ", *leader);
1186 if (*leader)
1187 {
1188 (*leader)->refcnt++;
1189 vect_update_max_nunits (max_nunits, (*leader)->max_nunits);
1190 }
1191 return *leader;
1192 }
1193 poly_uint64 this_max_nunits = 1;
1194 slp_tree res = vect_build_slp_tree_2 (vinfo, stmts, group_size,
1195 &this_max_nunits,
1196 matches, npermutes, tree_size, bst_map);
1197 if (res)
1198 {
1199 res->max_nunits = this_max_nunits;
1200 vect_update_max_nunits (max_nunits, this_max_nunits);
1201 /* Keep a reference for the bst_map use. */
1202 res->refcnt++;
1203 }
1204 bst_map->put (stmts.copy (), res);
1205 return res;
1206 }
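
/* Note (editor's sketch): bst_map acts as a memoization cache -- a
   second occurrence of the same stmt set, e.g. a load group feeding
   two different expressions, re-uses the already built (or already
   failed, i.e. NULL) SLP tree, bumping the refcount instead of
   rebuilding it.  */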
1207
1208 /* Recursively build an SLP tree for the group of stmts STMTS.
1209 Fail (and return NULL) if the def-stmts are not
1210 isomorphic, require data permutation or are of unsupported types of
1211 operation. Otherwise, return the built SLP tree;
1212 *MATCHES indicates which stmts in the group failed
1213 to match the first one. */
1214
1215 static slp_tree
1216 vect_build_slp_tree_2 (vec_info *vinfo,
1217 vec<stmt_vec_info> stmts, unsigned int group_size,
1218 poly_uint64 *max_nunits,
1219 bool *matches, unsigned *npermutes, unsigned *tree_size,
1220 scalar_stmts_to_slp_tree_map_t *bst_map)
1221 {
1222 unsigned nops, i, this_tree_size = 0;
1223 poly_uint64 this_max_nunits = *max_nunits;
1224 slp_tree node;
1225
1226 matches[0] = false;
1227
1228 stmt_vec_info stmt_info = stmts[0];
1229 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
1230 nops = gimple_call_num_args (stmt);
1231 else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
1232 {
1233 nops = gimple_num_ops (stmt) - 1;
1234 if (gimple_assign_rhs_code (stmt) == COND_EXPR)
1235 nops++;
1236 }
1237 else if (is_a <gphi *> (stmt_info->stmt))
1238 nops = 0;
1239 else
1240 return NULL;
1241
1242 /* If the SLP node is a PHI (induction or reduction), terminate
1243 the recursion. */
1244 if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
1245 {
1246 tree scalar_type = TREE_TYPE (PHI_RESULT (stmt));
1247 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type);
1248 if (!vect_record_max_nunits (vinfo, stmt_info, group_size, vectype,
1249 max_nunits))
1250 return NULL;
1251
1252 vect_def_type def_type = STMT_VINFO_DEF_TYPE (stmt_info);
1253 /* Induction from different IVs is not supported. */
1254 if (def_type == vect_induction_def)
1255 {
1256 stmt_vec_info other_info;
1257 FOR_EACH_VEC_ELT (stmts, i, other_info)
1258 if (stmt_info != other_info)
1259 return NULL;
1260 }
1261 else if (def_type == vect_reduction_def
1262 || def_type == vect_double_reduction_def
1263 || def_type == vect_nested_cycle)
1264 {
1265 /* Else def types have to match. */
1266 stmt_vec_info other_info;
1267 FOR_EACH_VEC_ELT (stmts, i, other_info)
1268 if (STMT_VINFO_DEF_TYPE (other_info) != def_type)
1269 return NULL;
1270 }
1271 else
1272 return NULL;
1273 (*tree_size)++;
1274 node = vect_create_new_slp_node (stmts, 0);
1275 return node;
1276 }
1277
1278
1279 bool two_operators = false;
1280 unsigned char *swap = XALLOCAVEC (unsigned char, group_size);
1281 if (!vect_build_slp_tree_1 (vinfo, swap, stmts, group_size,
1282 &this_max_nunits, matches, &two_operators))
1283 return NULL;
1284
1285 /* If the SLP node is a load, terminate the recursion unless masked. */
1286 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1287 && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1288 {
1289 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
1290 {
1291 /* Masked load. */
1292 gcc_assert (gimple_call_internal_p (stmt, IFN_MASK_LOAD));
1293 nops = 1;
1294 }
1295 else
1296 {
1297 *max_nunits = this_max_nunits;
1298 (*tree_size)++;
1299 node = vect_create_new_slp_node (stmts, 0);
1300 /* And compute the load permutation. Whether it is actually
1301 a permutation depends on the unrolling factor which is
1302 decided later. */
1303 vec<unsigned> load_permutation;
1304 int j;
1305 stmt_vec_info load_info;
1306 load_permutation.create (group_size);
1307 stmt_vec_info first_stmt_info
1308 = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
1309 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
1310 {
1311 int load_place = vect_get_place_in_interleaving_chain
1312 (load_info, first_stmt_info);
1313 gcc_assert (load_place != -1);
1314 load_permutation.safe_push (load_place);
1315 }
1316 SLP_TREE_LOAD_PERMUTATION (node) = load_permutation;
1317 return node;
1318 }
1319 }
1320
1321 /* Get at the operands, verifying they are compatible. */
1322 vec<slp_oprnd_info> oprnds_info = vect_create_oprnd_info (nops, group_size);
1323 slp_oprnd_info oprnd_info;
1324 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
1325 {
1326 int res = vect_get_and_check_slp_defs (vinfo, &swap[i],
1327 stmts, i, &oprnds_info);
1328 if (res != 0)
1329 matches[(res == -1) ? 0 : i] = false;
1330 if (!matches[0])
1331 break;
1332 }
1333 for (i = 0; i < group_size; ++i)
1334 if (!matches[i])
1335 {
1336 vect_free_oprnd_info (oprnds_info);
1337 return NULL;
1338 }
1339
1340 auto_vec<slp_tree, 4> children;
1341
1342 stmt_info = stmts[0];
1343
1344 /* Create SLP_TREE nodes for the definition node/s. */
1345 FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
1346 {
1347 slp_tree child;
1348 unsigned old_tree_size = this_tree_size;
1349 unsigned int j;
1350
1351 if (oprnd_info->first_dt == vect_uninitialized_def)
1352 {
1353 /* A COND_EXPR has one operand too many when the condition
1354 is an SSA name. */
1355 gcc_assert (i == 3 && nops == 4);
1356 continue;
1357 }
1358
1359 if (oprnd_info->first_dt != vect_internal_def
1360 && oprnd_info->first_dt != vect_reduction_def
1361 && oprnd_info->first_dt != vect_induction_def)
1362 {
1363 slp_tree invnode = vect_create_new_slp_node (oprnd_info->ops);
1364 SLP_TREE_DEF_TYPE (invnode) = oprnd_info->first_dt;
1365 oprnd_info->ops = vNULL;
1366 children.safe_push (invnode);
1367 continue;
1368 }
1369
1370 if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
1371 group_size, &this_max_nunits,
1372 matches, npermutes,
1373 &this_tree_size, bst_map)) != NULL)
1374 {
1375 /* If we have all children of a non-unary child built up from
1376 scalars then just throw that away and build this node up
1377 from scalars instead. */
1378 if (is_a <bb_vec_info> (vinfo)
1379 && SLP_TREE_CHILDREN (child).length () > 1
1380 /* ??? Rejecting patterns this way doesn't work. We'd have to
1381 do extra work to cancel the pattern so the uses see the
1382 scalar version. */
1383 && !oprnd_info->any_pattern)
1384 {
1385 slp_tree grandchild;
1386
1387 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild)
1388 if (SLP_TREE_DEF_TYPE (grandchild) != vect_external_def)
1389 break;
1390 if (!grandchild
1391 && vect_update_all_shared_vectypes (vinfo,
1392 oprnd_info->def_stmts))
1393 {
1394 /* Roll back. */
1395 this_tree_size = old_tree_size;
1396 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild)
1397 vect_free_slp_tree (grandchild, false);
1398 SLP_TREE_CHILDREN (child).truncate (0);
1399
1400 if (dump_enabled_p ())
1401 dump_printf_loc (MSG_NOTE, vect_location,
1402 "Building parent vector operands from "
1403 "scalars instead\n");
1404 oprnd_info->def_stmts = vNULL;
1405 SLP_TREE_DEF_TYPE (child) = vect_external_def;
1406 SLP_TREE_SCALAR_OPS (child) = oprnd_info->ops;
1407 oprnd_info->ops = vNULL;
1408 ++this_tree_size;
1409 children.safe_push (child);
1410 continue;
1411 }
1412 }
1413
1414 oprnd_info->def_stmts = vNULL;
1415 children.safe_push (child);
1416 continue;
1417 }
1418
1419 /* If the SLP build failed fatally and we analyze a basic-block
1420 simply treat nodes we fail to build as externally defined
1421 (and thus build vectors from the scalar defs).
1422 The cost model will reject outright expensive cases.
1423 ??? This doesn't treat cases where permutation ultimately
1424 fails (or we don't try permutation below). Ideally we'd
1425 even compute a permutation that will end up with the maximum
1426 SLP tree size... */
1427 if (is_a <bb_vec_info> (vinfo)
1428 && !matches[0]
1429 /* ??? Rejecting patterns this way doesn't work. We'd have to
1430 do extra work to cancel the pattern so the uses see the
1431 scalar version. */
1432 && !is_pattern_stmt_p (stmt_info)
1433 && !oprnd_info->any_pattern
1434 && vect_update_all_shared_vectypes (vinfo, oprnd_info->def_stmts))
1435 {
1436 if (dump_enabled_p ())
1437 dump_printf_loc (MSG_NOTE, vect_location,
1438 "Building vector operands from scalars\n");
1439 this_tree_size++;
1440 child = vect_create_new_slp_node (oprnd_info->def_stmts, 0);
1441 SLP_TREE_DEF_TYPE (child) = vect_external_def;
1442 SLP_TREE_SCALAR_OPS (child) = oprnd_info->ops;
1443 children.safe_push (child);
1444 oprnd_info->ops = vNULL;
1445 oprnd_info->def_stmts = vNULL;
1446 continue;
1447 }
1448
1449 /* If the SLP build for operand zero failed and operand zero
1450 and one can be commuted, try that for the scalar stmts
1451 that failed the match. */
1452 if (i == 0
1453 /* A first scalar stmt mismatch signals a fatal mismatch. */
1454 && matches[0]
1455 /* ??? For COND_EXPRs we can swap the comparison operands
1456 as well as the arms under some constraints. */
1457 && nops == 2
1458 && oprnds_info[1]->first_dt == vect_internal_def
1459 && is_gimple_assign (stmt_info->stmt)
1460 /* Swapping operands for reductions breaks assumptions later on. */
1461 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
1462 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def
1463 /* Do so only if the number of unsuccessful permutes was not more
1464 than a cut-off, as re-trying the recursive match on
1465 possibly each level of the tree would expose exponential
1466 behavior. */
1467 && *npermutes < 4)
1468 {
1469 /* See whether we can swap the matching or the non-matching
1470 stmt operands. */
1471 bool swap_not_matching = true;
1472 do
1473 {
1474 for (j = 0; j < group_size; ++j)
1475 {
1476 if (matches[j] != !swap_not_matching)
1477 continue;
1478 stmt_vec_info stmt_info = stmts[j];
1479 /* Verify if we can swap operands of this stmt. */
1480 gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt);
1481 if (!stmt
1482 || !commutative_tree_code (gimple_assign_rhs_code (stmt)))
1483 {
1484 if (!swap_not_matching)
1485 goto fail;
1486 swap_not_matching = false;
1487 break;
1488 }
1489 }
1490 }
1491 while (j != group_size);
1492
1493 /* Swap mismatched definition stmts. */
1494 if (dump_enabled_p ())
1495 dump_printf_loc (MSG_NOTE, vect_location,
1496 "Re-trying with swapped operands of stmts ");
1497 for (j = 0; j < group_size; ++j)
1498 if (matches[j] == !swap_not_matching)
1499 {
1500 std::swap (oprnds_info[0]->def_stmts[j],
1501 oprnds_info[1]->def_stmts[j]);
1502 std::swap (oprnds_info[0]->ops[j],
1503 oprnds_info[1]->ops[j]);
1504 if (dump_enabled_p ())
1505 dump_printf (MSG_NOTE, "%d ", j);
1506 }
1507 if (dump_enabled_p ())
1508 dump_printf (MSG_NOTE, "\n");
1509 /* And try again with scratch 'matches' ... */
1510 bool *tem = XALLOCAVEC (bool, group_size);
1511 if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
1512 group_size, &this_max_nunits,
1513 tem, npermutes,
1514 &this_tree_size, bst_map)) != NULL)
1515 {
1516 /* If we have all children of a non-unary child built up from
1517 scalars then just throw that away and build this node up
1518 from scalars instead. */
1519 if (is_a <bb_vec_info> (vinfo)
1520 && SLP_TREE_CHILDREN (child).length () > 1
1521 /* ??? Rejecting patterns this way doesn't work. We'd have
1522 to do extra work to cancel the pattern so the uses see the
1523 scalar version. */
1524 && !oprnd_info->any_pattern)
1525 {
1526 unsigned int j;
1527 slp_tree grandchild;
1528
1529 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild)
1530 if (SLP_TREE_DEF_TYPE (grandchild) != vect_external_def)
1531 break;
1532 if (!grandchild
1533 && (vect_update_all_shared_vectypes
1534 (vinfo, oprnd_info->def_stmts)))
1535 {
1536 /* Roll back. */
1537 this_tree_size = old_tree_size;
1538 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild)
1539 vect_free_slp_tree (grandchild, false);
1540 SLP_TREE_CHILDREN (child).truncate (0);
1541
1542 if (dump_enabled_p ())
1543 dump_printf_loc (MSG_NOTE, vect_location,
1544 "Building parent vector operands from "
1545 "scalars instead\n");
1546 oprnd_info->def_stmts = vNULL;
1547 SLP_TREE_DEF_TYPE (child) = vect_external_def;
1548 SLP_TREE_SCALAR_OPS (child) = oprnd_info->ops;
1549 oprnd_info->ops = vNULL;
1550 ++this_tree_size;
1551 children.safe_push (child);
1552 continue;
1553 }
1554 }
1555
1556 oprnd_info->def_stmts = vNULL;
1557 children.safe_push (child);
1558 continue;
1559 }
1560
1561 ++*npermutes;
1562 }
1563
1564 fail:
1565 gcc_assert (child == NULL);
1566 FOR_EACH_VEC_ELT (children, j, child)
1567 vect_free_slp_tree (child, false);
1568 vect_free_oprnd_info (oprnds_info);
1569 return NULL;
1570 }
1571
1572 vect_free_oprnd_info (oprnds_info);
1573
1574 *tree_size += this_tree_size + 1;
1575 *max_nunits = this_max_nunits;
1576
1577 node = vect_create_new_slp_node (stmts, nops);
1578 SLP_TREE_TWO_OPERATORS (node) = two_operators;
1579 SLP_TREE_CHILDREN (node).splice (children);
1580 return node;
1581 }
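
/* As an illustration (editor's sketch), for the basic-block group

     t0 = x + y[0];  a[0] = t0;
     t1 = x + y[1];  a[1] = t1;

   the root is the store group node; its child is a PLUS_EXPR node
   whose own children are an internal-def load node for y[] (with a
   load permutation) and an external-def node holding the scalar ops
   { x, x } to be built into an invariant vector.  */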
1582
1583 /* Dump a single SLP tree NODE. */
1584
1585 static void
1586 vect_print_slp_tree (dump_flags_t dump_kind, dump_location_t loc,
1587 slp_tree node)
1588 {
1589 unsigned i, j;
1590 slp_tree child;
1591 stmt_vec_info stmt_info;
1592 tree op;
1593
1594 dump_metadata_t metadata (dump_kind, loc.get_impl_location ());
1595 dump_user_location_t user_loc = loc.get_user_location ();
1596 dump_printf_loc (metadata, user_loc, "node%s %p (max_nunits=%u, refcnt=%u)\n",
1597 SLP_TREE_DEF_TYPE (node) == vect_external_def
1598 ? " (external)"
1599 : (SLP_TREE_DEF_TYPE (node) == vect_constant_def
1600 ? " (constant)"
1601 : ""), node,
1602 estimated_poly_value (node->max_nunits), node->refcnt);
1603 if (SLP_TREE_SCALAR_STMTS (node).exists ())
1604 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
1605 dump_printf_loc (metadata, user_loc, "\tstmt %u %G", i, stmt_info->stmt);
1606 else
1607 {
1608 dump_printf_loc (metadata, user_loc, "\t{ ");
1609 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
1610 dump_printf (metadata, "%T%s ", op,
1611 i < SLP_TREE_SCALAR_OPS (node).length () - 1 ? "," : "");
1612 dump_printf (metadata, "}\n");
1613 }
1614 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
1615 {
1616 dump_printf_loc (metadata, user_loc, "\tload permutation {");
1617 FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (node), i, j)
1618 dump_printf (dump_kind, " %u", j);
1619 dump_printf (dump_kind, " }\n");
1620 }
1621 if (SLP_TREE_CHILDREN (node).is_empty ())
1622 return;
1623 dump_printf_loc (metadata, user_loc, "\tchildren");
1624 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1625 dump_printf (dump_kind, " %p", (void *)child);
1626 dump_printf (dump_kind, "\n");
1627 }
1628
1629 DEBUG_FUNCTION void
1630 debug (slp_tree node)
1631 {
1632 debug_dump_context ctx;
1633 vect_print_slp_tree (MSG_NOTE,
1634 dump_location_t::from_location_t (UNKNOWN_LOCATION),
1635 node);
1636 }
1637
1638 /* Dump the SLP graph rooted at NODE using flags specified in DUMP_KIND. */
1639
1640 static void
1641 vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
1642 slp_tree node, hash_set<slp_tree> &visited)
1643 {
1644 unsigned i;
1645 slp_tree child;
1646
1647 if (visited.add (node))
1648 return;
1649
1650 vect_print_slp_tree (dump_kind, loc, node);
1651
1652 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1653 vect_print_slp_graph (dump_kind, loc, child, visited);
1654 }
1655
1656 static void
1657 vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
1658 slp_tree entry)
1659 {
1660 hash_set<slp_tree> visited;
1661 vect_print_slp_graph (dump_kind, loc, entry, visited);
1662 }
1663
1664 /* Mark the tree rooted at NODE with PURE_SLP. */
1665
1666 static void
1667 vect_mark_slp_stmts (slp_tree node, hash_set<slp_tree> &visited)
1668 {
1669 int i;
1670 stmt_vec_info stmt_info;
1671 slp_tree child;
1672
1673 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
1674 return;
1675
1676 if (visited.add (node))
1677 return;
1678
1679 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
1680 STMT_SLP_TYPE (stmt_info) = pure_slp;
1681
1682 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1683 vect_mark_slp_stmts (child, visited);
1684 }
1685
1686 static void
1687 vect_mark_slp_stmts (slp_tree node)
1688 {
1689 hash_set<slp_tree> visited;
1690 vect_mark_slp_stmts (node, visited);
1691 }
1692
1693 /* Mark the statements of the tree rooted at NODE as relevant (vect_used). */
1694
1695 static void
1696 vect_mark_slp_stmts_relevant (slp_tree node, hash_set<slp_tree> &visited)
1697 {
1698 int i;
1699 stmt_vec_info stmt_info;
1700 slp_tree child;
1701
1702 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
1703 return;
1704
1705 if (visited.add (node))
1706 return;
1707
1708 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
1709 {
1710 gcc_assert (!STMT_VINFO_RELEVANT (stmt_info)
1711 || STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope);
1712 STMT_VINFO_RELEVANT (stmt_info) = vect_used_in_scope;
1713 }
1714
1715 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1716 vect_mark_slp_stmts_relevant (child, visited);
1717 }
1718
1719 static void
1720 vect_mark_slp_stmts_relevant (slp_tree node)
1721 {
1722 hash_set<slp_tree> visited;
1723 vect_mark_slp_stmts_relevant (node, visited);
1724 }
1725
1726 /* Copy the SLP subtree rooted at NODE. */
1727
1728 static slp_tree
1729 slp_copy_subtree (slp_tree node, hash_map<slp_tree, slp_tree> &map)
1730 {
1731 unsigned i;
1732
1733 bool existed_p;
1734 slp_tree &copy_ref = map.get_or_insert (node, &existed_p);
1735 if (existed_p)
1736 return copy_ref;
1737
1738 copy_ref = new _slp_tree;
1739 slp_tree copy = copy_ref;
1740 SLP_TREE_DEF_TYPE (copy) = SLP_TREE_DEF_TYPE (node);
1741 SLP_TREE_VECTYPE (copy) = SLP_TREE_VECTYPE (node);
1742 copy->max_nunits = node->max_nunits;
1743 copy->refcnt = 0;
1744 if (SLP_TREE_SCALAR_STMTS (node).exists ())
1745 {
1746 SLP_TREE_SCALAR_STMTS (copy) = SLP_TREE_SCALAR_STMTS (node).copy ();
1747 stmt_vec_info stmt_info;
1748 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
1749 STMT_VINFO_NUM_SLP_USES (stmt_info)++;
1750 }
1751 if (SLP_TREE_SCALAR_OPS (node).exists ())
1752 SLP_TREE_SCALAR_OPS (copy) = SLP_TREE_SCALAR_OPS (node).copy ();
1753 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
1754 SLP_TREE_LOAD_PERMUTATION (copy) = SLP_TREE_LOAD_PERMUTATION (node).copy ();
1755 if (SLP_TREE_CHILDREN (node).exists ())
1756 SLP_TREE_CHILDREN (copy) = SLP_TREE_CHILDREN (node).copy ();
1757 gcc_assert (!SLP_TREE_VEC_STMTS (node).exists ());
1758
1759 slp_tree child;
1760 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (copy), i, child)
1761 {
1762 SLP_TREE_CHILDREN (copy)[i] = slp_copy_subtree (child, map);
1763 SLP_TREE_CHILDREN (copy)[i]->refcnt++;
1764 }
1765 return copy;
1766 }
1767
1768 /* Rearrange the statements of NODE according to PERMUTATION. */
1769
1770 static void
1771 vect_slp_rearrange_stmts (slp_tree node, unsigned int group_size,
1772 vec<unsigned> permutation,
1773 hash_set<slp_tree> &visited)
1774 {
1775 unsigned int i;
1776 slp_tree child;
1777
1778 if (visited.add (node))
1779 return;
1780
1781 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1782 vect_slp_rearrange_stmts (child, group_size, permutation, visited);
1783
1784 if (SLP_TREE_SCALAR_STMTS (node).exists ())
1785 {
1786 gcc_assert (group_size == SLP_TREE_SCALAR_STMTS (node).length ());
1787 /* ??? Computation nodes are isomorphic and need no rearrangement.
1788 This is a quick hack to cover those where rearrangement breaks
1789 semantics because only the first stmt is guaranteed to have the
1790 correct operation code due to others being swapped or inverted. */
1791 stmt_vec_info first = SLP_TREE_SCALAR_STMTS (node)[0];
1792 if (is_gimple_assign (first->stmt)
1793 && gimple_assign_rhs_code (first->stmt) == COND_EXPR)
1794 return;
1795 vec<stmt_vec_info> tmp_stmts;
1796 tmp_stmts.create (group_size);
1797 tmp_stmts.quick_grow (group_size);
1798 stmt_vec_info stmt_info;
1799 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
1800 tmp_stmts[permutation[i]] = stmt_info;
1801 SLP_TREE_SCALAR_STMTS (node).release ();
1802 SLP_TREE_SCALAR_STMTS (node) = tmp_stmts;
1803 }
1804 if (SLP_TREE_SCALAR_OPS (node).exists ())
1805 {
1806 gcc_assert (group_size == SLP_TREE_SCALAR_OPS (node).length ());
1807 vec<tree> tmp_ops;
1808 tmp_ops.create (group_size);
1809 tmp_ops.quick_grow (group_size);
1810 tree op;
1811 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
1812 tmp_ops[permutation[i]] = op;
1813 SLP_TREE_SCALAR_OPS (node).release ();
1814 SLP_TREE_SCALAR_OPS (node) = tmp_ops;
1815 }
1816 }
1817
1818
1819 /* Attempt to reorder stmts in a reduction chain so that we don't
1820 require any load permutation. Return true if that was possible,
1821 otherwise return false. */
1822
1823 static bool
1824 vect_attempt_slp_rearrange_stmts (slp_instance slp_instn)
1825 {
1826 unsigned int i, j;
1827 unsigned int lidx;
1828 slp_tree node, load;
1829
1830 if (SLP_INSTANCE_LOADS (slp_instn).is_empty ())
1831 return false;
1832
1833 /* Compare all the permutation sequences to the first one. We know
1834 that at least one load is permuted. */
1835 node = SLP_INSTANCE_LOADS (slp_instn)[0];
1836 if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
1837 return false;
1838 unsigned int group_size = SLP_TREE_LOAD_PERMUTATION (node).length ();
1839 for (i = 1; SLP_INSTANCE_LOADS (slp_instn).iterate (i, &load); ++i)
1840 {
1841 if (!SLP_TREE_LOAD_PERMUTATION (load).exists ()
1842 || SLP_TREE_LOAD_PERMUTATION (load).length () != group_size)
1843 return false;
1844 FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (load), j, lidx)
1845 if (lidx != SLP_TREE_LOAD_PERMUTATION (node)[j])
1846 return false;
1847 }
1848
1849 /* Check that the loads in the first sequence are different and there
1850 are no gaps between them. */
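/* For example (illustrative values, not from the sources): with
   GROUP_SIZE == 4 a shared permutation { 1, 0, 3, 2 } covers every
   index exactly once and is accepted, while { 0, 0, 2, 3 } is rejected
   because index 0 is duplicated and index 1 is never loaded.  */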
1851 auto_sbitmap load_index (group_size);
1852 bitmap_clear (load_index);
1853 FOR_EACH_VEC_ELT (node->load_permutation, i, lidx)
1854 {
1855 if (lidx >= group_size)
1856 return false;
1857 if (bitmap_bit_p (load_index, lidx))
1858 return false;
1859
1860 bitmap_set_bit (load_index, lidx);
1861 }
1862 for (i = 0; i < group_size; i++)
1863 if (!bitmap_bit_p (load_index, i))
1864 return false;
1865
1866 /* This permutation is valid for reduction. Since the order of the
1867 statements in the nodes is not important unless they are memory
1868 accesses, we can rearrange the statements in all the nodes
1869 according to the order of the loads. */
1870
1871 /* We have to unshare the SLP tree we modify. */
1872 hash_map<slp_tree, slp_tree> map;
1873 slp_tree unshared = slp_copy_subtree (SLP_INSTANCE_TREE (slp_instn), map);
1874 vect_free_slp_tree (SLP_INSTANCE_TREE (slp_instn), false);
1875 unshared->refcnt++;
1876 SLP_INSTANCE_TREE (slp_instn) = unshared;
1877 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
1878 SLP_INSTANCE_LOADS (slp_instn)[i] = *map.get (node);
1879 node = SLP_INSTANCE_LOADS (slp_instn)[0];
1880
1881 /* Do the actual re-arrangement. */
1882 hash_set<slp_tree> visited;
1883 vect_slp_rearrange_stmts (SLP_INSTANCE_TREE (slp_instn), group_size,
1884 node->load_permutation, visited);
1885
1886 /* We are done, no actual permutations need to be generated. */
1887 poly_uint64 unrolling_factor = SLP_INSTANCE_UNROLLING_FACTOR (slp_instn);
1888 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
1889 {
1890 stmt_vec_info first_stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
1891 first_stmt_info = DR_GROUP_FIRST_ELEMENT (first_stmt_info);
1892 /* But we have to keep those permutations that are required because
1893 of handling of gaps. */
1894 if (known_eq (unrolling_factor, 1U)
1895 || (group_size == DR_GROUP_SIZE (first_stmt_info)
1896 && DR_GROUP_GAP (first_stmt_info) == 0))
1897 SLP_TREE_LOAD_PERMUTATION (node).release ();
1898 else
1899 for (j = 0; j < SLP_TREE_LOAD_PERMUTATION (node).length (); ++j)
1900 SLP_TREE_LOAD_PERMUTATION (node)[j] = j;
1901 }
1902
1903 return true;
1904 }
1905
1906 /* Gather loads in the SLP graph rooted at NODE and populate the LOADS array. */

1907
1908 static void
1909 vect_gather_slp_loads (vec<slp_tree> &loads, slp_tree node,
1910 hash_set<slp_tree> &visited)
1911 {
1912 if (visited.add (node))
1913 return;
1914
1915 if (SLP_TREE_CHILDREN (node).length () == 0)
1916 {
1917 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
1918 return;
1919 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
1920 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1921 && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1922 loads.safe_push (node);
1923 }
1924 else
1925 {
1926 unsigned i;
1927 slp_tree child;
1928 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1929 vect_gather_slp_loads (loads, child, visited);
1930 }
1931 }
1932
1933 static void
1934 vect_gather_slp_loads (slp_instance inst, slp_tree node)
1935 {
1936 hash_set<slp_tree> visited;
1937 vect_gather_slp_loads (SLP_INSTANCE_LOADS (inst), node, visited);
1938 }
1939
1940
1941 /* Find the last scalar statement in the SLP tree NODE. */
1942
1943 stmt_vec_info
1944 vect_find_last_scalar_stmt_in_slp (slp_tree node)
1945 {
1946 stmt_vec_info last = NULL;
1947 stmt_vec_info stmt_vinfo;
1948
1949 for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
1950 {
1951 stmt_vinfo = vect_orig_stmt (stmt_vinfo);
1952 last = last ? get_later_stmt (stmt_vinfo, last) : stmt_vinfo;
1953 }
1954
1955 return last;
1956 }
1957
1958 /* Splits a group of stores, currently beginning at FIRST_VINFO, into
1959 two groups: one (still beginning at FIRST_VINFO) of size GROUP1_SIZE
1960 (also containing the first GROUP1_SIZE stmts, since stores are
1961 consecutive), the second containing the remainder.
1962 Return the first stmt in the second group. */
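/* An illustrative example (values assumed): splitting a group of 8
   consecutive stores S1..S8 with GROUP1_SIZE == 3 leaves S1..S3 as the
   first group and makes S4 head the second group of size 5; the second
   group's DR_GROUP_GAP becomes the original gap plus 3 and the first
   group's gap grows by 5, so each group skips over the other.  */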
1963
1964 static stmt_vec_info
1965 vect_split_slp_store_group (stmt_vec_info first_vinfo, unsigned group1_size)
1966 {
1967 gcc_assert (DR_GROUP_FIRST_ELEMENT (first_vinfo) == first_vinfo);
1968 gcc_assert (group1_size > 0);
1969 int group2_size = DR_GROUP_SIZE (first_vinfo) - group1_size;
1970 gcc_assert (group2_size > 0);
1971 DR_GROUP_SIZE (first_vinfo) = group1_size;
1972
1973 stmt_vec_info stmt_info = first_vinfo;
1974 for (unsigned i = group1_size; i > 1; i--)
1975 {
1976 stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info);
1977 gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
1978 }
1979 /* STMT is now the last element of the first group. */
1980 stmt_vec_info group2 = DR_GROUP_NEXT_ELEMENT (stmt_info);
1981 DR_GROUP_NEXT_ELEMENT (stmt_info) = 0;
1982
1983 DR_GROUP_SIZE (group2) = group2_size;
1984 for (stmt_info = group2; stmt_info;
1985 stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info))
1986 {
1987 DR_GROUP_FIRST_ELEMENT (stmt_info) = group2;
1988 gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
1989 }
1990
1991 /* For the second group, the DR_GROUP_GAP is that before the original group,
1992 plus skipping over the first group's GROUP1_SIZE elements. */
1993 DR_GROUP_GAP (group2) = DR_GROUP_GAP (first_vinfo) + group1_size;
1994
1995 /* DR_GROUP_GAP of the first group now has to skip over the second group too. */
1996 DR_GROUP_GAP (first_vinfo) += group2_size;
1997
1998 if (dump_enabled_p ())
1999 dump_printf_loc (MSG_NOTE, vect_location, "Split group into %d and %d\n",
2000 group1_size, group2_size);
2001
2002 return group2;
2003 }
2004
2005 /* Calculate the unrolling factor for an SLP instance with GROUP_SIZE
2006 statements and a vector of NUNITS elements. */
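/* A worked example (illustrative): with NUNITS == 4 and GROUP_SIZE == 6
   the least common multiple is 12, giving an unrolling factor of
   12 / 6 == 2, i.e. two copies of the group fill whole vectors; with
   GROUP_SIZE == 8 the factor is 8 / 8 == 1.  */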
2007
2008 static poly_uint64
2009 calculate_unrolling_factor (poly_uint64 nunits, unsigned int group_size)
2010 {
2011 return exact_div (common_multiple (nunits, group_size), group_size);
2012 }
2013
2014 /* Analyze an SLP instance starting from a group of grouped stores. Call
2015 vect_build_slp_tree to build a tree of packed stmts if possible.
2016 Return FALSE if it's impossible to SLP any stmt in the loop. */
2017
2018 static bool
2019 vect_analyze_slp_instance (vec_info *vinfo,
2020 scalar_stmts_to_slp_tree_map_t *bst_map,
2021 stmt_vec_info stmt_info, unsigned max_tree_size)
2022 {
2023 slp_instance new_instance;
2024 slp_tree node;
2025 unsigned int group_size;
2026 tree vectype, scalar_type = NULL_TREE;
2027 unsigned int i;
2028 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
2029 vec<stmt_vec_info> scalar_stmts;
2030 bool constructor = false;
2031
2032 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2033 {
2034 scalar_type = TREE_TYPE (DR_REF (dr));
2035 group_size = DR_GROUP_SIZE (stmt_info);
2036 vectype = get_vectype_for_scalar_type (vinfo, scalar_type, group_size);
2037 }
2038 else if (!dr && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
2039 {
2040 gcc_assert (is_a <loop_vec_info> (vinfo));
2041 vectype = STMT_VINFO_VECTYPE (stmt_info);
2042 group_size = REDUC_GROUP_SIZE (stmt_info);
2043 }
2044 else if (is_gimple_assign (stmt_info->stmt)
2045 && gimple_assign_rhs_code (stmt_info->stmt) == CONSTRUCTOR)
2046 {
2047 vectype = TREE_TYPE (gimple_assign_rhs1 (stmt_info->stmt));
2048 group_size = CONSTRUCTOR_NELTS (gimple_assign_rhs1 (stmt_info->stmt));
2049 constructor = true;
2050 }
2051 else
2052 {
2053 gcc_assert (is_a <loop_vec_info> (vinfo));
2054 vectype = STMT_VINFO_VECTYPE (stmt_info);
2055 group_size = as_a <loop_vec_info> (vinfo)->reductions.length ();
2056 }
2057
2058 if (!vectype)
2059 {
2060 if (dump_enabled_p ())
2061 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2062 "Build SLP failed: unsupported data-type %T\n",
2063 scalar_type);
2064
2065 return false;
2066 }
2067 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
2068
2069 /* Create a node (a root of the SLP tree) for the packed grouped stores. */
2070 scalar_stmts.create (group_size);
2071 stmt_vec_info next_info = stmt_info;
2072 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2073 {
2074 /* Collect the stores and store them in SLP_TREE_SCALAR_STMTS. */
2075 while (next_info)
2076 {
2077 scalar_stmts.safe_push (vect_stmt_to_vectorize (next_info));
2078 next_info = DR_GROUP_NEXT_ELEMENT (next_info);
2079 }
2080 }
2081 else if (!dr && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
2082 {
2083 /* Collect the reduction stmts and store them in
2084 SLP_TREE_SCALAR_STMTS. */
2085 while (next_info)
2086 {
2087 scalar_stmts.safe_push (vect_stmt_to_vectorize (next_info));
2088 next_info = REDUC_GROUP_NEXT_ELEMENT (next_info);
2089 }
2090 /* Mark the first element of the reduction chain as reduction to properly
2091 transform the node. In the reduction analysis phase only the last
2092 element of the chain is marked as reduction. */
2093 STMT_VINFO_DEF_TYPE (stmt_info)
2094 = STMT_VINFO_DEF_TYPE (scalar_stmts.last ());
2095 STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))
2096 = STMT_VINFO_REDUC_DEF (vect_orig_stmt (scalar_stmts.last ()));
2097 }
2098 else if (constructor)
2099 {
2100 tree rhs = gimple_assign_rhs1 (stmt_info->stmt);
2101 tree val;
2102 FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), i, val)
2103 {
2104 if (TREE_CODE (val) == SSA_NAME)
2105 {
2106 gimple* def = SSA_NAME_DEF_STMT (val);
2107 stmt_vec_info def_info = vinfo->lookup_stmt (def);
2108 /* Value is defined in another basic block. */
2109 if (!def_info)
2110 return false;
2111 def_info = vect_stmt_to_vectorize (def_info);
2112 scalar_stmts.safe_push (def_info);
2113 }
2114 else
2115 return false;
2116 }
2117 if (dump_enabled_p ())
2118 dump_printf_loc (MSG_NOTE, vect_location,
2119 "Analyzing vectorizable constructor: %G\n",
2120 stmt_info->stmt);
2121 }
2122 else
2123 {
2124 /* Collect reduction statements. */
2125 vec<stmt_vec_info> reductions = as_a <loop_vec_info> (vinfo)->reductions;
2126 for (i = 0; reductions.iterate (i, &next_info); i++)
2127 scalar_stmts.safe_push (next_info);
2128 }
2129
2130 /* Build the tree for the SLP instance. */
2131 bool *matches = XALLOCAVEC (bool, group_size);
2132 unsigned npermutes = 0;
2133 poly_uint64 max_nunits = nunits;
2134 unsigned tree_size = 0;
2135 node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
2136 &max_nunits, matches, &npermutes,
2137 &tree_size, bst_map);
2138 if (node != NULL)
2139 {
2140 /* Calculate the unrolling factor based on the smallest type. */
2141 poly_uint64 unrolling_factor
2142 = calculate_unrolling_factor (max_nunits, group_size);
2143
2144 if (maybe_ne (unrolling_factor, 1U)
2145 && is_a <bb_vec_info> (vinfo))
2146 {
2147 unsigned HOST_WIDE_INT const_max_nunits;
2148 if (!max_nunits.is_constant (&const_max_nunits)
2149 || const_max_nunits > group_size)
2150 {
2151 if (dump_enabled_p ())
2152 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2153 "Build SLP failed: store group "
2154 "size not a multiple of the vector size "
2155 "in basic block SLP\n");
2156 vect_free_slp_tree (node, false);
2157 return false;
2158 }
2159 /* Fatal mismatch. */
2160 matches[group_size / const_max_nunits * const_max_nunits] = false;
2161 vect_free_slp_tree (node, false);
2162 }
2163 else
2164 {
2165 /* Create a new SLP instance. */
2166 new_instance = XNEW (class _slp_instance);
2167 SLP_INSTANCE_TREE (new_instance) = node;
2168 SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor;
2169 SLP_INSTANCE_LOADS (new_instance) = vNULL;
2170 SLP_INSTANCE_ROOT_STMT (new_instance) = constructor ? stmt_info : NULL;
2171
2172 vect_gather_slp_loads (new_instance, node);
2173 if (dump_enabled_p ())
2174 dump_printf_loc (MSG_NOTE, vect_location,
2175 "SLP size %u vs. limit %u.\n",
2176 tree_size, max_tree_size);
2177
2178 /* Check whether any load is possibly permuted. */
2179 slp_tree load_node;
2180 bool loads_permuted = false;
2181 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (new_instance), i, load_node)
2182 {
2183 if (!SLP_TREE_LOAD_PERMUTATION (load_node).exists ())
2184 continue;
2185 unsigned j;
2186 stmt_vec_info load_info;
2187 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load_info)
2188 if (SLP_TREE_LOAD_PERMUTATION (load_node)[j] != j)
2189 {
2190 loads_permuted = true;
2191 break;
2192 }
2193 }
2194
2195 /* If the loads and stores can be handled with load/store-lane
2196 instructions do not generate this SLP instance. */
2197 if (is_a <loop_vec_info> (vinfo)
2198 && loads_permuted
2199 && dr && vect_store_lanes_supported (vectype, group_size, false))
2200 {
2201 slp_tree load_node;
2202 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (new_instance), i, load_node)
2203 {
2204 stmt_vec_info stmt_vinfo = DR_GROUP_FIRST_ELEMENT
2205 (SLP_TREE_SCALAR_STMTS (load_node)[0]);
2206 /* Use SLP for strided accesses (or if we can't load-lanes). */
2207 if (STMT_VINFO_STRIDED_P (stmt_vinfo)
2208 || ! vect_load_lanes_supported
2209 (STMT_VINFO_VECTYPE (stmt_vinfo),
2210 DR_GROUP_SIZE (stmt_vinfo), false))
2211 break;
2212 }
2213 if (i == SLP_INSTANCE_LOADS (new_instance).length ())
2214 {
2215 if (dump_enabled_p ())
2216 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2217 "Built SLP cancelled: can use "
2218 "load/store-lanes\n");
2219 vect_free_slp_instance (new_instance, false);
2220 return false;
2221 }
2222 }
2223
2224 /* If this is a reduction chain with a conversion in front
2225 amend the SLP tree with a node for that. */
2226 if (!dr
2227 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
2228 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def)
2229 {
2230 /* Get at the conversion stmt - we know it's the single use
2231 of the last stmt of the reduction chain. */
2232 gimple *tem = vect_orig_stmt (scalar_stmts[group_size - 1])->stmt;
2233 use_operand_p use_p;
2234 gimple *use_stmt;
2235 bool r = single_imm_use (gimple_assign_lhs (tem),
2236 &use_p, &use_stmt);
2237 gcc_assert (r);
2238 next_info = vinfo->lookup_stmt (use_stmt);
2239 next_info = vect_stmt_to_vectorize (next_info);
2240 scalar_stmts = vNULL;
2241 scalar_stmts.create (group_size);
2242 for (unsigned i = 0; i < group_size; ++i)
2243 scalar_stmts.quick_push (next_info);
2244 slp_tree conv = vect_create_new_slp_node (scalar_stmts, 1);
2245 SLP_TREE_CHILDREN (conv).quick_push (node);
2246 SLP_INSTANCE_TREE (new_instance) = conv;
2247 /* We also have to fake this conversion stmt as SLP reduction
2248 group so we don't have to mess with too much code
2249 elsewhere. */
2250 REDUC_GROUP_FIRST_ELEMENT (next_info) = next_info;
2251 REDUC_GROUP_NEXT_ELEMENT (next_info) = NULL;
2252 }
2253
2254 vinfo->slp_instances.safe_push (new_instance);
2255
2256 /* ??? We've replaced the old SLP_INSTANCE_GROUP_SIZE with
2257 the number of scalar stmts in the root in a few places.
2258 Verify that assumption holds. */
2259 gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
2260 .length () == group_size);
2261
2262 if (dump_enabled_p ())
2263 {
2264 dump_printf_loc (MSG_NOTE, vect_location,
2265 "Final SLP tree for instance:\n");
2266 vect_print_slp_graph (MSG_NOTE, vect_location,
2267 SLP_INSTANCE_TREE (new_instance));
2268 }
2269
2270 return true;
2271 }
2272 }
2273 else
2274 {
2275 /* Failed to SLP. */
2276 /* Free the allocated memory. */
2277 scalar_stmts.release ();
2278 }
2279
2280 /* For basic block SLP, try to break the group up into multiples of the
2281 vector size. */
2282 unsigned HOST_WIDE_INT const_nunits;
2283 if (is_a <bb_vec_info> (vinfo)
2284 && STMT_VINFO_GROUPED_ACCESS (stmt_info)
2285 && DR_GROUP_FIRST_ELEMENT (stmt_info)
2286 && nunits.is_constant (&const_nunits))
2287 {
2288 /* We consider breaking the group only on VF boundaries from the existing
2289 start. */
2290 for (i = 0; i < group_size; i++)
2291 if (!matches[i]) break;
2292
2293 if (i >= const_nunits && i < group_size)
2294 {
2295 /* Split into two groups at the first vector boundary before i. */
2296 gcc_assert ((const_nunits & (const_nunits - 1)) == 0);
2297 unsigned group1_size = i & ~(const_nunits - 1);
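/* E.g. (illustrative) with CONST_NUNITS == 4 and the first mismatch at
   i == 6 this rounds down to GROUP1_SIZE == 4.  */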
2298
2299 stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
2300 group1_size);
2301 bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
2302 max_tree_size);
2303 /* If the first non-match was in the middle of a vector,
2304 skip the rest of that vector. */
2305 if (group1_size < i)
2306 {
2307 i = group1_size + const_nunits;
2308 if (i < group_size)
2309 rest = vect_split_slp_store_group (rest, const_nunits);
2310 }
2311 if (i < group_size)
2312 res |= vect_analyze_slp_instance (vinfo, bst_map,
2313 rest, max_tree_size);
2314 return res;
2315 }
2316 /* Even though the first vector did not all match, we might be able to SLP
2317 (some) of the remainder. FORNOW ignore this possibility. */
2318 }
2319
2320 return false;
2321 }
2322
2323
2324 /* Check if there are stmts in the loop that can be vectorized using SLP. Build SLP
2325 trees of packed scalar stmts if SLP is possible. */
2326
2327 opt_result
2328 vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
2329 {
2330 unsigned int i;
2331 stmt_vec_info first_element;
2332
2333 DUMP_VECT_SCOPE ("vect_analyze_slp");
2334
2335 scalar_stmts_to_slp_tree_map_t *bst_map
2336 = new scalar_stmts_to_slp_tree_map_t ();
2337
2338 /* Find SLP sequences starting from groups of grouped stores. */
2339 FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
2340 vect_analyze_slp_instance (vinfo, bst_map, first_element, max_tree_size);
2341
2342 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
2343 {
2344 if (loop_vinfo->reduction_chains.length () > 0)
2345 {
2346 /* Find SLP sequences starting from reduction chains. */
2347 FOR_EACH_VEC_ELT (loop_vinfo->reduction_chains, i, first_element)
2348 if (! vect_analyze_slp_instance (vinfo, bst_map, first_element,
2349 max_tree_size))
2350 {
2351 /* Dissolve reduction chain group. */
2352 stmt_vec_info vinfo = first_element;
2353 stmt_vec_info last = NULL;
2354 while (vinfo)
2355 {
2356 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
2357 REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
2358 REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
2359 last = vinfo;
2360 vinfo = next;
2361 }
2362 STMT_VINFO_DEF_TYPE (first_element) = vect_internal_def;
2363 /* It can still be vectorized as part of an SLP reduction. */
2364 loop_vinfo->reductions.safe_push (last);
2365 }
2366 }
2367
2368 /* Find SLP sequences starting from groups of reductions. */
2369 if (loop_vinfo->reductions.length () > 1)
2370 vect_analyze_slp_instance (vinfo, bst_map, loop_vinfo->reductions[0],
2371 max_tree_size);
2372 }
2373
2374 /* The map keeps a reference on SLP nodes built, release that. */
2375 for (scalar_stmts_to_slp_tree_map_t::iterator it = bst_map->begin ();
2376 it != bst_map->end (); ++it)
2377 if ((*it).second)
2378 vect_free_slp_tree ((*it).second, false);
2379 delete bst_map;
2380
2381 /* Optimize permutations in SLP reductions. */
2382 slp_instance instance;
2383 FOR_EACH_VEC_ELT (vinfo->slp_instances, i, instance)
2384 {
2385 slp_tree node = SLP_INSTANCE_TREE (instance);
2386 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2387 /* Reduction (there are no data-refs in the root).
2388 In a reduction chain the order of the loads is not important. */
2389 if (!STMT_VINFO_DATA_REF (stmt_info)
2390 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
2391 vect_attempt_slp_rearrange_stmts (instance);
2392 }
2393
2394 /* Gather all loads in the SLP graph. */
2395 hash_set<slp_tree> visited;
2396 FOR_EACH_VEC_ELT (vinfo->slp_instances, i, instance)
2397 vect_gather_slp_loads (vinfo->slp_loads, SLP_INSTANCE_TREE (instance),
2398 visited);
2399
2400 return opt_result::success ();
2401 }
2402
2403 void
2404 vect_optimize_slp (vec_info *vinfo)
2405 {
2406 slp_tree node;
2407 unsigned i;
2408 FOR_EACH_VEC_ELT (vinfo->slp_loads, i, node)
2409 {
2410 if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
2411 continue;
2412
2413 /* In basic block vectorization we allow any subchain of an interleaving
2414 chain.
2415 FORNOW: not in loop SLP because of realignment complications. */
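/* E.g. (illustrative): loads of the consecutive elements b, c out of an
   interleaving chain a, b, c, d form a subchain (each element follows
   its predecessor with gap 1) and the permutation can be dropped.  */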
2416 if (is_a <bb_vec_info> (vinfo))
2417 {
2418 bool subchain_p = true;
2419 stmt_vec_info next_load_info = NULL;
2420 stmt_vec_info load_info;
2421 unsigned j;
2422 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
2423 {
2424 if (j != 0
2425 && (next_load_info != load_info
2426 || DR_GROUP_GAP (load_info) != 1))
2427 {
2428 subchain_p = false;
2429 break;
2430 }
2431 next_load_info = DR_GROUP_NEXT_ELEMENT (load_info);
2432 }
2433 if (subchain_p)
2434 {
2435 SLP_TREE_LOAD_PERMUTATION (node).release ();
2436 continue;
2437 }
2438 }
2439 else
2440 {
2441 stmt_vec_info load_info;
2442 bool this_load_permuted = false;
2443 unsigned j;
2444 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
2445 if (SLP_TREE_LOAD_PERMUTATION (node)[j] != j)
2446 {
2447 this_load_permuted = true;
2448 break;
2449 }
2450 stmt_vec_info first_stmt_info
2451 = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
2452 if (!this_load_permuted
2453 /* The load requires permutation when unrolling exposes
2454 a gap either because the group is larger than the SLP
2455 group-size or because there is a gap between the groups. */
2456 && (known_eq (LOOP_VINFO_VECT_FACTOR (as_a <loop_vec_info> (vinfo)), 1U)
2457 || ((SLP_TREE_SCALAR_STMTS (node).length ()
2458 == DR_GROUP_SIZE (first_stmt_info))
2459 && DR_GROUP_GAP (first_stmt_info) == 0)))
2460 {
2461 SLP_TREE_LOAD_PERMUTATION (node).release ();
2462 continue;
2463 }
2464 }
2465 }
2466 }
2467
2468
2469 /* For each possible SLP instance decide whether to SLP it and calculate the
2470 overall unrolling factor needed to SLP the loop. Return TRUE if we decided
2471 to SLP at least one instance. */
2472
2473 bool
2474 vect_make_slp_decision (loop_vec_info loop_vinfo)
2475 {
2476 unsigned int i;
2477 poly_uint64 unrolling_factor = 1;
2478 vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
2479 slp_instance instance;
2480 int decided_to_slp = 0;
2481
2482 DUMP_VECT_SCOPE ("vect_make_slp_decision");
2483
2484 FOR_EACH_VEC_ELT (slp_instances, i, instance)
2485 {
2486 /* FORNOW: SLP if you can. */
2487 /* All unroll factors have the form:
2488
2489 GET_MODE_SIZE (vinfo->vector_mode) * X
2490
2491 for some rational X, so they must have a common multiple. */
2492 unrolling_factor
2493 = force_common_multiple (unrolling_factor,
2494 SLP_INSTANCE_UNROLLING_FACTOR (instance));
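/* E.g. (illustrative) combining instances with unrolling factors 2 and
   4 yields an overall unrolling factor of 4.  */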
2495
2496 /* Mark all the stmts that belong to INSTANCE as PURE_SLP stmts. Later we
2497 call vect_detect_hybrid_slp () to find stmts that need hybrid SLP and
2498 loop-based vectorization. Such stmts will be marked as HYBRID. */
2499 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
2500 decided_to_slp++;
2501 }
2502
2503 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo) = unrolling_factor;
2504
2505 if (decided_to_slp && dump_enabled_p ())
2506 {
2507 dump_printf_loc (MSG_NOTE, vect_location,
2508 "Decided to SLP %d instances. Unrolling factor ",
2509 decided_to_slp);
2510 dump_dec (MSG_NOTE, unrolling_factor);
2511 dump_printf (MSG_NOTE, "\n");
2512 }
2513
2514 return (decided_to_slp > 0);
2515 }
2516
2517 /* Private data for vect_detect_hybrid_slp. */
2518 struct vdhs_data
2519 {
2520 loop_vec_info loop_vinfo;
2521 vec<stmt_vec_info> *worklist;
2522 };
2523
2524 /* Walker for walk_gimple_op. */
2525
2526 static tree
2527 vect_detect_hybrid_slp (tree *tp, int *, void *data)
2528 {
2529 walk_stmt_info *wi = (walk_stmt_info *)data;
2530 vdhs_data *dat = (vdhs_data *)wi->info;
2531
2532 if (wi->is_lhs)
2533 return NULL_TREE;
2534
2535 stmt_vec_info def_stmt_info = dat->loop_vinfo->lookup_def (*tp);
2536 if (!def_stmt_info)
2537 return NULL_TREE;
2538 def_stmt_info = vect_stmt_to_vectorize (def_stmt_info);
2539 if (PURE_SLP_STMT (def_stmt_info))
2540 {
2541 if (dump_enabled_p ())
2542 dump_printf_loc (MSG_NOTE, vect_location, "marking hybrid: %G",
2543 def_stmt_info->stmt);
2544 STMT_SLP_TYPE (def_stmt_info) = hybrid;
2545 dat->worklist->safe_push (def_stmt_info);
2546 }
2547
2548 return NULL_TREE;
2549 }
2550
2551 /* Find stmts that must be both vectorized and SLPed. */
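/* E.g. (illustrative): if a def is computed inside an SLP tree but also
   feeds a statement that is only loop-vectorized, the def must be code
   generated both ways and is therefore marked hybrid.  */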
2552
2553 void
2554 vect_detect_hybrid_slp (loop_vec_info loop_vinfo)
2555 {
2556 DUMP_VECT_SCOPE ("vect_detect_hybrid_slp");
2557
2558 /* All stmts participating in SLP are marked pure_slp, all other
2559 stmts are loop_vect.
2560 First collect all loop_vect stmts into a worklist. */
2561 auto_vec<stmt_vec_info> worklist;
2562 for (unsigned i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2563 {
2564 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2565 for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
2566 gsi_next (&gsi))
2567 {
2568 gphi *phi = gsi.phi ();
2569 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (phi);
2570 if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
2571 worklist.safe_push (stmt_info);
2572 }
2573 for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
2574 gsi_next (&gsi))
2575 {
2576 gimple *stmt = gsi_stmt (gsi);
2577 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
2578 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2579 {
2580 for (gimple_stmt_iterator gsi2
2581 = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info));
2582 !gsi_end_p (gsi2); gsi_next (&gsi2))
2583 {
2584 stmt_vec_info patt_info
2585 = loop_vinfo->lookup_stmt (gsi_stmt (gsi2));
2586 if (!STMT_SLP_TYPE (patt_info)
2587 && STMT_VINFO_RELEVANT (patt_info))
2588 worklist.safe_push (patt_info);
2589 }
2590 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
2591 }
2592 if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
2593 worklist.safe_push (stmt_info);
2594 }
2595 }
2596
2597 /* Now we have a worklist of non-SLP stmts, follow use->def chains and
2598 mark any SLP vectorized stmt as hybrid.
2599 ??? We're visiting def stmts N times (once for each non-SLP and
2600 once for each hybrid-SLP use). */
2601 walk_stmt_info wi;
2602 vdhs_data dat;
2603 dat.worklist = &worklist;
2604 dat.loop_vinfo = loop_vinfo;
2605 memset (&wi, 0, sizeof (wi));
2606 wi.info = (void *)&dat;
2607 while (!worklist.is_empty ())
2608 {
2609 stmt_vec_info stmt_info = worklist.pop ();
2610 /* Since SSA operands are not set up for pattern stmts we need
2611 to use walk_gimple_op. */
2612 wi.is_lhs = 0;
2613 walk_gimple_op (stmt_info->stmt, vect_detect_hybrid_slp, &wi);
2614 }
2615 }
2616
2617
2618 /* Initialize a bb_vec_info struct for the statements between
2619 REGION_BEGIN_IN (inclusive) and REGION_END_IN (exclusive). */
2620
2621 _bb_vec_info::_bb_vec_info (gimple_stmt_iterator region_begin_in,
2622 gimple_stmt_iterator region_end_in,
2623 vec_info_shared *shared)
2624 : vec_info (vec_info::bb, init_cost (NULL), shared),
2625 bb (gsi_bb (region_begin_in)),
2626 region_begin (region_begin_in),
2627 region_end (region_end_in)
2628 {
2629 gimple_stmt_iterator gsi;
2630
2631 for (gsi = region_begin; gsi_stmt (gsi) != gsi_stmt (region_end);
2632 gsi_next (&gsi))
2633 {
2634 gimple *stmt = gsi_stmt (gsi);
2635 gimple_set_uid (stmt, 0);
2636 add_stmt (stmt);
2637 }
2638
2639 bb->aux = this;
2640 }
2641
2642
2643 /* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the
2644 stmts in the basic block. */
2645
2646 _bb_vec_info::~_bb_vec_info ()
2647 {
2648 for (gimple_stmt_iterator si = region_begin;
2649 gsi_stmt (si) != gsi_stmt (region_end); gsi_next (&si))
2650 /* Reset region marker. */
2651 gimple_set_uid (gsi_stmt (si), -1);
2652
2653 bb->aux = NULL;
2654 }
2655
2656 /* Subroutine of vect_slp_analyze_node_operations. Handle the root of NODE,
2657 given then that child nodes have already been processed, and that
2658 their def types currently match their SLP node's def type. */
2659
2660 static bool
2661 vect_slp_analyze_node_operations_1 (vec_info *vinfo, slp_tree node,
2662 slp_instance node_instance,
2663 stmt_vector_for_cost *cost_vec)
2664 {
2665 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2666 gcc_assert (STMT_SLP_TYPE (stmt_info) != loop_vect);
2667
2668 /* Calculate the number of vector statements to be created for the
2669 scalar stmts in this node. For SLP reductions it is equal to the
2670 number of vector statements in the children (which has already been
2671 calculated by the recursive call). Otherwise it is the number of
2672 scalar elements in one scalar iteration (DR_GROUP_SIZE) multiplied by
2673 VF divided by the number of elements in a vector. */
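/* For example (illustrative values): with VF == 4, a group of 2 scalar
   stmts and a V4SI vector type this computes 4 * 2 / 4 == 2 vector
   statements.  */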
2674 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2675 && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
2676 {
2677 for (unsigned i = 0; i < SLP_TREE_CHILDREN (node).length (); ++i)
2678 if (SLP_TREE_DEF_TYPE (SLP_TREE_CHILDREN (node)[i]) == vect_internal_def)
2679 {
2680 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
2681 = SLP_TREE_NUMBER_OF_VEC_STMTS (SLP_TREE_CHILDREN (node)[i]);
2682 break;
2683 }
2684 }
2685 else
2686 {
2687 poly_uint64 vf;
2688 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
2689 vf = loop_vinfo->vectorization_factor;
2690 else
2691 vf = 1;
2692 unsigned int group_size = SLP_TREE_SCALAR_STMTS (node).length ();
2693 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2694 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
2695 = vect_get_num_vectors (vf * group_size, vectype);
2696 }
2697
2698 bool dummy;
2699 return vect_analyze_stmt (vinfo, stmt_info, &dummy,
2700 node, node_instance, cost_vec);
2701 }
2702
2703 /* Try to build NODE from scalars, returning true on success.
2704 NODE_INSTANCE is the SLP instance that contains NODE. */
2705
2706 static bool
2707 vect_slp_convert_to_external (vec_info *vinfo, slp_tree node,
2708 slp_instance node_instance)
2709 {
2710 stmt_vec_info stmt_info;
2711 unsigned int i;
2712
2713 if (!is_a <bb_vec_info> (vinfo)
2714 || node == SLP_INSTANCE_TREE (node_instance)
2715 || vect_contains_pattern_stmt_p (SLP_TREE_SCALAR_STMTS (node)))
2716 return false;
2717
2718 if (dump_enabled_p ())
2719 dump_printf_loc (MSG_NOTE, vect_location,
2720 "Building vector operands from scalars instead\n");
2721
2722 /* Don't remove and free the child nodes here, since they could be
2723 referenced by other structures. The analysis and scheduling phases
2724 (need to) ignore child nodes of anything that isn't vect_internal_def. */
2725 unsigned int group_size = SLP_TREE_SCALAR_STMTS (node).length ();
2726 SLP_TREE_DEF_TYPE (node) = vect_external_def;
2727 SLP_TREE_SCALAR_OPS (node).safe_grow (group_size);
2728 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2729 {
2730 tree lhs = gimple_get_lhs (vect_orig_stmt (stmt_info)->stmt);
2731 SLP_TREE_SCALAR_OPS (node)[i] = lhs;
2732 }
2733 return true;
2734 }
2735
2736 /* Compute the prologue cost for invariant or constant operands represented
2737 by NODE. */
2738
2739 static void
2740 vect_prologue_cost_for_slp (vec_info *vinfo,
2741 slp_tree node,
2742 stmt_vector_for_cost *cost_vec)
2743 {
2744 /* Without looking at the actual initializer a vector of
2745 constants can be implemented as a load from the constant pool.
2746 When all elements are the same we can use a splat. */
2747 tree vectype = SLP_TREE_VECTYPE (node);
2748 /* ??? Ideally we'd want all invariant nodes to have a vectype. */
2749 if (!vectype)
2750 vectype = get_vectype_for_scalar_type (vinfo,
2751 TREE_TYPE (SLP_TREE_SCALAR_OPS
2752 (node)[0]), node);
2753 unsigned group_size = SLP_TREE_SCALAR_OPS (node).length ();
2754 unsigned num_vects_to_check;
2755 unsigned HOST_WIDE_INT const_nunits;
2756 unsigned nelt_limit;
2757 if (TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits)
2758 && ! multiple_p (const_nunits, group_size))
2759 {
2760 num_vects_to_check = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
2761 nelt_limit = const_nunits;
2762 }
2763 else
2764 {
2765 /* If either the vector has variable length or the vectors
2766 are composed of repeated whole groups we only need to
2767 cost construction once. All vectors will be the same. */
2768 num_vects_to_check = 1;
2769 nelt_limit = group_size;
2770 }
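/* E.g. (illustrative) with GROUP_SIZE == 2 and V4SI every vector is the
   repeated whole group { a, b, a, b }, so one construction cost
   suffices; with GROUP_SIZE == 3 and V4SI the vectors differ and each
   of the SLP_TREE_NUMBER_OF_VEC_STMTS vectors is costed.  */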
2771 tree elt = NULL_TREE;
2772 unsigned nelt = 0;
2773 for (unsigned j = 0; j < num_vects_to_check * nelt_limit; ++j)
2774 {
2775 unsigned si = j % group_size;
2776 if (nelt == 0)
2777 elt = SLP_TREE_SCALAR_OPS (node)[si];
2778 /* ??? We're just tracking whether all operands of a single
2779 vector initializer are the same, ideally we'd check if
2780 we emitted the same one already. */
2781 else if (elt != SLP_TREE_SCALAR_OPS (node)[si])
2782 elt = NULL_TREE;
2783 nelt++;
2784 if (nelt == nelt_limit)
2785 {
2786 record_stmt_cost (cost_vec, 1,
2787 SLP_TREE_DEF_TYPE (node) == vect_external_def
2788 ? (elt ? scalar_to_vec : vec_construct)
2789 : vector_load,
2790 NULL, vectype, 0, vect_prologue);
2791 nelt = 0;
2792 }
2793 }
2794 }
2795
2796 /* Analyze statements contained in SLP tree NODE after recursively analyzing
2797 the subtree. NODE_INSTANCE contains NODE and VINFO contains NODE_INSTANCE.
2798
2799 Return true if the operations are supported. */
2800
2801 static bool
2802 vect_slp_analyze_node_operations (vec_info *vinfo, slp_tree node,
2803 slp_instance node_instance,
2804 hash_set<slp_tree> &visited,
2805 hash_set<slp_tree> &lvisited,
2806 stmt_vector_for_cost *cost_vec)
2807 {
2808 int i, j;
2809 slp_tree child;
2810
2811 /* Assume we can code-generate all invariants. */
2812 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2813 return true;
2814
2815 /* If we already analyzed the exact same set of scalar stmts we're done.
2816 We share the generated vector stmts for those.
2817 The SLP graph is acyclic so not caching whether we failed or succeeded
2818 doesn't result in any issue since we throw away the lvisited set
2819 when we fail. */
2820 if (visited.contains (node)
2821 || lvisited.add (node))
2822 return true;
2823
2824 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2825 if (!vect_slp_analyze_node_operations (vinfo, child, node_instance,
2826 visited, lvisited, cost_vec))
2827 return false;
2828
2829 /* ??? We have to catch late the case where the same first scalar stmt
2830 appears in multiple SLP children with different def types, and fail.
2831 Remember the original def types first since SLP_TREE_DEF_TYPE doesn't
2832 necessarily match it when that is vect_internal_def. */
2833 auto_vec<vect_def_type, 4> dt;
2834 dt.safe_grow (SLP_TREE_CHILDREN (node).length ());
2835 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
2836 if (SLP_TREE_SCALAR_STMTS (child).length () != 0)
2837 dt[j] = STMT_VINFO_DEF_TYPE (SLP_TREE_SCALAR_STMTS (child)[0]);
2838
2839 /* Push SLP node def-type to stmt operands. */
2840 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
2841 if (SLP_TREE_DEF_TYPE (child) != vect_internal_def
2842 && SLP_TREE_SCALAR_STMTS (child).length () != 0)
2843 STMT_VINFO_DEF_TYPE (SLP_TREE_SCALAR_STMTS (child)[0])
2844 = SLP_TREE_DEF_TYPE (child);
2845
2846 /* Check everything worked out. */
2847 bool res = true;
2848 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
2849 if (SLP_TREE_SCALAR_STMTS (child).length () != 0)
2850 {
2851 if (SLP_TREE_DEF_TYPE (child) != vect_internal_def)
2852 {
2853 if (STMT_VINFO_DEF_TYPE (SLP_TREE_SCALAR_STMTS (child)[0])
2854 != SLP_TREE_DEF_TYPE (child))
2855 res = false;
2856 }
2857 else if (STMT_VINFO_DEF_TYPE (SLP_TREE_SCALAR_STMTS (child)[0])
2858 != dt[j])
2859 res = false;
2860 }
2861 if (!res && dump_enabled_p ())
2862 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2863 "not vectorized: same operand with different "
2864 "def type in stmt.\n");
2865
2866 if (res)
2867 res = vect_slp_analyze_node_operations_1 (vinfo, node, node_instance,
2868 cost_vec);
2869
2870 /* Restore def-types. */
2871 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
2872 if (SLP_TREE_SCALAR_STMTS (child).length () != 0)
2873 STMT_VINFO_DEF_TYPE (SLP_TREE_SCALAR_STMTS (child)[0]) = dt[j];
2874
2875 /* When the node can be vectorized, cost the invariant nodes it references.
2876 This is not done in DFS order so that the referring node's
2877 vectorizable_* calls can nail down the invariant node's vector type
2878 and possibly unshare it if it needs a different vector type than
2879 other referrers. */
2880 if (res)
2881 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
2882 if ((SLP_TREE_DEF_TYPE (child) == vect_constant_def
2883 || SLP_TREE_DEF_TYPE (child) == vect_external_def)
2884 /* Perform usual caching, note code-generation still
2885 code-gens these nodes multiple times but we expect
2886 to CSE them later. */
2887 && !visited.contains (child)
2888 && !lvisited.add (child))
2889 {
2890 /* ??? After auditing more code paths make a "default"
2891 and push the vector type from NODE to all children
2892 if it is not already set. */
2893 /* Compute the number of vectors to be generated. */
2894 tree vector_type = SLP_TREE_VECTYPE (child);
2895 if (!vector_type)
2896 {
2897 /* For shifts with a scalar argument we don't need
2898 to cost or code-generate anything.
2899 ??? Represent this more explicitly. */
2900 gcc_assert ((STMT_VINFO_TYPE (SLP_TREE_SCALAR_STMTS (node)[0])
2901 == shift_vec_info_type)
2902 && j == 1);
2903 continue;
2904 }
2905 unsigned group_size = SLP_TREE_SCALAR_OPS (child).length ();
2906 poly_uint64 vf = 1;
2907 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
2908 vf = loop_vinfo->vectorization_factor;
2909 SLP_TREE_NUMBER_OF_VEC_STMTS (child)
2910 = vect_get_num_vectors (vf * group_size, vector_type);
2911 /* And cost them. */
2912 vect_prologue_cost_for_slp (vinfo, child, cost_vec);
2913 }
2914
2915 /* If this node can't be vectorized, try pruning the tree here rather
2916 than felling the whole thing. */
2917 if (!res && vect_slp_convert_to_external (vinfo, node, node_instance))
2918 {
2919 /* We'll need to revisit this for invariant costing and number
2920 of vectorized stmt setting. */
2921 lvisited.remove (node);
2922 res = true;
2923 }
2924
2925 return res;
2926 }
2927
2928
2929 /* Analyze statements in SLP instances of VINFO. Return true if the
2930 operations are supported. */
2931
2932 bool
2933 vect_slp_analyze_operations (vec_info *vinfo)
2934 {
2935 slp_instance instance;
2936 int i;
2937
2938 DUMP_VECT_SCOPE ("vect_slp_analyze_operations");
2939
2940 hash_set<slp_tree> visited;
2941 for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
2942 {
2943 hash_set<slp_tree> lvisited;
2944 stmt_vector_for_cost cost_vec;
2945 cost_vec.create (2);
2946 if (!vect_slp_analyze_node_operations (vinfo,
2947 SLP_INSTANCE_TREE (instance),
2948 instance, visited, lvisited,
2949 &cost_vec)
2950 /* Instances with a root stmt require vectorized defs for the
2951 SLP tree root. */
2952 || (SLP_INSTANCE_ROOT_STMT (instance)
2953 && (SLP_TREE_DEF_TYPE (SLP_INSTANCE_TREE (instance))
2954 != vect_internal_def)))
2955 {
2956 slp_tree node = SLP_INSTANCE_TREE (instance);
2957 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2958 if (dump_enabled_p ())
2959 dump_printf_loc (MSG_NOTE, vect_location,
2960 "removing SLP instance operations starting from: %G",
2961 stmt_info->stmt);
2962 vect_free_slp_instance (instance, false);
2963 vinfo->slp_instances.ordered_remove (i);
2964 cost_vec.release ();
2965 }
2966 else
2967 {
2968 for (hash_set<slp_tree>::iterator x = lvisited.begin();
2969 x != lvisited.end(); ++x)
2970 visited.add (*x);
2971 i++;
2972
2973 add_stmt_costs (vinfo, vinfo->target_cost_data, &cost_vec);
2974 cost_vec.release ();
2975 }
2976 }
2977
2978 return !vinfo->slp_instances.is_empty ();
2979 }
2980
2981
2982 /* Compute the scalar cost of the SLP node NODE and its children,
2983 recording it in COST_VEC. Do not account defs that are marked in LIFE
2984 and update LIFE according to uses of NODE. */
2985
2986 static void
2987 vect_bb_slp_scalar_cost (vec_info *vinfo,
2988 slp_tree node, vec<bool, va_heap> *life,
2989 stmt_vector_for_cost *cost_vec,
2990 hash_set<slp_tree> &visited)
2991 {
2992 unsigned i;
2993 stmt_vec_info stmt_info;
2994 slp_tree child;
2995
2996 if (visited.add (node))
2997 return;
2998
2999 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
3000 {
3001 gimple *stmt = stmt_info->stmt;
3002 ssa_op_iter op_iter;
3003 def_operand_p def_p;
3004
3005 if ((*life)[i])
3006 continue;
3007
3008 /* If there is a non-vectorized use of the defs then the scalar
3009 stmt is kept live, in which case we do not account it or any
3010 required defs in the SLP children in the scalar cost. This
3011 makes the vectorization look more costly when compared to
3012 the scalar cost. */
3013 FOR_EACH_SSA_DEF_OPERAND (def_p, stmt, op_iter, SSA_OP_DEF)
3014 {
3015 imm_use_iterator use_iter;
3016 gimple *use_stmt;
3017 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
3018 if (!is_gimple_debug (use_stmt))
3019 {
3020 stmt_vec_info use_stmt_info = vinfo->lookup_stmt (use_stmt);
3021 if (!use_stmt_info || !PURE_SLP_STMT (use_stmt_info))
3022 {
3023 (*life)[i] = true;
3024 BREAK_FROM_IMM_USE_STMT (use_iter);
3025 }
3026 }
3027 }
3028 if ((*life)[i])
3029 continue;
3030
3031 /* Count scalar stmts only once. */
3032 if (gimple_visited_p (stmt))
3033 continue;
3034 gimple_set_visited (stmt, true);
3035
3036 vect_cost_for_stmt kind;
3037 if (STMT_VINFO_DATA_REF (stmt_info))
3038 {
3039 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
3040 kind = scalar_load;
3041 else
3042 kind = scalar_store;
3043 }
3044 else if (vect_nop_conversion_p (stmt_info))
3045 continue;
3046 else
3047 kind = scalar_stmt;
3048 record_stmt_cost (cost_vec, 1, kind, stmt_info, 0, vect_body);
3049 }
3050
3051 auto_vec<bool, 20> subtree_life;
3052 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3053 {
3054 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
3055 {
3056 /* Do not directly pass LIFE to the recursive call, copy it to
3057 confine changes in the callee to the current child/subtree. */
3058 subtree_life.safe_splice (*life);
3059 vect_bb_slp_scalar_cost (vinfo, child, &subtree_life, cost_vec,
3060 visited);
3061 subtree_life.truncate (0);
3062 }
3063 }
3064 }
3065
3066 /* Check if vectorization of the basic block is profitable. */
3067
3068 static bool
3069 vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo)
3070 {
3071 vec<slp_instance> slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
3072 slp_instance instance;
3073 int i;
3074 unsigned int vec_inside_cost = 0, vec_outside_cost = 0, scalar_cost = 0;
3075 unsigned int vec_prologue_cost = 0, vec_epilogue_cost = 0;
3076
3077 /* Calculate scalar cost. */
3078 stmt_vector_for_cost scalar_costs;
3079 scalar_costs.create (0);
3080 hash_set<slp_tree> visited;
3081 FOR_EACH_VEC_ELT (slp_instances, i, instance)
3082 {
3083 auto_vec<bool, 20> life;
3084 life.safe_grow_cleared
3085 (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance)).length ());
3086 vect_bb_slp_scalar_cost (bb_vinfo,
3087 SLP_INSTANCE_TREE (instance),
3088 &life, &scalar_costs, visited);
3089 }
3090 void *target_cost_data = init_cost (NULL);
3091 add_stmt_costs (bb_vinfo, target_cost_data, &scalar_costs);
3092 scalar_costs.release ();
3093 unsigned dummy;
3094 finish_cost (target_cost_data, &dummy, &scalar_cost, &dummy);
3095 destroy_cost_data (target_cost_data);
3096
3097 /* Unset visited flag. */
3098 for (gimple_stmt_iterator gsi = bb_vinfo->region_begin;
3099 gsi_stmt (gsi) != gsi_stmt (bb_vinfo->region_end); gsi_next (&gsi))
3100 gimple_set_visited (gsi_stmt (gsi), false);
3101
3102 /* Complete the target-specific cost calculation. */
3103 finish_cost (BB_VINFO_TARGET_COST_DATA (bb_vinfo), &vec_prologue_cost,
3104 &vec_inside_cost, &vec_epilogue_cost);
3105
3106 vec_outside_cost = vec_prologue_cost + vec_epilogue_cost;
3107
3108 if (dump_enabled_p ())
3109 {
3110 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3111 dump_printf (MSG_NOTE, " Vector inside of basic block cost: %d\n",
3112 vec_inside_cost);
3113 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n", vec_prologue_cost);
3114 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n", vec_epilogue_cost);
3115 dump_printf (MSG_NOTE, " Scalar cost of basic block: %d\n", scalar_cost);
3116 }
3117
3118 /* Vectorization is profitable if its cost is no more than the cost of the
3119 scalar version. Note that we err on the vector side for equal cost because
3120 the cost estimate is otherwise quite pessimistic (constant uses are
3121 free on the scalar side but cost a load on the vector side for
3122 example). */
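/* E.g. (illustrative numbers): with a scalar cost of 10, a vector
   inside cost of 6 and a vector outside cost of 4 the costs tie at 10;
   the test below is false and we vectorize.  */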
3123 if (vec_outside_cost + vec_inside_cost > scalar_cost)
3124 return false;
3125
3126 return true;
3127 }
3128
3129 /* Find any vectorizable constructors and add them to the grouped_stores
3130 array. */
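/* A candidate looks like (illustrative GIMPLE, names assumed):
     x_5 = {a_1, b_2, c_3, d_4};
   i.e. the element count matches the vector type and the elements are
   non-uniform scalars defined by SSA names.  */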
3131
3132 static void
3133 vect_slp_check_for_constructors (bb_vec_info bb_vinfo)
3134 {
3135 gimple_stmt_iterator gsi;
3136
3137 for (gsi = bb_vinfo->region_begin;
3138 gsi_stmt (gsi) != gsi_stmt (bb_vinfo->region_end); gsi_next (&gsi))
3139 {
3140 gassign *stmt = dyn_cast <gassign *> (gsi_stmt (gsi));
3141 if (!stmt || gimple_assign_rhs_code (stmt) != CONSTRUCTOR)
3142 continue;
3143
3144 tree rhs = gimple_assign_rhs1 (stmt);
3145 if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
3146 || maybe_ne (TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)),
3147 CONSTRUCTOR_NELTS (rhs))
3148 || VECTOR_TYPE_P (TREE_TYPE (CONSTRUCTOR_ELT (rhs, 0)->value))
3149 || uniform_vector_p (rhs))
3150 continue;
3151
3152 stmt_vec_info stmt_info = bb_vinfo->lookup_stmt (stmt);
3153 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt_info);
3154 }
3155 }
3156
3157 /* Check if the region described by BB_VINFO can be vectorized, returning
3158 true if so. When returning false, set FATAL to true if the same failure
3159 would prevent vectorization at other vector sizes, false if it is still
3160 worth trying other sizes. N_STMTS is the number of statements in the
3161 region. */
3162
3163 static bool
3164 vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, bool &fatal)
3165 {
3166 DUMP_VECT_SCOPE ("vect_slp_analyze_bb");
3167
3168 slp_instance instance;
3169 int i;
3170 poly_uint64 min_vf = 2;
3171
3172 /* The first group of checks is independent of the vector size. */
3173 fatal = true;
3174
3175 /* Analyze the data references. */
3176
3177 if (!vect_analyze_data_refs (bb_vinfo, &min_vf, NULL))
3178 {
3179 if (dump_enabled_p ())
3180 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3181 "not vectorized: unhandled data-ref in basic "
3182 "block.\n");
3183 return false;
3184 }
3185
3186 if (BB_VINFO_DATAREFS (bb_vinfo).length () < 2)
3187 {
3188 if (dump_enabled_p ())
3189 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3190 "not vectorized: not enough data-refs in "
3191 "basic block.\n");
3192 return false;
3193 }
3194
3195 if (!vect_analyze_data_ref_accesses (bb_vinfo))
3196 {
3197 if (dump_enabled_p ())
3198 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3199 "not vectorized: unhandled data access in "
3200 "basic block.\n");
3201 return false;
3202 }
3203
3204 vect_slp_check_for_constructors (bb_vinfo);
3205
3206 /* If there are no grouped stores in the region there is no need
3207 to continue with pattern recog as vect_analyze_slp will fail
3208 anyway. */
3209 if (bb_vinfo->grouped_stores.is_empty ())
3210 {
3211 if (dump_enabled_p ())
3212 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3213 "not vectorized: no grouped stores in "
3214 "basic block.\n");
3215 return false;
3216 }
3217
3218 /* The rest of the analysis below depends on the vector size in some way, so failures from here on are not fatal. */
3219 fatal = false;
3220
3221 vect_pattern_recog (bb_vinfo);
3222
3223 /* Check the SLP opportunities in the basic block, analyze and build SLP
3224 trees. */
3225 if (!vect_analyze_slp (bb_vinfo, n_stmts))
3226 {
3227 if (dump_enabled_p ())
3228 {
3229 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3230 "Failed to SLP the basic block.\n");
3231 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3232 "not vectorized: failed to find SLP opportunities "
3233 "in basic block.\n");
3234 }
3235 return false;
3236 }
3237
3238 /* Optimize permutations. */
3239 vect_optimize_slp (bb_vinfo);
3240
3241 vect_record_base_alignments (bb_vinfo);
3242
3243 /* Analyze and verify the alignment of data references and the
3244 dependence in the SLP instances. */
3245 for (i = 0; BB_VINFO_SLP_INSTANCES (bb_vinfo).iterate (i, &instance); )
3246 {
3247 if (! vect_slp_analyze_and_verify_instance_alignment (bb_vinfo, instance)
3248 || ! vect_slp_analyze_instance_dependence (bb_vinfo, instance))
3249 {
3250 slp_tree node = SLP_INSTANCE_TREE (instance);
3251 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
3252 if (dump_enabled_p ())
3253 dump_printf_loc (MSG_NOTE, vect_location,
3254 "removing SLP instance operations starting from: %G",
3255 stmt_info->stmt);
3256 vect_free_slp_instance (instance, false);
3257 BB_VINFO_SLP_INSTANCES (bb_vinfo).ordered_remove (i);
3258 continue;
3259 }
3260
3261 /* Mark all the statements that we want to vectorize as pure SLP and
3262 relevant. */
3263 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
3264 vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance));
3265 if (SLP_INSTANCE_ROOT_STMT (instance))
3266 STMT_SLP_TYPE (SLP_INSTANCE_ROOT_STMT (instance)) = pure_slp;
3267
3268 i++;
3269 }
3270 if (! BB_VINFO_SLP_INSTANCES (bb_vinfo).length ())
3271 return false;
3272
3273 if (!vect_slp_analyze_operations (bb_vinfo))
3274 {
3275 if (dump_enabled_p ())
3276 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3277 "not vectorized: bad operation in basic block.\n");
3278 return false;
3279 }
3280
3281 /* Cost model: check if the vectorization is worthwhile. */
3282 if (!unlimited_cost_model (NULL)
3283 && !vect_bb_vectorization_profitable_p (bb_vinfo))
3284 {
3285 if (dump_enabled_p ())
3286 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3287 "not vectorized: vectorization is not "
3288 "profitable.\n");
3289 return false;
3290 }
3291
3292 if (dump_enabled_p ())
3293 dump_printf_loc (MSG_NOTE, vect_location,
3294 "Basic block will be vectorized using SLP\n");
3295 return true;
3296 }
3297
3298 /* Subroutine of vect_slp_bb. Try to vectorize the statements between
3299 REGION_BEGIN (inclusive) and REGION_END (exclusive), returning true
3300 on success. The region has N_STMTS statements and has the datarefs
3301 given by DATAREFS. */
3302
3303 static bool
3304 vect_slp_bb_region (gimple_stmt_iterator region_begin,
3305 gimple_stmt_iterator region_end,
3306 vec<data_reference_p> datarefs,
3307 unsigned int n_stmts)
3308 {
3309 bb_vec_info bb_vinfo;
3310 auto_vector_modes vector_modes;
3311
3312 /* Autodetect first vector size we try. */
3313 machine_mode next_vector_mode = VOIDmode;
3314 targetm.vectorize.autovectorize_vector_modes (&vector_modes, false);
3315 unsigned int mode_i = 0;
3316
3317 vec_info_shared shared;
3318
3319 machine_mode autodetected_vector_mode = VOIDmode;
3320 while (1)
3321 {
3322 bool vectorized = false;
3323 bool fatal = false;
3324 bb_vinfo = new _bb_vec_info (region_begin, region_end, &shared);
3325
3326 bool first_time_p = shared.datarefs.is_empty ();
3327 BB_VINFO_DATAREFS (bb_vinfo) = datarefs;
3328 if (first_time_p)
3329 bb_vinfo->shared->save_datarefs ();
3330 else
3331 bb_vinfo->shared->check_datarefs ();
3332 bb_vinfo->vector_mode = next_vector_mode;
3333
3334 if (vect_slp_analyze_bb_1 (bb_vinfo, n_stmts, fatal)
3335 && dbg_cnt (vect_slp))
3336 {
3337 if (dump_enabled_p ())
3338 {
3339 dump_printf_loc (MSG_NOTE, vect_location,
3340 "***** Analysis succeeded with vector mode"
3341 " %s\n", GET_MODE_NAME (bb_vinfo->vector_mode));
3342 dump_printf_loc (MSG_NOTE, vect_location, "SLPing BB part\n");
3343 }
3344
3345 bb_vinfo->shared->check_datarefs ();
3346 vect_schedule_slp (bb_vinfo);
3347
3348 unsigned HOST_WIDE_INT bytes;
3349 if (dump_enabled_p ())
3350 {
3351 if (GET_MODE_SIZE (bb_vinfo->vector_mode).is_constant (&bytes))
3352 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
3353 "basic block part vectorized using %wu byte "
3354 "vectors\n", bytes);
3355 else
3356 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
3357 "basic block part vectorized using variable "
3358 "length vectors\n");
3359 }
3360
3361 vectorized = true;
3362 }
3363 else
3364 {
3365 if (dump_enabled_p ())
3366 dump_printf_loc (MSG_NOTE, vect_location,
3367 "***** Analysis failed with vector mode %s\n",
3368 GET_MODE_NAME (bb_vinfo->vector_mode));
3369 }
3370
3371 if (mode_i == 0)
3372 autodetected_vector_mode = bb_vinfo->vector_mode;
3373
3374 if (!fatal)
3375 while (mode_i < vector_modes.length ()
3376 && vect_chooses_same_modes_p (bb_vinfo, vector_modes[mode_i]))
3377 {
3378 if (dump_enabled_p ())
3379 dump_printf_loc (MSG_NOTE, vect_location,
3380 "***** The result for vector mode %s would"
3381 " be the same\n",
3382 GET_MODE_NAME (vector_modes[mode_i]));
3383 mode_i += 1;
3384 }
3385
3386 delete bb_vinfo;
3387
3388 if (mode_i < vector_modes.length ()
3389 && VECTOR_MODE_P (autodetected_vector_mode)
3390 && (related_vector_mode (vector_modes[mode_i],
3391 GET_MODE_INNER (autodetected_vector_mode))
3392 == autodetected_vector_mode)
3393 && (related_vector_mode (autodetected_vector_mode,
3394 GET_MODE_INNER (vector_modes[mode_i]))
3395 == vector_modes[mode_i]))
3396 {
3397 if (dump_enabled_p ())
3398 dump_printf_loc (MSG_NOTE, vect_location,
3399 "***** Skipping vector mode %s, which would"
3400 " repeat the analysis for %s\n",
3401 GET_MODE_NAME (vector_modes[mode_i]),
3402 GET_MODE_NAME (autodetected_vector_mode));
3403 mode_i += 1;
3404 }
3405
3406 if (vectorized
3407 || mode_i == vector_modes.length ()
3408 || autodetected_vector_mode == VOIDmode
3409 /* If vect_slp_analyze_bb_1 signaled that analysis for all
3410 vector sizes will fail do not bother iterating. */
3411 || fatal)
3412 return vectorized;
3413
3414 /* Try the next vector mode. */
3415 next_vector_mode = vector_modes[mode_i++];
3416 if (dump_enabled_p ())
3417 dump_printf_loc (MSG_NOTE, vect_location,
3418 "***** Re-trying analysis with vector mode %s\n",
3419 GET_MODE_NAME (next_vector_mode));
3420 }
3421 }
3422
3423 /* Main entry for the BB vectorizer. Analyze and transform BB, returns
3424 true if anything in the basic-block was vectorized. */
3425
3426 bool
3427 vect_slp_bb (basic_block bb)
3428 {
3429 gimple_stmt_iterator gsi;
3430 bool any_vectorized = false;
3431
3432 gsi = gsi_after_labels (bb);
3433 while (!gsi_end_p (gsi))
3434 {
3435 gimple_stmt_iterator region_begin = gsi;
3436 vec<data_reference_p> datarefs = vNULL;
3437 int insns = 0;
3438
3439 for (; !gsi_end_p (gsi); gsi_next (&gsi))
3440 {
3441 gimple *stmt = gsi_stmt (gsi);
3442 if (is_gimple_debug (stmt))
3443 continue;
3444 insns++;
3445
3446 if (gimple_location (stmt) != UNKNOWN_LOCATION)
3447 vect_location = stmt;
3448
3449 if (!vect_find_stmt_data_reference (NULL, stmt, &datarefs))
3450 break;
3451 }
3452
3453 /* Skip leading unhandled stmts. */
3454 if (gsi_stmt (region_begin) == gsi_stmt (gsi))
3455 {
3456 gsi_next (&gsi);
3457 continue;
3458 }
3459
3460 gimple_stmt_iterator region_end = gsi;
3461
3462 if (insns > param_slp_max_insns_in_bb)
3463 {
3464 if (dump_enabled_p ())
3465 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3466 "not vectorized: too many instructions in "
3467 "basic block.\n");
3468 }
3469 else if (vect_slp_bb_region (region_begin, region_end, datarefs, insns))
3470 any_vectorized = true;
3471
3472 if (gsi_end_p (region_end))
3473 break;
3474
3475 /* Skip the unhandled stmt. */
3476 gsi_next (&gsi);
3477 }
3478
3479 return any_vectorized;
3480 }
3481
3482
3483 /* Return true if the constant operand OP_NUM of STMT_VINFO should receive a boolean vector type. */
3484
3485 static bool
3486 vect_mask_constant_operand_p (vec_info *vinfo,
3487 stmt_vec_info stmt_vinfo, unsigned op_num)
3488 {
3489 enum tree_code code = gimple_expr_code (stmt_vinfo->stmt);
3490 tree op, vectype;
3491 enum vect_def_type dt;
3492
3493 /* For comparisons and COND_EXPR the vector type is chosen based
3494 on the other, non-constant comparison operand. */
3495 if (TREE_CODE_CLASS (code) == tcc_comparison)
3496 {
3497 gassign *stmt = as_a <gassign *> (stmt_vinfo->stmt);
3498 op = gimple_assign_rhs1 (stmt);
3499
3500 if (!vect_is_simple_use (op, vinfo, &dt, &vectype))
3501 gcc_unreachable ();
3502
3503 return !vectype || VECTOR_BOOLEAN_TYPE_P (vectype);
3504 }
3505
3506 if (code == COND_EXPR)
3507 {
3508 gassign *stmt = as_a <gassign *> (stmt_vinfo->stmt);
3509 tree cond = gimple_assign_rhs1 (stmt);
3510
3511 if (TREE_CODE (cond) == SSA_NAME)
3512 {
3513 if (op_num > 0)
3514 return VECTOR_BOOLEAN_TYPE_P (STMT_VINFO_VECTYPE (stmt_vinfo));
3515 op = cond;
3516 }
3517 else
3518 {
3519 if (op_num > 1)
3520 return VECTOR_BOOLEAN_TYPE_P (STMT_VINFO_VECTYPE (stmt_vinfo));
3521 op = TREE_OPERAND (cond, 0);
3522 }
3523
3524 if (!vect_is_simple_use (op, vinfo, &dt, &vectype))
3525 gcc_unreachable ();
3526
3527 return !vectype || VECTOR_BOOLEAN_TYPE_P (vectype);
3528 }
3529
3530 return VECTOR_BOOLEAN_TYPE_P (STMT_VINFO_VECTYPE (stmt_vinfo));
3531 }
3532
3533 /* Build a variable-length vector in which the elements in ELTS are repeated
3534 to fill NRESULTS vectors of type VECTOR_TYPE. Store the vectors in
3535 RESULTS and add any new instructions to SEQ.
3536
3537 The approach we use is:
3538
3539 (1) Find a vector mode VM with integer elements of mode IM.
3540
3541 (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
3542 ELTS' has mode IM. This involves creating NELTS' VIEW_CONVERT_EXPRs
3543 from small vectors to IM.
3544
3545 (3) Duplicate each ELTS'[I] into a vector of mode VM.
3546
3547 (4) Use a tree of interleaving VEC_PERM_EXPRs to create VMs with the
3548 correct byte contents.
3549
3550 (5) Use VIEW_CONVERT_EXPR to cast the final VMs to the required type.
3551
3552 We try to find the largest IM for which this sequence works, in order
3553 to cut down on the number of interleaves. */
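/* As a concrete sketch (the modes actually picked depend on the target):
   to repeat four HImode elements { a, b, c, d } across a variable-length
   VNx8HI vector, we could pick IM = SImode and VM = VNx4SI.  Step (2)
   view-converts { a, b } and { c, d } into two SImode values AB and CD,
   step (3) duplicates them into { AB, AB, ... } and { CD, CD, ... },
   step (4) interleaves those into { AB, CD, AB, CD, ... }, and step (5)
   view-converts the result to VNx8HI, giving { a, b, c, d, a, b, ... }. */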
3554
3555 void
3556 duplicate_and_interleave (vec_info *vinfo, gimple_seq *seq, tree vector_type,
3557 vec<tree> elts, unsigned int nresults,
3558 vec<tree> &results)
3559 {
3560 unsigned int nelts = elts.length ();
3561 tree element_type = TREE_TYPE (vector_type);
3562
3563 /* (1) Find a vector mode VM with integer elements of mode IM. */
3564 unsigned int nvectors = 1;
3565 tree new_vector_type;
3566 tree permutes[2];
3567 if (!can_duplicate_and_interleave_p (vinfo, nelts, element_type,
3568 &nvectors, &new_vector_type,
3569 permutes))
3570 gcc_unreachable ();
3571
3572 /* Get a vector type that holds ELTS[0:NELTS/NELTS']. */
3573 unsigned int partial_nelts = nelts / nvectors;
3574 tree partial_vector_type = build_vector_type (element_type, partial_nelts);
3575
3576 tree_vector_builder partial_elts;
3577 auto_vec<tree, 32> pieces (nvectors * 2);
3578 pieces.quick_grow (nvectors * 2);
3579 for (unsigned int i = 0; i < nvectors; ++i)
3580 {
3581 /* (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
3582 ELTS' has mode IM. */
3583 partial_elts.new_vector (partial_vector_type, partial_nelts, 1);
3584 for (unsigned int j = 0; j < partial_nelts; ++j)
3585 partial_elts.quick_push (elts[i * partial_nelts + j]);
3586 tree t = gimple_build_vector (seq, &partial_elts);
3587 t = gimple_build (seq, VIEW_CONVERT_EXPR,
3588 TREE_TYPE (new_vector_type), t);
3589
3590 /* (3) Duplicate each ELTS'[I] into a vector of mode VM. */
3591 pieces[i] = gimple_build_vector_from_val (seq, new_vector_type, t);
3592 }
3593
3594 /* (4) Use a tree of VEC_PERM_EXPRs to create a single VM with the
3595 correct byte contents.
3596
3597 We need to repeat the following operation log2(nvectors) times:
3598
3599 out[i * 2] = VEC_PERM_EXPR (in[i], in[i + hi_start], lo_permute);
3600 out[i * 2 + 1] = VEC_PERM_EXPR (in[i], in[i + hi_start], hi_permute);
3601
3602 However, if each input repeats every N elements and the VF is
3603 a multiple of N * 2, the HI result is the same as the LO. */
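/* For example, with NVECTORS == 4 the loop below runs twice: the first
   iteration combines in[0] with in[2] and in[1] with in[3] (HI_START
   is 2), and the second iteration combines the four intermediate
   results once more, doubling the repeat period each time. */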
3604 unsigned int in_start = 0;
3605 unsigned int out_start = nvectors;
3606 unsigned int hi_start = nvectors / 2;
3607 /* A bound on the number of outputs needed to produce NRESULTS results
3608 in the final iteration. */
3609 unsigned int noutputs_bound = nvectors * nresults;
3610 for (unsigned int in_repeat = 1; in_repeat < nvectors; in_repeat *= 2)
3611 {
3612 noutputs_bound /= 2;
3613 unsigned int limit = MIN (noutputs_bound, nvectors);
3614 for (unsigned int i = 0; i < limit; ++i)
3615 {
3616 if ((i & 1) != 0
3617 && multiple_p (TYPE_VECTOR_SUBPARTS (new_vector_type),
3618 2 * in_repeat))
3619 {
3620 pieces[out_start + i] = pieces[out_start + i - 1];
3621 continue;
3622 }
3623
3624 tree output = make_ssa_name (new_vector_type);
3625 tree input1 = pieces[in_start + (i / 2)];
3626 tree input2 = pieces[in_start + (i / 2) + hi_start];
3627 gassign *stmt = gimple_build_assign (output, VEC_PERM_EXPR,
3628 input1, input2,
3629 permutes[i & 1]);
3630 gimple_seq_add_stmt (seq, stmt);
3631 pieces[out_start + i] = output;
3632 }
3633 std::swap (in_start, out_start);
3634 }
3635
3636 /* (5) Use VIEW_CONVERT_EXPR to cast the final VM to the required type. */
3637 results.reserve (nresults);
3638 for (unsigned int i = 0; i < nresults; ++i)
3639 if (i < nvectors)
3640 results.quick_push (gimple_build (seq, VIEW_CONVERT_EXPR, vector_type,
3641 pieces[in_start + i]));
3642 else
3643 results.quick_push (results[i - nvectors]);
3644 }
3645
3646
3647 /* For constant and loop invariant defs of SLP_NODE this function returns
3648 (vector) defs (VEC_OPRNDS) that will be used in the vectorized stmts.
3649 OP_NUM selects the child of SLP_NODE that contains the scalar
3650 operands. */
3651
3652 static void
3653 vect_get_constant_vectors (vec_info *vinfo,
3654 slp_tree slp_node, unsigned op_num,
3655 vec<tree> *vec_oprnds)
3656 {
3657 slp_tree op_node = SLP_TREE_CHILDREN (slp_node)[op_num];
3658 stmt_vec_info stmt_vinfo = SLP_TREE_SCALAR_STMTS (slp_node)[0];
3659 unsigned HOST_WIDE_INT nunits;
3660 tree vec_cst;
3661 unsigned j, number_of_places_left_in_vector;
3662 tree vector_type;
3663 tree vop;
3664 int group_size = op_node->ops.length ();
3665 unsigned int vec_num, i;
3666 unsigned number_of_copies = 1;
3667 bool constant_p;
3668 tree neutral_op = NULL;
3669 gimple_seq ctor_seq = NULL;
3670 auto_vec<tree, 16> permute_results;
3671
3672 /* SLP_TREE_VECTYPE (op_node) must always be correctly set here. */
3673 vector_type = SLP_TREE_VECTYPE (op_node);
3674 {
3675 tree op = op_node->ops[0];
3676 tree stmt_vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
3677 if (VECT_SCALAR_BOOLEAN_TYPE_P (TREE_TYPE (op))
3678 && vect_mask_constant_operand_p (vinfo, stmt_vinfo, op_num))
3679 gcc_assert (vector_type
3680 && types_compatible_p (vector_type,
3681 truth_type_for (stmt_vectype)));
3682 else
3683 gcc_assert (vector_type
3684 && types_compatible_p (vector_type,
3685 get_vectype_for_scalar_type
3686 (vinfo, TREE_TYPE (op), op_node)));
3687 }
3688
3689 unsigned int number_of_vectors = SLP_TREE_NUMBER_OF_VEC_STMTS (op_node);
3690 vec_oprnds->create (number_of_vectors);
3691 auto_vec<tree> voprnds (number_of_vectors);
3692
3693 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
3694 the created vectors. It is greater than 1 if unrolling is performed.
3695
3696 For example, we have two scalar operands, s1 and s2 (e.g., group of
3697 strided accesses of size two), while NUNITS is four (i.e., four scalars
3698 of this type can be packed in a vector). The output vector will contain
3699 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
3700 will be 2).
3701
3702 If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
3703 containing the operands.
3704
3705 For example, NUNITS is four as before, and the group size is 8
3706 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
3707 {s5, s6, s7, s8}. */
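/* With the formula below, the first example gives
   NUNITS * NUMBER_OF_VECTORS / GROUP_SIZE = 4 * 1 / 2 == 2 copies,
   and the second gives 4 * 2 / 8 == 1. */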
3708
3709 /* When using duplicate_and_interleave, we just need one element for
3710 each scalar statement. */
3711 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
3712 nunits = group_size;
3713
3714 number_of_copies = nunits * number_of_vectors / group_size;
3715
3716 number_of_places_left_in_vector = nunits;
3717 constant_p = true;
3718 tree_vector_builder elts (vector_type, nunits, 1);
3719 elts.quick_grow (nunits);
3720 stmt_vec_info insert_after = NULL;
3721 for (j = 0; j < number_of_copies; j++)
3722 {
3723 tree op;
3724 for (i = group_size - 1; op_node->ops.iterate (i, &op); i--)
3725 {
3726 /* Create 'vect_ = {op0,op1,...,opn}'. */
3727 number_of_places_left_in_vector--;
3728 tree orig_op = op;
3729 if (!types_compatible_p (TREE_TYPE (vector_type), TREE_TYPE (op)))
3730 {
3731 if (CONSTANT_CLASS_P (op))
3732 {
3733 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
3734 {
3735 /* Can't use VIEW_CONVERT_EXPR for booleans because
3736 of possibly different sizes of scalar value and
3737 vector element. */
3738 if (integer_zerop (op))
3739 op = build_int_cst (TREE_TYPE (vector_type), 0);
3740 else if (integer_onep (op))
3741 op = build_all_ones_cst (TREE_TYPE (vector_type));
3742 else
3743 gcc_unreachable ();
3744 }
3745 else
3746 op = fold_unary (VIEW_CONVERT_EXPR,
3747 TREE_TYPE (vector_type), op);
3748 gcc_assert (op && CONSTANT_CLASS_P (op));
3749 }
3750 else
3751 {
3752 tree new_temp = make_ssa_name (TREE_TYPE (vector_type));
3753 gimple *init_stmt;
3754 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
3755 {
3756 tree true_val
3757 = build_all_ones_cst (TREE_TYPE (vector_type));
3758 tree false_val
3759 = build_zero_cst (TREE_TYPE (vector_type));
3760 gcc_assert (INTEGRAL_TYPE_P (TREE_TYPE (op)));
3761 init_stmt = gimple_build_assign (new_temp, COND_EXPR,
3762 op, true_val,
3763 false_val);
3764 }
3765 else
3766 {
3767 op = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (vector_type),
3768 op);
3769 init_stmt
3770 = gimple_build_assign (new_temp, VIEW_CONVERT_EXPR,
3771 op);
3772 }
3773 gimple_seq_add_stmt (&ctor_seq, init_stmt);
3774 op = new_temp;
3775 }
3776 }
3777 elts[number_of_places_left_in_vector] = op;
3778 if (!CONSTANT_CLASS_P (op))
3779 constant_p = false;
3780 /* For BB vectorization we have to compute an insert location
3781 when a def is inside the analyzed region since we cannot
3782 simply insert at the BB start in this case. */
3783 stmt_vec_info opdef;
3784 if (TREE_CODE (orig_op) == SSA_NAME
3785 && !SSA_NAME_IS_DEFAULT_DEF (orig_op)
3786 && is_a <bb_vec_info> (vinfo)
3787 && (opdef = vinfo->lookup_def (orig_op)))
3788 {
3789 if (!insert_after)
3790 insert_after = opdef;
3791 else
3792 insert_after = get_later_stmt (insert_after, opdef);
3793 }
3794
3795 if (number_of_places_left_in_vector == 0)
3796 {
3797 if (constant_p
3798 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
3799 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
3800 vec_cst = gimple_build_vector (&ctor_seq, &elts);
3801 else
3802 {
3803 if (permute_results.is_empty ())
3804 duplicate_and_interleave (vinfo, &ctor_seq, vector_type,
3805 elts, number_of_vectors,
3806 permute_results);
3807 vec_cst = permute_results[number_of_vectors - j - 1];
3808 }
3809 tree init;
3810 if (insert_after)
3811 {
3812 gimple_stmt_iterator gsi = gsi_for_stmt (insert_after->stmt);
3813 /* vect_init_vector inserts before. */
3814 gsi_next (&gsi);
3815 init = vect_init_vector (vinfo, stmt_vinfo, vec_cst,
3816 vector_type, &gsi);
3817 }
3818 else
3819 init = vect_init_vector (vinfo, stmt_vinfo, vec_cst,
3820 vector_type, NULL);
3821 if (ctor_seq != NULL)
3822 {
3823 gimple_stmt_iterator gsi
3824 = gsi_for_stmt (SSA_NAME_DEF_STMT (init));
3825 gsi_insert_seq_before (&gsi, ctor_seq, GSI_SAME_STMT);
3826 ctor_seq = NULL;
3827 }
3828 voprnds.quick_push (init);
3829 insert_after = NULL;
3830 number_of_places_left_in_vector = nunits;
3831 constant_p = true;
3832 elts.new_vector (vector_type, nunits, 1);
3833 elts.quick_grow (nunits);
3834 }
3835 }
3836 }
3837
3838 /* Since the vectors were created in reverse order, push them onto
3839 VEC_OPRNDS in reverse to restore the original order. */
3840 vec_num = voprnds.length ();
3841 for (j = vec_num; j != 0; j--)
3842 {
3843 vop = voprnds[j - 1];
3844 vec_oprnds->quick_push (vop);
3845 }
3846
3847 /* If the VF is greater than the unrolling factor needed for the SLP
3848 group of stmts, the NUMBER_OF_VECTORS to be created is greater than
3849 NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
3850 to replicate the vectors. */
3851 while (number_of_vectors > vec_oprnds->length ())
3852 {
3853 tree neutral_vec = NULL;
3854
3855 if (neutral_op)
3856 {
3857 if (!neutral_vec)
3858 neutral_vec = build_vector_from_val (vector_type, neutral_op);
3859
3860 vec_oprnds->quick_push (neutral_vec);
3861 }
3862 else
3863 {
3864 for (i = 0; vec_oprnds->iterate (i, &vop) && i < vec_num; i++)
3865 vec_oprnds->quick_push (vop);
3866 }
3867 }
3868 }
3869
3870
3871 /* Get the vectorized definitions from SLP_NODE, which must already
3872 contain the corresponding vectorized def-stmts. */
3873
3874 static void
3875 vect_get_slp_vect_defs (slp_tree slp_node, vec<tree> *vec_oprnds)
3876 {
3877 stmt_vec_info vec_def_stmt_info;
3878 unsigned int i;
3879
3880 gcc_assert (SLP_TREE_VEC_STMTS (slp_node).exists ());
3881
3882 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (slp_node), i, vec_def_stmt_info)
3883 vec_oprnds->quick_push (gimple_get_lhs (vec_def_stmt_info->stmt));
3884 }
3885
3886
3887 /* Get N vectorized definitions for SLP_NODE.
3888 If the scalar definitions are loop invariants or constants, collect them and
3889 call vect_get_constant_vectors() to create vector stmts.
3890 Otherwise, the def-stmts must already be vectorized and the vectorized stmts
3891 must be stored in the corresponding child of SLP_NODE, and we call
3892 vect_get_slp_vect_defs () to retrieve them. */
3893
3894 void
3895 vect_get_slp_defs (vec_info *vinfo,
3896 slp_tree slp_node, vec<vec<tree> > *vec_oprnds, unsigned n)
3897 {
3898 if (n == -1U)
3899 n = SLP_TREE_CHILDREN (slp_node).length ();
3900
3901 for (unsigned i = 0; i < n; ++i)
3902 {
3903 slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
3904
3905 vec<tree> vec_defs = vNULL;
3906
3907 /* For each operand we check if it has vectorized definitions in a child
3908 node or we need to create them (for invariants and constants). */
3909 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
3910 {
3911 vec_defs.create (SLP_TREE_NUMBER_OF_VEC_STMTS (child));
3912 vect_get_slp_vect_defs (child, &vec_defs);
3913 }
3914 else
3915 vect_get_constant_vectors (vinfo, slp_node, i, &vec_defs);
3916
3917 vec_oprnds->quick_push (vec_defs);
3918 }
3919 }
3920
3921 /* Generate vector permute statements from a list of loads in DR_CHAIN.
3922 If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
3923 permute statements for the SLP node NODE. */
3924
3925 bool
3926 vect_transform_slp_perm_load (vec_info *vinfo,
3927 slp_tree node, vec<tree> dr_chain,
3928 gimple_stmt_iterator *gsi, poly_uint64 vf,
3929 bool analyze_only, unsigned *n_perms)
3930 {
3931 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
3932 int vec_index = 0;
3933 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3934 unsigned int group_size = SLP_TREE_SCALAR_STMTS (node).length ();
3935 unsigned int mask_element;
3936 machine_mode mode;
3937
3938 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
3939 return false;
3940
3941 stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
3942
3943 mode = TYPE_MODE (vectype);
3944 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
3945
3946 /* Initialize the vect stmts of NODE so that the generated stmts can be
3947 inserted at the proper places later. */
3948 if (! analyze_only)
3949 for (unsigned i = SLP_TREE_VEC_STMTS (node).length ();
3950 i < SLP_TREE_NUMBER_OF_VEC_STMTS (node); i++)
3951 SLP_TREE_VEC_STMTS (node).quick_push (NULL);
3952
3953 /* Generate permutation masks for every NODE. The number of masks for
3954 each NODE is equal to GROUP_SIZE.
3955 E.g., we have a group of three nodes with three loads from the same
3956 location in each node, and the vector size is 4. I.e., we have an
3957 a0b0c0a1b1c1... sequence and we need to create the following vectors:
3958 for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
3959 for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
3960 ...
3961
3962 The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}.
3963 The last mask is illegal since we assume two operands for permute
3964 operation, and the mask element values can't be outside that range.
3965 Hence, the last mask must be converted into {2,5,5,5}.
3966 For the first two permutations we need the first and the second input
3967 vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
3968 we need the second and the third vectors: {b1,c1,a2,b2} and
3969 {c2,a3,b3,c3}. */
3970
3971 int vect_stmts_counter = 0;
3972 unsigned int index = 0;
3973 int first_vec_index = -1;
3974 int second_vec_index = -1;
3975 bool noop_p = true;
3976 *n_perms = 0;
3977
3978 vec_perm_builder mask;
3979 unsigned int nelts_to_build;
3980 unsigned int nvectors_per_build;
3981 bool repeating_p = (group_size == DR_GROUP_SIZE (stmt_info)
3982 && multiple_p (nunits, group_size));
3983 if (repeating_p)
3984 {
3985 /* A single vector contains a whole number of copies of the node, so:
3986 (a) all permutes can use the same mask; and
3987 (b) the permutes only need a single vector input. */
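	  /* For example, for GROUP_SIZE == 2 and a load permutation { 1, 0 }
	     the mask is the repeating series { 1, 0, 3, 2, 5, 4, ... }, which
	     the builder below encodes as two patterns of three elements each. */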
3988 mask.new_vector (nunits, group_size, 3);
3989 nelts_to_build = mask.encoded_nelts ();
3990 nvectors_per_build = SLP_TREE_VEC_STMTS (node).length ();
3991 }
3992 else
3993 {
3994 /* We need to construct a separate mask for each vector statement. */
3995 unsigned HOST_WIDE_INT const_nunits, const_vf;
3996 if (!nunits.is_constant (&const_nunits)
3997 || !vf.is_constant (&const_vf))
3998 return false;
3999 mask.new_vector (const_nunits, const_nunits, 1);
4000 nelts_to_build = const_vf * group_size;
4001 nvectors_per_build = 1;
4002 }
4003
4004 unsigned int count = mask.encoded_nelts ();
4005 mask.quick_grow (count);
4006 vec_perm_indices indices;
4007
4008 for (unsigned int j = 0; j < nelts_to_build; j++)
4009 {
4010 unsigned int iter_num = j / group_size;
4011 unsigned int stmt_num = j % group_size;
4012 unsigned int i = (iter_num * DR_GROUP_SIZE (stmt_info)
4013 + SLP_TREE_LOAD_PERMUTATION (node)[stmt_num]);
4014 if (repeating_p)
4015 {
4016 first_vec_index = 0;
4017 mask_element = i;
4018 }
4019 else
4020 {
4021 /* Enforced before the loop when !repeating_p. */
4022 unsigned int const_nunits = nunits.to_constant ();
4023 vec_index = i / const_nunits;
4024 mask_element = i % const_nunits;
4025 if (vec_index == first_vec_index
4026 || first_vec_index == -1)
4027 {
4028 first_vec_index = vec_index;
4029 }
4030 else if (vec_index == second_vec_index
4031 || second_vec_index == -1)
4032 {
4033 second_vec_index = vec_index;
4034 mask_element += const_nunits;
4035 }
4036 else
4037 {
4038 if (dump_enabled_p ())
4039 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4040 "permutation requires at "
4041 "least three vectors %G",
4042 stmt_info->stmt);
4043 gcc_assert (analyze_only);
4044 return false;
4045 }
4046
4047 gcc_assert (mask_element < 2 * const_nunits);
4048 }
4049
4050 if (mask_element != index)
4051 noop_p = false;
4052 mask[index++] = mask_element;
4053
4054 if (index == count && !noop_p)
4055 {
4056 indices.new_vector (mask, second_vec_index == -1 ? 1 : 2, nunits);
4057 if (!can_vec_perm_const_p (mode, indices))
4058 {
4059 if (dump_enabled_p ())
4060 {
4061 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
4062 vect_location,
4063 "unsupported vect permute { ");
4064 for (i = 0; i < count; ++i)
4065 {
4066 dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
4067 dump_printf (MSG_MISSED_OPTIMIZATION, " ");
4068 }
4069 dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
4070 }
4071 gcc_assert (analyze_only);
4072 return false;
4073 }
4074
4075 ++*n_perms;
4076 }
4077
4078 if (index == count)
4079 {
4080 if (!analyze_only)
4081 {
4082 tree mask_vec = NULL_TREE;
4083
4084 if (! noop_p)
4085 mask_vec = vect_gen_perm_mask_checked (vectype, indices);
4086
4087 if (second_vec_index == -1)
4088 second_vec_index = first_vec_index;
4089
4090 for (unsigned int ri = 0; ri < nvectors_per_build; ++ri)
4091 {
4092 /* Generate the permute statement if necessary. */
4093 tree first_vec = dr_chain[first_vec_index + ri];
4094 tree second_vec = dr_chain[second_vec_index + ri];
4095 stmt_vec_info perm_stmt_info;
4096 if (! noop_p)
4097 {
4098 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
4099 tree perm_dest
4100 = vect_create_destination_var (gimple_assign_lhs (stmt),
4101 vectype);
4102 perm_dest = make_ssa_name (perm_dest);
4103 gassign *perm_stmt
4104 = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
4105 first_vec, second_vec,
4106 mask_vec);
4107 perm_stmt_info
4108 = vect_finish_stmt_generation (vinfo,
4109 stmt_info, perm_stmt,
4110 gsi);
4111 }
4112 else
4113 /* If the mask was NULL_TREE, generate the requested
4114 identity transform. */
4115 perm_stmt_info = vinfo->lookup_def (first_vec);
4116
4117 /* Store the vector statement in NODE. */
4118 SLP_TREE_VEC_STMTS (node)[vect_stmts_counter++]
4119 = perm_stmt_info;
4120 }
4121 }
4122
4123 index = 0;
4124 first_vec_index = -1;
4125 second_vec_index = -1;
4126 noop_p = true;
4127 }
4128 }
4129
4130 return true;
4131 }
4132
4133 /* Vectorize SLP instance tree in postorder. */
4134
4135 static void
4136 vect_schedule_slp_instance (vec_info *vinfo,
4137 slp_tree node, slp_instance instance)
4138 {
4139 gimple_stmt_iterator si;
4140 stmt_vec_info stmt_info;
4141 unsigned int group_size;
4142 tree vectype;
4143 int i, j;
4144 slp_tree child;
4145
4146 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
4147 return;
4148
4149 /* See if we have already vectorized the node in the graph of the
4150 SLP instance. */
4151 if (SLP_TREE_VEC_STMTS (node).exists ())
4152 return;
4153
4154 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4155 vect_schedule_slp_instance (vinfo, child, instance);
4156
4157 /* Push SLP node def-type to stmts. */
4158 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4159 if (SLP_TREE_DEF_TYPE (child) != vect_internal_def)
4160 {
4161 stmt_vec_info child_stmt_info;
4162 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (child), j, child_stmt_info)
4163 STMT_VINFO_DEF_TYPE (child_stmt_info) = SLP_TREE_DEF_TYPE (child);
4164 }
4165
4166 stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
4167
4168 /* VECTYPE is the type of the destination. */
4169 vectype = STMT_VINFO_VECTYPE (stmt_info);
4170 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
4171 group_size = SLP_TREE_SCALAR_STMTS (node).length ();
4172
4173 gcc_assert (SLP_TREE_NUMBER_OF_VEC_STMTS (node) != 0);
4174 SLP_TREE_VEC_STMTS (node).create (SLP_TREE_NUMBER_OF_VEC_STMTS (node));
4175
4176 if (dump_enabled_p ())
4177 dump_printf_loc (MSG_NOTE, vect_location,
4178 "------>vectorizing SLP node starting from: %G",
4179 stmt_info->stmt);
4180
4181 /* Vectorized stmts go before the last scalar stmt, which is where
4182 all uses are ready. */
4183 stmt_vec_info last_stmt_info = vect_find_last_scalar_stmt_in_slp (node);
4184 si = gsi_for_stmt (last_stmt_info->stmt);
4185
4186 /* Handle two-operation SLP nodes by vectorizing the group with
4187 both operations and then performing a merge. */
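  /* For example, a group { a0 + b0, a1 - b1, a2 + b2, a3 - b3 } is
     vectorized once as a vector PLUS and once as a vector MINUS; a
     VEC_PERM_EXPR with mask { 0, 5, 2, 7 } then picks the PLUS result
     in lanes 0 and 2 and the MINUS result in lanes 1 and 3. */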
4188 bool done_p = false;
4189 if (SLP_TREE_TWO_OPERATORS (node))
4190 {
4191 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
4192 enum tree_code code0 = gimple_assign_rhs_code (stmt);
4193 enum tree_code ocode = ERROR_MARK;
4194 stmt_vec_info ostmt_info;
4195 vec_perm_builder mask (group_size, group_size, 1);
4196 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, ostmt_info)
4197 {
4198 gassign *ostmt = as_a <gassign *> (ostmt_info->stmt);
4199 if (gimple_assign_rhs_code (ostmt) != code0)
4200 {
4201 mask.quick_push (1);
4202 ocode = gimple_assign_rhs_code (ostmt);
4203 }
4204 else
4205 mask.quick_push (0);
4206 }
4207 if (ocode != ERROR_MARK)
4208 {
4209 vec<stmt_vec_info> v0;
4210 vec<stmt_vec_info> v1;
4211 unsigned j;
4212 tree tmask = NULL_TREE;
4213 vect_transform_stmt (vinfo, stmt_info, &si, node, instance);
4214 v0 = SLP_TREE_VEC_STMTS (node).copy ();
4215 SLP_TREE_VEC_STMTS (node).truncate (0);
4216 gimple_assign_set_rhs_code (stmt, ocode);
4217 vect_transform_stmt (vinfo, stmt_info, &si, node, instance);
4218 gimple_assign_set_rhs_code (stmt, code0);
4219 v1 = SLP_TREE_VEC_STMTS (node).copy ();
4220 SLP_TREE_VEC_STMTS (node).truncate (0);
4221 tree meltype = build_nonstandard_integer_type
4222 (GET_MODE_BITSIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype))), 1);
4223 tree mvectype = get_same_sized_vectype (meltype, vectype);
4224 unsigned k = 0, l;
4225 for (j = 0; j < v0.length (); ++j)
4226 {
4227 /* Enforced by vect_build_slp_tree, which rejects variable-length
4228 vectors for SLP_TREE_TWO_OPERATORS. */
4229 unsigned int const_nunits = nunits.to_constant ();
4230 tree_vector_builder melts (mvectype, const_nunits, 1);
4231 for (l = 0; l < const_nunits; ++l)
4232 {
4233 if (k >= group_size)
4234 k = 0;
4235 tree t = build_int_cst (meltype,
4236 mask[k++] * const_nunits + l);
4237 melts.quick_push (t);
4238 }
4239 tmask = melts.build ();
4240
4241 /* ??? Not all targets support a VEC_PERM_EXPR with a
4242 constant mask that would translate to a vec_merge RTX
4243 (with their vec_perm_const_ok). We can either not
4244 vectorize in that case or let veclower do its job.
4245 Unfortunately that isn't too great and at least for
4246 plus/minus we'd eventually like to match targets'
4247 vector addsub instructions. */
4248 gimple *vstmt;
4249 vstmt = gimple_build_assign (make_ssa_name (vectype),
4250 VEC_PERM_EXPR,
4251 gimple_assign_lhs (v0[j]->stmt),
4252 gimple_assign_lhs (v1[j]->stmt),
4253 tmask);
4254 SLP_TREE_VEC_STMTS (node).quick_push
4255 (vect_finish_stmt_generation (vinfo, stmt_info, vstmt, &si));
4256 }
4257 v0.release ();
4258 v1.release ();
4259 done_p = true;
4260 }
4261 }
4262 if (!done_p)
4263 vect_transform_stmt (vinfo, stmt_info, &si, node, instance);
4264
4265 /* Restore stmt def-types. */
4266 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4267 if (SLP_TREE_DEF_TYPE (child) != vect_internal_def)
4268 {
4269 stmt_vec_info child_stmt_info;
4270 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (child), j, child_stmt_info)
4271 STMT_VINFO_DEF_TYPE (child_stmt_info) = vect_internal_def;
4272 }
4273 }
4274
4275 /* Replace scalar calls from SLP node NODE with assignments of zero to their lhs.
4276 For loop vectorization this is done in vectorizable_call, but for SLP
4277 it needs to be deferred until the end of vect_schedule_slp, because multiple
4278 SLP instances may refer to the same scalar stmt. */
4279
4280 static void
4281 vect_remove_slp_scalar_calls (vec_info *vinfo,
4282 slp_tree node, hash_set<slp_tree> &visited)
4283 {
4284 gimple *new_stmt;
4285 gimple_stmt_iterator gsi;
4286 int i;
4287 slp_tree child;
4288 tree lhs;
4289 stmt_vec_info stmt_info;
4290
4291 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
4292 return;
4293
4294 if (visited.add (node))
4295 return;
4296
4297 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4298 vect_remove_slp_scalar_calls (vinfo, child, visited);
4299
4300 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
4301 {
4302 gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt);
4303 if (!stmt || gimple_bb (stmt) == NULL)
4304 continue;
4305 if (is_pattern_stmt_p (stmt_info)
4306 || !PURE_SLP_STMT (stmt_info))
4307 continue;
4308 lhs = gimple_call_lhs (stmt);
4309 new_stmt = gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs)));
4310 gsi = gsi_for_stmt (stmt);
4311 vinfo->replace_stmt (&gsi, stmt_info, new_stmt);
4312 SSA_NAME_DEF_STMT (gimple_assign_lhs (new_stmt)) = new_stmt;
4313 }
4314 }
4315
4316 static void
4317 vect_remove_slp_scalar_calls (vec_info *vinfo, slp_tree node)
4318 {
4319 hash_set<slp_tree> visited;
4320 vect_remove_slp_scalar_calls (vinfo, node, visited);
4321 }
4322
4323 /* Vectorize the instance root. */
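/* For example, if the root statement built a vector from scalars,
   x = { a_1, b_2, c_3, d_4 }, and its operands were vectorized into a
   single vector statement, the root is replaced by a copy (or
   VIEW_CONVERT_EXPR) of that vector statement's lhs; if several vector
   statements were needed, they are combined with a new CONSTRUCTOR. */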
4324
4325 void
4326 vectorize_slp_instance_root_stmt (slp_tree node, slp_instance instance)
4327 {
4328 gassign *rstmt = NULL;
4329
4330 if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) == 1)
4331 {
4332 stmt_vec_info child_stmt_info;
4333 int j;
4334
4335 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (node), j, child_stmt_info)
4336 {
4337 tree vect_lhs = gimple_get_lhs (child_stmt_info->stmt);
4338 tree root_lhs = gimple_get_lhs (instance->root_stmt->stmt);
4339 if (!useless_type_conversion_p (TREE_TYPE (root_lhs),
4340 TREE_TYPE (vect_lhs)))
4341 vect_lhs = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (root_lhs),
4342 vect_lhs);
4343 rstmt = gimple_build_assign (root_lhs, vect_lhs);
4344 break;
4345 }
4346 }
4347 else if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) > 1)
4348 {
4349 int nelts = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
4350 stmt_vec_info child_stmt_info;
4351 int j;
4352 vec<constructor_elt, va_gc> *v;
4353 vec_alloc (v, nelts);
4354
4355 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (node), j, child_stmt_info)
4356 {
4357 CONSTRUCTOR_APPEND_ELT (v,
4358 NULL_TREE,
4359 gimple_get_lhs (child_stmt_info->stmt));
4360 }
4361 tree lhs = gimple_get_lhs (instance->root_stmt->stmt);
4362 tree rtype = TREE_TYPE (gimple_assign_rhs1 (instance->root_stmt->stmt));
4363 tree r_constructor = build_constructor (rtype, v);
4364 rstmt = gimple_build_assign (lhs, r_constructor);
4365 }
4366
4367 gcc_assert (rstmt);
4368
4369 gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmt->stmt);
4370 gsi_replace (&rgsi, rstmt, true);
4371 }
4372
4373 /* Generate vector code for all SLP instances in the loop/basic block. */
4374
4375 void
4376 vect_schedule_slp (vec_info *vinfo)
4377 {
4378 vec<slp_instance> slp_instances;
4379 slp_instance instance;
4380 unsigned int i;
4381
4382 slp_instances = vinfo->slp_instances;
4383 FOR_EACH_VEC_ELT (slp_instances, i, instance)
4384 {
4385 slp_tree node = SLP_INSTANCE_TREE (instance);
4386 /* Schedule the tree of INSTANCE. */
4387 vect_schedule_slp_instance (vinfo, node, instance);
4388
4389 if (SLP_INSTANCE_ROOT_STMT (instance))
4390 vectorize_slp_instance_root_stmt (node, instance);
4391
4392 if (dump_enabled_p ())
4393 dump_printf_loc (MSG_NOTE, vect_location,
4394 "vectorizing stmts using SLP.\n");
4395 }
4396
4397 FOR_EACH_VEC_ELT (slp_instances, i, instance)
4398 {
4399 slp_tree root = SLP_INSTANCE_TREE (instance);
4400 stmt_vec_info store_info;
4401 unsigned int j;
4402
4403 /* Remove scalar call stmts. Do not do this for basic-block
4404 vectorization as not all uses may be vectorized.
4405 ??? Why should this be necessary? DCE should be able to
4406 remove the stmts itself.
4407 ??? For BB vectorization we could just as well remove scalar
4408 stmts starting from the SLP tree root if they have no
4409 uses. */
4410 if (is_a <loop_vec_info> (vinfo))
4411 vect_remove_slp_scalar_calls (vinfo, root);
4412
4413 for (j = 0; SLP_TREE_SCALAR_STMTS (root).iterate (j, &store_info); j++)
4414 {
4415 if (!STMT_VINFO_DATA_REF (store_info))
4416 break;
4417
4418 if (SLP_INSTANCE_ROOT_STMT (instance))
4419 continue;
4420
4421 store_info = vect_orig_stmt (store_info);
4422 /* Free the attached stmt_vec_info and remove the stmt. */
4423 vinfo->remove_stmt (store_info);
4424 }
4425 }
4426 }