/* SLP - Basic Block Vectorization
   Copyright (C) 2007-2020 Free Software Foundation, Inc.
   Contributed by Dorit Naishlos <dorit@il.ibm.com>
   and Ira Rosen <irar@il.ibm.com>

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify it under
   the terms of the GNU General Public License as published by the Free
   Software Foundation; either version 3, or (at your option) any later
   version.

   GCC is distributed in the hope that it will be useful, but WITHOUT ANY
   WARRANTY; without even the implied warranty of MERCHANTABILITY or
   FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
   for more details.

   You should have received a copy of the GNU General Public License
   along with GCC; see the file COPYING3.  If not see
   <http://www.gnu.org/licenses/>.  */

#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "target.h"
#include "rtl.h"
#include "tree.h"
#include "gimple.h"
#include "tree-pass.h"
#include "ssa.h"
#include "optabs-tree.h"
#include "insn-config.h"
#include "recog.h"		/* FIXME: for insn_data */
#include "fold-const.h"
#include "stor-layout.h"
#include "gimple-iterator.h"
#include "cfgloop.h"
#include "tree-vectorizer.h"
#include "langhooks.h"
#include "gimple-walk.h"
#include "dbgcnt.h"
#include "tree-vector-builder.h"
#include "vec-perm-indices.h"
#include "gimple-fold.h"
#include "internal-fn.h"
#include "dump-context.h"


static bool vectorizable_slp_permutation (vec_info *, gimple_stmt_iterator *,
					  slp_tree, stmt_vector_for_cost *);

/* Initialize an SLP node.  */

_slp_tree::_slp_tree ()
{
  SLP_TREE_SCALAR_STMTS (this) = vNULL;
  SLP_TREE_SCALAR_OPS (this) = vNULL;
  SLP_TREE_VEC_STMTS (this) = vNULL;
  SLP_TREE_VEC_DEFS (this) = vNULL;
  SLP_TREE_NUMBER_OF_VEC_STMTS (this) = 0;
  SLP_TREE_CHILDREN (this) = vNULL;
  SLP_TREE_LOAD_PERMUTATION (this) = vNULL;
  SLP_TREE_LANE_PERMUTATION (this) = vNULL;
  SLP_TREE_DEF_TYPE (this) = vect_uninitialized_def;
  SLP_TREE_CODE (this) = ERROR_MARK;
  SLP_TREE_VECTYPE (this) = NULL_TREE;
  SLP_TREE_REPRESENTATIVE (this) = NULL;
  this->refcnt = 1;
  this->max_nunits = 1;
  this->lanes = 0;
}

/* Tear down an SLP node.  */

_slp_tree::~_slp_tree ()
{
  SLP_TREE_CHILDREN (this).release ();
  SLP_TREE_SCALAR_STMTS (this).release ();
  SLP_TREE_SCALAR_OPS (this).release ();
  SLP_TREE_VEC_STMTS (this).release ();
  SLP_TREE_VEC_DEFS (this).release ();
  SLP_TREE_LOAD_PERMUTATION (this).release ();
  SLP_TREE_LANE_PERMUTATION (this).release ();
}

/* Recursively free the memory allocated for the SLP tree rooted at NODE.
   FINAL_P is true if we have vectorized the instance or if we have
   made a final decision not to vectorize the statements in any way.  */

static void
vect_free_slp_tree (slp_tree node, bool final_p)
{
  int i;
  slp_tree child;

  if (--node->refcnt != 0)
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_free_slp_tree (child, final_p);

  /* Don't update STMT_VINFO_NUM_SLP_USES if it isn't relevant.
     Some statements might no longer exist, after having been
     removed by vect_transform_stmt.  Updating the remaining
     statements would be redundant.  */
  if (!final_p)
    {
      stmt_vec_info stmt_info;
      FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
	{
	  gcc_assert (STMT_VINFO_NUM_SLP_USES (stmt_info) > 0);
	  STMT_VINFO_NUM_SLP_USES (stmt_info)--;
	}
    }

  delete node;
}

/* Free the memory allocated for the SLP instance.  FINAL_P is true if we
   have vectorized the instance or if we have made a final decision not
   to vectorize the statements in any way.  */

void
vect_free_slp_instance (slp_instance instance, bool final_p)
{
  vect_free_slp_tree (SLP_INSTANCE_TREE (instance), final_p);
  SLP_INSTANCE_LOADS (instance).release ();
  free (instance);
}

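/* Illustrative sketch of the reference counting (not a real code path):
   a node created by vect_create_new_slp_node starts with refcnt == 1;
   each additional parent and each bst_map entry takes one more
   reference, and vect_free_slp_tree only deletes the node (and recurses
   into its children) once the last reference is dropped:

     node->refcnt++;			 // second user takes a reference
     vect_free_slp_tree (node, false);	 // refcnt 2 -> 1, node kept
     vect_free_slp_tree (node, false);	 // refcnt 1 -> 0, node freed  */
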
/* Create an SLP node for SCALAR_STMTS.  */

static slp_tree
vect_create_new_slp_node (vec<stmt_vec_info> scalar_stmts, unsigned nops)
{
  slp_tree node = new _slp_tree;
  SLP_TREE_SCALAR_STMTS (node) = scalar_stmts;
  SLP_TREE_CHILDREN (node).create (nops);
  SLP_TREE_DEF_TYPE (node) = vect_internal_def;
  SLP_TREE_REPRESENTATIVE (node) = scalar_stmts[0];
  SLP_TREE_LANES (node) = scalar_stmts.length ();

  unsigned i;
  stmt_vec_info stmt_info;
  FOR_EACH_VEC_ELT (scalar_stmts, i, stmt_info)
    STMT_VINFO_NUM_SLP_USES (stmt_info)++;

  return node;
}

/* Create an SLP node for OPS.  */

static slp_tree
vect_create_new_slp_node (vec<tree> ops)
{
  slp_tree node = new _slp_tree;
  SLP_TREE_SCALAR_OPS (node) = ops;
  SLP_TREE_DEF_TYPE (node) = vect_external_def;
  SLP_TREE_LANES (node) = ops.length ();
  return node;
}


/* This structure is used in creation of an SLP tree.  Each instance
   corresponds to the same operand in a group of scalar stmts in an SLP
   node.  */
typedef struct _slp_oprnd_info
{
  /* Def-stmts for the operands.  */
  vec<stmt_vec_info> def_stmts;
  /* Operands.  */
  vec<tree> ops;
  /* Information about the first statement, its vector def-type, type, the
     operand itself in case it's constant, and an indication if it's a pattern
     stmt.  */
  tree first_op_type;
  enum vect_def_type first_dt;
  bool any_pattern;
} *slp_oprnd_info;


/* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
   operand.  */
static vec<slp_oprnd_info>
vect_create_oprnd_info (int nops, int group_size)
{
  int i;
  slp_oprnd_info oprnd_info;
  vec<slp_oprnd_info> oprnds_info;

  oprnds_info.create (nops);
  for (i = 0; i < nops; i++)
    {
      oprnd_info = XNEW (struct _slp_oprnd_info);
      oprnd_info->def_stmts.create (group_size);
      oprnd_info->ops.create (group_size);
      oprnd_info->first_dt = vect_uninitialized_def;
      oprnd_info->first_op_type = NULL_TREE;
      oprnd_info->any_pattern = false;
      oprnds_info.quick_push (oprnd_info);
    }

  return oprnds_info;
}


/* Free operands info.  */

static void
vect_free_oprnd_info (vec<slp_oprnd_info> &oprnds_info)
{
  int i;
  slp_oprnd_info oprnd_info;

  FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
    {
      oprnd_info->def_stmts.release ();
      oprnd_info->ops.release ();
      XDELETE (oprnd_info);
    }

  oprnds_info.release ();
}


/* Return true if STMTS contains a pattern statement.  */

static bool
vect_contains_pattern_stmt_p (vec<stmt_vec_info> stmts)
{
  stmt_vec_info stmt_info;
  unsigned int i;
  FOR_EACH_VEC_ELT (stmts, i, stmt_info)
    if (is_pattern_stmt_p (stmt_info))
      return true;
  return false;
}

/* Return true when all lanes in the external or constant NODE have
   the same value.  */

static bool
vect_slp_tree_uniform_p (slp_tree node)
{
  gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_constant_def
	      || SLP_TREE_DEF_TYPE (node) == vect_external_def);

  /* Pre-existing vectors.  */
  if (SLP_TREE_SCALAR_OPS (node).is_empty ())
    return false;

  unsigned i;
  tree op, first = NULL_TREE;
  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
    if (!first)
      first = op;
    else if (!operand_equal_p (first, op, 0))
      return false;

  return true;
}

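/* For example (illustrative only): an external node with scalar ops
   { x_1, x_1, x_1, x_1 } is uniform and can be materialized with a
   single splat, whereas { x_1, y_2, x_1, y_2 } is not, because lane 1
   differs from lane 0.  */
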
/* Find the place of the data-ref in STMT_INFO in the interleaving chain
   that starts from FIRST_STMT_INFO.  Return -1 if the data-ref is not a part
   of the chain.  */

int
vect_get_place_in_interleaving_chain (stmt_vec_info stmt_info,
				      stmt_vec_info first_stmt_info)
{
  stmt_vec_info next_stmt_info = first_stmt_info;
  int result = 0;

  if (first_stmt_info != DR_GROUP_FIRST_ELEMENT (stmt_info))
    return -1;

  do
    {
      if (next_stmt_info == stmt_info)
	return result;
      next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
      if (next_stmt_info)
	result += DR_GROUP_GAP (next_stmt_info);
    }
  while (next_stmt_info);

  return -1;
}

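/* Worked example (a sketch; it assumes DR_GROUP_GAP of each non-first
   chain element is its distance in elements from the previous member):
   for a chain built over the loads a[0], a[1], a[3], the accumulated
   gaps give places 0, 1 and 3 respectively, and any stmt whose
   DR_GROUP_FIRST_ELEMENT differs from FIRST_STMT_INFO yields -1.  */
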
/* Check whether it is possible to load COUNT elements of type ELT_TYPE
   using the method implemented by duplicate_and_interleave.  Return true
   if so, returning the number of intermediate vectors in *NVECTORS_OUT
   (if nonnull) and the type of each intermediate vector in *VECTOR_TYPE_OUT
   (if nonnull).  */

bool
can_duplicate_and_interleave_p (vec_info *vinfo, unsigned int count,
				tree elt_type, unsigned int *nvectors_out,
				tree *vector_type_out,
				tree *permutes)
{
  tree base_vector_type = get_vectype_for_scalar_type (vinfo, elt_type, count);
  if (!base_vector_type || !VECTOR_MODE_P (TYPE_MODE (base_vector_type)))
    return false;

  machine_mode base_vector_mode = TYPE_MODE (base_vector_type);
  poly_int64 elt_bytes = count * GET_MODE_UNIT_SIZE (base_vector_mode);
  unsigned int nvectors = 1;
  for (;;)
    {
      scalar_int_mode int_mode;
      poly_int64 elt_bits = elt_bytes * BITS_PER_UNIT;
      if (int_mode_for_size (elt_bits, 1).exists (&int_mode))
	{
	  /* Get the natural vector type for this SLP group size.  */
	  tree int_type = build_nonstandard_integer_type
	    (GET_MODE_BITSIZE (int_mode), 1);
	  tree vector_type
	    = get_vectype_for_scalar_type (vinfo, int_type, count);
	  if (vector_type
	      && VECTOR_MODE_P (TYPE_MODE (vector_type))
	      && known_eq (GET_MODE_SIZE (TYPE_MODE (vector_type)),
			   GET_MODE_SIZE (base_vector_mode)))
	    {
	      /* Try fusing consecutive sequences of COUNT / NVECTORS elements
		 together into elements of type INT_TYPE and using the result
		 to build NVECTORS vectors.  */
	      poly_uint64 nelts = GET_MODE_NUNITS (TYPE_MODE (vector_type));
	      vec_perm_builder sel1 (nelts, 2, 3);
	      vec_perm_builder sel2 (nelts, 2, 3);
	      poly_int64 half_nelts = exact_div (nelts, 2);
	      for (unsigned int i = 0; i < 3; ++i)
		{
		  sel1.quick_push (i);
		  sel1.quick_push (i + nelts);
		  sel2.quick_push (half_nelts + i);
		  sel2.quick_push (half_nelts + i + nelts);
		}
	      vec_perm_indices indices1 (sel1, 2, nelts);
	      vec_perm_indices indices2 (sel2, 2, nelts);
	      if (can_vec_perm_const_p (TYPE_MODE (vector_type), indices1)
		  && can_vec_perm_const_p (TYPE_MODE (vector_type), indices2))
		{
		  if (nvectors_out)
		    *nvectors_out = nvectors;
		  if (vector_type_out)
		    *vector_type_out = vector_type;
		  if (permutes)
		    {
		      permutes[0] = vect_gen_perm_mask_checked (vector_type,
								indices1);
		      permutes[1] = vect_gen_perm_mask_checked (vector_type,
								indices2);
		    }
		  return true;
		}
	    }
	}
      if (!multiple_p (elt_bytes, 2, &elt_bytes))
	return false;
      nvectors *= 2;
    }
}

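/* Worked example of the two selectors above (illustrative only): with
   NELTS == 8 the three explicitly pushed elements of each pattern
   extend to

     sel1 = { 0, 8, 1, 9, 2, 10, 3, 11 }	(interleave low halves)
     sel2 = { 4, 12, 5, 13, 6, 14, 7, 15 }	(interleave high halves)

   i.e. the usual lo/hi interleave of two input vectors, which is what
   duplicate_and_interleave relies on the target supporting.  */
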
/* Get the defs for the rhs of STMT (collect them in OPRNDS_INFO), check that
   they are of a valid type and that they match the defs of the first stmt of
   the SLP group (stored in OPRNDS_INFO).  This function tries to match stmts
   by swapping operands of STMTS[STMT_NUM] when possible.  On entry, a nonzero
   *SWAP indicates that a swap is required for cond_expr stmts: *SWAP is 1 if
   STMT is a cond and the operands of its comparison need to be swapped;
   *SWAP is 2 if STMT is a cond and the code of its comparison needs to be
   inverted.  If this function swaps any operands, it sets *SWAP to a
   nonzero value on exit.
   Return -1 on a fatal error; return 1 if the error could be corrected by
   swapping operands of this node's parent; return 0 if everything is OK.  */
static int
vect_get_and_check_slp_defs (vec_info *vinfo, unsigned char *swap,
			     vec<stmt_vec_info> stmts, unsigned stmt_num,
			     vec<slp_oprnd_info> *oprnds_info)
{
  stmt_vec_info stmt_info = stmts[stmt_num];
  tree oprnd;
  unsigned int i, number_of_oprnds;
  enum vect_def_type dt = vect_uninitialized_def;
  slp_oprnd_info oprnd_info;
  int first_op_idx = 1;
  unsigned int commutative_op = -1U;
  bool first_op_cond = false;
  bool first = stmt_num == 0;

  if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
    {
      number_of_oprnds = gimple_call_num_args (stmt);
      first_op_idx = 3;
      if (gimple_call_internal_p (stmt))
	{
	  internal_fn ifn = gimple_call_internal_fn (stmt);
	  commutative_op = first_commutative_argument (ifn);

	  /* Masked load, only look at mask.  */
	  if (ifn == IFN_MASK_LOAD)
	    {
	      number_of_oprnds = 1;
	      /* Mask operand index.  */
	      first_op_idx = 5;
	    }
	}
    }
  else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
    {
      enum tree_code code = gimple_assign_rhs_code (stmt);
      number_of_oprnds = gimple_num_ops (stmt) - 1;
      /* Swap can only be done for cond_expr if asked to, otherwise we
	 could end up with a comparison code different from the first
	 stmt's.  */
      if (code == COND_EXPR
	  && COMPARISON_CLASS_P (gimple_assign_rhs1 (stmt)))
	{
	  first_op_cond = true;
	  number_of_oprnds++;
	}
      else
	commutative_op = commutative_tree_code (code) ? 0U : -1U;
    }
  else
    return -1;

  bool swapped = (*swap != 0);
  gcc_assert (!swapped || first_op_cond);
  for (i = 0; i < number_of_oprnds; i++)
    {
    again:
      if (first_op_cond)
	{
	  /* Map indicating how operands of cond_expr should be swapped.  */
	  int maps[3][4] = {{0, 1, 2, 3}, {1, 0, 2, 3}, {0, 1, 3, 2}};
	  int *map = maps[*swap];

	  if (i < 2)
	    oprnd = TREE_OPERAND (gimple_op (stmt_info->stmt,
					     first_op_idx), map[i]);
	  else
	    oprnd = gimple_op (stmt_info->stmt, map[i]);
	}
      else
	oprnd = gimple_op (stmt_info->stmt, first_op_idx + (swapped ? !i : i));
      if (TREE_CODE (oprnd) == VIEW_CONVERT_EXPR)
	oprnd = TREE_OPERAND (oprnd, 0);

      oprnd_info = (*oprnds_info)[i];

      stmt_vec_info def_stmt_info;
      if (!vect_is_simple_use (oprnd, vinfo, &dt, &def_stmt_info))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "Build SLP failed: can't analyze def for %T\n",
			     oprnd);

	  return -1;
	}

      if (def_stmt_info && is_pattern_stmt_p (def_stmt_info))
	oprnd_info->any_pattern = true;

      if (first)
	{
	  /* For the swapping logic below force vect_reduction_def
	     for the reduction op in an SLP reduction group.  */
	  if (!STMT_VINFO_DATA_REF (stmt_info)
	      && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
	      && (int)i == STMT_VINFO_REDUC_IDX (stmt_info)
	      && def_stmt_info)
	    dt = vect_reduction_def;
	  oprnd_info->first_dt = dt;
	  oprnd_info->first_op_type = TREE_TYPE (oprnd);
	}
      else
	{
	  /* Not first stmt of the group, check that the def-stmt/s match
	     the def-stmt/s of the first stmt.  Allow different definition
	     types for reduction chains: the first stmt must be a
	     vect_reduction_def (a phi node), and the rest
	     end in the reduction chain.  */
	  tree type = TREE_TYPE (oprnd);
	  if ((oprnd_info->first_dt != dt
	       && !(oprnd_info->first_dt == vect_reduction_def
		    && !STMT_VINFO_DATA_REF (stmt_info)
		    && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
		    && def_stmt_info
		    && !STMT_VINFO_DATA_REF (def_stmt_info)
		    && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
			== REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
	       && !((oprnd_info->first_dt == vect_external_def
		     || oprnd_info->first_dt == vect_constant_def)
		    && (dt == vect_external_def
			|| dt == vect_constant_def)))
	      || !types_compatible_p (oprnd_info->first_op_type, type)
	      || (!STMT_VINFO_DATA_REF (stmt_info)
		  && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
		  && ((!def_stmt_info
		       || STMT_VINFO_DATA_REF (def_stmt_info)
		       || (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
			   != REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
		      != (oprnd_info->first_dt != vect_reduction_def))))
	    {
	      /* Try swapping operands if we got a mismatch.  */
	      if (i == commutative_op && !swapped)
		{
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_NOTE, vect_location,
				     "trying swapped operands\n");
		  swapped = true;
		  goto again;
		}

	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: different types\n");

	      return 1;
	    }
	  if ((dt == vect_constant_def
	       || dt == vect_external_def)
	      && !GET_MODE_SIZE (vinfo->vector_mode).is_constant ()
	      && (TREE_CODE (type) == BOOLEAN_TYPE
		  || !can_duplicate_and_interleave_p (vinfo, stmts.length (),
						      type)))
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: invalid type of def "
				 "for variable-length SLP %T\n", oprnd);
	      return -1;
	    }
	}

      /* Check the types of the definitions.  */
      switch (dt)
	{
	case vect_external_def:
	  /* Make sure to demote the overall operand to external.  */
	  oprnd_info->first_dt = vect_external_def;
	  /* Fallthru.  */
	case vect_constant_def:
	  oprnd_info->def_stmts.quick_push (NULL);
	  oprnd_info->ops.quick_push (oprnd);
	  break;

	case vect_internal_def:
	case vect_reduction_def:
	  if (oprnd_info->first_dt == vect_reduction_def
	      && !STMT_VINFO_DATA_REF (stmt_info)
	      && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
	      && !STMT_VINFO_DATA_REF (def_stmt_info)
	      && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
		  == REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
	    {
	      /* For an SLP reduction chain we want to duplicate the
		 reduction to each of the chain members.  That gets
		 us a sane SLP graph (still the stmts are not 100%
		 correct wrt the initial values).  */
	      gcc_assert (!first);
	      oprnd_info->def_stmts.quick_push (oprnd_info->def_stmts[0]);
	      oprnd_info->ops.quick_push (oprnd_info->ops[0]);
	      break;
	    }
	  /* Fallthru.  */
	case vect_induction_def:
	  oprnd_info->def_stmts.quick_push (def_stmt_info);
	  oprnd_info->ops.quick_push (oprnd);
	  break;

	default:
	  /* FORNOW: Not supported.  */
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "Build SLP failed: illegal type of def %T\n",
			     oprnd);

	  return -1;
	}
    }

  /* Swap operands.  */
  if (swapped)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "swapped operands to match def types in %G",
			 stmt_info->stmt);
    }

  *swap = swapped;
  return 0;
}

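/* For example (illustrative only; GIMPLE canonicalization often performs
   this swap already): for the group { x0 = t0 + 5; x1 = 5 + t1 } lane 1
   initially presents a constant where lane 0 recorded an internal def.
   Since PLUS_EXPR is commutative (commutative_op == 0), the code above
   retries lane 1 with its operands swapped, so both lanes match with
   { vect_internal_def, vect_constant_def }.  */
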
/* Try to assign vector type VECTYPE to STMT_INFO for BB vectorization.
   Return true if we can, meaning that this choice doesn't conflict with
   existing SLP nodes that use STMT_INFO.  */

static bool
vect_update_shared_vectype (stmt_vec_info stmt_info, tree vectype)
{
  tree old_vectype = STMT_VINFO_VECTYPE (stmt_info);
  if (old_vectype && useless_type_conversion_p (vectype, old_vectype))
    return true;

  if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
      && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
    {
      /* We maintain the invariant that if any statement in the group is
	 used, all other members of the group have the same vector type.  */
      stmt_vec_info first_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
      stmt_vec_info member_info = first_info;
      for (; member_info; member_info = DR_GROUP_NEXT_ELEMENT (member_info))
	if (STMT_VINFO_NUM_SLP_USES (member_info) > 0
	    || is_pattern_stmt_p (member_info))
	  break;

      if (!member_info)
	{
	  for (member_info = first_info; member_info;
	       member_info = DR_GROUP_NEXT_ELEMENT (member_info))
	    STMT_VINFO_VECTYPE (member_info) = vectype;
	  return true;
	}
    }
  else if (STMT_VINFO_NUM_SLP_USES (stmt_info) == 0
	   && !is_pattern_stmt_p (stmt_info))
    {
      STMT_VINFO_VECTYPE (stmt_info) = vectype;
      return true;
    }

  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
		       "Build SLP failed: incompatible vector"
		       " types for: %G", stmt_info->stmt);
      dump_printf_loc (MSG_NOTE, vect_location,
		       "    old vector type: %T\n", old_vectype);
      dump_printf_loc (MSG_NOTE, vect_location,
		       "    new vector type: %T\n", vectype);
    }
  return false;
}

/* Return true if call statements CALL1 and CALL2 are similar enough
   to be combined into the same SLP group.  */

static bool
compatible_calls_p (gcall *call1, gcall *call2)
{
  unsigned int nargs = gimple_call_num_args (call1);
  if (nargs != gimple_call_num_args (call2))
    return false;

  if (gimple_call_combined_fn (call1) != gimple_call_combined_fn (call2))
    return false;

  if (gimple_call_internal_p (call1))
    {
      if (!types_compatible_p (TREE_TYPE (gimple_call_lhs (call1)),
			       TREE_TYPE (gimple_call_lhs (call2))))
	return false;
      for (unsigned int i = 0; i < nargs; ++i)
	if (!types_compatible_p (TREE_TYPE (gimple_call_arg (call1, i)),
				 TREE_TYPE (gimple_call_arg (call2, i))))
	  return false;
    }
  else
    {
      if (!operand_equal_p (gimple_call_fn (call1),
			    gimple_call_fn (call2), 0))
	return false;

      if (gimple_call_fntype (call1) != gimple_call_fntype (call2))
	return false;
    }
  return true;
}

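/* For example (illustrative only): the lanes

     x0 = sqrtf (a0);   x1 = sqrtf (a1);

   are compatible (same combined_fn, compatible types), whereas mixing
   sqrtf with cosf, or sqrtf with sqrt, fails the combined_fn check.  */
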
/* A subroutine of vect_build_slp_tree for checking VECTYPE, which is the
   caller's attempt to find the vector type in STMT_INFO with the narrowest
   element type.  Return true if VECTYPE is nonnull and if it is valid
   for STMT_INFO.  When returning true, update MAX_NUNITS to reflect the
   number of units in VECTYPE.  GROUP_SIZE and MAX_NUNITS are as for
   vect_build_slp_tree.  */

static bool
vect_record_max_nunits (vec_info *vinfo, stmt_vec_info stmt_info,
			unsigned int group_size,
			tree vectype, poly_uint64 *max_nunits)
{
  if (!vectype)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "Build SLP failed: unsupported data-type in %G\n",
			 stmt_info->stmt);
      /* Fatal mismatch.  */
      return false;
    }

  /* If populating the vector type requires unrolling then fail
     before adjusting *max_nunits for basic-block vectorization.  */
  poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
  unsigned HOST_WIDE_INT const_nunits;
  if (is_a <bb_vec_info> (vinfo)
      && (!nunits.is_constant (&const_nunits)
	  || const_nunits > group_size))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "Build SLP failed: unrolling required "
			 "in basic block SLP\n");
      /* Fatal mismatch.  */
      return false;
    }

  /* In case of multiple types we need to detect the smallest type.  */
  vect_update_max_nunits (max_nunits, vectype);
  return true;
}

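/* For example (illustrative only): in basic-block SLP a group of two
   int stores cannot use a V4SI vector type, because const_nunits == 4
   exceeds group_size == 2 and would require unrolling, which only loop
   vectorization can do; a V2SI vector type would be acceptable.  */
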
/* Verify that the scalar stmts STMTS are isomorphic, do not require any
   data permutation, and are of supported types of operation.  Return
   true if they are, otherwise return false and indicate in *MATCHES
   which stmts are not isomorphic to the first one.  If MATCHES[0]
   is false then this indicates that the comparison could not be
   carried out or that the stmts will never be vectorized by SLP.

   Note COND_EXPR is possibly isomorphic to another one after swapping its
   operands.  Set SWAP[i] to 1 if stmt I is a COND_EXPR and isomorphic to
   the first stmt by swapping the two operands of its comparison; set SWAP[i]
   to 2 if stmt I is isomorphic to the first stmt by inverting the code
   of its comparison.  Taking A1 >= B1 ? X1 : Y1 as an example, it can be
   swapped to (B1 <= A1 ? X1 : Y1) or inverted to (A1 < B1 ? Y1 : X1).  */

static bool
vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap,
		       vec<stmt_vec_info> stmts, unsigned int group_size,
		       poly_uint64 *max_nunits, bool *matches,
		       bool *two_operators, tree *node_vectype)
{
  unsigned int i;
  stmt_vec_info first_stmt_info = stmts[0];
  enum tree_code first_stmt_code = ERROR_MARK;
  enum tree_code alt_stmt_code = ERROR_MARK;
  enum tree_code rhs_code = ERROR_MARK;
  enum tree_code first_cond_code = ERROR_MARK;
  tree lhs;
  bool need_same_oprnds = false;
  tree vectype = NULL_TREE, first_op1 = NULL_TREE;
  optab optab;
  int icode;
  machine_mode optab_op2_mode;
  machine_mode vec_mode;
  stmt_vec_info first_load = NULL, prev_first_load = NULL;
  bool first_stmt_load_p = false, load_p = false;

  /* For every stmt in NODE find its def stmt/s.  */
  stmt_vec_info stmt_info;
  FOR_EACH_VEC_ELT (stmts, i, stmt_info)
    {
      gimple *stmt = stmt_info->stmt;
      swap[i] = 0;
      matches[i] = false;

      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location, "Build SLP for %G", stmt);

      /* Fail to vectorize statements marked as unvectorizable.  */
      if (!STMT_VINFO_VECTORIZABLE (stmt_info))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "Build SLP failed: unvectorizable statement %G",
			     stmt);
	  /* Fatal mismatch.  */
	  matches[0] = false;
	  return false;
	}

      lhs = gimple_get_lhs (stmt);
      if (lhs == NULL_TREE)
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "Build SLP failed: not GIMPLE_ASSIGN nor "
			     "GIMPLE_CALL %G", stmt);
	  /* Fatal mismatch.  */
	  matches[0] = false;
	  return false;
	}

      tree nunits_vectype;
      if (!vect_get_vector_types_for_stmt (vinfo, stmt_info, &vectype,
					   &nunits_vectype, group_size)
	  || (nunits_vectype
	      && !vect_record_max_nunits (vinfo, stmt_info, group_size,
					  nunits_vectype, max_nunits)))
	{
	  /* Fatal mismatch.  */
	  matches[0] = false;
	  return false;
	}

      gcc_assert (vectype);

      if (is_a <bb_vec_info> (vinfo)
	  && !vect_update_shared_vectype (stmt_info, vectype))
	continue;

      gcall *call_stmt = dyn_cast <gcall *> (stmt);
      if (call_stmt)
	{
	  rhs_code = CALL_EXPR;

	  if (gimple_call_internal_p (stmt, IFN_MASK_LOAD))
	    load_p = true;
	  else if ((gimple_call_internal_p (call_stmt)
		    && (!vectorizable_internal_fn_p
			(gimple_call_internal_fn (call_stmt))))
		   || gimple_call_tail_p (call_stmt)
		   || gimple_call_noreturn_p (call_stmt)
		   || !gimple_call_nothrow_p (call_stmt)
		   || gimple_call_chain (call_stmt))
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: unsupported call type %G",
				 call_stmt);
	      /* Fatal mismatch.  */
	      matches[0] = false;
	      return false;
	    }
	}
      else
	{
	  rhs_code = gimple_assign_rhs_code (stmt);
	  load_p = gimple_vuse (stmt);
	}

      /* Check the operation.  */
      if (i == 0)
	{
	  *node_vectype = vectype;
	  first_stmt_code = rhs_code;
	  first_stmt_load_p = load_p;

	  /* Shift arguments should be equal in all the packed stmts for a
	     vector shift with scalar shift operand.  */
	  if (rhs_code == LSHIFT_EXPR || rhs_code == RSHIFT_EXPR
	      || rhs_code == LROTATE_EXPR
	      || rhs_code == RROTATE_EXPR)
	    {
	      vec_mode = TYPE_MODE (vectype);

	      /* First see if we have a vector/vector shift.  */
	      optab = optab_for_tree_code (rhs_code, vectype, optab_vector);

	      if (!optab
		  || optab_handler (optab, vec_mode) == CODE_FOR_nothing)
		{
		  /* No vector/vector shift, try for a vector/scalar shift.  */
		  optab = optab_for_tree_code (rhs_code, vectype,
					       optab_scalar);

		  if (!optab)
		    {
		      if (dump_enabled_p ())
			dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
					 "Build SLP failed: no optab.\n");
		      /* Fatal mismatch.  */
		      matches[0] = false;
		      return false;
		    }
		  icode = (int) optab_handler (optab, vec_mode);
		  if (icode == CODE_FOR_nothing)
		    {
		      if (dump_enabled_p ())
			dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
					 "Build SLP failed: "
					 "op not supported by target.\n");
		      /* Fatal mismatch.  */
		      matches[0] = false;
		      return false;
		    }
		  optab_op2_mode = insn_data[icode].operand[2].mode;
		  if (!VECTOR_MODE_P (optab_op2_mode))
		    {
		      need_same_oprnds = true;
		      first_op1 = gimple_assign_rhs2 (stmt);
		    }
		}
	    }
	  else if (rhs_code == WIDEN_LSHIFT_EXPR)
	    {
	      need_same_oprnds = true;
	      first_op1 = gimple_assign_rhs2 (stmt);
	    }
	  else if (!load_p
		   && rhs_code == BIT_FIELD_REF)
	    {
	      tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt), 0);
	      if (TREE_CODE (vec) != SSA_NAME
		  || !types_compatible_p (vectype, TREE_TYPE (vec)))
		{
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				     "Build SLP failed: "
				     "BIT_FIELD_REF not supported\n");
		  /* Fatal mismatch.  */
		  matches[0] = false;
		  return false;
		}
	    }
	  else if (call_stmt
		   && gimple_call_internal_p (call_stmt, IFN_DIV_POW2))
	    {
	      need_same_oprnds = true;
	      first_op1 = gimple_call_arg (call_stmt, 1);
	    }
	}
      else
	{
	  if (first_stmt_code != rhs_code
	      && alt_stmt_code == ERROR_MARK)
	    alt_stmt_code = rhs_code;
	  if ((first_stmt_code != rhs_code
	       && (first_stmt_code != IMAGPART_EXPR
		   || rhs_code != REALPART_EXPR)
	       && (first_stmt_code != REALPART_EXPR
		   || rhs_code != IMAGPART_EXPR)
	       /* Handle mismatches in plus/minus by computing both
		  and merging the results.  */
	       && !((first_stmt_code == PLUS_EXPR
		     || first_stmt_code == MINUS_EXPR)
		    && (alt_stmt_code == PLUS_EXPR
			|| alt_stmt_code == MINUS_EXPR)
		    && rhs_code == alt_stmt_code)
	       && !(STMT_VINFO_GROUPED_ACCESS (stmt_info)
		    && (first_stmt_code == ARRAY_REF
			|| first_stmt_code == BIT_FIELD_REF
			|| first_stmt_code == INDIRECT_REF
			|| first_stmt_code == COMPONENT_REF
			|| first_stmt_code == MEM_REF)))
	      || first_stmt_load_p != load_p)
	    {
	      if (dump_enabled_p ())
		{
		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				   "Build SLP failed: different operation "
				   "in stmt %G", stmt);
		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				   "original stmt %G", first_stmt_info->stmt);
		}
	      /* Mismatch.  */
	      continue;
	    }

	  if (need_same_oprnds)
	    {
	      tree other_op1 = (call_stmt
				? gimple_call_arg (call_stmt, 1)
				: gimple_assign_rhs2 (stmt));
	      if (!operand_equal_p (first_op1, other_op1, 0))
		{
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				     "Build SLP failed: different shift "
				     "arguments in %G", stmt);
		  /* Mismatch.  */
		  continue;
		}
	    }
	  if (!load_p
	      && first_stmt_code == BIT_FIELD_REF
	      && (TREE_OPERAND (gimple_assign_rhs1 (first_stmt_info->stmt), 0)
		  != TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0)))
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: different BIT_FIELD_REF "
				 "arguments in %G", stmt);
	      /* Mismatch.  */
	      continue;
	    }

	  if (!load_p && rhs_code == CALL_EXPR)
	    {
	      if (!compatible_calls_p (as_a <gcall *> (stmts[0]->stmt),
				       as_a <gcall *> (stmt)))
		{
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				     "Build SLP failed: different calls in %G",
				     stmt);
		  /* Mismatch.  */
		  continue;
		}
	    }
	}

      /* Grouped store or load.  */
      if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
	{
	  if (REFERENCE_CLASS_P (lhs))
	    {
	      /* Store.  */
	      ;
	    }
	  else
	    {
	      /* Load.  */
	      first_load = DR_GROUP_FIRST_ELEMENT (stmt_info);
	      if (prev_first_load)
		{
		  /* Check that there are no loads from different interleaving
		     chains in the same node.  */
		  if (prev_first_load != first_load)
		    {
		      if (dump_enabled_p ())
			dump_printf_loc (MSG_MISSED_OPTIMIZATION,
					 vect_location,
					 "Build SLP failed: different "
					 "interleaving chains in one node %G",
					 stmt);
		      /* Mismatch.  */
		      continue;
		    }
		}
	      else
		prev_first_load = first_load;
	    }
	} /* Grouped access.  */
      else
	{
	  if (load_p)
	    {
	      /* Not grouped load.  */
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: not grouped load %G", stmt);

	      /* FORNOW: Not grouped loads are not supported.  */
	      /* Fatal mismatch.  */
	      matches[0] = false;
	      return false;
	    }

	  /* Not memory operation.  */
	  if (TREE_CODE_CLASS (rhs_code) != tcc_binary
	      && TREE_CODE_CLASS (rhs_code) != tcc_unary
	      && TREE_CODE_CLASS (rhs_code) != tcc_expression
	      && TREE_CODE_CLASS (rhs_code) != tcc_comparison
	      && rhs_code != VIEW_CONVERT_EXPR
	      && rhs_code != CALL_EXPR
	      && rhs_code != BIT_FIELD_REF)
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: operation unsupported %G",
				 stmt);
	      /* Fatal mismatch.  */
	      matches[0] = false;
	      return false;
	    }

	  if (rhs_code == COND_EXPR)
	    {
	      tree cond_expr = gimple_assign_rhs1 (stmt);
	      enum tree_code cond_code = TREE_CODE (cond_expr);
	      enum tree_code swap_code = ERROR_MARK;
	      enum tree_code invert_code = ERROR_MARK;

	      if (i == 0)
		first_cond_code = TREE_CODE (cond_expr);
	      else if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
		{
		  bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
		  swap_code = swap_tree_comparison (cond_code);
		  invert_code = invert_tree_comparison (cond_code, honor_nans);
		}

	      if (first_cond_code == cond_code)
		;
	      /* Isomorphic can be achieved by swapping.  */
	      else if (first_cond_code == swap_code)
		swap[i] = 1;
	      /* Isomorphic can be achieved by inverting.  */
	      else if (first_cond_code == invert_code)
		swap[i] = 2;
	      else
		{
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				     "Build SLP failed: different"
				     " operation %G", stmt);
		  /* Mismatch.  */
		  continue;
		}
	    }
	}

      matches[i] = true;
    }

  for (i = 0; i < group_size; ++i)
    if (!matches[i])
      return false;

  /* If we allowed a two-operation SLP node verify the target can cope
     with the permute we are going to use.  */
  if (alt_stmt_code != ERROR_MARK
      && TREE_CODE_CLASS (alt_stmt_code) != tcc_reference)
    {
      *two_operators = true;
    }

  return true;
}

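/* For example (illustrative only): the group

     c[0] = a[0] + b[0];   c[1] = a[1] - b[1];
     c[2] = a[2] + b[2];   c[3] = a[3] - b[3];

   is accepted with first_stmt_code == PLUS_EXPR and alt_stmt_code ==
   MINUS_EXPR, setting *two_operators so that the build creates one
   all-plus and one all-minus node whose lanes are blended with a
   VEC_PERM_EXPR (see the two_operators handling in
   vect_build_slp_tree_2 below).  */
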
/* Traits for the hash_set to record failed SLP builds for a stmt set.
   Note we never remove apart from at destruction time so we do not
   need a special value for deleted that differs from empty.  */
struct bst_traits
{
  typedef vec <stmt_vec_info> value_type;
  typedef vec <stmt_vec_info> compare_type;
  static inline hashval_t hash (value_type);
  static inline bool equal (value_type existing, value_type candidate);
  static inline bool is_empty (value_type x) { return !x.exists (); }
  static inline bool is_deleted (value_type x) { return !x.exists (); }
  static const bool empty_zero_p = true;
  static inline void mark_empty (value_type &x) { x.release (); }
  static inline void mark_deleted (value_type &x) { x.release (); }
  static inline void remove (value_type &x) { x.release (); }
};
inline hashval_t
bst_traits::hash (value_type x)
{
  inchash::hash h;
  for (unsigned i = 0; i < x.length (); ++i)
    h.add_int (gimple_uid (x[i]->stmt));
  return h.end ();
}
inline bool
bst_traits::equal (value_type existing, value_type candidate)
{
  if (existing.length () != candidate.length ())
    return false;
  for (unsigned i = 0; i < existing.length (); ++i)
    if (existing[i] != candidate[i])
      return false;
  return true;
}

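/* A sketch of how these traits are used (see vect_build_slp_tree below):

     if (slp_tree *leader = bst_map->get (stmts))
       return *leader;	  // cache hit: previously built, or failed (NULL)
     slp_tree res = ...;  // attempt the build
     bst_map->put (stmts.copy (), res);	 // remember success *and* failure

   Hashing gimple_uid rather than stmt pointers makes the hash value
   independent of allocation addresses.  */
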
typedef hash_map <vec <gimple *>, slp_tree,
		  simple_hashmap_traits <bst_traits, slp_tree> >
  scalar_stmts_to_slp_tree_map_t;

static slp_tree
vect_build_slp_tree_2 (vec_info *vinfo,
		       vec<stmt_vec_info> stmts, unsigned int group_size,
		       poly_uint64 *max_nunits,
		       bool *matches, unsigned *npermutes, unsigned *tree_size,
		       scalar_stmts_to_slp_tree_map_t *bst_map);

static slp_tree
vect_build_slp_tree (vec_info *vinfo,
		     vec<stmt_vec_info> stmts, unsigned int group_size,
		     poly_uint64 *max_nunits,
		     bool *matches, unsigned *npermutes, unsigned *tree_size,
		     scalar_stmts_to_slp_tree_map_t *bst_map)
{
  if (slp_tree *leader = bst_map->get (stmts))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location, "re-using %sSLP tree %p\n",
			 *leader ? "" : "failed ", *leader);
      if (*leader)
	{
	  (*leader)->refcnt++;
	  vect_update_max_nunits (max_nunits, (*leader)->max_nunits);
	}
      return *leader;
    }
  poly_uint64 this_max_nunits = 1;
  slp_tree res = vect_build_slp_tree_2 (vinfo, stmts, group_size,
					&this_max_nunits,
					matches, npermutes, tree_size, bst_map);
  if (res)
    {
      res->max_nunits = this_max_nunits;
      vect_update_max_nunits (max_nunits, this_max_nunits);
      /* Keep a reference for the bst_map use.  */
      res->refcnt++;
    }
  bst_map->put (stmts.copy (), res);
  return res;
}

/* Recursively build an SLP tree for the group of scalar stmts STMTS.
   Fail (and return NULL) if the def-stmts are not isomorphic, require a
   data permutation, or are of unsupported types of operation; in that
   case MATCHES indicates which of the stmts broke the match.  Otherwise
   return the new SLP node.  */

static slp_tree
vect_build_slp_tree_2 (vec_info *vinfo,
		       vec<stmt_vec_info> stmts, unsigned int group_size,
		       poly_uint64 *max_nunits,
		       bool *matches, unsigned *npermutes, unsigned *tree_size,
		       scalar_stmts_to_slp_tree_map_t *bst_map)
{
  unsigned nops, i, this_tree_size = 0;
  poly_uint64 this_max_nunits = *max_nunits;
  slp_tree node;

  matches[0] = false;

  stmt_vec_info stmt_info = stmts[0];
  if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
    nops = gimple_call_num_args (stmt);
  else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
    {
      nops = gimple_num_ops (stmt) - 1;
      if (gimple_assign_rhs_code (stmt) == COND_EXPR)
	nops++;
    }
  else if (is_a <gphi *> (stmt_info->stmt))
    nops = 0;
  else
    return NULL;

  /* If the SLP node is a PHI (induction or reduction), terminate
     the recursion.  */
  if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
    {
      tree scalar_type = TREE_TYPE (PHI_RESULT (stmt));
      tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type);
      if (!vect_record_max_nunits (vinfo, stmt_info, group_size, vectype,
				   max_nunits))
	return NULL;

      vect_def_type def_type = STMT_VINFO_DEF_TYPE (stmt_info);
      /* Induction from different IVs is not supported.  */
      if (def_type == vect_induction_def)
	{
	  stmt_vec_info other_info;
	  FOR_EACH_VEC_ELT (stmts, i, other_info)
	    if (stmt_info != other_info)
	      return NULL;
	}
      else if (def_type == vect_reduction_def
	       || def_type == vect_double_reduction_def
	       || def_type == vect_nested_cycle)
	{
	  /* Else def types have to match.  */
	  stmt_vec_info other_info;
	  FOR_EACH_VEC_ELT (stmts, i, other_info)
	    if (STMT_VINFO_DEF_TYPE (other_info) != def_type)
	      return NULL;
	}
      else
	return NULL;
      (*tree_size)++;
      node = vect_create_new_slp_node (stmts, 0);
      SLP_TREE_VECTYPE (node) = vectype;
      return node;
    }


  bool two_operators = false;
  unsigned char *swap = XALLOCAVEC (unsigned char, group_size);
  tree vectype = NULL_TREE;
  if (!vect_build_slp_tree_1 (vinfo, swap, stmts, group_size,
			      &this_max_nunits, matches, &two_operators,
			      &vectype))
    return NULL;

  /* If the SLP node is a load, terminate the recursion unless masked.  */
  if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
      && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
    {
      if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
	{
	  /* Masked load.  */
	  gcc_assert (gimple_call_internal_p (stmt, IFN_MASK_LOAD));
	  nops = 1;
	}
      else
	{
	  *max_nunits = this_max_nunits;
	  (*tree_size)++;
	  node = vect_create_new_slp_node (stmts, 0);
	  SLP_TREE_VECTYPE (node) = vectype;
	  /* And compute the load permutation.  Whether it is actually
	     a permutation depends on the unrolling factor which is
	     decided later.  */
	  vec<unsigned> load_permutation;
	  int j;
	  stmt_vec_info load_info;
	  load_permutation.create (group_size);
	  stmt_vec_info first_stmt_info
	    = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
	  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
	    {
	      int load_place = vect_get_place_in_interleaving_chain
		(load_info, first_stmt_info);
	      gcc_assert (load_place != -1);
	      load_permutation.safe_push (load_place);
	    }
	  SLP_TREE_LOAD_PERMUTATION (node) = load_permutation;
	  return node;
	}
    }
  else if (gimple_assign_single_p (stmt_info->stmt)
	   && !gimple_vuse (stmt_info->stmt)
	   && gimple_assign_rhs_code (stmt_info->stmt) == BIT_FIELD_REF)
    {
      /* vect_build_slp_tree_1 determined all BIT_FIELD_REFs reference
	 the same SSA name vector of a compatible type to vectype.  */
      vec<std::pair<unsigned, unsigned> > lperm = vNULL;
      tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0);
      stmt_vec_info estmt_info;
      FOR_EACH_VEC_ELT (stmts, i, estmt_info)
	{
	  gassign *estmt = as_a <gassign *> (estmt_info->stmt);
	  tree bfref = gimple_assign_rhs1 (estmt);
	  HOST_WIDE_INT lane;
	  if (!known_eq (bit_field_size (bfref),
			 tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (vectype))))
	      || !constant_multiple_p (bit_field_offset (bfref),
				       bit_field_size (bfref), &lane))
	    {
	      lperm.release ();
	      return NULL;
	    }
	  lperm.safe_push (std::make_pair (0, (unsigned)lane));
	}
      slp_tree vnode = vect_create_new_slp_node (vNULL);
      SLP_TREE_VECTYPE (vnode) = TREE_TYPE (vec);
      SLP_TREE_VEC_DEFS (vnode).safe_push (vec);
      /* We are always building a permutation node even if it is an identity
	 permute to shield the rest of the vectorizer from the odd node
	 representing an actual vector without any scalar ops.
	 ??? We could hide it completely with making the permute node
	 external?  */
      node = vect_create_new_slp_node (stmts, 1);
      SLP_TREE_CODE (node) = VEC_PERM_EXPR;
      SLP_TREE_LANE_PERMUTATION (node) = lperm;
      SLP_TREE_VECTYPE (node) = vectype;
      SLP_TREE_CHILDREN (node).quick_push (vnode);
      return node;
    }

  /* Get at the operands, verifying they are compatible.  */
  vec<slp_oprnd_info> oprnds_info = vect_create_oprnd_info (nops, group_size);
  slp_oprnd_info oprnd_info;
  FOR_EACH_VEC_ELT (stmts, i, stmt_info)
    {
      int res = vect_get_and_check_slp_defs (vinfo, &swap[i],
					     stmts, i, &oprnds_info);
      if (res != 0)
	matches[(res == -1) ? 0 : i] = false;
      if (!matches[0])
	break;
    }
  for (i = 0; i < group_size; ++i)
    if (!matches[i])
      {
	vect_free_oprnd_info (oprnds_info);
	return NULL;
      }

  auto_vec<slp_tree, 4> children;

  stmt_info = stmts[0];

  /* Create SLP_TREE nodes for the definition node/s.  */
  FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
    {
      slp_tree child;
      unsigned int j;

      if (oprnd_info->first_dt == vect_uninitialized_def)
	{
	  /* COND_EXPRs have one operand slot too many if the condition
	     is an SSA name.  */
	  gcc_assert (i == 3 && nops == 4);
	  continue;
	}

      if (oprnd_info->first_dt != vect_internal_def
	  && oprnd_info->first_dt != vect_reduction_def
	  && oprnd_info->first_dt != vect_induction_def)
	{
	  slp_tree invnode = vect_create_new_slp_node (oprnd_info->ops);
	  SLP_TREE_DEF_TYPE (invnode) = oprnd_info->first_dt;
	  oprnd_info->ops = vNULL;
	  children.safe_push (invnode);
	  continue;
	}

      if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
					group_size, &this_max_nunits,
					matches, npermutes,
					&this_tree_size, bst_map)) != NULL)
	{
	  oprnd_info->def_stmts = vNULL;
	  children.safe_push (child);
	  continue;
	}

      /* If the SLP build failed fatally and we analyze a basic-block
	 simply treat nodes we fail to build as externally defined
	 (and thus build vectors from the scalar defs).
	 The cost model will reject outright expensive cases.
	 ??? This doesn't treat cases where permutation ultimately
	 fails (or we don't try permutation below).  Ideally we'd
	 even compute a permutation that will end up with the maximum
	 SLP tree size...  */
      if (is_a <bb_vec_info> (vinfo)
	  && !matches[0]
	  /* ??? Rejecting patterns this way doesn't work.  We'd have to
	     do extra work to cancel the pattern so the uses see the
	     scalar version.  */
	  && !is_pattern_stmt_p (stmt_info)
	  && !oprnd_info->any_pattern)
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "Building vector operands from scalars\n");
	  this_tree_size++;
	  child = vect_create_new_slp_node (oprnd_info->ops);
	  children.safe_push (child);
	  oprnd_info->ops = vNULL;
	  oprnd_info->def_stmts = vNULL;
	  continue;
	}

      /* If the SLP build for operand zero failed and operand zero
	 and one can be commuted try that for the scalar stmts
	 that failed the match.  */
      if (i == 0
	  /* A first scalar stmt mismatch signals a fatal mismatch.  */
	  && matches[0]
	  /* ??? For COND_EXPRs we can swap the comparison operands
	     as well as the arms under some constraints.  */
	  && nops == 2
	  && oprnds_info[1]->first_dt == vect_internal_def
	  && is_gimple_assign (stmt_info->stmt)
	  /* Swapping operands for reductions breaks assumptions later on.  */
	  && STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
	  && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def
	  /* Do so only if the number of unsuccessful permutes was no more
	     than a cut-off, as re-trying the recursive match on
	     possibly each level of the tree would expose exponential
	     behavior.  */
	  && *npermutes < 4)
	{
	  /* See whether we can swap the matching or the non-matching
	     stmt operands.  */
	  bool swap_not_matching = true;
	  do
	    {
	      for (j = 0; j < group_size; ++j)
		{
		  if (matches[j] != !swap_not_matching)
		    continue;
		  stmt_vec_info stmt_info = stmts[j];
		  /* Verify if we can swap operands of this stmt.  */
		  gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt);
		  if (!stmt
		      || !commutative_tree_code (gimple_assign_rhs_code (stmt)))
		    {
		      if (!swap_not_matching)
			goto fail;
		      swap_not_matching = false;
		      break;
		    }
		}
	    }
	  while (j != group_size);

	  /* Swap mismatched definition stmts.  */
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "Re-trying with swapped operands of stmts ");
	  for (j = 0; j < group_size; ++j)
	    if (matches[j] == !swap_not_matching)
	      {
		std::swap (oprnds_info[0]->def_stmts[j],
			   oprnds_info[1]->def_stmts[j]);
		std::swap (oprnds_info[0]->ops[j],
			   oprnds_info[1]->ops[j]);
		if (dump_enabled_p ())
		  dump_printf (MSG_NOTE, "%d ", j);
	      }
	  if (dump_enabled_p ())
	    dump_printf (MSG_NOTE, "\n");
	  /* And try again with scratch 'matches' ...  */
	  bool *tem = XALLOCAVEC (bool, group_size);
	  if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
					    group_size, &this_max_nunits,
					    tem, npermutes,
					    &this_tree_size, bst_map)) != NULL)
	    {
	      oprnd_info->def_stmts = vNULL;
	      children.safe_push (child);
	      continue;
	    }

	  ++*npermutes;
	}

fail:
      gcc_assert (child == NULL);
      FOR_EACH_VEC_ELT (children, j, child)
	vect_free_slp_tree (child, false);
      vect_free_oprnd_info (oprnds_info);
      return NULL;
    }

  vect_free_oprnd_info (oprnds_info);

  /* If all children of this node are built up from uniform scalars
     then just throw the node away, causing it to be built up from
     scalars instead.  The exception is the SLP node for the vector store.  */
  if (is_a <bb_vec_info> (vinfo)
      && !STMT_VINFO_GROUPED_ACCESS (stmt_info)
      /* ??? Rejecting patterns this way doesn't work.  We'd have to
	 do extra work to cancel the pattern so the uses see the
	 scalar version.  */
      && !is_pattern_stmt_p (stmt_info))
    {
      slp_tree child;
      unsigned j;
      FOR_EACH_VEC_ELT (children, j, child)
	if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
	    || !vect_slp_tree_uniform_p (child))
	  break;
      if (!child)
	{
	  /* Roll back.  */
	  matches[0] = false;
	  FOR_EACH_VEC_ELT (children, j, child)
	    vect_free_slp_tree (child, false);

	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "Building parent vector operands from "
			     "scalars instead\n");
	  return NULL;
	}
    }

  *tree_size += this_tree_size + 1;
  *max_nunits = this_max_nunits;

  if (two_operators)
    {
      /* ??? We'd likely want to either cache in bst_map sth like
	 { a+b, NULL, a+b, NULL } and { NULL, a-b, NULL, a-b } or
	 the true { a+b, a+b, a+b, a+b } ... but there we don't have
	 explicit stmts to put in so the keying on 'stmts' doesn't
	 work (but we have the same issue with nodes that use 'ops').  */
      slp_tree one = new _slp_tree;
      slp_tree two = new _slp_tree;
      SLP_TREE_DEF_TYPE (one) = vect_internal_def;
      SLP_TREE_DEF_TYPE (two) = vect_internal_def;
      SLP_TREE_VECTYPE (one) = vectype;
      SLP_TREE_VECTYPE (two) = vectype;
      SLP_TREE_CHILDREN (one).safe_splice (children);
      SLP_TREE_CHILDREN (two).safe_splice (children);
      slp_tree child;
      FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (two), i, child)
	child->refcnt++;

      /* Here we record the original defs since this
	 node represents the final lane configuration.  */
      node = vect_create_new_slp_node (stmts, 2);
      SLP_TREE_VECTYPE (node) = vectype;
      SLP_TREE_CODE (node) = VEC_PERM_EXPR;
      SLP_TREE_CHILDREN (node).quick_push (one);
      SLP_TREE_CHILDREN (node).quick_push (two);
      gassign *stmt = as_a <gassign *> (stmts[0]->stmt);
      enum tree_code code0 = gimple_assign_rhs_code (stmt);
      enum tree_code ocode = ERROR_MARK;
      stmt_vec_info ostmt_info;
      unsigned j = 0;
      FOR_EACH_VEC_ELT (stmts, i, ostmt_info)
	{
	  gassign *ostmt = as_a <gassign *> (ostmt_info->stmt);
	  if (gimple_assign_rhs_code (ostmt) != code0)
	    {
	      SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (1, i));
	      ocode = gimple_assign_rhs_code (ostmt);
	      j = i;
	    }
	  else
	    SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (0, i));
	}
      SLP_TREE_CODE (one) = code0;
      SLP_TREE_CODE (two) = ocode;
      SLP_TREE_LANES (one) = stmts.length ();
      SLP_TREE_LANES (two) = stmts.length ();
      SLP_TREE_REPRESENTATIVE (one) = stmts[0];
      SLP_TREE_REPRESENTATIVE (two) = stmts[j];
      return node;
    }

  node = vect_create_new_slp_node (stmts, nops);
  SLP_TREE_VECTYPE (node) = vectype;
  SLP_TREE_CHILDREN (node).splice (children);
  return node;
}

/* Dump a single SLP tree NODE.  */

static void
vect_print_slp_tree (dump_flags_t dump_kind, dump_location_t loc,
		     slp_tree node)
{
  unsigned i, j;
  slp_tree child;
  stmt_vec_info stmt_info;
  tree op;

  dump_metadata_t metadata (dump_kind, loc.get_impl_location ());
  dump_user_location_t user_loc = loc.get_user_location ();
  dump_printf_loc (metadata, user_loc, "node%s %p (max_nunits=%u, refcnt=%u)\n",
		   SLP_TREE_DEF_TYPE (node) == vect_external_def
		   ? " (external)"
		   : (SLP_TREE_DEF_TYPE (node) == vect_constant_def
		      ? " (constant)"
		      : ""), node,
		   estimated_poly_value (node->max_nunits), node->refcnt);
  if (SLP_TREE_SCALAR_STMTS (node).exists ())
    FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
      dump_printf_loc (metadata, user_loc, "\tstmt %u %G", i, stmt_info->stmt);
  else
    {
      dump_printf_loc (metadata, user_loc, "\t{ ");
      FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
	dump_printf (metadata, "%T%s ", op,
		     i < SLP_TREE_SCALAR_OPS (node).length () - 1 ? "," : "");
      dump_printf (metadata, "}\n");
    }
  if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
    {
      dump_printf_loc (metadata, user_loc, "\tload permutation {");
      FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (node), i, j)
	dump_printf (dump_kind, " %u", j);
      dump_printf (dump_kind, " }\n");
    }
  if (SLP_TREE_LANE_PERMUTATION (node).exists ())
    {
      dump_printf_loc (metadata, user_loc, "\tlane permutation {");
      for (i = 0; i < SLP_TREE_LANE_PERMUTATION (node).length (); ++i)
	dump_printf (dump_kind, " %u[%u]",
		     SLP_TREE_LANE_PERMUTATION (node)[i].first,
		     SLP_TREE_LANE_PERMUTATION (node)[i].second);
      dump_printf (dump_kind, " }\n");
    }
  if (SLP_TREE_CHILDREN (node).is_empty ())
    return;
  dump_printf_loc (metadata, user_loc, "\tchildren");
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    dump_printf (dump_kind, " %p", (void *)child);
  dump_printf (dump_kind, "\n");
}

DEBUG_FUNCTION void
debug (slp_tree node)
{
  debug_dump_context ctx;
  vect_print_slp_tree (MSG_NOTE,
		       dump_location_t::from_location_t (UNKNOWN_LOCATION),
		       node);
}

/* Dump the SLP graph rooted at NODE using flags specified in DUMP_KIND.  */

static void
vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
		      slp_tree node, hash_set<slp_tree> &visited)
{
  unsigned i;
  slp_tree child;

  if (visited.add (node))
    return;

  vect_print_slp_tree (dump_kind, loc, node);

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_print_slp_graph (dump_kind, loc, child, visited);
}

static void
vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
		      slp_tree entry)
{
  hash_set<slp_tree> visited;
  vect_print_slp_graph (dump_kind, loc, entry, visited);
}

/* Mark the tree rooted at NODE with PURE_SLP.  */

static void
vect_mark_slp_stmts (slp_tree node, hash_set<slp_tree> &visited)
{
  int i;
  stmt_vec_info stmt_info;
  slp_tree child;

  if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
    return;

  if (visited.add (node))
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
    STMT_SLP_TYPE (stmt_info) = pure_slp;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_mark_slp_stmts (child, visited);
}

static void
vect_mark_slp_stmts (slp_tree node)
{
  hash_set<slp_tree> visited;
  vect_mark_slp_stmts (node, visited);
}

/* Mark the statements of the tree rooted at NODE as relevant (vect_used).  */

static void
vect_mark_slp_stmts_relevant (slp_tree node, hash_set<slp_tree> &visited)
{
  int i;
  stmt_vec_info stmt_info;
  slp_tree child;

  if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
    return;

  if (visited.add (node))
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
    {
      gcc_assert (!STMT_VINFO_RELEVANT (stmt_info)
		  || STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope);
      STMT_VINFO_RELEVANT (stmt_info) = vect_used_in_scope;
    }

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_mark_slp_stmts_relevant (child, visited);
}

static void
vect_mark_slp_stmts_relevant (slp_tree node)
{
  hash_set<slp_tree> visited;
  vect_mark_slp_stmts_relevant (node, visited);
}

/* Copy the SLP subtree rooted at NODE.  */

static slp_tree
slp_copy_subtree (slp_tree node, hash_map<slp_tree, slp_tree> &map)
{
  unsigned i;

  bool existed_p;
  slp_tree &copy_ref = map.get_or_insert (node, &existed_p);
  if (existed_p)
    return copy_ref;

  copy_ref = new _slp_tree;
  slp_tree copy = copy_ref;
  SLP_TREE_DEF_TYPE (copy) = SLP_TREE_DEF_TYPE (node);
  SLP_TREE_VECTYPE (copy) = SLP_TREE_VECTYPE (node);
  SLP_TREE_REPRESENTATIVE (copy) = SLP_TREE_REPRESENTATIVE (node);
  SLP_TREE_LANES (copy) = SLP_TREE_LANES (node);
  copy->max_nunits = node->max_nunits;
  copy->refcnt = 0;
  if (SLP_TREE_SCALAR_STMTS (node).exists ())
    {
      SLP_TREE_SCALAR_STMTS (copy) = SLP_TREE_SCALAR_STMTS (node).copy ();
      stmt_vec_info stmt_info;
      FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
	STMT_VINFO_NUM_SLP_USES (stmt_info)++;
    }
  if (SLP_TREE_SCALAR_OPS (node).exists ())
    SLP_TREE_SCALAR_OPS (copy) = SLP_TREE_SCALAR_OPS (node).copy ();
  if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
    SLP_TREE_LOAD_PERMUTATION (copy) = SLP_TREE_LOAD_PERMUTATION (node).copy ();
  if (SLP_TREE_LANE_PERMUTATION (node).exists ())
    SLP_TREE_LANE_PERMUTATION (copy) = SLP_TREE_LANE_PERMUTATION (node).copy ();
  if (SLP_TREE_CHILDREN (node).exists ())
    SLP_TREE_CHILDREN (copy) = SLP_TREE_CHILDREN (node).copy ();
  gcc_assert (!SLP_TREE_VEC_STMTS (node).exists ());

  slp_tree child;
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (copy), i, child)
    {
      SLP_TREE_CHILDREN (copy)[i] = slp_copy_subtree (child, map);
      SLP_TREE_CHILDREN (copy)[i]->refcnt++;
    }
  return copy;
}

1825 /* Rearrange the statements of NODE according to PERMUTATION. */
1826
1827 static void
1828 vect_slp_rearrange_stmts (slp_tree node, unsigned int group_size,
1829 vec<unsigned> permutation,
1830 hash_set<slp_tree> &visited)
1831 {
1832 unsigned int i;
1833 slp_tree child;
1834
1835 if (visited.add (node))
1836 return;
1837
1838 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1839 vect_slp_rearrange_stmts (child, group_size, permutation, visited);
1840
1841 if (SLP_TREE_SCALAR_STMTS (node).exists ())
1842 {
1843 gcc_assert (group_size == SLP_TREE_SCALAR_STMTS (node).length ());
1844 vec<stmt_vec_info> tmp_stmts;
1845 tmp_stmts.create (group_size);
1846 tmp_stmts.quick_grow (group_size);
1847 stmt_vec_info stmt_info;
1848 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
1849 tmp_stmts[permutation[i]] = stmt_info;
1850 SLP_TREE_SCALAR_STMTS (node).release ();
1851 SLP_TREE_SCALAR_STMTS (node) = tmp_stmts;
1852 }
1853 if (SLP_TREE_SCALAR_OPS (node).exists ())
1854 {
1855 gcc_assert (group_size == SLP_TREE_SCALAR_OPS (node).length ());
1856 vec<tree> tmp_ops;
1857 tmp_ops.create (group_size);
1858 tmp_ops.quick_grow (group_size);
1859 tree op;
1860 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
1861 tmp_ops[permutation[i]] = op;
1862 SLP_TREE_SCALAR_OPS (node).release ();
1863 SLP_TREE_SCALAR_OPS (node) = tmp_ops;
1864 }
1865 if (SLP_TREE_LANE_PERMUTATION (node).exists ())
1866 {
1867 gcc_assert (group_size == SLP_TREE_LANE_PERMUTATION (node).length ());
1868 for (i = 0; i < group_size; ++i)
1869 SLP_TREE_LANE_PERMUTATION (node)[i].second
1870 = permutation[SLP_TREE_LANE_PERMUTATION (node)[i].second];
1871 }
1872 }
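
/* Illustration, not part of the original sources: vect_slp_rearrange_stmts
   scatters element I to position PERMUTATION[I]. For scalar stmts
   {a, b, c, d} and permutation {2, 0, 3, 1} the loop above performs
   tmp[2] = a; tmp[0] = b; tmp[3] = c; tmp[1] = d;
   yielding {b, d, a, c}, i.e. the inverse of indexing by the permutation. */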
1873
1874
1875 /* Attempt to reorder stmts in a reduction chain so that we don't
1876 require any load permutation. Return true if that was possible,
1877 otherwise return false. */
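
/* Illustration, not part of the original sources: a load permutation such
   as {2, 0, 3, 1} passes the checks below since every index 0..3 occurs
   exactly once; {0, 0, 2, 3} (duplicate index) or {0, 1, 2, 4} (index
   beyond the group) causes the rearrangement to be abandoned. */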
1878
1879 static bool
1880 vect_attempt_slp_rearrange_stmts (slp_instance slp_instn)
1881 {
1882 unsigned int i, j;
1883 unsigned int lidx;
1884 slp_tree node, load;
1885
1886 if (SLP_INSTANCE_LOADS (slp_instn).is_empty ())
1887 return false;
1888
1889 /* Compare all the permutation sequences to the first one. We know
1890 that at least one load is permuted. */
1891 node = SLP_INSTANCE_LOADS (slp_instn)[0];
1892 if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
1893 return false;
1894 unsigned int group_size = SLP_TREE_LOAD_PERMUTATION (node).length ();
1895 for (i = 1; SLP_INSTANCE_LOADS (slp_instn).iterate (i, &load); ++i)
1896 {
1897 if (!SLP_TREE_LOAD_PERMUTATION (load).exists ()
1898 || SLP_TREE_LOAD_PERMUTATION (load).length () != group_size)
1899 return false;
1900 FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (load), j, lidx)
1901 if (lidx != SLP_TREE_LOAD_PERMUTATION (node)[j])
1902 return false;
1903 }
1904
1905 /* Check that the loads in the first sequence are different and there
1906 are no gaps between them. */
1907 auto_sbitmap load_index (group_size);
1908 bitmap_clear (load_index);
1909 FOR_EACH_VEC_ELT (node->load_permutation, i, lidx)
1910 {
1911 if (lidx >= group_size)
1912 return false;
1913 if (bitmap_bit_p (load_index, lidx))
1914 return false;
1915
1916 bitmap_set_bit (load_index, lidx);
1917 }
1918 for (i = 0; i < group_size; i++)
1919 if (!bitmap_bit_p (load_index, i))
1920 return false;
1921
1922 /* This permutation is valid for reduction. Since the order of the
1923 statements in the nodes is not important unless they are memory
1924 accesses, we can rearrange the statements in all the nodes
1925 according to the order of the loads. */
1926
1927 /* We have to unshare the SLP tree we modify. */
1928 hash_map<slp_tree, slp_tree> map;
1929 slp_tree unshared = slp_copy_subtree (SLP_INSTANCE_TREE (slp_instn), map);
1930 vect_free_slp_tree (SLP_INSTANCE_TREE (slp_instn), false);
1931 unshared->refcnt++;
1932 SLP_INSTANCE_TREE (slp_instn) = unshared;
1933 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
1934 SLP_INSTANCE_LOADS (slp_instn)[i] = *map.get (node);
1935 node = SLP_INSTANCE_LOADS (slp_instn)[0];
1936
1937 /* Do the actual re-arrangement. */
1938 hash_set<slp_tree> visited;
1939 vect_slp_rearrange_stmts (SLP_INSTANCE_TREE (slp_instn), group_size,
1940 node->load_permutation, visited);
1941
1942 /* We are done, no actual permutations need to be generated. */
1943 poly_uint64 unrolling_factor = SLP_INSTANCE_UNROLLING_FACTOR (slp_instn);
1944 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
1945 {
1946 stmt_vec_info first_stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
1947 first_stmt_info = DR_GROUP_FIRST_ELEMENT (first_stmt_info);
1948 /* But we have to keep those permutations that are required because
1949 of handling of gaps. */
1950 if (known_eq (unrolling_factor, 1U)
1951 || (group_size == DR_GROUP_SIZE (first_stmt_info)
1952 && DR_GROUP_GAP (first_stmt_info) == 0))
1953 SLP_TREE_LOAD_PERMUTATION (node).release ();
1954 else
1955 for (j = 0; j < SLP_TREE_LOAD_PERMUTATION (node).length (); ++j)
1956 SLP_TREE_LOAD_PERMUTATION (node)[j] = j;
1957 }
1958
1959 return true;
1960 }
1961
1962 /* Gather loads in the SLP graph rooted at NODE and populate the LOADS array. */
1963
1964 static void
1965 vect_gather_slp_loads (vec<slp_tree> &loads, slp_tree node,
1966 hash_set<slp_tree> &visited)
1967 {
1968 if (visited.add (node))
1969 return;
1970
1971 if (SLP_TREE_CHILDREN (node).length () == 0)
1972 {
1973 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
1974 return;
1975 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
1976 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1977 && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1978 loads.safe_push (node);
1979 }
1980 else
1981 {
1982 unsigned i;
1983 slp_tree child;
1984 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1985 vect_gather_slp_loads (loads, child, visited);
1986 }
1987 }
1988
1989 static void
1990 vect_gather_slp_loads (slp_instance inst, slp_tree node)
1991 {
1992 hash_set<slp_tree> visited;
1993 vect_gather_slp_loads (SLP_INSTANCE_LOADS (inst), node, visited);
1994 }
1995
1996
1997 /* Find the last scalar stmt in NODE. */
1998
1999 stmt_vec_info
2000 vect_find_last_scalar_stmt_in_slp (slp_tree node)
2001 {
2002 stmt_vec_info last = NULL;
2003 stmt_vec_info stmt_vinfo;
2004
2005 for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
2006 {
2007 stmt_vinfo = vect_orig_stmt (stmt_vinfo);
2008 last = last ? get_later_stmt (stmt_vinfo, last) : stmt_vinfo;
2009 }
2010
2011 return last;
2012 }
2013
2014 /* Find the first scalar stmt in NODE. */
2015
2016 stmt_vec_info
2017 vect_find_first_scalar_stmt_in_slp (slp_tree node)
2018 {
2019 stmt_vec_info first = NULL;
2020 stmt_vec_info stmt_vinfo;
2021
2022 for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
2023 {
2024 stmt_vinfo = vect_orig_stmt (stmt_vinfo);
2025 if (!first
2026 || get_later_stmt (stmt_vinfo, first) == first)
2027 first = stmt_vinfo;
2028 }
2029
2030 return first;
2031 }
2032
2033 /* Splits a group of stores, currently beginning at FIRST_VINFO, into
2034 two groups: one (still beginning at FIRST_VINFO) of size GROUP1_SIZE
2035 (also containing the first GROUP1_SIZE stmts, since stores are
2036 consecutive), the second containing the remainder.
2037 Return the first stmt in the second group. */
2038
2039 static stmt_vec_info
2040 vect_split_slp_store_group (stmt_vec_info first_vinfo, unsigned group1_size)
2041 {
2042 gcc_assert (DR_GROUP_FIRST_ELEMENT (first_vinfo) == first_vinfo);
2043 gcc_assert (group1_size > 0);
2044 int group2_size = DR_GROUP_SIZE (first_vinfo) - group1_size;
2045 gcc_assert (group2_size > 0);
2046 DR_GROUP_SIZE (first_vinfo) = group1_size;
2047
2048 stmt_vec_info stmt_info = first_vinfo;
2049 for (unsigned i = group1_size; i > 1; i--)
2050 {
2051 stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info);
2052 gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
2053 }
2054 /* STMT_INFO is now the last element of the first group. */
2055 stmt_vec_info group2 = DR_GROUP_NEXT_ELEMENT (stmt_info);
2056 DR_GROUP_NEXT_ELEMENT (stmt_info) = 0;
2057
2058 DR_GROUP_SIZE (group2) = group2_size;
2059 for (stmt_info = group2; stmt_info;
2060 stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info))
2061 {
2062 DR_GROUP_FIRST_ELEMENT (stmt_info) = group2;
2063 gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
2064 }
2065
2066 /* For the second group, the DR_GROUP_GAP is that before the original group,
2067 plus skipping over the first vector. */
2068 DR_GROUP_GAP (group2) = DR_GROUP_GAP (first_vinfo) + group1_size;
2069
2070 /* DR_GROUP_GAP of the first group now has to skip over the second group too. */
2071 DR_GROUP_GAP (first_vinfo) += group2_size;
2072
2073 if (dump_enabled_p ())
2074 dump_printf_loc (MSG_NOTE, vect_location, "Split group into %d and %d\n",
2075 group1_size, group2_size);
2076
2077 return group2;
2078 }
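
/* Illustration, not part of the original sources: splitting a gap-free
   group of 6 stores with GROUP1_SIZE == 4 gives the second group
   DR_GROUP_GAP == 0 + 4 == 4 (the original gap plus the first group's
   stmts) and leaves the first group with DR_GROUP_GAP == 0 + 2 == 2
   (it now skips the second group). */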
2079
2080 /* Calculate the unrolling factor for an SLP instance with GROUP_SIZE
2081 statements and a vector of NUNITS elements. */
2082
2083 static poly_uint64
2084 calculate_unrolling_factor (poly_uint64 nunits, unsigned int group_size)
2085 {
2086 return exact_div (common_multiple (nunits, group_size), group_size);
2087 }
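
/* Illustration, not part of the original sources: with 4-element vectors
   and a group of 6 stmts, common_multiple (4, 6) == 12 and the unrolling
   factor is 12 / 6 == 2, i.e. two scalar iterations fill whole vectors;
   for a group of 8 stmts the factor is 8 / 8 == 1. */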
2088
2089 /* Analyze an SLP instance starting from a group of grouped stores. Call
2090 vect_build_slp_tree to build a tree of packed stmts if possible.
2091 Return FALSE if it's impossible to SLP any stmt in the loop. */
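
/* Illustration, not part of the original sources: for a group of two
   stores
     a[i] = x_1 + y_2;
     a[i+1] = z_3 + w_4;
   the root node collects the two store stmts and vect_build_slp_tree
   recurses on the operand vectors {x_1, z_3} and {y_2, w_4}. */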
2092
2093 static bool
2094 vect_analyze_slp_instance (vec_info *vinfo,
2095 scalar_stmts_to_slp_tree_map_t *bst_map,
2096 stmt_vec_info stmt_info, unsigned max_tree_size)
2097 {
2098 slp_instance new_instance;
2099 slp_tree node;
2100 unsigned int group_size;
2101 tree vectype, scalar_type = NULL_TREE;
2102 unsigned int i;
2103 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
2104 vec<stmt_vec_info> scalar_stmts;
2105 bool constructor = false;
2106
2107 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2108 {
2109 scalar_type = TREE_TYPE (DR_REF (dr));
2110 group_size = DR_GROUP_SIZE (stmt_info);
2111 vectype = get_vectype_for_scalar_type (vinfo, scalar_type, group_size);
2112 }
2113 else if (!dr && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
2114 {
2115 gcc_assert (is_a <loop_vec_info> (vinfo));
2116 vectype = STMT_VINFO_VECTYPE (stmt_info);
2117 group_size = REDUC_GROUP_SIZE (stmt_info);
2118 }
2119 else if (is_gimple_assign (stmt_info->stmt)
2120 && gimple_assign_rhs_code (stmt_info->stmt) == CONSTRUCTOR)
2121 {
2122 vectype = TREE_TYPE (gimple_assign_rhs1 (stmt_info->stmt));
2123 group_size = CONSTRUCTOR_NELTS (gimple_assign_rhs1 (stmt_info->stmt));
2124 constructor = true;
2125 }
2126 else
2127 {
2128 gcc_assert (is_a <loop_vec_info> (vinfo));
2129 vectype = STMT_VINFO_VECTYPE (stmt_info);
2130 group_size = as_a <loop_vec_info> (vinfo)->reductions.length ();
2131 }
2132
2133 if (!vectype)
2134 {
2135 if (dump_enabled_p ())
2136 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2137 "Build SLP failed: unsupported data-type %T\n",
2138 scalar_type);
2139
2140 return false;
2141 }
2142 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
2143
2144 /* Create a node (a root of the SLP tree) for the packed grouped stores. */
2145 scalar_stmts.create (group_size);
2146 stmt_vec_info next_info = stmt_info;
2147 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2148 {
2149 /* Collect the stores and store them in SLP_TREE_SCALAR_STMTS. */
2150 while (next_info)
2151 {
2152 scalar_stmts.safe_push (vect_stmt_to_vectorize (next_info));
2153 next_info = DR_GROUP_NEXT_ELEMENT (next_info);
2154 }
2155 }
2156 else if (!dr && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
2157 {
2158 /* Collect the reduction stmts and store them in
2159 SLP_TREE_SCALAR_STMTS. */
2160 while (next_info)
2161 {
2162 scalar_stmts.safe_push (vect_stmt_to_vectorize (next_info));
2163 next_info = REDUC_GROUP_NEXT_ELEMENT (next_info);
2164 }
2165 /* Mark the first element of the reduction chain as reduction to properly
2166 transform the node. In the reduction analysis phase only the last
2167 element of the chain is marked as reduction. */
2168 STMT_VINFO_DEF_TYPE (stmt_info)
2169 = STMT_VINFO_DEF_TYPE (scalar_stmts.last ());
2170 STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))
2171 = STMT_VINFO_REDUC_DEF (vect_orig_stmt (scalar_stmts.last ()));
2172 }
2173 else if (constructor)
2174 {
2175 tree rhs = gimple_assign_rhs1 (stmt_info->stmt);
2176 tree val;
2177 FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), i, val)
2178 {
2179 if (TREE_CODE (val) == SSA_NAME)
2180 {
2181 gimple* def = SSA_NAME_DEF_STMT (val);
2182 stmt_vec_info def_info = vinfo->lookup_stmt (def);
2183 /* Value is defined in another basic block. */
2184 if (!def_info)
2185 return false;
2186 def_info = vect_stmt_to_vectorize (def_info);
2187 scalar_stmts.safe_push (def_info);
2188 }
2189 else
2190 return false;
2191 }
2192 if (dump_enabled_p ())
2193 dump_printf_loc (MSG_NOTE, vect_location,
2194 "Analyzing vectorizable constructor: %G\n",
2195 stmt_info->stmt);
2196 }
2197 else
2198 {
2199 /* Collect reduction statements. */
2200 vec<stmt_vec_info> reductions = as_a <loop_vec_info> (vinfo)->reductions;
2201 for (i = 0; reductions.iterate (i, &next_info); i++)
2202 scalar_stmts.safe_push (next_info);
2203 }
2204
2205 /* Build the tree for the SLP instance. */
2206 bool *matches = XALLOCAVEC (bool, group_size);
2207 unsigned npermutes = 0;
2208 poly_uint64 max_nunits = nunits;
2209 unsigned tree_size = 0;
2210 node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
2211 &max_nunits, matches, &npermutes,
2212 &tree_size, bst_map);
2213 if (node != NULL)
2214 {
2215 /* Calculate the unrolling factor based on the smallest type. */
2216 poly_uint64 unrolling_factor
2217 = calculate_unrolling_factor (max_nunits, group_size);
2218
2219 if (maybe_ne (unrolling_factor, 1U)
2220 && is_a <bb_vec_info> (vinfo))
2221 {
2222 unsigned HOST_WIDE_INT const_max_nunits;
2223 if (!max_nunits.is_constant (&const_max_nunits)
2224 || const_max_nunits > group_size)
2225 {
2226 if (dump_enabled_p ())
2227 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2228 "Build SLP failed: store group "
2229 "size not a multiple of the vector size "
2230 "in basic block SLP\n");
2231 vect_free_slp_tree (node, false);
2232 return false;
2233 }
2234 /* Fatal mismatch. */
2235 matches[0] = true;
2236 matches[group_size / const_max_nunits * const_max_nunits] = false;
2237 vect_free_slp_tree (node, false);
2238 }
2239 else
2240 {
2241 /* Create a new SLP instance. */
2242 new_instance = XNEW (class _slp_instance);
2243 SLP_INSTANCE_TREE (new_instance) = node;
2244 SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor;
2245 SLP_INSTANCE_LOADS (new_instance) = vNULL;
2246 SLP_INSTANCE_ROOT_STMT (new_instance) = constructor ? stmt_info : NULL;
2247
2248 vect_gather_slp_loads (new_instance, node);
2249 if (dump_enabled_p ())
2250 dump_printf_loc (MSG_NOTE, vect_location,
2251 "SLP size %u vs. limit %u.\n",
2252 tree_size, max_tree_size);
2253
2254 /* Check whether any load is possibly permuted. */
2255 slp_tree load_node;
2256 bool loads_permuted = false;
2257 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (new_instance), i, load_node)
2258 {
2259 if (!SLP_TREE_LOAD_PERMUTATION (load_node).exists ())
2260 continue;
2261 unsigned j;
2262 stmt_vec_info load_info;
2263 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load_info)
2264 if (SLP_TREE_LOAD_PERMUTATION (load_node)[j] != j)
2265 {
2266 loads_permuted = true;
2267 break;
2268 }
2269 }
2270
2271 /* If the loads and stores can be handled with load/store-lane
2272 instructions do not generate this SLP instance. */
2273 if (is_a <loop_vec_info> (vinfo)
2274 && loads_permuted
2275 && dr && vect_store_lanes_supported (vectype, group_size, false))
2276 {
2277 slp_tree load_node;
2278 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (new_instance), i, load_node)
2279 {
2280 stmt_vec_info stmt_vinfo = DR_GROUP_FIRST_ELEMENT
2281 (SLP_TREE_SCALAR_STMTS (load_node)[0]);
2282 /* Use SLP for strided accesses (or if we can't load-lanes). */
2283 if (STMT_VINFO_STRIDED_P (stmt_vinfo)
2284 || ! vect_load_lanes_supported
2285 (STMT_VINFO_VECTYPE (stmt_vinfo),
2286 DR_GROUP_SIZE (stmt_vinfo), false))
2287 break;
2288 }
2289 if (i == SLP_INSTANCE_LOADS (new_instance).length ())
2290 {
2291 if (dump_enabled_p ())
2292 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2293 "Built SLP cancelled: can use "
2294 "load/store-lanes\n");
2295 vect_free_slp_instance (new_instance, false);
2296 return false;
2297 }
2298 }
2299
2300 /* If this is a reduction chain with a conversion in front,
2301 amend the SLP tree with a node for that. */
2302 if (!dr
2303 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
2304 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def)
2305 {
2306 /* Get at the conversion stmt - we know it's the single use
2307 of the last stmt of the reduction chain. */
2308 gimple *tem = vect_orig_stmt (scalar_stmts[group_size - 1])->stmt;
2309 use_operand_p use_p;
2310 gimple *use_stmt;
2311 bool r = single_imm_use (gimple_assign_lhs (tem),
2312 &use_p, &use_stmt);
2313 gcc_assert (r);
2314 next_info = vinfo->lookup_stmt (use_stmt);
2315 next_info = vect_stmt_to_vectorize (next_info);
2316 scalar_stmts = vNULL;
2317 scalar_stmts.create (group_size);
2318 for (unsigned i = 0; i < group_size; ++i)
2319 scalar_stmts.quick_push (next_info);
2320 slp_tree conv = vect_create_new_slp_node (scalar_stmts, 1);
2321 SLP_TREE_VECTYPE (conv) = STMT_VINFO_VECTYPE (next_info);
2322 SLP_TREE_CHILDREN (conv).quick_push (node);
2323 SLP_INSTANCE_TREE (new_instance) = conv;
2324 /* We also have to fake this conversion stmt as SLP reduction
2325 group so we don't have to mess with too much code
2326 elsewhere. */
2327 REDUC_GROUP_FIRST_ELEMENT (next_info) = next_info;
2328 REDUC_GROUP_NEXT_ELEMENT (next_info) = NULL;
2329 }
2330
2331 vinfo->slp_instances.safe_push (new_instance);
2332
2333 /* ??? We've replaced the old SLP_INSTANCE_GROUP_SIZE with
2334 the number of scalar stmts in the root in a few places.
2335 Verify that assumption holds. */
2336 gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
2337 .length () == group_size);
2338
2339 if (dump_enabled_p ())
2340 {
2341 dump_printf_loc (MSG_NOTE, vect_location,
2342 "Final SLP tree for instance:\n");
2343 vect_print_slp_graph (MSG_NOTE, vect_location,
2344 SLP_INSTANCE_TREE (new_instance));
2345 }
2346
2347 return true;
2348 }
2349 }
2350 else
2351 {
2352 /* Failed to SLP. */
2353 /* Free the allocated memory. */
2354 scalar_stmts.release ();
2355 }
2356
2357 /* For basic block SLP, try to break the group up into multiples of the
2358 vector size. */
2359 unsigned HOST_WIDE_INT const_nunits;
2360 if (is_a <bb_vec_info> (vinfo)
2361 && STMT_VINFO_GROUPED_ACCESS (stmt_info)
2362 && DR_GROUP_FIRST_ELEMENT (stmt_info)
2363 && nunits.is_constant (&const_nunits))
2364 {
2365 /* We consider breaking the group only on VF boundaries from the existing
2366 start. */
2367 for (i = 0; i < group_size; i++)
2368 if (!matches[i]) break;
2369
2370 if (i >= const_nunits && i < group_size)
2371 {
2372 /* Split into two groups at the first vector boundary before i. */
2373 gcc_assert ((const_nunits & (const_nunits - 1)) == 0);
2374 unsigned group1_size = i & ~(const_nunits - 1);
2375
2376 stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
2377 group1_size);
2378 bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
2379 max_tree_size);
2380 /* If the first non-match was in the middle of a vector,
2381 skip the rest of that vector. */
2382 if (group1_size < i)
2383 {
2384 i = group1_size + const_nunits;
2385 if (i < group_size)
2386 rest = vect_split_slp_store_group (rest, const_nunits);
2387 }
2388 if (i < group_size)
2389 res |= vect_analyze_slp_instance (vinfo, bst_map,
2390 rest, max_tree_size);
2391 return res;
2392 }
2393 /* Even though the first vector did not all match, we might be able to SLP
2394 (some) of the remainder. FORNOW ignore this possibility. */
2395 }
2396
2397 return false;
2398 }
2399
2400
2401 /* Check if there are stmts in the loop that can be vectorized using SLP.
2402 Build SLP trees of packed scalar stmts if SLP is possible. */
2403
2404 opt_result
2405 vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
2406 {
2407 unsigned int i;
2408 stmt_vec_info first_element;
2409
2410 DUMP_VECT_SCOPE ("vect_analyze_slp");
2411
2412 scalar_stmts_to_slp_tree_map_t *bst_map
2413 = new scalar_stmts_to_slp_tree_map_t ();
2414
2415 /* Find SLP sequences starting from groups of grouped stores. */
2416 FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
2417 vect_analyze_slp_instance (vinfo, bst_map, first_element, max_tree_size);
2418
2419 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
2420 {
2421 if (loop_vinfo->reduction_chains.length () > 0)
2422 {
2423 /* Find SLP sequences starting from reduction chains. */
2424 FOR_EACH_VEC_ELT (loop_vinfo->reduction_chains, i, first_element)
2425 if (! vect_analyze_slp_instance (vinfo, bst_map, first_element,
2426 max_tree_size))
2427 {
2428 /* Dissolve reduction chain group. */
2429 stmt_vec_info vinfo = first_element;
2430 stmt_vec_info last = NULL;
2431 while (vinfo)
2432 {
2433 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
2434 REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
2435 REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
2436 last = vinfo;
2437 vinfo = next;
2438 }
2439 STMT_VINFO_DEF_TYPE (first_element) = vect_internal_def;
2440 /* It can still be vectorized as part of an SLP reduction. */
2441 loop_vinfo->reductions.safe_push (last);
2442 }
2443 }
2444
2445 /* Find SLP sequences starting from groups of reductions. */
2446 if (loop_vinfo->reductions.length () > 1)
2447 vect_analyze_slp_instance (vinfo, bst_map, loop_vinfo->reductions[0],
2448 max_tree_size);
2449 }
2450
2451 /* The map keeps a reference on SLP nodes built, release that. */
2452 for (scalar_stmts_to_slp_tree_map_t::iterator it = bst_map->begin ();
2453 it != bst_map->end (); ++it)
2454 if ((*it).second)
2455 vect_free_slp_tree ((*it).second, false);
2456 delete bst_map;
2457
2458 /* Optimize permutations in SLP reductions. */
2459 slp_instance instance;
2460 FOR_EACH_VEC_ELT (vinfo->slp_instances, i, instance)
2461 {
2462 slp_tree node = SLP_INSTANCE_TREE (instance);
2463 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2464 /* Reduction (there are no data-refs in the root).
2465 In reduction chain the order of the loads is not important. */
2466 if (!STMT_VINFO_DATA_REF (stmt_info)
2467 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
2468 vect_attempt_slp_rearrange_stmts (instance);
2469 }
2470
2471 /* Gather all loads in the SLP graph. */
2472 hash_set<slp_tree> visited;
2473 FOR_EACH_VEC_ELT (vinfo->slp_instances, i, instance)
2474 vect_gather_slp_loads (vinfo->slp_loads, SLP_INSTANCE_TREE (instance),
2475 visited);
2476
2477 return opt_result::success ();
2478 }
2479
2480 void
2481 vect_optimize_slp (vec_info *vinfo)
2482 {
2483 slp_tree node;
2484 unsigned i;
2485 FOR_EACH_VEC_ELT (vinfo->slp_loads, i, node)
2486 {
2487 if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
2488 continue;
2489
2490 /* In basic block vectorization we allow any subchain of an interleaving
2491 chain.
2492 FORNOW: not in loop SLP because of realignment complications. */
2493 if (is_a <bb_vec_info> (vinfo))
2494 {
2495 bool subchain_p = true;
2496 stmt_vec_info next_load_info = NULL;
2497 stmt_vec_info load_info;
2498 unsigned j;
2499 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
2500 {
2501 if (j != 0
2502 && (next_load_info != load_info
2503 || DR_GROUP_GAP (load_info) != 1))
2504 {
2505 subchain_p = false;
2506 break;
2507 }
2508 next_load_info = DR_GROUP_NEXT_ELEMENT (load_info);
2509 }
2510 if (subchain_p)
2511 {
2512 SLP_TREE_LOAD_PERMUTATION (node).release ();
2513 continue;
2514 }
2515 }
2516 else
2517 {
2518 stmt_vec_info load_info;
2519 bool this_load_permuted = false;
2520 unsigned j;
2521 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
2522 if (SLP_TREE_LOAD_PERMUTATION (node)[j] != j)
2523 {
2524 this_load_permuted = true;
2525 break;
2526 }
2527 stmt_vec_info first_stmt_info
2528 = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
2529 if (!this_load_permuted
2530 /* The load requires permutation when unrolling exposes
2531 a gap either because the group is larger than the SLP
2532 group-size or because there is a gap between the groups. */
2533 && (known_eq (LOOP_VINFO_VECT_FACTOR (as_a <loop_vec_info> (vinfo)), 1U)
2534 || ((SLP_TREE_LANES (node) == DR_GROUP_SIZE (first_stmt_info))
2535 && DR_GROUP_GAP (first_stmt_info) == 0)))
2536 {
2537 SLP_TREE_LOAD_PERMUTATION (node).release ();
2538 continue;
2539 }
2540 }
2541 }
2542 }
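
/* Illustration, not part of the original sources: a basic-block load node
   reading {a[1], a[2]} out of an interleaving chain a[0]..a[3] forms a
   contiguous subchain, so its load permutation is released above. */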
2543
2544
2545 /* For each possible SLP instance decide whether to SLP it and calculate the
2546 overall unrolling factor needed to SLP the loop. Return TRUE if we decided
2547 to SLP at least one instance. */
2548
2549 bool
2550 vect_make_slp_decision (loop_vec_info loop_vinfo)
2551 {
2552 unsigned int i;
2553 poly_uint64 unrolling_factor = 1;
2554 vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
2555 slp_instance instance;
2556 int decided_to_slp = 0;
2557
2558 DUMP_VECT_SCOPE ("vect_make_slp_decision");
2559
2560 FOR_EACH_VEC_ELT (slp_instances, i, instance)
2561 {
2562 /* FORNOW: SLP if you can. */
2563 /* All unroll factors have the form:
2564
2565 GET_MODE_SIZE (vinfo->vector_mode) * X
2566
2567 for some rational X, so they must have a common multiple. */
2568 unrolling_factor
2569 = force_common_multiple (unrolling_factor,
2570 SLP_INSTANCE_UNROLLING_FACTOR (instance));
2571
2572 /* Mark all the stmts that belong to INSTANCE as PURE_SLP stmts. Later we
2573 call vect_detect_hybrid_slp () to find stmts that need hybrid SLP and
2574 loop-based vectorization. Such stmts will be marked as HYBRID. */
2575 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
2576 decided_to_slp++;
2577 }
2578
2579 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo) = unrolling_factor;
2580
2581 if (decided_to_slp && dump_enabled_p ())
2582 {
2583 dump_printf_loc (MSG_NOTE, vect_location,
2584 "Decided to SLP %d instances. Unrolling factor ",
2585 decided_to_slp);
2586 dump_dec (MSG_NOTE, unrolling_factor);
2587 dump_printf (MSG_NOTE, "\n");
2588 }
2589
2590 return (decided_to_slp > 0);
2591 }
2592
2593 /* Private data for vect_detect_hybrid_slp. */
2594 struct vdhs_data
2595 {
2596 loop_vec_info loop_vinfo;
2597 vec<stmt_vec_info> *worklist;
2598 };
2599
2600 /* Walker for walk_gimple_op. */
2601
2602 static tree
2603 vect_detect_hybrid_slp (tree *tp, int *, void *data)
2604 {
2605 walk_stmt_info *wi = (walk_stmt_info *)data;
2606 vdhs_data *dat = (vdhs_data *)wi->info;
2607
2608 if (wi->is_lhs)
2609 return NULL_TREE;
2610
2611 stmt_vec_info def_stmt_info = dat->loop_vinfo->lookup_def (*tp);
2612 if (!def_stmt_info)
2613 return NULL_TREE;
2614 def_stmt_info = vect_stmt_to_vectorize (def_stmt_info);
2615 if (PURE_SLP_STMT (def_stmt_info))
2616 {
2617 if (dump_enabled_p ())
2618 dump_printf_loc (MSG_NOTE, vect_location, "marking hybrid: %G",
2619 def_stmt_info->stmt);
2620 STMT_SLP_TYPE (def_stmt_info) = hybrid;
2621 dat->worklist->safe_push (def_stmt_info);
2622 }
2623
2624 return NULL_TREE;
2625 }
2626
2627 /* Find stmts that must be both vectorized and SLPed. */
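
/* Illustration, not part of the original sources: if a def is consumed
   both by a pure-SLP store group and by a non-SLP use such as a reduction
   handled by loop vectorization, it must be code-generated by both
   schemes; the walk below marks such defs hybrid. */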
2628
2629 void
2630 vect_detect_hybrid_slp (loop_vec_info loop_vinfo)
2631 {
2632 DUMP_VECT_SCOPE ("vect_detect_hybrid_slp");
2633
2634 /* All stmts participating in SLP are marked pure_slp, all other
2635 stmts are loop_vect.
2636 First collect all loop_vect stmts into a worklist. */
2637 auto_vec<stmt_vec_info> worklist;
2638 for (unsigned i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2639 {
2640 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2641 for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
2642 gsi_next (&gsi))
2643 {
2644 gphi *phi = gsi.phi ();
2645 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (phi);
2646 if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
2647 worklist.safe_push (stmt_info);
2648 }
2649 for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
2650 gsi_next (&gsi))
2651 {
2652 gimple *stmt = gsi_stmt (gsi);
2653 if (is_gimple_debug (stmt))
2654 continue;
2655 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
2656 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2657 {
2658 for (gimple_stmt_iterator gsi2
2659 = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info));
2660 !gsi_end_p (gsi2); gsi_next (&gsi2))
2661 {
2662 stmt_vec_info patt_info
2663 = loop_vinfo->lookup_stmt (gsi_stmt (gsi2));
2664 if (!STMT_SLP_TYPE (patt_info)
2665 && STMT_VINFO_RELEVANT (patt_info))
2666 worklist.safe_push (patt_info);
2667 }
2668 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
2669 }
2670 if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
2671 worklist.safe_push (stmt_info);
2672 }
2673 }
2674
2675 /* Now we have a worklist of non-SLP stmts, follow use->def chains and
2676 mark any SLP vectorized stmt as hybrid.
2677 ??? We're visiting def stmts N times (once for each non-SLP and
2678 once for each hybrid-SLP use). */
2679 walk_stmt_info wi;
2680 vdhs_data dat;
2681 dat.worklist = &worklist;
2682 dat.loop_vinfo = loop_vinfo;
2683 memset (&wi, 0, sizeof (wi));
2684 wi.info = (void *)&dat;
2685 while (!worklist.is_empty ())
2686 {
2687 stmt_vec_info stmt_info = worklist.pop ();
2688 /* Since SSA operands are not set up for pattern stmts we need
2689 to use walk_gimple_op. */
2690 wi.is_lhs = 0;
2691 walk_gimple_op (stmt_info->stmt, vect_detect_hybrid_slp, &wi);
2692 }
2693 }
2694
2695
2696 /* Initialize a bb_vec_info struct for the statements between
2697 REGION_BEGIN_IN (inclusive) and REGION_END_IN (exclusive). */
2698
2699 _bb_vec_info::_bb_vec_info (gimple_stmt_iterator region_begin_in,
2700 gimple_stmt_iterator region_end_in,
2701 vec_info_shared *shared)
2702 : vec_info (vec_info::bb, init_cost (NULL), shared),
2703 bb (gsi_bb (region_begin_in)),
2704 region_begin (region_begin_in),
2705 region_end (region_end_in)
2706 {
2707 for (gimple *stmt : this->region_stmts ())
2708 {
2709 gimple_set_uid (stmt, 0);
2710 if (is_gimple_debug (stmt))
2711 continue;
2712 add_stmt (stmt);
2713 }
2714
2715 bb->aux = this;
2716 }
2717
2718
2719 /* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the
2720 stmts in the basic block. */
2721
2722 _bb_vec_info::~_bb_vec_info ()
2723 {
2724 for (gimple *stmt : this->region_stmts ())
2725 /* Reset region marker. */
2726 gimple_set_uid (stmt, -1);
2727
2728 bb->aux = NULL;
2729 }
2730
2731 /* Subroutine of vect_slp_analyze_node_operations. Handle the root of NODE,
2732 given that its child nodes have already been processed and that
2733 their def types currently match their SLP node's def type. */
2734
2735 static bool
2736 vect_slp_analyze_node_operations_1 (vec_info *vinfo, slp_tree node,
2737 slp_instance node_instance,
2738 stmt_vector_for_cost *cost_vec)
2739 {
2740 stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
2741 gcc_assert (STMT_SLP_TYPE (stmt_info) != loop_vect);
2742
2743 /* Calculate the number of vector statements to be created for the
2744 scalar stmts in this node. For SLP reductions it is equal to the
2745 number of vector statements in the children (which has already been
2746 calculated by the recursive call). Otherwise it is the number of
2747 scalar elements in one scalar iteration (DR_GROUP_SIZE) multiplied by
2748 VF divided by the number of elements in a vector. */
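/* Illustration, not part of the original sources: with VF == 4, a node of
   two lanes and V4SI vectors this computes (4 * 2) / 4 == 2 vector
   stmts. */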
2749 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2750 && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
2751 {
2752 for (unsigned i = 0; i < SLP_TREE_CHILDREN (node).length (); ++i)
2753 if (SLP_TREE_DEF_TYPE (SLP_TREE_CHILDREN (node)[i]) == vect_internal_def)
2754 {
2755 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
2756 = SLP_TREE_NUMBER_OF_VEC_STMTS (SLP_TREE_CHILDREN (node)[i]);
2757 break;
2758 }
2759 }
2760 else
2761 {
2762 poly_uint64 vf;
2763 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
2764 vf = loop_vinfo->vectorization_factor;
2765 else
2766 vf = 1;
2767 unsigned int group_size = SLP_TREE_LANES (node);
2768 tree vectype = SLP_TREE_VECTYPE (node);
2769 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
2770 = vect_get_num_vectors (vf * group_size, vectype);
2771 }
2772
2773 /* Handle purely internal nodes. */
2774 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
2775 return vectorizable_slp_permutation (vinfo, NULL, node, cost_vec);
2776
2777 bool dummy;
2778 return vect_analyze_stmt (vinfo, stmt_info, &dummy,
2779 node, node_instance, cost_vec);
2780 }
2781
2782 /* Try to build NODE from scalars, returning true on success.
2783 NODE_INSTANCE is the SLP instance that contains NODE. */
2784
2785 static bool
2786 vect_slp_convert_to_external (vec_info *vinfo, slp_tree node,
2787 slp_instance node_instance)
2788 {
2789 stmt_vec_info stmt_info;
2790 unsigned int i;
2791
2792 if (!is_a <bb_vec_info> (vinfo)
2793 || node == SLP_INSTANCE_TREE (node_instance)
2794 || !SLP_TREE_SCALAR_STMTS (node).exists ()
2795 || vect_contains_pattern_stmt_p (SLP_TREE_SCALAR_STMTS (node)))
2796 return false;
2797
2798 if (dump_enabled_p ())
2799 dump_printf_loc (MSG_NOTE, vect_location,
2800 "Building vector operands from scalars instead\n");
2801
2802 /* Don't remove and free the child nodes here, since they could be
2803 referenced by other structures. The analysis and scheduling phases
2804 (need to) ignore child nodes of anything that isn't vect_internal_def. */
2805 unsigned int group_size = SLP_TREE_LANES (node);
2806 SLP_TREE_DEF_TYPE (node) = vect_external_def;
2807 SLP_TREE_SCALAR_OPS (node).safe_grow (group_size);
2808 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2809 {
2810 tree lhs = gimple_get_lhs (vect_orig_stmt (stmt_info)->stmt);
2811 SLP_TREE_SCALAR_OPS (node)[i] = lhs;
2812 }
2813 return true;
2814 }
2815
2816 /* Compute the prologue cost for invariant or constant operands represented
2817 by NODE. */
2818
2819 static void
2820 vect_prologue_cost_for_slp (slp_tree node,
2821 stmt_vector_for_cost *cost_vec)
2822 {
2823 /* There's a special case of an existing vector, which costs nothing. */
2824 if (SLP_TREE_SCALAR_OPS (node).length () == 0
2825 && !SLP_TREE_VEC_DEFS (node).is_empty ())
2826 return;
2827 /* Without looking at the actual initializer a vector of
2828 constants can be implemented as load from the constant pool.
2829 When all elements are the same we can use a splat. */
2830 tree vectype = SLP_TREE_VECTYPE (node);
2831 unsigned group_size = SLP_TREE_SCALAR_OPS (node).length ();
2832 unsigned num_vects_to_check;
2833 unsigned HOST_WIDE_INT const_nunits;
2834 unsigned nelt_limit;
2835 if (TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits)
2836 && ! multiple_p (const_nunits, group_size))
2837 {
2838 num_vects_to_check = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
2839 nelt_limit = const_nunits;
2840 }
2841 else
2842 {
2843 /* If either the vector has variable length or the vectors
2844 are composed of repeated whole groups we only need to
2845 cost construction once. All vectors will be the same. */
2846 num_vects_to_check = 1;
2847 nelt_limit = group_size;
2848 }
2849 tree elt = NULL_TREE;
2850 unsigned nelt = 0;
2851 for (unsigned j = 0; j < num_vects_to_check * nelt_limit; ++j)
2852 {
2853 unsigned si = j % group_size;
2854 if (nelt == 0)
2855 elt = SLP_TREE_SCALAR_OPS (node)[si];
2856 /* ??? We're just tracking whether all operands of a single
2857 vector initializer are the same, ideally we'd check if
2858 we emitted the same one already. */
2859 else if (elt != SLP_TREE_SCALAR_OPS (node)[si])
2860 elt = NULL_TREE;
2861 nelt++;
2862 if (nelt == nelt_limit)
2863 {
2864 record_stmt_cost (cost_vec, 1,
2865 SLP_TREE_DEF_TYPE (node) == vect_external_def
2866 ? (elt ? scalar_to_vec : vec_construct)
2867 : vector_load,
2868 NULL, vectype, 0, vect_prologue);
2869 nelt = 0;
2870 }
2871 }
2872 }
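
/* Illustration, not part of the original sources: an external-def node
   with scalar ops {x_1, x_1, x_1, x_1} is costed as one scalar_to_vec
   (a splat), one with {a_1, b_2, c_3, d_4} as one vec_construct, and a
   constant-def node as a vector_load from the constant pool. */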
2873
2874 /* Analyze statements contained in SLP tree NODE after recursively analyzing
2875 the subtree. NODE_INSTANCE contains NODE and VINFO contains INSTANCE.
2876
2877 Return true if the operations are supported. */
2878
2879 static bool
2880 vect_slp_analyze_node_operations (vec_info *vinfo, slp_tree node,
2881 slp_instance node_instance,
2882 hash_set<slp_tree> &visited,
2883 hash_set<slp_tree> &lvisited,
2884 stmt_vector_for_cost *cost_vec)
2885 {
2886 int i, j;
2887 slp_tree child;
2888
2889 /* Assume we can code-generate all invariants. */
2890 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2891 return true;
2892
2893 /* If we already analyzed the exact same set of scalar stmts we're done.
2894 We share the generated vector stmts for those.
2895 The SLP graph is acyclic, so not caching whether we failed or succeeded
2896 causes no issue, since we throw away the lvisited set
2897 when we fail. */
2898 if (visited.contains (node)
2899 || lvisited.contains (node))
2900 return true;
2901
2902 bool res = true;
2903 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2904 {
2905 res = vect_slp_analyze_node_operations (vinfo, child, node_instance,
2906 visited, lvisited, cost_vec);
2907 if (!res)
2908 break;
2909 }
2910
2911 if (res)
2912 {
2913 res = vect_slp_analyze_node_operations_1 (vinfo, node, node_instance,
2914 cost_vec);
2915 if (res)
2916 lvisited.add (node);
2917 }
2918
2919 /* When the node can be vectorized, cost the invariant nodes it references.
2920 This is not done in DFS order to allow the referring node's
2921 vectorizable_* calls to nail down the invariant nodes' vector type
2922 and possibly unshare it if it needs a different vector type than
2923 other referrers. */
2924 if (res)
2925 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
2926 if ((SLP_TREE_DEF_TYPE (child) == vect_constant_def
2927 || SLP_TREE_DEF_TYPE (child) == vect_external_def)
2928 /* Perform usual caching, note code-generation still
2929 code-gens these nodes multiple times but we expect
2930 to CSE them later. */
2931 && !visited.contains (child)
2932 && !lvisited.add (child))
2933 {
2934 /* ??? After auditing more code paths make a "default"
2935 and push the vector type from NODE to all children
2936 if it is not already set. */
2937 /* Compute the number of vectors to be generated. */
2938 tree vector_type = SLP_TREE_VECTYPE (child);
2939 if (!vector_type)
2940 {
2941 /* For shifts with a scalar argument we don't need
2942 to cost or code-generate anything.
2943 ??? Represent this more explicitly. */
2944 gcc_assert ((STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (node))
2945 == shift_vec_info_type)
2946 && j == 1);
2947 continue;
2948 }
2949 unsigned group_size = SLP_TREE_LANES (child);
2950 poly_uint64 vf = 1;
2951 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
2952 vf = loop_vinfo->vectorization_factor;
2953 SLP_TREE_NUMBER_OF_VEC_STMTS (child)
2954 = vect_get_num_vectors (vf * group_size, vector_type);
2955 /* And cost them. */
2956 vect_prologue_cost_for_slp (child, cost_vec);
2957 }
2958
2959 /* If this node or any of its children can't be vectorized, try pruning
2960 the tree here rather than felling the whole thing. */
2961 if (!res && vect_slp_convert_to_external (vinfo, node, node_instance))
2962 {
2963 /* We'll need to revisit this for invariant costing and number
2964 of vectorized stmt setting. */
2965 res = true;
2966 }
2967
2968 return res;
2969 }
2970
2971
2972 /* Analyze statements in SLP instances of VINFO. Return true if the
2973 operations are supported. */
2974
2975 bool
2976 vect_slp_analyze_operations (vec_info *vinfo)
2977 {
2978 slp_instance instance;
2979 int i;
2980
2981 DUMP_VECT_SCOPE ("vect_slp_analyze_operations");
2982
2983 hash_set<slp_tree> visited;
2984 for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
2985 {
2986 hash_set<slp_tree> lvisited;
2987 stmt_vector_for_cost cost_vec;
2988 cost_vec.create (2);
2989 if (!vect_slp_analyze_node_operations (vinfo,
2990 SLP_INSTANCE_TREE (instance),
2991 instance, visited, lvisited,
2992 &cost_vec)
2993 /* Instances with a root stmt require vectorized defs for the
2994 SLP tree root. */
2995 || (SLP_INSTANCE_ROOT_STMT (instance)
2996 && (SLP_TREE_DEF_TYPE (SLP_INSTANCE_TREE (instance))
2997 != vect_internal_def)))
2998 {
2999 slp_tree node = SLP_INSTANCE_TREE (instance);
3000 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
3001 if (dump_enabled_p ())
3002 dump_printf_loc (MSG_NOTE, vect_location,
3003 "removing SLP instance operations starting from: %G",
3004 stmt_info->stmt);
3005 vect_free_slp_instance (instance, false);
3006 vinfo->slp_instances.ordered_remove (i);
3007 cost_vec.release ();
3008 }
3009 else
3010 {
3011 for (hash_set<slp_tree>::iterator x = lvisited.begin();
3012 x != lvisited.end(); ++x)
3013 visited.add (*x);
3014 i++;
3015
3016 add_stmt_costs (vinfo, vinfo->target_cost_data, &cost_vec);
3017 cost_vec.release ();
3018 }
3019 }
3020
3021 return !vinfo->slp_instances.is_empty ();
3022 }
3023
3024
3025 /* Compute the scalar cost of the SLP node NODE and its children
3026 and record it in COST_VEC. Do not account defs that are marked in LIFE
3027 and update LIFE according to uses of NODE. */
3028
3029 static void
3030 vect_bb_slp_scalar_cost (vec_info *vinfo,
3031 slp_tree node, vec<bool, va_heap> *life,
3032 stmt_vector_for_cost *cost_vec,
3033 hash_set<slp_tree> &visited)
3034 {
3035 unsigned i;
3036 stmt_vec_info stmt_info;
3037 slp_tree child;
3038
3039 if (visited.add (node))
3040 return;
3041
3042 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
3043 {
3044 ssa_op_iter op_iter;
3045 def_operand_p def_p;
3046
3047 if ((*life)[i])
3048 continue;
3049
3050 /* If there is a non-vectorized use of the defs then the scalar
3051 stmt is kept live in which case we do not account it or any
3052 required defs in the SLP children in the scalar cost. This
3053 way we make the vectorization more costly when compared to
3054 the scalar cost. */
3055 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
3056 gimple *orig_stmt = orig_stmt_info->stmt;
3057 FOR_EACH_SSA_DEF_OPERAND (def_p, orig_stmt, op_iter, SSA_OP_DEF)
3058 {
3059 imm_use_iterator use_iter;
3060 gimple *use_stmt;
3061 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
3062 if (!is_gimple_debug (use_stmt))
3063 {
3064 stmt_vec_info use_stmt_info = vinfo->lookup_stmt (use_stmt);
3065 if (!use_stmt_info
3066 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
3067 {
3068 (*life)[i] = true;
3069 BREAK_FROM_IMM_USE_STMT (use_iter);
3070 }
3071 }
3072 }
3073 if ((*life)[i])
3074 continue;
3075
3076 /* Count scalar stmts only once. */
3077 if (gimple_visited_p (orig_stmt))
3078 continue;
3079 gimple_set_visited (orig_stmt, true);
3080
3081 vect_cost_for_stmt kind;
3082 if (STMT_VINFO_DATA_REF (orig_stmt_info))
3083 {
3084 if (DR_IS_READ (STMT_VINFO_DATA_REF (orig_stmt_info)))
3085 kind = scalar_load;
3086 else
3087 kind = scalar_store;
3088 }
3089 else if (vect_nop_conversion_p (orig_stmt_info))
3090 continue;
3091 else
3092 kind = scalar_stmt;
3093 record_stmt_cost (cost_vec, 1, kind, orig_stmt_info, 0, vect_body);
3094 }
3095
3096 auto_vec<bool, 20> subtree_life;
3097 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3098 {
3099 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
3100 {
3101 /* Do not directly pass LIFE to the recursive call, copy it to
3102 confine changes in the callee to the current child/subtree. */
3103 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
3104 {
3105 subtree_life.safe_grow_cleared (SLP_TREE_LANES (child));
3106 for (unsigned j = 0;
3107 j < SLP_TREE_LANE_PERMUTATION (node).length (); ++j)
3108 {
3109 auto perm = SLP_TREE_LANE_PERMUTATION (node)[j];
3110 if (perm.first == i)
3111 subtree_life[perm.second] = (*life)[j];
3112 }
3113 }
3114 else
3115 {
3116 gcc_assert (SLP_TREE_LANES (node) == SLP_TREE_LANES (child));
3117 subtree_life.safe_splice (*life);
3118 }
3119 vect_bb_slp_scalar_cost (vinfo, child, &subtree_life, cost_vec,
3120 visited);
3121 subtree_life.truncate (0);
3122 }
3123 }
3124 }
3125
3126 /* Check if vectorization of the basic block is profitable. */
3127
3128 static bool
3129 vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo)
3130 {
3131 vec<slp_instance> slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
3132 slp_instance instance;
3133 int i;
3134 unsigned int vec_inside_cost = 0, vec_outside_cost = 0, scalar_cost = 0;
3135 unsigned int vec_prologue_cost = 0, vec_epilogue_cost = 0;
3136
3137 /* Calculate scalar cost. */
3138 stmt_vector_for_cost scalar_costs;
3139 scalar_costs.create (0);
3140 hash_set<slp_tree> visited;
3141 FOR_EACH_VEC_ELT (slp_instances, i, instance)
3142 {
3143 auto_vec<bool, 20> life;
3144 life.safe_grow_cleared (SLP_TREE_LANES (SLP_INSTANCE_TREE (instance)));
3145 vect_bb_slp_scalar_cost (bb_vinfo,
3146 SLP_INSTANCE_TREE (instance),
3147 &life, &scalar_costs, visited);
3148 }
3149 /* Unset visited flag. */
3150 stmt_info_for_cost *si;
3151 FOR_EACH_VEC_ELT (scalar_costs, i, si)
3152 gimple_set_visited (si->stmt_info->stmt, false);
3153
3154 void *target_cost_data = init_cost (NULL);
3155 add_stmt_costs (bb_vinfo, target_cost_data, &scalar_costs);
3156 scalar_costs.release ();
3157 unsigned dummy;
3158 finish_cost (target_cost_data, &dummy, &scalar_cost, &dummy);
3159 destroy_cost_data (target_cost_data);
3160
3161 /* Complete the target-specific cost calculation. */
3162 finish_cost (BB_VINFO_TARGET_COST_DATA (bb_vinfo), &vec_prologue_cost,
3163 &vec_inside_cost, &vec_epilogue_cost);
3164
3165 vec_outside_cost = vec_prologue_cost + vec_epilogue_cost;
3166
3167 if (dump_enabled_p ())
3168 {
3169 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3170 dump_printf (MSG_NOTE, " Vector inside of basic block cost: %d\n",
3171 vec_inside_cost);
3172 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n", vec_prologue_cost);
3173 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n", vec_epilogue_cost);
3174 dump_printf (MSG_NOTE, " Scalar cost of basic block: %d\n", scalar_cost);
3175 }
3176
3177 /* Vectorization is profitable if its cost is no more than the cost of the
3178 scalar version. Note that we err on the vector side for equal cost because
3179 the cost estimate is otherwise quite pessimistic (constant uses are
3180 free on the scalar side but cost a load on the vector side, for
3181 example). */
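/* Illustration, not part of the original sources: with vec_inside_cost
   == 8, vec_outside_cost == 4 and scalar_cost == 12 the costs tie and we
   vectorize; with scalar_cost == 11 the check below fails. */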
3182 if (vec_outside_cost + vec_inside_cost > scalar_cost)
3183 return false;
3184
3185 return true;
3186 }
3187
3188 /* Find any vectorizable constructors and add them to the grouped_stores
3189 array. */
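
/* Illustration, not part of the original sources: an assignment such as
     x_5 = {a_1, b_2, c_3, d_4};
   with a V4SI left-hand side is recorded here and later treated like a
   group of four "stores" rooted at the constructor. */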
3190
3191 static void
3192 vect_slp_check_for_constructors (bb_vec_info bb_vinfo)
3193 {
3194 for (gimple *stmt : bb_vinfo->region_stmts ())
3195 {
3196 gassign *assign = dyn_cast<gassign *> (stmt);
3197 if (!assign || gimple_assign_rhs_code (assign) != CONSTRUCTOR)
3198 continue;
3199
3200 tree rhs = gimple_assign_rhs1 (assign);
3201 if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
3202 || maybe_ne (TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)),
3203 CONSTRUCTOR_NELTS (rhs))
3204 || VECTOR_TYPE_P (TREE_TYPE (CONSTRUCTOR_ELT (rhs, 0)->value))
3205 || uniform_vector_p (rhs))
3206 continue;
3207
3208 stmt_vec_info stmt_info = bb_vinfo->lookup_stmt (assign);
3209 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt_info);
3210 }
3211 }
3212
3213 /* Check if the region described by BB_VINFO can be vectorized, returning
3214 true if so. When returning false, set FATAL to true if the same failure
3215 would prevent vectorization at other vector sizes, false if it is still
3216 worth trying other sizes. N_STMTS is the number of statements in the
3217 region. */
3218
3219 static bool
3220 vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, bool &fatal,
3221 vec<int> *dataref_groups)
3222 {
3223 DUMP_VECT_SCOPE ("vect_slp_analyze_bb");
3224
3225 slp_instance instance;
3226 int i;
3227 poly_uint64 min_vf = 2;
3228
3229 /* The first group of checks is independent of the vector size. */
3230 fatal = true;
3231
3232 /* Analyze the data references. */
3233
3234 if (!vect_analyze_data_refs (bb_vinfo, &min_vf, NULL))
3235 {
3236 if (dump_enabled_p ())
3237 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3238 "not vectorized: unhandled data-ref in basic "
3239 "block.\n");
3240 return false;
3241 }
3242
3243 if (!vect_analyze_data_ref_accesses (bb_vinfo, dataref_groups))
3244 {
3245 if (dump_enabled_p ())
3246 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3247 "not vectorized: unhandled data access in "
3248 "basic block.\n");
3249 return false;
3250 }
3251
3252 vect_slp_check_for_constructors (bb_vinfo);
3253
3254 /* If there are no grouped stores and no constructors in the region
3255 there is no need to continue with pattern recog as vect_analyze_slp
3256 will fail anyway. */
3257 if (bb_vinfo->grouped_stores.is_empty ())
3258 {
3259 if (dump_enabled_p ())
3260 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3261 "not vectorized: no grouped stores in "
3262 "basic block.\n");
3263 return false;
3264 }
3265
3266 /* The rest of the analysis below depends on the vector size in some way, so failures from here on are not fatal. */
3267 fatal = false;
3268
3269 vect_pattern_recog (bb_vinfo);
3270
3271 /* Check the SLP opportunities in the basic block, analyze and build SLP
3272 trees. */
3273 if (!vect_analyze_slp (bb_vinfo, n_stmts))
3274 {
3275 if (dump_enabled_p ())
3276 {
3277 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3278 "Failed to SLP the basic block.\n");
3279 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3280 "not vectorized: failed to find SLP opportunities "
3281 "in basic block.\n");
3282 }
3283 return false;
3284 }
3285
3286 /* Optimize permutations. */
3287 vect_optimize_slp (bb_vinfo);
3288
3289 vect_record_base_alignments (bb_vinfo);
3290
3291 /* Analyze and verify the alignment of data references and the
3292 dependence in the SLP instances. */
3293 for (i = 0; BB_VINFO_SLP_INSTANCES (bb_vinfo).iterate (i, &instance); )
3294 {
3295 if (! vect_slp_analyze_instance_alignment (bb_vinfo, instance)
3296 || ! vect_slp_analyze_instance_dependence (bb_vinfo, instance))
3297 {
3298 slp_tree node = SLP_INSTANCE_TREE (instance);
3299 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
3300 if (dump_enabled_p ())
3301 dump_printf_loc (MSG_NOTE, vect_location,
3302 "removing SLP instance operations starting from: %G",
3303 stmt_info->stmt);
3304 vect_free_slp_instance (instance, false);
3305 BB_VINFO_SLP_INSTANCES (bb_vinfo).ordered_remove (i);
3306 continue;
3307 }
3308
3309 /* Mark all the statements that we want to vectorize as pure SLP and
3310 relevant. */
3311 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
3312 vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance));
3313 if (SLP_INSTANCE_ROOT_STMT (instance))
3314 STMT_SLP_TYPE (SLP_INSTANCE_ROOT_STMT (instance)) = pure_slp;
3315
3316 i++;
3317 }
3318 if (! BB_VINFO_SLP_INSTANCES (bb_vinfo).length ())
3319 return false;
3320
3321 if (!vect_slp_analyze_operations (bb_vinfo))
3322 {
3323 if (dump_enabled_p ())
3324 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3325 "not vectorized: bad operation in basic block.\n");
3326 return false;
3327 }
3328
3329 /* Cost model: check if the vectorization is worthwhile. */
3330 if (!unlimited_cost_model (NULL)
3331 && !vect_bb_vectorization_profitable_p (bb_vinfo))
3332 {
3333 if (dump_enabled_p ())
3334 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3335 "not vectorized: vectorization is not "
3336 "profitable.\n");
3337 return false;
3338 }
3339
3340 if (dump_enabled_p ())
3341 dump_printf_loc (MSG_NOTE, vect_location,
3342 "Basic block will be vectorized using SLP\n");
3343 return true;
3344 }
3345
3346 /* Subroutine of vect_slp_bb. Try to vectorize the statements between
3347 REGION_BEGIN (inclusive) and REGION_END (exclusive), returning true
3348 on success. The region has N_STMTS statements and has the datarefs
3349 given by DATAREFS. */
3350
3351 static bool
3352 vect_slp_region (gimple_stmt_iterator region_begin,
3353 gimple_stmt_iterator region_end,
3354 vec<data_reference_p> datarefs,
3355 vec<int> *dataref_groups,
3356 unsigned int n_stmts)
3357 {
3358 bb_vec_info bb_vinfo;
3359 auto_vector_modes vector_modes;
3360
3361 /* Autodetect the first vector size we try. */
3362 machine_mode next_vector_mode = VOIDmode;
3363 targetm.vectorize.autovectorize_vector_modes (&vector_modes, false);
3364 unsigned int mode_i = 0;
3365
3366 vec_info_shared shared;
3367
3368 machine_mode autodetected_vector_mode = VOIDmode;
3369 while (1)
3370 {
3371 bool vectorized = false;
3372 bool fatal = false;
3373 bb_vinfo = new _bb_vec_info (region_begin, region_end, &shared);
3374
3375 bool first_time_p = shared.datarefs.is_empty ();
3376 BB_VINFO_DATAREFS (bb_vinfo) = datarefs;
3377 if (first_time_p)
3378 bb_vinfo->shared->save_datarefs ();
3379 else
3380 bb_vinfo->shared->check_datarefs ();
3381 bb_vinfo->vector_mode = next_vector_mode;
3382
3383 if (vect_slp_analyze_bb_1 (bb_vinfo, n_stmts, fatal, dataref_groups)
3384 && dbg_cnt (vect_slp))
3385 {
3386 if (dump_enabled_p ())
3387 {
3388 dump_printf_loc (MSG_NOTE, vect_location,
3389 "***** Analysis succeeded with vector mode"
3390 " %s\n", GET_MODE_NAME (bb_vinfo->vector_mode));
3391 dump_printf_loc (MSG_NOTE, vect_location, "SLPing BB part\n");
3392 }
3393
3394 bb_vinfo->shared->check_datarefs ();
3395 vect_schedule_slp (bb_vinfo);
3396
3397 unsigned HOST_WIDE_INT bytes;
3398 if (dump_enabled_p ())
3399 {
3400 if (GET_MODE_SIZE (bb_vinfo->vector_mode).is_constant (&bytes))
3401 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
3402 "basic block part vectorized using %wu byte "
3403 "vectors\n", bytes);
3404 else
3405 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
3406 "basic block part vectorized using variable "
3407 "length vectors\n");
3408 }
3409
3410 vectorized = true;
3411 }
3412 else
3413 {
3414 if (dump_enabled_p ())
3415 dump_printf_loc (MSG_NOTE, vect_location,
3416 "***** Analysis failed with vector mode %s\n",
3417 GET_MODE_NAME (bb_vinfo->vector_mode));
3418 }
3419
3420 if (mode_i == 0)
3421 autodetected_vector_mode = bb_vinfo->vector_mode;
3422
3423 if (!fatal)
3424 while (mode_i < vector_modes.length ()
3425 && vect_chooses_same_modes_p (bb_vinfo, vector_modes[mode_i]))
3426 {
3427 if (dump_enabled_p ())
3428 dump_printf_loc (MSG_NOTE, vect_location,
3429 "***** The result for vector mode %s would"
3430 " be the same\n",
3431 GET_MODE_NAME (vector_modes[mode_i]));
3432 mode_i += 1;
3433 }
3434
3435 delete bb_vinfo;
3436
3437 if (mode_i < vector_modes.length ()
3438 && VECTOR_MODE_P (autodetected_vector_mode)
3439 && (related_vector_mode (vector_modes[mode_i],
3440 GET_MODE_INNER (autodetected_vector_mode))
3441 == autodetected_vector_mode)
3442 && (related_vector_mode (autodetected_vector_mode,
3443 GET_MODE_INNER (vector_modes[mode_i]))
3444 == vector_modes[mode_i]))
3445 {
3446 if (dump_enabled_p ())
3447 dump_printf_loc (MSG_NOTE, vect_location,
3448 "***** Skipping vector mode %s, which would"
3449 " repeat the analysis for %s\n",
3450 GET_MODE_NAME (vector_modes[mode_i]),
3451 GET_MODE_NAME (autodetected_vector_mode));
3452 mode_i += 1;
3453 }
3454
3455 if (vectorized
3456 || mode_i == vector_modes.length ()
3457 || autodetected_vector_mode == VOIDmode
3458 /* If vect_slp_analyze_bb_1 signaled that analysis for all
3459 vector sizes will fail do not bother iterating. */
3460 || fatal)
3461 return vectorized;
3462
3463 /* Try the next biggest vector size. */
3464 next_vector_mode = vector_modes[mode_i++];
3465 if (dump_enabled_p ())
3466 dump_printf_loc (MSG_NOTE, vect_location,
3467 "***** Re-trying analysis with vector mode %s\n",
3468 GET_MODE_NAME (next_vector_mode));
3469 }
3470 }
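/* Illustrative walk-through of the mode iteration above; the concrete
   mode list is a target decision, so the names here are hypothetical.
   On an x86-like target autovectorize_vector_modes might return
   { V64QI, V32QI, V16QI }.  The first analysis runs with VOIDmode,
   letting the target pick its preferred mode, and the chosen mode is
   remembered as autodetected_vector_mode.  On failure the region is
   re-analyzed with V64QI, V32QI, ..., skipping entries that
   vect_chooses_same_modes_p or the related_vector_mode check proves
   would just repeat an earlier attempt, and stopping on success, on a
   fatal analysis failure, or when the list is exhausted.  */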
3471
3472 /* Main entry for the BB vectorizer. Analyze and transform BB, returns
3473 true if anything in the basic-block was vectorized. */
3474
3475 bool
3476 vect_slp_bb (basic_block bb)
3477 {
3478 vec<data_reference_p> datarefs = vNULL;
3479 vec<int> dataref_groups = vNULL;
3480 int insns = 0;
3481 int current_group = 0;
3482 gimple_stmt_iterator region_begin = gsi_start_nondebug_after_labels_bb (bb);
3483 gimple_stmt_iterator region_end = gsi_last_bb (bb);
3484 if (!gsi_end_p (region_end))
3485 gsi_next (&region_end);
3486
3487 for (gimple_stmt_iterator gsi = gsi_after_labels (bb); !gsi_end_p (gsi);
3488 gsi_next (&gsi))
3489 {
3490 gimple *stmt = gsi_stmt (gsi);
3491 if (is_gimple_debug (stmt))
3492 continue;
3493
3494 insns++;
3495
3496 if (gimple_location (stmt) != UNKNOWN_LOCATION)
3497 vect_location = stmt;
3498
3499 if (!vect_find_stmt_data_reference (NULL, stmt, &datarefs,
3500 &dataref_groups, current_group))
3501 ++current_group;
3502
3503 if (insns > param_slp_max_insns_in_bb)
3504 {
3505 if (dump_enabled_p ())
3506 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3507 "not vectorized: too many instructions in basic block.\n");
3508 return false;
3509 }
3510 }
3511
3512 return vect_slp_region (region_begin, region_end, datarefs,
3513 &dataref_groups, insns);
3514 }
3515
3516
3517 /* Build a variable-length vector in which the elements in ELTS are repeated
3518 to fill NRESULTS vectors of type VECTOR_TYPE. Store the vectors in
3519 RESULTS and add any new instructions to SEQ.
3520
3521 The approach we use is:
3522
3523 (1) Find a vector mode VM with integer elements of mode IM.
3524
3525 (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
3526 ELTS' has mode IM. This involves creating NELTS' VIEW_CONVERT_EXPRs
3527 from small vectors to IM.
3528
3529 (3) Duplicate each ELTS'[I] into a vector of mode VM.
3530
3531 (4) Use a tree of interleaving VEC_PERM_EXPRs to create VMs with the
3532 correct byte contents.
3533
3534 (5) Use VIEW_CONVERT_EXPR to cast the final VMs to the required type.
3535
3536 We try to find the largest IM for which this sequence works, in order
3537 to cut down on the number of interleaves. */
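/* As a worked example (target-dependent, hence only a sketch): assume
   ELTS = { a, b, c, d } with 32-bit integer elements, NRESULTS == 1
   and VECTOR_TYPE the variable-length VNx4SI.  If step (1) picks
   IM = DI and VM = VNx2DI then NVECTORS == 2 and we build:

     (2) x = VIEW_CONVERT_EXPR<long> ({ a, b });
         y = VIEW_CONVERT_EXPR<long> ({ c, d });
     (3) X = { x, x, ... }  Y = { y, y, ... }        (both VNx2DI)
     (4) Z = VEC_PERM_EXPR <X, Y, interleave-lo> = { x, y, x, y, ... }
     (5) VIEW_CONVERT_EXPR<VNx4SI> (Z) = { a, b, c, d, a, b, c, d, ... }  */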
3538
3539 void
3540 duplicate_and_interleave (vec_info *vinfo, gimple_seq *seq, tree vector_type,
3541 vec<tree> elts, unsigned int nresults,
3542 vec<tree> &results)
3543 {
3544 unsigned int nelts = elts.length ();
3545 tree element_type = TREE_TYPE (vector_type);
3546
3547 /* (1) Find a vector mode VM with integer elements of mode IM. */
3548 unsigned int nvectors = 1;
3549 tree new_vector_type;
3550 tree permutes[2];
3551 if (!can_duplicate_and_interleave_p (vinfo, nelts, element_type,
3552 &nvectors, &new_vector_type,
3553 permutes))
3554 gcc_unreachable ();
3555
3556 /* Get a vector type that holds ELTS[0:NELTS/NELTS']. */
3557 unsigned int partial_nelts = nelts / nvectors;
3558 tree partial_vector_type = build_vector_type (element_type, partial_nelts);
3559
3560 tree_vector_builder partial_elts;
3561 auto_vec<tree, 32> pieces (nvectors * 2);
3562 pieces.quick_grow (nvectors * 2);
3563 for (unsigned int i = 0; i < nvectors; ++i)
3564 {
3565 /* (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
3566 ELTS' has mode IM. */
3567 partial_elts.new_vector (partial_vector_type, partial_nelts, 1);
3568 for (unsigned int j = 0; j < partial_nelts; ++j)
3569 partial_elts.quick_push (elts[i * partial_nelts + j]);
3570 tree t = gimple_build_vector (seq, &partial_elts);
3571 t = gimple_build (seq, VIEW_CONVERT_EXPR,
3572 TREE_TYPE (new_vector_type), t);
3573
3574 /* (3) Duplicate each ELTS'[I] into a vector of mode VM. */
3575 pieces[i] = gimple_build_vector_from_val (seq, new_vector_type, t);
3576 }
3577
3578 /* (4) Use a tree of VEC_PERM_EXPRs to create a single VM with the
3579 correct byte contents.
3580
3581 We need to repeat the following operation log2(nvectors) times:
3582
3583 out[i * 2] = VEC_PERM_EXPR (in[i], in[i + hi_start], lo_permute);
3584 out[i * 2 + 1] = VEC_PERM_EXPR (in[i], in[i + hi_start], hi_permute);
3585
3586 However, if each input repeats every N elements and the VF is
3587 a multiple of N * 2, the HI result is the same as the LO. */
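/* E.g. with NVECTORS == 4 and pieces A, B, C, D that are duplicates
   of single values a, b, c, d, the first round interleaves A with C
   and B with D and the second round interleaves those results:

     round 1: { a, c, a, c, ... } { b, d, b, d, ... }
     round 2: { a, b, c, d, a, b, c, d, ... }

   In round 1 the HI outputs equal the LO outputs because every input
   repeats each element, which is what the multiple_p check below
   detects in order to re-use the LO result.  */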
3588 unsigned int in_start = 0;
3589 unsigned int out_start = nvectors;
3590 unsigned int hi_start = nvectors / 2;
3591 /* A bound on the number of outputs needed to produce NRESULTS results
3592 in the final iteration. */
3593 unsigned int noutputs_bound = nvectors * nresults;
3594 for (unsigned int in_repeat = 1; in_repeat < nvectors; in_repeat *= 2)
3595 {
3596 noutputs_bound /= 2;
3597 unsigned int limit = MIN (noutputs_bound, nvectors);
3598 for (unsigned int i = 0; i < limit; ++i)
3599 {
3600 if ((i & 1) != 0
3601 && multiple_p (TYPE_VECTOR_SUBPARTS (new_vector_type),
3602 2 * in_repeat))
3603 {
3604 pieces[out_start + i] = pieces[out_start + i - 1];
3605 continue;
3606 }
3607
3608 tree output = make_ssa_name (new_vector_type);
3609 tree input1 = pieces[in_start + (i / 2)];
3610 tree input2 = pieces[in_start + (i / 2) + hi_start];
3611 gassign *stmt = gimple_build_assign (output, VEC_PERM_EXPR,
3612 input1, input2,
3613 permutes[i & 1]);
3614 gimple_seq_add_stmt (seq, stmt);
3615 pieces[out_start + i] = output;
3616 }
3617 std::swap (in_start, out_start);
3618 }
3619
3620 /* (5) Use VIEW_CONVERT_EXPR to cast the final VM to the required type. */
3621 results.reserve (nresults);
3622 for (unsigned int i = 0; i < nresults; ++i)
3623 if (i < nvectors)
3624 results.quick_push (gimple_build (seq, VIEW_CONVERT_EXPR, vector_type,
3625 pieces[in_start + i]));
3626 else
3627 results.quick_push (results[i - nvectors]);
3628 }
3629
3630
3631 /* For constant and loop invariant defs in OP_NODE this function creates
3632 vector defs that will be used in the vectorized stmts and stores them
3633 to SLP_TREE_VEC_DEFS of OP_NODE. */
3634
3635 static void
3636 vect_create_constant_vectors (vec_info *vinfo, slp_tree op_node)
3637 {
3638 unsigned HOST_WIDE_INT nunits;
3639 tree vec_cst;
3640 unsigned j, number_of_places_left_in_vector;
3641 tree vector_type;
3642 tree vop;
3643 int group_size = op_node->ops.length ();
3644 unsigned int vec_num, i;
3645 unsigned number_of_copies = 1;
3646 bool constant_p;
3647 gimple_seq ctor_seq = NULL;
3648 auto_vec<tree, 16> permute_results;
3649
3650 /* We always want SLP_TREE_VECTYPE (op_node) here correctly set. */
3651 vector_type = SLP_TREE_VECTYPE (op_node);
3652
3653 unsigned int number_of_vectors = SLP_TREE_NUMBER_OF_VEC_STMTS (op_node);
3654 SLP_TREE_VEC_DEFS (op_node).create (number_of_vectors);
3655 auto_vec<tree> voprnds (number_of_vectors);
3656
3657 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
3658 the created vectors. It is greater than 1 if unrolling is performed.
3659
3660 For example, we have two scalar operands, s1 and s2 (e.g., group of
3661 strided accesses of size two), while NUNITS is four (i.e., four scalars
3662 of this type can be packed in a vector). The output vector will contain
3663 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
3664 will be 2).
3665
3666 If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
3667 containing the operands.
3668
3669 For example, NUNITS is four as before, and the group size is 8
3670 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
3671 {s5, s6, s7, s8}. */
3672
3673 /* When using duplicate_and_interleave, we just need one element for
3674 each scalar statement. */
3675 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
3676 nunits = group_size;
3677
3678 number_of_copies = nunits * number_of_vectors / group_size;
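/* For the two examples above this evaluates to 4 * 1 / 2 == 2 copies
   and 4 * 2 / 8 == 1 copy respectively.  */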
3679
3680 number_of_places_left_in_vector = nunits;
3681 constant_p = true;
3682 tree_vector_builder elts (vector_type, nunits, 1);
3683 elts.quick_grow (nunits);
3684 stmt_vec_info insert_after = NULL;
3685 for (j = 0; j < number_of_copies; j++)
3686 {
3687 tree op;
3688 for (i = group_size - 1; op_node->ops.iterate (i, &op); i--)
3689 {
3690 /* Create 'vect_ = {op0,op1,...,opn}'. */
3691 number_of_places_left_in_vector--;
3692 tree orig_op = op;
3693 if (!types_compatible_p (TREE_TYPE (vector_type), TREE_TYPE (op)))
3694 {
3695 if (CONSTANT_CLASS_P (op))
3696 {
3697 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
3698 {
3699 /* Can't use VIEW_CONVERT_EXPR for booleans because
3700 of possibly different sizes of scalar value and
3701 vector element. */
3702 if (integer_zerop (op))
3703 op = build_int_cst (TREE_TYPE (vector_type), 0);
3704 else if (integer_onep (op))
3705 op = build_all_ones_cst (TREE_TYPE (vector_type));
3706 else
3707 gcc_unreachable ();
3708 }
3709 else
3710 op = fold_unary (VIEW_CONVERT_EXPR,
3711 TREE_TYPE (vector_type), op);
3712 gcc_assert (op && CONSTANT_CLASS_P (op));
3713 }
3714 else
3715 {
3716 tree new_temp = make_ssa_name (TREE_TYPE (vector_type));
3717 gimple *init_stmt;
3718 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
3719 {
3720 tree true_val
3721 = build_all_ones_cst (TREE_TYPE (vector_type));
3722 tree false_val
3723 = build_zero_cst (TREE_TYPE (vector_type));
3724 gcc_assert (INTEGRAL_TYPE_P (TREE_TYPE (op)));
3725 init_stmt = gimple_build_assign (new_temp, COND_EXPR,
3726 op, true_val,
3727 false_val);
3728 }
3729 else
3730 {
3731 op = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (vector_type),
3732 op);
3733 init_stmt
3734 = gimple_build_assign (new_temp, VIEW_CONVERT_EXPR,
3735 op);
3736 }
3737 gimple_seq_add_stmt (&ctor_seq, init_stmt);
3738 op = new_temp;
3739 }
3740 }
3741 elts[number_of_places_left_in_vector] = op;
3742 if (!CONSTANT_CLASS_P (op))
3743 constant_p = false;
3744 /* For BB vectorization we have to compute an insert location
3745 when a def is inside the analyzed region since we cannot
3746 simply insert at the BB start in this case. */
3747 stmt_vec_info opdef;
3748 if (TREE_CODE (orig_op) == SSA_NAME
3749 && !SSA_NAME_IS_DEFAULT_DEF (orig_op)
3750 && is_a <bb_vec_info> (vinfo)
3751 && (opdef = vinfo->lookup_def (orig_op)))
3752 {
3753 if (!insert_after)
3754 insert_after = opdef;
3755 else
3756 insert_after = get_later_stmt (insert_after, opdef);
3757 }
3758
3759 if (number_of_places_left_in_vector == 0)
3760 {
3761 if (constant_p
3762 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
3763 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
3764 vec_cst = gimple_build_vector (&ctor_seq, &elts);
3765 else
3766 {
3767 if (permute_results.is_empty ())
3768 duplicate_and_interleave (vinfo, &ctor_seq, vector_type,
3769 elts, number_of_vectors,
3770 permute_results);
3771 vec_cst = permute_results[number_of_vectors - j - 1];
3772 }
3773 if (!gimple_seq_empty_p (ctor_seq))
3774 {
3775 if (insert_after)
3776 {
3777 gimple_stmt_iterator gsi
3778 = gsi_for_stmt (insert_after->stmt);
3779 gsi_insert_seq_after (&gsi, ctor_seq,
3780 GSI_CONTINUE_LINKING);
3781 }
3782 else
3783 vinfo->insert_seq_on_entry (NULL, ctor_seq);
3784 ctor_seq = NULL;
3785 }
3786 voprnds.quick_push (vec_cst);
3787 insert_after = NULL;
3788 number_of_places_left_in_vector = nunits;
3789 constant_p = true;
3790 elts.new_vector (vector_type, nunits, 1);
3791 elts.quick_grow (nunits);
3792 }
3793 }
3794 }
3795
3796 /* Since the vectors were created in reverse order, reverse them here
3797 so the defs end up in the original order. */
3798 vec_num = voprnds.length ();
3799 for (j = vec_num; j != 0; j--)
3800 {
3801 vop = voprnds[j - 1];
3802 SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
3803 }
3804
3805 /* If the VF is greater than the unrolling factor needed for the SLP
3806 group of stmts, the NUMBER_OF_VECTORS to be created is greater than
3807 NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we
3808 have to replicate the vectors. */
3809 while (number_of_vectors > SLP_TREE_VEC_DEFS (op_node).length ())
3810 for (i = 0; SLP_TREE_VEC_DEFS (op_node).iterate (i, &vop) && i < vec_num;
3811 i++)
3812 SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
3813 }
3814
3815 /* Get the Ith vectorized definition from SLP_NODE. */
3816
3817 tree
3818 vect_get_slp_vect_def (slp_tree slp_node, unsigned i)
3819 {
3820 if (SLP_TREE_VEC_STMTS (slp_node).exists ())
3821 return gimple_get_lhs (SLP_TREE_VEC_STMTS (slp_node)[i]);
3822 else
3823 return SLP_TREE_VEC_DEFS (slp_node)[i];
3824 }
3825
3826 /* Get the vectorized definitions of SLP_NODE in *VEC_DEFS. */
3827
3828 void
3829 vect_get_slp_defs (slp_tree slp_node, vec<tree> *vec_defs)
3830 {
3831 vec_defs->create (SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node));
3832 if (SLP_TREE_DEF_TYPE (slp_node) == vect_internal_def)
3833 {
3834 unsigned j;
3835 gimple *vec_def_stmt;
3836 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (slp_node), j, vec_def_stmt)
3837 vec_defs->quick_push (gimple_get_lhs (vec_def_stmt));
3838 }
3839 else
3840 vec_defs->splice (SLP_TREE_VEC_DEFS (slp_node));
3841 }
3842
3843 /* Get N vectorized definitions for SLP_NODE. */
3844
3845 void
3846 vect_get_slp_defs (vec_info *,
3847 slp_tree slp_node, vec<vec<tree> > *vec_oprnds, unsigned n)
3848 {
3849 if (n == -1U)
3850 n = SLP_TREE_CHILDREN (slp_node).length ();
3851
3852 for (unsigned i = 0; i < n; ++i)
3853 {
3854 slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
3855 vec<tree> vec_defs = vNULL;
3856 vect_get_slp_defs (child, &vec_defs);
3857 vec_oprnds->quick_push (vec_defs);
3858 }
3859 }
3860
3861 /* Generate vector permute statements from a list of loads in DR_CHAIN.
3862 If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
3863 permute statements for the SLP node NODE. */
3864
3865 bool
3866 vect_transform_slp_perm_load (vec_info *vinfo,
3867 slp_tree node, vec<tree> dr_chain,
3868 gimple_stmt_iterator *gsi, poly_uint64 vf,
3869 bool analyze_only, unsigned *n_perms)
3870 {
3871 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
3872 int vec_index = 0;
3873 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3874 unsigned int group_size = SLP_TREE_SCALAR_STMTS (node).length ();
3875 unsigned int mask_element;
3876 machine_mode mode;
3877
3878 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
3879 return false;
3880
3881 stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
3882
3883 mode = TYPE_MODE (vectype);
3884 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
3885
3886 /* Initialize the vect stmts of NODE to properly insert the generated
3887 stmts later. */
3888 if (! analyze_only)
3889 for (unsigned i = SLP_TREE_VEC_STMTS (node).length ();
3890 i < SLP_TREE_NUMBER_OF_VEC_STMTS (node); i++)
3891 SLP_TREE_VEC_STMTS (node).quick_push (NULL);
3892
3893 /* Generate permutation masks for every NODE. The number of masks for
3894 each NODE is equal to GROUP_SIZE.
3895 E.g., we have a group of three nodes with three loads from the same
3896 location in each node, and the vector size is 4. I.e., we have an
3897 a0b0c0a1b1c1... sequence and we need to create the following vectors:
3898 for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
3899 for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
3900 ...
3901
3902 The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}.
3903 The last mask is illegal since we assume two operands for a permute
3904 operation, and the mask element values can't be outside that range.
3905 Hence, the last mask must be converted into {2,5,5,5}.
3906 For the first two permutations we need the first and the second input
3907 vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
3908 we need the second and the third vectors: {b1,c1,a2,b2} and
3909 {c2,a3,b3,c3}. */
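/* In terms of the code below: mask element I selects vector I / NUNITS,
   lane I % NUNITS, with NUNITS added when the lane comes from the
   second selected vector.  For the last mask above, {6,9,9,9}, the
   elements live in vectors 1 and 2, so with FIRST_VEC_INDEX == 1 and
   SECOND_VEC_INDEX == 2 this yields { 6 % 4, 9 % 4 + 4, 9 % 4 + 4,
   9 % 4 + 4 } == { 2, 5, 5, 5 }.  */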
3910
3911 int vect_stmts_counter = 0;
3912 unsigned int index = 0;
3913 int first_vec_index = -1;
3914 int second_vec_index = -1;
3915 bool noop_p = true;
3916 *n_perms = 0;
3917
3918 vec_perm_builder mask;
3919 unsigned int nelts_to_build;
3920 unsigned int nvectors_per_build;
3921 bool repeating_p = (group_size == DR_GROUP_SIZE (stmt_info)
3922 && multiple_p (nunits, group_size));
3923 if (repeating_p)
3924 {
3925 /* A single vector contains a whole number of copies of the node, so:
3926 (a) all permutes can use the same mask; and
3927 (b) the permutes only need a single vector input. */
3928 mask.new_vector (nunits, group_size, 3);
3929 nelts_to_build = mask.encoded_nelts ();
3930 nvectors_per_build = SLP_TREE_VEC_STMTS (node).length ();
3931 }
3932 else
3933 {
3934 /* We need to construct a separate mask for each vector statement. */
3935 unsigned HOST_WIDE_INT const_nunits, const_vf;
3936 if (!nunits.is_constant (&const_nunits)
3937 || !vf.is_constant (&const_vf))
3938 return false;
3939 mask.new_vector (const_nunits, const_nunits, 1);
3940 nelts_to_build = const_vf * group_size;
3941 nvectors_per_build = 1;
3942 }
3943
3944 unsigned int count = mask.encoded_nelts ();
3945 mask.quick_grow (count);
3946 vec_perm_indices indices;
3947
3948 for (unsigned int j = 0; j < nelts_to_build; j++)
3949 {
3950 unsigned int iter_num = j / group_size;
3951 unsigned int stmt_num = j % group_size;
3952 unsigned int i = (iter_num * DR_GROUP_SIZE (stmt_info)
3953 + SLP_TREE_LOAD_PERMUTATION (node)[stmt_num]);
3954 if (repeating_p)
3955 {
3956 first_vec_index = 0;
3957 mask_element = i;
3958 }
3959 else
3960 {
3961 /* Enforced before the loop when !repeating_p. */
3962 unsigned int const_nunits = nunits.to_constant ();
3963 vec_index = i / const_nunits;
3964 mask_element = i % const_nunits;
3965 if (vec_index == first_vec_index
3966 || first_vec_index == -1)
3967 {
3968 first_vec_index = vec_index;
3969 }
3970 else if (vec_index == second_vec_index
3971 || second_vec_index == -1)
3972 {
3973 second_vec_index = vec_index;
3974 mask_element += const_nunits;
3975 }
3976 else
3977 {
3978 if (dump_enabled_p ())
3979 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3980 "permutation requires at "
3981 "least three vectors %G",
3982 stmt_info->stmt);
3983 gcc_assert (analyze_only);
3984 return false;
3985 }
3986
3987 gcc_assert (mask_element < 2 * const_nunits);
3988 }
3989
3990 if (mask_element != index)
3991 noop_p = false;
3992 mask[index++] = mask_element;
3993
3994 if (index == count && !noop_p)
3995 {
3996 indices.new_vector (mask, second_vec_index == -1 ? 1 : 2, nunits);
3997 if (!can_vec_perm_const_p (mode, indices))
3998 {
3999 if (dump_enabled_p ())
4000 {
4001 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
4002 vect_location,
4003 "unsupported vect permute { ");
4004 for (i = 0; i < count; ++i)
4005 {
4006 dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
4007 dump_printf (MSG_MISSED_OPTIMIZATION, " ");
4008 }
4009 dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
4010 }
4011 gcc_assert (analyze_only);
4012 return false;
4013 }
4014
4015 ++*n_perms;
4016 }
4017
4018 if (index == count)
4019 {
4020 if (!analyze_only)
4021 {
4022 tree mask_vec = NULL_TREE;
4023
4024 if (! noop_p)
4025 mask_vec = vect_gen_perm_mask_checked (vectype, indices);
4026
4027 if (second_vec_index == -1)
4028 second_vec_index = first_vec_index;
4029
4030 for (unsigned int ri = 0; ri < nvectors_per_build; ++ri)
4031 {
4032 /* Generate the permute statement if necessary. */
4033 tree first_vec = dr_chain[first_vec_index + ri];
4034 tree second_vec = dr_chain[second_vec_index + ri];
4035 gimple *perm_stmt;
4036 if (! noop_p)
4037 {
4038 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
4039 tree perm_dest
4040 = vect_create_destination_var (gimple_assign_lhs (stmt),
4041 vectype);
4042 perm_dest = make_ssa_name (perm_dest);
4043 perm_stmt
4044 = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
4045 first_vec, second_vec,
4046 mask_vec);
4047 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt,
4048 gsi);
4049 }
4050 else
4051 /* If mask was NULL_TREE generate the requested
4052 identity transform. */
4053 perm_stmt = SSA_NAME_DEF_STMT (first_vec);
4054
4055 /* Store the vector statement in NODE. */
4056 SLP_TREE_VEC_STMTS (node)[vect_stmts_counter++] = perm_stmt;
4057 }
4058 }
4059
4060 index = 0;
4061 first_vec_index = -1;
4062 second_vec_index = -1;
4063 noop_p = true;
4064 }
4065 }
4066
4067 return true;
4068 }
4069
4070
4071 /* Vectorize the SLP permutations in NODE as specified
4072 in SLP_TREE_LANE_PERMUTATION which is a vector of pairs of SLP
4073 child number and lane number.
4074 Interleaving of two two-lane two-child SLP subtrees (not supported):
4075 [ { 0, 0 }, { 1, 0 }, { 0, 1 }, { 1, 1 } ]
4076 A blend of two four-lane two-child SLP subtrees:
4077 [ { 0, 0 }, { 1, 1 }, { 0, 2 }, { 1, 3 } ]
4078 Highpart of a four-lane one-child SLP subtree (not supported):
4079 [ { 0, 2 }, { 0, 3 } ]
4080 Currently only a subset of these is supported by the code generation below. */
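/* For the supported blend above, with V4SI vectors and VF == 1, the
   code below first computes the { { operand, vector }, lane } sequence
   [ { {0,0}, 0 }, { {1,0}, 1 }, { {0,0}, 2 }, { {1,0}, 3 } ] and from
   it the two-input permute mask { 0, 5, 2, 7 }, selecting lanes 0 and 2
   from the first child's vector and lanes 1 and 3 from the second.  */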
4081
4082 static bool
4083 vectorizable_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
4084 slp_tree node, stmt_vector_for_cost *cost_vec)
4085 {
4086 tree vectype = SLP_TREE_VECTYPE (node);
4087
4088 /* ??? We currently only support the case where all input and output
4089 vector types are the same, while the SLP IL should really do a
4090 concat + select and thus accept arbitrary mismatches. */
4091 slp_tree child;
4092 unsigned i;
4093 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4094 if (!types_compatible_p (SLP_TREE_VECTYPE (child), vectype))
4095 {
4096 if (dump_enabled_p ())
4097 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4098 "Unsupported lane permutation\n");
4099 return false;
4100 }
4101
4102 vec<std::pair<unsigned, unsigned> > &perm = SLP_TREE_LANE_PERMUTATION (node);
4103 gcc_assert (perm.length () == SLP_TREE_LANES (node));
4104 if (dump_enabled_p ())
4105 {
4106 dump_printf_loc (MSG_NOTE, vect_location,
4107 "vectorizing permutation");
4108 for (unsigned i = 0; i < perm.length (); ++i)
4109 dump_printf (MSG_NOTE, " op%u[%u]", perm[i].first, perm[i].second);
4110 dump_printf (MSG_NOTE, "\n");
4111 }
4112
4113 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
4114 if (!nunits.is_constant ())
4115 return false;
4116 unsigned HOST_WIDE_INT vf = 1;
4117 if (loop_vec_info linfo = dyn_cast <loop_vec_info> (vinfo))
4118 if (!LOOP_VINFO_VECT_FACTOR (linfo).is_constant (&vf))
4119 return false;
4120 unsigned olanes = vf * SLP_TREE_LANES (node);
4121 gcc_assert (multiple_p (olanes, nunits));
4122
4123 /* Compute the { { SLP operand, vector index}, lane } permutation sequence
4124 from the { SLP operand, scalar lane } permutation as recorded in the
4125 SLP node as intermediate step. This part should already work
4126 with SLP children with arbitrary number of lanes. */
4127 auto_vec<std::pair<std::pair<unsigned, unsigned>, unsigned> > vperm;
4128 auto_vec<unsigned> active_lane;
4129 vperm.create (olanes);
4130 active_lane.safe_grow_cleared (SLP_TREE_CHILDREN (node).length ());
4131 for (unsigned i = 0; i < vf; ++i)
4132 {
4133 for (unsigned pi = 0; pi < perm.length (); ++pi)
4134 {
4135 std::pair<unsigned, unsigned> p = perm[pi];
4136 tree vtype = SLP_TREE_VECTYPE (SLP_TREE_CHILDREN (node)[p.first]);
4137 unsigned vnunits = TYPE_VECTOR_SUBPARTS (vtype).to_constant ();
4138 unsigned vi = (active_lane[p.first] + p.second) / vnunits;
4139 unsigned vl = (active_lane[p.first] + p.second) % vnunits;
4140 vperm.quick_push (std::make_pair (std::make_pair (p.first, vi), vl));
4141 }
4142 /* Advance to the next group. */
4143 for (unsigned j = 0; j < SLP_TREE_CHILDREN (node).length (); ++j)
4144 active_lane[j] += SLP_TREE_LANES (SLP_TREE_CHILDREN (node)[j]);
4145 }
4146
4147 if (dump_enabled_p ())
4148 {
4149 dump_printf_loc (MSG_NOTE, vect_location, "as");
4150 for (unsigned i = 0; i < vperm.length (); ++i)
4151 {
4152 if (i != 0 && multiple_p (i, TYPE_VECTOR_SUBPARTS (vectype)))
4153 dump_printf (MSG_NOTE, ",");
4154 dump_printf (MSG_NOTE, " vops%u[%u][%u]",
4155 vperm[i].first.first, vperm[i].first.second,
4156 vperm[i].second);
4157 }
4158 dump_printf (MSG_NOTE, "\n");
4159 }
4160
4161 /* We can only handle two-vector permutes, everything else should
4162 be lowered on the SLP level. The following is closely inspired
4163 by vect_transform_slp_perm_load and is supposed to eventually
4164 replace it.
4165 ??? As intermediate step do code-gen in the SLP tree representation
4166 somehow? */
4167 std::pair<unsigned, unsigned> first_vec = std::make_pair (-1U, -1U);
4168 std::pair<unsigned, unsigned> second_vec = std::make_pair (-1U, -1U);
4169 unsigned int const_nunits = nunits.to_constant ();
4170 unsigned int index = 0;
4171 unsigned int mask_element;
4172 vec_perm_builder mask;
4173 mask.new_vector (const_nunits, const_nunits, 1);
4174 unsigned int count = mask.encoded_nelts ();
4175 mask.quick_grow (count);
4176 vec_perm_indices indices;
4177 unsigned nperms = 0;
4178 for (unsigned i = 0; i < vperm.length (); ++i)
4179 {
4180 mask_element = vperm[i].second;
4181 if (first_vec.first == -1U
4182 || first_vec == vperm[i].first)
4183 first_vec = vperm[i].first;
4184 else if (second_vec.first == -1U
4185 || second_vec == vperm[i].first)
4186 {
4187 second_vec = vperm[i].first;
4188 mask_element += const_nunits;
4189 }
4190 else
4191 {
4192 if (dump_enabled_p ())
4193 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4194 "permutation requires at "
4195 "least three vectors");
4196 gcc_assert (!gsi);
4197 return false;
4198 }
4199
4200 mask[index++] = mask_element;
4201
4202 if (index == count)
4203 {
4204 indices.new_vector (mask, second_vec.first == -1U ? 1 : 2,
4205 const_nunits);
4206 bool identity_p = indices.series_p (0, 1, 0, 1);
4207 if (!identity_p
4208 && !can_vec_perm_const_p (TYPE_MODE (vectype), indices))
4209 {
4210 if (dump_enabled_p ())
4211 {
4212 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
4213 vect_location,
4214 "unsupported vect permute { ");
4215 for (i = 0; i < count; ++i)
4216 {
4217 dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
4218 dump_printf (MSG_MISSED_OPTIMIZATION, " ");
4219 }
4220 dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
4221 }
4222 gcc_assert (!gsi);
4223 return false;
4224 }
4225
4226 if (!identity_p)
4227 nperms++;
4228 if (gsi)
4229 {
4230 if (second_vec.first == -1U)
4231 second_vec = first_vec;
4232
4233 /* Generate the permute statement if necessary. */
4234 slp_tree first_node = SLP_TREE_CHILDREN (node)[first_vec.first];
4235 tree first_def
4236 = vect_get_slp_vect_def (first_node, first_vec.second);
4237 gassign *perm_stmt;
4238 tree perm_dest = make_ssa_name (vectype);
4239 if (!identity_p)
4240 {
4241 slp_tree second_node
4242 = SLP_TREE_CHILDREN (node)[second_vec.first];
4243 tree second_def
4244 = vect_get_slp_vect_def (second_node, second_vec.second);
4245 tree mask_vec = vect_gen_perm_mask_checked (vectype, indices);
4246 perm_stmt = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
4247 first_def, second_def,
4248 mask_vec);
4249 }
4250 else
4251 /* We need a copy here in case the def was external. */
4252 perm_stmt = gimple_build_assign (perm_dest, first_def);
4253 vect_finish_stmt_generation (vinfo, NULL, perm_stmt, gsi);
4254 /* Store the vector statement in NODE. */
4255 SLP_TREE_VEC_STMTS (node).quick_push (perm_stmt);
4256 }
4257
4258 index = 0;
4259 first_vec = std::make_pair (-1U, -1U);
4260 second_vec = std::make_pair (-1U, -1U);
4261 }
4262 }
4263
4264 if (!gsi)
4265 record_stmt_cost (cost_vec, nperms, vec_perm, NULL, vectype, 0, vect_body);
4266
4267 return true;
4268 }
4269
4270 /* Vectorize SLP instance tree in postorder. */
4271
4272 static void
4273 vect_schedule_slp_instance (vec_info *vinfo,
4274 slp_tree node, slp_instance instance)
4275 {
4276 gimple_stmt_iterator si;
4277 int i;
4278 slp_tree child;
4279
4280 /* See if we have already vectorized the node in the graph of the
4281 SLP instance. */
4282 if ((SLP_TREE_DEF_TYPE (node) == vect_internal_def
4283 && SLP_TREE_VEC_STMTS (node).exists ())
4284 || SLP_TREE_VEC_DEFS (node).exists ())
4285 return;
4286
4287 /* Vectorize externals and constants. */
4288 if (SLP_TREE_DEF_TYPE (node) == vect_constant_def
4289 || SLP_TREE_DEF_TYPE (node) == vect_external_def)
4290 {
4291 /* ??? vectorizable_shift can end up using a scalar operand which is
4292 currently denoted as !SLP_TREE_VECTYPE. No need to vectorize the
4293 node in this case. */
4294 if (!SLP_TREE_VECTYPE (node))
4295 return;
4296
4297 vect_create_constant_vectors (vinfo, node);
4298 return;
4299 }
4300
4301 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4302 vect_schedule_slp_instance (vinfo, child, instance);
4303
4304 gcc_assert (SLP_TREE_NUMBER_OF_VEC_STMTS (node) != 0);
4305 SLP_TREE_VEC_STMTS (node).create (SLP_TREE_NUMBER_OF_VEC_STMTS (node));
4306
4307 stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
4308 if (dump_enabled_p ())
4309 dump_printf_loc (MSG_NOTE, vect_location,
4310 "------>vectorizing SLP node starting from: %G",
4311 stmt_info->stmt);
4312
4313 if (STMT_VINFO_DATA_REF (stmt_info)
4314 && SLP_TREE_CODE (node) != VEC_PERM_EXPR)
4315 {
4316 /* Vectorized loads go before the first scalar load to make it
4317 ready early; vectorized stores go before the last scalar
4318 stmt, which is where all uses are ready. */
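/* E.g. (illustrative), for a two-lane store group
     s[0] = a_1; ... s[1] = b_2;
   the vector store is emitted immediately before "s[1] = b_2;", where
   both scalar operands are available, while the vector load for a
   load group is emitted before the group's first scalar load.  */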
4319 stmt_vec_info last_stmt_info = NULL;
4320 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
4321 last_stmt_info = vect_find_first_scalar_stmt_in_slp (node);
4322 else /* DR_IS_WRITE */
4323 last_stmt_info = vect_find_last_scalar_stmt_in_slp (node);
4324 si = gsi_for_stmt (last_stmt_info->stmt);
4325 }
4326 else if (SLP_TREE_CHILDREN (node).is_empty ())
4327 {
4328 /* This happens for reduction and induction PHIs where we do not use the
4329 insertion iterator. */
4330 gcc_assert (STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (node))
4331 == cycle_phi_info_type
4332 || (STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (node))
4333 == induc_vec_info_type));
4334 si = gsi_none ();
4335 }
4336 else
4337 {
4338 /* Emit other stmts after the children's vectorized defs, which is
4339 the earliest possible point. */
4340 gimple *last_stmt = NULL;
4341 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4342 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
4343 {
4344 /* For fold-left reductions we are retaining the scalar
4345 reduction PHI but we still have SLP_TREE_NUMBER_OF_VEC_STMTS
4346 set so the representation isn't perfect. Resort to the
4347 last scalar def here. */
4348 if (SLP_TREE_VEC_STMTS (child).is_empty ())
4349 {
4350 gcc_assert (STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (child))
4351 == cycle_phi_info_type);
4352 gphi *phi = as_a <gphi *>
4353 (vect_find_last_scalar_stmt_in_slp (child)->stmt);
4354 if (!last_stmt
4355 || vect_stmt_dominates_stmt_p (last_stmt, phi))
4356 last_stmt = phi;
4357 }
4358 /* We are emitting all vectorized stmts in the same place, so
4359 the last one in the vector is also the last one emitted.
4360 ??? Unless we have a load permutation applied and that
4361 happens to re-use an earlier generated load. */
4362 unsigned j;
4363 gimple *vstmt;
4364 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (child), j, vstmt)
4365 if (!last_stmt
4366 || vect_stmt_dominates_stmt_p (last_stmt, vstmt))
4367 last_stmt = vstmt;
4368 }
4369 else if (!SLP_TREE_VECTYPE (child))
4370 {
4371 /* For externals that remain unvectorized, look at all their scalar defs. */
4372 unsigned j;
4373 tree def;
4374 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (child), j, def)
4375 if (TREE_CODE (def) == SSA_NAME
4376 && !SSA_NAME_IS_DEFAULT_DEF (def))
4377 {
4378 gimple *stmt = SSA_NAME_DEF_STMT (def);
4379 if (!last_stmt
4380 || vect_stmt_dominates_stmt_p (last_stmt, stmt))
4381 last_stmt = stmt;
4382 }
4383 }
4384 else
4385 {
4386 /* For externals we have to look at all defs since their
4387 insertion place is decided per vector. But beware
4388 of pre-existing vectors where we need to make sure
4389 we do not insert before the region boundary. */
4390 if (SLP_TREE_SCALAR_OPS (child).is_empty ()
4391 && !vinfo->lookup_def (SLP_TREE_VEC_DEFS (child)[0]))
4392 last_stmt = gsi_stmt (as_a <bb_vec_info> (vinfo)->region_begin);
4393 else
4394 {
4395 unsigned j;
4396 tree vdef;
4397 FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (child), j, vdef)
4398 if (TREE_CODE (vdef) == SSA_NAME
4399 && !SSA_NAME_IS_DEFAULT_DEF (vdef))
4400 {
4401 gimple *vstmt = SSA_NAME_DEF_STMT (vdef);
4402 if (!last_stmt
4403 || vect_stmt_dominates_stmt_p (last_stmt, vstmt))
4404 last_stmt = vstmt;
4405 }
4406 }
4407 }
4408 /* This can happen when all children are pre-existing vectors or
4409 constants. */
4410 if (!last_stmt)
4411 last_stmt = vect_find_first_scalar_stmt_in_slp (node)->stmt;
4412 if (is_a <gphi *> (last_stmt))
4413 si = gsi_after_labels (gimple_bb (last_stmt));
4414 else
4415 {
4416 si = gsi_for_stmt (last_stmt);
4417 gsi_next (&si);
4418 }
4419 }
4420
4421 bool done_p = false;
4422
4423 /* Handle purely internal nodes. */
4424 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
4425 {
4426 /* ??? The transform kind is stored in STMT_VINFO_TYPE which might
4427 be shared with different SLP nodes (but usually it's the same
4428 operation apart from the case where the stmt is only there to denote
4429 the actual scalar lane defs ...). So do not call vect_transform_stmt
4430 but open-code it here (partly). */
4431 bool done = vectorizable_slp_permutation (vinfo, &si, node, NULL);
4432 gcc_assert (done);
4433 done_p = true;
4434 }
4435 if (!done_p)
4436 vect_transform_stmt (vinfo, stmt_info, &si, node, instance);
4437 }
4438
4439 /* Replace scalar calls from SLP node NODE by setting their lhs to zero.
4440 For loop vectorization this is done in vectorizable_call, but for SLP
4441 it needs to be deferred until end of vect_schedule_slp, because multiple
4442 SLP instances may refer to the same scalar stmt. */
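/* E.g. (illustrative), a scalar call whose result has been vectorized,
   say

     x_1 = __builtin_pow (a_2, b_3);

   is replaced by

     x_1 = 0.0;

   which leaves only a trivially dead scalar definition behind.  */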
4443
4444 static void
4445 vect_remove_slp_scalar_calls (vec_info *vinfo,
4446 slp_tree node, hash_set<slp_tree> &visited)
4447 {
4448 gimple *new_stmt;
4449 gimple_stmt_iterator gsi;
4450 int i;
4451 slp_tree child;
4452 tree lhs;
4453 stmt_vec_info stmt_info;
4454
4455 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
4456 return;
4457
4458 if (visited.add (node))
4459 return;
4460
4461 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4462 vect_remove_slp_scalar_calls (vinfo, child, visited);
4463
4464 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
4465 {
4466 gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt);
4467 if (!stmt || gimple_bb (stmt) == NULL)
4468 continue;
4469 if (is_pattern_stmt_p (stmt_info)
4470 || !PURE_SLP_STMT (stmt_info))
4471 continue;
4472 lhs = gimple_call_lhs (stmt);
4473 new_stmt = gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs)));
4474 gsi = gsi_for_stmt (stmt);
4475 vinfo->replace_stmt (&gsi, stmt_info, new_stmt);
4476 SSA_NAME_DEF_STMT (gimple_assign_lhs (new_stmt)) = new_stmt;
4477 }
4478 }
4479
4480 static void
4481 vect_remove_slp_scalar_calls (vec_info *vinfo, slp_tree node)
4482 {
4483 hash_set<slp_tree> visited;
4484 vect_remove_slp_scalar_calls (vinfo, node, visited);
4485 }
4486
4487 /* Vectorize the instance root. */
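/* For example (illustrative), for a BB SLP instance rooted at the
   constructor

     x_1 = { a_2, b_3, c_4, d_5 };

   a single vector statement defining vect__6 lets the root be rewritten
   as x_1 = vect__6 (with a VIEW_CONVERT_EXPR if the types differ), while
   multiple vector statements are combined into a new CONSTRUCTOR of
   their lhs values.  */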
4488
4489 void
4490 vectorize_slp_instance_root_stmt (slp_tree node, slp_instance instance)
4491 {
4492 gassign *rstmt = NULL;
4493
4494 if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) == 1)
4495 {
4496 gimple *child_stmt;
4497 int j;
4498
4499 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (node), j, child_stmt)
4500 {
4501 tree vect_lhs = gimple_get_lhs (child_stmt);
4502 tree root_lhs = gimple_get_lhs (instance->root_stmt->stmt);
4503 if (!useless_type_conversion_p (TREE_TYPE (root_lhs),
4504 TREE_TYPE (vect_lhs)))
4505 vect_lhs = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (root_lhs),
4506 vect_lhs);
4507 rstmt = gimple_build_assign (root_lhs, vect_lhs);
4508 break;
4509 }
4510 }
4511 else if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) > 1)
4512 {
4513 int nelts = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
4514 gimple *child_stmt;
4515 int j;
4516 vec<constructor_elt, va_gc> *v;
4517 vec_alloc (v, nelts);
4518
4519 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (node), j, child_stmt)
4520 {
4521 CONSTRUCTOR_APPEND_ELT (v,
4522 NULL_TREE,
4523 gimple_get_lhs (child_stmt));
4524 }
4525 tree lhs = gimple_get_lhs (instance->root_stmt->stmt);
4526 tree rtype = TREE_TYPE (gimple_assign_rhs1 (instance->root_stmt->stmt));
4527 tree r_constructor = build_constructor (rtype, v);
4528 rstmt = gimple_build_assign (lhs, r_constructor);
4529 }
4530
4531 gcc_assert (rstmt);
4532
4533 gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmt->stmt);
4534 gsi_replace (&rgsi, rstmt, true);
4535 }
4536
4537 /* Generate vector code for all SLP instances in the loop/basic block. */
4538
4539 void
4540 vect_schedule_slp (vec_info *vinfo)
4541 {
4542 vec<slp_instance> slp_instances;
4543 slp_instance instance;
4544 unsigned int i;
4545
4546 slp_instances = vinfo->slp_instances;
4547 FOR_EACH_VEC_ELT (slp_instances, i, instance)
4548 {
4549 slp_tree node = SLP_INSTANCE_TREE (instance);
4550 /* Schedule the tree of INSTANCE. */
4551 vect_schedule_slp_instance (vinfo, node, instance);
4552
4553 if (SLP_INSTANCE_ROOT_STMT (instance))
4554 vectorize_slp_instance_root_stmt (node, instance);
4555
4556 if (dump_enabled_p ())
4557 dump_printf_loc (MSG_NOTE, vect_location,
4558 "vectorizing stmts using SLP.\n");
4559 }
4560
4561 FOR_EACH_VEC_ELT (slp_instances, i, instance)
4562 {
4563 slp_tree root = SLP_INSTANCE_TREE (instance);
4564 stmt_vec_info store_info;
4565 unsigned int j;
4566
4567 /* Remove scalar call stmts. Do not do this for basic-block
4568 vectorization as not all uses may be vectorized.
4569 ??? Why should this be necessary? DCE should be able to
4570 remove the stmts itself.
4571 ??? For BB vectorization we can as well remove scalar
4572 stmts starting from the SLP tree root if they have no
4573 uses. */
4574 if (is_a <loop_vec_info> (vinfo))
4575 vect_remove_slp_scalar_calls (vinfo, root);
4576
4577 for (j = 0; SLP_TREE_SCALAR_STMTS (root).iterate (j, &store_info); j++)
4578 {
4579 if (!STMT_VINFO_DATA_REF (store_info))
4580 break;
4581
4582 if (SLP_INSTANCE_ROOT_STMT (instance))
4583 continue;
4584
4585 store_info = vect_orig_stmt (store_info);
4586 /* Free the attached stmt_vec_info and remove the stmt. */
4587 vinfo->remove_stmt (store_info);
4588 }
4589 }
4590 }