/* SLP - Basic Block Vectorization
   Copyright (C) 2007-2020 Free Software Foundation, Inc.
   Contributed by Dorit Naishlos <dorit@il.ibm.com>
   and Ira Rosen <irar@il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */

#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "target.h"
#include "rtl.h"
#include "tree.h"
#include "gimple.h"
#include "tree-pass.h"
#include "ssa.h"
#include "optabs-tree.h"
#include "insn-config.h"
#include "recog.h"  /* FIXME: for insn_data */
#include "fold-const.h"
#include "stor-layout.h"
#include "gimple-iterator.h"
#include "cfgloop.h"
#include "tree-vectorizer.h"
#include "langhooks.h"
#include "gimple-walk.h"
#include "dbgcnt.h"
#include "tree-vector-builder.h"
#include "vec-perm-indices.h"
#include "gimple-fold.h"
#include "internal-fn.h"
#include "dump-context.h"


/* Initialize a SLP node.  */

_slp_tree::_slp_tree ()
{
  SLP_TREE_SCALAR_STMTS (this) = vNULL;
  SLP_TREE_SCALAR_OPS (this) = vNULL;
  SLP_TREE_VEC_STMTS (this) = vNULL;
  SLP_TREE_VEC_DEFS (this) = vNULL;
  SLP_TREE_NUMBER_OF_VEC_STMTS (this) = 0;
  SLP_TREE_CHILDREN (this) = vNULL;
  SLP_TREE_LOAD_PERMUTATION (this) = vNULL;
  SLP_TREE_TWO_OPERATORS (this) = false;
  SLP_TREE_DEF_TYPE (this) = vect_uninitialized_def;
  SLP_TREE_VECTYPE (this) = NULL_TREE;
  SLP_TREE_REPRESENTATIVE (this) = NULL;
  this->refcnt = 1;
  this->max_nunits = 1;
}

/* Tear down a SLP node.  */

_slp_tree::~_slp_tree ()
{
  SLP_TREE_CHILDREN (this).release ();
  SLP_TREE_SCALAR_STMTS (this).release ();
  SLP_TREE_SCALAR_OPS (this).release ();
  SLP_TREE_VEC_STMTS (this).release ();
  SLP_TREE_VEC_DEFS (this).release ();
  SLP_TREE_LOAD_PERMUTATION (this).release ();
}

/* Recursively free the memory allocated for the SLP tree rooted at NODE.
   FINAL_P is true if we have vectorized the instance or if we have
   made a final decision not to vectorize the statements in any way.  */

static void
vect_free_slp_tree (slp_tree node, bool final_p)
{
  int i;
  slp_tree child;

  if (--node->refcnt != 0)
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_free_slp_tree (child, final_p);

  /* Don't update STMT_VINFO_NUM_SLP_USES if it isn't relevant.
     Some statements might no longer exist, after having been
     removed by vect_transform_stmt.  Updating the remaining
     statements would be redundant.  */
  if (!final_p)
    {
      stmt_vec_info stmt_info;
      FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
        {
          gcc_assert (STMT_VINFO_NUM_SLP_USES (stmt_info) > 0);
          STMT_VINFO_NUM_SLP_USES (stmt_info)--;
        }
    }

  delete node;
}

/* Free the memory allocated for the SLP instance.  FINAL_P is true if we
   have vectorized the instance or if we have made a final decision not
   to vectorize the statements in any way.  */

void
vect_free_slp_instance (slp_instance instance, bool final_p)
{
  vect_free_slp_tree (SLP_INSTANCE_TREE (instance), final_p);
  SLP_INSTANCE_LOADS (instance).release ();
  free (instance);
}


/* Create an SLP node for SCALAR_STMTS.  */

static slp_tree
vect_create_new_slp_node (vec<stmt_vec_info> scalar_stmts, unsigned nops)
{
  slp_tree node = new _slp_tree;
  SLP_TREE_SCALAR_STMTS (node) = scalar_stmts;
  SLP_TREE_CHILDREN (node).create (nops);
  SLP_TREE_DEF_TYPE (node) = vect_internal_def;
  SLP_TREE_REPRESENTATIVE (node) = scalar_stmts[0];

  unsigned i;
  stmt_vec_info stmt_info;
  FOR_EACH_VEC_ELT (scalar_stmts, i, stmt_info)
    STMT_VINFO_NUM_SLP_USES (stmt_info)++;

  return node;
}

/* Create an SLP node for OPS.  */

static slp_tree
vect_create_new_slp_node (vec<tree> ops)
{
  slp_tree node = new _slp_tree;
  SLP_TREE_SCALAR_OPS (node) = ops;
  SLP_TREE_DEF_TYPE (node) = vect_external_def;
  return node;
}


/* This structure is used in creation of an SLP tree.  Each instance
   corresponds to the same operand in a group of scalar stmts in an SLP
   node.  */
typedef struct _slp_oprnd_info
{
  /* Def-stmts for the operands.  */
  vec<stmt_vec_info> def_stmts;
  /* Operands.  */
  vec<tree> ops;
  /* Information about the first statement, its vector def-type, type, the
     operand itself in case it's constant, and an indication if it's a pattern
     stmt.  */
  tree first_op_type;
  enum vect_def_type first_dt;
  bool any_pattern;
} *slp_oprnd_info;
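
/* For example, for the two-stmt group { a0 = b0 + c0, a1 = b1 + c1 }
   two slp_oprnd_info instances are created: one collecting the defs of
   b0 and b1, the other collecting the defs of c0 and c1.  (Illustrative
   example.)  */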


/* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
   operand.  */
static vec<slp_oprnd_info>
vect_create_oprnd_info (int nops, int group_size)
{
  int i;
  slp_oprnd_info oprnd_info;
  vec<slp_oprnd_info> oprnds_info;

  oprnds_info.create (nops);
  for (i = 0; i < nops; i++)
    {
      oprnd_info = XNEW (struct _slp_oprnd_info);
      oprnd_info->def_stmts.create (group_size);
      oprnd_info->ops.create (group_size);
      oprnd_info->first_dt = vect_uninitialized_def;
      oprnd_info->first_op_type = NULL_TREE;
      oprnd_info->any_pattern = false;
      oprnds_info.quick_push (oprnd_info);
    }

  return oprnds_info;
}


/* Free operands info.  */

static void
vect_free_oprnd_info (vec<slp_oprnd_info> &oprnds_info)
{
  int i;
  slp_oprnd_info oprnd_info;

  FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
    {
      oprnd_info->def_stmts.release ();
      oprnd_info->ops.release ();
      XDELETE (oprnd_info);
    }

  oprnds_info.release ();
}


/* Return true if STMTS contains a pattern statement.  */

static bool
vect_contains_pattern_stmt_p (vec<stmt_vec_info> stmts)
{
  stmt_vec_info stmt_info;
  unsigned int i;
  FOR_EACH_VEC_ELT (stmts, i, stmt_info)
    if (is_pattern_stmt_p (stmt_info))
      return true;
  return false;
}

/* Find the place of the data-ref in STMT_INFO in the interleaving chain
   that starts from FIRST_STMT_INFO.  Return -1 if the data-ref is not a part
   of the chain.  */
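
/* For example, if the chain starting at FIRST_STMT_INFO accesses
   A[0], A[1] and A[3], the place of the A[3] access is 3: the walk
   below sums DR_GROUP_GAP of every element it advances past, which
   accounts for the unused A[2] slot.  (Illustrative example.)  */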

int
vect_get_place_in_interleaving_chain (stmt_vec_info stmt_info,
				      stmt_vec_info first_stmt_info)
{
  stmt_vec_info next_stmt_info = first_stmt_info;
  int result = 0;

  if (first_stmt_info != DR_GROUP_FIRST_ELEMENT (stmt_info))
    return -1;

  do
    {
      if (next_stmt_info == stmt_info)
	return result;
      next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
      if (next_stmt_info)
	result += DR_GROUP_GAP (next_stmt_info);
    }
  while (next_stmt_info);

  return -1;
}

/* Check whether it is possible to load COUNT elements of type ELT_TYPE
   using the method implemented by duplicate_and_interleave.  Return true
   if so, returning the number of intermediate vectors in *NVECTORS_OUT
   (if nonnull) and the type of each intermediate vector in *VECTOR_TYPE_OUT
   (if nonnull).  */
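
/* For example, with COUNT == 4 and a 16-bit ELT_TYPE on a target with
   128-bit vectors, the search below first tries fusing all four elements
   into one 64-bit integer element (NVECTORS == 1), then pairs of elements
   into 32-bit integer elements (NVECTORS == 2), and so on, stopping at
   the first candidate for which both interleaving permutes are supported.
   (Illustrative sketch; the modes actually tried depend on the target.)  */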

bool
can_duplicate_and_interleave_p (vec_info *vinfo, unsigned int count,
				tree elt_type, unsigned int *nvectors_out,
				tree *vector_type_out,
				tree *permutes)
{
  tree base_vector_type = get_vectype_for_scalar_type (vinfo, elt_type, count);
  if (!base_vector_type || !VECTOR_MODE_P (TYPE_MODE (base_vector_type)))
    return false;

  machine_mode base_vector_mode = TYPE_MODE (base_vector_type);
  poly_int64 elt_bytes = count * GET_MODE_UNIT_SIZE (base_vector_mode);
  unsigned int nvectors = 1;
  for (;;)
    {
      scalar_int_mode int_mode;
      poly_int64 elt_bits = elt_bytes * BITS_PER_UNIT;
      if (int_mode_for_size (elt_bits, 1).exists (&int_mode))
	{
	  /* Get the natural vector type for this SLP group size.  */
	  tree int_type = build_nonstandard_integer_type
	    (GET_MODE_BITSIZE (int_mode), 1);
	  tree vector_type
	    = get_vectype_for_scalar_type (vinfo, int_type, count);
	  if (vector_type
	      && VECTOR_MODE_P (TYPE_MODE (vector_type))
	      && known_eq (GET_MODE_SIZE (TYPE_MODE (vector_type)),
			   GET_MODE_SIZE (base_vector_mode)))
	    {
	      /* Try fusing consecutive sequences of COUNT / NVECTORS elements
		 together into elements of type INT_TYPE and using the result
		 to build NVECTORS vectors.  */
	      poly_uint64 nelts = GET_MODE_NUNITS (TYPE_MODE (vector_type));
	      vec_perm_builder sel1 (nelts, 2, 3);
	      vec_perm_builder sel2 (nelts, 2, 3);
	      poly_int64 half_nelts = exact_div (nelts, 2);
	      for (unsigned int i = 0; i < 3; ++i)
		{
		  sel1.quick_push (i);
		  sel1.quick_push (i + nelts);
		  sel2.quick_push (half_nelts + i);
		  sel2.quick_push (half_nelts + i + nelts);
		}
	      vec_perm_indices indices1 (sel1, 2, nelts);
	      vec_perm_indices indices2 (sel2, 2, nelts);
	      if (can_vec_perm_const_p (TYPE_MODE (vector_type), indices1)
		  && can_vec_perm_const_p (TYPE_MODE (vector_type), indices2))
		{
		  if (nvectors_out)
		    *nvectors_out = nvectors;
		  if (vector_type_out)
		    *vector_type_out = vector_type;
		  if (permutes)
		    {
		      permutes[0] = vect_gen_perm_mask_checked (vector_type,
								indices1);
		      permutes[1] = vect_gen_perm_mask_checked (vector_type,
								indices2);
		    }
		  return true;
		}
	    }
	}
      if (!multiple_p (elt_bytes, 2, &elt_bytes))
	return false;
      nvectors *= 2;
    }
}

/* Get the defs for the rhs of STMT (collect them in OPRNDS_INFO), check that
   they are of a valid type and that they match the defs of the first stmt of
   the SLP group (stored in OPRNDS_INFO).  This function tries to match stmts
   by swapping operands of STMTS[STMT_NUM] when possible.  A non-zero *SWAP
   indicates that a swap is required for cond_expr stmts.  Specifically, *SWAP
   is 1 if STMT is a cond and the operands of the comparison need to be
   swapped; *SWAP is 2 if STMT is a cond and the code of the comparison needs
   to be inverted.  If any operands are swapped in this function, *SWAP is
   set to a non-zero value.
   If there was a fatal error return -1; if the error could be corrected by
   swapping operands of the parent node of this one, return 1; if everything
   is OK return 0.  */
static int
vect_get_and_check_slp_defs (vec_info *vinfo, unsigned char *swap,
			     vec<stmt_vec_info> stmts, unsigned stmt_num,
			     vec<slp_oprnd_info> *oprnds_info)
{
  stmt_vec_info stmt_info = stmts[stmt_num];
  tree oprnd;
  unsigned int i, number_of_oprnds;
  enum vect_def_type dt = vect_uninitialized_def;
  slp_oprnd_info oprnd_info;
  int first_op_idx = 1;
  unsigned int commutative_op = -1U;
  bool first_op_cond = false;
  bool first = stmt_num == 0;

  if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
    {
      number_of_oprnds = gimple_call_num_args (stmt);
      first_op_idx = 3;
      if (gimple_call_internal_p (stmt))
	{
	  internal_fn ifn = gimple_call_internal_fn (stmt);
	  commutative_op = first_commutative_argument (ifn);

	  /* Masked load, only look at mask.  */
	  if (ifn == IFN_MASK_LOAD)
	    {
	      number_of_oprnds = 1;
	      /* Mask operand index.  */
	      first_op_idx = 5;
	    }
	}
    }
  else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
    {
      enum tree_code code = gimple_assign_rhs_code (stmt);
      number_of_oprnds = gimple_num_ops (stmt) - 1;
      /* Swapping can only be done for cond_expr if asked to, otherwise we
	 could end up with a different comparison code from the first stmt.  */
      if (code == COND_EXPR
	  && COMPARISON_CLASS_P (gimple_assign_rhs1 (stmt)))
	{
	  first_op_cond = true;
	  number_of_oprnds++;
	}
      else
	commutative_op = commutative_tree_code (code) ? 0U : -1U;
    }
  else
    return -1;

  bool swapped = (*swap != 0);
  gcc_assert (!swapped || first_op_cond);
  for (i = 0; i < number_of_oprnds; i++)
    {
    again:
      if (first_op_cond)
	{
	  /* Map indicating how operands of cond_expr should be swapped.  */
	  int maps[3][4] = {{0, 1, 2, 3}, {1, 0, 2, 3}, {0, 1, 3, 2}};
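	  /* maps[0] leaves the operands in place, maps[1] swaps the two
	     comparison operands (*SWAP == 1) and maps[2] swaps the then/else
	     arms of the COND_EXPR (*SWAP == 2, inverted comparison).  */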
	  int *map = maps[*swap];

	  if (i < 2)
	    oprnd = TREE_OPERAND (gimple_op (stmt_info->stmt,
					     first_op_idx), map[i]);
	  else
	    oprnd = gimple_op (stmt_info->stmt, map[i]);
	}
      else
	oprnd = gimple_op (stmt_info->stmt, first_op_idx + (swapped ? !i : i));
      if (TREE_CODE (oprnd) == VIEW_CONVERT_EXPR)
	oprnd = TREE_OPERAND (oprnd, 0);

      oprnd_info = (*oprnds_info)[i];

      stmt_vec_info def_stmt_info;
      if (!vect_is_simple_use (oprnd, vinfo, &dt, &def_stmt_info))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "Build SLP failed: can't analyze def for %T\n",
			     oprnd);

	  return -1;
	}

      if (def_stmt_info && is_pattern_stmt_p (def_stmt_info))
	oprnd_info->any_pattern = true;

      if (first)
	{
	  /* For the swapping logic below force vect_reduction_def
	     for the reduction op in a SLP reduction group.  */
	  if (!STMT_VINFO_DATA_REF (stmt_info)
	      && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
	      && (int)i == STMT_VINFO_REDUC_IDX (stmt_info)
	      && def_stmt_info)
	    dt = vect_reduction_def;
	  oprnd_info->first_dt = dt;
	  oprnd_info->first_op_type = TREE_TYPE (oprnd);
	}
      else
	{
	  /* Not the first stmt of the group, check that the def-stmt/s match
	     the def-stmt/s of the first stmt.  Allow different definition
	     types for reduction chains: the first stmt must be a
	     vect_reduction_def (a phi node), and the rest
	     end up in the reduction chain.  */
	  tree type = TREE_TYPE (oprnd);
	  if ((oprnd_info->first_dt != dt
	       && !(oprnd_info->first_dt == vect_reduction_def
		    && !STMT_VINFO_DATA_REF (stmt_info)
		    && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
		    && def_stmt_info
		    && !STMT_VINFO_DATA_REF (def_stmt_info)
		    && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
			== REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
	       && !((oprnd_info->first_dt == vect_external_def
		     || oprnd_info->first_dt == vect_constant_def)
		    && (dt == vect_external_def
			|| dt == vect_constant_def)))
	      || !types_compatible_p (oprnd_info->first_op_type, type)
	      || (!STMT_VINFO_DATA_REF (stmt_info)
		  && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
		  && ((!def_stmt_info
		       || STMT_VINFO_DATA_REF (def_stmt_info)
		       || (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
			   != REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
		      != (oprnd_info->first_dt != vect_reduction_def))))
	    {
	      /* Try swapping operands if we got a mismatch.  */
	      if (i == commutative_op && !swapped)
		{
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_NOTE, vect_location,
				     "trying swapped operands\n");
		  swapped = true;
		  goto again;
		}

	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: different types\n");

	      return 1;
	    }
	  if ((dt == vect_constant_def
	       || dt == vect_external_def)
	      && !GET_MODE_SIZE (vinfo->vector_mode).is_constant ()
	      && (TREE_CODE (type) == BOOLEAN_TYPE
		  || !can_duplicate_and_interleave_p (vinfo, stmts.length (),
						      type)))
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: invalid type of def "
				 "for variable-length SLP %T\n", oprnd);
	      return -1;
	    }
	}

      /* Check the types of the definitions.  */
      switch (dt)
	{
	case vect_external_def:
	  /* Make sure to demote the overall operand to external.  */
	  oprnd_info->first_dt = vect_external_def;
	  /* Fallthru.  */
	case vect_constant_def:
	  oprnd_info->def_stmts.quick_push (NULL);
	  oprnd_info->ops.quick_push (oprnd);
	  break;

	case vect_internal_def:
	case vect_reduction_def:
	  if (oprnd_info->first_dt == vect_reduction_def
	      && !STMT_VINFO_DATA_REF (stmt_info)
	      && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
	      && !STMT_VINFO_DATA_REF (def_stmt_info)
	      && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
		  == REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
	    {
	      /* For a SLP reduction chain we want to duplicate the
		 reduction to each of the chain members.  That gets
		 us a sane SLP graph (still the stmts are not 100%
		 correct wrt the initial values).  */
	      gcc_assert (!first);
	      oprnd_info->def_stmts.quick_push (oprnd_info->def_stmts[0]);
	      oprnd_info->ops.quick_push (oprnd_info->ops[0]);
	      break;
	    }
	  /* Fallthru.  */
	case vect_induction_def:
	  oprnd_info->def_stmts.quick_push (def_stmt_info);
	  oprnd_info->ops.quick_push (oprnd);
	  break;

	default:
	  /* FORNOW: Not supported.  */
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "Build SLP failed: illegal type of def %T\n",
			     oprnd);

	  return -1;
	}
    }

  /* Swap operands.  */
  if (swapped)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "swapped operands to match def types in %G",
			 stmt_info->stmt);
    }

  *swap = swapped;
  return 0;
}

/* Try to assign vector type VECTYPE to STMT_INFO for BB vectorization.
   Return true if we can, meaning that this choice doesn't conflict with
   existing SLP nodes that use STMT_INFO.  */

static bool
vect_update_shared_vectype (stmt_vec_info stmt_info, tree vectype)
{
  tree old_vectype = STMT_VINFO_VECTYPE (stmt_info);
  if (old_vectype && useless_type_conversion_p (vectype, old_vectype))
    return true;

  if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
      && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
    {
      /* We maintain the invariant that if any statement in the group is
	 used, all other members of the group have the same vector type.  */
      stmt_vec_info first_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
      stmt_vec_info member_info = first_info;
      for (; member_info; member_info = DR_GROUP_NEXT_ELEMENT (member_info))
	if (STMT_VINFO_NUM_SLP_USES (member_info) > 0
	    || is_pattern_stmt_p (member_info))
	  break;

      if (!member_info)
	{
	  for (member_info = first_info; member_info;
	       member_info = DR_GROUP_NEXT_ELEMENT (member_info))
	    STMT_VINFO_VECTYPE (member_info) = vectype;
	  return true;
	}
    }
  else if (STMT_VINFO_NUM_SLP_USES (stmt_info) == 0
	   && !is_pattern_stmt_p (stmt_info))
    {
      STMT_VINFO_VECTYPE (stmt_info) = vectype;
      return true;
    }

  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
		       "Build SLP failed: incompatible vector"
		       " types for: %G", stmt_info->stmt);
      dump_printf_loc (MSG_NOTE, vect_location,
		       "    old vector type: %T\n", old_vectype);
      dump_printf_loc (MSG_NOTE, vect_location,
		       "    new vector type: %T\n", vectype);
    }
  return false;
}

/* Try to infer and assign a vector type to all the statements in STMTS.
   Used only for BB vectorization.  */

static bool
vect_update_all_shared_vectypes (vec_info *vinfo, vec<stmt_vec_info> stmts)
{
  tree vectype, nunits_vectype;
  if (!vect_get_vector_types_for_stmt (vinfo, stmts[0], &vectype,
				       &nunits_vectype, stmts.length ()))
    return false;

  stmt_vec_info stmt_info;
  unsigned int i;
  FOR_EACH_VEC_ELT (stmts, i, stmt_info)
    if (!vect_update_shared_vectype (stmt_info, vectype))
      return false;

  return true;
}

/* Return true if call statements CALL1 and CALL2 are similar enough
   to be combined into the same SLP group.  */

static bool
compatible_calls_p (gcall *call1, gcall *call2)
{
  unsigned int nargs = gimple_call_num_args (call1);
  if (nargs != gimple_call_num_args (call2))
    return false;

  if (gimple_call_combined_fn (call1) != gimple_call_combined_fn (call2))
    return false;

  if (gimple_call_internal_p (call1))
    {
      if (!types_compatible_p (TREE_TYPE (gimple_call_lhs (call1)),
			       TREE_TYPE (gimple_call_lhs (call2))))
	return false;
      for (unsigned int i = 0; i < nargs; ++i)
	if (!types_compatible_p (TREE_TYPE (gimple_call_arg (call1, i)),
				 TREE_TYPE (gimple_call_arg (call2, i))))
	  return false;
    }
  else
    {
      if (!operand_equal_p (gimple_call_fn (call1),
			    gimple_call_fn (call2), 0))
	return false;

      if (gimple_call_fntype (call1) != gimple_call_fntype (call2))
	return false;
    }
  return true;
}

/* A subroutine of vect_build_slp_tree for checking VECTYPE, which is the
   caller's attempt to find the vector type in STMT_INFO with the narrowest
   element type.  Return true if VECTYPE is nonnull and if it is valid
   for STMT_INFO.  When returning true, update MAX_NUNITS to reflect the
   number of units in VECTYPE.  GROUP_SIZE and MAX_NUNITS are as for
   vect_build_slp_tree.  */

static bool
vect_record_max_nunits (vec_info *vinfo, stmt_vec_info stmt_info,
			unsigned int group_size,
			tree vectype, poly_uint64 *max_nunits)
{
  if (!vectype)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "Build SLP failed: unsupported data-type in %G\n",
			 stmt_info->stmt);
      /* Fatal mismatch.  */
      return false;
    }

  /* If populating the vector type requires unrolling then fail
     before adjusting *max_nunits for basic-block vectorization.  */
  poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
  unsigned HOST_WIDE_INT const_nunits;
  if (is_a <bb_vec_info> (vinfo)
      && (!nunits.is_constant (&const_nunits)
	  || const_nunits > group_size))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "Build SLP failed: unrolling required "
			 "in basic block SLP\n");
      /* Fatal mismatch.  */
      return false;
    }

  /* In case of multiple types we need to detect the smallest type.  */
  vect_update_max_nunits (max_nunits, vectype);
  return true;
}

/* STMTS is a group of GROUP_SIZE SLP statements in which some
   statements do the same operation as the first statement and in which
   the others do ALT_STMT_CODE.  Return true if we can take one vector
   of the first operation and one vector of the second and permute them
   to get the required result.  VECTYPE is the type of the vector that
   would be permuted.  */
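
/* For example, for the group { a0 + b0, a1 - b1 } with ALT_STMT_CODE
   MINUS_EXPR, the permutation built below takes the even elements from
   the vector computed with the first operation and the odd elements
   from the vector computed with ALT_STMT_CODE, i.e. the addsub-style
   blend { 0, count + 1, 2, count + 3, ... }.  (Illustrative example.)  */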

static bool
vect_two_operations_perm_ok_p (vec<stmt_vec_info> stmts,
			       unsigned int group_size, tree vectype,
			       tree_code alt_stmt_code)
{
  unsigned HOST_WIDE_INT count;
  if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&count))
    return false;

  vec_perm_builder sel (count, count, 1);
  for (unsigned int i = 0; i < count; ++i)
    {
      unsigned int elt = i;
      gassign *stmt = as_a <gassign *> (stmts[i % group_size]->stmt);
      if (gimple_assign_rhs_code (stmt) == alt_stmt_code)
	elt += count;
      sel.quick_push (elt);
    }
  vec_perm_indices indices (sel, 2, count);
  return can_vec_perm_const_p (TYPE_MODE (vectype), indices);
}

/* Verify if the scalar stmts STMTS are isomorphic, require data
   permutation or are of unsupported types of operation.  Return
   true if they are isomorphic, otherwise return false and indicate
   in *MATCHES which stmts are not isomorphic to the first one.  If
   MATCHES[0] is false then this indicates the comparison could not be
   carried out or the stmts will never be vectorized by SLP.

   Note COND_EXPR is possibly isomorphic to another one after swapping its
   operands.  Set SWAP[i] to 1 if stmt I is COND_EXPR and isomorphic to
   the first stmt by swapping the two operands of comparison; set SWAP[i]
   to 2 if stmt I is isomorphic to the first stmt by inverting the code
   of comparison.  Take A1 >= B1 ? X1 : Y1 as an example: it can be swapped
   to (B1 <= A1 ? X1 : Y1); or be inverted to (A1 < B1) ? Y1 : X1.  */

static bool
vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap,
		       vec<stmt_vec_info> stmts, unsigned int group_size,
		       poly_uint64 *max_nunits, bool *matches,
		       bool *two_operators)
{
  unsigned int i;
  stmt_vec_info first_stmt_info = stmts[0];
  enum tree_code first_stmt_code = ERROR_MARK;
  enum tree_code alt_stmt_code = ERROR_MARK;
  enum tree_code rhs_code = ERROR_MARK;
  enum tree_code first_cond_code = ERROR_MARK;
  tree lhs;
  bool need_same_oprnds = false;
  tree vectype = NULL_TREE, first_op1 = NULL_TREE;
  optab optab;
  int icode;
  machine_mode optab_op2_mode;
  machine_mode vec_mode;
  stmt_vec_info first_load = NULL, prev_first_load = NULL;
  bool load_p = false;

  /* For every stmt in NODE find its def stmt/s.  */
  stmt_vec_info stmt_info;
  FOR_EACH_VEC_ELT (stmts, i, stmt_info)
    {
      gimple *stmt = stmt_info->stmt;
      swap[i] = 0;
      matches[i] = false;

      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location, "Build SLP for %G", stmt);

      /* Fail to vectorize statements marked as unvectorizable.  */
      if (!STMT_VINFO_VECTORIZABLE (stmt_info))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "Build SLP failed: unvectorizable statement %G",
			     stmt);
	  /* Fatal mismatch.  */
	  matches[0] = false;
	  return false;
	}

      lhs = gimple_get_lhs (stmt);
      if (lhs == NULL_TREE)
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "Build SLP failed: not GIMPLE_ASSIGN nor "
			     "GIMPLE_CALL %G", stmt);
	  /* Fatal mismatch.  */
	  matches[0] = false;
	  return false;
	}

      tree nunits_vectype;
      if (!vect_get_vector_types_for_stmt (vinfo, stmt_info, &vectype,
					   &nunits_vectype, group_size)
	  || (nunits_vectype
	      && !vect_record_max_nunits (vinfo, stmt_info, group_size,
					  nunits_vectype, max_nunits)))
	{
	  /* Fatal mismatch.  */
	  matches[0] = false;
	  return false;
	}

      gcc_assert (vectype);

      if (is_a <bb_vec_info> (vinfo)
	  && !vect_update_shared_vectype (stmt_info, vectype))
	continue;

      gcall *call_stmt = dyn_cast <gcall *> (stmt);
      if (call_stmt)
	{
	  rhs_code = CALL_EXPR;

	  if (gimple_call_internal_p (stmt, IFN_MASK_LOAD))
	    load_p = true;
	  else if ((gimple_call_internal_p (call_stmt)
		    && (!vectorizable_internal_fn_p
			(gimple_call_internal_fn (call_stmt))))
		   || gimple_call_tail_p (call_stmt)
		   || gimple_call_noreturn_p (call_stmt)
		   || !gimple_call_nothrow_p (call_stmt)
		   || gimple_call_chain (call_stmt))
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: unsupported call type %G",
				 call_stmt);
	      /* Fatal mismatch.  */
	      matches[0] = false;
	      return false;
	    }
	}
      else
	{
	  rhs_code = gimple_assign_rhs_code (stmt);
	  load_p = TREE_CODE_CLASS (rhs_code) == tcc_reference;
	}

      /* Check the operation.  */
      if (i == 0)
	{
	  first_stmt_code = rhs_code;

	  /* Shift arguments should be equal in all the packed stmts for a
	     vector shift with scalar shift operand.  */
	  if (rhs_code == LSHIFT_EXPR || rhs_code == RSHIFT_EXPR
	      || rhs_code == LROTATE_EXPR
	      || rhs_code == RROTATE_EXPR)
	    {
	      vec_mode = TYPE_MODE (vectype);

	      /* First see if we have a vector/vector shift.  */
	      optab = optab_for_tree_code (rhs_code, vectype,
					   optab_vector);

	      if (!optab
		  || optab_handler (optab, vec_mode) == CODE_FOR_nothing)
		{
		  /* No vector/vector shift, try for a vector/scalar shift.  */
		  optab = optab_for_tree_code (rhs_code, vectype,
					       optab_scalar);

		  if (!optab)
		    {
		      if (dump_enabled_p ())
			dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
					 "Build SLP failed: no optab.\n");
		      /* Fatal mismatch.  */
		      matches[0] = false;
		      return false;
		    }
		  icode = (int) optab_handler (optab, vec_mode);
		  if (icode == CODE_FOR_nothing)
		    {
		      if (dump_enabled_p ())
			dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
					 "Build SLP failed: "
					 "op not supported by target.\n");
		      /* Fatal mismatch.  */
		      matches[0] = false;
		      return false;
		    }
		  optab_op2_mode = insn_data[icode].operand[2].mode;
		  if (!VECTOR_MODE_P (optab_op2_mode))
		    {
		      need_same_oprnds = true;
		      first_op1 = gimple_assign_rhs2 (stmt);
		    }
		}
	    }
	  else if (rhs_code == WIDEN_LSHIFT_EXPR)
	    {
	      need_same_oprnds = true;
	      first_op1 = gimple_assign_rhs2 (stmt);
	    }
	  else if (call_stmt
		   && gimple_call_internal_p (call_stmt, IFN_DIV_POW2))
	    {
	      need_same_oprnds = true;
	      first_op1 = gimple_call_arg (call_stmt, 1);
	    }
	}
      else
	{
	  if (first_stmt_code != rhs_code
	      && alt_stmt_code == ERROR_MARK)
	    alt_stmt_code = rhs_code;
	  if (first_stmt_code != rhs_code
	      && (first_stmt_code != IMAGPART_EXPR
		  || rhs_code != REALPART_EXPR)
	      && (first_stmt_code != REALPART_EXPR
		  || rhs_code != IMAGPART_EXPR)
	      /* Handle mismatches in plus/minus by computing both
		 and merging the results.  */
	      && !((first_stmt_code == PLUS_EXPR
		    || first_stmt_code == MINUS_EXPR)
		   && (alt_stmt_code == PLUS_EXPR
		       || alt_stmt_code == MINUS_EXPR)
		   && rhs_code == alt_stmt_code)
	      && !(STMT_VINFO_GROUPED_ACCESS (stmt_info)
		   && (first_stmt_code == ARRAY_REF
		       || first_stmt_code == BIT_FIELD_REF
		       || first_stmt_code == INDIRECT_REF
		       || first_stmt_code == COMPONENT_REF
		       || first_stmt_code == MEM_REF)))
	    {
	      if (dump_enabled_p ())
		{
		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				   "Build SLP failed: different operation "
				   "in stmt %G", stmt);
		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				   "original stmt %G", first_stmt_info->stmt);
		}
	      /* Mismatch.  */
	      continue;
	    }

	  if (need_same_oprnds)
	    {
	      tree other_op1 = (call_stmt
				? gimple_call_arg (call_stmt, 1)
				: gimple_assign_rhs2 (stmt));
	      if (!operand_equal_p (first_op1, other_op1, 0))
		{
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				     "Build SLP failed: different shift "
				     "arguments in %G", stmt);
		  /* Mismatch.  */
		  continue;
		}
	    }

	  if (!load_p && rhs_code == CALL_EXPR)
	    {
	      if (!compatible_calls_p (as_a <gcall *> (stmts[0]->stmt),
				       as_a <gcall *> (stmt)))
		{
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				     "Build SLP failed: different calls in %G",
				     stmt);
		  /* Mismatch.  */
		  continue;
		}
	    }
	}

      /* Grouped store or load.  */
      if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
	{
	  if (REFERENCE_CLASS_P (lhs))
	    {
	      /* Store.  */
	      ;
	    }
	  else
	    {
	      /* Load.  */
	      first_load = DR_GROUP_FIRST_ELEMENT (stmt_info);
	      if (prev_first_load)
		{
		  /* Check that there are no loads from different interleaving
		     chains in the same node.  */
		  if (prev_first_load != first_load)
		    {
		      if (dump_enabled_p ())
			dump_printf_loc (MSG_MISSED_OPTIMIZATION,
					 vect_location,
					 "Build SLP failed: different "
					 "interleaving chains in one node %G",
					 stmt);
		      /* Mismatch.  */
		      continue;
		    }
		}
	      else
		prev_first_load = first_load;
	    }
	} /* Grouped access.  */
      else
	{
	  if (load_p)
	    {
	      /* Not grouped load.  */
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: not grouped load %G", stmt);

	      /* FORNOW: Non-grouped loads are not supported.  */
	      /* Fatal mismatch.  */
	      matches[0] = false;
	      return false;
	    }

	  /* Not memory operation.  */
	  if (TREE_CODE_CLASS (rhs_code) != tcc_binary
	      && TREE_CODE_CLASS (rhs_code) != tcc_unary
	      && TREE_CODE_CLASS (rhs_code) != tcc_expression
	      && TREE_CODE_CLASS (rhs_code) != tcc_comparison
	      && rhs_code != VIEW_CONVERT_EXPR
	      && rhs_code != CALL_EXPR)
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: operation unsupported %G",
				 stmt);
	      /* Fatal mismatch.  */
	      matches[0] = false;
	      return false;
	    }

	  if (rhs_code == COND_EXPR)
	    {
	      tree cond_expr = gimple_assign_rhs1 (stmt);
	      enum tree_code cond_code = TREE_CODE (cond_expr);
	      enum tree_code swap_code = ERROR_MARK;
	      enum tree_code invert_code = ERROR_MARK;

	      if (i == 0)
		first_cond_code = TREE_CODE (cond_expr);
	      else if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
		{
		  bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
		  swap_code = swap_tree_comparison (cond_code);
		  invert_code = invert_tree_comparison (cond_code, honor_nans);
		}

	      if (first_cond_code == cond_code)
		;
	      /* Isomorphic can be achieved by swapping.  */
	      else if (first_cond_code == swap_code)
		swap[i] = 1;
	      /* Isomorphic can be achieved by inverting.  */
	      else if (first_cond_code == invert_code)
		swap[i] = 2;
	      else
		{
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				     "Build SLP failed: different"
				     " operation %G", stmt);
		  /* Mismatch.  */
		  continue;
		}
	    }
	}

      matches[i] = true;
    }

  for (i = 0; i < group_size; ++i)
    if (!matches[i])
      return false;

  /* If we allowed a two-operation SLP node verify the target can cope
     with the permute we are going to use.  */
  if (alt_stmt_code != ERROR_MARK
      && TREE_CODE_CLASS (alt_stmt_code) != tcc_reference)
    {
      if (!vect_two_operations_perm_ok_p (stmts, group_size,
					  vectype, alt_stmt_code))
	{
	  for (i = 0; i < group_size; ++i)
	    if (gimple_assign_rhs_code (stmts[i]->stmt) == alt_stmt_code)
	      {
		matches[i] = false;
		if (dump_enabled_p ())
		  {
		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				     "Build SLP failed: different operation "
				     "in stmt %G", stmts[i]->stmt);
		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				     "original stmt %G", first_stmt_info->stmt);
		  }
	      }
	  return false;
	}
      *two_operators = true;
    }

  return true;
}

/* Traits for the hash_set to record failed SLP builds for a stmt set.
   Note we never remove apart from at destruction time so we do not
   need a special value for deleted that differs from empty.  */
struct bst_traits
{
  typedef vec <stmt_vec_info> value_type;
  typedef vec <stmt_vec_info> compare_type;
  static inline hashval_t hash (value_type);
  static inline bool equal (value_type existing, value_type candidate);
  static inline bool is_empty (value_type x) { return !x.exists (); }
  static inline bool is_deleted (value_type x) { return !x.exists (); }
  static const bool empty_zero_p = true;
  static inline void mark_empty (value_type &x) { x.release (); }
  static inline void mark_deleted (value_type &x) { x.release (); }
  static inline void remove (value_type &x) { x.release (); }
};
inline hashval_t
bst_traits::hash (value_type x)
{
  inchash::hash h;
  for (unsigned i = 0; i < x.length (); ++i)
    h.add_int (gimple_uid (x[i]->stmt));
  return h.end ();
}
inline bool
bst_traits::equal (value_type existing, value_type candidate)
{
  if (existing.length () != candidate.length ())
    return false;
  for (unsigned i = 0; i < existing.length (); ++i)
    if (existing[i] != candidate[i])
      return false;
  return true;
}

typedef hash_map <vec <gimple *>, slp_tree,
		  simple_hashmap_traits <bst_traits, slp_tree> >
  scalar_stmts_to_slp_tree_map_t;
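
/* The map above memoizes SLP build attempts: a vector of scalar stmts is
   mapped to the SLP node built for it, or to NULL if building failed, so
   a later attempt to build a node for the same stmt set re-uses the
   earlier result instead of re-analyzing the stmts.  */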

static slp_tree
vect_build_slp_tree_2 (vec_info *vinfo,
		       vec<stmt_vec_info> stmts, unsigned int group_size,
		       poly_uint64 *max_nunits,
		       bool *matches, unsigned *npermutes, unsigned *tree_size,
		       scalar_stmts_to_slp_tree_map_t *bst_map);

static slp_tree
vect_build_slp_tree (vec_info *vinfo,
		     vec<stmt_vec_info> stmts, unsigned int group_size,
		     poly_uint64 *max_nunits,
		     bool *matches, unsigned *npermutes, unsigned *tree_size,
		     scalar_stmts_to_slp_tree_map_t *bst_map)
{
  if (slp_tree *leader = bst_map->get (stmts))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location, "re-using %sSLP tree %p\n",
			 *leader ? "" : "failed ", *leader);
      if (*leader)
	{
	  (*leader)->refcnt++;
	  vect_update_max_nunits (max_nunits, (*leader)->max_nunits);
	}
      return *leader;
    }
  poly_uint64 this_max_nunits = 1;
  slp_tree res = vect_build_slp_tree_2 (vinfo, stmts, group_size,
					&this_max_nunits,
					matches, npermutes, tree_size, bst_map);
  if (res)
    {
      res->max_nunits = this_max_nunits;
      vect_update_max_nunits (max_nunits, this_max_nunits);
      /* Keep a reference for the bst_map use.  */
      res->refcnt++;
    }
  bst_map->put (stmts.copy (), res);
  return res;
}

/* Recursively build an SLP tree for the group of stmts STMTS.
   Fail (and return NULL) if the def-stmts are not isomorphic, require
   data permutation or are of unsupported types of operation.  Otherwise
   return the SLP tree built for the group.  */

static slp_tree
vect_build_slp_tree_2 (vec_info *vinfo,
		       vec<stmt_vec_info> stmts, unsigned int group_size,
		       poly_uint64 *max_nunits,
		       bool *matches, unsigned *npermutes, unsigned *tree_size,
		       scalar_stmts_to_slp_tree_map_t *bst_map)
{
  unsigned nops, i, this_tree_size = 0;
  poly_uint64 this_max_nunits = *max_nunits;
  slp_tree node;

  matches[0] = false;

  stmt_vec_info stmt_info = stmts[0];
  if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
    nops = gimple_call_num_args (stmt);
  else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
    {
      nops = gimple_num_ops (stmt) - 1;
      if (gimple_assign_rhs_code (stmt) == COND_EXPR)
	nops++;
    }
  else if (is_a <gphi *> (stmt_info->stmt))
    nops = 0;
  else
    return NULL;

  /* If the SLP node is a PHI (induction or reduction), terminate
     the recursion.  */
  if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
    {
      tree scalar_type = TREE_TYPE (PHI_RESULT (stmt));
      tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type);
      if (!vect_record_max_nunits (vinfo, stmt_info, group_size, vectype,
				   max_nunits))
	return NULL;

      vect_def_type def_type = STMT_VINFO_DEF_TYPE (stmt_info);
      /* Induction from different IVs is not supported.  */
      if (def_type == vect_induction_def)
	{
	  stmt_vec_info other_info;
	  FOR_EACH_VEC_ELT (stmts, i, other_info)
	    if (stmt_info != other_info)
	      return NULL;
	}
      else if (def_type == vect_reduction_def
	       || def_type == vect_double_reduction_def
	       || def_type == vect_nested_cycle)
	{
	  /* Else def types have to match.  */
	  stmt_vec_info other_info;
	  FOR_EACH_VEC_ELT (stmts, i, other_info)
	    if (STMT_VINFO_DEF_TYPE (other_info) != def_type)
	      return NULL;
	}
      else
	return NULL;
      (*tree_size)++;
      node = vect_create_new_slp_node (stmts, 0);
      return node;
    }


  bool two_operators = false;
  unsigned char *swap = XALLOCAVEC (unsigned char, group_size);
  if (!vect_build_slp_tree_1 (vinfo, swap, stmts, group_size,
			      &this_max_nunits, matches, &two_operators))
    return NULL;

  /* If the SLP node is a load, terminate the recursion unless masked.  */
  if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
      && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
    {
      if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
	{
	  /* Masked load.  */
	  gcc_assert (gimple_call_internal_p (stmt, IFN_MASK_LOAD));
	  nops = 1;
	}
      else
	{
	  *max_nunits = this_max_nunits;
	  (*tree_size)++;
	  node = vect_create_new_slp_node (stmts, 0);
	  /* And compute the load permutation.  Whether it is actually
	     a permutation depends on the unrolling factor which is
	     decided later.  */
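	  /* E.g. if the group starting at A[0] loads A[1] and A[0] in
	     that order, the permutation recorded here is { 1, 0 }.
	     (Illustrative example.)  */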
	  vec<unsigned> load_permutation;
	  int j;
	  stmt_vec_info load_info;
	  load_permutation.create (group_size);
	  stmt_vec_info first_stmt_info
	    = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
	  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
	    {
	      int load_place = vect_get_place_in_interleaving_chain
		(load_info, first_stmt_info);
	      gcc_assert (load_place != -1);
	      load_permutation.safe_push (load_place);
	    }
	  SLP_TREE_LOAD_PERMUTATION (node) = load_permutation;
	  return node;
	}
    }

  /* Get at the operands, verifying they are compatible.  */
  vec<slp_oprnd_info> oprnds_info = vect_create_oprnd_info (nops, group_size);
  slp_oprnd_info oprnd_info;
  FOR_EACH_VEC_ELT (stmts, i, stmt_info)
    {
      int res = vect_get_and_check_slp_defs (vinfo, &swap[i],
					     stmts, i, &oprnds_info);
      if (res != 0)
	matches[(res == -1) ? 0 : i] = false;
      if (!matches[0])
	break;
    }
  for (i = 0; i < group_size; ++i)
    if (!matches[i])
      {
	vect_free_oprnd_info (oprnds_info);
	return NULL;
      }

  auto_vec<slp_tree, 4> children;

  stmt_info = stmts[0];

  /* Create SLP_TREE nodes for the definition node/s.  */
  FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
    {
      slp_tree child;
      unsigned old_tree_size = this_tree_size;
      unsigned int j;

      if (oprnd_info->first_dt == vect_uninitialized_def)
	{
	  /* A COND_EXPR has one operand too many here if its condition
	     is an SSA name.  */
	  gcc_assert (i == 3 && nops == 4);
	  continue;
	}

      if (oprnd_info->first_dt != vect_internal_def
	  && oprnd_info->first_dt != vect_reduction_def
	  && oprnd_info->first_dt != vect_induction_def)
	{
	  slp_tree invnode = vect_create_new_slp_node (oprnd_info->ops);
	  SLP_TREE_DEF_TYPE (invnode) = oprnd_info->first_dt;
	  oprnd_info->ops = vNULL;
	  children.safe_push (invnode);
	  continue;
	}

      if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
					group_size, &this_max_nunits,
					matches, npermutes,
					&this_tree_size, bst_map)) != NULL)
	{
	  /* If we have all children of a non-unary child built up from
	     scalars then just throw that away and build it up this node
	     from scalars.  */
	  if (is_a <bb_vec_info> (vinfo)
	      && SLP_TREE_CHILDREN (child).length () > 1
	      /* ??? Rejecting patterns this way doesn't work.  We'd have to
		 do extra work to cancel the pattern so the uses see the
		 scalar version.  */
	      && !oprnd_info->any_pattern)
	    {
	      slp_tree grandchild;

	      FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild)
		if (SLP_TREE_DEF_TYPE (grandchild) != vect_external_def)
		  break;
	      if (!grandchild
		  && vect_update_all_shared_vectypes (vinfo,
						      oprnd_info->def_stmts))
		{
		  /* Roll back.  */
		  this_tree_size = old_tree_size;
		  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild)
		    vect_free_slp_tree (grandchild, false);
		  SLP_TREE_CHILDREN (child).truncate (0);

		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_NOTE, vect_location,
				     "Building parent vector operands from "
				     "scalars instead\n");
		  oprnd_info->def_stmts = vNULL;
		  SLP_TREE_DEF_TYPE (child) = vect_external_def;
		  SLP_TREE_SCALAR_OPS (child) = oprnd_info->ops;
		  oprnd_info->ops = vNULL;
		  ++this_tree_size;
		  children.safe_push (child);
		  continue;
		}
	    }

	  oprnd_info->def_stmts = vNULL;
	  children.safe_push (child);
	  continue;
	}

      /* If the SLP build failed fatally and we analyze a basic-block
	 simply treat nodes we fail to build as externally defined
	 (and thus build vectors from the scalar defs).
	 The cost model will reject outright expensive cases.
	 ??? This doesn't treat cases where permutation ultimately
	 fails (or we don't try permutation below).  Ideally we'd
	 even compute a permutation that will end up with the maximum
	 SLP tree size...  */
      if (is_a <bb_vec_info> (vinfo)
	  && !matches[0]
	  /* ??? Rejecting patterns this way doesn't work.  We'd have to
	     do extra work to cancel the pattern so the uses see the
	     scalar version.  */
	  && !is_pattern_stmt_p (stmt_info)
	  && !oprnd_info->any_pattern
	  && vect_update_all_shared_vectypes (vinfo, oprnd_info->def_stmts))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "Building vector operands from scalars\n");
	  this_tree_size++;
	  child = vect_create_new_slp_node (oprnd_info->def_stmts, 0);
	  SLP_TREE_DEF_TYPE (child) = vect_external_def;
	  SLP_TREE_SCALAR_OPS (child) = oprnd_info->ops;
	  children.safe_push (child);
	  oprnd_info->ops = vNULL;
	  oprnd_info->def_stmts = vNULL;
	  continue;
	}

      /* If the SLP build for operand zero failed and operand zero
	 and one can be commuted try that for the scalar stmts
	 that failed the match.  */
      if (i == 0
	  /* A first scalar stmt mismatch signals a fatal mismatch.  */
	  && matches[0]
	  /* ??? For COND_EXPRs we can swap the comparison operands
	     as well as the arms under some constraints.  */
	  && nops == 2
	  && oprnds_info[1]->first_dt == vect_internal_def
	  && is_gimple_assign (stmt_info->stmt)
	  /* Swapping operands for reductions breaks assumptions later on.  */
	  && STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
	  && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def
	  /* Do so only if the number of not successful permutes was no
	     more than a cut-off, as re-trying the recursive match on
	     possibly each level of the tree would expose exponential
	     behavior.  */
	  && *npermutes < 4)
	{
	  /* See whether we can swap the matching or the non-matching
	     stmt operands.  */
	  bool swap_not_matching = true;
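	  /* Prefer swapping the operands of the stmts that failed to
	     match; if one of those cannot be commuted, fall back to
	     trying to swap the operands of the matching stmts instead.  */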
1476 do
1477 {
1478 for (j = 0; j < group_size; ++j)
1479 {
1480 if (matches[j] != !swap_not_matching)
1481 continue;
1482 stmt_vec_info stmt_info = stmts[j];
1483 /* Verify if we can swap operands of this stmt. */
1484 gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt);
1485 if (!stmt
1486 || !commutative_tree_code (gimple_assign_rhs_code (stmt)))
1487 {
1488 if (!swap_not_matching)
1489 goto fail;
1490 swap_not_matching = false;
1491 break;
1492 }
1493 }
1494 }
1495 while (j != group_size);
1496
1497 /* Swap mismatched definition stmts. */
1498 if (dump_enabled_p ())
1499 dump_printf_loc (MSG_NOTE, vect_location,
1500 "Re-trying with swapped operands of stmts ");
1501 for (j = 0; j < group_size; ++j)
1502 if (matches[j] == !swap_not_matching)
1503 {
1504 std::swap (oprnds_info[0]->def_stmts[j],
1505 oprnds_info[1]->def_stmts[j]);
1506 std::swap (oprnds_info[0]->ops[j],
1507 oprnds_info[1]->ops[j]);
1508 if (dump_enabled_p ())
1509 dump_printf (MSG_NOTE, "%d ", j);
1510 }
1511 if (dump_enabled_p ())
1512 dump_printf (MSG_NOTE, "\n");
1513 /* And try again with scratch 'matches' ... */
1514 bool *tem = XALLOCAVEC (bool, group_size);
1515 if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
1516 group_size, &this_max_nunits,
1517 tem, npermutes,
1518 &this_tree_size, bst_map)) != NULL)
1519 {
1520 /* If we have all children of a non-unary child built up from
1521 scalars then just throw that away and build it up this node
1522 from scalars. */
1523 if (is_a <bb_vec_info> (vinfo)
1524 && SLP_TREE_CHILDREN (child).length () > 1
1525 /* ??? Rejecting patterns this way doesn't work. We'd have
1526 to do extra work to cancel the pattern so the uses see the
1527 scalar version. */
1528 && !oprnd_info->any_pattern)
1529 {
1530 unsigned int j;
1531 slp_tree grandchild;
1532
1533 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild)
1534 if (SLP_TREE_DEF_TYPE (grandchild) != vect_external_def)
1535 break;
1536 if (!grandchild
1537 && (vect_update_all_shared_vectypes
1538 (vinfo, oprnd_info->def_stmts)))
1539 {
1540 /* Roll back. */
1541 this_tree_size = old_tree_size;
1542 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild)
1543 vect_free_slp_tree (grandchild, false);
1544 SLP_TREE_CHILDREN (child).truncate (0);
1545
1546 if (dump_enabled_p ())
1547 dump_printf_loc (MSG_NOTE, vect_location,
1548 "Building parent vector operands from "
1549 "scalars instead\n");
1550 oprnd_info->def_stmts = vNULL;
1551 SLP_TREE_DEF_TYPE (child) = vect_external_def;
1552 SLP_TREE_SCALAR_OPS (child) = oprnd_info->ops;
1553 oprnd_info->ops = vNULL;
1554 ++this_tree_size;
1555 children.safe_push (child);
1556 continue;
1557 }
1558 }
1559
1560 oprnd_info->def_stmts = vNULL;
1561 children.safe_push (child);
1562 continue;
1563 }
1564
1565 ++*npermutes;
1566 }
1567
1568 fail:
1569 gcc_assert (child == NULL);
1570 FOR_EACH_VEC_ELT (children, j, child)
1571 vect_free_slp_tree (child, false);
1572 vect_free_oprnd_info (oprnds_info);
1573 return NULL;
1574 }
1575
1576 vect_free_oprnd_info (oprnds_info);
1577
1578 *tree_size += this_tree_size + 1;
1579 *max_nunits = this_max_nunits;
1580
1581 node = vect_create_new_slp_node (stmts, nops);
1582 SLP_TREE_TWO_OPERATORS (node) = two_operators;
1583 SLP_TREE_CHILDREN (node).splice (children);
1584 return node;
1585 }
1586
1587 /* Dump a single SLP tree NODE. */
1588
1589 static void
1590 vect_print_slp_tree (dump_flags_t dump_kind, dump_location_t loc,
1591 slp_tree node)
1592 {
1593 unsigned i, j;
1594 slp_tree child;
1595 stmt_vec_info stmt_info;
1596 tree op;
1597
1598 dump_metadata_t metadata (dump_kind, loc.get_impl_location ());
1599 dump_user_location_t user_loc = loc.get_user_location ();
1600 dump_printf_loc (metadata, user_loc, "node%s %p (max_nunits=%u, refcnt=%u)\n",
1601 SLP_TREE_DEF_TYPE (node) == vect_external_def
1602 ? " (external)"
1603 : (SLP_TREE_DEF_TYPE (node) == vect_constant_def
1604 ? " (constant)"
1605 : ""), node,
1606 estimated_poly_value (node->max_nunits), node->refcnt);
1607 if (SLP_TREE_SCALAR_STMTS (node).exists ())
1608 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
1609 dump_printf_loc (metadata, user_loc, "\tstmt %u %G", i, stmt_info->stmt);
1610 else
1611 {
1612 dump_printf_loc (metadata, user_loc, "\t{ ");
1613 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
1614 dump_printf (metadata, "%T%s ", op,
1615 i < SLP_TREE_SCALAR_OPS (node).length () - 1 ? "," : "");
1616 dump_printf (metadata, "}\n");
1617 }
1618 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
1619 {
1620 dump_printf_loc (metadata, user_loc, "\tload permutation {");
1621 FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (node), i, j)
1622 dump_printf (dump_kind, " %u", j);
1623 dump_printf (dump_kind, " }\n");
1624 }
1625 if (SLP_TREE_CHILDREN (node).is_empty ())
1626 return;
1627 dump_printf_loc (metadata, user_loc, "\tchildren");
1628 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1629 dump_printf (dump_kind, " %p", (void *)child);
1630 dump_printf (dump_kind, "\n");
1631 }
1632
1633 DEBUG_FUNCTION void
1634 debug (slp_tree node)
1635 {
1636 debug_dump_context ctx;
1637 vect_print_slp_tree (MSG_NOTE,
1638 dump_location_t::from_location_t (UNKNOWN_LOCATION),
1639 node);
1640 }
1641
1642 /* Dump a slp tree NODE using flags specified in DUMP_KIND. */
1643
1644 static void
1645 vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
1646 slp_tree node, hash_set<slp_tree> &visited)
1647 {
1648 unsigned i;
1649 slp_tree child;
1650
1651 if (visited.add (node))
1652 return;
1653
1654 vect_print_slp_tree (dump_kind, loc, node);
1655
1656 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1657 vect_print_slp_graph (dump_kind, loc, child, visited);
1658 }
1659
1660 static void
1661 vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
1662 slp_tree entry)
1663 {
1664 hash_set<slp_tree> visited;
1665 vect_print_slp_graph (dump_kind, loc, entry, visited);
1666 }
1667
1668 /* Mark the tree rooted at NODE with PURE_SLP. */
1669
1670 static void
1671 vect_mark_slp_stmts (slp_tree node, hash_set<slp_tree> &visited)
1672 {
1673 int i;
1674 stmt_vec_info stmt_info;
1675 slp_tree child;
1676
1677 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
1678 return;
1679
1680 if (visited.add (node))
1681 return;
1682
1683 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
1684 STMT_SLP_TYPE (stmt_info) = pure_slp;
1685
1686 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1687 vect_mark_slp_stmts (child, visited);
1688 }
1689
1690 static void
1691 vect_mark_slp_stmts (slp_tree node)
1692 {
1693 hash_set<slp_tree> visited;
1694 vect_mark_slp_stmts (node, visited);
1695 }
1696
1697 /* Mark the statements of the tree rooted at NODE as relevant (vect_used). */
1698
1699 static void
1700 vect_mark_slp_stmts_relevant (slp_tree node, hash_set<slp_tree> &visited)
1701 {
1702 int i;
1703 stmt_vec_info stmt_info;
1704 slp_tree child;
1705
1706 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
1707 return;
1708
1709 if (visited.add (node))
1710 return;
1711
1712 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
1713 {
1714 gcc_assert (!STMT_VINFO_RELEVANT (stmt_info)
1715 || STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope);
1716 STMT_VINFO_RELEVANT (stmt_info) = vect_used_in_scope;
1717 }
1718
1719 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1720 vect_mark_slp_stmts_relevant (child, visited);
1721 }
1722
1723 static void
1724 vect_mark_slp_stmts_relevant (slp_tree node)
1725 {
1726 hash_set<slp_tree> visited;
1727 vect_mark_slp_stmts_relevant (node, visited);
1728 }
1729
1730 /* Copy the SLP subtree rooted at NODE. */
1731
1732 static slp_tree
1733 slp_copy_subtree (slp_tree node, hash_map<slp_tree, slp_tree> &map)
1734 {
1735 unsigned i;
1736
1737 bool existed_p;
1738 slp_tree &copy_ref = map.get_or_insert (node, &existed_p);
1739 if (existed_p)
1740 return copy_ref;
1741
1742 copy_ref = new _slp_tree;
1743 slp_tree copy = copy_ref;
1744 SLP_TREE_DEF_TYPE (copy) = SLP_TREE_DEF_TYPE (node);
1745 SLP_TREE_VECTYPE (copy) = SLP_TREE_VECTYPE (node);
1746 SLP_TREE_REPRESENTATIVE (copy) = SLP_TREE_REPRESENTATIVE (node);
1747 copy->max_nunits = node->max_nunits;
1748 copy->refcnt = 0;
1749 if (SLP_TREE_SCALAR_STMTS (node).exists ())
1750 {
1751 SLP_TREE_SCALAR_STMTS (copy) = SLP_TREE_SCALAR_STMTS (node).copy ();
1752 stmt_vec_info stmt_info;
1753 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
1754 STMT_VINFO_NUM_SLP_USES (stmt_info)++;
1755 }
1756 if (SLP_TREE_SCALAR_OPS (node).exists ())
1757 SLP_TREE_SCALAR_OPS (copy) = SLP_TREE_SCALAR_OPS (node).copy ();
1758 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
1759 SLP_TREE_LOAD_PERMUTATION (copy) = SLP_TREE_LOAD_PERMUTATION (node).copy ();
1760 if (SLP_TREE_CHILDREN (node).exists ())
1761 SLP_TREE_CHILDREN (copy) = SLP_TREE_CHILDREN (node).copy ();
1762 gcc_assert (!SLP_TREE_VEC_STMTS (node).exists ());
1763
1764 slp_tree child;
1765 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (copy), i, child)
1766 {
1767 SLP_TREE_CHILDREN (copy)[i] = slp_copy_subtree (child, map);
1768 SLP_TREE_CHILDREN (copy)[i]->refcnt++;
1769 }
1770 return copy;
1771 }
1772
1773 /* Rearrange the statements of NODE according to PERMUTATION. */
1774
1775 static void
1776 vect_slp_rearrange_stmts (slp_tree node, unsigned int group_size,
1777 vec<unsigned> permutation,
1778 hash_set<slp_tree> &visited)
1779 {
1780 unsigned int i;
1781 slp_tree child;
1782
1783 if (visited.add (node))
1784 return;
1785
1786 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1787 vect_slp_rearrange_stmts (child, group_size, permutation, visited);
1788
1789 if (SLP_TREE_SCALAR_STMTS (node).exists ())
1790 {
1791 gcc_assert (group_size == SLP_TREE_SCALAR_STMTS (node).length ());
1792 vec<stmt_vec_info> tmp_stmts;
1793 tmp_stmts.create (group_size);
1794 tmp_stmts.quick_grow (group_size);
1795 stmt_vec_info stmt_info;
1796 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
1797 tmp_stmts[permutation[i]] = stmt_info;
1798 SLP_TREE_SCALAR_STMTS (node).release ();
1799 SLP_TREE_SCALAR_STMTS (node) = tmp_stmts;
1800 }
1801 if (SLP_TREE_SCALAR_OPS (node).exists ())
1802 {
1803 gcc_assert (group_size == SLP_TREE_SCALAR_OPS (node).length ());
1804 vec<tree> tmp_ops;
1805 tmp_ops.create (group_size);
1806 tmp_ops.quick_grow (group_size);
1807 tree op;
1808 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
1809 tmp_ops[permutation[i]] = op;
1810 SLP_TREE_SCALAR_OPS (node).release ();
1811 SLP_TREE_SCALAR_OPS (node) = tmp_ops;
1812 }
1813 }
1814
1815
1816 /* Attempt to reorder stmts in a reduction chain so that we don't
1817 require any load permutation. Return true if that was possible,
1818 otherwise return false. */
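/* For example, if every load node of the instance carries the load
   permutation { 2, 0, 3, 1 }, the scalar stmts of all nodes can be
   permuted that way instead, after which the loads appear in memory
   order and need no runtime permutation.  This is only valid when all
   load nodes agree on a single permutation and that permutation is a
   bijection on { 0, ..., group_size - 1 }, which is what the checks
   below verify.  */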
1819
1820 static bool
1821 vect_attempt_slp_rearrange_stmts (slp_instance slp_instn)
1822 {
1823 unsigned int i, j;
1824 unsigned int lidx;
1825 slp_tree node, load;
1826
1827 if (SLP_INSTANCE_LOADS (slp_instn).is_empty ())
1828 return false;
1829
1830 /* Compare all the permutation sequences to the first one. We know
1831 that at least one load is permuted. */
1832 node = SLP_INSTANCE_LOADS (slp_instn)[0];
1833 if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
1834 return false;
1835 unsigned int group_size = SLP_TREE_LOAD_PERMUTATION (node).length ();
1836 for (i = 1; SLP_INSTANCE_LOADS (slp_instn).iterate (i, &load); ++i)
1837 {
1838 if (!SLP_TREE_LOAD_PERMUTATION (load).exists ()
1839 || SLP_TREE_LOAD_PERMUTATION (load).length () != group_size)
1840 return false;
1841 FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (load), j, lidx)
1842 if (lidx != SLP_TREE_LOAD_PERMUTATION (node)[j])
1843 return false;
1844 }
1845
1846 /* Check that the loads in the first sequence are different and there
1847 are no gaps between them. */
1848 auto_sbitmap load_index (group_size);
1849 bitmap_clear (load_index);
1850 FOR_EACH_VEC_ELT (node->load_permutation, i, lidx)
1851 {
1852 if (lidx >= group_size)
1853 return false;
1854 if (bitmap_bit_p (load_index, lidx))
1855 return false;
1856
1857 bitmap_set_bit (load_index, lidx);
1858 }
1859 for (i = 0; i < group_size; i++)
1860 if (!bitmap_bit_p (load_index, i))
1861 return false;
1862
1863 /* This permutation is valid for reduction. Since the order of the
1864 statements in the nodes is not important unless they are memory
1865 accesses, we can rearrange the statements in all the nodes
1866 according to the order of the loads. */
1867
1868 /* We have to unshare the SLP tree we modify. */
1869 hash_map<slp_tree, slp_tree> map;
1870 slp_tree unshared = slp_copy_subtree (SLP_INSTANCE_TREE (slp_instn), map);
1871 vect_free_slp_tree (SLP_INSTANCE_TREE (slp_instn), false);
1872 unshared->refcnt++;
1873 SLP_INSTANCE_TREE (slp_instn) = unshared;
1874 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
1875 SLP_INSTANCE_LOADS (slp_instn)[i] = *map.get (node);
1876 node = SLP_INSTANCE_LOADS (slp_instn)[0];
1877
1878 /* Do the actual re-arrangement. */
1879 hash_set<slp_tree> visited;
1880 vect_slp_rearrange_stmts (SLP_INSTANCE_TREE (slp_instn), group_size,
1881 node->load_permutation, visited);
1882
1883 /* We are done, no actual permutations need to be generated. */
1884 poly_uint64 unrolling_factor = SLP_INSTANCE_UNROLLING_FACTOR (slp_instn);
1885 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
1886 {
1887 stmt_vec_info first_stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
1888 first_stmt_info = DR_GROUP_FIRST_ELEMENT (first_stmt_info);
1889 /* But we have to keep those permutations that are required because
1890 of handling of gaps. */
1891 if (known_eq (unrolling_factor, 1U)
1892 || (group_size == DR_GROUP_SIZE (first_stmt_info)
1893 && DR_GROUP_GAP (first_stmt_info) == 0))
1894 SLP_TREE_LOAD_PERMUTATION (node).release ();
1895 else
1896 for (j = 0; j < SLP_TREE_LOAD_PERMUTATION (node).length (); ++j)
1897 SLP_TREE_LOAD_PERMUTATION (node)[j] = j;
1898 }
1899
1900 return true;
1901 }
1902
1903 /* Gather the load nodes in the SLP graph rooted at NODE and populate the LOADS (INST loads) array. */
1904
1905 static void
1906 vect_gather_slp_loads (vec<slp_tree> &loads, slp_tree node,
1907 hash_set<slp_tree> &visited)
1908 {
1909 if (visited.add (node))
1910 return;
1911
1912 if (SLP_TREE_CHILDREN (node).length () == 0)
1913 {
1914 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
1915 return;
1916 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
1917 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1918 && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1919 loads.safe_push (node);
1920 }
1921 else
1922 {
1923 unsigned i;
1924 slp_tree child;
1925 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1926 vect_gather_slp_loads (loads, child, visited);
1927 }
1928 }
1929
1930 static void
1931 vect_gather_slp_loads (slp_instance inst, slp_tree node)
1932 {
1933 hash_set<slp_tree> visited;
1934 vect_gather_slp_loads (SLP_INSTANCE_LOADS (inst), node, visited);
1935 }
1936
1937
1938 /* Find the last scalar statement in the SLP tree rooted at NODE. */
1939
1940 stmt_vec_info
1941 vect_find_last_scalar_stmt_in_slp (slp_tree node)
1942 {
1943 stmt_vec_info last = NULL;
1944 stmt_vec_info stmt_vinfo;
1945
1946 for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
1947 {
1948 stmt_vinfo = vect_orig_stmt (stmt_vinfo);
1949 last = last ? get_later_stmt (stmt_vinfo, last) : stmt_vinfo;
1950 }
1951
1952 return last;
1953 }
1954
1955 /* Splits a group of stores, currently beginning at FIRST_VINFO, into
1956 two groups: one (still beginning at FIRST_VINFO) of size GROUP1_SIZE
1957 (also containing the first GROUP1_SIZE stmts, since stores are
1958 consecutive), the second containing the remainder.
1959 Return the first stmt in the second group. */
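/* For example, splitting a group of six consecutive stores with
   GROUP1_SIZE == 4 (assuming the original DR_GROUP_GAP was zero)
   leaves a first group of size 4 whose gap becomes 2, so that it
   skips over the second group's elements, and creates a second group
   of size 2 whose gap becomes 4, so that it starts after the first
   group's elements.  */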
1960
1961 static stmt_vec_info
1962 vect_split_slp_store_group (stmt_vec_info first_vinfo, unsigned group1_size)
1963 {
1964 gcc_assert (DR_GROUP_FIRST_ELEMENT (first_vinfo) == first_vinfo);
1965 gcc_assert (group1_size > 0);
1966 int group2_size = DR_GROUP_SIZE (first_vinfo) - group1_size;
1967 gcc_assert (group2_size > 0);
1968 DR_GROUP_SIZE (first_vinfo) = group1_size;
1969
1970 stmt_vec_info stmt_info = first_vinfo;
1971 for (unsigned i = group1_size; i > 1; i--)
1972 {
1973 stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info);
1974 gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
1975 }
1976 /* STMT is now the last element of the first group. */
1977 stmt_vec_info group2 = DR_GROUP_NEXT_ELEMENT (stmt_info);
1978 DR_GROUP_NEXT_ELEMENT (stmt_info) = 0;
1979
1980 DR_GROUP_SIZE (group2) = group2_size;
1981 for (stmt_info = group2; stmt_info;
1982 stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info))
1983 {
1984 DR_GROUP_FIRST_ELEMENT (stmt_info) = group2;
1985 gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
1986 }
1987
1988 /* For the second group, the DR_GROUP_GAP is that before the original group,
1989 plus skipping over the first vector. */
1990 DR_GROUP_GAP (group2) = DR_GROUP_GAP (first_vinfo) + group1_size;
1991
1992 /* DR_GROUP_GAP of the first group now has to skip over the second group too. */
1993 DR_GROUP_GAP (first_vinfo) += group2_size;
1994
1995 if (dump_enabled_p ())
1996 dump_printf_loc (MSG_NOTE, vect_location, "Split group into %d and %d\n",
1997 group1_size, group2_size);
1998
1999 return group2;
2000 }
2001
2002 /* Calculate the unrolling factor for an SLP instance with GROUP_SIZE
2003 statements and a vector of NUNITS elements. */
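/* For example, V4SI vectors (NUNITS == 4) and GROUP_SIZE == 6 have
   the common multiple 12, so the unrolling factor is 12 / 6 == 2:
   two copies of the group supply 12 scalar lanes and fill three
   vectors exactly.  If GROUP_SIZE is a multiple of NUNITS the factor
   is 1 and no unrolling is needed.  */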
2004
2005 static poly_uint64
2006 calculate_unrolling_factor (poly_uint64 nunits, unsigned int group_size)
2007 {
2008 return exact_div (common_multiple (nunits, group_size), group_size);
2009 }
2010
2011 /* Analyze an SLP instance starting from a group of grouped stores. Call
2012 vect_build_slp_tree to build a tree of packed stmts if possible.
2013 Return FALSE if it's impossible to SLP any stmt in the loop. */
2014
2015 static bool
2016 vect_analyze_slp_instance (vec_info *vinfo,
2017 scalar_stmts_to_slp_tree_map_t *bst_map,
2018 stmt_vec_info stmt_info, unsigned max_tree_size)
2019 {
2020 slp_instance new_instance;
2021 slp_tree node;
2022 unsigned int group_size;
2023 tree vectype, scalar_type = NULL_TREE;
2024 unsigned int i;
2025 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
2026 vec<stmt_vec_info> scalar_stmts;
2027 bool constructor = false;
2028
2029 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2030 {
2031 scalar_type = TREE_TYPE (DR_REF (dr));
2032 group_size = DR_GROUP_SIZE (stmt_info);
2033 vectype = get_vectype_for_scalar_type (vinfo, scalar_type, group_size);
2034 }
2035 else if (!dr && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
2036 {
2037 gcc_assert (is_a <loop_vec_info> (vinfo));
2038 vectype = STMT_VINFO_VECTYPE (stmt_info);
2039 group_size = REDUC_GROUP_SIZE (stmt_info);
2040 }
2041 else if (is_gimple_assign (stmt_info->stmt)
2042 && gimple_assign_rhs_code (stmt_info->stmt) == CONSTRUCTOR)
2043 {
2044 vectype = TREE_TYPE (gimple_assign_rhs1 (stmt_info->stmt));
2045 group_size = CONSTRUCTOR_NELTS (gimple_assign_rhs1 (stmt_info->stmt));
2046 constructor = true;
2047 }
2048 else
2049 {
2050 gcc_assert (is_a <loop_vec_info> (vinfo));
2051 vectype = STMT_VINFO_VECTYPE (stmt_info);
2052 group_size = as_a <loop_vec_info> (vinfo)->reductions.length ();
2053 }
2054
2055 if (!vectype)
2056 {
2057 if (dump_enabled_p ())
2058 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2059 "Build SLP failed: unsupported data-type %T\n",
2060 scalar_type);
2061
2062 return false;
2063 }
2064 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
2065
2066 /* Create a node (a root of the SLP tree) for the packed grouped stores. */
2067 scalar_stmts.create (group_size);
2068 stmt_vec_info next_info = stmt_info;
2069 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2070 {
2071 /* Collect the stores and store them in SLP_TREE_SCALAR_STMTS. */
2072 while (next_info)
2073 {
2074 scalar_stmts.safe_push (vect_stmt_to_vectorize (next_info));
2075 next_info = DR_GROUP_NEXT_ELEMENT (next_info);
2076 }
2077 }
2078 else if (!dr && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
2079 {
2080 /* Collect the reduction stmts and store them in
2081 SLP_TREE_SCALAR_STMTS. */
2082 while (next_info)
2083 {
2084 scalar_stmts.safe_push (vect_stmt_to_vectorize (next_info));
2085 next_info = REDUC_GROUP_NEXT_ELEMENT (next_info);
2086 }
2087 /* Mark the first element of the reduction chain as reduction to properly
2088 transform the node. In the reduction analysis phase only the last
2089 element of the chain is marked as reduction. */
2090 STMT_VINFO_DEF_TYPE (stmt_info)
2091 = STMT_VINFO_DEF_TYPE (scalar_stmts.last ());
2092 STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))
2093 = STMT_VINFO_REDUC_DEF (vect_orig_stmt (scalar_stmts.last ()));
2094 }
2095 else if (constructor)
2096 {
2097 tree rhs = gimple_assign_rhs1 (stmt_info->stmt);
2098 tree val;
2099 FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), i, val)
2100 {
2101 if (TREE_CODE (val) == SSA_NAME)
2102 {
2103 gimple* def = SSA_NAME_DEF_STMT (val);
2104 stmt_vec_info def_info = vinfo->lookup_stmt (def);
2105 /* Value is defined in another basic block. */
2106 if (!def_info)
2107 return false;
2108 def_info = vect_stmt_to_vectorize (def_info);
2109 scalar_stmts.safe_push (def_info);
2110 }
2111 else
2112 return false;
2113 }
2114 if (dump_enabled_p ())
2115 dump_printf_loc (MSG_NOTE, vect_location,
2116 "Analyzing vectorizable constructor: %G\n",
2117 stmt_info->stmt);
2118 }
2119 else
2120 {
2121 /* Collect reduction statements. */
2122 vec<stmt_vec_info> reductions = as_a <loop_vec_info> (vinfo)->reductions;
2123 for (i = 0; reductions.iterate (i, &next_info); i++)
2124 scalar_stmts.safe_push (next_info);
2125 }
2126
2127 /* Build the tree for the SLP instance. */
2128 bool *matches = XALLOCAVEC (bool, group_size);
2129 unsigned npermutes = 0;
2130 poly_uint64 max_nunits = nunits;
2131 unsigned tree_size = 0;
2132 node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
2133 &max_nunits, matches, &npermutes,
2134 &tree_size, bst_map);
2135 if (node != NULL)
2136 {
2137 /* Calculate the unrolling factor based on the smallest type. */
2138 poly_uint64 unrolling_factor
2139 = calculate_unrolling_factor (max_nunits, group_size);
2140
2141 if (maybe_ne (unrolling_factor, 1U)
2142 && is_a <bb_vec_info> (vinfo))
2143 {
2144 unsigned HOST_WIDE_INT const_max_nunits;
2145 if (!max_nunits.is_constant (&const_max_nunits)
2146 || const_max_nunits > group_size)
2147 {
2148 if (dump_enabled_p ())
2149 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2150 "Build SLP failed: store group "
2151 "size not a multiple of the vector size "
2152 "in basic block SLP\n");
2153 vect_free_slp_tree (node, false);
2154 return false;
2155 }
2156 /* Fatal mismatch. */
2157 matches[group_size / const_max_nunits * const_max_nunits] = false;
2158 vect_free_slp_tree (node, false);
2159 }
2160 else
2161 {
2162 /* Create a new SLP instance. */
2163 new_instance = XNEW (class _slp_instance);
2164 SLP_INSTANCE_TREE (new_instance) = node;
2165 SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor;
2166 SLP_INSTANCE_LOADS (new_instance) = vNULL;
2167 SLP_INSTANCE_ROOT_STMT (new_instance) = constructor ? stmt_info : NULL;
2168
2169 vect_gather_slp_loads (new_instance, node);
2170 if (dump_enabled_p ())
2171 dump_printf_loc (MSG_NOTE, vect_location,
2172 "SLP size %u vs. limit %u.\n",
2173 tree_size, max_tree_size);
2174
2175 /* Check whether any load is possibly permuted. */
2176 slp_tree load_node;
2177 bool loads_permuted = false;
2178 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (new_instance), i, load_node)
2179 {
2180 if (!SLP_TREE_LOAD_PERMUTATION (load_node).exists ())
2181 continue;
2182 unsigned j;
2183 stmt_vec_info load_info;
2184 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load_info)
2185 if (SLP_TREE_LOAD_PERMUTATION (load_node)[j] != j)
2186 {
2187 loads_permuted = true;
2188 break;
2189 }
2190 }
2191
2192 /* If the loads and stores can be handled with load/store-lane
2193 instructions do not generate this SLP instance. */
2194 if (is_a <loop_vec_info> (vinfo)
2195 && loads_permuted
2196 && dr && vect_store_lanes_supported (vectype, group_size, false))
2197 {
2198 slp_tree load_node;
2199 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (new_instance), i, load_node)
2200 {
2201 stmt_vec_info stmt_vinfo = DR_GROUP_FIRST_ELEMENT
2202 (SLP_TREE_SCALAR_STMTS (load_node)[0]);
2203 /* Use SLP for strided accesses (or if we can't use load-lanes). */
2204 if (STMT_VINFO_STRIDED_P (stmt_vinfo)
2205 || ! vect_load_lanes_supported
2206 (STMT_VINFO_VECTYPE (stmt_vinfo),
2207 DR_GROUP_SIZE (stmt_vinfo), false))
2208 break;
2209 }
2210 if (i == SLP_INSTANCE_LOADS (new_instance).length ())
2211 {
2212 if (dump_enabled_p ())
2213 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2214 "Built SLP cancelled: can use "
2215 "load/store-lanes\n");
2216 vect_free_slp_instance (new_instance, false);
2217 return false;
2218 }
2219 }
2220
2221 /* If this is a reduction chain with a conversion in front
2222 amend the SLP tree with a node for that. */
2223 if (!dr
2224 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
2225 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def)
2226 {
2227 /* Get at the conversion stmt - we know it's the single use
2228 of the last stmt of the reduction chain. */
2229 gimple *tem = vect_orig_stmt (scalar_stmts[group_size - 1])->stmt;
2230 use_operand_p use_p;
2231 gimple *use_stmt;
2232 bool r = single_imm_use (gimple_assign_lhs (tem),
2233 &use_p, &use_stmt);
2234 gcc_assert (r);
2235 next_info = vinfo->lookup_stmt (use_stmt);
2236 next_info = vect_stmt_to_vectorize (next_info);
2237 scalar_stmts = vNULL;
2238 scalar_stmts.create (group_size);
2239 for (unsigned i = 0; i < group_size; ++i)
2240 scalar_stmts.quick_push (next_info);
2241 slp_tree conv = vect_create_new_slp_node (scalar_stmts, 1);
2242 SLP_TREE_CHILDREN (conv).quick_push (node);
2243 SLP_INSTANCE_TREE (new_instance) = conv;
2244 /* We also have to fake this conversion stmt as SLP reduction
2245 group so we don't have to mess with too much code
2246 elsewhere. */
2247 REDUC_GROUP_FIRST_ELEMENT (next_info) = next_info;
2248 REDUC_GROUP_NEXT_ELEMENT (next_info) = NULL;
2249 }
2250
2251 vinfo->slp_instances.safe_push (new_instance);
2252
2253 /* ??? We've replaced the old SLP_INSTANCE_GROUP_SIZE with
2254 the number of scalar stmts in the root in a few places.
2255 Verify that assumption holds. */
2256 gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
2257 .length () == group_size);
2258
2259 if (dump_enabled_p ())
2260 {
2261 dump_printf_loc (MSG_NOTE, vect_location,
2262 "Final SLP tree for instance:\n");
2263 vect_print_slp_graph (MSG_NOTE, vect_location,
2264 SLP_INSTANCE_TREE (new_instance));
2265 }
2266
2267 return true;
2268 }
2269 }
2270 else
2271 {
2272 /* Failed to SLP. */
2273 /* Free the allocated memory. */
2274 scalar_stmts.release ();
2275 }
2276
2277 /* For basic block SLP, try to break the group up into multiples of the
2278 vector size. */
2279 unsigned HOST_WIDE_INT const_nunits;
2280 if (is_a <bb_vec_info> (vinfo)
2281 && STMT_VINFO_GROUPED_ACCESS (stmt_info)
2282 && DR_GROUP_FIRST_ELEMENT (stmt_info)
2283 && nunits.is_constant (&const_nunits))
2284 {
2285 /* We consider breaking the group only on VF boundaries from the existing
2286 start. */
2287 for (i = 0; i < group_size; i++)
2288 if (!matches[i]) break;
2289
2290 if (i >= const_nunits && i < group_size)
2291 {
2292 /* Split into two groups at the first vector boundary before i. */
2293 gcc_assert ((const_nunits & (const_nunits - 1)) == 0);
2294 unsigned group1_size = i & ~(const_nunits - 1);
2295
2296 stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
2297 group1_size);
2298 bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
2299 max_tree_size);
2300 /* If the first non-match was in the middle of a vector,
2301 skip the rest of that vector. */
2302 if (group1_size < i)
2303 {
2304 i = group1_size + const_nunits;
2305 if (i < group_size)
2306 rest = vect_split_slp_store_group (rest, const_nunits);
2307 }
2308 if (i < group_size)
2309 res |= vect_analyze_slp_instance (vinfo, bst_map,
2310 rest, max_tree_size);
2311 return res;
2312 }
2313 /* Even though the first vector did not all match, we might be able to SLP
2314 (some) of the remainder. FORNOW ignore this possibility. */
2315 }
2316
2317 return false;
2318 }
2319
2320
2321 /* Check if there are stmts in the loop that can be vectorized using SLP. Build SLP
2322 trees of packed scalar stmts if SLP is possible. */
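/* In outline: instances are seeded from the groups of grouped stores,
   then, for loop vectorization, from each reduction chain, and
   finally from the set of reductions as a whole.  A reduction chain
   that fails to SLP is dissolved and its last element re-queued as an
   ordinary reduction.  */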
2323
2324 opt_result
2325 vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
2326 {
2327 unsigned int i;
2328 stmt_vec_info first_element;
2329
2330 DUMP_VECT_SCOPE ("vect_analyze_slp");
2331
2332 scalar_stmts_to_slp_tree_map_t *bst_map
2333 = new scalar_stmts_to_slp_tree_map_t ();
2334
2335 /* Find SLP sequences starting from groups of grouped stores. */
2336 FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
2337 vect_analyze_slp_instance (vinfo, bst_map, first_element, max_tree_size);
2338
2339 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
2340 {
2341 if (loop_vinfo->reduction_chains.length () > 0)
2342 {
2343 /* Find SLP sequences starting from reduction chains. */
2344 FOR_EACH_VEC_ELT (loop_vinfo->reduction_chains, i, first_element)
2345 if (! vect_analyze_slp_instance (vinfo, bst_map, first_element,
2346 max_tree_size))
2347 {
2348 /* Dissolve reduction chain group. */
2349 stmt_vec_info vinfo = first_element;
2350 stmt_vec_info last = NULL;
2351 while (vinfo)
2352 {
2353 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
2354 REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
2355 REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
2356 last = vinfo;
2357 vinfo = next;
2358 }
2359 STMT_VINFO_DEF_TYPE (first_element) = vect_internal_def;
2360 /* It can still be vectorized as part of an SLP reduction. */
2361 loop_vinfo->reductions.safe_push (last);
2362 }
2363 }
2364
2365 /* Find SLP sequences starting from groups of reductions. */
2366 if (loop_vinfo->reductions.length () > 1)
2367 vect_analyze_slp_instance (vinfo, bst_map, loop_vinfo->reductions[0],
2368 max_tree_size);
2369 }
2370
2371 /* The map keeps a reference on SLP nodes built, release that. */
2372 for (scalar_stmts_to_slp_tree_map_t::iterator it = bst_map->begin ();
2373 it != bst_map->end (); ++it)
2374 if ((*it).second)
2375 vect_free_slp_tree ((*it).second, false);
2376 delete bst_map;
2377
2378 /* Optimize permutations in SLP reductions. */
2379 slp_instance instance;
2380 FOR_EACH_VEC_ELT (vinfo->slp_instances, i, instance)
2381 {
2382 slp_tree node = SLP_INSTANCE_TREE (instance);
2383 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2384 /* Reduction (there are no data-refs in the root).
2385 In reduction chain the order of the loads is not important. */
2386 if (!STMT_VINFO_DATA_REF (stmt_info)
2387 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
2388 vect_attempt_slp_rearrange_stmts (instance);
2389 }
2390
2391 /* Gather all loads in the SLP graph. */
2392 hash_set<slp_tree> visited;
2393 FOR_EACH_VEC_ELT (vinfo->slp_instances, i, instance)
2394 vect_gather_slp_loads (vinfo->slp_loads, SLP_INSTANCE_TREE (instance),
2395 visited);
2396
2397 return opt_result::success ();
2398 }
2399
2400 void
2401 vect_optimize_slp (vec_info *vinfo)
2402 {
2403 slp_tree node;
2404 unsigned i;
2405 FOR_EACH_VEC_ELT (vinfo->slp_loads, i, node)
2406 {
2407 if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
2408 continue;
2409
2410 /* In basic block vectorization we allow any subchain of an interleaving
2411 chain.
2412 FORNOW: not in loop SLP because of realignment complications. */
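/* For example, a node loading a[1] and a[2] out of the interleaving
   chain a[0], ..., a[3] is a subchain: the loads are consecutive
   with gap 1, so the load permutation can be dropped even though the
   indices within the full group do not start at zero.  */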
2413 if (is_a <bb_vec_info> (vinfo))
2414 {
2415 bool subchain_p = true;
2416 stmt_vec_info next_load_info = NULL;
2417 stmt_vec_info load_info;
2418 unsigned j;
2419 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
2420 {
2421 if (j != 0
2422 && (next_load_info != load_info
2423 || DR_GROUP_GAP (load_info) != 1))
2424 {
2425 subchain_p = false;
2426 break;
2427 }
2428 next_load_info = DR_GROUP_NEXT_ELEMENT (load_info);
2429 }
2430 if (subchain_p)
2431 {
2432 SLP_TREE_LOAD_PERMUTATION (node).release ();
2433 continue;
2434 }
2435 }
2436 else
2437 {
2438 stmt_vec_info load_info;
2439 bool this_load_permuted = false;
2440 unsigned j;
2441 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
2442 if (SLP_TREE_LOAD_PERMUTATION (node)[j] != j)
2443 {
2444 this_load_permuted = true;
2445 break;
2446 }
2447 stmt_vec_info first_stmt_info
2448 = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
2449 if (!this_load_permuted
2450 /* The load requires permutation when unrolling exposes
2451 a gap either because the group is larger than the SLP
2452 group-size or because there is a gap between the groups. */
2453 && (known_eq (LOOP_VINFO_VECT_FACTOR (as_a <loop_vec_info> (vinfo)), 1U)
2454 || ((SLP_TREE_SCALAR_STMTS (node).length ()
2455 == DR_GROUP_SIZE (first_stmt_info))
2456 && DR_GROUP_GAP (first_stmt_info) == 0)))
2457 {
2458 SLP_TREE_LOAD_PERMUTATION (node).release ();
2459 continue;
2460 }
2461 }
2462 }
2463 }
2464
2465
2466 /* For each possible SLP instance decide whether to SLP it and calculate overall
2467 unrolling factor needed to SLP the loop. Return TRUE if decided to SLP at
2468 least one instance. */
2469
2470 bool
2471 vect_make_slp_decision (loop_vec_info loop_vinfo)
2472 {
2473 unsigned int i;
2474 poly_uint64 unrolling_factor = 1;
2475 vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
2476 slp_instance instance;
2477 int decided_to_slp = 0;
2478
2479 DUMP_VECT_SCOPE ("vect_make_slp_decision");
2480
2481 FOR_EACH_VEC_ELT (slp_instances, i, instance)
2482 {
2483 /* FORNOW: SLP if you can. */
2484 /* All unroll factors have the form:
2485
2486 GET_MODE_SIZE (vinfo->vector_mode) * X
2487
2488 for some rational X, so they must have a common multiple. */
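/* For example, instances with unrolling factors 2 and 3 force an
   overall unrolling factor of 6.  */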
2489 unrolling_factor
2490 = force_common_multiple (unrolling_factor,
2491 SLP_INSTANCE_UNROLLING_FACTOR (instance));
2492
2493 /* Mark all the stmts that belong to INSTANCE as PURE_SLP stmts. Later we
2494 call vect_detect_hybrid_slp () to find stmts that need hybrid SLP and
2495 loop-based vectorization. Such stmts will be marked as HYBRID. */
2496 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
2497 decided_to_slp++;
2498 }
2499
2500 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo) = unrolling_factor;
2501
2502 if (decided_to_slp && dump_enabled_p ())
2503 {
2504 dump_printf_loc (MSG_NOTE, vect_location,
2505 "Decided to SLP %d instances. Unrolling factor ",
2506 decided_to_slp);
2507 dump_dec (MSG_NOTE, unrolling_factor);
2508 dump_printf (MSG_NOTE, "\n");
2509 }
2510
2511 return (decided_to_slp > 0);
2512 }
2513
2514 /* Private data for vect_detect_hybrid_slp. */
2515 struct vdhs_data
2516 {
2517 loop_vec_info loop_vinfo;
2518 vec<stmt_vec_info> *worklist;
2519 };
2520
2521 /* Walker for walk_gimple_op. */
2522
2523 static tree
2524 vect_detect_hybrid_slp (tree *tp, int *, void *data)
2525 {
2526 walk_stmt_info *wi = (walk_stmt_info *)data;
2527 vdhs_data *dat = (vdhs_data *)wi->info;
2528
2529 if (wi->is_lhs)
2530 return NULL_TREE;
2531
2532 stmt_vec_info def_stmt_info = dat->loop_vinfo->lookup_def (*tp);
2533 if (!def_stmt_info)
2534 return NULL_TREE;
2535 def_stmt_info = vect_stmt_to_vectorize (def_stmt_info);
2536 if (PURE_SLP_STMT (def_stmt_info))
2537 {
2538 if (dump_enabled_p ())
2539 dump_printf_loc (MSG_NOTE, vect_location, "marking hybrid: %G",
2540 def_stmt_info->stmt);
2541 STMT_SLP_TYPE (def_stmt_info) = hybrid;
2542 dat->worklist->safe_push (def_stmt_info);
2543 }
2544
2545 return NULL_TREE;
2546 }
2547
2548 /* Find stmts that must be both vectorized and SLPed. */
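/* For example, if a_1 = b_1 + c_1 is vectorized as part of an SLP
   instance but a_1 is also used by a stmt that is only loop
   vectorized, the definition of a_1 must be available to both
   schemes, so it is marked hybrid below.  */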
2549
2550 void
2551 vect_detect_hybrid_slp (loop_vec_info loop_vinfo)
2552 {
2553 DUMP_VECT_SCOPE ("vect_detect_hybrid_slp");
2554
2555 /* All stmts participating in SLP are marked pure_slp, all other
2556 stmts are loop_vect.
2557 First collect all loop_vect stmts into a worklist. */
2558 auto_vec<stmt_vec_info> worklist;
2559 for (unsigned i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2560 {
2561 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2562 for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
2563 gsi_next (&gsi))
2564 {
2565 gphi *phi = gsi.phi ();
2566 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (phi);
2567 if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
2568 worklist.safe_push (stmt_info);
2569 }
2570 for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
2571 gsi_next (&gsi))
2572 {
2573 gimple *stmt = gsi_stmt (gsi);
2574 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
2575 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2576 {
2577 for (gimple_stmt_iterator gsi2
2578 = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info));
2579 !gsi_end_p (gsi2); gsi_next (&gsi2))
2580 {
2581 stmt_vec_info patt_info
2582 = loop_vinfo->lookup_stmt (gsi_stmt (gsi2));
2583 if (!STMT_SLP_TYPE (patt_info)
2584 && STMT_VINFO_RELEVANT (patt_info))
2585 worklist.safe_push (patt_info);
2586 }
2587 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
2588 }
2589 if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
2590 worklist.safe_push (stmt_info);
2591 }
2592 }
2593
2594 /* Now we have a worklist of non-SLP stmts, follow use->def chains and
2595 mark any SLP vectorized stmt as hybrid.
2596 ??? We're visiting def stmts N times (once for each non-SLP and
2597 once for each hybrid-SLP use). */
2598 walk_stmt_info wi;
2599 vdhs_data dat;
2600 dat.worklist = &worklist;
2601 dat.loop_vinfo = loop_vinfo;
2602 memset (&wi, 0, sizeof (wi));
2603 wi.info = (void *)&dat;
2604 while (!worklist.is_empty ())
2605 {
2606 stmt_vec_info stmt_info = worklist.pop ();
2607 /* Since SSA operands are not set up for pattern stmts we need
2608 to use walk_gimple_op. */
2609 wi.is_lhs = 0;
2610 walk_gimple_op (stmt_info->stmt, vect_detect_hybrid_slp, &wi);
2611 }
2612 }
2613
2614
2615 /* Initialize a bb_vec_info struct for the statements between
2616 REGION_BEGIN_IN (inclusive) and REGION_END_IN (exclusive). */
2617
2618 _bb_vec_info::_bb_vec_info (gimple_stmt_iterator region_begin_in,
2619 gimple_stmt_iterator region_end_in,
2620 vec_info_shared *shared)
2621 : vec_info (vec_info::bb, init_cost (NULL), shared),
2622 bb (gsi_bb (region_begin_in)),
2623 region_begin (region_begin_in),
2624 region_end (region_end_in)
2625 {
2626 gimple_stmt_iterator gsi;
2627
2628 for (gsi = region_begin; gsi_stmt (gsi) != gsi_stmt (region_end);
2629 gsi_next (&gsi))
2630 {
2631 gimple *stmt = gsi_stmt (gsi);
2632 gimple_set_uid (stmt, 0);
2633 add_stmt (stmt);
2634 }
2635
2636 bb->aux = this;
2637 }
2638
2639
2640 /* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the
2641 stmts in the basic block. */
2642
2643 _bb_vec_info::~_bb_vec_info ()
2644 {
2645 for (gimple_stmt_iterator si = region_begin;
2646 gsi_stmt (si) != gsi_stmt (region_end); gsi_next (&si))
2647 /* Reset region marker. */
2648 gimple_set_uid (gsi_stmt (si), -1);
2649
2650 bb->aux = NULL;
2651 }
2652
2653 /* Subroutine of vect_slp_analyze_node_operations. Handle the root of NODE,
2654 given that child nodes have already been processed, and that
2655 their def types currently match their SLP node's def type. */
2656
2657 static bool
2658 vect_slp_analyze_node_operations_1 (vec_info *vinfo, slp_tree node,
2659 slp_instance node_instance,
2660 stmt_vector_for_cost *cost_vec)
2661 {
2662 stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
2663 gcc_assert (STMT_SLP_TYPE (stmt_info) != loop_vect);
2664
2665 /* Calculate the number of vector statements to be created for the
2666 scalar stmts in this node. For SLP reductions it is equal to the
2667 number of vector statements in the children (which has already been
2668 calculated by the recursive call). Otherwise it is the number of
2669 scalar elements in one scalar iteration (DR_GROUP_SIZE) multiplied by
2670 VF divided by the number of elements in a vector. */
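/* For example, GROUP_SIZE == 4, VF == 2 and V8SI vectors give
   4 * 2 / 8 == 1 vector stmt.  */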
2671 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2672 && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
2673 {
2674 for (unsigned i = 0; i < SLP_TREE_CHILDREN (node).length (); ++i)
2675 if (SLP_TREE_DEF_TYPE (SLP_TREE_CHILDREN (node)[i]) == vect_internal_def)
2676 {
2677 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
2678 = SLP_TREE_NUMBER_OF_VEC_STMTS (SLP_TREE_CHILDREN (node)[i]);
2679 break;
2680 }
2681 }
2682 else
2683 {
2684 poly_uint64 vf;
2685 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
2686 vf = loop_vinfo->vectorization_factor;
2687 else
2688 vf = 1;
2689 unsigned int group_size = SLP_TREE_SCALAR_STMTS (node).length ();
2690 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2691 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
2692 = vect_get_num_vectors (vf * group_size, vectype);
2693 }
2694
2695 bool dummy;
2696 return vect_analyze_stmt (vinfo, stmt_info, &dummy,
2697 node, node_instance, cost_vec);
2698 }
2699
2700 /* Try to build NODE from scalars, returning true on success.
2701 NODE_INSTANCE is the SLP instance that contains NODE. */
2702
2703 static bool
2704 vect_slp_convert_to_external (vec_info *vinfo, slp_tree node,
2705 slp_instance node_instance)
2706 {
2707 stmt_vec_info stmt_info;
2708 unsigned int i;
2709
2710 if (!is_a <bb_vec_info> (vinfo)
2711 || node == SLP_INSTANCE_TREE (node_instance)
2712 || vect_contains_pattern_stmt_p (SLP_TREE_SCALAR_STMTS (node)))
2713 return false;
2714
2715 if (dump_enabled_p ())
2716 dump_printf_loc (MSG_NOTE, vect_location,
2717 "Building vector operands from scalars instead\n");
2718
2719 /* Don't remove and free the child nodes here, since they could be
2720 referenced by other structures. The analysis and scheduling phases
2721 (need to) ignore child nodes of anything that isn't vect_internal_def. */
2722 unsigned int group_size = SLP_TREE_SCALAR_STMTS (node).length ();
2723 SLP_TREE_DEF_TYPE (node) = vect_external_def;
2724 SLP_TREE_SCALAR_OPS (node).safe_grow (group_size);
2725 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2726 {
2727 tree lhs = gimple_get_lhs (vect_orig_stmt (stmt_info)->stmt);
2728 SLP_TREE_SCALAR_OPS (node)[i] = lhs;
2729 }
2730 return true;
2731 }
2732
2733 /* Compute the prologue cost for invariant or constant operands represented
2734 by NODE. */
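/* For example, the invariant operands { x_1, x_1, x_1, x_1 } built
   from a single external SSA name cost one scalar_to_vec (a splat),
   { a_1, b_1, c_1, d_1 } cost one vec_construct, and a vector of
   literal constants is costed as a vector_load from the constant
   pool.  */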
2735
2736 static void
2737 vect_prologue_cost_for_slp (slp_tree node,
2738 stmt_vector_for_cost *cost_vec)
2739 {
2740 /* Without looking at the actual initializer a vector of
2741 constants can be implemented as a load from the constant pool.
2742 When all elements are the same we can use a splat. */
2743 tree vectype = SLP_TREE_VECTYPE (node);
2744 unsigned group_size = SLP_TREE_SCALAR_OPS (node).length ();
2745 unsigned num_vects_to_check;
2746 unsigned HOST_WIDE_INT const_nunits;
2747 unsigned nelt_limit;
2748 if (TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits)
2749 && ! multiple_p (const_nunits, group_size))
2750 {
2751 num_vects_to_check = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
2752 nelt_limit = const_nunits;
2753 }
2754 else
2755 {
2756 /* If either the vector has variable length or the vectors
2757 are composed of repeated whole groups we only need to
2758 cost construction once. All vectors will be the same. */
2759 num_vects_to_check = 1;
2760 nelt_limit = group_size;
2761 }
2762 tree elt = NULL_TREE;
2763 unsigned nelt = 0;
2764 for (unsigned j = 0; j < num_vects_to_check * nelt_limit; ++j)
2765 {
2766 unsigned si = j % group_size;
2767 if (nelt == 0)
2768 elt = SLP_TREE_SCALAR_OPS (node)[si];
2769 /* ??? We're just tracking whether all operands of a single
2770 vector initializer are the same, ideally we'd check if
2771 we emitted the same one already. */
2772 else if (elt != SLP_TREE_SCALAR_OPS (node)[si])
2773 elt = NULL_TREE;
2774 nelt++;
2775 if (nelt == nelt_limit)
2776 {
2777 record_stmt_cost (cost_vec, 1,
2778 SLP_TREE_DEF_TYPE (node) == vect_external_def
2779 ? (elt ? scalar_to_vec : vec_construct)
2780 : vector_load,
2781 NULL, vectype, 0, vect_prologue);
2782 nelt = 0;
2783 }
2784 }
2785 }
2786
2787 /* Analyze statements contained in SLP tree NODE after recursively analyzing
2788 the subtree. NODE_INSTANCE contains NODE and VINFO contains INSTANCE.
2789
2790 Return true if the operations are supported. */
2791
2792 static bool
2793 vect_slp_analyze_node_operations (vec_info *vinfo, slp_tree node,
2794 slp_instance node_instance,
2795 hash_set<slp_tree> &visited,
2796 hash_set<slp_tree> &lvisited,
2797 stmt_vector_for_cost *cost_vec)
2798 {
2799 int i, j;
2800 slp_tree child;
2801
2802 /* Assume we can code-generate all invariants. */
2803 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2804 return true;
2805
2806 /* If we already analyzed the exact same set of scalar stmts we're done.
2807 We share the generated vector stmts for those.
2808 The SLP graph is acyclic so not caching whether we failed or succeeded
2809 doesn't result in any issue since we throw away the lvisited set
2810 when we fail. */
2811 if (visited.contains (node)
2812 || lvisited.add (node))
2813 return true;
2814
2815 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2816 if (!vect_slp_analyze_node_operations (vinfo, child, node_instance,
2817 visited, lvisited, cost_vec))
2818 return false;
2819
2820 /* ??? We have to catch the case late where two first scalar stmts appear
2821 in multiple SLP children with different def type and fail. Remember
2822 original def types first since SLP_TREE_DEF_TYPE doesn't necessarily
2823 match it when that is vect_internal_def. */
2824 auto_vec<vect_def_type, 4> dt;
2825 dt.safe_grow (SLP_TREE_CHILDREN (node).length ());
2826 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
2827 if (SLP_TREE_SCALAR_STMTS (child).length () != 0)
2828 dt[j] = STMT_VINFO_DEF_TYPE (SLP_TREE_SCALAR_STMTS (child)[0]);
2829
2830 /* Push SLP node def-type to stmt operands. */
2831 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
2832 if (SLP_TREE_DEF_TYPE (child) != vect_internal_def
2833 && SLP_TREE_SCALAR_STMTS (child).length () != 0)
2834 STMT_VINFO_DEF_TYPE (SLP_TREE_SCALAR_STMTS (child)[0])
2835 = SLP_TREE_DEF_TYPE (child);
2836
2837 /* Check everything worked out. */
2838 bool res = true;
2839 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
2840 if (SLP_TREE_SCALAR_STMTS (child).length () != 0)
2841 {
2842 if (SLP_TREE_DEF_TYPE (child) != vect_internal_def)
2843 {
2844 if (STMT_VINFO_DEF_TYPE (SLP_TREE_SCALAR_STMTS (child)[0])
2845 != SLP_TREE_DEF_TYPE (child))
2846 res = false;
2847 }
2848 else if (STMT_VINFO_DEF_TYPE (SLP_TREE_SCALAR_STMTS (child)[0])
2849 != dt[j])
2850 res = false;
2851 }
2852 if (!res && dump_enabled_p ())
2853 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2854 "not vectorized: same operand with different "
2855 "def type in stmt.\n");
2856
2857 if (res)
2858 res = vect_slp_analyze_node_operations_1 (vinfo, node, node_instance,
2859 cost_vec);
2860
2861 /* Restore def-types. */
2862 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
2863 if (SLP_TREE_SCALAR_STMTS (child).length () != 0)
2864 STMT_VINFO_DEF_TYPE (SLP_TREE_SCALAR_STMTS (child)[0]) = dt[j];
2865
2866 /* When the node can be vectorized, cost the invariant nodes it references.
2867 This is not done in DFS order to allow the referring node's
2868 vectorizable_* calls to nail down the invariant nodes' vector type
2869 and possibly unshare it if it needs a different vector type than
2870 other referrers. */
2871 if (res)
2872 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
2873 if ((SLP_TREE_DEF_TYPE (child) == vect_constant_def
2874 || SLP_TREE_DEF_TYPE (child) == vect_external_def)
2875 /* Perform usual caching, note code-generation still
2876 code-gens these nodes multiple times but we expect
2877 to CSE them later. */
2878 && !visited.contains (child)
2879 && !lvisited.add (child))
2880 {
2881 /* ??? After auditing more code paths make a "default"
2882 and push the vector type from NODE to all children
2883 if it is not already set. */
2884 /* Compute the number of vectors to be generated. */
2885 tree vector_type = SLP_TREE_VECTYPE (child);
2886 if (!vector_type)
2887 {
2888 /* For shifts with a scalar argument we don't need
2889 to cost or code-generate anything.
2890 ??? Represent this more explicitly. */
2891 gcc_assert ((STMT_VINFO_TYPE (SLP_TREE_SCALAR_STMTS (node)[0])
2892 == shift_vec_info_type)
2893 && j == 1);
2894 continue;
2895 }
2896 unsigned group_size = SLP_TREE_SCALAR_OPS (child).length ();
2897 poly_uint64 vf = 1;
2898 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
2899 vf = loop_vinfo->vectorization_factor;
2900 SLP_TREE_NUMBER_OF_VEC_STMTS (child)
2901 = vect_get_num_vectors (vf * group_size, vector_type);
2902 /* And cost them. */
2903 vect_prologue_cost_for_slp (child, cost_vec);
2904 }
2905
2906 /* If this node can't be vectorized, try pruning the tree here rather
2907 than felling the whole thing. */
2908 if (!res && vect_slp_convert_to_external (vinfo, node, node_instance))
2909 {
2910 /* We'll need to revisit this for invariant costing and number
2911 of vectorized stmt setting. */
2912 lvisited.remove (node);
2913 res = true;
2914 }
2915
2916 return res;
2917 }
2918
2919
2920 /* Analyze statements in SLP instances of VINFO. Return true if the
2921 operations are supported. */
2922
2923 bool
2924 vect_slp_analyze_operations (vec_info *vinfo)
2925 {
2926 slp_instance instance;
2927 int i;
2928
2929 DUMP_VECT_SCOPE ("vect_slp_analyze_operations");
2930
2931 hash_set<slp_tree> visited;
2932 for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
2933 {
2934 hash_set<slp_tree> lvisited;
2935 stmt_vector_for_cost cost_vec;
2936 cost_vec.create (2);
2937 if (!vect_slp_analyze_node_operations (vinfo,
2938 SLP_INSTANCE_TREE (instance),
2939 instance, visited, lvisited,
2940 &cost_vec)
2941 /* Instances with a root stmt require vectorized defs for the
2942 SLP tree root. */
2943 || (SLP_INSTANCE_ROOT_STMT (instance)
2944 && (SLP_TREE_DEF_TYPE (SLP_INSTANCE_TREE (instance))
2945 != vect_internal_def)))
2946 {
2947 slp_tree node = SLP_INSTANCE_TREE (instance);
2948 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2949 if (dump_enabled_p ())
2950 dump_printf_loc (MSG_NOTE, vect_location,
2951 "removing SLP instance operations starting from: %G",
2952 stmt_info->stmt);
2953 vect_free_slp_instance (instance, false);
2954 vinfo->slp_instances.ordered_remove (i);
2955 cost_vec.release ();
2956 }
2957 else
2958 {
2959 for (hash_set<slp_tree>::iterator x = lvisited.begin();
2960 x != lvisited.end(); ++x)
2961 visited.add (*x);
2962 i++;
2963
2964 add_stmt_costs (vinfo, vinfo->target_cost_data, &cost_vec);
2965 cost_vec.release ();
2966 }
2967 }
2968
2969 return !vinfo->slp_instances.is_empty ();
2970 }
2971
2972
2973 /* Compute the scalar cost of the SLP node NODE and its children
2974 and record it in COST_VEC. Do not account defs that are marked in LIFE and
2975 update LIFE according to uses of NODE. */
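/* For example, if a_1 = b_1 * c_1 belongs to the SLP tree but a_1
   also has a use outside of it, the multiplication must stay in the
   scalar code regardless of vectorization, so its cost is not
   counted on the scalar side; this deliberately biases the
   comparison against vectorizing.  */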
2976
2977 static void
2978 vect_bb_slp_scalar_cost (vec_info *vinfo,
2979 slp_tree node, vec<bool, va_heap> *life,
2980 stmt_vector_for_cost *cost_vec,
2981 hash_set<slp_tree> &visited)
2982 {
2983 unsigned i;
2984 stmt_vec_info stmt_info;
2985 slp_tree child;
2986
2987 if (visited.add (node))
2988 return;
2989
2990 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2991 {
2992 gimple *stmt = stmt_info->stmt;
2993 ssa_op_iter op_iter;
2994 def_operand_p def_p;
2995
2996 if ((*life)[i])
2997 continue;
2998
2999 /* If there is a non-vectorized use of the defs then the scalar
3000 stmt is kept live in which case we do not account it or any
3001 required defs in the SLP children in the scalar cost. This
3002 way we make the vectorization more costly when compared to
3003 the scalar cost. */
3004 FOR_EACH_SSA_DEF_OPERAND (def_p, stmt, op_iter, SSA_OP_DEF)
3005 {
3006 imm_use_iterator use_iter;
3007 gimple *use_stmt;
3008 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
3009 if (!is_gimple_debug (use_stmt))
3010 {
3011 stmt_vec_info use_stmt_info = vinfo->lookup_stmt (use_stmt);
3012 if (!use_stmt_info || !PURE_SLP_STMT (use_stmt_info))
3013 {
3014 (*life)[i] = true;
3015 BREAK_FROM_IMM_USE_STMT (use_iter);
3016 }
3017 }
3018 }
3019 if ((*life)[i])
3020 continue;
3021
3022 /* Count scalar stmts only once. */
3023 if (gimple_visited_p (stmt))
3024 continue;
3025 gimple_set_visited (stmt, true);
3026
3027 vect_cost_for_stmt kind;
3028 if (STMT_VINFO_DATA_REF (stmt_info))
3029 {
3030 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
3031 kind = scalar_load;
3032 else
3033 kind = scalar_store;
3034 }
3035 else if (vect_nop_conversion_p (stmt_info))
3036 continue;
3037 else
3038 kind = scalar_stmt;
3039 record_stmt_cost (cost_vec, 1, kind, stmt_info, 0, vect_body);
3040 }
3041
3042 auto_vec<bool, 20> subtree_life;
3043 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3044 {
3045 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
3046 {
3047 /* Do not directly pass LIFE to the recursive call, copy it to
3048 confine changes in the callee to the current child/subtree. */
3049 subtree_life.safe_splice (*life);
3050 vect_bb_slp_scalar_cost (vinfo, child, &subtree_life, cost_vec,
3051 visited);
3052 subtree_life.truncate (0);
3053 }
3054 }
3055 }
3056
3057 /* Check if vectorization of the basic block is profitable. */
3058
3059 static bool
3060 vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo)
3061 {
3062 vec<slp_instance> slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
3063 slp_instance instance;
3064 int i;
3065 unsigned int vec_inside_cost = 0, vec_outside_cost = 0, scalar_cost = 0;
3066 unsigned int vec_prologue_cost = 0, vec_epilogue_cost = 0;
3067
3068 /* Calculate scalar cost. */
3069 stmt_vector_for_cost scalar_costs;
3070 scalar_costs.create (0);
3071 hash_set<slp_tree> visited;
3072 FOR_EACH_VEC_ELT (slp_instances, i, instance)
3073 {
3074 auto_vec<bool, 20> life;
3075 life.safe_grow_cleared
3076 (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance)).length ());
3077 vect_bb_slp_scalar_cost (bb_vinfo,
3078 SLP_INSTANCE_TREE (instance),
3079 &life, &scalar_costs, visited);
3080 }
3081 void *target_cost_data = init_cost (NULL);
3082 add_stmt_costs (bb_vinfo, target_cost_data, &scalar_costs);
3083 scalar_costs.release ();
3084 unsigned dummy;
3085 finish_cost (target_cost_data, &dummy, &scalar_cost, &dummy);
3086 destroy_cost_data (target_cost_data);
3087
3088 /* Unset visited flag. */
3089 for (gimple_stmt_iterator gsi = bb_vinfo->region_begin;
3090 gsi_stmt (gsi) != gsi_stmt (bb_vinfo->region_end); gsi_next (&gsi))
3091 gimple_set_visited (gsi_stmt (gsi), false);
3092
3093 /* Complete the target-specific cost calculation. */
3094 finish_cost (BB_VINFO_TARGET_COST_DATA (bb_vinfo), &vec_prologue_cost,
3095 &vec_inside_cost, &vec_epilogue_cost);
3096
3097 vec_outside_cost = vec_prologue_cost + vec_epilogue_cost;
3098
3099 if (dump_enabled_p ())
3100 {
3101 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3102 dump_printf (MSG_NOTE, " Vector inside of basic block cost: %d\n",
3103 vec_inside_cost);
3104 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n", vec_prologue_cost);
3105 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n", vec_epilogue_cost);
3106 dump_printf (MSG_NOTE, " Scalar cost of basic block: %d\n", scalar_cost);
3107 }
3108
3109 /* Vectorization is profitable if its cost is no more than the cost of the
3110 scalar version. Note that we err on the vector side for equal cost because
3111 the cost estimate is otherwise quite pessimistic (constant uses are
3112 free on the scalar side but cost a load on the vector side for
3113 example). */
3114 if (vec_outside_cost + vec_inside_cost > scalar_cost)
3115 return false;
3116
3117 return true;
3118 }
3119
3120 /* Find any vectorizable constructors and add them to the grouped_stores
3121 array. */
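/* For example, x_1 = {a_1, b_1, c_1, d_1} of type V4SI is recorded
   here so that it can later be treated like a group of four scalar
   stores; uniform constructors such as {a_1, a_1, a_1, a_1} are
   skipped since they are better handled as splats.  */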
3122
3123 static void
3124 vect_slp_check_for_constructors (bb_vec_info bb_vinfo)
3125 {
3126 gimple_stmt_iterator gsi;
3127
3128 for (gsi = bb_vinfo->region_begin;
3129 gsi_stmt (gsi) != gsi_stmt (bb_vinfo->region_end); gsi_next (&gsi))
3130 {
3131 gassign *stmt = dyn_cast <gassign *> (gsi_stmt (gsi));
3132 if (!stmt || gimple_assign_rhs_code (stmt) != CONSTRUCTOR)
3133 continue;
3134
3135 tree rhs = gimple_assign_rhs1 (stmt);
3136 if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
3137 || maybe_ne (TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)),
3138 CONSTRUCTOR_NELTS (rhs))
3139 || VECTOR_TYPE_P (TREE_TYPE (CONSTRUCTOR_ELT (rhs, 0)->value))
3140 || uniform_vector_p (rhs))
3141 continue;
3142
3143 stmt_vec_info stmt_info = bb_vinfo->lookup_stmt (stmt);
3144 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt_info);
3145 }
3146 }
3147
3148 /* Check if the region described by BB_VINFO can be vectorized, returning
3149 true if so. When returning false, set FATAL to true if the same failure
3150 would prevent vectorization at other vector sizes, false if it is still
3151 worth trying other sizes. N_STMTS is the number of statements in the
3152 region. */
3153
3154 static bool
3155 vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, bool &fatal)
3156 {
3157 DUMP_VECT_SCOPE ("vect_slp_analyze_bb");
3158
3159 slp_instance instance;
3160 int i;
3161 poly_uint64 min_vf = 2;
3162
3163 /* The first group of checks is independent of the vector size. */
3164 fatal = true;
3165
3166 /* Analyze the data references. */
3167
3168 if (!vect_analyze_data_refs (bb_vinfo, &min_vf, NULL))
3169 {
3170 if (dump_enabled_p ())
3171 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3172 "not vectorized: unhandled data-ref in basic "
3173 "block.\n");
3174 return false;
3175 }
3176
3177 if (BB_VINFO_DATAREFS (bb_vinfo).length () < 2)
3178 {
3179 if (dump_enabled_p ())
3180 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3181 "not vectorized: not enough data-refs in "
3182 "basic block.\n");
3183 return false;
3184 }
3185
3186 if (!vect_analyze_data_ref_accesses (bb_vinfo))
3187 {
3188 if (dump_enabled_p ())
3189 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3190 "not vectorized: unhandled data access in "
3191 "basic block.\n");
3192 return false;
3193 }
3194
3195 vect_slp_check_for_constructors (bb_vinfo);
3196
3197 /* If there are no grouped stores in the region there is no need
3198 to continue with pattern recog as vect_analyze_slp will fail
3199 anyway. */
3200 if (bb_vinfo->grouped_stores.is_empty ())
3201 {
3202 if (dump_enabled_p ())
3203 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3204 "not vectorized: no grouped stores in "
3205 "basic block.\n");
3206 return false;
3207 }
3208
3209 /* The rest of the analysis below depends on the vector size in some way. */
3210 fatal = false;
3211
3212 vect_pattern_recog (bb_vinfo);
3213
3214 /* Check the SLP opportunities in the basic block, analyze and build SLP
3215 trees. */
3216 if (!vect_analyze_slp (bb_vinfo, n_stmts))
3217 {
3218 if (dump_enabled_p ())
3219 {
3220 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3221 "Failed to SLP the basic block.\n");
3222 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3223 "not vectorized: failed to find SLP opportunities "
3224 "in basic block.\n");
3225 }
3226 return false;
3227 }
3228
3229 /* Optimize permutations. */
3230 vect_optimize_slp (bb_vinfo);
3231
3232 vect_record_base_alignments (bb_vinfo);
3233
3234 /* Analyze and verify the alignment of data references and the
3235 dependence in the SLP instances. */
3236 for (i = 0; BB_VINFO_SLP_INSTANCES (bb_vinfo).iterate (i, &instance); )
3237 {
3238 if (! vect_slp_analyze_and_verify_instance_alignment (bb_vinfo, instance)
3239 || ! vect_slp_analyze_instance_dependence (bb_vinfo, instance))
3240 {
3241 slp_tree node = SLP_INSTANCE_TREE (instance);
3242 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
3243 if (dump_enabled_p ())
3244 dump_printf_loc (MSG_NOTE, vect_location,
3245 "removing SLP instance operations starting from: %G",
3246 stmt_info->stmt);
3247 vect_free_slp_instance (instance, false);
3248 BB_VINFO_SLP_INSTANCES (bb_vinfo).ordered_remove (i);
3249 continue;
3250 }
3251
3252 /* Mark all the statements that we want to vectorize as pure SLP and
3253 relevant. */
3254 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
3255 vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance));
3256 if (SLP_INSTANCE_ROOT_STMT (instance))
3257 STMT_SLP_TYPE (SLP_INSTANCE_ROOT_STMT (instance)) = pure_slp;
3258
3259 i++;
3260 }
3261 if (! BB_VINFO_SLP_INSTANCES (bb_vinfo).length ())
3262 return false;
3263
3264 if (!vect_slp_analyze_operations (bb_vinfo))
3265 {
3266 if (dump_enabled_p ())
3267 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3268 "not vectorized: bad operation in basic block.\n");
3269 return false;
3270 }
3271
3272 /* Cost model: check if the vectorization is worthwhile. */
3273 if (!unlimited_cost_model (NULL)
3274 && !vect_bb_vectorization_profitable_p (bb_vinfo))
3275 {
3276 if (dump_enabled_p ())
3277 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3278 "not vectorized: vectorization is not "
3279 "profitable.\n");
3280 return false;
3281 }
3282
3283 if (dump_enabled_p ())
3284 dump_printf_loc (MSG_NOTE, vect_location,
3285 "Basic block will be vectorized using SLP\n");
3286 return true;
3287 }
3288
3289 /* Subroutine of vect_slp_bb. Try to vectorize the statements between
3290 REGION_BEGIN (inclusive) and REGION_END (exclusive), returning true
3291 on success. The region has N_STMTS statements and has the datarefs
3292 given by DATAREFS. */
3293
3294 static bool
3295 vect_slp_bb_region (gimple_stmt_iterator region_begin,
3296 gimple_stmt_iterator region_end,
3297 vec<data_reference_p> datarefs,
3298 unsigned int n_stmts)
3299 {
3300 bb_vec_info bb_vinfo;
3301 auto_vector_modes vector_modes;
3302
3303 /* Autodetect the first vector mode we try. */
3304 machine_mode next_vector_mode = VOIDmode;
3305 targetm.vectorize.autovectorize_vector_modes (&vector_modes, false);
3306 unsigned int mode_i = 0;
3307
3308 vec_info_shared shared;
3309
3310 machine_mode autodetected_vector_mode = VOIDmode;
3311 while (1)
3312 {
3313 bool vectorized = false;
3314 bool fatal = false;
3315 bb_vinfo = new _bb_vec_info (region_begin, region_end, &shared);
3316
3317 bool first_time_p = shared.datarefs.is_empty ();
3318 BB_VINFO_DATAREFS (bb_vinfo) = datarefs;
3319 if (first_time_p)
3320 bb_vinfo->shared->save_datarefs ();
3321 else
3322 bb_vinfo->shared->check_datarefs ();
3323 bb_vinfo->vector_mode = next_vector_mode;
3324
3325 if (vect_slp_analyze_bb_1 (bb_vinfo, n_stmts, fatal)
3326 && dbg_cnt (vect_slp))
3327 {
3328 if (dump_enabled_p ())
3329 {
3330 dump_printf_loc (MSG_NOTE, vect_location,
3331 "***** Analysis succeeded with vector mode"
3332 " %s\n", GET_MODE_NAME (bb_vinfo->vector_mode));
3333 dump_printf_loc (MSG_NOTE, vect_location, "SLPing BB part\n");
3334 }
3335
3336 bb_vinfo->shared->check_datarefs ();
3337 vect_schedule_slp (bb_vinfo);
3338
3339 unsigned HOST_WIDE_INT bytes;
3340 if (dump_enabled_p ())
3341 {
3342 if (GET_MODE_SIZE (bb_vinfo->vector_mode).is_constant (&bytes))
3343 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
3344 "basic block part vectorized using %wu byte "
3345 "vectors\n", bytes);
3346 else
3347 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
3348 "basic block part vectorized using variable "
3349 "length vectors\n");
3350 }
3351
3352 vectorized = true;
3353 }
3354 else
3355 {
3356 if (dump_enabled_p ())
3357 dump_printf_loc (MSG_NOTE, vect_location,
3358 "***** Analysis failed with vector mode %s\n",
3359 GET_MODE_NAME (bb_vinfo->vector_mode));
3360 }
3361
3362 if (mode_i == 0)
3363 autodetected_vector_mode = bb_vinfo->vector_mode;
3364
3365 if (!fatal)
3366 while (mode_i < vector_modes.length ()
3367 && vect_chooses_same_modes_p (bb_vinfo, vector_modes[mode_i]))
3368 {
3369 if (dump_enabled_p ())
3370 dump_printf_loc (MSG_NOTE, vect_location,
3371 "***** The result for vector mode %s would"
3372 " be the same\n",
3373 GET_MODE_NAME (vector_modes[mode_i]));
3374 mode_i += 1;
3375 }
3376
3377 delete bb_vinfo;
3378
3379 if (mode_i < vector_modes.length ()
3380 && VECTOR_MODE_P (autodetected_vector_mode)
3381 && (related_vector_mode (vector_modes[mode_i],
3382 GET_MODE_INNER (autodetected_vector_mode))
3383 == autodetected_vector_mode)
3384 && (related_vector_mode (autodetected_vector_mode,
3385 GET_MODE_INNER (vector_modes[mode_i]))
3386 == vector_modes[mode_i]))
3387 {
3388 if (dump_enabled_p ())
3389 dump_printf_loc (MSG_NOTE, vect_location,
3390 "***** Skipping vector mode %s, which would"
3391 " repeat the analysis for %s\n",
3392 GET_MODE_NAME (vector_modes[mode_i]),
3393 GET_MODE_NAME (autodetected_vector_mode));
3394 mode_i += 1;
3395 }
3396
3397 if (vectorized
3398 || mode_i == vector_modes.length ()
3399 || autodetected_vector_mode == VOIDmode
3400 /* If vect_slp_analyze_bb_1 signaled that analysis for all
3401 vector sizes will fail do not bother iterating. */
3402 || fatal)
3403 return vectorized;
3404
3405 /* Try the next vector mode in the list. */
3406 next_vector_mode = vector_modes[mode_i++];
3407 if (dump_enabled_p ())
3408 dump_printf_loc (MSG_NOTE, vect_location,
3409 "***** Re-trying analysis with vector mode %s\n",
3410 GET_MODE_NAME (next_vector_mode));
3411 }
3412 }
3413
3414 /* Main entry for the BB vectorizer. Analyze and transform BB, returning
3415 true if anything in the basic-block was vectorized. */
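/* The block is cut into regions at stmts that
   vect_find_stmt_data_reference cannot handle; each maximal region in
   between is analyzed and transformed independently.  */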
3416
3417 bool
3418 vect_slp_bb (basic_block bb)
3419 {
3420 gimple_stmt_iterator gsi;
3421 bool any_vectorized = false;
3422
3423 gsi = gsi_after_labels (bb);
3424 while (!gsi_end_p (gsi))
3425 {
3426 gimple_stmt_iterator region_begin = gsi;
3427 vec<data_reference_p> datarefs = vNULL;
3428 int insns = 0;
3429
3430 for (; !gsi_end_p (gsi); gsi_next (&gsi))
3431 {
3432 gimple *stmt = gsi_stmt (gsi);
3433 if (is_gimple_debug (stmt))
3434 continue;
3435 insns++;
3436
3437 if (gimple_location (stmt) != UNKNOWN_LOCATION)
3438 vect_location = stmt;
3439
3440 if (!vect_find_stmt_data_reference (NULL, stmt, &datarefs))
3441 break;
3442 }
3443
3444 /* Skip leading unhandled stmts. */
3445 if (gsi_stmt (region_begin) == gsi_stmt (gsi))
3446 {
3447 gsi_next (&gsi);
3448 continue;
3449 }
3450
3451 gimple_stmt_iterator region_end = gsi;
3452
3453 if (insns > param_slp_max_insns_in_bb)
3454 {
3455 if (dump_enabled_p ())
3456 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3457 "not vectorized: too many instructions in "
3458 "basic block.\n");
3459 }
3460 else if (vect_slp_bb_region (region_begin, region_end, datarefs, insns))
3461 any_vectorized = true;
3462
3463 if (gsi_end_p (region_end))
3464 break;
3465
3466 /* Skip the unhandled stmt. */
3467 gsi_next (&gsi);
3468 }
3469
3470 return any_vectorized;
3471 }
3472
3473
3474 /* Build a variable-length vector in which the elements in ELTS are repeated
3475 to fill NRESULTS vectors of type VECTOR_TYPE. Store the vectors in
3476 RESULTS and add any new instructions to SEQ.
3477
3478 The approach we use is:
3479
3480 (1) Find a vector mode VM with integer elements of mode IM.
3481
3482 (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
3483 ELTS' has mode IM. This involves creating NELTS' VIEW_CONVERT_EXPRs
3484 from small vectors to IM.
3485
3486 (3) Duplicate each ELTS'[I] into a vector of mode VM.
3487
3488 (4) Use a tree of interleaving VEC_PERM_EXPRs to create VMs with the
3489 correct byte contents.
3490
3491 (5) Use VIEW_CONVERT_EXPR to cast the final VMs to the required type.
3492
3493 We try to find the largest IM for which this sequence works, in order
3494 to cut down on the number of interleaves. */
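/* A worked example (an illustrative sketch, assuming the target chooses
   IM == DImode): for ELTS == {a, b, c, d} with 32-bit elements and an
   SVE result type VNx4SI we get NELTS' == 2 and a sequence like

     x01 = VIEW_CONVERT_EXPR<int64> ({a, b});
     x23 = VIEW_CONVERT_EXPR<int64> ({c, d});
     p0 = {x01, x01, ...};                        dup into a VNx2DI vector
     p1 = {x23, x23, ...};
     t  = VEC_PERM_EXPR <p0, p1, interleave-lo>;  {x01, x23, x01, x23, ...}
     r  = VIEW_CONVERT_EXPR<VNx4SI> (t);          bytes {a, b, c, d, a, ...} */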
3495
3496 void
3497 duplicate_and_interleave (vec_info *vinfo, gimple_seq *seq, tree vector_type,
3498 vec<tree> elts, unsigned int nresults,
3499 vec<tree> &results)
3500 {
3501 unsigned int nelts = elts.length ();
3502 tree element_type = TREE_TYPE (vector_type);
3503
3504 /* (1) Find a vector mode VM with integer elements of mode IM. */
3505 unsigned int nvectors = 1;
3506 tree new_vector_type;
3507 tree permutes[2];
3508 if (!can_duplicate_and_interleave_p (vinfo, nelts, element_type,
3509 &nvectors, &new_vector_type,
3510 permutes))
3511 gcc_unreachable ();
3512
3513 /* Get a vector type that holds ELTS[0:NELTS/NELTS']. */
3514 unsigned int partial_nelts = nelts / nvectors;
3515 tree partial_vector_type = build_vector_type (element_type, partial_nelts);
3516
3517 tree_vector_builder partial_elts;
3518 auto_vec<tree, 32> pieces (nvectors * 2);
3519 pieces.quick_grow (nvectors * 2);
3520 for (unsigned int i = 0; i < nvectors; ++i)
3521 {
3522 /* (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
3523 ELTS' has mode IM. */
3524 partial_elts.new_vector (partial_vector_type, partial_nelts, 1);
3525 for (unsigned int j = 0; j < partial_nelts; ++j)
3526 partial_elts.quick_push (elts[i * partial_nelts + j]);
3527 tree t = gimple_build_vector (seq, &partial_elts);
3528 t = gimple_build (seq, VIEW_CONVERT_EXPR,
3529 TREE_TYPE (new_vector_type), t);
3530
3531 /* (3) Duplicate each ELTS'[I] into a vector of mode VM. */
3532 pieces[i] = gimple_build_vector_from_val (seq, new_vector_type, t);
3533 }
3534
3535 /* (4) Use a tree of VEC_PERM_EXPRs to create a single VM with the
3536 correct byte contents.
3537
3538 We need to repeat the following operation log2(nvectors) times:
3539
3540 out[i * 2] = VEC_PERM_EXPR (in[i], in[i + hi_start], lo_permute);
3541 out[i * 2 + 1] = VEC_PERM_EXPR (in[i], in[i + hi_start], hi_permute);
3542
3543 However, if each input repeats every N elements and the VF is
3544 a multiple of N * 2, the HI result is the same as the LO. */
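/* For instance, with NVECTORS == 4 and duplicated inputs
   A = {a, a, ...}, B = {b, b, ...}, C = {c, c, ...}, D = {d, d, ...}
   the first round produces {a, c, a, c, ...} and {b, d, b, d, ...}
   (HI equals LO here because every input repeats each element), and
   the second round interleaves those into {a, b, c, d, a, b, c, d, ...}. */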
3545 unsigned int in_start = 0;
3546 unsigned int out_start = nvectors;
3547 unsigned int hi_start = nvectors / 2;
3548 /* A bound on the number of outputs needed to produce NRESULTS results
3549 in the final iteration. */
3550 unsigned int noutputs_bound = nvectors * nresults;
3551 for (unsigned int in_repeat = 1; in_repeat < nvectors; in_repeat *= 2)
3552 {
3553 noutputs_bound /= 2;
3554 unsigned int limit = MIN (noutputs_bound, nvectors);
3555 for (unsigned int i = 0; i < limit; ++i)
3556 {
3557 if ((i & 1) != 0
3558 && multiple_p (TYPE_VECTOR_SUBPARTS (new_vector_type),
3559 2 * in_repeat))
3560 {
3561 pieces[out_start + i] = pieces[out_start + i - 1];
3562 continue;
3563 }
3564
3565 tree output = make_ssa_name (new_vector_type);
3566 tree input1 = pieces[in_start + (i / 2)];
3567 tree input2 = pieces[in_start + (i / 2) + hi_start];
3568 gassign *stmt = gimple_build_assign (output, VEC_PERM_EXPR,
3569 input1, input2,
3570 permutes[i & 1]);
3571 gimple_seq_add_stmt (seq, stmt);
3572 pieces[out_start + i] = output;
3573 }
3574 std::swap (in_start, out_start);
3575 }
3576
3577 /* (5) Use VIEW_CONVERT_EXPR to cast the final VM to the required type. */
3578 results.reserve (nresults);
3579 for (unsigned int i = 0; i < nresults; ++i)
3580 if (i < nvectors)
3581 results.quick_push (gimple_build (seq, VIEW_CONVERT_EXPR, vector_type,
3582 pieces[in_start + i]));
3583 else
3584 results.quick_push (results[i - nvectors]);
3585 }
3586
3587
3588 /* For constant and loop-invariant defs in OP_NODE this function creates
3589 the vector defs that will be used in the vectorized stmts and stores
3590 them in SLP_TREE_VEC_DEFS of OP_NODE. */
3591
3592 static void
3593 vect_create_constant_vectors (vec_info *vinfo, slp_tree op_node)
3594 {
3595 unsigned HOST_WIDE_INT nunits;
3596 tree vec_cst;
3597 unsigned j, number_of_places_left_in_vector;
3598 tree vector_type;
3599 tree vop;
3600 int group_size = op_node->ops.length ();
3601 unsigned int vec_num, i;
3602 unsigned number_of_copies = 1;
3603 bool constant_p;
3604 gimple_seq ctor_seq = NULL;
3605 auto_vec<tree, 16> permute_results;
3606
3607 /* We rely on SLP_TREE_VECTYPE (op_node) being correctly set here. */
3608 vector_type = SLP_TREE_VECTYPE (op_node);
3609
3610 unsigned int number_of_vectors = SLP_TREE_NUMBER_OF_VEC_STMTS (op_node);
3611 SLP_TREE_VEC_DEFS (op_node).create (number_of_vectors);
3612 auto_vec<tree> voprnds (number_of_vectors);
3613
3614 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
3615 the created vectors. It is greater than 1 if unrolling is performed.
3616
3617 For example, we have two scalar operands, s1 and s2 (e.g., group of
3618 strided accesses of size two), while NUNITS is four (i.e., four scalars
3619 of this type can be packed in a vector). The output vector will contain
3620 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
3621 will be 2).
3622
3623 If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
3624 containing the operands.
3625
3626 For example, NUNITS is four as before, and the group size is 8
3627 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
3628 {s5, s6, s7, s8}. */
3629
3630 /* When using duplicate_and_interleave, we just need one element for
3631 each scalar statement. */
3632 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
3633 nunits = group_size;
3634
3635 number_of_copies = nunits * number_of_vectors / group_size;
3636
3637 number_of_places_left_in_vector = nunits;
3638 constant_p = true;
3639 tree_vector_builder elts (vector_type, nunits, 1);
3640 elts.quick_grow (nunits);
3641 stmt_vec_info insert_after = NULL;
3642 for (j = 0; j < number_of_copies; j++)
3643 {
3644 tree op;
3645 for (i = group_size - 1; op_node->ops.iterate (i, &op); i--)
3646 {
3647 /* Create 'vect_ = {op0,op1,...,opn}'. */
3648 number_of_places_left_in_vector--;
3649 tree orig_op = op;
3650 if (!types_compatible_p (TREE_TYPE (vector_type), TREE_TYPE (op)))
3651 {
3652 if (CONSTANT_CLASS_P (op))
3653 {
3654 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
3655 {
3656 /* Can't use VIEW_CONVERT_EXPR for booleans because
3657 of possibly different sizes of scalar value and
3658 vector element. */
3659 if (integer_zerop (op))
3660 op = build_int_cst (TREE_TYPE (vector_type), 0);
3661 else if (integer_onep (op))
3662 op = build_all_ones_cst (TREE_TYPE (vector_type));
3663 else
3664 gcc_unreachable ();
3665 }
3666 else
3667 op = fold_unary (VIEW_CONVERT_EXPR,
3668 TREE_TYPE (vector_type), op);
3669 gcc_assert (op && CONSTANT_CLASS_P (op));
3670 }
3671 else
3672 {
3673 tree new_temp = make_ssa_name (TREE_TYPE (vector_type));
3674 gimple *init_stmt;
3675 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
3676 {
3677 tree true_val
3678 = build_all_ones_cst (TREE_TYPE (vector_type));
3679 tree false_val
3680 = build_zero_cst (TREE_TYPE (vector_type));
3681 gcc_assert (INTEGRAL_TYPE_P (TREE_TYPE (op)));
3682 init_stmt = gimple_build_assign (new_temp, COND_EXPR,
3683 op, true_val,
3684 false_val);
3685 }
3686 else
3687 {
3688 op = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (vector_type),
3689 op);
3690 init_stmt
3691 = gimple_build_assign (new_temp, VIEW_CONVERT_EXPR,
3692 op);
3693 }
3694 gimple_seq_add_stmt (&ctor_seq, init_stmt);
3695 op = new_temp;
3696 }
3697 }
3698 elts[number_of_places_left_in_vector] = op;
3699 if (!CONSTANT_CLASS_P (op))
3700 constant_p = false;
3701 /* For BB vectorization we have to compute an insert location
3702 when a def is inside the analyzed region since we cannot
3703 simply insert at the BB start in this case. */
3704 stmt_vec_info opdef;
3705 if (TREE_CODE (orig_op) == SSA_NAME
3706 && !SSA_NAME_IS_DEFAULT_DEF (orig_op)
3707 && is_a <bb_vec_info> (vinfo)
3708 && (opdef = vinfo->lookup_def (orig_op)))
3709 {
3710 if (!insert_after)
3711 insert_after = opdef;
3712 else
3713 insert_after = get_later_stmt (insert_after, opdef);
3714 }
3715
3716 if (number_of_places_left_in_vector == 0)
3717 {
3718 if (constant_p
3719 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
3720 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
3721 vec_cst = gimple_build_vector (&ctor_seq, &elts);
3722 else
3723 {
3724 if (permute_results.is_empty ())
3725 duplicate_and_interleave (vinfo, &ctor_seq, vector_type,
3726 elts, number_of_vectors,
3727 permute_results);
3728 vec_cst = permute_results[number_of_vectors - j - 1];
3729 }
3730 tree init;
3731 if (insert_after)
3732 {
3733 gimple_stmt_iterator gsi = gsi_for_stmt (insert_after->stmt);
3734 /* vect_init_vector inserts before. */
3735 gsi_next (&gsi);
3736 init = vect_init_vector (vinfo, NULL, vec_cst,
3737 vector_type, &gsi);
3738 }
3739 else
3740 init = vect_init_vector (vinfo, NULL, vec_cst,
3741 vector_type, NULL);
3742 if (ctor_seq != NULL)
3743 {
3744 gimple_stmt_iterator gsi
3745 = gsi_for_stmt (SSA_NAME_DEF_STMT (init));
3746 gsi_insert_seq_before (&gsi, ctor_seq, GSI_SAME_STMT);
3747 ctor_seq = NULL;
3748 }
3749 voprnds.quick_push (init);
3750 insert_after = NULL;
3751 number_of_places_left_in_vector = nunits;
3752 constant_p = true;
3753 elts.new_vector (vector_type, nunits, 1);
3754 elts.quick_grow (nunits);
3755 }
3756 }
3757 }
3758
3759 /* Since the vectors are created in reverse order, push them back in
3760 reverse so the defs end up in the original order. */
3761 vec_num = voprnds.length ();
3762 for (j = vec_num; j != 0; j--)
3763 {
3764 vop = voprnds[j - 1];
3765 SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
3766 }
3767
3768 /* If the VF is greater than the unrolling factor needed for the SLP
3769 group of stmts, the NUMBER_OF_VECTORS to be created is greater than
3770 NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
3771 to replicate the vectors. */
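/* E.g. (an illustrative case), with the two defs {v0, v1} created above
   and NUMBER_OF_VECTORS == 4, the loop below extends the defs to
   {v0, v1, v0, v1}. */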
3772 while (number_of_vectors > SLP_TREE_VEC_DEFS (op_node).length ())
3773 for (i = 0; SLP_TREE_VEC_DEFS (op_node).iterate (i, &vop) && i < vec_num;
3774 i++)
3775 SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
3776 }
3777
3778
3779 /* Get the vectorized definitions from SLP_NODE, which contains the
3780 corresponding vectorized def-stmts. */
3781
3782 static void
3783 vect_get_slp_vect_defs (slp_tree slp_node, vec<tree> *vec_oprnds)
3784 {
3785 stmt_vec_info vec_def_stmt_info;
3786 unsigned int i;
3787
3788 gcc_assert (SLP_TREE_VEC_STMTS (slp_node).exists ());
3789
3790 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (slp_node), i, vec_def_stmt_info)
3791 vec_oprnds->quick_push (gimple_get_lhs (vec_def_stmt_info->stmt));
3792 }
3793
3794
3795 /* Get N vectorized definitions for SLP_NODE. */
3796
3797 void
3798 vect_get_slp_defs (vec_info *,
3799 slp_tree slp_node, vec<vec<tree> > *vec_oprnds, unsigned n)
3800 {
3801 if (n == -1U)
3802 n = SLP_TREE_CHILDREN (slp_node).length ();
3803
3804 for (unsigned i = 0; i < n; ++i)
3805 {
3806 slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
3807
3808 vec<tree> vec_defs = vNULL;
3809
3810 /* For each operand we check if it has vectorized definitions in a child
3811 node or we need to create them (for invariants and constants). */
3812 vec_defs.create (SLP_TREE_NUMBER_OF_VEC_STMTS (child));
3813 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
3814 vect_get_slp_vect_defs (child, &vec_defs);
3815 else
3816 vec_defs.splice (SLP_TREE_VEC_DEFS (child));
3817
3818 vec_oprnds->quick_push (vec_defs);
3819 }
3820 }
3821
3822 /* Generate vector permute statements from a list of loads in DR_CHAIN.
3823 If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
3824 permute statements for the SLP node NODE. */
3825
3826 bool
3827 vect_transform_slp_perm_load (vec_info *vinfo,
3828 slp_tree node, vec<tree> dr_chain,
3829 gimple_stmt_iterator *gsi, poly_uint64 vf,
3830 bool analyze_only, unsigned *n_perms)
3831 {
3832 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
3833 int vec_index = 0;
3834 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3835 unsigned int group_size = SLP_TREE_SCALAR_STMTS (node).length ();
3836 unsigned int mask_element;
3837 machine_mode mode;
3838
3839 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
3840 return false;
3841
3842 stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
3843
3844 mode = TYPE_MODE (vectype);
3845 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
3846
3847 /* Initialize the vect stmts of NODE so that the generated stmts can
3848 be inserted properly later. */
3849 if (! analyze_only)
3850 for (unsigned i = SLP_TREE_VEC_STMTS (node).length ();
3851 i < SLP_TREE_NUMBER_OF_VEC_STMTS (node); i++)
3852 SLP_TREE_VEC_STMTS (node).quick_push (NULL);
3853
3854 /* Generate permutation masks for every NODE. The number of masks for
3855 each NODE is equal to GROUP_SIZE.
3856 E.g., we have a group of three nodes with three loads from the same
3857 location in each node, and the vector size is 4. I.e., we have an
3858 a0b0c0a1b1c1... sequence and we need to create the following vectors:
3859 for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
3860 for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
3861 ...
3862
3863 The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}.
3864 The last mask is illegal since we assume two operands for a permute
3865 operation, so mask element values must be below 2 * NUNITS (here 8).
3866 Hence, the last mask must be converted into {2,5,5,5}.
3867 For the first two permutations we need the first and the second input
3868 vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
3869 we need the second and the third vectors: {b1,c1,a2,b2} and
3870 {c2,a3,b3,c3}. */
3871
3872 int vect_stmts_counter = 0;
3873 unsigned int index = 0;
3874 int first_vec_index = -1;
3875 int second_vec_index = -1;
3876 bool noop_p = true;
3877 *n_perms = 0;
3878
3879 vec_perm_builder mask;
3880 unsigned int nelts_to_build;
3881 unsigned int nvectors_per_build;
3882 bool repeating_p = (group_size == DR_GROUP_SIZE (stmt_info)
3883 && multiple_p (nunits, group_size));
3884 if (repeating_p)
3885 {
3886 /* A single vector contains a whole number of copies of the node, so:
3887 (a) all permutes can use the same mask; and
3888 (b) the permutes only need a single vector input. */
3889 mask.new_vector (nunits, group_size, 3);
3890 nelts_to_build = mask.encoded_nelts ();
3891 nvectors_per_build = SLP_TREE_VEC_STMTS (node).length ();
3892 }
3893 else
3894 {
3895 /* We need to construct a separate mask for each vector statement. */
3896 unsigned HOST_WIDE_INT const_nunits, const_vf;
3897 if (!nunits.is_constant (&const_nunits)
3898 || !vf.is_constant (&const_vf))
3899 return false;
3900 mask.new_vector (const_nunits, const_nunits, 1);
3901 nelts_to_build = const_vf * group_size;
3902 nvectors_per_build = 1;
3903 }
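/* An illustrative example for the repeating case: GROUP_SIZE == 2,
   DR_GROUP_SIZE == 2, a load permutation of {1, 0} and NUNITS == 8
   yield the single mask {1, 0, 3, 2, 5, 4, ...}, i.e. a swap of
   adjacent elements within one vector. */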
3904
3905 unsigned int count = mask.encoded_nelts ();
3906 mask.quick_grow (count);
3907 vec_perm_indices indices;
3908
3909 for (unsigned int j = 0; j < nelts_to_build; j++)
3910 {
3911 unsigned int iter_num = j / group_size;
3912 unsigned int stmt_num = j % group_size;
3913 unsigned int i = (iter_num * DR_GROUP_SIZE (stmt_info)
3914 + SLP_TREE_LOAD_PERMUTATION (node)[stmt_num]);
3915 if (repeating_p)
3916 {
3917 first_vec_index = 0;
3918 mask_element = i;
3919 }
3920 else
3921 {
3922 /* Enforced before the loop when !repeating_p. */
3923 unsigned int const_nunits = nunits.to_constant ();
3924 vec_index = i / const_nunits;
3925 mask_element = i % const_nunits;
3926 if (vec_index == first_vec_index
3927 || first_vec_index == -1)
3928 {
3929 first_vec_index = vec_index;
3930 }
3931 else if (vec_index == second_vec_index
3932 || second_vec_index == -1)
3933 {
3934 second_vec_index = vec_index;
3935 mask_element += const_nunits;
3936 }
3937 else
3938 {
3939 if (dump_enabled_p ())
3940 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3941 "permutation requires at "
3942 "least three vectors %G",
3943 stmt_info->stmt);
3944 gcc_assert (analyze_only);
3945 return false;
3946 }
3947
3948 gcc_assert (mask_element < 2 * const_nunits);
3949 }
3950
3951 if (mask_element != index)
3952 noop_p = false;
3953 mask[index++] = mask_element;
3954
3955 if (index == count && !noop_p)
3956 {
3957 indices.new_vector (mask, second_vec_index == -1 ? 1 : 2, nunits);
3958 if (!can_vec_perm_const_p (mode, indices))
3959 {
3960 if (dump_enabled_p ())
3961 {
3962 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
3963 vect_location,
3964 "unsupported vect permute { ");
3965 for (i = 0; i < count; ++i)
3966 {
3967 dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
3968 dump_printf (MSG_MISSED_OPTIMIZATION, " ");
3969 }
3970 dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
3971 }
3972 gcc_assert (analyze_only);
3973 return false;
3974 }
3975
3976 ++*n_perms;
3977 }
3978
3979 if (index == count)
3980 {
3981 if (!analyze_only)
3982 {
3983 tree mask_vec = NULL_TREE;
3984
3985 if (! noop_p)
3986 mask_vec = vect_gen_perm_mask_checked (vectype, indices);
3987
3988 if (second_vec_index == -1)
3989 second_vec_index = first_vec_index;
3990
3991 for (unsigned int ri = 0; ri < nvectors_per_build; ++ri)
3992 {
3993 /* Generate the permute statement if necessary. */
3994 tree first_vec = dr_chain[first_vec_index + ri];
3995 tree second_vec = dr_chain[second_vec_index + ri];
3996 stmt_vec_info perm_stmt_info;
3997 if (! noop_p)
3998 {
3999 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
4000 tree perm_dest
4001 = vect_create_destination_var (gimple_assign_lhs (stmt),
4002 vectype);
4003 perm_dest = make_ssa_name (perm_dest);
4004 gassign *perm_stmt
4005 = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
4006 first_vec, second_vec,
4007 mask_vec);
4008 perm_stmt_info
4009 = vect_finish_stmt_generation (vinfo,
4010 stmt_info, perm_stmt,
4011 gsi);
4012 }
4013 else
4014 /* If the mask is NULL_TREE, generate the requested
4015 identity transform. */
4016 perm_stmt_info = vinfo->lookup_def (first_vec);
4017
4018 /* Store the vector statement in NODE. */
4019 SLP_TREE_VEC_STMTS (node)[vect_stmts_counter++]
4020 = perm_stmt_info;
4021 }
4022 }
4023
4024 index = 0;
4025 first_vec_index = -1;
4026 second_vec_index = -1;
4027 noop_p = true;
4028 }
4029 }
4030
4031 return true;
4032 }
4033
4034 /* Vectorize the SLP instance tree rooted at NODE in postorder. */
4035
4036 static void
4037 vect_schedule_slp_instance (vec_info *vinfo,
4038 slp_tree node, slp_instance instance)
4039 {
4040 gimple_stmt_iterator si;
4041 int i, j;
4042 slp_tree child;
4043
4044 /* See if we have already vectorized the node in the graph of the
4045 SLP instance. */
4046 if ((SLP_TREE_DEF_TYPE (node) == vect_internal_def
4047 && SLP_TREE_VEC_STMTS (node).exists ())
4048 || SLP_TREE_VEC_DEFS (node).exists ())
4049 return;
4050
4051 /* Vectorize externals and constants. */
4052 if (SLP_TREE_DEF_TYPE (node) == vect_constant_def
4053 || SLP_TREE_DEF_TYPE (node) == vect_external_def)
4054 {
4055 /* ??? vectorizable_shift can end up using a scalar operand which is
4056 currently denoted as !SLP_TREE_VECTYPE. No need to vectorize the
4057 node in this case. */
4058 if (!SLP_TREE_VECTYPE (node))
4059 return;
4060
4061 vect_create_constant_vectors (vinfo, node);
4062 return;
4063 }
4064
4065 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4066 vect_schedule_slp_instance (vinfo, child, instance);
4067
4068 /* Push SLP node def-type to stmts. */
4069 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4070 if (SLP_TREE_DEF_TYPE (child) != vect_internal_def)
4071 {
4072 stmt_vec_info child_stmt_info;
4073 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (child), j, child_stmt_info)
4074 STMT_VINFO_DEF_TYPE (child_stmt_info) = SLP_TREE_DEF_TYPE (child);
4075 }
4076
4077 stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
4078
4079 /* VECTYPE is the type of the destination. */
4080 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4081 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
4082 unsigned group_size = SLP_TREE_SCALAR_STMTS (node).length ();
4083
4084 gcc_assert (SLP_TREE_NUMBER_OF_VEC_STMTS (node) != 0);
4085 SLP_TREE_VEC_STMTS (node).create (SLP_TREE_NUMBER_OF_VEC_STMTS (node));
4086
4087 if (dump_enabled_p ())
4088 dump_printf_loc (MSG_NOTE, vect_location,
4089 "------>vectorizing SLP node starting from: %G",
4090 stmt_info->stmt);
4091
4092 /* Vectorized stmts go before the last scalar stmt, which is where
4093 all uses are ready. */
4094 stmt_vec_info last_stmt_info = vect_find_last_scalar_stmt_in_slp (node);
4095 si = gsi_for_stmt (last_stmt_info->stmt);
4096
4097 /* Handle two-operation SLP nodes by vectorizing the group with
4098 both operations and then performing a merge. */
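/* An illustrative example (assuming NUNITS == 4): for the group
   { a0 + b0, a1 - b1, a2 + b2, a3 - b3 } the node is transformed once
   with PLUS_EXPR and once with MINUS_EXPR, and the results are blended
   with

     VEC_PERM_EXPR <v_plus, v_minus, { 0, 5, 2, 7 }>

   so even lanes come from the PLUS vector and odd lanes from the
   MINUS vector. */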
4099 bool done_p = false;
4100 if (SLP_TREE_TWO_OPERATORS (node))
4101 {
4102 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
4103 enum tree_code code0 = gimple_assign_rhs_code (stmt);
4104 enum tree_code ocode = ERROR_MARK;
4105 stmt_vec_info ostmt_info;
4106 vec_perm_builder mask (group_size, group_size, 1);
4107 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, ostmt_info)
4108 {
4109 gassign *ostmt = as_a <gassign *> (ostmt_info->stmt);
4110 if (gimple_assign_rhs_code (ostmt) != code0)
4111 {
4112 mask.quick_push (1);
4113 ocode = gimple_assign_rhs_code (ostmt);
4114 }
4115 else
4116 mask.quick_push (0);
4117 }
4118 if (ocode != ERROR_MARK)
4119 {
4120 vec<stmt_vec_info> v0;
4121 vec<stmt_vec_info> v1;
4122 unsigned j;
4123 tree tmask = NULL_TREE;
4124 vect_transform_stmt (vinfo, stmt_info, &si, node, instance);
4125 v0 = SLP_TREE_VEC_STMTS (node).copy ();
4126 SLP_TREE_VEC_STMTS (node).truncate (0);
4127 gimple_assign_set_rhs_code (stmt, ocode);
4128 vect_transform_stmt (vinfo, stmt_info, &si, node, instance);
4129 gimple_assign_set_rhs_code (stmt, code0);
4130 v1 = SLP_TREE_VEC_STMTS (node).copy ();
4131 SLP_TREE_VEC_STMTS (node).truncate (0);
4132 tree meltype = build_nonstandard_integer_type
4133 (GET_MODE_BITSIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype))), 1);
4134 tree mvectype = get_same_sized_vectype (meltype, vectype);
4135 unsigned k = 0, l;
4136 for (j = 0; j < v0.length (); ++j)
4137 {
4138 /* Enforced by vect_build_slp_tree, which rejects variable-length
4139 vectors for SLP_TREE_TWO_OPERATORS. */
4140 unsigned int const_nunits = nunits.to_constant ();
4141 tree_vector_builder melts (mvectype, const_nunits, 1);
4142 for (l = 0; l < const_nunits; ++l)
4143 {
4144 if (k >= group_size)
4145 k = 0;
4146 tree t = build_int_cst (meltype,
4147 mask[k++] * const_nunits + l);
4148 melts.quick_push (t);
4149 }
4150 tmask = melts.build ();
4151
4152 /* ??? Not all targets support a VEC_PERM_EXPR with a
4153 constant mask that would translate to a vec_merge RTX
4154 (with their vec_perm_const_ok). We can either not
4155 vectorize in that case or let veclower do its job.
4156 Unfortunately that isn't too great and at least for
4157 plus/minus we'd eventually like to match targets'
4158 vector addsub instructions. */
4159 gimple *vstmt;
4160 vstmt = gimple_build_assign (make_ssa_name (vectype),
4161 VEC_PERM_EXPR,
4162 gimple_assign_lhs (v0[j]->stmt),
4163 gimple_assign_lhs (v1[j]->stmt),
4164 tmask);
4165 SLP_TREE_VEC_STMTS (node).quick_push
4166 (vect_finish_stmt_generation (vinfo, stmt_info, vstmt, &si));
4167 }
4168 v0.release ();
4169 v1.release ();
4170 done_p = true;
4171 }
4172 }
4173 if (!done_p)
4174 vect_transform_stmt (vinfo, stmt_info, &si, node, instance);
4175
4176 /* Restore stmt def-types. */
4177 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4178 if (SLP_TREE_DEF_TYPE (child) != vect_internal_def)
4179 {
4180 stmt_vec_info child_stmt_info;
4181 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (child), j, child_stmt_info)
4182 STMT_VINFO_DEF_TYPE (child_stmt_info) = vect_internal_def;
4183 }
4184 }
4185
4186 /* Replace scalar calls from SLP node NODE by setting their lhs to zero.
4187 For loop vectorization this is done in vectorizable_call, but for SLP
4188 it needs to be deferred until the end of vect_schedule_slp, because
4189 multiple SLP instances may refer to the same scalar stmt. */
4190
4191 static void
4192 vect_remove_slp_scalar_calls (vec_info *vinfo,
4193 slp_tree node, hash_set<slp_tree> &visited)
4194 {
4195 gimple *new_stmt;
4196 gimple_stmt_iterator gsi;
4197 int i;
4198 slp_tree child;
4199 tree lhs;
4200 stmt_vec_info stmt_info;
4201
4202 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
4203 return;
4204
4205 if (visited.add (node))
4206 return;
4207
4208 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4209 vect_remove_slp_scalar_calls (vinfo, child, visited);
4210
4211 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
4212 {
4213 gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt);
4214 if (!stmt || gimple_bb (stmt) == NULL)
4215 continue;
4216 if (is_pattern_stmt_p (stmt_info)
4217 || !PURE_SLP_STMT (stmt_info))
4218 continue;
4219 lhs = gimple_call_lhs (stmt);
4220 new_stmt = gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs)));
4221 gsi = gsi_for_stmt (stmt);
4222 vinfo->replace_stmt (&gsi, stmt_info, new_stmt);
4223 SSA_NAME_DEF_STMT (gimple_assign_lhs (new_stmt)) = new_stmt;
4224 }
4225 }
4226
4227 static void
4228 vect_remove_slp_scalar_calls (vec_info *vinfo, slp_tree node)
4229 {
4230 hash_set<slp_tree> visited;
4231 vect_remove_slp_scalar_calls (vinfo, node, visited);
4232 }
4233
4234 /* Vectorize the instance root. */
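/* E.g. (an illustrative case), for a BB SLP instance rooted at a vector
   CONSTRUCTOR

     x = {_1, _2, _3, _4};

   the CONSTRUCTOR is replaced by the lhs of the single vectorized stmt,
   or, if several vector stmts were generated, by a CONSTRUCTOR of their
   lhs values. */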
4235
4236 void
4237 vectorize_slp_instance_root_stmt (slp_tree node, slp_instance instance)
4238 {
4239 gassign *rstmt = NULL;
4240
4241 if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) == 1)
4242 {
4243 stmt_vec_info child_stmt_info;
4244 int j;
4245
4246 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (node), j, child_stmt_info)
4247 {
4248 tree vect_lhs = gimple_get_lhs (child_stmt_info->stmt);
4249 tree root_lhs = gimple_get_lhs (instance->root_stmt->stmt);
4250 if (!useless_type_conversion_p (TREE_TYPE (root_lhs),
4251 TREE_TYPE (vect_lhs)))
4252 vect_lhs = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (root_lhs),
4253 vect_lhs);
4254 rstmt = gimple_build_assign (root_lhs, vect_lhs);
4255 break;
4256 }
4257 }
4258 else if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) > 1)
4259 {
4260 int nelts = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
4261 stmt_vec_info child_stmt_info;
4262 int j;
4263 vec<constructor_elt, va_gc> *v;
4264 vec_alloc (v, nelts);
4265
4266 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (node), j, child_stmt_info)
4267 {
4268 CONSTRUCTOR_APPEND_ELT (v,
4269 NULL_TREE,
4270 gimple_get_lhs (child_stmt_info->stmt));
4271 }
4272 tree lhs = gimple_get_lhs (instance->root_stmt->stmt);
4273 tree rtype = TREE_TYPE (gimple_assign_rhs1 (instance->root_stmt->stmt));
4274 tree r_constructor = build_constructor (rtype, v);
4275 rstmt = gimple_build_assign (lhs, r_constructor);
4276 }
4277
4278 gcc_assert (rstmt);
4279
4280 gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmt->stmt);
4281 gsi_replace (&rgsi, rstmt, true);
4282 }
4283
4284 /* Generate vector code for all SLP instances in the loop/basic block. */
4285
4286 void
4287 vect_schedule_slp (vec_info *vinfo)
4288 {
4289 vec<slp_instance> slp_instances;
4290 slp_instance instance;
4291 unsigned int i;
4292
4293 slp_instances = vinfo->slp_instances;
4294 FOR_EACH_VEC_ELT (slp_instances, i, instance)
4295 {
4296 slp_tree node = SLP_INSTANCE_TREE (instance);
4297 /* Schedule the tree of INSTANCE. */
4298 vect_schedule_slp_instance (vinfo, node, instance);
4299
4300 if (SLP_INSTANCE_ROOT_STMT (instance))
4301 vectorize_slp_instance_root_stmt (node, instance);
4302
4303 if (dump_enabled_p ())
4304 dump_printf_loc (MSG_NOTE, vect_location,
4305 "vectorizing stmts using SLP.\n");
4306 }
4307
4308 FOR_EACH_VEC_ELT (slp_instances, i, instance)
4309 {
4310 slp_tree root = SLP_INSTANCE_TREE (instance);
4311 stmt_vec_info store_info;
4312 unsigned int j;
4313
4314 /* Remove scalar call stmts. Do not do this for basic-block
4315 vectorization as not all uses may be vectorized.
4316 ??? Why should this be necessary? DCE should be able to
4317 remove the stmts itself.
4318 ??? For BB vectorization we can as well remove scalar
4319 stmts starting from the SLP tree root if they have no
4320 uses. */
4321 if (is_a <loop_vec_info> (vinfo))
4322 vect_remove_slp_scalar_calls (vinfo, root);
4323
4324 for (j = 0; SLP_TREE_SCALAR_STMTS (root).iterate (j, &store_info); j++)
4325 {
4326 if (!STMT_VINFO_DATA_REF (store_info))
4327 break;
4328
4329 if (SLP_INSTANCE_ROOT_STMT (instance))
4330 continue;
4331
4332 store_info = vect_orig_stmt (store_info);
4333 /* Free the attached stmt_vec_info and remove the stmt. */
4334 vinfo->remove_stmt (store_info);
4335 }
4336 }
4337 }