/* SLP - Basic Block Vectorization
   Copyright (C) 2007-2019 Free Software Foundation, Inc.
   Contributed by Dorit Naishlos <dorit@il.ibm.com>
   and Ira Rosen <irar@il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */

#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "target.h"
#include "rtl.h"
#include "tree.h"
#include "gimple.h"
#include "tree-pass.h"
#include "ssa.h"
#include "optabs-tree.h"
#include "insn-config.h"
#include "recog.h"  /* FIXME: for insn_data */
#include "fold-const.h"
#include "stor-layout.h"
#include "gimple-iterator.h"
#include "cfgloop.h"
#include "tree-vectorizer.h"
#include "langhooks.h"
#include "gimple-walk.h"
#include "dbgcnt.h"
#include "tree-vector-builder.h"
#include "vec-perm-indices.h"
#include "gimple-fold.h"
#include "internal-fn.h"


/* Recursively free the memory allocated for the SLP tree rooted at NODE.
   FINAL_P is true if we have vectorized the instance or if we have
   made a final decision not to vectorize the statements in any way.  */

static void
vect_free_slp_tree (slp_tree node, bool final_p)
{
  int i;
  slp_tree child;

  if (--node->refcnt != 0)
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_free_slp_tree (child, final_p);

  /* Don't update STMT_VINFO_NUM_SLP_USES if it isn't relevant.
     Some statements might no longer exist, after having been
     removed by vect_transform_stmt.  Updating the remaining
     statements would be redundant.  */
  if (!final_p)
    {
      stmt_vec_info stmt_info;
      FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
        {
          gcc_assert (STMT_VINFO_NUM_SLP_USES (stmt_info) > 0);
          STMT_VINFO_NUM_SLP_USES (stmt_info)--;
        }
    }

  SLP_TREE_CHILDREN (node).release ();
  SLP_TREE_SCALAR_STMTS (node).release ();
  SLP_TREE_SCALAR_OPS (node).release ();
  SLP_TREE_VEC_STMTS (node).release ();
  SLP_TREE_LOAD_PERMUTATION (node).release ();

  free (node);
}

/* Free the memory allocated for the SLP instance.  FINAL_P is true if we
   have vectorized the instance or if we have made a final decision not
   to vectorize the statements in any way.  */

void
vect_free_slp_instance (slp_instance instance, bool final_p)
{
  vect_free_slp_tree (SLP_INSTANCE_TREE (instance), final_p);
  SLP_INSTANCE_LOADS (instance).release ();
  free (instance);
}


/* Create an SLP node for SCALAR_STMTS.  */

static slp_tree
vect_create_new_slp_node (vec<stmt_vec_info> scalar_stmts)
{
  slp_tree node;
  stmt_vec_info stmt_info = scalar_stmts[0];
  unsigned int nops;

  if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
    nops = gimple_call_num_args (stmt);
  else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
    {
      nops = gimple_num_ops (stmt) - 1;
      if (gimple_assign_rhs_code (stmt) == COND_EXPR)
        nops++;
    }
  else if (is_a <gphi *> (stmt_info->stmt))
    nops = 0;
  else
    return NULL;

  node = XNEW (struct _slp_tree);
  SLP_TREE_SCALAR_STMTS (node) = scalar_stmts;
  SLP_TREE_SCALAR_OPS (node) = vNULL;
  SLP_TREE_VEC_STMTS (node).create (0);
  SLP_TREE_NUMBER_OF_VEC_STMTS (node) = 0;
  SLP_TREE_CHILDREN (node).create (nops);
  SLP_TREE_LOAD_PERMUTATION (node) = vNULL;
  SLP_TREE_TWO_OPERATORS (node) = false;
  SLP_TREE_DEF_TYPE (node) = vect_internal_def;
  node->refcnt = 1;
  node->max_nunits = 1;

  unsigned i;
  FOR_EACH_VEC_ELT (scalar_stmts, i, stmt_info)
    STMT_VINFO_NUM_SLP_USES (stmt_info)++;

  return node;
}

/* Create an SLP node for OPS.  */

static slp_tree
vect_create_new_slp_node (vec<tree> ops)
{
  slp_tree node;

  node = XNEW (struct _slp_tree);
  SLP_TREE_SCALAR_STMTS (node) = vNULL;
  SLP_TREE_SCALAR_OPS (node) = ops;
  SLP_TREE_VEC_STMTS (node).create (0);
  SLP_TREE_NUMBER_OF_VEC_STMTS (node) = 0;
  SLP_TREE_CHILDREN (node) = vNULL;
  SLP_TREE_LOAD_PERMUTATION (node) = vNULL;
  SLP_TREE_TWO_OPERATORS (node) = false;
  SLP_TREE_DEF_TYPE (node) = vect_external_def;
  node->refcnt = 1;
  node->max_nunits = 1;

  return node;
}


/* This structure is used in creation of an SLP tree.  Each instance
   corresponds to the same operand in a group of scalar stmts in an SLP
   node.  */
typedef struct _slp_oprnd_info
{
  /* Def-stmts for the operands.  */
  vec<stmt_vec_info> def_stmts;
  /* Operands.  */
  vec<tree> ops;
  /* Information about the first statement: its vector def-type and the
     type of its first operand, plus an indication whether any def in
     this operand group comes from a pattern stmt.  */
  tree first_op_type;
  enum vect_def_type first_dt;
  bool any_pattern;
} *slp_oprnd_info;


/* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
   operand.  */
static vec<slp_oprnd_info>
vect_create_oprnd_info (int nops, int group_size)
{
  int i;
  slp_oprnd_info oprnd_info;
  vec<slp_oprnd_info> oprnds_info;

  oprnds_info.create (nops);
  for (i = 0; i < nops; i++)
    {
      oprnd_info = XNEW (struct _slp_oprnd_info);
      oprnd_info->def_stmts.create (group_size);
      oprnd_info->ops.create (group_size);
      oprnd_info->first_dt = vect_uninitialized_def;
      oprnd_info->first_op_type = NULL_TREE;
      oprnd_info->any_pattern = false;
      oprnds_info.quick_push (oprnd_info);
    }

  return oprnds_info;
}


/* Free operands info.  */

static void
vect_free_oprnd_info (vec<slp_oprnd_info> &oprnds_info)
{
  int i;
  slp_oprnd_info oprnd_info;

  FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
    {
      oprnd_info->def_stmts.release ();
      oprnd_info->ops.release ();
      XDELETE (oprnd_info);
    }

  oprnds_info.release ();
}


/* Return true if STMTS contains a pattern statement.  */

static bool
vect_contains_pattern_stmt_p (vec<stmt_vec_info> stmts)
{
  stmt_vec_info stmt_info;
  unsigned int i;
  FOR_EACH_VEC_ELT (stmts, i, stmt_info)
    if (is_pattern_stmt_p (stmt_info))
      return true;
  return false;
}

/* Find the place of the data-ref in STMT_INFO in the interleaving chain
   that starts from FIRST_STMT_INFO.  Return -1 if the data-ref is not a part
   of the chain.  */
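/* Note that the place accounts for gaps: each step along the chain advances
   by DR_GROUP_GAP of the next element, so the returned index reflects the
   element's position in the access pattern rather than its ordinal number
   in the chain.  */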

int
vect_get_place_in_interleaving_chain (stmt_vec_info stmt_info,
                                      stmt_vec_info first_stmt_info)
{
  stmt_vec_info next_stmt_info = first_stmt_info;
  int result = 0;

  if (first_stmt_info != DR_GROUP_FIRST_ELEMENT (stmt_info))
    return -1;

  do
    {
      if (next_stmt_info == stmt_info)
        return result;
      next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
      if (next_stmt_info)
        result += DR_GROUP_GAP (next_stmt_info);
    }
  while (next_stmt_info);

  return -1;
}

/* Check whether it is possible to load COUNT elements of type ELT_TYPE
   using the method implemented by duplicate_and_interleave.  Return true
   if so, returning the number of intermediate vectors in *NVECTORS_OUT
   (if nonnull) and the type of each intermediate vector in *VECTOR_TYPE_OUT
   (if nonnull).  */
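/* For example, with COUNT == 4 and a 4-byte ELT_TYPE the loop below first
   looks for a vector whose elements are 128-bit integers (NVECTORS == 1),
   then 64-bit integers (NVECTORS == 2), and so on, halving the fused
   element size and doubling the number of intermediate vectors on each
   iteration.  */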

bool
can_duplicate_and_interleave_p (vec_info *vinfo, unsigned int count,
                                tree elt_type, unsigned int *nvectors_out,
                                tree *vector_type_out,
                                tree *permutes)
{
  tree base_vector_type = get_vectype_for_scalar_type (vinfo, elt_type, count);
  if (!base_vector_type || !VECTOR_MODE_P (TYPE_MODE (base_vector_type)))
    return false;

  machine_mode base_vector_mode = TYPE_MODE (base_vector_type);
  poly_int64 elt_bytes = count * GET_MODE_UNIT_SIZE (base_vector_mode);
  unsigned int nvectors = 1;
  for (;;)
    {
      scalar_int_mode int_mode;
      poly_int64 elt_bits = elt_bytes * BITS_PER_UNIT;
      if (int_mode_for_size (elt_bits, 1).exists (&int_mode))
        {
          /* Get the natural vector type for this SLP group size.  */
          tree int_type = build_nonstandard_integer_type
            (GET_MODE_BITSIZE (int_mode), 1);
          tree vector_type
            = get_vectype_for_scalar_type (vinfo, int_type, count);
          if (vector_type
              && VECTOR_MODE_P (TYPE_MODE (vector_type))
              && known_eq (GET_MODE_SIZE (TYPE_MODE (vector_type)),
                           GET_MODE_SIZE (base_vector_mode)))
            {
              /* Try fusing consecutive sequences of COUNT / NVECTORS elements
                 together into elements of type INT_TYPE and using the result
                 to build NVECTORS vectors.  */
              poly_uint64 nelts = GET_MODE_NUNITS (TYPE_MODE (vector_type));
              vec_perm_builder sel1 (nelts, 2, 3);
              vec_perm_builder sel2 (nelts, 2, 3);
              poly_int64 half_nelts = exact_div (nelts, 2);
              for (unsigned int i = 0; i < 3; ++i)
                {
                  sel1.quick_push (i);
                  sel1.quick_push (i + nelts);
                  sel2.quick_push (half_nelts + i);
                  sel2.quick_push (half_nelts + i + nelts);
                }
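              /* E.g. for 8 elements per vector this builds
                 sel1 = { 0, 8, 1, 9, 2, 10, 3, 11 } and
                 sel2 = { 4, 12, 5, 13, 6, 14, 7, 15 }, i.e. the permutes
                 that interleave the low and the high halves of the two
                 input vectors.  */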
              vec_perm_indices indices1 (sel1, 2, nelts);
              vec_perm_indices indices2 (sel2, 2, nelts);
              if (can_vec_perm_const_p (TYPE_MODE (vector_type), indices1)
                  && can_vec_perm_const_p (TYPE_MODE (vector_type), indices2))
                {
                  if (nvectors_out)
                    *nvectors_out = nvectors;
                  if (vector_type_out)
                    *vector_type_out = vector_type;
                  if (permutes)
                    {
                      permutes[0] = vect_gen_perm_mask_checked (vector_type,
                                                                indices1);
                      permutes[1] = vect_gen_perm_mask_checked (vector_type,
                                                                indices2);
                    }
                  return true;
                }
            }
        }
      if (!multiple_p (elt_bytes, 2, &elt_bytes))
        return false;
      nvectors *= 2;
    }
}

/* Get the defs for the rhs of STMT (collect them in OPRNDS_INFO), check that
   they are of a valid type and that they match the defs of the first stmt of
   the SLP group (stored in OPRNDS_INFO).  This function tries to match stmts
   by swapping operands of STMTS[STMT_NUM] when possible.  A non-zero *SWAP
   indicates that a swap is required for cond_expr stmts.  Specifically,
   *SWAP is 1 if STMT is a cond and the operands of the comparison need to be
   swapped; *SWAP is 2 if STMT is a cond and the code of the comparison needs
   to be inverted.  If any operands are swapped in this function, *SWAP is
   set to a non-zero value.
   If there was a fatal error, return -1; if the error could be corrected by
   swapping operands of this node's parent, return 1; if everything is ok,
   return 0.  */
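/* For example (assuming the usual COND_EXPR layout), when matching
   x = a < b ? c : d against a first stmt y = p > q ? r : s, *SWAP == 1
   records that a < b can be rewritten as b > a, while *SWAP == 2 records
   that the comparison can be inverted to a >= b provided the arms c and d
   are exchanged as well.  */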
static int
vect_get_and_check_slp_defs (vec_info *vinfo, unsigned char *swap,
                             vec<stmt_vec_info> stmts, unsigned stmt_num,
                             vec<slp_oprnd_info> *oprnds_info)
{
  stmt_vec_info stmt_info = stmts[stmt_num];
  tree oprnd;
  unsigned int i, number_of_oprnds;
  enum vect_def_type dt = vect_uninitialized_def;
  slp_oprnd_info oprnd_info;
  int first_op_idx = 1;
  unsigned int commutative_op = -1U;
  bool first_op_cond = false;
  bool first = stmt_num == 0;

  if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
    {
      number_of_oprnds = gimple_call_num_args (stmt);
      first_op_idx = 3;
      if (gimple_call_internal_p (stmt))
        {
          internal_fn ifn = gimple_call_internal_fn (stmt);
          commutative_op = first_commutative_argument (ifn);

          /* Masked load, only look at mask.  */
          if (ifn == IFN_MASK_LOAD)
            {
              number_of_oprnds = 1;
              /* Mask operand index.  */
              first_op_idx = 5;
            }
        }
    }
  else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
    {
      enum tree_code code = gimple_assign_rhs_code (stmt);
      number_of_oprnds = gimple_num_ops (stmt) - 1;
      /* Swapping can only be done for cond_expr if asked to, otherwise we
         could end up with a different comparison code from that of the
         first stmt.  */
      if (code == COND_EXPR
          && COMPARISON_CLASS_P (gimple_assign_rhs1 (stmt)))
        {
          first_op_cond = true;
          number_of_oprnds++;
        }
      else
        commutative_op = commutative_tree_code (code) ? 0U : -1U;
    }
  else
    return -1;

  bool swapped = (*swap != 0);
  gcc_assert (!swapped || first_op_cond);
  for (i = 0; i < number_of_oprnds; i++)
    {
again:
      if (first_op_cond)
        {
          /* Map indicating how operands of cond_expr should be swapped.  */
          int maps[3][4] = {{0, 1, 2, 3}, {1, 0, 2, 3}, {0, 1, 3, 2}};
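          /* Row 0 is the identity; row 1 (*swap == 1) exchanges the two
             operands of the embedded comparison; row 2 (*swap == 2)
             exchanges the then and else arms of the cond_expr.  */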
          int *map = maps[*swap];

          if (i < 2)
            oprnd = TREE_OPERAND (gimple_op (stmt_info->stmt,
                                             first_op_idx), map[i]);
          else
            oprnd = gimple_op (stmt_info->stmt, map[i]);
        }
      else
        oprnd = gimple_op (stmt_info->stmt, first_op_idx + (swapped ? !i : i));
      if (TREE_CODE (oprnd) == VIEW_CONVERT_EXPR)
        oprnd = TREE_OPERAND (oprnd, 0);

      oprnd_info = (*oprnds_info)[i];

      stmt_vec_info def_stmt_info;
      if (!vect_is_simple_use (oprnd, vinfo, &dt, &def_stmt_info))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "Build SLP failed: can't analyze def for %T\n",
                             oprnd);

          return -1;
        }

      if (def_stmt_info && is_pattern_stmt_p (def_stmt_info))
        oprnd_info->any_pattern = true;

      if (first)
        {
          /* For the swapping logic below force vect_reduction_def
             for the reduction op in a SLP reduction group.  */
          if (!STMT_VINFO_DATA_REF (stmt_info)
              && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
              && (int)i == STMT_VINFO_REDUC_IDX (stmt_info)
              && def_stmt_info)
            dt = vect_reduction_def;
          oprnd_info->first_dt = dt;
          oprnd_info->first_op_type = TREE_TYPE (oprnd);
        }
      else
        {
          /* Not the first stmt of the group; check that the def-stmt/s match
             the def-stmt/s of the first stmt.  Allow different definition
             types for reduction chains: the first stmt must be a
             vect_reduction_def (a phi node), and the rest are members of
             the reduction chain.  */
          tree type = TREE_TYPE (oprnd);
          if ((oprnd_info->first_dt != dt
               && !(oprnd_info->first_dt == vect_reduction_def
                    && !STMT_VINFO_DATA_REF (stmt_info)
                    && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
                    && def_stmt_info
                    && !STMT_VINFO_DATA_REF (def_stmt_info)
                    && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
                        == REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
               && !((oprnd_info->first_dt == vect_external_def
                     || oprnd_info->first_dt == vect_constant_def)
                    && (dt == vect_external_def
                        || dt == vect_constant_def)))
              || !types_compatible_p (oprnd_info->first_op_type, type)
              || (!STMT_VINFO_DATA_REF (stmt_info)
                  && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
                  && ((!def_stmt_info
                       || STMT_VINFO_DATA_REF (def_stmt_info)
                       || (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
                           != REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
                      != (oprnd_info->first_dt != vect_reduction_def))))
            {
              /* Try swapping operands if we got a mismatch.  */
              if (i == commutative_op && !swapped)
                {
                  if (dump_enabled_p ())
                    dump_printf_loc (MSG_NOTE, vect_location,
                                     "trying swapped operands\n");
                  swapped = true;
                  goto again;
                }

              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "Build SLP failed: different types\n");

              return 1;
            }
          if ((dt == vect_constant_def
               || dt == vect_external_def)
              && !GET_MODE_SIZE (vinfo->vector_mode).is_constant ()
              && (TREE_CODE (type) == BOOLEAN_TYPE
                  || !can_duplicate_and_interleave_p (vinfo, stmts.length (),
                                                      type)))
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "Build SLP failed: invalid type of def "
                                 "for variable-length SLP %T\n", oprnd);
              return -1;
            }
        }

      /* Check the types of the definitions.  */
      switch (dt)
        {
        case vect_external_def:
          /* Make sure to demote the overall operand to external.  */
          oprnd_info->first_dt = vect_external_def;
          /* Fallthru.  */
        case vect_constant_def:
          oprnd_info->def_stmts.quick_push (NULL);
          oprnd_info->ops.quick_push (oprnd);
          break;

        case vect_internal_def:
        case vect_reduction_def:
          if (oprnd_info->first_dt == vect_reduction_def
              && !STMT_VINFO_DATA_REF (stmt_info)
              && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
              && !STMT_VINFO_DATA_REF (def_stmt_info)
              && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
                  == REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
            {
              /* For a SLP reduction chain we want to duplicate the
                 reduction to each of the chain members.  That gets us
                 a sane SLP graph (even though the stmts are not 100%
                 correct wrt the initial values).  */
              gcc_assert (!first);
              oprnd_info->def_stmts.quick_push (oprnd_info->def_stmts[0]);
              oprnd_info->ops.quick_push (oprnd_info->ops[0]);
              break;
            }
          /* Fallthru.  */
        case vect_induction_def:
          oprnd_info->def_stmts.quick_push (def_stmt_info);
          oprnd_info->ops.quick_push (oprnd);
          break;

        default:
          /* FORNOW: Not supported.  */
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "Build SLP failed: illegal type of def %T\n",
                             oprnd);

          return -1;
        }
    }

  /* Swap operands.  */
  if (swapped)
    {
      if (first_op_cond)
        {
          /* If there are already uses of this stmt in a SLP instance then
             we've committed to the operand order and can't swap it.  */
          if (STMT_VINFO_NUM_SLP_USES (stmt_info) != 0)
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "Build SLP failed: cannot swap operands of "
                                 "shared stmt %G", stmt_info->stmt);
              return -1;
            }

          /* To get rid of this swapping we have to move the stmt code
             to the SLP tree as well (and gather it here per stmt).  */
          gassign *stmt = as_a <gassign *> (stmt_info->stmt);
          tree cond = gimple_assign_rhs1 (stmt);
          enum tree_code code = TREE_CODE (cond);

          /* Swap.  */
          if (*swap == 1)
            {
              swap_ssa_operands (stmt, &TREE_OPERAND (cond, 0),
                                 &TREE_OPERAND (cond, 1));
              TREE_SET_CODE (cond, swap_tree_comparison (code));
            }
          /* Invert.  */
          else
            {
              swap_ssa_operands (stmt, gimple_assign_rhs2_ptr (stmt),
                                 gimple_assign_rhs3_ptr (stmt));
              if (STMT_VINFO_REDUC_IDX (stmt_info) == 1)
                STMT_VINFO_REDUC_IDX (stmt_info) = 2;
              else if (STMT_VINFO_REDUC_IDX (stmt_info) == 2)
                STMT_VINFO_REDUC_IDX (stmt_info) = 1;
              bool honor_nans = HONOR_NANS (TREE_OPERAND (cond, 0));
              code = invert_tree_comparison (TREE_CODE (cond), honor_nans);
              gcc_assert (code != ERROR_MARK);
              TREE_SET_CODE (cond, code);
            }
        }
      else
        {
          /* Commutative ops need not reflect swapping, ops are in
             the SLP tree.  */
        }
      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "swapped operands to match def types in %G",
                         stmt_info->stmt);
    }

  *swap = swapped;
  return 0;
}

/* Try to assign vector type VECTYPE to STMT_INFO for BB vectorization.
   Return true if we can, meaning that this choice doesn't conflict with
   existing SLP nodes that use STMT_INFO.  */

static bool
vect_update_shared_vectype (stmt_vec_info stmt_info, tree vectype)
{
  tree old_vectype = STMT_VINFO_VECTYPE (stmt_info);
  if (old_vectype && useless_type_conversion_p (vectype, old_vectype))
    return true;

  if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
      && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
    {
      /* We maintain the invariant that if any statement in the group is
         used, all other members of the group have the same vector type.  */
      stmt_vec_info first_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
      stmt_vec_info member_info = first_info;
      for (; member_info; member_info = DR_GROUP_NEXT_ELEMENT (member_info))
        if (STMT_VINFO_NUM_SLP_USES (member_info) > 0
            || is_pattern_stmt_p (member_info))
          break;

      if (!member_info)
        {
          for (member_info = first_info; member_info;
               member_info = DR_GROUP_NEXT_ELEMENT (member_info))
            STMT_VINFO_VECTYPE (member_info) = vectype;
          return true;
        }
    }
  else if (STMT_VINFO_NUM_SLP_USES (stmt_info) == 0
           && !is_pattern_stmt_p (stmt_info))
    {
      STMT_VINFO_VECTYPE (stmt_info) = vectype;
      return true;
    }

  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                       "Build SLP failed: incompatible vector"
                       " types for: %G", stmt_info->stmt);
      dump_printf_loc (MSG_NOTE, vect_location,
                       "  old vector type: %T\n", old_vectype);
      dump_printf_loc (MSG_NOTE, vect_location,
                       "  new vector type: %T\n", vectype);
    }
  return false;
}

/* Try to infer and assign a vector type to all the statements in STMTS.
   Used only for BB vectorization.  */

static bool
vect_update_all_shared_vectypes (vec<stmt_vec_info> stmts)
{
  tree vectype, nunits_vectype;
  if (!vect_get_vector_types_for_stmt (stmts[0], &vectype,
                                       &nunits_vectype, stmts.length ()))
    return false;

  stmt_vec_info stmt_info;
  unsigned int i;
  FOR_EACH_VEC_ELT (stmts, i, stmt_info)
    if (!vect_update_shared_vectype (stmt_info, vectype))
      return false;

  return true;
}

/* Return true if call statements CALL1 and CALL2 are similar enough
   to be combined into the same SLP group.  */

static bool
compatible_calls_p (gcall *call1, gcall *call2)
{
  unsigned int nargs = gimple_call_num_args (call1);
  if (nargs != gimple_call_num_args (call2))
    return false;

  if (gimple_call_combined_fn (call1) != gimple_call_combined_fn (call2))
    return false;

  if (gimple_call_internal_p (call1))
    {
      if (!types_compatible_p (TREE_TYPE (gimple_call_lhs (call1)),
                               TREE_TYPE (gimple_call_lhs (call2))))
        return false;
      for (unsigned int i = 0; i < nargs; ++i)
        if (!types_compatible_p (TREE_TYPE (gimple_call_arg (call1, i)),
                                 TREE_TYPE (gimple_call_arg (call2, i))))
          return false;
    }
  else
    {
      if (!operand_equal_p (gimple_call_fn (call1),
                            gimple_call_fn (call2), 0))
        return false;

      if (gimple_call_fntype (call1) != gimple_call_fntype (call2))
        return false;
    }
  return true;
}

/* A subroutine of vect_build_slp_tree for checking VECTYPE, which is the
   caller's attempt to find the vector type in STMT_INFO with the narrowest
   element type.  Return true if VECTYPE is nonnull and if it is valid
   for STMT_INFO.  When returning true, update MAX_NUNITS to reflect the
   number of units in VECTYPE.  GROUP_SIZE and MAX_NUNITS are as for
   vect_build_slp_tree.  */

static bool
vect_record_max_nunits (stmt_vec_info stmt_info, unsigned int group_size,
                        tree vectype, poly_uint64 *max_nunits)
{
  if (!vectype)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "Build SLP failed: unsupported data-type in %G\n",
                         stmt_info->stmt);
      /* Fatal mismatch.  */
      return false;
    }

  /* If populating the vector type requires unrolling then fail
     before adjusting *max_nunits for basic-block vectorization.  */
  poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
  unsigned HOST_WIDE_INT const_nunits;
  if (STMT_VINFO_BB_VINFO (stmt_info)
      && (!nunits.is_constant (&const_nunits)
          || const_nunits > group_size))
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "Build SLP failed: unrolling required "
                         "in basic block SLP\n");
      /* Fatal mismatch.  */
      return false;
    }

  /* In case of multiple types we need to detect the smallest type.  */
  vect_update_max_nunits (max_nunits, vectype);
  return true;
}

/* STMTS is a group of GROUP_SIZE SLP statements in which some
   statements do the same operation as the first statement and in which
   the others do ALT_STMT_CODE.  Return true if we can take one vector
   of the first operation and one vector of the second and permute them
   to get the required result.  VECTYPE is the type of the vector that
   would be permuted.  */
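/* For example, a group { a[0] + b[0], a[1] - b[1], a[2] + b[2],
   a[3] - b[3] } can be realized as a vector addition and a vector
   subtraction followed by a blend selecting elements 0 and 2 from the
   sum and elements 1 and 3 from the difference, provided the target
   supports the corresponding constant permute.  */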

static bool
vect_two_operations_perm_ok_p (vec<stmt_vec_info> stmts,
                               unsigned int group_size, tree vectype,
                               tree_code alt_stmt_code)
{
  unsigned HOST_WIDE_INT count;
  if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&count))
    return false;

  vec_perm_builder sel (count, count, 1);
  for (unsigned int i = 0; i < count; ++i)
    {
      unsigned int elt = i;
      gassign *stmt = as_a <gassign *> (stmts[i % group_size]->stmt);
      if (gimple_assign_rhs_code (stmt) == alt_stmt_code)
        elt += count;
      sel.quick_push (elt);
    }
  vec_perm_indices indices (sel, 2, count);
  return can_vec_perm_const_p (TYPE_MODE (vectype), indices);
}
/* Verify whether the scalar stmts STMTS are isomorphic, do not require
   a data permutation, and are of supported types of operation.  Return
   true if they are, otherwise return false and indicate in *MATCHES
   which stmts are not isomorphic to the first one.  If MATCHES[0]
   is false then this indicates that the comparison could not be
   carried out or that the stmts will never be vectorized by SLP.

   Note COND_EXPR is possibly isomorphic to another one after swapping its
   operands.  Set SWAP[i] to 1 if stmt I is COND_EXPR and isomorphic to
   the first stmt by swapping the two operands of the comparison; set SWAP[i]
   to 2 if stmt I is isomorphic to the first stmt by inverting the code
   of the comparison.  Taking A1 >= B1 ? X1 : Y1 as an example, it can be
   swapped to (B1 <= A1 ? X1 : Y1), or inverted to (A1 < B1) ? Y1 : X1.  */

static bool
vect_build_slp_tree_1 (unsigned char *swap,
                       vec<stmt_vec_info> stmts, unsigned int group_size,
                       poly_uint64 *max_nunits, bool *matches,
                       bool *two_operators)
{
  unsigned int i;
  stmt_vec_info first_stmt_info = stmts[0];
  enum tree_code first_stmt_code = ERROR_MARK;
  enum tree_code alt_stmt_code = ERROR_MARK;
  enum tree_code rhs_code = ERROR_MARK;
  enum tree_code first_cond_code = ERROR_MARK;
  tree lhs;
  bool need_same_oprnds = false;
  tree vectype = NULL_TREE, first_op1 = NULL_TREE;
  optab optab;
  int icode;
  machine_mode optab_op2_mode;
  machine_mode vec_mode;
  stmt_vec_info first_load = NULL, prev_first_load = NULL;
  bool load_p = false;

  /* For every stmt in NODE find its def stmt/s.  */
  stmt_vec_info stmt_info;
  FOR_EACH_VEC_ELT (stmts, i, stmt_info)
    {
      vec_info *vinfo = stmt_info->vinfo;
      gimple *stmt = stmt_info->stmt;
      swap[i] = 0;
      matches[i] = false;

      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location, "Build SLP for %G", stmt);

      /* Fail to vectorize statements marked as unvectorizable.  */
      if (!STMT_VINFO_VECTORIZABLE (stmt_info))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "Build SLP failed: unvectorizable statement %G",
                             stmt);
          /* Fatal mismatch.  */
          matches[0] = false;
          return false;
        }

      lhs = gimple_get_lhs (stmt);
      if (lhs == NULL_TREE)
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "Build SLP failed: not GIMPLE_ASSIGN nor "
                             "GIMPLE_CALL %G", stmt);
          /* Fatal mismatch.  */
          matches[0] = false;
          return false;
        }

      tree nunits_vectype;
      if (!vect_get_vector_types_for_stmt (stmt_info, &vectype,
                                           &nunits_vectype, group_size)
          || (nunits_vectype
              && !vect_record_max_nunits (stmt_info, group_size,
                                          nunits_vectype, max_nunits)))
        {
          /* Fatal mismatch.  */
          matches[0] = false;
          return false;
        }

      gcc_assert (vectype);

      if (is_a <bb_vec_info> (vinfo)
          && !vect_update_shared_vectype (stmt_info, vectype))
        continue;

      if (gcall *call_stmt = dyn_cast <gcall *> (stmt))
        {
          rhs_code = CALL_EXPR;

          if (gimple_call_internal_p (stmt, IFN_MASK_LOAD))
            load_p = true;
          else if ((gimple_call_internal_p (call_stmt)
                    && (!vectorizable_internal_fn_p
                        (gimple_call_internal_fn (call_stmt))))
                   || gimple_call_tail_p (call_stmt)
                   || gimple_call_noreturn_p (call_stmt)
                   || !gimple_call_nothrow_p (call_stmt)
                   || gimple_call_chain (call_stmt))
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "Build SLP failed: unsupported call type %G",
                                 call_stmt);
              /* Fatal mismatch.  */
              matches[0] = false;
              return false;
            }
        }
      else
        {
          rhs_code = gimple_assign_rhs_code (stmt);
          load_p = TREE_CODE_CLASS (rhs_code) == tcc_reference;
        }

      /* Check the operation.  */
      if (i == 0)
        {
          first_stmt_code = rhs_code;

          /* Shift arguments should be equal in all the packed stmts for a
             vector shift with scalar shift operand.  */
          if (rhs_code == LSHIFT_EXPR || rhs_code == RSHIFT_EXPR
              || rhs_code == LROTATE_EXPR
              || rhs_code == RROTATE_EXPR)
            {
              if (vectype == boolean_type_node)
                {
                  if (dump_enabled_p ())
                    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                     "Build SLP failed: shift of a"
                                     " boolean.\n");
                  /* Fatal mismatch.  */
                  matches[0] = false;
                  return false;
                }

              vec_mode = TYPE_MODE (vectype);

              /* First see if we have a vector/vector shift.  */
              optab = optab_for_tree_code (rhs_code, vectype,
                                           optab_vector);

              if (!optab
                  || optab_handler (optab, vec_mode) == CODE_FOR_nothing)
                {
                  /* No vector/vector shift, try for a vector/scalar shift.  */
                  optab = optab_for_tree_code (rhs_code, vectype,
                                               optab_scalar);

                  if (!optab)
                    {
                      if (dump_enabled_p ())
                        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                         "Build SLP failed: no optab.\n");
                      /* Fatal mismatch.  */
                      matches[0] = false;
                      return false;
                    }
                  icode = (int) optab_handler (optab, vec_mode);
                  if (icode == CODE_FOR_nothing)
                    {
                      if (dump_enabled_p ())
                        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                         "Build SLP failed: "
                                         "op not supported by target.\n");
                      /* Fatal mismatch.  */
                      matches[0] = false;
                      return false;
                    }
                  optab_op2_mode = insn_data[icode].operand[2].mode;
                  if (!VECTOR_MODE_P (optab_op2_mode))
                    {
                      need_same_oprnds = true;
                      first_op1 = gimple_assign_rhs2 (stmt);
                    }
                }
            }
          else if (rhs_code == WIDEN_LSHIFT_EXPR)
            {
              need_same_oprnds = true;
              first_op1 = gimple_assign_rhs2 (stmt);
            }
        }
      else
        {
          if (first_stmt_code != rhs_code
              && alt_stmt_code == ERROR_MARK)
            alt_stmt_code = rhs_code;
          if (first_stmt_code != rhs_code
              && (first_stmt_code != IMAGPART_EXPR
                  || rhs_code != REALPART_EXPR)
              && (first_stmt_code != REALPART_EXPR
                  || rhs_code != IMAGPART_EXPR)
              /* Handle mismatches in plus/minus by computing both
                 and merging the results.  */
              && !((first_stmt_code == PLUS_EXPR
                    || first_stmt_code == MINUS_EXPR)
                   && (alt_stmt_code == PLUS_EXPR
                       || alt_stmt_code == MINUS_EXPR)
                   && rhs_code == alt_stmt_code)
              && !(STMT_VINFO_GROUPED_ACCESS (stmt_info)
                   && (first_stmt_code == ARRAY_REF
                       || first_stmt_code == BIT_FIELD_REF
                       || first_stmt_code == INDIRECT_REF
                       || first_stmt_code == COMPONENT_REF
                       || first_stmt_code == MEM_REF)))
            {
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "Build SLP failed: different operation "
                                   "in stmt %G", stmt);
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "original stmt %G", first_stmt_info->stmt);
                }
              /* Mismatch.  */
              continue;
            }

          if (need_same_oprnds
              && !operand_equal_p (first_op1, gimple_assign_rhs2 (stmt), 0))
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "Build SLP failed: different shift "
                                 "arguments in %G", stmt);
              /* Mismatch.  */
              continue;
            }

          if (!load_p && rhs_code == CALL_EXPR)
            {
              if (!compatible_calls_p (as_a <gcall *> (stmts[0]->stmt),
                                       as_a <gcall *> (stmt)))
                {
                  if (dump_enabled_p ())
                    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                     "Build SLP failed: different calls in %G",
                                     stmt);
                  /* Mismatch.  */
                  continue;
                }
            }
        }

      /* Grouped store or load.  */
      if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
        {
          if (REFERENCE_CLASS_P (lhs))
            {
              /* Store.  */
              ;
            }
          else
            {
              /* Load.  */
              first_load = DR_GROUP_FIRST_ELEMENT (stmt_info);
              if (prev_first_load)
                {
                  /* Check that there are no loads from different interleaving
                     chains in the same node.  */
                  if (prev_first_load != first_load)
                    {
                      if (dump_enabled_p ())
                        dump_printf_loc (MSG_MISSED_OPTIMIZATION,
                                         vect_location,
                                         "Build SLP failed: different "
                                         "interleaving chains in one node %G",
                                         stmt);
                      /* Mismatch.  */
                      continue;
                    }
                }
              else
                prev_first_load = first_load;
            }
        } /* Grouped access.  */
      else
        {
          if (load_p)
            {
              /* Non-grouped load.  */
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "Build SLP failed: not grouped load %G", stmt);

              /* FORNOW: Non-grouped loads are not supported.  */
              /* Fatal mismatch.  */
              matches[0] = false;
              return false;
            }

          /* Not a memory operation.  */
          if (TREE_CODE_CLASS (rhs_code) != tcc_binary
              && TREE_CODE_CLASS (rhs_code) != tcc_unary
              && TREE_CODE_CLASS (rhs_code) != tcc_expression
              && TREE_CODE_CLASS (rhs_code) != tcc_comparison
              && rhs_code != VIEW_CONVERT_EXPR
              && rhs_code != CALL_EXPR)
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "Build SLP failed: operation unsupported %G",
                                 stmt);
              /* Fatal mismatch.  */
              matches[0] = false;
              return false;
            }

          if (rhs_code == COND_EXPR)
            {
              tree cond_expr = gimple_assign_rhs1 (stmt);
              enum tree_code cond_code = TREE_CODE (cond_expr);
              enum tree_code swap_code = ERROR_MARK;
              enum tree_code invert_code = ERROR_MARK;

              if (i == 0)
                first_cond_code = TREE_CODE (cond_expr);
              else if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
                {
                  bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
                  swap_code = swap_tree_comparison (cond_code);
                  invert_code = invert_tree_comparison (cond_code, honor_nans);
                }

              if (first_cond_code == cond_code)
                ;
              /* Isomorphic can be achieved by swapping.  */
              else if (first_cond_code == swap_code)
                swap[i] = 1;
              /* Isomorphic can be achieved by inverting.  */
              else if (first_cond_code == invert_code)
                swap[i] = 2;
              else
                {
                  if (dump_enabled_p ())
                    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                     "Build SLP failed: different"
                                     " operation %G", stmt);
                  /* Mismatch.  */
                  continue;
                }
            }
        }

      matches[i] = true;
    }

  for (i = 0; i < group_size; ++i)
    if (!matches[i])
      return false;

  /* If we allowed a two-operation SLP node verify the target can cope
     with the permute we are going to use.  */
  if (alt_stmt_code != ERROR_MARK
      && TREE_CODE_CLASS (alt_stmt_code) != tcc_reference)
    {
      if (vectype == boolean_type_node
          || !vect_two_operations_perm_ok_p (stmts, group_size,
                                             vectype, alt_stmt_code))
        {
          for (i = 0; i < group_size; ++i)
            if (gimple_assign_rhs_code (stmts[i]->stmt) == alt_stmt_code)
              {
                matches[i] = false;
                if (dump_enabled_p ())
                  {
                    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                     "Build SLP failed: different operation "
                                     "in stmt %G", stmts[i]->stmt);
                    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                     "original stmt %G", first_stmt_info->stmt);
                  }
              }
          return false;
        }
      *two_operators = true;
    }

  return true;
}

/* Traits for the hash_set to record failed SLP builds for a stmt set.
   Note we never remove apart from at destruction time so we do not
   need a special value for deleted that differs from empty.  */
struct bst_traits
{
  typedef vec <stmt_vec_info> value_type;
  typedef vec <stmt_vec_info> compare_type;
  static inline hashval_t hash (value_type);
  static inline bool equal (value_type existing, value_type candidate);
  static inline bool is_empty (value_type x) { return !x.exists (); }
  static inline bool is_deleted (value_type x) { return !x.exists (); }
  static inline void mark_empty (value_type &x) { x.release (); }
  static inline void mark_deleted (value_type &x) { x.release (); }
  static inline void remove (value_type &x) { x.release (); }
};
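/* The hash is computed over the gimple UIDs of the stmts, so two stmt
   vectors that contain the same stmts in the same order hash and compare
   equal and therefore share a single cached SLP (sub-)tree.  */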
inline hashval_t
bst_traits::hash (value_type x)
{
  inchash::hash h;
  for (unsigned i = 0; i < x.length (); ++i)
    h.add_int (gimple_uid (x[i]->stmt));
  return h.end ();
}
inline bool
bst_traits::equal (value_type existing, value_type candidate)
{
  if (existing.length () != candidate.length ())
    return false;
  for (unsigned i = 0; i < existing.length (); ++i)
    if (existing[i] != candidate[i])
      return false;
  return true;
}

typedef hash_map <vec <gimple *>, slp_tree,
                  simple_hashmap_traits <bst_traits, slp_tree> >
  scalar_stmts_to_slp_tree_map_t;
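/* This map memoizes the result of building an SLP tree for a given stmt
   vector: a successful build is recorded so shared subgraphs are re-used
   (bumping their refcnt), and a failed build is recorded as NULL so the
   same set of stmts is not re-analyzed.  */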

static slp_tree
vect_build_slp_tree_2 (vec_info *vinfo,
                       vec<stmt_vec_info> stmts, unsigned int group_size,
                       poly_uint64 *max_nunits,
                       bool *matches, unsigned *npermutes, unsigned *tree_size,
                       scalar_stmts_to_slp_tree_map_t *bst_map);

static slp_tree
vect_build_slp_tree (vec_info *vinfo,
                     vec<stmt_vec_info> stmts, unsigned int group_size,
                     poly_uint64 *max_nunits,
                     bool *matches, unsigned *npermutes, unsigned *tree_size,
                     scalar_stmts_to_slp_tree_map_t *bst_map)
{
  if (slp_tree *leader = bst_map->get (stmts))
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location, "re-using %sSLP tree %p\n",
                         *leader ? "" : "failed ", *leader);
      if (*leader)
        {
          (*leader)->refcnt++;
          vect_update_max_nunits (max_nunits, (*leader)->max_nunits);
        }
      return *leader;
    }
  poly_uint64 this_max_nunits = 1;
  slp_tree res = vect_build_slp_tree_2 (vinfo, stmts, group_size,
                                        &this_max_nunits,
                                        matches, npermutes, tree_size, bst_map);
  if (res)
    {
      res->max_nunits = this_max_nunits;
      vect_update_max_nunits (max_nunits, this_max_nunits);
      /* Keep a reference for the bst_map use.  */
      res->refcnt++;
    }
  bst_map->put (stmts.copy (), res);
  return res;
}

/* Recursively build an SLP tree for the stmts in STMTS.
   Fail (and return NULL) if the def-stmts are not isomorphic, require a
   data permutation, or are of unsupported types of operation.  On
   failure, MATCHES indicates which of the stmts broke the match with
   the first one.  */

static slp_tree
vect_build_slp_tree_2 (vec_info *vinfo,
                       vec<stmt_vec_info> stmts, unsigned int group_size,
                       poly_uint64 *max_nunits,
                       bool *matches, unsigned *npermutes, unsigned *tree_size,
                       scalar_stmts_to_slp_tree_map_t *bst_map)
{
  unsigned nops, i, this_tree_size = 0;
  poly_uint64 this_max_nunits = *max_nunits;
  slp_tree node;

  matches[0] = false;

  stmt_vec_info stmt_info = stmts[0];
  if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
    nops = gimple_call_num_args (stmt);
  else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
    {
      nops = gimple_num_ops (stmt) - 1;
      if (gimple_assign_rhs_code (stmt) == COND_EXPR)
        nops++;
    }
  else if (is_a <gphi *> (stmt_info->stmt))
    nops = 0;
  else
    return NULL;

  /* If the SLP node is a PHI (induction or reduction), terminate
     the recursion.  */
  if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
    {
      tree scalar_type = TREE_TYPE (PHI_RESULT (stmt));
      tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type);
      if (!vect_record_max_nunits (stmt_info, group_size, vectype, max_nunits))
        return NULL;

      vect_def_type def_type = STMT_VINFO_DEF_TYPE (stmt_info);
      /* Induction from different IVs is not supported.  */
      if (def_type == vect_induction_def)
        {
          stmt_vec_info other_info;
          FOR_EACH_VEC_ELT (stmts, i, other_info)
            if (stmt_info != other_info)
              return NULL;
        }
      else if (def_type == vect_reduction_def
               || def_type == vect_double_reduction_def
               || def_type == vect_nested_cycle)
        {
          /* Else def types have to match.  */
          stmt_vec_info other_info;
          FOR_EACH_VEC_ELT (stmts, i, other_info)
            if (STMT_VINFO_DEF_TYPE (other_info) != def_type)
              return NULL;
        }
      else
        return NULL;
      (*tree_size)++;
      node = vect_create_new_slp_node (stmts);
      return node;
    }


  bool two_operators = false;
  unsigned char *swap = XALLOCAVEC (unsigned char, group_size);
  if (!vect_build_slp_tree_1 (swap, stmts, group_size,
                              &this_max_nunits, matches, &two_operators))
    return NULL;

  /* If the SLP node is a load, terminate the recursion unless masked.  */
  if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
      && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
    {
      if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
        {
          /* Masked load.  */
          gcc_assert (gimple_call_internal_p (stmt, IFN_MASK_LOAD));
          nops = 1;
        }
      else
        {
          *max_nunits = this_max_nunits;
          (*tree_size)++;
          node = vect_create_new_slp_node (stmts);
          return node;
        }
    }

  /* Get at the operands, verifying they are compatible.  */
  vec<slp_oprnd_info> oprnds_info = vect_create_oprnd_info (nops, group_size);
  slp_oprnd_info oprnd_info;
  FOR_EACH_VEC_ELT (stmts, i, stmt_info)
    {
      int res = vect_get_and_check_slp_defs (vinfo, &swap[i],
                                             stmts, i, &oprnds_info);
      if (res != 0)
        matches[(res == -1) ? 0 : i] = false;
      if (!matches[0])
        break;
    }
  for (i = 0; i < group_size; ++i)
    if (!matches[i])
      {
        vect_free_oprnd_info (oprnds_info);
        return NULL;
      }

  auto_vec<slp_tree, 4> children;

  stmt_info = stmts[0];

  /* Create SLP_TREE nodes for the definition node/s.  */
  FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
    {
      slp_tree child;
      unsigned old_tree_size = this_tree_size;
      unsigned int j;

      if (oprnd_info->first_dt == vect_uninitialized_def)
        {
          /* A COND_EXPR ends up with one operand too many here if its
             condition is an SSA name rather than an embedded comparison.  */
          gcc_assert (i == 3 && nops == 4);
          continue;
        }

      if (oprnd_info->first_dt != vect_internal_def
          && oprnd_info->first_dt != vect_reduction_def
          && oprnd_info->first_dt != vect_induction_def)
        {
          slp_tree invnode = vect_create_new_slp_node (oprnd_info->ops);
          SLP_TREE_DEF_TYPE (invnode) = oprnd_info->first_dt;
          oprnd_info->ops = vNULL;
          children.safe_push (invnode);
          continue;
        }

      if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
                                        group_size, &this_max_nunits,
                                        matches, npermutes,
                                        &this_tree_size, bst_map)) != NULL)
        {
          /* If all children of this child were built up from scalars then
             just throw that away and build this node up from scalars
             instead.  */
          if (is_a <bb_vec_info> (vinfo)
              && !SLP_TREE_CHILDREN (child).is_empty ()
              /* ??? Rejecting patterns this way doesn't work.  We'd have to
                 do extra work to cancel the pattern so the uses see the
                 scalar version.  */
              && !oprnd_info->any_pattern)
            {
              slp_tree grandchild;

              FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild)
                if (SLP_TREE_DEF_TYPE (grandchild) != vect_external_def)
                  break;
              if (!grandchild
                  && vect_update_all_shared_vectypes (oprnd_info->def_stmts))
                {
                  /* Roll back.  */
                  this_tree_size = old_tree_size;
                  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild)
                    vect_free_slp_tree (grandchild, false);
                  SLP_TREE_CHILDREN (child).truncate (0);

                  if (dump_enabled_p ())
                    dump_printf_loc (MSG_NOTE, vect_location,
                                     "Building parent vector operands from "
                                     "scalars instead\n");
                  oprnd_info->def_stmts = vNULL;
                  SLP_TREE_DEF_TYPE (child) = vect_external_def;
                  SLP_TREE_SCALAR_OPS (child) = oprnd_info->ops;
                  oprnd_info->ops = vNULL;
                  ++this_tree_size;
                  children.safe_push (child);
                  continue;
                }
            }

          oprnd_info->def_stmts = vNULL;
          children.safe_push (child);
          continue;
        }

      /* If the SLP build failed fatally and we analyze a basic-block
         simply treat nodes we fail to build as externally defined
         (and thus build vectors from the scalar defs).
         The cost model will reject outright expensive cases.
         ??? This doesn't treat cases where permutation ultimately
         fails (or we don't try permutation below).  Ideally we'd
         even compute a permutation that will end up with the maximum
         SLP tree size...  */
      if (is_a <bb_vec_info> (vinfo)
          && !matches[0]
          /* ??? Rejecting patterns this way doesn't work.  We'd have to
             do extra work to cancel the pattern so the uses see the
             scalar version.  */
          && !is_pattern_stmt_p (stmt_info)
          && !oprnd_info->any_pattern
          && vect_update_all_shared_vectypes (oprnd_info->def_stmts))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "Building vector operands from scalars\n");
          this_tree_size++;
          child = vect_create_new_slp_node (oprnd_info->def_stmts);
          SLP_TREE_DEF_TYPE (child) = vect_external_def;
          SLP_TREE_SCALAR_OPS (child) = oprnd_info->ops;
          children.safe_push (child);
          oprnd_info->ops = vNULL;
          oprnd_info->def_stmts = vNULL;
          continue;
        }

      /* If the SLP build for operand zero failed and operand zero
         and one can be commuted try that for the scalar stmts
         that failed the match.  */
      if (i == 0
          /* A first scalar stmt mismatch signals a fatal mismatch.  */
          && matches[0]
          /* ??? For COND_EXPRs we can swap the comparison operands
             as well as the arms under some constraints.  */
          && nops == 2
          && oprnds_info[1]->first_dt == vect_internal_def
          && is_gimple_assign (stmt_info->stmt)
          /* Swapping operands for reductions breaks assumptions later on.  */
          && STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
          && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def
          /* Do so only if the number of unsuccessful permutes was not
             more than a cut-off, as re-trying the recursive match on
             possibly each level of the tree would expose exponential
             behavior.  */
          && *npermutes < 4)
        {
          /* See whether we can swap the matching or the non-matching
             stmt operands.  */
          bool swap_not_matching = true;
          do
            {
              for (j = 0; j < group_size; ++j)
                {
                  if (matches[j] != !swap_not_matching)
                    continue;
                  stmt_vec_info stmt_info = stmts[j];
                  /* Verify if we can swap operands of this stmt.  */
                  gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt);
                  if (!stmt
                      || !commutative_tree_code (gimple_assign_rhs_code (stmt)))
                    {
                      if (!swap_not_matching)
                        goto fail;
                      swap_not_matching = false;
                      break;
                    }
                }
            }
          while (j != group_size);

          /* Swap mismatched definition stmts.  */
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "Re-trying with swapped operands of stmts ");
          for (j = 0; j < group_size; ++j)
            if (matches[j] == !swap_not_matching)
              {
                std::swap (oprnds_info[0]->def_stmts[j],
                           oprnds_info[1]->def_stmts[j]);
                std::swap (oprnds_info[0]->ops[j],
                           oprnds_info[1]->ops[j]);
                if (dump_enabled_p ())
                  dump_printf (MSG_NOTE, "%d ", j);
              }
          if (dump_enabled_p ())
            dump_printf (MSG_NOTE, "\n");
          /* And try again with scratch 'matches' ...  */
          bool *tem = XALLOCAVEC (bool, group_size);
          if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
                                            group_size, &this_max_nunits,
                                            tem, npermutes,
                                            &this_tree_size, bst_map)) != NULL)
            {
              /* If all children of this child were built up from scalars
                 then just throw that away and build this node up from
                 scalars instead.  */
              if (is_a <bb_vec_info> (vinfo)
                  && !SLP_TREE_CHILDREN (child).is_empty ()
                  /* ??? Rejecting patterns this way doesn't work.  We'd have
                     to do extra work to cancel the pattern so the uses see the
                     scalar version.  */
                  && !oprnd_info->any_pattern)
                {
                  unsigned int j;
                  slp_tree grandchild;

                  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild)
                    if (SLP_TREE_DEF_TYPE (grandchild) != vect_external_def)
                      break;
                  if (!grandchild
                      && (vect_update_all_shared_vectypes
                          (oprnd_info->def_stmts)))
                    {
                      /* Roll back.  */
                      this_tree_size = old_tree_size;
                      FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild)
                        vect_free_slp_tree (grandchild, false);
                      SLP_TREE_CHILDREN (child).truncate (0);

                      if (dump_enabled_p ())
                        dump_printf_loc (MSG_NOTE, vect_location,
                                         "Building parent vector operands from "
                                         "scalars instead\n");
                      oprnd_info->def_stmts = vNULL;
                      SLP_TREE_DEF_TYPE (child) = vect_external_def;
                      SLP_TREE_SCALAR_OPS (child) = oprnd_info->ops;
                      oprnd_info->ops = vNULL;
                      ++this_tree_size;
                      children.safe_push (child);
                      continue;
                    }
                }

              oprnd_info->def_stmts = vNULL;
              children.safe_push (child);
              continue;
            }

          ++*npermutes;
        }

fail:
      gcc_assert (child == NULL);
      FOR_EACH_VEC_ELT (children, j, child)
        vect_free_slp_tree (child, false);
      vect_free_oprnd_info (oprnds_info);
      return NULL;
    }

  vect_free_oprnd_info (oprnds_info);

  *tree_size += this_tree_size + 1;
  *max_nunits = this_max_nunits;

  node = vect_create_new_slp_node (stmts);
  SLP_TREE_TWO_OPERATORS (node) = two_operators;
  SLP_TREE_CHILDREN (node).splice (children);
  return node;
}

/* Dump a slp tree NODE using flags specified in DUMP_KIND.  */

static void
vect_print_slp_tree (dump_flags_t dump_kind, dump_location_t loc,
                     slp_tree node, hash_set<slp_tree> &visited)
{
  unsigned i;
  stmt_vec_info stmt_info;
  slp_tree child;
  tree op;

  if (visited.add (node))
    return;

  dump_metadata_t metadata (dump_kind, loc.get_impl_location ());
  dump_user_location_t user_loc = loc.get_user_location ();
  dump_printf_loc (metadata, user_loc, "node%s %p (max_nunits=%u)\n",
                   SLP_TREE_DEF_TYPE (node) == vect_external_def
                   ? " (external)"
                   : (SLP_TREE_DEF_TYPE (node) == vect_constant_def
                      ? " (constant)"
                      : ""), node,
                   estimated_poly_value (node->max_nunits));
  if (SLP_TREE_SCALAR_STMTS (node).exists ())
    FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
      dump_printf_loc (metadata, user_loc, "\tstmt %u %G", i, stmt_info->stmt);
  else
    {
      dump_printf_loc (metadata, user_loc, "\t{ ");
      FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
        dump_printf (metadata, "%T%s ", op,
                     i < SLP_TREE_SCALAR_OPS (node).length () - 1 ? "," : "");
      dump_printf (metadata, "}\n");
    }
  if (SLP_TREE_CHILDREN (node).is_empty ())
    return;
  dump_printf_loc (metadata, user_loc, "\tchildren");
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    dump_printf (dump_kind, " %p", (void *)child);
  dump_printf (dump_kind, "\n");
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_print_slp_tree (dump_kind, loc, child, visited);
}

static void
vect_print_slp_tree (dump_flags_t dump_kind, dump_location_t loc,
                     slp_tree node)
{
  hash_set<slp_tree> visited;
  vect_print_slp_tree (dump_kind, loc, node, visited);
}

/* Mark the tree rooted at NODE with PURE_SLP.  */

static void
vect_mark_slp_stmts (slp_tree node, hash_set<slp_tree> &visited)
{
  int i;
  stmt_vec_info stmt_info;
  slp_tree child;

  if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
    return;

  if (visited.add (node))
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
    STMT_SLP_TYPE (stmt_info) = pure_slp;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_mark_slp_stmts (child, visited);
}

static void
vect_mark_slp_stmts (slp_tree node)
{
  hash_set<slp_tree> visited;
  vect_mark_slp_stmts (node, visited);
}

/* Mark the statements of the tree rooted at NODE as relevant (vect_used).  */

static void
vect_mark_slp_stmts_relevant (slp_tree node, hash_set<slp_tree> &visited)
{
  int i;
  stmt_vec_info stmt_info;
  slp_tree child;

  if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
    return;

  if (visited.add (node))
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
    {
      gcc_assert (!STMT_VINFO_RELEVANT (stmt_info)
                  || STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope);
      STMT_VINFO_RELEVANT (stmt_info) = vect_used_in_scope;
    }

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_mark_slp_stmts_relevant (child, visited);
}

static void
vect_mark_slp_stmts_relevant (slp_tree node)
{
  hash_set<slp_tree> visited;
  vect_mark_slp_stmts_relevant (node, visited);
}


/* Rearrange the statements of NODE according to PERMUTATION.  */

static void
vect_slp_rearrange_stmts (slp_tree node, unsigned int group_size,
                          vec<unsigned> permutation,
                          hash_set<slp_tree> &visited)
{
  unsigned int i;
  slp_tree child;

  if (visited.add (node))
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_slp_rearrange_stmts (child, group_size, permutation, visited);

  if (SLP_TREE_SCALAR_STMTS (node).exists ())
    {
      gcc_assert (group_size == SLP_TREE_SCALAR_STMTS (node).length ());
      vec<stmt_vec_info> tmp_stmts;
      tmp_stmts.create (group_size);
      tmp_stmts.quick_grow (group_size);
      stmt_vec_info stmt_info;
      FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
        tmp_stmts[permutation[i]] = stmt_info;
      SLP_TREE_SCALAR_STMTS (node).release ();
      SLP_TREE_SCALAR_STMTS (node) = tmp_stmts;
    }
  if (SLP_TREE_SCALAR_OPS (node).exists ())
    {
      gcc_assert (group_size == SLP_TREE_SCALAR_OPS (node).length ());
      vec<tree> tmp_ops;
      tmp_ops.create (group_size);
      tmp_ops.quick_grow (group_size);
      tree op;
      FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
        tmp_ops[permutation[i]] = op;
      SLP_TREE_SCALAR_OPS (node).release ();
      SLP_TREE_SCALAR_OPS (node) = tmp_ops;
    }
}


/* Attempt to reorder stmts in a reduction chain so that we don't
   require any load permutation.  Return true if that was possible,
   otherwise return false.  */
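/* For instance, if every load node in the instance carries the same
   permutation { 1, 0, 3, 2 }, the scalar stmts of all nodes can be
   renumbered in that order instead, after which the loads become
   contiguous and the permutation can be dropped.  */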

static bool
vect_attempt_slp_rearrange_stmts (slp_instance slp_instn)
{
  unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (slp_instn);
  unsigned int i, j;
  unsigned int lidx;
  slp_tree node, load;

  /* Compare all the permutation sequences to the first one.  We know
     that at least one load is permuted.  */
  node = SLP_INSTANCE_LOADS (slp_instn)[0];
  if (!node->load_permutation.exists ())
    return false;
  for (i = 1; SLP_INSTANCE_LOADS (slp_instn).iterate (i, &load); ++i)
    {
      if (!load->load_permutation.exists ())
        return false;
      FOR_EACH_VEC_ELT (load->load_permutation, j, lidx)
        if (lidx != node->load_permutation[j])
          return false;
    }

  /* Check that the loads in the first sequence are different and there
     are no gaps between them.  */
  auto_sbitmap load_index (group_size);
  bitmap_clear (load_index);
  FOR_EACH_VEC_ELT (node->load_permutation, i, lidx)
    {
      if (lidx >= group_size)
        return false;
      if (bitmap_bit_p (load_index, lidx))
        return false;

      bitmap_set_bit (load_index, lidx);
    }
  for (i = 0; i < group_size; i++)
    if (!bitmap_bit_p (load_index, i))
      return false;

  /* This permutation is valid for reduction.  Since the order of the
     statements in the nodes is not important unless they are memory
     accesses, we can rearrange the statements in all the nodes
     according to the order of the loads.  */
  hash_set<slp_tree> visited;
  vect_slp_rearrange_stmts (SLP_INSTANCE_TREE (slp_instn), group_size,
                            node->load_permutation, visited);

  /* We are done, no actual permutations need to be generated.  */
  poly_uint64 unrolling_factor = SLP_INSTANCE_UNROLLING_FACTOR (slp_instn);
  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
    {
      stmt_vec_info first_stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
      first_stmt_info = DR_GROUP_FIRST_ELEMENT (first_stmt_info);
      /* But we have to keep those permutations that are required because
         of handling of gaps.  */
      if (known_eq (unrolling_factor, 1U)
          || (group_size == DR_GROUP_SIZE (first_stmt_info)
              && DR_GROUP_GAP (first_stmt_info) == 0))
        SLP_TREE_LOAD_PERMUTATION (node).release ();
      else
        for (j = 0; j < SLP_TREE_LOAD_PERMUTATION (node).length (); ++j)
          SLP_TREE_LOAD_PERMUTATION (node)[j] = j;
    }

  return true;
}

/* Gather loads in the SLP graph NODE and populate the INST loads array.  */

static void
vect_gather_slp_loads (slp_instance inst, slp_tree node,
                       hash_set<slp_tree> &visited)
{
  if (visited.add (node))
    return;

  if (SLP_TREE_CHILDREN (node).length () == 0)
    {
      if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
        return;
      stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
      if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
          && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
        SLP_INSTANCE_LOADS (inst).safe_push (node);
    }
  else
    {
      unsigned i;
      slp_tree child;
      FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
        vect_gather_slp_loads (inst, child, visited);
    }
}

static void
vect_gather_slp_loads (slp_instance inst, slp_tree node)
{
  hash_set<slp_tree> visited;
  vect_gather_slp_loads (inst, node, visited);
}

/* Check if the required load permutations in the SLP instance
   SLP_INSTN are supported.  */

static bool
vect_supported_load_permutation_p (slp_instance slp_instn)
{
  unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (slp_instn);
  unsigned int i, j, k, next;
  slp_tree node;

  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location, "Load permutation ");
      FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
        if (node->load_permutation.exists ())
          FOR_EACH_VEC_ELT (node->load_permutation, j, next)
            dump_printf (MSG_NOTE, "%d ", next);
        else
          for (k = 0; k < group_size; ++k)
            dump_printf (MSG_NOTE, "%d ", k);
      dump_printf (MSG_NOTE, "\n");
    }

  /* In case of reduction every load permutation is allowed, since the order
1904 of the reduction statements is not important (as opposed to the case of
1905 grouped stores). The only condition we need to check is that all the
1906 load nodes are of the same size and have the same permutation (and then
1907 rearrange all the nodes of the SLP instance according to this
1908 permutation). */
1909
1910 /* Check that all the load nodes are of the same size. */
1911 /* ??? Can't we assert this? */
1912 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
1913 if (SLP_TREE_SCALAR_STMTS (node).length () != (unsigned) group_size)
1914 return false;
1915
1916 node = SLP_INSTANCE_TREE (slp_instn);
1917 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
1918
1919 /* Reduction (there are no data-refs in the root).
1920 In reduction chain the order of the loads is not important. */
1921 if (!STMT_VINFO_DATA_REF (stmt_info)
1922 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
1923 vect_attempt_slp_rearrange_stmts (slp_instn);
1924
1925 /* In basic block vectorization we allow any subchain of an interleaving
1926 chain.
1927 FORNOW: not supported in loop SLP because of realignment complications. */
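/* For example (illustrative): a load node accessing only a[2] and a[3]
   out of an interleaving group a[0..3] forms a contiguous subchain, so
   its load permutation can simply be dropped.  */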
1928 if (STMT_VINFO_BB_VINFO (stmt_info))
1929 {
1930 /* Check whether the loads in an instance form a subchain and thus
1931 no permutation is necessary. */
1932 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
1933 {
1934 if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
1935 continue;
1936 bool subchain_p = true;
1937 stmt_vec_info next_load_info = NULL;
1938 stmt_vec_info load_info;
1939 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
1940 {
1941 if (j != 0
1942 && (next_load_info != load_info
1943 || DR_GROUP_GAP (load_info) != 1))
1944 {
1945 subchain_p = false;
1946 break;
1947 }
1948 next_load_info = DR_GROUP_NEXT_ELEMENT (load_info);
1949 }
1950 if (subchain_p)
1951 SLP_TREE_LOAD_PERMUTATION (node).release ();
1952 else
1953 {
1954 stmt_vec_info group_info = SLP_TREE_SCALAR_STMTS (node)[0];
1955 group_info = DR_GROUP_FIRST_ELEMENT (group_info);
1956 unsigned HOST_WIDE_INT nunits;
1957 unsigned k, maxk = 0;
1958 FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (node), j, k)
1959 if (k > maxk)
1960 maxk = k;
1961 /* In BB vectorization we may not actually use a loaded vector
1962 accessing elements in excess of DR_GROUP_SIZE. */
1963 tree vectype = STMT_VINFO_VECTYPE (group_info);
1964 if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nunits)
1965 || maxk >= (DR_GROUP_SIZE (group_info) & ~(nunits - 1)))
1966 {
1967 if (dump_enabled_p ())
1968 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1969 "BB vectorization with gaps at the end of "
1970 "a load is not supported\n");
1971 return false;
1972 }
1973
1974 /* Verify the permutation can be generated. */
1975 vec<tree> tem;
1976 unsigned n_perms;
1977 if (!vect_transform_slp_perm_load (node, tem, NULL,
1978 1, slp_instn, true, &n_perms))
1979 {
1980 if (dump_enabled_p ())
1981 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
1982 vect_location,
1983 "unsupported load permutation\n");
1984 return false;
1985 }
1986 }
1987 }
1988 return true;
1989 }
1990
1991 /* For loop vectorization verify we can generate the permutation. Be
1992 conservative about the vectorization factor: there are permutations
1993 that will use three vector inputs only starting from a specific factor,
1994 and the vectorization factor is not yet final.
1995 ??? The SLP instance unrolling factor might not be the maximum one. */
1996 unsigned n_perms;
1997 poly_uint64 test_vf
1998 = force_common_multiple (SLP_INSTANCE_UNROLLING_FACTOR (slp_instn),
1999 LOOP_VINFO_VECT_FACTOR
2000 (STMT_VINFO_LOOP_VINFO (stmt_info)));
2001 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
2002 if (node->load_permutation.exists ()
2003 && !vect_transform_slp_perm_load (node, vNULL, NULL, test_vf,
2004 slp_instn, true, &n_perms))
2005 return false;
2006
2007 return true;
2008 }
2009
2010
2011 /* Find the last scalar statement in the SLP tree rooted at NODE. */
2012
2013 stmt_vec_info
2014 vect_find_last_scalar_stmt_in_slp (slp_tree node)
2015 {
2016 stmt_vec_info last = NULL;
2017 stmt_vec_info stmt_vinfo;
2018
2019 for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
2020 {
2021 stmt_vinfo = vect_orig_stmt (stmt_vinfo);
2022 last = last ? get_later_stmt (stmt_vinfo, last) : stmt_vinfo;
2023 }
2024
2025 return last;
2026 }
2027
2028 /* Splits a group of stores, currently beginning at FIRST_VINFO, into
2029 two groups: one (still beginning at FIRST_VINFO) of size GROUP1_SIZE
2030 (also containing the first GROUP1_SIZE stmts, since stores are
2031 consecutive), the second containing the remainder.
2032 Return the first stmt in the second group. */
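/* A worked example (hypothetical numbers): splitting a gap-free group of
   seven stores with GROUP1_SIZE == 4 leaves a first group of size 4 whose
   DR_GROUP_GAP grows by 3, so it skips the second group when advancing,
   and a second group of size 3 whose DR_GROUP_GAP becomes 0 + 4 == 4.  */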
2033
2034 static stmt_vec_info
2035 vect_split_slp_store_group (stmt_vec_info first_vinfo, unsigned group1_size)
2036 {
2037 gcc_assert (DR_GROUP_FIRST_ELEMENT (first_vinfo) == first_vinfo);
2038 gcc_assert (group1_size > 0);
2039 int group2_size = DR_GROUP_SIZE (first_vinfo) - group1_size;
2040 gcc_assert (group2_size > 0);
2041 DR_GROUP_SIZE (first_vinfo) = group1_size;
2042
2043 stmt_vec_info stmt_info = first_vinfo;
2044 for (unsigned i = group1_size; i > 1; i--)
2045 {
2046 stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info);
2047 gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
2048 }
2049 /* STMT_INFO is now the last element of the first group. */
2050 stmt_vec_info group2 = DR_GROUP_NEXT_ELEMENT (stmt_info);
2051 DR_GROUP_NEXT_ELEMENT (stmt_info) = 0;
2052
2053 DR_GROUP_SIZE (group2) = group2_size;
2054 for (stmt_info = group2; stmt_info;
2055 stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info))
2056 {
2057 DR_GROUP_FIRST_ELEMENT (stmt_info) = group2;
2058 gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
2059 }
2060
2061 /* For the second group, the DR_GROUP_GAP is that before the original group,
2062 plus skipping over the first vector. */
2063 DR_GROUP_GAP (group2) = DR_GROUP_GAP (first_vinfo) + group1_size;
2064
2065 /* DR_GROUP_GAP of the first group now has to skip over the second group too. */
2066 DR_GROUP_GAP (first_vinfo) += group2_size;
2067
2068 if (dump_enabled_p ())
2069 dump_printf_loc (MSG_NOTE, vect_location, "Split group into %d and %d\n",
2070 group1_size, group2_size);
2071
2072 return group2;
2073 }
2074
2075 /* Calculate the unrolling factor for an SLP instance with GROUP_SIZE
2076 statements and a vector of NUNITS elements. */
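/* For instance (illustrative): NUNITS == 8 and GROUP_SIZE == 6 give
   common_multiple (8, 6) == 24, so the unrolling factor is 24 / 6 == 4:
   four copies of the group fill exactly three vectors.  */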
2077
2078 static poly_uint64
2079 calculate_unrolling_factor (poly_uint64 nunits, unsigned int group_size)
2080 {
2081 return exact_div (common_multiple (nunits, group_size), group_size);
2082 }
2083
2084 /* Analyze an SLP instance starting from a group of grouped stores. Call
2085 vect_build_slp_tree to build a tree of packed stmts if possible.
2086 Return FALSE if it's impossible to SLP any stmt in the loop. */
2087
2088 static bool
2089 vect_analyze_slp_instance (vec_info *vinfo,
2090 scalar_stmts_to_slp_tree_map_t *bst_map,
2091 stmt_vec_info stmt_info, unsigned max_tree_size)
2092 {
2093 slp_instance new_instance;
2094 slp_tree node;
2095 unsigned int group_size;
2096 tree vectype, scalar_type = NULL_TREE;
2097 unsigned int i;
2098 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
2099 vec<stmt_vec_info> scalar_stmts;
2100 bool constructor = false;
2101
2102 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2103 {
2104 scalar_type = TREE_TYPE (DR_REF (dr));
2105 group_size = DR_GROUP_SIZE (stmt_info);
2106 vectype = get_vectype_for_scalar_type (vinfo, scalar_type, group_size);
2107 }
2108 else if (!dr && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
2109 {
2110 gcc_assert (is_a <loop_vec_info> (vinfo));
2111 vectype = STMT_VINFO_VECTYPE (stmt_info);
2112 group_size = REDUC_GROUP_SIZE (stmt_info);
2113 }
2114 else if (is_gimple_assign (stmt_info->stmt)
2115 && gimple_assign_rhs_code (stmt_info->stmt) == CONSTRUCTOR)
2116 {
2117 vectype = TREE_TYPE (gimple_assign_rhs1 (stmt_info->stmt));
2118 group_size = CONSTRUCTOR_NELTS (gimple_assign_rhs1 (stmt_info->stmt));
2119 constructor = true;
2120 }
2121 else
2122 {
2123 gcc_assert (is_a <loop_vec_info> (vinfo));
2124 vectype = STMT_VINFO_VECTYPE (stmt_info);
2125 group_size = as_a <loop_vec_info> (vinfo)->reductions.length ();
2126 }
2127
2128 if (!vectype)
2129 {
2130 if (dump_enabled_p ())
2131 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2132 "Build SLP failed: unsupported data-type %T\n",
2133 scalar_type);
2134
2135 return false;
2136 }
2137 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
2138
2139 /* Create a node (a root of the SLP tree) for the packed grouped stores. */
2140 scalar_stmts.create (group_size);
2141 stmt_vec_info next_info = stmt_info;
2142 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2143 {
2144 /* Collect the stores and store them in SLP_TREE_SCALAR_STMTS. */
2145 while (next_info)
2146 {
2147 scalar_stmts.safe_push (vect_stmt_to_vectorize (next_info));
2148 next_info = DR_GROUP_NEXT_ELEMENT (next_info);
2149 }
2150 }
2151 else if (!dr && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
2152 {
2153 /* Collect the reduction stmts and store them in
2154 SLP_TREE_SCALAR_STMTS. */
2155 while (next_info)
2156 {
2157 scalar_stmts.safe_push (vect_stmt_to_vectorize (next_info));
2158 next_info = REDUC_GROUP_NEXT_ELEMENT (next_info);
2159 }
2160 /* Mark the first element of the reduction chain as reduction to properly
2161 transform the node. In the reduction analysis phase only the last
2162 element of the chain is marked as reduction. */
2163 STMT_VINFO_DEF_TYPE (stmt_info)
2164 = STMT_VINFO_DEF_TYPE (scalar_stmts.last ());
2165 STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))
2166 = STMT_VINFO_REDUC_DEF (vect_orig_stmt (scalar_stmts.last ()));
2167 }
2168 else if (constructor)
2169 {
2170 tree rhs = gimple_assign_rhs1 (stmt_info->stmt);
2171 tree val;
2172 FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), i, val)
2173 {
2174 if (TREE_CODE (val) == SSA_NAME)
2175 {
2176 gimple* def = SSA_NAME_DEF_STMT (val);
2177 stmt_vec_info def_info = vinfo->lookup_stmt (def);
2178 /* Value is defined in another basic block. */
2179 if (!def_info)
2180 return false;
2181 scalar_stmts.safe_push (def_info);
2182 }
2183 else
2184 return false;
2185 }
2186 }
2187 else
2188 {
2189 /* Collect reduction statements. */
2190 vec<stmt_vec_info> reductions = as_a <loop_vec_info> (vinfo)->reductions;
2191 for (i = 0; reductions.iterate (i, &next_info); i++)
2192 scalar_stmts.safe_push (next_info);
2193 }
2194
2195 /* Build the tree for the SLP instance. */
2196 bool *matches = XALLOCAVEC (bool, group_size);
2197 unsigned npermutes = 0;
2198 poly_uint64 max_nunits = nunits;
2199 unsigned tree_size = 0;
2200 node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
2201 &max_nunits, matches, &npermutes,
2202 &tree_size, bst_map);
2203 if (node != NULL)
2204 {
2205 /* If this is a reduction chain with a conversion in front
2206 amend the SLP tree with a node for that. */
2207 if (!dr
2208 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
2209 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def)
2210 {
2211 /* Get at the conversion stmt - we know it's the single use
2212 of the last stmt of the reduction chain. */
2213 gimple *tem = vect_orig_stmt (scalar_stmts[group_size - 1])->stmt;
2214 use_operand_p use_p;
2215 gimple *use_stmt;
2216 bool r = single_imm_use (gimple_assign_lhs (tem), &use_p, &use_stmt);
2217 gcc_assert (r);
2218 next_info = vinfo->lookup_stmt (use_stmt);
2219 next_info = vect_stmt_to_vectorize (next_info);
2220 scalar_stmts = vNULL;
2221 scalar_stmts.create (group_size);
2222 for (unsigned i = 0; i < group_size; ++i)
2223 scalar_stmts.quick_push (next_info);
2224 slp_tree conv = vect_create_new_slp_node (scalar_stmts);
2225 SLP_TREE_CHILDREN (conv).quick_push (node);
2226 node = conv;
2227 /* We also have to fake this conversion stmt as an SLP reduction group
2228 so we don't have to mess with too much code elsewhere. */
2229 REDUC_GROUP_FIRST_ELEMENT (next_info) = next_info;
2230 REDUC_GROUP_NEXT_ELEMENT (next_info) = NULL;
2231 }
2232
2233 /* Calculate the unrolling factor based on the smallest type. */
2234 poly_uint64 unrolling_factor
2235 = calculate_unrolling_factor (max_nunits, group_size);
2236
2237 if (maybe_ne (unrolling_factor, 1U)
2238 && is_a <bb_vec_info> (vinfo))
2239 {
2240 unsigned HOST_WIDE_INT const_max_nunits;
2241 if (!max_nunits.is_constant (&const_max_nunits)
2242 || const_max_nunits > group_size)
2243 {
2244 if (dump_enabled_p ())
2245 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2246 "Build SLP failed: store group "
2247 "size not a multiple of the vector size "
2248 "in basic block SLP\n");
2249 vect_free_slp_tree (node, false);
2250 return false;
2251 }
2252 /* Fatal mismatch. */
2253 matches[group_size / const_max_nunits * const_max_nunits] = false;
2254 vect_free_slp_tree (node, false);
2255 }
2256 else
2257 {
2258 /* Create a new SLP instance. */
2259 new_instance = XNEW (class _slp_instance);
2260 SLP_INSTANCE_TREE (new_instance) = node;
2261 SLP_INSTANCE_GROUP_SIZE (new_instance) = group_size;
2262 SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor;
2263 SLP_INSTANCE_LOADS (new_instance) = vNULL;
2264 SLP_INSTANCE_ROOT_STMT (new_instance) = constructor ? stmt_info : NULL;
2265
2266 vect_gather_slp_loads (new_instance, node);
2267 if (dump_enabled_p ())
2268 dump_printf_loc (MSG_NOTE, vect_location,
2269 "SLP size %u vs. limit %u.\n",
2270 tree_size, max_tree_size);
2271
2272 /* Compute the load permutation. */
2273 slp_tree load_node;
2274 bool loads_permuted = false;
2275 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (new_instance), i, load_node)
2276 {
2277 vec<unsigned> load_permutation;
2278 int j;
2279 stmt_vec_info load_info;
2280 bool this_load_permuted = false;
2281 load_permutation.create (group_size);
2282 stmt_vec_info first_stmt_info = DR_GROUP_FIRST_ELEMENT
2283 (SLP_TREE_SCALAR_STMTS (load_node)[0]);
2284 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load_info)
2285 {
2286 int load_place = vect_get_place_in_interleaving_chain
2287 (load_info, first_stmt_info);
2288 gcc_assert (load_place != -1);
2289 if (load_place != j)
2290 this_load_permuted = true;
2291 load_permutation.safe_push (load_place);
2292 }
2293 if (!this_load_permuted
2294 /* The load requires permutation when unrolling exposes
2295 a gap either because the group is larger than the SLP
2296 group-size or because there is a gap between the groups. */
2297 && (known_eq (unrolling_factor, 1U)
2298 || (group_size == DR_GROUP_SIZE (first_stmt_info)
2299 && DR_GROUP_GAP (first_stmt_info) == 0)))
2300 {
2301 load_permutation.release ();
2302 continue;
2303 }
2304 SLP_TREE_LOAD_PERMUTATION (load_node) = load_permutation;
2305 loads_permuted = true;
2306 }
2307
2308 if (loads_permuted)
2309 {
2310 if (!vect_supported_load_permutation_p (new_instance))
2311 {
2312 if (dump_enabled_p ())
2313 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2314 "Build SLP failed: unsupported load "
2315 "permutation %G", stmt_info->stmt);
2316 vect_free_slp_instance (new_instance, false);
2317 return false;
2318 }
2319 }
2320
2321 /* If the loads and stores can be handled with load/store-lanes
2322 instructions, do not generate this SLP instance. */
2323 if (is_a <loop_vec_info> (vinfo)
2324 && loads_permuted
2325 && dr && vect_store_lanes_supported (vectype, group_size, false))
2326 {
2327 slp_tree load_node;
2328 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (new_instance), i, load_node)
2329 {
2330 stmt_vec_info stmt_vinfo = DR_GROUP_FIRST_ELEMENT
2331 (SLP_TREE_SCALAR_STMTS (load_node)[0]);
2332 /* Use SLP for strided accesses (or if we can't load-lanes). */
2333 if (STMT_VINFO_STRIDED_P (stmt_vinfo)
2334 || ! vect_load_lanes_supported
2335 (STMT_VINFO_VECTYPE (stmt_vinfo),
2336 DR_GROUP_SIZE (stmt_vinfo), false))
2337 break;
2338 }
2339 if (i == SLP_INSTANCE_LOADS (new_instance).length ())
2340 {
2341 if (dump_enabled_p ())
2342 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2343 "Built SLP cancelled: can use "
2344 "load/store-lanes\n");
2345 vect_free_slp_instance (new_instance, false);
2346 return false;
2347 }
2348 }
2349
2350 vinfo->slp_instances.safe_push (new_instance);
2351
2352 if (dump_enabled_p ())
2353 {
2354 dump_printf_loc (MSG_NOTE, vect_location,
2355 "Final SLP tree for instance:\n");
2356 vect_print_slp_tree (MSG_NOTE, vect_location, node);
2357 }
2358
2359 return true;
2360 }
2361 }
2362 else
2363 {
2364 /* Failed to SLP. */
2365 /* Free the allocated memory. */
2366 scalar_stmts.release ();
2367 }
2368
2369 /* For basic block SLP, try to break the group up into multiples of the
2370 vector size. */
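  /* E.g. (illustrative): with const_nunits == 4 and the first mismatch at
     i == 6 in a group of 12 stores, we split off a first group of 4,
     abandon the partially matching vector (elements 4..7), and retry SLP
     on the remaining group of 4.  */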
2371 unsigned HOST_WIDE_INT const_nunits;
2372 if (is_a <bb_vec_info> (vinfo)
2373 && STMT_VINFO_GROUPED_ACCESS (stmt_info)
2374 && DR_GROUP_FIRST_ELEMENT (stmt_info)
2375 && nunits.is_constant (&const_nunits))
2376 {
2377 /* We consider breaking the group only on VF boundaries from the existing
2378 start. */
2379 for (i = 0; i < group_size; i++)
2380 if (!matches[i]) break;
2381
2382 if (i >= const_nunits && i < group_size)
2383 {
2384 /* Split into two groups at the first vector boundary before i. */
2385 gcc_assert ((const_nunits & (const_nunits - 1)) == 0);
2386 unsigned group1_size = i & ~(const_nunits - 1);
2387
2388 stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
2389 group1_size);
2390 bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
2391 max_tree_size);
2392 /* If the first non-match was in the middle of a vector,
2393 skip the rest of that vector. */
2394 if (group1_size < i)
2395 {
2396 i = group1_size + const_nunits;
2397 if (i < group_size)
2398 rest = vect_split_slp_store_group (rest, const_nunits);
2399 }
2400 if (i < group_size)
2401 res |= vect_analyze_slp_instance (vinfo, bst_map,
2402 rest, max_tree_size);
2403 return res;
2404 }
2405 /* Even though the first vector did not all match, we might be able to SLP
2406 (some) of the remainder. FORNOW ignore this possibility. */
2407 }
2408
2409 return false;
2410 }
2411
2412
2413 /* Check if there are stmts in the loop that can be vectorized using SLP.
2414 Build SLP trees of packed scalar stmts if SLP is possible. */
2415
2416 opt_result
2417 vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
2418 {
2419 unsigned int i;
2420 stmt_vec_info first_element;
2421
2422 DUMP_VECT_SCOPE ("vect_analyze_slp");
2423
2424 scalar_stmts_to_slp_tree_map_t *bst_map
2425 = new scalar_stmts_to_slp_tree_map_t ();
2426
2427 /* Find SLP sequences starting from groups of grouped stores. */
2428 FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
2429 vect_analyze_slp_instance (vinfo, bst_map, first_element, max_tree_size);
2430
2431 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
2432 {
2433 if (loop_vinfo->reduction_chains.length () > 0)
2434 {
2435 /* Find SLP sequences starting from reduction chains. */
2436 FOR_EACH_VEC_ELT (loop_vinfo->reduction_chains, i, first_element)
2437 if (! vect_analyze_slp_instance (vinfo, bst_map, first_element,
2438 max_tree_size))
2439 {
2440 /* Dissolve reduction chain group. */
2441 stmt_vec_info vinfo = first_element;
2442 stmt_vec_info last = NULL;
2443 while (vinfo)
2444 {
2445 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
2446 REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
2447 REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
2448 last = vinfo;
2449 vinfo = next;
2450 }
2451 STMT_VINFO_DEF_TYPE (first_element) = vect_internal_def;
2452 /* It can still be vectorized as part of an SLP reduction. */
2453 loop_vinfo->reductions.safe_push (last);
2454 }
2455 }
2456
2457 /* Find SLP sequences starting from groups of reductions. */
2458 if (loop_vinfo->reductions.length () > 1)
2459 vect_analyze_slp_instance (vinfo, bst_map, loop_vinfo->reductions[0],
2460 max_tree_size);
2461 }
2462
2463 /* The map keeps a reference on SLP nodes built, release that. */
2464 for (scalar_stmts_to_slp_tree_map_t::iterator it = bst_map->begin ();
2465 it != bst_map->end (); ++it)
2466 if ((*it).second)
2467 vect_free_slp_tree ((*it).second, false);
2468 delete bst_map;
2469
2470 return opt_result::success ();
2471 }
2472
2473
2474 /* For each possible SLP instance decide whether to SLP it and calculate the
2475 overall unrolling factor needed to SLP the loop. Return TRUE if we decided
2476 to SLP at least one instance. */
2477
2478 bool
2479 vect_make_slp_decision (loop_vec_info loop_vinfo)
2480 {
2481 unsigned int i;
2482 poly_uint64 unrolling_factor = 1;
2483 vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
2484 slp_instance instance;
2485 int decided_to_slp = 0;
2486
2487 DUMP_VECT_SCOPE ("vect_make_slp_decision");
2488
2489 FOR_EACH_VEC_ELT (slp_instances, i, instance)
2490 {
2491 /* FORNOW: SLP if you can. */
2492 /* All unroll factors have the form:
2493
2494 GET_MODE_SIZE (vinfo->vector_mode) * X
2495
2496 for some rational X, so they must have a common multiple. */
2497 unrolling_factor
2498 = force_common_multiple (unrolling_factor,
2499 SLP_INSTANCE_UNROLLING_FACTOR (instance));
2500
2501 /* Mark all the stmts that belong to INSTANCE as PURE_SLP stmts. Later we
2502 call vect_detect_hybrid_slp () to find stmts that need hybrid SLP and
2503 loop-based vectorization. Such stmts will be marked as HYBRID. */
2504 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
2505 decided_to_slp++;
2506 }
2507
2508 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo) = unrolling_factor;
2509
2510 if (decided_to_slp && dump_enabled_p ())
2511 {
2512 dump_printf_loc (MSG_NOTE, vect_location,
2513 "Decided to SLP %d instances. Unrolling factor ",
2514 decided_to_slp);
2515 dump_dec (MSG_NOTE, unrolling_factor);
2516 dump_printf (MSG_NOTE, "\n");
2517 }
2518
2519 return (decided_to_slp > 0);
2520 }
2521
2522
2523 /* Find stmts that must be both vectorized and SLPed (since they feed stmts that
2524 can't be SLPed) in the tree rooted at NODE. Mark such stmts as HYBRID. */
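/* Illustrative sketch (hypothetical GIMPLE): in

     x_1 = b[i] + c[i];     <- pure SLP, part of an SLP tree
     a[i] = x_1;            <- SLP store
     s_2 = s_1 + x_1;       <- non-SLP loop reduction

   the use of x_1 in the reduction forces x_1's stmt to be vectorized both
   as part of the SLP tree and by the loop vectorizer, so it is marked
   hybrid.  */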
2525
2526 static void
2527 vect_detect_hybrid_slp_stmts (slp_tree node, unsigned i, slp_vect_type stype,
2528 hash_map<slp_tree, unsigned> &visited)
2529 {
2530 stmt_vec_info stmt_vinfo = SLP_TREE_SCALAR_STMTS (node)[i];
2531 imm_use_iterator imm_iter;
2532 gimple *use_stmt;
2533 stmt_vec_info use_vinfo;
2534 slp_tree child;
2535 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
2536 int j;
2537
2538 /* We need to union stype over the incoming graph edges but we still
2539 want to limit recursion to stay O(N+E). */
2540 bool only_edge = (++visited.get_or_insert (node) < node->refcnt);
2541
2542 /* Propagate hybrid down the SLP tree. */
2543 if (stype == hybrid)
2544 ;
2545 else if (HYBRID_SLP_STMT (stmt_vinfo))
2546 stype = hybrid;
2547 else if (!only_edge)
2548 {
2549 /* Check if a pure SLP stmt has uses in non-SLP stmts. */
2550 gcc_checking_assert (PURE_SLP_STMT (stmt_vinfo));
2551 /* If we get a pattern stmt here we have to use the LHS of the
2552 original stmt for immediate uses. */
2553 gimple *stmt = vect_orig_stmt (stmt_vinfo)->stmt;
2554 tree def;
2555 if (gimple_code (stmt) == GIMPLE_PHI)
2556 def = gimple_phi_result (stmt);
2557 else
2558 def = SINGLE_SSA_TREE_OPERAND (stmt, SSA_OP_DEF);
2559 if (def)
2560 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, def)
2561 {
2562 use_vinfo = loop_vinfo->lookup_stmt (use_stmt);
2563 if (!use_vinfo)
2564 continue;
2565 use_vinfo = vect_stmt_to_vectorize (use_vinfo);
2566 if (!STMT_SLP_TYPE (use_vinfo)
2567 && (STMT_VINFO_RELEVANT (use_vinfo)
2568 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (use_vinfo)))
2569 && !(gimple_code (use_stmt) == GIMPLE_PHI
2570 && STMT_VINFO_DEF_TYPE (use_vinfo) == vect_reduction_def))
2571 {
2572 if (dump_enabled_p ())
2573 dump_printf_loc (MSG_NOTE, vect_location, "use of SLP "
2574 "def in non-SLP stmt: %G", use_stmt);
2575 stype = hybrid;
2576 }
2577 }
2578 }
2579
2580 if (stype == hybrid
2581 && !HYBRID_SLP_STMT (stmt_vinfo))
2582 {
2583 if (dump_enabled_p ())
2584 dump_printf_loc (MSG_NOTE, vect_location, "marking hybrid: %G",
2585 stmt_vinfo->stmt);
2586 STMT_SLP_TYPE (stmt_vinfo) = hybrid;
2587 }
2588
2589 if (!only_edge)
2590 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
2591 if (SLP_TREE_DEF_TYPE (child) != vect_external_def
2592 && SLP_TREE_DEF_TYPE (child) != vect_constant_def)
2593 vect_detect_hybrid_slp_stmts (child, i, stype, visited);
2594 }
2595
2596 /* Helpers for vect_detect_hybrid_slp walking pattern stmt uses. */
2597
2598 static tree
2599 vect_detect_hybrid_slp_1 (tree *tp, int *, void *data)
2600 {
2601 walk_stmt_info *wi = (walk_stmt_info *)data;
2602 loop_vec_info loop_vinfo = (loop_vec_info) wi->info;
2603
2604 if (wi->is_lhs)
2605 return NULL_TREE;
2606
2607 stmt_vec_info def_stmt_info = loop_vinfo->lookup_def (*tp);
2608 if (def_stmt_info && PURE_SLP_STMT (def_stmt_info))
2609 {
2610 if (dump_enabled_p ())
2611 dump_printf_loc (MSG_NOTE, vect_location, "marking hybrid: %G",
2612 def_stmt_info->stmt);
2613 STMT_SLP_TYPE (def_stmt_info) = hybrid;
2614 }
2615
2616 return NULL_TREE;
2617 }
2618
2619 static tree
2620 vect_detect_hybrid_slp_2 (gimple_stmt_iterator *gsi, bool *handled,
2621 walk_stmt_info *wi)
2622 {
2623 loop_vec_info loop_vinfo = (loop_vec_info) wi->info;
2624 stmt_vec_info use_vinfo = loop_vinfo->lookup_stmt (gsi_stmt (*gsi));
2625 /* If the stmt is in an SLP instance then this isn't a reason
2626 to mark use definitions in other SLP instances as hybrid. */
2627 if (! STMT_SLP_TYPE (use_vinfo)
2628 && (STMT_VINFO_RELEVANT (use_vinfo)
2629 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (use_vinfo)))
2630 && ! (gimple_code (gsi_stmt (*gsi)) == GIMPLE_PHI
2631 && STMT_VINFO_DEF_TYPE (use_vinfo) == vect_reduction_def))
2632 ;
2633 else
2634 *handled = true;
2635 return NULL_TREE;
2636 }
2637
2638 /* Find stmts that must be both vectorized and SLPed. */
2639
2640 void
2641 vect_detect_hybrid_slp (loop_vec_info loop_vinfo)
2642 {
2643 unsigned int i;
2644 vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
2645 slp_instance instance;
2646
2647 DUMP_VECT_SCOPE ("vect_detect_hybrid_slp");
2648
2649 /* First walk all pattern stmts in the loop and mark defs of their uses as
2650 hybrid, because immediate uses inside pattern stmts are not recorded. */
2651 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2652 {
2653 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2654 for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
2655 gsi_next (&gsi))
2656 {
2657 gimple *stmt = gsi_stmt (gsi);
2658 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
2659 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2660 {
2661 walk_stmt_info wi;
2662 memset (&wi, 0, sizeof (wi));
2663 wi.info = loop_vinfo;
2664 gimple_stmt_iterator gsi2
2665 = gsi_for_stmt (STMT_VINFO_RELATED_STMT (stmt_info)->stmt);
2666 walk_gimple_stmt (&gsi2, vect_detect_hybrid_slp_2,
2667 vect_detect_hybrid_slp_1, &wi);
2668 walk_gimple_seq (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info),
2669 vect_detect_hybrid_slp_2,
2670 vect_detect_hybrid_slp_1, &wi);
2671 }
2672 }
2673 }
2674
2675 /* Then walk the SLP instance trees marking stmts with uses in
2676 non-SLP stmts as hybrid, also propagating hybrid down the
2677 SLP tree, collecting the above info on-the-fly. */
2678 hash_map<slp_tree, unsigned> visited;
2679 FOR_EACH_VEC_ELT (slp_instances, i, instance)
2680 {
2681 for (unsigned i = 0; i < SLP_INSTANCE_GROUP_SIZE (instance); ++i)
2682 vect_detect_hybrid_slp_stmts (SLP_INSTANCE_TREE (instance),
2683 i, pure_slp, visited);
2684 }
2685 }
2686
2687
2688 /* Initialize a bb_vec_info struct for the statements between
2689 REGION_BEGIN_IN (inclusive) and REGION_END_IN (exclusive). */
2690
2691 _bb_vec_info::_bb_vec_info (gimple_stmt_iterator region_begin_in,
2692 gimple_stmt_iterator region_end_in,
2693 vec_info_shared *shared)
2694 : vec_info (vec_info::bb, init_cost (NULL), shared),
2695 bb (gsi_bb (region_begin_in)),
2696 region_begin (region_begin_in),
2697 region_end (region_end_in)
2698 {
2699 gimple_stmt_iterator gsi;
2700
2701 for (gsi = region_begin; gsi_stmt (gsi) != gsi_stmt (region_end);
2702 gsi_next (&gsi))
2703 {
2704 gimple *stmt = gsi_stmt (gsi);
2705 gimple_set_uid (stmt, 0);
2706 add_stmt (stmt);
2707 }
2708
2709 bb->aux = this;
2710 }
2711
2712
2713 /* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the
2714 stmts in the basic block. */
2715
2716 _bb_vec_info::~_bb_vec_info ()
2717 {
2718 for (gimple_stmt_iterator si = region_begin;
2719 gsi_stmt (si) != gsi_stmt (region_end); gsi_next (&si))
2720 /* Reset region marker. */
2721 gimple_set_uid (gsi_stmt (si), -1);
2722
2723 bb->aux = NULL;
2724 }
2725
2726 /* Subroutine of vect_slp_analyze_node_operations. Handle the root of NODE,
2727 given that its child nodes have already been processed, and that
2728 their def types currently match their SLP node's def type. */
2729
2730 static bool
2731 vect_slp_analyze_node_operations_1 (vec_info *vinfo, slp_tree node,
2732 slp_instance node_instance,
2733 stmt_vector_for_cost *cost_vec)
2734 {
2735 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2736 gcc_assert (STMT_SLP_TYPE (stmt_info) != loop_vect);
2737
2738 /* For BB vectorization vector types are assigned here.
2739 Memory accesses already got their vector type assigned
2740 in vect_analyze_data_refs. */
2741 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2742 if (bb_vinfo && STMT_VINFO_VECTYPE (stmt_info) == boolean_type_node)
2743 {
2744 tree vectype = vect_get_mask_type_for_stmt (stmt_info, node);
2745 if (!vectype)
2746 /* vect_get_mask_type_for_stmt has already explained the
2747 failure. */
2748 return false;
2749
2750 stmt_vec_info sstmt_info;
2751 unsigned int i;
2752 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, sstmt_info)
2753 STMT_VINFO_VECTYPE (sstmt_info) = vectype;
2754 }
2755
2756 /* Calculate the number of vector statements to be created for the
2757 scalar stmts in this node. For SLP reductions it is equal to the
2758 number of vector statements in the children (which has already been
2759 calculated by the recursive call). Otherwise it is the number of
2760 scalar elements in one scalar iteration (DR_GROUP_SIZE) multiplied by
2761 VF divided by the number of elements in a vector. */
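  /* Example (illustrative): GROUP_SIZE == 4, VF == 8 and a vector of 8
     elements give 4 * 8 / 8 == 4 vector statements for this node.  */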
2762 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2763 && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
2764 {
2765 for (unsigned i = 0; i < SLP_TREE_CHILDREN (node).length (); ++i)
2766 if (SLP_TREE_DEF_TYPE (SLP_TREE_CHILDREN (node)[i]) == vect_internal_def)
2767 {
2768 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
2769 = SLP_TREE_NUMBER_OF_VEC_STMTS (SLP_TREE_CHILDREN (node)[i]);
2770 break;
2771 }
2772 }
2773 else
2774 {
2775 poly_uint64 vf;
2776 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
2777 vf = loop_vinfo->vectorization_factor;
2778 else
2779 vf = 1;
2780 unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (node_instance);
2781 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2782 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
2783 = vect_get_num_vectors (vf * group_size, vectype);
2784 }
2785
2786 bool dummy;
2787 return vect_analyze_stmt (stmt_info, &dummy, node, node_instance, cost_vec);
2788 }
2789
2790 /* Try to build NODE from scalars, returning true on success.
2791 NODE_INSTANCE is the SLP instance that contains NODE. */
2792
2793 static bool
2794 vect_slp_convert_to_external (vec_info *vinfo, slp_tree node,
2795 slp_instance node_instance)
2796 {
2797 stmt_vec_info stmt_info;
2798 unsigned int i;
2799
2800 if (!is_a <bb_vec_info> (vinfo)
2801 || node == SLP_INSTANCE_TREE (node_instance)
2802 || vect_contains_pattern_stmt_p (SLP_TREE_SCALAR_STMTS (node)))
2803 return false;
2804
2805 if (dump_enabled_p ())
2806 dump_printf_loc (MSG_NOTE, vect_location,
2807 "Building vector operands from scalars instead\n");
2808
2809 /* Don't remove and free the child nodes here, since they could be
2810 referenced by other structures. The analysis and scheduling phases
2811 (need to) ignore child nodes of anything that isn't vect_internal_def. */
2812 unsigned int group_size = SLP_TREE_SCALAR_STMTS (node).length ();
2813 SLP_TREE_DEF_TYPE (node) = vect_external_def;
2814 SLP_TREE_SCALAR_OPS (node).safe_grow (group_size);
2815 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2816 {
2817 tree lhs = gimple_get_lhs (vect_orig_stmt (stmt_info)->stmt);
2818 SLP_TREE_SCALAR_OPS (node)[i] = lhs;
2819 }
2820 return true;
2821 }
2822
2823 /* Analyze statements contained in SLP tree NODE after recursively analyzing
2824 the subtree. NODE_INSTANCE contains NODE and VINFO contains INSTANCE.
2825
2826 Return true if the operations are supported. */
2827
2828 static bool
2829 vect_slp_analyze_node_operations (vec_info *vinfo, slp_tree node,
2830 slp_instance node_instance,
2831 hash_set<slp_tree> &visited,
2832 hash_set<slp_tree> &lvisited,
2833 stmt_vector_for_cost *cost_vec)
2834 {
2835 int i, j;
2836 slp_tree child;
2837
2838 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2839 return true;
2840
2841 /* If we already analyzed the exact same set of scalar stmts we're done.
2842 We share the generated vector stmts for those.
2843 The SLP graph is acyclic so not caching whether we failed or succeeded
2844 doesn't result in any issue since we throw away the lvisited set
2845 when we fail. */
2846 if (visited.contains (node)
2847 || lvisited.add (node))
2848 return true;
2849
2850 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2851 if (!vect_slp_analyze_node_operations (vinfo, child, node_instance,
2852 visited, lvisited, cost_vec))
2853 return false;
2854
2855 /* ??? We have to catch the case late where two first scalar stmts appear
2856 in multiple SLP children with different def type and fail. Remember
2857 original def types first since SLP_TREE_DEF_TYPE doesn't necessarily
2858 match it when that is vect_internal_def. */
2859 auto_vec<vect_def_type, 4> dt;
2860 dt.safe_grow (SLP_TREE_CHILDREN (node).length ());
2861 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
2862 if (SLP_TREE_SCALAR_STMTS (child).length () != 0)
2863 dt[j] = STMT_VINFO_DEF_TYPE (SLP_TREE_SCALAR_STMTS (child)[0]);
2864
2865 /* Push SLP node def-type to stmt operands. */
2866 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
2867 if (SLP_TREE_DEF_TYPE (child) != vect_internal_def
2868 && SLP_TREE_SCALAR_STMTS (child).length () != 0)
2869 STMT_VINFO_DEF_TYPE (SLP_TREE_SCALAR_STMTS (child)[0])
2870 = SLP_TREE_DEF_TYPE (child);
2871
2872 /* Check everything worked out. */
2873 bool res = true;
2874 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
2875 if (SLP_TREE_SCALAR_STMTS (child).length () != 0)
2876 {
2877 if (SLP_TREE_DEF_TYPE (child) != vect_internal_def)
2878 {
2879 if (STMT_VINFO_DEF_TYPE (SLP_TREE_SCALAR_STMTS (child)[0])
2880 != SLP_TREE_DEF_TYPE (child))
2881 res = false;
2882 }
2883 else if (STMT_VINFO_DEF_TYPE (SLP_TREE_SCALAR_STMTS (child)[0])
2884 != dt[j])
2885 res = false;
2886 }
2887 if (!res && dump_enabled_p ())
2888 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2889 "not vectorized: same operand with different "
2890 "def type in stmt.\n");
2891
2892 if (res)
2893 res = vect_slp_analyze_node_operations_1 (vinfo, node, node_instance,
2894 cost_vec);
2895
2896 /* Restore def-types. */
2897 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
2898 if (SLP_TREE_SCALAR_STMTS (child).length () != 0)
2899 STMT_VINFO_DEF_TYPE (SLP_TREE_SCALAR_STMTS (child)[0]) = dt[j];
2900
2901 /* If this node can't be vectorized, try pruning the tree here rather
2902 than felling the whole thing. */
2903 if (!res && vect_slp_convert_to_external (vinfo, node, node_instance))
2904 res = true;
2905
2906 return res;
2907 }
2908
2909
2910 /* Analyze statements in SLP instances of VINFO. Return true if the
2911 operations are supported. */
2912
2913 bool
2914 vect_slp_analyze_operations (vec_info *vinfo)
2915 {
2916 slp_instance instance;
2917 int i;
2918
2919 DUMP_VECT_SCOPE ("vect_slp_analyze_operations");
2920
2921 hash_set<slp_tree> visited;
2922 for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
2923 {
2924 hash_set<slp_tree> lvisited;
2925 stmt_vector_for_cost cost_vec;
2926 cost_vec.create (2);
2927 if (!vect_slp_analyze_node_operations (vinfo,
2928 SLP_INSTANCE_TREE (instance),
2929 instance, visited, lvisited,
2930 &cost_vec)
2931 /* Instances with a root stmt require vectorized defs for the
2932 SLP tree root. */
2933 || (SLP_INSTANCE_ROOT_STMT (instance)
2934 && (SLP_TREE_DEF_TYPE (SLP_INSTANCE_TREE (instance))
2935 != vect_internal_def)))
2936 {
2937 slp_tree node = SLP_INSTANCE_TREE (instance);
2938 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2939 if (dump_enabled_p ())
2940 dump_printf_loc (MSG_NOTE, vect_location,
2941 "removing SLP instance operations starting from: %G",
2942 stmt_info->stmt);
2943 vect_free_slp_instance (instance, false);
2944 vinfo->slp_instances.ordered_remove (i);
2945 cost_vec.release ();
2946 }
2947 else
2948 {
2949 for (hash_set<slp_tree>::iterator x = lvisited.begin();
2950 x != lvisited.end(); ++x)
2951 visited.add (*x);
2952 i++;
2953
2954 add_stmt_costs (vinfo->target_cost_data, &cost_vec);
2955 cost_vec.release ();
2956 }
2957 }
2958
2959 return !vinfo->slp_instances.is_empty ();
2960 }
2961
2962
2963 /* Compute the scalar cost of the SLP node NODE and its children
2964 and record it in COST_VEC. Do not account defs that are marked in
2965 LIFE, and update LIFE according to uses of NODE. */
2966
2967 static void
2968 vect_bb_slp_scalar_cost (basic_block bb,
2969 slp_tree node, vec<bool, va_heap> *life,
2970 stmt_vector_for_cost *cost_vec,
2971 hash_set<slp_tree> &visited)
2972 {
2973 unsigned i;
2974 stmt_vec_info stmt_info;
2975 slp_tree child;
2976
2977 if (visited.add (node))
2978 return;
2979
2980 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2981 {
2982 gimple *stmt = stmt_info->stmt;
2983 vec_info *vinfo = stmt_info->vinfo;
2984 ssa_op_iter op_iter;
2985 def_operand_p def_p;
2986
2987 if ((*life)[i])
2988 continue;
2989
2990 /* If there is a non-vectorized use of the defs then the scalar
2991 stmt is kept live in which case we do not account it or any
2992 required defs in the SLP children in the scalar cost. This
2993 way we make the vectorization more costly when compared to
2994 the scalar cost. */
2995 FOR_EACH_SSA_DEF_OPERAND (def_p, stmt, op_iter, SSA_OP_DEF)
2996 {
2997 imm_use_iterator use_iter;
2998 gimple *use_stmt;
2999 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
3000 if (!is_gimple_debug (use_stmt))
3001 {
3002 stmt_vec_info use_stmt_info = vinfo->lookup_stmt (use_stmt);
3003 if (!use_stmt_info || !PURE_SLP_STMT (use_stmt_info))
3004 {
3005 (*life)[i] = true;
3006 BREAK_FROM_IMM_USE_STMT (use_iter);
3007 }
3008 }
3009 }
3010 if ((*life)[i])
3011 continue;
3012
3013 /* Count scalar stmts only once. */
3014 if (gimple_visited_p (stmt))
3015 continue;
3016 gimple_set_visited (stmt, true);
3017
3018 vect_cost_for_stmt kind;
3019 if (STMT_VINFO_DATA_REF (stmt_info))
3020 {
3021 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
3022 kind = scalar_load;
3023 else
3024 kind = scalar_store;
3025 }
3026 else if (vect_nop_conversion_p (stmt_info))
3027 continue;
3028 else
3029 kind = scalar_stmt;
3030 record_stmt_cost (cost_vec, 1, kind, stmt_info, 0, vect_body);
3031 }
3032
3033 auto_vec<bool, 20> subtree_life;
3034 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3035 {
3036 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
3037 {
3038 /* Do not directly pass LIFE to the recursive call, copy it to
3039 confine changes in the callee to the current child/subtree. */
3040 subtree_life.safe_splice (*life);
3041 vect_bb_slp_scalar_cost (bb, child, &subtree_life, cost_vec,
3042 visited);
3043 subtree_life.truncate (0);
3044 }
3045 }
3046 }
3047
3048 /* Check if vectorization of the basic block is profitable. */
3049
3050 static bool
3051 vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo)
3052 {
3053 vec<slp_instance> slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
3054 slp_instance instance;
3055 int i;
3056 unsigned int vec_inside_cost = 0, vec_outside_cost = 0, scalar_cost = 0;
3057 unsigned int vec_prologue_cost = 0, vec_epilogue_cost = 0;
3058
3059 /* Calculate scalar cost. */
3060 stmt_vector_for_cost scalar_costs;
3061 scalar_costs.create (0);
3062 hash_set<slp_tree> visited;
3063 FOR_EACH_VEC_ELT (slp_instances, i, instance)
3064 {
3065 auto_vec<bool, 20> life;
3066 life.safe_grow_cleared (SLP_INSTANCE_GROUP_SIZE (instance));
3067 vect_bb_slp_scalar_cost (BB_VINFO_BB (bb_vinfo),
3068 SLP_INSTANCE_TREE (instance),
3069 &life, &scalar_costs, visited);
3070 }
3071 void *target_cost_data = init_cost (NULL);
3072 add_stmt_costs (target_cost_data, &scalar_costs);
3073 scalar_costs.release ();
3074 unsigned dummy;
3075 finish_cost (target_cost_data, &dummy, &scalar_cost, &dummy);
3076 destroy_cost_data (target_cost_data);
3077
3078 /* Unset visited flag. */
3079 for (gimple_stmt_iterator gsi = bb_vinfo->region_begin;
3080 gsi_stmt (gsi) != gsi_stmt (bb_vinfo->region_end); gsi_next (&gsi))
3081 gimple_set_visited (gsi_stmt (gsi), false);
3082
3083 /* Complete the target-specific cost calculation. */
3084 finish_cost (BB_VINFO_TARGET_COST_DATA (bb_vinfo), &vec_prologue_cost,
3085 &vec_inside_cost, &vec_epilogue_cost);
3086
3087 vec_outside_cost = vec_prologue_cost + vec_epilogue_cost;
3088
3089 if (dump_enabled_p ())
3090 {
3091 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3092 dump_printf (MSG_NOTE, " Vector inside of basic block cost: %d\n",
3093 vec_inside_cost);
3094 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n", vec_prologue_cost);
3095 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n", vec_epilogue_cost);
3096 dump_printf (MSG_NOTE, " Scalar cost of basic block: %d\n", scalar_cost);
3097 }
3098
3099 /* Vectorization is profitable if its cost is no more than the cost of the
3100 scalar version. Note that we err on the vector side for equal cost because
3101 the cost estimate is otherwise quite pessimistic (constant uses are
3102 free on the scalar side but cost a load on the vector side, for
3103 example). */
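  /* E.g. (illustrative): an inside cost of 4 plus an outside cost of 2
     compares equal to a scalar cost of 6, so we still vectorize.  */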
3104 if (vec_outside_cost + vec_inside_cost > scalar_cost)
3105 return false;
3106
3107 return true;
3108 }
3109
3110 /* Find any vectorizable constructors and add them to the grouped_store
3111 array. */
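/* For example (illustrative GIMPLE), the constructor assignment

     v_1 = {a_1, b_1, c_1, d_1};

   where v_1 has type vector(4) int is recorded like a group of four
   scalar stores so that the SLP machinery can vectorize the element
   computations.  */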
3112
3113 static void
3114 vect_slp_check_for_constructors (bb_vec_info bb_vinfo)
3115 {
3116 gimple_stmt_iterator gsi;
3117
3118 for (gsi = bb_vinfo->region_begin;
3119 gsi_stmt (gsi) != gsi_stmt (bb_vinfo->region_end); gsi_next (&gsi))
3120 {
3121 gimple *stmt = gsi_stmt (gsi);
3122
3123 if (is_gimple_assign (stmt)
3124 && gimple_assign_rhs_code (stmt) == CONSTRUCTOR
3125 && TREE_CODE (gimple_assign_lhs (stmt)) == SSA_NAME
3126 && TREE_CODE (TREE_TYPE (gimple_assign_lhs (stmt))) == VECTOR_TYPE)
3127 {
3128 tree rhs = gimple_assign_rhs1 (stmt);
3129
3130 if (CONSTRUCTOR_NELTS (rhs) == 0)
3131 continue;
3132
3133 poly_uint64 subparts = TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs));
3134
3135 if (maybe_ne (subparts, CONSTRUCTOR_NELTS (rhs)))
3136 continue;
3137
3138 if (dump_enabled_p ())
3139 dump_printf_loc (MSG_NOTE, vect_location,
3140 "Found vectorizable constructor: %G\n", stmt);
3141 stmt_vec_info stmt_info = bb_vinfo->lookup_stmt (stmt);
3142 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt_info);
3143 }
3144 }
3145 }
3146
3147 /* Check if the region described by BB_VINFO can be vectorized, returning
3148 true if so. When returning false, set FATAL to true if the same failure
3149 would prevent vectorization at other vector sizes, false if it is still
3150 worth trying other sizes. N_STMTS is the number of statements in the
3151 region. */
3152
3153 static bool
3154 vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, bool &fatal)
3155 {
3156 DUMP_VECT_SCOPE ("vect_slp_analyze_bb");
3157
3158 slp_instance instance;
3159 int i;
3160 poly_uint64 min_vf = 2;
3161
3162 /* The first group of checks is independent of the vector size. */
3163 fatal = true;
3164
3165 /* Analyze the data references. */
3166
3167 if (!vect_analyze_data_refs (bb_vinfo, &min_vf, NULL))
3168 {
3169 if (dump_enabled_p ())
3170 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3171 "not vectorized: unhandled data-ref in basic "
3172 "block.\n");
3173 return false;
3174 }
3175
3176 if (BB_VINFO_DATAREFS (bb_vinfo).length () < 2)
3177 {
3178 if (dump_enabled_p ())
3179 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3180 "not vectorized: not enough data-refs in "
3181 "basic block.\n");
3182 return false;
3183 }
3184
3185 if (!vect_analyze_data_ref_accesses (bb_vinfo))
3186 {
3187 if (dump_enabled_p ())
3188 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3189 "not vectorized: unhandled data access in "
3190 "basic block.\n");
3191 return false;
3192 }
3193
3194 vect_slp_check_for_constructors (bb_vinfo);
3195
3196 /* If there are no grouped stores in the region there is no need
3197 to continue with pattern recog as vect_analyze_slp will fail
3198 anyway. */
3199 if (bb_vinfo->grouped_stores.is_empty ())
3200 {
3201 if (dump_enabled_p ())
3202 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3203 "not vectorized: no grouped stores in "
3204 "basic block.\n");
3205 return false;
3206 }
3207
3208 /* The rest of the analysis below depends on the vector size in some
way, so failures from here on are no longer fatal. */
3209 fatal = false;
3210
3211 vect_pattern_recog (bb_vinfo);
3212
3213 /* Check the SLP opportunities in the basic block, analyze and build SLP
3214 trees. */
3215 if (!vect_analyze_slp (bb_vinfo, n_stmts))
3216 {
3217 if (dump_enabled_p ())
3218 {
3219 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3220 "Failed to SLP the basic block.\n");
3221 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3222 "not vectorized: failed to find SLP opportunities "
3223 "in basic block.\n");
3224 }
3225 return false;
3226 }
3227
3228 vect_record_base_alignments (bb_vinfo);
3229
3230 /* Analyze and verify the alignment of data references and the
3231 dependence in the SLP instances. */
3232 for (i = 0; BB_VINFO_SLP_INSTANCES (bb_vinfo).iterate (i, &instance); )
3233 {
3234 if (! vect_slp_analyze_and_verify_instance_alignment (instance)
3235 || ! vect_slp_analyze_instance_dependence (instance))
3236 {
3237 slp_tree node = SLP_INSTANCE_TREE (instance);
3238 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
3239 if (dump_enabled_p ())
3240 dump_printf_loc (MSG_NOTE, vect_location,
3241 "removing SLP instance operations starting from: %G",
3242 stmt_info->stmt);
3243 vect_free_slp_instance (instance, false);
3244 BB_VINFO_SLP_INSTANCES (bb_vinfo).ordered_remove (i);
3245 continue;
3246 }
3247
3248 /* Mark all the statements that we want to vectorize as pure SLP and
3249 relevant. */
3250 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
3251 vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance));
3252 if (SLP_INSTANCE_ROOT_STMT (instance))
3253 STMT_SLP_TYPE (SLP_INSTANCE_ROOT_STMT (instance)) = pure_slp;
3254
3255 i++;
3256 }
3257 if (! BB_VINFO_SLP_INSTANCES (bb_vinfo).length ())
3258 return false;
3259
3260 if (!vect_slp_analyze_operations (bb_vinfo))
3261 {
3262 if (dump_enabled_p ())
3263 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3264 "not vectorized: bad operation in basic block.\n");
3265 return false;
3266 }
3267
3268 /* Cost model: check if the vectorization is worthwhile. */
3269 if (!unlimited_cost_model (NULL)
3270 && !vect_bb_vectorization_profitable_p (bb_vinfo))
3271 {
3272 if (dump_enabled_p ())
3273 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3274 "not vectorized: vectorization is not "
3275 "profitable.\n");
3276 return false;
3277 }
3278
3279 if (dump_enabled_p ())
3280 dump_printf_loc (MSG_NOTE, vect_location,
3281 "Basic block will be vectorized using SLP\n");
3282 return true;
3283 }
3284
3285 /* Subroutine of vect_slp_bb. Try to vectorize the statements between
3286 REGION_BEGIN (inclusive) and REGION_END (exclusive), returning true
3287 on success. The region has N_STMTS statements and has the datarefs
3288 given by DATAREFS. */
3289
3290 static bool
3291 vect_slp_bb_region (gimple_stmt_iterator region_begin,
3292 gimple_stmt_iterator region_end,
3293 vec<data_reference_p> datarefs,
3294 unsigned int n_stmts)
3295 {
3296 bb_vec_info bb_vinfo;
3297 auto_vector_modes vector_modes;
3298
3299 /* Autodetect the first vector mode we try. */
3300 machine_mode next_vector_mode = VOIDmode;
3301 targetm.vectorize.autovectorize_vector_modes (&vector_modes, false);
3302 unsigned int mode_i = 0;
3303
3304 vec_info_shared shared;
3305
3306 machine_mode autodetected_vector_mode = VOIDmode;
3307 while (1)
3308 {
3309 bool vectorized = false;
3310 bool fatal = false;
3311 bb_vinfo = new _bb_vec_info (region_begin, region_end, &shared);
3312
3313 bool first_time_p = shared.datarefs.is_empty ();
3314 BB_VINFO_DATAREFS (bb_vinfo) = datarefs;
3315 if (first_time_p)
3316 bb_vinfo->shared->save_datarefs ();
3317 else
3318 bb_vinfo->shared->check_datarefs ();
3319 bb_vinfo->vector_mode = next_vector_mode;
3320
3321 if (vect_slp_analyze_bb_1 (bb_vinfo, n_stmts, fatal)
3322 && dbg_cnt (vect_slp))
3323 {
3324 if (dump_enabled_p ())
3325 {
3326 dump_printf_loc (MSG_NOTE, vect_location,
3327 "***** Analysis succeeded with vector mode"
3328 " %s\n", GET_MODE_NAME (bb_vinfo->vector_mode));
3329 dump_printf_loc (MSG_NOTE, vect_location, "SLPing BB part\n");
3330 }
3331
3332 bb_vinfo->shared->check_datarefs ();
3333 vect_schedule_slp (bb_vinfo);
3334
3335 unsigned HOST_WIDE_INT bytes;
3336 if (dump_enabled_p ())
3337 {
3338 if (GET_MODE_SIZE (bb_vinfo->vector_mode).is_constant (&bytes))
3339 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
3340 "basic block part vectorized using %wu byte "
3341 "vectors\n", bytes);
3342 else
3343 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
3344 "basic block part vectorized using variable "
3345 "length vectors\n");
3346 }
3347
3348 vectorized = true;
3349 }
3350 else
3351 {
3352 if (dump_enabled_p ())
3353 dump_printf_loc (MSG_NOTE, vect_location,
3354 "***** Analysis failed with vector mode %s\n",
3355 GET_MODE_NAME (bb_vinfo->vector_mode));
3356 }
3357
3358 if (mode_i == 0)
3359 autodetected_vector_mode = bb_vinfo->vector_mode;
3360
3361 if (!fatal)
3362 while (mode_i < vector_modes.length ()
3363 && vect_chooses_same_modes_p (bb_vinfo, vector_modes[mode_i]))
3364 {
3365 if (dump_enabled_p ())
3366 dump_printf_loc (MSG_NOTE, vect_location,
3367 "***** The result for vector mode %s would"
3368 " be the same\n",
3369 GET_MODE_NAME (vector_modes[mode_i]));
3370 mode_i += 1;
3371 }
3372
3373 delete bb_vinfo;
3374
3375 if (mode_i < vector_modes.length ()
3376 && VECTOR_MODE_P (autodetected_vector_mode)
3377 && (related_vector_mode (vector_modes[mode_i],
3378 GET_MODE_INNER (autodetected_vector_mode))
3379 == autodetected_vector_mode)
3380 && (related_vector_mode (autodetected_vector_mode,
3381 GET_MODE_INNER (vector_modes[mode_i]))
3382 == vector_modes[mode_i]))
3383 {
3384 if (dump_enabled_p ())
3385 dump_printf_loc (MSG_NOTE, vect_location,
3386 "***** Skipping vector mode %s, which would"
3387 " repeat the analysis for %s\n",
3388 GET_MODE_NAME (vector_modes[mode_i]),
3389 GET_MODE_NAME (autodetected_vector_mode));
3390 mode_i += 1;
3391 }
3392
3393 if (vectorized
3394 || mode_i == vector_modes.length ()
3395 || autodetected_vector_mode == VOIDmode
3396 /* If vect_slp_analyze_bb_1 signaled that analysis for all
3397 vector sizes will fail do not bother iterating. */
3398 || fatal)
3399 return vectorized;
3400
3401 /* Try the next vector mode. */
3402 next_vector_mode = vector_modes[mode_i++];
3403 if (dump_enabled_p ())
3404 dump_printf_loc (MSG_NOTE, vect_location,
3405 "***** Re-trying analysis with vector mode %s\n",
3406 GET_MODE_NAME (next_vector_mode));
3407 }
3408 }
3409
3410 /* Main entry for the BB vectorizer. Analyze and transform BB, returning
3411 true if anything in the basic block was vectorized. */
3412
3413 bool
3414 vect_slp_bb (basic_block bb)
3415 {
3416 gimple_stmt_iterator gsi;
3417 bool any_vectorized = false;
3418
3419 gsi = gsi_start_bb (bb);
3420 while (!gsi_end_p (gsi))
3421 {
3422 gimple_stmt_iterator region_begin = gsi;
3423 vec<data_reference_p> datarefs = vNULL;
3424 int insns = 0;
3425
3426 for (; !gsi_end_p (gsi); gsi_next (&gsi))
3427 {
3428 gimple *stmt = gsi_stmt (gsi);
3429 if (is_gimple_debug (stmt))
3430 continue;
3431 insns++;
3432
3433 if (gimple_location (stmt) != UNKNOWN_LOCATION)
3434 vect_location = stmt;
3435
3436 if (!vect_find_stmt_data_reference (NULL, stmt, &datarefs))
3437 break;
3438 }
3439
3440 /* Skip leading unhandled stmts. */
3441 if (gsi_stmt (region_begin) == gsi_stmt (gsi))
3442 {
3443 gsi_next (&gsi);
3444 continue;
3445 }
3446
3447 gimple_stmt_iterator region_end = gsi;
3448
3449 if (insns > param_slp_max_insns_in_bb)
3450 {
3451 if (dump_enabled_p ())
3452 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3453 "not vectorized: too many instructions in "
3454 "basic block.\n");
3455 }
3456 else if (vect_slp_bb_region (region_begin, region_end, datarefs, insns))
3457 any_vectorized = true;
3458
3459 if (gsi_end_p (region_end))
3460 break;
3461
3462 /* Skip the unhandled stmt. */
3463 gsi_next (&gsi);
3464 }
3465
3466 return any_vectorized;
3467 }
3468
3469
3470 /* Return true if the vector type for a constant operand of STMT_VINFO
must be a boolean vector. */
3471
3472 static bool
3473 vect_mask_constant_operand_p (stmt_vec_info stmt_vinfo)
3474 {
3475 enum tree_code code = gimple_expr_code (stmt_vinfo->stmt);
3476 tree op, vectype;
3477 enum vect_def_type dt;
3478
3479 /* For comparisons and COND_EXPRs the type is chosen based
3480 on the other, non-constant comparison operand. */
3481 if (TREE_CODE_CLASS (code) == tcc_comparison)
3482 {
3483 gassign *stmt = as_a <gassign *> (stmt_vinfo->stmt);
3484 op = gimple_assign_rhs1 (stmt);
3485
3486 if (!vect_is_simple_use (op, stmt_vinfo->vinfo, &dt, &vectype))
3487 gcc_unreachable ();
3488
3489 return !vectype || VECTOR_BOOLEAN_TYPE_P (vectype);
3490 }
3491
3492 if (code == COND_EXPR)
3493 {
3494 gassign *stmt = as_a <gassign *> (stmt_vinfo->stmt);
3495 tree cond = gimple_assign_rhs1 (stmt);
3496
3497 if (TREE_CODE (cond) == SSA_NAME)
3498 op = cond;
3499 else
3500 op = TREE_OPERAND (cond, 0);
3501
3502 if (!vect_is_simple_use (op, stmt_vinfo->vinfo, &dt, &vectype))
3503 gcc_unreachable ();
3504
3505 return !vectype || VECTOR_BOOLEAN_TYPE_P (vectype);
3506 }
3507
3508 return VECTOR_BOOLEAN_TYPE_P (STMT_VINFO_VECTYPE (stmt_vinfo));
3509 }
3510
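/* Illustrative GIMPLE for the function above (assumed, not from a
   testcase):

     _1 = a_2 < b_3;
     c_4 = _1 ? 1 : 0;   // c_4 has boolean type

   Here the constants 1 and 0 feed a COND_EXPR whose condition _1 has a
   boolean vector type, so the function returns true and the invariant
   vector for { 1, 0 } is built with a mask type via truth_type_for
   rather than with the stmt's data vectype.  */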
3511 /* Build a variable-length vector in which the elements in ELTS are repeated
3512 to fill NRESULTS vectors of type VECTOR_TYPE. Store the vectors in
3513 RESULTS and add any new instructions to SEQ.
3514
3515 The approach we use is:
3516
3517 (1) Find a vector mode VM with integer elements of mode IM.
3518
3519 (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
3520 ELTS' has mode IM. This involves creating NELTS' VIEW_CONVERT_EXPRs
3521 from small vectors to IM.
3522
3523 (3) Duplicate each ELTS'[I] into a vector of mode VM.
3524
3525 (4) Use a tree of interleaving VEC_PERM_EXPRs to create VMs with the
3526 correct byte contents.
3527
3528 (5) Use VIEW_CONVERT_EXPR to cast the final VMs to the required type.
3529
3530 We try to find the largest IM for which this sequence works, in order
3531 to cut down on the number of interleaves; a worked example of these steps follows the function body. */
3532
3533 void
3534 duplicate_and_interleave (vec_info *vinfo, gimple_seq *seq, tree vector_type,
3535 vec<tree> elts, unsigned int nresults,
3536 vec<tree> &results)
3537 {
3538 unsigned int nelts = elts.length ();
3539 tree element_type = TREE_TYPE (vector_type);
3540
3541 /* (1) Find a vector mode VM with integer elements of mode IM. */
3542 unsigned int nvectors = 1;
3543 tree new_vector_type;
3544 tree permutes[2];
3545 if (!can_duplicate_and_interleave_p (vinfo, nelts, element_type,
3546 &nvectors, &new_vector_type,
3547 permutes))
3548 gcc_unreachable ();
3549
3550 /* Get a vector type that holds ELTS[0:NELTS/NELTS']. */
3551 unsigned int partial_nelts = nelts / nvectors;
3552 tree partial_vector_type = build_vector_type (element_type, partial_nelts);
3553
3554 tree_vector_builder partial_elts;
3555 auto_vec<tree, 32> pieces (nvectors * 2);
3556 pieces.quick_grow (nvectors * 2);
3557 for (unsigned int i = 0; i < nvectors; ++i)
3558 {
3559 /* (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
3560 ELTS' has mode IM. */
3561 partial_elts.new_vector (partial_vector_type, partial_nelts, 1);
3562 for (unsigned int j = 0; j < partial_nelts; ++j)
3563 partial_elts.quick_push (elts[i * partial_nelts + j]);
3564 tree t = gimple_build_vector (seq, &partial_elts);
3565 t = gimple_build (seq, VIEW_CONVERT_EXPR,
3566 TREE_TYPE (new_vector_type), t);
3567
3568 /* (3) Duplicate each ELTS'[I] into a vector of mode VM. */
3569 pieces[i] = gimple_build_vector_from_val (seq, new_vector_type, t);
3570 }
3571
3572 /* (4) Use a tree of VEC_PERM_EXPRs to create a single VM with the
3573 correct byte contents.
3574
3575 We need to repeat the following operation log2(nvectors) times:
3576
3577 out[i * 2] = VEC_PERM_EXPR (in[i], in[i + hi_start], lo_permute);
3578 out[i * 2 + 1] = VEC_PERM_EXPR (in[i], in[i + hi_start], hi_permute);
3579
3580 However, if each input repeats every N elements and the VF is
3581 a multiple of N * 2, the HI result is the same as the LO. */
3582 unsigned int in_start = 0;
3583 unsigned int out_start = nvectors;
3584 unsigned int hi_start = nvectors / 2;
3585 /* A bound on the number of outputs needed to produce NRESULTS results
3586 in the final iteration. */
3587 unsigned int noutputs_bound = nvectors * nresults;
3588 for (unsigned int in_repeat = 1; in_repeat < nvectors; in_repeat *= 2)
3589 {
3590 noutputs_bound /= 2;
3591 unsigned int limit = MIN (noutputs_bound, nvectors);
3592 for (unsigned int i = 0; i < limit; ++i)
3593 {
3594 if ((i & 1) != 0
3595 && multiple_p (TYPE_VECTOR_SUBPARTS (new_vector_type),
3596 2 * in_repeat))
3597 {
3598 pieces[out_start + i] = pieces[out_start + i - 1];
3599 continue;
3600 }
3601
3602 tree output = make_ssa_name (new_vector_type);
3603 tree input1 = pieces[in_start + (i / 2)];
3604 tree input2 = pieces[in_start + (i / 2) + hi_start];
3605 gassign *stmt = gimple_build_assign (output, VEC_PERM_EXPR,
3606 input1, input2,
3607 permutes[i & 1]);
3608 gimple_seq_add_stmt (seq, stmt);
3609 pieces[out_start + i] = output;
3610 }
3611 std::swap (in_start, out_start);
3612 }
3613
3614 /* (5) Use VIEW_CONVERT_EXPR to cast the final VM to the required type. */
3615 results.reserve (nresults);
3616 for (unsigned int i = 0; i < nresults; ++i)
3617 if (i < nvectors)
3618 results.quick_push (gimple_build (seq, VIEW_CONVERT_EXPR, vector_type,
3619 pieces[in_start + i]));
3620 else
3621 results.quick_push (results[i - nvectors]);
3622 }
3623
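/* Worked example of the steps above, under assumed mode choices (the
   real ones come from can_duplicate_and_interleave_p): let
   ELTS = { a, b, c, d } of type int, VECTOR_TYPE = VNx4SI, and assume
   NVECTORS = 2 with IM = DImode, so NEW_VECTOR_TYPE = VNx2DI.

   (2) build V2SI vectors { a, b } and { c, d } and view-convert each
       to a DImode scalar;
   (3) splat them: pieces[0] = { ab, ab, ... }, pieces[1] = { cd, cd, ... };
   (4) one interleave round (log2 (2) = 1) yields { ab, cd, ab, cd, ... };
   (5) view-converting that to VNx4SI gives { a, b, c, d, a, b, c, d, ... },
       i.e. the four elements repeated to fill the variable-length
       vector.  */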
3624
3625 /* For constant and loop-invariant defs of SLP_NODE this function creates
3626 the (vector) defs (VEC_OPRNDS) that will be used in the vectorized stmts.
3627 OP_NODE is the SLP node holding the scalar
3628 operands. */
3629
3630 static void
3631 vect_get_constant_vectors (slp_tree op_node, slp_tree slp_node,
3632 vec<tree> *vec_oprnds)
3633 {
3634 stmt_vec_info stmt_vinfo = SLP_TREE_SCALAR_STMTS (slp_node)[0];
3635 vec_info *vinfo = stmt_vinfo->vinfo;
3636 unsigned HOST_WIDE_INT nunits;
3637 tree vec_cst;
3638 unsigned j, number_of_places_left_in_vector;
3639 tree vector_type;
3640 tree vop;
3641 int group_size = op_node->ops.length ();
3642 unsigned int vec_num, i;
3643 unsigned number_of_copies = 1;
3644 bool constant_p;
3645 tree neutral_op = NULL;
3646 gimple_seq ctor_seq = NULL;
3647 auto_vec<tree, 16> permute_results;
3648
3649 /* ??? SLP analysis should compute the vector type for the
3650 constant / invariant and store it in the SLP node. */
3651 tree op = op_node->ops[0];
3652 /* Check if vector type is a boolean vector. */
3653 tree stmt_vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
3654 if (VECT_SCALAR_BOOLEAN_TYPE_P (TREE_TYPE (op))
3655 && vect_mask_constant_operand_p (stmt_vinfo))
3656 vector_type = truth_type_for (stmt_vectype);
3657 else
3658 vector_type = get_vectype_for_scalar_type (vinfo, TREE_TYPE (op), op_node);
3659
3660 /* ??? For lane-reducing ops we should also have the required number
3661 of vector stmts initialized rather than second-guessing here. */
3662 unsigned int number_of_vectors;
3663 if (is_gimple_assign (stmt_vinfo->stmt)
3664 && (gimple_assign_rhs_code (stmt_vinfo->stmt) == SAD_EXPR
3665 || gimple_assign_rhs_code (stmt_vinfo->stmt) == DOT_PROD_EXPR
3666 || gimple_assign_rhs_code (stmt_vinfo->stmt) == WIDEN_SUM_EXPR))
3667 number_of_vectors = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
3668 else
3669 number_of_vectors
3670 = vect_get_num_vectors (SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node)
3671 * TYPE_VECTOR_SUBPARTS (stmt_vectype),
3672 vector_type);
3673 vec_oprnds->create (number_of_vectors);
3674 auto_vec<tree> voprnds (number_of_vectors);
3675
3676 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
3677 the created vectors. It is greater than 1 if unrolling is performed.
3678
3679 For example, we have two scalar operands, s1 and s2 (e.g., group of
3680 strided accesses of size two), while NUNITS is four (i.e., four scalars
3681 of this type can be packed in a vector). The output vector will contain
3682 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
3683 will be 2).
3684
3685 If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
3686 containing the operands.
3687
3688 For example, NUNITS is four as before, and the group size is 8
3689 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
3690 {s5, s6, s7, s8}. */
3691
3692 /* When using duplicate_and_interleave, we just need one element for
3693 each scalar statement. */
3694 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
3695 nunits = group_size;
3696
3697 number_of_copies = nunits * number_of_vectors / group_size;
3698
3699 number_of_places_left_in_vector = nunits;
3700 constant_p = true;
3701 tree_vector_builder elts (vector_type, nunits, 1);
3702 elts.quick_grow (nunits);
3703 bool place_after_defs = false;
3704 for (j = 0; j < number_of_copies; j++)
3705 {
3706 for (i = group_size - 1; op_node->ops.iterate (i, &op); i--)
3707 {
3708 /* Create 'vect_ = {op0,op1,...,opn}'. */
3709 number_of_places_left_in_vector--;
3710 tree orig_op = op;
3711 if (!types_compatible_p (TREE_TYPE (vector_type), TREE_TYPE (op)))
3712 {
3713 if (CONSTANT_CLASS_P (op))
3714 {
3715 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
3716 {
3717 /* Can't use VIEW_CONVERT_EXPR for booleans because
3718 of possibly different sizes of scalar value and
3719 vector element. */
3720 if (integer_zerop (op))
3721 op = build_int_cst (TREE_TYPE (vector_type), 0);
3722 else if (integer_onep (op))
3723 op = build_all_ones_cst (TREE_TYPE (vector_type));
3724 else
3725 gcc_unreachable ();
3726 }
3727 else
3728 op = fold_unary (VIEW_CONVERT_EXPR,
3729 TREE_TYPE (vector_type), op);
3730 gcc_assert (op && CONSTANT_CLASS_P (op));
3731 }
3732 else
3733 {
3734 tree new_temp = make_ssa_name (TREE_TYPE (vector_type));
3735 gimple *init_stmt;
3736 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
3737 {
3738 tree true_val
3739 = build_all_ones_cst (TREE_TYPE (vector_type));
3740 tree false_val
3741 = build_zero_cst (TREE_TYPE (vector_type));
3742 gcc_assert (INTEGRAL_TYPE_P (TREE_TYPE (op)));
3743 init_stmt = gimple_build_assign (new_temp, COND_EXPR,
3744 op, true_val,
3745 false_val);
3746 }
3747 else
3748 {
3749 op = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (vector_type),
3750 op);
3751 init_stmt
3752 = gimple_build_assign (new_temp, VIEW_CONVERT_EXPR,
3753 op);
3754 }
3755 gimple_seq_add_stmt (&ctor_seq, init_stmt);
3756 op = new_temp;
3757 }
3758 }
3759 elts[number_of_places_left_in_vector] = op;
3760 if (!CONSTANT_CLASS_P (op))
3761 constant_p = false;
3762 if (TREE_CODE (orig_op) == SSA_NAME
3763 && !SSA_NAME_IS_DEFAULT_DEF (orig_op)
3764 && STMT_VINFO_BB_VINFO (stmt_vinfo)
3765 && (STMT_VINFO_BB_VINFO (stmt_vinfo)->bb
3766 == gimple_bb (SSA_NAME_DEF_STMT (orig_op))))
3767 place_after_defs = true;
3768
3769 if (number_of_places_left_in_vector == 0)
3770 {
3771 if (constant_p
3772 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
3773 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
3774 vec_cst = gimple_build_vector (&ctor_seq, &elts);
3775 else
3776 {
3777 if (permute_results.is_empty ())
3778 duplicate_and_interleave (vinfo, &ctor_seq, vector_type,
3779 elts, number_of_vectors,
3780 permute_results);
3781 vec_cst = permute_results[number_of_vectors - j - 1];
3782 }
3783 tree init;
3784 gimple_stmt_iterator gsi;
3785 if (place_after_defs)
3786 {
3787 stmt_vec_info last_stmt_info
3788 = vect_find_last_scalar_stmt_in_slp (slp_node);
3789 gsi = gsi_for_stmt (last_stmt_info->stmt);
3790 init = vect_init_vector (stmt_vinfo, vec_cst, vector_type,
3791 &gsi);
3792 }
3793 else
3794 init = vect_init_vector (stmt_vinfo, vec_cst, vector_type,
3795 NULL);
3796 if (ctor_seq != NULL)
3797 {
3798 gsi = gsi_for_stmt (SSA_NAME_DEF_STMT (init));
3799 gsi_insert_seq_before (&gsi, ctor_seq, GSI_SAME_STMT);
3800 ctor_seq = NULL;
3801 }
3802 voprnds.quick_push (init);
3803 place_after_defs = false;
3804 number_of_places_left_in_vector = nunits;
3805 constant_p = true;
3806 elts.new_vector (vector_type, nunits, 1);
3807 elts.quick_grow (nunits);
3808 }
3809 }
3810 }
3811
3812 /* Since the vectors are created in the reverse order, we should invert
3813 them. */
3814 vec_num = voprnds.length ();
3815 for (j = vec_num; j != 0; j--)
3816 {
3817 vop = voprnds[j - 1];
3818 vec_oprnds->quick_push (vop);
3819 }
3820
3821 /* If VF is greater than the unrolling factor needed for the SLP
3822 group of stmts, the NUMBER_OF_VECTORS to be created is greater than
3823 NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
3824 to replicate the vectors. */
3825 while (number_of_vectors > vec_oprnds->length ())
3826 {
3827 tree neutral_vec = NULL;
3828
3829 if (neutral_op)
3830 {
3831 if (!neutral_vec)
3832 neutral_vec = build_vector_from_val (vector_type, neutral_op);
3833
3834 vec_oprnds->quick_push (neutral_vec);
3835 }
3836 else
3837 {
3838 for (i = 0; vec_oprnds->iterate (i, &vop) && i < vec_num; i++)
3839 vec_oprnds->quick_push (vop);
3840 }
3841 }
3842 }
3843
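/* Worked example for the function above (assumed numbers, not from a
   testcase): for an SLP group of two stmts whose constant operands are
   { 3, 7 }, with a V4SI vector type and one vector stmt needed,
   NUMBER_OF_COPIES = 4 * 1 / 2 = 2.  The elements are filled in from
   the back of the vector, so the single invariant vector pushed to
   VEC_OPRNDS is { 3, 7, 3, 7 }.  */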
3844
3845 /* Get the vectorized definitions from SLP_NODE, which contains the
3846 corresponding vectorized def-stmts. */
3847
3848 static void
3849 vect_get_slp_vect_defs (slp_tree slp_node, vec<tree> *vec_oprnds)
3850 {
3851 stmt_vec_info vec_def_stmt_info;
3852 unsigned int i;
3853
3854 gcc_assert (SLP_TREE_VEC_STMTS (slp_node).exists ());
3855
3856 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (slp_node), i, vec_def_stmt_info)
3857 vec_oprnds->quick_push (gimple_get_lhs (vec_def_stmt_info->stmt));
3858 }
3859
3860
3861 /* Get N vectorized definitions for SLP_NODE.
3862 If the scalar definitions are loop invariants or constants, collect them and
3863 call vect_get_constant_vectors() to create vector stmts.
3864 Otherwise, the def-stmts must already be vectorized and the vectorized stmts
3865 must be stored in the corresponding child of SLP_NODE, and we call
3866 vect_get_slp_vect_defs () to retrieve them. */
3867
3868 void
3869 vect_get_slp_defs (slp_tree slp_node, vec<vec<tree> > *vec_oprnds, unsigned n)
3870 {
3871 if (n == -1U)
3872 n = SLP_TREE_CHILDREN (slp_node).length ();
3873
3874 for (unsigned i = 0; i < n; ++i)
3875 {
3876 slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
3877
3878 vec<tree> vec_defs = vNULL;
3879
3880 /* For each operand we check if it has vectorized definitions in a child
3881 node or we need to create them (for invariants and constants). */
3882 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
3883 {
3884 vec_defs.create (SLP_TREE_NUMBER_OF_VEC_STMTS (child));
3885 vect_get_slp_vect_defs (child, &vec_defs);
3886 }
3887 else
3888 vect_get_constant_vectors (child, slp_node, &vec_defs);
3889
3890 vec_oprnds->quick_push (vec_defs);
3891 }
3892 }
3893
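/* For example (illustrative only), for an SLP node computing
   { c[0] = a[0] + 5; c[1] = a[1] + 5; } the first operand comes from an
   internal-def child, so its vector defs are collected from that
   child's SLP_TREE_VEC_STMTS, while the constant operand { 5, 5 } has
   its vector built by vect_get_constant_vectors; VEC_OPRNDS then holds
   one vector def list per operand.  */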
3894 /* Generate vector permute statements from a list of loads in DR_CHAIN.
3895 If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
3896 permute statements for the SLP node NODE of the SLP instance
3897 SLP_NODE_INSTANCE. */
3898
3899 bool
3900 vect_transform_slp_perm_load (slp_tree node, vec<tree> dr_chain,
3901 gimple_stmt_iterator *gsi, poly_uint64 vf,
3902 slp_instance slp_node_instance, bool analyze_only,
3903 unsigned *n_perms)
3904 {
3905 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
3906 vec_info *vinfo = stmt_info->vinfo;
3907 int vec_index = 0;
3908 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3909 unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (slp_node_instance);
3910 unsigned int mask_element;
3911 machine_mode mode;
3912
3913 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
3914 return false;
3915
3916 stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
3917
3918 mode = TYPE_MODE (vectype);
3919 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
3920
3921 /* Initialize the vect stmts of NODE to properly insert the generated
3922 stmts later. */
3923 if (! analyze_only)
3924 for (unsigned i = SLP_TREE_VEC_STMTS (node).length ();
3925 i < SLP_TREE_NUMBER_OF_VEC_STMTS (node); i++)
3926 SLP_TREE_VEC_STMTS (node).quick_push (NULL);
3927
3928 /* Generate permutation masks for every NODE. The number of masks for
3929 each NODE is equal to GROUP_SIZE.
3930 E.g., we have a group of three nodes with three loads from the same
3931 location in each node, and the vector size is 4. I.e., we have an
3932 a0b0c0a1b1c1... sequence and we need to create the following vectors:
3933 for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
3934 for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
3935 ...
3936
3937 The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}.
3938 The last mask is illegal since we assume two operands for the permute
3939 operation, and the mask element values can't be outside that range.
3940 Hence, the last mask must be converted into {2,5,5,5}.
3941 For the first two permutations we need the first and the second input
3942 vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
3943 we need the second and the third vectors: {b1,c1,a2,b2} and
3944 {c2,a3,b3,c3}. */
3945
3946 int vect_stmts_counter = 0;
3947 unsigned int index = 0;
3948 int first_vec_index = -1;
3949 int second_vec_index = -1;
3950 bool noop_p = true;
3951 *n_perms = 0;
3952
3953 vec_perm_builder mask;
3954 unsigned int nelts_to_build;
3955 unsigned int nvectors_per_build;
3956 bool repeating_p = (group_size == DR_GROUP_SIZE (stmt_info)
3957 && multiple_p (nunits, group_size));
3958 if (repeating_p)
3959 {
3960 /* A single vector contains a whole number of copies of the node, so:
3961 (a) all permutes can use the same mask; and
3962 (b) the permutes only need a single vector input. */
3963 mask.new_vector (nunits, group_size, 3);
3964 nelts_to_build = mask.encoded_nelts ();
3965 nvectors_per_build = SLP_TREE_VEC_STMTS (node).length ();
3966 }
3967 else
3968 {
3969 /* We need to construct a separate mask for each vector statement. */
3970 unsigned HOST_WIDE_INT const_nunits, const_vf;
3971 if (!nunits.is_constant (&const_nunits)
3972 || !vf.is_constant (&const_vf))
3973 return false;
3974 mask.new_vector (const_nunits, const_nunits, 1);
3975 nelts_to_build = const_vf * group_size;
3976 nvectors_per_build = 1;
3977 }
3978
3979 unsigned int count = mask.encoded_nelts ();
3980 mask.quick_grow (count);
3981 vec_perm_indices indices;
3982
3983 for (unsigned int j = 0; j < nelts_to_build; j++)
3984 {
3985 unsigned int iter_num = j / group_size;
3986 unsigned int stmt_num = j % group_size;
3987 unsigned int i = (iter_num * DR_GROUP_SIZE (stmt_info)
3988 + SLP_TREE_LOAD_PERMUTATION (node)[stmt_num]);
3989 if (repeating_p)
3990 {
3991 first_vec_index = 0;
3992 mask_element = i;
3993 }
3994 else
3995 {
3996 /* Enforced before the loop when !repeating_p. */
3997 unsigned int const_nunits = nunits.to_constant ();
3998 vec_index = i / const_nunits;
3999 mask_element = i % const_nunits;
4000 if (vec_index == first_vec_index
4001 || first_vec_index == -1)
4002 {
4003 first_vec_index = vec_index;
4004 }
4005 else if (vec_index == second_vec_index
4006 || second_vec_index == -1)
4007 {
4008 second_vec_index = vec_index;
4009 mask_element += const_nunits;
4010 }
4011 else
4012 {
4013 if (dump_enabled_p ())
4014 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4015 "permutation requires at "
4016 "least three vectors %G",
4017 stmt_info->stmt);
4018 gcc_assert (analyze_only);
4019 return false;
4020 }
4021
4022 gcc_assert (mask_element < 2 * const_nunits);
4023 }
4024
4025 if (mask_element != index)
4026 noop_p = false;
4027 mask[index++] = mask_element;
4028
4029 if (index == count && !noop_p)
4030 {
4031 indices.new_vector (mask, second_vec_index == -1 ? 1 : 2, nunits);
4032 if (!can_vec_perm_const_p (mode, indices))
4033 {
4034 if (dump_enabled_p ())
4035 {
4036 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
4037 vect_location,
4038 "unsupported vect permute { ");
4039 for (i = 0; i < count; ++i)
4040 {
4041 dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
4042 dump_printf (MSG_MISSED_OPTIMIZATION, " ");
4043 }
4044 dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
4045 }
4046 gcc_assert (analyze_only);
4047 return false;
4048 }
4049
4050 ++*n_perms;
4051 }
4052
4053 if (index == count)
4054 {
4055 if (!analyze_only)
4056 {
4057 tree mask_vec = NULL_TREE;
4058
4059 if (! noop_p)
4060 mask_vec = vect_gen_perm_mask_checked (vectype, indices);
4061
4062 if (second_vec_index == -1)
4063 second_vec_index = first_vec_index;
4064
4065 for (unsigned int ri = 0; ri < nvectors_per_build; ++ri)
4066 {
4067 /* Generate the permute statement if necessary. */
4068 tree first_vec = dr_chain[first_vec_index + ri];
4069 tree second_vec = dr_chain[second_vec_index + ri];
4070 stmt_vec_info perm_stmt_info;
4071 if (! noop_p)
4072 {
4073 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
4074 tree perm_dest
4075 = vect_create_destination_var (gimple_assign_lhs (stmt),
4076 vectype);
4077 perm_dest = make_ssa_name (perm_dest);
4078 gassign *perm_stmt
4079 = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
4080 first_vec, second_vec,
4081 mask_vec);
4082 perm_stmt_info
4083 = vect_finish_stmt_generation (stmt_info, perm_stmt,
4084 gsi);
4085 }
4086 else
4087 /* If the mask was NULL_TREE, generate the requested
4088 identity transform. */
4089 perm_stmt_info = vinfo->lookup_def (first_vec);
4090
4091 /* Store the vector statement in NODE. */
4092 SLP_TREE_VEC_STMTS (node)[vect_stmts_counter++]
4093 = perm_stmt_info;
4094 }
4095 }
4096
4097 index = 0;
4098 first_vec_index = -1;
4099 second_vec_index = -1;
4100 noop_p = true;
4101 }
4102 }
4103
4104 return true;
4105 }
4106
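/* Illustrative note for the function above (assumed numbers): with
   group size 2, load permutation { 1, 0 } and 4-element vectors,
   REPEATING_P holds, so a single stepped mask encoding
   { 1, 0, 3, 2, ... } is built once and reused for every vector stmt
   of the node, and each VEC_PERM_EXPR needs only one input vector
   (SECOND_VEC_INDEX stays -1 and is aliased to the first).  */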
4107 /* Vectorize SLP instance tree in postorder. */
4108
4109 static void
4110 vect_schedule_slp_instance (slp_tree node, slp_instance instance)
4111 {
4112 gimple_stmt_iterator si;
4113 stmt_vec_info stmt_info;
4114 unsigned int group_size;
4115 tree vectype;
4116 int i, j;
4117 slp_tree child;
4118
4119 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
4120 return;
4121
4122 /* See if we have already vectorized the node in the graph of the
4123 SLP instance. */
4124 if (SLP_TREE_VEC_STMTS (node).exists ())
4125 return;
4126
4127 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4128 vect_schedule_slp_instance (child, instance);
4129
4130 /* Push SLP node def-type to stmts. */
4131 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4132 if (SLP_TREE_DEF_TYPE (child) != vect_internal_def)
4133 {
4134 stmt_vec_info child_stmt_info;
4135 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (child), j, child_stmt_info)
4136 STMT_VINFO_DEF_TYPE (child_stmt_info) = SLP_TREE_DEF_TYPE (child);
4137 }
4138
4139 stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
4140
4141 /* VECTYPE is the type of the destination. */
4142 vectype = STMT_VINFO_VECTYPE (stmt_info);
4143 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
4144 group_size = SLP_INSTANCE_GROUP_SIZE (instance);
4145
4146 gcc_assert (SLP_TREE_NUMBER_OF_VEC_STMTS (node) != 0);
4147 SLP_TREE_VEC_STMTS (node).create (SLP_TREE_NUMBER_OF_VEC_STMTS (node));
4148
4149 if (dump_enabled_p ())
4150 dump_printf_loc (MSG_NOTE, vect_location,
4151 "------>vectorizing SLP node starting from: %G",
4152 stmt_info->stmt);
4153
4154 /* Vectorized stmts go before the last scalar stmt, which is where
4155 all uses are ready. */
4156 stmt_vec_info last_stmt_info = vect_find_last_scalar_stmt_in_slp (node);
4157 si = gsi_for_stmt (last_stmt_info->stmt);
4158
4159 /* Handle two-operation SLP nodes by vectorizing the group with
4160 both operations and then performing a merge. */
4161 if (SLP_TREE_TWO_OPERATORS (node))
4162 {
4163 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
4164 enum tree_code code0 = gimple_assign_rhs_code (stmt);
4165 enum tree_code ocode = ERROR_MARK;
4166 stmt_vec_info ostmt_info;
4167 vec_perm_builder mask (group_size, group_size, 1);
4168 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, ostmt_info)
4169 {
4170 gassign *ostmt = as_a <gassign *> (ostmt_info->stmt);
4171 if (gimple_assign_rhs_code (ostmt) != code0)
4172 {
4173 mask.quick_push (1);
4174 ocode = gimple_assign_rhs_code (ostmt);
4175 }
4176 else
4177 mask.quick_push (0);
4178 }
4179 if (ocode != ERROR_MARK)
4180 {
4181 vec<stmt_vec_info> v0;
4182 vec<stmt_vec_info> v1;
4183 unsigned j;
4184 tree tmask = NULL_TREE;
4185 vect_transform_stmt (stmt_info, &si, node, instance);
4186 v0 = SLP_TREE_VEC_STMTS (node).copy ();
4187 SLP_TREE_VEC_STMTS (node).truncate (0);
4188 gimple_assign_set_rhs_code (stmt, ocode);
4189 vect_transform_stmt (stmt_info, &si, node, instance);
4190 gimple_assign_set_rhs_code (stmt, code0);
4191 v1 = SLP_TREE_VEC_STMTS (node).copy ();
4192 SLP_TREE_VEC_STMTS (node).truncate (0);
4193 tree meltype = build_nonstandard_integer_type
4194 (GET_MODE_BITSIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype))), 1);
4195 tree mvectype = get_same_sized_vectype (meltype, vectype);
4196 unsigned k = 0, l;
4197 for (j = 0; j < v0.length (); ++j)
4198 {
4199 /* Enforced by vect_build_slp_tree, which rejects variable-length
4200 vectors for SLP_TREE_TWO_OPERATORS. */
4201 unsigned int const_nunits = nunits.to_constant ();
4202 tree_vector_builder melts (mvectype, const_nunits, 1);
4203 for (l = 0; l < const_nunits; ++l)
4204 {
4205 if (k >= group_size)
4206 k = 0;
4207 tree t = build_int_cst (meltype,
4208 mask[k++] * const_nunits + l);
4209 melts.quick_push (t);
4210 }
4211 tmask = melts.build ();
4212
4213 /* ??? Not all targets support a VEC_PERM_EXPR with a
4214 constant mask that would translate to a vec_merge RTX
4215 (with their vec_perm_const_ok). We can either not
4216 vectorize in that case or let veclower do its job.
4217 Unfortunately that isn't too great and at least for
4218 plus/minus we'd eventually like to match targets
4219 vector addsub instructions. */
4220 gimple *vstmt;
4221 vstmt = gimple_build_assign (make_ssa_name (vectype),
4222 VEC_PERM_EXPR,
4223 gimple_assign_lhs (v0[j]->stmt),
4224 gimple_assign_lhs (v1[j]->stmt),
4225 tmask);
4226 SLP_TREE_VEC_STMTS (node).quick_push
4227 (vect_finish_stmt_generation (stmt_info, vstmt, &si));
4228 }
4229 v0.release ();
4230 v1.release ();
4231 return;
4232 }
4233 }
4234 vect_transform_stmt (stmt_info, &si, node, instance);
4235
4236 /* Restore stmt def-types. */
4237 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4238 if (SLP_TREE_DEF_TYPE (child) != vect_internal_def)
4239 {
4240 stmt_vec_info child_stmt_info;
4241 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (child), j, child_stmt_info)
4242 STMT_VINFO_DEF_TYPE (child_stmt_info) = vect_internal_def;
4243 }
4244 }
4245
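/* Worked example for the SLP_TREE_TWO_OPERATORS path above (assumed
   numbers, not from a testcase): for the group
   { c[0] = a[0] + b[0]; c[1] = a[1] - b[1]; } with V4SI vectors,
   MASK is { 0, 1 }; the node is vectorized once with PLUS_EXPR (v0)
   and once with MINUS_EXPR (v1), and the merge mask built above is
   { 0, 5, 2, 7 }: even lanes from v0, odd lanes from v1, i.e. the
   classic addsub pattern.  */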
4246 /* Replace the scalar calls in SLP node NODE by setting their lhs to zero.
4247 For loop vectorization this is done in vectorizable_call, but for SLP
4248 it needs to be deferred until end of vect_schedule_slp, because multiple
4249 SLP instances may refer to the same scalar stmt. */
4250
4251 static void
4252 vect_remove_slp_scalar_calls (slp_tree node, hash_set<slp_tree> &visited)
4253 {
4254 gimple *new_stmt;
4255 gimple_stmt_iterator gsi;
4256 int i;
4257 slp_tree child;
4258 tree lhs;
4259 stmt_vec_info stmt_info;
4260
4261 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
4262 return;
4263
4264 if (visited.add (node))
4265 return;
4266
4267 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4268 vect_remove_slp_scalar_calls (child, visited);
4269
4270 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
4271 {
4272 gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt);
4273 if (!stmt || gimple_bb (stmt) == NULL)
4274 continue;
4275 if (is_pattern_stmt_p (stmt_info)
4276 || !PURE_SLP_STMT (stmt_info))
4277 continue;
4278 lhs = gimple_call_lhs (stmt);
4279 new_stmt = gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs)));
4280 gsi = gsi_for_stmt (stmt);
4281 stmt_info->vinfo->replace_stmt (&gsi, stmt_info, new_stmt);
4282 SSA_NAME_DEF_STMT (gimple_assign_lhs (new_stmt)) = new_stmt;
4283 }
4284 }
4285
4286 static void
4287 vect_remove_slp_scalar_calls (slp_tree node)
4288 {
4289 hash_set<slp_tree> visited;
4290 vect_remove_slp_scalar_calls (node, visited);
4291 }
4292
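/* For example (illustrative stmt), once x_1 = sqrtf (a_2) has been
   vectorized as part of a loop SLP instance, the scalar call is
   replaced here by x_1 = 0.0f; the zero merely keeps the IL valid
   until the dead scalar stmt is removed.  */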
4293 /* Vectorize the instance root. */
4294
4295 void
4296 vectorize_slp_instance_root_stmt (slp_tree node, slp_instance instance)
4297 {
4298 gassign *rstmt = NULL;
4299
4300 if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) == 1)
4301 {
4302 stmt_vec_info child_stmt_info;
4303 int j;
4304
4305 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (node), j, child_stmt_info)
4306 {
4307 tree vect_lhs = gimple_get_lhs (child_stmt_info->stmt);
4308 tree root_lhs = gimple_get_lhs (instance->root_stmt->stmt);
4309 rstmt = gimple_build_assign (root_lhs, vect_lhs);
4310 break;
4311 }
4312 }
4313 else if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) > 1)
4314 {
4315 int nelts = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
4316 stmt_vec_info child_stmt_info;
4317 int j;
4318 vec<constructor_elt, va_gc> *v;
4319 vec_alloc (v, nelts);
4320
4321 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (node), j, child_stmt_info)
4322 {
4323 CONSTRUCTOR_APPEND_ELT (v,
4324 NULL_TREE,
4325 gimple_get_lhs (child_stmt_info->stmt));
4326 }
4327 tree lhs = gimple_get_lhs (instance->root_stmt->stmt);
4328 tree rtype = TREE_TYPE (gimple_assign_rhs1 (instance->root_stmt->stmt));
4329 tree r_constructor = build_constructor (rtype, v);
4330 rstmt = gimple_build_assign (lhs, r_constructor);
4331 }
4332
4333 gcc_assert (rstmt);
4334
4335 gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmt->stmt);
4336 gsi_replace (&rgsi, rstmt, true);
4337 }
4338
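/* For example (illustrative), for a root constructor
   x_1 = { a[0], a[1], a[2], a[3] } vectorized with two half-width
   vector defs v0 and v1, the root stmt is replaced by
   x_1 = { v0, v1 }; with a single vector def it degenerates to
   x_1 = v0.  */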
4339 /* Generate vector code for all SLP instances in the loop/basic block. */
4340
4341 void
4342 vect_schedule_slp (vec_info *vinfo)
4343 {
4344 vec<slp_instance> slp_instances;
4345 slp_instance instance;
4346 unsigned int i;
4347
4348 slp_instances = vinfo->slp_instances;
4349 FOR_EACH_VEC_ELT (slp_instances, i, instance)
4350 {
4351 slp_tree node = SLP_INSTANCE_TREE (instance);
4352 /* Schedule the tree of INSTANCE. */
4353 vect_schedule_slp_instance (node, instance);
4354
4355 if (SLP_INSTANCE_ROOT_STMT (instance))
4356 vectorize_slp_instance_root_stmt (node, instance);
4357
4358 if (dump_enabled_p ())
4359 dump_printf_loc (MSG_NOTE, vect_location,
4360 "vectorizing stmts using SLP.\n");
4361 }
4362
4363 FOR_EACH_VEC_ELT (slp_instances, i, instance)
4364 {
4365 slp_tree root = SLP_INSTANCE_TREE (instance);
4366 stmt_vec_info store_info;
4367 unsigned int j;
4368
4369 /* Remove scalar call stmts. Do not do this for basic-block
4370 vectorization as not all uses may be vectorized.
4371 ??? Why should this be necessary? DCE should be able to
4372 remove the stmts itself.
4373 ??? For BB vectorization we can as well remove scalar
4374 stmts starting from the SLP tree root if they have no
4375 uses. */
4376 if (is_a <loop_vec_info> (vinfo))
4377 vect_remove_slp_scalar_calls (root);
4378
4379 for (j = 0; SLP_TREE_SCALAR_STMTS (root).iterate (j, &store_info)
4380 && j < SLP_INSTANCE_GROUP_SIZE (instance); j++)
4381 {
4382 if (!STMT_VINFO_DATA_REF (store_info))
4383 break;
4384
4385 if (SLP_INSTANCE_ROOT_STMT (instance))
4386 continue;
4387
4388 store_info = vect_orig_stmt (store_info);
4389 /* Free the attached stmt_vec_info and remove the stmt. */
4390 vinfo->remove_stmt (store_info);
4391 }
4392 }
4393 }