/* SLP - Basic Block Vectorization
   Copyright (C) 2007-2019 Free Software Foundation, Inc.
   Contributed by Dorit Naishlos <dorit@il.ibm.com>
   and Ira Rosen <irar@il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */
21
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "tree-pass.h"
31 #include "ssa.h"
32 #include "optabs-tree.h"
33 #include "insn-config.h"
34 #include "recog.h" /* FIXME: for insn_data */
35 #include "params.h"
36 #include "fold-const.h"
37 #include "stor-layout.h"
38 #include "gimple-iterator.h"
39 #include "cfgloop.h"
40 #include "tree-vectorizer.h"
41 #include "langhooks.h"
42 #include "gimple-walk.h"
43 #include "dbgcnt.h"
44 #include "tree-vector-builder.h"
45 #include "vec-perm-indices.h"
46 #include "gimple-fold.h"
47 #include "internal-fn.h"
48
49
/* Recursively free the memory allocated for the SLP tree rooted at NODE.
   FINAL_P is true if we have vectorized the instance or if we have
   made a final decision not to vectorize the statements in any way.  */

static void
vect_free_slp_tree (slp_tree node, bool final_p)
{
  int i;
  slp_tree child;

  if (--node->refcnt != 0)
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_free_slp_tree (child, final_p);

  /* Don't update STMT_VINFO_NUM_SLP_USES if it isn't relevant.
     Some statements might no longer exist, after having been
     removed by vect_transform_stmt.  Updating the remaining
     statements would be redundant.  */
  if (!final_p)
    {
      stmt_vec_info stmt_info;
      FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
	{
	  gcc_assert (STMT_VINFO_NUM_SLP_USES (stmt_info) > 0);
	  STMT_VINFO_NUM_SLP_USES (stmt_info)--;
	}
    }

  SLP_TREE_CHILDREN (node).release ();
  SLP_TREE_SCALAR_STMTS (node).release ();
  SLP_TREE_VEC_STMTS (node).release ();
  SLP_TREE_LOAD_PERMUTATION (node).release ();

  free (node);
}

/* Free the memory allocated for the SLP instance.  FINAL_P is true if we
   have vectorized the instance or if we have made a final decision not
   to vectorize the statements in any way.  */

void
vect_free_slp_instance (slp_instance instance, bool final_p)
{
  vect_free_slp_tree (SLP_INSTANCE_TREE (instance), final_p);
  SLP_INSTANCE_LOADS (instance).release ();
  free (instance);
}


/* Create an SLP node for SCALAR_STMTS.  */

static slp_tree
vect_create_new_slp_node (vec<stmt_vec_info> scalar_stmts)
{
  slp_tree node;
  stmt_vec_info stmt_info = scalar_stmts[0];
  unsigned int nops;

  if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
    nops = gimple_call_num_args (stmt);
  else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
    {
      nops = gimple_num_ops (stmt) - 1;
      if (gimple_assign_rhs_code (stmt) == COND_EXPR)
	nops++;
    }
  else if (is_a <gphi *> (stmt_info->stmt))
    nops = 0;
  else
    return NULL;

  node = XNEW (struct _slp_tree);
  SLP_TREE_SCALAR_STMTS (node) = scalar_stmts;
  SLP_TREE_VEC_STMTS (node).create (0);
  SLP_TREE_NUMBER_OF_VEC_STMTS (node) = 0;
  SLP_TREE_CHILDREN (node).create (nops);
  SLP_TREE_LOAD_PERMUTATION (node) = vNULL;
  SLP_TREE_TWO_OPERATORS (node) = false;
  SLP_TREE_DEF_TYPE (node) = vect_internal_def;
  node->refcnt = 1;

  unsigned i;
  FOR_EACH_VEC_ELT (scalar_stmts, i, stmt_info)
    STMT_VINFO_NUM_SLP_USES (stmt_info)++;

  return node;
}
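
/* Worked example for the operand count above (illustrative editor's
   note, not part of the original source): for a gassign like

     x_1 = a_2 < b_3 ? c_4 : d_5;

   gimple_num_ops is 4 (the lhs plus three rhs operands), so nops starts
   at 3.  Because the rhs code is COND_EXPR, the condition carries two
   comparison operands (a_2 and b_3) of its own, so nops is bumped to 4:
   the two comparison operands plus the two arms c_4 and d_5.  This is
   the same count that vect_get_and_check_slp_defs walks below.  */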


/* This structure is used in creation of an SLP tree.  Each instance
   corresponds to the same operand in a group of scalar stmts in an SLP
   node.  */
typedef struct _slp_oprnd_info
{
  /* Def-stmts for the operands.  */
  vec<stmt_vec_info> def_stmts;
  /* Information about the first statement: the type of its operand,
     its vector def-type, and whether it (and the second statement) is
     a pattern stmt.  */
  tree first_op_type;
  enum vect_def_type first_dt;
  bool first_pattern;
  bool second_pattern;
} *slp_oprnd_info;


/* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
   operand.  */
static vec<slp_oprnd_info>
vect_create_oprnd_info (int nops, int group_size)
{
  int i;
  slp_oprnd_info oprnd_info;
  vec<slp_oprnd_info> oprnds_info;

  oprnds_info.create (nops);
  for (i = 0; i < nops; i++)
    {
      oprnd_info = XNEW (struct _slp_oprnd_info);
      oprnd_info->def_stmts.create (group_size);
      oprnd_info->first_dt = vect_uninitialized_def;
      oprnd_info->first_op_type = NULL_TREE;
      oprnd_info->first_pattern = false;
      oprnd_info->second_pattern = false;
      oprnds_info.quick_push (oprnd_info);
    }

  return oprnds_info;
}


/* Free operands info.  */

static void
vect_free_oprnd_info (vec<slp_oprnd_info> &oprnds_info)
{
  int i;
  slp_oprnd_info oprnd_info;

  FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
    {
      oprnd_info->def_stmts.release ();
      XDELETE (oprnd_info);
    }

  oprnds_info.release ();
}


/* Find the place of the data-ref in STMT_INFO in the interleaving chain
   that starts from FIRST_STMT_INFO.  Return -1 if the data-ref is not a part
   of the chain.  */

int
vect_get_place_in_interleaving_chain (stmt_vec_info stmt_info,
				      stmt_vec_info first_stmt_info)
{
  stmt_vec_info next_stmt_info = first_stmt_info;
  int result = 0;

  if (first_stmt_info != DR_GROUP_FIRST_ELEMENT (stmt_info))
    return -1;

  do
    {
      if (next_stmt_info == stmt_info)
	return result;
      next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
      if (next_stmt_info)
	result += DR_GROUP_GAP (next_stmt_info);
    }
  while (next_stmt_info);

  return -1;
}
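
/* Worked example (illustrative editor's note, not part of the original
   source): for an interleaved load group over a[0], a[1], a[3], the
   non-leading elements have DR_GROUP_GAP 1 and 2 (each gap is the
   distance from the previous element).  The place of an element is the
   running sum of the gaps up to and including it, so the three loads
   get the places 0, 1 and 3.  */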

/* Check whether it is possible to load COUNT elements of type ELT_MODE
   using the method implemented by duplicate_and_interleave.  Return true
   if so, returning the number of intermediate vectors in *NVECTORS_OUT
   (if nonnull) and the type of each intermediate vector in *VECTOR_TYPE_OUT
   (if nonnull).  */

bool
can_duplicate_and_interleave_p (unsigned int count, machine_mode elt_mode,
				unsigned int *nvectors_out,
				tree *vector_type_out,
				tree *permutes)
{
  poly_int64 elt_bytes = count * GET_MODE_SIZE (elt_mode);
  poly_int64 nelts;
  unsigned int nvectors = 1;
  for (;;)
    {
      scalar_int_mode int_mode;
      poly_int64 elt_bits = elt_bytes * BITS_PER_UNIT;
      if (multiple_p (current_vector_size, elt_bytes, &nelts)
	  && int_mode_for_size (elt_bits, 0).exists (&int_mode))
	{
	  tree int_type = build_nonstandard_integer_type
	    (GET_MODE_BITSIZE (int_mode), 1);
	  tree vector_type = build_vector_type (int_type, nelts);
	  if (VECTOR_MODE_P (TYPE_MODE (vector_type)))
	    {
	      vec_perm_builder sel1 (nelts, 2, 3);
	      vec_perm_builder sel2 (nelts, 2, 3);
	      poly_int64 half_nelts = exact_div (nelts, 2);
	      for (unsigned int i = 0; i < 3; ++i)
		{
		  sel1.quick_push (i);
		  sel1.quick_push (i + nelts);
		  sel2.quick_push (half_nelts + i);
		  sel2.quick_push (half_nelts + i + nelts);
		}
	      vec_perm_indices indices1 (sel1, 2, nelts);
	      vec_perm_indices indices2 (sel2, 2, nelts);
	      if (can_vec_perm_const_p (TYPE_MODE (vector_type), indices1)
		  && can_vec_perm_const_p (TYPE_MODE (vector_type), indices2))
		{
		  if (nvectors_out)
		    *nvectors_out = nvectors;
		  if (vector_type_out)
		    *vector_type_out = vector_type;
		  if (permutes)
		    {
		      permutes[0] = vect_gen_perm_mask_checked (vector_type,
								indices1);
		      permutes[1] = vect_gen_perm_mask_checked (vector_type,
								indices2);
		    }
		  return true;
		}
	    }
	}
      if (!multiple_p (elt_bytes, 2, &elt_bytes))
	return false;
      nvectors *= 2;
    }
}
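
/* Worked example of the two permute masks built above (illustrative
   editor's note, not part of the original source): with NELTS == 4 the
   encoded selectors expand to

     indices1 = { 0, 4, 1, 5 }    interleave of the low halves
     indices2 = { 2, 6, 3, 7 }    interleave of the high halves

   i.e. the classic interleave-lo/interleave-hi pair on two input
   vectors.  When the element size cannot be handled directly, the loop
   halves ELT_BYTES and doubles NVECTORS, retrying with elements half
   as wide.  */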

/* Get the defs for the rhs of STMT (collect them in OPRNDS_INFO), check that
   they are of a valid type and that they match the defs of the first stmt of
   the SLP group (stored in OPRNDS_INFO).  This function tries to match stmts
   by swapping operands of STMTS[STMT_NUM] when possible.  A nonzero *SWAP
   on entry indicates that a swap is required for cond_expr stmts:
   *SWAP is 1 if STMT is a cond and the operands of its comparison need
   to be swapped; *SWAP is 2 if STMT is a cond and the comparison code
   needs to be inverted.  If this function swaps any operands, *SWAP is
   set to a nonzero value on return.
   Return -1 on a fatal error; return 1 if the error could be corrected
   by swapping operands of this node's parent; return 0 if everything
   is OK.  */
static int
vect_get_and_check_slp_defs (vec_info *vinfo, unsigned char *swap,
			     vec<stmt_vec_info> stmts, unsigned stmt_num,
			     vec<slp_oprnd_info> *oprnds_info)
{
  stmt_vec_info stmt_info = stmts[stmt_num];
  tree oprnd;
  unsigned int i, number_of_oprnds;
  enum vect_def_type dt = vect_uninitialized_def;
  bool pattern = false;
  slp_oprnd_info oprnd_info;
  int first_op_idx = 1;
  unsigned int commutative_op = -1U;
  bool first_op_cond = false;
  bool first = stmt_num == 0;
  bool second = stmt_num == 1;

  if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
    {
      number_of_oprnds = gimple_call_num_args (stmt);
      first_op_idx = 3;
      if (gimple_call_internal_p (stmt))
	{
	  internal_fn ifn = gimple_call_internal_fn (stmt);
	  commutative_op = first_commutative_argument (ifn);

	  /* Masked load, only look at mask.  */
	  if (ifn == IFN_MASK_LOAD)
	    {
	      number_of_oprnds = 1;
	      /* Mask operand index.  */
	      first_op_idx = 5;
	    }
	}
    }
  else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
    {
      enum tree_code code = gimple_assign_rhs_code (stmt);
      number_of_oprnds = gimple_num_ops (stmt) - 1;
      /* Swapping can only be done for cond_expr if asked to, otherwise we
	 could end up with a different comparison code from that of the
	 first stmt.  */
      if (code == COND_EXPR
	  && COMPARISON_CLASS_P (gimple_assign_rhs1 (stmt)))
	{
	  first_op_cond = true;
	  number_of_oprnds++;
	}
      else
	commutative_op = commutative_tree_code (code) ? 0U : -1U;
    }
  else
    return -1;

  bool swapped = (*swap != 0);
  gcc_assert (!swapped || first_op_cond);
  for (i = 0; i < number_of_oprnds; i++)
    {
again:
      if (first_op_cond)
	{
	  /* Map indicating how operands of cond_expr should be swapped.  */
	  int maps[3][4] = {{0, 1, 2, 3}, {1, 0, 2, 3}, {0, 1, 3, 2}};
	  int *map = maps[*swap];

	  if (i < 2)
	    oprnd = TREE_OPERAND (gimple_op (stmt_info->stmt,
					     first_op_idx), map[i]);
	  else
	    oprnd = gimple_op (stmt_info->stmt, map[i]);
	}
      else
	oprnd = gimple_op (stmt_info->stmt, first_op_idx + (swapped ? !i : i));
      if (TREE_CODE (oprnd) == VIEW_CONVERT_EXPR)
	oprnd = TREE_OPERAND (oprnd, 0);

      oprnd_info = (*oprnds_info)[i];

      stmt_vec_info def_stmt_info;
      if (!vect_is_simple_use (oprnd, vinfo, &dt, &def_stmt_info))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "Build SLP failed: can't analyze def for %T\n",
			     oprnd);

	  return -1;
	}

      if (second)
	oprnd_info->second_pattern = pattern;

      if (first)
	{
	  oprnd_info->first_dt = dt;
	  oprnd_info->first_pattern = pattern;
	  oprnd_info->first_op_type = TREE_TYPE (oprnd);
	}
      else
	{
	  /* Not first stmt of the group, check that the def-stmt/s match
	     the def-stmt/s of the first stmt.  Allow different definition
	     types for reduction chains: the first stmt must be a
	     vect_reduction_def (a phi node), and the rest
	     vect_internal_def.  */
	  tree type = TREE_TYPE (oprnd);
	  if ((oprnd_info->first_dt != dt
	       && !(oprnd_info->first_dt == vect_reduction_def
		    && dt == vect_internal_def)
	       && !((oprnd_info->first_dt == vect_external_def
		     || oprnd_info->first_dt == vect_constant_def)
		    && (dt == vect_external_def
			|| dt == vect_constant_def)))
	      || !types_compatible_p (oprnd_info->first_op_type, type))
	    {
	      /* Try swapping operands if we got a mismatch.  */
	      if (i == commutative_op && !swapped)
		{
		  swapped = true;
		  goto again;
		}

	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: different types\n");

	      return 1;
	    }
	  if ((dt == vect_constant_def
	       || dt == vect_external_def)
	      && !current_vector_size.is_constant ()
	      && (TREE_CODE (type) == BOOLEAN_TYPE
		  || !can_duplicate_and_interleave_p (stmts.length (),
						      TYPE_MODE (type))))
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: invalid type of def "
				 "for variable-length SLP %T\n", oprnd);
	      return -1;
	    }
	}

      /* Check the types of the definitions.  */
      switch (dt)
	{
	case vect_constant_def:
	case vect_external_def:
	  break;

	case vect_reduction_def:
	case vect_induction_def:
	case vect_internal_def:
	  oprnd_info->def_stmts.quick_push (def_stmt_info);
	  break;

	default:
	  /* FORNOW: Not supported.  */
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "Build SLP failed: illegal type of def %T\n",
			     oprnd);

	  return -1;
	}
    }

  /* Swap operands.  */
  if (swapped)
    {
      /* If there are already uses of this stmt in an SLP instance then
	 we've committed to the operand order and can't swap it.  */
      if (STMT_VINFO_NUM_SLP_USES (stmt_info) != 0)
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "Build SLP failed: cannot swap operands of "
			     "shared stmt %G", stmt_info->stmt);
	  return -1;
	}

      if (first_op_cond)
	{
	  gassign *stmt = as_a <gassign *> (stmt_info->stmt);
	  tree cond = gimple_assign_rhs1 (stmt);
	  enum tree_code code = TREE_CODE (cond);

	  /* Swap.  */
	  if (*swap == 1)
	    {
	      swap_ssa_operands (stmt, &TREE_OPERAND (cond, 0),
				 &TREE_OPERAND (cond, 1));
	      TREE_SET_CODE (cond, swap_tree_comparison (code));
	    }
	  /* Invert.  */
	  else
	    {
	      swap_ssa_operands (stmt, gimple_assign_rhs2_ptr (stmt),
				 gimple_assign_rhs3_ptr (stmt));
	      bool honor_nans = HONOR_NANS (TREE_OPERAND (cond, 0));
	      code = invert_tree_comparison (TREE_CODE (cond), honor_nans);
	      gcc_assert (code != ERROR_MARK);
	      TREE_SET_CODE (cond, code);
	    }
	}
      else
	{
	  unsigned int op = commutative_op + first_op_idx;
	  swap_ssa_operands (stmt_info->stmt,
			     gimple_op_ptr (stmt_info->stmt, op),
			     gimple_op_ptr (stmt_info->stmt, op + 1));
	}
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "swapped operands to match def types in %G",
			 stmt_info->stmt);
    }

  *swap = swapped;
  return 0;
}

/* Return true if call statements CALL1 and CALL2 are similar enough
   to be combined into the same SLP group.  */

static bool
compatible_calls_p (gcall *call1, gcall *call2)
{
  unsigned int nargs = gimple_call_num_args (call1);
  if (nargs != gimple_call_num_args (call2))
    return false;

  if (gimple_call_combined_fn (call1) != gimple_call_combined_fn (call2))
    return false;

  if (gimple_call_internal_p (call1))
    {
      if (!types_compatible_p (TREE_TYPE (gimple_call_lhs (call1)),
			       TREE_TYPE (gimple_call_lhs (call2))))
	return false;
      for (unsigned int i = 0; i < nargs; ++i)
	if (!types_compatible_p (TREE_TYPE (gimple_call_arg (call1, i)),
				 TREE_TYPE (gimple_call_arg (call2, i))))
	  return false;
    }
  else
    {
      if (!operand_equal_p (gimple_call_fn (call1),
			    gimple_call_fn (call2), 0))
	return false;

      if (gimple_call_fntype (call1) != gimple_call_fntype (call2))
	return false;
    }
  return true;
}

/* A subroutine of vect_build_slp_tree for checking VECTYPE, which is the
   caller's attempt to find the vector type in STMT_INFO with the narrowest
   element type.  Return true if VECTYPE is nonnull and if it is valid
   for STMT_INFO.  When returning true, update MAX_NUNITS to reflect the
   number of units in VECTYPE.  GROUP_SIZE and MAX_NUNITS are as for
   vect_build_slp_tree.  */

static bool
vect_record_max_nunits (stmt_vec_info stmt_info, unsigned int group_size,
			tree vectype, poly_uint64 *max_nunits)
{
  if (!vectype)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "Build SLP failed: unsupported data-type in %G\n",
			 stmt_info->stmt);
      /* Fatal mismatch.  */
      return false;
    }

  /* If populating the vector type requires unrolling then fail
     before adjusting *max_nunits for basic-block vectorization.  */
  poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
  unsigned HOST_WIDE_INT const_nunits;
  if (STMT_VINFO_BB_VINFO (stmt_info)
      && (!nunits.is_constant (&const_nunits)
	  || const_nunits > group_size))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "Build SLP failed: unrolling required "
			 "in basic block SLP\n");
      /* Fatal mismatch.  */
      return false;
    }

  /* In case of multiple types we need to detect the smallest type.  */
  vect_update_max_nunits (max_nunits, vectype);
  return true;
}

/* STMTS is a group of GROUP_SIZE SLP statements in which some
   statements do the same operation as the first statement and in which
   the others do ALT_STMT_CODE.  Return true if we can take one vector
   of the first operation and one vector of the second and permute them
   to get the required result.  VECTYPE is the type of the vector that
   would be permuted.  */

static bool
vect_two_operations_perm_ok_p (vec<stmt_vec_info> stmts,
			       unsigned int group_size, tree vectype,
			       tree_code alt_stmt_code)
{
  unsigned HOST_WIDE_INT count;
  if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&count))
    return false;

  vec_perm_builder sel (count, count, 1);
  for (unsigned int i = 0; i < count; ++i)
    {
      unsigned int elt = i;
      gassign *stmt = as_a <gassign *> (stmts[i % group_size]->stmt);
      if (gimple_assign_rhs_code (stmt) == alt_stmt_code)
	elt += count;
      sel.quick_push (elt);
    }
  vec_perm_indices indices (sel, 2, count);
  return can_vec_perm_const_p (TYPE_MODE (vectype), indices);
}
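
/* Worked example of the selector built above (illustrative editor's
   note, not part of the original source): for a group of two stmts
   { PLUS_EXPR, MINUS_EXPR } and a 4-lane vector, lane I is taken from
   the first (all-PLUS) vector when stmts[I % 2] is the first operation
   and from lane I + 4 of the second (all-MINUS) vector otherwise:

     sel = { 0, 5, 2, 7 }

   which is exactly the blend an addsub-style instruction performs.  */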

/* Verify that the scalar stmts STMTS are isomorphic, do not require a
   data permutation, and are of supported types of operation.  Return
   true if they are; otherwise return false and indicate in *MATCHES
   which stmts are not isomorphic to the first one.  If MATCHES[0] is
   false then this indicates that the comparison could not be carried
   out or that the stmts will never be vectorized by SLP.

   Note COND_EXPR is possibly isomorphic to another one after swapping its
   operands.  Set SWAP[i] to 1 if stmt I is a COND_EXPR and isomorphic to
   the first stmt by swapping the two operands of its comparison; set
   SWAP[i] to 2 if stmt I is isomorphic to the first stmt by inverting the
   code of the comparison.  Taking A1 >= B1 ? X1 : Y1 as an example, it
   can be swapped to B1 <= A1 ? X1 : Y1, or inverted to A1 < B1 ? Y1 : X1.  */

static bool
vect_build_slp_tree_1 (unsigned char *swap,
		       vec<stmt_vec_info> stmts, unsigned int group_size,
		       poly_uint64 *max_nunits, bool *matches,
		       bool *two_operators)
{
  unsigned int i;
  stmt_vec_info first_stmt_info = stmts[0];
  enum tree_code first_stmt_code = ERROR_MARK;
  enum tree_code alt_stmt_code = ERROR_MARK;
  enum tree_code rhs_code = ERROR_MARK;
  enum tree_code first_cond_code = ERROR_MARK;
  tree lhs;
  bool need_same_oprnds = false;
  tree vectype = NULL_TREE, first_op1 = NULL_TREE;
  optab optab;
  int icode;
  machine_mode optab_op2_mode;
  machine_mode vec_mode;
  stmt_vec_info first_load = NULL, prev_first_load = NULL;
  bool load_p = false;

  /* For every stmt in NODE find its def stmt/s.  */
  stmt_vec_info stmt_info;
  FOR_EACH_VEC_ELT (stmts, i, stmt_info)
    {
      gimple *stmt = stmt_info->stmt;
      swap[i] = 0;
      matches[i] = false;

      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location, "Build SLP for %G", stmt);

      /* Fail to vectorize statements marked as unvectorizable.  */
      if (!STMT_VINFO_VECTORIZABLE (stmt_info))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "Build SLP failed: unvectorizable statement %G",
			     stmt);
	  /* Fatal mismatch.  */
	  matches[0] = false;
	  return false;
	}

      lhs = gimple_get_lhs (stmt);
      if (lhs == NULL_TREE)
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "Build SLP failed: not GIMPLE_ASSIGN nor "
			     "GIMPLE_CALL %G", stmt);
	  /* Fatal mismatch.  */
	  matches[0] = false;
	  return false;
	}

      tree nunits_vectype;
      if (!vect_get_vector_types_for_stmt (stmt_info, &vectype,
					   &nunits_vectype)
	  || (nunits_vectype
	      && !vect_record_max_nunits (stmt_info, group_size,
					  nunits_vectype, max_nunits)))
	{
	  /* Fatal mismatch.  */
	  matches[0] = false;
	  return false;
	}

      gcc_assert (vectype);

      if (gcall *call_stmt = dyn_cast <gcall *> (stmt))
	{
	  rhs_code = CALL_EXPR;

	  if (gimple_call_internal_p (stmt, IFN_MASK_LOAD))
	    load_p = true;
	  else if ((gimple_call_internal_p (call_stmt)
		    && (!vectorizable_internal_fn_p
			  (gimple_call_internal_fn (call_stmt))))
		   || gimple_call_tail_p (call_stmt)
		   || gimple_call_noreturn_p (call_stmt)
		   || !gimple_call_nothrow_p (call_stmt)
		   || gimple_call_chain (call_stmt))
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: unsupported call type %G",
				 call_stmt);
	      /* Fatal mismatch.  */
	      matches[0] = false;
	      return false;
	    }
	}
      else
	{
	  rhs_code = gimple_assign_rhs_code (stmt);
	  load_p = TREE_CODE_CLASS (rhs_code) == tcc_reference;
	}

      /* Check the operation.  */
      if (i == 0)
	{
	  first_stmt_code = rhs_code;

	  /* Shift arguments should be equal in all the packed stmts for a
	     vector shift with scalar shift operand.  */
	  if (rhs_code == LSHIFT_EXPR || rhs_code == RSHIFT_EXPR
	      || rhs_code == LROTATE_EXPR
	      || rhs_code == RROTATE_EXPR)
	    {
	      if (vectype == boolean_type_node)
		{
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				     "Build SLP failed: shift of a"
				     " boolean.\n");
		  /* Fatal mismatch.  */
		  matches[0] = false;
		  return false;
		}

	      vec_mode = TYPE_MODE (vectype);

	      /* First see if we have a vector/vector shift.  */
	      optab = optab_for_tree_code (rhs_code, vectype,
					   optab_vector);

	      if (!optab
		  || optab_handler (optab, vec_mode) == CODE_FOR_nothing)
		{
		  /* No vector/vector shift, try for a vector/scalar shift.  */
		  optab = optab_for_tree_code (rhs_code, vectype,
					       optab_scalar);

		  if (!optab)
		    {
		      if (dump_enabled_p ())
			dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
					 "Build SLP failed: no optab.\n");
		      /* Fatal mismatch.  */
		      matches[0] = false;
		      return false;
		    }
		  icode = (int) optab_handler (optab, vec_mode);
		  if (icode == CODE_FOR_nothing)
		    {
		      if (dump_enabled_p ())
			dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
					 "Build SLP failed: "
					 "op not supported by target.\n");
		      /* Fatal mismatch.  */
		      matches[0] = false;
		      return false;
		    }
		  optab_op2_mode = insn_data[icode].operand[2].mode;
		  if (!VECTOR_MODE_P (optab_op2_mode))
		    {
		      need_same_oprnds = true;
		      first_op1 = gimple_assign_rhs2 (stmt);
		    }
		}
	    }
	  else if (rhs_code == WIDEN_LSHIFT_EXPR)
	    {
	      need_same_oprnds = true;
	      first_op1 = gimple_assign_rhs2 (stmt);
	    }
	}
      else
	{
	  if (first_stmt_code != rhs_code
	      && alt_stmt_code == ERROR_MARK)
	    alt_stmt_code = rhs_code;
	  if (first_stmt_code != rhs_code
	      && (first_stmt_code != IMAGPART_EXPR
		  || rhs_code != REALPART_EXPR)
	      && (first_stmt_code != REALPART_EXPR
		  || rhs_code != IMAGPART_EXPR)
	      /* Handle mismatches in plus/minus by computing both
		 and merging the results.  */
	      && !((first_stmt_code == PLUS_EXPR
		    || first_stmt_code == MINUS_EXPR)
		   && (alt_stmt_code == PLUS_EXPR
		       || alt_stmt_code == MINUS_EXPR)
		   && rhs_code == alt_stmt_code)
	      && !(STMT_VINFO_GROUPED_ACCESS (stmt_info)
		   && (first_stmt_code == ARRAY_REF
		       || first_stmt_code == BIT_FIELD_REF
		       || first_stmt_code == INDIRECT_REF
		       || first_stmt_code == COMPONENT_REF
		       || first_stmt_code == MEM_REF)))
	    {
	      if (dump_enabled_p ())
		{
		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				   "Build SLP failed: different operation "
				   "in stmt %G", stmt);
		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				   "original stmt %G", first_stmt_info->stmt);
		}
	      /* Mismatch.  */
	      continue;
	    }

	  if (need_same_oprnds
	      && !operand_equal_p (first_op1, gimple_assign_rhs2 (stmt), 0))
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: different shift "
				 "arguments in %G", stmt);
	      /* Mismatch.  */
	      continue;
	    }

	  if (rhs_code == CALL_EXPR)
	    {
	      if (!compatible_calls_p (as_a <gcall *> (stmts[0]->stmt),
				       as_a <gcall *> (stmt)))
		{
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				     "Build SLP failed: different calls in %G",
				     stmt);
		  /* Mismatch.  */
		  continue;
		}
	    }
	}

      /* Grouped store or load.  */
      if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
	{
	  if (REFERENCE_CLASS_P (lhs))
	    {
	      /* Store.  */
	      ;
	    }
	  else
	    {
	      /* Load.  */
	      first_load = DR_GROUP_FIRST_ELEMENT (stmt_info);
	      if (prev_first_load)
		{
		  /* Check that there are no loads from different interleaving
		     chains in the same node.  */
		  if (prev_first_load != first_load)
		    {
		      if (dump_enabled_p ())
			dump_printf_loc (MSG_MISSED_OPTIMIZATION,
					 vect_location,
					 "Build SLP failed: different "
					 "interleaving chains in one node %G",
					 stmt);
		      /* Mismatch.  */
		      continue;
		    }
		}
	      else
		prev_first_load = first_load;
	    }
	} /* Grouped access.  */
      else
	{
	  if (load_p)
	    {
	      /* Non-grouped load.  */
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: not grouped load %G", stmt);

	      /* FORNOW: Non-grouped loads are not supported.  */
	      /* Fatal mismatch.  */
	      matches[0] = false;
	      return false;
	    }

	  /* Not a memory operation.  */
	  if (TREE_CODE_CLASS (rhs_code) != tcc_binary
	      && TREE_CODE_CLASS (rhs_code) != tcc_unary
	      && TREE_CODE_CLASS (rhs_code) != tcc_expression
	      && TREE_CODE_CLASS (rhs_code) != tcc_comparison
	      && rhs_code != VIEW_CONVERT_EXPR
	      && rhs_code != CALL_EXPR)
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: operation unsupported %G",
				 stmt);
	      /* Fatal mismatch.  */
	      matches[0] = false;
	      return false;
	    }

	  if (rhs_code == COND_EXPR)
	    {
	      tree cond_expr = gimple_assign_rhs1 (stmt);
	      enum tree_code cond_code = TREE_CODE (cond_expr);
	      enum tree_code swap_code = ERROR_MARK;
	      enum tree_code invert_code = ERROR_MARK;

	      if (i == 0)
		first_cond_code = TREE_CODE (cond_expr);
	      else if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
		{
		  bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
		  swap_code = swap_tree_comparison (cond_code);
		  invert_code = invert_tree_comparison (cond_code, honor_nans);
		}

	      if (first_cond_code == cond_code)
		;
	      /* Isomorphic can be achieved by swapping.  */
	      else if (first_cond_code == swap_code)
		swap[i] = 1;
	      /* Isomorphic can be achieved by inverting.  */
	      else if (first_cond_code == invert_code)
		swap[i] = 2;
	      else
		{
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				     "Build SLP failed: different"
				     " operation %G", stmt);
		  /* Mismatch.  */
		  continue;
		}
	    }
	}

      matches[i] = true;
    }

  for (i = 0; i < group_size; ++i)
    if (!matches[i])
      return false;

  /* If we allowed a two-operation SLP node verify the target can cope
     with the permute we are going to use.  */
  if (alt_stmt_code != ERROR_MARK
      && TREE_CODE_CLASS (alt_stmt_code) != tcc_reference)
    {
      if (vectype == boolean_type_node
	  || !vect_two_operations_perm_ok_p (stmts, group_size,
					     vectype, alt_stmt_code))
	{
	  for (i = 0; i < group_size; ++i)
	    if (gimple_assign_rhs_code (stmts[i]->stmt) == alt_stmt_code)
	      {
		matches[i] = false;
		if (dump_enabled_p ())
		  {
		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				     "Build SLP failed: different operation "
				     "in stmt %G", stmts[i]->stmt);
		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				     "original stmt %G", first_stmt_info->stmt);
		  }
	      }
	  return false;
	}
      *two_operators = true;
    }

  return true;
}

/* Traits for the hash_set to record failed SLP builds for a stmt set.
   Note we never remove apart from at destruction time so we do not
   need a special value for deleted that differs from empty.  */
struct bst_traits
{
  typedef vec <stmt_vec_info> value_type;
  typedef vec <stmt_vec_info> compare_type;
  static inline hashval_t hash (value_type);
  static inline bool equal (value_type existing, value_type candidate);
  static inline bool is_empty (value_type x) { return !x.exists (); }
  static inline bool is_deleted (value_type x) { return !x.exists (); }
  static inline void mark_empty (value_type &x) { x.release (); }
  static inline void mark_deleted (value_type &x) { x.release (); }
  static inline void remove (value_type &x) { x.release (); }
};
inline hashval_t
bst_traits::hash (value_type x)
{
  inchash::hash h;
  for (unsigned i = 0; i < x.length (); ++i)
    h.add_int (gimple_uid (x[i]->stmt));
  return h.end ();
}
inline bool
bst_traits::equal (value_type existing, value_type candidate)
{
  if (existing.length () != candidate.length ())
    return false;
  for (unsigned i = 0; i < existing.length (); ++i)
    if (existing[i] != candidate[i])
      return false;
  return true;
}

typedef hash_map <vec <gimple *>, slp_tree,
		  simple_hashmap_traits <bst_traits, slp_tree> >
  scalar_stmts_to_slp_tree_map_t;

static slp_tree
vect_build_slp_tree_2 (vec_info *vinfo,
		       vec<stmt_vec_info> stmts, unsigned int group_size,
		       poly_uint64 *max_nunits,
		       bool *matches, unsigned *npermutes, unsigned *tree_size,
		       scalar_stmts_to_slp_tree_map_t *bst_map);

static slp_tree
vect_build_slp_tree (vec_info *vinfo,
		     vec<stmt_vec_info> stmts, unsigned int group_size,
		     poly_uint64 *max_nunits,
		     bool *matches, unsigned *npermutes, unsigned *tree_size,
		     scalar_stmts_to_slp_tree_map_t *bst_map)
{
  if (slp_tree *leader = bst_map->get (stmts))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location, "re-using %sSLP tree %p\n",
			 *leader ? "" : "failed ", *leader);
      if (*leader)
	(*leader)->refcnt++;
      return *leader;
    }
  slp_tree res = vect_build_slp_tree_2 (vinfo, stmts, group_size, max_nunits,
					matches, npermutes, tree_size, bst_map);
  /* Keep a reference for the bst_map use.  */
  if (res)
    res->refcnt++;
  bst_map->put (stmts.copy (), res);
  return res;
}
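
/* Editor's note on the caching contract above (illustrative, not part
   of the original source): the bst_map owns one reference per cached
   tree (the res->refcnt++ before the put), and a cache hit hands the
   caller an extra reference via (*leader)->refcnt++, so both paths
   leave the reference count consistent with vect_free_slp_tree.
   Failed builds are memoized as NULL and returned without re-running
   the build.  */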

/* Recursively build an SLP tree starting from the group of stmts STMTS.
   Fail (and return NULL) if the def-stmts are not isomorphic, require a
   data permutation, or are of unsupported types of operation; in that
   case MATCHES indicates which stmts in the group failed to match the
   first one.  Otherwise return the built SLP (sub-)tree.  */

static slp_tree
vect_build_slp_tree_2 (vec_info *vinfo,
		       vec<stmt_vec_info> stmts, unsigned int group_size,
		       poly_uint64 *max_nunits,
		       bool *matches, unsigned *npermutes, unsigned *tree_size,
		       scalar_stmts_to_slp_tree_map_t *bst_map)
{
  unsigned nops, i, this_tree_size = 0;
  poly_uint64 this_max_nunits = *max_nunits;
  slp_tree node;

  matches[0] = false;

  stmt_vec_info stmt_info = stmts[0];
  if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
    nops = gimple_call_num_args (stmt);
  else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
    {
      nops = gimple_num_ops (stmt) - 1;
      if (gimple_assign_rhs_code (stmt) == COND_EXPR)
	nops++;
    }
  else if (is_a <gphi *> (stmt_info->stmt))
    nops = 0;
  else
    return NULL;

  /* If the SLP node is a PHI (induction or reduction), terminate
     the recursion.  */
  if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
    {
      tree scalar_type = TREE_TYPE (PHI_RESULT (stmt));
      tree vectype = get_vectype_for_scalar_type (scalar_type);
      if (!vect_record_max_nunits (stmt_info, group_size, vectype, max_nunits))
	return NULL;

      vect_def_type def_type = STMT_VINFO_DEF_TYPE (stmt_info);
      /* Induction from different IVs is not supported.  */
      if (def_type == vect_induction_def)
	{
	  stmt_vec_info other_info;
	  FOR_EACH_VEC_ELT (stmts, i, other_info)
	    if (stmt_info != other_info)
	      return NULL;
	}
      else if (def_type == vect_reduction_def
	       || def_type == vect_double_reduction_def
	       || def_type == vect_nested_cycle)
	{
	  /* Else def types have to match.  */
	  stmt_vec_info other_info;
	  FOR_EACH_VEC_ELT (stmts, i, other_info)
	    {
	      /* But for reduction chains only check on the first stmt.  */
	      if (REDUC_GROUP_FIRST_ELEMENT (other_info)
		  && REDUC_GROUP_FIRST_ELEMENT (other_info) != stmt_info)
		continue;
	      if (STMT_VINFO_DEF_TYPE (other_info) != def_type)
		return NULL;
	    }
	}
      else
	return NULL;
      (*tree_size)++;
      node = vect_create_new_slp_node (stmts);
      return node;
    }


  bool two_operators = false;
  unsigned char *swap = XALLOCAVEC (unsigned char, group_size);
  if (!vect_build_slp_tree_1 (swap, stmts, group_size,
			      &this_max_nunits, matches, &two_operators))
    return NULL;

  /* If the SLP node is a load, terminate the recursion unless masked.  */
  if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
      && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
    {
      if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
	{
	  /* Masked load.  */
	  gcc_assert (gimple_call_internal_p (stmt, IFN_MASK_LOAD));
	  nops = 1;
	}
      else
	{
	  *max_nunits = this_max_nunits;
	  (*tree_size)++;
	  node = vect_create_new_slp_node (stmts);
	  return node;
	}
    }

  /* Get at the operands, verifying they are compatible.  */
  vec<slp_oprnd_info> oprnds_info = vect_create_oprnd_info (nops, group_size);
  slp_oprnd_info oprnd_info;
  FOR_EACH_VEC_ELT (stmts, i, stmt_info)
    {
      int res = vect_get_and_check_slp_defs (vinfo, &swap[i],
					     stmts, i, &oprnds_info);
      if (res != 0)
	matches[(res == -1) ? 0 : i] = false;
      if (!matches[0])
	break;
    }
  for (i = 0; i < group_size; ++i)
    if (!matches[i])
      {
	vect_free_oprnd_info (oprnds_info);
	return NULL;
      }

  auto_vec<slp_tree, 4> children;

  stmt_info = stmts[0];

  /* Create SLP_TREE nodes for the definition node/s.  */
  FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
    {
      slp_tree child;
      unsigned old_tree_size = this_tree_size;
      unsigned int j;

      if (oprnd_info->first_dt != vect_internal_def
	  && oprnd_info->first_dt != vect_reduction_def
	  && oprnd_info->first_dt != vect_induction_def)
	continue;

      if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
					group_size, &this_max_nunits,
					matches, npermutes,
					&this_tree_size, bst_map)) != NULL)
	{
	  /* If all of CHILD's children are built up from scalars then just
	     throw them away and build CHILD up from scalars instead.  */
	  if (!SLP_TREE_CHILDREN (child).is_empty ()
	      /* ??? Rejecting patterns this way doesn't work.  We'd have to
		 do extra work to cancel the pattern so the uses see the
		 scalar version.  */
	      && !is_pattern_stmt_p (SLP_TREE_SCALAR_STMTS (child)[0]))
	    {
	      slp_tree grandchild;

	      FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild)
		if (SLP_TREE_DEF_TYPE (grandchild) == vect_internal_def)
		  break;
	      if (!grandchild)
		{
		  /* Roll back.  */
		  this_tree_size = old_tree_size;
		  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild)
		    vect_free_slp_tree (grandchild, false);
		  SLP_TREE_CHILDREN (child).truncate (0);

		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_NOTE, vect_location,
				     "Building parent vector operands from "
				     "scalars instead\n");
		  oprnd_info->def_stmts = vNULL;
		  SLP_TREE_DEF_TYPE (child) = vect_external_def;
		  ++this_tree_size;
		  children.safe_push (child);
		  continue;
		}
	    }

	  oprnd_info->def_stmts = vNULL;
	  children.safe_push (child);
	  continue;
	}

      /* If the SLP build failed fatally and we analyze a basic-block
	 simply treat nodes we fail to build as externally defined
	 (and thus build vectors from the scalar defs).
	 The cost model will reject outright expensive cases.
	 ??? This doesn't treat cases where permutation ultimately
	 fails (or we don't try permutation below).  Ideally we'd
	 even compute a permutation that will end up with the maximum
	 SLP tree size...  */
      if (is_a <bb_vec_info> (vinfo)
	  && !matches[0]
	  /* ??? Rejecting patterns this way doesn't work.  We'd have to
	     do extra work to cancel the pattern so the uses see the
	     scalar version.  */
	  && !is_pattern_stmt_p (stmt_info))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "Building vector operands from scalars\n");
	  this_tree_size++;
	  child = vect_create_new_slp_node (oprnd_info->def_stmts);
	  SLP_TREE_DEF_TYPE (child) = vect_external_def;
	  children.safe_push (child);
	  oprnd_info->def_stmts = vNULL;
	  continue;
	}

      /* If the SLP build for operand zero failed and operand zero
	 and one can be commuted, try that for the scalar stmts
	 that failed the match.  */
      if (i == 0
	  /* A first scalar stmt mismatch signals a fatal mismatch.  */
	  && matches[0]
	  /* ??? For COND_EXPRs we can swap the comparison operands
	     as well as the arms under some constraints.  */
	  && nops == 2
	  && oprnds_info[1]->first_dt == vect_internal_def
	  && is_gimple_assign (stmt_info->stmt)
	  /* Do so only if the number of unsuccessful permutes was no more
	     than a cut-off, as re-trying the recursive match on
	     possibly each level of the tree would expose exponential
	     behavior.  */
	  && *npermutes < 4)
	{
	  /* See whether we can swap the matching or the non-matching
	     stmt operands.  */
	  bool swap_not_matching = true;
	  do
	    {
	      for (j = 0; j < group_size; ++j)
		{
		  if (matches[j] != !swap_not_matching)
		    continue;
		  stmt_vec_info stmt_info = stmts[j];
		  /* Verify if we can swap operands of this stmt.  */
		  gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt);
		  if (!stmt
		      || !commutative_tree_code (gimple_assign_rhs_code (stmt)))
		    {
		      if (!swap_not_matching)
			goto fail;
		      swap_not_matching = false;
		      break;
		    }
		  /* Verify if we can safely swap or if we committed to a
		     specific operand order already.
		     ??? Instead of modifying GIMPLE stmts here we could
		     record whether we want to swap operands in the SLP
		     node and temporarily do that when processing it
		     (or wrap operand accessors in a helper).  */
		  else if (swap[j] != 0
			   || STMT_VINFO_NUM_SLP_USES (stmt_info))
		    {
		      if (!swap_not_matching)
			{
			  if (dump_enabled_p ())
			    dump_printf_loc (MSG_MISSED_OPTIMIZATION,
					     vect_location,
					     "Build SLP failed: cannot swap "
					     "operands of shared stmt %G",
					     stmts[j]->stmt);
			  goto fail;
			}
		      swap_not_matching = false;
		      break;
		    }
		}
	    }
	  while (j != group_size);

	  /* Swap mismatched definition stmts.  */
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "Re-trying with swapped operands of stmts ");
	  for (j = 0; j < group_size; ++j)
	    if (matches[j] == !swap_not_matching)
	      {
		std::swap (oprnds_info[0]->def_stmts[j],
			   oprnds_info[1]->def_stmts[j]);
		if (dump_enabled_p ())
		  dump_printf (MSG_NOTE, "%d ", j);
	      }
	  if (dump_enabled_p ())
	    dump_printf (MSG_NOTE, "\n");
	  /* And try again with scratch 'matches' ...  */
	  bool *tem = XALLOCAVEC (bool, group_size);
	  if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
					    group_size, &this_max_nunits,
					    tem, npermutes,
					    &this_tree_size, bst_map)) != NULL)
	    {
	      /* ... so if successful we can apply the operand swapping
		 to the GIMPLE IL.  This is necessary because for example
		 vect_get_slp_defs uses operand indexes and thus expects
		 canonical operand order.  This is also necessary even
		 if we end up building the operand from scalars as
		 we'll continue to process swapped operand two.  */
	      for (j = 0; j < group_size; ++j)
		gimple_set_plf (stmts[j]->stmt, GF_PLF_1, false);
	      for (j = 0; j < group_size; ++j)
		if (matches[j] == !swap_not_matching)
		  {
		    gassign *stmt = as_a <gassign *> (stmts[j]->stmt);
		    /* Avoid swapping operands twice.  */
		    if (gimple_plf (stmt, GF_PLF_1))
		      continue;
		    swap_ssa_operands (stmt, gimple_assign_rhs1_ptr (stmt),
				       gimple_assign_rhs2_ptr (stmt));
		    gimple_set_plf (stmt, GF_PLF_1, true);
		  }
	      /* Verify we swap all duplicates or none.  */
	      if (flag_checking)
		for (j = 0; j < group_size; ++j)
		  gcc_assert (gimple_plf (stmts[j]->stmt, GF_PLF_1)
			      == (matches[j] == !swap_not_matching));

	      /* If all of CHILD's children are built up from scalars then
		 just throw them away and build CHILD up from scalars
		 instead.  */
	      if (!SLP_TREE_CHILDREN (child).is_empty ()
		  /* ??? Rejecting patterns this way doesn't work.  We'd have
		     to do extra work to cancel the pattern so the uses see the
		     scalar version.  */
		  && !is_pattern_stmt_p (SLP_TREE_SCALAR_STMTS (child)[0]))
		{
		  unsigned int j;
		  slp_tree grandchild;

		  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild)
		    if (SLP_TREE_DEF_TYPE (grandchild) == vect_internal_def)
		      break;
		  if (!grandchild)
		    {
		      /* Roll back.  */
		      this_tree_size = old_tree_size;
		      FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild)
			vect_free_slp_tree (grandchild, false);
		      SLP_TREE_CHILDREN (child).truncate (0);

		      if (dump_enabled_p ())
			dump_printf_loc (MSG_NOTE, vect_location,
					 "Building parent vector operands from "
					 "scalars instead\n");
		      oprnd_info->def_stmts = vNULL;
		      SLP_TREE_DEF_TYPE (child) = vect_external_def;
		      ++this_tree_size;
		      children.safe_push (child);
		      continue;
		    }
		}

	      oprnd_info->def_stmts = vNULL;
	      children.safe_push (child);
	      continue;
	    }

	  ++*npermutes;
	}

fail:
      gcc_assert (child == NULL);
      FOR_EACH_VEC_ELT (children, j, child)
	vect_free_slp_tree (child, false);
      vect_free_oprnd_info (oprnds_info);
      return NULL;
    }

  vect_free_oprnd_info (oprnds_info);

  *tree_size += this_tree_size + 1;
  *max_nunits = this_max_nunits;

  node = vect_create_new_slp_node (stmts);
  SLP_TREE_TWO_OPERATORS (node) = two_operators;
  SLP_TREE_CHILDREN (node).splice (children);
  return node;
}

/* Dump an SLP tree NODE using flags specified in DUMP_KIND.  */

static void
vect_print_slp_tree (dump_flags_t dump_kind, dump_location_t loc,
		     slp_tree node, hash_set<slp_tree> &visited)
{
  int i;
  stmt_vec_info stmt_info;
  slp_tree child;

  if (visited.add (node))
    return;

  dump_metadata_t metadata (dump_kind, loc.get_impl_location ());
  dump_user_location_t user_loc = loc.get_user_location ();
  dump_printf_loc (metadata, user_loc, "node%s %p\n",
		   SLP_TREE_DEF_TYPE (node) != vect_internal_def
		   ? " (external)" : "", node);
  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
    dump_printf_loc (metadata, user_loc, "\tstmt %d %G", i, stmt_info->stmt);
  if (SLP_TREE_CHILDREN (node).is_empty ())
    return;
  dump_printf_loc (metadata, user_loc, "\tchildren");
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    dump_printf (dump_kind, " %p", (void *)child);
  dump_printf (dump_kind, "\n");
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_print_slp_tree (dump_kind, loc, child, visited);
}

static void
vect_print_slp_tree (dump_flags_t dump_kind, dump_location_t loc,
		     slp_tree node)
{
  hash_set<slp_tree> visited;
  vect_print_slp_tree (dump_kind, loc, node, visited);
}

/* Mark the tree rooted at NODE with PURE_SLP.  */

static void
vect_mark_slp_stmts (slp_tree node, hash_set<slp_tree> &visited)
{
  int i;
  stmt_vec_info stmt_info;
  slp_tree child;

  if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
    return;

  if (visited.add (node))
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
    STMT_SLP_TYPE (stmt_info) = pure_slp;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_mark_slp_stmts (child, visited);
}

static void
vect_mark_slp_stmts (slp_tree node)
{
  hash_set<slp_tree> visited;
  vect_mark_slp_stmts (node, visited);
}

/* Mark the statements of the tree rooted at NODE as relevant (vect_used).  */

static void
vect_mark_slp_stmts_relevant (slp_tree node, hash_set<slp_tree> &visited)
{
  int i;
  stmt_vec_info stmt_info;
  slp_tree child;

  if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
    return;

  if (visited.add (node))
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
    {
      gcc_assert (!STMT_VINFO_RELEVANT (stmt_info)
		  || STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope);
      STMT_VINFO_RELEVANT (stmt_info) = vect_used_in_scope;
    }

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_mark_slp_stmts_relevant (child, visited);
}

static void
vect_mark_slp_stmts_relevant (slp_tree node)
{
  hash_set<slp_tree> visited;
  vect_mark_slp_stmts_relevant (node, visited);
}


/* Rearrange the statements of NODE according to PERMUTATION.  */

static void
vect_slp_rearrange_stmts (slp_tree node, unsigned int group_size,
			  vec<unsigned> permutation,
			  hash_set<slp_tree> &visited)
{
  stmt_vec_info stmt_info;
  vec<stmt_vec_info> tmp_stmts;
  unsigned int i;
  slp_tree child;

  if (visited.add (node))
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_slp_rearrange_stmts (child, group_size, permutation, visited);

  gcc_assert (group_size == SLP_TREE_SCALAR_STMTS (node).length ());
  tmp_stmts.create (group_size);
  tmp_stmts.quick_grow_cleared (group_size);

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
    tmp_stmts[permutation[i]] = stmt_info;

  SLP_TREE_SCALAR_STMTS (node).release ();
  SLP_TREE_SCALAR_STMTS (node) = tmp_stmts;
}
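
/* Worked example of the scatter above (illustrative editor's note, not
   part of the original source): tmp_stmts[permutation[i]] = stmt_info
   moves the stmt at position I to position PERMUTATION[I].  For scalar
   stmts { s0, s1, s2 } and permutation { 2, 0, 1 } the node therefore
   ends up with { s1, s2, s0 }: s0 goes to slot 2, s1 to slot 0 and s2
   to slot 1.  */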


/* Attempt to reorder stmts in a reduction chain so that we don't
   require any load permutation.  Return true if that was possible,
   otherwise return false.  */

static bool
vect_attempt_slp_rearrange_stmts (slp_instance slp_instn)
{
  unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (slp_instn);
  unsigned int i, j;
  unsigned int lidx;
  slp_tree node, load;

  /* Compare all the permutation sequences to the first one.  We know
     that at least one load is permuted.  */
  node = SLP_INSTANCE_LOADS (slp_instn)[0];
  if (!node->load_permutation.exists ())
    return false;
  for (i = 1; SLP_INSTANCE_LOADS (slp_instn).iterate (i, &load); ++i)
    {
      if (!load->load_permutation.exists ())
	return false;
      FOR_EACH_VEC_ELT (load->load_permutation, j, lidx)
	if (lidx != node->load_permutation[j])
	  return false;
    }

  /* Check that the loads in the first sequence are different and there
     are no gaps between them.  */
  auto_sbitmap load_index (group_size);
  bitmap_clear (load_index);
  FOR_EACH_VEC_ELT (node->load_permutation, i, lidx)
    {
      if (lidx >= group_size)
	return false;
      if (bitmap_bit_p (load_index, lidx))
	return false;

      bitmap_set_bit (load_index, lidx);
    }
  for (i = 0; i < group_size; i++)
    if (!bitmap_bit_p (load_index, i))
      return false;

  /* This permutation is valid for reduction.  Since the order of the
     statements in the nodes is not important unless they are memory
     accesses, we can rearrange the statements in all the nodes
     according to the order of the loads.  */
  hash_set<slp_tree> visited;
  vect_slp_rearrange_stmts (SLP_INSTANCE_TREE (slp_instn), group_size,
			    node->load_permutation, visited);

  /* We are done, no actual permutations need to be generated.  */
  poly_uint64 unrolling_factor = SLP_INSTANCE_UNROLLING_FACTOR (slp_instn);
  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
    {
      stmt_vec_info first_stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
      first_stmt_info = DR_GROUP_FIRST_ELEMENT (first_stmt_info);
      /* But we have to keep those permutations that are required because
	 of handling of gaps.  */
      if (known_eq (unrolling_factor, 1U)
	  || (group_size == DR_GROUP_SIZE (first_stmt_info)
	      && DR_GROUP_GAP (first_stmt_info) == 0))
	SLP_TREE_LOAD_PERMUTATION (node).release ();
      else
	for (j = 0; j < SLP_TREE_LOAD_PERMUTATION (node).length (); ++j)
	  SLP_TREE_LOAD_PERMUTATION (node)[j] = j;
    }

  return true;
}

/* Gather loads in the SLP graph NODE and populate the INST loads array.  */

static void
vect_gather_slp_loads (slp_instance inst, slp_tree node,
		       hash_set<slp_tree> &visited)
{
  if (visited.add (node))
    return;

  if (SLP_TREE_CHILDREN (node).length () == 0)
    {
      stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
      if (SLP_TREE_DEF_TYPE (node) == vect_internal_def
	  && STMT_VINFO_GROUPED_ACCESS (stmt_info)
	  && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
	SLP_INSTANCE_LOADS (inst).safe_push (node);
    }
  else
    {
      unsigned i;
      slp_tree child;
      FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
	vect_gather_slp_loads (inst, child, visited);
    }
}

static void
vect_gather_slp_loads (slp_instance inst, slp_tree node)
{
  hash_set<slp_tree> visited;
  vect_gather_slp_loads (inst, node, visited);
}

/* Check if the required load permutations in the SLP instance
   SLP_INSTN are supported.  */

static bool
vect_supported_load_permutation_p (slp_instance slp_instn)
{
  unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (slp_instn);
  unsigned int i, j, k, next;
  slp_tree node;

  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location, "Load permutation ");
      FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
	if (node->load_permutation.exists ())
	  FOR_EACH_VEC_ELT (node->load_permutation, j, next)
	    dump_printf (MSG_NOTE, "%d ", next);
	else
	  for (k = 0; k < group_size; ++k)
	    dump_printf (MSG_NOTE, "%d ", k);
      dump_printf (MSG_NOTE, "\n");
    }

  /* In case of reduction every load permutation is allowed, since the order
     of the reduction statements is not important (as opposed to the case of
     grouped stores).  The only condition we need to check is that all the
     load nodes are of the same size and have the same permutation (and then
     rearrange all the nodes of the SLP instance according to this
     permutation).  */

  /* Check that all the load nodes are of the same size.  */
  /* ??? Can't we assert this?  */
  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
    if (SLP_TREE_SCALAR_STMTS (node).length () != (unsigned) group_size)
      return false;

  node = SLP_INSTANCE_TREE (slp_instn);
  stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];

  /* Reduction (there are no data-refs in the root).
     In a reduction chain the order of the loads is not important.  */
  if (!STMT_VINFO_DATA_REF (stmt_info)
      && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
    vect_attempt_slp_rearrange_stmts (slp_instn);

  /* In basic block vectorization we allow any subchain of an interleaving
     chain.
     FORNOW: not supported in loop SLP because of realignment complications.  */
  if (STMT_VINFO_BB_VINFO (stmt_info))
    {
      /* Check whether the loads in an instance form a subchain and thus
	 no permutation is necessary.  */
      FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
	{
	  if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
	    continue;
	  bool subchain_p = true;
	  stmt_vec_info next_load_info = NULL;
	  stmt_vec_info load_info;
	  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
	    {
	      if (j != 0
		  && (next_load_info != load_info
		      || DR_GROUP_GAP (load_info) != 1))
		{
		  subchain_p = false;
		  break;
		}
	      next_load_info = DR_GROUP_NEXT_ELEMENT (load_info);
	    }
	  if (subchain_p)
	    SLP_TREE_LOAD_PERMUTATION (node).release ();
	  else
	    {
	      stmt_vec_info group_info = SLP_TREE_SCALAR_STMTS (node)[0];
	      group_info = DR_GROUP_FIRST_ELEMENT (group_info);
	      unsigned HOST_WIDE_INT nunits;
	      unsigned k, maxk = 0;
	      FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (node), j, k)
		if (k > maxk)
		  maxk = k;
	      /* In BB vectorization we may not actually use a loaded vector
		 accessing elements in excess of DR_GROUP_SIZE.  */
	      tree vectype = STMT_VINFO_VECTYPE (group_info);
	      if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nunits)
		  || maxk >= (DR_GROUP_SIZE (group_info) & ~(nunits - 1)))
		{
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				     "BB vectorization with gaps at the end of "
				     "a load is not supported\n");
		  return false;
		}

	      /* Verify the permutation can be generated.  */
	      vec<tree> tem;
	      unsigned n_perms;
	      if (!vect_transform_slp_perm_load (node, tem, NULL,
						 1, slp_instn, true, &n_perms))
		{
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_MISSED_OPTIMIZATION,
				     vect_location,
				     "unsupported load permutation\n");
		  return false;
		}
	    }
	}
      return true;
    }

  /* For loop vectorization verify we can generate the permutation.  Be
     conservative about the vectorization factor: there are permutations
     that only start to need three vector inputs from a specific factor
     onwards, and the vectorization factor is not yet final.
     ??? The SLP instance unrolling factor might not be the maximum one.  */
  unsigned n_perms;
  poly_uint64 test_vf
    = force_common_multiple (SLP_INSTANCE_UNROLLING_FACTOR (slp_instn),
			     LOOP_VINFO_VECT_FACTOR
			       (STMT_VINFO_LOOP_VINFO (stmt_info)));
  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
    if (node->load_permutation.exists ()
	&& !vect_transform_slp_perm_load (node, vNULL, NULL, test_vf,
					  slp_instn, true, &n_perms))
      return false;

  return true;
}
1821
1822
1823 /* Find the last scalar statement in the SLP tree rooted at NODE. */
1824
1825 stmt_vec_info
1826 vect_find_last_scalar_stmt_in_slp (slp_tree node)
1827 {
1828 stmt_vec_info last = NULL;
1829 stmt_vec_info stmt_vinfo;
1830
1831 for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
1832 {
1833 stmt_vinfo = vect_orig_stmt (stmt_vinfo);
1834 last = last ? get_later_stmt (stmt_vinfo, last) : stmt_vinfo;
1835 }
1836
1837 return last;
1838 }
1839
1840 /* Splits a group of stores, currently beginning at FIRST_VINFO, into
1841 two groups: one (still beginning at FIRST_VINFO) of size GROUP1_SIZE
1842 (also containing the first GROUP1_SIZE stmts, since stores are
1843 consecutive), the second containing the remainder.
1844 Return the first stmt in the second group. */
1845
1846 static stmt_vec_info
1847 vect_split_slp_store_group (stmt_vec_info first_vinfo, unsigned group1_size)
1848 {
1849 gcc_assert (DR_GROUP_FIRST_ELEMENT (first_vinfo) == first_vinfo);
1850 gcc_assert (group1_size > 0);
1851 int group2_size = DR_GROUP_SIZE (first_vinfo) - group1_size;
1852 gcc_assert (group2_size > 0);
1853 DR_GROUP_SIZE (first_vinfo) = group1_size;
1854
1855 stmt_vec_info stmt_info = first_vinfo;
1856 for (unsigned i = group1_size; i > 1; i--)
1857 {
1858 stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info);
1859 gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
1860 }
1861 /* STMT_INFO is now the last element of the first group. */
1862 stmt_vec_info group2 = DR_GROUP_NEXT_ELEMENT (stmt_info);
1863 DR_GROUP_NEXT_ELEMENT (stmt_info) = 0;
1864
1865 DR_GROUP_SIZE (group2) = group2_size;
1866 for (stmt_info = group2; stmt_info;
1867 stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info))
1868 {
1869 DR_GROUP_FIRST_ELEMENT (stmt_info) = group2;
1870 gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
1871 }
1872
1873 /* For the second group, the DR_GROUP_GAP is that before the original group,
1874 plus skipping over the first vector. */
1875 DR_GROUP_GAP (group2) = DR_GROUP_GAP (first_vinfo) + group1_size;
1876
1877 /* DR_GROUP_GAP of the first group now has to skip over the second group too. */
1878 DR_GROUP_GAP (first_vinfo) += group2_size;
1879
1880 if (dump_enabled_p ())
1881 dump_printf_loc (MSG_NOTE, vect_location, "Split group into %d and %d\n",
1882 group1_size, group2_size);
1883
1884 return group2;
1885 }
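/* A worked example (hypothetical numbers): splitting a contiguous group
   of eight stores s0 ... s7 with DR_GROUP_GAP (s0) == 0 at
   GROUP1_SIZE == 4 gives

     group 1: s0 s1 s2 s3, DR_GROUP_SIZE == 4, DR_GROUP_GAP == 0 + 4
     group 2: s4 s5 s6 s7, DR_GROUP_SIZE == 4, DR_GROUP_GAP == 0 + 4

   so each group skips over the four elements now owned by the other
   group.  */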
1886
1887 /* Calculate the unrolling factor for an SLP instance with GROUP_SIZE
1888 statements and a vector of NUNITS elements. */
1889
1890 static poly_uint64
1891 calculate_unrolling_factor (poly_uint64 nunits, unsigned int group_size)
1892 {
1893 return exact_div (common_multiple (nunits, group_size), group_size);
1894 }
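/* For example (hypothetical numbers): with V4SI vectors (NUNITS == 4)
   and GROUP_SIZE == 6,

     common_multiple (4, 6) == 12
     exact_div (12, 6) == 2

   so the instance must be unrolled twice, giving 12 scalars that fill
   exactly three V4SI vectors.  */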
1895
1896 /* Analyze an SLP instance starting from a group of grouped stores. Call
1897 vect_build_slp_tree to build a tree of packed stmts if possible.
1898 Return FALSE if it's impossible to SLP any stmt in the loop. */
1899
1900 static bool
1901 vect_analyze_slp_instance (vec_info *vinfo,
1902 stmt_vec_info stmt_info, unsigned max_tree_size)
1903 {
1904 slp_instance new_instance;
1905 slp_tree node;
1906 unsigned int group_size;
1907 tree vectype, scalar_type = NULL_TREE;
1908 unsigned int i;
1909 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
1910 vec<stmt_vec_info> scalar_stmts;
1911
1912 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1913 {
1914 scalar_type = TREE_TYPE (DR_REF (dr));
1915 vectype = get_vectype_for_scalar_type (scalar_type);
1916 group_size = DR_GROUP_SIZE (stmt_info);
1917 }
1918 else if (!dr && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
1919 {
1920 gcc_assert (is_a <loop_vec_info> (vinfo));
1921 vectype = STMT_VINFO_VECTYPE (stmt_info);
1922 group_size = REDUC_GROUP_SIZE (stmt_info);
1923 }
1924 else
1925 {
1926 gcc_assert (is_a <loop_vec_info> (vinfo));
1927 vectype = STMT_VINFO_VECTYPE (stmt_info);
1928 group_size = as_a <loop_vec_info> (vinfo)->reductions.length ();
1929 }
1930
1931 if (!vectype)
1932 {
1933 if (dump_enabled_p ())
1934 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1935 "Build SLP failed: unsupported data-type %T\n",
1936 scalar_type);
1937
1938 return false;
1939 }
1940 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
1941
1942 /* Create a node (a root of the SLP tree) for the packed grouped stores. */
1943 scalar_stmts.create (group_size);
1944 stmt_vec_info next_info = stmt_info;
1945 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1946 {
1947 /* Collect the stores and store them in SLP_TREE_SCALAR_STMTS. */
1948 while (next_info)
1949 {
1950 scalar_stmts.safe_push (vect_stmt_to_vectorize (next_info));
1951 next_info = DR_GROUP_NEXT_ELEMENT (next_info);
1952 }
1953 }
1954 else if (!dr && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
1955 {
1956 /* Collect the reduction stmts and store them in
1957 SLP_TREE_SCALAR_STMTS. */
1958 while (next_info)
1959 {
1960 scalar_stmts.safe_push (vect_stmt_to_vectorize (next_info));
1961 next_info = REDUC_GROUP_NEXT_ELEMENT (next_info);
1962 }
1963 /* Mark the first element of the reduction chain as reduction to properly
1964 transform the node. In the reduction analysis phase only the last
1965 element of the chain is marked as reduction. */
1966 STMT_VINFO_DEF_TYPE (stmt_info) = vect_reduction_def;
1967 STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))
1968 = STMT_VINFO_REDUC_DEF (vect_orig_stmt (scalar_stmts.last ()));
1969 }
1970 else
1971 {
1972 /* Collect reduction statements. */
1973 vec<stmt_vec_info> reductions = as_a <loop_vec_info> (vinfo)->reductions;
1974 for (i = 0; reductions.iterate (i, &next_info); i++)
1975 scalar_stmts.safe_push (next_info);
1976 }
1977
1978 /* Build the tree for the SLP instance. */
1979 bool *matches = XALLOCAVEC (bool, group_size);
1980 unsigned npermutes = 0;
1981 scalar_stmts_to_slp_tree_map_t *bst_map
1982 = new scalar_stmts_to_slp_tree_map_t ();
1983 poly_uint64 max_nunits = nunits;
1984 unsigned tree_size = 0;
1985 node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
1986 &max_nunits, matches, &npermutes,
1987 &tree_size, bst_map);
1988 /* The map keeps a reference on SLP nodes built, release that. */
1989 for (scalar_stmts_to_slp_tree_map_t::iterator it = bst_map->begin ();
1990 it != bst_map->end (); ++it)
1991 if ((*it).second)
1992 vect_free_slp_tree ((*it).second, false);
1993 delete bst_map;
1994 if (node != NULL)
1995 {
1996 /* Calculate the unrolling factor based on the smallest type. */
1997 poly_uint64 unrolling_factor
1998 = calculate_unrolling_factor (max_nunits, group_size);
1999
2000 if (maybe_ne (unrolling_factor, 1U)
2001 && is_a <bb_vec_info> (vinfo))
2002 {
2003 unsigned HOST_WIDE_INT const_max_nunits;
2004 if (!max_nunits.is_constant (&const_max_nunits)
2005 || const_max_nunits > group_size)
2006 {
2007 if (dump_enabled_p ())
2008 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2009 "Build SLP failed: store group "
2010 "size not a multiple of the vector size "
2011 "in basic block SLP\n");
2012 vect_free_slp_tree (node, false);
2013 return false;
2014 }
2015 /* Fatal mismatch. */
2016 matches[group_size / const_max_nunits * const_max_nunits] = false;
2017 vect_free_slp_tree (node, false);
2018 }
2019 else
2020 {
2021 /* Create a new SLP instance. */
2022 new_instance = XNEW (struct _slp_instance);
2023 SLP_INSTANCE_TREE (new_instance) = node;
2024 SLP_INSTANCE_GROUP_SIZE (new_instance) = group_size;
2025 SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor;
2026 SLP_INSTANCE_LOADS (new_instance) = vNULL;
2027 vect_gather_slp_loads (new_instance, node);
2028 if (dump_enabled_p ())
2029 dump_printf_loc (MSG_NOTE, vect_location,
2030 "SLP size %u vs. limit %u.\n",
2031 tree_size, max_tree_size);
2032
2033 /* Compute the load permutation. */
2034 slp_tree load_node;
2035 bool loads_permuted = false;
2036 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (new_instance), i, load_node)
2037 {
2038 vec<unsigned> load_permutation;
2039 int j;
2040 stmt_vec_info load_info;
2041 bool this_load_permuted = false;
2042 load_permutation.create (group_size);
2043 stmt_vec_info first_stmt_info = DR_GROUP_FIRST_ELEMENT
2044 (SLP_TREE_SCALAR_STMTS (load_node)[0]);
2045 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load_info)
2046 {
2047 int load_place = vect_get_place_in_interleaving_chain
2048 (load_info, first_stmt_info);
2049 gcc_assert (load_place != -1);
2050 if (load_place != j)
2051 this_load_permuted = true;
2052 load_permutation.safe_push (load_place);
2053 }
2054 if (!this_load_permuted
2055 /* The load requires permutation when unrolling exposes
2056 a gap either because the group is larger than the SLP
2057 group-size or because there is a gap between the groups. */
2058 && (known_eq (unrolling_factor, 1U)
2059 || (group_size == DR_GROUP_SIZE (first_stmt_info)
2060 && DR_GROUP_GAP (first_stmt_info) == 0)))
2061 {
2062 load_permutation.release ();
2063 continue;
2064 }
2065 SLP_TREE_LOAD_PERMUTATION (load_node) = load_permutation;
2066 loads_permuted = true;
2067 }
2068
2069 if (loads_permuted)
2070 {
2071 if (!vect_supported_load_permutation_p (new_instance))
2072 {
2073 if (dump_enabled_p ())
2074 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2075 "Build SLP failed: unsupported load "
2076 "permutation %G", stmt_info->stmt);
2077 vect_free_slp_instance (new_instance, false);
2078 return false;
2079 }
2080 }
2081
2082 /* If the loads and stores can be handled with load/store-lanes
2083 instructions, do not generate this SLP instance. */
2084 if (is_a <loop_vec_info> (vinfo)
2085 && loads_permuted
2086 && dr && vect_store_lanes_supported (vectype, group_size, false))
2087 {
2088 slp_tree load_node;
2089 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (new_instance), i, load_node)
2090 {
2091 stmt_vec_info stmt_vinfo = DR_GROUP_FIRST_ELEMENT
2092 (SLP_TREE_SCALAR_STMTS (load_node)[0]);
2093 /* Use SLP for strided accesses (or if we can't load-lanes). */
2094 if (STMT_VINFO_STRIDED_P (stmt_vinfo)
2095 || ! vect_load_lanes_supported
2096 (STMT_VINFO_VECTYPE (stmt_vinfo),
2097 DR_GROUP_SIZE (stmt_vinfo), false))
2098 break;
2099 }
2100 if (i == SLP_INSTANCE_LOADS (new_instance).length ())
2101 {
2102 if (dump_enabled_p ())
2103 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2104 "Built SLP cancelled: can use "
2105 "load/store-lanes\n");
2106 vect_free_slp_instance (new_instance, false);
2107 return false;
2108 }
2109 }
2110
2111 vinfo->slp_instances.safe_push (new_instance);
2112
2113 if (dump_enabled_p ())
2114 {
2115 dump_printf_loc (MSG_NOTE, vect_location,
2116 "Final SLP tree for instance:\n");
2117 vect_print_slp_tree (MSG_NOTE, vect_location, node);
2118 }
2119
2120 return true;
2121 }
2122 }
2123 else
2124 {
2125 /* Failed to SLP. */
2126 /* Free the allocated memory. */
2127 scalar_stmts.release ();
2128 }
2129
2130 /* For basic block SLP, try to break the group up into multiples of the
2131 vector size. */
2132 unsigned HOST_WIDE_INT const_nunits;
2133 if (is_a <bb_vec_info> (vinfo)
2134 && STMT_VINFO_GROUPED_ACCESS (stmt_info)
2135 && DR_GROUP_FIRST_ELEMENT (stmt_info)
2136 && nunits.is_constant (&const_nunits))
2137 {
2138 /* We consider breaking the group only on VF boundaries from the existing
2139 start. */
2140 for (i = 0; i < group_size; i++)
2141 if (!matches[i])
	break;
2142
2143 if (i >= const_nunits && i < group_size)
2144 {
2145 /* Split into two groups at the first vector boundary before i. */
2146 gcc_assert ((const_nunits & (const_nunits - 1)) == 0);
2147 unsigned group1_size = i & ~(const_nunits - 1);
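/* E.g. (hypothetical numbers): if the first mismatch is at i == 6 and
   const_nunits == 4, then group1_size == (6 & ~3) == 4 and the split
   happens after the first full vector.  */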
2148
2149 stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
2150 group1_size);
2151 bool res = vect_analyze_slp_instance (vinfo, stmt_info,
2152 max_tree_size);
2153 /* If the first non-match was in the middle of a vector,
2154 skip the rest of that vector. */
2155 if (group1_size < i)
2156 {
2157 i = group1_size + const_nunits;
2158 if (i < group_size)
2159 rest = vect_split_slp_store_group (rest, const_nunits);
2160 }
2161 if (i < group_size)
2162 res |= vect_analyze_slp_instance (vinfo, rest, max_tree_size);
2163 return res;
2164 }
2165 /* Even though the first vector did not all match, we might be able to SLP
2166 (some of) the remainder. FORNOW ignore this possibility. */
2167 }
2168
2169 return false;
2170 }
2171
2172
2173 /* Check if there are stmts in the loop that can be vectorized using SLP.
2174 Build SLP trees of packed scalar stmts if SLP is possible. */
2175
2176 opt_result
2177 vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
2178 {
2179 unsigned int i;
2180 stmt_vec_info first_element;
2181
2182 DUMP_VECT_SCOPE ("vect_analyze_slp");
2183
2184 /* Find SLP sequences starting from groups of grouped stores. */
2185 FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
2186 vect_analyze_slp_instance (vinfo, first_element, max_tree_size);
2187
2188 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
2189 {
2190 if (loop_vinfo->reduction_chains.length () > 0)
2191 {
2192 /* Find SLP sequences starting from reduction chains. */
2193 FOR_EACH_VEC_ELT (loop_vinfo->reduction_chains, i, first_element)
2194 if (! vect_analyze_slp_instance (vinfo, first_element,
2195 max_tree_size))
2196 {
2197 /* Dissolve reduction chain group. */
2198 stmt_vec_info vinfo = first_element;
2199 while (vinfo)
2200 {
2201 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
2202 REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
2203 REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
2204 vinfo = next;
2205 }
2206 STMT_VINFO_DEF_TYPE (first_element) = vect_internal_def;
2207 }
2208 }
2209
2210 /* Find SLP sequences starting from groups of reductions. */
2211 if (loop_vinfo->reductions.length () > 1)
2212 vect_analyze_slp_instance (vinfo, loop_vinfo->reductions[0],
2213 max_tree_size);
2214 }
2215
2216 return opt_result::success ();
2217 }
2218
2219
2220 /* For each possible SLP instance decide whether to SLP it and calculate
2221 the overall unrolling factor needed to SLP the loop. Return TRUE if we
2222 decided to SLP at least one instance. */
2223
2224 bool
2225 vect_make_slp_decision (loop_vec_info loop_vinfo)
2226 {
2227 unsigned int i;
2228 poly_uint64 unrolling_factor = 1;
2229 vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
2230 slp_instance instance;
2231 int decided_to_slp = 0;
2232
2233 DUMP_VECT_SCOPE ("vect_make_slp_decision");
2234
2235 FOR_EACH_VEC_ELT (slp_instances, i, instance)
2236 {
2237 /* FORNOW: SLP if you can. */
2238 /* All unroll factors have the form current_vector_size * X for some
2239 rational X, so they must have a common multiple. */
2240 unrolling_factor
2241 = force_common_multiple (unrolling_factor,
2242 SLP_INSTANCE_UNROLLING_FACTOR (instance));
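/* E.g. (hypothetical factors): instances with unrolling factors 2 and 3
   for the same current_vector_size combine to a common loop unrolling
   factor of 6.  */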
2243
2244 /* Mark all the stmts that belong to INSTANCE as PURE_SLP stmts. Later we
2245 call vect_detect_hybrid_slp () to find stmts that need hybrid SLP and
2246 loop-based vectorization. Such stmts will be marked as HYBRID. */
2247 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
2248 decided_to_slp++;
2249 }
2250
2251 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo) = unrolling_factor;
2252
2253 if (decided_to_slp && dump_enabled_p ())
2254 {
2255 dump_printf_loc (MSG_NOTE, vect_location,
2256 "Decided to SLP %d instances. Unrolling factor ",
2257 decided_to_slp);
2258 dump_dec (MSG_NOTE, unrolling_factor);
2259 dump_printf (MSG_NOTE, "\n");
2260 }
2261
2262 return (decided_to_slp > 0);
2263 }
2264
2265
2266 /* Find stmts that must be both vectorized and SLPed (since they feed stmts that
2267 can't be SLPed) in the tree rooted at NODE. Mark such stmts as HYBRID. */
2268
2269 static void
2270 vect_detect_hybrid_slp_stmts (slp_tree node, unsigned i, slp_vect_type stype,
2271 hash_map<slp_tree, unsigned> &visited)
2272 {
2273 stmt_vec_info stmt_vinfo = SLP_TREE_SCALAR_STMTS (node)[i];
2274 imm_use_iterator imm_iter;
2275 gimple *use_stmt;
2276 stmt_vec_info use_vinfo;
2277 slp_tree child;
2278 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
2279 int j;
2280
2281 /* We need to union stype over the incoming graph edges but we still
2282 want to limit recursion to stay O(N+E). */
2283 bool only_edge = (++visited.get_or_insert (node) < node->refcnt);
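/* E.g. (hypothetical refcnt): a node with refcnt == 3 is entered once
   per incoming graph edge; the first two visits only merge STYPE into
   the node, and only the third (last) visit recurses into the children,
   keeping the walk O(N+E).  */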
2284
2285 /* Propagate hybrid down the SLP tree. */
2286 if (stype == hybrid)
2287 ;
2288 else if (HYBRID_SLP_STMT (stmt_vinfo))
2289 stype = hybrid;
2290 else if (!only_edge)
2291 {
2292 /* Check if a pure SLP stmt has uses in non-SLP stmts. */
2293 gcc_checking_assert (PURE_SLP_STMT (stmt_vinfo));
2294 /* If we get a pattern stmt here we have to use the LHS of the
2295 original stmt for immediate uses. */
2296 gimple *stmt = vect_orig_stmt (stmt_vinfo)->stmt;
2297 tree def;
2298 if (gimple_code (stmt) == GIMPLE_PHI)
2299 def = gimple_phi_result (stmt);
2300 else
2301 def = SINGLE_SSA_TREE_OPERAND (stmt, SSA_OP_DEF);
2302 if (def)
2303 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, def)
2304 {
2305 use_vinfo = loop_vinfo->lookup_stmt (use_stmt);
2306 if (!use_vinfo)
2307 continue;
2308 use_vinfo = vect_stmt_to_vectorize (use_vinfo);
2309 if (!STMT_SLP_TYPE (use_vinfo)
2310 && (STMT_VINFO_RELEVANT (use_vinfo)
2311 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (use_vinfo)))
2312 && !(gimple_code (use_stmt) == GIMPLE_PHI
2313 && STMT_VINFO_DEF_TYPE (use_vinfo) == vect_reduction_def))
2314 {
2315 if (dump_enabled_p ())
2316 dump_printf_loc (MSG_NOTE, vect_location, "use of SLP "
2317 "def in non-SLP stmt: %G", use_stmt);
2318 stype = hybrid;
2319 }
2320 }
2321 }
2322
2323 if (stype == hybrid
2324 && !HYBRID_SLP_STMT (stmt_vinfo))
2325 {
2326 if (dump_enabled_p ())
2327 dump_printf_loc (MSG_NOTE, vect_location, "marking hybrid: %G",
2328 stmt_vinfo->stmt);
2329 STMT_SLP_TYPE (stmt_vinfo) = hybrid;
2330 }
2331
2332 if (!only_edge)
2333 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
2334 if (SLP_TREE_DEF_TYPE (child) != vect_external_def)
2335 vect_detect_hybrid_slp_stmts (child, i, stype, visited);
2336 }
2337
2338 static void
2339 vect_detect_hybrid_slp_stmts (slp_tree node, unsigned i, slp_vect_type stype)
2340 {
2341 hash_map<slp_tree, unsigned> visited;
2342 vect_detect_hybrid_slp_stmts (node, i, stype, visited);
2343 }
2344
2345 /* Helpers for vect_detect_hybrid_slp walking pattern stmt uses. */
2346
2347 static tree
2348 vect_detect_hybrid_slp_1 (tree *tp, int *, void *data)
2349 {
2350 walk_stmt_info *wi = (walk_stmt_info *)data;
2351 loop_vec_info loop_vinfo = (loop_vec_info) wi->info;
2352
2353 if (wi->is_lhs)
2354 return NULL_TREE;
2355
2356 stmt_vec_info def_stmt_info = loop_vinfo->lookup_def (*tp);
2357 if (def_stmt_info && PURE_SLP_STMT (def_stmt_info))
2358 {
2359 if (dump_enabled_p ())
2360 dump_printf_loc (MSG_NOTE, vect_location, "marking hybrid: %G",
2361 def_stmt_info->stmt);
2362 STMT_SLP_TYPE (def_stmt_info) = hybrid;
2363 }
2364
2365 return NULL_TREE;
2366 }
2367
2368 static tree
2369 vect_detect_hybrid_slp_2 (gimple_stmt_iterator *gsi, bool *handled,
2370 walk_stmt_info *wi)
2371 {
2372 loop_vec_info loop_vinfo = (loop_vec_info) wi->info;
2373 stmt_vec_info use_vinfo = loop_vinfo->lookup_stmt (gsi_stmt (*gsi));
2374 /* If the stmt is in a SLP instance then this isn't a reason
2375 to mark use definitions in other SLP instances as hybrid. */
2376 if (! STMT_SLP_TYPE (use_vinfo)
2377 && (STMT_VINFO_RELEVANT (use_vinfo)
2378 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (use_vinfo)))
2379 && ! (gimple_code (gsi_stmt (*gsi)) == GIMPLE_PHI
2380 && STMT_VINFO_DEF_TYPE (use_vinfo) == vect_reduction_def))
2381 ;
2382 else
2383 *handled = true;
2384 return NULL_TREE;
2385 }
2386
2387 /* Find stmts that must be both vectorized and SLPed. */
2388
2389 void
2390 vect_detect_hybrid_slp (loop_vec_info loop_vinfo)
2391 {
2392 unsigned int i;
2393 vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
2394 slp_instance instance;
2395
2396 DUMP_VECT_SCOPE ("vect_detect_hybrid_slp");
2397
2398 /* First walk all pattern stmts in the loop and mark the defs of their
2399 uses as hybrid, because immediate uses in pattern stmts are not recorded. */
2400 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2401 {
2402 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2403 for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
2404 gsi_next (&gsi))
2405 {
2406 gimple *stmt = gsi_stmt (gsi);
2407 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
2408 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2409 {
2410 walk_stmt_info wi;
2411 memset (&wi, 0, sizeof (wi));
2412 wi.info = loop_vinfo;
2413 gimple_stmt_iterator gsi2
2414 = gsi_for_stmt (STMT_VINFO_RELATED_STMT (stmt_info)->stmt);
2415 walk_gimple_stmt (&gsi2, vect_detect_hybrid_slp_2,
2416 vect_detect_hybrid_slp_1, &wi);
2417 walk_gimple_seq (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info),
2418 vect_detect_hybrid_slp_2,
2419 vect_detect_hybrid_slp_1, &wi);
2420 }
2421 }
2422 }
2423
2424 /* Then walk the SLP instance trees marking stmts with uses in
2425 non-SLP stmts as hybrid, also propagating hybrid down the
2426 SLP tree, collecting the above info on-the-fly. */
2427 FOR_EACH_VEC_ELT (slp_instances, i, instance)
2428 {
2429 for (unsigned i = 0; i < SLP_INSTANCE_GROUP_SIZE (instance); ++i)
2430 vect_detect_hybrid_slp_stmts (SLP_INSTANCE_TREE (instance),
2431 i, pure_slp);
2432 }
2433 }
2434
2435
2436 /* Initialize a bb_vec_info struct for the statements between
2437 REGION_BEGIN_IN (inclusive) and REGION_END_IN (exclusive). */
2438
2439 _bb_vec_info::_bb_vec_info (gimple_stmt_iterator region_begin_in,
2440 gimple_stmt_iterator region_end_in,
2441 vec_info_shared *shared)
2442 : vec_info (vec_info::bb, init_cost (NULL), shared),
2443 bb (gsi_bb (region_begin_in)),
2444 region_begin (region_begin_in),
2445 region_end (region_end_in)
2446 {
2447 gimple_stmt_iterator gsi;
2448
2449 for (gsi = region_begin; gsi_stmt (gsi) != gsi_stmt (region_end);
2450 gsi_next (&gsi))
2451 {
2452 gimple *stmt = gsi_stmt (gsi);
2453 gimple_set_uid (stmt, 0);
2454 add_stmt (stmt);
2455 }
2456
2457 bb->aux = this;
2458 }
2459
2460
2461 /* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the
2462 stmts in the basic block. */
2463
2464 _bb_vec_info::~_bb_vec_info ()
2465 {
2466 for (gimple_stmt_iterator si = region_begin;
2467 gsi_stmt (si) != gsi_stmt (region_end); gsi_next (&si))
2468 /* Reset region marker. */
2469 gimple_set_uid (gsi_stmt (si), -1);
2470
2471 bb->aux = NULL;
2472 }
2473
2474 /* Subroutine of vect_slp_analyze_node_operations. Handle the root of NODE,
2475 given that child nodes have already been processed, and that
2476 their def types currently match their SLP node's def type. */
2477
2478 static bool
2479 vect_slp_analyze_node_operations_1 (vec_info *vinfo, slp_tree node,
2480 slp_instance node_instance,
2481 stmt_vector_for_cost *cost_vec)
2482 {
2483 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2484 gcc_assert (STMT_SLP_TYPE (stmt_info) != loop_vect);
2485
2486 /* For BB vectorization vector types are assigned here.
2487 Memory accesses already got their vector type assigned
2488 in vect_analyze_data_refs. */
2489 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2490 if (bb_vinfo
2491 && ! STMT_VINFO_DATA_REF (stmt_info))
2492 {
2493 tree vectype, nunits_vectype;
2494 if (!vect_get_vector_types_for_stmt (stmt_info, &vectype,
2495 &nunits_vectype))
2496 /* We checked this when building the node. */
2497 gcc_unreachable ();
2498 if (vectype == boolean_type_node)
2499 {
2500 vectype = vect_get_mask_type_for_stmt (stmt_info);
2501 if (!vectype)
2502 /* vect_get_mask_type_for_stmt has already explained the
2503 failure. */
2504 return false;
2505 }
2506
2507 stmt_vec_info sstmt_info;
2508 unsigned int i;
2509 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, sstmt_info)
2510 STMT_VINFO_VECTYPE (sstmt_info) = vectype;
2511 }
2512
2513 /* Calculate the number of vector statements to be created for the
2514 scalar stmts in this node. For SLP reductions it is equal to the
2515 number of vector statements in the children (which has already been
2516 calculated by the recursive call). Otherwise it is the number of
2517 scalar elements in one scalar iteration (DR_GROUP_SIZE) multiplied by
2518 VF divided by the number of elements in a vector. */
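/* For example (hypothetical numbers): a store group with
   GROUP_SIZE == 4, VF == 8 and V8SI vectors gives
   vect_get_num_vectors (8 * 4, V8SI) == 4 vector statements.  */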
2519 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2520 && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
2521 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
2522 = SLP_TREE_NUMBER_OF_VEC_STMTS (SLP_TREE_CHILDREN (node)[0]);
2523 else
2524 {
2525 poly_uint64 vf;
2526 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
2527 vf = loop_vinfo->vectorization_factor;
2528 else
2529 vf = 1;
2530 unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (node_instance);
2531 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2532 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
2533 = vect_get_num_vectors (vf * group_size, vectype);
2534 }
2535
2536 bool dummy;
2537 return vect_analyze_stmt (stmt_info, &dummy, node, node_instance, cost_vec);
2538 }
2539
2540 /* Analyze statements contained in SLP tree NODE after recursively analyzing
2541 the subtree. NODE_INSTANCE contains NODE and VINFO contains INSTANCE.
2542
2543 Return true if the operations are supported. */
2544
2545 static bool
2546 vect_slp_analyze_node_operations (vec_info *vinfo, slp_tree node,
2547 slp_instance node_instance,
2548 scalar_stmts_to_slp_tree_map_t *visited,
2549 scalar_stmts_to_slp_tree_map_t *lvisited,
2550 stmt_vector_for_cost *cost_vec)
2551 {
2552 int i, j;
2553 slp_tree child;
2554
2555 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2556 return true;
2557
2558 /* If we already analyzed the exact same set of scalar stmts we're done.
2559 We share the generated vector stmts for those. */
2560 slp_tree *leader;
2561 if ((leader = visited->get (SLP_TREE_SCALAR_STMTS (node)))
2562 || (leader = lvisited->get (SLP_TREE_SCALAR_STMTS (node))))
2563 {
2564 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
2565 = SLP_TREE_NUMBER_OF_VEC_STMTS (*leader);
2566 return true;
2567 }
2568
2569 /* The SLP graph is acyclic so not caching whether we failed or succeeded
2570 doesn't result in any issue since we throw away the lvisited set
2571 when we fail. */
2572 lvisited->put (SLP_TREE_SCALAR_STMTS (node).copy (), node);
2573
2574 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2575 if (!vect_slp_analyze_node_operations (vinfo, child, node_instance,
2576 visited, lvisited, cost_vec))
2577 return false;
2578
2579 /* ??? We have to catch late the case where the same first scalar stmt
2580 appears in multiple SLP children with different def types, and fail.
2581 Remember the original def types first since SLP_TREE_DEF_TYPE doesn't
2582 necessarily match them when it is vect_internal_def. */
2583 auto_vec<vect_def_type, 4> dt;
2584 dt.safe_grow (SLP_TREE_CHILDREN (node).length ());
2585 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
2586 dt[j] = STMT_VINFO_DEF_TYPE (SLP_TREE_SCALAR_STMTS (child)[0]);
2587
2588 /* Push SLP node def-type to stmt operands. */
2589 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
2590 if (SLP_TREE_DEF_TYPE (child) != vect_internal_def)
2591 STMT_VINFO_DEF_TYPE (SLP_TREE_SCALAR_STMTS (child)[0])
2592 = SLP_TREE_DEF_TYPE (child);
2593
2594 /* Check everything worked out. */
2595 bool res = true;
2596 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
2597 if (SLP_TREE_DEF_TYPE (child) != vect_internal_def)
2598 {
2599 if (STMT_VINFO_DEF_TYPE (SLP_TREE_SCALAR_STMTS (child)[0])
2600 != SLP_TREE_DEF_TYPE (child))
2601 res = false;
2602 }
2603 else if (STMT_VINFO_DEF_TYPE (SLP_TREE_SCALAR_STMTS (child)[0]) != dt[j])
2604 res = false;
2605 if (!res && dump_enabled_p ())
2606 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2607 "not vectorized: same operand with different "
2608 "def type in stmt.\n");
2609
2610 if (res)
2611 res = vect_slp_analyze_node_operations_1 (vinfo, node, node_instance,
2612 cost_vec);
2613
2614 /* Restore def-types. */
2615 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
2616 STMT_VINFO_DEF_TYPE (SLP_TREE_SCALAR_STMTS (child)[0]) = dt[j];
2617
2618 return res;
2619 }
2620
2621
2622 /* Analyze statements in SLP instances of VINFO. Return true if the
2623 operations are supported. */
2624
2625 bool
2626 vect_slp_analyze_operations (vec_info *vinfo)
2627 {
2628 slp_instance instance;
2629 int i;
2630
2631 DUMP_VECT_SCOPE ("vect_slp_analyze_operations");
2632
2633 scalar_stmts_to_slp_tree_map_t *visited
2634 = new scalar_stmts_to_slp_tree_map_t ();
2635 for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
2636 {
2637 scalar_stmts_to_slp_tree_map_t lvisited;
2638 stmt_vector_for_cost cost_vec;
2639 cost_vec.create (2);
2640 if (!vect_slp_analyze_node_operations (vinfo,
2641 SLP_INSTANCE_TREE (instance),
2642 instance, visited, &lvisited,
2643 &cost_vec))
2644 {
2645 slp_tree node = SLP_INSTANCE_TREE (instance);
2646 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2647 if (dump_enabled_p ())
2648 dump_printf_loc (MSG_NOTE, vect_location,
2649 "removing SLP instance operations starting from: %G",
2650 stmt_info->stmt);
2651 vect_free_slp_instance (instance, false);
2652 vinfo->slp_instances.ordered_remove (i);
2653 cost_vec.release ();
2654 }
2655 else
2656 {
2657 for (scalar_stmts_to_slp_tree_map_t::iterator x = lvisited.begin();
2658 x != lvisited.end(); ++x)
2659 visited->put ((*x).first.copy (), (*x).second);
2660 i++;
2661
2662 add_stmt_costs (vinfo->target_cost_data, &cost_vec);
2663 cost_vec.release ();
2664 }
2665 }
2666 delete visited;
2667
2668 return !vinfo->slp_instances.is_empty ();
2669 }
2670
2671
2672 /* Compute the scalar cost of the SLP node NODE and its children,
2673 recording it in COST_VEC. Do not account defs that are marked in LIFE,
2674 and update LIFE according to uses of NODE. */
2675
2676 static void
2677 vect_bb_slp_scalar_cost (basic_block bb,
2678 slp_tree node, vec<bool, va_heap> *life,
2679 stmt_vector_for_cost *cost_vec,
2680 hash_set<slp_tree> &visited)
2681 {
2682 unsigned i;
2683 stmt_vec_info stmt_info;
2684 slp_tree child;
2685
2686 if (visited.add (node))
2687 return;
2688
2689 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2690 {
2691 gimple *stmt = stmt_info->stmt;
2692 vec_info *vinfo = stmt_info->vinfo;
2693 ssa_op_iter op_iter;
2694 def_operand_p def_p;
2695
2696 if ((*life)[i])
2697 continue;
2698
2699 /* If there is a non-vectorized use of the defs then the scalar
2700 stmt is kept live in which case we do not account it or any
2701 required defs in the SLP children in the scalar cost. This
2702 way we make the vectorization more costly when compared to
2703 the scalar cost. */
2704 FOR_EACH_SSA_DEF_OPERAND (def_p, stmt, op_iter, SSA_OP_DEF)
2705 {
2706 imm_use_iterator use_iter;
2707 gimple *use_stmt;
2708 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
2709 if (!is_gimple_debug (use_stmt))
2710 {
2711 stmt_vec_info use_stmt_info = vinfo->lookup_stmt (use_stmt);
2712 if (!use_stmt_info || !PURE_SLP_STMT (use_stmt_info))
2713 {
2714 (*life)[i] = true;
2715 BREAK_FROM_IMM_USE_STMT (use_iter);
2716 }
2717 }
2718 }
2719 if ((*life)[i])
2720 continue;
2721
2722 /* Count scalar stmts only once. */
2723 if (gimple_visited_p (stmt))
2724 continue;
2725 gimple_set_visited (stmt, true);
2726
2727 vect_cost_for_stmt kind;
2728 if (STMT_VINFO_DATA_REF (stmt_info))
2729 {
2730 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
2731 kind = scalar_load;
2732 else
2733 kind = scalar_store;
2734 }
2735 else
2736 kind = scalar_stmt;
2737 record_stmt_cost (cost_vec, 1, kind, stmt_info, 0, vect_body);
2738 }
2739
2740 auto_vec<bool, 20> subtree_life;
2741 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2742 {
2743 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
2744 {
2745 /* Do not directly pass LIFE to the recursive call, copy it to
2746 confine changes in the callee to the current child/subtree. */
2747 subtree_life.safe_splice (*life);
2748 vect_bb_slp_scalar_cost (bb, child, &subtree_life, cost_vec,
2749 visited);
2750 subtree_life.truncate (0);
2751 }
2752 }
2753 }
2754
2755 static void
2756 vect_bb_slp_scalar_cost (basic_block bb,
2757 slp_tree node, vec<bool, va_heap> *life,
2758 stmt_vector_for_cost *cost_vec)
2759 {
2760 hash_set<slp_tree> visited;
2761 vect_bb_slp_scalar_cost (bb, node, life, cost_vec, visited);
2762 }
2763
2764 /* Check if vectorization of the basic block is profitable. */
2765
2766 static bool
2767 vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo)
2768 {
2769 vec<slp_instance> slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
2770 slp_instance instance;
2771 int i;
2772 unsigned int vec_inside_cost = 0, vec_outside_cost = 0, scalar_cost = 0;
2773 unsigned int vec_prologue_cost = 0, vec_epilogue_cost = 0;
2774
2775 /* Calculate scalar cost. */
2776 stmt_vector_for_cost scalar_costs;
2777 scalar_costs.create (0);
2778 FOR_EACH_VEC_ELT (slp_instances, i, instance)
2779 {
2780 auto_vec<bool, 20> life;
2781 life.safe_grow_cleared (SLP_INSTANCE_GROUP_SIZE (instance));
2782 vect_bb_slp_scalar_cost (BB_VINFO_BB (bb_vinfo),
2783 SLP_INSTANCE_TREE (instance),
2784 &life, &scalar_costs);
2785 }
2786 void *target_cost_data = init_cost (NULL);
2787 add_stmt_costs (target_cost_data, &scalar_costs);
2788 scalar_costs.release ();
2789 unsigned dummy;
2790 finish_cost (target_cost_data, &dummy, &scalar_cost, &dummy);
2791 destroy_cost_data (target_cost_data);
2792
2793 /* Unset visited flag. */
2794 for (gimple_stmt_iterator gsi = bb_vinfo->region_begin;
2795 gsi_stmt (gsi) != gsi_stmt (bb_vinfo->region_end); gsi_next (&gsi))
2796 gimple_set_visited (gsi_stmt (gsi), false);
2797
2798 /* Complete the target-specific cost calculation. */
2799 finish_cost (BB_VINFO_TARGET_COST_DATA (bb_vinfo), &vec_prologue_cost,
2800 &vec_inside_cost, &vec_epilogue_cost);
2801
2802 vec_outside_cost = vec_prologue_cost + vec_epilogue_cost;
2803
2804 if (dump_enabled_p ())
2805 {
2806 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
2807 dump_printf (MSG_NOTE, " Vector inside of basic block cost: %d\n",
2808 vec_inside_cost);
2809 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n", vec_prologue_cost);
2810 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n", vec_epilogue_cost);
2811 dump_printf (MSG_NOTE, " Scalar cost of basic block: %d\n", scalar_cost);
2812 }
2813
2814 /* Vectorization is profitable if its cost is no more than the cost of
2815 the scalar version. Note that we err on the vector side for equal cost
2816 because the cost estimate is otherwise quite pessimistic (constant uses
2817 are free on the scalar side but cost a load on the vector side, for
2818 example). */
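/* E.g. (hypothetical costs): vec_inside_cost == 8 plus
   vec_outside_cost == 2 against scalar_cost == 10 vectorizes, while
   8 + 3 against 10 does not.  */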
2819 if (vec_outside_cost + vec_inside_cost > scalar_cost)
2820 return false;
2821
2822 return true;
2823 }
2824
2825 /* Check if the basic block can be vectorized. Returns a bb_vec_info
2826 if so and sets fatal to true if failure is independent of
2827 current_vector_size. */
2828
2829 static bb_vec_info
2830 vect_slp_analyze_bb_1 (gimple_stmt_iterator region_begin,
2831 gimple_stmt_iterator region_end,
2832 vec<data_reference_p> datarefs, int n_stmts,
2833 bool &fatal, vec_info_shared *shared)
2834 {
2835 DUMP_VECT_SCOPE ("vect_slp_analyze_bb");
2836
2837 bb_vec_info bb_vinfo;
2838 slp_instance instance;
2839 int i;
2840 poly_uint64 min_vf = 2;
2841
2842 /* The first group of checks is independent of the vector size. */
2843 fatal = true;
2844
2845 if (n_stmts > PARAM_VALUE (PARAM_SLP_MAX_INSNS_IN_BB))
2846 {
2847 if (dump_enabled_p ())
2848 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2849 "not vectorized: too many instructions in "
2850 "basic block.\n");
2851 free_data_refs (datarefs);
2852 return NULL;
2853 }
2854
2855 bb_vinfo = new _bb_vec_info (region_begin, region_end, shared);
2856 if (!bb_vinfo)
2857 return NULL;
2858
2859 BB_VINFO_DATAREFS (bb_vinfo) = datarefs;
2860 bb_vinfo->shared->save_datarefs ();
2861
2862 /* Analyze the data references. */
2863
2864 if (!vect_analyze_data_refs (bb_vinfo, &min_vf))
2865 {
2866 if (dump_enabled_p ())
2867 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2868 "not vectorized: unhandled data-ref in basic "
2869 "block.\n");
2870
2871 delete bb_vinfo;
2872 return NULL;
2873 }
2874
2875 if (BB_VINFO_DATAREFS (bb_vinfo).length () < 2)
2876 {
2877 if (dump_enabled_p ())
2878 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2879 "not vectorized: not enough data-refs in "
2880 "basic block.\n");
2881
2882 delete bb_vinfo;
2883 return NULL;
2884 }
2885
2886 if (!vect_analyze_data_ref_accesses (bb_vinfo))
2887 {
2888 if (dump_enabled_p ())
2889 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2890 "not vectorized: unhandled data access in "
2891 "basic block.\n");
2892
2893 delete bb_vinfo;
2894 return NULL;
2895 }
2896
2897 /* If there are no grouped stores in the region there is no need
2898 to continue with pattern recog as vect_analyze_slp will fail
2899 anyway. */
2900 if (bb_vinfo->grouped_stores.is_empty ())
2901 {
2902 if (dump_enabled_p ())
2903 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2904 "not vectorized: no grouped stores in "
2905 "basic block.\n");
2906
2907 delete bb_vinfo;
2908 return NULL;
2909 }
2910
2911 /* The rest of the analysis below depends on the vector size in some way,
so failures from here on are no longer fatal. */
2912 fatal = false;
2913
2914 vect_pattern_recog (bb_vinfo);
2915
2916 /* Check the SLP opportunities in the basic block, analyze and build SLP
2917 trees. */
2918 if (!vect_analyze_slp (bb_vinfo, n_stmts))
2919 {
2920 if (dump_enabled_p ())
2921 {
2922 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2923 "Failed to SLP the basic block.\n");
2924 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2925 "not vectorized: failed to find SLP opportunities "
2926 "in basic block.\n");
2927 }
2928
2929 delete bb_vinfo;
2930 return NULL;
2931 }
2932
2933 vect_record_base_alignments (bb_vinfo);
2934
2935 /* Analyze and verify the alignment of data references and the
2936 dependence in the SLP instances. */
2937 for (i = 0; BB_VINFO_SLP_INSTANCES (bb_vinfo).iterate (i, &instance); )
2938 {
2939 if (! vect_slp_analyze_and_verify_instance_alignment (instance)
2940 || ! vect_slp_analyze_instance_dependence (instance))
2941 {
2942 slp_tree node = SLP_INSTANCE_TREE (instance);
2943 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2944 if (dump_enabled_p ())
2945 dump_printf_loc (MSG_NOTE, vect_location,
2946 "removing SLP instance operations starting from: %G",
2947 stmt_info->stmt);
2948 vect_free_slp_instance (instance, false);
2949 BB_VINFO_SLP_INSTANCES (bb_vinfo).ordered_remove (i);
2950 continue;
2951 }
2952
2953 /* Mark all the statements that we want to vectorize as pure SLP and
2954 relevant. */
2955 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
2956 vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance));
2957
2958 i++;
2959 }
2960 if (! BB_VINFO_SLP_INSTANCES (bb_vinfo).length ())
2961 {
2962 delete bb_vinfo;
2963 return NULL;
2964 }
2965
2966 if (!vect_slp_analyze_operations (bb_vinfo))
2967 {
2968 if (dump_enabled_p ())
2969 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2970 "not vectorized: bad operation in basic block.\n");
2971
2972 delete bb_vinfo;
2973 return NULL;
2974 }
2975
2976 /* Cost model: check if the vectorization is worthwhile. */
2977 if (!unlimited_cost_model (NULL)
2978 && !vect_bb_vectorization_profitable_p (bb_vinfo))
2979 {
2980 if (dump_enabled_p ())
2981 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2982 "not vectorized: vectorization is not "
2983 "profitable.\n");
2984
2985 delete bb_vinfo;
2986 return NULL;
2987 }
2988
2989 if (dump_enabled_p ())
2990 dump_printf_loc (MSG_NOTE, vect_location,
2991 "Basic block will be vectorized using SLP\n");
2992
2993 return bb_vinfo;
2994 }
2995
2996
2997 /* Main entry for the BB vectorizer. Analyze and transform BB, returns
2998 true if anything in the basic-block was vectorized. */
2999
3000 bool
3001 vect_slp_bb (basic_block bb)
3002 {
3003 bb_vec_info bb_vinfo;
3004 gimple_stmt_iterator gsi;
3005 bool any_vectorized = false;
3006 auto_vector_sizes vector_sizes;
3007
3008 /* Autodetect the first vector size we try. */
3009 current_vector_size = 0;
3010 targetm.vectorize.autovectorize_vector_sizes (&vector_sizes, false);
3011 unsigned int next_size = 0;
3012
3013 gsi = gsi_start_bb (bb);
3014
3015 poly_uint64 autodetected_vector_size = 0;
3016 while (1)
3017 {
3018 if (gsi_end_p (gsi))
3019 break;
3020
3021 gimple_stmt_iterator region_begin = gsi;
3022 vec<data_reference_p> datarefs = vNULL;
3023 int insns = 0;
3024
3025 for (; !gsi_end_p (gsi); gsi_next (&gsi))
3026 {
3027 gimple *stmt = gsi_stmt (gsi);
3028 if (is_gimple_debug (stmt))
3029 continue;
3030 insns++;
3031
3032 if (gimple_location (stmt) != UNKNOWN_LOCATION)
3033 vect_location = stmt;
3034
3035 if (!vect_find_stmt_data_reference (NULL, stmt, &datarefs))
3036 break;
3037 }
3038
3039 /* Skip leading unhandled stmts. */
3040 if (gsi_stmt (region_begin) == gsi_stmt (gsi))
3041 {
3042 gsi_next (&gsi);
3043 continue;
3044 }
3045
3046 gimple_stmt_iterator region_end = gsi;
3047
3048 bool vectorized = false;
3049 bool fatal = false;
3050 vec_info_shared shared;
3051 bb_vinfo = vect_slp_analyze_bb_1 (region_begin, region_end,
3052 datarefs, insns, fatal, &shared);
3053 if (bb_vinfo
3054 && dbg_cnt (vect_slp))
3055 {
3056 if (dump_enabled_p ())
3057 dump_printf_loc (MSG_NOTE, vect_location, "SLPing BB part\n");
3058
3059 bb_vinfo->shared->check_datarefs ();
3060 vect_schedule_slp (bb_vinfo);
3061
3062 unsigned HOST_WIDE_INT bytes;
3063 if (dump_enabled_p ())
3064 {
3065 if (current_vector_size.is_constant (&bytes))
3066 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
3067 "basic block part vectorized using %wu byte "
3068 "vectors\n", bytes);
3069 else
3070 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
3071 "basic block part vectorized using variable "
3072 "length vectors\n");
3073 }
3074
3075 vectorized = true;
3076 }
3077 delete bb_vinfo;
3078
3079 any_vectorized |= vectorized;
3080
3081 if (next_size == 0)
3082 autodetected_vector_size = current_vector_size;
3083
3084 if (next_size < vector_sizes.length ()
3085 && known_eq (vector_sizes[next_size], autodetected_vector_size))
3086 next_size += 1;
3087
3088 if (vectorized
3089 || next_size == vector_sizes.length ()
3090 || known_eq (current_vector_size, 0U)
3091 /* If vect_slp_analyze_bb_1 signaled that analysis for all
3092 vector sizes will fail, do not bother iterating. */
3093 || fatal)
3094 {
3095 if (gsi_end_p (region_end))
3096 break;
3097
3098 /* Skip the unhandled stmt. */
3099 gsi_next (&gsi);
3100
3101 /* And reset vector sizes. */
3102 current_vector_size = 0;
3103 next_size = 0;
3104 }
3105 else
3106 {
3107 /* Try the next biggest vector size. */
3108 current_vector_size = vector_sizes[next_size++];
3109 if (dump_enabled_p ())
3110 {
3111 dump_printf_loc (MSG_NOTE, vect_location,
3112 "***** Re-trying analysis with "
3113 "vector size ");
3114 dump_dec (MSG_NOTE, current_vector_size);
3115 dump_printf (MSG_NOTE, "\n");
3116 }
3117
3118 /* Start over. */
3119 gsi = region_begin;
3120 }
3121 }
3122
3123 return any_vectorized;
3124 }
3125
3126
3127 /* Return true if a constant operand of STMT_VINFO must use a boolean vector type. */
3128
3129 static bool
3130 vect_mask_constant_operand_p (stmt_vec_info stmt_vinfo)
3131 {
3132 enum tree_code code = gimple_expr_code (stmt_vinfo->stmt);
3133 tree op, vectype;
3134 enum vect_def_type dt;
3135
3136 /* For comparisons and COND_EXPRs the type is chosen depending on
3137 the other, non-constant comparison operand. */
3138 if (TREE_CODE_CLASS (code) == tcc_comparison)
3139 {
3140 gassign *stmt = as_a <gassign *> (stmt_vinfo->stmt);
3141 op = gimple_assign_rhs1 (stmt);
3142
3143 if (!vect_is_simple_use (op, stmt_vinfo->vinfo, &dt, &vectype))
3144 gcc_unreachable ();
3145
3146 return !vectype || VECTOR_BOOLEAN_TYPE_P (vectype);
3147 }
3148
3149 if (code == COND_EXPR)
3150 {
3151 gassign *stmt = as_a <gassign *> (stmt_vinfo->stmt);
3152 tree cond = gimple_assign_rhs1 (stmt);
3153
3154 if (TREE_CODE (cond) == SSA_NAME)
3155 op = cond;
3156 else
3157 op = TREE_OPERAND (cond, 0);
3158
3159 if (!vect_is_simple_use (op, stmt_vinfo->vinfo, &dt, &vectype))
3160 gcc_unreachable ();
3161
3162 return !vectype || VECTOR_BOOLEAN_TYPE_P (vectype);
3163 }
3164
3165 return VECTOR_BOOLEAN_TYPE_P (STMT_VINFO_VECTYPE (stmt_vinfo));
3166 }
3167
3168 /* Build a variable-length vector in which the elements in ELTS are repeated
3169 to fill NRESULTS vectors of type VECTOR_TYPE. Store the vectors in
3170 RESULTS and add any new instructions to SEQ.
3171
3172 The approach we use is:
3173
3174 (1) Find a vector mode VM with integer elements of mode IM.
3175
3176 (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
3177 ELTS' has mode IM. This involves creating NELTS' VIEW_CONVERT_EXPRs
3178 from small vectors to IM.
3179
3180 (3) Duplicate each ELTS'[I] into a vector of mode VM.
3181
3182 (4) Use a tree of interleaving VEC_PERM_EXPRs to create VMs with the
3183 correct byte contents.
3184
3185 (5) Use VIEW_CONVERT_EXPR to cast the final VMs to the required type.
3186
3187 We try to find the largest IM for which this sequence works, in order
3188 to cut down on the number of interleaves. */
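/* A sketch of the shapes involved (hypothetical modes): to build
   variable-length VNx4SI results from ELTS == {a, b, c, d} we can pick
   IM == DI and VM == VNx2DI: {a, b} and {c, d} are VIEW_CONVERTed to
   two DI values, each DI is duplicated into a VNx2DI, and one
   interleaving VEC_PERM_EXPR of the two produces the byte pattern of
   {a, b, c, d, a, b, c, d, ...}, which is finally VIEW_CONVERTed to
   the required type.  */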
3189
3190 void
3191 duplicate_and_interleave (gimple_seq *seq, tree vector_type, vec<tree> elts,
3192 unsigned int nresults, vec<tree> &results)
3193 {
3194 unsigned int nelts = elts.length ();
3195 tree element_type = TREE_TYPE (vector_type);
3196
3197 /* (1) Find a vector mode VM with integer elements of mode IM. */
3198 unsigned int nvectors = 1;
3199 tree new_vector_type;
3200 tree permutes[2];
3201 if (!can_duplicate_and_interleave_p (nelts, TYPE_MODE (element_type),
3202 &nvectors, &new_vector_type,
3203 permutes))
3204 gcc_unreachable ();
3205
3206 /* Get a vector type that holds ELTS[0:NELTS/NELTS']. */
3207 unsigned int partial_nelts = nelts / nvectors;
3208 tree partial_vector_type = build_vector_type (element_type, partial_nelts);
3209
3210 tree_vector_builder partial_elts;
3211 auto_vec<tree, 32> pieces (nvectors * 2);
3212 pieces.quick_grow (nvectors * 2);
3213 for (unsigned int i = 0; i < nvectors; ++i)
3214 {
3215 /* (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
3216 ELTS' has mode IM. */
3217 partial_elts.new_vector (partial_vector_type, partial_nelts, 1);
3218 for (unsigned int j = 0; j < partial_nelts; ++j)
3219 partial_elts.quick_push (elts[i * partial_nelts + j]);
3220 tree t = gimple_build_vector (seq, &partial_elts);
3221 t = gimple_build (seq, VIEW_CONVERT_EXPR,
3222 TREE_TYPE (new_vector_type), t);
3223
3224 /* (3) Duplicate each ELTS'[I] into a vector of mode VM. */
3225 pieces[i] = gimple_build_vector_from_val (seq, new_vector_type, t);
3226 }
3227
3228 /* (4) Use a tree of VEC_PERM_EXPRs to create a single VM with the
3229 correct byte contents.
3230
3231 We need to repeat the following operation log2(nvectors) times:
3232
3233 out[i * 2] = VEC_PERM_EXPR (in[i], in[i + hi_start], lo_permute);
3234 out[i * 2 + 1] = VEC_PERM_EXPR (in[i], in[i + hi_start], hi_permute);
3235
3236 However, if each input repeats every N elements and the VF is
3237 a multiple of N * 2, the HI result is the same as the LO. */
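/* E.g. with nvectors == 4 there are two rounds: the first interleaves
   pieces[0..1] with pieces[2..3] into four new vectors, the second
   interleaves those again, after which pieces[in_start..] hold the
   final VM values.  */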
3238 unsigned int in_start = 0;
3239 unsigned int out_start = nvectors;
3240 unsigned int hi_start = nvectors / 2;
3241 /* A bound on the number of outputs needed to produce NRESULTS results
3242 in the final iteration. */
3243 unsigned int noutputs_bound = nvectors * nresults;
3244 for (unsigned int in_repeat = 1; in_repeat < nvectors; in_repeat *= 2)
3245 {
3246 noutputs_bound /= 2;
3247 unsigned int limit = MIN (noutputs_bound, nvectors);
3248 for (unsigned int i = 0; i < limit; ++i)
3249 {
3250 if ((i & 1) != 0
3251 && multiple_p (TYPE_VECTOR_SUBPARTS (new_vector_type),
3252 2 * in_repeat))
3253 {
3254 pieces[out_start + i] = pieces[out_start + i - 1];
3255 continue;
3256 }
3257
3258 tree output = make_ssa_name (new_vector_type);
3259 tree input1 = pieces[in_start + (i / 2)];
3260 tree input2 = pieces[in_start + (i / 2) + hi_start];
3261 gassign *stmt = gimple_build_assign (output, VEC_PERM_EXPR,
3262 input1, input2,
3263 permutes[i & 1]);
3264 gimple_seq_add_stmt (seq, stmt);
3265 pieces[out_start + i] = output;
3266 }
3267 std::swap (in_start, out_start);
3268 }
3269
3270 /* (5) Use VIEW_CONVERT_EXPR to cast the final VM to the required type. */
3271 results.reserve (nresults);
3272 for (unsigned int i = 0; i < nresults; ++i)
3273 if (i < nvectors)
3274 results.quick_push (gimple_build (seq, VIEW_CONVERT_EXPR, vector_type,
3275 pieces[in_start + i]));
3276 else
3277 results.quick_push (results[i - nvectors]);
3278 }
3279
3280
3281 /* For constant and loop invariant defs of SLP_NODE this function returns
3282 (vector) defs (VEC_OPRNDS) that will be used in the vectorized stmts.
3283 OP_NUM determines if we gather defs for operand 0 or operand 1 of the RHS
3284 of the scalar stmts. NUMBER_OF_VECTORS is the number of vector defs
3285 to create. */
3287
3288 static void
3289 vect_get_constant_vectors (tree op, slp_tree slp_node,
3290 vec<tree> *vec_oprnds,
3291 unsigned int op_num, unsigned int number_of_vectors)
3292 {
3293 vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
3294 stmt_vec_info stmt_vinfo = stmts[0];
3295 gimple *stmt = stmt_vinfo->stmt;
3296 unsigned HOST_WIDE_INT nunits;
3297 tree vec_cst;
3298 unsigned j, number_of_places_left_in_vector;
3299 tree vector_type;
3300 tree vop;
3301 int group_size = stmts.length ();
3302 unsigned int vec_num, i;
3303 unsigned number_of_copies = 1;
3304 vec<tree> voprnds;
3305 voprnds.create (number_of_vectors);
3306 bool constant_p, is_store;
3307 tree neutral_op = NULL;
3308 enum tree_code code = gimple_expr_code (stmt);
3309 gimple_seq ctor_seq = NULL;
3310 auto_vec<tree, 16> permute_results;
3311
3312 /* Check if vector type is a boolean vector. */
3313 if (VECT_SCALAR_BOOLEAN_TYPE_P (TREE_TYPE (op))
3314 && vect_mask_constant_operand_p (stmt_vinfo))
3315 vector_type
3316 = build_same_sized_truth_vector_type (STMT_VINFO_VECTYPE (stmt_vinfo));
3317 else
3318 vector_type = get_vectype_for_scalar_type (TREE_TYPE (op));
3319
3320 if (STMT_VINFO_DATA_REF (stmt_vinfo))
3321 {
3322 is_store = true;
3323 op = gimple_assign_rhs1 (stmt);
3324 }
3325 else
3326 is_store = false;
3327
3328 gcc_assert (op);
3329
3330 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
3331 created vectors. It is greater than 1 if unrolling is performed.
3332
3333 For example, we have two scalar operands, s1 and s2 (e.g., group of
3334 strided accesses of size two), while NUNITS is four (i.e., four scalars
3335 of this type can be packed in a vector). The output vector will contain
3336 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
3337 will be 2).
3338
3339 If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
3340 containing the operands.
3341
3342 For example, NUNITS is four as before, and the group size is 8
3343 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
3344 {s5, s6, s7, s8}. */
3345
3346 /* When using duplicate_and_interleave, we just need one element for
3347 each scalar statement. */
3348 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
3349 nunits = group_size;
3350
3351 number_of_copies = nunits * number_of_vectors / group_size;
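/* E.g. (hypothetical numbers): for the {s1, s2} group above with
   NUNITS == 4 and two vectors requested, 4 * 2 / 2 == 4 copies,
   yielding {s1, s2, s1, s2} twice.  */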
3352
3353 number_of_places_left_in_vector = nunits;
3354 constant_p = true;
3355 tree_vector_builder elts (vector_type, nunits, 1);
3356 elts.quick_grow (nunits);
3357 bool place_after_defs = false;
3358 for (j = 0; j < number_of_copies; j++)
3359 {
3360 for (i = group_size - 1; stmts.iterate (i, &stmt_vinfo); i--)
3361 {
3362 stmt = stmt_vinfo->stmt;
3363 if (is_store)
3364 op = gimple_assign_rhs1 (stmt);
3365 else
3366 {
3367 switch (code)
3368 {
3369 case COND_EXPR:
3370 {
3371 tree cond = gimple_assign_rhs1 (stmt);
3372 if (TREE_CODE (cond) == SSA_NAME)
3373 op = gimple_op (stmt, op_num + 1);
3374 else if (op_num == 0 || op_num == 1)
3375 op = TREE_OPERAND (cond, op_num);
3376 else
3377 {
3378 if (op_num == 2)
3379 op = gimple_assign_rhs2 (stmt);
3380 else
3381 op = gimple_assign_rhs3 (stmt);
3382 }
3383 }
3384 break;
3385
3386 case CALL_EXPR:
3387 op = gimple_call_arg (stmt, op_num);
3388 break;
3389
3390 case LSHIFT_EXPR:
3391 case RSHIFT_EXPR:
3392 case LROTATE_EXPR:
3393 case RROTATE_EXPR:
3394 op = gimple_op (stmt, op_num + 1);
3395 /* Unlike the other binary operators, shifts/rotates have
3396 the shift count being int, instead of the same type as
3397 the lhs, so make sure the scalar is the right type if
3398 we are dealing with vectors of
3399 long long/long/short/char. */
3400 if (op_num == 1 && TREE_CODE (op) == INTEGER_CST)
3401 op = fold_convert (TREE_TYPE (vector_type), op);
3402 break;
3403
3404 default:
3405 op = gimple_op (stmt, op_num + 1);
3406 break;
3407 }
3408 }
3409
3410 /* Create 'vect_ = {op0,op1,...,opn}'. */
3411 number_of_places_left_in_vector--;
3412 tree orig_op = op;
3413 if (!types_compatible_p (TREE_TYPE (vector_type), TREE_TYPE (op)))
3414 {
3415 if (CONSTANT_CLASS_P (op))
3416 {
3417 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
3418 {
3419 /* Can't use VIEW_CONVERT_EXPR for booleans because
3420 of possibly different sizes of scalar value and
3421 vector element. */
3422 if (integer_zerop (op))
3423 op = build_int_cst (TREE_TYPE (vector_type), 0);
3424 else if (integer_onep (op))
3425 op = build_all_ones_cst (TREE_TYPE (vector_type));
3426 else
3427 gcc_unreachable ();
3428 }
3429 else
3430 op = fold_unary (VIEW_CONVERT_EXPR,
3431 TREE_TYPE (vector_type), op);
3432 gcc_assert (op && CONSTANT_CLASS_P (op));
3433 }
3434 else
3435 {
3436 tree new_temp = make_ssa_name (TREE_TYPE (vector_type));
3437 gimple *init_stmt;
3438 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
3439 {
3440 tree true_val
3441 = build_all_ones_cst (TREE_TYPE (vector_type));
3442 tree false_val
3443 = build_zero_cst (TREE_TYPE (vector_type));
3444 gcc_assert (INTEGRAL_TYPE_P (TREE_TYPE (op)));
3445 init_stmt = gimple_build_assign (new_temp, COND_EXPR,
3446 op, true_val,
3447 false_val);
3448 }
3449 else
3450 {
3451 op = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (vector_type),
3452 op);
3453 init_stmt
3454 = gimple_build_assign (new_temp, VIEW_CONVERT_EXPR,
3455 op);
3456 }
3457 gimple_seq_add_stmt (&ctor_seq, init_stmt);
3458 op = new_temp;
3459 }
3460 }
3461 elts[number_of_places_left_in_vector] = op;
3462 if (!CONSTANT_CLASS_P (op))
3463 constant_p = false;
3464 if (TREE_CODE (orig_op) == SSA_NAME
3465 && !SSA_NAME_IS_DEFAULT_DEF (orig_op)
3466 && STMT_VINFO_BB_VINFO (stmt_vinfo)
3467 && (STMT_VINFO_BB_VINFO (stmt_vinfo)->bb
3468 == gimple_bb (SSA_NAME_DEF_STMT (orig_op))))
3469 place_after_defs = true;
3470
3471 if (number_of_places_left_in_vector == 0)
3472 {
3473 if (constant_p
3474 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
3475 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
3476 vec_cst = gimple_build_vector (&ctor_seq, &elts);
3477 else
3478 {
3479 if (vec_oprnds->is_empty ())
3480 duplicate_and_interleave (&ctor_seq, vector_type, elts,
3481 number_of_vectors,
3482 permute_results);
3483 vec_cst = permute_results[number_of_vectors - j - 1];
3484 }
3485 tree init;
3486 gimple_stmt_iterator gsi;
3487 if (place_after_defs)
3488 {
3489 stmt_vec_info last_stmt_info
3490 = vect_find_last_scalar_stmt_in_slp (slp_node);
3491 gsi = gsi_for_stmt (last_stmt_info->stmt);
3492 init = vect_init_vector (stmt_vinfo, vec_cst, vector_type,
3493 &gsi);
3494 }
3495 else
3496 init = vect_init_vector (stmt_vinfo, vec_cst, vector_type,
3497 NULL);
3498 if (ctor_seq != NULL)
3499 {
3500 gsi = gsi_for_stmt (SSA_NAME_DEF_STMT (init));
3501 gsi_insert_seq_before (&gsi, ctor_seq, GSI_SAME_STMT);
3502 ctor_seq = NULL;
3503 }
3504 voprnds.quick_push (init);
3505 place_after_defs = false;
3506 number_of_places_left_in_vector = nunits;
3507 constant_p = true;
3508 elts.new_vector (vector_type, nunits, 1);
3509 elts.quick_grow (nunits);
3510 }
3511 }
3512 }
3513
3514 /* Since the vectors were created in reverse order, we need to
3515 invert them here. */
3516 vec_num = voprnds.length ();
3517 for (j = vec_num; j != 0; j--)
3518 {
3519 vop = voprnds[j - 1];
3520 vec_oprnds->quick_push (vop);
3521 }
3522
3523 voprnds.release ();
3524
3525 /* If the VF is greater than the unrolling factor needed for the SLP
3526 group of stmts, the NUMBER_OF_VECTORS to be created is greater than
3527 NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we
3528 have to replicate the vectors. */
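/* As a purely illustrative example: for a group of two invariant scalars
   {a, b} with NUNITS == 4, a single vector {a, b, a, b} is built above;
   if NUMBER_OF_VECTORS is 4, the loop below simply pushes already-created
   vectors (or the neutral-op vector) until enough operands exist. */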
3529 tree neutral_vec = NULL_TREE;
3530 while (number_of_vectors > vec_oprnds->length ())
3531 {
3532 if (neutral_op)
3533 {
3534 /* Build the neutral vector once and reuse it for each copy. */
3535 if (!neutral_vec)
3536 neutral_vec = build_vector_from_val (vector_type, neutral_op);
3537
3538 vec_oprnds->quick_push (neutral_vec);
3539 }
3540 else
3541 {
3542 for (i = 0; vec_oprnds->iterate (i, &vop) && i < vec_num; i++)
3543 vec_oprnds->quick_push (vop);
3544 }
3545 }
3546 }
3547
3548
3549 /* Get the vectorized definitions from SLP_NODE, which already contains
3550 the corresponding vectorized def-stmts. */
3551
3552 static void
3553 vect_get_slp_vect_defs (slp_tree slp_node, vec<tree> *vec_oprnds)
3554 {
3555 tree vec_oprnd;
3556 stmt_vec_info vec_def_stmt_info;
3557 unsigned int i;
3558
3559 gcc_assert (SLP_TREE_VEC_STMTS (slp_node).exists ());
3560
3561 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (slp_node), i, vec_def_stmt_info)
3562 {
3563 gcc_assert (vec_def_stmt_info);
3564 if (gphi *vec_def_phi = dyn_cast <gphi *> (vec_def_stmt_info->stmt))
3565 vec_oprnd = gimple_phi_result (vec_def_phi);
3566 else
3567 vec_oprnd = gimple_get_lhs (vec_def_stmt_info->stmt);
3568 vec_oprnds->quick_push (vec_oprnd);
3569 }
3570 }
3571
3572
3573 /* Get vectorized definitions for SLP_NODE.
3574 If the scalar definitions are loop invariants or constants, collect them
3575 and call vect_get_constant_vectors () to create vector stmts.
3576 Otherwise, the def-stmts must already be vectorized and their vectorized
3577 stmts stored in the corresponding child of SLP_NODE; in that case we call
3578 vect_get_slp_vect_defs () to retrieve them. */
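/* As an illustration (hypothetical stmts): for a node containing
   { a0 = x0 + 3, a1 = x1 + 3 }, the operand {x0, x1} is matched against
   the LHS of a child node's stmts and its vector defs are collected
   directly, while the invariant operand {3, 3} has no child and a
   constant vector such as {3, 3, 3, 3} is built for it instead. */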
3579
3580 void
3581 vect_get_slp_defs (vec<tree> ops, slp_tree slp_node,
3582 vec<vec<tree> > *vec_oprnds)
3583 {
3584 int number_of_vects = 0, i;
3585 unsigned int child_index = 0;
3586 HOST_WIDE_INT lhs_size_unit, rhs_size_unit;
3587 slp_tree child = NULL;
3588 vec<tree> vec_defs;
3589 tree oprnd;
3590 bool vectorized_defs;
3591
3592 stmt_vec_info first_stmt_info = SLP_TREE_SCALAR_STMTS (slp_node)[0];
3593 FOR_EACH_VEC_ELT (ops, i, oprnd)
3594 {
3595 /* For each operand we check if it has vectorized definitions in a child
3596 node or whether we need to create them (for invariants and constants).
3597 We check if the LHS of the first stmt of the next child matches OPRND.
3598 If it does, we found the correct child. Otherwise, we call
3599 vect_get_constant_vectors () and do not advance CHILD_INDEX, so that
3600 the same child node is checked again for the next operand. */
3601 vectorized_defs = false;
3602 if (SLP_TREE_CHILDREN (slp_node).length () > child_index)
3603 {
3604 child = SLP_TREE_CHILDREN (slp_node)[child_index];
3605
3606 /* We have to check both pattern and original def, if available. */
3607 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
3608 {
3609 stmt_vec_info first_def_info = SLP_TREE_SCALAR_STMTS (child)[0];
3610 stmt_vec_info related = STMT_VINFO_RELATED_STMT (first_def_info);
3611 tree first_def_op;
3612
3613 if (gphi *first_def = dyn_cast <gphi *> (first_def_info->stmt))
3614 first_def_op = gimple_phi_result (first_def);
3615 else
3616 first_def_op = gimple_get_lhs (first_def_info->stmt);
3617 if (operand_equal_p (oprnd, first_def_op, 0)
3618 || (related
3619 && operand_equal_p (oprnd,
3620 gimple_get_lhs (related->stmt), 0)))
3621 {
3622 /* The number of vector defs is determined by the number of
3623 vector statements in the node from which we get those
3624 statements. */
3625 number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (child);
3626 vectorized_defs = true;
3627 child_index++;
3628 }
3629 }
3630 else
3631 child_index++;
3632 }
3633
3634 if (!vectorized_defs)
3635 {
3636 if (i == 0)
3637 {
3638 number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
3639 /* The number of vector stmts was calculated according to the LHS in
3640 vect_schedule_slp_instance (); fix it by replacing the LHS with the
3641 RHS, if necessary. See vect_get_smallest_scalar_type () for
3642 details. */
3643 vect_get_smallest_scalar_type (first_stmt_info, &lhs_size_unit,
3644 &rhs_size_unit);
3645 if (rhs_size_unit != lhs_size_unit)
3646 {
3647 number_of_vects *= rhs_size_unit;
3648 number_of_vects /= lhs_size_unit;
3649 }
3650 }
3651 }
3652
3653 /* Allocate memory for vectorized defs. */
3654 vec_defs = vNULL;
3655 vec_defs.create (number_of_vects);
3656
3657 /* For reduction defs we call vect_get_constant_vectors (), since we are
3658 looking for initial loop invariant values. */
3659 if (vectorized_defs)
3660 /* The defs are already vectorized. */
3661 vect_get_slp_vect_defs (child, &vec_defs);
3662 else
3663 /* Build vectors from scalar defs. */
3664 vect_get_constant_vectors (oprnd, slp_node, &vec_defs, i,
3665 number_of_vects);
3666
3667 vec_oprnds->quick_push (vec_defs);
3668 }
3669 }
3670
3671 /* Generate vector permute statements from a list of loads in DR_CHAIN.
3672 If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
3673 permute statements for the SLP node NODE of the SLP instance
3674 SLP_NODE_INSTANCE. The number of permute stmts required is returned
3675 in *N_PERMS. */
3675
3676 bool
3677 vect_transform_slp_perm_load (slp_tree node, vec<tree> dr_chain,
3678 gimple_stmt_iterator *gsi, poly_uint64 vf,
3679 slp_instance slp_node_instance, bool analyze_only,
3680 unsigned *n_perms)
3681 {
3682 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
3683 vec_info *vinfo = stmt_info->vinfo;
3684 int vec_index = 0;
3685 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3686 unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (slp_node_instance);
3687 unsigned int mask_element;
3688 machine_mode mode;
3689
3690 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
3691 return false;
3692
3693 stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
3694
3695 mode = TYPE_MODE (vectype);
3696 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
3697
3698 /* Initialize the vect stmts of NODE so that the generated stmts can be
3699 inserted in the right places later. */
3700 if (! analyze_only)
3701 for (unsigned i = SLP_TREE_VEC_STMTS (node).length ();
3702 i < SLP_TREE_NUMBER_OF_VEC_STMTS (node); i++)
3703 SLP_TREE_VEC_STMTS (node).quick_push (NULL);
3704
3705 /* Generate permutation masks for every NODE. The number of masks for
3706 each NODE is equal to GROUP_SIZE.
3707 E.g., we have a group of three nodes with three loads from the same
3708 location in each node, and the vector size is 4. I.e., we have an
3709 a0b0c0a1b1c1... sequence and we need to create the following vectors:
3710 for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
3711 for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
3712 ...
3713
3714 The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}.
3715 The last mask is illegal since we assume two operands for a permute
3716 operation, and mask element values can't index outside those two
3717 input vectors. Hence, the last mask must be converted into {2,5,5,5}.
3718 For the first two permutations we need the first and the second input
3719 vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
3720 we need the second and the third vectors: {b1,c1,a2,b2} and
3721 {c2,a3,b3,c3}. */
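/* To make the index arithmetic below concrete, using the example above
   (DR_GROUP_SIZE == 3, const_nunits == 4): the last mask for a's needs
   elements 6, 9, 9 and 9. Element 6 maps to vec_index 6 / 4 == 1 with
   mask_element 6 % 4 == 2; element 9 maps to vec_index 9 / 4 == 2 with
   mask_element 9 % 4 + 4 == 5, offset by const_nunits because vector 2
   becomes the second permute input. This yields the converted mask
   {2,5,5,5} applied to vectors 1 and 2. */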
3722
3723 int vect_stmts_counter = 0;
3724 unsigned int index = 0;
3725 int first_vec_index = -1;
3726 int second_vec_index = -1;
3727 bool noop_p = true;
3728 *n_perms = 0;
3729
3730 vec_perm_builder mask;
3731 unsigned int nelts_to_build;
3732 unsigned int nvectors_per_build;
3733 bool repeating_p = (group_size == DR_GROUP_SIZE (stmt_info)
3734 && multiple_p (nunits, group_size));
3735 if (repeating_p)
3736 {
3737 /* A single vector contains a whole number of copies of the node, so:
3738 (a) all permutes can use the same mask; and
3739 (b) the permutes only need a single vector input. */
3740 mask.new_vector (nunits, group_size, 3);
3741 nelts_to_build = mask.encoded_nelts ();
3742 nvectors_per_build = SLP_TREE_VEC_STMTS (node).length ();
3743 }
3744 else
3745 {
3746 /* We need to construct a separate mask for each vector statement. */
3747 unsigned HOST_WIDE_INT const_nunits, const_vf;
3748 if (!nunits.is_constant (&const_nunits)
3749 || !vf.is_constant (&const_vf))
3750 return false;
3751 mask.new_vector (const_nunits, const_nunits, 1);
3752 nelts_to_build = const_vf * group_size;
3753 nvectors_per_build = 1;
3754 }
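/* Illustrative numbers: with group_size == 2 and nunits == 8 (and the
   SLP group covering the whole load group), a single vector holds four
   copies of the group, so repeating_p is true and one mask -- encoded
   as group_size patterns of 3 elements each -- serves every vector
   stmt. With group_size == 3 and nunits == 4 as in the example above,
   repeating_p is false and a full const_nunits-element mask is built
   for each vector statement. */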
3755
3756 unsigned int count = mask.encoded_nelts ();
3757 mask.quick_grow (count);
3758 vec_perm_indices indices;
3759
3760 for (unsigned int j = 0; j < nelts_to_build; j++)
3761 {
3762 unsigned int iter_num = j / group_size;
3763 unsigned int stmt_num = j % group_size;
3764 unsigned int i = (iter_num * DR_GROUP_SIZE (stmt_info)
3765 + SLP_TREE_LOAD_PERMUTATION (node)[stmt_num]);
3766 if (repeating_p)
3767 {
3768 first_vec_index = 0;
3769 mask_element = i;
3770 }
3771 else
3772 {
3773 /* Enforced before the loop when !repeating_p. */
3774 unsigned int const_nunits = nunits.to_constant ();
3775 vec_index = i / const_nunits;
3776 mask_element = i % const_nunits;
3777 if (vec_index == first_vec_index
3778 || first_vec_index == -1)
3779 {
3780 first_vec_index = vec_index;
3781 }
3782 else if (vec_index == second_vec_index
3783 || second_vec_index == -1)
3784 {
3785 second_vec_index = vec_index;
3786 mask_element += const_nunits;
3787 }
3788 else
3789 {
3790 if (dump_enabled_p ())
3791 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3792 "permutation requires at "
3793 "least three vectors %G",
3794 stmt_info->stmt);
3795 gcc_assert (analyze_only);
3796 return false;
3797 }
3798
3799 gcc_assert (mask_element < 2 * const_nunits);
3800 }
3801
3802 if (mask_element != index)
3803 noop_p = false;
3804 mask[index++] = mask_element;
3805
3806 if (index == count && !noop_p)
3807 {
3808 indices.new_vector (mask, second_vec_index == -1 ? 1 : 2, nunits);
3809 if (!can_vec_perm_const_p (mode, indices))
3810 {
3811 if (dump_enabled_p ())
3812 {
3813 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
3814 vect_location,
3815 "unsupported vect permute { ");
3816 for (i = 0; i < count; ++i)
3817 {
3818 dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
3819 dump_printf (MSG_MISSED_OPTIMIZATION, " ");
3820 }
3821 dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
3822 }
3823 gcc_assert (analyze_only);
3824 return false;
3825 }
3826
3827 ++*n_perms;
3828 }
3829
3830 if (index == count)
3831 {
3832 if (!analyze_only)
3833 {
3834 tree mask_vec = NULL_TREE;
3835
3836 if (! noop_p)
3837 mask_vec = vect_gen_perm_mask_checked (vectype, indices);
3838
3839 if (second_vec_index == -1)
3840 second_vec_index = first_vec_index;
3841
3842 for (unsigned int ri = 0; ri < nvectors_per_build; ++ri)
3843 {
3844 /* Generate the permute statement if necessary. */
3845 tree first_vec = dr_chain[first_vec_index + ri];
3846 tree second_vec = dr_chain[second_vec_index + ri];
3847 stmt_vec_info perm_stmt_info;
3848 if (! noop_p)
3849 {
3850 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
3851 tree perm_dest
3852 = vect_create_destination_var (gimple_assign_lhs (stmt),
3853 vectype);
3854 perm_dest = make_ssa_name (perm_dest);
3855 gassign *perm_stmt
3856 = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
3857 first_vec, second_vec,
3858 mask_vec);
3859 perm_stmt_info
3860 = vect_finish_stmt_generation (stmt_info, perm_stmt,
3861 gsi);
3862 }
3863 else
3864 /* If mask was NULL_TREE generate the requested
3865 identity transform. */
3866 perm_stmt_info = vinfo->lookup_def (first_vec);
3867
3868 /* Store the vector statement in NODE. */
3869 SLP_TREE_VEC_STMTS (node)[vect_stmts_counter++]
3870 = perm_stmt_info;
3871 }
3872 }
3873
3874 index = 0;
3875 first_vec_index = -1;
3876 second_vec_index = -1;
3877 noop_p = true;
3878 }
3879 }
3880
3881 return true;
3882 }
3883
3884 /* Vectorize the SLP instance tree in postorder. */
3885
3886 static void
3887 vect_schedule_slp_instance (slp_tree node, slp_instance instance,
3888 scalar_stmts_to_slp_tree_map_t *bst_map)
3889 {
3890 gimple_stmt_iterator si;
3891 stmt_vec_info stmt_info;
3892 unsigned int group_size;
3893 tree vectype;
3894 int i, j;
3895 slp_tree child;
3896
3897 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
3898 return;
3899
3900 /* See if we have already vectorized the node in the graph of the
3901 SLP instance. */
3902 if (SLP_TREE_VEC_STMTS (node).exists ())
3903 return;
3904
3905 /* See if we have already vectorized the same set of stmts, so that we
3906 can reuse their vectorized stmts across instances. */
3907 if (slp_tree *leader = bst_map->get (SLP_TREE_SCALAR_STMTS (node)))
3908 {
3909 SLP_TREE_VEC_STMTS (node).safe_splice (SLP_TREE_VEC_STMTS (*leader));
3910 return;
3911 }
3912
3913 bst_map->put (SLP_TREE_SCALAR_STMTS (node).copy (), node);
3914 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3915 vect_schedule_slp_instance (child, instance, bst_map);
3916
3917 /* Push SLP node def-type to stmts. */
3918 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3919 if (SLP_TREE_DEF_TYPE (child) != vect_internal_def)
3920 {
3921 stmt_vec_info child_stmt_info;
3922 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (child), j, child_stmt_info)
3923 STMT_VINFO_DEF_TYPE (child_stmt_info) = SLP_TREE_DEF_TYPE (child);
3924 }
3925
3926 stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
3927
3928 /* VECTYPE is the type of the destination. */
3929 vectype = STMT_VINFO_VECTYPE (stmt_info);
3930 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
3931 group_size = SLP_INSTANCE_GROUP_SIZE (instance);
3932
3933 gcc_assert (SLP_TREE_NUMBER_OF_VEC_STMTS (node) != 0);
3934 SLP_TREE_VEC_STMTS (node).create (SLP_TREE_NUMBER_OF_VEC_STMTS (node));
3935
3936 if (dump_enabled_p ())
3937 dump_printf_loc (MSG_NOTE, vect_location,
3938 "------>vectorizing SLP node starting from: %G",
3939 stmt_info->stmt);
3940
3941 /* Vectorized stmts go before the last scalar stmt, which is where
3942 all uses are ready. */
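/* E.g., for an illustrative store group a[0] = x; a[1] = y; the vector
   stmts are emitted just before a[1] = y, the point at which both x
   and y are known to be defined. */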
3943 stmt_vec_info last_stmt_info = vect_find_last_scalar_stmt_in_slp (node);
3944 si = gsi_for_stmt (last_stmt_info->stmt);
3945
3946 /* Mark the first element of the reduction chain as a reduction to
3947 properly transform the node. In the analysis phase only the last
3948 element of the chain is marked as a reduction. */
3949 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
3950 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
3951 && REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info)
3952 {
3953 STMT_VINFO_DEF_TYPE (stmt_info) = vect_reduction_def;
3954 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
3955 }
3956
3957 /* Handle two-operation SLP nodes by vectorizing the group with
3958 both operations and then performing a merge. */
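/* A hypothetical two-element group with nunits == 2:
   { a0 = b0 + c0, a1 = b1 - c1 } is vectorized once as a vector plus
   (giving v0) and once as a vector minus (giving v1); the merge mask
   built below is then { 0*2+0, 1*2+1 } == { 0, 3 }, so the VEC_PERM_EXPR
   selects lane 0 of v0 and lane 1 of v1 -- in effect an addsub. */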
3959 if (SLP_TREE_TWO_OPERATORS (node))
3960 {
3961 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
3962 enum tree_code code0 = gimple_assign_rhs_code (stmt);
3963 enum tree_code ocode = ERROR_MARK;
3964 stmt_vec_info ostmt_info;
3965 vec_perm_builder mask (group_size, group_size, 1);
3966 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, ostmt_info)
3967 {
3968 gassign *ostmt = as_a <gassign *> (ostmt_info->stmt);
3969 if (gimple_assign_rhs_code (ostmt) != code0)
3970 {
3971 mask.quick_push (1);
3972 ocode = gimple_assign_rhs_code (ostmt);
3973 }
3974 else
3975 mask.quick_push (0);
3976 }
3977 if (ocode != ERROR_MARK)
3978 {
3979 vec<stmt_vec_info> v0;
3980 vec<stmt_vec_info> v1;
3981 unsigned j;
3982 tree tmask = NULL_TREE;
3983 vect_transform_stmt (stmt_info, &si, node, instance);
3984 v0 = SLP_TREE_VEC_STMTS (node).copy ();
3985 SLP_TREE_VEC_STMTS (node).truncate (0);
3986 gimple_assign_set_rhs_code (stmt, ocode);
3987 vect_transform_stmt (stmt_info, &si, node, instance);
3988 gimple_assign_set_rhs_code (stmt, code0);
3989 v1 = SLP_TREE_VEC_STMTS (node).copy ();
3990 SLP_TREE_VEC_STMTS (node).truncate (0);
3991 tree meltype = build_nonstandard_integer_type
3992 (GET_MODE_BITSIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype))), 1);
3993 tree mvectype = get_same_sized_vectype (meltype, vectype);
3994 unsigned k = 0, l;
3995 for (j = 0; j < v0.length (); ++j)
3996 {
3997 /* Enforced by vect_build_slp_tree, which rejects variable-length
3998 vectors for SLP_TREE_TWO_OPERATORS. */
3999 unsigned int const_nunits = nunits.to_constant ();
4000 tree_vector_builder melts (mvectype, const_nunits, 1);
4001 for (l = 0; l < const_nunits; ++l)
4002 {
4003 if (k >= group_size)
4004 k = 0;
4005 tree t = build_int_cst (meltype,
4006 mask[k++] * const_nunits + l);
4007 melts.quick_push (t);
4008 }
4009 tmask = melts.build ();
4010
4011 /* ??? Not all targets support a VEC_PERM_EXPR with a
4012 constant mask that would translate to a vec_merge RTX
4013 (with their vec_perm_const_ok). We can either not
4014 vectorize in that case or let veclower do its job.
4015 Unfortunately that isn't too great and at least for
4016 plus/minus we'd eventually like to match targets'
4017 vector addsub instructions. */
4018 gimple *vstmt;
4019 vstmt = gimple_build_assign (make_ssa_name (vectype),
4020 VEC_PERM_EXPR,
4021 gimple_assign_lhs (v0[j]->stmt),
4022 gimple_assign_lhs (v1[j]->stmt),
4023 tmask);
4024 SLP_TREE_VEC_STMTS (node).quick_push
4025 (vect_finish_stmt_generation (stmt_info, vstmt, &si));
4026 }
4027 v0.release ();
4028 v1.release ();
4029 return;
4030 }
4031 }
4032 vect_transform_stmt (stmt_info, &si, node, instance);
4033
4034 /* Restore stmt def-types. */
4035 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4036 if (SLP_TREE_DEF_TYPE (child) != vect_internal_def)
4037 {
4038 stmt_vec_info child_stmt_info;
4039 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (child), j, child_stmt_info)
4040 STMT_VINFO_DEF_TYPE (child_stmt_info) = vect_internal_def;
4041 }
4042 }
4043
4044 /* Replace the scalar calls in SLP node NODE by assignments of zero to
4045 their lhs. For loop vectorization this is done in vectorizable_call,
4046 but for SLP it needs to be deferred until the end of vect_schedule_slp,
4047 because multiple SLP instances may refer to the same scalar stmt. */
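/* For instance (hypothetical GIMPLE), a fully vectorized scalar call
     _5 = sqrtf (_3);
   is rewritten below as
     _5 = 0.0;
   leaving a trivially dead scalar definition for later cleanup. */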
4048
4049 static void
4050 vect_remove_slp_scalar_calls (slp_tree node, hash_set<slp_tree> &visited)
4051 {
4052 gimple *new_stmt;
4053 gimple_stmt_iterator gsi;
4054 int i;
4055 slp_tree child;
4056 tree lhs;
4057 stmt_vec_info stmt_info;
4058
4059 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
4060 return;
4061
4062 if (visited.add (node))
4063 return;
4064
4065 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4066 vect_remove_slp_scalar_calls (child, visited);
4067
4068 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
4069 {
4070 gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt);
4071 if (!stmt || gimple_bb (stmt) == NULL)
4072 continue;
4073 if (is_pattern_stmt_p (stmt_info)
4074 || !PURE_SLP_STMT (stmt_info))
4075 continue;
4076 lhs = gimple_call_lhs (stmt);
4077 new_stmt = gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs)));
4078 gsi = gsi_for_stmt (stmt);
4079 stmt_info->vinfo->replace_stmt (&gsi, stmt_info, new_stmt);
4080 SSA_NAME_DEF_STMT (gimple_assign_lhs (new_stmt)) = new_stmt;
4081 }
4082 }
4083
4084 static void
4085 vect_remove_slp_scalar_calls (slp_tree node)
4086 {
4087 hash_set<slp_tree> visited;
4088 vect_remove_slp_scalar_calls (node, visited);
4089 }
4090
4091 /* Generate vector code for all SLP instances in the loop/basic block. */
4092
4093 void
4094 vect_schedule_slp (vec_info *vinfo)
4095 {
4096 vec<slp_instance> slp_instances;
4097 slp_instance instance;
4098 unsigned int i;
4099
4100 scalar_stmts_to_slp_tree_map_t *bst_map
4101 = new scalar_stmts_to_slp_tree_map_t ();
4102 slp_instances = vinfo->slp_instances;
4103 FOR_EACH_VEC_ELT (slp_instances, i, instance)
4104 {
4105 /* Schedule the tree of INSTANCE. */
4106 vect_schedule_slp_instance (SLP_INSTANCE_TREE (instance),
4107 instance, bst_map);
4108 if (dump_enabled_p ())
4109 dump_printf_loc (MSG_NOTE, vect_location,
4110 "vectorizing stmts using SLP.\n");
4111 }
4112 delete bst_map;
4113
4114 FOR_EACH_VEC_ELT (slp_instances, i, instance)
4115 {
4116 slp_tree root = SLP_INSTANCE_TREE (instance);
4117 stmt_vec_info store_info;
4118 unsigned int j;
4119
4120 /* Remove scalar call stmts. Do not do this for basic-block
4121 vectorization as not all uses may be vectorized.
4122 ??? Why should this be necessary? DCE should be able to
4123 remove the stmts itself.
4124 ??? For BB vectorization we can as well remove scalar
4125 stmts starting from the SLP tree root if they have no
4126 uses. */
4127 if (is_a <loop_vec_info> (vinfo))
4128 vect_remove_slp_scalar_calls (root);
4129
4130 for (j = 0; SLP_TREE_SCALAR_STMTS (root).iterate (j, &store_info)
4131 && j < SLP_INSTANCE_GROUP_SIZE (instance); j++)
4132 {
4133 if (!STMT_VINFO_DATA_REF (store_info))
4134 break;
4135
4136 store_info = vect_orig_stmt (store_info);
4137 /* Free the attached stmt_vec_info and remove the stmt. */
4138 vinfo->remove_stmt (store_info);
4139 }
4140 }
4141 }