/* SLP - Basic Block Vectorization
   Copyright (C) 2007-2019 Free Software Foundation, Inc.
   Contributed by Dorit Naishlos <dorit@il.ibm.com>
   and Ira Rosen <irar@il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */

#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "target.h"
#include "rtl.h"
#include "tree.h"
#include "gimple.h"
#include "tree-pass.h"
#include "ssa.h"
#include "optabs-tree.h"
#include "insn-config.h"
#include "recog.h"		/* FIXME: for insn_data */
#include "params.h"
#include "fold-const.h"
#include "stor-layout.h"
#include "gimple-iterator.h"
#include "cfgloop.h"
#include "tree-vectorizer.h"
#include "langhooks.h"
#include "gimple-walk.h"
#include "dbgcnt.h"
#include "tree-vector-builder.h"
#include "vec-perm-indices.h"
#include "gimple-fold.h"
#include "internal-fn.h"


/* Recursively free the memory allocated for the SLP tree rooted at NODE.
   FINAL_P is true if we have vectorized the instance or if we have
   made a final decision not to vectorize the statements in any way.  */

static void
vect_free_slp_tree (slp_tree node, bool final_p)
{
  int i;
  slp_tree child;

  if (--node->refcnt != 0)
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_free_slp_tree (child, final_p);

  /* Don't update STMT_VINFO_NUM_SLP_USES if it isn't relevant.
     Some statements might no longer exist, after having been
     removed by vect_transform_stmt.  Updating the remaining
     statements would be redundant.  */
  if (!final_p)
    {
      stmt_vec_info stmt_info;
      FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
	{
	  gcc_assert (STMT_VINFO_NUM_SLP_USES (stmt_info) > 0);
	  STMT_VINFO_NUM_SLP_USES (stmt_info)--;
	}
    }

  SLP_TREE_CHILDREN (node).release ();
  SLP_TREE_SCALAR_STMTS (node).release ();
  SLP_TREE_VEC_STMTS (node).release ();
  SLP_TREE_LOAD_PERMUTATION (node).release ();

  free (node);
}

/* Free the memory allocated for the SLP instance.  FINAL_P is true if we
   have vectorized the instance or if we have made a final decision not
   to vectorize the statements in any way.  */

void
vect_free_slp_instance (slp_instance instance, bool final_p)
{
  vect_free_slp_tree (SLP_INSTANCE_TREE (instance), final_p);
  SLP_INSTANCE_LOADS (instance).release ();
  free (instance);
}


/* Create an SLP node for SCALAR_STMTS.  */

static slp_tree
vect_create_new_slp_node (vec<stmt_vec_info> scalar_stmts)
{
  slp_tree node;
  stmt_vec_info stmt_info = scalar_stmts[0];
  unsigned int nops;

  if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
    nops = gimple_call_num_args (stmt);
  else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
    {
      nops = gimple_num_ops (stmt) - 1;
      if (gimple_assign_rhs_code (stmt) == COND_EXPR)
	nops++;
    }
  else if (is_a <gphi *> (stmt_info->stmt))
    nops = 0;
  else
    return NULL;

  node = XNEW (struct _slp_tree);
  SLP_TREE_SCALAR_STMTS (node) = scalar_stmts;
  SLP_TREE_VEC_STMTS (node).create (0);
  SLP_TREE_NUMBER_OF_VEC_STMTS (node) = 0;
  SLP_TREE_CHILDREN (node).create (nops);
  SLP_TREE_LOAD_PERMUTATION (node) = vNULL;
  SLP_TREE_TWO_OPERATORS (node) = false;
  SLP_TREE_DEF_TYPE (node) = vect_internal_def;
  node->refcnt = 1;

  unsigned i;
  FOR_EACH_VEC_ELT (scalar_stmts, i, stmt_info)
    STMT_VINFO_NUM_SLP_USES (stmt_info)++;

  return node;
}

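/* Illustrative note (editorial, not from the original source): for an
   assignment like x = a + b, gimple_num_ops is 3, so NOPS above is 2;
   for x = (a < b) ? c : d the rhs is a COND_EXPR and NOPS is bumped to
   4, because the embedded comparison contributes two operands (a and b)
   that are matched separately when building the SLP tree.  */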

/* This structure is used in creation of an SLP tree.  Each instance
   corresponds to the same operand in a group of scalar stmts in an SLP
   node.  */
typedef struct _slp_oprnd_info
{
  /* Def-stmts for the operands.  */
  vec<stmt_vec_info> def_stmts;
  /* Information about the first statement, its vector def-type, type, the
     operand itself in case it's constant, and an indication if it's a pattern
     stmt.  */
  tree first_op_type;
  enum vect_def_type first_dt;
  bool first_pattern;
  bool second_pattern;
} *slp_oprnd_info;


/* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
   operand.  */
static vec<slp_oprnd_info>
vect_create_oprnd_info (int nops, int group_size)
{
  int i;
  slp_oprnd_info oprnd_info;
  vec<slp_oprnd_info> oprnds_info;

  oprnds_info.create (nops);
  for (i = 0; i < nops; i++)
    {
      oprnd_info = XNEW (struct _slp_oprnd_info);
      oprnd_info->def_stmts.create (group_size);
      oprnd_info->first_dt = vect_uninitialized_def;
      oprnd_info->first_op_type = NULL_TREE;
      oprnd_info->first_pattern = false;
      oprnd_info->second_pattern = false;
      oprnds_info.quick_push (oprnd_info);
    }

  return oprnds_info;
}


/* Free operands info.  */

static void
vect_free_oprnd_info (vec<slp_oprnd_info> &oprnds_info)
{
  int i;
  slp_oprnd_info oprnd_info;

  FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
    {
      oprnd_info->def_stmts.release ();
      XDELETE (oprnd_info);
    }

  oprnds_info.release ();
}


/* Find the place of the data-ref in STMT_INFO in the interleaving chain
   that starts from FIRST_STMT_INFO.  Return -1 if the data-ref is not a part
   of the chain.  */

int
vect_get_place_in_interleaving_chain (stmt_vec_info stmt_info,
				      stmt_vec_info first_stmt_info)
{
  stmt_vec_info next_stmt_info = first_stmt_info;
  int result = 0;

  if (first_stmt_info != DR_GROUP_FIRST_ELEMENT (stmt_info))
    return -1;

  do
    {
      if (next_stmt_info == stmt_info)
	return result;
      next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
      if (next_stmt_info)
	result += DR_GROUP_GAP (next_stmt_info);
    }
  while (next_stmt_info);

  return -1;
}

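/* Illustrative example (editorial, not from the original source): for an
   interleaving chain of loads from a[0], a[1] and a[3], where the
   DR_GROUP_GAP of the a[3] element is 2 because a[2] is unused, asking
   for the place of the a[3] load returns 1 + 2 = 3, i.e. its element
   index within the (gappy) access pattern.  */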
/* Check whether it is possible to load COUNT elements of type ELT_MODE
   using the method implemented by duplicate_and_interleave.  Return true
   if so, returning the number of intermediate vectors in *NVECTORS_OUT
   (if nonnull) and the type of each intermediate vector in *VECTOR_TYPE_OUT
   (if nonnull).  */

bool
can_duplicate_and_interleave_p (unsigned int count, machine_mode elt_mode,
				unsigned int *nvectors_out,
				tree *vector_type_out,
				tree *permutes)
{
  poly_int64 elt_bytes = count * GET_MODE_SIZE (elt_mode);
  poly_int64 nelts;
  unsigned int nvectors = 1;
  for (;;)
    {
      scalar_int_mode int_mode;
      poly_int64 elt_bits = elt_bytes * BITS_PER_UNIT;
      if (multiple_p (current_vector_size, elt_bytes, &nelts)
	  && int_mode_for_size (elt_bits, 0).exists (&int_mode))
	{
	  tree int_type = build_nonstandard_integer_type
	    (GET_MODE_BITSIZE (int_mode), 1);
	  tree vector_type = build_vector_type (int_type, nelts);
	  if (VECTOR_MODE_P (TYPE_MODE (vector_type)))
	    {
	      vec_perm_builder sel1 (nelts, 2, 3);
	      vec_perm_builder sel2 (nelts, 2, 3);
	      poly_int64 half_nelts = exact_div (nelts, 2);
	      for (unsigned int i = 0; i < 3; ++i)
		{
		  sel1.quick_push (i);
		  sel1.quick_push (i + nelts);
		  sel2.quick_push (half_nelts + i);
		  sel2.quick_push (half_nelts + i + nelts);
		}
	      vec_perm_indices indices1 (sel1, 2, nelts);
	      vec_perm_indices indices2 (sel2, 2, nelts);
	      if (can_vec_perm_const_p (TYPE_MODE (vector_type), indices1)
		  && can_vec_perm_const_p (TYPE_MODE (vector_type), indices2))
		{
		  if (nvectors_out)
		    *nvectors_out = nvectors;
		  if (vector_type_out)
		    *vector_type_out = vector_type;
		  if (permutes)
		    {
		      permutes[0] = vect_gen_perm_mask_checked (vector_type,
								indices1);
		      permutes[1] = vect_gen_perm_mask_checked (vector_type,
								indices2);
		    }
		  return true;
		}
	    }
	}
      if (!multiple_p (elt_bytes, 2, &elt_bytes))
	return false;
      nvectors *= 2;
    }
}

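/* Illustrative example (editorial, not from the original source): with
   NELTS == 8 the stepped encodings above expand to the low-half and
   high-half interleaving selectors

     sel1 = { 0, 8, 1, 9, 2, 10, 3, 11 }
     sel2 = { 4, 12, 5, 13, 6, 14, 7, 15 }

   where indices >= NELTS select elements from the second permute
   input.  */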
/* Get the defs for the rhs of STMTS[STMT_NUM] (collect them in OPRNDS_INFO),
   check that they are of a valid type and that they match the defs of the
   first stmt of the SLP group (stored in OPRNDS_INFO).  This function
   tries to match stmts by swapping operands of STMTS[STMT_NUM] when
   possible.  A nonzero *SWAP on entry indicates that a swap is required
   for cond_expr stmts.  Specifically, *SWAP is 1 if STMTS[STMT_NUM] is a
   cond_expr and the operands of its comparison need to be swapped; *SWAP
   is 2 if it is a cond_expr and the code of its comparison needs to be
   inverted.  If this function swaps any operands, *SWAP is set to a
   nonzero value on return.
   Return -1 on a fatal error; return 1 if the error could be corrected
   by swapping operands of this node's parent; return 0 if everything
   is OK.  */
static int
vect_get_and_check_slp_defs (vec_info *vinfo, unsigned char *swap,
			     vec<stmt_vec_info> stmts, unsigned stmt_num,
			     vec<slp_oprnd_info> *oprnds_info)
{
  stmt_vec_info stmt_info = stmts[stmt_num];
  tree oprnd;
  unsigned int i, number_of_oprnds;
  enum vect_def_type dt = vect_uninitialized_def;
  bool pattern = false;
  slp_oprnd_info oprnd_info;
  int first_op_idx = 1;
  unsigned int commutative_op = -1U;
  bool first_op_cond = false;
  bool first = stmt_num == 0;
  bool second = stmt_num == 1;

  if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
    {
      number_of_oprnds = gimple_call_num_args (stmt);
      first_op_idx = 3;
      if (gimple_call_internal_p (stmt))
	{
	  internal_fn ifn = gimple_call_internal_fn (stmt);
	  commutative_op = first_commutative_argument (ifn);

	  /* Masked load, only look at mask.  */
	  if (ifn == IFN_MASK_LOAD)
	    {
	      number_of_oprnds = 1;
	      /* Mask operand index.  */
	      first_op_idx = 5;
	    }
	}
    }
  else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
    {
      enum tree_code code = gimple_assign_rhs_code (stmt);
      number_of_oprnds = gimple_num_ops (stmt) - 1;
      /* Swap can only be done for cond_expr if asked to, otherwise we
	 could result in different comparison code to the first stmt.  */
      if (code == COND_EXPR
	  && COMPARISON_CLASS_P (gimple_assign_rhs1 (stmt)))
	{
	  first_op_cond = true;
	  number_of_oprnds++;
	}
      else
	commutative_op = commutative_tree_code (code) ? 0U : -1U;
    }
  else
    return -1;

  bool swapped = (*swap != 0);
  gcc_assert (!swapped || first_op_cond);
  for (i = 0; i < number_of_oprnds; i++)
    {
    again:
      if (first_op_cond)
	{
	  /* Map indicating how operands of cond_expr should be swapped.  */
	  int maps[3][4] = {{0, 1, 2, 3}, {1, 0, 2, 3}, {0, 1, 3, 2}};
	  int *map = maps[*swap];

	  if (i < 2)
	    oprnd = TREE_OPERAND (gimple_op (stmt_info->stmt,
					     first_op_idx), map[i]);
	  else
	    oprnd = gimple_op (stmt_info->stmt, map[i]);
	}
      else
	oprnd = gimple_op (stmt_info->stmt, first_op_idx + (swapped ? !i : i));
      if (TREE_CODE (oprnd) == VIEW_CONVERT_EXPR)
	oprnd = TREE_OPERAND (oprnd, 0);

      oprnd_info = (*oprnds_info)[i];

      stmt_vec_info def_stmt_info;
      if (!vect_is_simple_use (oprnd, vinfo, &dt, &def_stmt_info))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "Build SLP failed: can't analyze def for %T\n",
			     oprnd);

	  return -1;
	}

      if (second)
	oprnd_info->second_pattern = pattern;

      if (first)
	{
	  oprnd_info->first_dt = dt;
	  oprnd_info->first_pattern = pattern;
	  oprnd_info->first_op_type = TREE_TYPE (oprnd);
	}
      else
	{
	  /* Not first stmt of the group, check that the def-stmt/s match
	     the def-stmt/s of the first stmt.  Allow different definition
	     types for reduction chains: the first stmt must be a
	     vect_reduction_def (a phi node), and the rest
	     vect_internal_def.  */
	  tree type = TREE_TYPE (oprnd);
	  if ((oprnd_info->first_dt != dt
	       && !(oprnd_info->first_dt == vect_reduction_def
		    && dt == vect_internal_def)
	       && !((oprnd_info->first_dt == vect_external_def
		     || oprnd_info->first_dt == vect_constant_def)
		    && (dt == vect_external_def
			|| dt == vect_constant_def)))
	      || !types_compatible_p (oprnd_info->first_op_type, type))
	    {
	      /* Try swapping operands if we got a mismatch.  */
	      if (i == commutative_op && !swapped)
		{
		  swapped = true;
		  goto again;
		}

	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: different types\n");

	      return 1;
	    }
	  if ((dt == vect_constant_def
	       || dt == vect_external_def)
	      && !current_vector_size.is_constant ()
	      && (TREE_CODE (type) == BOOLEAN_TYPE
		  || !can_duplicate_and_interleave_p (stmts.length (),
						      TYPE_MODE (type))))
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: invalid type of def "
				 "for variable-length SLP %T\n", oprnd);
	      return -1;
	    }
	}

      /* Check the types of the definitions.  */
      switch (dt)
	{
	case vect_constant_def:
	case vect_external_def:
	  break;

	case vect_reduction_def:
	case vect_induction_def:
	case vect_internal_def:
	  oprnd_info->def_stmts.quick_push (def_stmt_info);
	  break;

	default:
	  /* FORNOW: Not supported.  */
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "Build SLP failed: illegal type of def %T\n",
			     oprnd);

	  return -1;
	}
    }

  /* Swap operands.  */
  if (swapped)
    {
      /* If there are already uses of this stmt in an SLP instance then
	 we've committed to the operand order and can't swap it.  */
      if (STMT_VINFO_NUM_SLP_USES (stmt_info) != 0)
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "Build SLP failed: cannot swap operands of "
			     "shared stmt %G", stmt_info->stmt);
	  return -1;
	}

      if (first_op_cond)
	{
	  gassign *stmt = as_a <gassign *> (stmt_info->stmt);
	  tree cond = gimple_assign_rhs1 (stmt);
	  enum tree_code code = TREE_CODE (cond);

	  /* Swap.  */
	  if (*swap == 1)
	    {
	      swap_ssa_operands (stmt, &TREE_OPERAND (cond, 0),
				 &TREE_OPERAND (cond, 1));
	      TREE_SET_CODE (cond, swap_tree_comparison (code));
	    }
	  /* Invert.  */
	  else
	    {
	      swap_ssa_operands (stmt, gimple_assign_rhs2_ptr (stmt),
				 gimple_assign_rhs3_ptr (stmt));
	      bool honor_nans = HONOR_NANS (TREE_OPERAND (cond, 0));
	      code = invert_tree_comparison (TREE_CODE (cond), honor_nans);
	      gcc_assert (code != ERROR_MARK);
	      TREE_SET_CODE (cond, code);
	    }
	}
      else
	{
	  unsigned int op = commutative_op + first_op_idx;
	  swap_ssa_operands (stmt_info->stmt,
			     gimple_op_ptr (stmt_info->stmt, op),
			     gimple_op_ptr (stmt_info->stmt, op + 1));
	}
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "swapped operands to match def types in %G",
			 stmt_info->stmt);
    }

  *swap = swapped;
  return 0;
}

/* Return true if call statements CALL1 and CALL2 are similar enough
   to be combined into the same SLP group.  */

static bool
compatible_calls_p (gcall *call1, gcall *call2)
{
  unsigned int nargs = gimple_call_num_args (call1);
  if (nargs != gimple_call_num_args (call2))
    return false;

  if (gimple_call_combined_fn (call1) != gimple_call_combined_fn (call2))
    return false;

  if (gimple_call_internal_p (call1))
    {
      if (!types_compatible_p (TREE_TYPE (gimple_call_lhs (call1)),
			       TREE_TYPE (gimple_call_lhs (call2))))
	return false;
      for (unsigned int i = 0; i < nargs; ++i)
	if (!types_compatible_p (TREE_TYPE (gimple_call_arg (call1, i)),
				 TREE_TYPE (gimple_call_arg (call2, i))))
	  return false;
    }
  else
    {
      if (!operand_equal_p (gimple_call_fn (call1),
			    gimple_call_fn (call2), 0))
	return false;

      if (gimple_call_fntype (call1) != gimple_call_fntype (call2))
	return false;
    }
  return true;
}

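/* Illustrative example (editorial, not from the original source): two
   calls x0 = sqrtf (a0); x1 = sqrtf (a1); are compatible and can form
   one SLP group, whereas mixing sqrtf with sqrt is rejected above
   because the two calls map to different combined functions.  */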
/* A subroutine of vect_build_slp_tree for checking VECTYPE, which is the
   caller's attempt to find the vector type in STMT_INFO with the narrowest
   element type.  Return true if VECTYPE is nonnull and if it is valid
   for STMT_INFO.  When returning true, update MAX_NUNITS to reflect the
   number of units in VECTYPE.  GROUP_SIZE and MAX_NUNITS are as for
   vect_build_slp_tree.  */

static bool
vect_record_max_nunits (stmt_vec_info stmt_info, unsigned int group_size,
			tree vectype, poly_uint64 *max_nunits)
{
  if (!vectype)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "Build SLP failed: unsupported data-type in %G\n",
			 stmt_info->stmt);
      /* Fatal mismatch.  */
      return false;
    }

  /* If populating the vector type requires unrolling then fail
     before adjusting *max_nunits for basic-block vectorization.  */
  poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
  unsigned HOST_WIDE_INT const_nunits;
  if (STMT_VINFO_BB_VINFO (stmt_info)
      && (!nunits.is_constant (&const_nunits)
	  || const_nunits > group_size))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "Build SLP failed: unrolling required "
			 "in basic block SLP\n");
      /* Fatal mismatch.  */
      return false;
    }

  /* In case of multiple types we need to detect the smallest type.  */
  vect_update_max_nunits (max_nunits, vectype);
  return true;
}

/* STMTS is a group of GROUP_SIZE SLP statements in which some
   statements do the same operation as the first statement and in which
   the others do ALT_STMT_CODE.  Return true if we can take one vector
   of the first operation and one vector of the second and permute them
   to get the required result.  VECTYPE is the type of the vector that
   would be permuted.  */

static bool
vect_two_operations_perm_ok_p (vec<stmt_vec_info> stmts,
			       unsigned int group_size, tree vectype,
			       tree_code alt_stmt_code)
{
  unsigned HOST_WIDE_INT count;
  if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&count))
    return false;

  vec_perm_builder sel (count, count, 1);
  for (unsigned int i = 0; i < count; ++i)
    {
      unsigned int elt = i;
      gassign *stmt = as_a <gassign *> (stmts[i % group_size]->stmt);
      if (gimple_assign_rhs_code (stmt) == alt_stmt_code)
	elt += count;
      sel.quick_push (elt);
    }
  vec_perm_indices indices (sel, 2, count);
  return can_vec_perm_const_p (TYPE_MODE (vectype), indices);
}

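/* Illustrative example (editorial, not from the original source): for
   the add/sub group

     c[0] = a[0] + b[0];
     c[1] = a[1] - b[1];

   with a four-element VECTYPE, one vector of sums and one of
   differences are combined with the selector { 0, 5, 2, 7 }, where
   indices >= 4 pick elements from the difference vector.  */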
/* Verify if the scalar stmts STMTS are isomorphic, require data
   permutation or are of unsupported types of operation.  Return
   true if they are, otherwise return false and indicate in *MATCHES
   which stmts are not isomorphic to the first one.  If MATCHES[0]
   is false then this indicates the comparison could not be
   carried out or the stmts will never be vectorized by SLP.

   Note COND_EXPR is possibly isomorphic to another one after swapping its
   operands.  Set SWAP[i] to 1 if stmt I is COND_EXPR and isomorphic to
   the first stmt by swapping the two operands of comparison; set SWAP[i]
   to 2 if stmt I is isomorphic to the first stmt by inverting the code
   of comparison.  Take A1 >= B1 ? X1 : Y1 as an example: it can be swapped
   to (B1 <= A1 ? X1 : Y1); or be inverted to (A1 < B1) ? Y1 : X1.  */

static bool
vect_build_slp_tree_1 (unsigned char *swap,
		       vec<stmt_vec_info> stmts, unsigned int group_size,
		       poly_uint64 *max_nunits, bool *matches,
		       bool *two_operators)
{
  unsigned int i;
  stmt_vec_info first_stmt_info = stmts[0];
  enum tree_code first_stmt_code = ERROR_MARK;
  enum tree_code alt_stmt_code = ERROR_MARK;
  enum tree_code rhs_code = ERROR_MARK;
  enum tree_code first_cond_code = ERROR_MARK;
  tree lhs;
  bool need_same_oprnds = false;
  tree vectype = NULL_TREE, first_op1 = NULL_TREE;
  optab optab;
  int icode;
  machine_mode optab_op2_mode;
  machine_mode vec_mode;
  stmt_vec_info first_load = NULL, prev_first_load = NULL;
  bool load_p = false;

  /* For every stmt in NODE find its def stmt/s.  */
  stmt_vec_info stmt_info;
  FOR_EACH_VEC_ELT (stmts, i, stmt_info)
    {
      gimple *stmt = stmt_info->stmt;
      swap[i] = 0;
      matches[i] = false;

      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location, "Build SLP for %G", stmt);

      /* Fail to vectorize statements marked as unvectorizable.  */
      if (!STMT_VINFO_VECTORIZABLE (stmt_info))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "Build SLP failed: unvectorizable statement %G",
			     stmt);
	  /* Fatal mismatch.  */
	  matches[0] = false;
	  return false;
	}

      lhs = gimple_get_lhs (stmt);
      if (lhs == NULL_TREE)
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "Build SLP failed: not GIMPLE_ASSIGN nor "
			     "GIMPLE_CALL %G", stmt);
	  /* Fatal mismatch.  */
	  matches[0] = false;
	  return false;
	}

      tree nunits_vectype;
      if (!vect_get_vector_types_for_stmt (stmt_info, &vectype,
					   &nunits_vectype)
	  || (nunits_vectype
	      && !vect_record_max_nunits (stmt_info, group_size,
					  nunits_vectype, max_nunits)))
	{
	  /* Fatal mismatch.  */
	  matches[0] = false;
	  return false;
	}

      gcc_assert (vectype);

      if (gcall *call_stmt = dyn_cast <gcall *> (stmt))
	{
	  rhs_code = CALL_EXPR;

	  if (gimple_call_internal_p (stmt, IFN_MASK_LOAD))
	    load_p = true;
	  else if ((gimple_call_internal_p (call_stmt)
		    && (!vectorizable_internal_fn_p
			(gimple_call_internal_fn (call_stmt))))
		   || gimple_call_tail_p (call_stmt)
		   || gimple_call_noreturn_p (call_stmt)
		   || !gimple_call_nothrow_p (call_stmt)
		   || gimple_call_chain (call_stmt))
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: unsupported call type %G",
				 call_stmt);
	      /* Fatal mismatch.  */
	      matches[0] = false;
	      return false;
	    }
	}
      else
	{
	  rhs_code = gimple_assign_rhs_code (stmt);
	  load_p = TREE_CODE_CLASS (rhs_code) == tcc_reference;
	}

      /* Check the operation.  */
      if (i == 0)
	{
	  first_stmt_code = rhs_code;

	  /* Shift arguments should be equal in all the packed stmts for a
	     vector shift with scalar shift operand.  */
	  if (rhs_code == LSHIFT_EXPR || rhs_code == RSHIFT_EXPR
	      || rhs_code == LROTATE_EXPR
	      || rhs_code == RROTATE_EXPR)
	    {
	      if (vectype == boolean_type_node)
		{
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				     "Build SLP failed: shift of a"
				     " boolean.\n");
		  /* Fatal mismatch.  */
		  matches[0] = false;
		  return false;
		}

	      vec_mode = TYPE_MODE (vectype);

	      /* First see if we have a vector/vector shift.  */
	      optab = optab_for_tree_code (rhs_code, vectype,
					   optab_vector);

	      if (!optab
		  || optab_handler (optab, vec_mode) == CODE_FOR_nothing)
		{
		  /* No vector/vector shift, try for a vector/scalar shift.  */
		  optab = optab_for_tree_code (rhs_code, vectype,
					       optab_scalar);

		  if (!optab)
		    {
		      if (dump_enabled_p ())
			dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
					 "Build SLP failed: no optab.\n");
		      /* Fatal mismatch.  */
		      matches[0] = false;
		      return false;
		    }
		  icode = (int) optab_handler (optab, vec_mode);
		  if (icode == CODE_FOR_nothing)
		    {
		      if (dump_enabled_p ())
			dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
					 "Build SLP failed: "
					 "op not supported by target.\n");
		      /* Fatal mismatch.  */
		      matches[0] = false;
		      return false;
		    }
		  optab_op2_mode = insn_data[icode].operand[2].mode;
		  if (!VECTOR_MODE_P (optab_op2_mode))
		    {
		      need_same_oprnds = true;
		      first_op1 = gimple_assign_rhs2 (stmt);
		    }
		}
	    }
	  else if (rhs_code == WIDEN_LSHIFT_EXPR)
	    {
	      need_same_oprnds = true;
	      first_op1 = gimple_assign_rhs2 (stmt);
	    }
	}
      else
	{
	  if (first_stmt_code != rhs_code
	      && alt_stmt_code == ERROR_MARK)
	    alt_stmt_code = rhs_code;
	  if (first_stmt_code != rhs_code
	      && (first_stmt_code != IMAGPART_EXPR
		  || rhs_code != REALPART_EXPR)
	      && (first_stmt_code != REALPART_EXPR
		  || rhs_code != IMAGPART_EXPR)
	      /* Handle mismatches in plus/minus by computing both
		 and merging the results.  */
	      && !((first_stmt_code == PLUS_EXPR
		    || first_stmt_code == MINUS_EXPR)
		   && (alt_stmt_code == PLUS_EXPR
		       || alt_stmt_code == MINUS_EXPR)
		   && rhs_code == alt_stmt_code)
	      && !(STMT_VINFO_GROUPED_ACCESS (stmt_info)
		   && (first_stmt_code == ARRAY_REF
		       || first_stmt_code == BIT_FIELD_REF
		       || first_stmt_code == INDIRECT_REF
		       || first_stmt_code == COMPONENT_REF
		       || first_stmt_code == MEM_REF)))
	    {
	      if (dump_enabled_p ())
		{
		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				   "Build SLP failed: different operation "
				   "in stmt %G", stmt);
		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				   "original stmt %G", first_stmt_info->stmt);
		}
	      /* Mismatch.  */
	      continue;
	    }

	  if (need_same_oprnds
	      && !operand_equal_p (first_op1, gimple_assign_rhs2 (stmt), 0))
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: different shift "
				 "arguments in %G", stmt);
	      /* Mismatch.  */
	      continue;
	    }

	  if (!load_p && rhs_code == CALL_EXPR)
	    {
	      if (!compatible_calls_p (as_a <gcall *> (stmts[0]->stmt),
				       as_a <gcall *> (stmt)))
		{
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				     "Build SLP failed: different calls in %G",
				     stmt);
		  /* Mismatch.  */
		  continue;
		}
	    }
	}

      /* Grouped store or load.  */
      if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
	{
	  if (REFERENCE_CLASS_P (lhs))
	    {
	      /* Store.  */
	      ;
	    }
	  else
	    {
	      /* Load.  */
	      first_load = DR_GROUP_FIRST_ELEMENT (stmt_info);
	      if (prev_first_load)
		{
		  /* Check that there are no loads from different interleaving
		     chains in the same node.  */
		  if (prev_first_load != first_load)
		    {
		      if (dump_enabled_p ())
			dump_printf_loc (MSG_MISSED_OPTIMIZATION,
					 vect_location,
					 "Build SLP failed: different "
					 "interleaving chains in one node %G",
					 stmt);
		      /* Mismatch.  */
		      continue;
		    }
		}
	      else
		prev_first_load = first_load;
	    }
	} /* Grouped access.  */
      else
	{
	  if (load_p)
	    {
	      /* Not grouped load.  */
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: not grouped load %G", stmt);

	      /* FORNOW: Non-grouped loads are not supported.  */
	      /* Fatal mismatch.  */
	      matches[0] = false;
	      return false;
	    }

	  /* Not a memory operation.  */
	  if (TREE_CODE_CLASS (rhs_code) != tcc_binary
	      && TREE_CODE_CLASS (rhs_code) != tcc_unary
	      && TREE_CODE_CLASS (rhs_code) != tcc_expression
	      && TREE_CODE_CLASS (rhs_code) != tcc_comparison
	      && rhs_code != VIEW_CONVERT_EXPR
	      && rhs_code != CALL_EXPR)
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: operation unsupported %G",
				 stmt);
	      /* Fatal mismatch.  */
	      matches[0] = false;
	      return false;
	    }

	  if (rhs_code == COND_EXPR)
	    {
	      tree cond_expr = gimple_assign_rhs1 (stmt);
	      enum tree_code cond_code = TREE_CODE (cond_expr);
	      enum tree_code swap_code = ERROR_MARK;
	      enum tree_code invert_code = ERROR_MARK;

	      if (i == 0)
		first_cond_code = TREE_CODE (cond_expr);
	      else if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
		{
		  bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
		  swap_code = swap_tree_comparison (cond_code);
		  invert_code = invert_tree_comparison (cond_code, honor_nans);
		}

	      if (first_cond_code == cond_code)
		;
	      /* Isomorphic can be achieved by swapping.  */
	      else if (first_cond_code == swap_code)
		swap[i] = 1;
	      /* Isomorphic can be achieved by inverting.  */
	      else if (first_cond_code == invert_code)
		swap[i] = 2;
	      else
		{
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				     "Build SLP failed: different"
				     " operation %G", stmt);
		  /* Mismatch.  */
		  continue;
		}
	    }
	}

      matches[i] = true;
    }

  for (i = 0; i < group_size; ++i)
    if (!matches[i])
      return false;

  /* If we allowed a two-operation SLP node verify the target can cope
     with the permute we are going to use.  */
  if (alt_stmt_code != ERROR_MARK
      && TREE_CODE_CLASS (alt_stmt_code) != tcc_reference)
    {
      if (vectype == boolean_type_node
	  || !vect_two_operations_perm_ok_p (stmts, group_size,
					     vectype, alt_stmt_code))
	{
	  for (i = 0; i < group_size; ++i)
	    if (gimple_assign_rhs_code (stmts[i]->stmt) == alt_stmt_code)
	      {
		matches[i] = false;
		if (dump_enabled_p ())
		  {
		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				     "Build SLP failed: different operation "
				     "in stmt %G", stmts[i]->stmt);
		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				     "original stmt %G", first_stmt_info->stmt);
		  }
	      }
	  return false;
	}
      *two_operators = true;
    }

  return true;
}

/* Traits for the hash_set to record failed SLP builds for a stmt set.
   Note we never remove apart from at destruction time so we do not
   need a special value for deleted that differs from empty.  */
struct bst_traits
{
  typedef vec <stmt_vec_info> value_type;
  typedef vec <stmt_vec_info> compare_type;
  static inline hashval_t hash (value_type);
  static inline bool equal (value_type existing, value_type candidate);
  static inline bool is_empty (value_type x) { return !x.exists (); }
  static inline bool is_deleted (value_type x) { return !x.exists (); }
  static inline void mark_empty (value_type &x) { x.release (); }
  static inline void mark_deleted (value_type &x) { x.release (); }
  static inline void remove (value_type &x) { x.release (); }
};
inline hashval_t
bst_traits::hash (value_type x)
{
  inchash::hash h;
  for (unsigned i = 0; i < x.length (); ++i)
    h.add_int (gimple_uid (x[i]->stmt));
  return h.end ();
}
inline bool
bst_traits::equal (value_type existing, value_type candidate)
{
  if (existing.length () != candidate.length ())
    return false;
  for (unsigned i = 0; i < existing.length (); ++i)
    if (existing[i] != candidate[i])
      return false;
  return true;
}

typedef hash_map <vec <gimple *>, slp_tree,
		  simple_hashmap_traits <bst_traits, slp_tree> >
  scalar_stmts_to_slp_tree_map_t;

static slp_tree
vect_build_slp_tree_2 (vec_info *vinfo,
		       vec<stmt_vec_info> stmts, unsigned int group_size,
		       poly_uint64 *max_nunits,
		       bool *matches, unsigned *npermutes, unsigned *tree_size,
		       scalar_stmts_to_slp_tree_map_t *bst_map);

static slp_tree
vect_build_slp_tree (vec_info *vinfo,
		     vec<stmt_vec_info> stmts, unsigned int group_size,
		     poly_uint64 *max_nunits,
		     bool *matches, unsigned *npermutes, unsigned *tree_size,
		     scalar_stmts_to_slp_tree_map_t *bst_map)
{
  if (slp_tree *leader = bst_map->get (stmts))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location, "re-using %sSLP tree %p\n",
			 *leader ? "" : "failed ", *leader);
      if (*leader)
	(*leader)->refcnt++;
      return *leader;
    }
  slp_tree res = vect_build_slp_tree_2 (vinfo, stmts, group_size, max_nunits,
					matches, npermutes, tree_size, bst_map);
  /* Keep a reference for the bst_map use.  */
  if (res)
    res->refcnt++;
  bst_map->put (stmts.copy (), res);
  return res;
}

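/* Illustrative note (editorial, not from the original source): the
   BST_MAP memoization above means that if the stmt set { S1, S2 }
   appears as an operand of several SLP nodes, the second and later
   lookups reuse the tree built from the first attempt (or the recorded
   failure, a NULL entry) instead of recursing again.  */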
/* Recursively build an SLP tree for the group of stmts STMTS.
   Fail (and return NULL) if the def-stmts are not isomorphic, require
   data permutation or are of unsupported types of operation.  Otherwise,
   return the built SLP tree; MATCHES records which stmts failed to match
   the first one.  */

static slp_tree
vect_build_slp_tree_2 (vec_info *vinfo,
		       vec<stmt_vec_info> stmts, unsigned int group_size,
		       poly_uint64 *max_nunits,
		       bool *matches, unsigned *npermutes, unsigned *tree_size,
		       scalar_stmts_to_slp_tree_map_t *bst_map)
{
  unsigned nops, i, this_tree_size = 0;
  poly_uint64 this_max_nunits = *max_nunits;
  slp_tree node;

  matches[0] = false;

  stmt_vec_info stmt_info = stmts[0];
  if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
    nops = gimple_call_num_args (stmt);
  else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
    {
      nops = gimple_num_ops (stmt) - 1;
      if (gimple_assign_rhs_code (stmt) == COND_EXPR)
	nops++;
    }
  else if (is_a <gphi *> (stmt_info->stmt))
    nops = 0;
  else
    return NULL;

  /* If the SLP node is a PHI (induction or reduction), terminate
     the recursion.  */
  if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
    {
      tree scalar_type = TREE_TYPE (PHI_RESULT (stmt));
      tree vectype = get_vectype_for_scalar_type (scalar_type);
      if (!vect_record_max_nunits (stmt_info, group_size, vectype, max_nunits))
	return NULL;

      vect_def_type def_type = STMT_VINFO_DEF_TYPE (stmt_info);
      /* Induction from different IVs is not supported.  */
      if (def_type == vect_induction_def)
	{
	  stmt_vec_info other_info;
	  FOR_EACH_VEC_ELT (stmts, i, other_info)
	    if (stmt_info != other_info)
	      return NULL;
	}
      else if (def_type == vect_reduction_def
	       || def_type == vect_double_reduction_def
	       || def_type == vect_nested_cycle)
	{
	  /* Else def types have to match.  */
	  stmt_vec_info other_info;
	  FOR_EACH_VEC_ELT (stmts, i, other_info)
	    {
	      /* But for reduction chains only check on the first stmt.  */
	      if (!STMT_VINFO_DATA_REF (other_info)
		  && REDUC_GROUP_FIRST_ELEMENT (other_info)
		  && REDUC_GROUP_FIRST_ELEMENT (other_info) != stmt_info)
		continue;
	      if (STMT_VINFO_DEF_TYPE (other_info) != def_type)
		return NULL;
	    }
	}
      else
	return NULL;
      (*tree_size)++;
      node = vect_create_new_slp_node (stmts);
      return node;
    }


  bool two_operators = false;
  unsigned char *swap = XALLOCAVEC (unsigned char, group_size);
  if (!vect_build_slp_tree_1 (swap, stmts, group_size,
			      &this_max_nunits, matches, &two_operators))
    return NULL;

  /* If the SLP node is a load, terminate the recursion unless masked.  */
  if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
      && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
    {
      if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
	{
	  /* Masked load.  */
	  gcc_assert (gimple_call_internal_p (stmt, IFN_MASK_LOAD));
	  nops = 1;
	}
      else
	{
	  *max_nunits = this_max_nunits;
	  (*tree_size)++;
	  node = vect_create_new_slp_node (stmts);
	  return node;
	}
    }

  /* Get at the operands, verifying they are compatible.  */
  vec<slp_oprnd_info> oprnds_info = vect_create_oprnd_info (nops, group_size);
  slp_oprnd_info oprnd_info;
  FOR_EACH_VEC_ELT (stmts, i, stmt_info)
    {
      int res = vect_get_and_check_slp_defs (vinfo, &swap[i],
					     stmts, i, &oprnds_info);
      if (res != 0)
	matches[(res == -1) ? 0 : i] = false;
      if (!matches[0])
	break;
    }
  for (i = 0; i < group_size; ++i)
    if (!matches[i])
      {
	vect_free_oprnd_info (oprnds_info);
	return NULL;
      }

  auto_vec<slp_tree, 4> children;

  stmt_info = stmts[0];

  /* Create SLP_TREE nodes for the definition node/s.  */
  FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
    {
      slp_tree child;
      unsigned old_tree_size = this_tree_size;
      unsigned int j;

      if (oprnd_info->first_dt != vect_internal_def
	  && oprnd_info->first_dt != vect_reduction_def
	  && oprnd_info->first_dt != vect_induction_def)
	continue;

      if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
					group_size, &this_max_nunits,
					matches, npermutes,
					&this_tree_size, bst_map)) != NULL)
	{
	  /* If we have all children of child built up from scalars then just
	     throw that away and build it up this node from scalars.  */
	  if (!SLP_TREE_CHILDREN (child).is_empty ()
	      /* ??? Rejecting patterns this way doesn't work.  We'd have to
		 do extra work to cancel the pattern so the uses see the
		 scalar version.  */
	      && !is_pattern_stmt_p (SLP_TREE_SCALAR_STMTS (child)[0]))
	    {
	      slp_tree grandchild;

	      FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild)
		if (SLP_TREE_DEF_TYPE (grandchild) == vect_internal_def)
		  break;
	      if (!grandchild)
		{
		  /* Roll back.  */
		  this_tree_size = old_tree_size;
		  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild)
		    vect_free_slp_tree (grandchild, false);
		  SLP_TREE_CHILDREN (child).truncate (0);

		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_NOTE, vect_location,
				     "Building parent vector operands from "
				     "scalars instead\n");
		  oprnd_info->def_stmts = vNULL;
		  SLP_TREE_DEF_TYPE (child) = vect_external_def;
		  ++this_tree_size;
		  children.safe_push (child);
		  continue;
		}
	    }

	  oprnd_info->def_stmts = vNULL;
	  children.safe_push (child);
	  continue;
	}

      /* If the SLP build failed fatally and we are analyzing a basic block
	 simply treat nodes we fail to build as externally defined
	 (and thus build vectors from the scalar defs).
	 The cost model will reject outright expensive cases.
	 ??? This doesn't treat cases where permutation ultimately
	 fails (or we don't try permutation below).  Ideally we'd
	 even compute a permutation that will end up with the maximum
	 SLP tree size...  */
      if (is_a <bb_vec_info> (vinfo)
	  && !matches[0]
	  /* ??? Rejecting patterns this way doesn't work.  We'd have to
	     do extra work to cancel the pattern so the uses see the
	     scalar version.  */
	  && !is_pattern_stmt_p (stmt_info))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "Building vector operands from scalars\n");
	  this_tree_size++;
	  child = vect_create_new_slp_node (oprnd_info->def_stmts);
	  SLP_TREE_DEF_TYPE (child) = vect_external_def;
	  children.safe_push (child);
	  oprnd_info->def_stmts = vNULL;
	  continue;
	}

      /* If the SLP build for operand zero failed and operand zero
	 and one can be commuted try that for the scalar stmts
	 that failed the match.  */
      if (i == 0
	  /* A first scalar stmt mismatch signals a fatal mismatch.  */
	  && matches[0]
	  /* ??? For COND_EXPRs we can swap the comparison operands
	     as well as the arms under some constraints.  */
	  && nops == 2
	  && oprnds_info[1]->first_dt == vect_internal_def
	  && is_gimple_assign (stmt_info->stmt)
	  /* Swapping operands for reductions breaks assumptions later on.  */
	  && STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
	  && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def
	  /* Do so only if the number of unsuccessful permutes was no more
	     than a cut-off, as re-trying the recursive match on
	     possibly each level of the tree would expose exponential
	     behavior.  */
	  && *npermutes < 4)
	{
	  /* See whether we can swap the matching or the non-matching
	     stmt operands.  */
	  bool swap_not_matching = true;
	  do
	    {
	      for (j = 0; j < group_size; ++j)
		{
		  if (matches[j] != !swap_not_matching)
		    continue;
		  stmt_vec_info stmt_info = stmts[j];
		  /* Verify if we can swap operands of this stmt.  */
		  gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt);
		  if (!stmt
		      || !commutative_tree_code (gimple_assign_rhs_code (stmt)))
		    {
		      if (!swap_not_matching)
			goto fail;
		      swap_not_matching = false;
		      break;
		    }
		  /* Verify if we can safely swap or if we committed to a
		     specific operand order already.
		     ??? Instead of modifying GIMPLE stmts here we could
		     record whether we want to swap operands in the SLP
		     node and temporarily do that when processing it
		     (or wrap operand accessors in a helper).  */
		  else if (swap[j] != 0
			   || STMT_VINFO_NUM_SLP_USES (stmt_info))
		    {
		      if (!swap_not_matching)
			{
			  if (dump_enabled_p ())
			    dump_printf_loc (MSG_MISSED_OPTIMIZATION,
					     vect_location,
					     "Build SLP failed: cannot swap "
					     "operands of shared stmt %G",
					     stmts[j]->stmt);
			  goto fail;
			}
		      swap_not_matching = false;
		      break;
		    }
		}
	    }
	  while (j != group_size);

	  /* Swap mismatched definition stmts.  */
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "Re-trying with swapped operands of stmts ");
	  for (j = 0; j < group_size; ++j)
	    if (matches[j] == !swap_not_matching)
	      {
		std::swap (oprnds_info[0]->def_stmts[j],
			   oprnds_info[1]->def_stmts[j]);
		if (dump_enabled_p ())
		  dump_printf (MSG_NOTE, "%d ", j);
	      }
	  if (dump_enabled_p ())
	    dump_printf (MSG_NOTE, "\n");
	  /* And try again with scratch 'matches' ...  */
	  bool *tem = XALLOCAVEC (bool, group_size);
	  if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
					    group_size, &this_max_nunits,
					    tem, npermutes,
					    &this_tree_size, bst_map)) != NULL)
	    {
	      /* ... so if successful we can apply the operand swapping
		 to the GIMPLE IL.  This is necessary because for example
		 vect_get_slp_defs uses operand indexes and thus expects
		 canonical operand order.  This is also necessary even
		 if we end up building the operand from scalars as
		 we'll continue to process swapped operand two.  */
	      for (j = 0; j < group_size; ++j)
		gimple_set_plf (stmts[j]->stmt, GF_PLF_1, false);
	      for (j = 0; j < group_size; ++j)
		if (matches[j] == !swap_not_matching)
		  {
		    gassign *stmt = as_a <gassign *> (stmts[j]->stmt);
		    /* Avoid swapping operands twice.  */
		    if (gimple_plf (stmt, GF_PLF_1))
		      continue;
		    swap_ssa_operands (stmt, gimple_assign_rhs1_ptr (stmt),
				       gimple_assign_rhs2_ptr (stmt));
		    gimple_set_plf (stmt, GF_PLF_1, true);
		  }
	      /* Verify we swap all duplicates or none.  */
	      if (flag_checking)
		for (j = 0; j < group_size; ++j)
		  gcc_assert (gimple_plf (stmts[j]->stmt, GF_PLF_1)
			      == (matches[j] == !swap_not_matching));

	      /* If we have all children of child built up from scalars then
		 just throw that away and build it up this node from scalars.  */
	      if (!SLP_TREE_CHILDREN (child).is_empty ()
		  /* ??? Rejecting patterns this way doesn't work.  We'd have
		     to do extra work to cancel the pattern so the uses see the
		     scalar version.  */
		  && !is_pattern_stmt_p (SLP_TREE_SCALAR_STMTS (child)[0]))
		{
		  unsigned int j;
		  slp_tree grandchild;

		  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild)
		    if (SLP_TREE_DEF_TYPE (grandchild) == vect_internal_def)
		      break;
		  if (!grandchild)
		    {
		      /* Roll back.  */
		      this_tree_size = old_tree_size;
		      FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild)
			vect_free_slp_tree (grandchild, false);
		      SLP_TREE_CHILDREN (child).truncate (0);

		      if (dump_enabled_p ())
			dump_printf_loc (MSG_NOTE, vect_location,
					 "Building parent vector operands from "
					 "scalars instead\n");
		      oprnd_info->def_stmts = vNULL;
		      SLP_TREE_DEF_TYPE (child) = vect_external_def;
		      ++this_tree_size;
		      children.safe_push (child);
		      continue;
		    }
		}

	      oprnd_info->def_stmts = vNULL;
	      children.safe_push (child);
	      continue;
	    }

	  ++*npermutes;
	}

fail:
      gcc_assert (child == NULL);
      FOR_EACH_VEC_ELT (children, j, child)
	vect_free_slp_tree (child, false);
      vect_free_oprnd_info (oprnds_info);
      return NULL;
    }

  vect_free_oprnd_info (oprnds_info);

  *tree_size += this_tree_size + 1;
  *max_nunits = this_max_nunits;

  node = vect_create_new_slp_node (stmts);
  SLP_TREE_TWO_OPERATORS (node) = two_operators;
  SLP_TREE_CHILDREN (node).splice (children);
  return node;
}

/* Dump an SLP tree NODE using flags specified in DUMP_KIND.  */

static void
vect_print_slp_tree (dump_flags_t dump_kind, dump_location_t loc,
		     slp_tree node, hash_set<slp_tree> &visited)
{
  int i;
  stmt_vec_info stmt_info;
  slp_tree child;

  if (visited.add (node))
    return;

  dump_metadata_t metadata (dump_kind, loc.get_impl_location ());
  dump_user_location_t user_loc = loc.get_user_location ();
  dump_printf_loc (metadata, user_loc, "node%s %p\n",
		   SLP_TREE_DEF_TYPE (node) != vect_internal_def
		   ? " (external)" : "", node);
  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
    dump_printf_loc (metadata, user_loc, "\tstmt %d %G", i, stmt_info->stmt);
  if (SLP_TREE_CHILDREN (node).is_empty ())
    return;
  dump_printf_loc (metadata, user_loc, "\tchildren");
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    dump_printf (dump_kind, " %p", (void *)child);
  dump_printf (dump_kind, "\n");
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_print_slp_tree (dump_kind, loc, child, visited);
}

static void
vect_print_slp_tree (dump_flags_t dump_kind, dump_location_t loc,
		     slp_tree node)
{
  hash_set<slp_tree> visited;
  vect_print_slp_tree (dump_kind, loc, node, visited);
}

/* Mark the tree rooted at NODE with PURE_SLP.  */

static void
vect_mark_slp_stmts (slp_tree node, hash_set<slp_tree> &visited)
{
  int i;
  stmt_vec_info stmt_info;
  slp_tree child;

  if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
    return;

  if (visited.add (node))
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
    STMT_SLP_TYPE (stmt_info) = pure_slp;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_mark_slp_stmts (child, visited);
}

static void
vect_mark_slp_stmts (slp_tree node)
{
  hash_set<slp_tree> visited;
  vect_mark_slp_stmts (node, visited);
}

/* Mark the statements of the tree rooted at NODE as relevant (vect_used).  */

static void
vect_mark_slp_stmts_relevant (slp_tree node, hash_set<slp_tree> &visited)
{
  int i;
  stmt_vec_info stmt_info;
  slp_tree child;

  if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
    return;

  if (visited.add (node))
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
    {
      gcc_assert (!STMT_VINFO_RELEVANT (stmt_info)
		  || STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope);
      STMT_VINFO_RELEVANT (stmt_info) = vect_used_in_scope;
    }

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_mark_slp_stmts_relevant (child, visited);
}

static void
vect_mark_slp_stmts_relevant (slp_tree node)
{
  hash_set<slp_tree> visited;
  vect_mark_slp_stmts_relevant (node, visited);
}


/* Rearrange the statements of NODE according to PERMUTATION.  */

static void
vect_slp_rearrange_stmts (slp_tree node, unsigned int group_size,
			  vec<unsigned> permutation,
			  hash_set<slp_tree> &visited)
{
  stmt_vec_info stmt_info;
  vec<stmt_vec_info> tmp_stmts;
  unsigned int i;
  slp_tree child;

  if (visited.add (node))
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_slp_rearrange_stmts (child, group_size, permutation, visited);

  gcc_assert (group_size == SLP_TREE_SCALAR_STMTS (node).length ());
  tmp_stmts.create (group_size);
  tmp_stmts.quick_grow_cleared (group_size);

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
    tmp_stmts[permutation[i]] = stmt_info;

  SLP_TREE_SCALAR_STMTS (node).release ();
  SLP_TREE_SCALAR_STMTS (node) = tmp_stmts;
}


/* Attempt to reorder stmts in a reduction chain so that we don't
   require any load permutation.  Return true if that was possible,
   otherwise return false.  */

static bool
vect_attempt_slp_rearrange_stmts (slp_instance slp_instn)
{
  unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (slp_instn);
  unsigned int i, j;
  unsigned int lidx;
  slp_tree node, load;

  /* Compare all the permutation sequences to the first one.  We know
     that at least one load is permuted.  */
  node = SLP_INSTANCE_LOADS (slp_instn)[0];
  if (!node->load_permutation.exists ())
    return false;
  for (i = 1; SLP_INSTANCE_LOADS (slp_instn).iterate (i, &load); ++i)
    {
      if (!load->load_permutation.exists ())
	return false;
      FOR_EACH_VEC_ELT (load->load_permutation, j, lidx)
	if (lidx != node->load_permutation[j])
	  return false;
    }

  /* Check that the loads in the first sequence are different and there
     are no gaps between them.  */
  auto_sbitmap load_index (group_size);
  bitmap_clear (load_index);
  FOR_EACH_VEC_ELT (node->load_permutation, i, lidx)
    {
      if (lidx >= group_size)
	return false;
      if (bitmap_bit_p (load_index, lidx))
	return false;

      bitmap_set_bit (load_index, lidx);
    }
  for (i = 0; i < group_size; i++)
    if (!bitmap_bit_p (load_index, i))
      return false;

  /* This permutation is valid for reduction.  Since the order of the
     statements in the nodes is not important unless they are memory
     accesses, we can rearrange the statements in all the nodes
     according to the order of the loads.  */
  hash_set<slp_tree> visited;
  vect_slp_rearrange_stmts (SLP_INSTANCE_TREE (slp_instn), group_size,
			    node->load_permutation, visited);

  /* We are done, no actual permutations need to be generated.  */
  poly_uint64 unrolling_factor = SLP_INSTANCE_UNROLLING_FACTOR (slp_instn);
  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
    {
      stmt_vec_info first_stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
      first_stmt_info = DR_GROUP_FIRST_ELEMENT (first_stmt_info);
      /* But we have to keep those permutations that are required because
	 of handling of gaps.  */
      if (known_eq (unrolling_factor, 1U)
	  || (group_size == DR_GROUP_SIZE (first_stmt_info)
	      && DR_GROUP_GAP (first_stmt_info) == 0))
	SLP_TREE_LOAD_PERMUTATION (node).release ();
      else
	for (j = 0; j < SLP_TREE_LOAD_PERMUTATION (node).length (); ++j)
	  SLP_TREE_LOAD_PERMUTATION (node)[j] = j;
    }

  return true;
}

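/* Illustrative example (editorial, not from the original source): for
   GROUP_SIZE 4, a common load permutation { 3, 0, 1, 2 } passes the
   bijection check above and the scalar stmts of all nodes are
   rearranged to match the load order, whereas { 0, 0, 2, 3 } is
   rejected because element 0 is loaded twice and element 1 is
   missing.  */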
1663 /* Gather loads in the SLP graph NODE and populate the INST loads array. */
1664
1665 static void
1666 vect_gather_slp_loads (slp_instance inst, slp_tree node,
1667 hash_set<slp_tree> &visited)
1668 {
1669 if (visited.add (node))
1670 return;
1671
1672 if (SLP_TREE_CHILDREN (node).length () == 0)
1673 {
1674 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
1675 if (SLP_TREE_DEF_TYPE (node) == vect_internal_def
1676 && STMT_VINFO_GROUPED_ACCESS (stmt_info)
1677 && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1678 SLP_INSTANCE_LOADS (inst).safe_push (node);
1679 }
1680 else
1681 {
1682 unsigned i;
1683 slp_tree child;
1684 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1685 vect_gather_slp_loads (inst, child, visited);
1686 }
1687 }
1688
1689 static void
1690 vect_gather_slp_loads (slp_instance inst, slp_tree node)
1691 {
1692 hash_set<slp_tree> visited;
1693 vect_gather_slp_loads (inst, node, visited);
1694 }
1695
1696 /* Check if the required load permutations in the SLP instance
1697 SLP_INSTN are supported. */
1698
1699 static bool
1700 vect_supported_load_permutation_p (slp_instance slp_instn)
1701 {
1702 unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (slp_instn);
1703 unsigned int i, j, k, next;
1704 slp_tree node;
1705
1706 if (dump_enabled_p ())
1707 {
1708 dump_printf_loc (MSG_NOTE, vect_location, "Load permutation ");
1709 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
1710 if (node->load_permutation.exists ())
1711 FOR_EACH_VEC_ELT (node->load_permutation, j, next)
1712 dump_printf (MSG_NOTE, "%d ", next);
1713 else
1714 for (k = 0; k < group_size; ++k)
1715 dump_printf (MSG_NOTE, "%d ", k);
1716 dump_printf (MSG_NOTE, "\n");
1717 }
1718
1719 /* In case of reduction every load permutation is allowed, since the order
1720 of the reduction statements is not important (as opposed to the case of
1721 grouped stores). The only condition we need to check is that all the
1722 load nodes are of the same size and have the same permutation (and then
1723 rearrange all the nodes of the SLP instance according to this
1724 permutation). */
1725
1726 /* Check that all the load nodes are of the same size. */
1727 /* ??? Can't we assert this? */
1728 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
1729 if (SLP_TREE_SCALAR_STMTS (node).length () != (unsigned) group_size)
1730 return false;
1731
1732 node = SLP_INSTANCE_TREE (slp_instn);
1733 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
1734
1735 /* Reduction (there are no data-refs in the root).
1736 In reduction chain the order of the loads is not important. */
1737 if (!STMT_VINFO_DATA_REF (stmt_info)
1738 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
1739 vect_attempt_slp_rearrange_stmts (slp_instn);
1740
1741 /* In basic block vectorization we allow any subchain of an interleaving
1742 chain.
1743 FORNOW: not supported in loop SLP because of realignment compications. */
1744 if (STMT_VINFO_BB_VINFO (stmt_info))
1745 {
1746 /* Check whether the loads in an instance form a subchain and thus
1747 no permutation is necessary. */
1748 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
1749 {
1750 if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
1751 continue;
1752 bool subchain_p = true;
1753 stmt_vec_info next_load_info = NULL;
1754 stmt_vec_info load_info;
1755 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
1756 {
1757 if (j != 0
1758 && (next_load_info != load_info
1759 || DR_GROUP_GAP (load_info) != 1))
1760 {
1761 subchain_p = false;
1762 break;
1763 }
1764 next_load_info = DR_GROUP_NEXT_ELEMENT (load_info);
1765 }
1766 if (subchain_p)
1767 SLP_TREE_LOAD_PERMUTATION (node).release ();
1768 else
1769 {
1770 stmt_vec_info group_info = SLP_TREE_SCALAR_STMTS (node)[0];
1771 group_info = DR_GROUP_FIRST_ELEMENT (group_info);
1772 unsigned HOST_WIDE_INT nunits;
1773 unsigned k, maxk = 0;
1774 FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (node), j, k)
1775 if (k > maxk)
1776 maxk = k;
1777 /* In BB vectorization we may not actually use a loaded vector
1778 accessing elements in excess of DR_GROUP_SIZE. */
1779 tree vectype = STMT_VINFO_VECTYPE (group_info);
1780 if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nunits)
1781 || maxk >= (DR_GROUP_SIZE (group_info) & ~(nunits - 1)))
1782 {
1783 if (dump_enabled_p ())
1784 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1785 "BB vectorization with gaps at the end of "
1786 "a load is not supported\n");
1787 return false;
1788 }
1789
1790 /* Verify the permutation can be generated. */
1791 vec<tree> tem;
1792 unsigned n_perms;
1793 if (!vect_transform_slp_perm_load (node, tem, NULL,
1794 1, slp_instn, true, &n_perms))
1795 {
1796 if (dump_enabled_p ())
1797 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
1798 vect_location,
1799 "unsupported load permutation\n");
1800 return false;
1801 }
1802 }
1803 }
1804 return true;
1805 }
1806
1807 /* For loop vectorization verify we can generate the permutation. Be
1808 conservative about the vectorization factor; there are permutations
1809 that will use three vector inputs only starting from a specific factor
1810 and the vectorization factor is not yet final.
1811 ??? The SLP instance unrolling factor might not be the maximum one. */
1812 unsigned n_perms;
1813 poly_uint64 test_vf
1814 = force_common_multiple (SLP_INSTANCE_UNROLLING_FACTOR (slp_instn),
1815 LOOP_VINFO_VECT_FACTOR
1816 (STMT_VINFO_LOOP_VINFO (stmt_info)));
1817 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
1818 if (node->load_permutation.exists ()
1819 && !vect_transform_slp_perm_load (node, vNULL, NULL, test_vf,
1820 slp_instn, true, &n_perms))
1821 return false;
1822
1823 return true;
1824 }
1825
1826
1827 /* Find the last scalar stmt in the SLP tree rooted at NODE. */
1828
1829 stmt_vec_info
1830 vect_find_last_scalar_stmt_in_slp (slp_tree node)
1831 {
1832 stmt_vec_info last = NULL;
1833 stmt_vec_info stmt_vinfo;
1834
1835 for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
1836 {
1837 stmt_vinfo = vect_orig_stmt (stmt_vinfo);
1838 last = last ? get_later_stmt (stmt_vinfo, last) : stmt_vinfo;
1839 }
1840
1841 return last;
1842 }
1843
1844 /* Splits a group of stores, currently beginning at FIRST_VINFO, into
1845 two groups: one (still beginning at FIRST_VINFO) of size GROUP1_SIZE
1846 (also containing the first GROUP1_SIZE stmts, since stores are
1847 consecutive), the second containing the remainder.
1848 Return the first stmt in the second group. */
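   /* For example (a sketch): splitting a store group of size eight with
      no initial gap at GROUP1_SIZE == 4 yields two groups of four stmts,
      and each group ends up with DR_GROUP_GAP == 4 so that it skips over
      the other group's elements.  */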
1849
1850 static stmt_vec_info
1851 vect_split_slp_store_group (stmt_vec_info first_vinfo, unsigned group1_size)
1852 {
1853 gcc_assert (DR_GROUP_FIRST_ELEMENT (first_vinfo) == first_vinfo);
1854 gcc_assert (group1_size > 0);
1855 int group2_size = DR_GROUP_SIZE (first_vinfo) - group1_size;
1856 gcc_assert (group2_size > 0);
1857 DR_GROUP_SIZE (first_vinfo) = group1_size;
1858
1859 stmt_vec_info stmt_info = first_vinfo;
1860 for (unsigned i = group1_size; i > 1; i--)
1861 {
1862 stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info);
1863 gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
1864 }
1865 /* STMT_INFO is now the last element of the first group. */
1866 stmt_vec_info group2 = DR_GROUP_NEXT_ELEMENT (stmt_info);
1867 DR_GROUP_NEXT_ELEMENT (stmt_info) = 0;
1868
1869 DR_GROUP_SIZE (group2) = group2_size;
1870 for (stmt_info = group2; stmt_info;
1871 stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info))
1872 {
1873 DR_GROUP_FIRST_ELEMENT (stmt_info) = group2;
1874 gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
1875 }
1876
1877 /* For the second group, the DR_GROUP_GAP is that before the original group,
1878 plus skipping over the first group. */
1879 DR_GROUP_GAP (group2) = DR_GROUP_GAP (first_vinfo) + group1_size;
1880
1881 /* DR_GROUP_GAP of the first group now has to skip over the second group too. */
1882 DR_GROUP_GAP (first_vinfo) += group2_size;
1883
1884 if (dump_enabled_p ())
1885 dump_printf_loc (MSG_NOTE, vect_location, "Split group into %d and %d\n",
1886 group1_size, group2_size);
1887
1888 return group2;
1889 }
1890
1891 /* Calculate the unrolling factor for an SLP instance with GROUP_SIZE
1892 statements and a vector of NUNITS elements. */
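   /* For instance, with NUNITS == 4 and GROUP_SIZE == 6 this returns
      common_multiple (4, 6) / 6 == 12 / 6 == 2: two copies of the group
      fill three four-element vectors exactly.  */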
1893
1894 static poly_uint64
1895 calculate_unrolling_factor (poly_uint64 nunits, unsigned int group_size)
1896 {
1897 return exact_div (common_multiple (nunits, group_size), group_size);
1898 }
1899
1900 /* Analyze an SLP instance starting from a group of grouped stores. Call
1901 vect_build_slp_tree to build a tree of packed stmts if possible.
1902 Return FALSE if it's impossible to SLP any stmt in the loop. */
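   /* As a hypothetical illustration, the stores in

	for (i = 0; i < n; i++)
	  {
	    a[2*i] = b[2*i] + x;
	    a[2*i+1] = b[2*i+1] + y;
	  }

      form a group of size two from which an SLP instance can be built.  */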
1903
1904 static bool
1905 vect_analyze_slp_instance (vec_info *vinfo,
1906 stmt_vec_info stmt_info, unsigned max_tree_size)
1907 {
1908 slp_instance new_instance;
1909 slp_tree node;
1910 unsigned int group_size;
1911 tree vectype, scalar_type = NULL_TREE;
1912 unsigned int i;
1913 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
1914 vec<stmt_vec_info> scalar_stmts;
1915
1916 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1917 {
1918 scalar_type = TREE_TYPE (DR_REF (dr));
1919 vectype = get_vectype_for_scalar_type (scalar_type);
1920 group_size = DR_GROUP_SIZE (stmt_info);
1921 }
1922 else if (!dr && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
1923 {
1924 gcc_assert (is_a <loop_vec_info> (vinfo));
1925 vectype = STMT_VINFO_VECTYPE (stmt_info);
1926 group_size = REDUC_GROUP_SIZE (stmt_info);
1927 }
1928 else
1929 {
1930 gcc_assert (is_a <loop_vec_info> (vinfo));
1931 vectype = STMT_VINFO_VECTYPE (stmt_info);
1932 group_size = as_a <loop_vec_info> (vinfo)->reductions.length ();
1933 }
1934
1935 if (!vectype)
1936 {
1937 if (dump_enabled_p ())
1938 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1939 "Build SLP failed: unsupported data-type %T\n",
1940 scalar_type);
1941
1942 return false;
1943 }
1944 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
1945
1946 /* Create a node (a root of the SLP tree) for the packed grouped stores. */
1947 scalar_stmts.create (group_size);
1948 stmt_vec_info next_info = stmt_info;
1949 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1950 {
1951 /* Collect the stores and store them in SLP_TREE_SCALAR_STMTS. */
1952 while (next_info)
1953 {
1954 scalar_stmts.safe_push (vect_stmt_to_vectorize (next_info));
1955 next_info = DR_GROUP_NEXT_ELEMENT (next_info);
1956 }
1957 }
1958 else if (!dr && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
1959 {
1960 /* Collect the reduction stmts and store them in
1961 SLP_TREE_SCALAR_STMTS. */
1962 while (next_info)
1963 {
1964 scalar_stmts.safe_push (vect_stmt_to_vectorize (next_info));
1965 next_info = REDUC_GROUP_NEXT_ELEMENT (next_info);
1966 }
1967 /* Mark the first element of the reduction chain as reduction to properly
1968 transform the node. In the reduction analysis phase only the last
1969 element of the chain is marked as reduction. */
1970 STMT_VINFO_DEF_TYPE (stmt_info) = vect_reduction_def;
1971 STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))
1972 = STMT_VINFO_REDUC_DEF (vect_orig_stmt (scalar_stmts.last ()));
1973 }
1974 else
1975 {
1976 /* Collect reduction statements. */
1977 vec<stmt_vec_info> reductions = as_a <loop_vec_info> (vinfo)->reductions;
1978 for (i = 0; reductions.iterate (i, &next_info); i++)
1979 scalar_stmts.safe_push (next_info);
1980 }
1981
1982 /* Build the tree for the SLP instance. */
1983 bool *matches = XALLOCAVEC (bool, group_size);
1984 unsigned npermutes = 0;
1985 scalar_stmts_to_slp_tree_map_t *bst_map
1986 = new scalar_stmts_to_slp_tree_map_t ();
1987 poly_uint64 max_nunits = nunits;
1988 unsigned tree_size = 0;
1989 node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
1990 &max_nunits, matches, &npermutes,
1991 &tree_size, bst_map);
1992 /* The map keeps a reference on SLP nodes built, release that. */
1993 for (scalar_stmts_to_slp_tree_map_t::iterator it = bst_map->begin ();
1994 it != bst_map->end (); ++it)
1995 if ((*it).second)
1996 vect_free_slp_tree ((*it).second, false);
1997 delete bst_map;
1998 if (node != NULL)
1999 {
2000 /* Calculate the unrolling factor based on the smallest type. */
2001 poly_uint64 unrolling_factor
2002 = calculate_unrolling_factor (max_nunits, group_size);
2003
2004 if (maybe_ne (unrolling_factor, 1U)
2005 && is_a <bb_vec_info> (vinfo))
2006 {
2007 unsigned HOST_WIDE_INT const_max_nunits;
2008 if (!max_nunits.is_constant (&const_max_nunits)
2009 || const_max_nunits > group_size)
2010 {
2011 if (dump_enabled_p ())
2012 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2013 "Build SLP failed: store group "
2014 "size not a multiple of the vector size "
2015 "in basic block SLP\n");
2016 vect_free_slp_tree (node, false);
2017 return false;
2018 }
2019 /* Fatal mismatch. */
2020 matches[group_size / const_max_nunits * const_max_nunits] = false;
2021 vect_free_slp_tree (node, false);
2022 }
2023 else
2024 {
2025 /* Create a new SLP instance. */
2026 new_instance = XNEW (class _slp_instance);
2027 SLP_INSTANCE_TREE (new_instance) = node;
2028 SLP_INSTANCE_GROUP_SIZE (new_instance) = group_size;
2029 SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor;
2030 SLP_INSTANCE_LOADS (new_instance) = vNULL;
2031 vect_gather_slp_loads (new_instance, node);
2032 if (dump_enabled_p ())
2033 dump_printf_loc (MSG_NOTE, vect_location,
2034 "SLP size %u vs. limit %u.\n",
2035 tree_size, max_tree_size);
2036
2037 /* Compute the load permutation. */
2038 slp_tree load_node;
2039 bool loads_permuted = false;
2040 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (new_instance), i, load_node)
2041 {
2042 vec<unsigned> load_permutation;
2043 int j;
2044 stmt_vec_info load_info;
2045 bool this_load_permuted = false;
2046 load_permutation.create (group_size);
2047 stmt_vec_info first_stmt_info = DR_GROUP_FIRST_ELEMENT
2048 (SLP_TREE_SCALAR_STMTS (load_node)[0]);
2049 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load_info)
2050 {
2051 int load_place = vect_get_place_in_interleaving_chain
2052 (load_info, first_stmt_info);
2053 gcc_assert (load_place != -1);
2054 if (load_place != j)
2055 this_load_permuted = true;
2056 load_permutation.safe_push (load_place);
2057 }
2058 if (!this_load_permuted
2059 /* The load requires permutation when unrolling exposes
2060 a gap either because the group is larger than the SLP
2061 group-size or because there is a gap between the groups. */
2062 && (known_eq (unrolling_factor, 1U)
2063 || (group_size == DR_GROUP_SIZE (first_stmt_info)
2064 && DR_GROUP_GAP (first_stmt_info) == 0)))
2065 {
2066 load_permutation.release ();
2067 continue;
2068 }
2069 SLP_TREE_LOAD_PERMUTATION (load_node) = load_permutation;
2070 loads_permuted = true;
2071 }
2072
2073 if (loads_permuted)
2074 {
2075 if (!vect_supported_load_permutation_p (new_instance))
2076 {
2077 if (dump_enabled_p ())
2078 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2079 "Build SLP failed: unsupported load "
2080 "permutation %G", stmt_info->stmt);
2081 vect_free_slp_instance (new_instance, false);
2082 return false;
2083 }
2084 }
2085
2086 /* If the loads and stores can be handled with load/store-lanes
2087 instructions, do not generate this SLP instance. */
2088 if (is_a <loop_vec_info> (vinfo)
2089 && loads_permuted
2090 && dr && vect_store_lanes_supported (vectype, group_size, false))
2091 {
2092 slp_tree load_node;
2093 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (new_instance), i, load_node)
2094 {
2095 stmt_vec_info stmt_vinfo = DR_GROUP_FIRST_ELEMENT
2096 (SLP_TREE_SCALAR_STMTS (load_node)[0]);
2097 /* Use SLP for strided accesses (or if we can't load-lanes). */
2098 if (STMT_VINFO_STRIDED_P (stmt_vinfo)
2099 || ! vect_load_lanes_supported
2100 (STMT_VINFO_VECTYPE (stmt_vinfo),
2101 DR_GROUP_SIZE (stmt_vinfo), false))
2102 break;
2103 }
2104 if (i == SLP_INSTANCE_LOADS (new_instance).length ())
2105 {
2106 if (dump_enabled_p ())
2107 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2108 "Built SLP cancelled: can use "
2109 "load/store-lanes\n");
2110 vect_free_slp_instance (new_instance, false);
2111 return false;
2112 }
2113 }
2114
2115 vinfo->slp_instances.safe_push (new_instance);
2116
2117 if (dump_enabled_p ())
2118 {
2119 dump_printf_loc (MSG_NOTE, vect_location,
2120 "Final SLP tree for instance:\n");
2121 vect_print_slp_tree (MSG_NOTE, vect_location, node);
2122 }
2123
2124 return true;
2125 }
2126 }
2127 else
2128 {
2129 /* Failed to SLP. */
2130 /* Free the allocated memory. */
2131 scalar_stmts.release ();
2132 }
2133
2134 /* For basic block SLP, try to break the group up into multiples of the
2135 vector size. */
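  /* As a worked example: with const_nunits == 4 and a group of twelve
     stores whose first mismatch is at i == 6, group1_size becomes 4; the
     vector containing the mismatch (stmts 4-7) is skipped, and analysis
     is retried separately on stmts 0-3 and stmts 8-11.  */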
2136 unsigned HOST_WIDE_INT const_nunits;
2137 if (is_a <bb_vec_info> (vinfo)
2138 && STMT_VINFO_GROUPED_ACCESS (stmt_info)
2139 && DR_GROUP_FIRST_ELEMENT (stmt_info)
2140 && nunits.is_constant (&const_nunits))
2141 {
2142 /* We consider breaking the group only on VF boundaries from the existing
2143 start. */
2144 for (i = 0; i < group_size; i++)
2145 if (!matches[i]) break;
2146
2147 if (i >= const_nunits && i < group_size)
2148 {
2149 /* Split into two groups at the first vector boundary before i. */
2150 gcc_assert ((const_nunits & (const_nunits - 1)) == 0);
2151 unsigned group1_size = i & ~(const_nunits - 1);
2152
2153 stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
2154 group1_size);
2155 bool res = vect_analyze_slp_instance (vinfo, stmt_info,
2156 max_tree_size);
2157 /* If the first non-match was in the middle of a vector,
2158 skip the rest of that vector. */
2159 if (group1_size < i)
2160 {
2161 i = group1_size + const_nunits;
2162 if (i < group_size)
2163 rest = vect_split_slp_store_group (rest, const_nunits);
2164 }
2165 if (i < group_size)
2166 res |= vect_analyze_slp_instance (vinfo, rest, max_tree_size);
2167 return res;
2168 }
2169 /* Even though the first vector did not all match, we might be able to SLP
2170 (some) of the remainder. FORNOW ignore this possibility. */
2171 }
2172
2173 return false;
2174 }
2175
2176
2177 /* Check if there are stmts in the loop that can be vectorized using SLP. Build SLP
2178 trees of packed scalar stmts if SLP is possible. */
2179
2180 opt_result
2181 vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
2182 {
2183 unsigned int i;
2184 stmt_vec_info first_element;
2185
2186 DUMP_VECT_SCOPE ("vect_analyze_slp");
2187
2188 /* Find SLP sequences starting from groups of grouped stores. */
2189 FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
2190 vect_analyze_slp_instance (vinfo, first_element, max_tree_size);
2191
2192 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
2193 {
2194 if (loop_vinfo->reduction_chains.length () > 0)
2195 {
2196 /* Find SLP sequences starting from reduction chains. */
2197 FOR_EACH_VEC_ELT (loop_vinfo->reduction_chains, i, first_element)
2198 if (! vect_analyze_slp_instance (vinfo, first_element,
2199 max_tree_size))
2200 {
2201 /* Dissolve reduction chain group. */
2202 stmt_vec_info vinfo = first_element;
2203 while (vinfo)
2204 {
2205 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
2206 REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
2207 REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
2208 vinfo = next;
2209 }
2210 STMT_VINFO_DEF_TYPE (first_element) = vect_internal_def;
2211 }
2212 }
2213
2214 /* Find SLP sequences starting from groups of reductions. */
2215 if (loop_vinfo->reductions.length () > 1)
2216 vect_analyze_slp_instance (vinfo, loop_vinfo->reductions[0],
2217 max_tree_size);
2218 }
2219
2220 return opt_result::success ();
2221 }
2222
2223
2224 /* For each possible SLP instance decide whether to SLP it and calculate overall
2225 unrolling factor needed to SLP the loop. Return TRUE if decided to SLP at
2226 least one instance. */
2227
2228 bool
2229 vect_make_slp_decision (loop_vec_info loop_vinfo)
2230 {
2231 unsigned int i;
2232 poly_uint64 unrolling_factor = 1;
2233 vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
2234 slp_instance instance;
2235 int decided_to_slp = 0;
2236
2237 DUMP_VECT_SCOPE ("vect_make_slp_decision");
2238
2239 FOR_EACH_VEC_ELT (slp_instances, i, instance)
2240 {
2241 /* FORNOW: SLP if you can. */
2242 /* All unroll factors have the form current_vector_size * X for some
2243 rational X, so they must have a common multiple. */
2244 unrolling_factor
2245 = force_common_multiple (unrolling_factor,
2246 SLP_INSTANCE_UNROLLING_FACTOR (instance));
2247
2248 /* Mark all the stmts that belong to INSTANCE as PURE_SLP stmts. Later we
2249 call vect_detect_hybrid_slp () to find stmts that need hybrid SLP and
2250 loop-based vectorization. Such stmts will be marked as HYBRID. */
2251 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
2252 decided_to_slp++;
2253 }
2254
2255 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo) = unrolling_factor;
2256
2257 if (decided_to_slp && dump_enabled_p ())
2258 {
2259 dump_printf_loc (MSG_NOTE, vect_location,
2260 "Decided to SLP %d instances. Unrolling factor ",
2261 decided_to_slp);
2262 dump_dec (MSG_NOTE, unrolling_factor);
2263 dump_printf (MSG_NOTE, "\n");
2264 }
2265
2266 return (decided_to_slp > 0);
2267 }
2268
2269
2270 /* Find stmts that must be both vectorized and SLPed (since they feed stmts that
2271 can't be SLPed) in the tree rooted at NODE. Mark such stmts as HYBRID. */
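   /* A hypothetical illustration: in

	for (i = 0; i < n; i++)
	  {
	    a[2*i] = b[2*i] + 1;
	    a[2*i+1] = b[2*i+1] + 1;
	    sum += b[2*i];
	  }

      the load of b[2*i] belongs to the SLP instance rooted at the store
      group, but also feeds the non-SLP reduction, so it has to be marked
      hybrid and be vectorized by loop-based vectorization as well.  */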
2272
2273 static void
2274 vect_detect_hybrid_slp_stmts (slp_tree node, unsigned i, slp_vect_type stype,
2275 hash_map<slp_tree, unsigned> &visited)
2276 {
2277 stmt_vec_info stmt_vinfo = SLP_TREE_SCALAR_STMTS (node)[i];
2278 imm_use_iterator imm_iter;
2279 gimple *use_stmt;
2280 stmt_vec_info use_vinfo;
2281 slp_tree child;
2282 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
2283 int j;
2284
2285 /* We need to union stype over the incoming graph edges but we still
2286 want to limit recursion to stay O(N+E). */
2287 bool only_edge = (++visited.get_or_insert (node) < node->refcnt);
2288
2289 /* Propagate hybrid down the SLP tree. */
2290 if (stype == hybrid)
2291 ;
2292 else if (HYBRID_SLP_STMT (stmt_vinfo))
2293 stype = hybrid;
2294 else if (!only_edge)
2295 {
2296 /* Check if a pure SLP stmt has uses in non-SLP stmts. */
2297 gcc_checking_assert (PURE_SLP_STMT (stmt_vinfo));
2298 /* If we get a pattern stmt here we have to use the LHS of the
2299 original stmt for immediate uses. */
2300 gimple *stmt = vect_orig_stmt (stmt_vinfo)->stmt;
2301 tree def;
2302 if (gimple_code (stmt) == GIMPLE_PHI)
2303 def = gimple_phi_result (stmt);
2304 else
2305 def = SINGLE_SSA_TREE_OPERAND (stmt, SSA_OP_DEF);
2306 if (def)
2307 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, def)
2308 {
2309 use_vinfo = loop_vinfo->lookup_stmt (use_stmt);
2310 if (!use_vinfo)
2311 continue;
2312 use_vinfo = vect_stmt_to_vectorize (use_vinfo);
2313 if (!STMT_SLP_TYPE (use_vinfo)
2314 && (STMT_VINFO_RELEVANT (use_vinfo)
2315 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (use_vinfo)))
2316 && !(gimple_code (use_stmt) == GIMPLE_PHI
2317 && STMT_VINFO_DEF_TYPE (use_vinfo) == vect_reduction_def))
2318 {
2319 if (dump_enabled_p ())
2320 dump_printf_loc (MSG_NOTE, vect_location, "use of SLP "
2321 "def in non-SLP stmt: %G", use_stmt);
2322 stype = hybrid;
2323 }
2324 }
2325 }
2326
2327 if (stype == hybrid
2328 && !HYBRID_SLP_STMT (stmt_vinfo))
2329 {
2330 if (dump_enabled_p ())
2331 dump_printf_loc (MSG_NOTE, vect_location, "marking hybrid: %G",
2332 stmt_vinfo->stmt);
2333 STMT_SLP_TYPE (stmt_vinfo) = hybrid;
2334 }
2335
2336 if (!only_edge)
2337 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
2338 if (SLP_TREE_DEF_TYPE (child) != vect_external_def)
2339 vect_detect_hybrid_slp_stmts (child, i, stype, visited);
2340 }
2341
2342 static void
2343 vect_detect_hybrid_slp_stmts (slp_tree node, unsigned i, slp_vect_type stype)
2344 {
2345 hash_map<slp_tree, unsigned> visited;
2346 vect_detect_hybrid_slp_stmts (node, i, stype, visited);
2347 }
2348
2349 /* Helpers for vect_detect_hybrid_slp walking pattern stmt uses. */
2350
2351 static tree
2352 vect_detect_hybrid_slp_1 (tree *tp, int *, void *data)
2353 {
2354 walk_stmt_info *wi = (walk_stmt_info *)data;
2355 loop_vec_info loop_vinfo = (loop_vec_info) wi->info;
2356
2357 if (wi->is_lhs)
2358 return NULL_TREE;
2359
2360 stmt_vec_info def_stmt_info = loop_vinfo->lookup_def (*tp);
2361 if (def_stmt_info && PURE_SLP_STMT (def_stmt_info))
2362 {
2363 if (dump_enabled_p ())
2364 dump_printf_loc (MSG_NOTE, vect_location, "marking hybrid: %G",
2365 def_stmt_info->stmt);
2366 STMT_SLP_TYPE (def_stmt_info) = hybrid;
2367 }
2368
2369 return NULL_TREE;
2370 }
2371
2372 static tree
2373 vect_detect_hybrid_slp_2 (gimple_stmt_iterator *gsi, bool *handled,
2374 walk_stmt_info *wi)
2375 {
2376 loop_vec_info loop_vinfo = (loop_vec_info) wi->info;
2377 stmt_vec_info use_vinfo = loop_vinfo->lookup_stmt (gsi_stmt (*gsi));
2378 /* If the stmt is in an SLP instance then this isn't a reason
2379 to mark use definitions in other SLP instances as hybrid. */
2380 if (! STMT_SLP_TYPE (use_vinfo)
2381 && (STMT_VINFO_RELEVANT (use_vinfo)
2382 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (use_vinfo)))
2383 && ! (gimple_code (gsi_stmt (*gsi)) == GIMPLE_PHI
2384 && STMT_VINFO_DEF_TYPE (use_vinfo) == vect_reduction_def))
2385 ;
2386 else
2387 *handled = true;
2388 return NULL_TREE;
2389 }
2390
2391 /* Find stmts that must be both vectorized and SLPed. */
2392
2393 void
2394 vect_detect_hybrid_slp (loop_vec_info loop_vinfo)
2395 {
2396 unsigned int i;
2397 vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
2398 slp_instance instance;
2399
2400 DUMP_VECT_SCOPE ("vect_detect_hybrid_slp");
2401
2402 /* First walk all pattern stmts in the loop and mark the defs they use
2403 as hybrid, because immediate uses inside pattern stmts are not recorded. */
2404 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2405 {
2406 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2407 for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
2408 gsi_next (&gsi))
2409 {
2410 gimple *stmt = gsi_stmt (gsi);
2411 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
2412 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2413 {
2414 walk_stmt_info wi;
2415 memset (&wi, 0, sizeof (wi));
2416 wi.info = loop_vinfo;
2417 gimple_stmt_iterator gsi2
2418 = gsi_for_stmt (STMT_VINFO_RELATED_STMT (stmt_info)->stmt);
2419 walk_gimple_stmt (&gsi2, vect_detect_hybrid_slp_2,
2420 vect_detect_hybrid_slp_1, &wi);
2421 walk_gimple_seq (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info),
2422 vect_detect_hybrid_slp_2,
2423 vect_detect_hybrid_slp_1, &wi);
2424 }
2425 }
2426 }
2427
2428 /* Then walk the SLP instance trees marking stmts with uses in
2429 non-SLP stmts as hybrid, also propagating hybrid down the
2430 SLP tree, collecting the above info on-the-fly. */
2431 FOR_EACH_VEC_ELT (slp_instances, i, instance)
2432 {
2433 for (unsigned i = 0; i < SLP_INSTANCE_GROUP_SIZE (instance); ++i)
2434 vect_detect_hybrid_slp_stmts (SLP_INSTANCE_TREE (instance),
2435 i, pure_slp);
2436 }
2437 }
2438
2439
2440 /* Initialize a bb_vec_info struct for the statements between
2441 REGION_BEGIN_IN (inclusive) and REGION_END_IN (exclusive). */
2442
2443 _bb_vec_info::_bb_vec_info (gimple_stmt_iterator region_begin_in,
2444 gimple_stmt_iterator region_end_in,
2445 vec_info_shared *shared)
2446 : vec_info (vec_info::bb, init_cost (NULL), shared),
2447 bb (gsi_bb (region_begin_in)),
2448 region_begin (region_begin_in),
2449 region_end (region_end_in)
2450 {
2451 gimple_stmt_iterator gsi;
2452
2453 for (gsi = region_begin; gsi_stmt (gsi) != gsi_stmt (region_end);
2454 gsi_next (&gsi))
2455 {
2456 gimple *stmt = gsi_stmt (gsi);
2457 gimple_set_uid (stmt, 0);
2458 add_stmt (stmt);
2459 }
2460
2461 bb->aux = this;
2462 }
2463
2464
2465 /* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the
2466 stmts in the basic block. */
2467
2468 _bb_vec_info::~_bb_vec_info ()
2469 {
2470 for (gimple_stmt_iterator si = region_begin;
2471 gsi_stmt (si) != gsi_stmt (region_end); gsi_next (&si))
2472 /* Reset region marker. */
2473 gimple_set_uid (gsi_stmt (si), -1);
2474
2475 bb->aux = NULL;
2476 }
2477
2478 /* Subroutine of vect_slp_analyze_node_operations. Handle the root of NODE,
2479 given that its child nodes have already been processed, and that
2480 their def types currently match their SLP node's def type. */
2481
2482 static bool
2483 vect_slp_analyze_node_operations_1 (vec_info *vinfo, slp_tree node,
2484 slp_instance node_instance,
2485 stmt_vector_for_cost *cost_vec)
2486 {
2487 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2488 gcc_assert (STMT_SLP_TYPE (stmt_info) != loop_vect);
2489
2490 /* For BB vectorization vector types are assigned here.
2491 Memory accesses already got their vector type assigned
2492 in vect_analyze_data_refs. */
2493 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2494 if (bb_vinfo
2495 && ! STMT_VINFO_DATA_REF (stmt_info))
2496 {
2497 tree vectype, nunits_vectype;
2498 if (!vect_get_vector_types_for_stmt (stmt_info, &vectype,
2499 &nunits_vectype))
2500 /* We checked this when building the node. */
2501 gcc_unreachable ();
2502 if (vectype == boolean_type_node)
2503 {
2504 vectype = vect_get_mask_type_for_stmt (stmt_info);
2505 if (!vectype)
2506 /* vect_get_mask_type_for_stmt has already explained the
2507 failure. */
2508 return false;
2509 }
2510
2511 stmt_vec_info sstmt_info;
2512 unsigned int i;
2513 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, sstmt_info)
2514 STMT_VINFO_VECTYPE (sstmt_info) = vectype;
2515 }
2516
2517 /* Calculate the number of vector statements to be created for the
2518 scalar stmts in this node. For SLP reductions it is equal to the
2519 number of vector statements in the children (which has already been
2520 calculated by the recursive call). Otherwise it is the number of
2521 scalar elements in one scalar iteration (DR_GROUP_SIZE) multiplied by
2522 VF divided by the number of elements in a vector. */
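  /* E.g., with VF == 8, a group size of 2 and four-element vectors,
     8 * 2 / 4 == 4 vector statements are created.  */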
2523 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2524 && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
2525 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
2526 = SLP_TREE_NUMBER_OF_VEC_STMTS (SLP_TREE_CHILDREN (node)[0]);
2527 else
2528 {
2529 poly_uint64 vf;
2530 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
2531 vf = loop_vinfo->vectorization_factor;
2532 else
2533 vf = 1;
2534 unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (node_instance);
2535 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2536 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
2537 = vect_get_num_vectors (vf * group_size, vectype);
2538 }
2539
2540 bool dummy;
2541 return vect_analyze_stmt (stmt_info, &dummy, node, node_instance, cost_vec);
2542 }
2543
2544 /* Analyze statements contained in SLP tree NODE after recursively analyzing
2545 the subtree. NODE_INSTANCE is the SLP instance containing NODE, and VINFO contains NODE_INSTANCE.
2546
2547 Return true if the operations are supported. */
2548
2549 static bool
2550 vect_slp_analyze_node_operations (vec_info *vinfo, slp_tree node,
2551 slp_instance node_instance,
2552 scalar_stmts_to_slp_tree_map_t *visited,
2553 scalar_stmts_to_slp_tree_map_t *lvisited,
2554 stmt_vector_for_cost *cost_vec)
2555 {
2556 int i, j;
2557 slp_tree child;
2558
2559 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2560 return true;
2561
2562 /* If we already analyzed the exact same set of scalar stmts we're done.
2563 We share the generated vector stmts for those. */
2564 slp_tree *leader;
2565 if ((leader = visited->get (SLP_TREE_SCALAR_STMTS (node)))
2566 || (leader = lvisited->get (SLP_TREE_SCALAR_STMTS (node))))
2567 {
2568 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
2569 = SLP_TREE_NUMBER_OF_VEC_STMTS (*leader);
2570 return true;
2571 }
2572
2573 /* The SLP graph is acyclic, so not caching whether we failed or
2574 succeeded is harmless: we simply throw away the lvisited set
2575 when we fail. */
2576 lvisited->put (SLP_TREE_SCALAR_STMTS (node).copy (), node);
2577
2578 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2579 if (!vect_slp_analyze_node_operations (vinfo, child, node_instance,
2580 visited, lvisited, cost_vec))
2581 return false;
2582
2583 /* ??? We have to catch late the case where the same first scalar stmt
2584 appears in multiple SLP children with different def types, and fail. Remember
2585 original def types first since SLP_TREE_DEF_TYPE doesn't necessarily
2586 match it when that is vect_internal_def. */
2587 auto_vec<vect_def_type, 4> dt;
2588 dt.safe_grow (SLP_TREE_CHILDREN (node).length ());
2589 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
2590 dt[j] = STMT_VINFO_DEF_TYPE (SLP_TREE_SCALAR_STMTS (child)[0]);
2591
2592 /* Push SLP node def-type to stmt operands. */
2593 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
2594 if (SLP_TREE_DEF_TYPE (child) != vect_internal_def)
2595 STMT_VINFO_DEF_TYPE (SLP_TREE_SCALAR_STMTS (child)[0])
2596 = SLP_TREE_DEF_TYPE (child);
2597
2598 /* Check everything worked out. */
2599 bool res = true;
2600 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
2601 if (SLP_TREE_DEF_TYPE (child) != vect_internal_def)
2602 {
2603 if (STMT_VINFO_DEF_TYPE (SLP_TREE_SCALAR_STMTS (child)[0])
2604 != SLP_TREE_DEF_TYPE (child))
2605 res = false;
2606 }
2607 else if (STMT_VINFO_DEF_TYPE (SLP_TREE_SCALAR_STMTS (child)[0]) != dt[j])
2608 res = false;
2609 if (!res && dump_enabled_p ())
2610 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2611 "not vectorized: same operand with different "
2612 "def type in stmt.\n");
2613
2614 if (res)
2615 res = vect_slp_analyze_node_operations_1 (vinfo, node, node_instance,
2616 cost_vec);
2617
2618 /* Restore def-types. */
2619 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
2620 STMT_VINFO_DEF_TYPE (SLP_TREE_SCALAR_STMTS (child)[0]) = dt[j];
2621
2622 return res;
2623 }
2624
2625
2626 /* Analyze statements in SLP instances of VINFO. Return true if the
2627 operations are supported. */
2628
2629 bool
2630 vect_slp_analyze_operations (vec_info *vinfo)
2631 {
2632 slp_instance instance;
2633 int i;
2634
2635 DUMP_VECT_SCOPE ("vect_slp_analyze_operations");
2636
2637 scalar_stmts_to_slp_tree_map_t *visited
2638 = new scalar_stmts_to_slp_tree_map_t ();
2639 for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
2640 {
2641 scalar_stmts_to_slp_tree_map_t lvisited;
2642 stmt_vector_for_cost cost_vec;
2643 cost_vec.create (2);
2644 if (!vect_slp_analyze_node_operations (vinfo,
2645 SLP_INSTANCE_TREE (instance),
2646 instance, visited, &lvisited,
2647 &cost_vec))
2648 {
2649 slp_tree node = SLP_INSTANCE_TREE (instance);
2650 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2651 if (dump_enabled_p ())
2652 dump_printf_loc (MSG_NOTE, vect_location,
2653 "removing SLP instance operations starting from: %G",
2654 stmt_info->stmt);
2655 vect_free_slp_instance (instance, false);
2656 vinfo->slp_instances.ordered_remove (i);
2657 cost_vec.release ();
2658 }
2659 else
2660 {
2661 for (scalar_stmts_to_slp_tree_map_t::iterator x = lvisited.begin();
2662 x != lvisited.end(); ++x)
2663 visited->put ((*x).first.copy (), (*x).second);
2664 i++;
2665
2666 add_stmt_costs (vinfo->target_cost_data, &cost_vec);
2667 cost_vec.release ();
2668 }
2669 }
2670 delete visited;
2671
2672 return !vinfo->slp_instances.is_empty ();
2673 }
2674
2675
2676 /* Compute the scalar cost of the SLP node NODE and its children
2677 and record it in COST_VEC. Do not account defs that are marked
2678 in LIFE, and update LIFE according to uses of NODE. */
2679
2680 static void
2681 vect_bb_slp_scalar_cost (basic_block bb,
2682 slp_tree node, vec<bool, va_heap> *life,
2683 stmt_vector_for_cost *cost_vec,
2684 hash_set<slp_tree> &visited)
2685 {
2686 unsigned i;
2687 stmt_vec_info stmt_info;
2688 slp_tree child;
2689
2690 if (visited.add (node))
2691 return;
2692
2693 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2694 {
2695 gimple *stmt = stmt_info->stmt;
2696 vec_info *vinfo = stmt_info->vinfo;
2697 ssa_op_iter op_iter;
2698 def_operand_p def_p;
2699
2700 if ((*life)[i])
2701 continue;
2702
2703 /* If there is a non-vectorized use of the defs then the scalar
2704 stmt is kept live in which case we do not account it or any
2705 required defs in the SLP children in the scalar cost. This
2706 way we make the vectorization more costly when compared to
2707 the scalar cost. */
2708 FOR_EACH_SSA_DEF_OPERAND (def_p, stmt, op_iter, SSA_OP_DEF)
2709 {
2710 imm_use_iterator use_iter;
2711 gimple *use_stmt;
2712 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
2713 if (!is_gimple_debug (use_stmt))
2714 {
2715 stmt_vec_info use_stmt_info = vinfo->lookup_stmt (use_stmt);
2716 if (!use_stmt_info || !PURE_SLP_STMT (use_stmt_info))
2717 {
2718 (*life)[i] = true;
2719 BREAK_FROM_IMM_USE_STMT (use_iter);
2720 }
2721 }
2722 }
2723 if ((*life)[i])
2724 continue;
2725
2726 /* Count scalar stmts only once. */
2727 if (gimple_visited_p (stmt))
2728 continue;
2729 gimple_set_visited (stmt, true);
2730
2731 vect_cost_for_stmt kind;
2732 if (STMT_VINFO_DATA_REF (stmt_info))
2733 {
2734 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
2735 kind = scalar_load;
2736 else
2737 kind = scalar_store;
2738 }
2739 else
2740 kind = scalar_stmt;
2741 record_stmt_cost (cost_vec, 1, kind, stmt_info, 0, vect_body);
2742 }
2743
2744 auto_vec<bool, 20> subtree_life;
2745 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2746 {
2747 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
2748 {
2749 /* Do not directly pass LIFE to the recursive call, copy it to
2750 confine changes in the callee to the current child/subtree. */
2751 subtree_life.safe_splice (*life);
2752 vect_bb_slp_scalar_cost (bb, child, &subtree_life, cost_vec,
2753 visited);
2754 subtree_life.truncate (0);
2755 }
2756 }
2757 }
2758
2759 static void
2760 vect_bb_slp_scalar_cost (basic_block bb,
2761 slp_tree node, vec<bool, va_heap> *life,
2762 stmt_vector_for_cost *cost_vec)
2763 {
2764 hash_set<slp_tree> visited;
2765 vect_bb_slp_scalar_cost (bb, node, life, cost_vec, visited);
2766 }
2767
2768 /* Check if vectorization of the basic block is profitable. */
2769
2770 static bool
2771 vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo)
2772 {
2773 vec<slp_instance> slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
2774 slp_instance instance;
2775 int i;
2776 unsigned int vec_inside_cost = 0, vec_outside_cost = 0, scalar_cost = 0;
2777 unsigned int vec_prologue_cost = 0, vec_epilogue_cost = 0;
2778
2779 /* Calculate scalar cost. */
2780 stmt_vector_for_cost scalar_costs;
2781 scalar_costs.create (0);
2782 FOR_EACH_VEC_ELT (slp_instances, i, instance)
2783 {
2784 auto_vec<bool, 20> life;
2785 life.safe_grow_cleared (SLP_INSTANCE_GROUP_SIZE (instance));
2786 vect_bb_slp_scalar_cost (BB_VINFO_BB (bb_vinfo),
2787 SLP_INSTANCE_TREE (instance),
2788 &life, &scalar_costs);
2789 }
2790 void *target_cost_data = init_cost (NULL);
2791 add_stmt_costs (target_cost_data, &scalar_costs);
2792 scalar_costs.release ();
2793 unsigned dummy;
2794 finish_cost (target_cost_data, &dummy, &scalar_cost, &dummy);
2795 destroy_cost_data (target_cost_data);
2796
2797 /* Unset visited flag. */
2798 for (gimple_stmt_iterator gsi = bb_vinfo->region_begin;
2799 gsi_stmt (gsi) != gsi_stmt (bb_vinfo->region_end); gsi_next (&gsi))
2800 gimple_set_visited (gsi_stmt (gsi), false);
2801
2802 /* Complete the target-specific cost calculation. */
2803 finish_cost (BB_VINFO_TARGET_COST_DATA (bb_vinfo), &vec_prologue_cost,
2804 &vec_inside_cost, &vec_epilogue_cost);
2805
2806 vec_outside_cost = vec_prologue_cost + vec_epilogue_cost;
2807
2808 if (dump_enabled_p ())
2809 {
2810 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
2811 dump_printf (MSG_NOTE, " Vector inside of basic block cost: %d\n",
2812 vec_inside_cost);
2813 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n", vec_prologue_cost);
2814 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n", vec_epilogue_cost);
2815 dump_printf (MSG_NOTE, " Scalar cost of basic block: %d\n", scalar_cost);
2816 }
2817
2818 /* Vectorization is profitable if its cost does not exceed the cost of the
2819 scalar version. Note that we err on the vector side for equal cost because
2820 the cost estimate is otherwise quite pessimistic (constant uses are
2821 free on the scalar side but cost a load on the vector side for
2822 example). */
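  /* E.g., vec_prologue_cost 2 + vec_inside_cost 8 + vec_epilogue_cost 0
     gives 10, which is not more than a scalar_cost of 12, so such a
     block would be vectorized.  */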
2823 if (vec_outside_cost + vec_inside_cost > scalar_cost)
2824 return false;
2825
2826 return true;
2827 }
2828
2829 /* Check if the basic block can be vectorized. Returns a bb_vec_info
2830 if so, and sets FATAL to true if failure is independent of
2831 current_vector_size. */
2832
2833 static bb_vec_info
2834 vect_slp_analyze_bb_1 (gimple_stmt_iterator region_begin,
2835 gimple_stmt_iterator region_end,
2836 vec<data_reference_p> datarefs, int n_stmts,
2837 bool &fatal, vec_info_shared *shared)
2838 {
2839 DUMP_VECT_SCOPE ("vect_slp_analyze_bb");
2840
2841 bb_vec_info bb_vinfo;
2842 slp_instance instance;
2843 int i;
2844 poly_uint64 min_vf = 2;
2845
2846 /* The first group of checks is independent of the vector size. */
2847 fatal = true;
2848
2849 if (n_stmts > PARAM_VALUE (PARAM_SLP_MAX_INSNS_IN_BB))
2850 {
2851 if (dump_enabled_p ())
2852 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2853 "not vectorized: too many instructions in "
2854 "basic block.\n");
2855 free_data_refs (datarefs);
2856 return NULL;
2857 }
2858
2859 bb_vinfo = new _bb_vec_info (region_begin, region_end, shared);
2860 if (!bb_vinfo)
2861 return NULL;
2862
2863 BB_VINFO_DATAREFS (bb_vinfo) = datarefs;
2864 bb_vinfo->shared->save_datarefs ();
2865
2866 /* Analyze the data references. */
2867
2868 if (!vect_analyze_data_refs (bb_vinfo, &min_vf, NULL))
2869 {
2870 if (dump_enabled_p ())
2871 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2872 "not vectorized: unhandled data-ref in basic "
2873 "block.\n");
2874
2875 delete bb_vinfo;
2876 return NULL;
2877 }
2878
2879 if (BB_VINFO_DATAREFS (bb_vinfo).length () < 2)
2880 {
2881 if (dump_enabled_p ())
2882 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2883 "not vectorized: not enough data-refs in "
2884 "basic block.\n");
2885
2886 delete bb_vinfo;
2887 return NULL;
2888 }
2889
2890 if (!vect_analyze_data_ref_accesses (bb_vinfo))
2891 {
2892 if (dump_enabled_p ())
2893 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2894 "not vectorized: unhandled data access in "
2895 "basic block.\n");
2896
2897 delete bb_vinfo;
2898 return NULL;
2899 }
2900
2901 /* If there are no grouped stores in the region there is no need
2902 to continue with pattern recog as vect_analyze_slp will fail
2903 anyway. */
2904 if (bb_vinfo->grouped_stores.is_empty ())
2905 {
2906 if (dump_enabled_p ())
2907 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2908 "not vectorized: no grouped stores in "
2909 "basic block.\n");
2910
2911 delete bb_vinfo;
2912 return NULL;
2913 }
2914
2915 /* The rest of the analysis below depends on the vector size in some way. */
2916 fatal = false;
2917
2918 vect_pattern_recog (bb_vinfo);
2919
2920 /* Check the SLP opportunities in the basic block, analyze and build SLP
2921 trees. */
2922 if (!vect_analyze_slp (bb_vinfo, n_stmts))
2923 {
2924 if (dump_enabled_p ())
2925 {
2926 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2927 "Failed to SLP the basic block.\n");
2928 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2929 "not vectorized: failed to find SLP opportunities "
2930 "in basic block.\n");
2931 }
2932
2933 delete bb_vinfo;
2934 return NULL;
2935 }
2936
2937 vect_record_base_alignments (bb_vinfo);
2938
2939 /* Analyze and verify the alignment of data references and the
2940 dependence in the SLP instances. */
2941 for (i = 0; BB_VINFO_SLP_INSTANCES (bb_vinfo).iterate (i, &instance); )
2942 {
2943 if (! vect_slp_analyze_and_verify_instance_alignment (instance)
2944 || ! vect_slp_analyze_instance_dependence (instance))
2945 {
2946 slp_tree node = SLP_INSTANCE_TREE (instance);
2947 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2948 if (dump_enabled_p ())
2949 dump_printf_loc (MSG_NOTE, vect_location,
2950 "removing SLP instance operations starting from: %G",
2951 stmt_info->stmt);
2952 vect_free_slp_instance (instance, false);
2953 BB_VINFO_SLP_INSTANCES (bb_vinfo).ordered_remove (i);
2954 continue;
2955 }
2956
2957 /* Mark all the statements that we want to vectorize as pure SLP and
2958 relevant. */
2959 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
2960 vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance));
2961
2962 i++;
2963 }
2964 if (! BB_VINFO_SLP_INSTANCES (bb_vinfo).length ())
2965 {
2966 delete bb_vinfo;
2967 return NULL;
2968 }
2969
2970 if (!vect_slp_analyze_operations (bb_vinfo))
2971 {
2972 if (dump_enabled_p ())
2973 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2974 "not vectorized: bad operation in basic block.\n");
2975
2976 delete bb_vinfo;
2977 return NULL;
2978 }
2979
2980 /* Cost model: check if the vectorization is worthwhile. */
2981 if (!unlimited_cost_model (NULL)
2982 && !vect_bb_vectorization_profitable_p (bb_vinfo))
2983 {
2984 if (dump_enabled_p ())
2985 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2986 "not vectorized: vectorization is not "
2987 "profitable.\n");
2988
2989 delete bb_vinfo;
2990 return NULL;
2991 }
2992
2993 if (dump_enabled_p ())
2994 dump_printf_loc (MSG_NOTE, vect_location,
2995 "Basic block will be vectorized using SLP\n");
2996
2997 return bb_vinfo;
2998 }
2999
3000
3001 /* Main entry for the BB vectorizer. Analyze and transform BB; return
3002 true if anything in the basic-block was vectorized. */
3003
3004 bool
3005 vect_slp_bb (basic_block bb)
3006 {
3007 bb_vec_info bb_vinfo;
3008 gimple_stmt_iterator gsi;
3009 bool any_vectorized = false;
3010 auto_vector_sizes vector_sizes;
3011
3012 /* Autodetect first vector size we try. */
3013 current_vector_size = 0;
3014 targetm.vectorize.autovectorize_vector_sizes (&vector_sizes, false);
3015 unsigned int next_size = 0;
3016
3017 gsi = gsi_start_bb (bb);
3018
3019 poly_uint64 autodetected_vector_size = 0;
3020 while (1)
3021 {
3022 if (gsi_end_p (gsi))
3023 break;
3024
3025 gimple_stmt_iterator region_begin = gsi;
3026 vec<data_reference_p> datarefs = vNULL;
3027 int insns = 0;
3028
3029 for (; !gsi_end_p (gsi); gsi_next (&gsi))
3030 {
3031 gimple *stmt = gsi_stmt (gsi);
3032 if (is_gimple_debug (stmt))
3033 continue;
3034 insns++;
3035
3036 if (gimple_location (stmt) != UNKNOWN_LOCATION)
3037 vect_location = stmt;
3038
3039 if (!vect_find_stmt_data_reference (NULL, stmt, &datarefs))
3040 break;
3041 }
3042
3043 /* Skip leading unhandled stmts. */
3044 if (gsi_stmt (region_begin) == gsi_stmt (gsi))
3045 {
3046 gsi_next (&gsi);
3047 continue;
3048 }
3049
3050 gimple_stmt_iterator region_end = gsi;
3051
3052 bool vectorized = false;
3053 bool fatal = false;
3054 vec_info_shared shared;
3055 bb_vinfo = vect_slp_analyze_bb_1 (region_begin, region_end,
3056 datarefs, insns, fatal, &shared);
3057 if (bb_vinfo
3058 && dbg_cnt (vect_slp))
3059 {
3060 if (dump_enabled_p ())
3061 dump_printf_loc (MSG_NOTE, vect_location, "SLPing BB part\n");
3062
3063 bb_vinfo->shared->check_datarefs ();
3064 vect_schedule_slp (bb_vinfo);
3065
3066 unsigned HOST_WIDE_INT bytes;
3067 if (dump_enabled_p ())
3068 {
3069 if (current_vector_size.is_constant (&bytes))
3070 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
3071 "basic block part vectorized using %wu byte "
3072 "vectors\n", bytes);
3073 else
3074 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
3075 "basic block part vectorized using variable "
3076 "length vectors\n");
3077 }
3078
3079 vectorized = true;
3080 }
3081 delete bb_vinfo;
3082
3083 any_vectorized |= vectorized;
3084
3085 if (next_size == 0)
3086 autodetected_vector_size = current_vector_size;
3087
3088 if (next_size < vector_sizes.length ()
3089 && known_eq (vector_sizes[next_size], autodetected_vector_size))
3090 next_size += 1;
3091
3092 if (vectorized
3093 || next_size == vector_sizes.length ()
3094 || known_eq (current_vector_size, 0U)
3095 /* If vect_slp_analyze_bb_1 signaled that analysis for all
3096 vector sizes will fail, do not bother iterating. */
3097 || fatal)
3098 {
3099 if (gsi_end_p (region_end))
3100 break;
3101
3102 /* Skip the unhandled stmt. */
3103 gsi_next (&gsi);
3104
3105 /* And reset vector sizes. */
3106 current_vector_size = 0;
3107 next_size = 0;
3108 }
3109 else
3110 {
3111 /* Try the next biggest vector size. */
3112 current_vector_size = vector_sizes[next_size++];
3113 if (dump_enabled_p ())
3114 {
3115 dump_printf_loc (MSG_NOTE, vect_location,
3116 "***** Re-trying analysis with "
3117 "vector size ");
3118 dump_dec (MSG_NOTE, current_vector_size);
3119 dump_printf (MSG_NOTE, "\n");
3120 }
3121
3122 /* Start over. */
3123 gsi = region_begin;
3124 }
3125 }
3126
3127 return any_vectorized;
3128 }
3129
3130
3131 /* Return true if STMT_VINFO's constant operand requires a boolean vector type. */
3132
3133 static bool
3134 vect_mask_constant_operand_p (stmt_vec_info stmt_vinfo)
3135 {
3136 enum tree_code code = gimple_expr_code (stmt_vinfo->stmt);
3137 tree op, vectype;
3138 enum vect_def_type dt;
3139
3140 /* For comparisons and COND_EXPR the type is chosen depending
3141 on the other, non-constant comparison operand. */
3142 if (TREE_CODE_CLASS (code) == tcc_comparison)
3143 {
3144 gassign *stmt = as_a <gassign *> (stmt_vinfo->stmt);
3145 op = gimple_assign_rhs1 (stmt);
3146
3147 if (!vect_is_simple_use (op, stmt_vinfo->vinfo, &dt, &vectype))
3148 gcc_unreachable ();
3149
3150 return !vectype || VECTOR_BOOLEAN_TYPE_P (vectype);
3151 }
3152
3153 if (code == COND_EXPR)
3154 {
3155 gassign *stmt = as_a <gassign *> (stmt_vinfo->stmt);
3156 tree cond = gimple_assign_rhs1 (stmt);
3157
3158 if (TREE_CODE (cond) == SSA_NAME)
3159 op = cond;
3160 else
3161 op = TREE_OPERAND (cond, 0);
3162
3163 if (!vect_is_simple_use (op, stmt_vinfo->vinfo, &dt, &vectype))
3164 gcc_unreachable ();
3165
3166 return !vectype || VECTOR_BOOLEAN_TYPE_P (vectype);
3167 }
3168
3169 return VECTOR_BOOLEAN_TYPE_P (STMT_VINFO_VECTYPE (stmt_vinfo));
3170 }
3171
3172 /* Build a variable-length vector in which the elements in ELTS are repeated
3173 to fill NRESULTS vectors of type VECTOR_TYPE. Store the vectors in
3174 RESULTS and add any new instructions to SEQ.
3175
3176 The approach we use is:
3177
3178 (1) Find a vector mode VM with integer elements of mode IM.
3179
3180 (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
3181 ELTS' has mode IM. This involves creating NELTS' VIEW_CONVERT_EXPRs
3182 from small vectors to IM.
3183
3184 (3) Duplicate each ELTS'[I] into a vector of mode VM.
3185
3186 (4) Use a tree of interleaving VEC_PERM_EXPRs to create VMs with the
3187 correct byte contents.
3188
3189 (5) Use VIEW_CONVERT_EXPR to cast the final VMs to the required type.
3190
3191 We try to find the largest IM for which this sequence works, in order
3192 to cut down on the number of interleaves. */
3193
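/* A rough walk-through, assuming NVECTORS == 2 and ELTS == { a, b, c, d }:
   step (2) builds the partial vectors { a, b } and { c, d } and
   view-converts them to integers A and B; step (3) duplicates those into
   { A, A, ... } and { B, B, ... }; one round of step (4) interleaves them
   into { A, B, A, B, ... }; and step (5) view-converts the result back,
   yielding the repeating pattern { a, b, c, d, a, b, c, d, ... }.  */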
3194 void
3195 duplicate_and_interleave (gimple_seq *seq, tree vector_type, vec<tree> elts,
3196 unsigned int nresults, vec<tree> &results)
3197 {
3198 unsigned int nelts = elts.length ();
3199 tree element_type = TREE_TYPE (vector_type);
3200
3201 /* (1) Find a vector mode VM with integer elements of mode IM. */
3202 unsigned int nvectors = 1;
3203 tree new_vector_type;
3204 tree permutes[2];
3205 if (!can_duplicate_and_interleave_p (nelts, TYPE_MODE (element_type),
3206 &nvectors, &new_vector_type,
3207 permutes))
3208 gcc_unreachable ();
3209
3210 /* Get a vector type that holds ELTS[0:NELTS/NELTS']. */
3211 unsigned int partial_nelts = nelts / nvectors;
3212 tree partial_vector_type = build_vector_type (element_type, partial_nelts);
3213
3214 tree_vector_builder partial_elts;
3215 auto_vec<tree, 32> pieces (nvectors * 2);
3216 pieces.quick_grow (nvectors * 2);
3217 for (unsigned int i = 0; i < nvectors; ++i)
3218 {
3219 /* (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
3220 ELTS' has mode IM. */
3221 partial_elts.new_vector (partial_vector_type, partial_nelts, 1);
3222 for (unsigned int j = 0; j < partial_nelts; ++j)
3223 partial_elts.quick_push (elts[i * partial_nelts + j]);
3224 tree t = gimple_build_vector (seq, &partial_elts);
3225 t = gimple_build (seq, VIEW_CONVERT_EXPR,
3226 TREE_TYPE (new_vector_type), t);
3227
3228 /* (3) Duplicate each ELTS'[I] into a vector of mode VM. */
3229 pieces[i] = gimple_build_vector_from_val (seq, new_vector_type, t);
3230 }
3231
3232 /* (4) Use a tree of VEC_PERM_EXPRs to create a single VM with the
3233 correct byte contents.
3234
3235 We need to repeat the following operation log2(nvectors) times:
3236
3237 out[i * 2] = VEC_PERM_EXPR (in[i], in[i + hi_start], lo_permute);
3238 out[i * 2 + 1] = VEC_PERM_EXPR (in[i], in[i + hi_start], hi_permute);
3239
3240 However, if each input repeats every N elements and the VF is
3241 a multiple of N * 2, the HI result is the same as the LO. */
3242 unsigned int in_start = 0;
3243 unsigned int out_start = nvectors;
3244 unsigned int hi_start = nvectors / 2;
3245 /* A bound on the number of outputs needed to produce NRESULTS results
3246 in the final iteration. */
3247 unsigned int noutputs_bound = nvectors * nresults;
3248 for (unsigned int in_repeat = 1; in_repeat < nvectors; in_repeat *= 2)
3249 {
3250 noutputs_bound /= 2;
3251 unsigned int limit = MIN (noutputs_bound, nvectors);
3252 for (unsigned int i = 0; i < limit; ++i)
3253 {
3254 if ((i & 1) != 0
3255 && multiple_p (TYPE_VECTOR_SUBPARTS (new_vector_type),
3256 2 * in_repeat))
3257 {
3258 pieces[out_start + i] = pieces[out_start + i - 1];
3259 continue;
3260 }
3261
3262 tree output = make_ssa_name (new_vector_type);
3263 tree input1 = pieces[in_start + (i / 2)];
3264 tree input2 = pieces[in_start + (i / 2) + hi_start];
3265 gassign *stmt = gimple_build_assign (output, VEC_PERM_EXPR,
3266 input1, input2,
3267 permutes[i & 1]);
3268 gimple_seq_add_stmt (seq, stmt);
3269 pieces[out_start + i] = output;
3270 }
3271 std::swap (in_start, out_start);
3272 }
3273
3274 /* (5) Use VIEW_CONVERT_EXPR to cast the final VM to the required type. */
3275 results.reserve (nresults);
3276 for (unsigned int i = 0; i < nresults; ++i)
3277 if (i < nvectors)
3278 results.quick_push (gimple_build (seq, VIEW_CONVERT_EXPR, vector_type,
3279 pieces[in_start + i]));
3280 else
3281 results.quick_push (results[i - nvectors]);
3282 }
3283
3284
3285 /* For constant and loop invariant defs of SLP_NODE this function returns
3286 (vector) defs (VEC_OPRNDS) that will be used in the vectorized stmts.
3287 OP_NUM determines if we gather defs for operand 0 or operand 1 of the RHS of
3288 scalar stmts. NUMBER_OF_VECTORS is the number of vector defs to create. */
3291
3292 static void
3293 vect_get_constant_vectors (tree op, slp_tree slp_node,
3294 vec<tree> *vec_oprnds,
3295 unsigned int op_num, unsigned int number_of_vectors)
3296 {
3297 vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
3298 stmt_vec_info stmt_vinfo = stmts[0];
3299 gimple *stmt = stmt_vinfo->stmt;
3300 unsigned HOST_WIDE_INT nunits;
3301 tree vec_cst;
3302 unsigned j, number_of_places_left_in_vector;
3303 tree vector_type;
3304 tree vop;
3305 int group_size = stmts.length ();
3306 unsigned int vec_num, i;
3307 unsigned number_of_copies = 1;
3308 vec<tree> voprnds;
3309 voprnds.create (number_of_vectors);
3310 bool constant_p, is_store;
3311 tree neutral_op = NULL;
3312 enum tree_code code = gimple_expr_code (stmt);
3313 gimple_seq ctor_seq = NULL;
3314 auto_vec<tree, 16> permute_results;
3315
3316 /* Check if vector type is a boolean vector. */
3317 if (VECT_SCALAR_BOOLEAN_TYPE_P (TREE_TYPE (op))
3318 && vect_mask_constant_operand_p (stmt_vinfo))
3319 vector_type
3320 = build_same_sized_truth_vector_type (STMT_VINFO_VECTYPE (stmt_vinfo));
3321 else
3322 vector_type = get_vectype_for_scalar_type (TREE_TYPE (op));
3323
3324 if (STMT_VINFO_DATA_REF (stmt_vinfo))
3325 {
3326 is_store = true;
3327 op = gimple_assign_rhs1 (stmt);
3328 }
3329 else
3330 is_store = false;
3331
3332 gcc_assert (op);
3333
3334 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
3335 created vectors. It is greater than 1 if unrolling is performed.
3336
3337 For example, we have two scalar operands, s1 and s2 (e.g., group of
3338 strided accesses of size two), while NUNITS is four (i.e., four scalars
3339 of this type can be packed in a vector). The output vector will contain
3340 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
3341 will be 2).
3342
3343 If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
3344 containing the operands.
3345
3346 For example, NUNITS is four as before, and the group size is 8
3347 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
3348 {s5, s6, s7, s8}. */
3349
3350 /* When using duplicate_and_interleave, we just need one element for
3351 each scalar statement. */
3352 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
3353 nunits = group_size;
3354
3355 number_of_copies = nunits * number_of_vectors / group_size;
3356
3357 number_of_places_left_in_vector = nunits;
3358 constant_p = true;
3359 tree_vector_builder elts (vector_type, nunits, 1);
3360 elts.quick_grow (nunits);
3361 bool place_after_defs = false;
3362 for (j = 0; j < number_of_copies; j++)
3363 {
3364 for (i = group_size - 1; stmts.iterate (i, &stmt_vinfo); i--)
3365 {
3366 stmt = stmt_vinfo->stmt;
3367 if (is_store)
3368 op = gimple_assign_rhs1 (stmt);
3369 else
3370 {
3371 switch (code)
3372 {
3373 case COND_EXPR:
3374 {
3375 tree cond = gimple_assign_rhs1 (stmt);
3376 if (TREE_CODE (cond) == SSA_NAME)
3377 op = gimple_op (stmt, op_num + 1);
3378 else if (op_num == 0 || op_num == 1)
3379 op = TREE_OPERAND (cond, op_num);
3380 else
3381 {
3382 if (op_num == 2)
3383 op = gimple_assign_rhs2 (stmt);
3384 else
3385 op = gimple_assign_rhs3 (stmt);
3386 }
3387 }
3388 break;
3389
3390 case CALL_EXPR:
3391 op = gimple_call_arg (stmt, op_num);
3392 break;
3393
3394 case LSHIFT_EXPR:
3395 case RSHIFT_EXPR:
3396 case LROTATE_EXPR:
3397 case RROTATE_EXPR:
3398 op = gimple_op (stmt, op_num + 1);
3399 /* Unlike the other binary operators, shifts/rotates have
3400 the shift count being int, instead of the same type as
3401 the lhs, so make sure the scalar is the right type if
3402 we are dealing with vectors of
3403 long long/long/short/char. */
3404 if (op_num == 1 && TREE_CODE (op) == INTEGER_CST)
3405 op = fold_convert (TREE_TYPE (vector_type), op);
3406 break;
3407
3408 default:
3409 op = gimple_op (stmt, op_num + 1);
3410 break;
3411 }
3412 }
3413
3414 /* Create 'vect_ = {op0,op1,...,opn}'. */
3415 number_of_places_left_in_vector--;
3416 tree orig_op = op;
3417 if (!types_compatible_p (TREE_TYPE (vector_type), TREE_TYPE (op)))
3418 {
3419 if (CONSTANT_CLASS_P (op))
3420 {
3421 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
3422 {
3423 /* Can't use VIEW_CONVERT_EXPR for booleans because
3424 of possibly different sizes of scalar value and
3425 vector element. */
3426 if (integer_zerop (op))
3427 op = build_int_cst (TREE_TYPE (vector_type), 0);
3428 else if (integer_onep (op))
3429 op = build_all_ones_cst (TREE_TYPE (vector_type));
3430 else
3431 gcc_unreachable ();
3432 }
3433 else
3434 op = fold_unary (VIEW_CONVERT_EXPR,
3435 TREE_TYPE (vector_type), op);
3436 gcc_assert (op && CONSTANT_CLASS_P (op));
3437 }
3438 else
3439 {
3440 tree new_temp = make_ssa_name (TREE_TYPE (vector_type));
3441 gimple *init_stmt;
3442 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
3443 {
3444 tree true_val
3445 = build_all_ones_cst (TREE_TYPE (vector_type));
3446 tree false_val
3447 = build_zero_cst (TREE_TYPE (vector_type));
3448 gcc_assert (INTEGRAL_TYPE_P (TREE_TYPE (op)));
3449 init_stmt = gimple_build_assign (new_temp, COND_EXPR,
3450 op, true_val,
3451 false_val);
3452 }
3453 else
3454 {
3455 op = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (vector_type),
3456 op);
3457 init_stmt
3458 = gimple_build_assign (new_temp, VIEW_CONVERT_EXPR,
3459 op);
3460 }
3461 gimple_seq_add_stmt (&ctor_seq, init_stmt);
3462 op = new_temp;
3463 }
3464 }
3465 elts[number_of_places_left_in_vector] = op;
3466 if (!CONSTANT_CLASS_P (op))
3467 constant_p = false;
3468 if (TREE_CODE (orig_op) == SSA_NAME
3469 && !SSA_NAME_IS_DEFAULT_DEF (orig_op)
3470 && STMT_VINFO_BB_VINFO (stmt_vinfo)
3471 && (STMT_VINFO_BB_VINFO (stmt_vinfo)->bb
3472 == gimple_bb (SSA_NAME_DEF_STMT (orig_op))))
3473 place_after_defs = true;
3474
3475 if (number_of_places_left_in_vector == 0)
3476 {
3477 if (constant_p
3478 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
3479 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
3480 vec_cst = gimple_build_vector (&ctor_seq, &elts);
3481 else
3482 {
3483 if (vec_oprnds->is_empty ())
3484 duplicate_and_interleave (&ctor_seq, vector_type, elts,
3485 number_of_vectors,
3486 permute_results);
3487 vec_cst = permute_results[number_of_vectors - j - 1];
3488 }
3489 tree init;
3490 gimple_stmt_iterator gsi;
3491 if (place_after_defs)
3492 {
3493 stmt_vec_info last_stmt_info
3494 = vect_find_last_scalar_stmt_in_slp (slp_node);
3495 gsi = gsi_for_stmt (last_stmt_info->stmt);
3496 init = vect_init_vector (stmt_vinfo, vec_cst, vector_type,
3497 &gsi);
3498 }
3499 else
3500 init = vect_init_vector (stmt_vinfo, vec_cst, vector_type,
3501 NULL);
3502 if (ctor_seq != NULL)
3503 {
3504 gsi = gsi_for_stmt (SSA_NAME_DEF_STMT (init));
3505 gsi_insert_seq_before (&gsi, ctor_seq, GSI_SAME_STMT);
3506 ctor_seq = NULL;
3507 }
3508 voprnds.quick_push (init);
3509 place_after_defs = false;
3510 number_of_places_left_in_vector = nunits;
3511 constant_p = true;
3512 elts.new_vector (vector_type, nunits, 1);
3513 elts.quick_grow (nunits);
3514 }
3515 }
3516 }
3517
3518 /* Since the vectors are created in the reverse order, we have to
3519 reverse them here. */
3520 vec_num = voprnds.length ();
3521 for (j = vec_num; j != 0; j--)
3522 {
3523 vop = voprnds[j - 1];
3524 vec_oprnds->quick_push (vop);
3525 }
3526
3527 voprnds.release ();
3528
3529 /* If VF is greater than the unrolling factor needed for the SLP group
3530 of stmts, the NUMBER_OF_VECTORS to be created exceeds
3531 NUMBER_OF_SCALARS/NUNITS (or NUNITS/NUMBER_OF_SCALARS), so the
3532 vectors created above have to be replicated. */
3533 tree neutral_vec = NULL_TREE;
3534 while (number_of_vectors > vec_oprnds->length ())
3535 {
3536 /* Build the neutral vector only once and reuse it. */
3537 if (neutral_op)
3538 {
3539 if (!neutral_vec)
3540 neutral_vec = build_vector_from_val (vector_type, neutral_op);
3541
3542 vec_oprnds->quick_push (neutral_vec);
3543 }
3544 else
3545 {
3546 for (i = 0; vec_oprnds->iterate (i, &vop) && i < vec_num; i++)
3547 vec_oprnds->quick_push (vop);
3548 }
3549 }
3550 }
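/* A minimal standalone sketch (not part of GCC) of the replication step
   above: once the required operand vectors exist, the remaining slots are
   filled either with a neutral vector or by repeating the existing vectors
   cyclically.  All names and data here are illustrative; the block is
   disabled so it can be compiled separately to experiment.  */
#if 0
#include <stdio.h>

int
main (void)
{
  const char *oprnds[8];
  unsigned len = 2, number_of_vectors = 6, vec_num = 2;
  oprnds[0] = "v0";
  oprnds[1] = "v1";

  const char *neutral_vec = NULL; /* E.g. "{0,0,0,0}" for a sum.  */
  while (number_of_vectors > len)
    {
      if (neutral_vec)
	oprnds[len++] = neutral_vec;
      else
	/* Repeat the existing vectors cyclically: v0 v1 v0 v1 ...  */
	for (unsigned i = 0; i < vec_num && len < number_of_vectors; i++)
	  oprnds[len++] = oprnds[i];
    }

  for (unsigned i = 0; i < len; i++)
    printf ("%s ", oprnds[i]);	/* Prints: v0 v1 v0 v1 v0 v1.  */
  printf ("\n");
  return 0;
}
#endif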
3551
3552
3553 /* Get the vectorized definitions from SLP_NODE, which already contains
3554 the corresponding vectorized def-stmts. */
3555
3556 static void
3557 vect_get_slp_vect_defs (slp_tree slp_node, vec<tree> *vec_oprnds)
3558 {
3559 tree vec_oprnd;
3560 stmt_vec_info vec_def_stmt_info;
3561 unsigned int i;
3562
3563 gcc_assert (SLP_TREE_VEC_STMTS (slp_node).exists ());
3564
3565 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (slp_node), i, vec_def_stmt_info)
3566 {
3567 gcc_assert (vec_def_stmt_info);
3568 if (gphi *vec_def_phi = dyn_cast <gphi *> (vec_def_stmt_info->stmt))
3569 vec_oprnd = gimple_phi_result (vec_def_phi);
3570 else
3571 vec_oprnd = gimple_get_lhs (vec_def_stmt_info->stmt);
3572 vec_oprnds->quick_push (vec_oprnd);
3573 }
3574 }
3575
3576
3577 /* Get vectorized definitions for SLP_NODE.
3578 If the scalar definitions are loop invariants or constants, collect them
3579 and call vect_get_constant_vectors () to create vector stmts.
3580 Otherwise, the def-stmts must already be vectorized and the vectorized
3581 stmts must be stored in the corresponding child of SLP_NODE; we then call
3582 vect_get_slp_vect_defs () to retrieve them. */
3583
3584 void
3585 vect_get_slp_defs (vec<tree> ops, slp_tree slp_node,
3586 vec<vec<tree> > *vec_oprnds)
3587 {
3588 int number_of_vects = 0, i;
3589 unsigned int child_index = 0;
3590 HOST_WIDE_INT lhs_size_unit, rhs_size_unit;
3591 slp_tree child = NULL;
3592 vec<tree> vec_defs;
3593 tree oprnd;
3594 bool vectorized_defs;
3595
3596 stmt_vec_info first_stmt_info = SLP_TREE_SCALAR_STMTS (slp_node)[0];
3597 FOR_EACH_VEC_ELT (ops, i, oprnd)
3598 {
3599 /* For each operand we check whether it has vectorized definitions in a
3600 child node, or whether we need to create them (for invariants and
3601 constants). We check if the LHS of the first stmt of the next child
3602 matches OPRND. If it does, we found the correct child. Otherwise,
3603 we call vect_get_constant_vectors () and do not advance CHILD_INDEX,
3604 so that the same child node is checked again for the next operand. */
3605 vectorized_defs = false;
3606 if (SLP_TREE_CHILDREN (slp_node).length () > child_index)
3607 {
3608 child = SLP_TREE_CHILDREN (slp_node)[child_index];
3609
3610 /* We have to check both pattern and original def, if available. */
3611 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
3612 {
3613 stmt_vec_info first_def_info = SLP_TREE_SCALAR_STMTS (child)[0];
3614 stmt_vec_info related = STMT_VINFO_RELATED_STMT (first_def_info);
3615 tree first_def_op;
3616
3617 if (gphi *first_def = dyn_cast <gphi *> (first_def_info->stmt))
3618 first_def_op = gimple_phi_result (first_def);
3619 else
3620 first_def_op = gimple_get_lhs (first_def_info->stmt);
3621 if (operand_equal_p (oprnd, first_def_op, 0)
3622 || (related
3623 && operand_equal_p (oprnd,
3624 gimple_get_lhs (related->stmt), 0)))
3625 {
3626 /* The number of vector defs is determined by the number of
3627 vector statements in the node from which we get those
3628 statements. */
3629 number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (child);
3630 vectorized_defs = true;
3631 child_index++;
3632 }
3633 }
3634 else
3635 child_index++;
3636 }
3637
3638 if (!vectorized_defs)
3639 {
3640 if (i == 0)
3641 {
3642 number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
3643 /* The number of vector stmts was calculated according to the LHS
3644 in vect_schedule_slp_instance (); adjust it here if the
3645 smallest scalar type of the RHS differs. See
3646 vect_get_smallest_scalar_type () for details. */
3647 vect_get_smallest_scalar_type (first_stmt_info, &lhs_size_unit,
3648 &rhs_size_unit);
3649 if (rhs_size_unit != lhs_size_unit)
3650 {
3651 number_of_vects *= rhs_size_unit;
3652 number_of_vects /= lhs_size_unit;
3653 }
3654 }
3655 }
3656
3657 /* Allocate memory for vectorized defs. */
3658 vec_defs = vNULL;
3659 vec_defs.create (number_of_vects);
3660
3661 /* For reduction defs we end up calling vect_get_constant_vectors () too,
3662 since what we need are the initial loop-invariant values. */
3663 if (vectorized_defs)
3664 /* The defs are already vectorized. */
3665 vect_get_slp_vect_defs (child, &vec_defs);
3666 else
3667 /* Build vectors from scalar defs. */
3668 vect_get_constant_vectors (oprnd, slp_node, &vec_defs, i,
3669 number_of_vects);
3670
3671 vec_oprnds->quick_push (vec_defs);
3672 }
3673 }
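/* A standalone sketch (not part of GCC) of the NUMBER_OF_VECTS adjustment
   inside vect_get_slp_defs above: the count was computed from the LHS type,
   so when the smallest scalar type on the RHS differs it is rescaled by the
   size ratio.  The concrete types and values are illustrative only.  */
#if 0
#include <stdio.h>

int
main (void)
{
  /* E.g. a conversion whose LHS is int (4 bytes) but whose RHS operand is
     char (1 byte): four times as many RHS scalars fit in one vector, so
     four times fewer operand vectors are needed.  */
  int number_of_vects = 4;	/* Computed from the LHS.  */
  int lhs_size_unit = 4, rhs_size_unit = 1;

  if (rhs_size_unit != lhs_size_unit)
    {
      number_of_vects *= rhs_size_unit;
      number_of_vects /= lhs_size_unit;
    }
  printf ("%d\n", number_of_vects);	/* Prints: 1.  */
  return 0;
}
#endif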
3674
3675 /* Generate vector permute statements from a list of loads in DR_CHAIN.
3676 If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
3677 permute statements for the SLP node NODE of the SLP instance
3678 SLP_NODE_INSTANCE. */
3679
3680 bool
3681 vect_transform_slp_perm_load (slp_tree node, vec<tree> dr_chain,
3682 gimple_stmt_iterator *gsi, poly_uint64 vf,
3683 slp_instance slp_node_instance, bool analyze_only,
3684 unsigned *n_perms)
3685 {
3686 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
3687 vec_info *vinfo = stmt_info->vinfo;
3688 int vec_index = 0;
3689 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3690 unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (slp_node_instance);
3691 unsigned int mask_element;
3692 machine_mode mode;
3693
3694 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
3695 return false;
3696
3697 stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
3698
3699 mode = TYPE_MODE (vectype);
3700 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
3701
3702 /* Initialize the vect stmts of NODE so the generated stmts can be
3703 inserted in the proper places later. */
3704 if (! analyze_only)
3705 for (unsigned i = SLP_TREE_VEC_STMTS (node).length ();
3706 i < SLP_TREE_NUMBER_OF_VEC_STMTS (node); i++)
3707 SLP_TREE_VEC_STMTS (node).quick_push (NULL);
3708
3709 /* Generate permutation masks for every NODE. The number of masks for each
3710 NODE is equal to GROUP_SIZE.
3711 E.g., we have a group of three nodes with three loads from the same
3712 location in each node, and the vector size is 4. I.e., we have an
3713 a0b0c0a1b1c1... sequence and we need to create the following vectors:
3714 for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
3715 for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
3716 ...
3717
3718 The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}.
3719 The last mask is invalid: a permute takes only two input vectors, so
3720 each mask element must be below twice the vector length (8 here).
3721 Relative to its own two inputs the last mask becomes {2,5,5,5}.
3722 For the first two permutations we need the first and the second input
3723 vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
3724 we need the second and the third vectors: {b1,c1,a2,b2} and
3725 {c2,a3,b3,c3}. */
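/* A standalone sketch (not part of GCC) that reproduces the index
   arithmetic of the example above for the a's: DR group size 3, load
   permutation {0,0,0}, 4-element vectors.  It prints the adjusted masks
   {0,0,0,3} {3,3,6,6} {2,5,5,5} together with the input vectors each
   permute uses.  The three-vector rejection path is omitted; all names
   here are illustrative.  */
#if 0
#include <stdio.h>

int
main (void)
{
  const unsigned dr_group_size = 3, group_size = 3, nunits = 4;
  const unsigned perm[3] = { 0, 0, 0 };	/* The a-node's load permutation.  */
  unsigned index = 0;
  int first_vec = -1, second_vec = -1;
  unsigned mask[4];

  for (unsigned j = 0; j < 12; j++)	/* VF * GROUP_SIZE elements.  */
    {
      unsigned i = (j / group_size) * dr_group_size + perm[j % group_size];
      int vec = i / nunits;
      unsigned elt = i % nunits;
      if (first_vec == -1 || vec == first_vec)
	first_vec = vec;
      else
	{
	  second_vec = vec;
	  elt += nunits;	/* Select from the second input vector.  */
	}
      mask[index++] = elt;
      if (index == nunits)
	{
	  printf ("{%u,%u,%u,%u} inputs v%d/v%d\n", mask[0], mask[1],
		  mask[2], mask[3], first_vec,
		  second_vec == -1 ? first_vec : second_vec);
	  index = 0;
	  first_vec = second_vec = -1;
	}
    }
  return 0;
}
#endif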
3726
3727 int vect_stmts_counter = 0;
3728 unsigned int index = 0;
3729 int first_vec_index = -1;
3730 int second_vec_index = -1;
3731 bool noop_p = true;
3732 *n_perms = 0;
3733
3734 vec_perm_builder mask;
3735 unsigned int nelts_to_build;
3736 unsigned int nvectors_per_build;
3737 bool repeating_p = (group_size == DR_GROUP_SIZE (stmt_info)
3738 && multiple_p (nunits, group_size));
3739 if (repeating_p)
3740 {
3741 /* A single vector contains a whole number of copies of the node, so:
3742 (a) all permutes can use the same mask; and
3743 (b) the permutes only need a single vector input. */
3744 mask.new_vector (nunits, group_size, 3);
3745 nelts_to_build = mask.encoded_nelts ();
3746 nvectors_per_build = SLP_TREE_VEC_STMTS (node).length ();
3747 }
3748 else
3749 {
3750 /* We need to construct a separate mask for each vector statement. */
3751 unsigned HOST_WIDE_INT const_nunits, const_vf;
3752 if (!nunits.is_constant (&const_nunits)
3753 || !vf.is_constant (&const_vf))
3754 return false;
3755 mask.new_vector (const_nunits, const_nunits, 1);
3756 nelts_to_build = const_vf * group_size;
3757 nvectors_per_build = 1;
3758 }
3759
3760 unsigned int count = mask.encoded_nelts ();
3761 mask.quick_grow (count);
3762 vec_perm_indices indices;
3763
3764 for (unsigned int j = 0; j < nelts_to_build; j++)
3765 {
3766 unsigned int iter_num = j / group_size;
3767 unsigned int stmt_num = j % group_size;
3768 unsigned int i = (iter_num * DR_GROUP_SIZE (stmt_info)
3769 + SLP_TREE_LOAD_PERMUTATION (node)[stmt_num]);
3770 if (repeating_p)
3771 {
3772 first_vec_index = 0;
3773 mask_element = i;
3774 }
3775 else
3776 {
3777 /* Enforced before the loop when !repeating_p. */
3778 unsigned int const_nunits = nunits.to_constant ();
3779 vec_index = i / const_nunits;
3780 mask_element = i % const_nunits;
3781 if (vec_index == first_vec_index
3782 || first_vec_index == -1)
3783 {
3784 first_vec_index = vec_index;
3785 }
3786 else if (vec_index == second_vec_index
3787 || second_vec_index == -1)
3788 {
3789 second_vec_index = vec_index;
3790 mask_element += const_nunits;
3791 }
3792 else
3793 {
3794 if (dump_enabled_p ())
3795 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3796 "permutation requires at "
3797 "least three vectors %G",
3798 stmt_info->stmt);
3799 gcc_assert (analyze_only);
3800 return false;
3801 }
3802
3803 gcc_assert (mask_element < 2 * const_nunits);
3804 }
3805
3806 if (mask_element != index)
3807 noop_p = false;
3808 mask[index++] = mask_element;
3809
3810 if (index == count && !noop_p)
3811 {
3812 indices.new_vector (mask, second_vec_index == -1 ? 1 : 2, nunits);
3813 if (!can_vec_perm_const_p (mode, indices))
3814 {
3815 if (dump_enabled_p ())
3816 {
3817 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
3818 vect_location,
3819 "unsupported vect permute { ");
3820 for (i = 0; i < count; ++i)
3821 {
3822 dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
3823 dump_printf (MSG_MISSED_OPTIMIZATION, " ");
3824 }
3825 dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
3826 }
3827 gcc_assert (analyze_only);
3828 return false;
3829 }
3830
3831 ++*n_perms;
3832 }
3833
3834 if (index == count)
3835 {
3836 if (!analyze_only)
3837 {
3838 tree mask_vec = NULL_TREE;
3839
3840 if (! noop_p)
3841 mask_vec = vect_gen_perm_mask_checked (vectype, indices);
3842
3843 if (second_vec_index == -1)
3844 second_vec_index = first_vec_index;
3845
3846 for (unsigned int ri = 0; ri < nvectors_per_build; ++ri)
3847 {
3848 /* Generate the permute statement if necessary. */
3849 tree first_vec = dr_chain[first_vec_index + ri];
3850 tree second_vec = dr_chain[second_vec_index + ri];
3851 stmt_vec_info perm_stmt_info;
3852 if (! noop_p)
3853 {
3854 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
3855 tree perm_dest
3856 = vect_create_destination_var (gimple_assign_lhs (stmt),
3857 vectype);
3858 perm_dest = make_ssa_name (perm_dest);
3859 gassign *perm_stmt
3860 = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
3861 first_vec, second_vec,
3862 mask_vec);
3863 perm_stmt_info
3864 = vect_finish_stmt_generation (stmt_info, perm_stmt,
3865 gsi);
3866 }
3867 else
3868 /* If mask was NULL_TREE generate the requested
3869 identity transform. */
3870 perm_stmt_info = vinfo->lookup_def (first_vec);
3871
3872 /* Store the vector statement in NODE. */
3873 SLP_TREE_VEC_STMTS (node)[vect_stmts_counter++]
3874 = perm_stmt_info;
3875 }
3876 }
3877
3878 index = 0;
3879 first_vec_index = -1;
3880 second_vec_index = -1;
3881 noop_p = true;
3882 }
3883 }
3884
3885 return true;
3886 }
3887
3888 /* Vectorize SLP instance tree in postorder. */
3889
3890 static void
3891 vect_schedule_slp_instance (slp_tree node, slp_instance instance,
3892 scalar_stmts_to_slp_tree_map_t *bst_map)
3893 {
3894 gimple_stmt_iterator si;
3895 stmt_vec_info stmt_info;
3896 unsigned int group_size;
3897 tree vectype;
3898 int i, j;
3899 slp_tree child;
3900
3901 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
3902 return;
3903
3904 /* See if we have already vectorized the node in the graph of the
3905 SLP instance. */
3906 if (SLP_TREE_VEC_STMTS (node).exists ())
3907 return;
3908
3909 /* See if we have already vectorized the same set of stmts and, if so,
3910 reuse their vectorized stmts across instances. */
3911 if (slp_tree *leader = bst_map->get (SLP_TREE_SCALAR_STMTS (node)))
3912 {
3913 SLP_TREE_VEC_STMTS (node).safe_splice (SLP_TREE_VEC_STMTS (*leader));
3914 return;
3915 }
3916
3917 bst_map->put (SLP_TREE_SCALAR_STMTS (node).copy (), node);
3918 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3919 vect_schedule_slp_instance (child, instance, bst_map);
3920
3921 /* Push SLP node def-type to stmts. */
3922 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3923 if (SLP_TREE_DEF_TYPE (child) != vect_internal_def)
3924 {
3925 stmt_vec_info child_stmt_info;
3926 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (child), j, child_stmt_info)
3927 STMT_VINFO_DEF_TYPE (child_stmt_info) = SLP_TREE_DEF_TYPE (child);
3928 }
3929
3930 stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
3931
3932 /* VECTYPE is the type of the destination. */
3933 vectype = STMT_VINFO_VECTYPE (stmt_info);
3934 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
3935 group_size = SLP_INSTANCE_GROUP_SIZE (instance);
3936
3937 gcc_assert (SLP_TREE_NUMBER_OF_VEC_STMTS (node) != 0);
3938 SLP_TREE_VEC_STMTS (node).create (SLP_TREE_NUMBER_OF_VEC_STMTS (node));
3939
3940 if (dump_enabled_p ())
3941 dump_printf_loc (MSG_NOTE, vect_location,
3942 "------>vectorizing SLP node starting from: %G",
3943 stmt_info->stmt);
3944
3945 /* Vectorized stmts go before the last scalar stmt, which is where
3946 all uses are ready. */
3947 stmt_vec_info last_stmt_info = vect_find_last_scalar_stmt_in_slp (node);
3948 si = gsi_for_stmt (last_stmt_info->stmt);
3949
3950 /* Mark the first element of the reduction chain as reduction to properly
3951 transform the node. In the analysis phase only the last element of the
3952 chain is marked as reduction. */
3953 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
3954 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
3955 && REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info)
3956 {
3957 STMT_VINFO_DEF_TYPE (stmt_info) = vect_reduction_def;
3958 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
3959 }
3960
3961 /* Handle two-operation SLP nodes by vectorizing the group with
3962 both operations and then performing a merge. */
3963 if (SLP_TREE_TWO_OPERATORS (node))
3964 {
3965 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
3966 enum tree_code code0 = gimple_assign_rhs_code (stmt);
3967 enum tree_code ocode = ERROR_MARK;
3968 stmt_vec_info ostmt_info;
3969 vec_perm_builder mask (group_size, group_size, 1);
3970 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, ostmt_info)
3971 {
3972 gassign *ostmt = as_a <gassign *> (ostmt_info->stmt);
3973 if (gimple_assign_rhs_code (ostmt) != code0)
3974 {
3975 mask.quick_push (1);
3976 ocode = gimple_assign_rhs_code (ostmt);
3977 }
3978 else
3979 mask.quick_push (0);
3980 }
3981 if (ocode != ERROR_MARK)
3982 {
3983 vec<stmt_vec_info> v0;
3984 vec<stmt_vec_info> v1;
3985 unsigned j;
3986 tree tmask = NULL_TREE;
3987 vect_transform_stmt (stmt_info, &si, node, instance);
3988 v0 = SLP_TREE_VEC_STMTS (node).copy ();
3989 SLP_TREE_VEC_STMTS (node).truncate (0);
3990 gimple_assign_set_rhs_code (stmt, ocode);
3991 vect_transform_stmt (stmt_info, &si, node, instance);
3992 gimple_assign_set_rhs_code (stmt, code0);
3993 v1 = SLP_TREE_VEC_STMTS (node).copy ();
3994 SLP_TREE_VEC_STMTS (node).truncate (0);
3995 tree meltype = build_nonstandard_integer_type
3996 (GET_MODE_BITSIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype))), 1);
3997 tree mvectype = get_same_sized_vectype (meltype, vectype);
3998 unsigned k = 0, l;
3999 for (j = 0; j < v0.length (); ++j)
4000 {
4001 /* Enforced by vect_build_slp_tree, which rejects variable-length
4002 vectors for SLP_TREE_TWO_OPERATORS. */
4003 unsigned int const_nunits = nunits.to_constant ();
4004 tree_vector_builder melts (mvectype, const_nunits, 1);
4005 for (l = 0; l < const_nunits; ++l)
4006 {
4007 if (k >= group_size)
4008 k = 0;
4009 tree t = build_int_cst (meltype,
4010 mask[k++] * const_nunits + l);
4011 melts.quick_push (t);
4012 }
4013 tmask = melts.build ();
4014
4015 /* ??? Not all targets support a VEC_PERM_EXPR with a
4016 constant mask that would translate to a vec_merge RTX
4017 (with their vec_perm_const_ok). We can either not
4018 vectorize in that case or let veclower do its job.
4019 Unfortunately that isn't too great and at least for
4020 plus/minus we'd eventually like to match targets'
4021 vector addsub instructions. */
4022 gimple *vstmt;
4023 vstmt = gimple_build_assign (make_ssa_name (vectype),
4024 VEC_PERM_EXPR,
4025 gimple_assign_lhs (v0[j]->stmt),
4026 gimple_assign_lhs (v1[j]->stmt),
4027 tmask);
4028 SLP_TREE_VEC_STMTS (node).quick_push
4029 (vect_finish_stmt_generation (stmt_info, vstmt, &si));
4030 }
4031 v0.release ();
4032 v1.release ();
4033 return;
4034 }
4035 }
4036 vect_transform_stmt (stmt_info, &si, node, instance);
4037
4038 /* Restore stmt def-types. */
4039 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4040 if (SLP_TREE_DEF_TYPE (child) != vect_internal_def)
4041 {
4042 stmt_vec_info child_stmt_info;
4043 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (child), j, child_stmt_info)
4044 STMT_VINFO_DEF_TYPE (child_stmt_info) = vect_internal_def;
4045 }
4046 }
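/* A standalone sketch (not part of GCC) of the two-operator merge above:
   the group is vectorized once per operation and the two results are then
   blended lane by lane with a select mask, which is what the VEC_PERM_EXPR
   built above does.  The data and the {+,-,+,-} group are illustrative.  */
#if 0
#include <stdio.h>

int
main (void)
{
  const unsigned nunits = 4, group_size = 4;
  int a[4] = { 1, 2, 3, 4 }, b[4] = { 10, 20, 30, 40 };
  int v0[4], v1[4], merged[4];
  /* Scalar group {+,-,+,-} gives the per-lane selector {0,1,0,1}.  */
  unsigned sel[4] = { 0, 1, 0, 1 };

  for (unsigned l = 0; l < nunits; l++)
    {
      v0[l] = a[l] + b[l];	/* Whole group with code0 (PLUS).  */
      v1[l] = a[l] - b[l];	/* Whole group with ocode (MINUS).  */
    }
  /* VEC_PERM_EXPR on {v0,v1} with mask element sel[k] * nunits + l.  */
  for (unsigned l = 0; l < nunits; l++)
    {
      unsigned m = sel[l % group_size] * nunits + l;
      merged[l] = m < nunits ? v0[m] : v1[m - nunits];
    }
  for (unsigned l = 0; l < nunits; l++)
    printf ("%d ", merged[l]);	/* Prints: 11 -18 33 -36.  */
  printf ("\n");
  return 0;
}
#endif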
4047
4048 /* Replace scalar calls from SLP node NODE with setting of their lhs to zero.
4049 For loop vectorization this is done in vectorizable_call, but for SLP
4050 it needs to be deferred until the end of vect_schedule_slp, because multiple
4051 SLP instances may refer to the same scalar stmt. */
4052
4053 static void
4054 vect_remove_slp_scalar_calls (slp_tree node, hash_set<slp_tree> &visited)
4055 {
4056 gimple *new_stmt;
4057 gimple_stmt_iterator gsi;
4058 int i;
4059 slp_tree child;
4060 tree lhs;
4061 stmt_vec_info stmt_info;
4062
4063 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
4064 return;
4065
4066 if (visited.add (node))
4067 return;
4068
4069 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4070 vect_remove_slp_scalar_calls (child, visited);
4071
4072 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
4073 {
4074 gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt);
4075 if (!stmt || gimple_bb (stmt) == NULL)
4076 continue;
4077 if (is_pattern_stmt_p (stmt_info)
4078 || !PURE_SLP_STMT (stmt_info))
4079 continue;
4080 lhs = gimple_call_lhs (stmt);
4081 new_stmt = gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs)));
4082 gsi = gsi_for_stmt (stmt);
4083 stmt_info->vinfo->replace_stmt (&gsi, stmt_info, new_stmt);
4084 SSA_NAME_DEF_STMT (gimple_assign_lhs (new_stmt)) = new_stmt;
4085 }
4086 }
4087
4088 static void
4089 vect_remove_slp_scalar_calls (slp_tree node)
4090 {
4091 hash_set<slp_tree> visited;
4092 vect_remove_slp_scalar_calls (node, visited);
4093 }
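/* A standalone sketch (not part of GCC) of the traversal pattern used by
   the two functions above: a recursive walk over a DAG where a visited set
   guarantees each node is processed once even when several parents share
   it.  The node type and a boolean flag standing in for hash_set are
   illustrative.  */
#if 0
#include <stdio.h>
#include <stdbool.h>

struct node { const char *name; struct node *kids[2]; bool visited; };

static void
walk (struct node *n)
{
  if (!n || n->visited)		/* Corresponds to visited.add (node).  */
    return;
  n->visited = true;
  for (int i = 0; i < 2; i++)	/* Children first, as in the recursion.  */
    walk (n->kids[i]);
  printf ("process %s\n", n->name);
}

int
main (void)
{
  struct node shared = { "shared", { 0, 0 }, false };
  struct node left = { "left", { &shared, 0 }, false };
  struct node right = { "right", { &shared, 0 }, false };
  struct node root = { "root", { &left, &right }, false };
  walk (&root);			/* "shared" is processed only once.  */
  return 0;
}
#endif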
4094
4095 /* Generate vector code for all SLP instances in the loop/basic block. */
4096
4097 void
4098 vect_schedule_slp (vec_info *vinfo)
4099 {
4100 vec<slp_instance> slp_instances;
4101 slp_instance instance;
4102 unsigned int i;
4103
4104 scalar_stmts_to_slp_tree_map_t *bst_map
4105 = new scalar_stmts_to_slp_tree_map_t ();
4106 slp_instances = vinfo->slp_instances;
4107 FOR_EACH_VEC_ELT (slp_instances, i, instance)
4108 {
4109 /* Schedule the tree of INSTANCE. */
4110 vect_schedule_slp_instance (SLP_INSTANCE_TREE (instance),
4111 instance, bst_map);
4112 if (dump_enabled_p ())
4113 dump_printf_loc (MSG_NOTE, vect_location,
4114 "vectorizing stmts using SLP.\n");
4115 }
4116 delete bst_map;
4117
4118 FOR_EACH_VEC_ELT (slp_instances, i, instance)
4119 {
4120 slp_tree root = SLP_INSTANCE_TREE (instance);
4121 stmt_vec_info store_info;
4122 unsigned int j;
4123
4124 /* Remove scalar call stmts. Do not do this for basic-block
4125 vectorization as not all uses may be vectorized.
4126 ??? Why should this be necessary? DCE should be able to
4127 remove the stmts itself.
4128 ??? For BB vectorization we could just as well remove scalar
4129 stmts starting from the SLP tree root if they have no
4130 uses. */
4131 if (is_a <loop_vec_info> (vinfo))
4132 vect_remove_slp_scalar_calls (root);
4133
4134 for (j = 0; SLP_TREE_SCALAR_STMTS (root).iterate (j, &store_info)
4135 && j < SLP_INSTANCE_GROUP_SIZE (instance); j++)
4136 {
4137 if (!STMT_VINFO_DATA_REF (store_info))
4138 break;
4139
4140 store_info = vect_orig_stmt (store_info);
4141 /* Free the attached stmt_vec_info and remove the stmt. */
4142 vinfo->remove_stmt (store_info);
4143 }
4144 }
4145 }
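/* A standalone sketch (not part of GCC) of the BST_MAP sharing used by
   vect_schedule_slp above: scheduling looks up each node's scalar-stmt set
   in a map and, on a hit, reuses the vector stmts created by an earlier
   instance instead of emitting them again.  A linear array stands in for
   the hash map and strings stand in for stmt sets; all illustrative.  */
#if 0
#include <stdio.h>
#include <string.h>

struct entry { const char *key; int vec_stmts; };
static struct entry map[16];
static int map_len;

static int
schedule (const char *scalar_stmts)
{
  for (int i = 0; i < map_len; i++)
    if (strcmp (map[i].key, scalar_stmts) == 0)
      return map[i].vec_stmts;	/* Reuse the leader's vector stmts.  */
  static int next_id;
  int vec_stmts = ++next_id;	/* "Vectorize" the node.  */
  map[map_len].key = scalar_stmts;
  map[map_len++].vec_stmts = vec_stmts;
  return vec_stmts;
}

int
main (void)
{
  /* Two instances containing the same scalar stmt set share one result.  */
  int x = schedule ("a+b,c+d");
  int y = schedule ("e*f");
  int z = schedule ("a+b,c+d");
  printf ("%d %d %d\n", x, y, z);	/* Prints: 1 2 1.  */
  return 0;
}
#endif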