/* SLP - Basic Block Vectorization
   Copyright (C) 2007-2015 Free Software Foundation, Inc.
   Contributed by Dorit Naishlos <dorit@il.ibm.com>
   and Ira Rosen <irar@il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */
21
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "tree-pass.h"
31 #include "ssa.h"
32 #include "optabs-tree.h"
33 #include "insn-config.h"
34 #include "recog.h" /* FIXME: for insn_data */
35 #include "gimple-pretty-print.h"
36 #include "dumpfile.h"
37 #include "params.h"
38 #include "alias.h"
39 #include "fold-const.h"
40 #include "stor-layout.h"
41 #include "internal-fn.h"
42 #include "gimple-iterator.h"
43 #include "cfgloop.h"
44 #include "flags.h"
45 #include "tree-vectorizer.h"
46 #include "langhooks.h"
47 #include "gimple-walk.h"
48
/* Extract the location of the basic block in the source code.
   Return the basic block's location if one can be found, and
   UNKNOWN_LOCATION otherwise.  */

source_location
find_bb_location (basic_block bb)
{
  gimple *stmt = NULL;
  gimple_stmt_iterator si;

  if (!bb)
    return UNKNOWN_LOCATION;

  for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
    {
      stmt = gsi_stmt (si);
      if (gimple_location (stmt) != UNKNOWN_LOCATION)
        return gimple_location (stmt);
    }

  return UNKNOWN_LOCATION;
}


/* Recursively free the memory allocated for the SLP tree rooted at NODE.  */

static void
vect_free_slp_tree (slp_tree node)
{
  int i;
  slp_tree child;

  if (!node)
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_free_slp_tree (child);

  SLP_TREE_CHILDREN (node).release ();
  SLP_TREE_SCALAR_STMTS (node).release ();
  SLP_TREE_VEC_STMTS (node).release ();
  SLP_TREE_LOAD_PERMUTATION (node).release ();

  free (node);
}


/* Free the memory allocated for the SLP instance.  */

void
vect_free_slp_instance (slp_instance instance)
{
  vect_free_slp_tree (SLP_INSTANCE_TREE (instance));
  SLP_INSTANCE_LOADS (instance).release ();
  free (instance);
}


/* Create an SLP node for SCALAR_STMTS.  */

static slp_tree
vect_create_new_slp_node (vec<gimple *> scalar_stmts)
{
  slp_tree node;
  gimple *stmt = scalar_stmts[0];
  unsigned int nops;

  if (is_gimple_call (stmt))
    nops = gimple_call_num_args (stmt);
  else if (is_gimple_assign (stmt))
    {
      nops = gimple_num_ops (stmt) - 1;
      if (gimple_assign_rhs_code (stmt) == COND_EXPR)
        nops++;
    }
  else
    return NULL;

  node = XNEW (struct _slp_tree);
  SLP_TREE_SCALAR_STMTS (node) = scalar_stmts;
  SLP_TREE_VEC_STMTS (node).create (0);
  SLP_TREE_CHILDREN (node).create (nops);
  SLP_TREE_LOAD_PERMUTATION (node) = vNULL;
  SLP_TREE_TWO_OPERATORS (node) = false;

  return node;
}
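
/* Editorial note (illustrative, not part of the original sources): for a
   GIMPLE assignment the number of SLP operands is gimple_num_ops () - 1,
   i.e. the rhs operands.  A COND_EXPR rhs such as

       x_1 = a_2 < b_3 ? c_4 : d_5;

   counts one extra operand because the embedded comparison contributes two
   scalar operands (a_2 and b_3) that are analyzed separately, see
   vect_get_and_check_slp_defs below.  */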


/* This structure is used in creation of an SLP tree.  Each instance
   corresponds to the same operand in a group of scalar stmts in an SLP
   node.  */
typedef struct _slp_oprnd_info
{
  /* Def-stmts for the operands.  */
  vec<gimple *> def_stmts;
  /* Information about the first statement, its vector def-type, type, the
     operand itself in case it's constant, and an indication if it's a pattern
     stmt.  */
  enum vect_def_type first_dt;
  tree first_op_type;
  bool first_pattern;
  bool second_pattern;
} *slp_oprnd_info;


/* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
   operand.  */
static vec<slp_oprnd_info>
vect_create_oprnd_info (int nops, int group_size)
{
  int i;
  slp_oprnd_info oprnd_info;
  vec<slp_oprnd_info> oprnds_info;

  oprnds_info.create (nops);
  for (i = 0; i < nops; i++)
    {
      oprnd_info = XNEW (struct _slp_oprnd_info);
      oprnd_info->def_stmts.create (group_size);
      oprnd_info->first_dt = vect_uninitialized_def;
      oprnd_info->first_op_type = NULL_TREE;
      oprnd_info->first_pattern = false;
      oprnd_info->second_pattern = false;
      oprnds_info.quick_push (oprnd_info);
    }

  return oprnds_info;
}


/* Free operands info.  */

static void
vect_free_oprnd_info (vec<slp_oprnd_info> &oprnds_info)
{
  int i;
  slp_oprnd_info oprnd_info;

  FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
    {
      oprnd_info->def_stmts.release ();
      XDELETE (oprnd_info);
    }

  oprnds_info.release ();
}


/* Find the place of the data-ref in STMT in the interleaving chain that starts
   from FIRST_STMT.  Return -1 if the data-ref is not a part of the chain.  */

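/* Illustration (editorial, assuming the usual GROUP_GAP convention of 1
   for consecutive elements): for an interleaved chain accessing a[0],
   a[1] and a[3], the walk below accumulates the gaps and yields place 0
   for a[0], 1 for a[1] and 3 for a[3].  */
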
static int
vect_get_place_in_interleaving_chain (gimple *stmt, gimple *first_stmt)
{
  gimple *next_stmt = first_stmt;
  int result = 0;

  if (first_stmt != GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
    return -1;

  do
    {
      if (next_stmt == stmt)
        return result;
      next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
      if (next_stmt)
        result += GROUP_GAP (vinfo_for_stmt (next_stmt));
    }
  while (next_stmt);

  return -1;
}


/* Get the defs for the rhs of STMT (collect them in OPRNDS_INFO), check that
   they are of a valid type and that they match the defs of the first stmt of
   the SLP group (stored in OPRNDS_INFO).  Return -1 on a fatal error, 1 if
   the error could be corrected by swapping operands of the operation, and 0
   if everything is OK.  */

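/* Example (editorial, illustrative only): for the commutative group

       a[0] = b[0] + x;
       a[1] = x + b[1];

   the defs of operand zero mismatch (internal def vs. external def), so
   the analysis below retries with swapped operands and, when that makes
   the group isomorphic, applies the swap to the GIMPLE stmt itself.  */
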
static int
vect_get_and_check_slp_defs (vec_info *vinfo,
                             gimple *stmt, unsigned stmt_num,
                             vec<slp_oprnd_info> *oprnds_info)
{
  tree oprnd;
  unsigned int i, number_of_oprnds;
  gimple *def_stmt;
  enum vect_def_type dt = vect_uninitialized_def;
  struct loop *loop = NULL;
  bool pattern = false;
  slp_oprnd_info oprnd_info;
  int first_op_idx = 1;
  bool commutative = false;
  bool first_op_cond = false;
  bool first = stmt_num == 0;
  bool second = stmt_num == 1;

  if (is_a <loop_vec_info> (vinfo))
    loop = LOOP_VINFO_LOOP (as_a <loop_vec_info> (vinfo));

  if (is_gimple_call (stmt))
    {
      number_of_oprnds = gimple_call_num_args (stmt);
      first_op_idx = 3;
    }
  else if (is_gimple_assign (stmt))
    {
      enum tree_code code = gimple_assign_rhs_code (stmt);
      number_of_oprnds = gimple_num_ops (stmt) - 1;
      if (gimple_assign_rhs_code (stmt) == COND_EXPR)
        {
          first_op_cond = true;
          commutative = true;
          number_of_oprnds++;
        }
      else
        commutative = commutative_tree_code (code);
    }
  else
    return -1;

  bool swapped = false;
  for (i = 0; i < number_of_oprnds; i++)
    {
again:
      if (first_op_cond)
        {
          if (i == 0 || i == 1)
            oprnd = TREE_OPERAND (gimple_op (stmt, first_op_idx),
                                  swapped ? !i : i);
          else
            oprnd = gimple_op (stmt, first_op_idx + i - 1);
        }
      else
        oprnd = gimple_op (stmt, first_op_idx + (swapped ? !i : i));

      oprnd_info = (*oprnds_info)[i];

      if (!vect_is_simple_use (oprnd, vinfo, &def_stmt, &dt))
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "Build SLP failed: can't analyze def for ");
              dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, oprnd);
              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
            }

          return -1;
        }

      /* Check if DEF_STMT is a part of a pattern in LOOP and get the def stmt
         from the pattern.  Check that all the stmts of the node are in the
         pattern.  */
      if (def_stmt && gimple_bb (def_stmt)
          && ((is_a <loop_vec_info> (vinfo)
               && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt)))
              || (is_a <bb_vec_info> (vinfo)
                  && gimple_bb (def_stmt) == as_a <bb_vec_info> (vinfo)->bb
                  && gimple_code (def_stmt) != GIMPLE_PHI))
          && vinfo_for_stmt (def_stmt)
          && STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (def_stmt))
          && !STMT_VINFO_RELEVANT (vinfo_for_stmt (def_stmt))
          && !STMT_VINFO_LIVE_P (vinfo_for_stmt (def_stmt)))
        {
          pattern = true;
          if (!first && !oprnd_info->first_pattern
              /* Allow different pattern state for the defs of the
                 first stmt in reduction chains.  */
              && (oprnd_info->first_dt != vect_reduction_def
                  || (!second && !oprnd_info->second_pattern)))
            {
              if (i == 0
                  && !swapped
                  && commutative)
                {
                  swapped = true;
                  goto again;
                }

              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "Build SLP failed: some of the stmts"
                                   " are in a pattern, and others are not ");
                  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, oprnd);
                  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                }

              return 1;
            }

          def_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (def_stmt));
          dt = STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt));

          if (dt == vect_unknown_def_type)
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "Unsupported pattern.\n");
              return -1;
            }

          switch (gimple_code (def_stmt))
            {
            case GIMPLE_PHI:
            case GIMPLE_ASSIGN:
              break;

            default:
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "unsupported defining stmt:\n");
              return -1;
            }
        }

      if (second)
        oprnd_info->second_pattern = pattern;

      if (first)
        {
          oprnd_info->first_dt = dt;
          oprnd_info->first_pattern = pattern;
          oprnd_info->first_op_type = TREE_TYPE (oprnd);
        }
      else
        {
          /* Not first stmt of the group, check that the def-stmt/s match
             the def-stmt/s of the first stmt.  Allow different definition
             types for reduction chains: the first stmt must be a
             vect_reduction_def (a phi node), and the rest
             vect_internal_def.  */
          if (((oprnd_info->first_dt != dt
                && !(oprnd_info->first_dt == vect_reduction_def
                     && dt == vect_internal_def)
                && !((oprnd_info->first_dt == vect_external_def
                      || oprnd_info->first_dt == vect_constant_def)
                     && (dt == vect_external_def
                         || dt == vect_constant_def)))
               || !types_compatible_p (oprnd_info->first_op_type,
                                       TREE_TYPE (oprnd))))
            {
              /* Try swapping operands if we got a mismatch.  */
              if (i == 0
                  && !swapped
                  && commutative)
                {
                  swapped = true;
                  goto again;
                }

              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "Build SLP failed: different types\n");

              return 1;
            }
        }

      /* Check the types of the definitions.  */
      switch (dt)
        {
        case vect_constant_def:
        case vect_external_def:
        case vect_reduction_def:
          break;

        case vect_internal_def:
          oprnd_info->def_stmts.quick_push (def_stmt);
          break;

        default:
          /* FORNOW: Not supported.  */
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "Build SLP failed: illegal type of def ");
              dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, oprnd);
              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
            }

          return -1;
        }
    }

  /* Swap operands.  */
  if (swapped)
    {
      if (first_op_cond)
        {
          tree cond = gimple_assign_rhs1 (stmt);
          swap_ssa_operands (stmt, &TREE_OPERAND (cond, 0),
                             &TREE_OPERAND (cond, 1));
          TREE_SET_CODE (cond, swap_tree_comparison (TREE_CODE (cond)));
        }
      else
        swap_ssa_operands (stmt, gimple_assign_rhs1_ptr (stmt),
                           gimple_assign_rhs2_ptr (stmt));
    }

  return 0;
}


/* Verify whether the scalar stmts STMTS are isomorphic, do not require a
   data permutation, and are not of unsupported operation types.  Return
   true if so; otherwise return false and indicate in *MATCHES which stmts
   are not isomorphic to the first one.  If MATCHES[0] is false then this
   indicates that the comparison could not be carried out or that the stmts
   will never be vectorized by SLP.  */

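/* Example (editorial, illustrative only): the four scalar stmts

       a[0] = b[0] + c[0];
       a[1] = b[1] + c[1];
       a[2] = b[2] + c[2];
       a[3] = b[3] + c[3];

   are isomorphic (same operation, compatible types) and can be packed into
   vector stmts.  A group mixing only PLUS_EXPR and MINUS_EXPR is still
   accepted as a "two operators" node: both operations are computed on the
   whole vector and the lanes are blended, see the permutation check at the
   end of this function.  */
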
static bool
vect_build_slp_tree_1 (vec_info *vinfo,
                       vec<gimple *> stmts, unsigned int group_size,
                       unsigned nops, unsigned int *max_nunits,
                       unsigned int vectorization_factor, bool *matches,
                       bool *two_operators)
{
  unsigned int i;
  gimple *first_stmt = stmts[0], *stmt = stmts[0];
  enum tree_code first_stmt_code = ERROR_MARK;
  enum tree_code alt_stmt_code = ERROR_MARK;
  enum tree_code rhs_code = ERROR_MARK;
  enum tree_code first_cond_code = ERROR_MARK;
  tree lhs;
  bool need_same_oprnds = false;
  tree vectype = NULL_TREE, scalar_type, first_op1 = NULL_TREE;
  optab optab;
  int icode;
  machine_mode optab_op2_mode;
  machine_mode vec_mode;
  HOST_WIDE_INT dummy;
  gimple *first_load = NULL, *prev_first_load = NULL;
  tree cond;

  /* For every stmt in NODE find its def stmt/s.  */
  FOR_EACH_VEC_ELT (stmts, i, stmt)
    {
      matches[i] = false;

      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_NOTE, vect_location, "Build SLP for ");
          dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
          dump_printf (MSG_NOTE, "\n");
        }

      /* Fail to vectorize statements marked as unvectorizable.  */
      if (!STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (stmt)))
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "Build SLP failed: unvectorizable statement ");
              dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
            }
          /* Fatal mismatch.  */
          matches[0] = false;
          return false;
        }

      lhs = gimple_get_lhs (stmt);
      if (lhs == NULL_TREE)
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "Build SLP failed: not GIMPLE_ASSIGN nor "
                               "GIMPLE_CALL ");
              dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
            }
          /* Fatal mismatch.  */
          matches[0] = false;
          return false;
        }

      if (is_gimple_assign (stmt)
          && gimple_assign_rhs_code (stmt) == COND_EXPR
          && (cond = gimple_assign_rhs1 (stmt))
          && !COMPARISON_CLASS_P (cond))
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "Build SLP failed: condition is not "
                               "comparison ");
              dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
            }
          /* Fatal mismatch.  */
          matches[0] = false;
          return false;
        }

      scalar_type = vect_get_smallest_scalar_type (stmt, &dummy, &dummy);
      vectype = get_vectype_for_scalar_type (scalar_type);
      if (!vectype)
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "Build SLP failed: unsupported data-type ");
              dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                 scalar_type);
              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
            }
          /* Fatal mismatch.  */
          matches[0] = false;
          return false;
        }

      /* If populating the vector type requires unrolling then fail
         before adjusting *max_nunits for basic-block vectorization.  */
      if (is_a <bb_vec_info> (vinfo)
          && TYPE_VECTOR_SUBPARTS (vectype) > group_size)
        {
          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                           "Build SLP failed: unrolling required "
                           "in basic block SLP\n");
          /* Fatal mismatch.  */
          matches[0] = false;
          return false;
        }

      /* In case of multiple types we need to detect the smallest type.  */
      if (*max_nunits < TYPE_VECTOR_SUBPARTS (vectype))
        {
          *max_nunits = TYPE_VECTOR_SUBPARTS (vectype);
          if (is_a <bb_vec_info> (vinfo))
            vectorization_factor = *max_nunits;
        }

      if (gcall *call_stmt = dyn_cast <gcall *> (stmt))
        {
          rhs_code = CALL_EXPR;
          if (gimple_call_internal_p (call_stmt)
              || gimple_call_tail_p (call_stmt)
              || gimple_call_noreturn_p (call_stmt)
              || !gimple_call_nothrow_p (call_stmt)
              || gimple_call_chain (call_stmt))
            {
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "Build SLP failed: unsupported call type ");
                  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                    call_stmt, 0);
                  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                }
              /* Fatal mismatch.  */
              matches[0] = false;
              return false;
            }
        }
      else
        rhs_code = gimple_assign_rhs_code (stmt);

      /* Check the operation.  */
      if (i == 0)
        {
          first_stmt_code = rhs_code;

          /* Shift arguments should be equal in all the packed stmts for a
             vector shift with scalar shift operand.  */
          if (rhs_code == LSHIFT_EXPR || rhs_code == RSHIFT_EXPR
              || rhs_code == LROTATE_EXPR
              || rhs_code == RROTATE_EXPR)
            {
              vec_mode = TYPE_MODE (vectype);

              /* First see if we have a vector/vector shift.  */
              optab = optab_for_tree_code (rhs_code, vectype,
                                           optab_vector);

              if (!optab
                  || optab_handler (optab, vec_mode) == CODE_FOR_nothing)
                {
                  /* No vector/vector shift, try for a vector/scalar shift.  */
                  optab = optab_for_tree_code (rhs_code, vectype,
                                               optab_scalar);

                  if (!optab)
                    {
                      if (dump_enabled_p ())
                        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                         "Build SLP failed: no optab.\n");
                      /* Fatal mismatch.  */
                      matches[0] = false;
                      return false;
                    }
                  icode = (int) optab_handler (optab, vec_mode);
                  if (icode == CODE_FOR_nothing)
                    {
                      if (dump_enabled_p ())
                        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                         "Build SLP failed: "
                                         "op not supported by target.\n");
                      /* Fatal mismatch.  */
                      matches[0] = false;
                      return false;
                    }
                  optab_op2_mode = insn_data[icode].operand[2].mode;
                  if (!VECTOR_MODE_P (optab_op2_mode))
                    {
                      need_same_oprnds = true;
                      first_op1 = gimple_assign_rhs2 (stmt);
                    }
                }
            }
          else if (rhs_code == WIDEN_LSHIFT_EXPR)
            {
              need_same_oprnds = true;
              first_op1 = gimple_assign_rhs2 (stmt);
            }
        }
      else
        {
          if (first_stmt_code != rhs_code
              && alt_stmt_code == ERROR_MARK)
            alt_stmt_code = rhs_code;
          if (first_stmt_code != rhs_code
              && (first_stmt_code != IMAGPART_EXPR
                  || rhs_code != REALPART_EXPR)
              && (first_stmt_code != REALPART_EXPR
                  || rhs_code != IMAGPART_EXPR)
              /* Handle mismatches in plus/minus by computing both
                 and merging the results.  */
              && !((first_stmt_code == PLUS_EXPR
                    || first_stmt_code == MINUS_EXPR)
                   && (alt_stmt_code == PLUS_EXPR
                       || alt_stmt_code == MINUS_EXPR)
                   && rhs_code == alt_stmt_code)
              && !(STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt))
                   && (first_stmt_code == ARRAY_REF
                       || first_stmt_code == BIT_FIELD_REF
                       || first_stmt_code == INDIRECT_REF
                       || first_stmt_code == COMPONENT_REF
                       || first_stmt_code == MEM_REF)))
            {
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "Build SLP failed: different operation "
                                   "in stmt ");
                  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "original stmt ");
                  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                    first_stmt, 0);
                }
              /* Mismatch.  */
              continue;
            }

          if (need_same_oprnds
              && !operand_equal_p (first_op1, gimple_assign_rhs2 (stmt), 0))
            {
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "Build SLP failed: different shift "
                                   "arguments in ");
                  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
                  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                }
              /* Mismatch.  */
              continue;
            }

          if (rhs_code == CALL_EXPR)
            {
              gimple *first_stmt = stmts[0];
              if (gimple_call_num_args (stmt) != nops
                  || !operand_equal_p (gimple_call_fn (first_stmt),
                                       gimple_call_fn (stmt), 0)
                  || gimple_call_fntype (first_stmt)
                     != gimple_call_fntype (stmt))
                {
                  if (dump_enabled_p ())
                    {
                      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                       "Build SLP failed: different calls in ");
                      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                        stmt, 0);
                      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                    }
                  /* Mismatch.  */
                  continue;
                }
            }
        }

      /* Grouped store or load.  */
      if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt)))
        {
          if (REFERENCE_CLASS_P (lhs))
            {
              /* Store.  */
              ;
            }
          else
            {
              /* Load.  */
              /* Check that the size of interleaved loads group is not
                 greater than the SLP group size.  */
              unsigned ncopies
                = vectorization_factor / TYPE_VECTOR_SUBPARTS (vectype);
              if (is_a <loop_vec_info> (vinfo)
                  && GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt
                  && ((GROUP_SIZE (vinfo_for_stmt (stmt))
                       - GROUP_GAP (vinfo_for_stmt (stmt)))
                      > ncopies * group_size))
                {
                  if (dump_enabled_p ())
                    {
                      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                       "Build SLP failed: the number "
                                       "of interleaved loads is greater than "
                                       "the SLP group size ");
                      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                        stmt, 0);
                      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                    }
                  /* Fatal mismatch.  */
                  matches[0] = false;
                  return false;
                }

              first_load = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
              if (prev_first_load)
                {
                  /* Check that there are no loads from different interleaving
                     chains in the same node.  */
                  if (prev_first_load != first_load)
                    {
                      if (dump_enabled_p ())
                        {
                          dump_printf_loc (MSG_MISSED_OPTIMIZATION,
                                           vect_location,
                                           "Build SLP failed: different "
                                           "interleaving chains in one node ");
                          dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                            stmt, 0);
                          dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                        }
                      /* Mismatch.  */
                      continue;
                    }
                }
              else
                prev_first_load = first_load;
            }
        } /* Grouped access.  */
      else
        {
          if (TREE_CODE_CLASS (rhs_code) == tcc_reference)
            {
              /* Non-grouped load.  */
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "Build SLP failed: not grouped load ");
                  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
                  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                }

              /* FORNOW: Non-grouped loads are not supported.  */
              /* Fatal mismatch.  */
              matches[0] = false;
              return false;
            }

          /* Not a memory operation.  */
          if (TREE_CODE_CLASS (rhs_code) != tcc_binary
              && TREE_CODE_CLASS (rhs_code) != tcc_unary
              && TREE_CODE_CLASS (rhs_code) != tcc_expression
              && rhs_code != CALL_EXPR)
            {
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "Build SLP failed: operation");
                  dump_printf (MSG_MISSED_OPTIMIZATION, " unsupported ");
                  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
                  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                }
              /* Fatal mismatch.  */
              matches[0] = false;
              return false;
            }

          if (rhs_code == COND_EXPR)
            {
              tree cond_expr = gimple_assign_rhs1 (stmt);

              if (i == 0)
                first_cond_code = TREE_CODE (cond_expr);
              else if (first_cond_code != TREE_CODE (cond_expr))
                {
                  if (dump_enabled_p ())
                    {
                      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                       "Build SLP failed: different"
                                       " operation");
                      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                        stmt, 0);
                      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                    }
                  /* Mismatch.  */
                  continue;
                }
            }
        }

      matches[i] = true;
    }

  for (i = 0; i < group_size; ++i)
    if (!matches[i])
      return false;

  /* If we allowed a two-operation SLP node, verify the target can cope
     with the permute we are going to use.  */
  if (alt_stmt_code != ERROR_MARK
      && TREE_CODE_CLASS (alt_stmt_code) != tcc_reference)
    {
      unsigned char *sel
        = XALLOCAVEC (unsigned char, TYPE_VECTOR_SUBPARTS (vectype));
      for (i = 0; i < TYPE_VECTOR_SUBPARTS (vectype); ++i)
        {
          sel[i] = i;
          if (gimple_assign_rhs_code (stmts[i % group_size]) == alt_stmt_code)
            sel[i] += TYPE_VECTOR_SUBPARTS (vectype);
        }
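      /* Illustration (editorial): with group_size == 2, a four-lane vector
         type and stmts alternating {PLUS, MINUS}, the loop above builds
         sel = {0, 5, 2, 7}: even lanes are taken from the PLUS result
         vector and odd lanes from the MINUS result vector.  */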
      if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
        {
          for (i = 0; i < group_size; ++i)
            if (gimple_assign_rhs_code (stmts[i]) == alt_stmt_code)
              {
                matches[i] = false;
                if (dump_enabled_p ())
                  {
                    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                     "Build SLP failed: different operation "
                                     "in stmt ");
                    dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                      stmts[i], 0);
                    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                     "original stmt ");
                    dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                      first_stmt, 0);
                  }
              }
          return false;
        }
      *two_operators = true;
    }

  return true;
}

/* Recursively build an SLP tree starting from NODE.  Fail if the def-stmts
   are not isomorphic, require a data permutation, or are of unsupported
   operation types; in that case return FALSE, with MATCHES recording which
   scalar stmts failed to match at the depth where the mismatch was found.
   Return TRUE on success.  */

static bool
vect_build_slp_tree (vec_info *vinfo,
                     slp_tree *node, unsigned int group_size,
                     unsigned int *max_nunits,
                     vec<slp_tree> *loads,
                     unsigned int vectorization_factor,
                     bool *matches, unsigned *npermutes, unsigned *tree_size,
                     unsigned max_tree_size)
{
  unsigned nops, i, this_tree_size = 0;
  gimple *stmt;

  matches[0] = false;

  stmt = SLP_TREE_SCALAR_STMTS (*node)[0];
  if (is_gimple_call (stmt))
    nops = gimple_call_num_args (stmt);
  else if (is_gimple_assign (stmt))
    {
      nops = gimple_num_ops (stmt) - 1;
      if (gimple_assign_rhs_code (stmt) == COND_EXPR)
        nops++;
    }
  else
    return false;

  bool two_operators = false;
  if (!vect_build_slp_tree_1 (vinfo,
                              SLP_TREE_SCALAR_STMTS (*node), group_size, nops,
                              max_nunits, vectorization_factor, matches,
                              &two_operators))
    return false;
  SLP_TREE_TWO_OPERATORS (*node) = two_operators;

  /* If the SLP node is a load, terminate the recursion.  */
  if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt))
      && DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))))
    {
      loads->safe_push (*node);
      return true;
    }

  /* Get at the operands, verifying they are compatible.  */
  vec<slp_oprnd_info> oprnds_info = vect_create_oprnd_info (nops, group_size);
  slp_oprnd_info oprnd_info;
  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (*node), i, stmt)
    {
      switch (vect_get_and_check_slp_defs (vinfo, stmt, i, &oprnds_info))
        {
        case 0:
          break;
        case -1:
          matches[0] = false;
          vect_free_oprnd_info (oprnds_info);
          return false;
        case 1:
          matches[i] = false;
          break;
        }
    }
  for (i = 0; i < group_size; ++i)
    if (!matches[i])
      {
        vect_free_oprnd_info (oprnds_info);
        return false;
      }

  stmt = SLP_TREE_SCALAR_STMTS (*node)[0];

  /* Create SLP_TREE nodes for the definition node/s.  */
  FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
    {
      slp_tree child;
      unsigned old_nloads = loads->length ();
      unsigned old_max_nunits = *max_nunits;

      if (oprnd_info->first_dt != vect_internal_def)
        continue;

      if (++this_tree_size > max_tree_size)
        {
          vect_free_oprnd_info (oprnds_info);
          return false;
        }

      child = vect_create_new_slp_node (oprnd_info->def_stmts);
      if (!child)
        {
          vect_free_oprnd_info (oprnds_info);
          return false;
        }

      if (vect_build_slp_tree (vinfo, &child,
                               group_size, max_nunits, loads,
                               vectorization_factor, matches,
                               npermutes, &this_tree_size, max_tree_size))
        {
          /* If all children of this child were built up from scalars then
             just throw that away and build this node up from scalars
             instead.  */
          if (!SLP_TREE_CHILDREN (child).is_empty ())
            {
              unsigned int j;
              slp_tree grandchild;

              FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild)
                if (grandchild != NULL)
                  break;
              if (!grandchild)
                {
                  /* Roll back.  */
                  *max_nunits = old_max_nunits;
                  loads->truncate (old_nloads);
                  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild)
                    vect_free_slp_tree (grandchild);
                  SLP_TREE_CHILDREN (child).truncate (0);

                  dump_printf_loc (MSG_NOTE, vect_location,
                                   "Building parent vector operands from "
                                   "scalars instead\n");
                  oprnd_info->def_stmts = vNULL;
                  vect_free_slp_tree (child);
                  SLP_TREE_CHILDREN (*node).quick_push (NULL);
                  continue;
                }
            }

          oprnd_info->def_stmts = vNULL;
          SLP_TREE_CHILDREN (*node).quick_push (child);
          continue;
        }

      /* If the SLP build failed fatally and we analyze a basic-block
         simply treat nodes we fail to build as externally defined
         (and thus build vectors from the scalar defs).
         The cost model will reject outright expensive cases.
         ??? This doesn't treat cases where permutation ultimately
         fails (or we don't try permutation below).  Ideally we'd
         even compute a permutation that will end up with the maximum
         SLP tree size...  */
      if (is_a <bb_vec_info> (vinfo)
          && !matches[0]
          /* ??? Rejecting patterns this way doesn't work.  We'd have to
             do extra work to cancel the pattern so the uses see the
             scalar version.  */
          && !is_pattern_stmt_p (vinfo_for_stmt (stmt)))
        {
          unsigned int j;
          slp_tree grandchild;

          /* Roll back.  */
          *max_nunits = old_max_nunits;
          loads->truncate (old_nloads);
          FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild)
            vect_free_slp_tree (grandchild);
          SLP_TREE_CHILDREN (child).truncate (0);

          dump_printf_loc (MSG_NOTE, vect_location,
                           "Building vector operands from scalars\n");
          oprnd_info->def_stmts = vNULL;
          vect_free_slp_tree (child);
          SLP_TREE_CHILDREN (*node).quick_push (NULL);
          continue;
        }

      /* If the SLP build for operand zero failed and operand zero
         and one can be commuted, try that for the scalar stmts
         that failed the match.  */
      if (i == 0
          /* A first scalar stmt mismatch signals a fatal mismatch.  */
          && matches[0]
          /* ??? For COND_EXPRs we can swap the comparison operands
             as well as the arms under some constraints.  */
          && nops == 2
          && oprnds_info[1]->first_dt == vect_internal_def
          && is_gimple_assign (stmt)
          && commutative_tree_code (gimple_assign_rhs_code (stmt))
          && !SLP_TREE_TWO_OPERATORS (*node)
          /* Do so only if the number of unsuccessful permutes was not
             more than a cut-off, as re-trying the recursive match on
             possibly each level of the tree would expose exponential
             behavior.  */
          && *npermutes < 4)
        {
          unsigned int j;
          slp_tree grandchild;

          /* Roll back.  */
          *max_nunits = old_max_nunits;
          loads->truncate (old_nloads);
          FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild)
            vect_free_slp_tree (grandchild);
          SLP_TREE_CHILDREN (child).truncate (0);

          /* Swap mismatched definition stmts.  */
          dump_printf_loc (MSG_NOTE, vect_location,
                           "Re-trying with swapped operands of stmts ");
          for (j = 0; j < group_size; ++j)
            if (!matches[j])
              {
                std::swap (oprnds_info[0]->def_stmts[j],
                           oprnds_info[1]->def_stmts[j]);
                dump_printf (MSG_NOTE, "%d ", j);
              }
          dump_printf (MSG_NOTE, "\n");
          /* And try again with scratch 'matches' ...  */
          bool *tem = XALLOCAVEC (bool, group_size);
          if (vect_build_slp_tree (vinfo, &child,
                                   group_size, max_nunits, loads,
                                   vectorization_factor,
                                   tem, npermutes, &this_tree_size,
                                   max_tree_size))
            {
              /* ... so if successful we can apply the operand swapping
                 to the GIMPLE IL.  This is necessary because for example
                 vect_get_slp_defs uses operand indexes and thus expects
                 canonical operand order.  */
              for (j = 0; j < group_size; ++j)
                if (!matches[j])
                  {
                    gimple *stmt = SLP_TREE_SCALAR_STMTS (*node)[j];
                    swap_ssa_operands (stmt, gimple_assign_rhs1_ptr (stmt),
                                       gimple_assign_rhs2_ptr (stmt));
                  }
              oprnd_info->def_stmts = vNULL;
              SLP_TREE_CHILDREN (*node).quick_push (child);
              continue;
            }

          ++*npermutes;
        }

      oprnd_info->def_stmts = vNULL;
      vect_free_slp_tree (child);
      vect_free_oprnd_info (oprnds_info);
      return false;
    }

  if (tree_size)
    *tree_size += this_tree_size;

  vect_free_oprnd_info (oprnds_info);
  return true;
}

/* Dump a slp tree NODE using flags specified in DUMP_KIND.  */

static void
vect_print_slp_tree (int dump_kind, slp_tree node)
{
  int i;
  gimple *stmt;
  slp_tree child;

  if (!node)
    return;

  dump_printf (dump_kind, "node ");
  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
    {
      dump_printf (dump_kind, "\n\tstmt %d ", i);
      dump_gimple_stmt (dump_kind, TDF_SLIM, stmt, 0);
    }
  dump_printf (dump_kind, "\n");

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_print_slp_tree (dump_kind, child);
}


/* Mark the tree rooted at NODE with MARK (PURE_SLP or HYBRID).
   If MARK is HYBRID, it refers to a specific stmt in NODE (the stmt at index
   J).  Otherwise, MARK is PURE_SLP and J is -1, which indicates that all the
   stmts in NODE are to be marked.  */

static void
vect_mark_slp_stmts (slp_tree node, enum slp_vect_type mark, int j)
{
  int i;
  gimple *stmt;
  slp_tree child;

  if (!node)
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
    if (j < 0 || i == j)
      STMT_SLP_TYPE (vinfo_for_stmt (stmt)) = mark;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_mark_slp_stmts (child, mark, j);
}


/* Mark the statements of the tree rooted at NODE as relevant (vect_used).  */

static void
vect_mark_slp_stmts_relevant (slp_tree node)
{
  int i;
  gimple *stmt;
  stmt_vec_info stmt_info;
  slp_tree child;

  if (!node)
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
    {
      stmt_info = vinfo_for_stmt (stmt);
      gcc_assert (!STMT_VINFO_RELEVANT (stmt_info)
                  || STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope);
      STMT_VINFO_RELEVANT (stmt_info) = vect_used_in_scope;
    }

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_mark_slp_stmts_relevant (child);
}


/* Rearrange the statements of NODE according to PERMUTATION.  */

static void
vect_slp_rearrange_stmts (slp_tree node, unsigned int group_size,
                          vec<unsigned> permutation)
{
  gimple *stmt;
  vec<gimple *> tmp_stmts;
  unsigned int i;
  slp_tree child;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_slp_rearrange_stmts (child, group_size, permutation);

  gcc_assert (group_size == SLP_TREE_SCALAR_STMTS (node).length ());
  tmp_stmts.create (group_size);
  tmp_stmts.quick_grow_cleared (group_size);

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
    tmp_stmts[permutation[i]] = stmt;

  SLP_TREE_SCALAR_STMTS (node).release ();
  SLP_TREE_SCALAR_STMTS (node) = tmp_stmts;
}


/* Attempt to reorder stmts in a reduction chain so that we don't
   require any load permutation.  Return true if that was possible,
   otherwise return false.  */

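/* Example (editorial, illustrative only): for a reduction pair

       sum0 += a[2 * i + 1];
       sum1 += a[2 * i];

   the load node carries permutation {1, 0}.  Since the order of reduction
   stmts does not matter, rearranging the scalar stmts of the whole
   instance to follow the load order makes the permutation unnecessary.  */
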
static bool
vect_attempt_slp_rearrange_stmts (slp_instance slp_instn)
{
  unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (slp_instn);
  unsigned int i, j;
  sbitmap load_index;
  unsigned int lidx;
  slp_tree node, load;

  /* Compare all the permutation sequences to the first one.  We know
     that at least one load is permuted.  */
  node = SLP_INSTANCE_LOADS (slp_instn)[0];
  if (!node->load_permutation.exists ())
    return false;
  for (i = 1; SLP_INSTANCE_LOADS (slp_instn).iterate (i, &load); ++i)
    {
      if (!load->load_permutation.exists ())
        return false;
      FOR_EACH_VEC_ELT (load->load_permutation, j, lidx)
        if (lidx != node->load_permutation[j])
          return false;
    }

  /* Check that the loads in the first sequence are different and there
     are no gaps between them.  */
  load_index = sbitmap_alloc (group_size);
  bitmap_clear (load_index);
  FOR_EACH_VEC_ELT (node->load_permutation, i, lidx)
    {
      if (bitmap_bit_p (load_index, lidx))
        {
          sbitmap_free (load_index);
          return false;
        }
      bitmap_set_bit (load_index, lidx);
    }
  for (i = 0; i < group_size; i++)
    if (!bitmap_bit_p (load_index, i))
      {
        sbitmap_free (load_index);
        return false;
      }
  sbitmap_free (load_index);

  /* This permutation is valid for reduction.  Since the order of the
     statements in the nodes is not important unless they are memory
     accesses, we can rearrange the statements in all the nodes
     according to the order of the loads.  */
  vect_slp_rearrange_stmts (SLP_INSTANCE_TREE (slp_instn), group_size,
                            node->load_permutation);

  /* We are done, no actual permutations need to be generated.  */
  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
    SLP_TREE_LOAD_PERMUTATION (node).release ();
  return true;
}

/* Check if the required load permutations in the SLP instance
   SLP_INSTN are supported.  */

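/* Example (editorial, illustrative only): in basic-block SLP, a load node
   reading b[1], b[2] out of an interleaving chain b[0..3] forms a
   contiguous subchain, so it needs no permutation even though it does not
   start at the chain head; see the subchain check below.  */
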
static bool
vect_supported_load_permutation_p (slp_instance slp_instn)
{
  unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (slp_instn);
  unsigned int i, j, k, next;
  slp_tree node;
  gimple *stmt, *load, *next_load, *first_load;
  struct data_reference *dr;

  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location, "Load permutation ");
      FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
        if (node->load_permutation.exists ())
          FOR_EACH_VEC_ELT (node->load_permutation, j, next)
            dump_printf (MSG_NOTE, "%d ", next);
        else
          for (k = 0; k < group_size; ++k)
            dump_printf (MSG_NOTE, "%d ", k);
      dump_printf (MSG_NOTE, "\n");
    }

  /* In case of reduction every load permutation is allowed, since the order
     of the reduction statements is not important (as opposed to the case of
     grouped stores).  The only condition we need to check is that all the
     load nodes are of the same size and have the same permutation (and then
     rearrange all the nodes of the SLP instance according to this
     permutation).  */

  /* Check that all the load nodes are of the same size.  */
  /* ??? Can't we assert this?  */
  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
    if (SLP_TREE_SCALAR_STMTS (node).length () != (unsigned) group_size)
      return false;

  node = SLP_INSTANCE_TREE (slp_instn);
  stmt = SLP_TREE_SCALAR_STMTS (node)[0];

  /* Reduction (there are no data-refs in the root).
     In reduction chain the order of the loads is not important.  */
  if (!STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))
      && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
    {
      if (vect_attempt_slp_rearrange_stmts (slp_instn))
        return true;

      /* Fallthru to general load permutation handling.  */
    }

  /* In basic block vectorization we allow any subchain of an interleaving
     chain.
     FORNOW: not supported in loop SLP because of realignment
     complications.  */
  if (STMT_VINFO_BB_VINFO (vinfo_for_stmt (stmt)))
    {
      /* Check whether the loads in an instance form a subchain and thus
         no permutation is necessary.  */
      FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
        {
          if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
            continue;
          bool subchain_p = true;
          next_load = NULL;
          FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load)
            {
              if (j != 0
                  && (next_load != load
                      || GROUP_GAP (vinfo_for_stmt (load)) != 1))
                {
                  subchain_p = false;
                  break;
                }
              next_load = GROUP_NEXT_ELEMENT (vinfo_for_stmt (load));
            }
          if (subchain_p)
            SLP_TREE_LOAD_PERMUTATION (node).release ();
          else
            {
              /* Verify the permutation can be generated.  */
              vec<tree> tem;
              if (!vect_transform_slp_perm_load (node, tem, NULL,
                                                 1, slp_instn, true))
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION,
                                   vect_location,
                                   "unsupported load permutation\n");
                  return false;
                }
            }
        }

      /* Check that the alignment of the first load in every subchain, i.e.,
         the first statement in every load node, is supported.
         ??? This belongs in alignment checking.  */
      FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
        {
          first_load = SLP_TREE_SCALAR_STMTS (node)[0];
          if (first_load != GROUP_FIRST_ELEMENT (vinfo_for_stmt (first_load)))
            {
              dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_load));
              if (vect_supportable_dr_alignment (dr, false)
                  == dr_unaligned_unsupported)
                {
                  if (dump_enabled_p ())
                    {
                      dump_printf_loc (MSG_MISSED_OPTIMIZATION,
                                       vect_location,
                                       "unsupported unaligned load ");
                      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                        first_load, 0);
                      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                    }
                  return false;
                }
            }
        }

      return true;
    }

  /* For loop vectorization verify we can generate the permutation.  */
  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
    if (node->load_permutation.exists ()
        && !vect_transform_slp_perm_load
              (node, vNULL, NULL,
               SLP_INSTANCE_UNROLLING_FACTOR (slp_instn), slp_instn, true))
      return false;

  return true;
}


/* Find the last scalar stmt in NODE; for the root node of an SLP
   instance this is the last store of the instance.  */

static gimple *
vect_find_last_scalar_stmt_in_slp (slp_tree node)
{
  gimple *last = NULL, *stmt;

  for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt); i++)
    {
      stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
      if (is_pattern_stmt_p (stmt_vinfo))
        last = get_later_stmt (STMT_VINFO_RELATED_STMT (stmt_vinfo), last);
      else
        last = get_later_stmt (stmt, last);
    }

  return last;
}

/* Compute the cost for the SLP node NODE in the SLP instance INSTANCE.  */

static void
vect_analyze_slp_cost_1 (slp_instance instance, slp_tree node,
                         stmt_vector_for_cost *prologue_cost_vec,
                         stmt_vector_for_cost *body_cost_vec,
                         unsigned ncopies_for_cost)
{
  unsigned i;
  slp_tree child;
  gimple *stmt, *s;
  stmt_vec_info stmt_info;
  tree lhs;
  unsigned group_size = SLP_INSTANCE_GROUP_SIZE (instance);

  /* Recurse down the SLP tree.  */
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    if (child)
      vect_analyze_slp_cost_1 (instance, child, prologue_cost_vec,
                               body_cost_vec, ncopies_for_cost);

  /* Look at the first scalar stmt to determine the cost.  */
  stmt = SLP_TREE_SCALAR_STMTS (node)[0];
  stmt_info = vinfo_for_stmt (stmt);
  if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
    {
      if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmt_info)))
        vect_model_store_cost (stmt_info, ncopies_for_cost, false,
                               vect_uninitialized_def,
                               node, prologue_cost_vec, body_cost_vec);
      else
        {
          int i;
          gcc_checking_assert (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)));
          vect_model_load_cost (stmt_info, ncopies_for_cost, false,
                                node, prologue_cost_vec, body_cost_vec);
          /* If the load is permuted record the cost for the permutation.
             ??? Loads from multiple chains are let through here only
             for a single special case involving complex numbers where
             in the end no permutation is necessary.  */
          FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, s)
            if ((STMT_VINFO_GROUP_FIRST_ELEMENT (vinfo_for_stmt (s))
                 == STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info))
                && vect_get_place_in_interleaving_chain
                     (s, STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info)) != i)
              {
                record_stmt_cost (body_cost_vec, group_size, vec_perm,
                                  stmt_info, 0, vect_body);
                break;
              }
        }
    }
  else
    {
      record_stmt_cost (body_cost_vec, ncopies_for_cost, vector_stmt,
                        stmt_info, 0, vect_body);
      if (SLP_TREE_TWO_OPERATORS (node))
        {
          record_stmt_cost (body_cost_vec, ncopies_for_cost, vector_stmt,
                            stmt_info, 0, vect_body);
          record_stmt_cost (body_cost_vec, ncopies_for_cost, vec_perm,
                            stmt_info, 0, vect_body);
        }
    }

  /* Scan operands and account for prologue cost of constants/externals.
     ??? This over-estimates cost for multiple uses and should be
     re-engineered.  */
  lhs = gimple_get_lhs (stmt);
  for (i = 0; i < gimple_num_ops (stmt); ++i)
    {
      tree op = gimple_op (stmt, i);
      gimple *def_stmt;
      enum vect_def_type dt;
      if (!op || op == lhs)
        continue;
      if (vect_is_simple_use (op, stmt_info->vinfo, &def_stmt, &dt))
        {
          /* Without looking at the actual initializer a vector of
             constants can be implemented as load from the constant pool.
             ??? We need to pass down stmt_info for a vector type
             even if it points to the wrong stmt.  */
          if (dt == vect_constant_def)
            record_stmt_cost (prologue_cost_vec, 1, vector_load,
                              stmt_info, 0, vect_prologue);
          else if (dt == vect_external_def)
            record_stmt_cost (prologue_cost_vec, 1, vec_construct,
                              stmt_info, 0, vect_prologue);
        }
    }
}

/* Compute the cost for the SLP instance INSTANCE.  */

static void
vect_analyze_slp_cost (slp_instance instance, void *data)
{
  stmt_vector_for_cost body_cost_vec, prologue_cost_vec;
  unsigned ncopies_for_cost;
  stmt_info_for_cost *si;
  unsigned i;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "=== vect_analyze_slp_cost ===\n");

  /* Calculate the number of vector stmts to create based on the unrolling
     factor (number of vectors is 1 if NUNITS >= GROUP_SIZE, and is
     GROUP_SIZE / NUNITS otherwise).  */
  unsigned group_size = SLP_INSTANCE_GROUP_SIZE (instance);
  slp_tree node = SLP_INSTANCE_TREE (instance);
  stmt_vec_info stmt_info = vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]);
  /* Adjust the group_size by the vectorization factor which is always one
     for basic-block vectorization.  */
  if (STMT_VINFO_LOOP_VINFO (stmt_info))
    group_size *= LOOP_VINFO_VECT_FACTOR (STMT_VINFO_LOOP_VINFO (stmt_info));
  unsigned nunits = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
  /* For reductions look at a reduction operand in case the reduction
     operation is widening like DOT_PROD or SAD.  */
  if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
    {
      gimple *stmt = SLP_TREE_SCALAR_STMTS (node)[0];
      switch (gimple_assign_rhs_code (stmt))
        {
        case DOT_PROD_EXPR:
        case SAD_EXPR:
          nunits = TYPE_VECTOR_SUBPARTS (get_vectype_for_scalar_type
                     (TREE_TYPE (gimple_assign_rhs1 (stmt))));
          break;
        default:;
        }
    }
  ncopies_for_cost = least_common_multiple (nunits, group_size) / nunits;
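  /* Worked example (editorial): with a four-unit vector type and an
     effective group size of 6 (e.g. a group of 3 scaled by a vectorization
     factor of 2), least_common_multiple (4, 6) == 12, so three vector
     stmts are created per scalar stmt in the group.  */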

  prologue_cost_vec.create (10);
  body_cost_vec.create (10);
  vect_analyze_slp_cost_1 (instance, SLP_INSTANCE_TREE (instance),
                           &prologue_cost_vec, &body_cost_vec,
                           ncopies_for_cost);

  /* Record the prologue costs, which were delayed until we were
     sure that SLP was successful.  */
  FOR_EACH_VEC_ELT (prologue_cost_vec, i, si)
    {
      struct _stmt_vec_info *stmt_info
        = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
      (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
                            si->misalign, vect_prologue);
    }

  /* Record the instance's instructions in the target cost model.  */
  FOR_EACH_VEC_ELT (body_cost_vec, i, si)
    {
      struct _stmt_vec_info *stmt_info
        = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
      (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
                            si->misalign, vect_body);
    }

  prologue_cost_vec.release ();
  body_cost_vec.release ();
}

/* Analyze an SLP instance starting from a group of grouped stores.  Call
   vect_build_slp_tree to build a tree of packed stmts if possible.
   Return FALSE if it's impossible to SLP any stmt in the loop.  */

static bool
vect_analyze_slp_instance (vec_info *vinfo,
                           gimple *stmt, unsigned max_tree_size)
{
  slp_instance new_instance;
  slp_tree node;
  unsigned int group_size = GROUP_SIZE (vinfo_for_stmt (stmt));
  unsigned int unrolling_factor = 1, nunits;
  tree vectype, scalar_type = NULL_TREE;
  gimple *next;
  unsigned int vectorization_factor = 0;
  int i;
  unsigned int max_nunits = 0;
  vec<slp_tree> loads;
  struct data_reference *dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
  vec<gimple *> scalar_stmts;

  if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
    {
      if (dr)
        {
          scalar_type = TREE_TYPE (DR_REF (dr));
          vectype = get_vectype_for_scalar_type (scalar_type);
        }
      else
        {
          gcc_assert (is_a <loop_vec_info> (vinfo));
          vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
        }

      group_size = GROUP_SIZE (vinfo_for_stmt (stmt));
    }
  else
    {
      gcc_assert (is_a <loop_vec_info> (vinfo));
      vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
      group_size = as_a <loop_vec_info> (vinfo)->reductions.length ();
    }

  if (!vectype)
    {
      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                           "Build SLP failed: unsupported data-type ");
          dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, scalar_type);
          dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
        }

      return false;
    }

  nunits = TYPE_VECTOR_SUBPARTS (vectype);
  if (is_a <loop_vec_info> (vinfo))
    vectorization_factor = as_a <loop_vec_info> (vinfo)->vectorization_factor;
  else
    vectorization_factor = nunits;

  /* Calculate the unrolling factor.  */
  unrolling_factor = least_common_multiple (nunits, group_size) / group_size;
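  /* Worked example (editorial): a store group of 2 with a four-unit vector
     type gives least_common_multiple (4, 2) / 2 == 2, i.e. the loop must be
     unrolled twice to fill a vector.  */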
  if (unrolling_factor != 1 && is_a <bb_vec_info> (vinfo))
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "Build SLP failed: unrolling required in basic"
                         " block SLP\n");

      return false;
    }

  /* Create a node (a root of the SLP tree) for the packed grouped stores.  */
  scalar_stmts.create (group_size);
  next = stmt;
  if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
    {
      /* Collect the stores and store them in SLP_TREE_SCALAR_STMTS.  */
      while (next)
        {
          if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (next))
              && STMT_VINFO_RELATED_STMT (vinfo_for_stmt (next)))
            scalar_stmts.safe_push (
                  STMT_VINFO_RELATED_STMT (vinfo_for_stmt (next)));
          else
            scalar_stmts.safe_push (next);
          next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
        }
      /* Mark the first element of the reduction chain as reduction to properly
         transform the node.  In the reduction analysis phase only the last
         element of the chain is marked as reduction.  */
      if (!STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt)))
        STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) = vect_reduction_def;
    }
  else
    {
      /* Collect reduction statements.  */
      vec<gimple *> reductions = as_a <loop_vec_info> (vinfo)->reductions;
      for (i = 0; reductions.iterate (i, &next); i++)
        scalar_stmts.safe_push (next);
    }

  node = vect_create_new_slp_node (scalar_stmts);

  loads.create (group_size);

  /* Build the tree for the SLP instance.  */
  bool *matches = XALLOCAVEC (bool, group_size);
  unsigned npermutes = 0;
  if (vect_build_slp_tree (vinfo, &node, group_size,
                           &max_nunits, &loads,
                           vectorization_factor, matches, &npermutes, NULL,
                           max_tree_size))
    {
      /* Calculate the unrolling factor based on the smallest type.  */
      if (max_nunits > nunits)
        unrolling_factor = least_common_multiple (max_nunits, group_size)
                           / group_size;

      if (unrolling_factor != 1 && is_a <bb_vec_info> (vinfo))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "Build SLP failed: unrolling required in basic"
                             " block SLP\n");
          vect_free_slp_tree (node);
          loads.release ();
          return false;
        }

      /* Create a new SLP instance.  */
      new_instance = XNEW (struct _slp_instance);
      SLP_INSTANCE_TREE (new_instance) = node;
      SLP_INSTANCE_GROUP_SIZE (new_instance) = group_size;
      SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor;
      SLP_INSTANCE_LOADS (new_instance) = loads;

      /* Compute the load permutation.  */
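      /* Example (editorial, illustrative only): if a load node reads
         b[3], b[2], b[1], b[0] in that order from an interleaving chain
         starting at b[0], the computed load_permutation is {3, 2, 1, 0}.  */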
      slp_tree load_node;
      bool loads_permuted = false;
      FOR_EACH_VEC_ELT (loads, i, load_node)
        {
          vec<unsigned> load_permutation;
          int j;
          gimple *load, *first_stmt;
          bool this_load_permuted = false;
          load_permutation.create (group_size);
          first_stmt = GROUP_FIRST_ELEMENT
              (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (load_node)[0]));
          FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load)
            {
              int load_place
                = vect_get_place_in_interleaving_chain (load, first_stmt);
              gcc_assert (load_place != -1);
              if (load_place != j)
                this_load_permuted = true;
              load_permutation.safe_push (load_place);
            }
          if (!this_load_permuted
              /* The load requires permutation when unrolling exposes
                 a gap either because the group is larger than the SLP
                 group-size or because there is a gap between the groups.  */
              && (unrolling_factor == 1
                  || (group_size == GROUP_SIZE (vinfo_for_stmt (first_stmt))
                      && GROUP_GAP (vinfo_for_stmt (first_stmt)) == 0)))
            {
              load_permutation.release ();
              continue;
            }
          SLP_TREE_LOAD_PERMUTATION (load_node) = load_permutation;
          loads_permuted = true;
        }

      if (loads_permuted)
        {
          if (!vect_supported_load_permutation_p (new_instance))
            {
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "Build SLP failed: unsupported load "
                                   "permutation ");
                  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
                  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                }
              vect_free_slp_instance (new_instance);
              return false;
            }
        }

      vinfo->slp_instances.safe_push (new_instance);

      if (dump_enabled_p ())
        vect_print_slp_tree (MSG_NOTE, node);

      return true;
    }

  /* Failed to SLP.  */
  /* Free the allocated memory.  */
  vect_free_slp_tree (node);
  loads.release ();

  return false;
}


/* Check if there are stmts in the loop that can be vectorized using SLP.
   Build SLP trees of packed scalar stmts if SLP is possible.  */

bool
vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
{
  unsigned int i;
  gimple *first_element;
  bool ok = false;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "=== vect_analyze_slp ===\n");

  /* Find SLP sequences starting from groups of grouped stores.  */
  FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
    if (vect_analyze_slp_instance (vinfo, first_element, max_tree_size))
      ok = true;

  if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
    {
      if (loop_vinfo->reduction_chains.length () > 0)
        {
          /* Find SLP sequences starting from reduction chains.  */
          FOR_EACH_VEC_ELT (loop_vinfo->reduction_chains, i, first_element)
            if (vect_analyze_slp_instance (vinfo, first_element,
                                           max_tree_size))
              ok = true;
            else
              return false;

1882 /* Don't try to vectorize SLP reductions if reduction chain was
1883 detected. */
1884 return ok;
1885 }
1886
1887 /* Find SLP sequences starting from groups of reductions. */
1888 if (loop_vinfo->reductions.length () > 1
1889 && vect_analyze_slp_instance (vinfo, loop_vinfo->reductions[0],
1890 max_tree_size))
1891 ok = true;
1892 }
1893
1894 return true;
1895 }
1896
1897
1898 /* For each possible SLP instance decide whether to SLP it and calculate the
1899 overall unrolling factor needed to SLP the loop. Return TRUE if we decided
1900 to SLP at least one instance. */
1901
1902 bool
1903 vect_make_slp_decision (loop_vec_info loop_vinfo)
1904 {
1905 unsigned int i, unrolling_factor = 1;
1906 vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
1907 slp_instance instance;
1908 int decided_to_slp = 0;
1909
1910 if (dump_enabled_p ())
1911 dump_printf_loc (MSG_NOTE, vect_location, "=== vect_make_slp_decision ==="
1912 "\n");
1913
1914 FOR_EACH_VEC_ELT (slp_instances, i, instance)
1915 {
1916 /* FORNOW: SLP if you can. */
1917 if (unrolling_factor < SLP_INSTANCE_UNROLLING_FACTOR (instance))
1918 unrolling_factor = SLP_INSTANCE_UNROLLING_FACTOR (instance);
1919
1920 /* Mark all the stmts that belong to INSTANCE as PURE_SLP stmts. Later we
1921 call vect_detect_hybrid_slp () to find stmts that need hybrid SLP and
1922 loop-based vectorization. Such stmts will be marked as HYBRID. */
1923 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance), pure_slp, -1);
1924 decided_to_slp++;
1925 }
1926
1927 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo) = unrolling_factor;
1928
1929 if (decided_to_slp && dump_enabled_p ())
1930 dump_printf_loc (MSG_NOTE, vect_location,
1931 "Decided to SLP %d instances. Unrolling factor %d\n",
1932 decided_to_slp, unrolling_factor);
1933
1934 return (decided_to_slp > 0);
1935 }
1936
1937
1938 /* Find stmts that must be both vectorized and SLPed (since they feed stmts that
1939 can't be SLPed) in the tree rooted at NODE. Mark such stmts as HYBRID. */
1940
1941 static void
1942 vect_detect_hybrid_slp_stmts (slp_tree node, unsigned i, slp_vect_type stype)
1943 {
1944 gimple *stmt = SLP_TREE_SCALAR_STMTS (node)[i];
1945 imm_use_iterator imm_iter;
1946 gimple *use_stmt;
1947 stmt_vec_info use_vinfo, stmt_vinfo = vinfo_for_stmt (stmt);
1948 slp_tree child;
1949 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
1950 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1951 int j;
1952
1953 /* Propagate hybrid down the SLP tree. */
1954 if (stype == hybrid)
1955 ;
1956 else if (HYBRID_SLP_STMT (stmt_vinfo))
1957 stype = hybrid;
1958 else
1959 {
1960 /* Check if a pure SLP stmt has uses in non-SLP stmts. */
1961 gcc_checking_assert (PURE_SLP_STMT (stmt_vinfo));
1962 /* We always get the pattern stmt here, but for immediate
1963 uses we have to use the LHS of the original stmt. */
1964 gcc_checking_assert (!STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
1965 if (STMT_VINFO_RELATED_STMT (stmt_vinfo))
1966 stmt = STMT_VINFO_RELATED_STMT (stmt_vinfo);
1967 if (TREE_CODE (gimple_op (stmt, 0)) == SSA_NAME)
1968 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, gimple_op (stmt, 0))
1969 {
1970 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
1971 continue;
1972 use_vinfo = vinfo_for_stmt (use_stmt);
1973 if (STMT_VINFO_IN_PATTERN_P (use_vinfo)
1974 && STMT_VINFO_RELATED_STMT (use_vinfo))
1975 use_vinfo = vinfo_for_stmt (STMT_VINFO_RELATED_STMT (use_vinfo));
1976 if (!STMT_SLP_TYPE (use_vinfo)
1977 && (STMT_VINFO_RELEVANT (use_vinfo)
1978 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (use_vinfo)))
1979 && !(gimple_code (use_stmt) == GIMPLE_PHI
1980 && STMT_VINFO_DEF_TYPE (use_vinfo) == vect_reduction_def))
1981 {
1982 if (dump_enabled_p ())
1983 {
1984 dump_printf_loc (MSG_NOTE, vect_location, "use of SLP "
1985 "def in non-SLP stmt: ");
1986 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, use_stmt, 0);
1987 }
1988 stype = hybrid;
1989 }
1990 }
1991 }
1992
1993 if (stype == hybrid
1994 && !HYBRID_SLP_STMT (stmt_vinfo))
1995 {
1996 if (dump_enabled_p ())
1997 {
1998 dump_printf_loc (MSG_NOTE, vect_location, "marking hybrid: ");
1999 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
2000 }
2001 STMT_SLP_TYPE (stmt_vinfo) = hybrid;
2002 }
2003
2004 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
2005 if (child)
2006 vect_detect_hybrid_slp_stmts (child, i, stype);
2007 }
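
/* For example (hypothetical): a pure-SLP multiplication whose result is
also consumed by a relevant loop stmt outside any SLP instance satisfies
the use check above; it is marked hybrid and the marking is propagated
down to its SLP children, so those stmts are handled both as part of the
SLP instance and by loop-based vectorization. */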
2008
2009 /* Helpers for vect_detect_hybrid_slp walking pattern stmt uses. */
2010
2011 static tree
2012 vect_detect_hybrid_slp_1 (tree *tp, int *, void *data)
2013 {
2014 walk_stmt_info *wi = (walk_stmt_info *)data;
2015 struct loop *loopp = (struct loop *)wi->info;
2016
2017 if (wi->is_lhs)
2018 return NULL_TREE;
2019
2020 if (TREE_CODE (*tp) == SSA_NAME
2021 && !SSA_NAME_IS_DEFAULT_DEF (*tp))
2022 {
2023 gimple *def_stmt = SSA_NAME_DEF_STMT (*tp);
2024 if (flow_bb_inside_loop_p (loopp, gimple_bb (def_stmt))
2025 && PURE_SLP_STMT (vinfo_for_stmt (def_stmt)))
2026 {
2027 if (dump_enabled_p ())
2028 {
2029 dump_printf_loc (MSG_NOTE, vect_location, "marking hybrid: ");
2030 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, def_stmt, 0);
2031 }
2032 STMT_SLP_TYPE (vinfo_for_stmt (def_stmt)) = hybrid;
2033 }
2034 }
2035
2036 return NULL_TREE;
2037 }
2038
2039 static tree
2040 vect_detect_hybrid_slp_2 (gimple_stmt_iterator *gsi, bool *handled,
2041 walk_stmt_info *)
2042 {
2043 /* If the stmt is in a SLP instance then this isn't a reason
2044 to mark use definitions in other SLP instances as hybrid. */
2045 if (STMT_SLP_TYPE (vinfo_for_stmt (gsi_stmt (*gsi))) != loop_vect)
2046 *handled = true;
2047 return NULL_TREE;
2048 }
2049
2050 /* Find stmts that must be both vectorized and SLPed. */
2051
2052 void
2053 vect_detect_hybrid_slp (loop_vec_info loop_vinfo)
2054 {
2055 unsigned int i;
2056 vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
2057 slp_instance instance;
2058
2059 if (dump_enabled_p ())
2060 dump_printf_loc (MSG_NOTE, vect_location, "=== vect_detect_hybrid_slp ==="
2061 "\n");
2062
2063 /* First walk all pattern stmts in the loop and mark the defs of their uses
2064 as hybrid, because immediate uses inside pattern stmts are not recorded. */
2065 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2066 {
2067 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2068 for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
2069 gsi_next (&gsi))
2070 {
2071 gimple *stmt = gsi_stmt (gsi);
2072 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2073 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2074 {
2075 walk_stmt_info wi;
2076 memset (&wi, 0, sizeof (wi));
2077 wi.info = LOOP_VINFO_LOOP (loop_vinfo);
2078 gimple_stmt_iterator gsi2
2079 = gsi_for_stmt (STMT_VINFO_RELATED_STMT (stmt_info));
2080 walk_gimple_stmt (&gsi2, vect_detect_hybrid_slp_2,
2081 vect_detect_hybrid_slp_1, &wi);
2082 walk_gimple_seq (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info),
2083 vect_detect_hybrid_slp_2,
2084 vect_detect_hybrid_slp_1, &wi);
2085 }
2086 }
2087 }
2088
2089 /* Then walk the SLP instance trees marking stmts with uses in
2090 non-SLP stmts as hybrid, also propagating hybrid down the
2091 SLP tree, collecting the above info on-the-fly. */
2092 FOR_EACH_VEC_ELT (slp_instances, i, instance)
2093 {
2094 for (unsigned i = 0; i < SLP_INSTANCE_GROUP_SIZE (instance); ++i)
2095 vect_detect_hybrid_slp_stmts (SLP_INSTANCE_TREE (instance),
2096 i, pure_slp);
2097 }
2098 }
2099
2100
2101 /* Create and initialize a new bb_vec_info struct for BB, as well as
2102 stmt_vec_info structs for all the stmts in it. */
2103
2104 static bb_vec_info
2105 new_bb_vec_info (basic_block bb)
2106 {
2107 bb_vec_info res = NULL;
2108 gimple_stmt_iterator gsi;
2109
2110 res = (bb_vec_info) xcalloc (1, sizeof (struct _bb_vec_info));
2111 res->kind = vec_info::bb;
2112 BB_VINFO_BB (res) = bb;
2113
2114 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
2115 {
2116 gimple *stmt = gsi_stmt (gsi);
2117 gimple_set_uid (stmt, 0);
2118 set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, res));
2119 }
2120
2121 BB_VINFO_GROUPED_STORES (res).create (10);
2122 BB_VINFO_SLP_INSTANCES (res).create (2);
2123 BB_VINFO_TARGET_COST_DATA (res) = init_cost (NULL);
2124
2125 bb->aux = res;
2126 return res;
2127 }
2128
2129
2130 /* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the
2131 stmts in the basic block. */
2132
2133 static void
2134 destroy_bb_vec_info (bb_vec_info bb_vinfo)
2135 {
2136 vec<slp_instance> slp_instances;
2137 slp_instance instance;
2138 basic_block bb;
2139 gimple_stmt_iterator si;
2140 unsigned i;
2141
2142 if (!bb_vinfo)
2143 return;
2144
2145 bb = BB_VINFO_BB (bb_vinfo);
2146
2147 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
2148 {
2149 gimple *stmt = gsi_stmt (si);
2150 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2151
2152 if (stmt_info)
2153 /* Free stmt_vec_info. */
2154 free_stmt_vec_info (stmt);
2155 }
2156
2157 vect_destroy_datarefs (bb_vinfo);
2158 free_dependence_relations (BB_VINFO_DDRS (bb_vinfo));
2159 BB_VINFO_GROUPED_STORES (bb_vinfo).release ();
2160 slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
2161 FOR_EACH_VEC_ELT (slp_instances, i, instance)
2162 vect_free_slp_instance (instance);
2163 BB_VINFO_SLP_INSTANCES (bb_vinfo).release ();
2164 destroy_cost_data (BB_VINFO_TARGET_COST_DATA (bb_vinfo));
2165 free (bb_vinfo);
2166 bb->aux = NULL;
2167 }
2168
2169
2170 /* Analyze statements contained in SLP tree node after recursively analyzing
2171 the subtree. Return TRUE if the operations are supported. */
2172
2173 static bool
2174 vect_slp_analyze_node_operations (slp_tree node)
2175 {
2176 bool dummy;
2177 int i;
2178 gimple *stmt;
2179 slp_tree child;
2180
2181 if (!node)
2182 return true;
2183
2184 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2185 if (!vect_slp_analyze_node_operations (child))
2186 return false;
2187
2188 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
2189 {
2190 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2191 gcc_assert (stmt_info);
2192 gcc_assert (STMT_SLP_TYPE (stmt_info) != loop_vect);
2193
2194 if (!vect_analyze_stmt (stmt, &dummy, node))
2195 return false;
2196 }
2197
2198 return true;
2199 }
2200
2201
2202 /* Analyze statements in SLP instances of the basic block. Return TRUE if the
2203 operations are supported. */
2204
2205 bool
2206 vect_slp_analyze_operations (vec<slp_instance> slp_instances, void *data)
2207 {
2208 slp_instance instance;
2209 int i;
2210
2211 if (dump_enabled_p ())
2212 dump_printf_loc (MSG_NOTE, vect_location,
2213 "=== vect_slp_analyze_operations ===\n");
2214
2215 for (i = 0; slp_instances.iterate (i, &instance); )
2216 {
2217 if (!vect_slp_analyze_node_operations (SLP_INSTANCE_TREE (instance)))
2218 {
2219 dump_printf_loc (MSG_NOTE, vect_location,
2220 "removing SLP instance operations starting from: ");
2221 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
2222 SLP_TREE_SCALAR_STMTS
2223 (SLP_INSTANCE_TREE (instance))[0], 0);
2224 vect_free_slp_instance (instance);
2225 slp_instances.ordered_remove (i);
2226 }
2227 else
2228 {
2229 /* Compute the costs of the SLP instance. */
2230 vect_analyze_slp_cost (instance, data);
2231 i++;
2232 }
2233 }
2234
2235 if (!slp_instances.length ())
2236 return false;
2237
2238 return true;
2239 }
2240
2241
2242 /* Compute the scalar cost of the SLP node NODE and its children
2243 and return it. Do not account for defs that are marked in LIFE, and
2244 update LIFE according to the uses of NODE. */
2245
2246 static unsigned
2247 vect_bb_slp_scalar_cost (basic_block bb,
2248 slp_tree node, vec<bool, va_heap> *life)
2249 {
2250 unsigned scalar_cost = 0;
2251 unsigned i;
2252 gimple *stmt;
2253 slp_tree child;
2254
2255 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
2256 {
2257 unsigned stmt_cost;
2258 ssa_op_iter op_iter;
2259 def_operand_p def_p;
2260 stmt_vec_info stmt_info;
2261
2262 if ((*life)[i])
2263 continue;
2264
2265 /* If there is a non-vectorized use of the defs then the scalar
2266 stmt is kept live, in which case we do not count it or any
2267 required defs in the SLP children in the scalar cost. This
2268 way we make the vectorization appear more costly when compared
2269 to the scalar cost. */
2270 FOR_EACH_SSA_DEF_OPERAND (def_p, stmt, op_iter, SSA_OP_DEF)
2271 {
2272 imm_use_iterator use_iter;
2273 gimple *use_stmt;
2274 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
2275 if (!is_gimple_debug (use_stmt)
2276 && (gimple_code (use_stmt) == GIMPLE_PHI
2277 || gimple_bb (use_stmt) != bb
2278 || !STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (use_stmt))))
2279 {
2280 (*life)[i] = true;
2281 BREAK_FROM_IMM_USE_STMT (use_iter);
2282 }
2283 }
2284 if ((*life)[i])
2285 continue;
2286
2287 stmt_info = vinfo_for_stmt (stmt);
2288 if (STMT_VINFO_DATA_REF (stmt_info))
2289 {
2290 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
2291 stmt_cost = vect_get_stmt_cost (scalar_load);
2292 else
2293 stmt_cost = vect_get_stmt_cost (scalar_store);
2294 }
2295 else
2296 stmt_cost = vect_get_stmt_cost (scalar_stmt);
2297
2298 scalar_cost += stmt_cost;
2299 }
2300
2301 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2302 if (child)
2303 scalar_cost += vect_bb_slp_scalar_cost (bb, child, life);
2304
2305 return scalar_cost;
2306 }
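
/* For illustration (hypothetical): an SLP addition whose result is also
used by a PHI, or by a stmt outside BB, gets its entry in LIFE set above
and its scalar_stmt cost is not added; the scalar cost of the block
shrinks, so vectorization has to beat a smaller number in
vect_bb_vectorization_profitable_p below. */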
2307
2308 /* Check if vectorization of the basic block is profitable. */
2309
2310 static bool
2311 vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo)
2312 {
2313 vec<slp_instance> slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
2314 slp_instance instance;
2315 int i;
2316 unsigned int vec_inside_cost = 0, vec_outside_cost = 0, scalar_cost = 0;
2317 unsigned int vec_prologue_cost = 0, vec_epilogue_cost = 0;
2318
2319 /* Calculate scalar cost. */
2320 FOR_EACH_VEC_ELT (slp_instances, i, instance)
2321 {
2322 auto_vec<bool, 20> life;
2323 life.safe_grow_cleared (SLP_INSTANCE_GROUP_SIZE (instance));
2324 scalar_cost += vect_bb_slp_scalar_cost (BB_VINFO_BB (bb_vinfo),
2325 SLP_INSTANCE_TREE (instance),
2326 &life);
2327 }
2328
2329 /* Complete the target-specific cost calculation. */
2330 finish_cost (BB_VINFO_TARGET_COST_DATA (bb_vinfo), &vec_prologue_cost,
2331 &vec_inside_cost, &vec_epilogue_cost);
2332
2333 vec_outside_cost = vec_prologue_cost + vec_epilogue_cost;
2334
2335 if (dump_enabled_p ())
2336 {
2337 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
2338 dump_printf (MSG_NOTE, " Vector inside of basic block cost: %d\n",
2339 vec_inside_cost);
2340 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n", vec_prologue_cost);
2341 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n", vec_epilogue_cost);
2342 dump_printf (MSG_NOTE, " Scalar cost of basic block: %d\n", scalar_cost);
2343 }
2344
2345 /* Vectorization is profitable if its cost is less than the cost of the
2346 scalar version. */
2347 if (vec_outside_cost + vec_inside_cost >= scalar_cost)
2348 return false;
2349
2350 return true;
2351 }
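
/* A worked example with made-up costs: scalar_cost == 12,
vec_inside_cost == 6, vec_prologue_cost == 2 and vec_epilogue_cost == 1
give 6 + (2 + 1) == 9 < 12, so the function above returns true; with
vec_inside_cost == 10 the sum would be 13 >= 12 and the basic block
would not be vectorized. */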
2352
2353 /* Check if the basic block can be vectorized. */
2354
2355 static bb_vec_info
2356 vect_slp_analyze_bb_1 (basic_block bb)
2357 {
2358 bb_vec_info bb_vinfo;
2359 vec<slp_instance> slp_instances;
2360 slp_instance instance;
2361 int i;
2362 int min_vf = 2;
2363 unsigned n_stmts = 0;
2364
2365 bb_vinfo = new_bb_vec_info (bb);
2366 if (!bb_vinfo)
2367 return NULL;
2368
2369 if (!vect_analyze_data_refs (bb_vinfo, &min_vf, &n_stmts))
2370 {
2371 if (dump_enabled_p ())
2372 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2373 "not vectorized: unhandled data-ref in basic "
2374 "block.\n");
2375
2376 destroy_bb_vec_info (bb_vinfo);
2377 return NULL;
2378 }
2379
2380 if (BB_VINFO_DATAREFS (bb_vinfo).length () < 2)
2381 {
2382 if (dump_enabled_p ())
2383 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2384 "not vectorized: not enough data-refs in "
2385 "basic block.\n");
2386
2387 destroy_bb_vec_info (bb_vinfo);
2388 return NULL;
2389 }
2390
2391 if (!vect_analyze_data_ref_accesses (bb_vinfo))
2392 {
2393 if (dump_enabled_p ())
2394 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2395 "not vectorized: unhandled data access in "
2396 "basic block.\n");
2397
2398 destroy_bb_vec_info (bb_vinfo);
2399 return NULL;
2400 }
2401
2402 vect_pattern_recog (bb_vinfo);
2403
2404 if (!vect_analyze_data_refs_alignment (bb_vinfo))
2405 {
2406 if (dump_enabled_p ())
2407 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2408 "not vectorized: bad data alignment in basic "
2409 "block.\n");
2410
2411 destroy_bb_vec_info (bb_vinfo);
2412 return NULL;
2413 }
2414
2415 /* Check the SLP opportunities in the basic block, analyze and build SLP
2416 trees. */
2417 if (!vect_analyze_slp (bb_vinfo, n_stmts))
2418 {
2419 if (dump_enabled_p ())
2420 {
2421 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2422 "Failed to SLP the basic block.\n");
2423 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2424 "not vectorized: failed to find SLP opportunities "
2425 "in basic block.\n");
2426 }
2427
2428 destroy_bb_vec_info (bb_vinfo);
2429 return NULL;
2430 }
2431
2432 slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
2433
2434 /* Mark all the statements that we want to vectorize as pure SLP and
2435 relevant. */
2436 FOR_EACH_VEC_ELT (slp_instances, i, instance)
2437 {
2438 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance), pure_slp, -1);
2439 vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance));
2440 }
2441
2442 /* Mark all the statements that we do not want to vectorize. */
2443 for (gimple_stmt_iterator gsi = gsi_start_bb (BB_VINFO_BB (bb_vinfo));
2444 !gsi_end_p (gsi); gsi_next (&gsi))
2445 {
2446 stmt_vec_info vinfo = vinfo_for_stmt (gsi_stmt (gsi));
2447 if (STMT_SLP_TYPE (vinfo) != pure_slp)
2448 STMT_VINFO_VECTORIZABLE (vinfo) = false;
2449 }
2450
2451 /* Analyze dependences. At this point all stmts not participating in
2452 vectorization have to be marked. Dependence analysis assumes
2453 that we either vectorize all SLP instances or none at all. */
2454 if (!vect_slp_analyze_data_ref_dependences (bb_vinfo))
2455 {
2456 if (dump_enabled_p ())
2457 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2458 "not vectorized: unhandled data dependence "
2459 "in basic block.\n");
2460
2461 destroy_bb_vec_info (bb_vinfo);
2462 return NULL;
2463 }
2464
2465 if (!vect_verify_datarefs_alignment (bb_vinfo))
2466 {
2467 if (dump_enabled_p ())
2468 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2469 "not vectorized: unsupported alignment in basic "
2470 "block.\n");
2471 destroy_bb_vec_info (bb_vinfo);
2472 return NULL;
2473 }
2474
2475 if (!vect_slp_analyze_operations (BB_VINFO_SLP_INSTANCES (bb_vinfo),
2476 BB_VINFO_TARGET_COST_DATA (bb_vinfo)))
2477 {
2478 if (dump_enabled_p ())
2479 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2480 "not vectorized: bad operation in basic block.\n");
2481
2482 destroy_bb_vec_info (bb_vinfo);
2483 return NULL;
2484 }
2485
2486 /* Cost model: check if the vectorization is worthwhile. */
2487 if (!unlimited_cost_model (NULL)
2488 && !vect_bb_vectorization_profitable_p (bb_vinfo))
2489 {
2490 if (dump_enabled_p ())
2491 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2492 "not vectorized: vectorization is not "
2493 "profitable.\n");
2494
2495 destroy_bb_vec_info (bb_vinfo);
2496 return NULL;
2497 }
2498
2499 if (dump_enabled_p ())
2500 dump_printf_loc (MSG_NOTE, vect_location,
2501 "Basic block will be vectorized using SLP\n");
2502
2503 return bb_vinfo;
2504 }
2505
2506
2507 bb_vec_info
2508 vect_slp_analyze_bb (basic_block bb)
2509 {
2510 bb_vec_info bb_vinfo;
2511 int insns = 0;
2512 gimple_stmt_iterator gsi;
2513 unsigned int vector_sizes;
2514
2515 if (dump_enabled_p ())
2516 dump_printf_loc (MSG_NOTE, vect_location, "===vect_slp_analyze_bb===\n");
2517
2518 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
2519 {
2520 gimple *stmt = gsi_stmt (gsi);
2521 if (!is_gimple_debug (stmt)
2522 && !gimple_nop_p (stmt)
2523 && gimple_code (stmt) != GIMPLE_LABEL)
2524 insns++;
2525 }
2526
2527 if (insns > PARAM_VALUE (PARAM_SLP_MAX_INSNS_IN_BB))
2528 {
2529 if (dump_enabled_p ())
2530 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2531 "not vectorized: too many instructions in "
2532 "basic block.\n");
2533
2534 return NULL;
2535 }
2536
2537 /* Autodetect the first vector size we try. */
2538 current_vector_size = 0;
2539 vector_sizes = targetm.vectorize.autovectorize_vector_sizes ();
2540
2541 while (1)
2542 {
2543 bb_vinfo = vect_slp_analyze_bb_1 (bb);
2544 if (bb_vinfo)
2545 return bb_vinfo;
2546
2547 destroy_bb_vec_info (bb_vinfo);
2548
2549 vector_sizes &= ~current_vector_size;
2550 if (vector_sizes == 0
2551 || current_vector_size == 0)
2552 return NULL;
2553
2554 /* Try the next biggest vector size. */
2555 current_vector_size = 1 << floor_log2 (vector_sizes);
2556 if (dump_enabled_p ())
2557 dump_printf_loc (MSG_NOTE, vect_location,
2558 "***** Re-trying analysis with "
2559 "vector size %d\n", current_vector_size);
2560 }
2561 }
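
/* A sketch of the retry loop above, assuming a hypothetical target whose
autovectorize_vector_sizes hook returns 32 | 16 and where analysis
autodetects a 32-byte vector size first: if analysis at 32 fails, 32 is
masked out of vector_sizes and analysis is retried with
current_vector_size == 1 << floor_log2 (16) == 16; if that fails as
well, vector_sizes reaches zero and NULL is returned. */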
2562
2563
2564 /* For constant and loop invariant defs of SLP_NODE this function returns
2565 (vector) defs (VEC_OPRNDS) that will be used in the vectorized stmts.
2566 OP_NUM determines if we gather defs for operand 0 or operand 1 of the RHS of
2567 scalar stmts. NUMBER_OF_VECTORS is the number of vector defs to create.
2568 REDUC_INDEX is the index of the reduction operand in the statements, unless
2569 it is -1. */
2570
2571 static void
2572 vect_get_constant_vectors (tree op, slp_tree slp_node,
2573 vec<tree> *vec_oprnds,
2574 unsigned int op_num, unsigned int number_of_vectors,
2575 int reduc_index)
2576 {
2577 vec<gimple *> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
2578 gimple *stmt = stmts[0];
2579 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
2580 unsigned nunits;
2581 tree vec_cst;
2582 tree *elts;
2583 unsigned j, number_of_places_left_in_vector;
2584 tree vector_type;
2585 tree vop;
2586 int group_size = stmts.length ();
2587 unsigned int vec_num, i;
2588 unsigned number_of_copies = 1;
2589 vec<tree> voprnds;
2590 voprnds.create (number_of_vectors);
2591 bool constant_p, is_store;
2592 tree neutral_op = NULL;
2593 enum tree_code code = gimple_expr_code (stmt);
2594 gimple *def_stmt;
2595 struct loop *loop;
2596 gimple_seq ctor_seq = NULL;
2597
2598 vector_type = get_vectype_for_scalar_type (TREE_TYPE (op));
2599 nunits = TYPE_VECTOR_SUBPARTS (vector_type);
2600
2601 if (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def
2602 && reduc_index != -1)
2603 {
2604 op_num = reduc_index;
2605 op = gimple_op (stmt, op_num + 1);
2606 /* For additional copies (see the explanation of NUMBER_OF_COPIES below)
2607 we need either neutral operands or the original operands. See
2608 get_initial_def_for_reduction() for details. */
2609 switch (code)
2610 {
2611 case WIDEN_SUM_EXPR:
2612 case DOT_PROD_EXPR:
2613 case SAD_EXPR:
2614 case PLUS_EXPR:
2615 case MINUS_EXPR:
2616 case BIT_IOR_EXPR:
2617 case BIT_XOR_EXPR:
2618 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (op)))
2619 neutral_op = build_real (TREE_TYPE (op), dconst0);
2620 else
2621 neutral_op = build_int_cst (TREE_TYPE (op), 0);
2622
2623 break;
2624
2625 case MULT_EXPR:
2626 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (op)))
2627 neutral_op = build_real (TREE_TYPE (op), dconst1);
2628 else
2629 neutral_op = build_int_cst (TREE_TYPE (op), 1);
2630
2631 break;
2632
2633 case BIT_AND_EXPR:
2634 neutral_op = build_int_cst (TREE_TYPE (op), -1);
2635 break;
2636
2637 /* For MIN/MAX we don't have an easy neutral operand but
2638 the initial values can be used fine here. Only for
2639 a reduction chain we have to force a neutral element. */
2640 case MAX_EXPR:
2641 case MIN_EXPR:
2642 if (!GROUP_FIRST_ELEMENT (stmt_vinfo))
2643 neutral_op = NULL;
2644 else
2645 {
2646 def_stmt = SSA_NAME_DEF_STMT (op);
2647 loop = (gimple_bb (stmt))->loop_father;
2648 neutral_op = PHI_ARG_DEF_FROM_EDGE (def_stmt,
2649 loop_preheader_edge (loop));
2650 }
2651 break;
2652
2653 default:
2654 gcc_assert (!GROUP_FIRST_ELEMENT (stmt_vinfo));
2655 neutral_op = NULL;
2656 }
2657 }
2658
2659 if (STMT_VINFO_DATA_REF (stmt_vinfo))
2660 {
2661 is_store = true;
2662 op = gimple_assign_rhs1 (stmt);
2663 }
2664 else
2665 is_store = false;
2666
2667 gcc_assert (op);
2668
2669 if (CONSTANT_CLASS_P (op))
2670 constant_p = true;
2671 else
2672 constant_p = false;
2673
2674 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
2675 created vectors. It is greater than 1 if unrolling is performed.
2676
2677 For example, we have two scalar operands, s1 and s2 (e.g., group of
2678 strided accesses of size two), while NUNITS is four (i.e., four scalars
2679 of this type can be packed in a vector). The output vector will contain
2680 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
2681 will be 2).
2682
2683 If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
2684 containing the operands.
2685
2686 For example, NUNITS is four as before, and the group size is 8
2687 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
2688 {s5, s6, s7, s8}. */
2689
2690 number_of_copies = nunits * number_of_vectors / group_size;
2691
2692 number_of_places_left_in_vector = nunits;
2693 elts = XALLOCAVEC (tree, nunits);
2694 bool place_after_defs = false;
2695 for (j = 0; j < number_of_copies; j++)
2696 {
2697 for (i = group_size - 1; stmts.iterate (i, &stmt); i--)
2698 {
2699 if (is_store)
2700 op = gimple_assign_rhs1 (stmt);
2701 else
2702 {
2703 switch (code)
2704 {
2705 case COND_EXPR:
2706 if (op_num == 0 || op_num == 1)
2707 {
2708 tree cond = gimple_assign_rhs1 (stmt);
2709 op = TREE_OPERAND (cond, op_num);
2710 }
2711 else
2712 {
2713 if (op_num == 2)
2714 op = gimple_assign_rhs2 (stmt);
2715 else
2716 op = gimple_assign_rhs3 (stmt);
2717 }
2718 break;
2719
2720 case CALL_EXPR:
2721 op = gimple_call_arg (stmt, op_num);
2722 break;
2723
2724 case LSHIFT_EXPR:
2725 case RSHIFT_EXPR:
2726 case LROTATE_EXPR:
2727 case RROTATE_EXPR:
2728 op = gimple_op (stmt, op_num + 1);
2729 /* Unlike the other binary operators, shifts/rotates have
2730 the shift count being int, instead of the same type as
2731 the lhs, so make sure the scalar is the right type if
2732 we are dealing with vectors of
2733 long long/long/short/char. */
2734 if (op_num == 1 && TREE_CODE (op) == INTEGER_CST)
2735 op = fold_convert (TREE_TYPE (vector_type), op);
2736 break;
2737
2738 default:
2739 op = gimple_op (stmt, op_num + 1);
2740 break;
2741 }
2742 }
2743
2744 if (reduc_index != -1)
2745 {
2746 loop = (gimple_bb (stmt))->loop_father;
2747 def_stmt = SSA_NAME_DEF_STMT (op);
2748
2749 gcc_assert (loop);
2750
2751 /* Get the def before the loop. In reduction chain we have only
2752 one initial value. */
2753 if ((j != (number_of_copies - 1)
2754 || (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt))
2755 && i != 0))
2756 && neutral_op)
2757 op = neutral_op;
2758 else
2759 op = PHI_ARG_DEF_FROM_EDGE (def_stmt,
2760 loop_preheader_edge (loop));
2761 }
2762
2763 /* Create 'vect_ = {op0,op1,...,opn}'. */
2764 number_of_places_left_in_vector--;
2765 tree orig_op = op;
2766 if (!types_compatible_p (TREE_TYPE (vector_type), TREE_TYPE (op)))
2767 {
2768 if (CONSTANT_CLASS_P (op))
2769 {
2770 op = fold_unary (VIEW_CONVERT_EXPR,
2771 TREE_TYPE (vector_type), op);
2772 gcc_assert (op && CONSTANT_CLASS_P (op));
2773 }
2774 else
2775 {
2776 tree new_temp = make_ssa_name (TREE_TYPE (vector_type));
2777 gimple *init_stmt;
2778 op = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (vector_type), op);
2779 init_stmt
2780 = gimple_build_assign (new_temp, VIEW_CONVERT_EXPR, op);
2781 gimple_seq_add_stmt (&ctor_seq, init_stmt);
2782 op = new_temp;
2783 }
2784 }
2785 elts[number_of_places_left_in_vector] = op;
2786 if (!CONSTANT_CLASS_P (op))
2787 constant_p = false;
2788 if (TREE_CODE (orig_op) == SSA_NAME
2789 && !SSA_NAME_IS_DEFAULT_DEF (orig_op)
2790 && STMT_VINFO_BB_VINFO (stmt_vinfo)
2791 && (STMT_VINFO_BB_VINFO (stmt_vinfo)->bb
2792 == gimple_bb (SSA_NAME_DEF_STMT (orig_op))))
2793 place_after_defs = true;
2794
2795 if (number_of_places_left_in_vector == 0)
2796 {
2797 number_of_places_left_in_vector = nunits;
2798
2799 if (constant_p)
2800 vec_cst = build_vector (vector_type, elts);
2801 else
2802 {
2803 vec<constructor_elt, va_gc> *v;
2804 unsigned k;
2805 vec_alloc (v, nunits);
2806 for (k = 0; k < nunits; ++k)
2807 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, elts[k]);
2808 vec_cst = build_constructor (vector_type, v);
2809 }
2810 tree init;
2811 gimple_stmt_iterator gsi;
2812 if (place_after_defs)
2813 {
2814 gsi = gsi_for_stmt
2815 (vect_find_last_scalar_stmt_in_slp (slp_node));
2816 init = vect_init_vector (stmt, vec_cst, vector_type, &gsi);
2817 }
2818 else
2819 init = vect_init_vector (stmt, vec_cst, vector_type, NULL);
2820 if (ctor_seq != NULL)
2821 {
2822 gsi = gsi_for_stmt (SSA_NAME_DEF_STMT (init));
2823 gsi_insert_seq_before_without_update (&gsi, ctor_seq,
2824 GSI_SAME_STMT);
2825 ctor_seq = NULL;
2826 }
2827 voprnds.quick_push (init);
2828 place_after_defs = false;
2829 }
2830 }
2831 }
2832
2833 /* Since the vectors are created in the reverse order, we should invert
2834 them. */
2835 vec_num = voprnds.length ();
2836 for (j = vec_num; j != 0; j--)
2837 {
2838 vop = voprnds[j - 1];
2839 vec_oprnds->quick_push (vop);
2840 }
2841
2842 voprnds.release ();
2843
2844 /* In case VF is greater than the unrolling factor needed for the SLP
2845 group of stmts, the NUMBER_OF_VECTORS to be created is greater than
2846 NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
2847 to replicate the vectors. */
2848 while (number_of_vectors > vec_oprnds->length ())
2849 {
2850 tree neutral_vec = NULL;
2851
2852 if (neutral_op)
2853 {
2854 if (!neutral_vec)
2855 neutral_vec = build_vector_from_val (vector_type, neutral_op);
2856
2857 vec_oprnds->quick_push (neutral_vec);
2858 }
2859 else
2860 {
2861 for (i = 0; vec_oprnds->iterate (i, &vop) && i < vec_num; i++)
2862 vec_oprnds->quick_push (vop);
2863 }
2864 }
2865 }
2866
2867
2868 /* Get vectorized definitions from SLP_NODE that contains corresponding
2869 vectorized def-stmts. */
2870
2871 static void
2872 vect_get_slp_vect_defs (slp_tree slp_node, vec<tree> *vec_oprnds)
2873 {
2874 tree vec_oprnd;
2875 gimple *vec_def_stmt;
2876 unsigned int i;
2877
2878 gcc_assert (SLP_TREE_VEC_STMTS (slp_node).exists ());
2879
2880 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (slp_node), i, vec_def_stmt)
2881 {
2882 gcc_assert (vec_def_stmt);
2883 vec_oprnd = gimple_get_lhs (vec_def_stmt);
2884 vec_oprnds->quick_push (vec_oprnd);
2885 }
2886 }
2887
2888
2889 /* Get vectorized definitions for SLP_NODE.
2890 If the scalar definitions are loop invariants or constants, collect them and
2891 call vect_get_constant_vectors() to create vector stmts.
2892 Otherwise, the def-stmts must be already vectorized and the vectorized stmts
2893 must be stored in the corresponding child of SLP_NODE, and we call
2894 vect_get_slp_vect_defs () to retrieve them. */
2895
2896 void
2897 vect_get_slp_defs (vec<tree> ops, slp_tree slp_node,
2898 vec<vec<tree> > *vec_oprnds, int reduc_index)
2899 {
2900 gimple *first_stmt;
2901 int number_of_vects = 0, i;
2902 unsigned int child_index = 0;
2903 HOST_WIDE_INT lhs_size_unit, rhs_size_unit;
2904 slp_tree child = NULL;
2905 vec<tree> vec_defs;
2906 tree oprnd;
2907 bool vectorized_defs;
2908
2909 first_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[0];
2910 FOR_EACH_VEC_ELT (ops, i, oprnd)
2911 {
2912 /* For each operand we check if it has vectorized definitions in a child
2913 node or we need to create them (for invariants and constants). We
2914 check if the LHS of the first stmt of the next child matches OPRND.
2915 If it does, we found the correct child. Otherwise, we call
2916 vect_get_constant_vectors () and do not advance CHILD_INDEX, so that
2917 this child node is checked again for the next operand. */
2918 vectorized_defs = false;
2919 if (SLP_TREE_CHILDREN (slp_node).length () > child_index)
2920 {
2921 child = SLP_TREE_CHILDREN (slp_node)[child_index];
2922
2923 /* We have to check both pattern and original def, if available. */
2924 if (child)
2925 {
2926 gimple *first_def = SLP_TREE_SCALAR_STMTS (child)[0];
2927 gimple *related
2928 = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (first_def));
2929
2930 if (operand_equal_p (oprnd, gimple_get_lhs (first_def), 0)
2931 || (related
2932 && operand_equal_p (oprnd, gimple_get_lhs (related), 0)))
2933 {
2934 /* The number of vector defs is determined by the number of
2935 vector statements in the node from which we get those
2936 statements. */
2937 number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (child);
2938 vectorized_defs = true;
2939 child_index++;
2940 }
2941 }
2942 else
2943 child_index++;
2944 }
2945
2946 if (!vectorized_defs)
2947 {
2948 if (i == 0)
2949 {
2950 number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
2951 /* The number of vector stmts was calculated according to the LHS in
2952 vect_schedule_slp_instance (); fix it by replacing LHS with
2953 RHS, if necessary. See vect_get_smallest_scalar_type () for
2954 details. */
2955 vect_get_smallest_scalar_type (first_stmt, &lhs_size_unit,
2956 &rhs_size_unit);
2957 if (rhs_size_unit != lhs_size_unit)
2958 {
2959 number_of_vects *= rhs_size_unit;
2960 number_of_vects /= lhs_size_unit;
2961 }
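/* E.g., for a hypothetical truncation from int to short,
rhs_size_unit == 4 and lhs_size_unit == 2, so twice as many input
vectors are needed as there are output vectors. */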
2962 }
2963 }
2964
2965 /* Allocate memory for vectorized defs. */
2966 vec_defs = vNULL;
2967 vec_defs.create (number_of_vects);
2968
2969 /* For reduction defs we call vect_get_constant_vectors (), since we are
2970 looking for initial loop invariant values. */
2971 if (vectorized_defs && reduc_index == -1)
2972 /* The defs are already vectorized. */
2973 vect_get_slp_vect_defs (child, &vec_defs);
2974 else
2975 /* Build vectors from scalar defs. */
2976 vect_get_constant_vectors (oprnd, slp_node, &vec_defs, i,
2977 number_of_vects, reduc_index);
2978
2979 vec_oprnds->quick_push (vec_defs);
2980
2981 /* For reductions, we only need initial values. */
2982 if (reduc_index != -1)
2983 return;
2984 }
2985 }
2986
2987
2988 /* Create NCOPIES permutation statements using the mask MASK_BYTES (by
2989 building a vector of type MASK_TYPE from it) and two input vectors placed in
2990 DR_CHAIN at FIRST_VEC_INDX and SECOND_VEC_INDX for the first copy,
2991 shifting by STRIDE elements of DR_CHAIN for each subsequent copy.
2992 (STRIDE is the number of vectorized stmts for NODE divided by the number of
2993 copies).
2994 VECT_STMTS_COUNTER specifies the index in the vectorized stmts of NODE, where
2995 the created stmts must be inserted. */
2996
2997 static inline void
2998 vect_create_mask_and_perm (gimple *stmt,
2999 tree mask, int first_vec_indx, int second_vec_indx,
3000 gimple_stmt_iterator *gsi, slp_tree node,
3001 tree vectype, vec<tree> dr_chain,
3002 int ncopies, int vect_stmts_counter)
3003 {
3004 tree perm_dest;
3005 gimple *perm_stmt = NULL;
3006 int i, stride;
3007 tree first_vec, second_vec, data_ref;
3008
3009 stride = SLP_TREE_NUMBER_OF_VEC_STMTS (node) / ncopies;
3010
3011 /* Initialize the vect stmts of NODE to properly insert the generated
3012 stmts later. */
3013 for (i = SLP_TREE_VEC_STMTS (node).length ();
3014 i < (int) SLP_TREE_NUMBER_OF_VEC_STMTS (node); i++)
3015 SLP_TREE_VEC_STMTS (node).quick_push (NULL);
3016
3017 perm_dest = vect_create_destination_var (gimple_assign_lhs (stmt), vectype);
3018 for (i = 0; i < ncopies; i++)
3019 {
3020 first_vec = dr_chain[first_vec_indx];
3021 second_vec = dr_chain[second_vec_indx];
3022
3023 /* Generate the permute statement. */
3024 perm_stmt = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
3025 first_vec, second_vec, mask);
3026 data_ref = make_ssa_name (perm_dest, perm_stmt);
3027 gimple_set_lhs (perm_stmt, data_ref);
3028 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
3029
3030 /* Store the vector statement in NODE. */
3031 SLP_TREE_VEC_STMTS (node)[stride * i + vect_stmts_counter] = perm_stmt;
3032
3033 first_vec_indx += stride;
3034 second_vec_indx += stride;
3035 }
3036 }
3037
3038
3039 /* Given FIRST_MASK_ELEMENT - the mask element in element representation,
3040 return in CURRENT_MASK_ELEMENT its equivalent in target-specific
3041 representation. Check that the mask is valid and return FALSE if not.
3042 Return TRUE in NEED_NEXT_VECTOR if the permutation requires moving to
3043 the next vector, i.e., the current first vector is not needed. */
3044
3045 static bool
3046 vect_get_mask_element (gimple *stmt, int first_mask_element, int m,
3047 int mask_nunits, bool only_one_vec, int index,
3048 unsigned char *mask, int *current_mask_element,
3049 bool *need_next_vector, int *number_of_mask_fixes,
3050 bool *mask_fixed, bool *needs_first_vector)
3051 {
3052 int i;
3053
3054 /* Convert to target specific representation. */
3055 *current_mask_element = first_mask_element + m;
3056 /* Adjust the value in case it's a mask for the second and third vectors. */
3057 *current_mask_element -= mask_nunits * (*number_of_mask_fixes - 1);
3058
3059 if (*current_mask_element < 0)
3060 {
3061 if (dump_enabled_p ())
3062 {
3063 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3064 "permutation requires past vector ");
3065 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3066 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3067 }
3068 return false;
3069 }
3070
3071 if (*current_mask_element < mask_nunits)
3072 *needs_first_vector = true;
3073
3074 /* We have only one input vector to permute but the mask accesses values in
3075 the next vector as well. */
3076 if (only_one_vec && *current_mask_element >= mask_nunits)
3077 {
3078 if (dump_enabled_p ())
3079 {
3080 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3081 "permutation requires at least two vectors ");
3082 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3083 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3084 }
3085
3086 return false;
3087 }
3088
3089 /* The mask requires the next vector. */
3090 while (*current_mask_element >= mask_nunits * 2)
3091 {
3092 if (*needs_first_vector || *mask_fixed)
3093 {
3094 /* We either need the first vector too or have already moved to the
3095 next vector. In both cases, this permutation needs three
3096 vectors. */
3097 if (dump_enabled_p ())
3098 {
3099 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3100 "permutation requires at "
3101 "least three vectors ");
3102 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3103 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3104 }
3105
3106 return false;
3107 }
3108
3109 /* We move to the next vector, dropping the first one and working with
3110 the second and the third - we need to adjust the values of the mask
3111 accordingly. */
3112 *current_mask_element -= mask_nunits * *number_of_mask_fixes;
3113
3114 for (i = 0; i < index; i++)
3115 mask[i] -= mask_nunits * *number_of_mask_fixes;
3116
3117 (*number_of_mask_fixes)++;
3118 *mask_fixed = true;
3119 }
3120
3121 *need_next_vector = *mask_fixed;
3122
3123 /* This was the last element of this mask. Start a new one. */
3124 if (index == mask_nunits - 1)
3125 {
3126 *number_of_mask_fixes = 1;
3127 *mask_fixed = false;
3128 *needs_first_vector = false;
3129 }
3130
3131 return true;
3132 }
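
/* An illustration with made-up numbers: for mask_nunits == 4, a requested
element 9 initially refers to a third input vector; the while loop above
rewrites it to 9 - 4 == 5, bumps *number_of_mask_fixes and also shifts
the mask elements already emitted, which is exactly the {6,9,9,9} ->
{2,5,5,5} conversion described before vect_transform_slp_perm_load
below. */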
3133
3134
3135 /* Generate vector permute statements from a list of loads in DR_CHAIN.
3136 If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
3137 permute statements for the SLP node NODE of the SLP instance
3138 SLP_NODE_INSTANCE. */
3139
3140 bool
3141 vect_transform_slp_perm_load (slp_tree node, vec<tree> dr_chain,
3142 gimple_stmt_iterator *gsi, int vf,
3143 slp_instance slp_node_instance, bool analyze_only)
3144 {
3145 gimple *stmt = SLP_TREE_SCALAR_STMTS (node)[0];
3146 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3147 tree mask_element_type = NULL_TREE, mask_type;
3148 int i, j, k, nunits, vec_index = 0;
3149 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3150 int group_size = SLP_INSTANCE_GROUP_SIZE (slp_node_instance);
3151 int first_mask_element;
3152 int index, unroll_factor, current_mask_element, ncopies;
3153 unsigned char *mask;
3154 bool only_one_vec = false, need_next_vector = false;
3155 int first_vec_index, second_vec_index, orig_vec_stmts_num, vect_stmts_counter;
3156 int number_of_mask_fixes = 1;
3157 bool mask_fixed = false;
3158 bool needs_first_vector = false;
3159 machine_mode mode;
3160
3161 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
3162 return false;
3163
3164 stmt_info = vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info));
3165
3166 mode = TYPE_MODE (vectype);
3167
3168 if (!can_vec_perm_p (mode, false, NULL))
3169 {
3170 if (dump_enabled_p ())
3171 {
3172 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3173 "no vect permute for ");
3174 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3175 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3176 }
3177 return false;
3178 }
3179
3180 /* The generic VEC_PERM_EXPR code always uses an integral type of the
3181 same size as the vector element being permuted. */
3182 mask_element_type = lang_hooks.types.type_for_mode
3183 (int_mode_for_mode (TYPE_MODE (TREE_TYPE (vectype))), 1);
3184 mask_type = get_vectype_for_scalar_type (mask_element_type);
3185 nunits = TYPE_VECTOR_SUBPARTS (vectype);
3186 mask = XALLOCAVEC (unsigned char, nunits);
3187 unroll_factor = SLP_INSTANCE_UNROLLING_FACTOR (slp_node_instance);
3188
3189 /* The number of vector stmts to generate based only on the
3190 SLP_NODE_INSTANCE unrolling factor. */
3191 orig_vec_stmts_num
3192 = (STMT_VINFO_GROUP_SIZE (stmt_info)
3193 * SLP_INSTANCE_UNROLLING_FACTOR (slp_node_instance)
3194 + nunits - 1) / nunits;
3195 if (orig_vec_stmts_num == 1)
3196 only_one_vec = true;
3197
3198 /* The number of copies is determined by the final vectorization factor
3199 relative to the SLP_NODE_INSTANCE unrolling factor. */
3200 ncopies = vf / SLP_INSTANCE_UNROLLING_FACTOR (slp_node_instance);
3201
3202 /* Generate permutation masks for every NODE. Number of masks for each NODE
3203 is equal to GROUP_SIZE.
3204 E.g., we have a group of three nodes with three loads from the same
3205 location in each node, and the vector size is 4. I.e., we have an
3206 a0b0c0a1b1c1... sequence and we need to create the following vectors:
3207 for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
3208 for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
3209 ...
3210
3211 The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}.
3212 The last mask is illegal since we assume two operands for permute
3213 operation, and the mask element values can't be outside that range.
3214 Hence, the last mask must be converted into {2,5,5,5}.
3215 For the first two permutations we need the first and the second input
3216 vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
3217 we need the second and the third vectors: {b1,c1,a2,b2} and
3218 {c2,a3,b3,c3}. */
3219
3220 {
3221 index = 0;
3222 vect_stmts_counter = 0;
3223 vec_index = 0;
3224 first_vec_index = vec_index++;
3225 if (only_one_vec)
3226 second_vec_index = first_vec_index;
3227 else
3228 second_vec_index = vec_index++;
3229
3230 for (j = 0; j < unroll_factor; j++)
3231 {
3232 for (k = 0; k < group_size; k++)
3233 {
3234 i = SLP_TREE_LOAD_PERMUTATION (node)[k];
3235 first_mask_element = i + j * STMT_VINFO_GROUP_SIZE (stmt_info);
3236 if (!vect_get_mask_element (stmt, first_mask_element, 0,
3237 nunits, only_one_vec, index,
3238 mask, &current_mask_element,
3239 &need_next_vector,
3240 &number_of_mask_fixes, &mask_fixed,
3241 &needs_first_vector))
3242 return false;
3243 gcc_assert (current_mask_element >= 0
3244 && current_mask_element < 2 * nunits);
3245 mask[index++] = current_mask_element;
3246
3247 if (index == nunits)
3248 {
3249 index = 0;
3250 if (!can_vec_perm_p (mode, false, mask))
3251 {
3252 if (dump_enabled_p ())
3253 {
3254 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
3255 vect_location,
3256 "unsupported vect permute { ");
3257 for (i = 0; i < nunits; ++i)
3258 dump_printf (MSG_MISSED_OPTIMIZATION, "%d ",
3259 mask[i]);
3260 dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
3261 }
3262 return false;
3263 }
3264
3265 if (!analyze_only)
3266 {
3267 int l;
3268 tree mask_vec, *mask_elts;
3269 mask_elts = XALLOCAVEC (tree, nunits);
3270 for (l = 0; l < nunits; ++l)
3271 mask_elts[l] = build_int_cst (mask_element_type,
3272 mask[l]);
3273 mask_vec = build_vector (mask_type, mask_elts);
3274
3275 if (need_next_vector)
3276 {
3277 first_vec_index = second_vec_index;
3278 second_vec_index = vec_index;
3279 }
3280
3281 vect_create_mask_and_perm (stmt,
3282 mask_vec, first_vec_index, second_vec_index,
3283 gsi, node, vectype, dr_chain,
3284 ncopies, vect_stmts_counter++);
3285 }
3286 }
3287 }
3288 }
3289 }
3290
3291 return true;
3292 }
3293
3294
3295
3296 /* Vectorize the SLP instance tree in postorder. */
3297
3298 static bool
3299 vect_schedule_slp_instance (slp_tree node, slp_instance instance,
3300 unsigned int vectorization_factor)
3301 {
3302 gimple *stmt;
3303 bool grouped_store, is_store;
3304 gimple_stmt_iterator si;
3305 stmt_vec_info stmt_info;
3306 unsigned int vec_stmts_size, nunits, group_size;
3307 tree vectype;
3308 int i;
3309 slp_tree child;
3310
3311 if (!node)
3312 return false;
3313
3314 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3315 vect_schedule_slp_instance (child, instance, vectorization_factor);
3316
3317 stmt = SLP_TREE_SCALAR_STMTS (node)[0];
3318 stmt_info = vinfo_for_stmt (stmt);
3319
3320 /* VECTYPE is the type of the destination. */
3321 vectype = STMT_VINFO_VECTYPE (stmt_info);
3322 nunits = (unsigned int) TYPE_VECTOR_SUBPARTS (vectype);
3323 group_size = SLP_INSTANCE_GROUP_SIZE (instance);
3324
3325 /* For each SLP instance calculate number of vector stmts to be created
3326 for the scalar stmts in each node of the SLP tree. Number of vector
3327 elements in one vector iteration is the number of scalar elements in
3328 one scalar iteration (GROUP_SIZE) multiplied by VF divided by vector
3329 size.
3330 Unless this is a SLP reduction in which case the number of vector
3331 stmts is equal to the number of vector stmts of the children. */
3332 if (GROUP_FIRST_ELEMENT (stmt_info)
3333 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
3334 vec_stmts_size = SLP_TREE_NUMBER_OF_VEC_STMTS (SLP_TREE_CHILDREN (node)[0]);
3335 else
3336 vec_stmts_size = (vectorization_factor * group_size) / nunits;
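/* E.g. (hypothetical values): vectorization_factor == 4, group_size == 2
and nunits == 4 give vec_stmts_size == (4 * 2) / 4 == 2 vector stmts
per node. */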
3337
3338 if (!SLP_TREE_VEC_STMTS (node).exists ())
3339 {
3340 SLP_TREE_VEC_STMTS (node).create (vec_stmts_size);
3341 SLP_TREE_NUMBER_OF_VEC_STMTS (node) = vec_stmts_size;
3342 }
3343
3344 if (dump_enabled_p ())
3345 {
3346 dump_printf_loc (MSG_NOTE,vect_location,
3347 "------>vectorizing SLP node starting from: ");
3348 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
3349 dump_printf (MSG_NOTE, "\n");
3350 }
3351
3352 /* Vectorized stmts go before the last scalar stmt, which is where
3353 all uses are ready. */
3354 si = gsi_for_stmt (vect_find_last_scalar_stmt_in_slp (node));
3355
3356 /* Mark the first element of the reduction chain as reduction to properly
3357 transform the node. In the analysis phase only the last element of the
3358 chain is marked as reduction. */
3359 if (GROUP_FIRST_ELEMENT (stmt_info) && !STMT_VINFO_GROUPED_ACCESS (stmt_info)
3360 && GROUP_FIRST_ELEMENT (stmt_info) == stmt)
3361 {
3362 STMT_VINFO_DEF_TYPE (stmt_info) = vect_reduction_def;
3363 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
3364 }
3365
3366 /* Handle two-operation SLP nodes by vectorizing the group with
3367 both operations and then performing a merge. */
3368 if (SLP_TREE_TWO_OPERATORS (node))
3369 {
3370 enum tree_code code0 = gimple_assign_rhs_code (stmt);
3371 enum tree_code ocode;
3372 gimple *ostmt;
3373 unsigned char *mask = XALLOCAVEC (unsigned char, group_size);
3374 bool allsame = true;
3375 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, ostmt)
3376 if (gimple_assign_rhs_code (ostmt) != code0)
3377 {
3378 mask[i] = 1;
3379 allsame = false;
3380 ocode = gimple_assign_rhs_code (ostmt);
3381 }
3382 else
3383 mask[i] = 0;
3384 if (!allsame)
3385 {
3386 vec<gimple *> v0;
3387 vec<gimple *> v1;
3388 unsigned j;
3389 tree tmask = NULL_TREE;
3390 vect_transform_stmt (stmt, &si, &grouped_store, node, instance);
3391 v0 = SLP_TREE_VEC_STMTS (node).copy ();
3392 SLP_TREE_VEC_STMTS (node).truncate (0);
3393 gimple_assign_set_rhs_code (stmt, ocode);
3394 vect_transform_stmt (stmt, &si, &grouped_store, node, instance);
3395 gimple_assign_set_rhs_code (stmt, code0);
3396 v1 = SLP_TREE_VEC_STMTS (node).copy ();
3397 SLP_TREE_VEC_STMTS (node).truncate (0);
3398 tree meltype = build_nonstandard_integer_type
3399 (GET_MODE_BITSIZE (TYPE_MODE (TREE_TYPE (vectype))), 1);
3400 tree mvectype = get_same_sized_vectype (meltype, vectype);
3401 unsigned k = 0, l;
3402 for (j = 0; j < v0.length (); ++j)
3403 {
3404 tree *melts = XALLOCAVEC (tree, TYPE_VECTOR_SUBPARTS (vectype));
3405 for (l = 0; l < TYPE_VECTOR_SUBPARTS (vectype); ++l)
3406 {
3407 if (k >= group_size)
3408 k = 0;
3409 melts[l] = build_int_cst
3410 (meltype, mask[k++] * TYPE_VECTOR_SUBPARTS (vectype) + l);
3411 }
3412 tmask = build_vector (mvectype, melts);
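/* With a hypothetical group_size == 2, mask == {0, 1} and four vector
elements, melts becomes {0, 5, 2, 7}: even elements are selected from
the code0 variant v0 and odd elements from the ocode variant v1, giving
the alternating addsub-style merge mentioned below. */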
3413
3414 /* ??? Not all targets support a VEC_PERM_EXPR with a
3415 constant mask that would translate to a vec_merge RTX
3416 (with their vec_perm_const_ok). We can either not
3417 vectorize in that case or let veclower do its job.
3418 Unfortunately that isn't too great and at least for
3419 plus/minus we'd eventually like to match targets'
3420 vector addsub instructions. */
3421 gimple *vstmt;
3422 vstmt = gimple_build_assign (make_ssa_name (vectype),
3423 VEC_PERM_EXPR,
3424 gimple_assign_lhs (v0[j]),
3425 gimple_assign_lhs (v1[j]), tmask);
3426 vect_finish_stmt_generation (stmt, vstmt, &si);
3427 SLP_TREE_VEC_STMTS (node).quick_push (vstmt);
3428 }
3429 v0.release ();
3430 v1.release ();
3431 return false;
3432 }
3433 }
3434 is_store = vect_transform_stmt (stmt, &si, &grouped_store, node, instance);
3435 return is_store;
3436 }
3437
3438 /* Replace scalar calls from SLP node NODE with setting of their lhs to zero.
3439 For loop vectorization this is done in vectorizable_call, but for SLP
3440 it needs to be deferred until end of vect_schedule_slp, because multiple
3441 SLP instances may refer to the same scalar stmt. */
3442
3443 static void
3444 vect_remove_slp_scalar_calls (slp_tree node)
3445 {
3446 gimple *stmt, *new_stmt;
3447 gimple_stmt_iterator gsi;
3448 int i;
3449 slp_tree child;
3450 tree lhs;
3451 stmt_vec_info stmt_info;
3452
3453 if (!node)
3454 return;
3455
3456 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3457 vect_remove_slp_scalar_calls (child);
3458
3459 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
3460 {
3461 if (!is_gimple_call (stmt) || gimple_bb (stmt) == NULL)
3462 continue;
3463 stmt_info = vinfo_for_stmt (stmt);
3464 if (stmt_info == NULL
3465 || is_pattern_stmt_p (stmt_info)
3466 || !PURE_SLP_STMT (stmt_info))
3467 continue;
3468 lhs = gimple_call_lhs (stmt);
3469 new_stmt = gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs)));
3470 set_vinfo_for_stmt (new_stmt, stmt_info);
3471 set_vinfo_for_stmt (stmt, NULL);
3472 STMT_VINFO_STMT (stmt_info) = new_stmt;
3473 gsi = gsi_for_stmt (stmt);
3474 gsi_replace (&gsi, new_stmt, false);
3475 SSA_NAME_DEF_STMT (gimple_assign_lhs (new_stmt)) = new_stmt;
3476 }
3477 }
3478
3479 /* Generate vector code for all SLP instances in the loop/basic block. */
3480
3481 bool
3482 vect_schedule_slp (vec_info *vinfo)
3483 {
3484 vec<slp_instance> slp_instances;
3485 slp_instance instance;
3486 unsigned int i, vf;
3487 bool is_store = false;
3488
3489 slp_instances = vinfo->slp_instances;
3490 if (is_a <loop_vec_info> (vinfo))
3491 vf = as_a <loop_vec_info> (vinfo)->vectorization_factor;
3492 else
3493 vf = 1;
3494
3495 FOR_EACH_VEC_ELT (slp_instances, i, instance)
3496 {
3497 /* Schedule the tree of INSTANCE. */
3498 is_store = vect_schedule_slp_instance (SLP_INSTANCE_TREE (instance),
3499 instance, vf);
3500 if (dump_enabled_p ())
3501 dump_printf_loc (MSG_NOTE, vect_location,
3502 "vectorizing stmts using SLP.\n");
3503 }
3504
3505 FOR_EACH_VEC_ELT (slp_instances, i, instance)
3506 {
3507 slp_tree root = SLP_INSTANCE_TREE (instance);
3508 gimple *store;
3509 unsigned int j;
3510 gimple_stmt_iterator gsi;
3511
3512 /* Remove scalar call stmts. Do not do this for basic-block
3513 vectorization as not all uses may be vectorized.
3514 ??? Why should this be necessary? DCE should be able to
3515 remove the stmts itself.
3516 ??? For BB vectorization we can as well remove scalar
3517 stmts starting from the SLP tree root if they have no
3518 uses. */
3519 if (is_a <loop_vec_info> (vinfo))
3520 vect_remove_slp_scalar_calls (root);
3521
3522 for (j = 0; SLP_TREE_SCALAR_STMTS (root).iterate (j, &store)
3523 && j < SLP_INSTANCE_GROUP_SIZE (instance); j++)
3524 {
3525 if (!STMT_VINFO_DATA_REF (vinfo_for_stmt (store)))
3526 break;
3527
3528 if (is_pattern_stmt_p (vinfo_for_stmt (store)))
3529 store = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (store));
3530 /* Free the attached stmt_vec_info and remove the stmt. */
3531 gsi = gsi_for_stmt (store);
3532 unlink_stmt_vdef (store);
3533 gsi_remove (&gsi, true);
3534 release_defs (store);
3535 free_stmt_vec_info (store);
3536 }
3537 }
3538
3539 return is_store;
3540 }
3541
3542
3543 /* Vectorize the basic block. */
3544
3545 void
3546 vect_slp_transform_bb (basic_block bb)
3547 {
3548 bb_vec_info bb_vinfo = vec_info_for_bb (bb);
3549 gimple_stmt_iterator si;
3550
3551 gcc_assert (bb_vinfo);
3552
3553 if (dump_enabled_p ())
3554 dump_printf_loc (MSG_NOTE, vect_location, "SLPing BB\n");
3555
3556 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
3557 {
3558 gimple *stmt = gsi_stmt (si);
3559 stmt_vec_info stmt_info;
3560
3561 if (dump_enabled_p ())
3562 {
3563 dump_printf_loc (MSG_NOTE, vect_location,
3564 "------>SLPing statement: ");
3565 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
3566 dump_printf (MSG_NOTE, "\n");
3567 }
3568
3569 stmt_info = vinfo_for_stmt (stmt);
3570 gcc_assert (stmt_info);
3571
3572 /* Schedule all the SLP instances when the first SLP stmt is reached. */
3573 if (STMT_SLP_TYPE (stmt_info))
3574 {
3575 vect_schedule_slp (bb_vinfo);
3576 break;
3577 }
3578 }
3579
3580 if (dump_enabled_p ())
3581 dump_printf_loc (MSG_NOTE, vect_location,
3582 "BASIC BLOCK VECTORIZED\n");
3583
3584 destroy_bb_vec_info (bb_vinfo);
3585 }