1 /* Loop autoparallelization.
2 Copyright (C) 2006 Free Software Foundation, Inc.
3 Contributed by Sebastian Pop <pop@cri.ensmp.fr> and
4 Zdenek Dvorak <dvorakz@suse.cz>.
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 2, or (at your option) any later
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING. If not, write to the Free
20 Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA
25 #include "coretypes.h"
29 #include "tree-flow.h"
32 #include "tree-data-ref.h"
33 #include "diagnostic.h"
34 #include "tree-pass.h"
35 #include "tree-scalar-evolution.h"
37 #include "langhooks.h"
39 /* This pass tries to distribute iterations of loops into several threads.
40 The implementation is straightforward -- for each loop we test whether its
41 iterations are independent, and if it is the case (and some additional
42 conditions regarding profitability and correctness are satisfied), we
43 add OMP_PARALLEL and OMP_FOR codes and let omp expansion machinery do
46 The most of the complexity is in bringing the code into shape expected
48 -- for OMP_FOR, ensuring that the loop has only one induction variable
49 and that the exit test is at the start of the loop body
50 -- for OMP_PARALLEL, replacing the references to local addressable
51 variables by accesses through pointers, and breaking up ssa chains
52 by storing the values incoming to the parallelized loop to a structure
53 passed to the new function as an argument (something similar is done
54 in omp gimplification, unfortunately only a small part of the code
58 -- if there are several parallelizable loops in a function, it may be
59 possible to generate the threads just once (using synchronization to
60 ensure that cross-loop dependences are obeyed).
61 -- handling of common scalar dependence patterns (accumulation, ...)
62 -- handling of non-innermost loops */
64 /* Minimal number of iterations of a loop that should be executed in each
66 #define MIN_PER_THREAD 100
68 /* Element of hashtable of names to copy. */
70 struct name_to_copy_elt
72 unsigned version
; /* The version of the name to copy. */
73 tree new_name
; /* The new name used in the copy. */
74 tree field
; /* The field of the structure used to pass the
78 /* Equality and hash functions for hashtab code. */
81 name_to_copy_elt_eq (const void *aa
, const void *bb
)
83 struct name_to_copy_elt
*a
= (struct name_to_copy_elt
*) aa
;
84 struct name_to_copy_elt
*b
= (struct name_to_copy_elt
*) bb
;
86 return a
->version
== b
->version
;
90 name_to_copy_elt_hash (const void *aa
)
92 struct name_to_copy_elt
*a
= (struct name_to_copy_elt
*) aa
;
94 return (hashval_t
) a
->version
;
97 /* Returns true if the iterations of LOOP are independent on each other (that
98 is, if we can execute them in parallel), and if LOOP satisfies other
99 conditions that we need to be able to parallelize it. Description of number
100 of iterations is stored to NITER. */
103 loop_parallel_p (struct loop
*loop
, struct tree_niter_desc
*niter
)
105 edge exit
= single_dom_exit (loop
);
106 VEC (ddr_p
, heap
) *dependence_relations
;
107 VEC (data_reference_p
, heap
) *datarefs
;
108 lambda_trans_matrix trans
;
112 /* Only consider innermost loops with just one exit. The innermost-loop
113 restriction is not necessary, but it makes things simpler. */
114 if (loop
->inner
|| !exit
)
117 if (dump_file
&& (dump_flags
& TDF_DETAILS
))
118 fprintf (dump_file
, "\nConsidering loop %d\n", loop
->num
);
120 /* We need to know # of iterations, and there should be no uses of values
121 defined inside loop outside of it, unless the values are invariants of
123 if (!number_of_iterations_exit (loop
, exit
, niter
, false))
125 if (dump_file
&& (dump_flags
& TDF_DETAILS
))
126 fprintf (dump_file
, " FAILED: number of iterations not known\n");
130 for (phi
= phi_nodes (exit
->dest
); phi
; phi
= PHI_CHAIN (phi
))
132 tree val
= PHI_ARG_DEF_FROM_EDGE (phi
, exit
);
134 if (is_gimple_reg (val
))
136 if (dump_file
&& (dump_flags
& TDF_DETAILS
))
137 fprintf (dump_file
, " FAILED: value used outside loop\n");
142 /* The iterations of the loop may communicate only through bivs whose
143 iteration space can be distributed efficiently. */
144 for (phi
= phi_nodes (loop
->header
); phi
; phi
= PHI_CHAIN (phi
))
146 tree def
= PHI_RESULT (phi
);
149 if (is_gimple_reg (def
)
150 && !simple_iv (loop
, phi
, def
, &iv
, true))
152 if (dump_file
&& (dump_flags
& TDF_DETAILS
))
154 " FAILED: scalar dependency between iterations\n");
159 /* We need to version the loop to verify assumptions in runtime. */
160 if (!can_duplicate_loop_p (loop
))
162 if (dump_file
&& (dump_flags
& TDF_DETAILS
))
163 fprintf (dump_file
, " FAILED: cannot be duplicated\n");
167 /* Check for problems with dependences. If the loop can be reversed,
168 the iterations are independent. */
169 datarefs
= VEC_alloc (data_reference_p
, heap
, 10);
170 dependence_relations
= VEC_alloc (ddr_p
, heap
, 10 * 10);
171 compute_data_dependences_for_loop (loop
, true, &datarefs
,
172 &dependence_relations
);
173 if (dump_file
&& (dump_flags
& TDF_DETAILS
))
174 dump_data_dependence_relations (dump_file
, dependence_relations
);
176 trans
= lambda_trans_matrix_new (1, 1);
177 LTM_MATRIX (trans
)[0][0] = -1;
179 if (lambda_transform_legal_p (trans
, 1, dependence_relations
))
182 if (dump_file
&& (dump_flags
& TDF_DETAILS
))
183 fprintf (dump_file
, " SUCCESS: may be parallelized\n");
185 else if (dump_file
&& (dump_flags
& TDF_DETAILS
))
186 fprintf (dump_file
, " FAILED: data dependencies exist across iterations\n");
188 free_dependence_relations (dependence_relations
);
189 free_data_refs (datarefs
);
194 /* Assigns the address of VAR in TYPE to an ssa name, and returns this name.
195 The assignment statement is placed before LOOP. DECL_ADDRESS maps decls
196 to their addresses that can be reused. */
199 take_address_of (tree var
, tree type
, struct loop
*loop
, htab_t decl_address
)
201 int uid
= DECL_UID (var
);
203 struct int_tree_map ielt
, *nielt
;
204 tree name
, bvar
, stmt
;
205 edge entry
= loop_preheader_edge (loop
);
208 dslot
= htab_find_slot_with_hash (decl_address
, &ielt
, uid
, INSERT
);
211 bvar
= create_tmp_var (type
, get_name (var
));
212 add_referenced_var (bvar
);
213 stmt
= build_gimple_modify_stmt (bvar
,
215 build_addr (var
, current_function_decl
)));
216 name
= make_ssa_name (bvar
, stmt
);
217 GIMPLE_STMT_OPERAND (stmt
, 0) = name
;
218 bsi_insert_on_edge_immediate (entry
, stmt
);
220 nielt
= XNEW (struct int_tree_map
);
228 name
= ((struct int_tree_map
*) *dslot
)->to
;
229 if (TREE_TYPE (name
) == type
)
232 bvar
= SSA_NAME_VAR (name
);
233 stmt
= build_gimple_modify_stmt (bvar
,
234 fold_convert (type
, name
));
235 name
= make_ssa_name (bvar
, stmt
);
236 GIMPLE_STMT_OPERAND (stmt
, 0) = name
;
237 bsi_insert_on_edge_immediate (entry
, stmt
);
242 /* Eliminates references to local variables in *TP out of LOOP. DECL_ADDRESS
243 contains addresses of the references that had their address taken already.
244 If the expression is changed, CHANGED is set to true. Callback for
255 eliminate_local_variables_1 (tree
*tp
, int *walk_subtrees
, void *data
)
257 struct elv_data
*dta
= data
;
258 tree t
= *tp
, var
, addr
, addr_type
, type
;
264 if (!SSA_VAR_P (t
) || DECL_EXTERNAL (t
))
267 type
= TREE_TYPE (t
);
268 addr_type
= build_pointer_type (type
);
269 addr
= take_address_of (t
, addr_type
, dta
->loop
, dta
->decl_address
);
270 *tp
= build1 (INDIRECT_REF
, TREE_TYPE (*tp
), addr
);
276 if (TREE_CODE (t
) == ADDR_EXPR
)
278 var
= TREE_OPERAND (t
, 0);
283 if (!SSA_VAR_P (var
) || DECL_EXTERNAL (var
))
286 addr_type
= TREE_TYPE (t
);
287 addr
= take_address_of (var
, addr_type
, dta
->loop
, dta
->decl_address
);
295 && !GIMPLE_STMT_P (t
))
301 /* Moves the references to local variables in STMT from LOOP. DECL_ADDRESS
302 contains addresses for the references for that we have already taken
306 eliminate_local_variables_stmt (struct loop
*loop
, tree stmt
,
312 dta
.decl_address
= decl_address
;
315 walk_tree (&stmt
, eliminate_local_variables_1
, &dta
, NULL
);
321 /* Eliminates the references to local variables from LOOP. This includes:
323 1) Taking address of a local variable -- these are moved out of the loop
324 (and temporary variable is created to hold the address if necessary).
325 2) Dereferencing a local variable -- these are replaced with indirect
329 eliminate_local_variables (struct loop
*loop
)
331 basic_block bb
, *body
= get_loop_body (loop
);
333 block_stmt_iterator bsi
;
334 htab_t decl_address
= htab_create (10, int_tree_map_hash
, int_tree_map_eq
,
337 /* Find and rename the ssa names defined outside of loop. */
338 for (i
= 0; i
< loop
->num_nodes
; i
++)
342 for (bsi
= bsi_start (bb
); !bsi_end_p (bsi
); bsi_next (&bsi
))
343 eliminate_local_variables_stmt (loop
, bsi_stmt (bsi
), decl_address
);
346 htab_delete (decl_address
);
349 /* If COPY_NAME_P is true, creates and returns a duplicate of NAME.
350 The copies are stored to NAME_COPIES, if NAME was already duplicated,
351 its duplicate stored in NAME_COPIES is returned.
353 Regardless of COPY_NAME_P, the decl used as a base of the ssa name is also
354 duplicated, storing the copies in DECL_COPIES. */
357 separate_decls_in_loop_name (tree name
,
358 htab_t name_copies
, htab_t decl_copies
,
361 tree copy
, var
, var_copy
;
362 unsigned idx
, uid
, nuid
;
363 struct int_tree_map ielt
, *nielt
;
364 struct name_to_copy_elt elt
, *nelt
;
365 void **slot
, **dslot
;
367 if (TREE_CODE (name
) != SSA_NAME
)
370 idx
= SSA_NAME_VERSION (name
);
372 slot
= htab_find_slot_with_hash (name_copies
, &elt
, idx
,
373 copy_name_p
? INSERT
: NO_INSERT
);
375 return ((struct name_to_copy_elt
*) *slot
)->new_name
;
377 var
= SSA_NAME_VAR (name
);
378 uid
= DECL_UID (var
);
380 dslot
= htab_find_slot_with_hash (decl_copies
, &ielt
, uid
, INSERT
);
383 var_copy
= create_tmp_var (TREE_TYPE (var
), get_name (var
));
384 add_referenced_var (var_copy
);
385 nielt
= XNEW (struct int_tree_map
);
387 nielt
->to
= var_copy
;
390 /* Ensure that when we meet this decl next time, we won't duplicate
392 nuid
= DECL_UID (var_copy
);
394 dslot
= htab_find_slot_with_hash (decl_copies
, &ielt
, nuid
, INSERT
);
395 gcc_assert (!*dslot
);
396 nielt
= XNEW (struct int_tree_map
);
398 nielt
->to
= var_copy
;
402 var_copy
= ((struct int_tree_map
*) *dslot
)->to
;
406 copy
= duplicate_ssa_name (name
, NULL_TREE
);
407 nelt
= XNEW (struct name_to_copy_elt
);
409 nelt
->new_name
= copy
;
410 nelt
->field
= NULL_TREE
;
419 SSA_NAME_VAR (copy
) = var_copy
;
423 /* Finds the ssa names used in STMT that are defined outside of LOOP and
424 replaces such ssa names with their duplicates. The duplicates are stored to
425 NAME_COPIES. Base decls of all ssa names used in STMT
426 (including those defined in LOOP) are replaced with the new temporary
427 variables; the replacement decls are stored in DECL_COPIES. */
430 separate_decls_in_loop_stmt (struct loop
*loop
, tree stmt
,
431 htab_t name_copies
, htab_t decl_copies
)
439 mark_virtual_ops_for_renaming (stmt
);
441 FOR_EACH_PHI_OR_STMT_DEF (def
, stmt
, oi
, SSA_OP_DEF
)
443 name
= DEF_FROM_PTR (def
);
444 gcc_assert (TREE_CODE (name
) == SSA_NAME
);
445 copy
= separate_decls_in_loop_name (name
, name_copies
, decl_copies
,
447 gcc_assert (copy
== name
);
450 FOR_EACH_PHI_OR_STMT_USE (use
, stmt
, oi
, SSA_OP_USE
)
452 name
= USE_FROM_PTR (use
);
453 if (TREE_CODE (name
) != SSA_NAME
)
456 copy_name_p
= expr_invariant_in_loop_p (loop
, name
);
457 copy
= separate_decls_in_loop_name (name
, name_copies
, decl_copies
,
463 /* Callback for htab_traverse. Adds a field corresponding to a ssa name
464 described in SLOT to the type passed in DATA. */
467 add_field_for_name (void **slot
, void *data
)
469 struct name_to_copy_elt
*elt
= *slot
;
471 tree name
= ssa_name (elt
->version
);
472 tree var
= SSA_NAME_VAR (name
);
473 tree field
= build_decl (FIELD_DECL
, DECL_NAME (var
), TREE_TYPE (var
));
475 insert_field_into_struct (type
, field
);
480 /* Callback for htab_traverse. Creates loads to a field of LOAD in LOAD_BB and
481 store to a field of STORE in STORE_BB for the ssa name and its duplicate
482 specified in SLOT. */
489 basic_block store_bb
;
494 create_loads_and_stores_for_name (void **slot
, void *data
)
496 struct name_to_copy_elt
*elt
= *slot
;
497 struct clsn_data
*clsn_data
= data
;
499 block_stmt_iterator bsi
;
500 tree type
= TREE_TYPE (elt
->new_name
);
501 tree struct_type
= TREE_TYPE (TREE_TYPE (clsn_data
->load
));
504 bsi
= bsi_last (clsn_data
->store_bb
);
505 stmt
= build_gimple_modify_stmt (
506 build3 (COMPONENT_REF
, type
, clsn_data
->store
, elt
->field
,
508 ssa_name (elt
->version
));
509 mark_virtual_ops_for_renaming (stmt
);
510 bsi_insert_after (&bsi
, stmt
, BSI_NEW_STMT
);
512 bsi
= bsi_last (clsn_data
->load_bb
);
513 load_struct
= fold_build1 (INDIRECT_REF
, struct_type
, clsn_data
->load
);
514 stmt
= build_gimple_modify_stmt (
516 build3 (COMPONENT_REF
, type
, load_struct
, elt
->field
,
518 SSA_NAME_DEF_STMT (elt
->new_name
) = stmt
;
519 bsi_insert_after (&bsi
, stmt
, BSI_NEW_STMT
);
524 /* Moves all the variables used in LOOP and defined outside of it (including
525 the initial values of loop phi nodes, and *PER_THREAD if it is a ssa
526 name) to a structure created for this purpose. The code
534 is transformed this way:
549 `old' is stored to *ARG_STRUCT and `new' is stored to NEW_ARG_STRUCT. The
550 pointer `new' is intentionally not initialized (the loop will be split to a
551 separate function later, and `new' will be initialized from its arguments).
555 separate_decls_in_loop (struct loop
*loop
, tree
*arg_struct
,
556 tree
*new_arg_struct
)
558 basic_block bb1
= split_edge (loop_preheader_edge (loop
));
559 basic_block bb0
= single_pred (bb1
);
560 htab_t name_copies
= htab_create (10, name_to_copy_elt_hash
,
561 name_to_copy_elt_eq
, free
);
562 htab_t decl_copies
= htab_create (10, int_tree_map_hash
, int_tree_map_eq
,
564 basic_block bb
, *body
= get_loop_body (loop
);
566 tree phi
, type
, type_name
, nvar
;
567 block_stmt_iterator bsi
;
568 struct clsn_data clsn_data
;
570 /* Find and rename the ssa names defined outside of loop. */
571 for (i
= 0; i
< loop
->num_nodes
; i
++)
575 for (phi
= phi_nodes (bb
); phi
; phi
= PHI_CHAIN (phi
))
576 separate_decls_in_loop_stmt (loop
, phi
, name_copies
, decl_copies
);
578 for (bsi
= bsi_start (bb
); !bsi_end_p (bsi
); bsi_next (&bsi
))
579 separate_decls_in_loop_stmt (loop
, bsi_stmt (bsi
), name_copies
,
584 if (htab_elements (name_copies
) == 0)
586 /* It may happen that there is nothing to copy (if there are only
587 loop carried and external variables in the loop). */
589 *new_arg_struct
= NULL
;
593 /* Create the type for the structure to store the ssa names to. */
594 type
= lang_hooks
.types
.make_type (RECORD_TYPE
);
595 type_name
= build_decl (TYPE_DECL
, create_tmp_var_name (".paral_data"),
597 TYPE_NAME (type
) = type_name
;
599 htab_traverse (name_copies
, add_field_for_name
, type
);
602 /* Create the loads and stores. */
603 *arg_struct
= create_tmp_var (type
, ".paral_data_store");
604 add_referenced_var (*arg_struct
);
605 nvar
= create_tmp_var (build_pointer_type (type
), ".paral_data_load");
606 add_referenced_var (nvar
);
607 *new_arg_struct
= make_ssa_name (nvar
, NULL_TREE
);
609 clsn_data
.store
= *arg_struct
;
610 clsn_data
.load
= *new_arg_struct
;
611 clsn_data
.store_bb
= bb0
;
612 clsn_data
.load_bb
= bb1
;
613 htab_traverse (name_copies
, create_loads_and_stores_for_name
,
617 htab_delete (decl_copies
);
618 htab_delete (name_copies
);
621 /* Bitmap containing uids of functions created by parallelization. We cannot
622 allocate it from the default obstack, as it must live across compilation
623 of several functions; we make it gc allocated instead. */
625 static GTY(()) bitmap parallelized_functions
;
627 /* Returns true if FN was created by create_loop_fn. */
630 parallelized_function_p (tree fn
)
632 if (!parallelized_functions
|| !DECL_ARTIFICIAL (fn
))
635 return bitmap_bit_p (parallelized_functions
, DECL_UID (fn
));
638 /* Creates and returns an empty function that will receive the body of
639 a parallelized loop. */
642 create_loop_fn (void)
646 tree decl
, type
, name
, t
;
647 struct function
*act_cfun
= cfun
;
648 static unsigned loopfn_num
;
650 snprintf (buf
, 100, "%s.$loopfn", current_function_name ());
651 ASM_FORMAT_PRIVATE_NAME (tname
, buf
, loopfn_num
++);
652 clean_symbol_name (tname
);
653 name
= get_identifier (tname
);
654 type
= build_function_type_list (void_type_node
, ptr_type_node
, NULL_TREE
);
656 decl
= build_decl (FUNCTION_DECL
, name
, type
);
657 if (!parallelized_functions
)
658 parallelized_functions
= BITMAP_GGC_ALLOC ();
659 bitmap_set_bit (parallelized_functions
, DECL_UID (decl
));
661 TREE_STATIC (decl
) = 1;
662 TREE_USED (decl
) = 1;
663 DECL_ARTIFICIAL (decl
) = 1;
664 DECL_IGNORED_P (decl
) = 0;
665 TREE_PUBLIC (decl
) = 0;
666 DECL_UNINLINABLE (decl
) = 1;
667 DECL_EXTERNAL (decl
) = 0;
668 DECL_CONTEXT (decl
) = NULL_TREE
;
669 DECL_INITIAL (decl
) = make_node (BLOCK
);
671 t
= build_decl (RESULT_DECL
, NULL_TREE
, void_type_node
);
672 DECL_ARTIFICIAL (t
) = 1;
673 DECL_IGNORED_P (t
) = 1;
674 DECL_RESULT (decl
) = t
;
676 t
= build_decl (PARM_DECL
, get_identifier (".paral_data_param"),
678 DECL_ARTIFICIAL (t
) = 1;
679 DECL_ARG_TYPE (t
) = ptr_type_node
;
680 DECL_CONTEXT (t
) = decl
;
682 DECL_ARGUMENTS (decl
) = t
;
684 allocate_struct_function (decl
);
686 /* The call to allocate_struct_function clobbers CFUN, so we need to restore
693 /* Bases all the induction variables in LOOP on a single induction variable
694 (unsigned with base 0 and step 1), whose final value is compared with
695 NIT. The induction variable is incremented in the loop latch. */
698 canonicalize_loop_ivs (struct loop
*loop
, tree nit
)
700 unsigned precision
= TYPE_PRECISION (TREE_TYPE (nit
));
701 tree phi
, prev
, res
, type
, var_before
, val
, atype
, t
, next
;
702 block_stmt_iterator bsi
;
705 edge exit
= single_dom_exit (loop
);
707 for (phi
= phi_nodes (loop
->header
); phi
; phi
= PHI_CHAIN (phi
))
709 res
= PHI_RESULT (phi
);
711 if (is_gimple_reg (res
)
712 && TYPE_PRECISION (TREE_TYPE (res
)) > precision
)
713 precision
= TYPE_PRECISION (TREE_TYPE (res
));
716 type
= lang_hooks
.types
.type_for_size (precision
, 1);
718 bsi
= bsi_last (loop
->latch
);
719 create_iv (build_int_cst_type (type
, 0), build_int_cst (type
, 1), NULL_TREE
,
720 loop
, &bsi
, true, &var_before
, NULL
);
722 bsi
= bsi_after_labels (loop
->header
);
724 for (phi
= phi_nodes (loop
->header
); phi
; phi
= next
)
726 next
= PHI_CHAIN (phi
);
727 res
= PHI_RESULT (phi
);
729 if (!is_gimple_reg (res
)
730 || res
== var_before
)
736 ok
= simple_iv (loop
, phi
, res
, &iv
, true);
739 remove_phi_node (phi
, prev
, false);
741 atype
= TREE_TYPE (res
);
742 val
= fold_build2 (PLUS_EXPR
, atype
,
743 unshare_expr (iv
.base
),
744 fold_build2 (MULT_EXPR
, atype
,
745 unshare_expr (iv
.step
),
746 fold_convert (atype
, var_before
)));
747 val
= force_gimple_operand_bsi (&bsi
, val
, false, NULL_TREE
, true,
749 t
= build_gimple_modify_stmt (res
, val
);
750 bsi_insert_before (&bsi
, t
, BSI_SAME_STMT
);
751 SSA_NAME_DEF_STMT (res
) = t
;
754 t
= last_stmt (exit
->src
);
755 /* Make the loop exit if the control condition is not satisfied. */
756 if (exit
->flags
& EDGE_TRUE_VALUE
)
760 extract_true_false_edges_from_block (exit
->src
, &te
, &fe
);
761 te
->flags
= EDGE_FALSE_VALUE
;
762 fe
->flags
= EDGE_TRUE_VALUE
;
764 COND_EXPR_COND (t
) = build2 (LT_EXPR
, boolean_type_node
, var_before
, nit
);
767 /* Moves the exit condition of LOOP to the beginning of its header, and
768 duplicates the part of the last iteration that gets disabled to the
769 exit of the loop. NIT is the number of iterations of the loop
770 (used to initialize the variables in the duplicated part).
772 TODO: the common case is that latch of the loop is empty and immediatelly
773 follows the loop exit. In this case, it would be better not to copy the
774 body of the loop, but only move the entry of the loop directly before the
775 exit check and increase the number of iterations of the loop by one.
776 This may need some additional preconditioning in case NIT = ~0. */
779 transform_to_exit_first_loop (struct loop
*loop
, tree nit
)
781 basic_block
*bbs
, *nbbs
, ex_bb
, orig_header
;
784 edge exit
= single_dom_exit (loop
), hpred
;
785 tree phi
, nphi
, cond
, control
, control_name
, res
, t
, cond_stmt
;
786 block_stmt_iterator bsi
;
788 split_block_after_labels (loop
->header
);
789 orig_header
= single_succ (loop
->header
);
790 hpred
= single_succ_edge (loop
->header
);
792 cond_stmt
= last_stmt (exit
->src
);
793 cond
= COND_EXPR_COND (cond_stmt
);
794 control
= TREE_OPERAND (cond
, 0);
795 gcc_assert (TREE_OPERAND (cond
, 1) == nit
);
797 /* Make sure that we have phi nodes on exit for all loop header phis
798 (create_parallel_loop requires that). */
799 for (phi
= phi_nodes (loop
->header
); phi
; phi
= PHI_CHAIN (phi
))
801 res
= PHI_RESULT (phi
);
802 t
= make_ssa_name (SSA_NAME_VAR (res
), phi
);
803 SET_PHI_RESULT (phi
, t
);
805 nphi
= create_phi_node (res
, orig_header
);
806 SSA_NAME_DEF_STMT (res
) = nphi
;
807 add_phi_arg (nphi
, t
, hpred
);
811 TREE_OPERAND (cond
, 0) = t
;
812 update_stmt (cond_stmt
);
817 bbs
= get_loop_body_in_dom_order (loop
);
818 for (n
= 0; bbs
[n
] != exit
->src
; n
++)
820 nbbs
= XNEWVEC (basic_block
, n
);
821 ok
= tree_duplicate_sese_tail (single_succ_edge (loop
->header
), exit
,
828 /* The only gimple reg that should be copied out of the loop is the
830 control_name
= NULL_TREE
;
831 for (phi
= phi_nodes (ex_bb
); phi
; phi
= PHI_CHAIN (phi
))
833 res
= PHI_RESULT (phi
);
834 if (!is_gimple_reg (res
))
837 gcc_assert (control_name
== NULL_TREE
838 && SSA_NAME_VAR (res
) == SSA_NAME_VAR (control
));
841 gcc_assert (control_name
!= NULL_TREE
);
842 phi
= SSA_NAME_DEF_STMT (control_name
);
843 remove_phi_node (phi
, NULL_TREE
, false);
845 /* Initialize the control variable to NIT. */
846 bsi
= bsi_after_labels (ex_bb
);
847 t
= build_gimple_modify_stmt (control_name
, nit
);
848 bsi_insert_before (&bsi
, t
, BSI_NEW_STMT
);
849 SSA_NAME_DEF_STMT (control_name
) = t
;
852 /* Create the parallel constructs for LOOP as described in gen_parallel_loop.
853 LOOP_FN and DATA are the arguments of OMP_PARALLEL.
854 NEW_DATA is the variable that should be initialized from the argument
855 of LOOP_FN. N_THREADS is the requested number of threads. Returns the
856 basic block containing OMP_PARALLEL tree. */
859 create_parallel_loop (struct loop
*loop
, tree loop_fn
, tree data
,
860 tree new_data
, unsigned n_threads
)
862 block_stmt_iterator bsi
;
863 basic_block bb
, paral_bb
, for_bb
, ex_bb
;
864 tree t
, param
, res
, for_stmt
;
865 tree cvar
, cvar_init
, initvar
, cvar_next
, cvar_base
, cond
, phi
, type
;
866 edge exit
, nexit
, guard
, end
, e
;
868 /* Prepare the OMP_PARALLEL statement. */
869 bb
= loop_preheader_edge (loop
)->src
;
870 paral_bb
= single_pred (bb
);
871 bsi
= bsi_last (paral_bb
);
873 t
= build_omp_clause (OMP_CLAUSE_NUM_THREADS
);
874 OMP_CLAUSE_NUM_THREADS_EXPR (t
)
875 = build_int_cst (integer_type_node
, n_threads
);
876 t
= build4 (OMP_PARALLEL
, void_type_node
, NULL_TREE
, t
,
879 bsi_insert_after (&bsi
, t
, BSI_NEW_STMT
);
881 /* Initialize NEW_DATA. */
884 bsi
= bsi_after_labels (bb
);
886 param
= make_ssa_name (DECL_ARGUMENTS (loop_fn
), NULL_TREE
);
887 t
= build_gimple_modify_stmt (param
, build_fold_addr_expr (data
));
888 bsi_insert_before (&bsi
, t
, BSI_SAME_STMT
);
889 SSA_NAME_DEF_STMT (param
) = t
;
891 t
= build_gimple_modify_stmt (new_data
,
892 fold_convert (TREE_TYPE (new_data
), param
));
893 bsi_insert_before (&bsi
, t
, BSI_SAME_STMT
);
894 SSA_NAME_DEF_STMT (new_data
) = t
;
897 /* Emit OMP_RETURN for OMP_PARALLEL. */
898 bb
= split_loop_exit_edge (single_dom_exit (loop
));
900 bsi_insert_after (&bsi
, make_node (OMP_RETURN
), BSI_NEW_STMT
);
902 /* Extract data for OMP_FOR. */
903 gcc_assert (loop
->header
== single_dom_exit (loop
)->src
);
904 cond
= COND_EXPR_COND (last_stmt (loop
->header
));
906 cvar
= TREE_OPERAND (cond
, 0);
907 cvar_base
= SSA_NAME_VAR (cvar
);
908 phi
= SSA_NAME_DEF_STMT (cvar
);
909 cvar_init
= PHI_ARG_DEF_FROM_EDGE (phi
, loop_preheader_edge (loop
));
910 initvar
= make_ssa_name (cvar_base
, NULL_TREE
);
911 SET_USE (PHI_ARG_DEF_PTR_FROM_EDGE (phi
, loop_preheader_edge (loop
)),
913 cvar_next
= PHI_ARG_DEF_FROM_EDGE (phi
, loop_latch_edge (loop
));
915 bsi
= bsi_last (loop
->latch
);
916 gcc_assert (bsi_stmt (bsi
) == SSA_NAME_DEF_STMT (cvar_next
));
917 bsi_remove (&bsi
, true);
920 for_bb
= split_edge (loop_preheader_edge (loop
));
921 ex_bb
= split_loop_exit_edge (single_dom_exit (loop
));
922 extract_true_false_edges_from_block (loop
->header
, &nexit
, &exit
);
923 gcc_assert (exit
== single_dom_exit (loop
));
925 guard
= make_edge (for_bb
, ex_bb
, 0);
926 single_succ_edge (loop
->latch
)->flags
= 0;
927 end
= make_edge (loop
->latch
, ex_bb
, EDGE_FALLTHRU
);
928 for (phi
= phi_nodes (ex_bb
); phi
; phi
= PHI_CHAIN (phi
))
930 res
= PHI_RESULT (phi
);
931 gcc_assert (!is_gimple_reg (phi
));
932 t
= SSA_NAME_DEF_STMT (PHI_ARG_DEF_FROM_EDGE (phi
, exit
));
933 add_phi_arg (phi
, PHI_ARG_DEF_FROM_EDGE (t
, loop_preheader_edge (loop
)),
935 add_phi_arg (phi
, PHI_ARG_DEF_FROM_EDGE (t
, loop_latch_edge (loop
)),
938 e
= redirect_edge_and_branch (exit
, nexit
->dest
);
939 PENDING_STMT (e
) = NULL
;
942 TREE_OPERAND (cond
, 0) = cvar_base
;
943 type
= TREE_TYPE (cvar
);
944 t
= build_omp_clause (OMP_CLAUSE_SCHEDULE
);
945 OMP_CLAUSE_SCHEDULE_KIND (t
) = OMP_CLAUSE_SCHEDULE_STATIC
;
947 for_stmt
= make_node (OMP_FOR
);
948 TREE_TYPE (for_stmt
) = void_type_node
;
949 OMP_FOR_CLAUSES (for_stmt
) = t
;
950 OMP_FOR_INIT (for_stmt
) = build_gimple_modify_stmt (initvar
, cvar_init
);
951 OMP_FOR_COND (for_stmt
) = cond
;
952 OMP_FOR_INCR (for_stmt
) = build_gimple_modify_stmt (
954 build2 (PLUS_EXPR
, type
,
956 build_int_cst (type
, 1)));
957 OMP_FOR_BODY (for_stmt
) = NULL_TREE
;
958 OMP_FOR_PRE_BODY (for_stmt
) = NULL_TREE
;
960 bsi
= bsi_last (for_bb
);
961 bsi_insert_after (&bsi
, for_stmt
, BSI_NEW_STMT
);
962 SSA_NAME_DEF_STMT (initvar
) = for_stmt
;
964 /* Emit OMP_CONTINUE. */
965 bsi
= bsi_last (loop
->latch
);
966 t
= build2 (OMP_CONTINUE
, void_type_node
, cvar_next
, cvar
);
967 bsi_insert_after (&bsi
, t
, BSI_NEW_STMT
);
968 SSA_NAME_DEF_STMT (cvar_next
) = t
;
970 /* Emit OMP_RETURN for OMP_FOR. */
971 bsi
= bsi_last (ex_bb
);
972 bsi_insert_after (&bsi
, make_node (OMP_RETURN
), BSI_NEW_STMT
);
977 /* Generates code to execute the iterations of LOOP in N_THREADS threads in
978 parallel. NITER describes number of iterations of LOOP. */
981 gen_parallel_loop (struct loop
*loop
, unsigned n_threads
,
982 struct tree_niter_desc
*niter
)
985 tree many_iterations_cond
, type
, nit
;
986 tree stmts
, arg_struct
, new_arg_struct
;
987 basic_block parallel_head
;
992 ---------------------------------------------------------------------
995 IV = phi (INIT, IV + STEP)
1001 ---------------------------------------------------------------------
1003 with # of iterations NITER (possibly with MAY_BE_ZERO assumption),
1004 we generate the following code:
1006 ---------------------------------------------------------------------
1009 || NITER < MIN_PER_THREAD * N_THREADS)
1013 store all local loop-invariant variables used in body of the loop to DATA.
1014 OMP_PARALLEL (OMP_CLAUSE_NUM_THREADS (N_THREADS), LOOPFN, DATA);
1015 load the variables from DATA.
1016 OMP_FOR (IV = INIT; COND; IV += STEP) (OMP_CLAUSE_SCHEDULE (static))
1020 OMP_RETURN -- OMP_FOR
1021 OMP_RETURN -- OMP_PARALLEL
1027 IV = phi (INIT, IV + STEP)
1038 /* Create two versions of the loop -- in the old one, we know that the
1039 number of iterations is large enough, and we will transform it into the
1040 loop that will be split to loop_fn, the new one will be used for the
1041 remaining iterations. */
1043 type
= TREE_TYPE (niter
->niter
);
1044 nit
= force_gimple_operand (unshare_expr (niter
->niter
), &stmts
, true,
1047 bsi_insert_on_edge_immediate (loop_preheader_edge (loop
), stmts
);
1049 many_iterations_cond
=
1050 fold_build2 (GE_EXPR
, boolean_type_node
,
1051 nit
, build_int_cst (type
, MIN_PER_THREAD
* n_threads
));
1052 many_iterations_cond
1053 = fold_build2 (TRUTH_AND_EXPR
, boolean_type_node
,
1054 invert_truthvalue (unshare_expr (niter
->may_be_zero
)),
1055 many_iterations_cond
);
1056 many_iterations_cond
1057 = force_gimple_operand (many_iterations_cond
, &stmts
,
1060 bsi_insert_on_edge_immediate (loop_preheader_edge (loop
), stmts
);
1061 if (!is_gimple_condexpr (many_iterations_cond
))
1063 many_iterations_cond
1064 = force_gimple_operand (many_iterations_cond
, &stmts
,
1067 bsi_insert_on_edge_immediate (loop_preheader_edge (loop
), stmts
);
1070 initialize_original_copy_tables ();
1072 /* We assume that the loop usually iterates a lot. */
1073 prob
= 4 * REG_BR_PROB_BASE
/ 5;
1074 nloop
= loop_version (loop
, many_iterations_cond
, NULL
,
1075 prob
, prob
, REG_BR_PROB_BASE
- prob
, true);
1076 update_ssa (TODO_update_ssa
);
1077 free_original_copy_tables ();
1079 /* Base all the induction variables in LOOP on a single control one. */
1080 canonicalize_loop_ivs (loop
, nit
);
1082 /* Ensure that the exit condition is the first statement in the loop. */
1083 transform_to_exit_first_loop (loop
, nit
);
1085 /* Eliminate the references to local variables from the loop. */
1086 eliminate_local_variables (loop
);
1088 /* In the old loop, move all variables non-local to the loop to a structure
1089 and back, and create separate decls for the variables used in loop. */
1090 separate_decls_in_loop (loop
, &arg_struct
, &new_arg_struct
);
1092 /* Create the parallel constructs. */
1093 parallel_head
= create_parallel_loop (loop
, create_loop_fn (), arg_struct
,
1094 new_arg_struct
, n_threads
);
1098 /* Cancel the loop (it is simpler to do it here rather than to teach the
1099 expander to do it). */
1100 cancel_loop_tree (loop
);
1102 /* Expand the parallel constructs. We do it directly here instead of running
1103 a separate expand_omp pass, since it is more efficient, and less likely to
1104 cause troubles with further analyses not being able to deal with the
1106 omp_expand_local (parallel_head
);
1109 /* Detect parallel loops and generate parallel code using libgomp
1110 primitives. Returns true if some loop was parallelized, false
1114 parallelize_loops (void)
1116 unsigned n_threads
= flag_tree_parallelize_loops
;
1117 bool changed
= false;
1119 struct tree_niter_desc niter_desc
;
1122 /* Do not parallelize loops in the functions created by parallelization. */
1123 if (parallelized_function_p (cfun
->decl
))
1126 FOR_EACH_LOOP (li
, loop
, 0)
1128 if (/* Do not bother with loops in cold areas. */
1129 !maybe_hot_bb_p (loop
->header
)
1130 /* Or loops that roll too little. */
1131 || expected_loop_iterations (loop
) <= n_threads
1132 /* And of course, the loop must be parallelizable. */
1133 || !can_duplicate_loop_p (loop
)
1134 || !loop_parallel_p (loop
, &niter_desc
))
1138 gen_parallel_loop (loop
, n_threads
, &niter_desc
);
1139 verify_flow_info ();
1140 verify_dominators (CDI_DOMINATORS
);
1141 verify_loop_structure ();
1142 verify_loop_closed_ssa ();
1148 #include "gt-tree-parloops.h"