]> git.ipfire.org Git - thirdparty/gcc.git/blob - gcc/tree-parloops.c
* tree-parloops.c: New file.
[thirdparty/gcc.git] / gcc / tree-parloops.c
1 /* Loop autoparallelization.
2 Copyright (C) 2006 Free Software Foundation, Inc.
3 Contributed by Sebastian Pop <pop@cri.ensmp.fr> and
4 Zdenek Dvorak <dvorakz@suse.cz>.
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 2, or (at your option) any later
11 version.
12
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING. If not, write to the Free
20 Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA
21 02110-1301, USA. */
22
23 #include "config.h"
24 #include "system.h"
25 #include "coretypes.h"
26 #include "tm.h"
27 #include "tree.h"
28 #include "rtl.h"
29 #include "tree-flow.h"
30 #include "cfgloop.h"
31 #include "ggc.h"
32 #include "tree-data-ref.h"
33 #include "diagnostic.h"
34 #include "tree-pass.h"
35 #include "tree-scalar-evolution.h"
36 #include "hashtab.h"
37 #include "langhooks.h"
38
39 /* This pass tries to distribute iterations of loops into several threads.
40 The implementation is straightforward -- for each loop we test whether its
41 iterations are independent, and if it is the case (and some additional
42 conditions regarding profitability and correctness are satisfied), we
43 add OMP_PARALLEL and OMP_FOR codes and let omp expansion machinery do
44 its job.
45
46 The most of the complexity is in bringing the code into shape expected
47 by the omp expanders:
48 -- for OMP_FOR, ensuring that the loop has only one induction variable
49 and that the exit test is at the start of the loop body
50 -- for OMP_PARALLEL, replacing the references to local addressable
51 variables by accesses through pointers, and breaking up ssa chains
52 by storing the values incoming to the parallelized loop to a structure
53 passed to the new function as an argument (something similar is done
54 in omp gimplification, unfortunately only a small part of the code
55 can be shared).
56
57 TODO:
58 -- if there are several parallelizable loops in a function, it may be
59 possible to generate the threads just once (using synchronization to
60 ensure that cross-loop dependences are obeyed).
61 -- handling of common scalar dependence patterns (accumulation, ...)
62 -- handling of non-innermost loops */
63
64 /* Minimal number of iterations of a loop that should be executed in each
65 thread. */
66 #define MIN_PER_THREAD 100
67
68 /* Element of hashtable of names to copy. */
69
70 struct name_to_copy_elt
71 {
72 unsigned version; /* The version of the name to copy. */
73 tree new_name; /* The new name used in the copy. */
74 tree field; /* The field of the structure used to pass the
75 value. */
76 };
77
78 /* Equality and hash functions for hashtab code. */
79
80 static int
81 name_to_copy_elt_eq (const void *aa, const void *bb)
82 {
83 struct name_to_copy_elt *a = (struct name_to_copy_elt *) aa;
84 struct name_to_copy_elt *b = (struct name_to_copy_elt *) bb;
85
86 return a->version == b->version;
87 }
88
89 static hashval_t
90 name_to_copy_elt_hash (const void *aa)
91 {
92 struct name_to_copy_elt *a = (struct name_to_copy_elt *) aa;
93
94 return (hashval_t) a->version;
95 }
96
97 /* Returns true if the iterations of LOOP are independent on each other (that
98 is, if we can execute them in parallel), and if LOOP satisfies other
99 conditions that we need to be able to parallelize it. Description of number
100 of iterations is stored to NITER. */
101
102 static bool
103 loop_parallel_p (struct loop *loop, struct tree_niter_desc *niter)
104 {
105 edge exit = single_dom_exit (loop);
106 VEC (ddr_p, heap) *dependence_relations;
107 VEC (data_reference_p, heap) *datarefs;
108 lambda_trans_matrix trans;
109 bool ret = false;
110 tree phi;
111
112 /* Only consider innermost loops with just one exit. The innermost-loop
113 restriction is not necessary, but it makes things simpler. */
114 if (loop->inner || !exit)
115 return false;
116
117 if (dump_file && (dump_flags & TDF_DETAILS))
118 fprintf (dump_file, "\nConsidering loop %d\n", loop->num);
119
120 /* We need to know # of iterations, and there should be no uses of values
121 defined inside loop outside of it, unless the values are invariants of
122 the loop. */
123 if (!number_of_iterations_exit (loop, exit, niter, false))
124 {
125 if (dump_file && (dump_flags & TDF_DETAILS))
126 fprintf (dump_file, " FAILED: number of iterations not known\n");
127 return false;
128 }
129
130 for (phi = phi_nodes (exit->dest); phi; phi = PHI_CHAIN (phi))
131 {
132 tree val = PHI_ARG_DEF_FROM_EDGE (phi, exit);
133
134 if (is_gimple_reg (val))
135 {
136 if (dump_file && (dump_flags & TDF_DETAILS))
137 fprintf (dump_file, " FAILED: value used outside loop\n");
138 return false;
139 }
140 }
141
142 /* The iterations of the loop may communicate only through bivs whose
143 iteration space can be distributed efficiently. */
144 for (phi = phi_nodes (loop->header); phi; phi = PHI_CHAIN (phi))
145 {
146 tree def = PHI_RESULT (phi);
147 affine_iv iv;
148
149 if (is_gimple_reg (def)
150 && !simple_iv (loop, phi, def, &iv, true))
151 {
152 if (dump_file && (dump_flags & TDF_DETAILS))
153 fprintf (dump_file,
154 " FAILED: scalar dependency between iterations\n");
155 return false;
156 }
157 }
158
159 /* We need to version the loop to verify assumptions in runtime. */
160 if (!can_duplicate_loop_p (loop))
161 {
162 if (dump_file && (dump_flags & TDF_DETAILS))
163 fprintf (dump_file, " FAILED: cannot be duplicated\n");
164 return false;
165 }
166
167 /* Check for problems with dependences. If the loop can be reversed,
168 the iterations are independent. */
169 datarefs = VEC_alloc (data_reference_p, heap, 10);
170 dependence_relations = VEC_alloc (ddr_p, heap, 10 * 10);
171 compute_data_dependences_for_loop (loop, true, &datarefs,
172 &dependence_relations);
173 if (dump_file && (dump_flags & TDF_DETAILS))
174 dump_data_dependence_relations (dump_file, dependence_relations);
175
176 trans = lambda_trans_matrix_new (1, 1);
177 LTM_MATRIX (trans)[0][0] = -1;
178
179 if (lambda_transform_legal_p (trans, 1, dependence_relations))
180 {
181 ret = true;
182 if (dump_file && (dump_flags & TDF_DETAILS))
183 fprintf (dump_file, " SUCCESS: may be parallelized\n");
184 }
185 else if (dump_file && (dump_flags & TDF_DETAILS))
186 fprintf (dump_file, " FAILED: data dependencies exist across iterations\n");
187
188 free_dependence_relations (dependence_relations);
189 free_data_refs (datarefs);
190
191 return ret;
192 }
193
194 /* Assigns the address of VAR in TYPE to an ssa name, and returns this name.
195 The assignment statement is placed before LOOP. DECL_ADDRESS maps decls
196 to their addresses that can be reused. */
197
198 static tree
199 take_address_of (tree var, tree type, struct loop *loop, htab_t decl_address)
200 {
201 int uid = DECL_UID (var);
202 void **dslot;
203 struct int_tree_map ielt, *nielt;
204 tree name, bvar, stmt;
205 edge entry = loop_preheader_edge (loop);
206
207 ielt.uid = uid;
208 dslot = htab_find_slot_with_hash (decl_address, &ielt, uid, INSERT);
209 if (!*dslot)
210 {
211 bvar = create_tmp_var (type, get_name (var));
212 add_referenced_var (bvar);
213 stmt = build_gimple_modify_stmt (bvar,
214 fold_convert (type,
215 build_addr (var, current_function_decl)));
216 name = make_ssa_name (bvar, stmt);
217 GIMPLE_STMT_OPERAND (stmt, 0) = name;
218 bsi_insert_on_edge_immediate (entry, stmt);
219
220 nielt = XNEW (struct int_tree_map);
221 nielt->uid = uid;
222 nielt->to = name;
223 *dslot = nielt;
224
225 return name;
226 }
227
228 name = ((struct int_tree_map *) *dslot)->to;
229 if (TREE_TYPE (name) == type)
230 return name;
231
232 bvar = SSA_NAME_VAR (name);
233 stmt = build_gimple_modify_stmt (bvar,
234 fold_convert (type, name));
235 name = make_ssa_name (bvar, stmt);
236 GIMPLE_STMT_OPERAND (stmt, 0) = name;
237 bsi_insert_on_edge_immediate (entry, stmt);
238
239 return name;
240 }
241
242 /* Eliminates references to local variables in *TP out of LOOP. DECL_ADDRESS
243 contains addresses of the references that had their address taken already.
244 If the expression is changed, CHANGED is set to true. Callback for
245 walk_tree. */
246
247 struct elv_data
248 {
249 struct loop *loop;
250 htab_t decl_address;
251 bool changed;
252 };
253
254 static tree
255 eliminate_local_variables_1 (tree *tp, int *walk_subtrees, void *data)
256 {
257 struct elv_data *dta = data;
258 tree t = *tp, var, addr, addr_type, type;
259
260 if (DECL_P (t))
261 {
262 *walk_subtrees = 0;
263
264 if (!SSA_VAR_P (t) || DECL_EXTERNAL (t))
265 return NULL_TREE;
266
267 type = TREE_TYPE (t);
268 addr_type = build_pointer_type (type);
269 addr = take_address_of (t, addr_type, dta->loop, dta->decl_address);
270 *tp = build1 (INDIRECT_REF, TREE_TYPE (*tp), addr);
271
272 dta->changed = true;
273 return NULL_TREE;
274 }
275
276 if (TREE_CODE (t) == ADDR_EXPR)
277 {
278 var = TREE_OPERAND (t, 0);
279 if (!DECL_P (var))
280 return NULL_TREE;
281
282 *walk_subtrees = 0;
283 if (!SSA_VAR_P (var) || DECL_EXTERNAL (var))
284 return NULL_TREE;
285
286 addr_type = TREE_TYPE (t);
287 addr = take_address_of (var, addr_type, dta->loop, dta->decl_address);
288 *tp = addr;
289
290 dta->changed = true;
291 return NULL_TREE;
292 }
293
294 if (!EXPR_P (t)
295 && !GIMPLE_STMT_P (t))
296 *walk_subtrees = 0;
297
298 return NULL_TREE;
299 }
300
301 /* Moves the references to local variables in STMT from LOOP. DECL_ADDRESS
302 contains addresses for the references for that we have already taken
303 them. */
304
305 static void
306 eliminate_local_variables_stmt (struct loop *loop, tree stmt,
307 htab_t decl_address)
308 {
309 struct elv_data dta;
310
311 dta.loop = loop;
312 dta.decl_address = decl_address;
313 dta.changed = false;
314
315 walk_tree (&stmt, eliminate_local_variables_1, &dta, NULL);
316
317 if (dta.changed)
318 update_stmt (stmt);
319 }
320
321 /* Eliminates the references to local variables from LOOP. This includes:
322
323 1) Taking address of a local variable -- these are moved out of the loop
324 (and temporary variable is created to hold the address if necessary).
325 2) Dereferencing a local variable -- these are replaced with indirect
326 references. */
327
328 static void
329 eliminate_local_variables (struct loop *loop)
330 {
331 basic_block bb, *body = get_loop_body (loop);
332 unsigned i;
333 block_stmt_iterator bsi;
334 htab_t decl_address = htab_create (10, int_tree_map_hash, int_tree_map_eq,
335 free);
336
337 /* Find and rename the ssa names defined outside of loop. */
338 for (i = 0; i < loop->num_nodes; i++)
339 {
340 bb = body[i];
341
342 for (bsi = bsi_start (bb); !bsi_end_p (bsi); bsi_next (&bsi))
343 eliminate_local_variables_stmt (loop, bsi_stmt (bsi), decl_address);
344 }
345
346 htab_delete (decl_address);
347 }
348
349 /* If COPY_NAME_P is true, creates and returns a duplicate of NAME.
350 The copies are stored to NAME_COPIES, if NAME was already duplicated,
351 its duplicate stored in NAME_COPIES is returned.
352
353 Regardless of COPY_NAME_P, the decl used as a base of the ssa name is also
354 duplicated, storing the copies in DECL_COPIES. */
355
356 static tree
357 separate_decls_in_loop_name (tree name,
358 htab_t name_copies, htab_t decl_copies,
359 bool copy_name_p)
360 {
361 tree copy, var, var_copy;
362 unsigned idx, uid, nuid;
363 struct int_tree_map ielt, *nielt;
364 struct name_to_copy_elt elt, *nelt;
365 void **slot, **dslot;
366
367 if (TREE_CODE (name) != SSA_NAME)
368 return name;
369
370 idx = SSA_NAME_VERSION (name);
371 elt.version = idx;
372 slot = htab_find_slot_with_hash (name_copies, &elt, idx,
373 copy_name_p ? INSERT : NO_INSERT);
374 if (slot && *slot)
375 return ((struct name_to_copy_elt *) *slot)->new_name;
376
377 var = SSA_NAME_VAR (name);
378 uid = DECL_UID (var);
379 ielt.uid = uid;
380 dslot = htab_find_slot_with_hash (decl_copies, &ielt, uid, INSERT);
381 if (!*dslot)
382 {
383 var_copy = create_tmp_var (TREE_TYPE (var), get_name (var));
384 add_referenced_var (var_copy);
385 nielt = XNEW (struct int_tree_map);
386 nielt->uid = uid;
387 nielt->to = var_copy;
388 *dslot = nielt;
389
390 /* Ensure that when we meet this decl next time, we won't duplicate
391 it again. */
392 nuid = DECL_UID (var_copy);
393 ielt.uid = nuid;
394 dslot = htab_find_slot_with_hash (decl_copies, &ielt, nuid, INSERT);
395 gcc_assert (!*dslot);
396 nielt = XNEW (struct int_tree_map);
397 nielt->uid = nuid;
398 nielt->to = var_copy;
399 *dslot = nielt;
400 }
401 else
402 var_copy = ((struct int_tree_map *) *dslot)->to;
403
404 if (copy_name_p)
405 {
406 copy = duplicate_ssa_name (name, NULL_TREE);
407 nelt = XNEW (struct name_to_copy_elt);
408 nelt->version = idx;
409 nelt->new_name = copy;
410 nelt->field = NULL_TREE;
411 *slot = nelt;
412 }
413 else
414 {
415 gcc_assert (!slot);
416 copy = name;
417 }
418
419 SSA_NAME_VAR (copy) = var_copy;
420 return copy;
421 }
422
423 /* Finds the ssa names used in STMT that are defined outside of LOOP and
424 replaces such ssa names with their duplicates. The duplicates are stored to
425 NAME_COPIES. Base decls of all ssa names used in STMT
426 (including those defined in LOOP) are replaced with the new temporary
427 variables; the replacement decls are stored in DECL_COPIES. */
428
429 static void
430 separate_decls_in_loop_stmt (struct loop *loop, tree stmt,
431 htab_t name_copies, htab_t decl_copies)
432 {
433 use_operand_p use;
434 def_operand_p def;
435 ssa_op_iter oi;
436 tree name, copy;
437 bool copy_name_p;
438
439 mark_virtual_ops_for_renaming (stmt);
440
441 FOR_EACH_PHI_OR_STMT_DEF (def, stmt, oi, SSA_OP_DEF)
442 {
443 name = DEF_FROM_PTR (def);
444 gcc_assert (TREE_CODE (name) == SSA_NAME);
445 copy = separate_decls_in_loop_name (name, name_copies, decl_copies,
446 false);
447 gcc_assert (copy == name);
448 }
449
450 FOR_EACH_PHI_OR_STMT_USE (use, stmt, oi, SSA_OP_USE)
451 {
452 name = USE_FROM_PTR (use);
453 if (TREE_CODE (name) != SSA_NAME)
454 continue;
455
456 copy_name_p = expr_invariant_in_loop_p (loop, name);
457 copy = separate_decls_in_loop_name (name, name_copies, decl_copies,
458 copy_name_p);
459 SET_USE (use, copy);
460 }
461 }
462
463 /* Callback for htab_traverse. Adds a field corresponding to a ssa name
464 described in SLOT to the type passed in DATA. */
465
466 static int
467 add_field_for_name (void **slot, void *data)
468 {
469 struct name_to_copy_elt *elt = *slot;
470 tree type = data;
471 tree name = ssa_name (elt->version);
472 tree var = SSA_NAME_VAR (name);
473 tree field = build_decl (FIELD_DECL, DECL_NAME (var), TREE_TYPE (var));
474
475 insert_field_into_struct (type, field);
476 elt->field = field;
477 return 1;
478 }
479
480 /* Callback for htab_traverse. Creates loads to a field of LOAD in LOAD_BB and
481 store to a field of STORE in STORE_BB for the ssa name and its duplicate
482 specified in SLOT. */
483
484 struct clsn_data
485 {
486 tree store;
487 tree load;
488
489 basic_block store_bb;
490 basic_block load_bb;
491 };
492
493 static int
494 create_loads_and_stores_for_name (void **slot, void *data)
495 {
496 struct name_to_copy_elt *elt = *slot;
497 struct clsn_data *clsn_data = data;
498 tree stmt;
499 block_stmt_iterator bsi;
500 tree type = TREE_TYPE (elt->new_name);
501 tree struct_type = TREE_TYPE (TREE_TYPE (clsn_data->load));
502 tree load_struct;
503
504 bsi = bsi_last (clsn_data->store_bb);
505 stmt = build_gimple_modify_stmt (
506 build3 (COMPONENT_REF, type, clsn_data->store, elt->field,
507 NULL_TREE),
508 ssa_name (elt->version));
509 mark_virtual_ops_for_renaming (stmt);
510 bsi_insert_after (&bsi, stmt, BSI_NEW_STMT);
511
512 bsi = bsi_last (clsn_data->load_bb);
513 load_struct = fold_build1 (INDIRECT_REF, struct_type, clsn_data->load);
514 stmt = build_gimple_modify_stmt (
515 elt->new_name,
516 build3 (COMPONENT_REF, type, load_struct, elt->field,
517 NULL_TREE));
518 SSA_NAME_DEF_STMT (elt->new_name) = stmt;
519 bsi_insert_after (&bsi, stmt, BSI_NEW_STMT);
520
521 return 1;
522 }
523
524 /* Moves all the variables used in LOOP and defined outside of it (including
525 the initial values of loop phi nodes, and *PER_THREAD if it is a ssa
526 name) to a structure created for this purpose. The code
527
528 while (1)
529 {
530 use (a);
531 use (b);
532 }
533
534 is transformed this way:
535
536 bb0:
537 old.a = a;
538 old.b = b;
539
540 bb1:
541 a' = new->a;
542 b' = new->b;
543 while (1)
544 {
545 use (a');
546 use (b');
547 }
548
549 `old' is stored to *ARG_STRUCT and `new' is stored to NEW_ARG_STRUCT. The
550 pointer `new' is intentionally not initialized (the loop will be split to a
551 separate function later, and `new' will be initialized from its arguments).
552 */
553
554 static void
555 separate_decls_in_loop (struct loop *loop, tree *arg_struct,
556 tree *new_arg_struct)
557 {
558 basic_block bb1 = split_edge (loop_preheader_edge (loop));
559 basic_block bb0 = single_pred (bb1);
560 htab_t name_copies = htab_create (10, name_to_copy_elt_hash,
561 name_to_copy_elt_eq, free);
562 htab_t decl_copies = htab_create (10, int_tree_map_hash, int_tree_map_eq,
563 free);
564 basic_block bb, *body = get_loop_body (loop);
565 unsigned i;
566 tree phi, type, type_name, nvar;
567 block_stmt_iterator bsi;
568 struct clsn_data clsn_data;
569
570 /* Find and rename the ssa names defined outside of loop. */
571 for (i = 0; i < loop->num_nodes; i++)
572 {
573 bb = body[i];
574
575 for (phi = phi_nodes (bb); phi; phi = PHI_CHAIN (phi))
576 separate_decls_in_loop_stmt (loop, phi, name_copies, decl_copies);
577
578 for (bsi = bsi_start (bb); !bsi_end_p (bsi); bsi_next (&bsi))
579 separate_decls_in_loop_stmt (loop, bsi_stmt (bsi), name_copies,
580 decl_copies);
581 }
582 free (body);
583
584 if (htab_elements (name_copies) == 0)
585 {
586 /* It may happen that there is nothing to copy (if there are only
587 loop carried and external variables in the loop). */
588 *arg_struct = NULL;
589 *new_arg_struct = NULL;
590 }
591 else
592 {
593 /* Create the type for the structure to store the ssa names to. */
594 type = lang_hooks.types.make_type (RECORD_TYPE);
595 type_name = build_decl (TYPE_DECL, create_tmp_var_name (".paral_data"),
596 type);
597 TYPE_NAME (type) = type_name;
598
599 htab_traverse (name_copies, add_field_for_name, type);
600 layout_type (type);
601
602 /* Create the loads and stores. */
603 *arg_struct = create_tmp_var (type, ".paral_data_store");
604 add_referenced_var (*arg_struct);
605 nvar = create_tmp_var (build_pointer_type (type), ".paral_data_load");
606 add_referenced_var (nvar);
607 *new_arg_struct = make_ssa_name (nvar, NULL_TREE);
608
609 clsn_data.store = *arg_struct;
610 clsn_data.load = *new_arg_struct;
611 clsn_data.store_bb = bb0;
612 clsn_data.load_bb = bb1;
613 htab_traverse (name_copies, create_loads_and_stores_for_name,
614 &clsn_data);
615 }
616
617 htab_delete (decl_copies);
618 htab_delete (name_copies);
619 }
620
621 /* Bitmap containing uids of functions created by parallelization. We cannot
622 allocate it from the default obstack, as it must live across compilation
623 of several functions; we make it gc allocated instead. */
624
625 static GTY(()) bitmap parallelized_functions;
626
627 /* Returns true if FN was created by create_loop_fn. */
628
629 static bool
630 parallelized_function_p (tree fn)
631 {
632 if (!parallelized_functions || !DECL_ARTIFICIAL (fn))
633 return false;
634
635 return bitmap_bit_p (parallelized_functions, DECL_UID (fn));
636 }
637
638 /* Creates and returns an empty function that will receive the body of
639 a parallelized loop. */
640
641 static tree
642 create_loop_fn (void)
643 {
644 char buf[100];
645 char *tname;
646 tree decl, type, name, t;
647 struct function *act_cfun = cfun;
648 static unsigned loopfn_num;
649
650 snprintf (buf, 100, "%s.$loopfn", current_function_name ());
651 ASM_FORMAT_PRIVATE_NAME (tname, buf, loopfn_num++);
652 clean_symbol_name (tname);
653 name = get_identifier (tname);
654 type = build_function_type_list (void_type_node, ptr_type_node, NULL_TREE);
655
656 decl = build_decl (FUNCTION_DECL, name, type);
657 if (!parallelized_functions)
658 parallelized_functions = BITMAP_GGC_ALLOC ();
659 bitmap_set_bit (parallelized_functions, DECL_UID (decl));
660
661 TREE_STATIC (decl) = 1;
662 TREE_USED (decl) = 1;
663 DECL_ARTIFICIAL (decl) = 1;
664 DECL_IGNORED_P (decl) = 0;
665 TREE_PUBLIC (decl) = 0;
666 DECL_UNINLINABLE (decl) = 1;
667 DECL_EXTERNAL (decl) = 0;
668 DECL_CONTEXT (decl) = NULL_TREE;
669 DECL_INITIAL (decl) = make_node (BLOCK);
670
671 t = build_decl (RESULT_DECL, NULL_TREE, void_type_node);
672 DECL_ARTIFICIAL (t) = 1;
673 DECL_IGNORED_P (t) = 1;
674 DECL_RESULT (decl) = t;
675
676 t = build_decl (PARM_DECL, get_identifier (".paral_data_param"),
677 ptr_type_node);
678 DECL_ARTIFICIAL (t) = 1;
679 DECL_ARG_TYPE (t) = ptr_type_node;
680 DECL_CONTEXT (t) = decl;
681 TREE_USED (t) = 1;
682 DECL_ARGUMENTS (decl) = t;
683
684 allocate_struct_function (decl);
685
686 /* The call to allocate_struct_function clobbers CFUN, so we need to restore
687 it. */
688 cfun = act_cfun;
689
690 return decl;
691 }
692
693 /* Bases all the induction variables in LOOP on a single induction variable
694 (unsigned with base 0 and step 1), whose final value is compared with
695 NIT. The induction variable is incremented in the loop latch. */
696
697 static void
698 canonicalize_loop_ivs (struct loop *loop, tree nit)
699 {
700 unsigned precision = TYPE_PRECISION (TREE_TYPE (nit));
701 tree phi, prev, res, type, var_before, val, atype, t, next;
702 block_stmt_iterator bsi;
703 bool ok;
704 affine_iv iv;
705 edge exit = single_dom_exit (loop);
706
707 for (phi = phi_nodes (loop->header); phi; phi = PHI_CHAIN (phi))
708 {
709 res = PHI_RESULT (phi);
710
711 if (is_gimple_reg (res)
712 && TYPE_PRECISION (TREE_TYPE (res)) > precision)
713 precision = TYPE_PRECISION (TREE_TYPE (res));
714 }
715
716 type = lang_hooks.types.type_for_size (precision, 1);
717
718 bsi = bsi_last (loop->latch);
719 create_iv (build_int_cst_type (type, 0), build_int_cst (type, 1), NULL_TREE,
720 loop, &bsi, true, &var_before, NULL);
721
722 bsi = bsi_after_labels (loop->header);
723 prev = NULL;
724 for (phi = phi_nodes (loop->header); phi; phi = next)
725 {
726 next = PHI_CHAIN (phi);
727 res = PHI_RESULT (phi);
728
729 if (!is_gimple_reg (res)
730 || res == var_before)
731 {
732 prev = phi;
733 continue;
734 }
735
736 ok = simple_iv (loop, phi, res, &iv, true);
737 gcc_assert (ok);
738
739 remove_phi_node (phi, prev, false);
740
741 atype = TREE_TYPE (res);
742 val = fold_build2 (PLUS_EXPR, atype,
743 unshare_expr (iv.base),
744 fold_build2 (MULT_EXPR, atype,
745 unshare_expr (iv.step),
746 fold_convert (atype, var_before)));
747 val = force_gimple_operand_bsi (&bsi, val, false, NULL_TREE, true,
748 BSI_SAME_STMT);
749 t = build_gimple_modify_stmt (res, val);
750 bsi_insert_before (&bsi, t, BSI_SAME_STMT);
751 SSA_NAME_DEF_STMT (res) = t;
752 }
753
754 t = last_stmt (exit->src);
755 /* Make the loop exit if the control condition is not satisfied. */
756 if (exit->flags & EDGE_TRUE_VALUE)
757 {
758 edge te, fe;
759
760 extract_true_false_edges_from_block (exit->src, &te, &fe);
761 te->flags = EDGE_FALSE_VALUE;
762 fe->flags = EDGE_TRUE_VALUE;
763 }
764 COND_EXPR_COND (t) = build2 (LT_EXPR, boolean_type_node, var_before, nit);
765 }
766
767 /* Moves the exit condition of LOOP to the beginning of its header, and
768 duplicates the part of the last iteration that gets disabled to the
769 exit of the loop. NIT is the number of iterations of the loop
770 (used to initialize the variables in the duplicated part).
771
772 TODO: the common case is that latch of the loop is empty and immediatelly
773 follows the loop exit. In this case, it would be better not to copy the
774 body of the loop, but only move the entry of the loop directly before the
775 exit check and increase the number of iterations of the loop by one.
776 This may need some additional preconditioning in case NIT = ~0. */
777
778 static void
779 transform_to_exit_first_loop (struct loop *loop, tree nit)
780 {
781 basic_block *bbs, *nbbs, ex_bb, orig_header;
782 unsigned n;
783 bool ok;
784 edge exit = single_dom_exit (loop), hpred;
785 tree phi, nphi, cond, control, control_name, res, t, cond_stmt;
786 block_stmt_iterator bsi;
787
788 split_block_after_labels (loop->header);
789 orig_header = single_succ (loop->header);
790 hpred = single_succ_edge (loop->header);
791
792 cond_stmt = last_stmt (exit->src);
793 cond = COND_EXPR_COND (cond_stmt);
794 control = TREE_OPERAND (cond, 0);
795 gcc_assert (TREE_OPERAND (cond, 1) == nit);
796
797 /* Make sure that we have phi nodes on exit for all loop header phis
798 (create_parallel_loop requires that). */
799 for (phi = phi_nodes (loop->header); phi; phi = PHI_CHAIN (phi))
800 {
801 res = PHI_RESULT (phi);
802 t = make_ssa_name (SSA_NAME_VAR (res), phi);
803 SET_PHI_RESULT (phi, t);
804
805 nphi = create_phi_node (res, orig_header);
806 SSA_NAME_DEF_STMT (res) = nphi;
807 add_phi_arg (nphi, t, hpred);
808
809 if (res == control)
810 {
811 TREE_OPERAND (cond, 0) = t;
812 update_stmt (cond_stmt);
813 control = t;
814 }
815 }
816
817 bbs = get_loop_body_in_dom_order (loop);
818 for (n = 0; bbs[n] != exit->src; n++)
819 continue;
820 nbbs = XNEWVEC (basic_block, n);
821 ok = tree_duplicate_sese_tail (single_succ_edge (loop->header), exit,
822 bbs + 1, n, nbbs);
823 gcc_assert (ok);
824 free (bbs);
825 ex_bb = nbbs[0];
826 free (nbbs);
827
828 /* The only gimple reg that should be copied out of the loop is the
829 control variable. */
830 control_name = NULL_TREE;
831 for (phi = phi_nodes (ex_bb); phi; phi = PHI_CHAIN (phi))
832 {
833 res = PHI_RESULT (phi);
834 if (!is_gimple_reg (res))
835 continue;
836
837 gcc_assert (control_name == NULL_TREE
838 && SSA_NAME_VAR (res) == SSA_NAME_VAR (control));
839 control_name = res;
840 }
841 gcc_assert (control_name != NULL_TREE);
842 phi = SSA_NAME_DEF_STMT (control_name);
843 remove_phi_node (phi, NULL_TREE, false);
844
845 /* Initialize the control variable to NIT. */
846 bsi = bsi_after_labels (ex_bb);
847 t = build_gimple_modify_stmt (control_name, nit);
848 bsi_insert_before (&bsi, t, BSI_NEW_STMT);
849 SSA_NAME_DEF_STMT (control_name) = t;
850 }
851
852 /* Create the parallel constructs for LOOP as described in gen_parallel_loop.
853 LOOP_FN and DATA are the arguments of OMP_PARALLEL.
854 NEW_DATA is the variable that should be initialized from the argument
855 of LOOP_FN. N_THREADS is the requested number of threads. Returns the
856 basic block containing OMP_PARALLEL tree. */
857
858 static basic_block
859 create_parallel_loop (struct loop *loop, tree loop_fn, tree data,
860 tree new_data, unsigned n_threads)
861 {
862 block_stmt_iterator bsi;
863 basic_block bb, paral_bb, for_bb, ex_bb;
864 tree t, param, res, for_stmt;
865 tree cvar, cvar_init, initvar, cvar_next, cvar_base, cond, phi, type;
866 edge exit, nexit, guard, end, e;
867
868 /* Prepare the OMP_PARALLEL statement. */
869 bb = loop_preheader_edge (loop)->src;
870 paral_bb = single_pred (bb);
871 bsi = bsi_last (paral_bb);
872
873 t = build_omp_clause (OMP_CLAUSE_NUM_THREADS);
874 OMP_CLAUSE_NUM_THREADS_EXPR (t)
875 = build_int_cst (integer_type_node, n_threads);
876 t = build4 (OMP_PARALLEL, void_type_node, NULL_TREE, t,
877 loop_fn, data);
878
879 bsi_insert_after (&bsi, t, BSI_NEW_STMT);
880
881 /* Initialize NEW_DATA. */
882 if (data)
883 {
884 bsi = bsi_after_labels (bb);
885
886 param = make_ssa_name (DECL_ARGUMENTS (loop_fn), NULL_TREE);
887 t = build_gimple_modify_stmt (param, build_fold_addr_expr (data));
888 bsi_insert_before (&bsi, t, BSI_SAME_STMT);
889 SSA_NAME_DEF_STMT (param) = t;
890
891 t = build_gimple_modify_stmt (new_data,
892 fold_convert (TREE_TYPE (new_data), param));
893 bsi_insert_before (&bsi, t, BSI_SAME_STMT);
894 SSA_NAME_DEF_STMT (new_data) = t;
895 }
896
897 /* Emit OMP_RETURN for OMP_PARALLEL. */
898 bb = split_loop_exit_edge (single_dom_exit (loop));
899 bsi = bsi_last (bb);
900 bsi_insert_after (&bsi, make_node (OMP_RETURN), BSI_NEW_STMT);
901
902 /* Extract data for OMP_FOR. */
903 gcc_assert (loop->header == single_dom_exit (loop)->src);
904 cond = COND_EXPR_COND (last_stmt (loop->header));
905
906 cvar = TREE_OPERAND (cond, 0);
907 cvar_base = SSA_NAME_VAR (cvar);
908 phi = SSA_NAME_DEF_STMT (cvar);
909 cvar_init = PHI_ARG_DEF_FROM_EDGE (phi, loop_preheader_edge (loop));
910 initvar = make_ssa_name (cvar_base, NULL_TREE);
911 SET_USE (PHI_ARG_DEF_PTR_FROM_EDGE (phi, loop_preheader_edge (loop)),
912 initvar);
913 cvar_next = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
914
915 bsi = bsi_last (loop->latch);
916 gcc_assert (bsi_stmt (bsi) == SSA_NAME_DEF_STMT (cvar_next));
917 bsi_remove (&bsi, true);
918
919 /* Prepare cfg. */
920 for_bb = split_edge (loop_preheader_edge (loop));
921 ex_bb = split_loop_exit_edge (single_dom_exit (loop));
922 extract_true_false_edges_from_block (loop->header, &nexit, &exit);
923 gcc_assert (exit == single_dom_exit (loop));
924
925 guard = make_edge (for_bb, ex_bb, 0);
926 single_succ_edge (loop->latch)->flags = 0;
927 end = make_edge (loop->latch, ex_bb, EDGE_FALLTHRU);
928 for (phi = phi_nodes (ex_bb); phi; phi = PHI_CHAIN (phi))
929 {
930 res = PHI_RESULT (phi);
931 gcc_assert (!is_gimple_reg (phi));
932 t = SSA_NAME_DEF_STMT (PHI_ARG_DEF_FROM_EDGE (phi, exit));
933 add_phi_arg (phi, PHI_ARG_DEF_FROM_EDGE (t, loop_preheader_edge (loop)),
934 guard);
935 add_phi_arg (phi, PHI_ARG_DEF_FROM_EDGE (t, loop_latch_edge (loop)),
936 end);
937 }
938 e = redirect_edge_and_branch (exit, nexit->dest);
939 PENDING_STMT (e) = NULL;
940
941 /* Emit OMP_FOR. */
942 TREE_OPERAND (cond, 0) = cvar_base;
943 type = TREE_TYPE (cvar);
944 t = build_omp_clause (OMP_CLAUSE_SCHEDULE);
945 OMP_CLAUSE_SCHEDULE_KIND (t) = OMP_CLAUSE_SCHEDULE_STATIC;
946
947 for_stmt = make_node (OMP_FOR);
948 TREE_TYPE (for_stmt) = void_type_node;
949 OMP_FOR_CLAUSES (for_stmt) = t;
950 OMP_FOR_INIT (for_stmt) = build_gimple_modify_stmt (initvar, cvar_init);
951 OMP_FOR_COND (for_stmt) = cond;
952 OMP_FOR_INCR (for_stmt) = build_gimple_modify_stmt (
953 cvar_base,
954 build2 (PLUS_EXPR, type,
955 cvar_base,
956 build_int_cst (type, 1)));
957 OMP_FOR_BODY (for_stmt) = NULL_TREE;
958 OMP_FOR_PRE_BODY (for_stmt) = NULL_TREE;
959
960 bsi = bsi_last (for_bb);
961 bsi_insert_after (&bsi, for_stmt, BSI_NEW_STMT);
962 SSA_NAME_DEF_STMT (initvar) = for_stmt;
963
964 /* Emit OMP_CONTINUE. */
965 bsi = bsi_last (loop->latch);
966 t = build2 (OMP_CONTINUE, void_type_node, cvar_next, cvar);
967 bsi_insert_after (&bsi, t, BSI_NEW_STMT);
968 SSA_NAME_DEF_STMT (cvar_next) = t;
969
970 /* Emit OMP_RETURN for OMP_FOR. */
971 bsi = bsi_last (ex_bb);
972 bsi_insert_after (&bsi, make_node (OMP_RETURN), BSI_NEW_STMT);
973
974 return paral_bb;
975 }
976
977 /* Generates code to execute the iterations of LOOP in N_THREADS threads in
978 parallel. NITER describes number of iterations of LOOP. */
979
980 static void
981 gen_parallel_loop (struct loop *loop, unsigned n_threads,
982 struct tree_niter_desc *niter)
983 {
984 struct loop *nloop;
985 tree many_iterations_cond, type, nit;
986 tree stmts, arg_struct, new_arg_struct;
987 basic_block parallel_head;
988 unsigned prob;
989
990 /* From
991
992 ---------------------------------------------------------------------
993 loop
994 {
995 IV = phi (INIT, IV + STEP)
996 BODY1;
997 if (COND)
998 break;
999 BODY2;
1000 }
1001 ---------------------------------------------------------------------
1002
1003 with # of iterations NITER (possibly with MAY_BE_ZERO assumption),
1004 we generate the following code:
1005
1006 ---------------------------------------------------------------------
1007
1008 if (MAY_BE_ZERO
1009 || NITER < MIN_PER_THREAD * N_THREADS)
1010 goto original;
1011
1012 BODY1;
1013 store all local loop-invariant variables used in body of the loop to DATA.
1014 OMP_PARALLEL (OMP_CLAUSE_NUM_THREADS (N_THREADS), LOOPFN, DATA);
1015 load the variables from DATA.
1016 OMP_FOR (IV = INIT; COND; IV += STEP) (OMP_CLAUSE_SCHEDULE (static))
1017 BODY2;
1018 BODY1;
1019 OMP_CONTINUE;
1020 OMP_RETURN -- OMP_FOR
1021 OMP_RETURN -- OMP_PARALLEL
1022 goto end;
1023
1024 original:
1025 loop
1026 {
1027 IV = phi (INIT, IV + STEP)
1028 BODY1;
1029 if (COND)
1030 break;
1031 BODY2;
1032 }
1033
1034 end:
1035
1036 */
1037
1038 /* Create two versions of the loop -- in the old one, we know that the
1039 number of iterations is large enough, and we will transform it into the
1040 loop that will be split to loop_fn, the new one will be used for the
1041 remaining iterations. */
1042
1043 type = TREE_TYPE (niter->niter);
1044 nit = force_gimple_operand (unshare_expr (niter->niter), &stmts, true,
1045 NULL_TREE);
1046 if (stmts)
1047 bsi_insert_on_edge_immediate (loop_preheader_edge (loop), stmts);
1048
1049 many_iterations_cond =
1050 fold_build2 (GE_EXPR, boolean_type_node,
1051 nit, build_int_cst (type, MIN_PER_THREAD * n_threads));
1052 many_iterations_cond
1053 = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
1054 invert_truthvalue (unshare_expr (niter->may_be_zero)),
1055 many_iterations_cond);
1056 many_iterations_cond
1057 = force_gimple_operand (many_iterations_cond, &stmts,
1058 false, NULL_TREE);
1059 if (stmts)
1060 bsi_insert_on_edge_immediate (loop_preheader_edge (loop), stmts);
1061 if (!is_gimple_condexpr (many_iterations_cond))
1062 {
1063 many_iterations_cond
1064 = force_gimple_operand (many_iterations_cond, &stmts,
1065 true, NULL_TREE);
1066 if (stmts)
1067 bsi_insert_on_edge_immediate (loop_preheader_edge (loop), stmts);
1068 }
1069
1070 initialize_original_copy_tables ();
1071
1072 /* We assume that the loop usually iterates a lot. */
1073 prob = 4 * REG_BR_PROB_BASE / 5;
1074 nloop = loop_version (loop, many_iterations_cond, NULL,
1075 prob, prob, REG_BR_PROB_BASE - prob, true);
1076 update_ssa (TODO_update_ssa);
1077 free_original_copy_tables ();
1078
1079 /* Base all the induction variables in LOOP on a single control one. */
1080 canonicalize_loop_ivs (loop, nit);
1081
1082 /* Ensure that the exit condition is the first statement in the loop. */
1083 transform_to_exit_first_loop (loop, nit);
1084
1085 /* Eliminate the references to local variables from the loop. */
1086 eliminate_local_variables (loop);
1087
1088 /* In the old loop, move all variables non-local to the loop to a structure
1089 and back, and create separate decls for the variables used in loop. */
1090 separate_decls_in_loop (loop, &arg_struct, &new_arg_struct);
1091
1092 /* Create the parallel constructs. */
1093 parallel_head = create_parallel_loop (loop, create_loop_fn (), arg_struct,
1094 new_arg_struct, n_threads);
1095
1096 scev_reset ();
1097
1098 /* Cancel the loop (it is simpler to do it here rather than to teach the
1099 expander to do it). */
1100 cancel_loop_tree (loop);
1101
1102 /* Expand the parallel constructs. We do it directly here instead of running
1103 a separate expand_omp pass, since it is more efficient, and less likely to
1104 cause troubles with further analyses not being able to deal with the
1105 OMP trees. */
1106 omp_expand_local (parallel_head);
1107 }
1108
1109 /* Detect parallel loops and generate parallel code using libgomp
1110 primitives. Returns true if some loop was parallelized, false
1111 otherwise. */
1112
1113 bool
1114 parallelize_loops (void)
1115 {
1116 unsigned n_threads = flag_tree_parallelize_loops;
1117 bool changed = false;
1118 struct loop *loop;
1119 struct tree_niter_desc niter_desc;
1120 loop_iterator li;
1121
1122 /* Do not parallelize loops in the functions created by parallelization. */
1123 if (parallelized_function_p (cfun->decl))
1124 return false;
1125
1126 FOR_EACH_LOOP (li, loop, 0)
1127 {
1128 if (/* Do not bother with loops in cold areas. */
1129 !maybe_hot_bb_p (loop->header)
1130 /* Or loops that roll too little. */
1131 || expected_loop_iterations (loop) <= n_threads
1132 /* And of course, the loop must be parallelizable. */
1133 || !can_duplicate_loop_p (loop)
1134 || !loop_parallel_p (loop, &niter_desc))
1135 continue;
1136
1137 changed = true;
1138 gen_parallel_loop (loop, n_threads, &niter_desc);
1139 verify_flow_info ();
1140 verify_dominators (CDI_DOMINATORS);
1141 verify_loop_structure ();
1142 verify_loop_closed_ssa ();
1143 }
1144
1145 return changed;
1146 }
1147
1148 #include "gt-tree-parloops.h"