2 Copyright (C) 2006-2019 Free Software Foundation, Inc.
3 Contributed by Georges-Andre Silber <Georges-Andre.Silber@ensmp.fr>
4 and Sebastian Pop <sebastian.pop@amd.com>.
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it
9 under the terms of the GNU General Public License as published by the
10 Free Software Foundation; either version 3, or (at your option) any
13 GCC is distributed in the hope that it will be useful, but WITHOUT
14 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 /* This pass performs loop distribution: for example, the loop
39 Loop distribution is the dual of loop fusion. It separates statements
40 of a loop (or loop nest) into multiple loops (or loop nests) with the
41 same loop header. The major goal is to separate statements which may
42 be vectorized from those that can't. This pass implements distribution
43 in the following steps:
45 1) Seed partitions with specific type statements. For now we support
46 two types seed statements: statement defining variable used outside
47 of loop; statement storing to memory.
48 2) Build reduced dependence graph (RDG) for loop to be distributed.
49 The vertices (RDG:V) model all statements in the loop and the edges
50 (RDG:E) model flow and control dependencies between statements.
51 3) Apart from RDG, compute data dependencies between memory references.
52 4) Starting from seed statement, build up partition by adding depended
53 statements according to RDG's dependence information. Partition is
54 classified as parallel type if it can be executed paralleled; or as
55 sequential type if it can't. Parallel type partition is further
56 classified as different builtin kinds if it can be implemented as
57 builtin function calls.
58 5) Build partition dependence graph (PG) based on data dependencies.
59 The vertices (PG:V) model all partitions and the edges (PG:E) model
60 all data dependencies between every partitions pair. In general,
61 data dependence is either compilation time known or unknown. In C
62 family languages, there exists quite amount compilation time unknown
63 dependencies because of possible alias relation of data references.
64 We categorize PG's edge to two types: "true" edge that represents
65 compilation time known data dependencies; "alias" edge for all other
67 6) Traverse subgraph of PG as if all "alias" edges don't exist. Merge
68 partitions in each strong connected component (SCC) correspondingly.
69 Build new PG for merged partitions.
70 7) Traverse PG again and this time with both "true" and "alias" edges
71 included. We try to break SCCs by removing some edges. Because
72 SCCs by "true" edges are all fused in step 6), we can break SCCs
73 by removing some "alias" edges. It's NP-hard to choose optimal
74 edge set, fortunately simple approximation is good enough for us
75 given the small problem scale.
76 8) Collect all data dependencies of the removed "alias" edges. Create
77 runtime alias checks for collected data dependencies.
78 9) Version loop under the condition of runtime alias checks. Given
79 loop distribution generally introduces additional overhead, it is
80 only useful if vectorization is achieved in distributed loop. We
81 version loop with internal function call IFN_LOOP_DIST_ALIAS. If
82 no distributed loop can be vectorized, we simply remove distributed
83 loops and recover to the original one.
86 1) We only distribute innermost two-level loop nest now. We should
87 extend it for arbitrary loop nests in the future.
88 2) We only fuse partitions in SCC now. A better fusion algorithm is
89 desired to minimize loop overhead, maximize parallelism and maximize
94 #include "coretypes.h"
99 #include "tree-pass.h"
101 #include "gimple-pretty-print.h"
102 #include "fold-const.h"
104 #include "gimple-iterator.h"
105 #include "gimplify-me.h"
106 #include "stor-layout.h"
107 #include "tree-cfg.h"
108 #include "tree-ssa-loop-manip.h"
109 #include "tree-ssa-loop-ivopts.h"
110 #include "tree-ssa-loop.h"
111 #include "tree-into-ssa.h"
112 #include "tree-ssa.h"
114 #include "tree-scalar-evolution.h"
116 #include "tree-vectorizer.h"
120 #define MAX_DATAREFS_NUM \
121 ((unsigned) PARAM_VALUE (PARAM_LOOP_MAX_DATAREFS_FOR_DATADEPS))
123 /* Threshold controlling number of distributed partitions. Given it may
124 be unnecessary if a memory stream cost model is invented in the future,
125 we define it as a temporary macro, rather than a parameter. */
126 #define NUM_PARTITION_THRESHOLD (4)
128 /* Hashtable helpers. */
130 struct ddr_hasher
: nofree_ptr_hash
<struct data_dependence_relation
>
132 static inline hashval_t
hash (const data_dependence_relation
*);
133 static inline bool equal (const data_dependence_relation
*,
134 const data_dependence_relation
*);
137 /* Hash function for data dependence. */
140 ddr_hasher::hash (const data_dependence_relation
*ddr
)
143 h
.add_ptr (DDR_A (ddr
));
144 h
.add_ptr (DDR_B (ddr
));
148 /* Hash table equality function for data dependence. */
151 ddr_hasher::equal (const data_dependence_relation
*ddr1
,
152 const data_dependence_relation
*ddr2
)
154 return (DDR_A (ddr1
) == DDR_A (ddr2
) && DDR_B (ddr1
) == DDR_B (ddr2
));
157 /* The loop (nest) to be distributed. */
158 static vec
<loop_p
> loop_nest
;
160 /* Vector of data references in the loop to be distributed. */
161 static vec
<data_reference_p
> datarefs_vec
;
163 /* Store index of data reference in aux field. */
164 #define DR_INDEX(dr) ((uintptr_t) (dr)->aux)
166 /* Hash table for data dependence relation in the loop to be distributed. */
167 static hash_table
<ddr_hasher
> *ddrs_table
;
169 /* A Reduced Dependence Graph (RDG) vertex representing a statement. */
172 /* The statement represented by this vertex. */
175 /* Vector of data-references in this statement. */
176 vec
<data_reference_p
> datarefs
;
178 /* True when the statement contains a write to memory. */
181 /* True when the statement contains a read from memory. */
185 #define RDGV_STMT(V) ((struct rdg_vertex *) ((V)->data))->stmt
186 #define RDGV_DATAREFS(V) ((struct rdg_vertex *) ((V)->data))->datarefs
187 #define RDGV_HAS_MEM_WRITE(V) ((struct rdg_vertex *) ((V)->data))->has_mem_write
188 #define RDGV_HAS_MEM_READS(V) ((struct rdg_vertex *) ((V)->data))->has_mem_reads
189 #define RDG_STMT(RDG, I) RDGV_STMT (&(RDG->vertices[I]))
190 #define RDG_DATAREFS(RDG, I) RDGV_DATAREFS (&(RDG->vertices[I]))
191 #define RDG_MEM_WRITE_STMT(RDG, I) RDGV_HAS_MEM_WRITE (&(RDG->vertices[I]))
192 #define RDG_MEM_READS_STMT(RDG, I) RDGV_HAS_MEM_READS (&(RDG->vertices[I]))
194 /* Data dependence type. */
198 /* Read After Write (RAW). */
201 /* Control dependence (execute conditional on). */
205 /* Dependence information attached to an edge of the RDG. */
209 /* Type of the dependence. */
210 enum rdg_dep_type type
;
213 #define RDGE_TYPE(E) ((struct rdg_edge *) ((E)->data))->type
215 /* Dump vertex I in RDG to FILE. */
218 dump_rdg_vertex (FILE *file
, struct graph
*rdg
, int i
)
220 struct vertex
*v
= &(rdg
->vertices
[i
]);
221 struct graph_edge
*e
;
223 fprintf (file
, "(vertex %d: (%s%s) (in:", i
,
224 RDG_MEM_WRITE_STMT (rdg
, i
) ? "w" : "",
225 RDG_MEM_READS_STMT (rdg
, i
) ? "r" : "");
228 for (e
= v
->pred
; e
; e
= e
->pred_next
)
229 fprintf (file
, " %d", e
->src
);
231 fprintf (file
, ") (out:");
234 for (e
= v
->succ
; e
; e
= e
->succ_next
)
235 fprintf (file
, " %d", e
->dest
);
237 fprintf (file
, ")\n");
238 print_gimple_stmt (file
, RDGV_STMT (v
), 0, TDF_VOPS
|TDF_MEMSYMS
);
239 fprintf (file
, ")\n");
242 /* Call dump_rdg_vertex on stderr. */
245 debug_rdg_vertex (struct graph
*rdg
, int i
)
247 dump_rdg_vertex (stderr
, rdg
, i
);
250 /* Dump the reduced dependence graph RDG to FILE. */
253 dump_rdg (FILE *file
, struct graph
*rdg
)
255 fprintf (file
, "(rdg\n");
256 for (int i
= 0; i
< rdg
->n_vertices
; i
++)
257 dump_rdg_vertex (file
, rdg
, i
);
258 fprintf (file
, ")\n");
261 /* Call dump_rdg on stderr. */
264 debug_rdg (struct graph
*rdg
)
266 dump_rdg (stderr
, rdg
);
270 dot_rdg_1 (FILE *file
, struct graph
*rdg
)
273 pretty_printer buffer
;
274 pp_needs_newline (&buffer
) = false;
275 buffer
.buffer
->stream
= file
;
277 fprintf (file
, "digraph RDG {\n");
279 for (i
= 0; i
< rdg
->n_vertices
; i
++)
281 struct vertex
*v
= &(rdg
->vertices
[i
]);
282 struct graph_edge
*e
;
284 fprintf (file
, "%d [label=\"[%d] ", i
, i
);
285 pp_gimple_stmt_1 (&buffer
, RDGV_STMT (v
), 0, TDF_SLIM
);
287 fprintf (file
, "\"]\n");
289 /* Highlight reads from memory. */
290 if (RDG_MEM_READS_STMT (rdg
, i
))
291 fprintf (file
, "%d [style=filled, fillcolor=green]\n", i
);
293 /* Highlight stores to memory. */
294 if (RDG_MEM_WRITE_STMT (rdg
, i
))
295 fprintf (file
, "%d [style=filled, fillcolor=red]\n", i
);
298 for (e
= v
->succ
; e
; e
= e
->succ_next
)
299 switch (RDGE_TYPE (e
))
302 /* These are the most common dependences: don't print these. */
303 fprintf (file
, "%d -> %d \n", i
, e
->dest
);
307 fprintf (file
, "%d -> %d [label=control] \n", i
, e
->dest
);
315 fprintf (file
, "}\n\n");
318 /* Display the Reduced Dependence Graph using dotty. */
321 dot_rdg (struct graph
*rdg
)
323 /* When debugging, you may want to enable the following code. */
325 FILE *file
= popen ("dot -Tx11", "w");
328 dot_rdg_1 (file
, rdg
);
330 close (fileno (file
));
333 dot_rdg_1 (stderr
, rdg
);
337 /* Returns the index of STMT in RDG. */
340 rdg_vertex_for_stmt (struct graph
*rdg ATTRIBUTE_UNUSED
, gimple
*stmt
)
342 int index
= gimple_uid (stmt
);
343 gcc_checking_assert (index
== -1 || RDG_STMT (rdg
, index
) == stmt
);
347 /* Creates dependence edges in RDG for all the uses of DEF. IDEF is
348 the index of DEF in RDG. */
351 create_rdg_edges_for_scalar (struct graph
*rdg
, tree def
, int idef
)
353 use_operand_p imm_use_p
;
354 imm_use_iterator iterator
;
356 FOR_EACH_IMM_USE_FAST (imm_use_p
, iterator
, def
)
358 struct graph_edge
*e
;
359 int use
= rdg_vertex_for_stmt (rdg
, USE_STMT (imm_use_p
));
364 e
= add_edge (rdg
, idef
, use
);
365 e
->data
= XNEW (struct rdg_edge
);
366 RDGE_TYPE (e
) = flow_dd
;
370 /* Creates an edge for the control dependences of BB to the vertex V. */
373 create_edge_for_control_dependence (struct graph
*rdg
, basic_block bb
,
374 int v
, control_dependences
*cd
)
378 EXECUTE_IF_SET_IN_BITMAP (cd
->get_edges_dependent_on (bb
->index
),
381 basic_block cond_bb
= cd
->get_edge_src (edge_n
);
382 gimple
*stmt
= last_stmt (cond_bb
);
383 if (stmt
&& is_ctrl_stmt (stmt
))
385 struct graph_edge
*e
;
386 int c
= rdg_vertex_for_stmt (rdg
, stmt
);
390 e
= add_edge (rdg
, c
, v
);
391 e
->data
= XNEW (struct rdg_edge
);
392 RDGE_TYPE (e
) = control_dd
;
397 /* Creates the edges of the reduced dependence graph RDG. */
400 create_rdg_flow_edges (struct graph
*rdg
)
406 for (i
= 0; i
< rdg
->n_vertices
; i
++)
407 FOR_EACH_PHI_OR_STMT_DEF (def_p
, RDG_STMT (rdg
, i
),
409 create_rdg_edges_for_scalar (rdg
, DEF_FROM_PTR (def_p
), i
);
412 /* Creates the edges of the reduced dependence graph RDG. */
415 create_rdg_cd_edges (struct graph
*rdg
, control_dependences
*cd
, loop_p loop
)
419 for (i
= 0; i
< rdg
->n_vertices
; i
++)
421 gimple
*stmt
= RDG_STMT (rdg
, i
);
422 if (gimple_code (stmt
) == GIMPLE_PHI
)
426 FOR_EACH_EDGE (e
, ei
, gimple_bb (stmt
)->preds
)
427 if (flow_bb_inside_loop_p (loop
, e
->src
))
428 create_edge_for_control_dependence (rdg
, e
->src
, i
, cd
);
431 create_edge_for_control_dependence (rdg
, gimple_bb (stmt
), i
, cd
);
435 /* Build the vertices of the reduced dependence graph RDG. Return false
439 create_rdg_vertices (struct graph
*rdg
, vec
<gimple
*> stmts
, loop_p loop
)
444 FOR_EACH_VEC_ELT (stmts
, i
, stmt
)
446 struct vertex
*v
= &(rdg
->vertices
[i
]);
448 /* Record statement to vertex mapping. */
449 gimple_set_uid (stmt
, i
);
451 v
->data
= XNEW (struct rdg_vertex
);
452 RDGV_STMT (v
) = stmt
;
453 RDGV_DATAREFS (v
).create (0);
454 RDGV_HAS_MEM_WRITE (v
) = false;
455 RDGV_HAS_MEM_READS (v
) = false;
456 if (gimple_code (stmt
) == GIMPLE_PHI
)
459 unsigned drp
= datarefs_vec
.length ();
460 if (!find_data_references_in_stmt (loop
, stmt
, &datarefs_vec
))
462 for (unsigned j
= drp
; j
< datarefs_vec
.length (); ++j
)
464 data_reference_p dr
= datarefs_vec
[j
];
466 RDGV_HAS_MEM_READS (v
) = true;
468 RDGV_HAS_MEM_WRITE (v
) = true;
469 RDGV_DATAREFS (v
).safe_push (dr
);
475 /* Array mapping basic block's index to its topological order. */
476 static int *bb_top_order_index
;
477 /* And size of the array. */
478 static int bb_top_order_index_size
;
480 /* If X has a smaller topological sort number than Y, returns -1;
481 if greater, returns 1. */
484 bb_top_order_cmp (const void *x
, const void *y
)
486 basic_block bb1
= *(const basic_block
*) x
;
487 basic_block bb2
= *(const basic_block
*) y
;
489 gcc_assert (bb1
->index
< bb_top_order_index_size
490 && bb2
->index
< bb_top_order_index_size
);
491 gcc_assert (bb1
== bb2
492 || bb_top_order_index
[bb1
->index
]
493 != bb_top_order_index
[bb2
->index
]);
495 return (bb_top_order_index
[bb1
->index
] - bb_top_order_index
[bb2
->index
]);
498 /* Initialize STMTS with all the statements of LOOP. We use topological
499 order to discover all statements. The order is important because
500 generate_loops_for_partition is using the same traversal for identifying
501 statements in loop copies. */
504 stmts_from_loop (struct loop
*loop
, vec
<gimple
*> *stmts
)
507 basic_block
*bbs
= get_loop_body_in_custom_order (loop
, bb_top_order_cmp
);
509 for (i
= 0; i
< loop
->num_nodes
; i
++)
511 basic_block bb
= bbs
[i
];
513 for (gphi_iterator bsi
= gsi_start_phis (bb
); !gsi_end_p (bsi
);
515 if (!virtual_operand_p (gimple_phi_result (bsi
.phi ())))
516 stmts
->safe_push (bsi
.phi ());
518 for (gimple_stmt_iterator bsi
= gsi_start_bb (bb
); !gsi_end_p (bsi
);
521 gimple
*stmt
= gsi_stmt (bsi
);
522 if (gimple_code (stmt
) != GIMPLE_LABEL
&& !is_gimple_debug (stmt
))
523 stmts
->safe_push (stmt
);
530 /* Free the reduced dependence graph RDG. */
533 free_rdg (struct graph
*rdg
)
537 for (i
= 0; i
< rdg
->n_vertices
; i
++)
539 struct vertex
*v
= &(rdg
->vertices
[i
]);
540 struct graph_edge
*e
;
542 for (e
= v
->succ
; e
; e
= e
->succ_next
)
547 gimple_set_uid (RDGV_STMT (v
), -1);
548 (RDGV_DATAREFS (v
)).release ();
556 /* Build the Reduced Dependence Graph (RDG) with one vertex per statement of
557 LOOP, and one edge per flow dependence or control dependence from control
558 dependence CD. During visiting each statement, data references are also
559 collected and recorded in global data DATAREFS_VEC. */
561 static struct graph
*
562 build_rdg (struct loop
*loop
, control_dependences
*cd
)
566 /* Create the RDG vertices from the stmts of the loop nest. */
567 auto_vec
<gimple
*, 10> stmts
;
568 stmts_from_loop (loop
, &stmts
);
569 rdg
= new_graph (stmts
.length ());
570 if (!create_rdg_vertices (rdg
, stmts
, loop
))
577 create_rdg_flow_edges (rdg
);
579 create_rdg_cd_edges (rdg
, cd
, loop
);
585 /* Kind of distributed loop. */
586 enum partition_kind
{
588 /* Partial memset stands for a paritition can be distributed into a loop
589 of memset calls, rather than a single memset call. It's handled just
590 like a normal parition, i.e, distributed as separate loop, no memset
593 Note: This is a hacking fix trying to distribute ZERO-ing stmt in a
594 loop nest as deep as possible. As a result, parloop achieves better
595 parallelization by parallelizing deeper loop nest. This hack should
596 be unnecessary and removed once distributed memset can be understood
597 and analyzed in data reference analysis. See PR82604 for more. */
598 PKIND_PARTIAL_MEMSET
,
599 PKIND_MEMSET
, PKIND_MEMCPY
, PKIND_MEMMOVE
602 /* Type of distributed loop. */
603 enum partition_type
{
604 /* The distributed loop can be executed parallelly. */
606 /* The distributed loop has to be executed sequentially. */
610 /* Builtin info for loop distribution. */
613 /* data-references a kind != PKIND_NORMAL partition is about. */
614 data_reference_p dst_dr
;
615 data_reference_p src_dr
;
616 /* Base address and size of memory objects operated by the builtin. Note
617 both dest and source memory objects must have the same size. */
621 /* Base and offset part of dst_base after stripping constant offset. This
622 is only used in memset builtin distribution for now. */
624 unsigned HOST_WIDE_INT dst_base_offset
;
627 /* Partition for loop distribution. */
630 /* Statements of the partition. */
632 /* True if the partition defines variable which is used outside of loop. */
634 enum partition_kind kind
;
635 enum partition_type type
;
636 /* Data references in the partition. */
638 /* Information of builtin parition. */
639 struct builtin_info
*builtin
;
643 /* Allocate and initialize a partition from BITMAP. */
646 partition_alloc (void)
648 partition
*partition
= XCNEW (struct partition
);
649 partition
->stmts
= BITMAP_ALLOC (NULL
);
650 partition
->reduction_p
= false;
651 partition
->kind
= PKIND_NORMAL
;
652 partition
->datarefs
= BITMAP_ALLOC (NULL
);
656 /* Free PARTITION. */
659 partition_free (partition
*partition
)
661 BITMAP_FREE (partition
->stmts
);
662 BITMAP_FREE (partition
->datarefs
);
663 if (partition
->builtin
)
664 free (partition
->builtin
);
669 /* Returns true if the partition can be generated as a builtin. */
672 partition_builtin_p (partition
*partition
)
674 return partition
->kind
> PKIND_PARTIAL_MEMSET
;
677 /* Returns true if the partition contains a reduction. */
680 partition_reduction_p (partition
*partition
)
682 return partition
->reduction_p
;
685 /* Partitions are fused because of different reasons. */
688 FUSE_NON_BUILTIN
= 0,
695 /* Description on different fusing reason. */
696 static const char *fuse_message
[] = {
697 "they are non-builtins",
698 "they have reductions",
699 "they have shared memory refs",
700 "they are in the same dependence scc",
701 "there is no point to distribute loop"};
704 update_type_for_merge (struct graph
*, partition
*, partition
*);
706 /* Merge PARTITION into the partition DEST. RDG is the reduced dependence
707 graph and we update type for result partition if it is non-NULL. */
710 partition_merge_into (struct graph
*rdg
, partition
*dest
,
711 partition
*partition
, enum fuse_type ft
)
713 if (dump_file
&& (dump_flags
& TDF_DETAILS
))
715 fprintf (dump_file
, "Fuse partitions because %s:\n", fuse_message
[ft
]);
716 fprintf (dump_file
, " Part 1: ");
717 dump_bitmap (dump_file
, dest
->stmts
);
718 fprintf (dump_file
, " Part 2: ");
719 dump_bitmap (dump_file
, partition
->stmts
);
722 dest
->kind
= PKIND_NORMAL
;
723 if (dest
->type
== PTYPE_PARALLEL
)
724 dest
->type
= partition
->type
;
726 bitmap_ior_into (dest
->stmts
, partition
->stmts
);
727 if (partition_reduction_p (partition
))
728 dest
->reduction_p
= true;
730 /* Further check if any data dependence prevents us from executing the
731 new partition parallelly. */
732 if (dest
->type
== PTYPE_PARALLEL
&& rdg
!= NULL
)
733 update_type_for_merge (rdg
, dest
, partition
);
735 bitmap_ior_into (dest
->datarefs
, partition
->datarefs
);
739 /* Returns true when DEF is an SSA_NAME defined in LOOP and used after
743 ssa_name_has_uses_outside_loop_p (tree def
, loop_p loop
)
745 imm_use_iterator imm_iter
;
748 FOR_EACH_IMM_USE_FAST (use_p
, imm_iter
, def
)
750 if (is_gimple_debug (USE_STMT (use_p
)))
753 basic_block use_bb
= gimple_bb (USE_STMT (use_p
));
754 if (!flow_bb_inside_loop_p (loop
, use_bb
))
761 /* Returns true when STMT defines a scalar variable used after the
765 stmt_has_scalar_dependences_outside_loop (loop_p loop
, gimple
*stmt
)
770 if (gimple_code (stmt
) == GIMPLE_PHI
)
771 return ssa_name_has_uses_outside_loop_p (gimple_phi_result (stmt
), loop
);
773 FOR_EACH_SSA_DEF_OPERAND (def_p
, stmt
, op_iter
, SSA_OP_DEF
)
774 if (ssa_name_has_uses_outside_loop_p (DEF_FROM_PTR (def_p
), loop
))
780 /* Return a copy of LOOP placed before LOOP. */
783 copy_loop_before (struct loop
*loop
)
786 edge preheader
= loop_preheader_edge (loop
);
788 initialize_original_copy_tables ();
789 res
= slpeel_tree_duplicate_loop_to_edge_cfg (loop
, NULL
, preheader
);
790 gcc_assert (res
!= NULL
);
791 free_original_copy_tables ();
792 delete_update_ssa ();
797 /* Creates an empty basic block after LOOP. */
800 create_bb_after_loop (struct loop
*loop
)
802 edge exit
= single_exit (loop
);
810 /* Generate code for PARTITION from the code in LOOP. The loop is
811 copied when COPY_P is true. All the statements not flagged in the
812 PARTITION bitmap are removed from the loop or from its copy. The
813 statements are indexed in sequence inside a basic block, and the
814 basic blocks of a loop are taken in dom order. */
817 generate_loops_for_partition (struct loop
*loop
, partition
*partition
,
825 int orig_loop_num
= loop
->orig_loop_num
;
826 loop
= copy_loop_before (loop
);
827 gcc_assert (loop
!= NULL
);
828 loop
->orig_loop_num
= orig_loop_num
;
829 create_preheader (loop
, CP_SIMPLE_PREHEADERS
);
830 create_bb_after_loop (loop
);
834 /* Origin number is set to the new versioned loop's num. */
835 gcc_assert (loop
->orig_loop_num
!= loop
->num
);
838 /* Remove stmts not in the PARTITION bitmap. */
839 bbs
= get_loop_body_in_dom_order (loop
);
841 if (MAY_HAVE_DEBUG_BIND_STMTS
)
842 for (i
= 0; i
< loop
->num_nodes
; i
++)
844 basic_block bb
= bbs
[i
];
846 for (gphi_iterator bsi
= gsi_start_phis (bb
); !gsi_end_p (bsi
);
849 gphi
*phi
= bsi
.phi ();
850 if (!virtual_operand_p (gimple_phi_result (phi
))
851 && !bitmap_bit_p (partition
->stmts
, gimple_uid (phi
)))
852 reset_debug_uses (phi
);
855 for (gimple_stmt_iterator bsi
= gsi_start_bb (bb
); !gsi_end_p (bsi
); gsi_next (&bsi
))
857 gimple
*stmt
= gsi_stmt (bsi
);
858 if (gimple_code (stmt
) != GIMPLE_LABEL
859 && !is_gimple_debug (stmt
)
860 && !bitmap_bit_p (partition
->stmts
, gimple_uid (stmt
)))
861 reset_debug_uses (stmt
);
865 for (i
= 0; i
< loop
->num_nodes
; i
++)
867 basic_block bb
= bbs
[i
];
868 edge inner_exit
= NULL
;
870 if (loop
!= bb
->loop_father
)
871 inner_exit
= single_exit (bb
->loop_father
);
873 for (gphi_iterator bsi
= gsi_start_phis (bb
); !gsi_end_p (bsi
);)
875 gphi
*phi
= bsi
.phi ();
876 if (!virtual_operand_p (gimple_phi_result (phi
))
877 && !bitmap_bit_p (partition
->stmts
, gimple_uid (phi
)))
878 remove_phi_node (&bsi
, true);
883 for (gimple_stmt_iterator bsi
= gsi_start_bb (bb
); !gsi_end_p (bsi
);)
885 gimple
*stmt
= gsi_stmt (bsi
);
886 if (gimple_code (stmt
) != GIMPLE_LABEL
887 && !is_gimple_debug (stmt
)
888 && !bitmap_bit_p (partition
->stmts
, gimple_uid (stmt
)))
890 /* In distribution of loop nest, if bb is inner loop's exit_bb,
891 we choose its exit edge/path in order to avoid generating
892 infinite loop. For all other cases, we choose an arbitrary
893 path through the empty CFG part that this unnecessary
894 control stmt controls. */
895 if (gcond
*cond_stmt
= dyn_cast
<gcond
*> (stmt
))
897 if (inner_exit
&& inner_exit
->flags
& EDGE_TRUE_VALUE
)
898 gimple_cond_make_true (cond_stmt
);
900 gimple_cond_make_false (cond_stmt
);
903 else if (gimple_code (stmt
) == GIMPLE_SWITCH
)
905 gswitch
*switch_stmt
= as_a
<gswitch
*> (stmt
);
906 gimple_switch_set_index
907 (switch_stmt
, CASE_LOW (gimple_switch_label (switch_stmt
, 1)));
912 unlink_stmt_vdef (stmt
);
913 gsi_remove (&bsi
, true);
925 /* If VAL memory representation contains the same value in all bytes,
926 return that value, otherwise return -1.
927 E.g. for 0x24242424 return 0x24, for IEEE double
928 747708026454360457216.0 return 0x44, etc. */
931 const_with_all_bytes_same (tree val
)
933 unsigned char buf
[64];
936 if (integer_zerop (val
)
937 || (TREE_CODE (val
) == CONSTRUCTOR
938 && !TREE_CLOBBER_P (val
)
939 && CONSTRUCTOR_NELTS (val
) == 0))
942 if (real_zerop (val
))
944 /* Only return 0 for +0.0, not for -0.0, which doesn't have
945 an all bytes same memory representation. Don't transform
946 -0.0 stores into +0.0 even for !HONOR_SIGNED_ZEROS. */
947 switch (TREE_CODE (val
))
950 if (!real_isneg (TREE_REAL_CST_PTR (val
)))
954 if (!const_with_all_bytes_same (TREE_REALPART (val
))
955 && !const_with_all_bytes_same (TREE_IMAGPART (val
)))
960 unsigned int count
= vector_cst_encoded_nelts (val
);
962 for (j
= 0; j
< count
; ++j
)
963 if (const_with_all_bytes_same (VECTOR_CST_ENCODED_ELT (val
, j
)))
974 if (CHAR_BIT
!= 8 || BITS_PER_UNIT
!= 8)
977 len
= native_encode_expr (val
, buf
, sizeof (buf
));
980 for (i
= 1; i
< len
; i
++)
981 if (buf
[i
] != buf
[0])
986 /* Generate a call to memset for PARTITION in LOOP. */
989 generate_memset_builtin (struct loop
*loop
, partition
*partition
)
991 gimple_stmt_iterator gsi
;
992 tree mem
, fn
, nb_bytes
;
994 struct builtin_info
*builtin
= partition
->builtin
;
997 /* The new statements will be placed before LOOP. */
998 gsi
= gsi_last_bb (loop_preheader_edge (loop
)->src
);
1000 nb_bytes
= rewrite_to_non_trapping_overflow (builtin
->size
);
1001 nb_bytes
= force_gimple_operand_gsi (&gsi
, nb_bytes
, true, NULL_TREE
,
1002 false, GSI_CONTINUE_LINKING
);
1003 mem
= builtin
->dst_base
;
1004 mem
= force_gimple_operand_gsi (&gsi
, mem
, true, NULL_TREE
,
1005 false, GSI_CONTINUE_LINKING
);
1007 /* This exactly matches the pattern recognition in classify_partition. */
1008 val
= gimple_assign_rhs1 (DR_STMT (builtin
->dst_dr
));
1009 /* Handle constants like 0x15151515 and similarly
1010 floating point constants etc. where all bytes are the same. */
1011 int bytev
= const_with_all_bytes_same (val
);
1013 val
= build_int_cst (integer_type_node
, bytev
);
1014 else if (TREE_CODE (val
) == INTEGER_CST
)
1015 val
= fold_convert (integer_type_node
, val
);
1016 else if (!useless_type_conversion_p (integer_type_node
, TREE_TYPE (val
)))
1018 tree tem
= make_ssa_name (integer_type_node
);
1019 gimple
*cstmt
= gimple_build_assign (tem
, NOP_EXPR
, val
);
1020 gsi_insert_after (&gsi
, cstmt
, GSI_CONTINUE_LINKING
);
1024 fn
= build_fold_addr_expr (builtin_decl_implicit (BUILT_IN_MEMSET
));
1025 fn_call
= gimple_build_call (fn
, 3, mem
, val
, nb_bytes
);
1026 gsi_insert_after (&gsi
, fn_call
, GSI_CONTINUE_LINKING
);
1028 if (dump_file
&& (dump_flags
& TDF_DETAILS
))
1030 fprintf (dump_file
, "generated memset");
1032 fprintf (dump_file
, " zero\n");
1034 fprintf (dump_file
, "\n");
1038 /* Generate a call to memcpy for PARTITION in LOOP. */
1041 generate_memcpy_builtin (struct loop
*loop
, partition
*partition
)
1043 gimple_stmt_iterator gsi
;
1045 tree dest
, src
, fn
, nb_bytes
;
1046 enum built_in_function kind
;
1047 struct builtin_info
*builtin
= partition
->builtin
;
1049 /* The new statements will be placed before LOOP. */
1050 gsi
= gsi_last_bb (loop_preheader_edge (loop
)->src
);
1052 nb_bytes
= rewrite_to_non_trapping_overflow (builtin
->size
);
1053 nb_bytes
= force_gimple_operand_gsi (&gsi
, nb_bytes
, true, NULL_TREE
,
1054 false, GSI_CONTINUE_LINKING
);
1055 dest
= builtin
->dst_base
;
1056 src
= builtin
->src_base
;
1057 if (partition
->kind
== PKIND_MEMCPY
1058 || ! ptr_derefs_may_alias_p (dest
, src
))
1059 kind
= BUILT_IN_MEMCPY
;
1061 kind
= BUILT_IN_MEMMOVE
;
1063 dest
= force_gimple_operand_gsi (&gsi
, dest
, true, NULL_TREE
,
1064 false, GSI_CONTINUE_LINKING
);
1065 src
= force_gimple_operand_gsi (&gsi
, src
, true, NULL_TREE
,
1066 false, GSI_CONTINUE_LINKING
);
1067 fn
= build_fold_addr_expr (builtin_decl_implicit (kind
));
1068 fn_call
= gimple_build_call (fn
, 3, dest
, src
, nb_bytes
);
1069 gsi_insert_after (&gsi
, fn_call
, GSI_CONTINUE_LINKING
);
1071 if (dump_file
&& (dump_flags
& TDF_DETAILS
))
1073 if (kind
== BUILT_IN_MEMCPY
)
1074 fprintf (dump_file
, "generated memcpy\n");
1076 fprintf (dump_file
, "generated memmove\n");
1080 /* Remove and destroy the loop LOOP. */
1083 destroy_loop (struct loop
*loop
)
1085 unsigned nbbs
= loop
->num_nodes
;
1086 edge exit
= single_exit (loop
);
1087 basic_block src
= loop_preheader_edge (loop
)->src
, dest
= exit
->dest
;
1091 bbs
= get_loop_body_in_dom_order (loop
);
1093 redirect_edge_pred (exit
, src
);
1094 exit
->flags
&= ~(EDGE_TRUE_VALUE
|EDGE_FALSE_VALUE
);
1095 exit
->flags
|= EDGE_FALLTHRU
;
1096 cancel_loop_tree (loop
);
1097 rescan_loop_exit (exit
, false, true);
1102 /* We have made sure to not leave any dangling uses of SSA
1103 names defined in the loop. With the exception of virtuals.
1104 Make sure we replace all uses of virtual defs that will remain
1105 outside of the loop with the bare symbol as delete_basic_block
1106 will release them. */
1108 for (gphi_iterator gsi
= gsi_start_phis (bbs
[i
]); !gsi_end_p (gsi
);
1111 gphi
*phi
= gsi
.phi ();
1112 if (virtual_operand_p (gimple_phi_result (phi
)))
1113 mark_virtual_phi_result_for_renaming (phi
);
1115 for (gimple_stmt_iterator gsi
= gsi_start_bb (bbs
[i
]); !gsi_end_p (gsi
);
1118 gimple
*stmt
= gsi_stmt (gsi
);
1119 tree vdef
= gimple_vdef (stmt
);
1120 if (vdef
&& TREE_CODE (vdef
) == SSA_NAME
)
1121 mark_virtual_operand_for_renaming (vdef
);
1123 delete_basic_block (bbs
[i
]);
1129 set_immediate_dominator (CDI_DOMINATORS
, dest
,
1130 recompute_dominator (CDI_DOMINATORS
, dest
));
1133 /* Generates code for PARTITION. Return whether LOOP needs to be destroyed. */
1136 generate_code_for_partition (struct loop
*loop
,
1137 partition
*partition
, bool copy_p
)
1139 switch (partition
->kind
)
1142 case PKIND_PARTIAL_MEMSET
:
1143 /* Reductions all have to be in the last partition. */
1144 gcc_assert (!partition_reduction_p (partition
)
1146 generate_loops_for_partition (loop
, partition
, copy_p
);
1150 generate_memset_builtin (loop
, partition
);
1155 generate_memcpy_builtin (loop
, partition
);
1162 /* Common tail for partitions we turn into a call. If this was the last
1163 partition for which we generate code, we have to destroy the loop. */
1169 /* Return data dependence relation for data references A and B. The two
1170 data references must be in lexicographic order wrto reduced dependence
1171 graph RDG. We firstly try to find ddr from global ddr hash table. If
1172 it doesn't exist, compute the ddr and cache it. */
1174 static data_dependence_relation
*
1175 get_data_dependence (struct graph
*rdg
, data_reference_p a
, data_reference_p b
)
1177 struct data_dependence_relation ent
, **slot
;
1178 struct data_dependence_relation
*ddr
;
1180 gcc_assert (DR_IS_WRITE (a
) || DR_IS_WRITE (b
));
1181 gcc_assert (rdg_vertex_for_stmt (rdg
, DR_STMT (a
))
1182 <= rdg_vertex_for_stmt (rdg
, DR_STMT (b
)));
1185 slot
= ddrs_table
->find_slot (&ent
, INSERT
);
1188 ddr
= initialize_data_dependence_relation (a
, b
, loop_nest
);
1189 compute_affine_dependence (ddr
, loop_nest
[0]);
1196 /* In reduced dependence graph RDG for loop distribution, return true if
1197 dependence between references DR1 and DR2 leads to a dependence cycle
1198 and such dependence cycle can't be resolved by runtime alias check. */
1201 data_dep_in_cycle_p (struct graph
*rdg
,
1202 data_reference_p dr1
, data_reference_p dr2
)
1204 struct data_dependence_relation
*ddr
;
1206 /* Re-shuffle data-refs to be in topological order. */
1207 if (rdg_vertex_for_stmt (rdg
, DR_STMT (dr1
))
1208 > rdg_vertex_for_stmt (rdg
, DR_STMT (dr2
)))
1209 std::swap (dr1
, dr2
);
1211 ddr
= get_data_dependence (rdg
, dr1
, dr2
);
1213 /* In case of no data dependence. */
1214 if (DDR_ARE_DEPENDENT (ddr
) == chrec_known
)
1216 /* For unknown data dependence or known data dependence which can't be
1217 expressed in classic distance vector, we check if it can be resolved
1218 by runtime alias check. If yes, we still consider data dependence
1219 as won't introduce data dependence cycle. */
1220 else if (DDR_ARE_DEPENDENT (ddr
) == chrec_dont_know
1221 || DDR_NUM_DIST_VECTS (ddr
) == 0)
1222 return !runtime_alias_check_p (ddr
, NULL
, true);
1223 else if (DDR_NUM_DIST_VECTS (ddr
) > 1)
1225 else if (DDR_REVERSED_P (ddr
)
1226 || lambda_vector_zerop (DDR_DIST_VECT (ddr
, 0), 1))
1232 /* Given reduced dependence graph RDG, PARTITION1 and PARTITION2, update
1233 PARTITION1's type after merging PARTITION2 into PARTITION1. */
1236 update_type_for_merge (struct graph
*rdg
,
1237 partition
*partition1
, partition
*partition2
)
1240 bitmap_iterator bi
, bj
;
1241 data_reference_p dr1
, dr2
;
1243 EXECUTE_IF_SET_IN_BITMAP (partition1
->datarefs
, 0, i
, bi
)
1245 unsigned start
= (partition1
== partition2
) ? i
+ 1 : 0;
1247 dr1
= datarefs_vec
[i
];
1248 EXECUTE_IF_SET_IN_BITMAP (partition2
->datarefs
, start
, j
, bj
)
1250 dr2
= datarefs_vec
[j
];
1251 if (DR_IS_READ (dr1
) && DR_IS_READ (dr2
))
1254 /* Partition can only be executed sequentially if there is any
1255 data dependence cycle. */
1256 if (data_dep_in_cycle_p (rdg
, dr1
, dr2
))
1258 partition1
->type
= PTYPE_SEQUENTIAL
;
1265 /* Returns a partition with all the statements needed for computing
1266 the vertex V of the RDG, also including the loop exit conditions. */
1269 build_rdg_partition_for_vertex (struct graph
*rdg
, int v
)
1271 partition
*partition
= partition_alloc ();
1272 auto_vec
<int, 3> nodes
;
1275 data_reference_p dr
;
1277 graphds_dfs (rdg
, &v
, 1, &nodes
, false, NULL
);
1279 FOR_EACH_VEC_ELT (nodes
, i
, x
)
1281 bitmap_set_bit (partition
->stmts
, x
);
1283 for (j
= 0; RDG_DATAREFS (rdg
, x
).iterate (j
, &dr
); ++j
)
1285 unsigned idx
= (unsigned) DR_INDEX (dr
);
1286 gcc_assert (idx
< datarefs_vec
.length ());
1288 /* Partition can only be executed sequentially if there is any
1289 unknown data reference. */
1290 if (!DR_BASE_ADDRESS (dr
) || !DR_OFFSET (dr
)
1291 || !DR_INIT (dr
) || !DR_STEP (dr
))
1292 partition
->type
= PTYPE_SEQUENTIAL
;
1294 bitmap_set_bit (partition
->datarefs
, idx
);
1298 if (partition
->type
== PTYPE_SEQUENTIAL
)
1301 /* Further check if any data dependence prevents us from executing the
1302 partition parallelly. */
1303 update_type_for_merge (rdg
, partition
, partition
);
1308 /* Given PARTITION of LOOP and RDG, record single load/store data references
1309 for builtin partition in SRC_DR/DST_DR, return false if there is no such
1313 find_single_drs (struct loop
*loop
, struct graph
*rdg
, partition
*partition
,
1314 data_reference_p
*dst_dr
, data_reference_p
*src_dr
)
1317 data_reference_p single_ld
= NULL
, single_st
= NULL
;
1320 EXECUTE_IF_SET_IN_BITMAP (partition
->stmts
, 0, i
, bi
)
1322 gimple
*stmt
= RDG_STMT (rdg
, i
);
1323 data_reference_p dr
;
1325 if (gimple_code (stmt
) == GIMPLE_PHI
)
1328 /* Any scalar stmts are ok. */
1329 if (!gimple_vuse (stmt
))
1332 /* Otherwise just regular loads/stores. */
1333 if (!gimple_assign_single_p (stmt
))
1336 /* But exactly one store and/or load. */
1337 for (unsigned j
= 0; RDG_DATAREFS (rdg
, i
).iterate (j
, &dr
); ++j
)
1339 tree type
= TREE_TYPE (DR_REF (dr
));
1341 /* The memset, memcpy and memmove library calls are only
1342 able to deal with generic address space. */
1343 if (!ADDR_SPACE_GENERIC_P (TYPE_ADDR_SPACE (type
)))
1346 if (DR_IS_READ (dr
))
1348 if (single_ld
!= NULL
)
1354 if (single_st
!= NULL
)
1364 /* Bail out if this is a bitfield memory reference. */
1365 if (TREE_CODE (DR_REF (single_st
)) == COMPONENT_REF
1366 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (single_st
), 1)))
1369 /* Data reference must be executed exactly once per iteration of each
1370 loop in the loop nest. We only need to check dominance information
1371 against the outermost one in a perfect loop nest because a bb can't
1372 dominate outermost loop's latch without dominating inner loop's. */
1373 basic_block bb_st
= gimple_bb (DR_STMT (single_st
));
1374 if (!dominated_by_p (CDI_DOMINATORS
, loop
->latch
, bb_st
))
1379 gimple
*store
= DR_STMT (single_st
), *load
= DR_STMT (single_ld
);
1380 /* Direct aggregate copy or via an SSA name temporary. */
1382 && gimple_assign_lhs (load
) != gimple_assign_rhs1 (store
))
1385 /* Bail out if this is a bitfield memory reference. */
1386 if (TREE_CODE (DR_REF (single_ld
)) == COMPONENT_REF
1387 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (single_ld
), 1)))
1390 /* Load and store must be in the same loop nest. */
1391 basic_block bb_ld
= gimple_bb (DR_STMT (single_ld
));
1392 if (bb_st
->loop_father
!= bb_ld
->loop_father
)
1395 /* Data reference must be executed exactly once per iteration.
1396 Same as single_st, we only need to check against the outermost
1398 if (!dominated_by_p (CDI_DOMINATORS
, loop
->latch
, bb_ld
))
1401 edge e
= single_exit (bb_st
->loop_father
);
1402 bool dom_ld
= dominated_by_p (CDI_DOMINATORS
, e
->src
, bb_ld
);
1403 bool dom_st
= dominated_by_p (CDI_DOMINATORS
, e
->src
, bb_st
);
1404 if (dom_ld
!= dom_st
)
1408 *src_dr
= single_ld
;
1409 *dst_dr
= single_st
;
1413 /* Given data reference DR in LOOP_NEST, this function checks the enclosing
1414 loops from inner to outer to see if loop's step equals to access size at
1415 each level of loop. Return 2 if we can prove this at all level loops;
1416 record access base and size in BASE and SIZE; save loop's step at each
1417 level of loop in STEPS if it is not null. For example:
1419 int arr[100][100][100];
1420 for (i = 0; i < 100; i++) ;steps[2] = 40000
1421 for (j = 100; j > 0; j--) ;steps[1] = -400
1422 for (k = 0; k < 100; k++) ;steps[0] = 4
1423 arr[i][j - 1][k] = 0; ;base = &arr, size = 4000000
1425 Return 1 if we can prove the equality at the innermost loop, but not all
1426 level loops. In this case, no information is recorded.
1428 Return 0 if no equality can be proven at any level loops. */
1431 compute_access_range (loop_p loop_nest
, data_reference_p dr
, tree
*base
,
1432 tree
*size
, vec
<tree
> *steps
= NULL
)
1434 location_t loc
= gimple_location (DR_STMT (dr
));
1435 basic_block bb
= gimple_bb (DR_STMT (dr
));
1436 struct loop
*loop
= bb
->loop_father
;
1437 tree ref
= DR_REF (dr
);
1438 tree access_base
= build_fold_addr_expr (ref
);
1439 tree access_size
= TYPE_SIZE_UNIT (TREE_TYPE (ref
));
1443 tree scev_fn
= analyze_scalar_evolution (loop
, access_base
);
1444 if (TREE_CODE (scev_fn
) != POLYNOMIAL_CHREC
)
1447 access_base
= CHREC_LEFT (scev_fn
);
1448 if (tree_contains_chrecs (access_base
, NULL
))
1451 tree scev_step
= CHREC_RIGHT (scev_fn
);
1452 /* Only support constant steps. */
1453 if (TREE_CODE (scev_step
) != INTEGER_CST
)
1456 enum ev_direction access_dir
= scev_direction (scev_fn
);
1457 if (access_dir
== EV_DIR_UNKNOWN
)
1461 steps
->safe_push (scev_step
);
1463 scev_step
= fold_convert_loc (loc
, sizetype
, scev_step
);
1464 /* Compute absolute value of scev step. */
1465 if (access_dir
== EV_DIR_DECREASES
)
1466 scev_step
= fold_build1_loc (loc
, NEGATE_EXPR
, sizetype
, scev_step
);
1468 /* At each level of loop, scev step must equal to access size. In other
1469 words, DR must access consecutive memory between loop iterations. */
1470 if (!operand_equal_p (scev_step
, access_size
, 0))
1473 /* Access stride can be computed for data reference at least for the
1477 /* Compute DR's execution times in loop. */
1478 tree niters
= number_of_latch_executions (loop
);
1479 niters
= fold_convert_loc (loc
, sizetype
, niters
);
1480 if (dominated_by_p (CDI_DOMINATORS
, single_exit (loop
)->src
, bb
))
1481 niters
= size_binop_loc (loc
, PLUS_EXPR
, niters
, size_one_node
);
1483 /* Compute DR's overall access size in loop. */
1484 access_size
= fold_build2_loc (loc
, MULT_EXPR
, sizetype
,
1486 /* Adjust base address in case of negative step. */
1487 if (access_dir
== EV_DIR_DECREASES
)
1489 tree adj
= fold_build2_loc (loc
, MINUS_EXPR
, sizetype
,
1490 scev_step
, access_size
);
1491 access_base
= fold_build_pointer_plus_loc (loc
, access_base
, adj
);
1493 } while (loop
!= loop_nest
&& (loop
= loop_outer (loop
)) != NULL
);
1495 *base
= access_base
;
1496 *size
= access_size
;
1497 /* Access stride can be computed for data reference at each level loop. */
1501 /* Allocate and return builtin struct. Record information like DST_DR,
1502 SRC_DR, DST_BASE, SRC_BASE and SIZE in the allocated struct. */
1504 static struct builtin_info
*
1505 alloc_builtin (data_reference_p dst_dr
, data_reference_p src_dr
,
1506 tree dst_base
, tree src_base
, tree size
)
1508 struct builtin_info
*builtin
= XNEW (struct builtin_info
);
1509 builtin
->dst_dr
= dst_dr
;
1510 builtin
->src_dr
= src_dr
;
1511 builtin
->dst_base
= dst_base
;
1512 builtin
->src_base
= src_base
;
1513 builtin
->size
= size
;
1517 /* Given data reference DR in loop nest LOOP, classify if it forms builtin
1521 classify_builtin_st (loop_p loop
, partition
*partition
, data_reference_p dr
)
1523 gimple
*stmt
= DR_STMT (dr
);
1524 tree base
, size
, rhs
= gimple_assign_rhs1 (stmt
);
1526 if (const_with_all_bytes_same (rhs
) == -1
1527 && (!INTEGRAL_TYPE_P (TREE_TYPE (rhs
))
1528 || (TYPE_MODE (TREE_TYPE (rhs
))
1529 != TYPE_MODE (unsigned_char_type_node
))))
1532 if (TREE_CODE (rhs
) == SSA_NAME
1533 && !SSA_NAME_IS_DEFAULT_DEF (rhs
)
1534 && flow_bb_inside_loop_p (loop
, gimple_bb (SSA_NAME_DEF_STMT (rhs
))))
1537 int res
= compute_access_range (loop
, dr
, &base
, &size
);
1542 partition
->kind
= PKIND_PARTIAL_MEMSET
;
1546 poly_uint64 base_offset
;
1547 unsigned HOST_WIDE_INT const_base_offset
;
1548 tree base_base
= strip_offset (base
, &base_offset
);
1549 if (!base_offset
.is_constant (&const_base_offset
))
1552 struct builtin_info
*builtin
;
1553 builtin
= alloc_builtin (dr
, NULL
, base
, NULL_TREE
, size
);
1554 builtin
->dst_base_base
= base_base
;
1555 builtin
->dst_base_offset
= const_base_offset
;
1556 partition
->builtin
= builtin
;
1557 partition
->kind
= PKIND_MEMSET
;
1560 /* Given data references DST_DR and SRC_DR in loop nest LOOP and RDG, classify
1561 if it forms builtin memcpy or memmove call. */
1564 classify_builtin_ldst (loop_p loop
, struct graph
*rdg
, partition
*partition
,
1565 data_reference_p dst_dr
, data_reference_p src_dr
)
1567 tree base
, size
, src_base
, src_size
;
1568 auto_vec
<tree
> dst_steps
, src_steps
;
1570 /* Compute access range of both load and store. */
1571 int res
= compute_access_range (loop
, dst_dr
, &base
, &size
, &dst_steps
);
1574 res
= compute_access_range (loop
, src_dr
, &src_base
, &src_size
, &src_steps
);
1578 /* They much have the same access size. */
1579 if (!operand_equal_p (size
, src_size
, 0))
1582 /* Load and store in loop nest must access memory in the same way, i.e,
1583 their must have the same steps in each loop of the nest. */
1584 if (dst_steps
.length () != src_steps
.length ())
1586 for (unsigned i
= 0; i
< dst_steps
.length (); ++i
)
1587 if (!operand_equal_p (dst_steps
[i
], src_steps
[i
], 0))
1590 /* Now check that if there is a dependence. */
1591 ddr_p ddr
= get_data_dependence (rdg
, src_dr
, dst_dr
);
1593 /* Classify as memcpy if no dependence between load and store. */
1594 if (DDR_ARE_DEPENDENT (ddr
) == chrec_known
)
1596 partition
->builtin
= alloc_builtin (dst_dr
, src_dr
, base
, src_base
, size
);
1597 partition
->kind
= PKIND_MEMCPY
;
1601 /* Can't do memmove in case of unknown dependence or dependence without
1602 classical distance vector. */
1603 if (DDR_ARE_DEPENDENT (ddr
) == chrec_dont_know
1604 || DDR_NUM_DIST_VECTS (ddr
) == 0)
1608 lambda_vector dist_v
;
1609 int num_lev
= (DDR_LOOP_NEST (ddr
)).length ();
1610 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr
), i
, dist_v
)
1612 unsigned dep_lev
= dependence_level (dist_v
, num_lev
);
1613 /* Can't do memmove if load depends on store. */
1614 if (dep_lev
> 0 && dist_v
[dep_lev
- 1] > 0 && !DDR_REVERSED_P (ddr
))
1618 partition
->builtin
= alloc_builtin (dst_dr
, src_dr
, base
, src_base
, size
);
1619 partition
->kind
= PKIND_MEMMOVE
;
1623 /* Classifies the builtin kind we can generate for PARTITION of RDG and LOOP.
1624 For the moment we detect memset, memcpy and memmove patterns. Bitmap
1625 STMT_IN_ALL_PARTITIONS contains statements belonging to all partitions. */
1628 classify_partition (loop_p loop
, struct graph
*rdg
, partition
*partition
,
1629 bitmap stmt_in_all_partitions
)
1633 data_reference_p single_ld
= NULL
, single_st
= NULL
;
1634 bool volatiles_p
= false, has_reduction
= false;
1636 EXECUTE_IF_SET_IN_BITMAP (partition
->stmts
, 0, i
, bi
)
1638 gimple
*stmt
= RDG_STMT (rdg
, i
);
1640 if (gimple_has_volatile_ops (stmt
))
1643 /* If the stmt is not included by all partitions and there is uses
1644 outside of the loop, then mark the partition as reduction. */
1645 if (stmt_has_scalar_dependences_outside_loop (loop
, stmt
))
1647 /* Due to limitation in the transform phase we have to fuse all
1648 reduction partitions. As a result, this could cancel valid
1649 loop distribution especially for loop that induction variable
1650 is used outside of loop. To workaround this issue, we skip
1651 marking partition as reudction if the reduction stmt belongs
1652 to all partitions. In such case, reduction will be computed
1653 correctly no matter how partitions are fused/distributed. */
1654 if (!bitmap_bit_p (stmt_in_all_partitions
, i
))
1656 partition
->reduction_p
= true;
1659 has_reduction
= true;
1663 /* Perform general partition disqualification for builtins. */
1665 /* Simple workaround to prevent classifying the partition as builtin
1666 if it contains any use outside of loop. */
1668 || !flag_tree_loop_distribute_patterns
)
1671 /* Find single load/store data references for builtin partition. */
1672 if (!find_single_drs (loop
, rdg
, partition
, &single_st
, &single_ld
))
1675 /* Classify the builtin kind. */
1676 if (single_ld
== NULL
)
1677 classify_builtin_st (loop
, partition
, single_st
);
1679 classify_builtin_ldst (loop
, rdg
, partition
, single_st
, single_ld
);
1682 /* Returns true when PARTITION1 and PARTITION2 access the same memory
1686 share_memory_accesses (struct graph
*rdg
,
1687 partition
*partition1
, partition
*partition2
)
1690 bitmap_iterator bi
, bj
;
1691 data_reference_p dr1
, dr2
;
1693 /* First check whether in the intersection of the two partitions are
1694 any loads or stores. Common loads are the situation that happens
1696 EXECUTE_IF_AND_IN_BITMAP (partition1
->stmts
, partition2
->stmts
, 0, i
, bi
)
1697 if (RDG_MEM_WRITE_STMT (rdg
, i
)
1698 || RDG_MEM_READS_STMT (rdg
, i
))
1701 /* Then check whether the two partitions access the same memory object. */
1702 EXECUTE_IF_SET_IN_BITMAP (partition1
->datarefs
, 0, i
, bi
)
1704 dr1
= datarefs_vec
[i
];
1706 if (!DR_BASE_ADDRESS (dr1
)
1707 || !DR_OFFSET (dr1
) || !DR_INIT (dr1
) || !DR_STEP (dr1
))
1710 EXECUTE_IF_SET_IN_BITMAP (partition2
->datarefs
, 0, j
, bj
)
1712 dr2
= datarefs_vec
[j
];
1714 if (!DR_BASE_ADDRESS (dr2
)
1715 || !DR_OFFSET (dr2
) || !DR_INIT (dr2
) || !DR_STEP (dr2
))
1718 if (operand_equal_p (DR_BASE_ADDRESS (dr1
), DR_BASE_ADDRESS (dr2
), 0)
1719 && operand_equal_p (DR_OFFSET (dr1
), DR_OFFSET (dr2
), 0)
1720 && operand_equal_p (DR_INIT (dr1
), DR_INIT (dr2
), 0)
1721 && operand_equal_p (DR_STEP (dr1
), DR_STEP (dr2
), 0))
1729 /* For each seed statement in STARTING_STMTS, this function builds
1730 partition for it by adding depended statements according to RDG.
1731 All partitions are recorded in PARTITIONS. */
1734 rdg_build_partitions (struct graph
*rdg
,
1735 vec
<gimple
*> starting_stmts
,
1736 vec
<partition
*> *partitions
)
1738 auto_bitmap processed
;
1742 FOR_EACH_VEC_ELT (starting_stmts
, i
, stmt
)
1744 int v
= rdg_vertex_for_stmt (rdg
, stmt
);
1746 if (dump_file
&& (dump_flags
& TDF_DETAILS
))
1748 "ldist asked to generate code for vertex %d\n", v
);
1750 /* If the vertex is already contained in another partition so
1751 is the partition rooted at it. */
1752 if (bitmap_bit_p (processed
, v
))
1755 partition
*partition
= build_rdg_partition_for_vertex (rdg
, v
);
1756 bitmap_ior_into (processed
, partition
->stmts
);
1758 if (dump_file
&& (dump_flags
& TDF_DETAILS
))
1760 fprintf (dump_file
, "ldist creates useful %s partition:\n",
1761 partition
->type
== PTYPE_PARALLEL
? "parallel" : "sequent");
1762 bitmap_print (dump_file
, partition
->stmts
, " ", "\n");
1765 partitions
->safe_push (partition
);
1768 /* All vertices should have been assigned to at least one partition now,
1769 other than vertices belonging to dead code. */
1772 /* Dump to FILE the PARTITIONS. */
1775 dump_rdg_partitions (FILE *file
, vec
<partition
*> partitions
)
1778 partition
*partition
;
1780 FOR_EACH_VEC_ELT (partitions
, i
, partition
)
1781 debug_bitmap_file (file
, partition
->stmts
);
1784 /* Debug PARTITIONS. */
1785 extern void debug_rdg_partitions (vec
<partition
*> );
1788 debug_rdg_partitions (vec
<partition
*> partitions
)
1790 dump_rdg_partitions (stderr
, partitions
);
1793 /* Returns the number of read and write operations in the RDG. */
1796 number_of_rw_in_rdg (struct graph
*rdg
)
1800 for (i
= 0; i
< rdg
->n_vertices
; i
++)
1802 if (RDG_MEM_WRITE_STMT (rdg
, i
))
1805 if (RDG_MEM_READS_STMT (rdg
, i
))
1812 /* Returns the number of read and write operations in a PARTITION of
1816 number_of_rw_in_partition (struct graph
*rdg
, partition
*partition
)
1822 EXECUTE_IF_SET_IN_BITMAP (partition
->stmts
, 0, i
, ii
)
1824 if (RDG_MEM_WRITE_STMT (rdg
, i
))
1827 if (RDG_MEM_READS_STMT (rdg
, i
))
1834 /* Returns true when one of the PARTITIONS contains all the read or
1835 write operations of RDG. */
1838 partition_contains_all_rw (struct graph
*rdg
,
1839 vec
<partition
*> partitions
)
1842 partition
*partition
;
1843 int nrw
= number_of_rw_in_rdg (rdg
);
1845 FOR_EACH_VEC_ELT (partitions
, i
, partition
)
1846 if (nrw
== number_of_rw_in_partition (rdg
, partition
))
1852 /* Compute partition dependence created by the data references in DRS1
1853 and DRS2, modify and return DIR according to that. IF ALIAS_DDR is
1854 not NULL, we record dependence introduced by possible alias between
1855 two data references in ALIAS_DDRS; otherwise, we simply ignore such
1856 dependence as if it doesn't exist at all. */
1859 pg_add_dependence_edges (struct graph
*rdg
, int dir
,
1860 bitmap drs1
, bitmap drs2
, vec
<ddr_p
> *alias_ddrs
)
1863 bitmap_iterator bi
, bj
;
1864 data_reference_p dr1
, dr2
, saved_dr1
;
1866 /* dependence direction - 0 is no dependence, -1 is back,
1867 1 is forth, 2 is both (we can stop then, merging will occur). */
1868 EXECUTE_IF_SET_IN_BITMAP (drs1
, 0, i
, bi
)
1870 dr1
= datarefs_vec
[i
];
1872 EXECUTE_IF_SET_IN_BITMAP (drs2
, 0, j
, bj
)
1874 int res
, this_dir
= 1;
1877 dr2
= datarefs_vec
[j
];
1879 /* Skip all <read, read> data dependence. */
1880 if (DR_IS_READ (dr1
) && DR_IS_READ (dr2
))
1884 /* Re-shuffle data-refs to be in topological order. */
1885 if (rdg_vertex_for_stmt (rdg
, DR_STMT (dr1
))
1886 > rdg_vertex_for_stmt (rdg
, DR_STMT (dr2
)))
1888 std::swap (dr1
, dr2
);
1889 this_dir
= -this_dir
;
1891 ddr
= get_data_dependence (rdg
, dr1
, dr2
);
1892 if (DDR_ARE_DEPENDENT (ddr
) == chrec_dont_know
)
1895 res
= data_ref_compare_tree (DR_BASE_ADDRESS (dr1
),
1896 DR_BASE_ADDRESS (dr2
));
1897 /* Be conservative. If data references are not well analyzed,
1898 or the two data references have the same base address and
1899 offset, add dependence and consider it alias to each other.
1900 In other words, the dependence cannot be resolved by
1901 runtime alias check. */
1902 if (!DR_BASE_ADDRESS (dr1
) || !DR_BASE_ADDRESS (dr2
)
1903 || !DR_OFFSET (dr1
) || !DR_OFFSET (dr2
)
1904 || !DR_INIT (dr1
) || !DR_INIT (dr2
)
1905 || !DR_STEP (dr1
) || !tree_fits_uhwi_p (DR_STEP (dr1
))
1906 || !DR_STEP (dr2
) || !tree_fits_uhwi_p (DR_STEP (dr2
))
1909 /* Data dependence could be resolved by runtime alias check,
1910 record it in ALIAS_DDRS. */
1911 else if (alias_ddrs
!= NULL
)
1912 alias_ddrs
->safe_push (ddr
);
1913 /* Or simply ignore it. */
1915 else if (DDR_ARE_DEPENDENT (ddr
) == NULL_TREE
)
1917 if (DDR_REVERSED_P (ddr
))
1918 this_dir
= -this_dir
;
1920 /* Known dependences can still be unordered througout the
1921 iteration space, see gcc.dg/tree-ssa/ldist-16.c. */
1922 if (DDR_NUM_DIST_VECTS (ddr
) != 1)
1924 /* If the overlap is exact preserve stmt order. */
1925 else if (lambda_vector_zerop (DDR_DIST_VECT (ddr
, 0),
1926 DDR_NB_LOOPS (ddr
)))
1928 /* Else as the distance vector is lexicographic positive swap
1929 the dependence direction. */
1931 this_dir
= -this_dir
;
1939 else if (this_dir
!= 0 && dir
!= this_dir
)
1941 /* Shuffle "back" dr1. */
1948 /* Compare postorder number of the partition graph vertices V1 and V2. */
1951 pgcmp (const void *v1_
, const void *v2_
)
1953 const vertex
*v1
= (const vertex
*)v1_
;
1954 const vertex
*v2
= (const vertex
*)v2_
;
1955 return v2
->post
- v1
->post
;
1958 /* Data attached to vertices of partition dependence graph. */
1961 /* ID of the corresponding partition. */
1963 /* The partition. */
1964 struct partition
*partition
;
1967 /* Data attached to edges of partition dependence graph. */
1970 /* If the dependence edge can be resolved by runtime alias check,
1971 this vector contains data dependence relations for runtime alias
1972 check. On the other hand, if the dependence edge is introduced
1973 because of compilation time known data dependence, this vector
1974 contains nothing. */
1975 vec
<ddr_p
> alias_ddrs
;
1978 /* Callback data for traversing edges in graph. */
1979 struct pg_edge_callback_data
1981 /* Bitmap contains strong connected components should be merged. */
1982 bitmap sccs_to_merge
;
1983 /* Array constains component information for all vertices. */
1984 int *vertices_component
;
1985 /* Vector to record all data dependence relations which are needed
1986 to break strong connected components by runtime alias checks. */
1987 vec
<ddr_p
> *alias_ddrs
;
1990 /* Initialize vertice's data for partition dependence graph PG with
1994 init_partition_graph_vertices (struct graph
*pg
,
1995 vec
<struct partition
*> *partitions
)
1998 partition
*partition
;
1999 struct pg_vdata
*data
;
2001 for (i
= 0; partitions
->iterate (i
, &partition
); ++i
)
2003 data
= new pg_vdata
;
2004 pg
->vertices
[i
].data
= data
;
2006 data
->partition
= partition
;
2010 /* Add edge <I, J> to partition dependence graph PG. Attach vector of data
2011 dependence relations to the EDGE if DDRS isn't NULL. */
2014 add_partition_graph_edge (struct graph
*pg
, int i
, int j
, vec
<ddr_p
> *ddrs
)
2016 struct graph_edge
*e
= add_edge (pg
, i
, j
);
2018 /* If the edge is attached with data dependence relations, it means this
2019 dependence edge can be resolved by runtime alias checks. */
2022 struct pg_edata
*data
= new pg_edata
;
2024 gcc_assert (ddrs
->length () > 0);
2026 data
->alias_ddrs
= vNULL
;
2027 data
->alias_ddrs
.safe_splice (*ddrs
);
2031 /* Callback function for graph travesal algorithm. It returns true
2032 if edge E should skipped when traversing the graph. */
2035 pg_skip_alias_edge (struct graph_edge
*e
)
2037 struct pg_edata
*data
= (struct pg_edata
*)e
->data
;
2038 return (data
!= NULL
&& data
->alias_ddrs
.length () > 0);
2041 /* Callback function freeing data attached to edge E of graph. */
2044 free_partition_graph_edata_cb (struct graph
*, struct graph_edge
*e
, void *)
2046 if (e
->data
!= NULL
)
2048 struct pg_edata
*data
= (struct pg_edata
*)e
->data
;
2049 data
->alias_ddrs
.release ();
2054 /* Free data attached to vertice of partition dependence graph PG. */
2057 free_partition_graph_vdata (struct graph
*pg
)
2060 struct pg_vdata
*data
;
2062 for (i
= 0; i
< pg
->n_vertices
; ++i
)
2064 data
= (struct pg_vdata
*)pg
->vertices
[i
].data
;
2069 /* Build and return partition dependence graph for PARTITIONS. RDG is
2070 reduced dependence graph for the loop to be distributed. If IGNORE_ALIAS_P
2071 is true, data dependence caused by possible alias between references
2072 is ignored, as if it doesn't exist at all; otherwise all depdendences
2075 static struct graph
*
2076 build_partition_graph (struct graph
*rdg
,
2077 vec
<struct partition
*> *partitions
,
2078 bool ignore_alias_p
)
2081 struct partition
*partition1
, *partition2
;
2082 graph
*pg
= new_graph (partitions
->length ());
2083 auto_vec
<ddr_p
> alias_ddrs
, *alias_ddrs_p
;
2085 alias_ddrs_p
= ignore_alias_p
? NULL
: &alias_ddrs
;
2087 init_partition_graph_vertices (pg
, partitions
);
2089 for (i
= 0; partitions
->iterate (i
, &partition1
); ++i
)
2091 for (j
= i
+ 1; partitions
->iterate (j
, &partition2
); ++j
)
2093 /* dependence direction - 0 is no dependence, -1 is back,
2094 1 is forth, 2 is both (we can stop then, merging will occur). */
2097 /* If the first partition has reduction, add back edge; if the
2098 second partition has reduction, add forth edge. This makes
2099 sure that reduction partition will be sorted as the last one. */
2100 if (partition_reduction_p (partition1
))
2102 else if (partition_reduction_p (partition2
))
2105 /* Cleanup the temporary vector. */
2106 alias_ddrs
.truncate (0);
2108 dir
= pg_add_dependence_edges (rdg
, dir
, partition1
->datarefs
,
2109 partition2
->datarefs
, alias_ddrs_p
);
2111 /* Add edge to partition graph if there exists dependence. There
2112 are two types of edges. One type edge is caused by compilation
2113 time known dependence, this type cannot be resolved by runtime
2114 alias check. The other type can be resolved by runtime alias
2116 if (dir
== 1 || dir
== 2
2117 || alias_ddrs
.length () > 0)
2119 /* Attach data dependence relations to edge that can be resolved
2120 by runtime alias check. */
2121 bool alias_edge_p
= (dir
!= 1 && dir
!= 2);
2122 add_partition_graph_edge (pg
, i
, j
,
2123 (alias_edge_p
) ? &alias_ddrs
: NULL
);
2125 if (dir
== -1 || dir
== 2
2126 || alias_ddrs
.length () > 0)
2128 /* Attach data dependence relations to edge that can be resolved
2129 by runtime alias check. */
2130 bool alias_edge_p
= (dir
!= -1 && dir
!= 2);
2131 add_partition_graph_edge (pg
, j
, i
,
2132 (alias_edge_p
) ? &alias_ddrs
: NULL
);
2139 /* Sort partitions in PG in descending post order and store them in
2143 sort_partitions_by_post_order (struct graph
*pg
,
2144 vec
<struct partition
*> *partitions
)
2147 struct pg_vdata
*data
;
2149 /* Now order the remaining nodes in descending postorder. */
2150 qsort (pg
->vertices
, pg
->n_vertices
, sizeof (vertex
), pgcmp
);
2151 partitions
->truncate (0);
2152 for (i
= 0; i
< pg
->n_vertices
; ++i
)
2154 data
= (struct pg_vdata
*)pg
->vertices
[i
].data
;
2155 if (data
->partition
)
2156 partitions
->safe_push (data
->partition
);
2160 /* Given reduced dependence graph RDG merge strong connected components
2161 of PARTITIONS. If IGNORE_ALIAS_P is true, data dependence caused by
2162 possible alias between references is ignored, as if it doesn't exist
2163 at all; otherwise all depdendences are considered. */
2166 merge_dep_scc_partitions (struct graph
*rdg
,
2167 vec
<struct partition
*> *partitions
,
2168 bool ignore_alias_p
)
2170 struct partition
*partition1
, *partition2
;
2171 struct pg_vdata
*data
;
2172 graph
*pg
= build_partition_graph (rdg
, partitions
, ignore_alias_p
);
2173 int i
, j
, num_sccs
= graphds_scc (pg
, NULL
);
2175 /* Strong connected compoenent means dependence cycle, we cannot distribute
2176 them. So fuse them together. */
2177 if ((unsigned) num_sccs
< partitions
->length ())
2179 for (i
= 0; i
< num_sccs
; ++i
)
2181 for (j
= 0; partitions
->iterate (j
, &partition1
); ++j
)
2182 if (pg
->vertices
[j
].component
== i
)
2184 for (j
= j
+ 1; partitions
->iterate (j
, &partition2
); ++j
)
2185 if (pg
->vertices
[j
].component
== i
)
2187 partition_merge_into (NULL
, partition1
,
2188 partition2
, FUSE_SAME_SCC
);
2189 partition1
->type
= PTYPE_SEQUENTIAL
;
2190 (*partitions
)[j
] = NULL
;
2191 partition_free (partition2
);
2192 data
= (struct pg_vdata
*)pg
->vertices
[j
].data
;
2193 data
->partition
= NULL
;
2198 sort_partitions_by_post_order (pg
, partitions
);
2199 gcc_assert (partitions
->length () == (unsigned)num_sccs
);
2200 free_partition_graph_vdata (pg
);
2204 /* Callback function for traversing edge E in graph G. DATA is private
2208 pg_collect_alias_ddrs (struct graph
*g
, struct graph_edge
*e
, void *data
)
2210 int i
, j
, component
;
2211 struct pg_edge_callback_data
*cbdata
;
2212 struct pg_edata
*edata
= (struct pg_edata
*) e
->data
;
2214 /* If the edge doesn't have attached data dependence, it represents
2215 compilation time known dependences. This type dependence cannot
2216 be resolved by runtime alias check. */
2217 if (edata
== NULL
|| edata
->alias_ddrs
.length () == 0)
2220 cbdata
= (struct pg_edge_callback_data
*) data
;
2223 component
= cbdata
->vertices_component
[i
];
2224 /* Vertices are topologically sorted according to compilation time
2225 known dependences, so we can break strong connected components
2226 by removing edges of the opposite direction, i.e, edges pointing
2227 from vertice with smaller post number to vertice with bigger post
2229 if (g
->vertices
[i
].post
< g
->vertices
[j
].post
2230 /* We only need to remove edges connecting vertices in the same
2231 strong connected component to break it. */
2232 && component
== cbdata
->vertices_component
[j
]
2233 /* Check if we want to break the strong connected component or not. */
2234 && !bitmap_bit_p (cbdata
->sccs_to_merge
, component
))
2235 cbdata
->alias_ddrs
->safe_splice (edata
->alias_ddrs
);
2238 /* This is the main function breaking strong conected components in
2239 PARTITIONS giving reduced depdendence graph RDG. Store data dependence
2240 relations for runtime alias check in ALIAS_DDRS. */
2243 break_alias_scc_partitions (struct graph
*rdg
,
2244 vec
<struct partition
*> *partitions
,
2245 vec
<ddr_p
> *alias_ddrs
)
2247 int i
, j
, k
, num_sccs
, num_sccs_no_alias
;
2248 /* Build partition dependence graph. */
2249 graph
*pg
= build_partition_graph (rdg
, partitions
, false);
2251 alias_ddrs
->truncate (0);
2252 /* Find strong connected components in the graph, with all dependence edges
2254 num_sccs
= graphds_scc (pg
, NULL
);
2255 /* All SCCs now can be broken by runtime alias checks because SCCs caused by
2256 compilation time known dependences are merged before this function. */
2257 if ((unsigned) num_sccs
< partitions
->length ())
2259 struct pg_edge_callback_data cbdata
;
2260 auto_bitmap sccs_to_merge
;
2261 auto_vec
<enum partition_type
> scc_types
;
2262 struct partition
*partition
, *first
;
2264 /* If all partitions in a SCC have the same type, we can simply merge the
2265 SCC. This loop finds out such SCCS and record them in bitmap. */
2266 bitmap_set_range (sccs_to_merge
, 0, (unsigned) num_sccs
);
2267 for (i
= 0; i
< num_sccs
; ++i
)
2269 for (j
= 0; partitions
->iterate (j
, &first
); ++j
)
2270 if (pg
->vertices
[j
].component
== i
)
2273 bool same_type
= true, all_builtins
= partition_builtin_p (first
);
2274 for (++j
; partitions
->iterate (j
, &partition
); ++j
)
2276 if (pg
->vertices
[j
].component
!= i
)
2279 if (first
->type
!= partition
->type
)
2284 all_builtins
&= partition_builtin_p (partition
);
2286 /* Merge SCC if all partitions in SCC have the same type, though the
2287 result partition is sequential, because vectorizer can do better
2288 runtime alias check. One expecption is all partitions in SCC are
2290 if (!same_type
|| all_builtins
)
2291 bitmap_clear_bit (sccs_to_merge
, i
);
2294 /* Initialize callback data for traversing. */
2295 cbdata
.sccs_to_merge
= sccs_to_merge
;
2296 cbdata
.alias_ddrs
= alias_ddrs
;
2297 cbdata
.vertices_component
= XNEWVEC (int, pg
->n_vertices
);
2298 /* Record the component information which will be corrupted by next
2299 graph scc finding call. */
2300 for (i
= 0; i
< pg
->n_vertices
; ++i
)
2301 cbdata
.vertices_component
[i
] = pg
->vertices
[i
].component
;
2303 /* Collect data dependences for runtime alias checks to break SCCs. */
2304 if (bitmap_count_bits (sccs_to_merge
) != (unsigned) num_sccs
)
2306 /* Run SCC finding algorithm again, with alias dependence edges
2307 skipped. This is to topologically sort partitions according to
2308 compilation time known dependence. Note the topological order
2309 is stored in the form of pg's post order number. */
2310 num_sccs_no_alias
= graphds_scc (pg
, NULL
, pg_skip_alias_edge
);
2311 gcc_assert (partitions
->length () == (unsigned) num_sccs_no_alias
);
2312 /* With topological order, we can construct two subgraphs L and R.
2313 L contains edge <x, y> where x < y in terms of post order, while
2314 R contains edge <x, y> where x > y. Edges for compilation time
2315 known dependence all fall in R, so we break SCCs by removing all
2316 (alias) edges of in subgraph L. */
2317 for_each_edge (pg
, pg_collect_alias_ddrs
, &cbdata
);
2320 /* For SCC that doesn't need to be broken, merge it. */
2321 for (i
= 0; i
< num_sccs
; ++i
)
2323 if (!bitmap_bit_p (sccs_to_merge
, i
))
2326 for (j
= 0; partitions
->iterate (j
, &first
); ++j
)
2327 if (cbdata
.vertices_component
[j
] == i
)
2329 for (k
= j
+ 1; partitions
->iterate (k
, &partition
); ++k
)
2331 struct pg_vdata
*data
;
2333 if (cbdata
.vertices_component
[k
] != i
)
2336 /* Update postorder number so that merged reduction partition is
2337 sorted after other partitions. */
2338 if (!partition_reduction_p (first
)
2339 && partition_reduction_p (partition
))
2341 gcc_assert (pg
->vertices
[k
].post
< pg
->vertices
[j
].post
);
2342 pg
->vertices
[j
].post
= pg
->vertices
[k
].post
;
2344 partition_merge_into (NULL
, first
, partition
, FUSE_SAME_SCC
);
2345 (*partitions
)[k
] = NULL
;
2346 partition_free (partition
);
2347 data
= (struct pg_vdata
*)pg
->vertices
[k
].data
;
2348 gcc_assert (data
->id
== k
);
2349 data
->partition
= NULL
;
2350 /* The result partition of merged SCC must be sequential. */
2351 first
->type
= PTYPE_SEQUENTIAL
;
2356 sort_partitions_by_post_order (pg
, partitions
);
2357 free_partition_graph_vdata (pg
);
2358 for_each_edge (pg
, free_partition_graph_edata_cb
, NULL
);
2361 if (dump_file
&& (dump_flags
& TDF_DETAILS
))
2363 fprintf (dump_file
, "Possible alias data dependence to break:\n");
2364 dump_data_dependence_relations (dump_file
, *alias_ddrs
);
2368 /* Compute and return an expression whose value is the segment length which
2369 will be accessed by DR in NITERS iterations. */
2372 data_ref_segment_size (struct data_reference
*dr
, tree niters
)
2374 niters
= size_binop (MINUS_EXPR
,
2375 fold_convert (sizetype
, niters
),
2377 return size_binop (MULT_EXPR
,
2378 fold_convert (sizetype
, DR_STEP (dr
)),
2379 fold_convert (sizetype
, niters
));
2382 /* Return true if LOOP's latch is dominated by statement for data reference
2386 latch_dominated_by_data_ref (struct loop
*loop
, data_reference
*dr
)
2388 return dominated_by_p (CDI_DOMINATORS
, single_exit (loop
)->src
,
2389 gimple_bb (DR_STMT (dr
)));
2392 /* Compute alias check pairs and store them in COMP_ALIAS_PAIRS for LOOP's
2393 data dependence relations ALIAS_DDRS. */
2396 compute_alias_check_pairs (struct loop
*loop
, vec
<ddr_p
> *alias_ddrs
,
2397 vec
<dr_with_seg_len_pair_t
> *comp_alias_pairs
)
2400 unsigned HOST_WIDE_INT factor
= 1;
2401 tree niters_plus_one
, niters
= number_of_latch_executions (loop
);
2403 gcc_assert (niters
!= NULL_TREE
&& niters
!= chrec_dont_know
);
2404 niters
= fold_convert (sizetype
, niters
);
2405 niters_plus_one
= size_binop (PLUS_EXPR
, niters
, size_one_node
);
2407 if (dump_file
&& (dump_flags
& TDF_DETAILS
))
2408 fprintf (dump_file
, "Creating alias check pairs:\n");
2410 /* Iterate all data dependence relations and compute alias check pairs. */
2411 for (i
= 0; i
< alias_ddrs
->length (); i
++)
2413 ddr_p ddr
= (*alias_ddrs
)[i
];
2414 struct data_reference
*dr_a
= DDR_A (ddr
);
2415 struct data_reference
*dr_b
= DDR_B (ddr
);
2416 tree seg_length_a
, seg_length_b
;
2417 int comp_res
= data_ref_compare_tree (DR_BASE_ADDRESS (dr_a
),
2418 DR_BASE_ADDRESS (dr_b
));
2421 comp_res
= data_ref_compare_tree (DR_OFFSET (dr_a
), DR_OFFSET (dr_b
));
2422 gcc_assert (comp_res
!= 0);
2424 if (latch_dominated_by_data_ref (loop
, dr_a
))
2425 seg_length_a
= data_ref_segment_size (dr_a
, niters_plus_one
);
2427 seg_length_a
= data_ref_segment_size (dr_a
, niters
);
2429 if (latch_dominated_by_data_ref (loop
, dr_b
))
2430 seg_length_b
= data_ref_segment_size (dr_b
, niters_plus_one
);
2432 seg_length_b
= data_ref_segment_size (dr_b
, niters
);
2434 unsigned HOST_WIDE_INT access_size_a
2435 = tree_to_uhwi (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr_a
))));
2436 unsigned HOST_WIDE_INT access_size_b
2437 = tree_to_uhwi (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr_b
))));
2438 unsigned int align_a
= TYPE_ALIGN_UNIT (TREE_TYPE (DR_REF (dr_a
)));
2439 unsigned int align_b
= TYPE_ALIGN_UNIT (TREE_TYPE (DR_REF (dr_b
)));
2441 dr_with_seg_len_pair_t dr_with_seg_len_pair
2442 (dr_with_seg_len (dr_a
, seg_length_a
, access_size_a
, align_a
),
2443 dr_with_seg_len (dr_b
, seg_length_b
, access_size_b
, align_b
));
2445 /* Canonicalize pairs by sorting the two DR members. */
2447 std::swap (dr_with_seg_len_pair
.first
, dr_with_seg_len_pair
.second
);
2449 comp_alias_pairs
->safe_push (dr_with_seg_len_pair
);
2452 if (tree_fits_uhwi_p (niters
))
2453 factor
= tree_to_uhwi (niters
);
2455 /* Prune alias check pairs. */
2456 prune_runtime_alias_test_list (comp_alias_pairs
, factor
);
2457 if (dump_file
&& (dump_flags
& TDF_DETAILS
))
2459 "Improved number of alias checks from %d to %d\n",
2460 alias_ddrs
->length (), comp_alias_pairs
->length ());
2463 /* Given data dependence relations in ALIAS_DDRS, generate runtime alias
2464 checks and version LOOP under condition of these runtime alias checks. */
2467 version_loop_by_alias_check (vec
<struct partition
*> *partitions
,
2468 struct loop
*loop
, vec
<ddr_p
> *alias_ddrs
)
2470 profile_probability prob
;
2471 basic_block cond_bb
;
2473 tree lhs
, arg0
, cond_expr
= NULL_TREE
;
2474 gimple_seq cond_stmts
= NULL
;
2475 gimple
*call_stmt
= NULL
;
2476 auto_vec
<dr_with_seg_len_pair_t
> comp_alias_pairs
;
2478 /* Generate code for runtime alias checks if necessary. */
2479 gcc_assert (alias_ddrs
->length () > 0);
2481 if (dump_file
&& (dump_flags
& TDF_DETAILS
))
2483 "Version loop <%d> with runtime alias check\n", loop
->num
);
2485 compute_alias_check_pairs (loop
, alias_ddrs
, &comp_alias_pairs
);
2486 create_runtime_alias_checks (loop
, &comp_alias_pairs
, &cond_expr
);
2487 cond_expr
= force_gimple_operand_1 (cond_expr
, &cond_stmts
,
2488 is_gimple_val
, NULL_TREE
);
2490 /* Depend on vectorizer to fold IFN_LOOP_DIST_ALIAS. */
2491 bool cancelable_p
= flag_tree_loop_vectorize
;
2495 struct partition
*partition
;
2496 for (; partitions
->iterate (i
, &partition
); ++i
)
2497 if (!partition_builtin_p (partition
))
2500 /* If all partitions are builtins, distributing it would be profitable and
2501 we don't want to cancel the runtime alias checks. */
2502 if (i
== partitions
->length ())
2503 cancelable_p
= false;
2506 /* Generate internal function call for loop distribution alias check if the
2507 runtime alias check should be cancelable. */
2510 call_stmt
= gimple_build_call_internal (IFN_LOOP_DIST_ALIAS
,
2511 2, NULL_TREE
, cond_expr
);
2512 lhs
= make_ssa_name (boolean_type_node
);
2513 gimple_call_set_lhs (call_stmt
, lhs
);
2518 prob
= profile_probability::guessed_always ().apply_scale (9, 10);
2519 initialize_original_copy_tables ();
2520 nloop
= loop_version (loop
, lhs
, &cond_bb
, prob
, prob
.invert (),
2521 prob
, prob
.invert (), true);
2522 free_original_copy_tables ();
2523 /* Record the original loop number in newly generated loops. In case of
2524 distribution, the original loop will be distributed and the new loop
2526 loop
->orig_loop_num
= nloop
->num
;
2527 nloop
->orig_loop_num
= nloop
->num
;
2528 nloop
->dont_vectorize
= true;
2529 nloop
->force_vectorize
= false;
2533 /* Record new loop's num in IFN_LOOP_DIST_ALIAS because the original
2534 loop could be destroyed. */
2535 arg0
= build_int_cst (integer_type_node
, loop
->orig_loop_num
);
2536 gimple_call_set_arg (call_stmt
, 0, arg0
);
2537 gimple_seq_add_stmt_without_update (&cond_stmts
, call_stmt
);
2542 gimple_stmt_iterator cond_gsi
= gsi_last_bb (cond_bb
);
2543 gsi_insert_seq_before (&cond_gsi
, cond_stmts
, GSI_SAME_STMT
);
2545 update_ssa (TODO_update_ssa
);
2548 /* Return true if loop versioning is needed to distrubute PARTITIONS.
2549 ALIAS_DDRS are data dependence relations for runtime alias check. */
2552 version_for_distribution_p (vec
<struct partition
*> *partitions
,
2553 vec
<ddr_p
> *alias_ddrs
)
2555 /* No need to version loop if we have only one partition. */
2556 if (partitions
->length () == 1)
2559 /* Need to version loop if runtime alias check is necessary. */
2560 return (alias_ddrs
->length () > 0);
2563 /* Compare base offset of builtin mem* partitions P1 and P2. */
2566 offset_cmp (const void *vp1
, const void *vp2
)
2568 struct partition
*p1
= *(struct partition
*const *) vp1
;
2569 struct partition
*p2
= *(struct partition
*const *) vp2
;
2570 unsigned HOST_WIDE_INT o1
= p1
->builtin
->dst_base_offset
;
2571 unsigned HOST_WIDE_INT o2
= p2
->builtin
->dst_base_offset
;
2572 return (o2
< o1
) - (o1
< o2
);
2575 /* Fuse adjacent memset builtin PARTITIONS if possible. This is a special
2576 case optimization transforming below code:
2578 __builtin_memset (&obj, 0, 100);
2580 __builtin_memset (_1, 0, 200);
2582 __builtin_memset (_2, 0, 100);
2586 __builtin_memset (&obj, 0, 400);
2588 Note we don't have dependence information between different partitions
2589 at this point, as a result, we can't handle nonadjacent memset builtin
2590 partitions since dependence might be broken. */
2593 fuse_memset_builtins (vec
<struct partition
*> *partitions
)
2596 struct partition
*part1
, *part2
;
2599 for (i
= 0; partitions
->iterate (i
, &part1
);)
2601 if (part1
->kind
!= PKIND_MEMSET
)
2607 /* Find sub-array of memset builtins of the same base. Index range
2608 of the sub-array is [i, j) with "j > i". */
2609 for (j
= i
+ 1; partitions
->iterate (j
, &part2
); ++j
)
2611 if (part2
->kind
!= PKIND_MEMSET
2612 || !operand_equal_p (part1
->builtin
->dst_base_base
,
2613 part2
->builtin
->dst_base_base
, 0))
2616 /* Memset calls setting different values can't be merged. */
2617 rhs1
= gimple_assign_rhs1 (DR_STMT (part1
->builtin
->dst_dr
));
2618 rhs2
= gimple_assign_rhs1 (DR_STMT (part2
->builtin
->dst_dr
));
2619 if (!operand_equal_p (rhs1
, rhs2
, 0))
2623 /* Stable sort is required in order to avoid breaking dependence. */
2624 gcc_stablesort (&(*partitions
)[i
], j
- i
, sizeof (*partitions
)[i
],
2626 /* Continue with next partition. */
2630 /* Merge all consecutive memset builtin partitions. */
2631 for (i
= 0; i
< partitions
->length () - 1;)
2633 part1
= (*partitions
)[i
];
2634 if (part1
->kind
!= PKIND_MEMSET
)
2640 part2
= (*partitions
)[i
+ 1];
2641 /* Only merge memset partitions of the same base and with constant
2643 if (part2
->kind
!= PKIND_MEMSET
2644 || TREE_CODE (part1
->builtin
->size
) != INTEGER_CST
2645 || TREE_CODE (part2
->builtin
->size
) != INTEGER_CST
2646 || !operand_equal_p (part1
->builtin
->dst_base_base
,
2647 part2
->builtin
->dst_base_base
, 0))
2652 rhs1
= gimple_assign_rhs1 (DR_STMT (part1
->builtin
->dst_dr
));
2653 rhs2
= gimple_assign_rhs1 (DR_STMT (part2
->builtin
->dst_dr
));
2654 int bytev1
= const_with_all_bytes_same (rhs1
);
2655 int bytev2
= const_with_all_bytes_same (rhs2
);
2656 /* Only merge memset partitions of the same value. */
2657 if (bytev1
!= bytev2
|| bytev1
== -1)
2662 wide_int end1
= wi::add (part1
->builtin
->dst_base_offset
,
2663 wi::to_wide (part1
->builtin
->size
));
2664 /* Only merge adjacent memset partitions. */
2665 if (wi::ne_p (end1
, part2
->builtin
->dst_base_offset
))
2670 /* Merge partitions[i] and partitions[i+1]. */
2671 part1
->builtin
->size
= fold_build2 (PLUS_EXPR
, sizetype
,
2672 part1
->builtin
->size
,
2673 part2
->builtin
->size
);
2674 partition_free (part2
);
2675 partitions
->ordered_remove (i
+ 1);
2679 /* Fuse PARTITIONS of LOOP if necessary before finalizing distribution.
2680 ALIAS_DDRS contains ddrs which need runtime alias check. */
2683 finalize_partitions (struct loop
*loop
, vec
<struct partition
*> *partitions
,
2684 vec
<ddr_p
> *alias_ddrs
)
2687 struct partition
*partition
, *a
;
2689 if (partitions
->length () == 1
2690 || alias_ddrs
->length () > 0)
2693 unsigned num_builtin
= 0, num_normal
= 0, num_partial_memset
= 0;
2694 bool same_type_p
= true;
2695 enum partition_type type
= ((*partitions
)[0])->type
;
2696 for (i
= 0; partitions
->iterate (i
, &partition
); ++i
)
2698 same_type_p
&= (type
== partition
->type
);
2699 if (partition_builtin_p (partition
))
2705 if (partition
->kind
== PKIND_PARTIAL_MEMSET
)
2706 num_partial_memset
++;
2709 /* Don't distribute current loop into too many loops given we don't have
2710 memory stream cost model. Be even more conservative in case of loop
2711 nest distribution. */
2712 if ((same_type_p
&& num_builtin
== 0
2713 && (loop
->inner
== NULL
|| num_normal
!= 2 || num_partial_memset
!= 1))
2714 || (loop
->inner
!= NULL
2715 && i
>= NUM_PARTITION_THRESHOLD
&& num_normal
> 1)
2716 || (loop
->inner
== NULL
2717 && i
>= NUM_PARTITION_THRESHOLD
&& num_normal
> num_builtin
))
2719 a
= (*partitions
)[0];
2720 for (i
= 1; partitions
->iterate (i
, &partition
); ++i
)
2722 partition_merge_into (NULL
, a
, partition
, FUSE_FINALIZE
);
2723 partition_free (partition
);
2725 partitions
->truncate (1);
2728 /* Fuse memset builtins if possible. */
2729 if (partitions
->length () > 1)
2730 fuse_memset_builtins (partitions
);
2733 /* Distributes the code from LOOP in such a way that producer statements
2734 are placed before consumer statements. Tries to separate only the
2735 statements from STMTS into separate loops. Returns the number of
2736 distributed loops. Set NB_CALLS to number of generated builtin calls.
2737 Set *DESTROY_P to whether LOOP needs to be destroyed. */
2740 distribute_loop (struct loop
*loop
, vec
<gimple
*> stmts
,
2741 control_dependences
*cd
, int *nb_calls
, bool *destroy_p
)
2743 ddrs_table
= new hash_table
<ddr_hasher
> (389);
2745 partition
*partition
;
2751 loop_nest
.create (0);
2752 if (!find_loop_nest (loop
, &loop_nest
))
2754 loop_nest
.release ();
2759 datarefs_vec
.create (20);
2760 rdg
= build_rdg (loop
, cd
);
2763 if (dump_file
&& (dump_flags
& TDF_DETAILS
))
2765 "Loop %d not distributed: failed to build the RDG.\n",
2768 loop_nest
.release ();
2769 free_data_refs (datarefs_vec
);
2774 if (datarefs_vec
.length () > MAX_DATAREFS_NUM
)
2776 if (dump_file
&& (dump_flags
& TDF_DETAILS
))
2778 "Loop %d not distributed: too many memory references.\n",
2782 loop_nest
.release ();
2783 free_data_refs (datarefs_vec
);
2788 data_reference_p dref
;
2789 for (i
= 0; datarefs_vec
.iterate (i
, &dref
); ++i
)
2790 dref
->aux
= (void *) (uintptr_t) i
;
2792 if (dump_file
&& (dump_flags
& TDF_DETAILS
))
2793 dump_rdg (dump_file
, rdg
);
2795 auto_vec
<struct partition
*, 3> partitions
;
2796 rdg_build_partitions (rdg
, stmts
, &partitions
);
2798 auto_vec
<ddr_p
> alias_ddrs
;
2800 auto_bitmap stmt_in_all_partitions
;
2801 bitmap_copy (stmt_in_all_partitions
, partitions
[0]->stmts
);
2802 for (i
= 1; partitions
.iterate (i
, &partition
); ++i
)
2803 bitmap_and_into (stmt_in_all_partitions
, partitions
[i
]->stmts
);
2805 any_builtin
= false;
2806 FOR_EACH_VEC_ELT (partitions
, i
, partition
)
2808 classify_partition (loop
, rdg
, partition
, stmt_in_all_partitions
);
2809 any_builtin
|= partition_builtin_p (partition
);
2812 /* If we are only distributing patterns but did not detect any,
2814 if (!flag_tree_loop_distribution
2821 /* If we are only distributing patterns fuse all partitions that
2822 were not classified as builtins. This also avoids chopping
2823 a loop into pieces, separated by builtin calls. That is, we
2824 only want no or a single loop body remaining. */
2825 struct partition
*into
;
2826 if (!flag_tree_loop_distribution
)
2828 for (i
= 0; partitions
.iterate (i
, &into
); ++i
)
2829 if (!partition_builtin_p (into
))
2831 for (++i
; partitions
.iterate (i
, &partition
); ++i
)
2832 if (!partition_builtin_p (partition
))
2834 partition_merge_into (NULL
, into
, partition
, FUSE_NON_BUILTIN
);
2835 partitions
.unordered_remove (i
);
2836 partition_free (partition
);
2841 /* Due to limitations in the transform phase we have to fuse all
2842 reduction partitions into the last partition so the existing
2843 loop will contain all loop-closed PHI nodes. */
2844 for (i
= 0; partitions
.iterate (i
, &into
); ++i
)
2845 if (partition_reduction_p (into
))
2847 for (i
= i
+ 1; partitions
.iterate (i
, &partition
); ++i
)
2848 if (partition_reduction_p (partition
))
2850 partition_merge_into (rdg
, into
, partition
, FUSE_REDUCTION
);
2851 partitions
.unordered_remove (i
);
2852 partition_free (partition
);
2856 /* Apply our simple cost model - fuse partitions with similar
2858 for (i
= 0; partitions
.iterate (i
, &into
); ++i
)
2860 bool changed
= false;
2861 if (partition_builtin_p (into
) || into
->kind
== PKIND_PARTIAL_MEMSET
)
2864 partitions
.iterate (j
, &partition
); ++j
)
2866 if (share_memory_accesses (rdg
, into
, partition
))
2868 partition_merge_into (rdg
, into
, partition
, FUSE_SHARE_REF
);
2869 partitions
.unordered_remove (j
);
2870 partition_free (partition
);
2875 /* If we fused 0 1 2 in step 1 to 0,2 1 as 0 and 2 have similar
2876 accesses when 1 and 2 have similar accesses but not 0 and 1
2877 then in the next iteration we will fail to consider merging
2878 1 into 0,2. So try again if we did any merging into 0. */
2883 /* Build the partition dependency graph and fuse partitions in strong
2884 connected component. */
2885 if (partitions
.length () > 1)
2887 /* Don't support loop nest distribution under runtime alias check
2888 since it's not likely to enable many vectorization opportunities. */
2890 merge_dep_scc_partitions (rdg
, &partitions
, false);
2893 merge_dep_scc_partitions (rdg
, &partitions
, true);
2894 if (partitions
.length () > 1)
2895 break_alias_scc_partitions (rdg
, &partitions
, &alias_ddrs
);
2899 finalize_partitions (loop
, &partitions
, &alias_ddrs
);
2901 nbp
= partitions
.length ();
2903 || (nbp
== 1 && !partition_builtin_p (partitions
[0]))
2904 || (nbp
> 1 && partition_contains_all_rw (rdg
, partitions
)))
2910 if (version_for_distribution_p (&partitions
, &alias_ddrs
))
2911 version_loop_by_alias_check (&partitions
, loop
, &alias_ddrs
);
2913 if (dump_file
&& (dump_flags
& TDF_DETAILS
))
2916 "distribute loop <%d> into partitions:\n", loop
->num
);
2917 dump_rdg_partitions (dump_file
, partitions
);
2920 FOR_EACH_VEC_ELT (partitions
, i
, partition
)
2922 if (partition_builtin_p (partition
))
2924 *destroy_p
|= generate_code_for_partition (loop
, partition
, i
< nbp
- 1);
2928 loop_nest
.release ();
2929 free_data_refs (datarefs_vec
);
2930 for (hash_table
<ddr_hasher
>::iterator iter
= ddrs_table
->begin ();
2931 iter
!= ddrs_table
->end (); ++iter
)
2933 free_dependence_relation (*iter
);
2938 FOR_EACH_VEC_ELT (partitions
, i
, partition
)
2939 partition_free (partition
);
2942 return nbp
- *nb_calls
;
2945 /* Distribute all loops in the current function. */
2949 const pass_data pass_data_loop_distribution
=
2951 GIMPLE_PASS
, /* type */
2953 OPTGROUP_LOOP
, /* optinfo_flags */
2954 TV_TREE_LOOP_DISTRIBUTION
, /* tv_id */
2955 ( PROP_cfg
| PROP_ssa
), /* properties_required */
2956 0, /* properties_provided */
2957 0, /* properties_destroyed */
2958 0, /* todo_flags_start */
2959 0, /* todo_flags_finish */
2962 class pass_loop_distribution
: public gimple_opt_pass
2965 pass_loop_distribution (gcc::context
*ctxt
)
2966 : gimple_opt_pass (pass_data_loop_distribution
, ctxt
)
2969 /* opt_pass methods: */
2970 virtual bool gate (function
*)
2972 return flag_tree_loop_distribution
2973 || flag_tree_loop_distribute_patterns
;
2976 virtual unsigned int execute (function
*);
2978 }; // class pass_loop_distribution
2981 /* Given LOOP, this function records seed statements for distribution in
2982 WORK_LIST. Return false if there is nothing for distribution. */
2985 find_seed_stmts_for_distribution (struct loop
*loop
, vec
<gimple
*> *work_list
)
2987 basic_block
*bbs
= get_loop_body_in_dom_order (loop
);
2989 /* Initialize the worklist with stmts we seed the partitions with. */
2990 for (unsigned i
= 0; i
< loop
->num_nodes
; ++i
)
2992 for (gphi_iterator gsi
= gsi_start_phis (bbs
[i
]);
2993 !gsi_end_p (gsi
); gsi_next (&gsi
))
2995 gphi
*phi
= gsi
.phi ();
2996 if (virtual_operand_p (gimple_phi_result (phi
)))
2998 /* Distribute stmts which have defs that are used outside of
3000 if (!stmt_has_scalar_dependences_outside_loop (loop
, phi
))
3002 work_list
->safe_push (phi
);
3004 for (gimple_stmt_iterator gsi
= gsi_start_bb (bbs
[i
]);
3005 !gsi_end_p (gsi
); gsi_next (&gsi
))
3007 gimple
*stmt
= gsi_stmt (gsi
);
3009 /* If there is a stmt with side-effects bail out - we
3010 cannot and should not distribute this loop. */
3011 if (gimple_has_side_effects (stmt
))
3017 /* Distribute stmts which have defs that are used outside of
3019 if (stmt_has_scalar_dependences_outside_loop (loop
, stmt
))
3021 /* Otherwise only distribute stores for now. */
3022 else if (!gimple_vdef (stmt
))
3025 work_list
->safe_push (stmt
);
3029 return work_list
->length () > 0;
3032 /* Given innermost LOOP, return the outermost enclosing loop that forms a
3033 perfect loop nest. */
3035 static struct loop
*
3036 prepare_perfect_loop_nest (struct loop
*loop
)
3038 struct loop
*outer
= loop_outer (loop
);
3039 tree niters
= number_of_latch_executions (loop
);
3041 /* TODO: We only support the innermost 3-level loop nest distribution
3042 because of compilation time issue for now. This should be relaxed
3043 in the future. Note we only allow 3-level loop nest distribution
3044 when parallelizing loops. */
3045 while ((loop
->inner
== NULL
3046 || (loop
->inner
->inner
== NULL
&& flag_tree_parallelize_loops
> 1))
3047 && loop_outer (outer
)
3048 && outer
->inner
== loop
&& loop
->next
== NULL
3049 && single_exit (outer
)
3050 && optimize_loop_for_speed_p (outer
)
3051 && !chrec_contains_symbols_defined_in_loop (niters
, outer
->num
)
3052 && (niters
= number_of_latch_executions (outer
)) != NULL_TREE
3053 && niters
!= chrec_dont_know
)
3056 outer
= loop_outer (loop
);
3063 pass_loop_distribution::execute (function
*fun
)
3066 bool changed
= false;
3068 control_dependences
*cd
= NULL
;
3069 auto_vec
<loop_p
> loops_to_be_destroyed
;
3071 if (number_of_loops (fun
) <= 1)
3074 /* Compute topological order for basic blocks. Topological order is
3075 needed because data dependence is computed for data references in
3076 lexicographical order. */
3077 if (bb_top_order_index
== NULL
)
3080 int *rpo
= XNEWVEC (int, last_basic_block_for_fn (cfun
));
3082 bb_top_order_index
= XNEWVEC (int, last_basic_block_for_fn (cfun
));
3083 bb_top_order_index_size
= last_basic_block_for_fn (cfun
);
3084 rpo_num
= pre_and_rev_post_order_compute_fn (cfun
, NULL
, rpo
, true);
3085 for (int i
= 0; i
< rpo_num
; i
++)
3086 bb_top_order_index
[rpo
[i
]] = i
;
3091 FOR_ALL_BB_FN (bb
, fun
)
3093 gimple_stmt_iterator gsi
;
3094 for (gsi
= gsi_start_phis (bb
); !gsi_end_p (gsi
); gsi_next (&gsi
))
3095 gimple_set_uid (gsi_stmt (gsi
), -1);
3096 for (gsi
= gsi_start_bb (bb
); !gsi_end_p (gsi
); gsi_next (&gsi
))
3097 gimple_set_uid (gsi_stmt (gsi
), -1);
3100 /* We can at the moment only distribute non-nested loops, thus restrict
3101 walking to innermost loops. */
3102 FOR_EACH_LOOP (loop
, LI_ONLY_INNERMOST
)
3104 /* Don't distribute multiple exit edges loop, or cold loop. */
3105 if (!single_exit (loop
)
3106 || !optimize_loop_for_speed_p (loop
))
3109 /* Don't distribute loop if niters is unknown. */
3110 tree niters
= number_of_latch_executions (loop
);
3111 if (niters
== NULL_TREE
|| niters
== chrec_dont_know
)
3114 /* Get the perfect loop nest for distribution. */
3115 loop
= prepare_perfect_loop_nest (loop
);
3116 for (; loop
; loop
= loop
->inner
)
3118 auto_vec
<gimple
*> work_list
;
3119 if (!find_seed_stmts_for_distribution (loop
, &work_list
))
3122 const char *str
= loop
->inner
? " nest" : "";
3123 dump_user_location_t loc
= find_loop_location (loop
);
3126 calculate_dominance_info (CDI_DOMINATORS
);
3127 calculate_dominance_info (CDI_POST_DOMINATORS
);
3128 cd
= new control_dependences ();
3129 free_dominance_info (CDI_POST_DOMINATORS
);
3133 int nb_generated_loops
, nb_generated_calls
;
3134 nb_generated_loops
= distribute_loop (loop
, work_list
, cd
,
3135 &nb_generated_calls
,
3138 loops_to_be_destroyed
.safe_push (loop
);
3140 if (nb_generated_loops
+ nb_generated_calls
> 0)
3143 if (dump_enabled_p ())
3144 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS
,
3145 loc
, "Loop%s %d distributed: split to %d loops "
3146 "and %d library calls.\n", str
, loop
->num
,
3147 nb_generated_loops
, nb_generated_calls
);
3152 if (dump_file
&& (dump_flags
& TDF_DETAILS
))
3153 fprintf (dump_file
, "Loop%s %d not distributed.\n", str
, loop
->num
);
3160 if (bb_top_order_index
!= NULL
)
3162 free (bb_top_order_index
);
3163 bb_top_order_index
= NULL
;
3164 bb_top_order_index_size
= 0;
3169 /* Destroy loop bodies that could not be reused. Do this late as we
3170 otherwise can end up refering to stale data in control dependences. */
3172 FOR_EACH_VEC_ELT (loops_to_be_destroyed
, i
, loop
)
3173 destroy_loop (loop
);
3175 /* Cached scalar evolutions now may refer to wrong or non-existing
3178 mark_virtual_operands_for_renaming (fun
);
3179 rewrite_into_loop_closed_ssa (NULL
, TODO_update_ssa
);
3182 checking_verify_loop_structure ();
3184 return changed
? TODO_cleanup_cfg
: 0;
3190 make_pass_loop_distribution (gcc::context
*ctxt
)
3192 return new pass_loop_distribution (ctxt
);