1 /* Loop distribution.
2 Copyright (C) 2006-2019 Free Software Foundation, Inc.
3 Contributed by Georges-Andre Silber <Georges-Andre.Silber@ensmp.fr>
4 and Sebastian Pop <sebastian.pop@amd.com>.
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify it
9 under the terms of the GNU General Public License as published by the
10 Free Software Foundation; either version 3, or (at your option) any
11 later version.
12
13 GCC is distributed in the hope that it will be useful, but WITHOUT
14 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 /* This pass performs loop distribution: for example, the loop
23
24 |DO I = 2, N
25 | A(I) = B(I) + C
26 | D(I) = A(I-1)*E
27 |ENDDO
28
29 is transformed to
30
31 |DOALL I = 2, N
32 | A(I) = B(I) + C
33 |ENDDO
34 |
35 |DOALL I = 2, N
36 | D(I) = A(I-1)*E
37 |ENDDO
38
39 Loop distribution is the dual of loop fusion. It separates statements
40 of a loop (or loop nest) into multiple loops (or loop nests) with the
41 same loop header. The major goal is to separate statements which may
42 be vectorized from those that can't. This pass implements distribution
43 in the following steps:
44
45 1) Seed partitions with statements of specific types. For now we support
46 two types of seed statements: statements defining a variable used outside
47 of the loop, and statements storing to memory.
48 2) Build reduced dependence graph (RDG) for loop to be distributed.
49 The vertices (RDG:V) model all statements in the loop and the edges
50 (RDG:E) model flow and control dependencies between statements.
51 3) Apart from RDG, compute data dependencies between memory references.
52 4) Starting from a seed statement, build up a partition by adding dependent
53 statements according to RDG's dependence information. A partition is
54 classified as parallel type if it can be executed in parallel, or as
55 sequential type if it can't. A parallel type partition is further
56 classified into different builtin kinds if it can be implemented as
57 builtin function calls.
58 5) Build partition dependence graph (PG) based on data dependencies.
59 The vertices (PG:V) model all partitions and the edges (PG:E) model
60 all data dependencies between every pair of partitions. In general,
61 a data dependence is either known or unknown at compilation time. In C
62 family languages, there exist quite a few compilation-time-unknown
63 dependencies because of possible aliasing between data references.
64 We categorize PG's edges into two types: "true" edges, which represent
65 compilation-time-known data dependencies, and "alias" edges for all other
66 data dependencies.
67 6) Traverse the subgraph of PG as if all "alias" edges didn't exist. Merge
68 the partitions in each strongly connected component (SCC) accordingly.
69 Build a new PG for the merged partitions.
70 7) Traverse PG again, this time with both "true" and "alias" edges
71 included. We try to break SCCs by removing some edges. Because
72 SCCs formed by "true" edges were all fused in step 6), we can break SCCs
73 by removing some "alias" edges. Choosing an optimal edge set is
74 NP-hard; fortunately, a simple approximation is good enough for us
75 given the small problem size.
76 8) Collect all data dependencies of the removed "alias" edges. Create
77 runtime alias checks for the collected data dependencies.
78 9) Version the loop under the condition of the runtime alias checks
79 (see the sketch after this comment). Since loop distribution generally
80 introduces additional overhead, it is only useful if vectorization is
81 achieved in a distributed loop. We version the loop with the internal
82 function call IFN_LOOP_DIST_ALIAS. If no distributed loop can be
83 vectorized, we simply remove the distributed loops and revert to the original one.
84
85 TODO:
86 1) We only distribute innermost two-level loop nests now. We should
87 extend it to arbitrary loop nests in the future.
88 2) We only fuse partitions within an SCC now. A better fusion algorithm
89 is desired to minimize loop overhead, maximize parallelism and maximize
90 data reuse. */
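
/* Purely illustrative sketch (not used by the pass): with hypothetical
   arrays A, B, D and scalars C, E, N, the versioning described in steps
   8) and 9) above roughly corresponds, at the source level, to

   | if (runtime alias checks from step 8) hold)
   |   {
   |     for (I = 2; I <= N; I++)      // distributed loop, vectorizable
   |       A[I] = B[I] + C;
   |     for (I = 2; I <= N; I++)      // distributed loop, vectorizable
   |       D[I] = A[I-1] * E;
   |   }
   | else
   |   for (I = 2; I <= N; I++)        // original loop kept as fallback
   |     {
   |       A[I] = B[I] + C;
   |       D[I] = A[I-1] * E;
   |     }

   In the implementation the guard is the internal function call
   IFN_LOOP_DIST_ALIAS, which is resolved later in compilation.  */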
91
92 #include "config.h"
93 #include "system.h"
94 #include "coretypes.h"
95 #include "backend.h"
96 #include "tree.h"
97 #include "gimple.h"
98 #include "cfghooks.h"
99 #include "tree-pass.h"
100 #include "ssa.h"
101 #include "gimple-pretty-print.h"
102 #include "fold-const.h"
103 #include "cfganal.h"
104 #include "gimple-iterator.h"
105 #include "gimplify-me.h"
106 #include "stor-layout.h"
107 #include "tree-cfg.h"
108 #include "tree-ssa-loop-manip.h"
109 #include "tree-ssa-loop-ivopts.h"
110 #include "tree-ssa-loop.h"
111 #include "tree-into-ssa.h"
112 #include "tree-ssa.h"
113 #include "cfgloop.h"
114 #include "tree-scalar-evolution.h"
115 #include "params.h"
116 #include "tree-vectorizer.h"
117 #include "tree-eh.h"
118
119
120 #define MAX_DATAREFS_NUM \
121 ((unsigned) PARAM_VALUE (PARAM_LOOP_MAX_DATAREFS_FOR_DATADEPS))
122
123 /* Threshold controlling the number of distributed partitions. Given that it
124 may become unnecessary if a memory stream cost model is invented in the
125 future, we define it as a temporary macro rather than a parameter. */
126 #define NUM_PARTITION_THRESHOLD (4)
127
128 /* Hashtable helpers. */
129
130 struct ddr_hasher : nofree_ptr_hash <struct data_dependence_relation>
131 {
132 static inline hashval_t hash (const data_dependence_relation *);
133 static inline bool equal (const data_dependence_relation *,
134 const data_dependence_relation *);
135 };
136
137 /* Hash function for data dependence. */
138
139 inline hashval_t
140 ddr_hasher::hash (const data_dependence_relation *ddr)
141 {
142 inchash::hash h;
143 h.add_ptr (DDR_A (ddr));
144 h.add_ptr (DDR_B (ddr));
145 return h.end ();
146 }
147
148 /* Hash table equality function for data dependence. */
149
150 inline bool
151 ddr_hasher::equal (const data_dependence_relation *ddr1,
152 const data_dependence_relation *ddr2)
153 {
154 return (DDR_A (ddr1) == DDR_A (ddr2) && DDR_B (ddr1) == DDR_B (ddr2));
155 }
156
157 /* The loop (nest) to be distributed. */
158 static vec<loop_p> loop_nest;
159
160 /* Vector of data references in the loop to be distributed. */
161 static vec<data_reference_p> datarefs_vec;
162
163 /* Store index of data reference in aux field. */
164 #define DR_INDEX(dr) ((uintptr_t) (dr)->aux)
165
166 /* Hash table for data dependence relation in the loop to be distributed. */
167 static hash_table<ddr_hasher> *ddrs_table;
168
169 /* A Reduced Dependence Graph (RDG) vertex representing a statement. */
170 struct rdg_vertex
171 {
172 /* The statement represented by this vertex. */
173 gimple *stmt;
174
175 /* Vector of data-references in this statement. */
176 vec<data_reference_p> datarefs;
177
178 /* True when the statement contains a write to memory. */
179 bool has_mem_write;
180
181 /* True when the statement contains a read from memory. */
182 bool has_mem_reads;
183 };
184
185 #define RDGV_STMT(V) ((struct rdg_vertex *) ((V)->data))->stmt
186 #define RDGV_DATAREFS(V) ((struct rdg_vertex *) ((V)->data))->datarefs
187 #define RDGV_HAS_MEM_WRITE(V) ((struct rdg_vertex *) ((V)->data))->has_mem_write
188 #define RDGV_HAS_MEM_READS(V) ((struct rdg_vertex *) ((V)->data))->has_mem_reads
189 #define RDG_STMT(RDG, I) RDGV_STMT (&(RDG->vertices[I]))
190 #define RDG_DATAREFS(RDG, I) RDGV_DATAREFS (&(RDG->vertices[I]))
191 #define RDG_MEM_WRITE_STMT(RDG, I) RDGV_HAS_MEM_WRITE (&(RDG->vertices[I]))
192 #define RDG_MEM_READS_STMT(RDG, I) RDGV_HAS_MEM_READS (&(RDG->vertices[I]))
193
194 /* Data dependence type. */
195
196 enum rdg_dep_type
197 {
198 /* Read After Write (RAW). */
199 flow_dd = 'f',
200
201 /* Control dependence (execute conditional on). */
202 control_dd = 'c'
203 };
204
205 /* Dependence information attached to an edge of the RDG. */
206
207 struct rdg_edge
208 {
209 /* Type of the dependence. */
210 enum rdg_dep_type type;
211 };
212
213 #define RDGE_TYPE(E) ((struct rdg_edge *) ((E)->data))->type
214
215 /* Dump vertex I in RDG to FILE. */
216
217 static void
218 dump_rdg_vertex (FILE *file, struct graph *rdg, int i)
219 {
220 struct vertex *v = &(rdg->vertices[i]);
221 struct graph_edge *e;
222
223 fprintf (file, "(vertex %d: (%s%s) (in:", i,
224 RDG_MEM_WRITE_STMT (rdg, i) ? "w" : "",
225 RDG_MEM_READS_STMT (rdg, i) ? "r" : "");
226
227 if (v->pred)
228 for (e = v->pred; e; e = e->pred_next)
229 fprintf (file, " %d", e->src);
230
231 fprintf (file, ") (out:");
232
233 if (v->succ)
234 for (e = v->succ; e; e = e->succ_next)
235 fprintf (file, " %d", e->dest);
236
237 fprintf (file, ")\n");
238 print_gimple_stmt (file, RDGV_STMT (v), 0, TDF_VOPS|TDF_MEMSYMS);
239 fprintf (file, ")\n");
240 }
241
242 /* Call dump_rdg_vertex on stderr. */
243
244 DEBUG_FUNCTION void
245 debug_rdg_vertex (struct graph *rdg, int i)
246 {
247 dump_rdg_vertex (stderr, rdg, i);
248 }
249
250 /* Dump the reduced dependence graph RDG to FILE. */
251
252 static void
253 dump_rdg (FILE *file, struct graph *rdg)
254 {
255 fprintf (file, "(rdg\n");
256 for (int i = 0; i < rdg->n_vertices; i++)
257 dump_rdg_vertex (file, rdg, i);
258 fprintf (file, ")\n");
259 }
260
261 /* Call dump_rdg on stderr. */
262
263 DEBUG_FUNCTION void
264 debug_rdg (struct graph *rdg)
265 {
266 dump_rdg (stderr, rdg);
267 }
268
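/* Helper for dot_rdg below: dump RDG to FILE in GraphViz "dot" format,
   coloring vertices that read memory green and vertices that write
   memory red.  */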
269 static void
270 dot_rdg_1 (FILE *file, struct graph *rdg)
271 {
272 int i;
273 pretty_printer buffer;
274 pp_needs_newline (&buffer) = false;
275 buffer.buffer->stream = file;
276
277 fprintf (file, "digraph RDG {\n");
278
279 for (i = 0; i < rdg->n_vertices; i++)
280 {
281 struct vertex *v = &(rdg->vertices[i]);
282 struct graph_edge *e;
283
284 fprintf (file, "%d [label=\"[%d] ", i, i);
285 pp_gimple_stmt_1 (&buffer, RDGV_STMT (v), 0, TDF_SLIM);
286 pp_flush (&buffer);
287 fprintf (file, "\"]\n");
288
289 /* Highlight reads from memory. */
290 if (RDG_MEM_READS_STMT (rdg, i))
291 fprintf (file, "%d [style=filled, fillcolor=green]\n", i);
292
293 /* Highlight stores to memory. */
294 if (RDG_MEM_WRITE_STMT (rdg, i))
295 fprintf (file, "%d [style=filled, fillcolor=red]\n", i);
296
297 if (v->succ)
298 for (e = v->succ; e; e = e->succ_next)
299 switch (RDGE_TYPE (e))
300 {
301 case flow_dd:
302 /* These are the most common dependences: don't print these. */
303 fprintf (file, "%d -> %d \n", i, e->dest);
304 break;
305
306 case control_dd:
307 fprintf (file, "%d -> %d [label=control] \n", i, e->dest);
308 break;
309
310 default:
311 gcc_unreachable ();
312 }
313 }
314
315 fprintf (file, "}\n\n");
316 }
317
318 /* Display the Reduced Dependence Graph using dotty. */
319
320 DEBUG_FUNCTION void
321 dot_rdg (struct graph *rdg)
322 {
323 /* When debugging, you may want to enable the following code. */
324 #ifdef HAVE_POPEN
325 FILE *file = popen ("dot -Tx11", "w");
326 if (!file)
327 return;
328 dot_rdg_1 (file, rdg);
329 fflush (file);
330 close (fileno (file));
331 pclose (file);
332 #else
333 dot_rdg_1 (stderr, rdg);
334 #endif
335 }
336
337 /* Returns the index of STMT in RDG. */
338
339 static int
340 rdg_vertex_for_stmt (struct graph *rdg ATTRIBUTE_UNUSED, gimple *stmt)
341 {
342 int index = gimple_uid (stmt);
343 gcc_checking_assert (index == -1 || RDG_STMT (rdg, index) == stmt);
344 return index;
345 }
346
347 /* Creates dependence edges in RDG for all the uses of DEF. IDEF is
348 the index of DEF in RDG. */
349
350 static void
351 create_rdg_edges_for_scalar (struct graph *rdg, tree def, int idef)
352 {
353 use_operand_p imm_use_p;
354 imm_use_iterator iterator;
355
356 FOR_EACH_IMM_USE_FAST (imm_use_p, iterator, def)
357 {
358 struct graph_edge *e;
359 int use = rdg_vertex_for_stmt (rdg, USE_STMT (imm_use_p));
360
361 if (use < 0)
362 continue;
363
364 e = add_edge (rdg, idef, use);
365 e->data = XNEW (struct rdg_edge);
366 RDGE_TYPE (e) = flow_dd;
367 }
368 }
369
370 /* Creates an edge for the control dependences of BB to the vertex V. */
371
372 static void
373 create_edge_for_control_dependence (struct graph *rdg, basic_block bb,
374 int v, control_dependences *cd)
375 {
376 bitmap_iterator bi;
377 unsigned edge_n;
378 EXECUTE_IF_SET_IN_BITMAP (cd->get_edges_dependent_on (bb->index),
379 0, edge_n, bi)
380 {
381 basic_block cond_bb = cd->get_edge_src (edge_n);
382 gimple *stmt = last_stmt (cond_bb);
383 if (stmt && is_ctrl_stmt (stmt))
384 {
385 struct graph_edge *e;
386 int c = rdg_vertex_for_stmt (rdg, stmt);
387 if (c < 0)
388 continue;
389
390 e = add_edge (rdg, c, v);
391 e->data = XNEW (struct rdg_edge);
392 RDGE_TYPE (e) = control_dd;
393 }
394 }
395 }
396
397 /* Creates the edges of the reduced dependence graph RDG. */
398
399 static void
400 create_rdg_flow_edges (struct graph *rdg)
401 {
402 int i;
403 def_operand_p def_p;
404 ssa_op_iter iter;
405
406 for (i = 0; i < rdg->n_vertices; i++)
407 FOR_EACH_PHI_OR_STMT_DEF (def_p, RDG_STMT (rdg, i),
408 iter, SSA_OP_DEF)
409 create_rdg_edges_for_scalar (rdg, DEF_FROM_PTR (def_p), i);
410 }
411
412 /* Creates the edges of the reduced dependence graph RDG. */
413
414 static void
415 create_rdg_cd_edges (struct graph *rdg, control_dependences *cd, loop_p loop)
416 {
417 int i;
418
419 for (i = 0; i < rdg->n_vertices; i++)
420 {
421 gimple *stmt = RDG_STMT (rdg, i);
422 if (gimple_code (stmt) == GIMPLE_PHI)
423 {
424 edge_iterator ei;
425 edge e;
426 FOR_EACH_EDGE (e, ei, gimple_bb (stmt)->preds)
427 if (flow_bb_inside_loop_p (loop, e->src))
428 create_edge_for_control_dependence (rdg, e->src, i, cd);
429 }
430 else
431 create_edge_for_control_dependence (rdg, gimple_bb (stmt), i, cd);
432 }
433 }
434
435 /* Build the vertices of the reduced dependence graph RDG. Return false
436 if that failed. */
437
438 static bool
439 create_rdg_vertices (struct graph *rdg, vec<gimple *> stmts, loop_p loop)
440 {
441 int i;
442 gimple *stmt;
443
444 FOR_EACH_VEC_ELT (stmts, i, stmt)
445 {
446 struct vertex *v = &(rdg->vertices[i]);
447
448 /* Record statement to vertex mapping. */
449 gimple_set_uid (stmt, i);
450
451 v->data = XNEW (struct rdg_vertex);
452 RDGV_STMT (v) = stmt;
453 RDGV_DATAREFS (v).create (0);
454 RDGV_HAS_MEM_WRITE (v) = false;
455 RDGV_HAS_MEM_READS (v) = false;
456 if (gimple_code (stmt) == GIMPLE_PHI)
457 continue;
458
459 unsigned drp = datarefs_vec.length ();
460 if (!find_data_references_in_stmt (loop, stmt, &datarefs_vec))
461 return false;
462 for (unsigned j = drp; j < datarefs_vec.length (); ++j)
463 {
464 data_reference_p dr = datarefs_vec[j];
465 if (DR_IS_READ (dr))
466 RDGV_HAS_MEM_READS (v) = true;
467 else
468 RDGV_HAS_MEM_WRITE (v) = true;
469 RDGV_DATAREFS (v).safe_push (dr);
470 }
471 }
472 return true;
473 }
474
475 /* Array mapping basic block's index to its topological order. */
476 static int *bb_top_order_index;
477 /* And size of the array. */
478 static int bb_top_order_index_size;
479
480 /* If X has a smaller topological sort number than Y, returns -1;
481 if greater, returns 1. */
482
483 static int
484 bb_top_order_cmp (const void *x, const void *y)
485 {
486 basic_block bb1 = *(const basic_block *) x;
487 basic_block bb2 = *(const basic_block *) y;
488
489 gcc_assert (bb1->index < bb_top_order_index_size
490 && bb2->index < bb_top_order_index_size);
491 gcc_assert (bb1 == bb2
492 || bb_top_order_index[bb1->index]
493 != bb_top_order_index[bb2->index]);
494
495 return (bb_top_order_index[bb1->index] - bb_top_order_index[bb2->index]);
496 }
497
498 /* Initialize STMTS with all the statements of LOOP. We use topological
499 order to discover all statements. The order is important because
500 generate_loops_for_partition is using the same traversal for identifying
501 statements in loop copies. */
502
503 static void
504 stmts_from_loop (struct loop *loop, vec<gimple *> *stmts)
505 {
506 unsigned int i;
507 basic_block *bbs = get_loop_body_in_custom_order (loop, bb_top_order_cmp);
508
509 for (i = 0; i < loop->num_nodes; i++)
510 {
511 basic_block bb = bbs[i];
512
513 for (gphi_iterator bsi = gsi_start_phis (bb); !gsi_end_p (bsi);
514 gsi_next (&bsi))
515 if (!virtual_operand_p (gimple_phi_result (bsi.phi ())))
516 stmts->safe_push (bsi.phi ());
517
518 for (gimple_stmt_iterator bsi = gsi_start_bb (bb); !gsi_end_p (bsi);
519 gsi_next (&bsi))
520 {
521 gimple *stmt = gsi_stmt (bsi);
522 if (gimple_code (stmt) != GIMPLE_LABEL && !is_gimple_debug (stmt))
523 stmts->safe_push (stmt);
524 }
525 }
526
527 free (bbs);
528 }
529
530 /* Free the reduced dependence graph RDG. */
531
532 static void
533 free_rdg (struct graph *rdg)
534 {
535 int i;
536
537 for (i = 0; i < rdg->n_vertices; i++)
538 {
539 struct vertex *v = &(rdg->vertices[i]);
540 struct graph_edge *e;
541
542 for (e = v->succ; e; e = e->succ_next)
543 free (e->data);
544
545 if (v->data)
546 {
547 gimple_set_uid (RDGV_STMT (v), -1);
548 (RDGV_DATAREFS (v)).release ();
549 free (v->data);
550 }
551 }
552
553 free_graph (rdg);
554 }
555
556 /* Build the Reduced Dependence Graph (RDG) with one vertex per statement of
557 LOOP, and one edge per flow dependence or control dependence from control
558 dependence CD. While visiting each statement, data references are also
559 collected and recorded in the global DATAREFS_VEC. */
560
561 static struct graph *
562 build_rdg (struct loop *loop, control_dependences *cd)
563 {
564 struct graph *rdg;
565
566 /* Create the RDG vertices from the stmts of the loop nest. */
567 auto_vec<gimple *, 10> stmts;
568 stmts_from_loop (loop, &stmts);
569 rdg = new_graph (stmts.length ());
570 if (!create_rdg_vertices (rdg, stmts, loop))
571 {
572 free_rdg (rdg);
573 return NULL;
574 }
575 stmts.release ();
576
577 create_rdg_flow_edges (rdg);
578 if (cd)
579 create_rdg_cd_edges (rdg, cd, loop);
580
581 return rdg;
582 }
583
584
585 /* Kind of distributed loop. */
586 enum partition_kind {
587 PKIND_NORMAL,
588 /* Partial memset means that a partition can be distributed into a loop
589 of memset calls, rather than a single memset call. It's handled just
590 like a normal partition, i.e., distributed as a separate loop; no memset
591 call is generated (see the example after this enum).
592
593 Note: This is a hackish fix trying to distribute a ZERO-ing stmt in a
594 loop nest as deeply as possible. As a result, parloop achieves better
595 parallelization by parallelizing a deeper loop nest. This hack should
596 become unnecessary and be removed once distributed memset can be understood
597 and analyzed in data reference analysis. See PR82604 for more. */
598 PKIND_PARTIAL_MEMSET,
599 PKIND_MEMSET, PKIND_MEMCPY, PKIND_MEMMOVE
600 };
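
/* Example for illustration only (hypothetical array, not code from the
   pass): in

   | int a[100][100];
   | for (i = 0; i < 100; i++)
   |   for (j = 0; j < 50; j++)
   |     a[i][j] = 0;

   the store is consecutive within the innermost loop but not across the
   whole nest, so the zeroing cannot become one big memset. Such a
   partition is classified as PKIND_PARTIAL_MEMSET and is still emitted
   as an ordinary distributed loop.  */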
601
602 /* Type of distributed loop. */
603 enum partition_type {
604 /* The distributed loop can be executed in parallel. */
605 PTYPE_PARALLEL = 0,
606 /* The distributed loop has to be executed sequentially. */
607 PTYPE_SEQUENTIAL
608 };
609
610 /* Builtin info for loop distribution. */
611 struct builtin_info
612 {
613 /* Data references that a kind != PKIND_NORMAL partition is about. */
614 data_reference_p dst_dr;
615 data_reference_p src_dr;
616 /* Base address and size of memory objects operated on by the builtin. Note
617 both dest and source memory objects must have the same size. */
618 tree dst_base;
619 tree src_base;
620 tree size;
621 /* Base and offset part of dst_base after stripping constant offset. This
622 is only used in memset builtin distribution for now. */
623 tree dst_base_base;
624 unsigned HOST_WIDE_INT dst_base_offset;
625 };
626
627 /* Partition for loop distribution. */
628 struct partition
629 {
630 /* Statements of the partition. */
631 bitmap stmts;
632 /* True if the partition defines a variable which is used outside of the loop. */
633 bool reduction_p;
634 enum partition_kind kind;
635 enum partition_type type;
636 /* Data references in the partition. */
637 bitmap datarefs;
638 /* Information about a builtin partition. */
639 struct builtin_info *builtin;
640 };
641
642
643 /* Allocate and initialize a new partition. */
644
645 static partition *
646 partition_alloc (void)
647 {
648 partition *partition = XCNEW (struct partition);
649 partition->stmts = BITMAP_ALLOC (NULL);
650 partition->reduction_p = false;
651 partition->kind = PKIND_NORMAL;
652 partition->datarefs = BITMAP_ALLOC (NULL);
653 return partition;
654 }
655
656 /* Free PARTITION. */
657
658 static void
659 partition_free (partition *partition)
660 {
661 BITMAP_FREE (partition->stmts);
662 BITMAP_FREE (partition->datarefs);
663 if (partition->builtin)
664 free (partition->builtin);
665
666 free (partition);
667 }
668
669 /* Returns true if the partition can be generated as a builtin. */
670
671 static bool
672 partition_builtin_p (partition *partition)
673 {
674 return partition->kind > PKIND_PARTIAL_MEMSET;
675 }
676
677 /* Returns true if the partition contains a reduction. */
678
679 static bool
680 partition_reduction_p (partition *partition)
681 {
682 return partition->reduction_p;
683 }
684
685 /* Partitions are fused because of different reasons. */
686 enum fuse_type
687 {
688 FUSE_NON_BUILTIN = 0,
689 FUSE_REDUCTION = 1,
690 FUSE_SHARE_REF = 2,
691 FUSE_SAME_SCC = 3,
692 FUSE_FINALIZE = 4
693 };
694
695 /* Descriptions of the different fusing reasons. */
696 static const char *fuse_message[] = {
697 "they are non-builtins",
698 "they have reductions",
699 "they have shared memory refs",
700 "they are in the same dependence scc",
701 "there is no point to distribute loop"};
702
703 static void
704 update_type_for_merge (struct graph *, partition *, partition *);
705
706 /* Merge PARTITION into the partition DEST. RDG is the reduced dependence
707 graph; if it is non-NULL, we update the type of the result partition. */
708
709 static void
710 partition_merge_into (struct graph *rdg, partition *dest,
711 partition *partition, enum fuse_type ft)
712 {
713 if (dump_file && (dump_flags & TDF_DETAILS))
714 {
715 fprintf (dump_file, "Fuse partitions because %s:\n", fuse_message[ft]);
716 fprintf (dump_file, " Part 1: ");
717 dump_bitmap (dump_file, dest->stmts);
718 fprintf (dump_file, " Part 2: ");
719 dump_bitmap (dump_file, partition->stmts);
720 }
721
722 dest->kind = PKIND_NORMAL;
723 if (dest->type == PTYPE_PARALLEL)
724 dest->type = partition->type;
725
726 bitmap_ior_into (dest->stmts, partition->stmts);
727 if (partition_reduction_p (partition))
728 dest->reduction_p = true;
729
730 /* Further check if any data dependence prevents us from executing the
731 new partition in parallel. */
732 if (dest->type == PTYPE_PARALLEL && rdg != NULL)
733 update_type_for_merge (rdg, dest, partition);
734
735 bitmap_ior_into (dest->datarefs, partition->datarefs);
736 }
737
738
739 /* Returns true when DEF is an SSA_NAME defined in LOOP and used after
740 the LOOP. */
741
742 static bool
743 ssa_name_has_uses_outside_loop_p (tree def, loop_p loop)
744 {
745 imm_use_iterator imm_iter;
746 use_operand_p use_p;
747
748 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, def)
749 {
750 if (is_gimple_debug (USE_STMT (use_p)))
751 continue;
752
753 basic_block use_bb = gimple_bb (USE_STMT (use_p));
754 if (!flow_bb_inside_loop_p (loop, use_bb))
755 return true;
756 }
757
758 return false;
759 }
760
761 /* Returns true when STMT defines a scalar variable used after the
762 loop LOOP. */
763
764 static bool
765 stmt_has_scalar_dependences_outside_loop (loop_p loop, gimple *stmt)
766 {
767 def_operand_p def_p;
768 ssa_op_iter op_iter;
769
770 if (gimple_code (stmt) == GIMPLE_PHI)
771 return ssa_name_has_uses_outside_loop_p (gimple_phi_result (stmt), loop);
772
773 FOR_EACH_SSA_DEF_OPERAND (def_p, stmt, op_iter, SSA_OP_DEF)
774 if (ssa_name_has_uses_outside_loop_p (DEF_FROM_PTR (def_p), loop))
775 return true;
776
777 return false;
778 }
779
780 /* Return a copy of LOOP placed before LOOP. */
781
782 static struct loop *
783 copy_loop_before (struct loop *loop)
784 {
785 struct loop *res;
786 edge preheader = loop_preheader_edge (loop);
787
788 initialize_original_copy_tables ();
789 res = slpeel_tree_duplicate_loop_to_edge_cfg (loop, NULL, preheader);
790 gcc_assert (res != NULL);
791 free_original_copy_tables ();
792 delete_update_ssa ();
793
794 return res;
795 }
796
797 /* Creates an empty basic block after LOOP. */
798
799 static void
800 create_bb_after_loop (struct loop *loop)
801 {
802 edge exit = single_exit (loop);
803
804 if (!exit)
805 return;
806
807 split_edge (exit);
808 }
809
810 /* Generate code for PARTITION from the code in LOOP. The loop is
811 copied when COPY_P is true. All the statements not flagged in the
812 PARTITION bitmap are removed from the loop or from its copy. The
813 statements are indexed in sequence inside a basic block, and the
814 basic blocks of a loop are taken in dom order. */
815
816 static void
817 generate_loops_for_partition (struct loop *loop, partition *partition,
818 bool copy_p)
819 {
820 unsigned i;
821 basic_block *bbs;
822
823 if (copy_p)
824 {
825 int orig_loop_num = loop->orig_loop_num;
826 loop = copy_loop_before (loop);
827 gcc_assert (loop != NULL);
828 loop->orig_loop_num = orig_loop_num;
829 create_preheader (loop, CP_SIMPLE_PREHEADERS);
830 create_bb_after_loop (loop);
831 }
832 else
833 {
834 /* Origin number is set to the new versioned loop's num. */
835 gcc_assert (loop->orig_loop_num != loop->num);
836 }
837
838 /* Remove stmts not in the PARTITION bitmap. */
839 bbs = get_loop_body_in_dom_order (loop);
840
841 if (MAY_HAVE_DEBUG_BIND_STMTS)
842 for (i = 0; i < loop->num_nodes; i++)
843 {
844 basic_block bb = bbs[i];
845
846 for (gphi_iterator bsi = gsi_start_phis (bb); !gsi_end_p (bsi);
847 gsi_next (&bsi))
848 {
849 gphi *phi = bsi.phi ();
850 if (!virtual_operand_p (gimple_phi_result (phi))
851 && !bitmap_bit_p (partition->stmts, gimple_uid (phi)))
852 reset_debug_uses (phi);
853 }
854
855 for (gimple_stmt_iterator bsi = gsi_start_bb (bb); !gsi_end_p (bsi); gsi_next (&bsi))
856 {
857 gimple *stmt = gsi_stmt (bsi);
858 if (gimple_code (stmt) != GIMPLE_LABEL
859 && !is_gimple_debug (stmt)
860 && !bitmap_bit_p (partition->stmts, gimple_uid (stmt)))
861 reset_debug_uses (stmt);
862 }
863 }
864
865 for (i = 0; i < loop->num_nodes; i++)
866 {
867 basic_block bb = bbs[i];
868 edge inner_exit = NULL;
869
870 if (loop != bb->loop_father)
871 inner_exit = single_exit (bb->loop_father);
872
873 for (gphi_iterator bsi = gsi_start_phis (bb); !gsi_end_p (bsi);)
874 {
875 gphi *phi = bsi.phi ();
876 if (!virtual_operand_p (gimple_phi_result (phi))
877 && !bitmap_bit_p (partition->stmts, gimple_uid (phi)))
878 remove_phi_node (&bsi, true);
879 else
880 gsi_next (&bsi);
881 }
882
883 for (gimple_stmt_iterator bsi = gsi_start_bb (bb); !gsi_end_p (bsi);)
884 {
885 gimple *stmt = gsi_stmt (bsi);
886 if (gimple_code (stmt) != GIMPLE_LABEL
887 && !is_gimple_debug (stmt)
888 && !bitmap_bit_p (partition->stmts, gimple_uid (stmt)))
889 {
890 /* When distributing a loop nest, if bb is the inner loop's exit_bb,
891 we choose its exit edge/path in order to avoid generating an
892 infinite loop. For all other cases, we choose an arbitrary
893 path through the empty CFG part that this unnecessary
894 control stmt controls. */
895 if (gcond *cond_stmt = dyn_cast <gcond *> (stmt))
896 {
897 if (inner_exit && inner_exit->flags & EDGE_TRUE_VALUE)
898 gimple_cond_make_true (cond_stmt);
899 else
900 gimple_cond_make_false (cond_stmt);
901 update_stmt (stmt);
902 }
903 else if (gimple_code (stmt) == GIMPLE_SWITCH)
904 {
905 gswitch *switch_stmt = as_a <gswitch *> (stmt);
906 gimple_switch_set_index
907 (switch_stmt, CASE_LOW (gimple_switch_label (switch_stmt, 1)));
908 update_stmt (stmt);
909 }
910 else
911 {
912 unlink_stmt_vdef (stmt);
913 gsi_remove (&bsi, true);
914 release_defs (stmt);
915 continue;
916 }
917 }
918 gsi_next (&bsi);
919 }
920 }
921
922 free (bbs);
923 }
924
925 /* If VAL memory representation contains the same value in all bytes,
926 return that value, otherwise return -1.
927 E.g. for 0x24242424 return 0x24, for IEEE double
928 747708026454360457216.0 return 0x44, etc. */
929
930 static int
931 const_with_all_bytes_same (tree val)
932 {
933 unsigned char buf[64];
934 int i, len;
935
936 if (integer_zerop (val)
937 || (TREE_CODE (val) == CONSTRUCTOR
938 && !TREE_CLOBBER_P (val)
939 && CONSTRUCTOR_NELTS (val) == 0))
940 return 0;
941
942 if (real_zerop (val))
943 {
944 /* Only return 0 for +0.0, not for -0.0, which doesn't have
945 an all bytes same memory representation. Don't transform
946 -0.0 stores into +0.0 even for !HONOR_SIGNED_ZEROS. */
947 switch (TREE_CODE (val))
948 {
949 case REAL_CST:
950 if (!real_isneg (TREE_REAL_CST_PTR (val)))
951 return 0;
952 break;
953 case COMPLEX_CST:
954 if (!const_with_all_bytes_same (TREE_REALPART (val))
955 && !const_with_all_bytes_same (TREE_IMAGPART (val)))
956 return 0;
957 break;
958 case VECTOR_CST:
959 {
960 unsigned int count = vector_cst_encoded_nelts (val);
961 unsigned int j;
962 for (j = 0; j < count; ++j)
963 if (const_with_all_bytes_same (VECTOR_CST_ENCODED_ELT (val, j)))
964 break;
965 if (j == count)
966 return 0;
967 break;
968 }
969 default:
970 break;
971 }
972 }
973
974 if (CHAR_BIT != 8 || BITS_PER_UNIT != 8)
975 return -1;
976
977 len = native_encode_expr (val, buf, sizeof (buf));
978 if (len == 0)
979 return -1;
980 for (i = 1; i < len; i++)
981 if (buf[i] != buf[0])
982 return -1;
983 return buf[0];
984 }
985
986 /* Generate a call to memset for PARTITION in LOOP. */
987
988 static void
989 generate_memset_builtin (struct loop *loop, partition *partition)
990 {
991 gimple_stmt_iterator gsi;
992 tree mem, fn, nb_bytes;
993 tree val;
994 struct builtin_info *builtin = partition->builtin;
995 gimple *fn_call;
996
997 /* The new statements will be placed before LOOP. */
998 gsi = gsi_last_bb (loop_preheader_edge (loop)->src);
999
1000 nb_bytes = rewrite_to_non_trapping_overflow (builtin->size);
1001 nb_bytes = force_gimple_operand_gsi (&gsi, nb_bytes, true, NULL_TREE,
1002 false, GSI_CONTINUE_LINKING);
1003 mem = builtin->dst_base;
1004 mem = force_gimple_operand_gsi (&gsi, mem, true, NULL_TREE,
1005 false, GSI_CONTINUE_LINKING);
1006
1007 /* This exactly matches the pattern recognition in classify_partition. */
1008 val = gimple_assign_rhs1 (DR_STMT (builtin->dst_dr));
1009 /* Handle constants like 0x15151515 and similarly
1010 floating point constants etc. where all bytes are the same. */
1011 int bytev = const_with_all_bytes_same (val);
1012 if (bytev != -1)
1013 val = build_int_cst (integer_type_node, bytev);
1014 else if (TREE_CODE (val) == INTEGER_CST)
1015 val = fold_convert (integer_type_node, val);
1016 else if (!useless_type_conversion_p (integer_type_node, TREE_TYPE (val)))
1017 {
1018 tree tem = make_ssa_name (integer_type_node);
1019 gimple *cstmt = gimple_build_assign (tem, NOP_EXPR, val);
1020 gsi_insert_after (&gsi, cstmt, GSI_CONTINUE_LINKING);
1021 val = tem;
1022 }
1023
1024 fn = build_fold_addr_expr (builtin_decl_implicit (BUILT_IN_MEMSET));
1025 fn_call = gimple_build_call (fn, 3, mem, val, nb_bytes);
1026 gsi_insert_after (&gsi, fn_call, GSI_CONTINUE_LINKING);
1027
1028 if (dump_file && (dump_flags & TDF_DETAILS))
1029 {
1030 fprintf (dump_file, "generated memset");
1031 if (bytev == 0)
1032 fprintf (dump_file, " zero\n");
1033 else
1034 fprintf (dump_file, "\n");
1035 }
1036 }
1037
1038 /* Generate a call to memcpy for PARTITION in LOOP. */
1039
1040 static void
1041 generate_memcpy_builtin (struct loop *loop, partition *partition)
1042 {
1043 gimple_stmt_iterator gsi;
1044 gimple *fn_call;
1045 tree dest, src, fn, nb_bytes;
1046 enum built_in_function kind;
1047 struct builtin_info *builtin = partition->builtin;
1048
1049 /* The new statements will be placed before LOOP. */
1050 gsi = gsi_last_bb (loop_preheader_edge (loop)->src);
1051
1052 nb_bytes = rewrite_to_non_trapping_overflow (builtin->size);
1053 nb_bytes = force_gimple_operand_gsi (&gsi, nb_bytes, true, NULL_TREE,
1054 false, GSI_CONTINUE_LINKING);
1055 dest = builtin->dst_base;
1056 src = builtin->src_base;
1057 if (partition->kind == PKIND_MEMCPY
1058 || ! ptr_derefs_may_alias_p (dest, src))
1059 kind = BUILT_IN_MEMCPY;
1060 else
1061 kind = BUILT_IN_MEMMOVE;
1062
1063 dest = force_gimple_operand_gsi (&gsi, dest, true, NULL_TREE,
1064 false, GSI_CONTINUE_LINKING);
1065 src = force_gimple_operand_gsi (&gsi, src, true, NULL_TREE,
1066 false, GSI_CONTINUE_LINKING);
1067 fn = build_fold_addr_expr (builtin_decl_implicit (kind));
1068 fn_call = gimple_build_call (fn, 3, dest, src, nb_bytes);
1069 gsi_insert_after (&gsi, fn_call, GSI_CONTINUE_LINKING);
1070
1071 if (dump_file && (dump_flags & TDF_DETAILS))
1072 {
1073 if (kind == BUILT_IN_MEMCPY)
1074 fprintf (dump_file, "generated memcpy\n");
1075 else
1076 fprintf (dump_file, "generated memmove\n");
1077 }
1078 }
1079
1080 /* Remove and destroy the loop LOOP. */
1081
1082 static void
1083 destroy_loop (struct loop *loop)
1084 {
1085 unsigned nbbs = loop->num_nodes;
1086 edge exit = single_exit (loop);
1087 basic_block src = loop_preheader_edge (loop)->src, dest = exit->dest;
1088 basic_block *bbs;
1089 unsigned i;
1090
1091 bbs = get_loop_body_in_dom_order (loop);
1092
1093 redirect_edge_pred (exit, src);
1094 exit->flags &= ~(EDGE_TRUE_VALUE|EDGE_FALSE_VALUE);
1095 exit->flags |= EDGE_FALLTHRU;
1096 cancel_loop_tree (loop);
1097 rescan_loop_exit (exit, false, true);
1098
1099 i = nbbs;
1100 do
1101 {
1102 /* We have made sure to not leave any dangling uses of SSA
1103 names defined in the loop. With the exception of virtuals.
1104 Make sure we replace all uses of virtual defs that will remain
1105 outside of the loop with the bare symbol as delete_basic_block
1106 will release them. */
1107 --i;
1108 for (gphi_iterator gsi = gsi_start_phis (bbs[i]); !gsi_end_p (gsi);
1109 gsi_next (&gsi))
1110 {
1111 gphi *phi = gsi.phi ();
1112 if (virtual_operand_p (gimple_phi_result (phi)))
1113 mark_virtual_phi_result_for_renaming (phi);
1114 }
1115 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]); !gsi_end_p (gsi);
1116 gsi_next (&gsi))
1117 {
1118 gimple *stmt = gsi_stmt (gsi);
1119 tree vdef = gimple_vdef (stmt);
1120 if (vdef && TREE_CODE (vdef) == SSA_NAME)
1121 mark_virtual_operand_for_renaming (vdef);
1122 }
1123 delete_basic_block (bbs[i]);
1124 }
1125 while (i != 0);
1126
1127 free (bbs);
1128
1129 set_immediate_dominator (CDI_DOMINATORS, dest,
1130 recompute_dominator (CDI_DOMINATORS, dest));
1131 }
1132
1133 /* Generates code for PARTITION. Return whether LOOP needs to be destroyed. */
1134
1135 static bool
1136 generate_code_for_partition (struct loop *loop,
1137 partition *partition, bool copy_p)
1138 {
1139 switch (partition->kind)
1140 {
1141 case PKIND_NORMAL:
1142 case PKIND_PARTIAL_MEMSET:
1143 /* Reductions all have to be in the last partition. */
1144 gcc_assert (!partition_reduction_p (partition)
1145 || !copy_p);
1146 generate_loops_for_partition (loop, partition, copy_p);
1147 return false;
1148
1149 case PKIND_MEMSET:
1150 generate_memset_builtin (loop, partition);
1151 break;
1152
1153 case PKIND_MEMCPY:
1154 case PKIND_MEMMOVE:
1155 generate_memcpy_builtin (loop, partition);
1156 break;
1157
1158 default:
1159 gcc_unreachable ();
1160 }
1161
1162 /* Common tail for partitions we turn into a call. If this was the last
1163 partition for which we generate code, we have to destroy the loop. */
1164 if (!copy_p)
1165 return true;
1166 return false;
1167 }
1168
1169 /* Return the data dependence relation for data references A and B. The two
1170 data references must be in lexicographic order with respect to the reduced
1171 dependence graph RDG. We first try to find the ddr in the global ddr hash
1172 table. If it doesn't exist, compute the ddr and cache it. */
1173
1174 static data_dependence_relation *
1175 get_data_dependence (struct graph *rdg, data_reference_p a, data_reference_p b)
1176 {
1177 struct data_dependence_relation ent, **slot;
1178 struct data_dependence_relation *ddr;
1179
1180 gcc_assert (DR_IS_WRITE (a) || DR_IS_WRITE (b));
1181 gcc_assert (rdg_vertex_for_stmt (rdg, DR_STMT (a))
1182 <= rdg_vertex_for_stmt (rdg, DR_STMT (b)));
1183 ent.a = a;
1184 ent.b = b;
1185 slot = ddrs_table->find_slot (&ent, INSERT);
1186 if (*slot == NULL)
1187 {
1188 ddr = initialize_data_dependence_relation (a, b, loop_nest);
1189 compute_affine_dependence (ddr, loop_nest[0]);
1190 *slot = ddr;
1191 }
1192
1193 return *slot;
1194 }
1195
1196 /* In the reduced dependence graph RDG for loop distribution, return true if
1197 the dependence between references DR1 and DR2 leads to a dependence cycle
1198 and such a dependence cycle can't be resolved by a runtime alias check. */
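
/* For illustration only (hypothetical array A and pointers P, Q):

   | A[i] = ... A[i-1] ...   distance vector (1): cycle, keep sequential
   | A[i] = ... A[i] ...     distance vector (0): no cycle
   | *P   = ... *Q ...       unknown aliasing: no cycle if a runtime
   |                         alias check can be emitted for it  */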
1199
1200 static bool
1201 data_dep_in_cycle_p (struct graph *rdg,
1202 data_reference_p dr1, data_reference_p dr2)
1203 {
1204 struct data_dependence_relation *ddr;
1205
1206 /* Re-shuffle data-refs to be in topological order. */
1207 if (rdg_vertex_for_stmt (rdg, DR_STMT (dr1))
1208 > rdg_vertex_for_stmt (rdg, DR_STMT (dr2)))
1209 std::swap (dr1, dr2);
1210
1211 ddr = get_data_dependence (rdg, dr1, dr2);
1212
1213 /* In case of no data dependence. */
1214 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
1215 return false;
1216 /* For an unknown data dependence, or a known data dependence which can't be
1217 expressed as a classic distance vector, we check if it can be resolved
1218 by a runtime alias check. If yes, we still consider that the data
1219 dependence won't introduce a data dependence cycle. */
1220 else if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know
1221 || DDR_NUM_DIST_VECTS (ddr) == 0)
1222 return !runtime_alias_check_p (ddr, NULL, true);
1223 else if (DDR_NUM_DIST_VECTS (ddr) > 1)
1224 return true;
1225 else if (DDR_REVERSED_P (ddr)
1226 || lambda_vector_zerop (DDR_DIST_VECT (ddr, 0), 1))
1227 return false;
1228
1229 return true;
1230 }
1231
1232 /* Given reduced dependence graph RDG, PARTITION1 and PARTITION2, update
1233 PARTITION1's type after merging PARTITION2 into PARTITION1. */
1234
1235 static void
1236 update_type_for_merge (struct graph *rdg,
1237 partition *partition1, partition *partition2)
1238 {
1239 unsigned i, j;
1240 bitmap_iterator bi, bj;
1241 data_reference_p dr1, dr2;
1242
1243 EXECUTE_IF_SET_IN_BITMAP (partition1->datarefs, 0, i, bi)
1244 {
1245 unsigned start = (partition1 == partition2) ? i + 1 : 0;
1246
1247 dr1 = datarefs_vec[i];
1248 EXECUTE_IF_SET_IN_BITMAP (partition2->datarefs, start, j, bj)
1249 {
1250 dr2 = datarefs_vec[j];
1251 if (DR_IS_READ (dr1) && DR_IS_READ (dr2))
1252 continue;
1253
1254 /* Partition can only be executed sequentially if there is any
1255 data dependence cycle. */
1256 if (data_dep_in_cycle_p (rdg, dr1, dr2))
1257 {
1258 partition1->type = PTYPE_SEQUENTIAL;
1259 return;
1260 }
1261 }
1262 }
1263 }
1264
1265 /* Returns a partition with all the statements needed for computing
1266 the vertex V of the RDG, also including the loop exit conditions. */
1267
1268 static partition *
1269 build_rdg_partition_for_vertex (struct graph *rdg, int v)
1270 {
1271 partition *partition = partition_alloc ();
1272 auto_vec<int, 3> nodes;
1273 unsigned i, j;
1274 int x;
1275 data_reference_p dr;
1276
1277 graphds_dfs (rdg, &v, 1, &nodes, false, NULL);
1278
1279 FOR_EACH_VEC_ELT (nodes, i, x)
1280 {
1281 bitmap_set_bit (partition->stmts, x);
1282
1283 for (j = 0; RDG_DATAREFS (rdg, x).iterate (j, &dr); ++j)
1284 {
1285 unsigned idx = (unsigned) DR_INDEX (dr);
1286 gcc_assert (idx < datarefs_vec.length ());
1287
1288 /* Partition can only be executed sequentially if there is any
1289 unknown data reference. */
1290 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr)
1291 || !DR_INIT (dr) || !DR_STEP (dr))
1292 partition->type = PTYPE_SEQUENTIAL;
1293
1294 bitmap_set_bit (partition->datarefs, idx);
1295 }
1296 }
1297
1298 if (partition->type == PTYPE_SEQUENTIAL)
1299 return partition;
1300
1301 /* Further check if any data dependence prevents us from executing the
1302 partition in parallel. */
1303 update_type_for_merge (rdg, partition, partition);
1304
1305 return partition;
1306 }
1307
1308 /* Given PARTITION of LOOP and RDG, record the single load/store data
1309 references for a builtin partition in SRC_DR/DST_DR; return false if there
1310 are no such data references. */
1311
1312 static bool
1313 find_single_drs (struct loop *loop, struct graph *rdg, partition *partition,
1314 data_reference_p *dst_dr, data_reference_p *src_dr)
1315 {
1316 unsigned i;
1317 data_reference_p single_ld = NULL, single_st = NULL;
1318 bitmap_iterator bi;
1319
1320 EXECUTE_IF_SET_IN_BITMAP (partition->stmts, 0, i, bi)
1321 {
1322 gimple *stmt = RDG_STMT (rdg, i);
1323 data_reference_p dr;
1324
1325 if (gimple_code (stmt) == GIMPLE_PHI)
1326 continue;
1327
1328 /* Any scalar stmts are ok. */
1329 if (!gimple_vuse (stmt))
1330 continue;
1331
1332 /* Otherwise just regular loads/stores. */
1333 if (!gimple_assign_single_p (stmt))
1334 return false;
1335
1336 /* But exactly one store and/or load. */
1337 for (unsigned j = 0; RDG_DATAREFS (rdg, i).iterate (j, &dr); ++j)
1338 {
1339 tree type = TREE_TYPE (DR_REF (dr));
1340
1341 /* The memset, memcpy and memmove library calls are only
1342 able to deal with generic address space. */
1343 if (!ADDR_SPACE_GENERIC_P (TYPE_ADDR_SPACE (type)))
1344 return false;
1345
1346 if (DR_IS_READ (dr))
1347 {
1348 if (single_ld != NULL)
1349 return false;
1350 single_ld = dr;
1351 }
1352 else
1353 {
1354 if (single_st != NULL)
1355 return false;
1356 single_st = dr;
1357 }
1358 }
1359 }
1360
1361 if (!single_st)
1362 return false;
1363
1364 /* Bail out if this is a bitfield memory reference. */
1365 if (TREE_CODE (DR_REF (single_st)) == COMPONENT_REF
1366 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (single_st), 1)))
1367 return false;
1368
1369 /* Data reference must be executed exactly once per iteration of each
1370 loop in the loop nest. We only need to check dominance information
1371 against the outermost one in a perfect loop nest because a bb can't
1372 dominate outermost loop's latch without dominating inner loop's. */
1373 basic_block bb_st = gimple_bb (DR_STMT (single_st));
1374 if (!dominated_by_p (CDI_DOMINATORS, loop->latch, bb_st))
1375 return false;
1376
1377 if (single_ld)
1378 {
1379 gimple *store = DR_STMT (single_st), *load = DR_STMT (single_ld);
1380 /* Direct aggregate copy or via an SSA name temporary. */
1381 if (load != store
1382 && gimple_assign_lhs (load) != gimple_assign_rhs1 (store))
1383 return false;
1384
1385 /* Bail out if this is a bitfield memory reference. */
1386 if (TREE_CODE (DR_REF (single_ld)) == COMPONENT_REF
1387 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (single_ld), 1)))
1388 return false;
1389
1390 /* Load and store must be in the same loop nest. */
1391 basic_block bb_ld = gimple_bb (DR_STMT (single_ld));
1392 if (bb_st->loop_father != bb_ld->loop_father)
1393 return false;
1394
1395 /* Data reference must be executed exactly once per iteration.
1396 Same as single_st, we only need to check against the outermost
1397 loop. */
1398 if (!dominated_by_p (CDI_DOMINATORS, loop->latch, bb_ld))
1399 return false;
1400
1401 edge e = single_exit (bb_st->loop_father);
1402 bool dom_ld = dominated_by_p (CDI_DOMINATORS, e->src, bb_ld);
1403 bool dom_st = dominated_by_p (CDI_DOMINATORS, e->src, bb_st);
1404 if (dom_ld != dom_st)
1405 return false;
1406 }
1407
1408 *src_dr = single_ld;
1409 *dst_dr = single_st;
1410 return true;
1411 }
1412
1413 /* Given data reference DR in LOOP_NEST, this function checks the enclosing
1414 loops from inner to outer to see if the loop's step equals the access size
1415 at each loop level. Return 2 if we can prove this at all loop levels;
1416 record the access base and size in BASE and SIZE; save the loop's step at
1417 each loop level in STEPS if it is not null. For example:
1418
1419 int arr[100][100][100];
1420 for (i = 0; i < 100; i++) ;steps[2] = 40000
1421 for (j = 100; j > 0; j--) ;steps[1] = -400
1422 for (k = 0; k < 100; k++) ;steps[0] = 4
1423 arr[i][j - 1][k] = 0; ;base = &arr, size = 4000000
1424
1425 Return 1 if we can prove the equality for the innermost loop, but not for
1426 all loop levels. In this case, no information is recorded.
1427
1428 Return 0 if the equality cannot be proven at any loop level. */
1429
1430 static int
1431 compute_access_range (loop_p loop_nest, data_reference_p dr, tree *base,
1432 tree *size, vec<tree> *steps = NULL)
1433 {
1434 location_t loc = gimple_location (DR_STMT (dr));
1435 basic_block bb = gimple_bb (DR_STMT (dr));
1436 struct loop *loop = bb->loop_father;
1437 tree ref = DR_REF (dr);
1438 tree access_base = build_fold_addr_expr (ref);
1439 tree access_size = TYPE_SIZE_UNIT (TREE_TYPE (ref));
1440 int res = 0;
1441
1442 do {
1443 tree scev_fn = analyze_scalar_evolution (loop, access_base);
1444 if (TREE_CODE (scev_fn) != POLYNOMIAL_CHREC)
1445 return res;
1446
1447 access_base = CHREC_LEFT (scev_fn);
1448 if (tree_contains_chrecs (access_base, NULL))
1449 return res;
1450
1451 tree scev_step = CHREC_RIGHT (scev_fn);
1452 /* Only support constant steps. */
1453 if (TREE_CODE (scev_step) != INTEGER_CST)
1454 return res;
1455
1456 enum ev_direction access_dir = scev_direction (scev_fn);
1457 if (access_dir == EV_DIR_UNKNOWN)
1458 return res;
1459
1460 if (steps != NULL)
1461 steps->safe_push (scev_step);
1462
1463 scev_step = fold_convert_loc (loc, sizetype, scev_step);
1464 /* Compute absolute value of scev step. */
1465 if (access_dir == EV_DIR_DECREASES)
1466 scev_step = fold_build1_loc (loc, NEGATE_EXPR, sizetype, scev_step);
1467
1468 /* At each loop level, the scev step must equal the access size. In other
1469 words, DR must access consecutive memory between loop iterations. */
1470 if (!operand_equal_p (scev_step, access_size, 0))
1471 return res;
1472
1473 /* The access stride can be computed for the data reference at least for the
1474 innermost loop. */
1475 res = 1;
1476
1477 /* Compute DR's execution times in loop. */
1478 tree niters = number_of_latch_executions (loop);
1479 niters = fold_convert_loc (loc, sizetype, niters);
1480 if (dominated_by_p (CDI_DOMINATORS, single_exit (loop)->src, bb))
1481 niters = size_binop_loc (loc, PLUS_EXPR, niters, size_one_node);
1482
1483 /* Compute DR's overall access size in loop. */
1484 access_size = fold_build2_loc (loc, MULT_EXPR, sizetype,
1485 niters, scev_step);
1486 /* Adjust base address in case of negative step. */
1487 if (access_dir == EV_DIR_DECREASES)
1488 {
1489 tree adj = fold_build2_loc (loc, MINUS_EXPR, sizetype,
1490 scev_step, access_size);
1491 access_base = fold_build_pointer_plus_loc (loc, access_base, adj);
1492 }
1493 } while (loop != loop_nest && (loop = loop_outer (loop)) != NULL);
1494
1495 *base = access_base;
1496 *size = access_size;
1497 /* The access stride can be computed for the data reference at each loop level. */
1498 return 2;
1499 }
1500
1501 /* Allocate and return builtin struct. Record information like DST_DR,
1502 SRC_DR, DST_BASE, SRC_BASE and SIZE in the allocated struct. */
1503
1504 static struct builtin_info *
1505 alloc_builtin (data_reference_p dst_dr, data_reference_p src_dr,
1506 tree dst_base, tree src_base, tree size)
1507 {
1508 struct builtin_info *builtin = XNEW (struct builtin_info);
1509 builtin->dst_dr = dst_dr;
1510 builtin->src_dr = src_dr;
1511 builtin->dst_base = dst_base;
1512 builtin->src_base = src_base;
1513 builtin->size = size;
1514 return builtin;
1515 }
1516
1517 /* Given data reference DR in loop nest LOOP, classify if it forms builtin
1518 memset call. */
1519
1520 static void
1521 classify_builtin_st (loop_p loop, partition *partition, data_reference_p dr)
1522 {
1523 gimple *stmt = DR_STMT (dr);
1524 tree base, size, rhs = gimple_assign_rhs1 (stmt);
1525
1526 if (const_with_all_bytes_same (rhs) == -1
1527 && (!INTEGRAL_TYPE_P (TREE_TYPE (rhs))
1528 || (TYPE_MODE (TREE_TYPE (rhs))
1529 != TYPE_MODE (unsigned_char_type_node))))
1530 return;
1531
1532 if (TREE_CODE (rhs) == SSA_NAME
1533 && !SSA_NAME_IS_DEFAULT_DEF (rhs)
1534 && flow_bb_inside_loop_p (loop, gimple_bb (SSA_NAME_DEF_STMT (rhs))))
1535 return;
1536
1537 int res = compute_access_range (loop, dr, &base, &size);
1538 if (res == 0)
1539 return;
1540 if (res == 1)
1541 {
1542 partition->kind = PKIND_PARTIAL_MEMSET;
1543 return;
1544 }
1545
1546 poly_uint64 base_offset;
1547 unsigned HOST_WIDE_INT const_base_offset;
1548 tree base_base = strip_offset (base, &base_offset);
1549 if (!base_offset.is_constant (&const_base_offset))
1550 return;
1551
1552 struct builtin_info *builtin;
1553 builtin = alloc_builtin (dr, NULL, base, NULL_TREE, size);
1554 builtin->dst_base_base = base_base;
1555 builtin->dst_base_offset = const_base_offset;
1556 partition->builtin = builtin;
1557 partition->kind = PKIND_MEMSET;
1558 }
1559
1560 /* Given data references DST_DR and SRC_DR in loop nest LOOP and RDG, classify
1561 whether they form a builtin memcpy or memmove call. */
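
/* For illustration only (hypothetical arrays, not code from the pass):

   | for (i = 0; i < n; i++)
   |   a[i] = b[i];        // no dependence between load and store: memcpy
   |
   | for (i = 0; i < n; i++)
   |   a[i] = a[i + 1];    // loads read original values only: memmove
   |
   | for (i = 0; i < n; i++)
   |   a[i + 1] = a[i];    // load depends on an earlier store: rejected  */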
1562
1563 static void
1564 classify_builtin_ldst (loop_p loop, struct graph *rdg, partition *partition,
1565 data_reference_p dst_dr, data_reference_p src_dr)
1566 {
1567 tree base, size, src_base, src_size;
1568 auto_vec<tree> dst_steps, src_steps;
1569
1570 /* Compute access range of both load and store. */
1571 int res = compute_access_range (loop, dst_dr, &base, &size, &dst_steps);
1572 if (res != 2)
1573 return;
1574 res = compute_access_range (loop, src_dr, &src_base, &src_size, &src_steps);
1575 if (res != 2)
1576 return;
1577
1578 /* They must have the same access size. */
1579 if (!operand_equal_p (size, src_size, 0))
1580 return;
1581
1582 /* Load and store in the loop nest must access memory in the same way, i.e.,
1583 they must have the same steps in each loop of the nest. */
1584 if (dst_steps.length () != src_steps.length ())
1585 return;
1586 for (unsigned i = 0; i < dst_steps.length (); ++i)
1587 if (!operand_equal_p (dst_steps[i], src_steps[i], 0))
1588 return;
1589
1590 /* Now check whether there is a dependence. */
1591 ddr_p ddr = get_data_dependence (rdg, src_dr, dst_dr);
1592
1593 /* Classify as memcpy if no dependence between load and store. */
1594 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
1595 {
1596 partition->builtin = alloc_builtin (dst_dr, src_dr, base, src_base, size);
1597 partition->kind = PKIND_MEMCPY;
1598 return;
1599 }
1600
1601 /* Can't do memmove in case of unknown dependence or dependence without
1602 classical distance vector. */
1603 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know
1604 || DDR_NUM_DIST_VECTS (ddr) == 0)
1605 return;
1606
1607 unsigned i;
1608 lambda_vector dist_v;
1609 int num_lev = (DDR_LOOP_NEST (ddr)).length ();
1610 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
1611 {
1612 unsigned dep_lev = dependence_level (dist_v, num_lev);
1613 /* Can't do memmove if load depends on store. */
1614 if (dep_lev > 0 && dist_v[dep_lev - 1] > 0 && !DDR_REVERSED_P (ddr))
1615 return;
1616 }
1617
1618 partition->builtin = alloc_builtin (dst_dr, src_dr, base, src_base, size);
1619 partition->kind = PKIND_MEMMOVE;
1620 return;
1621 }
1622
1623 /* Classifies the builtin kind we can generate for PARTITION of RDG and LOOP.
1624 For the moment we detect memset, memcpy and memmove patterns. Bitmap
1625 STMT_IN_ALL_PARTITIONS contains statements belonging to all partitions. */
1626
1627 static void
1628 classify_partition (loop_p loop, struct graph *rdg, partition *partition,
1629 bitmap stmt_in_all_partitions)
1630 {
1631 bitmap_iterator bi;
1632 unsigned i;
1633 data_reference_p single_ld = NULL, single_st = NULL;
1634 bool volatiles_p = false, has_reduction = false;
1635
1636 EXECUTE_IF_SET_IN_BITMAP (partition->stmts, 0, i, bi)
1637 {
1638 gimple *stmt = RDG_STMT (rdg, i);
1639
1640 if (gimple_has_volatile_ops (stmt))
1641 volatiles_p = true;
1642
1643 /* If the stmt is not included by all partitions and there are uses
1644 outside of the loop, then mark the partition as a reduction. */
1645 if (stmt_has_scalar_dependences_outside_loop (loop, stmt))
1646 {
1647 /* Due to a limitation in the transform phase we have to fuse all
1648 reduction partitions. As a result, this could cancel valid
1649 loop distribution, especially for loops whose induction variable
1650 is used outside of the loop. To work around this issue, we skip
1651 marking the partition as a reduction if the reduction stmt belongs
1652 to all partitions. In such a case, the reduction will be computed
1653 correctly no matter how the partitions are fused/distributed. */
1654 if (!bitmap_bit_p (stmt_in_all_partitions, i))
1655 {
1656 partition->reduction_p = true;
1657 return;
1658 }
1659 has_reduction = true;
1660 }
1661 }
1662
1663 /* Perform general partition disqualification for builtins. */
1664 if (volatiles_p
1665 /* Simple workaround to prevent classifying the partition as builtin
1666 if it contains any use outside of loop. */
1667 || has_reduction
1668 || !flag_tree_loop_distribute_patterns)
1669 return;
1670
1671 /* Find single load/store data references for builtin partition. */
1672 if (!find_single_drs (loop, rdg, partition, &single_st, &single_ld))
1673 return;
1674
1675 /* Classify the builtin kind. */
1676 if (single_ld == NULL)
1677 classify_builtin_st (loop, partition, single_st);
1678 else
1679 classify_builtin_ldst (loop, rdg, partition, single_st, single_ld);
1680 }
1681
1682 /* Returns true when PARTITION1 and PARTITION2 access the same memory
1683 object in RDG. */
1684
1685 static bool
1686 share_memory_accesses (struct graph *rdg,
1687 partition *partition1, partition *partition2)
1688 {
1689 unsigned i, j;
1690 bitmap_iterator bi, bj;
1691 data_reference_p dr1, dr2;
1692
1693 /* First check whether there are any loads or stores in the intersection
1694 of the two partitions. Common loads are the situation that happens
1695 most often. */
1696 EXECUTE_IF_AND_IN_BITMAP (partition1->stmts, partition2->stmts, 0, i, bi)
1697 if (RDG_MEM_WRITE_STMT (rdg, i)
1698 || RDG_MEM_READS_STMT (rdg, i))
1699 return true;
1700
1701 /* Then check whether the two partitions access the same memory object. */
1702 EXECUTE_IF_SET_IN_BITMAP (partition1->datarefs, 0, i, bi)
1703 {
1704 dr1 = datarefs_vec[i];
1705
1706 if (!DR_BASE_ADDRESS (dr1)
1707 || !DR_OFFSET (dr1) || !DR_INIT (dr1) || !DR_STEP (dr1))
1708 continue;
1709
1710 EXECUTE_IF_SET_IN_BITMAP (partition2->datarefs, 0, j, bj)
1711 {
1712 dr2 = datarefs_vec[j];
1713
1714 if (!DR_BASE_ADDRESS (dr2)
1715 || !DR_OFFSET (dr2) || !DR_INIT (dr2) || !DR_STEP (dr2))
1716 continue;
1717
1718 if (operand_equal_p (DR_BASE_ADDRESS (dr1), DR_BASE_ADDRESS (dr2), 0)
1719 && operand_equal_p (DR_OFFSET (dr1), DR_OFFSET (dr2), 0)
1720 && operand_equal_p (DR_INIT (dr1), DR_INIT (dr2), 0)
1721 && operand_equal_p (DR_STEP (dr1), DR_STEP (dr2), 0))
1722 return true;
1723 }
1724 }
1725
1726 return false;
1727 }
1728
1729 /* For each seed statement in STARTING_STMTS, this function builds a
1730 partition for it by adding dependent statements according to RDG.
1731 All partitions are recorded in PARTITIONS. */
1732
1733 static void
1734 rdg_build_partitions (struct graph *rdg,
1735 vec<gimple *> starting_stmts,
1736 vec<partition *> *partitions)
1737 {
1738 auto_bitmap processed;
1739 int i;
1740 gimple *stmt;
1741
1742 FOR_EACH_VEC_ELT (starting_stmts, i, stmt)
1743 {
1744 int v = rdg_vertex_for_stmt (rdg, stmt);
1745
1746 if (dump_file && (dump_flags & TDF_DETAILS))
1747 fprintf (dump_file,
1748 "ldist asked to generate code for vertex %d\n", v);
1749
1750 /* If the vertex is already contained in another partition so
1751 is the partition rooted at it. */
1752 if (bitmap_bit_p (processed, v))
1753 continue;
1754
1755 partition *partition = build_rdg_partition_for_vertex (rdg, v);
1756 bitmap_ior_into (processed, partition->stmts);
1757
1758 if (dump_file && (dump_flags & TDF_DETAILS))
1759 {
1760 fprintf (dump_file, "ldist creates useful %s partition:\n",
1761 partition->type == PTYPE_PARALLEL ? "parallel" : "sequent");
1762 bitmap_print (dump_file, partition->stmts, " ", "\n");
1763 }
1764
1765 partitions->safe_push (partition);
1766 }
1767
1768 /* All vertices should have been assigned to at least one partition now,
1769 other than vertices belonging to dead code. */
1770 }
1771
1772 /* Dump to FILE the PARTITIONS. */
1773
1774 static void
1775 dump_rdg_partitions (FILE *file, vec<partition *> partitions)
1776 {
1777 int i;
1778 partition *partition;
1779
1780 FOR_EACH_VEC_ELT (partitions, i, partition)
1781 debug_bitmap_file (file, partition->stmts);
1782 }
1783
1784 /* Debug PARTITIONS. */
1785 extern void debug_rdg_partitions (vec<partition *> );
1786
1787 DEBUG_FUNCTION void
1788 debug_rdg_partitions (vec<partition *> partitions)
1789 {
1790 dump_rdg_partitions (stderr, partitions);
1791 }
1792
1793 /* Returns the number of read and write operations in the RDG. */
1794
1795 static int
1796 number_of_rw_in_rdg (struct graph *rdg)
1797 {
1798 int i, res = 0;
1799
1800 for (i = 0; i < rdg->n_vertices; i++)
1801 {
1802 if (RDG_MEM_WRITE_STMT (rdg, i))
1803 ++res;
1804
1805 if (RDG_MEM_READS_STMT (rdg, i))
1806 ++res;
1807 }
1808
1809 return res;
1810 }
1811
1812 /* Returns the number of read and write operations in a PARTITION of
1813 the RDG. */
1814
1815 static int
1816 number_of_rw_in_partition (struct graph *rdg, partition *partition)
1817 {
1818 int res = 0;
1819 unsigned i;
1820 bitmap_iterator ii;
1821
1822 EXECUTE_IF_SET_IN_BITMAP (partition->stmts, 0, i, ii)
1823 {
1824 if (RDG_MEM_WRITE_STMT (rdg, i))
1825 ++res;
1826
1827 if (RDG_MEM_READS_STMT (rdg, i))
1828 ++res;
1829 }
1830
1831 return res;
1832 }
1833
1834 /* Returns true when one of the PARTITIONS contains all the read or
1835 write operations of RDG. */
1836
1837 static bool
1838 partition_contains_all_rw (struct graph *rdg,
1839 vec<partition *> partitions)
1840 {
1841 int i;
1842 partition *partition;
1843 int nrw = number_of_rw_in_rdg (rdg);
1844
1845 FOR_EACH_VEC_ELT (partitions, i, partition)
1846 if (nrw == number_of_rw_in_partition (rdg, partition))
1847 return true;
1848
1849 return false;
1850 }
1851
1852 /* Compute partition dependence created by the data references in DRS1
1853    and DRS2, modify and return DIR according to that.  If ALIAS_DDRS is
1854    not NULL, we record dependences introduced by possible aliasing
1855    between two data references in ALIAS_DDRS; otherwise, we simply
1856    ignore such dependences as if they don't exist at all.  */
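/* As a purely illustrative example (hypothetical statements, not from any
   testcase): if the statements behind DRS1 write A[i] while the statements
   behind DRS2 read A[i-1] and also write A[i+1], there is a dependence in
   each direction between the two partitions, so the function returns 2 and
   the caller must fuse them.  If only the read of A[i-1] existed, a single
   consistent direction would be returned instead.  */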
1857
1858 static int
1859 pg_add_dependence_edges (struct graph *rdg, int dir,
1860 bitmap drs1, bitmap drs2, vec<ddr_p> *alias_ddrs)
1861 {
1862 unsigned i, j;
1863 bitmap_iterator bi, bj;
1864 data_reference_p dr1, dr2, saved_dr1;
1865
1866 /* dependence direction - 0 is no dependence, -1 is back,
1867 1 is forth, 2 is both (we can stop then, merging will occur). */
1868 EXECUTE_IF_SET_IN_BITMAP (drs1, 0, i, bi)
1869 {
1870 dr1 = datarefs_vec[i];
1871
1872 EXECUTE_IF_SET_IN_BITMAP (drs2, 0, j, bj)
1873 {
1874 int res, this_dir = 1;
1875 ddr_p ddr;
1876
1877 dr2 = datarefs_vec[j];
1878
1879 /* Skip all <read, read> data dependence. */
1880 if (DR_IS_READ (dr1) && DR_IS_READ (dr2))
1881 continue;
1882
1883 saved_dr1 = dr1;
1884 /* Re-shuffle data-refs to be in topological order. */
1885 if (rdg_vertex_for_stmt (rdg, DR_STMT (dr1))
1886 > rdg_vertex_for_stmt (rdg, DR_STMT (dr2)))
1887 {
1888 std::swap (dr1, dr2);
1889 this_dir = -this_dir;
1890 }
1891 ddr = get_data_dependence (rdg, dr1, dr2);
1892 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
1893 {
1894 this_dir = 0;
1895 res = data_ref_compare_tree (DR_BASE_ADDRESS (dr1),
1896 DR_BASE_ADDRESS (dr2));
1897 	      /* Be conservative.  If the data references are not well
1898 		 analyzed, or the two data references have the same base
1899 		 address and offset, add the dependence and consider the
1900 		 references as aliasing each other.  In other words, the
1901 		 dependence cannot be resolved by a runtime alias check.  */
1902 if (!DR_BASE_ADDRESS (dr1) || !DR_BASE_ADDRESS (dr2)
1903 || !DR_OFFSET (dr1) || !DR_OFFSET (dr2)
1904 || !DR_INIT (dr1) || !DR_INIT (dr2)
1905 || !DR_STEP (dr1) || !tree_fits_uhwi_p (DR_STEP (dr1))
1906 || !DR_STEP (dr2) || !tree_fits_uhwi_p (DR_STEP (dr2))
1907 || res == 0)
1908 this_dir = 2;
1909 /* Data dependence could be resolved by runtime alias check,
1910 record it in ALIAS_DDRS. */
1911 else if (alias_ddrs != NULL)
1912 alias_ddrs->safe_push (ddr);
1913 /* Or simply ignore it. */
1914 }
1915 else if (DDR_ARE_DEPENDENT (ddr) == NULL_TREE)
1916 {
1917 if (DDR_REVERSED_P (ddr))
1918 this_dir = -this_dir;
1919
1920 	      /* Known dependences can still be unordered throughout the
1921 		 iteration space, see gcc.dg/tree-ssa/ldist-16.c.  */
1922 if (DDR_NUM_DIST_VECTS (ddr) != 1)
1923 this_dir = 2;
1924 /* If the overlap is exact preserve stmt order. */
1925 else if (lambda_vector_zerop (DDR_DIST_VECT (ddr, 0),
1926 DDR_NB_LOOPS (ddr)))
1927 ;
1928 	      /* Otherwise, as the distance vector is lexicographically
1929 		 positive, swap the dependence direction.  */
1930 else
1931 this_dir = -this_dir;
1932 }
1933 else
1934 this_dir = 0;
1935 if (this_dir == 2)
1936 return 2;
1937 else if (dir == 0)
1938 dir = this_dir;
1939 else if (this_dir != 0 && dir != this_dir)
1940 return 2;
1941 /* Shuffle "back" dr1. */
1942 dr1 = saved_dr1;
1943 }
1944 }
1945 return dir;
1946 }
1947
1948 /* Compare postorder number of the partition graph vertices V1 and V2. */
1949
1950 static int
1951 pgcmp (const void *v1_, const void *v2_)
1952 {
1953 const vertex *v1 = (const vertex *)v1_;
1954 const vertex *v2 = (const vertex *)v2_;
1955 return v2->post - v1->post;
1956 }
1957
1958 /* Data attached to vertices of partition dependence graph. */
1959 struct pg_vdata
1960 {
1961 /* ID of the corresponding partition. */
1962 int id;
1963 /* The partition. */
1964 struct partition *partition;
1965 };
1966
1967 /* Data attached to edges of partition dependence graph. */
1968 struct pg_edata
1969 {
1970 /* If the dependence edge can be resolved by runtime alias check,
1971 this vector contains data dependence relations for runtime alias
1972 check. On the other hand, if the dependence edge is introduced
1973 because of compilation time known data dependence, this vector
1974 contains nothing. */
1975 vec<ddr_p> alias_ddrs;
1976 };
1977
1978 /* Callback data for traversing edges in graph. */
1979 struct pg_edge_callback_data
1980 {
1981   /* Bitmap of strongly connected components that should be merged.  */
1982   bitmap sccs_to_merge;
1983   /* Array containing component information for all vertices.  */
1984   int *vertices_component;
1985   /* Vector to record all data dependence relations which are needed
1986      to break strongly connected components by runtime alias checks.  */
1987 vec<ddr_p> *alias_ddrs;
1988 };
1989
1990 /* Initialize vertex data for partition dependence graph PG with
1991    PARTITIONS.  */
1992
1993 static void
1994 init_partition_graph_vertices (struct graph *pg,
1995 vec<struct partition *> *partitions)
1996 {
1997 int i;
1998 partition *partition;
1999 struct pg_vdata *data;
2000
2001 for (i = 0; partitions->iterate (i, &partition); ++i)
2002 {
2003 data = new pg_vdata;
2004 pg->vertices[i].data = data;
2005 data->id = i;
2006 data->partition = partition;
2007 }
2008 }
2009
2010 /* Add edge <I, J> to partition dependence graph PG. Attach vector of data
2011 dependence relations to the EDGE if DDRS isn't NULL. */
2012
2013 static void
2014 add_partition_graph_edge (struct graph *pg, int i, int j, vec<ddr_p> *ddrs)
2015 {
2016 struct graph_edge *e = add_edge (pg, i, j);
2017
2018   /* If data dependence relations are attached to the edge, it means this
2019      dependence edge can be resolved by runtime alias checks.  */
2020 if (ddrs != NULL)
2021 {
2022 struct pg_edata *data = new pg_edata;
2023
2024 gcc_assert (ddrs->length () > 0);
2025 e->data = data;
2026 data->alias_ddrs = vNULL;
2027 data->alias_ddrs.safe_splice (*ddrs);
2028 }
2029 }
2030
2031 /* Callback function for the graph traversal algorithm.  It returns true
2032    if edge E should be skipped when traversing the graph.  */
2033
2034 static bool
2035 pg_skip_alias_edge (struct graph_edge *e)
2036 {
2037 struct pg_edata *data = (struct pg_edata *)e->data;
2038 return (data != NULL && data->alias_ddrs.length () > 0);
2039 }
2040
2041 /* Callback function freeing data attached to edge E of graph. */
2042
2043 static void
2044 free_partition_graph_edata_cb (struct graph *, struct graph_edge *e, void *)
2045 {
2046 if (e->data != NULL)
2047 {
2048 struct pg_edata *data = (struct pg_edata *)e->data;
2049 data->alias_ddrs.release ();
2050 delete data;
2051 }
2052 }
2053
2054 /* Free data attached to vertices of partition dependence graph PG.  */
2055
2056 static void
2057 free_partition_graph_vdata (struct graph *pg)
2058 {
2059 int i;
2060 struct pg_vdata *data;
2061
2062 for (i = 0; i < pg->n_vertices; ++i)
2063 {
2064 data = (struct pg_vdata *)pg->vertices[i].data;
2065 delete data;
2066 }
2067 }
2068
2069 /* Build and return partition dependence graph for PARTITIONS.  RDG is
2070    the reduced dependence graph for the loop to be distributed.  If
2071    IGNORE_ALIAS_P is true, data dependence caused by possible aliasing
2072    between references is ignored, as if it doesn't exist at all;
2073    otherwise all dependences are considered.  */
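/* Note that a dependence which is unknown at compilation time but checkable
   at runtime leaves the computed direction at 0 while filling ALIAS_DDRS;
   whenever ALIAS_DDRS ends up non-empty, edges are added in both directions
   below, which may create a cycle that break_alias_scc_partitions can later
   resolve with a runtime alias check.  */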
2074
2075 static struct graph *
2076 build_partition_graph (struct graph *rdg,
2077 vec<struct partition *> *partitions,
2078 bool ignore_alias_p)
2079 {
2080 int i, j;
2081 struct partition *partition1, *partition2;
2082 graph *pg = new_graph (partitions->length ());
2083 auto_vec<ddr_p> alias_ddrs, *alias_ddrs_p;
2084
2085 alias_ddrs_p = ignore_alias_p ? NULL : &alias_ddrs;
2086
2087 init_partition_graph_vertices (pg, partitions);
2088
2089 for (i = 0; partitions->iterate (i, &partition1); ++i)
2090 {
2091 for (j = i + 1; partitions->iterate (j, &partition2); ++j)
2092 {
2093 /* dependence direction - 0 is no dependence, -1 is back,
2094 1 is forth, 2 is both (we can stop then, merging will occur). */
2095 int dir = 0;
2096
2097 /* If the first partition has reduction, add back edge; if the
2098 second partition has reduction, add forth edge. This makes
2099 sure that reduction partition will be sorted as the last one. */
2100 if (partition_reduction_p (partition1))
2101 dir = -1;
2102 else if (partition_reduction_p (partition2))
2103 dir = 1;
2104
2105 /* Cleanup the temporary vector. */
2106 alias_ddrs.truncate (0);
2107
2108 dir = pg_add_dependence_edges (rdg, dir, partition1->datarefs,
2109 partition2->datarefs, alias_ddrs_p);
2110
2111 	  /* Add an edge to the partition graph if a dependence exists.
2112 	     There are two types of edges: one type is caused by compilation
2113 	     time known dependences and cannot be resolved by runtime alias
2114 	     checks; the other type can be resolved by runtime alias
2115 	     checks.  */
2116 if (dir == 1 || dir == 2
2117 || alias_ddrs.length () > 0)
2118 {
2119 /* Attach data dependence relations to edge that can be resolved
2120 by runtime alias check. */
2121 bool alias_edge_p = (dir != 1 && dir != 2);
2122 add_partition_graph_edge (pg, i, j,
2123 (alias_edge_p) ? &alias_ddrs : NULL);
2124 }
2125 if (dir == -1 || dir == 2
2126 || alias_ddrs.length () > 0)
2127 {
2128 /* Attach data dependence relations to edge that can be resolved
2129 by runtime alias check. */
2130 bool alias_edge_p = (dir != -1 && dir != 2);
2131 add_partition_graph_edge (pg, j, i,
2132 (alias_edge_p) ? &alias_ddrs : NULL);
2133 }
2134 }
2135 }
2136 return pg;
2137 }
2138
2139 /* Sort partitions in PG in descending post order and store them in
2140 PARTITIONS. */
2141
2142 static void
2143 sort_partitions_by_post_order (struct graph *pg,
2144 vec<struct partition *> *partitions)
2145 {
2146 int i;
2147 struct pg_vdata *data;
2148
2149 /* Now order the remaining nodes in descending postorder. */
2150 qsort (pg->vertices, pg->n_vertices, sizeof (vertex), pgcmp);
2151 partitions->truncate (0);
2152 for (i = 0; i < pg->n_vertices; ++i)
2153 {
2154 data = (struct pg_vdata *)pg->vertices[i].data;
2155 if (data->partition)
2156 partitions->safe_push (data->partition);
2157 }
2158 }
2159
2160 /* Given reduced dependence graph RDG, merge strongly connected components
2161    of PARTITIONS.  If IGNORE_ALIAS_P is true, data dependence caused by
2162    possible aliasing between references is ignored, as if it doesn't exist
2163    at all; otherwise all dependences are considered.  */
2164
2165 static void
2166 merge_dep_scc_partitions (struct graph *rdg,
2167 vec<struct partition *> *partitions,
2168 bool ignore_alias_p)
2169 {
2170 struct partition *partition1, *partition2;
2171 struct pg_vdata *data;
2172 graph *pg = build_partition_graph (rdg, partitions, ignore_alias_p);
2173 int i, j, num_sccs = graphds_scc (pg, NULL);
2174
2175   /* A strongly connected component means a dependence cycle, which we
2176      cannot distribute.  So fuse its partitions together.  */
2177 if ((unsigned) num_sccs < partitions->length ())
2178 {
2179 for (i = 0; i < num_sccs; ++i)
2180 {
2181 for (j = 0; partitions->iterate (j, &partition1); ++j)
2182 if (pg->vertices[j].component == i)
2183 break;
2184 for (j = j + 1; partitions->iterate (j, &partition2); ++j)
2185 if (pg->vertices[j].component == i)
2186 {
2187 partition_merge_into (NULL, partition1,
2188 partition2, FUSE_SAME_SCC);
2189 partition1->type = PTYPE_SEQUENTIAL;
2190 (*partitions)[j] = NULL;
2191 partition_free (partition2);
2192 data = (struct pg_vdata *)pg->vertices[j].data;
2193 data->partition = NULL;
2194 }
2195 }
2196 }
2197
2198 sort_partitions_by_post_order (pg, partitions);
2199 gcc_assert (partitions->length () == (unsigned)num_sccs);
2200 free_partition_graph_vdata (pg);
2201 free_graph (pg);
2202 }
2203
2204 /* Callback function for traversing edge E in graph G. DATA is private
2205 callback data. */
2206
2207 static void
2208 pg_collect_alias_ddrs (struct graph *g, struct graph_edge *e, void *data)
2209 {
2210 int i, j, component;
2211 struct pg_edge_callback_data *cbdata;
2212 struct pg_edata *edata = (struct pg_edata *) e->data;
2213
2214   /* If the edge doesn't have attached data dependences, it represents
2215      compilation time known dependences.  This type of dependence cannot
2216      be resolved by a runtime alias check.  */
2217 if (edata == NULL || edata->alias_ddrs.length () == 0)
2218 return;
2219
2220 cbdata = (struct pg_edge_callback_data *) data;
2221 i = e->src;
2222 j = e->dest;
2223 component = cbdata->vertices_component[i];
2224   /* Vertices are topologically sorted according to compilation time
2225      known dependences, so we can break strongly connected components
2226      by removing edges of the opposite direction, i.e., edges pointing
2227      from a vertex with a smaller post number to a vertex with a bigger
2228      post number.  */
2229 if (g->vertices[i].post < g->vertices[j].post
2230 /* We only need to remove edges connecting vertices in the same
2231 strong connected component to break it. */
2232 && component == cbdata->vertices_component[j]
2233 /* Check if we want to break the strong connected component or not. */
2234 && !bitmap_bit_p (cbdata->sccs_to_merge, component))
2235 cbdata->alias_ddrs->safe_splice (edata->alias_ddrs);
2236 }
2237
2238 /* This is the main function breaking strongly connected components in
2239    PARTITIONS, given the reduced dependence graph RDG.  Store data
2240    dependence relations for runtime alias checks in ALIAS_DDRS.  */
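/* A minimal illustrative sketch (hypothetical pointers, not from a real
   testcase): for

     |for (i = 0; i < n; i++)
     |  {
     |    p[i] = 0;		<-- partition 1
     |    q[i] = x[i];		<-- partition 2
     |  }

   where p and q may alias, the unresolved dependence adds alias edges in
   both directions between the two partitions, forming one SCC.  That SCC
   can be broken by versioning the loop under a runtime check that the
   accessed ranges of p and q do not overlap; the ddrs needed for such a
   check are what this function collects into ALIAS_DDRS.  */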
2241
2242 static void
2243 break_alias_scc_partitions (struct graph *rdg,
2244 vec<struct partition *> *partitions,
2245 vec<ddr_p> *alias_ddrs)
2246 {
2247 int i, j, k, num_sccs, num_sccs_no_alias;
2248 /* Build partition dependence graph. */
2249 graph *pg = build_partition_graph (rdg, partitions, false);
2250
2251 alias_ddrs->truncate (0);
2252 /* Find strong connected components in the graph, with all dependence edges
2253 considered. */
2254 num_sccs = graphds_scc (pg, NULL);
2255 /* All SCCs now can be broken by runtime alias checks because SCCs caused by
2256 compilation time known dependences are merged before this function. */
2257 if ((unsigned) num_sccs < partitions->length ())
2258 {
2259 struct pg_edge_callback_data cbdata;
2260 auto_bitmap sccs_to_merge;
2261 auto_vec<enum partition_type> scc_types;
2262 struct partition *partition, *first;
2263
2264       /* If all partitions in a SCC have the same type, we can simply merge
2265 	 the SCC.  This loop finds such SCCs and records them in the bitmap.  */
2266 bitmap_set_range (sccs_to_merge, 0, (unsigned) num_sccs);
2267 for (i = 0; i < num_sccs; ++i)
2268 {
2269 for (j = 0; partitions->iterate (j, &first); ++j)
2270 if (pg->vertices[j].component == i)
2271 break;
2272
2273 bool same_type = true, all_builtins = partition_builtin_p (first);
2274 for (++j; partitions->iterate (j, &partition); ++j)
2275 {
2276 if (pg->vertices[j].component != i)
2277 continue;
2278
2279 if (first->type != partition->type)
2280 {
2281 same_type = false;
2282 break;
2283 }
2284 all_builtins &= partition_builtin_p (partition);
2285 }
2286 	  /* Merge an SCC if all partitions in the SCC have the same type,
2287 	     even though the resulting partition is sequential, because the
2288 	     vectorizer can do a better job at runtime alias checks.  One
2289 	     exception is when all partitions in the SCC are builtins.  */
2290 if (!same_type || all_builtins)
2291 bitmap_clear_bit (sccs_to_merge, i);
2292 }
2293
2294 /* Initialize callback data for traversing. */
2295 cbdata.sccs_to_merge = sccs_to_merge;
2296 cbdata.alias_ddrs = alias_ddrs;
2297 cbdata.vertices_component = XNEWVEC (int, pg->n_vertices);
2298       /* Record the component information, which will be corrupted by the
2299 	 next graph SCC finding call.  */
2300 for (i = 0; i < pg->n_vertices; ++i)
2301 cbdata.vertices_component[i] = pg->vertices[i].component;
2302
2303 /* Collect data dependences for runtime alias checks to break SCCs. */
2304 if (bitmap_count_bits (sccs_to_merge) != (unsigned) num_sccs)
2305 {
2306 /* Run SCC finding algorithm again, with alias dependence edges
2307 skipped. This is to topologically sort partitions according to
2308 compilation time known dependence. Note the topological order
2309 is stored in the form of pg's post order number. */
2310 num_sccs_no_alias = graphds_scc (pg, NULL, pg_skip_alias_edge);
2311 gcc_assert (partitions->length () == (unsigned) num_sccs_no_alias);
2312 	  /* With topological order, we can construct two subgraphs L and R.
2313 	     L contains edge <x, y> where x < y in terms of post order, while
2314 	     R contains edge <x, y> where x > y.  Edges for compilation time
2315 	     known dependences all fall in R, so we break SCCs by removing all
2316 	     (alias) edges in subgraph L.  */
2317 for_each_edge (pg, pg_collect_alias_ddrs, &cbdata);
2318 }
2319
2320 /* For SCC that doesn't need to be broken, merge it. */
2321 for (i = 0; i < num_sccs; ++i)
2322 {
2323 if (!bitmap_bit_p (sccs_to_merge, i))
2324 continue;
2325
2326 for (j = 0; partitions->iterate (j, &first); ++j)
2327 if (cbdata.vertices_component[j] == i)
2328 break;
2329 for (k = j + 1; partitions->iterate (k, &partition); ++k)
2330 {
2331 struct pg_vdata *data;
2332
2333 if (cbdata.vertices_component[k] != i)
2334 continue;
2335
2336 /* Update postorder number so that merged reduction partition is
2337 sorted after other partitions. */
2338 if (!partition_reduction_p (first)
2339 && partition_reduction_p (partition))
2340 {
2341 gcc_assert (pg->vertices[k].post < pg->vertices[j].post);
2342 pg->vertices[j].post = pg->vertices[k].post;
2343 }
2344 partition_merge_into (NULL, first, partition, FUSE_SAME_SCC);
2345 (*partitions)[k] = NULL;
2346 partition_free (partition);
2347 data = (struct pg_vdata *)pg->vertices[k].data;
2348 gcc_assert (data->id == k);
2349 data->partition = NULL;
2350 /* The result partition of merged SCC must be sequential. */
2351 first->type = PTYPE_SEQUENTIAL;
2352 }
2353 }
2354 }
2355
2356 sort_partitions_by_post_order (pg, partitions);
2357 free_partition_graph_vdata (pg);
2358 for_each_edge (pg, free_partition_graph_edata_cb, NULL);
2359 free_graph (pg);
2360
2361 if (dump_file && (dump_flags & TDF_DETAILS))
2362 {
2363 fprintf (dump_file, "Possible alias data dependence to break:\n");
2364 dump_data_dependence_relations (dump_file, *alias_ddrs);
2365 }
2366 }
2367
2368 /* Compute and return an expression whose value is the segment length which
2369 will be accessed by DR in NITERS iterations. */
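/* For instance, with illustrative numbers: for a data reference whose
   DR_STEP is 4 bytes and NITERS is 100, the expression built below
   evaluates to (100 - 1) * 4 = 396 bytes, i.e. the distance from the
   first accessed address to the last one; the size of the final access
   itself is accounted for separately when the alias check pairs are
   built.  */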
2370
2371 static tree
2372 data_ref_segment_size (struct data_reference *dr, tree niters)
2373 {
2374 niters = size_binop (MINUS_EXPR,
2375 fold_convert (sizetype, niters),
2376 size_one_node);
2377 return size_binop (MULT_EXPR,
2378 fold_convert (sizetype, DR_STEP (dr)),
2379 fold_convert (sizetype, niters));
2380 }
2381
2382 /* Return true if LOOP's latch is dominated by statement for data reference
2383 DR. */
2384
2385 static inline bool
2386 latch_dominated_by_data_ref (struct loop *loop, data_reference *dr)
2387 {
2388 return dominated_by_p (CDI_DOMINATORS, single_exit (loop)->src,
2389 gimple_bb (DR_STMT (dr)));
2390 }
2391
2392 /* Compute alias check pairs and store them in COMP_ALIAS_PAIRS for LOOP's
2393 data dependence relations ALIAS_DDRS. */
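/* Roughly speaking (a simplification of what create_runtime_alias_checks
   emits from these pairs), each pair leads to a test of the form

     |addr_a + segment_length_a <= addr_b  ||  addr_b + segment_length_b <= addr_a

   i.e. the two accessed ranges do not overlap; the access sizes and
   alignments computed below are folded into these bounds as well.  */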
2394
2395 static void
2396 compute_alias_check_pairs (struct loop *loop, vec<ddr_p> *alias_ddrs,
2397 vec<dr_with_seg_len_pair_t> *comp_alias_pairs)
2398 {
2399 unsigned int i;
2400 unsigned HOST_WIDE_INT factor = 1;
2401 tree niters_plus_one, niters = number_of_latch_executions (loop);
2402
2403 gcc_assert (niters != NULL_TREE && niters != chrec_dont_know);
2404 niters = fold_convert (sizetype, niters);
2405 niters_plus_one = size_binop (PLUS_EXPR, niters, size_one_node);
2406
2407 if (dump_file && (dump_flags & TDF_DETAILS))
2408 fprintf (dump_file, "Creating alias check pairs:\n");
2409
2410 /* Iterate all data dependence relations and compute alias check pairs. */
2411 for (i = 0; i < alias_ddrs->length (); i++)
2412 {
2413 ddr_p ddr = (*alias_ddrs)[i];
2414 struct data_reference *dr_a = DDR_A (ddr);
2415 struct data_reference *dr_b = DDR_B (ddr);
2416 tree seg_length_a, seg_length_b;
2417 int comp_res = data_ref_compare_tree (DR_BASE_ADDRESS (dr_a),
2418 DR_BASE_ADDRESS (dr_b));
2419
2420 if (comp_res == 0)
2421 comp_res = data_ref_compare_tree (DR_OFFSET (dr_a), DR_OFFSET (dr_b));
2422 gcc_assert (comp_res != 0);
2423
2424 if (latch_dominated_by_data_ref (loop, dr_a))
2425 seg_length_a = data_ref_segment_size (dr_a, niters_plus_one);
2426 else
2427 seg_length_a = data_ref_segment_size (dr_a, niters);
2428
2429 if (latch_dominated_by_data_ref (loop, dr_b))
2430 seg_length_b = data_ref_segment_size (dr_b, niters_plus_one);
2431 else
2432 seg_length_b = data_ref_segment_size (dr_b, niters);
2433
2434 unsigned HOST_WIDE_INT access_size_a
2435 = tree_to_uhwi (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr_a))));
2436 unsigned HOST_WIDE_INT access_size_b
2437 = tree_to_uhwi (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr_b))));
2438 unsigned int align_a = TYPE_ALIGN_UNIT (TREE_TYPE (DR_REF (dr_a)));
2439 unsigned int align_b = TYPE_ALIGN_UNIT (TREE_TYPE (DR_REF (dr_b)));
2440
2441 dr_with_seg_len_pair_t dr_with_seg_len_pair
2442 (dr_with_seg_len (dr_a, seg_length_a, access_size_a, align_a),
2443 dr_with_seg_len (dr_b, seg_length_b, access_size_b, align_b));
2444
2445 /* Canonicalize pairs by sorting the two DR members. */
2446 if (comp_res > 0)
2447 std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
2448
2449 comp_alias_pairs->safe_push (dr_with_seg_len_pair);
2450 }
2451
2452 if (tree_fits_uhwi_p (niters))
2453 factor = tree_to_uhwi (niters);
2454
2455 /* Prune alias check pairs. */
2456 prune_runtime_alias_test_list (comp_alias_pairs, factor);
2457 if (dump_file && (dump_flags & TDF_DETAILS))
2458 fprintf (dump_file,
2459 "Improved number of alias checks from %d to %d\n",
2460 alias_ddrs->length (), comp_alias_pairs->length ());
2461 }
2462
2463 /* Given data dependence relations in ALIAS_DDRS, generate runtime alias
2464 checks and version LOOP under condition of these runtime alias checks. */
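/* A sketch of the resulting shape, assuming the vectorizer later folds the
   IFN_LOOP_DIST_ALIAS call (the names below are only illustrative):

     |if (IFN_LOOP_DIST_ALIAS (<loop number>, <runtime no-overlap checks>))
     |  <original loop, subsequently distributed>
     |else
     |  <loop copy, with dont_vectorize set>

   The else copy keeps the original semantics when the runtime check fails.  */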
2465
2466 static void
2467 version_loop_by_alias_check (vec<struct partition *> *partitions,
2468 struct loop *loop, vec<ddr_p> *alias_ddrs)
2469 {
2470 profile_probability prob;
2471 basic_block cond_bb;
2472 struct loop *nloop;
2473 tree lhs, arg0, cond_expr = NULL_TREE;
2474 gimple_seq cond_stmts = NULL;
2475 gimple *call_stmt = NULL;
2476 auto_vec<dr_with_seg_len_pair_t> comp_alias_pairs;
2477
2478 /* Generate code for runtime alias checks if necessary. */
2479 gcc_assert (alias_ddrs->length () > 0);
2480
2481 if (dump_file && (dump_flags & TDF_DETAILS))
2482 fprintf (dump_file,
2483 "Version loop <%d> with runtime alias check\n", loop->num);
2484
2485 compute_alias_check_pairs (loop, alias_ddrs, &comp_alias_pairs);
2486 create_runtime_alias_checks (loop, &comp_alias_pairs, &cond_expr);
2487 cond_expr = force_gimple_operand_1 (cond_expr, &cond_stmts,
2488 is_gimple_val, NULL_TREE);
2489
2490 /* Depend on vectorizer to fold IFN_LOOP_DIST_ALIAS. */
2491 bool cancelable_p = flag_tree_loop_vectorize;
2492 if (cancelable_p)
2493 {
2494 unsigned i = 0;
2495 struct partition *partition;
2496 for (; partitions->iterate (i, &partition); ++i)
2497 if (!partition_builtin_p (partition))
2498 break;
2499
2500       /* If all partitions are builtins, distributing them is profitable
2501 	 and we don't want to cancel the runtime alias checks.  */
2502 if (i == partitions->length ())
2503 cancelable_p = false;
2504 }
2505
2506 /* Generate internal function call for loop distribution alias check if the
2507 runtime alias check should be cancelable. */
2508 if (cancelable_p)
2509 {
2510 call_stmt = gimple_build_call_internal (IFN_LOOP_DIST_ALIAS,
2511 2, NULL_TREE, cond_expr);
2512 lhs = make_ssa_name (boolean_type_node);
2513 gimple_call_set_lhs (call_stmt, lhs);
2514 }
2515 else
2516 lhs = cond_expr;
2517
2518 prob = profile_probability::guessed_always ().apply_scale (9, 10);
2519 initialize_original_copy_tables ();
2520 nloop = loop_version (loop, lhs, &cond_bb, prob, prob.invert (),
2521 prob, prob.invert (), true);
2522 free_original_copy_tables ();
2523 /* Record the original loop number in newly generated loops. In case of
2524 distribution, the original loop will be distributed and the new loop
2525 is kept. */
2526 loop->orig_loop_num = nloop->num;
2527 nloop->orig_loop_num = nloop->num;
2528 nloop->dont_vectorize = true;
2529 nloop->force_vectorize = false;
2530
2531 if (call_stmt)
2532 {
2533 /* Record new loop's num in IFN_LOOP_DIST_ALIAS because the original
2534 loop could be destroyed. */
2535 arg0 = build_int_cst (integer_type_node, loop->orig_loop_num);
2536 gimple_call_set_arg (call_stmt, 0, arg0);
2537 gimple_seq_add_stmt_without_update (&cond_stmts, call_stmt);
2538 }
2539
2540 if (cond_stmts)
2541 {
2542 gimple_stmt_iterator cond_gsi = gsi_last_bb (cond_bb);
2543 gsi_insert_seq_before (&cond_gsi, cond_stmts, GSI_SAME_STMT);
2544 }
2545 update_ssa (TODO_update_ssa);
2546 }
2547
2548 /* Return true if loop versioning is needed to distribute PARTITIONS.
2549    ALIAS_DDRS are data dependence relations for runtime alias checks.  */
2550
2551 static inline bool
2552 version_for_distribution_p (vec<struct partition *> *partitions,
2553 vec<ddr_p> *alias_ddrs)
2554 {
2555 /* No need to version loop if we have only one partition. */
2556 if (partitions->length () == 1)
2557 return false;
2558
2559 /* Need to version loop if runtime alias check is necessary. */
2560 return (alias_ddrs->length () > 0);
2561 }
2562
2563 /* Compare base offset of builtin mem* partitions P1 and P2. */
2564
2565 static int
2566 offset_cmp (const void *vp1, const void *vp2)
2567 {
2568 struct partition *p1 = *(struct partition *const *) vp1;
2569 struct partition *p2 = *(struct partition *const *) vp2;
2570 unsigned HOST_WIDE_INT o1 = p1->builtin->dst_base_offset;
2571 unsigned HOST_WIDE_INT o2 = p2->builtin->dst_base_offset;
2572 return (o2 < o1) - (o1 < o2);
2573 }
2574
2575 /* Fuse adjacent memset builtin PARTITIONS if possible. This is a special
2576 case optimization transforming below code:
2577
2578 __builtin_memset (&obj, 0, 100);
2579 _1 = &obj + 100;
2580 __builtin_memset (_1, 0, 200);
2581 _2 = &obj + 300;
2582 __builtin_memset (_2, 0, 100);
2583
2584 into:
2585
2586 __builtin_memset (&obj, 0, 400);
2587
2588    Note we don't have dependence information between different partitions
2589    at this point; as a result, we can't handle nonadjacent memset builtin
2590    partitions since dependences might be broken.  */
2591
2592 static void
2593 fuse_memset_builtins (vec<struct partition *> *partitions)
2594 {
2595 unsigned i, j;
2596 struct partition *part1, *part2;
2597 tree rhs1, rhs2;
2598
2599 for (i = 0; partitions->iterate (i, &part1);)
2600 {
2601 if (part1->kind != PKIND_MEMSET)
2602 {
2603 i++;
2604 continue;
2605 }
2606
2607 /* Find sub-array of memset builtins of the same base. Index range
2608 of the sub-array is [i, j) with "j > i". */
2609 for (j = i + 1; partitions->iterate (j, &part2); ++j)
2610 {
2611 if (part2->kind != PKIND_MEMSET
2612 || !operand_equal_p (part1->builtin->dst_base_base,
2613 part2->builtin->dst_base_base, 0))
2614 break;
2615
2616 /* Memset calls setting different values can't be merged. */
2617 rhs1 = gimple_assign_rhs1 (DR_STMT (part1->builtin->dst_dr));
2618 rhs2 = gimple_assign_rhs1 (DR_STMT (part2->builtin->dst_dr));
2619 if (!operand_equal_p (rhs1, rhs2, 0))
2620 break;
2621 }
2622
2623 /* Stable sort is required in order to avoid breaking dependence. */
2624 gcc_stablesort (&(*partitions)[i], j - i, sizeof (*partitions)[i],
2625 offset_cmp);
2626 /* Continue with next partition. */
2627 i = j;
2628 }
2629
2630 /* Merge all consecutive memset builtin partitions. */
2631 for (i = 0; i < partitions->length () - 1;)
2632 {
2633 part1 = (*partitions)[i];
2634 if (part1->kind != PKIND_MEMSET)
2635 {
2636 i++;
2637 continue;
2638 }
2639
2640 part2 = (*partitions)[i + 1];
2641 /* Only merge memset partitions of the same base and with constant
2642 access sizes. */
2643 if (part2->kind != PKIND_MEMSET
2644 || TREE_CODE (part1->builtin->size) != INTEGER_CST
2645 || TREE_CODE (part2->builtin->size) != INTEGER_CST
2646 || !operand_equal_p (part1->builtin->dst_base_base,
2647 part2->builtin->dst_base_base, 0))
2648 {
2649 i++;
2650 continue;
2651 }
2652 rhs1 = gimple_assign_rhs1 (DR_STMT (part1->builtin->dst_dr));
2653 rhs2 = gimple_assign_rhs1 (DR_STMT (part2->builtin->dst_dr));
2654 int bytev1 = const_with_all_bytes_same (rhs1);
2655 int bytev2 = const_with_all_bytes_same (rhs2);
2656 /* Only merge memset partitions of the same value. */
2657 if (bytev1 != bytev2 || bytev1 == -1)
2658 {
2659 i++;
2660 continue;
2661 }
2662 wide_int end1 = wi::add (part1->builtin->dst_base_offset,
2663 wi::to_wide (part1->builtin->size));
2664 /* Only merge adjacent memset partitions. */
2665 if (wi::ne_p (end1, part2->builtin->dst_base_offset))
2666 {
2667 i++;
2668 continue;
2669 }
2670 /* Merge partitions[i] and partitions[i+1]. */
2671 part1->builtin->size = fold_build2 (PLUS_EXPR, sizetype,
2672 part1->builtin->size,
2673 part2->builtin->size);
2674 partition_free (part2);
2675 partitions->ordered_remove (i + 1);
2676 }
2677 }
2678
2679 /* Fuse PARTITIONS of LOOP if necessary before finalizing distribution.
2680 ALIAS_DDRS contains ddrs which need runtime alias check. */
2681
2682 static void
2683 finalize_partitions (struct loop *loop, vec<struct partition *> *partitions,
2684 vec<ddr_p> *alias_ddrs)
2685 {
2686 unsigned i;
2687 struct partition *partition, *a;
2688
2689 if (partitions->length () == 1
2690 || alias_ddrs->length () > 0)
2691 return;
2692
2693 unsigned num_builtin = 0, num_normal = 0, num_partial_memset = 0;
2694 bool same_type_p = true;
2695 enum partition_type type = ((*partitions)[0])->type;
2696 for (i = 0; partitions->iterate (i, &partition); ++i)
2697 {
2698 same_type_p &= (type == partition->type);
2699 if (partition_builtin_p (partition))
2700 {
2701 num_builtin++;
2702 continue;
2703 }
2704 num_normal++;
2705 if (partition->kind == PKIND_PARTIAL_MEMSET)
2706 num_partial_memset++;
2707 }
2708
2709   /* Don't distribute the current loop into too many loops given that we
2710      don't have a memory stream cost model.  Be even more conservative in
2711      the case of loop nest distribution.  */
2712 if ((same_type_p && num_builtin == 0
2713 && (loop->inner == NULL || num_normal != 2 || num_partial_memset != 1))
2714 || (loop->inner != NULL
2715 && i >= NUM_PARTITION_THRESHOLD && num_normal > 1)
2716 || (loop->inner == NULL
2717 && i >= NUM_PARTITION_THRESHOLD && num_normal > num_builtin))
2718 {
2719 a = (*partitions)[0];
2720 for (i = 1; partitions->iterate (i, &partition); ++i)
2721 {
2722 partition_merge_into (NULL, a, partition, FUSE_FINALIZE);
2723 partition_free (partition);
2724 }
2725 partitions->truncate (1);
2726 }
2727
2728 /* Fuse memset builtins if possible. */
2729 if (partitions->length () > 1)
2730 fuse_memset_builtins (partitions);
2731 }
2732
2733 /* Distributes the code from LOOP in such a way that producer statements
2734 are placed before consumer statements. Tries to separate only the
2735 statements from STMTS into separate loops. Returns the number of
2736 distributed loops. Set NB_CALLS to number of generated builtin calls.
2737 Set *DESTROY_P to whether LOOP needs to be destroyed. */
2738
2739 static int
2740 distribute_loop (struct loop *loop, vec<gimple *> stmts,
2741 control_dependences *cd, int *nb_calls, bool *destroy_p)
2742 {
2743 ddrs_table = new hash_table<ddr_hasher> (389);
2744 struct graph *rdg;
2745 partition *partition;
2746 bool any_builtin;
2747 int i, nbp;
2748
2749 *destroy_p = false;
2750 *nb_calls = 0;
2751 loop_nest.create (0);
2752 if (!find_loop_nest (loop, &loop_nest))
2753 {
2754 loop_nest.release ();
2755 delete ddrs_table;
2756 return 0;
2757 }
2758
2759 datarefs_vec.create (20);
2760 rdg = build_rdg (loop, cd);
2761 if (!rdg)
2762 {
2763 if (dump_file && (dump_flags & TDF_DETAILS))
2764 fprintf (dump_file,
2765 "Loop %d not distributed: failed to build the RDG.\n",
2766 loop->num);
2767
2768 loop_nest.release ();
2769 free_data_refs (datarefs_vec);
2770 delete ddrs_table;
2771 return 0;
2772 }
2773
2774 if (datarefs_vec.length () > MAX_DATAREFS_NUM)
2775 {
2776 if (dump_file && (dump_flags & TDF_DETAILS))
2777 fprintf (dump_file,
2778 "Loop %d not distributed: too many memory references.\n",
2779 loop->num);
2780
2781 free_rdg (rdg);
2782 loop_nest.release ();
2783 free_data_refs (datarefs_vec);
2784 delete ddrs_table;
2785 return 0;
2786 }
2787
2788 data_reference_p dref;
2789 for (i = 0; datarefs_vec.iterate (i, &dref); ++i)
2790 dref->aux = (void *) (uintptr_t) i;
2791
2792 if (dump_file && (dump_flags & TDF_DETAILS))
2793 dump_rdg (dump_file, rdg);
2794
2795 auto_vec<struct partition *, 3> partitions;
2796 rdg_build_partitions (rdg, stmts, &partitions);
2797
2798 auto_vec<ddr_p> alias_ddrs;
2799
2800 auto_bitmap stmt_in_all_partitions;
2801 bitmap_copy (stmt_in_all_partitions, partitions[0]->stmts);
2802 for (i = 1; partitions.iterate (i, &partition); ++i)
2803 bitmap_and_into (stmt_in_all_partitions, partitions[i]->stmts);
2804
2805 any_builtin = false;
2806 FOR_EACH_VEC_ELT (partitions, i, partition)
2807 {
2808 classify_partition (loop, rdg, partition, stmt_in_all_partitions);
2809 any_builtin |= partition_builtin_p (partition);
2810 }
2811
2812 /* If we are only distributing patterns but did not detect any,
2813 simply bail out. */
2814 if (!flag_tree_loop_distribution
2815 && !any_builtin)
2816 {
2817 nbp = 0;
2818 goto ldist_done;
2819 }
2820
2821   /* If we are only distributing patterns, fuse all partitions that
2822      were not classified as builtins.  This also avoids chopping
2823      a loop into pieces, separated by builtin calls.  That is, we
2824      only want no loop body or a single loop body remaining.  */
2825 struct partition *into;
2826 if (!flag_tree_loop_distribution)
2827 {
2828 for (i = 0; partitions.iterate (i, &into); ++i)
2829 if (!partition_builtin_p (into))
2830 break;
2831 for (++i; partitions.iterate (i, &partition); ++i)
2832 if (!partition_builtin_p (partition))
2833 {
2834 partition_merge_into (NULL, into, partition, FUSE_NON_BUILTIN);
2835 partitions.unordered_remove (i);
2836 partition_free (partition);
2837 i--;
2838 }
2839 }
2840
2841 /* Due to limitations in the transform phase we have to fuse all
2842 reduction partitions into the last partition so the existing
2843 loop will contain all loop-closed PHI nodes. */
2844 for (i = 0; partitions.iterate (i, &into); ++i)
2845 if (partition_reduction_p (into))
2846 break;
2847 for (i = i + 1; partitions.iterate (i, &partition); ++i)
2848 if (partition_reduction_p (partition))
2849 {
2850 partition_merge_into (rdg, into, partition, FUSE_REDUCTION);
2851 partitions.unordered_remove (i);
2852 partition_free (partition);
2853 i--;
2854 }
2855
2856 /* Apply our simple cost model - fuse partitions with similar
2857 memory accesses. */
2858 for (i = 0; partitions.iterate (i, &into); ++i)
2859 {
2860 bool changed = false;
2861 if (partition_builtin_p (into) || into->kind == PKIND_PARTIAL_MEMSET)
2862 continue;
2863 for (int j = i + 1;
2864 partitions.iterate (j, &partition); ++j)
2865 {
2866 if (share_memory_accesses (rdg, into, partition))
2867 {
2868 partition_merge_into (rdg, into, partition, FUSE_SHARE_REF);
2869 partitions.unordered_remove (j);
2870 partition_free (partition);
2871 j--;
2872 changed = true;
2873 }
2874 }
2875       /* If in the first step we fused partitions 0 1 2 into 0,2 and 1
2876 	 because 0 and 2 have similar accesses, and 1 and 2 also share
2877 	 accesses but 0 and 1 do not, we would fail to merge 1 into 0,2
2878 	 in later iterations.  So try again if we did any merging into 0.  */
2879 if (changed)
2880 i--;
2881 }
2882
2883 /* Build the partition dependency graph and fuse partitions in strong
2884 connected component. */
2885 if (partitions.length () > 1)
2886 {
2887 /* Don't support loop nest distribution under runtime alias check
2888 since it's not likely to enable many vectorization opportunities. */
2889 if (loop->inner)
2890 merge_dep_scc_partitions (rdg, &partitions, false);
2891 else
2892 {
2893 merge_dep_scc_partitions (rdg, &partitions, true);
2894 if (partitions.length () > 1)
2895 break_alias_scc_partitions (rdg, &partitions, &alias_ddrs);
2896 }
2897 }
2898
2899 finalize_partitions (loop, &partitions, &alias_ddrs);
2900
2901 nbp = partitions.length ();
2902 if (nbp == 0
2903 || (nbp == 1 && !partition_builtin_p (partitions[0]))
2904 || (nbp > 1 && partition_contains_all_rw (rdg, partitions)))
2905 {
2906 nbp = 0;
2907 goto ldist_done;
2908 }
2909
2910 if (version_for_distribution_p (&partitions, &alias_ddrs))
2911 version_loop_by_alias_check (&partitions, loop, &alias_ddrs);
2912
2913 if (dump_file && (dump_flags & TDF_DETAILS))
2914 {
2915 fprintf (dump_file,
2916 "distribute loop <%d> into partitions:\n", loop->num);
2917 dump_rdg_partitions (dump_file, partitions);
2918 }
2919
2920 FOR_EACH_VEC_ELT (partitions, i, partition)
2921 {
2922 if (partition_builtin_p (partition))
2923 (*nb_calls)++;
2924 *destroy_p |= generate_code_for_partition (loop, partition, i < nbp - 1);
2925 }
2926
2927 ldist_done:
2928 loop_nest.release ();
2929 free_data_refs (datarefs_vec);
2930 for (hash_table<ddr_hasher>::iterator iter = ddrs_table->begin ();
2931 iter != ddrs_table->end (); ++iter)
2932 {
2933 free_dependence_relation (*iter);
2934 *iter = NULL;
2935 }
2936 delete ddrs_table;
2937
2938 FOR_EACH_VEC_ELT (partitions, i, partition)
2939 partition_free (partition);
2940
2941 free_rdg (rdg);
2942 return nbp - *nb_calls;
2943 }
2944
2945 /* Distribute all loops in the current function. */
2946
2947 namespace {
2948
2949 const pass_data pass_data_loop_distribution =
2950 {
2951 GIMPLE_PASS, /* type */
2952 "ldist", /* name */
2953 OPTGROUP_LOOP, /* optinfo_flags */
2954 TV_TREE_LOOP_DISTRIBUTION, /* tv_id */
2955 ( PROP_cfg | PROP_ssa ), /* properties_required */
2956 0, /* properties_provided */
2957 0, /* properties_destroyed */
2958 0, /* todo_flags_start */
2959 0, /* todo_flags_finish */
2960 };
2961
2962 class pass_loop_distribution : public gimple_opt_pass
2963 {
2964 public:
2965 pass_loop_distribution (gcc::context *ctxt)
2966 : gimple_opt_pass (pass_data_loop_distribution, ctxt)
2967 {}
2968
2969 /* opt_pass methods: */
2970 virtual bool gate (function *)
2971 {
2972 return flag_tree_loop_distribution
2973 || flag_tree_loop_distribute_patterns;
2974 }
2975
2976 virtual unsigned int execute (function *);
2977
2978 }; // class pass_loop_distribution
2979
2980
2981 /* Given LOOP, this function records seed statements for distribution in
2982 WORK_LIST. Return false if there is nothing for distribution. */
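/* E.g. (an illustrative loop, not from any testcase):

     |for (i = 0; i < n; i++)
     |  {
     |    a[i] = 0;		<-- store, seeds a partition
     |    sum = sum + b[i];	<-- value used after the loop, seeds a partition
     |  }

   Both statements end up in WORK_LIST.  A statement with side effects,
   such as a call to printf, makes the function give up and return false.  */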
2983
2984 static bool
2985 find_seed_stmts_for_distribution (struct loop *loop, vec<gimple *> *work_list)
2986 {
2987 basic_block *bbs = get_loop_body_in_dom_order (loop);
2988
2989 /* Initialize the worklist with stmts we seed the partitions with. */
2990 for (unsigned i = 0; i < loop->num_nodes; ++i)
2991 {
2992 for (gphi_iterator gsi = gsi_start_phis (bbs[i]);
2993 !gsi_end_p (gsi); gsi_next (&gsi))
2994 {
2995 gphi *phi = gsi.phi ();
2996 if (virtual_operand_p (gimple_phi_result (phi)))
2997 continue;
2998 /* Distribute stmts which have defs that are used outside of
2999 the loop. */
3000 if (!stmt_has_scalar_dependences_outside_loop (loop, phi))
3001 continue;
3002 work_list->safe_push (phi);
3003 }
3004 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
3005 !gsi_end_p (gsi); gsi_next (&gsi))
3006 {
3007 gimple *stmt = gsi_stmt (gsi);
3008
3009 /* If there is a stmt with side-effects bail out - we
3010 cannot and should not distribute this loop. */
3011 if (gimple_has_side_effects (stmt))
3012 {
3013 free (bbs);
3014 return false;
3015 }
3016
3017 /* Distribute stmts which have defs that are used outside of
3018 the loop. */
3019 if (stmt_has_scalar_dependences_outside_loop (loop, stmt))
3020 ;
3021 /* Otherwise only distribute stores for now. */
3022 else if (!gimple_vdef (stmt))
3023 continue;
3024
3025 work_list->safe_push (stmt);
3026 }
3027 }
3028 free (bbs);
3029 return work_list->length () > 0;
3030 }
3031
3032 /* Given innermost LOOP, return the outermost enclosing loop that forms a
3033 perfect loop nest. */
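/* For instance (illustrative):

     |for (i = 0; i < n; i++)		<-- returned, if it qualifies
     |  for (j = 0; j < m; j++)		<-- innermost LOOP passed in
     |    a[i][j] = 0;

   Starting from the innermost loop, the walk moves outwards as long as the
   enclosing loop has a single exit, a computable number of iterations, is
   optimized for speed and has LOOP's nest as its only inner loop; here the
   outer loop would be returned.  */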
3034
3035 static struct loop *
3036 prepare_perfect_loop_nest (struct loop *loop)
3037 {
3038 struct loop *outer = loop_outer (loop);
3039 tree niters = number_of_latch_executions (loop);
3040
3041   /* TODO: We only support the innermost 3-level loop nest distribution
3042      because of compilation time issues for now.  This should be relaxed
3043      in the future.  Note we only allow 3-level loop nest distribution
3044      when parallelizing loops.  */
3045 while ((loop->inner == NULL
3046 || (loop->inner->inner == NULL && flag_tree_parallelize_loops > 1))
3047 && loop_outer (outer)
3048 && outer->inner == loop && loop->next == NULL
3049 && single_exit (outer)
3050 && optimize_loop_for_speed_p (outer)
3051 && !chrec_contains_symbols_defined_in_loop (niters, outer->num)
3052 && (niters = number_of_latch_executions (outer)) != NULL_TREE
3053 && niters != chrec_dont_know)
3054 {
3055 loop = outer;
3056 outer = loop_outer (loop);
3057 }
3058
3059 return loop;
3060 }
3061
3062 unsigned int
3063 pass_loop_distribution::execute (function *fun)
3064 {
3065 struct loop *loop;
3066 bool changed = false;
3067 basic_block bb;
3068 control_dependences *cd = NULL;
3069 auto_vec<loop_p> loops_to_be_destroyed;
3070
3071 if (number_of_loops (fun) <= 1)
3072 return 0;
3073
3074 /* Compute topological order for basic blocks. Topological order is
3075 needed because data dependence is computed for data references in
3076 lexicographical order. */
3077 if (bb_top_order_index == NULL)
3078 {
3079 int rpo_num;
3080 int *rpo = XNEWVEC (int, last_basic_block_for_fn (cfun));
3081
3082 bb_top_order_index = XNEWVEC (int, last_basic_block_for_fn (cfun));
3083 bb_top_order_index_size = last_basic_block_for_fn (cfun);
3084 rpo_num = pre_and_rev_post_order_compute_fn (cfun, NULL, rpo, true);
3085 for (int i = 0; i < rpo_num; i++)
3086 bb_top_order_index[rpo[i]] = i;
3087
3088 free (rpo);
3089 }
3090
3091 FOR_ALL_BB_FN (bb, fun)
3092 {
3093 gimple_stmt_iterator gsi;
3094 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
3095 gimple_set_uid (gsi_stmt (gsi), -1);
3096 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
3097 gimple_set_uid (gsi_stmt (gsi), -1);
3098 }
3099
3100 /* We can at the moment only distribute non-nested loops, thus restrict
3101 walking to innermost loops. */
3102 FOR_EACH_LOOP (loop, LI_ONLY_INNERMOST)
3103 {
3104       /* Don't distribute loops with multiple exit edges, or cold loops.  */
3105 if (!single_exit (loop)
3106 || !optimize_loop_for_speed_p (loop))
3107 continue;
3108
3109 /* Don't distribute loop if niters is unknown. */
3110 tree niters = number_of_latch_executions (loop);
3111 if (niters == NULL_TREE || niters == chrec_dont_know)
3112 continue;
3113
3114 /* Get the perfect loop nest for distribution. */
3115 loop = prepare_perfect_loop_nest (loop);
3116 for (; loop; loop = loop->inner)
3117 {
3118 auto_vec<gimple *> work_list;
3119 if (!find_seed_stmts_for_distribution (loop, &work_list))
3120 break;
3121
3122 const char *str = loop->inner ? " nest" : "";
3123 dump_user_location_t loc = find_loop_location (loop);
3124 if (!cd)
3125 {
3126 calculate_dominance_info (CDI_DOMINATORS);
3127 calculate_dominance_info (CDI_POST_DOMINATORS);
3128 cd = new control_dependences ();
3129 free_dominance_info (CDI_POST_DOMINATORS);
3130 }
3131
3132 bool destroy_p;
3133 int nb_generated_loops, nb_generated_calls;
3134 nb_generated_loops = distribute_loop (loop, work_list, cd,
3135 &nb_generated_calls,
3136 &destroy_p);
3137 if (destroy_p)
3138 loops_to_be_destroyed.safe_push (loop);
3139
3140 if (nb_generated_loops + nb_generated_calls > 0)
3141 {
3142 changed = true;
3143 if (dump_enabled_p ())
3144 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS,
3145 loc, "Loop%s %d distributed: split to %d loops "
3146 "and %d library calls.\n", str, loop->num,
3147 nb_generated_loops, nb_generated_calls);
3148
3149 break;
3150 }
3151
3152 if (dump_file && (dump_flags & TDF_DETAILS))
3153 fprintf (dump_file, "Loop%s %d not distributed.\n", str, loop->num);
3154 }
3155 }
3156
3157 if (cd)
3158 delete cd;
3159
3160 if (bb_top_order_index != NULL)
3161 {
3162 free (bb_top_order_index);
3163 bb_top_order_index = NULL;
3164 bb_top_order_index_size = 0;
3165 }
3166
3167 if (changed)
3168 {
3169       /* Destroy loop bodies that could not be reused.  Do this late as we
3170 	 otherwise can end up referring to stale data in control dependences.  */
3171 unsigned i;
3172 FOR_EACH_VEC_ELT (loops_to_be_destroyed, i, loop)
3173 destroy_loop (loop);
3174
3175 /* Cached scalar evolutions now may refer to wrong or non-existing
3176 loops. */
3177 scev_reset_htab ();
3178 mark_virtual_operands_for_renaming (fun);
3179 rewrite_into_loop_closed_ssa (NULL, TODO_update_ssa);
3180 }
3181
3182 checking_verify_loop_structure ();
3183
3184 return changed ? TODO_cleanup_cfg : 0;
3185 }
3186
3187 } // anon namespace
3188
3189 gimple_opt_pass *
3190 make_pass_loop_distribution (gcc::context *ctxt)
3191 {
3192 return new pass_loop_distribution (ctxt);
3193 }