1/* Array prefetching.
2 Copyright (C) 2005 Free Software Foundation, Inc.
3
4This file is part of GCC.
5
6GCC is free software; you can redistribute it and/or modify it
7under the terms of the GNU General Public License as published by the
8Free Software Foundation; either version 2, or (at your option) any
9later version.
10
11GCC is distributed in the hope that it will be useful, but WITHOUT
12ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14for more details.
15
16You should have received a copy of the GNU General Public License
17along with GCC; see the file COPYING. If not, write to the Free
18Software Foundation, 59 Temple Place - Suite 330, Boston, MA
19 02111-1307, USA. */
20
21#include "config.h"
22#include "system.h"
23#include "coretypes.h"
24#include "tm.h"
25#include "tree.h"
26#include "rtl.h"
27#include "tm_p.h"
28#include "hard-reg-set.h"
29#include "basic-block.h"
30#include "output.h"
31#include "diagnostic.h"
32#include "tree-flow.h"
33#include "tree-dump.h"
34#include "timevar.h"
35#include "cfgloop.h"
36#include "varray.h"
37#include "expr.h"
38#include "tree-pass.h"
39#include "ggc.h"
40#include "insn-config.h"
41#include "recog.h"
42#include "hashtab.h"
43#include "tree-chrec.h"
44#include "tree-scalar-evolution.h"
45#include "toplev.h"
46#include "params.h"
47#include "langhooks.h"
48
49/* This pass inserts prefetch instructions to optimize cache usage during
50 accesses to arrays in loops. It processes loops sequentially and:
51
52 1) Gathers all memory references in the single loop.
53 2) For each of the references it decides when it is profitable to prefetch
54 it. To do it, we evaluate the reuse among the accesses, and determine
55 two values: PREFETCH_BEFORE (meaning that it only makes sense to do
56 prefetching in the first PREFETCH_BEFORE iterations of the loop) and
57 PREFETCH_MOD (meaning that it only makes sense to prefetch in the
58 iterations of the loop that are zero modulo PREFETCH_MOD). For example
59 (assuming cache line size is 64 bytes, char has size 1 byte and there
60 is no hardware sequential prefetch):
61
62 char *a;
63 for (i = 0; i < max; i++)
64 {
65 a[255] = ...; (0)
66 a[i] = ...; (1)
67 a[i + 64] = ...; (2)
68 a[16*i] = ...; (3)
69 a[187*i] = ...; (4)
70 a[187*i + 50] = ...; (5)
71 }
72
73 (0) obviously has PREFETCH_BEFORE 1
74 (1) has PREFETCH_BEFORE 64, since (2) accesses the same memory
75 location 64 iterations before it, and PREFETCH_MOD 64 (since
76 it hits the same cache line otherwise).
77 (2) has PREFETCH_MOD 64
78 (3) has PREFETCH_MOD 4
79 (4) has PREFETCH_MOD 1. We do not set PREFETCH_BEFORE here, since
80 the cache line accessed by (4) is the same with probability only
81 7/32.
82 (5) has PREFETCH_MOD 1 as well.
83
84 3) We determine how much ahead we need to prefetch. The number of
85 iterations needed is time to fetch / time spent in one iteration of
86 the loop. The problem is that we do not know either of these values,
87 so we just make a heuristic guess based on a magic (possibly)
88 target-specific constant and size of the loop.
89
90 4) Determine which of the references we prefetch. We take into account
91 that there is a maximum number of simultaneous prefetches (provided
92 by the machine description). We issue as many prefetches as possible
93 while still within this bound (starting with those with lowest
94 prefetch_mod, since they are responsible for most of the cache
95 misses).
96
97 5) We unroll and peel loops so that we are able to satisfy PREFETCH_MOD
98 and PREFETCH_BEFORE requirements (within some bounds), and to avoid
99 prefetching nonaccessed memory.
100 TODO -- actually implement peeling.
101
102 6) We actually emit the prefetch instructions. ??? Perhaps emit the
103 prefetch instructions with guards in cases where 5) was not sufficient
104 to satisfy the constraints?
105
106 Some other TODO:
107 -- write and use more general reuse analysis (that could also be used
108 in other cache-aimed loop optimizations)
109 -- make it behave sanely together with the prefetches given by user
110 (now we just ignore them; at the very least we should avoid
111 optimizing loops in which the user put their own prefetches)
112 -- we assume cache line size alignment of arrays; this could be
113 improved. */
114
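/* As a rough sketch of the intended effect (illustrative only -- the exact
   shape depends on the target constants below and on the unroll factor that
   is chosen), a loop such as

	for (i = 0; i < n; i++)
	  a[i] = b[i] + 1;

   conceptually becomes, with a hypothetical unroll factor of 4 and AHEAD
   iterations of lookahead,

	for (i = 0; i < n; i += 4)
	  {
	    __builtin_prefetch (&b[i + AHEAD], 0);
	    __builtin_prefetch (&a[i + AHEAD], 1);
	    a[i] = b[i] + 1;
	    a[i + 1] = b[i + 1] + 1;
	    a[i + 2] = b[i + 2] + 1;
	    a[i + 3] = b[i + 3] + 1;
	  }

   where AHEAD is derived from PREFETCH_LATENCY and the size of the loop
   body (see loop_prefetch_arrays).  */
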
115/* Magic constants follow. These should be replaced by machine specific
116 numbers. */
117
118/* A number that should roughly correspond to the number of instructions
119 executed before the prefetch is completed. */
120
121#ifndef PREFETCH_LATENCY
122#define PREFETCH_LATENCY 200
123#endif
124
125/* Number of prefetches that can run at the same time. */
126
127#ifndef SIMULTANEOUS_PREFETCHES
128#define SIMULTANEOUS_PREFETCHES 3
129#endif
130
131/* True if write can be prefetched by a read prefetch. */
132
133#ifndef WRITE_CAN_USE_READ_PREFETCH
134#define WRITE_CAN_USE_READ_PREFETCH 1
135#endif
136
137/* True if read can be prefetched by a write prefetch. */
138
139#ifndef READ_CAN_USE_WRITE_PREFETCH
140#define READ_CAN_USE_WRITE_PREFETCH 0
141#endif
142
143/* Cache line size. Assumed to be a power of two. */
144
145#ifndef PREFETCH_BLOCK
146#define PREFETCH_BLOCK 32
147#endif
148
149/* Do we have forward hardware sequential prefetching? */
150
151#ifndef HAVE_FORWARD_PREFETCH
152#define HAVE_FORWARD_PREFETCH 0
153#endif
154
155/* Do we have backward hardware sequential prefetching? */
156
157#ifndef HAVE_BACKWARD_PREFETCH
158#define HAVE_BACKWARD_PREFETCH 0
159#endif
160
161/* In some cases we are only able to determine that there is a certain
162 probability that the two accesses hit the same cache line. In this
163 case, we issue the prefetches for both of them if this probability
164 is less than (1000 - ACCEPTABLE_MISS_RATE) per mille. */
165
166#ifndef ACCEPTABLE_MISS_RATE
167#define ACCEPTABLE_MISS_RATE 50
168#endif
169
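/* For example, with the defaults above (PREFETCH_BLOCK 32,
   ACCEPTABLE_MISS_RATE 50), prefetches are issued for both accesses unless
   the probability that they share a cache line is at least 95%;
   prune_ref_by_group_reuse approximates that probability from the distance
   of the two accesses within the (reduced) cache line.  */
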
170#ifndef HAVE_prefetch
171#define HAVE_prefetch 0
172#endif
173
174/* The group of references between which reuse may occur. */
175
176struct mem_ref_group
177{
178 tree base; /* Base of the reference. */
179 HOST_WIDE_INT step; /* Step of the reference. */
180 struct mem_ref *refs; /* References in the group. */
181 struct mem_ref_group *next; /* Next group of references. */
182};
183
184/* Assigned to PREFETCH_BEFORE when all iterations are to be prefetched. */
185
186#define PREFETCH_ALL (~(unsigned HOST_WIDE_INT) 0)
187
188/* The memory reference. */
189
190struct mem_ref
191{
192 tree stmt; /* Statement in which the reference appears. */
193 tree mem; /* The reference. */
194 HOST_WIDE_INT delta; /* Constant offset of the reference. */
195 bool write_p; /* Is it a write? */
196 struct mem_ref_group *group; /* The group of references it belongs to. */
197 unsigned HOST_WIDE_INT prefetch_mod;
198 /* Prefetch only each PREFETCH_MOD-th
199 iteration. */
200 unsigned HOST_WIDE_INT prefetch_before;
201 /* Prefetch only first PREFETCH_BEFORE
202 iterations. */
203 bool issue_prefetch_p; /* Should we really issue the prefetch? */
204 struct mem_ref *next; /* The next reference in the group. */
205};
206
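/* To illustrate these structures with the example loop from the head of
   this file (a sketch only): a[i] and a[i + 64] end up in one
   mem_ref_group with base "a" and step 1, represented by two mem_refs
   with delta 0 and delta 64, while a[16 * i] goes into a separate group
   with step 16.  */
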
207/* Dumps information about reference REF to FILE. */
208
209static void
210dump_mem_ref (FILE *file, struct mem_ref *ref)
211{
212 fprintf (file, "Reference %p:\n", (void *) ref);
213
214 fprintf (file, " group %p (base ", (void *) ref->group);
215 print_generic_expr (file, ref->group->base, TDF_SLIM);
216 fprintf (file, ", step ");
217 fprintf (file, HOST_WIDE_INT_PRINT_DEC, ref->group->step);
218 fprintf (file, ")\n");
219
220 fprintf (file, " delta ");
221 fprintf (file, HOST_WIDE_INT_PRINT_DEC, ref->delta);
222 fprintf (file, "\n");
223
224 fprintf (file, " %s\n", ref->write_p ? "write" : "read");
225
226 fprintf (file, "\n");
227}
228
229/* Finds a group with BASE and STEP in GROUPS, or creates one if it does not
230 exist. */
231
232static struct mem_ref_group *
233find_or_create_group (struct mem_ref_group **groups, tree base,
234 HOST_WIDE_INT step)
235{
236 struct mem_ref_group *group;
237
238 for (; *groups; groups = &(*groups)->next)
239 {
240 if ((*groups)->step == step
241 && operand_equal_p ((*groups)->base, base, 0))
242 return *groups;
243
244 /* Keep the list of groups sorted by decreasing step. */
245 if ((*groups)->step < step)
246 break;
247 }
248
249 group = xcalloc (1, sizeof (struct mem_ref_group));
250 group->base = base;
251 group->step = step;
252 group->refs = NULL;
253 group->next = *groups;
254 *groups = group;
255
256 return group;
257}
258
259/* Records a memory reference MEM in GROUP with offset DELTA and write status
260 WRITE_P. The reference occurs in statement STMT. */
261
262static void
263record_ref (struct mem_ref_group *group, tree stmt, tree mem,
264 HOST_WIDE_INT delta, bool write_p)
265{
266 struct mem_ref **aref;
267
268 /* Do not record the same address twice. */
269 for (aref = &group->refs; *aref; aref = &(*aref)->next)
270 {
271 /* It need not be possible for a write reference to reuse a read
272 prefetch, or vice versa. */
273 if (!WRITE_CAN_USE_READ_PREFETCH
274 && write_p
275 && !(*aref)->write_p)
276 continue;
277 if (!READ_CAN_USE_WRITE_PREFETCH
278 && !write_p
279 && (*aref)->write_p)
280 continue;
281
282 if ((*aref)->delta == delta)
283 return;
284 }
285
286 (*aref) = xcalloc (1, sizeof (struct mem_ref));
287 (*aref)->stmt = stmt;
288 (*aref)->mem = mem;
289 (*aref)->delta = delta;
290 (*aref)->write_p = write_p;
291 (*aref)->prefetch_before = PREFETCH_ALL;
292 (*aref)->prefetch_mod = 1;
293 (*aref)->issue_prefetch_p = false;
294 (*aref)->group = group;
295 (*aref)->next = NULL;
296
297 if (dump_file && (dump_flags & TDF_DETAILS))
298 dump_mem_ref (dump_file, *aref);
299}
300
301/* Release memory references in GROUPS. */
302
303static void
304release_mem_refs (struct mem_ref_group *groups)
305{
306 struct mem_ref_group *next_g;
307 struct mem_ref *ref, *next_r;
308
309 for (; groups; groups = next_g)
310 {
311 next_g = groups->next;
312 for (ref = groups->refs; ref; ref = next_r)
313 {
314 next_r = ref->next;
315 free (ref);
316 }
317 free (groups);
318 }
319}
320
321/* A structure used to pass arguments to idx_analyze_ref. */
322
323struct ar_data
324{
325 struct loop *loop; /* Loop of the reference. */
326 tree stmt; /* Statement of the reference. */
327 HOST_WIDE_INT *step; /* Step of the memory reference. */
328 HOST_WIDE_INT *delta; /* Offset of the memory reference. */
329};
330
331/* Analyzes a single INDEX of a memory reference to obtain information
332 described in analyze_ref. Callback for for_each_index. */
333
334static bool
335idx_analyze_ref (tree base, tree *index, void *data)
336{
337 struct ar_data *ar_data = data;
338 tree ibase, step, stepsize;
339 HOST_WIDE_INT istep, idelta = 0, imult = 1;
340 affine_iv iv;
341
342 if (TREE_CODE (base) == MISALIGNED_INDIRECT_REF
343 || TREE_CODE (base) == ALIGN_INDIRECT_REF)
344 return false;
345
346 if (!simple_iv (ar_data->loop, ar_data->stmt, *index, &iv, false))
347 return false;
348 ibase = iv.base;
349 step = iv.step;
350
351 if (zero_p (step))
352 istep = 0;
353 else
354 {
355 if (!cst_and_fits_in_hwi (step))
356 return false;
357 istep = int_cst_value (step);
358 }
359
360 if (TREE_CODE (ibase) == PLUS_EXPR
361 && cst_and_fits_in_hwi (TREE_OPERAND (ibase, 1)))
362 {
363 idelta = int_cst_value (TREE_OPERAND (ibase, 1));
364 ibase = TREE_OPERAND (ibase, 0);
365 }
366 if (cst_and_fits_in_hwi (ibase))
367 {
368 idelta += int_cst_value (ibase);
369 ibase = build_int_cst_type (TREE_TYPE (ibase), 0);
370 }
371
372 if (TREE_CODE (base) == ARRAY_REF)
373 {
374 stepsize = array_ref_element_size (base);
375 if (!cst_and_fits_in_hwi (stepsize))
376 return false;
377 imult = int_cst_value (stepsize);
378
379 istep *= imult;
380 idelta *= imult;
381 }
382
383 *ar_data->step += istep;
384 *ar_data->delta += idelta;
385 *index = ibase;
386
387 return true;
388}
389
390/* Tries to express REF in the form &BASE + STEP * iter + DELTA, where DELTA and
391 STEP are integer constants and iter is the number of iterations of LOOP. The
392 reference occurs in statement STMT. */
393
394static bool
395analyze_ref (struct loop *loop, tree ref, tree *base,
396 HOST_WIDE_INT *step, HOST_WIDE_INT *delta,
397 tree stmt)
398{
399 struct ar_data ar_data;
400 tree off;
401 HOST_WIDE_INT bit_offset;
402
403 *step = 0;
404 *delta = 0;
405
406 /* First strip off the component references. Ignore bitfields. */
407 if (TREE_CODE (ref) == COMPONENT_REF
408 && DECL_NONADDRESSABLE_P (TREE_OPERAND (ref, 1)))
409 ref = TREE_OPERAND (ref, 0);
410
411 for (; TREE_CODE (ref) == COMPONENT_REF; ref = TREE_OPERAND (ref, 0))
412 {
413 off = DECL_FIELD_BIT_OFFSET (TREE_OPERAND (ref, 1));
414 bit_offset = TREE_INT_CST_LOW (off);
415 gcc_assert (bit_offset % BITS_PER_UNIT == 0);
416
417 *delta += bit_offset / BITS_PER_UNIT;
418 }
419
420 *base = unshare_expr (ref);
421 ar_data.loop = loop;
422 ar_data.stmt = stmt;
423 ar_data.step = step;
424 ar_data.delta = delta;
425 return for_each_index (base, idx_analyze_ref, &ar_data);
426}
427
428/* Record a memory reference REF to the list REFS. The reference occurs in
429 LOOP in statement STMT and it is write if WRITE_P. */
430
431static void
432gather_memory_references_ref (struct loop *loop, struct mem_ref_group **refs,
433 tree ref, bool write_p, tree stmt)
434{
435 tree base;
436 HOST_WIDE_INT step, delta;
437 struct mem_ref_group *agrp;
438
439 if (!analyze_ref (loop, ref, &base, &step, &delta, stmt))
440 return;
441
442 /* Now we know that REF = &BASE + STEP * iter + DELTA, where DELTA and STEP
443 are integer constants. */
444 agrp = find_or_create_group (refs, base, step);
445 record_ref (agrp, stmt, ref, delta, write_p);
446}
447
448/* Record the suitable memory references in LOOP. */
449
450static struct mem_ref_group *
451gather_memory_references (struct loop *loop)
452{
453 basic_block *body = get_loop_body_in_dom_order (loop);
454 basic_block bb;
455 unsigned i;
456 block_stmt_iterator bsi;
457 tree stmt, lhs, rhs;
458 struct mem_ref_group *refs = NULL;
459
460 /* Scan the loop body in order, so that earlier references precede the
461 later ones. */
462 for (i = 0; i < loop->num_nodes; i++)
463 {
464 bb = body[i];
465 if (bb->loop_father != loop)
466 continue;
467
468 for (bsi = bsi_start (bb); !bsi_end_p (bsi); bsi_next (&bsi))
469 {
470 stmt = bsi_stmt (bsi);
471 if (TREE_CODE (stmt) != MODIFY_EXPR)
472 continue;
473
474 lhs = TREE_OPERAND (stmt, 0);
475 rhs = TREE_OPERAND (stmt, 1);
476
477 if (REFERENCE_CLASS_P (rhs))
478 gather_memory_references_ref (loop, &refs, rhs, false, stmt);
479 if (REFERENCE_CLASS_P (lhs))
480 gather_memory_references_ref (loop, &refs, lhs, true, stmt);
481 }
482 }
483 free (body);
484
485 return refs;
486}
487
488/* Prune the prefetch candidate REF using self-reuse. */
489
490static void
491prune_ref_by_self_reuse (struct mem_ref *ref)
492{
493 HOST_WIDE_INT step = ref->group->step;
494 bool backward = step < 0;
495
496 if (step == 0)
497 {
498 /* Prefetch references to invariant address just once. */
499 ref->prefetch_before = 1;
500 return;
501 }
502
503 if (backward)
504 step = -step;
505
506 if (step > PREFETCH_BLOCK)
507 return;
508
509 if ((backward && HAVE_BACKWARD_PREFETCH)
510 || (!backward && HAVE_FORWARD_PREFETCH))
511 {
512 ref->prefetch_before = 1;
513 return;
514 }
515
516 ref->prefetch_mod = PREFETCH_BLOCK / step;
517}
518
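/* A worked example of the above, assuming the default PREFETCH_BLOCK of 32
   and no hardware sequential prefetch: a reference with step 4 gets
   prefetch_mod = 32 / 4 = 8, i.e. one prefetch per cache line, while a
   reference with step 64 is left untouched, since consecutive iterations
   touch different cache lines anyway.  */
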
519/* Divides X by BY, rounding down. */
520
521static HOST_WIDE_INT
522ddown (HOST_WIDE_INT x, unsigned HOST_WIDE_INT by)
523{
524 gcc_assert (by > 0);
525
526 if (x >= 0)
527 return x / by;
528 else
529 return (x + by - 1) / by;
530}
531
532/* Prune the prefetch candidate REF using the reuse with BY.
533 If BY_IS_BEFORE is true, BY is before REF in the loop. */
534
535static void
536prune_ref_by_group_reuse (struct mem_ref *ref, struct mem_ref *by,
537 bool by_is_before)
538{
539 HOST_WIDE_INT step = ref->group->step;
540 bool backward = step < 0;
541 HOST_WIDE_INT delta_r = ref->delta, delta_b = by->delta;
542 HOST_WIDE_INT delta = delta_b - delta_r;
543 HOST_WIDE_INT hit_from;
544 unsigned HOST_WIDE_INT prefetch_before, prefetch_block;
545
546 if (delta == 0)
547 {
548 /* If the references have the same address, only prefetch the
549 former. */
550 if (by_is_before)
551 ref->prefetch_before = 0;
552
553 return;
554 }
555
556 if (!step)
557 {
558 /* If the reference addresses are invariant and fall into the
559 same cache line, prefetch just the first one. */
560 if (!by_is_before)
561 return;
562
563 if (ddown (ref->delta, PREFETCH_BLOCK)
564 != ddown (by->delta, PREFETCH_BLOCK))
565 return;
566
567 ref->prefetch_before = 0;
568 return;
569 }
570
571 /* Only prune the reference that is behind in the array. */
572 if (backward)
573 {
574 if (delta > 0)
575 return;
576
577 /* Transform the data so that we may assume that the accesses
578 are forward. */
579 delta = - delta;
580 step = -step;
581 delta_r = PREFETCH_BLOCK - 1 - delta_r;
582 delta_b = PREFETCH_BLOCK - 1 - delta_b;
583 }
584 else
585 {
586 if (delta < 0)
587 return;
588 }
589
590 /* Check whether the two references are likely to hit the same cache
591 line, and how far apart the iterations in which that happens are
592 from each other. */
593
594 if (step <= PREFETCH_BLOCK)
595 {
596 /* The accesses are sure to meet. Let us check when. */
597 hit_from = ddown (delta_b, PREFETCH_BLOCK) * PREFETCH_BLOCK;
598 prefetch_before = (hit_from - delta_r + step - 1) / step;
599
600 if (prefetch_before < ref->prefetch_before)
601 ref->prefetch_before = prefetch_before;
602
603 return;
604 }
605
606 /* A more complicated case. First let us ensure that the cache line size
607 and step are coprime (here we assume that PREFETCH_BLOCK is a power
608 of two). */
609 prefetch_block = PREFETCH_BLOCK;
610 while ((step & 1) == 0
611 && prefetch_block > 1)
612 {
613 step >>= 1;
614 prefetch_block >>= 1;
615 delta >>= 1;
616 }
617
618 /* Now step > prefetch_block, and step and prefetch_block are coprime.
619 Determine the probability that the accesses hit the same cache line. */
620
621 prefetch_before = delta / step;
622 delta %= step;
623 if ((unsigned HOST_WIDE_INT) delta
624 <= (prefetch_block * ACCEPTABLE_MISS_RATE / 1000))
625 {
626 if (prefetch_before < ref->prefetch_before)
627 ref->prefetch_before = prefetch_before;
628
629 return;
630 }
631
632 /* Try also the following iteration. */
633 prefetch_before++;
634 delta = step - delta;
635 if ((unsigned HOST_WIDE_INT) delta
636 <= (prefetch_block * ACCEPTABLE_MISS_RATE / 1000))
637 {
638 if (prefetch_before < ref->prefetch_before)
639 ref->prefetch_before = prefetch_before;
640
641 return;
642 }
643
644 /* REF probably does not reuse BY. */
645 return;
646}
647
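/* A worked example of the probability check above, using references (4)
   and (5) from the comment at the head of this file and assuming a 64-byte
   PREFETCH_BLOCK with the default ACCEPTABLE_MISS_RATE of 50: step = 187
   is odd, so nothing is divided out, delta = 50, and the threshold is
   64 * 50 / 1000 = 3 bytes.  Both 50 and 187 - 50 = 137 exceed 3, so no
   PREFETCH_BEFORE is recorded, matching the 7/32 same-line probability
   quoted in the head comment.  */
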
648/* Prune the prefetch candidate REF using the reuses with other references
649 in REFS. */
650
651static void
652prune_ref_by_reuse (struct mem_ref *ref, struct mem_ref *refs)
653{
654 struct mem_ref *prune_by;
655 bool before = true;
656
657 prune_ref_by_self_reuse (ref);
658
659 for (prune_by = refs; prune_by; prune_by = prune_by->next)
660 {
661 if (prune_by == ref)
662 {
663 before = false;
664 continue;
665 }
666
667 if (!WRITE_CAN_USE_READ_PREFETCH
668 && ref->write_p
669 && !prune_by->write_p)
670 continue;
671 if (!READ_CAN_USE_WRITE_PREFETCH
672 && !ref->write_p
673 && prune_by->write_p)
674 continue;
675
676 prune_ref_by_group_reuse (ref, prune_by, before);
677 }
678}
679
680/* Prune the prefetch candidates in GROUP using the reuse analysis. */
681
682static void
683prune_group_by_reuse (struct mem_ref_group *group)
684{
685 struct mem_ref *ref_pruned;
686
687 for (ref_pruned = group->refs; ref_pruned; ref_pruned = ref_pruned->next)
688 {
689 prune_ref_by_reuse (ref_pruned, group->refs);
690
691 if (dump_file && (dump_flags & TDF_DETAILS))
692 {
693 fprintf (dump_file, "Reference %p:", (void *) ref_pruned);
694
695 if (ref_pruned->prefetch_before == PREFETCH_ALL
696 && ref_pruned->prefetch_mod == 1)
697 fprintf (dump_file, " no restrictions");
698 else if (ref_pruned->prefetch_before == 0)
699 fprintf (dump_file, " do not prefetch");
700 else if (ref_pruned->prefetch_before <= ref_pruned->prefetch_mod)
701 fprintf (dump_file, " prefetch once");
702 else
703 {
704 if (ref_pruned->prefetch_before != PREFETCH_ALL)
705 {
706 fprintf (dump_file, " prefetch before ");
707 fprintf (dump_file, HOST_WIDE_INT_PRINT_DEC,
708 ref_pruned->prefetch_before);
709 }
710 if (ref_pruned->prefetch_mod != 1)
711 {
712 fprintf (dump_file, " prefetch mod ");
713 fprintf (dump_file, HOST_WIDE_INT_PRINT_DEC,
714 ref_pruned->prefetch_mod);
715 }
716 }
717 fprintf (dump_file, "\n");
718 }
719 }
720}
721
722/* Prune the list of prefetch candidates GROUPS using the reuse analysis. */
723
724static void
725prune_by_reuse (struct mem_ref_group *groups)
726{
727 for (; groups; groups = groups->next)
728 prune_group_by_reuse (groups);
729}
730
731/* Returns true if we should issue prefetch for REF. */
732
733static bool
734should_issue_prefetch_p (struct mem_ref *ref)
735{
736 /* For now, do not issue prefetches that would be useful only in the
737 first few iterations. */
738 if (ref->prefetch_before != PREFETCH_ALL)
739 return false;
740
741 return true;
742}
743
744/* Decide which of the prefetch candidates in GROUPS to prefetch.
745 AHEAD is the number of iterations to prefetch ahead (which corresponds
746 to the number of simultaneous instances of one prefetch running at a
747 time). UNROLL_FACTOR is the factor by which the loop is going to be
748 unrolled. Returns true if there is anything to prefetch. */
749
750static bool
751schedule_prefetches (struct mem_ref_group *groups, unsigned unroll_factor,
752 unsigned ahead)
753{
754 unsigned max_prefetches, n_prefetches;
755 struct mem_ref *ref;
756 bool any = false;
757
758 max_prefetches = (SIMULTANEOUS_PREFETCHES * unroll_factor) / ahead;
759 if (max_prefetches > (unsigned) SIMULTANEOUS_PREFETCHES)
760 max_prefetches = SIMULTANEOUS_PREFETCHES;
761
762 if (dump_file && (dump_flags & TDF_DETAILS))
763 fprintf (dump_file, "Max prefetches to issue: %d.\n", max_prefetches);
764
765 if (!max_prefetches)
766 return false;
767
768 /* For now we just take memory references one by one and issue
769 prefetches for as many as possible. The groups are sorted
770 starting with the largest step, since the references with
771 large step are more likely to cause many cache misses. */
772
773 for (; groups; groups = groups->next)
774 for (ref = groups->refs; ref; ref = ref->next)
775 {
776 if (!should_issue_prefetch_p (ref))
777 continue;
778
779 ref->issue_prefetch_p = true;
780
781 /* If prefetch_mod is less than unroll_factor, we need to insert
782 several prefetches for the reference. */
783 n_prefetches = ((unroll_factor + ref->prefetch_mod - 1)
784 / ref->prefetch_mod);
785 if (max_prefetches <= n_prefetches)
786 return true;
787
788 max_prefetches -= n_prefetches;
789 any = true;
790 }
791
792 return any;
793}
794
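/* A small worked example of the bookkeeping above, with the default
   SIMULTANEOUS_PREFETCHES of 3: for unroll_factor 8 and ahead 2,
   max_prefetches = 3 * 8 / 2 = 12 is capped back to 3.  A reference with
   prefetch_mod 4 then needs ceil (8 / 4) = 2 prefetches, so only two such
   references fit before scheduling stops.  */
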
795/* Determine whether there is any reference suitable for prefetching
796 in GROUPS. */
797
798static bool
799anything_to_prefetch_p (struct mem_ref_group *groups)
800{
801 struct mem_ref *ref;
802
803 for (; groups; groups = groups->next)
804 for (ref = groups->refs; ref; ref = ref->next)
805 if (should_issue_prefetch_p (ref))
806 return true;
807
808 return false;
809}
810
811/* Issue prefetches for the reference REF into the loop as decided before.
812 AHEAD is the number of iterations to prefetch ahead. UNROLL_FACTOR
813 is the factor by which LOOP was unrolled. */
814
815static void
816issue_prefetch_ref (struct mem_ref *ref, unsigned unroll_factor, unsigned ahead)
817{
818 HOST_WIDE_INT delta;
819 tree addr, addr_base, prefetch, params, write_p;
820 block_stmt_iterator bsi;
821 unsigned n_prefetches, ap;
822
823 if (dump_file && (dump_flags & TDF_DETAILS))
824 fprintf (dump_file, "Issued prefetch for %p.\n", (void *) ref);
825
826 bsi = bsi_for_stmt (ref->stmt);
827
828 n_prefetches = ((unroll_factor + ref->prefetch_mod - 1)
829 / ref->prefetch_mod);
830 addr_base = build_fold_addr_expr_with_type (ref->mem, ptr_type_node);
831 addr_base = force_gimple_operand_bsi (&bsi, unshare_expr (addr_base), true, NULL);
832
833 for (ap = 0; ap < n_prefetches; ap++)
834 {
835 /* Determine the address to prefetch. */
836 delta = (ahead + ap * ref->prefetch_mod) * ref->group->step;
837 addr = fold_build2 (PLUS_EXPR, ptr_type_node,
838 addr_base, build_int_cst (ptr_type_node, delta));
839 addr = force_gimple_operand_bsi (&bsi, unshare_expr (addr), true, NULL);
840
841 /* Create the prefetch instruction. */
842 write_p = ref->write_p ? integer_one_node : integer_zero_node;
843 params = tree_cons (NULL_TREE, addr,
844 tree_cons (NULL_TREE, write_p, NULL_TREE));
845
846 prefetch = build_function_call_expr (built_in_decls[BUILT_IN_PREFETCH],
847 params);
848 bsi_insert_before (&bsi, prefetch, BSI_SAME_STMT);
849 }
850}
851
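/* To sketch the address arithmetic above (invented numbers): for a group
   with step 16, unroll_factor 4, prefetch_mod 2 and ahead 3, we get
   n_prefetches = 2 and emit roughly

	__builtin_prefetch (addr_base + (3 + 0 * 2) * 16, write_p);
	__builtin_prefetch (addr_base + (3 + 1 * 2) * 16, write_p);

   i.e. prefetches 48 and 80 bytes past the reference address.  */
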
852/* Issue prefetches for the references in GROUPS into the loop as decided before.
853 AHEAD is the number of iterations to prefetch ahead. UNROLL_FACTOR is the
854 factor by which LOOP was unrolled. */
855
856static void
857issue_prefetches (struct mem_ref_group *groups,
858 unsigned unroll_factor, unsigned ahead)
859{
860 struct mem_ref *ref;
861
862 for (; groups; groups = groups->next)
863 for (ref = groups->refs; ref; ref = ref->next)
864 if (ref->issue_prefetch_p)
865 issue_prefetch_ref (ref, unroll_factor, ahead);
866}
867
868/* Determines whether we can profitably unroll LOOP FACTOR times, and if
869 this is the case, fills in DESC with the description of the number of
870 iterations. */
871
872static bool
873should_unroll_loop_p (struct loop *loop, struct tree_niter_desc *desc,
874 unsigned factor)
875{
876 if (!can_unroll_loop_p (loop, factor, desc))
877 return false;
878
879 /* We only consider loops without control flow for unrolling. This is not
880 a hard restriction -- tree_unroll_loop works with arbitrary loops
881 as well; but the unrolling/prefetching is usually more profitable for
882 loops consisting of a single basic block, and we want to limit the
883 code growth. */
884 if (loop->num_nodes > 2)
885 return false;
886
887 return true;
888}
889
890/* Determine the factor by which to unroll LOOP, from the information
891 contained in the list of memory references REFS. The description of the
892 number of iterations of LOOP is stored to DESC. AHEAD is the number
893 of iterations ahead that we need to prefetch. NINSNS is the number of
894 insns of LOOP. */
895
896static unsigned
897determine_unroll_factor (struct loop *loop, struct mem_ref_group *refs,
898 unsigned ahead, unsigned ninsns,
899 struct tree_niter_desc *desc)
900{
901 unsigned upper_bound, size_factor, constraint_factor;
902 unsigned factor, max_mod_constraint, ahead_factor;
903 struct mem_ref_group *agp;
904 struct mem_ref *ref;
905
906 upper_bound = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);
907
908 /* First check whether the loop is not too large to unroll. */
909 size_factor = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / ninsns;
910 if (size_factor <= 1)
911 return 1;
912
913 if (size_factor < upper_bound)
914 upper_bound = size_factor;
915
916 max_mod_constraint = 1;
917 for (agp = refs; agp; agp = agp->next)
918 for (ref = agp->refs; ref; ref = ref->next)
919 if (should_issue_prefetch_p (ref)
920 && ref->prefetch_mod > max_mod_constraint)
921 max_mod_constraint = ref->prefetch_mod;
922
923 /* Set constraint_factor as large as needed to be able to satisfy the
924 largest modulo constraint. */
925 constraint_factor = max_mod_constraint;
926
927 /* If ahead is too large in comparison with the number of available
928 prefetches, unroll the loop as much as needed to be able to prefetch
929 at least partially some of the references in the loop. */
930 ahead_factor = ((ahead + SIMULTANEOUS_PREFETCHES - 1)
931 / SIMULTANEOUS_PREFETCHES);
932
933 /* Unroll as much as useful, but bound the code size growth. */
934 if (constraint_factor < ahead_factor)
935 factor = ahead_factor;
936 else
937 factor = constraint_factor;
938 if (factor > upper_bound)
939 factor = upper_bound;
940
941 if (!should_unroll_loop_p (loop, desc, factor))
942 return 1;
943
944 return factor;
945}
946
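/* A worked example of the computation above, with invented numbers: for a
   loop of 20 insns, assuming PARAM_MAX_UNROLLED_INSNS of 200 and
   PARAM_MAX_UNROLL_TIMES of 8, the size bound is 200 / 20 = 10, so the
   upper bound stays 8.  With a largest prefetch_mod of 4 and ahead 5
   (hence ahead_factor = ceil (5 / SIMULTANEOUS_PREFETCHES) = 2), the
   chosen factor is 4, provided should_unroll_loop_p agrees.  */
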
947/* Issue prefetch instructions for array references in LOOP. Returns
948 true if the LOOP was unrolled. LOOPS is the array containing all
949 loops. */
950
951static bool
952loop_prefetch_arrays (struct loops *loops, struct loop *loop)
953{
954 struct mem_ref_group *refs;
955 unsigned ahead, ninsns, unroll_factor;
956 struct tree_niter_desc desc;
957 bool unrolled = false;
958
959 /* Step 1: gather the memory references. */
960 refs = gather_memory_references (loop);
961
962 /* Step 2: estimate the reuse effects. */
963 prune_by_reuse (refs);
964
965 if (!anything_to_prefetch_p (refs))
966 goto fail;
967
968 /* Step 3: determine the ahead and unroll factor. */
969
970 /* FIXME: We should use not the size of the loop, but the average number of
971 instructions executed per iteration of the loop. */
972 ninsns = tree_num_loop_insns (loop);
973 ahead = (PREFETCH_LATENCY + ninsns - 1) / ninsns;
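  /* E.g., with the default PREFETCH_LATENCY of 200 and a loop body of,
     say, 30 insns, this gives ahead = ceil (200 / 30) = 7 iterations of
     lookahead (illustrative numbers only).  */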
974 unroll_factor = determine_unroll_factor (loop, refs, ahead, ninsns,
975 &desc);
976 if (dump_file && (dump_flags & TDF_DETAILS))
977 fprintf (dump_file, "Ahead %d, unroll factor %d\n", ahead, unroll_factor);
978
979 /* If the loop rolls less than the required unroll factor, prefetching
980 is useless. */
981 if (unroll_factor > 1
982 && cst_and_fits_in_hwi (desc.niter)
983 && (unsigned HOST_WIDE_INT) int_cst_value (desc.niter) < unroll_factor)
984 goto fail;
985
986 /* Step 4: what to prefetch? */
987 if (!schedule_prefetches (refs, unroll_factor, ahead))
988 goto fail;
989
990 /* Step 5: unroll the loop. TODO -- peeling of first and last few
991 iterations so that we do not issue superfluous prefetches. */
992 if (unroll_factor != 1)
993 {
994 tree_unroll_loop (loops, loop, unroll_factor,
995 single_dom_exit (loop), &desc);
996 unrolled = true;
997 }
998
999 /* Step 6: issue the prefetches. */
1000 issue_prefetches (refs, unroll_factor, ahead);
1001
1002fail:
1003 release_mem_refs (refs);
1004 return unrolled;
1005}
1006
1007/* Issue prefetch instructions for array references in LOOPS. */
1008
1009void
1010tree_ssa_prefetch_arrays (struct loops *loops)
1011{
1012 unsigned i;
1013 struct loop *loop;
1014 bool unrolled = false;
1015
1016 if (!HAVE_prefetch
1017 /* It is possible to ask the compiler for, say, -mtune=i486 -march=pentium4.
1018 -mtune=i486 causes us to have PREFETCH_BLOCK 0, since this is part
1019 of the processor costs and i486 does not have prefetch, but
1020 -march=pentium4 causes HAVE_prefetch to be true. Ugh. */
1021 || PREFETCH_BLOCK == 0)
1022 return;
1023
1024 initialize_original_copy_tables ();
1025
1026 if (!built_in_decls[BUILT_IN_PREFETCH])
1027 {
1028 tree type = build_function_type (void_type_node,
1029 tree_cons (NULL_TREE,
1030 const_ptr_type_node,
1031 NULL_TREE));
1032 tree decl = lang_hooks.builtin_function ("__builtin_prefetch", type,
1033 BUILT_IN_PREFETCH, BUILT_IN_NORMAL,
1034 NULL, NULL_TREE);
1035 DECL_IS_NOVOPS (decl) = true;
1036 built_in_decls[BUILT_IN_PREFETCH] = decl;
1037 }
1038
1039 /* We assume that the cache line size is a power of two, so verify this
1040 here. */
1041 gcc_assert ((PREFETCH_BLOCK & (PREFETCH_BLOCK - 1)) == 0);
1042
1043 for (i = loops->num - 1; i > 0; i--)
1044 {
1045 loop = loops->parray[i];
1046
1047 if (dump_file && (dump_flags & TDF_DETAILS))
1048 fprintf (dump_file, "Processing loop %d:\n", loop->num);
1049
1050 if (loop)
1051 unrolled |= loop_prefetch_arrays (loops, loop);
1052
1053 if (dump_file && (dump_flags & TDF_DETAILS))
1054 fprintf (dump_file, "\n\n");
1055 }
1056
1057 if (unrolled)
1058 {
1059 scev_reset ();
1060 cleanup_tree_cfg_loop ();
1061 }
1062
1063 free_original_copy_tables ();
1064}