/* Array prefetching.
   Copyright (C) 2005, 2007, 2008 Free Software Foundation, Inc.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the
Free Software Foundation; either version 3, or (at your option) any
later version.

GCC is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */

#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "tm.h"
#include "tree.h"
#include "rtl.h"
#include "tm_p.h"
#include "hard-reg-set.h"
#include "basic-block.h"
#include "output.h"
#include "diagnostic.h"
#include "tree-flow.h"
#include "tree-dump.h"
#include "timevar.h"
#include "cfgloop.h"
#include "expr.h"
#include "tree-pass.h"
#include "ggc.h"
#include "insn-config.h"
#include "recog.h"
#include "hashtab.h"
#include "tree-chrec.h"
#include "tree-scalar-evolution.h"
#include "toplev.h"
#include "params.h"
#include "langhooks.h"
#include "tree-inline.h"
#include "tree-data-ref.h"
#include "optabs.h"

/* This pass inserts prefetch instructions to optimize cache usage during
   accesses to arrays in loops.  It processes loops sequentially and:

   1) Gathers all memory references in a single loop.
   2) For each of the references it decides when it is profitable to prefetch
      it.  To do it, we evaluate the reuse among the accesses, and determine
      two values: PREFETCH_BEFORE (meaning that it only makes sense to do
      prefetching in the first PREFETCH_BEFORE iterations of the loop) and
      PREFETCH_MOD (meaning that it only makes sense to prefetch in the
      iterations of the loop that are zero modulo PREFETCH_MOD).  For example
      (assuming cache line size is 64 bytes, char has size 1 byte and there
      is no hardware sequential prefetch):

      char *a;
      for (i = 0; i < max; i++)
	{
	  a[255] = ...;		(0)
	  a[i] = ...;		(1)
	  a[i + 64] = ...;	(2)
	  a[16*i] = ...;	(3)
	  a[187*i] = ...;	(4)
	  a[187*i + 50] = ...;	(5)
	}

      (0) obviously has PREFETCH_BEFORE 1
      (1) has PREFETCH_BEFORE 64, since (2) accesses the same memory
	  location 64 iterations before it, and PREFETCH_MOD 64 (since
	  it hits the same cache line otherwise).
      (2) has PREFETCH_MOD 64
      (3) has PREFETCH_MOD 4
      (4) has PREFETCH_MOD 1.  We do not set PREFETCH_BEFORE here, since
	  the cache line accessed by (4) is the same with probability only
	  7/32.
      (5) has PREFETCH_MOD 1 as well.

      Additionally, we use data dependence analysis to determine for each
      reference the distance till the first reuse; this information is used
      to determine the temporality of the issued prefetch instruction.

   3) We determine how much ahead we need to prefetch.  The number of
      iterations needed is time to fetch / time spent in one iteration of
      the loop.  The problem is that we do not know either of these values,
      so we just make a heuristic guess based on a magic (possibly)
      target-specific constant and the size of the loop.

   4) Determine which of the references we prefetch.  We take into account
      that there is a maximum number of simultaneous prefetches (provided
      by the machine description).  We issue as many prefetches as possible
      while still within this bound (starting with those with lowest
      prefetch_mod, since they are responsible for most of the cache
      misses).

   5) We unroll and peel loops so that we are able to satisfy PREFETCH_MOD
      and PREFETCH_BEFORE requirements (within some bounds), and to avoid
      prefetching nonaccessed memory.
      TODO -- actually implement peeling.

   6) We actually emit the prefetch instructions.  ??? Perhaps emit the
      prefetch instructions with guards in cases where 5) was not sufficient
      to satisfy the constraints?

   The function is_loop_prefetching_profitable() implements a cost model
   to determine if prefetching is profitable for a given loop.  The cost
   model has two heuristics:
   1. A heuristic that determines whether the given loop has enough CPU
      ops that can be overlapped with cache missing memory ops.
      If not, the loop won't benefit from prefetching.  This is implemented
      by requiring the ratio between the instruction count and the mem ref
      count to be above a certain minimum.
   2. A heuristic that disables prefetching in a loop with an unknown trip
      count if the prefetching cost is above a certain limit.  The relative
      prefetching cost is estimated by taking the ratio between the
      prefetch count and the total instruction count (this models the I-cache
      cost).
   The limits used in these heuristics are defined as parameters with
   reasonable default values.  Machine-specific default values will be
   added later.

   Some other TODO:
      -- write and use more general reuse analysis (that could be also used
	 in other cache-oriented loop optimizations)
      -- make it behave sanely together with the prefetches given by the user
	 (now we just ignore them; at the very least we should avoid
	 optimizing loops in which the user put his own prefetches)
      -- we assume cache line size alignment of arrays; this could be
	 improved.  */
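
/* Illustration of step 3 above (the numbers are purely hypothetical, not
   target defaults): if PREFETCH_LATENCY is 200 and the loop body is
   estimated at 25 time units, the pass prefetches
   ahead = (200 + 25 - 1) / 25 = 8 iterations, i.e. the data used in
   iteration i + 8 is prefetched in iteration i.  */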

/* Magic constants follow.  These should be replaced by machine specific
   numbers.  */

/* True if write can be prefetched by a read prefetch.  */

#ifndef WRITE_CAN_USE_READ_PREFETCH
#define WRITE_CAN_USE_READ_PREFETCH 1
#endif

/* True if read can be prefetched by a write prefetch.  */

#ifndef READ_CAN_USE_WRITE_PREFETCH
#define READ_CAN_USE_WRITE_PREFETCH 0
#endif

/* The size of the block loaded by a single prefetch.  Usually, this is
   the same as cache line size (at the moment, we only consider one level
   of cache hierarchy).  */

#ifndef PREFETCH_BLOCK
#define PREFETCH_BLOCK L1_CACHE_LINE_SIZE
#endif

/* Do we have forward hardware sequential prefetching?  */

#ifndef HAVE_FORWARD_PREFETCH
#define HAVE_FORWARD_PREFETCH 0
#endif

/* Do we have backward hardware sequential prefetching?  */

#ifndef HAVE_BACKWARD_PREFETCH
#define HAVE_BACKWARD_PREFETCH 0
#endif

/* In some cases we are only able to determine that there is a certain
   probability that the two accesses hit the same cache line.  In this
   case, we issue the prefetches for both of them if this probability
   is less than (1000 - ACCEPTABLE_MISS_RATE) per thousand.  */

#ifndef ACCEPTABLE_MISS_RATE
#define ACCEPTABLE_MISS_RATE 50
#endif

#ifndef HAVE_prefetch
#define HAVE_prefetch 0
#endif

#define L1_CACHE_SIZE_BYTES ((unsigned) (L1_CACHE_SIZE * 1024))
#define L2_CACHE_SIZE_BYTES ((unsigned) (L2_CACHE_SIZE * 1024))

/* We consider a memory access nontemporal if it is not reused sooner than
   after L2_CACHE_SIZE_BYTES of memory are accessed.  However, we ignore
   accesses closer than L1_CACHE_SIZE_BYTES / NONTEMPORAL_FRACTION,
   so that we use nontemporal prefetches e.g. if a single memory location
   is accessed several times in a single iteration of the loop.  */
#define NONTEMPORAL_FRACTION 16

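/* For illustration only (hypothetical cache sizes, not target defaults):
   with a 32 kB L1 cache, a 512 kB L2 cache and NONTEMPORAL_FRACTION 16,
   a reference becomes a candidate for a nontemporal prefetch or store only
   if at least 512 kB of other data are accessed before its next reuse,
   while reuses closer than 32 kB / 16 = 2 kB are ignored when computing
   that reuse distance.  */
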
/* In case we have to emit a memory fence instruction after the loop that
   uses nontemporal stores, this defines the builtin to use.  */

#ifndef FENCE_FOLLOWING_MOVNT
#define FENCE_FOLLOWING_MOVNT NULL_TREE
#endif

/* It is not profitable to prefetch when the trip count is not at
   least TRIP_COUNT_TO_AHEAD_RATIO times the prefetch ahead distance.
   For example, in a loop with a prefetch ahead distance of 10,
   supposing that TRIP_COUNT_TO_AHEAD_RATIO is equal to 4, it is
   profitable to prefetch when the trip count is greater than or equal
   to 40.  In that case, 30 out of the 40 iterations will benefit from
   prefetching.  */

#ifndef TRIP_COUNT_TO_AHEAD_RATIO
#define TRIP_COUNT_TO_AHEAD_RATIO 4
#endif

/* The group of references between which reuse may occur.  */

struct mem_ref_group
{
  tree base;			/* Base of the reference.  */
  HOST_WIDE_INT step;		/* Step of the reference.  */
  struct mem_ref *refs;		/* References in the group.  */
  struct mem_ref_group *next;	/* Next group of references.  */
};

/* Assigned to PREFETCH_BEFORE when all iterations are to be prefetched.  */

#define PREFETCH_ALL		(~(unsigned HOST_WIDE_INT) 0)

/* The memory reference.  */

struct mem_ref
{
  gimple stmt;			/* Statement in which the reference appears.  */
  tree mem;			/* The reference.  */
  HOST_WIDE_INT delta;		/* Constant offset of the reference.  */
  struct mem_ref_group *group;	/* The group of references it belongs to.  */
  unsigned HOST_WIDE_INT prefetch_mod;
				/* Prefetch only each PREFETCH_MOD-th
				   iteration.  */
  unsigned HOST_WIDE_INT prefetch_before;
				/* Prefetch only first PREFETCH_BEFORE
				   iterations.  */
  unsigned reuse_distance;	/* The amount of data accessed before the first
				   reuse of this value.  */
  struct mem_ref *next;		/* The next reference in the group.  */
  unsigned write_p : 1;		/* Is it a write?  */
  unsigned independent_p : 1;	/* True if the reference is independent of
				   all other references inside the loop.  */
  unsigned issue_prefetch_p : 1;/* Should we really issue the prefetch?  */
  unsigned storent_p : 1;	/* True if we changed the store to a
				   nontemporal one.  */
};

75c40d56 253/* Dumps information about reference REF to FILE. */
b076a3fd
ZD
254
255static void
256dump_mem_ref (FILE *file, struct mem_ref *ref)
257{
258 fprintf (file, "Reference %p:\n", (void *) ref);
259
260 fprintf (file, " group %p (base ", (void *) ref->group);
261 print_generic_expr (file, ref->group->base, TDF_SLIM);
262 fprintf (file, ", step ");
263 fprintf (file, HOST_WIDE_INT_PRINT_DEC, ref->group->step);
264 fprintf (file, ")\n");
265
e324a72f 266 fprintf (file, " delta ");
b076a3fd
ZD
267 fprintf (file, HOST_WIDE_INT_PRINT_DEC, ref->delta);
268 fprintf (file, "\n");
269
270 fprintf (file, " %s\n", ref->write_p ? "write" : "read");
271
272 fprintf (file, "\n");
273}
274
275/* Finds a group with BASE and STEP in GROUPS, or creates one if it does not
276 exist. */
277
278static struct mem_ref_group *
279find_or_create_group (struct mem_ref_group **groups, tree base,
280 HOST_WIDE_INT step)
281{
282 struct mem_ref_group *group;
283
284 for (; *groups; groups = &(*groups)->next)
285 {
286 if ((*groups)->step == step
287 && operand_equal_p ((*groups)->base, base, 0))
288 return *groups;
289
290 /* Keep the list of groups sorted by decreasing step. */
291 if ((*groups)->step < step)
292 break;
293 }
294
5417e022 295 group = XNEW (struct mem_ref_group);
b076a3fd
ZD
296 group->base = base;
297 group->step = step;
298 group->refs = NULL;
299 group->next = *groups;
300 *groups = group;
301
302 return group;
303}
304
305/* Records a memory reference MEM in GROUP with offset DELTA and write status
306 WRITE_P. The reference occurs in statement STMT. */
307
308static void
726a989a 309record_ref (struct mem_ref_group *group, gimple stmt, tree mem,
b076a3fd
ZD
310 HOST_WIDE_INT delta, bool write_p)
311{
312 struct mem_ref **aref;
313
314 /* Do not record the same address twice. */
315 for (aref = &group->refs; *aref; aref = &(*aref)->next)
316 {
      /* A write reference may not be able to reuse a read prefetch, or
	 vice versa.  */
319 if (!WRITE_CAN_USE_READ_PREFETCH
320 && write_p
321 && !(*aref)->write_p)
322 continue;
323 if (!READ_CAN_USE_WRITE_PREFETCH
324 && !write_p
325 && (*aref)->write_p)
326 continue;
327
328 if ((*aref)->delta == delta)
329 return;
330 }
331
5417e022 332 (*aref) = XNEW (struct mem_ref);
b076a3fd
ZD
333 (*aref)->stmt = stmt;
334 (*aref)->mem = mem;
335 (*aref)->delta = delta;
336 (*aref)->write_p = write_p;
337 (*aref)->prefetch_before = PREFETCH_ALL;
338 (*aref)->prefetch_mod = 1;
5417e022 339 (*aref)->reuse_distance = 0;
b076a3fd
ZD
340 (*aref)->issue_prefetch_p = false;
341 (*aref)->group = group;
342 (*aref)->next = NULL;
79f5e442
ZD
343 (*aref)->independent_p = false;
344 (*aref)->storent_p = false;
b076a3fd
ZD
345
346 if (dump_file && (dump_flags & TDF_DETAILS))
347 dump_mem_ref (dump_file, *aref);
348}
349
350/* Release memory references in GROUPS. */
351
352static void
353release_mem_refs (struct mem_ref_group *groups)
354{
355 struct mem_ref_group *next_g;
356 struct mem_ref *ref, *next_r;
357
358 for (; groups; groups = next_g)
359 {
360 next_g = groups->next;
361 for (ref = groups->refs; ref; ref = next_r)
362 {
363 next_r = ref->next;
364 free (ref);
365 }
366 free (groups);
367 }
368}
369
370/* A structure used to pass arguments to idx_analyze_ref. */
371
372struct ar_data
373{
374 struct loop *loop; /* Loop of the reference. */
726a989a 375 gimple stmt; /* Statement of the reference. */
b076a3fd
ZD
376 HOST_WIDE_INT *step; /* Step of the memory reference. */
377 HOST_WIDE_INT *delta; /* Offset of the memory reference. */
378};
379
380/* Analyzes a single INDEX of a memory reference to obtain information
381 described at analyze_ref. Callback for for_each_index. */
382
383static bool
384idx_analyze_ref (tree base, tree *index, void *data)
385{
c22940cd 386 struct ar_data *ar_data = (struct ar_data *) data;
b076a3fd
ZD
387 tree ibase, step, stepsize;
388 HOST_WIDE_INT istep, idelta = 0, imult = 1;
389 affine_iv iv;
390
391 if (TREE_CODE (base) == MISALIGNED_INDIRECT_REF
392 || TREE_CODE (base) == ALIGN_INDIRECT_REF)
393 return false;
394
f017bf5e
ZD
395 if (!simple_iv (ar_data->loop, loop_containing_stmt (ar_data->stmt),
396 *index, &iv, false))
b076a3fd
ZD
397 return false;
398 ibase = iv.base;
399 step = iv.step;
400
6e42ce54
ZD
401 if (!cst_and_fits_in_hwi (step))
402 return false;
403 istep = int_cst_value (step);
b076a3fd 404
5be014d5 405 if (TREE_CODE (ibase) == POINTER_PLUS_EXPR
b076a3fd
ZD
406 && cst_and_fits_in_hwi (TREE_OPERAND (ibase, 1)))
407 {
408 idelta = int_cst_value (TREE_OPERAND (ibase, 1));
409 ibase = TREE_OPERAND (ibase, 0);
410 }
411 if (cst_and_fits_in_hwi (ibase))
412 {
413 idelta += int_cst_value (ibase);
ff5e9a94 414 ibase = build_int_cst (TREE_TYPE (ibase), 0);
b076a3fd
ZD
415 }
416
417 if (TREE_CODE (base) == ARRAY_REF)
418 {
419 stepsize = array_ref_element_size (base);
420 if (!cst_and_fits_in_hwi (stepsize))
421 return false;
422 imult = int_cst_value (stepsize);
423
424 istep *= imult;
425 idelta *= imult;
426 }
427
428 *ar_data->step += istep;
429 *ar_data->delta += idelta;
430 *index = ibase;
431
432 return true;
433}
434
/* Tries to express REF_P in shape &BASE + STEP * iter + DELTA, where DELTA and
   STEP are integer constants and iter is the number of iterations of LOOP.
   The reference occurs in statement STMT.  Strips nonaddressable component
   references from REF_P.  */
b076a3fd
ZD
439
440static bool
aac8b8ed 441analyze_ref (struct loop *loop, tree *ref_p, tree *base,
b076a3fd 442 HOST_WIDE_INT *step, HOST_WIDE_INT *delta,
726a989a 443 gimple stmt)
b076a3fd
ZD
444{
445 struct ar_data ar_data;
446 tree off;
447 HOST_WIDE_INT bit_offset;
aac8b8ed 448 tree ref = *ref_p;
b076a3fd
ZD
449
450 *step = 0;
451 *delta = 0;
452
453 /* First strip off the component references. Ignore bitfields. */
454 if (TREE_CODE (ref) == COMPONENT_REF
455 && DECL_NONADDRESSABLE_P (TREE_OPERAND (ref, 1)))
456 ref = TREE_OPERAND (ref, 0);
457
aac8b8ed
RS
458 *ref_p = ref;
459
b076a3fd
ZD
460 for (; TREE_CODE (ref) == COMPONENT_REF; ref = TREE_OPERAND (ref, 0))
461 {
462 off = DECL_FIELD_BIT_OFFSET (TREE_OPERAND (ref, 1));
463 bit_offset = TREE_INT_CST_LOW (off);
464 gcc_assert (bit_offset % BITS_PER_UNIT == 0);
b8698a0f 465
b076a3fd
ZD
466 *delta += bit_offset / BITS_PER_UNIT;
467 }
468
469 *base = unshare_expr (ref);
470 ar_data.loop = loop;
471 ar_data.stmt = stmt;
472 ar_data.step = step;
473 ar_data.delta = delta;
474 return for_each_index (base, idx_analyze_ref, &ar_data);
475}
476
/* Record a memory reference REF to the list REFS.  The reference occurs in
   LOOP in statement STMT and it is a write if WRITE_P.  Returns true if the
   reference was recorded, false otherwise.  */
b076a3fd 480
79f5e442 481static bool
b076a3fd 482gather_memory_references_ref (struct loop *loop, struct mem_ref_group **refs,
726a989a 483 tree ref, bool write_p, gimple stmt)
b076a3fd
ZD
484{
485 tree base;
486 HOST_WIDE_INT step, delta;
487 struct mem_ref_group *agrp;
488
a80a2701
JJ
489 if (get_base_address (ref) == NULL)
490 return false;
491
aac8b8ed 492 if (!analyze_ref (loop, &ref, &base, &step, &delta, stmt))
79f5e442 493 return false;
b076a3fd
ZD
494
495 /* Now we know that REF = &BASE + STEP * iter + DELTA, where DELTA and STEP
496 are integer constants. */
497 agrp = find_or_create_group (refs, base, step);
498 record_ref (agrp, stmt, ref, delta, write_p);
79f5e442
ZD
499
500 return true;
b076a3fd
ZD
501}
502
79f5e442
ZD
503/* Record the suitable memory references in LOOP. NO_OTHER_REFS is set to
504 true if there are no other memory references inside the loop. */
b076a3fd
ZD
505
506static struct mem_ref_group *
db34470d 507gather_memory_references (struct loop *loop, bool *no_other_refs, unsigned *ref_count)
b076a3fd
ZD
508{
509 basic_block *body = get_loop_body_in_dom_order (loop);
510 basic_block bb;
511 unsigned i;
726a989a
RB
512 gimple_stmt_iterator bsi;
513 gimple stmt;
514 tree lhs, rhs;
b076a3fd
ZD
515 struct mem_ref_group *refs = NULL;
516
79f5e442 517 *no_other_refs = true;
db34470d 518 *ref_count = 0;
79f5e442 519
b076a3fd
ZD
520 /* Scan the loop body in order, so that the former references precede the
521 later ones. */
522 for (i = 0; i < loop->num_nodes; i++)
523 {
524 bb = body[i];
525 if (bb->loop_father != loop)
526 continue;
527
726a989a 528 for (bsi = gsi_start_bb (bb); !gsi_end_p (bsi); gsi_next (&bsi))
b076a3fd 529 {
726a989a 530 stmt = gsi_stmt (bsi);
79f5e442 531
726a989a 532 if (gimple_code (stmt) != GIMPLE_ASSIGN)
79f5e442 533 {
5006671f 534 if (gimple_vuse (stmt)
726a989a
RB
535 || (is_gimple_call (stmt)
536 && !(gimple_call_flags (stmt) & ECF_CONST)))
79f5e442
ZD
537 *no_other_refs = false;
538 continue;
539 }
b076a3fd 540
726a989a
RB
541 lhs = gimple_assign_lhs (stmt);
542 rhs = gimple_assign_rhs1 (stmt);
b076a3fd
ZD
543
544 if (REFERENCE_CLASS_P (rhs))
db34470d 545 {
79f5e442
ZD
546 *no_other_refs &= gather_memory_references_ref (loop, &refs,
547 rhs, false, stmt);
db34470d
GS
548 *ref_count += 1;
549 }
b076a3fd 550 if (REFERENCE_CLASS_P (lhs))
db34470d 551 {
79f5e442
ZD
552 *no_other_refs &= gather_memory_references_ref (loop, &refs,
553 lhs, true, stmt);
db34470d
GS
554 *ref_count += 1;
555 }
b076a3fd
ZD
556 }
557 }
558 free (body);
559
560 return refs;
561}
562
563/* Prune the prefetch candidate REF using the self-reuse. */
564
565static void
566prune_ref_by_self_reuse (struct mem_ref *ref)
567{
568 HOST_WIDE_INT step = ref->group->step;
569 bool backward = step < 0;
570
571 if (step == 0)
572 {
573 /* Prefetch references to invariant address just once. */
574 ref->prefetch_before = 1;
575 return;
576 }
577
578 if (backward)
579 step = -step;
580
581 if (step > PREFETCH_BLOCK)
582 return;
583
584 if ((backward && HAVE_BACKWARD_PREFETCH)
585 || (!backward && HAVE_FORWARD_PREFETCH))
586 {
587 ref->prefetch_before = 1;
588 return;
589 }
590
591 ref->prefetch_mod = PREFETCH_BLOCK / step;
592}
593
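/* For illustration (hypothetical numbers, not target defaults): with a
   64-byte PREFETCH_BLOCK and no hardware sequential prefetch, a reference
   whose address advances by 8 bytes per iteration touches a new block only
   every 64 / 8 = 8 iterations, so prune_ref_by_self_reuse sets its
   prefetch_mod to 8.  */
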
594/* Divides X by BY, rounding down. */
595
596static HOST_WIDE_INT
597ddown (HOST_WIDE_INT x, unsigned HOST_WIDE_INT by)
598{
599 gcc_assert (by > 0);
600
601 if (x >= 0)
602 return x / by;
603 else
604 return (x + by - 1) / by;
605}
606
b8698a0f
L
607/* Given a CACHE_LINE_SIZE and two inductive memory references
608 with a common STEP greater than CACHE_LINE_SIZE and an address
609 difference DELTA, compute the probability that they will fall
610 in different cache lines. DISTINCT_ITERS is the number of
611 distinct iterations after which the pattern repeats itself.
2c6dd136
GS
612 ALIGN_UNIT is the unit of alignment in bytes. */
613
614static int
b8698a0f 615compute_miss_rate (unsigned HOST_WIDE_INT cache_line_size,
2c6dd136
GS
616 HOST_WIDE_INT step, HOST_WIDE_INT delta,
617 unsigned HOST_WIDE_INT distinct_iters,
618 int align_unit)
619{
620 unsigned align, iter;
621 int total_positions, miss_positions, miss_rate;
622 int address1, address2, cache_line1, cache_line2;
623
624 total_positions = 0;
625 miss_positions = 0;
b8698a0f 626
2c6dd136
GS
627 /* Iterate through all possible alignments of the first
628 memory reference within its cache line. */
629 for (align = 0; align < cache_line_size; align += align_unit)
630
631 /* Iterate through all distinct iterations. */
632 for (iter = 0; iter < distinct_iters; iter++)
633 {
634 address1 = align + step * iter;
635 address2 = address1 + delta;
636 cache_line1 = address1 / cache_line_size;
637 cache_line2 = address2 / cache_line_size;
638 total_positions += 1;
639 if (cache_line1 != cache_line2)
640 miss_positions += 1;
641 }
642 miss_rate = 1000 * miss_positions / total_positions;
643 return miss_rate;
644}
645
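/* A worked example of the function above, with purely illustrative
   arguments: compute_miss_rate (64, 128, 32, 1, 4) samples the 16 alignments
   0, 4, ..., 60 of the first reference within its 64-byte line.  The second
   address is 32 bytes further, so it falls into a different line for the
   8 alignments >= 32, giving a miss rate of 1000 * 8 / 16 = 500 per
   thousand -- far above the default ACCEPTABLE_MISS_RATE of 50, so the two
   references would not be considered to share a cache line.  */
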
b076a3fd
ZD
646/* Prune the prefetch candidate REF using the reuse with BY.
647 If BY_IS_BEFORE is true, BY is before REF in the loop. */
648
649static void
650prune_ref_by_group_reuse (struct mem_ref *ref, struct mem_ref *by,
651 bool by_is_before)
652{
653 HOST_WIDE_INT step = ref->group->step;
654 bool backward = step < 0;
655 HOST_WIDE_INT delta_r = ref->delta, delta_b = by->delta;
656 HOST_WIDE_INT delta = delta_b - delta_r;
657 HOST_WIDE_INT hit_from;
658 unsigned HOST_WIDE_INT prefetch_before, prefetch_block;
2c6dd136
GS
659 int miss_rate;
660 HOST_WIDE_INT reduced_step;
661 unsigned HOST_WIDE_INT reduced_prefetch_block;
662 tree ref_type;
663 int align_unit;
b076a3fd
ZD
664
665 if (delta == 0)
666 {
      /* If the references have the same address, only prefetch the
	 former.  */
669 if (by_is_before)
670 ref->prefetch_before = 0;
b8698a0f 671
b076a3fd
ZD
672 return;
673 }
674
675 if (!step)
676 {
677 /* If the reference addresses are invariant and fall into the
678 same cache line, prefetch just the first one. */
679 if (!by_is_before)
680 return;
681
682 if (ddown (ref->delta, PREFETCH_BLOCK)
683 != ddown (by->delta, PREFETCH_BLOCK))
684 return;
685
686 ref->prefetch_before = 0;
687 return;
688 }
689
690 /* Only prune the reference that is behind in the array. */
691 if (backward)
692 {
693 if (delta > 0)
694 return;
695
696 /* Transform the data so that we may assume that the accesses
697 are forward. */
698 delta = - delta;
699 step = -step;
700 delta_r = PREFETCH_BLOCK - 1 - delta_r;
701 delta_b = PREFETCH_BLOCK - 1 - delta_b;
702 }
703 else
704 {
705 if (delta < 0)
706 return;
707 }
708
  /* Check whether the two references are likely to hit the same cache
     line, and how distant the iterations in which this occurs are from
     each other.  */
712
713 if (step <= PREFETCH_BLOCK)
714 {
715 /* The accesses are sure to meet. Let us check when. */
716 hit_from = ddown (delta_b, PREFETCH_BLOCK) * PREFETCH_BLOCK;
717 prefetch_before = (hit_from - delta_r + step - 1) / step;
718
719 if (prefetch_before < ref->prefetch_before)
720 ref->prefetch_before = prefetch_before;
721
722 return;
723 }
724
b8698a0f 725 /* A more complicated case with step > prefetch_block. First reduce
2c6dd136 726 the ratio between the step and the cache line size to its simplest
b8698a0f
L
727 terms. The resulting denominator will then represent the number of
728 distinct iterations after which each address will go back to its
729 initial location within the cache line. This computation assumes
2c6dd136 730 that PREFETCH_BLOCK is a power of two. */
b076a3fd 731 prefetch_block = PREFETCH_BLOCK;
2c6dd136
GS
732 reduced_prefetch_block = prefetch_block;
733 reduced_step = step;
734 while ((reduced_step & 1) == 0
735 && reduced_prefetch_block > 1)
b076a3fd 736 {
2c6dd136
GS
737 reduced_step >>= 1;
738 reduced_prefetch_block >>= 1;
b076a3fd
ZD
739 }
740
b076a3fd
ZD
741 prefetch_before = delta / step;
742 delta %= step;
2c6dd136
GS
743 ref_type = TREE_TYPE (ref->mem);
744 align_unit = TYPE_ALIGN (ref_type) / 8;
b8698a0f 745 miss_rate = compute_miss_rate(prefetch_block, step, delta,
2c6dd136
GS
746 reduced_prefetch_block, align_unit);
747 if (miss_rate <= ACCEPTABLE_MISS_RATE)
b076a3fd
ZD
748 {
749 if (prefetch_before < ref->prefetch_before)
750 ref->prefetch_before = prefetch_before;
751
752 return;
753 }
754
755 /* Try also the following iteration. */
756 prefetch_before++;
757 delta = step - delta;
b8698a0f 758 miss_rate = compute_miss_rate(prefetch_block, step, delta,
2c6dd136 759 reduced_prefetch_block, align_unit);
b8698a0f 760 if (miss_rate <= ACCEPTABLE_MISS_RATE)
b076a3fd
ZD
761 {
762 if (prefetch_before < ref->prefetch_before)
763 ref->prefetch_before = prefetch_before;
764
765 return;
766 }
767
  /* The reference REF probably does not reuse BY.  */
769 return;
770}
771
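/* For illustration (hypothetical values): assume PREFETCH_BLOCK is 64 and
   both references advance by step 8, with REF at delta 0 and BY at delta 160.
   The accesses are sure to meet: hit_from is ddown (160, 64) * 64 = 128, so
   prefetch_before becomes (128 - 0 + 8 - 1) / 8 = 16, i.e. after the first
   16 iterations the cache lines REF needs have already been touched by BY
   and REF need not be prefetched any more.  */
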
772/* Prune the prefetch candidate REF using the reuses with other references
773 in REFS. */
774
775static void
776prune_ref_by_reuse (struct mem_ref *ref, struct mem_ref *refs)
777{
778 struct mem_ref *prune_by;
779 bool before = true;
780
781 prune_ref_by_self_reuse (ref);
782
783 for (prune_by = refs; prune_by; prune_by = prune_by->next)
784 {
785 if (prune_by == ref)
786 {
787 before = false;
788 continue;
789 }
790
791 if (!WRITE_CAN_USE_READ_PREFETCH
792 && ref->write_p
793 && !prune_by->write_p)
794 continue;
795 if (!READ_CAN_USE_WRITE_PREFETCH
796 && !ref->write_p
797 && prune_by->write_p)
798 continue;
799
800 prune_ref_by_group_reuse (ref, prune_by, before);
801 }
802}
803
804/* Prune the prefetch candidates in GROUP using the reuse analysis. */
805
806static void
807prune_group_by_reuse (struct mem_ref_group *group)
808{
809 struct mem_ref *ref_pruned;
810
811 for (ref_pruned = group->refs; ref_pruned; ref_pruned = ref_pruned->next)
812 {
813 prune_ref_by_reuse (ref_pruned, group->refs);
814
815 if (dump_file && (dump_flags & TDF_DETAILS))
816 {
817 fprintf (dump_file, "Reference %p:", (void *) ref_pruned);
818
819 if (ref_pruned->prefetch_before == PREFETCH_ALL
820 && ref_pruned->prefetch_mod == 1)
821 fprintf (dump_file, " no restrictions");
822 else if (ref_pruned->prefetch_before == 0)
823 fprintf (dump_file, " do not prefetch");
824 else if (ref_pruned->prefetch_before <= ref_pruned->prefetch_mod)
825 fprintf (dump_file, " prefetch once");
826 else
827 {
828 if (ref_pruned->prefetch_before != PREFETCH_ALL)
829 {
830 fprintf (dump_file, " prefetch before ");
831 fprintf (dump_file, HOST_WIDE_INT_PRINT_DEC,
832 ref_pruned->prefetch_before);
833 }
834 if (ref_pruned->prefetch_mod != 1)
835 {
836 fprintf (dump_file, " prefetch mod ");
837 fprintf (dump_file, HOST_WIDE_INT_PRINT_DEC,
838 ref_pruned->prefetch_mod);
839 }
840 }
841 fprintf (dump_file, "\n");
842 }
843 }
844}
845
846/* Prune the list of prefetch candidates GROUPS using the reuse analysis. */
847
848static void
849prune_by_reuse (struct mem_ref_group *groups)
850{
851 for (; groups; groups = groups->next)
852 prune_group_by_reuse (groups);
853}
854
855/* Returns true if we should issue prefetch for REF. */
856
857static bool
858should_issue_prefetch_p (struct mem_ref *ref)
859{
  /* For now do not issue prefetches for references whose prefetch would be
     useful only in the first few iterations.  */
862 if (ref->prefetch_before != PREFETCH_ALL)
863 return false;
864
79f5e442
ZD
865 /* Do not prefetch nontemporal stores. */
866 if (ref->storent_p)
867 return false;
868
b076a3fd
ZD
869 return true;
870}
871
872/* Decide which of the prefetch candidates in GROUPS to prefetch.
873 AHEAD is the number of iterations to prefetch ahead (which corresponds
874 to the number of simultaneous instances of one prefetch running at a
   time).  UNROLL_FACTOR is the factor by which the loop is going to be
   unrolled.  Returns true if there is anything to prefetch.  */
877
878static bool
879schedule_prefetches (struct mem_ref_group *groups, unsigned unroll_factor,
880 unsigned ahead)
881{
911b3fdb
ZD
882 unsigned remaining_prefetch_slots, n_prefetches, prefetch_slots;
883 unsigned slots_per_prefetch;
b076a3fd
ZD
884 struct mem_ref *ref;
885 bool any = false;
886
911b3fdb
ZD
887 /* At most SIMULTANEOUS_PREFETCHES should be running at the same time. */
888 remaining_prefetch_slots = SIMULTANEOUS_PREFETCHES;
b076a3fd 889
911b3fdb
ZD
890 /* The prefetch will run for AHEAD iterations of the original loop, i.e.,
891 AHEAD / UNROLL_FACTOR iterations of the unrolled loop. In each iteration,
892 it will need a prefetch slot. */
893 slots_per_prefetch = (ahead + unroll_factor / 2) / unroll_factor;
b076a3fd 894 if (dump_file && (dump_flags & TDF_DETAILS))
911b3fdb
ZD
895 fprintf (dump_file, "Each prefetch instruction takes %u prefetch slots.\n",
896 slots_per_prefetch);
b076a3fd
ZD
897
898 /* For now we just take memory references one by one and issue
899 prefetches for as many as possible. The groups are sorted
900 starting with the largest step, since the references with
c0220ea4 901 large step are more likely to cause many cache misses. */
b076a3fd
ZD
902
903 for (; groups; groups = groups->next)
904 for (ref = groups->refs; ref; ref = ref->next)
905 {
906 if (!should_issue_prefetch_p (ref))
907 continue;
908
911b3fdb
ZD
909 /* If we need to prefetch the reference each PREFETCH_MOD iterations,
910 and we unroll the loop UNROLL_FACTOR times, we need to insert
911 ceil (UNROLL_FACTOR / PREFETCH_MOD) instructions in each
912 iteration. */
b076a3fd
ZD
913 n_prefetches = ((unroll_factor + ref->prefetch_mod - 1)
914 / ref->prefetch_mod);
911b3fdb
ZD
915 prefetch_slots = n_prefetches * slots_per_prefetch;
916
917 /* If more than half of the prefetches would be lost anyway, do not
918 issue the prefetch. */
919 if (2 * remaining_prefetch_slots < prefetch_slots)
920 continue;
921
922 ref->issue_prefetch_p = true;
b076a3fd 923
911b3fdb
ZD
924 if (remaining_prefetch_slots <= prefetch_slots)
925 return true;
926 remaining_prefetch_slots -= prefetch_slots;
b076a3fd
ZD
927 any = true;
928 }
929
930 return any;
931}
932
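/* A small example of the slot accounting above (illustrative numbers): with
   ahead = 8 and unroll_factor = 4, each prefetch occupies
   slots_per_prefetch = (8 + 4 / 2) / 4 = 2 slots, and a reference with
   prefetch_mod = 2 needs ceil (4 / 2) = 2 prefetch insns per unrolled
   iteration, i.e. 2 * 2 = 4 of the SIMULTANEOUS_PREFETCHES slots.  */
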
db34470d 933/* Estimate the number of prefetches in the given GROUPS. */
b076a3fd 934
db34470d
GS
935static int
936estimate_prefetch_count (struct mem_ref_group *groups)
b076a3fd
ZD
937{
938 struct mem_ref *ref;
db34470d 939 int prefetch_count = 0;
b076a3fd
ZD
940
941 for (; groups; groups = groups->next)
942 for (ref = groups->refs; ref; ref = ref->next)
943 if (should_issue_prefetch_p (ref))
db34470d 944 prefetch_count++;
b076a3fd 945
db34470d 946 return prefetch_count;
b076a3fd
ZD
947}
948
/* Issue prefetches for the reference REF into the loop as decided before.
   AHEAD is the number of iterations to prefetch ahead.  UNROLL_FACTOR
   is the factor by which LOOP was unrolled.  */
b076a3fd
ZD
952
953static void
954issue_prefetch_ref (struct mem_ref *ref, unsigned unroll_factor, unsigned ahead)
955{
956 HOST_WIDE_INT delta;
726a989a
RB
957 tree addr, addr_base, write_p, local;
958 gimple prefetch;
959 gimple_stmt_iterator bsi;
b076a3fd 960 unsigned n_prefetches, ap;
5417e022 961 bool nontemporal = ref->reuse_distance >= L2_CACHE_SIZE_BYTES;
b076a3fd
ZD
962
963 if (dump_file && (dump_flags & TDF_DETAILS))
5417e022
ZD
964 fprintf (dump_file, "Issued%s prefetch for %p.\n",
965 nontemporal ? " nontemporal" : "",
966 (void *) ref);
b076a3fd 967
726a989a 968 bsi = gsi_for_stmt (ref->stmt);
b076a3fd
ZD
969
970 n_prefetches = ((unroll_factor + ref->prefetch_mod - 1)
971 / ref->prefetch_mod);
972 addr_base = build_fold_addr_expr_with_type (ref->mem, ptr_type_node);
726a989a
RB
973 addr_base = force_gimple_operand_gsi (&bsi, unshare_expr (addr_base),
974 true, NULL, true, GSI_SAME_STMT);
911b3fdb 975 write_p = ref->write_p ? integer_one_node : integer_zero_node;
5417e022 976 local = build_int_cst (integer_type_node, nontemporal ? 0 : 3);
b076a3fd
ZD
977
978 for (ap = 0; ap < n_prefetches; ap++)
979 {
980 /* Determine the address to prefetch. */
981 delta = (ahead + ap * ref->prefetch_mod) * ref->group->step;
5be014d5
AP
982 addr = fold_build2 (POINTER_PLUS_EXPR, ptr_type_node,
983 addr_base, size_int (delta));
726a989a
RB
984 addr = force_gimple_operand_gsi (&bsi, unshare_expr (addr), true, NULL,
985 true, GSI_SAME_STMT);
b076a3fd
ZD
986
987 /* Create the prefetch instruction. */
726a989a
RB
988 prefetch = gimple_build_call (built_in_decls[BUILT_IN_PREFETCH],
989 3, addr, write_p, local);
990 gsi_insert_before (&bsi, prefetch, GSI_SAME_STMT);
b076a3fd
ZD
991 }
992}
993
/* Issue prefetches for the references in GROUPS into the loop as decided
   before.  AHEAD is the number of iterations to prefetch ahead.
   UNROLL_FACTOR is the factor by which LOOP was unrolled.  */
997
998static void
999issue_prefetches (struct mem_ref_group *groups,
1000 unsigned unroll_factor, unsigned ahead)
1001{
1002 struct mem_ref *ref;
1003
1004 for (; groups; groups = groups->next)
1005 for (ref = groups->refs; ref; ref = ref->next)
1006 if (ref->issue_prefetch_p)
1007 issue_prefetch_ref (ref, unroll_factor, ahead);
1008}
1009
79f5e442
ZD
/* Returns true if REF is a memory write for which a nontemporal store insn
   can be used.  */
1012
1013static bool
1014nontemporal_store_p (struct mem_ref *ref)
1015{
1016 enum machine_mode mode;
1017 enum insn_code code;
1018
  /* REF must be a write that is not reused.  We require it to be independent
     of all other memory references in the loop, as the nontemporal stores may
     be reordered with respect to other memory references.  */
1022 if (!ref->write_p
1023 || !ref->independent_p
1024 || ref->reuse_distance < L2_CACHE_SIZE_BYTES)
1025 return false;
1026
1027 /* Check that we have the storent instruction for the mode. */
1028 mode = TYPE_MODE (TREE_TYPE (ref->mem));
1029 if (mode == BLKmode)
1030 return false;
1031
166cdb08 1032 code = optab_handler (storent_optab, mode)->insn_code;
79f5e442
ZD
1033 return code != CODE_FOR_nothing;
1034}
1035
1036/* If REF is a nontemporal store, we mark the corresponding modify statement
1037 and return true. Otherwise, we return false. */
1038
1039static bool
1040mark_nontemporal_store (struct mem_ref *ref)
1041{
1042 if (!nontemporal_store_p (ref))
1043 return false;
1044
1045 if (dump_file && (dump_flags & TDF_DETAILS))
1046 fprintf (dump_file, "Marked reference %p as a nontemporal store.\n",
1047 (void *) ref);
1048
726a989a 1049 gimple_assign_set_nontemporal_move (ref->stmt, true);
79f5e442
ZD
1050 ref->storent_p = true;
1051
1052 return true;
1053}
1054
1055/* Issue a memory fence instruction after LOOP. */
1056
1057static void
1058emit_mfence_after_loop (struct loop *loop)
1059{
1060 VEC (edge, heap) *exits = get_loop_exit_edges (loop);
1061 edge exit;
726a989a
RB
1062 gimple call;
1063 gimple_stmt_iterator bsi;
79f5e442
ZD
1064 unsigned i;
1065
1066 for (i = 0; VEC_iterate (edge, exits, i, exit); i++)
1067 {
726a989a 1068 call = gimple_build_call (FENCE_FOLLOWING_MOVNT, 0);
79f5e442
ZD
1069
1070 if (!single_pred_p (exit->dest)
1071 /* If possible, we prefer not to insert the fence on other paths
1072 in cfg. */
1073 && !(exit->flags & EDGE_ABNORMAL))
1074 split_loop_exit_edge (exit);
726a989a 1075 bsi = gsi_after_labels (exit->dest);
79f5e442 1076
726a989a 1077 gsi_insert_before (&bsi, call, GSI_NEW_STMT);
79f5e442
ZD
1078 mark_virtual_ops_for_renaming (call);
1079 }
1080
1081 VEC_free (edge, heap, exits);
1082 update_ssa (TODO_update_ssa_only_virtuals);
1083}
1084
1085/* Returns true if we can use storent in loop, false otherwise. */
1086
1087static bool
1088may_use_storent_in_loop_p (struct loop *loop)
1089{
1090 bool ret = true;
1091
1092 if (loop->inner != NULL)
1093 return false;
1094
1095 /* If we must issue a mfence insn after using storent, check that there
1096 is a suitable place for it at each of the loop exits. */
1097 if (FENCE_FOLLOWING_MOVNT != NULL_TREE)
1098 {
1099 VEC (edge, heap) *exits = get_loop_exit_edges (loop);
1100 unsigned i;
1101 edge exit;
1102
1103 for (i = 0; VEC_iterate (edge, exits, i, exit); i++)
1104 if ((exit->flags & EDGE_ABNORMAL)
1105 && exit->dest == EXIT_BLOCK_PTR)
1106 ret = false;
1107
1108 VEC_free (edge, heap, exits);
1109 }
1110
1111 return ret;
1112}
1113
1114/* Marks nontemporal stores in LOOP. GROUPS contains the description of memory
1115 references in the loop. */
1116
1117static void
1118mark_nontemporal_stores (struct loop *loop, struct mem_ref_group *groups)
1119{
1120 struct mem_ref *ref;
1121 bool any = false;
1122
1123 if (!may_use_storent_in_loop_p (loop))
1124 return;
1125
1126 for (; groups; groups = groups->next)
1127 for (ref = groups->refs; ref; ref = ref->next)
1128 any |= mark_nontemporal_store (ref);
1129
1130 if (any && FENCE_FOLLOWING_MOVNT != NULL_TREE)
1131 emit_mfence_after_loop (loop);
1132}
1133
b076a3fd
ZD
1134/* Determines whether we can profitably unroll LOOP FACTOR times, and if
1135 this is the case, fill in DESC by the description of number of
1136 iterations. */
1137
1138static bool
1139should_unroll_loop_p (struct loop *loop, struct tree_niter_desc *desc,
1140 unsigned factor)
1141{
1142 if (!can_unroll_loop_p (loop, factor, desc))
1143 return false;
1144
1145 /* We only consider loops without control flow for unrolling. This is not
1146 a hard restriction -- tree_unroll_loop works with arbitrary loops
1147 as well; but the unrolling/prefetching is usually more profitable for
1148 loops consisting of a single basic block, and we want to limit the
1149 code growth. */
1150 if (loop->num_nodes > 2)
1151 return false;
1152
1153 return true;
1154}
1155
/* Determine the coefficient by which to unroll LOOP, from the information
   contained in the list of memory references REFS.  A description of the
   number of iterations of LOOP is stored to DESC.  NINSNS is the number of
   insns of the LOOP.  EST_NITER is the estimated number of iterations of
   the loop, or -1 if no estimate is available.  */
b076a3fd
ZD
1161
1162static unsigned
1163determine_unroll_factor (struct loop *loop, struct mem_ref_group *refs,
2711355f
ZD
1164 unsigned ninsns, struct tree_niter_desc *desc,
1165 HOST_WIDE_INT est_niter)
b076a3fd 1166{
911b3fdb
ZD
1167 unsigned upper_bound;
1168 unsigned nfactor, factor, mod_constraint;
b076a3fd
ZD
1169 struct mem_ref_group *agp;
1170 struct mem_ref *ref;
1171
911b3fdb
ZD
1172 /* First check whether the loop is not too large to unroll. We ignore
1173 PARAM_MAX_UNROLL_TIMES, because for small loops, it prevented us
1174 from unrolling them enough to make exactly one cache line covered by each
1175 iteration. Also, the goal of PARAM_MAX_UNROLL_TIMES is to prevent
1176 us from unrolling the loops too many times in cases where we only expect
1177 gains from better scheduling and decreasing loop overhead, which is not
1178 the case here. */
1179 upper_bound = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / ninsns;
2711355f
ZD
1180
  /* If we unrolled the loop more times than it iterates, the unrolled version
     of the loop would never be entered.  */
1183 if (est_niter >= 0 && est_niter < (HOST_WIDE_INT) upper_bound)
1184 upper_bound = est_niter;
1185
911b3fdb 1186 if (upper_bound <= 1)
b076a3fd
ZD
1187 return 1;
1188
911b3fdb
ZD
  /* Choose the factor so that we may prefetch each cache line just once,
     but bound the unrolling by UPPER_BOUND.  */
1191 factor = 1;
b076a3fd
ZD
1192 for (agp = refs; agp; agp = agp->next)
1193 for (ref = agp->refs; ref; ref = ref->next)
911b3fdb
ZD
1194 if (should_issue_prefetch_p (ref))
1195 {
1196 mod_constraint = ref->prefetch_mod;
1197 nfactor = least_common_multiple (mod_constraint, factor);
1198 if (nfactor <= upper_bound)
1199 factor = nfactor;
1200 }
b076a3fd
ZD
1201
1202 if (!should_unroll_loop_p (loop, desc, factor))
1203 return 1;
1204
1205 return factor;
1206}
1207
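/* For example (illustrative only): if the prefetched references have
   prefetch_mod 2 and 3, the factor grows to lcm (2, 1) = 2 and then to
   lcm (3, 2) = 6, provided 6 does not exceed the upper bound derived from
   PARAM_MAX_UNROLLED_INSNS and the estimated trip count; each unrolled
   iteration then issues a whole number of prefetches for every reference.  */
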
5417e022
ZD
1208/* Returns the total volume of the memory references REFS, taking into account
1209 reuses in the innermost loop and cache line size. TODO -- we should also
1210 take into account reuses across the iterations of the loops in the loop
1211 nest. */
1212
1213static unsigned
1214volume_of_references (struct mem_ref_group *refs)
1215{
1216 unsigned volume = 0;
1217 struct mem_ref_group *gr;
1218 struct mem_ref *ref;
1219
1220 for (gr = refs; gr; gr = gr->next)
1221 for (ref = gr->refs; ref; ref = ref->next)
1222 {
1223 /* Almost always reuses another value? */
1224 if (ref->prefetch_before != PREFETCH_ALL)
1225 continue;
1226
1227 /* If several iterations access the same cache line, use the size of
1228 the line divided by this number. Otherwise, a cache line is
1229 accessed in each iteration. TODO -- in the latter case, we should
1230 take the size of the reference into account, rounding it up on cache
1231 line size multiple. */
1232 volume += L1_CACHE_LINE_SIZE / ref->prefetch_mod;
1233 }
1234 return volume;
1235}
1236
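/* For instance (assuming an illustrative 64-byte L1 cache line), a reference
   with prefetch_mod 4 touches a new line only every 4 iterations and thus
   contributes 64 / 4 = 16 bytes to the per-iteration volume, while a
   reference with prefetch_mod 1 contributes a full 64 bytes.  */
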
1237/* Returns the volume of memory references accessed across VEC iterations of
1238 loops, whose sizes are described in the LOOP_SIZES array. N is the number
1239 of the loops in the nest (length of VEC and LOOP_SIZES vectors). */
1240
1241static unsigned
1242volume_of_dist_vector (lambda_vector vec, unsigned *loop_sizes, unsigned n)
1243{
1244 unsigned i;
1245
1246 for (i = 0; i < n; i++)
1247 if (vec[i] != 0)
1248 break;
1249
1250 if (i == n)
1251 return 0;
1252
1253 gcc_assert (vec[i] > 0);
1254
1255 /* We ignore the parts of the distance vector in subloops, since usually
1256 the numbers of iterations are much smaller. */
1257 return loop_sizes[i] * vec[i];
1258}
1259
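/* For example (illustrative values): with LOOP_SIZES = { 8192, 64 } and a
   distance vector of { 2, 0 }, the first nonzero entry belongs to the outer
   loop, so roughly 8192 * 2 = 16384 bytes are accessed between the two
   occurrences of the reference.  */
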
1260/* Add the steps of ACCESS_FN multiplied by STRIDE to the array STRIDE
1261 at the position corresponding to the loop of the step. N is the depth
1262 of the considered loop nest, and, LOOP is its innermost loop. */
1263
1264static void
1265add_subscript_strides (tree access_fn, unsigned stride,
1266 HOST_WIDE_INT *strides, unsigned n, struct loop *loop)
1267{
1268 struct loop *aloop;
1269 tree step;
1270 HOST_WIDE_INT astep;
1271 unsigned min_depth = loop_depth (loop) - n;
1272
1273 while (TREE_CODE (access_fn) == POLYNOMIAL_CHREC)
1274 {
1275 aloop = get_chrec_loop (access_fn);
1276 step = CHREC_RIGHT (access_fn);
1277 access_fn = CHREC_LEFT (access_fn);
1278
1279 if ((unsigned) loop_depth (aloop) <= min_depth)
1280 continue;
1281
1282 if (host_integerp (step, 0))
1283 astep = tree_low_cst (step, 0);
1284 else
1285 astep = L1_CACHE_LINE_SIZE;
1286
1287 strides[n - 1 - loop_depth (loop) + loop_depth (aloop)] += astep * stride;
1288
1289 }
1290}
1291
1292/* Returns the volume of memory references accessed between two consecutive
1293 self-reuses of the reference DR. We consider the subscripts of DR in N
1294 loops, and LOOP_SIZES contains the volumes of accesses in each of the
1295 loops. LOOP is the innermost loop of the current loop nest. */
1296
1297static unsigned
1298self_reuse_distance (data_reference_p dr, unsigned *loop_sizes, unsigned n,
1299 struct loop *loop)
1300{
1301 tree stride, access_fn;
1302 HOST_WIDE_INT *strides, astride;
1303 VEC (tree, heap) *access_fns;
1304 tree ref = DR_REF (dr);
1305 unsigned i, ret = ~0u;
1306
1307 /* In the following example:
1308
1309 for (i = 0; i < N; i++)
1310 for (j = 0; j < N; j++)
1311 use (a[j][i]);
1312 the same cache line is accessed each N steps (except if the change from
1313 i to i + 1 crosses the boundary of the cache line). Thus, for self-reuse,
1314 we cannot rely purely on the results of the data dependence analysis.
1315
     Instead, we compute the stride of the reference in each loop, and consider
     the innermost loop in which the stride is less than the cache line size.  */
1318
1319 strides = XCNEWVEC (HOST_WIDE_INT, n);
1320 access_fns = DR_ACCESS_FNS (dr);
1321
1322 for (i = 0; VEC_iterate (tree, access_fns, i, access_fn); i++)
1323 {
1324 /* Keep track of the reference corresponding to the subscript, so that we
1325 know its stride. */
1326 while (handled_component_p (ref) && TREE_CODE (ref) != ARRAY_REF)
1327 ref = TREE_OPERAND (ref, 0);
b8698a0f 1328
5417e022
ZD
1329 if (TREE_CODE (ref) == ARRAY_REF)
1330 {
1331 stride = TYPE_SIZE_UNIT (TREE_TYPE (ref));
1332 if (host_integerp (stride, 1))
1333 astride = tree_low_cst (stride, 1);
1334 else
1335 astride = L1_CACHE_LINE_SIZE;
1336
1337 ref = TREE_OPERAND (ref, 0);
1338 }
1339 else
1340 astride = 1;
1341
1342 add_subscript_strides (access_fn, astride, strides, n, loop);
1343 }
1344
1345 for (i = n; i-- > 0; )
1346 {
1347 unsigned HOST_WIDE_INT s;
1348
1349 s = strides[i] < 0 ? -strides[i] : strides[i];
1350
1351 if (s < (unsigned) L1_CACHE_LINE_SIZE
1352 && (loop_sizes[i]
1353 > (unsigned) (L1_CACHE_SIZE_BYTES / NONTEMPORAL_FRACTION)))
1354 {
1355 ret = loop_sizes[i];
1356 break;
1357 }
1358 }
1359
1360 free (strides);
1361 return ret;
1362}
1363
1364/* Determines the distance till the first reuse of each reference in REFS
79f5e442
ZD
1365 in the loop nest of LOOP. NO_OTHER_REFS is true if there are no other
1366 memory references in the loop. */
5417e022
ZD
1367
1368static void
79f5e442
ZD
1369determine_loop_nest_reuse (struct loop *loop, struct mem_ref_group *refs,
1370 bool no_other_refs)
5417e022
ZD
1371{
1372 struct loop *nest, *aloop;
1373 VEC (data_reference_p, heap) *datarefs = NULL;
1374 VEC (ddr_p, heap) *dependences = NULL;
1375 struct mem_ref_group *gr;
79f5e442 1376 struct mem_ref *ref, *refb;
5417e022
ZD
1377 VEC (loop_p, heap) *vloops = NULL;
1378 unsigned *loop_data_size;
1379 unsigned i, j, n;
1380 unsigned volume, dist, adist;
1381 HOST_WIDE_INT vol;
1382 data_reference_p dr;
1383 ddr_p dep;
1384
1385 if (loop->inner)
1386 return;
1387
1388 /* Find the outermost loop of the loop nest of loop (we require that
1389 there are no sibling loops inside the nest). */
1390 nest = loop;
1391 while (1)
1392 {
1393 aloop = loop_outer (nest);
1394
1395 if (aloop == current_loops->tree_root
1396 || aloop->inner->next)
1397 break;
1398
1399 nest = aloop;
1400 }
1401
1402 /* For each loop, determine the amount of data accessed in each iteration.
1403 We use this to estimate whether the reference is evicted from the
1404 cache before its reuse. */
1405 find_loop_nest (nest, &vloops);
1406 n = VEC_length (loop_p, vloops);
1407 loop_data_size = XNEWVEC (unsigned, n);
1408 volume = volume_of_references (refs);
1409 i = n;
1410 while (i-- != 0)
1411 {
1412 loop_data_size[i] = volume;
1413 /* Bound the volume by the L2 cache size, since above this bound,
1414 all dependence distances are equivalent. */
1415 if (volume > L2_CACHE_SIZE_BYTES)
1416 continue;
1417
1418 aloop = VEC_index (loop_p, vloops, i);
1419 vol = estimated_loop_iterations_int (aloop, false);
1420 if (vol < 0)
1421 vol = expected_loop_iterations (aloop);
1422 volume *= vol;
1423 }
1424
1425 /* Prepare the references in the form suitable for data dependence
0d52bcc1 1426 analysis. We ignore unanalyzable data references (the results
5417e022
ZD
1427 are used just as a heuristics to estimate temporality of the
1428 references, hence we do not need to worry about correctness). */
1429 for (gr = refs; gr; gr = gr->next)
1430 for (ref = gr->refs; ref; ref = ref->next)
1431 {
1432 dr = create_data_ref (nest, ref->mem, ref->stmt, !ref->write_p);
1433
1434 if (dr)
1435 {
1436 ref->reuse_distance = volume;
1437 dr->aux = ref;
1438 VEC_safe_push (data_reference_p, heap, datarefs, dr);
1439 }
79f5e442
ZD
1440 else
1441 no_other_refs = false;
5417e022
ZD
1442 }
1443
1444 for (i = 0; VEC_iterate (data_reference_p, datarefs, i, dr); i++)
1445 {
1446 dist = self_reuse_distance (dr, loop_data_size, n, loop);
3d9a9f94 1447 ref = (struct mem_ref *) dr->aux;
5417e022
ZD
1448 if (ref->reuse_distance > dist)
1449 ref->reuse_distance = dist;
79f5e442
ZD
1450
1451 if (no_other_refs)
1452 ref->independent_p = true;
5417e022
ZD
1453 }
1454
1455 compute_all_dependences (datarefs, &dependences, vloops, true);
1456
1457 for (i = 0; VEC_iterate (ddr_p, dependences, i, dep); i++)
1458 {
1459 if (DDR_ARE_DEPENDENT (dep) == chrec_known)
1460 continue;
1461
3d9a9f94
KG
1462 ref = (struct mem_ref *) DDR_A (dep)->aux;
1463 refb = (struct mem_ref *) DDR_B (dep)->aux;
79f5e442 1464
5417e022
ZD
1465 if (DDR_ARE_DEPENDENT (dep) == chrec_dont_know
1466 || DDR_NUM_DIST_VECTS (dep) == 0)
1467 {
0d52bcc1 1468 /* If the dependence cannot be analyzed, assume that there might be
5417e022
ZD
1469 a reuse. */
1470 dist = 0;
b8698a0f 1471
79f5e442
ZD
1472 ref->independent_p = false;
1473 refb->independent_p = false;
5417e022
ZD
1474 }
1475 else
1476 {
0d52bcc1 1477 /* The distance vectors are normalized to be always lexicographically
5417e022
ZD
1478 positive, hence we cannot tell just from them whether DDR_A comes
1479 before DDR_B or vice versa. However, it is not important,
1480 anyway -- if DDR_A is close to DDR_B, then it is either reused in
1481 DDR_B (and it is not nontemporal), or it reuses the value of DDR_B
1482 in cache (and marking it as nontemporal would not affect
1483 anything). */
1484
1485 dist = volume;
1486 for (j = 0; j < DDR_NUM_DIST_VECTS (dep); j++)
1487 {
1488 adist = volume_of_dist_vector (DDR_DIST_VECT (dep, j),
1489 loop_data_size, n);
1490
79f5e442
ZD
1491 /* If this is a dependence in the innermost loop (i.e., the
1492 distances in all superloops are zero) and it is not
1493 the trivial self-dependence with distance zero, record that
1494 the references are not completely independent. */
1495 if (lambda_vector_zerop (DDR_DIST_VECT (dep, j), n - 1)
1496 && (ref != refb
1497 || DDR_DIST_VECT (dep, j)[n-1] != 0))
1498 {
1499 ref->independent_p = false;
1500 refb->independent_p = false;
1501 }
1502
5417e022
ZD
1503 /* Ignore accesses closer than
1504 L1_CACHE_SIZE_BYTES / NONTEMPORAL_FRACTION,
1505 so that we use nontemporal prefetches e.g. if single memory
1506 location is accessed several times in a single iteration of
1507 the loop. */
1508 if (adist < L1_CACHE_SIZE_BYTES / NONTEMPORAL_FRACTION)
1509 continue;
1510
1511 if (adist < dist)
1512 dist = adist;
1513 }
1514 }
1515
5417e022
ZD
1516 if (ref->reuse_distance > dist)
1517 ref->reuse_distance = dist;
79f5e442
ZD
1518 if (refb->reuse_distance > dist)
1519 refb->reuse_distance = dist;
5417e022
ZD
1520 }
1521
1522 free_dependence_relations (dependences);
1523 free_data_refs (datarefs);
1524 free (loop_data_size);
1525
1526 if (dump_file && (dump_flags & TDF_DETAILS))
1527 {
1528 fprintf (dump_file, "Reuse distances:\n");
1529 for (gr = refs; gr; gr = gr->next)
1530 for (ref = gr->refs; ref; ref = ref->next)
1531 fprintf (dump_file, " ref %p distance %u\n",
1532 (void *) ref, ref->reuse_distance);
1533 }
1534}
1535
db34470d
GS
1536/* Do a cost-benefit analysis to determine if prefetching is profitable
1537 for the current loop given the following parameters:
1538 AHEAD: the iteration ahead distance,
b8698a0f 1539 EST_NITER: the estimated trip count,
db34470d
GS
1540 NINSNS: estimated number of instructions in the loop,
1541 PREFETCH_COUNT: an estimate of the number of prefetches
1542 MEM_REF_COUNT: total number of memory references in the loop. */
1543
b8698a0f
L
1544static bool
1545is_loop_prefetching_profitable (unsigned ahead, HOST_WIDE_INT est_niter,
1546 unsigned ninsns, unsigned prefetch_count,
ccacf0e1 1547 unsigned mem_ref_count, unsigned unroll_factor)
db34470d
GS
1548{
1549 int insn_to_mem_ratio, insn_to_prefetch_ratio;
1550
1551 if (mem_ref_count == 0)
1552 return false;
1553
b8698a0f
L
1554 /* Prefetching improves performance by overlapping cache missing
1555 memory accesses with CPU operations. If the loop does not have
1556 enough CPU operations to overlap with memory operations, prefetching
1557 won't give a significant benefit. One approximate way of checking
1558 this is to require the ratio of instructions to memory references to
db34470d
GS
1559 be above a certain limit. This approximation works well in practice.
1560 TODO: Implement a more precise computation by estimating the time
1561 for each CPU or memory op in the loop. Time estimates for memory ops
1562 should account for cache misses. */
b8698a0f 1563 insn_to_mem_ratio = ninsns / mem_ref_count;
db34470d
GS
1564
1565 if (insn_to_mem_ratio < PREFETCH_MIN_INSN_TO_MEM_RATIO)
55e5a2eb
CF
1566 {
1567 if (dump_file && (dump_flags & TDF_DETAILS))
1568 fprintf (dump_file,
1569 "Not prefetching -- instruction to memory reference ratio (%d) too small\n",
1570 insn_to_mem_ratio);
1571 return false;
1572 }
db34470d
GS
1573
1574 /* Profitability of prefetching is highly dependent on the trip count.
b8698a0f
L
1575 For a given AHEAD distance, the first AHEAD iterations do not benefit
1576 from prefetching, and the last AHEAD iterations execute useless
db34470d
GS
1577 prefetches. So, if the trip count is not large enough relative to AHEAD,
1578 prefetching may cause serious performance degradation. To avoid this
b8698a0f 1579 problem when the trip count is not known at compile time, we
db34470d 1580 conservatively skip loops with high prefetching costs. For now, only
b8698a0f 1581 the I-cache cost is considered. The relative I-cache cost is estimated
db34470d
GS
1582 by taking the ratio between the number of prefetches and the total
1583 number of instructions. Since we are using integer arithmetic, we
b8698a0f 1584 compute the reciprocal of this ratio.
ccacf0e1
CF
1585 (unroll_factor * ninsns) is used to estimate the number of instructions in
1586 the unrolled loop. This implementation is a bit simplistic -- the number
1587 of issued prefetch instructions is also affected by unrolling. So,
1588 prefetch_mod and the unroll factor should be taken into account when
1589 determining prefetch_count. Also, the number of insns of the unrolled
1590 loop will usually be significantly smaller than the number of insns of the
1591 original loop * unroll_factor (at least the induction variable increases
1592 and the exit branches will get eliminated), so it might be better to use
1593 tree_estimate_loop_size + estimated_unrolled_size. */
db34470d
GS
1594 if (est_niter < 0)
1595 {
ccacf0e1 1596 insn_to_prefetch_ratio = (unroll_factor * ninsns) / prefetch_count;
db34470d
GS
1597 return insn_to_prefetch_ratio >= MIN_INSN_TO_PREFETCH_RATIO;
1598 }
b8698a0f 1599
9bf4598b 1600 if (est_niter < (HOST_WIDE_INT) (TRIP_COUNT_TO_AHEAD_RATIO * ahead))
db34470d
GS
1601 {
1602 if (dump_file && (dump_flags & TDF_DETAILS))
1603 fprintf (dump_file,
1604 "Not prefetching -- loop estimated to roll only %d times\n",
1605 (int) est_niter);
1606 return false;
1607 }
1608 return true;
1609}
1610
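/* A numeric sketch of the checks above (illustrative values, not the
   parameter defaults): a loop with ninsns = 30 and mem_ref_count = 6 has
   insn_to_mem_ratio = 5; if its trip count is unknown and unroll_factor = 4
   with prefetch_count = 8, then insn_to_prefetch_ratio = (4 * 30) / 8 = 15.
   These ratios are compared against PREFETCH_MIN_INSN_TO_MEM_RATIO and
   MIN_INSN_TO_PREFETCH_RATIO respectively.  */
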
1611
b076a3fd 1612/* Issue prefetch instructions for array references in LOOP. Returns
d73be268 1613 true if the LOOP was unrolled. */
b076a3fd
ZD
1614
1615static bool
d73be268 1616loop_prefetch_arrays (struct loop *loop)
b076a3fd
ZD
1617{
1618 struct mem_ref_group *refs;
2711355f
ZD
1619 unsigned ahead, ninsns, time, unroll_factor;
1620 HOST_WIDE_INT est_niter;
b076a3fd 1621 struct tree_niter_desc desc;
79f5e442 1622 bool unrolled = false, no_other_refs;
db34470d
GS
1623 unsigned prefetch_count;
1624 unsigned mem_ref_count;
b076a3fd 1625
efd8f750 1626 if (optimize_loop_nest_for_size_p (loop))
2732d767
ZD
1627 {
1628 if (dump_file && (dump_flags & TDF_DETAILS))
1629 fprintf (dump_file, " ignored (cold area)\n");
1630 return false;
1631 }
1632
b076a3fd 1633 /* Step 1: gather the memory references. */
db34470d 1634 refs = gather_memory_references (loop, &no_other_refs, &mem_ref_count);
b076a3fd
ZD
1635
1636 /* Step 2: estimate the reuse effects. */
1637 prune_by_reuse (refs);
1638
db34470d
GS
1639 prefetch_count = estimate_prefetch_count (refs);
1640 if (prefetch_count == 0)
b076a3fd
ZD
1641 goto fail;
1642
79f5e442 1643 determine_loop_nest_reuse (loop, refs, no_other_refs);
5417e022 1644
b076a3fd
ZD
1645 /* Step 3: determine the ahead and unroll factor. */
1646
2711355f
ZD
1647 /* FIXME: the time should be weighted by the probabilities of the blocks in
1648 the loop body. */
1649 time = tree_num_loop_insns (loop, &eni_time_weights);
1650 ahead = (PREFETCH_LATENCY + time - 1) / time;
b8698a0f 1651 est_niter = estimated_loop_iterations_int (loop, false);
79f5e442 1652
2711355f
ZD
1653 ninsns = tree_num_loop_insns (loop, &eni_size_weights);
1654 unroll_factor = determine_unroll_factor (loop, refs, ninsns, &desc,
1655 est_niter);
1656 if (dump_file && (dump_flags & TDF_DETAILS))
b8698a0f 1657 fprintf (dump_file, "Ahead %d, unroll factor %d, trip count "
d81f5387 1658 HOST_WIDE_INT_PRINT_DEC "\n"
b8698a0f
L
1659 "insn count %d, mem ref count %d, prefetch count %d\n",
1660 ahead, unroll_factor, est_niter,
1661 ninsns, mem_ref_count, prefetch_count);
db34470d 1662
ccacf0e1
CF
1663 if (!is_loop_prefetching_profitable (ahead, est_niter, ninsns, prefetch_count,
1664 mem_ref_count, unroll_factor))
db34470d
GS
1665 goto fail;
1666
1667 mark_nontemporal_stores (loop, refs);
2711355f 1668
b076a3fd
ZD
1669 /* Step 4: what to prefetch? */
1670 if (!schedule_prefetches (refs, unroll_factor, ahead))
1671 goto fail;
1672
1673 /* Step 5: unroll the loop. TODO -- peeling of first and last few
1674 iterations so that we do not issue superfluous prefetches. */
1675 if (unroll_factor != 1)
1676 {
d73be268 1677 tree_unroll_loop (loop, unroll_factor,
b076a3fd
ZD
1678 single_dom_exit (loop), &desc);
1679 unrolled = true;
1680 }
1681
1682 /* Step 6: issue the prefetches. */
1683 issue_prefetches (refs, unroll_factor, ahead);
1684
1685fail:
1686 release_mem_refs (refs);
1687 return unrolled;
1688}
1689
d73be268 1690/* Issue prefetch instructions for array references in loops. */
b076a3fd 1691
c7f965b6 1692unsigned int
d73be268 1693tree_ssa_prefetch_arrays (void)
b076a3fd 1694{
42fd6772 1695 loop_iterator li;
b076a3fd
ZD
1696 struct loop *loop;
1697 bool unrolled = false;
c7f965b6 1698 int todo_flags = 0;
b076a3fd
ZD
1699
1700 if (!HAVE_prefetch
      /* It is possible to ask the compiler for, say, -mtune=i486 -march=pentium4.
	 -mtune=i486 causes us to have PREFETCH_BLOCK 0, since this is part
	 of the processor costs and i486 does not have prefetch, but
	 -march=pentium4 causes HAVE_prefetch to be true.  Ugh.  */
1705 || PREFETCH_BLOCK == 0)
c7f965b6 1706 return 0;
b076a3fd 1707
47eb5b32
ZD
1708 if (dump_file && (dump_flags & TDF_DETAILS))
1709 {
1710 fprintf (dump_file, "Prefetching parameters:\n");
1711 fprintf (dump_file, " simultaneous prefetches: %d\n",
1712 SIMULTANEOUS_PREFETCHES);
1713 fprintf (dump_file, " prefetch latency: %d\n", PREFETCH_LATENCY);
47eb5b32 1714 fprintf (dump_file, " prefetch block size: %d\n", PREFETCH_BLOCK);
46cb0441
ZD
1715 fprintf (dump_file, " L1 cache size: %d lines, %d kB\n",
1716 L1_CACHE_SIZE_BYTES / L1_CACHE_LINE_SIZE, L1_CACHE_SIZE);
5417e022 1717 fprintf (dump_file, " L1 cache line size: %d\n", L1_CACHE_LINE_SIZE);
b8698a0f
L
1718 fprintf (dump_file, " L2 cache size: %d kB\n", L2_CACHE_SIZE);
1719 fprintf (dump_file, " min insn-to-prefetch ratio: %d \n",
db34470d 1720 MIN_INSN_TO_PREFETCH_RATIO);
b8698a0f 1721 fprintf (dump_file, " min insn-to-mem ratio: %d \n",
db34470d 1722 PREFETCH_MIN_INSN_TO_MEM_RATIO);
47eb5b32
ZD
1723 fprintf (dump_file, "\n");
1724 }
1725
b076a3fd
ZD
1726 initialize_original_copy_tables ();
1727
1728 if (!built_in_decls[BUILT_IN_PREFETCH])
1729 {
1730 tree type = build_function_type (void_type_node,
1731 tree_cons (NULL_TREE,
1732 const_ptr_type_node,
1733 NULL_TREE));
c79efc4d
RÁE
1734 tree decl = add_builtin_function ("__builtin_prefetch", type,
1735 BUILT_IN_PREFETCH, BUILT_IN_NORMAL,
1736 NULL, NULL_TREE);
b076a3fd
ZD
1737 DECL_IS_NOVOPS (decl) = true;
1738 built_in_decls[BUILT_IN_PREFETCH] = decl;
1739 }
1740
1741 /* We assume that size of cache line is a power of two, so verify this
1742 here. */
1743 gcc_assert ((PREFETCH_BLOCK & (PREFETCH_BLOCK - 1)) == 0);
1744
42fd6772 1745 FOR_EACH_LOOP (li, loop, LI_FROM_INNERMOST)
b076a3fd 1746 {
b076a3fd
ZD
1747 if (dump_file && (dump_flags & TDF_DETAILS))
1748 fprintf (dump_file, "Processing loop %d:\n", loop->num);
1749
d73be268 1750 unrolled |= loop_prefetch_arrays (loop);
b076a3fd
ZD
1751
1752 if (dump_file && (dump_flags & TDF_DETAILS))
1753 fprintf (dump_file, "\n\n");
1754 }
1755
1756 if (unrolled)
1757 {
1758 scev_reset ();
c7f965b6 1759 todo_flags |= TODO_cleanup_cfg;
b076a3fd
ZD
1760 }
1761
1762 free_original_copy_tables ();
c7f965b6 1763 return todo_flags;
b076a3fd 1764}