]>
Commit | Line | Data |
---|---|---|
13e08dc9 | 1 | /* Loop versioning pass. |
a945c346 | 2 | Copyright (C) 2018-2024 Free Software Foundation, Inc. |
13e08dc9 RS |
3 | |
4 | This file is part of GCC. | |
5 | ||
6 | GCC is free software; you can redistribute it and/or modify it | |
7 | under the terms of the GNU General Public License as published by the | |
8 | Free Software Foundation; either version 3, or (at your option) any | |
9 | later version. | |
10 | ||
11 | GCC is distributed in the hope that it will be useful, but WITHOUT | |
12 | ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or | |
13 | FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License | |
14 | for more details. | |
15 | ||
16 | You should have received a copy of the GNU General Public License | |
17 | along with GCC; see the file COPYING3. If not see | |
18 | <http://www.gnu.org/licenses/>. */ | |
19 | ||
20 | #include "config.h" | |
21 | #include "system.h" | |
22 | #include "coretypes.h" | |
23 | #include "backend.h" | |
24 | #include "tree.h" | |
25 | #include "gimple.h" | |
26 | #include "gimple-iterator.h" | |
27 | #include "tree-pass.h" | |
28 | #include "gimplify-me.h" | |
29 | #include "cfgloop.h" | |
30 | #include "tree-ssa-loop.h" | |
31 | #include "ssa.h" | |
32 | #include "tree-scalar-evolution.h" | |
13e08dc9 RS |
33 | #include "tree-ssa-loop-ivopts.h" |
34 | #include "fold-const.h" | |
35 | #include "tree-ssa-propagate.h" | |
36 | #include "tree-inline.h" | |
37 | #include "domwalk.h" | |
13e08dc9 RS |
38 | #include "tree-vectorizer.h" |
39 | #include "omp-general.h" | |
40 | #include "predict.h" | |
41 | #include "tree-into-ssa.h" | |
6165cf6b AH |
42 | #include "gimple-range.h" |
43 | #include "tree-cfg.h" | |
13e08dc9 RS |
44 | |
45 | namespace { | |
46 | ||
47 | /* This pass looks for loops that could be simplified if certain loop | |
48 | invariant conditions were true. It is effectively a form of loop | |
49 | splitting in which the pass produces the split conditions itself, | |
50 | instead of using ones that are already present in the IL. | |
51 | ||
52 | Versioning for when strides are 1 | |
53 | --------------------------------- | |
54 | ||
55 | At the moment the only thing the pass looks for are memory references | |
56 | like: | |
57 | ||
58 | for (auto i : ...) | |
59 | ...x[i * stride]... | |
60 | ||
61 | It considers changing such loops to: | |
62 | ||
63 | if (stride == 1) | |
64 | for (auto i : ...) [A] | |
65 | ...x[i]... | |
66 | else | |
67 | for (auto i : ...) [B] | |
68 | ...x[i * stride]... | |
69 | ||
70 | This can have several benefits: | |
71 | ||
72 | (1) [A] is often easier or cheaper to vectorize than [B]. | |
73 | ||
74 | (2) The scalar code in [A] is simpler than the scalar code in [B] | |
75 | (if the loops cannot be vectorized or need an epilogue loop). | |
76 | ||
77 | (3) We might recognize [A] as a pattern, such as a memcpy or memset. | |
78 | ||
79 | (4) [A] has simpler address evolutions, which can help other passes | |
80 | like loop interchange. | |
81 | ||
82 | The optimization is particularly useful for assumed-shape arrays in | |
83 | Fortran, where the stride of the innermost dimension depends on the | |
84 | array descriptor but is often equal to 1 in practice. For example: | |
85 | ||
86 | subroutine f1(x) | |
87 | real :: x(:) | |
88 | x(:) = 100 | |
89 | end subroutine f1 | |
90 | ||
91 | generates the equivalent of: | |
92 | ||
93 | raw_stride = *x.dim[0].stride; | |
94 | stride = raw_stride != 0 ? raw_stride : 1; | |
95 | x_base = *x.data; | |
96 | ... | |
97 | tmp1 = stride * S; | |
98 | tmp2 = tmp1 - stride; | |
99 | *x_base[tmp2] = 1.0e+2; | |
100 | ||
101 | but in the common case that stride == 1, the last three statements | |
102 | simplify to: | |
103 | ||
104 | tmp3 = S + -1; | |
105 | *x_base[tmp3] = 1.0e+2; | |
106 | ||
107 | The optimization is in principle very simple. The difficult parts are: | |
108 | ||
109 | (a) deciding which parts of a general address calculation correspond | |
110 | to the inner dimension of an array, since this usually isn't explicit | |
111 | in the IL, and for C often isn't even explicit in the source code | |
112 | ||
113 | (b) estimating when the transformation is worthwhile | |
114 | ||
115 | Structure | |
116 | --------- | |
117 | ||
118 | The pass has four phases: | |
119 | ||
120 | (1) Walk through the statements looking for and recording potential | |
121 | versioning opportunities. Stop if there are none. | |
122 | ||
123 | (2) Use context-sensitive range information to see whether any versioning | |
124 | conditions are impossible in practice. Remove them if so, and stop | |
125 | if no opportunities remain. | |
126 | ||
127 | (We do this only after (1) to keep compile time down when no | |
128 | versioning opportunities exist.) | |
129 | ||
130 | (3) Apply the cost model. Decide which versioning opportunities are | |
131 | worthwhile and at which nesting level they should be applied. | |
132 | ||
133 | (4) Attempt to version all the loops selected by (3), so that: | |
134 | ||
135 | for (...) | |
136 | ... | |
137 | ||
138 | becomes: | |
139 | ||
140 | if (!cond) | |
141 | for (...) // Original loop | |
142 | ... | |
143 | else | |
144 | for (...) // New loop | |
145 | ... | |
146 | ||
147 | Use the version condition COND to simplify the new loop. */ | |
148 | ||
149 | /* Enumerates the likelihood that a particular value indexes the inner | |
150 | dimension of an array. */ | |
151 | enum inner_likelihood { | |
152 | INNER_UNLIKELY, | |
153 | INNER_DONT_KNOW, | |
154 | INNER_LIKELY | |
155 | }; | |
156 | ||
157 | /* Information about one term of an address_info. */ | |
158 | struct address_term_info | |
159 | { | |
160 | /* The value of the term is EXPR * MULTIPLIER. */ | |
161 | tree expr; | |
162 | unsigned HOST_WIDE_INT multiplier; | |
163 | ||
164 | /* The stride applied by EXPR in each iteration of some unrecorded loop, | |
165 | or null if no stride has been identified. */ | |
166 | tree stride; | |
167 | ||
168 | /* Enumerates the likelihood that EXPR indexes the inner dimension | |
169 | of an array. */ | |
170 | enum inner_likelihood inner_likelihood; | |
171 | ||
172 | /* True if STRIDE == 1 is a versioning opportunity when considered | |
173 | in isolation. */ | |
174 | bool versioning_opportunity_p; | |
175 | }; | |
176 | ||
177 | /* Information about an address calculation, and the range of constant | |
178 | offsets applied to it. */ | |
6c1dae73 | 179 | class address_info |
13e08dc9 | 180 | { |
6c1dae73 | 181 | public: |
13e08dc9 RS |
182 | static const unsigned int MAX_TERMS = 8; |
183 | ||
184 | /* One statement that calculates the address. If multiple statements | |
185 | share the same address, we only record the first. */ | |
186 | gimple *stmt; | |
187 | ||
188 | /* The loop containing STMT (cached for convenience). If multiple | |
189 | statements share the same address, they all belong to this loop. */ | |
99b1c316 | 190 | class loop *loop; |
13e08dc9 RS |
191 | |
192 | /* A decomposition of the calculation into a sum of terms plus an | |
193 | optional base. When BASE is provided, it is never an SSA name. | |
194 | Once initialization is complete, all members of TERMs are SSA names. */ | |
195 | tree base; | |
196 | auto_vec<address_term_info, MAX_TERMS> terms; | |
197 | ||
198 | /* All bytes accessed from the address fall in the offset range | |
199 | [MIN_OFFSET, MAX_OFFSET). */ | |
200 | HOST_WIDE_INT min_offset, max_offset; | |
201 | }; | |
202 | ||
203 | /* Stores addresses based on their base and terms (ignoring the offsets). */ | |
204 | struct address_info_hasher : nofree_ptr_hash <address_info> | |
205 | { | |
206 | static hashval_t hash (const address_info *); | |
207 | static bool equal (const address_info *, const address_info *); | |
208 | }; | |
209 | ||
210 | /* Information about the versioning we'd like to apply to a loop. */ | |
6c1dae73 | 211 | class loop_info |
13e08dc9 | 212 | { |
6c1dae73 | 213 | public: |
13e08dc9 RS |
214 | bool worth_versioning_p () const; |
215 | ||
216 | /* True if we've decided not to version this loop. The remaining | |
217 | fields are meaningless if so. */ | |
218 | bool rejected_p; | |
219 | ||
220 | /* True if at least one subloop of this loop benefits from versioning. */ | |
221 | bool subloops_benefit_p; | |
222 | ||
223 | /* An estimate of the total number of instructions in the loop, | |
224 | excluding those in subloops that benefit from versioning. */ | |
225 | unsigned int num_insns; | |
226 | ||
227 | /* The outermost loop that can handle all the version checks | |
228 | described below. */ | |
99b1c316 | 229 | class loop *outermost; |
13e08dc9 RS |
230 | |
231 | /* The first entry in the list of blocks that belong to this loop | |
232 | (and not to subloops). m_next_block_in_loop provides the chain | |
233 | pointers for the list. */ | |
234 | basic_block block_list; | |
235 | ||
236 | /* We'd like to version the loop for the case in which these SSA names | |
237 | (keyed off their SSA_NAME_VERSION) are all equal to 1 at runtime. */ | |
238 | bitmap_head unity_names; | |
239 | ||
240 | /* If versioning succeeds, this points the version of the loop that | |
241 | assumes the version conditions holds. */ | |
99b1c316 | 242 | class loop *optimized_loop; |
13e08dc9 RS |
243 | }; |
244 | ||
245 | /* The main pass structure. */ | |
246 | class loop_versioning | |
247 | { | |
248 | public: | |
249 | loop_versioning (function *); | |
250 | ~loop_versioning (); | |
251 | unsigned int run (); | |
252 | ||
253 | private: | |
254 | /* Used to walk the dominator tree to find loop versioning conditions | |
255 | that are always false. */ | |
256 | class lv_dom_walker : public dom_walker | |
257 | { | |
258 | public: | |
259 | lv_dom_walker (loop_versioning &); | |
260 | ||
ff171cb1 | 261 | edge before_dom_children (basic_block) final override; |
13e08dc9 RS |
262 | |
263 | private: | |
264 | /* The parent pass. */ | |
265 | loop_versioning &m_lv; | |
13e08dc9 RS |
266 | }; |
267 | ||
268 | /* Used to simplify statements based on conditions that are established | |
269 | by the version checks. */ | |
270 | class name_prop : public substitute_and_fold_engine | |
271 | { | |
272 | public: | |
273 | name_prop (loop_info &li) : m_li (li) {} | |
ff171cb1 | 274 | tree value_of_expr (tree name, gimple *) final override; |
13e08dc9 RS |
275 | |
276 | private: | |
277 | /* Information about the versioning we've performed on the loop. */ | |
278 | loop_info &m_li; | |
279 | }; | |
280 | ||
99b1c316 | 281 | loop_info &get_loop_info (class loop *loop) { return m_loops[loop->num]; } |
13e08dc9 | 282 | |
99b1c316 | 283 | unsigned int max_insns_for_loop (class loop *); |
13e08dc9 RS |
284 | bool expensive_stmt_p (gimple *); |
285 | ||
286 | void version_for_unity (gimple *, tree); | |
287 | bool acceptable_multiplier_p (tree, unsigned HOST_WIDE_INT, | |
288 | unsigned HOST_WIDE_INT * = 0); | |
289 | bool acceptable_type_p (tree, unsigned HOST_WIDE_INT *); | |
290 | bool multiply_term_by (address_term_info &, tree); | |
291 | inner_likelihood get_inner_likelihood (tree, unsigned HOST_WIDE_INT); | |
e047844b | 292 | void dump_inner_likelihood (address_info &, address_term_info &); |
13e08dc9 | 293 | void analyze_stride (address_info &, address_term_info &, |
99b1c316 | 294 | tree, class loop *); |
13e08dc9 | 295 | bool find_per_loop_multiplication (address_info &, address_term_info &); |
e047844b RS |
296 | bool analyze_term_using_scevs (address_info &, address_term_info &); |
297 | void analyze_arbitrary_term (address_info &, address_term_info &); | |
13e08dc9 RS |
298 | void analyze_address_fragment (address_info &); |
299 | void record_address_fragment (gimple *, unsigned HOST_WIDE_INT, | |
300 | tree, unsigned HOST_WIDE_INT, HOST_WIDE_INT); | |
301 | void analyze_expr (gimple *, tree); | |
302 | bool analyze_block (basic_block); | |
303 | bool analyze_blocks (); | |
304 | ||
6165cf6b | 305 | void prune_loop_conditions (class loop *); |
13e08dc9 RS |
306 | bool prune_conditions (); |
307 | ||
99b1c316 MS |
308 | void merge_loop_info (class loop *, class loop *); |
309 | void add_loop_to_queue (class loop *); | |
310 | bool decide_whether_loop_is_versionable (class loop *); | |
13e08dc9 RS |
311 | bool make_versioning_decisions (); |
312 | ||
99b1c316 | 313 | bool version_loop (class loop *); |
13e08dc9 RS |
314 | void implement_versioning_decisions (); |
315 | ||
316 | /* The function we're optimizing. */ | |
317 | function *m_fn; | |
318 | ||
319 | /* The obstack to use for all pass-specific bitmaps. */ | |
320 | bitmap_obstack m_bitmap_obstack; | |
321 | ||
322 | /* An obstack to use for general allocation. */ | |
323 | obstack m_obstack; | |
324 | ||
13e08dc9 RS |
325 | /* The total number of loop version conditions we've found. */ |
326 | unsigned int m_num_conditions; | |
327 | ||
328 | /* Assume that an address fragment of the form i * stride * scale | |
329 | (for variable stride and constant scale) will not benefit from | |
330 | versioning for stride == 1 when scale is greater than this value. */ | |
331 | unsigned HOST_WIDE_INT m_maximum_scale; | |
332 | ||
333 | /* Information about each loop. */ | |
334 | auto_vec<loop_info> m_loops; | |
335 | ||
336 | /* Used to form a linked list of blocks that belong to a loop, | |
337 | started by loop_info::block_list. */ | |
338 | auto_vec<basic_block> m_next_block_in_loop; | |
339 | ||
340 | /* The list of loops that we've decided to version. */ | |
99b1c316 | 341 | auto_vec<class loop *> m_loops_to_version; |
13e08dc9 RS |
342 | |
343 | /* A table of addresses in the current loop, keyed off their values | |
344 | but not their offsets. */ | |
345 | hash_table <address_info_hasher> m_address_table; | |
346 | ||
347 | /* A list of all addresses in M_ADDRESS_TABLE, in a predictable order. */ | |
348 | auto_vec <address_info *, 32> m_address_list; | |
349 | }; | |
350 | ||
351 | /* If EXPR is an SSA name and not a default definition, return the | |
352 | defining statement, otherwise return null. */ | |
353 | ||
354 | static gimple * | |
355 | maybe_get_stmt (tree expr) | |
356 | { | |
357 | if (TREE_CODE (expr) == SSA_NAME && !SSA_NAME_IS_DEFAULT_DEF (expr)) | |
358 | return SSA_NAME_DEF_STMT (expr); | |
359 | return NULL; | |
360 | } | |
361 | ||
362 | /* Like maybe_get_stmt, but also return null if the defining | |
363 | statement isn't an assignment. */ | |
364 | ||
365 | static gassign * | |
366 | maybe_get_assign (tree expr) | |
367 | { | |
368 | return safe_dyn_cast <gassign *> (maybe_get_stmt (expr)); | |
369 | } | |
370 | ||
371 | /* Return true if this pass should look through a cast of expression FROM | |
372 | to type TYPE when analyzing pieces of an address. */ | |
373 | ||
374 | static bool | |
375 | look_through_cast_p (tree type, tree from) | |
376 | { | |
377 | return (INTEGRAL_TYPE_P (TREE_TYPE (from)) == INTEGRAL_TYPE_P (type) | |
378 | && POINTER_TYPE_P (TREE_TYPE (from)) == POINTER_TYPE_P (type)); | |
379 | } | |
380 | ||
381 | /* Strip all conversions of integers or pointers from EXPR, regardless | |
382 | of whether the conversions are nops. This is useful in the context | |
383 | of this pass because we're not trying to fold or simulate the | |
384 | expression; we just want to see how it's structured. */ | |
385 | ||
386 | static tree | |
387 | strip_casts (tree expr) | |
388 | { | |
389 | const unsigned int MAX_NITERS = 4; | |
390 | ||
391 | tree type = TREE_TYPE (expr); | |
392 | while (CONVERT_EXPR_P (expr) | |
393 | && look_through_cast_p (type, TREE_OPERAND (expr, 0))) | |
394 | expr = TREE_OPERAND (expr, 0); | |
395 | ||
396 | for (unsigned int niters = 0; niters < MAX_NITERS; ++niters) | |
397 | { | |
398 | gassign *assign = maybe_get_assign (expr); | |
399 | if (assign | |
400 | && CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (assign)) | |
401 | && look_through_cast_p (type, gimple_assign_rhs1 (assign))) | |
402 | expr = gimple_assign_rhs1 (assign); | |
403 | else | |
404 | break; | |
405 | } | |
406 | return expr; | |
407 | } | |
408 | ||
409 | /* Compare two address_term_infos in the same address_info. */ | |
410 | ||
411 | static int | |
412 | compare_address_terms (const void *a_uncast, const void *b_uncast) | |
413 | { | |
414 | const address_term_info *a = (const address_term_info *) a_uncast; | |
415 | const address_term_info *b = (const address_term_info *) b_uncast; | |
416 | ||
417 | if (a->expr != b->expr) | |
418 | return SSA_NAME_VERSION (a->expr) < SSA_NAME_VERSION (b->expr) ? -1 : 1; | |
419 | ||
420 | if (a->multiplier != b->multiplier) | |
421 | return a->multiplier < b->multiplier ? -1 : 1; | |
422 | ||
423 | return 0; | |
424 | } | |
425 | ||
426 | /* Dump ADDRESS using flags FLAGS. */ | |
427 | ||
428 | static void | |
429 | dump_address_info (dump_flags_t flags, address_info &address) | |
430 | { | |
431 | if (address.base) | |
432 | dump_printf (flags, "%T + ", address.base); | |
433 | for (unsigned int i = 0; i < address.terms.length (); ++i) | |
434 | { | |
435 | if (i != 0) | |
436 | dump_printf (flags, " + "); | |
437 | dump_printf (flags, "%T", address.terms[i].expr); | |
438 | if (address.terms[i].multiplier != 1) | |
439 | dump_printf (flags, " * %wd", address.terms[i].multiplier); | |
440 | } | |
441 | dump_printf (flags, " + [%wd, %wd]", | |
442 | address.min_offset, address.max_offset - 1); | |
443 | } | |
444 | ||
445 | /* Hash an address_info based on its base and terms. */ | |
446 | ||
447 | hashval_t | |
448 | address_info_hasher::hash (const address_info *info) | |
449 | { | |
450 | inchash::hash hash; | |
451 | hash.add_int (info->base ? TREE_CODE (info->base) : 0); | |
452 | hash.add_int (info->terms.length ()); | |
453 | for (unsigned int i = 0; i < info->terms.length (); ++i) | |
454 | { | |
455 | hash.add_int (SSA_NAME_VERSION (info->terms[i].expr)); | |
456 | hash.add_hwi (info->terms[i].multiplier); | |
457 | } | |
458 | return hash.end (); | |
459 | } | |
460 | ||
461 | /* Return true if two address_infos have equal bases and terms. Other | |
462 | properties might be different (such as the statement or constant | |
463 | offset range). */ | |
464 | ||
465 | bool | |
466 | address_info_hasher::equal (const address_info *a, const address_info *b) | |
467 | { | |
468 | if (a->base != b->base | |
469 | && (!a->base || !b->base || !operand_equal_p (a->base, b->base, 0))) | |
470 | return false; | |
471 | ||
472 | if (a->terms.length () != b->terms.length ()) | |
473 | return false; | |
474 | ||
475 | for (unsigned int i = 0; i < a->terms.length (); ++i) | |
476 | if (a->terms[i].expr != b->terms[i].expr | |
477 | || a->terms[i].multiplier != b->terms[i].multiplier) | |
478 | return false; | |
479 | ||
480 | return true; | |
481 | } | |
482 | ||
483 | /* Return true if we want to version the loop, i.e. if we have a | |
484 | specific reason for doing so and no specific reason not to. */ | |
485 | ||
486 | bool | |
487 | loop_info::worth_versioning_p () const | |
488 | { | |
489 | return (!rejected_p | |
490 | && (!bitmap_empty_p (&unity_names) || subloops_benefit_p)); | |
491 | } | |
492 | ||
493 | loop_versioning::lv_dom_walker::lv_dom_walker (loop_versioning &lv) | |
6165cf6b | 494 | : dom_walker (CDI_DOMINATORS), m_lv (lv) |
13e08dc9 RS |
495 | { |
496 | } | |
497 | ||
498 | /* Process BB before processing the blocks it dominates. */ | |
499 | ||
500 | edge | |
501 | loop_versioning::lv_dom_walker::before_dom_children (basic_block bb) | |
502 | { | |
13e08dc9 | 503 | if (bb == bb->loop_father->header) |
6165cf6b | 504 | m_lv.prune_loop_conditions (bb->loop_father); |
13e08dc9 RS |
505 | |
506 | return NULL; | |
507 | } | |
508 | ||
13e08dc9 RS |
509 | /* Decide whether to replace VAL with a new value in a versioned loop. |
510 | Return the new value if so, otherwise return null. */ | |
511 | ||
512 | tree | |
a889e06a | 513 | loop_versioning::name_prop::value_of_expr (tree val, gimple *) |
13e08dc9 RS |
514 | { |
515 | if (TREE_CODE (val) == SSA_NAME | |
516 | && bitmap_bit_p (&m_li.unity_names, SSA_NAME_VERSION (val))) | |
517 | return build_one_cst (TREE_TYPE (val)); | |
518 | return NULL_TREE; | |
519 | } | |
520 | ||
521 | /* Initialize the structure to optimize FN. */ | |
522 | ||
523 | loop_versioning::loop_versioning (function *fn) | |
524 | : m_fn (fn), | |
13e08dc9 RS |
525 | m_num_conditions (0), |
526 | m_address_table (31) | |
527 | { | |
ffb521f1 | 528 | unsigned m_nloops = number_of_loops (fn); |
13e08dc9 RS |
529 | bitmap_obstack_initialize (&m_bitmap_obstack); |
530 | gcc_obstack_init (&m_obstack); | |
531 | ||
532 | /* Initialize the loop information. */ | |
cb3874dc | 533 | m_loops.safe_grow_cleared (m_nloops, true); |
13e08dc9 RS |
534 | for (unsigned int i = 0; i < m_nloops; ++i) |
535 | { | |
536 | m_loops[i].outermost = get_loop (m_fn, 0); | |
537 | bitmap_initialize (&m_loops[i].unity_names, &m_bitmap_obstack); | |
538 | } | |
539 | ||
540 | /* Initialize the list of blocks that belong to each loop. */ | |
541 | unsigned int nbbs = last_basic_block_for_fn (fn); | |
cb3874dc | 542 | m_next_block_in_loop.safe_grow (nbbs, true); |
13e08dc9 RS |
543 | basic_block bb; |
544 | FOR_EACH_BB_FN (bb, fn) | |
545 | { | |
546 | loop_info &li = get_loop_info (bb->loop_father); | |
547 | m_next_block_in_loop[bb->index] = li.block_list; | |
548 | li.block_list = bb; | |
549 | } | |
550 | ||
551 | /* MAX_FIXED_MODE_SIZE should be a reasonable maximum scale for | |
552 | unvectorizable code, since it is the largest size that can be | |
553 | handled efficiently by scalar code. omp_max_vf calculates the | |
554 | maximum number of bytes in a vector, when such a value is relevant | |
555 | to loop optimization. */ | |
556 | m_maximum_scale = estimated_poly_value (omp_max_vf ()); | |
557 | m_maximum_scale = MAX (m_maximum_scale, MAX_FIXED_MODE_SIZE); | |
558 | } | |
559 | ||
560 | loop_versioning::~loop_versioning () | |
561 | { | |
562 | bitmap_obstack_release (&m_bitmap_obstack); | |
563 | obstack_free (&m_obstack, NULL); | |
564 | } | |
565 | ||
566 | /* Return the maximum number of instructions allowed in LOOP before | |
567 | it becomes too big for versioning. | |
568 | ||
569 | There are separate limits for inner and outer loops. The limit for | |
570 | inner loops applies only to loops that benefit directly from versioning. | |
571 | The limit for outer loops applies to all code in the outer loop and | |
572 | its subloops that *doesn't* benefit directly from versioning; such code | |
573 | would be "taken along for the ride". The idea is that if the cost of | |
574 | the latter is small, it is better to version outer loops rather than | |
575 | inner loops, both to reduce the number of repeated checks and to enable | |
576 | more of the loop nest to be optimized as a natural nest (e.g. by loop | |
577 | interchange or outer-loop vectorization). */ | |
578 | ||
579 | unsigned int | |
99b1c316 | 580 | loop_versioning::max_insns_for_loop (class loop *loop) |
13e08dc9 RS |
581 | { |
582 | return (loop->inner | |
028d4092 ML |
583 | ? param_loop_versioning_max_outer_insns |
584 | : param_loop_versioning_max_inner_insns); | |
13e08dc9 RS |
585 | } |
586 | ||
587 | /* Return true if for cost reasons we should avoid versioning any loop | |
588 | that contains STMT. | |
589 | ||
590 | Note that we don't need to check whether versioning is invalid for | |
591 | correctness reasons, since the versioning process does that for us. | |
592 | The conditions involved are too rare to be worth duplicating here. */ | |
593 | ||
594 | bool | |
595 | loop_versioning::expensive_stmt_p (gimple *stmt) | |
596 | { | |
597 | if (gcall *call = dyn_cast <gcall *> (stmt)) | |
598 | /* Assume for now that the time spent in an "expensive" call would | |
599 | overwhelm any saving from versioning. */ | |
600 | return !gimple_inexpensive_call_p (call); | |
601 | return false; | |
602 | } | |
603 | ||
604 | /* Record that we want to version the loop that contains STMT for the | |
605 | case in which SSA name NAME is equal to 1. We already know that NAME | |
606 | is invariant in the loop. */ | |
607 | ||
608 | void | |
609 | loop_versioning::version_for_unity (gimple *stmt, tree name) | |
610 | { | |
99b1c316 | 611 | class loop *loop = loop_containing_stmt (stmt); |
13e08dc9 RS |
612 | loop_info &li = get_loop_info (loop); |
613 | ||
614 | if (bitmap_set_bit (&li.unity_names, SSA_NAME_VERSION (name))) | |
615 | { | |
616 | /* This is the first time we've wanted to version LOOP for NAME. | |
617 | Keep track of the outermost loop that can handle all versioning | |
618 | checks in LI. */ | |
99b1c316 | 619 | class loop *outermost |
13e08dc9 RS |
620 | = outermost_invariant_loop_for_expr (loop, name); |
621 | if (loop_depth (li.outermost) < loop_depth (outermost)) | |
622 | li.outermost = outermost; | |
623 | ||
624 | if (dump_enabled_p ()) | |
625 | { | |
626 | dump_printf_loc (MSG_NOTE, stmt, "want to version containing loop" | |
627 | " for when %T == 1", name); | |
628 | if (outermost == loop) | |
629 | dump_printf (MSG_NOTE, "; cannot hoist check further"); | |
630 | else | |
631 | { | |
632 | dump_printf (MSG_NOTE, "; could implement the check at loop" | |
633 | " depth %d", loop_depth (outermost)); | |
634 | if (loop_depth (li.outermost) > loop_depth (outermost)) | |
635 | dump_printf (MSG_NOTE, ", but other checks only allow" | |
636 | " a depth of %d", loop_depth (li.outermost)); | |
637 | } | |
638 | dump_printf (MSG_NOTE, "\n"); | |
639 | } | |
640 | ||
641 | m_num_conditions += 1; | |
642 | } | |
643 | else | |
644 | { | |
645 | /* This is a duplicate request. */ | |
646 | if (dump_enabled_p ()) | |
647 | dump_printf_loc (MSG_NOTE, stmt, "already asked to version containing" | |
648 | " loop for when %T == 1\n", name); | |
649 | } | |
650 | } | |
651 | ||
652 | /* Return true if OP1_TREE is constant and if in principle it is worth | |
653 | versioning an address fragment of the form: | |
654 | ||
655 | i * OP1_TREE * OP2 * stride | |
656 | ||
657 | for the case in which stride == 1. This in practice means testing | |
658 | whether: | |
659 | ||
660 | OP1_TREE * OP2 <= M_MAXIMUM_SCALE. | |
661 | ||
662 | If RESULT is nonnull, store OP1_TREE * OP2 there when returning true. */ | |
663 | ||
664 | bool | |
665 | loop_versioning::acceptable_multiplier_p (tree op1_tree, | |
666 | unsigned HOST_WIDE_INT op2, | |
667 | unsigned HOST_WIDE_INT *result) | |
668 | { | |
669 | if (tree_fits_uhwi_p (op1_tree)) | |
670 | { | |
671 | unsigned HOST_WIDE_INT op1 = tree_to_uhwi (op1_tree); | |
672 | /* The first part checks for overflow. */ | |
673 | if (op1 * op2 >= op2 && op1 * op2 <= m_maximum_scale) | |
674 | { | |
675 | if (result) | |
676 | *result = op1 * op2; | |
677 | return true; | |
678 | } | |
679 | } | |
680 | return false; | |
681 | } | |
682 | ||
683 | /* Return true if it is worth using loop versioning on a memory access | |
684 | of type TYPE. Store the size of the access in *SIZE if so. */ | |
685 | ||
686 | bool | |
687 | loop_versioning::acceptable_type_p (tree type, unsigned HOST_WIDE_INT *size) | |
688 | { | |
689 | return (TYPE_SIZE_UNIT (type) | |
690 | && acceptable_multiplier_p (TYPE_SIZE_UNIT (type), 1, size)); | |
691 | } | |
692 | ||
693 | /* See whether OP is constant and whether we can multiply TERM by that | |
694 | constant without exceeding M_MAXIMUM_SCALE. Return true and update | |
695 | TERM if so. */ | |
696 | ||
697 | bool | |
698 | loop_versioning::multiply_term_by (address_term_info &term, tree op) | |
699 | { | |
700 | return acceptable_multiplier_p (op, term.multiplier, &term.multiplier); | |
701 | } | |
702 | ||
703 | /* Decide whether an address fragment of the form STRIDE * MULTIPLIER | |
704 | is likely to be indexing an innermost dimension, returning the result | |
705 | as an INNER_* probability. */ | |
706 | ||
707 | inner_likelihood | |
708 | loop_versioning::get_inner_likelihood (tree stride, | |
709 | unsigned HOST_WIDE_INT multiplier) | |
710 | { | |
711 | const unsigned int MAX_NITERS = 8; | |
712 | ||
713 | /* Iterate over possible values of STRIDE. Return INNER_LIKELY if at | |
714 | least one of those values is likely to be for the innermost dimension. | |
715 | Record in UNLIKELY_P if at least one of those values is unlikely to be | |
716 | for the innermost dimension. | |
717 | ||
718 | E.g. for: | |
719 | ||
720 | stride = cond ? a * b : 1 | |
721 | ||
722 | we should treat STRIDE as being a likely inner dimension, since | |
723 | we know that it is 1 under at least some circumstances. (See the | |
724 | Fortran example below.) However: | |
725 | ||
726 | stride = a * b | |
727 | ||
728 | on its own is unlikely to be for the innermost dimension, since | |
729 | that would require both a and b to be 1 at runtime. */ | |
730 | bool unlikely_p = false; | |
731 | tree worklist[MAX_NITERS]; | |
732 | unsigned int length = 0; | |
733 | worklist[length++] = stride; | |
734 | for (unsigned int i = 0; i < length; ++i) | |
735 | { | |
736 | tree expr = worklist[i]; | |
737 | ||
738 | if (CONSTANT_CLASS_P (expr)) | |
739 | { | |
740 | /* See if EXPR * MULTIPLIER would be consistent with an individual | |
741 | access or a small grouped access. */ | |
742 | if (acceptable_multiplier_p (expr, multiplier)) | |
743 | return INNER_LIKELY; | |
744 | else | |
745 | unlikely_p = true; | |
746 | } | |
747 | else if (gimple *stmt = maybe_get_stmt (expr)) | |
748 | { | |
749 | /* If EXPR is set by a PHI node, queue its arguments in case | |
750 | we find one that is consistent with an inner dimension. | |
751 | ||
752 | An important instance of this is the Fortran handling of array | |
753 | descriptors, which calculates the stride of the inner dimension | |
754 | using a PHI equivalent of: | |
755 | ||
756 | raw_stride = a.dim[0].stride; | |
757 | stride = raw_stride != 0 ? raw_stride : 1; | |
758 | ||
759 | (Strides for outer dimensions do not treat 0 specially.) */ | |
760 | if (gphi *phi = dyn_cast <gphi *> (stmt)) | |
761 | { | |
762 | unsigned int nargs = gimple_phi_num_args (phi); | |
763 | for (unsigned int j = 0; j < nargs && length < MAX_NITERS; ++j) | |
764 | worklist[length++] = strip_casts (gimple_phi_arg_def (phi, j)); | |
765 | } | |
766 | /* If the value is set by an assignment, expect it to be read | |
767 | from memory (such as an array descriptor) rather than be | |
768 | calculated. */ | |
769 | else if (gassign *assign = dyn_cast <gassign *> (stmt)) | |
770 | { | |
771 | if (!gimple_assign_load_p (assign)) | |
772 | unlikely_p = true; | |
773 | } | |
774 | /* Things like calls don't really tell us anything. */ | |
775 | } | |
776 | } | |
777 | ||
778 | /* We didn't find any possible values of STRIDE that were likely to be | |
779 | for the innermost dimension. If we found one that was actively | |
780 | unlikely to be for the innermost dimension, assume that that applies | |
781 | to STRIDE too. */ | |
782 | return unlikely_p ? INNER_UNLIKELY : INNER_DONT_KNOW; | |
783 | } | |
784 | ||
e047844b RS |
785 | /* Dump the likelihood that TERM's stride is for the innermost dimension. |
786 | ADDRESS is the address that contains TERM. */ | |
787 | ||
788 | void | |
789 | loop_versioning::dump_inner_likelihood (address_info &address, | |
790 | address_term_info &term) | |
791 | { | |
792 | if (term.inner_likelihood == INNER_LIKELY) | |
793 | dump_printf_loc (MSG_NOTE, address.stmt, "%T is likely to be the" | |
794 | " innermost dimension\n", term.stride); | |
795 | else if (term.inner_likelihood == INNER_UNLIKELY) | |
796 | dump_printf_loc (MSG_NOTE, address.stmt, "%T is probably not the" | |
797 | " innermost dimension\n", term.stride); | |
798 | else | |
799 | dump_printf_loc (MSG_NOTE, address.stmt, "cannot tell whether %T" | |
800 | " is the innermost dimension\n", term.stride); | |
801 | } | |
802 | ||
13e08dc9 RS |
803 | /* The caller has identified that STRIDE is the stride of interest |
804 | in TERM, and that the stride is applied in OP_LOOP. Record this | |
805 | information in TERM, deciding whether STRIDE is likely to be for | |
806 | the innermost dimension of an array and whether it represents a | |
807 | versioning opportunity. ADDRESS is the address that contains TERM. */ | |
808 | ||
809 | void | |
810 | loop_versioning::analyze_stride (address_info &address, | |
811 | address_term_info &term, | |
99b1c316 | 812 | tree stride, class loop *op_loop) |
13e08dc9 RS |
813 | { |
814 | term.stride = stride; | |
815 | ||
816 | term.inner_likelihood = get_inner_likelihood (stride, term.multiplier); | |
817 | if (dump_enabled_p ()) | |
e047844b | 818 | dump_inner_likelihood (address, term); |
13e08dc9 RS |
819 | |
820 | /* To be a versioning opportunity we require: | |
821 | ||
822 | - The multiplier applied by TERM is equal to the access size, | |
823 | so that when STRIDE is 1, the accesses in successive loop | |
824 | iterations are consecutive. | |
825 | ||
826 | This is deliberately conservative. We could relax it to handle | |
827 | other cases (such as those with gaps between iterations) if we | |
828 | find any real testcases for which it's useful. | |
829 | ||
830 | - the stride is applied in the same loop as STMT rather than | |
831 | in an outer loop. Although versioning for strides applied in | |
832 | outer loops could help in some cases -- such as enabling | |
833 | more loop interchange -- the savings are much lower than for | |
834 | inner loops. | |
835 | ||
836 | - the stride is an SSA name that is invariant in STMT's loop, | |
837 | since otherwise versioning isn't possible. */ | |
838 | unsigned HOST_WIDE_INT access_size = address.max_offset - address.min_offset; | |
839 | if (term.multiplier == access_size | |
840 | && address.loop == op_loop | |
841 | && TREE_CODE (stride) == SSA_NAME | |
842 | && expr_invariant_in_loop_p (address.loop, stride)) | |
843 | { | |
844 | term.versioning_opportunity_p = true; | |
845 | if (dump_enabled_p ()) | |
846 | dump_printf_loc (MSG_NOTE, address.stmt, "%T == 1 is a versioning" | |
847 | " opportunity\n", stride); | |
848 | } | |
849 | } | |
850 | ||
851 | /* See whether address term TERM (which belongs to ADDRESS) is the result | |
852 | of multiplying a varying SSA name by a loop-invariant SSA name. | |
853 | Return true and update TERM if so. | |
854 | ||
855 | This handles both cases that SCEV might handle, such as: | |
856 | ||
857 | for (int i = 0; i < n; ++i) | |
858 | res += a[i * stride]; | |
859 | ||
860 | and ones in which the term varies arbitrarily between iterations, such as: | |
861 | ||
862 | for (int i = 0; i < n; ++i) | |
863 | res += a[index[i] * stride]; */ | |
864 | ||
865 | bool | |
866 | loop_versioning::find_per_loop_multiplication (address_info &address, | |
867 | address_term_info &term) | |
868 | { | |
e047844b | 869 | gassign *mult = maybe_get_assign (term.expr); |
13e08dc9 RS |
870 | if (!mult || gimple_assign_rhs_code (mult) != MULT_EXPR) |
871 | return false; | |
872 | ||
99b1c316 | 873 | class loop *mult_loop = loop_containing_stmt (mult); |
13e08dc9 RS |
874 | if (!loop_outer (mult_loop)) |
875 | return false; | |
876 | ||
877 | tree op1 = strip_casts (gimple_assign_rhs1 (mult)); | |
878 | tree op2 = strip_casts (gimple_assign_rhs2 (mult)); | |
879 | if (TREE_CODE (op1) != SSA_NAME || TREE_CODE (op2) != SSA_NAME) | |
880 | return false; | |
881 | ||
882 | bool invariant1_p = expr_invariant_in_loop_p (mult_loop, op1); | |
883 | bool invariant2_p = expr_invariant_in_loop_p (mult_loop, op2); | |
884 | if (invariant1_p == invariant2_p) | |
885 | return false; | |
886 | ||
887 | /* Make sure that the loop invariant is OP2 rather than OP1. */ | |
888 | if (invariant1_p) | |
889 | std::swap (op1, op2); | |
890 | ||
891 | if (dump_enabled_p ()) | |
892 | dump_printf_loc (MSG_NOTE, address.stmt, "address term %T = varying %T" | |
893 | " * loop-invariant %T\n", term.expr, op1, op2); | |
894 | analyze_stride (address, term, op2, mult_loop); | |
895 | return true; | |
896 | } | |
897 | ||
898 | /* Try to use scalar evolutions to find an address stride for TERM, | |
e047844b | 899 | which belongs to ADDRESS. Return true and update TERM if so. |
13e08dc9 RS |
900 | |
901 | Here we are interested in any evolution information we can find, | |
902 | not just evolutions wrt ADDRESS->LOOP. For example, if we find that | |
903 | an outer loop obviously iterates over the inner dimension of an array, | |
904 | that information can help us eliminate worthless versioning opportunities | |
905 | in inner loops. */ | |
906 | ||
e047844b | 907 | bool |
13e08dc9 RS |
908 | loop_versioning::analyze_term_using_scevs (address_info &address, |
909 | address_term_info &term) | |
910 | { | |
911 | gimple *setter = maybe_get_stmt (term.expr); | |
912 | if (!setter) | |
e047844b | 913 | return false; |
13e08dc9 | 914 | |
99b1c316 | 915 | class loop *wrt_loop = loop_containing_stmt (setter); |
13e08dc9 | 916 | if (!loop_outer (wrt_loop)) |
e047844b | 917 | return false; |
13e08dc9 RS |
918 | |
919 | tree chrec = strip_casts (analyze_scalar_evolution (wrt_loop, term.expr)); | |
920 | if (TREE_CODE (chrec) == POLYNOMIAL_CHREC) | |
921 | { | |
922 | if (dump_enabled_p ()) | |
923 | dump_printf_loc (MSG_NOTE, address.stmt, | |
924 | "address term %T = %T\n", term.expr, chrec); | |
925 | ||
926 | /* Peel casts and accumulate constant multiplications, up to the | |
927 | limit allowed by M_MAXIMUM_SCALE. */ | |
928 | tree stride = strip_casts (CHREC_RIGHT (chrec)); | |
929 | while (TREE_CODE (stride) == MULT_EXPR | |
930 | && multiply_term_by (term, TREE_OPERAND (stride, 1))) | |
931 | stride = strip_casts (TREE_OPERAND (stride, 0)); | |
932 | ||
933 | gassign *assign; | |
934 | while ((assign = maybe_get_assign (stride)) | |
935 | && gimple_assign_rhs_code (assign) == MULT_EXPR | |
936 | && multiply_term_by (term, gimple_assign_rhs2 (assign))) | |
937 | { | |
938 | if (dump_enabled_p ()) | |
939 | dump_printf_loc (MSG_NOTE, address.stmt, | |
953e08fd | 940 | "looking through %G", (gimple *) assign); |
13e08dc9 RS |
941 | stride = strip_casts (gimple_assign_rhs1 (assign)); |
942 | } | |
943 | ||
944 | analyze_stride (address, term, stride, get_chrec_loop (chrec)); | |
e047844b RS |
945 | return true; |
946 | } | |
947 | ||
948 | return false; | |
949 | } | |
950 | ||
951 | /* Address term TERM is an arbitrary term that provides no versioning | |
952 | opportunities. Analyze it to see whether it contains any likely | |
953 | inner strides, so that we don't mistakenly version for other | |
954 | (less likely) candidates. | |
955 | ||
956 | This copes with invariant innermost indices such as: | |
957 | ||
958 | x(i, :) = 100 | |
959 | ||
960 | where the "i" component of the address is invariant in the loop | |
961 | but provides the real inner stride. | |
962 | ||
963 | ADDRESS is the address that contains TERM. */ | |
964 | ||
965 | void | |
966 | loop_versioning::analyze_arbitrary_term (address_info &address, | |
967 | address_term_info &term) | |
968 | ||
969 | { | |
970 | /* A multiplication offers two potential strides. Pick the one that | |
971 | is most likely to be an innermost stride. */ | |
972 | tree expr = term.expr, alt = NULL_TREE; | |
973 | gassign *mult = maybe_get_assign (expr); | |
974 | if (mult && gimple_assign_rhs_code (mult) == MULT_EXPR) | |
975 | { | |
976 | expr = strip_casts (gimple_assign_rhs1 (mult)); | |
977 | alt = strip_casts (gimple_assign_rhs2 (mult)); | |
978 | } | |
979 | term.stride = expr; | |
980 | term.inner_likelihood = get_inner_likelihood (expr, term.multiplier); | |
981 | if (alt) | |
982 | { | |
983 | inner_likelihood alt_l = get_inner_likelihood (alt, term.multiplier); | |
984 | if (alt_l > term.inner_likelihood) | |
985 | { | |
986 | term.stride = alt; | |
987 | term.inner_likelihood = alt_l; | |
988 | } | |
13e08dc9 | 989 | } |
e047844b RS |
990 | if (dump_enabled_p ()) |
991 | dump_inner_likelihood (address, term); | |
13e08dc9 RS |
992 | } |
993 | ||
994 | /* Try to identify loop strides in ADDRESS and try to choose realistic | |
995 | versioning opportunities based on these strides. | |
996 | ||
997 | The main difficulty here isn't finding strides that could be used | |
998 | in a version check (that's pretty easy). The problem instead is to | |
999 | avoid versioning for some stride S that is unlikely ever to be 1 at | |
1000 | runtime. Versioning for S == 1 on its own would lead to unnecessary | |
1001 | code bloat, while adding S == 1 to more realistic version conditions | |
1002 | would lose the optimisation opportunity offered by those other conditions. | |
1003 | ||
1004 | For example, versioning for a stride of 1 in the Fortran code: | |
1005 | ||
1006 | integer :: a(:,:) | |
1007 | a(1,:) = 1 | |
1008 | ||
1009 | is not usually a good idea, since the assignment is iterating over | |
1010 | an outer dimension and is relatively unlikely to have a stride of 1. | |
1011 | (It isn't impossible, since the inner dimension might be 1, or the | |
1012 | array might be transposed.) Similarly, in: | |
1013 | ||
1014 | integer :: a(:,:), b(:,:) | |
1015 | b(:,1) = a(1,:) | |
1016 | ||
1017 | b(:,1) is relatively likely to have a stride of 1 while a(1,:) isn't. | |
1018 | Versioning for when both strides are 1 would lose most of the benefit | |
1019 | of versioning for b's access. | |
1020 | ||
1021 | The approach we take is as follows: | |
1022 | ||
1023 | - Analyze each term to see whether it has an identifiable stride, | |
1024 | regardless of which loop applies the stride. | |
1025 | ||
1026 | - Evaluate the likelihood that each such stride is for the innermost | |
1027 | dimension of an array, on the scale "likely", "don't know" or "unlikely". | |
1028 | ||
1029 | - If there is a single "likely" innermost stride, and that stride is | |
1030 | applied in the loop that contains STMT, version the loop for when the | |
1031 | stride is 1. This deals with the cases in which we're fairly | |
1032 | confident of doing the right thing, such as the b(:,1) reference above. | |
1033 | ||
1034 | - If there are no "likely" innermost strides, and the loop that contains | |
1035 | STMT uses a stride that we rated as "don't know", version for when | |
1036 | that stride is 1. This is principally used for C code such as: | |
1037 | ||
1038 | for (int i = 0; i < n; ++i) | |
1039 | a[i * x] = ...; | |
1040 | ||
1041 | and: | |
1042 | ||
1043 | for (int j = 0; j < n; ++j) | |
1044 | for (int i = 0; i < n; ++i) | |
1045 | a[i * x + j * y] = ...; | |
1046 | ||
1047 | where nothing in the way "x" and "y" are set gives a hint as to | |
1048 | whether "i" iterates over the innermost dimension of the array. | |
d5029d45 | 1049 | In these situations it seems reasonable to assume the |
13e08dc9 RS |
1050 | programmer has nested the loops appropriately (although of course |
1051 | there are examples like GEMM in which this assumption doesn't hold | |
1052 | for all accesses in the loop). | |
1053 | ||
1054 | This case is also useful for the Fortran equivalent of the | |
1055 | above C code. */ | |
1056 | ||
1057 | void | |
1058 | loop_versioning::analyze_address_fragment (address_info &address) | |
1059 | { | |
1060 | if (dump_enabled_p ()) | |
1061 | { | |
1062 | dump_printf_loc (MSG_NOTE, address.stmt, "analyzing address fragment "); | |
1063 | dump_address_info (MSG_NOTE, address); | |
1064 | dump_printf (MSG_NOTE, "\n"); | |
1065 | } | |
1066 | ||
1067 | /* Analyze each component of the sum to see whether it involves an | |
1068 | apparent stride. | |
1069 | ||
1070 | There is an overlap between the addresses that | |
1071 | find_per_loop_multiplication and analyze_term_using_scevs can handle, | |
1072 | but the former is much cheaper than SCEV analysis, so try it first. */ | |
1073 | for (unsigned int i = 0; i < address.terms.length (); ++i) | |
e047844b RS |
1074 | if (!find_per_loop_multiplication (address, address.terms[i]) |
1075 | && !analyze_term_using_scevs (address, address.terms[i]) | |
1076 | && !POINTER_TYPE_P (TREE_TYPE (address.terms[i].expr))) | |
1077 | analyze_arbitrary_term (address, address.terms[i]); | |
13e08dc9 RS |
1078 | |
1079 | /* Check for strides that are likely to be for the innermost dimension. | |
1080 | ||
1081 | 1. If there is a single likely inner stride, if it is an SSA name, | |
1082 | and if it is worth versioning the loop for when the SSA name | |
1083 | equals 1, record that we want to do so. | |
1084 | ||
1085 | 2. Otherwise, if there any likely inner strides, bail out. This means | |
1086 | one of: | |
1087 | ||
1088 | (a) There are multiple likely inner strides. This suggests we're | |
1089 | confused and be can't be confident of doing the right thing. | |
1090 | ||
1091 | (b) There is a single likely inner stride and it is a constant | |
1092 | rather than an SSA name. This can mean either that the access | |
1093 | is a natural one without any variable strides, such as: | |
1094 | ||
1095 | for (int i = 0; i < n; ++i) | |
1096 | a[i] += 1; | |
1097 | ||
1098 | or that a variable stride is applied to an outer dimension, | |
1099 | such as: | |
1100 | ||
1101 | for (int i = 0; i < n; ++i) | |
1102 | for (int j = 0; j < n; ++j) | |
1103 | a[j * stride][i] += 1; | |
1104 | ||
1105 | (c) There is a single likely inner stride, and it is an SSA name, | |
1106 | but it isn't a worthwhile versioning opportunity. This usually | |
1107 | means that the variable stride is applied by an outer loop, | |
1108 | such as: | |
1109 | ||
1110 | for (int i = 0; i < n; ++i) | |
1111 | for (int j = 0; j < n; ++j) | |
1112 | a[j][i * stride] += 1; | |
1113 | ||
1114 | or (using an example with a more natural loop nesting): | |
1115 | ||
1116 | for (int i = 0; i < n; ++i) | |
1117 | for (int j = 0; j < n; ++j) | |
1118 | a[i][j] += b[i * stride]; | |
1119 | ||
1120 | in cases where b[i * stride] cannot (yet) be hoisted for | |
1121 | aliasing reasons. | |
1122 | ||
1123 | 3. If there are no likely inner strides, fall through to the next | |
1124 | set of checks. | |
1125 | ||
1126 | Pointer equality is enough to check for uniqueness in (1), since we | |
1127 | only care about SSA names. */ | |
1128 | tree chosen_stride = NULL_TREE; | |
1129 | tree version_stride = NULL_TREE; | |
1130 | for (unsigned int i = 0; i < address.terms.length (); ++i) | |
1131 | if (chosen_stride != address.terms[i].stride | |
1132 | && address.terms[i].inner_likelihood == INNER_LIKELY) | |
1133 | { | |
1134 | if (chosen_stride) | |
1135 | return; | |
1136 | chosen_stride = address.terms[i].stride; | |
1137 | if (address.terms[i].versioning_opportunity_p) | |
1138 | version_stride = chosen_stride; | |
1139 | } | |
1140 | ||
1141 | /* If there are no likely inner strides, see if there is a single | |
1142 | versioning opportunity for a stride that was rated as INNER_DONT_KNOW. | |
1143 | See the comment above the function for the cases that this code | |
1144 | handles. */ | |
1145 | if (!chosen_stride) | |
1146 | for (unsigned int i = 0; i < address.terms.length (); ++i) | |
1147 | if (version_stride != address.terms[i].stride | |
1148 | && address.terms[i].inner_likelihood == INNER_DONT_KNOW | |
1149 | && address.terms[i].versioning_opportunity_p) | |
1150 | { | |
1151 | if (version_stride) | |
1152 | return; | |
1153 | version_stride = address.terms[i].stride; | |
1154 | } | |
1155 | ||
1156 | if (version_stride) | |
1157 | version_for_unity (address.stmt, version_stride); | |
1158 | } | |
1159 | ||
1160 | /* Treat EXPR * MULTIPLIER + OFFSET as a fragment of an address that addresses | |
1161 | TYPE_SIZE bytes and record this address fragment for later processing. | |
1162 | STMT is the statement that contains the address. */ | |
1163 | ||
1164 | void | |
1165 | loop_versioning::record_address_fragment (gimple *stmt, | |
1166 | unsigned HOST_WIDE_INT type_size, | |
1167 | tree expr, | |
1168 | unsigned HOST_WIDE_INT multiplier, | |
1169 | HOST_WIDE_INT offset) | |
1170 | { | |
1171 | /* We're only interested in computed values. */ | |
1172 | if (TREE_CODE (expr) != SSA_NAME) | |
1173 | return; | |
1174 | ||
1175 | /* Quick exit if no part of the address is calculated in STMT's loop, | |
1176 | since such addresses have no versioning opportunities. */ | |
99b1c316 | 1177 | class loop *loop = loop_containing_stmt (stmt); |
13e08dc9 RS |
1178 | if (expr_invariant_in_loop_p (loop, expr)) |
1179 | return; | |
1180 | ||
1181 | /* Set up an address_info for EXPR * MULTIPLIER. */ | |
1182 | address_info *address = XOBNEW (&m_obstack, address_info); | |
1183 | new (address) address_info; | |
1184 | address->stmt = stmt; | |
1185 | address->loop = loop; | |
1186 | address->base = NULL_TREE; | |
1187 | address->terms.quick_grow (1); | |
1188 | address->terms[0].expr = expr; | |
1189 | address->terms[0].multiplier = multiplier; | |
1190 | address->terms[0].stride = NULL_TREE; | |
1191 | address->terms[0].inner_likelihood = INNER_UNLIKELY; | |
1192 | address->terms[0].versioning_opportunity_p = false; | |
1193 | address->min_offset = offset; | |
1194 | ||
1195 | /* Peel apart the expression into a sum of address_terms, where each | |
1196 | term is multiplied by a constant. Treat a + b and a - b the same, | |
1197 | since it doesn't matter for our purposes whether an address is | |
1198 | increasing or decreasing. Distribute (a + b) * constant into | |
1199 | a * constant + b * constant. | |
1200 | ||
1201 | We don't care which loop each term belongs to, since we want to | |
1202 | examine as many candidate strides as possible when determining | |
1203 | which is likely to be for the innermost dimension. We therefore | |
1204 | don't limit the search to statements in STMT's loop. */ | |
1205 | for (unsigned int i = 0; i < address->terms.length (); ) | |
1206 | { | |
1207 | if (gassign *assign = maybe_get_assign (address->terms[i].expr)) | |
1208 | { | |
1209 | tree_code code = gimple_assign_rhs_code (assign); | |
1210 | if (code == PLUS_EXPR | |
1211 | || code == POINTER_PLUS_EXPR | |
1212 | || code == MINUS_EXPR) | |
1213 | { | |
1214 | tree op1 = gimple_assign_rhs1 (assign); | |
1215 | tree op2 = gimple_assign_rhs2 (assign); | |
1216 | if (TREE_CODE (op2) == INTEGER_CST) | |
1217 | { | |
1218 | address->terms[i].expr = strip_casts (op1); | |
1219 | /* This is heuristic only, so don't worry about truncation | |
1220 | or overflow. */ | |
1221 | address->min_offset += (TREE_INT_CST_LOW (op2) | |
1222 | * address->terms[i].multiplier); | |
1223 | continue; | |
1224 | } | |
1225 | else if (address->terms.length () < address_info::MAX_TERMS) | |
1226 | { | |
1227 | unsigned int j = address->terms.length (); | |
1228 | address->terms.quick_push (address->terms[i]); | |
1229 | address->terms[i].expr = strip_casts (op1); | |
1230 | address->terms[j].expr = strip_casts (op2); | |
1231 | continue; | |
1232 | } | |
1233 | } | |
1234 | if (code == MULT_EXPR) | |
1235 | { | |
1236 | tree op1 = gimple_assign_rhs1 (assign); | |
1237 | tree op2 = gimple_assign_rhs2 (assign); | |
1238 | if (multiply_term_by (address->terms[i], op2)) | |
1239 | { | |
1240 | address->terms[i].expr = strip_casts (op1); | |
1241 | continue; | |
1242 | } | |
1243 | } | |
e944354e RD |
1244 | if (CONVERT_EXPR_CODE_P (code)) |
1245 | { | |
1246 | tree op1 = gimple_assign_rhs1 (assign); | |
1247 | address->terms[i].expr = strip_casts (op1); | |
1248 | continue; | |
1249 | } | |
13e08dc9 RS |
1250 | } |
1251 | i += 1; | |
1252 | } | |
1253 | ||
1254 | /* Peel off any symbolic pointer. */ | |
1255 | if (TREE_CODE (address->terms[0].expr) != SSA_NAME | |
1256 | && address->terms[0].multiplier == 1) | |
1257 | { | |
1258 | if (address->terms.length () == 1) | |
1259 | { | |
1260 | obstack_free (&m_obstack, address); | |
1261 | return; | |
1262 | } | |
1263 | address->base = address->terms[0].expr; | |
1264 | address->terms.ordered_remove (0); | |
1265 | } | |
1266 | ||
1267 | /* Require all remaining terms to be SSA names. (This could be false | |
1268 | for unfolded statements, but they aren't worth dealing with.) */ | |
1269 | for (unsigned int i = 0; i < address->terms.length (); ++i) | |
1270 | if (TREE_CODE (address->terms[i].expr) != SSA_NAME) | |
1271 | { | |
1272 | obstack_free (&m_obstack, address); | |
1273 | return; | |
1274 | } | |
1275 | ||
1276 | /* The loop above set MIN_OFFSET based on the first byte of the | |
1277 | referenced data. Calculate the end + 1. */ | |
1278 | address->max_offset = address->min_offset + type_size; | |
1279 | ||
1280 | /* Put the terms into a canonical order for the hash table lookup below. */ | |
1281 | address->terms.qsort (compare_address_terms); | |
1282 | ||
1283 | if (dump_enabled_p ()) | |
1284 | { | |
1285 | dump_printf_loc (MSG_NOTE, stmt, "recording address fragment %T", expr); | |
1286 | if (multiplier != 1) | |
1287 | dump_printf (MSG_NOTE, " * %wd", multiplier); | |
1288 | dump_printf (MSG_NOTE, " = "); | |
1289 | dump_address_info (MSG_NOTE, *address); | |
1290 | dump_printf (MSG_NOTE, "\n"); | |
1291 | } | |
1292 | ||
1293 | /* Pool address information with the same terms (but potentially | |
1294 | different offsets). */ | |
1295 | address_info **slot = m_address_table.find_slot (address, INSERT); | |
1296 | if (address_info *old_address = *slot) | |
1297 | { | |
1298 | /* We've already seen an address with the same terms. Extend the | |
1299 | offset range to account for this access. Doing this can paper | |
1300 | over gaps, such as in: | |
1301 | ||
1302 | a[i * stride * 4] + a[i * stride * 4 + 3]; | |
1303 | ||
1304 | where nothing references "+ 1" or "+ 2". However, the vectorizer | |
1305 | handles such gapped accesses without problems, so it's not worth | |
1306 | trying to exclude them. */ | |
1307 | if (old_address->min_offset > address->min_offset) | |
1308 | old_address->min_offset = address->min_offset; | |
1309 | if (old_address->max_offset < address->max_offset) | |
1310 | old_address->max_offset = address->max_offset; | |
1311 | obstack_free (&m_obstack, address); | |
1312 | } | |
1313 | else | |
1314 | { | |
1315 | /* This is the first time we've seen an address with these terms. */ | |
1316 | *slot = address; | |
1317 | m_address_list.safe_push (address); | |
1318 | } | |
1319 | } | |
1320 | ||
1321 | /* Analyze expression EXPR, which occurs in STMT. */ | |
1322 | ||
1323 | void | |
1324 | loop_versioning::analyze_expr (gimple *stmt, tree expr) | |
1325 | { | |
1326 | unsigned HOST_WIDE_INT type_size; | |
1327 | ||
1328 | while (handled_component_p (expr)) | |
1329 | { | |
1330 | /* See whether we can use versioning to avoid a multiplication | |
1331 | in an array index. */ | |
1332 | if (TREE_CODE (expr) == ARRAY_REF | |
1333 | && acceptable_type_p (TREE_TYPE (expr), &type_size)) | |
1334 | record_address_fragment (stmt, type_size, | |
1335 | TREE_OPERAND (expr, 1), type_size, 0); | |
1336 | expr = TREE_OPERAND (expr, 0); | |
1337 | } | |
1338 | ||
1339 | /* See whether we can use versioning to avoid a multiplication | |
1340 | in the pointer calculation of a MEM_REF. */ | |
1341 | if (TREE_CODE (expr) == MEM_REF | |
1342 | && acceptable_type_p (TREE_TYPE (expr), &type_size)) | |
1343 | record_address_fragment (stmt, type_size, TREE_OPERAND (expr, 0), 1, | |
1344 | /* This is heuristic only, so don't worry | |
1345 | about truncation or overflow. */ | |
1346 | TREE_INT_CST_LOW (TREE_OPERAND (expr, 1))); | |
1347 | ||
1348 | /* These would be easy to handle if they existed at this stage. */ | |
1349 | gcc_checking_assert (TREE_CODE (expr) != TARGET_MEM_REF); | |
1350 | } | |
1351 | ||
1352 | /* Analyze all the statements in BB looking for useful version checks. | |
1353 | Return true on success, false if something prevents the block from | |
1354 | being versioned. */ | |
1355 | ||
1356 | bool | |
1357 | loop_versioning::analyze_block (basic_block bb) | |
1358 | { | |
99b1c316 | 1359 | class loop *loop = bb->loop_father; |
13e08dc9 RS |
1360 | loop_info &li = get_loop_info (loop); |
1361 | for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi); | |
1362 | gsi_next (&gsi)) | |
1363 | { | |
1364 | gimple *stmt = gsi_stmt (gsi); | |
1365 | if (is_gimple_debug (stmt)) | |
1366 | continue; | |
1367 | ||
1368 | if (expensive_stmt_p (stmt)) | |
1369 | { | |
1370 | if (dump_enabled_p ()) | |
1371 | dump_printf_loc (MSG_NOTE, stmt, "expensive statement" | |
1372 | " prevents versioning: %G", stmt); | |
1373 | return false; | |
1374 | } | |
1375 | ||
1376 | /* Only look for direct versioning opportunities in inner loops | |
1377 | since the benefit tends to be much smaller for outer loops. */ | |
1378 | if (!loop->inner) | |
1379 | { | |
1380 | unsigned int nops = gimple_num_ops (stmt); | |
1381 | for (unsigned int i = 0; i < nops; ++i) | |
1382 | if (tree op = gimple_op (stmt, i)) | |
1383 | analyze_expr (stmt, op); | |
1384 | } | |
1385 | ||
1386 | /* The point of the instruction limit is to prevent excessive | |
1387 | code growth, so this is a size-based estimate even though | |
1388 | the optimization is aimed at speed. */ | |
1389 | li.num_insns += estimate_num_insns (stmt, &eni_size_weights); | |
1390 | } | |
1391 | ||
1392 | return true; | |
1393 | } | |
1394 | ||
1395 | /* Analyze all the blocks in the function, looking for useful version checks. | |
1396 | Return true if we found one. */ | |
1397 | ||
1398 | bool | |
1399 | loop_versioning::analyze_blocks () | |
1400 | { | |
1401 | AUTO_DUMP_SCOPE ("analyze_blocks", | |
1402 | dump_user_location_t::from_function_decl (m_fn->decl)); | |
1403 | ||
1404 | /* For now we don't try to version the whole function, although | |
1405 | versioning at that level could be useful in some cases. */ | |
1406 | get_loop_info (get_loop (m_fn, 0)).rejected_p = true; | |
1407 | ||
e41ba804 | 1408 | for (auto loop : loops_list (cfun, LI_FROM_INNERMOST)) |
13e08dc9 RS |
1409 | { |
1410 | loop_info &linfo = get_loop_info (loop); | |
1411 | ||
1412 | /* Ignore cold loops. */ | |
1413 | if (!optimize_loop_for_speed_p (loop)) | |
1414 | linfo.rejected_p = true; | |
1415 | ||
1416 | /* See whether an inner loop prevents versioning of this loop. */ | |
1417 | if (!linfo.rejected_p) | |
99b1c316 | 1418 | for (class loop *inner = loop->inner; inner; inner = inner->next) |
13e08dc9 RS |
1419 | if (get_loop_info (inner).rejected_p) |
1420 | { | |
1421 | linfo.rejected_p = true; | |
1422 | break; | |
1423 | } | |
1424 | ||
1425 | /* If versioning the loop is still a possibility, examine the | |
1426 | statements in the loop to look for versioning opportunities. */ | |
1427 | if (!linfo.rejected_p) | |
1428 | { | |
1429 | void *start_point = obstack_alloc (&m_obstack, 0); | |
1430 | ||
1431 | for (basic_block bb = linfo.block_list; bb; | |
1432 | bb = m_next_block_in_loop[bb->index]) | |
1433 | if (!analyze_block (bb)) | |
1434 | { | |
1435 | linfo.rejected_p = true; | |
1436 | break; | |
ffb521f1 | 1437 | } |
13e08dc9 RS |
1438 | |
1439 | if (!linfo.rejected_p) | |
1440 | { | |
1441 | /* Process any queued address fragments, now that we have | |
1442 | complete grouping information. */ | |
1443 | address_info *address; | |
1444 | unsigned int i; | |
1445 | FOR_EACH_VEC_ELT (m_address_list, i, address) | |
1446 | analyze_address_fragment (*address); | |
1447 | } | |
1448 | ||
1449 | m_address_table.empty (); | |
1450 | m_address_list.truncate (0); | |
1451 | obstack_free (&m_obstack, start_point); | |
1452 | } | |
1453 | } | |
1454 | ||
1455 | return m_num_conditions != 0; | |
1456 | } | |
1457 | ||
1458 | /* Use the ranges in VRS to remove impossible versioning conditions from | |
1459 | LOOP. */ | |
1460 | ||
1461 | void | |
6165cf6b | 1462 | loop_versioning::prune_loop_conditions (class loop *loop) |
13e08dc9 RS |
1463 | { |
1464 | loop_info &li = get_loop_info (loop); | |
1465 | ||
1466 | int to_remove = -1; | |
1467 | bitmap_iterator bi; | |
1468 | unsigned int i; | |
6165cf6b | 1469 | int_range_max r; |
13e08dc9 RS |
1470 | EXECUTE_IF_SET_IN_BITMAP (&li.unity_names, 0, i, bi) |
1471 | { | |
1472 | tree name = ssa_name (i); | |
6165cf6b AH |
1473 | gimple *stmt = first_stmt (loop->header); |
1474 | ||
1475 | if (get_range_query (cfun)->range_of_expr (r, name, stmt) | |
cb779afe | 1476 | && !r.contains_p (wi::one (TYPE_PRECISION (TREE_TYPE (name))))) |
13e08dc9 RS |
1477 | { |
1478 | if (dump_enabled_p ()) | |
1479 | dump_printf_loc (MSG_NOTE, find_loop_location (loop), | |
1480 | "%T can never be 1 in this loop\n", name); | |
1481 | ||
1482 | if (to_remove >= 0) | |
1483 | bitmap_clear_bit (&li.unity_names, to_remove); | |
1484 | to_remove = i; | |
1485 | m_num_conditions -= 1; | |
1486 | } | |
1487 | } | |
1488 | if (to_remove >= 0) | |
1489 | bitmap_clear_bit (&li.unity_names, to_remove); | |
1490 | } | |
1491 | ||
1492 | /* Remove any scheduled loop version conditions that will never be true. | |
1493 | Return true if any remain. */ | |
1494 | ||
1495 | bool | |
1496 | loop_versioning::prune_conditions () | |
1497 | { | |
1498 | AUTO_DUMP_SCOPE ("prune_loop_conditions", | |
1499 | dump_user_location_t::from_function_decl (m_fn->decl)); | |
1500 | ||
1501 | calculate_dominance_info (CDI_DOMINATORS); | |
1502 | lv_dom_walker dom_walker (*this); | |
1503 | dom_walker.walk (ENTRY_BLOCK_PTR_FOR_FN (m_fn)); | |
1504 | return m_num_conditions != 0; | |
1505 | } | |
1506 | ||
1507 | /* Merge the version checks for INNER into immediately-enclosing loop | |
1508 | OUTER. */ | |
1509 | ||
1510 | void | |
99b1c316 | 1511 | loop_versioning::merge_loop_info (class loop *outer, class loop *inner) |
13e08dc9 RS |
1512 | { |
1513 | loop_info &inner_li = get_loop_info (inner); | |
1514 | loop_info &outer_li = get_loop_info (outer); | |
1515 | ||
1516 | if (dump_enabled_p ()) | |
1517 | { | |
1518 | bitmap_iterator bi; | |
1519 | unsigned int i; | |
1520 | EXECUTE_IF_SET_IN_BITMAP (&inner_li.unity_names, 0, i, bi) | |
1521 | if (!bitmap_bit_p (&outer_li.unity_names, i)) | |
1522 | dump_printf_loc (MSG_NOTE, find_loop_location (inner), | |
1523 | "hoisting check that %T == 1 to outer loop\n", | |
1524 | ssa_name (i)); | |
1525 | } | |
1526 | ||
1527 | bitmap_ior_into (&outer_li.unity_names, &inner_li.unity_names); | |
1528 | if (loop_depth (outer_li.outermost) < loop_depth (inner_li.outermost)) | |
1529 | outer_li.outermost = inner_li.outermost; | |
1530 | } | |
1531 | ||
1532 | /* Add LOOP to the queue of loops to version. */ | |
1533 | ||
1534 | void | |
99b1c316 | 1535 | loop_versioning::add_loop_to_queue (class loop *loop) |
13e08dc9 RS |
1536 | { |
1537 | loop_info &li = get_loop_info (loop); | |
1538 | ||
1539 | if (dump_enabled_p ()) | |
1540 | dump_printf_loc (MSG_NOTE, find_loop_location (loop), | |
1541 | "queuing this loop for versioning\n"); | |
1542 | m_loops_to_version.safe_push (loop); | |
1543 | ||
1544 | /* Don't try to version superloops. */ | |
1545 | li.rejected_p = true; | |
1546 | } | |
1547 | ||
1548 | /* Decide whether the cost model would allow us to version LOOP, | |
1549 | either directly or as part of a parent loop, and return true if so. | |
1550 | This does not imply that the loop is actually worth versioning in its | |
1551 | own right, just that it would be valid to version it if something | |
1552 | benefited. | |
1553 | ||
1554 | We have already made this decision for all inner loops of LOOP. */ | |
1555 | ||
1556 | bool | |
99b1c316 | 1557 | loop_versioning::decide_whether_loop_is_versionable (class loop *loop) |
13e08dc9 RS |
1558 | { |
1559 | loop_info &li = get_loop_info (loop); | |
1560 | ||
1561 | if (li.rejected_p) | |
1562 | return false; | |
1563 | ||
1564 | /* Examine the decisions made for inner loops. */ | |
99b1c316 | 1565 | for (class loop *inner = loop->inner; inner; inner = inner->next) |
13e08dc9 RS |
1566 | { |
1567 | loop_info &inner_li = get_loop_info (inner); | |
1568 | if (inner_li.rejected_p) | |
1569 | { | |
1570 | if (dump_enabled_p ()) | |
1571 | dump_printf_loc (MSG_NOTE, find_loop_location (loop), | |
1572 | "not versioning this loop because one of its" | |
1573 | " inner loops should not be versioned\n"); | |
1574 | return false; | |
1575 | } | |
1576 | ||
1577 | if (inner_li.worth_versioning_p ()) | |
1578 | li.subloops_benefit_p = true; | |
1579 | ||
1580 | /* Accumulate the number of instructions from subloops that are not | |
1581 | the innermost, or that don't benefit from versioning. Only the | |
1582 | instructions from innermost loops that benefit from versioning | |
1583 | should be weighed against loop-versioning-max-inner-insns; | |
1584 | everything else should be weighed against | |
1585 | loop-versioning-max-outer-insns. */ | |
1586 | if (!inner_li.worth_versioning_p () || inner->inner) | |
1587 | { | |
1588 | if (dump_enabled_p ()) | |
1589 | dump_printf_loc (MSG_NOTE, find_loop_location (loop), | |
1590 | "counting %d instructions from this loop" | |
1591 | " against its parent loop\n", inner_li.num_insns); | |
1592 | li.num_insns += inner_li.num_insns; | |
1593 | } | |
1594 | } | |
1595 | ||
1596 | /* Enforce the size limits. */ | |
1597 | if (li.worth_versioning_p ()) | |
1598 | { | |
1599 | unsigned int max_num_insns = max_insns_for_loop (loop); | |
1600 | if (dump_enabled_p ()) | |
1601 | dump_printf_loc (MSG_NOTE, find_loop_location (loop), | |
1602 | "this loop has %d instructions, against" | |
1603 | " a versioning limit of %d\n", | |
1604 | li.num_insns, max_num_insns); | |
1605 | if (li.num_insns > max_num_insns) | |
1606 | { | |
1607 | if (dump_enabled_p ()) | |
1608 | dump_printf_loc (MSG_MISSED_OPTIMIZATION | |
1609 | | MSG_PRIORITY_USER_FACING, | |
1610 | find_loop_location (loop), | |
1611 | "this loop is too big to version"); | |
1612 | return false; | |
1613 | } | |
1614 | } | |
1615 | ||
1616 | /* Hoist all version checks from subloops to this loop. */ | |
99b1c316 | 1617 | for (class loop *subloop = loop->inner; subloop; subloop = subloop->next) |
13e08dc9 RS |
1618 | merge_loop_info (loop, subloop); |
1619 | ||
1620 | return true; | |
1621 | } | |
1622 | ||
1623 | /* Decide which loops to version and add them to the versioning queue. | |
1624 | Return true if there are any loops to version. */ | |
1625 | ||
1626 | bool | |
1627 | loop_versioning::make_versioning_decisions () | |
1628 | { | |
1629 | AUTO_DUMP_SCOPE ("make_versioning_decisions", | |
1630 | dump_user_location_t::from_function_decl (m_fn->decl)); | |
1631 | ||
e41ba804 | 1632 | for (auto loop : loops_list (cfun, LI_FROM_INNERMOST)) |
13e08dc9 RS |
1633 | { |
1634 | loop_info &linfo = get_loop_info (loop); | |
1635 | if (decide_whether_loop_is_versionable (loop)) | |
1636 | { | |
1637 | /* Commit to versioning LOOP directly if we can't hoist the | |
1638 | version checks any further. */ | |
1639 | if (linfo.worth_versioning_p () | |
1640 | && (loop_depth (loop) == 1 || linfo.outermost == loop)) | |
1641 | add_loop_to_queue (loop); | |
1642 | } | |
1643 | else | |
1644 | { | |
1645 | /* We can't version this loop, so individually version any | |
1646 | subloops that would benefit and haven't been versioned yet. */ | |
1647 | linfo.rejected_p = true; | |
99b1c316 | 1648 | for (class loop *subloop = loop->inner; subloop; |
13e08dc9 RS |
1649 | subloop = subloop->next) |
1650 | if (get_loop_info (subloop).worth_versioning_p ()) | |
1651 | add_loop_to_queue (subloop); | |
1652 | } | |
1653 | } | |
1654 | ||
1655 | return !m_loops_to_version.is_empty (); | |
1656 | } | |
1657 | ||
1658 | /* Attempt to implement loop versioning for LOOP, using the information | |
1659 | cached in the associated loop_info. Return true on success. */ | |
1660 | ||
1661 | bool | |
99b1c316 | 1662 | loop_versioning::version_loop (class loop *loop) |
13e08dc9 RS |
1663 | { |
1664 | loop_info &li = get_loop_info (loop); | |
1665 | ||
1666 | /* Build up a condition that selects the original loop instead of | |
1667 | the simplified loop. */ | |
1668 | tree cond = boolean_false_node; | |
1669 | bitmap_iterator bi; | |
1670 | unsigned int i; | |
1671 | EXECUTE_IF_SET_IN_BITMAP (&li.unity_names, 0, i, bi) | |
1672 | { | |
1673 | tree name = ssa_name (i); | |
1674 | tree ne_one = fold_build2 (NE_EXPR, boolean_type_node, name, | |
1675 | build_one_cst (TREE_TYPE (name))); | |
1676 | cond = fold_build2 (TRUTH_OR_EXPR, boolean_type_node, cond, ne_one); | |
1677 | } | |
1678 | ||
1679 | /* Convert the condition into a suitable gcond. */ | |
1680 | gimple_seq stmts = NULL; | |
c090743b RB |
1681 | cond = force_gimple_operand_1 (cond, &stmts, is_gimple_condexpr_for_cond, |
1682 | NULL_TREE); | |
13e08dc9 RS |
1683 | |
1684 | /* Version the loop. */ | |
1685 | initialize_original_copy_tables (); | |
1686 | basic_block cond_bb; | |
1687 | li.optimized_loop = loop_version (loop, cond, &cond_bb, | |
1688 | profile_probability::unlikely (), | |
1689 | profile_probability::likely (), | |
1690 | profile_probability::unlikely (), | |
1691 | profile_probability::likely (), true); | |
1692 | free_original_copy_tables (); | |
1693 | if (!li.optimized_loop) | |
1694 | { | |
1695 | if (dump_enabled_p ()) | |
1696 | dump_printf_loc (MSG_MISSED_OPTIMIZATION, find_loop_location (loop), | |
1697 | "tried but failed to version this loop for when" | |
1698 | " certain strides are 1\n"); | |
1699 | return false; | |
1700 | } | |
1701 | ||
1702 | if (dump_enabled_p ()) | |
1703 | dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, find_loop_location (loop), | |
1704 | "versioned this loop for when certain strides are 1\n"); | |
1705 | ||
1706 | /* Insert the statements that feed COND. */ | |
1707 | if (stmts) | |
1708 | { | |
1709 | gimple_stmt_iterator gsi = gsi_last_bb (cond_bb); | |
1710 | gsi_insert_seq_before (&gsi, stmts, GSI_SAME_STMT); | |
1711 | } | |
1712 | ||
1713 | return true; | |
1714 | } | |
1715 | ||
1716 | /* Attempt to version all loops in the versioning queue. */ | |
1717 | ||
1718 | void | |
1719 | loop_versioning::implement_versioning_decisions () | |
1720 | { | |
1721 | /* No AUTO_DUMP_SCOPE here since all messages are top-level and | |
1722 | user-facing at this point. */ | |
1723 | ||
1724 | bool any_succeeded_p = false; | |
99b1c316 | 1725 | class loop *loop; |
13e08dc9 RS |
1726 | unsigned int i; |
1727 | FOR_EACH_VEC_ELT (m_loops_to_version, i, loop) | |
1728 | if (version_loop (loop)) | |
1729 | any_succeeded_p = true; | |
1730 | if (!any_succeeded_p) | |
1731 | return; | |
1732 | ||
1733 | update_ssa (TODO_update_ssa); | |
1734 | ||
1735 | /* Simplify the new loop, which is used when COND is false. */ | |
1736 | FOR_EACH_VEC_ELT (m_loops_to_version, i, loop) | |
1737 | { | |
1738 | loop_info &linfo = get_loop_info (loop); | |
1739 | if (linfo.optimized_loop) | |
1740 | name_prop (linfo).substitute_and_fold (linfo.optimized_loop->header); | |
1741 | } | |
1742 | } | |
1743 | ||
1744 | /* Run the pass and return a set of TODO_* flags. */ | |
1745 | ||
1746 | unsigned int | |
1747 | loop_versioning::run () | |
1748 | { | |
1749 | gcc_assert (scev_initialized_p ()); | |
1750 | ||
1751 | if (analyze_blocks () | |
1752 | && prune_conditions () | |
1753 | && make_versioning_decisions ()) | |
1754 | implement_versioning_decisions (); | |
1755 | ||
1756 | return 0; | |
1757 | } | |
1758 | ||
1759 | /* Loop versioning pass. */ | |
1760 | ||
1761 | const pass_data pass_data_loop_versioning = | |
1762 | { | |
1763 | GIMPLE_PASS, /* type */ | |
1764 | "lversion", /* name */ | |
1765 | OPTGROUP_LOOP, /* optinfo_flags */ | |
1766 | TV_LOOP_VERSIONING, /* tv_id */ | |
1767 | PROP_cfg, /* properties_required */ | |
1768 | 0, /* properties_provided */ | |
1769 | 0, /* properties_destroyed */ | |
1770 | 0, /* todo_flags_start */ | |
1771 | 0, /* todo_flags_finish */ | |
1772 | }; | |
1773 | ||
1774 | class pass_loop_versioning : public gimple_opt_pass | |
1775 | { | |
1776 | public: | |
1777 | pass_loop_versioning (gcc::context *ctxt) | |
1778 | : gimple_opt_pass (pass_data_loop_versioning, ctxt) | |
1779 | {} | |
1780 | ||
1781 | /* opt_pass methods: */ | |
725793af DM |
1782 | bool gate (function *) final override |
1783 | { | |
1784 | return flag_version_loops_for_strides; | |
1785 | } | |
1786 | unsigned int execute (function *) final override; | |
13e08dc9 RS |
1787 | }; |
1788 | ||
1789 | unsigned int | |
1790 | pass_loop_versioning::execute (function *fn) | |
1791 | { | |
1792 | if (number_of_loops (fn) <= 1) | |
1793 | return 0; | |
1794 | ||
6165cf6b AH |
1795 | enable_ranger (fn); |
1796 | unsigned int ret = loop_versioning (fn).run (); | |
1797 | disable_ranger (fn); | |
1798 | return ret; | |
13e08dc9 RS |
1799 | } |
1800 | ||
1801 | } // anon namespace | |
1802 | ||
1803 | gimple_opt_pass * | |
1804 | make_pass_loop_versioning (gcc::context *ctxt) | |
1805 | { | |
1806 | return new pass_loop_versioning (ctxt); | |
1807 | } |