1 /* Global, SSA-based optimizations using mathematical identities.
2 Copyright (C) 2005-2017 Free Software Foundation, Inc.
3
4 This file is part of GCC.
5
6 GCC is free software; you can redistribute it and/or modify it
7 under the terms of the GNU General Public License as published by the
8 Free Software Foundation; either version 3, or (at your option) any
9 later version.
10
11 GCC is distributed in the hope that it will be useful, but WITHOUT
12 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14 for more details.
15
16 You should have received a copy of the GNU General Public License
17 along with GCC; see the file COPYING3. If not see
18 <http://www.gnu.org/licenses/>. */
19
20 /* The first mini-pass in this file tries to CSE reciprocal
21    operations.  These are common in sequences such as this one:
22
23 modulus = sqrt(x*x + y*y + z*z);
24 x = x / modulus;
25 y = y / modulus;
26 z = z / modulus;
27
28 that can be optimized to
29
30 modulus = sqrt(x*x + y*y + z*z);
31 rmodulus = 1.0 / modulus;
32 x = x * rmodulus;
33 y = y * rmodulus;
34 z = z * rmodulus;
35
36 We do this for loop invariant divisors, and with this pass whenever
37 we notice that a division has the same divisor multiple times.
38
39 Of course, like in PRE, we don't insert a division if a dominator
40 already has one. However, this cannot be done as an extension of
41 PRE for several reasons.
42
43    First of all, experiments showed that the transformation is not
44    always useful if there are only two divisions by the same divisor.
45    This is probably because modern processors can pipeline the
46    divisions; on older, in-order processors it should still be
47    effective to optimize two divisions by the same number.
48 We make this a param, and it shall be called N in the remainder of
49 this comment.
50
51 Second, if trapping math is active, we have less freedom on where
52 to insert divisions: we can only do so in basic blocks that already
53 contain one. (If divisions don't trap, instead, we can insert
54 divisions elsewhere, which will be in blocks that are common dominators
55 of those that have the division).
56
57 We really don't want to compute the reciprocal unless a division will
58 be found. To do this, we won't insert the division in a basic block
59    that has fewer than N divisions *post-dominating* it.
60
61    The algorithm constructs a subset of the dominator tree, holding the
62    blocks containing the divisions and their common dominators,
63    and walks it twice.  The first walk is in post-order, and it annotates
64 each block with the number of divisions that post-dominate it: this
65 gives information on where divisions can be inserted profitably.
66 The second walk is in pre-order, and it inserts divisions as explained
67 above, and replaces divisions by multiplications.
68
69 In the best case, the cost of the pass is O(n_statements). In the
70 worst-case, the cost is due to creating the dominator tree subset,
71    with a cost of O(n_basic_blocks ^ 2); however, this can only happen
72 for n_statements / n_basic_blocks statements. So, the amortized cost
73 of creating the dominator tree subset is O(n_basic_blocks) and the
74 worst-case cost of the pass is O(n_statements * n_basic_blocks).
75
76 More practically, the cost will be small because there are few
77 divisions, and they tend to be in the same basic block, so insert_bb
78 is called very few times.
79
80 If we did this using domwalk.c, an efficient implementation would have
81 to work on all the variables in a single pass, because we could not
82 work on just a subset of the dominator tree, as we do now, and the
83 cost would also be something like O(n_statements * n_basic_blocks).
84 The data structures would be more complex in order to work on all the
85 variables in a single pass. */
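
/* As an illustrative sketch (block layout and names are made up here, and
   the target's threshold N is assumed to be 2): when the two divisions sit
   on mutually exclusive paths,

       if (p)
         x = a / b;
       else
         y = c / b;

   neither branch post-dominates their common dominator, so the dominator's
   merit stays below N and no reciprocal is inserted in it; computing
   1.0 / b there could be wasted work on paths that never divide.  Each
   branch by itself holds only one division, so nothing is transformed.  */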
86
87 #include "config.h"
88 #include "system.h"
89 #include "coretypes.h"
90 #include "backend.h"
91 #include "target.h"
92 #include "rtl.h"
93 #include "tree.h"
94 #include "gimple.h"
95 #include "predict.h"
96 #include "alloc-pool.h"
97 #include "tree-pass.h"
98 #include "ssa.h"
99 #include "optabs-tree.h"
100 #include "gimple-pretty-print.h"
101 #include "alias.h"
102 #include "fold-const.h"
103 #include "gimple-fold.h"
104 #include "gimple-iterator.h"
105 #include "gimplify.h"
106 #include "gimplify-me.h"
107 #include "stor-layout.h"
108 #include "tree-cfg.h"
109 #include "tree-dfa.h"
110 #include "tree-ssa.h"
111 #include "builtins.h"
112 #include "params.h"
113 #include "internal-fn.h"
114 #include "case-cfn-macros.h"
115 #include "optabs-libfuncs.h"
116 #include "tree-eh.h"
117 #include "targhooks.h"
118
119 /* This structure represents one basic block that either computes a
120    division, or is a common dominator for basic blocks that compute a
121    division. */
122 struct occurrence {
123 /* The basic block represented by this structure. */
124 basic_block bb;
125
126 /* If non-NULL, the SSA_NAME holding the definition for a reciprocal
127 inserted in BB. */
128 tree recip_def;
129
130 /* If non-NULL, the GIMPLE_ASSIGN for a reciprocal computation that
131 was inserted in BB. */
132 gimple *recip_def_stmt;
133
134 /* Pointer to a list of "struct occurrence"s for blocks dominated
135 by BB. */
136 struct occurrence *children;
137
138   /* Pointer to the next "struct occurrence" in the list of blocks
139 sharing a common dominator. */
140 struct occurrence *next;
141
142 /* The number of divisions that are in BB before compute_merit. The
143 number of divisions that are in BB or post-dominate it after
144 compute_merit. */
145 int num_divisions;
146
147 /* True if the basic block has a division, false if it is a common
148 dominator for basic blocks that do. If it is false and trapping
149 math is active, BB is not a candidate for inserting a reciprocal. */
150 bool bb_has_division;
151 };
152
153 static struct
154 {
155 /* Number of 1.0/X ops inserted. */
156 int rdivs_inserted;
157
158 /* Number of 1.0/FUNC ops inserted. */
159 int rfuncs_inserted;
160 } reciprocal_stats;
161
162 static struct
163 {
164 /* Number of cexpi calls inserted. */
165 int inserted;
166 } sincos_stats;
167
168 static struct
169 {
170 /* Number of hand-written 16-bit nop / bswaps found. */
171 int found_16bit;
172
173 /* Number of hand-written 32-bit nop / bswaps found. */
174 int found_32bit;
175
176 /* Number of hand-written 64-bit nop / bswaps found. */
177 int found_64bit;
178 } nop_stats, bswap_stats;
179
180 static struct
181 {
182 /* Number of widening multiplication ops inserted. */
183 int widen_mults_inserted;
184
185 /* Number of integer multiply-and-accumulate ops inserted. */
186 int maccs_inserted;
187
188 /* Number of fp fused multiply-add ops inserted. */
189 int fmas_inserted;
190
191 /* Number of divmod calls inserted. */
192 int divmod_calls_inserted;
193 } widen_mul_stats;
194
195 /* The instance of "struct occurrence" representing the highest
196 interesting block in the dominator tree. */
197 static struct occurrence *occ_head;
198
199 /* Allocation pool for getting instances of "struct occurrence". */
200 static object_allocator<occurrence> *occ_pool;
201
202
203
204 /* Allocate and return a new struct occurrence for basic block BB,
205    whose children list is headed by CHILDREN.  */
206 static struct occurrence *
207 occ_new (basic_block bb, struct occurrence *children)
208 {
209 struct occurrence *occ;
210
211 bb->aux = occ = occ_pool->allocate ();
212 memset (occ, 0, sizeof (struct occurrence));
213
214 occ->bb = bb;
215 occ->children = children;
216 return occ;
217 }
218
219
220 /* Insert NEW_OCC into our subset of the dominator tree. P_HEAD points to a
221 list of "struct occurrence"s, one per basic block, having IDOM as
222 their common dominator.
223
224 We try to insert NEW_OCC as deep as possible in the tree, and we also
225 insert any other block that is a common dominator for BB and one
226 block already in the tree. */
227
228 static void
229 insert_bb (struct occurrence *new_occ, basic_block idom,
230 struct occurrence **p_head)
231 {
232 struct occurrence *occ, **p_occ;
233
234 for (p_occ = p_head; (occ = *p_occ) != NULL; )
235 {
236 basic_block bb = new_occ->bb, occ_bb = occ->bb;
237 basic_block dom = nearest_common_dominator (CDI_DOMINATORS, occ_bb, bb);
238 if (dom == bb)
239 {
240 /* BB dominates OCC_BB. OCC becomes NEW_OCC's child: remove OCC
241 from its list. */
242 *p_occ = occ->next;
243 occ->next = new_occ->children;
244 new_occ->children = occ;
245
246 	  /* Try the next block (it may be dominated by BB as well).  */
247 }
248
249 else if (dom == occ_bb)
250 {
251 /* OCC_BB dominates BB. Tail recurse to look deeper. */
252 insert_bb (new_occ, dom, &occ->children);
253 return;
254 }
255
256 else if (dom != idom)
257 {
258 gcc_assert (!dom->aux);
259
260 /* There is a dominator between IDOM and BB, add it and make
261 two children out of NEW_OCC and OCC. First, remove OCC from
262 its list. */
263 *p_occ = occ->next;
264 new_occ->next = occ;
265 occ->next = NULL;
266
267 /* None of the previous blocks has DOM as a dominator: if we tail
268 recursed, we would reexamine them uselessly. Just switch BB with
269 DOM, and go on looking for blocks dominated by DOM. */
270 new_occ = occ_new (dom, new_occ);
271 }
272
273 else
274 {
275 /* Nothing special, go on with the next element. */
276 p_occ = &occ->next;
277 }
278 }
279
280 /* No place was found as a child of IDOM. Make BB a sibling of IDOM. */
281 new_occ->next = *p_head;
282 *p_head = new_occ;
283 }
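
/* An illustrative walk of the code above (not from the original sources):
   suppose the subset so far holds a single block B1 and we insert a new
   block B2.  If B2 dominates B1, B1 is re-linked as a child of B2; if B1
   dominates B2, we tail-recurse into B1's children; if neither dominates
   the other and their nearest common dominator D is deeper than IDOM, D is
   materialized with occ_new and adopts both as children; otherwise B2
   simply becomes another sibling under IDOM.  */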
284
285 /* Register that we found a division in BB. */
286
287 static inline void
288 register_division_in (basic_block bb)
289 {
290 struct occurrence *occ;
291
292 occ = (struct occurrence *) bb->aux;
293 if (!occ)
294 {
295 occ = occ_new (bb, NULL);
296 insert_bb (occ, ENTRY_BLOCK_PTR_FOR_FN (cfun), &occ_head);
297 }
298
299 occ->bb_has_division = true;
300 occ->num_divisions++;
301 }
302
303
304 /* Compute the number of divisions that postdominate each block in OCC and
305 its children. */
306
307 static void
308 compute_merit (struct occurrence *occ)
309 {
310 struct occurrence *occ_child;
311 basic_block dom = occ->bb;
312
313 for (occ_child = occ->children; occ_child; occ_child = occ_child->next)
314 {
315 basic_block bb;
316 if (occ_child->children)
317 compute_merit (occ_child);
318
319 if (flag_exceptions)
320 bb = single_noncomplex_succ (dom);
321 else
322 bb = dom;
323
324 if (dominated_by_p (CDI_POST_DOMINATORS, bb, occ_child->bb))
325 occ->num_divisions += occ_child->num_divisions;
326 }
327 }
328
329
330 /* Return whether USE_STMT is a floating-point division by DEF. */
331 static inline bool
332 is_division_by (gimple *use_stmt, tree def)
333 {
334 return is_gimple_assign (use_stmt)
335 && gimple_assign_rhs_code (use_stmt) == RDIV_EXPR
336 && gimple_assign_rhs2 (use_stmt) == def
337 	 /* Do not recognize x / x as a valid division, as we would get
338 	    confused later by replacing all immediate uses of x in such
339 	    a stmt.  */
340 && gimple_assign_rhs1 (use_stmt) != def;
341 }
342
343 /* Walk the subset of the dominator tree rooted at OCC, setting the
344 RECIP_DEF field to a definition of 1.0 / DEF that can be used in
345 the given basic block. The field may be left NULL, of course,
346 if it is not possible or profitable to do the optimization.
347
348    DEF_GSI is an iterator pointing at the statement defining DEF.
349 If RECIP_DEF is set, a dominator already has a computation that can
350 be used. */
351
352 static void
353 insert_reciprocals (gimple_stmt_iterator *def_gsi, struct occurrence *occ,
354 tree def, tree recip_def, int threshold)
355 {
356 tree type;
357 gassign *new_stmt;
358 gimple_stmt_iterator gsi;
359 struct occurrence *occ_child;
360
361 if (!recip_def
362 && (occ->bb_has_division || !flag_trapping_math)
363 && occ->num_divisions >= threshold)
364 {
365 /* Make a variable with the replacement and substitute it. */
366 type = TREE_TYPE (def);
367 recip_def = create_tmp_reg (type, "reciptmp");
368 new_stmt = gimple_build_assign (recip_def, RDIV_EXPR,
369 build_one_cst (type), def);
370
371 if (occ->bb_has_division)
372 {
373 /* Case 1: insert before an existing division. */
374 gsi = gsi_after_labels (occ->bb);
375 while (!gsi_end_p (gsi) && !is_division_by (gsi_stmt (gsi), def))
376 gsi_next (&gsi);
377
378 gsi_insert_before (&gsi, new_stmt, GSI_SAME_STMT);
379 }
380 else if (def_gsi && occ->bb == def_gsi->bb)
381 {
382 /* Case 2: insert right after the definition. Note that this will
383 never happen if the definition statement can throw, because in
384 that case the sole successor of the statement's basic block will
385 dominate all the uses as well. */
386 gsi_insert_after (def_gsi, new_stmt, GSI_NEW_STMT);
387 }
388 else
389 {
390 /* Case 3: insert in a basic block not containing defs/uses. */
391 gsi = gsi_after_labels (occ->bb);
392 gsi_insert_before (&gsi, new_stmt, GSI_SAME_STMT);
393 }
394
395 reciprocal_stats.rdivs_inserted++;
396
397 occ->recip_def_stmt = new_stmt;
398 }
399
400 occ->recip_def = recip_def;
401 for (occ_child = occ->children; occ_child; occ_child = occ_child->next)
402 insert_reciprocals (def_gsi, occ_child, def, recip_def, threshold);
403 }
404
405
406 /* Replace the division at USE_P with a multiplication by the reciprocal, if
407 possible. */
408
409 static inline void
410 replace_reciprocal (use_operand_p use_p)
411 {
412 gimple *use_stmt = USE_STMT (use_p);
413 basic_block bb = gimple_bb (use_stmt);
414 struct occurrence *occ = (struct occurrence *) bb->aux;
415
416 if (optimize_bb_for_speed_p (bb)
417 && occ->recip_def && use_stmt != occ->recip_def_stmt)
418 {
419 gimple_stmt_iterator gsi = gsi_for_stmt (use_stmt);
420 gimple_assign_set_rhs_code (use_stmt, MULT_EXPR);
421 SET_USE (use_p, occ->recip_def);
422 fold_stmt_inplace (&gsi);
423 update_stmt (use_stmt);
424 }
425 }
426
427
428 /* Free OCC and return one more "struct occurrence" to be freed. */
429
430 static struct occurrence *
431 free_bb (struct occurrence *occ)
432 {
433 struct occurrence *child, *next;
434
435 /* First get the two pointers hanging off OCC. */
436 next = occ->next;
437 child = occ->children;
438 occ->bb->aux = NULL;
439 occ_pool->remove (occ);
440
441 /* Now ensure that we don't recurse unless it is necessary. */
442 if (!child)
443 return next;
444 else
445 {
446 while (next)
447 next = free_bb (next);
448
449 return child;
450 }
451 }
452
453
454 /* Look for floating-point divisions among DEF's uses, and try to
455 replace them by multiplications with the reciprocal. Add
456 as many statements computing the reciprocal as needed.
457
458 DEF must be a GIMPLE register of a floating-point type. */
459
460 static void
461 execute_cse_reciprocals_1 (gimple_stmt_iterator *def_gsi, tree def)
462 {
463 use_operand_p use_p;
464 imm_use_iterator use_iter;
465 struct occurrence *occ;
466 int count = 0, threshold;
467
468 gcc_assert (FLOAT_TYPE_P (TREE_TYPE (def)) && is_gimple_reg (def));
469
470 FOR_EACH_IMM_USE_FAST (use_p, use_iter, def)
471 {
472 gimple *use_stmt = USE_STMT (use_p);
473 if (is_division_by (use_stmt, def))
474 {
475 register_division_in (gimple_bb (use_stmt));
476 count++;
477 }
478 }
479
480 /* Do the expensive part only if we can hope to optimize something. */
481 threshold = targetm.min_divisions_for_recip_mul (TYPE_MODE (TREE_TYPE (def)));
482 if (count >= threshold)
483 {
484 gimple *use_stmt;
485 for (occ = occ_head; occ; occ = occ->next)
486 {
487 compute_merit (occ);
488 insert_reciprocals (def_gsi, occ, def, NULL, threshold);
489 }
490
491 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, def)
492 {
493 if (is_division_by (use_stmt, def))
494 {
495 FOR_EACH_IMM_USE_ON_STMT (use_p, use_iter)
496 replace_reciprocal (use_p);
497 }
498 }
499 }
500
501 for (occ = occ_head; occ; )
502 occ = free_bb (occ);
503
504 occ_head = NULL;
505 }
506
507 /* Return an internal function that implements the reciprocal of CALL,
508 or IFN_LAST if there is no such function that the target supports. */
509
510 internal_fn
511 internal_fn_reciprocal (gcall *call)
512 {
513 internal_fn ifn;
514
515 switch (gimple_call_combined_fn (call))
516 {
517 CASE_CFN_SQRT:
518 ifn = IFN_RSQRT;
519 break;
520
521 default:
522 return IFN_LAST;
523 }
524
525 tree_pair types = direct_internal_fn_types (ifn, call);
526 if (!direct_internal_fn_supported_p (ifn, types, OPTIMIZE_FOR_SPEED))
527 return IFN_LAST;
528
529 return ifn;
530 }
531
532 /* Go through all the floating-point SSA_NAMEs, and call
533 execute_cse_reciprocals_1 on each of them. */
534 namespace {
535
536 const pass_data pass_data_cse_reciprocals =
537 {
538 GIMPLE_PASS, /* type */
539 "recip", /* name */
540 OPTGROUP_NONE, /* optinfo_flags */
541 TV_NONE, /* tv_id */
542 PROP_ssa, /* properties_required */
543 0, /* properties_provided */
544 0, /* properties_destroyed */
545 0, /* todo_flags_start */
546 TODO_update_ssa, /* todo_flags_finish */
547 };
548
549 class pass_cse_reciprocals : public gimple_opt_pass
550 {
551 public:
552 pass_cse_reciprocals (gcc::context *ctxt)
553 : gimple_opt_pass (pass_data_cse_reciprocals, ctxt)
554 {}
555
556 /* opt_pass methods: */
557 virtual bool gate (function *) { return optimize && flag_reciprocal_math; }
558 virtual unsigned int execute (function *);
559
560 }; // class pass_cse_reciprocals
561
562 unsigned int
563 pass_cse_reciprocals::execute (function *fun)
564 {
565 basic_block bb;
566 tree arg;
567
568 occ_pool = new object_allocator<occurrence> ("dominators for recip");
569
570 memset (&reciprocal_stats, 0, sizeof (reciprocal_stats));
571 calculate_dominance_info (CDI_DOMINATORS);
572 calculate_dominance_info (CDI_POST_DOMINATORS);
573
574 if (flag_checking)
575 FOR_EACH_BB_FN (bb, fun)
576 gcc_assert (!bb->aux);
577
578 for (arg = DECL_ARGUMENTS (fun->decl); arg; arg = DECL_CHAIN (arg))
579 if (FLOAT_TYPE_P (TREE_TYPE (arg))
580 && is_gimple_reg (arg))
581 {
582 tree name = ssa_default_def (fun, arg);
583 if (name)
584 execute_cse_reciprocals_1 (NULL, name);
585 }
586
587 FOR_EACH_BB_FN (bb, fun)
588 {
589 tree def;
590
591 for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
592 gsi_next (&gsi))
593 {
594 gphi *phi = gsi.phi ();
595 def = PHI_RESULT (phi);
596 if (! virtual_operand_p (def)
597 && FLOAT_TYPE_P (TREE_TYPE (def)))
598 execute_cse_reciprocals_1 (NULL, def);
599 }
600
601 for (gimple_stmt_iterator gsi = gsi_after_labels (bb); !gsi_end_p (gsi);
602 gsi_next (&gsi))
603 {
604 gimple *stmt = gsi_stmt (gsi);
605
606 if (gimple_has_lhs (stmt)
607 && (def = SINGLE_SSA_TREE_OPERAND (stmt, SSA_OP_DEF)) != NULL
608 && FLOAT_TYPE_P (TREE_TYPE (def))
609 && TREE_CODE (def) == SSA_NAME)
610 execute_cse_reciprocals_1 (&gsi, def);
611 }
612
613 if (optimize_bb_for_size_p (bb))
614 continue;
615
616 /* Scan for a/func(b) and convert it to reciprocal a*rfunc(b). */
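      /* For instance, a / sqrt (b) becomes t = rsqrt (b); a * t when the
	 target supports IFN_RSQRT or provides a builtin_reciprocal hook,
	 provided every use of the sqrt result is such a division (an
	 illustrative sketch; the exact replacement depends on the target).  */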
617 for (gimple_stmt_iterator gsi = gsi_after_labels (bb); !gsi_end_p (gsi);
618 gsi_next (&gsi))
619 {
620 gimple *stmt = gsi_stmt (gsi);
621
622 if (is_gimple_assign (stmt)
623 && gimple_assign_rhs_code (stmt) == RDIV_EXPR)
624 {
625 tree arg1 = gimple_assign_rhs2 (stmt);
626 gimple *stmt1;
627
628 if (TREE_CODE (arg1) != SSA_NAME)
629 continue;
630
631 stmt1 = SSA_NAME_DEF_STMT (arg1);
632
633 if (is_gimple_call (stmt1)
634 && gimple_call_lhs (stmt1))
635 {
636 bool fail;
637 imm_use_iterator ui;
638 use_operand_p use_p;
639 tree fndecl = NULL_TREE;
640
641 gcall *call = as_a <gcall *> (stmt1);
642 internal_fn ifn = internal_fn_reciprocal (call);
643 if (ifn == IFN_LAST)
644 {
645 fndecl = gimple_call_fndecl (call);
646 if (!fndecl
647 || DECL_BUILT_IN_CLASS (fndecl) != BUILT_IN_MD)
648 continue;
649 fndecl = targetm.builtin_reciprocal (fndecl);
650 if (!fndecl)
651 continue;
652 }
653
654 /* Check that all uses of the SSA name are divisions,
655 otherwise replacing the defining statement will do
656 the wrong thing. */
657 fail = false;
658 FOR_EACH_IMM_USE_FAST (use_p, ui, arg1)
659 {
660 gimple *stmt2 = USE_STMT (use_p);
661 if (is_gimple_debug (stmt2))
662 continue;
663 if (!is_gimple_assign (stmt2)
664 || gimple_assign_rhs_code (stmt2) != RDIV_EXPR
665 || gimple_assign_rhs1 (stmt2) == arg1
666 || gimple_assign_rhs2 (stmt2) != arg1)
667 {
668 fail = true;
669 break;
670 }
671 }
672 if (fail)
673 continue;
674
675 gimple_replace_ssa_lhs (call, arg1);
676 if (gimple_call_internal_p (call) != (ifn != IFN_LAST))
677 {
678 auto_vec<tree, 4> args;
679 for (unsigned int i = 0;
680 i < gimple_call_num_args (call); i++)
681 args.safe_push (gimple_call_arg (call, i));
682 gcall *stmt2;
683 if (ifn == IFN_LAST)
684 stmt2 = gimple_build_call_vec (fndecl, args);
685 else
686 stmt2 = gimple_build_call_internal_vec (ifn, args);
687 gimple_call_set_lhs (stmt2, arg1);
688 if (gimple_vdef (call))
689 {
690 gimple_set_vdef (stmt2, gimple_vdef (call));
691 SSA_NAME_DEF_STMT (gimple_vdef (stmt2)) = stmt2;
692 }
693 gimple_call_set_nothrow (stmt2,
694 gimple_call_nothrow_p (call));
695 gimple_set_vuse (stmt2, gimple_vuse (call));
696 gimple_stmt_iterator gsi2 = gsi_for_stmt (call);
697 gsi_replace (&gsi2, stmt2, true);
698 }
699 else
700 {
701 if (ifn == IFN_LAST)
702 gimple_call_set_fndecl (call, fndecl);
703 else
704 gimple_call_set_internal_fn (call, ifn);
705 update_stmt (call);
706 }
707 reciprocal_stats.rfuncs_inserted++;
708
709 FOR_EACH_IMM_USE_STMT (stmt, ui, arg1)
710 {
711 gimple_stmt_iterator gsi = gsi_for_stmt (stmt);
712 gimple_assign_set_rhs_code (stmt, MULT_EXPR);
713 fold_stmt_inplace (&gsi);
714 update_stmt (stmt);
715 }
716 }
717 }
718 }
719 }
720
721 statistics_counter_event (fun, "reciprocal divs inserted",
722 reciprocal_stats.rdivs_inserted);
723 statistics_counter_event (fun, "reciprocal functions inserted",
724 reciprocal_stats.rfuncs_inserted);
725
726 free_dominance_info (CDI_DOMINATORS);
727 free_dominance_info (CDI_POST_DOMINATORS);
728 delete occ_pool;
729 return 0;
730 }
731
732 } // anon namespace
733
734 gimple_opt_pass *
735 make_pass_cse_reciprocals (gcc::context *ctxt)
736 {
737 return new pass_cse_reciprocals (ctxt);
738 }
739
740 /* Records an occurrence at statement USE_STMT in the vector STMTS if
741    it is dominated by *TOP_BB, dominates it, or *TOP_BB is not yet
742    initialized.  Returns true if the occurrence was pushed on
743    the vector.  Adjusts *TOP_BB to be the basic block dominating all
744    statements in the vector.  */
745
746 static bool
747 maybe_record_sincos (vec<gimple *> *stmts,
748 basic_block *top_bb, gimple *use_stmt)
749 {
750 basic_block use_bb = gimple_bb (use_stmt);
751 if (*top_bb
752 && (*top_bb == use_bb
753 || dominated_by_p (CDI_DOMINATORS, use_bb, *top_bb)))
754 stmts->safe_push (use_stmt);
755 else if (!*top_bb
756 || dominated_by_p (CDI_DOMINATORS, *top_bb, use_bb))
757 {
758 stmts->safe_push (use_stmt);
759 *top_bb = use_bb;
760 }
761 else
762 return false;
763
764 return true;
765 }
766
767 /* Look for sin, cos and cexpi calls with the same argument NAME and
768    create a single call to cexpi, CSEing the result in this case.
769    We first walk over all immediate uses of the argument, collecting
770    statements that we can CSE into a vector, and in a second pass replace
771 the statement rhs with a REALPART or IMAGPART expression on the
772 result of the cexpi call we insert before the use statement that
773 dominates all other candidates. */
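
/* For instance (an illustrative sketch; "t" is a made-up temporary):

     s = sin (x);
     c = cos (x);

   becomes

     t = cexpi (x);
     s = IMAGPART_EXPR <t>;
     c = REALPART_EXPR <t>;

   with the cexpi call inserted at the dominating use, or right after the
   definition of x when that definition is in the same block.  */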
774
775 static bool
776 execute_cse_sincos_1 (tree name)
777 {
778 gimple_stmt_iterator gsi;
779 imm_use_iterator use_iter;
780 tree fndecl, res, type;
781 gimple *def_stmt, *use_stmt, *stmt;
782 int seen_cos = 0, seen_sin = 0, seen_cexpi = 0;
783 auto_vec<gimple *> stmts;
784 basic_block top_bb = NULL;
785 int i;
786 bool cfg_changed = false;
787
788 type = TREE_TYPE (name);
789 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, name)
790 {
791 if (gimple_code (use_stmt) != GIMPLE_CALL
792 || !gimple_call_lhs (use_stmt))
793 continue;
794
795 switch (gimple_call_combined_fn (use_stmt))
796 {
797 CASE_CFN_COS:
798 seen_cos |= maybe_record_sincos (&stmts, &top_bb, use_stmt) ? 1 : 0;
799 break;
800
801 CASE_CFN_SIN:
802 seen_sin |= maybe_record_sincos (&stmts, &top_bb, use_stmt) ? 1 : 0;
803 break;
804
805 CASE_CFN_CEXPI:
806 seen_cexpi |= maybe_record_sincos (&stmts, &top_bb, use_stmt) ? 1 : 0;
807 break;
808
809 default:;
810 }
811 }
812
813 if (seen_cos + seen_sin + seen_cexpi <= 1)
814 return false;
815
816 /* Simply insert cexpi at the beginning of top_bb but not earlier than
817 the name def statement. */
818 fndecl = mathfn_built_in (type, BUILT_IN_CEXPI);
819 if (!fndecl)
820 return false;
821 stmt = gimple_build_call (fndecl, 1, name);
822 res = make_temp_ssa_name (TREE_TYPE (TREE_TYPE (fndecl)), stmt, "sincostmp");
823 gimple_call_set_lhs (stmt, res);
824
825 def_stmt = SSA_NAME_DEF_STMT (name);
826 if (!SSA_NAME_IS_DEFAULT_DEF (name)
827 && gimple_code (def_stmt) != GIMPLE_PHI
828 && gimple_bb (def_stmt) == top_bb)
829 {
830 gsi = gsi_for_stmt (def_stmt);
831 gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
832 }
833 else
834 {
835 gsi = gsi_after_labels (top_bb);
836 gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
837 }
838 sincos_stats.inserted++;
839
840 /* And adjust the recorded old call sites. */
841 for (i = 0; stmts.iterate (i, &use_stmt); ++i)
842 {
843 tree rhs = NULL;
844
845 switch (gimple_call_combined_fn (use_stmt))
846 {
847 CASE_CFN_COS:
848 rhs = fold_build1 (REALPART_EXPR, type, res);
849 break;
850
851 CASE_CFN_SIN:
852 rhs = fold_build1 (IMAGPART_EXPR, type, res);
853 break;
854
855 CASE_CFN_CEXPI:
856 rhs = res;
857 break;
858
859 default:;
860 gcc_unreachable ();
861 }
862
863 /* Replace call with a copy. */
864 stmt = gimple_build_assign (gimple_call_lhs (use_stmt), rhs);
865
866 gsi = gsi_for_stmt (use_stmt);
867 gsi_replace (&gsi, stmt, true);
868 if (gimple_purge_dead_eh_edges (gimple_bb (stmt)))
869 cfg_changed = true;
870 }
871
872 return cfg_changed;
873 }
874
875 /* To evaluate powi(x,n), the floating point value x raised to the
876 constant integer exponent n, we use a hybrid algorithm that
877 combines the "window method" with look-up tables. For an
878 introduction to exponentiation algorithms and "addition chains",
879 see section 4.6.3, "Evaluation of Powers" of Donald E. Knuth,
880 "Seminumerical Algorithms", Vol. 2, "The Art of Computer Programming",
881 3rd Edition, 1998, and Daniel M. Gordon, "A Survey of Fast Exponentiation
882 Methods", Journal of Algorithms, Vol. 27, pp. 129-146, 1998. */
883
884 /* Provide a default value for POWI_MAX_MULTS, the maximum number of
885 multiplications to inline before calling the system library's pow
886 function. powi(x,n) requires at worst 2*bits(n)-2 multiplications,
887 so this default never requires calling pow, powf or powl. */
888
889 #ifndef POWI_MAX_MULTS
890 #define POWI_MAX_MULTS (2*HOST_BITS_PER_WIDE_INT-2)
891 #endif
892
893 /* The size of the "optimal power tree" lookup table. All
894 exponents less than this value are simply looked up in the
895 powi_table below. This threshold is also used to size the
896 cache of pseudo registers that hold intermediate results. */
897 #define POWI_TABLE_SIZE 256
898
899 /* The size, in bits, of the window used in the "window method"
900    exponentiation algorithm.  This is equivalent to a radix of
901 (1<<POWI_WINDOW_SIZE) in the corresponding "m-ary method". */
902 #define POWI_WINDOW_SIZE 3
903
904 /* The following table is an efficient representation of an
905 "optimal power tree". For each value, i, the corresponding
906    value, j, in the table states that an optimal evaluation
907 sequence for calculating pow(x,i) can be found by evaluating
908 pow(x,j)*pow(x,i-j). An optimal power tree for the first
909    100 integers is given in Knuth's "Seminumerical Algorithms".  */
910
911 static const unsigned char powi_table[POWI_TABLE_SIZE] =
912 {
913 0, 1, 1, 2, 2, 3, 3, 4, /* 0 - 7 */
914 4, 6, 5, 6, 6, 10, 7, 9, /* 8 - 15 */
915 8, 16, 9, 16, 10, 12, 11, 13, /* 16 - 23 */
916 12, 17, 13, 18, 14, 24, 15, 26, /* 24 - 31 */
917 16, 17, 17, 19, 18, 33, 19, 26, /* 32 - 39 */
918 20, 25, 21, 40, 22, 27, 23, 44, /* 40 - 47 */
919 24, 32, 25, 34, 26, 29, 27, 44, /* 48 - 55 */
920 28, 31, 29, 34, 30, 60, 31, 36, /* 56 - 63 */
921 32, 64, 33, 34, 34, 46, 35, 37, /* 64 - 71 */
922 36, 65, 37, 50, 38, 48, 39, 69, /* 72 - 79 */
923 40, 49, 41, 43, 42, 51, 43, 58, /* 80 - 87 */
924 44, 64, 45, 47, 46, 59, 47, 76, /* 88 - 95 */
925 48, 65, 49, 66, 50, 67, 51, 66, /* 96 - 103 */
926 52, 70, 53, 74, 54, 104, 55, 74, /* 104 - 111 */
927 56, 64, 57, 69, 58, 78, 59, 68, /* 112 - 119 */
928 60, 61, 61, 80, 62, 75, 63, 68, /* 120 - 127 */
929 64, 65, 65, 128, 66, 129, 67, 90, /* 128 - 135 */
930 68, 73, 69, 131, 70, 94, 71, 88, /* 136 - 143 */
931 72, 128, 73, 98, 74, 132, 75, 121, /* 144 - 151 */
932 76, 102, 77, 124, 78, 132, 79, 106, /* 152 - 159 */
933 80, 97, 81, 160, 82, 99, 83, 134, /* 160 - 167 */
934 84, 86, 85, 95, 86, 160, 87, 100, /* 168 - 175 */
935 88, 113, 89, 98, 90, 107, 91, 122, /* 176 - 183 */
936 92, 111, 93, 102, 94, 126, 95, 150, /* 184 - 191 */
937 96, 128, 97, 130, 98, 133, 99, 195, /* 192 - 199 */
938 100, 128, 101, 123, 102, 164, 103, 138, /* 200 - 207 */
939 104, 145, 105, 146, 106, 109, 107, 149, /* 208 - 215 */
940 108, 200, 109, 146, 110, 170, 111, 157, /* 216 - 223 */
941 112, 128, 113, 130, 114, 182, 115, 132, /* 224 - 231 */
942 116, 200, 117, 132, 118, 158, 119, 206, /* 232 - 239 */
943 120, 240, 121, 162, 122, 147, 123, 152, /* 240 - 247 */
944 124, 166, 125, 214, 126, 138, 127, 153, /* 248 - 255 */
945 };
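
/* A worked example of the table (derived by hand from the entries above,
   purely illustrative): powi_table[15] == 9, powi_table[9] == 6,
   powi_table[6] == 3, powi_table[3] == 2 and powi_table[2] == 1, so
   pow (x, 15) is evaluated as

     t2  = x * x
     t3  = t2 * x
     t6  = t3 * t3
     t9  = t6 * t3
     t15 = t9 * t6

   i.e. five multiplications, one fewer than binary (square-and-multiply)
   exponentiation needs for the same exponent.  */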
946
947
948 /* Return the number of multiplications required to calculate
949 powi(x,n) where n is less than POWI_TABLE_SIZE. This is a
950 subroutine of powi_cost. CACHE is an array indicating
951 which exponents have already been calculated. */
952
953 static int
954 powi_lookup_cost (unsigned HOST_WIDE_INT n, bool *cache)
955 {
956 /* If we've already calculated this exponent, then this evaluation
957 doesn't require any additional multiplications. */
958 if (cache[n])
959 return 0;
960
961 cache[n] = true;
962 return powi_lookup_cost (n - powi_table[n], cache)
963 + powi_lookup_cost (powi_table[n], cache) + 1;
964 }
965
966 /* Return the number of multiplications required to calculate
967 powi(x,n) for an arbitrary x, given the exponent N. This
968 function needs to be kept in sync with powi_as_mults below. */
969
970 static int
971 powi_cost (HOST_WIDE_INT n)
972 {
973 bool cache[POWI_TABLE_SIZE];
974 unsigned HOST_WIDE_INT digit;
975 unsigned HOST_WIDE_INT val;
976 int result;
977
978 if (n == 0)
979 return 0;
980
981 /* Ignore the reciprocal when calculating the cost. */
982 val = (n < 0) ? -n : n;
983
984 /* Initialize the exponent cache. */
985 memset (cache, 0, POWI_TABLE_SIZE * sizeof (bool));
986 cache[1] = true;
987
988 result = 0;
989
990 while (val >= POWI_TABLE_SIZE)
991 {
992 if (val & 1)
993 {
994 digit = val & ((1 << POWI_WINDOW_SIZE) - 1);
995 result += powi_lookup_cost (digit, cache)
996 + POWI_WINDOW_SIZE + 1;
997 val >>= POWI_WINDOW_SIZE;
998 }
999 else
1000 {
1001 val >>= 1;
1002 result++;
1003 }
1004 }
1005
1006 return result + powi_lookup_cost (val, cache);
1007 }
1008
1009 /* Recursive subroutine of powi_as_mults. This function takes the
1010 array, CACHE, of already calculated exponents and an exponent N and
1011 returns a tree that corresponds to CACHE[1]**N, with type TYPE. */
1012
1013 static tree
1014 powi_as_mults_1 (gimple_stmt_iterator *gsi, location_t loc, tree type,
1015 HOST_WIDE_INT n, tree *cache)
1016 {
1017 tree op0, op1, ssa_target;
1018 unsigned HOST_WIDE_INT digit;
1019 gassign *mult_stmt;
1020
1021 if (n < POWI_TABLE_SIZE && cache[n])
1022 return cache[n];
1023
1024 ssa_target = make_temp_ssa_name (type, NULL, "powmult");
1025
1026 if (n < POWI_TABLE_SIZE)
1027 {
1028 cache[n] = ssa_target;
1029 op0 = powi_as_mults_1 (gsi, loc, type, n - powi_table[n], cache);
1030 op1 = powi_as_mults_1 (gsi, loc, type, powi_table[n], cache);
1031 }
1032 else if (n & 1)
1033 {
1034 digit = n & ((1 << POWI_WINDOW_SIZE) - 1);
1035 op0 = powi_as_mults_1 (gsi, loc, type, n - digit, cache);
1036 op1 = powi_as_mults_1 (gsi, loc, type, digit, cache);
1037 }
1038 else
1039 {
1040 op0 = powi_as_mults_1 (gsi, loc, type, n >> 1, cache);
1041 op1 = op0;
1042 }
1043
1044 mult_stmt = gimple_build_assign (ssa_target, MULT_EXPR, op0, op1);
1045 gimple_set_location (mult_stmt, loc);
1046 gsi_insert_before (gsi, mult_stmt, GSI_SAME_STMT);
1047
1048 return ssa_target;
1049 }
1050
1051 /* Convert ARG0**N to a tree of multiplications of ARG0 with itself.
1052 This function needs to be kept in sync with powi_cost above. */
1053
1054 static tree
1055 powi_as_mults (gimple_stmt_iterator *gsi, location_t loc,
1056 tree arg0, HOST_WIDE_INT n)
1057 {
1058 tree cache[POWI_TABLE_SIZE], result, type = TREE_TYPE (arg0);
1059 gassign *div_stmt;
1060 tree target;
1061
1062 if (n == 0)
1063 return build_real (type, dconst1);
1064
1065 memset (cache, 0, sizeof (cache));
1066 cache[1] = arg0;
1067
1068 result = powi_as_mults_1 (gsi, loc, type, (n < 0) ? -n : n, cache);
1069 if (n >= 0)
1070 return result;
1071
1072 /* If the original exponent was negative, reciprocate the result. */
1073 target = make_temp_ssa_name (type, NULL, "powmult");
1074 div_stmt = gimple_build_assign (target, RDIV_EXPR,
1075 build_real (type, dconst1), result);
1076 gimple_set_location (div_stmt, loc);
1077 gsi_insert_before (gsi, div_stmt, GSI_SAME_STMT);
1078
1079 return target;
1080 }
1081
1082 /* ARG0 and N are the two arguments to a powi builtin in GSI with
1083 location info LOC. If the arguments are appropriate, create an
1084 equivalent sequence of statements prior to GSI using an optimal
1085    number of multiplications, and return an expression holding the
1086 result. */
1087
1088 static tree
1089 gimple_expand_builtin_powi (gimple_stmt_iterator *gsi, location_t loc,
1090 tree arg0, HOST_WIDE_INT n)
1091 {
1092 /* Avoid largest negative number. */
1093 if (n != -n
1094 && ((n >= -1 && n <= 2)
1095 || (optimize_function_for_speed_p (cfun)
1096 && powi_cost (n) <= POWI_MAX_MULTS)))
1097 return powi_as_mults (gsi, loc, arg0, n);
1098
1099 return NULL_TREE;
1100 }
1101
1102 /* Build a gimple call statement that calls FN with argument ARG.
1103 Set the lhs of the call statement to a fresh SSA name. Insert the
1104 statement prior to GSI's current position, and return the fresh
1105 SSA name. */
1106
1107 static tree
1108 build_and_insert_call (gimple_stmt_iterator *gsi, location_t loc,
1109 tree fn, tree arg)
1110 {
1111 gcall *call_stmt;
1112 tree ssa_target;
1113
1114 call_stmt = gimple_build_call (fn, 1, arg);
1115 ssa_target = make_temp_ssa_name (TREE_TYPE (arg), NULL, "powroot");
1116 gimple_set_lhs (call_stmt, ssa_target);
1117 gimple_set_location (call_stmt, loc);
1118 gsi_insert_before (gsi, call_stmt, GSI_SAME_STMT);
1119
1120 return ssa_target;
1121 }
1122
1123 /* Build a gimple binary operation with the given CODE and arguments
1124    ARG0, ARG1, assigning the result to a new SSA name built from
1125    NAME.  Insert the statement prior to GSI's current position, and
1126    return the fresh SSA name.  */
1127
1128 static tree
1129 build_and_insert_binop (gimple_stmt_iterator *gsi, location_t loc,
1130 const char *name, enum tree_code code,
1131 tree arg0, tree arg1)
1132 {
1133 tree result = make_temp_ssa_name (TREE_TYPE (arg0), NULL, name);
1134 gassign *stmt = gimple_build_assign (result, code, arg0, arg1);
1135 gimple_set_location (stmt, loc);
1136 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
1137 return result;
1138 }
1139
1140 /* Build a gimple reference operation with the given CODE and argument
1141    ARG0, assigning the result to a new SSA name of TYPE with NAME.
1142 Insert the statement prior to GSI's current position, and return
1143 the fresh SSA name. */
1144
1145 static inline tree
1146 build_and_insert_ref (gimple_stmt_iterator *gsi, location_t loc, tree type,
1147 const char *name, enum tree_code code, tree arg0)
1148 {
1149 tree result = make_temp_ssa_name (type, NULL, name);
1150 gimple *stmt = gimple_build_assign (result, build1 (code, type, arg0));
1151 gimple_set_location (stmt, loc);
1152 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
1153 return result;
1154 }
1155
1156 /* Build a gimple assignment to cast VAL to TYPE. Insert the statement
1157 prior to GSI's current position, and return the fresh SSA name. */
1158
1159 static tree
1160 build_and_insert_cast (gimple_stmt_iterator *gsi, location_t loc,
1161 tree type, tree val)
1162 {
1163 tree result = make_ssa_name (type);
1164 gassign *stmt = gimple_build_assign (result, NOP_EXPR, val);
1165 gimple_set_location (stmt, loc);
1166 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
1167 return result;
1168 }
1169
1170 struct pow_synth_sqrt_info
1171 {
1172 bool *factors;
1173 unsigned int deepest;
1174 unsigned int num_mults;
1175 };
1176
1177 /* Return true iff the real value C can be represented as a
1178 sum of powers of 0.5 up to N. That is:
1179 C == SUM<i from 1..N> (a[i]*(0.5**i)) where a[i] is either 0 or 1.
1180 Record in INFO the various parameters of the synthesis algorithm such
1181 as the factors a[i], the maximum 0.5 power and the number of
1182 multiplications that will be required. */
1183
1184 bool
1185 representable_as_half_series_p (REAL_VALUE_TYPE c, unsigned n,
1186 struct pow_synth_sqrt_info *info)
1187 {
1188 REAL_VALUE_TYPE factor = dconsthalf;
1189 REAL_VALUE_TYPE remainder = c;
1190
1191 info->deepest = 0;
1192 info->num_mults = 0;
1193 memset (info->factors, 0, n * sizeof (bool));
1194
1195 for (unsigned i = 0; i < n; i++)
1196 {
1197 REAL_VALUE_TYPE res;
1198
1199       /* If something inexact happened, bail out now.  */
1200 if (real_arithmetic (&res, MINUS_EXPR, &remainder, &factor))
1201 return false;
1202
1203 /* We have hit zero. The number is representable as a sum
1204 of powers of 0.5. */
1205 if (real_equal (&res, &dconst0))
1206 {
1207 info->factors[i] = true;
1208 info->deepest = i + 1;
1209 return true;
1210 }
1211 else if (!REAL_VALUE_NEGATIVE (res))
1212 {
1213 remainder = res;
1214 info->factors[i] = true;
1215 info->num_mults++;
1216 }
1217 else
1218 info->factors[i] = false;
1219
1220 real_arithmetic (&factor, MULT_EXPR, &factor, &dconsthalf);
1221 }
1222 return false;
1223 }
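
/* A worked example (traced by hand, purely illustrative): for C == 0.625
   and N >= 3 the loop finds 0.625 == 0.5 + 0.125 == 0.5**1 + 0.5**3, so
   INFO records factors {1, 0, 1}, deepest == 3 and num_mults == 1, and the
   caller can synthesize

     pow (x, 0.625) == sqrt (x) * sqrt (sqrt (sqrt (x)))

   using a single multiplication on top of the three square-root calls.  */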
1224
1225 /* Return the tree corresponding to FN being applied
1226 to ARG N times at GSI and LOC.
1227 Look up previous results from CACHE if need be.
1228 cache[0] should contain just plain ARG i.e. FN applied to ARG 0 times. */
1229
1230 static tree
1231 get_fn_chain (tree arg, unsigned int n, gimple_stmt_iterator *gsi,
1232 tree fn, location_t loc, tree *cache)
1233 {
1234 tree res = cache[n];
1235 if (!res)
1236 {
1237 tree prev = get_fn_chain (arg, n - 1, gsi, fn, loc, cache);
1238 res = build_and_insert_call (gsi, loc, fn, prev);
1239 cache[n] = res;
1240 }
1241
1242 return res;
1243 }
1244
1245 /* Print to STREAM the repeated application of function FNAME to ARG
1246 N times. So, for FNAME = "foo", ARG = "x", N = 2 it would print:
1247 "foo (foo (x))". */
1248
1249 static void
1250 print_nested_fn (FILE* stream, const char *fname, const char* arg,
1251 unsigned int n)
1252 {
1253 if (n == 0)
1254 fprintf (stream, "%s", arg);
1255 else
1256 {
1257 fprintf (stream, "%s (", fname);
1258 print_nested_fn (stream, fname, arg, n - 1);
1259 fprintf (stream, ")");
1260 }
1261 }
1262
1263 /* Print to STREAM the fractional sequence of sqrt chains
1264 applied to ARG, described by INFO. Used for the dump file. */
1265
1266 static void
1267 dump_fractional_sqrt_sequence (FILE *stream, const char *arg,
1268 struct pow_synth_sqrt_info *info)
1269 {
1270 for (unsigned int i = 0; i < info->deepest; i++)
1271 {
1272 bool is_set = info->factors[i];
1273 if (is_set)
1274 {
1275 print_nested_fn (stream, "sqrt", arg, i + 1);
1276 if (i != info->deepest - 1)
1277 fprintf (stream, " * ");
1278 }
1279 }
1280 }
1281
1282 /* Print to STREAM a representation of raising ARG to an integer
1283 power N. Used for the dump file. */
1284
1285 static void
1286 dump_integer_part (FILE *stream, const char* arg, HOST_WIDE_INT n)
1287 {
1288 if (n > 1)
1289 fprintf (stream, "powi (%s, " HOST_WIDE_INT_PRINT_DEC ")", arg, n);
1290 else if (n == 1)
1291 fprintf (stream, "%s", arg);
1292 }
1293
1294 /* Attempt to synthesize a POW[F] (ARG0, ARG1) call using chains of
1295 square roots. Place at GSI and LOC. Limit the maximum depth
1296 of the sqrt chains to MAX_DEPTH. Return the tree holding the
1297 result of the expanded sequence or NULL_TREE if the expansion failed.
1298
1299 This routine assumes that ARG1 is a real number with a fractional part
1300 (the integer exponent case will have been handled earlier in
1301 gimple_expand_builtin_pow).
1302
1303 For ARG1 > 0.0:
1304 * For ARG1 composed of a whole part WHOLE_PART and a fractional part
1305 FRAC_PART i.e. WHOLE_PART == floor (ARG1) and
1306 FRAC_PART == ARG1 - WHOLE_PART:
1307 Produce POWI (ARG0, WHOLE_PART) * POW (ARG0, FRAC_PART) where
1308 POW (ARG0, FRAC_PART) is expanded as a product of square root chains
1309 if it can be expressed as such, that is if FRAC_PART satisfies:
1310 FRAC_PART == <SUM from i = 1 until MAX_DEPTH> (a[i] * (0.5**i))
1311 where integer a[i] is either 0 or 1.
1312
1313 Example:
1314 POW (x, 3.625) == POWI (x, 3) * POW (x, 0.625)
1315 --> POWI (x, 3) * SQRT (x) * SQRT (SQRT (SQRT (x)))
1316
1317 For ARG1 < 0.0 there are two approaches:
1318 * (A) Expand to 1.0 / POW (ARG0, -ARG1) where POW (ARG0, -ARG1)
1319 is calculated as above.
1320
1321 Example:
1322 POW (x, -5.625) == 1.0 / POW (x, 5.625)
1323 --> 1.0 / (POWI (x, 5) * SQRT (x) * SQRT (SQRT (SQRT (x))))
1324
1325 * (B) : WHOLE_PART := - ceil (abs (ARG1))
1326 FRAC_PART := ARG1 - WHOLE_PART
1327 and expand to POW (x, FRAC_PART) / POWI (x, WHOLE_PART).
1328 Example:
1329 POW (x, -5.875) == POW (x, 0.125) / POWI (X, 6)
1330 --> SQRT (SQRT (SQRT (x))) / (POWI (x, 6))
1331
1332 For ARG1 < 0.0 we choose between (A) and (B) depending on
1333 how many multiplications we'd have to do.
1334 So, for the example in (B): POW (x, -5.875), if we were to
1335 follow algorithm (A) we would produce:
1336 1.0 / POWI (X, 5) * SQRT (X) * SQRT (SQRT (X)) * SQRT (SQRT (SQRT (X)))
1337 which contains more multiplications than approach (B).
1338
1339 Hopefully, this approach will eliminate potentially expensive POW library
1340 calls when unsafe floating point math is enabled and allow the compiler to
1341 further optimise the multiplies, square roots and divides produced by this
1342 function. */
1343
1344 static tree
1345 expand_pow_as_sqrts (gimple_stmt_iterator *gsi, location_t loc,
1346 tree arg0, tree arg1, HOST_WIDE_INT max_depth)
1347 {
1348 tree type = TREE_TYPE (arg0);
1349 machine_mode mode = TYPE_MODE (type);
1350 tree sqrtfn = mathfn_built_in (type, BUILT_IN_SQRT);
1351 bool one_over = true;
1352
1353 if (!sqrtfn)
1354 return NULL_TREE;
1355
1356 if (TREE_CODE (arg1) != REAL_CST)
1357 return NULL_TREE;
1358
1359 REAL_VALUE_TYPE exp_init = TREE_REAL_CST (arg1);
1360
1361 gcc_assert (max_depth > 0);
1362 tree *cache = XALLOCAVEC (tree, max_depth + 1);
1363
1364 struct pow_synth_sqrt_info synth_info;
1365 synth_info.factors = XALLOCAVEC (bool, max_depth + 1);
1366 synth_info.deepest = 0;
1367 synth_info.num_mults = 0;
1368
1369 bool neg_exp = REAL_VALUE_NEGATIVE (exp_init);
1370 REAL_VALUE_TYPE exp = real_value_abs (&exp_init);
1371
1372 /* The whole and fractional parts of exp. */
1373 REAL_VALUE_TYPE whole_part;
1374 REAL_VALUE_TYPE frac_part;
1375
1376 real_floor (&whole_part, mode, &exp);
1377 real_arithmetic (&frac_part, MINUS_EXPR, &exp, &whole_part);
1378
1379
1380 REAL_VALUE_TYPE ceil_whole = dconst0;
1381 REAL_VALUE_TYPE ceil_fract = dconst0;
1382
1383 if (neg_exp)
1384 {
1385 real_ceil (&ceil_whole, mode, &exp);
1386 real_arithmetic (&ceil_fract, MINUS_EXPR, &ceil_whole, &exp);
1387 }
1388
1389 if (!representable_as_half_series_p (frac_part, max_depth, &synth_info))
1390 return NULL_TREE;
1391
1392 /* Check whether it's more profitable to not use 1.0 / ... */
1393 if (neg_exp)
1394 {
1395 struct pow_synth_sqrt_info alt_synth_info;
1396 alt_synth_info.factors = XALLOCAVEC (bool, max_depth + 1);
1397 alt_synth_info.deepest = 0;
1398 alt_synth_info.num_mults = 0;
1399
1400 if (representable_as_half_series_p (ceil_fract, max_depth,
1401 &alt_synth_info)
1402 && alt_synth_info.deepest <= synth_info.deepest
1403 && alt_synth_info.num_mults < synth_info.num_mults)
1404 {
1405 whole_part = ceil_whole;
1406 frac_part = ceil_fract;
1407 synth_info.deepest = alt_synth_info.deepest;
1408 synth_info.num_mults = alt_synth_info.num_mults;
1409 memcpy (synth_info.factors, alt_synth_info.factors,
1410 (max_depth + 1) * sizeof (bool));
1411 one_over = false;
1412 }
1413 }
1414
1415 HOST_WIDE_INT n = real_to_integer (&whole_part);
1416 REAL_VALUE_TYPE cint;
1417 real_from_integer (&cint, VOIDmode, n, SIGNED);
1418
1419 if (!real_identical (&whole_part, &cint))
1420 return NULL_TREE;
1421
1422 if (powi_cost (n) + synth_info.num_mults > POWI_MAX_MULTS)
1423 return NULL_TREE;
1424
1425 memset (cache, 0, (max_depth + 1) * sizeof (tree));
1426
1427 tree integer_res = n == 0 ? build_real (type, dconst1) : arg0;
1428
1429 /* Calculate the integer part of the exponent. */
1430 if (n > 1)
1431 {
1432 integer_res = gimple_expand_builtin_powi (gsi, loc, arg0, n);
1433 if (!integer_res)
1434 return NULL_TREE;
1435 }
1436
1437 if (dump_file)
1438 {
1439 char string[64];
1440
1441 real_to_decimal (string, &exp_init, sizeof (string), 0, 1);
1442 fprintf (dump_file, "synthesizing pow (x, %s) as:\n", string);
1443
1444 if (neg_exp)
1445 {
1446 if (one_over)
1447 {
1448 fprintf (dump_file, "1.0 / (");
1449 dump_integer_part (dump_file, "x", n);
1450 if (n > 0)
1451 fprintf (dump_file, " * ");
1452 dump_fractional_sqrt_sequence (dump_file, "x", &synth_info);
1453 fprintf (dump_file, ")");
1454 }
1455 else
1456 {
1457 dump_fractional_sqrt_sequence (dump_file, "x", &synth_info);
1458 fprintf (dump_file, " / (");
1459 dump_integer_part (dump_file, "x", n);
1460 fprintf (dump_file, ")");
1461 }
1462 }
1463 else
1464 {
1465 dump_fractional_sqrt_sequence (dump_file, "x", &synth_info);
1466 if (n > 0)
1467 fprintf (dump_file, " * ");
1468 dump_integer_part (dump_file, "x", n);
1469 }
1470
1471 fprintf (dump_file, "\ndeepest sqrt chain: %d\n", synth_info.deepest);
1472 }
1473
1474
1475 tree fract_res = NULL_TREE;
1476 cache[0] = arg0;
1477
1478 /* Calculate the fractional part of the exponent. */
1479 for (unsigned i = 0; i < synth_info.deepest; i++)
1480 {
1481 if (synth_info.factors[i])
1482 {
1483 tree sqrt_chain = get_fn_chain (arg0, i + 1, gsi, sqrtfn, loc, cache);
1484
1485 if (!fract_res)
1486 fract_res = sqrt_chain;
1487
1488 else
1489 fract_res = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
1490 fract_res, sqrt_chain);
1491 }
1492 }
1493
1494 tree res = NULL_TREE;
1495
1496 if (neg_exp)
1497 {
1498 if (one_over)
1499 {
1500 if (n > 0)
1501 res = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
1502 fract_res, integer_res);
1503 else
1504 res = fract_res;
1505
1506 res = build_and_insert_binop (gsi, loc, "powrootrecip", RDIV_EXPR,
1507 build_real (type, dconst1), res);
1508 }
1509 else
1510 {
1511 res = build_and_insert_binop (gsi, loc, "powroot", RDIV_EXPR,
1512 fract_res, integer_res);
1513 }
1514 }
1515 else
1516 res = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
1517 fract_res, integer_res);
1518 return res;
1519 }
1520
1521 /* ARG0 and ARG1 are the two arguments to a pow builtin call in GSI
1522 with location info LOC. If possible, create an equivalent and
1523 less expensive sequence of statements prior to GSI, and return an
1524    expression holding the result.  */
1525
1526 static tree
1527 gimple_expand_builtin_pow (gimple_stmt_iterator *gsi, location_t loc,
1528 tree arg0, tree arg1)
1529 {
1530 REAL_VALUE_TYPE c, cint, dconst1_3, dconst1_4, dconst1_6;
1531 REAL_VALUE_TYPE c2, dconst3;
1532 HOST_WIDE_INT n;
1533 tree type, sqrtfn, cbrtfn, sqrt_arg0, result, cbrt_x, powi_cbrt_x;
1534 machine_mode mode;
1535 bool speed_p = optimize_bb_for_speed_p (gsi_bb (*gsi));
1536 bool hw_sqrt_exists, c_is_int, c2_is_int;
1537
1538 dconst1_4 = dconst1;
1539 SET_REAL_EXP (&dconst1_4, REAL_EXP (&dconst1_4) - 2);
1540
1541 /* If the exponent isn't a constant, there's nothing of interest
1542 to be done. */
1543 if (TREE_CODE (arg1) != REAL_CST)
1544 return NULL_TREE;
1545
1546 /* Don't perform the operation if flag_signaling_nans is on
1547 and the operand is a signaling NaN. */
1548 if (HONOR_SNANS (TYPE_MODE (TREE_TYPE (arg1)))
1549 && ((TREE_CODE (arg0) == REAL_CST
1550 && REAL_VALUE_ISSIGNALING_NAN (TREE_REAL_CST (arg0)))
1551 || REAL_VALUE_ISSIGNALING_NAN (TREE_REAL_CST (arg1))))
1552 return NULL_TREE;
1553
1554 /* If the exponent is equivalent to an integer, expand to an optimal
1555 multiplication sequence when profitable. */
1556 c = TREE_REAL_CST (arg1);
1557 n = real_to_integer (&c);
1558 real_from_integer (&cint, VOIDmode, n, SIGNED);
1559 c_is_int = real_identical (&c, &cint);
1560
1561 if (c_is_int
1562 && ((n >= -1 && n <= 2)
1563 || (flag_unsafe_math_optimizations
1564 && speed_p
1565 && powi_cost (n) <= POWI_MAX_MULTS)))
1566 return gimple_expand_builtin_powi (gsi, loc, arg0, n);
1567
1568 /* Attempt various optimizations using sqrt and cbrt. */
1569 type = TREE_TYPE (arg0);
1570 mode = TYPE_MODE (type);
1571 sqrtfn = mathfn_built_in (type, BUILT_IN_SQRT);
1572
1573 /* Optimize pow(x,0.5) = sqrt(x). This replacement is always safe
1574 unless signed zeros must be maintained. pow(-0,0.5) = +0, while
1575 sqrt(-0) = -0. */
1576 if (sqrtfn
1577 && real_equal (&c, &dconsthalf)
1578 && !HONOR_SIGNED_ZEROS (mode))
1579 return build_and_insert_call (gsi, loc, sqrtfn, arg0);
1580
1581 hw_sqrt_exists = optab_handler (sqrt_optab, mode) != CODE_FOR_nothing;
1582
1583 /* Optimize pow(x,1./3.) = cbrt(x). This requires unsafe math
1584 optimizations since 1./3. is not exactly representable. If x
1585 is negative and finite, the correct value of pow(x,1./3.) is
1586 a NaN with the "invalid" exception raised, because the value
1587 of 1./3. actually has an even denominator. The correct value
1588 of cbrt(x) is a negative real value. */
1589 cbrtfn = mathfn_built_in (type, BUILT_IN_CBRT);
1590 dconst1_3 = real_value_truncate (mode, dconst_third ());
1591
1592 if (flag_unsafe_math_optimizations
1593 && cbrtfn
1594 && (!HONOR_NANS (mode) || tree_expr_nonnegative_p (arg0))
1595 && real_equal (&c, &dconst1_3))
1596 return build_and_insert_call (gsi, loc, cbrtfn, arg0);
1597
1598 /* Optimize pow(x,1./6.) = cbrt(sqrt(x)). Don't do this optimization
1599 if we don't have a hardware sqrt insn. */
1600 dconst1_6 = dconst1_3;
1601 SET_REAL_EXP (&dconst1_6, REAL_EXP (&dconst1_6) - 1);
1602
1603 if (flag_unsafe_math_optimizations
1604 && sqrtfn
1605 && cbrtfn
1606 && (!HONOR_NANS (mode) || tree_expr_nonnegative_p (arg0))
1607 && speed_p
1608 && hw_sqrt_exists
1609 && real_equal (&c, &dconst1_6))
1610 {
1611 /* sqrt(x) */
1612 sqrt_arg0 = build_and_insert_call (gsi, loc, sqrtfn, arg0);
1613
1614 /* cbrt(sqrt(x)) */
1615 return build_and_insert_call (gsi, loc, cbrtfn, sqrt_arg0);
1616 }
1617
1618
1619 /* Attempt to expand the POW as a product of square root chains.
1620      Expand the 0.25 case even when optimising for size.  */
1621 if (flag_unsafe_math_optimizations
1622 && sqrtfn
1623 && hw_sqrt_exists
1624 && (speed_p || real_equal (&c, &dconst1_4))
1625 && !HONOR_SIGNED_ZEROS (mode))
1626 {
1627 unsigned int max_depth = speed_p
1628 ? PARAM_VALUE (PARAM_MAX_POW_SQRT_DEPTH)
1629 : 2;
1630
1631 tree expand_with_sqrts
1632 = expand_pow_as_sqrts (gsi, loc, arg0, arg1, max_depth);
1633
1634 if (expand_with_sqrts)
1635 return expand_with_sqrts;
1636 }
1637
1638 real_arithmetic (&c2, MULT_EXPR, &c, &dconst2);
1639 n = real_to_integer (&c2);
1640 real_from_integer (&cint, VOIDmode, n, SIGNED);
1641 c2_is_int = real_identical (&c2, &cint);
1642
1643 /* Optimize pow(x,c), where 3c = n for some nonzero integer n, into
1644
1645 powi(x, n/3) * powi(cbrt(x), n%3), n > 0;
1646 1.0 / (powi(x, abs(n)/3) * powi(cbrt(x), abs(n)%3)), n < 0.
1647
1648 Do not calculate the first factor when n/3 = 0. As cbrt(x) is
1649 different from pow(x, 1./3.) due to rounding and behavior with
1650 negative x, we need to constrain this transformation to unsafe
1651 math and positive x or finite math. */
1652 real_from_integer (&dconst3, VOIDmode, 3, SIGNED);
1653 real_arithmetic (&c2, MULT_EXPR, &c, &dconst3);
1654 real_round (&c2, mode, &c2);
1655 n = real_to_integer (&c2);
1656 real_from_integer (&cint, VOIDmode, n, SIGNED);
1657 real_arithmetic (&c2, RDIV_EXPR, &cint, &dconst3);
1658 real_convert (&c2, mode, &c2);
1659
1660 if (flag_unsafe_math_optimizations
1661 && cbrtfn
1662 && (!HONOR_NANS (mode) || tree_expr_nonnegative_p (arg0))
1663 && real_identical (&c2, &c)
1664 && !c2_is_int
1665 && optimize_function_for_speed_p (cfun)
1666 && powi_cost (n / 3) <= POWI_MAX_MULTS)
1667 {
1668 tree powi_x_ndiv3 = NULL_TREE;
1669
1670 /* Attempt to fold powi(arg0, abs(n/3)) into multiplies. If not
1671 possible or profitable, give up. Skip the degenerate case when
1672 abs(n) < 3, where the result is always 1. */
1673 if (absu_hwi (n) >= 3)
1674 {
1675 powi_x_ndiv3 = gimple_expand_builtin_powi (gsi, loc, arg0,
1676 abs_hwi (n / 3));
1677 if (!powi_x_ndiv3)
1678 return NULL_TREE;
1679 }
1680
1681 /* Calculate powi(cbrt(x), n%3). Don't use gimple_expand_builtin_powi
1682 as that creates an unnecessary variable. Instead, just produce
1683 either cbrt(x) or cbrt(x) * cbrt(x). */
1684 cbrt_x = build_and_insert_call (gsi, loc, cbrtfn, arg0);
1685
1686 if (absu_hwi (n) % 3 == 1)
1687 powi_cbrt_x = cbrt_x;
1688 else
1689 powi_cbrt_x = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
1690 cbrt_x, cbrt_x);
1691
1692 /* Multiply the two subexpressions, unless powi(x,abs(n)/3) = 1. */
1693 if (absu_hwi (n) < 3)
1694 result = powi_cbrt_x;
1695 else
1696 result = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
1697 powi_x_ndiv3, powi_cbrt_x);
1698
1699 /* If n is negative, reciprocate the result. */
1700 if (n < 0)
1701 result = build_and_insert_binop (gsi, loc, "powroot", RDIV_EXPR,
1702 build_real (type, dconst1), result);
1703
1704 return result;
1705 }
1706
1707 /* No optimizations succeeded. */
1708 return NULL_TREE;
1709 }
1710
1711 /* ARG is the argument to a cabs builtin call in GSI with location info
1712 LOC. Create a sequence of statements prior to GSI that calculates
1713 sqrt(R*R + I*I), where R and I are the real and imaginary components
1714 of ARG, respectively. Return an expression holding the result. */
1715
1716 static tree
1717 gimple_expand_builtin_cabs (gimple_stmt_iterator *gsi, location_t loc, tree arg)
1718 {
1719 tree real_part, imag_part, addend1, addend2, sum, result;
1720 tree type = TREE_TYPE (TREE_TYPE (arg));
1721 tree sqrtfn = mathfn_built_in (type, BUILT_IN_SQRT);
1722 machine_mode mode = TYPE_MODE (type);
1723
1724 if (!flag_unsafe_math_optimizations
1725 || !optimize_bb_for_speed_p (gimple_bb (gsi_stmt (*gsi)))
1726 || !sqrtfn
1727 || optab_handler (sqrt_optab, mode) == CODE_FOR_nothing)
1728 return NULL_TREE;
1729
1730 real_part = build_and_insert_ref (gsi, loc, type, "cabs",
1731 REALPART_EXPR, arg);
1732 addend1 = build_and_insert_binop (gsi, loc, "cabs", MULT_EXPR,
1733 real_part, real_part);
1734 imag_part = build_and_insert_ref (gsi, loc, type, "cabs",
1735 IMAGPART_EXPR, arg);
1736 addend2 = build_and_insert_binop (gsi, loc, "cabs", MULT_EXPR,
1737 imag_part, imag_part);
1738 sum = build_and_insert_binop (gsi, loc, "cabs", PLUS_EXPR, addend1, addend2);
1739 result = build_and_insert_call (gsi, loc, sqrtfn, sum);
1740
1741 return result;
1742 }
1743
1744 /* Go through all calls to sin, cos and cexpi and call execute_cse_sincos_1
1745 on the SSA_NAME argument of each of them. Also expand powi(x,n) into
1746 an optimal number of multiplies, when n is a constant. */
1747
1748 namespace {
1749
1750 const pass_data pass_data_cse_sincos =
1751 {
1752 GIMPLE_PASS, /* type */
1753 "sincos", /* name */
1754 OPTGROUP_NONE, /* optinfo_flags */
1755 TV_NONE, /* tv_id */
1756 PROP_ssa, /* properties_required */
1757 PROP_gimple_opt_math, /* properties_provided */
1758 0, /* properties_destroyed */
1759 0, /* todo_flags_start */
1760 TODO_update_ssa, /* todo_flags_finish */
1761 };
1762
1763 class pass_cse_sincos : public gimple_opt_pass
1764 {
1765 public:
1766 pass_cse_sincos (gcc::context *ctxt)
1767 : gimple_opt_pass (pass_data_cse_sincos, ctxt)
1768 {}
1769
1770 /* opt_pass methods: */
1771 virtual bool gate (function *)
1772 {
1773 /* We no longer require either sincos or cexp, since powi expansion
1774 piggybacks on this pass. */
1775 return optimize;
1776 }
1777
1778 virtual unsigned int execute (function *);
1779
1780 }; // class pass_cse_sincos
1781
1782 unsigned int
1783 pass_cse_sincos::execute (function *fun)
1784 {
1785 basic_block bb;
1786 bool cfg_changed = false;
1787
1788 calculate_dominance_info (CDI_DOMINATORS);
1789 memset (&sincos_stats, 0, sizeof (sincos_stats));
1790
1791 FOR_EACH_BB_FN (bb, fun)
1792 {
1793 gimple_stmt_iterator gsi;
1794 bool cleanup_eh = false;
1795
1796 for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi); gsi_next (&gsi))
1797 {
1798 gimple *stmt = gsi_stmt (gsi);
1799
1800 /* Only the last stmt in a bb could throw; there is no need to call
1801 gimple_purge_dead_eh_edges if we change something in the middle
1802 of a basic block. */
1803 cleanup_eh = false;
1804
1805 if (is_gimple_call (stmt)
1806 && gimple_call_lhs (stmt))
1807 {
1808 tree arg, arg0, arg1, result;
1809 HOST_WIDE_INT n;
1810 location_t loc;
1811
1812 switch (gimple_call_combined_fn (stmt))
1813 {
1814 CASE_CFN_COS:
1815 CASE_CFN_SIN:
1816 CASE_CFN_CEXPI:
1817 /* Make sure we have either sincos or cexp. */
1818 if (!targetm.libc_has_function (function_c99_math_complex)
1819 && !targetm.libc_has_function (function_sincos))
1820 break;
1821
1822 arg = gimple_call_arg (stmt, 0);
1823 if (TREE_CODE (arg) == SSA_NAME)
1824 cfg_changed |= execute_cse_sincos_1 (arg);
1825 break;
1826
1827 CASE_CFN_POW:
1828 arg0 = gimple_call_arg (stmt, 0);
1829 arg1 = gimple_call_arg (stmt, 1);
1830
1831 loc = gimple_location (stmt);
1832 result = gimple_expand_builtin_pow (&gsi, loc, arg0, arg1);
1833
1834 if (result)
1835 {
1836 tree lhs = gimple_get_lhs (stmt);
1837 gassign *new_stmt = gimple_build_assign (lhs, result);
1838 gimple_set_location (new_stmt, loc);
1839 unlink_stmt_vdef (stmt);
1840 gsi_replace (&gsi, new_stmt, true);
1841 cleanup_eh = true;
1842 if (gimple_vdef (stmt))
1843 release_ssa_name (gimple_vdef (stmt));
1844 }
1845 break;
1846
1847 CASE_CFN_POWI:
1848 arg0 = gimple_call_arg (stmt, 0);
1849 arg1 = gimple_call_arg (stmt, 1);
1850 loc = gimple_location (stmt);
1851
1852 if (real_minus_onep (arg0))
1853 {
1854 tree t0, t1, cond, one, minus_one;
1855 gassign *stmt;
1856
1857 t0 = TREE_TYPE (arg0);
1858 t1 = TREE_TYPE (arg1);
1859 one = build_real (t0, dconst1);
1860 minus_one = build_real (t0, dconstm1);
1861
1862 cond = make_temp_ssa_name (t1, NULL, "powi_cond");
1863 stmt = gimple_build_assign (cond, BIT_AND_EXPR,
1864 arg1, build_int_cst (t1, 1));
1865 gimple_set_location (stmt, loc);
1866 gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
1867
1868 result = make_temp_ssa_name (t0, NULL, "powi");
1869 stmt = gimple_build_assign (result, COND_EXPR, cond,
1870 minus_one, one);
1871 gimple_set_location (stmt, loc);
1872 gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
1873 }
1874 else
1875 {
1876 if (!tree_fits_shwi_p (arg1))
1877 break;
1878
1879 n = tree_to_shwi (arg1);
1880 result = gimple_expand_builtin_powi (&gsi, loc, arg0, n);
1881 }
1882
1883 if (result)
1884 {
1885 tree lhs = gimple_get_lhs (stmt);
1886 gassign *new_stmt = gimple_build_assign (lhs, result);
1887 gimple_set_location (new_stmt, loc);
1888 unlink_stmt_vdef (stmt);
1889 gsi_replace (&gsi, new_stmt, true);
1890 cleanup_eh = true;
1891 if (gimple_vdef (stmt))
1892 release_ssa_name (gimple_vdef (stmt));
1893 }
1894 break;
1895
1896 CASE_CFN_CABS:
1897 arg0 = gimple_call_arg (stmt, 0);
1898 loc = gimple_location (stmt);
1899 result = gimple_expand_builtin_cabs (&gsi, loc, arg0);
1900
1901 if (result)
1902 {
1903 tree lhs = gimple_get_lhs (stmt);
1904 gassign *new_stmt = gimple_build_assign (lhs, result);
1905 gimple_set_location (new_stmt, loc);
1906 unlink_stmt_vdef (stmt);
1907 gsi_replace (&gsi, new_stmt, true);
1908 cleanup_eh = true;
1909 if (gimple_vdef (stmt))
1910 release_ssa_name (gimple_vdef (stmt));
1911 }
1912 break;
1913
1914 default:;
1915 }
1916 }
1917 }
1918 if (cleanup_eh)
1919 cfg_changed |= gimple_purge_dead_eh_edges (bb);
1920 }
1921
1922 statistics_counter_event (fun, "sincos statements inserted",
1923 sincos_stats.inserted);
1924
1925 return cfg_changed ? TODO_cleanup_cfg : 0;
1926 }
1927
1928 } // anon namespace
1929
1930 gimple_opt_pass *
1931 make_pass_cse_sincos (gcc::context *ctxt)
1932 {
1933 return new pass_cse_sincos (ctxt);
1934 }
1935
1936 /* A symbolic number structure is used to detect byte permutation and selection
1937 patterns of a source. To achieve that, its field N contains an artificial
1938 number consisting of BITS_PER_MARKER sized markers tracking where each
1939 byte comes from in the source:
1940
1941 0 - target byte has the value 0
1942 FF - target byte has an unknown value (eg. due to sign extension)
1943 1..size - marker value is the byte index in the source (0 for lsb).
1944
1945 To detect permutations on memory sources (arrays and structures), a symbolic
1946 number is also associated with:
1947 - a base address BASE_ADDR and an OFFSET giving the address of the source;
1948 - a range which gives the difference between the highest and lowest accessed
1949 memory location to make such a symbolic number;
1950 - the address SRC of the source element of lowest address as a convenience
1951 to easily get BASE_ADDR + offset + lowest bytepos;
1952 - the number N_OPS of expressions bitwise ORed together, to represent the
1953 approximate cost of the computation.
1954
1955 Note 1: the range is different from size as size reflects the size of the
1956 type of the current expression. For instance, for an array char a[],
1957 (short) a[0] | (short) a[3] would have a size of 2 but a range of 4 while
1958 (short) a[0] | ((short) a[0] << 1) would still have a size of 2 but this
1959 time a range of 1.
1960
1961 Note 2: for non-memory sources, range holds the same value as size.
1962
1963 Note 3: SRC points to the SSA_NAME in case of non-memory source. */
1964
1965 struct symbolic_number {
1966 uint64_t n;
1967 tree type;
1968 tree base_addr;
1969 tree offset;
1970 HOST_WIDE_INT bytepos;
1971 tree src;
1972 tree alias_set;
1973 tree vuse;
1974 unsigned HOST_WIDE_INT range;
1975 int n_ops;
1976 };
1977
1978 #define BITS_PER_MARKER 8
1979 #define MARKER_MASK ((1 << BITS_PER_MARKER) - 1)
1980 #define MARKER_BYTE_UNKNOWN MARKER_MASK
1981 #define HEAD_MARKER(n, size) \
1982 ((n) & ((uint64_t) MARKER_MASK << (((size) - 1) * BITS_PER_MARKER)))
1983
1984 /* The number which the find_bswap_or_nop_1 result should match in
1985 order to have a nop. The number is masked according to the size of
1986 the symbolic number before using it. */
1987 #define CMPNOP (sizeof (int64_t) < 8 ? 0 : \
1988 (uint64_t)0x08070605 << 32 | 0x04030201)
1989
1990 /* The number which the find_bswap_or_nop_1 result should match in
1991 order to have a byte swap. The number is masked according to the
1992 size of the symbolic number before using it. */
1993 #define CMPXCHG (sizeof (int64_t) < 8 ? 0 : \
1994 (uint64_t)0x01020304 << 32 | 0x05060708)
1995
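/* Illustrative example of the marker encoding (editorial addition, not part
   of the GCC sources): with BITS_PER_MARKER == 8, the symbolic number for a
   plain 4-byte value is CMPNOP masked to 4 bytes, i.e. 0x04030201 -- source
   byte 0 ends up in the least significant target byte and source byte 3 in
   the most significant one.  If the statement chain instead reverses the
   bytes, the markers end up as 0x01020304, which is CMPXCHG shifted down to
   4 bytes, and the computation is recognized as a byte swap.  */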
1996 /* Perform a SHIFT or ROTATE operation by COUNT bits on symbolic
1997 number N. Return false if the requested operation is not permitted
1998 on a symbolic number. */
1999
2000 static inline bool
2001 do_shift_rotate (enum tree_code code,
2002 struct symbolic_number *n,
2003 int count)
2004 {
2005 int i, size = TYPE_PRECISION (n->type) / BITS_PER_UNIT;
2006 unsigned head_marker;
2007
2008 if (count % BITS_PER_UNIT != 0)
2009 return false;
2010 count = (count / BITS_PER_UNIT) * BITS_PER_MARKER;
2011
2012 /* Zero out the extra bits of N in order to avoid them being shifted
2013 into the significant bits. */
2014 if (size < 64 / BITS_PER_MARKER)
2015 n->n &= ((uint64_t) 1 << (size * BITS_PER_MARKER)) - 1;
2016
2017 switch (code)
2018 {
2019 case LSHIFT_EXPR:
2020 n->n <<= count;
2021 break;
2022 case RSHIFT_EXPR:
2023 head_marker = HEAD_MARKER (n->n, size);
2024 n->n >>= count;
2025 /* Arithmetic shift of signed type: result is dependent on the value. */
2026 if (!TYPE_UNSIGNED (n->type) && head_marker)
2027 for (i = 0; i < count / BITS_PER_MARKER; i++)
2028 n->n |= (uint64_t) MARKER_BYTE_UNKNOWN
2029 << ((size - 1 - i) * BITS_PER_MARKER);
2030 break;
2031 case LROTATE_EXPR:
2032 n->n = (n->n << count) | (n->n >> ((size * BITS_PER_MARKER) - count));
2033 break;
2034 case RROTATE_EXPR:
2035 n->n = (n->n >> count) | (n->n << ((size * BITS_PER_MARKER) - count));
2036 break;
2037 default:
2038 return false;
2039 }
2040 /* Zero unused bits for size. */
2041 if (size < 64 / BITS_PER_MARKER)
2042 n->n &= ((uint64_t) 1 << (size * BITS_PER_MARKER)) - 1;
2043 return true;
2044 }
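
/* Illustrative example (editorial addition, not part of the GCC sources):
   for a 4-byte symbolic number with markers 0x04030201, an LSHIFT_EXPR by 8
   bits yields markers 0x03020100 (the low byte is now known to be zero),
   while an RSHIFT_EXPR by 8 bits on a signed type whose head marker is set
   fills the vacated top byte with MARKER_BYTE_UNKNOWN, giving 0xff040302.  */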
2045
2046 /* Perform sanity checking for the symbolic number N and the gimple
2047 statement STMT. */
2048
2049 static inline bool
2050 verify_symbolic_number_p (struct symbolic_number *n, gimple *stmt)
2051 {
2052 tree lhs_type;
2053
2054 lhs_type = gimple_expr_type (stmt);
2055
2056 if (TREE_CODE (lhs_type) != INTEGER_TYPE)
2057 return false;
2058
2059 if (TYPE_PRECISION (lhs_type) != TYPE_PRECISION (n->type))
2060 return false;
2061
2062 return true;
2063 }
2064
2065 /* Initialize the symbolic number N for the bswap pass from the base element
2066 SRC manipulated by the bitwise OR expression. */
2067
2068 static bool
2069 init_symbolic_number (struct symbolic_number *n, tree src)
2070 {
2071 int size;
2072
2073 if (! INTEGRAL_TYPE_P (TREE_TYPE (src)))
2074 return false;
2075
2076 n->base_addr = n->offset = n->alias_set = n->vuse = NULL_TREE;
2077 n->src = src;
2078
2079 /* Set up the symbolic number N by setting each byte to a value between 1 and
2080 the byte size of rhs1. The highest order byte is set to that size and the
2081 lowest order byte to 1. */
2082 n->type = TREE_TYPE (src);
2083 size = TYPE_PRECISION (n->type);
2084 if (size % BITS_PER_UNIT != 0)
2085 return false;
2086 size /= BITS_PER_UNIT;
2087 if (size > 64 / BITS_PER_MARKER)
2088 return false;
2089 n->range = size;
2090 n->n = CMPNOP;
2091 n->n_ops = 1;
2092
2093 if (size < 64 / BITS_PER_MARKER)
2094 n->n &= ((uint64_t) 1 << (size * BITS_PER_MARKER)) - 1;
2095
2096 return true;
2097 }
2098
2099 /* Check whether STMT might be a byte swap or a nop from a memory source and
2100 return the answer. If so, REF is that memory source and the base of the memory
2101 area accessed and the offset of the access from that base are recorded in N. */
2102
2103 bool
2104 find_bswap_or_nop_load (gimple *stmt, tree ref, struct symbolic_number *n)
2105 {
2106 /* Leaf node is an array or component ref. Memorize its base and
2107 offset from base to compare to other such leaf nodes.
2108 HOST_WIDE_INT bitsize, bitpos;
2109 machine_mode mode;
2110 int unsignedp, reversep, volatilep;
2111 tree offset, base_addr;
2112
2113 /* Not prepared to handle PDP endian. */
2114 if (BYTES_BIG_ENDIAN != WORDS_BIG_ENDIAN)
2115 return false;
2116
2117 if (!gimple_assign_load_p (stmt) || gimple_has_volatile_ops (stmt))
2118 return false;
2119
2120 base_addr = get_inner_reference (ref, &bitsize, &bitpos, &offset, &mode,
2121 &unsignedp, &reversep, &volatilep);
2122
2123 if (TREE_CODE (base_addr) == MEM_REF)
2124 {
2125 offset_int bit_offset = 0;
2126 tree off = TREE_OPERAND (base_addr, 1);
2127
2128 if (!integer_zerop (off))
2129 {
2130 offset_int boff, coff = mem_ref_offset (base_addr);
2131 boff = coff << LOG2_BITS_PER_UNIT;
2132 bit_offset += boff;
2133 }
2134
2135 base_addr = TREE_OPERAND (base_addr, 0);
2136
2137 /* Avoid returning a negative bitpos as this may wreak havoc later. */
2138 if (wi::neg_p (bit_offset))
2139 {
2140 offset_int mask = wi::mask <offset_int> (LOG2_BITS_PER_UNIT, false);
2141 offset_int tem = bit_offset.and_not (mask);
2142 /* TEM is the bitpos rounded to BITS_PER_UNIT towards -Inf.
2143 Subtract it from BIT_OFFSET and add it (scaled) to OFFSET. */
2144 bit_offset -= tem;
2145 tem >>= LOG2_BITS_PER_UNIT;
2146 if (offset)
2147 offset = size_binop (PLUS_EXPR, offset,
2148 wide_int_to_tree (sizetype, tem));
2149 else
2150 offset = wide_int_to_tree (sizetype, tem);
2151 }
2152
2153 bitpos += bit_offset.to_shwi ();
2154 }
2155
2156 if (bitpos % BITS_PER_UNIT)
2157 return false;
2158 if (bitsize % BITS_PER_UNIT)
2159 return false;
2160 if (reversep)
2161 return false;
2162
2163 if (!init_symbolic_number (n, ref))
2164 return false;
2165 n->base_addr = base_addr;
2166 n->offset = offset;
2167 n->bytepos = bitpos / BITS_PER_UNIT;
2168 n->alias_set = reference_alias_ptr_type (ref);
2169 n->vuse = gimple_vuse (stmt);
2170 return true;
2171 }
2172
2173 /* Compute the symbolic number N representing the result of a bitwise OR on the
2174 two symbolic numbers N1 and N2 whose source statements are respectively
2175 SOURCE_STMT1 and SOURCE_STMT2. */
2176
2177 static gimple *
2178 perform_symbolic_merge (gimple *source_stmt1, struct symbolic_number *n1,
2179 gimple *source_stmt2, struct symbolic_number *n2,
2180 struct symbolic_number *n)
2181 {
2182 int i, size;
2183 uint64_t mask;
2184 gimple *source_stmt;
2185 struct symbolic_number *n_start;
2186
2187 tree rhs1 = gimple_assign_rhs1 (source_stmt1);
2188 if (TREE_CODE (rhs1) == BIT_FIELD_REF
2189 && TREE_CODE (TREE_OPERAND (rhs1, 0)) == SSA_NAME)
2190 rhs1 = TREE_OPERAND (rhs1, 0);
2191 tree rhs2 = gimple_assign_rhs1 (source_stmt2);
2192 if (TREE_CODE (rhs2) == BIT_FIELD_REF
2193 && TREE_CODE (TREE_OPERAND (rhs2, 0)) == SSA_NAME)
2194 rhs2 = TREE_OPERAND (rhs2, 0);
2195
2196 /* The sources are different; cancel the bswap if they are not memory
2197 locations with the same base (array, structure, ...). */
2198 if (rhs1 != rhs2)
2199 {
2200 uint64_t inc;
2201 HOST_WIDE_INT start_sub, end_sub, end1, end2, end;
2202 struct symbolic_number *toinc_n_ptr, *n_end;
2203 basic_block bb1, bb2;
2204
2205 if (!n1->base_addr || !n2->base_addr
2206 || !operand_equal_p (n1->base_addr, n2->base_addr, 0))
2207 return NULL;
2208
2209 if (!n1->offset != !n2->offset
2210 || (n1->offset && !operand_equal_p (n1->offset, n2->offset, 0)))
2211 return NULL;
2212
2213 if (n1->bytepos < n2->bytepos)
2214 {
2215 n_start = n1;
2216 start_sub = n2->bytepos - n1->bytepos;
2217 }
2218 else
2219 {
2220 n_start = n2;
2221 start_sub = n1->bytepos - n2->bytepos;
2222 }
2223
2224 bb1 = gimple_bb (source_stmt1);
2225 bb2 = gimple_bb (source_stmt2);
2226 if (dominated_by_p (CDI_DOMINATORS, bb1, bb2))
2227 source_stmt = source_stmt1;
2228 else
2229 source_stmt = source_stmt2;
2230
2231 /* Find the highest address at which a load is performed and
2232 compute related info. */
2233 end1 = n1->bytepos + (n1->range - 1);
2234 end2 = n2->bytepos + (n2->range - 1);
2235 if (end1 < end2)
2236 {
2237 end = end2;
2238 end_sub = end2 - end1;
2239 }
2240 else
2241 {
2242 end = end1;
2243 end_sub = end1 - end2;
2244 }
2245 n_end = (end2 > end1) ? n2 : n1;
2246
2247 /* Find symbolic number whose lsb is the most significant. */
2248 if (BYTES_BIG_ENDIAN)
2249 toinc_n_ptr = (n_end == n1) ? n2 : n1;
2250 else
2251 toinc_n_ptr = (n_start == n1) ? n2 : n1;
2252
2253 n->range = end - n_start->bytepos + 1;
2254
2255 /* Check that the range of memory covered can be represented by
2256 a symbolic number. */
2257 if (n->range > 64 / BITS_PER_MARKER)
2258 return NULL;
2259
2260 /* Reinterpret the byte marks in the symbolic number holding the value
2261 of greater weight, according to the target endianness. */
2262 inc = BYTES_BIG_ENDIAN ? end_sub : start_sub;
2263 size = TYPE_PRECISION (n1->type) / BITS_PER_UNIT;
2264 for (i = 0; i < size; i++, inc <<= BITS_PER_MARKER)
2265 {
2266 unsigned marker
2267 = (toinc_n_ptr->n >> (i * BITS_PER_MARKER)) & MARKER_MASK;
2268 if (marker && marker != MARKER_BYTE_UNKNOWN)
2269 toinc_n_ptr->n += inc;
2270 }
2271 }
2272 else
2273 {
2274 n->range = n1->range;
2275 n_start = n1;
2276 source_stmt = source_stmt1;
2277 }
2278
2279 if (!n1->alias_set
2280 || alias_ptr_types_compatible_p (n1->alias_set, n2->alias_set))
2281 n->alias_set = n1->alias_set;
2282 else
2283 n->alias_set = ptr_type_node;
2284 n->vuse = n_start->vuse;
2285 n->base_addr = n_start->base_addr;
2286 n->offset = n_start->offset;
2287 n->src = n_start->src;
2288 n->bytepos = n_start->bytepos;
2289 n->type = n_start->type;
2290 size = TYPE_PRECISION (n->type) / BITS_PER_UNIT;
2291
2292 for (i = 0, mask = MARKER_MASK; i < size; i++, mask <<= BITS_PER_MARKER)
2293 {
2294 uint64_t masked1, masked2;
2295
2296 masked1 = n1->n & mask;
2297 masked2 = n2->n & mask;
2298 if (masked1 && masked2 && masked1 != masked2)
2299 return NULL;
2300 }
2301 n->n = n1->n | n2->n;
2302 n->n_ops = n1->n_ops + n2->n_ops;
2303
2304 return source_stmt;
2305 }
2306
2307 /* find_bswap_or_nop_1 invokes itself recursively with N and tries to perform
2308 the operation given by the rhs of STMT on the result. If the operation
2309 can be executed successfully, the function returns a gimple stmt whose
2310 rhs's first tree is the expression of the source operand; otherwise it
2311 returns NULL. */
2312
2313 static gimple *
2314 find_bswap_or_nop_1 (gimple *stmt, struct symbolic_number *n, int limit)
2315 {
2316 enum tree_code code;
2317 tree rhs1, rhs2 = NULL;
2318 gimple *rhs1_stmt, *rhs2_stmt, *source_stmt1;
2319 enum gimple_rhs_class rhs_class;
2320
2321 if (!limit || !is_gimple_assign (stmt))
2322 return NULL;
2323
2324 rhs1 = gimple_assign_rhs1 (stmt);
2325
2326 if (find_bswap_or_nop_load (stmt, rhs1, n))
2327 return stmt;
2328
2329 /* Handle BIT_FIELD_REF. */
2330 if (TREE_CODE (rhs1) == BIT_FIELD_REF
2331 && TREE_CODE (TREE_OPERAND (rhs1, 0)) == SSA_NAME)
2332 {
2333 unsigned HOST_WIDE_INT bitsize = tree_to_uhwi (TREE_OPERAND (rhs1, 1));
2334 unsigned HOST_WIDE_INT bitpos = tree_to_uhwi (TREE_OPERAND (rhs1, 2));
2335 if (bitpos % BITS_PER_UNIT == 0
2336 && bitsize % BITS_PER_UNIT == 0
2337 && init_symbolic_number (n, TREE_OPERAND (rhs1, 0)))
2338 {
2339 /* Handle big-endian bit numbering in BIT_FIELD_REF. */
2340 if (BYTES_BIG_ENDIAN)
2341 bitpos = TYPE_PRECISION (n->type) - bitpos - bitsize;
2342
2343 /* Shift. */
2344 if (!do_shift_rotate (RSHIFT_EXPR, n, bitpos))
2345 return NULL;
2346
2347 /* Mask. */
2348 uint64_t mask = 0;
2349 uint64_t tmp = (1 << BITS_PER_UNIT) - 1;
2350 for (unsigned i = 0; i < bitsize / BITS_PER_UNIT;
2351 i++, tmp <<= BITS_PER_UNIT)
2352 mask |= (uint64_t) MARKER_MASK << (i * BITS_PER_MARKER);
2353 n->n &= mask;
2354
2355 /* Convert. */
2356 n->type = TREE_TYPE (rhs1);
2357 if (!n->base_addr)
2358 n->range = TYPE_PRECISION (n->type) / BITS_PER_UNIT;
2359
2360 return verify_symbolic_number_p (n, stmt) ? stmt : NULL;
2361 }
2362
2363 return NULL;
2364 }
2365
2366 if (TREE_CODE (rhs1) != SSA_NAME)
2367 return NULL;
2368
2369 code = gimple_assign_rhs_code (stmt);
2370 rhs_class = gimple_assign_rhs_class (stmt);
2371 rhs1_stmt = SSA_NAME_DEF_STMT (rhs1);
2372
2373 if (rhs_class == GIMPLE_BINARY_RHS)
2374 rhs2 = gimple_assign_rhs2 (stmt);
2375
2376 /* Handle unary rhs and binary rhs with integer constants as second
2377 operand. */
2378
2379 if (rhs_class == GIMPLE_UNARY_RHS
2380 || (rhs_class == GIMPLE_BINARY_RHS
2381 && TREE_CODE (rhs2) == INTEGER_CST))
2382 {
2383 if (code != BIT_AND_EXPR
2384 && code != LSHIFT_EXPR
2385 && code != RSHIFT_EXPR
2386 && code != LROTATE_EXPR
2387 && code != RROTATE_EXPR
2388 && !CONVERT_EXPR_CODE_P (code))
2389 return NULL;
2390
2391 source_stmt1 = find_bswap_or_nop_1 (rhs1_stmt, n, limit - 1);
2392
2393 /* If find_bswap_or_nop_1 returned NULL, STMT is a leaf node and
2394 we have to initialize the symbolic number. */
2395 if (!source_stmt1)
2396 {
2397 if (gimple_assign_load_p (stmt)
2398 || !init_symbolic_number (n, rhs1))
2399 return NULL;
2400 source_stmt1 = stmt;
2401 }
2402
2403 switch (code)
2404 {
2405 case BIT_AND_EXPR:
2406 {
2407 int i, size = TYPE_PRECISION (n->type) / BITS_PER_UNIT;
2408 uint64_t val = int_cst_value (rhs2), mask = 0;
2409 uint64_t tmp = (1 << BITS_PER_UNIT) - 1;
2410
2411 /* Only constants masking full bytes are allowed. */
2412 for (i = 0; i < size; i++, tmp <<= BITS_PER_UNIT)
2413 if ((val & tmp) != 0 && (val & tmp) != tmp)
2414 return NULL;
2415 else if (val & tmp)
2416 mask |= (uint64_t) MARKER_MASK << (i * BITS_PER_MARKER);
2417
2418 n->n &= mask;
2419 }
2420 break;
2421 case LSHIFT_EXPR:
2422 case RSHIFT_EXPR:
2423 case LROTATE_EXPR:
2424 case RROTATE_EXPR:
2425 if (!do_shift_rotate (code, n, (int) TREE_INT_CST_LOW (rhs2)))
2426 return NULL;
2427 break;
2428 CASE_CONVERT:
2429 {
2430 int i, type_size, old_type_size;
2431 tree type;
2432
2433 type = gimple_expr_type (stmt);
2434 type_size = TYPE_PRECISION (type);
2435 if (type_size % BITS_PER_UNIT != 0)
2436 return NULL;
2437 type_size /= BITS_PER_UNIT;
2438 if (type_size > 64 / BITS_PER_MARKER)
2439 return NULL;
2440
2441 /* Sign extension: result is dependent on the value. */
2442 old_type_size = TYPE_PRECISION (n->type) / BITS_PER_UNIT;
2443 if (!TYPE_UNSIGNED (n->type) && type_size > old_type_size
2444 && HEAD_MARKER (n->n, old_type_size))
2445 for (i = 0; i < type_size - old_type_size; i++)
2446 n->n |= (uint64_t) MARKER_BYTE_UNKNOWN
2447 << ((type_size - 1 - i) * BITS_PER_MARKER);
2448
2449 if (type_size < 64 / BITS_PER_MARKER)
2450 {
2451 /* If STMT casts to a smaller type, mask out the bits not
2452 belonging to the target type. */
2453 n->n &= ((uint64_t) 1 << (type_size * BITS_PER_MARKER)) - 1;
2454 }
2455 n->type = type;
2456 if (!n->base_addr)
2457 n->range = type_size;
2458 }
2459 break;
2460 default:
2461 return NULL;
2462 };
2463 return verify_symbolic_number_p (n, stmt) ? source_stmt1 : NULL;
2464 }
2465
2466 /* Handle binary rhs. */
2467
2468 if (rhs_class == GIMPLE_BINARY_RHS)
2469 {
2470 struct symbolic_number n1, n2;
2471 gimple *source_stmt, *source_stmt2;
2472
2473 if (code != BIT_IOR_EXPR)
2474 return NULL;
2475
2476 if (TREE_CODE (rhs2) != SSA_NAME)
2477 return NULL;
2478
2479 rhs2_stmt = SSA_NAME_DEF_STMT (rhs2);
2480
2481 switch (code)
2482 {
2483 case BIT_IOR_EXPR:
2484 source_stmt1 = find_bswap_or_nop_1 (rhs1_stmt, &n1, limit - 1);
2485
2486 if (!source_stmt1)
2487 return NULL;
2488
2489 source_stmt2 = find_bswap_or_nop_1 (rhs2_stmt, &n2, limit - 1);
2490
2491 if (!source_stmt2)
2492 return NULL;
2493
2494 if (TYPE_PRECISION (n1.type) != TYPE_PRECISION (n2.type))
2495 return NULL;
2496
2497 if (!n1.vuse != !n2.vuse
2498 || (n1.vuse && !operand_equal_p (n1.vuse, n2.vuse, 0)))
2499 return NULL;
2500
2501 source_stmt
2502 = perform_symbolic_merge (source_stmt1, &n1, source_stmt2, &n2, n);
2503
2504 if (!source_stmt)
2505 return NULL;
2506
2507 if (!verify_symbolic_number_p (n, stmt))
2508 return NULL;
2509
2510 break;
2511 default:
2512 return NULL;
2513 }
2514 return source_stmt;
2515 }
2516 return NULL;
2517 }
2518
2519 /* Check whether STMT completes a bswap implementation or a read in a given
2520 endianness consisting of ORs, SHIFTs and ANDs, and set *BSWAP
2521 accordingly. It also sets N to represent the kind of operations
2522 performed: the size of the resulting expression and whether it works on
2523 a memory source, and if so its alias set and vuse. Finally, the
2524 function returns a stmt whose rhs's first tree is the source
2525 expression. */
2526
2527 static gimple *
2528 find_bswap_or_nop (gimple *stmt, struct symbolic_number *n, bool *bswap)
2529 {
2530 unsigned rsize;
2531 uint64_t tmpn, mask;
2532 /* The number which the find_bswap_or_nop_1 result should match in order
2533 to have a full byte swap. The number is shifted to the right
2534 according to the size of the symbolic number before using it. */
2535 uint64_t cmpxchg = CMPXCHG;
2536 uint64_t cmpnop = CMPNOP;
2537
2538 gimple *ins_stmt;
2539 int limit;
2540
2541 /* The last parameter determines the search depth limit. It usually
2542 correlates directly to the number n of bytes to be touched. We
2543 increase that number by log2(n) + 1 here in order to also
2544 cover signed -> unsigned conversions of the src operand as can be seen
2545 in libgcc, and for the initial shift/and operation of the src operand. */
2546 limit = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (gimple_expr_type (stmt)));
2547 limit += 1 + (int) ceil_log2 ((unsigned HOST_WIDE_INT) limit);
2548 ins_stmt = find_bswap_or_nop_1 (stmt, n, limit);
2549
2550 if (!ins_stmt)
2551 return NULL;
2552
2553 /* Find real size of result (highest non-zero byte). */
2554 if (n->base_addr)
2555 for (tmpn = n->n, rsize = 0; tmpn; tmpn >>= BITS_PER_MARKER, rsize++);
2556 else
2557 rsize = n->range;
2558
2559 /* Zero out the bits corresponding to untouched bytes in the original gimple
2560 expression. */
2561 if (n->range < (int) sizeof (int64_t))
2562 {
2563 mask = ((uint64_t) 1 << (n->range * BITS_PER_MARKER)) - 1;
2564 cmpxchg >>= (64 / BITS_PER_MARKER - n->range) * BITS_PER_MARKER;
2565 cmpnop &= mask;
2566 }
2567
2568 /* Zero out the bits corresponding to unused bytes in the result of the
2569 gimple expression. */
2570 if (rsize < n->range)
2571 {
2572 if (BYTES_BIG_ENDIAN)
2573 {
2574 mask = ((uint64_t) 1 << (rsize * BITS_PER_MARKER)) - 1;
2575 cmpxchg &= mask;
2576 cmpnop >>= (n->range - rsize) * BITS_PER_MARKER;
2577 }
2578 else
2579 {
2580 mask = ((uint64_t) 1 << (rsize * BITS_PER_MARKER)) - 1;
2581 cmpxchg >>= (n->range - rsize) * BITS_PER_MARKER;
2582 cmpnop &= mask;
2583 }
2584 n->range = rsize;
2585 }
2586
2587 /* A complete byte swap should make the symbolic number start with
2588 the largest digit in the highest order byte. An unchanged symbolic
2589 number indicates a read with the same endianness as the target architecture. */
2590 if (n->n == cmpnop)
2591 *bswap = false;
2592 else if (n->n == cmpxchg)
2593 *bswap = true;
2594 else
2595 return NULL;
2596
2597 /* Useless bit manipulation performed by code. */
2598 if (!n->base_addr && n->n == cmpnop && n->n_ops == 1)
2599 return NULL;
2600
2601 n->range *= BITS_PER_UNIT;
2602 return ins_stmt;
2603 }
2604
2605 namespace {
2606
2607 const pass_data pass_data_optimize_bswap =
2608 {
2609 GIMPLE_PASS, /* type */
2610 "bswap", /* name */
2611 OPTGROUP_NONE, /* optinfo_flags */
2612 TV_NONE, /* tv_id */
2613 PROP_ssa, /* properties_required */
2614 0, /* properties_provided */
2615 0, /* properties_destroyed */
2616 0, /* todo_flags_start */
2617 0, /* todo_flags_finish */
2618 };
2619
2620 class pass_optimize_bswap : public gimple_opt_pass
2621 {
2622 public:
2623 pass_optimize_bswap (gcc::context *ctxt)
2624 : gimple_opt_pass (pass_data_optimize_bswap, ctxt)
2625 {}
2626
2627 /* opt_pass methods: */
2628 virtual bool gate (function *)
2629 {
2630 return flag_expensive_optimizations && optimize;
2631 }
2632
2633 virtual unsigned int execute (function *);
2634
2635 }; // class pass_optimize_bswap
2636
2637 /* Perform the bswap optimization: replace the expression computed in the rhs
2638 of CUR_STMT by an equivalent bswap, load or load + bswap expression.
2639 Which of these alternatives replaces the rhs is given by N->base_addr (non
2640 null if a load is needed) and BSWAP. The type, VUSE and alias set of the
2641 load to perform are also given in N, while the builtin bswap to invoke is
2642 given in FNDECL. Finally, if a load is involved, INS_STMT refers to one of
2643 the load statements involved to construct the rhs in CUR_STMT and N->range
2644 gives the size of the rhs expression for maintaining some statistics.
2645
2646 Note that if the replacement involves a load, CUR_STMT is moved just after
2647 INS_STMT to do the load with the same VUSE, which can lead to CUR_STMT
2648 changing basic block. */
2649
2650 static bool
2651 bswap_replace (gimple *cur_stmt, gimple *ins_stmt, tree fndecl,
2652 tree bswap_type, tree load_type, struct symbolic_number *n,
2653 bool bswap)
2654 {
2655 gimple_stmt_iterator gsi;
2656 tree src, tmp, tgt;
2657 gimple *bswap_stmt;
2658
2659 gsi = gsi_for_stmt (cur_stmt);
2660 src = n->src;
2661 tgt = gimple_assign_lhs (cur_stmt);
2662
2663 /* Need to load the value from memory first. */
2664 if (n->base_addr)
2665 {
2666 gimple_stmt_iterator gsi_ins = gsi_for_stmt (ins_stmt);
2667 tree addr_expr, addr_tmp, val_expr, val_tmp;
2668 tree load_offset_ptr, aligned_load_type;
2669 gimple *addr_stmt, *load_stmt;
2670 unsigned align;
2671 HOST_WIDE_INT load_offset = 0;
2672 basic_block ins_bb, cur_bb;
2673
2674 ins_bb = gimple_bb (ins_stmt);
2675 cur_bb = gimple_bb (cur_stmt);
2676 if (!dominated_by_p (CDI_DOMINATORS, cur_bb, ins_bb))
2677 return false;
2678
2679 align = get_object_alignment (src);
2680
2681 /* Move cur_stmt just before one of the loads of the original
2682 to ensure it has the same VUSE. See PR61517 for what could
2683 go wrong. */
2684 if (gimple_bb (cur_stmt) != gimple_bb (ins_stmt))
2685 reset_flow_sensitive_info (gimple_assign_lhs (cur_stmt));
2686 gsi_move_before (&gsi, &gsi_ins);
2687 gsi = gsi_for_stmt (cur_stmt);
2688
2689 /* Compute address to load from and cast according to the size
2690 of the load. */
2691 addr_expr = build_fold_addr_expr (unshare_expr (src));
2692 if (is_gimple_mem_ref_addr (addr_expr))
2693 addr_tmp = addr_expr;
2694 else
2695 {
2696 addr_tmp = make_temp_ssa_name (TREE_TYPE (addr_expr), NULL,
2697 "load_src");
2698 addr_stmt = gimple_build_assign (addr_tmp, addr_expr);
2699 gsi_insert_before (&gsi, addr_stmt, GSI_SAME_STMT);
2700 }
2701
2702 /* Perform the load. */
2703 aligned_load_type = load_type;
2704 if (align < TYPE_ALIGN (load_type))
2705 aligned_load_type = build_aligned_type (load_type, align);
2706 load_offset_ptr = build_int_cst (n->alias_set, load_offset);
2707 val_expr = fold_build2 (MEM_REF, aligned_load_type, addr_tmp,
2708 load_offset_ptr);
2709
2710 if (!bswap)
2711 {
2712 if (n->range == 16)
2713 nop_stats.found_16bit++;
2714 else if (n->range == 32)
2715 nop_stats.found_32bit++;
2716 else
2717 {
2718 gcc_assert (n->range == 64);
2719 nop_stats.found_64bit++;
2720 }
2721
2722 /* Convert the result of the load if necessary. */
2723 if (!useless_type_conversion_p (TREE_TYPE (tgt), load_type))
2724 {
2725 val_tmp = make_temp_ssa_name (aligned_load_type, NULL,
2726 "load_dst");
2727 load_stmt = gimple_build_assign (val_tmp, val_expr);
2728 gimple_set_vuse (load_stmt, n->vuse);
2729 gsi_insert_before (&gsi, load_stmt, GSI_SAME_STMT);
2730 gimple_assign_set_rhs_with_ops (&gsi, NOP_EXPR, val_tmp);
2731 }
2732 else
2733 {
2734 gimple_assign_set_rhs_with_ops (&gsi, MEM_REF, val_expr);
2735 gimple_set_vuse (cur_stmt, n->vuse);
2736 }
2737 update_stmt (cur_stmt);
2738
2739 if (dump_file)
2740 {
2741 fprintf (dump_file,
2742 "%d bit load in target endianness found at: ",
2743 (int) n->range);
2744 print_gimple_stmt (dump_file, cur_stmt, 0);
2745 }
2746 return true;
2747 }
2748 else
2749 {
2750 val_tmp = make_temp_ssa_name (aligned_load_type, NULL, "load_dst");
2751 load_stmt = gimple_build_assign (val_tmp, val_expr);
2752 gimple_set_vuse (load_stmt, n->vuse);
2753 gsi_insert_before (&gsi, load_stmt, GSI_SAME_STMT);
2754 }
2755 src = val_tmp;
2756 }
2757 else if (!bswap)
2758 {
2759 gimple *g;
2760 if (!useless_type_conversion_p (TREE_TYPE (tgt), TREE_TYPE (src)))
2761 {
2762 if (!is_gimple_val (src))
2763 return false;
2764 g = gimple_build_assign (tgt, NOP_EXPR, src);
2765 }
2766 else
2767 g = gimple_build_assign (tgt, src);
2768 if (n->range == 16)
2769 nop_stats.found_16bit++;
2770 else if (n->range == 32)
2771 nop_stats.found_32bit++;
2772 else
2773 {
2774 gcc_assert (n->range == 64);
2775 nop_stats.found_64bit++;
2776 }
2777 if (dump_file)
2778 {
2779 fprintf (dump_file,
2780 "%d bit reshuffle in target endianness found at: ",
2781 (int) n->range);
2782 print_gimple_stmt (dump_file, cur_stmt, 0);
2783 }
2784 gsi_replace (&gsi, g, true);
2785 return true;
2786 }
2787 else if (TREE_CODE (src) == BIT_FIELD_REF)
2788 src = TREE_OPERAND (src, 0);
2789
2790 if (n->range == 16)
2791 bswap_stats.found_16bit++;
2792 else if (n->range == 32)
2793 bswap_stats.found_32bit++;
2794 else
2795 {
2796 gcc_assert (n->range == 64);
2797 bswap_stats.found_64bit++;
2798 }
2799
2800 tmp = src;
2801
2802 /* Convert the src expression if necessary. */
2803 if (!useless_type_conversion_p (TREE_TYPE (tmp), bswap_type))
2804 {
2805 gimple *convert_stmt;
2806
2807 tmp = make_temp_ssa_name (bswap_type, NULL, "bswapsrc");
2808 convert_stmt = gimple_build_assign (tmp, NOP_EXPR, src);
2809 gsi_insert_before (&gsi, convert_stmt, GSI_SAME_STMT);
2810 }
2811
2812 /* The canonical form for a 16-bit bswap is a rotate expression. Only 16-bit
2813 values are handled this way, as rotating a 2N-bit value by N bits is generally
2814 not equivalent to a bswap. Consider for instance 0x01020304 r>> 16, which
2815 gives 0x03040102, while a bswap of that value is 0x04030201. */
2816 if (bswap && n->range == 16)
2817 {
2818 tree count = build_int_cst (NULL, BITS_PER_UNIT);
2819 src = fold_build2 (LROTATE_EXPR, bswap_type, tmp, count);
2820 bswap_stmt = gimple_build_assign (NULL, src);
2821 }
2822 else
2823 bswap_stmt = gimple_build_call (fndecl, 1, tmp);
2824
2825 tmp = tgt;
2826
2827 /* Convert the result if necessary. */
2828 if (!useless_type_conversion_p (TREE_TYPE (tgt), bswap_type))
2829 {
2830 gimple *convert_stmt;
2831
2832 tmp = make_temp_ssa_name (bswap_type, NULL, "bswapdst");
2833 convert_stmt = gimple_build_assign (tgt, NOP_EXPR, tmp);
2834 gsi_insert_after (&gsi, convert_stmt, GSI_SAME_STMT);
2835 }
2836
2837 gimple_set_lhs (bswap_stmt, tmp);
2838
2839 if (dump_file)
2840 {
2841 fprintf (dump_file, "%d bit bswap implementation found at: ",
2842 (int) n->range);
2843 print_gimple_stmt (dump_file, cur_stmt, 0);
2844 }
2845
2846 gsi_insert_after (&gsi, bswap_stmt, GSI_SAME_STMT);
2847 gsi_remove (&gsi, true);
2848 return true;
2849 }
2850
2851 /* Find manual byte swap implementations as well as loads in a given
2852 endianness. Byte swaps are turned into a bswap builtin invocation,
2853 while endian loads are converted to a bswap builtin invocation or a
2854 simple load according to the target endianness. */
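
/* Illustrative sketch of the kind of source pattern this pass targets
   (editorial addition, not part of the GCC sources):

     #include <stdint.h>
     uint32_t swap32 (uint32_t x)
     {
       return (x >> 24)
              | ((x >> 8) & 0x0000ff00)
              | ((x << 8) & 0x00ff0000)
              | (x << 24);
     }

   is recognized by find_bswap_or_nop and replaced with a single
   __builtin_bswap32 call, while an OR of byte loads that merely reassembles
   a value in the target's own byte order is turned into one plain load.  */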
2855
2856 unsigned int
2857 pass_optimize_bswap::execute (function *fun)
2858 {
2859 basic_block bb;
2860 bool bswap32_p, bswap64_p;
2861 bool changed = false;
2862 tree bswap32_type = NULL_TREE, bswap64_type = NULL_TREE;
2863
2864 if (BITS_PER_UNIT != 8)
2865 return 0;
2866
2867 bswap32_p = (builtin_decl_explicit_p (BUILT_IN_BSWAP32)
2868 && optab_handler (bswap_optab, SImode) != CODE_FOR_nothing);
2869 bswap64_p = (builtin_decl_explicit_p (BUILT_IN_BSWAP64)
2870 && (optab_handler (bswap_optab, DImode) != CODE_FOR_nothing
2871 || (bswap32_p && word_mode == SImode)));
2872
2873 /* Determine the argument type of the builtins. The code later on
2874 assumes that the return and argument type are the same. */
2875 if (bswap32_p)
2876 {
2877 tree fndecl = builtin_decl_explicit (BUILT_IN_BSWAP32);
2878 bswap32_type = TREE_VALUE (TYPE_ARG_TYPES (TREE_TYPE (fndecl)));
2879 }
2880
2881 if (bswap64_p)
2882 {
2883 tree fndecl = builtin_decl_explicit (BUILT_IN_BSWAP64);
2884 bswap64_type = TREE_VALUE (TYPE_ARG_TYPES (TREE_TYPE (fndecl)));
2885 }
2886
2887 memset (&nop_stats, 0, sizeof (nop_stats));
2888 memset (&bswap_stats, 0, sizeof (bswap_stats));
2889 calculate_dominance_info (CDI_DOMINATORS);
2890
2891 FOR_EACH_BB_FN (bb, fun)
2892 {
2893 gimple_stmt_iterator gsi;
2894
2895 /* We do a reverse scan for bswap patterns to make sure we get the
2896 widest match. As bswap pattern matching doesn't handle previously
2897 inserted smaller bswap replacements as sub-patterns, the wider
2898 variant wouldn't be detected. */
2899 for (gsi = gsi_last_bb (bb); !gsi_end_p (gsi);)
2900 {
2901 gimple *ins_stmt, *cur_stmt = gsi_stmt (gsi);
2902 tree fndecl = NULL_TREE, bswap_type = NULL_TREE, load_type;
2903 enum tree_code code;
2904 struct symbolic_number n;
2905 bool bswap;
2906
2907 /* This gsi_prev (&gsi) is not part of the for loop because cur_stmt
2908 might be moved to a different basic block by bswap_replace and gsi
2909 must not point to it if that's the case. Moving the gsi_prev
2910 there makes sure that gsi points to the statement previous to
2911 cur_stmt while still making sure that all statements are
2912 considered in this basic block. */
2913 gsi_prev (&gsi);
2914
2915 if (!is_gimple_assign (cur_stmt))
2916 continue;
2917
2918 code = gimple_assign_rhs_code (cur_stmt);
2919 switch (code)
2920 {
2921 case LROTATE_EXPR:
2922 case RROTATE_EXPR:
2923 if (!tree_fits_uhwi_p (gimple_assign_rhs2 (cur_stmt))
2924 || tree_to_uhwi (gimple_assign_rhs2 (cur_stmt))
2925 % BITS_PER_UNIT)
2926 continue;
2927 /* Fall through. */
2928 case BIT_IOR_EXPR:
2929 break;
2930 default:
2931 continue;
2932 }
2933
2934 ins_stmt = find_bswap_or_nop (cur_stmt, &n, &bswap);
2935
2936 if (!ins_stmt)
2937 continue;
2938
2939 switch (n.range)
2940 {
2941 case 16:
2942 /* Already in canonical form, nothing to do. */
2943 if (code == LROTATE_EXPR || code == RROTATE_EXPR)
2944 continue;
2945 load_type = bswap_type = uint16_type_node;
2946 break;
2947 case 32:
2948 load_type = uint32_type_node;
2949 if (bswap32_p)
2950 {
2951 fndecl = builtin_decl_explicit (BUILT_IN_BSWAP32);
2952 bswap_type = bswap32_type;
2953 }
2954 break;
2955 case 64:
2956 load_type = uint64_type_node;
2957 if (bswap64_p)
2958 {
2959 fndecl = builtin_decl_explicit (BUILT_IN_BSWAP64);
2960 bswap_type = bswap64_type;
2961 }
2962 break;
2963 default:
2964 continue;
2965 }
2966
2967 if (bswap && !fndecl && n.range != 16)
2968 continue;
2969
2970 if (bswap_replace (cur_stmt, ins_stmt, fndecl, bswap_type, load_type,
2971 &n, bswap))
2972 changed = true;
2973 }
2974 }
2975
2976 statistics_counter_event (fun, "16-bit nop implementations found",
2977 nop_stats.found_16bit);
2978 statistics_counter_event (fun, "32-bit nop implementations found",
2979 nop_stats.found_32bit);
2980 statistics_counter_event (fun, "64-bit nop implementations found",
2981 nop_stats.found_64bit);
2982 statistics_counter_event (fun, "16-bit bswap implementations found",
2983 bswap_stats.found_16bit);
2984 statistics_counter_event (fun, "32-bit bswap implementations found",
2985 bswap_stats.found_32bit);
2986 statistics_counter_event (fun, "64-bit bswap implementations found",
2987 bswap_stats.found_64bit);
2988
2989 return (changed ? TODO_update_ssa : 0);
2990 }
2991
2992 } // anon namespace
2993
2994 gimple_opt_pass *
2995 make_pass_optimize_bswap (gcc::context *ctxt)
2996 {
2997 return new pass_optimize_bswap (ctxt);
2998 }
2999
3000 /* Return true if stmt is a type conversion operation that can be stripped
3001 when used in a widening multiply operation. */
3002 static bool
3003 widening_mult_conversion_strippable_p (tree result_type, gimple *stmt)
3004 {
3005 enum tree_code rhs_code = gimple_assign_rhs_code (stmt);
3006
3007 if (TREE_CODE (result_type) == INTEGER_TYPE)
3008 {
3009 tree op_type;
3010 tree inner_op_type;
3011
3012 if (!CONVERT_EXPR_CODE_P (rhs_code))
3013 return false;
3014
3015 op_type = TREE_TYPE (gimple_assign_lhs (stmt));
3016
3017 /* If the type of OP has the same precision as the result, then
3018 we can strip this conversion. The multiply operation will be
3019 selected to create the correct extension as a by-product. */
3020 if (TYPE_PRECISION (result_type) == TYPE_PRECISION (op_type))
3021 return true;
3022
3023 /* We can also strip a conversion if it preserves the signed-ness of
3024 the operation and doesn't narrow the range. */
3025 inner_op_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
3026
3027 /* If the inner-most type is unsigned, then we can strip any
3028 intermediate widening operation. If it's signed, then the
3029 intermediate widening operation must also be signed. */
3030 if ((TYPE_UNSIGNED (inner_op_type)
3031 || TYPE_UNSIGNED (op_type) == TYPE_UNSIGNED (inner_op_type))
3032 && TYPE_PRECISION (op_type) > TYPE_PRECISION (inner_op_type))
3033 return true;
3034
3035 return false;
3036 }
3037
3038 return rhs_code == FIXED_CONVERT_EXPR;
3039 }
3040
3041 /* Return true if RHS is a suitable operand for a widening multiplication,
3042 assuming a target type of TYPE.
3043 There are two cases:
3044
3045 - RHS makes some value at least twice as wide. Store that value
3046 in *NEW_RHS_OUT if so, and store its type in *TYPE_OUT.
3047
3048 - RHS is an integer constant. Store that value in *NEW_RHS_OUT if so,
3049 but leave *TYPE_OUT untouched. */
3050
3051 static bool
3052 is_widening_mult_rhs_p (tree type, tree rhs, tree *type_out,
3053 tree *new_rhs_out)
3054 {
3055 gimple *stmt;
3056 tree type1, rhs1;
3057
3058 if (TREE_CODE (rhs) == SSA_NAME)
3059 {
3060 stmt = SSA_NAME_DEF_STMT (rhs);
3061 if (is_gimple_assign (stmt))
3062 {
3063 if (! widening_mult_conversion_strippable_p (type, stmt))
3064 rhs1 = rhs;
3065 else
3066 {
3067 rhs1 = gimple_assign_rhs1 (stmt);
3068
3069 if (TREE_CODE (rhs1) == INTEGER_CST)
3070 {
3071 *new_rhs_out = rhs1;
3072 *type_out = NULL;
3073 return true;
3074 }
3075 }
3076 }
3077 else
3078 rhs1 = rhs;
3079
3080 type1 = TREE_TYPE (rhs1);
3081
3082 if (TREE_CODE (type1) != TREE_CODE (type)
3083 || TYPE_PRECISION (type1) * 2 > TYPE_PRECISION (type))
3084 return false;
3085
3086 *new_rhs_out = rhs1;
3087 *type_out = type1;
3088 return true;
3089 }
3090
3091 if (TREE_CODE (rhs) == INTEGER_CST)
3092 {
3093 *new_rhs_out = rhs;
3094 *type_out = NULL;
3095 return true;
3096 }
3097
3098 return false;
3099 }
3100
3101 /* Return true if STMT performs a widening multiplication, assuming the
3102 output type is TYPE. If so, store the unwidened types of the operands
3103 in *TYPE1_OUT and *TYPE2_OUT respectively. Also fill *RHS1_OUT and
3104 *RHS2_OUT such that converting those operands to types *TYPE1_OUT
3105 and *TYPE2_OUT would give the operands of the multiplication. */
3106
3107 static bool
3108 is_widening_mult_p (gimple *stmt,
3109 tree *type1_out, tree *rhs1_out,
3110 tree *type2_out, tree *rhs2_out)
3111 {
3112 tree type = TREE_TYPE (gimple_assign_lhs (stmt));
3113
3114 if (TREE_CODE (type) != INTEGER_TYPE
3115 && TREE_CODE (type) != FIXED_POINT_TYPE)
3116 return false;
3117
3118 if (!is_widening_mult_rhs_p (type, gimple_assign_rhs1 (stmt), type1_out,
3119 rhs1_out))
3120 return false;
3121
3122 if (!is_widening_mult_rhs_p (type, gimple_assign_rhs2 (stmt), type2_out,
3123 rhs2_out))
3124 return false;
3125
3126 if (*type1_out == NULL)
3127 {
3128 if (*type2_out == NULL || !int_fits_type_p (*rhs1_out, *type2_out))
3129 return false;
3130 *type1_out = *type2_out;
3131 }
3132
3133 if (*type2_out == NULL)
3134 {
3135 if (!int_fits_type_p (*rhs2_out, *type1_out))
3136 return false;
3137 *type2_out = *type1_out;
3138 }
3139
3140 /* Ensure that the larger of the two operands comes first. */
3141 if (TYPE_PRECISION (*type1_out) < TYPE_PRECISION (*type2_out))
3142 {
3143 std::swap (*type1_out, *type2_out);
3144 std::swap (*rhs1_out, *rhs2_out);
3145 }
3146
3147 return true;
3148 }
3149
3150 /* Check to see if the CALL statement is an invocation of copysign
3151 with 1.0 as the first argument. */
3152 static bool
3153 is_copysign_call_with_1 (gimple *call)
3154 {
3155 gcall *c = dyn_cast <gcall *> (call);
3156 if (! c)
3157 return false;
3158
3159 enum combined_fn code = gimple_call_combined_fn (c);
3160
3161 if (code == CFN_LAST)
3162 return false;
3163
3164 if (builtin_fn_p (code))
3165 {
3166 switch (as_builtin_fn (code))
3167 {
3168 CASE_FLT_FN (BUILT_IN_COPYSIGN):
3169 CASE_FLT_FN_FLOATN_NX (BUILT_IN_COPYSIGN):
3170 return real_onep (gimple_call_arg (c, 0));
3171 default:
3172 return false;
3173 }
3174 }
3175
3176 if (internal_fn_p (code))
3177 {
3178 switch (as_internal_fn (code))
3179 {
3180 case IFN_COPYSIGN:
3181 return real_onep (gimple_call_arg (c, 0));
3182 default:
3183 return false;
3184 }
3185 }
3186
3187 return false;
3188 }
3189
3190 /* Try to expand the pattern x * copysign (1, y) into xorsign (x, y).
3191 This only happens when the xorsign optab is defined. If the
3192 pattern is not an xorsign pattern or if expansion fails, FALSE is
3193 returned; otherwise TRUE is returned. */
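
/* Illustrative sketch (editorial addition, not part of the GCC sources): on
   a target that defines the xorsign optab, the multiplication in

     double f (double x, double y) { return x * __builtin_copysign (1.0, y); }

   is replaced by a single IFN_XORSIGN internal call, which can be expanded
   roughly as XORing the sign bit of y into x instead of doing a multiply.  */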
3194 static bool
3195 convert_expand_mult_copysign (gimple *stmt, gimple_stmt_iterator *gsi)
3196 {
3197 tree treeop0, treeop1, lhs, type;
3198 location_t loc = gimple_location (stmt);
3199 lhs = gimple_assign_lhs (stmt);
3200 treeop0 = gimple_assign_rhs1 (stmt);
3201 treeop1 = gimple_assign_rhs2 (stmt);
3202 type = TREE_TYPE (lhs);
3203 machine_mode mode = TYPE_MODE (type);
3204
3205 if (HONOR_SNANS (type))
3206 return false;
3207
3208 if (TREE_CODE (treeop0) == SSA_NAME && TREE_CODE (treeop1) == SSA_NAME)
3209 {
3210 gimple *call0 = SSA_NAME_DEF_STMT (treeop0);
3211 if (!has_single_use (treeop0) || !is_copysign_call_with_1 (call0))
3212 {
3213 call0 = SSA_NAME_DEF_STMT (treeop1);
3214 if (!has_single_use (treeop1) || !is_copysign_call_with_1 (call0))
3215 return false;
3216
3217 treeop1 = treeop0;
3218 }
3219 if (optab_handler (xorsign_optab, mode) == CODE_FOR_nothing)
3220 return false;
3221
3222 gcall *c = as_a<gcall*> (call0);
3223 treeop0 = gimple_call_arg (c, 1);
3224
3225 gcall *call_stmt
3226 = gimple_build_call_internal (IFN_XORSIGN, 2, treeop1, treeop0);
3227 gimple_set_lhs (call_stmt, lhs);
3228 gimple_set_location (call_stmt, loc);
3229 gsi_replace (gsi, call_stmt, true);
3230 return true;
3231 }
3232
3233 return false;
3234 }
3235
3236 /* Process a single gimple statement STMT, which has a MULT_EXPR as
3237 its rhs, and try to convert it into a WIDEN_MULT_EXPR. The return
3238 value is true iff we converted the statement. */
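
/* Illustrative sketch (editorial addition, not part of the GCC sources): on
   a target with a 32x32->64 bit multiply instruction, the MULT_EXPR coming
   from

     #include <stdint.h>
     int64_t f (int32_t a, int32_t b) { return (int64_t) a * b; }

   is rewritten into WIDEN_MULT_EXPR <a, b>, letting expansion use the single
   widening multiply instead of extending both operands to 64 bits first.  */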
3239
3240 static bool
3241 convert_mult_to_widen (gimple *stmt, gimple_stmt_iterator *gsi)
3242 {
3243 tree lhs, rhs1, rhs2, type, type1, type2;
3244 enum insn_code handler;
3245 machine_mode to_mode, from_mode, actual_mode;
3246 optab op;
3247 int actual_precision;
3248 location_t loc = gimple_location (stmt);
3249 bool from_unsigned1, from_unsigned2;
3250
3251 lhs = gimple_assign_lhs (stmt);
3252 type = TREE_TYPE (lhs);
3253 if (TREE_CODE (type) != INTEGER_TYPE)
3254 return false;
3255
3256 if (!is_widening_mult_p (stmt, &type1, &rhs1, &type2, &rhs2))
3257 return false;
3258
3259 to_mode = TYPE_MODE (type);
3260 from_mode = TYPE_MODE (type1);
3261 from_unsigned1 = TYPE_UNSIGNED (type1);
3262 from_unsigned2 = TYPE_UNSIGNED (type2);
3263
3264 if (from_unsigned1 && from_unsigned2)
3265 op = umul_widen_optab;
3266 else if (!from_unsigned1 && !from_unsigned2)
3267 op = smul_widen_optab;
3268 else
3269 op = usmul_widen_optab;
3270
3271 handler = find_widening_optab_handler_and_mode (op, to_mode, from_mode,
3272 0, &actual_mode);
3273
3274 if (handler == CODE_FOR_nothing)
3275 {
3276 if (op != smul_widen_optab)
3277 {
3278 /* We can use a signed multiply with unsigned types as long as
3279 there is a wider mode to use, or it is the smaller of the two
3280 types that is unsigned. Note that type1 >= type2, always. */
3281 if ((TYPE_UNSIGNED (type1)
3282 && TYPE_PRECISION (type1) == GET_MODE_PRECISION (from_mode))
3283 || (TYPE_UNSIGNED (type2)
3284 && TYPE_PRECISION (type2) == GET_MODE_PRECISION (from_mode)))
3285 {
3286 if (!GET_MODE_WIDER_MODE (from_mode).exists (&from_mode)
3287 || GET_MODE_SIZE (to_mode) <= GET_MODE_SIZE (from_mode))
3288 return false;
3289 }
3290
3291 op = smul_widen_optab;
3292 handler = find_widening_optab_handler_and_mode (op, to_mode,
3293 from_mode, 0,
3294 &actual_mode);
3295
3296 if (handler == CODE_FOR_nothing)
3297 return false;
3298
3299 from_unsigned1 = from_unsigned2 = false;
3300 }
3301 else
3302 return false;
3303 }
3304
3305 /* Ensure that the inputs to the handler are in the correct precision
3306 for the opcode. This will be the full mode size. */
3307 actual_precision = GET_MODE_PRECISION (actual_mode);
3308 if (2 * actual_precision > TYPE_PRECISION (type))
3309 return false;
3310 if (actual_precision != TYPE_PRECISION (type1)
3311 || from_unsigned1 != TYPE_UNSIGNED (type1))
3312 rhs1 = build_and_insert_cast (gsi, loc,
3313 build_nonstandard_integer_type
3314 (actual_precision, from_unsigned1), rhs1);
3315 if (actual_precision != TYPE_PRECISION (type2)
3316 || from_unsigned2 != TYPE_UNSIGNED (type2))
3317 rhs2 = build_and_insert_cast (gsi, loc,
3318 build_nonstandard_integer_type
3319 (actual_precision, from_unsigned2), rhs2);
3320
3321 /* Handle constants. */
3322 if (TREE_CODE (rhs1) == INTEGER_CST)
3323 rhs1 = fold_convert (type1, rhs1);
3324 if (TREE_CODE (rhs2) == INTEGER_CST)
3325 rhs2 = fold_convert (type2, rhs2);
3326
3327 gimple_assign_set_rhs1 (stmt, rhs1);
3328 gimple_assign_set_rhs2 (stmt, rhs2);
3329 gimple_assign_set_rhs_code (stmt, WIDEN_MULT_EXPR);
3330 update_stmt (stmt);
3331 widen_mul_stats.widen_mults_inserted++;
3332 return true;
3333 }
3334
3335 /* Process a single gimple statement STMT, which is found at the
3336 iterator GSI and has either a PLUS_EXPR or a MINUS_EXPR as its
3337 rhs (given by CODE), and try to convert it into a
3338 WIDEN_MULT_PLUS_EXPR or a WIDEN_MULT_MINUS_EXPR. The return value
3339 is true iff we converted the statement. */
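
/* Illustrative sketch (editorial addition, not part of the GCC sources): on
   a target providing a widening multiply-accumulate instruction, code such
   as

     #include <stdint.h>
     int64_t f (int64_t acc, int32_t a, int32_t b) { return acc + (int64_t) a * b; }

   has the PLUS_EXPR fed by the widening multiply rewritten into
   WIDEN_MULT_PLUS_EXPR <a, b, acc>, provided the multiply result has no
   other uses.  */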
3340
3341 static bool
3342 convert_plusminus_to_widen (gimple_stmt_iterator *gsi, gimple *stmt,
3343 enum tree_code code)
3344 {
3345 gimple *rhs1_stmt = NULL, *rhs2_stmt = NULL;
3346 gimple *conv1_stmt = NULL, *conv2_stmt = NULL, *conv_stmt;
3347 tree type, type1, type2, optype;
3348 tree lhs, rhs1, rhs2, mult_rhs1, mult_rhs2, add_rhs;
3349 enum tree_code rhs1_code = ERROR_MARK, rhs2_code = ERROR_MARK;
3350 optab this_optab;
3351 enum tree_code wmult_code;
3352 enum insn_code handler;
3353 machine_mode to_mode, from_mode, actual_mode;
3354 location_t loc = gimple_location (stmt);
3355 int actual_precision;
3356 bool from_unsigned1, from_unsigned2;
3357
3358 lhs = gimple_assign_lhs (stmt);
3359 type = TREE_TYPE (lhs);
3360 if (TREE_CODE (type) != INTEGER_TYPE
3361 && TREE_CODE (type) != FIXED_POINT_TYPE)
3362 return false;
3363
3364 if (code == MINUS_EXPR)
3365 wmult_code = WIDEN_MULT_MINUS_EXPR;
3366 else
3367 wmult_code = WIDEN_MULT_PLUS_EXPR;
3368
3369 rhs1 = gimple_assign_rhs1 (stmt);
3370 rhs2 = gimple_assign_rhs2 (stmt);
3371
3372 if (TREE_CODE (rhs1) == SSA_NAME)
3373 {
3374 rhs1_stmt = SSA_NAME_DEF_STMT (rhs1);
3375 if (is_gimple_assign (rhs1_stmt))
3376 rhs1_code = gimple_assign_rhs_code (rhs1_stmt);
3377 }
3378
3379 if (TREE_CODE (rhs2) == SSA_NAME)
3380 {
3381 rhs2_stmt = SSA_NAME_DEF_STMT (rhs2);
3382 if (is_gimple_assign (rhs2_stmt))
3383 rhs2_code = gimple_assign_rhs_code (rhs2_stmt);
3384 }
3385
3386 /* Allow for one conversion statement between the multiply
3387 and addition/subtraction statement. If there is more than
3388 one conversion then we assume they would invalidate this
3389 transformation. If that's not the case then they should have
3390 been folded before now. */
3391 if (CONVERT_EXPR_CODE_P (rhs1_code))
3392 {
3393 conv1_stmt = rhs1_stmt;
3394 rhs1 = gimple_assign_rhs1 (rhs1_stmt);
3395 if (TREE_CODE (rhs1) == SSA_NAME)
3396 {
3397 rhs1_stmt = SSA_NAME_DEF_STMT (rhs1);
3398 if (is_gimple_assign (rhs1_stmt))
3399 rhs1_code = gimple_assign_rhs_code (rhs1_stmt);
3400 }
3401 else
3402 return false;
3403 }
3404 if (CONVERT_EXPR_CODE_P (rhs2_code))
3405 {
3406 conv2_stmt = rhs2_stmt;
3407 rhs2 = gimple_assign_rhs1 (rhs2_stmt);
3408 if (TREE_CODE (rhs2) == SSA_NAME)
3409 {
3410 rhs2_stmt = SSA_NAME_DEF_STMT (rhs2);
3411 if (is_gimple_assign (rhs2_stmt))
3412 rhs2_code = gimple_assign_rhs_code (rhs2_stmt);
3413 }
3414 else
3415 return false;
3416 }
3417
3418 /* If code is WIDEN_MULT_EXPR then it would seem unnecessary to call
3419 is_widening_mult_p, but we still need the rhs values it returns.
3420
3421 It might also appear that it would be sufficient to use the existing
3422 operands of the widening multiply, but that would limit the choice of
3423 multiply-and-accumulate instructions.
3424
3425 If the widened-multiplication result has more than one use, it is
3426 probably wiser not to do the conversion. */
3427 if (code == PLUS_EXPR
3428 && (rhs1_code == MULT_EXPR || rhs1_code == WIDEN_MULT_EXPR))
3429 {
3430 if (!has_single_use (rhs1)
3431 || !is_widening_mult_p (rhs1_stmt, &type1, &mult_rhs1,
3432 &type2, &mult_rhs2))
3433 return false;
3434 add_rhs = rhs2;
3435 conv_stmt = conv1_stmt;
3436 }
3437 else if (rhs2_code == MULT_EXPR || rhs2_code == WIDEN_MULT_EXPR)
3438 {
3439 if (!has_single_use (rhs2)
3440 || !is_widening_mult_p (rhs2_stmt, &type1, &mult_rhs1,
3441 &type2, &mult_rhs2))
3442 return false;
3443 add_rhs = rhs1;
3444 conv_stmt = conv2_stmt;
3445 }
3446 else
3447 return false;
3448
3449 to_mode = TYPE_MODE (type);
3450 from_mode = TYPE_MODE (type1);
3451 from_unsigned1 = TYPE_UNSIGNED (type1);
3452 from_unsigned2 = TYPE_UNSIGNED (type2);
3453 optype = type1;
3454
3455 /* There's no such thing as a mixed sign madd yet, so use a wider mode. */
3456 if (from_unsigned1 != from_unsigned2)
3457 {
3458 if (!INTEGRAL_TYPE_P (type))
3459 return false;
3460 /* We can use a signed multiply with unsigned types as long as
3461 there is a wider mode to use, or it is the smaller of the two
3462 types that is unsigned. Note that type1 >= type2, always. */
3463 if ((from_unsigned1
3464 && TYPE_PRECISION (type1) == GET_MODE_PRECISION (from_mode))
3465 || (from_unsigned2
3466 && TYPE_PRECISION (type2) == GET_MODE_PRECISION (from_mode)))
3467 {
3468 if (!GET_MODE_WIDER_MODE (from_mode).exists (&from_mode)
3469 || GET_MODE_SIZE (from_mode) >= GET_MODE_SIZE (to_mode))
3470 return false;
3471 }
3472
3473 from_unsigned1 = from_unsigned2 = false;
3474 optype = build_nonstandard_integer_type (GET_MODE_PRECISION (from_mode),
3475 false);
3476 }
3477
3478 /* If there was a conversion between the multiply and addition
3479 then we need to make sure it fits a multiply-and-accumulate.
3480 There should be a single mode change which does not change the
3481 value. */
3482 if (conv_stmt)
3483 {
3484 /* We use the original, unmodified data types for this. */
3485 tree from_type = TREE_TYPE (gimple_assign_rhs1 (conv_stmt));
3486 tree to_type = TREE_TYPE (gimple_assign_lhs (conv_stmt));
3487 int data_size = TYPE_PRECISION (type1) + TYPE_PRECISION (type2);
3488 bool is_unsigned = TYPE_UNSIGNED (type1) && TYPE_UNSIGNED (type2);
3489
3490 if (TYPE_PRECISION (from_type) > TYPE_PRECISION (to_type))
3491 {
3492 /* Conversion is a truncate. */
3493 if (TYPE_PRECISION (to_type) < data_size)
3494 return false;
3495 }
3496 else if (TYPE_PRECISION (from_type) < TYPE_PRECISION (to_type))
3497 {
3498 /* Conversion is an extend. Check it's the right sort. */
3499 if (TYPE_UNSIGNED (from_type) != is_unsigned
3500 && !(is_unsigned && TYPE_PRECISION (from_type) > data_size))
3501 return false;
3502 }
3503 /* else convert is a no-op for our purposes. */
3504 }
3505
3506 /* Verify that the machine can perform a widening multiply
3507 accumulate in this mode/signedness combination, otherwise
3508 this transformation is likely to pessimize code. */
3509 this_optab = optab_for_tree_code (wmult_code, optype, optab_default);
3510 handler = find_widening_optab_handler_and_mode (this_optab, to_mode,
3511 from_mode, 0, &actual_mode);
3512
3513 if (handler == CODE_FOR_nothing)
3514 return false;
3515
3516 /* Ensure that the inputs to the handler are in the correct precision
3517 for the opcode. This will be the full mode size. */
3518 actual_precision = GET_MODE_PRECISION (actual_mode);
3519 if (actual_precision != TYPE_PRECISION (type1)
3520 || from_unsigned1 != TYPE_UNSIGNED (type1))
3521 mult_rhs1 = build_and_insert_cast (gsi, loc,
3522 build_nonstandard_integer_type
3523 (actual_precision, from_unsigned1),
3524 mult_rhs1);
3525 if (actual_precision != TYPE_PRECISION (type2)
3526 || from_unsigned2 != TYPE_UNSIGNED (type2))
3527 mult_rhs2 = build_and_insert_cast (gsi, loc,
3528 build_nonstandard_integer_type
3529 (actual_precision, from_unsigned2),
3530 mult_rhs2);
3531
3532 if (!useless_type_conversion_p (type, TREE_TYPE (add_rhs)))
3533 add_rhs = build_and_insert_cast (gsi, loc, type, add_rhs);
3534
3535 /* Handle constants. */
3536 if (TREE_CODE (mult_rhs1) == INTEGER_CST)
3537 mult_rhs1 = fold_convert (type1, mult_rhs1);
3538 if (TREE_CODE (mult_rhs2) == INTEGER_CST)
3539 mult_rhs2 = fold_convert (type2, mult_rhs2);
3540
3541 gimple_assign_set_rhs_with_ops (gsi, wmult_code, mult_rhs1, mult_rhs2,
3542 add_rhs);
3543 update_stmt (gsi_stmt (*gsi));
3544 widen_mul_stats.maccs_inserted++;
3545 return true;
3546 }
3547
3548 /* Combine the multiplication at MUL_STMT with operands MULOP1 and MULOP2
3549 with uses in additions and subtractions to form fused multiply-add
3550 operations. Returns true if successful and MUL_STMT should be removed. */
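
/* Illustrative sketch (editorial addition, not part of the GCC sources):
   when the fma optab is available and -ffp-contract=fast is in effect,
   code such as

     double f (double a, double b, double c) { return a * b + c; }

   has the multiply and the add fused into a single fused multiply-add,
   and a use of the form c - a * b similarly becomes an FNMA-style
   operation, provided every use of the multiplication result can be
   fused so the original MULT_EXPR becomes dead.  */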
3551
3552 static bool
3553 convert_mult_to_fma (gimple *mul_stmt, tree op1, tree op2)
3554 {
3555 tree mul_result = gimple_get_lhs (mul_stmt);
3556 tree type = TREE_TYPE (mul_result);
3557 gimple *use_stmt, *neguse_stmt;
3558 gassign *fma_stmt;
3559 use_operand_p use_p;
3560 imm_use_iterator imm_iter;
3561
3562 if (FLOAT_TYPE_P (type)
3563 && flag_fp_contract_mode == FP_CONTRACT_OFF)
3564 return false;
3565
3566 /* We don't want to do bitfield reduction ops. */
3567 if (INTEGRAL_TYPE_P (type)
3568 && !type_has_mode_precision_p (type))
3569 return false;
3570
3571 /* If the target doesn't support it, don't generate it. We assume that
3572 if fma isn't available then fms, fnma or fnms are not either. */
3573 if (optab_handler (fma_optab, TYPE_MODE (type)) == CODE_FOR_nothing)
3574 return false;
3575
3576 /* If the multiplication has zero uses, it is kept around probably because
3577 of -fnon-call-exceptions. Don't optimize it away in that case;
3578 it is DCE's job. */
3579 if (has_zero_uses (mul_result))
3580 return false;
3581
3582 /* Make sure that the multiplication statement becomes dead after
3583 the transformation, i.e. that all uses are transformed to FMAs.
3584 This means we assume that an FMA operation has the same cost
3585 as an addition. */
3586 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, mul_result)
3587 {
3588 enum tree_code use_code;
3589 tree result = mul_result;
3590 bool negate_p = false;
3591
3592 use_stmt = USE_STMT (use_p);
3593
3594 if (is_gimple_debug (use_stmt))
3595 continue;
3596
3597 /* For now restrict these operations to single basic blocks. In theory
3598 we would want to support sinking the multiplication in
3599 m = a*b;
3600 if ()
3601 ma = m + c;
3602 else
3603 d = m;
3604 to form an fma in the then block and sink the multiplication to the
3605 else block. */
3606 if (gimple_bb (use_stmt) != gimple_bb (mul_stmt))
3607 return false;
3608
3609 if (!is_gimple_assign (use_stmt))
3610 return false;
3611
3612 use_code = gimple_assign_rhs_code (use_stmt);
3613
3614 /* A negate on the multiplication leads to FNMA. */
3615 if (use_code == NEGATE_EXPR)
3616 {
3617 ssa_op_iter iter;
3618 use_operand_p usep;
3619
3620 result = gimple_assign_lhs (use_stmt);
3621
3622 /* Make sure the negate statement becomes dead with this
3623 single transformation. */
3624 if (!single_imm_use (gimple_assign_lhs (use_stmt),
3625 &use_p, &neguse_stmt))
3626 return false;
3627
3628 /* Make sure the multiplication isn't also used on that stmt. */
3629 FOR_EACH_PHI_OR_STMT_USE (usep, neguse_stmt, iter, SSA_OP_USE)
3630 if (USE_FROM_PTR (usep) == mul_result)
3631 return false;
3632
3633 /* Re-validate. */
3634 use_stmt = neguse_stmt;
3635 if (gimple_bb (use_stmt) != gimple_bb (mul_stmt))
3636 return false;
3637 if (!is_gimple_assign (use_stmt))
3638 return false;
3639
3640 use_code = gimple_assign_rhs_code (use_stmt);
3641 negate_p = true;
3642 }
3643
3644 switch (use_code)
3645 {
3646 case MINUS_EXPR:
3647 if (gimple_assign_rhs2 (use_stmt) == result)
3648 negate_p = !negate_p;
3649 break;
3650 case PLUS_EXPR:
3651 break;
3652 default:
3653 /* FMA can only be formed from PLUS and MINUS. */
3654 return false;
3655 }
3656
3657 /* If the subtrahend (gimple_assign_rhs2 (use_stmt)) is computed
3658 by a MULT_EXPR that we'll visit later, we might be able to
3659 get a more profitable match with fnma.
3660 OTOH, if we don't, a negate / fma pair likely has lower latency
3661 than a mult / subtract pair. */
3662 if (use_code == MINUS_EXPR && !negate_p
3663 && gimple_assign_rhs1 (use_stmt) == result
3664 && optab_handler (fms_optab, TYPE_MODE (type)) == CODE_FOR_nothing
3665 && optab_handler (fnma_optab, TYPE_MODE (type)) != CODE_FOR_nothing)
3666 {
3667 tree rhs2 = gimple_assign_rhs2 (use_stmt);
3668
3669 if (TREE_CODE (rhs2) == SSA_NAME)
3670 {
3671 gimple *stmt2 = SSA_NAME_DEF_STMT (rhs2);
3672 if (has_single_use (rhs2)
3673 && is_gimple_assign (stmt2)
3674 && gimple_assign_rhs_code (stmt2) == MULT_EXPR)
3675 return false;
3676 }
3677 }
3678
3679 /* We can't handle a * b + a * b. */
3680 if (gimple_assign_rhs1 (use_stmt) == gimple_assign_rhs2 (use_stmt))
3681 return false;
3682
3683 /* While it is possible to validate whether or not the exact form
3684 that we've recognized is available in the backend, the assumption
3685 is that the transformation is never a loss. For instance, suppose
3686 the target only has the plain FMA pattern available. Consider
3687 a*b-c -> fma(a,b,-c): we've exchanged MUL+SUB for FMA+NEG, which
3688 is still two operations. Consider -(a*b)-c -> fma(-a,b,-c): we
3689 still have 3 operations, but in the FMA form the two NEGs are
3690 independent and could be run in parallel. */
3691 }
3692
3693 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, mul_result)
3694 {
3695 gimple_stmt_iterator gsi = gsi_for_stmt (use_stmt);
3696 enum tree_code use_code;
3697 tree addop, mulop1 = op1, result = mul_result;
3698 bool negate_p = false;
3699
3700 if (is_gimple_debug (use_stmt))
3701 continue;
3702
3703 use_code = gimple_assign_rhs_code (use_stmt);
3704 if (use_code == NEGATE_EXPR)
3705 {
3706 result = gimple_assign_lhs (use_stmt);
3707 single_imm_use (gimple_assign_lhs (use_stmt), &use_p, &neguse_stmt);
3708 gsi_remove (&gsi, true);
3709 release_defs (use_stmt);
3710
3711 use_stmt = neguse_stmt;
3712 gsi = gsi_for_stmt (use_stmt);
3713 use_code = gimple_assign_rhs_code (use_stmt);
3714 negate_p = true;
3715 }
3716
3717 if (gimple_assign_rhs1 (use_stmt) == result)
3718 {
3719 addop = gimple_assign_rhs2 (use_stmt);
3720 /* a * b - c -> a * b + (-c) */
3721 if (gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
3722 addop = force_gimple_operand_gsi (&gsi,
3723 build1 (NEGATE_EXPR,
3724 type, addop),
3725 true, NULL_TREE, true,
3726 GSI_SAME_STMT);
3727 }
3728 else
3729 {
3730 addop = gimple_assign_rhs1 (use_stmt);
3731 /* a - b * c -> (-b) * c + a */
3732 if (gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
3733 negate_p = !negate_p;
3734 }
3735
3736 if (negate_p)
3737 mulop1 = force_gimple_operand_gsi (&gsi,
3738 build1 (NEGATE_EXPR,
3739 type, mulop1),
3740 true, NULL_TREE, true,
3741 GSI_SAME_STMT);
3742
3743 fma_stmt = gimple_build_assign (gimple_assign_lhs (use_stmt),
3744 FMA_EXPR, mulop1, op2, addop);
3745 gsi_replace (&gsi, fma_stmt, true);
3746 widen_mul_stats.fmas_inserted++;
3747 }
3748
3749 return true;
3750 }
3751
3752
3753 /* Helper function of match_uaddsub_overflow. Return 1
3754 if USE_STMT is an unsigned overflow check (ovf != 0) for
3755 STMT, -1 if USE_STMT is an unsigned overflow check (ovf == 0),
3756 and 0 otherwise. */
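/* As an illustration of the return values (the exact operand
   positions accepted are spelled out in the switch further down):
   given the unsigned subtraction

     x = y - z;

   a USE_STMT such as "if (x > y)" is the canonical overflow check and
   yields 1, the inverted form "if (x <= y)" yields -1, and any other
   comparison or use yields 0.  Additions are checked the same way via
   comparisons such as "if (y > x)".  */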
3757
3758 static int
3759 uaddsub_overflow_check_p (gimple *stmt, gimple *use_stmt)
3760 {
3761 enum tree_code ccode = ERROR_MARK;
3762 tree crhs1 = NULL_TREE, crhs2 = NULL_TREE;
3763 if (gimple_code (use_stmt) == GIMPLE_COND)
3764 {
3765 ccode = gimple_cond_code (use_stmt);
3766 crhs1 = gimple_cond_lhs (use_stmt);
3767 crhs2 = gimple_cond_rhs (use_stmt);
3768 }
3769 else if (is_gimple_assign (use_stmt))
3770 {
3771 if (gimple_assign_rhs_class (use_stmt) == GIMPLE_BINARY_RHS)
3772 {
3773 ccode = gimple_assign_rhs_code (use_stmt);
3774 crhs1 = gimple_assign_rhs1 (use_stmt);
3775 crhs2 = gimple_assign_rhs2 (use_stmt);
3776 }
3777 else if (gimple_assign_rhs_code (use_stmt) == COND_EXPR)
3778 {
3779 tree cond = gimple_assign_rhs1 (use_stmt);
3780 if (COMPARISON_CLASS_P (cond))
3781 {
3782 ccode = TREE_CODE (cond);
3783 crhs1 = TREE_OPERAND (cond, 0);
3784 crhs2 = TREE_OPERAND (cond, 1);
3785 }
3786 else
3787 return 0;
3788 }
3789 else
3790 return 0;
3791 }
3792 else
3793 return 0;
3794
3795 if (TREE_CODE_CLASS (ccode) != tcc_comparison)
3796 return 0;
3797
3798 enum tree_code code = gimple_assign_rhs_code (stmt);
3799 tree lhs = gimple_assign_lhs (stmt);
3800 tree rhs1 = gimple_assign_rhs1 (stmt);
3801 tree rhs2 = gimple_assign_rhs2 (stmt);
3802
3803 switch (ccode)
3804 {
3805 case GT_EXPR:
3806 case LE_EXPR:
3807 /* r = a - b; r > a or r <= a
3808 r = a + b; a > r or a <= r or b > r or b <= r. */
3809 if ((code == MINUS_EXPR && crhs1 == lhs && crhs2 == rhs1)
3810 || (code == PLUS_EXPR && (crhs1 == rhs1 || crhs1 == rhs2)
3811 && crhs2 == lhs))
3812 return ccode == GT_EXPR ? 1 : -1;
3813 break;
3814 case LT_EXPR:
3815 case GE_EXPR:
3816 /* r = a - b; a < r or a >= r
3817 r = a + b; r < a or r >= a or r < b or r >= b. */
3818 if ((code == MINUS_EXPR && crhs1 == rhs1 && crhs2 == lhs)
3819 || (code == PLUS_EXPR && crhs1 == lhs
3820 && (crhs2 == rhs1 || crhs2 == rhs2)))
3821 return ccode == LT_EXPR ? 1 : -1;
3822 break;
3823 default:
3824 break;
3825 }
3826 return 0;
3827 }
3828
3829 /* Recognize for unsigned x
3830 x = y - z;
3831 if (x > y)
3832 where there are other uses of x and replace it with
3833 _7 = SUB_OVERFLOW (y, z);
3834 x = REALPART_EXPR <_7>;
3835 _8 = IMAGPART_EXPR <_7>;
3836 if (_8)
3837 and similarly for addition. */
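/* A hedged before/after sketch with hypothetical names.  Before:

     unsigned int f (unsigned int y, unsigned int z, int *ovfp)
     {
       unsigned int x = y - z;
       *ovfp = x > y;    (overflow check; x also escapes, so it has
                          another, non-check use)
       return x;
     }

   After the transform, the subtraction and the check share one
   IFN_SUB_OVERFLOW call: the REALPART_EXPR of its complex result is
   the difference and the IMAGPART_EXPR is the overflow flag, much as
   __builtin_sub_overflow (y, z, &x) would produce.  */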
3838
3839 static bool
3840 match_uaddsub_overflow (gimple_stmt_iterator *gsi, gimple *stmt,
3841 enum tree_code code)
3842 {
3843 tree lhs = gimple_assign_lhs (stmt);
3844 tree type = TREE_TYPE (lhs);
3845 use_operand_p use_p;
3846 imm_use_iterator iter;
3847 bool use_seen = false;
3848 bool ovf_use_seen = false;
3849 gimple *use_stmt;
3850
3851 gcc_checking_assert (code == PLUS_EXPR || code == MINUS_EXPR);
3852 if (!INTEGRAL_TYPE_P (type)
3853 || !TYPE_UNSIGNED (type)
3854 || has_zero_uses (lhs)
3855 || has_single_use (lhs)
3856 || optab_handler (code == PLUS_EXPR ? uaddv4_optab : usubv4_optab,
3857 TYPE_MODE (type)) == CODE_FOR_nothing)
3858 return false;
3859
3860 FOR_EACH_IMM_USE_FAST (use_p, iter, lhs)
3861 {
3862 use_stmt = USE_STMT (use_p);
3863 if (is_gimple_debug (use_stmt))
3864 continue;
3865
3866 if (uaddsub_overflow_check_p (stmt, use_stmt))
3867 ovf_use_seen = true;
3868 else
3869 use_seen = true;
3870 if (ovf_use_seen && use_seen)
3871 break;
3872 }
3873
3874 if (!ovf_use_seen || !use_seen)
3875 return false;
3876
3877 tree ctype = build_complex_type (type);
3878 tree rhs1 = gimple_assign_rhs1 (stmt);
3879 tree rhs2 = gimple_assign_rhs2 (stmt);
3880 gcall *g = gimple_build_call_internal (code == PLUS_EXPR
3881 ? IFN_ADD_OVERFLOW : IFN_SUB_OVERFLOW,
3882 2, rhs1, rhs2);
3883 tree ctmp = make_ssa_name (ctype);
3884 gimple_call_set_lhs (g, ctmp);
3885 gsi_insert_before (gsi, g, GSI_SAME_STMT);
3886 gassign *g2 = gimple_build_assign (lhs, REALPART_EXPR,
3887 build1 (REALPART_EXPR, type, ctmp));
3888 gsi_replace (gsi, g2, true);
3889 tree ovf = make_ssa_name (type);
3890 g2 = gimple_build_assign (ovf, IMAGPART_EXPR,
3891 build1 (IMAGPART_EXPR, type, ctmp));
3892 gsi_insert_after (gsi, g2, GSI_NEW_STMT);
3893
3894 FOR_EACH_IMM_USE_STMT (use_stmt, iter, lhs)
3895 {
3896 if (is_gimple_debug (use_stmt))
3897 continue;
3898
3899 int ovf_use = uaddsub_overflow_check_p (stmt, use_stmt);
3900 if (ovf_use == 0)
3901 continue;
3902 if (gimple_code (use_stmt) == GIMPLE_COND)
3903 {
3904 gcond *cond_stmt = as_a <gcond *> (use_stmt);
3905 gimple_cond_set_lhs (cond_stmt, ovf);
3906 gimple_cond_set_rhs (cond_stmt, build_int_cst (type, 0));
3907 gimple_cond_set_code (cond_stmt, ovf_use == 1 ? NE_EXPR : EQ_EXPR);
3908 }
3909 else
3910 {
3911 gcc_checking_assert (is_gimple_assign (use_stmt));
3912 if (gimple_assign_rhs_class (use_stmt) == GIMPLE_BINARY_RHS)
3913 {
3914 gimple_assign_set_rhs1 (use_stmt, ovf);
3915 gimple_assign_set_rhs2 (use_stmt, build_int_cst (type, 0));
3916 gimple_assign_set_rhs_code (use_stmt,
3917 ovf_use == 1 ? NE_EXPR : EQ_EXPR);
3918 }
3919 else
3920 {
3921 gcc_checking_assert (gimple_assign_rhs_code (use_stmt)
3922 == COND_EXPR);
3923 tree cond = build2 (ovf_use == 1 ? NE_EXPR : EQ_EXPR,
3924 boolean_type_node, ovf,
3925 build_int_cst (type, 0));
3926 gimple_assign_set_rhs1 (use_stmt, cond);
3927 }
3928 }
3929 update_stmt (use_stmt);
3930 }
3931 return true;
3932 }
3933
3934 /* Return true if target has support for divmod. */
3935
3936 static bool
3937 target_supports_divmod_p (optab divmod_optab, optab div_optab, machine_mode mode)
3938 {
3939 /* If target supports hardware divmod insn, use it for divmod. */
3940 if (optab_handler (divmod_optab, mode) != CODE_FOR_nothing)
3941 return true;
3942
3943 /* Check if libfunc for divmod is available. */
3944 rtx libfunc = optab_libfunc (divmod_optab, mode);
3945 if (libfunc != NULL_RTX)
3946 {
3947 /* If optab_handler exists for div_optab, perhaps in a wider mode,
3948 we don't want to use the libfunc even if it exists for the given mode. */
3949 machine_mode div_mode;
3950 FOR_EACH_MODE_FROM (div_mode, mode)
3951 if (optab_handler (div_optab, div_mode) != CODE_FOR_nothing)
3952 return false;
3953
3954 return targetm.expand_divmod_libfunc != NULL;
3955 }
3956
3957 return false;
3958 }
3959
3960 /* Check if stmt is a candidate for the divmod transform. */
3961
3962 static bool
3963 divmod_candidate_p (gassign *stmt)
3964 {
3965 tree type = TREE_TYPE (gimple_assign_lhs (stmt));
3966 machine_mode mode = TYPE_MODE (type);
3967 optab divmod_optab, div_optab;
3968
3969 if (TYPE_UNSIGNED (type))
3970 {
3971 divmod_optab = udivmod_optab;
3972 div_optab = udiv_optab;
3973 }
3974 else
3975 {
3976 divmod_optab = sdivmod_optab;
3977 div_optab = sdiv_optab;
3978 }
3979
3980 tree op1 = gimple_assign_rhs1 (stmt);
3981 tree op2 = gimple_assign_rhs2 (stmt);
3982
3983 /* Disable the transform if either operand is a constant, since
3984 division-by-constant may have specialized expansion. */
3985 if (CONSTANT_CLASS_P (op1) || CONSTANT_CLASS_P (op2))
3986 return false;
3987
3988 /* Exclude the case where TYPE_OVERFLOW_TRAPS (type) as that should
3989 expand using the [su]divv optabs. */
3990 if (TYPE_OVERFLOW_TRAPS (type))
3991 return false;
3992
3993 if (!target_supports_divmod_p (divmod_optab, div_optab, mode))
3994 return false;
3995
3996 return true;
3997 }
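/* For example (illustrative operands), "q = a / 7; r = a % 7;" is
   deliberately rejected above: division by a constant is normally
   expanded to a multiply-by-magic sequence that is cheaper than a
   divmod call.  Only the variable-divisor case, "q = a / b;
   r = a % b;", is handed to convert_to_divmod below.  */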
3998
3999 /* This function looks for:
4000 t1 = a TRUNC_DIV_EXPR b;
4001 t2 = a TRUNC_MOD_EXPR b;
4002 and transforms it to the following sequence:
4003 complex_tmp = DIVMOD (a, b);
4004 t1 = REALPART_EXPR(complex_tmp);
4005 t2 = IMAGPART_EXPR(complex_tmp);
4006 For conditions enabling the transform see divmod_candidate_p().
4007
4008 The pass has three parts:
4009 1) Find top_stmt, which is a trunc_div or trunc_mod stmt and dominates all
4010 other trunc_div_expr and trunc_mod_expr stmts.
4011 2) Add top_stmt and all trunc_div and trunc_mod stmts dominated by top_stmt
4012 to stmts vector.
4013 3) Insert DIVMOD call just before top_stmt and update entries in
4014 stmts vector to use the return value of DIVMOD (REALPART_EXPR for div,
4015 IMAGPART_EXPR for mod). */
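/* A sketch of the rewrite on hypothetical source:

     int q, r;
     q = a / b;        (TRUNC_DIV_EXPR)
     ...
     r = a % b;        (TRUNC_MOD_EXPR with the same operands)

   becomes

     divmod_tmp = DIVMOD (a, b);
     q = REALPART_EXPR <divmod_tmp>;
     r = IMAGPART_EXPR <divmod_tmp>;

   so that quotient and remainder are computed by a single division
   (a hardware divmod insn or a divmod libcall), subject to the
   conditions in divmod_candidate_p.  */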
4016
4017 static bool
4018 convert_to_divmod (gassign *stmt)
4019 {
4020 if (stmt_can_throw_internal (stmt)
4021 || !divmod_candidate_p (stmt))
4022 return false;
4023
4024 tree op1 = gimple_assign_rhs1 (stmt);
4025 tree op2 = gimple_assign_rhs2 (stmt);
4026
4027 imm_use_iterator use_iter;
4028 gimple *use_stmt;
4029 auto_vec<gimple *> stmts;
4030
4031 gimple *top_stmt = stmt;
4032 basic_block top_bb = gimple_bb (stmt);
4033
4034 /* Part 1: Try to set top_stmt to the "topmost" stmt that dominates
4035 at least stmt and possibly other trunc_div/trunc_mod stmts
4036 having the same operands as stmt. */
4037
4038 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, op1)
4039 {
4040 if (is_gimple_assign (use_stmt)
4041 && (gimple_assign_rhs_code (use_stmt) == TRUNC_DIV_EXPR
4042 || gimple_assign_rhs_code (use_stmt) == TRUNC_MOD_EXPR)
4043 && operand_equal_p (op1, gimple_assign_rhs1 (use_stmt), 0)
4044 && operand_equal_p (op2, gimple_assign_rhs2 (use_stmt), 0))
4045 {
4046 if (stmt_can_throw_internal (use_stmt))
4047 continue;
4048
4049 basic_block bb = gimple_bb (use_stmt);
4050
4051 if (bb == top_bb)
4052 {
4053 if (gimple_uid (use_stmt) < gimple_uid (top_stmt))
4054 top_stmt = use_stmt;
4055 }
4056 else if (dominated_by_p (CDI_DOMINATORS, top_bb, bb))
4057 {
4058 top_bb = bb;
4059 top_stmt = use_stmt;
4060 }
4061 }
4062 }
4063
4064 tree top_op1 = gimple_assign_rhs1 (top_stmt);
4065 tree top_op2 = gimple_assign_rhs2 (top_stmt);
4066
4067 stmts.safe_push (top_stmt);
4068 bool div_seen = (gimple_assign_rhs_code (top_stmt) == TRUNC_DIV_EXPR);
4069
4070 /* Part 2: Add all trunc_div/trunc_mod statements dominated by top_bb
4071 to the stmts vector. The 2nd loop will always add stmt to the stmts
4072 vector, since gimple_bb (top_stmt) dominates gimple_bb (stmt), so the
4073 2nd loop ends up adding at least a single trunc_mod_expr stmt. */
4074
4075 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, top_op1)
4076 {
4077 if (is_gimple_assign (use_stmt)
4078 && (gimple_assign_rhs_code (use_stmt) == TRUNC_DIV_EXPR
4079 || gimple_assign_rhs_code (use_stmt) == TRUNC_MOD_EXPR)
4080 && operand_equal_p (top_op1, gimple_assign_rhs1 (use_stmt), 0)
4081 && operand_equal_p (top_op2, gimple_assign_rhs2 (use_stmt), 0))
4082 {
4083 if (use_stmt == top_stmt
4084 || stmt_can_throw_internal (use_stmt)
4085 || !dominated_by_p (CDI_DOMINATORS, gimple_bb (use_stmt), top_bb))
4086 continue;
4087
4088 stmts.safe_push (use_stmt);
4089 if (gimple_assign_rhs_code (use_stmt) == TRUNC_DIV_EXPR)
4090 div_seen = true;
4091 }
4092 }
4093
4094 if (!div_seen)
4095 return false;
4096
4097 /* Part 3: Create libcall to internal fn DIVMOD:
4098 divmod_tmp = DIVMOD (op1, op2). */
4099
4100 gcall *call_stmt = gimple_build_call_internal (IFN_DIVMOD, 2, op1, op2);
4101 tree res = make_temp_ssa_name (build_complex_type (TREE_TYPE (op1)),
4102 call_stmt, "divmod_tmp");
4103 gimple_call_set_lhs (call_stmt, res);
4104 /* We rejected throwing statements above. */
4105 gimple_call_set_nothrow (call_stmt, true);
4106
4107 /* Insert the call before top_stmt. */
4108 gimple_stmt_iterator top_stmt_gsi = gsi_for_stmt (top_stmt);
4109 gsi_insert_before (&top_stmt_gsi, call_stmt, GSI_SAME_STMT);
4110
4111 widen_mul_stats.divmod_calls_inserted++;
4112
4113 /* Update all statements in stmts vector:
4114 lhs = op1 TRUNC_DIV_EXPR op2 -> lhs = REALPART_EXPR<divmod_tmp>
4115 lhs = op1 TRUNC_MOD_EXPR op2 -> lhs = IMAGPART_EXPR<divmod_tmp>. */
4116
4117 for (unsigned i = 0; stmts.iterate (i, &use_stmt); ++i)
4118 {
4119 tree new_rhs;
4120
4121 switch (gimple_assign_rhs_code (use_stmt))
4122 {
4123 case TRUNC_DIV_EXPR:
4124 new_rhs = fold_build1 (REALPART_EXPR, TREE_TYPE (op1), res);
4125 break;
4126
4127 case TRUNC_MOD_EXPR:
4128 new_rhs = fold_build1 (IMAGPART_EXPR, TREE_TYPE (op1), res);
4129 break;
4130
4131 default:
4132 gcc_unreachable ();
4133 }
4134
4135 gimple_stmt_iterator gsi = gsi_for_stmt (use_stmt);
4136 gimple_assign_set_rhs_from_tree (&gsi, new_rhs);
4137 update_stmt (use_stmt);
4138 }
4139
4140 return true;
4141 }
4142
4143 /* Find integer multiplications where the operands are extended from
4144 smaller types, and replace the MULT_EXPR with a WIDEN_MULT_EXPR
4145 where appropriate. */
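/* For illustration, the kind of source the widening-multiply part of
   this pass targets (hypothetical names; the rewrite only happens when
   the target provides a widening multiply pattern for these modes):

     int64_t widen (int32_t a, int32_t b)
     {
       return (int64_t) a * b;
     }

   Here the MULT_EXPR on the sign-extended operands is replaced by a
   WIDEN_MULT_EXPR on the narrow operands, shown as something like
   "a_1 w* b_2" in the GIMPLE dumps, so no full 64-bit multiply is
   needed.  */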
4146
4147 namespace {
4148
4149 const pass_data pass_data_optimize_widening_mul =
4150 {
4151 GIMPLE_PASS, /* type */
4152 "widening_mul", /* name */
4153 OPTGROUP_NONE, /* optinfo_flags */
4154 TV_NONE, /* tv_id */
4155 PROP_ssa, /* properties_required */
4156 0, /* properties_provided */
4157 0, /* properties_destroyed */
4158 0, /* todo_flags_start */
4159 TODO_update_ssa, /* todo_flags_finish */
4160 };
4161
4162 class pass_optimize_widening_mul : public gimple_opt_pass
4163 {
4164 public:
4165 pass_optimize_widening_mul (gcc::context *ctxt)
4166 : gimple_opt_pass (pass_data_optimize_widening_mul, ctxt)
4167 {}
4168
4169 /* opt_pass methods: */
4170 virtual bool gate (function *)
4171 {
4172 return flag_expensive_optimizations && optimize;
4173 }
4174
4175 virtual unsigned int execute (function *);
4176
4177 }; // class pass_optimize_widening_mul
4178
4179 unsigned int
4180 pass_optimize_widening_mul::execute (function *fun)
4181 {
4182 basic_block bb;
4183 bool cfg_changed = false;
4184
4185 memset (&widen_mul_stats, 0, sizeof (widen_mul_stats));
4186 calculate_dominance_info (CDI_DOMINATORS);
4187 renumber_gimple_stmt_uids ();
4188
4189 FOR_EACH_BB_FN (bb, fun)
4190 {
4191 gimple_stmt_iterator gsi;
4192
4193 for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi);)
4194 {
4195 gimple *stmt = gsi_stmt (gsi);
4196 enum tree_code code;
4197
4198 if (is_gimple_assign (stmt))
4199 {
4200 code = gimple_assign_rhs_code (stmt);
4201 switch (code)
4202 {
4203 case MULT_EXPR:
4204 if (!convert_mult_to_widen (stmt, &gsi)
4205 && !convert_expand_mult_copysign (stmt, &gsi)
4206 && convert_mult_to_fma (stmt,
4207 gimple_assign_rhs1 (stmt),
4208 gimple_assign_rhs2 (stmt)))
4209 {
4210 gsi_remove (&gsi, true);
4211 release_defs (stmt);
4212 continue;
4213 }
4214 break;
4215
4216 case PLUS_EXPR:
4217 case MINUS_EXPR:
4218 if (!convert_plusminus_to_widen (&gsi, stmt, code))
4219 match_uaddsub_overflow (&gsi, stmt, code);
4220 break;
4221
4222 case TRUNC_MOD_EXPR:
4223 convert_to_divmod (as_a<gassign *> (stmt));
4224 break;
4225
4226 default:;
4227 }
4228 }
4229 else if (is_gimple_call (stmt)
4230 && gimple_call_lhs (stmt))
4231 {
4232 tree fndecl = gimple_call_fndecl (stmt);
4233 if (fndecl
4234 && gimple_call_builtin_p (stmt, BUILT_IN_NORMAL))
4235 {
4236 switch (DECL_FUNCTION_CODE (fndecl))
4237 {
4238 case BUILT_IN_POWF:
4239 case BUILT_IN_POW:
4240 case BUILT_IN_POWL:
4241 if (TREE_CODE (gimple_call_arg (stmt, 1)) == REAL_CST
4242 && real_equal
4243 (&TREE_REAL_CST (gimple_call_arg (stmt, 1)),
4244 &dconst2)
4245 && convert_mult_to_fma (stmt,
4246 gimple_call_arg (stmt, 0),
4247 gimple_call_arg (stmt, 0)))
4248 {
4249 unlink_stmt_vdef (stmt);
4250 if (gsi_remove (&gsi, true)
4251 && gimple_purge_dead_eh_edges (bb))
4252 cfg_changed = true;
4253 release_defs (stmt);
4254 continue;
4255 }
4256 break;
4257
4258 default:;
4259 }
4260 }
4261 }
4262 gsi_next (&gsi);
4263 }
4264 }
4265
4266 statistics_counter_event (fun, "widening multiplications inserted",
4267 widen_mul_stats.widen_mults_inserted);
4268 statistics_counter_event (fun, "widening maccs inserted",
4269 widen_mul_stats.maccs_inserted);
4270 statistics_counter_event (fun, "fused multiply-adds inserted",
4271 widen_mul_stats.fmas_inserted);
4272 statistics_counter_event (fun, "divmod calls inserted",
4273 widen_mul_stats.divmod_calls_inserted);
4274
4275 return cfg_changed ? TODO_cleanup_cfg : 0;
4276 }
4277
4278 } // anon namespace
4279
4280 gimple_opt_pass *
4281 make_pass_optimize_widening_mul (gcc::context *ctxt)
4282 {
4283 return new pass_optimize_widening_mul (ctxt);
4284 }