]> git.ipfire.org Git - thirdparty/gcc.git/blob - gcc/tree-ssa-math-opts.c
Widening optab cleanup
[thirdparty/gcc.git] / gcc / tree-ssa-math-opts.c
1 /* Global, SSA-based optimizations using mathematical identities.
2 Copyright (C) 2005-2017 Free Software Foundation, Inc.
3
4 This file is part of GCC.
5
6 GCC is free software; you can redistribute it and/or modify it
7 under the terms of the GNU General Public License as published by the
8 Free Software Foundation; either version 3, or (at your option) any
9 later version.
10
11 GCC is distributed in the hope that it will be useful, but WITHOUT
12 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14 for more details.
15
16 You should have received a copy of the GNU General Public License
17 along with GCC; see the file COPYING3. If not see
18 <http://www.gnu.org/licenses/>. */
19
20 /* Currently, the only mini-pass in this file tries to CSE reciprocal
21 operations. These are common in sequences such as this one:
22
23 modulus = sqrt(x*x + y*y + z*z);
24 x = x / modulus;
25 y = y / modulus;
26 z = z / modulus;
27
28 that can be optimized to
29
30 modulus = sqrt(x*x + y*y + z*z);
31 rmodulus = 1.0 / modulus;
32 x = x * rmodulus;
33 y = y * rmodulus;
34 z = z * rmodulus;
35
36 We do this for loop invariant divisors, and with this pass whenever
37 we notice that a division has the same divisor multiple times.
38
39 Of course, like in PRE, we don't insert a division if a dominator
40 already has one. However, this cannot be done as an extension of
41 PRE for several reasons.
42
43 First of all, with some experiments it was found out that the
44 transformation is not always useful if there are only two divisions
45 by the same divisor. This is probably because modern processors
46 can pipeline the divisions; on older, in-order processors it should
47 still be effective to optimize two divisions by the same number.
48 We make this a param, and it shall be called N in the remainder of
49 this comment.
50
51 Second, if trapping math is active, we have less freedom on where
52 to insert divisions: we can only do so in basic blocks that already
53 contain one. (If divisions don't trap, instead, we can insert
54 divisions elsewhere, which will be in blocks that are common dominators
55 of those that have the division).
56
57 We really don't want to compute the reciprocal unless a division will
58 be found. To do this, we won't insert the division in a basic block
59 that has less than N divisions *post-dominating* it.
60
61 The algorithm constructs a subset of the dominator tree, holding the
62 blocks containing the divisions and the common dominators to them,
63 and walk it twice. The first walk is in post-order, and it annotates
64 each block with the number of divisions that post-dominate it: this
65 gives information on where divisions can be inserted profitably.
66 The second walk is in pre-order, and it inserts divisions as explained
67 above, and replaces divisions by multiplications.
68
69 In the best case, the cost of the pass is O(n_statements). In the
70 worst-case, the cost is due to creating the dominator tree subset,
71 with a cost of O(n_basic_blocks ^ 2); however this can only happen
72 for n_statements / n_basic_blocks statements. So, the amortized cost
73 of creating the dominator tree subset is O(n_basic_blocks) and the
74 worst-case cost of the pass is O(n_statements * n_basic_blocks).
75
76 More practically, the cost will be small because there are few
77 divisions, and they tend to be in the same basic block, so insert_bb
78 is called very few times.
79
80 If we did this using domwalk.c, an efficient implementation would have
81 to work on all the variables in a single pass, because we could not
82 work on just a subset of the dominator tree, as we do now, and the
83 cost would also be something like O(n_statements * n_basic_blocks).
84 The data structures would be more complex in order to work on all the
85 variables in a single pass. */
86
87 #include "config.h"
88 #include "system.h"
89 #include "coretypes.h"
90 #include "backend.h"
91 #include "target.h"
92 #include "rtl.h"
93 #include "tree.h"
94 #include "gimple.h"
95 #include "predict.h"
96 #include "alloc-pool.h"
97 #include "tree-pass.h"
98 #include "ssa.h"
99 #include "optabs-tree.h"
100 #include "gimple-pretty-print.h"
101 #include "alias.h"
102 #include "fold-const.h"
103 #include "gimple-fold.h"
104 #include "gimple-iterator.h"
105 #include "gimplify.h"
106 #include "gimplify-me.h"
107 #include "stor-layout.h"
108 #include "tree-cfg.h"
109 #include "tree-dfa.h"
110 #include "tree-ssa.h"
111 #include "builtins.h"
112 #include "params.h"
113 #include "internal-fn.h"
114 #include "case-cfn-macros.h"
115 #include "optabs-libfuncs.h"
116 #include "tree-eh.h"
117 #include "targhooks.h"
118
119 /* This structure represents one basic block that either computes a
120 division, or is a common dominator for basic block that compute a
121 division. */
122 struct occurrence {
123 /* The basic block represented by this structure. */
124 basic_block bb;
125
126 /* If non-NULL, the SSA_NAME holding the definition for a reciprocal
127 inserted in BB. */
128 tree recip_def;
129
130 /* If non-NULL, the GIMPLE_ASSIGN for a reciprocal computation that
131 was inserted in BB. */
132 gimple *recip_def_stmt;
133
134 /* Pointer to a list of "struct occurrence"s for blocks dominated
135 by BB. */
136 struct occurrence *children;
137
138 /* Pointer to the next "struct occurrence"s in the list of blocks
139 sharing a common dominator. */
140 struct occurrence *next;
141
142 /* The number of divisions that are in BB before compute_merit. The
143 number of divisions that are in BB or post-dominate it after
144 compute_merit. */
145 int num_divisions;
146
147 /* True if the basic block has a division, false if it is a common
148 dominator for basic blocks that do. If it is false and trapping
149 math is active, BB is not a candidate for inserting a reciprocal. */
150 bool bb_has_division;
151 };
152
153 static struct
154 {
155 /* Number of 1.0/X ops inserted. */
156 int rdivs_inserted;
157
158 /* Number of 1.0/FUNC ops inserted. */
159 int rfuncs_inserted;
160 } reciprocal_stats;
161
162 static struct
163 {
164 /* Number of cexpi calls inserted. */
165 int inserted;
166 } sincos_stats;
167
168 static struct
169 {
170 /* Number of hand-written 16-bit nop / bswaps found. */
171 int found_16bit;
172
173 /* Number of hand-written 32-bit nop / bswaps found. */
174 int found_32bit;
175
176 /* Number of hand-written 64-bit nop / bswaps found. */
177 int found_64bit;
178 } nop_stats, bswap_stats;
179
180 static struct
181 {
182 /* Number of widening multiplication ops inserted. */
183 int widen_mults_inserted;
184
185 /* Number of integer multiply-and-accumulate ops inserted. */
186 int maccs_inserted;
187
188 /* Number of fp fused multiply-add ops inserted. */
189 int fmas_inserted;
190
191 /* Number of divmod calls inserted. */
192 int divmod_calls_inserted;
193 } widen_mul_stats;
194
195 /* The instance of "struct occurrence" representing the highest
196 interesting block in the dominator tree. */
197 static struct occurrence *occ_head;
198
199 /* Allocation pool for getting instances of "struct occurrence". */
200 static object_allocator<occurrence> *occ_pool;
201
202
203
204 /* Allocate and return a new struct occurrence for basic block BB, and
205 whose children list is headed by CHILDREN. */
206 static struct occurrence *
207 occ_new (basic_block bb, struct occurrence *children)
208 {
209 struct occurrence *occ;
210
211 bb->aux = occ = occ_pool->allocate ();
212 memset (occ, 0, sizeof (struct occurrence));
213
214 occ->bb = bb;
215 occ->children = children;
216 return occ;
217 }
218
219
220 /* Insert NEW_OCC into our subset of the dominator tree. P_HEAD points to a
221 list of "struct occurrence"s, one per basic block, having IDOM as
222 their common dominator.
223
224 We try to insert NEW_OCC as deep as possible in the tree, and we also
225 insert any other block that is a common dominator for BB and one
226 block already in the tree. */
227
228 static void
229 insert_bb (struct occurrence *new_occ, basic_block idom,
230 struct occurrence **p_head)
231 {
232 struct occurrence *occ, **p_occ;
233
234 for (p_occ = p_head; (occ = *p_occ) != NULL; )
235 {
236 basic_block bb = new_occ->bb, occ_bb = occ->bb;
237 basic_block dom = nearest_common_dominator (CDI_DOMINATORS, occ_bb, bb);
238 if (dom == bb)
239 {
240 /* BB dominates OCC_BB. OCC becomes NEW_OCC's child: remove OCC
241 from its list. */
242 *p_occ = occ->next;
243 occ->next = new_occ->children;
244 new_occ->children = occ;
245
246 /* Try the next block (it may as well be dominated by BB). */
247 }
248
249 else if (dom == occ_bb)
250 {
251 /* OCC_BB dominates BB. Tail recurse to look deeper. */
252 insert_bb (new_occ, dom, &occ->children);
253 return;
254 }
255
256 else if (dom != idom)
257 {
258 gcc_assert (!dom->aux);
259
260 /* There is a dominator between IDOM and BB, add it and make
261 two children out of NEW_OCC and OCC. First, remove OCC from
262 its list. */
263 *p_occ = occ->next;
264 new_occ->next = occ;
265 occ->next = NULL;
266
267 /* None of the previous blocks has DOM as a dominator: if we tail
268 recursed, we would reexamine them uselessly. Just switch BB with
269 DOM, and go on looking for blocks dominated by DOM. */
270 new_occ = occ_new (dom, new_occ);
271 }
272
273 else
274 {
275 /* Nothing special, go on with the next element. */
276 p_occ = &occ->next;
277 }
278 }
279
280 /* No place was found as a child of IDOM. Make BB a sibling of IDOM. */
281 new_occ->next = *p_head;
282 *p_head = new_occ;
283 }
284
285 /* Register that we found a division in BB. */
286
287 static inline void
288 register_division_in (basic_block bb)
289 {
290 struct occurrence *occ;
291
292 occ = (struct occurrence *) bb->aux;
293 if (!occ)
294 {
295 occ = occ_new (bb, NULL);
296 insert_bb (occ, ENTRY_BLOCK_PTR_FOR_FN (cfun), &occ_head);
297 }
298
299 occ->bb_has_division = true;
300 occ->num_divisions++;
301 }
302
303
304 /* Compute the number of divisions that postdominate each block in OCC and
305 its children. */
306
307 static void
308 compute_merit (struct occurrence *occ)
309 {
310 struct occurrence *occ_child;
311 basic_block dom = occ->bb;
312
313 for (occ_child = occ->children; occ_child; occ_child = occ_child->next)
314 {
315 basic_block bb;
316 if (occ_child->children)
317 compute_merit (occ_child);
318
319 if (flag_exceptions)
320 bb = single_noncomplex_succ (dom);
321 else
322 bb = dom;
323
324 if (dominated_by_p (CDI_POST_DOMINATORS, bb, occ_child->bb))
325 occ->num_divisions += occ_child->num_divisions;
326 }
327 }
328
329
330 /* Return whether USE_STMT is a floating-point division by DEF. */
331 static inline bool
332 is_division_by (gimple *use_stmt, tree def)
333 {
334 return is_gimple_assign (use_stmt)
335 && gimple_assign_rhs_code (use_stmt) == RDIV_EXPR
336 && gimple_assign_rhs2 (use_stmt) == def
337 /* Do not recognize x / x as valid division, as we are getting
338 confused later by replacing all immediate uses x in such
339 a stmt. */
340 && gimple_assign_rhs1 (use_stmt) != def;
341 }
342
343 /* Walk the subset of the dominator tree rooted at OCC, setting the
344 RECIP_DEF field to a definition of 1.0 / DEF that can be used in
345 the given basic block. The field may be left NULL, of course,
346 if it is not possible or profitable to do the optimization.
347
348 DEF_BSI is an iterator pointing at the statement defining DEF.
349 If RECIP_DEF is set, a dominator already has a computation that can
350 be used. */
351
352 static void
353 insert_reciprocals (gimple_stmt_iterator *def_gsi, struct occurrence *occ,
354 tree def, tree recip_def, int threshold)
355 {
356 tree type;
357 gassign *new_stmt;
358 gimple_stmt_iterator gsi;
359 struct occurrence *occ_child;
360
361 if (!recip_def
362 && (occ->bb_has_division || !flag_trapping_math)
363 && occ->num_divisions >= threshold)
364 {
365 /* Make a variable with the replacement and substitute it. */
366 type = TREE_TYPE (def);
367 recip_def = create_tmp_reg (type, "reciptmp");
368 new_stmt = gimple_build_assign (recip_def, RDIV_EXPR,
369 build_one_cst (type), def);
370
371 if (occ->bb_has_division)
372 {
373 /* Case 1: insert before an existing division. */
374 gsi = gsi_after_labels (occ->bb);
375 while (!gsi_end_p (gsi) && !is_division_by (gsi_stmt (gsi), def))
376 gsi_next (&gsi);
377
378 gsi_insert_before (&gsi, new_stmt, GSI_SAME_STMT);
379 }
380 else if (def_gsi && occ->bb == def_gsi->bb)
381 {
382 /* Case 2: insert right after the definition. Note that this will
383 never happen if the definition statement can throw, because in
384 that case the sole successor of the statement's basic block will
385 dominate all the uses as well. */
386 gsi_insert_after (def_gsi, new_stmt, GSI_NEW_STMT);
387 }
388 else
389 {
390 /* Case 3: insert in a basic block not containing defs/uses. */
391 gsi = gsi_after_labels (occ->bb);
392 gsi_insert_before (&gsi, new_stmt, GSI_SAME_STMT);
393 }
394
395 reciprocal_stats.rdivs_inserted++;
396
397 occ->recip_def_stmt = new_stmt;
398 }
399
400 occ->recip_def = recip_def;
401 for (occ_child = occ->children; occ_child; occ_child = occ_child->next)
402 insert_reciprocals (def_gsi, occ_child, def, recip_def, threshold);
403 }
404
405
406 /* Replace the division at USE_P with a multiplication by the reciprocal, if
407 possible. */
408
409 static inline void
410 replace_reciprocal (use_operand_p use_p)
411 {
412 gimple *use_stmt = USE_STMT (use_p);
413 basic_block bb = gimple_bb (use_stmt);
414 struct occurrence *occ = (struct occurrence *) bb->aux;
415
416 if (optimize_bb_for_speed_p (bb)
417 && occ->recip_def && use_stmt != occ->recip_def_stmt)
418 {
419 gimple_stmt_iterator gsi = gsi_for_stmt (use_stmt);
420 gimple_assign_set_rhs_code (use_stmt, MULT_EXPR);
421 SET_USE (use_p, occ->recip_def);
422 fold_stmt_inplace (&gsi);
423 update_stmt (use_stmt);
424 }
425 }
426
427
428 /* Free OCC and return one more "struct occurrence" to be freed. */
429
430 static struct occurrence *
431 free_bb (struct occurrence *occ)
432 {
433 struct occurrence *child, *next;
434
435 /* First get the two pointers hanging off OCC. */
436 next = occ->next;
437 child = occ->children;
438 occ->bb->aux = NULL;
439 occ_pool->remove (occ);
440
441 /* Now ensure that we don't recurse unless it is necessary. */
442 if (!child)
443 return next;
444 else
445 {
446 while (next)
447 next = free_bb (next);
448
449 return child;
450 }
451 }
452
453
454 /* Look for floating-point divisions among DEF's uses, and try to
455 replace them by multiplications with the reciprocal. Add
456 as many statements computing the reciprocal as needed.
457
458 DEF must be a GIMPLE register of a floating-point type. */
459
460 static void
461 execute_cse_reciprocals_1 (gimple_stmt_iterator *def_gsi, tree def)
462 {
463 use_operand_p use_p;
464 imm_use_iterator use_iter;
465 struct occurrence *occ;
466 int count = 0, threshold;
467
468 gcc_assert (FLOAT_TYPE_P (TREE_TYPE (def)) && is_gimple_reg (def));
469
470 FOR_EACH_IMM_USE_FAST (use_p, use_iter, def)
471 {
472 gimple *use_stmt = USE_STMT (use_p);
473 if (is_division_by (use_stmt, def))
474 {
475 register_division_in (gimple_bb (use_stmt));
476 count++;
477 }
478 }
479
480 /* Do the expensive part only if we can hope to optimize something. */
481 threshold = targetm.min_divisions_for_recip_mul (TYPE_MODE (TREE_TYPE (def)));
482 if (count >= threshold)
483 {
484 gimple *use_stmt;
485 for (occ = occ_head; occ; occ = occ->next)
486 {
487 compute_merit (occ);
488 insert_reciprocals (def_gsi, occ, def, NULL, threshold);
489 }
490
491 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, def)
492 {
493 if (is_division_by (use_stmt, def))
494 {
495 FOR_EACH_IMM_USE_ON_STMT (use_p, use_iter)
496 replace_reciprocal (use_p);
497 }
498 }
499 }
500
501 for (occ = occ_head; occ; )
502 occ = free_bb (occ);
503
504 occ_head = NULL;
505 }
506
507 /* Return an internal function that implements the reciprocal of CALL,
508 or IFN_LAST if there is no such function that the target supports. */
509
510 internal_fn
511 internal_fn_reciprocal (gcall *call)
512 {
513 internal_fn ifn;
514
515 switch (gimple_call_combined_fn (call))
516 {
517 CASE_CFN_SQRT:
518 CASE_CFN_SQRT_FN:
519 ifn = IFN_RSQRT;
520 break;
521
522 default:
523 return IFN_LAST;
524 }
525
526 tree_pair types = direct_internal_fn_types (ifn, call);
527 if (!direct_internal_fn_supported_p (ifn, types, OPTIMIZE_FOR_SPEED))
528 return IFN_LAST;
529
530 return ifn;
531 }
532
533 /* Go through all the floating-point SSA_NAMEs, and call
534 execute_cse_reciprocals_1 on each of them. */
535 namespace {
536
537 const pass_data pass_data_cse_reciprocals =
538 {
539 GIMPLE_PASS, /* type */
540 "recip", /* name */
541 OPTGROUP_NONE, /* optinfo_flags */
542 TV_NONE, /* tv_id */
543 PROP_ssa, /* properties_required */
544 0, /* properties_provided */
545 0, /* properties_destroyed */
546 0, /* todo_flags_start */
547 TODO_update_ssa, /* todo_flags_finish */
548 };
549
550 class pass_cse_reciprocals : public gimple_opt_pass
551 {
552 public:
553 pass_cse_reciprocals (gcc::context *ctxt)
554 : gimple_opt_pass (pass_data_cse_reciprocals, ctxt)
555 {}
556
557 /* opt_pass methods: */
558 virtual bool gate (function *) { return optimize && flag_reciprocal_math; }
559 virtual unsigned int execute (function *);
560
561 }; // class pass_cse_reciprocals
562
563 unsigned int
564 pass_cse_reciprocals::execute (function *fun)
565 {
566 basic_block bb;
567 tree arg;
568
569 occ_pool = new object_allocator<occurrence> ("dominators for recip");
570
571 memset (&reciprocal_stats, 0, sizeof (reciprocal_stats));
572 calculate_dominance_info (CDI_DOMINATORS);
573 calculate_dominance_info (CDI_POST_DOMINATORS);
574
575 if (flag_checking)
576 FOR_EACH_BB_FN (bb, fun)
577 gcc_assert (!bb->aux);
578
579 for (arg = DECL_ARGUMENTS (fun->decl); arg; arg = DECL_CHAIN (arg))
580 if (FLOAT_TYPE_P (TREE_TYPE (arg))
581 && is_gimple_reg (arg))
582 {
583 tree name = ssa_default_def (fun, arg);
584 if (name)
585 execute_cse_reciprocals_1 (NULL, name);
586 }
587
588 FOR_EACH_BB_FN (bb, fun)
589 {
590 tree def;
591
592 for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
593 gsi_next (&gsi))
594 {
595 gphi *phi = gsi.phi ();
596 def = PHI_RESULT (phi);
597 if (! virtual_operand_p (def)
598 && FLOAT_TYPE_P (TREE_TYPE (def)))
599 execute_cse_reciprocals_1 (NULL, def);
600 }
601
602 for (gimple_stmt_iterator gsi = gsi_after_labels (bb); !gsi_end_p (gsi);
603 gsi_next (&gsi))
604 {
605 gimple *stmt = gsi_stmt (gsi);
606
607 if (gimple_has_lhs (stmt)
608 && (def = SINGLE_SSA_TREE_OPERAND (stmt, SSA_OP_DEF)) != NULL
609 && FLOAT_TYPE_P (TREE_TYPE (def))
610 && TREE_CODE (def) == SSA_NAME)
611 execute_cse_reciprocals_1 (&gsi, def);
612 }
613
614 if (optimize_bb_for_size_p (bb))
615 continue;
616
617 /* Scan for a/func(b) and convert it to reciprocal a*rfunc(b). */
618 for (gimple_stmt_iterator gsi = gsi_after_labels (bb); !gsi_end_p (gsi);
619 gsi_next (&gsi))
620 {
621 gimple *stmt = gsi_stmt (gsi);
622
623 if (is_gimple_assign (stmt)
624 && gimple_assign_rhs_code (stmt) == RDIV_EXPR)
625 {
626 tree arg1 = gimple_assign_rhs2 (stmt);
627 gimple *stmt1;
628
629 if (TREE_CODE (arg1) != SSA_NAME)
630 continue;
631
632 stmt1 = SSA_NAME_DEF_STMT (arg1);
633
634 if (is_gimple_call (stmt1)
635 && gimple_call_lhs (stmt1))
636 {
637 bool fail;
638 imm_use_iterator ui;
639 use_operand_p use_p;
640 tree fndecl = NULL_TREE;
641
642 gcall *call = as_a <gcall *> (stmt1);
643 internal_fn ifn = internal_fn_reciprocal (call);
644 if (ifn == IFN_LAST)
645 {
646 fndecl = gimple_call_fndecl (call);
647 if (!fndecl
648 || DECL_BUILT_IN_CLASS (fndecl) != BUILT_IN_MD)
649 continue;
650 fndecl = targetm.builtin_reciprocal (fndecl);
651 if (!fndecl)
652 continue;
653 }
654
655 /* Check that all uses of the SSA name are divisions,
656 otherwise replacing the defining statement will do
657 the wrong thing. */
658 fail = false;
659 FOR_EACH_IMM_USE_FAST (use_p, ui, arg1)
660 {
661 gimple *stmt2 = USE_STMT (use_p);
662 if (is_gimple_debug (stmt2))
663 continue;
664 if (!is_gimple_assign (stmt2)
665 || gimple_assign_rhs_code (stmt2) != RDIV_EXPR
666 || gimple_assign_rhs1 (stmt2) == arg1
667 || gimple_assign_rhs2 (stmt2) != arg1)
668 {
669 fail = true;
670 break;
671 }
672 }
673 if (fail)
674 continue;
675
676 gimple_replace_ssa_lhs (call, arg1);
677 if (gimple_call_internal_p (call) != (ifn != IFN_LAST))
678 {
679 auto_vec<tree, 4> args;
680 for (unsigned int i = 0;
681 i < gimple_call_num_args (call); i++)
682 args.safe_push (gimple_call_arg (call, i));
683 gcall *stmt2;
684 if (ifn == IFN_LAST)
685 stmt2 = gimple_build_call_vec (fndecl, args);
686 else
687 stmt2 = gimple_build_call_internal_vec (ifn, args);
688 gimple_call_set_lhs (stmt2, arg1);
689 if (gimple_vdef (call))
690 {
691 gimple_set_vdef (stmt2, gimple_vdef (call));
692 SSA_NAME_DEF_STMT (gimple_vdef (stmt2)) = stmt2;
693 }
694 gimple_call_set_nothrow (stmt2,
695 gimple_call_nothrow_p (call));
696 gimple_set_vuse (stmt2, gimple_vuse (call));
697 gimple_stmt_iterator gsi2 = gsi_for_stmt (call);
698 gsi_replace (&gsi2, stmt2, true);
699 }
700 else
701 {
702 if (ifn == IFN_LAST)
703 gimple_call_set_fndecl (call, fndecl);
704 else
705 gimple_call_set_internal_fn (call, ifn);
706 update_stmt (call);
707 }
708 reciprocal_stats.rfuncs_inserted++;
709
710 FOR_EACH_IMM_USE_STMT (stmt, ui, arg1)
711 {
712 gimple_stmt_iterator gsi = gsi_for_stmt (stmt);
713 gimple_assign_set_rhs_code (stmt, MULT_EXPR);
714 fold_stmt_inplace (&gsi);
715 update_stmt (stmt);
716 }
717 }
718 }
719 }
720 }
721
722 statistics_counter_event (fun, "reciprocal divs inserted",
723 reciprocal_stats.rdivs_inserted);
724 statistics_counter_event (fun, "reciprocal functions inserted",
725 reciprocal_stats.rfuncs_inserted);
726
727 free_dominance_info (CDI_DOMINATORS);
728 free_dominance_info (CDI_POST_DOMINATORS);
729 delete occ_pool;
730 return 0;
731 }
732
733 } // anon namespace
734
735 gimple_opt_pass *
736 make_pass_cse_reciprocals (gcc::context *ctxt)
737 {
738 return new pass_cse_reciprocals (ctxt);
739 }
740
741 /* Records an occurrence at statement USE_STMT in the vector of trees
742 STMTS if it is dominated by *TOP_BB or dominates it or this basic block
743 is not yet initialized. Returns true if the occurrence was pushed on
744 the vector. Adjusts *TOP_BB to be the basic block dominating all
745 statements in the vector. */
746
747 static bool
748 maybe_record_sincos (vec<gimple *> *stmts,
749 basic_block *top_bb, gimple *use_stmt)
750 {
751 basic_block use_bb = gimple_bb (use_stmt);
752 if (*top_bb
753 && (*top_bb == use_bb
754 || dominated_by_p (CDI_DOMINATORS, use_bb, *top_bb)))
755 stmts->safe_push (use_stmt);
756 else if (!*top_bb
757 || dominated_by_p (CDI_DOMINATORS, *top_bb, use_bb))
758 {
759 stmts->safe_push (use_stmt);
760 *top_bb = use_bb;
761 }
762 else
763 return false;
764
765 return true;
766 }
767
768 /* Look for sin, cos and cexpi calls with the same argument NAME and
769 create a single call to cexpi CSEing the result in this case.
770 We first walk over all immediate uses of the argument collecting
771 statements that we can CSE in a vector and in a second pass replace
772 the statement rhs with a REALPART or IMAGPART expression on the
773 result of the cexpi call we insert before the use statement that
774 dominates all other candidates. */
775
776 static bool
777 execute_cse_sincos_1 (tree name)
778 {
779 gimple_stmt_iterator gsi;
780 imm_use_iterator use_iter;
781 tree fndecl, res, type;
782 gimple *def_stmt, *use_stmt, *stmt;
783 int seen_cos = 0, seen_sin = 0, seen_cexpi = 0;
784 auto_vec<gimple *> stmts;
785 basic_block top_bb = NULL;
786 int i;
787 bool cfg_changed = false;
788
789 type = TREE_TYPE (name);
790 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, name)
791 {
792 if (gimple_code (use_stmt) != GIMPLE_CALL
793 || !gimple_call_lhs (use_stmt))
794 continue;
795
796 switch (gimple_call_combined_fn (use_stmt))
797 {
798 CASE_CFN_COS:
799 seen_cos |= maybe_record_sincos (&stmts, &top_bb, use_stmt) ? 1 : 0;
800 break;
801
802 CASE_CFN_SIN:
803 seen_sin |= maybe_record_sincos (&stmts, &top_bb, use_stmt) ? 1 : 0;
804 break;
805
806 CASE_CFN_CEXPI:
807 seen_cexpi |= maybe_record_sincos (&stmts, &top_bb, use_stmt) ? 1 : 0;
808 break;
809
810 default:;
811 }
812 }
813
814 if (seen_cos + seen_sin + seen_cexpi <= 1)
815 return false;
816
817 /* Simply insert cexpi at the beginning of top_bb but not earlier than
818 the name def statement. */
819 fndecl = mathfn_built_in (type, BUILT_IN_CEXPI);
820 if (!fndecl)
821 return false;
822 stmt = gimple_build_call (fndecl, 1, name);
823 res = make_temp_ssa_name (TREE_TYPE (TREE_TYPE (fndecl)), stmt, "sincostmp");
824 gimple_call_set_lhs (stmt, res);
825
826 def_stmt = SSA_NAME_DEF_STMT (name);
827 if (!SSA_NAME_IS_DEFAULT_DEF (name)
828 && gimple_code (def_stmt) != GIMPLE_PHI
829 && gimple_bb (def_stmt) == top_bb)
830 {
831 gsi = gsi_for_stmt (def_stmt);
832 gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
833 }
834 else
835 {
836 gsi = gsi_after_labels (top_bb);
837 gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
838 }
839 sincos_stats.inserted++;
840
841 /* And adjust the recorded old call sites. */
842 for (i = 0; stmts.iterate (i, &use_stmt); ++i)
843 {
844 tree rhs = NULL;
845
846 switch (gimple_call_combined_fn (use_stmt))
847 {
848 CASE_CFN_COS:
849 rhs = fold_build1 (REALPART_EXPR, type, res);
850 break;
851
852 CASE_CFN_SIN:
853 rhs = fold_build1 (IMAGPART_EXPR, type, res);
854 break;
855
856 CASE_CFN_CEXPI:
857 rhs = res;
858 break;
859
860 default:;
861 gcc_unreachable ();
862 }
863
864 /* Replace call with a copy. */
865 stmt = gimple_build_assign (gimple_call_lhs (use_stmt), rhs);
866
867 gsi = gsi_for_stmt (use_stmt);
868 gsi_replace (&gsi, stmt, true);
869 if (gimple_purge_dead_eh_edges (gimple_bb (stmt)))
870 cfg_changed = true;
871 }
872
873 return cfg_changed;
874 }
875
876 /* To evaluate powi(x,n), the floating point value x raised to the
877 constant integer exponent n, we use a hybrid algorithm that
878 combines the "window method" with look-up tables. For an
879 introduction to exponentiation algorithms and "addition chains",
880 see section 4.6.3, "Evaluation of Powers" of Donald E. Knuth,
881 "Seminumerical Algorithms", Vol. 2, "The Art of Computer Programming",
882 3rd Edition, 1998, and Daniel M. Gordon, "A Survey of Fast Exponentiation
883 Methods", Journal of Algorithms, Vol. 27, pp. 129-146, 1998. */
884
885 /* Provide a default value for POWI_MAX_MULTS, the maximum number of
886 multiplications to inline before calling the system library's pow
887 function. powi(x,n) requires at worst 2*bits(n)-2 multiplications,
888 so this default never requires calling pow, powf or powl. */
889
890 #ifndef POWI_MAX_MULTS
891 #define POWI_MAX_MULTS (2*HOST_BITS_PER_WIDE_INT-2)
892 #endif
893
894 /* The size of the "optimal power tree" lookup table. All
895 exponents less than this value are simply looked up in the
896 powi_table below. This threshold is also used to size the
897 cache of pseudo registers that hold intermediate results. */
898 #define POWI_TABLE_SIZE 256
899
900 /* The size, in bits of the window, used in the "window method"
901 exponentiation algorithm. This is equivalent to a radix of
902 (1<<POWI_WINDOW_SIZE) in the corresponding "m-ary method". */
903 #define POWI_WINDOW_SIZE 3
904
905 /* The following table is an efficient representation of an
906 "optimal power tree". For each value, i, the corresponding
907 value, j, in the table states than an optimal evaluation
908 sequence for calculating pow(x,i) can be found by evaluating
909 pow(x,j)*pow(x,i-j). An optimal power tree for the first
910 100 integers is given in Knuth's "Seminumerical algorithms". */
911
912 static const unsigned char powi_table[POWI_TABLE_SIZE] =
913 {
914 0, 1, 1, 2, 2, 3, 3, 4, /* 0 - 7 */
915 4, 6, 5, 6, 6, 10, 7, 9, /* 8 - 15 */
916 8, 16, 9, 16, 10, 12, 11, 13, /* 16 - 23 */
917 12, 17, 13, 18, 14, 24, 15, 26, /* 24 - 31 */
918 16, 17, 17, 19, 18, 33, 19, 26, /* 32 - 39 */
919 20, 25, 21, 40, 22, 27, 23, 44, /* 40 - 47 */
920 24, 32, 25, 34, 26, 29, 27, 44, /* 48 - 55 */
921 28, 31, 29, 34, 30, 60, 31, 36, /* 56 - 63 */
922 32, 64, 33, 34, 34, 46, 35, 37, /* 64 - 71 */
923 36, 65, 37, 50, 38, 48, 39, 69, /* 72 - 79 */
924 40, 49, 41, 43, 42, 51, 43, 58, /* 80 - 87 */
925 44, 64, 45, 47, 46, 59, 47, 76, /* 88 - 95 */
926 48, 65, 49, 66, 50, 67, 51, 66, /* 96 - 103 */
927 52, 70, 53, 74, 54, 104, 55, 74, /* 104 - 111 */
928 56, 64, 57, 69, 58, 78, 59, 68, /* 112 - 119 */
929 60, 61, 61, 80, 62, 75, 63, 68, /* 120 - 127 */
930 64, 65, 65, 128, 66, 129, 67, 90, /* 128 - 135 */
931 68, 73, 69, 131, 70, 94, 71, 88, /* 136 - 143 */
932 72, 128, 73, 98, 74, 132, 75, 121, /* 144 - 151 */
933 76, 102, 77, 124, 78, 132, 79, 106, /* 152 - 159 */
934 80, 97, 81, 160, 82, 99, 83, 134, /* 160 - 167 */
935 84, 86, 85, 95, 86, 160, 87, 100, /* 168 - 175 */
936 88, 113, 89, 98, 90, 107, 91, 122, /* 176 - 183 */
937 92, 111, 93, 102, 94, 126, 95, 150, /* 184 - 191 */
938 96, 128, 97, 130, 98, 133, 99, 195, /* 192 - 199 */
939 100, 128, 101, 123, 102, 164, 103, 138, /* 200 - 207 */
940 104, 145, 105, 146, 106, 109, 107, 149, /* 208 - 215 */
941 108, 200, 109, 146, 110, 170, 111, 157, /* 216 - 223 */
942 112, 128, 113, 130, 114, 182, 115, 132, /* 224 - 231 */
943 116, 200, 117, 132, 118, 158, 119, 206, /* 232 - 239 */
944 120, 240, 121, 162, 122, 147, 123, 152, /* 240 - 247 */
945 124, 166, 125, 214, 126, 138, 127, 153, /* 248 - 255 */
946 };
947
948
949 /* Return the number of multiplications required to calculate
950 powi(x,n) where n is less than POWI_TABLE_SIZE. This is a
951 subroutine of powi_cost. CACHE is an array indicating
952 which exponents have already been calculated. */
953
954 static int
955 powi_lookup_cost (unsigned HOST_WIDE_INT n, bool *cache)
956 {
957 /* If we've already calculated this exponent, then this evaluation
958 doesn't require any additional multiplications. */
959 if (cache[n])
960 return 0;
961
962 cache[n] = true;
963 return powi_lookup_cost (n - powi_table[n], cache)
964 + powi_lookup_cost (powi_table[n], cache) + 1;
965 }
966
967 /* Return the number of multiplications required to calculate
968 powi(x,n) for an arbitrary x, given the exponent N. This
969 function needs to be kept in sync with powi_as_mults below. */
970
971 static int
972 powi_cost (HOST_WIDE_INT n)
973 {
974 bool cache[POWI_TABLE_SIZE];
975 unsigned HOST_WIDE_INT digit;
976 unsigned HOST_WIDE_INT val;
977 int result;
978
979 if (n == 0)
980 return 0;
981
982 /* Ignore the reciprocal when calculating the cost. */
983 val = (n < 0) ? -n : n;
984
985 /* Initialize the exponent cache. */
986 memset (cache, 0, POWI_TABLE_SIZE * sizeof (bool));
987 cache[1] = true;
988
989 result = 0;
990
991 while (val >= POWI_TABLE_SIZE)
992 {
993 if (val & 1)
994 {
995 digit = val & ((1 << POWI_WINDOW_SIZE) - 1);
996 result += powi_lookup_cost (digit, cache)
997 + POWI_WINDOW_SIZE + 1;
998 val >>= POWI_WINDOW_SIZE;
999 }
1000 else
1001 {
1002 val >>= 1;
1003 result++;
1004 }
1005 }
1006
1007 return result + powi_lookup_cost (val, cache);
1008 }
1009
1010 /* Recursive subroutine of powi_as_mults. This function takes the
1011 array, CACHE, of already calculated exponents and an exponent N and
1012 returns a tree that corresponds to CACHE[1]**N, with type TYPE. */
1013
1014 static tree
1015 powi_as_mults_1 (gimple_stmt_iterator *gsi, location_t loc, tree type,
1016 HOST_WIDE_INT n, tree *cache)
1017 {
1018 tree op0, op1, ssa_target;
1019 unsigned HOST_WIDE_INT digit;
1020 gassign *mult_stmt;
1021
1022 if (n < POWI_TABLE_SIZE && cache[n])
1023 return cache[n];
1024
1025 ssa_target = make_temp_ssa_name (type, NULL, "powmult");
1026
1027 if (n < POWI_TABLE_SIZE)
1028 {
1029 cache[n] = ssa_target;
1030 op0 = powi_as_mults_1 (gsi, loc, type, n - powi_table[n], cache);
1031 op1 = powi_as_mults_1 (gsi, loc, type, powi_table[n], cache);
1032 }
1033 else if (n & 1)
1034 {
1035 digit = n & ((1 << POWI_WINDOW_SIZE) - 1);
1036 op0 = powi_as_mults_1 (gsi, loc, type, n - digit, cache);
1037 op1 = powi_as_mults_1 (gsi, loc, type, digit, cache);
1038 }
1039 else
1040 {
1041 op0 = powi_as_mults_1 (gsi, loc, type, n >> 1, cache);
1042 op1 = op0;
1043 }
1044
1045 mult_stmt = gimple_build_assign (ssa_target, MULT_EXPR, op0, op1);
1046 gimple_set_location (mult_stmt, loc);
1047 gsi_insert_before (gsi, mult_stmt, GSI_SAME_STMT);
1048
1049 return ssa_target;
1050 }
1051
1052 /* Convert ARG0**N to a tree of multiplications of ARG0 with itself.
1053 This function needs to be kept in sync with powi_cost above. */
1054
1055 static tree
1056 powi_as_mults (gimple_stmt_iterator *gsi, location_t loc,
1057 tree arg0, HOST_WIDE_INT n)
1058 {
1059 tree cache[POWI_TABLE_SIZE], result, type = TREE_TYPE (arg0);
1060 gassign *div_stmt;
1061 tree target;
1062
1063 if (n == 0)
1064 return build_real (type, dconst1);
1065
1066 memset (cache, 0, sizeof (cache));
1067 cache[1] = arg0;
1068
1069 result = powi_as_mults_1 (gsi, loc, type, (n < 0) ? -n : n, cache);
1070 if (n >= 0)
1071 return result;
1072
1073 /* If the original exponent was negative, reciprocate the result. */
1074 target = make_temp_ssa_name (type, NULL, "powmult");
1075 div_stmt = gimple_build_assign (target, RDIV_EXPR,
1076 build_real (type, dconst1), result);
1077 gimple_set_location (div_stmt, loc);
1078 gsi_insert_before (gsi, div_stmt, GSI_SAME_STMT);
1079
1080 return target;
1081 }
1082
1083 /* ARG0 and N are the two arguments to a powi builtin in GSI with
1084 location info LOC. If the arguments are appropriate, create an
1085 equivalent sequence of statements prior to GSI using an optimal
1086 number of multiplications, and return an expession holding the
1087 result. */
1088
1089 static tree
1090 gimple_expand_builtin_powi (gimple_stmt_iterator *gsi, location_t loc,
1091 tree arg0, HOST_WIDE_INT n)
1092 {
1093 /* Avoid largest negative number. */
1094 if (n != -n
1095 && ((n >= -1 && n <= 2)
1096 || (optimize_function_for_speed_p (cfun)
1097 && powi_cost (n) <= POWI_MAX_MULTS)))
1098 return powi_as_mults (gsi, loc, arg0, n);
1099
1100 return NULL_TREE;
1101 }
1102
1103 /* Build a gimple call statement that calls FN with argument ARG.
1104 Set the lhs of the call statement to a fresh SSA name. Insert the
1105 statement prior to GSI's current position, and return the fresh
1106 SSA name. */
1107
1108 static tree
1109 build_and_insert_call (gimple_stmt_iterator *gsi, location_t loc,
1110 tree fn, tree arg)
1111 {
1112 gcall *call_stmt;
1113 tree ssa_target;
1114
1115 call_stmt = gimple_build_call (fn, 1, arg);
1116 ssa_target = make_temp_ssa_name (TREE_TYPE (arg), NULL, "powroot");
1117 gimple_set_lhs (call_stmt, ssa_target);
1118 gimple_set_location (call_stmt, loc);
1119 gsi_insert_before (gsi, call_stmt, GSI_SAME_STMT);
1120
1121 return ssa_target;
1122 }
1123
1124 /* Build a gimple binary operation with the given CODE and arguments
1125 ARG0, ARG1, assigning the result to a new SSA name for variable
1126 TARGET. Insert the statement prior to GSI's current position, and
1127 return the fresh SSA name.*/
1128
1129 static tree
1130 build_and_insert_binop (gimple_stmt_iterator *gsi, location_t loc,
1131 const char *name, enum tree_code code,
1132 tree arg0, tree arg1)
1133 {
1134 tree result = make_temp_ssa_name (TREE_TYPE (arg0), NULL, name);
1135 gassign *stmt = gimple_build_assign (result, code, arg0, arg1);
1136 gimple_set_location (stmt, loc);
1137 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
1138 return result;
1139 }
1140
1141 /* Build a gimple reference operation with the given CODE and argument
1142 ARG, assigning the result to a new SSA name of TYPE with NAME.
1143 Insert the statement prior to GSI's current position, and return
1144 the fresh SSA name. */
1145
1146 static inline tree
1147 build_and_insert_ref (gimple_stmt_iterator *gsi, location_t loc, tree type,
1148 const char *name, enum tree_code code, tree arg0)
1149 {
1150 tree result = make_temp_ssa_name (type, NULL, name);
1151 gimple *stmt = gimple_build_assign (result, build1 (code, type, arg0));
1152 gimple_set_location (stmt, loc);
1153 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
1154 return result;
1155 }
1156
1157 /* Build a gimple assignment to cast VAL to TYPE. Insert the statement
1158 prior to GSI's current position, and return the fresh SSA name. */
1159
1160 static tree
1161 build_and_insert_cast (gimple_stmt_iterator *gsi, location_t loc,
1162 tree type, tree val)
1163 {
1164 tree result = make_ssa_name (type);
1165 gassign *stmt = gimple_build_assign (result, NOP_EXPR, val);
1166 gimple_set_location (stmt, loc);
1167 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
1168 return result;
1169 }
1170
1171 struct pow_synth_sqrt_info
1172 {
1173 bool *factors;
1174 unsigned int deepest;
1175 unsigned int num_mults;
1176 };
1177
1178 /* Return true iff the real value C can be represented as a
1179 sum of powers of 0.5 up to N. That is:
1180 C == SUM<i from 1..N> (a[i]*(0.5**i)) where a[i] is either 0 or 1.
1181 Record in INFO the various parameters of the synthesis algorithm such
1182 as the factors a[i], the maximum 0.5 power and the number of
1183 multiplications that will be required. */
1184
1185 bool
1186 representable_as_half_series_p (REAL_VALUE_TYPE c, unsigned n,
1187 struct pow_synth_sqrt_info *info)
1188 {
1189 REAL_VALUE_TYPE factor = dconsthalf;
1190 REAL_VALUE_TYPE remainder = c;
1191
1192 info->deepest = 0;
1193 info->num_mults = 0;
1194 memset (info->factors, 0, n * sizeof (bool));
1195
1196 for (unsigned i = 0; i < n; i++)
1197 {
1198 REAL_VALUE_TYPE res;
1199
1200 /* If something inexact happened bail out now. */
1201 if (real_arithmetic (&res, MINUS_EXPR, &remainder, &factor))
1202 return false;
1203
1204 /* We have hit zero. The number is representable as a sum
1205 of powers of 0.5. */
1206 if (real_equal (&res, &dconst0))
1207 {
1208 info->factors[i] = true;
1209 info->deepest = i + 1;
1210 return true;
1211 }
1212 else if (!REAL_VALUE_NEGATIVE (res))
1213 {
1214 remainder = res;
1215 info->factors[i] = true;
1216 info->num_mults++;
1217 }
1218 else
1219 info->factors[i] = false;
1220
1221 real_arithmetic (&factor, MULT_EXPR, &factor, &dconsthalf);
1222 }
1223 return false;
1224 }
1225
1226 /* Return the tree corresponding to FN being applied
1227 to ARG N times at GSI and LOC.
1228 Look up previous results from CACHE if need be.
1229 cache[0] should contain just plain ARG i.e. FN applied to ARG 0 times. */
1230
1231 static tree
1232 get_fn_chain (tree arg, unsigned int n, gimple_stmt_iterator *gsi,
1233 tree fn, location_t loc, tree *cache)
1234 {
1235 tree res = cache[n];
1236 if (!res)
1237 {
1238 tree prev = get_fn_chain (arg, n - 1, gsi, fn, loc, cache);
1239 res = build_and_insert_call (gsi, loc, fn, prev);
1240 cache[n] = res;
1241 }
1242
1243 return res;
1244 }
1245
1246 /* Print to STREAM the repeated application of function FNAME to ARG
1247 N times. So, for FNAME = "foo", ARG = "x", N = 2 it would print:
1248 "foo (foo (x))". */
1249
1250 static void
1251 print_nested_fn (FILE* stream, const char *fname, const char* arg,
1252 unsigned int n)
1253 {
1254 if (n == 0)
1255 fprintf (stream, "%s", arg);
1256 else
1257 {
1258 fprintf (stream, "%s (", fname);
1259 print_nested_fn (stream, fname, arg, n - 1);
1260 fprintf (stream, ")");
1261 }
1262 }
1263
1264 /* Print to STREAM the fractional sequence of sqrt chains
1265 applied to ARG, described by INFO. Used for the dump file. */
1266
1267 static void
1268 dump_fractional_sqrt_sequence (FILE *stream, const char *arg,
1269 struct pow_synth_sqrt_info *info)
1270 {
1271 for (unsigned int i = 0; i < info->deepest; i++)
1272 {
1273 bool is_set = info->factors[i];
1274 if (is_set)
1275 {
1276 print_nested_fn (stream, "sqrt", arg, i + 1);
1277 if (i != info->deepest - 1)
1278 fprintf (stream, " * ");
1279 }
1280 }
1281 }
1282
1283 /* Print to STREAM a representation of raising ARG to an integer
1284 power N. Used for the dump file. */
1285
1286 static void
1287 dump_integer_part (FILE *stream, const char* arg, HOST_WIDE_INT n)
1288 {
1289 if (n > 1)
1290 fprintf (stream, "powi (%s, " HOST_WIDE_INT_PRINT_DEC ")", arg, n);
1291 else if (n == 1)
1292 fprintf (stream, "%s", arg);
1293 }
1294
1295 /* Attempt to synthesize a POW[F] (ARG0, ARG1) call using chains of
1296 square roots. Place at GSI and LOC. Limit the maximum depth
1297 of the sqrt chains to MAX_DEPTH. Return the tree holding the
1298 result of the expanded sequence or NULL_TREE if the expansion failed.
1299
1300 This routine assumes that ARG1 is a real number with a fractional part
1301 (the integer exponent case will have been handled earlier in
1302 gimple_expand_builtin_pow).
1303
1304 For ARG1 > 0.0:
1305 * For ARG1 composed of a whole part WHOLE_PART and a fractional part
1306 FRAC_PART i.e. WHOLE_PART == floor (ARG1) and
1307 FRAC_PART == ARG1 - WHOLE_PART:
1308 Produce POWI (ARG0, WHOLE_PART) * POW (ARG0, FRAC_PART) where
1309 POW (ARG0, FRAC_PART) is expanded as a product of square root chains
1310 if it can be expressed as such, that is if FRAC_PART satisfies:
1311 FRAC_PART == <SUM from i = 1 until MAX_DEPTH> (a[i] * (0.5**i))
1312 where integer a[i] is either 0 or 1.
1313
1314 Example:
1315 POW (x, 3.625) == POWI (x, 3) * POW (x, 0.625)
1316 --> POWI (x, 3) * SQRT (x) * SQRT (SQRT (SQRT (x)))
1317
1318 For ARG1 < 0.0 there are two approaches:
1319 * (A) Expand to 1.0 / POW (ARG0, -ARG1) where POW (ARG0, -ARG1)
1320 is calculated as above.
1321
1322 Example:
1323 POW (x, -5.625) == 1.0 / POW (x, 5.625)
1324 --> 1.0 / (POWI (x, 5) * SQRT (x) * SQRT (SQRT (SQRT (x))))
1325
1326 * (B) : WHOLE_PART := - ceil (abs (ARG1))
1327 FRAC_PART := ARG1 - WHOLE_PART
1328 and expand to POW (x, FRAC_PART) / POWI (x, WHOLE_PART).
1329 Example:
1330 POW (x, -5.875) == POW (x, 0.125) / POWI (X, 6)
1331 --> SQRT (SQRT (SQRT (x))) / (POWI (x, 6))
1332
1333 For ARG1 < 0.0 we choose between (A) and (B) depending on
1334 how many multiplications we'd have to do.
1335 So, for the example in (B): POW (x, -5.875), if we were to
1336 follow algorithm (A) we would produce:
1337 1.0 / POWI (X, 5) * SQRT (X) * SQRT (SQRT (X)) * SQRT (SQRT (SQRT (X)))
1338 which contains more multiplications than approach (B).
1339
1340 Hopefully, this approach will eliminate potentially expensive POW library
1341 calls when unsafe floating point math is enabled and allow the compiler to
1342 further optimise the multiplies, square roots and divides produced by this
1343 function. */
1344
1345 static tree
1346 expand_pow_as_sqrts (gimple_stmt_iterator *gsi, location_t loc,
1347 tree arg0, tree arg1, HOST_WIDE_INT max_depth)
1348 {
1349 tree type = TREE_TYPE (arg0);
1350 machine_mode mode = TYPE_MODE (type);
1351 tree sqrtfn = mathfn_built_in (type, BUILT_IN_SQRT);
1352 bool one_over = true;
1353
1354 if (!sqrtfn)
1355 return NULL_TREE;
1356
1357 if (TREE_CODE (arg1) != REAL_CST)
1358 return NULL_TREE;
1359
1360 REAL_VALUE_TYPE exp_init = TREE_REAL_CST (arg1);
1361
1362 gcc_assert (max_depth > 0);
1363 tree *cache = XALLOCAVEC (tree, max_depth + 1);
1364
1365 struct pow_synth_sqrt_info synth_info;
1366 synth_info.factors = XALLOCAVEC (bool, max_depth + 1);
1367 synth_info.deepest = 0;
1368 synth_info.num_mults = 0;
1369
1370 bool neg_exp = REAL_VALUE_NEGATIVE (exp_init);
1371 REAL_VALUE_TYPE exp = real_value_abs (&exp_init);
1372
1373 /* The whole and fractional parts of exp. */
1374 REAL_VALUE_TYPE whole_part;
1375 REAL_VALUE_TYPE frac_part;
1376
1377 real_floor (&whole_part, mode, &exp);
1378 real_arithmetic (&frac_part, MINUS_EXPR, &exp, &whole_part);
1379
1380
1381 REAL_VALUE_TYPE ceil_whole = dconst0;
1382 REAL_VALUE_TYPE ceil_fract = dconst0;
1383
1384 if (neg_exp)
1385 {
1386 real_ceil (&ceil_whole, mode, &exp);
1387 real_arithmetic (&ceil_fract, MINUS_EXPR, &ceil_whole, &exp);
1388 }
1389
1390 if (!representable_as_half_series_p (frac_part, max_depth, &synth_info))
1391 return NULL_TREE;
1392
1393 /* Check whether it's more profitable to not use 1.0 / ... */
1394 if (neg_exp)
1395 {
1396 struct pow_synth_sqrt_info alt_synth_info;
1397 alt_synth_info.factors = XALLOCAVEC (bool, max_depth + 1);
1398 alt_synth_info.deepest = 0;
1399 alt_synth_info.num_mults = 0;
1400
1401 if (representable_as_half_series_p (ceil_fract, max_depth,
1402 &alt_synth_info)
1403 && alt_synth_info.deepest <= synth_info.deepest
1404 && alt_synth_info.num_mults < synth_info.num_mults)
1405 {
1406 whole_part = ceil_whole;
1407 frac_part = ceil_fract;
1408 synth_info.deepest = alt_synth_info.deepest;
1409 synth_info.num_mults = alt_synth_info.num_mults;
1410 memcpy (synth_info.factors, alt_synth_info.factors,
1411 (max_depth + 1) * sizeof (bool));
1412 one_over = false;
1413 }
1414 }
1415
1416 HOST_WIDE_INT n = real_to_integer (&whole_part);
1417 REAL_VALUE_TYPE cint;
1418 real_from_integer (&cint, VOIDmode, n, SIGNED);
1419
1420 if (!real_identical (&whole_part, &cint))
1421 return NULL_TREE;
1422
1423 if (powi_cost (n) + synth_info.num_mults > POWI_MAX_MULTS)
1424 return NULL_TREE;
1425
1426 memset (cache, 0, (max_depth + 1) * sizeof (tree));
1427
1428 tree integer_res = n == 0 ? build_real (type, dconst1) : arg0;
1429
1430 /* Calculate the integer part of the exponent. */
1431 if (n > 1)
1432 {
1433 integer_res = gimple_expand_builtin_powi (gsi, loc, arg0, n);
1434 if (!integer_res)
1435 return NULL_TREE;
1436 }
1437
1438 if (dump_file)
1439 {
1440 char string[64];
1441
1442 real_to_decimal (string, &exp_init, sizeof (string), 0, 1);
1443 fprintf (dump_file, "synthesizing pow (x, %s) as:\n", string);
1444
1445 if (neg_exp)
1446 {
1447 if (one_over)
1448 {
1449 fprintf (dump_file, "1.0 / (");
1450 dump_integer_part (dump_file, "x", n);
1451 if (n > 0)
1452 fprintf (dump_file, " * ");
1453 dump_fractional_sqrt_sequence (dump_file, "x", &synth_info);
1454 fprintf (dump_file, ")");
1455 }
1456 else
1457 {
1458 dump_fractional_sqrt_sequence (dump_file, "x", &synth_info);
1459 fprintf (dump_file, " / (");
1460 dump_integer_part (dump_file, "x", n);
1461 fprintf (dump_file, ")");
1462 }
1463 }
1464 else
1465 {
1466 dump_fractional_sqrt_sequence (dump_file, "x", &synth_info);
1467 if (n > 0)
1468 fprintf (dump_file, " * ");
1469 dump_integer_part (dump_file, "x", n);
1470 }
1471
1472 fprintf (dump_file, "\ndeepest sqrt chain: %d\n", synth_info.deepest);
1473 }
1474
1475
1476 tree fract_res = NULL_TREE;
1477 cache[0] = arg0;
1478
1479 /* Calculate the fractional part of the exponent. */
1480 for (unsigned i = 0; i < synth_info.deepest; i++)
1481 {
1482 if (synth_info.factors[i])
1483 {
1484 tree sqrt_chain = get_fn_chain (arg0, i + 1, gsi, sqrtfn, loc, cache);
1485
1486 if (!fract_res)
1487 fract_res = sqrt_chain;
1488
1489 else
1490 fract_res = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
1491 fract_res, sqrt_chain);
1492 }
1493 }
1494
1495 tree res = NULL_TREE;
1496
1497 if (neg_exp)
1498 {
1499 if (one_over)
1500 {
1501 if (n > 0)
1502 res = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
1503 fract_res, integer_res);
1504 else
1505 res = fract_res;
1506
1507 res = build_and_insert_binop (gsi, loc, "powrootrecip", RDIV_EXPR,
1508 build_real (type, dconst1), res);
1509 }
1510 else
1511 {
1512 res = build_and_insert_binop (gsi, loc, "powroot", RDIV_EXPR,
1513 fract_res, integer_res);
1514 }
1515 }
1516 else
1517 res = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
1518 fract_res, integer_res);
1519 return res;
1520 }
1521
1522 /* ARG0 and ARG1 are the two arguments to a pow builtin call in GSI
1523 with location info LOC. If possible, create an equivalent and
1524 less expensive sequence of statements prior to GSI, and return an
1525 expession holding the result. */
1526
1527 static tree
1528 gimple_expand_builtin_pow (gimple_stmt_iterator *gsi, location_t loc,
1529 tree arg0, tree arg1)
1530 {
1531 REAL_VALUE_TYPE c, cint, dconst1_3, dconst1_4, dconst1_6;
1532 REAL_VALUE_TYPE c2, dconst3;
1533 HOST_WIDE_INT n;
1534 tree type, sqrtfn, cbrtfn, sqrt_arg0, result, cbrt_x, powi_cbrt_x;
1535 machine_mode mode;
1536 bool speed_p = optimize_bb_for_speed_p (gsi_bb (*gsi));
1537 bool hw_sqrt_exists, c_is_int, c2_is_int;
1538
1539 dconst1_4 = dconst1;
1540 SET_REAL_EXP (&dconst1_4, REAL_EXP (&dconst1_4) - 2);
1541
1542 /* If the exponent isn't a constant, there's nothing of interest
1543 to be done. */
1544 if (TREE_CODE (arg1) != REAL_CST)
1545 return NULL_TREE;
1546
1547 /* Don't perform the operation if flag_signaling_nans is on
1548 and the operand is a signaling NaN. */
1549 if (HONOR_SNANS (TYPE_MODE (TREE_TYPE (arg1)))
1550 && ((TREE_CODE (arg0) == REAL_CST
1551 && REAL_VALUE_ISSIGNALING_NAN (TREE_REAL_CST (arg0)))
1552 || REAL_VALUE_ISSIGNALING_NAN (TREE_REAL_CST (arg1))))
1553 return NULL_TREE;
1554
1555 /* If the exponent is equivalent to an integer, expand to an optimal
1556 multiplication sequence when profitable. */
1557 c = TREE_REAL_CST (arg1);
1558 n = real_to_integer (&c);
1559 real_from_integer (&cint, VOIDmode, n, SIGNED);
1560 c_is_int = real_identical (&c, &cint);
1561
1562 if (c_is_int
1563 && ((n >= -1 && n <= 2)
1564 || (flag_unsafe_math_optimizations
1565 && speed_p
1566 && powi_cost (n) <= POWI_MAX_MULTS)))
1567 return gimple_expand_builtin_powi (gsi, loc, arg0, n);
1568
1569 /* Attempt various optimizations using sqrt and cbrt. */
1570 type = TREE_TYPE (arg0);
1571 mode = TYPE_MODE (type);
1572 sqrtfn = mathfn_built_in (type, BUILT_IN_SQRT);
1573
1574 /* Optimize pow(x,0.5) = sqrt(x). This replacement is always safe
1575 unless signed zeros must be maintained. pow(-0,0.5) = +0, while
1576 sqrt(-0) = -0. */
1577 if (sqrtfn
1578 && real_equal (&c, &dconsthalf)
1579 && !HONOR_SIGNED_ZEROS (mode))
1580 return build_and_insert_call (gsi, loc, sqrtfn, arg0);
1581
1582 hw_sqrt_exists = optab_handler (sqrt_optab, mode) != CODE_FOR_nothing;
1583
1584 /* Optimize pow(x,1./3.) = cbrt(x). This requires unsafe math
1585 optimizations since 1./3. is not exactly representable. If x
1586 is negative and finite, the correct value of pow(x,1./3.) is
1587 a NaN with the "invalid" exception raised, because the value
1588 of 1./3. actually has an even denominator. The correct value
1589 of cbrt(x) is a negative real value. */
1590 cbrtfn = mathfn_built_in (type, BUILT_IN_CBRT);
1591 dconst1_3 = real_value_truncate (mode, dconst_third ());
1592
1593 if (flag_unsafe_math_optimizations
1594 && cbrtfn
1595 && (!HONOR_NANS (mode) || tree_expr_nonnegative_p (arg0))
1596 && real_equal (&c, &dconst1_3))
1597 return build_and_insert_call (gsi, loc, cbrtfn, arg0);
1598
1599 /* Optimize pow(x,1./6.) = cbrt(sqrt(x)). Don't do this optimization
1600 if we don't have a hardware sqrt insn. */
1601 dconst1_6 = dconst1_3;
1602 SET_REAL_EXP (&dconst1_6, REAL_EXP (&dconst1_6) - 1);
1603
1604 if (flag_unsafe_math_optimizations
1605 && sqrtfn
1606 && cbrtfn
1607 && (!HONOR_NANS (mode) || tree_expr_nonnegative_p (arg0))
1608 && speed_p
1609 && hw_sqrt_exists
1610 && real_equal (&c, &dconst1_6))
1611 {
1612 /* sqrt(x) */
1613 sqrt_arg0 = build_and_insert_call (gsi, loc, sqrtfn, arg0);
1614
1615 /* cbrt(sqrt(x)) */
1616 return build_and_insert_call (gsi, loc, cbrtfn, sqrt_arg0);
1617 }
1618
1619
1620 /* Attempt to expand the POW as a product of square root chains.
1621 Expand the 0.25 case even when otpimising for size. */
1622 if (flag_unsafe_math_optimizations
1623 && sqrtfn
1624 && hw_sqrt_exists
1625 && (speed_p || real_equal (&c, &dconst1_4))
1626 && !HONOR_SIGNED_ZEROS (mode))
1627 {
1628 unsigned int max_depth = speed_p
1629 ? PARAM_VALUE (PARAM_MAX_POW_SQRT_DEPTH)
1630 : 2;
1631
1632 tree expand_with_sqrts
1633 = expand_pow_as_sqrts (gsi, loc, arg0, arg1, max_depth);
1634
1635 if (expand_with_sqrts)
1636 return expand_with_sqrts;
1637 }
1638
1639 real_arithmetic (&c2, MULT_EXPR, &c, &dconst2);
1640 n = real_to_integer (&c2);
1641 real_from_integer (&cint, VOIDmode, n, SIGNED);
1642 c2_is_int = real_identical (&c2, &cint);
1643
1644 /* Optimize pow(x,c), where 3c = n for some nonzero integer n, into
1645
1646 powi(x, n/3) * powi(cbrt(x), n%3), n > 0;
1647 1.0 / (powi(x, abs(n)/3) * powi(cbrt(x), abs(n)%3)), n < 0.
1648
1649 Do not calculate the first factor when n/3 = 0. As cbrt(x) is
1650 different from pow(x, 1./3.) due to rounding and behavior with
1651 negative x, we need to constrain this transformation to unsafe
1652 math and positive x or finite math. */
1653 real_from_integer (&dconst3, VOIDmode, 3, SIGNED);
1654 real_arithmetic (&c2, MULT_EXPR, &c, &dconst3);
1655 real_round (&c2, mode, &c2);
1656 n = real_to_integer (&c2);
1657 real_from_integer (&cint, VOIDmode, n, SIGNED);
1658 real_arithmetic (&c2, RDIV_EXPR, &cint, &dconst3);
1659 real_convert (&c2, mode, &c2);
1660
1661 if (flag_unsafe_math_optimizations
1662 && cbrtfn
1663 && (!HONOR_NANS (mode) || tree_expr_nonnegative_p (arg0))
1664 && real_identical (&c2, &c)
1665 && !c2_is_int
1666 && optimize_function_for_speed_p (cfun)
1667 && powi_cost (n / 3) <= POWI_MAX_MULTS)
1668 {
1669 tree powi_x_ndiv3 = NULL_TREE;
1670
1671 /* Attempt to fold powi(arg0, abs(n/3)) into multiplies. If not
1672 possible or profitable, give up. Skip the degenerate case when
1673 abs(n) < 3, where the result is always 1. */
1674 if (absu_hwi (n) >= 3)
1675 {
1676 powi_x_ndiv3 = gimple_expand_builtin_powi (gsi, loc, arg0,
1677 abs_hwi (n / 3));
1678 if (!powi_x_ndiv3)
1679 return NULL_TREE;
1680 }
1681
1682 /* Calculate powi(cbrt(x), n%3). Don't use gimple_expand_builtin_powi
1683 as that creates an unnecessary variable. Instead, just produce
1684 either cbrt(x) or cbrt(x) * cbrt(x). */
1685 cbrt_x = build_and_insert_call (gsi, loc, cbrtfn, arg0);
1686
1687 if (absu_hwi (n) % 3 == 1)
1688 powi_cbrt_x = cbrt_x;
1689 else
1690 powi_cbrt_x = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
1691 cbrt_x, cbrt_x);
1692
1693 /* Multiply the two subexpressions, unless powi(x,abs(n)/3) = 1. */
1694 if (absu_hwi (n) < 3)
1695 result = powi_cbrt_x;
1696 else
1697 result = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
1698 powi_x_ndiv3, powi_cbrt_x);
1699
1700 /* If n is negative, reciprocate the result. */
1701 if (n < 0)
1702 result = build_and_insert_binop (gsi, loc, "powroot", RDIV_EXPR,
1703 build_real (type, dconst1), result);
1704
1705 return result;
1706 }
1707
1708 /* No optimizations succeeded. */
1709 return NULL_TREE;
1710 }
1711
1712 /* ARG is the argument to a cabs builtin call in GSI with location info
1713 LOC. Create a sequence of statements prior to GSI that calculates
1714 sqrt(R*R + I*I), where R and I are the real and imaginary components
1715 of ARG, respectively. Return an expression holding the result. */
1716
1717 static tree
1718 gimple_expand_builtin_cabs (gimple_stmt_iterator *gsi, location_t loc, tree arg)
1719 {
1720 tree real_part, imag_part, addend1, addend2, sum, result;
1721 tree type = TREE_TYPE (TREE_TYPE (arg));
1722 tree sqrtfn = mathfn_built_in (type, BUILT_IN_SQRT);
1723 machine_mode mode = TYPE_MODE (type);
1724
1725 if (!flag_unsafe_math_optimizations
1726 || !optimize_bb_for_speed_p (gimple_bb (gsi_stmt (*gsi)))
1727 || !sqrtfn
1728 || optab_handler (sqrt_optab, mode) == CODE_FOR_nothing)
1729 return NULL_TREE;
1730
1731 real_part = build_and_insert_ref (gsi, loc, type, "cabs",
1732 REALPART_EXPR, arg);
1733 addend1 = build_and_insert_binop (gsi, loc, "cabs", MULT_EXPR,
1734 real_part, real_part);
1735 imag_part = build_and_insert_ref (gsi, loc, type, "cabs",
1736 IMAGPART_EXPR, arg);
1737 addend2 = build_and_insert_binop (gsi, loc, "cabs", MULT_EXPR,
1738 imag_part, imag_part);
1739 sum = build_and_insert_binop (gsi, loc, "cabs", PLUS_EXPR, addend1, addend2);
1740 result = build_and_insert_call (gsi, loc, sqrtfn, sum);
1741
1742 return result;
1743 }
1744
1745 /* Go through all calls to sin, cos and cexpi and call execute_cse_sincos_1
1746 on the SSA_NAME argument of each of them. Also expand powi(x,n) into
1747 an optimal number of multiplies, when n is a constant. */
1748
1749 namespace {
1750
1751 const pass_data pass_data_cse_sincos =
1752 {
1753 GIMPLE_PASS, /* type */
1754 "sincos", /* name */
1755 OPTGROUP_NONE, /* optinfo_flags */
1756 TV_NONE, /* tv_id */
1757 PROP_ssa, /* properties_required */
1758 PROP_gimple_opt_math, /* properties_provided */
1759 0, /* properties_destroyed */
1760 0, /* todo_flags_start */
1761 TODO_update_ssa, /* todo_flags_finish */
1762 };
1763
1764 class pass_cse_sincos : public gimple_opt_pass
1765 {
1766 public:
1767 pass_cse_sincos (gcc::context *ctxt)
1768 : gimple_opt_pass (pass_data_cse_sincos, ctxt)
1769 {}
1770
1771 /* opt_pass methods: */
1772 virtual bool gate (function *)
1773 {
1774 /* We no longer require either sincos or cexp, since powi expansion
1775 piggybacks on this pass. */
1776 return optimize;
1777 }
1778
1779 virtual unsigned int execute (function *);
1780
1781 }; // class pass_cse_sincos
1782
1783 unsigned int
1784 pass_cse_sincos::execute (function *fun)
1785 {
1786 basic_block bb;
1787 bool cfg_changed = false;
1788
1789 calculate_dominance_info (CDI_DOMINATORS);
1790 memset (&sincos_stats, 0, sizeof (sincos_stats));
1791
1792 FOR_EACH_BB_FN (bb, fun)
1793 {
1794 gimple_stmt_iterator gsi;
1795 bool cleanup_eh = false;
1796
1797 for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi); gsi_next (&gsi))
1798 {
1799 gimple *stmt = gsi_stmt (gsi);
1800
1801 /* Only the last stmt in a bb could throw, no need to call
1802 gimple_purge_dead_eh_edges if we change something in the middle
1803 of a basic block. */
1804 cleanup_eh = false;
1805
1806 if (is_gimple_call (stmt)
1807 && gimple_call_lhs (stmt))
1808 {
1809 tree arg, arg0, arg1, result;
1810 HOST_WIDE_INT n;
1811 location_t loc;
1812
1813 switch (gimple_call_combined_fn (stmt))
1814 {
1815 CASE_CFN_COS:
1816 CASE_CFN_SIN:
1817 CASE_CFN_CEXPI:
1818 /* Make sure we have either sincos or cexp. */
1819 if (!targetm.libc_has_function (function_c99_math_complex)
1820 && !targetm.libc_has_function (function_sincos))
1821 break;
1822
1823 arg = gimple_call_arg (stmt, 0);
1824 if (TREE_CODE (arg) == SSA_NAME)
1825 cfg_changed |= execute_cse_sincos_1 (arg);
1826 break;
1827
1828 CASE_CFN_POW:
1829 arg0 = gimple_call_arg (stmt, 0);
1830 arg1 = gimple_call_arg (stmt, 1);
1831
1832 loc = gimple_location (stmt);
1833 result = gimple_expand_builtin_pow (&gsi, loc, arg0, arg1);
1834
1835 if (result)
1836 {
1837 tree lhs = gimple_get_lhs (stmt);
1838 gassign *new_stmt = gimple_build_assign (lhs, result);
1839 gimple_set_location (new_stmt, loc);
1840 unlink_stmt_vdef (stmt);
1841 gsi_replace (&gsi, new_stmt, true);
1842 cleanup_eh = true;
1843 if (gimple_vdef (stmt))
1844 release_ssa_name (gimple_vdef (stmt));
1845 }
1846 break;
1847
1848 CASE_CFN_POWI:
1849 arg0 = gimple_call_arg (stmt, 0);
1850 arg1 = gimple_call_arg (stmt, 1);
1851 loc = gimple_location (stmt);
1852
1853 if (real_minus_onep (arg0))
1854 {
1855 tree t0, t1, cond, one, minus_one;
1856 gassign *stmt;
1857
1858 t0 = TREE_TYPE (arg0);
1859 t1 = TREE_TYPE (arg1);
1860 one = build_real (t0, dconst1);
1861 minus_one = build_real (t0, dconstm1);
1862
1863 cond = make_temp_ssa_name (t1, NULL, "powi_cond");
1864 stmt = gimple_build_assign (cond, BIT_AND_EXPR,
1865 arg1, build_int_cst (t1, 1));
1866 gimple_set_location (stmt, loc);
1867 gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
1868
1869 result = make_temp_ssa_name (t0, NULL, "powi");
1870 stmt = gimple_build_assign (result, COND_EXPR, cond,
1871 minus_one, one);
1872 gimple_set_location (stmt, loc);
1873 gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
1874 }
1875 else
1876 {
1877 if (!tree_fits_shwi_p (arg1))
1878 break;
1879
1880 n = tree_to_shwi (arg1);
1881 result = gimple_expand_builtin_powi (&gsi, loc, arg0, n);
1882 }
1883
1884 if (result)
1885 {
1886 tree lhs = gimple_get_lhs (stmt);
1887 gassign *new_stmt = gimple_build_assign (lhs, result);
1888 gimple_set_location (new_stmt, loc);
1889 unlink_stmt_vdef (stmt);
1890 gsi_replace (&gsi, new_stmt, true);
1891 cleanup_eh = true;
1892 if (gimple_vdef (stmt))
1893 release_ssa_name (gimple_vdef (stmt));
1894 }
1895 break;
1896
1897 CASE_CFN_CABS:
1898 arg0 = gimple_call_arg (stmt, 0);
1899 loc = gimple_location (stmt);
1900 result = gimple_expand_builtin_cabs (&gsi, loc, arg0);
1901
1902 if (result)
1903 {
1904 tree lhs = gimple_get_lhs (stmt);
1905 gassign *new_stmt = gimple_build_assign (lhs, result);
1906 gimple_set_location (new_stmt, loc);
1907 unlink_stmt_vdef (stmt);
1908 gsi_replace (&gsi, new_stmt, true);
1909 cleanup_eh = true;
1910 if (gimple_vdef (stmt))
1911 release_ssa_name (gimple_vdef (stmt));
1912 }
1913 break;
1914
1915 default:;
1916 }
1917 }
1918 }
1919 if (cleanup_eh)
1920 cfg_changed |= gimple_purge_dead_eh_edges (bb);
1921 }
1922
1923 statistics_counter_event (fun, "sincos statements inserted",
1924 sincos_stats.inserted);
1925
1926 return cfg_changed ? TODO_cleanup_cfg : 0;
1927 }
1928
1929 } // anon namespace
1930
1931 gimple_opt_pass *
1932 make_pass_cse_sincos (gcc::context *ctxt)
1933 {
1934 return new pass_cse_sincos (ctxt);
1935 }
1936
1937 /* A symbolic number structure is used to detect byte permutation and selection
1938 patterns of a source. To achieve that, its field N contains an artificial
1939 number consisting of BITS_PER_MARKER sized markers tracking where does each
1940 byte come from in the source:
1941
1942 0 - target byte has the value 0
1943 FF - target byte has an unknown value (eg. due to sign extension)
1944 1..size - marker value is the byte index in the source (0 for lsb).
1945
1946 To detect permutations on memory sources (arrays and structures), a symbolic
1947 number is also associated:
1948 - a base address BASE_ADDR and an OFFSET giving the address of the source;
1949 - a range which gives the difference between the highest and lowest accessed
1950 memory location to make such a symbolic number;
1951 - the address SRC of the source element of lowest address as a convenience
1952 to easily get BASE_ADDR + offset + lowest bytepos;
1953 - number of expressions N_OPS bitwise ored together to represent
1954 approximate cost of the computation.
1955
1956 Note 1: the range is different from size as size reflects the size of the
1957 type of the current expression. For instance, for an array char a[],
1958 (short) a[0] | (short) a[3] would have a size of 2 but a range of 4 while
1959 (short) a[0] | ((short) a[0] << 1) would still have a size of 2 but this
1960 time a range of 1.
1961
1962 Note 2: for non-memory sources, range holds the same value as size.
1963
1964 Note 3: SRC points to the SSA_NAME in case of non-memory source. */
1965
1966 struct symbolic_number {
1967 uint64_t n;
1968 tree type;
1969 tree base_addr;
1970 tree offset;
1971 HOST_WIDE_INT bytepos;
1972 tree src;
1973 tree alias_set;
1974 tree vuse;
1975 unsigned HOST_WIDE_INT range;
1976 int n_ops;
1977 };
1978
1979 #define BITS_PER_MARKER 8
1980 #define MARKER_MASK ((1 << BITS_PER_MARKER) - 1)
1981 #define MARKER_BYTE_UNKNOWN MARKER_MASK
1982 #define HEAD_MARKER(n, size) \
1983 ((n) & ((uint64_t) MARKER_MASK << (((size) - 1) * BITS_PER_MARKER)))
1984
1985 /* The number which the find_bswap_or_nop_1 result should match in
1986 order to have a nop. The number is masked according to the size of
1987 the symbolic number before using it. */
1988 #define CMPNOP (sizeof (int64_t) < 8 ? 0 : \
1989 (uint64_t)0x08070605 << 32 | 0x04030201)
1990
1991 /* The number which the find_bswap_or_nop_1 result should match in
1992 order to have a byte swap. The number is masked according to the
1993 size of the symbolic number before using it. */
1994 #define CMPXCHG (sizeof (int64_t) < 8 ? 0 : \
1995 (uint64_t)0x01020304 << 32 | 0x05060708)
1996
1997 /* Perform a SHIFT or ROTATE operation by COUNT bits on symbolic
1998 number N. Return false if the requested operation is not permitted
1999 on a symbolic number. */
2000
2001 static inline bool
2002 do_shift_rotate (enum tree_code code,
2003 struct symbolic_number *n,
2004 int count)
2005 {
2006 int i, size = TYPE_PRECISION (n->type) / BITS_PER_UNIT;
2007 unsigned head_marker;
2008
2009 if (count % BITS_PER_UNIT != 0)
2010 return false;
2011 count = (count / BITS_PER_UNIT) * BITS_PER_MARKER;
2012
2013 /* Zero out the extra bits of N in order to avoid them being shifted
2014 into the significant bits. */
2015 if (size < 64 / BITS_PER_MARKER)
2016 n->n &= ((uint64_t) 1 << (size * BITS_PER_MARKER)) - 1;
2017
2018 switch (code)
2019 {
2020 case LSHIFT_EXPR:
2021 n->n <<= count;
2022 break;
2023 case RSHIFT_EXPR:
2024 head_marker = HEAD_MARKER (n->n, size);
2025 n->n >>= count;
2026 /* Arithmetic shift of signed type: result is dependent on the value. */
2027 if (!TYPE_UNSIGNED (n->type) && head_marker)
2028 for (i = 0; i < count / BITS_PER_MARKER; i++)
2029 n->n |= (uint64_t) MARKER_BYTE_UNKNOWN
2030 << ((size - 1 - i) * BITS_PER_MARKER);
2031 break;
2032 case LROTATE_EXPR:
2033 n->n = (n->n << count) | (n->n >> ((size * BITS_PER_MARKER) - count));
2034 break;
2035 case RROTATE_EXPR:
2036 n->n = (n->n >> count) | (n->n << ((size * BITS_PER_MARKER) - count));
2037 break;
2038 default:
2039 return false;
2040 }
2041 /* Zero unused bits for size. */
2042 if (size < 64 / BITS_PER_MARKER)
2043 n->n &= ((uint64_t) 1 << (size * BITS_PER_MARKER)) - 1;
2044 return true;
2045 }
2046
2047 /* Perform sanity checking for the symbolic number N and the gimple
2048 statement STMT. */
2049
2050 static inline bool
2051 verify_symbolic_number_p (struct symbolic_number *n, gimple *stmt)
2052 {
2053 tree lhs_type;
2054
2055 lhs_type = gimple_expr_type (stmt);
2056
2057 if (TREE_CODE (lhs_type) != INTEGER_TYPE)
2058 return false;
2059
2060 if (TYPE_PRECISION (lhs_type) != TYPE_PRECISION (n->type))
2061 return false;
2062
2063 return true;
2064 }
2065
2066 /* Initialize the symbolic number N for the bswap pass from the base element
2067 SRC manipulated by the bitwise OR expression. */
2068
2069 static bool
2070 init_symbolic_number (struct symbolic_number *n, tree src)
2071 {
2072 int size;
2073
2074 if (! INTEGRAL_TYPE_P (TREE_TYPE (src)))
2075 return false;
2076
2077 n->base_addr = n->offset = n->alias_set = n->vuse = NULL_TREE;
2078 n->src = src;
2079
2080 /* Set up the symbolic number N by setting each byte to a value between 1 and
2081 the byte size of rhs1. The highest order byte is set to n->size and the
2082 lowest order byte to 1. */
2083 n->type = TREE_TYPE (src);
2084 size = TYPE_PRECISION (n->type);
2085 if (size % BITS_PER_UNIT != 0)
2086 return false;
2087 size /= BITS_PER_UNIT;
2088 if (size > 64 / BITS_PER_MARKER)
2089 return false;
2090 n->range = size;
2091 n->n = CMPNOP;
2092 n->n_ops = 1;
2093
2094 if (size < 64 / BITS_PER_MARKER)
2095 n->n &= ((uint64_t) 1 << (size * BITS_PER_MARKER)) - 1;
2096
2097 return true;
2098 }
2099
2100 /* Check if STMT might be a byte swap or a nop from a memory source and returns
2101 the answer. If so, REF is that memory source and the base of the memory area
2102 accessed and the offset of the access from that base are recorded in N. */
2103
2104 bool
2105 find_bswap_or_nop_load (gimple *stmt, tree ref, struct symbolic_number *n)
2106 {
2107 /* Leaf node is an array or component ref. Memorize its base and
2108 offset from base to compare to other such leaf node. */
2109 HOST_WIDE_INT bitsize, bitpos;
2110 machine_mode mode;
2111 int unsignedp, reversep, volatilep;
2112 tree offset, base_addr;
2113
2114 /* Not prepared to handle PDP endian. */
2115 if (BYTES_BIG_ENDIAN != WORDS_BIG_ENDIAN)
2116 return false;
2117
2118 if (!gimple_assign_load_p (stmt) || gimple_has_volatile_ops (stmt))
2119 return false;
2120
2121 base_addr = get_inner_reference (ref, &bitsize, &bitpos, &offset, &mode,
2122 &unsignedp, &reversep, &volatilep);
2123
2124 if (TREE_CODE (base_addr) == MEM_REF)
2125 {
2126 offset_int bit_offset = 0;
2127 tree off = TREE_OPERAND (base_addr, 1);
2128
2129 if (!integer_zerop (off))
2130 {
2131 offset_int boff, coff = mem_ref_offset (base_addr);
2132 boff = coff << LOG2_BITS_PER_UNIT;
2133 bit_offset += boff;
2134 }
2135
2136 base_addr = TREE_OPERAND (base_addr, 0);
2137
2138 /* Avoid returning a negative bitpos as this may wreak havoc later. */
2139 if (wi::neg_p (bit_offset))
2140 {
2141 offset_int mask = wi::mask <offset_int> (LOG2_BITS_PER_UNIT, false);
2142 offset_int tem = wi::bit_and_not (bit_offset, mask);
2143 /* TEM is the bitpos rounded to BITS_PER_UNIT towards -Inf.
2144 Subtract it to BIT_OFFSET and add it (scaled) to OFFSET. */
2145 bit_offset -= tem;
2146 tem >>= LOG2_BITS_PER_UNIT;
2147 if (offset)
2148 offset = size_binop (PLUS_EXPR, offset,
2149 wide_int_to_tree (sizetype, tem));
2150 else
2151 offset = wide_int_to_tree (sizetype, tem);
2152 }
2153
2154 bitpos += bit_offset.to_shwi ();
2155 }
2156
2157 if (bitpos % BITS_PER_UNIT)
2158 return false;
2159 if (bitsize % BITS_PER_UNIT)
2160 return false;
2161 if (reversep)
2162 return false;
2163
2164 if (!init_symbolic_number (n, ref))
2165 return false;
2166 n->base_addr = base_addr;
2167 n->offset = offset;
2168 n->bytepos = bitpos / BITS_PER_UNIT;
2169 n->alias_set = reference_alias_ptr_type (ref);
2170 n->vuse = gimple_vuse (stmt);
2171 return true;
2172 }
2173
2174 /* Compute the symbolic number N representing the result of a bitwise OR on 2
2175 symbolic number N1 and N2 whose source statements are respectively
2176 SOURCE_STMT1 and SOURCE_STMT2. */
2177
2178 static gimple *
2179 perform_symbolic_merge (gimple *source_stmt1, struct symbolic_number *n1,
2180 gimple *source_stmt2, struct symbolic_number *n2,
2181 struct symbolic_number *n)
2182 {
2183 int i, size;
2184 uint64_t mask;
2185 gimple *source_stmt;
2186 struct symbolic_number *n_start;
2187
2188 tree rhs1 = gimple_assign_rhs1 (source_stmt1);
2189 if (TREE_CODE (rhs1) == BIT_FIELD_REF
2190 && TREE_CODE (TREE_OPERAND (rhs1, 0)) == SSA_NAME)
2191 rhs1 = TREE_OPERAND (rhs1, 0);
2192 tree rhs2 = gimple_assign_rhs1 (source_stmt2);
2193 if (TREE_CODE (rhs2) == BIT_FIELD_REF
2194 && TREE_CODE (TREE_OPERAND (rhs2, 0)) == SSA_NAME)
2195 rhs2 = TREE_OPERAND (rhs2, 0);
2196
2197 /* Sources are different, cancel bswap if they are not memory location with
2198 the same base (array, structure, ...). */
2199 if (rhs1 != rhs2)
2200 {
2201 uint64_t inc;
2202 HOST_WIDE_INT start_sub, end_sub, end1, end2, end;
2203 struct symbolic_number *toinc_n_ptr, *n_end;
2204 basic_block bb1, bb2;
2205
2206 if (!n1->base_addr || !n2->base_addr
2207 || !operand_equal_p (n1->base_addr, n2->base_addr, 0))
2208 return NULL;
2209
2210 if (!n1->offset != !n2->offset
2211 || (n1->offset && !operand_equal_p (n1->offset, n2->offset, 0)))
2212 return NULL;
2213
2214 if (n1->bytepos < n2->bytepos)
2215 {
2216 n_start = n1;
2217 start_sub = n2->bytepos - n1->bytepos;
2218 }
2219 else
2220 {
2221 n_start = n2;
2222 start_sub = n1->bytepos - n2->bytepos;
2223 }
2224
2225 bb1 = gimple_bb (source_stmt1);
2226 bb2 = gimple_bb (source_stmt2);
2227 if (dominated_by_p (CDI_DOMINATORS, bb1, bb2))
2228 source_stmt = source_stmt1;
2229 else
2230 source_stmt = source_stmt2;
2231
2232 /* Find the highest address at which a load is performed and
2233 compute related info. */
2234 end1 = n1->bytepos + (n1->range - 1);
2235 end2 = n2->bytepos + (n2->range - 1);
2236 if (end1 < end2)
2237 {
2238 end = end2;
2239 end_sub = end2 - end1;
2240 }
2241 else
2242 {
2243 end = end1;
2244 end_sub = end1 - end2;
2245 }
2246 n_end = (end2 > end1) ? n2 : n1;
2247
2248 /* Find symbolic number whose lsb is the most significant. */
2249 if (BYTES_BIG_ENDIAN)
2250 toinc_n_ptr = (n_end == n1) ? n2 : n1;
2251 else
2252 toinc_n_ptr = (n_start == n1) ? n2 : n1;
2253
2254 n->range = end - n_start->bytepos + 1;
2255
2256 /* Check that the range of memory covered can be represented by
2257 a symbolic number. */
2258 if (n->range > 64 / BITS_PER_MARKER)
2259 return NULL;
2260
2261 /* Reinterpret byte marks in symbolic number holding the value of
2262 bigger weight according to target endianness. */
2263 inc = BYTES_BIG_ENDIAN ? end_sub : start_sub;
2264 size = TYPE_PRECISION (n1->type) / BITS_PER_UNIT;
2265 for (i = 0; i < size; i++, inc <<= BITS_PER_MARKER)
2266 {
2267 unsigned marker
2268 = (toinc_n_ptr->n >> (i * BITS_PER_MARKER)) & MARKER_MASK;
2269 if (marker && marker != MARKER_BYTE_UNKNOWN)
2270 toinc_n_ptr->n += inc;
2271 }
2272 }
2273 else
2274 {
2275 n->range = n1->range;
2276 n_start = n1;
2277 source_stmt = source_stmt1;
2278 }
2279
2280 if (!n1->alias_set
2281 || alias_ptr_types_compatible_p (n1->alias_set, n2->alias_set))
2282 n->alias_set = n1->alias_set;
2283 else
2284 n->alias_set = ptr_type_node;
2285 n->vuse = n_start->vuse;
2286 n->base_addr = n_start->base_addr;
2287 n->offset = n_start->offset;
2288 n->src = n_start->src;
2289 n->bytepos = n_start->bytepos;
2290 n->type = n_start->type;
2291 size = TYPE_PRECISION (n->type) / BITS_PER_UNIT;
2292
2293 for (i = 0, mask = MARKER_MASK; i < size; i++, mask <<= BITS_PER_MARKER)
2294 {
2295 uint64_t masked1, masked2;
2296
2297 masked1 = n1->n & mask;
2298 masked2 = n2->n & mask;
2299 if (masked1 && masked2 && masked1 != masked2)
2300 return NULL;
2301 }
2302 n->n = n1->n | n2->n;
2303 n->n_ops = n1->n_ops + n2->n_ops;
2304
2305 return source_stmt;
2306 }
2307
2308 /* find_bswap_or_nop_1 invokes itself recursively with N and tries to perform
2309 the operation given by the rhs of STMT on the result. If the operation
2310 could successfully be executed the function returns a gimple stmt whose
2311 rhs's first tree is the expression of the source operand and NULL
2312 otherwise. */
2313
2314 static gimple *
2315 find_bswap_or_nop_1 (gimple *stmt, struct symbolic_number *n, int limit)
2316 {
2317 enum tree_code code;
2318 tree rhs1, rhs2 = NULL;
2319 gimple *rhs1_stmt, *rhs2_stmt, *source_stmt1;
2320 enum gimple_rhs_class rhs_class;
2321
2322 if (!limit || !is_gimple_assign (stmt))
2323 return NULL;
2324
2325 rhs1 = gimple_assign_rhs1 (stmt);
2326
2327 if (find_bswap_or_nop_load (stmt, rhs1, n))
2328 return stmt;
2329
2330 /* Handle BIT_FIELD_REF. */
2331 if (TREE_CODE (rhs1) == BIT_FIELD_REF
2332 && TREE_CODE (TREE_OPERAND (rhs1, 0)) == SSA_NAME)
2333 {
2334 unsigned HOST_WIDE_INT bitsize = tree_to_uhwi (TREE_OPERAND (rhs1, 1));
2335 unsigned HOST_WIDE_INT bitpos = tree_to_uhwi (TREE_OPERAND (rhs1, 2));
2336 if (bitpos % BITS_PER_UNIT == 0
2337 && bitsize % BITS_PER_UNIT == 0
2338 && init_symbolic_number (n, TREE_OPERAND (rhs1, 0)))
2339 {
2340 /* Handle big-endian bit numbering in BIT_FIELD_REF. */
2341 if (BYTES_BIG_ENDIAN)
2342 bitpos = TYPE_PRECISION (n->type) - bitpos - bitsize;
2343
2344 /* Shift. */
2345 if (!do_shift_rotate (RSHIFT_EXPR, n, bitpos))
2346 return NULL;
2347
2348 /* Mask. */
2349 uint64_t mask = 0;
2350 uint64_t tmp = (1 << BITS_PER_UNIT) - 1;
2351 for (unsigned i = 0; i < bitsize / BITS_PER_UNIT;
2352 i++, tmp <<= BITS_PER_UNIT)
2353 mask |= (uint64_t) MARKER_MASK << (i * BITS_PER_MARKER);
2354 n->n &= mask;
2355
2356 /* Convert. */
2357 n->type = TREE_TYPE (rhs1);
2358 if (!n->base_addr)
2359 n->range = TYPE_PRECISION (n->type) / BITS_PER_UNIT;
2360
2361 return verify_symbolic_number_p (n, stmt) ? stmt : NULL;
2362 }
2363
2364 return NULL;
2365 }
2366
2367 if (TREE_CODE (rhs1) != SSA_NAME)
2368 return NULL;
2369
2370 code = gimple_assign_rhs_code (stmt);
2371 rhs_class = gimple_assign_rhs_class (stmt);
2372 rhs1_stmt = SSA_NAME_DEF_STMT (rhs1);
2373
2374 if (rhs_class == GIMPLE_BINARY_RHS)
2375 rhs2 = gimple_assign_rhs2 (stmt);
2376
2377 /* Handle unary rhs and binary rhs with integer constants as second
2378 operand. */
2379
2380 if (rhs_class == GIMPLE_UNARY_RHS
2381 || (rhs_class == GIMPLE_BINARY_RHS
2382 && TREE_CODE (rhs2) == INTEGER_CST))
2383 {
2384 if (code != BIT_AND_EXPR
2385 && code != LSHIFT_EXPR
2386 && code != RSHIFT_EXPR
2387 && code != LROTATE_EXPR
2388 && code != RROTATE_EXPR
2389 && !CONVERT_EXPR_CODE_P (code))
2390 return NULL;
2391
2392 source_stmt1 = find_bswap_or_nop_1 (rhs1_stmt, n, limit - 1);
2393
2394 /* If find_bswap_or_nop_1 returned NULL, STMT is a leaf node and
2395 we have to initialize the symbolic number. */
2396 if (!source_stmt1)
2397 {
2398 if (gimple_assign_load_p (stmt)
2399 || !init_symbolic_number (n, rhs1))
2400 return NULL;
2401 source_stmt1 = stmt;
2402 }
2403
2404 switch (code)
2405 {
2406 case BIT_AND_EXPR:
2407 {
2408 int i, size = TYPE_PRECISION (n->type) / BITS_PER_UNIT;
2409 uint64_t val = int_cst_value (rhs2), mask = 0;
2410 uint64_t tmp = (1 << BITS_PER_UNIT) - 1;
2411
2412 /* Only constants masking full bytes are allowed. */
2413 for (i = 0; i < size; i++, tmp <<= BITS_PER_UNIT)
2414 if ((val & tmp) != 0 && (val & tmp) != tmp)
2415 return NULL;
2416 else if (val & tmp)
2417 mask |= (uint64_t) MARKER_MASK << (i * BITS_PER_MARKER);
2418
2419 n->n &= mask;
2420 }
2421 break;
2422 case LSHIFT_EXPR:
2423 case RSHIFT_EXPR:
2424 case LROTATE_EXPR:
2425 case RROTATE_EXPR:
2426 if (!do_shift_rotate (code, n, (int) TREE_INT_CST_LOW (rhs2)))
2427 return NULL;
2428 break;
2429 CASE_CONVERT:
2430 {
2431 int i, type_size, old_type_size;
2432 tree type;
2433
2434 type = gimple_expr_type (stmt);
2435 type_size = TYPE_PRECISION (type);
2436 if (type_size % BITS_PER_UNIT != 0)
2437 return NULL;
2438 type_size /= BITS_PER_UNIT;
2439 if (type_size > 64 / BITS_PER_MARKER)
2440 return NULL;
2441
2442 /* Sign extension: result is dependent on the value. */
2443 old_type_size = TYPE_PRECISION (n->type) / BITS_PER_UNIT;
2444 if (!TYPE_UNSIGNED (n->type) && type_size > old_type_size
2445 && HEAD_MARKER (n->n, old_type_size))
2446 for (i = 0; i < type_size - old_type_size; i++)
2447 n->n |= (uint64_t) MARKER_BYTE_UNKNOWN
2448 << ((type_size - 1 - i) * BITS_PER_MARKER);
2449
2450 if (type_size < 64 / BITS_PER_MARKER)
2451 {
2452 /* If STMT casts to a smaller type mask out the bits not
2453 belonging to the target type. */
2454 n->n &= ((uint64_t) 1 << (type_size * BITS_PER_MARKER)) - 1;
2455 }
2456 n->type = type;
2457 if (!n->base_addr)
2458 n->range = type_size;
2459 }
2460 break;
2461 default:
2462 return NULL;
2463 };
2464 return verify_symbolic_number_p (n, stmt) ? source_stmt1 : NULL;
2465 }
2466
2467 /* Handle binary rhs. */
2468
2469 if (rhs_class == GIMPLE_BINARY_RHS)
2470 {
2471 struct symbolic_number n1, n2;
2472 gimple *source_stmt, *source_stmt2;
2473
2474 if (code != BIT_IOR_EXPR)
2475 return NULL;
2476
2477 if (TREE_CODE (rhs2) != SSA_NAME)
2478 return NULL;
2479
2480 rhs2_stmt = SSA_NAME_DEF_STMT (rhs2);
2481
2482 switch (code)
2483 {
2484 case BIT_IOR_EXPR:
2485 source_stmt1 = find_bswap_or_nop_1 (rhs1_stmt, &n1, limit - 1);
2486
2487 if (!source_stmt1)
2488 return NULL;
2489
2490 source_stmt2 = find_bswap_or_nop_1 (rhs2_stmt, &n2, limit - 1);
2491
2492 if (!source_stmt2)
2493 return NULL;
2494
2495 if (TYPE_PRECISION (n1.type) != TYPE_PRECISION (n2.type))
2496 return NULL;
2497
2498 if (!n1.vuse != !n2.vuse
2499 || (n1.vuse && !operand_equal_p (n1.vuse, n2.vuse, 0)))
2500 return NULL;
2501
2502 source_stmt
2503 = perform_symbolic_merge (source_stmt1, &n1, source_stmt2, &n2, n);
2504
2505 if (!source_stmt)
2506 return NULL;
2507
2508 if (!verify_symbolic_number_p (n, stmt))
2509 return NULL;
2510
2511 break;
2512 default:
2513 return NULL;
2514 }
2515 return source_stmt;
2516 }
2517 return NULL;
2518 }
2519
2520 /* Check if STMT completes a bswap implementation or a read in a given
2521 endianness consisting of ORs, SHIFTs and ANDs and sets *BSWAP
2522 accordingly. It also sets N to represent the kind of operations
2523 performed: size of the resulting expression and whether it works on
2524 a memory source, and if so alias-set and vuse. At last, the
2525 function returns a stmt whose rhs's first tree is the source
2526 expression. */
2527
2528 static gimple *
2529 find_bswap_or_nop (gimple *stmt, struct symbolic_number *n, bool *bswap)
2530 {
2531 unsigned rsize;
2532 uint64_t tmpn, mask;
2533 /* The number which the find_bswap_or_nop_1 result should match in order
2534 to have a full byte swap. The number is shifted to the right
2535 according to the size of the symbolic number before using it. */
2536 uint64_t cmpxchg = CMPXCHG;
2537 uint64_t cmpnop = CMPNOP;
2538
2539 gimple *ins_stmt;
2540 int limit;
2541
2542 /* The last parameter determines the depth search limit. It usually
2543 correlates directly to the number n of bytes to be touched. We
2544 increase that number by log2(n) + 1 here in order to also
2545 cover signed -> unsigned conversions of the src operand as can be seen
2546 in libgcc, and for initial shift/and operation of the src operand. */
2547 limit = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (gimple_expr_type (stmt)));
2548 limit += 1 + (int) ceil_log2 ((unsigned HOST_WIDE_INT) limit);
2549 ins_stmt = find_bswap_or_nop_1 (stmt, n, limit);
2550
2551 if (!ins_stmt)
2552 return NULL;
2553
2554 /* Find real size of result (highest non-zero byte). */
2555 if (n->base_addr)
2556 for (tmpn = n->n, rsize = 0; tmpn; tmpn >>= BITS_PER_MARKER, rsize++);
2557 else
2558 rsize = n->range;
2559
2560 /* Zero out the bits corresponding to untouched bytes in original gimple
2561 expression. */
2562 if (n->range < (int) sizeof (int64_t))
2563 {
2564 mask = ((uint64_t) 1 << (n->range * BITS_PER_MARKER)) - 1;
2565 cmpxchg >>= (64 / BITS_PER_MARKER - n->range) * BITS_PER_MARKER;
2566 cmpnop &= mask;
2567 }
2568
2569 /* Zero out the bits corresponding to unused bytes in the result of the
2570 gimple expression. */
2571 if (rsize < n->range)
2572 {
2573 if (BYTES_BIG_ENDIAN)
2574 {
2575 mask = ((uint64_t) 1 << (rsize * BITS_PER_MARKER)) - 1;
2576 cmpxchg &= mask;
2577 cmpnop >>= (n->range - rsize) * BITS_PER_MARKER;
2578 }
2579 else
2580 {
2581 mask = ((uint64_t) 1 << (rsize * BITS_PER_MARKER)) - 1;
2582 cmpxchg >>= (n->range - rsize) * BITS_PER_MARKER;
2583 cmpnop &= mask;
2584 }
2585 n->range = rsize;
2586 }
2587
2588 /* A complete byte swap should make the symbolic number to start with
2589 the largest digit in the highest order byte. Unchanged symbolic
2590 number indicates a read with same endianness as target architecture. */
2591 if (n->n == cmpnop)
2592 *bswap = false;
2593 else if (n->n == cmpxchg)
2594 *bswap = true;
2595 else
2596 return NULL;
2597
2598 /* Useless bit manipulation performed by code. */
2599 if (!n->base_addr && n->n == cmpnop && n->n_ops == 1)
2600 return NULL;
2601
2602 n->range *= BITS_PER_UNIT;
2603 return ins_stmt;
2604 }
2605
2606 namespace {
2607
2608 const pass_data pass_data_optimize_bswap =
2609 {
2610 GIMPLE_PASS, /* type */
2611 "bswap", /* name */
2612 OPTGROUP_NONE, /* optinfo_flags */
2613 TV_NONE, /* tv_id */
2614 PROP_ssa, /* properties_required */
2615 0, /* properties_provided */
2616 0, /* properties_destroyed */
2617 0, /* todo_flags_start */
2618 0, /* todo_flags_finish */
2619 };
2620
2621 class pass_optimize_bswap : public gimple_opt_pass
2622 {
2623 public:
2624 pass_optimize_bswap (gcc::context *ctxt)
2625 : gimple_opt_pass (pass_data_optimize_bswap, ctxt)
2626 {}
2627
2628 /* opt_pass methods: */
2629 virtual bool gate (function *)
2630 {
2631 return flag_expensive_optimizations && optimize;
2632 }
2633
2634 virtual unsigned int execute (function *);
2635
2636 }; // class pass_optimize_bswap
2637
2638 /* Perform the bswap optimization: replace the expression computed in the rhs
2639 of CUR_STMT by an equivalent bswap, load or load + bswap expression.
2640 Which of these alternatives replace the rhs is given by N->base_addr (non
2641 null if a load is needed) and BSWAP. The type, VUSE and set-alias of the
2642 load to perform are also given in N while the builtin bswap invoke is given
2643 in FNDEL. Finally, if a load is involved, SRC_STMT refers to one of the
2644 load statements involved to construct the rhs in CUR_STMT and N->range gives
2645 the size of the rhs expression for maintaining some statistics.
2646
2647 Note that if the replacement involve a load, CUR_STMT is moved just after
2648 SRC_STMT to do the load with the same VUSE which can lead to CUR_STMT
2649 changing of basic block. */
2650
2651 static bool
2652 bswap_replace (gimple *cur_stmt, gimple *ins_stmt, tree fndecl,
2653 tree bswap_type, tree load_type, struct symbolic_number *n,
2654 bool bswap)
2655 {
2656 gimple_stmt_iterator gsi;
2657 tree src, tmp, tgt;
2658 gimple *bswap_stmt;
2659
2660 gsi = gsi_for_stmt (cur_stmt);
2661 src = n->src;
2662 tgt = gimple_assign_lhs (cur_stmt);
2663
2664 /* Need to load the value from memory first. */
2665 if (n->base_addr)
2666 {
2667 gimple_stmt_iterator gsi_ins = gsi_for_stmt (ins_stmt);
2668 tree addr_expr, addr_tmp, val_expr, val_tmp;
2669 tree load_offset_ptr, aligned_load_type;
2670 gimple *addr_stmt, *load_stmt;
2671 unsigned align;
2672 HOST_WIDE_INT load_offset = 0;
2673 basic_block ins_bb, cur_bb;
2674
2675 ins_bb = gimple_bb (ins_stmt);
2676 cur_bb = gimple_bb (cur_stmt);
2677 if (!dominated_by_p (CDI_DOMINATORS, cur_bb, ins_bb))
2678 return false;
2679
2680 align = get_object_alignment (src);
2681
2682 /* Move cur_stmt just before one of the load of the original
2683 to ensure it has the same VUSE. See PR61517 for what could
2684 go wrong. */
2685 if (gimple_bb (cur_stmt) != gimple_bb (ins_stmt))
2686 reset_flow_sensitive_info (gimple_assign_lhs (cur_stmt));
2687 gsi_move_before (&gsi, &gsi_ins);
2688 gsi = gsi_for_stmt (cur_stmt);
2689
2690 /* Compute address to load from and cast according to the size
2691 of the load. */
2692 addr_expr = build_fold_addr_expr (unshare_expr (src));
2693 if (is_gimple_mem_ref_addr (addr_expr))
2694 addr_tmp = addr_expr;
2695 else
2696 {
2697 addr_tmp = make_temp_ssa_name (TREE_TYPE (addr_expr), NULL,
2698 "load_src");
2699 addr_stmt = gimple_build_assign (addr_tmp, addr_expr);
2700 gsi_insert_before (&gsi, addr_stmt, GSI_SAME_STMT);
2701 }
2702
2703 /* Perform the load. */
2704 aligned_load_type = load_type;
2705 if (align < TYPE_ALIGN (load_type))
2706 aligned_load_type = build_aligned_type (load_type, align);
2707 load_offset_ptr = build_int_cst (n->alias_set, load_offset);
2708 val_expr = fold_build2 (MEM_REF, aligned_load_type, addr_tmp,
2709 load_offset_ptr);
2710
2711 if (!bswap)
2712 {
2713 if (n->range == 16)
2714 nop_stats.found_16bit++;
2715 else if (n->range == 32)
2716 nop_stats.found_32bit++;
2717 else
2718 {
2719 gcc_assert (n->range == 64);
2720 nop_stats.found_64bit++;
2721 }
2722
2723 /* Convert the result of load if necessary. */
2724 if (!useless_type_conversion_p (TREE_TYPE (tgt), load_type))
2725 {
2726 val_tmp = make_temp_ssa_name (aligned_load_type, NULL,
2727 "load_dst");
2728 load_stmt = gimple_build_assign (val_tmp, val_expr);
2729 gimple_set_vuse (load_stmt, n->vuse);
2730 gsi_insert_before (&gsi, load_stmt, GSI_SAME_STMT);
2731 gimple_assign_set_rhs_with_ops (&gsi, NOP_EXPR, val_tmp);
2732 }
2733 else
2734 {
2735 gimple_assign_set_rhs_with_ops (&gsi, MEM_REF, val_expr);
2736 gimple_set_vuse (cur_stmt, n->vuse);
2737 }
2738 update_stmt (cur_stmt);
2739
2740 if (dump_file)
2741 {
2742 fprintf (dump_file,
2743 "%d bit load in target endianness found at: ",
2744 (int) n->range);
2745 print_gimple_stmt (dump_file, cur_stmt, 0);
2746 }
2747 return true;
2748 }
2749 else
2750 {
2751 val_tmp = make_temp_ssa_name (aligned_load_type, NULL, "load_dst");
2752 load_stmt = gimple_build_assign (val_tmp, val_expr);
2753 gimple_set_vuse (load_stmt, n->vuse);
2754 gsi_insert_before (&gsi, load_stmt, GSI_SAME_STMT);
2755 }
2756 src = val_tmp;
2757 }
2758 else if (!bswap)
2759 {
2760 gimple *g;
2761 if (!useless_type_conversion_p (TREE_TYPE (tgt), TREE_TYPE (src)))
2762 {
2763 if (!is_gimple_val (src))
2764 return false;
2765 g = gimple_build_assign (tgt, NOP_EXPR, src);
2766 }
2767 else
2768 g = gimple_build_assign (tgt, src);
2769 if (n->range == 16)
2770 nop_stats.found_16bit++;
2771 else if (n->range == 32)
2772 nop_stats.found_32bit++;
2773 else
2774 {
2775 gcc_assert (n->range == 64);
2776 nop_stats.found_64bit++;
2777 }
2778 if (dump_file)
2779 {
2780 fprintf (dump_file,
2781 "%d bit reshuffle in target endianness found at: ",
2782 (int) n->range);
2783 print_gimple_stmt (dump_file, cur_stmt, 0);
2784 }
2785 gsi_replace (&gsi, g, true);
2786 return true;
2787 }
2788 else if (TREE_CODE (src) == BIT_FIELD_REF)
2789 src = TREE_OPERAND (src, 0);
2790
2791 if (n->range == 16)
2792 bswap_stats.found_16bit++;
2793 else if (n->range == 32)
2794 bswap_stats.found_32bit++;
2795 else
2796 {
2797 gcc_assert (n->range == 64);
2798 bswap_stats.found_64bit++;
2799 }
2800
2801 tmp = src;
2802
2803 /* Convert the src expression if necessary. */
2804 if (!useless_type_conversion_p (TREE_TYPE (tmp), bswap_type))
2805 {
2806 gimple *convert_stmt;
2807
2808 tmp = make_temp_ssa_name (bswap_type, NULL, "bswapsrc");
2809 convert_stmt = gimple_build_assign (tmp, NOP_EXPR, src);
2810 gsi_insert_before (&gsi, convert_stmt, GSI_SAME_STMT);
2811 }
2812
2813 /* Canonical form for 16 bit bswap is a rotate expression. Only 16bit values
2814 are considered as rotation of 2N bit values by N bits is generally not
2815 equivalent to a bswap. Consider for instance 0x01020304 r>> 16 which
2816 gives 0x03040102 while a bswap for that value is 0x04030201. */
2817 if (bswap && n->range == 16)
2818 {
2819 tree count = build_int_cst (NULL, BITS_PER_UNIT);
2820 src = fold_build2 (LROTATE_EXPR, bswap_type, tmp, count);
2821 bswap_stmt = gimple_build_assign (NULL, src);
2822 }
2823 else
2824 bswap_stmt = gimple_build_call (fndecl, 1, tmp);
2825
2826 tmp = tgt;
2827
2828 /* Convert the result if necessary. */
2829 if (!useless_type_conversion_p (TREE_TYPE (tgt), bswap_type))
2830 {
2831 gimple *convert_stmt;
2832
2833 tmp = make_temp_ssa_name (bswap_type, NULL, "bswapdst");
2834 convert_stmt = gimple_build_assign (tgt, NOP_EXPR, tmp);
2835 gsi_insert_after (&gsi, convert_stmt, GSI_SAME_STMT);
2836 }
2837
2838 gimple_set_lhs (bswap_stmt, tmp);
2839
2840 if (dump_file)
2841 {
2842 fprintf (dump_file, "%d bit bswap implementation found at: ",
2843 (int) n->range);
2844 print_gimple_stmt (dump_file, cur_stmt, 0);
2845 }
2846
2847 gsi_insert_after (&gsi, bswap_stmt, GSI_SAME_STMT);
2848 gsi_remove (&gsi, true);
2849 return true;
2850 }
2851
2852 /* Find manual byte swap implementations as well as load in a given
2853 endianness. Byte swaps are turned into a bswap builtin invokation
2854 while endian loads are converted to bswap builtin invokation or
2855 simple load according to the target endianness. */
2856
2857 unsigned int
2858 pass_optimize_bswap::execute (function *fun)
2859 {
2860 basic_block bb;
2861 bool bswap32_p, bswap64_p;
2862 bool changed = false;
2863 tree bswap32_type = NULL_TREE, bswap64_type = NULL_TREE;
2864
2865 if (BITS_PER_UNIT != 8)
2866 return 0;
2867
2868 bswap32_p = (builtin_decl_explicit_p (BUILT_IN_BSWAP32)
2869 && optab_handler (bswap_optab, SImode) != CODE_FOR_nothing);
2870 bswap64_p = (builtin_decl_explicit_p (BUILT_IN_BSWAP64)
2871 && (optab_handler (bswap_optab, DImode) != CODE_FOR_nothing
2872 || (bswap32_p && word_mode == SImode)));
2873
2874 /* Determine the argument type of the builtins. The code later on
2875 assumes that the return and argument type are the same. */
2876 if (bswap32_p)
2877 {
2878 tree fndecl = builtin_decl_explicit (BUILT_IN_BSWAP32);
2879 bswap32_type = TREE_VALUE (TYPE_ARG_TYPES (TREE_TYPE (fndecl)));
2880 }
2881
2882 if (bswap64_p)
2883 {
2884 tree fndecl = builtin_decl_explicit (BUILT_IN_BSWAP64);
2885 bswap64_type = TREE_VALUE (TYPE_ARG_TYPES (TREE_TYPE (fndecl)));
2886 }
2887
2888 memset (&nop_stats, 0, sizeof (nop_stats));
2889 memset (&bswap_stats, 0, sizeof (bswap_stats));
2890 calculate_dominance_info (CDI_DOMINATORS);
2891
2892 FOR_EACH_BB_FN (bb, fun)
2893 {
2894 gimple_stmt_iterator gsi;
2895
2896 /* We do a reverse scan for bswap patterns to make sure we get the
2897 widest match. As bswap pattern matching doesn't handle previously
2898 inserted smaller bswap replacements as sub-patterns, the wider
2899 variant wouldn't be detected. */
2900 for (gsi = gsi_last_bb (bb); !gsi_end_p (gsi);)
2901 {
2902 gimple *ins_stmt, *cur_stmt = gsi_stmt (gsi);
2903 tree fndecl = NULL_TREE, bswap_type = NULL_TREE, load_type;
2904 enum tree_code code;
2905 struct symbolic_number n;
2906 bool bswap;
2907
2908 /* This gsi_prev (&gsi) is not part of the for loop because cur_stmt
2909 might be moved to a different basic block by bswap_replace and gsi
2910 must not points to it if that's the case. Moving the gsi_prev
2911 there make sure that gsi points to the statement previous to
2912 cur_stmt while still making sure that all statements are
2913 considered in this basic block. */
2914 gsi_prev (&gsi);
2915
2916 if (!is_gimple_assign (cur_stmt))
2917 continue;
2918
2919 code = gimple_assign_rhs_code (cur_stmt);
2920 switch (code)
2921 {
2922 case LROTATE_EXPR:
2923 case RROTATE_EXPR:
2924 if (!tree_fits_uhwi_p (gimple_assign_rhs2 (cur_stmt))
2925 || tree_to_uhwi (gimple_assign_rhs2 (cur_stmt))
2926 % BITS_PER_UNIT)
2927 continue;
2928 /* Fall through. */
2929 case BIT_IOR_EXPR:
2930 break;
2931 default:
2932 continue;
2933 }
2934
2935 ins_stmt = find_bswap_or_nop (cur_stmt, &n, &bswap);
2936
2937 if (!ins_stmt)
2938 continue;
2939
2940 switch (n.range)
2941 {
2942 case 16:
2943 /* Already in canonical form, nothing to do. */
2944 if (code == LROTATE_EXPR || code == RROTATE_EXPR)
2945 continue;
2946 load_type = bswap_type = uint16_type_node;
2947 break;
2948 case 32:
2949 load_type = uint32_type_node;
2950 if (bswap32_p)
2951 {
2952 fndecl = builtin_decl_explicit (BUILT_IN_BSWAP32);
2953 bswap_type = bswap32_type;
2954 }
2955 break;
2956 case 64:
2957 load_type = uint64_type_node;
2958 if (bswap64_p)
2959 {
2960 fndecl = builtin_decl_explicit (BUILT_IN_BSWAP64);
2961 bswap_type = bswap64_type;
2962 }
2963 break;
2964 default:
2965 continue;
2966 }
2967
2968 if (bswap && !fndecl && n.range != 16)
2969 continue;
2970
2971 if (bswap_replace (cur_stmt, ins_stmt, fndecl, bswap_type, load_type,
2972 &n, bswap))
2973 changed = true;
2974 }
2975 }
2976
2977 statistics_counter_event (fun, "16-bit nop implementations found",
2978 nop_stats.found_16bit);
2979 statistics_counter_event (fun, "32-bit nop implementations found",
2980 nop_stats.found_32bit);
2981 statistics_counter_event (fun, "64-bit nop implementations found",
2982 nop_stats.found_64bit);
2983 statistics_counter_event (fun, "16-bit bswap implementations found",
2984 bswap_stats.found_16bit);
2985 statistics_counter_event (fun, "32-bit bswap implementations found",
2986 bswap_stats.found_32bit);
2987 statistics_counter_event (fun, "64-bit bswap implementations found",
2988 bswap_stats.found_64bit);
2989
2990 return (changed ? TODO_update_ssa : 0);
2991 }
2992
2993 } // anon namespace
2994
2995 gimple_opt_pass *
2996 make_pass_optimize_bswap (gcc::context *ctxt)
2997 {
2998 return new pass_optimize_bswap (ctxt);
2999 }
3000
3001 /* Return true if stmt is a type conversion operation that can be stripped
3002 when used in a widening multiply operation. */
3003 static bool
3004 widening_mult_conversion_strippable_p (tree result_type, gimple *stmt)
3005 {
3006 enum tree_code rhs_code = gimple_assign_rhs_code (stmt);
3007
3008 if (TREE_CODE (result_type) == INTEGER_TYPE)
3009 {
3010 tree op_type;
3011 tree inner_op_type;
3012
3013 if (!CONVERT_EXPR_CODE_P (rhs_code))
3014 return false;
3015
3016 op_type = TREE_TYPE (gimple_assign_lhs (stmt));
3017
3018 /* If the type of OP has the same precision as the result, then
3019 we can strip this conversion. The multiply operation will be
3020 selected to create the correct extension as a by-product. */
3021 if (TYPE_PRECISION (result_type) == TYPE_PRECISION (op_type))
3022 return true;
3023
3024 /* We can also strip a conversion if it preserves the signed-ness of
3025 the operation and doesn't narrow the range. */
3026 inner_op_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
3027
3028 /* If the inner-most type is unsigned, then we can strip any
3029 intermediate widening operation. If it's signed, then the
3030 intermediate widening operation must also be signed. */
3031 if ((TYPE_UNSIGNED (inner_op_type)
3032 || TYPE_UNSIGNED (op_type) == TYPE_UNSIGNED (inner_op_type))
3033 && TYPE_PRECISION (op_type) > TYPE_PRECISION (inner_op_type))
3034 return true;
3035
3036 return false;
3037 }
3038
3039 return rhs_code == FIXED_CONVERT_EXPR;
3040 }
3041
3042 /* Return true if RHS is a suitable operand for a widening multiplication,
3043 assuming a target type of TYPE.
3044 There are two cases:
3045
3046 - RHS makes some value at least twice as wide. Store that value
3047 in *NEW_RHS_OUT if so, and store its type in *TYPE_OUT.
3048
3049 - RHS is an integer constant. Store that value in *NEW_RHS_OUT if so,
3050 but leave *TYPE_OUT untouched. */
3051
3052 static bool
3053 is_widening_mult_rhs_p (tree type, tree rhs, tree *type_out,
3054 tree *new_rhs_out)
3055 {
3056 gimple *stmt;
3057 tree type1, rhs1;
3058
3059 if (TREE_CODE (rhs) == SSA_NAME)
3060 {
3061 stmt = SSA_NAME_DEF_STMT (rhs);
3062 if (is_gimple_assign (stmt))
3063 {
3064 if (! widening_mult_conversion_strippable_p (type, stmt))
3065 rhs1 = rhs;
3066 else
3067 {
3068 rhs1 = gimple_assign_rhs1 (stmt);
3069
3070 if (TREE_CODE (rhs1) == INTEGER_CST)
3071 {
3072 *new_rhs_out = rhs1;
3073 *type_out = NULL;
3074 return true;
3075 }
3076 }
3077 }
3078 else
3079 rhs1 = rhs;
3080
3081 type1 = TREE_TYPE (rhs1);
3082
3083 if (TREE_CODE (type1) != TREE_CODE (type)
3084 || TYPE_PRECISION (type1) * 2 > TYPE_PRECISION (type))
3085 return false;
3086
3087 *new_rhs_out = rhs1;
3088 *type_out = type1;
3089 return true;
3090 }
3091
3092 if (TREE_CODE (rhs) == INTEGER_CST)
3093 {
3094 *new_rhs_out = rhs;
3095 *type_out = NULL;
3096 return true;
3097 }
3098
3099 return false;
3100 }
3101
3102 /* Return true if STMT performs a widening multiplication, assuming the
3103 output type is TYPE. If so, store the unwidened types of the operands
3104 in *TYPE1_OUT and *TYPE2_OUT respectively. Also fill *RHS1_OUT and
3105 *RHS2_OUT such that converting those operands to types *TYPE1_OUT
3106 and *TYPE2_OUT would give the operands of the multiplication. */
3107
3108 static bool
3109 is_widening_mult_p (gimple *stmt,
3110 tree *type1_out, tree *rhs1_out,
3111 tree *type2_out, tree *rhs2_out)
3112 {
3113 tree type = TREE_TYPE (gimple_assign_lhs (stmt));
3114
3115 if (TREE_CODE (type) != INTEGER_TYPE
3116 && TREE_CODE (type) != FIXED_POINT_TYPE)
3117 return false;
3118
3119 if (!is_widening_mult_rhs_p (type, gimple_assign_rhs1 (stmt), type1_out,
3120 rhs1_out))
3121 return false;
3122
3123 if (!is_widening_mult_rhs_p (type, gimple_assign_rhs2 (stmt), type2_out,
3124 rhs2_out))
3125 return false;
3126
3127 if (*type1_out == NULL)
3128 {
3129 if (*type2_out == NULL || !int_fits_type_p (*rhs1_out, *type2_out))
3130 return false;
3131 *type1_out = *type2_out;
3132 }
3133
3134 if (*type2_out == NULL)
3135 {
3136 if (!int_fits_type_p (*rhs2_out, *type1_out))
3137 return false;
3138 *type2_out = *type1_out;
3139 }
3140
3141 /* Ensure that the larger of the two operands comes first. */
3142 if (TYPE_PRECISION (*type1_out) < TYPE_PRECISION (*type2_out))
3143 {
3144 std::swap (*type1_out, *type2_out);
3145 std::swap (*rhs1_out, *rhs2_out);
3146 }
3147
3148 return true;
3149 }
3150
3151 /* Check to see if the CALL statement is an invocation of copysign
3152 with 1. being the first argument. */
3153 static bool
3154 is_copysign_call_with_1 (gimple *call)
3155 {
3156 gcall *c = dyn_cast <gcall *> (call);
3157 if (! c)
3158 return false;
3159
3160 enum combined_fn code = gimple_call_combined_fn (c);
3161
3162 if (code == CFN_LAST)
3163 return false;
3164
3165 if (builtin_fn_p (code))
3166 {
3167 switch (as_builtin_fn (code))
3168 {
3169 CASE_FLT_FN (BUILT_IN_COPYSIGN):
3170 CASE_FLT_FN_FLOATN_NX (BUILT_IN_COPYSIGN):
3171 return real_onep (gimple_call_arg (c, 0));
3172 default:
3173 return false;
3174 }
3175 }
3176
3177 if (internal_fn_p (code))
3178 {
3179 switch (as_internal_fn (code))
3180 {
3181 case IFN_COPYSIGN:
3182 return real_onep (gimple_call_arg (c, 0));
3183 default:
3184 return false;
3185 }
3186 }
3187
3188 return false;
3189 }
3190
3191 /* Try to expand the pattern x * copysign (1, y) into xorsign (x, y).
3192 This only happens when the the xorsign optab is defined, if the
3193 pattern is not a xorsign pattern or if expansion fails FALSE is
3194 returned, otherwise TRUE is returned. */
3195 static bool
3196 convert_expand_mult_copysign (gimple *stmt, gimple_stmt_iterator *gsi)
3197 {
3198 tree treeop0, treeop1, lhs, type;
3199 location_t loc = gimple_location (stmt);
3200 lhs = gimple_assign_lhs (stmt);
3201 treeop0 = gimple_assign_rhs1 (stmt);
3202 treeop1 = gimple_assign_rhs2 (stmt);
3203 type = TREE_TYPE (lhs);
3204 machine_mode mode = TYPE_MODE (type);
3205
3206 if (HONOR_SNANS (type))
3207 return false;
3208
3209 if (TREE_CODE (treeop0) == SSA_NAME && TREE_CODE (treeop1) == SSA_NAME)
3210 {
3211 gimple *call0 = SSA_NAME_DEF_STMT (treeop0);
3212 if (!has_single_use (treeop0) || !is_copysign_call_with_1 (call0))
3213 {
3214 call0 = SSA_NAME_DEF_STMT (treeop1);
3215 if (!has_single_use (treeop1) || !is_copysign_call_with_1 (call0))
3216 return false;
3217
3218 treeop1 = treeop0;
3219 }
3220 if (optab_handler (xorsign_optab, mode) == CODE_FOR_nothing)
3221 return false;
3222
3223 gcall *c = as_a<gcall*> (call0);
3224 treeop0 = gimple_call_arg (c, 1);
3225
3226 gcall *call_stmt
3227 = gimple_build_call_internal (IFN_XORSIGN, 2, treeop1, treeop0);
3228 gimple_set_lhs (call_stmt, lhs);
3229 gimple_set_location (call_stmt, loc);
3230 gsi_replace (gsi, call_stmt, true);
3231 return true;
3232 }
3233
3234 return false;
3235 }
3236
3237 /* Process a single gimple statement STMT, which has a MULT_EXPR as
3238 its rhs, and try to convert it into a WIDEN_MULT_EXPR. The return
3239 value is true iff we converted the statement. */
3240
3241 static bool
3242 convert_mult_to_widen (gimple *stmt, gimple_stmt_iterator *gsi)
3243 {
3244 tree lhs, rhs1, rhs2, type, type1, type2;
3245 enum insn_code handler;
3246 scalar_int_mode to_mode, from_mode, actual_mode;
3247 optab op;
3248 int actual_precision;
3249 location_t loc = gimple_location (stmt);
3250 bool from_unsigned1, from_unsigned2;
3251
3252 lhs = gimple_assign_lhs (stmt);
3253 type = TREE_TYPE (lhs);
3254 if (TREE_CODE (type) != INTEGER_TYPE)
3255 return false;
3256
3257 if (!is_widening_mult_p (stmt, &type1, &rhs1, &type2, &rhs2))
3258 return false;
3259
3260 to_mode = SCALAR_INT_TYPE_MODE (type);
3261 from_mode = SCALAR_INT_TYPE_MODE (type1);
3262 from_unsigned1 = TYPE_UNSIGNED (type1);
3263 from_unsigned2 = TYPE_UNSIGNED (type2);
3264
3265 if (from_unsigned1 && from_unsigned2)
3266 op = umul_widen_optab;
3267 else if (!from_unsigned1 && !from_unsigned2)
3268 op = smul_widen_optab;
3269 else
3270 op = usmul_widen_optab;
3271
3272 handler = find_widening_optab_handler_and_mode (op, to_mode, from_mode,
3273 &actual_mode);
3274
3275 if (handler == CODE_FOR_nothing)
3276 {
3277 if (op != smul_widen_optab)
3278 {
3279 /* We can use a signed multiply with unsigned types as long as
3280 there is a wider mode to use, or it is the smaller of the two
3281 types that is unsigned. Note that type1 >= type2, always. */
3282 if ((TYPE_UNSIGNED (type1)
3283 && TYPE_PRECISION (type1) == GET_MODE_PRECISION (from_mode))
3284 || (TYPE_UNSIGNED (type2)
3285 && TYPE_PRECISION (type2) == GET_MODE_PRECISION (from_mode)))
3286 {
3287 if (!GET_MODE_WIDER_MODE (from_mode).exists (&from_mode)
3288 || GET_MODE_SIZE (to_mode) <= GET_MODE_SIZE (from_mode))
3289 return false;
3290 }
3291
3292 op = smul_widen_optab;
3293 handler = find_widening_optab_handler_and_mode (op, to_mode,
3294 from_mode,
3295 &actual_mode);
3296
3297 if (handler == CODE_FOR_nothing)
3298 return false;
3299
3300 from_unsigned1 = from_unsigned2 = false;
3301 }
3302 else
3303 return false;
3304 }
3305
3306 /* Ensure that the inputs to the handler are in the correct precison
3307 for the opcode. This will be the full mode size. */
3308 actual_precision = GET_MODE_PRECISION (actual_mode);
3309 if (2 * actual_precision > TYPE_PRECISION (type))
3310 return false;
3311 if (actual_precision != TYPE_PRECISION (type1)
3312 || from_unsigned1 != TYPE_UNSIGNED (type1))
3313 rhs1 = build_and_insert_cast (gsi, loc,
3314 build_nonstandard_integer_type
3315 (actual_precision, from_unsigned1), rhs1);
3316 if (actual_precision != TYPE_PRECISION (type2)
3317 || from_unsigned2 != TYPE_UNSIGNED (type2))
3318 rhs2 = build_and_insert_cast (gsi, loc,
3319 build_nonstandard_integer_type
3320 (actual_precision, from_unsigned2), rhs2);
3321
3322 /* Handle constants. */
3323 if (TREE_CODE (rhs1) == INTEGER_CST)
3324 rhs1 = fold_convert (type1, rhs1);
3325 if (TREE_CODE (rhs2) == INTEGER_CST)
3326 rhs2 = fold_convert (type2, rhs2);
3327
3328 gimple_assign_set_rhs1 (stmt, rhs1);
3329 gimple_assign_set_rhs2 (stmt, rhs2);
3330 gimple_assign_set_rhs_code (stmt, WIDEN_MULT_EXPR);
3331 update_stmt (stmt);
3332 widen_mul_stats.widen_mults_inserted++;
3333 return true;
3334 }
3335
3336 /* Process a single gimple statement STMT, which is found at the
3337 iterator GSI and has a either a PLUS_EXPR or a MINUS_EXPR as its
3338 rhs (given by CODE), and try to convert it into a
3339 WIDEN_MULT_PLUS_EXPR or a WIDEN_MULT_MINUS_EXPR. The return value
3340 is true iff we converted the statement. */
3341
3342 static bool
3343 convert_plusminus_to_widen (gimple_stmt_iterator *gsi, gimple *stmt,
3344 enum tree_code code)
3345 {
3346 gimple *rhs1_stmt = NULL, *rhs2_stmt = NULL;
3347 gimple *conv1_stmt = NULL, *conv2_stmt = NULL, *conv_stmt;
3348 tree type, type1, type2, optype;
3349 tree lhs, rhs1, rhs2, mult_rhs1, mult_rhs2, add_rhs;
3350 enum tree_code rhs1_code = ERROR_MARK, rhs2_code = ERROR_MARK;
3351 optab this_optab;
3352 enum tree_code wmult_code;
3353 enum insn_code handler;
3354 scalar_mode to_mode, from_mode, actual_mode;
3355 location_t loc = gimple_location (stmt);
3356 int actual_precision;
3357 bool from_unsigned1, from_unsigned2;
3358
3359 lhs = gimple_assign_lhs (stmt);
3360 type = TREE_TYPE (lhs);
3361 if (TREE_CODE (type) != INTEGER_TYPE
3362 && TREE_CODE (type) != FIXED_POINT_TYPE)
3363 return false;
3364
3365 if (code == MINUS_EXPR)
3366 wmult_code = WIDEN_MULT_MINUS_EXPR;
3367 else
3368 wmult_code = WIDEN_MULT_PLUS_EXPR;
3369
3370 rhs1 = gimple_assign_rhs1 (stmt);
3371 rhs2 = gimple_assign_rhs2 (stmt);
3372
3373 if (TREE_CODE (rhs1) == SSA_NAME)
3374 {
3375 rhs1_stmt = SSA_NAME_DEF_STMT (rhs1);
3376 if (is_gimple_assign (rhs1_stmt))
3377 rhs1_code = gimple_assign_rhs_code (rhs1_stmt);
3378 }
3379
3380 if (TREE_CODE (rhs2) == SSA_NAME)
3381 {
3382 rhs2_stmt = SSA_NAME_DEF_STMT (rhs2);
3383 if (is_gimple_assign (rhs2_stmt))
3384 rhs2_code = gimple_assign_rhs_code (rhs2_stmt);
3385 }
3386
3387 /* Allow for one conversion statement between the multiply
3388 and addition/subtraction statement. If there are more than
3389 one conversions then we assume they would invalidate this
3390 transformation. If that's not the case then they should have
3391 been folded before now. */
3392 if (CONVERT_EXPR_CODE_P (rhs1_code))
3393 {
3394 conv1_stmt = rhs1_stmt;
3395 rhs1 = gimple_assign_rhs1 (rhs1_stmt);
3396 if (TREE_CODE (rhs1) == SSA_NAME)
3397 {
3398 rhs1_stmt = SSA_NAME_DEF_STMT (rhs1);
3399 if (is_gimple_assign (rhs1_stmt))
3400 rhs1_code = gimple_assign_rhs_code (rhs1_stmt);
3401 }
3402 else
3403 return false;
3404 }
3405 if (CONVERT_EXPR_CODE_P (rhs2_code))
3406 {
3407 conv2_stmt = rhs2_stmt;
3408 rhs2 = gimple_assign_rhs1 (rhs2_stmt);
3409 if (TREE_CODE (rhs2) == SSA_NAME)
3410 {
3411 rhs2_stmt = SSA_NAME_DEF_STMT (rhs2);
3412 if (is_gimple_assign (rhs2_stmt))
3413 rhs2_code = gimple_assign_rhs_code (rhs2_stmt);
3414 }
3415 else
3416 return false;
3417 }
3418
3419 /* If code is WIDEN_MULT_EXPR then it would seem unnecessary to call
3420 is_widening_mult_p, but we still need the rhs returns.
3421
3422 It might also appear that it would be sufficient to use the existing
3423 operands of the widening multiply, but that would limit the choice of
3424 multiply-and-accumulate instructions.
3425
3426 If the widened-multiplication result has more than one uses, it is
3427 probably wiser not to do the conversion. */
3428 if (code == PLUS_EXPR
3429 && (rhs1_code == MULT_EXPR || rhs1_code == WIDEN_MULT_EXPR))
3430 {
3431 if (!has_single_use (rhs1)
3432 || !is_widening_mult_p (rhs1_stmt, &type1, &mult_rhs1,
3433 &type2, &mult_rhs2))
3434 return false;
3435 add_rhs = rhs2;
3436 conv_stmt = conv1_stmt;
3437 }
3438 else if (rhs2_code == MULT_EXPR || rhs2_code == WIDEN_MULT_EXPR)
3439 {
3440 if (!has_single_use (rhs2)
3441 || !is_widening_mult_p (rhs2_stmt, &type1, &mult_rhs1,
3442 &type2, &mult_rhs2))
3443 return false;
3444 add_rhs = rhs1;
3445 conv_stmt = conv2_stmt;
3446 }
3447 else
3448 return false;
3449
3450 to_mode = SCALAR_TYPE_MODE (type);
3451 from_mode = SCALAR_TYPE_MODE (type1);
3452 from_unsigned1 = TYPE_UNSIGNED (type1);
3453 from_unsigned2 = TYPE_UNSIGNED (type2);
3454 optype = type1;
3455
3456 /* There's no such thing as a mixed sign madd yet, so use a wider mode. */
3457 if (from_unsigned1 != from_unsigned2)
3458 {
3459 if (!INTEGRAL_TYPE_P (type))
3460 return false;
3461 /* We can use a signed multiply with unsigned types as long as
3462 there is a wider mode to use, or it is the smaller of the two
3463 types that is unsigned. Note that type1 >= type2, always. */
3464 if ((from_unsigned1
3465 && TYPE_PRECISION (type1) == GET_MODE_PRECISION (from_mode))
3466 || (from_unsigned2
3467 && TYPE_PRECISION (type2) == GET_MODE_PRECISION (from_mode)))
3468 {
3469 if (!GET_MODE_WIDER_MODE (from_mode).exists (&from_mode)
3470 || GET_MODE_SIZE (from_mode) >= GET_MODE_SIZE (to_mode))
3471 return false;
3472 }
3473
3474 from_unsigned1 = from_unsigned2 = false;
3475 optype = build_nonstandard_integer_type (GET_MODE_PRECISION (from_mode),
3476 false);
3477 }
3478
3479 /* If there was a conversion between the multiply and addition
3480 then we need to make sure it fits a multiply-and-accumulate.
3481 The should be a single mode change which does not change the
3482 value. */
3483 if (conv_stmt)
3484 {
3485 /* We use the original, unmodified data types for this. */
3486 tree from_type = TREE_TYPE (gimple_assign_rhs1 (conv_stmt));
3487 tree to_type = TREE_TYPE (gimple_assign_lhs (conv_stmt));
3488 int data_size = TYPE_PRECISION (type1) + TYPE_PRECISION (type2);
3489 bool is_unsigned = TYPE_UNSIGNED (type1) && TYPE_UNSIGNED (type2);
3490
3491 if (TYPE_PRECISION (from_type) > TYPE_PRECISION (to_type))
3492 {
3493 /* Conversion is a truncate. */
3494 if (TYPE_PRECISION (to_type) < data_size)
3495 return false;
3496 }
3497 else if (TYPE_PRECISION (from_type) < TYPE_PRECISION (to_type))
3498 {
3499 /* Conversion is an extend. Check it's the right sort. */
3500 if (TYPE_UNSIGNED (from_type) != is_unsigned
3501 && !(is_unsigned && TYPE_PRECISION (from_type) > data_size))
3502 return false;
3503 }
3504 /* else convert is a no-op for our purposes. */
3505 }
3506
3507 /* Verify that the machine can perform a widening multiply
3508 accumulate in this mode/signedness combination, otherwise
3509 this transformation is likely to pessimize code. */
3510 this_optab = optab_for_tree_code (wmult_code, optype, optab_default);
3511 handler = find_widening_optab_handler_and_mode (this_optab, to_mode,
3512 from_mode, &actual_mode);
3513
3514 if (handler == CODE_FOR_nothing)
3515 return false;
3516
3517 /* Ensure that the inputs to the handler are in the correct precison
3518 for the opcode. This will be the full mode size. */
3519 actual_precision = GET_MODE_PRECISION (actual_mode);
3520 if (actual_precision != TYPE_PRECISION (type1)
3521 || from_unsigned1 != TYPE_UNSIGNED (type1))
3522 mult_rhs1 = build_and_insert_cast (gsi, loc,
3523 build_nonstandard_integer_type
3524 (actual_precision, from_unsigned1),
3525 mult_rhs1);
3526 if (actual_precision != TYPE_PRECISION (type2)
3527 || from_unsigned2 != TYPE_UNSIGNED (type2))
3528 mult_rhs2 = build_and_insert_cast (gsi, loc,
3529 build_nonstandard_integer_type
3530 (actual_precision, from_unsigned2),
3531 mult_rhs2);
3532
3533 if (!useless_type_conversion_p (type, TREE_TYPE (add_rhs)))
3534 add_rhs = build_and_insert_cast (gsi, loc, type, add_rhs);
3535
3536 /* Handle constants. */
3537 if (TREE_CODE (mult_rhs1) == INTEGER_CST)
3538 mult_rhs1 = fold_convert (type1, mult_rhs1);
3539 if (TREE_CODE (mult_rhs2) == INTEGER_CST)
3540 mult_rhs2 = fold_convert (type2, mult_rhs2);
3541
3542 gimple_assign_set_rhs_with_ops (gsi, wmult_code, mult_rhs1, mult_rhs2,
3543 add_rhs);
3544 update_stmt (gsi_stmt (*gsi));
3545 widen_mul_stats.maccs_inserted++;
3546 return true;
3547 }
3548
3549 /* Combine the multiplication at MUL_STMT with operands MULOP1 and MULOP2
3550 with uses in additions and subtractions to form fused multiply-add
3551 operations. Returns true if successful and MUL_STMT should be removed. */
3552
3553 static bool
3554 convert_mult_to_fma (gimple *mul_stmt, tree op1, tree op2)
3555 {
3556 tree mul_result = gimple_get_lhs (mul_stmt);
3557 tree type = TREE_TYPE (mul_result);
3558 gimple *use_stmt, *neguse_stmt;
3559 gassign *fma_stmt;
3560 use_operand_p use_p;
3561 imm_use_iterator imm_iter;
3562
3563 if (FLOAT_TYPE_P (type)
3564 && flag_fp_contract_mode == FP_CONTRACT_OFF)
3565 return false;
3566
3567 /* We don't want to do bitfield reduction ops. */
3568 if (INTEGRAL_TYPE_P (type)
3569 && !type_has_mode_precision_p (type))
3570 return false;
3571
3572 /* If the target doesn't support it, don't generate it. We assume that
3573 if fma isn't available then fms, fnma or fnms are not either. */
3574 if (optab_handler (fma_optab, TYPE_MODE (type)) == CODE_FOR_nothing)
3575 return false;
3576
3577 /* If the multiplication has zero uses, it is kept around probably because
3578 of -fnon-call-exceptions. Don't optimize it away in that case,
3579 it is DCE job. */
3580 if (has_zero_uses (mul_result))
3581 return false;
3582
3583 /* Make sure that the multiplication statement becomes dead after
3584 the transformation, thus that all uses are transformed to FMAs.
3585 This means we assume that an FMA operation has the same cost
3586 as an addition. */
3587 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, mul_result)
3588 {
3589 enum tree_code use_code;
3590 tree result = mul_result;
3591 bool negate_p = false;
3592
3593 use_stmt = USE_STMT (use_p);
3594
3595 if (is_gimple_debug (use_stmt))
3596 continue;
3597
3598 /* For now restrict this operations to single basic blocks. In theory
3599 we would want to support sinking the multiplication in
3600 m = a*b;
3601 if ()
3602 ma = m + c;
3603 else
3604 d = m;
3605 to form a fma in the then block and sink the multiplication to the
3606 else block. */
3607 if (gimple_bb (use_stmt) != gimple_bb (mul_stmt))
3608 return false;
3609
3610 if (!is_gimple_assign (use_stmt))
3611 return false;
3612
3613 use_code = gimple_assign_rhs_code (use_stmt);
3614
3615 /* A negate on the multiplication leads to FNMA. */
3616 if (use_code == NEGATE_EXPR)
3617 {
3618 ssa_op_iter iter;
3619 use_operand_p usep;
3620
3621 result = gimple_assign_lhs (use_stmt);
3622
3623 /* Make sure the negate statement becomes dead with this
3624 single transformation. */
3625 if (!single_imm_use (gimple_assign_lhs (use_stmt),
3626 &use_p, &neguse_stmt))
3627 return false;
3628
3629 /* Make sure the multiplication isn't also used on that stmt. */
3630 FOR_EACH_PHI_OR_STMT_USE (usep, neguse_stmt, iter, SSA_OP_USE)
3631 if (USE_FROM_PTR (usep) == mul_result)
3632 return false;
3633
3634 /* Re-validate. */
3635 use_stmt = neguse_stmt;
3636 if (gimple_bb (use_stmt) != gimple_bb (mul_stmt))
3637 return false;
3638 if (!is_gimple_assign (use_stmt))
3639 return false;
3640
3641 use_code = gimple_assign_rhs_code (use_stmt);
3642 negate_p = true;
3643 }
3644
3645 switch (use_code)
3646 {
3647 case MINUS_EXPR:
3648 if (gimple_assign_rhs2 (use_stmt) == result)
3649 negate_p = !negate_p;
3650 break;
3651 case PLUS_EXPR:
3652 break;
3653 default:
3654 /* FMA can only be formed from PLUS and MINUS. */
3655 return false;
3656 }
3657
3658 /* If the subtrahend (gimple_assign_rhs2 (use_stmt)) is computed
3659 by a MULT_EXPR that we'll visit later, we might be able to
3660 get a more profitable match with fnma.
3661 OTOH, if we don't, a negate / fma pair has likely lower latency
3662 that a mult / subtract pair. */
3663 if (use_code == MINUS_EXPR && !negate_p
3664 && gimple_assign_rhs1 (use_stmt) == result
3665 && optab_handler (fms_optab, TYPE_MODE (type)) == CODE_FOR_nothing
3666 && optab_handler (fnma_optab, TYPE_MODE (type)) != CODE_FOR_nothing)
3667 {
3668 tree rhs2 = gimple_assign_rhs2 (use_stmt);
3669
3670 if (TREE_CODE (rhs2) == SSA_NAME)
3671 {
3672 gimple *stmt2 = SSA_NAME_DEF_STMT (rhs2);
3673 if (has_single_use (rhs2)
3674 && is_gimple_assign (stmt2)
3675 && gimple_assign_rhs_code (stmt2) == MULT_EXPR)
3676 return false;
3677 }
3678 }
3679
3680 /* We can't handle a * b + a * b. */
3681 if (gimple_assign_rhs1 (use_stmt) == gimple_assign_rhs2 (use_stmt))
3682 return false;
3683
3684 /* While it is possible to validate whether or not the exact form
3685 that we've recognized is available in the backend, the assumption
3686 is that the transformation is never a loss. For instance, suppose
3687 the target only has the plain FMA pattern available. Consider
3688 a*b-c -> fma(a,b,-c): we've exchanged MUL+SUB for FMA+NEG, which
3689 is still two operations. Consider -(a*b)-c -> fma(-a,b,-c): we
3690 still have 3 operations, but in the FMA form the two NEGs are
3691 independent and could be run in parallel. */
3692 }
3693
3694 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, mul_result)
3695 {
3696 gimple_stmt_iterator gsi = gsi_for_stmt (use_stmt);
3697 enum tree_code use_code;
3698 tree addop, mulop1 = op1, result = mul_result;
3699 bool negate_p = false;
3700
3701 if (is_gimple_debug (use_stmt))
3702 continue;
3703
3704 use_code = gimple_assign_rhs_code (use_stmt);
3705 if (use_code == NEGATE_EXPR)
3706 {
3707 result = gimple_assign_lhs (use_stmt);
3708 single_imm_use (gimple_assign_lhs (use_stmt), &use_p, &neguse_stmt);
3709 gsi_remove (&gsi, true);
3710 release_defs (use_stmt);
3711
3712 use_stmt = neguse_stmt;
3713 gsi = gsi_for_stmt (use_stmt);
3714 use_code = gimple_assign_rhs_code (use_stmt);
3715 negate_p = true;
3716 }
3717
3718 if (gimple_assign_rhs1 (use_stmt) == result)
3719 {
3720 addop = gimple_assign_rhs2 (use_stmt);
3721 /* a * b - c -> a * b + (-c) */
3722 if (gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
3723 addop = force_gimple_operand_gsi (&gsi,
3724 build1 (NEGATE_EXPR,
3725 type, addop),
3726 true, NULL_TREE, true,
3727 GSI_SAME_STMT);
3728 }
3729 else
3730 {
3731 addop = gimple_assign_rhs1 (use_stmt);
3732 /* a - b * c -> (-b) * c + a */
3733 if (gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
3734 negate_p = !negate_p;
3735 }
3736
3737 if (negate_p)
3738 mulop1 = force_gimple_operand_gsi (&gsi,
3739 build1 (NEGATE_EXPR,
3740 type, mulop1),
3741 true, NULL_TREE, true,
3742 GSI_SAME_STMT);
3743
3744 fma_stmt = gimple_build_assign (gimple_assign_lhs (use_stmt),
3745 FMA_EXPR, mulop1, op2, addop);
3746 gsi_replace (&gsi, fma_stmt, true);
3747 widen_mul_stats.fmas_inserted++;
3748 }
3749
3750 return true;
3751 }
3752
3753
3754 /* Helper function of match_uaddsub_overflow. Return 1
3755 if USE_STMT is unsigned overflow check ovf != 0 for
3756 STMT, -1 if USE_STMT is unsigned overflow check ovf == 0
3757 and 0 otherwise. */
3758
3759 static int
3760 uaddsub_overflow_check_p (gimple *stmt, gimple *use_stmt)
3761 {
3762 enum tree_code ccode = ERROR_MARK;
3763 tree crhs1 = NULL_TREE, crhs2 = NULL_TREE;
3764 if (gimple_code (use_stmt) == GIMPLE_COND)
3765 {
3766 ccode = gimple_cond_code (use_stmt);
3767 crhs1 = gimple_cond_lhs (use_stmt);
3768 crhs2 = gimple_cond_rhs (use_stmt);
3769 }
3770 else if (is_gimple_assign (use_stmt))
3771 {
3772 if (gimple_assign_rhs_class (use_stmt) == GIMPLE_BINARY_RHS)
3773 {
3774 ccode = gimple_assign_rhs_code (use_stmt);
3775 crhs1 = gimple_assign_rhs1 (use_stmt);
3776 crhs2 = gimple_assign_rhs2 (use_stmt);
3777 }
3778 else if (gimple_assign_rhs_code (use_stmt) == COND_EXPR)
3779 {
3780 tree cond = gimple_assign_rhs1 (use_stmt);
3781 if (COMPARISON_CLASS_P (cond))
3782 {
3783 ccode = TREE_CODE (cond);
3784 crhs1 = TREE_OPERAND (cond, 0);
3785 crhs2 = TREE_OPERAND (cond, 1);
3786 }
3787 else
3788 return 0;
3789 }
3790 else
3791 return 0;
3792 }
3793 else
3794 return 0;
3795
3796 if (TREE_CODE_CLASS (ccode) != tcc_comparison)
3797 return 0;
3798
3799 enum tree_code code = gimple_assign_rhs_code (stmt);
3800 tree lhs = gimple_assign_lhs (stmt);
3801 tree rhs1 = gimple_assign_rhs1 (stmt);
3802 tree rhs2 = gimple_assign_rhs2 (stmt);
3803
3804 switch (ccode)
3805 {
3806 case GT_EXPR:
3807 case LE_EXPR:
3808 /* r = a - b; r > a or r <= a
3809 r = a + b; a > r or a <= r or b > r or b <= r. */
3810 if ((code == MINUS_EXPR && crhs1 == lhs && crhs2 == rhs1)
3811 || (code == PLUS_EXPR && (crhs1 == rhs1 || crhs1 == rhs2)
3812 && crhs2 == lhs))
3813 return ccode == GT_EXPR ? 1 : -1;
3814 break;
3815 case LT_EXPR:
3816 case GE_EXPR:
3817 /* r = a - b; a < r or a >= r
3818 r = a + b; r < a or r >= a or r < b or r >= b. */
3819 if ((code == MINUS_EXPR && crhs1 == rhs1 && crhs2 == lhs)
3820 || (code == PLUS_EXPR && crhs1 == lhs
3821 && (crhs2 == rhs1 || crhs2 == rhs2)))
3822 return ccode == LT_EXPR ? 1 : -1;
3823 break;
3824 default:
3825 break;
3826 }
3827 return 0;
3828 }
3829
3830 /* Recognize for unsigned x
3831 x = y - z;
3832 if (x > y)
3833 where there are other uses of x and replace it with
3834 _7 = SUB_OVERFLOW (y, z);
3835 x = REALPART_EXPR <_7>;
3836 _8 = IMAGPART_EXPR <_7>;
3837 if (_8)
3838 and similarly for addition. */
3839
3840 static bool
3841 match_uaddsub_overflow (gimple_stmt_iterator *gsi, gimple *stmt,
3842 enum tree_code code)
3843 {
3844 tree lhs = gimple_assign_lhs (stmt);
3845 tree type = TREE_TYPE (lhs);
3846 use_operand_p use_p;
3847 imm_use_iterator iter;
3848 bool use_seen = false;
3849 bool ovf_use_seen = false;
3850 gimple *use_stmt;
3851
3852 gcc_checking_assert (code == PLUS_EXPR || code == MINUS_EXPR);
3853 if (!INTEGRAL_TYPE_P (type)
3854 || !TYPE_UNSIGNED (type)
3855 || has_zero_uses (lhs)
3856 || has_single_use (lhs)
3857 || optab_handler (code == PLUS_EXPR ? uaddv4_optab : usubv4_optab,
3858 TYPE_MODE (type)) == CODE_FOR_nothing)
3859 return false;
3860
3861 FOR_EACH_IMM_USE_FAST (use_p, iter, lhs)
3862 {
3863 use_stmt = USE_STMT (use_p);
3864 if (is_gimple_debug (use_stmt))
3865 continue;
3866
3867 if (uaddsub_overflow_check_p (stmt, use_stmt))
3868 ovf_use_seen = true;
3869 else
3870 use_seen = true;
3871 if (ovf_use_seen && use_seen)
3872 break;
3873 }
3874
3875 if (!ovf_use_seen || !use_seen)
3876 return false;
3877
3878 tree ctype = build_complex_type (type);
3879 tree rhs1 = gimple_assign_rhs1 (stmt);
3880 tree rhs2 = gimple_assign_rhs2 (stmt);
3881 gcall *g = gimple_build_call_internal (code == PLUS_EXPR
3882 ? IFN_ADD_OVERFLOW : IFN_SUB_OVERFLOW,
3883 2, rhs1, rhs2);
3884 tree ctmp = make_ssa_name (ctype);
3885 gimple_call_set_lhs (g, ctmp);
3886 gsi_insert_before (gsi, g, GSI_SAME_STMT);
3887 gassign *g2 = gimple_build_assign (lhs, REALPART_EXPR,
3888 build1 (REALPART_EXPR, type, ctmp));
3889 gsi_replace (gsi, g2, true);
3890 tree ovf = make_ssa_name (type);
3891 g2 = gimple_build_assign (ovf, IMAGPART_EXPR,
3892 build1 (IMAGPART_EXPR, type, ctmp));
3893 gsi_insert_after (gsi, g2, GSI_NEW_STMT);
3894
3895 FOR_EACH_IMM_USE_STMT (use_stmt, iter, lhs)
3896 {
3897 if (is_gimple_debug (use_stmt))
3898 continue;
3899
3900 int ovf_use = uaddsub_overflow_check_p (stmt, use_stmt);
3901 if (ovf_use == 0)
3902 continue;
3903 if (gimple_code (use_stmt) == GIMPLE_COND)
3904 {
3905 gcond *cond_stmt = as_a <gcond *> (use_stmt);
3906 gimple_cond_set_lhs (cond_stmt, ovf);
3907 gimple_cond_set_rhs (cond_stmt, build_int_cst (type, 0));
3908 gimple_cond_set_code (cond_stmt, ovf_use == 1 ? NE_EXPR : EQ_EXPR);
3909 }
3910 else
3911 {
3912 gcc_checking_assert (is_gimple_assign (use_stmt));
3913 if (gimple_assign_rhs_class (use_stmt) == GIMPLE_BINARY_RHS)
3914 {
3915 gimple_assign_set_rhs1 (use_stmt, ovf);
3916 gimple_assign_set_rhs2 (use_stmt, build_int_cst (type, 0));
3917 gimple_assign_set_rhs_code (use_stmt,
3918 ovf_use == 1 ? NE_EXPR : EQ_EXPR);
3919 }
3920 else
3921 {
3922 gcc_checking_assert (gimple_assign_rhs_code (use_stmt)
3923 == COND_EXPR);
3924 tree cond = build2 (ovf_use == 1 ? NE_EXPR : EQ_EXPR,
3925 boolean_type_node, ovf,
3926 build_int_cst (type, 0));
3927 gimple_assign_set_rhs1 (use_stmt, cond);
3928 }
3929 }
3930 update_stmt (use_stmt);
3931 }
3932 return true;
3933 }
3934
3935 /* Return true if target has support for divmod. */
3936
3937 static bool
3938 target_supports_divmod_p (optab divmod_optab, optab div_optab, machine_mode mode)
3939 {
3940 /* If target supports hardware divmod insn, use it for divmod. */
3941 if (optab_handler (divmod_optab, mode) != CODE_FOR_nothing)
3942 return true;
3943
3944 /* Check if libfunc for divmod is available. */
3945 rtx libfunc = optab_libfunc (divmod_optab, mode);
3946 if (libfunc != NULL_RTX)
3947 {
3948 /* If optab_handler exists for div_optab, perhaps in a wider mode,
3949 we don't want to use the libfunc even if it exists for given mode. */
3950 machine_mode div_mode;
3951 FOR_EACH_MODE_FROM (div_mode, mode)
3952 if (optab_handler (div_optab, div_mode) != CODE_FOR_nothing)
3953 return false;
3954
3955 return targetm.expand_divmod_libfunc != NULL;
3956 }
3957
3958 return false;
3959 }
3960
3961 /* Check if stmt is candidate for divmod transform. */
3962
3963 static bool
3964 divmod_candidate_p (gassign *stmt)
3965 {
3966 tree type = TREE_TYPE (gimple_assign_lhs (stmt));
3967 machine_mode mode = TYPE_MODE (type);
3968 optab divmod_optab, div_optab;
3969
3970 if (TYPE_UNSIGNED (type))
3971 {
3972 divmod_optab = udivmod_optab;
3973 div_optab = udiv_optab;
3974 }
3975 else
3976 {
3977 divmod_optab = sdivmod_optab;
3978 div_optab = sdiv_optab;
3979 }
3980
3981 tree op1 = gimple_assign_rhs1 (stmt);
3982 tree op2 = gimple_assign_rhs2 (stmt);
3983
3984 /* Disable the transform if either is a constant, since division-by-constant
3985 may have specialized expansion. */
3986 if (CONSTANT_CLASS_P (op1) || CONSTANT_CLASS_P (op2))
3987 return false;
3988
3989 /* Exclude the case where TYPE_OVERFLOW_TRAPS (type) as that should
3990 expand using the [su]divv optabs. */
3991 if (TYPE_OVERFLOW_TRAPS (type))
3992 return false;
3993
3994 if (!target_supports_divmod_p (divmod_optab, div_optab, mode))
3995 return false;
3996
3997 return true;
3998 }
3999
4000 /* This function looks for:
4001 t1 = a TRUNC_DIV_EXPR b;
4002 t2 = a TRUNC_MOD_EXPR b;
4003 and transforms it to the following sequence:
4004 complex_tmp = DIVMOD (a, b);
4005 t1 = REALPART_EXPR(a);
4006 t2 = IMAGPART_EXPR(b);
4007 For conditions enabling the transform see divmod_candidate_p().
4008
4009 The pass has three parts:
4010 1) Find top_stmt which is trunc_div or trunc_mod stmt and dominates all
4011 other trunc_div_expr and trunc_mod_expr stmts.
4012 2) Add top_stmt and all trunc_div and trunc_mod stmts dominated by top_stmt
4013 to stmts vector.
4014 3) Insert DIVMOD call just before top_stmt and update entries in
4015 stmts vector to use return value of DIMOVD (REALEXPR_PART for div,
4016 IMAGPART_EXPR for mod). */
4017
4018 static bool
4019 convert_to_divmod (gassign *stmt)
4020 {
4021 if (stmt_can_throw_internal (stmt)
4022 || !divmod_candidate_p (stmt))
4023 return false;
4024
4025 tree op1 = gimple_assign_rhs1 (stmt);
4026 tree op2 = gimple_assign_rhs2 (stmt);
4027
4028 imm_use_iterator use_iter;
4029 gimple *use_stmt;
4030 auto_vec<gimple *> stmts;
4031
4032 gimple *top_stmt = stmt;
4033 basic_block top_bb = gimple_bb (stmt);
4034
4035 /* Part 1: Try to set top_stmt to "topmost" stmt that dominates
4036 at-least stmt and possibly other trunc_div/trunc_mod stmts
4037 having same operands as stmt. */
4038
4039 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, op1)
4040 {
4041 if (is_gimple_assign (use_stmt)
4042 && (gimple_assign_rhs_code (use_stmt) == TRUNC_DIV_EXPR
4043 || gimple_assign_rhs_code (use_stmt) == TRUNC_MOD_EXPR)
4044 && operand_equal_p (op1, gimple_assign_rhs1 (use_stmt), 0)
4045 && operand_equal_p (op2, gimple_assign_rhs2 (use_stmt), 0))
4046 {
4047 if (stmt_can_throw_internal (use_stmt))
4048 continue;
4049
4050 basic_block bb = gimple_bb (use_stmt);
4051
4052 if (bb == top_bb)
4053 {
4054 if (gimple_uid (use_stmt) < gimple_uid (top_stmt))
4055 top_stmt = use_stmt;
4056 }
4057 else if (dominated_by_p (CDI_DOMINATORS, top_bb, bb))
4058 {
4059 top_bb = bb;
4060 top_stmt = use_stmt;
4061 }
4062 }
4063 }
4064
4065 tree top_op1 = gimple_assign_rhs1 (top_stmt);
4066 tree top_op2 = gimple_assign_rhs2 (top_stmt);
4067
4068 stmts.safe_push (top_stmt);
4069 bool div_seen = (gimple_assign_rhs_code (top_stmt) == TRUNC_DIV_EXPR);
4070
4071 /* Part 2: Add all trunc_div/trunc_mod statements domianted by top_bb
4072 to stmts vector. The 2nd loop will always add stmt to stmts vector, since
4073 gimple_bb (top_stmt) dominates gimple_bb (stmt), so the
4074 2nd loop ends up adding at-least single trunc_mod_expr stmt. */
4075
4076 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, top_op1)
4077 {
4078 if (is_gimple_assign (use_stmt)
4079 && (gimple_assign_rhs_code (use_stmt) == TRUNC_DIV_EXPR
4080 || gimple_assign_rhs_code (use_stmt) == TRUNC_MOD_EXPR)
4081 && operand_equal_p (top_op1, gimple_assign_rhs1 (use_stmt), 0)
4082 && operand_equal_p (top_op2, gimple_assign_rhs2 (use_stmt), 0))
4083 {
4084 if (use_stmt == top_stmt
4085 || stmt_can_throw_internal (use_stmt)
4086 || !dominated_by_p (CDI_DOMINATORS, gimple_bb (use_stmt), top_bb))
4087 continue;
4088
4089 stmts.safe_push (use_stmt);
4090 if (gimple_assign_rhs_code (use_stmt) == TRUNC_DIV_EXPR)
4091 div_seen = true;
4092 }
4093 }
4094
4095 if (!div_seen)
4096 return false;
4097
4098 /* Part 3: Create libcall to internal fn DIVMOD:
4099 divmod_tmp = DIVMOD (op1, op2). */
4100
4101 gcall *call_stmt = gimple_build_call_internal (IFN_DIVMOD, 2, op1, op2);
4102 tree res = make_temp_ssa_name (build_complex_type (TREE_TYPE (op1)),
4103 call_stmt, "divmod_tmp");
4104 gimple_call_set_lhs (call_stmt, res);
4105 /* We rejected throwing statements above. */
4106 gimple_call_set_nothrow (call_stmt, true);
4107
4108 /* Insert the call before top_stmt. */
4109 gimple_stmt_iterator top_stmt_gsi = gsi_for_stmt (top_stmt);
4110 gsi_insert_before (&top_stmt_gsi, call_stmt, GSI_SAME_STMT);
4111
4112 widen_mul_stats.divmod_calls_inserted++;
4113
4114 /* Update all statements in stmts vector:
4115 lhs = op1 TRUNC_DIV_EXPR op2 -> lhs = REALPART_EXPR<divmod_tmp>
4116 lhs = op1 TRUNC_MOD_EXPR op2 -> lhs = IMAGPART_EXPR<divmod_tmp>. */
4117
4118 for (unsigned i = 0; stmts.iterate (i, &use_stmt); ++i)
4119 {
4120 tree new_rhs;
4121
4122 switch (gimple_assign_rhs_code (use_stmt))
4123 {
4124 case TRUNC_DIV_EXPR:
4125 new_rhs = fold_build1 (REALPART_EXPR, TREE_TYPE (op1), res);
4126 break;
4127
4128 case TRUNC_MOD_EXPR:
4129 new_rhs = fold_build1 (IMAGPART_EXPR, TREE_TYPE (op1), res);
4130 break;
4131
4132 default:
4133 gcc_unreachable ();
4134 }
4135
4136 gimple_stmt_iterator gsi = gsi_for_stmt (use_stmt);
4137 gimple_assign_set_rhs_from_tree (&gsi, new_rhs);
4138 update_stmt (use_stmt);
4139 }
4140
4141 return true;
4142 }
4143
4144 /* Find integer multiplications where the operands are extended from
4145 smaller types, and replace the MULT_EXPR with a WIDEN_MULT_EXPR
4146 where appropriate. */
4147
4148 namespace {
4149
4150 const pass_data pass_data_optimize_widening_mul =
4151 {
4152 GIMPLE_PASS, /* type */
4153 "widening_mul", /* name */
4154 OPTGROUP_NONE, /* optinfo_flags */
4155 TV_NONE, /* tv_id */
4156 PROP_ssa, /* properties_required */
4157 0, /* properties_provided */
4158 0, /* properties_destroyed */
4159 0, /* todo_flags_start */
4160 TODO_update_ssa, /* todo_flags_finish */
4161 };
4162
4163 class pass_optimize_widening_mul : public gimple_opt_pass
4164 {
4165 public:
4166 pass_optimize_widening_mul (gcc::context *ctxt)
4167 : gimple_opt_pass (pass_data_optimize_widening_mul, ctxt)
4168 {}
4169
4170 /* opt_pass methods: */
4171 virtual bool gate (function *)
4172 {
4173 return flag_expensive_optimizations && optimize;
4174 }
4175
4176 virtual unsigned int execute (function *);
4177
4178 }; // class pass_optimize_widening_mul
4179
4180 unsigned int
4181 pass_optimize_widening_mul::execute (function *fun)
4182 {
4183 basic_block bb;
4184 bool cfg_changed = false;
4185
4186 memset (&widen_mul_stats, 0, sizeof (widen_mul_stats));
4187 calculate_dominance_info (CDI_DOMINATORS);
4188 renumber_gimple_stmt_uids ();
4189
4190 FOR_EACH_BB_FN (bb, fun)
4191 {
4192 gimple_stmt_iterator gsi;
4193
4194 for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi);)
4195 {
4196 gimple *stmt = gsi_stmt (gsi);
4197 enum tree_code code;
4198
4199 if (is_gimple_assign (stmt))
4200 {
4201 code = gimple_assign_rhs_code (stmt);
4202 switch (code)
4203 {
4204 case MULT_EXPR:
4205 if (!convert_mult_to_widen (stmt, &gsi)
4206 && !convert_expand_mult_copysign (stmt, &gsi)
4207 && convert_mult_to_fma (stmt,
4208 gimple_assign_rhs1 (stmt),
4209 gimple_assign_rhs2 (stmt)))
4210 {
4211 gsi_remove (&gsi, true);
4212 release_defs (stmt);
4213 continue;
4214 }
4215 break;
4216
4217 case PLUS_EXPR:
4218 case MINUS_EXPR:
4219 if (!convert_plusminus_to_widen (&gsi, stmt, code))
4220 match_uaddsub_overflow (&gsi, stmt, code);
4221 break;
4222
4223 case TRUNC_MOD_EXPR:
4224 convert_to_divmod (as_a<gassign *> (stmt));
4225 break;
4226
4227 default:;
4228 }
4229 }
4230 else if (is_gimple_call (stmt)
4231 && gimple_call_lhs (stmt))
4232 {
4233 tree fndecl = gimple_call_fndecl (stmt);
4234 if (fndecl
4235 && gimple_call_builtin_p (stmt, BUILT_IN_NORMAL))
4236 {
4237 switch (DECL_FUNCTION_CODE (fndecl))
4238 {
4239 case BUILT_IN_POWF:
4240 case BUILT_IN_POW:
4241 case BUILT_IN_POWL:
4242 if (TREE_CODE (gimple_call_arg (stmt, 1)) == REAL_CST
4243 && real_equal
4244 (&TREE_REAL_CST (gimple_call_arg (stmt, 1)),
4245 &dconst2)
4246 && convert_mult_to_fma (stmt,
4247 gimple_call_arg (stmt, 0),
4248 gimple_call_arg (stmt, 0)))
4249 {
4250 unlink_stmt_vdef (stmt);
4251 if (gsi_remove (&gsi, true)
4252 && gimple_purge_dead_eh_edges (bb))
4253 cfg_changed = true;
4254 release_defs (stmt);
4255 continue;
4256 }
4257 break;
4258
4259 default:;
4260 }
4261 }
4262 }
4263 gsi_next (&gsi);
4264 }
4265 }
4266
4267 statistics_counter_event (fun, "widening multiplications inserted",
4268 widen_mul_stats.widen_mults_inserted);
4269 statistics_counter_event (fun, "widening maccs inserted",
4270 widen_mul_stats.maccs_inserted);
4271 statistics_counter_event (fun, "fused multiply-adds inserted",
4272 widen_mul_stats.fmas_inserted);
4273 statistics_counter_event (fun, "divmod calls inserted",
4274 widen_mul_stats.divmod_calls_inserted);
4275
4276 return cfg_changed ? TODO_cleanup_cfg : 0;
4277 }
4278
4279 } // anon namespace
4280
4281 gimple_opt_pass *
4282 make_pass_optimize_widening_mul (gcc::context *ctxt)
4283 {
4284 return new pass_optimize_widening_mul (ctxt);
4285 }