gcc/tree-ssa-math-opts.c

   1 /* Global, SSA-based optimizations using mathematical identities.
   2    Copyright (C) 2005, 2006, 2007, 2008, 2009, 2010, 2011
   3    Free Software Foundation, Inc.
   4
   5 This file is part of GCC.
   6
   7 GCC is free software; you can redistribute it and/or modify it
   8 under the terms of the GNU General Public License as published by the
   9 Free Software Foundation; either version 3, or (at your option) any
  10 later version.
  11
  12 GCC is distributed in the hope that it will be useful, but WITHOUT
  13 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  14 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  15 for more details.
  16
  17 You should have received a copy of the GNU General Public License
  18 along with GCC; see the file COPYING3.  If not see
  19 <http://www.gnu.org/licenses/>.  */
  20
  21 /* Currently, the only mini-pass in this file tries to CSE reciprocal
  22    operations.  These are common in sequences such as this one:
  23
  24         modulus = sqrt(x*x + y*y + z*z);
  25         x = x / modulus;
  26         y = y / modulus;
  27         z = z / modulus;
  28
  29    that can be optimized to
  30
  31         modulus = sqrt(x*x + y*y + z*z);
  32         rmodulus = 1.0 / modulus;
  33         x = x * rmodulus;
  34         y = y * rmodulus;
  35         z = z * rmodulus;
  36
  37    We do this for loop invariant divisors, and with this pass whenever
  38    we notice that a division has the same divisor multiple times.
  39
  40    Of course, like in PRE, we don't insert a division if a dominator
  41    already has one.  However, this cannot be done as an extension of
  42    PRE for several reasons.
  43
  44    First of all, with some experiments it was found out that the
  45    transformation is not always useful if there are only two divisions
  46    by the same divisor.  This is probably because modern processors
  47    can pipeline the divisions; on older, in-order processors it should
  48    still be effective to optimize two divisions by the same number.
  49    We make this a param, and it shall be called N in the remainder of
  50    this comment.
  51
  52    Second, if trapping math is active, we have less freedom on where
  53    to insert divisions: we can only do so in basic blocks that already
  54    contain one.  (If divisions don't trap, instead, we can insert
  55    divisions elsewhere, which will be in blocks that are common dominators
  56    of those that have the division).
  57
  58    We really don't want to compute the reciprocal unless a division will
  59    be found.  To do this, we won't insert the division in a basic block
  60    that has less than N divisions *post-dominating* it.
  61
  62    The algorithm constructs a subset of the dominator tree, holding the
  63    blocks containing the divisions and the common dominators to them,
  64    and walks it twice.  The first walk is in post-order, and it annotates
  65    each block with the number of divisions that post-dominate it: this
  66    gives information on where divisions can be inserted profitably.
  67    The second walk is in pre-order, and it inserts divisions as explained
  68    above, and replaces divisions by multiplications.
  69
  70    In the best case, the cost of the pass is O(n_statements).  In the
  71    worst-case, the cost is due to creating the dominator tree subset,
  72    with a cost of O(n_basic_blocks ^ 2); however this can only happen
  73    for n_statements / n_basic_blocks statements.  So, the amortized cost
  74    of creating the dominator tree subset is O(n_basic_blocks) and the
  75    worst-case cost of the pass is O(n_statements * n_basic_blocks).
  76
  77    More practically, the cost will be small because there are few
  78    divisions, and they tend to be in the same basic block, so insert_bb
  79    is called very few times.
  80
  81    If we did this using domwalk.c, an efficient implementation would have
  82    to work on all the variables in a single pass, because we could not
  83    work on just a subset of the dominator tree, as we do now, and the
  84    cost would also be something like O(n_statements * n_basic_blocks).
  85    The data structures would be more complex in order to work on all the
  86    variables in a single pass.  */
  87
  88 #include "config.h"
  89 #include "system.h"
  90 #include "coretypes.h"
  91 #include "tm.h"
  92 #include "flags.h"
  93 #include "tree.h"
  94 #include "tree-flow.h"
  95 #include "tree-pass.h"
  96 #include "alloc-pool.h"
  97 #include "basic-block.h"
  98 #include "target.h"
  99 #include "gimple-pretty-print.h"
 100
 101 /* FIXME: RTL headers have to be included here for optabs.  */
 102 #include "rtl.h"                /* Because optabs.h wants enum rtx_code.  */
 103 #include "expr.h"               /* Because optabs.h wants sepops.  */
 104 #include "optabs.h"
 105
 106 /* This structure represents one basic block that either computes a
 107    division, or is a common dominator for basic block that compute a
 108    division.  */
 109 struct occurrence {
 110   /* The basic block represented by this structure.  */
 111   basic_block bb;
 112
 113   /* If non-NULL, the SSA_NAME holding the definition for a reciprocal
 114      inserted in BB.  */
 115   tree recip_def;
 116
 117   /* If non-NULL, the GIMPLE_ASSIGN for a reciprocal computation that
 118      was inserted in BB.  */
 119   gimple recip_def_stmt;
 120
 121   /* Pointer to a list of "struct occurrence"s for blocks dominated
 122      by BB.  */
 123   struct occurrence *children;
 124
 125   /* Pointer to the next "struct occurrence"s in the list of blocks
 126      sharing a common dominator.  */
 127   struct occurrence *next;
 128
 129   /* The number of divisions that are in BB before compute_merit.  The
 130      number of divisions that are in BB or post-dominate it after
 131      compute_merit.  */
 132   int num_divisions;
 133
 134   /* True if the basic block has a division, false if it is a common
 135      dominator for basic blocks that do.  If it is false and trapping
 136      math is active, BB is not a candidate for inserting a reciprocal.  */
 137   bool bb_has_division;
 138 };
 139
 140 static struct
 141 {
 142   /* Number of 1.0/X ops inserted.  */
 143   int rdivs_inserted;
 144
 145   /* Number of 1.0/FUNC ops inserted.  */
 146   int rfuncs_inserted;
 147 } reciprocal_stats;
 148
 149 static struct
 150 {
 151   /* Number of cexpi calls inserted.  */
 152   int inserted;
 153 } sincos_stats;
 154
 155 static struct
 156 {
 157   /* Number of hand-written 32-bit bswaps found.  */
 158   int found_32bit;
 159
 160   /* Number of hand-written 64-bit bswaps found.  */
 161   int found_64bit;
 162 } bswap_stats;
 163
 164 static struct
 165 {
 166   /* Number of widening multiplication ops inserted.  */
 167   int widen_mults_inserted;
 168
 169   /* Number of integer multiply-and-accumulate ops inserted.  */
 170   int maccs_inserted;
 171
 172   /* Number of fp fused multiply-add ops inserted.  */
 173   int fmas_inserted;
 174 } widen_mul_stats;
 175
 176 /* The instance of "struct occurrence" representing the highest
 177    interesting block in the dominator tree.  */
 178 static struct occurrence *occ_head;
 179
 180 /* Allocation pool for getting instances of "struct occurrence".  */
 181 static alloc_pool occ_pool;
 182
 183
 184
 185 /* Allocate and return a new struct occurrence for basic block BB, and
 186    whose children list is headed by CHILDREN.  */
 187 static struct occurrence *
 188 occ_new (basic_block bb, struct occurrence *children)
 189 {
 190   struct occurrence *occ;
 191
 192   bb->aux = occ = (struct occurrence *) pool_alloc (occ_pool);
 193   memset (occ, 0, sizeof (struct occurrence));
 194
 195   occ->bb = bb;
 196   occ->children = children;
 197   return occ;
 198 }
 199
 200
 201 /* Insert NEW_OCC into our subset of the dominator tree.  P_HEAD points to a
 202    list of "struct occurrence"s, one per basic block, having IDOM as
 203    their common dominator.
 204
 205    We try to insert NEW_OCC as deep as possible in the tree, and we also
 206    insert any other block that is a common dominator for BB and one
 207    block already in the tree.  */
 208
 209 static void
 210 insert_bb (struct occurrence *new_occ, basic_block idom,
 211            struct occurrence **p_head)
 212 {
 213   struct occurrence *occ, **p_occ;
 214
 215   for (p_occ = p_head; (occ = *p_occ) != NULL; )
 216     {
 217       basic_block bb = new_occ->bb, occ_bb = occ->bb;
 218       basic_block dom = nearest_common_dominator (CDI_DOMINATORS, occ_bb, bb);
 219       if (dom == bb)
 220         {
 221           /* BB dominates OCC_BB.  OCC becomes NEW_OCC's child: remove OCC
 222              from its list.  */
 223           *p_occ = occ->next;
 224           occ->next = new_occ->children;
 225           new_occ->children = occ;
 226
 227           /* Try the next block (it may as well be dominated by BB).  */
 228         }
 229
 230       else if (dom == occ_bb)
 231         {
 232           /* OCC_BB dominates BB.  Tail recurse to look deeper.  */
 233           insert_bb (new_occ, dom, &occ->children);
 234           return;
 235         }
 236
 237       else if (dom != idom)
 238         {
 239           gcc_assert (!dom->aux);
 240
 241           /* There is a dominator between IDOM and BB, add it and make
 242              two children out of NEW_OCC and OCC.  First, remove OCC from
 243              its list.  */
 244           *p_occ = occ->next;
 245           new_occ->next = occ;
 246           occ->next = NULL;
 247
 248           /* None of the previous blocks has DOM as a dominator: if we tail
 249              recursed, we would reexamine them uselessly. Just switch BB with
 250              DOM, and go on looking for blocks dominated by DOM.  */
 251           new_occ = occ_new (dom, new_occ);
 252         }
 253
 254       else
 255         {
 256           /* Nothing special, go on with the next element.  */
 257           p_occ = &occ->next;
 258         }
 259     }
 260
 261   /* No place was found as a child of IDOM.  Make BB a sibling of IDOM.  */
 262   new_occ->next = *p_head;
 263   *p_head = new_occ;
 264 }
 265
 266 /* Register that we found a division in BB.  */
 267
 268 static inline void
 269 register_division_in (basic_block bb)
 270 {
 271   struct occurrence *occ;
 272
 273   occ = (struct occurrence *) bb->aux;
 274   if (!occ)
 275     {
 276       occ = occ_new (bb, NULL);
 277       insert_bb (occ, ENTRY_BLOCK_PTR, &occ_head);
 278     }
 279
 280   occ->bb_has_division = true;
 281   occ->num_divisions++;
 282 }
 283
 284
 285 /* Compute the number of divisions that postdominate each block in OCC and
 286    its children.  */
 287
 288 static void
 289 compute_merit (struct occurrence *occ)
 290 {
 291   struct occurrence *occ_child;
 292   basic_block dom = occ->bb;
 293
 294   for (occ_child = occ->children; occ_child; occ_child = occ_child->next)
 295     {
 296       basic_block bb;
 297       if (occ_child->children)
 298         compute_merit (occ_child);
 299
 300       if (flag_exceptions)
 301         bb = single_noncomplex_succ (dom);
 302       else
 303         bb = dom;
 304
 305       if (dominated_by_p (CDI_POST_DOMINATORS, bb, occ_child->bb))
 306         occ->num_divisions += occ_child->num_divisions;
 307     }
 308 }
 309
 310
 311 /* Return whether USE_STMT is a floating-point division by DEF.  */
 312 static inline bool
 313 is_division_by (gimple use_stmt, tree def)
 314 {
 315   return is_gimple_assign (use_stmt)
 316          && gimple_assign_rhs_code (use_stmt) == RDIV_EXPR
 317          && gimple_assign_rhs2 (use_stmt) == def
 318          /* Do not recognize x / x as valid division, as we are getting
 319             confused later by replacing all immediate uses x in such
 320             a stmt.  */
 321          && gimple_assign_rhs1 (use_stmt) != def;
 322 }
 323
 324 /* Walk the subset of the dominator tree rooted at OCC, setting the
 325    RECIP_DEF field to a definition of 1.0 / DEF that can be used in
 326    the given basic block.  The field may be left NULL, of course,
 327    if it is not possible or profitable to do the optimization.
 328
 329    DEF_BSI is an iterator pointing at the statement defining DEF.
 330    If RECIP_DEF is set, a dominator already has a computation that can
 331    be used.  */
 332
 333 static void
 334 insert_reciprocals (gimple_stmt_iterator *def_gsi, struct occurrence *occ,
 335                     tree def, tree recip_def, int threshold)
 336 {
 337   tree type;
 338   gimple new_stmt;
 339   gimple_stmt_iterator gsi;
 340   struct occurrence *occ_child;
 341
 342   if (!recip_def
 343       && (occ->bb_has_division || !flag_trapping_math)
 344       && occ->num_divisions >= threshold)
 345     {
 346       /* Make a variable with the replacement and substitute it.  */
 347       type = TREE_TYPE (def);
 348       recip_def = create_tmp_reg (type, "reciptmp");
 349       new_stmt = gimple_build_assign_with_ops (RDIV_EXPR, recip_def,
 350                                                build_one_cst (type), def);
 351
 352       if (occ->bb_has_division)
 353         {
 354           /* Case 1: insert before an existing division.  */
 355           gsi = gsi_after_labels (occ->bb);
 356           while (!gsi_end_p (gsi) && !is_division_by (gsi_stmt (gsi), def))
 357             gsi_next (&gsi);
 358
 359           gsi_insert_before (&gsi, new_stmt, GSI_SAME_STMT);
 360         }
 361       else if (def_gsi && occ->bb == def_gsi->bb)
 362         {
 363           /* Case 2: insert right after the definition.  Note that this will
 364              never happen if the definition statement can throw, because in
 365              that case the sole successor of the statement's basic block will
 366              dominate all the uses as well.  */
 367           gsi_insert_after (def_gsi, new_stmt, GSI_NEW_STMT);
 368         }
 369       else
 370         {
 371           /* Case 3: insert in a basic block not containing defs/uses.  */
 372           gsi = gsi_after_labels (occ->bb);
 373           gsi_insert_before (&gsi, new_stmt, GSI_SAME_STMT);
 374         }
 375
 376       reciprocal_stats.rdivs_inserted++;
 377
 378       occ->recip_def_stmt = new_stmt;
 379     }
 380
 381   occ->recip_def = recip_def;
 382   for (occ_child = occ->children; occ_child; occ_child = occ_child->next)
 383     insert_reciprocals (def_gsi, occ_child, def, recip_def, threshold);
 384 }
 385
 386
 387 /* Replace the division at USE_P with a multiplication by the reciprocal, if
 388    possible.  */
 389
 390 static inline void
 391 replace_reciprocal (use_operand_p use_p)
 392 {
 393   gimple use_stmt = USE_STMT (use_p);
 394   basic_block bb = gimple_bb (use_stmt);
 395   struct occurrence *occ = (struct occurrence *) bb->aux;
 396
 397   if (optimize_bb_for_speed_p (bb)
 398       && occ->recip_def && use_stmt != occ->recip_def_stmt)
 399     {
 400       gimple_stmt_iterator gsi = gsi_for_stmt (use_stmt);
 401       gimple_assign_set_rhs_code (use_stmt, MULT_EXPR);
 402       SET_USE (use_p, occ->recip_def);
 403       fold_stmt_inplace (&gsi);
 404       update_stmt (use_stmt);
 405     }
 406 }
 407
 408
 409 /* Free OCC and return one more "struct occurrence" to be freed.  */
 410
 411 static struct occurrence *
 412 free_bb (struct occurrence *occ)
 413 {
 414   struct occurrence *child, *next;
 415
 416   /* First get the two pointers hanging off OCC.  */
 417   next = occ->next;
 418   child = occ->children;
 419   occ->bb->aux = NULL;
 420   pool_free (occ_pool, occ);
 421
 422   /* Now ensure that we don't recurse unless it is necessary.  */
 423   if (!child)
 424     return next;
 425   else
 426     {
 427       while (next)
 428         next = free_bb (next);
 429
 430       return child;
 431     }
 432 }
 433
 434
 435 /* Look for floating-point divisions among DEF's uses, and try to
 436    replace them by multiplications with the reciprocal.  Add
 437    as many statements computing the reciprocal as needed.
 438
 439    DEF must be a GIMPLE register of a floating-point type.  */
 440
 441 static void
 442 execute_cse_reciprocals_1 (gimple_stmt_iterator *def_gsi, tree def)
 443 {
 444   use_operand_p use_p;
 445   imm_use_iterator use_iter;
 446   struct occurrence *occ;
 447   int count = 0, threshold;
 448
 449   gcc_assert (FLOAT_TYPE_P (TREE_TYPE (def)) && is_gimple_reg (def));
 450
 451   FOR_EACH_IMM_USE_FAST (use_p, use_iter, def)
 452     {
 453       gimple use_stmt = USE_STMT (use_p);
 454       if (is_division_by (use_stmt, def))
 455         {
 456           register_division_in (gimple_bb (use_stmt));
 457           count++;
 458         }
 459     }
 460
 461   /* Do the expensive part only if we can hope to optimize something.  */
 462   threshold = targetm.min_divisions_for_recip_mul (TYPE_MODE (TREE_TYPE (def)));
 463   if (count >= threshold)
 464     {
 465       gimple use_stmt;
 466       for (occ = occ_head; occ; occ = occ->next)
 467         {
 468           compute_merit (occ);
 469           insert_reciprocals (def_gsi, occ, def, NULL, threshold);
 470         }
 471
 472       FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, def)
 473         {
 474           if (is_division_by (use_stmt, def))
 475             {
 476               FOR_EACH_IMM_USE_ON_STMT (use_p, use_iter)
 477                 replace_reciprocal (use_p);
 478             }
 479         }
 480     }
 481
 482   for (occ = occ_head; occ; )
 483     occ = free_bb (occ);
 484
 485   occ_head = NULL;
 486 }
 487
 488 static bool
 489 gate_cse_reciprocals (void)
 490 {
 491   return optimize && flag_reciprocal_math;
 492 }
 493
 494 /* Go through all the floating-point SSA_NAMEs, and call
 495    execute_cse_reciprocals_1 on each of them.  */
 496 static unsigned int
 497 execute_cse_reciprocals (void)
 498 {
 499   basic_block bb;
 500   tree arg;
 501
 502   occ_pool = create_alloc_pool ("dominators for recip",
 503                                 sizeof (struct occurrence),
 504                                 n_basic_blocks / 3 + 1);
 505
 506   memset (&reciprocal_stats, 0, sizeof (reciprocal_stats));
 507   calculate_dominance_info (CDI_DOMINATORS);
 508   calculate_dominance_info (CDI_POST_DOMINATORS);
 509
 510 #ifdef ENABLE_CHECKING
 511   FOR_EACH_BB (bb)
 512     gcc_assert (!bb->aux);
 513 #endif
 514
 515   for (arg = DECL_ARGUMENTS (cfun->decl); arg; arg = DECL_CHAIN (arg))
 516     if (FLOAT_TYPE_P (TREE_TYPE (arg))
 517         && is_gimple_reg (arg))
 518       {
 519         tree name = ssa_default_def (cfun, arg);
 520         if (name)
 521           execute_cse_reciprocals_1 (NULL, name);
 522       }
 523
 524   FOR_EACH_BB (bb)
 525     {
 526       gimple_stmt_iterator gsi;
 527       gimple phi;
 528       tree def;
 529
 530       for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
 531         {
 532           phi = gsi_stmt (gsi);
 533           def = PHI_RESULT (phi);
 534           if (! virtual_operand_p (def)
 535               && FLOAT_TYPE_P (TREE_TYPE (def)))
 536             execute_cse_reciprocals_1 (NULL, def);
 537         }
 538
 539       for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi); gsi_next (&gsi))
 540         {
 541           gimple stmt = gsi_stmt (gsi);
 542
 543           if (gimple_has_lhs (stmt)
 544               && (def = SINGLE_SSA_TREE_OPERAND (stmt, SSA_OP_DEF)) != NULL
 545               && FLOAT_TYPE_P (TREE_TYPE (def))
 546               && TREE_CODE (def) == SSA_NAME)
 547             execute_cse_reciprocals_1 (&gsi, def);
 548         }
 549
 550       if (optimize_bb_for_size_p (bb))
 551         continue;
 552
 553       /* Scan for a/func(b) and convert it to reciprocal a*rfunc(b).  */
 554       for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi); gsi_next (&gsi))
 555         {
 556           gimple stmt = gsi_stmt (gsi);
 557           tree fndecl;
 558
 559           if (is_gimple_assign (stmt)
 560               && gimple_assign_rhs_code (stmt) == RDIV_EXPR)
 561             {
 562               tree arg1 = gimple_assign_rhs2 (stmt);
 563               gimple stmt1;
 564
 565               if (TREE_CODE (arg1) != SSA_NAME)
 566                 continue;
 567
 568               stmt1 = SSA_NAME_DEF_STMT (arg1);
 569
 570               if (is_gimple_call (stmt1)
 571                   && gimple_call_lhs (stmt1)
 572                   && (fndecl = gimple_call_fndecl (stmt1))
 573                   && (DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_NORMAL
 574                       || DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD))
 575                 {
 576                   enum built_in_function code;
 577                   bool md_code, fail;
 578                   imm_use_iterator ui;
 579                   use_operand_p use_p;
 580
 581                   code = DECL_FUNCTION_CODE (fndecl);
 582                   md_code = DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD;
 583
 584                   fndecl = targetm.builtin_reciprocal (code, md_code, false);
 585                   if (!fndecl)
 586                     continue;
 587
 588                   /* Check that all uses of the SSA name are divisions,
 589                      otherwise replacing the defining statement will do
 590                      the wrong thing.  */
 591                   fail = false;
 592                   FOR_EACH_IMM_USE_FAST (use_p, ui, arg1)
 593                     {
 594                       gimple stmt2 = USE_STMT (use_p);
 595                       if (is_gimple_debug (stmt2))
 596                         continue;
 597                       if (!is_gimple_assign (stmt2)
 598                           || gimple_assign_rhs_code (stmt2) != RDIV_EXPR
 599                           || gimple_assign_rhs1 (stmt2) == arg1
 600                           || gimple_assign_rhs2 (stmt2) != arg1)
 601                         {
 602                           fail = true;
 603                           break;
 604                         }
 605                     }
 606                   if (fail)
 607                     continue;
 608
 609                   gimple_replace_lhs (stmt1, arg1);
 610                   gimple_call_set_fndecl (stmt1, fndecl);
 611                   update_stmt (stmt1);
 612                   reciprocal_stats.rfuncs_inserted++;
 613
 614                   FOR_EACH_IMM_USE_STMT (stmt, ui, arg1)
 615                     {
 616                       gimple_stmt_iterator gsi = gsi_for_stmt (stmt);
 617                       gimple_assign_set_rhs_code (stmt, MULT_EXPR);
 618                       fold_stmt_inplace (&gsi);
 619                       update_stmt (stmt);
 620                     }
 621                 }
 622             }
 623         }
 624     }
 625
 626   statistics_counter_event (cfun, "reciprocal divs inserted",
 627                             reciprocal_stats.rdivs_inserted);
 628   statistics_counter_event (cfun, "reciprocal functions inserted",
 629                             reciprocal_stats.rfuncs_inserted);
 630
 631   free_dominance_info (CDI_DOMINATORS);
 632   free_dominance_info (CDI_POST_DOMINATORS);
 633   free_alloc_pool (occ_pool);
 634   return 0;
 635 }
 636
 637 struct gimple_opt_pass pass_cse_reciprocals =
 638 {
 639  {
 640   GIMPLE_PASS,
 641   "recip",                              /* name */
 642   gate_cse_reciprocals,                 /* gate */
 643   execute_cse_reciprocals,              /* execute */
 644   NULL,                                 /* sub */
 645   NULL,                                 /* next */
 646   0,                                    /* static_pass_number */
 647   TV_NONE,                              /* tv_id */
 648   PROP_ssa,                             /* properties_required */
 649   0,                                    /* properties_provided */
 650   0,                                    /* properties_destroyed */
 651   0,                                    /* todo_flags_start */
 652   TODO_update_ssa | TODO_verify_ssa
 653     | TODO_verify_stmts                /* todo_flags_finish */
 654  }
 655 };
 656
 657 /* Records an occurrence at statement USE_STMT in the vector of trees
 658    STMTS if it is dominated by *TOP_BB or dominates it or this basic block
 659    is not yet initialized.  Returns true if the occurrence was pushed on
 660    the vector.  Adjusts *TOP_BB to be the basic block dominating all
 661    statements in the vector.  */
 662
 663 static bool
 664 maybe_record_sincos (VEC(gimple, heap) **stmts,
 665                      basic_block *top_bb, gimple use_stmt)
 666 {
 667   basic_block use_bb = gimple_bb (use_stmt);
 668   if (*top_bb
 669       && (*top_bb == use_bb
 670           || dominated_by_p (CDI_DOMINATORS, use_bb, *top_bb)))
 671     VEC_safe_push (gimple, heap, *stmts, use_stmt);
 672   else if (!*top_bb
 673            || dominated_by_p (CDI_DOMINATORS, *top_bb, use_bb))
 674     {
 675       VEC_safe_push (gimple, heap, *stmts, use_stmt);
 676       *top_bb = use_bb;
 677     }
 678   else
 679     return false;
 680
 681   return true;
 682 }
 683
 684 /* Look for sin, cos and cexpi calls with the same argument NAME and
 685    create a single call to cexpi CSEing the result in this case.
 686    We first walk over all immediate uses of the argument collecting
 687    statements that we can CSE in a vector and in a second pass replace
 688    the statement rhs with a REALPART or IMAGPART expression on the
 689    result of the cexpi call we insert before the use statement that
 690    dominates all other candidates.  */
 691
 692 static bool
 693 execute_cse_sincos_1 (tree name)
 694 {
 695   gimple_stmt_iterator gsi;
 696   imm_use_iterator use_iter;
 697   tree fndecl, res, type;
 698   gimple def_stmt, use_stmt, stmt;
 699   int seen_cos = 0, seen_sin = 0, seen_cexpi = 0;
 700   VEC(gimple, heap) *stmts = NULL;
 701   basic_block top_bb = NULL;
 702   int i;
 703   bool cfg_changed = false;
 704
 705   type = TREE_TYPE (name);
 706   FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, name)
 707     {
 708       if (gimple_code (use_stmt) != GIMPLE_CALL
 709           || !gimple_call_lhs (use_stmt)
 710           || !(fndecl = gimple_call_fndecl (use_stmt))
 711           || DECL_BUILT_IN_CLASS (fndecl) != BUILT_IN_NORMAL)
 712         continue;
 713
 714       switch (DECL_FUNCTION_CODE (fndecl))
 715         {
 716         CASE_FLT_FN (BUILT_IN_COS):
 717           seen_cos |= maybe_record_sincos (&stmts, &top_bb, use_stmt) ? 1 : 0;
 718           break;
 719
 720         CASE_FLT_FN (BUILT_IN_SIN):
 721           seen_sin |= maybe_record_sincos (&stmts, &top_bb, use_stmt) ? 1 : 0;
 722           break;
 723
 724         CASE_FLT_FN (BUILT_IN_CEXPI):
 725           seen_cexpi |= maybe_record_sincos (&stmts, &top_bb, use_stmt) ? 1 : 0;
 726           break;
 727
 728         default:;
 729         }
 730     }
 731
 732   if (seen_cos + seen_sin + seen_cexpi <= 1)
 733     {
 734       VEC_free(gimple, heap, stmts);
 735       return false;
 736     }
 737
 738   /* Simply insert cexpi at the beginning of top_bb but not earlier than
 739      the name def statement.  */
 740   fndecl = mathfn_built_in (type, BUILT_IN_CEXPI);
 741   if (!fndecl)
 742     return false;
 743   stmt = gimple_build_call (fndecl, 1, name);
 744   res = make_temp_ssa_name (TREE_TYPE (TREE_TYPE (fndecl)), stmt, "sincostmp");
 745   gimple_call_set_lhs (stmt, res);
 746
 747   def_stmt = SSA_NAME_DEF_STMT (name);
 748   if (!SSA_NAME_IS_DEFAULT_DEF (name)
 749       && gimple_code (def_stmt) != GIMPLE_PHI
 750       && gimple_bb (def_stmt) == top_bb)
 751     {
 752       gsi = gsi_for_stmt (def_stmt);
 753       gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
 754     }
 755   else
 756     {
 757       gsi = gsi_after_labels (top_bb);
 758       gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
 759     }
 760   sincos_stats.inserted++;
 761
 762   /* And adjust the recorded old call sites.  */
 763   for (i = 0; VEC_iterate(gimple, stmts, i, use_stmt); ++i)
 764     {
 765       tree rhs = NULL;
 766       fndecl = gimple_call_fndecl (use_stmt);
 767
 768       switch (DECL_FUNCTION_CODE (fndecl))
 769         {
 770         CASE_FLT_FN (BUILT_IN_COS):
 771           rhs = fold_build1 (REALPART_EXPR, type, res);
 772           break;
 773
 774         CASE_FLT_FN (BUILT_IN_SIN):
 775           rhs = fold_build1 (IMAGPART_EXPR, type, res);
 776           break;
 777
 778         CASE_FLT_FN (BUILT_IN_CEXPI):
 779           rhs = res;
 780           break;
 781
 782         default:;
 783           gcc_unreachable ();
 784         }
 785
 786         /* Replace call with a copy.  */
 787         stmt = gimple_build_assign (gimple_call_lhs (use_stmt), rhs);
 788
 789         gsi = gsi_for_stmt (use_stmt);
 790         gsi_replace (&gsi, stmt, true);
 791         if (gimple_purge_dead_eh_edges (gimple_bb (stmt)))
 792           cfg_changed = true;
 793     }
 794
 795   VEC_free(gimple, heap, stmts);
 796
 797   return cfg_changed;
 798 }
 799
 800 /* To evaluate powi(x,n), the floating point value x raised to the
 801    constant integer exponent n, we use a hybrid algorithm that
 802    combines the "window method" with look-up tables.  For an
 803    introduction to exponentiation algorithms and "addition chains",
 804    see section 4.6.3, "Evaluation of Powers" of Donald E. Knuth,
 805    "Seminumerical Algorithms", Vol. 2, "The Art of Computer Programming",
 806    3rd Edition, 1998, and Daniel M. Gordon, "A Survey of Fast Exponentiation
 807    Methods", Journal of Algorithms, Vol. 27, pp. 129-146, 1998.  */
 808
 809 /* Provide a default value for POWI_MAX_MULTS, the maximum number of
 810    multiplications to inline before calling the system library's pow
 811    function.  powi(x,n) requires at worst 2*bits(n)-2 multiplications,
 812    so this default never requires calling pow, powf or powl.  */
 813
 814 #ifndef POWI_MAX_MULTS
 815 #define POWI_MAX_MULTS  (2*HOST_BITS_PER_WIDE_INT-2)
 816 #endif
 817
 818 /* The size of the "optimal power tree" lookup table.  All
 819    exponents less than this value are simply looked up in the
 820    powi_table below.  This threshold is also used to size the
 821    cache of pseudo registers that hold intermediate results.  */
 822 #define POWI_TABLE_SIZE 256
 823
 824 /* The size, in bits of the window, used in the "window method"
 825    exponentiation algorithm.  This is equivalent to a radix of
 826    (1<<POWI_WINDOW_SIZE) in the corresponding "m-ary method".  */
 827 #define POWI_WINDOW_SIZE 3
 828
 829 /* The following table is an efficient representation of an
 830    "optimal power tree".  For each value, i, the corresponding
 831    value, j, in the table states that an optimal evaluation
 832    sequence for calculating pow(x,i) can be found by evaluating
 833    pow(x,j)*pow(x,i-j).  An optimal power tree for the first
 834    100 integers is given in Knuth's "Seminumerical algorithms".  */
 835
 836 static const unsigned char powi_table[POWI_TABLE_SIZE] =
 837   {
 838       0,   1,   1,   2,   2,   3,   3,   4,  /*   0 -   7 */
 839       4,   6,   5,   6,   6,  10,   7,   9,  /*   8 -  15 */
 840       8,  16,   9,  16,  10,  12,  11,  13,  /*  16 -  23 */
 841      12,  17,  13,  18,  14,  24,  15,  26,  /*  24 -  31 */
 842      16,  17,  17,  19,  18,  33,  19,  26,  /*  32 -  39 */
 843      20,  25,  21,  40,  22,  27,  23,  44,  /*  40 -  47 */
 844      24,  32,  25,  34,  26,  29,  27,  44,  /*  48 -  55 */
 845      28,  31,  29,  34,  30,  60,  31,  36,  /*  56 -  63 */
 846      32,  64,  33,  34,  34,  46,  35,  37,  /*  64 -  71 */
 847      36,  65,  37,  50,  38,  48,  39,  69,  /*  72 -  79 */
 848      40,  49,  41,  43,  42,  51,  43,  58,  /*  80 -  87 */
 849      44,  64,  45,  47,  46,  59,  47,  76,  /*  88 -  95 */
 850      48,  65,  49,  66,  50,  67,  51,  66,  /*  96 - 103 */
 851      52,  70,  53,  74,  54, 104,  55,  74,  /* 104 - 111 */
 852      56,  64,  57,  69,  58,  78,  59,  68,  /* 112 - 119 */
 853      60,  61,  61,  80,  62,  75,  63,  68,  /* 120 - 127 */
 854      64,  65,  65, 128,  66, 129,  67,  90,  /* 128 - 135 */
 855      68,  73,  69, 131,  70,  94,  71,  88,  /* 136 - 143 */
 856      72, 128,  73,  98,  74, 132,  75, 121,  /* 144 - 151 */
 857      76, 102,  77, 124,  78, 132,  79, 106,  /* 152 - 159 */
 858      80,  97,  81, 160,  82,  99,  83, 134,  /* 160 - 167 */
 859      84,  86,  85,  95,  86, 160,  87, 100,  /* 168 - 175 */
 860      88, 113,  89,  98,  90, 107,  91, 122,  /* 176 - 183 */
 861      92, 111,  93, 102,  94, 126,  95, 150,  /* 184 - 191 */
 862      96, 128,  97, 130,  98, 133,  99, 195,  /* 192 - 199 */
 863     100, 128, 101, 123, 102, 164, 103, 138,  /* 200 - 207 */
 864     104, 145, 105, 146, 106, 109, 107, 149,  /* 208 - 215 */
 865     108, 200, 109, 146, 110, 170, 111, 157,  /* 216 - 223 */
 866     112, 128, 113, 130, 114, 182, 115, 132,  /* 224 - 231 */
 867     116, 200, 117, 132, 118, 158, 119, 206,  /* 232 - 239 */
 868     120, 240, 121, 162, 122, 147, 123, 152,  /* 240 - 247 */
 869     124, 166, 125, 214, 126, 138, 127, 153,  /* 248 - 255 */
 870   };
 871
 872
 873 /* Return the number of multiplications required to calculate
 874    powi(x,n) where n is less than POWI_TABLE_SIZE.  This is a
 875    subroutine of powi_cost.  CACHE is an array indicating
 876    which exponents have already been calculated.  */
 877
 878 static int
 879 powi_lookup_cost (unsigned HOST_WIDE_INT n, bool *cache)
 880 {
 881   /* If we've already calculated this exponent, then this evaluation
 882      doesn't require any additional multiplications.  */
 883   if (cache[n])
 884     return 0;
 885
 886   cache[n] = true;
 887   return powi_lookup_cost (n - powi_table[n], cache)
 888          + powi_lookup_cost (powi_table[n], cache) + 1;
 889 }
 890
 891 /* Return the number of multiplications required to calculate
 892    powi(x,n) for an arbitrary x, given the exponent N.  This
 893    function needs to be kept in sync with powi_as_mults below.  */
 894
 895 static int
 896 powi_cost (HOST_WIDE_INT n)
 897 {
 898   bool cache[POWI_TABLE_SIZE];
 899   unsigned HOST_WIDE_INT digit;
 900   unsigned HOST_WIDE_INT val;
 901   int result;
 902
 903   if (n == 0)
 904     return 0;
 905
 906   /* Ignore the reciprocal when calculating the cost.  */
 907   val = (n < 0) ? -n : n;
 908
 909   /* Initialize the exponent cache.  */
 910   memset (cache, 0, POWI_TABLE_SIZE * sizeof (bool));
 911   cache[1] = true;
 912
 913   result = 0;
 914
 915   while (val >= POWI_TABLE_SIZE)
 916     {
 917       if (val & 1)
 918         {
 919           digit = val & ((1 << POWI_WINDOW_SIZE) - 1);
 920           result += powi_lookup_cost (digit, cache)
 921                     + POWI_WINDOW_SIZE + 1;
 922           val >>= POWI_WINDOW_SIZE;
 923         }
 924       else
 925         {
 926           val >>= 1;
 927           result++;
 928         }
 929     }
 930
 931   return result + powi_lookup_cost (val, cache);
 932 }
 933
 934 /* Recursive subroutine of powi_as_mults.  This function takes the
 935    array, CACHE, of already calculated exponents and an exponent N and
 936    returns a tree that corresponds to CACHE[1]**N, with type TYPE.  */
 937
 938 static tree
 939 powi_as_mults_1 (gimple_stmt_iterator *gsi, location_t loc, tree type,
 940                  HOST_WIDE_INT n, tree *cache)
 941 {
 942   tree op0, op1, ssa_target;
 943   unsigned HOST_WIDE_INT digit;
 944   gimple mult_stmt;
 945
 946   if (n < POWI_TABLE_SIZE && cache[n])
 947     return cache[n];
 948
 949   ssa_target = make_temp_ssa_name (type, NULL, "powmult");
 950
 951   if (n < POWI_TABLE_SIZE)
 952     {
 953       cache[n] = ssa_target;
 954       op0 = powi_as_mults_1 (gsi, loc, type, n - powi_table[n], cache);
 955       op1 = powi_as_mults_1 (gsi, loc, type, powi_table[n], cache);
 956     }
 957   else if (n & 1)
 958     {
 959       digit = n & ((1 << POWI_WINDOW_SIZE) - 1);
 960       op0 = powi_as_mults_1 (gsi, loc, type, n - digit, cache);
 961       op1 = powi_as_mults_1 (gsi, loc, type, digit, cache);
 962     }
 963   else
 964     {
 965       op0 = powi_as_mults_1 (gsi, loc, type, n >> 1, cache);
 966       op1 = op0;
 967     }
 968
 969   mult_stmt = gimple_build_assign_with_ops (MULT_EXPR, ssa_target, op0, op1);
 970   gimple_set_location (mult_stmt, loc);
 971   gsi_insert_before (gsi, mult_stmt, GSI_SAME_STMT);
 972
 973   return ssa_target;
 974 }
 975
 976 /* Convert ARG0**N to a tree of multiplications of ARG0 with itself.
 977    This function needs to be kept in sync with powi_cost above.  */
 978
 979 static tree
 980 powi_as_mults (gimple_stmt_iterator *gsi, location_t loc,
 981                tree arg0, HOST_WIDE_INT n)
 982 {
 983   tree cache[POWI_TABLE_SIZE], result, type = TREE_TYPE (arg0);
 984   gimple div_stmt;
 985   tree target;
 986
 987   if (n == 0)
 988     return build_real (type, dconst1);
 989
 990   memset (cache, 0,  sizeof (cache));
 991   cache[1] = arg0;
 992
 993   result = powi_as_mults_1 (gsi, loc, type, (n < 0) ? -n : n, cache);
 994   if (n >= 0)
 995     return result;
 996
 997   /* If the original exponent was negative, reciprocate the result.  */
 998   target = make_temp_ssa_name (type, NULL, "powmult");
 999   div_stmt = gimple_build_assign_with_ops (RDIV_EXPR, target,
1000                                            build_real (type, dconst1),
1001                                            result);
1002   gimple_set_location (div_stmt, loc);
1003   gsi_insert_before (gsi, div_stmt, GSI_SAME_STMT);
1004
1005   return target;
1006 }
1007
1008 /* ARG0 and N are the two arguments to a powi builtin in GSI with
1009    location info LOC.  If the arguments are appropriate, create an
1010    equivalent sequence of statements prior to GSI using an optimal
1011    number of multiplications, and return an expession holding the
1012    result.  */
1013
1014 static tree
1015 gimple_expand_builtin_powi (gimple_stmt_iterator *gsi, location_t loc,
1016                             tree arg0, HOST_WIDE_INT n)
1017 {
1018   /* Avoid largest negative number.  */
1019   if (n != -n
1020       && ((n >= -1 && n <= 2)
1021           || (optimize_function_for_speed_p (cfun)
1022               && powi_cost (n) <= POWI_MAX_MULTS)))
1023     return powi_as_mults (gsi, loc, arg0, n);
1024
1025   return NULL_TREE;
1026 }
1027
1028 /* Build a gimple call statement that calls FN with argument ARG.
1029    Set the lhs of the call statement to a fresh SSA name.  Insert the
1030    statement prior to GSI's current position, and return the fresh
1031    SSA name.  */
1032
1033 static tree
1034 build_and_insert_call (gimple_stmt_iterator *gsi, location_t loc,
1035                        tree fn, tree arg)
1036 {
1037   gimple call_stmt;
1038   tree ssa_target;
1039
1040   call_stmt = gimple_build_call (fn, 1, arg);
1041   ssa_target = make_temp_ssa_name (TREE_TYPE (arg), NULL, "powroot");
1042   gimple_set_lhs (call_stmt, ssa_target);
1043   gimple_set_location (call_stmt, loc);
1044   gsi_insert_before (gsi, call_stmt, GSI_SAME_STMT);
1045
1046   return ssa_target;
1047 }
1048
1049 /* Build a gimple binary operation with the given CODE and arguments
1050    ARG0, ARG1, assigning the result to a new SSA name for variable
1051    TARGET.  Insert the statement prior to GSI's current position, and
1052    return the fresh SSA name.*/
1053
1054 static tree
1055 build_and_insert_binop (gimple_stmt_iterator *gsi, location_t loc,
1056                         const char *name, enum tree_code code,
1057                         tree arg0, tree arg1)
1058 {
1059   tree result = make_temp_ssa_name (TREE_TYPE (arg0), NULL, name);
1060   gimple stmt = gimple_build_assign_with_ops (code, result, arg0, arg1);
1061   gimple_set_location (stmt, loc);
1062   gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
1063   return result;
1064 }
1065
1066 /* Build a gimple reference operation with the given CODE and argument
1067    ARG, assigning the result to a new SSA name of TYPE with NAME.
1068    Insert the statement prior to GSI's current position, and return
1069    the fresh SSA name.  */
1070
1071 static inline tree
1072 build_and_insert_ref (gimple_stmt_iterator *gsi, location_t loc, tree type,
1073                       const char *name, enum tree_code code, tree arg0)
1074 {
1075   tree result = make_temp_ssa_name (type, NULL, name);
1076   gimple stmt = gimple_build_assign (result, build1 (code, type, arg0));
1077   gimple_set_location (stmt, loc);
1078   gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
1079   return result;
1080 }
1081
1082 /* Build a gimple assignment to cast VAL to TYPE.  Insert the statement
1083    prior to GSI's current position, and return the fresh SSA name.  */
1084
1085 static tree
1086 build_and_insert_cast (gimple_stmt_iterator *gsi, location_t loc,
1087                        tree type, tree val)
1088 {
1089   tree result = make_ssa_name (type, NULL);
1090   gimple stmt = gimple_build_assign_with_ops (NOP_EXPR, result, val, NULL_TREE);
1091   gimple_set_location (stmt, loc);
1092   gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
1093   return result;
1094 }
1095
1096 /* ARG0 and ARG1 are the two arguments to a pow builtin call in GSI
1097    with location info LOC.  If possible, create an equivalent and
1098    less expensive sequence of statements prior to GSI, and return an
1099    expession holding the result.  */
1100
1101 static tree
1102 gimple_expand_builtin_pow (gimple_stmt_iterator *gsi, location_t loc,
1103                            tree arg0, tree arg1)
1104 {
1105   REAL_VALUE_TYPE c, cint, dconst1_4, dconst3_4, dconst1_3, dconst1_6;
1106   REAL_VALUE_TYPE c2, dconst3;
1107   HOST_WIDE_INT n;
1108   tree type, sqrtfn, cbrtfn, sqrt_arg0, sqrt_sqrt, result, cbrt_x, powi_cbrt_x;
1109   enum machine_mode mode;
1110   bool hw_sqrt_exists;
1111
1112   /* If the exponent isn't a constant, there's nothing of interest
1113      to be done.  */
1114   if (TREE_CODE (arg1) != REAL_CST)
1115     return NULL_TREE;
1116
1117   /* If the exponent is equivalent to an integer, expand to an optimal
1118      multiplication sequence when profitable.  */
1119   c = TREE_REAL_CST (arg1);
1120   n = real_to_integer (&c);
1121   real_from_integer (&cint, VOIDmode, n, n < 0 ? -1 : 0, 0);
1122
1123   if (real_identical (&c, &cint)
1124       && ((n >= -1 && n <= 2)
1125           || (flag_unsafe_math_optimizations
1126               && optimize_insn_for_speed_p ()
1127               && powi_cost (n) <= POWI_MAX_MULTS)))
1128     return gimple_expand_builtin_powi (gsi, loc, arg0, n);
1129
1130   /* Attempt various optimizations using sqrt and cbrt.  */
1131   type = TREE_TYPE (arg0);
1132   mode = TYPE_MODE (type);
1133   sqrtfn = mathfn_built_in (type, BUILT_IN_SQRT);
1134
1135   /* Optimize pow(x,0.5) = sqrt(x).  This replacement is always safe
1136      unless signed zeros must be maintained.  pow(-0,0.5) = +0, while
1137      sqrt(-0) = -0.  */
1138   if (sqrtfn
1139       && REAL_VALUES_EQUAL (c, dconsthalf)
1140       && !HONOR_SIGNED_ZEROS (mode))
1141     return build_and_insert_call (gsi, loc, sqrtfn, arg0);
1142
1143   /* Optimize pow(x,0.25) = sqrt(sqrt(x)).  Assume on most machines that
1144      a builtin sqrt instruction is smaller than a call to pow with 0.25,
1145      so do this optimization even if -Os.  Don't do this optimization
1146      if we don't have a hardware sqrt insn.  */
1147   dconst1_4 = dconst1;
1148   SET_REAL_EXP (&dconst1_4, REAL_EXP (&dconst1_4) - 2);
1149   hw_sqrt_exists = optab_handler (sqrt_optab, mode) != CODE_FOR_nothing;
1150
1151   if (flag_unsafe_math_optimizations
1152       && sqrtfn
1153       && REAL_VALUES_EQUAL (c, dconst1_4)
1154       && hw_sqrt_exists)
1155     {
1156       /* sqrt(x)  */
1157       sqrt_arg0 = build_and_insert_call (gsi, loc, sqrtfn, arg0);
1158
1159       /* sqrt(sqrt(x))  */
1160       return build_and_insert_call (gsi, loc, sqrtfn, sqrt_arg0);
1161     }
1162
1163   /* Optimize pow(x,0.75) = sqrt(x) * sqrt(sqrt(x)) unless we are
1164      optimizing for space.  Don't do this optimization if we don't have
1165      a hardware sqrt insn.  */
1166   real_from_integer (&dconst3_4, VOIDmode, 3, 0, 0);
1167   SET_REAL_EXP (&dconst3_4, REAL_EXP (&dconst3_4) - 2);
1168
1169   if (flag_unsafe_math_optimizations
1170       && sqrtfn
1171       && optimize_function_for_speed_p (cfun)
1172       && REAL_VALUES_EQUAL (c, dconst3_4)
1173       && hw_sqrt_exists)
1174     {
1175       /* sqrt(x)  */
1176       sqrt_arg0 = build_and_insert_call (gsi, loc, sqrtfn, arg0);
1177
1178       /* sqrt(sqrt(x))  */
1179       sqrt_sqrt = build_and_insert_call (gsi, loc, sqrtfn, sqrt_arg0);
1180
1181       /* sqrt(x) * sqrt(sqrt(x))  */
1182       return build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
1183                                      sqrt_arg0, sqrt_sqrt);
1184     }
1185
1186   /* Optimize pow(x,1./3.) = cbrt(x).  This requires unsafe math
1187      optimizations since 1./3. is not exactly representable.  If x
1188      is negative and finite, the correct value of pow(x,1./3.) is
1189      a NaN with the "invalid" exception raised, because the value
1190      of 1./3. actually has an even denominator.  The correct value
1191      of cbrt(x) is a negative real value.  */
1192   cbrtfn = mathfn_built_in (type, BUILT_IN_CBRT);
1193   dconst1_3 = real_value_truncate (mode, dconst_third ());
1194
1195   if (flag_unsafe_math_optimizations
1196       && cbrtfn
1197       && (gimple_val_nonnegative_real_p (arg0) || !HONOR_NANS (mode))
1198       && REAL_VALUES_EQUAL (c, dconst1_3))
1199     return build_and_insert_call (gsi, loc, cbrtfn, arg0);
1200
1201   /* Optimize pow(x,1./6.) = cbrt(sqrt(x)).  Don't do this optimization
1202      if we don't have a hardware sqrt insn.  */
1203   dconst1_6 = dconst1_3;
1204   SET_REAL_EXP (&dconst1_6, REAL_EXP (&dconst1_6) - 1);
1205
1206   if (flag_unsafe_math_optimizations
1207       && sqrtfn
1208       && cbrtfn
1209       && (gimple_val_nonnegative_real_p (arg0) || !HONOR_NANS (mode))
1210       && optimize_function_for_speed_p (cfun)
1211       && hw_sqrt_exists
1212       && REAL_VALUES_EQUAL (c, dconst1_6))
1213     {
1214       /* sqrt(x)  */
1215       sqrt_arg0 = build_and_insert_call (gsi, loc, sqrtfn, arg0);
1216
1217       /* cbrt(sqrt(x))  */
1218       return build_and_insert_call (gsi, loc, cbrtfn, sqrt_arg0);
1219     }
1220
1221   /* Optimize pow(x,c), where n = 2c for some nonzero integer n, into
1222
1223        sqrt(x) * powi(x, n/2),                n > 0;
1224        1.0 / (sqrt(x) * powi(x, abs(n/2))),   n < 0.
1225
1226      Do not calculate the powi factor when n/2 = 0.  */
1227   real_arithmetic (&c2, MULT_EXPR, &c, &dconst2);
1228   n = real_to_integer (&c2);
1229   real_from_integer (&cint, VOIDmode, n, n < 0 ? -1 : 0, 0);
1230
1231   if (flag_unsafe_math_optimizations
1232       && sqrtfn
1233       && real_identical (&c2, &cint))
1234     {
1235       tree powi_x_ndiv2 = NULL_TREE;
1236
1237       /* Attempt to fold powi(arg0, abs(n/2)) into multiplies.  If not
1238          possible or profitable, give up.  Skip the degenerate case when
1239          n is 1 or -1, where the result is always 1.  */
1240       if (absu_hwi (n) != 1)
1241         {
1242           powi_x_ndiv2 = gimple_expand_builtin_powi (gsi, loc, arg0,
1243                                                      abs_hwi (n / 2));
1244           if (!powi_x_ndiv2)
1245             return NULL_TREE;
1246         }
1247
1248       /* Calculate sqrt(x).  When n is not 1 or -1, multiply it by the
1249          result of the optimal multiply sequence just calculated.  */
1250       sqrt_arg0 = build_and_insert_call (gsi, loc, sqrtfn, arg0);
1251
1252       if (absu_hwi (n) == 1)
1253         result = sqrt_arg0;
1254       else
1255         result = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
1256                                          sqrt_arg0, powi_x_ndiv2);
1257
1258       /* If n is negative, reciprocate the result.  */
1259       if (n < 0)
1260         result = build_and_insert_binop (gsi, loc, "powroot", RDIV_EXPR,
1261                                          build_real (type, dconst1), result);
1262       return result;
1263     }
1264
1265   /* Optimize pow(x,c), where 3c = n for some nonzero integer n, into
1266
1267      powi(x, n/3) * powi(cbrt(x), n%3),                    n > 0;
1268      1.0 / (powi(x, abs(n)/3) * powi(cbrt(x), abs(n)%3)),  n < 0.
1269
1270      Do not calculate the first factor when n/3 = 0.  As cbrt(x) is
1271      different from pow(x, 1./3.) due to rounding and behavior with
1272      negative x, we need to constrain this transformation to unsafe
1273      math and positive x or finite math.  */
1274   real_from_integer (&dconst3, VOIDmode, 3, 0, 0);
1275   real_arithmetic (&c2, MULT_EXPR, &c, &dconst3);
1276   real_round (&c2, mode, &c2);
1277   n = real_to_integer (&c2);
1278   real_from_integer (&cint, VOIDmode, n, n < 0 ? -1 : 0, 0);
1279   real_arithmetic (&c2, RDIV_EXPR, &cint, &dconst3);
1280   real_convert (&c2, mode, &c2);
1281
1282   if (flag_unsafe_math_optimizations
1283       && cbrtfn
1284       && (gimple_val_nonnegative_real_p (arg0) || !HONOR_NANS (mode))
1285       && real_identical (&c2, &c)
1286       && optimize_function_for_speed_p (cfun)
1287       && powi_cost (n / 3) <= POWI_MAX_MULTS)
1288     {
1289       tree powi_x_ndiv3 = NULL_TREE;
1290
1291       /* Attempt to fold powi(arg0, abs(n/3)) into multiplies.  If not
1292          possible or profitable, give up.  Skip the degenerate case when
1293          abs(n) < 3, where the result is always 1.  */
1294       if (absu_hwi (n) >= 3)
1295         {
1296           powi_x_ndiv3 = gimple_expand_builtin_powi (gsi, loc, arg0,
1297                                                      abs_hwi (n / 3));
1298           if (!powi_x_ndiv3)
1299             return NULL_TREE;
1300         }
1301
1302       /* Calculate powi(cbrt(x), n%3).  Don't use gimple_expand_builtin_powi
1303          as that creates an unnecessary variable.  Instead, just produce
1304          either cbrt(x) or cbrt(x) * cbrt(x).  */
1305       cbrt_x = build_and_insert_call (gsi, loc, cbrtfn, arg0);
1306
1307       if (absu_hwi (n) % 3 == 1)
1308         powi_cbrt_x = cbrt_x;
1309       else
1310         powi_cbrt_x = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
1311                                               cbrt_x, cbrt_x);
1312
1313       /* Multiply the two subexpressions, unless powi(x,abs(n)/3) = 1.  */
1314       if (absu_hwi (n) < 3)
1315         result = powi_cbrt_x;
1316       else
1317         result = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
1318                                          powi_x_ndiv3, powi_cbrt_x);
1319
1320       /* If n is negative, reciprocate the result.  */
1321       if (n < 0)
1322         result = build_and_insert_binop (gsi, loc, "powroot", RDIV_EXPR,
1323                                          build_real (type, dconst1), result);
1324
1325       return result;
1326     }
1327
1328   /* No optimizations succeeded.  */
1329   return NULL_TREE;
1330 }
1331
1332 /* ARG is the argument to a cabs builtin call in GSI with location info
1333    LOC.  Create a sequence of statements prior to GSI that calculates
1334    sqrt(R*R + I*I), where R and I are the real and imaginary components
1335    of ARG, respectively.  Return an expression holding the result.  */
1336
1337 static tree
1338 gimple_expand_builtin_cabs (gimple_stmt_iterator *gsi, location_t loc, tree arg)
1339 {
1340   tree real_part, imag_part, addend1, addend2, sum, result;
1341   tree type = TREE_TYPE (TREE_TYPE (arg));
1342   tree sqrtfn = mathfn_built_in (type, BUILT_IN_SQRT);
1343   enum machine_mode mode = TYPE_MODE (type);
1344
1345   if (!flag_unsafe_math_optimizations
1346       || !optimize_bb_for_speed_p (gimple_bb (gsi_stmt (*gsi)))
1347       || !sqrtfn
1348       || optab_handler (sqrt_optab, mode) == CODE_FOR_nothing)
1349     return NULL_TREE;
1350
1351   real_part = build_and_insert_ref (gsi, loc, type, "cabs",
1352                                     REALPART_EXPR, arg);
1353   addend1 = build_and_insert_binop (gsi, loc, "cabs", MULT_EXPR,
1354                                     real_part, real_part);
1355   imag_part = build_and_insert_ref (gsi, loc, type, "cabs",
1356                                     IMAGPART_EXPR, arg);
1357   addend2 = build_and_insert_binop (gsi, loc, "cabs", MULT_EXPR,
1358                                     imag_part, imag_part);
1359   sum = build_and_insert_binop (gsi, loc, "cabs", PLUS_EXPR, addend1, addend2);
1360   result = build_and_insert_call (gsi, loc, sqrtfn, sum);
1361
1362   return result;
1363 }
1364
1365 /* Go through all calls to sin, cos and cexpi and call execute_cse_sincos_1
1366    on the SSA_NAME argument of each of them.  Also expand powi(x,n) into
1367    an optimal number of multiplies, when n is a constant.  */
1368
1369 static unsigned int
1370 execute_cse_sincos (void)
1371 {
1372   basic_block bb;
1373   bool cfg_changed = false;
1374
1375   calculate_dominance_info (CDI_DOMINATORS);
1376   memset (&sincos_stats, 0, sizeof (sincos_stats));
1377
1378   FOR_EACH_BB (bb)
1379     {
1380       gimple_stmt_iterator gsi;
1381
1382       for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi); gsi_next (&gsi))
1383         {
1384           gimple stmt = gsi_stmt (gsi);
1385           tree fndecl;
1386
1387           if (is_gimple_call (stmt)
1388               && gimple_call_lhs (stmt)
1389               && (fndecl = gimple_call_fndecl (stmt))
1390               && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_NORMAL)
1391             {
1392               tree arg, arg0, arg1, result;
1393               HOST_WIDE_INT n;
1394               location_t loc;
1395
1396               switch (DECL_FUNCTION_CODE (fndecl))
1397                 {
1398                 CASE_FLT_FN (BUILT_IN_COS):
1399                 CASE_FLT_FN (BUILT_IN_SIN):
1400                 CASE_FLT_FN (BUILT_IN_CEXPI):
1401                   /* Make sure we have either sincos or cexp.  */
1402                   if (!TARGET_HAS_SINCOS && !TARGET_C99_FUNCTIONS)
1403                     break;
1404
1405                   arg = gimple_call_arg (stmt, 0);
1406                   if (TREE_CODE (arg) == SSA_NAME)
1407                     cfg_changed |= execute_cse_sincos_1 (arg);
1408                   break;
1409
1410                 CASE_FLT_FN (BUILT_IN_POW):
1411                   arg0 = gimple_call_arg (stmt, 0);
1412                   arg1 = gimple_call_arg (stmt, 1);
1413
1414                   loc = gimple_location (stmt);
1415                   result = gimple_expand_builtin_pow (&gsi, loc, arg0, arg1);
1416
1417                   if (result)
1418                     {
1419                       tree lhs = gimple_get_lhs (stmt);
1420                       gimple new_stmt = gimple_build_assign (lhs, result);
1421                       gimple_set_location (new_stmt, loc);
1422                       unlink_stmt_vdef (stmt);
1423                       gsi_replace (&gsi, new_stmt, true);
1424                       if (gimple_vdef (stmt))
1425                         release_ssa_name (gimple_vdef (stmt));
1426                     }
1427                   break;
1428
1429                 CASE_FLT_FN (BUILT_IN_POWI):
1430                   arg0 = gimple_call_arg (stmt, 0);
1431                   arg1 = gimple_call_arg (stmt, 1);
1432                   if (!host_integerp (arg1, 0))
1433                     break;
1434
1435                   n = TREE_INT_CST_LOW (arg1);
1436                   loc = gimple_location (stmt);
1437                   result = gimple_expand_builtin_powi (&gsi, loc, arg0, n);
1438
1439                   if (result)
1440                     {
1441                       tree lhs = gimple_get_lhs (stmt);
1442                       gimple new_stmt = gimple_build_assign (lhs, result);
1443                       gimple_set_location (new_stmt, loc);
1444                       unlink_stmt_vdef (stmt);
1445                       gsi_replace (&gsi, new_stmt, true);
1446                       if (gimple_vdef (stmt))
1447                         release_ssa_name (gimple_vdef (stmt));
1448                     }
1449                   break;
1450
1451                 CASE_FLT_FN (BUILT_IN_CABS):
1452                   arg0 = gimple_call_arg (stmt, 0);
1453                   loc = gimple_location (stmt);
1454                   result = gimple_expand_builtin_cabs (&gsi, loc, arg0);
1455
1456                   if (result)
1457                     {
1458                       tree lhs = gimple_get_lhs (stmt);
1459                       gimple new_stmt = gimple_build_assign (lhs, result);
1460                       gimple_set_location (new_stmt, loc);
1461                       unlink_stmt_vdef (stmt);
1462                       gsi_replace (&gsi, new_stmt, true);
1463                       if (gimple_vdef (stmt))
1464                         release_ssa_name (gimple_vdef (stmt));
1465                     }
1466                   break;
1467
1468                 default:;
1469                 }
1470             }
1471         }
1472     }
1473
1474   statistics_counter_event (cfun, "sincos statements inserted",
1475                             sincos_stats.inserted);
1476
1477   free_dominance_info (CDI_DOMINATORS);
1478   return cfg_changed ? TODO_cleanup_cfg : 0;
1479 }
1480
1481 static bool
1482 gate_cse_sincos (void)
1483 {
1484   /* We no longer require either sincos or cexp, since powi expansion
1485      piggybacks on this pass.  */
1486   return optimize;
1487 }
1488
/* Descriptor of the "sincos" GIMPLE pass: gated by gate_cse_sincos
   and run by execute_cse_sincos.  It requires SSA form, and finishes
   with an SSA update followed by SSA and statement verification.  */
1489 struct gimple_opt_pass pass_cse_sincos =
1490 {
1491  {
1492   GIMPLE_PASS,
1493   "sincos",                             /* name */
1494   gate_cse_sincos,                      /* gate */
1495   execute_cse_sincos,                   /* execute */
1496   NULL,                                 /* sub */
1497   NULL,                                 /* next */
1498   0,                                    /* static_pass_number */
1499   TV_NONE,                              /* tv_id */
1500   PROP_ssa,                             /* properties_required */
1501   0,                                    /* properties_provided */
1502   0,                                    /* properties_destroyed */
1503   0,                                    /* todo_flags_start */
1504   TODO_update_ssa | TODO_verify_ssa
1505     | TODO_verify_stmts                 /* todo_flags_finish */
1506  }
1507 };
1508
1509 /* A symbolic number is used to detect byte permutation and selection
1510    patterns.  Therefore the field N contains an artificial number
1511    consisting of byte size markers:
1512
1513    0    - byte has the value 0
1514    1..size - byte contains the content of the byte
1515    number indexed with that value minus one  */
1516
1517 struct symbolic_number {
/* One marker per byte of the value, as described above.  */
1518   unsigned HOST_WIDEST_INT n;
/* Number of bytes of N that are in use.  */
1519   int size;
1520 };
1521
1522 /* Perform a SHIFT or ROTATE operation by COUNT bits on symbolic
1523    number N.  Return false if the requested operation is not permitted
1524    on a symbolic number.  */
1525
1526 static inline bool
1527 do_shift_rotate (enum tree_code code,
1528                  struct symbolic_number *n,
1529                  int count)
1530 {
1531   if (count % 8 != 0)
1532     return false;
1533
1534   /* Zero out the extra bits of N in order to avoid them being shifted
1535      into the significant bits.  */
1536   if (n->size < (int)sizeof (HOST_WIDEST_INT))
1537     n->n &= ((unsigned HOST_WIDEST_INT)1 << (n->size * BITS_PER_UNIT)) - 1;
1538
1539   switch (code)
1540     {
1541     case LSHIFT_EXPR:
1542       n->n <<= count;
1543       break;
1544     case RSHIFT_EXPR:
1545       n->n >>= count;
1546       break;
1547     case LROTATE_EXPR:
1548       n->n = (n->n << count) | (n->n >> ((n->size * BITS_PER_UNIT) - count));
1549       break;
1550     case RROTATE_EXPR:
1551       n->n = (n->n >> count) | (n->n << ((n->size * BITS_PER_UNIT) - count));
1552       break;
1553     default:
1554       return false;
1555     }
1556   /* Zero unused bits for size.  */
1557   if (n->size < (int)sizeof (HOST_WIDEST_INT))
1558     n->n &= ((unsigned HOST_WIDEST_INT)1 << (n->size * BITS_PER_UNIT)) - 1;
1559   return true;
1560 }
1561
1562 /* Perform sanity checking for the symbolic number N and the gimple
1563    statement STMT.  */
1564
1565 static inline bool
1566 verify_symbolic_number_p (struct symbolic_number *n, gimple stmt)
1567 {
1568   tree lhs_type;
1569
1570   lhs_type = gimple_expr_type (stmt);
1571
1572   if (TREE_CODE (lhs_type) != INTEGER_TYPE)
1573     return false;
1574
1575   if (TYPE_PRECISION (lhs_type) != n->size * BITS_PER_UNIT)
1576     return false;
1577
1578   return true;
1579 }
1580
1581 /* find_bswap_1 invokes itself recursively with N and tries to perform
1582    the operation given by the rhs of STMT on the result.  If the
1583    operation could successfully be executed the function returns the
1584    tree expression of the source operand and NULL otherwise.  */
1585
1586 static tree
1587 find_bswap_1 (gimple stmt, struct symbolic_number *n, int limit)
1588 {
1589   enum tree_code code;
1590   tree rhs1, rhs2 = NULL;
1591   gimple rhs1_stmt, rhs2_stmt;
1592   tree source_expr1;
1593   enum gimple_rhs_class rhs_class;
1594
1595   if (!limit || !is_gimple_assign (stmt))
1596     return NULL_TREE;
1597
1598   rhs1 = gimple_assign_rhs1 (stmt);
1599
1600   if (TREE_CODE (rhs1) != SSA_NAME)
1601     return NULL_TREE;
1602
1603   code = gimple_assign_rhs_code (stmt);
1604   rhs_class = gimple_assign_rhs_class (stmt);
1605   rhs1_stmt = SSA_NAME_DEF_STMT (rhs1);
1606
1607   if (rhs_class == GIMPLE_BINARY_RHS)
1608     rhs2 = gimple_assign_rhs2 (stmt);
1609
1610   /* Handle unary rhs and binary rhs with integer constants as second
1611      operand.  */
1612
1613   if (rhs_class == GIMPLE_UNARY_RHS
1614       || (rhs_class == GIMPLE_BINARY_RHS
1615           && TREE_CODE (rhs2) == INTEGER_CST))
1616     {
1617       if (code != BIT_AND_EXPR
1618           && code != LSHIFT_EXPR
1619           && code != RSHIFT_EXPR
1620           && code != LROTATE_EXPR
1621           && code != RROTATE_EXPR
1622           && code != NOP_EXPR
1623           && code != CONVERT_EXPR)
1624         return NULL_TREE;
1625
1626       source_expr1 = find_bswap_1 (rhs1_stmt, n, limit - 1);
1627
1628       /* If find_bswap_1 returned NULL STMT is a leaf node and we have
1629          to initialize the symbolic number.  */
1630       if (!source_expr1)
1631         {
1632           /* Set up the symbolic number N by setting each byte to a
1633              value between 1 and the byte size of rhs1.  The highest
1634              order byte is set to n->size and the lowest order
1635              byte to 1.  */
1636           n->size = TYPE_PRECISION (TREE_TYPE (rhs1));
1637           if (n->size % BITS_PER_UNIT != 0)
1638             return NULL_TREE;
1639           n->size /= BITS_PER_UNIT;
1640           n->n = (sizeof (HOST_WIDEST_INT) < 8 ? 0 :
1641                   (unsigned HOST_WIDEST_INT)0x08070605 << 32 | 0x04030201);
1642
1643           if (n->size < (int)sizeof (HOST_WIDEST_INT))
1644             n->n &= ((unsigned HOST_WIDEST_INT)1 <<
1645                      (n->size * BITS_PER_UNIT)) - 1;
1646
1647           source_expr1 = rhs1;
1648         }
1649
1650       switch (code)
1651         {
1652         case BIT_AND_EXPR:
1653           {
1654             int i;
1655             unsigned HOST_WIDEST_INT val = widest_int_cst_value (rhs2);
1656             unsigned HOST_WIDEST_INT tmp = val;
1657
1658             /* Only constants masking full bytes are allowed.  */
1659             for (i = 0; i < n->size; i++, tmp >>= BITS_PER_UNIT)
1660               if ((tmp & 0xff) != 0 && (tmp & 0xff) != 0xff)
1661                 return NULL_TREE;
1662
1663             n->n &= val;
1664           }
1665           break;
1666         case LSHIFT_EXPR:
1667         case RSHIFT_EXPR:
1668         case LROTATE_EXPR:
1669         case RROTATE_EXPR:
1670           if (!do_shift_rotate (code, n, (int)TREE_INT_CST_LOW (rhs2)))
1671             return NULL_TREE;
1672           break;
1673         CASE_CONVERT:
1674           {
1675             int type_size;
1676
1677             type_size = TYPE_PRECISION (gimple_expr_type (stmt));
1678             if (type_size % BITS_PER_UNIT != 0)
1679               return NULL_TREE;
1680
1681             if (type_size / BITS_PER_UNIT < (int)(sizeof (HOST_WIDEST_INT)))
1682               {
1683                 /* If STMT casts to a smaller type mask out the bits not
1684                    belonging to the target type.  */
1685                 n->n &= ((unsigned HOST_WIDEST_INT)1 << type_size) - 1;
1686               }
1687             n->size = type_size / BITS_PER_UNIT;
1688           }
1689           break;
1690         default:
1691           return NULL_TREE;
1692         };
1693       return verify_symbolic_number_p (n, stmt) ? source_expr1 : NULL;
1694     }
1695
1696   /* Handle binary rhs.  */
1697
1698   if (rhs_class == GIMPLE_BINARY_RHS)
1699     {
1700       struct symbolic_number n1, n2;
1701       tree source_expr2;
1702
1703       if (code != BIT_IOR_EXPR)
1704         return NULL_TREE;
1705
1706       if (TREE_CODE (rhs2) != SSA_NAME)
1707         return NULL_TREE;
1708
1709       rhs2_stmt = SSA_NAME_DEF_STMT (rhs2);
1710
1711       switch (code)
1712         {
1713         case BIT_IOR_EXPR:
1714           source_expr1 = find_bswap_1 (rhs1_stmt, &n1, limit - 1);
1715
1716           if (!source_expr1)
1717             return NULL_TREE;
1718
1719           source_expr2 = find_bswap_1 (rhs2_stmt, &n2, limit - 1);
1720
1721           if (source_expr1 != source_expr2
1722               || n1.size != n2.size)
1723             return NULL_TREE;
1724
1725           n->size = n1.size;
1726           n->n = n1.n | n2.n;
1727
1728           if (!verify_symbolic_number_p (n, stmt))
1729             return NULL_TREE;
1730
1731           break;
1732         default:
1733           return NULL_TREE;
1734         }
1735       return source_expr1;
1736     }
1737   return NULL_TREE;
1738 }
1739
1740 /* Check if STMT completes a bswap implementation consisting of ORs,
1741    SHIFTs and ANDs.  Return the source tree expression on which the
1742    byte swap is performed and NULL if no bswap was found.  */
1743
1744 static tree
1745 find_bswap (gimple stmt)
1746 {
1747 /* The number which the find_bswap result should match in order to
1748    have a full byte swap.  The number is shifted to the left according
1749    to the size of the symbolic number before using it.  */
1750   unsigned HOST_WIDEST_INT cmp =
1751     sizeof (HOST_WIDEST_INT) < 8 ? 0 :
1752     (unsigned HOST_WIDEST_INT)0x01020304 << 32 | 0x05060708;
1753
1754   struct symbolic_number n;
1755   tree source_expr;
1756   int limit;
1757
1758   /* The last parameter determines the depth search limit.  It usually
1759      correlates directly to the number of bytes to be touched.  We
1760      increase that number by three  here in order to also
1761      cover signed -> unsigned converions of the src operand as can be seen
1762      in libgcc, and for initial shift/and operation of the src operand.  */
1763   limit = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (gimple_expr_type (stmt)));
1764   limit += 1 + (int) ceil_log2 ((unsigned HOST_WIDE_INT) limit);
1765   source_expr =  find_bswap_1 (stmt, &n, limit);
1766
1767   if (!source_expr)
1768     return NULL_TREE;
1769
1770   /* Zero out the extra bits of N and CMP.  */
1771   if (n.size < (int)sizeof (HOST_WIDEST_INT))
1772     {
1773       unsigned HOST_WIDEST_INT mask =
1774         ((unsigned HOST_WIDEST_INT)1 << (n.size * BITS_PER_UNIT)) - 1;
1775
1776       n.n &= mask;
1777       cmp >>= (sizeof (HOST_WIDEST_INT) - n.size) * BITS_PER_UNIT;
1778     }
1779
1780   /* A complete byte swap should make the symbolic number to start
1781      with the largest digit in the highest order byte.  */
1782   if (cmp != n.n)
1783     return NULL_TREE;
1784
1785   return source_expr;
1786 }
1787
1788 /* Find manual byte swap implementations and turn them into a bswap
1789    builtin invokation.  */
1790
1791 static unsigned int
1792 execute_optimize_bswap (void)
1793 {
1794   basic_block bb;
1795   bool bswap32_p, bswap64_p;
1796   bool changed = false;
1797   tree bswap32_type = NULL_TREE, bswap64_type = NULL_TREE;
1798
1799   if (BITS_PER_UNIT != 8)
1800     return 0;
1801
1802   if (sizeof (HOST_WIDEST_INT) < 8)
1803     return 0;
1804
1805   bswap32_p = (builtin_decl_explicit_p (BUILT_IN_BSWAP32)
1806                && optab_handler (bswap_optab, SImode) != CODE_FOR_nothing);
1807   bswap64_p = (builtin_decl_explicit_p (BUILT_IN_BSWAP64)
1808                && (optab_handler (bswap_optab, DImode) != CODE_FOR_nothing
1809                    || (bswap32_p && word_mode == SImode)));
1810
1811   if (!bswap32_p && !bswap64_p)
1812     return 0;
1813
1814   /* Determine the argument type of the builtins.  The code later on
1815      assumes that the return and argument type are the same.  */
1816   if (bswap32_p)
1817     {
1818       tree fndecl = builtin_decl_explicit (BUILT_IN_BSWAP32);
1819       bswap32_type = TREE_VALUE (TYPE_ARG_TYPES (TREE_TYPE (fndecl)));
1820     }
1821
1822   if (bswap64_p)
1823     {
1824       tree fndecl = builtin_decl_explicit (BUILT_IN_BSWAP64);
1825       bswap64_type = TREE_VALUE (TYPE_ARG_TYPES (TREE_TYPE (fndecl)));
1826     }
1827
1828   memset (&bswap_stats, 0, sizeof (bswap_stats));
1829
1830   FOR_EACH_BB (bb)
1831     {
1832       gimple_stmt_iterator gsi;
1833
1834       /* We do a reverse scan for bswap patterns to make sure we get the
1835          widest match. As bswap pattern matching doesn't handle
1836          previously inserted smaller bswap replacements as sub-
1837          patterns, the wider variant wouldn't be detected.  */
1838       for (gsi = gsi_last_bb (bb); !gsi_end_p (gsi); gsi_prev (&gsi))
1839         {
1840           gimple stmt = gsi_stmt (gsi);
1841           tree bswap_src, bswap_type;
1842           tree bswap_tmp;
1843           tree fndecl = NULL_TREE;
1844           int type_size;
1845           gimple call;
1846
1847           if (!is_gimple_assign (stmt)
1848               || gimple_assign_rhs_code (stmt) != BIT_IOR_EXPR)
1849             continue;
1850
1851           type_size = TYPE_PRECISION (gimple_expr_type (stmt));
1852
1853           switch (type_size)
1854             {
1855             case 32:
1856               if (bswap32_p)
1857                 {
1858                   fndecl = builtin_decl_explicit (BUILT_IN_BSWAP32);
1859                   bswap_type = bswap32_type;
1860                 }
1861               break;
1862             case 64:
1863               if (bswap64_p)
1864                 {
1865                   fndecl = builtin_decl_explicit (BUILT_IN_BSWAP64);
1866                   bswap_type = bswap64_type;
1867                 }
1868               break;
1869             default:
1870               continue;
1871             }
1872
1873           if (!fndecl)
1874             continue;
1875
1876           bswap_src = find_bswap (stmt);
1877
1878           if (!bswap_src)
1879             continue;
1880
1881           changed = true;
1882           if (type_size == 32)
1883             bswap_stats.found_32bit++;
1884           else
1885             bswap_stats.found_64bit++;
1886
1887           bswap_tmp = bswap_src;
1888
1889           /* Convert the src expression if necessary.  */
1890           if (!useless_type_conversion_p (TREE_TYPE (bswap_tmp), bswap_type))
1891             {
1892               gimple convert_stmt;
1893               bswap_tmp = make_temp_ssa_name (bswap_type, NULL, "bswapsrc");
1894               convert_stmt = gimple_build_assign_with_ops
1895                                 (NOP_EXPR, bswap_tmp, bswap_src, NULL);
1896               gsi_insert_before (&gsi, convert_stmt, GSI_SAME_STMT);
1897             }
1898
1899           call = gimple_build_call (fndecl, 1, bswap_tmp);
1900
1901           bswap_tmp = gimple_assign_lhs (stmt);
1902
1903           /* Convert the result if necessary.  */
1904           if (!useless_type_conversion_p (TREE_TYPE (bswap_tmp), bswap_type))
1905             {
1906               gimple convert_stmt;
1907               bswap_tmp = make_temp_ssa_name (bswap_type, NULL, "bswapdst");
1908               convert_stmt = gimple_build_assign_with_ops
1909                         (NOP_EXPR, gimple_assign_lhs (stmt), bswap_tmp, NULL);
1910               gsi_insert_after (&gsi, convert_stmt, GSI_SAME_STMT);
1911             }
1912
1913           gimple_call_set_lhs (call, bswap_tmp);
1914
1915           if (dump_file)
1916             {
1917               fprintf (dump_file, "%d bit bswap implementation found at: ",
1918                        (int)type_size);
1919               print_gimple_stmt (dump_file, stmt, 0, 0);
1920             }
1921
1922           gsi_insert_after (&gsi, call, GSI_SAME_STMT);
1923           gsi_remove (&gsi, true);
1924         }
1925     }
1926
1927   statistics_counter_event (cfun, "32-bit bswap implementations found",
1928                             bswap_stats.found_32bit);
1929   statistics_counter_event (cfun, "64-bit bswap implementations found",
1930                             bswap_stats.found_64bit);
1931
1932   return (changed ? TODO_update_ssa | TODO_verify_ssa
1933           | TODO_verify_stmts : 0);
1934 }
1935
1936 static bool
1937 gate_optimize_bswap (void)
1938 {
1939   return flag_expensive_optimizations && optimize;
1940 }
1941
1942 struct gimple_opt_pass pass_optimize_bswap =
1943 {
1944  {
1945   GIMPLE_PASS,
1946   "bswap",                              /* name */
1947   gate_optimize_bswap,                  /* gate */
1948   execute_optimize_bswap,               /* execute */
1949   NULL,                                 /* sub */
1950   NULL,                                 /* next */
1951   0,                                    /* static_pass_number */
1952   TV_NONE,                              /* tv_id */
1953   PROP_ssa,                             /* properties_required */
1954   0,                                    /* properties_provided */
1955   0,                                    /* properties_destroyed */
1956   0,                                    /* todo_flags_start */
1957   0                                     /* todo_flags_finish */
1958  }
1959 };
1960
1961 /* Return true if stmt is a type conversion operation that can be stripped
1962    when used in a widening multiply operation.  */
1963 static bool
1964 widening_mult_conversion_strippable_p (tree result_type, gimple stmt)
1965 {
1966   enum tree_code rhs_code = gimple_assign_rhs_code (stmt);
1967
1968   if (TREE_CODE (result_type) == INTEGER_TYPE)
1969     {
1970       tree op_type;
1971       tree inner_op_type;
1972
1973       if (!CONVERT_EXPR_CODE_P (rhs_code))
1974         return false;
1975
1976       op_type = TREE_TYPE (gimple_assign_lhs (stmt));
1977
1978       /* If the type of OP has the same precision as the result, then
1979          we can strip this conversion.  The multiply operation will be
1980          selected to create the correct extension as a by-product.  */
1981       if (TYPE_PRECISION (result_type) == TYPE_PRECISION (op_type))
1982         return true;
1983
1984       /* We can also strip a conversion if it preserves the signed-ness of
1985          the operation and doesn't narrow the range.  */
1986       inner_op_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
1987
1988       /* If the inner-most type is unsigned, then we can strip any
1989          intermediate widening operation.  If it's signed, then the
1990          intermediate widening operation must also be signed.  */
1991       if ((TYPE_UNSIGNED (inner_op_type)
1992            || TYPE_UNSIGNED (op_type) == TYPE_UNSIGNED (inner_op_type))
1993           && TYPE_PRECISION (op_type) > TYPE_PRECISION (inner_op_type))
1994         return true;
1995
1996       return false;
1997     }
1998
1999   return rhs_code == FIXED_CONVERT_EXPR;
2000 }
2001
2002 /* Return true if RHS is a suitable operand for a widening multiplication,
2003    assuming a target type of TYPE.
2004    There are two cases:
2005
2006      - RHS makes some value at least twice as wide.  Store that value
2007        in *NEW_RHS_OUT if so, and store its type in *TYPE_OUT.
2008
2009      - RHS is an integer constant.  Store that value in *NEW_RHS_OUT if so,
2010        but leave *TYPE_OUT untouched.  */
2011
2012 static bool
2013 is_widening_mult_rhs_p (tree type, tree rhs, tree *type_out,
2014                         tree *new_rhs_out)
2015 {
2016   gimple stmt;
2017   tree type1, rhs1;
2018
2019   if (TREE_CODE (rhs) == SSA_NAME)
2020     {
2021       stmt = SSA_NAME_DEF_STMT (rhs);
2022       if (is_gimple_assign (stmt))
2023         {
2024           if (! widening_mult_conversion_strippable_p (type, stmt))
2025             rhs1 = rhs;
2026           else
2027             {
2028               rhs1 = gimple_assign_rhs1 (stmt);
2029
2030               if (TREE_CODE (rhs1) == INTEGER_CST)
2031                 {
2032                   *new_rhs_out = rhs1;
2033                   *type_out = NULL;
2034                   return true;
2035                 }
2036             }
2037         }
2038       else
2039         rhs1 = rhs;
2040
2041       type1 = TREE_TYPE (rhs1);
2042
2043       if (TREE_CODE (type1) != TREE_CODE (type)
2044           || TYPE_PRECISION (type1) * 2 > TYPE_PRECISION (type))
2045         return false;
2046
2047       *new_rhs_out = rhs1;
2048       *type_out = type1;
2049       return true;
2050     }
2051
2052   if (TREE_CODE (rhs) == INTEGER_CST)
2053     {
2054       *new_rhs_out = rhs;
2055       *type_out = NULL;
2056       return true;
2057     }
2058
2059   return false;
2060 }
2061
2062 /* Return true if STMT performs a widening multiplication, assuming the
2063    output type is TYPE.  If so, store the unwidened types of the operands
2064    in *TYPE1_OUT and *TYPE2_OUT respectively.  Also fill *RHS1_OUT and
2065    *RHS2_OUT such that converting those operands to types *TYPE1_OUT
2066    and *TYPE2_OUT would give the operands of the multiplication.  */
2067
2068 static bool
2069 is_widening_mult_p (gimple stmt,
2070                     tree *type1_out, tree *rhs1_out,
2071                     tree *type2_out, tree *rhs2_out)
2072 {
2073   tree type = TREE_TYPE (gimple_assign_lhs (stmt));
2074
2075   if (TREE_CODE (type) != INTEGER_TYPE
2076       && TREE_CODE (type) != FIXED_POINT_TYPE)
2077     return false;
2078
2079   if (!is_widening_mult_rhs_p (type, gimple_assign_rhs1 (stmt), type1_out,
2080                                rhs1_out))
2081     return false;
2082
2083   if (!is_widening_mult_rhs_p (type, gimple_assign_rhs2 (stmt), type2_out,
2084                                rhs2_out))
2085     return false;
2086
2087   if (*type1_out == NULL)
2088     {
2089       if (*type2_out == NULL || !int_fits_type_p (*rhs1_out, *type2_out))
2090         return false;
2091       *type1_out = *type2_out;
2092     }
2093
2094   if (*type2_out == NULL)
2095     {
2096       if (!int_fits_type_p (*rhs2_out, *type1_out))
2097         return false;
2098       *type2_out = *type1_out;
2099     }
2100
2101   /* Ensure that the larger of the two operands comes first. */
2102   if (TYPE_PRECISION (*type1_out) < TYPE_PRECISION (*type2_out))
2103     {
2104       tree tmp;
2105       tmp = *type1_out;
2106       *type1_out = *type2_out;
2107       *type2_out = tmp;
2108       tmp = *rhs1_out;
2109       *rhs1_out = *rhs2_out;
2110       *rhs2_out = tmp;
2111     }
2112
2113   return true;
2114 }
2115
2116 /* Process a single gimple statement STMT, which has a MULT_EXPR as
2117    its rhs, and try to convert it into a WIDEN_MULT_EXPR.  The return
2118    value is true iff we converted the statement.  */
2119
2120 static bool
2121 convert_mult_to_widen (gimple stmt, gimple_stmt_iterator *gsi)
2122 {
2123   tree lhs, rhs1, rhs2, type, type1, type2;
2124   enum insn_code handler;
2125   enum machine_mode to_mode, from_mode, actual_mode;
2126   optab op;
2127   int actual_precision;
2128   location_t loc = gimple_location (stmt);
2129   bool from_unsigned1, from_unsigned2;
2130
2131   lhs = gimple_assign_lhs (stmt);
2132   type = TREE_TYPE (lhs);
2133   if (TREE_CODE (type) != INTEGER_TYPE)
2134     return false;
2135
2136   if (!is_widening_mult_p (stmt, &type1, &rhs1, &type2, &rhs2))
2137     return false;
2138
2139   to_mode = TYPE_MODE (type);
2140   from_mode = TYPE_MODE (type1);
2141   from_unsigned1 = TYPE_UNSIGNED (type1);
2142   from_unsigned2 = TYPE_UNSIGNED (type2);
2143
2144   if (from_unsigned1 && from_unsigned2)
2145     op = umul_widen_optab;
2146   else if (!from_unsigned1 && !from_unsigned2)
2147     op = smul_widen_optab;
2148   else
2149     op = usmul_widen_optab;
2150
2151   handler = find_widening_optab_handler_and_mode (op, to_mode, from_mode,
2152                                                   0, &actual_mode);
2153
2154   if (handler == CODE_FOR_nothing)
2155     {
2156       if (op != smul_widen_optab)
2157         {
2158           /* We can use a signed multiply with unsigned types as long as
2159              there is a wider mode to use, or it is the smaller of the two
2160              types that is unsigned.  Note that type1 >= type2, always.  */
2161           if ((TYPE_UNSIGNED (type1)
2162                && TYPE_PRECISION (type1) == GET_MODE_PRECISION (from_mode))
2163               || (TYPE_UNSIGNED (type2)
2164                   && TYPE_PRECISION (type2) == GET_MODE_PRECISION (from_mode)))
2165             {
2166               from_mode = GET_MODE_WIDER_MODE (from_mode);
2167               if (GET_MODE_SIZE (to_mode) <= GET_MODE_SIZE (from_mode))
2168                 return false;
2169             }
2170
2171           op = smul_widen_optab;
2172           handler = find_widening_optab_handler_and_mode (op, to_mode,
2173                                                           from_mode, 0,
2174                                                           &actual_mode);
2175
2176           if (handler == CODE_FOR_nothing)
2177             return false;
2178
2179           from_unsigned1 = from_unsigned2 = false;
2180         }
2181       else
2182         return false;
2183     }
2184
2185   /* Ensure that the inputs to the handler are in the correct precison
2186      for the opcode.  This will be the full mode size.  */
2187   actual_precision = GET_MODE_PRECISION (actual_mode);
2188   if (2 * actual_precision > TYPE_PRECISION (type))
2189     return false;
2190   if (actual_precision != TYPE_PRECISION (type1)
2191       || from_unsigned1 != TYPE_UNSIGNED (type1))
2192     rhs1 = build_and_insert_cast (gsi, loc,
2193                                   build_nonstandard_integer_type
2194                                     (actual_precision, from_unsigned1), rhs1);
2195   if (actual_precision != TYPE_PRECISION (type2)
2196       || from_unsigned2 != TYPE_UNSIGNED (type2))
2197     rhs2 = build_and_insert_cast (gsi, loc,
2198                                   build_nonstandard_integer_type
2199                                     (actual_precision, from_unsigned2), rhs2);
2200
2201   /* Handle constants.  */
2202   if (TREE_CODE (rhs1) == INTEGER_CST)
2203     rhs1 = fold_convert (type1, rhs1);
2204   if (TREE_CODE (rhs2) == INTEGER_CST)
2205     rhs2 = fold_convert (type2, rhs2);
2206
2207   gimple_assign_set_rhs1 (stmt, rhs1);
2208   gimple_assign_set_rhs2 (stmt, rhs2);
2209   gimple_assign_set_rhs_code (stmt, WIDEN_MULT_EXPR);
2210   update_stmt (stmt);
2211   widen_mul_stats.widen_mults_inserted++;
2212   return true;
2213 }
2214
2215 /* Process a single gimple statement STMT, which is found at the
2216    iterator GSI and has a either a PLUS_EXPR or a MINUS_EXPR as its
2217    rhs (given by CODE), and try to convert it into a
2218    WIDEN_MULT_PLUS_EXPR or a WIDEN_MULT_MINUS_EXPR.  The return value
2219    is true iff we converted the statement.  */
2220
2221 static bool
2222 convert_plusminus_to_widen (gimple_stmt_iterator *gsi, gimple stmt,
2223                             enum tree_code code)
2224 {
2225   gimple rhs1_stmt = NULL, rhs2_stmt = NULL;
2226   gimple conv1_stmt = NULL, conv2_stmt = NULL, conv_stmt;
2227   tree type, type1, type2, optype;
2228   tree lhs, rhs1, rhs2, mult_rhs1, mult_rhs2, add_rhs;
2229   enum tree_code rhs1_code = ERROR_MARK, rhs2_code = ERROR_MARK;
2230   optab this_optab;
2231   enum tree_code wmult_code;
2232   enum insn_code handler;
2233   enum machine_mode to_mode, from_mode, actual_mode;
2234   location_t loc = gimple_location (stmt);
2235   int actual_precision;
2236   bool from_unsigned1, from_unsigned2;
2237
2238   lhs = gimple_assign_lhs (stmt);
2239   type = TREE_TYPE (lhs);
2240   if (TREE_CODE (type) != INTEGER_TYPE
2241       && TREE_CODE (type) != FIXED_POINT_TYPE)
2242     return false;
2243
2244   if (code == MINUS_EXPR)
2245     wmult_code = WIDEN_MULT_MINUS_EXPR;
2246   else
2247     wmult_code = WIDEN_MULT_PLUS_EXPR;
2248
2249   rhs1 = gimple_assign_rhs1 (stmt);
2250   rhs2 = gimple_assign_rhs2 (stmt);
2251
2252   if (TREE_CODE (rhs1) == SSA_NAME)
2253     {
2254       rhs1_stmt = SSA_NAME_DEF_STMT (rhs1);
2255       if (is_gimple_assign (rhs1_stmt))
2256         rhs1_code = gimple_assign_rhs_code (rhs1_stmt);
2257     }
2258
2259   if (TREE_CODE (rhs2) == SSA_NAME)
2260     {
2261       rhs2_stmt = SSA_NAME_DEF_STMT (rhs2);
2262       if (is_gimple_assign (rhs2_stmt))
2263         rhs2_code = gimple_assign_rhs_code (rhs2_stmt);
2264     }
2265
2266   /* Allow for one conversion statement between the multiply
2267      and addition/subtraction statement.  If there are more than
2268      one conversions then we assume they would invalidate this
2269      transformation.  If that's not the case then they should have
2270      been folded before now.  */
2271   if (CONVERT_EXPR_CODE_P (rhs1_code))
2272     {
2273       conv1_stmt = rhs1_stmt;
2274       rhs1 = gimple_assign_rhs1 (rhs1_stmt);
2275       if (TREE_CODE (rhs1) == SSA_NAME)
2276         {
2277           rhs1_stmt = SSA_NAME_DEF_STMT (rhs1);
2278           if (is_gimple_assign (rhs1_stmt))
2279             rhs1_code = gimple_assign_rhs_code (rhs1_stmt);
2280         }
2281       else
2282         return false;
2283     }
2284   if (CONVERT_EXPR_CODE_P (rhs2_code))
2285     {
2286       conv2_stmt = rhs2_stmt;
2287       rhs2 = gimple_assign_rhs1 (rhs2_stmt);
2288       if (TREE_CODE (rhs2) == SSA_NAME)
2289         {
2290           rhs2_stmt = SSA_NAME_DEF_STMT (rhs2);
2291           if (is_gimple_assign (rhs2_stmt))
2292             rhs2_code = gimple_assign_rhs_code (rhs2_stmt);
2293         }
2294       else
2295         return false;
2296     }
2297
2298   /* If code is WIDEN_MULT_EXPR then it would seem unnecessary to call
2299      is_widening_mult_p, but we still need the rhs returns.
2300
2301      It might also appear that it would be sufficient to use the existing
2302      operands of the widening multiply, but that would limit the choice of
2303      multiply-and-accumulate instructions.  */
2304   if (code == PLUS_EXPR
2305       && (rhs1_code == MULT_EXPR || rhs1_code == WIDEN_MULT_EXPR))
2306     {
2307       if (!is_widening_mult_p (rhs1_stmt, &type1, &mult_rhs1,
2308                                &type2, &mult_rhs2))
2309         return false;
2310       add_rhs = rhs2;
2311       conv_stmt = conv1_stmt;
2312     }
2313   else if (rhs2_code == MULT_EXPR || rhs2_code == WIDEN_MULT_EXPR)
2314     {
2315       if (!is_widening_mult_p (rhs2_stmt, &type1, &mult_rhs1,
2316                                &type2, &mult_rhs2))
2317         return false;
2318       add_rhs = rhs1;
2319       conv_stmt = conv2_stmt;
2320     }
2321   else
2322     return false;
2323
2324   to_mode = TYPE_MODE (type);
2325   from_mode = TYPE_MODE (type1);
2326   from_unsigned1 = TYPE_UNSIGNED (type1);
2327   from_unsigned2 = TYPE_UNSIGNED (type2);
2328   optype = type1;
2329
2330   /* There's no such thing as a mixed sign madd yet, so use a wider mode.  */
2331   if (from_unsigned1 != from_unsigned2)
2332     {
2333       if (!INTEGRAL_TYPE_P (type))
2334         return false;
2335       /* We can use a signed multiply with unsigned types as long as
2336          there is a wider mode to use, or it is the smaller of the two
2337          types that is unsigned.  Note that type1 >= type2, always.  */
2338       if ((from_unsigned1
2339            && TYPE_PRECISION (type1) == GET_MODE_PRECISION (from_mode))
2340           || (from_unsigned2
2341               && TYPE_PRECISION (type2) == GET_MODE_PRECISION (from_mode)))
2342         {
2343           from_mode = GET_MODE_WIDER_MODE (from_mode);
2344           if (GET_MODE_SIZE (from_mode) >= GET_MODE_SIZE (to_mode))
2345             return false;
2346         }
2347
2348       from_unsigned1 = from_unsigned2 = false;
2349       optype = build_nonstandard_integer_type (GET_MODE_PRECISION (from_mode),
2350                                                false);
2351     }
2352
2353   /* If there was a conversion between the multiply and addition
2354      then we need to make sure it fits a multiply-and-accumulate.
2355      The should be a single mode change which does not change the
2356      value.  */
2357   if (conv_stmt)
2358     {
2359       /* We use the original, unmodified data types for this.  */
2360       tree from_type = TREE_TYPE (gimple_assign_rhs1 (conv_stmt));
2361       tree to_type = TREE_TYPE (gimple_assign_lhs (conv_stmt));
2362       int data_size = TYPE_PRECISION (type1) + TYPE_PRECISION (type2);
2363       bool is_unsigned = TYPE_UNSIGNED (type1) && TYPE_UNSIGNED (type2);
2364
2365       if (TYPE_PRECISION (from_type) > TYPE_PRECISION (to_type))
2366         {
2367           /* Conversion is a truncate.  */
2368           if (TYPE_PRECISION (to_type) < data_size)
2369             return false;
2370         }
2371       else if (TYPE_PRECISION (from_type) < TYPE_PRECISION (to_type))
2372         {
2373           /* Conversion is an extend.  Check it's the right sort.  */
2374           if (TYPE_UNSIGNED (from_type) != is_unsigned
2375               && !(is_unsigned && TYPE_PRECISION (from_type) > data_size))
2376             return false;
2377         }
2378       /* else convert is a no-op for our purposes.  */
2379     }
2380
2381   /* Verify that the machine can perform a widening multiply
2382      accumulate in this mode/signedness combination, otherwise
2383      this transformation is likely to pessimize code.  */
2384   this_optab = optab_for_tree_code (wmult_code, optype, optab_default);
2385   handler = find_widening_optab_handler_and_mode (this_optab, to_mode,
2386                                                   from_mode, 0, &actual_mode);
2387
2388   if (handler == CODE_FOR_nothing)
2389     return false;
2390
2391   /* Ensure that the inputs to the handler are in the correct precison
2392      for the opcode.  This will be the full mode size.  */
2393   actual_precision = GET_MODE_PRECISION (actual_mode);
2394   if (actual_precision != TYPE_PRECISION (type1)
2395       || from_unsigned1 != TYPE_UNSIGNED (type1))
2396     mult_rhs1 = build_and_insert_cast (gsi, loc,
2397                                        build_nonstandard_integer_type
2398                                          (actual_precision, from_unsigned1),
2399                                        mult_rhs1);
2400   if (actual_precision != TYPE_PRECISION (type2)
2401       || from_unsigned2 != TYPE_UNSIGNED (type2))
2402     mult_rhs2 = build_and_insert_cast (gsi, loc,
2403                                        build_nonstandard_integer_type
2404                                          (actual_precision, from_unsigned2),
2405                                        mult_rhs2);
2406
2407   if (!useless_type_conversion_p (type, TREE_TYPE (add_rhs)))
2408     add_rhs = build_and_insert_cast (gsi, loc, type, add_rhs);
2409
2410   /* Handle constants.  */
2411   if (TREE_CODE (mult_rhs1) == INTEGER_CST)
2412     mult_rhs1 = fold_convert (type1, mult_rhs1);
2413   if (TREE_CODE (mult_rhs2) == INTEGER_CST)
2414     mult_rhs2 = fold_convert (type2, mult_rhs2);
2415
2416   gimple_assign_set_rhs_with_ops_1 (gsi, wmult_code, mult_rhs1, mult_rhs2,
2417                                     add_rhs);
2418   update_stmt (gsi_stmt (*gsi));
2419   widen_mul_stats.maccs_inserted++;
2420   return true;
2421 }
2422
2423 /* Combine the multiplication at MUL_STMT with operands MULOP1 and MULOP2
2424    with uses in additions and subtractions to form fused multiply-add
2425    operations.  Returns true if successful and MUL_STMT should be removed.  */
2426
2427 static bool
2428 convert_mult_to_fma (gimple mul_stmt, tree op1, tree op2)
2429 {
2430   tree mul_result = gimple_get_lhs (mul_stmt);
2431   tree type = TREE_TYPE (mul_result);
2432   gimple use_stmt, neguse_stmt, fma_stmt;
2433   use_operand_p use_p;
2434   imm_use_iterator imm_iter;
2435
2436   if (FLOAT_TYPE_P (type)
2437       && flag_fp_contract_mode == FP_CONTRACT_OFF)
2438     return false;
2439
2440   /* We don't want to do bitfield reduction ops.  */
2441   if (INTEGRAL_TYPE_P (type)
2442       && (TYPE_PRECISION (type)
2443           != GET_MODE_PRECISION (TYPE_MODE (type))))
2444     return false;
2445
2446   /* If the target doesn't support it, don't generate it.  We assume that
2447      if fma isn't available then fms, fnma or fnms are not either.  */
2448   if (optab_handler (fma_optab, TYPE_MODE (type)) == CODE_FOR_nothing)
2449     return false;
2450
2451   /* If the multiplication has zero uses, it is kept around probably because
2452      of -fnon-call-exceptions.  Don't optimize it away in that case,
2453      it is DCE job.  */
2454   if (has_zero_uses (mul_result))
2455     return false;
2456
2457   /* Make sure that the multiplication statement becomes dead after
2458      the transformation, thus that all uses are transformed to FMAs.
2459      This means we assume that an FMA operation has the same cost
2460      as an addition.  */
2461   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, mul_result)
2462     {
2463       enum tree_code use_code;
2464       tree result = mul_result;
2465       bool negate_p = false;
2466
2467       use_stmt = USE_STMT (use_p);
2468
2469       if (is_gimple_debug (use_stmt))
2470         continue;
2471
2472       /* For now restrict this operations to single basic blocks.  In theory
2473          we would want to support sinking the multiplication in
2474          m = a*b;
2475          if ()
2476            ma = m + c;
2477          else
2478            d = m;
2479          to form a fma in the then block and sink the multiplication to the
2480          else block.  */
2481       if (gimple_bb (use_stmt) != gimple_bb (mul_stmt))
2482         return false;
2483
2484       if (!is_gimple_assign (use_stmt))
2485         return false;
2486
2487       use_code = gimple_assign_rhs_code (use_stmt);
2488
2489       /* A negate on the multiplication leads to FNMA.  */
2490       if (use_code == NEGATE_EXPR)
2491         {
2492           ssa_op_iter iter;
2493           use_operand_p usep;
2494
2495           result = gimple_assign_lhs (use_stmt);
2496
2497           /* Make sure the negate statement becomes dead with this
2498              single transformation.  */
2499           if (!single_imm_use (gimple_assign_lhs (use_stmt),
2500                                &use_p, &neguse_stmt))
2501             return false;
2502
2503           /* Make sure the multiplication isn't also used on that stmt.  */
2504           FOR_EACH_PHI_OR_STMT_USE (usep, neguse_stmt, iter, SSA_OP_USE)
2505             if (USE_FROM_PTR (usep) == mul_result)
2506               return false;
2507
2508           /* Re-validate.  */
2509           use_stmt = neguse_stmt;
2510           if (gimple_bb (use_stmt) != gimple_bb (mul_stmt))
2511             return false;
2512           if (!is_gimple_assign (use_stmt))
2513             return false;
2514
2515           use_code = gimple_assign_rhs_code (use_stmt);
2516           negate_p = true;
2517         }
2518
2519       switch (use_code)
2520         {
2521         case MINUS_EXPR:
2522           if (gimple_assign_rhs2 (use_stmt) == result)
2523             negate_p = !negate_p;
2524           break;
2525         case PLUS_EXPR:
2526           break;
2527         default:
2528           /* FMA can only be formed from PLUS and MINUS.  */
2529           return false;
2530         }
2531
2532       /* We can't handle a * b + a * b.  */
2533       if (gimple_assign_rhs1 (use_stmt) == gimple_assign_rhs2 (use_stmt))
2534         return false;
2535
2536       /* While it is possible to validate whether or not the exact form
2537          that we've recognized is available in the backend, the assumption
2538          is that the transformation is never a loss.  For instance, suppose
2539          the target only has the plain FMA pattern available.  Consider
2540          a*b-c -> fma(a,b,-c): we've exchanged MUL+SUB for FMA+NEG, which
2541          is still two operations.  Consider -(a*b)-c -> fma(-a,b,-c): we
2542          still have 3 operations, but in the FMA form the two NEGs are
2543          independent and could be run in parallel.  */
2544     }
2545
2546   FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, mul_result)
2547     {
2548       gimple_stmt_iterator gsi = gsi_for_stmt (use_stmt);
2549       enum tree_code use_code;
2550       tree addop, mulop1 = op1, result = mul_result;
2551       bool negate_p = false;
2552
2553       if (is_gimple_debug (use_stmt))
2554         continue;
2555
2556       use_code = gimple_assign_rhs_code (use_stmt);
2557       if (use_code == NEGATE_EXPR)
2558         {
2559           result = gimple_assign_lhs (use_stmt);
2560           single_imm_use (gimple_assign_lhs (use_stmt), &use_p, &neguse_stmt);
2561           gsi_remove (&gsi, true);
2562           release_defs (use_stmt);
2563
2564           use_stmt = neguse_stmt;
2565           gsi = gsi_for_stmt (use_stmt);
2566           use_code = gimple_assign_rhs_code (use_stmt);
2567           negate_p = true;
2568         }
2569
2570       if (gimple_assign_rhs1 (use_stmt) == result)
2571         {
2572           addop = gimple_assign_rhs2 (use_stmt);
2573           /* a * b - c -> a * b + (-c)  */
2574           if (gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
2575             addop = force_gimple_operand_gsi (&gsi,
2576                                               build1 (NEGATE_EXPR,
2577                                                       type, addop),
2578                                               true, NULL_TREE, true,
2579                                               GSI_SAME_STMT);
2580         }
2581       else
2582         {
2583           addop = gimple_assign_rhs1 (use_stmt);
2584           /* a - b * c -> (-b) * c + a */
2585           if (gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
2586             negate_p = !negate_p;
2587         }
2588
2589       if (negate_p)
2590         mulop1 = force_gimple_operand_gsi (&gsi,
2591                                            build1 (NEGATE_EXPR,
2592                                                    type, mulop1),
2593                                            true, NULL_TREE, true,
2594                                            GSI_SAME_STMT);
2595
2596       fma_stmt = gimple_build_assign_with_ops3 (FMA_EXPR,
2597                                                 gimple_assign_lhs (use_stmt),
2598                                                 mulop1, op2,
2599                                                 addop);
2600       gsi_replace (&gsi, fma_stmt, true);
2601       widen_mul_stats.fmas_inserted++;
2602     }
2603
2604   return true;
2605 }
2606
2607 /* Find integer multiplications where the operands are extended from
2608    smaller types, and replace the MULT_EXPR with a WIDEN_MULT_EXPR
2609    where appropriate.  */
2610
2611 static unsigned int
2612 execute_optimize_widening_mul (void)
2613 {
2614   basic_block bb;
2615   bool cfg_changed = false;
2616
2617   memset (&widen_mul_stats, 0, sizeof (widen_mul_stats));
2618
2619   FOR_EACH_BB (bb)
2620     {
2621       gimple_stmt_iterator gsi;
2622
2623       for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi);)
2624         {
2625           gimple stmt = gsi_stmt (gsi);
2626           enum tree_code code;
2627
2628           if (is_gimple_assign (stmt))
2629             {
2630               code = gimple_assign_rhs_code (stmt);
2631               switch (code)
2632                 {
2633                 case MULT_EXPR:
2634                   if (!convert_mult_to_widen (stmt, &gsi)
2635                       && convert_mult_to_fma (stmt,
2636                                               gimple_assign_rhs1 (stmt),
2637                                               gimple_assign_rhs2 (stmt)))
2638                     {
2639                       gsi_remove (&gsi, true);
2640                       release_defs (stmt);
2641                       continue;
2642                     }
2643                   break;
2644
2645                 case PLUS_EXPR:
2646                 case MINUS_EXPR:
2647                   convert_plusminus_to_widen (&gsi, stmt, code);
2648                   break;
2649
2650                 default:;
2651                 }
2652             }
2653           else if (is_gimple_call (stmt)
2654                    && gimple_call_lhs (stmt))
2655             {
2656               tree fndecl = gimple_call_fndecl (stmt);
2657               if (fndecl
2658                   && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_NORMAL)
2659                 {
2660                   switch (DECL_FUNCTION_CODE (fndecl))
2661                     {
2662                       case BUILT_IN_POWF:
2663                       case BUILT_IN_POW:
2664                       case BUILT_IN_POWL:
2665                         if (TREE_CODE (gimple_call_arg (stmt, 1)) == REAL_CST
2666                             && REAL_VALUES_EQUAL
2667                                  (TREE_REAL_CST (gimple_call_arg (stmt, 1)),
2668                                   dconst2)
2669                             && convert_mult_to_fma (stmt,
2670                                                     gimple_call_arg (stmt, 0),
2671                                                     gimple_call_arg (stmt, 0)))
2672                           {
2673                             unlink_stmt_vdef (stmt);
2674                             if (gsi_remove (&gsi, true)
2675                                 && gimple_purge_dead_eh_edges (bb))
2676                               cfg_changed = true;
2677                             release_defs (stmt);
2678                             continue;
2679                           }
2680                           break;
2681
2682                       default:;
2683                     }
2684                 }
2685             }
2686           gsi_next (&gsi);
2687         }
2688     }
2689
2690   statistics_counter_event (cfun, "widening multiplications inserted",
2691                             widen_mul_stats.widen_mults_inserted);
2692   statistics_counter_event (cfun, "widening maccs inserted",
2693                             widen_mul_stats.maccs_inserted);
2694   statistics_counter_event (cfun, "fused multiply-adds inserted",
2695                             widen_mul_stats.fmas_inserted);
2696
2697   return cfg_changed ? TODO_cleanup_cfg : 0;
2698 }
2699
2700 static bool
2701 gate_optimize_widening_mul (void)
2702 {
2703   return flag_expensive_optimizations && optimize;
2704 }
2705
2706 struct gimple_opt_pass pass_optimize_widening_mul =
2707 {
2708  {
2709   GIMPLE_PASS,
2710   "widening_mul",                       /* name */
2711   gate_optimize_widening_mul,           /* gate */
2712   execute_optimize_widening_mul,        /* execute */
2713   NULL,                                 /* sub */
2714   NULL,                                 /* next */
2715   0,                                    /* static_pass_number */
2716   TV_NONE,                              /* tv_id */
2717   PROP_ssa,                             /* properties_required */
2718   0,                                    /* properties_provided */
2719   0,                                    /* properties_destroyed */
2720   0,                                    /* todo_flags_start */
2721   TODO_verify_ssa
2722   | TODO_verify_stmts
2723   | TODO_update_ssa                     /* todo_flags_finish */
2724  }
2725 };