1 /* Global, SSA-based optimizations using mathematical identities.
2 Copyright (C) 2005, 2006, 2007, 2008, 2009, 2010, 2011
3 Free Software Foundation, Inc.
4
5 This file is part of GCC.
6
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by the
9 Free Software Foundation; either version 3, or (at your option) any
10 later version.
11
12 GCC is distributed in the hope that it will be useful, but WITHOUT
13 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
15 for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
20
21 /* Currently, the only mini-pass in this file tries to CSE reciprocal
22 operations. These are common in sequences such as this one:
23
24 modulus = sqrt(x*x + y*y + z*z);
25 x = x / modulus;
26 y = y / modulus;
27 z = z / modulus;
28
29 that can be optimized to
30
31 modulus = sqrt(x*x + y*y + z*z);
32 rmodulus = 1.0 / modulus;
33 x = x * rmodulus;
34 y = y * rmodulus;
35 z = z * rmodulus;
36
37 We do this for loop invariant divisors, and with this pass whenever
38 we notice that a division has the same divisor multiple times.
39
40 Of course, like in PRE, we don't insert a division if a dominator
41 already has one. However, this cannot be done as an extension of
42 PRE for several reasons.
43
44 First of all, experiments showed that the transformation is not
45 always useful if there are only two divisions by the same divisor.
46 This is probably because modern processors can pipeline the
47 divisions; on older, in-order processors it should still be
48 effective to optimize two divisions by the same number.
49 We make this a param, and it shall be called N in the remainder of
50 this comment.
51
52 Second, if trapping math is active, we have less freedom on where
53 to insert divisions: we can only do so in basic blocks that already
54 contain one. (If divisions don't trap, on the other hand, we can
55 insert them elsewhere, namely in blocks that are common dominators
56 of those that contain a division.)
57
58 We really don't want to compute the reciprocal unless a division will
59 be found. To do this, we won't insert the division in a basic block
60 that has fewer than N divisions *post-dominating* it.
61
62 The algorithm constructs a subset of the dominator tree, holding the
63 blocks containing the divisions and the common dominators to them,
64 and walks it twice. The first walk is in post-order, and it annotates
65 each block with the number of divisions that post-dominate it: this
66 gives information on where divisions can be inserted profitably.
67 The second walk is in pre-order, and it inserts divisions as explained
68 above, and replaces divisions by multiplications.
69
70 In the best case, the cost of the pass is O(n_statements). In the
71 worst-case, the cost is due to creating the dominator tree subset,
72 with a cost of O(n_basic_blocks ^ 2); however this can only happen
73 for n_statements / n_basic_blocks statements. So, the amortized cost
74 of creating the dominator tree subset is O(n_basic_blocks) and the
75 worst-case cost of the pass is O(n_statements * n_basic_blocks).
76
77 More practically, the cost will be small because there are few
78 divisions, and they tend to be in the same basic block, so insert_bb
79 is called very few times.
80
81 If we did this using domwalk.c, an efficient implementation would have
82 to work on all the variables in a single pass, because we could not
83 work on just a subset of the dominator tree, as we do now, and the
84 cost would also be something like O(n_statements * n_basic_blocks).
85 The data structures would be more complex in order to work on all the
86 variables in a single pass. */
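/* As an illustration of the insertion rule above (a sketch, not taken
   from any particular test case), consider divisions by the same SSA
   name D, where BB1 and BB2 both fall through to BB3:

       BB0:  D = ...;  if (cond) goto BB1; else goto BB2;
       BB1:  x = a / D;
       BB2:  (no division)
       BB3:  y = b / D;  z = c / D;     <- BB3 post-dominates BB0

   With non-trapping math and N = 2, BB0 has two divisions
   post-dominating it (the ones in BB3), so the reciprocal T = 1.0 / D
   is inserted in BB0 and all three divisions become multiplications
   by T.  With trapping math, BB0 is not a candidate; the reciprocal is
   instead inserted in BB3, and only the two divisions there are
   replaced.  */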
87
88 #include "config.h"
89 #include "system.h"
90 #include "coretypes.h"
91 #include "tm.h"
92 #include "flags.h"
93 #include "tree.h"
94 #include "tree-flow.h"
95 #include "tree-pass.h"
96 #include "alloc-pool.h"
97 #include "basic-block.h"
98 #include "target.h"
99 #include "gimple-pretty-print.h"
100
101 /* FIXME: RTL headers have to be included here for optabs. */
102 #include "rtl.h" /* Because optabs.h wants enum rtx_code. */
103 #include "expr.h" /* Because optabs.h wants sepops. */
104 #include "optabs.h"
105
106 /* This structure represents one basic block that either computes a
107 division, or is a common dominator for basic blocks that compute a
108 division. */
109 struct occurrence {
110 /* The basic block represented by this structure. */
111 basic_block bb;
112
113 /* If non-NULL, the SSA_NAME holding the definition for a reciprocal
114 inserted in BB. */
115 tree recip_def;
116
117 /* If non-NULL, the GIMPLE_ASSIGN for a reciprocal computation that
118 was inserted in BB. */
119 gimple recip_def_stmt;
120
121 /* Pointer to a list of "struct occurrence"s for blocks dominated
122 by BB. */
123 struct occurrence *children;
124
125 /* Pointer to the next "struct occurrence" in the list of blocks
126 sharing a common dominator. */
127 struct occurrence *next;
128
129 /* The number of divisions that are in BB before compute_merit. The
130 number of divisions that are in BB or post-dominate it after
131 compute_merit. */
132 int num_divisions;
133
134 /* True if the basic block has a division, false if it is a common
135 dominator for basic blocks that do. If it is false and trapping
136 math is active, BB is not a candidate for inserting a reciprocal. */
137 bool bb_has_division;
138 };
139
140 static struct
141 {
142 /* Number of 1.0/X ops inserted. */
143 int rdivs_inserted;
144
145 /* Number of 1.0/FUNC ops inserted. */
146 int rfuncs_inserted;
147 } reciprocal_stats;
148
149 static struct
150 {
151 /* Number of cexpi calls inserted. */
152 int inserted;
153 } sincos_stats;
154
155 static struct
156 {
157 /* Number of hand-written 32-bit bswaps found. */
158 int found_32bit;
159
160 /* Number of hand-written 64-bit bswaps found. */
161 int found_64bit;
162 } bswap_stats;
163
164 static struct
165 {
166 /* Number of widening multiplication ops inserted. */
167 int widen_mults_inserted;
168
169 /* Number of integer multiply-and-accumulate ops inserted. */
170 int maccs_inserted;
171
172 /* Number of fp fused multiply-add ops inserted. */
173 int fmas_inserted;
174 } widen_mul_stats;
175
176 /* The instance of "struct occurrence" representing the highest
177 interesting block in the dominator tree. */
178 static struct occurrence *occ_head;
179
180 /* Allocation pool for getting instances of "struct occurrence". */
181 static alloc_pool occ_pool;
182
183
184
185 /* Allocate and return a new struct occurrence for basic block BB,
186 whose children list is headed by CHILDREN. */
187 static struct occurrence *
188 occ_new (basic_block bb, struct occurrence *children)
189 {
190 struct occurrence *occ;
191
192 bb->aux = occ = (struct occurrence *) pool_alloc (occ_pool);
193 memset (occ, 0, sizeof (struct occurrence));
194
195 occ->bb = bb;
196 occ->children = children;
197 return occ;
198 }
199
200
201 /* Insert NEW_OCC into our subset of the dominator tree. P_HEAD points to a
202 list of "struct occurrence"s, one per basic block, having IDOM as
203 their common dominator.
204
205 We try to insert NEW_OCC as deep as possible in the tree, and we also
206 insert any other block that is a common dominator for BB and one
207 block already in the tree. */
208
209 static void
210 insert_bb (struct occurrence *new_occ, basic_block idom,
211 struct occurrence **p_head)
212 {
213 struct occurrence *occ, **p_occ;
214
215 for (p_occ = p_head; (occ = *p_occ) != NULL; )
216 {
217 basic_block bb = new_occ->bb, occ_bb = occ->bb;
218 basic_block dom = nearest_common_dominator (CDI_DOMINATORS, occ_bb, bb);
219 if (dom == bb)
220 {
221 /* BB dominates OCC_BB. OCC becomes NEW_OCC's child: remove OCC
222 from its list. */
223 *p_occ = occ->next;
224 occ->next = new_occ->children;
225 new_occ->children = occ;
226
227 /* Try the next block (it may as well be dominated by BB). */
228 }
229
230 else if (dom == occ_bb)
231 {
232 /* OCC_BB dominates BB. Tail recurse to look deeper. */
233 insert_bb (new_occ, dom, &occ->children);
234 return;
235 }
236
237 else if (dom != idom)
238 {
239 gcc_assert (!dom->aux);
240
241 /* There is a dominator between IDOM and BB; add it and make
242 two children out of NEW_OCC and OCC. First, remove OCC from
243 its list. */
244 *p_occ = occ->next;
245 new_occ->next = occ;
246 occ->next = NULL;
247
248 /* None of the previous blocks has DOM as a dominator: if we tail
249 recursed, we would reexamine them uselessly. Just switch BB with
250 DOM, and go on looking for blocks dominated by DOM. */
251 new_occ = occ_new (dom, new_occ);
252 }
253
254 else
255 {
256 /* Nothing special, go on with the next element. */
257 p_occ = &occ->next;
258 }
259 }
260
261 /* No place was found as a child of IDOM. Make BB a sibling of IDOM. */
262 new_occ->next = *p_head;
263 *p_head = new_occ;
264 }
265
266 /* Register that we found a division in BB. */
267
268 static inline void
269 register_division_in (basic_block bb)
270 {
271 struct occurrence *occ;
272
273 occ = (struct occurrence *) bb->aux;
274 if (!occ)
275 {
276 occ = occ_new (bb, NULL);
277 insert_bb (occ, ENTRY_BLOCK_PTR, &occ_head);
278 }
279
280 occ->bb_has_division = true;
281 occ->num_divisions++;
282 }
283
284
285 /* Compute the number of divisions that postdominate each block in OCC and
286 its children. */
287
288 static void
289 compute_merit (struct occurrence *occ)
290 {
291 struct occurrence *occ_child;
292 basic_block dom = occ->bb;
293
294 for (occ_child = occ->children; occ_child; occ_child = occ_child->next)
295 {
296 basic_block bb;
297 if (occ_child->children)
298 compute_merit (occ_child);
299
300 if (flag_exceptions)
301 bb = single_noncomplex_succ (dom);
302 else
303 bb = dom;
304
305 if (dominated_by_p (CDI_POST_DOMINATORS, bb, occ_child->bb))
306 occ->num_divisions += occ_child->num_divisions;
307 }
308 }
309
310
311 /* Return whether USE_STMT is a floating-point division by DEF. */
312 static inline bool
313 is_division_by (gimple use_stmt, tree def)
314 {
315 return is_gimple_assign (use_stmt)
316 && gimple_assign_rhs_code (use_stmt) == RDIV_EXPR
317 && gimple_assign_rhs2 (use_stmt) == def
318 /* Do not recognize x / x as a valid division, as we would get
319 confused later by replacing all immediate uses of x in such
320 a stmt. */
321 && gimple_assign_rhs1 (use_stmt) != def;
322 }
323
324 /* Walk the subset of the dominator tree rooted at OCC, setting the
325 RECIP_DEF field to a definition of 1.0 / DEF that can be used in
326 the given basic block. The field may be left NULL, of course,
327 if it is not possible or profitable to do the optimization.
328
329 DEF_GSI is an iterator pointing at the statement defining DEF.
330 If RECIP_DEF is set, a dominator already has a computation that can
331 be used. */
332
333 static void
334 insert_reciprocals (gimple_stmt_iterator *def_gsi, struct occurrence *occ,
335 tree def, tree recip_def, int threshold)
336 {
337 tree type;
338 gimple new_stmt;
339 gimple_stmt_iterator gsi;
340 struct occurrence *occ_child;
341
342 if (!recip_def
343 && (occ->bb_has_division || !flag_trapping_math)
344 && occ->num_divisions >= threshold)
345 {
346 /* Make a variable with the replacement and substitute it. */
347 type = TREE_TYPE (def);
348 recip_def = make_rename_temp (type, "reciptmp");
349 new_stmt = gimple_build_assign_with_ops (RDIV_EXPR, recip_def,
350 build_one_cst (type), def);
351
352 if (occ->bb_has_division)
353 {
354 /* Case 1: insert before an existing division. */
355 gsi = gsi_after_labels (occ->bb);
356 while (!gsi_end_p (gsi) && !is_division_by (gsi_stmt (gsi), def))
357 gsi_next (&gsi);
358
359 gsi_insert_before (&gsi, new_stmt, GSI_SAME_STMT);
360 }
361 else if (def_gsi && occ->bb == def_gsi->bb)
362 {
363 /* Case 2: insert right after the definition. Note that this will
364 never happen if the definition statement can throw, because in
365 that case the sole successor of the statement's basic block will
366 dominate all the uses as well. */
367 gsi_insert_after (def_gsi, new_stmt, GSI_NEW_STMT);
368 }
369 else
370 {
371 /* Case 3: insert in a basic block not containing defs/uses. */
372 gsi = gsi_after_labels (occ->bb);
373 gsi_insert_before (&gsi, new_stmt, GSI_SAME_STMT);
374 }
375
376 reciprocal_stats.rdivs_inserted++;
377
378 occ->recip_def_stmt = new_stmt;
379 }
380
381 occ->recip_def = recip_def;
382 for (occ_child = occ->children; occ_child; occ_child = occ_child->next)
383 insert_reciprocals (def_gsi, occ_child, def, recip_def, threshold);
384 }
385
386
387 /* Replace the division at USE_P with a multiplication by the reciprocal, if
388 possible. */
389
390 static inline void
391 replace_reciprocal (use_operand_p use_p)
392 {
393 gimple use_stmt = USE_STMT (use_p);
394 basic_block bb = gimple_bb (use_stmt);
395 struct occurrence *occ = (struct occurrence *) bb->aux;
396
397 if (optimize_bb_for_speed_p (bb)
398 && occ->recip_def && use_stmt != occ->recip_def_stmt)
399 {
400 gimple_stmt_iterator gsi = gsi_for_stmt (use_stmt);
401 gimple_assign_set_rhs_code (use_stmt, MULT_EXPR);
402 SET_USE (use_p, occ->recip_def);
403 fold_stmt_inplace (&gsi);
404 update_stmt (use_stmt);
405 }
406 }
407
408
409 /* Free OCC and return one more "struct occurrence" to be freed. */
410
411 static struct occurrence *
412 free_bb (struct occurrence *occ)
413 {
414 struct occurrence *child, *next;
415
416 /* First get the two pointers hanging off OCC. */
417 next = occ->next;
418 child = occ->children;
419 occ->bb->aux = NULL;
420 pool_free (occ_pool, occ);
421
422 /* Now ensure that we don't recurse unless it is necessary. */
423 if (!child)
424 return next;
425 else
426 {
427 while (next)
428 next = free_bb (next);
429
430 return child;
431 }
432 }
433
434
435 /* Look for floating-point divisions among DEF's uses, and try to
436 replace them by multiplications with the reciprocal. Add
437 as many statements computing the reciprocal as needed.
438
439 DEF must be a GIMPLE register of a floating-point type. */
440
441 static void
442 execute_cse_reciprocals_1 (gimple_stmt_iterator *def_gsi, tree def)
443 {
444 use_operand_p use_p;
445 imm_use_iterator use_iter;
446 struct occurrence *occ;
447 int count = 0, threshold;
448
449 gcc_assert (FLOAT_TYPE_P (TREE_TYPE (def)) && is_gimple_reg (def));
450
451 FOR_EACH_IMM_USE_FAST (use_p, use_iter, def)
452 {
453 gimple use_stmt = USE_STMT (use_p);
454 if (is_division_by (use_stmt, def))
455 {
456 register_division_in (gimple_bb (use_stmt));
457 count++;
458 }
459 }
460
461 /* Do the expensive part only if we can hope to optimize something. */
462 threshold = targetm.min_divisions_for_recip_mul (TYPE_MODE (TREE_TYPE (def)));
463 if (count >= threshold)
464 {
465 gimple use_stmt;
466 for (occ = occ_head; occ; occ = occ->next)
467 {
468 compute_merit (occ);
469 insert_reciprocals (def_gsi, occ, def, NULL, threshold);
470 }
471
472 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, def)
473 {
474 if (is_division_by (use_stmt, def))
475 {
476 FOR_EACH_IMM_USE_ON_STMT (use_p, use_iter)
477 replace_reciprocal (use_p);
478 }
479 }
480 }
481
482 for (occ = occ_head; occ; )
483 occ = free_bb (occ);
484
485 occ_head = NULL;
486 }
487
488 static bool
489 gate_cse_reciprocals (void)
490 {
491 return optimize && flag_reciprocal_math;
492 }
493
494 /* Go through all the floating-point SSA_NAMEs, and call
495 execute_cse_reciprocals_1 on each of them. */
496 static unsigned int
497 execute_cse_reciprocals (void)
498 {
499 basic_block bb;
500 tree arg;
501
502 occ_pool = create_alloc_pool ("dominators for recip",
503 sizeof (struct occurrence),
504 n_basic_blocks / 3 + 1);
505
506 memset (&reciprocal_stats, 0, sizeof (reciprocal_stats));
507 calculate_dominance_info (CDI_DOMINATORS);
508 calculate_dominance_info (CDI_POST_DOMINATORS);
509
510 #ifdef ENABLE_CHECKING
511 FOR_EACH_BB (bb)
512 gcc_assert (!bb->aux);
513 #endif
514
515 for (arg = DECL_ARGUMENTS (cfun->decl); arg; arg = DECL_CHAIN (arg))
516 if (gimple_default_def (cfun, arg)
517 && FLOAT_TYPE_P (TREE_TYPE (arg))
518 && is_gimple_reg (arg))
519 execute_cse_reciprocals_1 (NULL, gimple_default_def (cfun, arg));
520
521 FOR_EACH_BB (bb)
522 {
523 gimple_stmt_iterator gsi;
524 gimple phi;
525 tree def;
526
527 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
528 {
529 phi = gsi_stmt (gsi);
530 def = PHI_RESULT (phi);
531 if (FLOAT_TYPE_P (TREE_TYPE (def))
532 && is_gimple_reg (def))
533 execute_cse_reciprocals_1 (NULL, def);
534 }
535
536 for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi); gsi_next (&gsi))
537 {
538 gimple stmt = gsi_stmt (gsi);
539
540 if (gimple_has_lhs (stmt)
541 && (def = SINGLE_SSA_TREE_OPERAND (stmt, SSA_OP_DEF)) != NULL
542 && FLOAT_TYPE_P (TREE_TYPE (def))
543 && TREE_CODE (def) == SSA_NAME)
544 execute_cse_reciprocals_1 (&gsi, def);
545 }
546
547 if (optimize_bb_for_size_p (bb))
548 continue;
549
550 /* Scan for a/func(b) and convert it to reciprocal a*rfunc(b). */
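/* For example, on a target whose builtin_reciprocal hook returns a
   reciprocal square-root builtin (a sketch; the exact builtin is
   target-specific):

       t = sqrtf (b);
       x = a / t;

   becomes

       t = <reciprocal-sqrt builtin> (b);
       x = a * t;

   provided every use of T is a division by T.  */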
551 for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi); gsi_next (&gsi))
552 {
553 gimple stmt = gsi_stmt (gsi);
554 tree fndecl;
555
556 if (is_gimple_assign (stmt)
557 && gimple_assign_rhs_code (stmt) == RDIV_EXPR)
558 {
559 tree arg1 = gimple_assign_rhs2 (stmt);
560 gimple stmt1;
561
562 if (TREE_CODE (arg1) != SSA_NAME)
563 continue;
564
565 stmt1 = SSA_NAME_DEF_STMT (arg1);
566
567 if (is_gimple_call (stmt1)
568 && gimple_call_lhs (stmt1)
569 && (fndecl = gimple_call_fndecl (stmt1))
570 && (DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_NORMAL
571 || DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD))
572 {
573 enum built_in_function code;
574 bool md_code, fail;
575 imm_use_iterator ui;
576 use_operand_p use_p;
577
578 code = DECL_FUNCTION_CODE (fndecl);
579 md_code = DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD;
580
581 fndecl = targetm.builtin_reciprocal (code, md_code, false);
582 if (!fndecl)
583 continue;
584
585 /* Check that all uses of the SSA name are divisions,
586 otherwise replacing the defining statement will do
587 the wrong thing. */
588 fail = false;
589 FOR_EACH_IMM_USE_FAST (use_p, ui, arg1)
590 {
591 gimple stmt2 = USE_STMT (use_p);
592 if (is_gimple_debug (stmt2))
593 continue;
594 if (!is_gimple_assign (stmt2)
595 || gimple_assign_rhs_code (stmt2) != RDIV_EXPR
596 || gimple_assign_rhs1 (stmt2) == arg1
597 || gimple_assign_rhs2 (stmt2) != arg1)
598 {
599 fail = true;
600 break;
601 }
602 }
603 if (fail)
604 continue;
605
606 gimple_replace_lhs (stmt1, arg1);
607 gimple_call_set_fndecl (stmt1, fndecl);
608 update_stmt (stmt1);
609 reciprocal_stats.rfuncs_inserted++;
610
611 FOR_EACH_IMM_USE_STMT (stmt, ui, arg1)
612 {
613 gimple_stmt_iterator gsi = gsi_for_stmt (stmt);
614 gimple_assign_set_rhs_code (stmt, MULT_EXPR);
615 fold_stmt_inplace (&gsi);
616 update_stmt (stmt);
617 }
618 }
619 }
620 }
621 }
622
623 statistics_counter_event (cfun, "reciprocal divs inserted",
624 reciprocal_stats.rdivs_inserted);
625 statistics_counter_event (cfun, "reciprocal functions inserted",
626 reciprocal_stats.rfuncs_inserted);
627
628 free_dominance_info (CDI_DOMINATORS);
629 free_dominance_info (CDI_POST_DOMINATORS);
630 free_alloc_pool (occ_pool);
631 return 0;
632 }
633
634 struct gimple_opt_pass pass_cse_reciprocals =
635 {
636 {
637 GIMPLE_PASS,
638 "recip", /* name */
639 gate_cse_reciprocals, /* gate */
640 execute_cse_reciprocals, /* execute */
641 NULL, /* sub */
642 NULL, /* next */
643 0, /* static_pass_number */
644 TV_NONE, /* tv_id */
645 PROP_ssa, /* properties_required */
646 0, /* properties_provided */
647 0, /* properties_destroyed */
648 0, /* todo_flags_start */
649 TODO_update_ssa | TODO_verify_ssa
650 | TODO_verify_stmts /* todo_flags_finish */
651 }
652 };
653
654 /* Records an occurrence at statement USE_STMT in the vector of
655 statements STMTS if it is dominated by *TOP_BB, dominates it, or
656 *TOP_BB is not yet initialized. Returns true if the occurrence was
657 pushed on the vector. Adjusts *TOP_BB to be the basic block
658 dominating all statements in the vector. */
659
660 static bool
661 maybe_record_sincos (VEC(gimple, heap) **stmts,
662 basic_block *top_bb, gimple use_stmt)
663 {
664 basic_block use_bb = gimple_bb (use_stmt);
665 if (*top_bb
666 && (*top_bb == use_bb
667 || dominated_by_p (CDI_DOMINATORS, use_bb, *top_bb)))
668 VEC_safe_push (gimple, heap, *stmts, use_stmt);
669 else if (!*top_bb
670 || dominated_by_p (CDI_DOMINATORS, *top_bb, use_bb))
671 {
672 VEC_safe_push (gimple, heap, *stmts, use_stmt);
673 *top_bb = use_bb;
674 }
675 else
676 return false;
677
678 return true;
679 }
680
681 /* Look for sin, cos and cexpi calls with the same argument NAME and
682 create a single call to cexpi CSEing the result in this case.
683 We first walk over all immediate uses of the argument collecting
684 statements that we can CSE in a vector and in a second pass replace
685 the statement rhs with a REALPART or IMAGPART expression on the
686 result of the cexpi call we insert before the use statement that
687 dominates all other candidates. */
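/* For example (a sketch of the intended transformation):

       s = sin (a);
       c = cos (a);

   becomes

       t = cexpi (a);
       s = IMAGPART_EXPR <t>;
       c = REALPART_EXPR <t>;

   with the cexpi call inserted at the dominating use, or right after
   the definition of A when that definition is in the same block.  */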
688
689 static bool
690 execute_cse_sincos_1 (tree name)
691 {
692 gimple_stmt_iterator gsi;
693 imm_use_iterator use_iter;
694 tree fndecl, res, type;
695 gimple def_stmt, use_stmt, stmt;
696 int seen_cos = 0, seen_sin = 0, seen_cexpi = 0;
697 VEC(gimple, heap) *stmts = NULL;
698 basic_block top_bb = NULL;
699 int i;
700 bool cfg_changed = false;
701
702 type = TREE_TYPE (name);
703 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, name)
704 {
705 if (gimple_code (use_stmt) != GIMPLE_CALL
706 || !gimple_call_lhs (use_stmt)
707 || !(fndecl = gimple_call_fndecl (use_stmt))
708 || DECL_BUILT_IN_CLASS (fndecl) != BUILT_IN_NORMAL)
709 continue;
710
711 switch (DECL_FUNCTION_CODE (fndecl))
712 {
713 CASE_FLT_FN (BUILT_IN_COS):
714 seen_cos |= maybe_record_sincos (&stmts, &top_bb, use_stmt) ? 1 : 0;
715 break;
716
717 CASE_FLT_FN (BUILT_IN_SIN):
718 seen_sin |= maybe_record_sincos (&stmts, &top_bb, use_stmt) ? 1 : 0;
719 break;
720
721 CASE_FLT_FN (BUILT_IN_CEXPI):
722 seen_cexpi |= maybe_record_sincos (&stmts, &top_bb, use_stmt) ? 1 : 0;
723 break;
724
725 default:;
726 }
727 }
728
729 if (seen_cos + seen_sin + seen_cexpi <= 1)
730 {
731 VEC_free(gimple, heap, stmts);
732 return false;
733 }
734
735 /* Simply insert cexpi at the beginning of top_bb but not earlier than
736 the name def statement. */
737 fndecl = mathfn_built_in (type, BUILT_IN_CEXPI);
738 if (!fndecl)
739 return false;
740 res = create_tmp_reg (TREE_TYPE (TREE_TYPE (fndecl)), "sincostmp");
741 stmt = gimple_build_call (fndecl, 1, name);
742 res = make_ssa_name (res, stmt);
743 gimple_call_set_lhs (stmt, res);
744
745 def_stmt = SSA_NAME_DEF_STMT (name);
746 if (!SSA_NAME_IS_DEFAULT_DEF (name)
747 && gimple_code (def_stmt) != GIMPLE_PHI
748 && gimple_bb (def_stmt) == top_bb)
749 {
750 gsi = gsi_for_stmt (def_stmt);
751 gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
752 }
753 else
754 {
755 gsi = gsi_after_labels (top_bb);
756 gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
757 }
758 update_stmt (stmt);
759 sincos_stats.inserted++;
760
761 /* And adjust the recorded old call sites. */
762 for (i = 0; VEC_iterate(gimple, stmts, i, use_stmt); ++i)
763 {
764 tree rhs = NULL;
765 fndecl = gimple_call_fndecl (use_stmt);
766
767 switch (DECL_FUNCTION_CODE (fndecl))
768 {
769 CASE_FLT_FN (BUILT_IN_COS):
770 rhs = fold_build1 (REALPART_EXPR, type, res);
771 break;
772
773 CASE_FLT_FN (BUILT_IN_SIN):
774 rhs = fold_build1 (IMAGPART_EXPR, type, res);
775 break;
776
777 CASE_FLT_FN (BUILT_IN_CEXPI):
778 rhs = res;
779 break;
780
781 default:;
782 gcc_unreachable ();
783 }
784
785 /* Replace call with a copy. */
786 stmt = gimple_build_assign (gimple_call_lhs (use_stmt), rhs);
787
788 gsi = gsi_for_stmt (use_stmt);
789 gsi_replace (&gsi, stmt, true);
790 if (gimple_purge_dead_eh_edges (gimple_bb (stmt)))
791 cfg_changed = true;
792 }
793
794 VEC_free(gimple, heap, stmts);
795
796 return cfg_changed;
797 }
798
799 /* To evaluate powi(x,n), the floating point value x raised to the
800 constant integer exponent n, we use a hybrid algorithm that
801 combines the "window method" with look-up tables. For an
802 introduction to exponentiation algorithms and "addition chains",
803 see section 4.6.3, "Evaluation of Powers" of Donald E. Knuth,
804 "Seminumerical Algorithms", Vol. 2, "The Art of Computer Programming",
805 3rd Edition, 1998, and Daniel M. Gordon, "A Survey of Fast Exponentiation
806 Methods", Journal of Algorithms, Vol. 27, pp. 129-146, 1998. */
807
808 /* Provide a default value for POWI_MAX_MULTS, the maximum number of
809 multiplications to inline before calling the system library's pow
810 function. powi(x,n) requires at worst 2*bits(n)-2 multiplications,
811 so this default never requires calling pow, powf or powl. */
812
813 #ifndef POWI_MAX_MULTS
814 #define POWI_MAX_MULTS (2*HOST_BITS_PER_WIDE_INT-2)
815 #endif
816
817 /* The size of the "optimal power tree" lookup table. All
818 exponents less than this value are simply looked up in the
819 powi_table below. This threshold is also used to size the
820 cache of pseudo registers that hold intermediate results. */
821 #define POWI_TABLE_SIZE 256
822
823 /* The size, in bits, of the window used in the "window method"
824 exponentiation algorithm. This is equivalent to a radix of
825 (1<<POWI_WINDOW_SIZE) in the corresponding "m-ary method". */
826 #define POWI_WINDOW_SIZE 3
827
828 /* The following table is an efficient representation of an
829 "optimal power tree". For each value, i, the corresponding
830 value, j, in the table states that an optimal evaluation
831 sequence for calculating pow(x,i) can be found by evaluating
832 pow(x,j)*pow(x,i-j). An optimal power tree for the first
833 100 integers is given in Knuth's "Seminumerical algorithms". */
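/* For example, powi_table[11] is 6, so x**11 is evaluated as
   x**6 * x**5; expanding recursively and reusing cached powers gives
   x**2 = x*x, x**3 = x**2*x, x**5 = x**3*x**2, x**6 = x**3*x**3 and
   finally x**11 = x**6*x**5, five multiplications in total.  */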
834
835 static const unsigned char powi_table[POWI_TABLE_SIZE] =
836 {
837 0, 1, 1, 2, 2, 3, 3, 4, /* 0 - 7 */
838 4, 6, 5, 6, 6, 10, 7, 9, /* 8 - 15 */
839 8, 16, 9, 16, 10, 12, 11, 13, /* 16 - 23 */
840 12, 17, 13, 18, 14, 24, 15, 26, /* 24 - 31 */
841 16, 17, 17, 19, 18, 33, 19, 26, /* 32 - 39 */
842 20, 25, 21, 40, 22, 27, 23, 44, /* 40 - 47 */
843 24, 32, 25, 34, 26, 29, 27, 44, /* 48 - 55 */
844 28, 31, 29, 34, 30, 60, 31, 36, /* 56 - 63 */
845 32, 64, 33, 34, 34, 46, 35, 37, /* 64 - 71 */
846 36, 65, 37, 50, 38, 48, 39, 69, /* 72 - 79 */
847 40, 49, 41, 43, 42, 51, 43, 58, /* 80 - 87 */
848 44, 64, 45, 47, 46, 59, 47, 76, /* 88 - 95 */
849 48, 65, 49, 66, 50, 67, 51, 66, /* 96 - 103 */
850 52, 70, 53, 74, 54, 104, 55, 74, /* 104 - 111 */
851 56, 64, 57, 69, 58, 78, 59, 68, /* 112 - 119 */
852 60, 61, 61, 80, 62, 75, 63, 68, /* 120 - 127 */
853 64, 65, 65, 128, 66, 129, 67, 90, /* 128 - 135 */
854 68, 73, 69, 131, 70, 94, 71, 88, /* 136 - 143 */
855 72, 128, 73, 98, 74, 132, 75, 121, /* 144 - 151 */
856 76, 102, 77, 124, 78, 132, 79, 106, /* 152 - 159 */
857 80, 97, 81, 160, 82, 99, 83, 134, /* 160 - 167 */
858 84, 86, 85, 95, 86, 160, 87, 100, /* 168 - 175 */
859 88, 113, 89, 98, 90, 107, 91, 122, /* 176 - 183 */
860 92, 111, 93, 102, 94, 126, 95, 150, /* 184 - 191 */
861 96, 128, 97, 130, 98, 133, 99, 195, /* 192 - 199 */
862 100, 128, 101, 123, 102, 164, 103, 138, /* 200 - 207 */
863 104, 145, 105, 146, 106, 109, 107, 149, /* 208 - 215 */
864 108, 200, 109, 146, 110, 170, 111, 157, /* 216 - 223 */
865 112, 128, 113, 130, 114, 182, 115, 132, /* 224 - 231 */
866 116, 200, 117, 132, 118, 158, 119, 206, /* 232 - 239 */
867 120, 240, 121, 162, 122, 147, 123, 152, /* 240 - 247 */
868 124, 166, 125, 214, 126, 138, 127, 153, /* 248 - 255 */
869 };
870
871
872 /* Return the number of multiplications required to calculate
873 powi(x,n) where n is less than POWI_TABLE_SIZE. This is a
874 subroutine of powi_cost. CACHE is an array indicating
875 which exponents have already been calculated. */
876
877 static int
878 powi_lookup_cost (unsigned HOST_WIDE_INT n, bool *cache)
879 {
880 /* If we've already calculated this exponent, then this evaluation
881 doesn't require any additional multiplications. */
882 if (cache[n])
883 return 0;
884
885 cache[n] = true;
886 return powi_lookup_cost (n - powi_table[n], cache)
887 + powi_lookup_cost (powi_table[n], cache) + 1;
888 }
889
890 /* Return the number of multiplications required to calculate
891 powi(x,n) for an arbitrary x, given the exponent N. This
892 function needs to be kept in sync with powi_as_mults below. */
893
894 static int
895 powi_cost (HOST_WIDE_INT n)
896 {
897 bool cache[POWI_TABLE_SIZE];
898 unsigned HOST_WIDE_INT digit;
899 unsigned HOST_WIDE_INT val;
900 int result;
901
902 if (n == 0)
903 return 0;
904
905 /* Ignore the reciprocal when calculating the cost. */
906 val = (n < 0) ? -n : n;
907
908 /* Initialize the exponent cache. */
909 memset (cache, 0, POWI_TABLE_SIZE * sizeof (bool));
910 cache[1] = true;
911
912 result = 0;
913
914 while (val >= POWI_TABLE_SIZE)
915 {
916 if (val & 1)
917 {
918 digit = val & ((1 << POWI_WINDOW_SIZE) - 1);
919 result += powi_lookup_cost (digit, cache)
920 + POWI_WINDOW_SIZE + 1;
921 val >>= POWI_WINDOW_SIZE;
922 }
923 else
924 {
925 val >>= 1;
926 result++;
927 }
928 }
929
930 return result + powi_lookup_cost (val, cache);
931 }
932
933 /* Recursive subroutine of powi_as_mults. This function takes the
934 array, CACHE, of already calculated exponents and an exponent N and
935 returns a tree that corresponds to CACHE[1]**N, with type TYPE. */
936
937 static tree
938 powi_as_mults_1 (gimple_stmt_iterator *gsi, location_t loc, tree type,
939 HOST_WIDE_INT n, tree *cache, tree target)
940 {
941 tree op0, op1, ssa_target;
942 unsigned HOST_WIDE_INT digit;
943 gimple mult_stmt;
944
945 if (n < POWI_TABLE_SIZE && cache[n])
946 return cache[n];
947
948 ssa_target = make_ssa_name (target, NULL);
949
950 if (n < POWI_TABLE_SIZE)
951 {
952 cache[n] = ssa_target;
953 op0 = powi_as_mults_1 (gsi, loc, type, n - powi_table[n], cache, target);
954 op1 = powi_as_mults_1 (gsi, loc, type, powi_table[n], cache, target);
955 }
956 else if (n & 1)
957 {
958 digit = n & ((1 << POWI_WINDOW_SIZE) - 1);
959 op0 = powi_as_mults_1 (gsi, loc, type, n - digit, cache, target);
960 op1 = powi_as_mults_1 (gsi, loc, type, digit, cache, target);
961 }
962 else
963 {
964 op0 = powi_as_mults_1 (gsi, loc, type, n >> 1, cache, target);
965 op1 = op0;
966 }
967
968 mult_stmt = gimple_build_assign_with_ops (MULT_EXPR, ssa_target, op0, op1);
969 gimple_set_location (mult_stmt, loc);
970 gsi_insert_before (gsi, mult_stmt, GSI_SAME_STMT);
971
972 return ssa_target;
973 }
974
975 /* Convert ARG0**N to a tree of multiplications of ARG0 with itself.
976 This function needs to be kept in sync with powi_cost above. */
977
978 static tree
979 powi_as_mults (gimple_stmt_iterator *gsi, location_t loc,
980 tree arg0, HOST_WIDE_INT n)
981 {
982 tree cache[POWI_TABLE_SIZE], result, type = TREE_TYPE (arg0), target;
983 gimple div_stmt;
984
985 if (n == 0)
986 return build_real (type, dconst1);
987
988 memset (cache, 0, sizeof (cache));
989 cache[1] = arg0;
990
991 target = create_tmp_reg (type, "powmult");
992 result = powi_as_mults_1 (gsi, loc, type, (n < 0) ? -n : n, cache, target);
993
994 if (n >= 0)
995 return result;
996
997 /* If the original exponent was negative, reciprocate the result. */
998 target = make_ssa_name (target, NULL);
999 div_stmt = gimple_build_assign_with_ops (RDIV_EXPR, target,
1000 build_real (type, dconst1),
1001 result);
1002 gimple_set_location (div_stmt, loc);
1003 gsi_insert_before (gsi, div_stmt, GSI_SAME_STMT);
1004
1005 return target;
1006 }
1007
1008 /* ARG0 and N are the two arguments to a powi builtin in GSI with
1009 location info LOC. If the arguments are appropriate, create an
1010 equivalent sequence of statements prior to GSI using an optimal
1011 number of multiplications, and return an expression holding the
1012 result. */
1013
1014 static tree
1015 gimple_expand_builtin_powi (gimple_stmt_iterator *gsi, location_t loc,
1016 tree arg0, HOST_WIDE_INT n)
1017 {
1018 /* Avoid largest negative number. */
1019 if (n != -n
1020 && ((n >= -1 && n <= 2)
1021 || (optimize_function_for_speed_p (cfun)
1022 && powi_cost (n) <= POWI_MAX_MULTS)))
1023 return powi_as_mults (gsi, loc, arg0, n);
1024
1025 return NULL_TREE;
1026 }
1027
1028 /* Build a gimple call statement that calls FN with argument ARG.
1029 Set the lhs of the call statement to a fresh SSA name for
1030 variable VAR. If VAR is NULL, first allocate it. Insert the
1031 statement prior to GSI's current position, and return the fresh
1032 SSA name. */
1033
1034 static tree
1035 build_and_insert_call (gimple_stmt_iterator *gsi, location_t loc,
1036 tree *var, tree fn, tree arg)
1037 {
1038 gimple call_stmt;
1039 tree ssa_target;
1040
1041 if (!*var)
1042 *var = create_tmp_reg (TREE_TYPE (arg), "powroot");
1043
1044 call_stmt = gimple_build_call (fn, 1, arg);
1045 ssa_target = make_ssa_name (*var, NULL);
1046 gimple_set_lhs (call_stmt, ssa_target);
1047 gimple_set_location (call_stmt, loc);
1048 gsi_insert_before (gsi, call_stmt, GSI_SAME_STMT);
1049
1050 return ssa_target;
1051 }
1052
1053 /* Build a gimple binary operation with the given CODE and arguments
1054 ARG0, ARG1, assigning the result to a new SSA name for variable
1055 TARGET. Insert the statement prior to GSI's current position, and
1056 return the fresh SSA name. */
1057
1058 static tree
1059 build_and_insert_binop (gimple_stmt_iterator *gsi, location_t loc,
1060 tree target, enum tree_code code, tree arg0, tree arg1)
1061 {
1062 tree result = make_ssa_name (target, NULL);
1063 gimple stmt = gimple_build_assign_with_ops (code, result, arg0, arg1);
1064 gimple_set_location (stmt, loc);
1065 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
1066 return result;
1067 }
1068
1069 /* Build a gimple reference operation with the given CODE and argument
1070 ARG, assigning the result to a new SSA name for variable TARGET.
1071 Insert the statement prior to GSI's current position, and return
1072 the fresh SSA name. */
1073
1074 static inline tree
1075 build_and_insert_ref (gimple_stmt_iterator *gsi, location_t loc, tree type,
1076 tree target, enum tree_code code, tree arg0)
1077 {
1078 tree result = make_ssa_name (target, NULL);
1079 gimple stmt = gimple_build_assign (result, build1 (code, type, arg0));
1080 gimple_set_location (stmt, loc);
1081 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
1082 return result;
1083 }
1084
1085 /* Build a gimple assignment to cast VAL to TARGET. Insert the statement
1086 prior to GSI's current position, and return the fresh SSA name. */
1087
1088 static tree
1089 build_and_insert_cast (gimple_stmt_iterator *gsi, location_t loc,
1090 tree target, tree val)
1091 {
1092 return build_and_insert_binop (gsi, loc, target, CONVERT_EXPR, val, NULL);
1093 }
1094
1095 /* ARG0 and ARG1 are the two arguments to a pow builtin call in GSI
1096 with location info LOC. If possible, create an equivalent and
1097 less expensive sequence of statements prior to GSI, and return an
1098 expression holding the result. */
1099
1100 static tree
1101 gimple_expand_builtin_pow (gimple_stmt_iterator *gsi, location_t loc,
1102 tree arg0, tree arg1)
1103 {
1104 REAL_VALUE_TYPE c, cint, dconst1_4, dconst3_4, dconst1_3, dconst1_6;
1105 REAL_VALUE_TYPE c2, dconst3;
1106 HOST_WIDE_INT n;
1107 tree type, sqrtfn, cbrtfn, sqrt_arg0, sqrt_sqrt, result, cbrt_x, powi_cbrt_x;
1108 tree target = NULL_TREE;
1109 enum machine_mode mode;
1110 bool hw_sqrt_exists;
1111
1112 /* If the exponent isn't a constant, there's nothing of interest
1113 to be done. */
1114 if (TREE_CODE (arg1) != REAL_CST)
1115 return NULL_TREE;
1116
1117 /* If the exponent is equivalent to an integer, expand to an optimal
1118 multiplication sequence when profitable. */
1119 c = TREE_REAL_CST (arg1);
1120 n = real_to_integer (&c);
1121 real_from_integer (&cint, VOIDmode, n, n < 0 ? -1 : 0, 0);
1122
1123 if (real_identical (&c, &cint)
1124 && ((n >= -1 && n <= 2)
1125 || (flag_unsafe_math_optimizations
1126 && optimize_insn_for_speed_p ()
1127 && powi_cost (n) <= POWI_MAX_MULTS)))
1128 return gimple_expand_builtin_powi (gsi, loc, arg0, n);
1129
1130 /* Attempt various optimizations using sqrt and cbrt. */
1131 type = TREE_TYPE (arg0);
1132 mode = TYPE_MODE (type);
1133 sqrtfn = mathfn_built_in (type, BUILT_IN_SQRT);
1134
1135 /* Optimize pow(x,0.5) = sqrt(x). This replacement is always safe
1136 unless signed zeros must be maintained. pow(-0,0.5) = +0, while
1137 sqrt(-0) = -0. */
1138 if (sqrtfn
1139 && REAL_VALUES_EQUAL (c, dconsthalf)
1140 && !HONOR_SIGNED_ZEROS (mode))
1141 return build_and_insert_call (gsi, loc, &target, sqrtfn, arg0);
1142
1143 /* Optimize pow(x,0.25) = sqrt(sqrt(x)). Assume on most machines that
1144 a builtin sqrt instruction is smaller than a call to pow with 0.25,
1145 so do this optimization even if -Os. Don't do this optimization
1146 if we don't have a hardware sqrt insn. */
1147 dconst1_4 = dconst1;
1148 SET_REAL_EXP (&dconst1_4, REAL_EXP (&dconst1_4) - 2);
1149 hw_sqrt_exists = optab_handler (sqrt_optab, mode) != CODE_FOR_nothing;
1150
1151 if (flag_unsafe_math_optimizations
1152 && sqrtfn
1153 && REAL_VALUES_EQUAL (c, dconst1_4)
1154 && hw_sqrt_exists)
1155 {
1156 /* sqrt(x) */
1157 sqrt_arg0 = build_and_insert_call (gsi, loc, &target, sqrtfn, arg0);
1158
1159 /* sqrt(sqrt(x)) */
1160 return build_and_insert_call (gsi, loc, &target, sqrtfn, sqrt_arg0);
1161 }
1162
1163 /* Optimize pow(x,0.75) = sqrt(x) * sqrt(sqrt(x)) unless we are
1164 optimizing for space. Don't do this optimization if we don't have
1165 a hardware sqrt insn. */
1166 real_from_integer (&dconst3_4, VOIDmode, 3, 0, 0);
1167 SET_REAL_EXP (&dconst3_4, REAL_EXP (&dconst3_4) - 2);
1168
1169 if (flag_unsafe_math_optimizations
1170 && sqrtfn
1171 && optimize_function_for_speed_p (cfun)
1172 && REAL_VALUES_EQUAL (c, dconst3_4)
1173 && hw_sqrt_exists)
1174 {
1175 /* sqrt(x) */
1176 sqrt_arg0 = build_and_insert_call (gsi, loc, &target, sqrtfn, arg0);
1177
1178 /* sqrt(sqrt(x)) */
1179 sqrt_sqrt = build_and_insert_call (gsi, loc, &target, sqrtfn, sqrt_arg0);
1180
1181 /* sqrt(x) * sqrt(sqrt(x)) */
1182 return build_and_insert_binop (gsi, loc, target, MULT_EXPR,
1183 sqrt_arg0, sqrt_sqrt);
1184 }
1185
1186 /* Optimize pow(x,1./3.) = cbrt(x). This requires unsafe math
1187 optimizations since 1./3. is not exactly representable. If x
1188 is negative and finite, the correct value of pow(x,1./3.) is
1189 a NaN with the "invalid" exception raised, because the value
1190 of 1./3. actually has an even denominator. The correct value
1191 of cbrt(x) is a negative real value. */
1192 cbrtfn = mathfn_built_in (type, BUILT_IN_CBRT);
1193 dconst1_3 = real_value_truncate (mode, dconst_third ());
1194
1195 if (flag_unsafe_math_optimizations
1196 && cbrtfn
1197 && (gimple_val_nonnegative_real_p (arg0) || !HONOR_NANS (mode))
1198 && REAL_VALUES_EQUAL (c, dconst1_3))
1199 return build_and_insert_call (gsi, loc, &target, cbrtfn, arg0);
1200
1201 /* Optimize pow(x,1./6.) = cbrt(sqrt(x)). Don't do this optimization
1202 if we don't have a hardware sqrt insn. */
1203 dconst1_6 = dconst1_3;
1204 SET_REAL_EXP (&dconst1_6, REAL_EXP (&dconst1_6) - 1);
1205
1206 if (flag_unsafe_math_optimizations
1207 && sqrtfn
1208 && cbrtfn
1209 && (gimple_val_nonnegative_real_p (arg0) || !HONOR_NANS (mode))
1210 && optimize_function_for_speed_p (cfun)
1211 && hw_sqrt_exists
1212 && REAL_VALUES_EQUAL (c, dconst1_6))
1213 {
1214 /* sqrt(x) */
1215 sqrt_arg0 = build_and_insert_call (gsi, loc, &target, sqrtfn, arg0);
1216
1217 /* cbrt(sqrt(x)) */
1218 return build_and_insert_call (gsi, loc, &target, cbrtfn, sqrt_arg0);
1219 }
1220
1221 /* Optimize pow(x,c), where n = 2c for some nonzero integer n, into
1222
1223 sqrt(x) * powi(x, n/2), n > 0;
1224 1.0 / (sqrt(x) * powi(x, abs(n/2))), n < 0.
1225
1226 Do not calculate the powi factor when n/2 = 0. */
1227 real_arithmetic (&c2, MULT_EXPR, &c, &dconst2);
1228 n = real_to_integer (&c2);
1229 real_from_integer (&cint, VOIDmode, n, n < 0 ? -1 : 0, 0);
1230
1231 if (flag_unsafe_math_optimizations
1232 && sqrtfn
1233 && real_identical (&c2, &cint))
1234 {
1235 tree powi_x_ndiv2 = NULL_TREE;
1236
1237 /* Attempt to fold powi(arg0, abs(n/2)) into multiplies. If not
1238 possible or profitable, give up. Skip the degenerate case when
1239 n is 1 or -1, where the result is always 1. */
1240 if (absu_hwi (n) != 1)
1241 {
1242 powi_x_ndiv2 = gimple_expand_builtin_powi (gsi, loc, arg0,
1243 abs_hwi (n / 2));
1244 if (!powi_x_ndiv2)
1245 return NULL_TREE;
1246 }
1247
1248 /* Calculate sqrt(x). When n is not 1 or -1, multiply it by the
1249 result of the optimal multiply sequence just calculated. */
1250 sqrt_arg0 = build_and_insert_call (gsi, loc, &target, sqrtfn, arg0);
1251
1252 if (absu_hwi (n) == 1)
1253 result = sqrt_arg0;
1254 else
1255 result = build_and_insert_binop (gsi, loc, target, MULT_EXPR,
1256 sqrt_arg0, powi_x_ndiv2);
1257
1258 /* If n is negative, reciprocate the result. */
1259 if (n < 0)
1260 result = build_and_insert_binop (gsi, loc, target, RDIV_EXPR,
1261 build_real (type, dconst1), result);
1262 return result;
1263 }
1264
1265 /* Optimize pow(x,c), where 3c = n for some nonzero integer n, into
1266
1267 powi(x, n/3) * powi(cbrt(x), n%3), n > 0;
1268 1.0 / (powi(x, abs(n)/3) * powi(cbrt(x), abs(n)%3)), n < 0.
1269
1270 Do not calculate the first factor when n/3 = 0. As cbrt(x) is
1271 different from pow(x, 1./3.) due to rounding and behavior with
1272 negative x, we need to constrain this transformation to unsafe
1273 math and positive x or finite math. */
1274 real_from_integer (&dconst3, VOIDmode, 3, 0, 0);
1275 real_arithmetic (&c2, MULT_EXPR, &c, &dconst3);
1276 real_round (&c2, mode, &c2);
1277 n = real_to_integer (&c2);
1278 real_from_integer (&cint, VOIDmode, n, n < 0 ? -1 : 0, 0);
1279 real_arithmetic (&c2, RDIV_EXPR, &cint, &dconst3);
1280 real_convert (&c2, mode, &c2);
1281
1282 if (flag_unsafe_math_optimizations
1283 && cbrtfn
1284 && (gimple_val_nonnegative_real_p (arg0) || !HONOR_NANS (mode))
1285 && real_identical (&c2, &c)
1286 && optimize_function_for_speed_p (cfun)
1287 && powi_cost (n / 3) <= POWI_MAX_MULTS)
1288 {
1289 tree powi_x_ndiv3 = NULL_TREE;
1290
1291 /* Attempt to fold powi(arg0, abs(n/3)) into multiplies. If not
1292 possible or profitable, give up. Skip the degenerate case when
1293 abs(n) < 3, where the result is always 1. */
1294 if (absu_hwi (n) >= 3)
1295 {
1296 powi_x_ndiv3 = gimple_expand_builtin_powi (gsi, loc, arg0,
1297 abs_hwi (n / 3));
1298 if (!powi_x_ndiv3)
1299 return NULL_TREE;
1300 }
1301
1302 /* Calculate powi(cbrt(x), n%3). Don't use gimple_expand_builtin_powi
1303 as that creates an unnecessary variable. Instead, just produce
1304 either cbrt(x) or cbrt(x) * cbrt(x). */
1305 cbrt_x = build_and_insert_call (gsi, loc, &target, cbrtfn, arg0);
1306
1307 if (absu_hwi (n) % 3 == 1)
1308 powi_cbrt_x = cbrt_x;
1309 else
1310 powi_cbrt_x = build_and_insert_binop (gsi, loc, target, MULT_EXPR,
1311 cbrt_x, cbrt_x);
1312
1313 /* Multiply the two subexpressions, unless powi(x,abs(n)/3) = 1. */
1314 if (absu_hwi (n) < 3)
1315 result = powi_cbrt_x;
1316 else
1317 result = build_and_insert_binop (gsi, loc, target, MULT_EXPR,
1318 powi_x_ndiv3, powi_cbrt_x);
1319
1320 /* If n is negative, reciprocate the result. */
1321 if (n < 0)
1322 result = build_and_insert_binop (gsi, loc, target, RDIV_EXPR,
1323 build_real (type, dconst1), result);
1324
1325 return result;
1326 }
1327
1328 /* No optimizations succeeded. */
1329 return NULL_TREE;
1330 }
1331
1332 /* ARG is the argument to a cabs builtin call in GSI with location info
1333 LOC. Create a sequence of statements prior to GSI that calculates
1334 sqrt(R*R + I*I), where R and I are the real and imaginary components
1335 of ARG, respectively. Return an expression holding the result. */
1336
1337 static tree
1338 gimple_expand_builtin_cabs (gimple_stmt_iterator *gsi, location_t loc, tree arg)
1339 {
1340 tree target, real_part, imag_part, addend1, addend2, sum, result;
1341 tree type = TREE_TYPE (TREE_TYPE (arg));
1342 tree sqrtfn = mathfn_built_in (type, BUILT_IN_SQRT);
1343 enum machine_mode mode = TYPE_MODE (type);
1344
1345 if (!flag_unsafe_math_optimizations
1346 || !optimize_bb_for_speed_p (gimple_bb (gsi_stmt (*gsi)))
1347 || !sqrtfn
1348 || optab_handler (sqrt_optab, mode) == CODE_FOR_nothing)
1349 return NULL_TREE;
1350
1351 target = create_tmp_reg (type, "cabs");
1352 real_part = build_and_insert_ref (gsi, loc, type, target,
1353 REALPART_EXPR, arg);
1354 addend1 = build_and_insert_binop (gsi, loc, target, MULT_EXPR,
1355 real_part, real_part);
1356 imag_part = build_and_insert_ref (gsi, loc, type, target,
1357 IMAGPART_EXPR, arg);
1358 addend2 = build_and_insert_binop (gsi, loc, target, MULT_EXPR,
1359 imag_part, imag_part);
1360 sum = build_and_insert_binop (gsi, loc, target, PLUS_EXPR, addend1, addend2);
1361 result = build_and_insert_call (gsi, loc, &target, sqrtfn, sum);
1362
1363 return result;
1364 }
1365
1366 /* Go through all calls to sin, cos and cexpi and call execute_cse_sincos_1
1367 on the SSA_NAME argument of each of them. Also expand powi(x,n) into
1368 an optimal number of multiplies, when n is a constant. */
1369
1370 static unsigned int
1371 execute_cse_sincos (void)
1372 {
1373 basic_block bb;
1374 bool cfg_changed = false;
1375
1376 calculate_dominance_info (CDI_DOMINATORS);
1377 memset (&sincos_stats, 0, sizeof (sincos_stats));
1378
1379 FOR_EACH_BB (bb)
1380 {
1381 gimple_stmt_iterator gsi;
1382
1383 for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi); gsi_next (&gsi))
1384 {
1385 gimple stmt = gsi_stmt (gsi);
1386 tree fndecl;
1387
1388 if (is_gimple_call (stmt)
1389 && gimple_call_lhs (stmt)
1390 && (fndecl = gimple_call_fndecl (stmt))
1391 && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_NORMAL)
1392 {
1393 tree arg, arg0, arg1, result;
1394 HOST_WIDE_INT n;
1395 location_t loc;
1396
1397 switch (DECL_FUNCTION_CODE (fndecl))
1398 {
1399 CASE_FLT_FN (BUILT_IN_COS):
1400 CASE_FLT_FN (BUILT_IN_SIN):
1401 CASE_FLT_FN (BUILT_IN_CEXPI):
1402 /* Make sure we have either sincos or cexp. */
1403 if (!TARGET_HAS_SINCOS && !TARGET_C99_FUNCTIONS)
1404 break;
1405
1406 arg = gimple_call_arg (stmt, 0);
1407 if (TREE_CODE (arg) == SSA_NAME)
1408 cfg_changed |= execute_cse_sincos_1 (arg);
1409 break;
1410
1411 CASE_FLT_FN (BUILT_IN_POW):
1412 arg0 = gimple_call_arg (stmt, 0);
1413 arg1 = gimple_call_arg (stmt, 1);
1414
1415 loc = gimple_location (stmt);
1416 result = gimple_expand_builtin_pow (&gsi, loc, arg0, arg1);
1417
1418 if (result)
1419 {
1420 tree lhs = gimple_get_lhs (stmt);
1421 gimple new_stmt = gimple_build_assign (lhs, result);
1422 gimple_set_location (new_stmt, loc);
1423 unlink_stmt_vdef (stmt);
1424 gsi_replace (&gsi, new_stmt, true);
1425 if (gimple_vdef (stmt))
1426 release_ssa_name (gimple_vdef (stmt));
1427 }
1428 break;
1429
1430 CASE_FLT_FN (BUILT_IN_POWI):
1431 arg0 = gimple_call_arg (stmt, 0);
1432 arg1 = gimple_call_arg (stmt, 1);
1433 if (!host_integerp (arg1, 0))
1434 break;
1435
1436 n = TREE_INT_CST_LOW (arg1);
1437 loc = gimple_location (stmt);
1438 result = gimple_expand_builtin_powi (&gsi, loc, arg0, n);
1439
1440 if (result)
1441 {
1442 tree lhs = gimple_get_lhs (stmt);
1443 gimple new_stmt = gimple_build_assign (lhs, result);
1444 gimple_set_location (new_stmt, loc);
1445 unlink_stmt_vdef (stmt);
1446 gsi_replace (&gsi, new_stmt, true);
1447 if (gimple_vdef (stmt))
1448 release_ssa_name (gimple_vdef (stmt));
1449 }
1450 break;
1451
1452 CASE_FLT_FN (BUILT_IN_CABS):
1453 arg0 = gimple_call_arg (stmt, 0);
1454 loc = gimple_location (stmt);
1455 result = gimple_expand_builtin_cabs (&gsi, loc, arg0);
1456
1457 if (result)
1458 {
1459 tree lhs = gimple_get_lhs (stmt);
1460 gimple new_stmt = gimple_build_assign (lhs, result);
1461 gimple_set_location (new_stmt, loc);
1462 unlink_stmt_vdef (stmt);
1463 gsi_replace (&gsi, new_stmt, true);
1464 if (gimple_vdef (stmt))
1465 release_ssa_name (gimple_vdef (stmt));
1466 }
1467 break;
1468
1469 default:;
1470 }
1471 }
1472 }
1473 }
1474
1475 statistics_counter_event (cfun, "sincos statements inserted",
1476 sincos_stats.inserted);
1477
1478 free_dominance_info (CDI_DOMINATORS);
1479 return cfg_changed ? TODO_cleanup_cfg : 0;
1480 }
1481
1482 static bool
1483 gate_cse_sincos (void)
1484 {
1485 /* We no longer require either sincos or cexp, since powi expansion
1486 piggybacks on this pass. */
1487 return optimize;
1488 }
1489
1490 struct gimple_opt_pass pass_cse_sincos =
1491 {
1492 {
1493 GIMPLE_PASS,
1494 "sincos", /* name */
1495 gate_cse_sincos, /* gate */
1496 execute_cse_sincos, /* execute */
1497 NULL, /* sub */
1498 NULL, /* next */
1499 0, /* static_pass_number */
1500 TV_NONE, /* tv_id */
1501 PROP_ssa, /* properties_required */
1502 0, /* properties_provided */
1503 0, /* properties_destroyed */
1504 0, /* todo_flags_start */
1505 TODO_update_ssa | TODO_verify_ssa
1506 | TODO_verify_stmts /* todo_flags_finish */
1507 }
1508 };
1509
1510 /* A symbolic number is used to detect byte permutation and selection
1511 patterns. The field N contains an artificial number built from
1512 one marker per byte:
1513
1514 0 - the byte has the value 0
1515 1..size - the byte contains the content of the source byte
1516 numbered with that value minus one */
1517
1518 struct symbolic_number {
1519 unsigned HOST_WIDEST_INT n;
1520 int size;
1521 };
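/* For example, for a 32-bit source operand the symbolic number starts
   out as 0x04030201 (marker 1 in the least significant byte); after a
   full byte swap it becomes 0x01020304, which is the value find_bswap
   below checks for.  */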
1522
1523 /* Perform a SHIFT or ROTATE operation by COUNT bits on symbolic
1524 number N. Return false if the requested operation is not permitted
1525 on a symbolic number. */
1526
1527 static inline bool
1528 do_shift_rotate (enum tree_code code,
1529 struct symbolic_number *n,
1530 int count)
1531 {
1532 if (count % 8 != 0)
1533 return false;
1534
1535 /* Zero out the extra bits of N in order to avoid them being shifted
1536 into the significant bits. */
1537 if (n->size < (int)sizeof (HOST_WIDEST_INT))
1538 n->n &= ((unsigned HOST_WIDEST_INT)1 << (n->size * BITS_PER_UNIT)) - 1;
1539
1540 switch (code)
1541 {
1542 case LSHIFT_EXPR:
1543 n->n <<= count;
1544 break;
1545 case RSHIFT_EXPR:
1546 n->n >>= count;
1547 break;
1548 case LROTATE_EXPR:
1549 n->n = (n->n << count) | (n->n >> ((n->size * BITS_PER_UNIT) - count));
1550 break;
1551 case RROTATE_EXPR:
1552 n->n = (n->n >> count) | (n->n << ((n->size * BITS_PER_UNIT) - count));
1553 break;
1554 default:
1555 return false;
1556 }
1557 /* Zero unused bits for size. */
1558 if (n->size < (int)sizeof (HOST_WIDEST_INT))
1559 n->n &= ((unsigned HOST_WIDEST_INT)1 << (n->size * BITS_PER_UNIT)) - 1;
1560 return true;
1561 }
1562
1563 /* Perform sanity checking for the symbolic number N and the gimple
1564 statement STMT. */
1565
1566 static inline bool
1567 verify_symbolic_number_p (struct symbolic_number *n, gimple stmt)
1568 {
1569 tree lhs_type;
1570
1571 lhs_type = gimple_expr_type (stmt);
1572
1573 if (TREE_CODE (lhs_type) != INTEGER_TYPE)
1574 return false;
1575
1576 if (TYPE_PRECISION (lhs_type) != n->size * BITS_PER_UNIT)
1577 return false;
1578
1579 return true;
1580 }
1581
1582 /* find_bswap_1 invokes itself recursively with N and tries to perform
1583 the operation given by the rhs of STMT on the result. If the
1584 operation can be performed, the function returns the tree expression
1585 of the source operand, and NULL otherwise. */
1586
1587 static tree
1588 find_bswap_1 (gimple stmt, struct symbolic_number *n, int limit)
1589 {
1590 enum tree_code code;
1591 tree rhs1, rhs2 = NULL;
1592 gimple rhs1_stmt, rhs2_stmt;
1593 tree source_expr1;
1594 enum gimple_rhs_class rhs_class;
1595
1596 if (!limit || !is_gimple_assign (stmt))
1597 return NULL_TREE;
1598
1599 rhs1 = gimple_assign_rhs1 (stmt);
1600
1601 if (TREE_CODE (rhs1) != SSA_NAME)
1602 return NULL_TREE;
1603
1604 code = gimple_assign_rhs_code (stmt);
1605 rhs_class = gimple_assign_rhs_class (stmt);
1606 rhs1_stmt = SSA_NAME_DEF_STMT (rhs1);
1607
1608 if (rhs_class == GIMPLE_BINARY_RHS)
1609 rhs2 = gimple_assign_rhs2 (stmt);
1610
1611 /* Handle unary rhs and binary rhs with integer constants as second
1612 operand. */
1613
1614 if (rhs_class == GIMPLE_UNARY_RHS
1615 || (rhs_class == GIMPLE_BINARY_RHS
1616 && TREE_CODE (rhs2) == INTEGER_CST))
1617 {
1618 if (code != BIT_AND_EXPR
1619 && code != LSHIFT_EXPR
1620 && code != RSHIFT_EXPR
1621 && code != LROTATE_EXPR
1622 && code != RROTATE_EXPR
1623 && code != NOP_EXPR
1624 && code != CONVERT_EXPR)
1625 return NULL_TREE;
1626
1627 source_expr1 = find_bswap_1 (rhs1_stmt, n, limit - 1);
1628
1629 /* If find_bswap_1 returned NULL, STMT is a leaf node and we have
1630 to initialize the symbolic number. */
1631 if (!source_expr1)
1632 {
1633 /* Set up the symbolic number N by setting each byte to a
1634 value between 1 and the byte size of rhs1. The highest
1635 order byte is set to n->size and the lowest order
1636 byte to 1. */
1637 n->size = TYPE_PRECISION (TREE_TYPE (rhs1));
1638 if (n->size % BITS_PER_UNIT != 0)
1639 return NULL_TREE;
1640 n->size /= BITS_PER_UNIT;
1641 n->n = (sizeof (HOST_WIDEST_INT) < 8 ? 0 :
1642 (unsigned HOST_WIDEST_INT)0x08070605 << 32 | 0x04030201);
1643
1644 if (n->size < (int)sizeof (HOST_WIDEST_INT))
1645 n->n &= ((unsigned HOST_WIDEST_INT)1 <<
1646 (n->size * BITS_PER_UNIT)) - 1;
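/* E.g. a 32-bit rhs1 leaves n->n == 0x04030201 at this point:
   markers 1..4, with marker 1 in the least significant byte.  */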
1647
1648 source_expr1 = rhs1;
1649 }
1650
1651 switch (code)
1652 {
1653 case BIT_AND_EXPR:
1654 {
1655 int i;
1656 unsigned HOST_WIDEST_INT val = widest_int_cst_value (rhs2);
1657 unsigned HOST_WIDEST_INT tmp = val;
1658
1659 /* Only constants masking full bytes are allowed. */
1660 for (i = 0; i < n->size; i++, tmp >>= BITS_PER_UNIT)
1661 if ((tmp & 0xff) != 0 && (tmp & 0xff) != 0xff)
1662 return NULL_TREE;
1663
1664 n->n &= val;
1665 }
1666 break;
1667 case LSHIFT_EXPR:
1668 case RSHIFT_EXPR:
1669 case LROTATE_EXPR:
1670 case RROTATE_EXPR:
1671 if (!do_shift_rotate (code, n, (int)TREE_INT_CST_LOW (rhs2)))
1672 return NULL_TREE;
1673 break;
1674 CASE_CONVERT:
1675 {
1676 int type_size;
1677
1678 type_size = TYPE_PRECISION (gimple_expr_type (stmt));
1679 if (type_size % BITS_PER_UNIT != 0)
1680 return NULL_TREE;
1681
1682 if (type_size / BITS_PER_UNIT < (int)(sizeof (HOST_WIDEST_INT)))
1683 {
1684 /* If STMT casts to a smaller type, mask out the bits not
1685 belonging to the target type. */
1686 n->n &= ((unsigned HOST_WIDEST_INT)1 << type_size) - 1;
1687 }
1688 n->size = type_size / BITS_PER_UNIT;
1689 }
1690 break;
1691 default:
1692 return NULL_TREE;
1693 };
1694 return verify_symbolic_number_p (n, stmt) ? source_expr1 : NULL;
1695 }
1696
1697 /* Handle binary rhs. */
1698
1699 if (rhs_class == GIMPLE_BINARY_RHS)
1700 {
1701 struct symbolic_number n1, n2;
1702 tree source_expr2;
1703
1704 if (code != BIT_IOR_EXPR)
1705 return NULL_TREE;
1706
1707 if (TREE_CODE (rhs2) != SSA_NAME)
1708 return NULL_TREE;
1709
1710 rhs2_stmt = SSA_NAME_DEF_STMT (rhs2);
1711
1712 switch (code)
1713 {
1714 case BIT_IOR_EXPR:
1715 source_expr1 = find_bswap_1 (rhs1_stmt, &n1, limit - 1);
1716
1717 if (!source_expr1)
1718 return NULL_TREE;
1719
1720 source_expr2 = find_bswap_1 (rhs2_stmt, &n2, limit - 1);
1721
1722 if (source_expr1 != source_expr2
1723 || n1.size != n2.size)
1724 return NULL_TREE;
1725
1726 n->size = n1.size;
1727 n->n = n1.n | n2.n;
1728
1729 if (!verify_symbolic_number_p (n, stmt))
1730 return NULL_TREE;
1731
1732 break;
1733 default:
1734 return NULL_TREE;
1735 }
1736 return source_expr1;
1737 }
1738 return NULL_TREE;
1739 }
1740
1741 /* Check if STMT completes a bswap implementation consisting of ORs,
1742 SHIFTs and ANDs. Return the source tree expression on which the
1743 byte swap is performed and NULL if no bswap was found. */
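/* A typical pattern recognized by find_bswap_1 is, for a 32-bit
   unsigned X (a sketch):

       ((X >> 24) & 0x000000ff)
     | ((X >>  8) & 0x0000ff00)
     | ((X <<  8) & 0x00ff0000)
     | ((X << 24) & 0xff000000)

   which execute_optimize_bswap below can then replace with a call to
   the 32-bit bswap builtin when the target supports it.  */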
1744
1745 static tree
1746 find_bswap (gimple stmt)
1747 {
1748 /* The number which the result of find_bswap_1 should match in order
1749 to have a full byte swap. The number is shifted to the right
1750 according to the size of the symbolic number before it is used. */
1751 unsigned HOST_WIDEST_INT cmp =
1752 sizeof (HOST_WIDEST_INT) < 8 ? 0 :
1753 (unsigned HOST_WIDEST_INT)0x01020304 << 32 | 0x05060708;
1754
1755 struct symbolic_number n;
1756 tree source_expr;
1757 int limit;
1758
1759 /* The last parameter determines the search depth limit. It usually
1760 correlates directly to the number of bytes to be touched. We
1761 increase that number a bit here (by 1 + ceil_log2 of it) in order to
1762 also cover signed -> unsigned conversions of the src operand, as can
1763 be seen in libgcc, and the initial shift/and operation of the src operand. */
1764 limit = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (gimple_expr_type (stmt)));
1765 limit += 1 + (int) ceil_log2 ((unsigned HOST_WIDE_INT) limit);
1766 source_expr = find_bswap_1 (stmt, &n, limit);
1767
1768 if (!source_expr)
1769 return NULL_TREE;
1770
1771 /* Zero out the extra bits of N and CMP. */
1772 if (n.size < (int)sizeof (HOST_WIDEST_INT))
1773 {
1774 unsigned HOST_WIDEST_INT mask =
1775 ((unsigned HOST_WIDEST_INT)1 << (n.size * BITS_PER_UNIT)) - 1;
1776
1777 n.n &= mask;
1778 cmp >>= (sizeof (HOST_WIDEST_INT) - n.size) * BITS_PER_UNIT;
1779 }
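/* Worked example (illustrative): for a 4-byte swap on a host where
   HOST_WIDEST_INT is 8 bytes wide, MASK is 0xffffffff and CMP is
   shifted down by 32 bits, leaving 0x01020304 to compare against N.  */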
1780
1781 /* A complete byte swap should make the symbolic number start
1782 with the largest digit in the highest-order byte. */
1783 if (cmp != n.n)
1784 return NULL_TREE;
1785
1786 return source_expr;
1787 }
1788
1789 /* Find manual byte swap implementations and turn them into a bswap
1790 builtin invocation. */
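/* For example (an illustrative sketch only; the SSA names are made up),
   a statement sequence computing a full 32-bit swap, such as the one
   sketched before find_bswap above, is replaced by

     _5 = __builtin_bswap32 (x_1);

   with conversion statements inserted around the call whenever the
   operand or result type differs from the builtin's argument/return
   type.  */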
1791
1792 static unsigned int
1793 execute_optimize_bswap (void)
1794 {
1795 basic_block bb;
1796 bool bswap32_p, bswap64_p;
1797 bool changed = false;
1798 tree bswap32_type = NULL_TREE, bswap64_type = NULL_TREE;
1799
1800 if (BITS_PER_UNIT != 8)
1801 return 0;
1802
1803 if (sizeof (HOST_WIDEST_INT) < 8)
1804 return 0;
1805
1806 bswap32_p = (builtin_decl_explicit_p (BUILT_IN_BSWAP32)
1807 && optab_handler (bswap_optab, SImode) != CODE_FOR_nothing);
1808 bswap64_p = (builtin_decl_explicit_p (BUILT_IN_BSWAP64)
1809 && (optab_handler (bswap_optab, DImode) != CODE_FOR_nothing
1810 || (bswap32_p && word_mode == SImode)));
1811
1812 if (!bswap32_p && !bswap64_p)
1813 return 0;
1814
1815 /* Determine the argument type of the builtins. The code later on
1816 assumes that the return and argument type are the same. */
1817 if (bswap32_p)
1818 {
1819 tree fndecl = builtin_decl_explicit (BUILT_IN_BSWAP32);
1820 bswap32_type = TREE_VALUE (TYPE_ARG_TYPES (TREE_TYPE (fndecl)));
1821 }
1822
1823 if (bswap64_p)
1824 {
1825 tree fndecl = builtin_decl_explicit (BUILT_IN_BSWAP64);
1826 bswap64_type = TREE_VALUE (TYPE_ARG_TYPES (TREE_TYPE (fndecl)));
1827 }
1828
1829 memset (&bswap_stats, 0, sizeof (bswap_stats));
1830
1831 FOR_EACH_BB (bb)
1832 {
1833 gimple_stmt_iterator gsi;
1834
1835 /* We do a reverse scan for bswap patterns to make sure we get the
1836 widest match. As bswap pattern matching doesn't handle
1837 previously inserted smaller bswap replacements as sub-
1838 patterns, the wider variant would otherwise not be detected. */
1839 for (gsi = gsi_last_bb (bb); !gsi_end_p (gsi); gsi_prev (&gsi))
1840 {
1841 gimple stmt = gsi_stmt (gsi);
1842 tree bswap_src, bswap_type;
1843 tree bswap_tmp;
1844 tree fndecl = NULL_TREE;
1845 int type_size;
1846 gimple call;
1847
1848 if (!is_gimple_assign (stmt)
1849 || gimple_assign_rhs_code (stmt) != BIT_IOR_EXPR)
1850 continue;
1851
1852 type_size = TYPE_PRECISION (gimple_expr_type (stmt));
1853
1854 switch (type_size)
1855 {
1856 case 32:
1857 if (bswap32_p)
1858 {
1859 fndecl = builtin_decl_explicit (BUILT_IN_BSWAP32);
1860 bswap_type = bswap32_type;
1861 }
1862 break;
1863 case 64:
1864 if (bswap64_p)
1865 {
1866 fndecl = builtin_decl_explicit (BUILT_IN_BSWAP64);
1867 bswap_type = bswap64_type;
1868 }
1869 break;
1870 default:
1871 continue;
1872 }
1873
1874 if (!fndecl)
1875 continue;
1876
1877 bswap_src = find_bswap (stmt);
1878
1879 if (!bswap_src)
1880 continue;
1881
1882 changed = true;
1883 if (type_size == 32)
1884 bswap_stats.found_32bit++;
1885 else
1886 bswap_stats.found_64bit++;
1887
1888 bswap_tmp = bswap_src;
1889
1890 /* Convert the src expression if necessary. */
1891 if (!useless_type_conversion_p (TREE_TYPE (bswap_tmp), bswap_type))
1892 {
1893 gimple convert_stmt;
1894
1895 bswap_tmp = create_tmp_var (bswap_type, "bswapsrc");
1896 bswap_tmp = make_ssa_name (bswap_tmp, NULL);
1897
1898 convert_stmt = gimple_build_assign_with_ops (
1899 CONVERT_EXPR, bswap_tmp, bswap_src, NULL);
1900 gsi_insert_before (&gsi, convert_stmt, GSI_SAME_STMT);
1901 }
1902
1903 call = gimple_build_call (fndecl, 1, bswap_tmp);
1904
1905 bswap_tmp = gimple_assign_lhs (stmt);
1906
1907 /* Convert the result if necessary. */
1908 if (!useless_type_conversion_p (TREE_TYPE (bswap_tmp), bswap_type))
1909 {
1910 gimple convert_stmt;
1911
1912 bswap_tmp = create_tmp_var (bswap_type, "bswapdst");
1913 bswap_tmp = make_ssa_name (bswap_tmp, NULL);
1914 convert_stmt = gimple_build_assign_with_ops (
1915 CONVERT_EXPR, gimple_assign_lhs (stmt), bswap_tmp, NULL);
1916 gsi_insert_after (&gsi, convert_stmt, GSI_SAME_STMT);
1917 }
1918
1919 gimple_call_set_lhs (call, bswap_tmp);
1920
1921 if (dump_file)
1922 {
1923 fprintf (dump_file, "%d bit bswap implementation found at: ",
1924 (int)type_size);
1925 print_gimple_stmt (dump_file, stmt, 0, 0);
1926 }
1927
1928 gsi_insert_after (&gsi, call, GSI_SAME_STMT);
1929 gsi_remove (&gsi, true);
1930 }
1931 }
1932
1933 statistics_counter_event (cfun, "32-bit bswap implementations found",
1934 bswap_stats.found_32bit);
1935 statistics_counter_event (cfun, "64-bit bswap implementations found",
1936 bswap_stats.found_64bit);
1937
1938 return (changed ? TODO_update_ssa | TODO_verify_ssa
1939 | TODO_verify_stmts : 0);
1940 }
1941
1942 static bool
1943 gate_optimize_bswap (void)
1944 {
1945 return flag_expensive_optimizations && optimize;
1946 }
1947
1948 struct gimple_opt_pass pass_optimize_bswap =
1949 {
1950 {
1951 GIMPLE_PASS,
1952 "bswap", /* name */
1953 gate_optimize_bswap, /* gate */
1954 execute_optimize_bswap, /* execute */
1955 NULL, /* sub */
1956 NULL, /* next */
1957 0, /* static_pass_number */
1958 TV_NONE, /* tv_id */
1959 PROP_ssa, /* properties_required */
1960 0, /* properties_provided */
1961 0, /* properties_destroyed */
1962 0, /* todo_flags_start */
1963 0 /* todo_flags_finish */
1964 }
1965 };
1966
1967 /* Return true if RHS is a suitable operand for a widening multiplication,
1968 assuming a target type of TYPE.
1969 There are two cases:
1970
1971 - RHS makes some value at least twice as wide. Store that value
1972 in *NEW_RHS_OUT if so, and store its type in *TYPE_OUT.
1973
1974 - RHS is an integer constant. Store that value in *NEW_RHS_OUT if so,
1975 and store NULL in *TYPE_OUT. */
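/* For instance (illustrative, hypothetical names): with TYPE being
   "long long int" and RHS defined by "rhs_1 = (long long int) a_2"
   where a_2 has type "int", *NEW_RHS_OUT becomes a_2 and *TYPE_OUT
   becomes "int".  With RHS an INTEGER_CST such as 7, *NEW_RHS_OUT is
   the constant and *TYPE_OUT is set to NULL.  */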
1976
1977 static bool
1978 is_widening_mult_rhs_p (tree type, tree rhs, tree *type_out,
1979 tree *new_rhs_out)
1980 {
1981 gimple stmt;
1982 tree type1, rhs1;
1983 enum tree_code rhs_code;
1984
1985 if (TREE_CODE (rhs) == SSA_NAME)
1986 {
1987 stmt = SSA_NAME_DEF_STMT (rhs);
1988 if (is_gimple_assign (stmt))
1989 {
1990 rhs_code = gimple_assign_rhs_code (stmt);
1991 if (TREE_CODE (type) == INTEGER_TYPE
1992 ? !CONVERT_EXPR_CODE_P (rhs_code)
1993 : rhs_code != FIXED_CONVERT_EXPR)
1994 rhs1 = rhs;
1995 else
1996 {
1997 rhs1 = gimple_assign_rhs1 (stmt);
1998
1999 if (TREE_CODE (rhs1) == INTEGER_CST)
2000 {
2001 *new_rhs_out = rhs1;
2002 *type_out = NULL;
2003 return true;
2004 }
2005 }
2006 }
2007 else
2008 rhs1 = rhs;
2009
2010 type1 = TREE_TYPE (rhs1);
2011
2012 if (TREE_CODE (type1) != TREE_CODE (type)
2013 || TYPE_PRECISION (type1) * 2 > TYPE_PRECISION (type))
2014 return false;
2015
2016 *new_rhs_out = rhs1;
2017 *type_out = type1;
2018 return true;
2019 }
2020
2021 if (TREE_CODE (rhs) == INTEGER_CST)
2022 {
2023 *new_rhs_out = rhs;
2024 *type_out = NULL;
2025 return true;
2026 }
2027
2028 return false;
2029 }
2030
2031 /* Return true if STMT performs a widening multiplication, assuming the
2032 output type is TYPE. If so, store the unwidened types of the operands
2033 in *TYPE1_OUT and *TYPE2_OUT respectively. Also fill *RHS1_OUT and
2034 *RHS2_OUT such that converting those operands to types *TYPE1_OUT
2035 and *TYPE2_OUT would give the operands of the multiplication. */
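/* A typical case (illustrative, hypothetical names): for

     c_5 = _1 * _3;

   where _1 = (long long int) a_2, _3 = (long long int) b_4 and a_2,
   b_4 have type "int", this returns true with *TYPE1_OUT = *TYPE2_OUT
   = "int", *RHS1_OUT = a_2 and *RHS2_OUT = b_4.  */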
2036
2037 static bool
2038 is_widening_mult_p (gimple stmt,
2039 tree *type1_out, tree *rhs1_out,
2040 tree *type2_out, tree *rhs2_out)
2041 {
2042 tree type = TREE_TYPE (gimple_assign_lhs (stmt));
2043
2044 if (TREE_CODE (type) != INTEGER_TYPE
2045 && TREE_CODE (type) != FIXED_POINT_TYPE)
2046 return false;
2047
2048 if (!is_widening_mult_rhs_p (type, gimple_assign_rhs1 (stmt), type1_out,
2049 rhs1_out))
2050 return false;
2051
2052 if (!is_widening_mult_rhs_p (type, gimple_assign_rhs2 (stmt), type2_out,
2053 rhs2_out))
2054 return false;
2055
2056 if (*type1_out == NULL)
2057 {
2058 if (*type2_out == NULL || !int_fits_type_p (*rhs1_out, *type2_out))
2059 return false;
2060 *type1_out = *type2_out;
2061 }
2062
2063 if (*type2_out == NULL)
2064 {
2065 if (!int_fits_type_p (*rhs2_out, *type1_out))
2066 return false;
2067 *type2_out = *type1_out;
2068 }
2069
2070 /* Ensure that the larger of the two operands comes first. */
2071 if (TYPE_PRECISION (*type1_out) < TYPE_PRECISION (*type2_out))
2072 {
2073 tree tmp;
2074 tmp = *type1_out;
2075 *type1_out = *type2_out;
2076 *type2_out = tmp;
2077 tmp = *rhs1_out;
2078 *rhs1_out = *rhs2_out;
2079 *rhs2_out = tmp;
2080 }
2081
2082 return true;
2083 }
2084
2085 /* Process a single gimple statement STMT, which has a MULT_EXPR as
2086 its rhs, and try to convert it into a WIDEN_MULT_EXPR. The return
2087 value is true iff we converted the statement. */
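/* Sketch of the transformation (illustrative GIMPLE, made-up names),
   assuming the target provides a 32x32->64 widening multiply:

     before:  _1 = (long long int) a_2;
              _3 = (long long int) b_4;
              c_5 = _1 * _3;

     after:   c_5 = a_2 w* b_4;     <-- WIDEN_MULT_EXPR

   The intermediate conversions typically become dead and are removed
   by later DCE.  */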
2088
2089 static bool
2090 convert_mult_to_widen (gimple stmt, gimple_stmt_iterator *gsi)
2091 {
2092 tree lhs, rhs1, rhs2, type, type1, type2, tmp = NULL;
2093 enum insn_code handler;
2094 enum machine_mode to_mode, from_mode, actual_mode;
2095 optab op;
2096 int actual_precision;
2097 location_t loc = gimple_location (stmt);
2098 bool from_unsigned1, from_unsigned2;
2099
2100 lhs = gimple_assign_lhs (stmt);
2101 type = TREE_TYPE (lhs);
2102 if (TREE_CODE (type) != INTEGER_TYPE)
2103 return false;
2104
2105 if (!is_widening_mult_p (stmt, &type1, &rhs1, &type2, &rhs2))
2106 return false;
2107
2108 to_mode = TYPE_MODE (type);
2109 from_mode = TYPE_MODE (type1);
2110 from_unsigned1 = TYPE_UNSIGNED (type1);
2111 from_unsigned2 = TYPE_UNSIGNED (type2);
2112
2113 if (from_unsigned1 && from_unsigned2)
2114 op = umul_widen_optab;
2115 else if (!from_unsigned1 && !from_unsigned2)
2116 op = smul_widen_optab;
2117 else
2118 op = usmul_widen_optab;
2119
2120 handler = find_widening_optab_handler_and_mode (op, to_mode, from_mode,
2121 0, &actual_mode);
2122
2123 if (handler == CODE_FOR_nothing)
2124 {
2125 if (op != smul_widen_optab)
2126 {
2127 /* We can use a signed multiply with unsigned types as long as
2128 there is a wider mode to use, or it is the smaller of the two
2129 types that is unsigned. Note that type1 >= type2, always. */
2130 if ((TYPE_UNSIGNED (type1)
2131 && TYPE_PRECISION (type1) == GET_MODE_PRECISION (from_mode))
2132 || (TYPE_UNSIGNED (type2)
2133 && TYPE_PRECISION (type2) == GET_MODE_PRECISION (from_mode)))
2134 {
2135 from_mode = GET_MODE_WIDER_MODE (from_mode);
2136 if (GET_MODE_SIZE (to_mode) <= GET_MODE_SIZE (from_mode))
2137 return false;
2138 }
2139
2140 op = smul_widen_optab;
2141 handler = find_widening_optab_handler_and_mode (op, to_mode,
2142 from_mode, 0,
2143 &actual_mode);
2144
2145 if (handler == CODE_FOR_nothing)
2146 return false;
2147
2148 from_unsigned1 = from_unsigned2 = false;
2149 }
2150 else
2151 return false;
2152 }
2153
2154 /* Ensure that the inputs to the handler are in the correct precision
2155 for the opcode. This will be the full mode size. */
2156 actual_precision = GET_MODE_PRECISION (actual_mode);
2157 if (2 * actual_precision > TYPE_PRECISION (type))
2158 return false;
2159 if (actual_precision != TYPE_PRECISION (type1)
2160 || from_unsigned1 != TYPE_UNSIGNED (type1))
2161 {
2162 tmp = create_tmp_var (build_nonstandard_integer_type
2163 (actual_precision, from_unsigned1),
2164 NULL);
2165 rhs1 = build_and_insert_cast (gsi, loc, tmp, rhs1);
2166 }
2167 if (actual_precision != TYPE_PRECISION (type2)
2168 || from_unsigned2 != TYPE_UNSIGNED (type2))
2169 {
2170 /* Reuse the same type info, if possible. */
2171 if (!tmp || from_unsigned1 != from_unsigned2)
2172 tmp = create_tmp_var (build_nonstandard_integer_type
2173 (actual_precision, from_unsigned2),
2174 NULL);
2175 rhs2 = build_and_insert_cast (gsi, loc, tmp, rhs2);
2176 }
2177
2178 /* Handle constants. */
2179 if (TREE_CODE (rhs1) == INTEGER_CST)
2180 rhs1 = fold_convert (type1, rhs1);
2181 if (TREE_CODE (rhs2) == INTEGER_CST)
2182 rhs2 = fold_convert (type2, rhs2);
2183
2184 gimple_assign_set_rhs1 (stmt, rhs1);
2185 gimple_assign_set_rhs2 (stmt, rhs2);
2186 gimple_assign_set_rhs_code (stmt, WIDEN_MULT_EXPR);
2187 update_stmt (stmt);
2188 widen_mul_stats.widen_mults_inserted++;
2189 return true;
2190 }
2191
2192 /* Process a single gimple statement STMT, which is found at the
2193 iterator GSI and has either a PLUS_EXPR or a MINUS_EXPR as its
2194 rhs (given by CODE), and try to convert it into a
2195 WIDEN_MULT_PLUS_EXPR or a WIDEN_MULT_MINUS_EXPR. The return value
2196 is true iff we converted the statement. */
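/* Sketch (illustrative GIMPLE, made-up names), assuming the target has
   a widening multiply-accumulate pattern for this mode:

     before:  _1 = (long long int) a_2;
              _3 = (long long int) b_4;
              _5 = _1 * _3;
              d_7 = _5 + c_6;

     after:   d_7 = WIDEN_MULT_PLUS_EXPR <a_2, b_4, c_6>;

   The MINUS_EXPR case produces a WIDEN_MULT_MINUS_EXPR analogously.  */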
2197
2198 static bool
2199 convert_plusminus_to_widen (gimple_stmt_iterator *gsi, gimple stmt,
2200 enum tree_code code)
2201 {
2202 gimple rhs1_stmt = NULL, rhs2_stmt = NULL;
2203 gimple conv1_stmt = NULL, conv2_stmt = NULL, conv_stmt;
2204 tree type, type1, type2, optype, tmp = NULL;
2205 tree lhs, rhs1, rhs2, mult_rhs1, mult_rhs2, add_rhs;
2206 enum tree_code rhs1_code = ERROR_MARK, rhs2_code = ERROR_MARK;
2207 optab this_optab;
2208 enum tree_code wmult_code;
2209 enum insn_code handler;
2210 enum machine_mode to_mode, from_mode, actual_mode;
2211 location_t loc = gimple_location (stmt);
2212 int actual_precision;
2213 bool from_unsigned1, from_unsigned2;
2214
2215 lhs = gimple_assign_lhs (stmt);
2216 type = TREE_TYPE (lhs);
2217 if (TREE_CODE (type) != INTEGER_TYPE
2218 && TREE_CODE (type) != FIXED_POINT_TYPE)
2219 return false;
2220
2221 if (code == MINUS_EXPR)
2222 wmult_code = WIDEN_MULT_MINUS_EXPR;
2223 else
2224 wmult_code = WIDEN_MULT_PLUS_EXPR;
2225
2226 rhs1 = gimple_assign_rhs1 (stmt);
2227 rhs2 = gimple_assign_rhs2 (stmt);
2228
2229 if (TREE_CODE (rhs1) == SSA_NAME)
2230 {
2231 rhs1_stmt = SSA_NAME_DEF_STMT (rhs1);
2232 if (is_gimple_assign (rhs1_stmt))
2233 rhs1_code = gimple_assign_rhs_code (rhs1_stmt);
2234 }
2235
2236 if (TREE_CODE (rhs2) == SSA_NAME)
2237 {
2238 rhs2_stmt = SSA_NAME_DEF_STMT (rhs2);
2239 if (is_gimple_assign (rhs2_stmt))
2240 rhs2_code = gimple_assign_rhs_code (rhs2_stmt);
2241 }
2242
2243 /* Allow for one conversion statement between the multiply
2244 and addition/subtraction statement. If there are multiple
2245 conversions then we assume they would invalidate this
2246 transformation. If that's not the case then they should have
2247 been folded before now. */
2248 if (CONVERT_EXPR_CODE_P (rhs1_code))
2249 {
2250 conv1_stmt = rhs1_stmt;
2251 rhs1 = gimple_assign_rhs1 (rhs1_stmt);
2252 if (TREE_CODE (rhs1) == SSA_NAME)
2253 {
2254 rhs1_stmt = SSA_NAME_DEF_STMT (rhs1);
2255 if (is_gimple_assign (rhs1_stmt))
2256 rhs1_code = gimple_assign_rhs_code (rhs1_stmt);
2257 }
2258 else
2259 return false;
2260 }
2261 if (CONVERT_EXPR_CODE_P (rhs2_code))
2262 {
2263 conv2_stmt = rhs2_stmt;
2264 rhs2 = gimple_assign_rhs1 (rhs2_stmt);
2265 if (TREE_CODE (rhs2) == SSA_NAME)
2266 {
2267 rhs2_stmt = SSA_NAME_DEF_STMT (rhs2);
2268 if (is_gimple_assign (rhs2_stmt))
2269 rhs2_code = gimple_assign_rhs_code (rhs2_stmt);
2270 }
2271 else
2272 return false;
2273 }
2274
2275 /* If the rhs code is WIDEN_MULT_EXPR then it would seem unnecessary to
2276 call is_widening_mult_p, but we still need the rhs values it returns.
2277
2278 It might also appear that it would be sufficient to use the existing
2279 operands of the widening multiply, but that would limit the choice of
2280 multiply-and-accumulate instructions. */
2281 if (code == PLUS_EXPR
2282 && (rhs1_code == MULT_EXPR || rhs1_code == WIDEN_MULT_EXPR))
2283 {
2284 if (!is_widening_mult_p (rhs1_stmt, &type1, &mult_rhs1,
2285 &type2, &mult_rhs2))
2286 return false;
2287 add_rhs = rhs2;
2288 conv_stmt = conv1_stmt;
2289 }
2290 else if (rhs2_code == MULT_EXPR || rhs2_code == WIDEN_MULT_EXPR)
2291 {
2292 if (!is_widening_mult_p (rhs2_stmt, &type1, &mult_rhs1,
2293 &type2, &mult_rhs2))
2294 return false;
2295 add_rhs = rhs1;
2296 conv_stmt = conv2_stmt;
2297 }
2298 else
2299 return false;
2300
2301 to_mode = TYPE_MODE (type);
2302 from_mode = TYPE_MODE (type1);
2303 from_unsigned1 = TYPE_UNSIGNED (type1);
2304 from_unsigned2 = TYPE_UNSIGNED (type2);
2305 optype = type1;
2306
2307 /* There's no such thing as a mixed sign madd yet, so use a wider mode. */
2308 if (from_unsigned1 != from_unsigned2)
2309 {
2310 if (!INTEGRAL_TYPE_P (type))
2311 return false;
2312 /* We can use a signed multiply with unsigned types as long as
2313 there is a wider mode to use, or it is the smaller of the two
2314 types that is unsigned. Note that type1 >= type2, always. */
2315 if ((from_unsigned1
2316 && TYPE_PRECISION (type1) == GET_MODE_PRECISION (from_mode))
2317 || (from_unsigned2
2318 && TYPE_PRECISION (type2) == GET_MODE_PRECISION (from_mode)))
2319 {
2320 from_mode = GET_MODE_WIDER_MODE (from_mode);
2321 if (GET_MODE_SIZE (from_mode) >= GET_MODE_SIZE (to_mode))
2322 return false;
2323 }
2324
2325 from_unsigned1 = from_unsigned2 = false;
2326 optype = build_nonstandard_integer_type (GET_MODE_PRECISION (from_mode),
2327 false);
2328 }
2329
2330 /* If there was a conversion between the multiply and addition
2331 then we need to make sure it fits a multiply-and-accumulate.
2332 There should be a single mode change which does not change the
2333 value. */
2334 if (conv_stmt)
2335 {
2336 /* We use the original, unmodified data types for this. */
2337 tree from_type = TREE_TYPE (gimple_assign_rhs1 (conv_stmt));
2338 tree to_type = TREE_TYPE (gimple_assign_lhs (conv_stmt));
2339 int data_size = TYPE_PRECISION (type1) + TYPE_PRECISION (type2);
2340 bool is_unsigned = TYPE_UNSIGNED (type1) && TYPE_UNSIGNED (type2);
2341
2342 if (TYPE_PRECISION (from_type) > TYPE_PRECISION (to_type))
2343 {
2344 /* Conversion is a truncate. */
2345 if (TYPE_PRECISION (to_type) < data_size)
2346 return false;
2347 }
2348 else if (TYPE_PRECISION (from_type) < TYPE_PRECISION (to_type))
2349 {
2350 /* Conversion is an extend. Check it's the right sort. */
2351 if (TYPE_UNSIGNED (from_type) != is_unsigned
2352 && !(is_unsigned && TYPE_PRECISION (from_type) > data_size))
2353 return false;
2354 }
2355 /* else convert is a no-op for our purposes. */
2356 }
2357
2358 /* Verify that the machine can perform a widening multiply
2359 accumulate in this mode/signedness combination, otherwise
2360 this transformation is likely to pessimize code. */
2361 this_optab = optab_for_tree_code (wmult_code, optype, optab_default);
2362 handler = find_widening_optab_handler_and_mode (this_optab, to_mode,
2363 from_mode, 0, &actual_mode);
2364
2365 if (handler == CODE_FOR_nothing)
2366 return false;
2367
2368 /* Ensure that the inputs to the handler are in the correct precision
2369 for the opcode. This will be the full mode size. */
2370 actual_precision = GET_MODE_PRECISION (actual_mode);
2371 if (actual_precision != TYPE_PRECISION (type1)
2372 || from_unsigned1 != TYPE_UNSIGNED (type1))
2373 {
2374 tmp = create_tmp_var (build_nonstandard_integer_type
2375 (actual_precision, from_unsigned1),
2376 NULL);
2377 mult_rhs1 = build_and_insert_cast (gsi, loc, tmp, mult_rhs1);
2378 }
2379 if (actual_precision != TYPE_PRECISION (type2)
2380 || from_unsigned2 != TYPE_UNSIGNED (type2))
2381 {
2382 if (!tmp || from_unsigned1 != from_unsigned2)
2383 tmp = create_tmp_var (build_nonstandard_integer_type
2384 (actual_precision, from_unsigned2),
2385 NULL);
2386 mult_rhs2 = build_and_insert_cast (gsi, loc, tmp, mult_rhs2);
2387 }
2388
2389 if (!useless_type_conversion_p (type, TREE_TYPE (add_rhs)))
2390 add_rhs = build_and_insert_cast (gsi, loc, create_tmp_var (type, NULL),
2391 add_rhs);
2392
2393 /* Handle constants. */
2394 if (TREE_CODE (mult_rhs1) == INTEGER_CST)
2395 mult_rhs1 = fold_convert (type1, mult_rhs1);
2396 if (TREE_CODE (mult_rhs2) == INTEGER_CST)
2397 mult_rhs2 = fold_convert (type2, mult_rhs2);
2398
2399 gimple_assign_set_rhs_with_ops_1 (gsi, wmult_code, mult_rhs1, mult_rhs2,
2400 add_rhs);
2401 update_stmt (gsi_stmt (*gsi));
2402 widen_mul_stats.maccs_inserted++;
2403 return true;
2404 }
2405
2406 /* Combine the multiplication at MUL_STMT with operands MULOP1 and MULOP2
2407 with uses in additions and subtractions to form fused multiply-add
2408 operations. Returns true if successful and MUL_STMT should be removed. */
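/* Sketch (illustrative GIMPLE, made-up names), assuming the target has
   an fma pattern for the type's mode:

     before:  tmp_1 = a_2 * b_3;
              x_4 = tmp_1 + c_5;
              y_6 = c_5 - tmp_1;

     after:   x_4 = FMA <a_2, b_3, c_5>;
              y_6 = FMA <_7, b_3, c_5>;   with _7 = -a_2

   Every use of the product is rewritten, so TMP_1 becomes dead and
   MUL_STMT can be removed by the caller.  */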
2409
2410 static bool
2411 convert_mult_to_fma (gimple mul_stmt, tree op1, tree op2)
2412 {
2413 tree mul_result = gimple_get_lhs (mul_stmt);
2414 tree type = TREE_TYPE (mul_result);
2415 gimple use_stmt, neguse_stmt, fma_stmt;
2416 use_operand_p use_p;
2417 imm_use_iterator imm_iter;
2418
2419 if (FLOAT_TYPE_P (type)
2420 && flag_fp_contract_mode == FP_CONTRACT_OFF)
2421 return false;
2422
2423 /* We don't want to do bitfield reduction ops. */
2424 if (INTEGRAL_TYPE_P (type)
2425 && (TYPE_PRECISION (type)
2426 != GET_MODE_PRECISION (TYPE_MODE (type))))
2427 return false;
2428
2429 /* If the target doesn't support it, don't generate it. We assume that
2430 if fma isn't available then fms, fnma or fnms are not either. */
2431 if (optab_handler (fma_optab, TYPE_MODE (type)) == CODE_FOR_nothing)
2432 return false;
2433
2434 /* If the multiplication has zero uses, it is kept around probably because
2435 of -fnon-call-exceptions. Don't optimize it away in that case,
2436 that is DCE's job. */
2437 if (has_zero_uses (mul_result))
2438 return false;
2439
2440 /* Make sure that the multiplication statement becomes dead after
2441 the transformation, i.e. that all uses are transformed to FMAs.
2442 This means we assume that an FMA operation has the same cost
2443 as an addition. */
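/* Illustrative example (made-up names): in

     tmp_1 = a_2 * b_3;
     x_4 = tmp_1 + c_5;
     y_6 = tmp_1 - d_7;

   both uses are PLUS/MINUS in the same block, so both can become FMAs
   and tmp_1 dies; if tmp_1 had any other kind of use, e.g. a store to
   memory, the whole transformation would be rejected.  */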
2444 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, mul_result)
2445 {
2446 enum tree_code use_code;
2447 tree result = mul_result;
2448 bool negate_p = false;
2449
2450 use_stmt = USE_STMT (use_p);
2451
2452 if (is_gimple_debug (use_stmt))
2453 continue;
2454
2455 /* For now restrict this operation to a single basic block. In theory
2456 we would want to support sinking the multiplication in
2457 m = a*b;
2458 if ()
2459 ma = m + c;
2460 else
2461 d = m;
2462 to form an FMA in the then block and sink the multiplication to the
2463 else block. */
2464 if (gimple_bb (use_stmt) != gimple_bb (mul_stmt))
2465 return false;
2466
2467 if (!is_gimple_assign (use_stmt))
2468 return false;
2469
2470 use_code = gimple_assign_rhs_code (use_stmt);
2471
2472 /* A negate on the multiplication leads to FNMA. */
2473 if (use_code == NEGATE_EXPR)
2474 {
2475 ssa_op_iter iter;
2476 use_operand_p usep;
2477
2478 result = gimple_assign_lhs (use_stmt);
2479
2480 /* Make sure the negate statement becomes dead with this
2481 single transformation. */
2482 if (!single_imm_use (gimple_assign_lhs (use_stmt),
2483 &use_p, &neguse_stmt))
2484 return false;
2485
2486 /* Make sure the multiplication isn't also used on that stmt. */
2487 FOR_EACH_PHI_OR_STMT_USE (usep, neguse_stmt, iter, SSA_OP_USE)
2488 if (USE_FROM_PTR (usep) == mul_result)
2489 return false;
2490
2491 /* Re-validate. */
2492 use_stmt = neguse_stmt;
2493 if (gimple_bb (use_stmt) != gimple_bb (mul_stmt))
2494 return false;
2495 if (!is_gimple_assign (use_stmt))
2496 return false;
2497
2498 use_code = gimple_assign_rhs_code (use_stmt);
2499 negate_p = true;
2500 }
2501
2502 switch (use_code)
2503 {
2504 case MINUS_EXPR:
2505 if (gimple_assign_rhs2 (use_stmt) == result)
2506 negate_p = !negate_p;
2507 break;
2508 case PLUS_EXPR:
2509 break;
2510 default:
2511 /* FMA can only be formed from PLUS and MINUS. */
2512 return false;
2513 }
2514
2515 /* We can't handle a * b + a * b. */
2516 if (gimple_assign_rhs1 (use_stmt) == gimple_assign_rhs2 (use_stmt))
2517 return false;
2518
2519 /* While it is possible to validate whether or not the exact form
2520 that we've recognized is available in the backend, the assumption
2521 is that the transformation is never a loss. For instance, suppose
2522 the target only has the plain FMA pattern available. Consider
2523 a*b-c -> fma(a,b,-c): we've exchanged MUL+SUB for FMA+NEG, which
2524 is still two operations. Consider -(a*b)-c -> fma(-a,b,-c): we
2525 still have 3 operations, but in the FMA form the two NEGs are
2526 independent and could be run in parallel. */
2527 }
2528
2529 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, mul_result)
2530 {
2531 gimple_stmt_iterator gsi = gsi_for_stmt (use_stmt);
2532 enum tree_code use_code;
2533 tree addop, mulop1 = op1, result = mul_result;
2534 bool negate_p = false;
2535
2536 if (is_gimple_debug (use_stmt))
2537 continue;
2538
2539 use_code = gimple_assign_rhs_code (use_stmt);
2540 if (use_code == NEGATE_EXPR)
2541 {
2542 result = gimple_assign_lhs (use_stmt);
2543 single_imm_use (gimple_assign_lhs (use_stmt), &use_p, &neguse_stmt);
2544 gsi_remove (&gsi, true);
2545 release_defs (use_stmt);
2546
2547 use_stmt = neguse_stmt;
2548 gsi = gsi_for_stmt (use_stmt);
2549 use_code = gimple_assign_rhs_code (use_stmt);
2550 negate_p = true;
2551 }
2552
2553 if (gimple_assign_rhs1 (use_stmt) == result)
2554 {
2555 addop = gimple_assign_rhs2 (use_stmt);
2556 /* a * b - c -> a * b + (-c) */
2557 if (gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
2558 addop = force_gimple_operand_gsi (&gsi,
2559 build1 (NEGATE_EXPR,
2560 type, addop),
2561 true, NULL_TREE, true,
2562 GSI_SAME_STMT);
2563 }
2564 else
2565 {
2566 addop = gimple_assign_rhs1 (use_stmt);
2567 /* a - b * c -> (-b) * c + a */
2568 if (gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
2569 negate_p = !negate_p;
2570 }
2571
2572 if (negate_p)
2573 mulop1 = force_gimple_operand_gsi (&gsi,
2574 build1 (NEGATE_EXPR,
2575 type, mulop1),
2576 true, NULL_TREE, true,
2577 GSI_SAME_STMT);
2578
2579 fma_stmt = gimple_build_assign_with_ops3 (FMA_EXPR,
2580 gimple_assign_lhs (use_stmt),
2581 mulop1, op2,
2582 addop);
2583 gsi_replace (&gsi, fma_stmt, true);
2584 widen_mul_stats.fmas_inserted++;
2585 }
2586
2587 return true;
2588 }
2589
2590 /* Find integer multiplications where the operands are extended from
2591 smaller types, and replace the MULT_EXPR with a WIDEN_MULT_EXPR
2592 where appropriate. */
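/* Besides plain multiplications this also feeds PLUS_EXPR and
   MINUS_EXPR statements to convert_plusminus_to_widen, and calls to
   pow/powf/powl with a constant exponent of 2.0 to convert_mult_to_fma,
   treating pow (x, 2.0) as x * x.  Illustrative example (made-up
   names): "t_1 = pow (x_2, 2.0e0); y_3 = t_1 + c_4;" can become
   "y_3 = FMA <x_2, x_2, c_4>;" when the target supports fma.  */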
2593
2594 static unsigned int
2595 execute_optimize_widening_mul (void)
2596 {
2597 basic_block bb;
2598 bool cfg_changed = false;
2599
2600 memset (&widen_mul_stats, 0, sizeof (widen_mul_stats));
2601
2602 FOR_EACH_BB (bb)
2603 {
2604 gimple_stmt_iterator gsi;
2605
2606 for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi);)
2607 {
2608 gimple stmt = gsi_stmt (gsi);
2609 enum tree_code code;
2610
2611 if (is_gimple_assign (stmt))
2612 {
2613 code = gimple_assign_rhs_code (stmt);
2614 switch (code)
2615 {
2616 case MULT_EXPR:
2617 if (!convert_mult_to_widen (stmt, &gsi)
2618 && convert_mult_to_fma (stmt,
2619 gimple_assign_rhs1 (stmt),
2620 gimple_assign_rhs2 (stmt)))
2621 {
2622 gsi_remove (&gsi, true);
2623 release_defs (stmt);
2624 continue;
2625 }
2626 break;
2627
2628 case PLUS_EXPR:
2629 case MINUS_EXPR:
2630 convert_plusminus_to_widen (&gsi, stmt, code);
2631 break;
2632
2633 default:;
2634 }
2635 }
2636 else if (is_gimple_call (stmt)
2637 && gimple_call_lhs (stmt))
2638 {
2639 tree fndecl = gimple_call_fndecl (stmt);
2640 if (fndecl
2641 && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_NORMAL)
2642 {
2643 switch (DECL_FUNCTION_CODE (fndecl))
2644 {
2645 case BUILT_IN_POWF:
2646 case BUILT_IN_POW:
2647 case BUILT_IN_POWL:
2648 if (TREE_CODE (gimple_call_arg (stmt, 1)) == REAL_CST
2649 && REAL_VALUES_EQUAL
2650 (TREE_REAL_CST (gimple_call_arg (stmt, 1)),
2651 dconst2)
2652 && convert_mult_to_fma (stmt,
2653 gimple_call_arg (stmt, 0),
2654 gimple_call_arg (stmt, 0)))
2655 {
2656 unlink_stmt_vdef (stmt);
2657 if (gsi_remove (&gsi, true)
2658 && gimple_purge_dead_eh_edges (bb))
2659 cfg_changed = true;
2660 release_defs (stmt);
2661 continue;
2662 }
2663 break;
2664
2665 default:;
2666 }
2667 }
2668 }
2669 gsi_next (&gsi);
2670 }
2671 }
2672
2673 statistics_counter_event (cfun, "widening multiplications inserted",
2674 widen_mul_stats.widen_mults_inserted);
2675 statistics_counter_event (cfun, "widening maccs inserted",
2676 widen_mul_stats.maccs_inserted);
2677 statistics_counter_event (cfun, "fused multiply-adds inserted",
2678 widen_mul_stats.fmas_inserted);
2679
2680 return cfg_changed ? TODO_cleanup_cfg : 0;
2681 }
2682
2683 static bool
2684 gate_optimize_widening_mul (void)
2685 {
2686 return flag_expensive_optimizations && optimize;
2687 }
2688
2689 struct gimple_opt_pass pass_optimize_widening_mul =
2690 {
2691 {
2692 GIMPLE_PASS,
2693 "widening_mul", /* name */
2694 gate_optimize_widening_mul, /* gate */
2695 execute_optimize_widening_mul, /* execute */
2696 NULL, /* sub */
2697 NULL, /* next */
2698 0, /* static_pass_number */
2699 TV_NONE, /* tv_id */
2700 PROP_ssa, /* properties_required */
2701 0, /* properties_provided */
2702 0, /* properties_destroyed */
2703 0, /* todo_flags_start */
2704 TODO_verify_ssa
2705 | TODO_verify_stmts
2706 | TODO_update_ssa /* todo_flags_finish */
2707 }
2708 };