gcc/tree-ssa-math-opts.c
1 /* Global, SSA-based optimizations using mathematical identities.
2 Copyright (C) 2005-2015 Free Software Foundation, Inc.
3
4 This file is part of GCC.
5
6 GCC is free software; you can redistribute it and/or modify it
7 under the terms of the GNU General Public License as published by the
8 Free Software Foundation; either version 3, or (at your option) any
9 later version.
10
11 GCC is distributed in the hope that it will be useful, but WITHOUT
12 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14 for more details.
15
16 You should have received a copy of the GNU General Public License
17 along with GCC; see the file COPYING3. If not see
18 <http://www.gnu.org/licenses/>. */
19
20 /* The first mini-pass in this file tries to CSE reciprocal
21 operations. These are common in sequences such as this one:
22
23 modulus = sqrt(x*x + y*y + z*z);
24 x = x / modulus;
25 y = y / modulus;
26 z = z / modulus;
27
28 that can be optimized to
29
30 modulus = sqrt(x*x + y*y + z*z);
31 rmodulus = 1.0 / modulus;
32 x = x * rmodulus;
33 y = y * rmodulus;
34 z = z * rmodulus;
35
36 We do this for loop invariant divisors, and with this pass whenever
37 we notice that a division has the same divisor multiple times.
38
39 Of course, like in PRE, we don't insert a division if a dominator
40 already has one. However, this cannot be done as an extension of
41 PRE for several reasons.
42
43 First of all, with some experiments it was found that the
44 transformation is not always useful if there are only two divisions
45 by the same divisor. This is probably because modern processors
46 can pipeline the divisions; on older, in-order processors it should
47 still be effective to optimize two divisions by the same number.
48 We make this a param, and it shall be called N in the remainder of
49 this comment.
50
51 Second, if trapping math is active, we have less freedom on where
52 to insert divisions: we can only do so in basic blocks that already
53 contain one. (If divisions don't trap, we can instead insert
54 divisions elsewhere, in blocks that are common dominators of
55 those that already have the division.)
56
57 We really don't want to compute the reciprocal unless a division will
58 be found. To do this, we won't insert the division in a basic block
59 that has fewer than N divisions *post-dominating* it.
60
61 The algorithm constructs a subset of the dominator tree, holding the
62 blocks containing the divisions and the common dominators to them,
63 and walks it twice. The first walk is in post-order, and it annotates
64 each block with the number of divisions that post-dominate it: this
65 gives information on where divisions can be inserted profitably.
66 The second walk is in pre-order, and it inserts divisions as explained
67 above, and replaces divisions by multiplications.
68
69 In the best case, the cost of the pass is O(n_statements). In the
70 worst-case, the cost is due to creating the dominator tree subset,
71 with a cost of O(n_basic_blocks ^ 2); however this can only happen
72 for n_statements / n_basic_blocks statements. So, the amortized cost
73 of creating the dominator tree subset is O(n_basic_blocks) and the
74 worst-case cost of the pass is O(n_statements * n_basic_blocks).
75
76 More practically, the cost will be small because there are few
77 divisions, and they tend to be in the same basic block, so insert_bb
78 is called very few times.
79
80 If we did this using domwalk.c, an efficient implementation would have
81 to work on all the variables in a single pass, because we could not
82 work on just a subset of the dominator tree, as we do now, and the
83 cost would also be something like O(n_statements * n_basic_blocks).
84 The data structures would be more complex in order to work on all the
85 variables in a single pass. */
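/* An illustrative sketch (not part of the original comment): with the
   threshold N == 2, a division in BB1 and another division in a block
   BB2 that post-dominates BB1,

       BB1:  a = b / x;
       ...
       BB2:  c = d / x;

   becomes

       BB1:  t = 1.0 / x;
             a = b * t;
       ...
       BB2:  c = d * t;

   BB1's merit is 2 (its own division plus the one in BB2), so the
   reciprocal is inserted in BB1.  Had the two divisions been in the two
   arms of an if/else instead, neither arm would post-dominate their
   common dominator, every block's merit would stay below N, and nothing
   would change.  */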
86
87 #include "config.h"
88 #include "system.h"
89 #include "coretypes.h"
90 #include "backend.h"
91 #include "target.h"
92 #include "rtl.h"
93 #include "tree.h"
94 #include "gimple.h"
95 #include "predict.h"
96 #include "alloc-pool.h"
97 #include "tree-pass.h"
98 #include "ssa.h"
99 #include "optabs-tree.h"
100 #include "gimple-pretty-print.h"
101 #include "flags.h"
102 #include "alias.h"
103 #include "fold-const.h"
104 #include "internal-fn.h"
105 #include "gimple-fold.h"
106 #include "gimple-iterator.h"
107 #include "gimplify.h"
108 #include "gimplify-me.h"
109 #include "stor-layout.h"
110 #include "tree-cfg.h"
111 #include "tree-dfa.h"
112 #include "tree-ssa.h"
113 #include "builtins.h"
114 #include "params.h"
115
116 /* This structure represents one basic block that either computes a
117 division, or is a common dominator for basic blocks that compute a
118 division. */
119 struct occurrence {
120 /* The basic block represented by this structure. */
121 basic_block bb;
122
123 /* If non-NULL, the SSA_NAME holding the definition for a reciprocal
124 inserted in BB. */
125 tree recip_def;
126
127 /* If non-NULL, the GIMPLE_ASSIGN for a reciprocal computation that
128 was inserted in BB. */
129 gimple *recip_def_stmt;
130
131 /* Pointer to a list of "struct occurrence"s for blocks dominated
132 by BB. */
133 struct occurrence *children;
134
135 /* Pointer to the next "struct occurrence" in the list of blocks
136 sharing a common dominator. */
137 struct occurrence *next;
138
139 /* The number of divisions that are in BB before compute_merit. The
140 number of divisions that are in BB or post-dominate it after
141 compute_merit. */
142 int num_divisions;
143
144 /* True if the basic block has a division, false if it is a common
145 dominator for basic blocks that do. If it is false and trapping
146 math is active, BB is not a candidate for inserting a reciprocal. */
147 bool bb_has_division;
148 };
149
150 static struct
151 {
152 /* Number of 1.0/X ops inserted. */
153 int rdivs_inserted;
154
155 /* Number of 1.0/FUNC ops inserted. */
156 int rfuncs_inserted;
157 } reciprocal_stats;
158
159 static struct
160 {
161 /* Number of cexpi calls inserted. */
162 int inserted;
163 } sincos_stats;
164
165 static struct
166 {
167 /* Number of hand-written 16-bit nop / bswaps found. */
168 int found_16bit;
169
170 /* Number of hand-written 32-bit nop / bswaps found. */
171 int found_32bit;
172
173 /* Number of hand-written 64-bit nop / bswaps found. */
174 int found_64bit;
175 } nop_stats, bswap_stats;
176
177 static struct
178 {
179 /* Number of widening multiplication ops inserted. */
180 int widen_mults_inserted;
181
182 /* Number of integer multiply-and-accumulate ops inserted. */
183 int maccs_inserted;
184
185 /* Number of fp fused multiply-add ops inserted. */
186 int fmas_inserted;
187 } widen_mul_stats;
188
189 /* The instance of "struct occurrence" representing the highest
190 interesting block in the dominator tree. */
191 static struct occurrence *occ_head;
192
193 /* Allocation pool for getting instances of "struct occurrence". */
194 static object_allocator<occurrence> *occ_pool;
195
196
197
198 /* Allocate and return a new struct occurrence for basic block BB,
199 whose children list is headed by CHILDREN. */
200 static struct occurrence *
201 occ_new (basic_block bb, struct occurrence *children)
202 {
203 struct occurrence *occ;
204
205 bb->aux = occ = occ_pool->allocate ();
206 memset (occ, 0, sizeof (struct occurrence));
207
208 occ->bb = bb;
209 occ->children = children;
210 return occ;
211 }
212
213
214 /* Insert NEW_OCC into our subset of the dominator tree. P_HEAD points to a
215 list of "struct occurrence"s, one per basic block, having IDOM as
216 their common dominator.
217
218 We try to insert NEW_OCC as deep as possible in the tree, and we also
219 insert any other block that is a common dominator for BB and one
220 block already in the tree. */
221
222 static void
223 insert_bb (struct occurrence *new_occ, basic_block idom,
224 struct occurrence **p_head)
225 {
226 struct occurrence *occ, **p_occ;
227
228 for (p_occ = p_head; (occ = *p_occ) != NULL; )
229 {
230 basic_block bb = new_occ->bb, occ_bb = occ->bb;
231 basic_block dom = nearest_common_dominator (CDI_DOMINATORS, occ_bb, bb);
232 if (dom == bb)
233 {
234 /* BB dominates OCC_BB. OCC becomes NEW_OCC's child: remove OCC
235 from its list. */
236 *p_occ = occ->next;
237 occ->next = new_occ->children;
238 new_occ->children = occ;
239
240 /* Try the next block (it may as well be dominated by BB). */
241 }
242
243 else if (dom == occ_bb)
244 {
245 /* OCC_BB dominates BB. Tail recurse to look deeper. */
246 insert_bb (new_occ, dom, &occ->children);
247 return;
248 }
249
250 else if (dom != idom)
251 {
252 gcc_assert (!dom->aux);
253
254 /* There is a dominator between IDOM and BB, add it and make
255 two children out of NEW_OCC and OCC. First, remove OCC from
256 its list. */
257 *p_occ = occ->next;
258 new_occ->next = occ;
259 occ->next = NULL;
260
261 /* None of the previous blocks has DOM as a dominator: if we tail
262 recursed, we would reexamine them uselessly. Just switch BB with
263 DOM, and go on looking for blocks dominated by DOM. */
264 new_occ = occ_new (dom, new_occ);
265 }
266
267 else
268 {
269 /* Nothing special, go on with the next element. */
270 p_occ = &occ->next;
271 }
272 }
273
274 /* No place was found as a child of IDOM. Make BB a sibling of IDOM. */
275 new_occ->next = *p_head;
276 *p_head = new_occ;
277 }
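/* A small worked example of the insertion logic above (the block names
   are hypothetical): suppose occ_head holds only BB4, which contains a
   division, and a new division is registered in BB6.  If the nearest
   common dominator of BB4 and BB6 is a block BB2 distinct from both and
   from IDOM, the "dom != idom" case fires: an occurrence for BB2 is
   created with BB6 and BB4 as its children, and BB2 replaces BB4 at the
   head of the list.  A later division registered in BB2 itself then
   simply reuses that occurrence via bb->aux.  */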
278
279 /* Register that we found a division in BB. */
280
281 static inline void
282 register_division_in (basic_block bb)
283 {
284 struct occurrence *occ;
285
286 occ = (struct occurrence *) bb->aux;
287 if (!occ)
288 {
289 occ = occ_new (bb, NULL);
290 insert_bb (occ, ENTRY_BLOCK_PTR_FOR_FN (cfun), &occ_head);
291 }
292
293 occ->bb_has_division = true;
294 occ->num_divisions++;
295 }
296
297
298 /* Compute the number of divisions that postdominate each block in OCC and
299 its children. */
300
301 static void
302 compute_merit (struct occurrence *occ)
303 {
304 struct occurrence *occ_child;
305 basic_block dom = occ->bb;
306
307 for (occ_child = occ->children; occ_child; occ_child = occ_child->next)
308 {
309 basic_block bb;
310 if (occ_child->children)
311 compute_merit (occ_child);
312
313 if (flag_exceptions)
314 bb = single_noncomplex_succ (dom);
315 else
316 bb = dom;
317
318 if (dominated_by_p (CDI_POST_DOMINATORS, bb, occ_child->bb))
319 occ->num_divisions += occ_child->num_divisions;
320 }
321 }
322
323
324 /* Return whether USE_STMT is a floating-point division by DEF. */
325 static inline bool
326 is_division_by (gimple *use_stmt, tree def)
327 {
328 return is_gimple_assign (use_stmt)
329 && gimple_assign_rhs_code (use_stmt) == RDIV_EXPR
330 && gimple_assign_rhs2 (use_stmt) == def
331 /* Do not recognize x / x as valid division, as we are getting
332 confused later by replacing all immediate uses of x in such
333 a stmt. */
334 && gimple_assign_rhs1 (use_stmt) != def;
335 }
336
337 /* Walk the subset of the dominator tree rooted at OCC, setting the
338 RECIP_DEF field to a definition of 1.0 / DEF that can be used in
339 the given basic block. The field may be left NULL, of course,
340 if it is not possible or profitable to do the optimization.
341
342 DEF_GSI is an iterator pointing at the statement defining DEF.
343 If RECIP_DEF is set, a dominator already has a computation that can
344 be used. */
345
346 static void
347 insert_reciprocals (gimple_stmt_iterator *def_gsi, struct occurrence *occ,
348 tree def, tree recip_def, int threshold)
349 {
350 tree type;
351 gassign *new_stmt;
352 gimple_stmt_iterator gsi;
353 struct occurrence *occ_child;
354
355 if (!recip_def
356 && (occ->bb_has_division || !flag_trapping_math)
357 && occ->num_divisions >= threshold)
358 {
359 /* Make a variable with the replacement and substitute it. */
360 type = TREE_TYPE (def);
361 recip_def = create_tmp_reg (type, "reciptmp");
362 new_stmt = gimple_build_assign (recip_def, RDIV_EXPR,
363 build_one_cst (type), def);
364
365 if (occ->bb_has_division)
366 {
367 /* Case 1: insert before an existing division. */
368 gsi = gsi_after_labels (occ->bb);
369 while (!gsi_end_p (gsi) && !is_division_by (gsi_stmt (gsi), def))
370 gsi_next (&gsi);
371
372 gsi_insert_before (&gsi, new_stmt, GSI_SAME_STMT);
373 }
374 else if (def_gsi && occ->bb == def_gsi->bb)
375 {
376 /* Case 2: insert right after the definition. Note that this will
377 never happen if the definition statement can throw, because in
378 that case the sole successor of the statement's basic block will
379 dominate all the uses as well. */
380 gsi_insert_after (def_gsi, new_stmt, GSI_NEW_STMT);
381 }
382 else
383 {
384 /* Case 3: insert in a basic block not containing defs/uses. */
385 gsi = gsi_after_labels (occ->bb);
386 gsi_insert_before (&gsi, new_stmt, GSI_SAME_STMT);
387 }
388
389 reciprocal_stats.rdivs_inserted++;
390
391 occ->recip_def_stmt = new_stmt;
392 }
393
394 occ->recip_def = recip_def;
395 for (occ_child = occ->children; occ_child; occ_child = occ_child->next)
396 insert_reciprocals (def_gsi, occ_child, def, recip_def, threshold);
397 }
398
399
400 /* Replace the division at USE_P with a multiplication by the reciprocal, if
401 possible. */
402
403 static inline void
404 replace_reciprocal (use_operand_p use_p)
405 {
406 gimple *use_stmt = USE_STMT (use_p);
407 basic_block bb = gimple_bb (use_stmt);
408 struct occurrence *occ = (struct occurrence *) bb->aux;
409
410 if (optimize_bb_for_speed_p (bb)
411 && occ->recip_def && use_stmt != occ->recip_def_stmt)
412 {
413 gimple_stmt_iterator gsi = gsi_for_stmt (use_stmt);
414 gimple_assign_set_rhs_code (use_stmt, MULT_EXPR);
415 SET_USE (use_p, occ->recip_def);
416 fold_stmt_inplace (&gsi);
417 update_stmt (use_stmt);
418 }
419 }
420
421
422 /* Free OCC and return one more "struct occurrence" to be freed. */
423
424 static struct occurrence *
425 free_bb (struct occurrence *occ)
426 {
427 struct occurrence *child, *next;
428
429 /* First get the two pointers hanging off OCC. */
430 next = occ->next;
431 child = occ->children;
432 occ->bb->aux = NULL;
433 occ_pool->remove (occ);
434
435 /* Now ensure that we don't recurse unless it is necessary. */
436 if (!child)
437 return next;
438 else
439 {
440 while (next)
441 next = free_bb (next);
442
443 return child;
444 }
445 }
446
447
448 /* Look for floating-point divisions among DEF's uses, and try to
449 replace them by multiplications with the reciprocal. Add
450 as many statements computing the reciprocal as needed.
451
452 DEF must be a GIMPLE register of a floating-point type. */
453
454 static void
455 execute_cse_reciprocals_1 (gimple_stmt_iterator *def_gsi, tree def)
456 {
457 use_operand_p use_p;
458 imm_use_iterator use_iter;
459 struct occurrence *occ;
460 int count = 0, threshold;
461
462 gcc_assert (FLOAT_TYPE_P (TREE_TYPE (def)) && is_gimple_reg (def));
463
464 FOR_EACH_IMM_USE_FAST (use_p, use_iter, def)
465 {
466 gimple *use_stmt = USE_STMT (use_p);
467 if (is_division_by (use_stmt, def))
468 {
469 register_division_in (gimple_bb (use_stmt));
470 count++;
471 }
472 }
473
474 /* Do the expensive part only if we can hope to optimize something. */
475 threshold = targetm.min_divisions_for_recip_mul (TYPE_MODE (TREE_TYPE (def)));
476 if (count >= threshold)
477 {
478 gimple *use_stmt;
479 for (occ = occ_head; occ; occ = occ->next)
480 {
481 compute_merit (occ);
482 insert_reciprocals (def_gsi, occ, def, NULL, threshold);
483 }
484
485 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, def)
486 {
487 if (is_division_by (use_stmt, def))
488 {
489 FOR_EACH_IMM_USE_ON_STMT (use_p, use_iter)
490 replace_reciprocal (use_p);
491 }
492 }
493 }
494
495 for (occ = occ_head; occ; )
496 occ = free_bb (occ);
497
498 occ_head = NULL;
499 }
500
501 /* Go through all the floating-point SSA_NAMEs, and call
502 execute_cse_reciprocals_1 on each of them. */
503 namespace {
504
505 const pass_data pass_data_cse_reciprocals =
506 {
507 GIMPLE_PASS, /* type */
508 "recip", /* name */
509 OPTGROUP_NONE, /* optinfo_flags */
510 TV_NONE, /* tv_id */
511 PROP_ssa, /* properties_required */
512 0, /* properties_provided */
513 0, /* properties_destroyed */
514 0, /* todo_flags_start */
515 TODO_update_ssa, /* todo_flags_finish */
516 };
517
518 class pass_cse_reciprocals : public gimple_opt_pass
519 {
520 public:
521 pass_cse_reciprocals (gcc::context *ctxt)
522 : gimple_opt_pass (pass_data_cse_reciprocals, ctxt)
523 {}
524
525 /* opt_pass methods: */
526 virtual bool gate (function *) { return optimize && flag_reciprocal_math; }
527 virtual unsigned int execute (function *);
528
529 }; // class pass_cse_reciprocals
530
531 unsigned int
532 pass_cse_reciprocals::execute (function *fun)
533 {
534 basic_block bb;
535 tree arg;
536
537 occ_pool = new object_allocator<occurrence> ("dominators for recip");
538
539 memset (&reciprocal_stats, 0, sizeof (reciprocal_stats));
540 calculate_dominance_info (CDI_DOMINATORS);
541 calculate_dominance_info (CDI_POST_DOMINATORS);
542
543 if (flag_checking)
544 FOR_EACH_BB_FN (bb, fun)
545 gcc_assert (!bb->aux);
546
547 for (arg = DECL_ARGUMENTS (fun->decl); arg; arg = DECL_CHAIN (arg))
548 if (FLOAT_TYPE_P (TREE_TYPE (arg))
549 && is_gimple_reg (arg))
550 {
551 tree name = ssa_default_def (fun, arg);
552 if (name)
553 execute_cse_reciprocals_1 (NULL, name);
554 }
555
556 FOR_EACH_BB_FN (bb, fun)
557 {
558 tree def;
559
560 for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
561 gsi_next (&gsi))
562 {
563 gphi *phi = gsi.phi ();
564 def = PHI_RESULT (phi);
565 if (! virtual_operand_p (def)
566 && FLOAT_TYPE_P (TREE_TYPE (def)))
567 execute_cse_reciprocals_1 (NULL, def);
568 }
569
570 for (gimple_stmt_iterator gsi = gsi_after_labels (bb); !gsi_end_p (gsi);
571 gsi_next (&gsi))
572 {
573 gimple *stmt = gsi_stmt (gsi);
574
575 if (gimple_has_lhs (stmt)
576 && (def = SINGLE_SSA_TREE_OPERAND (stmt, SSA_OP_DEF)) != NULL
577 && FLOAT_TYPE_P (TREE_TYPE (def))
578 && TREE_CODE (def) == SSA_NAME)
579 execute_cse_reciprocals_1 (&gsi, def);
580 }
581
582 if (optimize_bb_for_size_p (bb))
583 continue;
584
585 /* Scan for a/func(b) and convert it to reciprocal a*rfunc(b). */
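      /* For example (a sketch; whether this triggers depends on what the
         target's targetm.builtin_reciprocal hook returns for sqrtf):

             t_1 = sqrtf (b_0);
             x_2 = a_3 / t_1;

         is rewritten so that t_1 is defined by the target's reciprocal
         square root builtin and the division becomes x_2 = a_3 * t_1.  */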
586 for (gimple_stmt_iterator gsi = gsi_after_labels (bb); !gsi_end_p (gsi);
587 gsi_next (&gsi))
588 {
589 gimple *stmt = gsi_stmt (gsi);
590 tree fndecl;
591
592 if (is_gimple_assign (stmt)
593 && gimple_assign_rhs_code (stmt) == RDIV_EXPR)
594 {
595 tree arg1 = gimple_assign_rhs2 (stmt);
596 gimple *stmt1;
597
598 if (TREE_CODE (arg1) != SSA_NAME)
599 continue;
600
601 stmt1 = SSA_NAME_DEF_STMT (arg1);
602
603 if (is_gimple_call (stmt1)
604 && gimple_call_lhs (stmt1)
605 && (fndecl = gimple_call_fndecl (stmt1))
606 && (DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_NORMAL
607 || DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD))
608 {
609 enum built_in_function code;
610 bool md_code, fail;
611 imm_use_iterator ui;
612 use_operand_p use_p;
613
614 code = DECL_FUNCTION_CODE (fndecl);
615 md_code = DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD;
616
617 fndecl = targetm.builtin_reciprocal (code, md_code, false);
618 if (!fndecl)
619 continue;
620
621 /* Check that all uses of the SSA name are divisions,
622 otherwise replacing the defining statement will do
623 the wrong thing. */
624 fail = false;
625 FOR_EACH_IMM_USE_FAST (use_p, ui, arg1)
626 {
627 gimple *stmt2 = USE_STMT (use_p);
628 if (is_gimple_debug (stmt2))
629 continue;
630 if (!is_gimple_assign (stmt2)
631 || gimple_assign_rhs_code (stmt2) != RDIV_EXPR
632 || gimple_assign_rhs1 (stmt2) == arg1
633 || gimple_assign_rhs2 (stmt2) != arg1)
634 {
635 fail = true;
636 break;
637 }
638 }
639 if (fail)
640 continue;
641
642 gimple_replace_ssa_lhs (stmt1, arg1);
643 gimple_call_set_fndecl (stmt1, fndecl);
644 update_stmt (stmt1);
645 reciprocal_stats.rfuncs_inserted++;
646
647 FOR_EACH_IMM_USE_STMT (stmt, ui, arg1)
648 {
649 gimple_stmt_iterator gsi = gsi_for_stmt (stmt);
650 gimple_assign_set_rhs_code (stmt, MULT_EXPR);
651 fold_stmt_inplace (&gsi);
652 update_stmt (stmt);
653 }
654 }
655 }
656 }
657 }
658
659 statistics_counter_event (fun, "reciprocal divs inserted",
660 reciprocal_stats.rdivs_inserted);
661 statistics_counter_event (fun, "reciprocal functions inserted",
662 reciprocal_stats.rfuncs_inserted);
663
664 free_dominance_info (CDI_DOMINATORS);
665 free_dominance_info (CDI_POST_DOMINATORS);
666 delete occ_pool;
667 return 0;
668 }
669
670 } // anon namespace
671
672 gimple_opt_pass *
673 make_pass_cse_reciprocals (gcc::context *ctxt)
674 {
675 return new pass_cse_reciprocals (ctxt);
676 }
677
678 /* Records an occurrence at statement USE_STMT in the vector of
679 statements STMTS if it is dominated by *TOP_BB, dominates it, or if
680 *TOP_BB is not yet initialized. Returns true if the occurrence was pushed on
681 the vector. Adjusts *TOP_BB to be the basic block dominating all
682 statements in the vector. */
683
684 static bool
685 maybe_record_sincos (vec<gimple *> *stmts,
686 basic_block *top_bb, gimple *use_stmt)
687 {
688 basic_block use_bb = gimple_bb (use_stmt);
689 if (*top_bb
690 && (*top_bb == use_bb
691 || dominated_by_p (CDI_DOMINATORS, use_bb, *top_bb)))
692 stmts->safe_push (use_stmt);
693 else if (!*top_bb
694 || dominated_by_p (CDI_DOMINATORS, *top_bb, use_bb))
695 {
696 stmts->safe_push (use_stmt);
697 *top_bb = use_bb;
698 }
699 else
700 return false;
701
702 return true;
703 }
704
705 /* Look for sin, cos and cexpi calls with the same argument NAME and
706 create a single call to cexpi CSEing the result in this case.
707 We first walk over all immediate uses of the argument collecting
708 statements that we can CSE in a vector and in a second pass replace
709 the statement rhs with a REALPART or IMAGPART expression on the
710 result of the cexpi call we insert before the use statement that
711 dominates all other candidates. */
712
713 static bool
714 execute_cse_sincos_1 (tree name)
715 {
716 gimple_stmt_iterator gsi;
717 imm_use_iterator use_iter;
718 tree fndecl, res, type;
719 gimple *def_stmt, *use_stmt, *stmt;
720 int seen_cos = 0, seen_sin = 0, seen_cexpi = 0;
721 auto_vec<gimple *> stmts;
722 basic_block top_bb = NULL;
723 int i;
724 bool cfg_changed = false;
725
726 type = TREE_TYPE (name);
727 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, name)
728 {
729 if (gimple_code (use_stmt) != GIMPLE_CALL
730 || !gimple_call_lhs (use_stmt)
731 || !(fndecl = gimple_call_fndecl (use_stmt))
732 || DECL_BUILT_IN_CLASS (fndecl) != BUILT_IN_NORMAL)
733 continue;
734
735 switch (DECL_FUNCTION_CODE (fndecl))
736 {
737 CASE_FLT_FN (BUILT_IN_COS):
738 seen_cos |= maybe_record_sincos (&stmts, &top_bb, use_stmt) ? 1 : 0;
739 break;
740
741 CASE_FLT_FN (BUILT_IN_SIN):
742 seen_sin |= maybe_record_sincos (&stmts, &top_bb, use_stmt) ? 1 : 0;
743 break;
744
745 CASE_FLT_FN (BUILT_IN_CEXPI):
746 seen_cexpi |= maybe_record_sincos (&stmts, &top_bb, use_stmt) ? 1 : 0;
747 break;
748
749 default:;
750 }
751 }
752
753 if (seen_cos + seen_sin + seen_cexpi <= 1)
754 return false;
755
756 /* Simply insert cexpi at the beginning of top_bb but not earlier than
757 the name def statement. */
758 fndecl = mathfn_built_in (type, BUILT_IN_CEXPI);
759 if (!fndecl)
760 return false;
761 stmt = gimple_build_call (fndecl, 1, name);
762 res = make_temp_ssa_name (TREE_TYPE (TREE_TYPE (fndecl)), stmt, "sincostmp");
763 gimple_call_set_lhs (stmt, res);
764
765 def_stmt = SSA_NAME_DEF_STMT (name);
766 if (!SSA_NAME_IS_DEFAULT_DEF (name)
767 && gimple_code (def_stmt) != GIMPLE_PHI
768 && gimple_bb (def_stmt) == top_bb)
769 {
770 gsi = gsi_for_stmt (def_stmt);
771 gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
772 }
773 else
774 {
775 gsi = gsi_after_labels (top_bb);
776 gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
777 }
778 sincos_stats.inserted++;
779
780 /* And adjust the recorded old call sites. */
781 for (i = 0; stmts.iterate (i, &use_stmt); ++i)
782 {
783 tree rhs = NULL;
784 fndecl = gimple_call_fndecl (use_stmt);
785
786 switch (DECL_FUNCTION_CODE (fndecl))
787 {
788 CASE_FLT_FN (BUILT_IN_COS):
789 rhs = fold_build1 (REALPART_EXPR, type, res);
790 break;
791
792 CASE_FLT_FN (BUILT_IN_SIN):
793 rhs = fold_build1 (IMAGPART_EXPR, type, res);
794 break;
795
796 CASE_FLT_FN (BUILT_IN_CEXPI):
797 rhs = res;
798 break;
799
800 default:;
801 gcc_unreachable ();
802 }
803
804 /* Replace call with a copy. */
805 stmt = gimple_build_assign (gimple_call_lhs (use_stmt), rhs);
806
807 gsi = gsi_for_stmt (use_stmt);
808 gsi_replace (&gsi, stmt, true);
809 if (gimple_purge_dead_eh_edges (gimple_bb (stmt)))
810 cfg_changed = true;
811 }
812
813 return cfg_changed;
814 }
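/* For illustration, a sketch of the transformation performed above
   (the SSA names are made up):

       s_1 = sinf (x_0);                sincostmp_3 = __builtin_cexpif (x_0);
       c_2 = cosf (x_0);        -->     s_1 = IMAGPART_EXPR <sincostmp_3>;
                                        c_2 = REALPART_EXPR <sincostmp_3>;

   The cexpi call is inserted at the start of the block that dominates
   all the recorded uses (but not before the definition of x_0), and each
   old call is replaced by a copy of the real or imaginary part of its
   result.  */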
815
816 /* To evaluate powi(x,n), the floating point value x raised to the
817 constant integer exponent n, we use a hybrid algorithm that
818 combines the "window method" with look-up tables. For an
819 introduction to exponentiation algorithms and "addition chains",
820 see section 4.6.3, "Evaluation of Powers" of Donald E. Knuth,
821 "Seminumerical Algorithms", Vol. 2, "The Art of Computer Programming",
822 3rd Edition, 1998, and Daniel M. Gordon, "A Survey of Fast Exponentiation
823 Methods", Journal of Algorithms, Vol. 27, pp. 129-146, 1998. */
824
825 /* Provide a default value for POWI_MAX_MULTS, the maximum number of
826 multiplications to inline before calling the system library's pow
827 function. powi(x,n) requires at worst 2*bits(n)-2 multiplications,
828 so this default never requires calling pow, powf or powl. */
829
830 #ifndef POWI_MAX_MULTS
831 #define POWI_MAX_MULTS (2*HOST_BITS_PER_WIDE_INT-2)
832 #endif
833
834 /* The size of the "optimal power tree" lookup table. All
835 exponents less than this value are simply looked up in the
836 powi_table below. This threshold is also used to size the
837 cache of pseudo registers that hold intermediate results. */
838 #define POWI_TABLE_SIZE 256
839
840 /* The size, in bits of the window, used in the "window method"
841 exponentiation algorithm. This is equivalent to a radix of
842 (1<<POWI_WINDOW_SIZE) in the corresponding "m-ary method". */
843 #define POWI_WINDOW_SIZE 3
844
845 /* The following table is an efficient representation of an
846 "optimal power tree". For each value, i, the corresponding
847 value, j, in the table states that an optimal evaluation
848 sequence for calculating pow(x,i) can be found by evaluating
849 pow(x,j)*pow(x,i-j). An optimal power tree for the first
850 100 integers is given in Knuth's "Seminumerical algorithms". */
851
852 static const unsigned char powi_table[POWI_TABLE_SIZE] =
853 {
854 0, 1, 1, 2, 2, 3, 3, 4, /* 0 - 7 */
855 4, 6, 5, 6, 6, 10, 7, 9, /* 8 - 15 */
856 8, 16, 9, 16, 10, 12, 11, 13, /* 16 - 23 */
857 12, 17, 13, 18, 14, 24, 15, 26, /* 24 - 31 */
858 16, 17, 17, 19, 18, 33, 19, 26, /* 32 - 39 */
859 20, 25, 21, 40, 22, 27, 23, 44, /* 40 - 47 */
860 24, 32, 25, 34, 26, 29, 27, 44, /* 48 - 55 */
861 28, 31, 29, 34, 30, 60, 31, 36, /* 56 - 63 */
862 32, 64, 33, 34, 34, 46, 35, 37, /* 64 - 71 */
863 36, 65, 37, 50, 38, 48, 39, 69, /* 72 - 79 */
864 40, 49, 41, 43, 42, 51, 43, 58, /* 80 - 87 */
865 44, 64, 45, 47, 46, 59, 47, 76, /* 88 - 95 */
866 48, 65, 49, 66, 50, 67, 51, 66, /* 96 - 103 */
867 52, 70, 53, 74, 54, 104, 55, 74, /* 104 - 111 */
868 56, 64, 57, 69, 58, 78, 59, 68, /* 112 - 119 */
869 60, 61, 61, 80, 62, 75, 63, 68, /* 120 - 127 */
870 64, 65, 65, 128, 66, 129, 67, 90, /* 128 - 135 */
871 68, 73, 69, 131, 70, 94, 71, 88, /* 136 - 143 */
872 72, 128, 73, 98, 74, 132, 75, 121, /* 144 - 151 */
873 76, 102, 77, 124, 78, 132, 79, 106, /* 152 - 159 */
874 80, 97, 81, 160, 82, 99, 83, 134, /* 160 - 167 */
875 84, 86, 85, 95, 86, 160, 87, 100, /* 168 - 175 */
876 88, 113, 89, 98, 90, 107, 91, 122, /* 176 - 183 */
877 92, 111, 93, 102, 94, 126, 95, 150, /* 184 - 191 */
878 96, 128, 97, 130, 98, 133, 99, 195, /* 192 - 199 */
879 100, 128, 101, 123, 102, 164, 103, 138, /* 200 - 207 */
880 104, 145, 105, 146, 106, 109, 107, 149, /* 208 - 215 */
881 108, 200, 109, 146, 110, 170, 111, 157, /* 216 - 223 */
882 112, 128, 113, 130, 114, 182, 115, 132, /* 224 - 231 */
883 116, 200, 117, 132, 118, 158, 119, 206, /* 232 - 239 */
884 120, 240, 121, 162, 122, 147, 123, 152, /* 240 - 247 */
885 124, 166, 125, 214, 126, 138, 127, 153, /* 248 - 255 */
886 };
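/* A worked example of how the table drives the expansion (the entries
   are read off the table above): powi_table[15] == 9, so x**15 is
   computed as x**9 * x**6; powi_table[9] == 6 and powi_table[6] == 3
   give x**9 == x**6 * x**3 and x**6 == x**3 * x**3; finally
   powi_table[3] == 2 and powi_table[2] == 1 give x**3 == x**2 * x and
   x**2 == x * x.  Since x**2, x**3 and x**6 are each computed once and
   reused, the whole expansion costs 5 multiplications, one fewer than
   the naive square-and-multiply method needs for x**15.  */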
887
888
889 /* Return the number of multiplications required to calculate
890 powi(x,n) where n is less than POWI_TABLE_SIZE. This is a
891 subroutine of powi_cost. CACHE is an array indicating
892 which exponents have already been calculated. */
893
894 static int
895 powi_lookup_cost (unsigned HOST_WIDE_INT n, bool *cache)
896 {
897 /* If we've already calculated this exponent, then this evaluation
898 doesn't require any additional multiplications. */
899 if (cache[n])
900 return 0;
901
902 cache[n] = true;
903 return powi_lookup_cost (n - powi_table[n], cache)
904 + powi_lookup_cost (powi_table[n], cache) + 1;
905 }
906
907 /* Return the number of multiplications required to calculate
908 powi(x,n) for an arbitrary x, given the exponent N. This
909 function needs to be kept in sync with powi_as_mults below. */
910
911 static int
912 powi_cost (HOST_WIDE_INT n)
913 {
914 bool cache[POWI_TABLE_SIZE];
915 unsigned HOST_WIDE_INT digit;
916 unsigned HOST_WIDE_INT val;
917 int result;
918
919 if (n == 0)
920 return 0;
921
922 /* Ignore the reciprocal when calculating the cost. */
923 val = (n < 0) ? -n : n;
924
925 /* Initialize the exponent cache. */
926 memset (cache, 0, POWI_TABLE_SIZE * sizeof (bool));
927 cache[1] = true;
928
929 result = 0;
930
931 while (val >= POWI_TABLE_SIZE)
932 {
933 if (val & 1)
934 {
935 digit = val & ((1 << POWI_WINDOW_SIZE) - 1);
936 result += powi_lookup_cost (digit, cache)
937 + POWI_WINDOW_SIZE + 1;
938 val >>= POWI_WINDOW_SIZE;
939 }
940 else
941 {
942 val >>= 1;
943 result++;
944 }
945 }
946
947 return result + powi_lookup_cost (val, cache);
948 }
949
950 /* Recursive subroutine of powi_as_mults. This function takes the
951 array, CACHE, of already calculated exponents and an exponent N and
952 returns a tree that corresponds to CACHE[1]**N, with type TYPE. */
953
954 static tree
955 powi_as_mults_1 (gimple_stmt_iterator *gsi, location_t loc, tree type,
956 HOST_WIDE_INT n, tree *cache)
957 {
958 tree op0, op1, ssa_target;
959 unsigned HOST_WIDE_INT digit;
960 gassign *mult_stmt;
961
962 if (n < POWI_TABLE_SIZE && cache[n])
963 return cache[n];
964
965 ssa_target = make_temp_ssa_name (type, NULL, "powmult");
966
967 if (n < POWI_TABLE_SIZE)
968 {
969 cache[n] = ssa_target;
970 op0 = powi_as_mults_1 (gsi, loc, type, n - powi_table[n], cache);
971 op1 = powi_as_mults_1 (gsi, loc, type, powi_table[n], cache);
972 }
973 else if (n & 1)
974 {
975 digit = n & ((1 << POWI_WINDOW_SIZE) - 1);
976 op0 = powi_as_mults_1 (gsi, loc, type, n - digit, cache);
977 op1 = powi_as_mults_1 (gsi, loc, type, digit, cache);
978 }
979 else
980 {
981 op0 = powi_as_mults_1 (gsi, loc, type, n >> 1, cache);
982 op1 = op0;
983 }
984
985 mult_stmt = gimple_build_assign (ssa_target, MULT_EXPR, op0, op1);
986 gimple_set_location (mult_stmt, loc);
987 gsi_insert_before (gsi, mult_stmt, GSI_SAME_STMT);
988
989 return ssa_target;
990 }
991
992 /* Convert ARG0**N to a tree of multiplications of ARG0 with itself.
993 This function needs to be kept in sync with powi_cost above. */
994
995 static tree
996 powi_as_mults (gimple_stmt_iterator *gsi, location_t loc,
997 tree arg0, HOST_WIDE_INT n)
998 {
999 tree cache[POWI_TABLE_SIZE], result, type = TREE_TYPE (arg0);
1000 gassign *div_stmt;
1001 tree target;
1002
1003 if (n == 0)
1004 return build_real (type, dconst1);
1005
1006 memset (cache, 0, sizeof (cache));
1007 cache[1] = arg0;
1008
1009 result = powi_as_mults_1 (gsi, loc, type, (n < 0) ? -n : n, cache);
1010 if (n >= 0)
1011 return result;
1012
1013 /* If the original exponent was negative, reciprocate the result. */
1014 target = make_temp_ssa_name (type, NULL, "powmult");
1015 div_stmt = gimple_build_assign (target, RDIV_EXPR,
1016 build_real (type, dconst1), result);
1017 gimple_set_location (div_stmt, loc);
1018 gsi_insert_before (gsi, div_stmt, GSI_SAME_STMT);
1019
1020 return target;
1021 }
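/* For instance, powi (x, 5) is expanded roughly as follows (the SSA
   names are illustrative only):

       powmult_1 = x * x;                   // x**2, cached
       powmult_2 = x * powmult_1;           // x**3
       powmult_3 = powmult_1 * powmult_2;   // x**5

   and powi (x, -5) additionally emits powmult_4 = 1.0 / powmult_3.
   This matches powi_cost (5) == 3.  */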
1022
1023 /* ARG0 and N are the two arguments to a powi builtin in GSI with
1024 location info LOC. If the arguments are appropriate, create an
1025 equivalent sequence of statements prior to GSI using an optimal
1026 number of multiplications, and return an expression holding the
1027 result. */
1028
1029 static tree
1030 gimple_expand_builtin_powi (gimple_stmt_iterator *gsi, location_t loc,
1031 tree arg0, HOST_WIDE_INT n)
1032 {
1033 /* Avoid largest negative number. */
1034 if (n != -n
1035 && ((n >= -1 && n <= 2)
1036 || (optimize_function_for_speed_p (cfun)
1037 && powi_cost (n) <= POWI_MAX_MULTS)))
1038 return powi_as_mults (gsi, loc, arg0, n);
1039
1040 return NULL_TREE;
1041 }
1042
1043 /* Build a gimple call statement that calls FN with argument ARG.
1044 Set the lhs of the call statement to a fresh SSA name. Insert the
1045 statement prior to GSI's current position, and return the fresh
1046 SSA name. */
1047
1048 static tree
1049 build_and_insert_call (gimple_stmt_iterator *gsi, location_t loc,
1050 tree fn, tree arg)
1051 {
1052 gcall *call_stmt;
1053 tree ssa_target;
1054
1055 call_stmt = gimple_build_call (fn, 1, arg);
1056 ssa_target = make_temp_ssa_name (TREE_TYPE (arg), NULL, "powroot");
1057 gimple_set_lhs (call_stmt, ssa_target);
1058 gimple_set_location (call_stmt, loc);
1059 gsi_insert_before (gsi, call_stmt, GSI_SAME_STMT);
1060
1061 return ssa_target;
1062 }
1063
1064 /* Build a gimple binary operation with the given CODE and arguments
1065 ARG0, ARG1, assigning the result to a new SSA name for variable
1066 TARGET. Insert the statement prior to GSI's current position, and
1067 return the fresh SSA name. */
1068
1069 static tree
1070 build_and_insert_binop (gimple_stmt_iterator *gsi, location_t loc,
1071 const char *name, enum tree_code code,
1072 tree arg0, tree arg1)
1073 {
1074 tree result = make_temp_ssa_name (TREE_TYPE (arg0), NULL, name);
1075 gassign *stmt = gimple_build_assign (result, code, arg0, arg1);
1076 gimple_set_location (stmt, loc);
1077 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
1078 return result;
1079 }
1080
1081 /* Build a gimple reference operation with the given CODE and argument
1082 ARG, assigning the result to a new SSA name of TYPE with NAME.
1083 Insert the statement prior to GSI's current position, and return
1084 the fresh SSA name. */
1085
1086 static inline tree
1087 build_and_insert_ref (gimple_stmt_iterator *gsi, location_t loc, tree type,
1088 const char *name, enum tree_code code, tree arg0)
1089 {
1090 tree result = make_temp_ssa_name (type, NULL, name);
1091 gimple *stmt = gimple_build_assign (result, build1 (code, type, arg0));
1092 gimple_set_location (stmt, loc);
1093 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
1094 return result;
1095 }
1096
1097 /* Build a gimple assignment to cast VAL to TYPE. Insert the statement
1098 prior to GSI's current position, and return the fresh SSA name. */
1099
1100 static tree
1101 build_and_insert_cast (gimple_stmt_iterator *gsi, location_t loc,
1102 tree type, tree val)
1103 {
1104 tree result = make_ssa_name (type);
1105 gassign *stmt = gimple_build_assign (result, NOP_EXPR, val);
1106 gimple_set_location (stmt, loc);
1107 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
1108 return result;
1109 }
1110
1111 struct pow_synth_sqrt_info
1112 {
1113 bool *factors;
1114 unsigned int deepest;
1115 unsigned int num_mults;
1116 };
1117
1118 /* Return true iff the real value C can be represented as a
1119 sum of powers of 0.5 up to N. That is:
1120 C == SUM<i from 1..N> (a[i]*(0.5**i)) where a[i] is either 0 or 1.
1121 Record in INFO the various parameters of the synthesis algorithm such
1122 as the factors a[i], the maximum 0.5 power and the number of
1123 multiplications that will be required. */
1124
1125 bool
1126 representable_as_half_series_p (REAL_VALUE_TYPE c, unsigned n,
1127 struct pow_synth_sqrt_info *info)
1128 {
1129 REAL_VALUE_TYPE factor = dconsthalf;
1130 REAL_VALUE_TYPE remainder = c;
1131
1132 info->deepest = 0;
1133 info->num_mults = 0;
1134 memset (info->factors, 0, n * sizeof (bool));
1135
1136 for (unsigned i = 0; i < n; i++)
1137 {
1138 REAL_VALUE_TYPE res;
1139
1140 /* If something inexact happened bail out now. */
1141 if (real_arithmetic (&res, MINUS_EXPR, &remainder, &factor))
1142 return false;
1143
1144 /* We have hit zero. The number is representable as a sum
1145 of powers of 0.5. */
1146 if (real_equal (&res, &dconst0))
1147 {
1148 info->factors[i] = true;
1149 info->deepest = i + 1;
1150 return true;
1151 }
1152 else if (!REAL_VALUE_NEGATIVE (res))
1153 {
1154 remainder = res;
1155 info->factors[i] = true;
1156 info->num_mults++;
1157 }
1158 else
1159 info->factors[i] = false;
1160
1161 real_arithmetic (&factor, MULT_EXPR, &factor, &dconsthalf);
1162 }
1163 return false;
1164 }
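/* Worked example of the loop above: for C == 0.625 and N >= 3 the
   iterations give 0.625 - 0.5 == 0.125 (factor 1 set), 0.125 - 0.25 < 0
   (factor 2 clear) and 0.125 - 0.125 == 0 (factor 3 set), so
   C == 0.5**1 + 0.5**3 with INFO->deepest == 3 and INFO->num_mults == 1;
   pow (x, 0.625) can then be synthesized as
   sqrt (x) * sqrt (sqrt (sqrt (x))).  */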
1165
1166 /* Return the tree corresponding to FN being applied
1167 to ARG N times at GSI and LOC.
1168 Look up previous results from CACHE if need be.
1169 cache[0] should contain just plain ARG i.e. FN applied to ARG 0 times. */
1170
1171 static tree
1172 get_fn_chain (tree arg, unsigned int n, gimple_stmt_iterator *gsi,
1173 tree fn, location_t loc, tree *cache)
1174 {
1175 tree res = cache[n];
1176 if (!res)
1177 {
1178 tree prev = get_fn_chain (arg, n - 1, gsi, fn, loc, cache);
1179 res = build_and_insert_call (gsi, loc, fn, prev);
1180 cache[n] = res;
1181 }
1182
1183 return res;
1184 }
1185
1186 /* Print to STREAM the repeated application of function FNAME to ARG
1187 N times. So, for FNAME = "foo", ARG = "x", N = 2 it would print:
1188 "foo (foo (x))". */
1189
1190 static void
1191 print_nested_fn (FILE* stream, const char *fname, const char* arg,
1192 unsigned int n)
1193 {
1194 if (n == 0)
1195 fprintf (stream, "%s", arg);
1196 else
1197 {
1198 fprintf (stream, "%s (", fname);
1199 print_nested_fn (stream, fname, arg, n - 1);
1200 fprintf (stream, ")");
1201 }
1202 }
1203
1204 /* Print to STREAM the fractional sequence of sqrt chains
1205 applied to ARG, described by INFO. Used for the dump file. */
1206
1207 static void
1208 dump_fractional_sqrt_sequence (FILE *stream, const char *arg,
1209 struct pow_synth_sqrt_info *info)
1210 {
1211 for (unsigned int i = 0; i < info->deepest; i++)
1212 {
1213 bool is_set = info->factors[i];
1214 if (is_set)
1215 {
1216 print_nested_fn (stream, "sqrt", arg, i + 1);
1217 if (i != info->deepest - 1)
1218 fprintf (stream, " * ");
1219 }
1220 }
1221 }
1222
1223 /* Print to STREAM a representation of raising ARG to an integer
1224 power N. Used for the dump file. */
1225
1226 static void
1227 dump_integer_part (FILE *stream, const char* arg, HOST_WIDE_INT n)
1228 {
1229 if (n > 1)
1230 fprintf (stream, "powi (%s, " HOST_WIDE_INT_PRINT_DEC ")", arg, n);
1231 else if (n == 1)
1232 fprintf (stream, "%s", arg);
1233 }
1234
1235 /* Attempt to synthesize a POW[F] (ARG0, ARG1) call using chains of
1236 square roots. Place at GSI and LOC. Limit the maximum depth
1237 of the sqrt chains to MAX_DEPTH. Return the tree holding the
1238 result of the expanded sequence or NULL_TREE if the expansion failed.
1239
1240 This routine assumes that ARG1 is a real number with a fractional part
1241 (the integer exponent case will have been handled earlier in
1242 gimple_expand_builtin_pow).
1243
1244 For ARG1 > 0.0:
1245 * For ARG1 composed of a whole part WHOLE_PART and a fractional part
1246 FRAC_PART i.e. WHOLE_PART == floor (ARG1) and
1247 FRAC_PART == ARG1 - WHOLE_PART:
1248 Produce POWI (ARG0, WHOLE_PART) * POW (ARG0, FRAC_PART) where
1249 POW (ARG0, FRAC_PART) is expanded as a product of square root chains
1250 if it can be expressed as such, that is if FRAC_PART satisfies:
1251 FRAC_PART == <SUM from i = 1 until MAX_DEPTH> (a[i] * (0.5**i))
1252 where integer a[i] is either 0 or 1.
1253
1254 Example:
1255 POW (x, 3.625) == POWI (x, 3) * POW (x, 0.625)
1256 --> POWI (x, 3) * SQRT (x) * SQRT (SQRT (SQRT (x)))
1257
1258 For ARG1 < 0.0 there are two approaches:
1259 * (A) Expand to 1.0 / POW (ARG0, -ARG1) where POW (ARG0, -ARG1)
1260 is calculated as above.
1261
1262 Example:
1263 POW (x, -5.625) == 1.0 / POW (x, 5.625)
1264 --> 1.0 / (POWI (x, 5) * SQRT (x) * SQRT (SQRT (SQRT (x))))
1265
1266 * (B) : WHOLE_PART := - ceil (abs (ARG1))
1267 FRAC_PART := ARG1 - WHOLE_PART
1268 and expand to POW (x, FRAC_PART) / POWI (x, WHOLE_PART).
1269 Example:
1270 POW (x, -5.875) == POW (x, 0.125) / POWI (X, 6)
1271 --> SQRT (SQRT (SQRT (x))) / (POWI (x, 6))
1272
1273 For ARG1 < 0.0 we choose between (A) and (B) depending on
1274 how many multiplications we'd have to do.
1275 So, for the example in (B): POW (x, -5.875), if we were to
1276 follow algorithm (A) we would produce:
1277 1.0 / POWI (X, 5) * SQRT (X) * SQRT (SQRT (X)) * SQRT (SQRT (SQRT (X)))
1278 which contains more multiplications than approach (B).
1279
1280 Hopefully, this approach will eliminate potentially expensive POW library
1281 calls when unsafe floating point math is enabled and allow the compiler to
1282 further optimise the multiplies, square roots and divides produced by this
1283 function. */
1284
1285 static tree
1286 expand_pow_as_sqrts (gimple_stmt_iterator *gsi, location_t loc,
1287 tree arg0, tree arg1, HOST_WIDE_INT max_depth)
1288 {
1289 tree type = TREE_TYPE (arg0);
1290 machine_mode mode = TYPE_MODE (type);
1291 tree sqrtfn = mathfn_built_in (type, BUILT_IN_SQRT);
1292 bool one_over = true;
1293
1294 if (!sqrtfn)
1295 return NULL_TREE;
1296
1297 if (TREE_CODE (arg1) != REAL_CST)
1298 return NULL_TREE;
1299
1300 REAL_VALUE_TYPE exp_init = TREE_REAL_CST (arg1);
1301
1302 gcc_assert (max_depth > 0);
1303 tree *cache = XALLOCAVEC (tree, max_depth + 1);
1304
1305 struct pow_synth_sqrt_info synth_info;
1306 synth_info.factors = XALLOCAVEC (bool, max_depth + 1);
1307 synth_info.deepest = 0;
1308 synth_info.num_mults = 0;
1309
1310 bool neg_exp = REAL_VALUE_NEGATIVE (exp_init);
1311 REAL_VALUE_TYPE exp = real_value_abs (&exp_init);
1312
1313 /* The whole and fractional parts of exp. */
1314 REAL_VALUE_TYPE whole_part;
1315 REAL_VALUE_TYPE frac_part;
1316
1317 real_floor (&whole_part, mode, &exp);
1318 real_arithmetic (&frac_part, MINUS_EXPR, &exp, &whole_part);
1319
1320
1321 REAL_VALUE_TYPE ceil_whole = dconst0;
1322 REAL_VALUE_TYPE ceil_fract = dconst0;
1323
1324 if (neg_exp)
1325 {
1326 real_ceil (&ceil_whole, mode, &exp);
1327 real_arithmetic (&ceil_fract, MINUS_EXPR, &ceil_whole, &exp);
1328 }
1329
1330 if (!representable_as_half_series_p (frac_part, max_depth, &synth_info))
1331 return NULL_TREE;
1332
1333 /* Check whether it's more profitable to not use 1.0 / ... */
1334 if (neg_exp)
1335 {
1336 struct pow_synth_sqrt_info alt_synth_info;
1337 alt_synth_info.factors = XALLOCAVEC (bool, max_depth + 1);
1338 alt_synth_info.deepest = 0;
1339 alt_synth_info.num_mults = 0;
1340
1341 if (representable_as_half_series_p (ceil_fract, max_depth,
1342 &alt_synth_info)
1343 && alt_synth_info.deepest <= synth_info.deepest
1344 && alt_synth_info.num_mults < synth_info.num_mults)
1345 {
1346 whole_part = ceil_whole;
1347 frac_part = ceil_fract;
1348 synth_info.deepest = alt_synth_info.deepest;
1349 synth_info.num_mults = alt_synth_info.num_mults;
1350 memcpy (synth_info.factors, alt_synth_info.factors,
1351 (max_depth + 1) * sizeof (bool));
1352 one_over = false;
1353 }
1354 }
1355
1356 HOST_WIDE_INT n = real_to_integer (&whole_part);
1357 REAL_VALUE_TYPE cint;
1358 real_from_integer (&cint, VOIDmode, n, SIGNED);
1359
1360 if (!real_identical (&whole_part, &cint))
1361 return NULL_TREE;
1362
1363 if (powi_cost (n) + synth_info.num_mults > POWI_MAX_MULTS)
1364 return NULL_TREE;
1365
1366 memset (cache, 0, (max_depth + 1) * sizeof (tree));
1367
1368 tree integer_res = n == 0 ? build_real (type, dconst1) : arg0;
1369
1370 /* Calculate the integer part of the exponent. */
1371 if (n > 1)
1372 {
1373 integer_res = gimple_expand_builtin_powi (gsi, loc, arg0, n);
1374 if (!integer_res)
1375 return NULL_TREE;
1376 }
1377
1378 if (dump_file)
1379 {
1380 char string[64];
1381
1382 real_to_decimal (string, &exp_init, sizeof (string), 0, 1);
1383 fprintf (dump_file, "synthesizing pow (x, %s) as:\n", string);
1384
1385 if (neg_exp)
1386 {
1387 if (one_over)
1388 {
1389 fprintf (dump_file, "1.0 / (");
1390 dump_integer_part (dump_file, "x", n);
1391 if (n > 0)
1392 fprintf (dump_file, " * ");
1393 dump_fractional_sqrt_sequence (dump_file, "x", &synth_info);
1394 fprintf (dump_file, ")");
1395 }
1396 else
1397 {
1398 dump_fractional_sqrt_sequence (dump_file, "x", &synth_info);
1399 fprintf (dump_file, " / (");
1400 dump_integer_part (dump_file, "x", n);
1401 fprintf (dump_file, ")");
1402 }
1403 }
1404 else
1405 {
1406 dump_fractional_sqrt_sequence (dump_file, "x", &synth_info);
1407 if (n > 0)
1408 fprintf (dump_file, " * ");
1409 dump_integer_part (dump_file, "x", n);
1410 }
1411
1412 fprintf (dump_file, "\ndeepest sqrt chain: %d\n", synth_info.deepest);
1413 }
1414
1415
1416 tree fract_res = NULL_TREE;
1417 cache[0] = arg0;
1418
1419 /* Calculate the fractional part of the exponent. */
1420 for (unsigned i = 0; i < synth_info.deepest; i++)
1421 {
1422 if (synth_info.factors[i])
1423 {
1424 tree sqrt_chain = get_fn_chain (arg0, i + 1, gsi, sqrtfn, loc, cache);
1425
1426 if (!fract_res)
1427 fract_res = sqrt_chain;
1428
1429 else
1430 fract_res = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
1431 fract_res, sqrt_chain);
1432 }
1433 }
1434
1435 tree res = NULL_TREE;
1436
1437 if (neg_exp)
1438 {
1439 if (one_over)
1440 {
1441 if (n > 0)
1442 res = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
1443 fract_res, integer_res);
1444 else
1445 res = fract_res;
1446
1447 res = build_and_insert_binop (gsi, loc, "powrootrecip", RDIV_EXPR,
1448 build_real (type, dconst1), res);
1449 }
1450 else
1451 {
1452 res = build_and_insert_binop (gsi, loc, "powroot", RDIV_EXPR,
1453 fract_res, integer_res);
1454 }
1455 }
1456 else
1457 res = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
1458 fract_res, integer_res);
1459 return res;
1460 }
1461
1462 /* ARG0 and ARG1 are the two arguments to a pow builtin call in GSI
1463 with location info LOC. If possible, create an equivalent and
1464 less expensive sequence of statements prior to GSI, and return an
1465 expression holding the result. */
1466
1467 static tree
1468 gimple_expand_builtin_pow (gimple_stmt_iterator *gsi, location_t loc,
1469 tree arg0, tree arg1)
1470 {
1471 REAL_VALUE_TYPE c, cint, dconst1_3, dconst1_4, dconst1_6;
1472 REAL_VALUE_TYPE c2, dconst3;
1473 HOST_WIDE_INT n;
1474 tree type, sqrtfn, cbrtfn, sqrt_arg0, result, cbrt_x, powi_cbrt_x;
1475 machine_mode mode;
1476 bool speed_p = optimize_bb_for_speed_p (gsi_bb (*gsi));
1477 bool hw_sqrt_exists, c_is_int, c2_is_int;
1478
1479 dconst1_4 = dconst1;
1480 SET_REAL_EXP (&dconst1_4, REAL_EXP (&dconst1_4) - 2);
1481
1482 /* If the exponent isn't a constant, there's nothing of interest
1483 to be done. */
1484 if (TREE_CODE (arg1) != REAL_CST)
1485 return NULL_TREE;
1486
1487 /* If the exponent is equivalent to an integer, expand to an optimal
1488 multiplication sequence when profitable. */
1489 c = TREE_REAL_CST (arg1);
1490 n = real_to_integer (&c);
1491 real_from_integer (&cint, VOIDmode, n, SIGNED);
1492 c_is_int = real_identical (&c, &cint);
1493
1494 if (c_is_int
1495 && ((n >= -1 && n <= 2)
1496 || (flag_unsafe_math_optimizations
1497 && speed_p
1498 && powi_cost (n) <= POWI_MAX_MULTS)))
1499 return gimple_expand_builtin_powi (gsi, loc, arg0, n);
1500
1501 /* Attempt various optimizations using sqrt and cbrt. */
1502 type = TREE_TYPE (arg0);
1503 mode = TYPE_MODE (type);
1504 sqrtfn = mathfn_built_in (type, BUILT_IN_SQRT);
1505
1506 /* Optimize pow(x,0.5) = sqrt(x). This replacement is always safe
1507 unless signed zeros must be maintained. pow(-0,0.5) = +0, while
1508 sqrt(-0) = -0. */
1509 if (sqrtfn
1510 && real_equal (&c, &dconsthalf)
1511 && !HONOR_SIGNED_ZEROS (mode))
1512 return build_and_insert_call (gsi, loc, sqrtfn, arg0);
1513
1514 hw_sqrt_exists = optab_handler (sqrt_optab, mode) != CODE_FOR_nothing;
1515
1516 /* Optimize pow(x,1./3.) = cbrt(x). This requires unsafe math
1517 optimizations since 1./3. is not exactly representable. If x
1518 is negative and finite, the correct value of pow(x,1./3.) is
1519 a NaN with the "invalid" exception raised, because the value
1520 of 1./3. actually has an even denominator. The correct value
1521 of cbrt(x) is a negative real value. */
1522 cbrtfn = mathfn_built_in (type, BUILT_IN_CBRT);
1523 dconst1_3 = real_value_truncate (mode, dconst_third ());
1524
1525 if (flag_unsafe_math_optimizations
1526 && cbrtfn
1527 && (!HONOR_NANS (mode) || tree_expr_nonnegative_p (arg0))
1528 && real_equal (&c, &dconst1_3))
1529 return build_and_insert_call (gsi, loc, cbrtfn, arg0);
1530
1531 /* Optimize pow(x,1./6.) = cbrt(sqrt(x)). Don't do this optimization
1532 if we don't have a hardware sqrt insn. */
1533 dconst1_6 = dconst1_3;
1534 SET_REAL_EXP (&dconst1_6, REAL_EXP (&dconst1_6) - 1);
1535
1536 if (flag_unsafe_math_optimizations
1537 && sqrtfn
1538 && cbrtfn
1539 && (!HONOR_NANS (mode) || tree_expr_nonnegative_p (arg0))
1540 && speed_p
1541 && hw_sqrt_exists
1542 && real_equal (&c, &dconst1_6))
1543 {
1544 /* sqrt(x) */
1545 sqrt_arg0 = build_and_insert_call (gsi, loc, sqrtfn, arg0);
1546
1547 /* cbrt(sqrt(x)) */
1548 return build_and_insert_call (gsi, loc, cbrtfn, sqrt_arg0);
1549 }
1550
1551
1552 /* Attempt to expand the POW as a product of square root chains.
1553 Expand the 0.25 case even when optimising for size. */
1554 if (flag_unsafe_math_optimizations
1555 && sqrtfn
1556 && hw_sqrt_exists
1557 && (speed_p || real_equal (&c, &dconst1_4))
1558 && !HONOR_SIGNED_ZEROS (mode))
1559 {
1560 unsigned int max_depth = speed_p
1561 ? PARAM_VALUE (PARAM_MAX_POW_SQRT_DEPTH)
1562 : 2;
1563
1564 tree expand_with_sqrts
1565 = expand_pow_as_sqrts (gsi, loc, arg0, arg1, max_depth);
1566
1567 if (expand_with_sqrts)
1568 return expand_with_sqrts;
1569 }
1570
1571 real_arithmetic (&c2, MULT_EXPR, &c, &dconst2);
1572 n = real_to_integer (&c2);
1573 real_from_integer (&cint, VOIDmode, n, SIGNED);
1574 c2_is_int = real_identical (&c2, &cint);
1575
1576 /* Optimize pow(x,c), where 3c = n for some nonzero integer n, into
1577
1578 powi(x, n/3) * powi(cbrt(x), n%3), n > 0;
1579 1.0 / (powi(x, abs(n)/3) * powi(cbrt(x), abs(n)%3)), n < 0.
1580
1581 Do not calculate the first factor when n/3 = 0. As cbrt(x) is
1582 different from pow(x, 1./3.) due to rounding and behavior with
1583 negative x, we need to constrain this transformation to unsafe
1584 math and positive x or finite math. */
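  /* For example (a sketch): pow (x, 4./3.) has 3c == 4, so n == 4,
     n/3 == 1 and n%3 == 1, and the call becomes x * cbrt (x); likewise
     pow (x, -4./3.) becomes 1.0 / (x * cbrt (x)).  */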
1585 real_from_integer (&dconst3, VOIDmode, 3, SIGNED);
1586 real_arithmetic (&c2, MULT_EXPR, &c, &dconst3);
1587 real_round (&c2, mode, &c2);
1588 n = real_to_integer (&c2);
1589 real_from_integer (&cint, VOIDmode, n, SIGNED);
1590 real_arithmetic (&c2, RDIV_EXPR, &cint, &dconst3);
1591 real_convert (&c2, mode, &c2);
1592
1593 if (flag_unsafe_math_optimizations
1594 && cbrtfn
1595 && (!HONOR_NANS (mode) || tree_expr_nonnegative_p (arg0))
1596 && real_identical (&c2, &c)
1597 && !c2_is_int
1598 && optimize_function_for_speed_p (cfun)
1599 && powi_cost (n / 3) <= POWI_MAX_MULTS)
1600 {
1601 tree powi_x_ndiv3 = NULL_TREE;
1602
1603 /* Attempt to fold powi(arg0, abs(n/3)) into multiplies. If not
1604 possible or profitable, give up. Skip the degenerate case when
1605 abs(n) < 3, where the result is always 1. */
1606 if (absu_hwi (n) >= 3)
1607 {
1608 powi_x_ndiv3 = gimple_expand_builtin_powi (gsi, loc, arg0,
1609 abs_hwi (n / 3));
1610 if (!powi_x_ndiv3)
1611 return NULL_TREE;
1612 }
1613
1614 /* Calculate powi(cbrt(x), n%3). Don't use gimple_expand_builtin_powi
1615 as that creates an unnecessary variable. Instead, just produce
1616 either cbrt(x) or cbrt(x) * cbrt(x). */
1617 cbrt_x = build_and_insert_call (gsi, loc, cbrtfn, arg0);
1618
1619 if (absu_hwi (n) % 3 == 1)
1620 powi_cbrt_x = cbrt_x;
1621 else
1622 powi_cbrt_x = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
1623 cbrt_x, cbrt_x);
1624
1625 /* Multiply the two subexpressions, unless powi(x,abs(n)/3) = 1. */
1626 if (absu_hwi (n) < 3)
1627 result = powi_cbrt_x;
1628 else
1629 result = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
1630 powi_x_ndiv3, powi_cbrt_x);
1631
1632 /* If n is negative, reciprocate the result. */
1633 if (n < 0)
1634 result = build_and_insert_binop (gsi, loc, "powroot", RDIV_EXPR,
1635 build_real (type, dconst1), result);
1636
1637 return result;
1638 }
1639
1640 /* No optimizations succeeded. */
1641 return NULL_TREE;
1642 }
1643
1644 /* ARG is the argument to a cabs builtin call in GSI with location info
1645 LOC. Create a sequence of statements prior to GSI that calculates
1646 sqrt(R*R + I*I), where R and I are the real and imaginary components
1647 of ARG, respectively. Return an expression holding the result. */
1648
1649 static tree
1650 gimple_expand_builtin_cabs (gimple_stmt_iterator *gsi, location_t loc, tree arg)
1651 {
1652 tree real_part, imag_part, addend1, addend2, sum, result;
1653 tree type = TREE_TYPE (TREE_TYPE (arg));
1654 tree sqrtfn = mathfn_built_in (type, BUILT_IN_SQRT);
1655 machine_mode mode = TYPE_MODE (type);
1656
1657 if (!flag_unsafe_math_optimizations
1658 || !optimize_bb_for_speed_p (gimple_bb (gsi_stmt (*gsi)))
1659 || !sqrtfn
1660 || optab_handler (sqrt_optab, mode) == CODE_FOR_nothing)
1661 return NULL_TREE;
1662
1663 real_part = build_and_insert_ref (gsi, loc, type, "cabs",
1664 REALPART_EXPR, arg);
1665 addend1 = build_and_insert_binop (gsi, loc, "cabs", MULT_EXPR,
1666 real_part, real_part);
1667 imag_part = build_and_insert_ref (gsi, loc, type, "cabs",
1668 IMAGPART_EXPR, arg);
1669 addend2 = build_and_insert_binop (gsi, loc, "cabs", MULT_EXPR,
1670 imag_part, imag_part);
1671 sum = build_and_insert_binop (gsi, loc, "cabs", PLUS_EXPR, addend1, addend2);
1672 result = build_and_insert_call (gsi, loc, sqrtfn, sum);
1673
1674 return result;
1675 }
1676
1677 /* Go through all calls to sin, cos and cexpi and call execute_cse_sincos_1
1678 on the SSA_NAME argument of each of them. Also expand powi(x,n) into
1679 an optimal number of multiplies, when n is a constant. */
1680
1681 namespace {
1682
1683 const pass_data pass_data_cse_sincos =
1684 {
1685 GIMPLE_PASS, /* type */
1686 "sincos", /* name */
1687 OPTGROUP_NONE, /* optinfo_flags */
1688 TV_NONE, /* tv_id */
1689 PROP_ssa, /* properties_required */
1690 PROP_gimple_opt_math, /* properties_provided */
1691 0, /* properties_destroyed */
1692 0, /* todo_flags_start */
1693 TODO_update_ssa, /* todo_flags_finish */
1694 };
1695
1696 class pass_cse_sincos : public gimple_opt_pass
1697 {
1698 public:
1699 pass_cse_sincos (gcc::context *ctxt)
1700 : gimple_opt_pass (pass_data_cse_sincos, ctxt)
1701 {}
1702
1703 /* opt_pass methods: */
1704 virtual bool gate (function *)
1705 {
1706 /* We no longer require either sincos or cexp, since powi expansion
1707 piggybacks on this pass. */
1708 return optimize;
1709 }
1710
1711 virtual unsigned int execute (function *);
1712
1713 }; // class pass_cse_sincos
1714
1715 unsigned int
1716 pass_cse_sincos::execute (function *fun)
1717 {
1718 basic_block bb;
1719 bool cfg_changed = false;
1720
1721 calculate_dominance_info (CDI_DOMINATORS);
1722 memset (&sincos_stats, 0, sizeof (sincos_stats));
1723
1724 FOR_EACH_BB_FN (bb, fun)
1725 {
1726 gimple_stmt_iterator gsi;
1727 bool cleanup_eh = false;
1728
1729 for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi); gsi_next (&gsi))
1730 {
1731 gimple *stmt = gsi_stmt (gsi);
1732 tree fndecl;
1733
1734 /* Only the last stmt in a bb could throw, no need to call
1735 gimple_purge_dead_eh_edges if we change something in the middle
1736 of a basic block. */
1737 cleanup_eh = false;
1738
1739 if (gimple_call_builtin_p (stmt, BUILT_IN_NORMAL)
1740 && gimple_call_lhs (stmt))
1741 {
1742 tree arg, arg0, arg1, result;
1743 HOST_WIDE_INT n;
1744 location_t loc;
1745
1746 fndecl = gimple_call_fndecl (stmt);
1747 switch (DECL_FUNCTION_CODE (fndecl))
1748 {
1749 CASE_FLT_FN (BUILT_IN_COS):
1750 CASE_FLT_FN (BUILT_IN_SIN):
1751 CASE_FLT_FN (BUILT_IN_CEXPI):
1752 /* Make sure we have either sincos or cexp. */
1753 if (!targetm.libc_has_function (function_c99_math_complex)
1754 && !targetm.libc_has_function (function_sincos))
1755 break;
1756
1757 arg = gimple_call_arg (stmt, 0);
1758 if (TREE_CODE (arg) == SSA_NAME)
1759 cfg_changed |= execute_cse_sincos_1 (arg);
1760 break;
1761
1762 CASE_FLT_FN (BUILT_IN_POW):
1763 arg0 = gimple_call_arg (stmt, 0);
1764 arg1 = gimple_call_arg (stmt, 1);
1765
1766 loc = gimple_location (stmt);
1767 result = gimple_expand_builtin_pow (&gsi, loc, arg0, arg1);
1768
1769 if (result)
1770 {
1771 tree lhs = gimple_get_lhs (stmt);
1772 gassign *new_stmt = gimple_build_assign (lhs, result);
1773 gimple_set_location (new_stmt, loc);
1774 unlink_stmt_vdef (stmt);
1775 gsi_replace (&gsi, new_stmt, true);
1776 cleanup_eh = true;
1777 if (gimple_vdef (stmt))
1778 release_ssa_name (gimple_vdef (stmt));
1779 }
1780 break;
1781
1782 CASE_FLT_FN (BUILT_IN_POWI):
1783 arg0 = gimple_call_arg (stmt, 0);
1784 arg1 = gimple_call_arg (stmt, 1);
1785 loc = gimple_location (stmt);
1786
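/* powi (-1, n) depends only on the parity of n; a sketch in C of
   what the GIMPLE built below computes:
     result = (n & 1) ? -1.0 : 1.0;  */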
1787 if (real_minus_onep (arg0))
1788 {
1789 tree t0, t1, cond, one, minus_one;
1790 gassign *stmt;
1791
1792 t0 = TREE_TYPE (arg0);
1793 t1 = TREE_TYPE (arg1);
1794 one = build_real (t0, dconst1);
1795 minus_one = build_real (t0, dconstm1);
1796
1797 cond = make_temp_ssa_name (t1, NULL, "powi_cond");
1798 stmt = gimple_build_assign (cond, BIT_AND_EXPR,
1799 arg1, build_int_cst (t1, 1));
1800 gimple_set_location (stmt, loc);
1801 gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
1802
1803 result = make_temp_ssa_name (t0, NULL, "powi");
1804 stmt = gimple_build_assign (result, COND_EXPR, cond,
1805 minus_one, one);
1806 gimple_set_location (stmt, loc);
1807 gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
1808 }
1809 else
1810 {
1811 if (!tree_fits_shwi_p (arg1))
1812 break;
1813
1814 n = tree_to_shwi (arg1);
1815 result = gimple_expand_builtin_powi (&gsi, loc, arg0, n);
1816 }
1817
1818 if (result)
1819 {
1820 tree lhs = gimple_get_lhs (stmt);
1821 gassign *new_stmt = gimple_build_assign (lhs, result);
1822 gimple_set_location (new_stmt, loc);
1823 unlink_stmt_vdef (stmt);
1824 gsi_replace (&gsi, new_stmt, true);
1825 cleanup_eh = true;
1826 if (gimple_vdef (stmt))
1827 release_ssa_name (gimple_vdef (stmt));
1828 }
1829 break;
1830
1831 CASE_FLT_FN (BUILT_IN_CABS):
1832 arg0 = gimple_call_arg (stmt, 0);
1833 loc = gimple_location (stmt);
1834 result = gimple_expand_builtin_cabs (&gsi, loc, arg0);
1835
1836 if (result)
1837 {
1838 tree lhs = gimple_get_lhs (stmt);
1839 gassign *new_stmt = gimple_build_assign (lhs, result);
1840 gimple_set_location (new_stmt, loc);
1841 unlink_stmt_vdef (stmt);
1842 gsi_replace (&gsi, new_stmt, true);
1843 cleanup_eh = true;
1844 if (gimple_vdef (stmt))
1845 release_ssa_name (gimple_vdef (stmt));
1846 }
1847 break;
1848
1849 default:;
1850 }
1851 }
1852 }
1853 if (cleanup_eh)
1854 cfg_changed |= gimple_purge_dead_eh_edges (bb);
1855 }
1856
1857 statistics_counter_event (fun, "sincos statements inserted",
1858 sincos_stats.inserted);
1859
1860 return cfg_changed ? TODO_cleanup_cfg : 0;
1861 }
1862
1863 } // anon namespace
1864
1865 gimple_opt_pass *
1866 make_pass_cse_sincos (gcc::context *ctxt)
1867 {
1868 return new pass_cse_sincos (ctxt);
1869 }
1870
1871 /* A symbolic number is used to detect byte permutation and selection
1872 patterns. Therefore the field N contains an artificial number
1873 consisting of octet sized markers:
1874
1875 0 - target byte has the value 0
1876 FF - target byte has an unknown value (eg. due to sign extension)
1877 1..size - one-based index of the source byte held by the target byte.
1878
1879 To detect permutations on memory sources (arrays and structures), a symbolic
1880 number is also associated with a base address (the array or structure the
1881 load is made from), an offset from the base address and a range which gives
1882 the number of bytes covered from the lowest to the highest accessed memory
1883 location. The range is thus different from size, which reflects the size
1884 of the type of the current expression. Note that for a non-memory source,
1885 range holds the same value as size.
1886
1887 For instance, for an array char a[], (short) a[0] | (short) a[3] would have
1888 a size of 2 but a range of 4 while (short) a[0] | ((short) a[0] << 1) would
1889 still have a size of 2 but this time a range of 1. */
1890
1891 struct symbolic_number {
1892 uint64_t n;
1893 tree type;
1894 tree base_addr;
1895 tree offset;
1896 HOST_WIDE_INT bytepos;
1897 tree alias_set;
1898 tree vuse;
1899 unsigned HOST_WIDE_INT range;
1900 };
1901
1902 #define BITS_PER_MARKER 8
1903 #define MARKER_MASK ((1 << BITS_PER_MARKER) - 1)
1904 #define MARKER_BYTE_UNKNOWN MARKER_MASK
1905 #define HEAD_MARKER(n, size) \
1906 ((n) & ((uint64_t) MARKER_MASK << (((size) - 1) * BITS_PER_MARKER)))
1907
1908 /* The number which the find_bswap_or_nop_1 result should match in
1909 order to have a nop. The number is masked according to the size of
1910 the symbolic number before using it. */
1911 #define CMPNOP (sizeof (int64_t) < 8 ? 0 : \
1912 (uint64_t)0x08070605 << 32 | 0x04030201)
1913
1914 /* The number which the find_bswap_or_nop_1 result should match in
1915 order to have a byte swap. The number is masked according to the
1916 size of the symbolic number before using it. */
1917 #define CMPXCHG (sizeof (int64_t) < 8 ? 0 : \
1918 (uint64_t)0x01020304 << 32 | 0x05060708)
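/* For instance, for a 4-byte symbolic number the masked CMPNOP value is
   0x04030201 (bytes in source order, i.e. a nop) while the shifted CMPXCHG
   value is 0x01020304 (bytes reversed, i.e. a byte swap).  Illustrative
   only; the masking/shifting happens in find_bswap_or_nop below.  */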
1919
1920 /* Perform a SHIFT or ROTATE operation by COUNT bits on symbolic
1921 number N. Return false if the requested operation is not permitted
1922 on a symbolic number. */
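/* As an illustrative sketch: with 4-byte markers n->n == 0x04030201, an
   LSHIFT_EXPR by 8 bits yields 0x03020100: the lowest target byte is now
   known to be zero and the topmost marker has been shifted out.  */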
1923
1924 static inline bool
1925 do_shift_rotate (enum tree_code code,
1926 struct symbolic_number *n,
1927 int count)
1928 {
1929 int i, size = TYPE_PRECISION (n->type) / BITS_PER_UNIT;
1930 unsigned head_marker;
1931
1932 if (count % BITS_PER_UNIT != 0)
1933 return false;
1934 count = (count / BITS_PER_UNIT) * BITS_PER_MARKER;
1935
1936 /* Zero out the extra bits of N in order to avoid them being shifted
1937 into the significant bits. */
1938 if (size < 64 / BITS_PER_MARKER)
1939 n->n &= ((uint64_t) 1 << (size * BITS_PER_MARKER)) - 1;
1940
1941 switch (code)
1942 {
1943 case LSHIFT_EXPR:
1944 n->n <<= count;
1945 break;
1946 case RSHIFT_EXPR:
1947 head_marker = HEAD_MARKER (n->n, size);
1948 n->n >>= count;
1949 /* Arithmetic shift of signed type: result is dependent on the value. */
1950 if (!TYPE_UNSIGNED (n->type) && head_marker)
1951 for (i = 0; i < count / BITS_PER_MARKER; i++)
1952 n->n |= (uint64_t) MARKER_BYTE_UNKNOWN
1953 << ((size - 1 - i) * BITS_PER_MARKER);
1954 break;
1955 case LROTATE_EXPR:
1956 n->n = (n->n << count) | (n->n >> ((size * BITS_PER_MARKER) - count));
1957 break;
1958 case RROTATE_EXPR:
1959 n->n = (n->n >> count) | (n->n << ((size * BITS_PER_MARKER) - count));
1960 break;
1961 default:
1962 return false;
1963 }
1964 /* Zero unused bits for size. */
1965 if (size < 64 / BITS_PER_MARKER)
1966 n->n &= ((uint64_t) 1 << (size * BITS_PER_MARKER)) - 1;
1967 return true;
1968 }
1969
1970 /* Perform sanity checking for the symbolic number N and the gimple
1971 statement STMT. */
1972
1973 static inline bool
1974 verify_symbolic_number_p (struct symbolic_number *n, gimple *stmt)
1975 {
1976 tree lhs_type;
1977
1978 lhs_type = gimple_expr_type (stmt);
1979
1980 if (TREE_CODE (lhs_type) != INTEGER_TYPE)
1981 return false;
1982
1983 if (TYPE_PRECISION (lhs_type) != TYPE_PRECISION (n->type))
1984 return false;
1985
1986 return true;
1987 }
1988
1989 /* Initialize the symbolic number N for the bswap pass from the base element
1990 SRC manipulated by the bitwise OR expression. */
1991
1992 static bool
1993 init_symbolic_number (struct symbolic_number *n, tree src)
1994 {
1995 int size;
1996
1997 n->base_addr = n->offset = n->alias_set = n->vuse = NULL_TREE;
1998
1999 /* Set up the symbolic number N by setting each byte to a value between 1 and
2000 the byte size of rhs1. The highest order byte is set to that size and the
2001 lowest order byte to 1. */
2002 n->type = TREE_TYPE (src);
2003 size = TYPE_PRECISION (n->type);
2004 if (size % BITS_PER_UNIT != 0)
2005 return false;
2006 size /= BITS_PER_UNIT;
2007 if (size > 64 / BITS_PER_MARKER)
2008 return false;
2009 n->range = size;
2010 n->n = CMPNOP;
2011
2012 if (size < 64 / BITS_PER_MARKER)
2013 n->n &= ((uint64_t) 1 << (size * BITS_PER_MARKER)) - 1;
2014
2015 return true;
2016 }
2017
2018 /* Check if STMT might be a byte swap or a nop from a memory source and return
2019 the answer. If so, REF is that memory source and the base of the memory area
2020 accessed and the offset of the access from that base are recorded in N. */
2021
2022 bool
2023 find_bswap_or_nop_load (gimple *stmt, tree ref, struct symbolic_number *n)
2024 {
2025 /* Leaf node is an array or component ref. Memorize its base and
2026 offset from base to compare to other such leaf node. */
2027 HOST_WIDE_INT bitsize, bitpos;
2028 machine_mode mode;
2029 int unsignedp, volatilep;
2030 tree offset, base_addr;
2031
2032 /* Not prepared to handle PDP endian. */
2033 if (BYTES_BIG_ENDIAN != WORDS_BIG_ENDIAN)
2034 return false;
2035
2036 if (!gimple_assign_load_p (stmt) || gimple_has_volatile_ops (stmt))
2037 return false;
2038
2039 base_addr = get_inner_reference (ref, &bitsize, &bitpos, &offset, &mode,
2040 &unsignedp, &volatilep, false);
2041
2042 if (TREE_CODE (base_addr) == MEM_REF)
2043 {
2044 offset_int bit_offset = 0;
2045 tree off = TREE_OPERAND (base_addr, 1);
2046
2047 if (!integer_zerop (off))
2048 {
2049 offset_int boff, coff = mem_ref_offset (base_addr);
2050 boff = wi::lshift (coff, LOG2_BITS_PER_UNIT);
2051 bit_offset += boff;
2052 }
2053
2054 base_addr = TREE_OPERAND (base_addr, 0);
2055
2056 /* Avoid returning a negative bitpos as this may wreak havoc later. */
2057 if (wi::neg_p (bit_offset))
2058 {
2059 offset_int mask = wi::mask <offset_int> (LOG2_BITS_PER_UNIT, false);
2060 offset_int tem = bit_offset.and_not (mask);
2061 /* TEM is the bitpos rounded to BITS_PER_UNIT towards -Inf.
2062 Subtract it from BIT_OFFSET and add it (scaled) to OFFSET. */
2063 bit_offset -= tem;
2064 tem = wi::arshift (tem, LOG2_BITS_PER_UNIT);
2065 if (offset)
2066 offset = size_binop (PLUS_EXPR, offset,
2067 wide_int_to_tree (sizetype, tem));
2068 else
2069 offset = wide_int_to_tree (sizetype, tem);
2070 }
2071
2072 bitpos += bit_offset.to_shwi ();
2073 }
2074
2075 if (bitpos % BITS_PER_UNIT)
2076 return false;
2077 if (bitsize % BITS_PER_UNIT)
2078 return false;
2079
2080 if (!init_symbolic_number (n, ref))
2081 return false;
2082 n->base_addr = base_addr;
2083 n->offset = offset;
2084 n->bytepos = bitpos / BITS_PER_UNIT;
2085 n->alias_set = reference_alias_ptr_type (ref);
2086 n->vuse = gimple_vuse (stmt);
2087 return true;
2088 }
2089
2090 /* Compute the symbolic number N representing the result of a bitwise OR of
2091 the two symbolic numbers N1 and N2, whose source statements are respectively
2092 SOURCE_STMT1 and SOURCE_STMT2. */
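/* Illustrative little-endian example: for (short) a[0] | ((short) a[1] << 8)
   the two one-byte loads have bytepos 0 and 1.  The markers of the shifted
   load are bumped by the bytepos difference, giving a merged number 0x0201
   with a range of 2, i.e. the pattern of a plain 2-byte load.  */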
2093
2094 static gimple *
2095 perform_symbolic_merge (gimple *source_stmt1, struct symbolic_number *n1,
2096 gimple *source_stmt2, struct symbolic_number *n2,
2097 struct symbolic_number *n)
2098 {
2099 int i, size;
2100 uint64_t mask;
2101 gimple *source_stmt;
2102 struct symbolic_number *n_start;
2103
2104 /* Sources are different, cancel bswap if they are not memory locations with
2105 the same base (array, structure, ...). */
2106 if (gimple_assign_rhs1 (source_stmt1) != gimple_assign_rhs1 (source_stmt2))
2107 {
2108 uint64_t inc;
2109 HOST_WIDE_INT start_sub, end_sub, end1, end2, end;
2110 struct symbolic_number *toinc_n_ptr, *n_end;
2111
2112 if (!n1->base_addr || !n2->base_addr
2113 || !operand_equal_p (n1->base_addr, n2->base_addr, 0))
2114 return NULL;
2115
2116 if (!n1->offset != !n2->offset
2117 || (n1->offset && !operand_equal_p (n1->offset, n2->offset, 0)))
2118 return NULL;
2119
2120 if (n1->bytepos < n2->bytepos)
2121 {
2122 n_start = n1;
2123 start_sub = n2->bytepos - n1->bytepos;
2124 source_stmt = source_stmt1;
2125 }
2126 else
2127 {
2128 n_start = n2;
2129 start_sub = n1->bytepos - n2->bytepos;
2130 source_stmt = source_stmt2;
2131 }
2132
2133 /* Find the highest address at which a load is performed and
2134 compute related info. */
2135 end1 = n1->bytepos + (n1->range - 1);
2136 end2 = n2->bytepos + (n2->range - 1);
2137 if (end1 < end2)
2138 {
2139 end = end2;
2140 end_sub = end2 - end1;
2141 }
2142 else
2143 {
2144 end = end1;
2145 end_sub = end1 - end2;
2146 }
2147 n_end = (end2 > end1) ? n2 : n1;
2148
2149 /* Find symbolic number whose lsb is the most significant. */
2150 if (BYTES_BIG_ENDIAN)
2151 toinc_n_ptr = (n_end == n1) ? n2 : n1;
2152 else
2153 toinc_n_ptr = (n_start == n1) ? n2 : n1;
2154
2155 n->range = end - n_start->bytepos + 1;
2156
2157 /* Check that the range of memory covered can be represented by
2158 a symbolic number. */
2159 if (n->range > 64 / BITS_PER_MARKER)
2160 return NULL;
2161
2162 /* Reinterpret byte marks in symbolic number holding the value of
2163 bigger weight according to target endianness. */
2164 inc = BYTES_BIG_ENDIAN ? end_sub : start_sub;
2165 size = TYPE_PRECISION (n1->type) / BITS_PER_UNIT;
2166 for (i = 0; i < size; i++, inc <<= BITS_PER_MARKER)
2167 {
2168 unsigned marker
2169 = (toinc_n_ptr->n >> (i * BITS_PER_MARKER)) & MARKER_MASK;
2170 if (marker && marker != MARKER_BYTE_UNKNOWN)
2171 toinc_n_ptr->n += inc;
2172 }
2173 }
2174 else
2175 {
2176 n->range = n1->range;
2177 n_start = n1;
2178 source_stmt = source_stmt1;
2179 }
2180
2181 if (!n1->alias_set
2182 || alias_ptr_types_compatible_p (n1->alias_set, n2->alias_set))
2183 n->alias_set = n1->alias_set;
2184 else
2185 n->alias_set = ptr_type_node;
2186 n->vuse = n_start->vuse;
2187 n->base_addr = n_start->base_addr;
2188 n->offset = n_start->offset;
2189 n->bytepos = n_start->bytepos;
2190 n->type = n_start->type;
2191 size = TYPE_PRECISION (n->type) / BITS_PER_UNIT;
2192
2193 for (i = 0, mask = MARKER_MASK; i < size; i++, mask <<= BITS_PER_MARKER)
2194 {
2195 uint64_t masked1, masked2;
2196
2197 masked1 = n1->n & mask;
2198 masked2 = n2->n & mask;
2199 if (masked1 && masked2 && masked1 != masked2)
2200 return NULL;
2201 }
2202 n->n = n1->n | n2->n;
2203
2204 return source_stmt;
2205 }
2206
2207 /* find_bswap_or_nop_1 invokes itself recursively with N and tries to perform
2208 the operation given by the rhs of STMT on the result. If the operation
2209 can be performed, the function returns a gimple stmt whose rhs's first
2210 tree is the expression of the source operand; otherwise it returns
2211 NULL. */
2212
2213 static gimple *
2214 find_bswap_or_nop_1 (gimple *stmt, struct symbolic_number *n, int limit)
2215 {
2216 enum tree_code code;
2217 tree rhs1, rhs2 = NULL;
2218 gimple *rhs1_stmt, *rhs2_stmt, *source_stmt1;
2219 enum gimple_rhs_class rhs_class;
2220
2221 if (!limit || !is_gimple_assign (stmt))
2222 return NULL;
2223
2224 rhs1 = gimple_assign_rhs1 (stmt);
2225
2226 if (find_bswap_or_nop_load (stmt, rhs1, n))
2227 return stmt;
2228
2229 if (TREE_CODE (rhs1) != SSA_NAME)
2230 return NULL;
2231
2232 code = gimple_assign_rhs_code (stmt);
2233 rhs_class = gimple_assign_rhs_class (stmt);
2234 rhs1_stmt = SSA_NAME_DEF_STMT (rhs1);
2235
2236 if (rhs_class == GIMPLE_BINARY_RHS)
2237 rhs2 = gimple_assign_rhs2 (stmt);
2238
2239 /* Handle unary rhs and binary rhs with integer constants as second
2240 operand. */
2241
2242 if (rhs_class == GIMPLE_UNARY_RHS
2243 || (rhs_class == GIMPLE_BINARY_RHS
2244 && TREE_CODE (rhs2) == INTEGER_CST))
2245 {
2246 if (code != BIT_AND_EXPR
2247 && code != LSHIFT_EXPR
2248 && code != RSHIFT_EXPR
2249 && code != LROTATE_EXPR
2250 && code != RROTATE_EXPR
2251 && !CONVERT_EXPR_CODE_P (code))
2252 return NULL;
2253
2254 source_stmt1 = find_bswap_or_nop_1 (rhs1_stmt, n, limit - 1);
2255
2256 /* If find_bswap_or_nop_1 returned NULL, STMT is a leaf node and
2257 we have to initialize the symbolic number. */
2258 if (!source_stmt1)
2259 {
2260 if (gimple_assign_load_p (stmt)
2261 || !init_symbolic_number (n, rhs1))
2262 return NULL;
2263 source_stmt1 = stmt;
2264 }
2265
2266 switch (code)
2267 {
2268 case BIT_AND_EXPR:
2269 {
2270 int i, size = TYPE_PRECISION (n->type) / BITS_PER_UNIT;
2271 uint64_t val = int_cst_value (rhs2), mask = 0;
2272 uint64_t tmp = (1 << BITS_PER_UNIT) - 1;
2273
2274 /* Only constants masking full bytes are allowed. */
2275 for (i = 0; i < size; i++, tmp <<= BITS_PER_UNIT)
2276 if ((val & tmp) != 0 && (val & tmp) != tmp)
2277 return NULL;
2278 else if (val & tmp)
2279 mask |= (uint64_t) MARKER_MASK << (i * BITS_PER_MARKER);
2280
2281 n->n &= mask;
2282 }
2283 break;
2284 case LSHIFT_EXPR:
2285 case RSHIFT_EXPR:
2286 case LROTATE_EXPR:
2287 case RROTATE_EXPR:
2288 if (!do_shift_rotate (code, n, (int) TREE_INT_CST_LOW (rhs2)))
2289 return NULL;
2290 break;
2291 CASE_CONVERT:
2292 {
2293 int i, type_size, old_type_size;
2294 tree type;
2295
2296 type = gimple_expr_type (stmt);
2297 type_size = TYPE_PRECISION (type);
2298 if (type_size % BITS_PER_UNIT != 0)
2299 return NULL;
2300 type_size /= BITS_PER_UNIT;
2301 if (type_size > 64 / BITS_PER_MARKER)
2302 return NULL;
2303
2304 /* Sign extension: result is dependent on the value. */
2305 old_type_size = TYPE_PRECISION (n->type) / BITS_PER_UNIT;
2306 if (!TYPE_UNSIGNED (n->type) && type_size > old_type_size
2307 && HEAD_MARKER (n->n, old_type_size))
2308 for (i = 0; i < type_size - old_type_size; i++)
2309 n->n |= (uint64_t) MARKER_BYTE_UNKNOWN
2310 << ((type_size - 1 - i) * BITS_PER_MARKER);
2311
2312 if (type_size < 64 / BITS_PER_MARKER)
2313 {
2314 /* If STMT casts to a smaller type, mask out the bits not
2315 belonging to the target type. */
2316 n->n &= ((uint64_t) 1 << (type_size * BITS_PER_MARKER)) - 1;
2317 }
2318 n->type = type;
2319 if (!n->base_addr)
2320 n->range = type_size;
2321 }
2322 break;
2323 default:
2324 return NULL;
2325 };
2326 return verify_symbolic_number_p (n, stmt) ? source_stmt1 : NULL;
2327 }
2328
2329 /* Handle binary rhs. */
2330
2331 if (rhs_class == GIMPLE_BINARY_RHS)
2332 {
2333 struct symbolic_number n1, n2;
2334 gimple *source_stmt, *source_stmt2;
2335
2336 if (code != BIT_IOR_EXPR)
2337 return NULL;
2338
2339 if (TREE_CODE (rhs2) != SSA_NAME)
2340 return NULL;
2341
2342 rhs2_stmt = SSA_NAME_DEF_STMT (rhs2);
2343
2344 switch (code)
2345 {
2346 case BIT_IOR_EXPR:
2347 source_stmt1 = find_bswap_or_nop_1 (rhs1_stmt, &n1, limit - 1);
2348
2349 if (!source_stmt1)
2350 return NULL;
2351
2352 source_stmt2 = find_bswap_or_nop_1 (rhs2_stmt, &n2, limit - 1);
2353
2354 if (!source_stmt2)
2355 return NULL;
2356
2357 if (TYPE_PRECISION (n1.type) != TYPE_PRECISION (n2.type))
2358 return NULL;
2359
2360 if (!n1.vuse != !n2.vuse
2361 || (n1.vuse && !operand_equal_p (n1.vuse, n2.vuse, 0)))
2362 return NULL;
2363
2364 source_stmt
2365 = perform_symbolic_merge (source_stmt1, &n1, source_stmt2, &n2, n);
2366
2367 if (!source_stmt)
2368 return NULL;
2369
2370 if (!verify_symbolic_number_p (n, stmt))
2371 return NULL;
2372
2373 break;
2374 default:
2375 return NULL;
2376 }
2377 return source_stmt;
2378 }
2379 return NULL;
2380 }
2381
2382 /* Check if STMT completes a bswap implementation or a read in a given
2383 endianness consisting of ORs, SHIFTs and ANDs, and set *BSWAP
2384 accordingly. It also sets N to represent the kind of operations
2385 performed: size of the resulting expression and whether it works on
2386 a memory source, and if so alias-set and vuse. Finally, the
2387 function returns a stmt whose rhs's first tree is the source
2388 expression. */
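/* A typical pattern matched here (source-level sketch, not GIMPLE):

     uint32_t swap32 (uint32_t x)
     {
       return ((x & 0x000000ff) << 24) | ((x & 0x0000ff00) << 8)
	      | ((x & 0x00ff0000) >> 8) | ((x & 0xff000000) >> 24);
     }

   for which *BSWAP is set to true and N->range ends up as 32 (bits).  */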
2389
2390 static gimple *
2391 find_bswap_or_nop (gimple *stmt, struct symbolic_number *n, bool *bswap)
2392 {
2393 /* The number which the find_bswap_or_nop_1 result should match in order
2394 to have a full byte swap. The number is shifted to the right
2395 according to the size of the symbolic number before using it. */
2396 uint64_t cmpxchg = CMPXCHG;
2397 uint64_t cmpnop = CMPNOP;
2398
2399 gimple *source_stmt;
2400 int limit;
2401
2402 /* The last parameter determines the search depth limit. It usually
2403 correlates directly to the number n of bytes to be touched. We
2404 increase that number by log2(n) + 1 here in order to also
2405 cover signed -> unsigned conversions of the src operand as can be seen
2406 in libgcc, and for an initial shift/and operation on the src operand. */
2407 limit = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (gimple_expr_type (stmt)));
2408 limit += 1 + (int) ceil_log2 ((unsigned HOST_WIDE_INT) limit);
2409 source_stmt = find_bswap_or_nop_1 (stmt, n, limit);
2410
2411 if (!source_stmt)
2412 return NULL;
2413
2414 /* Find real size of result (highest non-zero byte). */
2415 if (n->base_addr)
2416 {
2417 int rsize;
2418 uint64_t tmpn;
2419
2420 for (tmpn = n->n, rsize = 0; tmpn; tmpn >>= BITS_PER_MARKER, rsize++);
2421 n->range = rsize;
2422 }
2423
2424 /* Zero out the extra bits of N and CMP*. */
2425 if (n->range < (int) sizeof (int64_t))
2426 {
2427 uint64_t mask;
2428
2429 mask = ((uint64_t) 1 << (n->range * BITS_PER_MARKER)) - 1;
2430 cmpxchg >>= (64 / BITS_PER_MARKER - n->range) * BITS_PER_MARKER;
2431 cmpnop &= mask;
2432 }
2433
2434 /* A complete byte swap should make the symbolic number start with
2435 the largest digit in the highest order byte. An unchanged symbolic
2436 number indicates a read with the same endianness as the target. */
2437 if (n->n == cmpnop)
2438 *bswap = false;
2439 else if (n->n == cmpxchg)
2440 *bswap = true;
2441 else
2442 return NULL;
2443
2444 /* A nop on a non-memory source is just useless bit manipulation; give up. */
2445 if (!n->base_addr && n->n == cmpnop)
2446 return NULL;
2447
2448 n->range *= BITS_PER_UNIT;
2449 return source_stmt;
2450 }
2451
2452 namespace {
2453
2454 const pass_data pass_data_optimize_bswap =
2455 {
2456 GIMPLE_PASS, /* type */
2457 "bswap", /* name */
2458 OPTGROUP_NONE, /* optinfo_flags */
2459 TV_NONE, /* tv_id */
2460 PROP_ssa, /* properties_required */
2461 0, /* properties_provided */
2462 0, /* properties_destroyed */
2463 0, /* todo_flags_start */
2464 0, /* todo_flags_finish */
2465 };
2466
2467 class pass_optimize_bswap : public gimple_opt_pass
2468 {
2469 public:
2470 pass_optimize_bswap (gcc::context *ctxt)
2471 : gimple_opt_pass (pass_data_optimize_bswap, ctxt)
2472 {}
2473
2474 /* opt_pass methods: */
2475 virtual bool gate (function *)
2476 {
2477 return flag_expensive_optimizations && optimize;
2478 }
2479
2480 virtual unsigned int execute (function *);
2481
2482 }; // class pass_optimize_bswap
2483
2484 /* Perform the bswap optimization: replace the expression computed in the rhs
2485 of CUR_STMT by an equivalent bswap, load or load + bswap expression.
2486 Which of these alternatives replaces the rhs is given by N->base_addr (non
2487 null if a load is needed) and BSWAP. The type, VUSE and alias-set of the
2488 load to perform are also given in N, while the builtin bswap to invoke is
2489 given in FNDECL. Finally, if a load is involved, SRC_STMT refers to one of
2490 the load statements involved in constructing the rhs in CUR_STMT and N->range
2491 gives the size of the rhs expression for maintaining some statistics.
2492
2493 Note that if the replacement involves a load, CUR_STMT is moved just after
2494 SRC_STMT to do the load with the same VUSE, which can lead to CUR_STMT
2495 changing basic block. */
2496
2497 static bool
2498 bswap_replace (gimple *cur_stmt, gimple *src_stmt, tree fndecl,
2499 tree bswap_type, tree load_type, struct symbolic_number *n,
2500 bool bswap)
2501 {
2502 gimple_stmt_iterator gsi;
2503 tree src, tmp, tgt;
2504 gimple *bswap_stmt;
2505
2506 gsi = gsi_for_stmt (cur_stmt);
2507 src = gimple_assign_rhs1 (src_stmt);
2508 tgt = gimple_assign_lhs (cur_stmt);
2509
2510 /* Need to load the value from memory first. */
2511 if (n->base_addr)
2512 {
2513 gimple_stmt_iterator gsi_ins = gsi_for_stmt (src_stmt);
2514 tree addr_expr, addr_tmp, val_expr, val_tmp;
2515 tree load_offset_ptr, aligned_load_type;
2516 gimple *addr_stmt, *load_stmt;
2517 unsigned align;
2518 HOST_WIDE_INT load_offset = 0;
2519
2520 align = get_object_alignment (src);
2521 /* If the new access is smaller than the original one, we need
2522 to perform a big-endian adjustment. */
2523 if (BYTES_BIG_ENDIAN)
2524 {
2525 HOST_WIDE_INT bitsize, bitpos;
2526 machine_mode mode;
2527 int unsignedp, volatilep;
2528 tree offset;
2529
2530 get_inner_reference (src, &bitsize, &bitpos, &offset, &mode,
2531 &unsignedp, &volatilep, false);
2532 if (n->range < (unsigned HOST_WIDE_INT) bitsize)
2533 {
2534 load_offset = (bitsize - n->range) / BITS_PER_UNIT;
2535 unsigned HOST_WIDE_INT l
2536 = (load_offset * BITS_PER_UNIT) & (align - 1);
2537 if (l)
2538 align = l & -l;
2539 }
2540 }
2541
2542 if (bswap
2543 && align < GET_MODE_ALIGNMENT (TYPE_MODE (load_type))
2544 && SLOW_UNALIGNED_ACCESS (TYPE_MODE (load_type), align))
2545 return false;
2546
2547 /* Move cur_stmt just before one of the loads of the original
2548 to ensure it has the same VUSE. See PR61517 for what could
2549 go wrong. */
2550 gsi_move_before (&gsi, &gsi_ins);
2551 gsi = gsi_for_stmt (cur_stmt);
2552
2553 /* Compute address to load from and cast according to the size
2554 of the load. */
2555 addr_expr = build_fold_addr_expr (unshare_expr (src));
2556 if (is_gimple_mem_ref_addr (addr_expr))
2557 addr_tmp = addr_expr;
2558 else
2559 {
2560 addr_tmp = make_temp_ssa_name (TREE_TYPE (addr_expr), NULL,
2561 "load_src");
2562 addr_stmt = gimple_build_assign (addr_tmp, addr_expr);
2563 gsi_insert_before (&gsi, addr_stmt, GSI_SAME_STMT);
2564 }
2565
2566 /* Perform the load. */
2567 aligned_load_type = load_type;
2568 if (align < TYPE_ALIGN (load_type))
2569 aligned_load_type = build_aligned_type (load_type, align);
2570 load_offset_ptr = build_int_cst (n->alias_set, load_offset);
2571 val_expr = fold_build2 (MEM_REF, aligned_load_type, addr_tmp,
2572 load_offset_ptr);
2573
2574 if (!bswap)
2575 {
2576 if (n->range == 16)
2577 nop_stats.found_16bit++;
2578 else if (n->range == 32)
2579 nop_stats.found_32bit++;
2580 else
2581 {
2582 gcc_assert (n->range == 64);
2583 nop_stats.found_64bit++;
2584 }
2585
2586 /* Convert the result of load if necessary. */
2587 if (!useless_type_conversion_p (TREE_TYPE (tgt), load_type))
2588 {
2589 val_tmp = make_temp_ssa_name (aligned_load_type, NULL,
2590 "load_dst");
2591 load_stmt = gimple_build_assign (val_tmp, val_expr);
2592 gimple_set_vuse (load_stmt, n->vuse);
2593 gsi_insert_before (&gsi, load_stmt, GSI_SAME_STMT);
2594 gimple_assign_set_rhs_with_ops (&gsi, NOP_EXPR, val_tmp);
2595 }
2596 else
2597 {
2598 gimple_assign_set_rhs_with_ops (&gsi, MEM_REF, val_expr);
2599 gimple_set_vuse (cur_stmt, n->vuse);
2600 }
2601 update_stmt (cur_stmt);
2602
2603 if (dump_file)
2604 {
2605 fprintf (dump_file,
2606 "%d bit load in target endianness found at: ",
2607 (int) n->range);
2608 print_gimple_stmt (dump_file, cur_stmt, 0, 0);
2609 }
2610 return true;
2611 }
2612 else
2613 {
2614 val_tmp = make_temp_ssa_name (aligned_load_type, NULL, "load_dst");
2615 load_stmt = gimple_build_assign (val_tmp, val_expr);
2616 gimple_set_vuse (load_stmt, n->vuse);
2617 gsi_insert_before (&gsi, load_stmt, GSI_SAME_STMT);
2618 }
2619 src = val_tmp;
2620 }
2621
2622 if (n->range == 16)
2623 bswap_stats.found_16bit++;
2624 else if (n->range == 32)
2625 bswap_stats.found_32bit++;
2626 else
2627 {
2628 gcc_assert (n->range == 64);
2629 bswap_stats.found_64bit++;
2630 }
2631
2632 tmp = src;
2633
2634 /* Convert the src expression if necessary. */
2635 if (!useless_type_conversion_p (TREE_TYPE (tmp), bswap_type))
2636 {
2637 gimple *convert_stmt;
2638
2639 tmp = make_temp_ssa_name (bswap_type, NULL, "bswapsrc");
2640 convert_stmt = gimple_build_assign (tmp, NOP_EXPR, src);
2641 gsi_insert_before (&gsi, convert_stmt, GSI_SAME_STMT);
2642 }
2643
2644 /* The canonical form of a 16 bit bswap is a rotate expression. Only 16 bit
2645 values are handled this way, as rotating a 2N bit value by N bits is in
2646 general not equivalent to a bswap. Consider for instance 0x01020304 r>> 16,
2647 which gives 0x03040102 while a bswap of that value is 0x04030201. */
2648 if (bswap && n->range == 16)
2649 {
2650 tree count = build_int_cst (NULL, BITS_PER_UNIT);
2651 src = fold_build2 (LROTATE_EXPR, bswap_type, tmp, count);
2652 bswap_stmt = gimple_build_assign (NULL, src);
2653 }
2654 else
2655 bswap_stmt = gimple_build_call (fndecl, 1, tmp);
2656
2657 tmp = tgt;
2658
2659 /* Convert the result if necessary. */
2660 if (!useless_type_conversion_p (TREE_TYPE (tgt), bswap_type))
2661 {
2662 gimple *convert_stmt;
2663
2664 tmp = make_temp_ssa_name (bswap_type, NULL, "bswapdst");
2665 convert_stmt = gimple_build_assign (tgt, NOP_EXPR, tmp);
2666 gsi_insert_after (&gsi, convert_stmt, GSI_SAME_STMT);
2667 }
2668
2669 gimple_set_lhs (bswap_stmt, tmp);
2670
2671 if (dump_file)
2672 {
2673 fprintf (dump_file, "%d bit bswap implementation found at: ",
2674 (int) n->range);
2675 print_gimple_stmt (dump_file, cur_stmt, 0, 0);
2676 }
2677
2678 gsi_insert_after (&gsi, bswap_stmt, GSI_SAME_STMT);
2679 gsi_remove (&gsi, true);
2680 return true;
2681 }
2682
2683 /* Find manual byte swap implementations as well as loads in a given
2684 endianness. Byte swaps are turned into a bswap builtin invocation,
2685 while endian loads are converted to a bswap builtin invocation or a
2686 simple load according to the target endianness. */
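/* A source-level sketch of the kind of load this recognizes (assuming p is
   an unsigned char pointer):

     uint32_t v = p[0] | (p[1] << 8) | (p[2] << 16) | (p[3] << 24);

   On a little-endian target this becomes a plain 32-bit load; on a
   big-endian target it becomes a 32-bit load followed by a bswap.  */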
2687
2688 unsigned int
2689 pass_optimize_bswap::execute (function *fun)
2690 {
2691 basic_block bb;
2692 bool bswap32_p, bswap64_p;
2693 bool changed = false;
2694 tree bswap32_type = NULL_TREE, bswap64_type = NULL_TREE;
2695
2696 if (BITS_PER_UNIT != 8)
2697 return 0;
2698
2699 bswap32_p = (builtin_decl_explicit_p (BUILT_IN_BSWAP32)
2700 && optab_handler (bswap_optab, SImode) != CODE_FOR_nothing);
2701 bswap64_p = (builtin_decl_explicit_p (BUILT_IN_BSWAP64)
2702 && (optab_handler (bswap_optab, DImode) != CODE_FOR_nothing
2703 || (bswap32_p && word_mode == SImode)));
2704
2705 /* Determine the argument type of the builtins. The code later on
2706 assumes that the return and argument type are the same. */
2707 if (bswap32_p)
2708 {
2709 tree fndecl = builtin_decl_explicit (BUILT_IN_BSWAP32);
2710 bswap32_type = TREE_VALUE (TYPE_ARG_TYPES (TREE_TYPE (fndecl)));
2711 }
2712
2713 if (bswap64_p)
2714 {
2715 tree fndecl = builtin_decl_explicit (BUILT_IN_BSWAP64);
2716 bswap64_type = TREE_VALUE (TYPE_ARG_TYPES (TREE_TYPE (fndecl)));
2717 }
2718
2719 memset (&nop_stats, 0, sizeof (nop_stats));
2720 memset (&bswap_stats, 0, sizeof (bswap_stats));
2721
2722 FOR_EACH_BB_FN (bb, fun)
2723 {
2724 gimple_stmt_iterator gsi;
2725
2726 /* We do a reverse scan for bswap patterns to make sure we get the
2727 widest match. As bswap pattern matching doesn't handle previously
2728 inserted smaller bswap replacements as sub-patterns, the wider
2729 variant wouldn't be detected. */
2730 for (gsi = gsi_last_bb (bb); !gsi_end_p (gsi);)
2731 {
2732 gimple *src_stmt, *cur_stmt = gsi_stmt (gsi);
2733 tree fndecl = NULL_TREE, bswap_type = NULL_TREE, load_type;
2734 enum tree_code code;
2735 struct symbolic_number n;
2736 bool bswap;
2737
2738 /* This gsi_prev (&gsi) is not part of the for loop because cur_stmt
2739 might be moved to a different basic block by bswap_replace and gsi
2740 must not point to it if that's the case. Doing the gsi_prev
2741 here makes sure that gsi points to the statement previous to
2742 cur_stmt while still making sure that all statements are
2743 considered in this basic block. */
2744 gsi_prev (&gsi);
2745
2746 if (!is_gimple_assign (cur_stmt))
2747 continue;
2748
2749 code = gimple_assign_rhs_code (cur_stmt);
2750 switch (code)
2751 {
2752 case LROTATE_EXPR:
2753 case RROTATE_EXPR:
2754 if (!tree_fits_uhwi_p (gimple_assign_rhs2 (cur_stmt))
2755 || tree_to_uhwi (gimple_assign_rhs2 (cur_stmt))
2756 % BITS_PER_UNIT)
2757 continue;
2758 /* Fall through. */
2759 case BIT_IOR_EXPR:
2760 break;
2761 default:
2762 continue;
2763 }
2764
2765 src_stmt = find_bswap_or_nop (cur_stmt, &n, &bswap);
2766
2767 if (!src_stmt)
2768 continue;
2769
2770 switch (n.range)
2771 {
2772 case 16:
2773 /* Already in canonical form, nothing to do. */
2774 if (code == LROTATE_EXPR || code == RROTATE_EXPR)
2775 continue;
2776 load_type = bswap_type = uint16_type_node;
2777 break;
2778 case 32:
2779 load_type = uint32_type_node;
2780 if (bswap32_p)
2781 {
2782 fndecl = builtin_decl_explicit (BUILT_IN_BSWAP32);
2783 bswap_type = bswap32_type;
2784 }
2785 break;
2786 case 64:
2787 load_type = uint64_type_node;
2788 if (bswap64_p)
2789 {
2790 fndecl = builtin_decl_explicit (BUILT_IN_BSWAP64);
2791 bswap_type = bswap64_type;
2792 }
2793 break;
2794 default:
2795 continue;
2796 }
2797
2798 if (bswap && !fndecl && n.range != 16)
2799 continue;
2800
2801 if (bswap_replace (cur_stmt, src_stmt, fndecl, bswap_type, load_type,
2802 &n, bswap))
2803 changed = true;
2804 }
2805 }
2806
2807 statistics_counter_event (fun, "16-bit nop implementations found",
2808 nop_stats.found_16bit);
2809 statistics_counter_event (fun, "32-bit nop implementations found",
2810 nop_stats.found_32bit);
2811 statistics_counter_event (fun, "64-bit nop implementations found",
2812 nop_stats.found_64bit);
2813 statistics_counter_event (fun, "16-bit bswap implementations found",
2814 bswap_stats.found_16bit);
2815 statistics_counter_event (fun, "32-bit bswap implementations found",
2816 bswap_stats.found_32bit);
2817 statistics_counter_event (fun, "64-bit bswap implementations found",
2818 bswap_stats.found_64bit);
2819
2820 return (changed ? TODO_update_ssa : 0);
2821 }
2822
2823 } // anon namespace
2824
2825 gimple_opt_pass *
2826 make_pass_optimize_bswap (gcc::context *ctxt)
2827 {
2828 return new pass_optimize_bswap (ctxt);
2829 }
2830
2831 /* Return true if STMT is a type conversion operation that can be stripped
2832 when used in a widening multiply operation. */
2833 static bool
2834 widening_mult_conversion_strippable_p (tree result_type, gimple *stmt)
2835 {
2836 enum tree_code rhs_code = gimple_assign_rhs_code (stmt);
2837
2838 if (TREE_CODE (result_type) == INTEGER_TYPE)
2839 {
2840 tree op_type;
2841 tree inner_op_type;
2842
2843 if (!CONVERT_EXPR_CODE_P (rhs_code))
2844 return false;
2845
2846 op_type = TREE_TYPE (gimple_assign_lhs (stmt));
2847
2848 /* If the type of OP has the same precision as the result, then
2849 we can strip this conversion. The multiply operation will be
2850 selected to create the correct extension as a by-product. */
2851 if (TYPE_PRECISION (result_type) == TYPE_PRECISION (op_type))
2852 return true;
2853
2854 /* We can also strip a conversion if it preserves the signed-ness of
2855 the operation and doesn't narrow the range. */
2856 inner_op_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
2857
2858 /* If the inner-most type is unsigned, then we can strip any
2859 intermediate widening operation. If it's signed, then the
2860 intermediate widening operation must also be signed. */
2861 if ((TYPE_UNSIGNED (inner_op_type)
2862 || TYPE_UNSIGNED (op_type) == TYPE_UNSIGNED (inner_op_type))
2863 && TYPE_PRECISION (op_type) > TYPE_PRECISION (inner_op_type))
2864 return true;
2865
2866 return false;
2867 }
2868
2869 return rhs_code == FIXED_CONVERT_EXPR;
2870 }
2871
2872 /* Return true if RHS is a suitable operand for a widening multiplication,
2873 assuming a target type of TYPE.
2874 There are two cases:
2875
2876 - RHS makes some value at least twice as wide. Store that value
2877 in *NEW_RHS_OUT if so, and store its type in *TYPE_OUT.
2878
2879 - RHS is an integer constant. Store that value in *NEW_RHS_OUT if so,
2880 but leave *TYPE_OUT untouched. */
2881
2882 static bool
2883 is_widening_mult_rhs_p (tree type, tree rhs, tree *type_out,
2884 tree *new_rhs_out)
2885 {
2886 gimple *stmt;
2887 tree type1, rhs1;
2888
2889 if (TREE_CODE (rhs) == SSA_NAME)
2890 {
2891 stmt = SSA_NAME_DEF_STMT (rhs);
2892 if (is_gimple_assign (stmt))
2893 {
2894 if (! widening_mult_conversion_strippable_p (type, stmt))
2895 rhs1 = rhs;
2896 else
2897 {
2898 rhs1 = gimple_assign_rhs1 (stmt);
2899
2900 if (TREE_CODE (rhs1) == INTEGER_CST)
2901 {
2902 *new_rhs_out = rhs1;
2903 *type_out = NULL;
2904 return true;
2905 }
2906 }
2907 }
2908 else
2909 rhs1 = rhs;
2910
2911 type1 = TREE_TYPE (rhs1);
2912
2913 if (TREE_CODE (type1) != TREE_CODE (type)
2914 || TYPE_PRECISION (type1) * 2 > TYPE_PRECISION (type))
2915 return false;
2916
2917 *new_rhs_out = rhs1;
2918 *type_out = type1;
2919 return true;
2920 }
2921
2922 if (TREE_CODE (rhs) == INTEGER_CST)
2923 {
2924 *new_rhs_out = rhs;
2925 *type_out = NULL;
2926 return true;
2927 }
2928
2929 return false;
2930 }
2931
2932 /* Return true if STMT performs a widening multiplication, assuming the
2933 output type is TYPE. If so, store the unwidened types of the operands
2934 in *TYPE1_OUT and *TYPE2_OUT respectively. Also fill *RHS1_OUT and
2935 *RHS2_OUT such that converting those operands to types *TYPE1_OUT
2936 and *TYPE2_OUT would give the operands of the multiplication. */
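/* For example (a sketch), assuming 32-bit int and 64-bit long, the
   statement computing

     long r = (long) a * (long) b;   with a and b of type int

   is a widening multiplication: *TYPE1_OUT and *TYPE2_OUT would both be
   int, and *RHS1_OUT and *RHS2_OUT the uncast a and b.  */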
2937
2938 static bool
2939 is_widening_mult_p (gimple *stmt,
2940 tree *type1_out, tree *rhs1_out,
2941 tree *type2_out, tree *rhs2_out)
2942 {
2943 tree type = TREE_TYPE (gimple_assign_lhs (stmt));
2944
2945 if (TREE_CODE (type) != INTEGER_TYPE
2946 && TREE_CODE (type) != FIXED_POINT_TYPE)
2947 return false;
2948
2949 if (!is_widening_mult_rhs_p (type, gimple_assign_rhs1 (stmt), type1_out,
2950 rhs1_out))
2951 return false;
2952
2953 if (!is_widening_mult_rhs_p (type, gimple_assign_rhs2 (stmt), type2_out,
2954 rhs2_out))
2955 return false;
2956
2957 if (*type1_out == NULL)
2958 {
2959 if (*type2_out == NULL || !int_fits_type_p (*rhs1_out, *type2_out))
2960 return false;
2961 *type1_out = *type2_out;
2962 }
2963
2964 if (*type2_out == NULL)
2965 {
2966 if (!int_fits_type_p (*rhs2_out, *type1_out))
2967 return false;
2968 *type2_out = *type1_out;
2969 }
2970
2971 /* Ensure that the larger of the two operands comes first. */
2972 if (TYPE_PRECISION (*type1_out) < TYPE_PRECISION (*type2_out))
2973 {
2974 std::swap (*type1_out, *type2_out);
2975 std::swap (*rhs1_out, *rhs2_out);
2976 }
2977
2978 return true;
2979 }
2980
2981 /* Process a single gimple statement STMT, which has a MULT_EXPR as
2982 its rhs, and try to convert it into a WIDEN_MULT_EXPR. The return
2983 value is true iff we converted the statement. */
2984
2985 static bool
2986 convert_mult_to_widen (gimple *stmt, gimple_stmt_iterator *gsi)
2987 {
2988 tree lhs, rhs1, rhs2, type, type1, type2;
2989 enum insn_code handler;
2990 machine_mode to_mode, from_mode, actual_mode;
2991 optab op;
2992 int actual_precision;
2993 location_t loc = gimple_location (stmt);
2994 bool from_unsigned1, from_unsigned2;
2995
2996 lhs = gimple_assign_lhs (stmt);
2997 type = TREE_TYPE (lhs);
2998 if (TREE_CODE (type) != INTEGER_TYPE)
2999 return false;
3000
3001 if (!is_widening_mult_p (stmt, &type1, &rhs1, &type2, &rhs2))
3002 return false;
3003
3004 to_mode = TYPE_MODE (type);
3005 from_mode = TYPE_MODE (type1);
3006 from_unsigned1 = TYPE_UNSIGNED (type1);
3007 from_unsigned2 = TYPE_UNSIGNED (type2);
3008
3009 if (from_unsigned1 && from_unsigned2)
3010 op = umul_widen_optab;
3011 else if (!from_unsigned1 && !from_unsigned2)
3012 op = smul_widen_optab;
3013 else
3014 op = usmul_widen_optab;
3015
3016 handler = find_widening_optab_handler_and_mode (op, to_mode, from_mode,
3017 0, &actual_mode);
3018
3019 if (handler == CODE_FOR_nothing)
3020 {
3021 if (op != smul_widen_optab)
3022 {
3023 /* We can use a signed multiply with unsigned types as long as
3024 there is a wider mode to use, or it is the smaller of the two
3025 types that is unsigned. Note that type1 >= type2, always. */
3026 if ((TYPE_UNSIGNED (type1)
3027 && TYPE_PRECISION (type1) == GET_MODE_PRECISION (from_mode))
3028 || (TYPE_UNSIGNED (type2)
3029 && TYPE_PRECISION (type2) == GET_MODE_PRECISION (from_mode)))
3030 {
3031 from_mode = GET_MODE_WIDER_MODE (from_mode);
3032 if (GET_MODE_SIZE (to_mode) <= GET_MODE_SIZE (from_mode))
3033 return false;
3034 }
3035
3036 op = smul_widen_optab;
3037 handler = find_widening_optab_handler_and_mode (op, to_mode,
3038 from_mode, 0,
3039 &actual_mode);
3040
3041 if (handler == CODE_FOR_nothing)
3042 return false;
3043
3044 from_unsigned1 = from_unsigned2 = false;
3045 }
3046 else
3047 return false;
3048 }
3049
3050 /* Ensure that the inputs to the handler are in the correct precision
3051 for the opcode. This will be the full mode size. */
3052 actual_precision = GET_MODE_PRECISION (actual_mode);
3053 if (2 * actual_precision > TYPE_PRECISION (type))
3054 return false;
3055 if (actual_precision != TYPE_PRECISION (type1)
3056 || from_unsigned1 != TYPE_UNSIGNED (type1))
3057 rhs1 = build_and_insert_cast (gsi, loc,
3058 build_nonstandard_integer_type
3059 (actual_precision, from_unsigned1), rhs1);
3060 if (actual_precision != TYPE_PRECISION (type2)
3061 || from_unsigned2 != TYPE_UNSIGNED (type2))
3062 rhs2 = build_and_insert_cast (gsi, loc,
3063 build_nonstandard_integer_type
3064 (actual_precision, from_unsigned2), rhs2);
3065
3066 /* Handle constants. */
3067 if (TREE_CODE (rhs1) == INTEGER_CST)
3068 rhs1 = fold_convert (type1, rhs1);
3069 if (TREE_CODE (rhs2) == INTEGER_CST)
3070 rhs2 = fold_convert (type2, rhs2);
3071
3072 gimple_assign_set_rhs1 (stmt, rhs1);
3073 gimple_assign_set_rhs2 (stmt, rhs2);
3074 gimple_assign_set_rhs_code (stmt, WIDEN_MULT_EXPR);
3075 update_stmt (stmt);
3076 widen_mul_stats.widen_mults_inserted++;
3077 return true;
3078 }
3079
3080 /* Process a single gimple statement STMT, which is found at the
3081 iterator GSI and has either a PLUS_EXPR or a MINUS_EXPR as its
3082 rhs (given by CODE), and try to convert it into a
3083 WIDEN_MULT_PLUS_EXPR or a WIDEN_MULT_MINUS_EXPR. The return value
3084 is true iff we converted the statement. */
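/* A sketch of the kind of statement this targets, continuing the widening
   multiply example above:

     acc = acc + (long) a * (long) b;   with a and b of type int

   which can be rewritten into a single WIDEN_MULT_PLUS_EXPR when the target
   has a suitable multiply-and-accumulate instruction.  */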
3085
3086 static bool
3087 convert_plusminus_to_widen (gimple_stmt_iterator *gsi, gimple *stmt,
3088 enum tree_code code)
3089 {
3090 gimple *rhs1_stmt = NULL, *rhs2_stmt = NULL;
3091 gimple *conv1_stmt = NULL, *conv2_stmt = NULL, *conv_stmt;
3092 tree type, type1, type2, optype;
3093 tree lhs, rhs1, rhs2, mult_rhs1, mult_rhs2, add_rhs;
3094 enum tree_code rhs1_code = ERROR_MARK, rhs2_code = ERROR_MARK;
3095 optab this_optab;
3096 enum tree_code wmult_code;
3097 enum insn_code handler;
3098 machine_mode to_mode, from_mode, actual_mode;
3099 location_t loc = gimple_location (stmt);
3100 int actual_precision;
3101 bool from_unsigned1, from_unsigned2;
3102
3103 lhs = gimple_assign_lhs (stmt);
3104 type = TREE_TYPE (lhs);
3105 if (TREE_CODE (type) != INTEGER_TYPE
3106 && TREE_CODE (type) != FIXED_POINT_TYPE)
3107 return false;
3108
3109 if (code == MINUS_EXPR)
3110 wmult_code = WIDEN_MULT_MINUS_EXPR;
3111 else
3112 wmult_code = WIDEN_MULT_PLUS_EXPR;
3113
3114 rhs1 = gimple_assign_rhs1 (stmt);
3115 rhs2 = gimple_assign_rhs2 (stmt);
3116
3117 if (TREE_CODE (rhs1) == SSA_NAME)
3118 {
3119 rhs1_stmt = SSA_NAME_DEF_STMT (rhs1);
3120 if (is_gimple_assign (rhs1_stmt))
3121 rhs1_code = gimple_assign_rhs_code (rhs1_stmt);
3122 }
3123
3124 if (TREE_CODE (rhs2) == SSA_NAME)
3125 {
3126 rhs2_stmt = SSA_NAME_DEF_STMT (rhs2);
3127 if (is_gimple_assign (rhs2_stmt))
3128 rhs2_code = gimple_assign_rhs_code (rhs2_stmt);
3129 }
3130
3131 /* Allow for one conversion statement between the multiply
3132 and addition/subtraction statement. If there is more than
3133 one conversion then we assume they would invalidate this
3134 transformation. If that's not the case then they should have
3135 been folded before now. */
3136 if (CONVERT_EXPR_CODE_P (rhs1_code))
3137 {
3138 conv1_stmt = rhs1_stmt;
3139 rhs1 = gimple_assign_rhs1 (rhs1_stmt);
3140 if (TREE_CODE (rhs1) == SSA_NAME)
3141 {
3142 rhs1_stmt = SSA_NAME_DEF_STMT (rhs1);
3143 if (is_gimple_assign (rhs1_stmt))
3144 rhs1_code = gimple_assign_rhs_code (rhs1_stmt);
3145 }
3146 else
3147 return false;
3148 }
3149 if (CONVERT_EXPR_CODE_P (rhs2_code))
3150 {
3151 conv2_stmt = rhs2_stmt;
3152 rhs2 = gimple_assign_rhs1 (rhs2_stmt);
3153 if (TREE_CODE (rhs2) == SSA_NAME)
3154 {
3155 rhs2_stmt = SSA_NAME_DEF_STMT (rhs2);
3156 if (is_gimple_assign (rhs2_stmt))
3157 rhs2_code = gimple_assign_rhs_code (rhs2_stmt);
3158 }
3159 else
3160 return false;
3161 }
3162
3163 /* If code is WIDEN_MULT_EXPR then it would seem unnecessary to call
3164 is_widening_mult_p, but we still need the returned rhs values.
3165
3166 It might also appear that it would be sufficient to use the existing
3167 operands of the widening multiply, but that would limit the choice of
3168 multiply-and-accumulate instructions.
3169
3170 If the widened-multiplication result has more than one use, it is
3171 probably wiser not to do the conversion. */
3172 if (code == PLUS_EXPR
3173 && (rhs1_code == MULT_EXPR || rhs1_code == WIDEN_MULT_EXPR))
3174 {
3175 if (!has_single_use (rhs1)
3176 || !is_widening_mult_p (rhs1_stmt, &type1, &mult_rhs1,
3177 &type2, &mult_rhs2))
3178 return false;
3179 add_rhs = rhs2;
3180 conv_stmt = conv1_stmt;
3181 }
3182 else if (rhs2_code == MULT_EXPR || rhs2_code == WIDEN_MULT_EXPR)
3183 {
3184 if (!has_single_use (rhs2)
3185 || !is_widening_mult_p (rhs2_stmt, &type1, &mult_rhs1,
3186 &type2, &mult_rhs2))
3187 return false;
3188 add_rhs = rhs1;
3189 conv_stmt = conv2_stmt;
3190 }
3191 else
3192 return false;
3193
3194 to_mode = TYPE_MODE (type);
3195 from_mode = TYPE_MODE (type1);
3196 from_unsigned1 = TYPE_UNSIGNED (type1);
3197 from_unsigned2 = TYPE_UNSIGNED (type2);
3198 optype = type1;
3199
3200 /* There's no such thing as a mixed sign madd yet, so use a wider mode. */
3201 if (from_unsigned1 != from_unsigned2)
3202 {
3203 if (!INTEGRAL_TYPE_P (type))
3204 return false;
3205 /* We can use a signed multiply with unsigned types as long as
3206 there is a wider mode to use, or it is the smaller of the two
3207 types that is unsigned. Note that type1 >= type2, always. */
3208 if ((from_unsigned1
3209 && TYPE_PRECISION (type1) == GET_MODE_PRECISION (from_mode))
3210 || (from_unsigned2
3211 && TYPE_PRECISION (type2) == GET_MODE_PRECISION (from_mode)))
3212 {
3213 from_mode = GET_MODE_WIDER_MODE (from_mode);
3214 if (GET_MODE_SIZE (from_mode) >= GET_MODE_SIZE (to_mode))
3215 return false;
3216 }
3217
3218 from_unsigned1 = from_unsigned2 = false;
3219 optype = build_nonstandard_integer_type (GET_MODE_PRECISION (from_mode),
3220 false);
3221 }
3222
3223 /* If there was a conversion between the multiply and addition
3224 then we need to make sure it fits a multiply-and-accumulate.
3225 It should be a single mode change which does not change the
3226 value. */
3227 if (conv_stmt)
3228 {
3229 /* We use the original, unmodified data types for this. */
3230 tree from_type = TREE_TYPE (gimple_assign_rhs1 (conv_stmt));
3231 tree to_type = TREE_TYPE (gimple_assign_lhs (conv_stmt));
3232 int data_size = TYPE_PRECISION (type1) + TYPE_PRECISION (type2);
3233 bool is_unsigned = TYPE_UNSIGNED (type1) && TYPE_UNSIGNED (type2);
3234
3235 if (TYPE_PRECISION (from_type) > TYPE_PRECISION (to_type))
3236 {
3237 /* Conversion is a truncate. */
3238 if (TYPE_PRECISION (to_type) < data_size)
3239 return false;
3240 }
3241 else if (TYPE_PRECISION (from_type) < TYPE_PRECISION (to_type))
3242 {
3243 /* Conversion is an extend. Check it's the right sort. */
3244 if (TYPE_UNSIGNED (from_type) != is_unsigned
3245 && !(is_unsigned && TYPE_PRECISION (from_type) > data_size))
3246 return false;
3247 }
3248 /* else convert is a no-op for our purposes. */
3249 }
3250
3251 /* Verify that the machine can perform a widening multiply
3252 accumulate in this mode/signedness combination, otherwise
3253 this transformation is likely to pessimize code. */
3254 this_optab = optab_for_tree_code (wmult_code, optype, optab_default);
3255 handler = find_widening_optab_handler_and_mode (this_optab, to_mode,
3256 from_mode, 0, &actual_mode);
3257
3258 if (handler == CODE_FOR_nothing)
3259 return false;
3260
3261 /* Ensure that the inputs to the handler are in the correct precision
3262 for the opcode. This will be the full mode size. */
3263 actual_precision = GET_MODE_PRECISION (actual_mode);
3264 if (actual_precision != TYPE_PRECISION (type1)
3265 || from_unsigned1 != TYPE_UNSIGNED (type1))
3266 mult_rhs1 = build_and_insert_cast (gsi, loc,
3267 build_nonstandard_integer_type
3268 (actual_precision, from_unsigned1),
3269 mult_rhs1);
3270 if (actual_precision != TYPE_PRECISION (type2)
3271 || from_unsigned2 != TYPE_UNSIGNED (type2))
3272 mult_rhs2 = build_and_insert_cast (gsi, loc,
3273 build_nonstandard_integer_type
3274 (actual_precision, from_unsigned2),
3275 mult_rhs2);
3276
3277 if (!useless_type_conversion_p (type, TREE_TYPE (add_rhs)))
3278 add_rhs = build_and_insert_cast (gsi, loc, type, add_rhs);
3279
3280 /* Handle constants. */
3281 if (TREE_CODE (mult_rhs1) == INTEGER_CST)
3282 mult_rhs1 = fold_convert (type1, mult_rhs1);
3283 if (TREE_CODE (mult_rhs2) == INTEGER_CST)
3284 mult_rhs2 = fold_convert (type2, mult_rhs2);
3285
3286 gimple_assign_set_rhs_with_ops (gsi, wmult_code, mult_rhs1, mult_rhs2,
3287 add_rhs);
3288 update_stmt (gsi_stmt (*gsi));
3289 widen_mul_stats.maccs_inserted++;
3290 return true;
3291 }
3292
3293 /* Combine the multiplication at MUL_STMT with operands OP1 and OP2
3294 with uses in additions and subtractions to form fused multiply-add
3295 operations. Returns true if successful and MUL_STMT should be removed. */
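/* A source-level sketch of the transformations this enables (assuming the
   target provides the corresponding fma/fnma patterns):

     t = a * b;  x = t + c;   becomes   x = fma (a, b, c)
     t = a * b;  x = t - c;   becomes   x = fma (a, b, -c)
     t = a * b;  x = c - t;   becomes   x = fma (-a, b, c)

   The multiplication then becomes dead and is removed by the caller.  */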
3296
3297 static bool
3298 convert_mult_to_fma (gimple *mul_stmt, tree op1, tree op2)
3299 {
3300 tree mul_result = gimple_get_lhs (mul_stmt);
3301 tree type = TREE_TYPE (mul_result);
3302 gimple *use_stmt, *neguse_stmt;
3303 gassign *fma_stmt;
3304 use_operand_p use_p;
3305 imm_use_iterator imm_iter;
3306
3307 if (FLOAT_TYPE_P (type)
3308 && flag_fp_contract_mode == FP_CONTRACT_OFF)
3309 return false;
3310
3311 /* We don't want to do bitfield reduction ops. */
3312 if (INTEGRAL_TYPE_P (type)
3313 && (TYPE_PRECISION (type)
3314 != GET_MODE_PRECISION (TYPE_MODE (type))))
3315 return false;
3316
3317 /* If the target doesn't support it, don't generate it. We assume that
3318 if fma isn't available then fms, fnma or fnms are not either. */
3319 if (optab_handler (fma_optab, TYPE_MODE (type)) == CODE_FOR_nothing)
3320 return false;
3321
3322 /* If the multiplication has zero uses, it is kept around probably because
3323 of -fnon-call-exceptions. Don't optimize it away in that case,
3324 that is DCE's job. */
3325 if (has_zero_uses (mul_result))
3326 return false;
3327
3328 /* Make sure that the multiplication statement becomes dead after
3329 the transformation, i.e. that all uses are transformed to FMAs.
3330 This means we assume that an FMA operation has the same cost
3331 as an addition. */
3332 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, mul_result)
3333 {
3334 enum tree_code use_code;
3335 tree result = mul_result;
3336 bool negate_p = false;
3337
3338 use_stmt = USE_STMT (use_p);
3339
3340 if (is_gimple_debug (use_stmt))
3341 continue;
3342
3343 /* For now restrict this operation to single basic blocks. In theory
3344 we would want to support sinking the multiplication in
3345 m = a*b;
3346 if ()
3347 ma = m + c;
3348 else
3349 d = m;
3350 to form a fma in the then block and sink the multiplication to the
3351 else block. */
3352 if (gimple_bb (use_stmt) != gimple_bb (mul_stmt))
3353 return false;
3354
3355 if (!is_gimple_assign (use_stmt))
3356 return false;
3357
3358 use_code = gimple_assign_rhs_code (use_stmt);
3359
3360 /* A negate on the multiplication leads to FNMA. */
3361 if (use_code == NEGATE_EXPR)
3362 {
3363 ssa_op_iter iter;
3364 use_operand_p usep;
3365
3366 result = gimple_assign_lhs (use_stmt);
3367
3368 /* Make sure the negate statement becomes dead with this
3369 single transformation. */
3370 if (!single_imm_use (gimple_assign_lhs (use_stmt),
3371 &use_p, &neguse_stmt))
3372 return false;
3373
3374 /* Make sure the multiplication isn't also used on that stmt. */
3375 FOR_EACH_PHI_OR_STMT_USE (usep, neguse_stmt, iter, SSA_OP_USE)
3376 if (USE_FROM_PTR (usep) == mul_result)
3377 return false;
3378
3379 /* Re-validate. */
3380 use_stmt = neguse_stmt;
3381 if (gimple_bb (use_stmt) != gimple_bb (mul_stmt))
3382 return false;
3383 if (!is_gimple_assign (use_stmt))
3384 return false;
3385
3386 use_code = gimple_assign_rhs_code (use_stmt);
3387 negate_p = true;
3388 }
3389
3390 switch (use_code)
3391 {
3392 case MINUS_EXPR:
3393 if (gimple_assign_rhs2 (use_stmt) == result)
3394 negate_p = !negate_p;
3395 break;
3396 case PLUS_EXPR:
3397 break;
3398 default:
3399 /* FMA can only be formed from PLUS and MINUS. */
3400 return false;
3401 }
3402
3403 /* If the subtrahend (gimple_assign_rhs2 (use_stmt)) is computed
3404 by a MULT_EXPR that we'll visit later, we might be able to
3405 get a more profitable match with fnma.
3406 OTOH, if we don't, a negate / fma pair likely has lower latency
3407 than a mult / subtract pair. */
3408 if (use_code == MINUS_EXPR && !negate_p
3409 && gimple_assign_rhs1 (use_stmt) == result
3410 && optab_handler (fms_optab, TYPE_MODE (type)) == CODE_FOR_nothing
3411 && optab_handler (fnma_optab, TYPE_MODE (type)) != CODE_FOR_nothing)
3412 {
3413 tree rhs2 = gimple_assign_rhs2 (use_stmt);
3414
3415 if (TREE_CODE (rhs2) == SSA_NAME)
3416 {
3417 gimple *stmt2 = SSA_NAME_DEF_STMT (rhs2);
3418 if (has_single_use (rhs2)
3419 && is_gimple_assign (stmt2)
3420 && gimple_assign_rhs_code (stmt2) == MULT_EXPR)
3421 return false;
3422 }
3423 }
3424
3425 /* We can't handle a * b + a * b. */
3426 if (gimple_assign_rhs1 (use_stmt) == gimple_assign_rhs2 (use_stmt))
3427 return false;
3428
3429 /* While it is possible to validate whether or not the exact form
3430 that we've recognized is available in the backend, the assumption
3431 is that the transformation is never a loss. For instance, suppose
3432 the target only has the plain FMA pattern available. Consider
3433 a*b-c -> fma(a,b,-c): we've exchanged MUL+SUB for FMA+NEG, which
3434 is still two operations. Consider -(a*b)-c -> fma(-a,b,-c): we
3435 still have 3 operations, but in the FMA form the two NEGs are
3436 independent and could be run in parallel. */
3437 }
3438
3439 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, mul_result)
3440 {
3441 gimple_stmt_iterator gsi = gsi_for_stmt (use_stmt);
3442 enum tree_code use_code;
3443 tree addop, mulop1 = op1, result = mul_result;
3444 bool negate_p = false;
3445
3446 if (is_gimple_debug (use_stmt))
3447 continue;
3448
3449 use_code = gimple_assign_rhs_code (use_stmt);
3450 if (use_code == NEGATE_EXPR)
3451 {
3452 result = gimple_assign_lhs (use_stmt);
3453 single_imm_use (gimple_assign_lhs (use_stmt), &use_p, &neguse_stmt);
3454 gsi_remove (&gsi, true);
3455 release_defs (use_stmt);
3456
3457 use_stmt = neguse_stmt;
3458 gsi = gsi_for_stmt (use_stmt);
3459 use_code = gimple_assign_rhs_code (use_stmt);
3460 negate_p = true;
3461 }
3462
3463 if (gimple_assign_rhs1 (use_stmt) == result)
3464 {
3465 addop = gimple_assign_rhs2 (use_stmt);
3466 /* a * b - c -> a * b + (-c) */
3467 if (gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
3468 addop = force_gimple_operand_gsi (&gsi,
3469 build1 (NEGATE_EXPR,
3470 type, addop),
3471 true, NULL_TREE, true,
3472 GSI_SAME_STMT);
3473 }
3474 else
3475 {
3476 addop = gimple_assign_rhs1 (use_stmt);
3477 /* a - b * c -> (-b) * c + a */
3478 if (gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
3479 negate_p = !negate_p;
3480 }
3481
3482 if (negate_p)
3483 mulop1 = force_gimple_operand_gsi (&gsi,
3484 build1 (NEGATE_EXPR,
3485 type, mulop1),
3486 true, NULL_TREE, true,
3487 GSI_SAME_STMT);
3488
3489 fma_stmt = gimple_build_assign (gimple_assign_lhs (use_stmt),
3490 FMA_EXPR, mulop1, op2, addop);
3491 gsi_replace (&gsi, fma_stmt, true);
3492 widen_mul_stats.fmas_inserted++;
3493 }
3494
3495 return true;
3496 }
3497
3498 /* Find integer multiplications where the operands are extended from
3499 smaller types, and replace the MULT_EXPR with a WIDEN_MULT_EXPR
3500 where appropriate. */
3501
3502 namespace {
3503
3504 const pass_data pass_data_optimize_widening_mul =
3505 {
3506 GIMPLE_PASS, /* type */
3507 "widening_mul", /* name */
3508 OPTGROUP_NONE, /* optinfo_flags */
3509 TV_NONE, /* tv_id */
3510 PROP_ssa, /* properties_required */
3511 0, /* properties_provided */
3512 0, /* properties_destroyed */
3513 0, /* todo_flags_start */
3514 TODO_update_ssa, /* todo_flags_finish */
3515 };
3516
3517 class pass_optimize_widening_mul : public gimple_opt_pass
3518 {
3519 public:
3520 pass_optimize_widening_mul (gcc::context *ctxt)
3521 : gimple_opt_pass (pass_data_optimize_widening_mul, ctxt)
3522 {}
3523
3524 /* opt_pass methods: */
3525 virtual bool gate (function *)
3526 {
3527 return flag_expensive_optimizations && optimize;
3528 }
3529
3530 virtual unsigned int execute (function *);
3531
3532 }; // class pass_optimize_widening_mul
3533
3534 unsigned int
3535 pass_optimize_widening_mul::execute (function *fun)
3536 {
3537 basic_block bb;
3538 bool cfg_changed = false;
3539
3540 memset (&widen_mul_stats, 0, sizeof (widen_mul_stats));
3541
3542 FOR_EACH_BB_FN (bb, fun)
3543 {
3544 gimple_stmt_iterator gsi;
3545
3546 for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi);)
3547 {
3548 gimple *stmt = gsi_stmt (gsi);
3549 enum tree_code code;
3550
3551 if (is_gimple_assign (stmt))
3552 {
3553 code = gimple_assign_rhs_code (stmt);
3554 switch (code)
3555 {
3556 case MULT_EXPR:
3557 if (!convert_mult_to_widen (stmt, &gsi)
3558 && convert_mult_to_fma (stmt,
3559 gimple_assign_rhs1 (stmt),
3560 gimple_assign_rhs2 (stmt)))
3561 {
3562 gsi_remove (&gsi, true);
3563 release_defs (stmt);
3564 continue;
3565 }
3566 break;
3567
3568 case PLUS_EXPR:
3569 case MINUS_EXPR:
3570 convert_plusminus_to_widen (&gsi, stmt, code);
3571 break;
3572
3573 default:;
3574 }
3575 }
3576 else if (is_gimple_call (stmt)
3577 && gimple_call_lhs (stmt))
3578 {
3579 tree fndecl = gimple_call_fndecl (stmt);
3580 if (fndecl
3581 && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_NORMAL)
3582 {
3583 switch (DECL_FUNCTION_CODE (fndecl))
3584 {
3585 case BUILT_IN_POWF:
3586 case BUILT_IN_POW:
3587 case BUILT_IN_POWL:
3588 if (TREE_CODE (gimple_call_arg (stmt, 1)) == REAL_CST
3589 && real_equal
3590 (&TREE_REAL_CST (gimple_call_arg (stmt, 1)),
3591 &dconst2)
3592 && convert_mult_to_fma (stmt,
3593 gimple_call_arg (stmt, 0),
3594 gimple_call_arg (stmt, 0)))
3595 {
3596 unlink_stmt_vdef (stmt);
3597 if (gsi_remove (&gsi, true)
3598 && gimple_purge_dead_eh_edges (bb))
3599 cfg_changed = true;
3600 release_defs (stmt);
3601 continue;
3602 }
3603 break;
3604
3605 default:;
3606 }
3607 }
3608 }
3609 gsi_next (&gsi);
3610 }
3611 }
3612
3613 statistics_counter_event (fun, "widening multiplications inserted",
3614 widen_mul_stats.widen_mults_inserted);
3615 statistics_counter_event (fun, "widening maccs inserted",
3616 widen_mul_stats.maccs_inserted);
3617 statistics_counter_event (fun, "fused multiply-adds inserted",
3618 widen_mul_stats.fmas_inserted);
3619
3620 return cfg_changed ? TODO_cleanup_cfg : 0;
3621 }
3622
3623 } // anon namespace
3624
3625 gimple_opt_pass *
3626 make_pass_optimize_widening_mul (gcc::context *ctxt)
3627 {
3628 return new pass_optimize_widening_mul (ctxt);
3629 }