/* Induction variable optimizations.
- Copyright (C) 2003-2019 Free Software Foundation, Inc.
+ Copyright (C) 2003-2021 Free Software Foundation, Inc.
This file is part of GCC.
All of this is done loop by loop. Doing it globally is theoretically
possible, it might give a better performance and it might enable us
to decide costs more precisely, but getting all the interactions right
- would be complicated. */
+ would be complicated.
+
+ For the targets supporting low-overhead loops, IVOPTs has to take care of
+ the loops which will probably be transformed in RTL doloop optimization,
+ to try to make selected IV candidate set optimal. The process of doloop
+ support includes:
+
+ 1) Analyze whether the current loop will be transformed to a doloop or not;
+ find and mark its compare type IV use as a doloop use (iv_group field
+ doloop_p), and set flag doloop_use_p of ivopts_data to notify subsequent
+ processing on doloop. See analyze_and_mark_doloop_use and its callees.
+ The target hook predict_doloop_p can be used for target specific checks.
+
+ 2) Add one doloop dedicated IV cand {(may_be_zero ? 1 : (niter + 1)), +, -1},
+ set flag doloop_p of iv_cand, step cost is set as zero and no extra cost
+ like biv. For cost determination between doloop IV cand and IV use, the
+ target hooks doloop_cost_for_generic and doloop_cost_for_address are
+ provided to add on extra costs for generic type and address type IV use.
+ Zero cost is assigned to the pair between doloop IV cand and doloop IV
+ use, and bound zero is set for IV elimination.
+
+ 3) With the cost setting in step 2), the current cost model based IV
+ selection algorithm will process as usual, pick up doloop dedicated IV if
+ profitable. */
#include "config.h"
#include "system.h"
#include "tree-ssa.h"
#include "cfgloop.h"
#include "tree-scalar-evolution.h"
-#include "params.h"
#include "tree-affine.h"
#include "tree-ssa-propagate.h"
#include "tree-ssa-address.h"
#include "builtins.h"
#include "tree-vectorizer.h"
+#include "dbgcnt.h"
+
+/* For lang_hooks.types.type_for_mode. */
+#include "langhooks.h"
/* FIXME: Expressions are expanded to RTL in this pass to determine the
cost of different addressing modes. This should be moved to a TBD
{
niter = likely_max_stmt_executions_int (loop);
- if (niter == -1 || niter > PARAM_VALUE (PARAM_AVG_LOOP_NITER))
- return PARAM_VALUE (PARAM_AVG_LOOP_NITER);
+ if (niter == -1 || niter > param_avg_loop_niter)
+ return param_avg_loop_niter;
}
return niter;
comp_cost
comp_cost::operator+= (HOST_WIDE_INT c)
{
+ if (c >= INFTY)
+ this->cost = INFTY;
+
if (infinite_cost_p ())
return *this;
class cost_pair *cost_map;
/* The selected candidate for the group. */
struct iv_cand *selected;
+ /* To indicate this is a doloop use group. */
+ bool doloop_p;
/* Uses in the group. */
vec<struct iv_use *> vuses;
};
be hoisted out of loop. */
struct iv *orig_iv; /* The original iv if this cand is added from biv with
smaller type. */
+ bool doloop_p; /* Whether this is a doloop candidate. */
};
/* Hashtable entry for common candidate derived from iv uses. */
/* The common candidates. */
vec<iv_common_cand *> iv_common_cands;
+ /* Hash map recording base object information of tree exp. */
+ hash_map<tree, tree> *base_object_map;
+
/* The maximum invariant variable id. */
unsigned max_inv_var_id;
/* Whether the loop body can only be exited via single exit. */
bool loop_single_exit_p;
+
+ /* Whether the loop has doloop comparison use. */
+ bool doloop_use_p;
};
/* An assignment of iv candidates to uses. */
/* Bound on number of candidates below that all candidates are considered. */
#define CONSIDER_ALL_CANDIDATES_BOUND \
- ((unsigned) PARAM_VALUE (PARAM_IV_CONSIDER_ALL_CANDIDATES_BOUND))
+ ((unsigned) param_iv_consider_all_candidates_bound)
/* If there are more iv occurrences, we just give up (it is quite unlikely that
optimizing such a loop would help, and it would take ages). */
#define MAX_CONSIDERED_GROUPS \
- ((unsigned) PARAM_VALUE (PARAM_IV_MAX_CONSIDERED_USES))
+ ((unsigned) param_iv_max_considered_uses)
/* If there are at most this number of ivs in the set, try removing unnecessary
ivs from the set always. */
#define ALWAYS_PRUNE_CAND_SET_BOUND \
- ((unsigned) PARAM_VALUE (PARAM_IV_ALWAYS_PRUNE_CAND_SET_BOUND))
+ ((unsigned) param_iv_always_prune_cand_set_bound)
/* The list of trees for that the decl_rtl field must be reset is stored
here. */
data->vcands.create (20);
data->inv_expr_tab = new hash_table<iv_inv_expr_hasher> (10);
data->name_expansion_cache = NULL;
+ data->base_object_map = NULL;
data->iv_common_cand_tab = new hash_table<iv_common_cand_hasher> (10);
data->iv_common_cands.create (20);
decl_rtl_to_reset.create (20);
gcc_obstack_init (&data->iv_obstack);
}
-/* Returns a memory object to that EXPR points. In case we are able to
- determine that it does not point to any such object, NULL is returned. */
+/* walk_tree callback for determine_base_object. */
static tree
-determine_base_object (tree expr)
+determine_base_object_1 (tree *tp, int *walk_subtrees, void *wdata)
{
- enum tree_code code = TREE_CODE (expr);
- tree base, obj;
-
- /* If this is a pointer casted to any type, we need to determine
- the base object for the pointer; so handle conversions before
- throwing away non-pointer expressions. */
- if (CONVERT_EXPR_P (expr))
- return determine_base_object (TREE_OPERAND (expr, 0));
-
- if (!POINTER_TYPE_P (TREE_TYPE (expr)))
- return NULL_TREE;
-
- switch (code)
+ tree_code code = TREE_CODE (*tp);
+ tree obj = NULL_TREE;
+ if (code == ADDR_EXPR)
{
- case INTEGER_CST:
- return NULL_TREE;
-
- case ADDR_EXPR:
- obj = TREE_OPERAND (expr, 0);
- base = get_base_address (obj);
-
+ tree base = get_base_address (TREE_OPERAND (*tp, 0));
if (!base)
- return expr;
-
- if (TREE_CODE (base) == MEM_REF)
- return determine_base_object (TREE_OPERAND (base, 0));
+ obj = *tp;
+ else if (TREE_CODE (base) != MEM_REF)
+ obj = fold_convert (ptr_type_node, build_fold_addr_expr (base));
+ }
+ else if (code == SSA_NAME && POINTER_TYPE_P (TREE_TYPE (*tp)))
+ obj = fold_convert (ptr_type_node, *tp);
- return fold_convert (ptr_type_node,
- build_fold_addr_expr (base));
+ if (!obj)
+ {
+ if (!EXPR_P (*tp))
+ *walk_subtrees = 0;
- case POINTER_PLUS_EXPR:
- return determine_base_object (TREE_OPERAND (expr, 0));
+ return NULL_TREE;
+ }
+ /* Record special node for multiple base objects and stop. */
+ if (*static_cast<tree *> (wdata))
+ {
+ *static_cast<tree *> (wdata) = integer_zero_node;
+ return integer_zero_node;
+ }
+ /* Record the base object and continue looking. */
+ *static_cast<tree *> (wdata) = obj;
+ return NULL_TREE;
+}
- case PLUS_EXPR:
- case MINUS_EXPR:
- /* Pointer addition is done solely using POINTER_PLUS_EXPR. */
- gcc_unreachable ();
+/* Returns a memory object to that EXPR points with caching. Return NULL if we
+ are able to determine that it does not point to any such object; specially
+ return integer_zero_node if EXPR contains multiple base objects. */
- default:
- if (POLY_INT_CST_P (expr))
- return NULL_TREE;
- return fold_convert (ptr_type_node, expr);
+static tree
+determine_base_object (struct ivopts_data *data, tree expr)
+{
+ tree *slot, obj = NULL_TREE;
+ if (data->base_object_map)
+ {
+ if ((slot = data->base_object_map->get(expr)) != NULL)
+ return *slot;
}
+ else
+ data->base_object_map = new hash_map<tree, tree>;
+
+ (void) walk_tree_without_duplicates (&expr, determine_base_object_1, &obj);
+ data->base_object_map->put (expr, obj);
+ return obj;
}
/* Return true if address expression with non-DECL_P operand appears
}
iv->base = base;
- iv->base_object = determine_base_object (base);
+ iv->base_object = determine_base_object (data, base);
iv->step = step;
iv->biv_p = false;
iv->nonlin_use = NULL;
if (!bb
|| !flow_bb_inside_loop_p (data->current_loop, bb))
- set_iv (data, var, var, build_int_cst (type, 0), true);
+ {
+ if (POINTER_TYPE_P (type))
+ type = sizetype;
+ set_iv (data, var, var, build_int_cst (type, 0), true);
+ }
}
return name_info (data, var)->iv;
group->type = type;
group->related_cands = BITMAP_ALLOC (NULL);
group->vuses.create (1);
+ group->doloop_p = false;
data->vgroups.safe_push (group);
return group;
{
case IFN_MASK_LOAD:
case IFN_MASK_LOAD_LANES:
+ case IFN_LEN_LOAD:
if (op_p == gimple_call_arg_ptr (call, 0))
return TREE_TYPE (gimple_call_lhs (call));
return NULL_TREE;
case IFN_MASK_STORE:
case IFN_MASK_STORE_LANES:
+ case IFN_LEN_STORE:
if (op_p == gimple_call_arg_ptr (call, 0))
return TREE_TYPE (gimple_call_arg (call, 3));
return NULL_TREE;
list_index = (unsigned) as * MAX_MACHINE_MODE + (unsigned) mem_mode;
if (list_index >= vec_safe_length (addr_list))
- vec_safe_grow_cleared (addr_list, list_index + MAX_MACHINE_MODE);
+ vec_safe_grow_cleared (addr_list, list_index + MAX_MACHINE_MODE, true);
addr = (*addr_list)[list_index];
if (!addr)
if (!bb || !flow_bb_inside_loop_p (idata->current_loop, bb))
{
- set_iv (idata, op, op, build_int_cst (TREE_TYPE (op), 0), true);
+ tree steptype = TREE_TYPE (op);
+ if (POINTER_TYPE_P (steptype))
+ steptype = sizetype;
+ set_iv (idata, op, op, build_int_cst (steptype, 0), true);
record_invariant (idata, op, false);
}
}
replacement of the final value of the iv by a direct computation. */
static struct iv_cand *
-add_candidate_1 (struct ivopts_data *data,
- tree base, tree step, bool important, enum iv_position pos,
- struct iv_use *use, gimple *incremented_at,
- struct iv *orig_iv = NULL)
+add_candidate_1 (struct ivopts_data *data, tree base, tree step, bool important,
+ enum iv_position pos, struct iv_use *use,
+ gimple *incremented_at, struct iv *orig_iv = NULL,
+ bool doloop = false)
{
unsigned i;
struct iv_cand *cand = NULL;
cand->pos = pos;
if (pos != IP_ORIGINAL)
{
- cand->var_before = create_tmp_var_raw (TREE_TYPE (base), "ivtmp");
+ if (doloop)
+ cand->var_before = create_tmp_var_raw (TREE_TYPE (base), "doloop");
+ else
+ cand->var_before = create_tmp_var_raw (TREE_TYPE (base), "ivtmp");
cand->var_after = cand->var_before;
}
cand->important = important;
cand->incremented_at = incremented_at;
+ cand->doloop_p = doloop;
data->vcands.safe_push (cand);
if (!poly_int_tree_p (step))
}
cand->important |= important;
+ cand->doloop_p |= doloop;
/* Relate candidate to the group for which it is added. */
if (use)
the end of loop. */
static void
-add_candidate (struct ivopts_data *data,
- tree base, tree step, bool important, struct iv_use *use,
- struct iv *orig_iv = NULL)
+add_candidate (struct ivopts_data *data, tree base, tree step, bool important,
+ struct iv_use *use, struct iv *orig_iv = NULL,
+ bool doloop = false)
{
if (ip_normal_pos (data->current_loop))
- add_candidate_1 (data, base, step, important,
- IP_NORMAL, use, NULL, orig_iv);
- if (ip_end_pos (data->current_loop)
+ add_candidate_1 (data, base, step, important, IP_NORMAL, use, NULL, orig_iv,
+ doloop);
+ /* Exclude the doloop candidate here, since it requires decrement then
+ comparison and jump; the IP_END position doesn't match. */
+ if (!doloop && ip_end_pos (data->current_loop)
&& allow_ip_end_pos_p (data->current_loop))
add_candidate_1 (data, base, step, important, IP_END, use, NULL, orig_iv);
}
{
poly_uint64 offset;
tree base;
- tree basetype;
struct iv *iv = use->iv;
+ tree basetype = TREE_TYPE (iv->base);
+
+ /* Don't add a candidate for an iv_use with a non-integer, non-pointer,
+ or non-mode-precision type; instead, add a candidate for the corresponding
+ scev in an unsigned type with the same precision. See PR93674. */
+ if ((TREE_CODE (basetype) != INTEGER_TYPE && !POINTER_TYPE_P (basetype))
+ || !type_has_mode_precision_p (basetype))
+ {
+ basetype = lang_hooks.types.type_for_mode (TYPE_MODE (basetype),
+ TYPE_UNSIGNED (basetype));
+ add_candidate (data, fold_convert (basetype, iv->base),
+ fold_convert (basetype, iv->step), false, NULL);
+ return;
+ }
add_candidate (data, iv->base, iv->step, false, use);
Some RTL specific checks seems unable to be checked in gimple, if any new
checks or easy checks _are_ missing here, please add them. */
-static bool ATTRIBUTE_UNUSED
+static bool
generic_predict_doloop_p (struct ivopts_data *data)
{
class loop *loop = data->current_loop;
return fold_convert (type, aff_combination_to_tree (&aff));
}
+/* Like get_computation_at, but try harder, even if the computation
+ is more expensive. Intended for debug stmts. */
+
+static tree
+get_debug_computation_at (class loop *loop, gimple *at,
+ struct iv_use *use, struct iv_cand *cand)
+{
+ if (tree ret = get_computation_at (loop, at, use, cand))
+ return ret;
+
+ tree ubase = use->iv->base, ustep = use->iv->step;
+ tree cbase = cand->iv->base, cstep = cand->iv->step;
+ tree var;
+ tree utype = TREE_TYPE (ubase), ctype = TREE_TYPE (cbase);
+ widest_int rat;
+
+ /* We must have a precision to express the values of use. */
+ if (TYPE_PRECISION (utype) >= TYPE_PRECISION (ctype))
+ return NULL_TREE;
+
+ /* Try to handle the case that get_computation_at doesn't,
+ try to express
+ use = ubase + (var - cbase) / ratio. */
+ if (!constant_multiple_of (cstep, fold_convert (TREE_TYPE (cstep), ustep),
+ &rat))
+ return NULL_TREE;
+
+ bool neg_p = false;
+ if (wi::neg_p (rat))
+ {
+ if (TYPE_UNSIGNED (ctype))
+ return NULL_TREE;
+ neg_p = true;
+ rat = wi::neg (rat);
+ }
+
+ /* If both IVs can wrap around and CAND doesn't have a power of two step,
+ it is unsafe. Consider uint16_t CAND with step 9, when wrapping around,
+ the values will be ... 0xfff0, 0xfff9, 2, 11 ... and when use is say
+ uint8_t with step 3, those values divided by 3 cast to uint8_t will be
+ ... 0x50, 0x53, 0, 3 ... rather than expected 0x50, 0x53, 0x56, 0x59. */
+ if (!use->iv->no_overflow
+ && !cand->iv->no_overflow
+ && !integer_pow2p (cstep))
+ return NULL_TREE;
+
+ int bits = wi::exact_log2 (rat);
+ if (bits == -1)
+ bits = wi::floor_log2 (rat) + 1;
+ if (!cand->iv->no_overflow
+ && TYPE_PRECISION (utype) + bits > TYPE_PRECISION (ctype))
+ return NULL_TREE;
+
+ var = var_at_stmt (loop, cand, at);
+
+ if (POINTER_TYPE_P (ctype))
+ {
+ ctype = unsigned_type_for (ctype);
+ cbase = fold_convert (ctype, cbase);
+ cstep = fold_convert (ctype, cstep);
+ var = fold_convert (ctype, var);
+ }
+
+ if (stmt_after_increment (loop, cand, at))
+ var = fold_build2 (MINUS_EXPR, TREE_TYPE (var), var,
+ unshare_expr (cstep));
+
+ var = fold_build2 (MINUS_EXPR, TREE_TYPE (var), var, cbase);
+ var = fold_build2 (EXACT_DIV_EXPR, TREE_TYPE (var), var,
+ wide_int_to_tree (TREE_TYPE (var), rat));
+ if (POINTER_TYPE_P (utype))
+ {
+ var = fold_convert (sizetype, var);
+ if (neg_p)
+ var = fold_build1 (NEGATE_EXPR, sizetype, var);
+ var = fold_build2 (POINTER_PLUS_EXPR, utype, ubase, var);
+ }
+ else
+ {
+ var = fold_convert (utype, var);
+ var = fold_build2 (neg_p ? MINUS_EXPR : PLUS_EXPR, utype,
+ ubase, var);
+ }
+ return var;
+}
+
/* Adjust the cost COST for being in loop setup rather than loop body.
If we're optimizing for space, the loop setup overhead is constant;
if we're optimizing for speed, amortize it over the per-iteration cost.
STRIP_NOPS (op0);
op1 = NULL_TREE;
break;
+ /* See add_iv_candidate_for_doloop, for doloop may_be_zero case, we
+ introduce COND_EXPR for IV base, need to support better cost estimation
+ for this COND_EXPR and tcc_comparison. */
+ case COND_EXPR:
+ op0 = TREE_OPERAND (expr, 1);
+ STRIP_NOPS (op0);
+ op1 = TREE_OPERAND (expr, 2);
+ STRIP_NOPS (op1);
+ break;
+ case LT_EXPR:
+ case LE_EXPR:
+ case GT_EXPR:
+ case GE_EXPR:
+ case EQ_EXPR:
+ case NE_EXPR:
+ case UNORDERED_EXPR:
+ case ORDERED_EXPR:
+ case UNLT_EXPR:
+ case UNLE_EXPR:
+ case UNGT_EXPR:
+ case UNGE_EXPR:
+ case UNEQ_EXPR:
+ case LTGT_EXPR:
+ case MAX_EXPR:
+ case MIN_EXPR:
+ op0 = TREE_OPERAND (expr, 0);
+ STRIP_NOPS (op0);
+ op1 = TREE_OPERAND (expr, 1);
+ STRIP_NOPS (op1);
+ break;
default:
/* Just an arbitrary value, FIXME. */
case RSHIFT_EXPR:
cost = comp_cost (add_cost (speed, mode), 0);
break;
+ case COND_EXPR:
+ op0 = TREE_OPERAND (expr, 0);
+ STRIP_NOPS (op0);
+ if (op0 == NULL_TREE || TREE_CODE (op0) == SSA_NAME
+ || CONSTANT_CLASS_P (op0))
+ cost = no_cost;
+ else
+ cost = force_expr_to_var_cost (op0, speed);
+ break;
+ case LT_EXPR:
+ case LE_EXPR:
+ case GT_EXPR:
+ case GE_EXPR:
+ case EQ_EXPR:
+ case NE_EXPR:
+ case UNORDERED_EXPR:
+ case ORDERED_EXPR:
+ case UNLT_EXPR:
+ case UNLE_EXPR:
+ case UNGT_EXPR:
+ case UNGE_EXPR:
+ case UNEQ_EXPR:
+ case LTGT_EXPR:
+ case MAX_EXPR:
+ case MIN_EXPR:
+ /* Simply use add cost for now, FIXME if there is some more accurate cost
+ evaluation way. */
+ cost = comp_cost (add_cost (speed, mode), 0);
+ break;
default:
gcc_unreachable ();
unsigned nsize = ((unsigned) as + 1) *MAX_MACHINE_MODE;
gcc_assert (nsize > idx);
- ainc_cost_data_list.safe_grow_cleared (nsize);
+ ainc_cost_data_list.safe_grow_cleared (nsize, true);
}
ainc_cost_data *data = ainc_cost_data_list[idx];
{
cost = get_address_cost (data, use, cand, &aff_inv, &aff_var, ratio,
inv_vars, inv_expr, can_autoinc, speed);
- return get_scaled_computation_cost_at (data, at, cost);
+ cost = get_scaled_computation_cost_at (data, at, cost);
+ /* For doloop IV cand, add on the extra cost. */
+ cost += cand->doloop_p ? targetm.doloop_cost_for_address : 0;
+ return cost;
}
bool simple_inv = (aff_combination_const_p (&aff_inv)
if (comp_inv && !integer_zerop (comp_inv))
cost += add_cost (speed, TYPE_MODE (utype));
- return get_scaled_computation_cost_at (data, at, cost);
+ cost = get_scaled_computation_cost_at (data, at, cost);
+
+ /* For doloop IV cand, add on the extra cost. */
+ if (cand->doloop_p && use->type == USE_NONLINEAR_EXPR)
+ cost += targetm.doloop_cost_for_generic;
+
+ return cost;
}
/* Determines cost of computing the use in GROUP with CAND in a generic
}
}
+ /* For a doloop IV cand, the bound would be zero. It's safe whether
+ may_be_zero is set or not. */
+ if (cand->doloop_p)
+ {
+ *bound = build_int_cst (TREE_TYPE (cand->iv->base), 0);
+ *comp = iv_elimination_compare (data, use);
+ return true;
+ }
+
cand_value_at (loop, cand, use->stmt, desc->niter, &bnd);
*bound = fold_convert (TREE_TYPE (cand->iv->base),
inv_vars = inv_vars_elim;
inv_vars_elim = NULL;
inv_expr = inv_expr_elim;
+ /* For doloop candidate/use pair, adjust to zero cost. */
+ if (group->doloop_p && cand->doloop_p && elim_cost.cost > no_cost.cost)
+ cost = no_cost;
}
else
{
}
}
+/* If PREFERRED_MODE is suitable and profitable, use the preferred
+ PREFERRED_MODE to compute doloop iv base from niter: base = niter + 1. */
+
+static tree
+compute_doloop_base_on_mode (machine_mode preferred_mode, tree niter,
+ const widest_int &iterations_max)
+{
+ tree ntype = TREE_TYPE (niter);
+ tree pref_type = lang_hooks.types.type_for_mode (preferred_mode, 1);
+ if (!pref_type)
+ return fold_build2 (PLUS_EXPR, ntype, unshare_expr (niter),
+ build_int_cst (ntype, 1));
+
+ gcc_assert (TREE_CODE (pref_type) == INTEGER_TYPE);
+
+ int prec = TYPE_PRECISION (ntype);
+ int pref_prec = TYPE_PRECISION (pref_type);
+
+ tree base;
+
+ /* Check if PREFERRED_MODE is able to represent niter. */
+ if (pref_prec > prec
+ || wi::ltu_p (iterations_max,
+ widest_int::from (wi::max_value (pref_prec, UNSIGNED),
+ UNSIGNED)))
+ {
+ /* No wrap, it is safe to use preferred type after niter + 1. */
+ if (wi::ltu_p (iterations_max,
+ widest_int::from (wi::max_value (prec, UNSIGNED),
+ UNSIGNED)))
+ {
+ /* This could help to optimize "-1 +1" pair when niter looks
+ like "n-1": n is in original mode. "base = (n - 1) + 1"
+ in PREFERRED_MODE: it could be base = (PREFERRED_TYPE)n. */
+ base = fold_build2 (PLUS_EXPR, ntype, unshare_expr (niter),
+ build_int_cst (ntype, 1));
+ base = fold_convert (pref_type, base);
+ }
+
+ /* To avoid wrapping, convert niter to the preferred type before adding 1. */
+ else
+ {
+ niter = fold_convert (pref_type, niter);
+ base = fold_build2 (PLUS_EXPR, pref_type, unshare_expr (niter),
+ build_int_cst (pref_type, 1));
+ }
+ }
+ else
+ base = fold_build2 (PLUS_EXPR, ntype, unshare_expr (niter),
+ build_int_cst (ntype, 1));
+ return base;
+}
+
+/* Add one doloop dedicated IV candidate:
+ - Base is (may_be_zero ? 1 : (niter + 1)).
+ - Step is -1. */
+
+static void
+add_iv_candidate_for_doloop (struct ivopts_data *data)
+{
+ tree_niter_desc *niter_desc = niter_for_single_dom_exit (data);
+ gcc_assert (niter_desc && niter_desc->assumptions);
+
+ tree niter = niter_desc->niter;
+ tree ntype = TREE_TYPE (niter);
+ gcc_assert (TREE_CODE (ntype) == INTEGER_TYPE);
+
+ tree may_be_zero = niter_desc->may_be_zero;
+ if (may_be_zero && integer_zerop (may_be_zero))
+ may_be_zero = NULL_TREE;
+ if (may_be_zero)
+ {
+ if (COMPARISON_CLASS_P (may_be_zero))
+ {
+ niter = fold_build3 (COND_EXPR, ntype, may_be_zero,
+ build_int_cst (ntype, 0),
+ rewrite_to_non_trapping_overflow (niter));
+ }
+ /* Don't try to obtain the iteration count expression when may_be_zero is
+ integer_nonzerop (the iteration count is actually one) or anything else. */
+ else
+ return;
+ }
+
+ machine_mode mode = TYPE_MODE (ntype);
+ machine_mode pref_mode = targetm.preferred_doloop_mode (mode);
+
+ tree base;
+ if (mode != pref_mode)
+ {
+ base = compute_doloop_base_on_mode (pref_mode, niter, niter_desc->max);
+ ntype = TREE_TYPE (base);
+ }
+ else
+ base = fold_build2 (PLUS_EXPR, ntype, unshare_expr (niter),
+ build_int_cst (ntype, 1));
+
+
+ add_candidate (data, base, build_int_cst (ntype, -1), true, NULL, NULL, true);
+}
+
/* Finds the candidates for the induction variables. */
static void
/* Add commonly used ivs. */
add_standard_iv_candidates (data);
+ /* Add doloop dedicated ivs. */
+ if (data->doloop_use_p)
+ add_iv_candidate_for_doloop (data);
+
/* Add old induction variables. */
add_iv_candidate_for_bivs (data);
or a const set. */
if (cost_base.cost == 0)
cost_base.cost = COSTS_N_INSNS (1);
- cost_step = add_cost (data->speed, TYPE_MODE (TREE_TYPE (base)));
-
+ /* Doloop decrement should be considered as zero cost. */
+ if (cand->doloop_p)
+ cost_step = 0;
+ else
+ cost_step = add_cost (data->speed, TYPE_MODE (TREE_TYPE (base)));
cost = cost_step + adjust_setup_cost (data, cost_base.cost);
/* Prefer the original ivs unless we may gain something by replacing it.
The reason is to make debugging simpler; so this is not relevant for
artificial ivs created by other optimization passes. */
- if (cand->pos != IP_ORIGINAL
- || !SSA_NAME_VAR (cand->var_before)
- || DECL_ARTIFICIAL (SSA_NAME_VAR (cand->var_before)))
+ if ((cand->pos != IP_ORIGINAL
+ || !SSA_NAME_VAR (cand->var_before)
+ || DECL_ARTIFICIAL (SSA_NAME_VAR (cand->var_before)))
+ /* Prefer doloop as well. */
+ && !cand->doloop_p)
cost++;
/* Prefer not to insert statements into latch unless there are some
if (ivs->n_cand_uses[cid] == 0)
{
bitmap_clear_bit (ivs->cands, cid);
- ivs->n_cands--;
+ if (!cp->cand->doloop_p || !targetm.have_count_reg_decr_p)
+ ivs->n_cands--;
ivs->cand_cost -= cp->cand->cost;
iv_ca_set_remove_invs (ivs, cp->cand->inv_vars, ivs->n_inv_var_uses);
iv_ca_set_remove_invs (ivs, cp->cand->inv_exprs, ivs->n_inv_expr_uses);
if (ivs->n_cand_uses[cid] == 1)
{
bitmap_set_bit (ivs->cands, cid);
- ivs->n_cands++;
+ if (!cp->cand->doloop_p || !targetm.have_count_reg_decr_p)
+ ivs->n_cands++;
ivs->cand_cost += cp->cand->cost;
iv_ca_set_add_invs (ivs, cp->cand->inv_vars, ivs->n_inv_var_uses);
iv_ca_set_add_invs (ivs, cp->cand->inv_exprs, ivs->n_inv_expr_uses);
fprintf (file, " cost: %" PRId64 " (complexity %d)\n", cost.cost,
cost.complexity);
+ fprintf (file, " reg_cost: %d\n",
+ ivopts_estimate_reg_pressure (data, ivs->n_invs, ivs->n_cands));
fprintf (file, " cand_cost: %" PRId64 "\n cand_group_cost: "
"%" PRId64 " (complexity %d)\n", ivs->cand_cost,
ivs->cand_use_cost.cost, ivs->cand_use_cost.complexity);
}
comp = fold_convert (type, comp);
- if (!valid_gimple_rhs_p (comp)
- || (gimple_code (use->stmt) != GIMPLE_PHI
- /* We can't allow re-allocating the stmt as it might be pointed
- to still. */
- && (get_gimple_rhs_num_ops (TREE_CODE (comp))
- >= gimple_num_ops (gsi_stmt (bsi)))))
+ comp = force_gimple_operand (comp, &seq, false, NULL);
+ gimple_seq_add_seq (&stmt_list, seq);
+ if (gimple_code (use->stmt) != GIMPLE_PHI
+ /* We can't allow re-allocating the stmt as it might be pointed
+ to still. */
+ && (get_gimple_rhs_num_ops (TREE_CODE (comp))
+ >= gimple_num_ops (gsi_stmt (bsi))))
{
comp = force_gimple_operand (comp, &seq, true, NULL);
gimple_seq_add_seq (&stmt_list, seq);
case IFN_MASK_STORE:
case IFN_MASK_LOAD_LANES:
case IFN_MASK_STORE_LANES:
+ case IFN_LEN_LOAD:
+ case IFN_LEN_STORE:
/* The second argument contains the correct alias type. */
gcc_assert (use->op_p = gimple_call_arg_ptr (call, 0));
return TREE_TYPE (gimple_call_arg (call, 1));
count++;
if (count > 1)
- BREAK_FROM_IMM_USE_STMT (imm_iter);
+ break;
}
if (!count)
struct iv_use dummy_use;
struct iv_cand *best_cand = NULL, *cand;
unsigned i, best_pref = 0, cand_pref;
+ tree comp = NULL_TREE;
memset (&dummy_use, 0, sizeof (dummy_use));
dummy_use.iv = info->iv;
? 1 : 0;
if (best_cand == NULL || best_pref < cand_pref)
{
- best_cand = cand;
- best_pref = cand_pref;
+ tree this_comp
+ = get_debug_computation_at (data->current_loop,
+ SSA_NAME_DEF_STMT (def),
+ &dummy_use, cand);
+ if (this_comp)
+ {
+ best_cand = cand;
+ best_pref = cand_pref;
+ comp = this_comp;
+ }
}
}
if (!best_cand)
continue;
- tree comp = get_computation_at (data->current_loop,
- SSA_NAME_DEF_STMT (def),
- &dummy_use, best_cand);
- if (!comp)
- continue;
-
+ comp = unshare_expr (comp);
if (count > 1)
{
tree vexpr = make_node (DEBUG_EXPR_DECL);
delete data->inv_expr_tab;
data->inv_expr_tab = NULL;
free_affine_expand_cache (&data->name_expansion_cache);
+ if (data->base_object_map)
+ delete data->base_object_map;
delete data->iv_common_cand_tab;
data->iv_common_cand_tab = NULL;
data->iv_common_cands.release ();
}
}
+/* Find doloop comparison use and set its doloop_p on if found. */
+
+static bool
+find_doloop_use (struct ivopts_data *data)
+{
+ struct loop *loop = data->current_loop;
+
+ for (unsigned i = 0; i < data->vgroups.length (); i++)
+ {
+ struct iv_group *group = data->vgroups[i];
+ if (group->type == USE_COMPARE)
+ {
+ gcc_assert (group->vuses.length () == 1);
+ struct iv_use *use = group->vuses[0];
+ gimple *stmt = use->stmt;
+ if (gimple_code (stmt) == GIMPLE_COND)
+ {
+ basic_block bb = gimple_bb (stmt);
+ edge true_edge, false_edge;
+ extract_true_false_edges_from_block (bb, &true_edge, &false_edge);
+ /* This comparison is used for the loop latch. Require that the
+ latch is empty for now. */
+ if ((loop->latch == true_edge->dest
+ || loop->latch == false_edge->dest)
+ && empty_block_p (loop->latch))
+ {
+ group->doloop_p = true;
+ if (dump_file && (dump_flags & TDF_DETAILS))
+ {
+ fprintf (dump_file, "Doloop cmp iv use: ");
+ print_gimple_stmt (dump_file, stmt, TDF_DETAILS);
+ }
+ return true;
+ }
+ }
+ }
+ }
+
+ return false;
+}
+
+/* For the targets which support doloop, to predict whether later RTL doloop
+ transformation will perform on this loop, further detect the doloop use and
+ mark the flag doloop_use_p if predicted. */
+
+void
+analyze_and_mark_doloop_use (struct ivopts_data *data)
+{
+ data->doloop_use_p = false;
+
+ if (!flag_branch_on_count_reg)
+ return;
+
+ if (data->current_loop->unroll == USHRT_MAX)
+ return;
+
+ if (!generic_predict_doloop_p (data))
+ return;
+
+ if (find_doloop_use (data))
+ {
+ data->doloop_use_p = true;
+ if (dump_file && (dump_flags & TDF_DETAILS))
+ {
+ struct loop *loop = data->current_loop;
+ fprintf (dump_file,
+ "Predict loop %d can perform"
+ " doloop optimization later.\n",
+ loop->num);
+ flow_loop_dump (loop, dump_file, NULL, 1);
+ }
+ }
+}
+
/* Optimizes the LOOP. Returns true if anything changed. */
static bool
data->body_includes_call = loop_body_includes_call (body, loop->num_nodes);
renumber_gimple_stmt_uids_in_blocks (body, loop->num_nodes);
- data->loop_single_exit_p = exit != NULL && loop_only_exit_p (loop, exit);
+ data->loop_single_exit_p
+ = exit != NULL && loop_only_exit_p (loop, body, exit);
/* For each ssa name determines whether it behaves as an induction variable
in some loop. */
/* Determine cost scaling factor for basic blocks in loop. */
determine_scaling_factor (data, body);
+ /* Analyze doloop possibility and mark the doloop use if predicted. */
+ analyze_and_mark_doloop_use (data);
+
/* Finds candidates for the induction variables (item 2). */
find_iv_candidates (data);
void
tree_ssa_iv_optimize (void)
{
- class loop *loop;
struct ivopts_data data;
auto_bitmap toremove;
tree_ssa_iv_optimize_init (&data);
/* Optimize the loops starting with the innermost ones. */
- FOR_EACH_LOOP (loop, LI_FROM_INNERMOST)
+ for (auto loop : loops_list (cfun, LI_FROM_INNERMOST))
{
+ if (!dbg_cnt (ivopts_loop))
+ continue;
+
if (dump_file && (dump_flags & TDF_DETAILS))
flow_loop_dump (loop, dump_file, NULL, 1);