/* Induction variable optimizations.
- Copyright (C) 2003-2019 Free Software Foundation, Inc.
+ Copyright (C) 2003-2021 Free Software Foundation, Inc.
This file is part of GCC.
All of this is done loop by loop. Doing it globally is theoretically
possible, it might give a better performance and it might enable us
to decide costs more precisely, but getting all the interactions right
- would be complicated. */
+ would be complicated.
+
+ For the targets supporting low-overhead loops, IVOPTs has to take care of
+ the loops which will probably be transformed in RTL doloop optimization,
+ to try to make selected IV candidate set optimal. The process of doloop
+ support includes:
+
+ 1) Analyze whether the current loop will be transformed to a doloop or not;
+ find and mark its compare type IV use as a doloop use (iv_group field
+ doloop_p), and set flag doloop_use_p of ivopts_data to notify subsequent
+ processing on doloop. See analyze_and_mark_doloop_use and its callees.
+ The target hook predict_doloop_p can be used for target specific checks.
+
+ 2) Add one doloop dedicated IV cand {(may_be_zero ? 1 : (niter + 1)), +, -1},
+ set flag doloop_p of iv_cand, step cost is set as zero and no extra cost
+ like biv. For cost determination between doloop IV cand and IV use, the
+ target hooks doloop_cost_for_generic and doloop_cost_for_address are
+ provided to add on extra costs for generic type and address type IV use.
+ Zero cost is assigned to the pair between doloop IV cand and doloop IV
+ use, and bound zero is set for IV elimination.
+
+ 3) With the cost setting in step 2), the current cost model based IV
+ selection algorithm will process as usual, pick up doloop dedicated IV if
+ profitable. */
#include "config.h"
#include "system.h"
#include "tree-ssa.h"
#include "cfgloop.h"
#include "tree-scalar-evolution.h"
-#include "params.h"
#include "tree-affine.h"
#include "tree-ssa-propagate.h"
#include "tree-ssa-address.h"
#include "builtins.h"
#include "tree-vectorizer.h"
+#include "dbgcnt.h"
+
+/* For lang_hooks.types.type_for_mode. */
+#include "langhooks.h"
/* FIXME: Expressions are expanded to RTL in this pass to determine the
cost of different addressing modes. This should be moved to a TBD
{
niter = likely_max_stmt_executions_int (loop);
- if (niter == -1 || niter > PARAM_VALUE (PARAM_AVG_LOOP_NITER))
- return PARAM_VALUE (PARAM_AVG_LOOP_NITER);
+ if (niter == -1 || niter > param_avg_loop_niter)
+ return param_avg_loop_niter;
}
return niter;
comp_cost
comp_cost::operator+= (HOST_WIDE_INT c)
{
+ if (c >= INFTY)
+ this->cost = INFTY;
+
if (infinite_cost_p ())
return *this;
class cost_pair *cost_map;
/* The selected candidate for the group. */
struct iv_cand *selected;
+ /* To indicate this is a doloop use group. */
+ bool doloop_p;
/* Uses in the group. */
vec<struct iv_use *> vuses;
};
be hoisted out of loop. */
struct iv *orig_iv; /* The original iv if this cand is added from biv with
smaller type. */
+ bool doloop_p; /* Whether this is a doloop candidate. */
};
/* Hashtable entry for common candidate derived from iv uses. */
/* The common candidates. */
vec<iv_common_cand *> iv_common_cands;
+ /* Hash map recording base object information of tree exp. */
+ hash_map<tree, tree> *base_object_map;
+
/* The maximum invariant variable id. */
unsigned max_inv_var_id;
/* Whether the loop body can only be exited via single exit. */
bool loop_single_exit_p;
+
+ /* Whether the loop has doloop comparison use. */
+ bool doloop_use_p;
};
/* An assignment of iv candidates to uses. */
/* Bound on number of candidates below that all candidates are considered. */
#define CONSIDER_ALL_CANDIDATES_BOUND \
- ((unsigned) PARAM_VALUE (PARAM_IV_CONSIDER_ALL_CANDIDATES_BOUND))
+ ((unsigned) param_iv_consider_all_candidates_bound)
/* If there are more iv occurrences, we just give up (it is quite unlikely that
optimizing such a loop would help, and it would take ages). */
#define MAX_CONSIDERED_GROUPS \
- ((unsigned) PARAM_VALUE (PARAM_IV_MAX_CONSIDERED_USES))
+ ((unsigned) param_iv_max_considered_uses)
/* If there are at most this number of ivs in the set, try removing unnecessary
ivs from the set always. */
#define ALWAYS_PRUNE_CAND_SET_BOUND \
- ((unsigned) PARAM_VALUE (PARAM_IV_ALWAYS_PRUNE_CAND_SET_BOUND))
+ ((unsigned) param_iv_always_prune_cand_set_bound)
/* The list of trees for that the decl_rtl field must be reset is stored
here. */
data->vcands.create (20);
data->inv_expr_tab = new hash_table<iv_inv_expr_hasher> (10);
data->name_expansion_cache = NULL;
+ data->base_object_map = NULL;
data->iv_common_cand_tab = new hash_table<iv_common_cand_hasher> (10);
data->iv_common_cands.create (20);
decl_rtl_to_reset.create (20);
gcc_obstack_init (&data->iv_obstack);
}
-/* Returns a memory object to that EXPR points. In case we are able to
- determine that it does not point to any such object, NULL is returned. */
+/* walk_tree callback for determine_base_object. */
static tree
-determine_base_object (tree expr)
+determine_base_object_1 (tree *tp, int *walk_subtrees, void *wdata)
{
- enum tree_code code = TREE_CODE (expr);
- tree base, obj;
-
- /* If this is a pointer casted to any type, we need to determine
- the base object for the pointer; so handle conversions before
- throwing away non-pointer expressions. */
- if (CONVERT_EXPR_P (expr))
- return determine_base_object (TREE_OPERAND (expr, 0));
-
- if (!POINTER_TYPE_P (TREE_TYPE (expr)))
- return NULL_TREE;
-
- switch (code)
+ tree_code code = TREE_CODE (*tp);
+ tree obj = NULL_TREE;
+ if (code == ADDR_EXPR)
{
- case INTEGER_CST:
- return NULL_TREE;
-
- case ADDR_EXPR:
- obj = TREE_OPERAND (expr, 0);
- base = get_base_address (obj);
-
+ tree base = get_base_address (TREE_OPERAND (*tp, 0));
if (!base)
- return expr;
-
- if (TREE_CODE (base) == MEM_REF)
- return determine_base_object (TREE_OPERAND (base, 0));
+ obj = *tp;
+ else if (TREE_CODE (base) != MEM_REF)
+ obj = fold_convert (ptr_type_node, build_fold_addr_expr (base));
+ }
+ else if (code == SSA_NAME && POINTER_TYPE_P (TREE_TYPE (*tp)))
+ obj = fold_convert (ptr_type_node, *tp);
- return fold_convert (ptr_type_node,
- build_fold_addr_expr (base));
+ if (!obj)
+ {
+ if (!EXPR_P (*tp))
+ *walk_subtrees = 0;
- case POINTER_PLUS_EXPR:
- return determine_base_object (TREE_OPERAND (expr, 0));
+ return NULL_TREE;
+ }
+ /* Record special node for multiple base objects and stop. */
+ if (*static_cast<tree *> (wdata))
+ {
+ *static_cast<tree *> (wdata) = integer_zero_node;
+ return integer_zero_node;
+ }
+ /* Record the base object and continue looking. */
+ *static_cast<tree *> (wdata) = obj;
+ return NULL_TREE;
+}
- case PLUS_EXPR:
- case MINUS_EXPR:
- /* Pointer addition is done solely using POINTER_PLUS_EXPR. */
- gcc_unreachable ();
+/* Returns a memory object to that EXPR points with caching. Return NULL if we
+ are able to determine that it does not point to any such object; specially
+ return integer_zero_node if EXPR contains multiple base objects. */
- default:
- if (POLY_INT_CST_P (expr))
- return NULL_TREE;
- return fold_convert (ptr_type_node, expr);
+static tree
+determine_base_object (struct ivopts_data *data, tree expr)
+{
+ tree *slot, obj = NULL_TREE;
+ if (data->base_object_map)
+ {
+ if ((slot = data->base_object_map->get(expr)) != NULL)
+ return *slot;
}
+ else
+ data->base_object_map = new hash_map<tree, tree>;
+
+ (void) walk_tree_without_duplicates (&expr, determine_base_object_1, &obj);
+ data->base_object_map->put (expr, obj);
+ return obj;
}
/* Return true if address expression with non-DECL_P operand appears
}
iv->base = base;
- iv->base_object = determine_base_object (base);
+ iv->base_object = determine_base_object (data, base);
iv->step = step;
iv->biv_p = false;
iv->nonlin_use = NULL;
if (!bb
|| !flow_bb_inside_loop_p (data->current_loop, bb))
- set_iv (data, var, var, build_int_cst (type, 0), true);
+ {
+ if (POINTER_TYPE_P (type))
+ type = sizetype;
+ set_iv (data, var, var, build_int_cst (type, 0), true);
+ }
}
return name_info (data, var)->iv;
group->type = type;
group->related_cands = BITMAP_ALLOC (NULL);
group->vuses.create (1);
+ group->doloop_p = false;
data->vgroups.safe_push (group);
return group;
{
case IFN_MASK_LOAD:
case IFN_MASK_LOAD_LANES:
+ case IFN_LEN_LOAD:
if (op_p == gimple_call_arg_ptr (call, 0))
return TREE_TYPE (gimple_call_lhs (call));
return NULL_TREE;
case IFN_MASK_STORE:
case IFN_MASK_STORE_LANES:
+ case IFN_LEN_STORE:
if (op_p == gimple_call_arg_ptr (call, 0))
return TREE_TYPE (gimple_call_arg (call, 3));
return NULL_TREE;
list_index = (unsigned) as * MAX_MACHINE_MODE + (unsigned) mem_mode;
if (list_index >= vec_safe_length (addr_list))
- vec_safe_grow_cleared (addr_list, list_index + MAX_MACHINE_MODE);
+ vec_safe_grow_cleared (addr_list, list_index + MAX_MACHINE_MODE, true);
addr = (*addr_list)[list_index];
if (!addr)
if (!bb || !flow_bb_inside_loop_p (idata->current_loop, bb))
{
- set_iv (idata, op, op, build_int_cst (TREE_TYPE (op), 0), true);
+ tree steptype = TREE_TYPE (op);
+ if (POINTER_TYPE_P (steptype))
+ steptype = sizetype;
+ set_iv (idata, op, op, build_int_cst (steptype, 0), true);
record_invariant (idata, op, false);
}
}
replacement of the final value of the iv by a direct computation. */
static struct iv_cand *
-add_candidate_1 (struct ivopts_data *data,
- tree base, tree step, bool important, enum iv_position pos,
- struct iv_use *use, gimple *incremented_at,
- struct iv *orig_iv = NULL)
+add_candidate_1 (struct ivopts_data *data, tree base, tree step, bool important,
+ enum iv_position pos, struct iv_use *use,
+ gimple *incremented_at, struct iv *orig_iv = NULL,
+ bool doloop = false)
{
unsigned i;
struct iv_cand *cand = NULL;
cand->pos = pos;
if (pos != IP_ORIGINAL)
{
- cand->var_before = create_tmp_var_raw (TREE_TYPE (base), "ivtmp");
+ if (doloop)
+ cand->var_before = create_tmp_var_raw (TREE_TYPE (base), "doloop");
+ else
+ cand->var_before = create_tmp_var_raw (TREE_TYPE (base), "ivtmp");
cand->var_after = cand->var_before;
}
cand->important = important;
cand->incremented_at = incremented_at;
+ cand->doloop_p = doloop;
data->vcands.safe_push (cand);
if (!poly_int_tree_p (step))
}
cand->important |= important;
+ cand->doloop_p |= doloop;
/* Relate candidate to the group for which it is added. */
if (use)
the end of loop. */
static void
-add_candidate (struct ivopts_data *data,
- tree base, tree step, bool important, struct iv_use *use,
- struct iv *orig_iv = NULL)
+add_candidate (struct ivopts_data *data, tree base, tree step, bool important,
+ struct iv_use *use, struct iv *orig_iv = NULL,
+ bool doloop = false)
{
if (ip_normal_pos (data->current_loop))
- add_candidate_1 (data, base, step, important,
- IP_NORMAL, use, NULL, orig_iv);
- if (ip_end_pos (data->current_loop)
+ add_candidate_1 (data, base, step, important, IP_NORMAL, use, NULL, orig_iv,
+ doloop);
+ /* Exclude the doloop candidate here, since it requires decrement then
+ comparison and jump; the IP_END position doesn't match. */
+ if (!doloop && ip_end_pos (data->current_loop)
&& allow_ip_end_pos_p (data->current_loop))
add_candidate_1 (data, base, step, important, IP_END, use, NULL, orig_iv);
}
{
poly_uint64 offset;
tree base;
- tree basetype;
struct iv *iv = use->iv;
+ tree basetype = TREE_TYPE (iv->base);
+
+ /* Don't add a candidate for an iv_use with a non-integer, non-pointer,
+ or non-mode-precision type; instead, add a candidate for the corresponding
+ scev in an unsigned type with the same precision. See PR93674. */
+ if ((TREE_CODE (basetype) != INTEGER_TYPE && !POINTER_TYPE_P (basetype))
+ || !type_has_mode_precision_p (basetype))
+ {
+ basetype = lang_hooks.types.type_for_mode (TYPE_MODE (basetype),
+ TYPE_UNSIGNED (basetype));
+ add_candidate (data, fold_convert (basetype, iv->base),
+ fold_convert (basetype, iv->step), false, NULL);
+ return;
+ }
add_candidate (data, iv->base, iv->step, false, use);
Some RTL specific checks seems unable to be checked in gimple, if any new
checks or easy checks _are_ missing here, please add them. */
-static bool ATTRIBUTE_UNUSED
+static bool
generic_predict_doloop_p (struct ivopts_data *data)
{
class loop *loop = data->current_loop;
return fold_convert (type, aff_combination_to_tree (&aff));
}
+/* Like get_computation_at, but try harder, even if the computation
+ is more expensive. Intended for debug stmts. */
+
+static tree
+get_debug_computation_at (class loop *loop, gimple *at,
+ struct iv_use *use, struct iv_cand *cand)
+{
+ if (tree ret = get_computation_at (loop, at, use, cand))
+ return ret;
+
+ tree ubase = use->iv->base, ustep = use->iv->step;
+ tree cbase = cand->iv->base, cstep = cand->iv->step;
+ tree var;
+ tree utype = TREE_TYPE (ubase), ctype = TREE_TYPE (cbase);
+ widest_int rat;
+
+ /* We must have a precision to express the values of use. */
+ if (TYPE_PRECISION (utype) >= TYPE_PRECISION (ctype))
+ return NULL_TREE;
+
+ /* Try to handle the case that get_computation_at doesn't,
+ try to express
+ use = ubase + (var - cbase) / ratio. */
+ if (!constant_multiple_of (cstep, fold_convert (TREE_TYPE (cstep), ustep),
+ &rat))
+ return NULL_TREE;
+
+ bool neg_p = false;
+ if (wi::neg_p (rat))
+ {
+ if (TYPE_UNSIGNED (ctype))
+ return NULL_TREE;
+ neg_p = true;
+ rat = wi::neg (rat);
+ }
+
+ /* If both IVs can wrap around and CAND doesn't have a power of two step,
+ it is unsafe. Consider uint16_t CAND with step 9, when wrapping around,
+ the values will be ... 0xfff0, 0xfff9, 2, 11 ... and when use is say
+ uint8_t with step 3, those values divided by 3 cast to uint8_t will be
+ ... 0x50, 0x53, 0, 3 ... rather than expected 0x50, 0x53, 0x56, 0x59. */
+ if (!use->iv->no_overflow
+ && !cand->iv->no_overflow
+ && !integer_pow2p (cstep))
+ return NULL_TREE;
+
+ int bits = wi::exact_log2 (rat);
+ if (bits == -1)
+ bits = wi::floor_log2 (rat) + 1;
+ if (!cand->iv->no_overflow
+ && TYPE_PRECISION (utype) + bits > TYPE_PRECISION (ctype))
+ return NULL_TREE;
+
+ var = var_at_stmt (loop, cand, at);
+
+ if (POINTER_TYPE_P (ctype))
+ {
+ ctype = unsigned_type_for (ctype);
+ cbase = fold_convert (ctype, cbase);
+ cstep = fold_convert (ctype, cstep);
+ var = fold_convert (ctype, var);
+ }
+
+ if (stmt_after_increment (loop, cand, at))
+ var = fold_build2 (MINUS_EXPR, TREE_TYPE (var), var,
+ unshare_expr (cstep));
+
+ var = fold_build2 (MINUS_EXPR, TREE_TYPE (var), var, cbase);
+ var = fold_build2 (EXACT_DIV_EXPR, TREE_TYPE (var), var,
+ wide_int_to_tree (TREE_TYPE (var), rat));
+ if (POINTER_TYPE_P (utype))
+ {
+ var = fold_convert (sizetype, var);
+ if (neg_p)
+ var = fold_build1 (NEGATE_EXPR, sizetype, var);
+ var = fold_build2 (POINTER_PLUS_EXPR, utype, ubase, var);
+ }
+ else
+ {
+ var = fold_convert (utype, var);
+ var = fold_build2 (neg_p ? MINUS_EXPR : PLUS_EXPR, utype,
+ ubase, var);
+ }
+ return var;
+}
+
/* Adjust the cost COST for being in loop setup rather than loop body.
If we're optimizing for space, the loop setup overhead is constant;
if we're optimizing for speed, amortize it over the per-iteration cost.
STRIP_NOPS (op0);
op1 = NULL_TREE;
break;
+ /* See add_iv_candidate_for_doloop, for doloop may_be_zero case, we
+ introduce COND_EXPR for IV base, need to support better cost estimation
+ for this COND_EXPR and tcc_comparison. */
+ case COND_EXPR:
+ op0 = TREE_OPERAND (expr, 1);
+ STRIP_NOPS (op0);
+ op1 = TREE_OPERAND (expr, 2);
+ STRIP_NOPS (op1);
+ break;
+ case LT_EXPR:
+ case LE_EXPR:
+ case GT_EXPR:
+ case GE_EXPR:
+ case EQ_EXPR:
+ case NE_EXPR:
+ case UNORDERED_EXPR:
+ case ORDERED_EXPR:
+ case UNLT_EXPR:
+ case UNLE_EXPR:
+ case UNGT_EXPR:
+ case UNGE_EXPR:
+ case UNEQ_EXPR:
+ case LTGT_EXPR:
+ case MAX_EXPR:
+ case MIN_EXPR:
+ op0 = TREE_OPERAND (expr, 0);
+ STRIP_NOPS (op0);
+ op1 = TREE_OPERAND (expr, 1);
+ STRIP_NOPS (op1);
+ break;
default:
/* Just an arbitrary value, FIXME. */
case RSHIFT_EXPR:
cost = comp_cost (add_cost (speed, mode), 0);
break;
+ case COND_EXPR:
+ op0 = TREE_OPERAND (expr, 0);
+ STRIP_NOPS (op0);
+ if (op0 == NULL_TREE || TREE_CODE (op0) == SSA_NAME
+ || CONSTANT_CLASS_P (op0))
+ cost = no_cost;
+ else
+ cost = force_expr_to_var_cost (op0, speed);
+ break;
+ case LT_EXPR:
+ case LE_EXPR:
+ case GT_EXPR:
+ case GE_EXPR:
+ case EQ_EXPR:
+ case NE_EXPR:
+ case UNORDERED_EXPR:
+ case ORDERED_EXPR:
+ case UNLT_EXPR:
+ case UNLE_EXPR:
+ case UNGT_EXPR:
+ case UNGE_EXPR:
+ case UNEQ_EXPR:
+ case LTGT_EXPR:
+ case MAX_EXPR:
+ case MIN_EXPR:
+ /* Simply use add cost for now, FIXME if there is some more accurate cost
+ evaluation way. */
+ cost = comp_cost (add_cost (speed, mode), 0);
+ break;
default:
gcc_unreachable ();
unsigned nsize = ((unsigned) as + 1) *MAX_MACHINE_MODE;
gcc_assert (nsize > idx);
- ainc_cost_data_list.safe_grow_cleared (nsize);
+ ainc_cost_data_list.safe_grow_cleared (nsize, true);
}
ainc_cost_data *data = ainc_cost_data_list[idx];
{
cost = get_address_cost (data, use, cand, &aff_inv, &aff_var, ratio,
inv_vars, inv_expr, can_autoinc, speed);
- return get_scaled_computation_cost_at (data, at, cost);
+ cost = get_scaled_computation_cost_at (data, at, cost);
+ /* For doloop IV cand, add on the extra cost. */
+ cost += cand->doloop_p ? targetm.doloop_cost_for_address : 0;
+ return cost;
}
bool simple_inv = (aff_combination_const_p (&aff_inv)
if (comp_inv && !integer_zerop (comp_inv))
cost += add_cost (speed, TYPE_MODE (utype));
- return get_scaled_computation_cost_at (data, at, cost);
+ cost = get_scaled_computation_cost_at (data, at, cost);
+
+ /* For doloop IV cand, add on the extra cost. */
+ if (cand->doloop_p && use->type == USE_NONLINEAR_EXPR)
+ cost += targetm.doloop_cost_for_generic;
+
+ return cost;
}
/* Determines cost of computing the use in GROUP with CAND in a generic
}
}
+ /* For a doloop IV cand, the bound would be zero. It's safe whether
+ may_be_zero is set or not. */
+ if (cand->doloop_p)
+ {
+ *bound = build_int_cst (TREE_TYPE (cand->iv->base), 0);
+ *comp = iv_elimination_compare (data, use);
+ return true;
+ }
+
cand_value_at (loop, cand, use->stmt, desc->niter, &bnd);
*bound = fold_convert (TREE_TYPE (cand->iv->base),
inv_vars = inv_vars_elim;
inv_vars_elim = NULL;
inv_expr = inv_expr_elim;
+ /* For doloop candidate/use pair, adjust to zero cost. */
+ if (group->doloop_p && cand->doloop_p && elim_cost.cost > no_cost.cost)
+ cost = no_cost;
}
else
{
}
}
+/* If PREFERRED_MODE is suitable and profitable, use the preferred
+ PREFERRED_MODE to compute doloop iv base from niter: base = niter + 1. */
+
+static tree
+compute_doloop_base_on_mode (machine_mode preferred_mode, tree niter,
+ const widest_int &iterations_max)
+{
+ tree ntype = TREE_TYPE (niter);
+ tree pref_type = lang_hooks.types.type_for_mode (preferred_mode, 1);
+ if (!pref_type)
+ return fold_build2 (PLUS_EXPR, ntype, unshare_expr (niter),
+ build_int_cst (ntype, 1));
+
+ gcc_assert (TREE_CODE (pref_type) == INTEGER_TYPE);
+
+ int prec = TYPE_PRECISION (ntype);
+ int pref_prec = TYPE_PRECISION (pref_type);
+
+ tree base;
+
+ /* Check if PREFERRED_MODE is able to represent niter. */
+ if (pref_prec > prec
+ || wi::ltu_p (iterations_max,
+ widest_int::from (wi::max_value (pref_prec, UNSIGNED),
+ UNSIGNED)))
+ {
+ /* No wrap, it is safe to use preferred type after niter + 1. */
+ if (wi::ltu_p (iterations_max,
+ widest_int::from (wi::max_value (prec, UNSIGNED),
+ UNSIGNED)))
+ {
+ /* This could help to optimize "-1 +1" pair when niter looks
+ like "n-1": n is in original mode. "base = (n - 1) + 1"
+ in PREFERRED_MODE: it could be base = (PREFERRED_TYPE)n. */
+ base = fold_build2 (PLUS_EXPR, ntype, unshare_expr (niter),
+ build_int_cst (ntype, 1));
+ base = fold_convert (pref_type, base);
+ }
+
+ /* To avoid wrapping, convert niter to the preferred type before adding 1. */
+ else
+ {
+ niter = fold_convert (pref_type, niter);
+ base = fold_build2 (PLUS_EXPR, pref_type, unshare_expr (niter),
+ build_int_cst (pref_type, 1));
+ }
+ }
+ else
+ base = fold_build2 (PLUS_EXPR, ntype, unshare_expr (niter),
+ build_int_cst (ntype, 1));
+ return base;
+}
+
+/* Add one doloop dedicated IV candidate:
+ - Base is (may_be_zero ? 1 : (niter + 1)).
+ - Step is -1. */
+
+static void
+add_iv_candidate_for_doloop (struct ivopts_data *data)
+{
+ tree_niter_desc *niter_desc = niter_for_single_dom_exit (data);
+ gcc_assert (niter_desc && niter_desc->assumptions);
+
+ tree niter = niter_desc->niter;
+ tree ntype = TREE_TYPE (niter);
+ gcc_assert (TREE_CODE (ntype) == INTEGER_TYPE);
+
+ tree may_be_zero = niter_desc->may_be_zero;
+ if (may_be_zero && integer_zerop (may_be_zero))
+ may_be_zero = NULL_TREE;
+ if (may_be_zero)
+ {
+ if (COMPARISON_CLASS_P (may_be_zero))
+ {
+ niter = fold_build3 (COND_EXPR, ntype, may_be_zero,
+ build_int_cst (ntype, 0),
+ rewrite_to_non_trapping_overflow (niter));
+ }
+ /* Don't try to obtain the iteration count expression when may_be_zero is
+ integer_nonzerop (the iteration count is actually one) or anything else. */
+ else
+ return;
+ }
+
+ machine_mode mode = TYPE_MODE (ntype);
+ machine_mode pref_mode = targetm.preferred_doloop_mode (mode);
+
+ tree base;
+ if (mode != pref_mode)
+ {
+ base = compute_doloop_base_on_mode (pref_mode, niter, niter_desc->max);
+ ntype = TREE_TYPE (base);
+ }
+ else
+ base = fold_build2 (PLUS_EXPR, ntype, unshare_expr (niter),
+ build_int_cst (ntype, 1));
+
+
+ add_candidate (data, base, build_int_cst (ntype, -1), true, NULL, NULL, true);
+}
+
/* Finds the candidates for the induction variables. */
static void
/* Add commonly used ivs. */
add_standard_iv_candidates (data);
+ /* Add doloop dedicated ivs. */
+ if (data->doloop_use_p)
+ add_iv_candidate_for_doloop (data);
+
/* Add old induction variables. */
add_iv_candidate_for_bivs (data);
or a const set. */
if (cost_base.cost == 0)
cost_base.cost = COSTS_N_INSNS (1);
- cost_step = add_cost (data->speed, TYPE_MODE (TREE_TYPE (base)));
-
+ /* Doloop decrement should be considered as zero cost. */
+ if (cand->doloop_p)
+ cost_step = 0;
+ else
+ cost_step = add_cost (data->speed, TYPE_MODE (TREE_TYPE (base)));
cost = cost_step + adjust_setup_cost (data, cost_base.cost);
/* Prefer the original ivs unless we may gain something by replacing it.
The reason is to make debugging simpler; so this is not relevant for
artificial ivs created by other optimization passes. */
- if (cand->pos != IP_ORIGINAL
- || !SSA_NAME_VAR (cand->var_before)
- || DECL_ARTIFICIAL (SSA_NAME_VAR (cand->var_before)))
+ if ((cand->pos != IP_ORIGINAL
+ || !SSA_NAME_VAR (cand->var_before)
+ || DECL_ARTIFICIAL (SSA_NAME_VAR (cand->var_before)))
+ /* Prefer doloop as well. */
+ && !cand->doloop_p)
cost++;
/* Prefer not to insert statements into latch unless there are some
if (ivs->n_cand_uses[cid] == 0)
{
bitmap_clear_bit (ivs->cands, cid);
- ivs->n_cands--;
+ if (!cp->cand->doloop_p || !targetm.have_count_reg_decr_p)
+ ivs->n_cands--;
ivs->cand_cost -= cp->cand->cost;
iv_ca_set_remove_invs (ivs, cp->cand->inv_vars, ivs->n_inv_var_uses);
iv_ca_set_remove_invs (ivs, cp->cand->inv_exprs, ivs->n_inv_expr_uses);
if (ivs->n_cand_uses[cid] == 1)
{
bitmap_set_bit (ivs->cands, cid);
- ivs->n_cands++;
+ if (!cp->cand->doloop_p || !targetm.have_count_reg_decr_p)
+ ivs->n_cands++;
ivs->cand_cost += cp->cand->cost;
iv_ca_set_add_invs (ivs, cp->cand->inv_vars, ivs->n_inv_var_uses);
iv_ca_set_add_invs (ivs, cp->cand->inv_exprs, ivs->n_inv_expr_uses);
fprintf (file, " cost: %" PRId64 " (complexity %d)\n", cost.cost,
cost.complexity);
+ fprintf (file, " reg_cost: %d\n",
+ ivopts_estimate_reg_pressure (data, ivs->n_invs, ivs->n_cands));
fprintf (file, " cand_cost: %" PRId64 "\n cand_group_cost: "
"%" PRId64 " (complexity %d)\n", ivs->cand_cost,
ivs->cand_use_cost.cost, ivs->cand_use_cost.complexity);
}
comp = fold_convert (type, comp);
- if (!valid_gimple_rhs_p (comp)
- || (gimple_code (use->stmt) != GIMPLE_PHI
- /* We can't allow re-allocating the stmt as it might be pointed
- to still. */
- && (get_gimple_rhs_num_ops (TREE_CODE (comp))
- >= gimple_num_ops (gsi_stmt (bsi)))))
+ comp = force_gimple_operand (comp, &seq, false, NULL);
+ gimple_seq_add_seq (&stmt_list, seq);
+ if (gimple_code (use->stmt) != GIMPLE_PHI
+ /* We can't allow re-allocating the stmt as it might be pointed
+ to still. */
+ && (get_gimple_rhs_num_ops (TREE_CODE (comp))
+ >= gimple_num_ops (gsi_stmt (bsi))))
{
comp = force_gimple_operand (comp, &seq, true, NULL);
gimple_seq_add_seq (&stmt_list, seq);
case IFN_MASK_STORE:
case IFN_MASK_LOAD_LANES:
case IFN_MASK_STORE_LANES:
+ case IFN_LEN_LOAD:
+ case IFN_LEN_STORE:
/* The second argument contains the correct alias type. */
gcc_assert (use->op_p = gimple_call_arg_ptr (call, 0));
return TREE_TYPE (gimple_call_arg (call, 1));
count++;
if (count > 1)
- BREAK_FROM_IMM_USE_STMT (imm_iter);
+ break;
}
if (!count)
struct iv_use dummy_use;
struct iv_cand *best_cand = NULL, *cand;
unsigned i, best_pref = 0, cand_pref;
+ tree comp = NULL_TREE;
memset (&dummy_use, 0, sizeof (dummy_use));
dummy_use.iv = info->iv;
? 1 : 0;
if (best_cand == NULL || best_pref < cand_pref)
{
- best_cand = cand;
- best_pref = cand_pref;
+ tree this_comp
+ = get_debug_computation_at (data->current_loop,
+ SSA_NAME_DEF_STMT (def),
+ &dummy_use, cand);
+ if (this_comp)
+ {
+ best_cand = cand;
+ best_pref = cand_pref;
+ comp = this_comp;
+ }
}
}
if (!best_cand)
continue;
- tree comp = get_computation_at (data->current_loop,
- SSA_NAME_DEF_STMT (def),
- &dummy_use, best_cand);
- if (!comp)
- continue;
-
+ comp = unshare_expr (comp);
if (count > 1)
{
tree vexpr = make_node (DEBUG_EXPR_DECL);
delete data->inv_expr_tab;
data->inv_expr_tab = NULL;
free_affine_expand_cache (&data->name_expansion_cache);
+ if (data->base_object_map)
+ delete data->base_object_map;
delete data->iv_common_cand_tab;
data->iv_common_cand_tab = NULL;
data->iv_common_cands.release ();
}
}
+/* Find doloop comparison use and set its doloop_p on if found. */
+
+static bool
+find_doloop_use (struct ivopts_data *data)
+{
+ struct loop *loop = data->current_loop;
+
+ for (unsigned i = 0; i < data->vgroups.length (); i++)
+ {
+ struct iv_group *group = data->vgroups[i];
+ if (group->type == USE_COMPARE)
+ {
+ gcc_assert (group->vuses.length () == 1);
+ struct iv_use *use = group->vuses[0];
+ gimple *stmt = use->stmt;
+ if (gimple_code (stmt) == GIMPLE_COND)
+ {
+ basic_block bb = gimple_bb (stmt);
+ edge true_edge, false_edge;
+ extract_true_false_edges_from_block (bb, &true_edge, &false_edge);
+ /* This comparison is used for the loop latch. Require that the
+ latch is empty for now. */
+ if ((loop->latch == true_edge->dest
+ || loop->latch == false_edge->dest)
+ && empty_block_p (loop->latch))
+ {
+ group->doloop_p = true;
+ if (dump_file && (dump_flags & TDF_DETAILS))
+ {
+ fprintf (dump_file, "Doloop cmp iv use: ");
+ print_gimple_stmt (dump_file, stmt, TDF_DETAILS);
+ }
+ return true;
+ }
+ }
+ }
+ }
+
+ return false;
+}
+
+/* For the targets which support doloop, to predict whether later RTL doloop
+ transformation will perform on this loop, further detect the doloop use and
+ mark the flag doloop_use_p if predicted. */
+
+void
+analyze_and_mark_doloop_use (struct ivopts_data *data)
+{
+ data->doloop_use_p = false;
+
+ if (!flag_branch_on_count_reg)
+ return;
+
+ if (data->current_loop->unroll == USHRT_MAX)
+ return;
+
+ if (!generic_predict_doloop_p (data))
+ return;
+
+ if (find_doloop_use (data))
+ {
+ data->doloop_use_p = true;
+ if (dump_file && (dump_flags & TDF_DETAILS))
+ {
+ struct loop *loop = data->current_loop;
+ fprintf (dump_file,
+ "Predict loop %d can perform"
+ " doloop optimization later.\n",
+ loop->num);
+ flow_loop_dump (loop, dump_file, NULL, 1);
+ }
+ }
+}
+
/* Optimizes the LOOP. Returns true if anything changed. */
static bool
data->body_includes_call = loop_body_includes_call (body, loop->num_nodes);
renumber_gimple_stmt_uids_in_blocks (body, loop->num_nodes);
- data->loop_single_exit_p = exit != NULL && loop_only_exit_p (loop, exit);
+ data->loop_single_exit_p
+ = exit != NULL && loop_only_exit_p (loop, body, exit);
/* For each ssa name determines whether it behaves as an induction variable
in some loop. */
/* Determine cost scaling factor for basic blocks in loop. */
determine_scaling_factor (data, body);
+ /* Analyze doloop possibility and mark the doloop use if predicted. */
+ analyze_and_mark_doloop_use (data);
+
/* Finds candidates for the induction variables (item 2). */
find_iv_candidates (data);
void
tree_ssa_iv_optimize (void)
{
- class loop *loop;
struct ivopts_data data;
auto_bitmap toremove;
tree_ssa_iv_optimize_init (&data);
/* Optimize the loops starting with the innermost ones. */
- FOR_EACH_LOOP (loop, LI_FROM_INNERMOST)
+ for (auto loop : loops_list (cfun, LI_FROM_INNERMOST))
{
+ if (!dbg_cnt (ivopts_loop))
+ continue;
+
if (dump_file && (dump_flags & TDF_DETAILS))
flow_loop_dump (loop, dump_file, NULL, 1);