#define IN_TARGET_CODE 1
-#include "config.h"
#define INCLUDE_STRING
+#define INCLUDE_ALGORITHM
+#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "function-abi.h"
#include "gimple-pretty-print.h"
#include "tree-ssa-loop-niter.h"
+#include "fractional-cost.h"
/* This file should be included last. */
#include "target-def.h"
for STMT_INFO, which has cost kind KIND. If this is a scalar operation,
try to subdivide the target-independent categorization provided by KIND
to get a more accurate cost. */
-static unsigned int
+static fractional_cost
aarch64_detect_scalar_stmt_subtype (vec_info *vinfo, vect_cost_for_stmt kind,
stmt_vec_info stmt_info,
- unsigned int stmt_cost)
+ fractional_cost stmt_cost)
{
/* Detect an extension of a loaded value. In general, we'll be able to fuse
the extension with the load. */
the target-independent categorization provided by KIND to get a more
accurate cost. WHERE specifies where the cost associated with KIND
occurs. */
-static unsigned int
+static fractional_cost
aarch64_detect_vector_stmt_subtype (vec_info *vinfo, vect_cost_for_stmt kind,
stmt_vec_info stmt_info, tree vectype,
enum vect_cost_model_location where,
- unsigned int stmt_cost)
+ fractional_cost stmt_cost)
{
const simd_vec_cost *simd_costs = aarch64_simd_vec_costs (vectype);
const sve_vec_cost *sve_costs = nullptr;
for STMT_INFO, which has cost kind KIND and which when vectorized would
operate on vector type VECTYPE. Adjust the cost as necessary for SVE
targets. */
-static unsigned int
+static fractional_cost
aarch64_sve_adjust_stmt_cost (class vec_info *vinfo, vect_cost_for_stmt kind,
stmt_vec_info stmt_info, tree vectype,
- unsigned int stmt_cost)
+ fractional_cost stmt_cost)
{
/* Unlike vec_promote_demote, vector_stmt conversions do not change the
vector register size or number of units. Integer promotions of this
/* STMT_COST is the cost calculated for STMT_INFO, which has cost kind KIND
and which when vectorized would operate on vector type VECTYPE. Add the
cost of any embedded operations. */
-static unsigned int
+static fractional_cost
aarch64_adjust_stmt_cost (vect_cost_for_stmt kind, stmt_vec_info stmt_info,
- tree vectype, unsigned int stmt_cost)
+ tree vectype, fractional_cost stmt_cost)
{
if (vectype)
{
if (flag_vect_cost_model)
{
- int stmt_cost
+ fractional_cost stmt_cost
= aarch64_builtin_vectorization_cost (kind, vectype, misalign);
/* Do one-time initialization based on the vinfo. */
count *= LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo); /* FIXME */
}
- retval = (unsigned) (count * stmt_cost);
+ retval = (count * stmt_cost).ceil ();
costs->region[where] += retval;
}
/* Use ISSUE_INFO to estimate the minimum number of cycles needed to issue
the operations described by OPS. This is a very simplistic model! */
-static unsigned int
+static fractional_cost
aarch64_estimate_min_cycles_per_iter
(const aarch64_vec_op_count *ops,
const aarch64_base_vec_issue_info *issue_info)
{
- unsigned int cycles = MAX (ops->reduction_latency, 1);
- cycles = MAX (cycles, CEIL (ops->stores, issue_info->stores_per_cycle));
- cycles = MAX (cycles, CEIL (ops->loads + ops->stores,
- issue_info->loads_stores_per_cycle));
- cycles = MAX (cycles, CEIL (ops->general_ops,
- issue_info->general_ops_per_cycle));
+ fractional_cost cycles = MAX (ops->reduction_latency, 1);
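+ /* Each brace-enclosed pair below constructs a fractional_cost A / B:
+    the number of cycles needed to issue A operations at a rate of
+    B operations per cycle.  */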
+ cycles = std::max (cycles, { ops->stores, issue_info->stores_per_cycle });
+ cycles = std::max (cycles, { ops->loads + ops->stores,
+ issue_info->loads_stores_per_cycle });
+ cycles = std::max (cycles, { ops->general_ops,
+ issue_info->general_ops_per_cycle });
return cycles;
}
if (!issue_info)
return body_cost;
- unsigned int scalar_cycles_per_iter
+ fractional_cost scalar_cycles_per_iter
= aarch64_estimate_min_cycles_per_iter (&costs->scalar_ops,
issue_info->scalar);
- unsigned int advsimd_cycles_per_iter
+
+ fractional_cost advsimd_cycles_per_iter
= aarch64_estimate_min_cycles_per_iter (&costs->advsimd_ops,
issue_info->advsimd);
+
bool could_use_advsimd
= ((costs->vec_flags & VEC_ADVSIMD)
|| (aarch64_autovec_preference != 2
dump_printf_loc (MSG_NOTE, vect_location, "Scalar issue estimate:\n");
costs->scalar_ops.dump ();
dump_printf_loc (MSG_NOTE, vect_location,
- " estimated cycles per iteration = %d\n",
- scalar_cycles_per_iter);
+ " estimated cycles per iteration = %f\n",
+ scalar_cycles_per_iter.as_double ());
if (could_use_advsimd)
{
dump_printf_loc (MSG_NOTE, vect_location,
"Advanced SIMD issue estimate:\n");
costs->advsimd_ops.dump ();
dump_printf_loc (MSG_NOTE, vect_location,
- " estimated cycles per iteration = %d\n",
- advsimd_cycles_per_iter);
+ " estimated cycles per iteration = %f\n",
+ advsimd_cycles_per_iter.as_double ());
}
else
dump_printf_loc (MSG_NOTE, vect_location,
"Loop could not use Advanced SIMD\n");
}
- uint64_t vector_cycles_per_iter = advsimd_cycles_per_iter;
+ fractional_cost vector_cycles_per_iter = advsimd_cycles_per_iter;
unsigned int vector_reduction_latency = costs->advsimd_ops.reduction_latency;
+
if ((costs->vec_flags & VEC_ANY_SVE) && issue_info->sve)
{
/* Estimate the minimum number of cycles per iteration needed to issue
non-predicate operations. */
- unsigned int sve_cycles_per_iter
+ fractional_cost sve_cycles_per_iter
= aarch64_estimate_min_cycles_per_iter (&costs->sve_ops,
issue_info->sve);
/* Separately estimate the minimum number of cycles per iteration needed
to issue the predicate operations. */
- unsigned int pred_cycles_per_iter
- = CEIL (costs->sve_ops.pred_ops, issue_info->sve->pred_ops_per_cycle);
+ fractional_cost pred_cycles_per_iter
+ = { costs->sve_ops.pred_ops, issue_info->sve->pred_ops_per_cycle };
if (dump_enabled_p ())
{
costs->sve_ops.dump ();
dump_printf_loc (MSG_NOTE, vect_location,
" estimated cycles per iteration for non-predicate"
- " operations = %d\n", sve_cycles_per_iter);
+ " operations = %f\n",
+ sve_cycles_per_iter.as_double ());
if (costs->sve_ops.pred_ops)
dump_printf_loc (MSG_NOTE, vect_location, " estimated cycles per"
" iteration for predicate operations = %d\n",
- pred_cycles_per_iter);
+ pred_cycles_per_iter.as_double ());
}
- vector_cycles_per_iter = MAX (sve_cycles_per_iter, pred_cycles_per_iter);
+ vector_cycles_per_iter = std::max (sve_cycles_per_iter,
+ pred_cycles_per_iter);
vector_reduction_latency = costs->sve_ops.reduction_latency;
/* If the scalar version of the loop could issue at least as
too drastic for scalar_cycles_per_iter vs. sve_cycles_per_iter;
code later in the function handles that case in a more
conservative way. */
- uint64_t sve_estimate = pred_cycles_per_iter + 1;
+ fractional_cost sve_estimate = pred_cycles_per_iter + 1;
if (scalar_cycles_per_iter < sve_estimate)
{
unsigned int min_cost
if (could_use_advsimd && advsimd_cycles_per_iter < sve_estimate)
{
/* This ensures that min_cost > orig_body_cost * 2. */
- unsigned int min_cost
- = orig_body_cost * CEIL (sve_estimate, advsimd_cycles_per_iter) + 1;
+ unsigned int factor
+ = fractional_cost::scale (1, sve_estimate,
+ advsimd_cycles_per_iter);
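+ /* FACTOR is at least 2, because ADVSIMD_CYCLES_PER_ITER is strictly
+    less than SVE_ESTIMATE here.  */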
+ unsigned int min_cost = orig_body_cost * factor + 1;
if (body_cost < min_cost)
{
if (dump_enabled_p ())
so minor differences should only result in minor changes. */
else if (scalar_cycles_per_iter < vector_cycles_per_iter)
{
- body_cost = CEIL (body_cost * vector_cycles_per_iter,
- scalar_cycles_per_iter);
+ body_cost = fractional_cost::scale (body_cost, vector_cycles_per_iter,
+ scalar_cycles_per_iter);
if (dump_enabled_p ())
dump_printf_loc (MSG_NOTE, vect_location,
"Increasing body cost to %d because scalar code"
&& scalar_cycles_per_iter > vector_cycles_per_iter
&& !should_disparage)
{
- body_cost = CEIL (body_cost * vector_cycles_per_iter,
- scalar_cycles_per_iter);
+ body_cost = fractional_cost::scale (body_cost, vector_cycles_per_iter,
+ scalar_cycles_per_iter);
if (dump_enabled_p ())
dump_printf_loc (MSG_NOTE, vect_location,
"Decreasing body cost to %d account for smaller"
ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
}
+/* Test the fractional_cost class. */
+
+static void
+aarch64_test_fractional_cost ()
+{
+ using cf = fractional_cost;
+
+ ASSERT_EQ (cf (0, 20), 0);
+
+ ASSERT_EQ (cf (4, 2), 2);
+ ASSERT_EQ (3, cf (9, 3));
+
+ ASSERT_NE (cf (5, 2), 2);
+ ASSERT_NE (3, cf (8, 3));
+
+ ASSERT_EQ (cf (7, 11) + cf (15, 11), 2);
+ ASSERT_EQ (cf (2, 3) + cf (3, 5), cf (19, 15));
+ ASSERT_EQ (cf (2, 3) + cf (1, 6) + cf (1, 6), 1);
+
+ ASSERT_EQ (cf (14, 15) - cf (4, 15), cf (2, 3));
+ ASSERT_EQ (cf (1, 4) - cf (1, 2), 0);
+ ASSERT_EQ (cf (3, 5) - cf (1, 10), cf (1, 2));
+ ASSERT_EQ (cf (11, 3) - 3, cf (2, 3));
+ ASSERT_EQ (3 - cf (7, 3), cf (2, 3));
+ ASSERT_EQ (3 - cf (10, 3), 0);
+
+ ASSERT_EQ (cf (2, 3) * 5, cf (10, 3));
+ ASSERT_EQ (14 * cf (11, 21), cf (22, 3));
+
+ ASSERT_TRUE (cf (4, 15) < cf (5, 15));
+ ASSERT_FALSE (cf (5, 15) < cf (5, 15));
+ ASSERT_FALSE (cf (6, 15) < cf (5, 15));
+ ASSERT_TRUE (cf (1, 3) < cf (2, 5));
+ ASSERT_TRUE (cf (1, 12) < cf (1, 6));
+ ASSERT_FALSE (cf (5, 3) < cf (5, 3));
+ ASSERT_TRUE (cf (239, 240) < 1);
+ ASSERT_FALSE (cf (240, 240) < 1);
+ ASSERT_FALSE (cf (241, 240) < 1);
+ ASSERT_FALSE (2 < cf (207, 104));
+ ASSERT_FALSE (2 < cf (208, 104));
+ ASSERT_TRUE (2 < cf (209, 104));
+
+ ASSERT_TRUE (cf (4, 15) <= cf (5, 15));
+ ASSERT_TRUE (cf (5, 15) <= cf (5, 15));
+ ASSERT_FALSE (cf (6, 15) <= cf (5, 15));
+ ASSERT_TRUE (cf (1, 3) <= cf (2, 5));
+ ASSERT_TRUE (cf (1, 12) <= cf (1, 6));
+ ASSERT_TRUE (cf (5, 3) <= cf (5, 3));
+ ASSERT_TRUE (cf (239, 240) <= 1);
+ ASSERT_TRUE (cf (240, 240) <= 1);
+ ASSERT_FALSE (cf (241, 240) <= 1);
+ ASSERT_FALSE (2 <= cf (207, 104));
+ ASSERT_TRUE (2 <= cf (208, 104));
+ ASSERT_TRUE (2 <= cf (209, 104));
+
+ ASSERT_FALSE (cf (4, 15) >= cf (5, 15));
+ ASSERT_TRUE (cf (5, 15) >= cf (5, 15));
+ ASSERT_TRUE (cf (6, 15) >= cf (5, 15));
+ ASSERT_FALSE (cf (1, 3) >= cf (2, 5));
+ ASSERT_FALSE (cf (1, 12) >= cf (1, 6));
+ ASSERT_TRUE (cf (5, 3) >= cf (5, 3));
+ ASSERT_FALSE (cf (239, 240) >= 1);
+ ASSERT_TRUE (cf (240, 240) >= 1);
+ ASSERT_TRUE (cf (241, 240) >= 1);
+ ASSERT_TRUE (2 >= cf (207, 104));
+ ASSERT_TRUE (2 >= cf (208, 104));
+ ASSERT_FALSE (2 >= cf (209, 104));
+
+ ASSERT_FALSE (cf (4, 15) > cf (5, 15));
+ ASSERT_FALSE (cf (5, 15) > cf (5, 15));
+ ASSERT_TRUE (cf (6, 15) > cf (5, 15));
+ ASSERT_FALSE (cf (1, 3) > cf (2, 5));
+ ASSERT_FALSE (cf (1, 12) > cf (1, 6));
+ ASSERT_FALSE (cf (5, 3) > cf (5, 3));
+ ASSERT_FALSE (cf (239, 240) > 1);
+ ASSERT_FALSE (cf (240, 240) > 1);
+ ASSERT_TRUE (cf (241, 240) > 1);
+ ASSERT_TRUE (2 > cf (207, 104));
+ ASSERT_FALSE (2 > cf (208, 104));
+ ASSERT_FALSE (2 > cf (209, 104));
+
+ ASSERT_EQ (cf (1, 2).ceil (), 1);
+ ASSERT_EQ (cf (11, 7).ceil (), 2);
+ ASSERT_EQ (cf (20, 1).ceil (), 20);
+ ASSERT_EQ ((cf (0xfffffffd) + 1).ceil (), 0xfffffffe);
+ ASSERT_EQ ((cf (0xfffffffd) + 2).ceil (), 0xffffffff);
+ ASSERT_EQ ((cf (0xfffffffd) + 3).ceil (), 0xffffffff);
+ ASSERT_EQ ((cf (0x7fffffff) * 2).ceil (), 0xfffffffe);
+ ASSERT_EQ ((cf (0x80000000) * 2).ceil (), 0xffffffff);
+
+ ASSERT_EQ (cf (1, 2).as_double (), 0.5);
+}
+
/* Run all target-specific selftests. */
static void
aarch64_run_selftests (void)
{
aarch64_test_loading_full_dump ();
+ aarch64_test_fractional_cost ();
}
} // namespace selftest
--- /dev/null
+// Simple fixed-point representation of fractional costs
+// Copyright (C) 2021 Free Software Foundation, Inc.
+//
+// This file is part of GCC.
+//
+// GCC is free software; you can redistribute it and/or modify it under
+// the terms of the GNU General Public License as published by the Free
+// Software Foundation; either version 3, or (at your option) any later
+// version.
+//
+// GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+// WARRANTY; without even the implied warranty of MERCHANTABILITY or
+// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+// for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with GCC; see the file COPYING3. If not see
+// <http://www.gnu.org/licenses/>.
+
+// A simple saturating fixed-point type for representing fractional
+// intermediate results in cost calculations. The input and result
+// costs are assumed to be uint32_ts. Unlike sreal, the class can
+// represent most values that we care about exactly (without rounding).
+// See the comment above the SCALE field for the current set of
+// exactly-representable reciprocals.
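+// For example, fractional_cost (2, 3) represents exactly two thirds of
+// a cycle: fractional_cost (2, 3) + fractional_cost (1, 3) == 1 holds
+// exactly, and ceil () rounds any remaining fraction up to a whole
+// number of cycles.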
+class fractional_cost
+{
+public:
+ // Construct an object equal to INT_VALUE.
+ constexpr fractional_cost (uint32_t int_value = 0)
+ : m_value (uint64_t (int_value) * SCALE) {}
+
+ fractional_cost (uint32_t a, uint32_t b);
+
+ fractional_cost operator+ (const fractional_cost &) const;
+ fractional_cost operator- (const fractional_cost &) const;
+ fractional_cost operator* (uint32_t) const;
+
+ fractional_cost &operator+= (const fractional_cost &);
+ fractional_cost &operator-= (const fractional_cost &);
+ fractional_cost &operator*= (uint32_t);
+
+ bool operator== (const fractional_cost &) const;
+ bool operator!= (const fractional_cost &) const;
+ bool operator< (const fractional_cost &) const;
+ bool operator<= (const fractional_cost &) const;
+ bool operator>= (const fractional_cost &) const;
+ bool operator> (const fractional_cost &) const;
+
+ uint32_t ceil () const;
+
+ static uint32_t scale (uint32_t, fractional_cost, fractional_cost);
+
+ explicit operator bool () const { return m_value != 0; }
+
+ // Convert the value to a double.
+ double as_double () const { return double (m_value) / SCALE; }
+
+private:
+ enum raw { RAW };
+ constexpr fractional_cost (uint64_t value, raw) : m_value (value) {}
+
+ // A multiple of [1, 16] * 16. This ensures that 1/N is representable
+ // for every possible SVE element count N, or for any "X per cycle"
+ // value N in the range [1, 16].
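+ // (11531520 is 16 * 720720, where 720720 is the least common
+ // multiple of 1..16.)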
+ static const uint32_t SCALE = 11531520;
+
+ // The value multiplied by SCALE.
+ uint64_t m_value;
+};
+
+// Construct a representation of A / B, rounding up if (contrary to
+// expectations) we can't represent the value exactly. For now we
+// treat inexact values as a bug, since all values of B should come
+// from a small set of values that are known at compile time.
+inline fractional_cost::fractional_cost (uint32_t a, uint32_t b)
+ : m_value (CEIL (uint64_t (a) * SCALE, uint64_t (b)))
+{
+ gcc_checking_assert (SCALE % b == 0);
+}
+
+inline fractional_cost
+fractional_cost::operator+ (const fractional_cost &other) const
+{
+ uint64_t sum = m_value + other.m_value;
+ return { sum >= m_value ? sum : ~uint64_t (0), RAW };
+}
+
+inline fractional_cost &
+fractional_cost::operator+= (const fractional_cost &other)
+{
+ *this = *this + other;
+ return *this;
+}
+
+inline fractional_cost
+fractional_cost::operator- (const fractional_cost &other) const
+{
+ uint64_t diff = m_value - other.m_value;
+ return { diff <= m_value ? diff : 0, RAW };
+}
+
+inline fractional_cost &
+fractional_cost::operator-= (const fractional_cost &other)
+{
+ *this = *this - other;
+ return *this;
+}
+
+inline fractional_cost
+fractional_cost::operator* (uint32_t other) const
+{
+ if (other == 0)
+ return 0;
+
+ uint64_t max = ~uint64_t (0);
+ return { m_value <= max / other ? m_value * other : max, RAW };
+}
+
+inline fractional_cost &
+fractional_cost::operator*= (uint32_t other)
+{
+ *this = *this * other;
+ return *this;
+}
+
+inline bool
+fractional_cost::operator== (const fractional_cost &other) const
+{
+ return m_value == other.m_value;
+}
+
+inline bool
+fractional_cost::operator!= (const fractional_cost &other) const
+{
+ return m_value != other.m_value;
+}
+
+inline bool
+fractional_cost::operator< (const fractional_cost &other) const
+{
+ return m_value < other.m_value;
+}
+
+inline bool
+fractional_cost::operator<= (const fractional_cost &other) const
+{
+ return m_value <= other.m_value;
+}
+
+inline bool
+fractional_cost::operator>= (const fractional_cost &other) const
+{
+ return m_value >= other.m_value;
+}
+
+inline bool
+fractional_cost::operator> (const fractional_cost &other) const
+{
+ return m_value > other.m_value;
+}
+
+// Round the value up to the nearest integer and saturate to a uint32_t.
+inline uint32_t
+fractional_cost::ceil () const
+{
+ uint32_t max = ~uint32_t (0);
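+ // Besides clamping the result, the check below keeps the addition of
+ // SCALE - 1 from wrapping when M_VALUE has saturated near the
+ // uint64_t maximum.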
+ if (m_value <= uint64_t (max - 1) * SCALE)
+ return (m_value + SCALE - 1) / SCALE;
+ return max;
+}
+
+// Round (COST * A) / B up to the nearest integer and saturate to a uint32_t.
+inline uint32_t
+fractional_cost::scale (uint32_t cost, fractional_cost a, fractional_cost b)
+{
+ widest_int result = wi::div_ceil (widest_int (cost) * a.m_value,
+ b.m_value, SIGNED);
+ if (result < ~uint32_t (0))
+ return result.to_shwi ();
+ return ~uint32_t (0);
+}
+
+inline fractional_cost
+operator+ (uint32_t a, const fractional_cost &b)
+{
+ return b.operator+ (a);
+}
+
+inline fractional_cost
+operator- (uint32_t a, const fractional_cost &b)
+{
+ return fractional_cost (a).operator- (b);
+}
+
+inline fractional_cost
+operator* (uint32_t a, const fractional_cost &b)
+{
+ return b.operator* (a);
+}
+
+inline bool
+operator== (uint32_t a, const fractional_cost &b)
+{
+ return b.operator== (a);
+}
+
+inline bool
+operator!= (uint32_t a, const fractional_cost &b)
+{
+ return b.operator!= (a);
+}
+
+inline bool
+operator< (uint32_t a, const fractional_cost &b)
+{
+ return b.operator> (a);
+}
+
+inline bool
+operator<= (uint32_t a, const fractional_cost &b)
+{
+ return b.operator>= (a);
+}
+
+inline bool
+operator>= (uint32_t a, const fractional_cost &b)
+{
+ return b.operator<= (a);
+}
+
+inline bool
+operator> (uint32_t a, const fractional_cost &b)
+{
+ return b.operator< (a);
+}