+2013-12-19 Ganesh Gopalasubramanian <Ganesh.Gopalasubramanian@amd.com>
+
+ * config/i386/i386.c: Include cfgloop.h.
+ (ix86_loop_memcount): New function.
+ (ix86_loop_unroll_adjust): New function.
+ (TARGET_LOOP_UNROLL_ADJUST): Define.
+ * config/i386/i386.h
+ (TARGET_ADJUST_UNROLL): Define.
+ * config/i386/x86-tune.def
+ (X86_TUNE_ADJUST_UNROLL): Define.
+
2013-12-19 Marek Polacek <polacek@redhat.com>
* config/i386/i386.c (ix86_parse_stringop_strategy_string): Remove
#include "is-a.h"
#include "gimple.h"
#include "gimplify.h"
+#include "cfgloop.h"
#include "dwarf2.h"
#include "df.h"
#include "tm-constrs.h"
}
}
+/* Callback for for_each_rtx: count the memory reference at *X, if any.
+   MEM_COUNT points to the caller's unsigned counter; it is declared
+   void * so that this function has exactly the rtx_function signature
+   it is called through.  A reference wider than four words is counted
+   twice, any other reference once.  The resulting total determines the
+   unrolling factor for bdver3 and bdver4 architectures.  Always return
+   0 so that the walk continues into the remaining sub-rtxes.  */
+
+static int
+ix86_loop_memcount (rtx *x, void *mem_count)
+{
+  unsigned *count = (unsigned *) mem_count;
+
+  if (*x != NULL_RTX && MEM_P (*x))
+    {
+      enum machine_mode mode;
+      unsigned int n_words;
+
+      mode = GET_MODE (*x);
+      n_words = GET_MODE_SIZE (mode) / UNITS_PER_WORD;
+
+      if (n_words > 4)
+        *count += 2;
+      else
+        *count += 1;
+    }
+  return 0;
+}
+
+/* Implement TARGET_LOOP_UNROLL_ADJUST.  Adjust the unroll factor NUNROLL
+   of LOOP based on the hardware capabilities.  For example, bdver3 has
+   a loop buffer which makes unrolling of smaller loops less important.
+   The unroll factor is derived from the number of memory references in
+   the loop body (a budget of 32 references is used as the heuristic)
+   and is never larger than NUNROLL.  Return NUNROLL unchanged when the
+   tuning is disabled, when the loop contains no memory reference, or
+   when it contains more than 32 of them.  */
+
+static unsigned
+ix86_loop_unroll_adjust (unsigned nunroll, struct loop *loop)
+{
+  basic_block *bbs;
+  rtx insn;
+  unsigned i;
+  unsigned mem_count = 0;
+
+  if (!TARGET_ADJUST_UNROLL)
+    return nunroll;
+
+  /* Count the number of memory references within the loop body.
+     FOR_BB_INSNS walks up to and including BB_END, so the last insn
+     of every block takes part in the count.  */
+  bbs = get_loop_body (loop);
+  for (i = 0; i < loop->num_nodes; i++)
+    {
+      FOR_BB_INSNS (bbs[i], insn)
+        if (NONDEBUG_INSN_P (insn))
+          for_each_rtx (&insn, (rtx_function) ix86_loop_memcount, &mem_count);
+    }
+  free (bbs);
+
+  /* Only ever lower the factor; never exceed what the unroller chose.  */
+  if (mem_count && mem_count <= 32)
+    return MIN (nunroll, 32 / mem_count);
+
+  return nunroll;
+}
+
+
/* Implement TARGET_FLOAT_EXCEPTIONS_ROUNDING_SUPPORTED_P. */
static bool
#define TARGET_INIT_LIBFUNCS darwin_rename_builtins
#endif
+#undef TARGET_LOOP_UNROLL_ADJUST
+#define TARGET_LOOP_UNROLL_ADJUST ix86_loop_unroll_adjust
+
#undef TARGET_SPILL_CLASS
#define TARGET_SPILL_CLASS ix86_spill_class
ix86_tune_features[X86_TUNE_AVOID_MEM_OPND_FOR_CMOVE]
#define TARGET_SPLIT_MEM_OPND_FOR_FP_CONVERTS \
ix86_tune_features[X86_TUNE_SPLIT_MEM_OPND_FOR_FP_CONVERTS]
+#define TARGET_ADJUST_UNROLL \
+ ix86_tune_features[X86_TUNE_ADJUST_UNROLL]
/* Feature tests against the various architecture variations. */
enum ix86_arch_indices {
arithmetic to 32bit via PROMOTE_MODE macro. This code generation scheme
is usually used for RISC targets. */
DEF_TUNE (X86_TUNE_PROMOTE_QI_REGS, "promote_qi_regs", 0)
+
+/* X86_TUNE_ADJUST_UNROLL: This enables adjusting the unroll factor based
+   on hardware capabilities.  Bdver3 hardware has a loop buffer which makes
+   unrolling small loops less important.  For such architectures we adjust
+   the unroll factor so that the unrolled loop fits the loop buffer.  */
+DEF_TUNE (X86_TUNE_ADJUST_UNROLL, "adjust_unroll_factor", m_BDVER3 | m_BDVER4)