instead of using a scalar one. */
int
-general_scalar_chain::vector_const_cost (rtx exp)
+general_scalar_chain::vector_const_cost (rtx exp, basic_block bb)
{
gcc_assert (CONST_INT_P (exp));
if (standard_sse_constant_p (exp, vmode))
return ix86_cost->sse_op;
+ if (optimize_bb_for_size_p (bb))
+ return COSTS_N_BYTES (8);
/* We have separate costs for SImode and DImode, use SImode costs
for smaller modes. */
- return ix86_cost->sse_load[smode == DImode ? 1 : 0];
+ return COSTS_N_INSNS (ix86_cost->sse_load[smode == DImode ? 1 : 0]) / 2;
}
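For reference, here is a minimal standalone sketch (not part of the patch) of the unit conventions the converted costs rely on: move, load and store entries in the cost tables are expressed relative to a reg-reg move of cost 2 (see the "Relative to reg-reg move (2)" comments in the tables below), so COSTS_N_INSNS (table_value) / 2 puts them into the same instruction-count units the gain computation accumulates, while the -Os paths compare encoding sizes via COSTS_N_BYTES. The macro definitions and the tuning value used here are assumptions for illustration, not taken from this patch.

#include <cstdio>

/* Assumed stock definitions (COSTS_N_INSNS from rtl.h, COSTS_N_BYTES as the
   local helper in i386.cc); verify against the tree in use.  */
#define COSTS_N_INSNS(N) ((N) * 4)
#define COSTS_N_BYTES(N) ((N) * 2)

int
main ()
{
  /* Hypothetical tuning value: sse_load[1] (DImode load), expressed
     relative to a reg-reg move of cost 2.  */
  int sse_load_di = 10;

  /* Speed: dividing by 2 converts the table value into COSTS_N_INSNS
     units, matching the arithmetic costs used elsewhere in the gain.  */
  int speed_cost = COSTS_N_INSNS (sse_load_di) / 2;  /* == COSTS_N_INSNS (5) */

  /* Size: the patch charges a flat 8 bytes for a non-standard vector
     constant instead of consulting the load table.  */
  int size_cost = COSTS_N_BYTES (8);

  printf ("speed %d, size %d\n", speed_cost, size_cost);
  return 0;
}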
/* Compute a gain for chain conversion. */
smaller modes than SImode the int load/store costs need to be
adjusted as well. */
unsigned sse_cost_idx = smode == DImode ? 1 : 0;
- unsigned m = smode == DImode ? (TARGET_64BIT ? 1 : 2) : 1;
+ int m = smode == DImode ? (TARGET_64BIT ? 1 : 2) : 1;
EXECUTE_IF_SET_IN_BITMAP (insns, 0, insn_uid, bi)
{
rtx def_set = single_set (insn);
rtx src = SET_SRC (def_set);
rtx dst = SET_DEST (def_set);
+ basic_block bb = BLOCK_FOR_INSN (insn);
int igain = 0;
if (REG_P (src) && REG_P (dst))
- igain += 2 * m - ix86_cost->xmm_move;
+ {
+ if (optimize_bb_for_size_p (bb))
+ /* An integer reg-reg move is 2 bytes, while an SSE move is 3. */
+ igain += COSTS_N_BYTES (2 * m - 3);
+ else
+ /* Move costs are normalized to reg-reg move having cost 2. */
+ igain += COSTS_N_INSNS (2 * m - ix86_cost->xmm_move) / 2;
+ }
else if (REG_P (src) && MEM_P (dst))
- igain
- += m * ix86_cost->int_store[2] - ix86_cost->sse_store[sse_cost_idx];
+ {
+ if (optimize_bb_for_size_p (bb))
+ /* An integer load/store is 3+ bytes, while an SSE one is 4+. */
+ igain += COSTS_N_BYTES (3 * m - 4);
+ else
+ igain
+ += COSTS_N_INSNS (m * ix86_cost->int_store[2]
+ - ix86_cost->sse_store[sse_cost_idx]) / 2;
+ }
else if (MEM_P (src) && REG_P (dst))
- igain += m * ix86_cost->int_load[2] - ix86_cost->sse_load[sse_cost_idx];
+ {
+ if (optimize_bb_for_size_p (bb))
+ igain += COSTS_N_BYTES (3 * m - 4);
+ else
+ igain += COSTS_N_INSNS (m * ix86_cost->int_load[2]
+ - ix86_cost->sse_load[sse_cost_idx]) / 2;
+ }
else
{
/* For operations on memory operands, include the overhead
of explicit load and store instructions. */
if (MEM_P (dst))
- igain += optimize_insn_for_size_p ()
- ? -COSTS_N_BYTES (8)
- : (m * (ix86_cost->int_load[2]
- + ix86_cost->int_store[2])
- - (ix86_cost->sse_load[sse_cost_idx] +
- ix86_cost->sse_store[sse_cost_idx]));
+ {
+ if (optimize_bb_for_size_p (bb))
+ /* ??? This should probably account for the size difference
+ between an SSE and an integer load rather than the full
+ SSE load. */
+ igain -= COSTS_N_BYTES (8);
+ else
+ {
+ int cost = (m * (ix86_cost->int_load[2]
+ + ix86_cost->int_store[2])
+ - (ix86_cost->sse_load[sse_cost_idx]
+ + ix86_cost->sse_store[sse_cost_idx]));
+ igain += COSTS_N_INSNS (cost) / 2;
+ }
+ }
switch (GET_CODE (src))
{
igain += ix86_cost->shift_const - ix86_cost->sse_op;
if (CONST_INT_P (XEXP (src, 0)))
- igain -= vector_const_cost (XEXP (src, 0));
+ igain -= vector_const_cost (XEXP (src, 0), bb);
break;
case ROTATE:
igain += m * ix86_cost->add;
if (CONST_INT_P (XEXP (src, 0)))
- igain -= vector_const_cost (XEXP (src, 0));
+ igain -= vector_const_cost (XEXP (src, 0), bb);
if (CONST_INT_P (XEXP (src, 1)))
- igain -= vector_const_cost (XEXP (src, 1));
+ igain -= vector_const_cost (XEXP (src, 1), bb);
if (MEM_P (XEXP (src, 1)))
{
- if (optimize_insn_for_size_p ())
+ if (optimize_bb_for_size_p (bb))
igain -= COSTS_N_BYTES (m == 2 ? 3 : 5);
else
- igain += m * ix86_cost->int_load[2]
- - ix86_cost->sse_load[sse_cost_idx];
+ igain += COSTS_N_INSNS
+ (m * ix86_cost->int_load[2]
+ - ix86_cost->sse_load[sse_cost_idx]) / 2;
}
break;
case CONST_INT:
if (REG_P (dst))
{
- if (optimize_insn_for_size_p ())
+ if (optimize_bb_for_size_p (bb))
{
/* xor (2 bytes) vs. xorps (3 bytes). */
if (src == const0_rtx)
/* DImode can be immediate for TARGET_64BIT
and SImode always. */
igain += m * COSTS_N_INSNS (1);
- igain -= vector_const_cost (src);
+ igain -= vector_const_cost (src, bb);
}
}
else if (MEM_P (dst))
{
igain += (m * ix86_cost->int_store[2]
- ix86_cost->sse_store[sse_cost_idx]);
- igain -= vector_const_cost (src);
+ igain -= vector_const_cost (src, bb);
}
break;
if (XVECEXP (XEXP (src, 1), 0, 0) == const0_rtx)
{
// movd (4 bytes) replaced with movdqa (4 bytes).
- if (!optimize_insn_for_size_p ())
- igain += ix86_cost->sse_to_integer - ix86_cost->xmm_move;
+ if (!optimize_bb_for_size_p (bb))
+ igain += COSTS_N_INSNS (ix86_cost->sse_to_integer
+ - ix86_cost->xmm_move) / 2;
}
else
{
// pshufd; movd replaced with pshufd.
- if (optimize_insn_for_size_p ())
+ if (optimize_bb_for_size_p (bb))
igain += COSTS_N_BYTES (4);
else
igain += ix86_cost->sse_to_integer;
/* Cost the integer to sse and sse to integer moves. */
if (!optimize_function_for_size_p (cfun))
{
- cost += n_sse_to_integer * ix86_cost->sse_to_integer;
+ cost += n_sse_to_integer * COSTS_N_INSNS (ix86_cost->sse_to_integer) / 2;
- /* ??? integer_to_sse but we only have that in the RA cost table.
- Assume sse_to_integer/integer_to_sse are the same which they
- are at the moment. */
- cost += n_integer_to_sse * ix86_cost->sse_to_integer;
+ cost += n_integer_to_sse * COSTS_N_INSNS (ix86_cost->integer_to_sse) / 2;
}
else if (TARGET_64BIT || smode == SImode)
{
with numerous special cases. */
static int
-timode_immed_const_gain (rtx cst)
+timode_immed_const_gain (rtx cst, basic_block bb)
{
/* movabsq vs. movabsq+vmovq+vunpacklqdq. */
if (CONST_WIDE_INT_P (cst)
&& CONST_WIDE_INT_NUNITS (cst) == 2
&& CONST_WIDE_INT_ELT (cst, 0) == CONST_WIDE_INT_ELT (cst, 1))
- return optimize_insn_for_size_p () ? -COSTS_N_BYTES (9)
+ return optimize_bb_for_size_p (bb) ? -COSTS_N_BYTES (9)
: -COSTS_N_INSNS (2);
/* 2x movabsq ~ vmovdqa. */
return 0;
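To make the classification above concrete, here is a standalone sketch (not part of the patch) of the two cases visible in this hunk, with the byte and insn deltas copied from it; the constants, the helper name and the omission of the function's other special cases are illustrative assumptions only.

#include <cstdint>
#include <cstdio>

#define COSTS_N_INSNS(N) ((N) * 4)  /* assumed stock definition */
#define COSTS_N_BYTES(N) ((N) * 2)  /* assumed stock definition */

/* A 128-bit immediate modeled as its two 64-bit halves.  */
static int
timode_immed_const_gain_sketch (uint64_t lo, uint64_t hi, bool for_size)
{
  /* Equal halves: the scalar code needs a single movabsq, the vector
     code additionally needs the GPR->XMM move and the unpack, so the
     gain is negative (and counted in bytes when optimizing for size).  */
  if (lo == hi)
    return for_size ? -COSTS_N_BYTES (9) : -COSTS_N_INSNS (2);
  /* Fallback mirrored from the hunk: 2x movabsq ~ vmovdqa, no gain.
     The real function has further special cases elided here.  */
  return 0;
}

int
main ()
{
  printf ("%d %d\n",
          timode_immed_const_gain_sketch (0x11, 0x11, true),
          timode_immed_const_gain_sketch (0x11, 0x22, false));
  return 0;
}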
rtx src = SET_SRC (def_set);
rtx dst = SET_DEST (def_set);
HOST_WIDE_INT op1val;
+ basic_block bb = BLOCK_FOR_INSN (insn);
int scost, vcost;
int igain = 0;
switch (GET_CODE (src))
{
case REG:
- if (optimize_insn_for_size_p ())
+ if (optimize_bb_for_size_p (bb))
igain = MEM_P (dst) ? COSTS_N_BYTES (6) : COSTS_N_BYTES (3);
else
igain = COSTS_N_INSNS (1);
break;
case MEM:
- igain = optimize_insn_for_size_p () ? COSTS_N_BYTES (7)
+ igain = optimize_bb_for_size_p (bb) ? COSTS_N_BYTES (7)
: COSTS_N_INSNS (1);
break;
case CONST_INT:
if (MEM_P (dst)
&& standard_sse_constant_p (src, V1TImode))
- igain = optimize_insn_for_size_p () ? COSTS_N_BYTES (11) : 1;
+ igain = optimize_bb_for_size_p (bb) ? COSTS_N_BYTES (11) : 1;
break;
case CONST_WIDE_INT:
/* 2 x mov vs. vmovdqa. */
if (MEM_P (dst))
- igain = optimize_insn_for_size_p () ? COSTS_N_BYTES (3)
+ igain = optimize_bb_for_size_p (bb) ? COSTS_N_BYTES (3)
: COSTS_N_INSNS (1);
break;
if (!MEM_P (dst))
igain = COSTS_N_INSNS (1);
if (CONST_SCALAR_INT_P (XEXP (src, 1)))
- igain += timode_immed_const_gain (XEXP (src, 1));
+ igain += timode_immed_const_gain (XEXP (src, 1), bb);
break;
case ASHIFT:
case LSHIFTRT:
/* See ix86_expand_v1ti_shift. */
op1val = INTVAL (XEXP (src, 1));
- if (optimize_insn_for_size_p ())
+ if (optimize_bb_for_size_p (bb))
{
if (op1val == 64 || op1val == 65)
scost = COSTS_N_BYTES (5);
case ASHIFTRT:
/* See ix86_expand_v1ti_ashiftrt. */
op1val = INTVAL (XEXP (src, 1));
- if (optimize_insn_for_size_p ())
+ if (optimize_bb_for_size_p (bb))
{
if (op1val == 64 || op1val == 127)
scost = COSTS_N_BYTES (7);
case ROTATERT:
/* See ix86_expand_v1ti_rotate. */
op1val = INTVAL (XEXP (src, 1));
- if (optimize_insn_for_size_p ())
+ if (optimize_bb_for_size_p (bb))
{
scost = COSTS_N_BYTES (13);
if ((op1val & 31) == 0)
{
if (GET_CODE (XEXP (src, 0)) == AND)
/* and;and;or (9 bytes) vs. ptest (5 bytes). */
- igain = optimize_insn_for_size_p() ? COSTS_N_BYTES (4)
- : COSTS_N_INSNS (2);
+ igain = optimize_bb_for_size_p (bb) ? COSTS_N_BYTES (4)
+ : COSTS_N_INSNS (2);
/* or (3 bytes) vs. ptest (5 bytes). */
- else if (optimize_insn_for_size_p ())
+ else if (optimize_bb_for_size_p (bb))
igain = -COSTS_N_BYTES (2);
}
else if (XEXP (src, 1) == const1_rtx)
/* and;cmp -1 (7 bytes) vs. pcmpeqd;pxor;ptest (13 bytes). */
- igain = optimize_insn_for_size_p() ? -COSTS_N_BYTES (6)
- : -COSTS_N_INSNS (1);
+ igain = optimize_bb_for_size_p (bb) ? -COSTS_N_BYTES (6)
+ : -COSTS_N_INSNS (1);
break;
default:
in 128bit, 256bit and 512bit */
4, 4, 6, /* cost of moving XMM,YMM,ZMM register */
4, /* cost of moving SSE register to integer. */
+ 4, /* cost of moving integer register to SSE. */
COSTS_N_BYTES (5), 0, /* Gather load static, per_elt. */
COSTS_N_BYTES (5), 0, /* Gather store static, per_elt. */
0, /* size of l1 cache */
{4, 8, 16, 32, 64}, /* cost of unaligned stores. */
2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
3, /* cost of moving SSE register to integer. */
+ 3, /* cost of moving integer register to SSE. */
4, 4, /* Gather load static, per_elt. */
4, 4, /* Gather store static, per_elt. */
0, /* size of l1 cache */
{4, 8, 16, 32, 64}, /* cost of unaligned stores. */
2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
3, /* cost of moving SSE register to integer. */
+ 3, /* cost of moving integer register to SSE. */
4, 4, /* Gather load static, per_elt. */
4, 4, /* Gather store static, per_elt. */
4, /* size of l1 cache. 486 has 8kB cache
{4, 8, 16, 32, 64}, /* cost of unaligned stores. */
2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
3, /* cost of moving SSE register to integer. */
+ 3, /* cost of moving integer register to SSE. */
4, 4, /* Gather load static, per_elt. */
4, 4, /* Gather store static, per_elt. */
8, /* size of l1 cache. */
{4, 8, 16, 32, 64}, /* cost of unaligned stores. */
2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
3, /* cost of moving SSE register to integer. */
+ 3, /* cost of moving integer register to SSE. */
4, 4, /* Gather load static, per_elt. */
4, 4, /* Gather store static, per_elt. */
8, /* size of l1 cache. */
{4, 8, 16, 32, 64}, /* cost of unaligned stores. */
2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
3, /* cost of moving SSE register to integer. */
+ 3, /* cost of moving integer register to SSE. */
4, 4, /* Gather load static, per_elt. */
4, 4, /* Gather store static, per_elt. */
8, /* size of l1 cache. */
{2, 2, 8, 16, 32}, /* cost of unaligned stores. */
2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
6, /* cost of moving SSE register to integer. */
+ 6, /* cost of moving integer register to SSE. */
2, 2, /* Gather load static, per_elt. */
2, 2, /* Gather store static, per_elt. */
64, /* size of l1 cache. */
{2, 2, 8, 16, 32}, /* cost of unaligned stores. */
2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
6, /* cost of moving SSE register to integer. */
+ 6, /* cost of moving integer register to SSE. */
2, 2, /* Gather load static, per_elt. */
2, 2, /* Gather store static, per_elt. */
32, /* size of l1 cache. */
{4, 4, 10, 10, 20}, /* cost of unaligned stores. */
2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
5, /* cost of moving SSE register to integer. */
+ 5, /* cost of moving integer register to SSE. */
4, 4, /* Gather load static, per_elt. */
4, 4, /* Gather store static, per_elt. */
64, /* size of l1 cache. */
{4, 4, 10, 10, 20}, /* cost of unaligned stores. */
2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
5, /* cost of moving SSE register to integer. */
+ 5, /* cost of moving integer register to SSE. */
4, 4, /* Gather load static, per_elt. */
4, 4, /* Gather store static, per_elt. */
64, /* size of l1 cache. */
{4, 4, 5, 10, 20}, /* cost of unaligned stores. */
2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
3, /* cost of moving SSE register to integer. */
+ 3, /* cost of moving integer register to SSE. */
4, 4, /* Gather load static, per_elt. */
4, 4, /* Gather store static, per_elt. */
64, /* size of l1 cache. */
{10, 10, 10, 40, 60}, /* cost of unaligned stores. */
2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
16, /* cost of moving SSE register to integer. */
+ 16, /* cost of moving integer register to SSE. */
12, 12, /* Gather load static, per_elt. */
10, 10, /* Gather store static, per_elt. */
16, /* size of l1 cache. */
{8, 8, 8, 16, 32}, /* cost of unaligned stores. */
2, 3, 6, /* cost of moving XMM,YMM,ZMM register. */
6, /* cost of moving SSE register to integer. */
+ 6, /* cost of moving integer register to SSE. */
/* VGATHERDPD is 23 uops and throughput is 9, VGATHERDPD is 35 uops,
throughput 12. Approx 9 uops do not depend on vector size and every load
is 7 uops. */
2, 2, 3, /* cost of moving XMM,YMM,ZMM
register. */
6, /* cost of moving SSE register to integer. */
+ 6, /* cost of moving integer register to SSE. */
/* VGATHERDPD is 23 uops and throughput is 9, VGATHERDPD is 35 uops,
throughput 12. Approx 9 uops do not depend on vector size and every load
is 7 uops. */
2, 2, 3, /* cost of moving XMM,YMM,ZMM
register. */
6, /* cost of moving SSE register to integer. */
+ 6, /* cost of moving integer register to SSE. */
/* VGATHERDPD is 15 uops and throughput is 4, VGATHERDPS is 23 uops,
throughput 9. Approx 7 uops do not depend on vector size and every load
is 4 uops. */
2, 2, 2, /* cost of moving XMM,YMM,ZMM
register. */
6, /* cost of moving SSE register to integer. */
+ 6, /* cost of moving integer register to SSE. */
/* VGATHERDPD is 17 uops and throughput is 4, VGATHERDPS is 24 uops,
throughput 5. Approx 7 uops do not depend on vector size and every load
is 5 uops. */
2, 2, 2, /* cost of moving XMM,YMM,ZMM
register. */
6, /* cost of moving SSE register to integer. */
+ 6, /* cost of moving integer register to SSE. */
/* TODO: gather and scatter instructions are currently disabled in
x86-tune.def. In some cases they are however a win, see PR116582
{8, 8, 8, 8, 16}, /* cost of unaligned stores. */
2, 2, 4, /* cost of moving XMM,YMM,ZMM register */
6, /* cost of moving SSE register to integer. */
+ 6, /* cost of moving integer register to SSE. */
20, 8, /* Gather load static, per_elt. */
22, 10, /* Gather store static, per_elt. */
64, /* size of l1 cache. */
{8, 8, 8, 8, 16}, /* cost of unaligned stores. */
2, 2, 4, /* cost of moving XMM,YMM,ZMM register */
6, /* cost of moving SSE register to integer. */
+ 6, /* cost of moving integer register to SSE. */
20, 8, /* Gather load static, per_elt. */
22, 10, /* Gather store static, per_elt. */
64, /* size of l1 cache. */
{8, 8, 8, 10, 15}, /* cost of unaligned storess. */
2, 3, 4, /* cost of moving XMM,YMM,ZMM register */
6, /* cost of moving SSE register to integer. */
+ 6, /* cost of moving integer register to SSE. */
18, 6, /* Gather load static, per_elt. */
18, 6, /* Gather store static, per_elt. */
32, /* size of l1 cache. */
{10, 10, 12, 48, 96}, /* cost of unaligned stores. */
2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
14, /* cost of moving SSE register to integer. */
+ 14, /* cost of moving integer register to SSE. */
10, 10, /* Gather load static, per_elt. */
10, 10, /* Gather store static, per_elt. */
32, /* size of l1 cache. */
{10, 10, 12, 48, 96}, /* cost of unaligned stores. */
2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
14, /* cost of moving SSE register to integer. */
+ 14, /* cost of moving integer register to SSE. */
10, 10, /* Gather load static, per_elt. */
10, 10, /* Gather store static, per_elt. */
32, /* size of l1 cache. */
{32, 32, 32, 64, 128}, /* cost of unaligned stores. */
12, 24, 48, /* cost of moving XMM,YMM,ZMM register */
20, /* cost of moving SSE register to integer. */
+ 20, /* cost of moving integer register to SSE. */
16, 16, /* Gather load static, per_elt. */
16, 16, /* Gather store static, per_elt. */
8, /* size of l1 cache. */
{24, 24, 24, 48, 96}, /* cost of unaligned stores. */
6, 12, 24, /* cost of moving XMM,YMM,ZMM register */
20, /* cost of moving SSE register to integer. */
+ 20, /* cost of moving integer register to SSE. */
12, 12, /* Gather load static, per_elt. */
12, 12, /* Gather store static, per_elt. */
8, /* size of l1 cache. */
{16, 16, 16, 32, 64}, /* cost of unaligned stores. */
2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
8, /* cost of moving SSE register to integer. */
+ 8, /* cost of moving integer register to SSE. */
8, 8, /* Gather load static, per_elt. */
8, 8, /* Gather store static, per_elt. */
32, /* size of l1 cache. */
{16, 16, 16, 32, 64}, /* cost of unaligned stores. */
2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
8, /* cost of moving SSE register to integer. */
+ 8, /* cost of moving integer register to SSE. */
8, 8, /* Gather load static, per_elt. */
8, 8, /* Gather store static, per_elt. */
32, /* size of l1 cache. */
{6, 6, 6, 10, 15}, /* cost of unaligned storess. */
2, 3, 4, /* cost of moving XMM,YMM,ZMM register */
6, /* cost of moving SSE register to integer. */
+ 6, /* cost of moving integer register to SSE. */
18, 6, /* Gather load static, per_elt. */
18, 6, /* Gather store static, per_elt. */
32, /* size of l1 cache. */
{10, 10, 10, 10, 10}, /* cost of unaligned loads. */
2, 2, 2, /* cost of moving XMM,YMM,ZMM register */
4, /* cost of moving SSE register to integer. */
+ 4, /* cost of moving integer register to SSE. */
6, 6, /* Gather load static, per_elt. */
6, 6, /* Gather store static, per_elt. */
32, /* size of l1 cache. */
{6, 6, 6}, /* cost of loading integer registers
in QImode, HImode and SImode.
Relative to reg-reg move (2). */
- {6, 6, 6}, /* cost of storing integer registers. */
+ {6, 6, 6}, /* cost of storing integer registers. */
{6, 6, 6, 10, 15}, /* cost of loading SSE register
- in 32bit, 64bit, 128bit, 256bit and 512bit. */
+ in 32bit, 64bit, 128bit, 256bit and 512bit. */
{6, 6, 6, 10, 15}, /* cost of storing SSE register
- in 32bit, 64bit, 128bit, 256bit and 512bit. */
+ in 32bit, 64bit, 128bit, 256bit and 512bit. */
{6, 6, 6, 10, 15}, /* cost of unaligned loads. */
{6, 6, 6, 10, 15}, /* cost of unaligned storess. */
- 2, 3, 4, /* cost of moving XMM,YMM,ZMM register. */
- 6, /* cost of moving SSE register to integer. */
+ 2, 3, 4, /* cost of moving XMM,YMM,ZMM register. */
+ 6, /* cost of moving SSE register to integer. */
+ 6, /* cost of moving integer register to SSE. */
18, 6, /* Gather load static, per_elt. */
18, 6, /* Gather store static, per_elt. */
32, /* size of l1 cache. */
{8, 8, 8, 12, 15}, /* cost of unaligned storess. */
2, 3, 4, /* cost of moving XMM,YMM,ZMM register. */
8, /* cost of moving SSE register to integer. */
+ 8, /* cost of moving integer register to SSE. */
18, 6, /* Gather load static, per_elt. */
18, 6, /* Gather store static, per_elt. */
32, /* size of l1 cache. */
{8, 8, 8, 12, 15}, /* cost of unaligned storess. */
2, 3, 4, /* cost of moving XMM,YMM,ZMM register. */
8, /* cost of moving SSE register to integer. */
+ 8, /* cost of moving integer register to SSE. */
18, 6, /* Gather load static, per_elt. */
18, 6, /* Gather store static, per_elt. */
32, /* size of l1 cache. */
{6, 6, 6, 10, 15}, /* cost of unaligned storess. */
2, 3, 4, /* cost of moving XMM,YMM,ZMM register */
6, /* cost of moving SSE register to integer. */
+ 6, /* cost of moving integer register to SSE. */
18, 6, /* Gather load static, per_elt. */
18, 6, /* Gather store static, per_elt. */
32, /* size of l1 cache. */
{6, 6, 6, 6, 12}, /* cost of unaligned stores. */
2, 2, 4, /* cost of moving XMM,YMM,ZMM register */
2, /* cost of moving SSE register to integer. */
+ 2, /* cost of moving integer register to SSE. */
/* VGATHERDPD is 7 uops, rec throughput 5, while VGATHERDPD is 9 uops,
rec. throughput 6.
So 5 uops statically and one uops per load. */