case PROCESSOR_ZNVER2:
case PROCESSOR_ZNVER3:
case PROCESSOR_ZNVER4:
- case PROCESSOR_ZNVER5:
case PROCESSOR_CORE2:
case PROCESSOR_NEHALEM:
case PROCESSOR_SANDYBRIDGE:
return 5;
case PROCESSOR_SAPPHIRERAPIDS:
+ /* For znver5 decoder can handle 4 or 8 instructions per cycle,
+ op cache 12 instruction/cycle, dispatch 8 instructions
+ integer rename 8 instructions and Fp 6 instructions.
+
+ The scheduler, without understanding the out-of-order nature of the CPU,
+ is unlikely to be able to fill all of these. */
+ case PROCESSOR_ZNVER5:
return 6;
default:
enum attr_unit unit = get_attr_unit (insn);
int loadcost;
+ /* TODO: On znver5 complex addressing modes have
+ greater latency. */
if (unit == UNIT_INTEGER || unit == UNIT_UNKNOWN)
loadcost = 4;
else
return TARGET_FUSE_CMP_AND_BRANCH;
}
+/* Return true if MOV and ALU form a pair that qualifies for mov+alu
+   fusion: MOV must be a general-register to general-register move of an
+   integral mode 2 to 8 bytes wide, and ALU must be a PARALLEL whose first
+   element is a SET computing one of the supported arithmetic/logic
+   operations with the destination of MOV as exactly one of its
+   operands.  */
+static bool
+ix86_fuse_mov_alu_p (rtx_insn *mov, rtx_insn *alu)
+{
+  /* Validate mov:
+      - It should be reg-reg move with opcode 0x89 or 0x8B.  */
+  rtx set1 = PATTERN (mov);
+  if (GET_CODE (set1) != SET
+      || !GENERAL_REG_P (SET_SRC (set1))
+      || !GENERAL_REG_P (SET_DEST (set1)))
+    return false;
+  rtx reg = SET_DEST (set1);
+  /*  - it should have 0x89 or 0x8B opcode.  */
+  if (!INTEGRAL_MODE_P (GET_MODE (reg))
+      || GET_MODE_SIZE (GET_MODE (reg)) < 2
+      || GET_MODE_SIZE (GET_MODE (reg)) > 8)
+    return false;
+  /* Validate ALU.  */
+  if (GET_CODE (PATTERN (alu)) != PARALLEL)
+    return false;
+  rtx set2 = XVECEXP (PATTERN (alu), 0, 0);
+  if (GET_CODE (set2) != SET)
+    return false;
+  /* Match one of:
+     ADD ADC AND XOR OR SUB SBB INC DEC NOT SAL SHL SHR SAR
+     We also may add insn attribute to handle some of sporadic
+     case we output those with different RTX expressions.  */
+
+  if (GET_CODE (SET_SRC (set2)) != PLUS
+      && GET_CODE (SET_SRC (set2)) != MINUS
+      && GET_CODE (SET_SRC (set2)) != XOR
+      && GET_CODE (SET_SRC (set2)) != AND
+      && GET_CODE (SET_SRC (set2)) != IOR
+      && GET_CODE (SET_SRC (set2)) != NOT
+      && GET_CODE (SET_SRC (set2)) != ASHIFT
+      && GET_CODE (SET_SRC (set2)) != ASHIFTRT
+      && GET_CODE (SET_SRC (set2)) != LSHIFTRT)
+    return false;
+  rtx op0 = XEXP (SET_SRC (set2), 0);
+  /* NOT is unary, so it has no second operand.  */
+  rtx op1 = GET_CODE (SET_SRC (set2)) != NOT ? XEXP (SET_SRC (set2), 1) : NULL;
+  /* One of operands should be register.  Canonicalize so that op0 is the
+     operand expected to be the mov destination.
+     NOTE(review): the swap is also applied to the non-commutative codes
+     (MINUS and the shifts) -- confirm this is intended.  */
+  if (op1 && (!REG_P (op0) || REGNO (op0) != REGNO (reg)))
+    std::swap (op0, op1);
+  /* op0 must be the mov destination.  Test op0 here, not op1: op1 is NULL
+     for NOT and may be an immediate, so REGNO must not be applied to it.  */
+  if (!REG_P (op0) || REGNO (op0) != REGNO (reg))
+    return false;
+  if (op1
+      && !REG_P (op1)
+      && !x86_64_immediate_operand (op1, VOIDmode))
+    return false;
+  /* Only one of two parameters must be move destination.  */
+  if (op1 && REG_P (op1) && REGNO (op1) == REGNO (reg))
+    return false;
+  return true;
+}
+
+
/* Check whether current microarchitecture support macro fusion
for insn pair "CONDGEN + CONDJMP". Refer to
"Intel Architectures Optimization Reference Manual". */
bool
ix86_macro_fusion_pair_p (rtx_insn *condgen, rtx_insn *condjmp)
{
+ if (TARGET_FUSE_MOV_AND_ALU
+ && ix86_fuse_mov_alu_p (condgen, condjmp))
+ return true;
rtx src, dest;
enum rtx_code ccode;
rtx compare_set = NULL_RTX, test_if, cond;
/* X86_TUNE_FUSE_ALU_AND_BRANCH: Fuse alu with a subsequent conditional
jump instruction when the alu instruction produces the CCFLAG consumed by
- the conditional jump instruction. */
+ the conditional jump instruction.
+
+ TODO: znver5 supports fusing with SUB, ADD, INC, DEC, OR, AND.
+ There are also limitations on the immediates and displacements supported. */
DEF_TUNE (X86_TUNE_FUSE_ALU_AND_BRANCH, "fuse_alu_and_branch",
- m_SANDYBRIDGE | m_CORE_AVX2 | m_ZHAOXIN | m_GENERIC)
+ m_SANDYBRIDGE | m_CORE_AVX2 | m_ZHAOXIN | m_GENERIC | m_ZNVER5)
+/* X86_TUNE_FUSE_MOV_AND_ALU: Fuse mov and alu when the mov is a reg-reg mov
+ and its destination is used by the alu. The alu must be one of
+ ADD, ADC, AND, XOR, OR, SUB, SBB, INC, DEC, NOT, SAL, SHL, SHR, SAR. */
+DEF_TUNE (X86_TUNE_FUSE_MOV_AND_ALU, "fuse_mov_and_alu", m_ZNVER5)
/*****************************************************************************/
/* Function prologue, epilogue and function calling sequences. */