1 /* Scheduler hooks for IA-32 which implement CPU specific logic.
2 Copyright (C) 1988-2019 Free Software Foundation, Inc.
4 This file is part of GCC.
6 GCC is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3, or (at your option)
9 any later version.
11 GCC is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with GCC; see the file COPYING3. If not see
18 <http://www.gnu.org/licenses/>. */
20 #define IN_TARGET_CODE 1
24 #include "coretypes.h"
30 #include "insn-config.h"
31 #include "insn-attr.h"
35 /* Return the maximum number of instructions a cpu can issue. */
38 ix86_issue_rate (void)
42 case PROCESSOR_PENTIUM
:
43 case PROCESSOR_LAKEMONT
:
44 case PROCESSOR_BONNELL
:
45 case PROCESSOR_SILVERMONT
:
50 case PROCESSOR_BTVER2
:
51 case PROCESSOR_PENTIUM4
:
52 case PROCESSOR_NOCONA
:
55 case PROCESSOR_PENTIUMPRO
:
56 case PROCESSOR_ATHLON
:
58 case PROCESSOR_AMDFAM10
:
59 case PROCESSOR_BTVER1
:
62 case PROCESSOR_BDVER1
:
63 case PROCESSOR_BDVER2
:
64 case PROCESSOR_BDVER3
:
65 case PROCESSOR_BDVER4
:
66 case PROCESSOR_ZNVER1
:
67 case PROCESSOR_ZNVER2
:
69 case PROCESSOR_NEHALEM
:
70 case PROCESSOR_SANDYBRIDGE
:
71 case PROCESSOR_HASWELL
:
72 case PROCESSOR_GENERIC
:
80 /* Return true iff USE_INSN has a memory address with operands set by SET_INSN. */
84 ix86_agi_dependent (rtx_insn
*set_insn
, rtx_insn
*use_insn
)
87 extract_insn_cached (use_insn
);
88 for (i
= recog_data
.n_operands
- 1; i
>= 0; --i
)
89 if (MEM_P (recog_data
.operand
[i
]))
91 rtx addr
= XEXP (recog_data
.operand
[i
], 0);
92 if (modified_in_p (addr
, set_insn
) != 0)
94 /* No AGI stall if SET_INSN is a push or pop and USE_INSN
95 has SP based memory (unless index reg is modified in a pop). */
96 rtx set
= single_set (set_insn
);
98 && (push_operand (SET_DEST (set
), GET_MODE (SET_DEST (set
)))
99 || pop_operand (SET_SRC (set
), GET_MODE (SET_SRC (set
)))))
101 struct ix86_address parts
;
102 if (ix86_decompose_address (addr
, &parts
)
103 && parts
.base
== stack_pointer_rtx
104 && (parts
.index
== NULL_RTX
105 || MEM_P (SET_DEST (set
))
106 || !modified_in_p (parts
.index
, set_insn
)))
116 /* A subroutine of ix86_adjust_cost -- return TRUE iff INSN reads flags set
117 by DEP_INSN and nothing set by DEP_INSN. */
120 ix86_flags_dependent (rtx_insn
*insn
, rtx_insn
*dep_insn
, enum attr_type insn_type
)
124 /* Simplify the test for uninteresting insns. */
125 if (insn_type
!= TYPE_SETCC
126 && insn_type
!= TYPE_ICMOV
127 && insn_type
!= TYPE_FCMOV
128 && insn_type
!= TYPE_IBR
)
131 if ((set
= single_set (dep_insn
)) != 0)
133 set
= SET_DEST (set
);
136 else if (GET_CODE (PATTERN (dep_insn
)) == PARALLEL
137 && XVECLEN (PATTERN (dep_insn
), 0) == 2
138 && GET_CODE (XVECEXP (PATTERN (dep_insn
), 0, 0)) == SET
139 && GET_CODE (XVECEXP (PATTERN (dep_insn
), 0, 1)) == SET
)
141 set
= SET_DEST (XVECEXP (PATTERN (dep_insn
), 0, 0));
142 set2
= SET_DEST (XVECEXP (PATTERN (dep_insn
), 0, 0));
147 if (!REG_P (set
) || REGNO (set
) != FLAGS_REG
)
150 /* This test is true if the dependent insn reads the flags but
151 not any other potentially set register. */
152 if (!reg_overlap_mentioned_p (set
, PATTERN (insn
)))
155 if (set2
&& reg_overlap_mentioned_p (set2
, PATTERN (insn
)))
161 /* Helper function for exact_store_load_dependency.
162 Return true if addr is found in insn. */
164 exact_dependency_1 (rtx addr
, rtx insn
)
167 const char *format_ptr
;
170 code
= GET_CODE (insn
);
174 if (rtx_equal_p (addr
, insn
))
189 format_ptr
= GET_RTX_FORMAT (code
);
190 for (i
= 0; i
< GET_RTX_LENGTH (code
); i
++)
192 switch (*format_ptr
++)
195 if (exact_dependency_1 (addr
, XEXP (insn
, i
)))
199 for (j
= 0; j
< XVECLEN (insn
, i
); j
++)
200 if (exact_dependency_1 (addr
, XVECEXP (insn
, i
, j
)))
208 /* Return true if there exists exact dependency for store & load, i.e.
209 the same memory address is used in them. */
211 exact_store_load_dependency (rtx_insn
*store
, rtx_insn
*load
)
215 set1
= single_set (store
);
218 if (!MEM_P (SET_DEST (set1
)))
220 set2
= single_set (load
);
223 if (exact_dependency_1 (SET_DEST (set1
), SET_SRC (set2
)))
229 /* This function corrects the value of COST (latency) based on the relationship
230 between INSN and DEP_INSN through a dependence of type DEP_TYPE, and strength
231 DW. It should return the new value.
233 On x86 CPUs this is most commonly used to model the fact that values of
234 registers used to compute the address of a memory operand need to be ready
235 earlier than values of registers used in the actual operation. */
238 ix86_adjust_cost (rtx_insn
*insn
, int dep_type
, rtx_insn
*dep_insn
, int cost
,
241 enum attr_type insn_type
, dep_insn_type
;
242 enum attr_memory memory
;
244 int dep_insn_code_number
;
246 /* Anti and output dependencies have zero cost on all CPUs. */
250 dep_insn_code_number
= recog_memoized (dep_insn
);
252 /* If we can't recognize the insns, we can't really do anything. */
253 if (dep_insn_code_number
< 0 || recog_memoized (insn
) < 0)
256 insn_type
= get_attr_type (insn
);
257 dep_insn_type
= get_attr_type (dep_insn
);
261 case PROCESSOR_PENTIUM
:
262 case PROCESSOR_LAKEMONT
:
263 /* Address Generation Interlock adds a cycle of latency. */
264 if (insn_type
== TYPE_LEA
)
266 rtx addr
= PATTERN (insn
);
268 if (GET_CODE (addr
) == PARALLEL
)
269 addr
= XVECEXP (addr
, 0, 0);
271 gcc_assert (GET_CODE (addr
) == SET
);
273 addr
= SET_SRC (addr
);
274 if (modified_in_p (addr
, dep_insn
))
277 else if (ix86_agi_dependent (dep_insn
, insn
))
280 /* ??? Compares pair with jump/setcc. */
281 if (ix86_flags_dependent (insn
, dep_insn
, insn_type
))
284 /* Floating point stores require value to be ready one cycle earlier. */
285 if (insn_type
== TYPE_FMOV
286 && get_attr_memory (insn
) == MEMORY_STORE
287 && !ix86_agi_dependent (dep_insn
, insn
))
291 case PROCESSOR_PENTIUMPRO
:
292 /* INT->FP conversion is expensive. */
293 if (get_attr_fp_int_src (dep_insn
))
296 /* There is one cycle extra latency between an FP op and a store. */
297 if (insn_type
== TYPE_FMOV
298 && (set
= single_set (dep_insn
)) != NULL_RTX
299 && (set2
= single_set (insn
)) != NULL_RTX
300 && rtx_equal_p (SET_DEST (set
), SET_SRC (set2
))
301 && MEM_P (SET_DEST (set2
)))
304 memory
= get_attr_memory (insn
);
306 /* Show ability of reorder buffer to hide latency of load by executing
307 in parallel with previous instruction in case
308 previous instruction is not needed to compute the address. */
309 if ((memory
== MEMORY_LOAD
|| memory
== MEMORY_BOTH
)
310 && !ix86_agi_dependent (dep_insn
, insn
))
312 /* Claim moves to take one cycle, as the core can issue one load
313 at a time and the next load can start a cycle later. */
314 if (dep_insn_type
== TYPE_IMOV
315 || dep_insn_type
== TYPE_FMOV
)
323 /* The esp dependency is resolved before
324 the instruction is really finished. */
325 if ((insn_type
== TYPE_PUSH
|| insn_type
== TYPE_POP
)
326 && (dep_insn_type
== TYPE_PUSH
|| dep_insn_type
== TYPE_POP
))
329 /* INT->FP conversion is expensive. */
330 if (get_attr_fp_int_src (dep_insn
))
333 memory
= get_attr_memory (insn
);
335 /* Show ability of reorder buffer to hide latency of load by executing
336 in parallel with previous instruction in case
337 previous instruction is not needed to compute the address. */
338 if ((memory
== MEMORY_LOAD
|| memory
== MEMORY_BOTH
)
339 && !ix86_agi_dependent (dep_insn
, insn
))
341 /* Claim moves to take one cycle, as the core can issue one load
342 at a time and the next load can start a cycle later. */
343 if (dep_insn_type
== TYPE_IMOV
344 || dep_insn_type
== TYPE_FMOV
)
353 case PROCESSOR_AMDFAM10
:
354 case PROCESSOR_BDVER1
:
355 case PROCESSOR_BDVER2
:
356 case PROCESSOR_BDVER3
:
357 case PROCESSOR_BDVER4
:
358 case PROCESSOR_BTVER1
:
359 case PROCESSOR_BTVER2
:
360 /* The stack engine allows push and pop instructions to execute in parallel. */
361 if ((insn_type
== TYPE_PUSH
|| insn_type
== TYPE_POP
)
362 && (dep_insn_type
== TYPE_PUSH
|| dep_insn_type
== TYPE_POP
))
366 case PROCESSOR_ATHLON
:
368 memory
= get_attr_memory (insn
);
370 /* Show ability of reorder buffer to hide latency of load by executing
371 in parallel with previous instruction in case
372 previous instruction is not needed to compute the address. */
373 if ((memory
== MEMORY_LOAD
|| memory
== MEMORY_BOTH
)
374 && !ix86_agi_dependent (dep_insn
, insn
))
376 enum attr_unit unit
= get_attr_unit (insn
);
379 /* Because of the difference between the length of integer and
380 floating unit pipeline preparation stages, the memory operands
381 for floating point are cheaper.
383 ??? For Athlon the difference is most probably 2. */
384 if (unit
== UNIT_INTEGER
|| unit
== UNIT_UNKNOWN
)
387 loadcost
= TARGET_ATHLON
? 2 : 0;
389 if (cost
>= loadcost
)
396 case PROCESSOR_ZNVER1
:
397 case PROCESSOR_ZNVER2
:
398 /* The stack engine allows push and pop instructions to execute in parallel. */
399 if ((insn_type
== TYPE_PUSH
|| insn_type
== TYPE_POP
)
400 && (dep_insn_type
== TYPE_PUSH
|| dep_insn_type
== TYPE_POP
))
403 memory
= get_attr_memory (insn
);
405 /* Show ability of reorder buffer to hide latency of load by executing
406 in parallel with previous instruction in case
407 previous instruction is not needed to compute the address. */
408 if ((memory
== MEMORY_LOAD
|| memory
== MEMORY_BOTH
)
409 && !ix86_agi_dependent (dep_insn
, insn
))
411 enum attr_unit unit
= get_attr_unit (insn
);
414 if (unit
== UNIT_INTEGER
|| unit
== UNIT_UNKNOWN
)
419 if (cost
>= loadcost
)
426 case PROCESSOR_CORE2
:
427 case PROCESSOR_NEHALEM
:
428 case PROCESSOR_SANDYBRIDGE
:
429 case PROCESSOR_HASWELL
:
430 case PROCESSOR_GENERIC
:
431 /* The stack engine allows push and pop instructions to execute in parallel. */
432 if ((insn_type
== TYPE_PUSH
|| insn_type
== TYPE_POP
)
433 && (dep_insn_type
== TYPE_PUSH
|| dep_insn_type
== TYPE_POP
))
436 memory
= get_attr_memory (insn
);
438 /* Show ability of reorder buffer to hide latency of load by executing
439 in parallel with previous instruction in case
440 previous instruction is not needed to compute the address. */
441 if ((memory
== MEMORY_LOAD
|| memory
== MEMORY_BOTH
)
442 && !ix86_agi_dependent (dep_insn
, insn
))
451 case PROCESSOR_SILVERMONT
:
454 case PROCESSOR_INTEL
:
455 if (!reload_completed
)
458 /* Increase cost of integer loads. */
459 memory
= get_attr_memory (dep_insn
);
460 if (memory
== MEMORY_LOAD
|| memory
== MEMORY_BOTH
)
462 enum attr_unit unit
= get_attr_unit (dep_insn
);
463 if (unit
== UNIT_INTEGER
&& cost
== 1)
465 if (memory
== MEMORY_LOAD
)
469 /* Increase cost of ld/st for short int types only
470 because of store forwarding issue. */
471 rtx set
= single_set (dep_insn
);
472 if (set
&& (GET_MODE (SET_DEST (set
)) == QImode
473 || GET_MODE (SET_DEST (set
)) == HImode
))
475 /* Increase cost of store/load insn if exact
476 dependence exists and it is load insn. */
477 enum attr_memory insn_memory
= get_attr_memory (insn
);
478 if (insn_memory
== MEMORY_LOAD
479 && exact_store_load_dependency (dep_insn
, insn
))
493 /* How many alternative schedules to try. This should be as wide as the
494 scheduling freedom in the DFA, but no wider. Making this value too
495 large results in extra work for the scheduler. */
498 ia32_multipass_dfa_lookahead (void)
500 /* Generally, we want haifa-sched:max_issue() to look ahead as many
501 instructions as can be executed in a cycle, i.e., the issue rate. */
503 if (reload_completed
)
504 return ix86_issue_rate ();
505 /* Don't use lookahead for pre-reload schedule to save compile time. */
509 /* Return true if target platform supports macro-fusion. */
512 ix86_macro_fusion_p ()
514 return TARGET_FUSE_CMP_AND_BRANCH
;
517 /* Check whether the current microarchitecture supports macro fusion
518 for the insn pair "CONDGEN + CONDJMP". Refer to
519 "Intel Architectures Optimization Reference Manual". */
522 ix86_macro_fusion_pair_p (rtx_insn
*condgen
, rtx_insn
*condjmp
)
526 rtx compare_set
= NULL_RTX
, test_if
, cond
;
527 rtx alu_set
= NULL_RTX
, addr
= NULL_RTX
;
529 if (!any_condjump_p (condjmp
))
532 unsigned int condreg1
, condreg2
;
534 targetm
.fixed_condition_code_regs (&condreg1
, &condreg2
);
535 cc_reg_1
= gen_rtx_REG (CCmode
, condreg1
);
536 if (!reg_referenced_p (cc_reg_1
, PATTERN (condjmp
))
538 || !modified_in_p (cc_reg_1
, condgen
))
541 if (get_attr_type (condgen
) != TYPE_TEST
542 && get_attr_type (condgen
) != TYPE_ICMP
543 && get_attr_type (condgen
) != TYPE_INCDEC
544 && get_attr_type (condgen
) != TYPE_ALU
)
547 compare_set
= single_set (condgen
);
548 if (compare_set
== NULL_RTX
549 && !TARGET_FUSE_ALU_AND_BRANCH
)
552 if (compare_set
== NULL_RTX
)
555 rtx pat
= PATTERN (condgen
);
556 for (i
= 0; i
< XVECLEN (pat
, 0); i
++)
557 if (GET_CODE (XVECEXP (pat
, 0, i
)) == SET
)
559 rtx set_src
= SET_SRC (XVECEXP (pat
, 0, i
));
560 if (GET_CODE (set_src
) == COMPARE
)
561 compare_set
= XVECEXP (pat
, 0, i
);
563 alu_set
= XVECEXP (pat
, 0, i
);
566 if (compare_set
== NULL_RTX
)
568 src
= SET_SRC (compare_set
);
569 if (GET_CODE (src
) != COMPARE
)
572 /* Macro-fusion for cmp/test MEM-IMM + conditional jmp is not supported. */
574 if ((MEM_P (XEXP (src
, 0))
575 && CONST_INT_P (XEXP (src
, 1)))
576 || (MEM_P (XEXP (src
, 1))
577 && CONST_INT_P (XEXP (src
, 0))))
580 /* No fusion for RIP-relative address. */
581 if (MEM_P (XEXP (src
, 0)))
582 addr
= XEXP (XEXP (src
, 0), 0);
583 else if (MEM_P (XEXP (src
, 1)))
584 addr
= XEXP (XEXP (src
, 1), 0);
588 int ok
= ix86_decompose_address (addr
, &parts
);
591 if (ix86_rip_relative_addr_p (&parts
))
595 test_if
= SET_SRC (pc_set (condjmp
));
596 cond
= XEXP (test_if
, 0);
597 ccode
= GET_CODE (cond
);
598 /* Check whether the conditional jump uses the Sign or Overflow flags. */
599 if (!TARGET_FUSE_CMP_AND_BRANCH_SOFLAGS
606 /* Return true for TYPE_TEST and TYPE_ICMP. */
607 if (get_attr_type (condgen
) == TYPE_TEST
608 || get_attr_type (condgen
) == TYPE_ICMP
)
611 /* The following is the case that macro-fusion for alu + jmp. */
612 if (!TARGET_FUSE_ALU_AND_BRANCH
|| !alu_set
)
615 /* No fusion for alu op with memory destination operand. */
616 dest
= SET_DEST (alu_set
);
620 /* Macro-fusion for inc/dec + unsigned conditional jump is not supported. */
622 if (get_attr_type (condgen
) == TYPE_INCDEC