/* Scheduler hooks for IA-32 which implement CPU specific logic.
   Copyright (C) 1988-2021 Free Software Foundation, Inc.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3, or (at your option)
any later version.

GCC is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */

#define IN_TARGET_CODE 1

#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "rtl.h"
#include "tree.h"
#include "cfghooks.h"
#include "tm_p.h"
#include "target.h"
#include "insn-config.h"
#include "insn-attr.h"
#include "insn-opinit.h"
#include "recog.h"

/* Return the maximum number of instructions a cpu can issue.  */
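/* The scheduler treats this as the number of insns that may begin executing
   in one cycle; ia32_multipass_dfa_lookahead below also reuses it as the
   lookahead width after reload.  */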

int
ix86_issue_rate (void)
{
  switch (ix86_tune)
    {
    case PROCESSOR_PENTIUM:
    case PROCESSOR_LAKEMONT:
    case PROCESSOR_BONNELL:
    case PROCESSOR_SILVERMONT:
    case PROCESSOR_KNL:
    case PROCESSOR_KNM:
    case PROCESSOR_INTEL:
    case PROCESSOR_K6:
    case PROCESSOR_BTVER2:
    case PROCESSOR_PENTIUM4:
    case PROCESSOR_NOCONA:
      return 2;

    case PROCESSOR_PENTIUMPRO:
    case PROCESSOR_ATHLON:
    case PROCESSOR_K8:
    case PROCESSOR_AMDFAM10:
    case PROCESSOR_BTVER1:
      return 3;

    case PROCESSOR_BDVER1:
    case PROCESSOR_BDVER2:
    case PROCESSOR_BDVER3:
    case PROCESSOR_BDVER4:
    case PROCESSOR_ZNVER1:
    case PROCESSOR_ZNVER2:
    case PROCESSOR_ZNVER3:
    case PROCESSOR_CORE2:
    case PROCESSOR_NEHALEM:
    case PROCESSOR_SANDYBRIDGE:
    case PROCESSOR_HASWELL:
    case PROCESSOR_GENERIC:
      return 4;

    default:
      return 1;
    }
}

/* Return true iff USE_INSN has a memory address with operands set by
   SET_INSN.  */
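/* As a rough illustration, on the original Pentium a sequence such as

	mov  %esi, %ebx
	mov  (%ebx), %eax

   stalls because the load address is produced by the immediately preceding
   instruction (an Address Generation Interlock).  */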

bool
ix86_agi_dependent (rtx_insn *set_insn, rtx_insn *use_insn)
{
  int i;
  extract_insn_cached (use_insn);
  for (i = recog_data.n_operands - 1; i >= 0; --i)
    if (MEM_P (recog_data.operand[i]))
      {
        rtx addr = XEXP (recog_data.operand[i], 0);
        if (modified_in_p (addr, set_insn) != 0)
          {
            /* No AGI stall if SET_INSN is a push or pop and USE_INSN
               has SP-based memory (unless the index reg is modified
               in a pop).  */
            rtx set = single_set (set_insn);
            if (set
                && (push_operand (SET_DEST (set), GET_MODE (SET_DEST (set)))
                    || pop_operand (SET_SRC (set), GET_MODE (SET_SRC (set)))))
              {
                struct ix86_address parts;
                if (ix86_decompose_address (addr, &parts)
                    && parts.base == stack_pointer_rtx
                    && (parts.index == NULL_RTX
                        || MEM_P (SET_DEST (set))
                        || !modified_in_p (parts.index, set_insn)))
                  return false;
              }
            return true;
          }
        return false;
      }
  return false;
}

/* A subroutine of ix86_adjust_cost -- return TRUE iff INSN reads flags set
   by DEP_INSN and nothing else set by DEP_INSN.  */

static bool
ix86_flags_dependent (rtx_insn *insn, rtx_insn *dep_insn, enum attr_type insn_type)
{
  rtx set, set2;

  /* Simplify the test for uninteresting insns.  */
  if (insn_type != TYPE_SETCC
      && insn_type != TYPE_ICMOV
      && insn_type != TYPE_FCMOV
      && insn_type != TYPE_IBR)
    return false;

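  /* DEP_INSN may set the flags either as a lone SET (a compare or test) or
     as one half of a two-SET PARALLEL, e.g. an arithmetic insn that produces
     both a result register and the flags.  Collect the destination(s) so we
     can check below that INSN reads only the flags.  */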
  if ((set = single_set (dep_insn)) != 0)
    {
      set = SET_DEST (set);
      set2 = NULL_RTX;
    }
  else if (GET_CODE (PATTERN (dep_insn)) == PARALLEL
           && XVECLEN (PATTERN (dep_insn), 0) == 2
           && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 0)) == SET
           && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 1)) == SET)
    {
      set = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 0));
      set2 = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 1));
    }
  else
    return false;

  if (!REG_P (set) || REGNO (set) != FLAGS_REG)
    return false;

  /* This test is true if the dependent insn reads the flags but
     not any other potentially set register.  */
  if (!reg_overlap_mentioned_p (set, PATTERN (insn)))
    return false;

  if (set2 && reg_overlap_mentioned_p (set2, PATTERN (insn)))
    return false;

  return true;
}

/* Helper function for exact_store_load_dependency.
   Return true if ADDR is found in INSN.  */
static bool
exact_dependency_1 (rtx addr, rtx insn)
{
  enum rtx_code code;
  const char *format_ptr;
  int i, j;

  code = GET_CODE (insn);
  switch (code)
    {
    case MEM:
      if (rtx_equal_p (addr, insn))
        return true;
      break;
    case REG:
    CASE_CONST_ANY:
    case SYMBOL_REF:
    case CODE_LABEL:
    case PC:
    case EXPR_LIST:
      return false;
    default:
      break;
    }

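  /* Recurse over the sub-expressions of INSN, guided by its RTX format
     string: 'e' marks a single sub-expression, 'E' a vector of them.  */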
  format_ptr = GET_RTX_FORMAT (code);
  for (i = 0; i < GET_RTX_LENGTH (code); i++)
    {
      switch (*format_ptr++)
        {
        case 'e':
          if (exact_dependency_1 (addr, XEXP (insn, i)))
            return true;
          break;
        case 'E':
          for (j = 0; j < XVECLEN (insn, i); j++)
            if (exact_dependency_1 (addr, XVECEXP (insn, i, j)))
              return true;
          break;
        }
    }
  return false;
}

/* Return true if there is an exact dependency between STORE and LOAD, i.e.
   the same memory address is used in both.  */
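/* Informally: a store to 4(%esp) followed by a load whose source contains
   the very same 4(%esp) address expression is such an exact dependency.  */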
static bool
exact_store_load_dependency (rtx_insn *store, rtx_insn *load)
{
  rtx set1, set2;

  set1 = single_set (store);
  if (!set1)
    return false;
  if (!MEM_P (SET_DEST (set1)))
    return false;
  set2 = single_set (load);
  if (!set2)
    return false;
  if (exact_dependency_1 (SET_DEST (set1), SET_SRC (set2)))
    return true;
  return false;
}

/* This function corrects the value of COST (latency) based on the relationship
   between INSN and DEP_INSN through a dependence of type DEP_TYPE, and strength
   DW.  It should return the new value.

   On x86 CPUs this is most commonly used to model the fact that values of
   registers used to compute the address of a memory operand need to be ready
   earlier than values of registers used in the actual operation.  */
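/* For example, when DEP_INSN produces a register that INSN uses only inside
   a memory address, several of the cases below subtract the load latency the
   out-of-order core can hide; anti- and output dependencies are simply
   reported as free.  */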

int
ix86_adjust_cost (rtx_insn *insn, int dep_type, rtx_insn *dep_insn, int cost,
                  unsigned int)
{
  enum attr_type insn_type, dep_insn_type;
  enum attr_memory memory;
  rtx set, set2;
  int dep_insn_code_number;

  /* Anti and output dependencies have zero cost on all CPUs.  */
  if (dep_type != 0)
    return 0;

  dep_insn_code_number = recog_memoized (dep_insn);

  /* If we can't recognize the insns, we can't really do anything.  */
  if (dep_insn_code_number < 0 || recog_memoized (insn) < 0)
    return cost;

  insn_type = get_attr_type (insn);
  dep_insn_type = get_attr_type (dep_insn);

  switch (ix86_tune)
    {
    case PROCESSOR_PENTIUM:
    case PROCESSOR_LAKEMONT:
      /* Address Generation Interlock adds a cycle of latency.  */
      if (insn_type == TYPE_LEA)
        {
          rtx addr = PATTERN (insn);

          if (GET_CODE (addr) == PARALLEL)
            addr = XVECEXP (addr, 0, 0);

          gcc_assert (GET_CODE (addr) == SET);

          addr = SET_SRC (addr);
          if (modified_in_p (addr, dep_insn))
            cost += 1;
        }
      else if (ix86_agi_dependent (dep_insn, insn))
        cost += 1;

      /* ??? Compares pair with jump/setcc.  */
      if (ix86_flags_dependent (insn, dep_insn, insn_type))
        cost = 0;

      /* Floating point stores require the value to be ready one cycle
         earlier.  */
      if (insn_type == TYPE_FMOV
          && get_attr_memory (insn) == MEMORY_STORE
          && !ix86_agi_dependent (dep_insn, insn))
        cost += 1;
      break;

    case PROCESSOR_PENTIUMPRO:
      /* INT->FP conversion is expensive.  */
      if (get_attr_fp_int_src (dep_insn))
        cost += 5;

      /* There is one cycle extra latency between an FP op and a store.  */
      if (insn_type == TYPE_FMOV
          && (set = single_set (dep_insn)) != NULL_RTX
          && (set2 = single_set (insn)) != NULL_RTX
          && rtx_equal_p (SET_DEST (set), SET_SRC (set2))
          && MEM_P (SET_DEST (set2)))
        cost += 1;

      memory = get_attr_memory (insn);

      /* Show the ability of the reorder buffer to hide the latency of a load
         by executing it in parallel with the previous instruction when the
         previous instruction is not needed to compute the address.  */
      if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
          && !ix86_agi_dependent (dep_insn, insn))
        {
          /* Claim moves to take one cycle, as the core can issue one load
             at a time and the next load can start a cycle later.  */
          if (dep_insn_type == TYPE_IMOV
              || dep_insn_type == TYPE_FMOV)
            cost = 1;
          else if (cost > 1)
            cost--;
        }
      break;

    case PROCESSOR_K6:
      /* The esp dependency is resolved before
         the instruction is really finished.  */
      if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
          && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
        return 1;

      /* INT->FP conversion is expensive.  */
      if (get_attr_fp_int_src (dep_insn))
        cost += 5;

      memory = get_attr_memory (insn);

      /* Show the ability of the reorder buffer to hide the latency of a load
         by executing it in parallel with the previous instruction when the
         previous instruction is not needed to compute the address.  */
      if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
          && !ix86_agi_dependent (dep_insn, insn))
        {
          /* Claim moves to take one cycle, as the core can issue one load
             at a time and the next load can start a cycle later.  */
          if (dep_insn_type == TYPE_IMOV
              || dep_insn_type == TYPE_FMOV)
            cost = 1;
          else if (cost > 2)
            cost -= 2;
          else
            cost = 1;
        }
      break;

    case PROCESSOR_AMDFAM10:
    case PROCESSOR_BDVER1:
    case PROCESSOR_BDVER2:
    case PROCESSOR_BDVER3:
    case PROCESSOR_BDVER4:
    case PROCESSOR_BTVER1:
    case PROCESSOR_BTVER2:
      /* The stack engine allows push and pop instructions to execute
         in parallel.  */
      if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
          && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
        return 0;
      /* FALLTHRU */

    case PROCESSOR_ATHLON:
    case PROCESSOR_K8:
      memory = get_attr_memory (insn);

      /* Show the ability of the reorder buffer to hide the latency of a load
         by executing it in parallel with the previous instruction when the
         previous instruction is not needed to compute the address.  */
      if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
          && !ix86_agi_dependent (dep_insn, insn))
        {
          enum attr_unit unit = get_attr_unit (insn);
          int loadcost = 3;

          /* Because of the difference between the length of integer and
             floating unit pipeline preparation stages, the memory operands
             for floating point are cheaper.

             ??? For Athlon the difference is most probably 2.  */
          if (unit == UNIT_INTEGER || unit == UNIT_UNKNOWN)
            loadcost = 3;
          else
            loadcost = TARGET_CPU_P (ATHLON) ? 2 : 0;

          if (cost >= loadcost)
            cost -= loadcost;
          else
            cost = 0;
        }
      break;

    case PROCESSOR_ZNVER1:
    case PROCESSOR_ZNVER2:
    case PROCESSOR_ZNVER3:
      /* The stack engine allows push and pop instructions to execute
         in parallel.  */
      if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
          && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
        return 0;

      memory = get_attr_memory (insn);

      /* Show the ability of the reorder buffer to hide the latency of a load
         by executing it in parallel with the previous instruction when the
         previous instruction is not needed to compute the address.  */
      if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
          && !ix86_agi_dependent (dep_insn, insn))
        {
          enum attr_unit unit = get_attr_unit (insn);
          int loadcost;

          if (unit == UNIT_INTEGER || unit == UNIT_UNKNOWN)
            loadcost = 4;
          else
            loadcost = 7;

          if (cost >= loadcost)
            cost -= loadcost;
          else
            cost = 0;
        }
      break;

    case PROCESSOR_CORE2:
    case PROCESSOR_NEHALEM:
    case PROCESSOR_SANDYBRIDGE:
    case PROCESSOR_HASWELL:
    case PROCESSOR_GENERIC:
      /* The stack engine allows push and pop instructions to execute
         in parallel.  */
      if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
          && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
        return 0;

      memory = get_attr_memory (insn);

      /* Show the ability of the reorder buffer to hide the latency of a load
         by executing it in parallel with the previous instruction when the
         previous instruction is not needed to compute the address.  */
      if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
          && !ix86_agi_dependent (dep_insn, insn))
        {
          if (cost >= 4)
            cost -= 4;
          else
            cost = 0;
        }
      break;

    case PROCESSOR_SILVERMONT:
    case PROCESSOR_KNL:
    case PROCESSOR_KNM:
    case PROCESSOR_INTEL:
      if (!reload_completed)
        return cost;

      /* Increase the cost of integer loads.  */
      memory = get_attr_memory (dep_insn);
      if (memory == MEMORY_LOAD || memory == MEMORY_BOTH)
        {
          enum attr_unit unit = get_attr_unit (dep_insn);
          if (unit == UNIT_INTEGER && cost == 1)
            {
              if (memory == MEMORY_LOAD)
                cost = 3;
              else
                {
                  /* Increase the cost of ld/st for short int types only
                     because of a store forwarding issue.  */
                  rtx set = single_set (dep_insn);
                  if (set && (GET_MODE (SET_DEST (set)) == QImode
                              || GET_MODE (SET_DEST (set)) == HImode))
                    {
                      /* Increase the cost of the store/load insn if an exact
                         dependence exists and INSN is a load.  */
                      enum attr_memory insn_memory = get_attr_memory (insn);
                      if (insn_memory == MEMORY_LOAD
                          && exact_store_load_dependency (dep_insn, insn))
                        cost = 3;
                    }
                }
            }
        }

    default:
      break;
    }

  return cost;
}

/* How many alternative schedules to try.  This should be as wide as the
   scheduling freedom in the DFA, but no wider.  Making this value too
   large results in extra work for the scheduler.  */

int
ia32_multipass_dfa_lookahead (void)
{
  /* Generally, we want haifa-sched:max_issue() to look ahead as far
     as the number of instructions that can be executed in one cycle,
     i.e., issue_rate.  */
  if (reload_completed)
    return ix86_issue_rate ();
  /* Don't use lookahead for the pre-reload schedule to save compile time.  */
  return 0;
}

/* Return true if target platform supports macro-fusion.  */

bool
ix86_macro_fusion_p ()
{
  return TARGET_FUSE_CMP_AND_BRANCH;
}

/* Check whether the current microarchitecture supports macro fusion
   for the insn pair "CONDGEN + CONDJMP".  Refer to the
   "Intel Architectures Optimization Reference Manual".  */

bool
ix86_macro_fusion_pair_p (rtx_insn *condgen, rtx_insn *condjmp)
{
  rtx src, dest;
  enum rtx_code ccode;
  rtx compare_set = NULL_RTX, test_if, cond;
  rtx alu_set = NULL_RTX, addr = NULL_RTX;
  enum attr_type condgen_type;

  if (!any_condjump_p (condjmp))
    return false;

  unsigned int condreg1, condreg2;
  rtx cc_reg_1;
  targetm.fixed_condition_code_regs (&condreg1, &condreg2);
  cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
  if (!reg_referenced_p (cc_reg_1, PATTERN (condjmp))
      || !condgen
      || !modified_in_p (cc_reg_1, condgen))
    return false;

  condgen_type = get_attr_type (condgen);
  if (condgen_type == TYPE_MULTI
      && INSN_CODE (condgen) == code_for_stack_protect_test_1 (ptr_mode)
      && TARGET_FUSE_ALU_AND_BRANCH)
    {
      /* stack_protect_test_<mode> ends with a sub, which subtracts
         a non-rip special memory operand from a GPR.  */
      src = NULL_RTX;
      alu_set = XVECEXP (PATTERN (condgen), 0, 1);
      goto handle_stack_protect_test;
    }
  else if (condgen_type != TYPE_TEST
           && condgen_type != TYPE_ICMP
           && condgen_type != TYPE_INCDEC
           && condgen_type != TYPE_ALU)
    return false;

  compare_set = single_set (condgen);
  if (compare_set == NULL_RTX && !TARGET_FUSE_ALU_AND_BRANCH)
    return false;

  if (compare_set == NULL_RTX)
    {
      int i;
      rtx pat = PATTERN (condgen);
      for (i = 0; i < XVECLEN (pat, 0); i++)
        if (GET_CODE (XVECEXP (pat, 0, i)) == SET)
          {
            rtx set_src = SET_SRC (XVECEXP (pat, 0, i));
            if (GET_CODE (set_src) == COMPARE)
              compare_set = XVECEXP (pat, 0, i);
            else
              alu_set = XVECEXP (pat, 0, i);
          }
    }
  if (compare_set == NULL_RTX)
    return false;
  src = SET_SRC (compare_set);
  if (GET_CODE (src) != COMPARE)
    return false;

  /* Macro-fusion for cmp/test MEM-IMM + conditional jmp is not
     supported.  */
  if ((MEM_P (XEXP (src, 0)) && CONST_INT_P (XEXP (src, 1)))
      || (MEM_P (XEXP (src, 1)) && CONST_INT_P (XEXP (src, 0))))
    return false;

  /* No fusion for RIP-relative address.  */
  if (MEM_P (XEXP (src, 0)))
    addr = XEXP (XEXP (src, 0), 0);
  else if (MEM_P (XEXP (src, 1)))
    addr = XEXP (XEXP (src, 1), 0);

  if (addr)
    {
      ix86_address parts;
      int ok = ix86_decompose_address (addr, &parts);
      gcc_assert (ok);

      if (ix86_rip_relative_addr_p (&parts))
        return false;
    }

 handle_stack_protect_test:
  test_if = SET_SRC (pc_set (condjmp));
  cond = XEXP (test_if, 0);
  ccode = GET_CODE (cond);
  /* Check whether the conditional jump uses the Sign or Overflow flags.  */
  if (!TARGET_FUSE_CMP_AND_BRANCH_SOFLAGS
      && (ccode == GE || ccode == GT || ccode == LE || ccode == LT))
    return false;

  /* Return true for TYPE_TEST and TYPE_ICMP.  */
  if (condgen_type == TYPE_TEST || condgen_type == TYPE_ICMP)
    return true;

  /* The following handles the case of macro-fusion for an ALU op + jmp.  */
  if (!TARGET_FUSE_ALU_AND_BRANCH || !alu_set)
    return false;

  /* No fusion for an ALU op with a memory destination operand.  */
  dest = SET_DEST (alu_set);
  if (MEM_P (dest))
    return false;

  /* Macro-fusion for inc/dec + unsigned conditional jump is not
     supported.  */
  if (condgen_type == TYPE_INCDEC
      && (ccode == GEU || ccode == GTU || ccode == LEU || ccode == LTU))
    return false;

  return true;
}