/* Scheduler hooks for IA-32 which implement CPU specific logic.
   Copyright (C) 1988-2019 Free Software Foundation, Inc.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3, or (at your option)
any later version.

GCC is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */

#define IN_TARGET_CODE 1

#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "rtl.h"
#include "tree.h"
#include "cfghooks.h"
#include "tm_p.h"
#include "insn-config.h"
#include "insn-attr.h"
#include "recog.h"
#include "target.h"

/* Return the maximum number of instructions a cpu can issue.  */

int
ix86_issue_rate (void)
{
  switch (ix86_tune)
    {
    case PROCESSOR_PENTIUM:
    case PROCESSOR_LAKEMONT:
    case PROCESSOR_BONNELL:
    case PROCESSOR_SILVERMONT:
    case PROCESSOR_KNL:
    case PROCESSOR_KNM:
    case PROCESSOR_INTEL:
    case PROCESSOR_K6:
    case PROCESSOR_BTVER2:
    case PROCESSOR_PENTIUM4:
    case PROCESSOR_NOCONA:
      return 2;

    case PROCESSOR_PENTIUMPRO:
    case PROCESSOR_ATHLON:
    case PROCESSOR_K8:
    case PROCESSOR_AMDFAM10:
    case PROCESSOR_BTVER1:
      return 3;

    case PROCESSOR_BDVER1:
    case PROCESSOR_BDVER2:
    case PROCESSOR_BDVER3:
    case PROCESSOR_BDVER4:
    case PROCESSOR_ZNVER1:
    case PROCESSOR_ZNVER2:
    case PROCESSOR_CORE2:
    case PROCESSOR_NEHALEM:
    case PROCESSOR_SANDYBRIDGE:
    case PROCESSOR_HASWELL:
    case PROCESSOR_GENERIC:
      return 4;

    default:
      return 1;
    }
}
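
/* Note (illustration, not part of the original file): ix86_issue_rate is
   what the generic scheduler sees as the CPU's width.  In i386.c it is,
   as far as we know, installed roughly as

     #undef TARGET_SCHED_ISSUE_RATE
     #define TARGET_SCHED_ISSUE_RATE ix86_issue_rate

   so the values above directly bound how many insns the haifa scheduler
   tries to place in one cycle.  */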

/* Return true iff USE_INSN has a memory address with operands set by
   SET_INSN.  */
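
/* Illustrative note, not in the original sources: the classic address
   generation interlock this helper detects is a pair such as

       add   $8, %eax          ; SET_INSN writes %eax
       mov   (%eax), %ebx      ; USE_INSN's memory address reads %eax

   for which the function returns true; a push/pop feeding a plain
   %esp-based access is exempted below, since the stack-pointer update is
   handled specially.  */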

bool
ix86_agi_dependent (rtx_insn *set_insn, rtx_insn *use_insn)
{
  int i;
  extract_insn_cached (use_insn);
  for (i = recog_data.n_operands - 1; i >= 0; --i)
    if (MEM_P (recog_data.operand[i]))
      {
        rtx addr = XEXP (recog_data.operand[i], 0);
        if (modified_in_p (addr, set_insn) != 0)
          {
            /* No AGI stall if SET_INSN is a push or pop and USE_INSN
               has SP based memory (unless index reg is modified in a pop).  */
            rtx set = single_set (set_insn);
            if (set
                && (push_operand (SET_DEST (set), GET_MODE (SET_DEST (set)))
                    || pop_operand (SET_SRC (set), GET_MODE (SET_SRC (set)))))
              {
                struct ix86_address parts;
                if (ix86_decompose_address (addr, &parts)
                    && parts.base == stack_pointer_rtx
                    && (parts.index == NULL_RTX
                        || MEM_P (SET_DEST (set))
                        || !modified_in_p (parts.index, set_insn)))
                  return false;
              }
            return true;
          }
        return false;
      }
  return false;
}

/* A subroutine of ix86_adjust_cost -- return TRUE iff INSN reads flags set
   by DEP_INSN and nothing else set by DEP_INSN.  */
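
/* For illustration (this note is not in the original file): the typical
   pair is a compare followed by a conditional branch, e.g.

       cmp   %edx, %ecx        ; DEP_INSN sets the flags
       jne   .L3               ; INSN reads only the flags

   The Pentium code in ix86_adjust_cost uses this to drop COST to 0,
   modelling the fact that such compares pair with jump/setcc.  */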

static bool
ix86_flags_dependent (rtx_insn *insn, rtx_insn *dep_insn, enum attr_type insn_type)
{
  rtx set, set2;

  /* Simplify the test for uninteresting insns.  */
  if (insn_type != TYPE_SETCC
      && insn_type != TYPE_ICMOV
      && insn_type != TYPE_FCMOV
      && insn_type != TYPE_IBR)
    return false;

  if ((set = single_set (dep_insn)) != 0)
    {
      set = SET_DEST (set);
      set2 = NULL_RTX;
    }
  else if (GET_CODE (PATTERN (dep_insn)) == PARALLEL
           && XVECLEN (PATTERN (dep_insn), 0) == 2
           && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 0)) == SET
           && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 1)) == SET)
    {
      set = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 0));
      set2 = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 1));
    }
  else
    return false;

  if (!REG_P (set) || REGNO (set) != FLAGS_REG)
    return false;

  /* This test is true if the dependent insn reads the flags but
     not any other potentially set register.  */
  if (!reg_overlap_mentioned_p (set, PATTERN (insn)))
    return false;

  if (set2 && reg_overlap_mentioned_p (set2, PATTERN (insn)))
    return false;

  return true;
}

/* Helper function for exact_store_load_dependency.
   Return true if addr is found in insn.  */
static bool
exact_dependency_1 (rtx addr, rtx insn)
{
  enum rtx_code code;
  const char *format_ptr;
  int i, j;

  code = GET_CODE (insn);
  switch (code)
    {
    case MEM:
      if (rtx_equal_p (addr, insn))
        return true;
      break;
    case REG:
    CASE_CONST_ANY:
    case SYMBOL_REF:
    case CODE_LABEL:
    case PC:
    case CC0:
    case EXPR_LIST:
      return false;
    default:
      break;
    }

  format_ptr = GET_RTX_FORMAT (code);
  for (i = 0; i < GET_RTX_LENGTH (code); i++)
    {
      switch (*format_ptr++)
        {
        case 'e':
          if (exact_dependency_1 (addr, XEXP (insn, i)))
            return true;
          break;
        case 'E':
          for (j = 0; j < XVECLEN (insn, i); j++)
            if (exact_dependency_1 (addr, XVECEXP (insn, i, j)))
              return true;
          break;
        }
    }
  return false;
}

/* Return true if there exists an exact dependency between STORE and LOAD,
   i.e. the same memory address is used in both.  */
static bool
exact_store_load_dependency (rtx_insn *store, rtx_insn *load)
{
  rtx set1, set2;

  set1 = single_set (store);
  if (!set1)
    return false;
  if (!MEM_P (SET_DEST (set1)))
    return false;
  set2 = single_set (load);
  if (!set2)
    return false;
  if (exact_dependency_1 (SET_DEST (set1), SET_SRC (set2)))
    return true;
  return false;
}
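
/* Illustrative note, not in the original sources: this catches pairs such as

       mov   %eax, 4(%rsp)     ; store
       mov   4(%rsp), %ebx     ; load from the same address

   where the dependence really goes through memory, which matters for the
   store-forwarding adjustment in the Silvermont/KNL code below.  */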

/* This function corrects the value of COST (latency) based on the relationship
   between INSN and DEP_INSN through a dependence of type DEP_TYPE, and strength
   DW.  It should return the new value.

   On x86 CPUs this is most commonly used to model the fact that values of
   registers used to compute the address of a memory operand need to be ready
   earlier than values of registers used in the actual operation.  */
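
/* For illustration (not part of the original file): with a dependence such as

       imul  $9, %rcx, %rax    ; DEP_INSN produces %rax
       mov   (%rax), %rdx      ; INSN uses %rax as an address

   the result is needed at address-generation time, so the full latency (or
   an AGI penalty on Pentium) applies, whereas a value feeding only the data
   side of a load or store can often be overlapped, which is why several
   cases below subtract a load cost from COST.  */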

int
ix86_adjust_cost (rtx_insn *insn, int dep_type, rtx_insn *dep_insn, int cost,
                  unsigned int)
{
  enum attr_type insn_type, dep_insn_type;
  enum attr_memory memory;
  rtx set, set2;
  int dep_insn_code_number;

  /* Anti and output dependencies have zero cost on all CPUs.  */
  if (dep_type != 0)
    return 0;

  dep_insn_code_number = recog_memoized (dep_insn);

  /* If we can't recognize the insns, we can't really do anything.  */
  if (dep_insn_code_number < 0 || recog_memoized (insn) < 0)
    return cost;

  insn_type = get_attr_type (insn);
  dep_insn_type = get_attr_type (dep_insn);

  switch (ix86_tune)
    {
    case PROCESSOR_PENTIUM:
    case PROCESSOR_LAKEMONT:
      /* Address Generation Interlock adds a cycle of latency.  */
      if (insn_type == TYPE_LEA)
        {
          rtx addr = PATTERN (insn);

          if (GET_CODE (addr) == PARALLEL)
            addr = XVECEXP (addr, 0, 0);

          gcc_assert (GET_CODE (addr) == SET);

          addr = SET_SRC (addr);
          if (modified_in_p (addr, dep_insn))
            cost += 1;
        }
      else if (ix86_agi_dependent (dep_insn, insn))
        cost += 1;

      /* ??? Compares pair with jump/setcc.  */
      if (ix86_flags_dependent (insn, dep_insn, insn_type))
        cost = 0;

      /* Floating point stores require the value to be ready one cycle
         earlier.  */
      if (insn_type == TYPE_FMOV
          && get_attr_memory (insn) == MEMORY_STORE
          && !ix86_agi_dependent (dep_insn, insn))
        cost += 1;
      break;

    case PROCESSOR_PENTIUMPRO:
      /* INT->FP conversion is expensive.  */
      if (get_attr_fp_int_src (dep_insn))
        cost += 5;

      /* There is one cycle extra latency between an FP op and a store.  */
      if (insn_type == TYPE_FMOV
          && (set = single_set (dep_insn)) != NULL_RTX
          && (set2 = single_set (insn)) != NULL_RTX
          && rtx_equal_p (SET_DEST (set), SET_SRC (set2))
          && MEM_P (SET_DEST (set2)))
        cost += 1;

      memory = get_attr_memory (insn);

      /* Model the ability of the reorder buffer to hide the latency of a
         load by executing it in parallel with the previous instruction, when
         the previous instruction is not needed to compute the address.  */
      if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
          && !ix86_agi_dependent (dep_insn, insn))
        {
          /* Claim moves to take one cycle, as the core can issue one load
             at a time and the next load can start a cycle later.  */
          if (dep_insn_type == TYPE_IMOV
              || dep_insn_type == TYPE_FMOV)
            cost = 1;
          else if (cost > 1)
            cost--;
        }
      break;

    case PROCESSOR_K6:
      /* The esp dependency is resolved before
         the instruction is really finished.  */
      if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
          && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
        return 1;

      /* INT->FP conversion is expensive.  */
      if (get_attr_fp_int_src (dep_insn))
        cost += 5;

      memory = get_attr_memory (insn);

      /* Model the ability of the reorder buffer to hide the latency of a
         load by executing it in parallel with the previous instruction, when
         the previous instruction is not needed to compute the address.  */
      if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
          && !ix86_agi_dependent (dep_insn, insn))
        {
          /* Claim moves to take one cycle, as the core can issue one load
             at a time and the next load can start a cycle later.  */
          if (dep_insn_type == TYPE_IMOV
              || dep_insn_type == TYPE_FMOV)
            cost = 1;
          else if (cost > 2)
            cost -= 2;
          else
            cost = 1;
        }
      break;

    case PROCESSOR_AMDFAM10:
    case PROCESSOR_BDVER1:
    case PROCESSOR_BDVER2:
    case PROCESSOR_BDVER3:
    case PROCESSOR_BDVER4:
    case PROCESSOR_BTVER1:
    case PROCESSOR_BTVER2:
      /* The stack engine allows push and pop instructions to execute in
         parallel.  */
      if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
          && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
        return 0;
      /* FALLTHRU */

    case PROCESSOR_ATHLON:
    case PROCESSOR_K8:
      memory = get_attr_memory (insn);

      /* Model the ability of the reorder buffer to hide the latency of a
         load by executing it in parallel with the previous instruction, when
         the previous instruction is not needed to compute the address.  */
      if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
          && !ix86_agi_dependent (dep_insn, insn))
        {
          enum attr_unit unit = get_attr_unit (insn);
          int loadcost = 3;

          /* Because of the difference between the length of integer and
             floating unit pipeline preparation stages, the memory operands
             for floating point are cheaper.

             ??? For Athlon the difference is most probably 2.  */
          if (unit == UNIT_INTEGER || unit == UNIT_UNKNOWN)
            loadcost = 3;
          else
            loadcost = TARGET_ATHLON ? 2 : 0;

          if (cost >= loadcost)
            cost -= loadcost;
          else
            cost = 0;
        }
      break;

    case PROCESSOR_ZNVER1:
    case PROCESSOR_ZNVER2:
      /* The stack engine allows push and pop instructions to execute in
         parallel.  */
      if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
          && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
        return 0;

      memory = get_attr_memory (insn);

      /* Model the ability of the reorder buffer to hide the latency of a
         load by executing it in parallel with the previous instruction, when
         the previous instruction is not needed to compute the address.  */
      if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
          && !ix86_agi_dependent (dep_insn, insn))
        {
          enum attr_unit unit = get_attr_unit (insn);
          int loadcost;

          if (unit == UNIT_INTEGER || unit == UNIT_UNKNOWN)
            loadcost = 4;
          else
            loadcost = 7;

          if (cost >= loadcost)
            cost -= loadcost;
          else
            cost = 0;
        }
      break;

    case PROCESSOR_CORE2:
    case PROCESSOR_NEHALEM:
    case PROCESSOR_SANDYBRIDGE:
    case PROCESSOR_HASWELL:
    case PROCESSOR_GENERIC:
      /* The stack engine allows push and pop instructions to execute in
         parallel.  */
      if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
          && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
        return 0;

      memory = get_attr_memory (insn);

      /* Model the ability of the reorder buffer to hide the latency of a
         load by executing it in parallel with the previous instruction, when
         the previous instruction is not needed to compute the address.  */
      if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
          && !ix86_agi_dependent (dep_insn, insn))
        {
          if (cost >= 4)
            cost -= 4;
          else
            cost = 0;
        }
      break;

    case PROCESSOR_SILVERMONT:
    case PROCESSOR_KNL:
    case PROCESSOR_KNM:
    case PROCESSOR_INTEL:
      if (!reload_completed)
        return cost;

      /* Increase cost of integer loads.  */
      memory = get_attr_memory (dep_insn);
      if (memory == MEMORY_LOAD || memory == MEMORY_BOTH)
        {
          enum attr_unit unit = get_attr_unit (dep_insn);
          if (unit == UNIT_INTEGER && cost == 1)
            {
              if (memory == MEMORY_LOAD)
                cost = 3;
              else
                {
                  /* Increase the cost of ld/st for short int types only
                     because of the store forwarding issue.  */
                  rtx set = single_set (dep_insn);
                  if (set && (GET_MODE (SET_DEST (set)) == QImode
                              || GET_MODE (SET_DEST (set)) == HImode))
                    {
                      /* Increase the cost of the store/load insn if an exact
                         dependence exists and INSN is a load.  */
                      enum attr_memory insn_memory = get_attr_memory (insn);
                      if (insn_memory == MEMORY_LOAD
                          && exact_store_load_dependency (dep_insn, insn))
                        cost = 3;
                    }
                }
            }
        }

    default:
      break;
    }

  return cost;
}

/* How many alternative schedules to try.  This should be as wide as the
   scheduling freedom in the DFA, but no wider.  Making this value too
   large results in extra work for the scheduler.  */

int
ia32_multipass_dfa_lookahead (void)
{
  /* Generally, we want haifa-sched:max_issue() to look ahead as far as the
     number of instructions that can be executed in one cycle, i.e.,
     issue_rate.  */
  if (reload_completed)
    return ix86_issue_rate ();
  /* Don't use lookahead for the pre-reload schedule, to save compile
     time.  */
  return 0;
}
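
/* Note (illustration, not in the original file): this value is returned to
   the generic scheduler through what we believe is the
   TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD hook in i386.c, so after
   reload max_issue considers at most issue-rate candidate insns per cycle,
   while before reload the multipass lookahead is disabled entirely.  */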

/* Return true if target platform supports macro-fusion.  */

bool
ix86_macro_fusion_p ()
{
  return TARGET_FUSE_CMP_AND_BRANCH;
}

/* Check whether the current microarchitecture supports macro fusion
   for the insn pair "CONDGEN + CONDJMP".  Refer to
   "Intel Architectures Optimization Reference Manual".  */
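
/* Illustrative note, not in the original sources: the canonical fusible
   pair is

       cmp   %rsi, %rdi        ; CONDGEN sets the flags
       jne   .L4               ; CONDJMP consumes them

   which the decoder of a fusion-capable core can turn into a single
   macro-op.  The checks below reject the documented exceptions: cmp/test
   with a MEM-IMM operand pair, RIP-relative memory operands, ALU ops with a
   memory destination, inc/dec paired with an unsigned branch, and the
   signed GE/GT/LE/LT branches when TARGET_FUSE_CMP_AND_BRANCH_SOFLAGS is
   not set.  */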

bool
ix86_macro_fusion_pair_p (rtx_insn *condgen, rtx_insn *condjmp)
{
  rtx src, dest;
  enum rtx_code ccode;
  rtx compare_set = NULL_RTX, test_if, cond;
  rtx alu_set = NULL_RTX, addr = NULL_RTX;

  if (!any_condjump_p (condjmp))
    return false;

  unsigned int condreg1, condreg2;
  rtx cc_reg_1;
  targetm.fixed_condition_code_regs (&condreg1, &condreg2);
  cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
  if (!reg_referenced_p (cc_reg_1, PATTERN (condjmp))
      || !condgen
      || !modified_in_p (cc_reg_1, condgen))
    return false;

  if (get_attr_type (condgen) != TYPE_TEST
      && get_attr_type (condgen) != TYPE_ICMP
      && get_attr_type (condgen) != TYPE_INCDEC
      && get_attr_type (condgen) != TYPE_ALU)
    return false;

  compare_set = single_set (condgen);
  if (compare_set == NULL_RTX
      && !TARGET_FUSE_ALU_AND_BRANCH)
    return false;

  if (compare_set == NULL_RTX)
    {
      int i;
      rtx pat = PATTERN (condgen);
      for (i = 0; i < XVECLEN (pat, 0); i++)
        if (GET_CODE (XVECEXP (pat, 0, i)) == SET)
          {
            rtx set_src = SET_SRC (XVECEXP (pat, 0, i));
            if (GET_CODE (set_src) == COMPARE)
              compare_set = XVECEXP (pat, 0, i);
            else
              alu_set = XVECEXP (pat, 0, i);
          }
    }
  if (compare_set == NULL_RTX)
    return false;
  src = SET_SRC (compare_set);
  if (GET_CODE (src) != COMPARE)
    return false;

  /* Macro-fusion for cmp/test MEM-IMM + conditional jmp is not
     supported.  */
  if ((MEM_P (XEXP (src, 0))
       && CONST_INT_P (XEXP (src, 1)))
      || (MEM_P (XEXP (src, 1))
          && CONST_INT_P (XEXP (src, 0))))
    return false;

  /* No fusion for RIP-relative address.  */
  if (MEM_P (XEXP (src, 0)))
    addr = XEXP (XEXP (src, 0), 0);
  else if (MEM_P (XEXP (src, 1)))
    addr = XEXP (XEXP (src, 1), 0);

  if (addr)
    {
      ix86_address parts;
      int ok = ix86_decompose_address (addr, &parts);
      gcc_assert (ok);

      if (ix86_rip_relative_addr_p (&parts))
        return false;
    }

  test_if = SET_SRC (pc_set (condjmp));
  cond = XEXP (test_if, 0);
  ccode = GET_CODE (cond);
  /* Check whether the conditional jump uses the Sign or Overflow flags.  */
  if (!TARGET_FUSE_CMP_AND_BRANCH_SOFLAGS
      && (ccode == GE
          || ccode == GT
          || ccode == LE
          || ccode == LT))
    return false;

  /* Return true for TYPE_TEST and TYPE_ICMP.  */
  if (get_attr_type (condgen) == TYPE_TEST
      || get_attr_type (condgen) == TYPE_ICMP)
    return true;

  /* The following handles macro-fusion for alu + jmp.  */
  if (!TARGET_FUSE_ALU_AND_BRANCH || !alu_set)
    return false;

  /* No fusion for an alu op with a memory destination operand.  */
  dest = SET_DEST (alu_set);
  if (MEM_P (dest))
    return false;

  /* Macro-fusion for inc/dec + unsigned conditional jump is not
     supported.  */
  if (get_attr_type (condgen) == TYPE_INCDEC
      && (ccode == GEU
          || ccode == GTU
          || ccode == LEU
          || ccode == LTU))
    return false;

  return true;
}