]> git.ipfire.org Git - thirdparty/gcc.git/blame - gcc/config/i386/ppro.md
Update copyright years.
[thirdparty/gcc.git] / gcc / config / i386 / ppro.md
CommitLineData
56bab446 1;; Scheduling for the Intel P6 family of processors
8d9254fc 2;; Copyright (C) 2004-2020 Free Software Foundation, Inc.
af2728a4 3;;
a805d35f 4;; This file is part of GCC.
af2728a4 5;;
a805d35f 6;; GCC is free software; you can redistribute it and/or modify
af2728a4 7;; it under the terms of the GNU General Public License as published by
2f83c7d6 8;; the Free Software Foundation; either version 3, or (at your option)
af2728a4
JL
9;; any later version.
10;;
a805d35f 11;; GCC is distributed in the hope that it will be useful,
af2728a4
JL
12;; but WITHOUT ANY WARRANTY; without even the implied warranty of
13;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14;; GNU General Public License for more details.
15;;
16;; You should have received a copy of the GNU General Public License
2f83c7d6
NC
17;; along with GCC; see the file COPYING3. If not see
18;; <http://www.gnu.org/licenses/>. */
af2728a4 19
71cc389b 20;; The P6 family includes the Pentium Pro, Pentium II, Pentium III, Celeron
56bab446
SB
21;; and Xeon lines of CPUs. The DFA scheduler description in this file is
22;; based on information that can be found in the following three documents:
23;;
24;; "P6 Family of Processors Hardware Developer's Manual",
25;; Intel, September 1999.
26;;
27;; "Intel Architecture Optimization Manual",
28;; Intel, 1999 (Order Number: 245127-001).
29;;
30;; "How to optimize for the Pentium family of microprocessors",
31;; by Agner Fog, PhD.
32;;
33;; The P6 pipeline has three major components:
34;; 1) the FETCH/DECODE unit, an in-order issue front-end
35;; 2) the DISPATCH/EXECUTE unit, which is the out-of-order core
36;; 3) the RETIRE unit, an in-order retirement unit
37;;
38;; So, the P6 CPUs have out-of-order cores, but the instruction decoder and
39;; retirement unit are naturally in-order.
40;;
41;; BUS INTERFACE UNIT
42;; / \
43;; L1 ICACHE L1 DCACHE
44;; / | \ | \
45;; DECODER0 DECODER1 DECODER2 DISP/EXEC RETIRE
46;; \ | / | |
47;; INSTRUCTION POOL __________|_______/
48;; (inc. reorder buffer)
49;;
50;; Since the P6 CPUs execute instructions out-of-order, the most important
51;; consideration in performance tuning is making sure enough micro-ops are
52;; ready for execution in the out-of-order core, while not stalling the
53;; decoder.
54;;
55;; TODO:
56;; - Find a less crude way to model complex instructions, in
57;; particular how many cycles they take to be decoded.
58;; - Include decoder latencies in the total reservation latencies.
59;; This isn't necessary right now because we assume for every
60;; instruction that it never blocks a decoder.
61;; - Figure out where the p0 and p1 reservations come from. These
90c56b45 62;; appear not to be in the manual
56bab446
SB
63;; - Lots more because I'm sure this is still far from optimal :-)
64
65;; The ppro_idiv and ppro_fdiv automata are used to model issue
66;; latencies of idiv and fdiv type insns.
67(define_automaton "ppro_decoder,ppro_core,ppro_idiv,ppro_fdiv,ppro_load,ppro_store")
68
69;; Simple instructions of the register-register form have only one uop.
70;; Load instructions are also only one uop. Store instructions decode to
71;; two uops, and simple read-modify instructions also take two uops.
72;; Simple instructions of the register-memory form have two to three uops.
73;; Simple read-modify-write instructions have four uops. The rules for
74;; the decoder are simple:
75;; - an instruction with 1 uop can be decoded by any of the three
76;; decoders in one cycle.
77;; - an instruction with 1 to 4 uops can be decoded only by decoder 0
78;; but still in only one cycle.
79;; - a complex (microcode) instruction can also only be decoded by
80;; decoder 0, and this takes an unspecified number of cycles.
4f3f76e6 81;;
56bab446
SB
82;; The goal is to schedule such that we have a few-one-one uops sequence
83;; in each cycle, to decode as many instructions per cycle as possible.
84(define_cpu_unit "decoder0" "ppro_decoder")
85(define_cpu_unit "decoder1" "ppro_decoder")
86(define_cpu_unit "decoder2" "ppro_decoder")
87
88;; We first wish to find an instruction for decoder0, so exclude
89;; decoder1 and decoder2 from being reserved until decoder 0 is
90;; reserved.
91(presence_set "decoder1" "decoder0")
92(presence_set "decoder2" "decoder0")
93
94;; Most instructions can be decoded on any of the three decoders.
95(define_reservation "decodern" "(decoder0|decoder1|decoder2)")
96
97;; The out-of-order core has five pipelines. During each cycle, the core
98;; may dispatch zero or one uop on the port of any of the five pipelines
99;; so the maximum number of dispatched uops per cycle is 5. In practice,
100;; 3 uops per cycle is more realistic.
101;;
102;; Two of the five pipelines contain several execution units:
103;;
104;; Port 0 Port 1 Port 2 Port 3 Port 4
105;; ALU ALU LOAD SAC SDA
106;; FPU JUE
107;; AGU MMX
108;; MMX P3FPU
109;; P3FPU
110;;
111;; (SAC=Store Address Calculation, SDA=Store Data Unit, P3FPU = SSE unit,
71cc389b 112;; JUE = Jump Execution Unit, AGU = Address Generation Unit)
56bab446
SB
113;;
114(define_cpu_unit "p0,p1" "ppro_core")
115(define_cpu_unit "p2" "ppro_load")
116(define_cpu_unit "p3,p4" "ppro_store")
117(define_cpu_unit "idiv" "ppro_idiv")
118(define_cpu_unit "fdiv" "ppro_fdiv")
119
120;; Only the irregular instructions have to be modeled here. A load
121;; increases the latency by 2 or 3, or by nothing if the manual gives
122;; a latency already. Store latencies are not accounted for.
123;;
124;; The simple instructions follow a very regular pattern of 1 uop per
125;; reg-reg operation, 1 uop per load on port 2, and 2 uops per store
126;; on port 4 and port 3. These instructions are modelled at the bottom
127;; of this file.
128;;
129;; For microcoded instructions we don't know how many uops are produced.
130;; These instructions are the "complex" ones in the Intel manuals. All
131;; we _do_ know is that they typically produce four or more uops, so
132;; they can only be decoded on decoder0. Modelling their latencies
133;; doesn't make sense because we don't know how these instructions are
134;; executed in the core. So we just model that they can only be decoded
135;; on decoder 0, and say that it takes a little while before the result
71cc389b 136;; is available.
56bab446 137(define_insn_reservation "ppro_complex_insn" 6
7ab91c5f 138 (and (eq_attr "cpu" "pentiumpro")
324a6c95 139 (eq_attr "type" "other,multi,call,callv,str"))
56bab446
SB
140 "decoder0")
141
142;; imov with memory operands does not use the integer units.
143(define_insn_reservation "ppro_imov" 1
7ab91c5f 144 (and (eq_attr "cpu" "pentiumpro")
56bab446
SB
145 (and (eq_attr "memory" "none")
146 (eq_attr "type" "imov")))
147 "decodern,(p0|p1)")
148
149(define_insn_reservation "ppro_imov_load" 4
7ab91c5f 150 (and (eq_attr "cpu" "pentiumpro")
56bab446
SB
151 (and (eq_attr "memory" "load")
152 (eq_attr "type" "imov")))
153 "decodern,p2")
154
155(define_insn_reservation "ppro_imov_store" 1
7ab91c5f 156 (and (eq_attr "cpu" "pentiumpro")
56bab446
SB
157 (and (eq_attr "memory" "store")
158 (eq_attr "type" "imov")))
159 "decoder0,p4+p3")
160
161;; imovx always decodes to one uop, and also doesn't use the integer
162;; units if it has memory operands.
163(define_insn_reservation "ppro_imovx" 1
7ab91c5f 164 (and (eq_attr "cpu" "pentiumpro")
56bab446
SB
165 (and (eq_attr "memory" "none")
166 (eq_attr "type" "imovx")))
167 "decodern,(p0|p1)")
168
169(define_insn_reservation "ppro_imovx_load" 4
7ab91c5f 170 (and (eq_attr "cpu" "pentiumpro")
56bab446
SB
171 (and (eq_attr "memory" "load")
172 (eq_attr "type" "imovx")))
173 "decodern,p2")
174
175;; lea executes on port 0 with latency one and throughput 1.
176(define_insn_reservation "ppro_lea" 1
7ab91c5f 177 (and (eq_attr "cpu" "pentiumpro")
56bab446
SB
178 (and (eq_attr "memory" "none")
179 (eq_attr "type" "lea")))
180 "decodern,p0")
181
182;; Shift and rotate execute on port 0 with latency and throughput 1.
183;; The load and store units need to be reserved when memory operands
184;; are involved.
185(define_insn_reservation "ppro_shift_rotate" 1
7ab91c5f 186 (and (eq_attr "cpu" "pentiumpro")
56bab446
SB
187 (and (eq_attr "memory" "none")
188 (eq_attr "type" "ishift,ishift1,rotate,rotate1")))
189 "decodern,p0")
190
191(define_insn_reservation "ppro_shift_rotate_mem" 4
7ab91c5f 192 (and (eq_attr "cpu" "pentiumpro")
56bab446
SB
193 (and (eq_attr "memory" "!none")
194 (eq_attr "type" "ishift,ishift1,rotate,rotate1")))
195 "decoder0,p2+p0,p4+p3")
196
56bab446 197
71cc389b 198;; The P6 has a sophisticated branch prediction mechanism to minimize
56bab446
SB
199;; latencies due to branching. In particular, it has a fast way to
200;; execute branches that are taken multiple times (such as in loops).
201;; Branches not taken suffer no penalty, and correctly predicted
202;; branches cost only one fetch cycle. Mispredicted branches are very
203;; costly: typically 15 cycles and possibly as many as 26 cycles.
204;;
71cc389b 205;; Unfortunately all this makes it quite difficult to properly model
56bab446
SB
206;; the latencies for the compiler. Here I've made the choice to be
207;; optimistic and assume branches are often predicted correctly, so
208;; they have latency 1, and the decoders are not blocked.
209;;
210;; In addition, the model assumes a branch always decodes to only 1 uop,
211;; which is not exactly true because there are a few instructions that
212;; decode to 2 uops or microcode. But this probably gives the best
213;; results because we can assume these instructions can decode on all
214;; decoders.
215(define_insn_reservation "ppro_branch" 1
7ab91c5f 216 (and (eq_attr "cpu" "pentiumpro")
56bab446
SB
217 (and (eq_attr "memory" "none")
218 (eq_attr "type" "ibr")))
219 "decodern,p1")
220
221;; ??? Indirect branches probably have worse latency than this.
222(define_insn_reservation "ppro_indirect_branch" 6
7ab91c5f 223 (and (eq_attr "cpu" "pentiumpro")
56bab446
SB
224 (and (eq_attr "memory" "!none")
225 (eq_attr "type" "ibr")))
226 "decoder0,p2+p1")
227
228(define_insn_reservation "ppro_leave" 4
7ab91c5f 229 (and (eq_attr "cpu" "pentiumpro")
56bab446
SB
230 (eq_attr "type" "leave"))
231 "decoder0,p2+(p0|p1),(p0|p1)")
232
233;; imul has throughput one, but latency 4, and can only execute on port 0.
234(define_insn_reservation "ppro_imul" 4
7ab91c5f 235 (and (eq_attr "cpu" "pentiumpro")
56bab446
SB
236 (and (eq_attr "memory" "none")
237 (eq_attr "type" "imul")))
238 "decodern,p0")
239
240(define_insn_reservation "ppro_imul_mem" 4
7ab91c5f 241 (and (eq_attr "cpu" "pentiumpro")
56bab446
SB
242 (and (eq_attr "memory" "!none")
243 (eq_attr "type" "imul")))
244 "decoder0,p2+p0")
245
246;; div and idiv are very similar, so we model them the same.
247;; QI, HI, and SI have issue latency 12, 21, and 37, respectively.
248;; These issue latencies are modelled via the ppro_idiv automaton.
249(define_insn_reservation "ppro_idiv_QI" 19
7ab91c5f 250 (and (eq_attr "cpu" "pentiumpro")
56bab446
SB
251 (and (eq_attr "memory" "none")
252 (and (eq_attr "mode" "QI")
253 (eq_attr "type" "idiv"))))
254 "decoder0,(p0+idiv)*2,(p0|p1)+idiv,idiv*9")
255
256(define_insn_reservation "ppro_idiv_QI_load" 19
7ab91c5f 257 (and (eq_attr "cpu" "pentiumpro")
56bab446
SB
258 (and (eq_attr "memory" "load")
259 (and (eq_attr "mode" "QI")
260 (eq_attr "type" "idiv"))))
261 "decoder0,p2+p0+idiv,p0+idiv,(p0|p1)+idiv,idiv*9")
262
263(define_insn_reservation "ppro_idiv_HI" 23
7ab91c5f 264 (and (eq_attr "cpu" "pentiumpro")
56bab446
SB
265 (and (eq_attr "memory" "none")
266 (and (eq_attr "mode" "HI")
267 (eq_attr "type" "idiv"))))
268 "decoder0,(p0+idiv)*3,(p0|p1)+idiv,idiv*17")
269
270(define_insn_reservation "ppro_idiv_HI_load" 23
7ab91c5f 271 (and (eq_attr "cpu" "pentiumpro")
56bab446
SB
272 (and (eq_attr "memory" "load")
273 (and (eq_attr "mode" "HI")
274 (eq_attr "type" "idiv"))))
275 "decoder0,p2+p0+idiv,p0+idiv,(p0|p1)+idiv,idiv*18")
276
277(define_insn_reservation "ppro_idiv_SI" 39
7ab91c5f 278 (and (eq_attr "cpu" "pentiumpro")
56bab446
SB
279 (and (eq_attr "memory" "none")
280 (and (eq_attr "mode" "SI")
281 (eq_attr "type" "idiv"))))
282 "decoder0,(p0+idiv)*3,(p0|p1)+idiv,idiv*33")
283
284(define_insn_reservation "ppro_idiv_SI_load" 39
7ab91c5f 285 (and (eq_attr "cpu" "pentiumpro")
56bab446
SB
286 (and (eq_attr "memory" "load")
287 (and (eq_attr "mode" "SI")
288 (eq_attr "type" "idiv"))))
289 "decoder0,p2+p0+idiv,p0+idiv,(p0|p1)+idiv,idiv*34")
290
291;; Floating point operations always execute on port 0.
292;; ??? where do these latencies come from? fadd has latency 3 and
293;; has throughput "1/cycle (align with FADD)". What do they
294;; mean and how can we model that?
295(define_insn_reservation "ppro_fop" 3
7ab91c5f 296 (and (eq_attr "cpu" "pentiumpro")
56bab446
SB
297 (and (eq_attr "memory" "none,unknown")
298 (eq_attr "type" "fop")))
299 "decodern,p0")
300
301(define_insn_reservation "ppro_fop_load" 5
7ab91c5f 302 (and (eq_attr "cpu" "pentiumpro")
56bab446
SB
303 (and (eq_attr "memory" "load")
304 (eq_attr "type" "fop")))
305 "decoder0,p2+p0,p0")
306
307(define_insn_reservation "ppro_fop_store" 3
7ab91c5f 308 (and (eq_attr "cpu" "pentiumpro")
56bab446
SB
309 (and (eq_attr "memory" "store")
310 (eq_attr "type" "fop")))
311 "decoder0,p0,p0,p0+p4+p3")
312
313(define_insn_reservation "ppro_fop_both" 5
7ab91c5f 314 (and (eq_attr "cpu" "pentiumpro")
56bab446
SB
315 (and (eq_attr "memory" "both")
316 (eq_attr "type" "fop")))
317 "decoder0,p2+p0,p0+p4+p3")
318
319(define_insn_reservation "ppro_fsgn" 1
7ab91c5f 320 (and (eq_attr "cpu" "pentiumpro")
56bab446
SB
321 (eq_attr "type" "fsgn"))
322 "decodern,p0")
323
324(define_insn_reservation "ppro_fistp" 5
7ab91c5f 325 (and (eq_attr "cpu" "pentiumpro")
56bab446
SB
326 (eq_attr "type" "fistp"))
327 "decoder0,p0*2,p4+p3")
328
329(define_insn_reservation "ppro_fcmov" 2
7ab91c5f 330 (and (eq_attr "cpu" "pentiumpro")
56bab446
SB
331 (eq_attr "type" "fcmov"))
332 "decoder0,p0*2")
333
334(define_insn_reservation "ppro_fcmp" 1
7ab91c5f 335 (and (eq_attr "cpu" "pentiumpro")
56bab446
SB
336 (and (eq_attr "memory" "none")
337 (eq_attr "type" "fcmp")))
338 "decodern,p0")
339
340(define_insn_reservation "ppro_fcmp_load" 4
7ab91c5f 341 (and (eq_attr "cpu" "pentiumpro")
56bab446
SB
342 (and (eq_attr "memory" "load")
343 (eq_attr "type" "fcmp")))
344 "decoder0,p2+p0")
345
346(define_insn_reservation "ppro_fmov" 1
7ab91c5f 347 (and (eq_attr "cpu" "pentiumpro")
56bab446
SB
348 (and (eq_attr "memory" "none")
349 (eq_attr "type" "fmov")))
350 "decodern,p0")
351
352(define_insn_reservation "ppro_fmov_load" 1
7ab91c5f 353 (and (eq_attr "cpu" "pentiumpro")
56bab446
SB
354 (and (eq_attr "memory" "load")
355 (and (eq_attr "mode" "!XF")
356 (eq_attr "type" "fmov"))))
357 "decodern,p2")
358
359(define_insn_reservation "ppro_fmov_XF_load" 3
7ab91c5f 360 (and (eq_attr "cpu" "pentiumpro")
56bab446
SB
361 (and (eq_attr "memory" "load")
362 (and (eq_attr "mode" "XF")
363 (eq_attr "type" "fmov"))))
364 "decoder0,(p2+p0)*2")
365
366(define_insn_reservation "ppro_fmov_store" 1
7ab91c5f 367 (and (eq_attr "cpu" "pentiumpro")
56bab446
SB
368 (and (eq_attr "memory" "store")
369 (and (eq_attr "mode" "!XF")
370 (eq_attr "type" "fmov"))))
371 "decodern,p0")
372
373(define_insn_reservation "ppro_fmov_XF_store" 3
7ab91c5f 374 (and (eq_attr "cpu" "pentiumpro")
56bab446
SB
375 (and (eq_attr "memory" "store")
376 (and (eq_attr "mode" "XF")
377 (eq_attr "type" "fmov"))))
378 "decoder0,(p0+p4),(p0+p3)")
379
380;; fmul executes on port 0 with latency 5. It has issue latency 2,
381;; but we don't model this.
382(define_insn_reservation "ppro_fmul" 5
7ab91c5f 383 (and (eq_attr "cpu" "pentiumpro")
56bab446
SB
384 (and (eq_attr "memory" "none")
385 (eq_attr "type" "fmul")))
386 "decoder0,p0*2")
387
388(define_insn_reservation "ppro_fmul_load" 6
7ab91c5f 389 (and (eq_attr "cpu" "pentiumpro")
56bab446
SB
390 (and (eq_attr "memory" "load")
391 (eq_attr "type" "fmul")))
392 "decoder0,p2+p0,p0")
393
394;; fdiv latencies depend on the mode of the operands. XFmode gives
395;; a latency of 38 cycles, DFmode gives 32, and SFmode gives latency 18.
396;; Division by a power of 2 takes only 9 cycles, but we cannot model
397;; that. Throughput is equal to latency - 1, which we model using the
398;; ppro_fdiv automaton.
399(define_insn_reservation "ppro_fdiv_SF" 18
7ab91c5f 400 (and (eq_attr "cpu" "pentiumpro")
56bab446
SB
401 (and (eq_attr "memory" "none")
402 (and (eq_attr "mode" "SF")
403 (eq_attr "type" "fdiv,fpspc"))))
404 "decodern,p0+fdiv,fdiv*16")
405
406(define_insn_reservation "ppro_fdiv_SF_load" 19
7ab91c5f 407 (and (eq_attr "cpu" "pentiumpro")
56bab446
SB
408 (and (eq_attr "memory" "load")
409 (and (eq_attr "mode" "SF")
410 (eq_attr "type" "fdiv,fpspc"))))
411 "decoder0,p2+p0+fdiv,fdiv*16")
412
413(define_insn_reservation "ppro_fdiv_DF" 32
7ab91c5f 414 (and (eq_attr "cpu" "pentiumpro")
56bab446
SB
415 (and (eq_attr "memory" "none")
416 (and (eq_attr "mode" "DF")
417 (eq_attr "type" "fdiv,fpspc"))))
418 "decodern,p0+fdiv,fdiv*30")
419
420(define_insn_reservation "ppro_fdiv_DF_load" 33
7ab91c5f 421 (and (eq_attr "cpu" "pentiumpro")
56bab446
SB
422 (and (eq_attr "memory" "load")
423 (and (eq_attr "mode" "DF")
424 (eq_attr "type" "fdiv,fpspc"))))
425 "decoder0,p2+p0+fdiv,fdiv*30")
426
427(define_insn_reservation "ppro_fdiv_XF" 38
7ab91c5f 428 (and (eq_attr "cpu" "pentiumpro")
56bab446
SB
429 (and (eq_attr "memory" "none")
430 (and (eq_attr "mode" "XF")
431 (eq_attr "type" "fdiv,fpspc"))))
432 "decodern,p0+fdiv,fdiv*36")
433
434(define_insn_reservation "ppro_fdiv_XF_load" 39
7ab91c5f 435 (and (eq_attr "cpu" "pentiumpro")
56bab446
SB
436 (and (eq_attr "memory" "load")
437 (and (eq_attr "mode" "XF")
438 (eq_attr "type" "fdiv,fpspc"))))
439 "decoder0,p2+p0+fdiv,fdiv*36")
440
441;; MMX instructions can execute on either port 0 or port 1 with a
442;; throughput of 1/cycle.
443;; on port 0: - ALU (latency 1)
444;; - Multiplier Unit (latency 3)
445;; on port 1: - ALU (latency 1)
446;; - Shift Unit (latency 1)
447;;
448;; MMX instructions are either of the type reg-reg, or read-modify, and
449;; except for mmxshft and mmxmul they can execute on port 0 or port 1,
450;; so they behave as "simple" instructions that need no special modelling.
451;; We only have to model mmxshft and mmxmul.
452(define_insn_reservation "ppro_mmx_shft" 1
7ab91c5f 453 (and (eq_attr "cpu" "pentiumpro")
56bab446
SB
454 (and (eq_attr "memory" "none")
455 (eq_attr "type" "mmxshft")))
456 "decodern,p1")
457
;; MMX shift whose source operand comes from memory.  The reservation
;; uses the load port (p2) together with port 1, and the insn is
;; restricted to decoder0.  The "memory" test must be "load": testing
;; "none" would repeat the condition of ppro_mmx_shft and leave the
;; memory-operand form without a matching reservation.
(define_insn_reservation "ppro_mmx_shft_load" 2
                         (and (eq_attr "cpu" "pentiumpro")
                              (and (eq_attr "memory" "load")
                                   (eq_attr "type" "mmxshft")))
                         "decoder0,p2+p1")
463
464(define_insn_reservation "ppro_mmx_mul" 3
7ab91c5f 465 (and (eq_attr "cpu" "pentiumpro")
56bab446
SB
466 (and (eq_attr "memory" "none")
467 (eq_attr "type" "mmxmul")))
468 "decodern,p0")
469
;; MMX multiply whose source operand comes from memory.  The reservation
;; uses the load port (p2) together with port 0, and the insn is
;; restricted to decoder0.  The "memory" test must be "load": testing
;; "none" would repeat the condition of ppro_mmx_mul and leave the
;; memory-operand form without a matching reservation.
(define_insn_reservation "ppro_mmx_mul_load" 3
                         (and (eq_attr "cpu" "pentiumpro")
                              (and (eq_attr "memory" "load")
                                   (eq_attr "type" "mmxmul")))
                         "decoder0,p2+p0")
475
476(define_insn_reservation "ppro_sse_mmxcvt" 4
7ab91c5f 477 (and (eq_attr "cpu" "pentiumpro")
56bab446
SB
478 (and (eq_attr "mode" "DI")
479 (eq_attr "type" "mmxcvt")))
480 "decodern,p1")
481
482;; FIXME: These are Pentium III only, but we cannot tell here if
483;; we're generating code for PentiumPro/Pentium II or Pentium III
484;; (define_insn_reservation "ppro_sse_mmxshft" 2
7ab91c5f 485;; (and (eq_attr "cpu" "pentiumpro")
56bab446
SB
486;; (and (eq_attr "mode" "DI")
487;; (eq_attr "type" "mmxshft")))
488;; "decodern,p0")
489
490;; SSE is very complicated, and takes a bit more effort.
491;; ??? I assumed that all SSE instructions decode on decoder0,
492;; but is this correct?
493
494;; The sfence instruction.
495(define_insn_reservation "ppro_sse_sfence" 3
7ab91c5f 496 (and (eq_attr "cpu" "pentiumpro")
56bab446
SB
497 (and (eq_attr "memory" "unknown")
498 (eq_attr "type" "sse")))
499 "decoder0,p4+p3")
500
501;; FIXME: This reservation is all wrong when we're scheduling sqrtss.
502(define_insn_reservation "ppro_sse_SF" 3
7ab91c5f 503 (and (eq_attr "cpu" "pentiumpro")
56bab446
SB
504 (and (eq_attr "mode" "SF")
505 (eq_attr "type" "sse")))
506 "decodern,p0")
507
508(define_insn_reservation "ppro_sse_add_SF" 3
7ab91c5f 509 (and (eq_attr "cpu" "pentiumpro")
56bab446
SB
510 (and (eq_attr "memory" "none")
511 (and (eq_attr "mode" "SF")
b790dea2 512 (eq_attr "type" "sseadd,sseadd1"))))
56bab446
SB
513 "decodern,p1")
514
515(define_insn_reservation "ppro_sse_add_SF_load" 3
7ab91c5f 516 (and (eq_attr "cpu" "pentiumpro")
56bab446
SB
517 (and (eq_attr "memory" "load")
518 (and (eq_attr "mode" "SF")
b790dea2 519 (eq_attr "type" "sseadd,sseadd1"))))
56bab446
SB
520 "decoder0,p2+p1")
521
522(define_insn_reservation "ppro_sse_cmp_SF" 3
7ab91c5f 523 (and (eq_attr "cpu" "pentiumpro")
56bab446
SB
524 (and (eq_attr "memory" "none")
525 (and (eq_attr "mode" "SF")
526 (eq_attr "type" "ssecmp"))))
527 "decoder0,p1")
528
529(define_insn_reservation "ppro_sse_cmp_SF_load" 3
7ab91c5f 530 (and (eq_attr "cpu" "pentiumpro")
56bab446
SB
531 (and (eq_attr "memory" "load")
532 (and (eq_attr "mode" "SF")
533 (eq_attr "type" "ssecmp"))))
534 "decoder0,p2+p1")
535
536(define_insn_reservation "ppro_sse_comi_SF" 1
7ab91c5f 537 (and (eq_attr "cpu" "pentiumpro")
56bab446
SB
538 (and (eq_attr "memory" "none")
539 (and (eq_attr "mode" "SF")
540 (eq_attr "type" "ssecomi"))))
541 "decodern,p0")
542
543(define_insn_reservation "ppro_sse_comi_SF_load" 1
7ab91c5f 544 (and (eq_attr "cpu" "pentiumpro")
56bab446
SB
545 (and (eq_attr "memory" "load")
546 (and (eq_attr "mode" "SF")
547 (eq_attr "type" "ssecomi"))))
548 "decoder0,p2+p0")
549
550(define_insn_reservation "ppro_sse_mul_SF" 4
7ab91c5f 551 (and (eq_attr "cpu" "pentiumpro")
56bab446
SB
552 (and (eq_attr "memory" "none")
553 (and (eq_attr "mode" "SF")
554 (eq_attr "type" "ssemul"))))
555 "decodern,p0")
556
557(define_insn_reservation "ppro_sse_mul_SF_load" 4
7ab91c5f 558 (and (eq_attr "cpu" "pentiumpro")
56bab446
SB
559 (and (eq_attr "memory" "load")
560 (and (eq_attr "mode" "SF")
561 (eq_attr "type" "ssemul"))))
562 "decoder0,p2+p0")
563
564;; FIXME: ssediv doesn't close p0 for 17 cycles, surely???
565(define_insn_reservation "ppro_sse_div_SF" 18
7ab91c5f 566 (and (eq_attr "cpu" "pentiumpro")
56bab446
SB
567 (and (eq_attr "memory" "none")
568 (and (eq_attr "mode" "SF")
569 (eq_attr "type" "ssediv"))))
570 "decoder0,p0*17")
571
;; Scalar (SFmode) SSE divide whose source operand comes from memory.
;; The first cycle reserves the load port (p2) together with port 0;
;; port 0 then stays reserved for 16 further cycles, i.e. the same 17
;; cycles of port 0 occupancy as the register form ppro_sse_div_SF.
;; The "memory" test must be "load": testing "none" would repeat the
;; condition of ppro_sse_div_SF and leave the memory-operand form
;; without a matching reservation.
(define_insn_reservation "ppro_sse_div_SF_load" 18
                         (and (eq_attr "cpu" "pentiumpro")
                              (and (eq_attr "memory" "load")
                                   (and (eq_attr "mode" "SF")
                                        (eq_attr "type" "ssediv"))))
                         "decoder0,(p2+p0),p0*16")
578
579(define_insn_reservation "ppro_sse_icvt_SF" 4
7ab91c5f 580 (and (eq_attr "cpu" "pentiumpro")
56bab446
SB
581 (and (eq_attr "mode" "SF")
582 (eq_attr "type" "sseicvt")))
583 "decoder0,(p2+p1)*2")
584
585(define_insn_reservation "ppro_sse_icvt_SI" 3
7ab91c5f 586 (and (eq_attr "cpu" "pentiumpro")
56bab446
SB
587 (and (eq_attr "mode" "SI")
588 (eq_attr "type" "sseicvt")))
589 "decoder0,(p2+p1)")
590
591(define_insn_reservation "ppro_sse_mov_SF" 3
7ab91c5f 592 (and (eq_attr "cpu" "pentiumpro")
56bab446
SB
593 (and (eq_attr "memory" "none")
594 (and (eq_attr "mode" "SF")
595 (eq_attr "type" "ssemov"))))
596 "decoder0,(p0|p1)")
597
598(define_insn_reservation "ppro_sse_mov_SF_load" 3
7ab91c5f 599 (and (eq_attr "cpu" "pentiumpro")
56bab446
SB
600 (and (eq_attr "memory" "load")
601 (and (eq_attr "mode" "SF")
602 (eq_attr "type" "ssemov"))))
603 "decoder0,p2+(p0|p1)")
604
605(define_insn_reservation "ppro_sse_mov_SF_store" 3
7ab91c5f 606 (and (eq_attr "cpu" "pentiumpro")
56bab446
SB
607 (and (eq_attr "memory" "store")
608 (and (eq_attr "mode" "SF")
609 (eq_attr "type" "ssemov"))))
610 "decoder0,p4+p3")
611
612(define_insn_reservation "ppro_sse_V4SF" 4
7ab91c5f 613 (and (eq_attr "cpu" "pentiumpro")
56bab446
SB
614 (and (eq_attr "mode" "V4SF")
615 (eq_attr "type" "sse")))
616 "decoder0,p1*2")
617
618(define_insn_reservation "ppro_sse_add_V4SF" 3
7ab91c5f 619 (and (eq_attr "cpu" "pentiumpro")
56bab446
SB
620 (and (eq_attr "memory" "none")
621 (and (eq_attr "mode" "V4SF")
b790dea2 622 (eq_attr "type" "sseadd,sseadd1"))))
56bab446
SB
623 "decoder0,p1*2")
624
625(define_insn_reservation "ppro_sse_add_V4SF_load" 3
7ab91c5f 626 (and (eq_attr "cpu" "pentiumpro")
56bab446
SB
627 (and (eq_attr "memory" "load")
628 (and (eq_attr "mode" "V4SF")
b790dea2 629 (eq_attr "type" "sseadd,sseadd1"))))
56bab446
SB
630 "decoder0,(p2+p1)*2")
631
632(define_insn_reservation "ppro_sse_cmp_V4SF" 3
7ab91c5f 633 (and (eq_attr "cpu" "pentiumpro")
56bab446
SB
634 (and (eq_attr "memory" "none")
635 (and (eq_attr "mode" "V4SF")
636 (eq_attr "type" "ssecmp"))))
637 "decoder0,p1*2")
638
639(define_insn_reservation "ppro_sse_cmp_V4SF_load" 3
7ab91c5f 640 (and (eq_attr "cpu" "pentiumpro")
56bab446
SB
641 (and (eq_attr "memory" "load")
642 (and (eq_attr "mode" "V4SF")
643 (eq_attr "type" "ssecmp"))))
644 "decoder0,(p2+p1)*2")
645
646(define_insn_reservation "ppro_sse_cvt_V4SF" 3
7ab91c5f 647 (and (eq_attr "cpu" "pentiumpro")
56bab446
SB
648 (and (eq_attr "memory" "none,unknown")
649 (and (eq_attr "mode" "V4SF")
650 (eq_attr "type" "ssecvt"))))
651 "decoder0,p1*2")
652
;; V4SF conversions whose "memory" attribute is neither "none" nor
;; "unknown", i.e. the complement of ppro_sse_cvt_V4SF.  The reservation
;; uses port 1 and then the store ports (p4 + p3), and the insn is
;; restricted to decoder0.  The insn type tested is "ssecvt" so that
;; this entry pairs with ppro_sse_cvt_V4SF rather than overlapping the
;; ssecmp reservations.
(define_insn_reservation "ppro_sse_cvt_V4SF_other" 4
                         (and (eq_attr "cpu" "pentiumpro")
                              (and (eq_attr "memory" "!none,unknown")
                                   (and (eq_attr "mode" "V4SF")
                                        (eq_attr "type" "ssecvt"))))
                         "decoder0,p1,p4+p3")
659
660(define_insn_reservation "ppro_sse_mul_V4SF" 5
7ab91c5f 661 (and (eq_attr "cpu" "pentiumpro")
56bab446
SB
662 (and (eq_attr "memory" "none")
663 (and (eq_attr "mode" "V4SF")
664 (eq_attr "type" "ssemul"))))
665 "decoder0,p0*2")
666
667(define_insn_reservation "ppro_sse_mul_V4SF_load" 5
7ab91c5f 668 (and (eq_attr "cpu" "pentiumpro")
56bab446
SB
669 (and (eq_attr "memory" "load")
670 (and (eq_attr "mode" "V4SF")
671 (eq_attr "type" "ssemul"))))
672 "decoder0,(p2+p0)*2")
673
674;; FIXME: p0 really closed this long???
675(define_insn_reservation "ppro_sse_div_V4SF" 48
7ab91c5f 676 (and (eq_attr "cpu" "pentiumpro")
56bab446
SB
677 (and (eq_attr "memory" "none")
678 (and (eq_attr "mode" "V4SF")
679 (eq_attr "type" "ssediv"))))
680 "decoder0,p0*34")
681
682(define_insn_reservation "ppro_sse_div_V4SF_load" 48
7ab91c5f 683 (and (eq_attr "cpu" "pentiumpro")
ef719a44 684 (and (eq_attr "memory" "load")
56bab446
SB
685 (and (eq_attr "mode" "V4SF")
686 (eq_attr "type" "ssediv"))))
687 "decoder0,(p2+p0)*2,p0*32")
688
689(define_insn_reservation "ppro_sse_log_V4SF" 2
7ab91c5f 690 (and (eq_attr "cpu" "pentiumpro")
56bab446
SB
691 (and (eq_attr "memory" "none")
692 (and (eq_attr "mode" "V4SF")
eb2f2b44 693 (eq_attr "type" "sselog,sselog1,sseshuf,sseshuf1"))))
56bab446
SB
694 "decodern,p1")
695
696(define_insn_reservation "ppro_sse_log_V4SF_load" 2
7ab91c5f 697 (and (eq_attr "cpu" "pentiumpro")
ef719a44 698 (and (eq_attr "memory" "load")
56bab446 699 (and (eq_attr "mode" "V4SF")
eb2f2b44 700 (eq_attr "type" "sselog,sselog1,sseshuf,sseshuf1"))))
56bab446
SB
701 "decoder0,(p2+p1)")
702
703(define_insn_reservation "ppro_sse_mov_V4SF" 1
7ab91c5f 704 (and (eq_attr "cpu" "pentiumpro")
56bab446
SB
705 (and (eq_attr "memory" "none")
706 (and (eq_attr "mode" "V4SF")
707 (eq_attr "type" "ssemov"))))
708 "decoder0,(p0|p1)*2")
709
710(define_insn_reservation "ppro_sse_mov_V4SF_load" 2
7ab91c5f 711 (and (eq_attr "cpu" "pentiumpro")
56bab446
SB
712 (and (eq_attr "memory" "load")
713 (and (eq_attr "mode" "V4SF")
714 (eq_attr "type" "ssemov"))))
715 "decoder0,p2*2")
716
717(define_insn_reservation "ppro_sse_mov_V4SF_store" 3
7ab91c5f 718 (and (eq_attr "cpu" "pentiumpro")
56bab446
SB
719 (and (eq_attr "memory" "store")
720 (and (eq_attr "mode" "V4SF")
721 (eq_attr "type" "ssemov"))))
722 "decoder0,(p4+p3)*2")
723
724;; All other instructions are modelled as simple instructions.
725;; We have already modelled all i387 floating point instructions, so all
726;; other instructions execute on either port 0 or port 1. This includes
727;; the ALU units, and the MMX units.
728;;
729;; reg-reg instructions produce 1 uop so they can be decoded on any of
730;; the three decoders.
731(define_insn_reservation "ppro_insn" 1
7ab91c5f 732 (and (eq_attr "cpu" "pentiumpro")
56bab446 733 (and (eq_attr "memory" "none,unknown")
c1b157a6 734 (eq_attr "type" "alu,alu1,negnot,incdec,icmp,test,setcc,icmov,push,pop,fxch,sseiadd,sseishft,sseishft1,sseimul,mmx,mmxadd,mmxcmp")))
56bab446
SB
735 "decodern,(p0|p1)")
736
737;; read-modify and register-memory instructions have two or three uops,
738;; so they have to be decoded on decoder0.
739(define_insn_reservation "ppro_insn_load" 3
7ab91c5f 740 (and (eq_attr "cpu" "pentiumpro")
56bab446 741 (and (eq_attr "memory" "load")
c1b157a6 742 (eq_attr "type" "alu,alu1,negnot,incdec,icmp,test,setcc,icmov,push,pop,fxch,sseiadd,sseishft,sseishft1,sseimul,mmx,mmxadd,mmxcmp")))
56bab446
SB
743 "decoder0,p2+(p0|p1)")
744
745(define_insn_reservation "ppro_insn_store" 1
7ab91c5f 746 (and (eq_attr "cpu" "pentiumpro")
56bab446 747 (and (eq_attr "memory" "store")
c1b157a6 748 (eq_attr "type" "alu,alu1,negnot,incdec,icmp,test,setcc,icmov,push,pop,fxch,sseiadd,sseishft,sseishft1,sseimul,mmx,mmxadd,mmxcmp")))
56bab446
SB
749 "decoder0,(p0|p1),p4+p3")
750
751;; read-modify-store instructions produce 4 uops so they have to be
752;; decoded on decoder0 as well.
753(define_insn_reservation "ppro_insn_both" 4
7ab91c5f 754 (and (eq_attr "cpu" "pentiumpro")
56bab446 755 (and (eq_attr "memory" "both")
c1b157a6 756 (eq_attr "type" "alu,alu1,negnot,incdec,icmp,test,setcc,icmov,push,pop,fxch,sseiadd,sseishft,sseishft1,sseimul,mmx,mmxadd,mmxcmp")))
56bab446
SB
757 "decoder0,p2+(p0|p1),p4+p3")
758