]>
Commit | Line | Data |
---|---|---|
af2728a4 | 1 | ;; AMD K6/K6-2 Scheduling |
2f83c7d6 | 2 | ;; Copyright (C) 2002, 2004, 2007 |
8695f61e | 3 | ;; Free Software Foundation, Inc. |
af2728a4 | 4 | ;; |
188fc5b5 | 5 | ;; This file is part of GCC. |
af2728a4 | 6 | ;; |
188fc5b5 | 7 | ;; GCC is free software; you can redistribute it and/or modify |
af2728a4 | 8 | ;; it under the terms of the GNU General Public License as published by |
2f83c7d6 | 9 | ;; the Free Software Foundation; either version 3, or (at your option) |
af2728a4 JL |
10 | ;; any later version. |
11 | ;; | |
188fc5b5 | 12 | ;; GCC is distributed in the hope that it will be useful, |
af2728a4 JL |
13 | ;; but WITHOUT ANY WARRANTY; without even the implied warranty of |
14 | ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
15 | ;; GNU General Public License for more details. | |
16 | ;; | |
17 | ;; You should have received a copy of the GNU General Public License | |
2f83c7d6 NC |
18 | ;; along with GCC; see the file COPYING3. If not see |
19 | ;; <http://www.gnu.org/licenses/>. | |
af2728a4 | 20 | ;; |
8695f61e SB |
21 | ;; The K6 architecture is quite similar to PPro. An important difference is |
22 | ;; that there are only two decoders and they seem to be much slower than | |
23 | ;; any of the execution units. So we have to pay much more attention to | |
24 | ;; proper scheduling for the decoders. | |
25 | ;; FIXME: We don't do that right now. A good start would be to sort the | |
26 | ;; instructions based on length. | |
af2728a4 | 27 | ;; |
8695f61e | 28 | ;; This description is based on data from the following documents: |
af2728a4 | 29 | ;; |
8695f61e SB |
30 | ;; "AMD-K6 Processor Data Sheet (Preliminary information)" |
31 | ;; Advanced Micro Devices, Inc., 1998. | |
32 | ;; | |
33 | ;; "AMD-K6 Processor Code Optimization Application Note" | |
34 | ;; Advanced Micro Devices, Inc., 2000. | |
35 | ;; | |
36 | ;; CPU execution units of the K6: | |
37 | ;; | |
38 | ;; store describes the Store unit. This unit is not modelled | |
39 | ;; completely and it is only used to model the lea operation. | |
40 | ;; Otherwise it lies outside of any critical path. | |
41 | ;; load describes the Load unit | |
42 | ;; alux describes the Integer X unit | |
43 | ;; mm describes the Multimedia unit, which shares a pipe | |
44 | ;; with the Integer X unit. This unit is used for MMX, | |
45 | ;; which is not implemented for K6. | |
46 | ;; aluy describes the Integer Y unit | |
47 | ;; fpu describes the FPU unit | |
48 | ;; branch describes the Branch unit | |
49 | ;; | |
50 | ;; The fp unit is not pipelined, and it can only do one operation per two | |
51 | ;; cycles, including fxch. | |
52 | ;; | |
53 | ;; Generally this is a very poor description, but at least no worse than | |
54 | ;; the old description, and a lot easier to extend to something more | |
55 | ;; reasonable if anyone still cares enough about this architecture in 2004. | |
af2728a4 JL |
56 | ;; |
57 | ;; ??? fxch isn't handled; not an issue until sched3 after reg-stack is real. | |
58 | ||
8695f61e SB |
59 | (define_automaton "k6_decoder,k6_load_unit,k6_store_unit,k6_integer_units,k6_fpu_unit,k6_branch_unit") ;; Declares the six automata that the CPU units defined below are assigned to. |
60 | ||
61 | ;; The K6 instruction decoding begins before the on-chip instruction cache is | |
62 | ;; filled. Depending on the length of the instruction, two simple instructions | |
63 | ;; can be decoded in two parallel short decoders, or one complex instruction can | |
64 | ;; be decoded in either the long or the vector decoder. For all practical | |
65 | ;; purposes, the long and vector decoder can be modelled as one decoder. | |
66 | (define_cpu_unit "k6_decode_short0" "k6_decoder") ;; first short decoder | |
67 | (define_cpu_unit "k6_decode_short1" "k6_decoder") ;; second short decoder | |
68 | (define_cpu_unit "k6_decode_long" "k6_decoder") ;; single unit standing in for both the long and the vector decoder | |
69 | (exclusion_set "k6_decode_long" "k6_decode_short0,k6_decode_short1") ;; the long decoder cannot be reserved in the same cycle as either short decoder | |
70 | (define_reservation "k6_decode_short" "k6_decode_short0|k6_decode_short1") ;; a short decode takes either one of the two short decoders | |
71 | (define_reservation "k6_decode_vector" "k6_decode_long") ;; vector decodes are modelled as a reservation of the long decoder | |
72 | ||
73 | (define_cpu_unit "k6_store" "k6_store_unit") ;; Store unit | |
74 | (define_cpu_unit "k6_load" "k6_load_unit") ;; Load unit | |
75 | (define_cpu_unit "k6_alux,k6_aluy" "k6_integer_units") ;; Integer X and Integer Y units, tracked in one shared automaton | |
76 | (define_cpu_unit "k6_fpu" "k6_fpu_unit") ;; FPU unit | |
77 | (define_cpu_unit "k6_branch" "k6_branch_unit") ;; Branch unit | |
78 | ||
79 | ;; Shift instructions and certain arithmetic are issued only on Integer X. | |
80 | (define_insn_reservation "k6_alux_only" 1 ;; no-memory form: latency 1; short decode, then one cycle on Integer X | |
81 | (and (eq_attr "cpu" "k6") | |
90c56b45 | 82 | (and (eq_attr "type" "ishift,ishift1,rotate,rotate1,alu1,negnot") |
8695f61e SB |
83 | (eq_attr "memory" "none"))) |
84 | "k6_decode_short,k6_alux") | |
85 | ||
86 | (define_insn_reservation "k6_alux_only_load" 3 ;; load form: latency 3; short decode, Load unit, then Integer X | |
87 | (and (eq_attr "cpu" "k6") | |
90c56b45 | 88 | (and (eq_attr "type" "ishift,ishift1,rotate,rotate1,alu1,negnot") |
8695f61e SB |
89 | (eq_attr "memory" "load"))) |
90 | "k6_decode_short,k6_load,k6_alux") | |
91 | ||
92 | (define_insn_reservation "k6_alux_only_store" 3 ;; store, both and unknown memory forms: latency 3; long decode, Load, Integer X, then Store | |
93 | (and (eq_attr "cpu" "k6") | |
90c56b45 | 94 | (and (eq_attr "type" "ishift,ishift1,rotate,rotate1,alu1,negnot") |
8695f61e SB |
95 | (eq_attr "memory" "store,both,unknown"))) |
96 | "k6_decode_long,k6_load,k6_alux,k6_store") | |
97 | ||
98 | ;; Integer divide and multiply can only be issued on Integer X, too. | |
99 | (define_insn_reservation "k6_alu_imul" 2 ;; no-memory multiply: latency 2; vector decode, then Integer X held for 3 cycles | |
100 | (and (eq_attr "cpu" "k6") | |
101 | (and (eq_attr "type" "imul") (eq_attr "memory" "none"))) ;; limited to memory "none" so that k6_alu_imul_load and k6_alu_imul_store are not shadowed by this earlier match | |
102 | "k6_decode_vector,k6_alux*3") | |
103 | ||
104 | (define_insn_reservation "k6_alu_imul_load" 4 ;; multiply with a load: latency 4; vector decode, Load unit, then Integer X held for 3 cycles | |
105 | (and (eq_attr "cpu" "k6") | |
106 | (and (eq_attr "type" "imul") | |
107 | (eq_attr "memory" "load"))) | |
108 | "k6_decode_vector,k6_load,k6_alux*3") | |
109 | ||
110 | (define_insn_reservation "k6_alu_imul_store" 4 ;; multiply with store, both or unknown memory: latency 4; vector decode, Load, Integer X for 3 cycles, then Store | |
111 | (and (eq_attr "cpu" "k6") | |
112 | (and (eq_attr "type" "imul") | |
113 | (eq_attr "memory" "store,both,unknown"))) | |
114 | "k6_decode_vector,k6_load,k6_alux*3,k6_store") | |
115 | ||
116 | ;; ??? Guessed latencies based on the old pipeline description. | |
117 | (define_insn_reservation "k6_alu_idiv" 17 ;; no-memory divide: latency 17; vector decode, then Integer X held for 17 cycles | |
118 | (and (eq_attr "cpu" "k6") | |
119 | (and (eq_attr "type" "idiv") | |
120 | (eq_attr "memory" "none"))) | |
121 | "k6_decode_vector,k6_alux*17") | |
122 | ||
123 | (define_insn_reservation "k6_alu_idiv_mem" 19 ;; divide with any memory class other than "none": latency 19; vector decode, Load, then Integer X held for 17 cycles | |
124 | (and (eq_attr "cpu" "k6") | |
125 | (and (eq_attr "type" "idiv") | |
126 | (eq_attr "memory" "!none"))) | |
127 | "k6_decode_vector,k6_load,k6_alux*17") | |
128 | ||
129 | ;; Basic word and doubleword ALU ops can be issued on both Integer units. | |
130 | (define_insn_reservation "k6_alu" 1 ;; no-memory form: latency 1; short decode, then either integer unit | |
131 | (and (eq_attr "cpu" "k6") | |
132 | (and (eq_attr "type" "alu,alu1,negnot,icmp,test,imovx,incdec,setcc") ;; NOTE(review): alu1 and negnot are also listed by k6_alux_only earlier in this file; confirm which reservation is meant to apply to them | |
133 | (eq_attr "memory" "none"))) | |
134 | "k6_decode_short,k6_alux|k6_aluy") | |
135 | ||
136 | (define_insn_reservation "k6_alu_load" 3 ;; load form: latency 3; short decode, Load unit, then either integer unit | |
137 | (and (eq_attr "cpu" "k6") | |
138 | (and (eq_attr "type" "alu,alu1,negnot,icmp,test,imovx,incdec,setcc") | |
139 | (eq_attr "memory" "load"))) | |
140 | "k6_decode_short,k6_load,k6_alux|k6_aluy") | |
141 | ||
142 | (define_insn_reservation "k6_alu_store" 3 ;; store, both and unknown memory forms: latency 3; long decode, Load, either integer unit, then Store | |
143 | (and (eq_attr "cpu" "k6") | |
144 | (and (eq_attr "type" "alu,alu1,negnot,icmp,test,imovx,incdec,setcc") | |
145 | (eq_attr "memory" "store,both,unknown"))) | |
146 | "k6_decode_long,k6_load,k6_alux|k6_aluy,k6_store") | |
147 | ||
148 | ;; A "load immediate" operation does not require execution at all, | |
149 | ;; it is available immediately after decoding. Special-case this. | |
150 | (define_insn_reservation "k6_alu_imov" 1 ;; move with no memory access whose operand 1 is not an immediate: latency 1; short decode, then either integer unit | |
151 | (and (eq_attr "cpu" "k6") | |
152 | (and (eq_attr "type" "imov") | |
153 | (and (eq_attr "memory" "none") | |
154 | (match_operand 1 "nonimmediate_operand")))) | |
155 | "k6_decode_short,k6_alux|k6_aluy") | |
156 | ||
157 | (define_insn_reservation "k6_alu_imov_imm" 0 ;; move of an immediate: latency 0; only a short decoder is reserved, no execution unit | |
158 | (and (eq_attr "cpu" "k6") | |
159 | (and (eq_attr "type" "imov") | |
160 | (and (eq_attr "memory" "none") | |
161 | (match_operand 1 "immediate_operand")))) | |
162 | "k6_decode_short") | |
163 | ||
164 | (define_insn_reservation "k6_alu_imov_load" 2 ;; move from memory: latency 2; short decode, then the Load unit | |
165 | (and (eq_attr "cpu" "k6") | |
166 | (and (eq_attr "type" "imov") | |
167 | (eq_attr "memory" "load"))) | |
168 | "k6_decode_short,k6_load") | |
169 | ||
170 | (define_insn_reservation "k6_alu_imov_store" 1 ;; move to memory: latency 1; short decode, then the Store unit | |
171 | (and (eq_attr "cpu" "k6") | |
172 | (and (eq_attr "type" "imov") | |
173 | (eq_attr "memory" "store"))) | |
174 | "k6_decode_short,k6_store") | |
175 | ||
176 | (define_insn_reservation "k6_alu_imov_both" 2 ;; move with both or unknown memory class: latency 2; long decode, Load, then either integer unit (no Store unit is reserved) | |
177 | (and (eq_attr "cpu" "k6") | |
178 | (and (eq_attr "type" "imov") | |
179 | (eq_attr "memory" "both,unknown"))) | |
180 | "k6_decode_long,k6_load,k6_alux|k6_aluy") | |
181 | ||
182 | ;; The branch unit. | |
183 | (define_insn_reservation "k6_branch_call" 1 ;; call and callv: latency 1; vector decode, then the Branch unit | |
184 | (and (eq_attr "cpu" "k6") | |
185 | (eq_attr "type" "call,callv")) | |
186 | "k6_decode_vector,k6_branch") | |
187 | ||
188 | (define_insn_reservation "k6_branch_branch" 1 ;; ibr: latency 1; short decode, then the Branch unit | |
189 | (and (eq_attr "cpu" "k6") | |
190 | (eq_attr "type" "ibr")) | |
191 | "k6_decode_short,k6_branch") | |
192 | ||
193 | ;; The load and store units have two pipeline stages. The load latency is | |
194 | ;; two cycles. | |
195 | (define_insn_reservation "k6_load_pop" 3 ;; latency 3; short decode, then the Load unit | |
196 | (and (eq_attr "cpu" "k6") | |
197 | (ior (eq_attr "type" "pop") ;; matches type pop, or (next row) any insn at all whose memory class is load or both | |
198 | (eq_attr "memory" "load,both"))) ;; NOTE(review): this test ignores the insn type, so later reservations that also require memory load/both may never be selected; confirm this catch-all ordering is intended | |
199 | "k6_decode_short,k6_load") | |
200 | ||
201 | (define_insn_reservation "k6_load_leave" 5 ;; leave: latency 5; long decode, Load unit, then an integer unit for 2 cycles | |
202 | (and (eq_attr "cpu" "k6") | |
203 | (eq_attr "type" "leave")) | |
204 | "k6_decode_long,k6_load,(k6_alux|k6_aluy)*2") | |
205 | ||
206 | ;; ??? From the old pipeline description. Egad! | |
207 | ;; ??? Apparently we take care of this reservation in adjust_cost. | |
208 | (define_insn_reservation "k6_load_str" 10 ;; string insn with memory load or both: latency 10; vector decode, then the Load unit held for 10 cycles | |
209 | (and (eq_attr "cpu" "k6") | |
210 | (and (eq_attr "type" "str") | |
211 | (eq_attr "memory" "load,both"))) | |
212 | "k6_decode_vector,k6_load*10") | |
213 | ||
214 | ;; The store unit handles lea and push. It is otherwise unmodelled. | |
215 | (define_insn_reservation "k6_store_lea" 2 ;; lea: latency 2; short decode, Store unit, then either integer unit | |
216 | (and (eq_attr "cpu" "k6") | |
217 | (eq_attr "type" "lea")) | |
218 | "k6_decode_short,k6_store,k6_alux|k6_aluy") | |
219 | ||
220 | (define_insn_reservation "k6_store_push" 2 ;; latency 2; short decode, then the Store unit | |
221 | (and (eq_attr "cpu" "k6") | |
222 | (ior (eq_attr "type" "push") ;; matches type push, or (next row) any insn at all whose memory class is store or both | |
223 | (eq_attr "memory" "store,both"))) ;; NOTE(review): this test ignores the insn type, so it acts as a catch-all for stores not matched by an earlier reservation; confirm that is intended | |
224 | "k6_decode_short,k6_store") | |
225 | ||
226 | (define_insn_reservation "k6_store_str" 10 ;; any string insn not matched earlier: latency 10; Store unit held for 10 cycles, with no decoder reserved | |
227 | (and (eq_attr "cpu" "k6") | |
228 | (eq_attr "type" "str")) | |
229 | "k6_store*10") | |
230 | ||
231 | ;; Most FPU instructions have latency 2 and throughput 2. | |
232 | (define_insn_reservation "k6_fpu" 2 ;; no-memory FP op, move, compare or fistp: latency 2; FPU held for 2 cycles | |
233 | (and (eq_attr "cpu" "k6") | |
234 | (and (eq_attr "type" "fop,fmov,fcmp,fistp") | |
235 | (eq_attr "memory" "none"))) | |
236 | "k6_decode_vector,k6_fpu*2") ;; NOTE(review): this is the only FPU reservation that uses k6_decode_vector; the others use k6_decode_short, so confirm it is intended | |
237 | ||
238 | (define_insn_reservation "k6_fpu_load" 6 ;; FP op with memory load or both: latency 6; short decode, Load unit, then FPU held for 2 cycles | |
239 | (and (eq_attr "cpu" "k6") | |
240 | (and (eq_attr "type" "fop,fmov,fcmp,fistp") | |
241 | (eq_attr "memory" "load,both"))) | |
242 | "k6_decode_short,k6_load,k6_fpu*2") | |
243 | ||
244 | (define_insn_reservation "k6_fpu_store" 6 ;; FP op with memory store: latency 6; short decode, Store unit, then FPU held for 2 cycles | |
245 | (and (eq_attr "cpu" "k6") | |
246 | (and (eq_attr "type" "fop,fmov,fcmp,fistp") | |
247 | (eq_attr "memory" "store"))) | |
248 | "k6_decode_short,k6_store,k6_fpu*2") | |
249 | ||
250 | (define_insn_reservation "k6_fpu_fmul" 2 ;; no-memory FP multiply: latency 2; short decode, then FPU held for 2 cycles | |
251 | (and (eq_attr "cpu" "k6") | |
252 | (and (eq_attr "type" "fmul") | |
253 | (eq_attr "memory" "none"))) | |
254 | "k6_decode_short,k6_fpu*2") | |
255 | ||
256 | (define_insn_reservation "k6_fpu_fmul_load" 2 ;; FP multiply with memory load or both: short decode, Load unit, then FPU held for 2 cycles. NOTE(review): latency 2 equals the no-memory k6_fpu_fmul, whereas k6_fpu_load uses 6; confirm | |
257 | (and (eq_attr "cpu" "k6") | |
258 | (and (eq_attr "type" "fmul") | |
259 | (eq_attr "memory" "load,both"))) | |
260 | "k6_decode_short,k6_load,k6_fpu*2") | |
261 | ||
262 | ;; ??? Guessed latencies from the old pipeline description. | |
263 | (define_insn_reservation "k6_fpu_expensive" 56 ;; fdiv and fpspc, any memory class: latency 56; short decode, then FPU held for 56 cycles | |
264 | (and (eq_attr "cpu" "k6") | |
265 | (eq_attr "type" "fdiv,fpspc")) | |
266 | "k6_decode_short,k6_fpu*56") | |
267 |