]>
Commit | Line | Data |
---|---|---|
d296e02e | 1 | ;; Scheduling description for cell processor. |
47f67e51 | 2 | ;; Copyright (C) 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2009 |
d296e02e AP |
3 | ;; Free Software Foundation, Inc. |
4 | ;; Contributed by Sony Computer Entertainment, Inc., | |
5 | ||
6 | ||
7 | ;; This file is free software; you can redistribute it and/or modify it under | |
8 | ;; the terms of the GNU General Public License as published by the Free | |
2f83c7d6 | 9 | ;; Software Foundation; either version 3 of the License, or (at your option) |
d296e02e AP |
10 | ;; any later version. |
11 | ||
12 | ;; This file is distributed in the hope that it will be useful, but WITHOUT | |
13 | ;; ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or | |
14 | ;; FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License | |
15 | ;; for more details. | |
16 | ||
17 | ;; You should have received a copy of the GNU General Public License | |
2f83c7d6 NC |
18 | ;; along with GCC; see the file COPYING3. If not see |
19 | ;; <http://www.gnu.org/licenses/>. | |
d296e02e AP |
20 | |
21 | ;; Sources: BE BOOK4 (/sfs/enc/doc/PPU_BookIV_DD3.0_latest.pdf) | |
22 | ||
2f8e468b | 23 | ;; BE Architecture *DD3.0 and DD3.1* |
d296e02e AP |
24 | ;; This file simulates the PPU processor unit backend of the pipeline, manual P24. |
25 | ;; manual P27, stall and flush points | |
2f8e468b | 26 | ;; IU, XU, VSU, dispatcher decodes and dispatch 2 insns per cycle in program |
9f5ed61a | 27 | ;; order, the grouped address are aligned by 8 |
d296e02e AP |
28 | ;; This file only simulates the single-thread situation |
29 | ;; XU executes all fixed point insns(3 units, a simple alu, a complex unit, | |
30 | ;; and load/store unit) | |
31 | ;; VSU executes all scalar floating points insn(a float unit), | |
32 | ;; VMX insns(VMX unit, 4 sub units, simple, permute, complex, floating point) | |
33 | ||
34 | ;; Dual issue combination | |
35 | ||
36 | ;; FXU LSU BR VMX VMX | |
37 | ;; (sx,cx,vsu_fp,fp_arith) (perm,vsu_ls,fp_ls) | |
38 | ;;FXU X | |
39 | ;;LSU X X X | |
40 | ;;BR X | |
41 | ;;VMX(sx,cx,vsu_fp,fp_arith) X | |
42 | ;;VMX(perm,vsu_ls, fp_ls) X | |
43 | ;; X are illegal combination. | |
44 | ||
2f8e468b | 45 | ;; Dual issue exceptions: |
d296e02e AP |
46 | ;;(1) non-pipelined FXU instr in slot 0 |
47 | ;;(2) non-pipelined FPU inst in slot 0 | |
48 | ;; CSI instr(context-synchronizing insn) | |
49 | ;; Microcode insn | |
50 | ||
51 | ;; BRU unit: bru(none register stall), bru_cr(cr register stall) | |
52 | ;; VSU unit: vus(vmx simple), vup(vmx permute), vuc(vmx complex), | |
2f8e468b | 53 | ;; vuf(vmx float), fpu(floats). fpu_div is hypothetical, it is for |
d296e02e AP |
54 | ;; nonpipelined simulation |
55 | ;; micro insns will stall at least 7 cycles to get the first instr from ROM, | |
56 | ;; micro instructions are not dual issued. | |
57 | ||
58 | ;; slot0 is older than slot1 | |
59 | ;; non-pipelined insn need to be in slot1 to avoid 1cycle stall | |
60 | ||
61 | ;; There are different stall points | |
62 | ;; IB2, only stall one thread if stall here, so try to stall here as much as | |
63 | ;; we can | |
64 | ;; condition(1) insert nop, OR and ORI instruction form | |
65 | ;; condition(2) flush happens, in case of: RAW, WAW, D-ERAT miss, or | |
66 | ;; CR0-access while stdcx, or stwcx | |
67 | ;; IS2 stall ;; Page91 for details | |
68 | ;; VQ8 stall | |
69 | ;; IS2 stall can be activated by VQ8 stall and trying to issue a vsu instr to | |
70 | ;; the vsu issue queue | |
71 | ||
72 | ;;(define_automaton "cellxu") | |
73 | ||
74 | ;;(define_cpu_unit "fxu_cell,lsu_cell,bru_cell,vsu1_cell,vsu2_cell" "cellxu") | |
75 | ||
76 | ;; ndfa | |
77 | (define_automaton "cellxu,cellvsu,cellbru,cell_mis") | |
78 | ||
79 | (define_cpu_unit "fxu_cell,lsu_cell" "cellxu") | |
80 | (define_cpu_unit "bru_cell" "cellbru") | |
81 | (define_cpu_unit "vsu1_cell,vsu2_cell" "cellvsu") | |
82 | ||
83 | (define_cpu_unit "slot0,slot1" "cell_mis") | |
84 | ||
85 | (absence_set "slot0" "slot1") | |
86 | ||
87 | (define_reservation "nonpipeline" "fxu_cell+lsu_cell+vsu1_cell+vsu2_cell") | |
88 | (define_reservation "slot01" "slot0|slot1") | |
89 | ||
90 | ||
91 | ;; Load/store | |
92 | ;; lmw, lswi, lswx are only generated when optimizing for space, MC, | |
93 | ;; these instr are not simulated | |
94 | (define_insn_reservation "cell-load" 2 | |
95 | (and (eq_attr "type" "load") | |
96 | (eq_attr "cpu" "cell")) | |
97 | "slot01,lsu_cell") | |
98 | ||
99 | ;; ldux, ldu, lbzux, lbzu, hardware breaks it down to two instrs, | |
100 | ;; if with 32bytes alignment, CMC | |
101 | (define_insn_reservation "cell-load-ux" 2 | |
102 | (and (eq_attr "type" "load_ux,load_u") | |
103 | (eq_attr "cpu" "cell")) | |
104 | "slot01,fxu_cell+lsu_cell") | |
105 | ||
106 | ;; lha, lhax, lhau, lhaux, lwa, lwax, lwaux, MC, latency unknown | |
107 | ;; 11/7, 11/8, 11/12 | |
108 | (define_insn_reservation "cell-load-ext" 2 | |
109 | (and (eq_attr "type" "load_ext,load_ext_u,load_ext_ux") | |
110 | (eq_attr "cpu" "cell")) | |
111 | "slot01,fxu_cell+lsu_cell") | |
112 | ||
113 | ;;lfs,lfsx,lfd,lfdx, 1 cycle | |
114 | (define_insn_reservation "cell-fpload" 1 | |
115 | (and (eq_attr "type" "fpload") | |
116 | (eq_attr "cpu" "cell")) | |
117 | "vsu2_cell+lsu_cell+slot01") | |
118 | ||
119 | ;; lfsu,lfsux,lfdu,lfdux 1cycle(fpr) 2 cycle(gpr); plain fpload is covered by cell-fpload | |
120 | (define_insn_reservation "cell-fpload-update" 1 | |
121 | (and (eq_attr "type" "fpload_u,fpload_ux") | |
122 | (eq_attr "cpu" "cell")) | |
123 | "fxu_cell+vsu2_cell+lsu_cell+slot01") | |
124 | ||
125 | (define_insn_reservation "cell-vecload" 2 | |
126 | (and (eq_attr "type" "vecload") | |
127 | (eq_attr "cpu" "cell")) | |
128 | "slot01,vsu2_cell+lsu_cell") | |
129 | ||
130 | ;;st? stw(MC) | |
131 | (define_insn_reservation "cell-store" 1 | |
132 | (and (eq_attr "type" "store") | |
133 | (eq_attr "cpu" "cell")) | |
134 | "lsu_cell+slot01") | |
135 | ||
136 | ;;stdux, stdu, (hardware breaks into store and add) 2 for update reg | |
137 | (define_insn_reservation "cell-store-update" 1 | |
138 | (and (eq_attr "type" "store_ux,store_u") | |
139 | (eq_attr "cpu" "cell")) | |
140 | "fxu_cell+lsu_cell+slot01") | |
141 | ||
142 | (define_insn_reservation "cell-fpstore" 1 | |
143 | (and (eq_attr "type" "fpstore") | |
144 | (eq_attr "cpu" "cell")) | |
145 | "vsu2_cell+lsu_cell+slot01") | |
146 | ||
147 | (define_insn_reservation "cell-fpstore-update" 1 | |
148 | (and (eq_attr "type" "fpstore_ux,fpstore_u") | |
149 | (eq_attr "cpu" "cell")) | |
150 | "vsu2_cell+fxu_cell+lsu_cell+slot01") | |
151 | ||
152 | (define_insn_reservation "cell-vecstore" 1 | |
153 | (and (eq_attr "type" "vecstore") | |
154 | (eq_attr "cpu" "cell")) | |
155 | "vsu2_cell+lsu_cell+slot01") | |
156 | ||
157 | ;; Integer latency is 2 cycles | |
158 | (define_insn_reservation "cell-integer" 2 | |
159 | (and (eq_attr "type" "integer,insert_dword,shift,trap,\ | |
47f67e51 | 160 | var_shift_rotate,cntlz,exts,isel") |
d296e02e AP |
161 | (eq_attr "cpu" "cell")) |
162 | "slot01,fxu_cell") | |
163 | ||
164 | ;; Two integer latency is 4 cycles | |
165 | (define_insn_reservation "cell-two" 4 | |
166 | (and (eq_attr "type" "two") | |
167 | (eq_attr "cpu" "cell")) | |
168 | "slot01,fxu_cell,fxu_cell*2") | |
169 | ||
170 | ;; Three integer latency is 6 cycles | |
171 | (define_insn_reservation "cell-three" 6 | |
172 | (and (eq_attr "type" "three") | |
173 | (eq_attr "cpu" "cell")) | |
174 | "slot01,fxu_cell,fxu_cell*4") | |
175 | ||
176 | ;; rlwimi, alter cr0 | |
177 | (define_insn_reservation "cell-insert" 2 | |
178 | (and (eq_attr "type" "insert_word") | |
179 | (eq_attr "cpu" "cell")) | |
180 | "slot01,fxu_cell") | |
181 | ||
182 | ;; cmpi, cmpli, cmpla, add, addo, sub, subo, alter cr0 | |
183 | (define_insn_reservation "cell-cmp" 1 | |
184 | (and (eq_attr "type" "cmp") | |
185 | (eq_attr "cpu" "cell")) | |
186 | "fxu_cell+slot01") | |
187 | ||
188 | ;; add, addo, sub, subo, alter cr0, rldcli, rlwinm | |
189 | (define_insn_reservation "cell-fast-cmp" 2 | |
190 | (and (and (eq_attr "type" "fast_compare,delayed_compare,compare,\ | |
191 | var_delayed_compare") | |
192 | (eq_attr "cpu" "cell")) | |
193 | (eq_attr "cell_micro" "not")) | |
194 | "slot01,fxu_cell") | |
195 | ||
196 | (define_insn_reservation "cell-cmp-microcoded" 9 | |
197 | (and (and (eq_attr "type" "fast_compare,delayed_compare,compare,\ | |
198 | var_delayed_compare") | |
199 | (eq_attr "cpu" "cell")) | |
200 | (eq_attr "cell_micro" "always")) | |
201 | "slot0+slot1,fxu_cell,fxu_cell*7") | |
202 | ||
203 | ;; mulld | |
204 | (define_insn_reservation "cell-lmul" 15 | |
205 | (and (eq_attr "type" "lmul") | |
206 | (eq_attr "cpu" "cell")) | |
207 | "slot1,nonpipeline,nonpipeline*13") | |
208 | ||
209 | ;; mulld. is microcoded | |
210 | (define_insn_reservation "cell-lmul-cmp" 22 | |
211 | (and (eq_attr "type" "lmul_compare") | |
212 | (eq_attr "cpu" "cell")) | |
213 | "slot0+slot1,nonpipeline,nonpipeline*20") | |
214 | ||
215 | ;; mulli, 6 cycles | |
216 | (define_insn_reservation "cell-imul23" 6 | |
217 | (and (eq_attr "type" "imul2,imul3") | |
218 | (eq_attr "cpu" "cell")) | |
219 | "slot1,nonpipeline,nonpipeline*4") | |
220 | ||
221 | ;; mullw, 9 | |
222 | (define_insn_reservation "cell-imul" 9 | |
223 | (and (eq_attr "type" "imul") | |
224 | (eq_attr "cpu" "cell")) | |
225 | "slot1,nonpipeline,nonpipeline*7") | |
226 | ||
227 | ;; divide | |
228 | (define_insn_reservation "cell-idiv" 32 | |
229 | (and (eq_attr "type" "idiv") | |
230 | (eq_attr "cpu" "cell")) | |
231 | "slot1,nonpipeline,nonpipeline*30") | |
232 | ||
233 | (define_insn_reservation "cell-ldiv" 64 | |
234 | (and (eq_attr "type" "ldiv") | |
235 | (eq_attr "cpu" "cell")) | |
236 | "slot1,nonpipeline,nonpipeline*62") | |
237 | ||
238 | ;;mflr and mfctr are pipelined | |
239 | (define_insn_reservation "cell-mfjmpr" 1 | |
240 | (and (eq_attr "type" "mfjmpr") | |
241 | (eq_attr "cpu" "cell")) | |
242 | "slot01+bru_cell") | |
243 | ||
244 | ;;mtlr and mtctr, | |
245 | ;;mtspr fully pipelined | |
246 | (define_insn_reservation "cell-mtjmpr" 1 | |
247 | (and (eq_attr "type" "mtjmpr") | |
248 | (eq_attr "cpu" "cell")) | |
249 | "bru_cell+slot01") | |
250 | ||
251 | ;; Branches | |
252 | ;; b, ba, bl, bla, unconditional branch always predicts correctly n/a latency | |
253 | ;; bcctr, bcctrl, latency 2, actually adjust by be to 4 | |
254 | (define_insn_reservation "cell-branch" 1 | |
255 | (and (eq_attr "type" "branch") | |
256 | (eq_attr "cpu" "cell")) | |
257 | "bru_cell+slot1") | |
258 | ||
259 | (define_insn_reservation "cell-branchreg" 1 | |
260 | (and (eq_attr "type" "jmpreg") | |
261 | (eq_attr "cpu" "cell")) | |
262 | "bru_cell+slot1") | |
263 | ||
264 | ;; cr hazard | |
265 | ;; page 90, special cases for CR hazard, only one instr can access cr per cycle | |
266 | ;; if insn reads CR following a stwcx, pipeline stall till stwcx finish | |
267 | (define_insn_reservation "cell-crlogical" 1 | |
268 | (and (eq_attr "type" "cr_logical,delayed_cr") | |
269 | (eq_attr "cpu" "cell")) | |
270 | "bru_cell+slot01") | |
271 | ||
272 | ;; mfcrf and mfcr is about 34 cycles and nonpipelined | |
273 | (define_insn_reservation "cell-mfcr" 34 | |
274 | (and (eq_attr "type" "mfcrf,mfcr") | |
275 | (eq_attr "cpu" "cell")) | |
276 | "slot1,nonpipeline,nonpipeline*32") | |
277 | ||
278 | ;; mtcrf (1 field) | |
279 | (define_insn_reservation "cell-mtcrf" 1 | |
280 | (and (eq_attr "type" "mtcr") | |
281 | (eq_attr "cpu" "cell")) | |
282 | "fxu_cell+slot01") | |
283 | ||
284 | ; Basic FP latency is 10 cycles, throughput is 1/cycle | |
285 | (define_insn_reservation "cell-fp" 10 | |
286 | (and (eq_attr "type" "fp,dmul") | |
287 | (eq_attr "cpu" "cell")) | |
288 | "slot01,vsu1_cell,vsu1_cell*8") | |
289 | ||
290 | (define_insn_reservation "cell-fpcompare" 1 | |
291 | (and (eq_attr "type" "fpcompare") | |
292 | (eq_attr "cpu" "cell")) | |
293 | "vsu1_cell+slot01") | |
294 | ||
295 | ;; sdiv/ddiv throughput 1/74, not pipelined but only in the FPU; NOTE(review): the "nonpipeline" reservation also holds fxu_cell, lsu_cell and vsu2_cell -- confirm this is intended | |
296 | (define_insn_reservation "cell-sdiv" 74 | |
297 | (and (eq_attr "type" "sdiv,ddiv") | |
298 | (eq_attr "cpu" "cell")) | |
299 | "slot1,nonpipeline,nonpipeline*72") | |
300 | ||
301 | ;; fsqrt (ssqrt/dsqrt) throughput 1/84, not pipelined but only in the FPU; NOTE(review): the "nonpipeline" reservation also holds fxu_cell, lsu_cell and vsu2_cell -- confirm this is intended | |
302 | (define_insn_reservation "cell-sqrt" 84 | |
303 | (and (eq_attr "type" "ssqrt,dsqrt") | |
304 | (eq_attr "cpu" "cell")) | |
305 | "slot1,nonpipeline,nonpipeline*82") | |
306 | ||
307 | ; VMX | |
308 | (define_insn_reservation "cell-vecsimple" 4 | |
309 | (and (eq_attr "type" "vecsimple") | |
310 | (eq_attr "cpu" "cell")) | |
311 | "slot01,vsu1_cell,vsu1_cell*2") | |
312 | ||
313 | ;; mult, div, madd | |
314 | (define_insn_reservation "cell-veccomplex" 10 | |
315 | (and (eq_attr "type" "veccomplex") | |
316 | (eq_attr "cpu" "cell")) | |
317 | "slot01,vsu1_cell,vsu1_cell*8") | |
318 | ||
319 | ;; TODO: add support for recording instructions | |
320 | (define_insn_reservation "cell-veccmp" 4 | |
321 | (and (eq_attr "type" "veccmp") | |
322 | (eq_attr "cpu" "cell")) | |
323 | "slot01,vsu1_cell,vsu1_cell*2") | |
324 | ||
325 | (define_insn_reservation "cell-vecfloat" 12 | |
326 | (and (eq_attr "type" "vecfloat") | |
327 | (eq_attr "cpu" "cell")) | |
328 | "slot01,vsu1_cell,vsu1_cell*10") | |
329 | ||
330 | (define_insn_reservation "cell-vecperm" 4 | |
331 | (and (eq_attr "type" "vecperm") | |
332 | (eq_attr "cpu" "cell")) | |
333 | "slot01,vsu2_cell,vsu2_cell*2") | |
334 | ||
335 | ;; New for 4.2, syncs | |
336 | ||
337 | (define_insn_reservation "cell-sync" 11 | |
338 | (and (eq_attr "type" "sync") | |
339 | (eq_attr "cpu" "cell")) | |
340 | "slot01,lsu_cell,lsu_cell*9") | |
341 | ||
342 | (define_insn_reservation "cell-isync" 11 | |
343 | (and (eq_attr "type" "isync") | |
344 | (eq_attr "cpu" "cell")) | |
345 | "slot01,lsu_cell,lsu_cell*9") | |
346 | ||
347 | (define_insn_reservation "cell-load_l" 11 | |
348 | (and (eq_attr "type" "load_l") | |
349 | (eq_attr "cpu" "cell")) | |
350 | "slot01,lsu_cell,lsu_cell*9") | |
351 | ||
352 | (define_insn_reservation "cell-store_c" 11 | |
353 | (and (eq_attr "type" "store_c") | |
354 | (eq_attr "cpu" "cell")) | |
355 | "slot01,lsu_cell,lsu_cell*9") | |
356 | ||
357 | ;; RAW register dependency | |
358 | ||
359 | ;; addi r3, r3, 1 | |
360 | ;; lw r4,offset(r3) | |
361 | ;; there is a 5 cycle delay for r3 bypassing | |
362 | ;; there is a 5 cycle delay for a dependent load after a load | |
363 | (define_bypass 5 "cell-integer" "cell-load") | |
364 | (define_bypass 5 "cell-integer" "cell-load-ext") | |
365 | (define_bypass 5 "cell-load,cell-load-ext" "cell-load,cell-load-ext") | |
366 | ||
367 | ;; there is a 6 cycle delay after a fp compare until you can use the cr. | |
368 | (define_bypass 6 "cell-fpcompare" "cell-branch,cell-branchreg,cell-mfcr,cell-crlogical") | |
369 | ||
370 | ;; VXU float RAW | |
371 | (define_bypass 11 "cell-vecfloat" "cell-vecfloat") | |
372 | ||
373 | ;; VXU and FPU | |
374 | (define_bypass 6 "cell-veccomplex" "cell-vecsimple") | |
375 | ;;(define_bypass 6 "cell-veccmp" "cell-branch,cell-branchreg") | |
376 | (define_bypass 3 "cell-vecfloat" "cell-veccomplex") | |
377 | ; this is not correct, | |
378 | ;; this is a stall in general and not dependent on result | |
379 | (define_bypass 13 "cell-vecstore" "cell-fpstore") | |
2f8e468b | 380 | ; this is not correct, this can never be true, not dependent on result |
d296e02e AP |
381 | (define_bypass 7 "cell-fp" "cell-fpload") |
382 | ;; vsu1 should avoid writing to the same target register as vsu2 insn | |
383 | ;; within 12 cycles. | |
384 | ||
385 | ;; WAW hazard | |
386 | ||
387 | ;; the target of VSU estimate should not be reused within 10 dispatch groups | |
388 | ;; the target of VSU float should not be reused within 8 dispatch groups | |
389 | ;; the target of VSU complex should not be reused within 5 dispatch groups | |
390 | ;; FP LOAD should not reuse an FPU Arithmetic target within 6 dispatch groups | |
391 | ||
392 | ;; mtctr-bcctr/bcctrl, branch target ctr register shadow update at | |
393 | ;; ex4 stage(10 cycles) | |
394 | (define_bypass 10 "cell-mtjmpr" "cell-branchreg") | |
395 | ||
396 | ;;Things that are not simulated: | |
397 | ;; update instruction, update address gpr are not simulated | |
2f8e468b | 398 | ;; vrefp, vrsqrtefp have latency(14), currently simulated as 12 cycle float |
d296e02e AP |
399 | ;; insns |
400 |