]>
Commit | Line | Data |
---|---|---|
d296e02e | 1 | ;; Scheduling description for cell processor. |
99dee823 | 2 | ;; Copyright (C) 2001-2021 Free Software Foundation, Inc. |
d296e02e AP |
3 | ;; Contributed by Sony Computer Entertainment, Inc., |
4 | ||
5 | ||
6 | ;; This file is free software; you can redistribute it and/or modify it under | |
7 | ;; the terms of the GNU General Public License as published by the Free | |
2f83c7d6 | 8 | ;; Software Foundation; either version 3 of the License, or (at your option) |
d296e02e AP |
9 | ;; any later version. |
10 | ||
11 | ;; This file is distributed in the hope that it will be useful, but WITHOUT | |
12 | ;; ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or | |
13 | ;; FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License | |
14 | ;; for more details. | |
15 | ||
16 | ;; You should have received a copy of the GNU General Public License | |
2f83c7d6 NC |
17 | ;; along with GCC; see the file COPYING3. If not see |
18 | ;; <http://www.gnu.org/licenses/>. | |
d296e02e AP |
19 | |
20 | ;; Sources: BE BOOK4 (/sfs/enc/doc/PPU_BookIV_DD3.0_latest.pdf) | |
21 | ||
2f8e468b | 22 | ;; BE Architecture *DD3.0 and DD3.1* |
d296e02e AP |
23 | ;; This file simulates the PPU processor unit backend of the pipeline, manual P24. |
24 | ;; manual P27, stall and flush points | |
2f8e468b | 25 | ;; IU, XU, VSU; the dispatcher decodes and dispatches 2 insns per cycle in program |
9f5ed61a | 26 | ;; order, the grouped addresses are aligned by 8 |
d296e02e AP |
27 | ;; This file only simulates the single-thread situation |
28 | ;; XU executes all fixed point insns(3 units, a simple alu, a complex unit, | |
29 | ;; and load/store unit) | |
30 | ;; VSU executes all scalar floating point insns (a float unit), | |
31 | ;; VMX insns(VMX unit, 4 sub units, simple, permute, complex, floating point) | |
32 | ||
33 | ;; Dual issue combination | |
34 | ||
35 | ;; FXU LSU BR VMX VMX | |
36 | ;; (sx,cx,vsu_fp,fp_arith) (perm,vsu_ls,fp_ls) | |
37 | ;;FXU X | |
38 | ;;LSU X X X | |
39 | ;;BR X | |
40 | ;;VMX(sx,cx,vsu_fp,fp_arith) X | |
41 | ;;VMX(perm,vsu_ls, fp_ls) X | |
42 | ;; X marks an illegal combination. | |
43 | ||
2f8e468b | 44 | ;; Dual issue exceptions: |
d296e02e AP |
45 | ;;(1) non-pipelined FXU instr in slot 0 |
46 | ;;(2) non-pipelined FPU inst in slot 0 | |
47 | ;; CSI instr (context-synchronizing insn) | |
48 | ;; Microcode insn | |
49 | ||
50 | ;; BRU unit: bru(none register stall), bru_cr(cr register stall) | |
51 | ;; VSU unit: vus(vmx simple), vup(vmx permute), vuc(vmx complex), | |
2f8e468b | 52 | ;; vuf(vmx float), fpu(floats). fpu_div is hypothetical, it is for |
d296e02e AP |
53 | ;; nonpipelined simulation |
54 | ;; micro insns will stall at least 7 cycles to get the first instr from ROM, | |
55 | ;; micro instructions are not dual issued. | |
56 | ||
57 | ;; slot0 is older than slot1 | |
58 | ;; non-pipelined insns need to be in slot1 to avoid a 1-cycle stall | |
59 | ||
60 | ;; There are different stall points | |
61 | ;; IB2, only stall one thread if stall here, so try to stall here as much as | |
62 | ;; we can | |
63 | ;; condition(1) insert nop, OR and ORI instruction form | |
64 | ;; condition(2) flush happens, in case of: RAW, WAW, D-ERAT miss, or | |
65 | ;; CR0-access while stdcx, or stwcx | |
66 | ;; IS2 stall ;; Page91 for details | |
67 | ;; VQ8 stall | |
68 | ;; IS2 stall can be activated by VQ8 stall and trying to issue a vsu instr to | |
69 | ;; the vsu issue queue | |
70 | ||
71 | ;;(define_automaton "cellxu") ; disabled single-automaton variant | |
72 | ||
73 | ;;(define_cpu_unit "fxu_cell,lsu_cell,bru_cell,vsu1_cell,vsu2_cell" "cellxu") ; disabled, see the per-automaton units below | |
74 | ||
75 | ;; ndfa.  The units are split over four automata: cellxu (fxu/lsu), cellvsu (vsu1/vsu2), cellbru (bru) and cell_mis (issue slots). | |
76 | (define_automaton "cellxu,cellvsu,cellbru,cell_mis") | |
77 | ||
78 | (define_cpu_unit "fxu_cell,lsu_cell" "cellxu") | |
79 | (define_cpu_unit "bru_cell" "cellbru") | |
80 | (define_cpu_unit "vsu1_cell,vsu2_cell" "cellvsu") | |
81 | ||
82 | (define_cpu_unit "slot0,slot1" "cell_mis") | |
83 | ||
84 | (absence_set "slot0" "slot1") | |
85 | ||
86 | (define_reservation "nonpipeline" "fxu_cell+lsu_cell+vsu1_cell+vsu2_cell") | |
87 | (define_reservation "slot01" "slot0|slot1") | |
88 | ||
89 | ||
90 | ;; Load/store | |
91 | ;; lmw, lswi, lswx are only generated when optimizing for space, MC; | |
92 | ;; these instrs are not simulated.  Plain loads (no sign extension, no update): latency 2, an issue slot and then the LSU. | |
93 | (define_insn_reservation "cell-load" 2 | |
94 | (and (eq_attr "type" "load") | |
d839f53b SB |
95 | (eq_attr "sign_extend" "no") |
96 | (eq_attr "update" "no") | |
d296e02e AP |
97 | (eq_attr "cpu" "cell")) |
98 | "slot01,lsu_cell") | |
99 | ||
100 | ;; ldux, ldu, lbzux, lbzu: the hardware breaks these down into two instrs, | |
101 | ;; if with 32-byte alignment, CMC.  The update form reserves fxu_cell together with lsu_cell. | |
102 | (define_insn_reservation "cell-load-ux" 2 | |
d839f53b SB |
103 | (and (eq_attr "type" "load") |
104 | (eq_attr "sign_extend" "no") | |
105 | (eq_attr "update" "yes") | |
d296e02e AP |
106 | (eq_attr "cpu" "cell")) |
107 | "slot01,fxu_cell+lsu_cell") | |
108 | ||
109 | ;; lha, lhax, lhau, lhaux, lwa, lwax, lwaux (sign-extending loads), MC, latency unknown | |
110 | ;; 11/7, 11/8, 11/12; modelled here with latency 2 | |
111 | (define_insn_reservation "cell-load-ext" 2 | |
d839f53b SB |
112 | (and (eq_attr "type" "load") |
113 | (eq_attr "sign_extend" "yes") | |
114 | (eq_attr "cpu" "cell")) | |
d296e02e AP |
115 | "slot01,fxu_cell+lsu_cell") |
116 | ||
117 | ;; FP and vector loads.  lfs, lfsx, lfd, lfdx (no update): 1 cycle | |
118 | (define_insn_reservation "cell-fpload" 1 | |
119 | (and (eq_attr "type" "fpload") | |
d839f53b | 120 | (eq_attr "update" "no") |
d296e02e AP |
121 | (eq_attr "cpu" "cell")) |
122 | "vsu2_cell+lsu_cell+slot01") | |
123 | ||
124 | ;; lfsu, lfsux, lfdu, lfdux: 1 cycle (fpr), 2 cycles (gpr); modelled with latency 1 and fxu_cell also reserved | |
125 | (define_insn_reservation "cell-fpload-update" 1 | |
d839f53b SB |
126 | (and (eq_attr "type" "fpload") |
127 | (eq_attr "update" "yes") | |
d296e02e AP |
128 | (eq_attr "cpu" "cell")) |
129 | "fxu_cell+vsu2_cell+lsu_cell+slot01") | |
130 | ||
131 | (define_insn_reservation "cell-vecload" 2 | |
132 | (and (eq_attr "type" "vecload") | |
133 | (eq_attr "cpu" "cell")) | |
134 | "slot01,vsu2_cell+lsu_cell") | |
135 | ||
136 | ;; st?, stw (MC): stores without update, latency 1, LSU and an issue slot reserved in the same cycle | |
137 | (define_insn_reservation "cell-store" 1 | |
138 | (and (eq_attr "type" "store") | |
d839f53b | 139 | (eq_attr "update" "no") |
d296e02e AP |
140 | (eq_attr "cpu" "cell")) |
141 | "lsu_cell+slot01") | |
142 | ||
143 | ;; stdux, stdu (hardware breaks these into a store and an add), 2 for the update reg.  The FP and vector stores that follow additionally reserve vsu2_cell. | |
144 | (define_insn_reservation "cell-store-update" 1 | |
d839f53b SB |
145 | (and (eq_attr "type" "store") |
146 | (eq_attr "update" "yes") | |
d296e02e AP |
147 | (eq_attr "cpu" "cell")) |
148 | "fxu_cell+lsu_cell+slot01") | |
149 | ||
150 | (define_insn_reservation "cell-fpstore" 1 | |
151 | (and (eq_attr "type" "fpstore") | |
d839f53b | 152 | (eq_attr "update" "no") |
d296e02e AP |
153 | (eq_attr "cpu" "cell")) |
154 | "vsu2_cell+lsu_cell+slot01") | |
155 | ||
156 | (define_insn_reservation "cell-fpstore-update" 1 | |
d839f53b SB |
157 | (and (eq_attr "type" "fpstore") |
158 | (eq_attr "update" "yes") | |
d296e02e AP |
159 | (eq_attr "cpu" "cell")) |
160 | "vsu2_cell+fxu_cell+lsu_cell+slot01") | |
161 | ||
162 | (define_insn_reservation "cell-vecstore" 1 | |
163 | (and (eq_attr "type" "vecstore") | |
164 | (eq_attr "cpu" "cell")) | |
165 | "vsu2_cell+lsu_cell+slot01") | |
166 | ||
167 | ;; Integer latency is 2 cycles (also covers non-dot add/logical/shift/exts and 64-bit insert) | |
168 | (define_insn_reservation "cell-integer" 2 | |
79430730 SB |
169 | (and (ior (eq_attr "type" "integer,trap,cntlz,isel") |
170 | (and (eq_attr "type" "add,logical,shift,exts") | |
892e7fa6 | 171 | (eq_attr "dot" "no")) |
58ee9e66 SB |
172 | (and (eq_attr "type" "insert") |
173 | (eq_attr "size" "64"))) | |
d296e02e AP |
174 | (eq_attr "cpu" "cell")) |
175 | "slot01,fxu_cell") | |
176 | ||
177 | ;; Type "two" (two integer insns): latency is 4 cycles, fxu_cell held for 3 cycles after issue | |
178 | (define_insn_reservation "cell-two" 4 | |
179 | (and (eq_attr "type" "two") | |
180 | (eq_attr "cpu" "cell")) | |
181 | "slot01,fxu_cell,fxu_cell*2") | |
182 | ||
183 | ;; Type "three" (three integer insns): latency is 6 cycles, fxu_cell held for 5 cycles after issue | |
184 | (define_insn_reservation "cell-three" 6 | |
185 | (and (eq_attr "type" "three") | |
186 | (eq_attr "cpu" "cell")) | |
187 | "slot01,fxu_cell,fxu_cell*4") | |
188 | ||
189 | ;; rlwimi (32-bit insert), alters cr0; latency 2 | |
190 | (define_insn_reservation "cell-insert" 2 | |
58ee9e66 SB |
191 | (and (eq_attr "type" "insert") |
192 | (eq_attr "size" "32") | |
d296e02e AP |
193 | (eq_attr "cpu" "cell")) |
194 | "slot01,fxu_cell") | |
195 | ||
196 | ;; cmpi, cmpli, cmpla, add, addo, sub, subo, alter cr0; type "cmp" is modelled with latency 1 | |
197 | (define_insn_reservation "cell-cmp" 1 | |
198 | (and (eq_attr "type" "cmp") | |
199 | (eq_attr "cpu" "cell")) | |
200 | "fxu_cell+slot01") | |
201 | ||
202 | ;; add, addo, sub, subo, alter cr0, rldcli, rlwinm.  Dot forms: cell_micro "not" is handled first (latency 2), cell_micro "always" second (latency 9, both issue slots). | |
203 | (define_insn_reservation "cell-fast-cmp" 2 | |
f5ae5a23 SB |
204 | (and (eq_attr "type" "add,logical,shift,exts") |
205 | (eq_attr "dot" "yes") | |
892e7fa6 SB |
206 | (eq_attr "cpu" "cell") |
207 | (eq_attr "cell_micro" "not")) | |
d296e02e AP |
208 | "slot01,fxu_cell") |
209 | ||
210 | (define_insn_reservation "cell-cmp-microcoded" 9 | |
f5ae5a23 SB |
211 | (and (eq_attr "type" "add,logical,shift,exts") |
212 | (eq_attr "dot" "yes") | |
892e7fa6 SB |
213 | (eq_attr "cpu" "cell") |
214 | (eq_attr "cell_micro" "always")) | |
d296e02e AP |
215 | "slot0+slot1,fxu_cell,fxu_cell*7") |
216 | ||
217 | ;; mulld (64-bit multiply, non-dot): latency 15, issued in slot1, then all of fxu/lsu/vsu1/vsu2 are held (nonpipeline) | |
218 | (define_insn_reservation "cell-lmul" 15 | |
e0528ed9 SB |
219 | (and (eq_attr "type" "mul") |
220 | (eq_attr "dot" "no") | |
221 | (eq_attr "size" "64") | |
d296e02e AP |
222 | (eq_attr "cpu" "cell")) |
223 | "slot1,nonpipeline,nonpipeline*13") | |
224 | ||
225 | ;; mulld. is microcoded: latency 22, takes both issue slots | |
226 | (define_insn_reservation "cell-lmul-cmp" 22 | |
e0528ed9 SB |
227 | (and (eq_attr "type" "mul") |
228 | (eq_attr "dot" "yes") | |
229 | (eq_attr "size" "64") | |
d296e02e AP |
230 | (eq_attr "cpu" "cell")) |
231 | "slot0+slot1,nonpipeline,nonpipeline*20") | |
232 | ||
233 | ;; mulli, 6 cycles (type "mul" with "size" 8 or 16; no "dot" condition is tested here) | |
234 | (define_insn_reservation "cell-imul23" 6 | |
e0528ed9 SB |
235 | (and (eq_attr "type" "mul") |
236 | (eq_attr "size" "8,16") | |
d296e02e AP |
237 | (eq_attr "cpu" "cell")) |
238 | "slot1,nonpipeline,nonpipeline*4") | |
239 | ||
240 | ;; mullw, 9 cycles (32-bit multiply, non-dot form only) | |
241 | (define_insn_reservation "cell-imul" 9 | |
e0528ed9 SB |
242 | (and (eq_attr "type" "mul") |
243 | (eq_attr "dot" "no") | |
244 | (eq_attr "size" "32") | |
d296e02e AP |
245 | (eq_attr "cpu" "cell")) |
246 | "slot1,nonpipeline,nonpipeline*7") | |
247 | ||
248 | ;; divide: latency 32 for 32-bit and 64 for 64-bit, both issued in slot1 and nonpipelined | |
249 | (define_insn_reservation "cell-idiv" 32 | |
441e02a5 SB |
250 | (and (eq_attr "type" "div") |
251 | (eq_attr "size" "32") | |
d296e02e AP |
252 | (eq_attr "cpu" "cell")) |
253 | "slot1,nonpipeline,nonpipeline*30") | |
254 | ||
255 | (define_insn_reservation "cell-ldiv" 64 | |
441e02a5 SB |
256 | (and (eq_attr "type" "div") |
257 | (eq_attr "size" "64") | |
d296e02e AP |
258 | (eq_attr "cpu" "cell")) |
259 | "slot1,nonpipeline,nonpipeline*62") | |
260 | ||
261 | ;; mflr and mfctr are pipelined; latency 1, branch unit plus an issue slot | |
262 | (define_insn_reservation "cell-mfjmpr" 1 | |
263 | (and (eq_attr "type" "mfjmpr") | |
264 | (eq_attr "cpu" "cell")) | |
265 | "slot01+bru_cell") | |
266 | ||
267 | ;; mtlr and mtctr, | |
268 | ;; mtspr: fully pipelined; latency 1, branch unit plus an issue slot | |
269 | (define_insn_reservation "cell-mtjmpr" 1 | |
270 | (and (eq_attr "type" "mtjmpr") | |
271 | (eq_attr "cpu" "cell")) | |
272 | "bru_cell+slot01") | |
273 | ||
274 | ;; Branches (both reservations here issue in slot1 only) | |
275 | ;; b, ba, bl, bla: an unconditional branch always predicts correctly, latency n/a | |
276 | ;; bcctr, bcctrl, latency 2, actually adjust by be to 4 | |
277 | (define_insn_reservation "cell-branch" 1 | |
278 | (and (eq_attr "type" "branch") | |
279 | (eq_attr "cpu" "cell")) | |
280 | "bru_cell+slot1") | |
281 | ||
282 | (define_insn_reservation "cell-branchreg" 1 | |
283 | (and (eq_attr "type" "jmpreg") | |
284 | (eq_attr "cpu" "cell")) | |
285 | "bru_cell+slot1") | |
286 | ||
287 | ;; cr hazard | |
288 | ;; page 90, special cases for CR hazard: only one instr can access cr per cycle; | |
289 | ;; if an insn reads CR following a stwcx, the pipeline stalls until the stwcx finishes | |
290 | (define_insn_reservation "cell-crlogical" 1 | |
34ef0745 | 291 | (and (eq_attr "type" "cr_logical") |
d296e02e AP |
292 | (eq_attr "cpu" "cell")) |
293 | "bru_cell+slot01") | |
294 | ||
295 | ;; mfcrf and mfcr are about 34 cycles and nonpipelined; issued in slot1 | |
296 | (define_insn_reservation "cell-mfcr" 34 | |
297 | (and (eq_attr "type" "mfcrf,mfcr") | |
298 | (eq_attr "cpu" "cell")) | |
299 | "slot1,nonpipeline,nonpipeline*32") | |
300 | ||
301 | ;; mtcrf (1 field): latency 1, fxu_cell plus an issue slot | |
302 | (define_insn_reservation "cell-mtcrf" 1 | |
303 | (and (eq_attr "type" "mtcr") | |
304 | (eq_attr "cpu" "cell")) | |
305 | "fxu_cell+slot01") | |
306 | ||
307 | ;; Basic FP latency is 10 cycles, throughput is 1/cycle.  NOTE(review): the reservation holds vsu1_cell for 9 cycles, which does not model 1/cycle throughput -- confirm intent. | |
308 | (define_insn_reservation "cell-fp" 10 | |
7c788ce2 | 309 | (and (eq_attr "type" "fp,fpsimple,dmul") |
d296e02e AP |
310 | (eq_attr "cpu" "cell")) |
311 | "slot01,vsu1_cell,vsu1_cell*8") | |
312 | ||
313 | (define_insn_reservation "cell-fpcompare" 1 | |
314 | (and (eq_attr "type" "fpcompare") | |
315 | (eq_attr "cpu" "cell")) | |
316 | "vsu1_cell+slot01") | |
317 | ||
318 | ;; sdiv/ddiv throughput 1/74, not pipelined but only in the FPU | |
319 | (define_insn_reservation "cell-sdiv" 74 | |
320 | (and (eq_attr "type" "sdiv,ddiv") | |
321 | (eq_attr "cpu" "cell")) | |
322 | "slot1,nonpipeline,nonpipeline*72") | |
323 | ||
324 | ;; fsqrt (ssqrt/dsqrt) throughput 1/84, not pipelined but only in the FPU | |
325 | (define_insn_reservation "cell-sqrt" 84 | |
326 | (and (eq_attr "type" "ssqrt,dsqrt") | |
327 | (eq_attr "cpu" "cell")) | |
328 | "slot1,nonpipeline,nonpipeline*82") | |
329 | ||
330 | ;; VMX.  Simple, logical and move ops: latency 4 on vsu1_cell | |
331 | (define_insn_reservation "cell-vecsimple" 4 | |
7c788ce2 | 332 | (and (eq_attr "type" "vecsimple,veclogical,vecmove") |
d296e02e AP |
333 | (eq_attr "cpu" "cell")) |
334 | "slot01,vsu1_cell,vsu1_cell*2") | |
335 | ||
336 | ;; VMX complex ops (mult, div, madd): latency 10 on vsu1_cell | |
337 | (define_insn_reservation "cell-veccomplex" 10 | |
338 | (and (eq_attr "type" "veccomplex") | |
339 | (eq_attr "cpu" "cell")) | |
340 | "slot01,vsu1_cell,vsu1_cell*8") | |
341 | ||
342 | ;; TODO: add support for recording instructions.  veccmp and vecfloat use vsu1_cell, vecperm uses vsu2_cell. | |
343 | (define_insn_reservation "cell-veccmp" 4 | |
7c788ce2 | 344 | (and (eq_attr "type" "veccmp,veccmpfx") |
d296e02e AP |
345 | (eq_attr "cpu" "cell")) |
346 | "slot01,vsu1_cell,vsu1_cell*2") | |
347 | ||
348 | (define_insn_reservation "cell-vecfloat" 12 | |
349 | (and (eq_attr "type" "vecfloat") | |
350 | (eq_attr "cpu" "cell")) | |
351 | "slot01,vsu1_cell,vsu1_cell*10") | |
352 | ||
353 | (define_insn_reservation "cell-vecperm" 4 | |
354 | (and (eq_attr "type" "vecperm") | |
355 | (eq_attr "cpu" "cell")) | |
356 | "slot01,vsu2_cell,vsu2_cell*2") | |
357 | ||
358 | ;; New for 4.2: sync, isync, load_l and store_c, each modelled with latency 11 on lsu_cell | |
359 | ||
360 | (define_insn_reservation "cell-sync" 11 | |
361 | (and (eq_attr "type" "sync") | |
362 | (eq_attr "cpu" "cell")) | |
363 | "slot01,lsu_cell,lsu_cell*9") | |
364 | ||
365 | (define_insn_reservation "cell-isync" 11 | |
366 | (and (eq_attr "type" "isync") | |
367 | (eq_attr "cpu" "cell")) | |
368 | "slot01,lsu_cell,lsu_cell*9") | |
369 | ||
370 | (define_insn_reservation "cell-load_l" 11 | |
371 | (and (eq_attr "type" "load_l") | |
372 | (eq_attr "cpu" "cell")) | |
373 | "slot01,lsu_cell,lsu_cell*9") | |
374 | ||
375 | (define_insn_reservation "cell-store_c" 11 | |
376 | (and (eq_attr "type" "store_c") | |
377 | (eq_attr "cpu" "cell")) | |
378 | "slot01,lsu_cell,lsu_cell*9") | |
379 | ||
380 | ;; RAW register dependency | |
381 | ||
382 | ;; addi r3, r3, 1 | |
383 | ;; lw r4,offset(r3) | |
384 | ;; there is a 5 cycle delay for r3 bypassing | |
385 | ;; there is a 5 cycle delay for a dependent load after a load | |
386 | (define_bypass 5 "cell-integer" "cell-load") | |
387 | (define_bypass 5 "cell-integer" "cell-load-ext") | |
388 | (define_bypass 5 "cell-load,cell-load-ext" "cell-load,cell-load-ext") | |
389 | ||
390 | ;; there is a 6 cycle delay after an fp compare until you can use the cr (branches, mfcr, cr logical). | |
391 | (define_bypass 6 "cell-fpcompare" "cell-branch,cell-branchreg,cell-mfcr,cell-crlogical") | |
392 | ||
393 | ;; VXU float RAW: 11 cycles between dependent vecfloat insns | |
394 | (define_bypass 11 "cell-vecfloat" "cell-vecfloat") | |
395 | ||
396 | ;; VXU and FPU | |
397 | (define_bypass 6 "cell-veccomplex" "cell-vecsimple") | |
398 | ;;(define_bypass 6 "cell-veccompare" "cell-branch,cell-branchreg") ; disabled; note the reservation defined in this file is named "cell-veccmp" | |
399 | (define_bypass 3 "cell-vecfloat" "cell-veccomplex") | |
400 | ;; this is not correct, | |
401 | ;; this is a stall in general and not dependent on the result | |
402 | (define_bypass 13 "cell-vecstore" "cell-fpstore") | |
2f8e468b | 403 | ;; this is not correct, this can never be true, not dependent on the result |
d296e02e AP |
404 | (define_bypass 7 "cell-fp" "cell-fpload") |
405 | ;; vsu1 should avoid writing to the same target register as vsu2 insn | |
406 | ;; within 12 cycles. | |
407 | ||
408 | ;; WAW hazard | |
409 | ||
410 | ;; the target of VSU estimate should not be reused within 10 dispatch groups | |
411 | ;; the target of VSU float should not be reused within 8 dispatch groups | |
412 | ;; the target of VSU complex should not be reused within 5 dispatch groups | |
413 | ;; FP LOAD should not reuse an FPU Arithmetic target within 6 dispatch groups | |
414 | ||
415 | ;; mtctr-bcctr/bcctrl: the branch target ctr register shadow is updated at the | |
416 | ;; ex4 stage (10 cycles) | |
417 | (define_bypass 10 "cell-mtjmpr" "cell-branchreg") | |
418 | ||
419 | ;; Things that are not simulated: | |
420 | ;; update instruction, update address gpr are not simulated | |
2f8e468b | 421 | ;; vrefp, vrsqrtefp have latency(14), currently simulated as 12 cycle float |
d296e02e AP |
422 | ;; insns |
423 |