[thirdparty/gcc.git] / gcc / config / rs6000 / cell.md

;; Scheduling description for cell processor.
;; Copyright (C) 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2009
;; Free Software Foundation, Inc.
;; Contributed by Sony Computer Entertainment, Inc.,


;; This file is free software; you can redistribute it and/or modify it under
;; the terms of the GNU General Public License as published by the Free
;; Software Foundation; either version 3 of the License, or (at your option) 
;; any later version.

;; This file is distributed in the hope that it will be useful, but WITHOUT
;; ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
;; FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
;; for more details.

;; You should have received a copy of the GNU General Public License
;; along with GCC; see the file COPYING3.  If not see
;; <http://www.gnu.org/licenses/>.

;; Sources: BE BOOK4 (/sfs/enc/doc/PPU_BookIV_DD3.0_latest.pdf)

;; BE Architecture *DD3.0 and DD3.1*
;; This file simulates the PPU processor unit backend of the pipeline, manual P24.
;; manual P27, stall and flush points
;; IU, XU, VSU, dispatcher decodes and dispatch 2 insns per cycle in program
;;  order, the grouped address are aligned by 8
;; This file only simulates the single-thread situation
;; XU executes all fixed point insns(3 units, a simple alu, a complex unit,
;;   and load/store unit)
;; VSU executes all scalar floating points insn(a float unit),
;;   VMX insns(VMX unit, 4 sub units, simple, permute, complex, floating point)

;; Dual issue combination

;;	FXU	LSU	BR 	        VMX	               VMX
;;                             (sx,cx,vsu_fp,fp_arith)    (perm,vsu_ls,fp_ls)
;;FXU	X
;;LSU		X               	X               	X	
;;BR			X
;;VMX(sx,cx,vsu_fp,fp_arth)		X
;;VMX(perm,vsu_ls, fp_ls)					X
;;    X are illegal combination.

;; Dual issue exceptions:
;;(1) non-pipelined FXU instr in slot 0
;;(2) non-pipelined FPU inst in slot 0
;; CSI instr(context-synchronizing insn)
;; Microcode insn

;; BRU unit: bru(none register stall), bru_cr(cr register stall)
;; VSU unit: vus(vmx simple), vup(vmx permute), vuc(vmx complex),
;;  vuf(vmx float), fpu(floats). fpu_div is hypothetical, it is for
;;  nonpipelined simulation
;; microcoded insns will stall at least 7 cycles to get the first instr from ROM,
;;  micro instructions are not dual issued. 

;; slot0 is older than slot1
;; non-pipelined insn need to be in slot1 to avoid 1cycle stall

;; There are different stall points
;; IB2, only stall one thread if stall here, so try to stall here as much as
;; we can 
;; condition(1) insert nop, OR and ORI instruction form 
;; condition(2) flush happens, in case of: RAW, WAW, D-ERAT miss, or
;;   CR0-access while stdcx, or stwcx
;; IS2 stall ;; Page91 for details
;; VQ8 stall
;; IS2 stall can be activated by VQ8 stall and trying to issue a vsu instr to
;;  the vsu issue queue

;;(define_automaton "cellxu")

;;(define_cpu_unit "fxu_cell,lsu_cell,bru_cell,vsu1_cell,vsu2_cell" "cellxu")

;; ndfa
;; The pipeline is modelled with four automata: the fixed-point/load-store
;; side (cellxu), the vector/scalar FP side (cellvsu), the branch unit
;; (cellbru) and the two issue slots (cell_mis).
(define_automaton "cellxu,cellvsu,cellbru,cell_mis")

;; Fixed-point unit and load/store unit.
(define_cpu_unit "fxu_cell" "cellxu")
(define_cpu_unit "lsu_cell" "cellxu")

;; Branch unit.
(define_cpu_unit "bru_cell" "cellbru")

;; The two VSU issue classes.
(define_cpu_unit "vsu1_cell" "cellvsu")
(define_cpu_unit "vsu2_cell" "cellvsu")

;; Dual-issue slots.
(define_cpu_unit "slot0" "cell_mis")
(define_cpu_unit "slot1" "cell_mis")

;; slot0 can only be reserved while slot1 is not reserved.
(absence_set "slot0" "slot1")

;; A non-pipelined insn holds the FXU, the LSU and both VSU units at once
;; (the branch unit is not part of this reservation).
(define_reservation "nonpipeline" "fxu_cell+lsu_cell+vsu1_cell+vsu2_cell")

;; Either issue slot; slot0 is listed first.
(define_reservation "slot01" "slot0|slot1")


;; Load/store
;; lmw, lswi, lswx are only generated for optimize for space, MC,
;;   these instr are not simulated
;; Plain GPR load: take an issue slot, then the load/store unit.
(define_insn_reservation "cell-load" 2
  (and (eq_attr "cpu" "cell")
       (eq_attr "type" "load"))
  "slot01,lsu_cell")

;; ldux, ldu, lbzux, lbzu: hardware breaks these down to two instrs,
;;  if with 32bytes alignment, CMC.  Both the FXU and the LSU are held.
(define_insn_reservation "cell-load-ux" 2
  (and (eq_attr "cpu" "cell")
       (eq_attr "type" "load_u,load_ux"))
  "slot01,(lsu_cell+fxu_cell)")

;; lha, lhax, lhau, lhaux, lwa, lwax, lwaux: MC, latency unknown
;;   11/7, 11/8, 11/12
(define_insn_reservation "cell-load-ext" 2
  (and (eq_attr "cpu" "cell")
       (eq_attr "type" "load_ext,load_ext_u,load_ext_ux"))
  "slot01,(lsu_cell+fxu_cell)")

;; lfs, lfsx, lfd, lfdx: 1 cycle.  The slot, the LSU and vsu2 are all
;; reserved in the same cycle.
(define_insn_reservation "cell-fpload" 1
  (and (eq_attr "cpu" "cell")
       (eq_attr "type" "fpload"))
  "slot01+lsu_cell+vsu2_cell")

;; lfsu,lfsux,lfdu,lfdux 1cycle(fpr) 2 cycle(gpr)
;; Update-form FP loads additionally hold the FXU for the address update.
;; Plain "fpload" is intentionally not listed here: the earlier
;; "cell-fpload" reservation already matches every plain fpload insn, so
;; repeating it in this condition only created an overlapping alternative
;; that was never selected and wrongly suggested that plain FP loads
;; reserve the FXU.
(define_insn_reservation "cell-fpload-update" 1
  (and (eq_attr "type" "fpload_u,fpload_ux")
       (eq_attr "cpu" "cell"))
  "fxu_cell+vsu2_cell+lsu_cell+slot01")

;; Vector load: issue slot first, then the LSU together with vsu2.
(define_insn_reservation "cell-vecload" 2
  (and (eq_attr "cpu" "cell")
       (eq_attr "type" "vecload"))
  "slot01,(lsu_cell+vsu2_cell)")

;; st?  stw(MC)
(define_insn_reservation "cell-store" 1
  (and (eq_attr "cpu" "cell")
       (eq_attr "type" "store"))
  "slot01+lsu_cell")

;; stdux, stdu (hardware breaks into store and add), 2 for update reg
(define_insn_reservation "cell-store-update" 1
  (and (eq_attr "cpu" "cell")
       (eq_attr "type" "store_u,store_ux"))
  "slot01+lsu_cell+fxu_cell")

;; Scalar FP store: slot, LSU and vsu2 in the same cycle.
(define_insn_reservation "cell-fpstore" 1
  (and (eq_attr "cpu" "cell")
       (eq_attr "type" "fpstore"))
  "slot01+lsu_cell+vsu2_cell")

;; Update-form scalar FP store: as above, plus the FXU.
(define_insn_reservation "cell-fpstore-update" 1
  (and (eq_attr "cpu" "cell")
       (eq_attr "type" "fpstore_u,fpstore_ux"))
  "slot01+lsu_cell+fxu_cell+vsu2_cell")

;; Vector store: same unit usage as a scalar FP store.
(define_insn_reservation "cell-vecstore" 1
  (and (eq_attr "cpu" "cell")
       (eq_attr "type" "vecstore"))
  "slot01+lsu_cell+vsu2_cell")

;; Simple integer ops: latency 2, one issue cycle then one FXU cycle.
(define_insn_reservation "cell-integer" 2
  (and (eq_attr "cpu" "cell")
       (eq_attr "type" "integer,insert_dword,shift,trap,\
			var_shift_rotate,cntlz,exts,isel"))
  "slot01,fxu_cell")

;; "two" (a pair of integer insns): latency 4, FXU held for 3 cycles.
(define_insn_reservation "cell-two" 4
  (and (eq_attr "cpu" "cell")
       (eq_attr "type" "two"))
  "slot01,fxu_cell*3")

;; "three" (a triple of integer insns): latency 6, FXU held for 5 cycles.
(define_insn_reservation "cell-three" 6
  (and (eq_attr "cpu" "cell")
       (eq_attr "type" "three"))
  "slot01,fxu_cell*5")

;; rlwimi, alter cr0
(define_insn_reservation "cell-insert" 2
  (and (eq_attr "cpu" "cell")
       (eq_attr "type" "insert_word"))
  "slot01,fxu_cell")

;; cmpi, cmpli, cmpla, add, addo, sub, subo, alter cr0
(define_insn_reservation "cell-cmp" 1
  (and (eq_attr "cpu" "cell")
       (eq_attr "type" "cmp"))
  "slot01+fxu_cell")

;; add, addo, sub, subo, alter cr0, rldcli, rlwinm
;; Record-form insns that are not microcoded.
;; NOTE(review): only the cell_micro values "not" and "always" are matched
;; by this reservation and the next one; insns of these types with any
;; other cell_micro value get no reservation here -- confirm intended.
(define_insn_reservation "cell-fast-cmp" 2
  (and (eq_attr "cpu" "cell")
       (and (eq_attr "cell_micro" "not")
	    (eq_attr "type" "fast_compare,delayed_compare,compare,\
			    var_delayed_compare")))
  "slot01,fxu_cell")

;; Microcoded record-form insns: both issue slots are taken, then the FXU
;; is held for 8 cycles.
(define_insn_reservation "cell-cmp-microcoded" 9
  (and (eq_attr "cpu" "cell")
       (and (eq_attr "cell_micro" "always")
	    (eq_attr "type" "fast_compare,delayed_compare,compare,\
			    var_delayed_compare")))
  "slot0+slot1,fxu_cell*8")

;; mulld: issued in slot1, then every "nonpipeline" unit is held for
;; 14 cycles.
(define_insn_reservation "cell-lmul" 15
  (and (eq_attr "cpu" "cell")
       (eq_attr "type" "lmul"))
  "slot1,nonpipeline*14")

;; mulld. is microcoded: takes both issue slots.
(define_insn_reservation "cell-lmul-cmp" 22
  (and (eq_attr "cpu" "cell")
       (eq_attr "type" "lmul_compare"))
  "slot0+slot1,nonpipeline*21")

;; mulli, 6 cycles
(define_insn_reservation "cell-imul23" 6
  (and (eq_attr "cpu" "cell")
       (eq_attr "type" "imul2,imul3"))
  "slot1,nonpipeline*5")

;; mullw, 9 cycles
(define_insn_reservation "cell-imul" 9
  (and (eq_attr "cpu" "cell")
       (eq_attr "type" "imul"))
  "slot1,nonpipeline*8")

;; Integer divide (type idiv): 32 cycles, non-pipelined.
(define_insn_reservation "cell-idiv" 32
  (and (eq_attr "cpu" "cell")
       (eq_attr "type" "idiv"))
  "slot1,nonpipeline*31")

;; Integer divide (type ldiv): 64 cycles, non-pipelined.
(define_insn_reservation "cell-ldiv" 64
  (and (eq_attr "cpu" "cell")
       (eq_attr "type" "ldiv"))
  "slot1,nonpipeline*63")

;; mflr and mfctr are pipelined
(define_insn_reservation "cell-mfjmpr" 1
  (and (eq_attr "cpu" "cell")
       (eq_attr "type" "mfjmpr"))
  "slot01+bru_cell")

;; mtlr and mtctr,
;; mtspr fully pipelined
(define_insn_reservation "cell-mtjmpr" 1
  (and (eq_attr "cpu" "cell")
       (eq_attr "type" "mtjmpr"))
  "slot01+bru_cell")

;; Branches
;; b, ba, bl, bla, unconditional branch always predicts correctly n/a latency
;; bcctr, bcctrl, latency 2, actually adjust by be to 4
;; Branches are only modelled in slot1.
(define_insn_reservation "cell-branch" 1
  (and (eq_attr "cpu" "cell")
       (eq_attr "type" "branch"))
  "slot1+bru_cell")

;; Branch through a register (type jmpreg); same units as a direct branch.
(define_insn_reservation "cell-branchreg" 1
  (and (eq_attr "cpu" "cell")
       (eq_attr "type" "jmpreg"))
  "slot1+bru_cell")

;; cr hazard
;; page 90, special cases for CR hazard, only one instr can access cr per cycle
;; if insn reads CR following a stwcx, pipeline stall till stwcx finish
(define_insn_reservation "cell-crlogical" 1
  (and (eq_attr "cpu" "cell")
       (eq_attr "type" "cr_logical,delayed_cr"))
  "slot01+bru_cell")

;; mfcrf and mfcr is about 34 cycles and nonpipelined
(define_insn_reservation "cell-mfcr" 34
  (and (eq_attr "cpu" "cell")
       (eq_attr "type" "mfcrf,mfcr"))
  "slot1,nonpipeline*33")

;; mtcrf (1 field)
(define_insn_reservation "cell-mtcrf" 1
  (and (eq_attr "cpu" "cell")
       (eq_attr "type" "mtcr"))
  "slot01+fxu_cell")

;; Basic FP latency is 10 cycles, throughput is 1/cycle
;; (the reservation itself holds vsu1 for 9 cycles after issue).
(define_insn_reservation "cell-fp" 10
  (and (eq_attr "cpu" "cell")
       (eq_attr "type" "fp,dmul"))
  "slot01,vsu1_cell*9")

;; FP compare: slot and vsu1 in the same cycle.
(define_insn_reservation "cell-fpcompare" 1
  (and (eq_attr "cpu" "cell")
       (eq_attr "type" "fpcompare"))
  "slot01+vsu1_cell")

;; sdiv throughput 1/74, not pipelined but only in the FPU
;; (modelled with the generic "nonpipeline" reservation).
(define_insn_reservation "cell-sdiv" 74
  (and (eq_attr "cpu" "cell")
       (eq_attr "type" "sdiv,ddiv"))
  "slot1,nonpipeline*73")

;; fsqrt throughput 1/84, not pipelined but only in the FPU
;; (modelled with the generic "nonpipeline" reservation).
(define_insn_reservation "cell-sqrt" 84
  (and (eq_attr "cpu" "cell")
       (eq_attr "type" "ssqrt,dsqrt"))
  "slot1,nonpipeline*83")

;; VMX
;; Simple vector ops: latency 4, vsu1 held for 3 cycles after issue.
(define_insn_reservation "cell-vecsimple" 4
  (and (eq_attr "cpu" "cell")
       (eq_attr "type" "vecsimple"))
  "slot01,vsu1_cell*3")

;; mult, div, madd
(define_insn_reservation "cell-veccomplex" 10
  (and (eq_attr "cpu" "cell")
       (eq_attr "type" "veccomplex"))
  "slot01,vsu1_cell*9")

;; TODO: add support for recording instructions
(define_insn_reservation "cell-veccmp" 4
  (and (eq_attr "cpu" "cell")
       (eq_attr "type" "veccmp"))
  "slot01,vsu1_cell*3")

;; Vector float: latency 12, vsu1 held for 11 cycles after issue.
(define_insn_reservation "cell-vecfloat" 12
  (and (eq_attr "cpu" "cell")
       (eq_attr "type" "vecfloat"))
  "slot01,vsu1_cell*11")

;; Vector permute: the only VMX arithmetic class that uses vsu2.
(define_insn_reservation "cell-vecperm" 4
  (and (eq_attr "cpu" "cell")
       (eq_attr "type" "vecperm"))
  "slot01,vsu2_cell*3")

;; New for 4.2, syncs
;; All four reservations below share one shape: an issue slot, then the
;; LSU held for 10 cycles, with a latency of 11.

(define_insn_reservation "cell-sync" 11
  (and (eq_attr "cpu" "cell")
       (eq_attr "type" "sync"))
  "slot01,lsu_cell*10")

(define_insn_reservation "cell-isync" 11
  (and (eq_attr "cpu" "cell")
       (eq_attr "type" "isync"))
  "slot01,lsu_cell*10")

(define_insn_reservation "cell-load_l" 11
  (and (eq_attr "cpu" "cell")
       (eq_attr "type" "load_l"))
  "slot01,lsu_cell*10")

(define_insn_reservation "cell-store_c" 11
  (and (eq_attr "cpu" "cell")
       (eq_attr "type" "store_c"))
  "slot01,lsu_cell*10")

;; RAW register dependency

;; addi r3, r3, 1
;; lw r4,offset(r3)
;; there is a 5 cycle delay for r3 bypassing
;; there is a 5 cycle delay for a dependent load after a load
(define_bypass 5 "cell-integer" "cell-load,cell-load-ext")
(define_bypass 5 "cell-load,cell-load-ext" "cell-load,cell-load-ext")

;; there is a 6 cycle delay after a fp compare until you can use the cr.
(define_bypass 6 "cell-fpcompare"
		 "cell-branch,cell-branchreg,cell-mfcr,cell-crlogical")

;; VXU float RAW
(define_bypass 11 "cell-vecfloat" "cell-vecfloat")

;; VXU and FPU
(define_bypass 6 "cell-veccomplex" "cell-vecsimple")
;;(define_bypass 6 "cell-veccompare" "cell-branch,cell-branchreg")
(define_bypass 3 "cell-vecfloat" "cell-veccomplex")
;; this is not correct,
;;  this is a stall in general and not dependent on result
(define_bypass 13 "cell-vecstore" "cell-fpstore")
;; this is not correct, this can never be true, not dependent on result
(define_bypass 7 "cell-fp" "cell-fpload")
;; vsu1 should avoid writing to the same target register as vsu2 insn
;;   within 12 cycles.

;; WAW hazard

;; the target of VSU estimate should not be reused within 10 dispatch groups
;; the target of VSU float should not be reused within 8 dispatch groups
;; the target of VSU complex should not be reused within 5 dispatch groups
;; FP LOAD should not reuse an FPU Arithmetic target within 6 dispatch groups

;; mtctr-bcctr/bcctrl, branch target ctr register shadow update at
;;  ex4 stage(10 cycles)
(define_bypass 10 "cell-mtjmpr" "cell-branchreg")

;;Things are not simulated:
;; update instruction, update address gpr are not simulated
;; vrefp, vrsqrtefp have latency(14), currently simulated as 12 cycle float
;;  insns
Commit	Line	Data
d296e02e	1	;; Scheduling description for cell processor.
47f67e51	2	;; Copyright (C) 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2009
d296e02e AP	3	;; Free Software Foundation, Inc.
	4	;; Contributed by Sony Computer Entertainment, Inc.,
	5
	6
	7	;; This file is free software; you can redistribute it and/or modify it under
	8	;; the terms of the GNU General Public License as published by the Free
2f83c7d6	9	;; Software Foundation; either version 3 of the License, or (at your option)
d296e02e AP	10	;; any later version.
	11
	12	;; This file is distributed in the hope that it will be useful, but WITHOUT
	13	;; ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
	14	;; FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
	15	;; for more details.
	16
	17	;; You should have received a copy of the GNU General Public License
2f83c7d6 NC	18	;; along with GCC; see the file COPYING3. If not see
2f83c7d6 NC	19	;; <http://www.gnu.org/licenses/>.
d296e02e AP	20
	21	;; Sources: BE BOOK4 (/sfs/enc/doc/PPU_BookIV_DD3.0_latest.pdf)
	22
2f8e468b	23	;; BE Architecture DD3.0 and DD3.1
d296e02e AP	24	;; This file simulate PPU processor unit backend of pipeline, maualP24.
d296e02e AP	25	;; manual P27, stall and flush points
2f8e468b	26	;; IU, XU, VSU, dispatcher decodes and dispatch 2 insns per cycle in program
9f5ed61a	27	;; order, the grouped address are aligned by 8
d296e02e AP	28	;; This file only simulate one thread situation
	29	;; XU executes all fixed point insns(3 units, a simple alu, a complex unit,
	30	;; and load/store unit)
	31	;; VSU executes all scalar floating points insn(a float unit),
	32	;; VMX insns(VMX unit, 4 sub units, simple, permute, complex, floating point)
	33
	34	;; Dual issue combination
	35
	36	;; FXU LSU BR VMX VMX
	37	;; (sx,cx,vsu_fp,fp_arith) (perm,vsu_ls,fp_ls)
	38	;;FXU X
	39	;;LSU X X X
	40	;;BR X
	41	;;VMX(sx,cx,vsu_fp,fp_arth) X
	42	;;VMX(perm,vsu_ls, fp_ls) X
	43	;; X are illegal combination.
	44
2f8e468b	45	;; Dual issue exceptions:
d296e02e AP	46	;;(1) nop-pipelined FXU instr in slot 0
	47	;;(2) non-pipelined FPU inst in slot 0
	48	;; CSI instr(contex-synchronizing insn)
	49	;; Microcode insn
	50
	51	;; BRU unit: bru(none register stall), bru_cr(cr register stall)
	52	;; VSU unit: vus(vmx simple), vup(vmx permute), vuc(vmx complex),
2f8e468b	53	;; vuf(vmx float), fpu(floats). fpu_div is hypothetical, it is for
d296e02e AP	54	;; nonpipelined simulation
	55	;; micr insns will stall at least 7 cycles to get the first instr from ROM,
	56	;; micro instructions are not dual issued.
	57
	58	;; slot0 is older than slot1
	59	;; non-pipelined insn need to be in slot1 to avoid 1cycle stall
	60
	61	;; There different stall point
	62	;; IB2, only stall one thread if stall here, so try to stall here as much as
	63	;; we can
	64	;; condition(1) insert nop, OR and ORI instruction form
	65	;; condition(2) flush happens, in case of: RAW, WAW, D-ERAT miss, or
	66	;; CR0-access while stdcx, or stwcx
	67	;; IS2 stall ;; Page91 for details
	68	;; VQ8 stall
	69	;; IS2 stall can be activated by VQ8 stall and trying to issue a vsu instr to
	70	;; the vsu issue queue
	71
	72	;;(define_automaton "cellxu")
	73
	74	;;(define_cpu_unit "fxu_cell,lsu_cell,bru_cell,vsu1_cell,vsu2_cell" "cellxu")
	75
	76	;; ndfa
	77	(define_automaton "cellxu,cellvsu,cellbru,cell_mis")
	78
	79	(define_cpu_unit "fxu_cell,lsu_cell" "cellxu")
	80	(define_cpu_unit "bru_cell" "cellbru")
	81	(define_cpu_unit "vsu1_cell,vsu2_cell" "cellvsu")
	82
	83	(define_cpu_unit "slot0,slot1" "cell_mis")
	84
	85	(absence_set "slot0" "slot1")
	86
	87	(define_reservation "nonpipeline" "fxu_cell+lsu_cell+vsu1_cell+vsu2_cell")
	88	(define_reservation "slot01" "slot0\|slot1")
	89
	90
	91	;; Load/store
	92	;; lmw, lswi, lswx are only generated for optimize for space, MC,
	93	;; these instr are not simulated
	94	(define_insn_reservation "cell-load" 2
	95	(and (eq_attr "type" "load")
	96	(eq_attr "cpu" "cell"))
	97	"slot01,lsu_cell")
	98
	99	;; ldux, ldu, lbzux, lbzu, hardware breaks it down to two instrs,
	100	;; if with 32bytes alignment, CMC
	101	(define_insn_reservation "cell-load-ux" 2
	102	(and (eq_attr "type" "load_ux,load_u")
	103	(eq_attr "cpu" "cell"))
	104	"slot01,fxu_cell+lsu_cell")
	105
	106	;; lha, lhax, lhau, lhaux, lwa, lwax, lwaux, MC, latency unknown
	107	;; 11/7, 11/8, 11/12
	108	(define_insn_reservation "cell-load-ext" 2
	109	(and (eq_attr "type" "load_ext,load_ext_u,load_ext_ux")
	110	(eq_attr "cpu" "cell"))
	111	"slot01,fxu_cell+lsu_cell")
	112
	113	;;lfs,lfsx,lfd,lfdx, 1 cycle
	114	(define_insn_reservation "cell-fpload" 1
	115	(and (eq_attr "type" "fpload")
	116	(eq_attr "cpu" "cell"))
	117	"vsu2_cell+lsu_cell+slot01")
118
119	;; lfsu,lfsux,lfdu,lfdux 1cycle(fpr) 2 cycle(gpr)
120	(define_insn_reservation "cell-fpload-update" 1
121	(and (eq_attr "type" "fpload,fpload_u,fpload_ux")
122	(eq_attr "cpu" "cell"))
123	"fxu_cell+vsu2_cell+lsu_cell+slot01")
124
125	(define_insn_reservation "cell-vecload" 2
126	(and (eq_attr "type" "vecload")
127	(eq_attr "cpu" "cell"))
128	"slot01,vsu2_cell+lsu_cell")
129
130	;;st? stw(MC)
131	(define_insn_reservation "cell-store" 1
132	(and (eq_attr "type" "store")
133	(eq_attr "cpu" "cell"))
134	"lsu_cell+slot01")
135
136	;;stdux, stdu, (hardware breaks into store and add) 2 for update reg
137	(define_insn_reservation "cell-store-update" 1
138	(and (eq_attr "type" "store_ux,store_u")
139	(eq_attr "cpu" "cell"))
140	"fxu_cell+lsu_cell+slot01")
141
142	(define_insn_reservation "cell-fpstore" 1
143	(and (eq_attr "type" "fpstore")
144	(eq_attr "cpu" "cell"))
145	"vsu2_cell+lsu_cell+slot01")
146
147	(define_insn_reservation "cell-fpstore-update" 1
148	(and (eq_attr "type" "fpstore_ux,fpstore_u")
149	(eq_attr "cpu" "cell"))
150	"vsu2_cell+fxu_cell+lsu_cell+slot01")
151
152	(define_insn_reservation "cell-vecstore" 1
153	(and (eq_attr "type" "vecstore")
154	(eq_attr "cpu" "cell"))
155	"vsu2_cell+lsu_cell+slot01")
156
157	;; Integer latency is 2 cycles
158	(define_insn_reservation "cell-integer" 2
159	(and (eq_attr "type" "integer,insert_dword,shift,trap,\
47f67e51	160	var_shift_rotate,cntlz,exts,isel")
d296e02e AP	161	(eq_attr "cpu" "cell"))
	162	"slot01,fxu_cell")
	163
	164	;; Two integer latency is 4 cycles
	165	(define_insn_reservation "cell-two" 4
	166	(and (eq_attr "type" "two")
	167	(eq_attr "cpu" "cell"))
	168	"slot01,fxu_cell,fxu_cell*2")
	169
	170	;; Three integer latency is 6 cycles
	171	(define_insn_reservation "cell-three" 6
	172	(and (eq_attr "type" "three")
	173	(eq_attr "cpu" "cell"))
	174	"slot01,fxu_cell,fxu_cell*4")
	175
	176	;; rlwimi, alter cr0
	177	(define_insn_reservation "cell-insert" 2
	178	(and (eq_attr "type" "insert_word")
	179	(eq_attr "cpu" "cell"))
	180	"slot01,fxu_cell")
	181
	182	;; cmpi, cmpli, cmpla, add, addo, sub, subo, alter cr0
	183	(define_insn_reservation "cell-cmp" 1
	184	(and (eq_attr "type" "cmp")
	185	(eq_attr "cpu" "cell"))
	186	"fxu_cell+slot01")
	187
	188	;; add, addo, sub, subo, alter cr0, rldcli, rlwinm
	189	(define_insn_reservation "cell-fast-cmp" 2
	190	(and (and (eq_attr "type" "fast_compare,delayed_compare,compare,\
	191	var_delayed_compare")
	192	(eq_attr "cpu" "cell"))
	193	(eq_attr "cell_micro" "not"))
	194	"slot01,fxu_cell")
	195
	196	(define_insn_reservation "cell-cmp-microcoded" 9
	197	(and (and (eq_attr "type" "fast_compare,delayed_compare,compare,\
	198	var_delayed_compare")
	199	(eq_attr "cpu" "cell"))
	200	(eq_attr "cell_micro" "always"))
	201	"slot0+slot1,fxu_cell,fxu_cell*7")
	202
	203	;; mulld
	204	(define_insn_reservation "cell-lmul" 15
	205	(and (eq_attr "type" "lmul")
	206	(eq_attr "cpu" "cell"))
	207	"slot1,nonpipeline,nonpipeline*13")
	208
	209	;; mulld. is microcoded
	210	(define_insn_reservation "cell-lmul-cmp" 22
	211	(and (eq_attr "type" "lmul_compare")
	212	(eq_attr "cpu" "cell"))
	213	"slot0+slot1,nonpipeline,nonpipeline*20")
	214
	215	;; mulli, 6 cycles
	216	(define_insn_reservation "cell-imul23" 6
	217	(and (eq_attr "type" "imul2,imul3")
	218	(eq_attr "cpu" "cell"))
	219	"slot1,nonpipeline,nonpipeline*4")
	220
	221	;; mullw, 9
	222	(define_insn_reservation "cell-imul" 9
	223	(and (eq_attr "type" "imul")
	224	(eq_attr "cpu" "cell"))
225	"slot1,nonpipeline,nonpipeline*7")
226
227	;; divide
228	(define_insn_reservation "cell-idiv" 32
229	(and (eq_attr "type" "idiv")
230	(eq_attr "cpu" "cell"))
231	"slot1,nonpipeline,nonpipeline*30")
232
233	(define_insn_reservation "cell-ldiv" 64
234	(and (eq_attr "type" "ldiv")
235	(eq_attr "cpu" "cell"))
236	"slot1,nonpipeline,nonpipeline*62")
237
238	;;mflr and mfctr are pipelined
239	(define_insn_reservation "cell-mfjmpr" 1
240	(and (eq_attr "type" "mfjmpr")
241	(eq_attr "cpu" "cell"))
242	"slot01+bru_cell")
243
244	;;mtlr and mtctr,
245	;;mtspr fully pipelined
246	(define_insn_reservation "cell-mtjmpr" 1
247	(and (eq_attr "type" "mtjmpr")
248	(eq_attr "cpu" "cell"))
249	"bru_cell+slot01")
250
251	;; Branches
252	;; b, ba, bl, bla, unconditional branch always predicts correctly n/a latency
253	;; bcctr, bcctrl, latency 2, actually adjust by be to 4
254	(define_insn_reservation "cell-branch" 1
255	(and (eq_attr "type" "branch")
256	(eq_attr "cpu" "cell"))
257	"bru_cell+slot1")
258
259	(define_insn_reservation "cell-branchreg" 1
260	(and (eq_attr "type" "jmpreg")
261	(eq_attr "cpu" "cell"))
262	"bru_cell+slot1")
263
264	;; cr hazard
265	;; page 90, special cases for CR hazard, only one instr can access cr per cycle
266	;; if insn reads CR following a stwcx, pipeline stall till stwcx finish
267	(define_insn_reservation "cell-crlogical" 1
268	(and (eq_attr "type" "cr_logical,delayed_cr")
269	(eq_attr "cpu" "cell"))
270	"bru_cell+slot01")
271
272	;; mfcrf and mfcr is about 34 cycles and nonpipelined
273	(define_insn_reservation "cell-mfcr" 34
274	(and (eq_attr "type" "mfcrf,mfcr")
275	(eq_attr "cpu" "cell"))
276	"slot1,nonpipeline,nonpipeline*32")
277
278	;; mtcrf (1 field)
279	(define_insn_reservation "cell-mtcrf" 1
280	(and (eq_attr "type" "mtcr")
281	(eq_attr "cpu" "cell"))
282	"fxu_cell+slot01")
283
284	; Basic FP latency is 10 cycles, thoughput is 1/cycle
285	(define_insn_reservation "cell-fp" 10
286	(and (eq_attr "type" "fp,dmul")
287	(eq_attr "cpu" "cell"))
288	"slot01,vsu1_cell,vsu1_cell*8")
289
290	(define_insn_reservation "cell-fpcompare" 1
291	(and (eq_attr "type" "fpcompare")
292	(eq_attr "cpu" "cell"))
293	"vsu1_cell+slot01")
294
295	;; sdiv thoughput 1/74, not pipelined but only in the FPU
296	(define_insn_reservation "cell-sdiv" 74
297	(and (eq_attr "type" "sdiv,ddiv")
298	(eq_attr "cpu" "cell"))
299	"slot1,nonpipeline,nonpipeline*72")
300
301	;; fsqrt thoughput 1/84, not pipelined but only in the FPU
302	(define_insn_reservation "cell-sqrt" 84
303	(and (eq_attr "type" "ssqrt,dsqrt")
304	(eq_attr "cpu" "cell"))
305	"slot1,nonpipeline,nonpipeline*82")
306
307	; VMX
308	(define_insn_reservation "cell-vecsimple" 4
309	(and (eq_attr "type" "vecsimple")
310	(eq_attr "cpu" "cell"))
311	"slot01,vsu1_cell,vsu1_cell*2")
312
313	;; mult, div, madd
314	(define_insn_reservation "cell-veccomplex" 10
315	(and (eq_attr "type" "veccomplex")
316	(eq_attr "cpu" "cell"))
317	"slot01,vsu1_cell,vsu1_cell*8")
318
319	;; TODO: add support for recording instructions
320	(define_insn_reservation "cell-veccmp" 4
321	(and (eq_attr "type" "veccmp")
322	(eq_attr "cpu" "cell"))
323	"slot01,vsu1_cell,vsu1_cell*2")
324
325	(define_insn_reservation "cell-vecfloat" 12
326	(and (eq_attr "type" "vecfloat")
327	(eq_attr "cpu" "cell"))
328	"slot01,vsu1_cell,vsu1_cell*10")
329
330	(define_insn_reservation "cell-vecperm" 4
331	(and (eq_attr "type" "vecperm")
332	(eq_attr "cpu" "cell"))
333	"slot01,vsu2_cell,vsu2_cell*2")
334
335	;; New for 4.2, syncs
336
337	(define_insn_reservation "cell-sync" 11
338	(and (eq_attr "type" "sync")
339	(eq_attr "cpu" "cell"))
340	"slot01,lsu_cell,lsu_cell*9")
341
342	(define_insn_reservation "cell-isync" 11
343	(and (eq_attr "type" "isync")
344	(eq_attr "cpu" "cell"))
345	"slot01,lsu_cell,lsu_cell*9")
346
347	(define_insn_reservation "cell-load_l" 11
348	(and (eq_attr "type" "load_l")
349	(eq_attr "cpu" "cell"))
350	"slot01,lsu_cell,lsu_cell*9")
351
352	(define_insn_reservation "cell-store_c" 11
353	(and (eq_attr "type" "store_c")
354	(eq_attr "cpu" "cell"))
355	"slot01,lsu_cell,lsu_cell*9")
356
357	;; RAW register dependency
358
359	;; addi r3, r3, 1
360	;; lw r4,offset(r3)
361	;; there are 5 cycle deplay for r3 bypassing
362	;; there are 5 cycle delay for a dependent load after a load
363	(define_bypass 5 "cell-integer" "cell-load")
364	(define_bypass 5 "cell-integer" "cell-load-ext")
365	(define_bypass 5 "cell-load,cell-load-ext" "cell-load,cell-load-ext")
366
367	;; there is a 6 cycle delay after a fp compare until you can use the cr.
368	(define_bypass 6 "cell-fpcompare" "cell-branch,cell-branchreg,cell-mfcr,cell-crlogical")
369
370	;; VXU float RAW
371	(define_bypass 11 "cell-vecfloat" "cell-vecfloat")
372
373	;; VXU and FPU
374	(define_bypass 6 "cell-veccomplex" "cell-vecsimple")
375	;;(define_bypass 6 "cell-veccompare" "cell-branch,cell-branchreg")
376	(define_bypass 3 "cell-vecfloat" "cell-veccomplex")
377	; this is not correct,
378	;; this is a stall in general and not dependent on result
379	(define_bypass 13 "cell-vecstore" "cell-fpstore")
2f8e468b	380	; this is not correct, this can never be true, not dependent on result
d296e02e AP	381	(define_bypass 7 "cell-fp" "cell-fpload")
	382	;; vsu1 should avoid writing to the same target register as vsu2 insn
	383	;; within 12 cycles.
	384
	385	;; WAW hazard
	386
	387	;; the target of VSU estimate should not be reused within 10 dispatch groups
	388	;; the target of VSU float should not be reused within 8 dispatch groups
	389	;; the target of VSU complex should not be reused within 5 dispatch groups
	390	;; FP LOAD should not reuse an FPU Arithmetic target with 6 dispatch gropus
	391
	392	;; mtctr-bcctr/bcctrl, branch target ctr register shadow update at
	393	;; ex4 stage(10 cycles)
	394	(define_bypass 10 "cell-mtjmpr" "cell-branchreg")
	395
	396	;;Things are not simulated:
	397	;; update instruction, update address gpr are not simulated
2f8e468b	398	;; vrefp, vrsqrtefp have latency(14), currently simulated as 12 cycle float
d296e02e AP	399	;; insns
d296e02e AP	400