;; [thirdparty/gcc.git] / gcc / config / rs6000 / cell.md

;; Scheduling description for cell processor.
;; Copyright (C) 2001-2015 Free Software Foundation, Inc.
;; Contributed by Sony Computer Entertainment, Inc.,


;; This file is free software; you can redistribute it and/or modify it under
;; the terms of the GNU General Public License as published by the Free
;; Software Foundation; either version 3 of the License, or (at your option) 
;; any later version.

;; This file is distributed in the hope that it will be useful, but WITHOUT
;; ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
;; FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
;; for more details.

;; You should have received a copy of the GNU General Public License
;; along with GCC; see the file COPYING3.  If not see
;; <http://www.gnu.org/licenses/>.

;; Sources: BE BOOK4 (/sfs/enc/doc/PPU_BookIV_DD3.0_latest.pdf)

;; BE Architecture *DD3.0 and DD3.1*
;; This file simulates the back end of the PPU pipeline (manual p. 24).
;; See manual p. 27 for stall and flush points.
;; IU, XU, VSU: the dispatcher decodes and dispatches 2 insns per cycle in
;;  program order; the grouped addresses are aligned to 8.
;; This file only simulates the single-thread situation.
;; XU executes all fixed point insns(3 units, a simple alu, a complex unit,
;;   and load/store unit)
;; VSU executes all scalar floating points insn(a float unit),
;;   VMX insns(VMX unit, 4 sub units, simple, permute, complex, floating point)

;; Dual issue combination

;;	FXU	LSU	BR 	        VMX	               VMX
;;                             (sx,cx,vsu_fp,fp_arith)    (perm,vsu_ls,fp_ls)
;;FXU	X
;;LSU		X               	X               	X	
;;BR			X
;;VMX(sx,cx,vsu_fp,fp_arth)		X
;;VMX(perm,vsu_ls, fp_ls)					X
;;    X are illegal combination.

;; Dual issue exceptions:
;;(1) non-pipelined FXU instr in slot 0
;;(2) non-pipelined FPU inst in slot 0
;; CSI instr (context-synchronizing insn)
;; Microcode insn

;; BRU unit: bru(none register stall), bru_cr(cr register stall)
;; VSU unit: vus(vmx simple), vup(vmx permute), vuc(vmx complex),
;;  vuf(vmx float), fpu(floats). fpu_div is hypothetical, it is for
;;  nonpipelined simulation
;; Microcoded insns stall at least 7 cycles to get the first instr from ROM;
;;  microcoded instructions are not dual issued.

;; slot0 is older than slot1
;; non-pipelined insn need to be in slot1 to avoid 1cycle stall

;; There are different stall points:
;; IB2: only one thread is stalled if the stall happens here, so try to stall
;;  here as much as we can.
;;  condition(1) insert nop, OR and ORI instruction form
;;  condition(2) flush happens, in case of: RAW, WAW, D-ERAT miss, or
;;   CR0-access while stdcx, or stwcx
;; IS2 stall: see page 91 for details
;; VQ8 stall
;; IS2 stall can be activated by VQ8 stall and trying to issue a vsu instr to
;;  the vsu issue queue

;; Disabled alternative: a single automaton holding all execution units.
;;(define_automaton "cellxu")

;;(define_cpu_unit "fxu_cell,lsu_cell,bru_cell,vsu1_cell,vsu2_cell" "cellxu")

;; ndfa
;; Four automata are declared; every cpu unit below is assigned to one of them.
(define_automaton "cellxu,cellvsu,cellbru,cell_mis")

;; Execution units: fxu_cell and lsu_cell live in "cellxu", bru_cell in
;; "cellbru", and the two VSU units in "cellvsu".
(define_cpu_unit "fxu_cell,lsu_cell" "cellxu")
(define_cpu_unit "bru_cell" "cellbru")
(define_cpu_unit "vsu1_cell,vsu2_cell" "cellvsu")

;; The two issue slots of a dispatch group (slot0 is older than slot1).
(define_cpu_unit "slot0,slot1" "cell_mis")

;; Per the GCC pipeline-description docs, this lets slot0 be reserved only
;; while slot1 is not reserved; presumably it keeps slot0 from being filled
;; after slot1 within one cycle -- TODO confirm intent.
(absence_set "slot0" "slot1")

;; "nonpipeline" reserves fxu_cell, lsu_cell, vsu1_cell and vsu2_cell in the
;; same cycle (bru_cell and the issue slots are not part of it).
(define_reservation "nonpipeline" "fxu_cell+lsu_cell+vsu1_cell+vsu2_cell")
;; "slot01" takes either issue slot.
(define_reservation "slot01" "slot0|slot1")


;; Load/store
;; lmw, lswi, lswx are only generated when optimizing for space, MC;
;;   these instrs are not simulated.
;; Plain loads: type "load" with sign_extend "no" and update "no".
;; Latency 2.  Cycle 1 takes either issue slot; cycle 2 takes lsu_cell.
(define_insn_reservation "cell-load" 2
  (and (eq_attr "type" "load")
       (eq_attr "sign_extend" "no")
       (eq_attr "update" "no")
       (eq_attr "cpu" "cell"))
  "slot01,lsu_cell")

;; ldux, ldu, lbzux, lbzu: hardware breaks them down into two instrs,
;;  if with 32bytes alignment, CMC
;; Update-form loads without sign extension.  Latency 2.  Cycle 1 takes
;; either issue slot; cycle 2 takes fxu_cell and lsu_cell together.
(define_insn_reservation "cell-load-ux" 2
  (and (eq_attr "type" "load")
       (eq_attr "sign_extend" "no")
       (eq_attr "update" "yes")
       (eq_attr "cpu" "cell"))
  "slot01,fxu_cell+lsu_cell")

;; lha, lhax, lhau, lhaux, lwa, lwax, lwaux, MC, latency unknown
;;   11/7, 11/8, 11/12
;; Sign-extending loads (any "update" value, since none is tested here).
;; Modelled with latency 2: cycle 1 takes either issue slot, cycle 2 takes
;; fxu_cell and lsu_cell together.
(define_insn_reservation "cell-load-ext" 2
  (and (eq_attr "type" "load")
       (eq_attr "sign_extend" "yes")
       (eq_attr "cpu" "cell"))
  "slot01,fxu_cell+lsu_cell")

;; lfs, lfsx, lfd, lfdx: 1 cycle
;; Non-update FP loads.  Latency 1.  vsu2_cell, lsu_cell and one issue slot
;; are all reserved in the same single cycle.
(define_insn_reservation "cell-fpload" 1
  (and (eq_attr "type" "fpload")
       (eq_attr "update" "no")
       (eq_attr "cpu" "cell"))
  "vsu2_cell+lsu_cell+slot01")

;; lfsu, lfsux, lfdu, lfdux: 1 cycle (fpr), 2 cycles (gpr)
;; Update-form FP loads.  Modelled with a single latency of 1.  fxu_cell,
;; vsu2_cell, lsu_cell and one issue slot are reserved in the same cycle.
(define_insn_reservation "cell-fpload-update" 1
  (and (eq_attr "type" "fpload")
       (eq_attr "update" "yes")
       (eq_attr "cpu" "cell"))
  "fxu_cell+vsu2_cell+lsu_cell+slot01")

;; Vector loads.  Latency 2.  Cycle 1 takes either issue slot; cycle 2 takes
;; vsu2_cell and lsu_cell together.
(define_insn_reservation "cell-vecload" 2
  (and (eq_attr "type" "vecload")
       (eq_attr "cpu" "cell"))
  "slot01,vsu2_cell+lsu_cell")

;;st? stw(MC)
;; Non-update integer stores.  Latency 1.  lsu_cell and one issue slot are
;; reserved in the same single cycle.
(define_insn_reservation "cell-store" 1
  (and (eq_attr "type" "store")
       (eq_attr "update" "no")
       (eq_attr "cpu" "cell"))
  "lsu_cell+slot01")

;; stdux, stdu (hardware breaks them into store and add); 2 for update reg
;; Update-form integer stores.  Modelled with latency 1.  fxu_cell, lsu_cell
;; and one issue slot are reserved in the same single cycle.
(define_insn_reservation "cell-store-update" 1
  (and (eq_attr "type" "store")
       (eq_attr "update" "yes")
       (eq_attr "cpu" "cell"))
  "fxu_cell+lsu_cell+slot01")

;; Non-update FP stores.  Latency 1.  vsu2_cell, lsu_cell and one issue slot
;; are reserved in the same single cycle.
(define_insn_reservation "cell-fpstore" 1
  (and (eq_attr "type" "fpstore")
       (eq_attr "update" "no")
       (eq_attr "cpu" "cell"))
  "vsu2_cell+lsu_cell+slot01")

;; Update-form FP stores.  Latency 1.  vsu2_cell, fxu_cell, lsu_cell and one
;; issue slot are reserved in the same single cycle.
(define_insn_reservation "cell-fpstore-update" 1
  (and (eq_attr "type" "fpstore")
       (eq_attr "update" "yes")
       (eq_attr "cpu" "cell"))
  "vsu2_cell+fxu_cell+lsu_cell+slot01")

;; Vector stores.  Latency 1.  vsu2_cell, lsu_cell and one issue slot are
;; reserved in the same single cycle.
(define_insn_reservation "cell-vecstore" 1
  (and (eq_attr "type" "vecstore")
       (eq_attr "cpu" "cell"))
  "vsu2_cell+lsu_cell+slot01")

;; Integer latency is 2 cycles
;; Matches any of:
;;   - type integer, trap, cntlz or isel;
;;   - type add, logical, shift or exts with dot "no";
;;   - type insert with size "64".
;; Cycle 1 takes either issue slot; cycle 2 takes fxu_cell.
(define_insn_reservation "cell-integer" 2
  (and (ior (eq_attr "type" "integer,trap,cntlz,isel")
	    (and (eq_attr "type" "add,logical,shift,exts")
		 (eq_attr "dot" "no"))
	    (and (eq_attr "type" "insert")
		 (eq_attr "size" "64")))
       (eq_attr "cpu" "cell"))
  "slot01,fxu_cell")

;; Two integer latency is 4 cycles
;; Type "two".  Cycle 1 takes either issue slot; fxu_cell is then held for
;; the next 3 consecutive cycles (1 + 2 repeats).
(define_insn_reservation "cell-two" 4
  (and (eq_attr "type" "two")
       (eq_attr "cpu" "cell"))
  "slot01,fxu_cell,fxu_cell*2")

;; Three integer latency is 6 cycles
;; Type "three".  Cycle 1 takes either issue slot; fxu_cell is then held for
;; the next 5 consecutive cycles (1 + 4 repeats).
(define_insn_reservation "cell-three" 6
  (and (eq_attr "type" "three")
       (eq_attr "cpu" "cell"))
  "slot01,fxu_cell,fxu_cell*4")

;; rlwimi, alter cr0
;; Type "insert" with size "32" (size "64" inserts are matched by
;; "cell-integer").  Latency 2.  Cycle 1 takes either issue slot; cycle 2
;; takes fxu_cell.
(define_insn_reservation "cell-insert" 2
  (and (eq_attr "type" "insert")
       (eq_attr "size" "32")
       (eq_attr "cpu" "cell"))
 "slot01,fxu_cell")

;; cmpi, cmpli, cmpla, add, addo, sub, subo, alter cr0
;; Type "cmp".  Latency 1.  fxu_cell and one issue slot are reserved in the
;; same single cycle.
(define_insn_reservation "cell-cmp" 1
  (and (eq_attr "type" "cmp")
       (eq_attr "cpu" "cell"))
  "fxu_cell+slot01")

;; add, addo, sub, subo, alter cr0, rldcli, rlwinm
;; Dot forms (dot "yes") of type add, logical, shift or exts whose cell_micro
;; attribute is "not".  Latency 2.  Cycle 1 takes either issue slot; cycle 2
;; takes fxu_cell.
;; NOTE(review): this file only matches cell_micro "not" (here) and "always"
;; ("cell-cmp-microcoded"); if the attribute has other values, such dot forms
;; get no reservation from this file -- confirm against the attribute
;; definition.
(define_insn_reservation "cell-fast-cmp" 2
  (and (eq_attr "type" "add,logical,shift,exts")
       (eq_attr "dot" "yes")
       (eq_attr "cpu" "cell")
       (eq_attr "cell_micro" "not"))
  "slot01,fxu_cell")

;; Dot forms (dot "yes") of type add, logical, shift or exts whose cell_micro
;; attribute is "always".  Latency 9.  Cycle 1 takes BOTH issue slots, so
;; nothing else issues in that cycle; fxu_cell is then held for the next 8
;; consecutive cycles (1 + 7 repeats).
;; NOTE(review): other cell_micro values, if any exist, are matched neither
;; here nor by "cell-fast-cmp" -- confirm against the attribute definition.
(define_insn_reservation "cell-cmp-microcoded" 9
  (and (eq_attr "type" "add,logical,shift,exts")
       (eq_attr "dot" "yes")
       (eq_attr "cpu" "cell")
       (eq_attr "cell_micro" "always"))
  "slot0+slot1,fxu_cell,fxu_cell*7")

;; mulld
;; Type "mul", size "64", dot "no".  Latency 15.  Cycle 1 takes slot1
;; specifically; "nonpipeline" (fxu_cell+lsu_cell+vsu1_cell+vsu2_cell) is then
;; held for the next 14 consecutive cycles (1 + 13 repeats).
(define_insn_reservation "cell-lmul" 15
  (and (eq_attr "type" "mul")
       (eq_attr "dot" "no")
       (eq_attr "size" "64")
       (eq_attr "cpu" "cell"))
  "slot1,nonpipeline,nonpipeline*13")

;; mulld. is microcoded
;; Type "mul", size "64", dot "yes".  Latency 22.  Cycle 1 takes BOTH issue
;; slots; "nonpipeline" (fxu_cell+lsu_cell+vsu1_cell+vsu2_cell) is then held
;; for the next 21 consecutive cycles (1 + 20 repeats).
(define_insn_reservation "cell-lmul-cmp" 22
  (and (eq_attr "type" "mul")
       (eq_attr "dot" "yes")
       (eq_attr "size" "64")
       (eq_attr "cpu" "cell"))
  "slot0+slot1,nonpipeline,nonpipeline*20")

;; mulli, 6 cycles
;; Type "mul" with size "8" or "16" (no test on "dot").  Latency 6.  Cycle 1
;; takes slot1 specifically; "nonpipeline" is then held for the next 5
;; consecutive cycles (1 + 4 repeats).
(define_insn_reservation "cell-imul23" 6
  (and (eq_attr "type" "mul")
       (eq_attr "size" "8,16")
       (eq_attr "cpu" "cell"))
  "slot1,nonpipeline,nonpipeline*4")

;; mullw, 9
;; Type "mul", size "32", dot "no".  Latency 9.  Cycle 1 takes slot1
;; specifically; "nonpipeline" is then held for the next 8 consecutive cycles
;; (1 + 7 repeats).
;; NOTE(review): no reservation visible in this file matches type "mul" with
;; size "32" and dot "yes" -- confirm whether that combination can occur and
;; should be modelled.
(define_insn_reservation "cell-imul" 9
  (and (eq_attr "type" "mul")
       (eq_attr "dot" "no")
       (eq_attr "size" "32")
       (eq_attr "cpu" "cell"))
  "slot1,nonpipeline,nonpipeline*7")
 
;; divide
;; Type "div", size "32".  Latency 32.  Cycle 1 takes slot1 specifically;
;; "nonpipeline" is then held for the next 31 consecutive cycles
;; (1 + 30 repeats).
(define_insn_reservation "cell-idiv" 32
  (and (eq_attr "type" "div")
       (eq_attr "size" "32")
       (eq_attr "cpu" "cell"))
  "slot1,nonpipeline,nonpipeline*30")

;; Type "div", size "64".  Latency 64.  Cycle 1 takes slot1 specifically;
;; "nonpipeline" is then held for the next 63 consecutive cycles
;; (1 + 62 repeats).
(define_insn_reservation "cell-ldiv" 64
  (and (eq_attr "type" "div")
       (eq_attr "size" "64")
       (eq_attr "cpu" "cell"))
  "slot1,nonpipeline,nonpipeline*62")

;; mflr and mfctr are pipelined
;; Type "mfjmpr".  Latency 1.  One issue slot and bru_cell are reserved in
;; the same single cycle.
(define_insn_reservation "cell-mfjmpr" 1
  (and (eq_attr "type" "mfjmpr")
       (eq_attr "cpu" "cell"))
  "slot01+bru_cell")

;; mtlr and mtctr,
;; mtspr fully pipelined
;; Type "mtjmpr".  Latency 1.  bru_cell and one issue slot are reserved in
;; the same single cycle.  A separate define_bypass in this file raises the
;; latency towards "cell-branchreg" to 10.
(define_insn_reservation "cell-mtjmpr" 1
 (and (eq_attr "type" "mtjmpr")
       (eq_attr "cpu" "cell"))
  "bru_cell+slot01")

;; Branches
;; b, ba, bl, bla, unconditional branch always predicts correctly n/a latency
;; bcctr, bcctrl, latency 2, actually adjust by be to 4
;; Type "branch".  Modelled with latency 1.  bru_cell and slot1 (specifically,
;; never slot0) are reserved in the same single cycle.
(define_insn_reservation "cell-branch" 1
  (and (eq_attr "type" "branch")
       (eq_attr "cpu" "cell"))
  "bru_cell+slot1")

;; Type "jmpreg".  Latency 1.  bru_cell and slot1 (specifically, never slot0)
;; are reserved in the same single cycle.
(define_insn_reservation "cell-branchreg" 1
  (and (eq_attr "type" "jmpreg")
       (eq_attr "cpu" "cell"))
  "bru_cell+slot1")

;; cr hazard
;; page 90, special cases for CR hazard: only one instr can access cr per cycle
;; if an insn reads CR following a stwcx, the pipeline stalls till stwcx finishes
;; Types "cr_logical" and "delayed_cr".  Latency 1.  bru_cell and one issue
;; slot are reserved in the same single cycle.
(define_insn_reservation "cell-crlogical" 1
  (and (eq_attr "type" "cr_logical,delayed_cr")
       (eq_attr "cpu" "cell"))
  "bru_cell+slot01")

;; mfcrf and mfcr take about 34 cycles and are nonpipelined
;; Types "mfcrf" and "mfcr".  Latency 34.  Cycle 1 takes slot1 specifically;
;; "nonpipeline" is then held for the next 33 consecutive cycles
;; (1 + 32 repeats).
(define_insn_reservation "cell-mfcr" 34
  (and (eq_attr "type" "mfcrf,mfcr")
       (eq_attr "cpu" "cell"))
   "slot1,nonpipeline,nonpipeline*32")

;; mtcrf (1 field)
;; Type "mtcr".  Latency 1.  fxu_cell and one issue slot are reserved in the
;; same single cycle.
(define_insn_reservation "cell-mtcrf" 1
  (and (eq_attr "type" "mtcr")
       (eq_attr "cpu" "cell"))
  "fxu_cell+slot01")

; Basic FP latency is 10 cycles, throughput is 1/cycle
;; Types "fp" and "dmul".  Cycle 1 takes either issue slot; vsu1_cell is then
;; held for the next 9 consecutive cycles (1 + 8 repeats).
;; NOTE(review): holding vsu1_cell for 9 cycles looks at odds with the stated
;; 1/cycle throughput -- confirm whether the long reservation is intentional.
(define_insn_reservation "cell-fp" 10
  (and (eq_attr "type" "fp,dmul")
       (eq_attr "cpu" "cell"))
  "slot01,vsu1_cell,vsu1_cell*8")

;; Type "fpcompare".  Latency 1.  vsu1_cell and one issue slot are reserved
;; in the same single cycle.  A separate define_bypass in this file raises the
;; latency towards CR consumers to 6.
(define_insn_reservation "cell-fpcompare" 1
  (and (eq_attr "type" "fpcompare")
       (eq_attr "cpu" "cell"))
  "vsu1_cell+slot01")

;; sdiv throughput 1/74, not pipelined but only in the FPU
;; Types "sdiv" and "ddiv".  Latency 74.  Cycle 1 takes slot1 specifically;
;; "nonpipeline" is then held for the next 73 consecutive cycles
;; (1 + 72 repeats).
;; NOTE(review): "nonpipeline" reserves fxu_cell, lsu_cell, vsu1_cell and
;; vsu2_cell, which is broader than "only in the FPU" -- confirm intent.
(define_insn_reservation "cell-sdiv" 74
  (and (eq_attr "type" "sdiv,ddiv")
       (eq_attr "cpu" "cell"))
  "slot1,nonpipeline,nonpipeline*72")

;; fsqrt throughput 1/84, not pipelined but only in the FPU
;; Types "ssqrt" and "dsqrt".  Latency 84.  Cycle 1 takes slot1 specifically;
;; "nonpipeline" is then held for the next 83 consecutive cycles
;; (1 + 82 repeats).
;; NOTE(review): "nonpipeline" reserves fxu_cell, lsu_cell, vsu1_cell and
;; vsu2_cell, which is broader than "only in the FPU" -- confirm intent.
(define_insn_reservation "cell-sqrt" 84
  (and (eq_attr "type" "ssqrt,dsqrt")
       (eq_attr "cpu" "cell"))
  "slot1,nonpipeline,nonpipeline*82")

; VMX
;; Type "vecsimple".  Latency 4.  Cycle 1 takes either issue slot; vsu1_cell
;; is then held for the next 3 consecutive cycles (1 + 2 repeats).
(define_insn_reservation "cell-vecsimple" 4
  (and (eq_attr "type" "vecsimple")
       (eq_attr "cpu" "cell"))
  "slot01,vsu1_cell,vsu1_cell*2")

;; mult, div, madd
;; Type "veccomplex".  Latency 10.  Cycle 1 takes either issue slot;
;; vsu1_cell is then held for the next 9 consecutive cycles (1 + 8 repeats).
(define_insn_reservation "cell-veccomplex" 10
  (and (eq_attr "type" "veccomplex")
       (eq_attr "cpu" "cell"))
  "slot01,vsu1_cell,vsu1_cell*8")

;; TODO: add support for recording instructions
;; Type "veccmp".  Latency 4.  Cycle 1 takes either issue slot; vsu1_cell is
;; then held for the next 3 consecutive cycles (1 + 2 repeats).
(define_insn_reservation "cell-veccmp" 4
  (and (eq_attr "type" "veccmp")
       (eq_attr "cpu" "cell"))
  "slot01,vsu1_cell,vsu1_cell*2")

;; Type "vecfloat".  Latency 12.  Cycle 1 takes either issue slot; vsu1_cell
;; is then held for the next 11 consecutive cycles (1 + 10 repeats).
(define_insn_reservation "cell-vecfloat" 12
  (and (eq_attr "type" "vecfloat")
       (eq_attr "cpu" "cell"))
  "slot01,vsu1_cell,vsu1_cell*10")

;; Type "vecperm".  Latency 4.  Cycle 1 takes either issue slot; vsu2_cell
;; (not vsu1_cell) is then held for the next 3 consecutive cycles
;; (1 + 2 repeats).
(define_insn_reservation "cell-vecperm" 4
  (and (eq_attr "type" "vecperm")
       (eq_attr "cpu" "cell"))
  "slot01,vsu2_cell,vsu2_cell*2")

;; New for 4.2, syncs

;; Type "sync".  Latency 11.  Cycle 1 takes either issue slot; lsu_cell is
;; then held for the next 10 consecutive cycles (1 + 9 repeats).
(define_insn_reservation "cell-sync" 11
  (and (eq_attr "type" "sync")
       (eq_attr "cpu" "cell"))
  "slot01,lsu_cell,lsu_cell*9")

;; Type "isync".  Latency 11.  Cycle 1 takes either issue slot; lsu_cell is
;; then held for the next 10 consecutive cycles (1 + 9 repeats).
(define_insn_reservation "cell-isync" 11
  (and (eq_attr "type" "isync")
       (eq_attr "cpu" "cell"))
  "slot01,lsu_cell,lsu_cell*9")

;; Type "load_l".  Latency 11.  Cycle 1 takes either issue slot; lsu_cell is
;; then held for the next 10 consecutive cycles (1 + 9 repeats).
(define_insn_reservation "cell-load_l" 11
  (and (eq_attr "type" "load_l")
       (eq_attr "cpu" "cell"))
  "slot01,lsu_cell,lsu_cell*9")

;; Type "store_c".  Latency 11.  Cycle 1 takes either issue slot; lsu_cell is
;; then held for the next 10 consecutive cycles (1 + 9 repeats).
(define_insn_reservation "cell-store_c" 11
  (and (eq_attr "type" "store_c")
       (eq_attr "cpu" "cell"))
  "slot01,lsu_cell,lsu_cell*9")

;; RAW register dependency

;; addi r3, r3, 1
;; lw r4,offset(r3)
;; there is a 5 cycle delay for r3 bypassing
;; there is a 5 cycle delay for a dependent load after a load
;; Each define_bypass below replaces the producer's default latency (2 for
;; "cell-integer", "cell-load" and "cell-load-ext") with 5 when the consumer
;; is one of the named load reservations.
(define_bypass 5 "cell-integer" "cell-load")
(define_bypass 5 "cell-integer" "cell-load-ext")
(define_bypass 5 "cell-load,cell-load-ext" "cell-load,cell-load-ext")

;; there is a 6 cycle delay after a fp compare until you can use the cr.
;; Overrides the default latency of 1 for "cell-fpcompare" when the consumer
;; is a branch, register branch, mfcr/mfcrf or CR-logical reservation.
(define_bypass 6 "cell-fpcompare" "cell-branch,cell-branchreg,cell-mfcr,cell-crlogical")

;; VXU float RAW
;; A "cell-vecfloat" result consumed by another "cell-vecfloat" insn uses
;; latency 11 instead of the default 12.
(define_bypass 11 "cell-vecfloat" "cell-vecfloat")

;; VXU and FPU
;; "cell-veccomplex" -> "cell-vecsimple": latency 6 instead of the default 10.
(define_bypass 6 "cell-veccomplex" "cell-vecsimple")
;; Disabled.  NOTE(review): no reservation named "cell-veccompare" exists in
;; this file; the vector compare reservation is "cell-veccmp".
;;(define_bypass 6 "cell-veccompare" "cell-branch,cell-branchreg")
;; "cell-vecfloat" -> "cell-veccomplex": latency 3 instead of the default 12.
(define_bypass 3 "cell-vecfloat" "cell-veccomplex")
; this is not correct,
;;  this is a stall in general and not dependent on result
;; "cell-vecstore" -> "cell-fpstore": latency 13 instead of the default 1.
(define_bypass 13 "cell-vecstore" "cell-fpstore")
; this is not correct, this can never be true, not dependent on result
;; "cell-fp" -> "cell-fpload": latency 7 instead of the default 10.
(define_bypass 7 "cell-fp" "cell-fpload")
;; vsu1 should avoid writing to the same target register as a vsu2 insn
;;   within 12 cycles.

;; WAW hazard

;; the target of VSU estimate should not be reused within 10 dispatch groups
;; the target of VSU float should not be reused within 8 dispatch groups
;; the target of VSU complex should not be reused within 5 dispatch groups
;; FP LOAD should not reuse an FPU Arithmetic target within 6 dispatch groups

;; mtctr-bcctr/bcctrl, branch target ctr register shadow update at
;;  ex4 stage(10 cycles)
;; "cell-mtjmpr" -> "cell-branchreg": latency 10 instead of the default 1.
(define_bypass 10 "cell-mtjmpr" "cell-branchreg")

;; Things that are not simulated:
;; update instructions: the update of the address gpr is not simulated
;; vrefp, vrsqrtefp have latency(14), currently simulated as 12 cycle float
;;  insns
Commit	Line	Data
d296e02e	1	;; Scheduling description for cell processor.
5624e564	2	;; Copyright (C) 2001-2015 Free Software Foundation, Inc.
d296e02e AP	3	;; Contributed by Sony Computer Entertainment, Inc.,
	4
	5
	6	;; This file is free software; you can redistribute it and/or modify it under
	7	;; the terms of the GNU General Public License as published by the Free
2f83c7d6	8	;; Software Foundation; either version 3 of the License, or (at your option)
d296e02e AP	9	;; any later version.
	10
	11	;; This file is distributed in the hope that it will be useful, but WITHOUT
	12	;; ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
	13	;; FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
	14	;; for more details.
	15
	16	;; You should have received a copy of the GNU General Public License
2f83c7d6 NC	17	;; along with GCC; see the file COPYING3. If not see
2f83c7d6 NC	18	;; <http://www.gnu.org/licenses/>.
d296e02e AP	19
	20	;; Sources: BE BOOK4 (/sfs/enc/doc/PPU_BookIV_DD3.0_latest.pdf)
	21
2f8e468b	22	;; BE Architecture DD3.0 and DD3.1
d296e02e AP	23	;; This file simulate PPU processor unit backend of pipeline, maualP24.
d296e02e AP	24	;; manual P27, stall and flush points
2f8e468b	25	;; IU, XU, VSU, dispatcher decodes and dispatch 2 insns per cycle in program
9f5ed61a	26	;; order, the grouped address are aligned by 8
d296e02e AP	27	;; This file only simulate one thread situation
	28	;; XU executes all fixed point insns(3 units, a simple alu, a complex unit,
	29	;; and load/store unit)
	30	;; VSU executes all scalar floating points insn(a float unit),
	31	;; VMX insns(VMX unit, 4 sub units, simple, permute, complex, floating point)
	32
	33	;; Dual issue combination
	34
	35	;; FXU LSU BR VMX VMX
	36	;; (sx,cx,vsu_fp,fp_arith) (perm,vsu_ls,fp_ls)
	37	;;FXU X
	38	;;LSU X X X
	39	;;BR X
	40	;;VMX(sx,cx,vsu_fp,fp_arth) X
	41	;;VMX(perm,vsu_ls, fp_ls) X
	42	;; X are illegal combination.
	43
2f8e468b	44	;; Dual issue exceptions:
d296e02e AP	45	;;(1) nop-pipelined FXU instr in slot 0
	46	;;(2) non-pipelined FPU inst in slot 0
	47	;; CSI instr(contex-synchronizing insn)
	48	;; Microcode insn
	49
	50	;; BRU unit: bru(none register stall), bru_cr(cr register stall)
	51	;; VSU unit: vus(vmx simple), vup(vmx permute), vuc(vmx complex),
2f8e468b	52	;; vuf(vmx float), fpu(floats). fpu_div is hypothetical, it is for
d296e02e AP	53	;; nonpipelined simulation
	54	;; micr insns will stall at least 7 cycles to get the first instr from ROM,
	55	;; micro instructions are not dual issued.
	56
	57	;; slot0 is older than slot1
	58	;; non-pipelined insn need to be in slot1 to avoid 1cycle stall
	59
	60	;; There different stall point
	61	;; IB2, only stall one thread if stall here, so try to stall here as much as
	62	;; we can
	63	;; condition(1) insert nop, OR and ORI instruction form
	64	;; condition(2) flush happens, in case of: RAW, WAW, D-ERAT miss, or
	65	;; CR0-access while stdcx, or stwcx
	66	;; IS2 stall ;; Page91 for details
	67	;; VQ8 stall
	68	;; IS2 stall can be activated by VQ8 stall and trying to issue a vsu instr to
	69	;; the vsu issue queue
	70
	71	;;(define_automaton "cellxu")
	72
	73	;;(define_cpu_unit "fxu_cell,lsu_cell,bru_cell,vsu1_cell,vsu2_cell" "cellxu")
	74
	75	;; ndfa
	76	(define_automaton "cellxu,cellvsu,cellbru,cell_mis")
	77
	78	(define_cpu_unit "fxu_cell,lsu_cell" "cellxu")
	79	(define_cpu_unit "bru_cell" "cellbru")
	80	(define_cpu_unit "vsu1_cell,vsu2_cell" "cellvsu")
	81
	82	(define_cpu_unit "slot0,slot1" "cell_mis")
	83
	84	(absence_set "slot0" "slot1")
	85
	86	(define_reservation "nonpipeline" "fxu_cell+lsu_cell+vsu1_cell+vsu2_cell")
	87	(define_reservation "slot01" "slot0\|slot1")
	88
	89
	90	;; Load/store
	91	;; lmw, lswi, lswx are only generated for optimize for space, MC,
	92	;; these instr are not simulated
	93	(define_insn_reservation "cell-load" 2
	94	(and (eq_attr "type" "load")
d839f53b SB	95	(eq_attr "sign_extend" "no")
d839f53b SB	96	(eq_attr "update" "no")
d296e02e AP	97	(eq_attr "cpu" "cell"))
	98	"slot01,lsu_cell")
	99
	100	;; ldux, ldu, lbzux, lbzu, hardware breaks it down to two instrs,
	101	;; if with 32bytes alignment, CMC
	102	(define_insn_reservation "cell-load-ux" 2
d839f53b SB	103	(and (eq_attr "type" "load")
	104	(eq_attr "sign_extend" "no")
	105	(eq_attr "update" "yes")
d296e02e AP	106	(eq_attr "cpu" "cell"))
	107	"slot01,fxu_cell+lsu_cell")
	108
	109	;; lha, lhax, lhau, lhaux, lwa, lwax, lwaux, MC, latency unknown
	110	;; 11/7, 11/8, 11/12
	111	(define_insn_reservation "cell-load-ext" 2
d839f53b SB	112	(and (eq_attr "type" "load")
	113	(eq_attr "sign_extend" "yes")
	114	(eq_attr "cpu" "cell"))
d296e02e AP	115	"slot01,fxu_cell+lsu_cell")
	116
	117	;;lfs,lfsx,lfd,lfdx, 1 cycle
	118	(define_insn_reservation "cell-fpload" 1
	119	(and (eq_attr "type" "fpload")
d839f53b	120	(eq_attr "update" "no")
d296e02e AP	121	(eq_attr "cpu" "cell"))
	122	"vsu2_cell+lsu_cell+slot01")
	123
	124	;; lfsu,lfsux,lfdu,lfdux 1cycle(fpr) 2 cycle(gpr)
	125	(define_insn_reservation "cell-fpload-update" 1
d839f53b SB	126	(and (eq_attr "type" "fpload")
d839f53b SB	127	(eq_attr "update" "yes")
d296e02e AP	128	(eq_attr "cpu" "cell"))
	129	"fxu_cell+vsu2_cell+lsu_cell+slot01")
	130
	131	(define_insn_reservation "cell-vecload" 2
	132	(and (eq_attr "type" "vecload")
	133	(eq_attr "cpu" "cell"))
	134	"slot01,vsu2_cell+lsu_cell")
	135
	136	;;st? stw(MC)
	137	(define_insn_reservation "cell-store" 1
	138	(and (eq_attr "type" "store")
d839f53b	139	(eq_attr "update" "no")
d296e02e AP	140	(eq_attr "cpu" "cell"))
	141	"lsu_cell+slot01")
	142
	143	;;stdux, stdu, (hardware breaks into store and add) 2 for update reg
	144	(define_insn_reservation "cell-store-update" 1
d839f53b SB	145	(and (eq_attr "type" "store")
d839f53b SB	146	(eq_attr "update" "yes")
d296e02e AP	147	(eq_attr "cpu" "cell"))
	148	"fxu_cell+lsu_cell+slot01")
	149
	150	(define_insn_reservation "cell-fpstore" 1
	151	(and (eq_attr "type" "fpstore")
d839f53b	152	(eq_attr "update" "no")
d296e02e AP	153	(eq_attr "cpu" "cell"))
	154	"vsu2_cell+lsu_cell+slot01")
	155
	156	(define_insn_reservation "cell-fpstore-update" 1
d839f53b SB	157	(and (eq_attr "type" "fpstore")
d839f53b SB	158	(eq_attr "update" "yes")
d296e02e AP	159	(eq_attr "cpu" "cell"))
	160	"vsu2_cell+fxu_cell+lsu_cell+slot01")
	161
	162	(define_insn_reservation "cell-vecstore" 1
	163	(and (eq_attr "type" "vecstore")
	164	(eq_attr "cpu" "cell"))
	165	"vsu2_cell+lsu_cell+slot01")
	166
	167	;; Integer latency is 2 cycles
	168	(define_insn_reservation "cell-integer" 2
79430730 SB	169	(and (ior (eq_attr "type" "integer,trap,cntlz,isel")
79430730 SB	170	(and (eq_attr "type" "add,logical,shift,exts")
892e7fa6	171	(eq_attr "dot" "no"))
58ee9e66 SB	172	(and (eq_attr "type" "insert")
58ee9e66 SB	173	(eq_attr "size" "64")))
d296e02e AP	174	(eq_attr "cpu" "cell"))
	175	"slot01,fxu_cell")
	176
	177	;; Two integer latency is 4 cycles
	178	(define_insn_reservation "cell-two" 4
	179	(and (eq_attr "type" "two")
	180	(eq_attr "cpu" "cell"))
	181	"slot01,fxu_cell,fxu_cell*2")
	182
	183	;; Three integer latency is 6 cycles
	184	(define_insn_reservation "cell-three" 6
	185	(and (eq_attr "type" "three")
	186	(eq_attr "cpu" "cell"))
	187	"slot01,fxu_cell,fxu_cell*4")
	188
	189	;; rlwimi, alter cr0
	190	(define_insn_reservation "cell-insert" 2
58ee9e66 SB	191	(and (eq_attr "type" "insert")
58ee9e66 SB	192	(eq_attr "size" "32")
d296e02e AP	193	(eq_attr "cpu" "cell"))
	194	"slot01,fxu_cell")
	195
	196	;; cmpi, cmpli, cmpla, add, addo, sub, subo, alter cr0
	197	(define_insn_reservation "cell-cmp" 1
	198	(and (eq_attr "type" "cmp")
	199	(eq_attr "cpu" "cell"))
	200	"fxu_cell+slot01")
	201
	202	;; add, addo, sub, subo, alter cr0, rldcli, rlwinm
	203	(define_insn_reservation "cell-fast-cmp" 2
f5ae5a23 SB	204	(and (eq_attr "type" "add,logical,shift,exts")
f5ae5a23 SB	205	(eq_attr "dot" "yes")
892e7fa6 SB	206	(eq_attr "cpu" "cell")
892e7fa6 SB	207	(eq_attr "cell_micro" "not"))
d296e02e AP	208	"slot01,fxu_cell")
	209
	210	(define_insn_reservation "cell-cmp-microcoded" 9
f5ae5a23 SB	211	(and (eq_attr "type" "add,logical,shift,exts")
f5ae5a23 SB	212	(eq_attr "dot" "yes")
892e7fa6 SB	213	(eq_attr "cpu" "cell")
892e7fa6 SB	214	(eq_attr "cell_micro" "always"))
d296e02e AP	215	"slot0+slot1,fxu_cell,fxu_cell*7")
	216
	217	;; mulld
	218	(define_insn_reservation "cell-lmul" 15
e0528ed9 SB	219	(and (eq_attr "type" "mul")
	220	(eq_attr "dot" "no")
	221	(eq_attr "size" "64")
d296e02e AP	222	(eq_attr "cpu" "cell"))
	223	"slot1,nonpipeline,nonpipeline*13")
	224
	225	;; mulld. is microcoded
	226	(define_insn_reservation "cell-lmul-cmp" 22
e0528ed9 SB	227	(and (eq_attr "type" "mul")
	228	(eq_attr "dot" "yes")
	229	(eq_attr "size" "64")
d296e02e AP	230	(eq_attr "cpu" "cell"))
	231	"slot0+slot1,nonpipeline,nonpipeline*20")
	232
	233	;; mulli, 6 cycles
	234	(define_insn_reservation "cell-imul23" 6
e0528ed9 SB	235	(and (eq_attr "type" "mul")
e0528ed9 SB	236	(eq_attr "size" "8,16")
d296e02e AP	237	(eq_attr "cpu" "cell"))
	238	"slot1,nonpipeline,nonpipeline*4")
	239
	240	;; mullw, 9
	241	(define_insn_reservation "cell-imul" 9
e0528ed9 SB	242	(and (eq_attr "type" "mul")
	243	(eq_attr "dot" "no")
	244	(eq_attr "size" "32")
d296e02e AP	245	(eq_attr "cpu" "cell"))
	246	"slot1,nonpipeline,nonpipeline*7")
	247
	248	;; divide
	249	(define_insn_reservation "cell-idiv" 32
441e02a5 SB	250	(and (eq_attr "type" "div")
441e02a5 SB	251	(eq_attr "size" "32")
d296e02e AP	252	(eq_attr "cpu" "cell"))
	253	"slot1,nonpipeline,nonpipeline*30")
	254
	255	(define_insn_reservation "cell-ldiv" 64
441e02a5 SB	256	(and (eq_attr "type" "div")
441e02a5 SB	257	(eq_attr "size" "64")
d296e02e AP	258	(eq_attr "cpu" "cell"))
	259	"slot1,nonpipeline,nonpipeline*62")
	260
	261	;;mflr and mfctr are pipelined
	262	(define_insn_reservation "cell-mfjmpr" 1
	263	(and (eq_attr "type" "mfjmpr")
	264	(eq_attr "cpu" "cell"))
	265	"slot01+bru_cell")
	266
	267	;;mtlr and mtctr,
	268	;;mtspr fully pipelined
	269	(define_insn_reservation "cell-mtjmpr" 1
	270	(and (eq_attr "type" "mtjmpr")
	271	(eq_attr "cpu" "cell"))
	272	"bru_cell+slot01")
	273
	274	;; Branches
	275	;; b, ba, bl, bla, unconditional branch always predicts correctly n/a latency
	276	;; bcctr, bcctrl, latency 2, actually adjust by be to 4
	277	(define_insn_reservation "cell-branch" 1
	278	(and (eq_attr "type" "branch")
	279	(eq_attr "cpu" "cell"))
	280	"bru_cell+slot1")
	281
	282	(define_insn_reservation "cell-branchreg" 1
	283	(and (eq_attr "type" "jmpreg")
	284	(eq_attr "cpu" "cell"))
	285	"bru_cell+slot1")
	286
	287	;; cr hazard
	288	;; page 90, special cases for CR hazard, only one instr can access cr per cycle
	289	;; if insn reads CR following a stwcx, pipeline stall till stwcx finish
	290	(define_insn_reservation "cell-crlogical" 1
	291	(and (eq_attr "type" "cr_logical,delayed_cr")
	292	(eq_attr "cpu" "cell"))
	293	"bru_cell+slot01")
	294
	295	;; mfcrf and mfcr is about 34 cycles and nonpipelined
	296	(define_insn_reservation "cell-mfcr" 34
	297	(and (eq_attr "type" "mfcrf,mfcr")
	298	(eq_attr "cpu" "cell"))
	299	"slot1,nonpipeline,nonpipeline*32")
	300
	301	;; mtcrf (1 field)
	302	(define_insn_reservation "cell-mtcrf" 1
	303	(and (eq_attr "type" "mtcr")
	304	(eq_attr "cpu" "cell"))
	305	"fxu_cell+slot01")
	306
	307	; Basic FP latency is 10 cycles, thoughput is 1/cycle
	308	(define_insn_reservation "cell-fp" 10
	309	(and (eq_attr "type" "fp,dmul")
	310	(eq_attr "cpu" "cell"))
	311	"slot01,vsu1_cell,vsu1_cell*8")
	312
	313	(define_insn_reservation "cell-fpcompare" 1
	314	(and (eq_attr "type" "fpcompare")
	315	(eq_attr "cpu" "cell"))
	316	"vsu1_cell+slot01")
	317
	318	;; sdiv thoughput 1/74, not pipelined but only in the FPU
	319	(define_insn_reservation "cell-sdiv" 74
	320	(and (eq_attr "type" "sdiv,ddiv")
	321	(eq_attr "cpu" "cell"))
322	"slot1,nonpipeline,nonpipeline*72")
323
324	;; fsqrt thoughput 1/84, not pipelined but only in the FPU
325	(define_insn_reservation "cell-sqrt" 84
326	(and (eq_attr "type" "ssqrt,dsqrt")
327	(eq_attr "cpu" "cell"))
328	"slot1,nonpipeline,nonpipeline*82")
329
330	; VMX
331	(define_insn_reservation "cell-vecsimple" 4
332	(and (eq_attr "type" "vecsimple")
333	(eq_attr "cpu" "cell"))
334	"slot01,vsu1_cell,vsu1_cell*2")
335
336	;; mult, div, madd
337	(define_insn_reservation "cell-veccomplex" 10
338	(and (eq_attr "type" "veccomplex")
339	(eq_attr "cpu" "cell"))
340	"slot01,vsu1_cell,vsu1_cell*8")
341
342	;; TODO: add support for recording instructions
343	(define_insn_reservation "cell-veccmp" 4
344	(and (eq_attr "type" "veccmp")
345	(eq_attr "cpu" "cell"))
346	"slot01,vsu1_cell,vsu1_cell*2")
347
348	(define_insn_reservation "cell-vecfloat" 12
349	(and (eq_attr "type" "vecfloat")
350	(eq_attr "cpu" "cell"))
351	"slot01,vsu1_cell,vsu1_cell*10")
352
353	(define_insn_reservation "cell-vecperm" 4
354	(and (eq_attr "type" "vecperm")
355	(eq_attr "cpu" "cell"))
356	"slot01,vsu2_cell,vsu2_cell*2")
357
358	;; New for 4.2, syncs
359
360	(define_insn_reservation "cell-sync" 11
361	(and (eq_attr "type" "sync")
362	(eq_attr "cpu" "cell"))
363	"slot01,lsu_cell,lsu_cell*9")
364
365	(define_insn_reservation "cell-isync" 11
366	(and (eq_attr "type" "isync")
367	(eq_attr "cpu" "cell"))
368	"slot01,lsu_cell,lsu_cell*9")
369
370	(define_insn_reservation "cell-load_l" 11
371	(and (eq_attr "type" "load_l")
372	(eq_attr "cpu" "cell"))
373	"slot01,lsu_cell,lsu_cell*9")
374
375	(define_insn_reservation "cell-store_c" 11
376	(and (eq_attr "type" "store_c")
377	(eq_attr "cpu" "cell"))
378	"slot01,lsu_cell,lsu_cell*9")
379
380	;; RAW register dependency
381
382	;; addi r3, r3, 1
383	;; lw r4,offset(r3)
384	;; there are 5 cycle deplay for r3 bypassing
385	;; there are 5 cycle delay for a dependent load after a load
386	(define_bypass 5 "cell-integer" "cell-load")
387	(define_bypass 5 "cell-integer" "cell-load-ext")
388	(define_bypass 5 "cell-load,cell-load-ext" "cell-load,cell-load-ext")
389
390	;; there is a 6 cycle delay after a fp compare until you can use the cr.
391	(define_bypass 6 "cell-fpcompare" "cell-branch,cell-branchreg,cell-mfcr,cell-crlogical")
392
393	;; VXU float RAW
394	(define_bypass 11 "cell-vecfloat" "cell-vecfloat")
395
396	;; VXU and FPU
397	(define_bypass 6 "cell-veccomplex" "cell-vecsimple")
398	;;(define_bypass 6 "cell-veccompare" "cell-branch,cell-branchreg")
399	(define_bypass 3 "cell-vecfloat" "cell-veccomplex")
400	; this is not correct,
401	;; this is a stall in general and not dependent on result
402	(define_bypass 13 "cell-vecstore" "cell-fpstore")
2f8e468b	403	; this is not correct, this can never be true, not dependent on result
d296e02e AP	404	(define_bypass 7 "cell-fp" "cell-fpload")
	405	;; vsu1 should avoid writing to the same target register as vsu2 insn
	406	;; within 12 cycles.
	407
	408	;; WAW hazard
	409
	410	;; the target of VSU estimate should not be reused within 10 dispatch groups
	411	;; the target of VSU float should not be reused within 8 dispatch groups
	412	;; the target of VSU complex should not be reused within 5 dispatch groups
	413	;; FP LOAD should not reuse an FPU Arithmetic target with 6 dispatch gropus
	414
	415	;; mtctr-bcctr/bcctrl, branch target ctr register shadow update at
	416	;; ex4 stage(10 cycles)
	417	(define_bypass 10 "cell-mtjmpr" "cell-branchreg")
	418
	419	;;Things are not simulated:
	420	;; update instruction, update address gpr are not simulated
2f8e468b	421	;; vrefp, vrsqrtefp have latency(14), currently simulated as 12 cycle float
d296e02e AP	422	;; insns
d296e02e AP	423