;; [thirdparty/gcc.git] / gcc / config / mips / 10000.md

;; DFA-based pipeline description for the VR1x000.
;;   Copyright (C) 2005-2022 Free Software Foundation, Inc.
;;
;; This file is part of GCC.

;; GCC is free software; you can redistribute it and/or modify it
;; under the terms of the GNU General Public License as published
;; by the Free Software Foundation; either version 3, or (at your
;; option) any later version.

;; GCC is distributed in the hope that it will be useful, but WITHOUT
;; ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
;; or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
;; License for more details.

;; You should have received a copy of the GNU General Public License
;; along with GCC; see the file COPYING3.  If not see
;; <http://www.gnu.org/licenses/>.


;; R12K/R14K/R16K are derivatives of R10K, thus copy its description
;; until specific tuning for each is added.

;; R10000 has an int queue, fp queue, address queue.
;; The int queue feeds ALU1 and ALU2.
;; The fp queue feeds the fp-adder and fp-multiplier.
;; The addr queue feeds the Load/Store unit.
;;
;; However, we define the fp-adder and fp-multiplier as
;; separate automatons, because the fp-multiplier is
;; divided into fp-multiplier, fp-division, and
;; fp-squareroot units, all of which share the same
;; issue and completion logic, yet can operate in
;; parallel.
;;
;; This is based on the model described in the R10K Manual
;; and it helps to reduce the size of the automata.
;;
;; Declare the six automata used by this description: one for the
;; integer ALUs, one for the fp-adder, one for the address (load/store)
;; unit, and one each for the fp multiply, divide and square-root units.
(define_automaton "r10k_a_int, r10k_a_fpadder, r10k_a_addr,
                   r10k_a_fpmpy, r10k_a_fpdiv, r10k_a_fpsqrt")

;; Functional units.  The second string names the automaton that each
;; unit belongs to: both integer ALUs share "r10k_a_int", while every
;; other unit has an automaton to itself.
(define_cpu_unit "r10k_alu1" "r10k_a_int")
(define_cpu_unit "r10k_alu2" "r10k_a_int")
(define_cpu_unit "r10k_fpadd" "r10k_a_fpadder")
(define_cpu_unit "r10k_fpmpy" "r10k_a_fpmpy")
(define_cpu_unit "r10k_fpdiv" "r10k_a_fpdiv")
(define_cpu_unit "r10k_fpsqrt" "r10k_a_fpsqrt")
(define_cpu_unit "r10k_loadstore" "r10k_a_addr")


;; R10k Loads and Stores.
;;
;; Integer loads and both prefetch types: default latency 2, using
;; the load/store unit for one cycle.
(define_insn_reservation "r10k_load" 2
  (and (eq_attr "cpu" "r10000")
       (eq_attr "type" "load,prefetch,prefetchx"))
  "r10k_loadstore")

;; Integer, fp and fp-indexed stores: default latency 0, using the
;; load/store unit for one cycle.
(define_insn_reservation "r10k_store" 0
  (and (eq_attr "cpu" "r10000")
       (eq_attr "type" "store,fpstore,fpidxstore"))
  "r10k_loadstore")

;; Fp and fp-indexed loads: default latency 3 (one more than the
;; "r10k_load" reservation), using the load/store unit for one cycle.
(define_insn_reservation "r10k_fpload" 3
  (and (eq_attr "cpu" "r10000")
       (eq_attr "type" "fpload,fpidxload"))
  "r10k_loadstore")


;; Integer add/sub + logic ops, and mt hi/lo can be done by alu1 or alu2.
;; Miscellaneous arith goes here too (this is a guess).
;;
;; Default latency 1; "|" lets the scheduler pick whichever of the
;; two integer ALUs is free.
(define_insn_reservation "r10k_arith" 1
  (and (eq_attr "cpu" "r10000")
       (eq_attr "type" "arith,mthi,mtlo,slt,clz,const,nop,trap,logical"))
  "r10k_alu1 | r10k_alu2")

;; We treat mfhilo differently, because we need to know when
;; it's HI and when it's LO.
;;
;; Move from HI: default latency 1 on either ALU.  This reservation
;; name is the consumer in the integer-divide define_bypass entries.
(define_insn_reservation "r10k_mfhi" 1
  (and (eq_attr "cpu" "r10000")
       (eq_attr "type" "mfhi"))
  "r10k_alu1 | r10k_alu2")

;; Move from LO: default latency 1 on either ALU.  No define_bypass in
;; this file names this reservation, so producers feeding it use their
;; default latency.
(define_insn_reservation "r10k_mflo" 1
  (and (eq_attr "cpu" "r10000")
       (eq_attr "type" "mflo"))
  "r10k_alu1 | r10k_alu2")


;; ALU1 handles shifts, branch eval, and condmove.
;;
;; The brancher is separate, though part of ALU1, and can only
;; do one branch per cycle (is this even implementable?).
;;
;; Unsure if the brancher handles jumps and calls as well, but since
;; they're related, we'll add them here for now.
;;
;; Shifts, branches, jumps and calls: default latency 1, ALU1 only.
;; The brancher is not a separate cpu unit in this model; these insns
;; simply occupy ALU1 for one cycle.
(define_insn_reservation "r10k_brancher" 1
  (and (eq_attr "cpu" "r10000")
       (eq_attr "type" "shift,branch,jump,call"))
  "r10k_alu1")

;; Conditional moves in integer modes (SI, DI): default latency 1,
;; ALU1 only.  Conditional moves in SF/DF mode are matched by
;; "r10k_fp_cmove" instead.
(define_insn_reservation "r10k_int_cmove" 1
  (and (eq_attr "cpu" "r10000")
       (and (eq_attr "type" "condmove")
            (eq_attr "mode" "SI,DI")))
  "r10k_alu1")


;; Coprocessor Moves.
;; mtc1/dmtc1 are handled by ALU1.
;; mfc1/dmfc1 are handled by the fp-multiplier.
;;
;; Moves to the coprocessor (type "mtc"): default latency 3, using
;; ALU1 for one cycle.
(define_insn_reservation "r10k_mt_xfer" 3
  (and (eq_attr "cpu" "r10000")
       (eq_attr "type" "mtc"))
  "r10k_alu1")

;; Moves from the coprocessor (type "mfc"): default latency 2, using
;; the fp-multiplier for one cycle.
(define_insn_reservation "r10k_mf_xfer" 2
  (and (eq_attr "cpu" "r10000")
       (eq_attr "type" "mfc"))
  "r10k_fpmpy")


;; Only ALU2 does int multiplications and divisions.
;;
;; According to the Vr10000 series user manual,
;; integer mult and div insns can be issued one
;; cycle earlier if using register Lo.  We model
;; this by using the Lo value by default, as it
;; is the more common value, and use a bypass
;; for the Hi value when needed.
;;
;; Also of note, there are different latencies
;; for MULT/DMULT (Lo 5/Hi 6) and MULTU/DMULTU (Lo 6/Hi 7).
;; However, gcc does not have separate types
;; for these insns.  Thus to strike a balance,
;; we use the Hi latency value for imul
;; operations until the imul type can be split.
;;
;; SImode multiplies (types imul and imul3): default latency 6.
;; "* 6" repeats the reservation, so ALU2 stays busy for six
;; consecutive cycles.
(define_insn_reservation "r10k_imul_single" 6
  (and (eq_attr "cpu" "r10000")
       (and (eq_attr "type" "imul,imul3")
            (eq_attr "mode" "SI")))
  "r10k_alu2 * 6")

;; DImode multiplies (types imul and imul3): default latency 10, with
;; ALU2 busy for ten consecutive cycles.
(define_insn_reservation "r10k_imul_double" 10
  (and (eq_attr "cpu" "r10000")
       (and (eq_attr "type" "imul,imul3")
            (eq_attr "mode" "DI")))
  "r10k_alu2 * 10")

;; Divides keep ALU2 busy.
;;
;; SImode divides: default latency 34, with ALU2 busy for 35
;; consecutive cycles.  A define_bypass in this file raises the
;; latency to 35 when the consumer is "r10k_mfhi".
(define_insn_reservation "r10k_idiv_single" 34
  (and (eq_attr "cpu" "r10000")
       (and (eq_attr "type" "idiv")
            (eq_attr "mode" "SI")))
  "r10k_alu2 * 35")

;; DImode divides: default latency 66, with ALU2 busy for 67
;; consecutive cycles.  A define_bypass in this file raises the
;; latency to 67 when the consumer is "r10k_mfhi".
(define_insn_reservation "r10k_idiv_double" 66
  (and (eq_attr "cpu" "r10000")
       (and (eq_attr "type" "idiv")
            (eq_attr "mode" "DI")))
  "r10k_alu2 * 67")

;; Override the divide latency when the dependent insn is a move from
;; HI: 35 instead of 34 for SImode, 67 instead of 66 for DImode, i.e.
;; one cycle more than the default.
(define_bypass 35 "r10k_idiv_single" "r10k_mfhi")
(define_bypass 67 "r10k_idiv_double" "r10k_mfhi")


;; Floating point add/sub, mul, abs value, neg, comp, & moves.
;;
;; Fp add, abs, neg and compare: default latency 2, using the
;; fp-adder for one cycle.
(define_insn_reservation "r10k_fp_miscadd" 2
  (and (eq_attr "cpu" "r10000")
       (eq_attr "type" "fadd,fabs,fneg,fcmp"))
  "r10k_fpadd")

;; Fp multiply and fp register moves: default latency 2, using the
;; fp-multiplier for one cycle.
(define_insn_reservation "r10k_fp_miscmul" 2
  (and (eq_attr "cpu" "r10000")
       (eq_attr "type" "fmul,fmove"))
  "r10k_fpmpy")

;; Conditional moves in fp modes (SF, DF): default latency 2, using
;; the fp-multiplier for one cycle.  Conditional moves in SI/DI mode
;; are matched by "r10k_int_cmove" instead.
(define_insn_reservation "r10k_fp_cmove" 2
  (and (eq_attr "cpu" "r10000")
       (and (eq_attr "type" "condmove")
            (eq_attr "mode" "SF,DF")))
  "r10k_fpmpy")


;; The fcvt.s.[wl] insn has latency 4, repeat 2.
;; All other fcvt insns have latency 2, repeat 1.
;;
;; Conversions whose cnv_mode is I2S: default latency 4, with the
;; fp-adder busy for two consecutive cycles (the "repeat 2").
(define_insn_reservation "r10k_fcvt_single" 4
  (and (eq_attr "cpu" "r10000")
       (and (eq_attr "type" "fcvt")
            (eq_attr "cnv_mode" "I2S")))
  "r10k_fpadd * 2")

;; Every conversion whose cnv_mode is not I2S ("!" negates the match):
;; default latency 2, using the fp-adder for one cycle.
(define_insn_reservation "r10k_fcvt_other" 2
  (and (eq_attr "cpu" "r10000")
       (and (eq_attr "type" "fcvt")
            (eq_attr "cnv_mode" "!I2S")))
  "r10k_fpadd")


;; Run the fmadd insn through fp-adder first, then fp-multiplier.
;;
;; The latency for fmadd is 2 cycles if the result is used
;; by another fmadd instruction.
;;
;; Default latency 4.  The "," advances one cycle: the fp-adder is
;; reserved in the first cycle and the fp-multiplier in the second.
(define_insn_reservation "r10k_fmadd" 4
  (and (eq_attr "cpu" "r10000")
       (eq_attr "type" "fmadd"))
  "r10k_fpadd, r10k_fpmpy")

;; An fmadd feeding another fmadd sees latency 2 instead of the
;; default 4.
(define_bypass 2 "r10k_fmadd" "r10k_fmadd")


;; Floating point Divisions & square roots.
;;
;; SFmode divide and reciprocal (fdiv, frdiv): default latency 12,
;; with the fp-divide unit busy for 14 consecutive cycles.
(define_insn_reservation "r10k_fdiv_single" 12
  (and (eq_attr "cpu" "r10000")
       (and (eq_attr "type" "fdiv,frdiv")
            (eq_attr "mode" "SF")))
  "r10k_fpdiv * 14")

;; DFmode divide and reciprocal (fdiv, frdiv): default latency 19,
;; with the fp-divide unit busy for 21 consecutive cycles.
(define_insn_reservation "r10k_fdiv_double" 19
  (and (eq_attr "cpu" "r10000")
       (and (eq_attr "type" "fdiv,frdiv")
            (eq_attr "mode" "DF")))
  "r10k_fpdiv * 21")

;; SFmode square root: default latency 18, with the fp-squareroot
;; unit busy for 20 consecutive cycles.
(define_insn_reservation "r10k_fsqrt_single" 18
  (and (eq_attr "cpu" "r10000")
       (and (eq_attr "type" "fsqrt")
            (eq_attr "mode" "SF")))
  "r10k_fpsqrt * 20")

;; DFmode square root: default latency 33, with the fp-squareroot
;; unit busy for 35 consecutive cycles.
(define_insn_reservation "r10k_fsqrt_double" 33
  (and (eq_attr "cpu" "r10000")
       (and (eq_attr "type" "fsqrt")
            (eq_attr "mode" "DF")))
  "r10k_fpsqrt * 35")

;; SFmode reciprocal square root: default latency 30.  The unit
;; reservation (20 cycles of the fp-squareroot unit) is the same as
;; for "r10k_fsqrt_single"; only the latency differs.
(define_insn_reservation "r10k_frsqrt_single" 30
  (and (eq_attr "cpu" "r10000")
       (and (eq_attr "type" "frsqrt")
            (eq_attr "mode" "SF")))
  "r10k_fpsqrt * 20")

;; DFmode reciprocal square root: default latency 52.  The unit
;; reservation (35 cycles of the fp-squareroot unit) is the same as
;; for "r10k_fsqrt_double"; only the latency differs.
(define_insn_reservation "r10k_frsqrt_double" 52
  (and (eq_attr "cpu" "r10000")
       (and (eq_attr "type" "frsqrt")
            (eq_attr "mode" "DF")))
  "r10k_fpsqrt * 35")


;; Handle unknown/multi insns here (this is a guess).
;;
;; Default latency 1.  "+" reserves both integer ALUs in the same
;; cycle, so no other integer insn can issue alongside one of these.
(define_insn_reservation "r10k_unknown" 1
  (and (eq_attr "cpu" "r10000")
       (eq_attr "type" "unknown,multi,atomic,syncloop"))
  "r10k_alu1 + r10k_alu2")
Commit	Line	Data
7a3446ec	1	;; DFA-based pipeline description for the VR1x000.
7adcbafe	2	;; Copyright (C) 2005-2022 Free Software Foundation, Inc.
7a3446ec JK	3	;;
	4	;; This file is part of GCC.
	5
	6	;; GCC is free software; you can redistribute it and/or modify it
	7	;; under the terms of the GNU General Public License as published
	8	;; by the Free Software Foundation; either version 3, or (at your
	9	;; option) any later version.
	10
	11	;; GCC is distributed in the hope that it will be useful, but WITHOUT
	12	;; ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
	13	;; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public
	14	;; License for more details.
	15
	16	;; You should have received a copy of the GNU General Public License
	17	;; along with GCC; see the file COPYING3. If not see
	18	;; <http://www.gnu.org/licenses/>.
	19
	20
	21	;; R12K/R14K/R16K are derivatives of R10K, thus copy its description
	22	;; until specific tuning for each is added.
	23
	24	;; R10000 has an int queue, fp queue, address queue.
	25	;; The int queue feeds ALU1 and ALU2.
	26	;; The fp queue feeds the fp-adder and fp-multiplier.
	27	;; The addr queue feeds the Load/Store unit.
	28	;;
	29	;; However, we define the fp-adder and fp-multiplier as
	30	;; separate automatons, because the fp-multiplier is
	31	;; divided into fp-multiplier, fp-division, and
	32	;; fp-squareroot units, all of which share the same
	33	;; issue and completion logic, yet can operate in
	34	;; parallel.
	35	;;
	36	;; This is based on the model described in the R10K Manual
	37	;; and it helps to reduce the size of the automata.
	38	(define_automaton "r10k_a_int, r10k_a_fpadder, r10k_a_addr,
	39	r10k_a_fpmpy, r10k_a_fpdiv, r10k_a_fpsqrt")
	40
	41	(define_cpu_unit "r10k_alu1" "r10k_a_int")
	42	(define_cpu_unit "r10k_alu2" "r10k_a_int")
	43	(define_cpu_unit "r10k_fpadd" "r10k_a_fpadder")
	44	(define_cpu_unit "r10k_fpmpy" "r10k_a_fpmpy")
	45	(define_cpu_unit "r10k_fpdiv" "r10k_a_fpdiv")
	46	(define_cpu_unit "r10k_fpsqrt" "r10k_a_fpsqrt")
	47	(define_cpu_unit "r10k_loadstore" "r10k_a_addr")
	48
	49
	50	;; R10k Loads and Stores.
	51	(define_insn_reservation "r10k_load" 2
	52	(and (eq_attr "cpu" "r10000")
	53	(eq_attr "type" "load,prefetch,prefetchx"))
	54	"r10k_loadstore")
	55
	56	(define_insn_reservation "r10k_store" 0
	57	(and (eq_attr "cpu" "r10000")
	58	(eq_attr "type" "store,fpstore,fpidxstore"))
	59	"r10k_loadstore")
	60
	61	(define_insn_reservation "r10k_fpload" 3
	62	(and (eq_attr "cpu" "r10000")
	63	(eq_attr "type" "fpload,fpidxload"))
	64	"r10k_loadstore")
	65
	66
67	;; Integer add/sub + logic ops, and mt hi/lo can be done by alu1 or alu2.
68	;; Miscellaneous arith goes here too (this is a guess).
69	(define_insn_reservation "r10k_arith" 1
70	(and (eq_attr "cpu" "r10000")
cb00489c	71	(eq_attr "type" "arith,mthi,mtlo,slt,clz,const,nop,trap,logical"))
7a3446ec JK	72	"r10k_alu1 \| r10k_alu2")
	73
	74	;; We treat mfhilo differently, because we need to know when
	75	;; it's HI and when it's LO.
	76	(define_insn_reservation "r10k_mfhi" 1
	77	(and (eq_attr "cpu" "r10000")
cb00489c	78	(eq_attr "type" "mfhi"))
7a3446ec JK	79	"r10k_alu1 \| r10k_alu2")
	80
	81	(define_insn_reservation "r10k_mflo" 1
	82	(and (eq_attr "cpu" "r10000")
cb00489c	83	(eq_attr "type" "mflo"))
7a3446ec JK	84	"r10k_alu1 \| r10k_alu2")
	85
	86
	87	;; ALU1 handles shifts, branch eval, and condmove.
	88	;;
	89	;; Brancher is separate, but part of ALU1, but can only
	90	;; do one branch per cycle (is this even implementable?).
	91	;;
	92	;; Unsure if the brancher handles jumps and calls as well, but since
	93	;; they're related, we'll add them here for now.
	94	(define_insn_reservation "r10k_brancher" 1
	95	(and (eq_attr "cpu" "r10000")
	96	(eq_attr "type" "shift,branch,jump,call"))
	97	"r10k_alu1")
	98
	99	(define_insn_reservation "r10k_int_cmove" 1
	100	(and (eq_attr "cpu" "r10000")
	101	(and (eq_attr "type" "condmove")
	102	(eq_attr "mode" "SI,DI")))
	103	"r10k_alu1")
	104
	105
	106	;; Coprocessor Moves.
	107	;; mtc1/dmtc1 are handled by ALU1.
	108	;; mfc1/dmfc1 are handled by the fp-multiplier.
	109	(define_insn_reservation "r10k_mt_xfer" 3
	110	(and (eq_attr "cpu" "r10000")
	111	(eq_attr "type" "mtc"))
	112	"r10k_alu1")
	113
	114	(define_insn_reservation "r10k_mf_xfer" 2
	115	(and (eq_attr "cpu" "r10000")
	116	(eq_attr "type" "mfc"))
	117	"r10k_fpmpy")
	118
	119
	120	;; Only ALU2 does int multiplications and divisions.
	121	;;
	122	;; According to the Vr10000 series user manual,
	123	;; integer mult and div insns can be issued one
	124	;; cycle earlier if using register Lo. We model
	125	;; this by using the Lo value by default, as it
	126	;; is the more common value, and use a bypass
	127	;; for the Hi value when needed.
	128	;;
	129	;; Also of note, There are different latencies
	130	;; for MULT/DMULT (Lo 5/Hi 6) and MULTU/DMULTU (Lo 6/Hi 7).
	131	;; However, gcc does not have separate types
	132	;; for these insns. Thus to strike a balance,
	133	;; we use the Hi latency value for imul
	134	;; operations until the imul type can be split.
	135	(define_insn_reservation "r10k_imul_single" 6
	136	(and (eq_attr "cpu" "r10000")
	137	(and (eq_attr "type" "imul,imul3")
	138	(eq_attr "mode" "SI")))
	139	"r10k_alu2 * 6")
	140
	141	(define_insn_reservation "r10k_imul_double" 10
	142	(and (eq_attr "cpu" "r10000")
	143	(and (eq_attr "type" "imul,imul3")
	144	(eq_attr "mode" "DI")))
	145	"r10k_alu2 * 10")
	146
	147	;; Divides keep ALU2 busy.
148	(define_insn_reservation "r10k_idiv_single" 34
149	(and (eq_attr "cpu" "r10000")
150	(and (eq_attr "type" "idiv")
151	(eq_attr "mode" "SI")))
152	"r10k_alu2 * 35")
153
154	(define_insn_reservation "r10k_idiv_double" 66
155	(and (eq_attr "cpu" "r10000")
156	(and (eq_attr "type" "idiv")
157	(eq_attr "mode" "DI")))
158	"r10k_alu2 * 67")
159
160	(define_bypass 35 "r10k_idiv_single" "r10k_mfhi")
161	(define_bypass 67 "r10k_idiv_double" "r10k_mfhi")
162
163
164	;; Floating point add/sub, mul, abs value, neg, comp, & moves.
165	(define_insn_reservation "r10k_fp_miscadd" 2
166	(and (eq_attr "cpu" "r10000")
167	(eq_attr "type" "fadd,fabs,fneg,fcmp"))
168	"r10k_fpadd")
169
170	(define_insn_reservation "r10k_fp_miscmul" 2
171	(and (eq_attr "cpu" "r10000")
172	(eq_attr "type" "fmul,fmove"))
173	"r10k_fpmpy")
174
175	(define_insn_reservation "r10k_fp_cmove" 2
176	(and (eq_attr "cpu" "r10000")
177	(and (eq_attr "type" "condmove")
178	(eq_attr "mode" "SF,DF")))
179	"r10k_fpmpy")
180
181
182	;; The fcvt.s.[wl] insn has latency 4, repeat 2.
183	;; All other fcvt insns have latency 2, repeat 1.
184	(define_insn_reservation "r10k_fcvt_single" 4
185	(and (eq_attr "cpu" "r10000")
186	(and (eq_attr "type" "fcvt")
187	(eq_attr "cnv_mode" "I2S")))
188	"r10k_fpadd * 2")
189
190	(define_insn_reservation "r10k_fcvt_other" 2
191	(and (eq_attr "cpu" "r10000")
192	(and (eq_attr "type" "fcvt")
193	(eq_attr "cnv_mode" "!I2S")))
194	"r10k_fpadd")
195
196
197	;; Run the fmadd insn through fp-adder first, then fp-multiplier.
198	;;
199	;; The latency for fmadd is 2 cycles if the result is used
200	;; by another fmadd instruction.
201	(define_insn_reservation "r10k_fmadd" 4
202	(and (eq_attr "cpu" "r10000")
203	(eq_attr "type" "fmadd"))
204	"r10k_fpadd, r10k_fpmpy")
205
206	(define_bypass 2 "r10k_fmadd" "r10k_fmadd")
207
208
209	;; Floating point Divisions & square roots.
210	(define_insn_reservation "r10k_fdiv_single" 12
211	(and (eq_attr "cpu" "r10000")
212	(and (eq_attr "type" "fdiv,frdiv")
213	(eq_attr "mode" "SF")))
214	"r10k_fpdiv * 14")
215
216	(define_insn_reservation "r10k_fdiv_double" 19
217	(and (eq_attr "cpu" "r10000")
218	(and (eq_attr "type" "fdiv,frdiv")
219	(eq_attr "mode" "DF")))
220	"r10k_fpdiv * 21")
221
222	(define_insn_reservation "r10k_fsqrt_single" 18
223	(and (eq_attr "cpu" "r10000")
224	(and (eq_attr "type" "fsqrt")
225	(eq_attr "mode" "SF")))
226	"r10k_fpsqrt * 20")
227
228	(define_insn_reservation "r10k_fsqrt_double" 33
229	(and (eq_attr "cpu" "r10000")
230	(and (eq_attr "type" "fsqrt")
231	(eq_attr "mode" "DF")))
232	"r10k_fpsqrt * 35")
233
234	(define_insn_reservation "r10k_frsqrt_single" 30
235	(and (eq_attr "cpu" "r10000")
236	(and (eq_attr "type" "frsqrt")
237	(eq_attr "mode" "SF")))
238	"r10k_fpsqrt * 20")
239
240	(define_insn_reservation "r10k_frsqrt_double" 52
241	(and (eq_attr "cpu" "r10000")
242	(and (eq_attr "type" "frsqrt")
243	(eq_attr "mode" "DF")))
244	"r10k_fpsqrt * 35")
245
246
247	;; Handle unknown/multi insns here (this is a guess).
248	(define_insn_reservation "r10k_unknown" 1
249	(and (eq_attr "cpu" "r10000")
3088716e	250	(eq_attr "type" "unknown,multi,atomic,syncloop"))
7a3446ec	251	"r10k_alu1 + r10k_alu2")