[thirdparty/gcc.git] / gcc / config / sparc / m8.md

;; Scheduling description for the SPARC M8.
;;   Copyright (C) 2017-2020 Free Software Foundation, Inc.
;;
;; This file is part of GCC.
;;
;; GCC is free software; you can redistribute it and/or modify
;; it under the terms of the GNU General Public License as published by
;; the Free Software Foundation; either version 3, or (at your option)
;; any later version.
;;
;; GCC is distributed in the hope that it will be useful,
;; but WITHOUT ANY WARRANTY; without even the implied warranty of
;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;; GNU General Public License for more details.
;;
;; You should have received a copy of the GNU General Public License
;; along with GCC; see the file COPYING3.  If not see
;; <http://www.gnu.org/licenses/>.

;; Things to improve:
;;
;; - Store instructions are implemented by micro-ops, one of which
;;   generates the store address and is executed in the store address
;;   generation unit in the slot0.  We need to model that.
;;
;; - There are two V3 pipes connected to different slots.  The current
;;   implementation assumes that all the instructions executing in a
;;   V3 pipe are issued to the unit in slot3.
;;
;; - Single-issue ALU operations incur an additional cycle of latency to
;;   slot 0 and slot 1 instructions.  This is not currently reflected
;;   in the DFA.

;; All the M8 issue slots are modeled as units of this single automaton.
(define_automaton "m8_0")

;; The S5 core has two dual-issue queues, PQLS and PQEX.  Each queue
;; is divided into two slots: PQLS corresponds to slots 0 and 1, and
;; PQEX corresponds to slots 2 and 3.  The core can issue 4
;; instructions per-cycle, and up to 4 instructions are committed each
;; cycle.
;;
;;                            
;;                   m8_slot0  - Load Unit.
;;                             - Store address gen. Unit.
;;                                                       
;;                            
;;   === PQLS ==>    m8_slot1  - Store data unit.
;;                             - Branch unit.
;;                                            
;;                             
;;   === PQEX ==>    m8_slot2  - Integer Unit (EXU2).                     
;;                             - 3-cycles Crypto Unit (SPU2).
;;                                                     
;;                   m8_slot3  - Integer Unit (EXU3).
;;                             - 3-cycles Crypto Unit (SPU3).
;;                             - Floating-point and graphics unit (FPG).
;;                             - Long-latency Crypto Unit.
;;                             - Oracle Numbers Unit (ONU).

;; One cpu unit per issue slot, all of them in the "m8_0" automaton.
(define_cpu_unit "m8_slot0,m8_slot1,m8_slot2,m8_slot3" "m8_0")

;; Some instructions stall the pipeline and prevent any other
;; instruction from being issued in the same cycle.  We assume the
;; same for multi-instruction insns.
;;
;; This is modeled by a reservation that takes the four slots at the
;; same time.

(define_reservation "m8_single_issue" "m8_slot0 + m8_slot1 + m8_slot2 + m8_slot3")

(define_insn_reservation "m8_single" 1
  (and (eq_attr "cpu" "m8")
       (eq_attr "type" "multi,savew,flushw,trap,bmask"))
  "m8_single_issue")

;; Most of the instructions executing in the integer units have a
;; latency of 1.  They can be issued to either slot2 or slot3.
;;
;; NOTE(review): `bmask' is listed here and also in the "m8_single"
;; reservation.  Presumably only one of the two is meant to apply to
;; it -- confirm which one and drop the other entry.

(define_insn_reservation "m8_integer" 1
  (and (eq_attr "cpu" "m8")
       (eq_attr "type" "ialu,ialuX,shift,cmove,compare,bmask"))
  "(m8_slot2 | m8_slot3)")

;; Flushing the instruction memory takes 27 cycles.
;;
;; The instruction is issued to one of the integer slots (slot2 or
;; slot3); the remaining 26 cycles of the reservation do not hold any
;; unit.

(define_insn_reservation "m8_iflush" 27
  (and (eq_attr "cpu" "m8")
       (eq_attr "type" "iflush"))
  "(m8_slot2 | m8_slot3), nothing*26")

;; The integer multiplication instructions have a latency of 10 cycles
;; and execute in integer units.
;;
;; Likewise for array*, edge* and pdistn instructions.
;;
;; However, the latency is only 9 cycles if the consumer of the
;; operation is also capable of 9 cycles latency.  We model this with
;; a bypass.

;; The reservation takes one of the integer slots (slot2 or slot3) in
;; the issue cycle and is then padded with `nothing' up to the declared
;; latency (1 + 9 = 10 cycles), like the other reservations in this
;; file.  The bypass lowers the latency to 9 cycles when both the
;; producer and the consumer are "m8_imul" instructions.

(define_insn_reservation "m8_imul" 10
  (and (eq_attr "cpu" "m8")
       (eq_attr "type" "imul,array,edge,edgen,pdistn"))
  "(m8_slot2 | m8_slot3), nothing*9")

(define_bypass 9 "m8_imul" "m8_imul")

;; The integer division instructions `sdiv' and `udivx' have a latency
;; of 30 cycles and execute in integer units.
;;
;; Only the issue cycle takes a slot (slot2 or slot3); the other 29
;; cycles of the reservation do not hold any unit.

(define_insn_reservation "m8_idiv" 30
  (and (eq_attr "cpu" "m8")
       (eq_attr "type" "idiv"))
  "(m8_slot2 | m8_slot3), nothing*29")

;; Both integer and floating-point load instructions have a latency of
;; only 3 cycles, and execute in the slot0.
;;
;; Misaligned load instructions feature a latency of 11 cycles.  They
;; are not modeled at the moment: the corresponding reservation is
;; commented out below.
;;
;; The prefetch instruction also executes in the load unit, but its
;; latency is only 1 cycle.

(define_insn_reservation "m8_load" 3
  (and (eq_attr "cpu" "m8")
       (ior (eq_attr "type" "fpload,sload")
            (and (eq_attr "type" "load")
                 (eq_attr "subtype" "regular"))))
  "m8_slot0, nothing*2")

;; (define_insn_reservation "m8_load_misalign" 11
;;  (and (eq_attr "cpu" "m8")
;;       (eq_attr "type" "load_mis,fpload_mis"))
;;  "m8_slot0, nothing*10")

;; Prefetches are insns of type `load' with subtype `prefetch'.  They
;; take slot0 for the issue cycle only and have a latency of 1 cycle.

(define_insn_reservation "m8_prefetch" 1
  (and (eq_attr "cpu" "m8")
       (eq_attr "type" "load")
       (eq_attr "subtype" "prefetch"))
  "m8_slot0")

;; Both integer and floating-point store instructions have a latency
;; of 1 cycle, and execute in the store data unit in slot1.
;;
;; However, misaligned store instructions feature a latency of 3
;; cycles.  They are not modeled at the moment: the corresponding
;; reservation is commented out below.
;;
;; Only slot1 is reserved here; the store address generation in slot0
;; is not modeled (see the list of things to improve at the top of
;; this file).

(define_insn_reservation "m8_store" 1
  (and (eq_attr "cpu" "m8")
       (eq_attr "type" "store,fpstore"))
  "m8_slot1")

;; (define_insn_reservation "m8_store_misalign" 3
;;   (and (eq_attr "cpu" "m8")
;;        (eq_attr "type" "store_mis,fpstore_mis"))
;;   "m8_slot1, nothing*2")

;; Control-transfer instructions execute in the Branch Unit in the
;; slot1.  They are modeled with a latency of 1 cycle and take slot1
;; for the issue cycle only.

(define_insn_reservation "m8_cti" 1
  (and (eq_attr "cpu" "m8")
       (eq_attr "type" "cbcond,uncond_cbcond,branch,call,sibcall,call_no_delay_slot,uncond_branch,return"))
  "m8_slot1")

;; Many instructions executing in the Floating-point and Graphics Unit
;; (FGU) serving slot3 feature a default latency of 9 cycles.
;;
;; This covers the floating-point types listed below plus the `fga'
;; instructions whose subtype is `fpu'.  Slot3 is taken for the issue
;; cycle only; the remaining 8 cycles do not hold any unit.

(define_insn_reservation "m8_fp" 9
  (and (eq_attr "cpu" "m8")
       (ior (eq_attr "type" "fpmove,fpcmove,fpcrmove,fp,fpcmp,fpmul,fgm_pack,fgm_mul,pdist")
            (and (eq_attr "type" "fga")
                 (eq_attr "subtype" "fpu"))))
  "m8_slot3, nothing*8")

;; Floating-point division and floating-point square-root instructions
;; have high latencies.  They execute in the FGU.
;;
;; Latencies modeled below, by insn type:
;;
;;   fpdivs   26 cycles
;;   fpsqrts  33 cycles
;;   fpdivd   30 cycles
;;   fpsqrtd  41 cycles
;;
;; In all cases slot3 is taken for the issue cycle only, and the rest
;; of the reservation does not hold any unit.

(define_insn_reservation "m8_fpdivs" 26
  (and (eq_attr "cpu" "m8")
       (eq_attr "type" "fpdivs"))
  "m8_slot3, nothing*25")

(define_insn_reservation "m8_fpsqrts" 33
  (and (eq_attr "cpu" "m8")
       (eq_attr "type" "fpsqrts"))
  "m8_slot3, nothing*32")

(define_insn_reservation "m8_fpdivd" 30
  (and (eq_attr "cpu" "m8")
       (eq_attr "type" "fpdivd"))
  "m8_slot3, nothing*29")

(define_insn_reservation "m8_fpsqrtd" 41
  (and (eq_attr "cpu" "m8")
       (eq_attr "type" "fpsqrtd"))
  "m8_slot3, nothing*40")

;; SIMD VIS instructions executing in the Floating-point and graphics
;; unit (FPG) in slot3 usually have a latency of 5 cycles.
;;
;; However, the latency for many instructions is only 3 cycles if the
;; consumer can also be executed in 3 cycles.  We model this with a
;; bypass.  In these cases the instructions are executed in one of the
;; two 3-cycle crypto units (SPU, also known as "v3-pipes") in slots 2
;; and 3.
;;
;; Note that the reservation below always issues to slot3, even for
;; the instructions that could use the v3-pipe in slot2 (see the list
;; of things to improve at the top of this file).

(define_insn_reservation "m8_vis" 5
  (and (eq_attr "cpu" "m8")
       (ior (eq_attr "type" "viscmp,lzd")
            (and (eq_attr "type" "fga")
                 (eq_attr "subtype" "maxmin,cmask,other"))
            (and (eq_attr "type" "vismv")
                 (eq_attr "subtype" "single,movstouw"))
            (and (eq_attr "type" "visl")
                 (eq_attr "subtype" "single"))))
  "m8_slot3, nothing*4")

;; 3 cycles of latency when both producer and consumer are "m8_vis".
(define_bypass 3 "m8_vis" "m8_vis")

;; `gsr' instructions of subtype `alignaddr' use the same reservation
;; and 5-cycle latency as "m8_vis", but they do not take part in the
;; 3-cycle "m8_vis" bypass.

(define_insn_reservation "m8_gsr" 5
  (and (eq_attr "cpu" "m8")
       (eq_attr "type" "gsr")
       (eq_attr "subtype" "alignaddr"))
  "m8_slot3, nothing*4")

;; A few VIS instructions have a latency of 1.
;;
;; These are the `vismv' instructions of subtype `double', `movxtod'
;; or `movdtox', the `visl' instructions of subtype `double', and the
;; `fga' instructions of subtype `addsub64'.  They take slot3 for the
;; issue cycle only.

(define_insn_reservation "m8_vis_1cycle" 1
  (and (eq_attr "cpu" "m8")
       (ior (and (eq_attr "type" "vismv")
                 (eq_attr "subtype" "double,movxtod,movdtox"))
            (and (eq_attr "type" "visl")
                 (eq_attr "subtype" "double"))
            (and (eq_attr "type" "fga")
                 (eq_attr "subtype" "addsub64"))))
  "m8_slot3")

;; Reading and writing to the gsr register takes more than 70 cycles.
;; The DFA models it with a latency of exactly 70 cycles: slot3 is
;; taken for the issue cycle and the other 69 cycles do not hold any
;; unit.

(define_insn_reservation "m8_gsr_reg" 70
  (and (eq_attr "cpu" "m8")
       (eq_attr "type" "gsr")
       (eq_attr "subtype" "reg"))
  "m8_slot3, nothing*69")
Commit	Line	Data
	1	;; Scheduling description for the SPARC M8.
	2	;; Copyright (C) 2017-2020 Free Software Foundation, Inc.
	3	;;
	4	;; This file is part of GCC.
	5	;;
	6	;; GCC is free software; you can redistribute it and/or modify
	7	;; it under the terms of the GNU General Public License as published by
	8	;; the Free Software Foundation; either version 3, or (at your option)
	9	;; any later version.
	10	;;
	11	;; GCC is distributed in the hope that it will be useful,
	12	;; but WITHOUT ANY WARRANTY; without even the implied warranty of
	13	;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	14	;; GNU General Public License for more details.
	15	;;
	16	;; You should have received a copy of the GNU General Public License
	17	;; along with GCC; see the file COPYING3. If not see
	18	;; <http://www.gnu.org/licenses/>.
	19
	20	;; Thigs to improve:
	21	;;
	22	;; - Store instructions are implemented by micro-ops, one of which
	23	;; generates the store address and is executed in the store address
	24	;; generation unit in the slot0. We need to model that.
	25	;;
	26	;; - There are two V3 pipes connected to different slots. The current
	27	;; implementation assumes that all the instructions executing in a
	28	;; V3 pipe are issued to the unit in slot3.
	29	;;
	30	;; - Single-issue ALU operations incur an additional cycle of latency to
	31	;; slot 0 and slot 1 instructions. This is not currently reflected
	32	;; in the DFA.
	33
	34	(define_automaton "m8_0")
	35
	36	;; The S5 core has two dual-issue queues, PQLS and PQEX. Each queue
	37	;; is divided into two slots: PQLS corresponds to slots 0 and 1, and
	38	;; PQEX corresponds to slots 2 and 3. The core can issue 4
	39	;; instructions per-cycle, and up to 4 instructions are committed each
	40	;; cycle.
	41	;;
	42	;;
	43	;; m8_slot0 - Load Unit.
	44	;; - Store address gen. Unit.
	45	;;
	46	;;
	47	;; === PQLS ==> m8_slot1 - Store data unit.
	48	;; - Branch unit.
	49	;;
	50	;;
	51	;; === PQEX ==> m8_slot2 - Integer Unit (EXU2).
	52	;; - 3-cycles Crypto Unit (SPU2).
	53	;;
	54	;; m8_slot3 - Integer Unit (EXU3).
	55	;; - 3-cycles Crypto Unit (SPU3).
	56	;; - Floating-point and graphics unit (FPG).
	57	;; - Long-latency Crypto Unit.
	58	;; - Oracle Numbers Unit (ONU).
	59
	60	(define_cpu_unit "m8_slot0,m8_slot1,m8_slot2,m8_slot3" "m8_0")
	61
	62	;; Some instructions stall the pipeline and avoid any other
	63	;; instruction to be issued in the same cycle. We assume the same for
	64	;; multi-instruction insns.
	65
	66	(define_reservation "m8_single_issue" "m8_slot0 + m8_slot1 + m8_slot2 + m8_slot3")
	67
	68	(define_insn_reservation "m8_single" 1
	69	(and (eq_attr "cpu" "m8")
	70	(eq_attr "type" "multi,savew,flushw,trap,bmask"))
	71	"m8_single_issue")
	72
	73	;; Most of the instructions executing in the integer units have a
	74	;; latency of 1.
	75
	76	(define_insn_reservation "m8_integer" 1
	77	(and (eq_attr "cpu" "m8")
	78	(eq_attr "type" "ialu,ialuX,shift,cmove,compare,bmask"))
	79	"(m8_slot2 \| m8_slot3)")
	80
	81	;; Flushing the instruction memory takes 27 cycles.
	82
	83
	84	(define_insn_reservation "m8_iflush" 27
	85	(and (eq_attr "cpu" "m8")
	86	(eq_attr "type" "iflush"))
	87	"(m8_slot2 \| m8_slot3), nothing*26")
	88
	89	;; The integer multiplication instructions have a latency of 10 cycles
	90	;; and execute in integer units.
	91	;;
	92	;; Likewise for array, edge and pdistn instructions.
	93	;;
	94	;; However, the latency is only 9 cycles if the consumer of the
	95	;; operation is also capable of 9 cycles latency. We model this with
	96	;; a bypass.
	97
	98	(define_insn_reservation "m8_imul" 10
	99	(and (eq_attr "cpu" "m8")
	100	(eq_attr "type" "imul,array,edge,edgen,pdistn"))
	101	"(m8_slot2 \| m8_slot3), nothing*12")
	102
	103	(define_bypass 9 "m8_imul" "m8_imul")
	104
	105	;; The integer division instructions `sdiv' and `udivx' have a latency
	106	;; of 30 cycles and execute in integer units.
	107
	108	(define_insn_reservation "m8_idiv" 30
	109	(and (eq_attr "cpu" "m8")
	110	(eq_attr "type" "idiv"))
	111	"(m8_slot2 \| m8_slot3), nothing*29")
	112
	113	;; Both integer and floating-point load instructions have a latency of
	114	;; only 3 cycles,and execute in the slot0.
	115	;;
	116	;; Misaligned load instructions feature a latency of 11 cycles.
	117	;;
	118	;; The prefetch instruction also executes in the load unit, but it's
	119	;; latency is only 1 cycle.
	120
	121	(define_insn_reservation "m8_load" 3
	122	(and (eq_attr "cpu" "m8")
	123	(ior (eq_attr "type" "fpload,sload")
	124	(and (eq_attr "type" "load")
	125	(eq_attr "subtype" "regular"))))
	126	"m8_slot0, nothing*2")
	127
	128	;; (define_insn_reservation "m8_load_misalign" 11
	129	;; (and (eq_attr "cpu" "m8")
	130	;; (eq_attr "type" "load_mis,fpload_mis"))
	131	;; "m8_slot0, nothing*10")
	132
	133	(define_insn_reservation "m8_prefetch" 1
	134	(and (eq_attr "cpu" "m8")
	135	(eq_attr "type" "load")
	136	(eq_attr "subtype" "prefetch"))
	137	"m8_slot0")
	138
	139	;; Both integer and floating-point store instructions have a latency
	140	;; of 1 cycle, and execute in the store data unit in slot1.
	141	;;
	142	;; However, misaligned store instructions feature a latency of 3
	143	;; cycles.
	144
	145	(define_insn_reservation "m8_store" 1
	146	(and (eq_attr "cpu" "m8")
	147	(eq_attr "type" "store,fpstore"))
	148	"m8_slot1")
	149
	150	;; (define_insn_reservation "m8_store_misalign" 3
	151	;; (and (eq_attr "cpu" "m8")
	152	;; (eq_attr "type" "store_mis,fpstore_mis"))
	153	;; "m8_slot1, nothing*2")
	154
	155	;; Control-transfer instructions execute in the Branch Unit in the
	156	;; slot1.
	157
	158	(define_insn_reservation "m8_cti" 1
	159	(and (eq_attr "cpu" "m8")
	160	(eq_attr "type" "cbcond,uncond_cbcond,branch,call,sibcall,call_no_delay_slot,uncond_branch,return"))
	161	"m8_slot1")
	162
	163	;; Many instructions executing in the Floating-point and Graphics Unit
	164	;; (FGU) serving slot3 feature a default latency of 9 cycles.
	165
	166	(define_insn_reservation "m8_fp" 9
	167	(and (eq_attr "cpu" "m8")
	168	(ior (eq_attr "type" "fpmove,fpcmove,fpcrmove,fp,fpcmp,fpmul,fgm_pack,fgm_mul,pdist")
	169	(and (eq_attr "type" "fga")
	170	(eq_attr "subtype" "fpu"))))
	171	"m8_slot3, nothing*8")
	172
	173	;; Floating-point division and floating-point square-root instructions
	174	;; have high latencies. They execute in the FGU.
	175
	176	(define_insn_reservation "m8_fpdivs" 26
	177	(and (eq_attr "cpu" "m8")
	178	(eq_attr "type" "fpdivs"))
	179	"m8_slot3, nothing*25")
	180
	181	(define_insn_reservation "m8_fpsqrts" 33
	182	(and (eq_attr "cpu" "m8")
	183	(eq_attr "type" "fpsqrts"))
	184	"m8_slot3, nothing*32")
	185
	186	(define_insn_reservation "m8_fpdivd" 30
	187	(and (eq_attr "cpu" "m8")
	188	(eq_attr "type" "fpdivd"))
	189	"m8_slot3, nothing*29")
	190
	191	(define_insn_reservation "m8_fpsqrtd" 41
	192	(and (eq_attr "cpu" "m8")
	193	(eq_attr "type" "fpsqrtd"))
	194	"m8_slot3, nothing*40")
	195
	196	;; SIMD VIS instructions executing in the Floating-point and graphics
	197	;; unit (FPG) in slot3 usually have a latency of 5 cycles.
	198	;;
	199	;; However, the latency for many instructions is only 3 cycles if the
	200	;; consumer can also be executed in 3 cycles. We model this with a
	201	;; bypass. In these cases the instructions are executed in one of the
	202	;; two 3-cycle crypto units (SPU, also known as "v3-pipes") in slots 2
	203	;; and 3.
	204
	205	(define_insn_reservation "m8_vis" 5
	206	(and (eq_attr "cpu" "m8")
	207	(ior (eq_attr "type" "viscmp,lzd")
	208	(and (eq_attr "type" "fga")
	209	(eq_attr "subtype" "maxmin,cmask,other"))
	210	(and (eq_attr "type" "vismv")
	211	(eq_attr "subtype" "single,movstouw"))
	212	(and (eq_attr "type" "visl")
	213	(eq_attr "subtype" "single"))))
	214	"m8_slot3, nothing*4")
	215
	216	(define_bypass 3 "m8_vis" "m8_vis")
	217
	218	(define_insn_reservation "m8_gsr" 5
	219	(and (eq_attr "cpu" "m8")
	220	(eq_attr "type" "gsr")
	221	(eq_attr "subtype" "alignaddr"))
	222	"m8_slot3, nothing*4")
	223
	224	;; A few VIS instructions have a latency of 1.
	225
	226	(define_insn_reservation "m8_vis_1cycle" 1
	227	(and (eq_attr "cpu" "m8")
	228	(ior (and (eq_attr "type" "vismv")
	229	(eq_attr "subtype" "double,movxtod,movdtox"))
	230	(and (eq_attr "type" "visl")
	231	(eq_attr "subtype" "double"))
	232	(and (eq_attr "type" "fga")
	233	(eq_attr "subtype" "addsub64"))))
	234	"m8_slot3")
	235
	236	;; Reading and writing to the gsr register takes more than 70 cycles.
	237
	238	(define_insn_reservation "m8_gsr_reg" 70
	239	(and (eq_attr "cpu" "m8")
	240	(eq_attr "type" "gsr")
	241	(eq_attr "subtype" "reg"))
	242	"m8_slot3, nothing*69")