]>
Commit | Line | Data |
---|---|---|
1 | ;; Scheduling description for the SPARC M8. | |
2 | ;; Copyright (C) 2017-2020 Free Software Foundation, Inc. | |
3 | ;; | |
4 | ;; This file is part of GCC. | |
5 | ;; | |
6 | ;; GCC is free software; you can redistribute it and/or modify | |
7 | ;; it under the terms of the GNU General Public License as published by | |
8 | ;; the Free Software Foundation; either version 3, or (at your option) | |
9 | ;; any later version. | |
10 | ;; | |
11 | ;; GCC is distributed in the hope that it will be useful, | |
12 | ;; but WITHOUT ANY WARRANTY; without even the implied warranty of | |
13 | ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
14 | ;; GNU General Public License for more details. | |
15 | ;; | |
16 | ;; You should have received a copy of the GNU General Public License | |
17 | ;; along with GCC; see the file COPYING3. If not see | |
18 | ;; <http://www.gnu.org/licenses/>. | |
19 | ||
20 | ;; Things to improve: | |
21 | ;; | |
22 | ;; - Store instructions are implemented by micro-ops, one of which | |
23 | ;; generates the store address and is executed in the store address | |
24 | ;; generation unit in the slot0. We need to model that. | |
25 | ;; | |
26 | ;; - There are two V3 pipes connected to different slots. The current | |
27 | ;; implementation assumes that all the instructions executing in a | |
28 | ;; V3 pipe are issued to the unit in slot3. | |
29 | ;; | |
30 | ;; - Single-issue ALU operations incur an additional cycle of latency to | |
31 | ;; slot 0 and slot 1 instructions. This is not currently reflected | |
32 | ;; in the DFA. | |
33 | ||
34 | (define_automaton "m8_0") ; the automaton that all four m8_slot* units are declared in | |
35 | ||
36 | ;; The S5 core has two dual-issue queues, PQLS and PQEX. Each queue | |
37 | ;; is divided into two slots: PQLS corresponds to slots 0 and 1, and | |
38 | ;; PQEX corresponds to slots 2 and 3. The core can issue 4 | |
39 | ;; instructions per cycle, and up to 4 instructions are committed each | |
40 | ;; cycle. | |
41 | ;; | |
42 | ;; Execution units attached to each slot: | |
43 | ;; m8_slot0 - Load Unit. | |
44 | ;; - Store address gen. Unit. | |
45 | ;; | |
46 | ;; | |
47 | ;; === PQLS ==> m8_slot1 - Store data unit. | |
48 | ;; - Branch unit. | |
49 | ;; | |
50 | ;; | |
51 | ;; === PQEX ==> m8_slot2 - Integer Unit (EXU2). | |
52 | ;; - 3-cycles Crypto Unit (SPU2). | |
53 | ;; | |
54 | ;; m8_slot3 - Integer Unit (EXU3). | |
55 | ;; - 3-cycles Crypto Unit (SPU3). | |
56 | ;; - Floating-point and graphics unit (FPG). | |
57 | ;; - Long-latency Crypto Unit. | |
58 | ;; - Oracle Numbers Unit (ONU). | |
59 | ||
60 | (define_cpu_unit "m8_slot0,m8_slot1,m8_slot2,m8_slot3" "m8_0") | |
61 | ||
62 | ;; Some instructions stall the pipeline and prevent any other | |
63 | ;; instruction from being issued in the same cycle, so they reserve | |
64 | ;; all four slots at once. We assume the same for multi-instruction insns. | |
65 | ||
66 | (define_reservation "m8_single_issue" "m8_slot0 + m8_slot1 + m8_slot2 + m8_slot3") | |
67 | ||
68 | (define_insn_reservation "m8_single" 1 ; latency 1; reserves every slot for the issue cycle | |
69 | (and (eq_attr "cpu" "m8") | |
70 | (eq_attr "type" "multi,savew,flushw,trap,bmask")) | |
71 | "m8_single_issue") | |
72 | ||
73 | ;; Most of the instructions executing in the integer units have a | |
74 | ;; latency of 1. ("bmask" is claimed by "m8_single", so not repeated.) | |
75 | ||
76 | (define_insn_reservation "m8_integer" 1 | |
77 | (and (eq_attr "cpu" "m8") | |
78 | (eq_attr "type" "ialu,ialuX,shift,cmove,compare")) | |
79 | "(m8_slot2 | m8_slot3)") | |
80 | ||
81 | ;; Flushing the instruction memory takes 27 cycles; the insn issues | |
82 | ;; to either integer slot (slot2 or slot3). | |
83 | ||
84 | (define_insn_reservation "m8_iflush" 27 | |
85 | (and (eq_attr "cpu" "m8") | |
86 | (eq_attr "type" "iflush")) | |
87 | "(m8_slot2 | m8_slot3), nothing*26") | |
88 | ||
89 | ;; The integer multiplication instructions have a latency of 10 cycles | |
90 | ;; and execute in integer units: one issue cycle in slot2 or slot3, | |
91 | ;; then 9 cycles in which no unit is reserved. | |
92 | ;; Likewise for array*, edge* and pdistn instructions. | |
93 | ;; | |
94 | ;; However, the latency is only 9 cycles if the consumer of the | |
95 | ;; operation is also capable of 9 cycles latency. We model this with | |
96 | ;; a bypass. | |
97 | ||
98 | (define_insn_reservation "m8_imul" 10 | |
99 | (and (eq_attr "cpu" "m8") | |
100 | (eq_attr "type" "imul,array,edge,edgen,pdistn")) | |
101 | "(m8_slot2 | m8_slot3), nothing*9") | |
102 | ||
103 | (define_bypass 9 "m8_imul" "m8_imul") ; 9-cycle latency when producer and consumer are both m8_imul insns | |
104 | ||
105 | ;; The integer division instructions `sdiv' and `udivx' have a latency | |
106 | ;; of 30 cycles; they issue to either integer slot (slot2 or slot3). | |
107 | ||
108 | (define_insn_reservation "m8_idiv" 30 | |
109 | (and (eq_attr "cpu" "m8") | |
110 | (eq_attr "type" "idiv")) | |
111 | "(m8_slot2 | m8_slot3), nothing*29") | |
112 | ||
113 | ;; Both integer and floating-point load instructions have a latency of | |
114 | ;; only 3 cycles, and execute in the slot0. | |
115 | ;; | |
116 | ;; Misaligned load instructions feature a latency of 11 cycles (the | |
117 | ;; reservation for them below is commented out). | |
118 | ;; The prefetch instruction also executes in the load unit, but its | |
119 | ;; latency is only 1 cycle. | |
120 | ||
121 | (define_insn_reservation "m8_load" 3 | |
122 | (and (eq_attr "cpu" "m8") | |
123 | (ior (eq_attr "type" "fpload,sload") | |
124 | (and (eq_attr "type" "load") | |
125 | (eq_attr "subtype" "regular")))) | |
126 | "m8_slot0, nothing*2") | |
127 | ||
128 | ;; (define_insn_reservation "m8_load_misalign" 11 | |
129 | ;; (and (eq_attr "cpu" "m8") | |
130 | ;; (eq_attr "type" "load_mis,fpload_mis")) | |
131 | ;; "m8_slot0, nothing*10") | |
132 | ||
133 | (define_insn_reservation "m8_prefetch" 1 ; "load" insns of subtype "prefetch": slot0, 1-cycle latency | |
134 | (and (eq_attr "cpu" "m8") | |
135 | (eq_attr "type" "load") | |
136 | (eq_attr "subtype" "prefetch")) | |
137 | "m8_slot0") | |
138 | ||
139 | ;; Both integer and floating-point store instructions have a latency | |
140 | ;; of 1 cycle, and execute in the store data unit in slot1. | |
141 | ;; | |
142 | ;; However, misaligned store instructions feature a latency of 3 | |
143 | ;; cycles (the reservation for them below is commented out). | |
144 | ||
145 | (define_insn_reservation "m8_store" 1 | |
146 | (and (eq_attr "cpu" "m8") | |
147 | (eq_attr "type" "store,fpstore")) | |
148 | "m8_slot1") | |
149 | ||
150 | ;; (define_insn_reservation "m8_store_misalign" 3 | |
151 | ;; (and (eq_attr "cpu" "m8") | |
152 | ;; (eq_attr "type" "store_mis,fpstore_mis")) | |
153 | ;; "m8_slot1, nothing*2") | |
154 | ||
155 | ;; Control-transfer instructions (branches, calls and returns) execute | |
156 | ;; in the Branch Unit in the slot1, with a latency of 1 cycle. | |
157 | ||
158 | (define_insn_reservation "m8_cti" 1 | |
159 | (and (eq_attr "cpu" "m8") | |
160 | (eq_attr "type" "cbcond,uncond_cbcond,branch,call,sibcall,call_no_delay_slot,uncond_branch,return")) | |
161 | "m8_slot1") | |
162 | ||
163 | ;; Many instructions executing in the Floating-point and Graphics Unit | |
164 | ;; (FGU) attached to slot3 have a default latency of 9 cycles. | |
165 | ||
166 | (define_insn_reservation "m8_fp" 9 | |
167 | (and (eq_attr "cpu" "m8") | |
168 | (ior (eq_attr "type" "fpmove,fpcmove,fpcrmove,fp,fpcmp,fpmul,fgm_pack,fgm_mul,pdist") | |
169 | (and (eq_attr "type" "fga") | |
170 | (eq_attr "subtype" "fpu")))) | |
171 | "m8_slot3, nothing*8") | |
172 | ||
173 | ;; Floating-point division and floating-point square-root instructions | |
174 | ;; have high latencies (26 to 41 cycles). They execute in the FGU. | |
175 | ||
176 | (define_insn_reservation "m8_fpdivs" 26 | |
177 | (and (eq_attr "cpu" "m8") | |
178 | (eq_attr "type" "fpdivs")) | |
179 | "m8_slot3, nothing*25") | |
180 | ||
181 | (define_insn_reservation "m8_fpsqrts" 33 ; type "fpsqrts": issues to slot3, 33-cycle latency | |
182 | (and (eq_attr "cpu" "m8") | |
183 | (eq_attr "type" "fpsqrts")) | |
184 | "m8_slot3, nothing*32") | |
185 | ||
186 | (define_insn_reservation "m8_fpdivd" 30 ; type "fpdivd": issues to slot3, 30-cycle latency | |
187 | (and (eq_attr "cpu" "m8") | |
188 | (eq_attr "type" "fpdivd")) | |
189 | "m8_slot3, nothing*29") | |
190 | ||
191 | (define_insn_reservation "m8_fpsqrtd" 41 ; type "fpsqrtd": issues to slot3, 41-cycle latency | |
192 | (and (eq_attr "cpu" "m8") | |
193 | (eq_attr "type" "fpsqrtd")) | |
194 | "m8_slot3, nothing*40") | |
195 | ||
196 | ;; SIMD VIS instructions executing in the Floating-point and graphics | |
197 | ;; unit (FPG) in slot3 usually have a latency of 5 cycles. | |
198 | ;; | |
199 | ;; However, the latency for many instructions is only 3 cycles if the | |
200 | ;; consumer can also be executed in 3 cycles. We model this with a | |
201 | ;; bypass. In these cases the instructions are executed in one of the | |
202 | ;; two 3-cycle crypto units (SPU, also known as "v3-pipes") in slots 2 | |
203 | ;; and 3, although this reservation always uses slot3. | |
204 | ||
205 | (define_insn_reservation "m8_vis" 5 | |
206 | (and (eq_attr "cpu" "m8") | |
207 | (ior (eq_attr "type" "viscmp,lzd") | |
208 | (and (eq_attr "type" "fga") | |
209 | (eq_attr "subtype" "maxmin,cmask,other")) | |
210 | (and (eq_attr "type" "vismv") | |
211 | (eq_attr "subtype" "single,movstouw")) | |
212 | (and (eq_attr "type" "visl") | |
213 | (eq_attr "subtype" "single")))) | |
214 | "m8_slot3, nothing*4") | |
215 | ||
216 | (define_bypass 3 "m8_vis" "m8_vis") ; 3-cycle latency when producer and consumer are both m8_vis insns | |
217 | ||
218 | (define_insn_reservation "m8_gsr" 5 ; "gsr" insns of subtype "alignaddr": slot3, 5-cycle latency | |
219 | (and (eq_attr "cpu" "m8") | |
220 | (eq_attr "type" "gsr") | |
221 | (eq_attr "subtype" "alignaddr")) | |
222 | "m8_slot3, nothing*4") | |
223 | ||
224 | ;; A few VIS instructions have a latency of just 1 cycle; they issue to slot3. | |
225 | ||
226 | (define_insn_reservation "m8_vis_1cycle" 1 | |
227 | (and (eq_attr "cpu" "m8") | |
228 | (ior (and (eq_attr "type" "vismv") | |
229 | (eq_attr "subtype" "double,movxtod,movdtox")) | |
230 | (and (eq_attr "type" "visl") | |
231 | (eq_attr "subtype" "double")) | |
232 | (and (eq_attr "type" "fga") | |
233 | (eq_attr "subtype" "addsub64")))) | |
234 | "m8_slot3") | |
235 | ||
236 | ;; Reading and writing the gsr register takes more than 70 cycles; we model 70. | |
237 | ||
238 | (define_insn_reservation "m8_gsr_reg" 70 | |
239 | (and (eq_attr "cpu" "m8") | |
240 | (eq_attr "type" "gsr") | |
241 | (eq_attr "subtype" "reg")) | |
242 | "m8_slot3, nothing*69") |