]>
Commit | Line | Data |
---|---|---|
c39b724a | 1 | ;; DFA scheduling description for SH4. |
f1717362 | 2 | ;; Copyright (C) 2004-2016 Free Software Foundation, Inc. |
c39b724a | 3 | |
4 | ;; This file is part of GCC. | |
5 | ||
6 | ;; GCC is free software; you can redistribute it and/or modify | |
7 | ;; it under the terms of the GNU General Public License as published by | |
038d1e19 | 8 | ;; the Free Software Foundation; either version 3, or (at your option) |
c39b724a | 9 | ;; any later version. |
10 | ||
11 | ;; GCC is distributed in the hope that it will be useful, | |
12 | ;; but WITHOUT ANY WARRANTY; without even the implied warranty of | |
13 | ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
14 | ;; GNU General Public License for more details. | |
15 | ||
16 | ;; You should have received a copy of the GNU General Public License | |
038d1e19 | 17 | ;; along with GCC; see the file COPYING3. If not see |
18 | ;; <http://www.gnu.org/licenses/>. | |
c39b724a | 19 | |
20 | ;; Load and store instructions save a cycle if they are aligned on a | |
21 | ;; four byte boundary. Using a function unit for stores encourages | |
22 | ;; gcc to separate load and store instructions by one instruction, | |
23 | ;; which makes it more likely that the linker will be able to word | |
24 | ;; align them when relaxing. | |
25 | ||
26 | ;; The following description models the SH4 pipeline using the DFA based | |
27 | ;; scheduler. The DFA based description is a better way to model a | |
28 | ;; superscalar pipeline as compared to function unit reservation model. | |
29 | ;; 1. The function unit based model is oriented to describe at most one | |
30 | ;; unit reservation by each insn. It is difficult to model unit reservations | |
31 | ;; in multiple pipeline units by same insn. This can be done using DFA | |
32 | ;; based description. | |
33 | ;; 2. The execution performance of DFA based scheduler does not depend on | |
34 | ;; processor complexity. | |
35 | ;; 3. Writing all unit reservations for an instruction class is a more natural | |
36 | ;; description of the pipeline and makes the interface to the hazard | |
37 | ;; recognizer simpler than the old function unit based model. | |
38 | ;; 4. The DFA model is richer and is a part of greater overall framework | |
39 | ;; of RCSP. | |
40 | ||
41 | ||
42 | ;; Two automata are defined to reduce number of states | |
43 | ;; which a single large automaton will have. (Factoring) | |
c39b724a | 44 | (define_automaton "inst_pipeline,fpu_pipe") |
45 | ||
46 | ;; This unit is basically the decode unit of the processor. | |
47 | ;; Since SH4 is a dual issue machine, it is as if there are two | |
48 | ;; units so that any insn can be processed by either one | |
49 | ;; of the decoding units. | |
c39b724a | 50 | (define_cpu_unit "pipe_01,pipe_02" "inst_pipeline") |
51 | ||
52 | ||
53 | ;; The fixed point arithmetic calculator(?? EX Unit). | |
c39b724a | 54 | (define_cpu_unit "int" "inst_pipeline") |
55 | ||
56 | ;; f1_1 and f1_2 are floating point units. Actually there is | |
57 | ;; an f1 unit which can overlap with another f1 unit but | |
58 | ;; not with an F1 unit. It is as though there were two | |
59 | ;; f1 units. | |
c39b724a | 60 | (define_cpu_unit "f1_1,f1_2" "fpu_pipe") |
61 | ||
62 | ;; The floating point units (except FS - F2 always precedes it.) | |
c39b724a | 63 | (define_cpu_unit "F0,F1,F2,F3" "fpu_pipe") |
64 | ||
65 | ;; This is basically the MA unit of SH4 | |
66 | ;; used in LOAD/STORE pipeline. | |
c39b724a | 67 | (define_cpu_unit "memory" "inst_pipeline") |
68 | ||
69 | ;; However, there are LS group insns that don't use it, even ones that | |
70 | ;; complete in 0 cycles. So we use an extra unit for the issue of LS insns. | |
71 | (define_cpu_unit "load_store" "inst_pipeline") | |
72 | ||
73 | ;; The address calculator used for branch instructions. | |
74 | ;; This will be reserved after "issue" of branch instructions | |
75 | ;; and this is to make sure that no two branch instructions | |
76 | ;; can be issued in parallel. | |
77 | ||
78 | (define_cpu_unit "pcr_addrcalc" "inst_pipeline") | |
79 | ||
80 | ;; ---------------------------------------------------- | |
81 | ;; This reservation is to simplify the dual issue description. | |
c39b724a | 82 | (define_reservation "issue" "pipe_01|pipe_02") |
83 | ||
84 | ;; This is to express the locking of D stage. | |
85 | ;; Note that the issue of a CO group insn also effectively locks the D stage. | |
c39b724a | 86 | (define_reservation "d_lock" "pipe_01+pipe_02") |
87 | ||
88 | ;; Every FE instruction but fipr / ftrv starts with issue and this. | |
89 | (define_reservation "F01" "F0+F1") | |
90 | ||
91 | ;; This is to simplify description where F1,F2,FS | |
92 | ;; are used simultaneously. | |
c39b724a | 93 | (define_reservation "fpu" "F1+F2") |
94 | ||
95 | ;; This is to highlight the fact that f1 | |
96 | ;; cannot overlap with F1. | |
c39b724a | 97 | (exclusion_set "f1_1,f1_2" "F1") |
98 | ||
99 | (define_insn_reservation "nil" 0 (eq_attr "type" "nil") "nothing") | |
100 | ||
101 | ;; Although reg moves have a latency of zero | |
102 | ;; we need to highlight that they use D stage | |
103 | ;; for one cycle. | |
104 | ||
105 | ;; Group: MT | |
c39b724a | 106 | (define_insn_reservation "reg_mov" 0 |
107 | (and (eq_attr "pipe_model" "sh4") | |
108 | (eq_attr "type" "move")) | |
109 | "issue") | |
110 | ||
111 | ;; Group: LS | |
c39b724a | 112 | (define_insn_reservation "freg_mov" 0 |
113 | (and (eq_attr "pipe_model" "sh4") | |
114 | (eq_attr "type" "fmove")) | |
115 | "issue+load_store") | |
116 | ||
117 | ;; We don't model all pipeline stages; we model the issue ('D') stage | |
118 | ;; inasmuch as we allow only two instructions to issue simultaneously, | |
119 | ;; and CO instructions prevent any simultaneous issue of another instruction. | |
120 | ;; (This uses pipe_01 and pipe_02). | |
121 | ;; Double issue of EX insns is prevented by using the int unit in the EX stage. | |
122 | ;; Double issue of EX / BR insns is prevented by using the int unit / | |
123 | ;; pcr_addrcalc unit in the EX stage. | |
124 | ;; Double issue of BR / LS instructions is prevented by using the | |
125 | ;; pcr_addrcalc / load_store unit in the issue cycle. | |
126 | ;; Double issue of FE instructions is prevented by using F0 in the first | |
127 | ;; pipeline stage after the first D stage. | |
128 | ;; There is no need to describe the [ES]X / [MN]A / S stages after a D stage | |
129 | ;; (except in the cases outlined above), nor to describe the FS stage after | |
130 | ;; the F2 stage. | |
131 | ||
132 | ;; Other MT group instructions(1 step operations) | |
133 | ;; Group: MT | |
134 | ;; Latency: 1 | |
135 | ;; Issue Rate: 1 | |
c39b724a | 136 | (define_insn_reservation "mt" 1 |
137 | (and (eq_attr "pipe_model" "sh4") | |
138 | (eq_attr "type" "mt_group")) | |
139 | "issue") | |
140 | ||
141 | ;; Fixed Point Arithmetic Instructions(1 step operations) | |
142 | ;; Group: EX | |
143 | ;; Latency: 1 | |
144 | ;; Issue Rate: 1 | |
c39b724a | 145 | (define_insn_reservation "sh4_simple_arith" 1 |
146 | (and (eq_attr "pipe_model" "sh4") | |
147 | (eq_attr "insn_class" "ex_group")) | |
148 | "issue,int") | |
149 | ||
150 | ;; Load and store instructions have no alignment peculiarities for the SH4, | |
151 | ;; but they use the load-store unit, which they share with the fmove type | |
152 | ;; insns (fldi[01]; fmov frn,frm; flds; fsts; fabs; fneg) . | |
153 | ;; Loads have a latency of two. | |
154 | ;; However, call insns can only be paired with a preceding insn, and have | |
155 | ;; a delay slot, so that we want two more insns to be scheduled between the | |
156 | ;; load of the function address and the call. This is equivalent to a | |
157 | ;; latency of three. | |
158 | ;; ADJUST_COST can only properly handle reductions of the cost, so we | |
159 | ;; use a latency of three here, which gets multiplied by 10 to yield 30. | |
160 | ;; We only do this for SImode loads of general registers, to make the work | |
161 | ;; for ADJUST_COST easier. | |
162 | ||
163 | ;; Load Store instructions. (MOV.[BWL]@(d,GBR)) | |
164 | ;; Group: LS | |
165 | ;; Latency: 2 | |
166 | ;; Issue Rate: 1 | |
c39b724a | 167 | (define_insn_reservation "sh4_load" 2 |
168 | (and (eq_attr "pipe_model" "sh4") | |
169 | (eq_attr "type" "load,pcload")) | |
170 | "issue+load_store,nothing,memory") | |
171 | ||
172 | ;; calls / sfuncs need an extra instruction for their delay slot. | |
173 | ;; Moreover, estimating the latency for SImode loads as 3 will also allow | |
174 | ;; adjust_cost to meaningfully bump it back up to 3 if they load the shift | |
175 | ;; count of a dynamic shift. | |
176 | (define_insn_reservation "sh4_load_si" 3 | |
177 | (and (eq_attr "pipe_model" "sh4") | |
178 | (eq_attr "type" "load_si,pcload_si")) | |
179 | "issue+load_store,nothing,memory") | |
180 | ||
181 | ;; (define_bypass 2 "sh4_load_si" "!sh4_call") | |
182 | ||
183 | ;; The load latency is upped to three higher if the dependent insn does | |
184 | ;; double precision computation. We want the 'default' latency to reflect | |
185 | ;; that increased latency because otherwise the insn priorities won't | |
186 | ;; allow proper scheduling. | |
187 | (define_insn_reservation "sh4_fload" 3 | |
188 | (and (eq_attr "pipe_model" "sh4") | |
189 | (eq_attr "type" "fload,pcfload")) | |
190 | "issue+load_store,nothing,memory") | |
191 | ||
192 | ;; (define_bypass 2 "sh4_fload" "!") | |
193 | ||
194 | (define_insn_reservation "sh4_store" 1 | |
195 | (and (eq_attr "pipe_model" "sh4") | |
5be30882 | 196 | (eq_attr "type" "store,fstore")) |
c39b724a | 197 | "issue+load_store,nothing,memory") |
198 | ||
5be30882 | 199 | (define_insn_reservation "mac_mem" 1 |
200 | (and (eq_attr "pipe_model" "sh4") | |
201 | (eq_attr "type" "mac_mem")) | |
202 | "d_lock,nothing,memory") | |
203 | ||
c39b724a | 204 | ;; Load Store instructions. |
205 | ;; Group: LS | |
206 | ;; Latency: 1 | |
207 | ;; Issue Rate: 1 | |
c39b724a | 208 | (define_insn_reservation "sh4_gp_fpul" 1 |
209 | (and (eq_attr "pipe_model" "sh4") | |
210 | (eq_attr "type" "gp_fpul")) | |
211 | "issue+load_store") | |
212 | ||
213 | ;; Load Store instructions. | |
214 | ;; Group: LS | |
215 | ;; Latency: 3 | |
216 | ;; Issue Rate: 1 | |
c39b724a | 217 | (define_insn_reservation "sh4_fpul_gp" 3 |
218 | (and (eq_attr "pipe_model" "sh4") | |
219 | (eq_attr "type" "fpul_gp")) | |
220 | "issue+load_store") | |
221 | ||
222 | ;; Branch (BF,BF/S,BT,BT/S,BRA) | |
223 | ;; Group: BR | |
224 | ;; Latency when taken: 2 (or 1) | |
225 | ;; Issue Rate: 1 | |
226 | ;; The latency is 1 when displacement is 0. | |
227 | ;; We can't really do much with the latency, even if we could express it, | |
228 | ;; but the pairing restrictions are useful to take into account. | |
229 | ;; ??? If the branch is likely, we might want to fill the delay slot; | |
230 | ;; if the branch is likely, but not very likely, should we pretend to use | |
231 | ;; a resource that CO instructions use, to get a pairable delay slot insn? | |
c39b724a | 232 | (define_insn_reservation "sh4_branch" 1 |
233 | (and (eq_attr "pipe_model" "sh4") | |
234 | (eq_attr "type" "cbranch,jump")) | |
235 | "issue+pcr_addrcalc") | |
236 | ||
237 | ;; Branch Far (JMP,RTS,BRAF) | |
238 | ;; Group: CO | |
239 | ;; Latency: 3 | |
240 | ;; Issue Rate: 2 | |
241 | ;; ??? Scheduling happens before branch shortening, and hence jmp and braf | |
242 | ;; can't be distinguished from bra for the "jump" pattern. | |
c39b724a | 243 | (define_insn_reservation "sh4_return" 3 |
244 | (and (eq_attr "pipe_model" "sh4") | |
245 | (eq_attr "type" "return,jump_ind")) | |
246 | "d_lock*2") | |
247 | ||
248 | ;; RTE | |
249 | ;; Group: CO | |
250 | ;; Latency: 5 | |
251 | ;; Issue Rate: 5 | |
252 | ;; this instruction can be executed in any of the pipelines | |
253 | ;; and blocks the pipeline for next 4 stages. | |
c39b724a | 254 | (define_insn_reservation "sh4_return_from_exp" 5 |
255 | (and (eq_attr "pipe_model" "sh4") | |
256 | (eq_attr "type" "rte")) | |
257 | "d_lock*5") | |
258 | ||
259 | ;; OCBP, OCBWB | |
260 | ;; Group: CO | |
261 | ;; Latency: 1-5 | |
262 | ;; Issue Rate: 1 | |
6c049e03 | 263 | ;; cwb is used for the sequence |
264 | ;; ocbwb @%0 | |
265 | ;; extu.w %0,%2 | |
266 | ;; or %1,%2 | |
267 | ;; mov.l %0,@%2 | |
c39b724a | 268 | ;; ocbwb on its own would be "d_lock,nothing,memory*5" |
269 | (define_insn_reservation "ocbwb" 6 | |
270 | (and (eq_attr "pipe_model" "sh4") | |
271 | (eq_attr "type" "cwb")) | |
272 | "d_lock*2,(d_lock+memory)*3,issue+load_store+memory,memory*2") | |
273 | ||
274 | ;; LDS to PR,JSR | |
275 | ;; Group: CO | |
276 | ;; Latency: 3 | |
277 | ;; Issue Rate: 2 | |
278 | ;; The SX stage is blocked for last 2 cycles. | |
279 | ;; OTOH, the only time that has an effect for insns generated by the compiler | |
280 | ;; is when lds to PR is followed by sts from PR - and that is highly unlikely - | |
281 | ;; or when we are doing a function call - and we don't do inter-function | |
282 | ;; scheduling. For the function call case, it's really best that we end with | |
283 | ;; something that models an rts. | |
c39b724a | 284 | (define_insn_reservation "sh4_lds_to_pr" 3 |
285 | (and (eq_attr "pipe_model" "sh4") | |
286 | (eq_attr "type" "prset") ) | |
287 | "d_lock*2") | |
288 | ||
289 | ;; calls introduce a longish delay that is likely to flush the pipelines | |
290 | ;; of the caller's instructions. Ordinary functions tend to end with a | |
291 | ;; load to restore a register (in the delay slot of rts), while sfuncs | |
292 | ;; tend to end with an EX or MT insn. But that is not actually relevant, | |
293 | ;; since there are no instructions that contend for memory access early. | |
294 | ;; We could, of course, provide exact scheduling information for specific | |
295 | ;; sfuncs, if that should prove useful. | |
c39b724a | 296 | (define_insn_reservation "sh4_call" 16 |
297 | (and (eq_attr "pipe_model" "sh4") | |
298 | (eq_attr "type" "call,sfunc")) | |
299 | "d_lock*16") | |
300 | ||
301 | ;; LDS.L to PR | |
302 | ;; Group: CO | |
303 | ;; Latency: 3 | |
304 | ;; Issue Rate: 2 | |
305 | ;; The SX unit is blocked for last 2 cycles. | |
c39b724a | 306 | (define_insn_reservation "ldsmem_to_pr" 3 |
307 | (and (eq_attr "pipe_model" "sh4") | |
308 | (eq_attr "type" "pload")) | |
309 | "d_lock*2") | |
310 | ||
311 | ;; STS from PR | |
312 | ;; Group: CO | |
313 | ;; Latency: 2 | |
314 | ;; Issue Rate: 2 | |
315 | ;; The SX unit in second and third cycles. | |
c39b724a | 316 | (define_insn_reservation "sts_from_pr" 2 |
317 | (and (eq_attr "pipe_model" "sh4") | |
318 | (eq_attr "type" "prget")) | |
319 | "d_lock*2") | |
320 | ||
321 | ;; STS.L from PR | |
322 | ;; Group: CO | |
323 | ;; Latency: 2 | |
324 | ;; Issue Rate: 2 | |
c39b724a | 325 | (define_insn_reservation "sh4_prstore_mem" 2 |
326 | (and (eq_attr "pipe_model" "sh4") | |
327 | (eq_attr "type" "pstore")) | |
328 | "d_lock*2,nothing,memory") | |
329 | ||
330 | ;; LDS to FPSCR | |
331 | ;; Group: CO | |
332 | ;; Latency: 4 | |
333 | ;; Issue Rate: 1 | |
334 | ;; F1 is blocked for last three cycles. | |
c39b724a | 335 | (define_insn_reservation "fpscr_load" 4 |
336 | (and (eq_attr "pipe_model" "sh4") | |
337 | (eq_attr "type" "gp_fpscr")) | |
338 | "d_lock,nothing,F1*3") | |
339 | ||
340 | ;; LDS.L to FPSCR | |
341 | ;; Group: CO | |
342 | ;; Latency: 1 / 4 | |
343 | ;; Latency to update Rn is 1 and latency to update FPSCR is 4 | |
344 | ;; Issue Rate: 1 | |
345 | ;; F1 is blocked for last three cycles. | |
c39b724a | 346 | (define_insn_reservation "fpscr_load_mem" 4 |
347 | (and (eq_attr "pipe_model" "sh4") | |
348 | (eq_attr "type" "mem_fpscr")) | |
349 | "d_lock,nothing,(F1+memory),F1*2") | |
350 | ||
351 | \f | |
352 | ;; Fixed point multiplication (DMULS.L DMULU.L MUL.L MULS.W,MULU.W) | |
353 | ;; Group: CO | |
354 | ;; Latency: 4 / 4 | |
5be30882 | 355 | ;; Issue Rate: 2 |
c39b724a | 356 | (define_insn_reservation "multi" 4 |
357 | (and (eq_attr "pipe_model" "sh4") | |
358 | (eq_attr "type" "smpy,dmpy")) | |
359 | "d_lock,(d_lock+f1_1),(f1_1|f1_2)*3,F2") | |
360 | ||
5be30882 | 361 | ;; Fixed STS from, and LDS to MACL / MACH |
c39b724a | 362 | ;; Group: CO |
363 | ;; Latency: 3 | |
364 | ;; Issue Rate: 1 | |
c39b724a | 365 | (define_insn_reservation "sh4_mac_gp" 3 |
366 | (and (eq_attr "pipe_model" "sh4") | |
5be30882 | 367 | (eq_attr "type" "mac_gp,gp_mac,mem_mac")) |
c39b724a | 368 | "d_lock") |
369 | ||
370 | ||
371 | ;; Single precision floating point computation FCMP/EQ, | |
5be30882 | 372 | ;; FCMP/GT, FADD, FLOAT, FMAC, FMUL, FSUB, FTRC, FRCHG, FSCHG |
c39b724a | 373 | ;; Group: FE |
374 | ;; Latency: 3/4 | |
375 | ;; Issue Rate: 1 | |
c39b724a | 376 | (define_insn_reservation "fp_arith" 3 |
377 | (and (eq_attr "pipe_model" "sh4") | |
5be30882 | 378 | (eq_attr "type" "fp,fp_cmp")) |
c39b724a | 379 | "issue,F01,F2") |
380 | ||
5be30882 | 381 | ;; We don't model the resource usage of this exactly because that would |
382 | ;; introduce a bogus latency. | |
383 | (define_insn_reservation "sh4_fpscr_toggle" 1 | |
384 | (and (eq_attr "pipe_model" "sh4") | |
385 | (eq_attr "type" "fpscr_toggle")) | |
386 | "issue") | |
387 | ||
c39b724a | 388 | (define_insn_reservation "fp_arith_ftrc" 3 |
389 | (and (eq_attr "pipe_model" "sh4") | |
390 | (eq_attr "type" "ftrc_s")) | |
391 | "issue,F01,F2") | |
392 | ||
393 | (define_bypass 1 "fp_arith_ftrc" "sh4_fpul_gp") | |
394 | ||
395 | ;; Single Precision FDIV/SQRT | |
396 | ;; Group: FE | |
397 | ;; Latency: 12/13 (FDIV); 11/12 (FSQRT) | |
398 | ;; Issue Rate: 1 | |
399 | ;; We describe fdiv here; fsqrt is actually one cycle faster. | |
c39b724a | 400 | (define_insn_reservation "fp_div" 12 |
401 | (and (eq_attr "pipe_model" "sh4") | |
402 | (eq_attr "type" "fdiv")) | |
403 | "issue,F01+F3,F2+F3,F3*7,F1+F3,F2") | |
404 | ||
405 | ;; Double Precision floating point computation | |
406 | ;; (FCNVDS, FCNVSD, FLOAT, FTRC) | |
407 | ;; Group: FE | |
408 | ;; Latency: (3,4)/5 | |
409 | ;; Issue Rate: 1 | |
c39b724a | 410 | (define_insn_reservation "dp_float" 4 |
411 | (and (eq_attr "pipe_model" "sh4") | |
412 | (eq_attr "type" "dfp_conv")) | |
413 | "issue,F01,F1+F2,F2") | |
414 | ||
415 | ;; Double-precision floating-point (FADD,FMUL,FSUB) | |
416 | ;; Group: FE | |
417 | ;; Latency: (7,8)/9 | |
418 | ;; Issue Rate: 1 | |
c39b724a | 419 | (define_insn_reservation "fp_double_arith" 8 |
420 | (and (eq_attr "pipe_model" "sh4") | |
5be30882 | 421 | (eq_attr "type" "dfp_arith,dfp_mul")) |
c39b724a | 422 | "issue,F01,F1+F2,fpu*4,F2") |
423 | ||
424 | ;; Double-precision FCMP (FCMP/EQ,FCMP/GT) | |
425 | ;; Group: CO | |
426 | ;; Latency: 3/5 | |
427 | ;; Issue Rate: 2 | |
c39b724a | 428 | (define_insn_reservation "fp_double_cmp" 3 |
429 | (and (eq_attr "pipe_model" "sh4") | |
430 | (eq_attr "type" "dfp_cmp")) | |
431 | "d_lock,(d_lock+F01),F1+F2,F2") | |
432 | ||
433 | ;; Double precision FDIV/SQRT | |
434 | ;; Group: FE | |
435 | ;; Latency: (24,25)/26 | |
436 | ;; Issue Rate: 1 | |
c39b724a | 437 | (define_insn_reservation "dp_div" 25 |
438 | (and (eq_attr "pipe_model" "sh4") | |
439 | (eq_attr "type" "dfdiv")) | |
440 | "issue,F01+F3,F1+F2+F3,F2+F3,F3*16,F1+F3,(fpu+F3)*2,F2") | |
441 | ||
442 | ||
443 | ;; Use the branch-not-taken case to model arith3 insns. For the branch taken | |
444 | ;; case, we'd get a d_lock instead of issue at the end. | |
445 | (define_insn_reservation "arith3" 3 | |
446 | (and (eq_attr "pipe_model" "sh4") | |
447 | (eq_attr "type" "arith3")) | |
448 | "issue,d_lock+pcr_addrcalc,issue") | |
449 | ||
450 | ;; arith3b insns schedule the same no matter if the branch is taken or not, so they get their own reservation keyed on type "arith3b" (type "arith3" is already claimed by the reservation above). | |
451 | (define_insn_reservation "arith3b" 2 | |
452 | (and (eq_attr "pipe_model" "sh4") | |
453 | (eq_attr "type" "arith3b")) | |
454 | "issue,d_lock+pcr_addrcalc") |