;; ARM Cortex-A5 pipeline description
;; Copyright (C) 2010-2013 Free Software Foundation, Inc.
;; Contributed by CodeSourcery.
;;
;; This file is part of GCC.
;;
;; GCC is free software; you can redistribute it and/or modify it
;; under the terms of the GNU General Public License as published by
;; the Free Software Foundation; either version 3, or (at your option)
;; any later version.
;;
;; GCC is distributed in the hope that it will be useful, but
;; WITHOUT ANY WARRANTY; without even the implied warranty of
;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;; General Public License for more details.
;;
;; You should have received a copy of the GNU General Public License
;; along with GCC; see the file COPYING3.  If not see
;; <http://www.gnu.org/licenses/>.

(define_automaton "cortex_a5")

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Functional units.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; The integer (ALU) pipeline.  There are five DPU pipeline
;; stages.  However the decode/issue stages operate the same for all
;; instructions, so do not model them.  We only need to model the
;; first execute stage because instructions always advance one stage
;; per cycle in order.  Only branch instructions may dual-issue, so a
;; single unit covers all of the LS, ALU, MAC and FPU pipelines.

(define_cpu_unit "cortex_a5_ex1" "cortex_a5")

;; The branch pipeline.  Branches can dual-issue with other instructions
;; (except when those instructions take multiple cycles to issue).

(define_cpu_unit "cortex_a5_branch" "cortex_a5")

;; Pseudo-unit for blocking the multiply pipeline when a double-precision
;; multiply is in progress.

(define_cpu_unit "cortex_a5_fpmul_pipe" "cortex_a5")

;; The floating-point add pipeline (ex1/f1 stage), used to model the usage
;; of the add pipeline by fmac instructions, etc.

(define_cpu_unit "cortex_a5_fpadd_pipe" "cortex_a5")

;; Floating-point div/sqrt (long latency, out-of-order completion).

(define_cpu_unit "cortex_a5_fp_div_sqrt" "cortex_a5")
54 | ||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; ALU instructions.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Simple data-processing instructions occupy ex1 for one cycle and have a
;; result latency of two.
(define_insn_reservation "cortex_a5_alu" 2
  (and (eq_attr "tune" "cortexa5")
       (eq_attr "type" "alu_imm,alus_imm,logic_imm,logics_imm,\
                        alu_reg,alus_reg,logic_reg,logics_reg,\
                        adc_imm,adcs_imm,adc_reg,adcs_reg,\
                        adr,bfm,rev,\
                        shift_imm,shift_reg,\
                        mov_imm,mov_reg,mvn_imm,mvn_reg,\
                        multiple,no_insn"))
  "cortex_a5_ex1")

;; Shift-plus-ALU operations: same unit usage and latency as plain ALU ops.
(define_insn_reservation "cortex_a5_alu_shift" 2
  (and (eq_attr "tune" "cortexa5")
       (eq_attr "type" "extend,\
                        alu_shift_imm,alus_shift_imm,\
                        logic_shift_imm,logics_shift_imm,\
                        alu_shift_reg,alus_shift_reg,\
                        logic_shift_reg,logics_shift_reg,\
                        mov_shift,mov_shift_reg,\
                        mvn_shift,mvn_shift_reg"))
  "cortex_a5_ex1")

;; Forwarding path for unshifted operands: the consumer sees the result one
;; cycle earlier than the nominal latency.

(define_bypass 1 "cortex_a5_alu,cortex_a5_alu_shift"
               "cortex_a5_alu")

(define_bypass 1 "cortex_a5_alu,cortex_a5_alu_shift"
               "cortex_a5_alu_shift"
               "arm_no_early_alu_shift_dep")
89 | ||
;; The multiplier pipeline can forward results from the wr stage only,
;; so there is no need to specify bypasses.

;; 32-bit and 64-bit multiplies (and multiply-accumulates).
(define_insn_reservation "cortex_a5_mul" 2
  (and (eq_attr "tune" "cortexa5")
       (ior (eq_attr "mul32" "yes")
            (eq_attr "mul64" "yes")))
  "cortex_a5_ex1")
98 | ||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Load/store instructions.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Address-generation happens in the issue stage, which is one stage behind
;; the ex1 stage (the first stage we care about for scheduling purposes).  The
;; dc1 stage is parallel with ex1, dc2 with ex2 and rot with wr.

;; Single-word (or byte) load: result latency two.
(define_insn_reservation "cortex_a5_load1" 2
  (and (eq_attr "tune" "cortexa5")
       (eq_attr "type" "load_byte,load1"))
  "cortex_a5_ex1")

;; Stores produce no register result, hence latency zero.
(define_insn_reservation "cortex_a5_store1" 0
  (and (eq_attr "tune" "cortexa5")
       (eq_attr "type" "store1"))
  "cortex_a5_ex1")

;; Multi-word transfers take one extra cycle per additional word; the first
;; cycle also reserves cortex_a5_branch, preventing dual issue of a branch
;; alongside the multi-cycle issue.
(define_insn_reservation "cortex_a5_load2" 3
  (and (eq_attr "tune" "cortexa5")
       (eq_attr "type" "load2"))
  "cortex_a5_ex1+cortex_a5_branch, cortex_a5_ex1")

(define_insn_reservation "cortex_a5_store2" 0
  (and (eq_attr "tune" "cortexa5")
       (eq_attr "type" "store2"))
  "cortex_a5_ex1+cortex_a5_branch, cortex_a5_ex1")

(define_insn_reservation "cortex_a5_load3" 4
  (and (eq_attr "tune" "cortexa5")
       (eq_attr "type" "load3"))
  "cortex_a5_ex1+cortex_a5_branch, cortex_a5_ex1+cortex_a5_branch,\
   cortex_a5_ex1")

(define_insn_reservation "cortex_a5_store3" 0
  (and (eq_attr "tune" "cortexa5")
       (eq_attr "type" "store3"))
  "cortex_a5_ex1+cortex_a5_branch, cortex_a5_ex1+cortex_a5_branch,\
   cortex_a5_ex1")
138 | ||
;; Four-word transfers: four issue cycles, blocking branch dual-issue for the
;; first three.
;; NOTE(review): these two reservations previously matched types
;; "load3"/"store3", duplicating cortex_a5_load3/cortex_a5_store3 above and
;; leaving the "load4"/"store4" insn types with no reservation at all.
;; They must match "load4"/"store4".
(define_insn_reservation "cortex_a5_load4" 5
  (and (eq_attr "tune" "cortexa5")
       (eq_attr "type" "load4"))
  "cortex_a5_ex1+cortex_a5_branch, cortex_a5_ex1+cortex_a5_branch,\
   cortex_a5_ex1+cortex_a5_branch, cortex_a5_ex1")

(define_insn_reservation "cortex_a5_store4" 0
  (and (eq_attr "tune" "cortexa5")
       (eq_attr "type" "store4"))
  "cortex_a5_ex1+cortex_a5_branch, cortex_a5_ex1+cortex_a5_branch,\
   cortex_a5_ex1+cortex_a5_branch, cortex_a5_ex1")
150 | ||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Branches.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Direct branches are the only instructions we can dual-issue (also IT and
;; nop, but those aren't very interesting for scheduling).  (The latency here
;; is meant to represent when the branch actually takes place, but may not be
;; entirely correct.)

(define_insn_reservation "cortex_a5_branch" 3
  (and (eq_attr "tune" "cortexa5")
       (eq_attr "type" "branch,call"))
  "cortex_a5_branch")
164 | ||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Floating-point arithmetic.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Simple VFP arithmetic uses ex1 plus the FP add pipeline.
;; NOTE(review): "fmuls" also appears in cortex_a5_fpmuls below — confirm
;; which reservation is intended to win for single-precision multiplies.
(define_insn_reservation "cortex_a5_fpalu" 4
  (and (eq_attr "tune" "cortexa5")
       (eq_attr "type" "ffariths, fadds, ffarithd, faddd, fcpys, fmuls, f_cvt,\
                        fcmps, fcmpd"))
  "cortex_a5_ex1+cortex_a5_fpadd_pipe")

;; For fconsts and fconstd, 8-bit immediate data is passed directly from
;; f1 to f3 (which I think reduces the latency by one cycle).

(define_insn_reservation "cortex_a5_fconst" 3
  (and (eq_attr "tune" "cortexa5")
       (eq_attr "type" "fconsts,fconstd"))
  "cortex_a5_ex1+cortex_a5_fpadd_pipe")
182 | ||
;; We should try not to attempt to issue a single-precision multiplication in
;; the middle of a double-precision multiplication operation (the usage of
;; cortex_a5_fpmul_pipe).

(define_insn_reservation "cortex_a5_fpmuls" 4
  (and (eq_attr "tune" "cortexa5")
       (eq_attr "type" "fmuls"))
  "cortex_a5_ex1+cortex_a5_fpmul_pipe")

;; For single-precision multiply-accumulate, the add (accumulate) is issued
;; whilst the multiply is in F4.  The multiply result can then be forwarded
;; from F5 to F1.  The issue unit is only used once (when we first start
;; processing the instruction), but the usage of the FP add pipeline could
;; block other instructions attempting to use it simultaneously.  We try to
;; avoid that using cortex_a5_fpadd_pipe.

(define_insn_reservation "cortex_a5_fpmacs" 8
  (and (eq_attr "tune" "cortexa5")
       (eq_attr "type" "fmacs,ffmas"))
  "cortex_a5_ex1+cortex_a5_fpmul_pipe, nothing*3, cortex_a5_fpadd_pipe")

;; Non-multiply instructions can issue in the middle two instructions of a
;; double-precision multiply.  Note that it isn't entirely clear when a branch
;; can dual-issue when a multi-cycle multiplication is in progress; we ignore
;; that for now though.

(define_insn_reservation "cortex_a5_fpmuld" 7
  (and (eq_attr "tune" "cortexa5")
       (eq_attr "type" "fmuld"))
  "cortex_a5_ex1+cortex_a5_fpmul_pipe, cortex_a5_fpmul_pipe*2,\
   cortex_a5_ex1+cortex_a5_fpmul_pipe")

(define_insn_reservation "cortex_a5_fpmacd" 11
  (and (eq_attr "tune" "cortexa5")
       (eq_attr "type" "fmacd,ffmad"))
  "cortex_a5_ex1+cortex_a5_fpmul_pipe, cortex_a5_fpmul_pipe*2,\
   cortex_a5_ex1+cortex_a5_fpmul_pipe, nothing*3, cortex_a5_fpadd_pipe")
220 | ||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Floating-point divide/square root instructions.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; ??? Not sure if the 14 cycles taken for single-precision divide to complete
;; includes the time taken for the special instruction used to collect the
;; result to travel down the multiply pipeline, or not.  Assuming so.  (If
;; that's wrong, the latency should be increased by a few cycles.)

;; fsqrt takes one cycle less, but that is not modelled, nor is the use of the
;; multiply pipeline to collect the divide/square-root result.

(define_insn_reservation "cortex_a5_fdivs" 14
  (and (eq_attr "tune" "cortexa5")
       (eq_attr "type" "fdivs"))
  "cortex_a5_ex1, cortex_a5_fp_div_sqrt * 13")

;; ??? Similarly for fdivd.

(define_insn_reservation "cortex_a5_fdivd" 29
  (and (eq_attr "tune" "cortexa5")
       (eq_attr "type" "fdivd"))
  "cortex_a5_ex1, cortex_a5_fp_div_sqrt * 28")
244 | ||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; VFP to/from core transfers.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; FP loads take data from wr/rot/f3.

;; Core-to-VFP transfers use the multiply pipeline.

(define_insn_reservation "cortex_a5_r2f" 4
  (and (eq_attr "tune" "cortexa5")
       (eq_attr "type" "f_mcr,f_mcrr"))
  "cortex_a5_ex1")

(define_insn_reservation "cortex_a5_f2r" 2
  (and (eq_attr "tune" "cortexa5")
       (eq_attr "type" "f_mrc,f_mrrc"))
  "cortex_a5_ex1")
262 | ||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; VFP flag transfer.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; ??? The flag forwarding from fmstat to the ex2 stage of the second
;; instruction is not modeled at present.

(define_insn_reservation "cortex_a5_f_flags" 4
  (and (eq_attr "tune" "cortexa5")
       (eq_attr "type" "f_flag"))
  "cortex_a5_ex1")
274 | ||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; VFP load/store.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(define_insn_reservation "cortex_a5_f_loads" 4
  (and (eq_attr "tune" "cortexa5")
       (eq_attr "type" "f_loads"))
  "cortex_a5_ex1")

;; Double-word FP loads/stores reserve the dual-issue slot for an extra
;; cycle as well as ex1.
(define_insn_reservation "cortex_a5_f_loadd" 5
  (and (eq_attr "tune" "cortexa5")
       (eq_attr "type" "f_loadd"))
  "cortex_a5_ex1+cortex_a5_branch, cortex_a5_ex1")

(define_insn_reservation "cortex_a5_f_stores" 0
  (and (eq_attr "tune" "cortexa5")
       (eq_attr "type" "f_stores"))
  "cortex_a5_ex1")

(define_insn_reservation "cortex_a5_f_stored" 0
  (and (eq_attr "tune" "cortexa5")
       (eq_attr "type" "f_stored"))
  "cortex_a5_ex1+cortex_a5_branch, cortex_a5_ex1")
298 | ||
;; Load-to-use for floating-point values has a penalty of one cycle,
;; i.e. a latency of two.

(define_bypass 2 "cortex_a5_f_loads"
               "cortex_a5_fpalu, cortex_a5_fpmacs, cortex_a5_fpmuld,\
                cortex_a5_fpmacd, cortex_a5_fdivs, cortex_a5_fdivd,\
                cortex_a5_f2r")

(define_bypass 3 "cortex_a5_f_loadd"
               "cortex_a5_fpalu, cortex_a5_fpmacs, cortex_a5_fpmuld,\
                cortex_a5_fpmacd, cortex_a5_fdivs, cortex_a5_fdivd,\
                cortex_a5_f2r")