]>
Commit | Line | Data |
---|---|---|
d8099dd8 | 1 | ;; ARM Cortex-A5 pipeline description |
d1e082c2 | 2 | ;; Copyright (C) 2010-2013 Free Software Foundation, Inc. |
d8099dd8 JB |
3 | ;; Contributed by CodeSourcery. |
4 | ;; | |
5 | ;; This file is part of GCC. | |
6 | ;; | |
7 | ;; GCC is free software; you can redistribute it and/or modify it | |
8 | ;; under the terms of the GNU General Public License as published by | |
9 | ;; the Free Software Foundation; either version 3, or (at your option) | |
10 | ;; any later version. | |
11 | ;; | |
12 | ;; GCC is distributed in the hope that it will be useful, but | |
13 | ;; WITHOUT ANY WARRANTY; without even the implied warranty of | |
14 | ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
15 | ;; General Public License for more details. | |
16 | ;; | |
17 | ;; You should have received a copy of the GNU General Public License | |
18 | ;; along with GCC; see the file COPYING3. If not see | |
19 | ;; <http://www.gnu.org/licenses/>. | |
20 | ||
;; Automaton to which every Cortex-A5 functional unit below is attached.
(define_automaton "cortex_a5")
22 | ||
23 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
24 | ;; Functional units. | |
25 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
26 | ||
;; Integer (ALU) pipeline.  The DPU has five pipeline stages, but the
;; decode and issue stages behave identically for every instruction and
;; are therefore not modelled.  Because instructions advance in order,
;; one stage per cycle, modelling the first execute stage is sufficient.
;; Only branches may dual-issue, so this single unit stands in for the
;; LS, ALU, MAC and FPU pipelines alike.

(define_cpu_unit "cortex_a5_ex1" "cortex_a5")
35 | ||
;; Branch pipeline.  A branch may dual-issue alongside another
;; instruction, unless that instruction needs several cycles to issue.

(define_cpu_unit "cortex_a5_branch" "cortex_a5")
40 | ||
;; Pseudo-unit that keeps the multiply pipeline blocked while a
;; double-precision multiply is in flight.

(define_cpu_unit "cortex_a5_fpmul_pipe" "cortex_a5")
45 | ||
;; Floating-point add pipeline (ex1/f1 stage).  It exists so that the
;; add-pipeline usage of fmac instructions and similar can be modelled.

(define_cpu_unit "cortex_a5_fpadd_pipe" "cortex_a5")
50 | ||
;; Floating-point divide/square-root unit: long latency, and results
;; complete out of order.

(define_cpu_unit "cortex_a5_fp_div_sqrt" "cortex_a5")
54 | ||
55 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
56 | ;; ALU instructions. | |
57 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
58 | ||
;; Insns typed arlo_imm, arlo_reg, shift or shift_reg: default latency 2,
;; holding cortex_a5_ex1 for one cycle.
(define_insn_reservation "cortex_a5_alu" 2
  (and (eq_attr "tune" "cortexa5")
       (eq_attr "type" "arlo_imm,arlo_reg,shift,shift_reg"))
  "cortex_a5_ex1")
63 | ||
;; Insns typed extend, arlo_shift or arlo_shift_reg.  Latency and unit
;; usage match cortex_a5_alu; the separate name lets bypasses treat these
;; consumers differently.
(define_insn_reservation "cortex_a5_alu_shift" 2
  (and (eq_attr "tune" "cortexa5")
       (eq_attr "type" "extend,arlo_shift,arlo_shift_reg"))
  "cortex_a5_ex1")
68 | ||
69 | ;; Forwarding path for unshifted operands. | |
70 | ||
;; An ALU or ALU-shift result reaches a following plain ALU insn after one
;; cycle instead of the default two.
(define_bypass 1
  "cortex_a5_alu,cortex_a5_alu_shift"
  "cortex_a5_alu")
73 | ||
;; The same one-cycle forwarding into an ALU-shift consumer, applied only
;; when the guard arm_no_early_alu_shift_dep accepts the insn pair.
;; NOTE(review): the guard is defined outside this file; presumably it
;; rejects pairs where the value feeds the shifted operand -- confirm.
(define_bypass 1
  "cortex_a5_alu,cortex_a5_alu_shift"
  "cortex_a5_alu_shift"
  "arm_no_early_alu_shift_dep")
77 | ||
;; The multiplier pipeline can forward results from the wr stage only, so
;; there is no need to specify bypasses.

;; Any insn whose mul32 or mul64 attribute is "yes": default latency 2,
;; one cycle on cortex_a5_ex1.
(define_insn_reservation "cortex_a5_mul" 2
  (and (eq_attr "tune" "cortexa5")
       (ior (eq_attr "mul32" "yes")
            (eq_attr "mul64" "yes")))
  "cortex_a5_ex1")
86 | ||
87 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
88 | ;; Load/store instructions. | |
89 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
90 | ||
91 | ;; Address-generation happens in the issue stage, which is one stage behind | |
92 | ;; the ex1 stage (the first stage we care about for scheduling purposes). The | |
93 | ;; dc1 stage is parallel with ex1, dc2 with ex2 and rot with wr. | |
94 | ||
;; Byte and single-register loads: default latency 2, one cycle on
;; cortex_a5_ex1.
(define_insn_reservation "cortex_a5_load1" 2
  (and (eq_attr "tune" "cortexa5")
       (eq_attr "type" "load_byte,load1"))
  "cortex_a5_ex1")
99 | ||
;; Single-register store: declared with latency 0, one cycle on
;; cortex_a5_ex1.
(define_insn_reservation "cortex_a5_store1" 0
  (and (eq_attr "tune" "cortexa5")
       (eq_attr "type" "store1"))
  "cortex_a5_ex1")
104 | ||
;; Two-register load: default latency 3.  cortex_a5_ex1 is held for two
;; cycles; the branch unit is also claimed in the first cycle, so no
;; branch can be paired with it then.
(define_insn_reservation "cortex_a5_load2" 3
  (and (eq_attr "tune" "cortexa5")
       (eq_attr "type" "load2"))
  "cortex_a5_ex1+cortex_a5_branch, cortex_a5_ex1")
109 | ||
;; Two-register store: latency 0, with the same two-cycle unit usage as
;; cortex_a5_load2 (branch unit claimed in the first cycle).
(define_insn_reservation "cortex_a5_store2" 0
  (and (eq_attr "tune" "cortexa5")
       (eq_attr "type" "store2"))
  "cortex_a5_ex1+cortex_a5_branch, cortex_a5_ex1")
114 | ||
;; Three-register load: default latency 4.  cortex_a5_ex1 is held for
;; three cycles and the branch unit is claimed in the first two.
(define_insn_reservation "cortex_a5_load3" 4
  (and (eq_attr "tune" "cortexa5")
       (eq_attr "type" "load3"))
  "cortex_a5_ex1+cortex_a5_branch, cortex_a5_ex1+cortex_a5_branch,\
   cortex_a5_ex1")
120 | ||
;; Three-register store: latency 0, with the same three-cycle unit usage
;; as cortex_a5_load3.
(define_insn_reservation "cortex_a5_store3" 0
  (and (eq_attr "tune" "cortexa5")
       (eq_attr "type" "store3"))
  "cortex_a5_ex1+cortex_a5_branch, cortex_a5_ex1+cortex_a5_branch,\
   cortex_a5_ex1")
126 | ||
;; Four-register load: default latency 5.  cortex_a5_ex1 is held for four
;; cycles and the branch unit is claimed in the first three.
;; The type test must be "load4".  It used to read "load3", repeating the
;; condition of cortex_a5_load3, so insns typed load4 matched no
;; reservation in this description.
(define_insn_reservation "cortex_a5_load4" 5
  (and (eq_attr "tune" "cortexa5")
       (eq_attr "type" "load4"))
  "cortex_a5_ex1+cortex_a5_branch, cortex_a5_ex1+cortex_a5_branch,\
   cortex_a5_ex1+cortex_a5_branch, cortex_a5_ex1")
132 | ||
;; Four-register store: latency 0.  cortex_a5_ex1 is held for four cycles
;; and the branch unit is claimed in the first three.
;; The type test must be "store4".  It used to read "store3", repeating
;; the condition of cortex_a5_store3, so insns typed store4 matched no
;; reservation in this description.
(define_insn_reservation "cortex_a5_store4" 0
  (and (eq_attr "tune" "cortexa5")
       (eq_attr "type" "store4"))
  "cortex_a5_ex1+cortex_a5_branch, cortex_a5_ex1+cortex_a5_branch,\
   cortex_a5_ex1+cortex_a5_branch, cortex_a5_ex1")
138 | ||
139 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
140 | ;; Branches. | |
141 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
142 | ||
143 | ;; Direct branches are the only instructions we can dual-issue (also IT and | |
144 | ;; nop, but those aren't very interesting for scheduling). (The latency here | |
145 | ;; is meant to represent when the branch actually takes place, but may not be | |
146 | ;; entirely correct.) | |
147 | ||
;; Branches and calls: default latency 3.  Only the branch unit is
;; reserved, never cortex_a5_ex1, which is how dual-issue with another
;; instruction is expressed.
(define_insn_reservation "cortex_a5_branch" 3
  (and (eq_attr "tune" "cortexa5")
       (eq_attr "type" "branch,call"))
  "cortex_a5_branch")
152 | ||
153 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
154 | ;; Floating-point arithmetic. | |
155 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
156 | ||
;; Floating-point add-pipeline operations (arithmetic, copies, conversions
;; and compares): default latency 4, one cycle on cortex_a5_ex1 together
;; with the FP add pipe.
;; "fmuls" is deliberately absent from the type list.  Single-precision
;; multiplies are described by cortex_a5_fpmuls, which reserves the
;; multiply pipe.  Listing fmuls here as well gave it two competing
;; descriptions and put it on the add pipe.
(define_insn_reservation "cortex_a5_fpalu" 4
  (and (eq_attr "tune" "cortexa5")
       (eq_attr "type" "ffariths, fadds, ffarithd, faddd, fcpys, f_cvt,\
                        fcmps, fcmpd"))
  "cortex_a5_ex1+cortex_a5_fpadd_pipe")
162 | ||
163 | ;; For fconsts and fconstd, 8-bit immediate data is passed directly from | |
164 | ;; f1 to f3 (which I think reduces the latency by one cycle). | |
165 | ||
;; FP constant loads (fconsts, fconstd): default latency 3, one cycle on
;; cortex_a5_ex1 together with the FP add pipe.
(define_insn_reservation "cortex_a5_fconst" 3
  (and (eq_attr "tune" "cortexa5")
       (eq_attr "type" "fconsts,fconstd"))
  "cortex_a5_ex1+cortex_a5_fpadd_pipe")
170 | ||
171 | ;; We should try not to attempt to issue a single-precision multiplication in | |
172 | ;; the middle of a double-precision multiplication operation (the usage of | |
173 | ;; cortex_a5_fpmul_pipe). | |
174 | ||
;; Single-precision multiply: default latency 4, one cycle on
;; cortex_a5_ex1 together with the multiply-pipe pseudo-unit.
(define_insn_reservation "cortex_a5_fpmuls" 4
  (and (eq_attr "tune" "cortexa5")
       (eq_attr "type" "fmuls"))
  "cortex_a5_ex1+cortex_a5_fpmul_pipe")
179 | ||
180 | ;; For single-precision multiply-accumulate, the add (accumulate) is issued | |
181 | ;; whilst the multiply is in F4. The multiply result can then be forwarded | |
182 | ;; from F5 to F1. The issue unit is only used once (when we first start | |
183 | ;; processing the instruction), but the usage of the FP add pipeline could | |
184 | ;; block other instructions attempting to use it simultaneously. We try to | |
185 | ;; avoid that using cortex_a5_fpadd_pipe. | |
186 | ||
;; Single-precision multiply-accumulate and fused multiply-add: default
;; latency 8.  Cycle 1 takes cortex_a5_ex1 and the multiply pipe, three
;; cycles reserve nothing, and the FP add pipe is claimed in cycle 5.
(define_insn_reservation "cortex_a5_fpmacs" 8
  (and (eq_attr "tune" "cortexa5")
       (eq_attr "type" "fmacs,ffmas"))
  "cortex_a5_ex1+cortex_a5_fpmul_pipe, nothing*3, cortex_a5_fpadd_pipe")
191 | ||
;; Double-precision multiply: default latency 7.  The multiply pipe is
;; busy for four cycles, but cortex_a5_ex1 is held only in the first and
;; the last of them, so non-multiply instructions can issue during the
;; middle two cycles.  Note that it isn't entirely clear when a branch can
;; dual-issue while a multi-cycle multiplication is in progress; that is
;; ignored for now.

(define_insn_reservation "cortex_a5_fpmuld" 7
  (and (eq_attr "tune" "cortexa5")
       (eq_attr "type" "fmuld"))
  "cortex_a5_ex1+cortex_a5_fpmul_pipe, cortex_a5_fpmul_pipe*2,\
   cortex_a5_ex1+cortex_a5_fpmul_pipe")
202 | ||
;; Double-precision multiply-accumulate and fused multiply-add: default
;; latency 11.  The first four cycles use the units exactly as
;; cortex_a5_fpmuld does; three cycles then reserve nothing before the FP
;; add pipe is claimed for the accumulate.
(define_insn_reservation "cortex_a5_fpmacd" 11
  (and (eq_attr "tune" "cortexa5")
       (eq_attr "type" "fmacd,ffmad"))
  "cortex_a5_ex1+cortex_a5_fpmul_pipe, cortex_a5_fpmul_pipe*2,\
   cortex_a5_ex1+cortex_a5_fpmul_pipe, nothing*3, cortex_a5_fpadd_pipe")
208 | ||
209 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
210 | ;; Floating-point divide/square root instructions. | |
211 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
212 | ||
213 | ;; ??? Not sure if the 14 cycles taken for single-precision divide to complete | |
214 | ;; includes the time taken for the special instruction used to collect the | |
215 | ;; result to travel down the multiply pipeline, or not. Assuming so. (If | |
216 | ;; that's wrong, the latency should be increased by a few cycles.) | |
217 | ||
218 | ;; fsqrt takes one cycle less, but that is not modelled, nor is the use of the | |
219 | ;; multiply pipeline to collect the divide/square-root result. | |
220 | ||
;; Insns typed fdivs: default latency 14.  One cycle on cortex_a5_ex1,
;; then the divide/sqrt unit stays busy for the following 13 cycles.
(define_insn_reservation "cortex_a5_fdivs" 14
  (and (eq_attr "tune" "cortexa5")
       (eq_attr "type" "fdivs"))
  "cortex_a5_ex1, cortex_a5_fp_div_sqrt * 13")
225 | ||
226 | ;; ??? Similarly for fdivd. | |
227 | ||
;; Insns typed fdivd: default latency 29.  One cycle on cortex_a5_ex1,
;; then the divide/sqrt unit stays busy for the following 28 cycles.
(define_insn_reservation "cortex_a5_fdivd" 29
  (and (eq_attr "tune" "cortexa5")
       (eq_attr "type" "fdivd"))
  "cortex_a5_ex1, cortex_a5_fp_div_sqrt * 28")
232 | ||
233 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
234 | ;; VFP to/from core transfers. | |
235 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
236 | ||
237 | ;; FP loads take data from wr/rot/f3. | |
238 | ||
239 | ;; Core-to-VFP transfers use the multiply pipeline. | |
240 | ||
;; Core-to-VFP register transfer (type r_2_f): default latency 4, one
;; cycle on cortex_a5_ex1.
(define_insn_reservation "cortex_a5_r2f" 4
  (and (eq_attr "tune" "cortexa5")
       (eq_attr "type" "r_2_f"))
  "cortex_a5_ex1")
245 | ||
;; Transfer typed f_2_r (presumably VFP-to-core, the opposite direction
;; to r_2_f): default latency 2, one cycle on cortex_a5_ex1.
(define_insn_reservation "cortex_a5_f2r" 2
  (and (eq_attr "tune" "cortexa5")
       (eq_attr "type" "f_2_r"))
  "cortex_a5_ex1")
250 | ||
251 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
252 | ;; VFP flag transfer. | |
253 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
254 | ||
255 | ;; ??? The flag forwarding from fmstat to the ex2 stage of the second | |
256 | ;; instruction is not modeled at present. | |
257 | ||
;; VFP flag transfer (type f_flag): default latency 4, one cycle on
;; cortex_a5_ex1.
(define_insn_reservation "cortex_a5_f_flags" 4
  (and (eq_attr "tune" "cortexa5")
       (eq_attr "type" "f_flag"))
  "cortex_a5_ex1")
262 | ||
263 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
264 | ;; VFP load/store. | |
265 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
266 | ||
;; Insns typed f_loads: default latency 4, one cycle on cortex_a5_ex1.
(define_insn_reservation "cortex_a5_f_loads" 4
  (and (eq_attr "tune" "cortexa5")
       (eq_attr "type" "f_loads"))
  "cortex_a5_ex1")
271 | ||
;; Insns typed f_loadd: default latency 5.  cortex_a5_ex1 is held for two
;; cycles, and the branch unit is also claimed in the first.
(define_insn_reservation "cortex_a5_f_loadd" 5
  (and (eq_attr "tune" "cortexa5")
       (eq_attr "type" "f_loadd"))
  "cortex_a5_ex1+cortex_a5_branch, cortex_a5_ex1")
276 | ||
;; Insns typed f_stores: latency 0, one cycle on cortex_a5_ex1.
(define_insn_reservation "cortex_a5_f_stores" 0
  (and (eq_attr "tune" "cortexa5")
       (eq_attr "type" "f_stores"))
  "cortex_a5_ex1")
281 | ||
;; Insns typed f_stored: latency 0.  cortex_a5_ex1 is held for two
;; cycles, and the branch unit is also claimed in the first.
(define_insn_reservation "cortex_a5_f_stored" 0
  (and (eq_attr "tune" "cortexa5")
       (eq_attr "type" "f_stored"))
  "cortex_a5_ex1+cortex_a5_branch, cortex_a5_ex1")
286 | ||
287 | ;; Load-to-use for floating-point values has a penalty of one cycle, | |
288 | ;; i.e. a latency of two. | |
289 | ||
;; A value loaded by cortex_a5_f_loads is available to the listed FP
;; consumers after 2 cycles rather than the default 4.
;; cortex_a5_fpmuls is listed with the other FP arithmetic reservations so
;; that single-precision multiplies get the same load-to-use latency as
;; the double-precision ones.
(define_bypass 2 "cortex_a5_f_loads"
                 "cortex_a5_fpalu, cortex_a5_fpmuls, cortex_a5_fpmacs,\
                  cortex_a5_fpmuld, cortex_a5_fpmacd, cortex_a5_fdivs,\
                  cortex_a5_fdivd, cortex_a5_f2r")
294 | ||
;; A value loaded by cortex_a5_f_loadd is available to the listed FP
;; consumers after 3 cycles rather than the default 5.
;; cortex_a5_fpmuls is listed with the other FP arithmetic reservations so
;; that single-precision multiplies get the same load-to-use latency as
;; the double-precision ones.
(define_bypass 3 "cortex_a5_f_loadd"
                 "cortex_a5_fpalu, cortex_a5_fpmuls, cortex_a5_fpmacs,\
                  cortex_a5_fpmuld, cortex_a5_fpmacd, cortex_a5_fdivs,\
                  cortex_a5_fdivd, cortex_a5_f2r")