;; gcc/config/arm/cortex-a5.md
;; ARM Cortex-A5 pipeline description
;; Copyright (C) 2010-2013 Free Software Foundation, Inc.
;; Contributed by CodeSourcery.
;;
;; This file is part of GCC.
;;
;; GCC is free software; you can redistribute it and/or modify it
;; under the terms of the GNU General Public License as published by
;; the Free Software Foundation; either version 3, or (at your option)
;; any later version.
;;
;; GCC is distributed in the hope that it will be useful, but
;; WITHOUT ANY WARRANTY; without even the implied warranty of
;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;; General Public License for more details.
;;
;; You should have received a copy of the GNU General Public License
;; along with GCC; see the file COPYING3.  If not see
;; <http://www.gnu.org/licenses/>.
;; Scheduling automaton covering the whole Cortex-A5 core.
(define_automaton "cortex_a5")

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Functional units.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; The integer (ALU) pipeline.  There are five DPU pipeline
;; stages.  However the decode/issue stages operate the same for all
;; instructions, so do not model them.  We only need to model the
;; first execute stage because instructions always advance one stage
;; per cycle in order.  Only branch instructions may dual-issue, so a
;; single unit covers all of the LS, ALU, MAC and FPU pipelines.

(define_cpu_unit "cortex_a5_ex1" "cortex_a5")

;; The branch pipeline.  Branches can dual-issue with other instructions
;; (except when those instructions take multiple cycles to issue).

(define_cpu_unit "cortex_a5_branch" "cortex_a5")

;; Pseudo-unit for blocking the multiply pipeline when a double-precision
;; multiply is in progress.

(define_cpu_unit "cortex_a5_fpmul_pipe" "cortex_a5")

;; The floating-point add pipeline (ex1/f1 stage), used to model the usage
;; of the add pipeline by fmac instructions, etc.

(define_cpu_unit "cortex_a5_fpadd_pipe" "cortex_a5")

;; Floating-point div/sqrt (long latency, out-of-order completion).

(define_cpu_unit "cortex_a5_fp_div_sqrt" "cortex_a5")
54
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; ALU instructions.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Plain ALU operations (immediate/register forms and bare shifts):
;; occupy ex1 for one cycle, result available after two.
(define_insn_reservation "cortex_a5_alu" 2
  (and (eq_attr "tune" "cortexa5")
       (eq_attr "type" "arlo_imm,arlo_reg,shift,shift_reg"))
  "cortex_a5_ex1")

;; ALU operations with a shifted operand (including extends): same
;; occupancy and latency as the plain form.
(define_insn_reservation "cortex_a5_alu_shift" 2
  (and (eq_attr "tune" "cortexa5")
       (eq_attr "type" "extend,arlo_shift,arlo_shift_reg"))
  "cortex_a5_ex1")
68
;; Forwarding path for unshifted operands.

;; An ALU result feeding the unshifted operand of a following ALU op is
;; forwarded one cycle early (effective latency 1 instead of 2).
(define_bypass 1 "cortex_a5_alu,cortex_a5_alu_shift"
	       "cortex_a5_alu")

;; The same early forwarding applies to a following shifted-ALU op, but
;; only when the dependence is not on the shift operand (checked by the
;; arm_no_early_alu_shift_dep guard).
(define_bypass 1 "cortex_a5_alu,cortex_a5_alu_shift"
	       "cortex_a5_alu_shift"
	       "arm_no_early_alu_shift_dep")
77
;; The multiplier pipeline can forward results from wr stage only so
;; there's no need to specify bypasses).

;; Both 32-bit and 64-bit multiplies (and multiply-accumulates): one
;; issue cycle in ex1, latency 2.
(define_insn_reservation "cortex_a5_mul" 2
  (and (eq_attr "tune" "cortexa5")
       (ior (eq_attr "mul32" "yes")
	    (eq_attr "mul64" "yes")))
  "cortex_a5_ex1")
86
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Load/store instructions.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Address-generation happens in the issue stage, which is one stage behind
;; the ex1 stage (the first stage we care about for scheduling purposes).  The
;; dc1 stage is parallel with ex1, dc2 with ex2 and rot with wr.

;; Single-register (and byte) loads: one issue cycle, data ready after two.
(define_insn_reservation "cortex_a5_load1" 2
  (and (eq_attr "tune" "cortexa5")
       (eq_attr "type" "load_byte,load1"))
  "cortex_a5_ex1")
99
;; Single-register store: no destination register, so latency 0.
(define_insn_reservation "cortex_a5_store1" 0
  (and (eq_attr "tune" "cortexa5")
       (eq_attr "type" "store1"))
  "cortex_a5_ex1")
104
;; Two-register load: two issue cycles; the first also blocks the branch
;; unit since multi-cycle-issue insns cannot dual-issue with a branch.
(define_insn_reservation "cortex_a5_load2" 3
  (and (eq_attr "tune" "cortexa5")
       (eq_attr "type" "load2"))
  "cortex_a5_ex1+cortex_a5_branch, cortex_a5_ex1")
109
;; Two-register store: same two-cycle issue pattern as load2; latency 0
;; (no result register).
(define_insn_reservation "cortex_a5_store2" 0
  (and (eq_attr "tune" "cortexa5")
       (eq_attr "type" "store2"))
  "cortex_a5_ex1+cortex_a5_branch, cortex_a5_ex1")
114
;; Three-register load: three issue cycles, the first two blocking the
;; branch (dual-issue) slot.
(define_insn_reservation "cortex_a5_load3" 4
  (and (eq_attr "tune" "cortexa5")
       (eq_attr "type" "load3"))
  "cortex_a5_ex1+cortex_a5_branch, cortex_a5_ex1+cortex_a5_branch,\
   cortex_a5_ex1")
120
;; Three-register store: issue pattern as load3; latency 0.
(define_insn_reservation "cortex_a5_store3" 0
  (and (eq_attr "tune" "cortexa5")
       (eq_attr "type" "store3"))
  "cortex_a5_ex1+cortex_a5_branch, cortex_a5_ex1+cortex_a5_branch,\
   cortex_a5_ex1")
126
;; Four-register load: four issue cycles (first three block the branch
;; slot), latency 5.
;; Fix: this previously matched type "load3", duplicating the condition of
;; cortex_a5_load3 above -- the first matching reservation wins, so this
;; one was unreachable and four-register loads were modelled as three.
(define_insn_reservation "cortex_a5_load4" 5
  (and (eq_attr "tune" "cortexa5")
       (eq_attr "type" "load4"))
  "cortex_a5_ex1+cortex_a5_branch, cortex_a5_ex1+cortex_a5_branch,\
   cortex_a5_ex1+cortex_a5_branch, cortex_a5_ex1")
132
;; Four-register store: issue pattern as load4; latency 0.
;; Fix: this previously matched type "store3", duplicating the condition of
;; cortex_a5_store3 above and leaving this reservation unreachable; the
;; four-register form is "store4".
(define_insn_reservation "cortex_a5_store4" 0
  (and (eq_attr "tune" "cortexa5")
       (eq_attr "type" "store4"))
  "cortex_a5_ex1+cortex_a5_branch, cortex_a5_ex1+cortex_a5_branch,\
   cortex_a5_ex1+cortex_a5_branch, cortex_a5_ex1")
138
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Branches.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Direct branches are the only instructions we can dual-issue (also IT and
;; nop, but those aren't very interesting for scheduling).  (The latency here
;; is meant to represent when the branch actually takes place, but may not be
;; entirely correct.)

;; Branches and calls use only the branch unit, leaving ex1 free for the
;; dual-issued instruction.
(define_insn_reservation "cortex_a5_branch" 3
  (and (eq_attr "tune" "cortexa5")
       (eq_attr "type" "branch,call"))
  "cortex_a5_branch")
152
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Floating-point arithmetic.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; FP ALU operations: four-cycle latency, reserving ex1 plus the FP add
;; pipeline for one cycle.
;; Fix: "fmuls" is removed from this list.  Single-precision multiplies
;; are modelled by cortex_a5_fpmuls below, which reserves the FP multiply
;; pipeline (cortex_a5_fpmul_pipe); with fmuls listed here as well, this
;; earlier reservation matched first, making cortex_a5_fpmuls unreachable
;; and wrongly charging fmuls against the add pipeline.
(define_insn_reservation "cortex_a5_fpalu" 4
  (and (eq_attr "tune" "cortexa5")
       (eq_attr "type" "ffariths, fadds, ffarithd, faddd, fcpys, f_cvt,\
			fcmps, fcmpd"))
  "cortex_a5_ex1+cortex_a5_fpadd_pipe")
162
;; For fconsts and fconstd, 8-bit immediate data is passed directly from
;; f1 to f3 (which I think reduces the latency by one cycle).

;; FP constant loads: one cycle shorter latency than the general FP ALU case.
(define_insn_reservation "cortex_a5_fconst" 3
  (and (eq_attr "tune" "cortexa5")
       (eq_attr "type" "fconsts,fconstd"))
  "cortex_a5_ex1+cortex_a5_fpadd_pipe")
170
;; We should try not to attempt to issue a single-precision multiplication in
;; the middle of a double-precision multiplication operation (the usage of
;; cortex_a5_fpmul_pipe).

;; Single-precision FP multiply: four-cycle latency, reserving ex1 plus the
;; FP multiply pipeline for one cycle.
(define_insn_reservation "cortex_a5_fpmuls" 4
  (and (eq_attr "tune" "cortexa5")
       (eq_attr "type" "fmuls"))
  "cortex_a5_ex1+cortex_a5_fpmul_pipe")
179
;; For single-precision multiply-accumulate, the add (accumulate) is issued
;; whilst the multiply is in F4.  The multiply result can then be forwarded
;; from F5 to F1.  The issue unit is only used once (when we first start
;; processing the instruction), but the usage of the FP add pipeline could
;; block other instructions attempting to use it simultaneously.  We try to
;; avoid that using cortex_a5_fpadd_pipe.

;; Multiply in the mul pipe, then (after a three-cycle gap) the accumulate
;; occupies the add pipe for one cycle; total latency 8.
(define_insn_reservation "cortex_a5_fpmacs" 8
  (and (eq_attr "tune" "cortexa5")
       (eq_attr "type" "fmacs,ffmas"))
  "cortex_a5_ex1+cortex_a5_fpmul_pipe, nothing*3, cortex_a5_fpadd_pipe")
191
;; Non-multiply instructions can issue in the middle two instructions of a
;; double-precision multiply.  Note that it isn't entirely clear when a branch
;; can dual-issue when a multi-cycle multiplication is in progress; we ignore
;; that for now though.

;; Double-precision multiply: four cycles in the multiply pipe, with ex1
;; needed only on the first and last cycles (the middle two reserve just
;; the pseudo-unit, allowing other insns to issue); latency 7.
(define_insn_reservation "cortex_a5_fpmuld" 7
  (and (eq_attr "tune" "cortexa5")
       (eq_attr "type" "fmuld"))
  "cortex_a5_ex1+cortex_a5_fpmul_pipe, cortex_a5_fpmul_pipe*2,\
   cortex_a5_ex1+cortex_a5_fpmul_pipe")
202
;; Double-precision multiply-accumulate: the fpmuld reservation pattern
;; followed, after a three-cycle gap, by one cycle in the add pipe for the
;; accumulate (as in cortex_a5_fpmacs); latency 11.
(define_insn_reservation "cortex_a5_fpmacd" 11
  (and (eq_attr "tune" "cortexa5")
       (eq_attr "type" "fmacd,ffmad"))
  "cortex_a5_ex1+cortex_a5_fpmul_pipe, cortex_a5_fpmul_pipe*2,\
   cortex_a5_ex1+cortex_a5_fpmul_pipe, nothing*3, cortex_a5_fpadd_pipe")
208
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Floating-point divide/square root instructions.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; ??? Not sure if the 14 cycles taken for single-precision divide to complete
;; includes the time taken for the special instruction used to collect the
;; result to travel down the multiply pipeline, or not.  Assuming so.  (If
;; that's wrong, the latency should be increased by a few cycles.)

;; fsqrt takes one cycle less, but that is not modelled, nor is the use of the
;; multiply pipeline to collect the divide/square-root result.

;; Issue in ex1, then occupy the non-pipelined div/sqrt unit for 13 more
;; cycles; other instructions may issue meanwhile (out-of-order completion).
(define_insn_reservation "cortex_a5_fdivs" 14
  (and (eq_attr "tune" "cortexa5")
       (eq_attr "type" "fdivs"))
  "cortex_a5_ex1, cortex_a5_fp_div_sqrt * 13")
225
;; ??? Similarly for fdivd.

;; Double-precision divide: same shape as fdivs but with a 29-cycle latency.
(define_insn_reservation "cortex_a5_fdivd" 29
  (and (eq_attr "tune" "cortexa5")
       (eq_attr "type" "fdivd"))
  "cortex_a5_ex1, cortex_a5_fp_div_sqrt * 28")
232
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; VFP to/from core transfers.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; FP loads take data from wr/rot/f3.

;; Core-to-VFP transfers use the multiply pipeline.

;; Core register to VFP register transfer: latency 4.
(define_insn_reservation "cortex_a5_r2f" 4
  (and (eq_attr "tune" "cortexa5")
       (eq_attr "type" "r_2_f"))
  "cortex_a5_ex1")
245
;; VFP register to core register transfer: latency 2.
(define_insn_reservation "cortex_a5_f2r" 2
  (and (eq_attr "tune" "cortexa5")
       (eq_attr "type" "f_2_r"))
  "cortex_a5_ex1")
250
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; VFP flag transfer.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; ??? The flag forwarding from fmstat to the ex2 stage of the second
;; instruction is not modeled at present.

;; Transfer of VFP flags to the core (fmstat): latency 4.
(define_insn_reservation "cortex_a5_f_flags" 4
  (and (eq_attr "tune" "cortexa5")
       (eq_attr "type" "f_flag"))
  "cortex_a5_ex1")
262
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; VFP load/store.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Single-precision FP load: latency 4 (but see the load-to-use bypasses
;; at the end of the file).
(define_insn_reservation "cortex_a5_f_loads" 4
  (and (eq_attr "tune" "cortexa5")
       (eq_attr "type" "f_loads"))
  "cortex_a5_ex1")
271
;; Double-precision FP load: two issue cycles, the first blocking the branch
;; (dual-issue) slot; latency 5.
(define_insn_reservation "cortex_a5_f_loadd" 5
  (and (eq_attr "tune" "cortexa5")
       (eq_attr "type" "f_loadd"))
  "cortex_a5_ex1+cortex_a5_branch, cortex_a5_ex1")
276
;; Single-precision FP store: no result register, latency 0.
(define_insn_reservation "cortex_a5_f_stores" 0
  (and (eq_attr "tune" "cortexa5")
       (eq_attr "type" "f_stores"))
  "cortex_a5_ex1")
281
;; Double-precision FP store: two issue cycles (first blocks the branch
;; slot), latency 0.
(define_insn_reservation "cortex_a5_f_stored" 0
  (and (eq_attr "tune" "cortexa5")
       (eq_attr "type" "f_stored"))
  "cortex_a5_ex1+cortex_a5_branch, cortex_a5_ex1")
286
;; Load-to-use for floating-point values has a penalty of one cycle,
;; i.e. a latency of two.

;; Single-precision loaded value consumed by FP arithmetic or a
;; VFP-to-core transfer: effective latency 2 rather than 4.
(define_bypass 2 "cortex_a5_f_loads"
	       "cortex_a5_fpalu, cortex_a5_fpmacs, cortex_a5_fpmuld,\
		cortex_a5_fpmacd, cortex_a5_fdivs, cortex_a5_fdivd,\
		cortex_a5_f2r")

;; Double-precision loaded value: effective latency 3 rather than 5 for
;; the same set of consumers.
(define_bypass 3 "cortex_a5_f_loadd"
	       "cortex_a5_fpalu, cortex_a5_fpmacs, cortex_a5_fpmuld,\
		cortex_a5_fpmacd, cortex_a5_fdivs, cortex_a5_fdivd,\
		cortex_a5_f2r")