1 (* Emission of the core of the Cortex-A8 NEON scheduling description.
2 Copyright (C) 2007-2013 Free Software Foundation, Inc.
3 Contributed by CodeSourcery.
4 This file is part of GCC.
6 GCC is free software; you can redistribute it and/or modify it under
7 the terms of the GNU General Public License as published by the Free
8 Software Foundation; either version 3, or (at your option) any later
11 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
12 WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 You should have received a copy of the GNU General Public License
17 along with GCC; see the file COPYING3. If not see
18 <http://www.gnu.org/licenses/>.
21 (* This scheduling description generator works as follows.
22 - Each group of instructions has source and destination requirements
23 specified and a list of cores supported. This is then filtered
24 and per core scheduler descriptions are generated out.
25 The reservations generated are prefixed by the name of the
26 core and the check is performed on the basis of what the tuning
27 string is. Running this will generate Neon scheduler descriptions
28 for all cores supported.
30 The source requirements may be specified using
31 Source (the stage at which all source operands not otherwise
32 described are read), Source_m (the stage at which Rm operands are
33 read), Source_n (likewise for Rn) and Source_d (likewise for Rd).
34 - For each group of instructions the earliest stage where a source
35 operand may be required is calculated.
36 - Each group of instructions is selected in turn as a producer.
37 The latencies between this group and every other group are then
38 calculated, yielding up to four values for each combination:
39 1. Producer -> consumer Rn latency
40 2. Producer -> consumer Rm latency
41 3. Producer -> consumer Rd (as a source) latency
42 4. Producer -> consumer worst-case latency.
43 Value 4 is calculated from the destination availability requirements
44 of the consumer and the earliest source availability requirements
46 - The largest Value 4 calculated for the current producer is the
47 worse-case latency, L, for that instruction group. This value is written
48 out in a define_insn_reservation for the producer group.
49 - For each producer and consumer pair, the latencies calculated above
50 are collated. The average (of up to four values) is calculated and
51 if this average is different from the worst-case latency, an
52 unguarded define_bypass construction is issued for that pair.
53 (For each pair only one define_bypass construction will be emitted,
54 and at present we do not emit specific guards.)
57 let find_with_result fn lst =
58 let rec scan = function
66 let n1 = 1 and n2 = 2 and n3 = 3 and n4 = 4 and n5 = 5 and n6 = 6
67 and n7 = 7 and n8 = 8 and n9 = 9
69 type availability = Source of int
74 | Dest_n_after of int * int
76 type guard = Guard_none | Guard_only_m | Guard_only_n | Guard_only_d
78 (* Reservation behaviors. All but the last row here correspond to one
79 pipeline each. Each constructor will correspond to one
80 define_reservation. *)
82 Mul | Mul_2cycle | Mul_4cycle
83 | Shift | Shift_2cycle
90 | Fmul_then_fadd | Fmul_then_fadd_2
92 type core = CortexA8 | CortexA9
93 let allCores = [CortexA8; CortexA9]
94 let coreStr = function
95 CortexA8 -> "cortex_a8"
96 | CortexA9 -> "cortex_a9"
98 let tuneStr = function
99 CortexA8 -> "cortexa8"
100 | CortexA9 -> "cortexa9"
103 (* This table must be kept as short as possible by conflating
104 entries with the same availability behavior.
106 First components: instruction group names
107 Second components: availability requirements, in the order in which
108 they should appear in the comments in the .md file.
109 Third components: reservation info
110 Fourth components: List of supported cores.
112 let availability_table = [
113 (* NEON integer ALU instructions. *)
114 (* vbit vbif vbsl vorr vbic vnot vcls vclz vcnt vadd vand vorr
115 veor vbic vorn ddd qqq *)
116 "neon_int_1", [Source n2; Dest n3], ALU, allCores;
117 (* vadd vsub qqd vsub ddd qqq *)
118 "neon_int_2", [Source_m n1; Source_n n2; Dest n3], ALU, allCores;
119 (* vsum vneg dd qq vadd vsub qdd *)
120 "neon_int_3", [Source n1; Dest n3], ALU, allCores;
121 (* vabs vceqz vcgez vcbtz vclez vcltz vadh vradh vsbh vrsbh dqq *)
122 (* vhadd vrhadd vqadd vtst ddd qqq *)
123 "neon_int_4", [Source n2; Dest n4], ALU, allCores;
124 (* vabd qdd vhsub vqsub vabd vceq vcge vcgt vmax vmin vfmx vfmn ddd ddd *)
125 "neon_int_5", [Source_m n1; Source_n n2; Dest n4], ALU, allCores;
126 (* vqneg vqabs dd qq *)
127 "neon_vqneg_vqabs", [Source n1; Dest n4], ALU, allCores;
129 "neon_vmov", [Dest n3], ALU, allCores;
131 "neon_vaba", [Source_n n2; Source_m n1; Source_d n3; Dest n6], ALU, allCores;
133 [Source_n n2; Source_m n1; Source_d n3; Dest_n_after (1, n6)],
134 ALU_2cycle, allCores;
136 "neon_vsma", [Source_m n1; Source_d n3; Dest n6], ALU, allCores;
138 (* NEON integer multiply instructions. *)
139 (* vmul, vqdmlh, vqrdmlh *)
140 (* vmul, vqdmul, qdd 16/8 long 32/16 long *)
141 "neon_mul_ddd_8_16_qdd_16_8_long_32_16_long", [Source n2; Dest n6],
143 "neon_mul_qqq_8_16_32_ddd_32", [Source n2; Dest_n_after (1, n6)],
144 Mul_2cycle, allCores;
145 (* vmul, vqdmul again *)
146 "neon_mul_qdd_64_32_long_qqd_16_ddd_32_scalar_64_32_long_scalar",
147 [Source_n n2; Source_m n1; Dest_n_after (1, n6)], Mul_2cycle, allCores;
149 "neon_mla_ddd_8_16_qdd_16_8_long_32_16_long",
150 [Source_n n2; Source_m n2; Source_d n3; Dest n6], Mul, allCores;
152 [Source_n n2; Source_m n2; Source_d n3; Dest_n_after (1, n6)],
153 Mul_2cycle, allCores;
154 "neon_mla_ddd_32_qqd_16_ddd_32_scalar_qdd_64_32_long_scalar_qdd_64_32_long",
155 [Source_n n2; Source_m n1; Source_d n3; Dest_n_after (1, n6)],
156 Mul_2cycle, allCores;
157 "neon_mla_qqq_32_qqd_32_scalar",
158 [Source_n n2; Source_m n1; Source_d n3; Dest_n_after (3, n6)],
159 Mul_4cycle, allCores;
160 (* vmul, vqdmulh, vqrdmulh *)
162 "neon_mul_ddd_16_scalar_32_16_long_scalar",
163 [Source_n n2; Source_m n1; Dest n6], Mul, allCores;
164 "neon_mul_qqd_32_scalar",
165 [Source_n n2; Source_m n1; Dest_n_after (3, n6)], Mul_4cycle, allCores;
167 (* vmla, vmla, vqdmla, vqdmls *)
168 "neon_mla_ddd_16_scalar_qdd_32_16_long_scalar",
169 [Source_n n2; Source_m n1; Source_d n3; Dest n6], Mul, allCores;
171 (* NEON integer shift instructions. *)
172 (* vshr/vshl immediate, vshr_narrow, vshl_vmvh, vsli_vsri_ddd *)
173 "neon_shift_1", [Source n1; Dest n3], Shift, allCores;
174 (* vqshl, vrshr immediate; vqshr, vqmov, vrshr, vqrshr narrow, allCores;
175 vqshl_vrshl_vqrshl_ddd *)
176 "neon_shift_2", [Source n1; Dest n4], Shift, allCores;
177 (* vsli, vsri and vshl for qqq *)
178 "neon_shift_3", [Source n1; Dest_n_after (1, n3)], Shift_2cycle, allCores;
179 "neon_vshl_ddd", [Source n1; Dest n1], Shift, allCores;
180 "neon_vqshl_vrshl_vqrshl_qqq", [Source n1; Dest_n_after (1, n4)],
181 Shift_2cycle, allCores;
182 "neon_vsra_vrsra", [Source_m n1; Source_d n3; Dest n6], Shift, allCores;
184 (* NEON floating-point instructions. *)
185 (* vadd, vsub, vabd, vmul, vceq, vcge, vcgt, vcage, vcagt, vmax, vmin *)
186 (* vabs, vneg, vceqz, vcgez, vcgtz, vclez, vcltz, vrecpe, vrsqrte, vcvt *)
187 "neon_fp_vadd_ddd_vabs_dd", [Source n2; Dest n5], Fadd, allCores;
188 "neon_fp_vadd_qqq_vabs_qq", [Source n2; Dest_n_after (1, n5)],
189 Fadd_2cycle, allCores;
190 (* vsum, fvmx, vfmn *)
191 "neon_fp_vsum", [Source n1; Dest n5], Fadd, allCores;
192 "neon_fp_vmul_ddd", [Source_n n2; Source_m n1; Dest n5], Fmul, allCores;
193 "neon_fp_vmul_qqd", [Source_n n2; Source_m n1; Dest_n_after (1, n5)],
194 Fmul_2cycle, allCores;
197 [Source_n n2; Source_m n2; Source_d n3; Dest n9], Fmul_then_fadd, allCores;
199 [Source_n n2; Source_m n2; Source_d n3; Dest_n_after (1, n9)],
200 Fmul_then_fadd_2, allCores;
201 "neon_fp_vmla_ddd_scalar",
202 [Source_n n2; Source_m n1; Source_d n3; Dest n9], Fmul_then_fadd, allCores;
203 "neon_fp_vmla_qqq_scalar",
204 [Source_n n2; Source_m n1; Source_d n3; Dest_n_after (1, n9)],
205 Fmul_then_fadd_2, allCores;
206 "neon_fp_vrecps_vrsqrts_ddd", [Source n2; Dest n9], Fmul_then_fadd, allCores;
207 "neon_fp_vrecps_vrsqrts_qqq", [Source n2; Dest_n_after (1, n9)],
208 Fmul_then_fadd_2, allCores;
210 (* NEON byte permute instructions. *)
211 (* vmov; vtrn and vswp for dd; vzip for dd; vuzp for dd; vrev; vext for dd *)
212 "neon_bp_simple", [Source n1; Dest n2], Permute 1, allCores;
213 (* vswp for qq; vext for qqq; vtbl with {Dn} or {Dn, Dn1}, allCores;
214 similarly for vtbx *)
215 "neon_bp_2cycle", [Source n1; Dest_n_after (1, n2)], Permute 2, allCores;
217 "neon_bp_3cycle", [Source n1; Dest_n_after (2, n2)], Permute 3, allCores;
219 (* NEON load/store instructions. *)
220 "neon_ldr", [Dest n1], Ls 1, allCores;
221 "neon_str", [Source n1], Ls 1, allCores;
222 "neon_vld1_1_2_regs", [Dest_n_after (1, n1)], Ls 2, allCores;
223 "neon_vld1_3_4_regs", [Dest_n_after (2, n1)], Ls 3, allCores;
224 "neon_vld2_2_regs_vld1_vld2_all_lanes", [Dest_n_after (1, n2)], Ls 2, allCores;
225 "neon_vld2_4_regs", [Dest_n_after (2, n2)], Ls 3, allCores;
226 "neon_vld3_vld4", [Dest_n_after (3, n2)], Ls 4, allCores;
227 "neon_vst1_1_2_regs_vst2_2_regs", [Source n1], Ls 2, allCores;
228 "neon_vst1_3_4_regs", [Source n1], Ls 3, allCores;
229 "neon_vst2_4_regs_vst3_vst4", [Source n1], Ls 4, allCores;
230 "neon_vst3_vst4", [Source n1], Ls 4, allCores;
231 "neon_vld1_vld2_lane", [Source n1; Dest_n_after (2, n2)], Ls 3, allCores;
232 "neon_vld3_vld4_lane", [Source n1; Dest_n_after (4, n2)], Ls 5, allCores;
233 "neon_vst1_vst2_lane", [Source n1], Ls 2, allCores;
234 "neon_vst3_vst4_lane", [Source n1], Ls 3, allCores;
235 "neon_vld3_vld4_all_lanes", [Dest_n_after (1, n2)], Ls 3, allCores;
237 (* NEON register transfer instructions. *)
238 "neon_mcr", [Dest n2], Permute 1, allCores;
239 "neon_mcr_2_mcrr", [Dest n2], Permute 2, allCores;
240 (* MRC instructions are in the .tpl file. *)
243 (* Augment the tuples in the availability table with an extra component
244 that describes the earliest stage where a source operand may be
245 required. (It is also possible that an entry in the table has no
246 source requirements.) *)
247 let calculate_sources =
248 List.map (fun (name, avail, res, cores) ->
251 (fun cur -> fun info ->
259 | Some stage' when stage < stage' -> Some stage
261 | _ -> cur) None avail
263 (name, avail, res, earliest_stage))
265 (* Find the stage, if any, at the end of which a group produces a result. *)
266 let find_dest (attr, avail, _, _) =
269 (fun av -> match av with
270 Dest st -> Some (Some st)
271 | Dest_n_after (after, st) -> Some (Some (after + st))
273 with Not_found -> None
275 (* Find the worst-case latency between a producer and a consumer. *)
276 let worst_case_latency producer (_, _, _, earliest_required) =
277 let dest = find_dest producer in
278 match earliest_required, dest with
280 (* The consumer doesn't have any source requirements. *)
283 (* The producer doesn't produce any results (e.g. a store insn). *)
285 | Some consumed, Some produced -> Some (produced - consumed + 1)
287 (* Helper function for below. *)
288 let latency_calc f producer (_, avail, _, _) =
290 let source_avail = find_with_result f avail in
291 match find_dest producer with
293 (* The producer does not produce a result. *)
296 let latency = produced - source_avail + 1 in
297 (* Latencies below zero are raised to zero since we don't have
299 if latency < 0 then Some 0 else Some latency
300 with Not_found -> None
302 (* Find any Rm latency between a producer and a consumer. If no
303 Rm source requirement is explicitly specified for the consumer,
304 return "positive infinity". Also return "positive infinity" if
305 the latency matches the supplied worst-case latency for this
307 let get_m_latency producer consumer =
308 match latency_calc (fun av -> match av with Source_m stage -> Some stage
309 | _ -> None) producer consumer
310 with None -> [] | Some latency -> [(Guard_only_m, latency)]
312 (* Likewise for Rn. *)
313 let get_n_latency producer consumer =
314 match latency_calc (fun av -> match av with Source_n stage -> Some stage
315 | _ -> None) producer consumer
316 with None -> [] | Some latency -> [(Guard_only_n, latency)]
318 (* Likewise for Rd. *)
319 let get_d_latency producer consumer =
321 latency_calc (fun av -> match av with Source_d stage -> Some stage
322 | _ -> None) producer consumer
323 with None -> [] | Some latency -> [(Guard_only_d, latency)]
325 (* Given a producer and a consumer, work out the latency of the producer
326 to the consumer in each of the four cases (availability information
327 permitting) identified at the top of this file. Return the
328 consumer, the worst-case unguarded latency and any guarded latencies. *)
329 let calculate_latencies producer consumer =
330 let worst = worst_case_latency producer consumer in
331 let m_latency = get_m_latency producer consumer in
332 let n_latency = get_n_latency producer consumer in
333 let d_latency = get_d_latency producer consumer in
334 (consumer, worst, m_latency @ n_latency @ d_latency)
336 (* Helper function for below. *)
337 let pick_latency largest worst guards =
341 | Some worst -> (Guard_none, worst) :: guards
343 if List.length guards = 0 then None else
345 List.fold_left (fun acc -> fun (_, latency) -> acc + latency) 0 guards
347 let average_latency = (float_of_int total_latency) /.
348 (float_of_int (List.length guards)) in
349 let rounded_latency = int_of_float (ceil average_latency) in
350 if rounded_latency = largest then None
351 else Some (Guard_none, rounded_latency)
353 (* Collate all bypasses for a particular producer as required in
354 worst_case_latencies_and_bypasses. (By this stage there is a maximum
355 of one bypass from this producer to any particular consumer listed
356 in LATENCIES.) Use a hash table to collate bypasses with the
357 same latency and guard. *)
358 let collate_bypasses (producer_name, _, _, _) largest latencies core =
359 let ht = Hashtbl.create 42 in
362 fun ((consumer, _, _, _), worst, guards) ->
363 (* Find out which latency to use. Ignoring latencies that match
364 the *overall* worst-case latency for this producer (which will
365 be in define_insn_reservation), we have to examine:
366 1. the latency with no guard between this producer and this
368 2. any guarded latency. *)
369 let guard_latency_opt = pick_latency largest worst guards in
370 match guard_latency_opt with
372 | Some (guard, latency) ->
374 (if (try ignore (Hashtbl.find ht (guard, latency)); false
375 with Not_found -> true) then
376 keys := (guard, latency) :: !keys);
377 Hashtbl.add ht (guard, latency) ((coreStr core) ^ "_" ^ consumer)
380 (* The hash table now has bypasses collated so that ones with the
381 same latency and guard have the same keys. Walk through all the
382 keys, extract the associated bypasses, and concatenate the names
383 of the consumers for each bypass. *)
385 fun ((guard, latency) as key) ->
386 let consumers = Hashtbl.find_all ht key in
388 String.concat ",\\\n " consumers,
393 (* For every producer, find the worst-case latency between it and
394 *any* consumer. Also determine (if such a thing exists) the
395 lowest-latency bypass from each producer to each consumer. Group
396 the output in such a way that all bypasses with the same producer
397 and latency are together, and so that bypasses with the worst-case
398 latency are ignored. *)
399 let worst_case_latencies_and_bypasses core =
400 let rec f (worst_acc, bypasses_acc) prev xs =
402 [] -> (worst_acc, bypasses_acc)
403 | ((producer_name, producer_avail, res_string, _) as producer)::next ->
404 (* For this particular producer, work out the latencies between
405 it and every consumer. *)
407 List.fold_left (fun acc -> fun consumer ->
408 (calculate_latencies producer consumer) :: acc)
411 (* Now work out what the overall worst case latency was for this
412 particular producer. *)
416 let comp_fn (_, l1, _) (_, l2, _) =
417 if l1 > l2 then -1 else if l1 = l2 then 0 else 1
420 match List.hd (List.sort comp_fn latencies) with
421 (_, None, _) -> 0 (* Producer has no consumers. *)
422 | (_, Some worst, _) -> worst
424 (* Having got the largest latency, collect all bypasses for
425 this producer and filter out those with that larger
426 latency. Record the others for later emission. *)
427 let bypasses = collate_bypasses producer largest latencies core in
428 (* Go on to process remaining producers, having noted
429 the result for this one. *)
430 f ((producer_name, producer_avail, largest,
431 res_string) :: worst_acc,
432 bypasses @ bypasses_acc)
433 (prev @ [producer]) next
437 (* Emit a helpful comment for a define_insn_reservation. *)
438 let write_comment producer avail =
439 let seen_source = ref false in
441 let read = if !seen_source then "" else "read " in
445 Printf.printf "%stheir source operands at N%d" read stage
448 Printf.printf "%stheir (D|Q)n operands at N%d" read stage
451 Printf.printf "%stheir (D|Q)m operands at N%d" read stage
453 Printf.printf "%stheir (D|Q)d operands at N%d" read stage
455 Printf.printf "produce a result at N%d" stage
456 | Dest_n_after (after, stage) ->
457 Printf.printf "produce a result at N%d on cycle %d" stage (after + 1)
459 Printf.printf ";; Instructions using this reservation ";
461 let sep = if x mod 2 = 1 then "" else "\n;;" in
464 | [info] -> describe info; Printf.printf ".\n"
465 | info::(_::[] as infos) ->
466 describe info; Printf.printf ", and%s " sep; f infos (x+1)
467 | info::infos -> describe info; Printf.printf ",%s " sep; f infos (x+1)
472 (* Emit a define_insn_reservation for each producer. The latency
473 written in will be its worst-case latency. *)
474 let emit_insn_reservations core =
475 let corestring = coreStr core in
476 let tunestring = tuneStr core
478 fun (producer, avail, latency, reservation) ->
479 write_comment producer avail;
480 Printf.printf "(define_insn_reservation \"%s_%s\" %d\n"
481 corestring producer latency;
482 Printf.printf " (and (eq_attr \"tune\" \"%s\")\n" tunestring;
483 Printf.printf " (eq_attr \"type\" \"%s\"))\n" producer;
485 match reservation with
486 Mul -> "dp" | Mul_2cycle -> "dp_2" | Mul_4cycle -> "dp_4"
487 | Shift -> "dp" | Shift_2cycle -> "dp_2"
488 | ALU -> "dp" | ALU_2cycle -> "dp_2"
489 | Fmul -> "dp" | Fmul_2cycle -> "dp_2"
490 | Fadd -> "fadd" | Fadd_2cycle -> "fadd_2"
492 | Ls n -> "ls_" ^ (string_of_int n)
493 | Permute 1 -> "perm"
494 | Permute n -> "perm_" ^ (string_of_int n)
495 | Fmul_then_fadd -> "fmul_then_fadd"
496 | Fmul_then_fadd_2 -> "fmul_then_fadd_2"
498 Printf.printf " \"%s_neon_%s\")\n\n" corestring str
501 (* Given a guard description, return the name of the C function to
502 be used as the guard for define_bypass. *)
505 Guard_only_m -> "arm_neon_only_m_dependency"
506 | Guard_only_n -> "arm_neon_only_n_dependency"
507 | Guard_only_d -> "arm_neon_only_d_dependency"
508 | Guard_none -> assert false
510 (* Emit a define_bypass for each bypass. *)
511 let emit_bypasses core =
513 fun (producer, consumers, latency, guard) ->
514 Printf.printf "(define_bypass %d \"%s_%s\"\n"
515 latency (coreStr core) producer;
517 if guard = Guard_none then
518 Printf.printf " \"%s\")\n\n" consumers
521 Printf.printf " \"%s\"\n" consumers;
522 Printf.printf " \"%s\")\n\n" (guard_fn guard)
527 let calculate_per_core_availability_table core availability_table =
528 let table = calculate_sources availability_table in
529 let worst_cases, bypasses = worst_case_latencies_and_bypasses core table in
530 emit_insn_reservations core (List.rev worst_cases);
531 Printf.printf ";; Exceptions to the default latencies.\n\n";
532 emit_bypasses core bypasses
534 let calculate_core_availability_table core availability_table =
535 let filter_core = List.filter (fun (_, _, _, cores)
536 -> List.exists ((=) core) cores)
537 in calculate_per_core_availability_table core (filter_core availability_table)
540 (* Program entry point. *)
542 List.map (fun core -> calculate_core_availability_table
543 core availability_table) allCores