gcc/config/arm/fa726te.md

   1 ;; Faraday FA726TE Pipeline Description
   2 ;; Copyright (C) 2010-2021 Free Software Foundation, Inc.
   3 ;; Written by I-Jui Sung, based on ARM926EJ-S Pipeline Description.
   4 ;;
   5 ;; This file is part of GCC.
   6 ;;
   7 ;; GCC is free software; you can redistribute it and/or modify it under
   8 ;; the terms of the GNU General Public License as published by the Free
   9 ;; Software Foundation; either version 3, or (at your option) any later
  10 ;; version.
  11 ;;
  12 ;; GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  13 ;; WARRANTY; without even the implied warranty of MERCHANTABILITY or
  14 ;; FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  15 ;; for more details.
  16 ;;
  17 ;; You should have received a copy of the GNU General Public License
  18 ;; along with GCC; see the file COPYING3.  If not see
  19 ;; <http://www.gnu.org/licenses/>.  */
  20
  21 ;; These descriptions are based on the information contained in the
  22 ;; FA726TE Core Design Note, Copyright (c) 2010 Faraday Technology Corp.
  23
   24 ;; This automaton provides a pipeline description for the Faraday
   25 ;; FA726TE core.
   26 ;;
   27 ;; The model given here assumes that the condition for all conditional
   28 ;; instructions is "true", i.e., that all of the instructions are
   29 ;; actually executed.
      ;;
      ;; A single automaton named "fa726te" is declared; the CPU unit
      ;; declarations that follow attach to it by naming "fa726te" as their
      ;; second operand.
   30
   31 (define_automaton "fa726te")
  32
  33 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  34 ;; Pipelines
  35 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  36
  37 ;;   The ALU pipeline has fetch, decode, execute, memory, and
  38 ;;   write stages.  We only need to model the execute, memory and write
  39 ;;   stages.
  40
  41 ;;      E1      E2      E3      E4      E5      WB
  42 ;;______________________________________________________
  43 ;;
  44 ;;      <-------------- LD/ST ----------->
  45 ;;    shifter + LU      <-- AU -->
  46 ;;      <-- AU -->     shifter + LU    CPSR     (Pipe 0)
  47 ;;______________________________________________________
  48 ;;
  49 ;;      <---------- MUL --------->
  50 ;;    shifter + LU      <-- AU -->
  51 ;;      <-- AU -->     shifter + LU    CPSR     (Pipe 1)
  52
  53
   54 (define_cpu_unit "fa726te_alu0_pipe,fa726te_alu1_pipe" "fa726te")
      ;; ^ The two ALU pipes.  The move/ALU reservations in this file accept
      ;;   either one (fa726te_alu0_pipe|fa726te_alu1_pipe).
   55 (define_cpu_unit "fa726te_mac_pipe" "fa726te")
      ;; ^ Multiplier unit; in this file it is reserved only by 726te_mult_op.
   56 (define_cpu_unit "fa726te_lsu_pipe_e,fa726te_lsu_pipe_w" "fa726te")
      ;; ^ Load/store unit.  In this file only 726te_load1_op reserves these
      ;;   two units; stores and load/store-multiple reserve fa726te_blockage
      ;;   instead.
   57
   58 ;; Pretend we have 2 LSUs (the second is ONLY for LDR), which can possibly
   59 ;; improve code quality.
      ;; The second LSU is used only as the second alternative of
      ;; 726te_load1_op.  It is declared with define_query_cpu_unit rather than
      ;; define_cpu_unit.
      ;; NOTE(review): nothing in this file shows whether the reservation state
      ;; of these units is actually queried -- confirm against the ARM backend.
   60 (define_query_cpu_unit "fa726te_lsu1_pipe_e,fa726te_lsu1_pipe_w" "fa726te")
      ;; The two issue slots.
   61 (define_cpu_unit "fa726te_is0,fa726te_is1" "fa726te")
   62
      ;; An ordinary instruction takes either one of the two issue slots.
   63 (define_reservation "fa726te_issue" "(fa726te_is0|fa726te_is1)")
   64 ;; Reservation to restrict issue to 1: it takes both issue slots in the
      ;; same cycle, so no other instruction can claim fa726te_issue alongside.
   65 (define_reservation "fa726te_blockage" "(fa726te_is0+fa726te_is1)")
  66
  67 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  68 ;; ALU Instructions
  69 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  70
  71 ;; ALU instructions require three cycles to execute, and use the ALU
  72 ;; pipeline in each of the three stages.  The results are available
  73 ;; after the execute stage has finished.
  74 ;;
  75 ;; If the destination register is the PC, the pipelines are stalled
  76 ;; for several cycles.  That case is not modeled here.
  77
   78 ;; Move instructions.
      ;; Despite the "shift_op" name, the type list below contains only the
      ;; mov_* and mvn_* types (immediate, register, shifted and
      ;; shift-by-register forms).  Default latency is 1.  The reservation
      ;; takes one issue slot together with either ALU pipe for one cycle.
   79 (define_insn_reservation "726te_shift_op" 1
   80   (and (eq_attr "tune" "fa726te")
   81        (eq_attr "type" "mov_imm,mov_reg,mov_shift,mov_shift_reg,\
   82                         mvn_imm,mvn_reg,mvn_shift,mvn_shift_reg"))
   83   "fa726te_issue+(fa726te_alu0_pipe|fa726te_alu1_pipe)")
  84
   85 ;; ALU operations with no shifted operand will finish in 1 cycle.
   86 ;; Other ALU instructions take 2 cycles.
      ;; NOTE(review): the shifted-operand reservations in this file declare a
      ;; default latency of 3, not 2 -- confirm which figure is intended.
      ;;
      ;; Besides the arithmetic/logic types, this reservation also matches
      ;; adr, bfm, rev, shift_imm, shift_reg, mrs and multiple.  Default latency
      ;; is 1.  The reservation takes one issue slot together with either ALU
      ;; pipe for one cycle.
   87 (define_insn_reservation "726te_alu_op" 1
   88  (and (eq_attr "tune" "fa726te")
   89       (eq_attr "type" "alu_imm,alus_imm,logic_imm,logics_imm,\
   90                        alu_sreg,alus_sreg,logic_reg,logics_reg,\
   91                        adc_imm,adcs_imm,adc_reg,adcs_reg,\
   92                        adr,bfm,rev,\
   93                        shift_imm,shift_reg,\
   94                        mrs,multiple"))
   95   "fa726te_issue+(fa726te_alu0_pipe|fa726te_alu1_pipe)")
  96
   97 ;; ALU operations with a shifted operand.
   98 ;; Shift-by-register forms really stall in the decoder, in order to read
   99 ;; the shift value in the first cycle.  If the instruction uses both
  100 ;; shifter and AU, it takes 3 cycles.
      ;;
      ;; 726te_alu_shift_op matches the shift-by-immediate types plus "extend".
      ;; Default latency is 3.  The reservation takes one issue slot together
      ;; with either ALU pipe for one cycle.
  101 (define_insn_reservation "726te_alu_shift_op" 3
  102  (and (eq_attr "tune" "fa726te")
  103       (eq_attr "type" "extend,alu_shift_imm_lsl_1to4,alu_shift_imm_other,alus_shift_imm,\
  104                        logic_shift_imm,logics_shift_imm"))
  105   "fa726te_issue+(fa726te_alu0_pipe|fa726te_alu1_pipe)")
  106
      ;; 726te_alu_shift_reg_op matches the shift-by-register types.  Its
      ;; default latency (3) and reservation are identical to
      ;; 726te_alu_shift_op; the two differ only in the bypasses of this file,
      ;; which attach a different guard to each when it is the consumer.
  107 (define_insn_reservation "726te_alu_shift_reg_op" 3
  108  (and (eq_attr "tune" "fa726te")
  109       (eq_attr "type" "alu_shift_reg,alus_shift_reg,\
  110                        logic_shift_reg,logics_shift_reg"))
  111   "fa726te_issue+(fa726te_alu0_pipe|fa726te_alu1_pipe)")
 112 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 113 ;; Multiplication Instructions
 114 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 115
  116 ;; Multiplication instructions loop in the execute stage until the
  117 ;; instruction has been passed through the multiplier array enough
  118 ;; times.  Multiply operations occur in both the execute and memory
  119 ;; stages of the pipeline.
      ;;
      ;; All multiply and multiply-accumulate types share this one reservation.
      ;; Default latency is 3.  The reservation takes one issue slot together
      ;; with fa726te_mac_pipe for a single cycle.
      ;; NOTE(review): the looping described above is not expressed as a
      ;; multi-cycle reservation of fa726te_mac_pipe -- confirm this is
      ;; intentional.
  120
  121 (define_insn_reservation "726te_mult_op" 3
  122  (and (eq_attr "tune" "fa726te")
  123       (eq_attr "type" "smlalxy,mul,mla,muls,mlas,umull,umlal,smull,smlal,\
  124                        umulls,umlals,smulls,smlals,smlawx,smulxy,smlaxy"))
  125  "fa726te_issue+fa726te_mac_pipe")
 126
 127 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 128 ;; Load/Store Instructions
 129 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 130
  131 ;; The models for load/store instructions do not accurately describe
  132 ;; the difference between operations with a base register writeback
  133 ;; (such as "ldm!") and those without.  These models assume that all
  134 ;; memory references hit in dcache.
 135
 136 ;; Loads with a shifted offset take 3 cycles, and are (a) probably the
 137 ;; most common and (b) the pessimistic assumption will lead to fewer stalls.
 138
  139 ;; Scalar loads are pipelined in FA726TE LSU pipe.
  140 ;; Here we model the resource conflict between Load@E3-stage & Store@W-stage.
  141 ;; The 2nd LSU (lsu1) is to model the fact that if 2 loads are scheduled in the
  142 ;; same "bundle", the 2nd load will introduce another ISSUE stall but is
  143 ;; still ok to execute (and may be beneficial sometimes).
      ;;
      ;; Default latency is 3.  The reservation has two alternatives:
      ;;   1. one issue slot plus both units of the first LSU, for one cycle;
      ;;   2. one issue slot plus both units of the second LSU (lsu1) in the
      ;;      first cycle, followed by fa726te_blockage (both issue slots) in
      ;;      the next cycle -- the extra issue stall mentioned above.
  144
  145 (define_insn_reservation "726te_load1_op" 3
  146  (and (eq_attr "tune" "fa726te")
  147       (eq_attr "type" "load_4,load_byte"))
  148  "(fa726te_issue+fa726te_lsu_pipe_e+fa726te_lsu_pipe_w)\
  149   | (fa726te_issue+fa726te_lsu1_pipe_e+fa726te_lsu1_pipe_w,fa726te_blockage)")
 150
  151 (define_insn_reservation "726te_store1_op" 1
      ;; Single-word store (type store_4), default latency 1.  The reservation
      ;; holds both issue slots (fa726te_blockage) for two consecutive cycles,
      ;; so nothing else can issue during that time.
      ;; NOTE(review): this reservation names no LSU unit, so any load/store
      ;; conflict is expressed here only through the issue blockage -- confirm
      ;; that matches the intended Load@E3 / Store@W model.
  152  (and (eq_attr "tune" "fa726te")
  153       (eq_attr "type" "store_4"))
  154  "fa726te_blockage*2")
 155
  156 ;; Load/Store Multiple blocks all pipelines in EX stages until WB.
  157 ;; No other instructions can be issued together.  Since they essentially
  158 ;; prevent all scheduling opportunities, we model them together here.
  159
  160 ;; An LDM is broken into multiple load instructions; a later instruction in
  161 ;; pipe 1 is stalled.
      ;;
      ;; 726te_ldm2_op: types load_8 and load_12.  Default latency is 4; both
      ;; issue slots are held for 4 consecutive cycles.
  162 (define_insn_reservation "726te_ldm2_op" 4
  163  (and (eq_attr "tune" "fa726te")
  164       (eq_attr "type" "load_8,load_12"))
  165  "fa726te_blockage*4")
  166
      ;; 726te_ldm3_op: type load_16.  Default latency is 5; both issue slots
      ;; are held for 5 consecutive cycles.
  167 (define_insn_reservation "726te_ldm3_op" 5
  168  (and (eq_attr "tune" "fa726te")
  169       (eq_attr "type" "load_16"))
  170  "fa726te_blockage*5")
 171
  172 (define_insn_reservation "726te_stm2_op" 2
      ;; Store multiple, types store_8 and store_12.  Default latency is 2, but
      ;; both issue slots are held for 3 consecutive cycles, i.e. the blockage
      ;; is one cycle longer than the latency.
  173  (and (eq_attr "tune" "fa726te")
  174       (eq_attr "type" "store_8,store_12"))
  175  "fa726te_blockage*3")
  176
      ;; Store multiple, type store_16.  Default latency is 3; both issue
      ;; slots are held for 4 consecutive cycles (again latency + 1).
  177 (define_insn_reservation "726te_stm3_op" 3
  178  (and (eq_attr "tune" "fa726te")
  179       (eq_attr "type" "store_16"))
  180  "fa726te_blockage*4")
 181
  182 (define_bypass 1 "726te_load1_op,726te_ldm2_op,726te_ldm3_op" "726te_store1_op,\
  183                   726te_stm2_op,726te_stm3_op" "arm_no_early_store_addr_dep")
      ;; ^ Any load feeding any store: latency 1 instead of the load's default
      ;;   latency, when the guard arm_no_early_store_addr_dep accepts the pair.
      ;;   The guard is defined outside this file; presumably it accepts the
      ;;   pair when the loaded value is not needed for the store address --
      ;;   TODO confirm.
      ;;
      ;; Every bypass in this group overrides the producer's default latency
      ;; for the listed producer -> consumer pairs.
      ;;
      ;; Any move/ALU/shifted-ALU/multiply result feeding a single-word store:
      ;; latency 0, under the same guard.  The store-multiple reservations are
      ;; not listed as consumers here.
  184 (define_bypass 0 "726te_shift_op,726te_alu_op,726te_alu_shift_op,\
  185                  726te_alu_shift_reg_op,726te_mult_op" "726te_store1_op"
  186                  "arm_no_early_store_addr_dep")
      ;; Move/plain-ALU result feeding a move/plain-ALU: latency 0, unguarded.
  187 (define_bypass 0 "726te_shift_op,726te_alu_op" "726te_shift_op,726te_alu_op")
      ;; Shifted-ALU result feeding a move/plain-ALU: latency 1 (default is 3).
  188 (define_bypass 1 "726te_alu_shift_op,726te_alu_shift_reg_op"
  189                  "726te_shift_op,726te_alu_op")
      ;; Shifted-ALU or multiply result feeding an immediate-shift ALU op:
      ;; latency 1 when the guard arm_no_early_alu_shift_dep accepts the pair.
  190 (define_bypass 1 "726te_alu_shift_op,726te_alu_shift_reg_op,726te_mult_op"
  191                  "726te_alu_shift_op" "arm_no_early_alu_shift_dep")
      ;; Same producers feeding a shift-by-register ALU op: latency 1 when the
      ;; guard arm_no_early_alu_shift_value_dep accepts the pair.
  192 (define_bypass 1 "726te_alu_shift_op,726te_alu_shift_reg_op,726te_mult_op"
  193                  "726te_alu_shift_reg_op" "arm_no_early_alu_shift_value_dep")
      ;; Multiply result feeding a move/plain-ALU: latency 1 (default is 3).
  194 (define_bypass 1 "726te_mult_op" "726te_shift_op,726te_alu_op")
  195
      ;; A load feeding a multiply costs one cycle more than the load's default
      ;; latency: 4 vs 3, 5 vs 4 and 6 vs 5 respectively.
  196 (define_bypass 4 "726te_load1_op" "726te_mult_op")
  197 (define_bypass 5 "726te_ldm2_op" "726te_mult_op")
  198 (define_bypass 6 "726te_ldm3_op" "726te_mult_op")
 199
 200 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 201 ;; Branch and Call Instructions
 202 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 203
  204 ;; Branch instructions are difficult to model accurately.  The FA726TE
  205 ;; core can predict most branches.  If the branch is predicted
  206 ;; correctly, and predicted early enough, the branch can be completely
  207 ;; eliminated from the instruction stream.  Some branches can
  208 ;; therefore appear to require zero cycles to execute.  We assume that
  209 ;; all branches are predicted correctly, and that the latency is
  210 ;; therefore the minimum value.
      ;;
      ;; Default latency is 0.  The reservation takes both issue slots
      ;; (fa726te_blockage) for one cycle, so a branch is modelled as issuing
      ;; alone.
  211
  212 (define_insn_reservation "726te_branch_op" 0
  213  (and (eq_attr "tune" "fa726te")
  214       (eq_attr "type" "branch"))
  215  "fa726te_blockage")
 216
  217 ;; The latency for a call is actually the latency when the result is available.
  218 ;; i.e. R0 is ready for int return value.
      ;; Default latency is 1.  As for branches, the reservation takes both
      ;; issue slots (fa726te_blockage) for one cycle.
  219 (define_insn_reservation "726te_call_op" 1
  220  (and (eq_attr "tune" "fa726te")
  221       (eq_attr "type" "call"))
  222  "fa726te_blockage")
 223