sysdeps/ia64/bzero.S

   1 /* Optimized version of the standard bzero() function.
   2    This file is part of the GNU C Library.
   3    Copyright (C) 2000-2021 Free Software Foundation, Inc.
   4
   5    The GNU C Library is free software; you can redistribute it and/or
   6    modify it under the terms of the GNU Lesser General Public
   7    License as published by the Free Software Foundation; either
   8    version 2.1 of the License, or (at your option) any later version.
   9
  10    The GNU C Library is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13    Lesser General Public License for more details.
  14
  15    You should have received a copy of the GNU Lesser General Public
  16    License along with the GNU C Library; if not, see
  17    <https://www.gnu.org/licenses/>.  */
  18
  19 /* Return: dest
  20
  21    Inputs:
  22         in0:    dest
  23         in1:    count
  24
  25    The algorithm is fairly straightforward: set byte by byte until
  26    we get to a 16B-aligned address, then loop on 128 B chunks using an
  27    early store as prefetching, then loop on 32B chunks, then clear remaining
  28    words, finally clear remaining bytes.
  29    Since a stf.spill f0 can store 16B in one go, we use this instruction
  30    to get peak speed.  */
  31
  32 #include <sysdep.h>
     // NOTE(review): presumably sysdep.h defines `ret' as a macro, which would
     // otherwise be expanded inside the br.ret mnemonics used below -- confirm.
  33 #undef ret
  34
     // Incoming arguments (see the "Inputs" list in the header comment).
  35 #define dest            in0     // start address of the area to clear
  36 #define cnt             in1     // bytes still to clear; decremented as stores are issued
  37
     // Working registers.
  38 #define tmp             r31     // short-lived scratch value
  39 #define save_lc         r30     // copy of ar.lc taken on entry
  40 #define ptr0            r29     // second store pointer in the 128-byte line loop
  41 #define ptr1            r28     // main store pointer; end of the full-line range during the line loop
  42 #define ptr2            r27     // companion store pointer running alongside ptr1
  43 #define ptr3            r26     // address of the last byte (short path only)
  44 #define ptr9            r24     // pointer for the store-ahead ("prefetch") stores
  45 #define loopcnt         r23     // iteration count that gets loaded into ar.lc
  46 #define linecnt         r22     // number of whole LINE_SIZE lines to clear
  47 #define bytecnt         r21     // (MIN1+1) - (dest & MIN1): bytes up to the next 16-byte boundary
  48
  49 // This routine uses only scratch predicate registers (p6 - p15)
  50 #define p_scr           p6      // default register for same-cycle branches
  51 #define p_unalgn        p9      // set when dest is not 16-byte aligned
  52 #define p_y             p11     // yes/no pair, written together by the tbit/cmp
  53 #define p_n             p12     //   instructions that pick each store size
  54 #define p_yy            p13     // second yes/no pair, so that consecutive
  55 #define p_nn            p14     //   decisions can alternate between the two pairs
  56
  57 #define movi0           mov     // plain mov; used below for the moves to/from ar.lc
  58
  59 #define MIN1            15      // mask for the offset inside a 16-byte block
  60 #define MIN1P1HALF      8       // (MIN1+1)/2: offset of the upper 8-byte half
  61 #define LINE_SIZE       128     // bytes cleared per iteration of the line loop
  62 #define LSIZE_SH        7                       // shift amount
  63 #define PREF_AHEAD      8       // how many lines the store-ahead runs in front
  64
     // Select the 8-byte store used by the 32-byte and word loops: with
     // USE_FLP it is stf8 of f0, with USE_INT it would be st8 of r0.
  65 #define USE_FLP
  66 #if defined(USE_INT)
  67 #define store           st8
  68 #define myval           r0
  69 #elif defined(USE_FLP)
  70 #define store           stf8
  71 #define myval           f0
  72 #endif
  73
  74 .align  64
     // bzero (dest = in0, cnt = in1): store cnt zero bytes starting at dest.
     // ret0 is set to dest.  ar.lc is copied to save_lc on entry and written
     // back at .restore_and_exit and inside .move_bytes_unaligned.
     //
     // Flow:
     //   cnt == 0  -> return at once
     //   cnt < 16  -> .move_bytes_unaligned (st1/st2 stores only)
     //   otherwise -> clear up to the next 16-byte boundary with
     //                st8/st4/st2/st1, then
     //                  .l1b               whole 128-byte lines (stf.spill f0);
     //                                     skipped when LINE_SIZE+1 > cnt
     //                  .fraction_of_line  32-byte chunks
     //                  .store_words       8-byte words
     //                  .move_bytes_from_alignment   last 0..7 bytes
  75 ENTRY(bzero)
  76 { .mmi
  77         .prologue
  78         alloc   tmp = ar.pfs, 2, 0, 0, 0        // frame: 2 inputs, no locals/outputs/rotating
  79         lfetch.nt1 [dest]                       // prefetch the line at dest
  80         .save   ar.lc, save_lc
  81         movi0   save_lc = ar.lc                 // remember the caller's loop count
  82 } { .mmi
  83         .body
  84         mov     ret0 = dest             // return value
  85         nop.m   0
  86         cmp.eq  p_scr, p0 = cnt, r0     // p_scr = (cnt == 0)
  87 ;; }
  88 { .mmi
  89         and     ptr2 = -(MIN1+1), dest  // dest rounded down to a 16-byte boundary
  90         and     tmp = MIN1, dest        // tmp = dest & 15, to check for alignment
  91         tbit.nz p_y, p_n = dest, 0      // Do we have an odd address? (M_B_U)
  92 } { .mib
  93         mov     ptr1 = dest
  94         nop.i   0
  95 (p_scr) br.ret.dpnt.many rp             // return immediately if count = 0
  96 ;; }
  97 { .mib
  98         cmp.ne  p_unalgn, p0 = tmp, r0          // p_unalgn = dest not 16-byte aligned
  99 } { .mib                                        // NB: # of bytes to move is 1
 100         sub     bytecnt = (MIN1+1), tmp         //     higher than loopcnt
 101         cmp.gt  p_scr, p0 = 16, cnt             // is it a minimalistic task?
 102 (p_scr) br.cond.dptk.many .move_bytes_unaligned // go move just a few (M_B_U)
 103 ;; }
     // Head alignment.  Bits 3..0 of bytecnt select one st8, st4, st2 and st1.
     // ptr2 walks downwards from the upper half of the 16-byte block, so the
     // stores fill the gap between dest and the boundary and each one is
     // naturally aligned.  All the tbit.nz.unc below are qualified by
     // p_unalgn; when it is clear they zero both of their target predicates,
     // so an already aligned dest performs none of these stores or updates.
 104 { .mmi
 105 (p_unalgn) add  ptr1 = (MIN1+1), ptr2           // ptr1 = next 16-byte boundary
 106 (p_unalgn) add  ptr2 = MIN1P1HALF, ptr2         // ptr2 = upper 8-byte half of the block
 107 (p_unalgn) tbit.nz.unc p_y, p_n = bytecnt, 3    // should we do a st8 ?
 108 ;; }
 109 { .mib
 110 (p_y)   add     cnt = -8, cnt
 111 (p_unalgn) tbit.nz.unc p_yy, p_nn = bytecnt, 2  // should we do a st4 ?
 112 } { .mib
 113 (p_y)   st8     [ptr2] = r0,-4                  // clear 8 bytes, step down to the st4 slot
 114 (p_n)   add     ptr2 = 4, ptr2                  // no st8: st4 slot is 4 bytes higher
 115 ;; }
 116 { .mib
 117 (p_yy)  add     cnt = -4, cnt
 118 (p_unalgn) tbit.nz.unc p_y, p_n = bytecnt, 1    // should we do a st2 ?
 119 } { .mib
 120 (p_yy)  st4     [ptr2] = r0,-2                  // clear 4 bytes, step down to the st2 slot
 121 (p_nn)  add     ptr2 = 2, ptr2                  // no st4: st2 slot is 2 bytes higher
 122 ;; }
 123 { .mmi
 124         mov     tmp = LINE_SIZE+1               // for compare
 125 (p_y)   add     cnt = -2, cnt
 126 (p_unalgn) tbit.nz.unc p_yy, p_nn = bytecnt, 0  // should we do a st1 ?
 127 } { .mmi
 128         nop.m   0
 129 (p_y)   st2     [ptr2] = r0,-1                  // clear 2 bytes, step down to the st1 slot
 130 (p_n)   add     ptr2 = 1, ptr2                  // no st2: st1 slot is 1 byte higher
 131 ;; }
 132
 133 { .mmi
 134 (p_yy)  st1     [ptr2] = r0
 135         cmp.gt  p_scr, p0 = tmp, cnt            // is it a minimalistic task?
 136 } { .mbb
 137 (p_yy)  add     cnt = -1, cnt
 138 (p_scr) br.cond.dpnt.many .fraction_of_line     // go move just a few
 139 ;; }
 140 { .mib
 141         nop.m   0
 142         shr.u   linecnt = cnt, LSIZE_SH         // linecnt = cnt / LINE_SIZE
 143         nop.b   0
 144 ;; }
 145
 146         .align 32
 147 .l1b:   // ------------------//  L1B: store ahead into cache lines; fill later
     // ptr1 is 16-byte aligned here.  ptr9 first writes the leading 16 bytes
     // of up to PREF_AHEAD lines (.pref_l1b); .l1bx then fills the other
     // 112 bytes of each line while ptr9 keeps running ahead.
 148 { .mmi
 149         and     tmp = -(LINE_SIZE), cnt         // compute end of range
 150         mov     ptr9 = ptr1                     // used for prefetching
 151         and     cnt = (LINE_SIZE-1), cnt        // remainder
 152 } { .mmi
 153         mov     loopcnt = PREF_AHEAD-1          // default prefetch loop
 154         cmp.gt  p_scr, p0 = PREF_AHEAD, linecnt // check against actual value
 155 ;; }
 156 { .mmi
 157 (p_scr) add     loopcnt = -1, linecnt   // fewer than PREF_AHEAD lines: cover them all
 158         add     ptr2 = 16, ptr1 // start of stores (beyond prefetch stores)
 159         add     ptr1 = tmp, ptr1        // first address beyond total range
 160 ;; }
 161 { .mmi
 162         add     tmp = -1, linecnt       // next loop count
 163         movi0   ar.lc = loopcnt
 164 ;; }
 165 .pref_l1b:
 166 { .mib
 167         stf.spill [ptr9] = f0, 128      // Do stores one cache line apart
 168         nop.i   0
 169         br.cloop.dptk.few .pref_l1b
 170 ;; }
 171 { .mmi
 172         add     ptr0 = 16, ptr2         // Two stores in parallel
 173         movi0   ar.lc = tmp             // one .l1bx pass per full line
 174 ;; }
     // One pass clears offsets 16..127 of the current line with seven
     // 16-byte stores: ptr2 handles offsets 16/48/80/112 and ptr0 handles
     // 32/64/96.  Offsets 0..15 were already written through ptr9.
 175 .l1bx:
 176  { .mmi
 177         stf.spill [ptr2] = f0, 32
 178         stf.spill [ptr0] = f0, 32
 179  ;; }
 180  { .mmi
 181         stf.spill [ptr2] = f0, 32
 182         stf.spill [ptr0] = f0, 32
 183  ;; }
 184  { .mmi
 185         stf.spill [ptr2] = f0, 32
 186         stf.spill [ptr0] = f0, 64       // skip ahead to offset 32 of the next line
 187         cmp.lt  p_scr, p0 = ptr9, ptr1  // do we need more prefetching?
 188  ;; }
 189 { .mmb
 190         stf.spill [ptr2] = f0, 32
 191 (p_scr) stf.spill [ptr9] = f0, 128      // store ahead into one more line
 192         br.cloop.dptk.few .l1bx
 193 ;; }
 194 { .mib
 195         cmp.gt  p_scr, p0 = 8, cnt      // just a few bytes left ?
 196 (p_scr) br.cond.dpnt.many  .move_bytes_from_alignment
 197 ;; }
 198
     // Clear the remainder in 32-byte chunks, starting at ptr1.
 199 .fraction_of_line:
 200 { .mib
 201         add     ptr2 = 16, ptr1         // ptr2 covers the upper 16 bytes of each chunk
 202         shr.u   loopcnt = cnt, 5        // loopcnt = cnt / 32
 203 ;; }
 204 { .mib
 205         cmp.eq  p_scr, p0 = loopcnt, r0 // no complete 32-byte chunk?
 206         add     loopcnt = -1, loopcnt   // ar.lc value is the pass count minus one
 207 (p_scr) br.cond.dpnt.many .store_words
 208 ;; }
 209 { .mib
 210         and     cnt = 0x1f, cnt         // compute the remaining cnt
 211         movi0   ar.lc = loopcnt
 212 ;; }
 213         .align 32
 214 .l2:    // -----------------------------//  L2A:  store 32B in 2 cycles
 215 { .mmb
 216         store   [ptr1] = myval, 8       // chunk offset 0
 217         store   [ptr2] = myval, 8       // chunk offset 16
 218 ;; } { .mmb
 219         store   [ptr1] = myval, 24      // chunk offset 8; ptr1 then points at the next chunk
 220         store   [ptr2] = myval, 24      // chunk offset 24; ptr2 then at next chunk + 16
 221         br.cloop.dptk.many .l2
 222 ;; }
     // cnt is below 32 here: at most three 8-byte stores follow.
 223 .store_words:
 224 { .mib
 225         cmp.gt  p_scr, p0 = 8, cnt      // just a few bytes left ?
 226 (p_scr) br.cond.dpnt.many .move_bytes_from_alignment    // Branch
 227 ;; }
 228
 229 { .mmi
 230         store   [ptr1] = myval, 8       // store
 231         cmp.le  p_y, p_n = 16, cnt      // p_y = room for a second word
 232         add     cnt = -8, cnt           // subtract
 233 ;; }
 234 { .mmi
 235 (p_y)   store   [ptr1] = myval, 8       // store
 236 (p_y)   cmp.le.unc p_yy, p_nn = 16, cnt // p_yy = room for a third word
 237 (p_y)   add     cnt = -8, cnt           // subtract
 238 ;; }
 239 { .mmi                                  // store
 240 (p_yy)  store   [ptr1] = myval, 8
 241 (p_yy)  add     cnt = -8, cnt           // subtract
 242 ;; }
 243
     // cnt is below 8 here; its bits 2, 1 and 0 select a final st4, st2, st1.
 244 .move_bytes_from_alignment:
 245 { .mib
 246         cmp.eq  p_scr, p0 = cnt, r0     // nothing left at all?
 247         tbit.nz.unc p_y, p0 = cnt, 2    // should we terminate with a st4 ?
 248 (p_scr) br.cond.dpnt.few .restore_and_exit
 249 ;; }
 250 { .mib
 251 (p_y)   st4     [ptr1] = r0,4
 252         tbit.nz.unc p_yy, p0 = cnt, 1   // should we terminate with a st2 ?
 253 ;; }
 254 { .mib
 255 (p_yy)  st2     [ptr1] = r0,2
 256         tbit.nz.unc p_y, p0 = cnt, 0    // should we terminate with a st1 ?
 257 ;; }
 258
 259 { .mib
 260 (p_y)   st1     [ptr1] = r0
 261 ;; }
 262 .restore_and_exit:
 263 { .mib
 264         nop.m   0
 265         movi0   ar.lc = save_lc         // put back the caller's loop count
 266         br.ret.sptk.many rp
 267 ;; }
 268
     // Short path, reached when 16 > cnt.  p_y/p_n still hold the "dest is
     // odd" test made at entry.  An odd leading byte is cleared with st1,
     // then ptr1 and ptr2 (2 bytes apart) clear 4 bytes per step with a pair
     // of st2, and a trailing st2 and/or st1 finishes the area.
 269 .move_bytes_unaligned:
 270 { .mmi
 271        .pred.rel "mutex",p_y, p_n
 272        .pred.rel "mutex",p_yy, p_nn
 273 (p_n)   cmp.le  p_yy, p_nn = 4, cnt     // even dest: a 4-byte step needs cnt >= 4
 274 (p_y)   cmp.le  p_yy, p_nn = 5, cnt     // odd dest: one more for the leading byte
 275 (p_n)   add     ptr2 = 2, ptr1
 276 } { .mmi
 277 (p_y)   add     ptr2 = 3, ptr1          // 2 bytes past ptr1 after its st1 increment
 278 (p_y)   st1     [ptr1] = r0, 1          // fill 1 (odd-aligned) byte
 279 (p_y)   add     cnt = -1, cnt           // [15, 14 (or less) left]
 280 ;; }
 281 { .mmi
 282 (p_yy)  cmp.le.unc p_y, p0 = 8, cnt     // room for a second 4-byte step?
 283         add     ptr3 = ptr1, cnt        // prepare last store
 284         movi0   ar.lc = save_lc         // this path returns without .restore_and_exit
 285 } { .mmi
 286 (p_yy)  st2     [ptr1] = r0, 4          // fill 2 (aligned) bytes
 287 (p_yy)  st2     [ptr2] = r0, 4          // fill 2 (aligned) bytes
 288 (p_yy)  add     cnt = -4, cnt           // [11, 10 (or less) left]
 289 ;; }
 290 { .mmi
 291 (p_y)   cmp.le.unc p_yy, p0 = 8, cnt    // room for a third 4-byte step?
 292         add     ptr3 = -1, ptr3         // last store
 293         tbit.nz p_scr, p0 = cnt, 1      // will there be a st2 at the end ?
 294 } { .mmi
 295 (p_y)   st2     [ptr1] = r0, 4          // fill 2 (aligned) bytes
 296 (p_y)   st2     [ptr2] = r0, 4          // fill 2 (aligned) bytes
 297 (p_y)   add     cnt = -4, cnt           // [7, 6 (or less) left]
 298 ;; }
 299 { .mmi
 300 (p_yy)  st2     [ptr1] = r0, 4          // fill 2 (aligned) bytes
 301 (p_yy)  st2     [ptr2] = r0, 4          // fill 2 (aligned) bytes
 302                                         // [3, 2 (or less) left]
 303         tbit.nz p_y, p0 = cnt, 0        // will there be a st1 at the end ?
 304 } { .mmi
 305 (p_yy)  add     cnt = -4, cnt
 306 ;; }
 307 { .mmb
 308 (p_scr) st2     [ptr1] = r0             // fill 2 (aligned) bytes
 309 (p_y)   st1     [ptr3] = r0             // fill last byte (using ptr3)
 310         br.ret.sptk.many rp
 311 ;; }
 312 END(bzero)