sysdeps/ia64/memcpy.S
/* Optimized version of the standard memcpy() function.
   This file is part of the GNU C Library.
   Copyright (C) 2000, 2001, 2003 Free Software Foundation, Inc.
   Contributed by Dan Pop for Itanium <Dan.Pop@cern.ch>.
   Rewritten for McKinley by Sverre Jarp, HP Labs/CERN <Sverre.Jarp@cern.ch>

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, write to the Free
   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
   02111-1307 USA.  */

/* Return: dest

   Inputs:
        in0:    dest
        in1:    src
        in2:    byte count

   An assembly implementation of the algorithm used by the generic C
   version from glibc.  The case when source and dest are aligned is
   treated separately, for extra performance.

   In this form, memcpy assumes little endian mode.  For big endian mode,
   sh1 must be computed using an extra instruction: sub sh1 = 64, sh1,
   and the order of r[MEMLAT] and r[MEMLAT+1] must be reversed in the
   shrp instruction.  */
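// Editor's note (not part of the original file): a minimal C sketch of the
// strategy implemented below -- align dest, then either plain 8-byte word
// copies or little-endian merging of neighbouring aligned source words --
// under the assumptions stated above.  The helper name memcpy_sketch is
// hypothetical; like the assembly, it reads whole aligned source words in
// the misaligned case.
//
//   #include <stddef.h>
//   #include <stdint.h>
//
//   static void *memcpy_sketch (void *dst, const void *src, size_t n)
//   {
//     unsigned char *d = dst;
//     const unsigned char *s = src;
//
//     if (n > 16)                                   /* OP_T_THRES */
//       {
//         while (((uintptr_t) d & 7) != 0 && n != 0)        /* align dest */
//           { *d++ = *s++; n--; }
//
//         if (((uintptr_t) s & 7) == 0)             /* src aligned too */
//           for (; n >= 8; n -= 8, d += 8, s += 8)
//             *(uint64_t *) d = *(const uint64_t *) s;
//         else                                      /* src misaligned: merge */
//           {
//             unsigned sh = ((uintptr_t) s & 7) * 8;        /* sh1 */
//             const uint64_t *as = (const uint64_t *) ((uintptr_t) s & ~(uintptr_t) 7);
//             uint64_t lo = *as++;
//             for (; n >= 8; n -= 8, d += 8, s += 8)
//               {
//                 uint64_t hi = *as++;
//                 *(uint64_t *) d = (lo >> sh) | (hi << (64 - sh));
//                 lo = hi;
//               }
//           }
//       }
//     for (; n != 0; n--)                           /* trailing bytes */
//       *d++ = *s++;
//     return dst;
//   }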

#define USE_LFETCH
#define USE_FLP
#include <sysdep.h>
#undef ret

#define LFETCH_DIST     500

#define ALIGN_UNROLL_no 4       // no. of elements
#define ALIGN_UNROLL_sh 2       // (shift amount)

#define MEMLAT  8
#define Nrot    ((4*(MEMLAT+2) + 7) & ~7)
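// Editor's note: with MEMLAT = 8 this works out to (4*(8+2) + 7) & ~7
// = 47 & ~7 = 40 rotating registers, already a multiple of 8 as the
// rotating-register region of alloc requires; the "+ 7) & ~7" rounds up
// to the next multiple of 8 for other MEMLAT values.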

#define OP_T_THRES      16
#define OPSIZ           8

#define loopcnt         r14
#define elemcnt         r15
#define saved_pr        r16
#define saved_lc        r17
#define adest           r18
#define dest            r19
#define asrc            r20
#define src             r21
#define len             r22
#define tmp2            r23
#define tmp3            r24
#define tmp4            r25
#define ptable          r26
#define ploop56         r27
#define loopaddr        r28
#define sh1             r29
#define ptr1            r30
#define ptr2            r31

#define movi0           mov

#define p_scr           p6
#define p_xtr           p7
#define p_nxtr          p8
#define p_few           p9

#if defined(USE_FLP)
#define load            ldf8
#define store           stf8
#define tempreg         f6
#define the_r           fr
#define the_s           fs
#define the_t           ft
#define the_q           fq
#define the_w           fw
#define the_x           fx
#define the_y           fy
#define the_z           fz
#elif defined(USE_INT)
#define load            ld8
#define store           st8
#define tempreg         tmp2
#define the_r           r
#define the_s           s
#define the_t           t
#define the_q           q
#define the_w           w
#define the_x           x
#define the_y           y
#define the_z           z
#endif

#ifdef GAS_ALIGN_BREAKS_UNWIND_INFO
/* Manually force proper loop-alignment.  Note: be sure to
   double-check the code-layout after making any changes to
   this routine! */
# define ALIGN(n)       { nop 0 }
#else
# define ALIGN(n)       .align n
#endif

#if defined(USE_LFETCH)
#define LOOP(shift)                                                     \
                ALIGN(32);                                              \
.loop##shift##:                                                         \
        { .mmb                                                          \
        (p[0])  ld8.nt1 r[0] = [asrc], 8 ;                              \
        (p[0])  lfetch.nt1 [ptr1], 16 ;                                 \
                nop.b 0 ;                                               \
        } { .mib                                                        \
        (p[MEMLAT+1]) st8 [dest] = tmp3, 8 ;                            \
        (p[MEMLAT]) shrp tmp3 = r[MEMLAT], s[MEMLAT+1], shift ;         \
                nop.b 0 ;;                                              \
        } { .mmb                                                        \
        (p[0])  ld8.nt1 s[0] = [asrc], 8 ;                              \
        (p[0])  lfetch.nt1 [ptr2], 16 ;                                 \
                nop.b 0 ;                                               \
        } { .mib                                                        \
        (p[MEMLAT+1]) st8 [dest] = tmp4, 8 ;                            \
        (p[MEMLAT]) shrp tmp4 = s[MEMLAT], r[MEMLAT], shift ;           \
                br.ctop.sptk.many .loop##shift                          \
        ;; }                                                            \
        { .mib                                                          \
                br.cond.sptk.many .copy_bytes ; /* deal with the remaining bytes */ \
        }
#else
#define LOOP(shift)                                                     \
                ALIGN(32);                                              \
.loop##shift##:                                                         \
        { .mmb                                                          \
        (p[0])  ld8.nt1 r[0] = [asrc], 8 ;                              \
                nop.b 0 ;                                               \
        } { .mib                                                        \
        (p[MEMLAT+1]) st8 [dest] = tmp3, 8 ;                            \
        (p[MEMLAT]) shrp tmp3 = r[MEMLAT], s[MEMLAT+1], shift ;         \
                nop.b 0 ;;                                              \
        } { .mmb                                                        \
        (p[0])  ld8.nt1 s[0] = [asrc], 8 ;                              \
                nop.b 0 ;                                               \
        } { .mib                                                        \
        (p[MEMLAT+1]) st8 [dest] = tmp4, 8 ;                            \
        (p[MEMLAT]) shrp tmp4 = s[MEMLAT], r[MEMLAT], shift ;           \
                br.ctop.sptk.many .loop##shift                          \
        ;; }                                                            \
        { .mib                                                          \
                br.cond.sptk.many .copy_bytes ; /* deal with the remaining bytes */ \
        }
#endif
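// Editor's note (descriptive, not in the original file): each LOOP(shift)
// iteration loads two more aligned 8-byte source words and stores two 8-byte
// destination words, each assembled from a pair of neighbouring source words.
// In C terms, the little-endian shrp merge is roughly
//   out = (w_lo >> shift) | (w_hi << (64 - shift));
// where w_lo is the lower-addressed aligned word and shift = 8 * (src % 8).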


ENTRY(memcpy)
{ .mmi
        .prologue
        alloc   r2 = ar.pfs, 3, Nrot - 3, 0, Nrot
        .rotr   r[MEMLAT+1], s[MEMLAT+2], q[MEMLAT+1], t[MEMLAT+1]
        .rotp   p[MEMLAT+2]
        .rotf   fr[MEMLAT+1], fq[MEMLAT+1], fs[MEMLAT+1], ft[MEMLAT+1]
        mov     ret0 = in0              // return value = dest
        .save pr, saved_pr
        movi0   saved_pr = pr           // save the predicate registers
} { .mmi
        and     tmp4 = 7, in0           // check if destination is aligned
        mov     dest = in0              // dest
        mov     src = in1               // src
;; }
{ .mii
        cmp.eq  p_scr, p0 = in2, r0     // if (len == 0)
        .save ar.lc, saved_lc
        movi0   saved_lc = ar.lc        // save the loop counter
        .body
        cmp.ge  p_few, p0 = OP_T_THRES, in2 // is len <= OP_T_THRES
} { .mbb
        mov     len = in2               // len
(p_scr) br.cond.dpnt.few .restore_and_exit // Branch no. 1: return dest
(p_few) br.cond.dpnt.many .copy_bytes   // Branch no. 2: copy byte by byte
;; }
{ .mmi
#if defined(USE_LFETCH)
        lfetch.nt1 [dest]               //
        lfetch.nt1 [src]                //
#endif
        shr.u   elemcnt = len, 3        // elemcnt = len / 8
} { .mib
        cmp.eq  p_scr, p0 = tmp4, r0    // is destination aligned?
        sub     loopcnt = 7, tmp4       //
(p_scr) br.cond.dptk.many .dest_aligned
;; }
{ .mmi
        ld1     tmp2 = [src], 1         //
        sub     len = len, loopcnt, 1   // reduce len
        movi0   ar.lc = loopcnt         //
} { .mib
        cmp.ne  p_scr, p0 = 0, loopcnt  // avoid loading beyond end-point
;; }

.l0:    // ---------------------------- // L0: Align dest on 8-byte boundary
{ .mmi
        st1     [dest] = tmp2, 1        //
(p_scr) ld1     tmp2 = [src], 1         //
} { .mib
        cmp.lt  p_scr, p0 = 1, loopcnt  // avoid load beyond end-point
        add     loopcnt = -1, loopcnt
        br.cloop.dptk.few .l0           //
;; }
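// Editor's note: entering .l0, tmp4 = dest & 7 is nonzero and loopcnt = 7 - tmp4,
// so br.cloop runs the loop ar.lc + 1 = 8 - tmp4 times -- exactly the number of
// bytes needed to reach the next 8-byte boundary; len was already reduced by
// loopcnt + 1 above.  For example, dest % 8 == 3 gives loopcnt = 4 and five
// single-byte copies.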

.dest_aligned:
{ .mmi
        and     tmp4 = 7, src           // ready for alignment check
        shr.u   elemcnt = len, 3        // elemcnt = len / 8
;; }
{ .mib
        cmp.ne  p_scr, p0 = tmp4, r0    // is source also aligned?
        tbit.nz p_xtr, p_nxtr = src, 3  // prepare a separate move if src
} { .mib                                // is not 16B aligned
        add     ptr2 = LFETCH_DIST, dest // prefetch address
        add     ptr1 = LFETCH_DIST, src
(p_scr) br.cond.dptk.many .src_not_aligned
;; }

// The optimal case, when dest and src are aligned

.both_aligned:
{ .mmi
        .pred.rel "mutex",p_xtr,p_nxtr
(p_xtr) cmp.gt  p_scr, p0 = ALIGN_UNROLL_no+1, elemcnt // Need N + 1 to qualify
(p_nxtr) cmp.gt p_scr, p0 = ALIGN_UNROLL_no, elemcnt   // Need only N to qualify
        movi0   pr.rot = 1 << 16        // set rotating predicates
} { .mib
(p_scr) br.cond.dpnt.many .copy_full_words
;; }

{ .mmi
(p_xtr) load    tempreg = [src], 8
(p_xtr) add     elemcnt = -1, elemcnt
        movi0   ar.ec = MEMLAT + 1      // set the epilog counter
;; }
{ .mmi
(p_xtr) add     len = -8, len           //
        add     asrc = 16, src          // one bank apart (for USE_INT)
        shr.u   loopcnt = elemcnt, ALIGN_UNROLL_sh // cater for unrolling
;; }
{ .mmi
        add     loopcnt = -1, loopcnt
(p_xtr) store   [dest] = tempreg, 8     // copy the "extra" word
        nop.i   0
;; }
{ .mib
        add     adest = 16, dest
        movi0   ar.lc = loopcnt         // set the loop counter
;; }
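// Editor's note: the predicated "extra" word copy above appears to serve the
// 16-byte ldfp8 load pairs in the USE_FLP loop below: when src is 8-byte but
// not 16-byte aligned (p_xtr), one 8-byte word is copied first so the unrolled
// loop can start on a 16-byte source boundary.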

#ifdef GAS_ALIGN_BREAKS_UNWIND_INFO
        { nop 0 }
#else
        .align  32
#endif
#if defined(USE_FLP)
.l1: // ------------------------------- // L1: Everything a multiple of 8
{ .mmi
#if defined(USE_LFETCH)
(p[0])  lfetch.nt1 [ptr2], 32
#endif
(p[0])  ldfp8   the_r[0], the_q[0] = [src], 16
(p[0])  add     len = -32, len
} { .mmb
(p[MEMLAT])     store [dest] = the_r[MEMLAT], 8
(p[MEMLAT])     store [adest] = the_s[MEMLAT], 8
;; }
{ .mmi
#if defined(USE_LFETCH)
(p[0])  lfetch.nt1 [ptr1], 32
#endif
(p[0])  ldfp8   the_s[0], the_t[0] = [src], 16
} { .mmb
(p[MEMLAT])     store [dest] = the_q[MEMLAT], 24
(p[MEMLAT])     store [adest] = the_t[MEMLAT], 24
        br.ctop.dptk.many .l1
;; }
#elif defined(USE_INT)
.l1: // ------------------------------- // L1: Everything a multiple of 8
{ .mmi
(p[0])  load    the_r[0] = [src], 8
(p[0])  load    the_q[0] = [asrc], 8
(p[0])  add     len = -32, len
} { .mmb
(p[MEMLAT])     store [dest] = the_r[MEMLAT], 8
(p[MEMLAT])     store [adest] = the_q[MEMLAT], 8
;; }
{ .mmi
(p[0])  load    the_s[0] = [src], 24
(p[0])  load    the_t[0] = [asrc], 24
} { .mmb
(p[MEMLAT])     store [dest] = the_s[MEMLAT], 24
(p[MEMLAT])     store [adest] = the_t[MEMLAT], 24
#if defined(USE_LFETCH)
;; }
{ .mmb
(p[0])  lfetch.nt1 [ptr2], 32
(p[0])  lfetch.nt1 [ptr1], 32
#endif
        br.ctop.dptk.many .l1
;; }
#endif

.copy_full_words:
{ .mib
        cmp.gt  p_scr, p0 = 8, len      //
        shr.u   elemcnt = len, 3        //
(p_scr) br.cond.dpnt.many .copy_bytes
;; }
{ .mii
        load    tempreg = [src], 8
        add     loopcnt = -1, elemcnt   //
;; }
{ .mii
        cmp.ne  p_scr, p0 = 0, loopcnt  //
        mov     ar.lc = loopcnt         //
;; }

.l2: // ------------------------------- // L2: Max 4 words copied separately
{ .mmi
        store   [dest] = tempreg, 8
(p_scr) load    tempreg = [src], 8      //
        add     len = -8, len
} { .mib
        cmp.lt  p_scr, p0 = 1, loopcnt  // avoid load beyond end-point
        add     loopcnt = -1, loopcnt
        br.cloop.dptk.few .l2
;; }

.copy_bytes:
{ .mib
        cmp.eq  p_scr, p0 = len, r0     // is len == 0 ?
        add     loopcnt = -1, len       // len--;
(p_scr) br.cond.spnt .restore_and_exit
;; }
{ .mii
        ld1     tmp2 = [src], 1
        movi0   ar.lc = loopcnt
        cmp.ne  p_scr, p0 = 0, loopcnt  // avoid load beyond end-point
;; }

.l3: // ------------------------------- // L3: Final byte move
{ .mmi
        st1     [dest] = tmp2, 1
(p_scr) ld1     tmp2 = [src], 1
} { .mib
        cmp.lt  p_scr, p0 = 1, loopcnt  // avoid load beyond end-point
        add     loopcnt = -1, loopcnt
        br.cloop.dptk.few .l3
;; }

.restore_and_exit:
{ .mmi
        movi0   pr = saved_pr, -1       // restore the predicate registers
;; }
{ .mib
        movi0   ar.lc = saved_lc        // restore the loop counter
        br.ret.sptk.many b0
;; }


.src_not_aligned:
{ .mmi
        cmp.gt  p_scr, p0 = 16, len
        and     sh1 = 7, src            // sh1 = src % 8
        shr.u   loopcnt = len, 4        // element-cnt = len / 16
} { .mib
        add     tmp4 = @ltoff(.table), gp
        add     tmp3 = @ltoff(.loop56), gp
(p_scr) br.cond.dpnt.many .copy_bytes   // do byte by byte if too few
;; }
{ .mmi
        and     asrc = -8, src          // asrc = src & -8 -- align src for loop
        add     loopcnt = -1, loopcnt   // loopcnt--
        shl     sh1 = sh1, 3            // sh1 = 8 * (src % 8)
} { .mmi
        ld8     ptable = [tmp4]         // ptable = &table
        ld8     ploop56 = [tmp3]        // ploop56 = &loop56
        and     tmp2 = -16, len         // tmp2 = len & -16
;; }
{ .mmi
        add     tmp3 = ptable, sh1      // tmp3 = &table + sh1
        add     src = src, tmp2         // src += len & (-16)
        movi0   ar.lc = loopcnt         // set LC
;; }
{ .mmi
        ld8     tmp4 = [tmp3]           // tmp4 = loop offset
        sub     len = len, tmp2         // len -= len & (-16)
        movi0   ar.ec = MEMLAT + 2      // one more pass needed
;; }
{ .mmi
        ld8     s[1] = [asrc], 8        // preload
        sub     loopaddr = ploop56, tmp4 // loopaddr = &loop56 - loop offset
        movi0   pr.rot = 1 << 16        // set rotating predicates
;; }
{ .mib
        nop.m   0
        movi0   b6 = loopaddr
        br      b6                      // jump to the appropriate loop
;; }
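// Editor's note: the computed branch above works as follows.  tmp4 holds
// table[src % 8] = .loop56 - .loop{8*(src % 8)}, so
//   loopaddr = &.loop56 - tmp4 = &.loop{8*(src % 8)}
// i.e. the LOOP variant whose shrp shift count matches the source
// misalignment.  For example, src % 8 == 3 selects .loop24 (shift = 24).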

LOOP(8)
LOOP(16)
LOOP(24)
LOOP(32)
LOOP(40)
LOOP(48)
LOOP(56)
END(memcpy)
libc_hidden_builtin_def (memcpy)

        .rodata
        .align 8
.table:
        data8   0                       // dummy entry
        data8   .loop56 - .loop8
        data8   .loop56 - .loop16
        data8   .loop56 - .loop24
        data8   .loop56 - .loop32
        data8   .loop56 - .loop40
        data8   .loop56 - .loop48
        data8   .loop56 - .loop56