/* memmove/memcpy/mempcpy with unaligned load/store and rep movsb
   Copyright (C) 2016-2018 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */
/* memmove/memcpy/mempcpy is implemented as:
   1. Use overlapping load and store to avoid branch.
   2. Load all sources into registers and store them together to avoid
      possible address overlap between source and destination.
   3. If size is 8 * VEC_SIZE or less, load all sources into registers
      and store them together.
   4. If address of destination > address of source, backward copy
      4 * VEC_SIZE at a time with unaligned load and aligned store.
      Load the first 4 * VEC and last VEC before the loop and store
      them after the loop to support overlapping addresses.
   5. Otherwise, forward copy 4 * VEC_SIZE at a time with unaligned
      load and aligned store.  Load the last 4 * VEC and first VEC
      before the loop and store them after the loop to support
      overlapping addresses.
   6. If size >= __x86_shared_non_temporal_threshold and there is no
      overlap between destination and source, use non-temporal store
      instead of aligned store.  */
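/* Illustration (not part of the original source): strategies 1 and 2
   above as a minimal C sketch of the 8-to-15-byte case.  Both loads
   happen before either store, so the copy stays correct when the
   buffers overlap, and no branch on the exact size is needed; the
   head and tail stores simply overlap each other when size < 16:

	#include <stddef.h>
	#include <stdint.h>
	#include <string.h>

	static void
	copy_8_to_15 (char *dst, const char *src, size_t size)
	{
	  uint64_t head, tail;
	  memcpy (&head, src, 8);
	  memcpy (&tail, src + size - 8, 8);
	  memcpy (dst, &head, 8);
	  memcpy (dst + size - 8, &tail, 8);
	}

   L(between_8_15) below is the same idea with two movq loads and two
   movq stores; the VEC paths apply it at vector width.  */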
#ifndef MEMCPY_SYMBOL
# define MEMCPY_SYMBOL(p,s)		MEMMOVE_SYMBOL(p, s)
#endif

#ifndef MEMPCPY_SYMBOL
# define MEMPCPY_SYMBOL(p,s)		MEMMOVE_SYMBOL(p, s)
#endif

#ifndef MEMMOVE_CHK_SYMBOL
# define MEMMOVE_CHK_SYMBOL(p,s)	MEMMOVE_SYMBOL(p, s)
#endif

#ifndef VZEROUPPER
# if VEC_SIZE > 16
#  define VZEROUPPER vzeroupper
# else
#  define VZEROUPPER
# endif
#endif
/* Threshold to use Enhanced REP MOVSB.  Since there is overhead to set
   up REP MOVSB operation, REP MOVSB isn't faster on short data.  The
   memcpy micro benchmark in glibc shows that 2KB is the approximate
   value above which REP MOVSB becomes faster than SSE2 optimization
   on processors with Enhanced REP MOVSB.  Since a larger register size
   can move more data with a single load and store, the threshold is
   higher with a larger register size.  */
#ifndef REP_MOVSB_THRESHOLD
# define REP_MOVSB_THRESHOLD (2048 * (VEC_SIZE / 16))
#endif
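/* For the default formula above this works out to 2048 bytes for SSE2
   (VEC_SIZE == 16), 4096 bytes for AVX (VEC_SIZE == 32) and 8192 bytes
   for AVX-512 (VEC_SIZE == 64).  */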
/* Avoid short distance rep movsb only with non-SSE vector.  */
#ifndef AVOID_SHORT_DISTANCE_REP_MOVSB
# define AVOID_SHORT_DISTANCE_REP_MOVSB (VEC_SIZE > 16)
#else
# define AVOID_SHORT_DISTANCE_REP_MOVSB 0
#endif
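/* Under the default above this evaluates to 1 for the AVX and AVX-512
   builds (VEC_SIZE > 16) and to 0 for SSE2 (VEC_SIZE == 16), so the
   short distance check after L(movsb) below is only assembled into the
   wider-vector variants.  */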
#ifndef PREFETCH
# define PREFETCH(addr) prefetcht0 addr
#endif

/* Assume 64-byte prefetch size.  */
#ifndef PREFETCH_SIZE
# define PREFETCH_SIZE 64
#endif
#define PREFETCHED_LOAD_SIZE (VEC_SIZE * 4)

#if PREFETCH_SIZE == 64
# if PREFETCHED_LOAD_SIZE == PREFETCH_SIZE
#  define PREFETCH_ONE_SET(dir, base, offset) \
	PREFETCH ((offset)base)
# elif PREFETCHED_LOAD_SIZE == 2 * PREFETCH_SIZE
#  define PREFETCH_ONE_SET(dir, base, offset) \
	PREFETCH ((offset)base); \
	PREFETCH ((offset + dir * PREFETCH_SIZE)base)
# elif PREFETCHED_LOAD_SIZE == 4 * PREFETCH_SIZE
#  define PREFETCH_ONE_SET(dir, base, offset) \
	PREFETCH ((offset)base); \
	PREFETCH ((offset + dir * PREFETCH_SIZE)base); \
	PREFETCH ((offset + dir * PREFETCH_SIZE * 2)base); \
	PREFETCH ((offset + dir * PREFETCH_SIZE * 3)base)
# else
#  error Unsupported PREFETCHED_LOAD_SIZE!
# endif
#else
# error Unsupported PREFETCH_SIZE!
#endif
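/* Illustration (not part of the original source): with VEC_SIZE == 32,
   PREFETCHED_LOAD_SIZE is 128 == 2 * PREFETCH_SIZE, so
   PREFETCH_ONE_SET (1, (%rsi), 128) expands to

	prefetcht0 (128)(%rsi); prefetcht0 (128 + 1 * 64)(%rsi)

   i.e. the two cache lines one loop iteration ahead of the current
   loads; dir == -1 prefetches the same distance backward.  */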
#ifndef SECTION
# error SECTION is not defined!
#endif

	.section SECTION(.text),"ax",@progbits
#if defined SHARED && IS_IN (libc)
ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned))
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned))
#endif

ENTRY (MEMPCPY_SYMBOL (__mempcpy, unaligned))
	mov	%RDI_LP, %RAX_LP
	add	%RDX_LP, %RAX_LP
	jmp	L(start)
END (MEMPCPY_SYMBOL (__mempcpy, unaligned))
#if defined SHARED && IS_IN (libc)
ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned))
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned))
#endif

ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned))
	movq	%rdi, %rax
L(start):
#ifdef __ILP32__
	/* Clear the upper 32 bits.  */
	movl	%edx, %edx
#endif
	cmp	$VEC_SIZE, %RDX_LP
	jb	L(less_vec)
	cmp	$(VEC_SIZE * 2), %RDX_LP
	ja	L(more_2x_vec)
#if !defined USE_MULTIARCH || !IS_IN (libc)
L(last_2x_vec):
#endif
	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
	VMOVU	(%rsi), %VEC(0)
	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(1)
	VMOVU	%VEC(0), (%rdi)
	VMOVU	%VEC(1), -VEC_SIZE(%rdi,%rdx)
	VZEROUPPER
#if !defined USE_MULTIARCH || !IS_IN (libc)
L(nop):
#endif
	ret
#if defined USE_MULTIARCH && IS_IN (libc)
END (MEMMOVE_SYMBOL (__memmove, unaligned))
ENTRY (__mempcpy_chk_erms)
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (__mempcpy_chk_erms)

/* Only used to measure performance of REP MOVSB.  */
ENTRY (__mempcpy_erms)
	mov	%RDI_LP, %RAX_LP
	/* Skip zero length.  */
	test	%RDX_LP, %RDX_LP
	jz	2f
	add	%RDX_LP, %RAX_LP
	jmp	L(start_movsb)
END (__mempcpy_erms)

ENTRY (__memmove_chk_erms)
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (__memmove_chk_erms)

ENTRY (__memmove_erms)
	movq	%rdi, %rax
	/* Skip zero length.  */
	test	%RDX_LP, %RDX_LP
	jz	2f
L(start_movsb):
	mov	%RDX_LP, %RCX_LP
	cmp	%RSI_LP, %RDI_LP
	jb	1f
	/* Source == destination is less common.  */
	je	2f
	lea	(%rsi,%rcx), %RDX_LP
	cmp	%RDX_LP, %RDI_LP
	jb	L(movsb_backward)
1:
	rep movsb
2:
	ret
L(movsb_backward):
	leaq	-1(%rdi,%rcx), %rdi
	leaq	-1(%rsi,%rcx), %rsi
	std
	rep movsb
	cld
	ret
END (__memmove_erms)
strong_alias (__memmove_erms, __memcpy_erms)
strong_alias (__memmove_chk_erms, __memcpy_chk_erms)
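/* Illustration (not part of the original source): __memmove_erms above
   copies backward (std; rep movsb; cld) only when the destination
   starts inside the source, i.e. src < dst < src + len; for every
   other layout a forward "rep movsb" is already overlap-safe.  */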
# ifdef SHARED
ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms))
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms))
# endif

ENTRY (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms))
	mov	%RDI_LP, %RAX_LP
	add	%RDX_LP, %RAX_LP
	jmp	L(start_erms)
END (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms))

# ifdef SHARED
ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
# endif

ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
	movq	%rdi, %rax
L(start_erms):
# ifdef __ILP32__
	/* Clear the upper 32 bits.  */
	movl	%edx, %edx
# endif
	cmp	$VEC_SIZE, %RDX_LP
	jb	L(less_vec)
	cmp	$(VEC_SIZE * 2), %RDX_LP
	ja	L(movsb_more_2x_vec)
L(last_2x_vec):
	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
	VMOVU	(%rsi), %VEC(0)
	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(1)
	VMOVU	%VEC(0), (%rdi)
	VMOVU	%VEC(1), -VEC_SIZE(%rdi,%rdx)
	VZEROUPPER
	ret

L(movsb):
	cmpq	__x86_shared_non_temporal_threshold(%rip), %rdx
	jae	L(more_8x_vec)
	cmpq	%rsi, %rdi
	jb	1f
	/* Source == destination is less common.  */
	je	L(nop)
	leaq	(%rsi,%rdx), %r9
	cmpq	%r9, %rdi
	/* Avoid slow backward REP MOVSB.  */
# if REP_MOVSB_THRESHOLD <= (VEC_SIZE * 8)
#  error Unsupported REP_MOVSB_THRESHOLD and VEC_SIZE!
# endif
	jb	L(more_8x_vec_backward)
# if AVOID_SHORT_DISTANCE_REP_MOVSB
	movq	%rdi, %rcx
	subq	%rsi, %rcx
	jmp	2f
# endif
1:
# if AVOID_SHORT_DISTANCE_REP_MOVSB
	movq	%rsi, %rcx
	subq	%rdi, %rcx
2:
	/* Avoid "rep movsb" if RCX, the distance between source and
	   destination, is N*4GB + [1..63] with N >= 0.  */
	cmpl	$63, %ecx
	jbe	L(more_2x_vec)	/* Avoid "rep movsb" if ECX <= 63.  */
# endif
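/* Illustration (not part of the original source): with %rsi == %rdi + 32
   the code above reaches label 1 and computes RCX = 32, so the jbe
   falls back to the vector copy at L(more_2x_vec); "rep movsb" performs
   poorly when source and destination are that close together.  Since
   only ECX is tested, a distance of N*4GB + 32 takes the same
   fallback.  */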
	mov	%RDX_LP, %RCX_LP
	rep movsb
L(nop):
	ret
#endif

L(less_vec):
	/* Less than 1 VEC.  */
#if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
# error Unsupported VEC_SIZE!
#endif
#if VEC_SIZE > 32
	cmpb	$32, %dl
	jae	L(between_32_63)
#endif
#if VEC_SIZE > 16
	cmpb	$16, %dl
	jae	L(between_16_31)
#endif
	cmpb	$8, %dl
	jae	L(between_8_15)
	cmpb	$4, %dl
	jae	L(between_4_7)
	cmpb	$1, %dl
	ja	L(between_2_3)
	jb	1f
	/* Copy 1 byte.  */
	movzbl	(%rsi), %ecx
	movb	%cl, (%rdi)
1:
	ret
#if VEC_SIZE > 32
L(between_32_63):
	/* From 32 to 63.  No branch when size == 32.  */
	vmovdqu	(%rsi), %ymm0
	vmovdqu	-32(%rsi,%rdx), %ymm1
	vmovdqu	%ymm0, (%rdi)
	vmovdqu	%ymm1, -32(%rdi,%rdx)
	VZEROUPPER
	ret
#endif
#if VEC_SIZE > 16
L(between_16_31):
	/* From 16 to 31.  No branch when size == 16.  */
	vmovdqu	(%rsi), %xmm0
	vmovdqu	-16(%rsi,%rdx), %xmm1
	vmovdqu	%xmm0, (%rdi)
	vmovdqu	%xmm1, -16(%rdi,%rdx)
	ret
#endif
L(between_8_15):
	/* From 8 to 15.  No branch when size == 8.  */
	movq	-8(%rsi,%rdx), %rcx
	movq	(%rsi), %rsi
	movq	%rcx, -8(%rdi,%rdx)
	movq	%rsi, (%rdi)
	ret
L(between_4_7):
	/* From 4 to 7.  No branch when size == 4.  */
	movl	-4(%rsi,%rdx), %ecx
	movl	(%rsi), %esi
	movl	%ecx, -4(%rdi,%rdx)
	movl	%esi, (%rdi)
	ret
L(between_2_3):
	/* From 2 to 3.  No branch when size == 2.  */
	movzwl	-2(%rsi,%rdx), %ecx
	movzwl	(%rsi), %esi
	movw	%cx, -2(%rdi,%rdx)
	movw	%si, (%rdi)
	ret
#if defined USE_MULTIARCH && IS_IN (libc)
L(movsb_more_2x_vec):
	cmpq	$REP_MOVSB_THRESHOLD, %rdx
	ja	L(movsb)
#endif
L(more_2x_vec):
	/* More than 2 * VEC and there may be overlap between
	   destination and source.  */
	cmpq	$(VEC_SIZE * 8), %rdx
	ja	L(more_8x_vec)
	cmpq	$(VEC_SIZE * 4), %rdx
	jb	L(last_4x_vec)
	/* Copy from 4 * VEC to 8 * VEC, inclusively.  */
	VMOVU	(%rsi), %VEC(0)
	VMOVU	VEC_SIZE(%rsi), %VEC(1)
	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(4)
	VMOVU	-(VEC_SIZE * 2)(%rsi,%rdx), %VEC(5)
	VMOVU	-(VEC_SIZE * 3)(%rsi,%rdx), %VEC(6)
	VMOVU	-(VEC_SIZE * 4)(%rsi,%rdx), %VEC(7)
	VMOVU	%VEC(0), (%rdi)
	VMOVU	%VEC(1), VEC_SIZE(%rdi)
	VMOVU	%VEC(2), (VEC_SIZE * 2)(%rdi)
	VMOVU	%VEC(3), (VEC_SIZE * 3)(%rdi)
	VMOVU	%VEC(4), -VEC_SIZE(%rdi,%rdx)
	VMOVU	%VEC(5), -(VEC_SIZE * 2)(%rdi,%rdx)
	VMOVU	%VEC(6), -(VEC_SIZE * 3)(%rdi,%rdx)
	VMOVU	%VEC(7), -(VEC_SIZE * 4)(%rdi,%rdx)
	VZEROUPPER
	ret
L(last_4x_vec):
	/* Copy from 2 * VEC to 4 * VEC.  */
	VMOVU	(%rsi), %VEC(0)
	VMOVU	VEC_SIZE(%rsi), %VEC(1)
	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(2)
	VMOVU	-(VEC_SIZE * 2)(%rsi,%rdx), %VEC(3)
	VMOVU	%VEC(0), (%rdi)
	VMOVU	%VEC(1), VEC_SIZE(%rdi)
	VMOVU	%VEC(2), -VEC_SIZE(%rdi,%rdx)
	VMOVU	%VEC(3), -(VEC_SIZE * 2)(%rdi,%rdx)
	VZEROUPPER
	ret

L(more_8x_vec):
	cmpq	%rsi, %rdi
	ja	L(more_8x_vec_backward)
	/* Source == destination is less common.  */
	je	L(nop)
	/* Load the first VEC and last 4 * VEC to support overlapping
	   addresses.  */
	VMOVU	(%rsi), %VEC(4)
	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(5)
	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VEC(6)
	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx), %VEC(7)
	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx), %VEC(8)
	/* Save start and stop of the destination buffer.  */
	movq	%rdi, %r11
	leaq	-VEC_SIZE(%rdi, %rdx), %rcx
	/* Align destination for aligned stores in the loop.  Compute
	   how much destination is misaligned.  */
	movq	%rdi, %r8
	andq	$(VEC_SIZE - 1), %r8
	/* Get the negative of offset for alignment.  */
	subq	$VEC_SIZE, %r8
	/* Adjust source.  */
	subq	%r8, %rsi
	/* Adjust destination which should be aligned now.  */
	subq	%r8, %rdi
	/* Adjust length.  */
	addq	%r8, %rdx
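/* Illustration (not part of the original source): with VEC_SIZE == 32
   and %rdi == 0x1005, %r8 is first 0x1005 & 31 == 5 and then
   5 - 32 == -27, so the adjustments above advance %rsi and %rdi by 27
   bytes to the 32-byte boundary 0x1020 and shrink %rdx by the same 27
   bytes.  The skipped head was already loaded into %VEC(4) and is
   stored to (%r11) after the loop.  */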
#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
	/* Check non-temporal store threshold.  */
	cmpq	__x86_shared_non_temporal_threshold(%rip), %rdx
	ja	L(large_forward)
#endif
L(loop_4x_vec_forward):
	/* Copy 4 * VEC at a time forward.  */
	VMOVU	(%rsi), %VEC(0)
	VMOVU	VEC_SIZE(%rsi), %VEC(1)
	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
	addq	$(VEC_SIZE * 4), %rsi
	subq	$(VEC_SIZE * 4), %rdx
	VMOVA	%VEC(0), (%rdi)
	VMOVA	%VEC(1), VEC_SIZE(%rdi)
	VMOVA	%VEC(2), (VEC_SIZE * 2)(%rdi)
	VMOVA	%VEC(3), (VEC_SIZE * 3)(%rdi)
	addq	$(VEC_SIZE * 4), %rdi
	cmpq	$(VEC_SIZE * 4), %rdx
	ja	L(loop_4x_vec_forward)
	/* Store the last 4 * VEC.  */
	VMOVU	%VEC(5), (%rcx)
	VMOVU	%VEC(6), -VEC_SIZE(%rcx)
	VMOVU	%VEC(7), -(VEC_SIZE * 2)(%rcx)
	VMOVU	%VEC(8), -(VEC_SIZE * 3)(%rcx)
	/* Store the first VEC.  */
	VMOVU	%VEC(4), (%r11)
	VZEROUPPER
	ret
L(more_8x_vec_backward):
	/* Load the first 4 * VEC and last VEC to support overlapping
	   addresses.  */
	VMOVU	(%rsi), %VEC(4)
	VMOVU	VEC_SIZE(%rsi), %VEC(5)
	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(6)
	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(7)
	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(8)
	/* Save stop of the destination buffer.  */
	leaq	-VEC_SIZE(%rdi, %rdx), %r11
	/* Align destination end for aligned stores in the loop.  Compute
	   how much destination end is misaligned.  */
	leaq	-VEC_SIZE(%rsi, %rdx), %rcx
	movq	%r11, %r9
	movq	%r11, %r8
	andq	$(VEC_SIZE - 1), %r8
	/* Adjust source.  */
	subq	%r8, %rcx
	/* Adjust the end of destination which should be aligned now.  */
	subq	%r8, %r9
	/* Adjust length.  */
	subq	%r8, %rdx
#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
	/* Check non-temporal store threshold.  */
	cmpq	__x86_shared_non_temporal_threshold(%rip), %rdx
	ja	L(large_backward)
#endif
L(loop_4x_vec_backward):
	/* Copy 4 * VEC at a time backward.  */
	VMOVU	(%rcx), %VEC(0)
	VMOVU	-VEC_SIZE(%rcx), %VEC(1)
	VMOVU	-(VEC_SIZE * 2)(%rcx), %VEC(2)
	VMOVU	-(VEC_SIZE * 3)(%rcx), %VEC(3)
	subq	$(VEC_SIZE * 4), %rcx
	subq	$(VEC_SIZE * 4), %rdx
	VMOVA	%VEC(0), (%r9)
	VMOVA	%VEC(1), -VEC_SIZE(%r9)
	VMOVA	%VEC(2), -(VEC_SIZE * 2)(%r9)
	VMOVA	%VEC(3), -(VEC_SIZE * 3)(%r9)
	subq	$(VEC_SIZE * 4), %r9
	cmpq	$(VEC_SIZE * 4), %rdx
	ja	L(loop_4x_vec_backward)
	/* Store the first 4 * VEC.  */
	VMOVU	%VEC(4), (%rdi)
	VMOVU	%VEC(5), VEC_SIZE(%rdi)
	VMOVU	%VEC(6), (VEC_SIZE * 2)(%rdi)
	VMOVU	%VEC(7), (VEC_SIZE * 3)(%rdi)
	/* Store the last VEC.  */
	VMOVU	%VEC(8), (%r11)
	VZEROUPPER
	ret

#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
L(large_forward):
	/* Don't use non-temporal store if there is overlap between
	   destination and source since destination may be in cache
	   when source is loaded.  */
	leaq	(%rdi, %rdx), %r10
	cmpq	%r10, %rsi
	jb	L(loop_4x_vec_forward)
L(loop_large_forward):
	/* Copy 4 * VEC at a time forward with non-temporal stores.  */
	PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 2)
	PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 3)
	VMOVU	(%rsi), %VEC(0)
	VMOVU	VEC_SIZE(%rsi), %VEC(1)
	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
	addq	$PREFETCHED_LOAD_SIZE, %rsi
	subq	$PREFETCHED_LOAD_SIZE, %rdx
	VMOVNT	%VEC(0), (%rdi)
	VMOVNT	%VEC(1), VEC_SIZE(%rdi)
	VMOVNT	%VEC(2), (VEC_SIZE * 2)(%rdi)
	VMOVNT	%VEC(3), (VEC_SIZE * 3)(%rdi)
	addq	$PREFETCHED_LOAD_SIZE, %rdi
	cmpq	$PREFETCHED_LOAD_SIZE, %rdx
	ja	L(loop_large_forward)
	sfence
	/* Store the last 4 * VEC.  */
	VMOVU	%VEC(5), (%rcx)
	VMOVU	%VEC(6), -VEC_SIZE(%rcx)
	VMOVU	%VEC(7), -(VEC_SIZE * 2)(%rcx)
	VMOVU	%VEC(8), -(VEC_SIZE * 3)(%rcx)
	/* Store the first VEC.  */
	VMOVU	%VEC(4), (%r11)
	VZEROUPPER
	ret

L(large_backward):
	/* Don't use non-temporal store if there is overlap between
	   destination and source since destination may be in cache
	   when source is loaded.  */
	leaq	(%rcx, %rdx), %r10
	cmpq	%r10, %r9
	jb	L(loop_4x_vec_backward)
L(loop_large_backward):
	/* Copy 4 * VEC at a time backward with non-temporal stores.  */
	PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 2)
	PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 3)
	VMOVU	(%rcx), %VEC(0)
	VMOVU	-VEC_SIZE(%rcx), %VEC(1)
	VMOVU	-(VEC_SIZE * 2)(%rcx), %VEC(2)
	VMOVU	-(VEC_SIZE * 3)(%rcx), %VEC(3)
	subq	$PREFETCHED_LOAD_SIZE, %rcx
	subq	$PREFETCHED_LOAD_SIZE, %rdx
	VMOVNT	%VEC(0), (%r9)
	VMOVNT	%VEC(1), -VEC_SIZE(%r9)
	VMOVNT	%VEC(2), -(VEC_SIZE * 2)(%r9)
	VMOVNT	%VEC(3), -(VEC_SIZE * 3)(%r9)
	subq	$PREFETCHED_LOAD_SIZE, %r9
	cmpq	$PREFETCHED_LOAD_SIZE, %rdx
	ja	L(loop_large_backward)
	sfence
	/* Store the first 4 * VEC.  */
	VMOVU	%VEC(4), (%rdi)
	VMOVU	%VEC(5), VEC_SIZE(%rdi)
	VMOVU	%VEC(6), (VEC_SIZE * 2)(%rdi)
	VMOVU	%VEC(7), (VEC_SIZE * 3)(%rdi)
	/* Store the last VEC.  */
	VMOVU	%VEC(8), (%r11)
	VZEROUPPER
	ret
#endif
END (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
#if IS_IN (libc)
# ifdef USE_MULTIARCH
strong_alias (MEMMOVE_SYMBOL (__memmove, unaligned_erms),
	      MEMMOVE_SYMBOL (__memcpy, unaligned_erms))
#  ifdef SHARED
strong_alias (MEMMOVE_SYMBOL (__memmove_chk, unaligned_erms),
	      MEMMOVE_SYMBOL (__memcpy_chk, unaligned_erms))
#  endif
# endif
# ifdef SHARED
strong_alias (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned),
	      MEMMOVE_CHK_SYMBOL (__memcpy_chk, unaligned))
# endif
#endif

strong_alias (MEMMOVE_SYMBOL (__memmove, unaligned),
	      MEMCPY_SYMBOL (__memcpy, unaligned))