/* memmove/memcpy/mempcpy with unaligned load/store and rep movsb
   Copyright (C) 2016-2019 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */
/* memmove/memcpy/mempcpy is implemented as:
   1. Use overlapping load and store to avoid branches.
   2. Load all sources into registers and store them together to avoid
      possible address overlap between source and destination.
   3. If size is 8 * VEC_SIZE or less, load all sources into registers
      and store them together.
   4. If address of destination > address of source, backward copy
      4 * VEC_SIZE at a time with unaligned load and aligned store.
      Load the first 4 * VEC and last VEC before the loop and store
      them after the loop to support overlapping addresses.
   5. Otherwise, forward copy 4 * VEC_SIZE at a time with unaligned
      load and aligned store.  Load the last 4 * VEC and first VEC
      before the loop and store them after the loop to support
      overlapping addresses.
   6. If size >= __x86_shared_non_temporal_threshold and there is no
      overlap between destination and source, use non-temporal store
      instead of aligned store.  */
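/* Worked example of the dispatch above (illustrative, not part of the
   original source), assuming VEC_SIZE == 32 (AVX):
     - n = 50:  case 1, two possibly overlapping 32-byte loads/stores.
     - n = 200: case 3, the whole copy fits in eight vector registers.
     - n = 64 KiB with the destination above an overlapping source:
       case 4, backward 4 * VEC loop with aligned stores.
     - n = 64 KiB otherwise: case 5, forward 4 * VEC loop.
     - n >= __x86_shared_non_temporal_threshold without overlap:
       case 6, the same loops but with non-temporal stores.  */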
# define MEMCPY_SYMBOL(p,s) MEMMOVE_SYMBOL(p, s)
#ifndef MEMPCPY_SYMBOL
# define MEMPCPY_SYMBOL(p,s) MEMMOVE_SYMBOL(p, s)
#ifndef MEMMOVE_CHK_SYMBOL
# define MEMMOVE_CHK_SYMBOL(p,s) MEMMOVE_SYMBOL(p, s)
# define VZEROUPPER vzeroupper
/* Threshold to use Enhanced REP MOVSB.  Since there is overhead to set
   up a REP MOVSB operation, REP MOVSB isn't faster on short data.  The
   memcpy micro benchmark in glibc shows that 2KB is the approximate
   value above which REP MOVSB becomes faster than SSE2 optimization
   on processors with Enhanced REP MOVSB.  Since a larger register size
   can move more data with a single load and store, the threshold is
   higher with a larger register size.  */
#ifndef REP_MOVSB_THRESHOLD
# define REP_MOVSB_THRESHOLD (2048 * (VEC_SIZE / 16))
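/* For illustration, the default works out to 2048 bytes for
   VEC_SIZE == 16 (SSE2), 4096 bytes for VEC_SIZE == 32 (AVX) and
   8192 bytes for VEC_SIZE == 64 (AVX-512).  */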
# define PREFETCH(addr) prefetcht0 addr
/* Assume 64-byte prefetch size.  */
# define PREFETCH_SIZE 64
#define PREFETCHED_LOAD_SIZE (VEC_SIZE * 4)
#if PREFETCH_SIZE == 64
# if PREFETCHED_LOAD_SIZE == PREFETCH_SIZE
#  define PREFETCH_ONE_SET(dir, base, offset) \
        PREFETCH ((offset)base)
# elif PREFETCHED_LOAD_SIZE == 2 * PREFETCH_SIZE
#  define PREFETCH_ONE_SET(dir, base, offset) \
        PREFETCH ((offset)base); \
        PREFETCH ((offset + dir * PREFETCH_SIZE)base)
# elif PREFETCHED_LOAD_SIZE == 4 * PREFETCH_SIZE
#  define PREFETCH_ONE_SET(dir, base, offset) \
        PREFETCH ((offset)base); \
        PREFETCH ((offset + dir * PREFETCH_SIZE)base); \
        PREFETCH ((offset + dir * PREFETCH_SIZE * 2)base); \
        PREFETCH ((offset + dir * PREFETCH_SIZE * 3)base)
#  error Unsupported PREFETCHED_LOAD_SIZE!
# error Unsupported PREFETCH_SIZE!
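/* Worked example of PREFETCH_ONE_SET (illustrative): with
   VEC_SIZE == 16, PREFETCHED_LOAD_SIZE is 64 == PREFETCH_SIZE, so one
   prefetcht0 is issued; with VEC_SIZE == 32 it is 128 == 2 * 64, so two
   prefetches 64 bytes apart; with VEC_SIZE == 64 it is 256, so four
   prefetches covering one full 4 * VEC iteration of the copy loop.  */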
# error SECTION is not defined!
        .section SECTION(.text),"ax",@progbits
#if defined SHARED && IS_IN (libc)
ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned))
        jb HIDDEN_JUMPTARGET (__chk_fail)
END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned))
ENTRY (MEMPCPY_SYMBOL (__mempcpy, unaligned))
END (MEMPCPY_SYMBOL (__mempcpy, unaligned))
#if defined SHARED && IS_IN (libc)
ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned))
        jb HIDDEN_JUMPTARGET (__chk_fail)
END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned))
ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned))
        /* Clear the upper 32 bits.  */
        cmp $VEC_SIZE, %RDX_LP
        cmp $(VEC_SIZE * 2), %RDX_LP
#if !defined USE_MULTIARCH || !IS_IN (libc)
        /* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
        VMOVU (%rsi), %VEC(0)
        VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(1)
        VMOVU %VEC(0), (%rdi)
        VMOVU %VEC(1), -VEC_SIZE(%rdi,%rdx)
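/* Illustrative sketch, not part of the original source: the same
   overlapping-vector trick written in C with AVX2 intrinsics, assuming
   VEC_SIZE == 32.  Both loads are issued before either store, so an
   overlapping src/dst pair still copies correctly; when n == VEC_SIZE
   the two accesses simply coincide, which is why no extra branch is
   needed.  The function name is made up for the example.

     #include <immintrin.h>
     #include <stddef.h>

     static void
     copy_vec_to_2x_vec (char *dst, const char *src, size_t n)
     {
       // Caller guarantees 32 <= n <= 64.
       __m256i head = _mm256_loadu_si256 ((const __m256i *) src);
       __m256i tail = _mm256_loadu_si256 ((const __m256i *) (src + n - 32));
       _mm256_storeu_si256 ((__m256i *) dst, head);
       _mm256_storeu_si256 ((__m256i *) (dst + n - 32), tail);
     }
 */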
#if !defined USE_MULTIARCH || !IS_IN (libc)
#if defined USE_MULTIARCH && IS_IN (libc)
END (MEMMOVE_SYMBOL (__memmove, unaligned))
ENTRY (__mempcpy_chk_erms)
        jb HIDDEN_JUMPTARGET (__chk_fail)
END (__mempcpy_chk_erms)
/* Only used to measure performance of REP MOVSB.  */
ENTRY (__mempcpy_erms)
        /* Skip zero length.  */
        test %RDX_LP, %RDX_LP
ENTRY (__memmove_chk_erms)
        jb HIDDEN_JUMPTARGET (__chk_fail)
END (__memmove_chk_erms)
ENTRY (__memmove_erms)
        /* Skip zero length.  */
        test %RDX_LP, %RDX_LP
        /* Source == destination is less common.  */
        lea (%rsi,%rcx), %RDX_LP
        leaq -1(%rdi,%rcx), %rdi
        leaq -1(%rsi,%rcx), %rsi
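/* Illustrative sketch, not part of the original source: what the ERMS
   variant boils down to in C with GCC/Clang inline assembly.  Only the
   backward pointer setup (the two leaq above) is visible here; the
   rep movsb and direction-flag handling are assumptions, and the
   function name is made up.

     #include <stddef.h>

     static void
     erms_memmove (char *dst, const char *src, size_t n)
     {
       if (n == 0 || dst == src)
         return;
       if (dst < src || dst >= src + n)
         {
           // Forward copy: one REP MOVSB does the whole job.
           asm volatile ("rep movsb"
                         : "+D" (dst), "+S" (src), "+c" (n) :: "memory");
         }
       else
         {
           // Overlapping with dst above src: point at the last byte and
           // copy downwards with the direction flag set.
           dst += n - 1;
           src += n - 1;
           asm volatile ("std; rep movsb; cld"
                         : "+D" (dst), "+S" (src), "+c" (n) :: "memory");
         }
     }
 */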
strong_alias (__memmove_erms, __memcpy_erms)
strong_alias (__memmove_chk_erms, __memcpy_chk_erms)
ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms))
        jb HIDDEN_JUMPTARGET (__chk_fail)
END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms))
ENTRY (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms))
END (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms))
ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
        jb HIDDEN_JUMPTARGET (__chk_fail)
END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
        /* Clear the upper 32 bits.  */
        cmp $VEC_SIZE, %RDX_LP
        cmp $(VEC_SIZE * 2), %RDX_LP
        ja L(movsb_more_2x_vec)
        /* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
        VMOVU (%rsi), %VEC(0)
        VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(1)
        VMOVU %VEC(0), (%rdi)
        VMOVU %VEC(1), -VEC_SIZE(%rdi,%rdx)
        cmpq __x86_shared_non_temporal_threshold(%rip), %rdx
        /* Source == destination is less common.  */
        leaq (%rsi,%rdx), %r9
        /* Avoid slow backward REP MOVSB.  */
# if REP_MOVSB_THRESHOLD <= (VEC_SIZE * 8)
#  error Unsupported REP_MOVSB_THRESHOLD and VEC_SIZE!
        jb L(more_8x_vec_backward)
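/* Illustrative sketch, not part of the original source: the conditions
   checked above before committing to REP MOVSB, written as a C
   predicate.  The parameter names are made up; the thresholds stand in
   for REP_MOVSB_THRESHOLD and __x86_shared_non_temporal_threshold.

     #include <stdbool.h>
     #include <stddef.h>

     static bool
     want_rep_movsb (const char *dst, const char *src, size_t n,
                     size_t rep_movsb_threshold,
                     size_t non_temporal_threshold)
     {
       if (n < rep_movsb_threshold)      // small: vector code wins
         return false;
       if (n >= non_temporal_threshold)  // huge: non-temporal path
         return false;
       // An overlapping copy with dst above src must run backward, and
       // backward REP MOVSB is slow, so use the vector backward loop.
       if (dst > src && dst < src + n)
         return false;
       return true;
     }
 */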
        /* Less than 1 VEC.  */
#if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
# error Unsupported VEC_SIZE!
        /* From 32 to 63.  No branch when size == 32.  */
        vmovdqu (%rsi), %ymm0
        vmovdqu -32(%rsi,%rdx), %ymm1
        vmovdqu %ymm0, (%rdi)
        vmovdqu %ymm1, -32(%rdi,%rdx)
        /* From 16 to 31.  No branch when size == 16.  */
        vmovdqu (%rsi), %xmm0
        vmovdqu -16(%rsi,%rdx), %xmm1
        vmovdqu %xmm0, (%rdi)
        vmovdqu %xmm1, -16(%rdi,%rdx)
        /* From 8 to 15.  No branch when size == 8.  */
        movq -8(%rsi,%rdx), %rcx
        movq %rcx, -8(%rdi,%rdx)
        /* From 4 to 7.  No branch when size == 4.  */
        movl -4(%rsi,%rdx), %ecx
        movl %ecx, -4(%rdi,%rdx)
        /* From 2 to 3.  No branch when size == 2.  */
        movzwl -2(%rsi,%rdx), %ecx
        movw %cx, -2(%rdi,%rdx)
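/* Illustrative sketch, not part of the original source: the 8-to-15
   byte case above in portable C.  Two possibly overlapping 8-byte
   loads happen before the two stores, so the copy is memmove-safe and
   branch-free for the whole range; the function name is made up.

     #include <stdint.h>
     #include <string.h>
     #include <stddef.h>

     static void
     copy_8_to_15 (char *dst, const char *src, size_t n)
     {
       uint64_t head, tail;            // caller guarantees 8 <= n <= 15
       memcpy (&head, src, 8);
       memcpy (&tail, src + n - 8, 8);
       memcpy (dst, &head, 8);
       memcpy (dst + n - 8, &tail, 8);
     }
 */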
#if defined USE_MULTIARCH && IS_IN (libc)
L(movsb_more_2x_vec):
        cmpq $REP_MOVSB_THRESHOLD, %rdx
        /* More than 2 * VEC and there may be overlap between
           destination and source.  */
        cmpq $(VEC_SIZE * 8), %rdx
        cmpq $(VEC_SIZE * 4), %rdx
        /* Copy from 4 * VEC to 8 * VEC, inclusively.  */
        VMOVU (%rsi), %VEC(0)
        VMOVU VEC_SIZE(%rsi), %VEC(1)
        VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2)
        VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3)
        VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(4)
        VMOVU -(VEC_SIZE * 2)(%rsi,%rdx), %VEC(5)
        VMOVU -(VEC_SIZE * 3)(%rsi,%rdx), %VEC(6)
        VMOVU -(VEC_SIZE * 4)(%rsi,%rdx), %VEC(7)
        VMOVU %VEC(0), (%rdi)
        VMOVU %VEC(1), VEC_SIZE(%rdi)
        VMOVU %VEC(2), (VEC_SIZE * 2)(%rdi)
        VMOVU %VEC(3), (VEC_SIZE * 3)(%rdi)
        VMOVU %VEC(4), -VEC_SIZE(%rdi,%rdx)
        VMOVU %VEC(5), -(VEC_SIZE * 2)(%rdi,%rdx)
        VMOVU %VEC(6), -(VEC_SIZE * 3)(%rdi,%rdx)
        VMOVU %VEC(7), -(VEC_SIZE * 4)(%rdi,%rdx)
        /* Copy from 2 * VEC to 4 * VEC.  */
        VMOVU (%rsi), %VEC(0)
        VMOVU VEC_SIZE(%rsi), %VEC(1)
        VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(2)
        VMOVU -(VEC_SIZE * 2)(%rsi,%rdx), %VEC(3)
        VMOVU %VEC(0), (%rdi)
        VMOVU %VEC(1), VEC_SIZE(%rdi)
        VMOVU %VEC(2), -VEC_SIZE(%rdi,%rdx)
        VMOVU %VEC(3), -(VEC_SIZE * 2)(%rdi,%rdx)
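/* Worked example for the 2 * VEC to 4 * VEC block above, assuming
   VEC_SIZE == 32 and n == 100: the loads read offsets 0, 32, 68 and 36
   of the source and the stores mirror them on the destination, so the
   two middle accesses overlap by 28 bytes.  Every byte of the 100 is
   covered, and because all four loads complete before any store, the
   overlapping bytes are simply rewritten with identical data even when
   source and destination themselves overlap.  */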
        ja L(more_8x_vec_backward)
        /* Source == destination is less common.  */
        /* Load the first VEC and last 4 * VEC to support overlapping
           addresses.  */
        VMOVU (%rsi), %VEC(4)
        VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(5)
        VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(6)
        VMOVU -(VEC_SIZE * 3)(%rsi, %rdx), %VEC(7)
        VMOVU -(VEC_SIZE * 4)(%rsi, %rdx), %VEC(8)
        /* Save start and stop of the destination buffer.  */
        leaq -VEC_SIZE(%rdi, %rdx), %rcx
        /* Align destination for aligned stores in the loop.  Compute
           how much destination is misaligned.  */
        andq $(VEC_SIZE - 1), %r8
        /* Get the negative of offset for alignment.  */
        /* Adjust destination which should be aligned now.  */
#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
        /* Check non-temporal store threshold.  */
        cmpq __x86_shared_non_temporal_threshold(%rip), %rdx
L(loop_4x_vec_forward):
        /* Copy 4 * VEC at a time forward.  */
        VMOVU (%rsi), %VEC(0)
        VMOVU VEC_SIZE(%rsi), %VEC(1)
        VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2)
        VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3)
        addq $(VEC_SIZE * 4), %rsi
        subq $(VEC_SIZE * 4), %rdx
        VMOVA %VEC(0), (%rdi)
        VMOVA %VEC(1), VEC_SIZE(%rdi)
        VMOVA %VEC(2), (VEC_SIZE * 2)(%rdi)
        VMOVA %VEC(3), (VEC_SIZE * 3)(%rdi)
        addq $(VEC_SIZE * 4), %rdi
        cmpq $(VEC_SIZE * 4), %rdx
        ja L(loop_4x_vec_forward)
        /* Store the last 4 * VEC.  */
        VMOVU %VEC(5), (%rcx)
        VMOVU %VEC(6), -VEC_SIZE(%rcx)
        VMOVU %VEC(7), -(VEC_SIZE * 2)(%rcx)
        VMOVU %VEC(8), -(VEC_SIZE * 3)(%rcx)
        /* Store the first VEC.  */
        VMOVU %VEC(4), (%r11)
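/* Illustrative sketch, not part of the original source: the forward
   8 * VEC-and-up path above in C with AVX2 intrinsics, assuming
   VEC_SIZE == 32 and n > 8 * 32.  It mirrors the structure: save the
   first VEC and the last 4 * VEC, bump the destination to the next
   32-byte boundary so the loop can use aligned stores, then finish the
   ragged head and tail from the saved registers.  The function name is
   made up and the register allocation obviously differs.

     #include <immintrin.h>
     #include <stddef.h>
     #include <stdint.h>

     static void
     forward_4x_vec (char *dst, const char *src, size_t n)
     {
       // Loaded up front so they can be stored after the loop; this
       // finishes the tail and the unaligned head without branches.
       __m256i head  = _mm256_loadu_si256 ((const __m256i *) src);
       __m256i tail0 = _mm256_loadu_si256 ((const __m256i *) (src + n - 32));
       __m256i tail1 = _mm256_loadu_si256 ((const __m256i *) (src + n - 64));
       __m256i tail2 = _mm256_loadu_si256 ((const __m256i *) (src + n - 96));
       __m256i tail3 = _mm256_loadu_si256 ((const __m256i *) (src + n - 128));
       char *start = dst;
       char *stop = dst + n - 32;
       // Advance dst to the next 32-byte boundary; adjust src and n so
       // the three stay in step (the negative-offset trick above).
       size_t skew = 32 - ((uintptr_t) dst & 31);
       dst += skew;
       src += skew;
       n -= skew;
       while (n > 4 * 32)
         {
           __m256i v0 = _mm256_loadu_si256 ((const __m256i *) src);
           __m256i v1 = _mm256_loadu_si256 ((const __m256i *) (src + 32));
           __m256i v2 = _mm256_loadu_si256 ((const __m256i *) (src + 64));
           __m256i v3 = _mm256_loadu_si256 ((const __m256i *) (src + 96));
           _mm256_store_si256 ((__m256i *) dst, v0);          // aligned
           _mm256_store_si256 ((__m256i *) (dst + 32), v1);
           _mm256_store_si256 ((__m256i *) (dst + 64), v2);
           _mm256_store_si256 ((__m256i *) (dst + 96), v3);
           src += 128;
           dst += 128;
           n -= 128;
         }
       // At most 4 * VEC remain; the pre-loaded tail covers them.
       _mm256_storeu_si256 ((__m256i *) stop, tail0);
       _mm256_storeu_si256 ((__m256i *) (stop - 32), tail1);
       _mm256_storeu_si256 ((__m256i *) (stop - 64), tail2);
       _mm256_storeu_si256 ((__m256i *) (stop - 96), tail3);
       _mm256_storeu_si256 ((__m256i *) start, head);
     }

   As in the assembly, the forward loop is only used when the
   destination is below the source or the buffers do not overlap, so
   its stores never clobber bytes that are still to be read.  */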
L(more_8x_vec_backward):
        /* Load the first 4 * VEC and last VEC to support overlapping
           addresses.  */
        VMOVU (%rsi), %VEC(4)
        VMOVU VEC_SIZE(%rsi), %VEC(5)
        VMOVU (VEC_SIZE * 2)(%rsi), %VEC(6)
        VMOVU (VEC_SIZE * 3)(%rsi), %VEC(7)
        VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(8)
        /* Save stop of the destination buffer.  */
        leaq -VEC_SIZE(%rdi, %rdx), %r11
        /* Align destination end for aligned stores in the loop.  Compute
           how much destination end is misaligned.  */
        leaq -VEC_SIZE(%rsi, %rdx), %rcx
        andq $(VEC_SIZE - 1), %r8
        /* Adjust the end of destination which should be aligned now.  */
#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
        /* Check non-temporal store threshold.  */
        cmpq __x86_shared_non_temporal_threshold(%rip), %rdx
L(loop_4x_vec_backward):
        /* Copy 4 * VEC at a time backward.  */
        VMOVU (%rcx), %VEC(0)
        VMOVU -VEC_SIZE(%rcx), %VEC(1)
        VMOVU -(VEC_SIZE * 2)(%rcx), %VEC(2)
        VMOVU -(VEC_SIZE * 3)(%rcx), %VEC(3)
        subq $(VEC_SIZE * 4), %rcx
        subq $(VEC_SIZE * 4), %rdx
        VMOVA %VEC(1), -VEC_SIZE(%r9)
        VMOVA %VEC(2), -(VEC_SIZE * 2)(%r9)
        VMOVA %VEC(3), -(VEC_SIZE * 3)(%r9)
        subq $(VEC_SIZE * 4), %r9
        cmpq $(VEC_SIZE * 4), %rdx
        ja L(loop_4x_vec_backward)
        /* Store the first 4 * VEC.  */
        VMOVU %VEC(4), (%rdi)
        VMOVU %VEC(5), VEC_SIZE(%rdi)
        VMOVU %VEC(6), (VEC_SIZE * 2)(%rdi)
        VMOVU %VEC(7), (VEC_SIZE * 3)(%rdi)
        /* Store the last VEC.  */
        VMOVU %VEC(8), (%r11)
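/* Worked example of the end alignment above, assuming VEC_SIZE == 32:
   if the last destination vector would start at address 0x13e5, it is
   misaligned by 0x13e5 & 31 = 5 bytes.  Backing the destination end,
   the source cursor and the remaining length off by those 5 bytes (the
   exact register moves are elided above) makes every VMOVA in the
   backward loop a 32-byte-aligned store, while the first 4 * VEC and
   last VEC saved before the loop still cover the bytes skipped at
   either end.  */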
#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
        /* Don't use non-temporal store if there is overlap between
           destination and source since destination may be in cache
           when source is loaded.  */
        leaq (%rdi, %rdx), %r10
        jb L(loop_4x_vec_forward)
L(loop_large_forward):
        /* Copy 4 * VEC at a time forward with non-temporal stores.  */
        PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 2)
        PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 3)
        VMOVU (%rsi), %VEC(0)
        VMOVU VEC_SIZE(%rsi), %VEC(1)
        VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2)
        VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3)
        addq $PREFETCHED_LOAD_SIZE, %rsi
        subq $PREFETCHED_LOAD_SIZE, %rdx
        VMOVNT %VEC(0), (%rdi)
        VMOVNT %VEC(1), VEC_SIZE(%rdi)
        VMOVNT %VEC(2), (VEC_SIZE * 2)(%rdi)
        VMOVNT %VEC(3), (VEC_SIZE * 3)(%rdi)
        addq $PREFETCHED_LOAD_SIZE, %rdi
        cmpq $PREFETCHED_LOAD_SIZE, %rdx
        ja L(loop_large_forward)
        /* Store the last 4 * VEC.  */
        VMOVU %VEC(5), (%rcx)
        VMOVU %VEC(6), -VEC_SIZE(%rcx)
        VMOVU %VEC(7), -(VEC_SIZE * 2)(%rcx)
        VMOVU %VEC(8), -(VEC_SIZE * 3)(%rcx)
        /* Store the first VEC.  */
        VMOVU %VEC(4), (%r11)
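/* Illustrative sketch, not part of the original source: the core of the
   non-temporal forward loop in C with AVX2 intrinsics, assuming
   VEC_SIZE == 32, a 32-byte-aligned destination and no overlap between
   the buffers.  The function name is made up, the tail is finished with
   ordinary stores from pre-loaded registers exactly as above, and the
   trailing sfence is part of the sketch: streaming stores are weakly
   ordered, so they are normally fenced before later code relies on the
   data.

     #include <immintrin.h>
     #include <stddef.h>

     static void
     stream_4x_vec (char *dst, const char *src, size_t n)
     {
       while (n > 4 * 32)
         {
           // Prefetch two and three iterations ahead, 64 bytes apart,
           // matching PREFETCH_ONE_SET for VEC_SIZE == 32.
           _mm_prefetch (src + 256, _MM_HINT_T0);
           _mm_prefetch (src + 320, _MM_HINT_T0);
           _mm_prefetch (src + 384, _MM_HINT_T0);
           _mm_prefetch (src + 448, _MM_HINT_T0);
           __m256i v0 = _mm256_loadu_si256 ((const __m256i *) src);
           __m256i v1 = _mm256_loadu_si256 ((const __m256i *) (src + 32));
           __m256i v2 = _mm256_loadu_si256 ((const __m256i *) (src + 64));
           __m256i v3 = _mm256_loadu_si256 ((const __m256i *) (src + 96));
           _mm256_stream_si256 ((__m256i *) dst, v0);       // bypass cache
           _mm256_stream_si256 ((__m256i *) (dst + 32), v1);
           _mm256_stream_si256 ((__m256i *) (dst + 64), v2);
           _mm256_stream_si256 ((__m256i *) (dst + 96), v3);
           src += 128;
           dst += 128;
           n -= 128;
         }
       _mm_sfence ();
       // The remaining (at most 4 * VEC) bytes are written with regular
       // stores from registers loaded before the loop, as in the code
       // above.
     }
 */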
        /* Don't use non-temporal store if there is overlap between
           destination and source since destination may be in cache
           when source is loaded.  */
        leaq (%rcx, %rdx), %r10
        jb L(loop_4x_vec_backward)
L(loop_large_backward):
        /* Copy 4 * VEC at a time backward with non-temporal stores.  */
        PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 2)
        PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 3)
        VMOVU (%rcx), %VEC(0)
        VMOVU -VEC_SIZE(%rcx), %VEC(1)
        VMOVU -(VEC_SIZE * 2)(%rcx), %VEC(2)
        VMOVU -(VEC_SIZE * 3)(%rcx), %VEC(3)
        subq $PREFETCHED_LOAD_SIZE, %rcx
        subq $PREFETCHED_LOAD_SIZE, %rdx
        VMOVNT %VEC(0), (%r9)
        VMOVNT %VEC(1), -VEC_SIZE(%r9)
        VMOVNT %VEC(2), -(VEC_SIZE * 2)(%r9)
        VMOVNT %VEC(3), -(VEC_SIZE * 3)(%r9)
        subq $PREFETCHED_LOAD_SIZE, %r9
        cmpq $PREFETCHED_LOAD_SIZE, %rdx
        ja L(loop_large_backward)
        /* Store the first 4 * VEC.  */
        VMOVU %VEC(4), (%rdi)
        VMOVU %VEC(5), VEC_SIZE(%rdi)
        VMOVU %VEC(6), (VEC_SIZE * 2)(%rdi)
        VMOVU %VEC(7), (VEC_SIZE * 3)(%rdi)
        /* Store the last VEC.  */
        VMOVU %VEC(8), (%r11)
END (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
# ifdef USE_MULTIARCH
strong_alias (MEMMOVE_SYMBOL (__memmove, unaligned_erms),
              MEMMOVE_SYMBOL (__memcpy, unaligned_erms))
strong_alias (MEMMOVE_SYMBOL (__memmove_chk, unaligned_erms),
              MEMMOVE_SYMBOL (__memcpy_chk, unaligned_erms))
strong_alias (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned),
              MEMMOVE_CHK_SYMBOL (__memcpy_chk, unaligned))
strong_alias (MEMMOVE_SYMBOL (__memmove, unaligned),
              MEMCPY_SYMBOL (__memcpy, unaligned))