/* memmove/memcpy/mempcpy with unaligned load/store and rep movsb
   Copyright (C) 2016-2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */
/* memmove/memcpy/mempcpy is implemented as:
   1. Use overlapping load and store to avoid branches.
   2. Load all sources into registers and store them together to avoid
      possible address overlap between source and destination.
   3. If size is 8 * VEC_SIZE or less, load all sources into registers
      and store them together.
   4. If the address of the destination is greater than the address of
      the source, copy backward 4 * VEC_SIZE at a time with unaligned
      load and aligned store.  Load the first 4 * VEC and the last VEC
      before the loop and store them after the loop to support
      overlapping addresses.
   5. Otherwise, copy forward 4 * VEC_SIZE at a time with unaligned
      load and aligned store.  Load the last 4 * VEC and the first VEC
      before the loop and store them after the loop to support
      overlapping addresses.
   6. If size >= __x86_shared_non_temporal_threshold and there is no
      overlap between destination and source, use non-temporal stores
      instead of aligned stores.  */
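
/* As a rough illustration of steps 1-3 above, the overlapping-copy
   trick looks like this in C (a sketch only; copy_vec_to_2x_vec is a
   hypothetical name and VEC_SIZE stands in for the vector width
   chosen by the including file):

       #include <string.h>

       #define VEC_SIZE 32   // assumed width for this sketch

       // Copy VEC_SIZE <= n <= 2 * VEC_SIZE bytes without branching
       // on n: load both ends of the source before storing both ends
       // of the destination, so an overlap cannot clobber unread data.
       static void
       copy_vec_to_2x_vec (char *dst, const char *src, size_t n)
       {
         unsigned char head[VEC_SIZE], tail[VEC_SIZE];
         memcpy (head, src, VEC_SIZE);                 // VMOVU (%rsi), %VEC(0)
         memcpy (tail, src + n - VEC_SIZE, VEC_SIZE);  // VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(1)
         memcpy (dst, head, VEC_SIZE);                 // VMOVU %VEC(0), (%rdi)
         memcpy (dst + n - VEC_SIZE, tail, VEC_SIZE);  // VMOVU %VEC(1), -VEC_SIZE(%rdi,%rdx)
       }

   The 4 * VEC and 8 * VEC cases below generalize the same idea to
   more registers.  */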
#ifndef MEMCPY_SYMBOL
# define MEMCPY_SYMBOL(p,s)	MEMMOVE_SYMBOL(p, s)
#endif

#ifndef MEMPCPY_SYMBOL
# define MEMPCPY_SYMBOL(p,s)	MEMMOVE_SYMBOL(p, s)
#endif

#ifndef MEMMOVE_CHK_SYMBOL
# define MEMMOVE_CHK_SYMBOL(p,s)	MEMMOVE_SYMBOL(p, s)
#endif
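
/* For example, an including file may define MEMMOVE_SYMBOL(p,s) as
   p##_avx_##s, in which case MEMMOVE_SYMBOL (__memmove, unaligned_erms)
   expands to __memmove_avx_unaligned_erms.  */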
#ifndef VZEROUPPER
# define VZEROUPPER vzeroupper
#endif

#ifndef PREFETCH
# define PREFETCH(addr) prefetcht0 addr
#endif

/* Assume 64-byte prefetch size.  */
#ifndef PREFETCH_SIZE
# define PREFETCH_SIZE 64
#endif

#define PREFETCHED_LOAD_SIZE (VEC_SIZE * 4)
#if PREFETCH_SIZE == 64
# if PREFETCHED_LOAD_SIZE == PREFETCH_SIZE
#  define PREFETCH_ONE_SET(dir, base, offset) \
	PREFETCH ((offset)base)
# elif PREFETCHED_LOAD_SIZE == 2 * PREFETCH_SIZE
#  define PREFETCH_ONE_SET(dir, base, offset) \
	PREFETCH ((offset)base); \
	PREFETCH ((offset + dir * PREFETCH_SIZE)base)
# elif PREFETCHED_LOAD_SIZE == 4 * PREFETCH_SIZE
#  define PREFETCH_ONE_SET(dir, base, offset) \
	PREFETCH ((offset)base); \
	PREFETCH ((offset + dir * PREFETCH_SIZE)base); \
	PREFETCH ((offset + dir * PREFETCH_SIZE * 2)base); \
	PREFETCH ((offset + dir * PREFETCH_SIZE * 3)base)
# else
#  error Unsupported PREFETCHED_LOAD_SIZE!
# endif
#else
# error Unsupported PREFETCH_SIZE!
#endif
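
/* For example, with PREFETCH_SIZE == 64 and VEC_SIZE == 64 (so
   PREFETCHED_LOAD_SIZE == 4 * PREFETCH_SIZE),
   PREFETCH_ONE_SET (1, (%rsi), 256) expands to

       prefetcht0 (256)(%rsi)
       prefetcht0 (256 + 64)(%rsi)
       prefetcht0 (256 + 64 * 2)(%rsi)
       prefetcht0 (256 + 64 * 3)(%rsi)

   one prefetch per 64-byte cache line of the next 4 * VEC block;
   dir == -1 walks the cache lines downward instead.  */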
#ifndef SECTION
# error SECTION is not defined!
#endif

	.section SECTION(.text),"ax",@progbits
#if defined SHARED && IS_IN (libc)
ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned))
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned))
#endif

ENTRY (MEMPCPY_SYMBOL (__mempcpy, unaligned))
	mov	%RDI_LP, %RAX_LP
	add	%RDX_LP, %RAX_LP
	jmp	L(start)
END (MEMPCPY_SYMBOL (__mempcpy, unaligned))
#if defined SHARED && IS_IN (libc)
ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned))
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned))
#endif

ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned))
	movq	%rdi, %rax
L(start):
#ifdef __ILP32__
	/* Clear the upper 32 bits.  */
	movl	%edx, %edx
#endif
	cmp	$VEC_SIZE, %RDX_LP
	jb	L(less_vec)
	cmp	$(VEC_SIZE * 2), %RDX_LP
	ja	L(more_2x_vec)
#if !defined USE_MULTIARCH || !IS_IN (libc)
L(last_2x_vec):
#endif
	/* From VEC_SIZE to 2 * VEC_SIZE bytes.  No branch when size ==
	   VEC_SIZE.  */
	VMOVU	(%rsi), %VEC(0)
	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(1)
	VMOVU	%VEC(0), (%rdi)
	VMOVU	%VEC(1), -VEC_SIZE(%rdi,%rdx)
	VZEROUPPER
#if !defined USE_MULTIARCH || !IS_IN (libc)
L(nop):
#endif
	ret
#if defined USE_MULTIARCH && IS_IN (libc)
END (MEMMOVE_SYMBOL (__memmove, unaligned))
# if VEC_SIZE == 16
ENTRY (__mempcpy_chk_erms)
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (__mempcpy_chk_erms)

/* Only used to measure performance of REP MOVSB.  */
ENTRY (__mempcpy_erms)
	mov	%RDI_LP, %RAX_LP
	/* Skip zero length.  */
	test	%RDX_LP, %RDX_LP
	jz	2f
	add	%RDX_LP, %RAX_LP
	jmp	L(start_movsb)
END (__mempcpy_erms)

ENTRY (__memmove_chk_erms)
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (__memmove_chk_erms)

ENTRY (__memmove_erms)
	movq	%rdi, %rax
	/* Skip zero length.  */
	test	%RDX_LP, %RDX_LP
	jz	2f
L(start_movsb):
	mov	%RDX_LP, %RCX_LP
	cmp	%RSI_LP, %RDI_LP
	jb	1f
	/* Source == destination is less common.  */
	je	2f
	lea	(%rsi,%rcx), %RDX_LP
	cmp	%RDX_LP, %RDI_LP
	jb	L(movsb_backward)
1:
	rep movsb
2:
	ret
L(movsb_backward):
	leaq	-1(%rdi,%rcx), %rdi
	leaq	-1(%rsi,%rcx), %rsi
	std
	rep movsb
	cld
	ret
END (__memmove_erms)
strong_alias (__memmove_erms, __memcpy_erms)
strong_alias (__memmove_chk_erms, __memcpy_chk_erms)
# endif
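
/* The ERMS variants above are thin wrappers around REP MOVSB, which
   copies RCX bytes from (RSI) to (RDI).  A hedged C sketch of the
   forward case (memmove_erms_forward is a hypothetical name; the real
   entry points also handle the overlapping backward case with
   STD/REP MOVSB/CLD as above):

       #include <stddef.h>

       static void *
       memmove_erms_forward (void *dst, const void *src, size_t n)
       {
         void *ret = dst;
         // REP MOVSB copies forward while the direction flag is
         // clear; the constraints place dst/src/n in RDI/RSI/RCX.
         __asm__ volatile ("rep movsb"
                           : "+D" (dst), "+S" (src), "+c" (n)
                           :: "memory");
         return ret;
       }
  */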
# ifdef SHARED
ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms))
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms))
# endif

ENTRY (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms))
	mov	%RDI_LP, %RAX_LP
	add	%RDX_LP, %RAX_LP
	jmp	L(start_erms)
END (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms))

# ifdef SHARED
ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
# endif
ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
	movq	%rdi, %rax
L(start_erms):
#ifdef __ILP32__
	/* Clear the upper 32 bits.  */
	movl	%edx, %edx
#endif
	cmp	$VEC_SIZE, %RDX_LP
	jb	L(less_vec)
	cmp	$(VEC_SIZE * 2), %RDX_LP
	ja	L(movsb_more_2x_vec)
L(last_2x_vec):
	/* From VEC_SIZE to 2 * VEC_SIZE bytes.  No branch when size ==
	   VEC_SIZE.  */
	VMOVU	(%rsi), %VEC(0)
	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(1)
	VMOVU	%VEC(0), (%rdi)
	VMOVU	%VEC(1), -VEC_SIZE(%rdi,%rdx)
L(return):
	VZEROUPPER
	ret

L(movsb):
	cmp	__x86_shared_non_temporal_threshold(%rip), %RDX_LP
	jae	L(more_8x_vec)
	cmpq	%rsi, %rdi
	jb	1f
	/* Source == destination is less common.  */
	je	L(nop)
	leaq	(%rsi,%rdx), %r9
	cmpq	%r9, %rdi
	/* Avoid slow backward REP MOVSB.  */
	jb	L(more_8x_vec_backward)
1:
	mov	%RDX_LP, %RCX_LP
	rep movsb
L(nop):
	ret
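
/* Putting the thresholds together, the unaligned_erms dispatch is
   roughly (a simplified C-style sketch; helper names are hypothetical
   and the real code interleaves these checks with the size buckets
   below):

       if (n <= 2 * VEC_SIZE)
         copy_with_two_overlapping_vecs ();    // L(last_2x_vec)
       else if (n <= __x86_rep_movsb_threshold)
         copy_4x_vec_loops ();                 // L(more_2x_vec)
       else if (n >= __x86_shared_non_temporal_threshold)
         copy_8x_vec_maybe_non_temporal ();    // L(more_8x_vec)
       else if (dst < src || dst >= src + n)
         rep_movsb ();                         // forward REP MOVSB
       else
         copy_4x_vec_backward ();              // avoid slow backward REP MOVSB
  */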
L(less_vec):
	/* Less than 1 VEC.  */
#if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
# error Unsupported VEC_SIZE!
#endif
#if VEC_SIZE > 32
	cmpb	$32, %dl
	jae	L(between_32_63)
#endif
#if VEC_SIZE > 16
	cmpb	$16, %dl
	jae	L(between_16_31)
#endif
	cmpb	$8, %dl
	jae	L(between_8_15)
	cmpb	$4, %dl
	jae	L(between_4_7)
	cmpb	$1, %dl
	ja	L(between_2_3)
	jb	1f
	movzbl	(%rsi), %ecx
	movb	%cl, (%rdi)
1:
	ret
#if VEC_SIZE > 32
L(between_32_63):
	/* From 32 to 63 bytes.  No branch when size == 32.  */
	vmovdqu	(%rsi), %ymm0
	vmovdqu	-32(%rsi,%rdx), %ymm1
	vmovdqu	%ymm0, (%rdi)
	vmovdqu	%ymm1, -32(%rdi,%rdx)
	VZEROUPPER
	ret
#endif
#if VEC_SIZE > 16
	/* From 16 to 31 bytes.  No branch when size == 16.  */
L(between_16_31):
	vmovdqu	(%rsi), %xmm0
	vmovdqu	-16(%rsi,%rdx), %xmm1
	vmovdqu	%xmm0, (%rdi)
	vmovdqu	%xmm1, -16(%rdi,%rdx)
	ret
#endif
L(between_8_15):
	/* From 8 to 15 bytes.  No branch when size == 8.  */
	movq	-8(%rsi,%rdx), %rcx
	movq	(%rsi), %rsi
	movq	%rcx, -8(%rdi,%rdx)
	movq	%rsi, (%rdi)
	ret
L(between_4_7):
	/* From 4 to 7 bytes.  No branch when size == 4.  */
	movl	-4(%rsi,%rdx), %ecx
	movl	(%rsi), %esi
	movl	%ecx, -4(%rdi,%rdx)
	movl	%esi, (%rdi)
	ret
L(between_2_3):
	/* From 2 to 3 bytes.  No branch when size == 2.  */
	movzwl	-2(%rsi,%rdx), %ecx
	movzwl	(%rsi), %esi
	movw	%cx, -2(%rdi,%rdx)
	movw	%si, (%rdi)
	ret
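
/* The small cases above reuse the two-ended trick with scalar
   registers.  A C sketch of L(between_4_7) (copy_4_to_7 is a
   hypothetical name):

       #include <string.h>

       // Copy 4 <= n <= 7 bytes with two possibly overlapping 4-byte
       // moves; no branch on n inside the range.
       static void
       copy_4_to_7 (char *dst, const char *src, size_t n)
       {
         unsigned int head, tail;
         memcpy (&tail, src + n - 4, 4);   // movl -4(%rsi,%rdx), %ecx
         memcpy (&head, src, 4);           // movl (%rsi), %esi
         memcpy (dst + n - 4, &tail, 4);   // movl %ecx, -4(%rdi,%rdx)
         memcpy (dst, &head, 4);           // movl %esi, (%rdi)
       }
  */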
#if defined USE_MULTIARCH && IS_IN (libc)
L(movsb_more_2x_vec):
	cmp	__x86_rep_movsb_threshold(%rip), %RDX_LP
	ja	L(movsb)
#endif
L(more_2x_vec):
	/* More than 2 * VEC and there may be overlap between
	   destination and source.  */
	cmpq	$(VEC_SIZE * 8), %rdx
	ja	L(more_8x_vec)
	cmpq	$(VEC_SIZE * 4), %rdx
	jb	L(last_4x_vec)
	/* Copy from 4 * VEC to 8 * VEC, inclusive.  */
	VMOVU	(%rsi), %VEC(0)
	VMOVU	VEC_SIZE(%rsi), %VEC(1)
	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(4)
	VMOVU	-(VEC_SIZE * 2)(%rsi,%rdx), %VEC(5)
	VMOVU	-(VEC_SIZE * 3)(%rsi,%rdx), %VEC(6)
	VMOVU	-(VEC_SIZE * 4)(%rsi,%rdx), %VEC(7)
	VMOVU	%VEC(0), (%rdi)
	VMOVU	%VEC(1), VEC_SIZE(%rdi)
	VMOVU	%VEC(2), (VEC_SIZE * 2)(%rdi)
	VMOVU	%VEC(3), (VEC_SIZE * 3)(%rdi)
	VMOVU	%VEC(4), -VEC_SIZE(%rdi,%rdx)
	VMOVU	%VEC(5), -(VEC_SIZE * 2)(%rdi,%rdx)
	VMOVU	%VEC(6), -(VEC_SIZE * 3)(%rdi,%rdx)
	VMOVU	%VEC(7), -(VEC_SIZE * 4)(%rdi,%rdx)
	VZEROUPPER
	ret
L(last_4x_vec):
	/* Copy from 2 * VEC to 4 * VEC, inclusive.  */
	VMOVU	(%rsi), %VEC(0)
	VMOVU	VEC_SIZE(%rsi), %VEC(1)
	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(2)
	VMOVU	-(VEC_SIZE * 2)(%rsi,%rdx), %VEC(3)
	VMOVU	%VEC(0), (%rdi)
	VMOVU	%VEC(1), VEC_SIZE(%rdi)
	VMOVU	%VEC(2), -VEC_SIZE(%rdi,%rdx)
	VMOVU	%VEC(3), -(VEC_SIZE * 2)(%rdi,%rdx)
	VZEROUPPER
	ret
L(more_8x_vec):
	cmpq	%rsi, %rdi
	ja	L(more_8x_vec_backward)
	/* Source == destination is less common.  */
	je	L(nop)
	/* Load the first VEC and last 4 * VEC to support overlapping
	   addresses.  */
	VMOVU	(%rsi), %VEC(4)
	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(5)
	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VEC(6)
	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx), %VEC(7)
	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx), %VEC(8)
	/* Save start and stop of the destination buffer.  */
	movq	%rdi, %r11
	leaq	-VEC_SIZE(%rdi, %rdx), %rcx
	/* Align destination for aligned stores in the loop.  Compute
	   how much destination is misaligned.  */
	movq	%rdi, %r8
	andq	$(VEC_SIZE - 1), %r8
	/* Get the negative of offset for alignment.  */
	subq	$VEC_SIZE, %r8
	/* Adjust source.  */
	subq	%r8, %rsi
	/* Adjust destination which should be aligned now.  */
	subq	%r8, %rdi
	/* Adjust length.  */
	addq	%r8, %rdx
#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
	/* Check non-temporal store threshold.  */
	cmp	__x86_shared_non_temporal_threshold(%rip), %RDX_LP
	ja	L(large_forward)
#endif
L(loop_4x_vec_forward):
	/* Copy 4 * VEC at a time forward.  */
	VMOVU	(%rsi), %VEC(0)
	VMOVU	VEC_SIZE(%rsi), %VEC(1)
	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
	addq	$(VEC_SIZE * 4), %rsi
	subq	$(VEC_SIZE * 4), %rdx
	VMOVA	%VEC(0), (%rdi)
	VMOVA	%VEC(1), VEC_SIZE(%rdi)
	VMOVA	%VEC(2), (VEC_SIZE * 2)(%rdi)
	VMOVA	%VEC(3), (VEC_SIZE * 3)(%rdi)
	addq	$(VEC_SIZE * 4), %rdi
	cmpq	$(VEC_SIZE * 4), %rdx
	ja	L(loop_4x_vec_forward)
	/* Store the last 4 * VEC.  */
	VMOVU	%VEC(5), (%rcx)
	VMOVU	%VEC(6), -VEC_SIZE(%rcx)
	VMOVU	%VEC(7), -(VEC_SIZE * 2)(%rcx)
	VMOVU	%VEC(8), -(VEC_SIZE * 3)(%rcx)
	/* Store the first VEC.  */
	VMOVU	%VEC(4), (%r11)
	VZEROUPPER
	ret
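
/* A C sketch of the forward loop above (forward_loop is a
   hypothetical name; VEC_SIZE is assumed to be 32 here).  The first
   VEC and the last 4 * VEC were loaded before the loop and are stored
   after it, which is what makes the unaligned head and tail safe for
   overlapping buffers.  The backward loop below mirrors this with all
   directions reversed:

       #include <string.h>

       #define VEC_SIZE 32   // assumed width for this sketch

       // dst has been rounded up to a VEC_SIZE boundary by the
       // alignment code above; src keeps whatever alignment it had.
       static void
       forward_loop (char *dst, const char *src, size_t n)
       {
         while (n > 4 * VEC_SIZE)
           {
             unsigned char buf[4 * VEC_SIZE];
             memcpy (buf, src, 4 * VEC_SIZE);   // 4 unaligned VMOVU loads
             memcpy (dst, buf, 4 * VEC_SIZE);   // 4 aligned VMOVA stores
             src += 4 * VEC_SIZE;
             dst += 4 * VEC_SIZE;
             n -= 4 * VEC_SIZE;
           }
       }
  */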
L(more_8x_vec_backward):
	/* Load the first 4 * VEC and last VEC to support overlapping
	   addresses.  */
	VMOVU	(%rsi), %VEC(4)
	VMOVU	VEC_SIZE(%rsi), %VEC(5)
	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(6)
	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(7)
	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(8)
	/* Save stop of the destination buffer.  */
	leaq	-VEC_SIZE(%rdi, %rdx), %r11
	/* Align destination end for aligned stores in the loop.  Compute
	   how much destination end is misaligned.  */
	leaq	-VEC_SIZE(%rsi, %rdx), %rcx
	movq	%r11, %r9
	movq	%r11, %r8
	andq	$(VEC_SIZE - 1), %r8
	/* Adjust source.  */
	subq	%r8, %rcx
	/* Adjust the end of destination which should be aligned now.  */
	subq	%r8, %r9
	/* Adjust length.  */
	subq	%r8, %rdx
#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
	/* Check non-temporal store threshold.  */
	cmp	__x86_shared_non_temporal_threshold(%rip), %RDX_LP
	ja	L(large_backward)
#endif
L(loop_4x_vec_backward):
	/* Copy 4 * VEC at a time backward.  */
	VMOVU	(%rcx), %VEC(0)
	VMOVU	-VEC_SIZE(%rcx), %VEC(1)
	VMOVU	-(VEC_SIZE * 2)(%rcx), %VEC(2)
	VMOVU	-(VEC_SIZE * 3)(%rcx), %VEC(3)
	subq	$(VEC_SIZE * 4), %rcx
	subq	$(VEC_SIZE * 4), %rdx
	VMOVA	%VEC(0), (%r9)
	VMOVA	%VEC(1), -VEC_SIZE(%r9)
	VMOVA	%VEC(2), -(VEC_SIZE * 2)(%r9)
	VMOVA	%VEC(3), -(VEC_SIZE * 3)(%r9)
	subq	$(VEC_SIZE * 4), %r9
	cmpq	$(VEC_SIZE * 4), %rdx
	ja	L(loop_4x_vec_backward)
	/* Store the first 4 * VEC.  */
	VMOVU	%VEC(4), (%rdi)
	VMOVU	%VEC(5), VEC_SIZE(%rdi)
	VMOVU	%VEC(6), (VEC_SIZE * 2)(%rdi)
	VMOVU	%VEC(7), (VEC_SIZE * 3)(%rdi)
	/* Store the last VEC.  */
	VMOVU	%VEC(8), (%r11)
	VZEROUPPER
	ret
#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
L(large_forward):
	/* Don't use non-temporal stores if there is overlap between
	   destination and source since destination may be in cache
	   when source is loaded.  */
	leaq	(%rdi, %rdx), %r10
	cmpq	%r10, %rsi
	jb	L(loop_4x_vec_forward)
L(loop_large_forward):
	/* Copy 4 * VEC at a time forward with non-temporal stores.  */
	PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 2)
	PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 3)
	VMOVU	(%rsi), %VEC(0)
	VMOVU	VEC_SIZE(%rsi), %VEC(1)
	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
	addq	$PREFETCHED_LOAD_SIZE, %rsi
	subq	$PREFETCHED_LOAD_SIZE, %rdx
	VMOVNT	%VEC(0), (%rdi)
	VMOVNT	%VEC(1), VEC_SIZE(%rdi)
	VMOVNT	%VEC(2), (VEC_SIZE * 2)(%rdi)
	VMOVNT	%VEC(3), (VEC_SIZE * 3)(%rdi)
	addq	$PREFETCHED_LOAD_SIZE, %rdi
	cmpq	$PREFETCHED_LOAD_SIZE, %rdx
	ja	L(loop_large_forward)

	/* Store the last 4 * VEC.  */
	VMOVU	%VEC(5), (%rcx)
	VMOVU	%VEC(6), -VEC_SIZE(%rcx)
	VMOVU	%VEC(7), -(VEC_SIZE * 2)(%rcx)
	VMOVU	%VEC(8), -(VEC_SIZE * 3)(%rcx)
	/* Store the first VEC.  */
	VMOVU	%VEC(4), (%r11)
	VZEROUPPER
	ret
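
/* A hedged intrinsics sketch of the non-temporal loop above, assuming
   VEC_SIZE == 32 (AVX); large_forward_nt is a hypothetical name.
   Streaming stores bypass the cache, which is why they are used only
   above __x86_shared_non_temporal_threshold and only when the buffers
   do not overlap:

       #include <immintrin.h>
       #include <stddef.h>

       // dst must be 32-byte aligned, as arranged by the alignment
       // code above; the saved head/tail VECs are stored separately.
       static void
       large_forward_nt (char *dst, const char *src, size_t n)
       {
         while (n > 128)
           {
             __m256i v0 = _mm256_loadu_si256 ((const __m256i *) src);
             __m256i v1 = _mm256_loadu_si256 ((const __m256i *) (src + 32));
             __m256i v2 = _mm256_loadu_si256 ((const __m256i *) (src + 64));
             __m256i v3 = _mm256_loadu_si256 ((const __m256i *) (src + 96));
             _mm256_stream_si256 ((__m256i *) dst, v0);
             _mm256_stream_si256 ((__m256i *) (dst + 32), v1);
             _mm256_stream_si256 ((__m256i *) (dst + 64), v2);
             _mm256_stream_si256 ((__m256i *) (dst + 96), v3);
             src += 128;
             dst += 128;
             n -= 128;
           }
         // Streaming stores are weakly ordered; fence before the copy
         // is observed as complete.
         _mm_sfence ();
       }
  */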
L(large_backward):
	/* Don't use non-temporal stores if there is overlap between
	   destination and source since destination may be in cache
	   when source is loaded.  */
	leaq	(%rcx, %rdx), %r10
	cmpq	%r10, %r9
	jb	L(loop_4x_vec_backward)
L(loop_large_backward):
	/* Copy 4 * VEC at a time backward with non-temporal stores.  */
	PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 2)
	PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 3)
	VMOVU	(%rcx), %VEC(0)
	VMOVU	-VEC_SIZE(%rcx), %VEC(1)
	VMOVU	-(VEC_SIZE * 2)(%rcx), %VEC(2)
	VMOVU	-(VEC_SIZE * 3)(%rcx), %VEC(3)
	subq	$PREFETCHED_LOAD_SIZE, %rcx
	subq	$PREFETCHED_LOAD_SIZE, %rdx
	VMOVNT	%VEC(0), (%r9)
	VMOVNT	%VEC(1), -VEC_SIZE(%r9)
	VMOVNT	%VEC(2), -(VEC_SIZE * 2)(%r9)
	VMOVNT	%VEC(3), -(VEC_SIZE * 3)(%r9)
	subq	$PREFETCHED_LOAD_SIZE, %r9
	cmpq	$PREFETCHED_LOAD_SIZE, %rdx
	ja	L(loop_large_backward)

	/* Store the first 4 * VEC.  */
	VMOVU	%VEC(4), (%rdi)
	VMOVU	%VEC(5), VEC_SIZE(%rdi)
	VMOVU	%VEC(6), (VEC_SIZE * 2)(%rdi)
	VMOVU	%VEC(7), (VEC_SIZE * 3)(%rdi)
	/* Store the last VEC.  */
	VMOVU	%VEC(8), (%r11)
	VZEROUPPER
	ret
#endif
END (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
#if IS_IN (libc)
# ifdef USE_MULTIARCH
strong_alias (MEMMOVE_SYMBOL (__memmove, unaligned_erms),
	      MEMMOVE_SYMBOL (__memcpy, unaligned_erms))
#  ifdef SHARED
strong_alias (MEMMOVE_SYMBOL (__memmove_chk, unaligned_erms),
	      MEMMOVE_SYMBOL (__memcpy_chk, unaligned_erms))
#  endif
# endif
# ifdef SHARED
strong_alias (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned),
	      MEMMOVE_CHK_SYMBOL (__memcpy_chk, unaligned))
# endif
#endif

strong_alias (MEMMOVE_SYMBOL (__memmove, unaligned),
	      MEMCPY_SYMBOL (__memcpy, unaligned))