/* memmove/memcpy/mempcpy with unaligned load/store and rep movsb
   Copyright (C) 2016-2017 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */

/* memmove/memcpy/mempcpy is implemented as:
   1. Use overlapping load and store to avoid branch.
   2. Load all sources into registers and store them together to avoid
      possible address overlap between source and destination.
   3. If size is 8 * VEC_SIZE or less, load all sources into registers
      and store them together.
   4. If address of destination > address of source, backward copy
      4 * VEC_SIZE at a time with unaligned load and aligned store.
      Load the first 4 * VEC and last VEC before the loop and store
      them after the loop to support overlapping addresses.
   5. Otherwise, forward copy 4 * VEC_SIZE at a time with unaligned
      load and aligned store.  Load the last 4 * VEC and first VEC
      before the loop and store them after the loop to support
      overlapping addresses.
   6. If size >= __x86_shared_non_temporal_threshold and there is no
      overlap between destination and source, use non-temporal store
      instead of aligned store.  */
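
/* Illustration only (not part of the build): a rough C sketch of the
   overlap-safe head/tail trick used for sizes between VEC_SIZE and
   2 * VEC_SIZE.  The helper name copy_le_2x_vec and the vec_t type are
   hypothetical; the real code keeps the data in %VEC registers.  The key
   point is that both loads happen before either store, so the copy is
   correct even when source and destination overlap:

     #include <string.h>
     #include <stddef.h>

     typedef struct { unsigned char b[32]; } vec_t;   // one VEC (e.g. AVX2)

     // Copy VEC_SIZE <= n <= 2 * VEC_SIZE bytes, overlap-safe, branch-free
     // with respect to the exact size.
     static void copy_le_2x_vec (unsigned char *dst, const unsigned char *src,
                                 size_t n)
     {
       vec_t head, tail;
       memcpy (&head, src, sizeof (vec_t));                      // first VEC
       memcpy (&tail, src + n - sizeof (vec_t), sizeof (vec_t)); // last VEC
       memcpy (dst, &head, sizeof (vec_t));
       memcpy (dst + n - sizeof (vec_t), &tail, sizeof (vec_t));
     }
*/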

#include <sysdep.h>

#ifndef MEMCPY_SYMBOL
# define MEMCPY_SYMBOL(p,s) MEMMOVE_SYMBOL(p, s)
#endif

#ifndef MEMPCPY_SYMBOL
# define MEMPCPY_SYMBOL(p,s) MEMMOVE_SYMBOL(p, s)
#endif

#ifndef MEMMOVE_CHK_SYMBOL
# define MEMMOVE_CHK_SYMBOL(p,s) MEMMOVE_SYMBOL(p, s)
#endif

#ifndef VZEROUPPER
# if VEC_SIZE > 16
#  define VZEROUPPER vzeroupper
# else
#  define VZEROUPPER
# endif
#endif

/* Threshold to use Enhanced REP MOVSB.  Since there is overhead to set
   up REP MOVSB operation, REP MOVSB isn't faster on short data.  The
   memcpy micro benchmark in glibc shows that 2KB is the approximate
   value above which REP MOVSB becomes faster than SSE2 optimization
   on processors with Enhanced REP MOVSB.  Since larger register size
   can move more data with a single load and store, the threshold is
   higher with larger register size.  */
#ifndef REP_MOVSB_THRESHOLD
# define REP_MOVSB_THRESHOLD (2048 * (VEC_SIZE / 16))
#endif
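
/* For illustration, the default threshold above works out to
   2048 * (VEC_SIZE / 16) bytes: 2048 bytes for VEC_SIZE == 16 (SSE2),
   4096 bytes for VEC_SIZE == 32 (AVX2) and 8192 bytes for VEC_SIZE == 64
   (AVX-512).  */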

#ifndef PREFETCH
# define PREFETCH(addr) prefetcht0 addr
#endif

/* Assume 64-byte prefetch size.  */
#ifndef PREFETCH_SIZE
# define PREFETCH_SIZE 64
#endif

#define PREFETCHED_LOAD_SIZE (VEC_SIZE * 4)

#if PREFETCH_SIZE == 64
# if PREFETCHED_LOAD_SIZE == PREFETCH_SIZE
#  define PREFETCH_ONE_SET(dir, base, offset) \
        PREFETCH ((offset)base)
# elif PREFETCHED_LOAD_SIZE == 2 * PREFETCH_SIZE
#  define PREFETCH_ONE_SET(dir, base, offset) \
        PREFETCH ((offset)base); \
        PREFETCH ((offset + dir * PREFETCH_SIZE)base)
# elif PREFETCHED_LOAD_SIZE == 4 * PREFETCH_SIZE
#  define PREFETCH_ONE_SET(dir, base, offset) \
        PREFETCH ((offset)base); \
        PREFETCH ((offset + dir * PREFETCH_SIZE)base); \
        PREFETCH ((offset + dir * PREFETCH_SIZE * 2)base); \
        PREFETCH ((offset + dir * PREFETCH_SIZE * 3)base)
# else
#  error Unsupported PREFETCHED_LOAD_SIZE!
# endif
#else
# error Unsupported PREFETCH_SIZE!
#endif
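
/* Illustration only: what one PREFETCH_ONE_SET amounts to in C, assuming
   PREFETCHED_LOAD_SIZE == 4 * PREFETCH_SIZE (e.g. VEC_SIZE == 64) -- touch
   each 64-byte line of the next 256-byte block, in the copy direction.
   The function name is hypothetical:

     // dir is +1 for a forward copy, -1 for a backward copy.
     static void prefetch_one_set (const char *base, long offset, int dir)
     {
       __builtin_prefetch (base + offset, 0, 3);            // prefetcht0
       __builtin_prefetch (base + offset + dir * 64, 0, 3);
       __builtin_prefetch (base + offset + dir * 64 * 2, 0, 3);
       __builtin_prefetch (base + offset + dir * 64 * 3, 0, 3);
     }
*/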

#ifndef SECTION
# error SECTION is not defined!
#endif

        .section SECTION(.text),"ax",@progbits
#if defined SHARED && IS_IN (libc)
ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned))
        cmpq %rdx, %rcx
        jb HIDDEN_JUMPTARGET (__chk_fail)
END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned))
#endif

#if VEC_SIZE == 16 || defined SHARED
ENTRY (MEMPCPY_SYMBOL (__mempcpy, unaligned))
        movq %rdi, %rax
        addq %rdx, %rax
        jmp L(start)
END (MEMPCPY_SYMBOL (__mempcpy, unaligned))
#endif

#if defined SHARED && IS_IN (libc)
ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned))
        cmpq %rdx, %rcx
        jb HIDDEN_JUMPTARGET (__chk_fail)
END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned))
#endif

ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned))
        movq %rdi, %rax
L(start):
        cmpq $VEC_SIZE, %rdx
        jb L(less_vec)
        cmpq $(VEC_SIZE * 2), %rdx
        ja L(more_2x_vec)
#if !defined USE_MULTIARCH || !IS_IN (libc)
L(last_2x_vec):
#endif
        /* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
        VMOVU (%rsi), %VEC(0)
        VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(1)
        VMOVU %VEC(0), (%rdi)
        VMOVU %VEC(1), -VEC_SIZE(%rdi,%rdx)
        VZEROUPPER
#if !defined USE_MULTIARCH || !IS_IN (libc)
L(nop):
#endif
        ret
#if defined USE_MULTIARCH && IS_IN (libc)
END (MEMMOVE_SYMBOL (__memmove, unaligned))

# if VEC_SIZE == 16
#  if defined SHARED
/* Only used to measure performance of REP MOVSB.  */
ENTRY (__mempcpy_erms)
        movq %rdi, %rax
        addq %rdx, %rax
        jmp L(start_movsb)
END (__mempcpy_erms)
#  endif

ENTRY (__memmove_erms)
        movq %rdi, %rax
L(start_movsb):
        movq %rdx, %rcx
        cmpq %rsi, %rdi
        jb 1f
        /* Source == destination is less common.  */
        je 2f
        leaq (%rsi,%rcx), %rdx
        cmpq %rdx, %rdi
        jb L(movsb_backward)
1:
        rep movsb
2:
        ret
L(movsb_backward):
        leaq -1(%rdi,%rcx), %rdi
        leaq -1(%rsi,%rcx), %rsi
        std
        rep movsb
        cld
        ret
END (__memmove_erms)
#  if defined SHARED
strong_alias (__memmove_erms, __memcpy_erms)
#  endif
# endif
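
/* The direction logic of __memmove_erms above, restated as a C sketch for
   clarity (not part of the build; memmove_erms_sketch is a made-up name).
   Forward "rep movsb" is safe unless the destination starts inside the
   source range, in which case the copy runs backward (std; rep movsb; cld):

     #include <stddef.h>

     static void memmove_erms_sketch (unsigned char *dst,
                                      const unsigned char *src, size_t n)
     {
       if (dst < src || dst >= src + n)
         for (size_t i = 0; i < n; i++)        // forward rep movsb
           dst[i] = src[i];
       else if (dst > src)
         for (size_t i = n; i > 0; i--)        // backward: std; rep movsb; cld
           dst[i - 1] = src[i - 1];
       // dst == src: nothing to copy.
     }
*/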

# ifdef SHARED
ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms))
        cmpq %rdx, %rcx
        jb HIDDEN_JUMPTARGET (__chk_fail)
END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms))

ENTRY (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms))
        movq %rdi, %rax
        addq %rdx, %rax
        jmp L(start_erms)
END (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms))

ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
        cmpq %rdx, %rcx
        jb HIDDEN_JUMPTARGET (__chk_fail)
END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
# endif

ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
        movq %rdi, %rax
L(start_erms):
        cmpq $VEC_SIZE, %rdx
        jb L(less_vec)
        cmpq $(VEC_SIZE * 2), %rdx
        ja L(movsb_more_2x_vec)
L(last_2x_vec):
        /* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
        VMOVU (%rsi), %VEC(0)
        VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(1)
        VMOVU %VEC(0), (%rdi)
        VMOVU %VEC(1), -VEC_SIZE(%rdi,%rdx)
L(return):
        VZEROUPPER
        ret

L(movsb):
        cmpq __x86_shared_non_temporal_threshold(%rip), %rdx
        jae L(more_8x_vec)
        cmpq %rsi, %rdi
        jb 1f
        /* Source == destination is less common.  */
        je L(nop)
        leaq (%rsi,%rdx), %r9
        cmpq %r9, %rdi
        /* Avoid slow backward REP MOVSB.  */
# if REP_MOVSB_THRESHOLD <= (VEC_SIZE * 8)
#  error Unsupported REP_MOVSB_THRESHOLD and VEC_SIZE!
# endif
        jb L(more_8x_vec_backward)
1:
        movq %rdx, %rcx
        rep movsb
L(nop):
        ret
#endif
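
/* The conditions checked in L(movsb) above, as a C sketch (illustration
   only; the names are hypothetical).  "rep movsb" is used only for
   mid-sized, forward-safe copies; everything else falls back to the
   vector paths.  The asm additionally special-cases dst == src as a
   plain return:

     #include <stddef.h>

     static int use_rep_movsb (const unsigned char *dst,
                               const unsigned char *src, size_t n,
                               size_t non_temporal_threshold)
     {
       if (n >= non_temporal_threshold)
         return 0;                  // take L(more_8x_vec) instead
       if (dst > src && dst < src + n)
         return 0;                  // backward overlap: L(more_8x_vec_backward)
       return 1;                    // no (or forward) overlap: rep movsb
     }
*/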

L(less_vec):
        /* Less than 1 VEC.  */
#if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
# error Unsupported VEC_SIZE!
#endif
#if VEC_SIZE > 32
        cmpb $32, %dl
        jae L(between_32_63)
#endif
#if VEC_SIZE > 16
        cmpb $16, %dl
        jae L(between_16_31)
#endif
        cmpb $8, %dl
        jae L(between_8_15)
        cmpb $4, %dl
        jae L(between_4_7)
        cmpb $1, %dl
        ja L(between_2_3)
        jb 1f
        movzbl (%rsi), %ecx
        movb %cl, (%rdi)
1:
        ret
#if VEC_SIZE > 32
L(between_32_63):
        /* From 32 to 63.  No branch when size == 32.  */
        vmovdqu (%rsi), %ymm0
        vmovdqu -32(%rsi,%rdx), %ymm1
        vmovdqu %ymm0, (%rdi)
        vmovdqu %ymm1, -32(%rdi,%rdx)
        VZEROUPPER
        ret
#endif
#if VEC_SIZE > 16
        /* From 16 to 31.  No branch when size == 16.  */
L(between_16_31):
        vmovdqu (%rsi), %xmm0
        vmovdqu -16(%rsi,%rdx), %xmm1
        vmovdqu %xmm0, (%rdi)
        vmovdqu %xmm1, -16(%rdi,%rdx)
        ret
#endif
L(between_8_15):
        /* From 8 to 15.  No branch when size == 8.  */
        movq -8(%rsi,%rdx), %rcx
        movq (%rsi), %rsi
        movq %rcx, -8(%rdi,%rdx)
        movq %rsi, (%rdi)
        ret
L(between_4_7):
        /* From 4 to 7.  No branch when size == 4.  */
        movl -4(%rsi,%rdx), %ecx
        movl (%rsi), %esi
        movl %ecx, -4(%rdi,%rdx)
        movl %esi, (%rdi)
        ret
L(between_2_3):
        /* From 2 to 3.  No branch when size == 2.  */
        movzwl -2(%rsi,%rdx), %ecx
        movzwl (%rsi), %esi
        movw %cx, -2(%rdi,%rdx)
        movw %si, (%rdi)
        ret
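
/* The size ladder above, shown as a self-contained C sketch for the
   sub-16-byte classes (illustration only; copy_less_16 is a hypothetical
   name).  Each class uses two overlapping accesses of one fixed width, so
   there is no branch on the exact size within a class, and both loads
   happen before either store so overlap is handled:

     #include <string.h>
     #include <stdint.h>

     static void copy_less_16 (unsigned char *dst, const unsigned char *src,
                               size_t n)
     {
       if (n >= 8)
         {
           uint64_t head, tail;
           memcpy (&head, src, 8);
           memcpy (&tail, src + n - 8, 8);
           memcpy (dst, &head, 8);
           memcpy (dst + n - 8, &tail, 8);
         }
       else if (n >= 4)
         {
           uint32_t head, tail;
           memcpy (&head, src, 4);
           memcpy (&tail, src + n - 4, 4);
           memcpy (dst, &head, 4);
           memcpy (dst + n - 4, &tail, 4);
         }
       else if (n >= 2)
         {
           uint16_t head, tail;
           memcpy (&head, src, 2);
           memcpy (&tail, src + n - 2, 2);
           memcpy (dst, &head, 2);
           memcpy (dst + n - 2, &tail, 2);
         }
       else if (n == 1)
         *dst = *src;                    // n == 0 copies nothing
     }
*/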

#if defined USE_MULTIARCH && IS_IN (libc)
L(movsb_more_2x_vec):
        cmpq $REP_MOVSB_THRESHOLD, %rdx
        ja L(movsb)
#endif
L(more_2x_vec):
        /* More than 2 * VEC and there may be overlap between destination
           and source.  */
        cmpq $(VEC_SIZE * 8), %rdx
        ja L(more_8x_vec)
        cmpq $(VEC_SIZE * 4), %rdx
        jb L(last_4x_vec)
        /* Copy from 4 * VEC to 8 * VEC, inclusively.  */
        VMOVU (%rsi), %VEC(0)
        VMOVU VEC_SIZE(%rsi), %VEC(1)
        VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2)
        VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3)
        VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(4)
        VMOVU -(VEC_SIZE * 2)(%rsi,%rdx), %VEC(5)
        VMOVU -(VEC_SIZE * 3)(%rsi,%rdx), %VEC(6)
        VMOVU -(VEC_SIZE * 4)(%rsi,%rdx), %VEC(7)
        VMOVU %VEC(0), (%rdi)
        VMOVU %VEC(1), VEC_SIZE(%rdi)
        VMOVU %VEC(2), (VEC_SIZE * 2)(%rdi)
        VMOVU %VEC(3), (VEC_SIZE * 3)(%rdi)
        VMOVU %VEC(4), -VEC_SIZE(%rdi,%rdx)
        VMOVU %VEC(5), -(VEC_SIZE * 2)(%rdi,%rdx)
        VMOVU %VEC(6), -(VEC_SIZE * 3)(%rdi,%rdx)
        VMOVU %VEC(7), -(VEC_SIZE * 4)(%rdi,%rdx)
        VZEROUPPER
        ret
L(last_4x_vec):
        /* Copy from 2 * VEC to 4 * VEC.  */
        VMOVU (%rsi), %VEC(0)
        VMOVU VEC_SIZE(%rsi), %VEC(1)
        VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(2)
        VMOVU -(VEC_SIZE * 2)(%rsi,%rdx), %VEC(3)
        VMOVU %VEC(0), (%rdi)
        VMOVU %VEC(1), VEC_SIZE(%rdi)
        VMOVU %VEC(2), -VEC_SIZE(%rdi,%rdx)
        VMOVU %VEC(3), -(VEC_SIZE * 2)(%rdi,%rdx)
        VZEROUPPER
        ret

L(more_8x_vec):
        cmpq %rsi, %rdi
        ja L(more_8x_vec_backward)
        /* Source == destination is less common.  */
        je L(nop)
        /* Load the first VEC and last 4 * VEC to support overlapping
           addresses.  */
        VMOVU (%rsi), %VEC(4)
        VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(5)
        VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(6)
        VMOVU -(VEC_SIZE * 3)(%rsi, %rdx), %VEC(7)
        VMOVU -(VEC_SIZE * 4)(%rsi, %rdx), %VEC(8)
        /* Save start and stop of the destination buffer.  */
        movq %rdi, %r11
        leaq -VEC_SIZE(%rdi, %rdx), %rcx
        /* Align destination for aligned stores in the loop.  Compute
           how much destination is misaligned.  */
        movq %rdi, %r8
        andq $(VEC_SIZE - 1), %r8
        /* Get the negative of offset for alignment.  */
        subq $VEC_SIZE, %r8
        /* Adjust source.  */
        subq %r8, %rsi
        /* Adjust destination which should be aligned now.  */
        subq %r8, %rdi
        /* Adjust length.  */
        addq %r8, %rdx
#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
        /* Check non-temporal store threshold.  */
        cmpq __x86_shared_non_temporal_threshold(%rip), %rdx
        ja L(large_forward)
#endif
L(loop_4x_vec_forward):
        /* Copy 4 * VEC a time forward.  */
        VMOVU (%rsi), %VEC(0)
        VMOVU VEC_SIZE(%rsi), %VEC(1)
        VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2)
        VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3)
        addq $(VEC_SIZE * 4), %rsi
        subq $(VEC_SIZE * 4), %rdx
        VMOVA %VEC(0), (%rdi)
        VMOVA %VEC(1), VEC_SIZE(%rdi)
        VMOVA %VEC(2), (VEC_SIZE * 2)(%rdi)
        VMOVA %VEC(3), (VEC_SIZE * 3)(%rdi)
        addq $(VEC_SIZE * 4), %rdi
        cmpq $(VEC_SIZE * 4), %rdx
        ja L(loop_4x_vec_forward)
        /* Store the last 4 * VEC.  */
        VMOVU %VEC(5), (%rcx)
        VMOVU %VEC(6), -VEC_SIZE(%rcx)
        VMOVU %VEC(7), -(VEC_SIZE * 2)(%rcx)
        VMOVU %VEC(8), -(VEC_SIZE * 3)(%rcx)
        /* Store the first VEC.  */
        VMOVU %VEC(4), (%r11)
        VZEROUPPER
        ret
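
/* Worked example of the alignment fix-up in L(more_8x_vec) above
   (illustration only), assuming VEC_SIZE == 32 and an original %rdi of
   0x1005:

     r8  = 0x1005 & 31  =  5      // destination misalignment
     r8  = 5 - 32       = -27
     rsi = rsi - (-27)  = rsi + 27
     rdi = 0x1005 + 27  = 0x1020  // now 32-byte aligned
     rdx = rdx - 27               // 27 head bytes no longer in the loop

   Skipping the head bytes is safe because the first VEC was loaded into
   %VEC(4) before the adjustment and is stored to the original destination
   (%r11) after the loop.  */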

L(more_8x_vec_backward):
        /* Load the first 4 * VEC and last VEC to support overlapping
           addresses.  */
        VMOVU (%rsi), %VEC(4)
        VMOVU VEC_SIZE(%rsi), %VEC(5)
        VMOVU (VEC_SIZE * 2)(%rsi), %VEC(6)
        VMOVU (VEC_SIZE * 3)(%rsi), %VEC(7)
        VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(8)
        /* Save stop of the destination buffer.  */
        leaq -VEC_SIZE(%rdi, %rdx), %r11
        /* Align destination end for aligned stores in the loop.  Compute
           how much destination end is misaligned.  */
        leaq -VEC_SIZE(%rsi, %rdx), %rcx
        movq %r11, %r9
        movq %r11, %r8
        andq $(VEC_SIZE - 1), %r8
        /* Adjust source.  */
        subq %r8, %rcx
        /* Adjust the end of destination which should be aligned now.  */
        subq %r8, %r9
        /* Adjust length.  */
        subq %r8, %rdx
#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
        /* Check non-temporal store threshold.  */
        cmpq __x86_shared_non_temporal_threshold(%rip), %rdx
        ja L(large_backward)
#endif
L(loop_4x_vec_backward):
        /* Copy 4 * VEC a time backward.  */
        VMOVU (%rcx), %VEC(0)
        VMOVU -VEC_SIZE(%rcx), %VEC(1)
        VMOVU -(VEC_SIZE * 2)(%rcx), %VEC(2)
        VMOVU -(VEC_SIZE * 3)(%rcx), %VEC(3)
        subq $(VEC_SIZE * 4), %rcx
        subq $(VEC_SIZE * 4), %rdx
        VMOVA %VEC(0), (%r9)
        VMOVA %VEC(1), -VEC_SIZE(%r9)
        VMOVA %VEC(2), -(VEC_SIZE * 2)(%r9)
        VMOVA %VEC(3), -(VEC_SIZE * 3)(%r9)
        subq $(VEC_SIZE * 4), %r9
        cmpq $(VEC_SIZE * 4), %rdx
        ja L(loop_4x_vec_backward)
        /* Store the first 4 * VEC.  */
        VMOVU %VEC(4), (%rdi)
        VMOVU %VEC(5), VEC_SIZE(%rdi)
        VMOVU %VEC(6), (VEC_SIZE * 2)(%rdi)
        VMOVU %VEC(7), (VEC_SIZE * 3)(%rdi)
        /* Store the last VEC.  */
        VMOVU %VEC(8), (%r11)
        VZEROUPPER
        ret

#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
L(large_forward):
        /* Don't use non-temporal store if there is overlap between
           destination and source since destination may be in cache
           when source is loaded.  */
        leaq (%rdi, %rdx), %r10
        cmpq %r10, %rsi
        jb L(loop_4x_vec_forward)
L(loop_large_forward):
        /* Copy 4 * VEC a time forward with non-temporal stores.  */
        PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 2)
        PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 3)
        VMOVU (%rsi), %VEC(0)
        VMOVU VEC_SIZE(%rsi), %VEC(1)
        VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2)
        VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3)
        addq $PREFETCHED_LOAD_SIZE, %rsi
        subq $PREFETCHED_LOAD_SIZE, %rdx
        VMOVNT %VEC(0), (%rdi)
        VMOVNT %VEC(1), VEC_SIZE(%rdi)
        VMOVNT %VEC(2), (VEC_SIZE * 2)(%rdi)
        VMOVNT %VEC(3), (VEC_SIZE * 3)(%rdi)
        addq $PREFETCHED_LOAD_SIZE, %rdi
        cmpq $PREFETCHED_LOAD_SIZE, %rdx
        ja L(loop_large_forward)
        sfence
        /* Store the last 4 * VEC.  */
        VMOVU %VEC(5), (%rcx)
        VMOVU %VEC(6), -VEC_SIZE(%rcx)
        VMOVU %VEC(7), -(VEC_SIZE * 2)(%rcx)
        VMOVU %VEC(8), -(VEC_SIZE * 3)(%rcx)
        /* Store the first VEC.  */
        VMOVU %VEC(4), (%r11)
        VZEROUPPER
        ret
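
/* A minimal C sketch of one non-temporal 4 * VEC step, assuming
   VEC_SIZE == 32 (AVX2) and a 32-byte-aligned dst; illustration only, the
   function name is hypothetical.  _mm256_stream_si256 corresponds to
   VMOVNT and _mm_sfence to the sfence that ends the streaming loop:

     #include <immintrin.h>

     static void stream_4x_vec (unsigned char *dst, const unsigned char *src)
     {
       __m256i v0 = _mm256_loadu_si256 ((const __m256i *) (src + 0));
       __m256i v1 = _mm256_loadu_si256 ((const __m256i *) (src + 32));
       __m256i v2 = _mm256_loadu_si256 ((const __m256i *) (src + 64));
       __m256i v3 = _mm256_loadu_si256 ((const __m256i *) (src + 96));
       _mm256_stream_si256 ((__m256i *) (dst + 0), v0);   // bypass the cache
       _mm256_stream_si256 ((__m256i *) (dst + 32), v1);
       _mm256_stream_si256 ((__m256i *) (dst + 64), v2);
       _mm256_stream_si256 ((__m256i *) (dst + 96), v3);
     }

   After the whole copy, _mm_sfence () orders the streaming stores before
   the function returns.  */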

L(large_backward):
        /* Don't use non-temporal store if there is overlap between
           destination and source since destination may be in cache
           when source is loaded.  */
        leaq (%rcx, %rdx), %r10
        cmpq %r10, %r9
        jb L(loop_4x_vec_backward)
L(loop_large_backward):
        /* Copy 4 * VEC a time backward with non-temporal stores.  */
        PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 2)
        PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 3)
        VMOVU (%rcx), %VEC(0)
        VMOVU -VEC_SIZE(%rcx), %VEC(1)
        VMOVU -(VEC_SIZE * 2)(%rcx), %VEC(2)
        VMOVU -(VEC_SIZE * 3)(%rcx), %VEC(3)
        subq $PREFETCHED_LOAD_SIZE, %rcx
        subq $PREFETCHED_LOAD_SIZE, %rdx
        VMOVNT %VEC(0), (%r9)
        VMOVNT %VEC(1), -VEC_SIZE(%r9)
        VMOVNT %VEC(2), -(VEC_SIZE * 2)(%r9)
        VMOVNT %VEC(3), -(VEC_SIZE * 3)(%r9)
        subq $PREFETCHED_LOAD_SIZE, %r9
        cmpq $PREFETCHED_LOAD_SIZE, %rdx
        ja L(loop_large_backward)
        sfence
        /* Store the first 4 * VEC.  */
        VMOVU %VEC(4), (%rdi)
        VMOVU %VEC(5), VEC_SIZE(%rdi)
        VMOVU %VEC(6), (VEC_SIZE * 2)(%rdi)
        VMOVU %VEC(7), (VEC_SIZE * 3)(%rdi)
        /* Store the last VEC.  */
        VMOVU %VEC(8), (%r11)
        VZEROUPPER
        ret
#endif
END (MEMMOVE_SYMBOL (__memmove, unaligned_erms))

#ifdef SHARED
# if IS_IN (libc)
#  ifdef USE_MULTIARCH
strong_alias (MEMMOVE_SYMBOL (__memmove, unaligned_erms),
              MEMMOVE_SYMBOL (__memcpy, unaligned_erms))
strong_alias (MEMMOVE_SYMBOL (__memmove_chk, unaligned_erms),
              MEMMOVE_SYMBOL (__memcpy_chk, unaligned_erms))
#  endif
strong_alias (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned),
              MEMMOVE_CHK_SYMBOL (__memcpy_chk, unaligned))
# endif
#endif
#if VEC_SIZE == 16 || defined SHARED
strong_alias (MEMMOVE_SYMBOL (__memmove, unaligned),
              MEMCPY_SYMBOL (__memcpy, unaligned))
#endif