git.ipfire.org Git - thirdparty/glibc.git/commitdiff
Add x86-64 memmove with unaligned load/store and rep movsb
author    H.J. Lu <hjl.tools@gmail.com>
          Thu, 31 Mar 2016 17:04:26 +0000 (10:04 -0700)
committer H.J. Lu <hjl.tools@gmail.com>
          Thu, 31 Mar 2016 17:04:40 +0000 (10:04 -0700)
Implement x86-64 memmove with unaligned load/store and rep movsb.
Support 16-byte, 32-byte and 64-byte vector register sizes.  When
size <= 8 times the vector register size, there is no check for
address overlap between source and destination.  Since the overhead of
the overlap check is small when size > 8 times the vector register size,
memcpy is an alias of memmove.
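
The "no overlap check" property for small sizes follows from loading every
source byte into registers before storing any of them.  A minimal plain-C
sketch of that idea (illustrative only: VEC_SIZE, the temporary buffer and
the helper name are stand-ins, not glibc definitions):

  #include <stddef.h>
  #include <string.h>

  #define VEC_SIZE 32   /* e.g. one AVX register */

  /* For n <= 8 * VEC_SIZE: load all sources first, then store them.
     This is correct even when dst and src overlap, so no overlap check
     is needed on this path and memcpy can simply alias memmove.  */
  static void
  small_move (unsigned char *dst, const unsigned char *src, size_t n)
  {
    unsigned char regs[8 * VEC_SIZE];   /* stands in for 8 vector registers */
    memcpy (regs, src, n);              /* load everything ...  */
    memcpy (dst, regs, n);              /* ... then store everything */
  }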

A single file provides two implementations of memmove, one with rep movsb
and the other without.  They share the same code when size is between
2 times the vector register size and REP_MOVSB_THRESHOLD, which is 2KB
for the 16-byte vector register size and is scaled up for larger vector
register sizes.
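
The scaling can be read off the REP_MOVSB_THRESHOLD definition added in
this patch; a small C program that prints the resulting thresholds (the
macro mirrors the patch, the program itself is only an illustration):

  #include <stdio.h>

  /* Mirrors the patch: #define REP_MOVSB_THRESHOLD (2048 * (VEC_SIZE / 16)).  */
  #define REP_MOVSB_THRESHOLD(vec_size) (2048 * ((vec_size) / 16))

  int
  main (void)
  {
    printf ("SSE2   (16-byte vectors): %d bytes\n", REP_MOVSB_THRESHOLD (16));
    printf ("AVX    (32-byte vectors): %d bytes\n", REP_MOVSB_THRESHOLD (32));
    printf ("AVX512 (64-byte vectors): %d bytes\n", REP_MOVSB_THRESHOLD (64));
    return 0;
  }

This prints 2048, 4096 and 8192 bytes respectively.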

Key features:

1. Use overlapping loads and stores to avoid branches (see the C sketch
after this list).
2. For size <= 8 times the vector register size, load all sources into
registers and store them together.
3. If there is no address overlap between source and destination, copy
from both ends, 4 times the vector register size at a time.
4. If the address of the destination > the address of the source, copy
backward 8 times the vector register size at a time.
5. Otherwise, copy forward 8 times the vector register size at a time.
6. Use rep movsb only for forward copies.  Avoid slow backward rep movsb
by falling back to a backward copy of 8 times the vector register size at
a time.
7. Skip the copy when the address of the destination == the address of
the source.
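
A plain-C sketch of key feature 1, modeled on the VEC-to-2*VEC case in the
new assembly (the helper name and the vec struct are illustrative; the
comments show the corresponding instructions from
memmove-vec-unaligned-erms.S):

  #include <stddef.h>
  #include <string.h>

  #define VEC_SIZE 32                                 /* e.g. AVX */
  typedef struct { unsigned char b[VEC_SIZE]; } vec;  /* one vector register */

  /* Caller guarantees VEC_SIZE <= n <= 2 * VEC_SIZE.  The head and tail
     vectors overlap in the middle when n < 2 * VEC_SIZE, so every size in
     the range is handled without branching on the exact value, and both
     loads happen before either store, so source/destination overlap is
     harmless.  */
  static void
  copy_vec_to_2x_vec (unsigned char *dst, const unsigned char *src, size_t n)
  {
    vec head, tail;
    memcpy (&head, src, VEC_SIZE);                 /* VMOVU (%rsi), %VEC(0) */
    memcpy (&tail, src + n - VEC_SIZE, VEC_SIZE);  /* VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(1) */
    memcpy (dst, &head, VEC_SIZE);                 /* VMOVU %VEC(0), (%rdi) */
    memcpy (dst + n - VEC_SIZE, &tail, VEC_SIZE);  /* VMOVU %VEC(1), -VEC_SIZE(%rdi,%rdx) */
  }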

[BZ #19776]
* sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Add
memmove-sse2-unaligned-erms, memmove-avx-unaligned-erms and
memmove-avx512-unaligned-erms.
* sysdeps/x86_64/multiarch/ifunc-impl-list.c
(__libc_ifunc_impl_list): Test
__memmove_chk_avx512_unaligned_2,
__memmove_chk_avx512_unaligned_erms,
__memmove_chk_avx_unaligned_2, __memmove_chk_avx_unaligned_erms,
__memmove_chk_sse2_unaligned_2,
__memmove_chk_sse2_unaligned_erms, __memmove_avx_unaligned_2,
__memmove_avx_unaligned_erms, __memmove_avx512_unaligned_2,
__memmove_avx512_unaligned_erms, __memmove_erms,
__memmove_sse2_unaligned_2, __memmove_sse2_unaligned_erms,
__memcpy_chk_avx512_unaligned_2,
__memcpy_chk_avx512_unaligned_erms,
__memcpy_chk_avx_unaligned_2, __memcpy_chk_avx_unaligned_erms,
__memcpy_chk_sse2_unaligned_2, __memcpy_chk_sse2_unaligned_erms,
__memcpy_avx_unaligned_2, __memcpy_avx_unaligned_erms,
__memcpy_avx512_unaligned_2, __memcpy_avx512_unaligned_erms,
__memcpy_sse2_unaligned_2, __memcpy_sse2_unaligned_erms,
__memcpy_erms, __mempcpy_chk_avx512_unaligned_2,
__mempcpy_chk_avx512_unaligned_erms,
__mempcpy_chk_avx_unaligned_2, __mempcpy_chk_avx_unaligned_erms,
__mempcpy_chk_sse2_unaligned_2, __mempcpy_chk_sse2_unaligned_erms,
__mempcpy_avx512_unaligned_2, __mempcpy_avx512_unaligned_erms,
__mempcpy_avx_unaligned_2, __mempcpy_avx_unaligned_erms,
__mempcpy_sse2_unaligned_2, __mempcpy_sse2_unaligned_erms and
__mempcpy_erms.
* sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S: New
file.
* sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S:
Likewise.
* sysdeps/x86_64/multiarch/memmove-sse2-unaligned-erms.S:
Likewise.
* sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S:
Likewise.

ChangeLog
sysdeps/x86_64/multiarch/Makefile
sysdeps/x86_64/multiarch/ifunc-impl-list.c
sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S [new file with mode: 0644]
sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S [new file with mode: 0644]
sysdeps/x86_64/multiarch/memmove-sse2-unaligned-erms.S [new file with mode: 0644]
sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S [new file with mode: 0644]

index 632da3c751bfe87ca0cf006d3324b53bcb46c7ae..100764f66acbb3e459bf9e79b41a38e87f9916ad 100644 (file)
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,43 @@
+2016-03-31  H.J. Lu  <hongjiu.lu@intel.com>
+
+       [BZ #19776]
+       * sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Add
+       memmove-sse2-unaligned-erms, memmove-avx-unaligned-erms and
+       memmove-avx512-unaligned-erms.
+       * sysdeps/x86_64/multiarch/ifunc-impl-list.c
+       (__libc_ifunc_impl_list): Test
+       __memmove_chk_avx512_unaligned_2,
+       __memmove_chk_avx512_unaligned_erms,
+       __memmove_chk_avx_unaligned_2, __memmove_chk_avx_unaligned_erms,
+       __memmove_chk_sse2_unaligned_2,
+       __memmove_chk_sse2_unaligned_erms, __memmove_avx_unaligned_2,
+       __memmove_avx_unaligned_erms, __memmove_avx512_unaligned_2,
+       __memmove_avx512_unaligned_erms, __memmove_erms,
+       __memmove_sse2_unaligned_2, __memmove_sse2_unaligned_erms,
+       __memcpy_chk_avx512_unaligned_2,
+       __memcpy_chk_avx512_unaligned_erms,
+       __memcpy_chk_avx_unaligned_2, __memcpy_chk_avx_unaligned_erms,
+       __memcpy_chk_sse2_unaligned_2, __memcpy_chk_sse2_unaligned_erms,
+       __memcpy_avx_unaligned_2, __memcpy_avx_unaligned_erms,
+       __memcpy_avx512_unaligned_2, __memcpy_avx512_unaligned_erms,
+       __memcpy_sse2_unaligned_2, __memcpy_sse2_unaligned_erms,
+       __memcpy_erms, __mempcpy_chk_avx512_unaligned_2,
+       __mempcpy_chk_avx512_unaligned_erms,
+       __mempcpy_chk_avx_unaligned_2, __mempcpy_chk_avx_unaligned_erms,
+       __mempcpy_chk_sse2_unaligned_2, __mempcpy_chk_sse2_unaligned_erms,
+       __mempcpy_avx512_unaligned_2, __mempcpy_avx512_unaligned_erms,
+       __mempcpy_avx_unaligned_2, __mempcpy_avx_unaligned_erms,
+       __mempcpy_sse2_unaligned_2, __mempcpy_sse2_unaligned_erms and
+       __mempcpy_erms.
+       * sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S: New
+       file.
+       * sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S:
+       Likewise.
+       * sysdeps/x86_64/multiarch/memmove-sse2-unaligned-erms.S:
+       Likewise.
+       * sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S:
+       Likewise.
+
 2016-03-31  Stefan Liebler  <stli@linux.vnet.ibm.com>
 
        * sysdeps/s390/bits/link.h: (La_s390_vr) New typedef.
index 7fc89c253f5335c3a6a0351a7240275b0f47e4ff..ef4dbc0c6f07dea58db0b5f02f3b08cdb46072f9 100644 (file)
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -20,7 +20,10 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 \
                   strcat-sse2-unaligned strncat-sse2-unaligned \
                   strchr-sse2-no-bsf memcmp-ssse3 strstr-sse2-unaligned \
                   strcspn-c strpbrk-c strspn-c varshift memset-avx2 \
-                  memset-avx512-no-vzeroupper
+                  memset-avx512-no-vzeroupper \
+                  memmove-sse2-unaligned-erms \
+                  memmove-avx-unaligned-erms \
+                  memmove-avx512-unaligned-erms
 CFLAGS-varshift.c += -msse4
 CFLAGS-strcspn-c.c += -msse4
 CFLAGS-strpbrk-c.c += -msse4
index 188b6d36c653abe6b84fa1610da31fffd679acd7..9204da450a50e7a0d9f4c95f9513e2bd639a6da7 100644 (file)
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -52,16 +52,32 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
              IFUNC_IMPL_ADD (array, i, __memmove_chk,
                              HAS_ARCH_FEATURE (AVX512F_Usable),
                              __memmove_chk_avx512_no_vzeroupper)
+             IFUNC_IMPL_ADD (array, i, __memmove_chk,
+                             HAS_ARCH_FEATURE (AVX512F_Usable),
+                             __memmove_chk_avx512_unaligned_2)
+             IFUNC_IMPL_ADD (array, i, __memmove_chk,
+                             HAS_ARCH_FEATURE (AVX512F_Usable),
+                             __memmove_chk_avx512_unaligned_erms)
 #endif
              IFUNC_IMPL_ADD (array, i, __memmove_chk,
                              HAS_ARCH_FEATURE (AVX_Usable),
                              __memmove_chk_avx_unaligned)
+             IFUNC_IMPL_ADD (array, i, __memmove_chk,
+                             HAS_ARCH_FEATURE (AVX_Usable),
+                             __memmove_chk_avx_unaligned_2)
+             IFUNC_IMPL_ADD (array, i, __memmove_chk,
+                             HAS_ARCH_FEATURE (AVX_Usable),
+                             __memmove_chk_avx_unaligned_erms)
              IFUNC_IMPL_ADD (array, i, __memmove_chk,
                              HAS_CPU_FEATURE (SSSE3),
                              __memmove_chk_ssse3_back)
              IFUNC_IMPL_ADD (array, i, __memmove_chk,
                              HAS_CPU_FEATURE (SSSE3),
                              __memmove_chk_ssse3)
+             IFUNC_IMPL_ADD (array, i, __memmove_chk, 1,
+                             __memmove_chk_sse2_unaligned_2)
+             IFUNC_IMPL_ADD (array, i, __memmove_chk, 1,
+                             __memmove_chk_sse2_unaligned_erms)
              IFUNC_IMPL_ADD (array, i, __memmove_chk, 1,
                              __memmove_chk_sse2))
 
@@ -70,15 +86,32 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
              IFUNC_IMPL_ADD (array, i, memmove,
                              HAS_ARCH_FEATURE (AVX_Usable),
                              __memmove_avx_unaligned)
+             IFUNC_IMPL_ADD (array, i, memmove,
+                             HAS_ARCH_FEATURE (AVX_Usable),
+                             __memmove_avx_unaligned_2)
+             IFUNC_IMPL_ADD (array, i, memmove,
+                             HAS_ARCH_FEATURE (AVX_Usable),
+                             __memmove_avx_unaligned_erms)
 #ifdef HAVE_AVX512_ASM_SUPPORT
              IFUNC_IMPL_ADD (array, i, memmove,
                              HAS_ARCH_FEATURE (AVX512F_Usable),
                              __memmove_avx512_no_vzeroupper)
+             IFUNC_IMPL_ADD (array, i, memmove,
+                             HAS_ARCH_FEATURE (AVX512F_Usable),
+                             __memmove_avx512_unaligned_2)
+             IFUNC_IMPL_ADD (array, i, memmove,
+                             HAS_ARCH_FEATURE (AVX512F_Usable),
+                             __memmove_avx512_unaligned_erms)
 #endif
              IFUNC_IMPL_ADD (array, i, memmove, HAS_CPU_FEATURE (SSSE3),
                              __memmove_ssse3_back)
              IFUNC_IMPL_ADD (array, i, memmove, HAS_CPU_FEATURE (SSSE3),
                              __memmove_ssse3)
+             IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_erms)
+             IFUNC_IMPL_ADD (array, i, memmove, 1,
+                             __memmove_sse2_unaligned_2)
+             IFUNC_IMPL_ADD (array, i, memmove, 1,
+                             __memmove_sse2_unaligned_erms)
              IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_sse2))
 
   /* Support sysdeps/x86_64/multiarch/memset_chk.S.  */
@@ -267,16 +300,32 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
              IFUNC_IMPL_ADD (array, i, __memcpy_chk,
                              HAS_ARCH_FEATURE (AVX512F_Usable),
                              __memcpy_chk_avx512_no_vzeroupper)
+             IFUNC_IMPL_ADD (array, i, __memcpy_chk,
+                             HAS_ARCH_FEATURE (AVX512F_Usable),
+                             __memcpy_chk_avx512_unaligned_2)
+             IFUNC_IMPL_ADD (array, i, __memcpy_chk,
+                             HAS_ARCH_FEATURE (AVX512F_Usable),
+                             __memcpy_chk_avx512_unaligned_erms)
 #endif
              IFUNC_IMPL_ADD (array, i, __memcpy_chk,
                              HAS_ARCH_FEATURE (AVX_Usable),
                              __memcpy_chk_avx_unaligned)
+             IFUNC_IMPL_ADD (array, i, __memcpy_chk,
+                             HAS_ARCH_FEATURE (AVX_Usable),
+                             __memcpy_chk_avx_unaligned_2)
+             IFUNC_IMPL_ADD (array, i, __memcpy_chk,
+                             HAS_ARCH_FEATURE (AVX_Usable),
+                             __memcpy_chk_avx_unaligned_erms)
              IFUNC_IMPL_ADD (array, i, __memcpy_chk,
                              HAS_CPU_FEATURE (SSSE3),
                              __memcpy_chk_ssse3_back)
              IFUNC_IMPL_ADD (array, i, __memcpy_chk,
                              HAS_CPU_FEATURE (SSSE3),
                              __memcpy_chk_ssse3)
+             IFUNC_IMPL_ADD (array, i, __memcpy_chk, 1,
+                             __memcpy_chk_sse2_unaligned_2)
+             IFUNC_IMPL_ADD (array, i, __memcpy_chk, 1,
+                             __memcpy_chk_sse2_unaligned_erms)
              IFUNC_IMPL_ADD (array, i, __memcpy_chk, 1,
                              __memcpy_chk_sse2))
 
@@ -285,6 +334,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
              IFUNC_IMPL_ADD (array, i, memcpy,
                              HAS_ARCH_FEATURE (AVX_Usable),
                              __memcpy_avx_unaligned)
+             IFUNC_IMPL_ADD (array, i, memcpy,
+                             HAS_ARCH_FEATURE (AVX_Usable),
+                             __memcpy_avx_unaligned_2)
+             IFUNC_IMPL_ADD (array, i, memcpy,
+                             HAS_ARCH_FEATURE (AVX_Usable),
+                             __memcpy_avx_unaligned_erms)
              IFUNC_IMPL_ADD (array, i, memcpy, HAS_CPU_FEATURE (SSSE3),
                              __memcpy_ssse3_back)
              IFUNC_IMPL_ADD (array, i, memcpy, HAS_CPU_FEATURE (SSSE3),
@@ -293,8 +348,19 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
              IFUNC_IMPL_ADD (array, i, memcpy,
                              HAS_ARCH_FEATURE (AVX512F_Usable),
                              __memcpy_avx512_no_vzeroupper)
+             IFUNC_IMPL_ADD (array, i, memcpy,
+                             HAS_ARCH_FEATURE (AVX512F_Usable),
+                             __memcpy_avx512_unaligned_2)
+             IFUNC_IMPL_ADD (array, i, memcpy,
+                             HAS_ARCH_FEATURE (AVX512F_Usable),
+                             __memcpy_avx512_unaligned_erms)
 #endif
              IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_sse2_unaligned)
+             IFUNC_IMPL_ADD (array, i, memcpy, 1,
+                             __memcpy_sse2_unaligned_2)
+             IFUNC_IMPL_ADD (array, i, memcpy, 1,
+                             __memcpy_sse2_unaligned_erms)
+             IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_erms)
              IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_sse2))
 
   /* Support sysdeps/x86_64/multiarch/mempcpy_chk.S.  */
@@ -303,16 +369,32 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
              IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
                              HAS_ARCH_FEATURE (AVX512F_Usable),
                              __mempcpy_chk_avx512_no_vzeroupper)
+             IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
+                             HAS_ARCH_FEATURE (AVX512F_Usable),
+                             __mempcpy_chk_avx512_unaligned_2)
+             IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
+                             HAS_ARCH_FEATURE (AVX512F_Usable),
+                             __mempcpy_chk_avx512_unaligned_erms)
 #endif
              IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
                              HAS_ARCH_FEATURE (AVX_Usable),
                              __mempcpy_chk_avx_unaligned)
+             IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
+                             HAS_ARCH_FEATURE (AVX_Usable),
+                             __mempcpy_chk_avx_unaligned_2)
+             IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
+                             HAS_ARCH_FEATURE (AVX_Usable),
+                             __mempcpy_chk_avx_unaligned_erms)
              IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
                              HAS_CPU_FEATURE (SSSE3),
                              __mempcpy_chk_ssse3_back)
              IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
                              HAS_CPU_FEATURE (SSSE3),
                              __mempcpy_chk_ssse3)
+             IFUNC_IMPL_ADD (array, i, __mempcpy_chk, 1,
+                             __mempcpy_chk_sse2_unaligned_2)
+             IFUNC_IMPL_ADD (array, i, __mempcpy_chk, 1,
+                             __mempcpy_chk_sse2_unaligned_erms)
              IFUNC_IMPL_ADD (array, i, __mempcpy_chk, 1,
                              __mempcpy_chk_sse2))
 
@@ -322,14 +404,31 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
              IFUNC_IMPL_ADD (array, i, mempcpy,
                              HAS_ARCH_FEATURE (AVX512F_Usable),
                              __mempcpy_avx512_no_vzeroupper)
+             IFUNC_IMPL_ADD (array, i, mempcpy,
+                             HAS_ARCH_FEATURE (AVX512F_Usable),
+                             __mempcpy_avx512_unaligned_2)
+             IFUNC_IMPL_ADD (array, i, mempcpy,
+                             HAS_ARCH_FEATURE (AVX512F_Usable),
+                             __mempcpy_avx512_unaligned_erms)
 #endif
              IFUNC_IMPL_ADD (array, i, mempcpy,
                              HAS_ARCH_FEATURE (AVX_Usable),
                              __mempcpy_avx_unaligned)
+             IFUNC_IMPL_ADD (array, i, mempcpy,
+                             HAS_ARCH_FEATURE (AVX_Usable),
+                             __mempcpy_avx_unaligned_2)
+             IFUNC_IMPL_ADD (array, i, mempcpy,
+                             HAS_ARCH_FEATURE (AVX_Usable),
+                             __mempcpy_avx_unaligned_erms)
              IFUNC_IMPL_ADD (array, i, mempcpy, HAS_CPU_FEATURE (SSSE3),
                              __mempcpy_ssse3_back)
              IFUNC_IMPL_ADD (array, i, mempcpy, HAS_CPU_FEATURE (SSSE3),
                              __mempcpy_ssse3)
+             IFUNC_IMPL_ADD (array, i, mempcpy, 1,
+                             __mempcpy_sse2_unaligned_2)
+             IFUNC_IMPL_ADD (array, i, mempcpy, 1,
+                             __mempcpy_sse2_unaligned_erms)
+             IFUNC_IMPL_ADD (array, i, mempcpy, 1, __mempcpy_erms)
              IFUNC_IMPL_ADD (array, i, mempcpy, 1, __mempcpy_sse2))
 
   /* Support sysdeps/x86_64/multiarch/strncmp.S.  */
diff --git a/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S
new file mode 100644 (file)
index 0000000..3a72c7e
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S
@@ -0,0 +1,9 @@
+#define VEC_SIZE       32
+#define VEC(i)         ymm##i
+#define VMOVU          vmovdqu
+#define VMOVA          vmovdqa
+
+#define SECTION(p)             p##.avx
+#define MEMMOVE_SYMBOL(p,s)    p##_avx_##s
+
+#include "memmove-vec-unaligned-erms.S"
diff --git a/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S
new file mode 100644 (file)
index 0000000..38358fa
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S
@@ -0,0 +1,11 @@
+#ifdef HAVE_AVX512_ASM_SUPPORT
+# define VEC_SIZE      64
+# define VEC(i)                zmm##i
+# define VMOVU         vmovdqu64
+# define VMOVA         vmovdqa64
+
+# define SECTION(p)            p##.avx512
+# define MEMMOVE_SYMBOL(p,s)   p##_avx512_##s
+
+# include "memmove-vec-unaligned-erms.S"
+#endif
diff --git a/sysdeps/x86_64/multiarch/memmove-sse2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-sse2-unaligned-erms.S
new file mode 100644 (file)
index 0000000..52b9ae0
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memmove-sse2-unaligned-erms.S
@@ -0,0 +1,9 @@
+#define VEC_SIZE       16
+#define VEC(i)         xmm##i
+#define VMOVU          movdqu
+#define VMOVA          movdqa
+
+#define SECTION(p)             p
+#define MEMMOVE_SYMBOL(p,s)    p##_sse2_##s
+
+#include "memmove-vec-unaligned-erms.S"
diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
new file mode 100644 (file)
index 0000000..cf645dd
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
@@ -0,0 +1,462 @@
+/* memmove/memcpy/mempcpy with unaligned load/store and rep movsb
+   Copyright (C) 2016 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* memmove/memcpy/mempcpy is implemented as:
+   1. Use overlapping load and store to avoid branch.
+   2. Use 8-bit or 32-bit displacements for branches and nop paddings
+      to avoid long nop between instructions.
+   3. Load all sources into registers and store them together to avoid
+      possible address overlap between source and destination.
+   4. If size is 2 * VEC_SIZE or less, load all sources into registers
+      and store them together.
+   5. If there is no address overlap, copy from both ends with
+      4 * VEC_SIZE at a time.
+   6. If size is 8 * VEC_SIZE or less, load all sources into registers
+      and store them together.
+   7. If address of destination > address of source, backward copy
+      8 * VEC_SIZE at a time.
+   8. Otherwise, forward copy 8 * VEC_SIZE at a time.  */
+
+#if IS_IN (libc)
+
+# include <sysdep.h>
+# include "asm-syntax.h"
+
+# ifndef VZEROUPPER
+#  if VEC_SIZE > 16
+#   define VZEROUPPER vzeroupper
+#  else
+#   define VZEROUPPER
+#  endif
+# endif
+
+/* Threshold to use Enhanced REP MOVSB.  Since there is overhead to set
+   up REP MOVSB operation, REP MOVSB isn't faster on short data.  The
+   memcpy micro benchmark in glibc shows that 2KB is the approximate
+   value above which REP MOVSB becomes faster than SSE2 optimization
+   on processors with Enhanced REP MOVSB.  Since larger register size
+   can move more data with a single load and store, the threshold is
+   higher with larger register size.  */
+# ifndef REP_MOVSB_THRESHOLD
+#  define REP_MOVSB_THRESHOLD  (2048 * (VEC_SIZE / 16))
+# endif
+
+# ifndef SECTION
+#  error SECTION is not defined!
+# endif
+       .section SECTION(.text),"ax",@progbits
+
+# ifdef SHARED
+ENTRY (MEMMOVE_SYMBOL (__mempcpy_chk, unaligned_2))
+       cmpq    %rdx, %rcx
+       jb      HIDDEN_JUMPTARGET (__chk_fail)
+END (MEMMOVE_SYMBOL (__mempcpy_chk, unaligned_2))
+
+ENTRY (MEMMOVE_SYMBOL (__mempcpy, unaligned_2))
+       movq    %rdi, %rax
+       addq    %rdx, %rax
+       jmp     L(start)
+END (MEMMOVE_SYMBOL (__mempcpy, unaligned_2))
+
+ENTRY (MEMMOVE_SYMBOL (__memmove_chk, unaligned_2))
+       cmpq    %rdx, %rcx
+       jb      HIDDEN_JUMPTARGET (__chk_fail)
+END (MEMMOVE_SYMBOL (__memmove_chk, unaligned_2))
+# endif
+
+ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned_2))
+       movq    %rdi, %rax
+L(start):
+       cmpq    $VEC_SIZE, %rdx
+       jb      L(less_vec)
+       cmpq    $(VEC_SIZE * 2), %rdx
+       ja      L(more_2x_vec)
+       /* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
+       VMOVU   (%rsi), %VEC(0)
+       VMOVU   -VEC_SIZE(%rsi,%rdx), %VEC(1)
+       VMOVU   %VEC(0), (%rdi)
+       VMOVU   %VEC(1), -VEC_SIZE(%rdi,%rdx)
+       VZEROUPPER
+       ret
+END (MEMMOVE_SYMBOL (__memmove, unaligned_2))
+
+# ifdef SHARED
+ENTRY (MEMMOVE_SYMBOL (__mempcpy_chk, unaligned_erms))
+       cmpq    %rdx, %rcx
+       jb      HIDDEN_JUMPTARGET (__chk_fail)
+END (MEMMOVE_SYMBOL (__mempcpy_chk, unaligned_erms))
+# endif
+
+ENTRY (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms))
+       movq    %rdi, %rax
+       addq    %rdx, %rax
+       jmp     L(start_erms)
+END (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms))
+
+# ifdef SHARED
+ENTRY (MEMMOVE_SYMBOL (__memmove_chk, unaligned_erms))
+       cmpq    %rdx, %rcx
+       jb      HIDDEN_JUMPTARGET (__chk_fail)
+END (MEMMOVE_SYMBOL (__memmove_chk, unaligned_erms))
+# endif
+
+# if VEC_SIZE == 16
+/* Only used to measure performance of REP MOVSB.  */
+#  ifdef SHARED
+ENTRY (__mempcpy_erms)
+       movq    %rdi, %rax
+       addq    %rdx, %rax
+       jmp     L(movsb)
+END (__mempcpy_erms)
+#  endif
+
+ENTRY (__memmove_erms)
+       movq    %rdi, %rax
+       movq    %rdx, %rcx
+       cmpq    %rsi, %rdi
+       jbe     1f
+       leaq    (%rsi,%rcx), %rdx
+       cmpq    %rdx, %rdi
+       jb      L(movsb_backward)
+1:
+       rep movsb
+       ret
+L(movsb_backward):
+       leaq    -1(%rdi,%rcx), %rdi
+       leaq    -1(%rsi,%rcx), %rsi
+       std
+       rep movsb
+       cld
+       ret
+END (__memmove_erms)
+strong_alias (__memmove_erms, __memcpy_erms)
+# endif
+
+ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
+       movq    %rdi, %rax
+L(start_erms):
+       cmpq    $VEC_SIZE, %rdx
+       jb      L(less_vec)
+       cmpq    $(VEC_SIZE * 2), %rdx
+       ja      L(movsb_more_2x_vec)
+L(last_2x_vec):
+       /* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE. */
+       VMOVU   (%rsi), %VEC(0)
+       VMOVU   -VEC_SIZE(%rsi,%rdx), %VEC(1)
+       VMOVU   %VEC(0), (%rdi)
+       VMOVU   %VEC(1), -VEC_SIZE(%rdi,%rdx)
+L(return):
+       VZEROUPPER
+       ret
+
+L(movsb):
+       cmpq    %rsi, %rdi
+       je      L(nop)
+       jb      1f
+       leaq    (%rsi,%rdx), %r9
+       cmpq    %r9, %rdi
+       /* Avoid slow backward REP MOVSB.  */
+# if REP_MOVSB_THRESHOLD <= (VEC_SIZE * 8)
+#  error Unsupported REP_MOVSB_THRESHOLD and VEC_SIZE!
+# endif
+       jb      L(more_8x_vec_backward)
+1:
+       movq    %rdx, %rcx
+       rep movsb
+L(nop):
+       ret
+
+       .p2align 4
+L(movsb_more_2x_vec):
+       cmpq    $REP_MOVSB_THRESHOLD, %rdx
+       /* Force 32-bit displacement to avoid long nop between
+          instructions.  */
+       ja.d32  L(movsb)
+       .p2align 4
+L(more_2x_vec):
+       /* More than 2 * VEC.  */
+       cmpq    %rsi, %rdi
+       je      L(nop)
+       jb      L(copy_forward)
+       leaq    (%rsi,%rdx), %rcx
+       cmpq    %rcx, %rdi
+       jb      L(more_2x_vec_overlap)
+L(copy_forward):
+       leaq    (%rdi,%rdx), %rcx
+       cmpq    %rcx, %rsi
+       jb      L(more_2x_vec_overlap)
+       VMOVU   (%rsi), %VEC(0)
+       VMOVU   VEC_SIZE(%rsi), %VEC(1)
+       VMOVU   -VEC_SIZE(%rsi,%rdx), %VEC(2)
+       VMOVU   -(VEC_SIZE * 2)(%rsi,%rdx), %VEC(3)
+       VMOVU   %VEC(0), (%rdi)
+       VMOVU   %VEC(1), VEC_SIZE(%rdi)
+       VMOVU   %VEC(2), -VEC_SIZE(%rdi,%rdx)
+       VMOVU   %VEC(3), -(VEC_SIZE * 2)(%rdi,%rdx)
+       cmpq    $(VEC_SIZE * 4), %rdx
+       /* Force 32-bit displacement to avoid long nop between
+          instructions.  */
+       jbe.d32 L(return)
+       VMOVU   (VEC_SIZE * 2)(%rsi), %VEC(0)
+       VMOVU   (VEC_SIZE * 3)(%rsi), %VEC(1)
+       VMOVU   -(VEC_SIZE * 3)(%rsi,%rdx), %VEC(2)
+       VMOVU   -(VEC_SIZE * 4)(%rsi,%rdx), %VEC(3)
+       VMOVU   %VEC(0), (VEC_SIZE * 2)(%rdi)
+       VMOVU   %VEC(1), (VEC_SIZE * 3)(%rdi)
+       VMOVU   %VEC(2), -(VEC_SIZE * 3)(%rdi,%rdx)
+       VMOVU   %VEC(3), -(VEC_SIZE * 4)(%rdi,%rdx)
+       cmpq    $(VEC_SIZE * 8), %rdx
+# if  VEC_SIZE == 16
+       jbe     L(return)
+# else
+       /* Use 8-bit displacement to avoid long nop between
+          instructions.  */
+       jbe     L(return_disp8)
+# endif
+       leaq    (VEC_SIZE * 4)(%rdi), %rcx
+       addq    %rdi, %rdx
+       andq    $-(VEC_SIZE * 4), %rdx
+       andq    $-(VEC_SIZE * 4), %rcx
+       movq    %rcx, %r11
+       subq    %rdi, %r11
+       addq    %r11, %rsi
+       cmpq    %rdx, %rcx
+       /* Use 8-bit displacement to avoid long nop between
+          instructions.  */
+       je      L(return_disp8)
+       movq    %rsi, %r10
+       subq    %rcx, %r10
+       leaq    VEC_SIZE(%r10), %r9
+       leaq    (VEC_SIZE * 2)(%r10), %r8
+       leaq    (VEC_SIZE * 3)(%r10), %r11
+       .p2align 4
+L(loop):
+       VMOVU   (%rcx,%r10), %VEC(0)
+       VMOVU   (%rcx,%r9), %VEC(1)
+       VMOVU   (%rcx,%r8), %VEC(2)
+       VMOVU   (%rcx,%r11), %VEC(3)
+       VMOVA   %VEC(0), (%rcx)
+       VMOVA   %VEC(1), VEC_SIZE(%rcx)
+       VMOVA   %VEC(2), (VEC_SIZE * 2)(%rcx)
+       VMOVA   %VEC(3), (VEC_SIZE * 3)(%rcx)
+       addq    $(VEC_SIZE * 4), %rcx
+       cmpq    %rcx, %rdx
+       jne     L(loop)
+L(return_disp8):
+       VZEROUPPER
+       ret
+L(less_vec):
+       /* Less than 1 VEC.  */
+# if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
+#  error Unsupported VEC_SIZE!
+# endif
+# if VEC_SIZE > 32
+       cmpb    $32, %dl
+       jae     L(between_32_63)
+# endif
+# if VEC_SIZE > 16
+       cmpb    $16, %dl
+       jae     L(between_16_31)
+# endif
+       cmpb    $8, %dl
+       jae     L(between_8_15)
+       cmpb    $4, %dl
+       jae     L(between_4_7)
+       cmpb    $1, %dl
+       ja      L(between_2_3)
+       jb      1f
+       movzbl  (%rsi), %ecx
+       movb    %cl, (%rdi)
+1:
+       ret
+# if VEC_SIZE > 32
+L(between_32_63):
+       /* From 32 to 63.  No branch when size == 32.  */
+       vmovdqu (%rsi), %ymm0
+       vmovdqu -32(%rsi,%rdx), %ymm1
+       vmovdqu %ymm0, (%rdi)
+       vmovdqu %ymm1, -32(%rdi,%rdx)
+       VZEROUPPER
+       ret
+# endif
+# if VEC_SIZE > 16
+       /* From 16 to 31.  No branch when size == 16.  */
+L(between_16_31):
+       vmovdqu (%rsi), %xmm0
+       vmovdqu -16(%rsi,%rdx), %xmm1
+       vmovdqu %xmm0, (%rdi)
+       vmovdqu %xmm1, -16(%rdi,%rdx)
+       ret
+# endif
+L(between_8_15):
+       /* From 8 to 15.  No branch when size == 8.  */
+       movq    -8(%rsi,%rdx), %rcx
+       movq    (%rsi), %rsi
+       movq    %rcx, -8(%rdi,%rdx)
+       movq    %rsi, (%rdi)
+       ret
+L(between_4_7):
+       /* From 4 to 7.  No branch when size == 4.  */
+       movl    -4(%rsi,%rdx), %ecx
+       movl    (%rsi), %esi
+       movl    %ecx, -4(%rdi,%rdx)
+       movl    %esi, (%rdi)
+       ret
+L(between_2_3):
+       /* From 2 to 3.  No branch when size == 2.  */
+       movzwl  -2(%rsi,%rdx), %ecx
+       movzwl  (%rsi), %esi
+       movw    %cx, -2(%rdi,%rdx)
+       movw    %si, (%rdi)
+       ret
+
+# if VEC_SIZE > 16
+       /* Align to 16 bytes to avoid long nop between instructions.  */
+       .p2align 4
+# endif
+L(more_2x_vec_overlap):
+       /* More than 2 * VEC and there is overlap between destination
+          and source.  */
+       cmpq    $(VEC_SIZE * 8), %rdx
+       ja      L(more_8x_vec)
+       cmpq    $(VEC_SIZE * 4), %rdx
+       jb      L(last_4x_vec)
+L(between_4x_vec_and_8x_vec):
+       /* Copy from 4 * VEC to 8 * VEC, inclusively. */
+       VMOVU   (%rsi), %VEC(0)
+       VMOVU   VEC_SIZE(%rsi), %VEC(1)
+       VMOVU   (VEC_SIZE * 2)(%rsi), %VEC(2)
+       VMOVU   (VEC_SIZE * 3)(%rsi), %VEC(3)
+       VMOVU   -VEC_SIZE(%rsi,%rdx), %VEC(4)
+       VMOVU   -(VEC_SIZE * 2)(%rsi,%rdx), %VEC(5)
+       VMOVU   -(VEC_SIZE * 3)(%rsi,%rdx), %VEC(6)
+       VMOVU   -(VEC_SIZE * 4)(%rsi,%rdx), %VEC(7)
+       VMOVU   %VEC(0), (%rdi)
+       VMOVU   %VEC(1), VEC_SIZE(%rdi)
+       VMOVU   %VEC(2), (VEC_SIZE * 2)(%rdi)
+       VMOVU   %VEC(3), (VEC_SIZE * 3)(%rdi)
+       VMOVU   %VEC(4), -VEC_SIZE(%rdi,%rdx)
+       VMOVU   %VEC(5), -(VEC_SIZE * 2)(%rdi,%rdx)
+       VMOVU   %VEC(6), -(VEC_SIZE * 3)(%rdi,%rdx)
+       VMOVU   %VEC(7), -(VEC_SIZE * 4)(%rdi,%rdx)
+       VZEROUPPER
+       ret
+L(last_4x_vec):
+       /* Copy from 2 * VEC to 4 * VEC. */
+       VMOVU   (%rsi), %VEC(0)
+       VMOVU   VEC_SIZE(%rsi), %VEC(1)
+       VMOVU   -VEC_SIZE(%rsi,%rdx), %VEC(2)
+       VMOVU   -(VEC_SIZE * 2)(%rsi,%rdx), %VEC(3)
+       VMOVU   %VEC(0), (%rdi)
+       VMOVU   %VEC(1), VEC_SIZE(%rdi)
+       VMOVU   %VEC(2), -VEC_SIZE(%rdi,%rdx)
+       VMOVU   %VEC(3), -(VEC_SIZE * 2)(%rdi,%rdx)
+       VZEROUPPER
+       ret
+L(between_0_and_4x_vec):
+       /* Copy from 0 to 4 * VEC. */
+       cmpl    $(VEC_SIZE * 2), %edx
+       jae     L(last_4x_vec)
+       /* Copy from 0 to 2 * VEC. */
+       cmpl    $VEC_SIZE, %edx
+       jae     L(last_2x_vec)
+       /* Copy from 0 to VEC. */
+       VZEROUPPER
+       jmp     L(less_vec)
+L(more_8x_vec):
+       cmpq    %rsi, %rdi
+       ja      L(more_8x_vec_backward)
+
+       .p2align 4
+L(loop_8x_vec_forward):
+       /* Copy 8 * VEC at a time forward.  */
+       VMOVU   (%rsi), %VEC(0)
+       VMOVU   VEC_SIZE(%rsi), %VEC(1)
+       VMOVU   (VEC_SIZE * 2)(%rsi), %VEC(2)
+       VMOVU   (VEC_SIZE * 3)(%rsi), %VEC(3)
+       VMOVU   (VEC_SIZE * 4)(%rsi), %VEC(4)
+       VMOVU   (VEC_SIZE * 5)(%rsi), %VEC(5)
+       VMOVU   (VEC_SIZE * 6)(%rsi), %VEC(6)
+       VMOVU   (VEC_SIZE * 7)(%rsi), %VEC(7)
+       VMOVU   %VEC(0), (%rdi)
+       VMOVU   %VEC(1), VEC_SIZE(%rdi)
+       VMOVU   %VEC(2), (VEC_SIZE * 2)(%rdi)
+       VMOVU   %VEC(3), (VEC_SIZE * 3)(%rdi)
+       VMOVU   %VEC(4), (VEC_SIZE * 4)(%rdi)
+       VMOVU   %VEC(5), (VEC_SIZE * 5)(%rdi)
+       VMOVU   %VEC(6), (VEC_SIZE * 6)(%rdi)
+       VMOVU   %VEC(7), (VEC_SIZE * 7)(%rdi)
+       addq    $(VEC_SIZE * 8), %rdi
+       addq    $(VEC_SIZE * 8), %rsi
+       subq    $(VEC_SIZE * 8), %rdx
+       cmpq    $(VEC_SIZE * 8), %rdx
+       je      L(between_4x_vec_and_8x_vec)
+       ja      L(loop_8x_vec_forward)
+       /* Less than 8 * VEC to copy.  */
+       cmpq    $(VEC_SIZE * 4), %rdx
+       jb      L(between_0_and_4x_vec)
+       jmp     L(between_4x_vec_and_8x_vec)
+
+       .p2align 4
+L(more_8x_vec_backward):
+       leaq    -VEC_SIZE(%rsi, %rdx), %rcx
+       leaq    -VEC_SIZE(%rdi, %rdx), %r9
+
+       .p2align 4
+L(loop_8x_vec_backward):
+       /* Copy 8 * VEC at a time backward.  */
+       VMOVU   (%rcx), %VEC(0)
+       VMOVU   -VEC_SIZE(%rcx), %VEC(1)
+       VMOVU   -(VEC_SIZE * 2)(%rcx), %VEC(2)
+       VMOVU   -(VEC_SIZE * 3)(%rcx), %VEC(3)
+       VMOVU   -(VEC_SIZE * 4)(%rcx), %VEC(4)
+       VMOVU   -(VEC_SIZE * 5)(%rcx), %VEC(5)
+       VMOVU   -(VEC_SIZE * 6)(%rcx), %VEC(6)
+       VMOVU   -(VEC_SIZE * 7)(%rcx), %VEC(7)
+       VMOVU   %VEC(0), (%r9)
+       VMOVU   %VEC(1), -VEC_SIZE(%r9)
+       VMOVU   %VEC(2), -(VEC_SIZE * 2)(%r9)
+       VMOVU   %VEC(3), -(VEC_SIZE * 3)(%r9)
+       VMOVU   %VEC(4), -(VEC_SIZE * 4)(%r9)
+       VMOVU   %VEC(5), -(VEC_SIZE * 5)(%r9)
+       VMOVU   %VEC(6), -(VEC_SIZE * 6)(%r9)
+       VMOVU   %VEC(7), -(VEC_SIZE * 7)(%r9)
+       subq    $(VEC_SIZE * 8), %rcx
+       subq    $(VEC_SIZE * 8), %r9
+       subq    $(VEC_SIZE * 8), %rdx
+       cmpq    $(VEC_SIZE * 8), %rdx
+       je      L(between_4x_vec_and_8x_vec)
+       ja      L(loop_8x_vec_backward)
+       /* Less than 8 * VEC to copy.  */
+       cmpq    $(VEC_SIZE * 4), %rdx
+       jb      L(between_0_and_4x_vec)
+       jmp     L(between_4x_vec_and_8x_vec)
+END (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
+
+# ifdef SHARED
+strong_alias (MEMMOVE_SYMBOL (__memmove, unaligned_erms),
+             MEMMOVE_SYMBOL (__memcpy, unaligned_erms))
+strong_alias (MEMMOVE_SYMBOL (__memmove_chk, unaligned_erms),
+             MEMMOVE_SYMBOL (__memcpy_chk, unaligned_erms))
+strong_alias (MEMMOVE_SYMBOL (__memmove, unaligned_2),
+             MEMMOVE_SYMBOL (__memcpy, unaligned_2))
+strong_alias (MEMMOVE_SYMBOL (__memmove_chk, unaligned_2),
+             MEMMOVE_SYMBOL (__memcpy_chk, unaligned_2))
+# endif
+
+#endif
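
As a reading aid for the large-size paths above, here is a rough plain-C
model of the dispatch in MEMMOVE_SYMBOL (__memmove, unaligned_erms); the
helper functions are placeholders for the assembly code paths, the
threshold assumes 32-byte vectors, and the load-all path for sizes up to
8 * VEC_SIZE is omitted:

  #include <stddef.h>
  #include <stdint.h>
  #include <string.h>

  #define VEC_SIZE             32
  #define REP_MOVSB_THRESHOLD  (2048 * (VEC_SIZE / 16))

  /* Stand-ins for the assembly code paths; the real code uses vector
     loads/stores and REP MOVSB rather than these library calls.  */
  static void forward_vec_copy (void *d, const void *s, size_t n)  { memmove (d, s, n); }
  static void backward_vec_copy (void *d, const void *s, size_t n) { memmove (d, s, n); }
  static void forward_rep_movsb (void *d, const void *s, size_t n) { memmove (d, s, n); }

  static void
  move_more_2x_vec (unsigned char *dst, const unsigned char *src, size_t n)
  {
    uintptr_t d = (uintptr_t) dst, s = (uintptr_t) src;

    if (d == s)
      return;                           /* L(nop): nothing to copy */
    if (d > s && d < s + n)
      /* Destination starts inside the source, so the copy must run
         backward.  Backward REP MOVSB is slow, so use the backward
         vector loop, L(more_8x_vec_backward).  */
      backward_vec_copy (dst, src, n);
    else if (n > REP_MOVSB_THRESHOLD)
      forward_rep_movsb (dst, src, n);  /* L(movsb) */
    else
      forward_vec_copy (dst, src, n);   /* forward 4x/8x VEC copy loops */
  }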