#include "init-arch.h"
/* Maximum number of IFUNC implementations. */
-#define MAX_IFUNC 4
+#define MAX_IFUNC 5
/* Fill ARRAY of MAX elements with IFUNC implementations for function
NAME supported on target machine and return the number of valid entries.  */
__memcmp_ssse3)
IFUNC_IMPL_ADD (array, i, memcmp, 1, __memcmp_sse2))
- /* Support sysdeps/x86_64/multiarch/memmove_chk.S. */
+ /* Support sysdeps/x86_64/multiarch/memmove_chk.c. */
IFUNC_IMPL (i, name, __memmove_chk,
+ IFUNC_IMPL_ADD (array, i, __memmove_chk,
+ HAS_ARCH_FEATURE (AVX512F_Usable),
+ __memmove_chk_avx512_no_vzeroupper)
IFUNC_IMPL_ADD (array, i, __memmove_chk,
HAS_ARCH_FEATURE (AVX_Usable),
__memmove_chk_avx_unaligned)
IFUNC_IMPL_ADD (array, i, memmove,
HAS_ARCH_FEATURE (AVX_Usable),
__memmove_avx_unaligned)
+ IFUNC_IMPL_ADD (array, i, memmove,
+ HAS_ARCH_FEATURE (AVX512F_Usable),
+ __memmove_avx512_no_vzeroupper)
IFUNC_IMPL_ADD (array, i, memmove, HAS_CPU_FEATURE (SSSE3),
__memmove_ssse3_back)
IFUNC_IMPL_ADD (array, i, memmove, HAS_CPU_FEATURE (SSSE3),
#ifdef SHARED
/* Support sysdeps/x86_64/multiarch/memcpy_chk.S. */
IFUNC_IMPL (i, name, __memcpy_chk,
+ IFUNC_IMPL_ADD (array, i, __memcpy_chk,
+ HAS_ARCH_FEATURE (AVX512F_Usable),
+ __memcpy_chk_avx512_no_vzeroupper)
IFUNC_IMPL_ADD (array, i, __memcpy_chk,
HAS_ARCH_FEATURE (AVX_Usable),
__memcpy_chk_avx_unaligned)
__memcpy_ssse3_back)
IFUNC_IMPL_ADD (array, i, memcpy, HAS_CPU_FEATURE (SSSE3),
__memcpy_ssse3)
+ IFUNC_IMPL_ADD (array, i, memcpy,
+ HAS_ARCH_FEATURE (AVX512F_Usable),
+ __memcpy_avx512_no_vzeroupper)
IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_sse2_unaligned)
IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_sse2))
/* Support sysdeps/x86_64/multiarch/mempcpy_chk.S. */
IFUNC_IMPL (i, name, __mempcpy_chk,
+ IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
+ HAS_ARCH_FEATURE (AVX512F_Usable),
+ __mempcpy_chk_avx512_no_vzeroupper)
IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
HAS_ARCH_FEATURE (AVX_Usable),
__mempcpy_chk_avx_unaligned)
/* Support sysdeps/x86_64/multiarch/mempcpy.S. */
IFUNC_IMPL (i, name, mempcpy,
+ IFUNC_IMPL_ADD (array, i, mempcpy,
+ HAS_ARCH_FEATURE (AVX512F_Usable),
+ __mempcpy_avx512_no_vzeroupper)
IFUNC_IMPL_ADD (array, i, mempcpy,
HAS_ARCH_FEATURE (AVX_Usable),
__mempcpy_avx_unaligned)
--- /dev/null
+/* memcpy optimized with AVX512 for KNL hardware.
+ Copyright (C) 2016 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+#if defined HAVE_AVX512_ASM_SUPPORT && IS_IN (libc) \
+ && (defined SHARED \
+ || defined USE_AS_MEMMOVE \
+ || !defined USE_MULTIARCH)
+
+#include "asm-syntax.h"
+#ifndef MEMCPY
+# define MEMCPY __memcpy_avx512_no_vzeroupper
+# define MEMCPY_CHK __memcpy_chk_avx512_no_vzeroupper
+#endif
+
+ .section .text,"ax",@progbits
+#if !defined USE_AS_BCOPY
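+/* The __*_chk entry only checks that the destination buffer (size in %rcx)
+   can hold %rdx bytes, then falls through to the unchecked entry below.  */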
+ENTRY (MEMCPY_CHK)
+ cmpq %rdx, %rcx
+ jb HIDDEN_JUMPTARGET (__chk_fail)
+END (MEMCPY_CHK)
+#endif
+
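+/* %rdi = destination, %rsi = source, %rdx = length.  %rcx and %r9 are set to
+   one past the end of the source and destination so that tails can be copied
+   backwards from the end.  */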
+ENTRY (MEMCPY)
+ mov %rdi, %rax
+#ifdef USE_AS_MEMPCPY
+ add %rdx, %rax
+#endif
+ lea (%rsi, %rdx), %rcx
+ lea (%rdi, %rdx), %r9
+ cmp $512, %rdx
+ ja L(512bytesormore)
+
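+/* At most 512 bytes.  Sizes of 256..512 bytes are copied with four ZMM
+   registers from each end of the buffer; the two halves may overlap in the
+   middle, which is safe because every load happens before any store.  */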
+L(check):
+ cmp $16, %rdx
+ jbe L(less_16bytes)
+ cmp $256, %rdx
+ jb L(less_256bytes)
+ vmovups (%rsi), %zmm0
+ vmovups 0x40(%rsi), %zmm1
+ vmovups 0x80(%rsi), %zmm2
+ vmovups 0xC0(%rsi), %zmm3
+ vmovups -0x100(%rcx), %zmm4
+ vmovups -0xC0(%rcx), %zmm5
+ vmovups -0x80(%rcx), %zmm6
+ vmovups -0x40(%rcx), %zmm7
+ vmovups %zmm0, (%rdi)
+ vmovups %zmm1, 0x40(%rdi)
+ vmovups %zmm2, 0x80(%rdi)
+ vmovups %zmm3, 0xC0(%rdi)
+ vmovups %zmm4, -0x100(%r9)
+ vmovups %zmm5, -0xC0(%r9)
+ vmovups %zmm6, -0x80(%r9)
+ vmovups %zmm7, -0x40(%r9)
+ ret
+
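+/* The ladder below handles smaller sizes the same way: load one block from
+   each end of the buffer, then store both, shrinking from ZMM to YMM, XMM
+   and general registers as the size goes down.  */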
+L(less_256bytes):
+ cmp $128, %dl
+ jb L(less_128bytes)
+ vmovups (%rsi), %zmm0
+ vmovups 0x40(%rsi), %zmm1
+ vmovups -0x80(%rcx), %zmm2
+ vmovups -0x40(%rcx), %zmm3
+ vmovups %zmm0, (%rdi)
+ vmovups %zmm1, 0x40(%rdi)
+ vmovups %zmm2, -0x80(%r9)
+ vmovups %zmm3, -0x40(%r9)
+ ret
+
+L(less_128bytes):
+ cmp $64, %dl
+ jb L(less_64bytes)
+ vmovdqu (%rsi), %ymm0
+ vmovdqu 0x20(%rsi), %ymm1
+ vmovdqu -0x40(%rcx), %ymm2
+ vmovdqu -0x20(%rcx), %ymm3
+ vmovdqu %ymm0, (%rdi)
+ vmovdqu %ymm1, 0x20(%rdi)
+ vmovdqu %ymm2, -0x40(%r9)
+ vmovdqu %ymm3, -0x20(%r9)
+ ret
+
+L(less_64bytes):
+ cmp $32, %dl
+ jb L(less_32bytes)
+ vmovdqu (%rsi), %ymm0
+ vmovdqu -0x20(%rcx), %ymm1
+ vmovdqu %ymm0, (%rdi)
+ vmovdqu %ymm1, -0x20(%r9)
+ ret
+
+L(less_32bytes):
+ vmovdqu (%rsi), %xmm0
+ vmovdqu -0x10(%rcx), %xmm1
+ vmovdqu %xmm0, (%rdi)
+ vmovdqu %xmm1, -0x10(%r9)
+ ret
+
+L(less_16bytes):
+ cmp $8, %dl
+ jb L(less_8bytes)
+ movq (%rsi), %rsi
+ movq -0x8(%rcx), %rcx
+ movq %rsi, (%rdi)
+ movq %rcx, -0x8(%r9)
+ ret
+
+L(less_8bytes):
+ cmp $4, %dl
+ jb L(less_4bytes)
+ mov (%rsi), %esi
+ mov -0x4(%rcx), %ecx
+ mov %esi, (%rdi)
+ mov %ecx, -0x4(%r9)
+ ret
+
+L(less_4bytes):
+ cmp $2, %dl
+ jb L(less_2bytes)
+ mov (%rsi), %si
+ mov -0x2(%rcx), %cx
+ mov %si, (%rdi)
+ mov %cx, -0x2(%r9)
+ ret
+
+L(less_2bytes):
+ cmp $1, %dl
+ jb L(less_1bytes)
+ mov (%rsi), %cl
+ mov %cl, (%rdi)
+L(less_1bytes):
+ ret
+
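+/* More than 512 bytes.  Copies of at least half the shared cache size take
+   the non-temporal path at L(preloop_large); 512..1024 bytes are prefetched
+   and copied with sixteen ZMM registers; anything in between loops below.  */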
+L(512bytesormore):
+#ifdef SHARED_CACHE_SIZE_HALF
+ mov $SHARED_CACHE_SIZE_HALF, %r8
+#else
+ mov __x86_shared_cache_size_half(%rip), %r8
+#endif
+ cmp %r8, %rdx
+ jae L(preloop_large)
+ cmp $1024, %rdx
+ ja L(1024bytesormore)
+ prefetcht1 (%rsi)
+ prefetcht1 0x40(%rsi)
+ prefetcht1 0x80(%rsi)
+ prefetcht1 0xC0(%rsi)
+ prefetcht1 0x100(%rsi)
+ prefetcht1 0x140(%rsi)
+ prefetcht1 0x180(%rsi)
+ prefetcht1 0x1C0(%rsi)
+ prefetcht1 -0x200(%rcx)
+ prefetcht1 -0x1C0(%rcx)
+ prefetcht1 -0x180(%rcx)
+ prefetcht1 -0x140(%rcx)
+ prefetcht1 -0x100(%rcx)
+ prefetcht1 -0xC0(%rcx)
+ prefetcht1 -0x80(%rcx)
+ prefetcht1 -0x40(%rcx)
+ vmovups (%rsi), %zmm0
+ vmovups 0x40(%rsi), %zmm1
+ vmovups 0x80(%rsi), %zmm2
+ vmovups 0xC0(%rsi), %zmm3
+ vmovups 0x100(%rsi), %zmm4
+ vmovups 0x140(%rsi), %zmm5
+ vmovups 0x180(%rsi), %zmm6
+ vmovups 0x1C0(%rsi), %zmm7
+ vmovups -0x200(%rcx), %zmm8
+ vmovups -0x1C0(%rcx), %zmm9
+ vmovups -0x180(%rcx), %zmm10
+ vmovups -0x140(%rcx), %zmm11
+ vmovups -0x100(%rcx), %zmm12
+ vmovups -0xC0(%rcx), %zmm13
+ vmovups -0x80(%rcx), %zmm14
+ vmovups -0x40(%rcx), %zmm15
+ vmovups %zmm0, (%rdi)
+ vmovups %zmm1, 0x40(%rdi)
+ vmovups %zmm2, 0x80(%rdi)
+ vmovups %zmm3, 0xC0(%rdi)
+ vmovups %zmm4, 0x100(%rdi)
+ vmovups %zmm5, 0x140(%rdi)
+ vmovups %zmm6, 0x180(%rdi)
+ vmovups %zmm7, 0x1C0(%rdi)
+ vmovups %zmm8, -0x200(%r9)
+ vmovups %zmm9, -0x1C0(%r9)
+ vmovups %zmm10, -0x180(%r9)
+ vmovups %zmm11, -0x140(%r9)
+ vmovups %zmm12, -0x100(%r9)
+ vmovups %zmm13, -0xC0(%r9)
+ vmovups %zmm14, -0x80(%r9)
+ vmovups %zmm15, -0x40(%r9)
+ ret
+
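+/* 1024 bytes up to half the shared cache size.  The copy direction is chosen
+   from the relative order of destination and source so that overlapping
+   memmove calls stay correct.  Forward case: preload the last 512 bytes into
+   %zmm8-%zmm15 and store them after the 512-bytes-per-iteration loop.  */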
+L(1024bytesormore):
+ cmp %rsi, %rdi
+ ja L(1024bytesormore_bkw)
+ sub $512, %r9
+ vmovups -0x200(%rcx), %zmm8
+ vmovups -0x1C0(%rcx), %zmm9
+ vmovups -0x180(%rcx), %zmm10
+ vmovups -0x140(%rcx), %zmm11
+ vmovups -0x100(%rcx), %zmm12
+ vmovups -0xC0(%rcx), %zmm13
+ vmovups -0x80(%rcx), %zmm14
+ vmovups -0x40(%rcx), %zmm15
+ prefetcht1 (%rsi)
+ prefetcht1 0x40(%rsi)
+ prefetcht1 0x80(%rsi)
+ prefetcht1 0xC0(%rsi)
+ prefetcht1 0x100(%rsi)
+ prefetcht1 0x140(%rsi)
+ prefetcht1 0x180(%rsi)
+ prefetcht1 0x1C0(%rsi)
+
+/* Loop with unaligned memory access. */
+L(gobble_512bytes_loop):
+ vmovups (%rsi), %zmm0
+ vmovups 0x40(%rsi), %zmm1
+ vmovups 0x80(%rsi), %zmm2
+ vmovups 0xC0(%rsi), %zmm3
+ vmovups 0x100(%rsi), %zmm4
+ vmovups 0x140(%rsi), %zmm5
+ vmovups 0x180(%rsi), %zmm6
+ vmovups 0x1C0(%rsi), %zmm7
+ add $512, %rsi
+ prefetcht1 (%rsi)
+ prefetcht1 0x40(%rsi)
+ prefetcht1 0x80(%rsi)
+ prefetcht1 0xC0(%rsi)
+ prefetcht1 0x100(%rsi)
+ prefetcht1 0x140(%rsi)
+ prefetcht1 0x180(%rsi)
+ prefetcht1 0x1C0(%rsi)
+ vmovups %zmm0, (%rdi)
+ vmovups %zmm1, 0x40(%rdi)
+ vmovups %zmm2, 0x80(%rdi)
+ vmovups %zmm3, 0xC0(%rdi)
+ vmovups %zmm4, 0x100(%rdi)
+ vmovups %zmm5, 0x140(%rdi)
+ vmovups %zmm6, 0x180(%rdi)
+ vmovups %zmm7, 0x1C0(%rdi)
+ add $512, %rdi
+ cmp %r9, %rdi
+ jb L(gobble_512bytes_loop)
+ vmovups %zmm8, (%r9)
+ vmovups %zmm9, 0x40(%r9)
+ vmovups %zmm10, 0x80(%r9)
+ vmovups %zmm11, 0xC0(%r9)
+ vmovups %zmm12, 0x100(%r9)
+ vmovups %zmm13, 0x140(%r9)
+ vmovups %zmm14, 0x180(%r9)
+ vmovups %zmm15, 0x1C0(%r9)
+ ret
+
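+/* Backward case (destination above source): preload the first 512 bytes into
+   %zmm8-%zmm15, copy 512-byte chunks from the end towards the start, then
+   store the preloaded head last.  */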
+L(1024bytesormore_bkw):
+ add $512, %rdi
+ vmovups 0x1C0(%rsi), %zmm8
+ vmovups 0x180(%rsi), %zmm9
+ vmovups 0x140(%rsi), %zmm10
+ vmovups 0x100(%rsi), %zmm11
+ vmovups 0xC0(%rsi), %zmm12
+ vmovups 0x80(%rsi), %zmm13
+ vmovups 0x40(%rsi), %zmm14
+ vmovups (%rsi), %zmm15
+ prefetcht1 -0x40(%rcx)
+ prefetcht1 -0x80(%rcx)
+ prefetcht1 -0xC0(%rcx)
+ prefetcht1 -0x100(%rcx)
+ prefetcht1 -0x140(%rcx)
+ prefetcht1 -0x180(%rcx)
+ prefetcht1 -0x1C0(%rcx)
+ prefetcht1 -0x200(%rcx)
+
+/* Backward loop with unaligned memory access. */
+L(gobble_512bytes_loop_bkw):
+ vmovups -0x40(%rcx), %zmm0
+ vmovups -0x80(%rcx), %zmm1
+ vmovups -0xC0(%rcx), %zmm2
+ vmovups -0x100(%rcx), %zmm3
+ vmovups -0x140(%rcx), %zmm4
+ vmovups -0x180(%rcx), %zmm5
+ vmovups -0x1C0(%rcx), %zmm6
+ vmovups -0x200(%rcx), %zmm7
+ sub $512, %rcx
+ prefetcht1 -0x40(%rcx)
+ prefetcht1 -0x80(%rcx)
+ prefetcht1 -0xC0(%rcx)
+ prefetcht1 -0x100(%rcx)
+ prefetcht1 -0x140(%rcx)
+ prefetcht1 -0x180(%rcx)
+ prefetcht1 -0x1C0(%rcx)
+ prefetcht1 -0x200(%rcx)
+ vmovups %zmm0, -0x40(%r9)
+ vmovups %zmm1, -0x80(%r9)
+ vmovups %zmm2, -0xC0(%r9)
+ vmovups %zmm3, -0x100(%r9)
+ vmovups %zmm4, -0x140(%r9)
+ vmovups %zmm5, -0x180(%r9)
+ vmovups %zmm6, -0x1C0(%r9)
+ vmovups %zmm7, -0x200(%r9)
+ sub $512, %r9
+ cmp %rdi, %r9
+ ja L(gobble_512bytes_loop_bkw)
+ vmovups %zmm8, -0x40(%rdi)
+ vmovups %zmm9, -0x80(%rdi)
+ vmovups %zmm10, -0xC0(%rdi)
+ vmovups %zmm11, -0x100(%rdi)
+ vmovups %zmm12, -0x140(%rdi)
+ vmovups %zmm13, -0x180(%rdi)
+ vmovups %zmm14, -0x1C0(%rdi)
+ vmovups %zmm15, -0x200(%rdi)
+ ret
+
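+/* At least half the shared cache size: stream the copy with non-temporal
+   stores so it does not evict the cache.  The first 128 bytes are saved in
+   %zmm4/%zmm5 and written last, once the destination has been rounded up to
+   a 128-byte boundary; the remaining tail is finished via L(check).  */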
+L(preloop_large):
+ cmp %rsi, %rdi
+ ja L(preloop_large_bkw)
+ vmovups (%rsi), %zmm4
+ vmovups 0x40(%rsi), %zmm5
+
+/* Keep the original destination for the final head store; %rax cannot be
+   used here because for mempcpy it already holds dst + length.  */
+ mov %rdi, %r11
+
+/* Align destination for access with non-temporal stores in the loop. */
+ mov %rdi, %r8
+ and $-0x80, %rdi
+ add $0x80, %rdi
+ sub %rdi, %r8
+ sub %r8, %rsi
+ add %r8, %rdx
+ prefetcht1 (%rsi)
+ prefetcht1 0x40(%rsi)
+ prefetcht1 0x80(%rsi)
+ prefetcht1 0xC0(%rsi)
+L(gobble_256bytes_nt_loop):
+ vmovups (%rsi), %zmm0
+ prefetcht1 0x100(%rsi)
+ vmovups 0x40(%rsi), %zmm1
+ prefetcht1 0x140(%rsi)
+ vmovups 0x80(%rsi), %zmm2
+ prefetcht1 0x180(%rsi)
+ vmovups 0xC0(%rsi), %zmm3
+ prefetcht1 0x1C0(%rsi)
+ vmovntdq %zmm0, (%rdi)
+ vmovntdq %zmm1, 0x40(%rdi)
+ vmovntdq %zmm2, 0x80(%rdi)
+ vmovntdq %zmm3, 0xC0(%rdi)
+ sub $256, %rdx
+ add $256, %rsi
+ add $256, %rdi
+ cmp $256, %rdx
+ ja L(gobble_256bytes_nt_loop)
+ sfence
+ vmovups %zmm4, (%r11)
+ vmovups %zmm5, 0x40(%r11)
+ jmp L(check)
+
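+/* Backward variant of the non-temporal path: save the last 128 bytes in
+   %zmm4/%zmm5, round the end of the destination down to a 128-byte boundary
+   and stream 256-byte chunks from the end towards the start.  */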
+L(preloop_large_bkw):
+ vmovups -0x80(%rcx), %zmm4
+ vmovups -0x40(%rcx), %zmm5
+
+/* Align end of destination for access with non-temporal stores. */
+ mov %r9, %r8
+ and $-0x80, %r9
+ sub %r9, %r8
+ sub %r8, %rcx
+ sub %r8, %rdx
+ add %r9, %r8
+ prefetcht1 -0x100(%rcx)
+ prefetcht1 -0xC0(%rcx)
+ prefetcht1 -0x80(%rcx)
+ prefetcht1 -0x40(%rcx)
+L(gobble_256bytes_nt_loop_bkw):
+ vmovups -0x100(%rcx), %zmm0
+ prefetcht1 -0x200(%rcx)
+ vmovups -0xC0(%rcx), %zmm1
+ prefetcht1 -0x1C0(%rcx)
+ vmovups -0x80(%rcx), %zmm2
+ prefetcht1 -0x180(%rcx)
+ vmovups -0x40(%rcx), %zmm3
+ prefetcht1 -0x140(%rcx)
+ vmovntdq %zmm0, -0x100(%r9)
+ vmovntdq %zmm1, -0xC0(%r9)
+ vmovntdq %zmm2, -0x80(%r9)
+ vmovntdq %zmm3, -0x40(%r9)
+ sub $256, %rdx
+ sub $256, %rcx
+ sub $256, %r9
+ cmp $256, %rdx
+ ja L(gobble_256bytes_nt_loop_bkw)
+ sfence
+ vmovups %zmm4, -0x80(%r8)
+ vmovups %zmm5, -0x40(%r8)
+ jmp L(check)
+
+END (MEMCPY)
+
+#endif