/* memmove/memcpy/mempcpy with unaligned load/store and rep movsb
   Copyright (C) 2016-2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

/* memmove/memcpy/mempcpy is implemented as:
   1. Use overlapping loads and stores to avoid branching on the
      exact size.
   2. Load all sources into registers and store them together to avoid
      possible address overlap between source and destination.
   3. If size is 8 * VEC_SIZE or less, load all sources into registers
      and store them together.
   4. If address of destination > address of source, backward copy
      4 * VEC_SIZE at a time with unaligned load and aligned store.
      Load the first 4 * VEC and last VEC before the loop and store
      them after the loop to support overlapping addresses.
   5. Otherwise, forward copy 4 * VEC_SIZE at a time with unaligned
      load and aligned store.  Load the last 4 * VEC and first VEC
      before the loop and store them after the loop to support
      overlapping addresses.
   6. If size >= __x86_shared_non_temporal_threshold and there is no
      overlap between destination and source, use non-temporal store
      instead of aligned store.
   An illustrative C sketch of technique 1 follows this comment.  */

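/* Illustrative only, not part of the build: a C sketch of technique 1
   above, assuming a 16-byte vector and SSE2 intrinsics.  For SIZE in
   [VEC_SIZE, 2 * VEC_SIZE], loading one vector from each end and
   storing one vector to each end covers every byte without branching
   on the exact size, and tolerates overlap between source and
   destination because both loads complete before either store:

	#include <emmintrin.h>
	#include <stddef.h>

	static void
	copy_16_to_32 (char *dst, const char *src, size_t size)
	{
	  __m128i head = _mm_loadu_si128 ((const __m128i *) src);
	  __m128i tail
	    = _mm_loadu_si128 ((const __m128i *) (src + size - 16));
	  _mm_storeu_si128 ((__m128i *) dst, head);
	  _mm_storeu_si128 ((__m128i *) (dst + size - 16), tail);
	}

   When SIZE == 16 the two vectors coincide; when SIZE == 32 they
   abut.  L(last_2x_vec) below is the assembly form of this idea.  */
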
#include <sysdep.h>

#ifndef MEMCPY_SYMBOL
# define MEMCPY_SYMBOL(p,s)	MEMMOVE_SYMBOL(p, s)
#endif

#ifndef MEMPCPY_SYMBOL
# define MEMPCPY_SYMBOL(p,s)	MEMMOVE_SYMBOL(p, s)
#endif

#ifndef MEMMOVE_CHK_SYMBOL
# define MEMMOVE_CHK_SYMBOL(p,s)	MEMMOVE_SYMBOL(p, s)
#endif

#ifndef VZEROUPPER
# if VEC_SIZE > 16
#  define VZEROUPPER vzeroupper
# else
#  define VZEROUPPER
# endif
#endif

#ifndef PREFETCH
# define PREFETCH(addr) prefetcht0 addr
#endif

/* Assume 64-byte prefetch size.  */
#ifndef PREFETCH_SIZE
# define PREFETCH_SIZE 64
#endif

#define PREFETCHED_LOAD_SIZE (VEC_SIZE * 4)

#if PREFETCH_SIZE == 64
# if PREFETCHED_LOAD_SIZE == PREFETCH_SIZE
#  define PREFETCH_ONE_SET(dir, base, offset) \
	PREFETCH ((offset)base)
# elif PREFETCHED_LOAD_SIZE == 2 * PREFETCH_SIZE
#  define PREFETCH_ONE_SET(dir, base, offset) \
	PREFETCH ((offset)base); \
	PREFETCH ((offset + dir * PREFETCH_SIZE)base)
# elif PREFETCHED_LOAD_SIZE == 4 * PREFETCH_SIZE
#  define PREFETCH_ONE_SET(dir, base, offset) \
	PREFETCH ((offset)base); \
	PREFETCH ((offset + dir * PREFETCH_SIZE)base); \
	PREFETCH ((offset + dir * PREFETCH_SIZE * 2)base); \
	PREFETCH ((offset + dir * PREFETCH_SIZE * 3)base)
# else
#  error Unsupported PREFETCHED_LOAD_SIZE!
# endif
#else
# error Unsupported PREFETCH_SIZE!
#endif
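/* For illustration: with VEC_SIZE == 32, PREFETCHED_LOAD_SIZE is 128
   == 2 * PREFETCH_SIZE, so PREFETCH_ONE_SET (1, (%rsi), 256) expands
   to prefetches of the cache lines at offsets 256 and 320 from %rsi,
   i.e. one prefetch per 64-byte line of a 4 * VEC block, stepped in
   the direction DIR.  */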

#ifndef SECTION
# error SECTION is not defined!
#endif

	.section SECTION(.text),"ax",@progbits
#if defined SHARED && IS_IN (libc)
ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned))
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned))
#endif

ENTRY (MEMPCPY_SYMBOL (__mempcpy, unaligned))
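	/* mempcpy returns the end of the destination, DST + LEN.  */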
	mov	%RDI_LP, %RAX_LP
	add	%RDX_LP, %RAX_LP
	jmp	L(start)
END (MEMPCPY_SYMBOL (__mempcpy, unaligned))

#if defined SHARED && IS_IN (libc)
ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned))
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned))
#endif

ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned))
	movq	%rdi, %rax
L(start):
# ifdef __ILP32__
	/* Clear the upper 32 bits.  */
	movl	%edx, %edx
# endif
	cmp	$VEC_SIZE, %RDX_LP
	jb	L(less_vec)
	cmp	$(VEC_SIZE * 2), %RDX_LP
	ja	L(more_2x_vec)
#if !defined USE_MULTIARCH || !IS_IN (libc)
L(last_2x_vec):
#endif
	/* From VEC_SIZE to 2 * VEC_SIZE bytes.  No branch when size
	   == VEC_SIZE.  */
	VMOVU	(%rsi), %VEC(0)
	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(1)
	VMOVU	%VEC(0), (%rdi)
	VMOVU	%VEC(1), -VEC_SIZE(%rdi,%rdx)
	VZEROUPPER
#if !defined USE_MULTIARCH || !IS_IN (libc)
L(nop):
#endif
	ret
#if defined USE_MULTIARCH && IS_IN (libc)
END (MEMMOVE_SYMBOL (__memmove, unaligned))

# if VEC_SIZE == 16
ENTRY (__mempcpy_chk_erms)
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (__mempcpy_chk_erms)

/* Only used to measure performance of REP MOVSB.  */
ENTRY (__mempcpy_erms)
	mov	%RDI_LP, %RAX_LP
	/* Skip zero length.  */
	test	%RDX_LP, %RDX_LP
	jz	2f
	add	%RDX_LP, %RAX_LP
	jmp	L(start_movsb)
END (__mempcpy_erms)

ENTRY (__memmove_chk_erms)
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (__memmove_chk_erms)

ENTRY (__memmove_erms)
	movq	%rdi, %rax
	/* Skip zero length.  */
	test	%RDX_LP, %RDX_LP
	jz	2f
L(start_movsb):
	mov	%RDX_LP, %RCX_LP
	cmp	%RSI_LP, %RDI_LP
	jb	1f
	/* Source == destination is less common.  */
	je	2f
	lea	(%rsi,%rcx), %RDX_LP
	cmp	%RDX_LP, %RDI_LP
	jb	L(movsb_backward)
1:
	rep movsb
2:
	ret
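	/* Backward REP MOVSB: with the direction flag set, MOVSB
	   decrements RSI/RDI, so both are pointed at the last byte of
	   their buffers.  Used when DST overlaps SRC from above.  */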
L(movsb_backward):
	leaq	-1(%rdi,%rcx), %rdi
	leaq	-1(%rsi,%rcx), %rsi
	std
	rep movsb
	cld
	ret
END (__memmove_erms)
strong_alias (__memmove_erms, __memcpy_erms)
strong_alias (__memmove_chk_erms, __memcpy_chk_erms)
# endif

# ifdef SHARED
ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms))
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms))
# endif

ENTRY (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms))
	mov	%RDI_LP, %RAX_LP
	add	%RDX_LP, %RAX_LP
	jmp	L(start_erms)
END (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms))

# ifdef SHARED
ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
# endif

ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
	movq	%rdi, %rax
L(start_erms):
# ifdef __ILP32__
	/* Clear the upper 32 bits.  */
	movl	%edx, %edx
# endif
	cmp	$VEC_SIZE, %RDX_LP
	jb	L(less_vec)
	cmp	$(VEC_SIZE * 2), %RDX_LP
	ja	L(movsb_more_2x_vec)
L(last_2x_vec):
	/* From VEC_SIZE to 2 * VEC_SIZE bytes.  No branch when size
	   == VEC_SIZE.  */
	VMOVU	(%rsi), %VEC(0)
	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(1)
	VMOVU	%VEC(0), (%rdi)
	VMOVU	%VEC(1), -VEC_SIZE(%rdi,%rdx)
L(return):
	VZEROUPPER
	ret

L(movsb):
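	/* Reached only when SIZE > __x86_rep_movsb_threshold.  Above
	   the shared non-temporal threshold the 8x VEC loops (with
	   non-temporal stores) are preferred over REP MOVSB.  */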
	cmp	__x86_shared_non_temporal_threshold(%rip), %RDX_LP
	jae	L(more_8x_vec)
	cmpq	%rsi, %rdi
	jb	1f
	/* Source == destination is less common.  */
	je	L(nop)
	leaq	(%rsi,%rdx), %r9
	cmpq	%r9, %rdi
	/* Avoid slow backward REP MOVSB.  */
	jb	L(more_8x_vec_backward)
1:
	mov	%RDX_LP, %RCX_LP
	rep movsb
L(nop):
	ret
#endif

L(less_vec):
	/* Less than 1 VEC.  */
#if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
# error Unsupported VEC_SIZE!
#endif
#if VEC_SIZE > 32
	cmpb	$32, %dl
	jae	L(between_32_63)
#endif
#if VEC_SIZE > 16
	cmpb	$16, %dl
	jae	L(between_16_31)
#endif
	cmpb	$8, %dl
	jae	L(between_8_15)
	cmpb	$4, %dl
	jae	L(between_4_7)
	cmpb	$1, %dl
	ja	L(between_2_3)
	jb	1f
	movzbl	(%rsi), %ecx
	movb	%cl, (%rdi)
1:
	ret
#if VEC_SIZE > 32
L(between_32_63):
	/* From 32 to 63.  No branch when size == 32.  */
	vmovdqu	(%rsi), %ymm0
	vmovdqu	-32(%rsi,%rdx), %ymm1
	vmovdqu	%ymm0, (%rdi)
	vmovdqu	%ymm1, -32(%rdi,%rdx)
	VZEROUPPER
	ret
#endif
#if VEC_SIZE > 16
/* From 16 to 31.  No branch when size == 16.  */
L(between_16_31):
	vmovdqu	(%rsi), %xmm0
	vmovdqu	-16(%rsi,%rdx), %xmm1
	vmovdqu	%xmm0, (%rdi)
	vmovdqu	%xmm1, -16(%rdi,%rdx)
	ret
#endif
L(between_8_15):
	/* From 8 to 15.  No branch when size == 8.  */
	movq	-8(%rsi,%rdx), %rcx
	movq	(%rsi), %rsi
	movq	%rcx, -8(%rdi,%rdx)
	movq	%rsi, (%rdi)
	ret
L(between_4_7):
	/* From 4 to 7.  No branch when size == 4.  */
	movl	-4(%rsi,%rdx), %ecx
	movl	(%rsi), %esi
	movl	%ecx, -4(%rdi,%rdx)
	movl	%esi, (%rdi)
	ret
L(between_2_3):
	/* From 2 to 3.  No branch when size == 2.  */
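	/* E.g. SIZE == 3: the two word loads below read bytes 0-1 and
	   1-2, and the two word stores rewrite byte 1, so all three
	   bytes are copied without a branch on the exact size.  */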
	movzwl	-2(%rsi,%rdx), %ecx
	movzwl	(%rsi), %esi
	movw	%cx, -2(%rdi,%rdx)
	movw	%si, (%rdi)
	ret

#if defined USE_MULTIARCH && IS_IN (libc)
L(movsb_more_2x_vec):
	cmp	__x86_rep_movsb_threshold(%rip), %RDX_LP
	ja	L(movsb)
#endif
L(more_2x_vec):
	/* More than 2 * VEC and there may be overlap between destination
	   and source.  */
	cmpq	$(VEC_SIZE * 8), %rdx
	ja	L(more_8x_vec)
	cmpq	$(VEC_SIZE * 4), %rdx
	jb	L(last_4x_vec)
	/* Copy from 4 * VEC to 8 * VEC, inclusive.  */
	VMOVU	(%rsi), %VEC(0)
	VMOVU	VEC_SIZE(%rsi), %VEC(1)
	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(4)
	VMOVU	-(VEC_SIZE * 2)(%rsi,%rdx), %VEC(5)
	VMOVU	-(VEC_SIZE * 3)(%rsi,%rdx), %VEC(6)
	VMOVU	-(VEC_SIZE * 4)(%rsi,%rdx), %VEC(7)
	VMOVU	%VEC(0), (%rdi)
	VMOVU	%VEC(1), VEC_SIZE(%rdi)
	VMOVU	%VEC(2), (VEC_SIZE * 2)(%rdi)
	VMOVU	%VEC(3), (VEC_SIZE * 3)(%rdi)
	VMOVU	%VEC(4), -VEC_SIZE(%rdi,%rdx)
	VMOVU	%VEC(5), -(VEC_SIZE * 2)(%rdi,%rdx)
	VMOVU	%VEC(6), -(VEC_SIZE * 3)(%rdi,%rdx)
	VMOVU	%VEC(7), -(VEC_SIZE * 4)(%rdi,%rdx)
	VZEROUPPER
	ret
L(last_4x_vec):
	/* Copy from 2 * VEC to 4 * VEC.  */
	VMOVU	(%rsi), %VEC(0)
	VMOVU	VEC_SIZE(%rsi), %VEC(1)
	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(2)
	VMOVU	-(VEC_SIZE * 2)(%rsi,%rdx), %VEC(3)
	VMOVU	%VEC(0), (%rdi)
	VMOVU	%VEC(1), VEC_SIZE(%rdi)
	VMOVU	%VEC(2), -VEC_SIZE(%rdi,%rdx)
	VMOVU	%VEC(3), -(VEC_SIZE * 2)(%rdi,%rdx)
	VZEROUPPER
	ret

L(more_8x_vec):
	cmpq	%rsi, %rdi
	ja	L(more_8x_vec_backward)
	/* Source == destination is less common.  */
	je	L(nop)
	/* Load the first VEC and last 4 * VEC to support overlapping
	   addresses.  */
	VMOVU	(%rsi), %VEC(4)
	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(5)
	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VEC(6)
	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx), %VEC(7)
	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx), %VEC(8)
	/* Save start and stop of the destination buffer.  */
	movq	%rdi, %r11
	leaq	-VEC_SIZE(%rdi, %rdx), %rcx
	/* Align destination for aligned stores in the loop.  Compute
	   how much destination is misaligned.  */
	movq	%rdi, %r8
	andq	$(VEC_SIZE - 1), %r8
	/* Get the negative of offset for alignment.  */
	subq	$VEC_SIZE, %r8
	/* Adjust source.  */
	subq	%r8, %rsi
	/* Adjust destination which should be aligned now.  */
	subq	%r8, %rdi
	/* Adjust length.  */
	addq	%r8, %rdx
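	/* Worked example (illustration): with VEC_SIZE == 32 and
	   RDI % 32 == 5, R8 == 5 - 32 == -27, so RSI and RDI advance
	   by 27 bytes to the next 32-byte destination boundary and RDX
	   shrinks by 27.  The skipped head bytes are already in VEC(4)
	   and are stored after the loop.  */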
#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
	/* Check non-temporal store threshold.  */
	cmp	__x86_shared_non_temporal_threshold(%rip), %RDX_LP
	ja	L(large_forward)
#endif
L(loop_4x_vec_forward):
	/* Copy 4 * VEC at a time forward.  */
	VMOVU	(%rsi), %VEC(0)
	VMOVU	VEC_SIZE(%rsi), %VEC(1)
	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
	addq	$(VEC_SIZE * 4), %rsi
	subq	$(VEC_SIZE * 4), %rdx
	VMOVA	%VEC(0), (%rdi)
	VMOVA	%VEC(1), VEC_SIZE(%rdi)
	VMOVA	%VEC(2), (VEC_SIZE * 2)(%rdi)
	VMOVA	%VEC(3), (VEC_SIZE * 3)(%rdi)
	addq	$(VEC_SIZE * 4), %rdi
	cmpq	$(VEC_SIZE * 4), %rdx
	ja	L(loop_4x_vec_forward)
	/* Store the last 4 * VEC.  */
	VMOVU	%VEC(5), (%rcx)
	VMOVU	%VEC(6), -VEC_SIZE(%rcx)
	VMOVU	%VEC(7), -(VEC_SIZE * 2)(%rcx)
	VMOVU	%VEC(8), -(VEC_SIZE * 3)(%rcx)
	/* Store the first VEC.  */
	VMOVU	%VEC(4), (%r11)
	VZEROUPPER
	ret

L(more_8x_vec_backward):
	/* Load the first 4 * VEC and last VEC to support overlapping
	   addresses.  */
	VMOVU	(%rsi), %VEC(4)
	VMOVU	VEC_SIZE(%rsi), %VEC(5)
	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(6)
	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(7)
	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(8)
	/* Save stop of the destination buffer.  */
	leaq	-VEC_SIZE(%rdi, %rdx), %r11
	/* Align destination end for aligned stores in the loop.  Compute
	   how much destination end is misaligned.  */
	leaq	-VEC_SIZE(%rsi, %rdx), %rcx
	movq	%r11, %r9
	movq	%r11, %r8
	andq	$(VEC_SIZE - 1), %r8
	/* Adjust source.  */
	subq	%r8, %rcx
	/* Adjust the end of destination which should be aligned now.  */
	subq	%r8, %r9
	/* Adjust length.  */
	subq	%r8, %rdx
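	/* Mirror image of the forward alignment above: R8 is the
	   misalignment of the last destination VEC address, so the
	   loop's aligned stores end at R9 == R11 - R8.  */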
#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
	/* Check non-temporal store threshold.  */
	cmp	__x86_shared_non_temporal_threshold(%rip), %RDX_LP
	ja	L(large_backward)
#endif
L(loop_4x_vec_backward):
	/* Copy 4 * VEC at a time backward.  */
	VMOVU	(%rcx), %VEC(0)
	VMOVU	-VEC_SIZE(%rcx), %VEC(1)
	VMOVU	-(VEC_SIZE * 2)(%rcx), %VEC(2)
	VMOVU	-(VEC_SIZE * 3)(%rcx), %VEC(3)
	subq	$(VEC_SIZE * 4), %rcx
	subq	$(VEC_SIZE * 4), %rdx
	VMOVA	%VEC(0), (%r9)
	VMOVA	%VEC(1), -VEC_SIZE(%r9)
	VMOVA	%VEC(2), -(VEC_SIZE * 2)(%r9)
	VMOVA	%VEC(3), -(VEC_SIZE * 3)(%r9)
	subq	$(VEC_SIZE * 4), %r9
	cmpq	$(VEC_SIZE * 4), %rdx
	ja	L(loop_4x_vec_backward)
	/* Store the first 4 * VEC.  */
	VMOVU	%VEC(4), (%rdi)
	VMOVU	%VEC(5), VEC_SIZE(%rdi)
	VMOVU	%VEC(6), (VEC_SIZE * 2)(%rdi)
	VMOVU	%VEC(7), (VEC_SIZE * 3)(%rdi)
	/* Store the last VEC.  */
	VMOVU	%VEC(8), (%r11)
	VZEROUPPER
	ret

#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
L(large_forward):
	/* Don't use non-temporal store if there is overlap between
	   destination and source since destination may be in cache
	   when source is loaded.  */
	leaq	(%rdi, %rdx), %r10
	cmpq	%r10, %rsi
	jb	L(loop_4x_vec_forward)
L(loop_large_forward):
	/* Copy 4 * VEC at a time forward with non-temporal stores.  */
	PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 2)
	PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 3)
	VMOVU	(%rsi), %VEC(0)
	VMOVU	VEC_SIZE(%rsi), %VEC(1)
	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
	addq	$PREFETCHED_LOAD_SIZE, %rsi
	subq	$PREFETCHED_LOAD_SIZE, %rdx
	VMOVNT	%VEC(0), (%rdi)
	VMOVNT	%VEC(1), VEC_SIZE(%rdi)
	VMOVNT	%VEC(2), (VEC_SIZE * 2)(%rdi)
	VMOVNT	%VEC(3), (VEC_SIZE * 3)(%rdi)
	addq	$PREFETCHED_LOAD_SIZE, %rdi
	cmpq	$PREFETCHED_LOAD_SIZE, %rdx
	ja	L(loop_large_forward)
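	/* Non-temporal stores are weakly ordered; fence them before
	   the ordinary stores of the saved head and tail below.  */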
	sfence
	/* Store the last 4 * VEC.  */
	VMOVU	%VEC(5), (%rcx)
	VMOVU	%VEC(6), -VEC_SIZE(%rcx)
	VMOVU	%VEC(7), -(VEC_SIZE * 2)(%rcx)
	VMOVU	%VEC(8), -(VEC_SIZE * 3)(%rcx)
	/* Store the first VEC.  */
	VMOVU	%VEC(4), (%r11)
	VZEROUPPER
	ret

L(large_backward):
	/* Don't use non-temporal store if there is overlap between
	   destination and source since destination may be in cache
	   when source is loaded.  */
	leaq	(%rcx, %rdx), %r10
	cmpq	%r10, %r9
	jb	L(loop_4x_vec_backward)
L(loop_large_backward):
	/* Copy 4 * VEC at a time backward with non-temporal stores.  */
	PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 2)
	PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 3)
	VMOVU	(%rcx), %VEC(0)
	VMOVU	-VEC_SIZE(%rcx), %VEC(1)
	VMOVU	-(VEC_SIZE * 2)(%rcx), %VEC(2)
	VMOVU	-(VEC_SIZE * 3)(%rcx), %VEC(3)
	subq	$PREFETCHED_LOAD_SIZE, %rcx
	subq	$PREFETCHED_LOAD_SIZE, %rdx
	VMOVNT	%VEC(0), (%r9)
	VMOVNT	%VEC(1), -VEC_SIZE(%r9)
	VMOVNT	%VEC(2), -(VEC_SIZE * 2)(%r9)
	VMOVNT	%VEC(3), -(VEC_SIZE * 3)(%r9)
	subq	$PREFETCHED_LOAD_SIZE, %r9
	cmpq	$PREFETCHED_LOAD_SIZE, %rdx
	ja	L(loop_large_backward)
	sfence
	/* Store the first 4 * VEC.  */
	VMOVU	%VEC(4), (%rdi)
	VMOVU	%VEC(5), VEC_SIZE(%rdi)
	VMOVU	%VEC(6), (VEC_SIZE * 2)(%rdi)
	VMOVU	%VEC(7), (VEC_SIZE * 3)(%rdi)
	/* Store the last VEC.  */
	VMOVU	%VEC(8), (%r11)
	VZEROUPPER
	ret
#endif
END (MEMMOVE_SYMBOL (__memmove, unaligned_erms))

#if IS_IN (libc)
# ifdef USE_MULTIARCH
strong_alias (MEMMOVE_SYMBOL (__memmove, unaligned_erms),
	      MEMMOVE_SYMBOL (__memcpy, unaligned_erms))
#  ifdef SHARED
strong_alias (MEMMOVE_SYMBOL (__memmove_chk, unaligned_erms),
	      MEMMOVE_SYMBOL (__memcpy_chk, unaligned_erms))
#  endif
# endif
# ifdef SHARED
strong_alias (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned),
	      MEMMOVE_CHK_SYMBOL (__memcpy_chk, unaligned))
# endif
#endif
strong_alias (MEMMOVE_SYMBOL (__memmove, unaligned),
	      MEMCPY_SYMBOL (__memcpy, unaligned))