/* NEON/VFP/ARM version of memcpy optimized for Cortex-A15.
   Copyright (C) 2013-2019 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.

   This memcpy routine is optimised for Cortex-A15 cores and takes advantage
   of VFP or NEON when built with the appropriate flags.

   Assumptions:

    ARMv6 (ARMv7-a if using Neon)
    ARM state
    Unaligned accesses

 */

/* Thumb cannot encode negative immediate offsets in memory operations.  */
#ifndef NO_THUMB
#define NO_THUMB
#endif
#include <sysdep.h>
#include <arm-features.h>

	.syntax unified
	/* This implementation requires ARM state.  */
	.arm

#ifdef MEMCPY_NEON

	.fpu	neon
	.arch	armv7-a
# define FRAME_SIZE	4
# define USE_VFP
# define USE_NEON

#elif defined (MEMCPY_VFP)

	.arch	armv6
	.fpu	vfpv2
# define FRAME_SIZE	32
# define USE_VFP

#else
	.arch	armv6
# define FRAME_SIZE	32

#endif

/* Expands to the GAS "aligned register" operand syntax REG:ALIGN, used
   by the NEON store macro below to assert 64-byte alignment of DST.  */
#define ALIGN(addr, align) addr:align

/* Size in bytes of one ARM-state instruction; used in the computed-jump
   arithmetic of the dispatch macros below.  */
#define INSN_SIZE	4

/* Call parameters (AAPCS: first three arguments in r0-r2).  */
#define dstin	r0
#define src	r1
#define count	r2

/* Locals.  DST uses ip (the intra-procedure scratch register) so that
   r0 can be preserved as the return value.  TMP2 is r8, a callee-saved
   register, which is why the non-short paths save it on the stack
   (see FRAME_SIZE above).  */
#define tmp1	r3
#define dst	ip
#define tmp2	r8

/* These two macros both work by repeated invocation of the macro
   dispatch_step (not defined here).  That macro performs one "step",
   doing one load instruction and one store instruction to copy one
   "unit".  On entry, TMP1 contains the number of bytes to be copied,
   a multiple of the unit size.  The macro clobbers TMP1 in the
   process of doing a computed jump to the tail containing the
   appropriate number of steps.

   In dispatch_7_dword, dispatch_step is invoked seven times, with an
   argument that is 7 for the first and 1 for the last.  Units are
   double-words (8 bytes).  TMP1 is at most 56.

   In dispatch_15_word, dispatch_step is invoked fifteen times,
   with an argument that is 15 for the first and 1 for the last.
   Units are words (4 bytes).  TMP1 is at most 60.  */

#ifndef ARM_ALWAYS_BX
# if ARM_BX_ALIGN_LOG2 != 2
#  error case not handled
# endif
	/* Non-BX variant: jump into the step sequence by adding directly
	   to PC.  The rsb converts "bytes to copy" into "bytes of steps
	   to skip"; PC_OFS (from arm-features.h) compensates for the
	   offset between the add instruction and the value read from PC,
	   and INSN_SIZE for the step size vs. instruction size ratio.  */
	.macro dispatch_7_dword
	rsb	tmp1, tmp1, #((7 * 8) - PC_OFS + INSN_SIZE)
	add	pc, pc, tmp1
	dispatch_step 7
	dispatch_step 6
	dispatch_step 5
	dispatch_step 4
	dispatch_step 3
	dispatch_step 2
	dispatch_step 1
	.purgem dispatch_step
	.endm

	/* Word units are 4 bytes but each step is two 4-byte instructions,
	   hence the halved constants and the "lsl #1" scale on the add.  */
	.macro dispatch_15_word
	rsb	tmp1, tmp1, #((15 * 4) - PC_OFS/2 + INSN_SIZE/2)
	add	pc, pc, tmp1, lsl #1
	dispatch_step 15
	dispatch_step 14
	dispatch_step 13
	dispatch_step 12
	dispatch_step 11
	dispatch_step 10
	dispatch_step 9
	dispatch_step 8
	dispatch_step 7
	dispatch_step 6
	dispatch_step 5
	dispatch_step 4
	dispatch_step 3
	dispatch_step 2
	dispatch_step 1
	.purgem dispatch_step
	.endm
#else
# if ARM_BX_ALIGN_LOG2 < 3
#  error case not handled
# endif
	.macro dispatch_helper steps, log2_bytes_per_step
	/* TMP1 gets (max_bytes - bytes_to_copy), where max_bytes is
	   (STEPS << LOG2_BYTES_PER_STEP).
	   So this is (steps_to_skip << LOG2_BYTES_PER_STEP).
	   Then it needs further adjustment to compensate for the
	   distance between the PC value taken below (0f + PC_OFS)
	   and the first step's instructions (1f).  */
	rsb	tmp1, tmp1, #((\steps << \log2_bytes_per_step) \
			      + ((1f - PC_OFS - 0f) \
				 >> (ARM_BX_ALIGN_LOG2 - \log2_bytes_per_step)))
	/* Shifting down LOG2_BYTES_PER_STEP gives us the number of
	   steps to skip, then shifting up ARM_BX_ALIGN_LOG2 gives us
	   the (byte) distance to add to the PC.  */
0:	add	tmp1, pc, tmp1, lsl #(ARM_BX_ALIGN_LOG2 - \log2_bytes_per_step)
	bx	tmp1
	.p2align ARM_BX_ALIGN_LOG2
1:
	.endm

	/* BX variant: each step is padded to a 2^ARM_BX_ALIGN_LOG2-byte
	   slot so the computed target is always a valid step boundary.  */
	.macro dispatch_7_dword
	dispatch_helper 7, 3
	.p2align ARM_BX_ALIGN_LOG2
	dispatch_step 7
	.p2align ARM_BX_ALIGN_LOG2
	dispatch_step 6
	.p2align ARM_BX_ALIGN_LOG2
	dispatch_step 5
	.p2align ARM_BX_ALIGN_LOG2
	dispatch_step 4
	.p2align ARM_BX_ALIGN_LOG2
	dispatch_step 3
	.p2align ARM_BX_ALIGN_LOG2
	dispatch_step 2
	.p2align ARM_BX_ALIGN_LOG2
	dispatch_step 1
	.p2align ARM_BX_ALIGN_LOG2
	.purgem dispatch_step
	.endm

	.macro dispatch_15_word
	dispatch_helper 15, 2
	dispatch_step 15
	.p2align ARM_BX_ALIGN_LOG2
	dispatch_step 14
	.p2align ARM_BX_ALIGN_LOG2
	dispatch_step 13
	.p2align ARM_BX_ALIGN_LOG2
	dispatch_step 12
	.p2align ARM_BX_ALIGN_LOG2
	dispatch_step 11
	.p2align ARM_BX_ALIGN_LOG2
	dispatch_step 10
	.p2align ARM_BX_ALIGN_LOG2
	dispatch_step 9
	.p2align ARM_BX_ALIGN_LOG2
	dispatch_step 8
	.p2align ARM_BX_ALIGN_LOG2
	dispatch_step 7
	.p2align ARM_BX_ALIGN_LOG2
	dispatch_step 6
	.p2align ARM_BX_ALIGN_LOG2
	dispatch_step 5
	.p2align ARM_BX_ALIGN_LOG2
	dispatch_step 4
	.p2align ARM_BX_ALIGN_LOG2
	dispatch_step 3
	.p2align ARM_BX_ALIGN_LOG2
	dispatch_step 2
	.p2align ARM_BX_ALIGN_LOG2
	dispatch_step 1
	.p2align ARM_BX_ALIGN_LOG2
	.purgem dispatch_step
	.endm

#endif

#ifndef USE_NEON
/* For bulk copies using GP registers.  Four 64-bit lanes (A-D) give the
   SMS-style loops below enough registers to software-pipeline loads
   against stores.  B-D are callee-saved and are spilled to the frame
   before use.  */
#define	A_l	r2		/* Call-clobbered.  */
#define	A_h	r3		/* Call-clobbered.  */
#define	B_l	r4
#define	B_h	r5
#define	C_l	r6
#define	C_h	r7
/* Don't use the pair r8,r9 because in some EABI variants r9 is reserved.  */
#define	D_l	r10
#define	D_h	r11
#endif

/* Number of lines ahead to pre-fetch data.  If you change this the code
   below will need adjustment to compensate.  */

#define prefetch_lines	5

#ifdef USE_VFP
	/* Copy one 64-byte line from [src, #base] to [dst, #base] using
	   d0-d2 plus \vreg as a software pipeline: each slot's store
	   drains a value loaded on the previous iteration, and \vreg is
	   refilled from prefetch_lines*64 bytes ahead so the next lines
	   are already in flight (register-based prefetch, no PLD).  */
	.macro	cpy_line_vfp vreg, base
	vstr	\vreg, [dst, #\base]
	vldr	\vreg, [src, #\base]
	vstr	d0, [dst, #\base + 8]
	vldr	d0, [src, #\base + 8]
	vstr	d1, [dst, #\base + 16]
	vldr	d1, [src, #\base + 16]
	vstr	d2, [dst, #\base + 24]
	vldr	d2, [src, #\base + 24]
	vstr	\vreg, [dst, #\base + 32]
	vldr	\vreg, [src, #\base + prefetch_lines * 64 - 32]
	vstr	d0, [dst, #\base + 40]
	vldr	d0, [src, #\base + 40]
	vstr	d1, [dst, #\base + 48]
	vldr	d1, [src, #\base + 48]
	vstr	d2, [dst, #\base + 56]
	vldr	d2, [src, #\base + 56]
	.endm

	/* Same as cpy_line_vfp but without the look-ahead reload of
	   \vreg, for draining the pipeline near the end of the copy.  */
	.macro	cpy_tail_vfp vreg, base
	vstr	\vreg, [dst, #\base]
	vldr	\vreg, [src, #\base]
	vstr	d0, [dst, #\base + 8]
	vldr	d0, [src, #\base + 8]
	vstr	d1, [dst, #\base + 16]
	vldr	d1, [src, #\base + 16]
	vstr	d2, [dst, #\base + 24]
	vldr	d2, [src, #\base + 24]
	vstr	\vreg, [dst, #\base + 32]
	vstr	d0, [dst, #\base + 40]
	vldr	d0, [src, #\base + 40]
	vstr	d1, [dst, #\base + 48]
	vldr	d1, [src, #\base + 48]
	vstr	d2, [dst, #\base + 56]
	vldr	d2, [src, #\base + 56]
	.endm
#endif

/* void *memcpy (void *dstin, const void *src, size_t count)
   In:  r0 = dstin, r1 = src, r2 = count (see #defines above).
   Out: r0 = dstin (r0 is never written; dst works in ip).
   Strategy: <64 bytes drop straight into the tail dispatcher; larger
   copies align DST (and, when mutually aligned, SRC) to 64 bits and
   select a medium (<512 bytes) or long bulk loop.  */
	.p2align 6
ENTRY(memcpy)

	mov	dst, dstin	/* Preserve dstin, we need to return it.  */
	cmp	count, #64
	bge	.Lcpy_not_short
	/* Deal with small copies quickly by dropping straight into the
	   exit block.  */

.Ltail63unaligned:
#ifdef USE_NEON
	/* These need an extra layer of macro just to work around a
	   bug in the assembler's parser when an operand starts with
	   a {...}.  https://sourceware.org/bugzilla/show_bug.cgi?id=15647
	   tracks that bug; it was not fixed as of binutils-2.23.2.  */
	.macro neon_load_d0 reg
	vld1.8	{d0}, [\reg]!
	.endm
	.macro neon_store_d0 reg
	vst1.8	{d0}, [\reg]!
	.endm

	/* Copy the 8-byte units (up to 7 of them) via the dispatcher,
	   then mop up a possible trailing word.  */
	and	tmp1, count, #0x38
	.macro dispatch_step i
	neon_load_d0 src
	neon_store_d0 dst
	.endm
	dispatch_7_dword

	tst	count, #4
	ldrne	tmp1, [src], #4
	strne	tmp1, [dst], #4
#else
	/* Copy up to 15 full words of data.  May not be aligned.  */
	/* Cannot use VFP for unaligned data.  */
	and	tmp1, count, #0x3c
	add	dst, dst, tmp1
	add	src, src, tmp1
	/* Jump directly into the sequence below at the correct offset.  */
	.macro dispatch_step i
	ldr	tmp1, [src, #-(\i * 4)]
	str	tmp1, [dst, #-(\i * 4)]
	.endm
	dispatch_15_word
#endif

	/* Final 0-3 bytes: bit 1 of count -> C, bit 0 -> N after the
	   shift, so the halfword copy is "cs" and the byte copy "ne".  */
	lsls	count, count, #31
	ldrhcs	tmp1, [src], #2
	ldrbne	src, [src]	/* Src is dead, use as a scratch.  */
	strhcs	tmp1, [dst], #2
	strbne	src, [dst]
	bx	lr

.Lcpy_not_short:
	/* At least 64 bytes to copy, but don't know the alignment yet.  */
	str	tmp2, [sp, #-FRAME_SIZE]!	/* tmp2 (r8) is callee-saved.  */
	cfi_adjust_cfa_offset (FRAME_SIZE)
	cfi_rel_offset (tmp2, 0)
	cfi_remember_state
	and	tmp2, src, #7
	and	tmp1, dst, #7
	cmp	tmp1, tmp2
	bne	.Lcpy_notaligned

#ifdef USE_VFP
	/* Magic dust alert!  Force VFP on Cortex-A9.  Experiments show
	   that the FP pipeline is much better at streaming loads and
	   stores.  This is outside the critical loop.  */
	vmov.f32	s0, s0
#endif

	/* SRC and DST have the same mutual 64-bit alignment, but we may
	   still need to pre-copy some bytes to get to natural alignment.
	   We bring SRC and DST into full 64-bit alignment.  */
	lsls	tmp2, dst, #29		/* Low 3 bits of dst -> flags.  */
	beq	1f
	rsbs	tmp2, tmp2, #0
	sub	count, count, tmp2, lsr #29
	ldrmi	tmp1, [src], #4
	strmi	tmp1, [dst], #4
	lsls	tmp2, tmp2, #2
	ldrhcs	tmp1, [src], #2
	ldrbne	tmp2, [src], #1
	strhcs	tmp1, [dst], #2
	strbne	tmp2, [dst], #1

1:
	subs	tmp2, count, #64	/* Use tmp2 for count.  */
	blt	.Ltail63aligned

	cmp	tmp2, #512
	bge	.Lcpy_body_long

.Lcpy_body_medium:			/* Count in tmp2.  */
#ifdef USE_VFP
	/* 64 bytes per iteration, alternating d0/d1 so each store uses
	   the value loaded two slots earlier.  */
1:
	vldr	d0, [src, #0]
	subs	tmp2, tmp2, #64
	vldr	d1, [src, #8]
	vstr	d0, [dst, #0]
	vldr	d0, [src, #16]
	vstr	d1, [dst, #8]
	vldr	d1, [src, #24]
	vstr	d0, [dst, #16]
	vldr	d0, [src, #32]
	vstr	d1, [dst, #24]
	vldr	d1, [src, #40]
	vstr	d0, [dst, #32]
	vldr	d0, [src, #48]
	vstr	d1, [dst, #40]
	vldr	d1, [src, #56]
	vstr	d0, [dst, #48]
	add	src, src, #64
	vstr	d1, [dst, #56]
	add	dst, dst, #64
	bge	1b
	tst	tmp2, #0x3f
	beq	.Ldone

.Ltail63aligned:			/* Count in tmp2.  */
	and	tmp1, tmp2, #0x38
	add	dst, dst, tmp1
	add	src, src, tmp1
	.macro dispatch_step i
	vldr	d0, [src, #-(\i * 8)]
	vstr	d0, [dst, #-(\i * 8)]
	.endm
	dispatch_7_dword
#else
	sub	src, src, #8
	sub	dst, dst, #8
1:
	ldrd	A_l, A_h, [src, #8]
	strd	A_l, A_h, [dst, #8]
	ldrd	A_l, A_h, [src, #16]
	strd	A_l, A_h, [dst, #16]
	ldrd	A_l, A_h, [src, #24]
	strd	A_l, A_h, [dst, #24]
	ldrd	A_l, A_h, [src, #32]
	strd	A_l, A_h, [dst, #32]
	ldrd	A_l, A_h, [src, #40]
	strd	A_l, A_h, [dst, #40]
	ldrd	A_l, A_h, [src, #48]
	strd	A_l, A_h, [dst, #48]
	ldrd	A_l, A_h, [src, #56]
	strd	A_l, A_h, [dst, #56]
	ldrd	A_l, A_h, [src, #64]!
	strd	A_l, A_h, [dst, #64]!
	subs	tmp2, tmp2, #64
	bge	1b
	tst	tmp2, #0x3f
	bne	1f
	ldr	tmp2,[sp], #FRAME_SIZE
	cfi_adjust_cfa_offset (-FRAME_SIZE)
	cfi_restore (tmp2)
	bx	lr

	cfi_restore_state
	cfi_remember_state
1:
	add	src, src, #8
	add	dst, dst, #8

.Ltail63aligned:			/* Count in tmp2.  */
	/* Copy up to 7 d-words of data.  Similar to Ltail63unaligned, but
	   we know that the src and dest are 64-bit aligned so we can use
	   LDRD/STRD to improve efficiency.  */
	/* TMP2 is now negative, but we don't care about that.  The bottom
	   six bits still tell us how many bytes are left to copy.  */

	and	tmp1, tmp2, #0x38
	add	dst, dst, tmp1
	add	src, src, tmp1
	.macro dispatch_step i
	ldrd	A_l, A_h, [src, #-(\i * 8)]
	strd	A_l, A_h, [dst, #-(\i * 8)]
	.endm
	dispatch_7_dword
#endif

	tst	tmp2, #4
	ldrne	tmp1, [src], #4
	strne	tmp1, [dst], #4
	lsls	tmp2, tmp2, #31		/* Count (tmp2) now dead.  */
	ldrhcs	tmp1, [src], #2
	ldrbne	tmp2, [src]
	strhcs	tmp1, [dst], #2
	strbne	tmp2, [dst]

.Ldone:
	ldr	tmp2, [sp], #FRAME_SIZE
	cfi_adjust_cfa_offset (-FRAME_SIZE)
	cfi_restore (tmp2)
	bx	lr

	cfi_restore_state
	cfi_remember_state

.Lcpy_body_long:			/* Count in tmp2.  */

	/* Long copy.  We know that there's at least (prefetch_lines * 64)
	   bytes to go.  */
#ifdef USE_VFP
	/* Don't use PLD.  Instead, read some data in advance of the current
	   copy position into a register.  This should act like a PLD
	   operation but we won't have to repeat the transfer.  */

	vldr	d3, [src, #0]
	vldr	d4, [src, #64]
	vldr	d5, [src, #128]
	vldr	d6, [src, #192]
	vldr	d7, [src, #256]

	vldr	d0, [src, #8]
	vldr	d1, [src, #16]
	vldr	d2, [src, #24]
	add	src, src, #32

	subs	tmp2, tmp2, #prefetch_lines * 64 * 2
	blt	2f
1:
	cpy_line_vfp	d3, 0
	cpy_line_vfp	d4, 64
	cpy_line_vfp	d5, 128
	add	dst, dst, #3 * 64
	add	src, src, #3 * 64
	cpy_line_vfp	d6, 0
	cpy_line_vfp	d7, 64
	add	dst, dst, #2 * 64
	add	src, src, #2 * 64
	subs	tmp2, tmp2, #prefetch_lines * 64
	bge	1b

2:
	/* Drain the software pipeline: one tail line per prefetched
	   register, the last one expanded inline without a d7 refill.  */
	cpy_tail_vfp	d3, 0
	cpy_tail_vfp	d4, 64
	cpy_tail_vfp	d5, 128
	add	src, src, #3 * 64
	add	dst, dst, #3 * 64
	cpy_tail_vfp	d6, 0
	vstr	d7, [dst, #64]
	vldr	d7, [src, #64]
	vstr	d0, [dst, #64 + 8]
	vldr	d0, [src, #64 + 8]
	vstr	d1, [dst, #64 + 16]
	vldr	d1, [src, #64 + 16]
	vstr	d2, [dst, #64 + 24]
	vldr	d2, [src, #64 + 24]
	vstr	d7, [dst, #64 + 32]
	add	src, src, #96
	vstr	d0, [dst, #64 + 40]
	vstr	d1, [dst, #64 + 48]
	vstr	d2, [dst, #64 + 56]
	add	dst, dst, #128
	add	tmp2, tmp2, #prefetch_lines * 64
	b	.Lcpy_body_medium
#else
	/* Long copy.  Use an SMS style loop to maximize the I/O
	   bandwidth of the core.  We don't have enough spare registers
	   to synthesise prefetching, so use PLD operations.  */
	/* Pre-bias src and dst.  */
	sub	src, src, #8
	sub	dst, dst, #8
	pld	[src, #8]
	pld	[src, #72]
	subs	tmp2, tmp2, #64
	pld	[src, #136]
	ldrd	A_l, A_h, [src, #8]
	strd	B_l, B_h, [sp, #8]	/* Spill callee-saved B-D lanes.  */
	cfi_rel_offset (B_l, 8)
	cfi_rel_offset (B_h, 12)
	ldrd	B_l, B_h, [src, #16]
	strd	C_l, C_h, [sp, #16]
	cfi_rel_offset (C_l, 16)
	cfi_rel_offset (C_h, 20)
	ldrd	C_l, C_h, [src, #24]
	strd	D_l, D_h, [sp, #24]
	cfi_rel_offset (D_l, 24)
	cfi_rel_offset (D_h, 28)
	pld	[src, #200]
	ldrd	D_l, D_h, [src, #32]!
	b	1f
	.p2align 6
2:
	pld	[src, #232]
	strd	A_l, A_h, [dst, #40]
	ldrd	A_l, A_h, [src, #40]
	strd	B_l, B_h, [dst, #48]
	ldrd	B_l, B_h, [src, #48]
	strd	C_l, C_h, [dst, #56]
	ldrd	C_l, C_h, [src, #56]
	strd	D_l, D_h, [dst, #64]!
	ldrd	D_l, D_h, [src, #64]!
	subs	tmp2, tmp2, #64
1:
	strd	A_l, A_h, [dst, #8]
	ldrd	A_l, A_h, [src, #8]
	strd	B_l, B_h, [dst, #16]
	ldrd	B_l, B_h, [src, #16]
	strd	C_l, C_h, [dst, #24]
	ldrd	C_l, C_h, [src, #24]
	strd	D_l, D_h, [dst, #32]
	ldrd	D_l, D_h, [src, #32]
	bcs	2b
	/* Save the remaining bytes and restore the callee-saved regs.  */
	strd	A_l, A_h, [dst, #40]
	add	src, src, #40
	strd	B_l, B_h, [dst, #48]
	ldrd	B_l, B_h, [sp, #8]
	cfi_restore (B_l)
	cfi_restore (B_h)
	strd	C_l, C_h, [dst, #56]
	ldrd	C_l, C_h, [sp, #16]
	cfi_restore (C_l)
	cfi_restore (C_h)
	strd	D_l, D_h, [dst, #64]
	ldrd	D_l, D_h, [sp, #24]
	cfi_restore (D_l)
	cfi_restore (D_h)
	add	dst, dst, #72
	tst	tmp2, #0x3f
	bne	.Ltail63aligned
	ldr	tmp2, [sp], #FRAME_SIZE
	cfi_adjust_cfa_offset (-FRAME_SIZE)
	cfi_restore (tmp2)
	bx	lr
#endif

	cfi_restore_state
	cfi_remember_state

.Lcpy_notaligned:
	pld	[src, #0]
	pld	[src, #64]
	/* There's at least 64 bytes to copy, but there is no mutual
	   alignment.  */
	/* Bring DST to 64-bit alignment.  */
	lsls	tmp2, dst, #29
	pld	[src, #(2 * 64)]
	beq	1f
	rsbs	tmp2, tmp2, #0
	sub	count, count, tmp2, lsr #29
	ldrmi	tmp1, [src], #4
	strmi	tmp1, [dst], #4
	lsls	tmp2, tmp2, #2
	ldrbne	tmp1, [src], #1
	ldrhcs	tmp2, [src], #2
	strbne	tmp1, [dst], #1
	strhcs	tmp2, [dst], #2
1:
	pld	[src, #(3 * 64)]
	subs	count, count, #64
	ldrmi	tmp2, [sp], #FRAME_SIZE	/* <64 left: pop frame and tail out.  */
	bmi	.Ltail63unaligned
	pld	[src, #(4 * 64)]

#ifdef USE_NEON
	/* These need an extra layer of macro just to work around a
	   bug in the assembler's parser when an operand starts with
	   a {...}.  */
	.macro neon_load_multi reglist, basereg
	vld1.8	{\reglist}, [\basereg]!
	.endm
	.macro neon_store_multi reglist, basereg
	vst1.8	{\reglist}, [ALIGN (\basereg, 64)]!
	.endm

	/* 64 bytes per iteration in two 32-byte halves; DST is 64-bit
	   aligned here and the store macro asserts :64 alignment.  */
	neon_load_multi d0-d3, src
	neon_load_multi d4-d7, src
	subs	count, count, #64
	bmi	2f
1:
	pld	[src, #(4 * 64)]
	neon_store_multi d0-d3, dst
	neon_load_multi d0-d3, src
	neon_store_multi d4-d7, dst
	neon_load_multi d4-d7, src
	subs	count, count, #64
	bpl	1b
2:
	neon_store_multi d0-d3, dst
	neon_store_multi d4-d7, dst
	ands	count, count, #0x3f
#else
	/* Use an SMS style loop to maximize the I/O bandwidth.  */
	/* src is word- but not dword-aligned here, so use paired LDRs
	   instead of LDRD; the -4/-8 biases keep offsets in range.  */
	sub	src, src, #4
	sub	dst, dst, #8
	subs	tmp2, count, #64	/* Use tmp2 for count.  */
	ldr	A_l, [src, #4]
	ldr	A_h, [src, #8]
	strd	B_l, B_h, [sp, #8]
	cfi_rel_offset (B_l, 8)
	cfi_rel_offset (B_h, 12)
	ldr	B_l, [src, #12]
	ldr	B_h, [src, #16]
	strd	C_l, C_h, [sp, #16]
	cfi_rel_offset (C_l, 16)
	cfi_rel_offset (C_h, 20)
	ldr	C_l, [src, #20]
	ldr	C_h, [src, #24]
	strd	D_l, D_h, [sp, #24]
	cfi_rel_offset (D_l, 24)
	cfi_rel_offset (D_h, 28)
	ldr	D_l, [src, #28]
	ldr	D_h, [src, #32]!
	b	1f
	.p2align 6
2:
	pld	[src, #(5 * 64) - (32 - 4)]
	strd	A_l, A_h, [dst, #40]
	ldr	A_l, [src, #36]
	ldr	A_h, [src, #40]
	strd	B_l, B_h, [dst, #48]
	ldr	B_l, [src, #44]
	ldr	B_h, [src, #48]
	strd	C_l, C_h, [dst, #56]
	ldr	C_l, [src, #52]
	ldr	C_h, [src, #56]
	strd	D_l, D_h, [dst, #64]!
	ldr	D_l, [src, #60]
	ldr	D_h, [src, #64]!
	subs	tmp2, tmp2, #64
1:
	strd	A_l, A_h, [dst, #8]
	ldr	A_l, [src, #4]
	ldr	A_h, [src, #8]
	strd	B_l, B_h, [dst, #16]
	ldr	B_l, [src, #12]
	ldr	B_h, [src, #16]
	strd	C_l, C_h, [dst, #24]
	ldr	C_l, [src, #20]
	ldr	C_h, [src, #24]
	strd	D_l, D_h, [dst, #32]
	ldr	D_l, [src, #28]
	ldr	D_h, [src, #32]
	bcs	2b

	/* Save the remaining bytes and restore the callee-saved regs.  */
	strd	A_l, A_h, [dst, #40]
	add	src, src, #36
	strd	B_l, B_h, [dst, #48]
	ldrd	B_l, B_h, [sp, #8]
	cfi_restore (B_l)
	cfi_restore (B_h)
	strd	C_l, C_h, [dst, #56]
	ldrd	C_l, C_h, [sp, #16]
	cfi_restore (C_l)
	cfi_restore (C_h)
	strd	D_l, D_h, [dst, #64]
	ldrd	D_l, D_h, [sp, #24]
	cfi_restore (D_l)
	cfi_restore (D_h)
	add	dst, dst, #72
	ands	count, tmp2, #0x3f
#endif
	ldr	tmp2, [sp], #FRAME_SIZE
	cfi_adjust_cfa_offset (-FRAME_SIZE)
	cfi_restore (tmp2)
	bne	.Ltail63unaligned
	bx	lr

END(memcpy)
libc_hidden_builtin_def (memcpy)