/* Copyright (C) 2012-2019 Free Software Foundation, Inc.

   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library.  If not, see
   <http://www.gnu.org/licenses/>.  */

#include <sysdep.h>
/* Assumptions:
 *
 * ARMv8-a, AArch64, unaligned accesses.
 *
 */

#define dstin	x0
#define src	x1
#define count	x2
#define dst	x3
#define srcend	x4
#define dstend	x5
#define A_l	x6
#define A_lw	w6
#define A_h	x7
#define A_hw	w7
#define B_l	x8
#define B_lw	w8
#define B_h	x9
#define C_l	x10
#define C_h	x11
#define D_l	x12
#define D_h	x13
#define E_l	src
#define E_h	count
#define F_l	srcend
#define F_h	dst
#define G_l	count
#define G_h	dst
#define tmp1	x14
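
/* E, F and G deliberately alias registers that also hold src, count,
   srcend and dst: by the time those pairs are loaded, the original
   values they shadow are no longer needed.  */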

/* Copies are split into 3 main cases: small copies of up to 16 bytes,
   medium copies of 17..96 bytes, which are fully unrolled, and large
   copies of more than 96 bytes, which align the destination and use an
   unrolled loop processing 64 bytes per iteration.
   In order to share code with memmove, small and medium copies read all
   data before writing, allowing any kind of overlap.  So small, medium
   and large backwards memmoves are handled by falling through into memcpy.
   Overlapping large forward memmoves use a loop that copies backwards.  */
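
/* Rough C outline of the size dispatch below (illustrative only, not
   part of this file):

     if (count <= 16) goto copy16;     // small
     if (count <= 96) goto medium;     // fully unrolled
     goto copy_long;                   // aligned 64-byte loop
 */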

#ifndef MEMMOVE
# define MEMMOVE memmove
#endif
#ifndef MEMCPY
# define MEMCPY memcpy
#endif

ENTRY_ALIGN (MEMMOVE, 6)

	DELOUSE (0)
	DELOUSE (1)
	DELOUSE (2)

	sub	tmp1, dstin, src
	cmp	count, 96
	ccmp	tmp1, count, 2, hi
	b.lo	L(move_long)
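
	/* The ccmp compares dstin - src with count only when count > 96
	   (hi); otherwise it sets NZCV to 2, i.e. C = 1, so b.lo is not
	   taken.  Hence L(move_long) is reached only for large copies
	   where dstin - src < count (unsigned), i.e. a forward overlap
	   that the copy loop below would corrupt.  */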

	/* Common case falls through into memcpy.  */
END (MEMMOVE)
libc_hidden_builtin_def (MEMMOVE)
ENTRY (MEMCPY)

	DELOUSE (0)
	DELOUSE (1)
	DELOUSE (2)

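	/* PLDL1KEEP: prefetch for a load into the L1 cache with the
	   temporal ("keep") policy; a hint that [src] is about to be
	   read.  */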
	prfm	PLDL1KEEP, [src]
	add	srcend, src, count
	add	dstend, dstin, count
	cmp	count, 16
	b.ls	L(copy16)
	cmp	count, 96
	b.hi	L(copy_long)

	/* Medium copies: 17..96 bytes.  */
	sub	tmp1, count, 1
	ldp	A_l, A_h, [src]
	tbnz	tmp1, 6, L(copy96)
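	/* Here count is 17..96, so bit 6 of count - 1 is set iff
	   count >= 65 (the copy96 case).  With bit 6 clear, bit 5 is set
	   iff count >= 33, in which case the middle 32 bytes are copied
	   as well.  */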
	ldp	D_l, D_h, [srcend, -16]
	tbz	tmp1, 5, 1f
	ldp	B_l, B_h, [src, 16]
	ldp	C_l, C_h, [srcend, -32]
	stp	B_l, B_h, [dstin, 16]
	stp	C_l, C_h, [dstend, -32]
1:
	stp	A_l, A_h, [dstin]
	stp	D_l, D_h, [dstend, -16]
	ret

	.p2align 4
	/* Small copies: 0..16 bytes.  */
L(copy16):
	cmp	count, 8
	b.lo	1f
	ldr	A_l, [src]
	ldr	A_h, [srcend, -8]
	str	A_l, [dstin]
	str	A_h, [dstend, -8]
	ret

	.p2align 4
1:
	tbz	count, 2, 1f
	ldr	A_lw, [src]
	ldr	A_hw, [srcend, -4]
	str	A_lw, [dstin]
	str	A_hw, [dstend, -4]
	ret

	/* Copy 0..3 bytes.  Use a branchless sequence that copies the same
	   byte 3 times if count==1, or the 2nd byte twice if count==2.  */
1:
	cbz	count, 2f
	lsr	tmp1, count, 1
	ldrb	A_lw, [src]
	ldrb	A_hw, [srcend, -1]
	ldrb	B_lw, [src, tmp1]
	strb	A_lw, [dstin]
	strb	B_lw, [dstin, tmp1]
	strb	A_hw, [dstend, -1]
2:	ret
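
	/* The 0..3 byte case above, roughly in C (illustrative only):

	     if (count) {
	       dst[0]         = src[0];
	       dst[count / 2] = src[count / 2];
	       dst[count - 1] = src[count - 1];
	     }

	   count == 1 writes byte 0 three times, count == 2 writes the
	   2nd byte twice, count == 3 writes each byte exactly once.  */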

	.p2align 4
	/* Copy 64..96 bytes.  Copy 64 bytes from the start and
	   32 bytes from the end.  */
L(copy96):
	ldp	B_l, B_h, [src, 16]
	ldp	C_l, C_h, [src, 32]
	ldp	D_l, D_h, [src, 48]
	ldp	E_l, E_h, [srcend, -32]
	ldp	F_l, F_h, [srcend, -16]
	stp	A_l, A_h, [dstin]
	stp	B_l, B_h, [dstin, 16]
	stp	C_l, C_h, [dstin, 32]
	stp	D_l, D_h, [dstin, 48]
	stp	E_l, E_h, [dstend, -32]
	stp	F_l, F_h, [dstend, -16]
	ret
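
	/* For count < 96 the two store groups overlap in the middle;
	   that is harmless because all loads are issued before the
	   first store.  */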

	/* Align DST to 16 byte alignment so that we don't cross cache line
	   boundaries on both loads and stores.  There are at least 96 bytes
	   to copy, so copy 16 bytes unaligned and then align.  The loop
	   copies 64 bytes per iteration and prefetches one iteration ahead.  */

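	/* The alignment arithmetic below, roughly in C (illustrative
	   only):

	     tmp1   = (uintptr_t) dstin & 15;  // misalignment of dst
	     dst    = dstin - tmp1;            // rounded down, 16-byte aligned
	     src   -= tmp1;                    // keep src and dst in lockstep
	     count += tmp1;                    // now relative to the aligned base

	   The first 16 bytes are stored to dstin before src is rewound,
	   so nothing below dstin is ever written.  */
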
	.p2align 4
L(copy_long):
	and	tmp1, dstin, 15
	bic	dst, dstin, 15
	ldp	D_l, D_h, [src]
	sub	src, src, tmp1
	add	count, count, tmp1	/* Count is now 16 too large.  */
	ldp	A_l, A_h, [src, 16]
	stp	D_l, D_h, [dstin]
	ldp	B_l, B_h, [src, 32]
	ldp	C_l, C_h, [src, 48]
	ldp	D_l, D_h, [src, 64]!
	subs	count, count, 128 + 16	/* Test and readjust count.  */
	b.ls	L(last64)
L(loop64):
	stp	A_l, A_h, [dst, 16]
	ldp	A_l, A_h, [src, 16]
	stp	B_l, B_h, [dst, 32]
	ldp	B_l, B_h, [src, 32]
	stp	C_l, C_h, [dst, 48]
	ldp	C_l, C_h, [src, 48]
	stp	D_l, D_h, [dst, 64]!
	ldp	D_l, D_h, [src, 64]!
	subs	count, count, 64
	b.hi	L(loop64)
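
	/* In the loop above each iteration stores the four pairs loaded
	   by the previous one and immediately issues the next four
	   loads, so the loads run one iteration ahead of the stores.  */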

	/* Write the last full set of 64 bytes.  The remainder is at most 64
	   bytes, so it is safe to always copy 64 bytes from the end even if
	   there is just 1 byte left.  */
L(last64):
	ldp	E_l, E_h, [srcend, -64]
	stp	A_l, A_h, [dst, 16]
	ldp	A_l, A_h, [srcend, -48]
	stp	B_l, B_h, [dst, 32]
	ldp	B_l, B_h, [srcend, -32]
	stp	C_l, C_h, [dst, 48]
	ldp	C_l, C_h, [srcend, -16]
	stp	D_l, D_h, [dst, 64]
	stp	E_l, E_h, [dstend, -64]
	stp	A_l, A_h, [dstend, -48]
	stp	B_l, B_h, [dstend, -32]
	stp	C_l, C_h, [dstend, -16]
	ret

	.p2align 4
L(move_long):
	cbz	tmp1, 3f

	add	srcend, src, count
	add	dstend, dstin, count

	/* Align dstend to 16 byte alignment so that we don't cross cache line
	   boundaries on both loads and stores.  There are at least 96 bytes
	   to copy, so copy 16 bytes unaligned and then align.  The loop
	   copies 64 bytes per iteration and prefetches one iteration ahead.  */

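	/* Mirror image of L(copy_long), working downwards from the ends
	   of the buffers (illustrative C, same caveats as above):

	     tmp1    = (uintptr_t) dstend & 15;
	     srcend -= tmp1;
	     count  -= tmp1;
	     dstend -= tmp1;                   // now 16-byte aligned
	 */
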
	and	tmp1, dstend, 15
	ldp	D_l, D_h, [srcend, -16]
	sub	srcend, srcend, tmp1
	sub	count, count, tmp1
	ldp	A_l, A_h, [srcend, -16]
	stp	D_l, D_h, [dstend, -16]
	ldp	B_l, B_h, [srcend, -32]
	ldp	C_l, C_h, [srcend, -48]
	ldp	D_l, D_h, [srcend, -64]!
	sub	dstend, dstend, tmp1
	subs	count, count, 128
	b.ls	2f

	nop
1:
	stp	A_l, A_h, [dstend, -16]
	ldp	A_l, A_h, [srcend, -16]
	stp	B_l, B_h, [dstend, -32]
	ldp	B_l, B_h, [srcend, -32]
	stp	C_l, C_h, [dstend, -48]
	ldp	C_l, C_h, [srcend, -48]
	stp	D_l, D_h, [dstend, -64]!
	ldp	D_l, D_h, [srcend, -64]!
	subs	count, count, 64
	b.hi	1b

	/* Write the last full set of 64 bytes.  The remainder is at most 64
	   bytes, so it is safe to always copy 64 bytes from the start even if
	   there is just 1 byte left.  */
2:
	ldp	G_l, G_h, [src, 48]
	stp	A_l, A_h, [dstend, -16]
	ldp	A_l, A_h, [src, 32]
	stp	B_l, B_h, [dstend, -32]
	ldp	B_l, B_h, [src, 16]
	stp	C_l, C_h, [dstend, -48]
	ldp	C_l, C_h, [src]
	stp	D_l, D_h, [dstend, -64]
	stp	G_l, G_h, [dstin, 48]
	stp	A_l, A_h, [dstin, 32]
	stp	B_l, B_h, [dstin, 16]
	stp	C_l, C_h, [dstin]
3:	ret

END (MEMCPY)
libc_hidden_builtin_def (MEMCPY)