/* Copyright (C) 2017-2019 Free Software Foundation, Inc.

   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library.  If not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>

/* Assumptions: ARMv8-a, AArch64, falkor, unaligned accesses.  */

#define dstin	x0
#define src	x1
#define count	x2
#define dst	x3
#define srcend	x4
#define dstend	x5
#define A_x	x6
#define B_x	x7
#define A_w	w6
#define B_w	w7
#define tmp1	x14

#define Q_q	q6
#define A_q	q22
#define B_q	q18
#define C_q	q19
#define D_q	q20
#define E_q	q21
#define F_q	q17
#define G_q	q23

/* RATIONALE:

   The move has 4 distinct parts:
   * Small moves of 16 bytes and under.
   * Medium-sized moves of 17-96 bytes.
   * Large moves where the source address is higher than the destination
     (forward copies).
   * Large moves where the destination address is higher than the source
     (copy backward, or move).

   We use only two registers, q6 and q22, for the moves and move 32 bytes
   at a time to correctly train the hardware prefetcher for better
   throughput.  */
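/* ENTRY_ALIGN's second argument is a log2 exponent, so the entry point
   is aligned to 2^6 = 64 bytes, keeping the short dispatch sequence
   below within a single 64-byte cache line.  */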
ENTRY_ALIGN (__memmove_falkor, 6)

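	/* Overlap check: tmp1 = dstin - src.  When count > 96 and
	   tmp1 < count (unsigned), the destination lies inside the source
	   range, so a forward copy would clobber unread source bytes;
	   branch to the backward copy.  The ccmp forces C set when
	   count <= 96, so b.lo falls through for all small and medium
	   copies.  */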
	sub	tmp1, dstin, src
	add	srcend, src, count
	add	dstend, dstin, count
	cmp	count, 96
	ccmp	tmp1, count, 2, hi
	b.lo	L(move_long)

	cmp	count, 16
	b.ls	L(copy16)
	cmp	count, 96
	b.hi	L(copy_long)

	/* Medium copies: 17..96 bytes.  */
	sub	tmp1, count, 1
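	/* Bit 6 of count - 1 is set iff count >= 65, which is handled by
	   copy96; bit 5 is set iff count >= 33, which needs the extra
	   pair of 16-byte moves below.  */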
	ldr	A_q, [src]
	tbnz	tmp1, 6, L(copy96)
	ldr	D_q, [srcend, -16]
	tbz	tmp1, 5, 1f
	ldr	B_q, [src, 16]
	ldr	C_q, [srcend, -32]
	str	B_q, [dstin, 16]
	str	C_q, [dstend, -32]
1:
	str	A_q, [dstin]
	str	D_q, [dstend, -16]
	ret

	.p2align 4
	/* Small copies: 0..16 bytes.  */
L(copy16):
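	/* Each case loads both ends of the buffer before storing
	   anything, so these copies are overlap-safe in either
	   direction.  */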
	cmp	count, 8
	b.lo	1f
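	/* 8-16 */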
	ldr	A_x, [src]
	ldr	B_x, [srcend, -8]
	str	A_x, [dstin]
	str	B_x, [dstend, -8]
	ret
	.p2align 4
1:
	/* 4-7 */
	tbz	count, 2, 1f
	ldr	A_w, [src]
	ldr	B_w, [srcend, -4]
	str	A_w, [dstin]
	str	B_w, [dstend, -4]
	ret
	.p2align 4
1:
	/* 2-3 */
	tbz	count, 1, 1f
	ldrh	A_w, [src]
	ldrh	B_w, [srcend, -2]
	strh	A_w, [dstin]
	strh	B_w, [dstend, -2]
	ret
	.p2align 4
1:
	/* 0-1 */
	tbz	count, 0, 1f
	ldrb	A_w, [src]
	strb	A_w, [dstin]
1:	ret

	.p2align 4
	/* Copy 65..96 bytes.  Copy 64 bytes from the start and
	   32 bytes from the end.  */
L(copy96):
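	/* A_q already holds bytes 0..15 from the medium-copy entry.
	   With 65..96 bytes in total, the 64 bytes stored from the start
	   and the 32 stored from the end overlap harmlessly in the
	   middle.  */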
	ldr	B_q, [src, 16]
	ldr	C_q, [src, 32]
	ldr	D_q, [src, 48]
	ldr	E_q, [srcend, -32]
	ldr	F_q, [srcend, -16]
	str	A_q, [dstin]
	str	B_q, [dstin, 16]
	str	C_q, [dstin, 32]
	str	D_q, [dstin, 48]
	str	E_q, [dstend, -32]
	str	F_q, [dstend, -16]
	ret

	/* Align SRC to 16 byte alignment so that we don't cross cache line
	   boundaries on both loads and stores.  There are at least 96 bytes
	   to copy, so copy 16 bytes unaligned and then align.  The loop
	   copies 32 bytes per iteration and prefetches one iteration ahead.  */

	.p2align 4
L(copy_long):
	ldr	A_q, [src]
	and	tmp1, src, 15
	bic	src, src, 15
	sub	dst, dstin, tmp1
	add	count, count, tmp1	/* Count is now 16 too large.  */
	ldr	Q_q, [src, 16]!
	str	A_q, [dstin]
	ldr	A_q, [src, 16]!
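	/* Readjust count before entering the loop: 16 compensates the
	   over-count noted above, 32 covers the two chunks already
	   loaded into Q_q and A_q, and 64 stays in reserve for the
	   L(last64) tail.  */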
	subs	count, count, 32 + 64 + 16	/* Test and readjust count.  */
	b.ls	L(last64)

L(loop64):
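	/* Each iteration interleaves one 16-byte store with the load of
	   the next chunk, streaming 32 bytes per iteration through only
	   Q_q and A_q; per the rationale above, this is the pattern that
	   trains the falkor hardware prefetcher.  */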
	subs	count, count, 32
	str	Q_q, [dst, 16]
	ldr	Q_q, [src, 16]!
	str	A_q, [dst, 32]!
	ldr	A_q, [src, 16]!
	b.hi	L(loop64)

	/* Write the last full set of 64 bytes.  The remainder is at most 64
	   bytes and at least 33 bytes, so it is safe to always copy 64 bytes
	   from the end.  */
L(last64):
	ldr	C_q, [srcend, -64]
	str	Q_q, [dst, 16]
	ldr	B_q, [srcend, -48]
	str	A_q, [dst, 32]
	ldr	A_q, [srcend, -32]
	ldr	D_q, [srcend, -16]
	str	C_q, [dstend, -64]
	str	B_q, [dstend, -48]
	str	A_q, [dstend, -32]
	str	D_q, [dstend, -16]
	ret

	.p2align 4
L(move_long):
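	/* dstin == src means there is nothing to move.  */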
	cbz	tmp1, 3f

	/* Align SRCEND to 16 byte alignment so that we don't cross cache line
	   boundaries on both loads and stores.  There are at least 96 bytes
	   to copy, so copy 16 bytes unaligned and then align.  The loop
	   copies 32 bytes per iteration and prefetches one iteration ahead.  */

	ldr	A_q, [srcend, -16]
	and	tmp1, srcend, 15
	sub	srcend, srcend, tmp1
	ldr	Q_q, [srcend, -16]!
	str	A_q, [dstend, -16]
	sub	count, count, tmp1
	ldr	A_q, [srcend, -16]!
	sub	dstend, dstend, tmp1
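	/* As in the forward path, 32 bytes are already in flight in Q_q
	   and A_q, and 64 bytes are kept in reserve for the tail copy at
	   2: below.  */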
	subs	count, count, 32 + 64
	b.ls	2f

1:
	subs	count, count, 32
	str	Q_q, [dstend, -16]
	ldr	Q_q, [srcend, -16]!
	str	A_q, [dstend, -32]!
	ldr	A_q, [srcend, -16]!
	b.hi	1b

	/* Write the last full set of 64 bytes.  The remainder is at most 64
	   bytes and at least 33 bytes, so it is safe to always copy 64 bytes
	   from the start.  */
2:
	ldr	C_q, [src, 48]
	str	Q_q, [dstend, -16]
	ldr	B_q, [src, 32]
	str	A_q, [dstend, -32]
	ldr	A_q, [src, 16]
	ldr	D_q, [src]
	str	C_q, [dstin, 48]
	str	B_q, [dstin, 32]
	str	A_q, [dstin, 16]
	str	D_q, [dstin]
3:	ret

END (__memmove_falkor)
libc_hidden_builtin_def (__memmove_falkor)