/* Copyright (C) 2012-2015 Free Software Foundation, Inc.

   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library.  If not, see
   <http://www.gnu.org/licenses/>.  */

#include <sysdep.h>

/* Assumptions:
 *
 * ARMv8-a, AArch64
 * Unaligned accesses
 */

/* void *memmove (void *dstin, const void *src, size_t count)
 *
 * AAPCS64 register usage:
 *   x0 = dstin  (destination; never written here, so it is returned intact)
 *   x1 = src    (source)
 *   x2 = count  (bytes to move)
 * Scratch: x3-x14 (named below) and the condition flags.
 *
 * Overlap handling: if DST < SRC (or the regions are >= 16 bytes apart
 * for a downwards move), this tail-calls memcpy.  Otherwise an upwards
 * (backwards-copying) path is used so overlapping bytes are read before
 * they are overwritten.  */

/* Parameters and result.  */
#define dstin	x0
#define src	x1
#define count	x2
#define tmp1	x3
#define tmp1w	w3
#define tmp2	x4
#define tmp2w	w4
#define tmp3	x5
#define tmp3w	w5
#define dst	x6

#define A_l	x7
#define A_h	x8
#define B_l	x9
#define B_h	x10
#define C_l	x11
#define C_h	x12
#define D_l	x13
#define D_h	x14

ENTRY_ALIGN (memmove, 6)

	cmp	dstin, src
	b.lo	L(downwards)
	add	tmp1, src, count
	cmp	dstin, tmp1
	b.hs	memcpy		/* No overlap.  */

	/* Upwards move with potential overlap.
	 * Need to move from the tail backwards.  SRC and DST point one
	 * byte beyond the remaining data to move.  */
	add	dst, dstin, count
	add	src, src, count
	cmp	count, #64
	b.ge	L(mov_not_short_up)

	/* Deal with small moves quickly by dropping straight into the
	 * exit block.  */
L(tail63up):
	/* Move up to 48 bytes of data.  At this point we only need the
	 * bottom 6 bits of count to be accurate.  */
	ands	tmp1, count, #0x30
	b.eq	L(tail15up)
	sub	dst, dst, tmp1
	sub	src, src, tmp1
	cmp	tmp1w, #0x20
	b.eq	1f
	b.lt	2f
	ldp	A_l, A_h, [src, #32]
	stp	A_l, A_h, [dst, #32]
1:
	ldp	A_l, A_h, [src, #16]
	stp	A_l, A_h, [dst, #16]
2:
	ldp	A_l, A_h, [src]
	stp	A_l, A_h, [dst]
L(tail15up):
	/* Move up to 15 bytes of data.  Does not assume additional data
	 * being moved.  Each bit of COUNT selects one power-of-two sized
	 * chunk, moved with pre-indexed (decrementing) addressing.  */
	tbz	count, #3, 1f
	ldr	tmp1, [src, #-8]!
	str	tmp1, [dst, #-8]!
1:
	tbz	count, #2, 1f
	ldr	tmp1w, [src, #-4]!
	str	tmp1w, [dst, #-4]!
1:
	tbz	count, #1, 1f
	ldrh	tmp1w, [src, #-2]!
	strh	tmp1w, [dst, #-2]!
1:
	tbz	count, #0, 1f
	ldrb	tmp1w, [src, #-1]
	strb	tmp1w, [dst, #-1]
1:
	RET

L(mov_not_short_up):
	/* We don't much care about the alignment of DST, but we want SRC
	 * to be 128-bit (16 byte) aligned so that we don't cross cache line
	 * boundaries on both loads and stores.  */
	ands	tmp2, src, #15		/* Bytes to reach alignment.  */
	b.eq	2f
	sub	count, count, tmp2
	/* Move enough data to reach alignment; unlike memcpy, we have to
	 * be aware of the overlap, which means we can't move data twice.  */
	tbz	tmp2, #3, 1f
	ldr	tmp1, [src, #-8]!
	str	tmp1, [dst, #-8]!
1:
	tbz	tmp2, #2, 1f
	ldr	tmp1w, [src, #-4]!
	str	tmp1w, [dst, #-4]!
1:
	tbz	tmp2, #1, 1f
	ldrh	tmp1w, [src, #-2]!
	strh	tmp1w, [dst, #-2]!
1:
	tbz	tmp2, #0, 1f
	ldrb	tmp1w, [src, #-1]!
	strb	tmp1w, [dst, #-1]!
1:

	/* There may be less than 63 bytes to go now.  */
	cmp	count, #63
	b.le	L(tail63up)
2:
	subs	count, count, #128
	b.ge	L(mov_body_large_up)
	/* Less than 128 bytes to move, so handle 64 here and then jump
	 * to the tail.  */
	ldp	A_l, A_h, [src, #-64]!
	ldp	B_l, B_h, [src, #16]
	ldp	C_l, C_h, [src, #32]
	ldp	D_l, D_h, [src, #48]
	stp	A_l, A_h, [dst, #-64]!
	stp	B_l, B_h, [dst, #16]
	stp	C_l, C_h, [dst, #32]
	stp	D_l, D_h, [dst, #48]
	tst	count, #0x3f
	b.ne	L(tail63up)
	RET

	/* Critical loop.  Start at a new Icache line boundary.  Assuming
	 * 64 bytes per line this ensures the entire loop is in one line.  */
	.p2align 6
L(mov_body_large_up):
	/* There are at least 128 bytes to move.  The loop below is
	 * software-pipelined: each iteration stores the four pairs loaded
	 * by the previous one while loading the next 64 bytes.  */
	ldp	A_l, A_h, [src, #-16]
	ldp	B_l, B_h, [src, #-32]
	ldp	C_l, C_h, [src, #-48]
	ldp	D_l, D_h, [src, #-64]!
1:
	stp	A_l, A_h, [dst, #-16]
	ldp	A_l, A_h, [src, #-16]
	stp	B_l, B_h, [dst, #-32]
	ldp	B_l, B_h, [src, #-32]
	stp	C_l, C_h, [dst, #-48]
	ldp	C_l, C_h, [src, #-48]
	stp	D_l, D_h, [dst, #-64]!
	ldp	D_l, D_h, [src, #-64]!
	subs	count, count, #64
	b.ge	1b
	/* Drain the last pre-loaded 64 bytes.  */
	stp	A_l, A_h, [dst, #-16]
	stp	B_l, B_h, [dst, #-32]
	stp	C_l, C_h, [dst, #-48]
	stp	D_l, D_h, [dst, #-64]!
	tst	count, #0x3f
	b.ne	L(tail63up)
	RET

L(downwards):
	/* For a downwards move we can safely use memcpy provided that
	 * DST is more than 16 bytes away from SRC.  */
	sub	tmp1, src, #16
	cmp	dstin, tmp1
	b.ls	memcpy		/* May overlap, but not critically.  */

	mov	dst, dstin	/* Preserve DSTIN for return value.  */
	cmp	count, #64
	b.ge	L(mov_not_short_down)

	/* Deal with small moves quickly by dropping straight into the
	 * exit block.  */
L(tail63down):
	/* Move up to 48 bytes of data.  At this point we only need the
	 * bottom 6 bits of count to be accurate.  */
	ands	tmp1, count, #0x30
	b.eq	L(tail15down)
	add	dst, dst, tmp1
	add	src, src, tmp1
	cmp	tmp1w, #0x20
	b.eq	1f
	b.lt	2f
	ldp	A_l, A_h, [src, #-48]
	stp	A_l, A_h, [dst, #-48]
1:
	ldp	A_l, A_h, [src, #-32]
	stp	A_l, A_h, [dst, #-32]
2:
	ldp	A_l, A_h, [src, #-16]
	stp	A_l, A_h, [dst, #-16]
L(tail15down):
	/* Move up to 15 bytes of data.  Does not assume additional data
	   being moved.  Post-indexed (incrementing) addressing here,
	   mirroring the pre-indexed variant in L(tail15up).  */
	tbz	count, #3, 1f
	ldr	tmp1, [src], #8
	str	tmp1, [dst], #8
1:
	tbz	count, #2, 1f
	ldr	tmp1w, [src], #4
	str	tmp1w, [dst], #4
1:
	tbz	count, #1, 1f
	ldrh	tmp1w, [src], #2
	strh	tmp1w, [dst], #2
1:
	tbz	count, #0, 1f
	ldrb	tmp1w, [src]
	strb	tmp1w, [dst]
1:
	RET

L(mov_not_short_down):
	/* We don't much care about the alignment of DST, but we want SRC
	 * to be 128-bit (16 byte) aligned so that we don't cross cache line
	 * boundaries on both loads and stores.  */
	neg	tmp2, src
	ands	tmp2, tmp2, #15		/* Bytes to reach alignment.  */
	b.eq	2f
	sub	count, count, tmp2
	/* Move enough data to reach alignment; unlike memcpy, we have to
	 * be aware of the overlap, which means we can't move data twice.  */
	tbz	tmp2, #3, 1f
	ldr	tmp1, [src], #8
	str	tmp1, [dst], #8
1:
	tbz	tmp2, #2, 1f
	ldr	tmp1w, [src], #4
	str	tmp1w, [dst], #4
1:
	tbz	tmp2, #1, 1f
	ldrh	tmp1w, [src], #2
	strh	tmp1w, [dst], #2
1:
	tbz	tmp2, #0, 1f
	ldrb	tmp1w, [src], #1
	strb	tmp1w, [dst], #1
1:

	/* There may be less than 63 bytes to go now.  */
	cmp	count, #63
	b.le	L(tail63down)
2:
	subs	count, count, #128
	b.ge	L(mov_body_large_down)
	/* Less than 128 bytes to move, so handle 64 here and then jump
	 * to the tail.  */
	ldp	A_l, A_h, [src]
	ldp	B_l, B_h, [src, #16]
	ldp	C_l, C_h, [src, #32]
	ldp	D_l, D_h, [src, #48]
	stp	A_l, A_h, [dst]
	stp	B_l, B_h, [dst, #16]
	stp	C_l, C_h, [dst, #32]
	stp	D_l, D_h, [dst, #48]
	tst	count, #0x3f
	add	src, src, #64
	add	dst, dst, #64
	b.ne	L(tail63down)
	RET

	/* Critical loop.  Start at a new cache line boundary.  Assuming
	 * 64 bytes per line this ensures the entire loop is in one line.  */
	.p2align 6
L(mov_body_large_down):
	/* There are at least 128 bytes to move.  Same software-pipelined
	 * shape as L(mov_body_large_up), running forwards.  */
	ldp	A_l, A_h, [src, #0]
	sub	dst, dst, #16		/* Pre-bias.  */
	ldp	B_l, B_h, [src, #16]
	ldp	C_l, C_h, [src, #32]
	ldp	D_l, D_h, [src, #48]!	/* src += 64 - Pre-bias.  */
1:
	stp	A_l, A_h, [dst, #16]
	ldp	A_l, A_h, [src, #16]
	stp	B_l, B_h, [dst, #32]
	ldp	B_l, B_h, [src, #32]
	stp	C_l, C_h, [dst, #48]
	ldp	C_l, C_h, [src, #48]
	stp	D_l, D_h, [dst, #64]!
	ldp	D_l, D_h, [src, #64]!
	subs	count, count, #64
	b.ge	1b
	/* Drain the last pre-loaded 64 bytes, then undo the pre-bias so
	 * SRC/DST point just past the data moved so far.  */
	stp	A_l, A_h, [dst, #16]
	stp	B_l, B_h, [dst, #32]
	stp	C_l, C_h, [dst, #48]
	stp	D_l, D_h, [dst, #64]
	add	src, src, #16
	add	dst, dst, #64 + 16
	tst	count, #0x3f
	b.ne	L(tail63down)
	RET
END (memmove)

libc_hidden_builtin_def (memmove)