[thirdparty/linux.git] / arch / arm64 / lib / memset.S

/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (C) 2013 ARM Ltd.
 * Copyright (C) 2013 Linaro.
 *
 * This code is based on glibc cortex strings work originally authored by
 * Linaro. The original code can be found @
 *
 * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
 * files/head:/src/aarch64/
 */

#include <linux/linkage.h>
#include <asm/assembler.h>
#include <asm/cache.h>

/*
 * Fill in the buffer with character c (alignment handled by the hardware)
 *
 * Parameters:
 *	x0 - buf
 *	x1 - c
 *	x2 - n
 * Returns:
 *	x0 - buf
 */

/*
 * Register aliases used by memset below. Each x/w pair names the 64-bit
 * and 32-bit views of the same register.
 */
dstin		.req	x0	/* buf argument; only read, so x0 is still buf at return */
val		.req	w1	/* c argument; only its low byte is used */
count		.req	x2	/* n argument; updated as bytes are written */
tmp1		.req	x3	/* scratch */
tmp1w		.req	w3
tmp2		.req	x4	/* scratch: distance to the next alignment boundary */
tmp2w		.req	w4	/* not referenced in the code visible here */
zva_len_x	.req	x5	/* DC ZVA block size in bytes, computed from DCZID_EL0 */
zva_len		.req	w5
zva_bits_x	.req	x6	/* zva_len_x - 1, used as a mask */

A_l		.req	x7	/* fill pattern: the fill byte replicated 8 times */
A_lw		.req	w7
dst		.req	x8	/* running store address, starts as a copy of dstin */
tmp3w		.req	w9	/* scratch used while decoding DCZID_EL0 */
tmp3		.req	x9	/* not referenced in the code visible here */

/*
 * memset / __memset: both names label the same entry point; memset is
 * additionally marked weak.
 *
 * Strategy:
 *  - count <= 15: up to four stores (8/4/2/1 bytes) chosen by the bits of
 *    count.
 *  - otherwise: one possibly unaligned 16-byte store, then dst is rounded
 *    up to a 16-byte boundary, then 64 bytes per iteration in .Lnot_short,
 *    and the last 0..63 bytes in .Ltail63.
 *  - if the fill pattern is zero and at least 128 bytes remain, whole
 *    blocks are zeroed with DC ZVA when DCZID_EL0 allows it and the block
 *    size is a multiple of 64 bytes.
 */
	.weak memset
ENTRY(__memset)
ENTRY(memset)
	mov	dst, dstin	/* Work on a copy so x0 still holds buf at return. */
	/*
	 * Replicate the low byte of val through all 8 bytes of A_l:
	 * 1 byte -> 2 -> 4 (32-bit view), then 4 -> 8 (64-bit view).
	 */
	and	A_lw, val, #255
	orr	A_lw, A_lw, A_lw, lsl #8
	orr	A_lw, A_lw, A_lw, lsl #16
	orr	A_l, A_l, A_l, lsl #32

	cmp	count, #15
	b.hi	.Lover16_proc	/* Unsigned compare: count >= 16. */
	/*
	 * count is 0..15: bits 3, 2, 1 and 0 of count select an 8-, 4-, 2-
	 * and 1-byte store respectively. Any of these stores may be unaligned.
	 */
	tbz	count, #3, 1f
	str	A_l, [dst], #8
1:
	tbz	count, #2, 2f
	str	A_lw, [dst], #4
2:
	tbz	count, #1, 3f
	strh	A_lw, [dst], #2
3:
	tbz	count, #0, 4f
	strb	A_lw, [dst]
4:
	ret

.Lover16_proc:
	/* tmp2 = bytes from dst up to the next 16-byte boundary (0 if aligned). */
	neg	tmp2, dst
	ands	tmp2, tmp2, #15
	b.eq	.Laligned
/*
 * count is at least 16, so one unaligned 16-byte stp at the start stays
 * inside the buffer. dst is then advanced only by tmp2, up to the 16-byte
 * boundary; the bytes between that boundary and the end of this store are
 * written again by the stores that follow.
 */
	stp	A_l, A_l, [dst] /* Unaligned store. */
	/* Move dst up to the boundary and account for the bytes skipped. */
	sub	count, count, tmp2
	add	dst, dst, tmp2

.Laligned:
	cbz	A_l, .Lzero_mem	/* Fill pattern is zero: try the zeroing path. */

.Ltail_maybe_long:
	/*
	 * NOTE(review): this compare is signed (b.ge), unlike the unsigned
	 * b.hi at entry, so a count with bit 63 set falls through to
	 * .Ltail63 here - confirm callers never pass such a count.
	 */
	cmp	count, #64
	b.ge	.Lnot_short
.Ltail63:
	/*
	 * Write the final 0..63 bytes. Only the low six bits of count are
	 * examined from here on, so this path also works when it is entered
	 * from .Lnot_short with a negative count.
	 * tmp1 = count & 0x30 selects three (0x30), two (0x20), one (0x10)
	 * or no 16-byte stores.
	 */
	ands	tmp1, count, #0x30
	b.eq	3f
	cmp	tmp1w, #0x20
	b.eq	1f
	b.lt	2f
	stp	A_l, A_l, [dst], #16
1:
	stp	A_l, A_l, [dst], #16
2:
	stp	A_l, A_l, [dst], #16
/*
 * Fewer than 16 bytes remain. Write them with one 16-byte stp that ends
 * exactly at the end of the buffer; it rewrites some bytes that were
 * already stored and the access may be unaligned.
 */
3:
	ands	count, count, #15
	cbz	count, 4f
	add	dst, dst, count
	stp	A_l, A_l, [dst, #-16]	/* Repeat some/all of last store. */
4:
	ret

	/*
	 * Critical loop. Start at a new cache line boundary. Assuming
	 * 64 bytes per line, this ensures the entire loop is in one line.
	 */
	.p2align	L1_CACHE_SHIFT
.Lnot_short:
	sub	dst, dst, #16/* Pre-bias so the writeback store below lands dst on the next 64 bytes. */
	sub	count, count, #64	/* Pre-decrement: loop while count stays >= 0. */
1:
	stp	A_l, A_l, [dst, #16]
	stp	A_l, A_l, [dst, #32]
	stp	A_l, A_l, [dst, #48]
	stp	A_l, A_l, [dst, #64]!	/* Pre-indexed with writeback: dst += 64. */
	subs	count, count, #64
	b.ge	1b
	/*
	 * count is now negative, but its low six bits still give the number
	 * of bytes left to write. The add below does not set flags, so b.ne
	 * tests the result of tst.
	 */
	tst	count, #0x3f
	add	dst, dst, #16	/* Undo the pre-bias. */
	b.ne	.Ltail63
.Lexitfunc:
	ret

	/*
	 * For zeroing memory, check to see if we can use the ZVA feature to
	 * zero entire 'cache' lines.
	 */
.Lzero_mem:
	cmp	count, #63
	b.le	.Ltail63
	/*
	 * For zeroing small amounts of memory, it's not worth setting up
	 * the line-clear code.
	 */
	cmp	count, #128
	b.lt	.Lnot_short /* Fewer than 128 bytes: use the plain store loop. */

	/*
	 * DCZID_EL0: bit 4 set means DC ZVA must not be used; bits 3:0 hold
	 * log2 of the block size in 4-byte words, so zva_len = 4 << bits[3:0]
	 * bytes.
	 */
	mrs	tmp1, dczid_el0
	tbnz	tmp1, #4, .Lnot_short
	mov	tmp3w, #4
	and	zva_len, tmp1w, #15	/* Safety: other bits reserved.  */
	lsl	zva_len, tmp3w, zva_len

	ands	tmp3w, zva_len, #63
	/*
	 * Fall back to the store loop unless zva_len is a multiple of 64,
	 * which for a power of two means it is at least 64.
	 * It is not meaningful to use ZVA if the block size is less than 64.
	 */
	b.ne	.Lnot_short
.Lzero_by_line:
	/*
	 * Compute how far we need to go to become suitably aligned. We're
	 * already at quad-word alignment.
	 */
	cmp	count, zva_len_x
	b.lt	.Lnot_short		/* Not enough to reach alignment.  */
	sub	zva_bits_x, zva_len_x, #1	/* Mask of the offset-within-block bits. */
	neg	tmp2, dst
	ands	tmp2, tmp2, zva_bits_x	/* tmp2 = bytes up to the next block boundary. */
	b.eq	2f			/* Already aligned.  */
	/* Not aligned, check that there's enough to zero after alignment. */
	sub	tmp1, count, tmp2
	/*
	 * Guarantee that what remains after alignment is at least 64 bytes
	 * and at least one whole ZVA block, so that neither the alignment
	 * loop at 1: nor the DC ZVA loop at 3: runs past the buffer.
	 * If tmp1 < 64 the ccmp sets NZCV to 0b1000 instead of comparing,
	 * which makes the b.lt below taken.
	 */
	cmp	tmp1, #64
	ccmp	tmp1, zva_len_x, #8, ge	/* NZCV=0b1000 */
	b.lt	.Lnot_short
	/*
	 * We know that there's at least 64 bytes to zero and that it's safe
	 * to overrun by 64 bytes.
	 */
	mov	count, tmp1
	/*
	 * Zero 64 bytes per iteration until tmp2 goes negative. The stp and
	 * add between subs and b.ge do not set flags.
	 */
1:
	stp	A_l, A_l, [dst]
	stp	A_l, A_l, [dst, #16]
	stp	A_l, A_l, [dst, #32]
	subs	tmp2, tmp2, #64
	stp	A_l, A_l, [dst, #48]
	add	dst, dst, #64
	b.ge	1b
	/* We've overrun a bit; tmp2 is negative, so this moves dst back to the block boundary. */
	add	dst, dst, tmp2
2:
	sub	count, count, zva_len_x	/* Pre-decrement: loop while count stays >= 0. */
3:
	dc	zva, dst		/* Zero one whole block at dst. */
	add	dst, dst, zva_len_x
	subs	count, count, zva_len_x
	b.ge	3b
	/* The low bits of count give the number of bytes still to be zeroed. */
	ands	count, count, zva_bits_x
	b.ne	.Ltail_maybe_long
	ret
ENDPIPROC(memset)
EXPORT_SYMBOL(memset)
ENDPROC(__memset)
EXPORT_SYMBOL(__memset)
Commit	Line	Data
caab277b	1	/* SPDX-License-Identifier: GPL-2.0-only */
4a899227 CM	2	/*
4a899227 CM	3	* Copyright (C) 2013 ARM Ltd.
b29a51fe	4	* Copyright (C) 2013 Linaro.
	5	*
	6	* This code is based on glibc cortex strings work originally authored by Linaro
b29a51fe	7	* be found @
	8	*
	9	* http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
	10	* files/head:/src/aarch64/
4a899227 CM	11	*/
	12
	13	#include <linux/linkage.h>
	14	#include <asm/assembler.h>
b29a51fe	15	#include <asm/cache.h>
4a899227 CM	16
	17	/*
	18	* Fill in the buffer with character c (alignment handled by the hardware)
	19	*
	20	* Parameters:
	21	* x0 - buf
	22	* x1 - c
	23	* x2 - n
	24	* Returns:
	25	* x0 - buf
	26	*/
b29a51fe	27
	28	dstin .req x0
	29	val .req w1
	30	count .req x2
	31	tmp1 .req x3
	32	tmp1w .req w3
	33	tmp2 .req x4
	34	tmp2w .req w4
	35	zva_len_x .req x5
	36	zva_len .req w5
	37	zva_bits_x .req x6
	38
	39	A_l .req x7
	40	A_lw .req w7
	41	dst .req x8
	42	tmp3w .req w9
	43	tmp3 .req x9
	44
39d114dd AR	45	.weak memset
39d114dd AR	46	ENTRY(__memset)
4a899227	47	ENTRY(memset)
b29a51fe	48	mov dst, dstin /* Preserve return value. */
	49	and A_lw, val, #255
	50	orr A_lw, A_lw, A_lw, lsl #8
	51	orr A_lw, A_lw, A_lw, lsl #16
	52	orr A_l, A_l, A_l, lsl #32
	53
	54	cmp count, #15
	55	b.hi .Lover16_proc
	56	/All store maybe are non-aligned../
	57	tbz count, #3, 1f
	58	str A_l, [dst], #8
	59	1:
	60	tbz count, #2, 2f
	61	str A_lw, [dst], #4
	62	2:
	63	tbz count, #1, 3f
	64	strh A_lw, [dst], #2
	65	3:
	66	tbz count, #0, 4f
	67	strb A_lw, [dst]
	68	4:
	69	ret
	70
	71	.Lover16_proc:
	72	/Whether the start address is aligned with 16./
	73	neg tmp2, dst
	74	ands tmp2, tmp2, #15
	75	b.eq .Laligned
	76	/*
	77	* The count is not less than 16, we can use stp to store the start 16 bytes,
	78	* then adjust the dst aligned with 16.This process will make the current
	79	* memory address at alignment boundary.
	80	*/
	81	stp A_l, A_l, [dst] /non-aligned store../
	82	/make the dst aligned../
	83	sub count, count, tmp2
	84	add dst, dst, tmp2
	85
	86	.Laligned:
	87	cbz A_l, .Lzero_mem
	88
	89	.Ltail_maybe_long:
	90	cmp count, #64
	91	b.ge .Lnot_short
	92	.Ltail63:
	93	ands tmp1, count, #0x30
	94	b.eq 3f
	95	cmp tmp1w, #0x20
	96	b.eq 1f
	97	b.lt 2f
	98	stp A_l, A_l, [dst], #16
	99	1:
	100	stp A_l, A_l, [dst], #16
	101	2:
	102	stp A_l, A_l, [dst], #16
	103	/*
	104	* The last store length is less than 16,use stp to write last 16 bytes.
	105	* It will lead some bytes written twice and the access is non-aligned.
	106	*/
	107	3:
	108	ands count, count, #15
	109	cbz count, 4f
	110	add dst, dst, count
	111	stp A_l, A_l, [dst, #-16] /* Repeat some/all of last store. */
112	4:
113	ret
114
115	/*
116	* Critical loop. Start at a new cache line boundary. Assuming
117	* 64 bytes per line, this ensures the entire loop is in one line.
118	*/
119	.p2align L1_CACHE_SHIFT
120	.Lnot_short:
121	sub dst, dst, #16/* Pre-bias. */
122	sub count, count, #64
123	1:
124	stp A_l, A_l, [dst, #16]
125	stp A_l, A_l, [dst, #32]
126	stp A_l, A_l, [dst, #48]
127	stp A_l, A_l, [dst, #64]!
128	subs count, count, #64
129	b.ge 1b
130	tst count, #0x3f
131	add dst, dst, #16
132	b.ne .Ltail63
133	.Lexitfunc:
134	ret
135
136	/*
137	* For zeroing memory, check to see if we can use the ZVA feature to
138	* zero entire 'cache' lines.
139	*/
140	.Lzero_mem:
141	cmp count, #63
142	b.le .Ltail63
143	/*
144	* For zeroing small amounts of memory, it's not worth setting up
145	* the line-clear code.
146	*/
147	cmp count, #128
148	b.lt .Lnot_short /count is at least 128 bytes/
149
150	mrs tmp1, dczid_el0
151	tbnz tmp1, #4, .Lnot_short
152	mov tmp3w, #4
153	and zva_len, tmp1w, #15 /* Safety: other bits reserved. */
154	lsl zva_len, tmp3w, zva_len
155
156	ands tmp3w, zva_len, #63
157	/*
158	* ensure the zva_len is not less than 64.
159	* It is not meaningful to use ZVA if the block size is less than 64.
160	*/
161	b.ne .Lnot_short
162	.Lzero_by_line:
163	/*
164	* Compute how far we need to go to become suitably aligned. We're
165	* already at quad-word alignment.
166	*/
167	cmp count, zva_len_x
168	b.lt .Lnot_short /* Not enough to reach alignment. */
169	sub zva_bits_x, zva_len_x, #1
170	neg tmp2, dst
171	ands tmp2, tmp2, zva_bits_x
172	b.eq 2f /* Already aligned. */
173	/* Not aligned, check that there's enough to copy after alignment.*/
174	sub tmp1, count, tmp2
175	/*
176	* grantee the remain length to be ZVA is bigger than 64,
177	* avoid to make the 2f's process over mem range.*/
178	cmp tmp1, #64
179	ccmp tmp1, zva_len_x, #8, ge /* NZCV=0b1000 */
180	b.lt .Lnot_short
181	/*
182	* We know that there's at least 64 bytes to zero and that it's safe
183	* to overrun by 64 bytes.
184	*/
185	mov count, tmp1
186	1:
187	stp A_l, A_l, [dst]
188	stp A_l, A_l, [dst, #16]
189	stp A_l, A_l, [dst, #32]
190	subs tmp2, tmp2, #64
191	stp A_l, A_l, [dst, #48]
192	add dst, dst, #64
193	b.ge 1b
194	/* We've overrun a bit, so adjust dst downwards.*/
195	add dst, dst, tmp2
196	2:
197	sub count, count, zva_len_x
198	3:
199	dc zva, dst
200	add dst, dst, zva_len_x
201	subs count, count, zva_len_x
202	b.ge 3b
203	ands count, count, zva_bits_x
204	b.ne .Ltail_maybe_long
205	ret
20791846	206	ENDPIPROC(memset)
ac0e8c72	207	EXPORT_SYMBOL(memset)
39d114dd	208	ENDPROC(__memset)
ac0e8c72	209	EXPORT_SYMBOL(__memset)