[thirdparty/glibc.git] / sysdeps / alpha / stxcpy.S

/* Copyright (C) 1996-2019 Free Software Foundation, Inc.
   Contributed by Richard Henderson (rth@tamu.edu)
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library.  If not, see
   <http://www.gnu.org/licenses/>.  */

/* Copy a null-terminated string from SRC to DST.

   This is an internal routine used by strcpy, stpcpy, and strcat.
   As such, it uses special linkage conventions to make implementation
   of these public functions more efficient.

   On input:
	t9 = return address
	a0 = DST
	a1 = SRC

   On output:
	t8  = bitmask (with one bit set) indicating the last byte written
	a0  = unaligned address of the last *word* written

   Furthermore, v0, a3-a5, t11, and t12 are untouched.  The routine
   uses t0-t7 as scratch and modifies a1 as it walks the source.
*/

/* This is generally scheduled for the EV5, but should still be pretty
   good for the EV4 too.  */

#include <sysdep.h>

	/* The code below is hand-scheduled: keep the assembler from using
	   the at scratch register on its own (noat) and from reordering
	   instructions (noreorder).  */
	.set noat
	.set noreorder

	.text
	.type	__stxcpy, @function
	.globl	__stxcpy
	/* Declare that __stxcpy does not use the procedure value (pv)
	   register.  */
	.usepv	__stxcpy, no

	cfi_startproc
	/* The return address lives in t9 (every exit is "ret (t9)"), so
	   record t9 as the return column for the unwinder.  */
	cfi_return_column (t9)

	/* Co-aligned copy path: SRC and DST share the same offset within a
	   quadword, so source words can be stored without any shifting.
	   Within the code visible here this label is reached only from
	   __stxcpy, which has already loaded the first source word and
	   advanced a1 by 8.

	   On entry to this basic block:
	   t0 == the first destination word for masking back in (or 0 when
		 DST is quadword aligned; every byte of it is masked away
		 in that case)
	   t1 == the first source word.
	   a1 == SRC + 8; only its low three bits (the shared misalignment)
		 matter to the mask instructions in the first block.

	   The e0/e1 tags in the trailing comments appear to record the
	   EV5 integer pipe each instruction was scheduled for, with ".."
	   presumably marking an instruction issued together with the one
	   before it -- TODO confirm against the EV5 hardware reference.  */
	.align 3
stxcpy_aligned:
	/* Create the 1st output word and detect 0's in the 1st input word.
	   Bytes below the starting offset lie before the string: they are
	   forced to 0xff for the null test and are taken from the
	   destination word when the output word is assembled.  */
	lda	t2, -1		# e1    : build a mask against false zero
	mskqh	t2, a1, t2	# e0    :   detection in the src word
	mskqh	t1, a1, t3	# e0    : t3 = src bytes at or above the start
	ornot	t1, t2, t2	# .. e1 : t2 = src with leading bytes set to ff
	mskql	t0, a1, t0	# e0    : assemble the first output word
	cmpbge	zero, t2, t7	# .. e1 : bits set iff null found
	or	t0, t3, t1	# e0    : t1 = dst leading bytes | src bytes
	bne	t7, $a_eos	# .. e1 :

	/* On entry to this basic block:
	   t0 == the first destination word for masking back in
	   t1 == a source word not containing a null.

	   Note that t0 is not read inside the loop, and $a_eos reloads
	   the destination word before using t0 again.  Each iteration
	   stores the current word, then fetches and tests the next one;
	   a0 and a1 both advance by 8.  */
$a_loop:
	stq_u	t1, 0(a0)	# e0    : write the completed word
	addq	a0, 8, a0	# .. e1 :
	ldq_u	t1, 0(a1)	# e0    : fetch the next source word
	addq	a1, 8, a1	# .. e1 :
	cmpbge	zero, t1, t7	# e0 (stall)
	beq	t7, $a_loop	# .. e1 (zdb)

	/* Take care of the final (partial) word store.
	   On entry to this basic block we have:
	   t1 == the source word containing the null (when arriving
		 straight from stxcpy_aligned it already has the leading
		 destination bytes merged in)
	   t7 == the cmpbge mask that found it.

	   t8 is left holding only the lowest set bit of t7, i.e. the bit
	   for the first null byte, which is the last byte written.  */
$a_eos:
	negq	t7, t6		# e0    : find low bit set
	and	t7, t6, t8	# e1 (stall)

	/* For the sake of the cache, don't read a destination word
	   if we're not going to need it.  When the null is byte 7 (bit
	   0x80 of the mask) the whole word is overwritten.  */
	and	t8, 0x80, t6	# e0    :
	bne	t6, 1f		# .. e1 (zdb)

	/* We're doing a partial word store and so need to combine
	   our source and original destination words.  */
	ldq_u	t0, 0(a0)	# e0    :
	subq	t8, 1, t6	# .. e1 : t6 = mask of bytes below the null
	zapnot	t1, t6, t1	# e0    : clear src bytes >= null
	or	t8, t6, t7	# .. e1 : t7 = mask of bytes up to the null
	zap	t0, t7, t0	# e0    : clear dst bytes <= null
	or	t0, t1, t1	# e1    :

1:	stq_u	t1, 0(a0)	# e0    :
	ret	(t9)		# .. e1 :

	/* Entry point (a0 = DST, a1 = SRC, t9 = return address).
	   Dispatches to the shifting copy at $unaligned when SRC and DST
	   differ in their low three address bits; otherwise sets up t0/t1
	   and enters stxcpy_aligned.  */
	.align 3
__stxcpy:
	/* Are source and destination co-aligned?  */
	xor	a0, a1, t0	# e0    :
	unop			#       :
	and	t0, 7, t0	# e0    :
	bne	t0, $unaligned	# .. e1 :

	/* We are co-aligned; take care of a partial first word.
	   t0 doubles as the misalignment test and, when DST is aligned,
	   as the (zero) destination word handed to stxcpy_aligned.  */
	ldq_u	t1, 0(a1)	# e0    : load first src word
	and	a0, 7, t0	# .. e1 : take care not to load a word ...
	addq	a1, 8, a1		# e0    :
	beq	t0, stxcpy_aligned	# .. e1 : ... if we wont need it
	ldq_u	t0, 0(a0)	# e0    :
	br	stxcpy_aligned	# .. e1 :


/* The source and destination are not co-aligned.  Align the destination
   and cope.  We have to be very careful about not reading too much and
   causing a SEGV.  */

	.align 3
$u_head:
	/* We know just enough now to be able to assemble the first
	   full source word.  We can still find a zero at the end of it
	   that prevents us from outputting the whole thing.

	   On entry to this basic block:
	   t0 == the first dest word, for masking back in, if needed else 0
	   t1 == the low bits of the first source word
	   t6 == bytemask that is -1 in dest word bytes (the bytes below
		 the destination offset, which must be preserved; 0 when
		 DST is aligned)
	   a1 == SRC minus the destination misalignment, as set up at
		 $unaligned; its low three bits are the byte shift used
		 by the extql/extqh pairs.  */

	ldq_u	t2, 8(a1)	# e0    :
	addq	a1, 8, a1	# .. e1 :

	/* Assemble the first output word: shifted source bytes in the
	   positions at or above the destination offset, original
	   destination bytes below it.  */
	extql	t1, a1, t1	# e0    :
	extqh	t2, a1, t4	# e0    :
	mskql	t0, a0, t0	# e0    :
	or	t1, t4, t1	# .. e1 :
	mskqh	t1, a0, t1	# e0    :
	or	t0, t1, t1	# e1    :

	/* Test the output word for a null, with the preserved destination
	   bytes forced non-zero so they cannot produce a false match.  */
	or	t1, t6, t6	# e0    :
	cmpbge	zero, t6, t7	# .. e1 :
	lda	t6, -1		# e0    : for masking just below
	bne	t7, $u_final	# .. e1 :

	/* No null in the first output word.  Check whether the rest of
	   t2 (the bytes not yet consumed) holds the terminator.  */
	mskql	t6, a1, t6		# e0    : mask out the bits we have
	or	t6, t2, t2		# e1    :   already extracted before
	cmpbge	zero, t2, t7		# e0    :   testing eos
	bne	t7, $u_late_head_exit	# .. e1 (zdb)

	/* Finally, we've got all the stupid leading edge cases taken care
	   of and we can set up to enter the main loop.  */

	stq_u	t1, 0(a0)	# e0    : store first output word
	addq	a0, 8, a0	# .. e1 :
	extql	t2, a1, t0	# e0    : position ho-bits of lo word
	ldq_u	t2, 8(a1)	# .. e1 : read next high-order source word
	addq	a1, 8, a1	# e0    :
	cmpbge	zero, t2, t7	# .. e1 :
	nop			# e0    :
	bne	t7, $u_eos	# .. e1 :

	/* Unaligned copy main loop.  In order to avoid reading too much,
	   the loop is structured to detect zeros in aligned source words.
	   This has, unfortunately, effectively pulled half of a loop
	   iteration out into the head and half into the tail, but it does
	   prevent nastiness from accumulating in the very thing we want
	   to run as fast as possible.

	   On entry to this basic block:
	   t0 == the shifted high-order bits from the previous source word
	   t2 == the unshifted current source word

	   We further know that t2 does not contain a null terminator.

	   Each iteration completes and stores one destination word built
	   from t0 and the low part of t2, keeps the remainder of t2 in t0
	   for the next round, and loads the following source word into
	   t2.  a0 and a1 both advance by 8; the store uses -8(a0) because
	   a0 has already been bumped.  */

	.align 3
$u_loop:
	extqh	t2, a1, t1	# e0    : extract high bits for current word
	addq	a1, 8, a1	# .. e1 :
	extql	t2, a1, t3	# e0    : extract low bits for next time
	addq	a0, 8, a0	# .. e1 :
	or	t0, t1, t1	# e0    : current dst word now complete
	ldq_u	t2, 0(a1)	# .. e1 : load high word for next time
	stq_u	t1, -8(a0)	# e0    : save the current word
	mov	t3, t0		# .. e1 :
	cmpbge	zero, t2, t7	# e0    : test new word for eos
	beq	t7, $u_loop	# .. e1 :

	/* We've found a zero somewhere in the source word we just read.
	   If it resides in the lower half, we have one (probably partial)
	   word to write out, and if it resides in the upper half, we
	   have one full and one partial word left to write out.

	   On entry to this basic block:
	   t0 == the shifted high-order bits from the previous source word
	   t2 == the unshifted current source word.  */
$u_eos:
	extqh	t2, a1, t1	# e0    :
	or	t0, t1, t1	# e1    : first (partial) source word complete

	cmpbge	zero, t1, t7	# e0    : is the null in this first bit?
	bne	t7, $u_final	# .. e1 (zdb)

	/* Also entered from $u_head.  Here t1 is a complete output word
	   without a null: store it, then build the last word from the
	   bytes of t2 that have not been consumed yet.  */
$u_late_head_exit:
	stq_u	t1, 0(a0)	# e0    : the null was in the high-order bits
	addq	a0, 8, a0	# .. e1 :
	extql	t2, a1, t1	# e0    :
	cmpbge	zero, t1, t7	# .. e1 :

	/* Take care of a final (probably partial) result word.
	   On entry to this basic block:
	   t1 == assembled source word
	   t7 == cmpbge mask that found the null.

	   t8 is left holding only the lowest set bit of t7, i.e. the bit
	   for the first null byte, which is the last byte written.  */
$u_final:
	negq	t7, t6		# e0    : isolate low bit set
	and	t6, t7, t8	# e1    :

	/* A null in byte 7 (mask bit 0x80) means the whole word is
	   overwritten, so the destination word need not be read.  */
	and	t8, 0x80, t6	# e0    : avoid dest word load if we can
	bne	t6, 1f		# .. e1 (zdb)

	ldq_u	t0, 0(a0)	# e0    :
	subq	t8, 1, t6	# .. e1 : t6 = mask of bytes below the null
	or	t6, t8, t7	# e0    : t7 = mask of bytes up to the null
	zapnot	t1, t6, t1	# .. e1 : kill source bytes >= null
	zap	t0, t7, t0	# e0    : kill dest bytes <= null
	or	t0, t1, t1	# e1    :

1:	stq_u	t1, 0(a0)	# e0    :
	ret	(t9)		# .. e1 :

	/* Unaligned copy entry point.  Reached from __stxcpy when SRC and
	   DST have different offsets within a quadword.  Sets up t0, t1,
	   t6 and a1 as $u_head expects, and handles on its own the case
	   where the string ends inside the very first source word.  */
	.align 3
$unaligned:

	ldq_u	t1, 0(a1)	# e0    : load first source word

	and	a0, 7, t4	# .. e1 : find dest misalignment
	and	a1, 7, t5	# e0    : find src misalignment

	/* Conditionally load the first destination word and a bytemask
	   with 0xff indicating that the destination byte is sacrosanct.
	   Both stay 0 when DST is quadword aligned.  */

	mov	zero, t0	# .. e1 :
	mov	zero, t6	# e0    :
	beq	t4, 1f		# .. e1 :
	ldq_u	t0, 0(a0)	# e0    :
	lda	t6, -1		# .. e1 :
	mskql	t6, a0, t6	# e0    :
1:
	subq	a1, t4, a1	# .. e1 : sub dest misalignment from src addr

	/* If source misalignment is larger than dest misalignment, we need
	   extra startup checks to avoid SEGV.  In that case $u_head's
	   load from 8(a1) touches the quadword after the first source
	   word, which must not happen if the string already ends in the
	   first one.  */

	cmplt	t4, t5, t8	# e0    :
	beq	t8, $u_head	# .. e1 (zdb)

	lda	t2, -1		# e1    : mask out leading garbage in source
	mskqh	t2, t5, t2	# e0    :
	nop			# e0    :
	ornot	t1, t2, t3	# .. e1 :
	cmpbge	zero, t3, t7	# e0    : is there a zero?
	beq	t7, $u_head	# .. e1 (zdb)

	/* At this point we've found a zero in the first partial word of
	   the source.  We need to isolate the valid source data and mask
	   it into the original destination data.  (Incidentally, we know
	   that we'll need at least one byte of that original dest word.)

	   The destination word is (re)loaded unconditionally here because
	   t0 was left at 0 above when DST is aligned.  t2 still holds the
	   source validity mask (0xff in bytes at or above the source
	   offset); it and the data in t1 are trimmed to the bytes up to
	   the null and then shifted down together so that they line up
	   with the destination.  */

	ldq_u	t0, 0(a0)	# e0    :

	negq	t7, t6		# .. e1 : build bitmask of bytes <= zero
	and	t6, t7, t8	# e0    :
	and	a1, 7, t5	# .. e1 : t5 = byte shift from src to dst
	subq	t8, 1, t6	# e0    :
	or	t6, t8, t7	# e1    :
	srl	t8, t5, t8	# e0    : adjust final null return value

	zapnot	t2, t7, t2	# .. e1 : prepare source word; mirror changes
	and	t1, t2, t1	# e1    : to source validity mask
	extql	t2, a1, t2	# .. e0 :
	extql	t1, a1, t1	# e0    :

	andnot	t0, t2, t0	# .. e1 : zero place for source to reside
	or	t0, t1, t1	# e1    : and put it there
	stq_u	t1, 0(a0)	# .. e0 :
	ret	(t9)

	cfi_endproc
Commit	Line	Data
04277e02	1	/* Copyright (C) 1996-2019 Free Software Foundation, Inc.
cf182b58	2	Contributed by Richard Henderson (rth@tamu.edu)
db31c863	3	This file is part of the GNU C Library.
cf182b58	4
db31c863	5	The GNU C Library is free software; you can redistribute it and/or
3214b89b AJ	6	modify it under the terms of the GNU Lesser General Public
	7	License as published by the Free Software Foundation; either
	8	version 2.1 of the License, or (at your option) any later version.
cf182b58	9
db31c863 UD	10	The GNU C Library is distributed in the hope that it will be useful,
	11	but WITHOUT ANY WARRANTY; without even the implied warranty of
	12	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
3214b89b	13	Lesser General Public License for more details.
cf182b58	14
3214b89b	15	You should have received a copy of the GNU Lesser General Public
ab84e3ff PE	16	License along with the GNU C Library. If not, see
ab84e3ff PE	17	<http://www.gnu.org/licenses/>. */
cf182b58 UD	18
	19	/* Copy a null-terminated string from SRC to DST.
	20
	21	This is an internal routine used by strcpy, stpcpy, and strcat.
	22	As such, it uses special linkage conventions to make implementation
	23	of these public functions more efficient.
	24
	25	On input:
	26	t9 = return address
	27	a0 = DST
	28	a1 = SRC
	29
	30	On output:
	31	t8 = bitmask (with one bit set) indicating the last byte written
	32	a0 = unaligned address of the last word written
	33
	34	Furthermore, v0, a3-a5, t11, and t12 are untouched.
	35	*/
	36
05e86071	37	/* This is generally scheduled for the EV5, but should still be pretty
cf182b58 UD	38	good for the EV4 too. */
	39
	40	#include <sysdep.h>
	41
	42	.set noat
	43	.set noreorder
	44
	45	.text
b2afe910 RH	46	.type __stxcpy, @function
	47	.globl __stxcpy
	48	.usepv __stxcpy, no
cf182b58	49
b2afe910 RH	50	cfi_startproc
b2afe910 RH	51	cfi_return_column (t9)
cf182b58 UD	52
	53	/* On entry to this basic block:
	54	t0 == the first destination word for masking back in
	55	t1 == the first source word. */
b2afe910 RH	56	.align 3
b2afe910 RH	57	stxcpy_aligned:
cf182b58 UD	58	/* Create the 1st output word and detect 0's in the 1st input word. */
	59	lda t2, -1 # e1 : build a mask against false zero
	60	mskqh t2, a1, t2 # e0 : detection in the src word
	61	mskqh t1, a1, t3 # e0 :
05e86071	62	ornot t1, t2, t2 # .. e1 :
cf182b58 UD	63	mskql t0, a1, t0 # e0 : assemble the first output word
	64	cmpbge zero, t2, t7 # .. e1 : bits set iff null found
	65	or t0, t3, t1 # e0 :
	66	bne t7, $a_eos # .. e1 :
	67
	68	/* On entry to this basic block:
	69	t0 == the first destination word for masking back in
	70	t1 == a source word not containing a null. */
cf182b58 UD	71	$a_loop:
	72	stq_u t1, 0(a0) # e0 :
	73	addq a0, 8, a0 # .. e1 :
	74	ldq_u t1, 0(a1) # e0 :
	75	addq a1, 8, a1 # .. e1 :
	76	cmpbge zero, t1, t7 # e0 (stall)
	77	beq t7, $a_loop # .. e1 (zdb)
	78
	79	/* Take care of the final (partial) word store.
	80	On entry to this basic block we have:
	81	t1 == the source word containing the null
	82	t7 == the cmpbge mask that found it. */
	83	$a_eos:
	84	negq t7, t6 # e0 : find low bit set
	85	and t7, t6, t8 # e1 (stall)
	86
	87	/* For the sake of the cache, don't read a destination word
	88	if we're not going to need it. */
	89	and t8, 0x80, t6 # e0 :
	90	bne t6, 1f # .. e1 (zdb)
	91
	92	/* We're doing a partial word store and so need to combine
	93	our source and original destination words. */
	94	ldq_u t0, 0(a0) # e0 :
05e86071	95	subq t8, 1, t6 # .. e1 :
cf182b58	96	zapnot t1, t6, t1 # e0 : clear src bytes >= null
05e86071	97	or t8, t6, t7 # .. e1 :
cf182b58 UD	98	zap t0, t7, t0 # e0 : clear dst bytes <= null
	99	or t0, t1, t1 # e1 :
	100
	101	1: stq_u t1, 0(a0) # e0 :
	102	ret (t9) # .. e1 :
	103
cf182b58	104	.align 3
cf182b58	105	__stxcpy:
cf182b58 UD	106	/* Are source and destination co-aligned? */
	107	xor a0, a1, t0 # e0 :
	108	unop # :
	109	and t0, 7, t0 # e0 :
	110	bne t0, $unaligned # .. e1 :
	111
	112	/* We are co-aligned; take care of a partial first word. */
	113	ldq_u t1, 0(a1) # e0 : load first src word
	114	and a0, 7, t0 # .. e1 : take care not to load a word ...
	115	addq a1, 8, a1 # e0 :
	116	beq t0, stxcpy_aligned # .. e1 : ... if we wont need it
	117	ldq_u t0, 0(a0) # e0 :
	118	br stxcpy_aligned # .. e1 :
	119
	120
	121	/* The source and destination are not co-aligned. Align the destination
	122	and cope. We have to be very careful about not reading too much and
	123	causing a SEGV. */
	124
	125	.align 3
	126	$u_head:
	127	/* We know just enough now to be able to assemble the first
	128	full source word. We can still find a zero at the end of it
	129	that prevents us from outputting the whole thing.
	130
	131	On entry to this basic block:
	132	t0 == the first dest word, for masking back in, if needed else 0
	133	t1 == the low bits of the first source word
	134	t6 == bytemask that is -1 in dest word bytes */
	135
	136	ldq_u t2, 8(a1) # e0 :
	137	addq a1, 8, a1 # .. e1 :
	138
	139	extql t1, a1, t1 # e0 :
	140	extqh t2, a1, t4 # e0 :
	141	mskql t0, a0, t0 # e0 :
	142	or t1, t4, t1 # .. e1 :
	143	mskqh t1, a0, t1 # e0 :
	144	or t0, t1, t1 # e1 :
05e86071	145
cf182b58 UD	146	or t1, t6, t6 # e0 :
	147	cmpbge zero, t6, t7 # .. e1 :
	148	lda t6, -1 # e0 : for masking just below
	149	bne t7, $u_final # .. e1 :
	150
05e86071	151	mskql t6, a1, t6 # e0 : mask out the bits we have
cf182b58 UD	152	or t6, t2, t2 # e1 : already extracted before
	153	cmpbge zero, t2, t7 # e0 : testing eos
	154	bne t7, $u_late_head_exit # .. e1 (zdb)
	155
	156	/* Finally, we've got all the stupid leading edge cases taken care
	157	of and we can set up to enter the main loop. */
	158
	159	stq_u t1, 0(a0) # e0 : store first output word
	160	addq a0, 8, a0 # .. e1 :
	161	extql t2, a1, t0 # e0 : position ho-bits of lo word
	162	ldq_u t2, 8(a1) # .. e1 : read next high-order source word
	163	addq a1, 8, a1 # e0 :
	164	cmpbge zero, t2, t7 # .. e1 :
	165	nop # e0 :
	166	bne t7, $u_eos # .. e1 :
	167
	168	/* Unaligned copy main loop. In order to avoid reading too much,
	169	the loop is structured to detect zeros in aligned source words.
05e86071	170	This has, unfortunately, effectively pulled half of a loop
cf182b58 UD	171	iteration out into the head and half into the tail, but it does
	172	prevent nastiness from accumulating in the very thing we want
	173	to run as fast as possible.
	174
	175	On entry to this basic block:
	176	t0 == the shifted high-order bits from the previous source word
	177	t2 == the unshifted current source word
	178
	179	We further know that t2 does not contain a null terminator. */
	180
	181	.align 3
	182	$u_loop:
	183	extqh t2, a1, t1 # e0 : extract high bits for current word
	184	addq a1, 8, a1 # .. e1 :
	185	extql t2, a1, t3 # e0 : extract low bits for next time
	186	addq a0, 8, a0 # .. e1 :
	187	or t0, t1, t1 # e0 : current dst word now complete
	188	ldq_u t2, 0(a1) # .. e1 : load high word for next time
	189	stq_u t1, -8(a0) # e0 : save the current word
	190	mov t3, t0 # .. e1 :
	191	cmpbge zero, t2, t7 # e0 : test new word for eos
	192	beq t7, $u_loop # .. e1 :
	193
	194	/* We've found a zero somewhere in the source word we just read.
	195	If it resides in the lower half, we have one (probably partial)
05e86071	196	word to write out, and if it resides in the upper half, we
cf182b58 UD	197	have one full and one partial word left to write out.
	198
	199	On entry to this basic block:
	200	t0 == the shifted high-order bits from the previous source word
	201	t2 == the unshifted current source word. */
	202	$u_eos:
	203	extqh t2, a1, t1 # e0 :
	204	or t0, t1, t1 # e1 : first (partial) source word complete
	205
	206	cmpbge zero, t1, t7 # e0 : is the null in this first bit?
	207	bne t7, $u_final # .. e1 (zdb)
	208
	209	$u_late_head_exit:
	210	stq_u t1, 0(a0) # e0 : the null was in the high-order bits
	211	addq a0, 8, a0 # .. e1 :
	212	extql t2, a1, t1 # e0 :
	213	cmpbge zero, t1, t7 # .. e1 :
	214
	215	/* Take care of a final (probably partial) result word.
	216	On entry to this basic block:
	217	t1 == assembled source word
	218	t7 == cmpbge mask that found the null. */
	219	$u_final:
	220	negq t7, t6 # e0 : isolate low bit set
	221	and t6, t7, t8 # e1 :
	222
05e86071	223	and t8, 0x80, t6 # e0 : avoid dest word load if we can
cf182b58 UD	224	bne t6, 1f # .. e1 (zdb)
	225
	226	ldq_u t0, 0(a0) # e0 :
	227	subq t8, 1, t6 # .. e1 :
	228	or t6, t8, t7 # e0 :
	229	zapnot t1, t6, t1 # .. e1 : kill source bytes >= null
	230	zap t0, t7, t0 # e0 : kill dest bytes <= null
	231	or t0, t1, t1 # e1 :
	232
	233	1: stq_u t1, 0(a0) # e0 :
	234	ret (t9) # .. e1 :
	235
	236	/* Unaligned copy entry point. */
	237	.align 3
	238	$unaligned:
	239
	240	ldq_u t1, 0(a1) # e0 : load first source word
	241
	242	and a0, 7, t4 # .. e1 : find dest misalignment
	243	and a1, 7, t5 # e0 : find src misalignment
	244
05e86071	245	/* Conditionally load the first destination word and a bytemask
cf182b58 UD	246	with 0xff indicating that the destination byte is sacrosanct. */
	247
	248	mov zero, t0 # .. e1 :
	249	mov zero, t6 # e0 :
	250	beq t4, 1f # .. e1 :
	251	ldq_u t0, 0(a0) # e0 :
	252	lda t6, -1 # .. e1 :
	253	mskql t6, a0, t6 # e0 :
	254	1:
	255	subq a1, t4, a1 # .. e1 : sub dest misalignment from src addr
	256
	257	/* If source misalignment is larger than dest misalignment, we need
	258	extra startup checks to avoid SEGV. */
	259
	260	cmplt t4, t5, t8 # e0 :
	261	beq t8, $u_head # .. e1 (zdb)
	262
	263	lda t2, -1 # e1 : mask out leading garbage in source
	264	mskqh t2, t5, t2 # e0 :
	265	nop # e0 :
	266	ornot t1, t2, t3 # .. e1 :
	267	cmpbge zero, t3, t7 # e0 : is there a zero?
	268	beq t7, $u_head # .. e1 (zdb)
	269
	270	/* At this point we've found a zero in the first partial word of
	271	the source. We need to isolate the valid source data and mask
	272	it into the original destination data. (Incidentally, we know
	273	that we'll need at least one byte of that original dest word.) */
	274
	275	ldq_u t0, 0(a0) # e0 :
	276
	277	negq t7, t6 # .. e1 : build bitmask of bytes <= zero
	278	and t6, t7, t8 # e0 :
05e86071	279	and a1, 7, t5 # .. e1 :
cf182b58 UD	280	subq t8, 1, t6 # e0 :
cf182b58 UD	281	or t6, t8, t7 # e1 :
05e86071	282	srl t8, t5, t8 # e0 : adjust final null return value
cf182b58	283
05e86071	284	zapnot t2, t7, t2 # .. e1 : prepare source word; mirror changes
cf182b58	285	and t1, t2, t1 # e1 : to source validity mask
05e86071	286	extql t2, a1, t2 # .. e0 :
cf182b58 UD	287	extql t1, a1, t1 # e0 :
cf182b58 UD	288
db31c863	289	andnot t0, t2, t0 # .. e1 : zero place for source to reside
cf182b58	290	or t0, t1, t1 # e1 : and put it there
05e86071 UD	291	stq_u t1, 0(a0) # .. e0 :
05e86071 UD	292	ret (t9)
cf182b58	293
b2afe910	294	cfi_endproc