/* thirdparty/glibc.git: sysdeps/alpha/alphaev6/stxncpy.S  */

/* Copyright (C) 2000-2019 Free Software Foundation, Inc.
   Contributed by Richard Henderson (rth@tamu.edu)
   EV6 optimized by Rick Gorton <rick.gorton@alpha-processor.com>.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library.  If not, see
   <https://www.gnu.org/licenses/>.  */

/* Copy no more than COUNT bytes of the null-terminated string from
   SRC to DST.

   This is an internal routine used by strncpy, stpncpy, and strncat.
   As such, it uses special linkage conventions to make implementation
   of these public functions more efficient.

   On input:
	t9 = return address
	a0 = DST
	a1 = SRC
	a2 = COUNT

   Furthermore, COUNT may not be zero.

   On output:
	t0  = last word written
	t8  = bitmask (with one bit set) indicating the last byte written
	t10 = bitmask (with one bit set) indicating the byte position of
	      the end of the range specified by COUNT
	a0  = unaligned address of the last *word* written
	a2  = the number of full words left in COUNT

   Furthermore, v0, a3-a5, t11, and t12 are untouched.
*/

#include <sysdep.h>

	/* Assembler setup.  Target the EV6 instruction set, keep the
	   assembler from using the $at temporary in macro expansions, and
	   request that instructions stay in the order written (the code
	   below is scheduled by hand).

	   NOTE(review): the E / U / L / L0 tags in the trailing comments
	   throughout this file look like EV6 issue-cluster annotations;
	   confirm against the 21264 Compiler Writer's Guide.  */
	.arch ev6
	.set noat
	.set noreorder

	.text
	.type	__stxncpy, @function
	.globl	__stxncpy
	/* Declare that the routine does not use the procedure value
	   register (t12), consistent with the header comment's statement
	   that t12 is left untouched.  */
	.usepv	__stxncpy, no

	/* Unwind information: the return address is kept in t9, matching
	   the special linkage described in the header comment.  */
	cfi_startproc
	cfi_return_column (t9)

	/* Co-aligned copy.  SRC and DST have the same offset within an
	   aligned quadword, so source words can be stored unshifted.

	   On entry to this basic block:
	   t0 == the first destination word for masking back in
		 (or zero when DST is 8-byte aligned and nothing is kept)
	   t1 == the first source word
	   a1 == SRC already advanced by 8; its low three bits still hold
		 the starting byte offset used by the msk* instructions.  */
	.align 4
stxncpy_aligned:
	/* Create the 1st output word and detect 0's in the 1st input word.
	   Source bytes below the starting offset are not part of the
	   string: they are forced to 0xff for the null test and replaced
	   by the original destination bytes in the output word.  */
	lda	t2, -1		# E : build a mask against false zero
	mskqh	t2, a1, t2	# U :   detection in the src word (stall)
	mskqh	t1, a1, t3	# U : t3 = src bytes at/above the start offset
	ornot	t1, t2, t2	# E : t2 = src with leading bytes set to 0xff (stall)

	mskql	t0, a1, t0	# U : assemble the first output word
	cmpbge	zero, t2, t7	# E : bits set iff null found
	or	t0, t3, t0	# E : (stall)
	beq	a2, $a_eoc	# U : no full words left: COUNT ends in this word

	bne	t7, $a_eos	# U : string ends in this word
	nop
	nop
	nop

	/* Aligned copy main loop: store the pending word, then fetch and
	   test the next source word.

	   On entry to this basic block:
	   t0 == a source word not containing a null.
	   a2 == number of words left in COUNT, known to be nonzero.  */

	/*
	 * nops here to:
	 *	separate store quads from load quads
	 *	limit of 1 bcond/quad to permit training
	 */
$a_loop:
	stq_u	t0, 0(a0)	# L : write out the completed word
	addq	a0, 8, a0	# E :
	subq	a2, 1, a2	# E : one fewer word left in COUNT
	nop

	ldq_u	t0, 0(a1)	# L : fetch the next source word
	addq	a1, 8, a1	# E :
	cmpbge	zero, t0, t7	# E : t7 = bitmask of null bytes in t0
	beq	a2, $a_eoc      # U : COUNT ends within this word

	beq	t7, $a_loop	# U : no null yet: keep copying
	nop
	nop
	nop

	/* Take care of the final (partial) word store.  At this point
	   the end-of-count bit is set in t7 iff it applies.

	   On entry to this basic block we have:
	   t0 == the source word containing the null
	   t7 == the cmpbge mask that found it.

	   On return t8 holds the single bit selecting the last byte
	   written and t0 holds the word that was stored.  */
$a_eos:
	negq	t7, t8		# E : find low bit set
	and	t7, t8, t8	# E : t8 = t7 & -t7 (stall)
	/* For the sake of the cache, don't read a destination word
	   if we're not going to need it: when the last byte to write is
	   byte 7, the whole source word is stored unmodified.  */
	and	t8, 0x80, t6	# E : (stall)
	bne	t6, 1f		# U : (stall)

	/* We're doing a partial word store and so need to combine
	   our source and original destination words.  */
	ldq_u	t1, 0(a0)	# L :
	subq	t8, 1, t6	# E :
	or	t8, t6, t7	# E : t7 = bytes up to and including the last one (stall)
	zapnot	t0, t7, t0	# U : clear src bytes > null (stall)

	zap	t1, t7, t1	# U : clear dst bytes <= null
	or	t0, t1, t0	# E : (stall)
	nop
	nop

1:	stq_u	t0, 0(a0)	# L :
	ret	(t9)		# L0 : Latency=3
	nop
	nop

	/* Add the end-of-count bit to the eos detection bitmask.  Reached
	   when a2 (words left in COUNT) is zero, so the byte selected by
	   t10 is the last one that may be written even if no null was
	   found before it.  */
$a_eoc:
	or	t10, t7, t7	# E :
	br	$a_eos		# L0 : Latency=3
	nop
	nop

	/* Entry point.  Linkage is non-standard: a0 = DST, a1 = SRC,
	   a2 = COUNT (nonzero), return address in t9.  Computes the
	   end-of-count bookkeeping (a2 and t10) and then dispatches on
	   whether SRC and DST share the same offset within a quadword.  */
	.align 4
__stxncpy:
	/* Are source and destination co-aligned?  */
	lda	t2, -1		# E :
	xor	a0, a1, t1	# E :
	and	a0, 7, t0	# E : find dest misalignment
	nop			# E :

	srl	t2, 1, t2	# U : t2 = LONG_MAX
	and	t1, 7, t1	# E : nonzero iff the low 3 address bits differ
	cmovlt	a2, t2, a2	# E : bound count to LONG_MAX (stall)
	nop			# E :

	addq	a2, t0, a2	# E : bias count by dest misalignment
	subq	a2, 1, a2	# E : offset of last count byte from the first dest word (stall)
	and	a2, 7, t2	# E : byte position of that byte within its word (stall)
	lda	t10, 1		# E :

	srl	a2, 3, a2	# U : a2 = loop counter = (count + dest misalignment - 1)/8
	sll	t10, t2, t10	# U : t10 = bitmask of last count byte
	nop			# E :
	bne	t1, $unaligned	# U : (stall)

	/* We are co-aligned; take care of a partial first word.  */
	ldq_u	t1, 0(a1)	# L : load first src word
	addq	a1, 8, a1	# E :
	beq	t0, stxncpy_aligned # U : avoid loading dest word if not needed
	ldq_u	t0, 0(a0)	# L :

	br	stxncpy_aligned	# U :
	nop
	nop
	nop


/* The source and destination are not co-aligned.  Align the destination
   and cope.  We have to be very careful about not reading too much and
   causing a SEGV.  */

	.align 4
$u_head:
	/* We know just enough now to be able to assemble the first
	   full source word.  We can still find a zero at the end of it
	   that prevents us from outputting the whole thing.

	   On entry to this basic block:
	   t0 == the first dest word, unmasked
	   t1 == the shifted low bits of the first source word
	   t6 == bytemask that is -1 in dest word bytes
	   a1 == SRC minus the dest misalignment, so (a1 & 7) is the byte
		 shift that the ext* instructions apply between source
		 and destination words.  */

	ldq_u	t2, 8(a1)	# L : Latency=3 load second src word
	addq	a1, 8, a1	# E :
	mskql	t0, a0, t0	# U : mask trailing garbage in dst
	extqh	t2, a1, t4	# U : (3 cycle stall on t2)

	or	t1, t4, t1	# E : first aligned src word complete (stall)
	mskqh	t1, a0, t1	# U : mask leading garbage in src (stall)
	or	t0, t1, t0	# E : first output word complete (stall)
	or	t0, t6, t6	# E : mask original data for zero test (stall)

	cmpbge	zero, t6, t7	# E : t7 = nulls among the copied source bytes
	beq	a2, $u_eocfin	# U : COUNT ends in this first word
	lda	t6, -1		# E :
	nop

	bne	t7, $u_final	# U : string ends in this first word
	mskql	t6, a1, t6	# U : mask out bits already seen
	stq_u	t0, 0(a0)	# L : store first output word
	or      t6, t2, t2	# E : bytes already consumed can no longer test as null

	cmpbge	zero, t2, t7	# E : find nulls in second partial
	addq	a0, 8, a0	# E :
	subq	a2, 1, a2	# E :
	bne	t7, $u_late_head_exit	# U :

	/* Finally, we've got all the stupid leading edge cases taken care
	   of and we can set up to enter the main loop.  */
	extql	t2, a1, t1	# U : position hi-bits of lo word
	beq	a2, $u_eoc	# U : COUNT ends in the next output word
	ldq_u	t2, 8(a1)	# L : read next high-order source word
	addq	a1, 8, a1	# E :

	extqh	t2, a1, t0	# U : position lo-bits of hi word (stall)
	cmpbge	zero, t2, t7	# E : t7 = null bytes in the new source word
	nop
	bne	t7, $u_eos	# U :

	/* Unaligned copy main loop.  In order to avoid reading too much,
	   the loop is structured to detect zeros in aligned source words.
	   This has, unfortunately, effectively pulled half of a loop
	   iteration out into the head and half into the tail, but it does
	   prevent nastiness from accumulating in the very thing we want
	   to run as fast as possible.

	   Each output word is the OR of the high bytes of the previous
	   source word (shifted down) and the low bytes of the current
	   source word (shifted up).

	   On entry to this basic block:
	   t0 == the shifted low-order bits from the current source word
	   t1 == the shifted high-order bits from the previous source word
	   t2 == the unshifted current source word
	   a2 == number of words left in COUNT, known to be nonzero

	   We further know that t2 does not contain a null terminator.  */

	.align 4
$u_loop:
	or	t0, t1, t0	# E : current dst word now complete
	subq	a2, 1, a2	# E : decrement word count
	extql	t2, a1, t1	# U : extract high bits for next time
	addq	a0, 8, a0	# E :

	stq_u	t0, -8(a0)	# L : save the current word (a0 already advanced)
	beq	a2, $u_eoc	# U : COUNT ends in the next output word
	ldq_u	t2, 8(a1)	# L : Latency=3 load high word for next time
	addq	a1, 8, a1	# E :

	extqh	t2, a1, t0	# U : extract low bits (2 cycle stall)
	cmpbge	zero, t2, t7	# E : test new word for eos
	nop
	beq	t7, $u_loop	# U : no null in the new word: keep copying

	/* We've found a zero somewhere in the source word we just read.
	   If it resides in the lower half, we have one (probably partial)
	   word to write out, and if it resides in the upper half, we
	   have one full and one partial word left to write out.

	   On entry to this basic block:
	   t0 == the shifted low-order bits from the current source word
	   t1 == the shifted high-order bits from the previous source word
	   t2 == the unshifted current source word.

	   When the null is not in the word assembled here, that word is
	   stored in full and control falls through to $u_late_head_exit
	   to build the last word from the rest of t2.  */
$u_eos:
	or	t0, t1, t0	# E : first (partial) source word complete
	nop
	cmpbge	zero, t0, t7	# E : is the null in this first bit? (stall)
	bne	t7, $u_final	# U : (stall)

	stq_u	t0, 0(a0)	# L : the null was in the high-order bits
	addq	a0, 8, a0	# E :
	subq	a2, 1, a2	# E :
	nop

/* Build the last output word from the not-yet-written high bytes of t2,
   a source word in which a null has already been detected.  If no words
   remain in COUNT (a2 == 0), the end-of-count bit from t10 is merged
   into the mask as well.  Falls through to $u_final.  */
$u_late_head_exit:
	extql	t2, a1, t0	# U : t0 = leftover high bytes of t2, shifted down
	cmpbge	zero, t0, t7	# E : t7 = zero bytes in that word
	or	t7, t10, t6	# E : (stall)
	cmoveq	a2, t6, t7	# E : Latency=2, extra map slot (stall)

	/* Take care of a final (probably partial) result word.
	   On entry to this basic block:
	   t0 == assembled source word
	   t7 == cmpbge mask that found the null.

	   On return t8 holds the single bit selecting the last byte
	   written and t0 holds the word that was stored.  */
$u_final:
	negq	t7, t6		# E : isolate low bit set
	and	t6, t7, t8	# E : t8 = t7 & -t7 (stall)
	and	t8, 0x80, t6	# E : avoid dest word load if we can (stall)
	bne	t6, 1f		# U : last byte is byte 7: store the whole word (stall)

	ldq_u	t1, 0(a0)	# L : original dest word to merge with
	subq	t8, 1, t6	# E :
	or	t6, t8, t7	# E : t7 = bytes up to and including the last one (stall)
	zapnot	t0, t7, t0	# U : kill source bytes > null

	zap	t1, t7, t1	# U : kill dest bytes <= null
	or	t0, t1, t0	# E : (stall)
	nop
	nop

1:	stq_u	t0, 0(a0)	# L :
	ret	(t9)		# L0 : Latency=3

        /* Got to end-of-count before end of string.
           On entry to this basic block:
           t1 == the shifted high-order bits from the previous source word

           If the last byte allowed by COUNT is already among the bytes
           held in t1, the next source word is not read at all;
           otherwise it is loaded and its low bytes are merged in.  */
$u_eoc:
	and	a1, 7, t6	# E : t6 = byte shift between src and dst words
	sll	t10, t6, t6	# U : (stall)
	and	t6, 0xff, t6	# E : nonzero iff last count byte is already in t1 (stall)
	bne	t6, 1f		# U : (stall)

	ldq_u	t2, 8(a1)	# L : load final src word
	nop
	extqh	t2, a1, t0	# U : extract low bits for last word (stall)
	or	t1, t0, t1	# E : (stall)

1:	cmpbge	zero, t1, t7	# E : t7 = zero bytes in the final word
	mov	t1, t0		# t0 = final word, as $u_final expects

$u_eocfin:			# end-of-count, final word
	or	t10, t7, t7	# E : add the end-of-count bit to the mask
	br	$u_final	# L0 : Latency=3

	/* Unaligned copy entry point.  Reached from __stxncpy when the low
	   three bits of SRC and DST differ.  Sets up t0, t1, t6 and a1 for
	   $u_head, or finishes the whole copy here when the string (or
	   COUNT) ends inside the first source word.  */
	.align 4
$unaligned:

	ldq_u	t1, 0(a1)	# L : load first source word
	and	a0, 7, t4	# E : find dest misalignment
	and	a1, 7, t5	# E : find src misalignment
	/* Conditionally load the first destination word and a bytemask
	   with 0xff indicating that the destination byte is sacrosanct.  */
	mov	zero, t0	# E :

	mov	zero, t6	# E :
	beq	t4, 1f		# U : DST aligned: no dest bytes to preserve
	ldq_u	t0, 0(a0)	# L :
	lda	t6, -1		# E :

	mskql	t6, a0, t6	# U : t6 = 0xff in bytes below the dest offset
	nop
	nop
1:	subq	a1, t4, a1	# E : sub dest misalignment from src addr

	/* If source misalignment is larger than dest misalignment, we need
	   extra startup checks to avoid SEGV.  In that case the first
	   source word supplies fewer bytes than the first output word
	   holds, so the string or COUNT may end before the next source
	   word has to be touched.  */

	cmplt	t4, t5, t8	# E : t8 = (dest misalignment < src misalignment)
	extql	t1, a1, t1	# U : shift src into place
	lda	t2, -1		# E : for creating masks later
	beq	t8, $u_head	# U : (stall)

	mskqh	t2, t5, t2	# U : begin src byte validity mask
	cmpbge	zero, t1, t7	# E : is there a zero?
	extql	t2, a1, t2	# U : shift the validity mask like the source
	or	t7, t10, t5	# E : test for end-of-count too

	cmpbge	zero, t2, t3	# E : t3 = bytes holding no valid source data
	cmoveq	a2, t5, t7	# E : Latency=2, extra map slot
	nop			# E : keep with cmoveq
	andnot	t7, t3, t7	# E : ignore hits outside the valid bytes (stall)

	beq	t7, $u_head	# U : nothing ends here: take the normal path
	/* At this point we've found a zero in the first partial word of
	   the source.  We need to isolate the valid source data and mask
	   it into the original destination data.  (Incidentally, we know
	   that we'll need at least one byte of that original dest word.) */
	ldq_u	t0, 0(a0)	# L :
	negq	t7, t6		# E : build bitmask of bytes <= zero
	mskqh	t1, t4, t1	# U : drop src bytes below the dest offset

	and	t6, t7, t8	# E : t8 = lowest set bit = last byte to write
	subq	t8, 1, t6	# E : (stall)
	or	t6, t8, t7	# E : t7 = bytes up to and including the last one (stall)
	zapnot	t2, t7, t2	# U : trim validity mask to bytes <= last byte (stall)

	zapnot	t1, t7, t1	# U : trim the source word the same way
	andnot	t0, t2, t0	# E : zero place for source to reside
	or	t0, t1, t0	# E : and put it there (stall both t0, t1)
	stq_u	t0, 0(a0)	# L : (stall)

	ret	(t9)		# L0 : Latency=3

	cfi_endproc
Commit	Line	Data
04277e02	1	/* Copyright (C) 2000-2019 Free Software Foundation, Inc.
6e6bafa8 UD	2	Contributed by Richard Henderson (rth@tamu.edu)
6e6bafa8 UD	3	EV6 optimized by Rick Gorton <rick.gorton@alpha-processor.com>.
6e6bafa8 UD	4	This file is part of the GNU C Library.
	5
	6	The GNU C Library is free software; you can redistribute it and/or
3214b89b AJ	7	modify it under the terms of the GNU Lesser General Public
	8	License as published by the Free Software Foundation; either
	9	version 2.1 of the License, or (at your option) any later version.
6e6bafa8 UD	10
	11	The GNU C Library is distributed in the hope that it will be useful,
	12	but WITHOUT ANY WARRANTY; without even the implied warranty of
	13	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
3214b89b	14	Lesser General Public License for more details.
6e6bafa8	15
3214b89b	16	You should have received a copy of the GNU Lesser General Public
ab84e3ff	17	License along with the GNU C Library. If not, see
5a82c748	18	<https://www.gnu.org/licenses/>. */
6e6bafa8 UD	19
	20	/* Copy no more than COUNT bytes of the null-terminated string from
	21	SRC to DST.
	22
	23	This is an internal routine used by strncpy, stpncpy, and strncat.
	24	As such, it uses special linkage conventions to make implementation
	25	of these public functions more efficient.
	26
	27	On input:
	28	t9 = return address
	29	a0 = DST
	30	a1 = SRC
	31	a2 = COUNT
	32
	33	Furthermore, COUNT may not be zero.
	34
	35	On output:
	36	t0 = last word written
	37	t8 = bitmask (with one bit set) indicating the last byte written
	38	t10 = bitmask (with one bit set) indicating the byte position of
	39	the end of the range specified by COUNT
	40	a0 = unaligned address of the last word written
	41	a2 = the number of full words left in COUNT
	42
	43	Furthermore, v0, a3-a5, t11, and t12 are untouched.
	44	*/
	45
	46	#include <sysdep.h>
	47
	48	.arch ev6
	49	.set noat
	50	.set noreorder
	51
b2afe910 RH	52	.text
	53	.type __stxncpy, @function
	54	.globl __stxncpy
	55	.usepv __stxncpy, no
6e6bafa8	56
b2afe910 RH	57	cfi_startproc
b2afe910 RH	58	cfi_return_column (t9)
6e6bafa8 UD	59
	60	/* On entry to this basic block:
	61	t0 == the first destination word for masking back in
	62	t1 == the first source word. */
b2afe910 RH	63	.align 4
b2afe910 RH	64	stxncpy_aligned:
6e6bafa8 UD	65	/* Create the 1st output word and detect 0's in the 1st input word. */
	66	lda t2, -1 # E : build a mask against false zero
	67	mskqh t2, a1, t2 # U : detection in the src word (stall)
	68	mskqh t1, a1, t3 # U :
	69	ornot t1, t2, t2 # E : (stall)
	70
	71	mskql t0, a1, t0 # U : assemble the first output word
	72	cmpbge zero, t2, t7 # E : bits set iff null found
	73	or t0, t3, t0 # E : (stall)
	74	beq a2, $a_eoc # U :
	75
	76	bne t7, $a_eos # U :
	77	nop
	78	nop
	79	nop
	80
	81	/* On entry to this basic block:
	82	t0 == a source word not containing a null. */
	83
	84	/*
	85	* nops here to:
	86	* separate store quads from load quads
	87	* limit of 1 bcond/quad to permit training
	88	*/
	89	$a_loop:
	90	stq_u t0, 0(a0) # L :
	91	addq a0, 8, a0 # E :
	92	subq a2, 1, a2 # E :
	93	nop
	94
	95	ldq_u t0, 0(a1) # L :
	96	addq a1, 8, a1 # E :
	97	cmpbge zero, t0, t7 # E :
	98	beq a2, $a_eoc # U :
	99
	100	beq t7, $a_loop # U :
	101	nop
	102	nop
	103	nop
	104
	105	/* Take care of the final (partial) word store. At this point
	106	the end-of-count bit is set in t7 iff it applies.
	107
	108	On entry to this basic block we have:
	109	t0 == the source word containing the null
	110	t7 == the cmpbge mask that found it. */
6e6bafa8 UD	111	$a_eos:
	112	negq t7, t8 # E : find low bit set
	113	and t7, t8, t8 # E : (stall)
	114	/* For the sake of the cache, don't read a destination word
	115	if we're not going to need it. */
	116	and t8, 0x80, t6 # E : (stall)
	117	bne t6, 1f # U : (stall)
	118
	119	/* We're doing a partial word store and so need to combine
	120	our source and original destination words. */
	121	ldq_u t1, 0(a0) # L :
	122	subq t8, 1, t6 # E :
	123	or t8, t6, t7 # E : (stall)
	124	zapnot t0, t7, t0 # U : clear src bytes > null (stall)
	125
	126	zap t1, t7, t1 # .. e1 : clear dst bytes <= null
	127	or t0, t1, t0 # e1 : (stall)
	128	nop
	129	nop
	130
	131	1: stq_u t0, 0(a0) # L :
	132	ret (t9) # L0 : Latency=3
	133	nop
	134	nop
	135
	136	/* Add the end-of-count bit to the eos detection bitmask. */
	137	$a_eoc:
	138	or t10, t7, t7 # E :
	139	br $a_eos # L0 : Latency=3
	140	nop
	141	nop
	142
6e6bafa8	143	.align 4
6e6bafa8	144	__stxncpy:
6e6bafa8	145	/* Are source and destination co-aligned? */
8e2f4e97	146	lda t2, -1 # E :
6e6bafa8 UD	147	xor a0, a1, t1 # E :
6e6bafa8 UD	148	and a0, 7, t0 # E : find dest misalignment
8e2f4e97	149	nop # E :
6e6bafa8	150
8e2f4e97 RH	151	srl t2, 1, t2 # U :
	152	and t1, 7, t1 # E :
	153	cmovlt a2, t2, a2 # E : bound count to LONG_MAX (stall)
	154	nop # E :
	155
	156	addq a2, t0, a2 # E : bias count by dest misalignment
	157	subq a2, 1, a2 # E : (stall)
6e6bafa8	158	and a2, 7, t2 # E : (stall)
8e2f4e97	159	lda t10, 1 # E :
6e6bafa8	160
8e2f4e97	161	srl a2, 3, a2 # U : a2 = loop counter = (count - 1)/8
6e6bafa8	162	sll t10, t2, t10 # U : t10 = bitmask of last count byte
8e2f4e97 RH	163	nop # E :
8e2f4e97 RH	164	bne t1, $unaligned # U : (stall)
b2afe910	165
6e6bafa8 UD	166	/* We are co-aligned; take care of a partial first word. */
	167	ldq_u t1, 0(a1) # L : load first src word
	168	addq a1, 8, a1 # E :
b2afe910	169	beq t0, stxncpy_aligned # U : avoid loading dest word if not needed
6e6bafa8	170	ldq_u t0, 0(a0) # L :
6e6bafa8	171
b2afe910	172	br stxncpy_aligned # U :
6e6bafa8 UD	173	nop
	174	nop
	175	nop
	176
	177
	178
	179	/* The source and destination are not co-aligned. Align the destination
	180	and cope. We have to be very careful about not reading too much and
	181	causing a SEGV. */
	182
	183	.align 4
	184	$u_head:
	185	/* We know just enough now to be able to assemble the first
	186	full source word. We can still find a zero at the end of it
	187	that prevents us from outputting the whole thing.
	188
	189	On entry to this basic block:
	190	t0 == the first dest word, unmasked
	191	t1 == the shifted low bits of the first source word
	192	t6 == bytemask that is -1 in dest word bytes */
	193
	194	ldq_u t2, 8(a1) # L : Latency=3 load second src word
	195	addq a1, 8, a1 # E :
	196	mskql t0, a0, t0 # U : mask trailing garbage in dst
	197	extqh t2, a1, t4 # U : (3 cycle stall on t2)
	198
	199	or t1, t4, t1 # E : first aligned src word complete (stall)
	200	mskqh t1, a0, t1 # U : mask leading garbage in src (stall)
	201	or t0, t1, t0 # E : first output word complete (stall)
	202	or t0, t6, t6 # E : mask original data for zero test (stall)
	203
	204	cmpbge zero, t6, t7 # E :
	205	beq a2, $u_eocfin # U :
d78045b1	206	lda t6, -1 # E :
6e6bafa8 UD	207	nop
	208
	209	bne t7, $u_final # U :
d78045b1	210	mskql t6, a1, t6 # U : mask out bits already seen
6e6bafa8	211	stq_u t0, 0(a0) # L : store first output word
d78045b1	212	or t6, t2, t2 # E :
6e6bafa8	213
d78045b1 UD	214	cmpbge zero, t2, t7 # E : find nulls in second partial
	215	addq a0, 8, a0 # E :
	216	subq a2, 1, a2 # E :
6e6bafa8	217	bne t7, $u_late_head_exit # U :
d78045b1	218
6e6bafa8 UD	219	/* Finally, we've got all the stupid leading edge cases taken care
	220	of and we can set up to enter the main loop. */
	221	extql t2, a1, t1 # U : position hi-bits of lo word
d78045b1	222	beq a2, $u_eoc # U :
6e6bafa8 UD	223	ldq_u t2, 8(a1) # L : read next high-order source word
	224	addq a1, 8, a1 # E :
	225
d78045b1 UD	226	extqh t2, a1, t0 # U : position lo-bits of hi word (stall)
d78045b1 UD	227	cmpbge zero, t2, t7 # E :
b2afe910	228	nop
d78045b1	229	bne t7, $u_eos # U :
6e6bafa8 UD	230
	231	/* Unaligned copy main loop. In order to avoid reading too much,
	232	the loop is structured to detect zeros in aligned source words.
	233	This has, unfortunately, effectively pulled half of a loop
	234	iteration out into the head and half into the tail, but it does
	235	prevent nastiness from accumulating in the very thing we want
	236	to run as fast as possible.
	237
	238	On entry to this basic block:
d78045b1	239	t0 == the shifted low-order bits from the current source word
6e6bafa8 UD	240	t1 == the shifted high-order bits from the previous source word
	241	t2 == the unshifted current source word
	242
	243	We further know that t2 does not contain a null terminator. */
	244
	245	.align 4
	246	$u_loop:
d78045b1 UD	247	or t0, t1, t0 # E : current dst word now complete
	248	subq a2, 1, a2 # E : decrement word count
	249	extql t2, a1, t1 # U : extract high bits for next time
6e6bafa8 UD	250	addq a0, 8, a0 # E :
6e6bafa8 UD	251
d78045b1 UD	252	stq_u t0, -8(a0) # L : save the current word
	253	beq a2, $u_eoc # U :
	254	ldq_u t2, 8(a1) # L : Latency=3 load high word for next time
	255	addq a1, 8, a1 # E :
6e6bafa8	256
d78045b1 UD	257	extqh t2, a1, t0 # U : extract low bits (2 cycle stall)
d78045b1 UD	258	cmpbge zero, t2, t7 # E : test new word for eos
6e6bafa8	259	nop
6e6bafa8	260	beq t7, $u_loop # U :
6e6bafa8 UD	261
	262	/* We've found a zero somewhere in the source word we just read.
	263	If it resides in the lower half, we have one (probably partial)
	264	word to write out, and if it resides in the upper half, we
	265	have one full and one partial word left to write out.
	266
	267	On entry to this basic block:
d78045b1	268	t0 == the shifted low-order bits from the current source word
6e6bafa8 UD	269	t1 == the shifted high-order bits from the previous source word
	270	t2 == the unshifted current source word. */
	271	$u_eos:
d78045b1 UD	272	or t0, t1, t0 # E : first (partial) source word complete
d78045b1 UD	273	nop
6e6bafa8 UD	274	cmpbge zero, t0, t7 # E : is the null in this first bit? (stall)
	275	bne t7, $u_final # U : (stall)
	276
	277	stq_u t0, 0(a0) # L : the null was in the high-order bits
	278	addq a0, 8, a0 # E :
	279	subq a2, 1, a2 # E :
	280	nop
	281
	282	$u_late_head_exit:
	283	extql t2, a1, t0 # U :
	284	cmpbge zero, t0, t7 # E :
	285	or t7, t10, t6 # E : (stall)
	286	cmoveq a2, t6, t7 # E : Latency=2, extra map slot (stall)
	287
	288	/* Take care of a final (probably partial) result word.
	289	On entry to this basic block:
	290	t0 == assembled source word
	291	t7 == cmpbge mask that found the null. */
	292	$u_final:
	293	negq t7, t6 # E : isolate low bit set
	294	and t6, t7, t8 # E : (stall)
	295	and t8, 0x80, t6 # E : avoid dest word load if we can (stall)
	296	bne t6, 1f # U : (stall)
	297
	298	ldq_u t1, 0(a0) # L :
	299	subq t8, 1, t6 # E :
	300	or t6, t8, t7 # E : (stall)
	301	zapnot t0, t7, t0 # U : kill source bytes > null
	302
	303	zap t1, t7, t1 # U : kill dest bytes <= null
	304	or t0, t1, t0 # E : (stall)
	305	nop
	306	nop
	307
	308	1: stq_u t0, 0(a0) # L :
	309	ret (t9) # L0 : Latency=3
	310
b2afe910	311	/* Got to end-of-count before end of string.
d78045b1 UD	312	On entry to this basic block:
	313	t1 == the shifted high-order bits from the previous source word */
	314	$u_eoc:
	315	and a1, 7, t6 # E :
	316	sll t10, t6, t6 # U : (stall)
	317	and t6, 0xff, t6 # E : (stall)
	318	bne t6, 1f # U : (stall)
	319
	320	ldq_u t2, 8(a1) # L : load final src word
6e6bafa8	321	nop
b2afe910	322	extqh t2, a1, t0 # U : extract low bits for last word (stall)
d78045b1 UD	323	or t1, t0, t1 # E : (stall)
	324
	325	1: cmpbge zero, t1, t7 # E :
	326	mov t1, t0
6e6bafa8 UD	327
	328	$u_eocfin: # end-of-count, final word
	329	or t10, t7, t7 # E :
	330	br $u_final # L0 : Latency=3
6e6bafa8 UD	331
	332	/* Unaligned copy entry point. */
	333	.align 4
	334	$unaligned:
	335
	336	ldq_u t1, 0(a1) # L : load first source word
	337	and a0, 7, t4 # E : find dest misalignment
	338	and a1, 7, t5 # E : find src misalignment
	339	/* Conditionally load the first destination word and a bytemask
	340	with 0xff indicating that the destination byte is sacrosanct. */
	341	mov zero, t0 # E :
	342
	343	mov zero, t6 # E :
	344	beq t4, 1f # U :
	345	ldq_u t0, 0(a0) # L :
	346	lda t6, -1 # E :
	347
	348	mskql t6, a0, t6 # U :
	349	nop
	350	nop
d78045b1	351	1: subq a1, t4, a1 # E : sub dest misalignment from src addr
6e6bafa8 UD	352
	353	/* If source misalignment is larger than dest misalignment, we need
	354	extra startup checks to avoid SEGV. */
	355
	356	cmplt t4, t5, t8 # E :
	357	extql t1, a1, t1 # U : shift src into place
	358	lda t2, -1 # E : for creating masks later
	359	beq t8, $u_head # U : (stall)
	360
	361	mskqh t2, t5, t2 # U : begin src byte validity mask
	362	cmpbge zero, t1, t7 # E : is there a zero?
	363	extql t2, a1, t2 # U :
	364	or t7, t10, t5 # E : test for end-of-count too
	365
	366	cmpbge zero, t2, t3 # E :
	367	cmoveq a2, t5, t7 # E : Latency=2, extra map slot
	368	nop # E : keep with cmoveq
	369	andnot t7, t3, t7 # E : (stall)
	370
	371	beq t7, $u_head # U :
	372	/* At this point we've found a zero in the first partial word of
	373	the source. We need to isolate the valid source data and mask
	374	it into the original destination data. (Incidentally, we know
	375	that we'll need at least one byte of that original dest word.) */
	376	ldq_u t0, 0(a0) # L :
	377	negq t7, t6 # E : build bitmask of bytes <= zero
	378	mskqh t1, t4, t1 # U :
	379
	380	and t6, t7, t8 # E :
	381	subq t8, 1, t6 # E : (stall)
	382	or t6, t8, t7 # E : (stall)
	383	zapnot t2, t7, t2 # U : prepare source word; mirror changes (stall)
	384
	385	zapnot t1, t7, t1 # U : to source validity mask
	386	andnot t0, t2, t0 # E : zero place for source to reside
	387	or t0, t1, t0 # E : and put it there (stall both t0, t1)
	388	stq_u t0, 0(a0) # L : (stall)
	389
	390	ret (t9) # L0 : Latency=3
6e6bafa8	391
b2afe910	392	cfi_endproc