ornot t1, t2, t2 # E : (stall)
mskql t0, a1, t0 # U : assemble the first output word
- cmpbge zero, t2, t8 # E : bits set iff null found
+ cmpbge zero, t2, t10 # E : bits set iff null found
or t0, t3, t1 # E : (stall)
- bne t8, $a_eos # U : (stall)
+ bne t10, $a_eos # U : (stall)
/* On entry to this basic block:
t0 == the first destination word for masking back in
ldq_u t1, 0(a1) # L : Latency=3
addq a1, 8, a1 # E :
- cmpbge zero, t1, t8 # E : (3 cycle stall)
- beq t8, $a_loop # U : (stall for t8)
+ cmpbge zero, t1, t10 # E : (3 cycle stall)
+ beq t10, $a_loop # U : (stall for t10)
/* Take care of the final (partial) word store.
On entry to this basic block we have:
t1 == the source word containing the null
- t8 == the cmpbge mask that found it. */
+ t10 == the cmpbge mask that found it. */
$a_eos:
- negq t8, t6 # E : find low bit set
- and t8, t6, t10 # E : (stall)
+ negq t10, t6 # E : find low bit set
+ and t10, t6, t8 # E : (stall)
/* For the sake of the cache, don't read a destination word
if we're not going to need it. */
- and t10, 0x80, t6 # E : (stall)
+ and t8, 0x80, t6 # E : (stall)
bne t6, 1f # U : (stall)
/* We're doing a partial word store and so need to combine
our source and original destination words. */
ldq_u t0, 0(a0) # L : Latency=3
- subq t10, 1, t6 # E :
+ subq t8, 1, t6 # E :
zapnot t1, t6, t1 # U : clear src bytes >= null (stall)
- or t10, t6, t8 # E : (stall)
+ or t8, t6, t10 # E : (stall)
- zap t0, t8, t0 # E : clear dst bytes <= null
+ zap t0, t10, t0 # E : clear dst bytes <= null
or t0, t1, t1 # E : (stall)
nop
nop
or t0, t1, t1 # E : (stall on t1)
or t1, t6, t6 # E :
- cmpbge zero, t6, t8 # E : (stall)
+ cmpbge zero, t6, t10 # E : (stall)
lda t6, -1 # E : for masking just below
- bne t8, $u_final # U : (stall)
+ bne t10, $u_final # U : (stall)
mskql t6, a1, t6 # U : mask out the bits we have
or t6, t2, t2 # E : already extracted before (stall)
- cmpbge zero, t2, t8 # E : testing eos (stall)
- bne t8, $u_late_head_exit # U : (stall)
+ cmpbge zero, t2, t10 # E : testing eos (stall)
+ bne t10, $u_late_head_exit # U : (stall)
/* Finally, we've got all the stupid leading edge cases taken care
of and we can set up to enter the main loop. */
ldq_u t2, 8(a1) # U : read next high-order source word
addq a1, 8, a1 # E :
- cmpbge zero, t2, t8 # E : (stall for t2)
+ cmpbge zero, t2, t10 # E : (stall for t2)
nop # E :
- bne t8, $u_eos # U : (stall)
+ bne t10, $u_eos # U : (stall)
/* Unaligned copy main loop. In order to avoid reading too much,
the loop is structured to detect zeros in aligned source words.
stq_u t1, -8(a0) # L : save the current word (stall)
mov t3, t0 # E :
- cmpbge zero, t2, t8 # E : test new word for eos
- beq t8, $u_loop # U : (stall)
+ cmpbge zero, t2, t10 # E : test new word for eos
+ beq t10, $u_loop # U : (stall)
nop
nop
$u_eos:
extqh t2, a1, t1 # U :
or t0, t1, t1 # E : first (partial) source word complete (stall)
- cmpbge zero, t1, t8 # E : is the null in this first bit? (stall)
- bne t8, $u_final # U : (stall)
+ cmpbge zero, t1, t10 # E : is the null in this first bit? (stall)
+ bne t10, $u_final # U : (stall)
$u_late_head_exit:
stq_u t1, 0(a0) # L : the null was in the high-order bits
addq a0, 8, a0 # E :
extql t2, a1, t1 # U :
- cmpbge zero, t1, t8 # E : (stall)
+ cmpbge zero, t1, t10 # E : (stall)
/* Take care of a final (probably partial) result word.
On entry to this basic block:
t1 == assembled source word
- t8 == cmpbge mask that found the null. */
+ t10 == cmpbge mask that found the null. */
$u_final:
- negq t8, t6 # E : isolate low bit set
- and t6, t8, t10 # E : (stall)
- and t10, 0x80, t6 # E : avoid dest word load if we can (stall)
+ negq t10, t6 # E : isolate low bit set
+ and t6, t10, t8 # E : (stall)
+ and t8, 0x80, t6 # E : avoid dest word load if we can (stall)
bne t6, 1f # U : (stall)
ldq_u t0, 0(a0) # E :
- subq t10, 1, t6 # E :
- or t6, t10, t8 # E : (stall)
+ subq t8, 1, t6 # E :
+ or t6, t8, t10 # E : (stall)
zapnot t1, t6, t1 # U : kill source bytes >= null (stall)
- zap t0, t8, t0 # U : kill dest bytes <= null (2 cycle data stall)
+ zap t0, t10, t0 # U : kill dest bytes <= null (2 cycle data stall)
or t0, t1, t1 # E : (stall)
nop
nop
subq a1, t4, a1 # E : sub dest misalignment from src addr
/* If source misalignment is larger than dest misalignment, we need
extra startup checks to avoid SEGV. */
- cmplt t4, t5, t10 # E :
- beq t10, $u_head # U :
+ cmplt t4, t5, t8 # E :
+ beq t8, $u_head # U :
lda t2, -1 # E : mask out leading garbage in source
mskqh t2, t5, t2 # U :
ornot t1, t2, t3 # E : (stall)
- cmpbge zero, t3, t8 # E : is there a zero? (stall)
- beq t8, $u_head # U : (stall)
+ cmpbge zero, t3, t10 # E : is there a zero? (stall)
+ beq t10, $u_head # U : (stall)
/* At this point we've found a zero in the first partial word of
the source. We need to isolate the valid source data and mask
it into the original destination data. (Incidentally, we know
that we'll need at least one byte of that original dest word.) */
ldq_u t0, 0(a0) # L :
- negq t8, t6 # E : build bitmask of bytes <= zero
- and t6, t8, t10 # E : (stall)
+ negq t10, t6 # E : build bitmask of bytes <= zero
+ and t6, t10, t8 # E : (stall)
and a1, 7, t5 # E :
- subq t10, 1, t6 # E :
- or t6, t10, t8 # E : (stall)
- srl t10, t5, t10 # U : adjust final null return value
- zapnot t2, t8, t2 # U : prepare source word; mirror changes (stall)
+ subq t8, 1, t6 # E :
+ or t6, t8, t10 # E : (stall)
+ srl t8, t5, t8 # U : adjust final null return value
+ zapnot t2, t10, t2 # U : prepare source word; mirror changes (stall)
and t1, t2, t1 # E : to source validity mask
extql t2, a1, t2 # U :