From: Niels Möller <nisse@lysator.liu.se>
Date: Wed, 19 Oct 2005 07:46:54 +0000 (+0200)
Subject: * sparc/arcfour-crypt.asm: Special unrolled code if SRC and DST
X-Git-Tag: nettle_1.14_release_20051205~74
X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=1128d5a6195290466e157665cd113b05aa485cdd;p=thirdparty%2Fnettle.git

* sparc/arcfour-crypt.asm: Special unrolled code if SRC and DST
have compatible alignment. Improves performance by 20%, but I'm
not sure it's worth the extra complexity.

Rev: src/nettle/sparc/arcfour-crypt.asm:1.5
---

diff --git a/sparc/arcfour-crypt.asm b/sparc/arcfour-crypt.asm
index beadd91e..0dd9e363 100644
--- a/sparc/arcfour-crypt.asm
+++ b/sparc/arcfour-crypt.asm
@@ -18,7 +18,12 @@ C along with the nettle library; see the file COPYING.LIB.  If not, write to
 C the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
 C MA 02111-1307, USA.
 
-C Registers
+C	Define to YES, to enable the complex code to special case SRC
+C	and DST with compatible alignment.
+	
+define(<WITH_ALIGN>, <NO>)
+
+C	Registers
 
 define(<CTX>,	<%i0>)
 define(<LENGTH>,<%i1>)
@@ -30,9 +35,103 @@ define(<J>,	<%i5>)
 define(<SI>,	<%g1>)
 define(<SJ>,	<%g2>)
 define(<TMP>,	<%g3>)
+define(<N>,	<%o0>)
+define(<WORD>,	<%o1>)
 
-C	FIXME: Consider using the callers window
+C	Encrypts n bytes, one byte at a time.
+C	ARCFOUR_BYTE_LOOP(n, label)
+define(<ARCFOUR_BYTE_LOOP>, <
+$2:	
+	add	I, 1, I
+	and	I, 0xff, I
+	ldub	[CTX + I], SI
+	subcc	$1,1,$1
+	ldub	[SRC], TMP
+	add	J, SI, J
+	and	J, 0xff, J
+	ldub	[CTX + J], SJ
+	add	SRC, 1, SRC
+	stb	SI, [CTX + J]
+	add	SI, SJ, SI
+	and	SI, 0xff, SI
+	ldub	[CTX + SI], SI
+	stb	SJ, [CTX + I]
+	xor	TMP, SI, TMP
+	stb	TMP, [DST]
+	bne	$2
+	add	DST, 1, DST
+>)dnl
+
+C	Encrypts 4n bytes, four at a time. Requires proper alignment of
+C	SRC and DST.
+C	ARCFOUR_WORD_LOOP(n, label)
+define(<ARCFOUR_WORD_LOOP>, <
+$2:
+	add	I, 1, I
+	and	I, 0xff, I
+	ldub	[CTX + I], SI
+	ld	[SRC], WORD
+	add	J, SI, J
+	and	J, 0xff, J
+	ldub	[CTX + J], SJ
+	stb	SI, [CTX + J]
+	add	SI, SJ, SI
+	and	SI, 0xff, SI
+	ldub	[CTX + SI], TMP
+	stb	SJ, [CTX + I]
 
+	add	I, 1, I
+	and	I, 0xff, I
+	ldub	[CTX + I], SI
+	add	SRC, 4, SRC
+	add	J, SI, J
+	and	J, 0xff, J
+	ldub	[CTX + J], SJ
+	stb	SI, [CTX + J]
+	add	SI, SJ, SI
+	and	SI, 0xff, SI
+	ldub	[CTX + SI], SI
+	sll	TMP, 8, TMP
+	stb	SJ, [CTX + I]
+	or	TMP, SI, TMP
+	
+	add	I, 1, I
+	and	I, 0xff, I
+	ldub	[CTX + I], SI
+	subcc	$1, 1, $1
+	add	J, SI, J
+	and	J, 0xff, J
+	ldub	[CTX + J], SJ
+	stb	SI, [CTX + J]
+	add	SI, SJ, SI
+	and	SI, 0xff, SI
+	ldub	[CTX + SI], SI
+	sll	TMP, 8, TMP
+	stb	SJ, [CTX + I]
+	or	TMP, SI, TMP
+
+	add	I, 1, I
+	and	I, 0xff, I
+	ldub	[CTX + I], SI
+	C	empty slot
+	add	J, SI, J
+	and	J, 0xff, J
+	ldub	[CTX + J], SJ
+	stb	SI, [CTX + J]
+	add	SI, SJ, SI
+	and	SI, 0xff, SI
+	ldub	[CTX + SI], SI
+	sll	TMP, 8, TMP
+	stb	SJ, [CTX + I]
+	or	TMP, SI, TMP
+	xor	WORD, TMP, WORD
+	st	WORD, [DST]
+	
+	bne	$2
+	add	DST, 4, DST
+>)dnl
+		
+C	FIXME: Consider using the caller's window
 define(<FRAME_SIZE>, 104)
 
 	.file "arcfour-crypt.asm"
@@ -56,26 +155,44 @@ PROLOGUE(nettle_arcfour_crypt)
 	and	I, 0xff, J
 	srl	I, 8, I
 
-.Loop:	
-	add	I, 1, I
-	and	I, 0xff, I
-	ldub	[CTX + I], SI
-	subcc	LENGTH,1,LENGTH
-	ldub	[SRC], TMP
-	add	J, SI, J
-	and	J, 0xff, J
-	ldub	[CTX + J], SJ
-	add	SRC, 1, SRC
-	stb	SI, [CTX + J]
-	add	SI, SJ, SI
-	and	SI, 0xff, SI
-	ldub	[CTX + SI], SI
-	stb	SJ, [CTX + I]
-	xor	TMP, SI, TMP
-	stb	TMP, [DST]
-	bne	.Loop
-	add	DST, 1, DST
+ifelse(WITH_ALIGN, YES, <
+	C	Check if SRC and DST have compatible alignment
+	xor	SRC, DST, TMP
+	andcc	TMP, 3, TMP
+
+	bne	.Lrest
+	nop
+	
+	andcc	DST, 3, N
+	bz	.Laligned
+	nop
+	
+	sub	N, 4, N
+	neg	N
+	cmp	N, LENGTH
+	bgeu	.Lrest
+	nop
+	
+	sub	LENGTH, N, LENGTH
+	
+	ARCFOUR_BYTE_LOOP(N, .Lunalignedloop)
+
+.Laligned:
+	srl	LENGTH, 2, N
+	cmp	N, 0
+	be	.Lrest
+	nop
+	
+	ARCFOUR_WORD_LOOP(N, .Lalignedloop)
+
+	andcc	LENGTH, 3, LENGTH
+	bz	.Ldone
+	nop
+>)
+.Lrest:
+	ARCFOUR_BYTE_LOOP(LENGTH, .Loop)
 
+.Ldone:
 	C	Save back I and J	
 	sll	I, 8, I
 	or	I, J, I
@@ -93,9 +210,11 @@ C 1:	nettle-1.13 C-code
 C 2:	First working version of the assembler code
 C 3:	Moved load of source byte
 C 4:	Better instruction scheduling
+C 5:	Special case SRC and DST with compatible alignment
 
 C	MB/s	cycles/byte	Code size (bytes)
 C 1:	6.6	12.4		132
 C 2:	5.6	14.5		116
 C 3:	6.0	13.5		116
 C 4:	6.5	12.4		116
+C 5:	7.9	10.4		496