From: Niels Möller Date: Wed, 19 Oct 2005 07:46:54 +0000 (+0200) Subject: * sparc/arcfour-crypt.asm: Special unrolled code if SRC and DST X-Git-Tag: nettle_1.14_release_20051205~74 X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=1128d5a6195290466e157665cd113b05aa485cdd;p=thirdparty%2Fnettle.git * sparc/arcfour-crypt.asm: Special unrolled code if SRC and DST have compatible alignment. Improves performance by 20%, but I'm not sure it's worth the extra complexity. Rev: src/nettle/sparc/arcfour-crypt.asm:1.5 --- diff --git a/sparc/arcfour-crypt.asm b/sparc/arcfour-crypt.asm index beadd91e..0dd9e363 100644 --- a/sparc/arcfour-crypt.asm +++ b/sparc/arcfour-crypt.asm @@ -18,7 +18,12 @@ C along with the nettle library; see the file COPYING.LIB. If not, write to C the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, C MA 02111-1307, USA. -C Registers +C Define to YES, to enable the complex code to special case SRC +C and DST with compatible alignment. + +define(, ) + +C Registers define(, <%i0>) define(,<%i1>) @@ -30,9 +35,103 @@ define(, <%i5>) define(, <%g1>) define(, <%g2>) define(, <%g3>) +define(, <%o0>) +define(, <%o1>) -C FIXME: Consider using the callers window +C Encrypts n bytes, one byte at a time. +C ARCFOUR_BYTE_LOOP(n, label) +define(, < +$2: + add I, 1, I + and I, 0xff, I + ldub [CTX + I], SI + subcc $1,1,$1 + ldub [SRC], TMP + add J, SI, J + and J, 0xff, J + ldub [CTX + J], SJ + add SRC, 1, SRC + stb SI, [CTX + J] + add SI, SJ, SI + and SI, 0xff, SI + ldub [CTX + SI], SI + stb SJ, [CTX + I] + xor TMP, SI, TMP + stb TMP, [DST] + bne $2 + add DST, 1, DST +>)dnl + +C Encrypts 4n bytes, four at a time. Requires proper alignment of +C SRC and DST. 
+C ARCFOUR_WORD_LOOP(n, label) +define(, < +$2: + add I, 1, I + and I, 0xff, I + ldub [CTX + I], SI + ld [SRC], WORD + add J, SI, J + and J, 0xff, J + ldub [CTX + J], SJ + stb SI, [CTX + J] + add SI, SJ, SI + and SI, 0xff, SI + ldub [CTX + SI], TMP + stb SJ, [CTX + I] + add I, 1, I + and I, 0xff, I + ldub [CTX + I], SI + add SRC, 4, SRC + add J, SI, J + and J, 0xff, J + ldub [CTX + J], SJ + stb SI, [CTX + J] + add SI, SJ, SI + and SI, 0xff, SI + ldub [CTX + SI], SI + sll TMP, 8, TMP + stb SJ, [CTX + I] + or TMP, SI, TMP + + add I, 1, I + and I, 0xff, I + ldub [CTX + I], SI + subcc $1, 1, $1 + add J, SI, J + and J, 0xff, J + ldub [CTX + J], SJ + stb SI, [CTX + J] + add SI, SJ, SI + and SI, 0xff, SI + ldub [CTX + SI], SI + sll TMP, 8, TMP + stb SJ, [CTX + I] + or TMP, SI, TMP + + add I, 1, I + and I, 0xff, I + ldub [CTX + I], SI + C empty slot + add J, SI, J + and J, 0xff, J + ldub [CTX + J], SJ + stb SI, [CTX + J] + add SI, SJ, SI + and SI, 0xff, SI + ldub [CTX + SI], SI + sll TMP, 8, TMP + stb SJ, [CTX + I] + or TMP, SI, TMP + xor WORD, TMP, WORD + st WORD, [DST] + + bne $2 + add DST, 4, DST +>)dnl + +C FIXME: Consider using the callers window define(, 104) .file "arcfour-crypt.asm" @@ -56,26 +155,44 @@ PROLOGUE(nettle_arcfour_crypt) and I, 0xff, J srl I, 8, I -.Loop: - add I, 1, I - and I, 0xff, I - ldub [CTX + I], SI - subcc LENGTH,1,LENGTH - ldub [SRC], TMP - add J, SI, J - and J, 0xff, J - ldub [CTX + J], SJ - add SRC, 1, SRC - stb SI, [CTX + J] - add SI, SJ, SI - and SI, 0xff, SI - ldub [CTX + SI], SI - stb SJ, [CTX + I] - xor TMP, SI, TMP - stb TMP, [DST] - bne .Loop - add DST, 1, DST +ifelse(WITH_ALIGN, YES, < + C Check if SRC and DST have compatible alignment + xor SRC, DST, TMP + andcc TMP, 3, TMP + + bne .Lrest + nop + + andcc DST, 3, N + bz .Laligned + nop + + sub N, 4, N + neg N + cmp N, LENGTH + bgeu .Lrest + nop + + sub LENGTH, N, LENGTH + + ARCFOUR_BYTE_LOOP(N, .Lunalignedloop) + +.Laligned: + srl LENGTH, 2, N + cmp N, 0 + be .Lrest + nop + + 
ARCFOUR_WORD_LOOP(N, .Lalignedloop) + + andcc LENGTH, 3, LENGTH + bz .Ldone + nop +>) +.Lrest: + ARCFOUR_BYTE_LOOP(LENGTH, .Loop) +.Ldone: C Save back I and J sll I, 8, I or I, J, I @@ -93,9 +210,11 @@ C 1: nettle-1.13 C-code C 2: First working version of the assembler code C 3: Moved load of source byte C 4: Better instruction scheduling +C 5: Special case SRC and DST with compatible alignment C MB/s cycles/byte Code size (bytes) C 1: 6.6 12.4 132 C 2: 5.6 14.5 116 C 3: 6.0 13.5 116 C 4: 6.5 12.4 116 +C 5: 7.9 10.4 496