must order the store at [CTX+I] before the load of [CTX+SI+SJ].
Rev: src/nettle/sparc/arcfour-crypt.asm:1.6
C Define to YES, to enable the complex code to special case SRC
C and DST with compatible alignment.
-define(<WITH_ALIGN>, <NO>)
+define(<WITH_ALIGN>, <YES>)
C Registers
stb SI, [CTX + J]
add SI, SJ, SI
and SI, 0xff, SI
- ldub [CTX + SI], SI
stb SJ, [CTX + I]
+ ldub [CTX + SI], SI
xor TMP, SI, TMP
stb TMP, [DST]
bne $2
stb SI, [CTX + J]
add SI, SJ, SI
and SI, 0xff, SI
- ldub [CTX + SI], TMP
stb SJ, [CTX + I]
+ ldub [CTX + SI], TMP
add I, 1, I
and I, 0xff, I
stb SI, [CTX + J]
add SI, SJ, SI
and SI, 0xff, SI
+ stb SJ, [CTX + I]
ldub [CTX + SI], SI
sll TMP, 8, TMP
- stb SJ, [CTX + I]
or TMP, SI, TMP
add I, 1, I
stb SI, [CTX + J]
add SI, SJ, SI
and SI, 0xff, SI
+ stb SJ, [CTX + I]
ldub [CTX + SI], SI
sll TMP, 8, TMP
- stb SJ, [CTX + I]
or TMP, SI, TMP
add I, 1, I
stb SI, [CTX + J]
add SI, SJ, SI
and SI, 0xff, SI
+ stb SJ, [CTX + I]
ldub [CTX + SI], SI
sll TMP, 8, TMP
- stb SJ, [CTX + I]
or TMP, SI, TMP
xor WORD, TMP, WORD
st WORD, [DST]
C 3: Moved load of source byte
C 4: Better instruction scheduling
C 5: Special case SRC and DST with compatible alignment
+C 6: After bugfix (st [CTX + I] now done before ld [CTX+SI+SJ])
C MB/s cycles/byte Code size (bytes)
C 1: 6.6 12.4 132
C 3: 6.0 13.5 116
C 4: 6.5 12.4 116
C 5: 7.9 10.4 496
+C 6: 8.3 9.7 496