C Some stats from adriana.lysator.liu.se (SS1000$, 85 MHz), for AES 128
-C A: nettle-1.13 C-code
-C B: nettle-1.13 assembler
-C C: New C-code
-C D: New assembler, first correct version
-C E: New assembler, with basic scheduling of AES_ROUND.
+C 1: nettle-1.13 C-code
+C 2: nettle-1.13 assembler
+C 3: New C-code
+C 4: New assembler, first correct version
+C 5: New assembler, with basic scheduling of AES_ROUND.
+C 6: New assembler, with loop invariants T0-T3.
+C 7: New assembler, with basic scheduling also of AES_FINAL_ROUND.
C MB/s cycles/block
-C A 1.2 1107
-C B 2.3 572
-C C 2.1 627
-C D 1.8 722
-C E 2.6 496
+C 1 1.2 1107
+C 2 2.3 572
+C 3 2.1 627
+C 4 1.8 722
+C 5 2.6 496
+C 6 3.0 437
+C 7 3.1 415
ld [$7 + eval(4*$1)], TMP3
and $3, 0xff, TMP1 C 0
- ldub [T + TMP1], TMP1 C 0
- nop
- xor TMP3, TMP1, TMP1 C 0
- stb TMP1, [$8 + eval(4*$1)] C 0
-
srl $4, 8, TMP2 C 1
+ ldub [T + TMP1], TMP1 C 0
and TMP2, 0xff, TMP2 C 1
+ xor TMP3, TMP1, TMP1 C 0
ldub [T + TMP2], TMP2 C 1
- srl TMP3, 8, TMP3 C 1
- xor TMP3, TMP2, TMP2 C 1
- stb TMP2, [$8 + eval(4*$1 + 1)] C 1
-
+ stb TMP1, [$8 + eval(4*$1)] C 0 E0
srl $5, 16, TMP1 C 2
+ srl TMP3, 8, TMP3 C 1
and TMP1, 0xff, TMP1 C 2
+ xor TMP3, TMP2, TMP2 C 1
ldub [T + TMP1], TMP1 C 2
- srl TMP3, 8, TMP3 C 2
- xor TMP3, TMP1, TMP1 C 2
- stb TMP1, [$8 + eval(4*$1 + 2)] C 2
-
+ stb TMP2, [$8 + eval(4*$1 + 1)] C 1 E1
srl $6, 24, TMP2 C 3
+ srl TMP3, 8, TMP3 C 2
ldub [T + TMP2], TMP2 C 3
+ xor TMP3, TMP1, TMP1 C 2
srl TMP3, 8, TMP3 C 3
+ stb TMP1, [$8 + eval(4*$1 + 2)] C 2 E2
xor TMP3, TMP2, TMP2 C 3
- stb TMP2, [$8 + eval(4*$1 + 3)] C 3
+ stb TMP2, [$8 + eval(4*$1 + 3)] C 3 E3
>)