git.ipfire.org Git - thirdparty/nettle.git/commitdiff
* sparc/machine.m4 (AES_FINAL_ROUND): Better scheduling, by
author Niels Möller <nisse@lysator.liu.se>
Sun, 16 Oct 2005 12:24:13 +0000 (14:24 +0200)
committer Niels Möller <nisse@lysator.liu.se>
Sun, 16 Oct 2005 12:24:13 +0000 (14:24 +0200)
interleaving independent operations.

Rev: src/nettle/sparc/aes-encrypt-internal.asm:1.10
Rev: src/nettle/sparc/machine.m4:1.11

sparc/aes-encrypt-internal.asm
sparc/machine.m4

index 09964b69260d4e77cbc2f688b75af40cd981019e..58beb4df41824a477af45ef53027e3738ea85432 100644 (file)
@@ -137,15 +137,19 @@ EPILOGUE(_nettle_aes_encrypt)
 
 C Some stats from adriana.lysator.liu.se (SS1000$, 85 MHz), for AES 128
 
-C A:   nettle-1.13 C-code
-C B:   nettle-1.13 assembler
-C C:   New C-code
-C D:   New assembler, first correct version
-C E:   New assembler, with basic scheduling of AES_ROUND.
+C 1:   nettle-1.13 C-code
+C 2:   nettle-1.13 assembler
+C 3:   New C-code
+C 4:   New assembler, first correct version
+C 5:   New assembler, with basic scheduling of AES_ROUND.
+C 6:   New assembler, with loop invariants T0-T3.
+C 7:   New assembler, with basic scheduling also of AES_FINAL_ROUND.
        
 C      MB/s    cycles/block
-C A    1.2     1107
-C B    2.3     572
-C C    2.1     627
-C D    1.8     722
-C E    2.6     496
+C 1    1.2     1107
+C 2    2.3     572
+C 3    2.1     627
+C 4    1.8     722
+C 5    2.6     496
+C 6    3.0     437
+C 7    3.1     415
index adc205e22e7e106b0176828a1428a73613eafc13..0817327aa96fc6817ea874f24b280d7261441f88 100644 (file)
@@ -66,29 +66,25 @@ define(<AES_FINAL_ROUND>, <
        ld      [$7 + eval(4*$1)], TMP3
 
        and     $3, 0xff, TMP1          C  0
-       ldub    [T + TMP1], TMP1        C  0
-       nop
-       xor     TMP3, TMP1, TMP1        C  0
-       stb     TMP1, [$8 + eval(4*$1)] C  0
-       
        srl     $4, 8, TMP2             C  1
+       ldub    [T + TMP1], TMP1        C  0
        and     TMP2, 0xff, TMP2        C  1
+       xor     TMP3, TMP1, TMP1        C  0
        ldub    [T + TMP2], TMP2        C  1
-       srl     TMP3, 8, TMP3           C  1
-       xor     TMP3, TMP2, TMP2        C  1
-       stb     TMP2, [$8 + eval(4*$1 + 1)]     C  1
-
+       stb     TMP1, [$8 + eval(4*$1)] C  0    E0
        srl     $5, 16, TMP1            C  2
+       srl     TMP3, 8, TMP3           C  1
        and     TMP1, 0xff, TMP1        C  2
+       xor     TMP3, TMP2, TMP2        C  1
        ldub    [T + TMP1], TMP1        C  2
-       srl     TMP3, 8, TMP3           C  2
-       xor     TMP3, TMP1, TMP1        C  2
-       stb     TMP1, [$8 + eval(4*$1 + 2)]     C  2
-
+       stb     TMP2, [$8 + eval(4*$1 + 1)]     C  1    E1
        srl     $6, 24, TMP2            C  3
+       srl     TMP3, 8, TMP3           C  2
        ldub    [T + TMP2], TMP2        C  3
+       xor     TMP3, TMP1, TMP1        C  2
        srl     TMP3, 8, TMP3           C  3
+       stb     TMP1, [$8 + eval(4*$1 + 2)]     C  2    E2
        xor     TMP3, TMP2, TMP2        C  3
-       stb     TMP2, [$8 + eval(4*$1 + 3)]     C  3
+       stb     TMP2, [$8 + eval(4*$1 + 3)]     C  3    E3
 >)