git.ipfire.org Git - thirdparty/openssl.git/commitdiff
sha/asm/keccak1600-ppc64.pl: Load data in 8 byte chunks on little endian
author Anton Blanchard <anton@linux.ibm.com>
Tue, 12 Mar 2019 05:03:56 +0000 (16:03 +1100)
committer Tomas Mraz <tomas@openssl.org>
Thu, 11 Nov 2021 09:58:46 +0000 (10:58 +0100)
We currently load data byte by byte in order to byteswap it on big-endian
targets. On little-endian targets we can simply do 8-byte loads.

A SHAKE128 benchmark runs 10% faster on POWER9 with this patch applied.

Reviewed-by: Paul Dale <pauli@openssl.org>
Reviewed-by: Tomas Mraz <tomas@openssl.org>
(Merged from https://github.com/openssl/openssl/pull/8455)

crypto/sha/asm/keccak1600-ppc64.pl

index 83f8d8ef337beaf561b1175fe26c7c571c34f32b..bff0d785859e3ba9d91858a9b69ebab27b141473 100755 (executable)
@@ -51,6 +51,16 @@ if ($flavour =~ /64/) {
        $PUSH   ="std";
 } else { die "nonsense $flavour"; }
 
+$LITTLE_ENDIAN = ($flavour=~/le$/) ? 1 : 0;
+
+if ($LITTLE_ENDIAN) {
+       $DWORD_LE_LOAD = "ldu   r0,8(r3)";
+       $LE_LOAD_SIZE = "8";
+} else {
+       $DWORD_LE_LOAD = "bl    dword_le_load";
+       $LE_LOAD_SIZE = "1";
+}
+
 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
 ( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
 ( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
@@ -384,7 +394,9 @@ KeccakF1600:
        .byte   0,12,4,1,0x80,18,1,0
        .long   0
 .size  KeccakF1600,.-KeccakF1600
-
+___
+if (!$LITTLE_ENDIAN) {
+$code.=<<___;
 .type  dword_le_load,\@function
 .align 5
 dword_le_load:
@@ -408,7 +420,10 @@ dword_le_load:
        .byte   0,12,0x14,0,0,0,1,0
        .long   0
 .size  dword_le_load,.-dword_le_load
+___
+}
 
+$code.=<<___;
 .globl SHA3_absorb
 .type  SHA3_absorb,\@function
 .align 5
@@ -436,7 +451,7 @@ SHA3_absorb:
        $PUSH   r0,`$FRAME+$LRSAVE`($sp)
 
        bl      PICmeup
-       subi    r4,r4,1                         ; prepare for lbzu
+       subi    r4,r4,$LE_LOAD_SIZE             ; prepare for ldu or lbzu
        subi    r12,r12,8                       ; prepare for ldu
 
        $PUSH   r3,`$LOCALS+0*$SIZE_T`($sp)     ; save A[][]
@@ -487,79 +502,79 @@ SHA3_absorb:
        srwi    r5,r5,3
        $PUSH   r4,`$LOCALS+2*$SIZE_T`($sp)     ; save len
        mtctr   r5
-       bl      dword_le_load                   ; *inp++
+       $DWORD_LE_LOAD                          ; *inp++
        xor     $A[0][0],$A[0][0],r0
        bdz     .Lprocess_block
-       bl      dword_le_load                   ; *inp++
+       $DWORD_LE_LOAD                          ; *inp++
        xor     $A[0][1],$A[0][1],r0
        bdz     .Lprocess_block
-       bl      dword_le_load                   ; *inp++
+       $DWORD_LE_LOAD                          ; *inp++
        xor     $A[0][2],$A[0][2],r0
        bdz     .Lprocess_block
-       bl      dword_le_load                   ; *inp++
+       $DWORD_LE_LOAD                          ; *inp++
        xor     $A[0][3],$A[0][3],r0
        bdz     .Lprocess_block
-       bl      dword_le_load                   ; *inp++
+       $DWORD_LE_LOAD                          ; *inp++
        xor     $A[0][4],$A[0][4],r0
        bdz     .Lprocess_block
-       bl      dword_le_load                   ; *inp++
+       $DWORD_LE_LOAD                          ; *inp++
        xor     $A[1][0],$A[1][0],r0
        bdz     .Lprocess_block
-       bl      dword_le_load                   ; *inp++
+       $DWORD_LE_LOAD                          ; *inp++
        xor     $A[1][1],$A[1][1],r0
        bdz     .Lprocess_block
-       bl      dword_le_load                   ; *inp++
+       $DWORD_LE_LOAD                          ; *inp++
        xor     $A[1][2],$A[1][2],r0
        bdz     .Lprocess_block
-       bl      dword_le_load                   ; *inp++
+       $DWORD_LE_LOAD                          ; *inp++
        xor     $A[1][3],$A[1][3],r0
        bdz     .Lprocess_block
-       bl      dword_le_load                   ; *inp++
+       $DWORD_LE_LOAD                          ; *inp++
        xor     $A[1][4],$A[1][4],r0
        bdz     .Lprocess_block
-       bl      dword_le_load                   ; *inp++
+       $DWORD_LE_LOAD                          ; *inp++
        xor     $A[2][0],$A[2][0],r0
        bdz     .Lprocess_block
-       bl      dword_le_load                   ; *inp++
+       $DWORD_LE_LOAD                          ; *inp++
        xor     $A[2][1],$A[2][1],r0
        bdz     .Lprocess_block
-       bl      dword_le_load                   ; *inp++
+       $DWORD_LE_LOAD                          ; *inp++
        xor     $A[2][2],$A[2][2],r0
        bdz     .Lprocess_block
-       bl      dword_le_load                   ; *inp++
+       $DWORD_LE_LOAD                          ; *inp++
        xor     $A[2][3],$A[2][3],r0
        bdz     .Lprocess_block
-       bl      dword_le_load                   ; *inp++
+       $DWORD_LE_LOAD                          ; *inp++
        xor     $A[2][4],$A[2][4],r0
        bdz     .Lprocess_block
-       bl      dword_le_load                   ; *inp++
+       $DWORD_LE_LOAD                          ; *inp++
        xor     $A[3][0],$A[3][0],r0
        bdz     .Lprocess_block
-       bl      dword_le_load                   ; *inp++
+       $DWORD_LE_LOAD                          ; *inp++
        xor     $A[3][1],$A[3][1],r0
        bdz     .Lprocess_block
-       bl      dword_le_load                   ; *inp++
+       $DWORD_LE_LOAD                          ; *inp++
        xor     $A[3][2],$A[3][2],r0
        bdz     .Lprocess_block
-       bl      dword_le_load                   ; *inp++
+       $DWORD_LE_LOAD                          ; *inp++
        xor     $A[3][3],$A[3][3],r0
        bdz     .Lprocess_block
-       bl      dword_le_load                   ; *inp++
+       $DWORD_LE_LOAD                          ; *inp++
        xor     $A[3][4],$A[3][4],r0
        bdz     .Lprocess_block
-       bl      dword_le_load                   ; *inp++
+       $DWORD_LE_LOAD                          ; *inp++
        xor     $A[4][0],$A[4][0],r0
        bdz     .Lprocess_block
-       bl      dword_le_load                   ; *inp++
+       $DWORD_LE_LOAD                          ; *inp++
        xor     $A[4][1],$A[4][1],r0
        bdz     .Lprocess_block
-       bl      dword_le_load                   ; *inp++
+       $DWORD_LE_LOAD                          ; *inp++
        xor     $A[4][2],$A[4][2],r0
        bdz     .Lprocess_block
-       bl      dword_le_load                   ; *inp++
+       $DWORD_LE_LOAD                          ; *inp++
        xor     $A[4][3],$A[4][3],r0
        bdz     .Lprocess_block
-       bl      dword_le_load                   ; *inp++
+       $DWORD_LE_LOAD                          ; *inp++
        xor     $A[4][4],$A[4][4],r0
 
 .Lprocess_block: