From 6715034002f2d7831b234c50a2a072320905cafe Mon Sep 17 00:00:00 2001 From: Andy Polyakov Date: Fri, 27 May 2011 13:32:34 +0000 Subject: [PATCH] PPC assembler pack: adhere closer to ABI specs, add PowerOpen traceback data. --- crypto/aes/asm/aes-ppc.pl | 42 +++++++--- crypto/bn/asm/ppc-mont.pl | 96 ++++++++++++---------- crypto/bn/asm/ppc.pl | 43 +++++++--- crypto/bn/asm/ppc64-mont.pl | 154 ++++++++++++++++++----------------- crypto/ppccpuid.pl | 20 +++++ crypto/sha/asm/sha1-ppc.pl | 83 ++++++++++--------- crypto/sha/asm/sha512-ppc.pl | 114 +++++++++++++------------- 7 files changed, 311 insertions(+), 241 deletions(-) diff --git a/crypto/aes/asm/aes-ppc.pl b/crypto/aes/asm/aes-ppc.pl index f82c5e1814..8cfd4232b8 100644 --- a/crypto/aes/asm/aes-ppc.pl +++ b/crypto/aes/asm/aes-ppc.pl @@ -18,7 +18,7 @@ # February 2010 # -# Rescheduling instructions to favour Power6 pipeline gives 10% +# Rescheduling instructions to favour Power6 pipeline gave 10% # performance improvement on the platfrom in question (and marginal # improvement even on others). It should be noted that Power6 fails # to process byte in 18 cycles, only in 23, because it fails to issue @@ -33,11 +33,13 @@ $flavour = shift; if ($flavour =~ /64/) { $SIZE_T =8; + $LRSAVE =2*$SIZE_T; $STU ="stdu"; $POP ="ld"; $PUSH ="std"; } elsif ($flavour =~ /32/) { $SIZE_T =4; + $LRSAVE =$SIZE_T; $STU ="stwu"; $POP ="lwz"; $PUSH ="stw"; @@ -116,15 +118,19 @@ LAES_Te: addi $Tbl0,$Tbl0,`128-8` mtlr r0 blr - .space `32-24` + .long 0 + .byte 0,12,0x14,0,0,0,0,0 + .space `64-9*4` LAES_Td: mflr r0 bcl 20,31,\$+4 mflr $Tbl0 ; vvvvvvvv "distance" between . and 1st data entry - addi $Tbl0,$Tbl0,`128-8-32+2048+256` + addi $Tbl0,$Tbl0,`128-64-8+2048+256` mtlr r0 blr - .space `128-32-24` + .long 0 + .byte 0,12,0x14,0,0,0,0,0 + .space `128-64-9*4` ___ &_data_word( 0xc66363a5, 0xf87c7c84, 0xee777799, 0xf67b7b8d, @@ -328,10 +334,9 @@ $code.=<<___; .globl .AES_encrypt .align 7 .AES_encrypt: - mflr r0 $STU $sp,-$FRAME($sp) + mflr r0 - $PUSH r0,`$FRAME-$SIZE_T*21`($sp) $PUSH $toc,`$FRAME-$SIZE_T*20`($sp) $PUSH r13,`$FRAME-$SIZE_T*19`($sp) $PUSH r14,`$FRAME-$SIZE_T*18`($sp) @@ -352,6 +357,7 @@ $code.=<<___; $PUSH r29,`$FRAME-$SIZE_T*3`($sp) $PUSH r30,`$FRAME-$SIZE_T*2`($sp) $PUSH r31,`$FRAME-$SIZE_T*1`($sp) + $PUSH r0,`$FRAME+$LRSAVE`($sp) lwz $s0,0($inp) lwz $s1,4($inp) @@ -364,7 +370,7 @@ $code.=<<___; stw $s2,8($out) stw $s3,12($out) - $POP r0,`$FRAME-$SIZE_T*21`($sp) + $POP r0,`$FRAME+$LRSAVE`($sp) $POP $toc,`$FRAME-$SIZE_T*20`($sp) $POP r13,`$FRAME-$SIZE_T*19`($sp) $POP r14,`$FRAME-$SIZE_T*18`($sp) @@ -388,6 +394,9 @@ $code.=<<___; mtlr r0 addi $sp,$sp,$FRAME blr + .long 0 + .byte 0,12,4,1,0x80,18,3,0 + .long 0 .align 5 Lppc_AES_encrypt: @@ -530,6 +539,8 @@ Lenc_loop: xor $s2,$s2,$t2 xor $s3,$s3,$t3 blr + .long 0 + .byte 0,12,0x14,0,0,0,0,0 .align 4 Lppc_AES_encrypt_compact: @@ -673,14 +684,15 @@ Lenc_compact_done: xor $s2,$s2,$t2 xor $s3,$s3,$t3 blr + .long 0 + .byte 0,12,0x14,0,0,0,0,0 .globl .AES_decrypt .align 7 .AES_decrypt: - mflr r0 $STU $sp,-$FRAME($sp) + mflr r0 - $PUSH r0,`$FRAME-$SIZE_T*21`($sp) $PUSH $toc,`$FRAME-$SIZE_T*20`($sp) $PUSH r13,`$FRAME-$SIZE_T*19`($sp) $PUSH r14,`$FRAME-$SIZE_T*18`($sp) @@ -701,6 +713,7 @@ Lenc_compact_done: $PUSH r29,`$FRAME-$SIZE_T*3`($sp) $PUSH r30,`$FRAME-$SIZE_T*2`($sp) $PUSH r31,`$FRAME-$SIZE_T*1`($sp) + $PUSH r0,`$FRAME+$LRSAVE`($sp) lwz $s0,0($inp) lwz $s1,4($inp) @@ -713,7 +726,7 @@ Lenc_compact_done: stw $s2,8($out) stw $s3,12($out) - $POP r0,`$FRAME-$SIZE_T*21`($sp) + $POP r0,`$FRAME+$LRSAVE`($sp) $POP $toc,`$FRAME-$SIZE_T*20`($sp) $POP r13,`$FRAME-$SIZE_T*19`($sp) $POP r14,`$FRAME-$SIZE_T*18`($sp) @@ -737,6 +750,9 @@ Lenc_compact_done: mtlr r0 addi $sp,$sp,$FRAME blr + .long 0 + .byte 0,12,4,1,0x80,18,3,0 + .long 0 .align 5 Lppc_AES_decrypt: @@ -879,6 +895,8 @@ Ldec_loop: xor $s2,$s2,$t2 xor $s3,$s3,$t3 blr + .long 0 + .byte 0,12,0x14,0,0,0,0,0 .align 4 Lppc_AES_decrypt_compact: @@ -1179,7 +1197,9 @@ Ldec_compact_done: xor $s2,$s2,$t2 xor $s3,$s3,$t3 blr -.long 0 + .long 0 + .byte 0,12,0x14,0,0,0,0,0 + .asciz "AES for PPC, CRYPTOGAMS by " .align 7 ___ diff --git a/crypto/bn/asm/ppc-mont.pl b/crypto/bn/asm/ppc-mont.pl index 9257b2cd71..f9b6992ccc 100644 --- a/crypto/bn/asm/ppc-mont.pl +++ b/crypto/bn/asm/ppc-mont.pl @@ -31,7 +31,6 @@ if ($flavour =~ /32/) { $BNSZ= $BITS/8; $SIZE_T=4; $RZONE= 224; - $FRAME= $SIZE_T*16; $LD= "lwz"; # load $LDU= "lwzu"; # load and update @@ -51,7 +50,6 @@ if ($flavour =~ /32/) { $BNSZ= $BITS/8; $SIZE_T=8; $RZONE= 288; - $FRAME= $SIZE_T*16; # same as above, but 64-bit mnemonics... $LD= "ld"; # load @@ -69,6 +67,9 @@ if ($flavour =~ /32/) { $POP= $LD; } else { die "nonsense $flavour"; } +$FRAME=8*$SIZE_T+$RZONE; +$LOCALS=8*$SIZE_T; + $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or ( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or @@ -89,18 +90,18 @@ $aj="r10"; $nj="r11"; $tj="r12"; # non-volatile registers -$i="r14"; -$j="r15"; -$tp="r16"; -$m0="r17"; -$m1="r18"; -$lo0="r19"; -$hi0="r20"; -$lo1="r21"; -$hi1="r22"; -$alo="r23"; -$ahi="r24"; -$nlo="r25"; +$i="r20"; +$j="r21"; +$tp="r22"; +$m0="r23"; +$m1="r24"; +$lo0="r25"; +$hi0="r26"; +$lo1="r27"; +$hi1="r28"; +$alo="r29"; +$ahi="r30"; +$nlo="r31"; # $nhi="r0"; @@ -123,32 +124,33 @@ ___ $code.=<<___; slwi $num,$num,`log($BNSZ)/log(2)` li $tj,-4096 - addi $ovf,$num,`$FRAME+$RZONE` + addi $ovf,$num,$FRAME subf $ovf,$ovf,$sp ; $sp-$ovf and $ovf,$ovf,$tj ; minimize TLB usage subf $ovf,$sp,$ovf ; $ovf-$sp + mr $tj,$sp srwi $num,$num,`log($BNSZ)/log(2)` $STUX $sp,$sp,$ovf - $PUSH r14,`4*$SIZE_T`($sp) - $PUSH r15,`5*$SIZE_T`($sp) - $PUSH r16,`6*$SIZE_T`($sp) - $PUSH r17,`7*$SIZE_T`($sp) - $PUSH r18,`8*$SIZE_T`($sp) - $PUSH r19,`9*$SIZE_T`($sp) - $PUSH r20,`10*$SIZE_T`($sp) - $PUSH r21,`11*$SIZE_T`($sp) - $PUSH r22,`12*$SIZE_T`($sp) - $PUSH r23,`13*$SIZE_T`($sp) - $PUSH r24,`14*$SIZE_T`($sp) - $PUSH r25,`15*$SIZE_T`($sp) + $PUSH r20,`-12*$SIZE_T`($tj) + $PUSH r21,`-11*$SIZE_T`($tj) + $PUSH r22,`-10*$SIZE_T`($tj) + $PUSH r23,`-9*$SIZE_T`($tj) + $PUSH r24,`-8*$SIZE_T`($tj) + $PUSH r25,`-7*$SIZE_T`($tj) + $PUSH r26,`-6*$SIZE_T`($tj) + $PUSH r27,`-5*$SIZE_T`($tj) + $PUSH r28,`-4*$SIZE_T`($tj) + $PUSH r29,`-3*$SIZE_T`($tj) + $PUSH r30,`-2*$SIZE_T`($tj) + $PUSH r31,`-1*$SIZE_T`($tj) $LD $n0,0($n0) ; pull n0[0] value addi $num,$num,-2 ; adjust $num for counter register $LD $m0,0($bp) ; m0=bp[0] $LD $aj,0($ap) ; ap[0] - addi $tp,$sp,$FRAME + addi $tp,$sp,$LOCALS $UMULL $lo0,$aj,$m0 ; ap[0]*bp[0] $UMULH $hi0,$aj,$m0 @@ -210,8 +212,8 @@ L1st: Louter: $LDX $m0,$bp,$i ; m0=bp[i] $LD $aj,0($ap) ; ap[0] - addi $tp,$sp,$FRAME - $LD $tj,$FRAME($sp) ; tp[0] + addi $tp,$sp,$LOCALS + $LD $tj,$LOCALS($sp); tp[0] $UMULL $lo0,$aj,$m0 ; ap[0]*bp[i] $UMULH $hi0,$aj,$m0 $LD $aj,$BNSZ($ap) ; ap[1] @@ -278,7 +280,7 @@ Linner: addi $num,$num,2 ; restore $num subfc $j,$j,$j ; j=0 and "clear" XER[CA] - addi $tp,$sp,$FRAME + addi $tp,$sp,$LOCALS mtctr $num .align 4 @@ -304,23 +306,27 @@ Lcopy: ; copy or in-place refresh addi $j,$j,$BNSZ bdnz- Lcopy - $POP r14,`4*$SIZE_T`($sp) - $POP r15,`5*$SIZE_T`($sp) - $POP r16,`6*$SIZE_T`($sp) - $POP r17,`7*$SIZE_T`($sp) - $POP r18,`8*$SIZE_T`($sp) - $POP r19,`9*$SIZE_T`($sp) - $POP r20,`10*$SIZE_T`($sp) - $POP r21,`11*$SIZE_T`($sp) - $POP r22,`12*$SIZE_T`($sp) - $POP r23,`13*$SIZE_T`($sp) - $POP r24,`14*$SIZE_T`($sp) - $POP r25,`15*$SIZE_T`($sp) - $POP $sp,0($sp) + $POP $tj,0($sp) li r3,1 + $POP r20,`-12*$SIZE_T`($tj) + $POP r21,`-11*$SIZE_T`($tj) + $POP r22,`-10*$SIZE_T`($tj) + $POP r23,`-9*$SIZE_T`($tj) + $POP r24,`-8*$SIZE_T`($tj) + $POP r25,`-7*$SIZE_T`($tj) + $POP r26,`-6*$SIZE_T`($tj) + $POP r27,`-5*$SIZE_T`($tj) + $POP r28,`-4*$SIZE_T`($tj) + $POP r29,`-3*$SIZE_T`($tj) + $POP r30,`-2*$SIZE_T`($tj) + $POP r31,`-1*$SIZE_T`($tj) + mr $sp,$tj blr .long 0 -.asciz "Montgomery Multiplication for PPC, CRYPTOGAMS by " + .byte 0,12,4,0,0x80,12,6,0 + .long 0 + +.asciz "Montgomery Multiplication for PPC, CRYPTOGAMS by " ___ $code =~ s/\`([^\`]*)\`/eval $1/gem; diff --git a/crypto/bn/asm/ppc.pl b/crypto/bn/asm/ppc.pl index 37c65d3511..aaf669a5b3 100644 --- a/crypto/bn/asm/ppc.pl +++ b/crypto/bn/asm/ppc.pl @@ -389,7 +389,9 @@ $data=<+-------------------------------+ # | saved sp | # +-------------------------------+ -# | | -# +-------------------------------+ -# | 10 saved gpr, r14-r23 | -# . . -# . . -# +12*size_t +-------------------------------+ -# | 12 saved fpr, f14-f25 | -# . . # . . -# +12*8 +-------------------------------+ -# | padding to 64 byte boundary | -# . . -# +X +-------------------------------+ +# +64 +-------------------------------+ # | 16 gpr<->fpr transfer zone | # . . # . . @@ -192,6 +179,16 @@ $T3a="f24"; $T3b="f25"; # . . # . . # +-------------------------------+ +# . . +# -12*size_t +-------------------------------+ +# | 10 saved gpr, r22-r31 | +# . . +# . . +# -12*8 +-------------------------------+ +# | 12 saved fpr, f20-f31 | +# . . +# . . +# +-------------------------------+ $code=<<___; .machine "any" @@ -215,30 +212,31 @@ $code=<<___; subf $tp,$tp,$sp ; $sp-$tp and $tp,$tp,$i ; minimize TLB usage subf $tp,$sp,$tp ; $tp-$sp + mr $i,$sp $STUX $sp,$sp,$tp ; alloca - $PUSH r14,`2*$SIZE_T`($sp) - $PUSH r15,`3*$SIZE_T`($sp) - $PUSH r16,`4*$SIZE_T`($sp) - $PUSH r17,`5*$SIZE_T`($sp) - $PUSH r18,`6*$SIZE_T`($sp) - $PUSH r19,`7*$SIZE_T`($sp) - $PUSH r20,`8*$SIZE_T`($sp) - $PUSH r21,`9*$SIZE_T`($sp) - $PUSH r22,`10*$SIZE_T`($sp) - $PUSH r23,`11*$SIZE_T`($sp) - stfd f14,`12*$SIZE_T+0`($sp) - stfd f15,`12*$SIZE_T+8`($sp) - stfd f16,`12*$SIZE_T+16`($sp) - stfd f17,`12*$SIZE_T+24`($sp) - stfd f18,`12*$SIZE_T+32`($sp) - stfd f19,`12*$SIZE_T+40`($sp) - stfd f20,`12*$SIZE_T+48`($sp) - stfd f21,`12*$SIZE_T+56`($sp) - stfd f22,`12*$SIZE_T+64`($sp) - stfd f23,`12*$SIZE_T+72`($sp) - stfd f24,`12*$SIZE_T+80`($sp) - stfd f25,`12*$SIZE_T+88`($sp) + $PUSH r22,`-12*8-10*$SIZE_T`($i) + $PUSH r23,`-12*8-9*$SIZE_T`($i) + $PUSH r24,`-12*8-8*$SIZE_T`($i) + $PUSH r25,`-12*8-7*$SIZE_T`($i) + $PUSH r26,`-12*8-6*$SIZE_T`($i) + $PUSH r27,`-12*8-5*$SIZE_T`($i) + $PUSH r28,`-12*8-4*$SIZE_T`($i) + $PUSH r29,`-12*8-3*$SIZE_T`($i) + $PUSH r30,`-12*8-2*$SIZE_T`($i) + $PUSH r31,`-12*8-1*$SIZE_T`($i) + stfd f20,`-12*8`($i) + stfd f21,`-11*8`($i) + stfd f22,`-10*8`($i) + stfd f23,`-9*8`($i) + stfd f24,`-8*8`($i) + stfd f25,`-7*8`($i) + stfd f26,`-6*8`($i) + stfd f27,`-5*8`($i) + stfd f28,`-4*8`($i) + stfd f29,`-3*8`($i) + stfd f30,`-2*8`($i) + stfd f31,`-1*8`($i) ___ $code.=<<___ if ($SIZE_T==8); ld $a0,0($ap) ; pull ap[0] value @@ -1052,33 +1050,37 @@ Lcopy: ; copy or in-place refresh ___ $code.=<<___; - $POP r14,`2*$SIZE_T`($sp) - $POP r15,`3*$SIZE_T`($sp) - $POP r16,`4*$SIZE_T`($sp) - $POP r17,`5*$SIZE_T`($sp) - $POP r18,`6*$SIZE_T`($sp) - $POP r19,`7*$SIZE_T`($sp) - $POP r20,`8*$SIZE_T`($sp) - $POP r21,`9*$SIZE_T`($sp) - $POP r22,`10*$SIZE_T`($sp) - $POP r23,`11*$SIZE_T`($sp) - lfd f14,`12*$SIZE_T+0`($sp) - lfd f15,`12*$SIZE_T+8`($sp) - lfd f16,`12*$SIZE_T+16`($sp) - lfd f17,`12*$SIZE_T+24`($sp) - lfd f18,`12*$SIZE_T+32`($sp) - lfd f19,`12*$SIZE_T+40`($sp) - lfd f20,`12*$SIZE_T+48`($sp) - lfd f21,`12*$SIZE_T+56`($sp) - lfd f22,`12*$SIZE_T+64`($sp) - lfd f23,`12*$SIZE_T+72`($sp) - lfd f24,`12*$SIZE_T+80`($sp) - lfd f25,`12*$SIZE_T+88`($sp) - $POP $sp,0($sp) + $POP $i,0($sp) li r3,1 ; signal "handled" + $POP r22,`-12*8-10*$SIZE_T`($i) + $POP r23,`-12*8-9*$SIZE_T`($i) + $POP r24,`-12*8-8*$SIZE_T`($i) + $POP r25,`-12*8-7*$SIZE_T`($i) + $POP r26,`-12*8-6*$SIZE_T`($i) + $POP r27,`-12*8-5*$SIZE_T`($i) + $POP r28,`-12*8-4*$SIZE_T`($i) + $POP r29,`-12*8-3*$SIZE_T`($i) + $POP r30,`-12*8-2*$SIZE_T`($i) + $POP r31,`-12*8-1*$SIZE_T`($i) + lfd f20,`-12*8`($i) + lfd f21,`-11*8`($i) + lfd f22,`-10*8`($i) + lfd f23,`-9*8`($i) + lfd f24,`-8*8`($i) + lfd f25,`-7*8`($i) + lfd f26,`-6*8`($i) + lfd f27,`-5*8`($i) + lfd f28,`-4*8`($i) + lfd f29,`-3*8`($i) + lfd f30,`-2*8`($i) + lfd f31,`-1*8`($i) + mr $sp,$i blr .long 0 -.asciz "Montgomery Multiplication for PPC64, CRYPTOGAMS by " + .byte 0,12,4,0,0x8c,10,6,0 + .long 0 + +.asciz "Montgomery Multiplication for PPC64, CRYPTOGAMS by " ___ $code =~ s/\`([^\`]*)\`/eval $1/gem; diff --git a/crypto/ppccpuid.pl b/crypto/ppccpuid.pl index d6220e747d..3bdfff39d8 100755 --- a/crypto/ppccpuid.pl +++ b/crypto/ppccpuid.pl @@ -29,12 +29,16 @@ $code=<<___; fcfid f1,f1 extrdi r0,r0,32,0 blr + .long 0 + .byte 0,12,0x14,0,0,0,0,0 .globl .OPENSSL_altivec_probe .align 4 .OPENSSL_altivec_probe: .long 0x10000484 # vor v0,v0,v0 blr + .long 0 + .byte 0,12,0x14,0,0,0,0,0 .globl .OPENSSL_wipe_cpu .align 4 @@ -65,6 +69,8 @@ $code=<<___; fmr f12,f31 fmr f13,f31 blr + .long 0 + .byte 0,12,0x14,0,0,0,0,0 .globl .OPENSSL_atomic_add .align 4 @@ -75,6 +81,9 @@ Ladd: lwarx r5,0,r3 bne- Ladd $SIGNX r3,r0 blr + .long 0 + .byte 0,12,0x14,0,0,0,2,0 + .long 0 .globl .OPENSSL_rdtsc .align 4 @@ -82,6 +91,8 @@ Ladd: lwarx r5,0,r3 mftb r3 mftbu r4 blr + .long 0 + .byte 0,12,0x14,0,0,0,0,0 .globl .OPENSSL_cleanse .align 4 @@ -111,6 +122,9 @@ Laligned: andi. r4,r4,3 bne Little blr + .long 0 + .byte 0,12,0x14,0,0,0,2,0 + .long 0 ___ { my ($out,$cnt,$max)=("r3","r4","r5"); @@ -145,6 +159,9 @@ Loop: mftb $tick mr r3,$cnt blr + .long 0 + .byte 0,12,0x14,0,0,0,2,0 + .long 0 .globl .OPENSSL_instrument_bus2 .align 4 @@ -193,6 +210,9 @@ Ldone2: srwi $cnt,$cnt,2 sub r3,r0,$cnt blr + .long 0 + .byte 0,12,0x14,0,0,0,3,0 + .long 0 ___ } diff --git a/crypto/sha/asm/sha1-ppc.pl b/crypto/sha/asm/sha1-ppc.pl index dcd0fcdfcf..2140dd2f8d 100755 --- a/crypto/sha/asm/sha1-ppc.pl +++ b/crypto/sha/asm/sha1-ppc.pl @@ -24,12 +24,14 @@ $flavour = shift; if ($flavour =~ /64/) { $SIZE_T =8; + $LRSAVE =2*$SIZE_T; $UCMP ="cmpld"; $STU ="stdu"; $POP ="ld"; $PUSH ="std"; } elsif ($flavour =~ /32/) { $SIZE_T =4; + $LRSAVE =$SIZE_T; $UCMP ="cmplw"; $STU ="stwu"; $POP ="lwz"; @@ -43,7 +45,8 @@ die "can't locate ppc-xlate.pl"; open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!"; -$FRAME=24*$SIZE_T; +$FRAME=24*$SIZE_T+64; +$LOCALS=6*$SIZE_T; $K ="r0"; $sp ="r1"; @@ -162,9 +165,8 @@ $code=<<___; .globl .sha1_block_data_order .align 4 .sha1_block_data_order: + $STU $sp,-$FRAME($sp) mflr r0 - $STU $sp,`-($FRAME+64)`($sp) - $PUSH r0,`$FRAME-$SIZE_T*18`($sp) $PUSH r15,`$FRAME-$SIZE_T*17`($sp) $PUSH r16,`$FRAME-$SIZE_T*16`($sp) $PUSH r17,`$FRAME-$SIZE_T*15`($sp) @@ -182,6 +184,7 @@ $code=<<___; $PUSH r29,`$FRAME-$SIZE_T*3`($sp) $PUSH r30,`$FRAME-$SIZE_T*2`($sp) $PUSH r31,`$FRAME-$SIZE_T*1`($sp) + $PUSH r0,`$FRAME+$LRSAVE`($sp) lwz $A,0($ctx) lwz $B,4($ctx) lwz $C,8($ctx) @@ -192,37 +195,14 @@ $code=<<___; Laligned: mtctr $num bl Lsha1_block_private -Ldone: - $POP r0,`$FRAME-$SIZE_T*18`($sp) - $POP r15,`$FRAME-$SIZE_T*17`($sp) - $POP r16,`$FRAME-$SIZE_T*16`($sp) - $POP r17,`$FRAME-$SIZE_T*15`($sp) - $POP r18,`$FRAME-$SIZE_T*14`($sp) - $POP r19,`$FRAME-$SIZE_T*13`($sp) - $POP r20,`$FRAME-$SIZE_T*12`($sp) - $POP r21,`$FRAME-$SIZE_T*11`($sp) - $POP r22,`$FRAME-$SIZE_T*10`($sp) - $POP r23,`$FRAME-$SIZE_T*9`($sp) - $POP r24,`$FRAME-$SIZE_T*8`($sp) - $POP r25,`$FRAME-$SIZE_T*7`($sp) - $POP r26,`$FRAME-$SIZE_T*6`($sp) - $POP r27,`$FRAME-$SIZE_T*5`($sp) - $POP r28,`$FRAME-$SIZE_T*4`($sp) - $POP r29,`$FRAME-$SIZE_T*3`($sp) - $POP r30,`$FRAME-$SIZE_T*2`($sp) - $POP r31,`$FRAME-$SIZE_T*1`($sp) - mtlr r0 - addi $sp,$sp,`$FRAME+64` - blr -___ + b Ldone -# PowerPC specification allows an implementation to be ill-behaved -# upon unaligned access which crosses page boundary. "Better safe -# than sorry" principle makes me treat it specially. But I don't -# look for particular offending word, but rather for 64-byte input -# block which crosses the boundary. Once found that block is aligned -# and hashed separately... -$code.=<<___; +; PowerPC specification allows an implementation to be ill-behaved +; upon unaligned access which crosses page boundary. "Better safe +; than sorry" principle makes me treat it specially. But I don't +; look for particular offending word, but rather for 64-byte input +; block which crosses the boundary. Once found that block is aligned +; and hashed separately... .align 4 Lunaligned: subfic $t1,$inp,4096 @@ -237,7 +217,7 @@ Lunaligned: Lcross_page: li $t1,16 mtctr $t1 - addi r20,$sp,$FRAME ; spot below the frame + addi r20,$sp,$LOCALS ; spot within the frame Lmemcpy: lbz r16,0($inp) lbz r17,1($inp) @@ -251,15 +231,40 @@ Lmemcpy: addi r20,r20,4 bdnz Lmemcpy - $PUSH $inp,`$FRAME-$SIZE_T*19`($sp) + $PUSH $inp,`$FRAME-$SIZE_T*18`($sp) li $t1,1 - addi $inp,$sp,$FRAME + addi $inp,$sp,$LOCALS mtctr $t1 bl Lsha1_block_private - $POP $inp,`$FRAME-$SIZE_T*19`($sp) + $POP $inp,`$FRAME-$SIZE_T*18`($sp) addic. $num,$num,-1 bne- Lunaligned - b Ldone + +Ldone: + $POP r0,`$FRAME+$LRSAVE`($sp) + $POP r15,`$FRAME-$SIZE_T*17`($sp) + $POP r16,`$FRAME-$SIZE_T*16`($sp) + $POP r17,`$FRAME-$SIZE_T*15`($sp) + $POP r18,`$FRAME-$SIZE_T*14`($sp) + $POP r19,`$FRAME-$SIZE_T*13`($sp) + $POP r20,`$FRAME-$SIZE_T*12`($sp) + $POP r21,`$FRAME-$SIZE_T*11`($sp) + $POP r22,`$FRAME-$SIZE_T*10`($sp) + $POP r23,`$FRAME-$SIZE_T*9`($sp) + $POP r24,`$FRAME-$SIZE_T*8`($sp) + $POP r25,`$FRAME-$SIZE_T*7`($sp) + $POP r26,`$FRAME-$SIZE_T*6`($sp) + $POP r27,`$FRAME-$SIZE_T*5`($sp) + $POP r28,`$FRAME-$SIZE_T*4`($sp) + $POP r29,`$FRAME-$SIZE_T*3`($sp) + $POP r30,`$FRAME-$SIZE_T*2`($sp) + $POP r31,`$FRAME-$SIZE_T*1`($sp) + mtlr r0 + addi $sp,$sp,$FRAME + blr + .long 0 + .byte 0,12,4,1,0x80,18,3,0 + .long 0 ___ # This is private block function, which uses tailored calling @@ -309,6 +314,8 @@ $code.=<<___; addi $inp,$inp,`16*4` bdnz- Lsha1_block_private blr + .long 0 + .byte 0,12,0x14,0,0,0,0,0 ___ $code.=<<___; .asciz "SHA1 block transform for PPC, CRYPTOGAMS by " diff --git a/crypto/sha/asm/sha512-ppc.pl b/crypto/sha/asm/sha512-ppc.pl index 768a6a6fad..6b44a68e59 100755 --- a/crypto/sha/asm/sha512-ppc.pl +++ b/crypto/sha/asm/sha512-ppc.pl @@ -40,6 +40,7 @@ $output =shift; if ($flavour =~ /64/) { $SIZE_T=8; + $LRSAVE=2*$SIZE_T; $STU="stdu"; $UCMP="cmpld"; $SHL="sldi"; @@ -47,6 +48,7 @@ if ($flavour =~ /64/) { $PUSH="std"; } elsif ($flavour =~ /32/) { $SIZE_T=4; + $LRSAVE=$SIZE_T; $STU="stwu"; $UCMP="cmplw"; $SHL="slwi"; @@ -87,7 +89,8 @@ if ($output =~ /512/) { $SHR="srwi"; } -$FRAME=32*$SIZE_T; +$FRAME=32*$SIZE_T+16*$SZ; +$LOCALS=6*$SIZE_T; $sp ="r1"; $toc="r2"; @@ -179,13 +182,12 @@ $code=<<___; .globl $func .align 6 $func: + $STU $sp,-$FRAME($sp) mflr r0 - $STU $sp,`-($FRAME+16*$SZ)`($sp) $SHL $num,$num,`log(16*$SZ)/log(2)` $PUSH $ctx,`$FRAME-$SIZE_T*22`($sp) - $PUSH r0,`$FRAME-$SIZE_T*21`($sp) $PUSH $toc,`$FRAME-$SIZE_T*20`($sp) $PUSH r13,`$FRAME-$SIZE_T*19`($sp) $PUSH r14,`$FRAME-$SIZE_T*18`($sp) @@ -206,6 +208,7 @@ $func: $PUSH r29,`$FRAME-$SIZE_T*3`($sp) $PUSH r30,`$FRAME-$SIZE_T*2`($sp) $PUSH r31,`$FRAME-$SIZE_T*1`($sp) + $PUSH r0,`$FRAME+$LRSAVE`($sp) $LD $A,`0*$SZ`($ctx) mr $inp,r4 ; incarnate $inp @@ -217,7 +220,7 @@ $func: $LD $G,`6*$SZ`($ctx) $LD $H,`7*$SZ`($ctx) - b LPICmeup + bl LPICmeup LPICedup: andi. r0,$inp,3 bne Lunaligned @@ -226,40 +229,14 @@ Laligned: $PUSH $num,`$FRAME-$SIZE_T*24`($sp) ; end pointer $PUSH $inp,`$FRAME-$SIZE_T*23`($sp) ; inp pointer bl Lsha2_block_private -Ldone: - $POP r0,`$FRAME-$SIZE_T*21`($sp) - $POP $toc,`$FRAME-$SIZE_T*20`($sp) - $POP r13,`$FRAME-$SIZE_T*19`($sp) - $POP r14,`$FRAME-$SIZE_T*18`($sp) - $POP r15,`$FRAME-$SIZE_T*17`($sp) - $POP r16,`$FRAME-$SIZE_T*16`($sp) - $POP r17,`$FRAME-$SIZE_T*15`($sp) - $POP r18,`$FRAME-$SIZE_T*14`($sp) - $POP r19,`$FRAME-$SIZE_T*13`($sp) - $POP r20,`$FRAME-$SIZE_T*12`($sp) - $POP r21,`$FRAME-$SIZE_T*11`($sp) - $POP r22,`$FRAME-$SIZE_T*10`($sp) - $POP r23,`$FRAME-$SIZE_T*9`($sp) - $POP r24,`$FRAME-$SIZE_T*8`($sp) - $POP r25,`$FRAME-$SIZE_T*7`($sp) - $POP r26,`$FRAME-$SIZE_T*6`($sp) - $POP r27,`$FRAME-$SIZE_T*5`($sp) - $POP r28,`$FRAME-$SIZE_T*4`($sp) - $POP r29,`$FRAME-$SIZE_T*3`($sp) - $POP r30,`$FRAME-$SIZE_T*2`($sp) - $POP r31,`$FRAME-$SIZE_T*1`($sp) - mtlr r0 - addi $sp,$sp,`$FRAME+16*$SZ` - blr -___ + b Ldone -# PowerPC specification allows an implementation to be ill-behaved -# upon unaligned access which crosses page boundary. "Better safe -# than sorry" principle makes me treat it specially. But I don't -# look for particular offending word, but rather for the input -# block which crosses the boundary. Once found that block is aligned -# and hashed separately... -$code.=<<___; +; PowerPC specification allows an implementation to be ill-behaved +; upon unaligned access which crosses page boundary. "Better safe +; than sorry" principle makes me treat it specially. But I don't +; look for particular offending word, but rather for the input +; block which crosses the boundary. Once found that block is aligned +; and hashed separately... .align 4 Lunaligned: subfic $t1,$inp,4096 @@ -278,7 +255,7 @@ Lunaligned: Lcross_page: li $t1,`16*$SZ/4` mtctr $t1 - addi r20,$sp,$FRAME ; aligned spot below the frame + addi r20,$sp,$LOCALS ; aligned spot below the frame Lmemcpy: lbz r16,0($inp) lbz r17,1($inp) @@ -293,8 +270,8 @@ Lmemcpy: bdnz Lmemcpy $PUSH $inp,`$FRAME-$SIZE_T*26`($sp) ; save real inp - addi $t1,$sp,`$FRAME+16*$SZ` ; fictitious end pointer - addi $inp,$sp,$FRAME ; fictitious inp pointer + addi $t1,$sp,`$LOCALS+16*$SZ` ; fictitious end pointer + addi $inp,$sp,$LOCALS ; fictitious inp pointer $PUSH $num,`$FRAME-$SIZE_T*25`($sp) ; save real num $PUSH $t1,`$FRAME-$SIZE_T*24`($sp) ; end pointer $PUSH $inp,`$FRAME-$SIZE_T*23`($sp) ; inp pointer @@ -303,10 +280,36 @@ Lmemcpy: $POP $num,`$FRAME-$SIZE_T*25`($sp) ; restore real num addic. $num,$num,`-16*$SZ` ; num-- bne- Lunaligned - b Ldone -___ -$code.=<<___; +Ldone: + $POP r0,`$FRAME+$LRSAVE`($sp) + $POP $toc,`$FRAME-$SIZE_T*20`($sp) + $POP r13,`$FRAME-$SIZE_T*19`($sp) + $POP r14,`$FRAME-$SIZE_T*18`($sp) + $POP r15,`$FRAME-$SIZE_T*17`($sp) + $POP r16,`$FRAME-$SIZE_T*16`($sp) + $POP r17,`$FRAME-$SIZE_T*15`($sp) + $POP r18,`$FRAME-$SIZE_T*14`($sp) + $POP r19,`$FRAME-$SIZE_T*13`($sp) + $POP r20,`$FRAME-$SIZE_T*12`($sp) + $POP r21,`$FRAME-$SIZE_T*11`($sp) + $POP r22,`$FRAME-$SIZE_T*10`($sp) + $POP r23,`$FRAME-$SIZE_T*9`($sp) + $POP r24,`$FRAME-$SIZE_T*8`($sp) + $POP r25,`$FRAME-$SIZE_T*7`($sp) + $POP r26,`$FRAME-$SIZE_T*6`($sp) + $POP r27,`$FRAME-$SIZE_T*5`($sp) + $POP r28,`$FRAME-$SIZE_T*4`($sp) + $POP r29,`$FRAME-$SIZE_T*3`($sp) + $POP r30,`$FRAME-$SIZE_T*2`($sp) + $POP r31,`$FRAME-$SIZE_T*1`($sp) + mtlr r0 + addi $sp,$sp,$FRAME + blr + .long 0 + .byte 0,12,4,1,0x80,18,3,0 + .long 0 + .align 4 Lsha2_block_private: ___ @@ -372,6 +375,8 @@ $code.=<<___; $ST $H,`7*$SZ`($ctx) bne Lsha2_block_private blr + .long 0 + .byte 0,12,0x14,0,0,0,0,0 ___ # Ugly hack here, because PPC assembler syntax seem to vary too @@ -379,22 +384,15 @@ ___ $code.=<<___; .align 6 LPICmeup: - bl LPIC - addi $Tbl,$Tbl,`64-4` ; "distance" between . and last nop - b LPICedup - nop - nop - nop - nop - nop -LPIC: mflr $Tbl + mflr r0 + bcl 20,31,\$+4 + mflr $Tbl ; vvvvvv "distance" between . and 1st data entry + addi $Tbl,$Tbl,`64-8` + mtlr r0 blr - nop - nop - nop - nop - nop - nop + .long 0 + .byte 0,12,0x14,0,0,0,0,0 + .space `64-9*4` ___ $code.=<<___ if ($SZ==8); .long 0x428a2f98,0xd728ae22,0x71374491,0x23ef65cd -- 2.39.5