]>
git.ipfire.org Git - thirdparty/openssl.git/blob - crypto/sha/asm/sha512-sparcv9.pl
3 # ====================================================================
4 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
9 # Hardware SPARC T4 support by David S. Miller <davem@davemloft.net>.
10 # ====================================================================
12 # SHA256 performance improvement over compiler generated code varies
13 # from 40% for Sun C [32-bit build] to 70% for gcc [3.3, 64-bit
14 # build]. Just like in SHA1 module I aim to ensure scalability on
15 # UltraSPARC T1 by packing X[16] to 8 64-bit registers.
17 # SHA512 on pre-T1 UltraSPARC.
19 # Performance is >75% better than 64-bit code generated by Sun C and
20 # over 2x than 32-bit code. X[16] resides on stack, but access to it
21 # is scheduled for L2 latency and staged through 32 least significant
22 # bits of %l0-%l7. The latter is done to achieve 32-/64-bit ABI
23 # duality. Nevertheless it's ~40% faster than SHA256, which is pretty
24 # good [optimal coefficient is 50%].
26 # SHA512 on UltraSPARC T1.
28 # It's not any faster than 64-bit code generated by Sun C 5.8. This is
29 # because 64-bit code generator has the advantage of using 64-bit
30 # loads(*) to access X[16], which I consciously traded for 32-/64-bit
31 # ABI duality [as per above]. But it surpasses 32-bit Sun C generated
32 # code by 60%, not to mention that it doesn't suffer from severe decay
33 # when running threads at 4 times the number of physical cores, and that it leaves gcc
34 # [3.4] behind by over 4x factor! If compared to SHA256, single thread
35 # performance is only 10% better, but overall throughput for maximum
36 # amount of threads for given CPU exceeds corresponding one of SHA256
37 # by 30% [again, optimal coefficient is 50%].
39 # (*) Unlike pre-T1 UltraSPARC loads on T1 are executed strictly
40 # in-order, i.e. a load instruction has to complete prior to the next
41 # instruction in the given thread being executed, even if the latter is
42 # not dependent on load result! This means that on T1 two 32-bit
43 # loads are always slower than one 64-bit load. Once again this
44 # is unlike pre-T1 UltraSPARC, where, if scheduled appropriately,
45 # 2x32-bit loads can be as fast as 1x64-bit ones.
47 # SPARC T4 SHA256/512 hardware achieves 3.17/2.01 cycles per byte,
48 # which is 9.3x/11.1x faster than software. Multi-process benchmark
49 # saturates at 11.5x single-process result on 8-core processor, or
50 # ~11/16GBps per 2.85GHz socket.
54 for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
55 if ($bits==64) { $bias=2047; $frame=192; }
56 else { $bias=0; $frame=112; }
59 open STDOUT
,">$output";
61 if ($output =~ /512/) {
64 $LD="ldx"; # load from memory
65 $ST="stx"; # store to memory
66 $SLL="sllx"; # shift left logical
67 $SRL="srlx"; # shift right logical
70 @sigma0=( 7, 1, 8); # right shift first
71 @sigma1=( 6,19,61); # right shift first
76 $locals=16*$SZ; # X[16]
86 @V=($A,$B,$C,$D,$E,$F,$G,$H);
90 $LD="ld"; # load from memory
91 $ST="st"; # store to memory
92 $SLL="sll"; # shift left logical
93 $SRL="srl"; # shift right logical
96 @sigma0=( 3, 7,18); # right shift first
97 @sigma1=(10,17,19); # right shift first
102 $locals=0; # X[16] is register resident
103 @X=("%o0","%o1","%o2","%o3","%o4","%o5","%g1","%o7");
113 @V=($A,$B,$C,$D,$E,$F,$G,$H);
129 my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
139 subcc
%g0,$tmp31,$tmp32 ! should be
64-$tmp31, but
-$tmp31 works too
144 sllx
@X[0],$tmp31,@X[0]
149 srlx
@X[$j+1],$tmp32,$tmp1
150 sllx
@X[$j+1],$tmp31,@X[$j+1]
151 or $tmp1,@X[$j],@X[$j]
162 $code.="\tadd @X[$i/2],$h,$T1\n";
164 $code.="\tsrlx @X[$i/2],32,$T1\n\tadd $h,$T1,$T1\n";
170 my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
171 my @pair=("%l".eval(($i*2)%8),"%l".eval(($i*2)%8+1),"%l".eval((($i+1)*2)%8));
173 $code.=<<___
if ($i==0);
184 $code.=<<___
if ($i<15);
185 sllx
@pair[1],$tmp31,$tmp2 ! Xload
($i)
187 sllx
@pair[0],$tmp0,$tmp1
188 `"ld [$inp+".eval(32+0+$i*8)."],@pair[0]" if ($i<12)`
189 srlx
@pair[2],$tmp32,@pair[1]
191 or @pair[1],$tmp2,$tmp2
192 `"ld [$inp+".eval(32+4+$i*8)."],@pair[1]" if ($i<12)`
194 $ST $tmp2,[%sp+`$bias+$frame+$i*$SZ`]
196 $code.=<<___
if ($i==12);
200 $code.=<<___
if ($i==15);
201 ld
[%sp+`$bias+$frame+(($i+1+1)%16)*$SZ+0`],%l2
202 sllx
@pair[1],$tmp31,$tmp2 ! Xload
($i)
204 ld
[%sp+`$bias+$frame+(($i+1+1)%16)*$SZ+4`],%l3
205 sllx
@pair[0],$tmp0,$tmp1
206 ld
[%sp+`$bias+$frame+(($i+1+9)%16)*$SZ+0`],%l4
207 srlx
@pair[2],$tmp32,@pair[1]
209 ld
[%sp+`$bias+$frame+(($i+1+9)%16)*$SZ+4`],%l5
210 or @pair[1],$tmp2,$tmp2
211 ld
[%sp+`$bias+$frame+(($i+1+14)%16)*$SZ+0`],%l6
213 $ST $tmp2,[%sp+`$bias+$frame+$i*$SZ`]
214 ld
[%sp+`$bias+$frame+(($i+1+14)%16)*$SZ+4`],%l7
215 ld
[%sp+`$bias+$frame+(($i+1+0)%16)*$SZ+0`],%l0
216 ld
[%sp+`$bias+$frame+(($i+1+0)%16)*$SZ+4`],%l1
222 my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
227 $code.="\tadd $h,$T1,$T1\n";
231 $SRL $e,@Sigma1[0],$h !! $i
233 $SLL $e,`$SZ*8-@Sigma1[2]`,$tmp1
235 $SRL $e,@Sigma1[1],$tmp0
237 $SLL $e,`$SZ*8-@Sigma1[1]`,$tmp1
239 $SRL $e,@Sigma1[2],$tmp0
241 $SLL $e,`$SZ*8-@Sigma1[0]`,$tmp1
243 xor $g,$tmp2,$tmp2 ! Ch
(e
,f
,g
)
244 xor $tmp1,$h,$tmp0 ! Sigma1
(e
)
246 $SRL $a,@Sigma0[0],$h
248 $LD [$Ktbl+`$i*$SZ`],$tmp2 ! K
[$i]
249 $SLL $a,`$SZ*8-@Sigma0[2]`,$tmp1
251 $SRL $a,@Sigma0[1],$tmp0
253 $SLL $a,`$SZ*8-@Sigma0[1]`,$tmp1
255 $SRL $a,@Sigma0[2],$tmp0
257 $SLL $a,`$SZ*8-@Sigma0[0]`,$tmp1
259 xor $tmp1,$h,$h ! Sigma0
(a
)
264 or $tmp0,$tmp1,$tmp1 ! Maj
(a
,b
,c
)
265 add
$tmp2,$T1,$T1 ! +=K
[$i]
280 $code.="\tsrlx @X[(($i+1)/2)%8],32,$xi\n";
282 $xi=@X[(($i+1)/2)%8];
285 srl
$xi,@sigma0[0],$T1 !! Xupdate
($i)
286 sll
$xi,`32-@sigma0[2]`,$tmp1
287 srl
$xi,@sigma0[1],$tmp0
289 sll
$tmp1,`@sigma0[2]-@sigma0[1]`,$tmp1
291 srl
$xi,@sigma0[2],$tmp0
295 $xi=@X[(($i+14)/2)%8];
298 $code.="\tsrlx @X[(($i+14)/2)%8],32,$xi\n";
301 srl
$xi,@sigma1[0],$tmp2
302 xor $tmp0,$T1,$T1 ! T1
=sigma0
(X
[i
+1])
303 sll
$xi,`32-@sigma1[2]`,$tmp1
304 srl
$xi,@sigma1[1],$tmp0
305 xor $tmp1,$tmp2,$tmp2
306 sll
$tmp1,`@sigma1[2]-@sigma1[1]`,$tmp1
307 xor $tmp0,$tmp2,$tmp2
308 srl
$xi,@sigma1[2],$tmp0
309 xor $tmp1,$tmp2,$tmp2
314 srlx
@X[(($i+9)/2)%8],32,$tmp1 ! X
[i
+9]
315 xor $tmp0,$tmp2,$tmp2 ! sigma1
(X
[i
+14])
316 srl
@X[($i/2)%8],0,$tmp0
317 add
$tmp2,$tmp1,$tmp1
318 add
$xi,$T1,$T1 ! +=X
[i
]
319 xor $tmp0,@X[($i/2)%8],@X[($i/2)%8]
323 or $T1,@X[($i/2)%8],@X[($i/2)%8]
326 $xi=@X[(($i+9)/2)%8];
328 srlx
@X[($i/2)%8],32,$tmp1 ! X
[i
]
329 xor $tmp0,$tmp2,$tmp2 ! sigma1
(X
[i
+14])
330 add
$xi,$T1,$T1 ! +=X
[i
+9]
331 add
$tmp2,$tmp1,$tmp1
332 srl
@X[($i/2)%8],0,@X[($i/2)%8]
336 or $tmp0,@X[($i/2)%8],@X[($i/2)%8]
345 my @pair=("%l".eval(($i*2)%8),"%l".eval(($i*2)%8+1));
348 sllx
%l2,32,$tmp0 !! Xupdate
($i)
351 srlx
$tmp0,@sigma0[0],$T1
352 ld
[%sp+`$bias+$frame+(($i+1+1)%16)*$SZ+0`],%l2
353 sllx
$tmp0,`64-@sigma0[2]`,$tmp1
354 ld
[%sp+`$bias+$frame+(($i+1+1)%16)*$SZ+4`],%l3
355 srlx
$tmp0,@sigma0[1],$tmp0
357 sllx
$tmp1,`@sigma0[2]-@sigma0[1]`,$tmp1
359 srlx
$tmp0,`@sigma0[2]-@sigma0[1]`,$tmp0
362 xor $tmp0,$T1,$T1 ! sigma0
(X
[$i+1])
365 srlx
$tmp2,@sigma1[0],$tmp1
366 ld
[%sp+`$bias+$frame+(($i+1+14)%16)*$SZ+0`],%l6
367 sllx
$tmp2,`64-@sigma1[2]`,$tmp0
368 ld
[%sp+`$bias+$frame+(($i+1+14)%16)*$SZ+4`],%l7
369 srlx
$tmp2,@sigma1[1],$tmp2
370 xor $tmp0,$tmp1,$tmp1
371 sllx
$tmp0,`@sigma1[2]-@sigma1[1]`,$tmp0
372 xor $tmp2,$tmp1,$tmp1
373 srlx
$tmp2,`@sigma1[2]-@sigma1[1]`,$tmp2
374 xor $tmp0,$tmp1,$tmp1
376 xor $tmp2,$tmp1,$tmp1 ! sigma1
(X
[$i+14])
377 ld
[%sp+`$bias+$frame+(($i+1+9)%16)*$SZ+0`],%l4
379 ld
[%sp+`$bias+$frame+(($i+1+9)%16)*$SZ+4`],%l5
383 ld
[%sp+`$bias+$frame+(($i+1+0)%16)*$SZ+0`],%l0
385 add
$tmp0,$T1,$T1 ! +=X
[$i+9]
386 ld
[%sp+`$bias+$frame+(($i+1+0)%16)*$SZ+4`],%l1
387 add
$tmp2,$T1,$T1 ! +=X
[$i]
388 $ST $T1,[%sp+`$bias+$frame+($i%16)*$SZ`]
393 $code.=<<___
if ($bits==64);
394 .register
%g2,#scratch
395 .register
%g3,#scratch
398 #include "sparc_arch.h"
400 .section
".text",#alloc,#execinstr
404 .type K
${label
},#object
408 .long
0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
409 .long
0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5
410 .long
0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3
411 .long
0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174
412 .long
0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc
413 .long
0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da
414 .long
0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7
415 .long
0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967
416 .long
0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13
417 .long
0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85
418 .long
0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3
419 .long
0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070
420 .long
0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5
421 .long
0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
422 .long
0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
423 .long
0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
427 .long
0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd
428 .long
0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc
429 .long
0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019
430 .long
0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118
431 .long
0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe
432 .long
0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2
433 .long
0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1
434 .long
0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694
435 .long
0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3
436 .long
0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65
437 .long
0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483
438 .long
0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5
439 .long
0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210
440 .long
0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4
441 .long
0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725
442 .long
0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70
443 .long
0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926
444 .long
0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df
445 .long
0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8
446 .long
0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b
447 .long
0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001
448 .long
0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30
449 .long
0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910
450 .long
0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8
451 .long
0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53
452 .long
0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8
453 .long
0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb
454 .long
0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3
455 .long
0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60
456 .long
0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec
457 .long
0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9
458 .long
0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b
459 .long
0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207
460 .long
0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178
461 .long
0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6
462 .long
0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b
463 .long
0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493
464 .long
0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c
465 .long
0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a
466 .long
0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817
470 .size K
${label
},.-K
${label
}
476 .globl sha
${label
}_block_data_order
478 sha
${label
}_block_data_order
:
479 SPARC_LOAD_ADDRESS_LEAF
(OPENSSL_sparcv9cap_P
,%g1,%g5)
480 ld
[%g1+4],%g1 ! OPENSSL_sparcv9cap_P
[1]
482 andcc
%g1, CFR_SHA
${label
}, %g0
486 $code.=<<___
if ($SZ==8); # SHA512
487 ldd
[%o0 + 0x00], %f0 ! load context
488 ldd
[%o0 + 0x08], %f2
489 ldd
[%o0 + 0x10], %f4
490 ldd
[%o0 + 0x18], %f6
491 ldd
[%o0 + 0x20], %f8
492 ldd
[%o0 + 0x28], %f10
494 ldd
[%o0 + 0x30], %f12
495 bne
,pn
%icc, .Lhwunaligned
496 ldd
[%o0 + 0x38], %f14
499 ldd
[%o1 + 0x00], %f16
500 ldd
[%o1 + 0x08], %f18
501 ldd
[%o1 + 0x10], %f20
502 ldd
[%o1 + 0x18], %f22
503 ldd
[%o1 + 0x20], %f24
504 ldd
[%o1 + 0x28], %f26
505 ldd
[%o1 + 0x30], %f28
506 ldd
[%o1 + 0x38], %f30
507 ldd
[%o1 + 0x40], %f32
508 ldd
[%o1 + 0x48], %f34
509 ldd
[%o1 + 0x50], %f36
510 ldd
[%o1 + 0x58], %f38
511 ldd
[%o1 + 0x60], %f40
512 ldd
[%o1 + 0x68], %f42
513 ldd
[%o1 + 0x70], %f44
514 subcc
%o2, 1, %o2 ! done yet?
515 ldd
[%o1 + 0x78], %f46
518 .word
0x81b02860 ! SHA512
520 bne
,pt
`$bits==64?"%xcc":"%icc"`, .Lhwaligned_loop
524 std
%f0, [%o0 + 0x00] ! store context
525 std
%f2, [%o0 + 0x08]
526 std
%f4, [%o0 + 0x10]
527 std
%f6, [%o0 + 0x18]
528 std
%f8, [%o0 + 0x20]
529 std
%f10, [%o0 + 0x28]
530 std
%f12, [%o0 + 0x30]
532 std
%f14, [%o0 + 0x38]
536 alignaddr
%o1, %g0, %o1
538 ldd
[%o1 + 0x00], %f18
540 ldd
[%o1 + 0x08], %f20
541 ldd
[%o1 + 0x10], %f22
542 ldd
[%o1 + 0x18], %f24
543 ldd
[%o1 + 0x20], %f26
544 ldd
[%o1 + 0x28], %f28
545 ldd
[%o1 + 0x30], %f30
546 ldd
[%o1 + 0x38], %f32
547 ldd
[%o1 + 0x40], %f34
548 ldd
[%o1 + 0x48], %f36
549 ldd
[%o1 + 0x50], %f38
550 ldd
[%o1 + 0x58], %f40
551 ldd
[%o1 + 0x60], %f42
552 ldd
[%o1 + 0x68], %f44
553 ldd
[%o1 + 0x70], %f46
554 ldd
[%o1 + 0x78], %f48
555 subcc
%o2, 1, %o2 ! done yet?
556 ldd
[%o1 + 0x80], %f50
559 faligndata
%f18, %f20, %f16
560 faligndata
%f20, %f22, %f18
561 faligndata
%f22, %f24, %f20
562 faligndata
%f24, %f26, %f22
563 faligndata
%f26, %f28, %f24
564 faligndata
%f28, %f30, %f26
565 faligndata
%f30, %f32, %f28
566 faligndata
%f32, %f34, %f30
567 faligndata
%f34, %f36, %f32
568 faligndata
%f36, %f38, %f34
569 faligndata
%f38, %f40, %f36
570 faligndata
%f40, %f42, %f38
571 faligndata
%f42, %f44, %f40
572 faligndata
%f44, %f46, %f42
573 faligndata
%f46, %f48, %f44
574 faligndata
%f48, %f50, %f46
576 .word
0x81b02860 ! SHA512
578 bne
,pt
`$bits==64?"%xcc":"%icc"`, .Lhwunaligned_loop
579 for %f50, %f50, %f18 ! %f18=%f50
584 $code.=<<___
if ($SZ==4); # SHA256
593 bne
,pn
%icc, .Lhwunaligned
597 ldd
[%o1 + 0x00], %f8
598 ldd
[%o1 + 0x08], %f10
599 ldd
[%o1 + 0x10], %f12
600 ldd
[%o1 + 0x18], %f14
601 ldd
[%o1 + 0x20], %f16
602 ldd
[%o1 + 0x28], %f18
603 ldd
[%o1 + 0x30], %f20
604 subcc
%o2, 1, %o2 ! done yet?
605 ldd
[%o1 + 0x38], %f22
608 .word
0x81b02840 ! SHA256
610 bne
,pt
`$bits==64?"%xcc":"%icc"`, .Lhwloop
614 st
%f0, [%o0 + 0x00] ! store context
626 alignaddr
%o1, %g0, %o1
628 ldd
[%o1 + 0x00], %f10
630 ldd
[%o1 + 0x08], %f12
631 ldd
[%o1 + 0x10], %f14
632 ldd
[%o1 + 0x18], %f16
633 ldd
[%o1 + 0x20], %f18
634 ldd
[%o1 + 0x28], %f20
635 ldd
[%o1 + 0x30], %f22
636 ldd
[%o1 + 0x38], %f24
637 subcc
%o2, 1, %o2 ! done yet?
638 ldd
[%o1 + 0x40], %f26
641 faligndata
%f10, %f12, %f8
642 faligndata
%f12, %f14, %f10
643 faligndata
%f14, %f16, %f12
644 faligndata
%f16, %f18, %f14
645 faligndata
%f18, %f20, %f16
646 faligndata
%f20, %f22, %f18
647 faligndata
%f22, %f24, %f20
648 faligndata
%f24, %f26, %f22
650 .word
0x81b02840 ! SHA256
652 bne
,pt
`$bits==64?"%xcc":"%icc"`, .Lhwunaligned_loop
653 for %f26, %f26, %f10 ! %f10=%f26
661 save
%sp,`-$frame-$locals`,%sp
662 and $inp,`$align-1`,$tmp31
663 sllx
$len,`log(16*$SZ)/log(2)`,$len
664 andn
$inp,`$align-1`,$inp
668 $code.=<<___
if ($SZ==8); # SHA512
670 sub $tmp32,$tmp31,$tmp32
674 add
%o7,K
${label
}-.Lpic
,$Ktbl
676 $LD [$ctx+`0*$SZ`],$A
677 $LD [$ctx+`1*$SZ`],$B
678 $LD [$ctx+`2*$SZ`],$C
679 $LD [$ctx+`3*$SZ`],$D
680 $LD [$ctx+`4*$SZ`],$E
681 $LD [$ctx+`5*$SZ`],$F
682 $LD [$ctx+`6*$SZ`],$G
683 $LD [$ctx+`7*$SZ`],$H
687 for ($i=0;$i<16;$i++) { &BODY_00_15
($i,@V); unshift(@V,pop(@V)); }
689 for (;$i<32;$i++) { &$BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
691 and $tmp2,0xfff,$tmp2
694 add
$Ktbl,`16*$SZ`,$Ktbl ! Ktbl
+=16
697 $code.=<<___
if ($SZ==4); # SHA256
698 $LD [$ctx+`0*$SZ`],@X[0]
699 $LD [$ctx+`1*$SZ`],@X[1]
700 $LD [$ctx+`2*$SZ`],@X[2]
701 $LD [$ctx+`3*$SZ`],@X[3]
702 $LD [$ctx+`4*$SZ`],@X[4]
703 $LD [$ctx+`5*$SZ`],@X[5]
704 $LD [$ctx+`6*$SZ`],@X[6]
705 $LD [$ctx+`7*$SZ`],@X[7]
708 $ST $A,[$ctx+`0*$SZ`]
710 $ST $B,[$ctx+`1*$SZ`]
712 $ST $C,[$ctx+`2*$SZ`]
714 $ST $D,[$ctx+`3*$SZ`]
716 $ST $E,[$ctx+`4*$SZ`]
718 $ST $F,[$ctx+`5*$SZ`]
720 $ST $G,[$ctx+`6*$SZ`]
722 $ST $H,[$ctx+`7*$SZ`]
724 $code.=<<___
if ($SZ==8); # SHA512
725 ld
[$ctx+`0*$SZ+0`],%l0
726 ld
[$ctx+`0*$SZ+4`],%l1
727 ld
[$ctx+`1*$SZ+0`],%l2
728 ld
[$ctx+`1*$SZ+4`],%l3
729 ld
[$ctx+`2*$SZ+0`],%l4
730 ld
[$ctx+`2*$SZ+4`],%l5
731 ld
[$ctx+`3*$SZ+0`],%l6
734 ld
[$ctx+`3*$SZ+4`],%l7
740 $ST $A,[$ctx+`0*$SZ`]
742 $ST $B,[$ctx+`1*$SZ`]
747 $ST $C,[$ctx+`2*$SZ`]
749 $ST $D,[$ctx+`3*$SZ`]
751 ld
[$ctx+`4*$SZ+0`],%l0
752 ld
[$ctx+`4*$SZ+4`],%l1
753 ld
[$ctx+`5*$SZ+0`],%l2
754 ld
[$ctx+`5*$SZ+4`],%l3
755 ld
[$ctx+`6*$SZ+0`],%l4
756 ld
[$ctx+`6*$SZ+4`],%l5
757 ld
[$ctx+`7*$SZ+0`],%l6
760 ld
[$ctx+`7*$SZ+4`],%l7
766 $ST $E,[$ctx+`4*$SZ`]
768 $ST $F,[$ctx+`5*$SZ`]
773 $ST $G,[$ctx+`6*$SZ`]
775 $ST $H,[$ctx+`7*$SZ`]
778 add
$inp,`16*$SZ`,$inp ! advance inp
780 bne
`$bits==64?"%xcc":"%icc"`,.Lloop
781 sub $Ktbl,`($rounds-16)*$SZ`,$Ktbl ! rewind Ktbl
785 .type sha
${label
}_block_data_order
,#function
786 .size sha
${label
}_block_data_order
,(.-sha
${label
}_block_data_order
)
787 .asciz
"SHA${label} block transform for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
791 # Purpose of these subroutines is to explicitly encode VIS instructions,
792 # so that one can compile the module without having to specify VIS
793 # extensions on compiler command line, e.g. -xarch=v9 vs. -xarch=v9a.
794 # Idea is to reserve for option to produce "universal" binary and let
795 # programmer detect if current CPU is VIS capable at run-time.
797 my ($mnemonic,$rs1,$rs2,$rd)=@_;
799 my %visopf = ( "faligndata" => 0x048,
802 $ref = "$mnemonic\t$rs1,$rs2,$rd";
804 if ($opf=$visopf{$mnemonic}) {
805 foreach ($rs1,$rs2,$rd) {
806 return $ref if (!/%f([0-9]{1,2})/);
809 return $ref if ($1&1);
810 # re-encode for upper double register addressing
815 return sprintf ".word\t0x%08x !%s",
816 0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
823 my ($mnemonic,$rs1,$rs2,$rd)=@_;
824 my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
825 my $ref="$mnemonic\t$rs1,$rs2,$rd";
827 foreach ($rs1,$rs2,$rd) {
828 if (/%([goli])([0-7])/) { $_=$bias{$1}+$2; }
829 else { return $ref; }
831 return sprintf ".word\t0x%08x !%s",
832 0x81b00300|$rd<<25|$rs1<<14|$rs2,
836 foreach (split("\n",$code)) {
837 s/\`([^\`]*)\`/eval $1/ge;
839 s
/\b(f[^\s]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/
842 s
/\b(alignaddr)\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/
843 &unalignaddr
($1,$2,$3,$4)