]>
git.ipfire.org Git - thirdparty/openssl.git/blob - crypto/sha/asm/sha512-sparcv9.pl
3 # ====================================================================
4 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
9 # Hardware SPARC T4 support by David S. Miller <davem@davemloft.net>.
10 # ====================================================================
12 # SHA256 performance improvement over compiler generated code varies
13 # from 40% for Sun C [32-bit build] to 70% for gcc [3.3, 64-bit
14 # build]. Just like in SHA1 module I aim to ensure scalability on
15 # UltraSPARC T1 by packing X[16] to 8 64-bit registers.
17 # SHA512 on pre-T1 UltraSPARC.
19 # Performance is >75% better than 64-bit code generated by Sun C and
20 # over 2x than 32-bit code. X[16] resides on stack, but access to it
21 # is scheduled for L2 latency and staged through 32 least significant
22 # bits of %l0-%l7. The latter is done to achieve 32-/64-bit ABI
23 # duality. Nevertheless it's ~40% faster than SHA256, which is pretty
24 # good [optimal coefficient is 50%].
26 # SHA512 on UltraSPARC T1.
28 # It's not any faster than 64-bit code generated by Sun C 5.8. This is
29 # because 64-bit code generator has the advantage of using 64-bit
30 # loads(*) to access X[16], which I consciously traded for 32-/64-bit
31 # ABI duality [as per above]. But it surpasses 32-bit Sun C generated
32 # code by 60%, not to mention that it doesn't suffer from severe decay
33 # when running threads at 4 times the number of physical cores, and that it leaves gcc
34 # [3.4] behind by over 4x factor! If compared to SHA256, single thread
35 # performance is only 10% better, but overall throughput for maximum
36 # amount of threads for given CPU exceeds corresponding one of SHA256
37 # by 30% [again, optimal coefficient is 50%].
39 # (*) Unlike pre-T1 UltraSPARC loads on T1 are executed strictly
40 # in-order, i.e. a load instruction has to complete prior to the next
41 # instruction in the given thread being executed, even if the latter is
42 # not dependent on load result! This means that on T1 two 32-bit
43 # loads are always slower than one 64-bit load. Once again this
44 # is unlike pre-T1 UltraSPARC, where, if scheduled appropriately,
45 # 2x32-bit loads can be as fast as 1x64-bit ones.
47 # SPARC T4 SHA256/512 hardware achieves 3.17/2.01 cycles per byte,
48 # which is 9.3x/11.1x faster than software. Multi-process benchmark
49 # saturates at 11.5x single-process result on 8-core processor, or
50 # ~11/16GBps per 2.85GHz socket.
54 for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
55 if ($bits==64) { $bias=2047; $frame=192; }
56 else { $bias=0; $frame=112; }
59 open STDOUT
,">$output";
61 if ($output =~ /512/) {
64 $LD="ldx"; # load from memory
65 $ST="stx"; # store to memory
66 $SLL="sllx"; # shift left logical
67 $SRL="srlx"; # shift right logical
70 @sigma0=( 7, 1, 8); # right shift first
71 @sigma1=( 6,19,61); # right shift first
76 $locals=16*$SZ; # X[16]
86 @V=($A,$B,$C,$D,$E,$F,$G,$H);
90 $LD="ld"; # load from memory
91 $ST="st"; # store to memory
92 $SLL="sll"; # shift left logical
93 $SRL="srl"; # shift right logical
96 @sigma0=( 3, 7,18); # right shift first
97 @sigma1=(10,17,19); # right shift first
102 $locals=0; # X[16] is register resident
103 @X=("%o0","%o1","%o2","%o3","%o4","%o5","%g1","%o7");
113 @V=($A,$B,$C,$D,$E,$F,$G,$H);
129 my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
139 subcc
%g0,$tmp31,$tmp32 ! should be
64-$tmp31, but
-$tmp31 works too
144 sllx
@X[0],$tmp31,@X[0]
149 srlx
@X[$j+1],$tmp32,$tmp1
150 sllx
@X[$j+1],$tmp31,@X[$j+1]
151 or $tmp1,@X[$j],@X[$j]
162 $code.="\tadd @X[$i/2],$h,$T1\n";
164 $code.="\tsrlx @X[$i/2],32,$T1\n\tadd $h,$T1,$T1\n";
170 my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
171 my @pair=("%l".eval(($i*2)%8),"%l".eval(($i*2)%8+1),"%l".eval((($i+1)*2)%8));
173 $code.=<<___
if ($i==0);
184 $code.=<<___
if ($i<15);
185 sllx
@pair[1],$tmp31,$tmp2 ! Xload
($i)
187 sllx
@pair[0],$tmp0,$tmp1
188 `"ld [$inp+".eval(32+0+$i*8)."],@pair[0]" if ($i<12)`
189 srlx
@pair[2],$tmp32,@pair[1]
191 or @pair[1],$tmp2,$tmp2
192 `"ld [$inp+".eval(32+4+$i*8)."],@pair[1]" if ($i<12)`
194 $ST $tmp2,[%sp+`$bias+$frame+$i*$SZ`]
196 $code.=<<___
if ($i==12);
200 $code.=<<___
if ($i==15);
201 ld
[%sp+`$bias+$frame+(($i+1+1)%16)*$SZ+0`],%l2
202 sllx
@pair[1],$tmp31,$tmp2 ! Xload
($i)
204 ld
[%sp+`$bias+$frame+(($i+1+1)%16)*$SZ+4`],%l3
205 sllx
@pair[0],$tmp0,$tmp1
206 ld
[%sp+`$bias+$frame+(($i+1+9)%16)*$SZ+0`],%l4
207 srlx
@pair[2],$tmp32,@pair[1]
209 ld
[%sp+`$bias+$frame+(($i+1+9)%16)*$SZ+4`],%l5
210 or @pair[1],$tmp2,$tmp2
211 ld
[%sp+`$bias+$frame+(($i+1+14)%16)*$SZ+0`],%l6
213 $ST $tmp2,[%sp+`$bias+$frame+$i*$SZ`]
214 ld
[%sp+`$bias+$frame+(($i+1+14)%16)*$SZ+4`],%l7
215 ld
[%sp+`$bias+$frame+(($i+1+0)%16)*$SZ+0`],%l0
216 ld
[%sp+`$bias+$frame+(($i+1+0)%16)*$SZ+4`],%l1
222 my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
227 $code.="\tadd $h,$T1,$T1\n";
231 $SRL $e,@Sigma1[0],$h !! $i
233 $SLL $e,`$SZ*8-@Sigma1[2]`,$tmp1
235 $SRL $e,@Sigma1[1],$tmp0
237 $SLL $e,`$SZ*8-@Sigma1[1]`,$tmp1
239 $SRL $e,@Sigma1[2],$tmp0
241 $SLL $e,`$SZ*8-@Sigma1[0]`,$tmp1
243 xor $g,$tmp2,$tmp2 ! Ch
(e
,f
,g
)
244 xor $tmp1,$h,$tmp0 ! Sigma1
(e
)
246 $SRL $a,@Sigma0[0],$h
248 $LD [$Ktbl+`$i*$SZ`],$tmp2 ! K
[$i]
249 $SLL $a,`$SZ*8-@Sigma0[2]`,$tmp1
251 $SRL $a,@Sigma0[1],$tmp0
253 $SLL $a,`$SZ*8-@Sigma0[1]`,$tmp1
255 $SRL $a,@Sigma0[2],$tmp0
257 $SLL $a,`$SZ*8-@Sigma0[0]`,$tmp1
259 xor $tmp1,$h,$h ! Sigma0
(a
)
264 or $tmp0,$tmp1,$tmp1 ! Maj
(a
,b
,c
)
265 add
$tmp2,$T1,$T1 ! +=K
[$i]
280 $code.="\tsrlx @X[(($i+1)/2)%8],32,$xi\n";
282 $xi=@X[(($i+1)/2)%8];
285 srl
$xi,@sigma0[0],$T1 !! Xupdate
($i)
286 sll
$xi,`32-@sigma0[2]`,$tmp1
287 srl
$xi,@sigma0[1],$tmp0
289 sll
$tmp1,`@sigma0[2]-@sigma0[1]`,$tmp1
291 srl
$xi,@sigma0[2],$tmp0
295 $xi=@X[(($i+14)/2)%8];
298 $code.="\tsrlx @X[(($i+14)/2)%8],32,$xi\n";
301 srl
$xi,@sigma1[0],$tmp2
302 xor $tmp0,$T1,$T1 ! T1
=sigma0
(X
[i
+1])
303 sll
$xi,`32-@sigma1[2]`,$tmp1
304 srl
$xi,@sigma1[1],$tmp0
305 xor $tmp1,$tmp2,$tmp2
306 sll
$tmp1,`@sigma1[2]-@sigma1[1]`,$tmp1
307 xor $tmp0,$tmp2,$tmp2
308 srl
$xi,@sigma1[2],$tmp0
309 xor $tmp1,$tmp2,$tmp2
314 srlx
@X[(($i+9)/2)%8],32,$tmp1 ! X
[i
+9]
315 xor $tmp0,$tmp2,$tmp2 ! sigma1
(X
[i
+14])
316 srl
@X[($i/2)%8],0,$tmp0
317 add
$tmp2,$tmp1,$tmp1
318 add
$xi,$T1,$T1 ! +=X
[i
]
319 xor $tmp0,@X[($i/2)%8],@X[($i/2)%8]
323 or $T1,@X[($i/2)%8],@X[($i/2)%8]
326 $xi=@X[(($i+9)/2)%8];
328 srlx
@X[($i/2)%8],32,$tmp1 ! X
[i
]
329 xor $tmp0,$tmp2,$tmp2 ! sigma1
(X
[i
+14])
330 add
$xi,$T1,$T1 ! +=X
[i
+9]
331 add
$tmp2,$tmp1,$tmp1
332 srl
@X[($i/2)%8],0,@X[($i/2)%8]
336 or $tmp0,@X[($i/2)%8],@X[($i/2)%8]
345 my @pair=("%l".eval(($i*2)%8),"%l".eval(($i*2)%8+1));
348 sllx
%l2,32,$tmp0 !! Xupdate
($i)
351 srlx
$tmp0,@sigma0[0],$T1
352 ld
[%sp+`$bias+$frame+(($i+1+1)%16)*$SZ+0`],%l2
353 sllx
$tmp0,`64-@sigma0[2]`,$tmp1
354 ld
[%sp+`$bias+$frame+(($i+1+1)%16)*$SZ+4`],%l3
355 srlx
$tmp0,@sigma0[1],$tmp0
357 sllx
$tmp1,`@sigma0[2]-@sigma0[1]`,$tmp1
359 srlx
$tmp0,`@sigma0[2]-@sigma0[1]`,$tmp0
362 xor $tmp0,$T1,$T1 ! sigma0
(X
[$i+1])
365 srlx
$tmp2,@sigma1[0],$tmp1
366 ld
[%sp+`$bias+$frame+(($i+1+14)%16)*$SZ+0`],%l6
367 sllx
$tmp2,`64-@sigma1[2]`,$tmp0
368 ld
[%sp+`$bias+$frame+(($i+1+14)%16)*$SZ+4`],%l7
369 srlx
$tmp2,@sigma1[1],$tmp2
370 xor $tmp0,$tmp1,$tmp1
371 sllx
$tmp0,`@sigma1[2]-@sigma1[1]`,$tmp0
372 xor $tmp2,$tmp1,$tmp1
373 srlx
$tmp2,`@sigma1[2]-@sigma1[1]`,$tmp2
374 xor $tmp0,$tmp1,$tmp1
376 xor $tmp2,$tmp1,$tmp1 ! sigma1
(X
[$i+14])
377 ld
[%sp+`$bias+$frame+(($i+1+9)%16)*$SZ+0`],%l4
379 ld
[%sp+`$bias+$frame+(($i+1+9)%16)*$SZ+4`],%l5
383 ld
[%sp+`$bias+$frame+(($i+1+0)%16)*$SZ+0`],%l0
385 add
$tmp0,$T1,$T1 ! +=X
[$i+9]
386 ld
[%sp+`$bias+$frame+(($i+1+0)%16)*$SZ+4`],%l1
387 add
$tmp2,$T1,$T1 ! +=X
[$i]
388 $ST $T1,[%sp+`$bias+$frame+($i%16)*$SZ`]
393 $code.=<<___
if ($bits==64);
394 .register
%g2,#scratch
395 .register
%g3,#scratch
398 #include "sparc_arch.h"
400 .section
".text",#alloc,#execinstr
404 .type K
${label
},#object
408 .long
0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
409 .long
0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5
410 .long
0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3
411 .long
0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174
412 .long
0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc
413 .long
0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da
414 .long
0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7
415 .long
0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967
416 .long
0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13
417 .long
0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85
418 .long
0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3
419 .long
0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070
420 .long
0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5
421 .long
0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
422 .long
0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
423 .long
0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
427 .long
0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd
428 .long
0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc
429 .long
0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019
430 .long
0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118
431 .long
0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe
432 .long
0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2
433 .long
0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1
434 .long
0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694
435 .long
0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3
436 .long
0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65
437 .long
0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483
438 .long
0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5
439 .long
0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210
440 .long
0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4
441 .long
0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725
442 .long
0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70
443 .long
0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926
444 .long
0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df
445 .long
0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8
446 .long
0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b
447 .long
0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001
448 .long
0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30
449 .long
0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910
450 .long
0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8
451 .long
0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53
452 .long
0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8
453 .long
0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb
454 .long
0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3
455 .long
0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60
456 .long
0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec
457 .long
0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9
458 .long
0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b
459 .long
0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207
460 .long
0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178
461 .long
0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6
462 .long
0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b
463 .long
0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493
464 .long
0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c
465 .long
0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a
466 .long
0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817
470 .size K
${label
},.-K
${label
}
476 .globl sha
${label
}_block_data_order
478 sha
${label
}_block_data_order
:
479 SPARC_LOAD_ADDRESS_LEAF
(OPENSSL_sparcv9cap_P
,%g1,%g5)
480 ld
[%g1+4],%g1 ! OPENSSL_sparcv9cap_P
[1]
482 andcc
%g1, CFR_SHA
${label
}, %g0
486 $code.=<<___
if ($SZ==8); # SHA512
487 ldd
[%o0 + 0x00], %f0 ! load context
488 ldd
[%o0 + 0x08], %f2
489 ldd
[%o0 + 0x10], %f4
490 ldd
[%o0 + 0x18], %f6
491 ldd
[%o0 + 0x20], %f8
492 ldd
[%o0 + 0x28], %f10
494 ldd
[%o0 + 0x30], %f12
495 bne
,pn
%icc, .Lhwunaligned
496 ldd
[%o0 + 0x38], %f14
499 ldd
[%o1 + 0x00], %f16
500 ldd
[%o1 + 0x08], %f18
501 ldd
[%o1 + 0x10], %f20
502 ldd
[%o1 + 0x18], %f22
503 ldd
[%o1 + 0x20], %f24
504 ldd
[%o1 + 0x28], %f26
505 ldd
[%o1 + 0x30], %f28
506 ldd
[%o1 + 0x38], %f30
507 ldd
[%o1 + 0x40], %f32
508 ldd
[%o1 + 0x48], %f34
509 ldd
[%o1 + 0x50], %f36
510 ldd
[%o1 + 0x58], %f38
511 ldd
[%o1 + 0x60], %f40
512 ldd
[%o1 + 0x68], %f42
513 ldd
[%o1 + 0x70], %f44
514 subcc
%o2, 1, %o2 ! done yet?
515 ldd
[%o1 + 0x78], %f46
518 .word
0x81b02860 ! SHA512
520 bne
,pt
`$bits==64?"%xcc":"%icc"`, .Lhwaligned_loop
524 std
%f0, [%o0 + 0x00] ! store context
525 std
%f2, [%o0 + 0x08]
526 std
%f4, [%o0 + 0x10]
527 std
%f6, [%o0 + 0x18]
528 std
%f8, [%o0 + 0x20]
529 std
%f10, [%o0 + 0x28]
530 std
%f12, [%o0 + 0x30]
532 std
%f14, [%o0 + 0x38]
536 alignaddr
%o1, %g0, %o1
538 ldd
[%o1 + 0x00], %f18
540 ldd
[%o1 + 0x08], %f20
541 ldd
[%o1 + 0x10], %f22
542 ldd
[%o1 + 0x18], %f24
543 ldd
[%o1 + 0x20], %f26
544 ldd
[%o1 + 0x28], %f28
545 ldd
[%o1 + 0x30], %f30
546 ldd
[%o1 + 0x38], %f32
547 ldd
[%o1 + 0x40], %f34
548 ldd
[%o1 + 0x48], %f36
549 ldd
[%o1 + 0x50], %f38
550 ldd
[%o1 + 0x58], %f40
551 ldd
[%o1 + 0x60], %f42
552 ldd
[%o1 + 0x68], %f44
553 ldd
[%o1 + 0x70], %f46
554 ldd
[%o1 + 0x78], %f48
555 subcc
%o2, 1, %o2 ! done yet?
556 ldd
[%o1 + 0x80], %f50
559 faligndata
%f18, %f20, %f16
560 faligndata
%f20, %f22, %f18
561 faligndata
%f22, %f24, %f20
562 faligndata
%f24, %f26, %f22
563 faligndata
%f26, %f28, %f24
564 faligndata
%f28, %f30, %f26
565 faligndata
%f30, %f32, %f28
566 faligndata
%f32, %f34, %f30
567 faligndata
%f34, %f36, %f32
568 faligndata
%f36, %f38, %f34
569 faligndata
%f38, %f40, %f36
570 faligndata
%f40, %f42, %f38
571 faligndata
%f42, %f44, %f40
572 faligndata
%f44, %f46, %f42
573 faligndata
%f46, %f48, %f44
574 faligndata
%f48, %f50, %f46
576 .word
0x81b02860 ! SHA512
578 bne
,pt
`$bits==64?"%xcc":"%icc"`, .Lhwunaligned_loop
579 for %f50, %f50, %f18 ! %f18=%f50
584 $code.=<<___
if ($SZ==4); # SHA256
593 bne
,pn
%icc, .Lhwunaligned
597 ldd
[%o1 + 0x00], %f8
598 ldd
[%o1 + 0x08], %f10
599 ldd
[%o1 + 0x10], %f12
600 ldd
[%o1 + 0x18], %f14
601 ldd
[%o1 + 0x20], %f16
602 ldd
[%o1 + 0x28], %f18
603 ldd
[%o1 + 0x30], %f20
604 subcc
%o2, 1, %o2 ! done yet?
605 ldd
[%o1 + 0x38], %f22
608 .word
0x81b02840 ! SHA256
610 bne
,pt
`$bits==64?"%xcc":"%icc"`, .Lhwloop
614 st
%f0, [%o0 + 0x00] ! store context
626 alignaddr
%o1, %g0, %o1
628 ldd
[%o1 + 0x00], %f10
630 ldd
[%o1 + 0x08], %f12
631 ldd
[%o1 + 0x10], %f14
632 ldd
[%o1 + 0x18], %f16
633 ldd
[%o1 + 0x20], %f18
634 ldd
[%o1 + 0x28], %f20
635 ldd
[%o1 + 0x30], %f22
636 ldd
[%o1 + 0x38], %f24
637 subcc
%o2, 1, %o2 ! done yet?
638 ldd
[%o1 + 0x40], %f26
641 faligndata
%f10, %f12, %f8
642 faligndata
%f12, %f14, %f10
643 faligndata
%f14, %f16, %f12
644 faligndata
%f16, %f18, %f14
645 faligndata
%f18, %f20, %f16
646 faligndata
%f20, %f22, %f18
647 faligndata
%f22, %f24, %f20
648 faligndata
%f24, %f26, %f22
650 .word
0x81b02840 ! SHA256
652 bne
,pt
`$bits==64?"%xcc":"%icc"`, .Lhwunaligned_loop
653 for %f26, %f26, %f10 ! %f10=%f26
661 save
%sp,`-$frame-$locals`,%sp
662 and $inp,`$align-1`,$tmp31
663 sllx
$len,`log(16*$SZ)/log(2)`,$len
664 andn
$inp,`$align-1`,$inp
668 $code.=<<___
if ($SZ==8); # SHA512
670 sub $tmp32,$tmp31,$tmp32
674 add
%o7,K
${label
}-.Lpic
,$Ktbl
676 $LD [$ctx+`0*$SZ`],$A
677 $LD [$ctx+`1*$SZ`],$B
678 $LD [$ctx+`2*$SZ`],$C
679 $LD [$ctx+`3*$SZ`],$D
680 $LD [$ctx+`4*$SZ`],$E
681 $LD [$ctx+`5*$SZ`],$F
682 $LD [$ctx+`6*$SZ`],$G
683 $LD [$ctx+`7*$SZ`],$H
687 for ($i=0;$i<16;$i++) { &BODY_00_15
($i,@V); unshift(@V,pop(@V)); }
689 for (;$i<32;$i++) { &$BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
691 and $tmp2,0xfff,$tmp2
694 add
$Ktbl,`16*$SZ`,$Ktbl ! Ktbl
+=16
697 $code.=<<___
if ($SZ==4); # SHA256
698 $LD [$ctx+`0*$SZ`],@X[0]
699 $LD [$ctx+`1*$SZ`],@X[1]
700 $LD [$ctx+`2*$SZ`],@X[2]
701 $LD [$ctx+`3*$SZ`],@X[3]
702 $LD [$ctx+`4*$SZ`],@X[4]
703 $LD [$ctx+`5*$SZ`],@X[5]
704 $LD [$ctx+`6*$SZ`],@X[6]
705 $LD [$ctx+`7*$SZ`],@X[7]
708 $ST $A,[$ctx+`0*$SZ`]
710 $ST $B,[$ctx+`1*$SZ`]
712 $ST $C,[$ctx+`2*$SZ`]
714 $ST $D,[$ctx+`3*$SZ`]
716 $ST $E,[$ctx+`4*$SZ`]
718 $ST $F,[$ctx+`5*$SZ`]
720 $ST $G,[$ctx+`6*$SZ`]
722 $ST $H,[$ctx+`7*$SZ`]
724 $code.=<<___
if ($SZ==8); # SHA512
725 ld
[$ctx+`0*$SZ+0`],%l0
726 ld
[$ctx+`0*$SZ+4`],%l1
727 ld
[$ctx+`1*$SZ+0`],%l2
728 ld
[$ctx+`1*$SZ+4`],%l3
729 ld
[$ctx+`2*$SZ+0`],%l4
730 ld
[$ctx+`2*$SZ+4`],%l5
731 ld
[$ctx+`3*$SZ+0`],%l6
734 ld
[$ctx+`3*$SZ+4`],%l7
740 $ST $A,[$ctx+`0*$SZ`]
742 $ST $B,[$ctx+`1*$SZ`]
747 $ST $C,[$ctx+`2*$SZ`]
749 $ST $D,[$ctx+`3*$SZ`]
751 ld
[$ctx+`4*$SZ+0`],%l0
752 ld
[$ctx+`4*$SZ+4`],%l1
753 ld
[$ctx+`5*$SZ+0`],%l2
754 ld
[$ctx+`5*$SZ+4`],%l3
755 ld
[$ctx+`6*$SZ+0`],%l4
756 ld
[$ctx+`6*$SZ+4`],%l5
757 ld
[$ctx+`7*$SZ+0`],%l6
760 ld
[$ctx+`7*$SZ+4`],%l7
766 $ST $E,[$ctx+`4*$SZ`]
768 $ST $F,[$ctx+`5*$SZ`]
773 $ST $G,[$ctx+`6*$SZ`]
775 $ST $H,[$ctx+`7*$SZ`]
778 add
$inp,`16*$SZ`,$inp ! advance inp
780 bne
`$bits==64?"%xcc":"%icc"`,.Lloop
781 sub $Ktbl,`($rounds-16)*$SZ`,$Ktbl ! rewind Ktbl
785 .type sha
${label
}_block_data_order
,#function
786 .size sha
${label
}_block_data_order
,(.-sha
${label
}_block_data_order
)
787 .asciz
"SHA${label} block transform for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
791 # Purpose of these subroutines is to explicitly encode VIS instructions,
792 # so that one can compile the module without having to specify VIS
793 # extensions on compiler command line, e.g. -xarch=v9 vs. -xarch=v9a.
794 # Idea is to reserve for option to produce "universal" binary and let
795 # programmer detect if current CPU is VIS capable at run-time.
797 my ($mnemonic,$rs1,$rs2,$rd)=@_;
799 my %visopf = ( "faligndata" => 0x048,
802 $ref = "$mnemonic\t$rs1,$rs2,$rd";
804 if ($opf=$visopf{$mnemonic}) {
805 foreach ($rs1,$rs2,$rd) {
806 return $ref if (!/%f([0-9]{1,2})/);
809 return $ref if ($1&1);
810 # re-encode for upper double register addressing
815 return sprintf ".word\t0x%08x !%s",
816 0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
823 my ($mnemonic,$rs1,$rs2,$rd)=@_;
824 my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
825 my $ref="$mnemonic\t$rs1,$rs2,$rd";
827 foreach ($rs1,$rs2,$rd) {
828 if (/%([goli])([0-7])/) { $_=$bias{$1}+$2; }
829 else { return $ref; }
831 return sprintf ".word\t0x%08x !%s",
832 0x81b00300|$rd<<25|$rs1<<14|$rs2,
836 foreach (split("\n",$code)) {
837 s/\`([^\`]*)\`/eval $1/ge;
839 s
/\b(f[^\s]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/
842 s
/\b(alignaddr)\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/
843 &unalignaddr
($1,$2,$3,$4)