#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
#
# Hardware SPARC T4 support by David S. Miller <davem@davemloft.net>.
# ====================================================================

# SHA256 performance improvement over compiler-generated code varies
# from 40% for Sun C [32-bit build] to 70% for gcc [3.3, 64-bit
# build]. Just like in the SHA1 module I aim to ensure scalability on
# UltraSPARC T1 by packing X[16] to 8 64-bit registers.

# SHA512 on pre-T1 UltraSPARC.
#
# Performance is >75% better than 64-bit code generated by Sun C and
# over 2x better than 32-bit code. X[16] resides on stack, but access
# to it is scheduled for L2 latency and staged through the 32 least
# significant bits of %l0-%l7. The latter is done to achieve 32-/64-bit
# ABI duality. Nevertheless it's ~40% faster than SHA256, which is
# pretty good [optimal coefficient is 50%].
#
# SHA512 on UltraSPARC T1.
#
# It's not any faster than 64-bit code generated by Sun C 5.8. This is
# because the 64-bit code generator has the advantage of using 64-bit
# loads(*) to access X[16], which I consciously traded for 32-/64-bit
# ABI duality [as per above]. But it surpasses 32-bit Sun C generated
# code by 60%, not to mention that it doesn't suffer from severe decay
# when running 4 threads per physical core, and that it leaves gcc
# [3.4] behind by over a 4x factor! Compared to SHA256, single-thread
# performance is only 10% better, but overall throughput at the
# maximum thread count for a given CPU exceeds the corresponding
# SHA256 figure by 30% [again, optimal coefficient is 50%].
#
# (*) Unlike on pre-T1 UltraSPARC, loads on T1 are executed strictly
#     in-order, i.e. a load instruction has to complete before the
#     next instruction in the given thread is executed, even if the
#     latter does not depend on the load result! This means that on T1
#     two 32-bit loads are always slower than one 64-bit load. Once
#     again this is unlike pre-T1 UltraSPARC, where, if scheduled
#     appropriately, 2x32-bit loads can be as fast as 1x64-bit ones.
#
# SPARC T4 SHA256/512 hardware achieves 3.17/2.01 cycles per byte,
# which is 9.3x/11.1x faster than software. Multi-process benchmark
# saturates at 11.5x single-process result on an 8-core processor, or
# ~11/16GBps per 2.85GHz socket.


$bits=32;
for (@ARGV)	{ $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
if ($bits==64)	{ $bias=2047; $frame=192; }
else		{ $bias=0;    $frame=112; }
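# Note: the 64-bit SPARC V9 ABI biases %sp and %fp by 2047 bytes,
# hence $bias; stack-resident locals below are therefore addressed
# as [%sp+$bias+$frame+offset].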

$output=shift;
open STDOUT,">$output";

if ($output =~ /512/) {
	$label="512";
	$SZ=8;
	$LD="ldx";		# load from memory
	$ST="stx";		# store to memory
	$SLL="sllx";		# shift left logical
	$SRL="srlx";		# shift right logical
	@Sigma0=(28,34,39);
	@Sigma1=(14,18,41);
	@sigma0=( 7, 1, 8);	# right shift first
	@sigma1=( 6,19,61);	# right shift first
	$lastK=0x817;
	$rounds=80;
	$align=4;

	$locals=16*$SZ;		# X[16]

	$A="%o0";
	$B="%o1";
	$C="%o2";
	$D="%o3";
	$E="%o4";
	$F="%o5";
	$G="%g1";
	$H="%o7";
	@V=($A,$B,$C,$D,$E,$F,$G,$H);
} else {
	$label="256";
	$SZ=4;
	$LD="ld";		# load from memory
	$ST="st";		# store to memory
	$SLL="sll";		# shift left logical
	$SRL="srl";		# shift right logical
	@Sigma0=( 2,13,22);
	@Sigma1=( 6,11,25);
	@sigma0=( 3, 7,18);	# right shift first
	@sigma1=(10,17,19);	# right shift first
	$lastK=0x8f2;
	$rounds=64;
	$align=8;

	$locals=0;		# X[16] is register resident
	@X=("%o0","%o1","%o2","%o3","%o4","%o5","%g1","%o7");

	$A="%l0";
	$B="%l1";
	$C="%l2";
	$D="%l3";
	$E="%l4";
	$F="%l5";
	$G="%l6";
	$H="%l7";
	@V=($A,$B,$C,$D,$E,$F,$G,$H);
}
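# Convention for the tuples above: @Sigma0 and @Sigma1 are the three
# rotate amounts of the "big" sigma functions, while in @sigma0 and
# @sigma1 the first element is a plain right shift and the other two
# are rotates ("right shift first"), e.g. for SHA512
# sigma0(x) = (x>>7) ^ ROTR(x,1) ^ ROTR(x,8), per FIPS 180-4.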
$T1="%g2";
$tmp0="%g3";
$tmp1="%g4";
$tmp2="%g5";

$ctx="%i0";
$inp="%i1";
$len="%i2";
$Ktbl="%i3";
$tmp31="%i4";
$tmp32="%i5";

########### SHA256
$Xload = sub {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;

    if ($i==0) {
$code.=<<___;
	ldx	[$inp+0],@X[0]
	ldx	[$inp+16],@X[2]
	ldx	[$inp+32],@X[4]
	ldx	[$inp+48],@X[6]
	ldx	[$inp+8],@X[1]
	ldx	[$inp+24],@X[3]
	subcc	%g0,$tmp31,$tmp32 ! should be 64-$tmp31, but -$tmp31 works too
	ldx	[$inp+40],@X[5]
	bz,pt	%icc,.Laligned
	ldx	[$inp+56],@X[7]

	sllx	@X[0],$tmp31,@X[0]
	ldx	[$inp+64],$T1
___
for($j=0;$j<7;$j++)
{   $code.=<<___;
	srlx	@X[$j+1],$tmp32,$tmp1
	sllx	@X[$j+1],$tmp31,@X[$j+1]
	or	$tmp1,@X[$j],@X[$j]
___
}
$code.=<<___;
	srlx	$T1,$tmp32,$T1
	or	$T1,@X[7],@X[7]
.Laligned:
___
    }

    if ($i&1) {
	$code.="\tadd\t@X[$i/2],$h,$T1\n";
    } else {
	$code.="\tsrlx\t@X[$i/2],32,$T1\n\tadd\t$h,$T1,$T1\n";
    }
} if ($SZ==4);
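# The SHA256 loader above keeps X[16] entirely register-resident: each
# 64-bit @X register holds a big-endian pair of message words, X[2j]
# in the upper and X[2j+1] in the lower 32 bits. A misaligned $inp has
# already been rounded down to 8 bytes [in .Lsoftware below], so the
# residual byte offset, expressed as a bit count in $tmp31, is
# compensated by the sllx/srlx/or chain, which pulls the tail in from
# a 17th doubleword at [$inp+64].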

########### SHA512
$Xload = sub {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
my @pair=("%l".eval(($i*2)%8),"%l".eval(($i*2)%8+1),"%l".eval((($i+1)*2)%8));

$code.=<<___ if ($i==0);
	ld	[$inp+0],%l0
	ld	[$inp+4],%l1
	ld	[$inp+8],%l2
	ld	[$inp+12],%l3
	ld	[$inp+16],%l4
	ld	[$inp+20],%l5
	ld	[$inp+24],%l6
	cmp	$tmp31,0
	ld	[$inp+28],%l7
___
$code.=<<___ if ($i<15);
	sllx	@pair[1],$tmp31,$tmp2	! Xload($i)
	add	$tmp31,32,$tmp0
	sllx	@pair[0],$tmp0,$tmp1
	`"ld	[$inp+".eval(32+0+$i*8)."],@pair[0]"	if ($i<12)`
	srlx	@pair[2],$tmp32,@pair[1]
	or	$tmp1,$tmp2,$tmp2
	or	@pair[1],$tmp2,$tmp2
	`"ld	[$inp+".eval(32+4+$i*8)."],@pair[1]"	if ($i<12)`
	add	$h,$tmp2,$T1
	$ST	$tmp2,[%sp+`$bias+$frame+$i*$SZ`]
___
$code.=<<___ if ($i==12);
	bnz,a,pn	%icc,.+8
	ld	[$inp+128],%l0
___
$code.=<<___ if ($i==15);
	ld	[%sp+`$bias+$frame+(($i+1+1)%16)*$SZ+0`],%l2
	sllx	@pair[1],$tmp31,$tmp2	! Xload($i)
	add	$tmp31,32,$tmp0
	ld	[%sp+`$bias+$frame+(($i+1+1)%16)*$SZ+4`],%l3
	sllx	@pair[0],$tmp0,$tmp1
	ld	[%sp+`$bias+$frame+(($i+1+9)%16)*$SZ+0`],%l4
	srlx	@pair[2],$tmp32,@pair[1]
	or	$tmp1,$tmp2,$tmp2
	ld	[%sp+`$bias+$frame+(($i+1+9)%16)*$SZ+4`],%l5
	or	@pair[1],$tmp2,$tmp2
	ld	[%sp+`$bias+$frame+(($i+1+14)%16)*$SZ+0`],%l6
	add	$h,$tmp2,$T1
	$ST	$tmp2,[%sp+`$bias+$frame+$i*$SZ`]
	ld	[%sp+`$bias+$frame+(($i+1+14)%16)*$SZ+4`],%l7
	ld	[%sp+`$bias+$frame+(($i+1+0)%16)*$SZ+0`],%l0
	ld	[%sp+`$bias+$frame+(($i+1+0)%16)*$SZ+4`],%l1
___
} if ($SZ==8);

########### common
sub BODY_00_15 {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;

    if ($i<16) {
	&$Xload(@_);
    } else {
	$code.="\tadd\t$h,$T1,$T1\n";
    }

$code.=<<___;
	$SRL	$e,@Sigma1[0],$h	!! $i
	xor	$f,$g,$tmp2
	$SLL	$e,`$SZ*8-@Sigma1[2]`,$tmp1
	and	$e,$tmp2,$tmp2
	$SRL	$e,@Sigma1[1],$tmp0
	xor	$tmp1,$h,$h
	$SLL	$e,`$SZ*8-@Sigma1[1]`,$tmp1
	xor	$tmp0,$h,$h
	$SRL	$e,@Sigma1[2],$tmp0
	xor	$tmp1,$h,$h
	$SLL	$e,`$SZ*8-@Sigma1[0]`,$tmp1
	xor	$tmp0,$h,$h
	xor	$g,$tmp2,$tmp2		! Ch(e,f,g)
	xor	$tmp1,$h,$tmp0		! Sigma1(e)

	$SRL	$a,@Sigma0[0],$h
	add	$tmp2,$T1,$T1
	$LD	[$Ktbl+`$i*$SZ`],$tmp2	! K[$i]
	$SLL	$a,`$SZ*8-@Sigma0[2]`,$tmp1
	add	$tmp0,$T1,$T1
	$SRL	$a,@Sigma0[1],$tmp0
	xor	$tmp1,$h,$h
	$SLL	$a,`$SZ*8-@Sigma0[1]`,$tmp1
	xor	$tmp0,$h,$h
	$SRL	$a,@Sigma0[2],$tmp0
	xor	$tmp1,$h,$h
	$SLL	$a,`$SZ*8-@Sigma0[0]`,$tmp1
	xor	$tmp0,$h,$h
	xor	$tmp1,$h,$h		! Sigma0(a)

	or	$a,$b,$tmp0
	and	$a,$b,$tmp1
	and	$c,$tmp0,$tmp0
	or	$tmp0,$tmp1,$tmp1	! Maj(a,b,c)
	add	$tmp2,$T1,$T1		! +=K[$i]
	add	$tmp1,$h,$h

	add	$T1,$d,$d
	add	$T1,$h,$h
___
}
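
# The block emitted above is the standard FIPS 180-4 round, with Ch
# and Maj computed in their 3-operation forms Ch(e,f,g)=(((f^g)&e)^g)
# and Maj(a,b,c)=((a&b)|(c&(a|b))) rather than the textbook
# (a&b)^(a&c)^(b&c). As a cross-check, a minimal plain-Perl model of
# the 32-bit round (reference only, never called by the generator;
# all names are local to this sketch):
sub _ref_round256 {
	my ($a,$b,$c,$d,$e,$f,$g,$h,$k,$x)=@_;
	my $rotr = sub { my ($v,$n)=@_; (($v>>$n)|($v<<(32-$n))) & 0xffffffff };
	my $T1 = ($h + ($rotr->($e,6)^$rotr->($e,11)^$rotr->($e,25))	# Sigma1(e)
		     + ((($f^$g)&$e)^$g)				# Ch(e,f,g)
		     + $k + $x) & 0xffffffff;
	my $T2 = (($rotr->($a,2)^$rotr->($a,13)^$rotr->($a,22))		# Sigma0(a)
		 + (($a&$b)|($c&($a|$b)))) & 0xffffffff;		# Maj(a,b,c)
	# rotate state just like unshift(@V,pop(@V)) does below
	return (($T1+$T2)&0xffffffff,$a,$b,$c,($d+$T1)&0xffffffff,$e,$f,$g);
}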

########### SHA256
$BODY_16_XX = sub {
my $i=@_[0];
my $xi;

    if ($i&1) {
	$xi=$tmp32;
	$code.="\tsrlx\t@X[(($i+1)/2)%8],32,$xi\n";
    } else {
	$xi=@X[(($i+1)/2)%8];
    }
$code.=<<___;
	srl	$xi,@sigma0[0],$T1		!! Xupdate($i)
	sll	$xi,`32-@sigma0[2]`,$tmp1
	srl	$xi,@sigma0[1],$tmp0
	xor	$tmp1,$T1,$T1
	sll	$tmp1,`@sigma0[2]-@sigma0[1]`,$tmp1
	xor	$tmp0,$T1,$T1
	srl	$xi,@sigma0[2],$tmp0
	xor	$tmp1,$T1,$T1
___
    if ($i&1) {
	$xi=@X[(($i+14)/2)%8];
    } else {
	$xi=$tmp32;
	$code.="\tsrlx\t@X[(($i+14)/2)%8],32,$xi\n";
    }
$code.=<<___;
	srl	$xi,@sigma1[0],$tmp2
	xor	$tmp0,$T1,$T1			! T1=sigma0(X[i+1])
	sll	$xi,`32-@sigma1[2]`,$tmp1
	srl	$xi,@sigma1[1],$tmp0
	xor	$tmp1,$tmp2,$tmp2
	sll	$tmp1,`@sigma1[2]-@sigma1[1]`,$tmp1
	xor	$tmp0,$tmp2,$tmp2
	srl	$xi,@sigma1[2],$tmp0
	xor	$tmp1,$tmp2,$tmp2
___
    if ($i&1) {
	$xi=@X[($i/2)%8];
$code.=<<___;
	srlx	@X[(($i+9)/2)%8],32,$tmp1	! X[i+9]
	xor	$tmp0,$tmp2,$tmp2		! sigma1(X[i+14])
	srl	@X[($i/2)%8],0,$tmp0
	add	$tmp2,$tmp1,$tmp1
	add	$xi,$T1,$T1			! +=X[i]
	xor	$tmp0,@X[($i/2)%8],@X[($i/2)%8]
	add	$tmp1,$T1,$T1

	srl	$T1,0,$T1
	or	$T1,@X[($i/2)%8],@X[($i/2)%8]
___
    } else {
	$xi=@X[(($i+9)/2)%8];
$code.=<<___;
	srlx	@X[($i/2)%8],32,$tmp1		! X[i]
	xor	$tmp0,$tmp2,$tmp2		! sigma1(X[i+14])
	add	$xi,$T1,$T1			! +=X[i+9]
	add	$tmp2,$tmp1,$tmp1
	srl	@X[($i/2)%8],0,@X[($i/2)%8]
	add	$tmp1,$T1,$T1

	sllx	$T1,32,$tmp0
	or	$tmp0,@X[($i/2)%8],@X[($i/2)%8]
___
    }
    &BODY_00_15(@_);
} if ($SZ==4);
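
# Both schedulers implement the FIPS 180-4 message-schedule recurrence
#
#	X[i+16] = sigma1(X[i+14]) + X[i+9] + sigma0(X[i+1]) + X[i]
#
# (the usual W[j] = sigma1(W[j-2]) + W[j-7] + sigma0(W[j-15]) + W[j-16]
# with j=i+16), overwriting X[i] in the 16-entry ring. In the SHA256
# case the ring lives packed in eight 64-bit registers, so every other
# round has to extract or re-insert a 32-bit half with srlx/sllx/or;
# in the SHA512 case it lives on stack and is staged through the
# 32-bit halves in %l0-%l7.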

########### SHA512
$BODY_16_XX = sub {
my $i=@_[0];
my @pair=("%l".eval(($i*2)%8),"%l".eval(($i*2)%8+1));

$code.=<<___;
	sllx	%l2,32,$tmp0		!! Xupdate($i)
	or	%l3,$tmp0,$tmp0

	srlx	$tmp0,@sigma0[0],$T1
	ld	[%sp+`$bias+$frame+(($i+1+1)%16)*$SZ+0`],%l2
	sllx	$tmp0,`64-@sigma0[2]`,$tmp1
	ld	[%sp+`$bias+$frame+(($i+1+1)%16)*$SZ+4`],%l3
	srlx	$tmp0,@sigma0[1],$tmp0
	xor	$tmp1,$T1,$T1
	sllx	$tmp1,`@sigma0[2]-@sigma0[1]`,$tmp1
	xor	$tmp0,$T1,$T1
	srlx	$tmp0,`@sigma0[2]-@sigma0[1]`,$tmp0
	xor	$tmp1,$T1,$T1
	sllx	%l6,32,$tmp2
	xor	$tmp0,$T1,$T1		! sigma0(X[$i+1])
	or	%l7,$tmp2,$tmp2

	srlx	$tmp2,@sigma1[0],$tmp1
	ld	[%sp+`$bias+$frame+(($i+1+14)%16)*$SZ+0`],%l6
	sllx	$tmp2,`64-@sigma1[2]`,$tmp0
	ld	[%sp+`$bias+$frame+(($i+1+14)%16)*$SZ+4`],%l7
	srlx	$tmp2,@sigma1[1],$tmp2
	xor	$tmp0,$tmp1,$tmp1
	sllx	$tmp0,`@sigma1[2]-@sigma1[1]`,$tmp0
	xor	$tmp2,$tmp1,$tmp1
	srlx	$tmp2,`@sigma1[2]-@sigma1[1]`,$tmp2
	xor	$tmp0,$tmp1,$tmp1
	sllx	%l4,32,$tmp0
	xor	$tmp2,$tmp1,$tmp1	! sigma1(X[$i+14])
	ld	[%sp+`$bias+$frame+(($i+1+9)%16)*$SZ+0`],%l4
	or	%l5,$tmp0,$tmp0
	ld	[%sp+`$bias+$frame+(($i+1+9)%16)*$SZ+4`],%l5

	sllx	%l0,32,$tmp2
	add	$tmp1,$T1,$T1
	ld	[%sp+`$bias+$frame+(($i+1+0)%16)*$SZ+0`],%l0
	or	%l1,$tmp2,$tmp2
	add	$tmp0,$T1,$T1		! +=X[$i+9]
	ld	[%sp+`$bias+$frame+(($i+1+0)%16)*$SZ+4`],%l1
	add	$tmp2,$T1,$T1		! +=X[$i]
	$ST	$T1,[%sp+`$bias+$frame+($i%16)*$SZ`]
___
    &BODY_00_15(@_);
} if ($SZ==8);

$code.=<<___ if ($bits==64);
.register	%g2,#scratch
.register	%g3,#scratch
___
$code.=<<___;
#include "sparc_arch.h"

.section	".text",#alloc,#execinstr

.align	64
K${label}:
.type	K${label},#object
___
if ($SZ==4) {
$code.=<<___;
	.long	0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
	.long	0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5
	.long	0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3
	.long	0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174
	.long	0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc
	.long	0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da
	.long	0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7
	.long	0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967
	.long	0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13
	.long	0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85
	.long	0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3
	.long	0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070
	.long	0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5
	.long	0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
	.long	0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
	.long	0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
___
} else {
$code.=<<___;
	.long	0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd
	.long	0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc
	.long	0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019
	.long	0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118
	.long	0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe
	.long	0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2
	.long	0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1
	.long	0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694
	.long	0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3
	.long	0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65
	.long	0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483
	.long	0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5
	.long	0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210
	.long	0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4
	.long	0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725
	.long	0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70
	.long	0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926
	.long	0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df
	.long	0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8
	.long	0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b
	.long	0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001
	.long	0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30
	.long	0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910
	.long	0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8
	.long	0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53
	.long	0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8
	.long	0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb
	.long	0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3
	.long	0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60
	.long	0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec
	.long	0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9
	.long	0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b
	.long	0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207
	.long	0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178
	.long	0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6
	.long	0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b
	.long	0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493
	.long	0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c
	.long	0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a
	.long	0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817
___
}
$code.=<<___;
.size	K${label},.-K${label}

#ifdef __PIC__
SPARC_PIC_THUNK(%g1)
#endif

.globl	sha${label}_block_data_order
.align	32
sha${label}_block_data_order:
	SPARC_LOAD_ADDRESS_LEAF(OPENSSL_sparcv9cap_P,%g1,%g5)
	ld	[%g1+4],%g1		! OPENSSL_sparcv9cap_P[1]

	andcc	%g1, CFR_SHA${label}, %g0
	be	.Lsoftware
	nop
___
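# The hardware path: once the capability word advertises SHA support,
# the context and data are staged through the floating-point register
# file and each block is digested by a single T4 instruction, emitted
# as a raw .word (0x81b02860/0x81b02840) so that no SHA-aware
# assembler is required. Input that is not 8-byte aligned is realigned
# on the fly with alignaddr/faligndata rather than copied.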
$code.=<<___ if ($SZ==8);		# SHA512
	ldd	[%o0 + 0x00], %f0	! load context
	ldd	[%o0 + 0x08], %f2
	ldd	[%o0 + 0x10], %f4
	ldd	[%o0 + 0x18], %f6
	ldd	[%o0 + 0x20], %f8
	ldd	[%o0 + 0x28], %f10
	andcc	%o1, 0x7, %g0
	ldd	[%o0 + 0x30], %f12
	bne,pn	%icc, .Lhwunaligned
	ldd	[%o0 + 0x38], %f14

.Lhwaligned_loop:
	ldd	[%o1 + 0x00], %f16
	ldd	[%o1 + 0x08], %f18
	ldd	[%o1 + 0x10], %f20
	ldd	[%o1 + 0x18], %f22
	ldd	[%o1 + 0x20], %f24
	ldd	[%o1 + 0x28], %f26
	ldd	[%o1 + 0x30], %f28
	ldd	[%o1 + 0x38], %f30
	ldd	[%o1 + 0x40], %f32
	ldd	[%o1 + 0x48], %f34
	ldd	[%o1 + 0x50], %f36
	ldd	[%o1 + 0x58], %f38
	ldd	[%o1 + 0x60], %f40
	ldd	[%o1 + 0x68], %f42
	ldd	[%o1 + 0x70], %f44
	subcc	%o2, 1, %o2		! done yet?
	ldd	[%o1 + 0x78], %f46
	add	%o1, 0x80, %o1

	.word	0x81b02860		! SHA512

	bne,pt	`$bits==64?"%xcc":"%icc"`, .Lhwaligned_loop
	nop

.Lhwfinish:
	std	%f0, [%o0 + 0x00]	! store context
	std	%f2, [%o0 + 0x08]
	std	%f4, [%o0 + 0x10]
	std	%f6, [%o0 + 0x18]
	std	%f8, [%o0 + 0x20]
	std	%f10, [%o0 + 0x28]
	std	%f12, [%o0 + 0x30]
	retl
	std	%f14, [%o0 + 0x38]

.align	16
.Lhwunaligned:
	alignaddr %o1, %g0, %o1

	ldd	[%o1 + 0x00], %f18
.Lhwunaligned_loop:
	ldd	[%o1 + 0x08], %f20
	ldd	[%o1 + 0x10], %f22
	ldd	[%o1 + 0x18], %f24
	ldd	[%o1 + 0x20], %f26
	ldd	[%o1 + 0x28], %f28
	ldd	[%o1 + 0x30], %f30
	ldd	[%o1 + 0x38], %f32
	ldd	[%o1 + 0x40], %f34
	ldd	[%o1 + 0x48], %f36
	ldd	[%o1 + 0x50], %f38
	ldd	[%o1 + 0x58], %f40
	ldd	[%o1 + 0x60], %f42
	ldd	[%o1 + 0x68], %f44
	ldd	[%o1 + 0x70], %f46
	ldd	[%o1 + 0x78], %f48
	subcc	%o2, 1, %o2		! done yet?
	ldd	[%o1 + 0x80], %f50
	add	%o1, 0x80, %o1

	faligndata %f18, %f20, %f16
	faligndata %f20, %f22, %f18
	faligndata %f22, %f24, %f20
	faligndata %f24, %f26, %f22
	faligndata %f26, %f28, %f24
	faligndata %f28, %f30, %f26
	faligndata %f30, %f32, %f28
	faligndata %f32, %f34, %f30
	faligndata %f34, %f36, %f32
	faligndata %f36, %f38, %f34
	faligndata %f38, %f40, %f36
	faligndata %f40, %f42, %f38
	faligndata %f42, %f44, %f40
	faligndata %f44, %f46, %f42
	faligndata %f46, %f48, %f44
	faligndata %f48, %f50, %f46

	.word	0x81b02860		! SHA512

	bne,pt	`$bits==64?"%xcc":"%icc"`, .Lhwunaligned_loop
	for	%f50, %f50, %f18	! %f18=%f50

	ba	.Lhwfinish
	nop
___
$code.=<<___ if ($SZ==4);		# SHA256
	ld	[%o0 + 0x00], %f0
	ld	[%o0 + 0x04], %f1
	ld	[%o0 + 0x08], %f2
	ld	[%o0 + 0x0c], %f3
	ld	[%o0 + 0x10], %f4
	ld	[%o0 + 0x14], %f5
	andcc	%o1, 0x7, %g0
	ld	[%o0 + 0x18], %f6
	bne,pn	%icc, .Lhwunaligned
	ld	[%o0 + 0x1c], %f7

.Lhwloop:
	ldd	[%o1 + 0x00], %f8
	ldd	[%o1 + 0x08], %f10
	ldd	[%o1 + 0x10], %f12
	ldd	[%o1 + 0x18], %f14
	ldd	[%o1 + 0x20], %f16
	ldd	[%o1 + 0x28], %f18
	ldd	[%o1 + 0x30], %f20
	subcc	%o2, 1, %o2		! done yet?
	ldd	[%o1 + 0x38], %f22
	add	%o1, 0x40, %o1

	.word	0x81b02840		! SHA256

	bne,pt	`$bits==64?"%xcc":"%icc"`, .Lhwloop
	nop

.Lhwfinish:
	st	%f0, [%o0 + 0x00]	! store context
	st	%f1, [%o0 + 0x04]
	st	%f2, [%o0 + 0x08]
	st	%f3, [%o0 + 0x0c]
	st	%f4, [%o0 + 0x10]
	st	%f5, [%o0 + 0x14]
	st	%f6, [%o0 + 0x18]
	retl
	st	%f7, [%o0 + 0x1c]

.align	8
.Lhwunaligned:
	alignaddr %o1, %g0, %o1

	ldd	[%o1 + 0x00], %f10
.Lhwunaligned_loop:
	ldd	[%o1 + 0x08], %f12
	ldd	[%o1 + 0x10], %f14
	ldd	[%o1 + 0x18], %f16
	ldd	[%o1 + 0x20], %f18
	ldd	[%o1 + 0x28], %f20
	ldd	[%o1 + 0x30], %f22
	ldd	[%o1 + 0x38], %f24
	subcc	%o2, 1, %o2		! done yet?
	ldd	[%o1 + 0x40], %f26
	add	%o1, 0x40, %o1

	faligndata %f10, %f12, %f8
	faligndata %f12, %f14, %f10
	faligndata %f14, %f16, %f12
	faligndata %f16, %f18, %f14
	faligndata %f18, %f20, %f16
	faligndata %f20, %f22, %f18
	faligndata %f22, %f24, %f20
	faligndata %f24, %f26, %f22

	.word	0x81b02840		! SHA256

	bne,pt	`$bits==64?"%xcc":"%icc"`, .Lhwunaligned_loop
	for	%f26, %f26, %f10	! %f10=%f26

	ba	.Lhwfinish
	nop
___
$code.=<<___;
.align	16
.Lsoftware:
	save	%sp,`-$frame-$locals`,%sp
	and	$inp,`$align-1`,$tmp31
	sllx	$len,`log(16*$SZ)/log(2)`,$len
	andn	$inp,`$align-1`,$inp
	sll	$tmp31,3,$tmp31
	add	$inp,$len,$len
___
$code.=<<___ if ($SZ==8);	# SHA512
	mov	32,$tmp32
	sub	$tmp32,$tmp31,$tmp32
___
$code.=<<___;
.Lpic:	call	.+8
	add	%o7,K${label}-.Lpic,$Ktbl
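	! (the call/add pair materializes the address of K${label}
	! PC-relatively: "call .+8" leaves the address of .Lpic in %o7
	! and falls through via its own delay slot)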

	$LD	[$ctx+`0*$SZ`],$A
	$LD	[$ctx+`1*$SZ`],$B
	$LD	[$ctx+`2*$SZ`],$C
	$LD	[$ctx+`3*$SZ`],$D
	$LD	[$ctx+`4*$SZ`],$E
	$LD	[$ctx+`5*$SZ`],$F
	$LD	[$ctx+`6*$SZ`],$G
	$LD	[$ctx+`7*$SZ`],$H

.Lloop:
___
for ($i=0;$i<16;$i++)	{ &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
$code.=".L16_xx:\n";
for (;$i<32;$i++)	{ &$BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	and	$tmp2,0xfff,$tmp2
	cmp	$tmp2,$lastK
	bne	.L16_xx
	add	$Ktbl,`16*$SZ`,$Ktbl	! Ktbl+=16

___
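# Loop-termination trick: at this point $tmp2 still holds the last
# K[$i] fetched, so its low 12 bits are compared against $lastK, which
# was set above to the low 12 bits of the final round constant
# (0x...4a475817 for SHA512, 0xc67178f2 for SHA256). No separate
# round counter is needed.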
$code.=<<___ if ($SZ==4);	# SHA256
	$LD	[$ctx+`0*$SZ`],@X[0]
	$LD	[$ctx+`1*$SZ`],@X[1]
	$LD	[$ctx+`2*$SZ`],@X[2]
	$LD	[$ctx+`3*$SZ`],@X[3]
	$LD	[$ctx+`4*$SZ`],@X[4]
	$LD	[$ctx+`5*$SZ`],@X[5]
	$LD	[$ctx+`6*$SZ`],@X[6]
	$LD	[$ctx+`7*$SZ`],@X[7]

	add	$A,@X[0],$A
	$ST	$A,[$ctx+`0*$SZ`]
	add	$B,@X[1],$B
	$ST	$B,[$ctx+`1*$SZ`]
	add	$C,@X[2],$C
	$ST	$C,[$ctx+`2*$SZ`]
	add	$D,@X[3],$D
	$ST	$D,[$ctx+`3*$SZ`]
	add	$E,@X[4],$E
	$ST	$E,[$ctx+`4*$SZ`]
	add	$F,@X[5],$F
	$ST	$F,[$ctx+`5*$SZ`]
	add	$G,@X[6],$G
	$ST	$G,[$ctx+`6*$SZ`]
	add	$H,@X[7],$H
	$ST	$H,[$ctx+`7*$SZ`]
___
$code.=<<___ if ($SZ==8);	# SHA512
	ld	[$ctx+`0*$SZ+0`],%l0
	ld	[$ctx+`0*$SZ+4`],%l1
	ld	[$ctx+`1*$SZ+0`],%l2
	ld	[$ctx+`1*$SZ+4`],%l3
	ld	[$ctx+`2*$SZ+0`],%l4
	ld	[$ctx+`2*$SZ+4`],%l5
	ld	[$ctx+`3*$SZ+0`],%l6

	sllx	%l0,32,$tmp0
	ld	[$ctx+`3*$SZ+4`],%l7
	sllx	%l2,32,$tmp1
	or	%l1,$tmp0,$tmp0
	or	%l3,$tmp1,$tmp1
	add	$tmp0,$A,$A
	add	$tmp1,$B,$B
	$ST	$A,[$ctx+`0*$SZ`]
	sllx	%l4,32,$tmp2
	$ST	$B,[$ctx+`1*$SZ`]
	sllx	%l6,32,$T1
	or	%l5,$tmp2,$tmp2
	or	%l7,$T1,$T1
	add	$tmp2,$C,$C
	$ST	$C,[$ctx+`2*$SZ`]
	add	$T1,$D,$D
	$ST	$D,[$ctx+`3*$SZ`]

	ld	[$ctx+`4*$SZ+0`],%l0
	ld	[$ctx+`4*$SZ+4`],%l1
	ld	[$ctx+`5*$SZ+0`],%l2
	ld	[$ctx+`5*$SZ+4`],%l3
	ld	[$ctx+`6*$SZ+0`],%l4
	ld	[$ctx+`6*$SZ+4`],%l5
	ld	[$ctx+`7*$SZ+0`],%l6

	sllx	%l0,32,$tmp0
	ld	[$ctx+`7*$SZ+4`],%l7
	sllx	%l2,32,$tmp1
	or	%l1,$tmp0,$tmp0
	or	%l3,$tmp1,$tmp1
	add	$tmp0,$E,$E
	add	$tmp1,$F,$F
	$ST	$E,[$ctx+`4*$SZ`]
	sllx	%l4,32,$tmp2
	$ST	$F,[$ctx+`5*$SZ`]
	sllx	%l6,32,$T1
	or	%l5,$tmp2,$tmp2
	or	%l7,$T1,$T1
	add	$tmp2,$G,$G
	$ST	$G,[$ctx+`6*$SZ`]
	add	$T1,$H,$H
	$ST	$H,[$ctx+`7*$SZ`]
___
$code.=<<___;
	add	$inp,`16*$SZ`,$inp		! advance inp
	cmp	$inp,$len
	bne	`$bits==64?"%xcc":"%icc"`,.Lloop
	sub	$Ktbl,`($rounds-16)*$SZ`,$Ktbl	! rewind Ktbl

	ret
	restore
.type	sha${label}_block_data_order,#function
.size	sha${label}_block_data_order,(.-sha${label}_block_data_order)
.asciz	"SHA${label} block transform for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
.align	4
___

# The purpose of these subroutines is to explicitly encode VIS
# instructions, so that one can compile the module without having to
# specify VIS extensions on the compiler command line, e.g.
# -xarch=v9 vs. -xarch=v9a. The idea is to preserve the option of
# producing a "universal" binary and let the programmer detect at
# run-time whether the current CPU is VIS-capable.
sub unvis {
my ($mnemonic,$rs1,$rs2,$rd)=@_;
my ($ref,$opf);		# was "my $ref,$opf;", which left $opf undeclared
my %visopf = (	"faligndata"	=> 0x048,
		"for"		=> 0x07c	);

    $ref = "$mnemonic\t$rs1,$rs2,$rd";

    if ($opf=$visopf{$mnemonic}) {
	foreach ($rs1,$rs2,$rd) {
	    return $ref if (!/%f([0-9]{1,2})/);
	    $_=$1;
	    if ($1>=32) {
		return $ref if ($1&1);
		# re-encode for upper double register addressing
		$_=($1|$1>>5)&31;
	    }
	}

	return	sprintf ".word\t0x%08x !%s",
			0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
			$ref;
    } else {
	return $ref;
    }
}
sub unalignaddr {
my ($mnemonic,$rs1,$rs2,$rd)=@_;
my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
my $ref="$mnemonic\t$rs1,$rs2,$rd";

    foreach ($rs1,$rs2,$rd) {
	if (/%([goli])([0-7])/)	{ $_=$bias{$1}+$2; }
	else			{ return $ref; }
    }
    return  sprintf ".word\t0x%08x !%s",
		    0x81b00300|$rd<<25|$rs1<<14|$rs2,
		    $ref;
}
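
# For example, applying the formulas above (values shown purely for
# illustration):
#
#	faligndata %f18,%f20,%f16 -> .word 0xa1b48914
#	    [0x81b00000 | 16<<25 | 18<<14 | 0x048<<5 | 20]
#	alignaddr  %o1,%g0,%o1    -> .word 0x93b24300
#	    [0x81b00300 |  9<<25 |  9<<14 | 0]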

foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/ge;

	s/\b(f[^\s]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/
		&unvis($1,$2,$3,$4)
	 /ge;
	s/\b(alignaddr)\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/
		&unalignaddr($1,$2,$3,$4)
	 /ge;

	print $_,"\n";
}

close STDOUT;