]> git.ipfire.org Git - thirdparty/openssl.git/blob - crypto/sha/asm/sha512-sparcv9.pl
7f57010fc5afd70f0eb82c7ad5fca238a11b7c6f
[thirdparty/openssl.git] / crypto / sha / asm / sha512-sparcv9.pl
1 #! /usr/bin/env perl
2 # Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved.
3 #
4 # Licensed under the Apache License 2.0 (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
8
9
10 # ====================================================================
11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12 # project. The module is, however, dual licensed under OpenSSL and
13 # CRYPTOGAMS licenses depending on where you obtain it. For further
14 # details see http://www.openssl.org/~appro/cryptogams/.
15 #
16 # Hardware SPARC T4 support by David S. Miller
17 # ====================================================================
18
19 # SHA256 performance improvement over compiler generated code varies
20 # from 40% for Sun C [32-bit build] to 70% for gcc [3.3, 64-bit
21 # build]. Just like in SHA1 module I aim to ensure scalability on
22 # UltraSPARC T1 by packing X[16] to 8 64-bit registers.
23
24 # SHA512 on pre-T1 UltraSPARC.
25 #
26 # Performance is >75% better than 64-bit code generated by Sun C and
27 # over 2x than 32-bit code. X[16] resides on stack, but access to it
28 # is scheduled for L2 latency and staged through 32 least significant
29 # bits of %l0-%l7. The latter is done to achieve 32-/64-bit ABI
30 # duality. Nevetheless it's ~40% faster than SHA256, which is pretty
31 # good [optimal coefficient is 50%].
32 #
33 # SHA512 on UltraSPARC T1.
34 #
35 # It's not any faster than 64-bit code generated by Sun C 5.8. This is
36 # because 64-bit code generator has the advantage of using 64-bit
37 # loads(*) to access X[16], which I consciously traded for 32-/64-bit
38 # ABI duality [as per above]. But it surpasses 32-bit Sun C generated
39 # code by 60%, not to mention that it doesn't suffer from severe decay
40 # when running 4 times physical cores threads and that it leaves gcc
41 # [3.4] behind by over 4x factor! If compared to SHA256, single thread
42 # performance is only 10% better, but overall throughput for maximum
43 # amount of threads for given CPU exceeds corresponding one of SHA256
44 # by 30% [again, optimal coefficient is 50%].
45 #
46 # (*) Unlike pre-T1 UltraSPARC loads on T1 are executed strictly
47 # in-order, i.e. load instruction has to complete prior next
48 # instruction in given thread is executed, even if the latter is
49 # not dependent on load result! This means that on T1 two 32-bit
50 # loads are always slower than one 64-bit load. Once again this
51 # is unlike pre-T1 UltraSPARC, where, if scheduled appropriately,
52 # 2x32-bit loads can be as fast as 1x64-bit ones.
53 #
54 # SPARC T4 SHA256/512 hardware achieves 3.17/2.01 cycles per byte,
55 # which is 9.3x/11.1x faster than software. Multi-process benchmark
56 # saturates at 11.5x single-process result on 8-core processor, or
57 # ~11/16GBps per 2.85GHz socket.
58
# Output file name is the last command-line argument; it also selects the
# SHA256 vs SHA512 flavour (see the /512/ test below).
# Was: open STDOUT,">$output";  -- two-arg, unchecked open; fail here loudly
# instead of silently generating assembly into a closed handle.
$output=pop;
open STDOUT, '>', $output or die "can't open $output: $!";
61
# Per-flavour configuration.  The same source serves both SHA256 and
# SHA512; word size, memory ops, shift mnemonics, rotation counts and
# register assignment all key off the output file name.
if ($output =~ /512/) {
	$label="512";
	$SZ=8;			# bytes per message word
	$LD="ldx";	# load from memory
	$ST="stx";	# store to memory
	$SLL="sllx";	# shift left logical
	$SRL="srlx";	# shift right logical
	@Sigma0=(28,34,39);	# Sigma0 rotation counts (on a)
	@Sigma1=(14,18,41);	# Sigma1 rotation counts (on e)
	@sigma0=( 7, 1, 8);	# right shift first
	@sigma1=( 6,19,61);	# right shift first
	$lastK=0x817;		# low 12 bits of the last K[] constant; marks final round
	$rounds=80;
	$align=4;		# input is consumed with 32-bit "ld", so 4-byte granularity

	$locals=16*$SZ;	# X[16] lives on the stack in this flavour

	# working variables a-h
	$A="%o0";
	$B="%o1";
	$C="%o2";
	$D="%o3";
	$E="%o4";
	$F="%o5";
	$G="%g1";
	$H="%o7";
	@V=($A,$B,$C,$D,$E,$F,$G,$H);
} else {
	$label="256";
	$SZ=4;			# bytes per message word
	$LD="ld";	# load from memory
	$ST="st";	# store to memory
	$SLL="sll";	# shift left logical
	$SRL="srl";	# shift right logical
	@Sigma0=( 2,13,22);	# Sigma0 rotation counts (on a)
	@Sigma1=( 6,11,25);	# Sigma1 rotation counts (on e)
	@sigma0=( 3, 7,18);	# right shift first
	@sigma1=(10,17,19);	# right shift first
	$lastK=0x8f2;		# low 12 bits of the last K[] constant; marks final round
	$rounds=64;
	$align=8;		# input is consumed with 64-bit "ldx"

	$locals=0;		# X[16] is register resident
	# sixteen 32-bit words packed two-per-register
	@X=("%o0","%o1","%o2","%o3","%o4","%o5","%g1","%o7");

	# working variables a-h
	$A="%l0";
	$B="%l1";
	$C="%l2";
	$D="%l3";
	$E="%l4";
	$F="%l5";
	$G="%l6";
	$H="%l7";
	@V=($A,$B,$C,$D,$E,$F,$G,$H);
}
$T1="%g2";	# round temporary T1
$tmp0="%g3";	# scratch
$tmp1="%g4";	# scratch
$tmp2="%g5";	# scratch

$ctx="%i0";	# arg 0: hash context
$inp="%i1";	# arg 1: input pointer
$len="%i2";	# arg 2: number of 16*$SZ-byte blocks
$Ktbl="%i3";	# pointer into K[] constant table
$tmp31="%i4";	# input misalignment in bits (misalignment*8)
$tmp32="%i5";	# complementary shift count
127
128 ########### SHA256
# $Xload (SHA256 flavour): emit code that makes message word X[$i]
# available and starts T1 = h + X[i].  The sixteen 32-bit words are
# packed two-per-register into @X[0..7]; on round 0 the whole 64-byte
# block is fetched with 64-bit ldx, with a shift/merge fix-up path for
# input that is not 8-byte aligned ($tmp31 = misalignment*8,
# $tmp32 = -$tmp31 set up below).
$Xload = sub {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;

    if ($i==0) {
	$code.=<<___;
	ldx	[$inp+0],@X[0]
	ldx	[$inp+16],@X[2]
	ldx	[$inp+32],@X[4]
	ldx	[$inp+48],@X[6]
	ldx	[$inp+8],@X[1]
	ldx	[$inp+24],@X[3]
	subcc	%g0,$tmp31,$tmp32 ! should be 64-$tmp31, but -$tmp31 works too
	ldx	[$inp+40],@X[5]
	bz,pt	%icc,.Laligned
	ldx	[$inp+56],@X[7]

	sllx	@X[0],$tmp31,@X[0]
	ldx	[$inp+64],$T1
___
	# unaligned input: shift-and-merge every adjacent register pair
	# (plus the spill-over word in $T1) into properly aligned words
	for($j=0;$j<7;$j++)
	{   $code.=<<___;
	srlx	@X[$j+1],$tmp32,$tmp1
	sllx	@X[$j+1],$tmp31,@X[$j+1]
	or	$tmp1,@X[$j],@X[$j]
___
	}
	$code.=<<___;
	srlx	$T1,$tmp32,$T1
	or	$T1,@X[7],@X[7]
.Laligned:
___
    }

    # X[i] occupies the high half of @X[i/2] for even i, the low half
    # for odd i
    if ($i&1) {
	$code.="\tadd\t@X[$i/2],$h,$T1\n";
    } else {
	$code.="\tsrlx\t@X[$i/2],32,$T1\n\tadd\t$h,$T1,$T1\n";
    }
} if ($SZ==4);
168
169 ########### SHA512
# $Xload (SHA512 flavour): emit code that assembles 64-bit X[$i] from
# 32-bit halves staged through %l0-%l7 (32-/64-bit ABI duality, see the
# header comment), starts T1 = h + X[i], and spills X[i] to the stack
# frame (X[16] is memory resident in this flavour).  The sllx/srlx
# merge with $tmp31 (= misalignment*8) and $tmp32 (= 32-$tmp31, set up
# in the software prologue) handles arbitrarily aligned input; loads
# for later rounds are interleaved to hide latency.
$Xload = sub {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
my @pair=("%l".eval(($i*2)%8),"%l".eval(($i*2)%8+1),"%l".eval((($i+1)*2)%8));

	# round 0: prime %l0-%l7 with the first eight 32-bit input words
	# and set condition codes on the misalignment value
	$code.=<<___ if ($i==0);
	ld	[$inp+0],%l0
	ld	[$inp+4],%l1
	ld	[$inp+8],%l2
	ld	[$inp+12],%l3
	ld	[$inp+16],%l4
	ld	[$inp+20],%l5
	ld	[$inp+24],%l6
	cmp	$tmp31,0
	ld	[$inp+28],%l7
___
	# rounds 0-14: merge three staged halves into X[i]; while input
	# remains ($i<12), refill the staging registers ~4 rounds ahead
	$code.=<<___ if ($i<15);
	sllx	@pair[1],$tmp31,$tmp2	! Xload($i)
	add	$tmp31,32,$tmp0
	sllx	@pair[0],$tmp0,$tmp1
	`"ld	[$inp+".eval(32+0+$i*8)."],@pair[0]"	if ($i<12)`
	srlx	@pair[2],$tmp32,@pair[1]
	or	$tmp1,$tmp2,$tmp2
	or	@pair[1],$tmp2,$tmp2
	`"ld	[$inp+".eval(32+4+$i*8)."],@pair[1]"	if ($i<12)`
	add	$h,$tmp2,$T1
	$ST	$tmp2,[%sp+STACK_BIAS+STACK_FRAME+`$i*$SZ`]
___
	# round 12: if input was unaligned (cc still set from round 0),
	# annulled-load the word that spills past the 128-byte block
	$code.=<<___ if ($i==12);
	bnz,a,pn	%icc,.+8
	ld	[$inp+128],%l0
___
	# round 15: last Xload, interleaved with refilling %l0-%l7 from
	# the stacked X[] entries the first Xupdate round will consume
	$code.=<<___ if ($i==15);
	ld	[%sp+STACK_BIAS+STACK_FRAME+`(($i+1+1)%16)*$SZ+0`],%l2
	sllx	@pair[1],$tmp31,$tmp2	! Xload($i)
	add	$tmp31,32,$tmp0
	ld	[%sp+STACK_BIAS+STACK_FRAME+`(($i+1+1)%16)*$SZ+4`],%l3
	sllx	@pair[0],$tmp0,$tmp1
	ld	[%sp+STACK_BIAS+STACK_FRAME+`(($i+1+9)%16)*$SZ+0`],%l4
	srlx	@pair[2],$tmp32,@pair[1]
	or	$tmp1,$tmp2,$tmp2
	ld	[%sp+STACK_BIAS+STACK_FRAME+`(($i+1+9)%16)*$SZ+4`],%l5
	or	@pair[1],$tmp2,$tmp2
	ld	[%sp+STACK_BIAS+STACK_FRAME+`(($i+1+14)%16)*$SZ+0`],%l6
	add	$h,$tmp2,$T1
	$ST	$tmp2,[%sp+STACK_BIAS+STACK_FRAME+`$i*$SZ`]
	ld	[%sp+STACK_BIAS+STACK_FRAME+`(($i+1+14)%16)*$SZ+4`],%l7
	ld	[%sp+STACK_BIAS+STACK_FRAME+`(($i+1+0)%16)*$SZ+0`],%l0
	ld	[%sp+STACK_BIAS+STACK_FRAME+`(($i+1+0)%16)*$SZ+4`],%l1
___
} if ($SZ==8);
220
221 ########### common
# BODY_00_15: emit one round of the compression function:
#	T1 += Ch(e,f,g) + Sigma1(e) + K[i]   (h was folded in by Xload/Xupdate)
#	h   = T1 + Sigma0(a) + Maj(a,b,c)
#	d  += T1
# Rotations are composed from $SRL/$SLL shift pairs, so the same
# template serves both 32-bit (SHA256) and 64-bit (SHA512) words.
sub BODY_00_15 {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;

    if ($i<16) {
	&$Xload(@_);	# rounds 0-15: load X[i], leaves h+X[i] in T1
    } else {
	$code.="\tadd\t$h,$T1,$T1\n";	# rounds 16+: Xupdate left X[i] in T1
    }

    $code.=<<___;
	$SRL	$e,@Sigma1[0],$h	!! $i
	xor	$f,$g,$tmp2
	$SLL	$e,`$SZ*8-@Sigma1[2]`,$tmp1
	and	$e,$tmp2,$tmp2
	$SRL	$e,@Sigma1[1],$tmp0
	xor	$tmp1,$h,$h
	$SLL	$e,`$SZ*8-@Sigma1[1]`,$tmp1
	xor	$tmp0,$h,$h
	$SRL	$e,@Sigma1[2],$tmp0
	xor	$tmp1,$h,$h
	$SLL	$e,`$SZ*8-@Sigma1[0]`,$tmp1
	xor	$tmp0,$h,$h
	xor	$g,$tmp2,$tmp2		! Ch(e,f,g)
	xor	$tmp1,$h,$tmp0		! Sigma1(e)

	$SRL	$a,@Sigma0[0],$h
	add	$tmp2,$T1,$T1
	$LD	[$Ktbl+`$i*$SZ`],$tmp2	! K[$i]
	$SLL	$a,`$SZ*8-@Sigma0[2]`,$tmp1
	add	$tmp0,$T1,$T1
	$SRL	$a,@Sigma0[1],$tmp0
	xor	$tmp1,$h,$h
	$SLL	$a,`$SZ*8-@Sigma0[1]`,$tmp1
	xor	$tmp0,$h,$h
	$SRL	$a,@Sigma0[2],$tmp0
	xor	$tmp1,$h,$h
	$SLL	$a,`$SZ*8-@Sigma0[0]`,$tmp1
	xor	$tmp0,$h,$h
	xor	$tmp1,$h,$h		! Sigma0(a)

	or	$a,$b,$tmp0
	and	$a,$b,$tmp1
	and	$c,$tmp0,$tmp0
	or	$tmp0,$tmp1,$tmp1	! Maj(a,b,c)
	add	$tmp2,$T1,$T1		! +=K[$i]
	add	$tmp1,$h,$h

	add	$T1,$d,$d
	add	$T1,$h,$h
___
}
273
274 ########### SHA256
# $BODY_16_XX (SHA256 flavour): emit the message schedule update
#	X[i] = sigma1(X[i+14]) + X[i+9] + sigma0(X[i+1]) + X[i]
# and then a regular round via BODY_00_15 (which adds h to T1).
# X[] is packed two 32-bit words per register, so even/odd rounds
# differ in which half of @X[...] each operand comes from; $tmp32
# receives halves extracted with srlx.
$BODY_16_XX = sub {
my $i=@_[0];
my $xi;

    # $xi = X[i+1], the sigma0() operand
    if ($i&1) {
	$xi=$tmp32;
	$code.="\tsrlx\t@X[(($i+1)/2)%8],32,$xi\n";
    } else {
	$xi=@X[(($i+1)/2)%8];
    }
    $code.=<<___;
	srl	$xi,@sigma0[0],$T1		!! Xupdate($i)
	sll	$xi,`32-@sigma0[2]`,$tmp1
	srl	$xi,@sigma0[1],$tmp0
	xor	$tmp1,$T1,$T1
	sll	$tmp1,`@sigma0[2]-@sigma0[1]`,$tmp1
	xor	$tmp0,$T1,$T1
	srl	$xi,@sigma0[2],$tmp0
	xor	$tmp1,$T1,$T1
___
    # $xi = X[i+14], the sigma1() operand
    if ($i&1) {
	$xi=@X[(($i+14)/2)%8];
    } else {
	$xi=$tmp32;
	$code.="\tsrlx\t@X[(($i+14)/2)%8],32,$xi\n";
    }
    $code.=<<___;
	srl	$xi,@sigma1[0],$tmp2
	xor	$tmp0,$T1,$T1			! T1=sigma0(X[i+1])
	sll	$xi,`32-@sigma1[2]`,$tmp1
	srl	$xi,@sigma1[1],$tmp0
	xor	$tmp1,$tmp2,$tmp2
	sll	$tmp1,`@sigma1[2]-@sigma1[1]`,$tmp1
	xor	$tmp0,$tmp2,$tmp2
	srl	$xi,@sigma1[2],$tmp0
	xor	$tmp1,$tmp2,$tmp2
___
    # accumulate X[i] (odd: low half) or X[i+9] (even case) plus the
    # sigma sums into T1, then write the new X[i] back into the proper
    # half of its packed register
    if ($i&1) {
	$xi=@X[($i/2)%8];
	$code.=<<___;
	srlx	@X[(($i+9)/2)%8],32,$tmp1	! X[i+9]
	xor	$tmp0,$tmp2,$tmp2		! sigma1(X[i+14])
	srl	@X[($i/2)%8],0,$tmp0
	add	$tmp2,$tmp1,$tmp1
	add	$xi,$T1,$T1			! +=X[i]
	xor	$tmp0,@X[($i/2)%8],@X[($i/2)%8]
	add	$tmp1,$T1,$T1

	srl	$T1,0,$T1
	or	$T1,@X[($i/2)%8],@X[($i/2)%8]
___
    } else {
	$xi=@X[(($i+9)/2)%8];
	$code.=<<___;
	srlx	@X[($i/2)%8],32,$tmp1		! X[i]
	xor	$tmp0,$tmp2,$tmp2		! sigma1(X[i+14])
	add	$xi,$T1,$T1			! +=X[i+9]
	add	$tmp2,$tmp1,$tmp1
	srl	@X[($i/2)%8],0,@X[($i/2)%8]
	add	$tmp1,$T1,$T1

	sllx	$T1,32,$tmp0
	or	$tmp0,@X[($i/2)%8],@X[($i/2)%8]
___
    }
    &BODY_00_15(@_);
} if ($SZ==4);
342
343 ########### SHA512
# $BODY_16_XX (SHA512 flavour): emit the message schedule update
#	X[i] = sigma1(X[i+14]) + X[i+9] + sigma0(X[i+1]) + X[i]
# working on 64-bit values assembled from the 32-bit halves staged in
# %l0-%l7; loads refilling the staging registers for the NEXT round are
# interleaved throughout.  The new X[i] is spilled to the stack and
# left in T1 for BODY_00_15 (which adds h).
$BODY_16_XX = sub {
my $i=@_[0];
my @pair=("%l".eval(($i*2)%8),"%l".eval(($i*2)%8+1));

	$code.=<<___;
	sllx	%l2,32,$tmp0		!! Xupdate($i)
	or	%l3,$tmp0,$tmp0

	srlx	$tmp0,@sigma0[0],$T1
	ld	[%sp+STACK_BIAS+STACK_FRAME+`(($i+1+1)%16)*$SZ+0`],%l2
	sllx	$tmp0,`64-@sigma0[2]`,$tmp1
	ld	[%sp+STACK_BIAS+STACK_FRAME+`(($i+1+1)%16)*$SZ+4`],%l3
	srlx	$tmp0,@sigma0[1],$tmp0
	xor	$tmp1,$T1,$T1
	sllx	$tmp1,`@sigma0[2]-@sigma0[1]`,$tmp1
	xor	$tmp0,$T1,$T1
	srlx	$tmp0,`@sigma0[2]-@sigma0[1]`,$tmp0
	xor	$tmp1,$T1,$T1
	sllx	%l6,32,$tmp2
	xor	$tmp0,$T1,$T1		! sigma0(X[$i+1])
	or	%l7,$tmp2,$tmp2

	srlx	$tmp2,@sigma1[0],$tmp1
	ld	[%sp+STACK_BIAS+STACK_FRAME+`(($i+1+14)%16)*$SZ+0`],%l6
	sllx	$tmp2,`64-@sigma1[2]`,$tmp0
	ld	[%sp+STACK_BIAS+STACK_FRAME+`(($i+1+14)%16)*$SZ+4`],%l7
	srlx	$tmp2,@sigma1[1],$tmp2
	xor	$tmp0,$tmp1,$tmp1
	sllx	$tmp0,`@sigma1[2]-@sigma1[1]`,$tmp0
	xor	$tmp2,$tmp1,$tmp1
	srlx	$tmp2,`@sigma1[2]-@sigma1[1]`,$tmp2
	xor	$tmp0,$tmp1,$tmp1
	sllx	%l4,32,$tmp0
	xor	$tmp2,$tmp1,$tmp1	! sigma1(X[$i+14])
	ld	[%sp+STACK_BIAS+STACK_FRAME+`(($i+1+9)%16)*$SZ+0`],%l4
	or	%l5,$tmp0,$tmp0
	ld	[%sp+STACK_BIAS+STACK_FRAME+`(($i+1+9)%16)*$SZ+4`],%l5

	sllx	%l0,32,$tmp2
	add	$tmp1,$T1,$T1
	ld	[%sp+STACK_BIAS+STACK_FRAME+`(($i+1+0)%16)*$SZ+0`],%l0
	or	%l1,$tmp2,$tmp2
	add	$tmp0,$T1,$T1		! +=X[$i+9]
	ld	[%sp+STACK_BIAS+STACK_FRAME+`(($i+1+0)%16)*$SZ+4`],%l1
	add	$tmp2,$T1,$T1		! +=X[$i]
	$ST	$T1,[%sp+STACK_BIAS+STACK_FRAME+`($i%16)*$SZ`]
___
	&BODY_00_15(@_);
} if ($SZ==8);
393
# Emit the file prologue and the round-constant table K[0..$rounds-1]
# for the selected flavour.
$code.=<<___;
#include "sparc_arch.h"

#ifdef __arch64__
.register	%g2,#scratch
.register	%g3,#scratch
#endif

.section	".text",#alloc,#execinstr

.align	64
K${label}:
.type	K${label},#object
___
if ($SZ==4) {
# SHA256: 64 32-bit constants
$code.=<<___;
	.long	0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
	.long	0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5
	.long	0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3
	.long	0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174
	.long	0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc
	.long	0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da
	.long	0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7
	.long	0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967
	.long	0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13
	.long	0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85
	.long	0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3
	.long	0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070
	.long	0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5
	.long	0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
	.long	0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
	.long	0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
___
} else {
# SHA512: 80 64-bit constants, each emitted as hi,lo 32-bit pair
$code.=<<___;
	.long	0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd
	.long	0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc
	.long	0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019
	.long	0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118
	.long	0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe
	.long	0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2
	.long	0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1
	.long	0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694
	.long	0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3
	.long	0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65
	.long	0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483
	.long	0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5
	.long	0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210
	.long	0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4
	.long	0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725
	.long	0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70
	.long	0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926
	.long	0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df
	.long	0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8
	.long	0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b
	.long	0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001
	.long	0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30
	.long	0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910
	.long	0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8
	.long	0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53
	.long	0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8
	.long	0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb
	.long	0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3
	.long	0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60
	.long	0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec
	.long	0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9
	.long	0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b
	.long	0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207
	.long	0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178
	.long	0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6
	.long	0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b
	.long	0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493
	.long	0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c
	.long	0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a
	.long	0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817
___
}
# Function entry: probe OPENSSL_sparcv9cap_P[1] for the SHA opcode
# capability bit and branch to the software path if absent.
$code.=<<___;
.size	K${label},.-K${label}

#ifdef __PIC__
SPARC_PIC_THUNK(%g1)
#endif

.globl	sha${label}_block_data_order
.align	32
sha${label}_block_data_order:
	SPARC_LOAD_ADDRESS_LEAF(OPENSSL_sparcv9cap_P,%g1,%g5)
	ld	[%g1+4],%g1		! OPENSSL_sparcv9cap_P[1]

	andcc	%g1, CFR_SHA${label}, %g0
	be	.Lsoftware
	nop
___
# SPARC T4 hardware path (SHA512 flavour): hash state in %f0-%f14,
# 128-byte input blocks in %f16-%f46, hash step via the SHA512 opcode
# (emitted as .word, see unvis() comments below).  Input that is not
# 8-byte aligned is realigned with alignaddr/faligndata instead.
$code.=<<___ if ($SZ==8); # SHA512
	ldd	[%o0 + 0x00], %f0	! load context
	ldd	[%o0 + 0x08], %f2
	ldd	[%o0 + 0x10], %f4
	ldd	[%o0 + 0x18], %f6
	ldd	[%o0 + 0x20], %f8
	ldd	[%o0 + 0x28], %f10
	andcc	%o1, 0x7, %g0
	ldd	[%o0 + 0x30], %f12
	bne,pn	%icc, .Lhwunaligned
	ldd	[%o0 + 0x38], %f14

.Lhwaligned_loop:
	ldd	[%o1 + 0x00], %f16
	ldd	[%o1 + 0x08], %f18
	ldd	[%o1 + 0x10], %f20
	ldd	[%o1 + 0x18], %f22
	ldd	[%o1 + 0x20], %f24
	ldd	[%o1 + 0x28], %f26
	ldd	[%o1 + 0x30], %f28
	ldd	[%o1 + 0x38], %f30
	ldd	[%o1 + 0x40], %f32
	ldd	[%o1 + 0x48], %f34
	ldd	[%o1 + 0x50], %f36
	ldd	[%o1 + 0x58], %f38
	ldd	[%o1 + 0x60], %f40
	ldd	[%o1 + 0x68], %f42
	ldd	[%o1 + 0x70], %f44
	subcc	%o2, 1, %o2		! done yet?
	ldd	[%o1 + 0x78], %f46
	add	%o1, 0x80, %o1
	prefetch [%o1 + 63], 20
	prefetch [%o1 + 64+63], 20

	.word	0x81b02860		! SHA512

	bne,pt	SIZE_T_CC, .Lhwaligned_loop
	nop

.Lhwfinish:
	std	%f0, [%o0 + 0x00]	! store context
	std	%f2, [%o0 + 0x08]
	std	%f4, [%o0 + 0x10]
	std	%f6, [%o0 + 0x18]
	std	%f8, [%o0 + 0x20]
	std	%f10, [%o0 + 0x28]
	std	%f12, [%o0 + 0x30]
	retl
	std	%f14, [%o0 + 0x38]

.align	16
.Lhwunaligned:
	alignaddr %o1, %g0, %o1

	ldd	[%o1 + 0x00], %f18
.Lhwunaligned_loop:
	ldd	[%o1 + 0x08], %f20
	ldd	[%o1 + 0x10], %f22
	ldd	[%o1 + 0x18], %f24
	ldd	[%o1 + 0x20], %f26
	ldd	[%o1 + 0x28], %f28
	ldd	[%o1 + 0x30], %f30
	ldd	[%o1 + 0x38], %f32
	ldd	[%o1 + 0x40], %f34
	ldd	[%o1 + 0x48], %f36
	ldd	[%o1 + 0x50], %f38
	ldd	[%o1 + 0x58], %f40
	ldd	[%o1 + 0x60], %f42
	ldd	[%o1 + 0x68], %f44
	ldd	[%o1 + 0x70], %f46
	ldd	[%o1 + 0x78], %f48
	subcc	%o2, 1, %o2		! done yet?
	ldd	[%o1 + 0x80], %f50
	add	%o1, 0x80, %o1
	prefetch [%o1 + 63], 20
	prefetch [%o1 + 64+63], 20

	faligndata %f18, %f20, %f16
	faligndata %f20, %f22, %f18
	faligndata %f22, %f24, %f20
	faligndata %f24, %f26, %f22
	faligndata %f26, %f28, %f24
	faligndata %f28, %f30, %f26
	faligndata %f30, %f32, %f28
	faligndata %f32, %f34, %f30
	faligndata %f34, %f36, %f32
	faligndata %f36, %f38, %f34
	faligndata %f38, %f40, %f36
	faligndata %f40, %f42, %f38
	faligndata %f42, %f44, %f40
	faligndata %f44, %f46, %f42
	faligndata %f46, %f48, %f44
	faligndata %f48, %f50, %f46

	.word	0x81b02860		! SHA512

	bne,pt	SIZE_T_CC, .Lhwunaligned_loop
	for	%f50, %f50, %f18	! %f18=%f50

	ba	.Lhwfinish
	nop
___
# SPARC T4 hardware path (SHA256 flavour): hash state in %f0-%f7 (as
# eight single registers), 64-byte input blocks in %f8-%f22, hash step
# via the SHA256 opcode emitted as .word; misaligned input is realigned
# with alignaddr/faligndata.
$code.=<<___ if ($SZ==4); # SHA256
	ld	[%o0 + 0x00], %f0
	ld	[%o0 + 0x04], %f1
	ld	[%o0 + 0x08], %f2
	ld	[%o0 + 0x0c], %f3
	ld	[%o0 + 0x10], %f4
	ld	[%o0 + 0x14], %f5
	andcc	%o1, 0x7, %g0
	ld	[%o0 + 0x18], %f6
	bne,pn	%icc, .Lhwunaligned
	ld	[%o0 + 0x1c], %f7

.Lhwloop:
	ldd	[%o1 + 0x00], %f8
	ldd	[%o1 + 0x08], %f10
	ldd	[%o1 + 0x10], %f12
	ldd	[%o1 + 0x18], %f14
	ldd	[%o1 + 0x20], %f16
	ldd	[%o1 + 0x28], %f18
	ldd	[%o1 + 0x30], %f20
	subcc	%o2, 1, %o2		! done yet?
	ldd	[%o1 + 0x38], %f22
	add	%o1, 0x40, %o1
	prefetch [%o1 + 63], 20

	.word	0x81b02840		! SHA256

	bne,pt	SIZE_T_CC, .Lhwloop
	nop

.Lhwfinish:
	st	%f0, [%o0 + 0x00]	! store context
	st	%f1, [%o0 + 0x04]
	st	%f2, [%o0 + 0x08]
	st	%f3, [%o0 + 0x0c]
	st	%f4, [%o0 + 0x10]
	st	%f5, [%o0 + 0x14]
	st	%f6, [%o0 + 0x18]
	retl
	st	%f7, [%o0 + 0x1c]

.align	8
.Lhwunaligned:
	alignaddr %o1, %g0, %o1

	ldd	[%o1 + 0x00], %f10
.Lhwunaligned_loop:
	ldd	[%o1 + 0x08], %f12
	ldd	[%o1 + 0x10], %f14
	ldd	[%o1 + 0x18], %f16
	ldd	[%o1 + 0x20], %f18
	ldd	[%o1 + 0x28], %f20
	ldd	[%o1 + 0x30], %f22
	ldd	[%o1 + 0x38], %f24
	subcc	%o2, 1, %o2		! done yet?
	ldd	[%o1 + 0x40], %f26
	add	%o1, 0x40, %o1
	prefetch [%o1 + 63], 20

	faligndata %f10, %f12, %f8
	faligndata %f12, %f14, %f10
	faligndata %f14, %f16, %f12
	faligndata %f16, %f18, %f14
	faligndata %f18, %f20, %f16
	faligndata %f20, %f22, %f18
	faligndata %f22, %f24, %f20
	faligndata %f24, %f26, %f22

	.word	0x81b02840		! SHA256

	bne,pt	SIZE_T_CC, .Lhwunaligned_loop
	for	%f26, %f26, %f10	! %f10=%f26

	ba	.Lhwfinish
	nop
___
# Software path prologue: allocate a register window (plus X[16] stack
# space for SHA512), compute $tmp31 = input misalignment in bits, round
# $inp down to an aligned address, and turn the block count in $len
# into an end-of-input pointer.
$code.=<<___;
.align	16
.Lsoftware:
	save	%sp,-STACK_FRAME-$locals,%sp
	and	$inp,`$align-1`,$tmp31
	sllx	$len,`log(16*$SZ)/log(2)`,$len
	andn	$inp,`$align-1`,$inp
	sll	$tmp31,3,$tmp31
	add	$inp,$len,$len
___
# SHA512 additionally needs $tmp32 = 32-$tmp31 for its 32-bit-half
# merge code in Xload/Xupdate
$code.=<<___ if ($SZ==8); # SHA512
	mov	32,$tmp32
	sub	$tmp32,$tmp31,$tmp32
___
# Locate K[] PC-relatively and load the hash state a-h.
$code.=<<___;
.Lpic:	call	.+8
	add	%o7,K${label}-.Lpic,$Ktbl

	$LD	[$ctx+`0*$SZ`],$A
	$LD	[$ctx+`1*$SZ`],$B
	$LD	[$ctx+`2*$SZ`],$C
	$LD	[$ctx+`3*$SZ`],$D
	$LD	[$ctx+`4*$SZ`],$E
	$LD	[$ctx+`5*$SZ`],$F
	$LD	[$ctx+`6*$SZ`],$G
	$LD	[$ctx+`7*$SZ`],$H

.Lloop:
___
# Emit 16 fully unrolled load rounds, then 16 unrolled Xupdate rounds
# which loop ($rounds-16)/16 times; loop exit is detected by comparing
# the low 12 bits of the last K[] value fetched against $lastK.
for ($i=0;$i<16;$i++)	{ &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
$code.=".L16_xx:\n";
for (;$i<32;$i++)	{ &$BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	and	$tmp2,0xfff,$tmp2
	cmp	$tmp2,$lastK
	bne	.L16_xx
	add	$Ktbl,`16*$SZ`,$Ktbl	! Ktbl+=16

___
# Merge this block's result into the hash context.
$code.=<<___ if ($SZ==4); # SHA256
	$LD	[$ctx+`0*$SZ`],@X[0]
	$LD	[$ctx+`1*$SZ`],@X[1]
	$LD	[$ctx+`2*$SZ`],@X[2]
	$LD	[$ctx+`3*$SZ`],@X[3]
	$LD	[$ctx+`4*$SZ`],@X[4]
	$LD	[$ctx+`5*$SZ`],@X[5]
	$LD	[$ctx+`6*$SZ`],@X[6]
	$LD	[$ctx+`7*$SZ`],@X[7]

	add	$A,@X[0],$A
	$ST	$A,[$ctx+`0*$SZ`]
	add	$B,@X[1],$B
	$ST	$B,[$ctx+`1*$SZ`]
	add	$C,@X[2],$C
	$ST	$C,[$ctx+`2*$SZ`]
	add	$D,@X[3],$D
	$ST	$D,[$ctx+`3*$SZ`]
	add	$E,@X[4],$E
	$ST	$E,[$ctx+`4*$SZ`]
	add	$F,@X[5],$F
	$ST	$F,[$ctx+`5*$SZ`]
	add	$G,@X[6],$G
	$ST	$G,[$ctx+`6*$SZ`]
	add	$H,@X[7],$H
	$ST	$H,[$ctx+`7*$SZ`]
___
# SHA512 flavour reads the context as 32-bit halves and reassembles
# 64-bit values before adding (32-/64-bit ABI duality again).
$code.=<<___ if ($SZ==8); # SHA512
	ld	[$ctx+`0*$SZ+0`],%l0
	ld	[$ctx+`0*$SZ+4`],%l1
	ld	[$ctx+`1*$SZ+0`],%l2
	ld	[$ctx+`1*$SZ+4`],%l3
	ld	[$ctx+`2*$SZ+0`],%l4
	ld	[$ctx+`2*$SZ+4`],%l5
	ld	[$ctx+`3*$SZ+0`],%l6

	sllx	%l0,32,$tmp0
	ld	[$ctx+`3*$SZ+4`],%l7
	sllx	%l2,32,$tmp1
	or	%l1,$tmp0,$tmp0
	or	%l3,$tmp1,$tmp1
	add	$tmp0,$A,$A
	add	$tmp1,$B,$B
	$ST	$A,[$ctx+`0*$SZ`]
	sllx	%l4,32,$tmp2
	$ST	$B,[$ctx+`1*$SZ`]
	sllx	%l6,32,$T1
	or	%l5,$tmp2,$tmp2
	or	%l7,$T1,$T1
	add	$tmp2,$C,$C
	$ST	$C,[$ctx+`2*$SZ`]
	add	$T1,$D,$D
	$ST	$D,[$ctx+`3*$SZ`]

	ld	[$ctx+`4*$SZ+0`],%l0
	ld	[$ctx+`4*$SZ+4`],%l1
	ld	[$ctx+`5*$SZ+0`],%l2
	ld	[$ctx+`5*$SZ+4`],%l3
	ld	[$ctx+`6*$SZ+0`],%l4
	ld	[$ctx+`6*$SZ+4`],%l5
	ld	[$ctx+`7*$SZ+0`],%l6

	sllx	%l0,32,$tmp0
	ld	[$ctx+`7*$SZ+4`],%l7
	sllx	%l2,32,$tmp1
	or	%l1,$tmp0,$tmp0
	or	%l3,$tmp1,$tmp1
	add	$tmp0,$E,$E
	add	$tmp1,$F,$F
	$ST	$E,[$ctx+`4*$SZ`]
	sllx	%l4,32,$tmp2
	$ST	$F,[$ctx+`5*$SZ`]
	sllx	%l6,32,$T1
	or	%l5,$tmp2,$tmp2
	or	%l7,$T1,$T1
	add	$tmp2,$G,$G
	$ST	$G,[$ctx+`6*$SZ`]
	add	$T1,$H,$H
	$ST	$H,[$ctx+`7*$SZ`]
___
# Advance to the next block (rewinding Ktbl to K[0]) or return.
$code.=<<___;
	add	$inp,`16*$SZ`,$inp		! advance inp
	cmp	$inp,$len
	bne	SIZE_T_CC,.Lloop
	sub	$Ktbl,`($rounds-16)*$SZ`,$Ktbl	! rewind Ktbl

	ret
	restore
.type	sha${label}_block_data_order,#function
.size	sha${label}_block_data_order,(.-sha${label}_block_data_order)
.asciz	"SHA${label} block transform for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
.align	4
___
798
799 # Purpose of these subroutines is to explicitly encode VIS instructions,
800 # so that one can compile the module without having to specify VIS
801 # extensions on compiler command line, e.g. -xarch=v9 vs. -xarch=v9a.
802 # Idea is to reserve for option to produce "universal" binary and let
803 # programmer detect if current CPU is VIS capable at run-time.
# unvis($mnemonic,$rs1,$rs2,$rd): hand-encode a VIS instruction as a
# .word directive (with the original text appended as an assembler
# comment), so the module assembles without VIS support in the
# assembler.  Unknown mnemonics or non-%f operands are returned as-is.
sub unvis {
my ($mnemonic,$rs1,$rs2,$rd)=@_;
my ($ref,$opf);		# was "my $ref,$opf;" which declared only $ref;
			# $opf leaked as a package global (fatal under strict)
my %visopf = (	"faligndata"	=> 0x048,
		"for"		=> 0x07c	);

    $ref = "$mnemonic\t$rs1,$rs2,$rd";

    if ($opf=$visopf{$mnemonic}) {
	# convert each %fN operand to its 5-bit register field
	foreach ($rs1,$rs2,$rd) {
	    return $ref if (!/%f([0-9]{1,2})/);
	    $_=$1;
	    if ($1>=32) {
		return $ref if ($1&1);	# upper bank must be even-numbered
		# re-encode for upper double register addressing
		$_=($1|$1>>5)&31;
	    }
	}

	return	sprintf ".word\t0x%08x !%s",
			0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
			$ref;
    } else {
	return $ref;
    }
}
# unalignaddr($mnemonic,$rs1,$rs2,$rd): hand-encode an alignaddr
# instruction as a .word directive (original text kept as an assembler
# comment).  Operands that are not integer registers (%g/%o/%l/%i 0-7)
# leave the instruction untouched.
sub unalignaddr {
my ($mnemonic,$rs1,$rs2,$rd)=@_;
my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
my $asm = "$mnemonic\t$rs1,$rs2,$rd";

    # Map each register name to its 5-bit encoding; any operand that
    # fails to parse makes us emit the plain mnemonic unchanged.
    # (foreach aliasing rewrites $rs1/$rs2/$rd in place.)
    for my $reg ($rs1,$rs2,$rd) {
	return $asm unless $reg =~ /%([goli])([0-7])/;
	$reg = $bias{$1} + $2;
    }

    return sprintf(".word\t0x%08x !%s",
		   0x81b00300|$rd<<25|$rs1<<14|$rs2,
		   $asm);
}
843
# Post-process the accumulated assembly and print it: evaluate the
# `...` arithmetic placeholders, then replace VIS instructions
# (faligndata/for/alignaddr) with explicit .word encodings so the
# output assembles without VIS support.
foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/ge;

	s/\b(f[^\s]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/
		&unvis($1,$2,$3,$4)
	 /ge;
	s/\b(alignaddr)\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/
		&unalignaddr($1,$2,$3,$4)
	 /ge;

	print $_,"\n";
}

# Was a bare "close STDOUT": check it so buffered-write failures
# (e.g. disk full) don't silently produce a truncated .s file.
close STDOUT or die "error closing STDOUT: $!";