]> git.ipfire.org Git - thirdparty/openssl.git/blob - crypto/sha/asm/sha512-sparcv9.pl
Update copyright year
[thirdparty/openssl.git] / crypto / sha / asm / sha512-sparcv9.pl
1 #! /usr/bin/env perl
2 # Copyright 2007-2020 The OpenSSL Project Authors. All Rights Reserved.
3 #
4 # Licensed under the Apache License 2.0 (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
8
9
10 # ====================================================================
11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12 # project. The module is, however, dual licensed under OpenSSL and
13 # CRYPTOGAMS licenses depending on where you obtain it. For further
14 # details see http://www.openssl.org/~appro/cryptogams/.
15 #
16 # Hardware SPARC T4 support by David S. Miller
17 # ====================================================================
18
19 # SHA256 performance improvement over compiler generated code varies
20 # from 40% for Sun C [32-bit build] to 70% for gcc [3.3, 64-bit
21 # build]. Just like in SHA1 module I aim to ensure scalability on
22 # UltraSPARC T1 by packing X[16] to 8 64-bit registers.
23
24 # SHA512 on pre-T1 UltraSPARC.
25 #
26 # Performance is >75% better than 64-bit code generated by Sun C and
27 # over 2x than 32-bit code. X[16] resides on stack, but access to it
28 # is scheduled for L2 latency and staged through 32 least significant
29 # bits of %l0-%l7. The latter is done to achieve 32-/64-bit ABI
30 # duality. Nevertheless it's ~40% faster than SHA256, which is pretty
31 # good [optimal coefficient is 50%].
32 #
33 # SHA512 on UltraSPARC T1.
34 #
35 # It's not any faster than 64-bit code generated by Sun C 5.8. This is
36 # because 64-bit code generator has the advantage of using 64-bit
37 # loads(*) to access X[16], which I consciously traded for 32-/64-bit
38 # ABI duality [as per above]. But it surpasses 32-bit Sun C generated
39 # code by 60%, not to mention that it doesn't suffer from severe decay
40 # when running 4 times physical cores threads and that it leaves gcc
41 # [3.4] behind by over 4x factor! If compared to SHA256, single thread
42 # performance is only 10% better, but overall throughput for maximum
43 # amount of threads for given CPU exceeds corresponding one of SHA256
44 # by 30% [again, optimal coefficient is 50%].
45 #
46 # (*) Unlike pre-T1 UltraSPARC loads on T1 are executed strictly
47 # in-order, i.e. load instruction has to complete prior next
48 # instruction in given thread is executed, even if the latter is
49 # not dependent on load result! This means that on T1 two 32-bit
50 # loads are always slower than one 64-bit load. Once again this
51 # is unlike pre-T1 UltraSPARC, where, if scheduled appropriately,
52 # 2x32-bit loads can be as fast as 1x64-bit ones.
53 #
54 # SPARC T4 SHA256/512 hardware achieves 3.17/2.01 cycles per byte,
55 # which is 9.3x/11.1x faster than software. Multi-process benchmark
56 # saturates at 11.5x single-process result on 8-core processor, or
57 # ~11/16GBps per 2.85GHz socket.
58
59 # $output is the last argument if it looks like a file (it has an extension)
60 $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
61
62 $output and open STDOUT,">$output";
63
64 if ($output =~ /512/) {
65 $label="512";
66 $SZ=8;
67 $LD="ldx"; # load from memory
68 $ST="stx"; # store to memory
69 $SLL="sllx"; # shift left logical
70 $SRL="srlx"; # shift right logical
71 @Sigma0=(28,34,39);
72 @Sigma1=(14,18,41);
73 @sigma0=( 7, 1, 8); # right shift first
74 @sigma1=( 6,19,61); # right shift first
75 $lastK=0x817;
76 $rounds=80;
77 $align=4;
78
79 $locals=16*$SZ; # X[16]
80
81 $A="%o0";
82 $B="%o1";
83 $C="%o2";
84 $D="%o3";
85 $E="%o4";
86 $F="%o5";
87 $G="%g1";
88 $H="%o7";
89 @V=($A,$B,$C,$D,$E,$F,$G,$H);
90 } else {
91 $label="256";
92 $SZ=4;
93 $LD="ld"; # load from memory
94 $ST="st"; # store to memory
95 $SLL="sll"; # shift left logical
96 $SRL="srl"; # shift right logical
97 @Sigma0=( 2,13,22);
98 @Sigma1=( 6,11,25);
99 @sigma0=( 3, 7,18); # right shift first
100 @sigma1=(10,17,19); # right shift first
101 $lastK=0x8f2;
102 $rounds=64;
103 $align=8;
104
105 $locals=0; # X[16] is register resident
106 @X=("%o0","%o1","%o2","%o3","%o4","%o5","%g1","%o7");
107
108 $A="%l0";
109 $B="%l1";
110 $C="%l2";
111 $D="%l3";
112 $E="%l4";
113 $F="%l5";
114 $G="%l6";
115 $H="%l7";
116 @V=($A,$B,$C,$D,$E,$F,$G,$H);
117 }
118 $T1="%g2";
119 $tmp0="%g3";
120 $tmp1="%g4";
121 $tmp2="%g5";
122
123 $ctx="%i0";
124 $inp="%i1";
125 $len="%i2";
126 $Ktbl="%i3";
127 $tmp31="%i4";
128 $tmp32="%i5";
129
130 ########### SHA256
131 $Xload = sub {
132 my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
133
134 if ($i==0) {
135 $code.=<<___;
136 ldx [$inp+0],@X[0]
137 ldx [$inp+16],@X[2]
138 ldx [$inp+32],@X[4]
139 ldx [$inp+48],@X[6]
140 ldx [$inp+8],@X[1]
141 ldx [$inp+24],@X[3]
142 subcc %g0,$tmp31,$tmp32 ! should be 64-$tmp31, but -$tmp31 works too
143 ldx [$inp+40],@X[5]
144 bz,pt %icc,.Laligned
145 ldx [$inp+56],@X[7]
146
147 sllx @X[0],$tmp31,@X[0]
148 ldx [$inp+64],$T1
149 ___
150 for($j=0;$j<7;$j++)
151 { $code.=<<___;
152 srlx @X[$j+1],$tmp32,$tmp1
153 sllx @X[$j+1],$tmp31,@X[$j+1]
154 or $tmp1,@X[$j],@X[$j]
155 ___
156 }
157 $code.=<<___;
158 srlx $T1,$tmp32,$T1
159 or $T1,@X[7],@X[7]
160 .Laligned:
161 ___
162 }
163
164 if ($i&1) {
165 $code.="\tadd @X[$i/2],$h,$T1\n";
166 } else {
167 $code.="\tsrlx @X[$i/2],32,$T1\n\tadd $h,$T1,$T1\n";
168 }
169 } if ($SZ==4);
170
171 ########### SHA512
172 $Xload = sub {
173 my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
174 my @pair=("%l".eval(($i*2)%8),"%l".eval(($i*2)%8+1),"%l".eval((($i+1)*2)%8));
175
176 $code.=<<___ if ($i==0);
177 ld [$inp+0],%l0
178 ld [$inp+4],%l1
179 ld [$inp+8],%l2
180 ld [$inp+12],%l3
181 ld [$inp+16],%l4
182 ld [$inp+20],%l5
183 ld [$inp+24],%l6
184 cmp $tmp31,0
185 ld [$inp+28],%l7
186 ___
187 $code.=<<___ if ($i<15);
188 sllx @pair[1],$tmp31,$tmp2 ! Xload($i)
189 add $tmp31,32,$tmp0
190 sllx @pair[0],$tmp0,$tmp1
191 `"ld [$inp+".eval(32+0+$i*8)."],@pair[0]" if ($i<12)`
192 srlx @pair[2],$tmp32,@pair[1]
193 or $tmp1,$tmp2,$tmp2
194 or @pair[1],$tmp2,$tmp2
195 `"ld [$inp+".eval(32+4+$i*8)."],@pair[1]" if ($i<12)`
196 add $h,$tmp2,$T1
197 $ST $tmp2,[%sp+STACK_BIAS+STACK_FRAME+`$i*$SZ`]
198 ___
199 $code.=<<___ if ($i==12);
200 bnz,a,pn %icc,.+8
201 ld [$inp+128],%l0
202 ___
203 $code.=<<___ if ($i==15);
204 ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+1)%16)*$SZ+0`],%l2
205 sllx @pair[1],$tmp31,$tmp2 ! Xload($i)
206 add $tmp31,32,$tmp0
207 ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+1)%16)*$SZ+4`],%l3
208 sllx @pair[0],$tmp0,$tmp1
209 ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+9)%16)*$SZ+0`],%l4
210 srlx @pair[2],$tmp32,@pair[1]
211 or $tmp1,$tmp2,$tmp2
212 ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+9)%16)*$SZ+4`],%l5
213 or @pair[1],$tmp2,$tmp2
214 ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+14)%16)*$SZ+0`],%l6
215 add $h,$tmp2,$T1
216 $ST $tmp2,[%sp+STACK_BIAS+STACK_FRAME+`$i*$SZ`]
217 ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+14)%16)*$SZ+4`],%l7
218 ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+0)%16)*$SZ+0`],%l0
219 ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+0)%16)*$SZ+4`],%l1
220 ___
221 } if ($SZ==8);
222
223 ########### common
224 sub BODY_00_15 {
225 my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
226
227 if ($i<16) {
228 &$Xload(@_);
229 } else {
230 $code.="\tadd $h,$T1,$T1\n";
231 }
232
233 $code.=<<___;
234 $SRL $e,@Sigma1[0],$h !! $i
235 xor $f,$g,$tmp2
236 $SLL $e,`$SZ*8-@Sigma1[2]`,$tmp1
237 and $e,$tmp2,$tmp2
238 $SRL $e,@Sigma1[1],$tmp0
239 xor $tmp1,$h,$h
240 $SLL $e,`$SZ*8-@Sigma1[1]`,$tmp1
241 xor $tmp0,$h,$h
242 $SRL $e,@Sigma1[2],$tmp0
243 xor $tmp1,$h,$h
244 $SLL $e,`$SZ*8-@Sigma1[0]`,$tmp1
245 xor $tmp0,$h,$h
246 xor $g,$tmp2,$tmp2 ! Ch(e,f,g)
247 xor $tmp1,$h,$tmp0 ! Sigma1(e)
248
249 $SRL $a,@Sigma0[0],$h
250 add $tmp2,$T1,$T1
251 $LD [$Ktbl+`$i*$SZ`],$tmp2 ! K[$i]
252 $SLL $a,`$SZ*8-@Sigma0[2]`,$tmp1
253 add $tmp0,$T1,$T1
254 $SRL $a,@Sigma0[1],$tmp0
255 xor $tmp1,$h,$h
256 $SLL $a,`$SZ*8-@Sigma0[1]`,$tmp1
257 xor $tmp0,$h,$h
258 $SRL $a,@Sigma0[2],$tmp0
259 xor $tmp1,$h,$h
260 $SLL $a,`$SZ*8-@Sigma0[0]`,$tmp1
261 xor $tmp0,$h,$h
262 xor $tmp1,$h,$h ! Sigma0(a)
263
264 or $a,$b,$tmp0
265 and $a,$b,$tmp1
266 and $c,$tmp0,$tmp0
267 or $tmp0,$tmp1,$tmp1 ! Maj(a,b,c)
268 add $tmp2,$T1,$T1 ! +=K[$i]
269 add $tmp1,$h,$h
270
271 add $T1,$d,$d
272 add $T1,$h,$h
273 ___
274 }
275
276 ########### SHA256
277 $BODY_16_XX = sub {
278 my $i=@_[0];
279 my $xi;
280
281 if ($i&1) {
282 $xi=$tmp32;
283 $code.="\tsrlx @X[(($i+1)/2)%8],32,$xi\n";
284 } else {
285 $xi=@X[(($i+1)/2)%8];
286 }
287 $code.=<<___;
288 srl $xi,@sigma0[0],$T1 !! Xupdate($i)
289 sll $xi,`32-@sigma0[2]`,$tmp1
290 srl $xi,@sigma0[1],$tmp0
291 xor $tmp1,$T1,$T1
292 sll $tmp1,`@sigma0[2]-@sigma0[1]`,$tmp1
293 xor $tmp0,$T1,$T1
294 srl $xi,@sigma0[2],$tmp0
295 xor $tmp1,$T1,$T1
296 ___
297 if ($i&1) {
298 $xi=@X[(($i+14)/2)%8];
299 } else {
300 $xi=$tmp32;
301 $code.="\tsrlx @X[(($i+14)/2)%8],32,$xi\n";
302 }
303 $code.=<<___;
304 srl $xi,@sigma1[0],$tmp2
305 xor $tmp0,$T1,$T1 ! T1=sigma0(X[i+1])
306 sll $xi,`32-@sigma1[2]`,$tmp1
307 srl $xi,@sigma1[1],$tmp0
308 xor $tmp1,$tmp2,$tmp2
309 sll $tmp1,`@sigma1[2]-@sigma1[1]`,$tmp1
310 xor $tmp0,$tmp2,$tmp2
311 srl $xi,@sigma1[2],$tmp0
312 xor $tmp1,$tmp2,$tmp2
313 ___
314 if ($i&1) {
315 $xi=@X[($i/2)%8];
316 $code.=<<___;
317 srlx @X[(($i+9)/2)%8],32,$tmp1 ! X[i+9]
318 xor $tmp0,$tmp2,$tmp2 ! sigma1(X[i+14])
319 srl @X[($i/2)%8],0,$tmp0
320 add $tmp2,$tmp1,$tmp1
321 add $xi,$T1,$T1 ! +=X[i]
322 xor $tmp0,@X[($i/2)%8],@X[($i/2)%8]
323 add $tmp1,$T1,$T1
324
325 srl $T1,0,$T1
326 or $T1,@X[($i/2)%8],@X[($i/2)%8]
327 ___
328 } else {
329 $xi=@X[(($i+9)/2)%8];
330 $code.=<<___;
331 srlx @X[($i/2)%8],32,$tmp1 ! X[i]
332 xor $tmp0,$tmp2,$tmp2 ! sigma1(X[i+14])
333 add $xi,$T1,$T1 ! +=X[i+9]
334 add $tmp2,$tmp1,$tmp1
335 srl @X[($i/2)%8],0,@X[($i/2)%8]
336 add $tmp1,$T1,$T1
337
338 sllx $T1,32,$tmp0
339 or $tmp0,@X[($i/2)%8],@X[($i/2)%8]
340 ___
341 }
342 &BODY_00_15(@_);
343 } if ($SZ==4);
344
345 ########### SHA512
346 $BODY_16_XX = sub {
347 my $i=@_[0];
348 my @pair=("%l".eval(($i*2)%8),"%l".eval(($i*2)%8+1));
349
350 $code.=<<___;
351 sllx %l2,32,$tmp0 !! Xupdate($i)
352 or %l3,$tmp0,$tmp0
353
354 srlx $tmp0,@sigma0[0],$T1
355 ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+1)%16)*$SZ+0`],%l2
356 sllx $tmp0,`64-@sigma0[2]`,$tmp1
357 ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+1)%16)*$SZ+4`],%l3
358 srlx $tmp0,@sigma0[1],$tmp0
359 xor $tmp1,$T1,$T1
360 sllx $tmp1,`@sigma0[2]-@sigma0[1]`,$tmp1
361 xor $tmp0,$T1,$T1
362 srlx $tmp0,`@sigma0[2]-@sigma0[1]`,$tmp0
363 xor $tmp1,$T1,$T1
364 sllx %l6,32,$tmp2
365 xor $tmp0,$T1,$T1 ! sigma0(X[$i+1])
366 or %l7,$tmp2,$tmp2
367
368 srlx $tmp2,@sigma1[0],$tmp1
369 ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+14)%16)*$SZ+0`],%l6
370 sllx $tmp2,`64-@sigma1[2]`,$tmp0
371 ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+14)%16)*$SZ+4`],%l7
372 srlx $tmp2,@sigma1[1],$tmp2
373 xor $tmp0,$tmp1,$tmp1
374 sllx $tmp0,`@sigma1[2]-@sigma1[1]`,$tmp0
375 xor $tmp2,$tmp1,$tmp1
376 srlx $tmp2,`@sigma1[2]-@sigma1[1]`,$tmp2
377 xor $tmp0,$tmp1,$tmp1
378 sllx %l4,32,$tmp0
379 xor $tmp2,$tmp1,$tmp1 ! sigma1(X[$i+14])
380 ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+9)%16)*$SZ+0`],%l4
381 or %l5,$tmp0,$tmp0
382 ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+9)%16)*$SZ+4`],%l5
383
384 sllx %l0,32,$tmp2
385 add $tmp1,$T1,$T1
386 ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+0)%16)*$SZ+0`],%l0
387 or %l1,$tmp2,$tmp2
388 add $tmp0,$T1,$T1 ! +=X[$i+9]
389 ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+0)%16)*$SZ+4`],%l1
390 add $tmp2,$T1,$T1 ! +=X[$i]
391 $ST $T1,[%sp+STACK_BIAS+STACK_FRAME+`($i%16)*$SZ`]
392 ___
393 &BODY_00_15(@_);
394 } if ($SZ==8);
395
396 $code.=<<___;
397 #include "sparc_arch.h"
398
399 #ifdef __arch64__
400 .register %g2,#scratch
401 .register %g3,#scratch
402 #endif
403
404 .section ".text",#alloc,#execinstr
405
406 .align 64
407 K${label}:
408 .type K${label},#object
409 ___
410 if ($SZ==4) {
411 $code.=<<___;
412 .long 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
413 .long 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5
414 .long 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3
415 .long 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174
416 .long 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc
417 .long 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da
418 .long 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7
419 .long 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967
420 .long 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13
421 .long 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85
422 .long 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3
423 .long 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070
424 .long 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5
425 .long 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
426 .long 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
427 .long 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
428 ___
429 } else {
430 $code.=<<___;
431 .long 0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd
432 .long 0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc
433 .long 0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019
434 .long 0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118
435 .long 0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe
436 .long 0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2
437 .long 0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1
438 .long 0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694
439 .long 0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3
440 .long 0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65
441 .long 0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483
442 .long 0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5
443 .long 0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210
444 .long 0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4
445 .long 0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725
446 .long 0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70
447 .long 0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926
448 .long 0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df
449 .long 0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8
450 .long 0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b
451 .long 0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001
452 .long 0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30
453 .long 0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910
454 .long 0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8
455 .long 0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53
456 .long 0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8
457 .long 0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb
458 .long 0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3
459 .long 0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60
460 .long 0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec
461 .long 0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9
462 .long 0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b
463 .long 0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207
464 .long 0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178
465 .long 0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6
466 .long 0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b
467 .long 0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493
468 .long 0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c
469 .long 0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a
470 .long 0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817
471 ___
472 }
473 $code.=<<___;
474 .size K${label},.-K${label}
475
476 #ifdef __PIC__
477 SPARC_PIC_THUNK(%g1)
478 #endif
479
480 .globl sha${label}_block_data_order
481 .align 32
482 sha${label}_block_data_order:
483 SPARC_LOAD_ADDRESS_LEAF(OPENSSL_sparcv9cap_P,%g1,%g5)
484 ld [%g1+4],%g1 ! OPENSSL_sparcv9cap_P[1]
485
486 andcc %g1, CFR_SHA${label}, %g0
487 be .Lsoftware
488 nop
489 ___
490 $code.=<<___ if ($SZ==8); # SHA512
491 ldd [%o0 + 0x00], %f0 ! load context
492 ldd [%o0 + 0x08], %f2
493 ldd [%o0 + 0x10], %f4
494 ldd [%o0 + 0x18], %f6
495 ldd [%o0 + 0x20], %f8
496 ldd [%o0 + 0x28], %f10
497 andcc %o1, 0x7, %g0
498 ldd [%o0 + 0x30], %f12
499 bne,pn %icc, .Lhwunaligned
500 ldd [%o0 + 0x38], %f14
501
502 .Lhwaligned_loop:
503 ldd [%o1 + 0x00], %f16
504 ldd [%o1 + 0x08], %f18
505 ldd [%o1 + 0x10], %f20
506 ldd [%o1 + 0x18], %f22
507 ldd [%o1 + 0x20], %f24
508 ldd [%o1 + 0x28], %f26
509 ldd [%o1 + 0x30], %f28
510 ldd [%o1 + 0x38], %f30
511 ldd [%o1 + 0x40], %f32
512 ldd [%o1 + 0x48], %f34
513 ldd [%o1 + 0x50], %f36
514 ldd [%o1 + 0x58], %f38
515 ldd [%o1 + 0x60], %f40
516 ldd [%o1 + 0x68], %f42
517 ldd [%o1 + 0x70], %f44
518 subcc %o2, 1, %o2 ! done yet?
519 ldd [%o1 + 0x78], %f46
520 add %o1, 0x80, %o1
521 prefetch [%o1 + 63], 20
522 prefetch [%o1 + 64+63], 20
523
524 .word 0x81b02860 ! SHA512
525
526 bne,pt SIZE_T_CC, .Lhwaligned_loop
527 nop
528
529 .Lhwfinish:
530 std %f0, [%o0 + 0x00] ! store context
531 std %f2, [%o0 + 0x08]
532 std %f4, [%o0 + 0x10]
533 std %f6, [%o0 + 0x18]
534 std %f8, [%o0 + 0x20]
535 std %f10, [%o0 + 0x28]
536 std %f12, [%o0 + 0x30]
537 retl
538 std %f14, [%o0 + 0x38]
539
540 .align 16
541 .Lhwunaligned:
542 alignaddr %o1, %g0, %o1
543
544 ldd [%o1 + 0x00], %f18
545 .Lhwunaligned_loop:
546 ldd [%o1 + 0x08], %f20
547 ldd [%o1 + 0x10], %f22
548 ldd [%o1 + 0x18], %f24
549 ldd [%o1 + 0x20], %f26
550 ldd [%o1 + 0x28], %f28
551 ldd [%o1 + 0x30], %f30
552 ldd [%o1 + 0x38], %f32
553 ldd [%o1 + 0x40], %f34
554 ldd [%o1 + 0x48], %f36
555 ldd [%o1 + 0x50], %f38
556 ldd [%o1 + 0x58], %f40
557 ldd [%o1 + 0x60], %f42
558 ldd [%o1 + 0x68], %f44
559 ldd [%o1 + 0x70], %f46
560 ldd [%o1 + 0x78], %f48
561 subcc %o2, 1, %o2 ! done yet?
562 ldd [%o1 + 0x80], %f50
563 add %o1, 0x80, %o1
564 prefetch [%o1 + 63], 20
565 prefetch [%o1 + 64+63], 20
566
567 faligndata %f18, %f20, %f16
568 faligndata %f20, %f22, %f18
569 faligndata %f22, %f24, %f20
570 faligndata %f24, %f26, %f22
571 faligndata %f26, %f28, %f24
572 faligndata %f28, %f30, %f26
573 faligndata %f30, %f32, %f28
574 faligndata %f32, %f34, %f30
575 faligndata %f34, %f36, %f32
576 faligndata %f36, %f38, %f34
577 faligndata %f38, %f40, %f36
578 faligndata %f40, %f42, %f38
579 faligndata %f42, %f44, %f40
580 faligndata %f44, %f46, %f42
581 faligndata %f46, %f48, %f44
582 faligndata %f48, %f50, %f46
583
584 .word 0x81b02860 ! SHA512
585
586 bne,pt SIZE_T_CC, .Lhwunaligned_loop
587 for %f50, %f50, %f18 ! %f18=%f50
588
589 ba .Lhwfinish
590 nop
591 ___
592 $code.=<<___ if ($SZ==4); # SHA256
593 ld [%o0 + 0x00], %f0
594 ld [%o0 + 0x04], %f1
595 ld [%o0 + 0x08], %f2
596 ld [%o0 + 0x0c], %f3
597 ld [%o0 + 0x10], %f4
598 ld [%o0 + 0x14], %f5
599 andcc %o1, 0x7, %g0
600 ld [%o0 + 0x18], %f6
601 bne,pn %icc, .Lhwunaligned
602 ld [%o0 + 0x1c], %f7
603
604 .Lhwloop:
605 ldd [%o1 + 0x00], %f8
606 ldd [%o1 + 0x08], %f10
607 ldd [%o1 + 0x10], %f12
608 ldd [%o1 + 0x18], %f14
609 ldd [%o1 + 0x20], %f16
610 ldd [%o1 + 0x28], %f18
611 ldd [%o1 + 0x30], %f20
612 subcc %o2, 1, %o2 ! done yet?
613 ldd [%o1 + 0x38], %f22
614 add %o1, 0x40, %o1
615 prefetch [%o1 + 63], 20
616
617 .word 0x81b02840 ! SHA256
618
619 bne,pt SIZE_T_CC, .Lhwloop
620 nop
621
622 .Lhwfinish:
623 st %f0, [%o0 + 0x00] ! store context
624 st %f1, [%o0 + 0x04]
625 st %f2, [%o0 + 0x08]
626 st %f3, [%o0 + 0x0c]
627 st %f4, [%o0 + 0x10]
628 st %f5, [%o0 + 0x14]
629 st %f6, [%o0 + 0x18]
630 retl
631 st %f7, [%o0 + 0x1c]
632
633 .align 8
634 .Lhwunaligned:
635 alignaddr %o1, %g0, %o1
636
637 ldd [%o1 + 0x00], %f10
638 .Lhwunaligned_loop:
639 ldd [%o1 + 0x08], %f12
640 ldd [%o1 + 0x10], %f14
641 ldd [%o1 + 0x18], %f16
642 ldd [%o1 + 0x20], %f18
643 ldd [%o1 + 0x28], %f20
644 ldd [%o1 + 0x30], %f22
645 ldd [%o1 + 0x38], %f24
646 subcc %o2, 1, %o2 ! done yet?
647 ldd [%o1 + 0x40], %f26
648 add %o1, 0x40, %o1
649 prefetch [%o1 + 63], 20
650
651 faligndata %f10, %f12, %f8
652 faligndata %f12, %f14, %f10
653 faligndata %f14, %f16, %f12
654 faligndata %f16, %f18, %f14
655 faligndata %f18, %f20, %f16
656 faligndata %f20, %f22, %f18
657 faligndata %f22, %f24, %f20
658 faligndata %f24, %f26, %f22
659
660 .word 0x81b02840 ! SHA256
661
662 bne,pt SIZE_T_CC, .Lhwunaligned_loop
663 for %f26, %f26, %f10 ! %f10=%f26
664
665 ba .Lhwfinish
666 nop
667 ___
668 $code.=<<___;
669 .align 16
670 .Lsoftware:
671 save %sp,-STACK_FRAME-$locals,%sp
672 and $inp,`$align-1`,$tmp31
673 sllx $len,`log(16*$SZ)/log(2)`,$len
674 andn $inp,`$align-1`,$inp
675 sll $tmp31,3,$tmp31
676 add $inp,$len,$len
677 ___
678 $code.=<<___ if ($SZ==8); # SHA512
679 mov 32,$tmp32
680 sub $tmp32,$tmp31,$tmp32
681 ___
682 $code.=<<___;
683 .Lpic: call .+8
684 add %o7,K${label}-.Lpic,$Ktbl
685
686 $LD [$ctx+`0*$SZ`],$A
687 $LD [$ctx+`1*$SZ`],$B
688 $LD [$ctx+`2*$SZ`],$C
689 $LD [$ctx+`3*$SZ`],$D
690 $LD [$ctx+`4*$SZ`],$E
691 $LD [$ctx+`5*$SZ`],$F
692 $LD [$ctx+`6*$SZ`],$G
693 $LD [$ctx+`7*$SZ`],$H
694
695 .Lloop:
696 ___
697 for ($i=0;$i<16;$i++) { &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
698 $code.=".L16_xx:\n";
699 for (;$i<32;$i++) { &$BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
700 $code.=<<___;
701 and $tmp2,0xfff,$tmp2
702 cmp $tmp2,$lastK
703 bne .L16_xx
704 add $Ktbl,`16*$SZ`,$Ktbl ! Ktbl+=16
705
706 ___
707 $code.=<<___ if ($SZ==4); # SHA256
708 $LD [$ctx+`0*$SZ`],@X[0]
709 $LD [$ctx+`1*$SZ`],@X[1]
710 $LD [$ctx+`2*$SZ`],@X[2]
711 $LD [$ctx+`3*$SZ`],@X[3]
712 $LD [$ctx+`4*$SZ`],@X[4]
713 $LD [$ctx+`5*$SZ`],@X[5]
714 $LD [$ctx+`6*$SZ`],@X[6]
715 $LD [$ctx+`7*$SZ`],@X[7]
716
717 add $A,@X[0],$A
718 $ST $A,[$ctx+`0*$SZ`]
719 add $B,@X[1],$B
720 $ST $B,[$ctx+`1*$SZ`]
721 add $C,@X[2],$C
722 $ST $C,[$ctx+`2*$SZ`]
723 add $D,@X[3],$D
724 $ST $D,[$ctx+`3*$SZ`]
725 add $E,@X[4],$E
726 $ST $E,[$ctx+`4*$SZ`]
727 add $F,@X[5],$F
728 $ST $F,[$ctx+`5*$SZ`]
729 add $G,@X[6],$G
730 $ST $G,[$ctx+`6*$SZ`]
731 add $H,@X[7],$H
732 $ST $H,[$ctx+`7*$SZ`]
733 ___
734 $code.=<<___ if ($SZ==8); # SHA512
735 ld [$ctx+`0*$SZ+0`],%l0
736 ld [$ctx+`0*$SZ+4`],%l1
737 ld [$ctx+`1*$SZ+0`],%l2
738 ld [$ctx+`1*$SZ+4`],%l3
739 ld [$ctx+`2*$SZ+0`],%l4
740 ld [$ctx+`2*$SZ+4`],%l5
741 ld [$ctx+`3*$SZ+0`],%l6
742
743 sllx %l0,32,$tmp0
744 ld [$ctx+`3*$SZ+4`],%l7
745 sllx %l2,32,$tmp1
746 or %l1,$tmp0,$tmp0
747 or %l3,$tmp1,$tmp1
748 add $tmp0,$A,$A
749 add $tmp1,$B,$B
750 $ST $A,[$ctx+`0*$SZ`]
751 sllx %l4,32,$tmp2
752 $ST $B,[$ctx+`1*$SZ`]
753 sllx %l6,32,$T1
754 or %l5,$tmp2,$tmp2
755 or %l7,$T1,$T1
756 add $tmp2,$C,$C
757 $ST $C,[$ctx+`2*$SZ`]
758 add $T1,$D,$D
759 $ST $D,[$ctx+`3*$SZ`]
760
761 ld [$ctx+`4*$SZ+0`],%l0
762 ld [$ctx+`4*$SZ+4`],%l1
763 ld [$ctx+`5*$SZ+0`],%l2
764 ld [$ctx+`5*$SZ+4`],%l3
765 ld [$ctx+`6*$SZ+0`],%l4
766 ld [$ctx+`6*$SZ+4`],%l5
767 ld [$ctx+`7*$SZ+0`],%l6
768
769 sllx %l0,32,$tmp0
770 ld [$ctx+`7*$SZ+4`],%l7
771 sllx %l2,32,$tmp1
772 or %l1,$tmp0,$tmp0
773 or %l3,$tmp1,$tmp1
774 add $tmp0,$E,$E
775 add $tmp1,$F,$F
776 $ST $E,[$ctx+`4*$SZ`]
777 sllx %l4,32,$tmp2
778 $ST $F,[$ctx+`5*$SZ`]
779 sllx %l6,32,$T1
780 or %l5,$tmp2,$tmp2
781 or %l7,$T1,$T1
782 add $tmp2,$G,$G
783 $ST $G,[$ctx+`6*$SZ`]
784 add $T1,$H,$H
785 $ST $H,[$ctx+`7*$SZ`]
786 ___
787 $code.=<<___;
788 add $inp,`16*$SZ`,$inp ! advance inp
789 cmp $inp,$len
790 bne SIZE_T_CC,.Lloop
791 sub $Ktbl,`($rounds-16)*$SZ`,$Ktbl ! rewind Ktbl
792
793 ret
794 restore
795 .type sha${label}_block_data_order,#function
796 .size sha${label}_block_data_order,(.-sha${label}_block_data_order)
797 .asciz "SHA${label} block transform for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
798 .align 4
799 ___
800
801 # Purpose of these subroutines is to explicitly encode VIS instructions,
802 # so that one can compile the module without having to specify VIS
803 # extensions on compiler command line, e.g. -xarch=v9 vs. -xarch=v9a.
804 # Idea is to reserve for option to produce "universal" binary and let
805 # programmer detect if current CPU is VIS capable at run-time.
806 sub unvis {
807 my ($mnemonic,$rs1,$rs2,$rd)=@_;
808 my $ref,$opf;
809 my %visopf = ( "faligndata" => 0x048,
810 "for" => 0x07c );
811
812 $ref = "$mnemonic\t$rs1,$rs2,$rd";
813
814 if ($opf=$visopf{$mnemonic}) {
815 foreach ($rs1,$rs2,$rd) {
816 return $ref if (!/%f([0-9]{1,2})/);
817 $_=$1;
818 if ($1>=32) {
819 return $ref if ($1&1);
820 # re-encode for upper double register addressing
821 $_=($1|$1>>5)&31;
822 }
823 }
824
825 return sprintf ".word\t0x%08x !%s",
826 0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
827 $ref;
828 } else {
829 return $ref;
830 }
831 }
832 sub unalignaddr {
833 my ($mnemonic,$rs1,$rs2,$rd)=@_;
834 my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
835 my $ref="$mnemonic\t$rs1,$rs2,$rd";
836
837 foreach ($rs1,$rs2,$rd) {
838 if (/%([goli])([0-7])/) { $_=$bias{$1}+$2; }
839 else { return $ref; }
840 }
841 return sprintf ".word\t0x%08x !%s",
842 0x81b00300|$rd<<25|$rs1<<14|$rs2,
843 $ref;
844 }
845
846 foreach (split("\n",$code)) {
847 s/\`([^\`]*)\`/eval $1/ge;
848
849 s/\b(f[^\s]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/
850 &unvis($1,$2,$3,$4)
851 /ge;
852 s/\b(alignaddr)\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/
853 &unalignaddr($1,$2,$3,$4)
854 /ge;
855
856 print $_,"\n";
857 }
858
859 close STDOUT or die "error closing STDOUT: $!";