#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
#
# Hardware SPARC T4 support by David S. Miller <davem@davemloft.net>.
# ====================================================================

# SHA256 performance improvement over compiler generated code varies
# from 40% for Sun C [32-bit build] to 70% for gcc [3.3, 64-bit
# build]. Just like in SHA1 module I aim to ensure scalability on
# UltraSPARC T1 by packing X[16] to 8 64-bit registers.
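#
# To make the packing explicit (this is just a restatement of what
# $Xload below does for $SZ==4): a big-endian ldx of message bytes
# 8*j..8*j+7 leaves X[2*j] in the upper and X[2*j+1] in the lower 32
# bits of @X[j], e.g.
#
#	ldx  [$inp+0],@X[0]	! @X[0] = X[0]<<32 | X[1]
#	srlx @X[0],32,$T1	! $T1   = X[0]
#
# so even-indexed words are recovered with a 32-bit right shift and
# odd-indexed words are used in place.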

# SHA512 on pre-T1 UltraSPARC.
#
# Performance is >75% better than 64-bit code generated by Sun C and
# over 2x better than 32-bit code. X[16] resides on stack, but access
# to it is scheduled for L2 latency and staged through 32 least
# significant bits of %l0-%l7. The latter is done to achieve 32-/64-bit
# ABI duality. Nevertheless it's ~40% faster than SHA256, which is
# pretty good [optimal coefficient is 50%].
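#
# In concrete terms (restating what $Xload and $BODY_16_XX below do for
# $SZ==8): each 64-bit X[i] lives on the stack, but it is read back as
# two 32-bit halves into an even/odd %l pair, high half in the even
# register, and reassembled on the fly, e.g.
#
#	ld   [%sp+...+0],%l2	! high 32 bits of X[i]
#	ld   [%sp+...+4],%l3	! low 32 bits of X[i]
#	sllx %l2,32,$tmp0
#	or   %l3,$tmp0,$tmp0	! $tmp0 = X[i]
#
# so nothing beyond the low 32 bits of %l0-%l7 is relied upon, which is
# what the 32-/64-bit ABI duality above refers to.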
#
# SHA512 on UltraSPARC T1.
#
# It's not any faster than 64-bit code generated by Sun C 5.8. This is
# because the 64-bit code generator has the advantage of using 64-bit
# loads(*) to access X[16], which I consciously traded for 32-/64-bit
# ABI duality [as per above]. But it surpasses 32-bit Sun C generated
# code by 60%, not to mention that it doesn't suffer from severe decay
# when running 4x as many threads as there are physical cores, and that
# it leaves gcc [3.4] behind by over a 4x factor! Compared to SHA256,
# single-thread performance is only 10% better, but overall throughput
# at the maximum number of threads for a given CPU exceeds that of
# SHA256 by 30% [again, the optimal coefficient is 50%].
#
# (*) Unlike on pre-T1 UltraSPARC, loads on T1 are executed strictly
#     in-order, i.e. a load instruction has to complete before the next
#     instruction in the given thread is executed, even if the latter
#     is not dependent on the load result! This means that on T1 two
#     32-bit loads are always slower than one 64-bit load. Once again
#     this is unlike pre-T1 UltraSPARC, where, if scheduled
#     appropriately, 2x32-bit loads can be as fast as 1x64-bit ones.
#
# SPARC T4 SHA256/512 hardware achieves 3.17/2.01 cycles per byte,
# which is 9.3x/11.1x faster than software. Multi-process benchmark
# saturates at 11.5x single-process result on 8-core processor, or
# ~11/16GBps per 2.85GHz socket.

$output=shift;
open STDOUT,">$output";

if ($output =~ /512/) {
	$label="512";
	$SZ=8;
	$LD="ldx";	# load from memory
	$ST="stx";	# store to memory
	$SLL="sllx";	# shift left logical
	$SRL="srlx";	# shift right logical
	@Sigma0=(28,34,39);
	@Sigma1=(14,18,41);
	@sigma0=( 7, 1, 8);	# right shift first
	@sigma1=( 6,19,61);	# right shift first
	$lastK=0x817;
	$rounds=80;
	$align=4;

	$locals=16*$SZ;	# X[16]

	$A="%o0";
	$B="%o1";
	$C="%o2";
	$D="%o3";
	$E="%o4";
	$F="%o5";
	$G="%g1";
	$H="%o7";
	@V=($A,$B,$C,$D,$E,$F,$G,$H);
} else {
	$label="256";
	$SZ=4;
	$LD="ld";	# load from memory
	$ST="st";	# store to memory
	$SLL="sll";	# shift left logical
	$SRL="srl";	# shift right logical
	@Sigma0=( 2,13,22);
	@Sigma1=( 6,11,25);
	@sigma0=( 3, 7,18);	# right shift first
	@sigma1=(10,17,19);	# right shift first
	$lastK=0x8f2;
	$rounds=64;
	$align=8;

	$locals=0;	# X[16] is register resident
	@X=("%o0","%o1","%o2","%o3","%o4","%o5","%g1","%o7");

	$A="%l0";
	$B="%l1";
	$C="%l2";
	$D="%l3";
	$E="%l4";
	$F="%l5";
	$G="%l6";
	$H="%l7";
	@V=($A,$B,$C,$D,$E,$F,$G,$H);
}
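
# A note on the Sigma/sigma tables above: each entry is a rotate amount
# (except the first lower-case sigma entry, which is a plain right
# shift, hence the "right shift first" comments), and rotates are
# synthesized in BODY_00_15 below from shift pairs on the $SZ*8-bit
# word:
#
#	rotr(x,n) == (x >> n) | (x << ($SZ*8 - n))
#
# which is why every $SRL by @Sigma0[k]/@Sigma1[k] is paired with an
# $SLL by $SZ*8 minus that amount, and the partial results are XOR-ed
# together.
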
$T1="%g2";
$tmp0="%g3";
$tmp1="%g4";
$tmp2="%g5";

$ctx="%i0";
$inp="%i1";
$len="%i2";
$Ktbl="%i3";
$tmp31="%i4";
$tmp32="%i5";

########### SHA256
$Xload = sub {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;

	if ($i==0) {
$code.=<<___;
	ldx [$inp+0],@X[0]
	ldx [$inp+16],@X[2]
	ldx [$inp+32],@X[4]
	ldx [$inp+48],@X[6]
	ldx [$inp+8],@X[1]
	ldx [$inp+24],@X[3]
	subcc %g0,$tmp31,$tmp32 ! should be 64-$tmp31, but -$tmp31 works too
	ldx [$inp+40],@X[5]
	bz,pt %icc,.Laligned
	ldx [$inp+56],@X[7]

	sllx @X[0],$tmp31,@X[0]
	ldx [$inp+64],$T1
___
for($j=0;$j<7;$j++)
{ $code.=<<___;
	srlx @X[$j+1],$tmp32,$tmp1
	sllx @X[$j+1],$tmp31,@X[$j+1]
	or $tmp1,@X[$j],@X[$j]
___
}
$code.=<<___;
	srlx $T1,$tmp32,$T1
	or $T1,@X[7],@X[7]
.Laligned:
___
	}

	if ($i&1) {
		$code.="\tadd @X[$i/2],$h,$T1\n";
	} else {
		$code.="\tsrlx @X[$i/2],32,$T1\n\tadd $h,$T1,$T1\n";
	}
} if ($SZ==4);

########### SHA512
$Xload = sub {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
my @pair=("%l".eval(($i*2)%8),"%l".eval(($i*2)%8+1),"%l".eval((($i+1)*2)%8));

$code.=<<___ if ($i==0);
	ld [$inp+0],%l0
	ld [$inp+4],%l1
	ld [$inp+8],%l2
	ld [$inp+12],%l3
	ld [$inp+16],%l4
	ld [$inp+20],%l5
	ld [$inp+24],%l6
	cmp $tmp31,0
	ld [$inp+28],%l7
___
$code.=<<___ if ($i<15);
	sllx @pair[1],$tmp31,$tmp2 ! Xload($i)
	add $tmp31,32,$tmp0
	sllx @pair[0],$tmp0,$tmp1
	`"ld [$inp+".eval(32+0+$i*8)."],@pair[0]" if ($i<12)`
	srlx @pair[2],$tmp32,@pair[1]
	or $tmp1,$tmp2,$tmp2
	or @pair[1],$tmp2,$tmp2
	`"ld [$inp+".eval(32+4+$i*8)."],@pair[1]" if ($i<12)`
	add $h,$tmp2,$T1
	$ST $tmp2,[%sp+STACK_BIAS+STACK_FRAME+`$i*$SZ`]
___
$code.=<<___ if ($i==12);
	bnz,a,pn %icc,.+8
	ld [$inp+128],%l0
___
$code.=<<___ if ($i==15);
	ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+1)%16)*$SZ+0`],%l2
	sllx @pair[1],$tmp31,$tmp2 ! Xload($i)
	add $tmp31,32,$tmp0
	ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+1)%16)*$SZ+4`],%l3
	sllx @pair[0],$tmp0,$tmp1
	ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+9)%16)*$SZ+0`],%l4
	srlx @pair[2],$tmp32,@pair[1]
	or $tmp1,$tmp2,$tmp2
	ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+9)%16)*$SZ+4`],%l5
	or @pair[1],$tmp2,$tmp2
	ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+14)%16)*$SZ+0`],%l6
	add $h,$tmp2,$T1
	$ST $tmp2,[%sp+STACK_BIAS+STACK_FRAME+`$i*$SZ`]
	ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+14)%16)*$SZ+4`],%l7
	ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+0)%16)*$SZ+0`],%l0
	ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+0)%16)*$SZ+4`],%l1
___
} if ($SZ==8);

########### common
sub BODY_00_15 {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;

	if ($i<16) {
		&$Xload(@_);
	} else {
		$code.="\tadd $h,$T1,$T1\n";
	}

$code.=<<___;
	$SRL $e,@Sigma1[0],$h !! $i
	xor $f,$g,$tmp2
	$SLL $e,`$SZ*8-@Sigma1[2]`,$tmp1
	and $e,$tmp2,$tmp2
	$SRL $e,@Sigma1[1],$tmp0
	xor $tmp1,$h,$h
	$SLL $e,`$SZ*8-@Sigma1[1]`,$tmp1
	xor $tmp0,$h,$h
	$SRL $e,@Sigma1[2],$tmp0
	xor $tmp1,$h,$h
	$SLL $e,`$SZ*8-@Sigma1[0]`,$tmp1
	xor $tmp0,$h,$h
	xor $g,$tmp2,$tmp2 ! Ch(e,f,g)
	xor $tmp1,$h,$tmp0 ! Sigma1(e)

	$SRL $a,@Sigma0[0],$h
	add $tmp2,$T1,$T1
	$LD [$Ktbl+`$i*$SZ`],$tmp2 ! K[$i]
	$SLL $a,`$SZ*8-@Sigma0[2]`,$tmp1
	add $tmp0,$T1,$T1
	$SRL $a,@Sigma0[1],$tmp0
	xor $tmp1,$h,$h
	$SLL $a,`$SZ*8-@Sigma0[1]`,$tmp1
	xor $tmp0,$h,$h
	$SRL $a,@Sigma0[2],$tmp0
	xor $tmp1,$h,$h
	$SLL $a,`$SZ*8-@Sigma0[0]`,$tmp1
	xor $tmp0,$h,$h
	xor $tmp1,$h,$h ! Sigma0(a)

	or $a,$b,$tmp0
	and $a,$b,$tmp1
	and $c,$tmp0,$tmp0
	or $tmp0,$tmp1,$tmp1 ! Maj(a,b,c)
	add $tmp2,$T1,$T1 ! +=K[$i]
	add $tmp1,$h,$h

	add $T1,$d,$d
	add $T1,$h,$h
___
}
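
# For reference, the round body above uses the boolean identities
#
#	Ch(e,f,g)  == ((f ^ g) & e) ^ g
#	Maj(a,b,c) == ((a | b) & c) | (a & b)
#
# (easy to check by case analysis) instead of the textbook
# (e&f)^(~e&g) and (a&b)^(a&c)^(b&c) forms; they take fewer
# instructions and map directly onto the xor/and/or sequences emitted
# above.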

########### SHA256
$BODY_16_XX = sub {
my $i=@_[0];
my $xi;

	if ($i&1) {
		$xi=$tmp32;
		$code.="\tsrlx @X[(($i+1)/2)%8],32,$xi\n";
	} else {
		$xi=@X[(($i+1)/2)%8];
	}
$code.=<<___;
	srl $xi,@sigma0[0],$T1 !! Xupdate($i)
	sll $xi,`32-@sigma0[2]`,$tmp1
	srl $xi,@sigma0[1],$tmp0
	xor $tmp1,$T1,$T1
	sll $tmp1,`@sigma0[2]-@sigma0[1]`,$tmp1
	xor $tmp0,$T1,$T1
	srl $xi,@sigma0[2],$tmp0
	xor $tmp1,$T1,$T1
___
	if ($i&1) {
		$xi=@X[(($i+14)/2)%8];
	} else {
		$xi=$tmp32;
		$code.="\tsrlx @X[(($i+14)/2)%8],32,$xi\n";
	}
$code.=<<___;
	srl $xi,@sigma1[0],$tmp2
	xor $tmp0,$T1,$T1 ! T1=sigma0(X[i+1])
	sll $xi,`32-@sigma1[2]`,$tmp1
	srl $xi,@sigma1[1],$tmp0
	xor $tmp1,$tmp2,$tmp2
	sll $tmp1,`@sigma1[2]-@sigma1[1]`,$tmp1
	xor $tmp0,$tmp2,$tmp2
	srl $xi,@sigma1[2],$tmp0
	xor $tmp1,$tmp2,$tmp2
___
	if ($i&1) {
		$xi=@X[($i/2)%8];
$code.=<<___;
	srlx @X[(($i+9)/2)%8],32,$tmp1 ! X[i+9]
	xor $tmp0,$tmp2,$tmp2 ! sigma1(X[i+14])
	srl @X[($i/2)%8],0,$tmp0
	add $tmp2,$tmp1,$tmp1
	add $xi,$T1,$T1 ! +=X[i]
	xor $tmp0,@X[($i/2)%8],@X[($i/2)%8]
	add $tmp1,$T1,$T1

	srl $T1,0,$T1
	or $T1,@X[($i/2)%8],@X[($i/2)%8]
___
	} else {
		$xi=@X[(($i+9)/2)%8];
$code.=<<___;
	srlx @X[($i/2)%8],32,$tmp1 ! X[i]
	xor $tmp0,$tmp2,$tmp2 ! sigma1(X[i+14])
	add $xi,$T1,$T1 ! +=X[i+9]
	add $tmp2,$tmp1,$tmp1
	srl @X[($i/2)%8],0,@X[($i/2)%8]
	add $tmp1,$T1,$T1

	sllx $T1,32,$tmp0
	or $tmp0,@X[($i/2)%8],@X[($i/2)%8]
___
	}
	&BODY_00_15(@_);
} if ($SZ==4);

########### SHA512
$BODY_16_XX = sub {
my $i=@_[0];
my @pair=("%l".eval(($i*2)%8),"%l".eval(($i*2)%8+1));

$code.=<<___;
	sllx %l2,32,$tmp0 !! Xupdate($i)
	or %l3,$tmp0,$tmp0

	srlx $tmp0,@sigma0[0],$T1
	ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+1)%16)*$SZ+0`],%l2
	sllx $tmp0,`64-@sigma0[2]`,$tmp1
	ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+1)%16)*$SZ+4`],%l3
	srlx $tmp0,@sigma0[1],$tmp0
	xor $tmp1,$T1,$T1
	sllx $tmp1,`@sigma0[2]-@sigma0[1]`,$tmp1
	xor $tmp0,$T1,$T1
	srlx $tmp0,`@sigma0[2]-@sigma0[1]`,$tmp0
	xor $tmp1,$T1,$T1
	sllx %l6,32,$tmp2
	xor $tmp0,$T1,$T1 ! sigma0(X[$i+1])
	or %l7,$tmp2,$tmp2

	srlx $tmp2,@sigma1[0],$tmp1
	ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+14)%16)*$SZ+0`],%l6
	sllx $tmp2,`64-@sigma1[2]`,$tmp0
	ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+14)%16)*$SZ+4`],%l7
	srlx $tmp2,@sigma1[1],$tmp2
	xor $tmp0,$tmp1,$tmp1
	sllx $tmp0,`@sigma1[2]-@sigma1[1]`,$tmp0
	xor $tmp2,$tmp1,$tmp1
	srlx $tmp2,`@sigma1[2]-@sigma1[1]`,$tmp2
	xor $tmp0,$tmp1,$tmp1
	sllx %l4,32,$tmp0
	xor $tmp2,$tmp1,$tmp1 ! sigma1(X[$i+14])
	ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+9)%16)*$SZ+0`],%l4
	or %l5,$tmp0,$tmp0
	ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+9)%16)*$SZ+4`],%l5

	sllx %l0,32,$tmp2
	add $tmp1,$T1,$T1
	ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+0)%16)*$SZ+0`],%l0
	or %l1,$tmp2,$tmp2
	add $tmp0,$T1,$T1 ! +=X[$i+9]
	ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+0)%16)*$SZ+4`],%l1
	add $tmp2,$T1,$T1 ! +=X[$i]
	$ST $T1,[%sp+STACK_BIAS+STACK_FRAME+`($i%16)*$SZ`]
___
	&BODY_00_15(@_);
} if ($SZ==8);
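
# In both variants of $BODY_16_XX the schedule follows the FIPS 180-4
# recurrence, written here relative to the current round index:
#
#	X[i+16] = X[i] + sigma0(X[i+1]) + X[i+9] + sigma1(X[i+14])
#
# which is what the "+=X[i]", "+=X[i+9]" and sigma0/sigma1 annotations
# in the generated code refer to; the new word overwrites slot i%16,
# a register half for SHA256 and a stack slot for SHA512.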

$code.=<<___;
#include "sparc_arch.h"

#ifdef __arch64__
.register %g2,#scratch
.register %g3,#scratch
#endif

.section ".text",#alloc,#execinstr

.align 64
K${label}:
.type K${label},#object
___
if ($SZ==4) {
$code.=<<___;
	.long 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
	.long 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5
	.long 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3
	.long 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174
	.long 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc
	.long 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da
	.long 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7
	.long 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967
	.long 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13
	.long 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85
	.long 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3
	.long 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070
	.long 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5
	.long 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
	.long 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
	.long 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
___
} else {
$code.=<<___;
	.long 0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd
	.long 0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc
	.long 0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019
	.long 0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118
	.long 0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe
	.long 0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2
	.long 0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1
	.long 0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694
	.long 0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3
	.long 0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65
	.long 0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483
	.long 0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5
	.long 0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210
	.long 0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4
	.long 0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725
	.long 0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70
	.long 0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926
	.long 0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df
	.long 0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8
	.long 0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b
	.long 0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001
	.long 0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30
	.long 0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910
	.long 0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8
	.long 0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53
	.long 0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8
	.long 0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb
	.long 0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3
	.long 0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60
	.long 0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec
	.long 0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9
	.long 0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b
	.long 0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207
	.long 0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178
	.long 0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6
	.long 0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b
	.long 0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493
	.long 0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c
	.long 0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a
	.long 0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817
___
}
$code.=<<___;
.size K${label},.-K${label}

#ifdef __PIC__
SPARC_PIC_THUNK(%g1)
#endif

.globl sha${label}_block_data_order
.align 32
sha${label}_block_data_order:
	SPARC_LOAD_ADDRESS_LEAF(OPENSSL_sparcv9cap_P,%g1,%g5)
	ld [%g1+4],%g1 ! OPENSSL_sparcv9cap_P[1]

	andcc %g1, CFR_SHA${label}, %g0
	be .Lsoftware
	nop
___
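# The hardware code below (only the variant matching $SZ is emitted)
# keeps the hash state in the low %f registers and the message block in
# the %f registers following it, and issues the T4 SHA instruction as a
# raw constant (.word 0x81b02860 for SHA512, 0x81b02840 for SHA256), so
# no SHA-aware assembler is needed; misaligned input is handled with
# alignaddr/faligndata.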
$code.=<<___ if ($SZ==8); # SHA512
	ldd [%o0 + 0x00], %f0 ! load context
	ldd [%o0 + 0x08], %f2
	ldd [%o0 + 0x10], %f4
	ldd [%o0 + 0x18], %f6
	ldd [%o0 + 0x20], %f8
	ldd [%o0 + 0x28], %f10
	andcc %o1, 0x7, %g0
	ldd [%o0 + 0x30], %f12
	bne,pn %icc, .Lhwunaligned
	ldd [%o0 + 0x38], %f14

.Lhwaligned_loop:
	ldd [%o1 + 0x00], %f16
	ldd [%o1 + 0x08], %f18
	ldd [%o1 + 0x10], %f20
	ldd [%o1 + 0x18], %f22
	ldd [%o1 + 0x20], %f24
	ldd [%o1 + 0x28], %f26
	ldd [%o1 + 0x30], %f28
	ldd [%o1 + 0x38], %f30
	ldd [%o1 + 0x40], %f32
	ldd [%o1 + 0x48], %f34
	ldd [%o1 + 0x50], %f36
	ldd [%o1 + 0x58], %f38
	ldd [%o1 + 0x60], %f40
	ldd [%o1 + 0x68], %f42
	ldd [%o1 + 0x70], %f44
	subcc %o2, 1, %o2 ! done yet?
	ldd [%o1 + 0x78], %f46
	add %o1, 0x80, %o1
	prefetch [%o1 + 63], 20
	prefetch [%o1 + 64+63], 20

	.word 0x81b02860 ! SHA512

	bne,pt SIZE_T_CC, .Lhwaligned_loop
	nop

.Lhwfinish:
	std %f0, [%o0 + 0x00] ! store context
	std %f2, [%o0 + 0x08]
	std %f4, [%o0 + 0x10]
	std %f6, [%o0 + 0x18]
	std %f8, [%o0 + 0x20]
	std %f10, [%o0 + 0x28]
	std %f12, [%o0 + 0x30]
	retl
	std %f14, [%o0 + 0x38]

.align 16
.Lhwunaligned:
	alignaddr %o1, %g0, %o1

	ldd [%o1 + 0x00], %f18
.Lhwunaligned_loop:
	ldd [%o1 + 0x08], %f20
	ldd [%o1 + 0x10], %f22
	ldd [%o1 + 0x18], %f24
	ldd [%o1 + 0x20], %f26
	ldd [%o1 + 0x28], %f28
	ldd [%o1 + 0x30], %f30
	ldd [%o1 + 0x38], %f32
	ldd [%o1 + 0x40], %f34
	ldd [%o1 + 0x48], %f36
	ldd [%o1 + 0x50], %f38
	ldd [%o1 + 0x58], %f40
	ldd [%o1 + 0x60], %f42
	ldd [%o1 + 0x68], %f44
	ldd [%o1 + 0x70], %f46
	ldd [%o1 + 0x78], %f48
	subcc %o2, 1, %o2 ! done yet?
	ldd [%o1 + 0x80], %f50
	add %o1, 0x80, %o1
	prefetch [%o1 + 63], 20
	prefetch [%o1 + 64+63], 20

	faligndata %f18, %f20, %f16
	faligndata %f20, %f22, %f18
	faligndata %f22, %f24, %f20
	faligndata %f24, %f26, %f22
	faligndata %f26, %f28, %f24
	faligndata %f28, %f30, %f26
	faligndata %f30, %f32, %f28
	faligndata %f32, %f34, %f30
	faligndata %f34, %f36, %f32
	faligndata %f36, %f38, %f34
	faligndata %f38, %f40, %f36
	faligndata %f40, %f42, %f38
	faligndata %f42, %f44, %f40
	faligndata %f44, %f46, %f42
	faligndata %f46, %f48, %f44
	faligndata %f48, %f50, %f46

	.word 0x81b02860 ! SHA512

	bne,pt SIZE_T_CC, .Lhwunaligned_loop
	for %f50, %f50, %f18 ! %f18=%f50

	ba .Lhwfinish
	nop
___
$code.=<<___ if ($SZ==4); # SHA256
	ld [%o0 + 0x00], %f0
	ld [%o0 + 0x04], %f1
	ld [%o0 + 0x08], %f2
	ld [%o0 + 0x0c], %f3
	ld [%o0 + 0x10], %f4
	ld [%o0 + 0x14], %f5
	andcc %o1, 0x7, %g0
	ld [%o0 + 0x18], %f6
	bne,pn %icc, .Lhwunaligned
	ld [%o0 + 0x1c], %f7

.Lhwloop:
	ldd [%o1 + 0x00], %f8
	ldd [%o1 + 0x08], %f10
	ldd [%o1 + 0x10], %f12
	ldd [%o1 + 0x18], %f14
	ldd [%o1 + 0x20], %f16
	ldd [%o1 + 0x28], %f18
	ldd [%o1 + 0x30], %f20
	subcc %o2, 1, %o2 ! done yet?
	ldd [%o1 + 0x38], %f22
	add %o1, 0x40, %o1
	prefetch [%o1 + 63], 20

	.word 0x81b02840 ! SHA256

	bne,pt SIZE_T_CC, .Lhwloop
	nop

.Lhwfinish:
	st %f0, [%o0 + 0x00] ! store context
	st %f1, [%o0 + 0x04]
	st %f2, [%o0 + 0x08]
	st %f3, [%o0 + 0x0c]
	st %f4, [%o0 + 0x10]
	st %f5, [%o0 + 0x14]
	st %f6, [%o0 + 0x18]
	retl
	st %f7, [%o0 + 0x1c]

.align 8
.Lhwunaligned:
	alignaddr %o1, %g0, %o1

	ldd [%o1 + 0x00], %f10
.Lhwunaligned_loop:
	ldd [%o1 + 0x08], %f12
	ldd [%o1 + 0x10], %f14
	ldd [%o1 + 0x18], %f16
	ldd [%o1 + 0x20], %f18
	ldd [%o1 + 0x28], %f20
	ldd [%o1 + 0x30], %f22
	ldd [%o1 + 0x38], %f24
	subcc %o2, 1, %o2 ! done yet?
	ldd [%o1 + 0x40], %f26
	add %o1, 0x40, %o1
	prefetch [%o1 + 63], 20

	faligndata %f10, %f12, %f8
	faligndata %f12, %f14, %f10
	faligndata %f14, %f16, %f12
	faligndata %f16, %f18, %f14
	faligndata %f18, %f20, %f16
	faligndata %f20, %f22, %f18
	faligndata %f22, %f24, %f20
	faligndata %f24, %f26, %f22

	.word 0x81b02840 ! SHA256

	bne,pt SIZE_T_CC, .Lhwunaligned_loop
	for %f26, %f26, %f10 ! %f10=%f26

	ba .Lhwfinish
	nop
___
$code.=<<___;
.align 16
.Lsoftware:
	save %sp,-STACK_FRAME-$locals,%sp
	and $inp,`$align-1`,$tmp31
	sllx $len,`log(16*$SZ)/log(2)`,$len
	andn $inp,`$align-1`,$inp
	sll $tmp31,3,$tmp31
	add $inp,$len,$len
___
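# Reading of the prologue above: $tmp31 ends up holding 8*(byte
# misalignment of $inp), i.e. the bit offset of the first message byte
# within an $align-byte word; $inp itself is rounded down to an $align
# boundary; and $len is converted from a block count into the stop
# address aligned($inp) + 16*$SZ*blocks, which the tail of .Lloop
# compares against.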
$code.=<<___ if ($SZ==8); # SHA512
	mov 32,$tmp32
	sub $tmp32,$tmp31,$tmp32
___
$code.=<<___;
.Lpic: call .+8
	add %o7,K${label}-.Lpic,$Ktbl

	$LD [$ctx+`0*$SZ`],$A
	$LD [$ctx+`1*$SZ`],$B
	$LD [$ctx+`2*$SZ`],$C
	$LD [$ctx+`3*$SZ`],$D
	$LD [$ctx+`4*$SZ`],$E
	$LD [$ctx+`5*$SZ`],$F
	$LD [$ctx+`6*$SZ`],$G
	$LD [$ctx+`7*$SZ`],$H

.Lloop:
___
for ($i=0;$i<16;$i++) { &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
$code.=".L16_xx:\n";
for (;$i<32;$i++) { &$BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	and $tmp2,0xfff,$tmp2
	cmp $tmp2,$lastK
	bne .L16_xx
	add $Ktbl,`16*$SZ`,$Ktbl ! Ktbl+=16

___
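# The comparison against $lastK doubles as the loop counter: $tmp2
# still holds the round constant loaded by the last unrolled round, and
# $lastK (0x817 for SHA512, 0x8f2 for SHA256) is the low 12 bits of the
# final constant K[$rounds-1], so .L16_xx is left exactly after the
# last 16-round batch without needing a separate counter register.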
$code.=<<___ if ($SZ==4); # SHA256
	$LD [$ctx+`0*$SZ`],@X[0]
	$LD [$ctx+`1*$SZ`],@X[1]
	$LD [$ctx+`2*$SZ`],@X[2]
	$LD [$ctx+`3*$SZ`],@X[3]
	$LD [$ctx+`4*$SZ`],@X[4]
	$LD [$ctx+`5*$SZ`],@X[5]
	$LD [$ctx+`6*$SZ`],@X[6]
	$LD [$ctx+`7*$SZ`],@X[7]

	add $A,@X[0],$A
	$ST $A,[$ctx+`0*$SZ`]
	add $B,@X[1],$B
	$ST $B,[$ctx+`1*$SZ`]
	add $C,@X[2],$C
	$ST $C,[$ctx+`2*$SZ`]
	add $D,@X[3],$D
	$ST $D,[$ctx+`3*$SZ`]
	add $E,@X[4],$E
	$ST $E,[$ctx+`4*$SZ`]
	add $F,@X[5],$F
	$ST $F,[$ctx+`5*$SZ`]
	add $G,@X[6],$G
	$ST $G,[$ctx+`6*$SZ`]
	add $H,@X[7],$H
	$ST $H,[$ctx+`7*$SZ`]
___
$code.=<<___ if ($SZ==8); # SHA512
	ld [$ctx+`0*$SZ+0`],%l0
	ld [$ctx+`0*$SZ+4`],%l1
	ld [$ctx+`1*$SZ+0`],%l2
	ld [$ctx+`1*$SZ+4`],%l3
	ld [$ctx+`2*$SZ+0`],%l4
	ld [$ctx+`2*$SZ+4`],%l5
	ld [$ctx+`3*$SZ+0`],%l6

	sllx %l0,32,$tmp0
	ld [$ctx+`3*$SZ+4`],%l7
	sllx %l2,32,$tmp1
	or %l1,$tmp0,$tmp0
	or %l3,$tmp1,$tmp1
	add $tmp0,$A,$A
	add $tmp1,$B,$B
	$ST $A,[$ctx+`0*$SZ`]
	sllx %l4,32,$tmp2
	$ST $B,[$ctx+`1*$SZ`]
	sllx %l6,32,$T1
	or %l5,$tmp2,$tmp2
	or %l7,$T1,$T1
	add $tmp2,$C,$C
	$ST $C,[$ctx+`2*$SZ`]
	add $T1,$D,$D
	$ST $D,[$ctx+`3*$SZ`]

	ld [$ctx+`4*$SZ+0`],%l0
	ld [$ctx+`4*$SZ+4`],%l1
	ld [$ctx+`5*$SZ+0`],%l2
	ld [$ctx+`5*$SZ+4`],%l3
	ld [$ctx+`6*$SZ+0`],%l4
	ld [$ctx+`6*$SZ+4`],%l5
	ld [$ctx+`7*$SZ+0`],%l6

	sllx %l0,32,$tmp0
	ld [$ctx+`7*$SZ+4`],%l7
	sllx %l2,32,$tmp1
	or %l1,$tmp0,$tmp0
	or %l3,$tmp1,$tmp1
	add $tmp0,$E,$E
	add $tmp1,$F,$F
	$ST $E,[$ctx+`4*$SZ`]
	sllx %l4,32,$tmp2
	$ST $F,[$ctx+`5*$SZ`]
	sllx %l6,32,$T1
	or %l5,$tmp2,$tmp2
	or %l7,$T1,$T1
	add $tmp2,$G,$G
	$ST $G,[$ctx+`6*$SZ`]
	add $T1,$H,$H
	$ST $H,[$ctx+`7*$SZ`]
___
$code.=<<___;
	add $inp,`16*$SZ`,$inp ! advance inp
	cmp $inp,$len
	bne SIZE_T_CC,.Lloop
	sub $Ktbl,`($rounds-16)*$SZ`,$Ktbl ! rewind Ktbl

	ret
	restore
.type sha${label}_block_data_order,#function
.size sha${label}_block_data_order,(.-sha${label}_block_data_order)
.asciz "SHA${label} block transform for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
.align 4
___

# The purpose of these subroutines is to explicitly encode VIS
# instructions, so that one can compile the module without having to
# specify VIS extensions on the compiler command line, e.g. -xarch=v9
# vs. -xarch=v9a. The idea is to keep open the option of producing a
# "universal" binary and to let the programmer detect at run-time
# whether the current CPU is VIS-capable.
sub unvis {
my ($mnemonic,$rs1,$rs2,$rd)=@_;
my ($ref,$opf);
my %visopf = (	"faligndata"	=> 0x048,
		"for"		=> 0x07c	);

    $ref = "$mnemonic\t$rs1,$rs2,$rd";

    if ($opf=$visopf{$mnemonic}) {
	foreach ($rs1,$rs2,$rd) {
	    return $ref if (!/%f([0-9]{1,2})/);
	    $_=$1;
	    if ($1>=32) {
		return $ref if ($1&1);
		# re-encode for upper double register addressing
		$_=($1|$1>>5)&31;
	    }
	}

	return sprintf ".word\t0x%08x !%s",
			0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
			$ref;
    } else {
	return $ref;
    }
}
sub unalignaddr {
my ($mnemonic,$rs1,$rs2,$rd)=@_;
my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
my $ref="$mnemonic\t$rs1,$rs2,$rd";

    foreach ($rs1,$rs2,$rd) {
	if (/%([goli])([0-7])/)	{ $_=$bias{$1}+$2; }
	else			{ return $ref; }
    }
    return sprintf ".word\t0x%08x !%s",
		    0x81b00300|$rd<<25|$rs1<<14|$rs2,
		    $ref;
}
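
# For instance, by the arithmetic above, "faligndata %f18,%f20,%f16"
# is emitted as
#
#	.word	0x81b00000|16<<25|18<<14|0x048<<5|20	! = 0xa1b48914
#
# and "alignaddr %o1,%g0,%o1" (rs1 and rd both 8+1) as 0x93b24300, so
# the generated assembly never requires a VIS-aware assembler.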

foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/ge;

	s/\b(f[^\s]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/
		&unvis($1,$2,$3,$4)
	 /ge;
	s/\b(alignaddr)\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/
		&unalignaddr($1,$2,$3,$4)
	 /ge;

	print $_,"\n";
}

close STDOUT;