]>
Commit | Line | Data |
---|---|---|
6aa36e8e RS |
1 | #! /usr/bin/env perl |
2 | # Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved. | |
3 | # | |
a598ed0d | 4 | # Licensed under the Apache License 2.0 (the "License"). You may not use |
6aa36e8e RS |
5 | # this file except in compliance with the License. You can obtain a copy |
6 | # in the file LICENSE in the source distribution or at | |
7 | # https://www.openssl.org/source/license.html | |
8 | ||
6fa8a01c AP |
9 | |
10 | # ==================================================================== | |
e3713c36 | 11 | # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL |
6fa8a01c AP |
12 | # project. The module is, however, dual licensed under OpenSSL and |
13 | # CRYPTOGAMS licenses depending on where you obtain it. For further | |
14 | # details see http://www.openssl.org/~appro/cryptogams/. | |
3ed6e227 | 15 | # |
e3713c36 | 16 | # Hardware SPARC T4 support by David S. Miller |
6fa8a01c AP |
17 | # ==================================================================== |
18 | ||
19 | # Performance improvement is not really impressive on pre-T1 CPU: +8% | |
0bd8d6e2 AP |
20 | # over Sun C and +25% over gcc [3.3]. While on T1, a.k.a. Niagara, it |
21 | # turned out to be 40% faster than 64-bit code generated by Sun C 5.8 and | |
22 | # >2x than 64-bit code generated by gcc 3.4. And there is a gimmick. | |
23 | # X[16] vector is packed into 8 64-bit registers and as a result nothing | |
24 | # is spilled on stack. In addition input data is loaded in compact | |
25 | # instruction sequence, thus minimizing the window when the code is | |
26 | # subject to [inter-thread] cache-thrashing hazard. The goal is to | |
27 | # ensure scalability on UltraSPARC T1, or rather to avoid decay when | |
28 | # amount of active threads exceeds the number of physical cores. | |
6fa8a01c | 29 | |
3ed6e227 AP |
30 | # SPARC T4 SHA1 hardware achieves 3.72 cycles per byte, which is 3.1x |
31 | # faster than software. Multi-process benchmark saturates at 11x | |
32 | # single-process result on 8-core processor, or ~9GBps per 2.85GHz | |
33 | # socket. | |
34 | ||
# The last command-line argument names the file that receives the
# generated assembly.  Everything is emitted with plain print, so STDOUT
# is redirected to that file.  The redirection is only attempted when a
# name was actually supplied, and a failure to open it is fatal instead
# of being silently ignored.
$output=pop;
if (defined($output) && $output ne "") {
    open(STDOUT,'>',$output) or die "can't open $output: $!";
}

# Register allocation for the software code path.
#
# The sixteen 32-bit message words live pairwise in eight 64-bit registers.
@X=("%o0","%o1","%o2","%o3","%o4","%o5","%g1","%o7");
$rot1m="%g2";		# mask used by Xupdate for the per-half rotate by 1
$tmp64="%g3";		# extra doubleword fetched when the input is misaligned
$Xi="%g4";		# scratch for the message word of the current round
# Working variables; @V is rotated after every round.
$A="%l0";
$B="%l1";
$C="%l2";
$D="%l3";
$E="%l4";
@V=($A,$B,$C,$D,$E);
# Round constants, one register per group of twenty rounds.
$K_00_19="%l5";
$K_20_39="%l6";
$K_40_59="%l7";
$K_60_79="%g5";
@K=($K_00_19,$K_20_39,$K_40_59,$K_60_79);

# Incoming arguments as seen by the software path, followed by three
# general-purpose temporaries.
$ctx="%i0";
$inp="%i1";
$len="%i2";
$tmp0="%i3";
$tmp1="%i4";
$tmp2="%i5";
60 | ||
# Emit one of the first sixteen rounds.
#
#   $round       round number, 0..15
#   $va .. $ve   registers holding the five working variables in their
#                current rotation
#
# The message word is taken straight from the input registers: an odd
# round adds the whole @X register of its pair, an even round adds $Xi,
# which is expected to hold the upper half of that register already.
# Every odd round except the last therefore also emits the srlx that
# prepares $Xi for the following even round.  Output is appended to $code.
sub BODY_00_15 {
    my ($round,$va,$vb,$vc,$vd,$ve)=@_;
    my $odd  = $round&1;
    my $kreg = $K[$round/20];
    my $word = $odd ? $X[($round/2)%8] : $Xi;

    my @insn = (
        "sll\t$va,5,$tmp0\t\t!! $round",
        "add\t$kreg,$ve,$ve",
        "srl\t$va,27,$tmp1",
        "add\t$tmp0,$ve,$ve",
        "and\t$vc,$vb,$tmp0",
        "add\t$tmp1,$ve,$ve",
        "sll\t$vb,30,$tmp2",
        "andn\t$vd,$vb,$tmp1",
        "srl\t$vb,2,$vb",
        "or\t$tmp1,$tmp0,$tmp1",
        "or\t$tmp2,$vb,$vb",
        "add\t$word,$ve,$ve",
    );
    if ($odd && $round<15) {
        my $next = $X[(($round+1)/2)%8];
        push @insn, "srlx\t$next,32,$Xi";
    }
    push @insn, "add\t$tmp1,$ve,$ve";

    $code .= "\t$_\n" foreach @insn;
}
87 | ||
# Emit the opening of a round from 16 onwards and, on even rounds, the
# message-schedule update for the register pair that round and the next
# one consume.
#
#   $round       round number
#   $va .. $ve   registers holding the five working variables
#
# Odd rounds only get the three instructions that start the round (the
# shifted copies of $va and the round constant added to $ve).  Even rounds
# interleave those same three instructions, marked "!!", with the update
# itself: the target register is XORed with its neighbours at offsets 1
# and 4 and with a doubleword spliced from offsets 6 and 7, after which
# both 32-bit halves are rotated left by one bit using the $rot1m mask.
# Output is appended to $code.
sub Xupdate {
    my ($round,$va,$vb,$vc,$vd,$ve)=@_;
    my $kreg=$K[$round/20];

    if ($round&1) {
        $code.="\tsll\t$va,5,$tmp0\t\t!! $round\n"
              ."\tadd\t$kreg,$ve,$ve\n"
              ."\tsrl\t$va,27,$tmp1\n";
        return;
    }

    my $pair=$round/2;
    my ($x0,$x1,$x4,$x6,$x7)=map($X[($pair+$_)%8], 0,1,4,6,7);

    $code.="\t$_\n" foreach (
        "sllx\t$x6,32,$Xi\t! Xupdate($round)",
        "xor\t$x1,$x0,$x0",
        "srlx\t$x7,32,$tmp1",
        "xor\t$x4,$x0,$x0",
        "sll\t$va,5,$tmp0\t\t!! $round",
        "or\t$tmp1,$Xi,$Xi",
        "add\t$kreg,$ve,$ve\t\t!!",
        "xor\t$Xi,$x0,$x0",
        "srlx\t$x0,31,$Xi",
        "add\t$x0,$x0,$x0",
        "and\t$Xi,$rot1m,$Xi",
        "andn\t$x0,$rot1m,$x0",
        "srl\t$va,27,$tmp1\t\t!!",
        "or\t$Xi,$x0,$x0",
    );
}
117 | ||
# Emit one of rounds 16..19.
#
#   $i           round number
#   $a .. $e     registers holding the five working variables in their
#                current rotation
#
# Xupdate emits the start of the round (and, on even rounds, the schedule
# update); this routine then picks the message word and emits the rest of
# the round, which uses the same and/andn/or combination of $b, $c and $d
# as rounds 0..15.  Output is appended to $code.
sub BODY_16_19 {
    my ($i,$a,$b,$c,$d,$e)=@_;
    # Declared lexically, as in BODY_20_39 and BODY_40_59; it used to be
    # assigned without "my" and therefore leaked into a package global.
    my $xi;

    &Xupdate(@_);
    if ($i&1) {
        # Odd round: add the freshly updated register as a whole.
        $xi=$X[($i/2)%8];
    } else {
        # Even round: the word sits in the upper half, move it down first.
        $xi=$Xi;
        $code.="\tsrlx\t$X[($i/2)%8],32,$xi\n";
    }

    $code.="\t$_\n" foreach (
        "add\t$tmp0,$e,$e\t\t!!",
        "and\t$c,$b,$tmp0",
        "add\t$tmp1,$e,$e",
        "sll\t$b,30,$tmp2",
        "add\t$xi,$e,$e",
        "andn\t$d,$b,$tmp1",
        "srl\t$b,2,$b",
        "or\t$tmp1,$tmp0,$tmp1",
        "or\t$tmp2,$b,$b",
        "add\t$tmp1,$e,$e",
    );
}
141 | ||
# Emit one round of the parity type; the driver uses it for rounds 20..39
# and again for 60..79.
#
#   $round       round number
#   $va .. $ve   registers holding the five working variables in their
#                current rotation
#
# Xupdate emits the start of the round.  The remainder emitted here adds
# the two shifted copies of $va, the XOR of $vb, $vc and $vd and the
# message word to $ve, and rotates $vb.  Output is appended to $code.
sub BODY_20_39 {
    my ($round,$va,$vb,$vc,$vd,$ve)=@_;
    my $xreg=$X[($round/2)%8];
    my $word;

    Xupdate($round,$va,$vb,$vc,$vd,$ve);

    if ($round&1) {
        $word=$xreg;
    } else {
        # Even round: bring the upper half of the register down into $Xi.
        $word=$Xi;
        $code.="\tsrlx\t$xreg,32,$word\n";
    }

    $code.=join("",
        "\tadd\t$tmp0,$ve,$ve\t\t!!\n",
        "\txor\t$vc,$vb,$tmp0\n",
        "\tadd\t$tmp1,$ve,$ve\n",
        "\tsll\t$vb,30,$tmp2\n",
        "\txor\t$vd,$tmp0,$tmp1\n",
        "\tsrl\t$vb,2,$vb\n",
        "\tadd\t$tmp1,$ve,$ve\n",
        "\tor\t$tmp2,$vb,$vb\n",
        "\tadd\t$word,$ve,$ve\n");
}
164 | ||
# Emit one of rounds 40..59.
#
#   $round       round number
#   $va .. $ve   registers holding the five working variables in their
#                current rotation
#
# Xupdate emits the start of the round.  The remainder emitted here
# combines ($vb & $vc) with (($vb | $vc) & $vd), adds that together with
# the shifted copies of $va and the message word to $ve, and rotates $vb.
# Output is appended to $code.
sub BODY_40_59 {
    my ($round,$va,$vb,$vc,$vd,$ve)=@_;
    my $xreg=$X[($round/2)%8];
    my $word=$xreg;

    Xupdate($round,$va,$vb,$vc,$vd,$ve);

    unless ($round&1) {
        # Even round: bring the upper half of the register down into $Xi.
        $word=$Xi;
        $code.="\tsrlx\t$xreg,32,$word\n";
    }

    my @tail = (
        "add\t$tmp0,$ve,$ve\t\t!!",
        "and\t$vc,$vb,$tmp0",
        "add\t$tmp1,$ve,$ve",
        "sll\t$vb,30,$tmp2",
        "or\t$vc,$vb,$tmp1",
        "srl\t$vb,2,$vb",
        "and\t$vd,$tmp1,$tmp1",
        "add\t$word,$ve,$ve",
        "or\t$tmp1,$tmp0,$tmp1",
        "or\t$tmp2,$vb,$vb",
        "add\t$tmp1,$ve,$ve",
    );
    $code.="\t".join("\n\t",@tail)."\n";
}
189 | ||
# Main assembly template, first part: file prologue, the hardware code
# path and the start of the software code path.
#
# sha1_block_data_order tests the CFR_SHA1 bit of OPENSSL_sparcv9cap_P[1]
# and branches to .Lsoftware when it is clear.  Otherwise the five context
# words are loaded into %f0-%f4 and every 64-byte block into %f8-%f22
# before the SHA1 opcode, emitted as a raw .word, is issued.  Input that
# is not 8-byte aligned is handled at .Lhwunaligned, which reads one
# doubleword more than the block and realigns the data with
# alignaddr/faligndata.
#
# .Lsoftware opens a register window, turns $len from a block count into
# an end pointer ($inp + $len*64), builds the 0x0000000100000001 mask in
# $rot1m and loads the context into $A..$E and the round constants into
# the @K registers.  .Lloop then fetches eight doublewords from $inp
# rounded down to a multiple of 8; when $inp is misaligned the two shift
# counts are left in $tmp1/$tmp2 and a ninth doubleword is fetched into
# $tmp64 for the fix-up code generated right after this template.
6fa8a01c | 190 | $code.=<<___; |
3ed6e227 AP |
191 | #include "sparc_arch.h" |
192 | ||
1efd5830 AP |
193 | #ifdef __arch64__ |
194 | .register %g2,#scratch | |
195 | .register %g3,#scratch | |
196 | #endif | |
197 | ||
6fa8a01c AP |
198 | .section ".text",#alloc,#execinstr |
199 | ||
3ed6e227 AP |
200 | #ifdef __PIC__ |
201 | SPARC_PIC_THUNK(%g1) | |
202 | #endif | |
203 | ||
6fa8a01c AP |
204 | .align 32 |
205 | .globl sha1_block_data_order | |
206 | sha1_block_data_order: | |
3ed6e227 AP |
207 | SPARC_LOAD_ADDRESS_LEAF(OPENSSL_sparcv9cap_P,%g1,%g5) |
208 | ld [%g1+4],%g1 ! OPENSSL_sparcv9cap_P[1] | |
209 | ||
210 | andcc %g1, CFR_SHA1, %g0 | |
211 | be .Lsoftware | |
212 | nop | |
213 | ||
214 | ld [%o0 + 0x00], %f0 ! load context | |
215 | ld [%o0 + 0x04], %f1 | |
216 | ld [%o0 + 0x08], %f2 | |
217 | andcc %o1, 0x7, %g0 | |
218 | ld [%o0 + 0x0c], %f3 | |
219 | bne,pn %icc, .Lhwunaligned | |
220 | ld [%o0 + 0x10], %f4 | |
221 | ||
222 | .Lhw_loop: | |
223 | ldd [%o1 + 0x00], %f8 | |
224 | ldd [%o1 + 0x08], %f10 | |
225 | ldd [%o1 + 0x10], %f12 | |
226 | ldd [%o1 + 0x18], %f14 | |
227 | ldd [%o1 + 0x20], %f16 | |
228 | ldd [%o1 + 0x28], %f18 | |
229 | ldd [%o1 + 0x30], %f20 | |
609b0852 | 230 | subcc %o2, 1, %o2 ! done yet? |
3ed6e227 AP |
231 | ldd [%o1 + 0x38], %f22 |
232 | add %o1, 0x40, %o1 | |
aea4126e | 233 | prefetch [%o1 + 63], 20 |
3ed6e227 AP |
234 | |
235 | .word 0x81b02820 ! SHA1 | |
236 | ||
1efd5830 | 237 | bne,pt SIZE_T_CC, .Lhw_loop |
3ed6e227 AP |
238 | nop |
239 | ||
240 | .Lhwfinish: | |
241 | st %f0, [%o0 + 0x00] ! store context | |
242 | st %f1, [%o0 + 0x04] | |
243 | st %f2, [%o0 + 0x08] | |
244 | st %f3, [%o0 + 0x0c] | |
245 | retl | |
246 | st %f4, [%o0 + 0x10] | |
247 | ||
248 | .align 8 | |
249 | .Lhwunaligned: | |
250 | alignaddr %o1, %g0, %o1 | |
251 | ||
252 | ldd [%o1 + 0x00], %f10 | |
253 | .Lhwunaligned_loop: | |
254 | ldd [%o1 + 0x08], %f12 | |
255 | ldd [%o1 + 0x10], %f14 | |
256 | ldd [%o1 + 0x18], %f16 | |
257 | ldd [%o1 + 0x20], %f18 | |
258 | ldd [%o1 + 0x28], %f20 | |
259 | ldd [%o1 + 0x30], %f22 | |
260 | ldd [%o1 + 0x38], %f24 | |
261 | subcc %o2, 1, %o2 ! done yet? | |
262 | ldd [%o1 + 0x40], %f26 | |
263 | add %o1, 0x40, %o1 | |
aea4126e | 264 | prefetch [%o1 + 63], 20 |
3ed6e227 AP |
265 | |
266 | faligndata %f10, %f12, %f8 | |
267 | faligndata %f12, %f14, %f10 | |
268 | faligndata %f14, %f16, %f12 | |
269 | faligndata %f16, %f18, %f14 | |
270 | faligndata %f18, %f20, %f16 | |
271 | faligndata %f20, %f22, %f18 | |
272 | faligndata %f22, %f24, %f20 | |
273 | faligndata %f24, %f26, %f22 | |
274 | ||
275 | .word 0x81b02820 ! SHA1 | |
276 | ||
1efd5830 | 277 | bne,pt SIZE_T_CC, .Lhwunaligned_loop |
3ed6e227 AP |
278 | for %f26, %f26, %f10 ! %f10=%f26 |
279 | ||
280 | ba .Lhwfinish | |
281 | nop | |
282 | ||
283 | .align 16 | |
284 | .Lsoftware: | |
1efd5830 | 285 | save %sp,-STACK_FRAME,%sp |
6fa8a01c AP |
286 | sllx $len,6,$len |
287 | add $inp,$len,$len | |
288 | ||
289 | or %g0,1,$rot1m | |
290 | sllx $rot1m,32,$rot1m | |
291 | or $rot1m,1,$rot1m | |
292 | ||
293 | ld [$ctx+0],$A | |
294 | ld [$ctx+4],$B | |
295 | ld [$ctx+8],$C | |
296 | ld [$ctx+12],$D | |
297 | ld [$ctx+16],$E | |
298 | andn $inp,7,$tmp0 | |
299 | ||
300 | sethi %hi(0x5a827999),$K_00_19 | |
301 | or $K_00_19,%lo(0x5a827999),$K_00_19 | |
302 | sethi %hi(0x6ed9eba1),$K_20_39 | |
303 | or $K_20_39,%lo(0x6ed9eba1),$K_20_39 | |
304 | sethi %hi(0x8f1bbcdc),$K_40_59 | |
305 | or $K_40_59,%lo(0x8f1bbcdc),$K_40_59 | |
306 | sethi %hi(0xca62c1d6),$K_60_79 | |
307 | or $K_60_79,%lo(0xca62c1d6),$K_60_79 | |
308 | ||
309 | .Lloop: | |
310 | ldx [$tmp0+0],@X[0] | |
311 | ldx [$tmp0+16],@X[2] | |
312 | ldx [$tmp0+32],@X[4] | |
313 | ldx [$tmp0+48],@X[6] | |
314 | and $inp,7,$tmp1 | |
315 | ldx [$tmp0+8],@X[1] | |
316 | sll $tmp1,3,$tmp1 | |
317 | ldx [$tmp0+24],@X[3] | |
318 | subcc %g0,$tmp1,$tmp2 ! should be 64-$tmp1, but -$tmp1 works too | |
319 | ldx [$tmp0+40],@X[5] | |
320 | bz,pt %icc,.Laligned | |
321 | ldx [$tmp0+56],@X[7] | |
322 | ||
323 | sllx @X[0],$tmp1,@X[0] | |
324 | ldx [$tmp0+64],$tmp64 | |
325 | ___ |
# Fix-up for misaligned input: each of the first seven registers receives
# the bits shifted out of its successor, and the last one is completed
# from the extra doubleword in $tmp64.  The aligned case branches straight
# to .Laligned, where $Xi is primed with the upper half of the first
# register for round 0.
$i=0;
while ($i<7) {
    my ($cur,$nxt)=@X[$i,$i+1];
    $code.="\tsrlx\t$nxt,$tmp2,$Xi\n"
          ."\tsllx\t$nxt,$tmp1,$nxt\n"
          ."\tor\t$Xi,$cur,$cur\n";
    $i++;
}
$code.=join("\n",
    "\tsrlx\t$tmp64,$tmp2,$tmp64",
    "\tor\t$tmp64,$X[7],$X[7]",
    ".Laligned:",
    "\tsrlx\t$X[0],32,$Xi")."\n";
# Generate all eighty rounds.  Each entry pairs the first round number
# that no longer belongs to a stage with the generator for that stage;
# the last stage reuses the generator of rounds 20..39.  After every round
# the working-variable registers are rotated by one position, so the
# generators always see them in the order they expect.
my @round_plan = (
    [ 16, \&BODY_00_15 ],
    [ 20, \&BODY_16_19 ],
    [ 40, \&BODY_20_39 ],
    [ 60, \&BODY_40_59 ],
    [ 80, \&BODY_20_39 ],
);
$i=0;
foreach my $stage (@round_plan) {
    my ($limit,$emit_round)=@$stage;
    while ($i<$limit) {
        $emit_round->($i,@V);
        @V=($V[-1],@V[0..$#V-1]);	# last register becomes the first
        $i++;
    }
}
# Main assembly template, final part: the five context words are reloaded
# into @X[0..4], added to the working variables $A..$E and the sums are
# stored back into the context.  $inp is advanced by 64 and compared with
# the end pointer kept in $len; while they differ the code branches back
# to .Lloop, with the rounded-down load address for the next block being
# recomputed in the instruction that follows the branch.  The template
# ends with the function return and the symbol type/size directives.
344 | $code.=<<___; |
345 | ||
346 | ld [$ctx+0],@X[0] | |
347 | ld [$ctx+4],@X[1] | |
348 | ld [$ctx+8],@X[2] | |
349 | ld [$ctx+12],@X[3] | |
350 | add $inp,64,$inp | |
351 | ld [$ctx+16],@X[4] | |
352 | cmp $inp,$len | |
353 | ||
354 | add $A,@X[0],$A | |
355 | st $A,[$ctx+0] | |
356 | add $B,@X[1],$B | |
357 | st $B,[$ctx+4] | |
358 | add $C,@X[2],$C | |
359 | st $C,[$ctx+8] | |
360 | add $D,@X[3],$D | |
361 | st $D,[$ctx+12] | |
362 | add $E,@X[4],$E | |
363 | st $E,[$ctx+16] | |
364 | ||
1efd5830 | 365 | bne SIZE_T_CC,.Lloop |
6fa8a01c AP |
366 | andn $inp,7,$tmp0 |
367 | ||
368 | ret | |
369 | restore | |
370 | .type sha1_block_data_order,#function | |
371 | .size sha1_block_data_order,(.-sha1_block_data_order) | |
372 | .asciz "SHA1 block transform for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>" | |
c32fcca6 | 373 | .align 4 |
6fa8a01c AP |
374 | ___ |
375 | ||
3ed6e227 AP |
376 | # Purpose of these subroutines is to explicitly encode VIS instructions, |
377 | # so that one can compile the module without having to specify VIS | |
478b50cf | 378 | # extensions on compiler command line, e.g. -xarch=v9 vs. -xarch=v9a. |
3ed6e227 AP |
379 | # The idea is to keep open the option of producing a "universal" binary and let |
380 | # programmer detect if current CPU is VIS capable at run-time. | |
# Replace a three-operand VIS instruction by an explicit .word directive.
#
#   $mnemonic         instruction name as written in the template
#   $rs1, $rs2, $rd   operands, expected in the form %fN
#
# Returns ".word\t0x........ !<original text>" for the mnemonics listed in
# %visopf.  The original instruction text is returned unchanged when the
# mnemonic is not listed, when an operand is not a floating-point register,
# or when an odd register number of 32 or above is used.
sub unvis {
    my ($mnemonic,$rs1,$rs2,$rd)=@_;
    # Parenthesised so that both variables are lexical; "my $ref,$opf;"
    # declared only $ref and left $opf as a package global.
    my ($ref,$opf);
    my %visopf = (	"faligndata"	=> 0x048,
			"for"		=> 0x07c	);

    $ref = "$mnemonic\t$rs1,$rs2,$rd";

    if ($opf=$visopf{$mnemonic}) {
        # $_ aliases each operand in turn, so the register names are
        # replaced in place by their 5-bit encodings.
        foreach ($rs1,$rs2,$rd) {
            return $ref if (!/%f([0-9]{1,2})/);
            $_=$1;
            if ($1>=32) {
                return $ref if ($1&1);
                # re-encode for upper double register addressing:
                # bit 5 of the register number moves to bit 0
                $_=($1|$1>>5)&31;
            }
        }

        return sprintf ".word\t0x%08x !%s",
                       0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
                       $ref;
    } else {
        return $ref;
    }
}
# Replace an alignaddr instruction by an explicit .word directive.
#
#   $mnemonic         instruction name as written in the template
#   $rs1, $rs2, $rd   operands, expected to be integer registers
#                     (%g, %o, %l or %i followed by 0..7)
#
# Returns ".word\t0x........ !<original text>", or the original
# instruction text unchanged if any operand is not such a register.
sub unalignaddr {
    my ($mnemonic,$rs1,$rs2,$rd)=@_;
    my $ref="$mnemonic\t$rs1,$rs2,$rd";
    # First register number of each of the four register groups.
    my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );

    my @regno;
    for my $operand ($rs1,$rs2,$rd) {
        return $ref unless $operand =~ /%([goli])([0-7])/;
        push @regno, $bias{$1}+$2;
    }
    my ($n1,$n2,$nd)=@regno;

    return sprintf ".word\t0x%08x !%s",
                   0x81b00300|$nd<<25|$n1<<14|$n2,
                   $ref;
}
420 | ||
# Post-process the generated text line by line and print it:
#  - anything between backticks is evaluated as Perl and replaced by the
#    result;
#  - three-operand instructions whose mnemonic starts with "f" and whose
#    operands are %f registers are handed to unvis();
#  - alignaddr with integer register operands is handed to unalignaddr().
foreach my $line (split("\n",$code)) {
    $line =~ s/\`([^\`]*)\`/eval $1/ge;

    $line =~ s/\b(f[^\s]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/
               &unvis($1,$2,$3,$4)
              /ge;
    $line =~ s/\b(alignaddr)\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/
               &unalignaddr($1,$2,$3,$4)
              /ge;

    print $line,"\n";
}

# Output is buffered, so a failed write (for example a full disk) may only
# be reported here; treat it as fatal rather than leaving a truncated file.
close STDOUT or die "error closing STDOUT: $!";