]>
Commit | Line | Data |
---|---|---|
6aa36e8e | 1 | #! /usr/bin/env perl |
33388b44 | 2 | # Copyright 2012-2020 The OpenSSL Project Authors. All Rights Reserved. |
6aa36e8e | 3 | # |
4911f553 | 4 | # Licensed under the Apache License 2.0 (the "License"). You may not use |
6aa36e8e RS |
5 | # this file except in compliance with the License. You can obtain a copy |
6 | # in the file LICENSE in the source distribution or at | |
7 | # https://www.openssl.org/source/license.html | |
8 | ||
e98c526b AP |
9 | |
10 | # ==================================================================== | |
aea4126e | 11 | # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL |
e98c526b AP |
12 | # project. The module is, however, dual licensed under OpenSSL and |
13 | # CRYPTOGAMS licenses depending on where you obtain it. For further | |
14 | # details see http://www.openssl.org/~appro/cryptogams/. | |
e66055b8 | 15 | # |
e3713c36 | 16 | # Hardware SPARC T4 support by David S. Miller. |
e98c526b AP |
17 | # ==================================================================== |
18 | ||
27e0c863 | 19 | # MD5 for SPARCv9, 6.9 cycles per byte on UltraSPARC, >40% faster than |
e98c526b AP |
20 | # code generated by Sun C 5.2. |
21 | ||
d17b59e4 | 22 | # SPARC T4 MD5 hardware achieves 3.20 cycles per byte, which is 2.1x |
e66055b8 AP |
23 | # faster than software. Multi-process benchmark saturates at 12x |
24 | # single-process result on 8-core processor, or ~11GBps per 2.85GHz | |
25 | # socket. | |
26 | ||
1aa89a7a RL |
# $output is the last argument if it looks like a file (it has an extension).
# When no such argument is present the generated assembly goes to the
# STDOUT the script was started with.
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;

# Redirect STDOUT to the requested file.  Three-argument open keeps any
# special characters in the file name from being interpreted as part of the
# mode, and a failed open is fatal instead of silently leaving the output
# on the original STDOUT.
if ($output) {
    open STDOUT, '>', $output or die "can't open $output: $!";
}
e98c526b AP |
31 | |
32 | use integer; | |
33 | ||
# Symbolic names for the SPARC registers referenced by the code templates.

# input arguments
$ctx = "%i0";
$inp = "%i1";
$len = "%i2";

# 64-bit values: the input words, a scratch register and the packed
# A,B and C,D pairs
@X  = map { "%$_" } qw(o0 o1 o2 o3 o4 o5 o7 g1 g2);
$tx = "%g3";
$AB = "%g4";
$CD = "%g5";

# 32-bit values: the four working variables (also kept as the list @V so
# that their roles can be rotated), temporaries, the saved %asi and the
# shift counts used to realign the input
@V = map { "%l$_" } 0 .. 3;
($A, $B, $C, $D) = @V;
($t1, $t2, $t3, $saved_asi) = map { "%l$_" } 4 .. 7;
$shr  = "%i3";
$shl1 = "%i4";
$shl2 = "%i5";
45 | ||
# Per-step additive constants of MD5, one per step, in step order (four
# groups of sixteen, one group per round).  The code templates always refer
# to @K[$i+1] because each step prepares the X+K sum for the step after it;
# the extra trailing 0 is the entry picked up by the final step ($i == 63).
46 | my @K=( 0xd76aa478,0xe8c7b756,0x242070db,0xc1bdceee, | |
47 | 0xf57c0faf,0x4787c62a,0xa8304613,0xfd469501, | |
48 | 0x698098d8,0x8b44f7af,0xffff5bb1,0x895cd7be, | |
49 | 0x6b901122,0xfd987193,0xa679438e,0x49b40821, | |
50 | ||
51 | 0xf61e2562,0xc040b340,0x265e5a51,0xe9b6c7aa, | |
52 | 0xd62f105d,0x02441453,0xd8a1e681,0xe7d3fbc8, | |
53 | 0x21e1cde6,0xc33707d6,0xf4d50d87,0x455a14ed, | |
54 | 0xa9e3e905,0xfcefa3f8,0x676f02d9,0x8d2a4c8a, | |
55 | ||
56 | 0xfffa3942,0x8771f681,0x6d9d6122,0xfde5380c, | |
57 | 0xa4beea44,0x4bdecfa9,0xf6bb4b60,0xbebfbc70, | |
58 | 0x289b7ec6,0xeaa127fa,0xd4ef3085,0x04881d05, | |
59 | 0xd9d4d039,0xe6db99e5,0x1fa27cf8,0xc4ac5665, | |
60 | ||
61 | 0xf4292244,0x432aff97,0xab9423a7,0xfc93a039, | |
62 | 0x655b59c3,0x8f0ccc92,0xffeff47d,0x85845dd1, | |
63 | 0x6fa87e4f,0xfe2ce6e0,0xa3014314,0x4e0811a1, | |
64 | 0xf7537e82,0xbd3af235,0x2ad7d2bb,0xeb86d391, 0 ); | |
65 | ||
# R0(i, a, b, c, d): append the code for step i of the first MD5 round to
# $code.  The driver loop calls it for i = 0..14.  $a..$d are the names of
# the registers that currently play the roles of the four working variables.
#
# The emitted code expects $t2 = X[i]+K[i] and $t1 = c^d on entry (left
# there by the previous step or by the loop prologue).  Besides finishing
# step i it prepares the same two values for step i+1, which is why the
# templates refer to @K[$i+1] and to the next message word.
#
# Message words are held two per 64-bit register in @X:
#   odd i  - the next word X[i+1] is the first one in @X[$j], so that
#            register is realigned first ($shr/$shl1/$shl2, borrowing bits
#            from @X[$j+1]);
#   even i - the next word X[2*$j+1] is the upper half of @X[$j] and is
#            extracted into $tx with a 32-bit right shift.
66 | sub R0 { | |
67 | my ($i,$a,$b,$c,$d) = @_; | |
# Left-rotate amount for this step; the sll/srl pair plus the two adds in
# the templates form $b + rotate-left($a, $rot).
68 | my $rot = (7,12,17,22)[$i%4]; | |
# Index into @X of the register holding the next message word (integer
# division, the script runs under "use integer").
69 | my $j = ($i+1)/2; | |
70 | ||
71 | if ($i&1) { | |
72 | $code.=<<___; | |
73 | srlx @X[$j],$shr,@X[$j] ! align X[`$i+1`] | |
74 | and $b,$t1,$t1 ! round $i | |
75 | sllx @X[$j+1],$shl1,$tx | |
76 | add $t2,$a,$a | |
77 | sllx $tx,$shl2,$tx | |
78 | xor $d,$t1,$t1 | |
79 | or $tx,@X[$j],@X[$j] | |
80 | sethi %hi(@K[$i+1]),$t2 | |
81 | add $t1,$a,$a | |
82 | or $t2,%lo(@K[$i+1]),$t2 | |
83 | sll $a,$rot,$t3 | |
84 | add @X[$j],$t2,$t2 ! X[`$i+1`]+K[`$i+1`] | |
85 | srl $a,32-$rot,$a | |
86 | add $b,$t3,$t3 | |
87 | xor $b,$c,$t1 | |
88 | add $t3,$a,$a | |
89 | ___ | |
90 | } else { | |
91 | $code.=<<___; | |
92 | srlx @X[$j],32,$tx ! extract X[`2*$j+1`] | |
93 | and $b,$t1,$t1 ! round $i | |
94 | add $t2,$a,$a | |
95 | xor $d,$t1,$t1 | |
96 | sethi %hi(@K[$i+1]),$t2 | |
97 | add $t1,$a,$a | |
98 | or $t2,%lo(@K[$i+1]),$t2 | |
99 | sll $a,$rot,$t3 | |
100 | add $tx,$t2,$t2 ! X[`2*$j+1`]+K[`$i+1`] | |
101 | srl $a,32-$rot,$a | |
102 | add $b,$t3,$t3 | |
103 | xor $b,$c,$t1 | |
104 | add $t3,$a,$a | |
105 | ___ | |
106 | } | |
107 | } | |
108 | ||
# R0_1(i, a, b, c, d): append the code for the last step of the first MD5
# round to $code; the driver loop calls it for i = 15 only.
#
# It mirrors the even-step template of R0 with two differences that hand
# over to the second round: the next message word is X[1] (upper half of
# @X[0], extracted into $tx), and the value left in $t1 for the following
# step is produced with "andn $b,$c" instead of "xor $b,$c".
109 | sub R0_1 { | |
110 | my ($i,$a,$b,$c,$d) = @_; | |
# Left-rotate amount for this step (same table as R0).
111 | my $rot = (7,12,17,22)[$i%4]; | |
112 | ||
113 | $code.=<<___; | |
114 | srlx @X[0],32,$tx ! extract X[1] | |
115 | and $b,$t1,$t1 ! round $i | |
116 | add $t2,$a,$a | |
117 | xor $d,$t1,$t1 | |
118 | sethi %hi(@K[$i+1]),$t2 | |
119 | add $t1,$a,$a | |
120 | or $t2,%lo(@K[$i+1]),$t2 | |
121 | sll $a,$rot,$t3 | |
122 | add $tx,$t2,$t2 ! X[1]+K[`$i+1`] | |
123 | srl $a,32-$rot,$a | |
124 | add $b,$t3,$t3 | |
125 | andn $b,$c,$t1 | |
126 | add $t3,$a,$a | |
127 | ___ | |
128 | } | |
129 | ||
# R1(i, a, b, c, d): append the code for step i of the second MD5 round to
# $code.  The driver loop calls it for i = 16..31.  $a..$d are the register
# names currently playing the roles of the four working variables.
#
# On entry the emitted code expects $t2 = X+K for this step and $t1 holding
# the "andn" term computed by the previous step; "and $b,$d" / "or" complete
# the round function.  As in the other rounds, the X+K sum and the first
# half of the round function for step i+1 are prepared here.
130 | sub R1 { | |
131 | my ($i,$a,$b,$c,$d) = @_; | |
# Left-rotate amount for this step.
132 | my $rot = (5,9,14,20)[$i%4]; | |
# Index of the message word needed by step i+1: this round's word order
# while the next step is still in this round, otherwise (i == 31) the
# word order of the following round.
133 | my $j = $i<31 ? (1+5*($i+1))%16 : (5+3*($i+1))%16; | |
# Register holding that word; words are packed two per 64-bit register.
134 | my $xi = @X[$j/2]; | |
135 | ||
# Odd-numbered words sit in the upper half of their register and must be
# shifted down into $tx first.  The "($xi=$tx)" inside the condition is a
# deliberate assignment: it redirects $xi to $tx and is always true.
136 | $code.=<<___ if ($j&1 && ($xi=$tx)); | |
137 | srlx @X[$j/2],32,$xi ! extract X[$j] | |
138 | ___ | |
# The last template line is "andn" while the next step still belongs to
# this round and "xor" when it is the first step of the next round.
139 | $code.=<<___; | |
140 | and $b,$d,$t3 ! round $i | |
141 | add $t2,$a,$a | |
142 | or $t3,$t1,$t1 | |
143 | sethi %hi(@K[$i+1]),$t2 | |
144 | add $t1,$a,$a | |
145 | or $t2,%lo(@K[$i+1]),$t2 | |
146 | sll $a,$rot,$t3 | |
147 | add $xi,$t2,$t2 ! X[$j]+K[`$i+1`] | |
148 | srl $a,32-$rot,$a | |
149 | add $b,$t3,$t3 | |
150 | `$i<31?"andn":"xor"` $b,$c,$t1 | |
151 | add $t3,$a,$a | |
152 | ___ | |
153 | } | |
154 | ||
# R2(i, a, b, c, d): append the code for step i of the third MD5 round to
# $code.  The driver loop calls it for i = 32..47.  $a..$d are the register
# names currently playing the roles of the four working variables.
#
# On entry the emitted code expects $t2 = X+K for this step and $t1 = c^d
# from the previous step; "xor $b,$t1" completes the three-way xor.  The
# X+K sum and the "xor $b,$c" term for step i+1 are prepared here.
155 | sub R2 { | |
156 | my ($i,$a,$b,$c,$d) = @_; | |
# Left-rotate amount for this step.
157 | my $rot = (4,11,16,23)[$i%4]; | |
# Index of the message word needed by step i+1: this round's word order
# while the next step is still in this round, otherwise (i == 47) the
# word order of the following round.
158 | my $j = $i<47 ? (5+3*($i+1))%16 : (0+7*($i+1))%16; | |
# Register holding that word; words are packed two per 64-bit register.
159 | my $xi = @X[$j/2]; | |
160 | ||
# Odd-numbered words are shifted down from the upper half into $tx; the
# "($xi=$tx)" in the condition is a deliberate, always-true assignment.
161 | $code.=<<___ if ($j&1 && ($xi=$tx)); | |
162 | srlx @X[$j/2],32,$xi ! extract X[$j] | |
163 | ___ | |
164 | $code.=<<___; | |
165 | add $t2,$a,$a ! round $i | |
166 | xor $b,$t1,$t1 | |
167 | sethi %hi(@K[$i+1]),$t2 | |
168 | add $t1,$a,$a | |
169 | or $t2,%lo(@K[$i+1]),$t2 | |
170 | sll $a,$rot,$t3 | |
171 | add $xi,$t2,$t2 ! X[$j]+K[`$i+1`] | |
172 | srl $a,32-$rot,$a | |
173 | add $b,$t3,$t3 | |
174 | xor $b,$c,$t1 | |
175 | add $t3,$a,$a | |
176 | ___ | |
177 | } | |
178 | ||
# R3(i, a, b, c, d): append the code for step i of the fourth MD5 round to
# $code.  The driver loop calls it for i = 48..63.  $a..$d are the register
# names currently playing the roles of the four working variables.
#
# On entry the emitted code expects $t2 = X+K for this step.  Unlike the
# earlier rounds nothing is carried over in $t1: the round function is
# computed entirely inside the step with "orn $b,$d" followed by
# "xor $c".  Only the X+K sum for step i+1 is prepared ahead of time; for
# i == 63 that uses the trailing 0 entry of @K and X[0].
179 | sub R3 { | |
180 | my ($i,$a,$b,$c,$d) = @_; | |
# Left-rotate amount for this step.
181 | my $rot = (6,10,15,21)[$i%4]; | |
# Index of the message word needed by step i+1.
182 | my $j = (0+7*($i+1))%16; | |
# Register holding that word; words are packed two per 64-bit register.
183 | my $xi = @X[$j/2]; | |
184 | ||
185 | $code.=<<___; | |
186 | add $t2,$a,$a ! round $i | |
187 | ___ | |
# Odd-numbered words are shifted down from the upper half into $tx; the
# "($xi=$tx)" in the condition is a deliberate, always-true assignment.
188 | $code.=<<___ if ($j&1 && ($xi=$tx)); | |
189 | srlx @X[$j/2],32,$xi ! extract X[$j] | |
190 | ___ | |
191 | $code.=<<___; | |
192 | orn $b,$d,$t1 | |
193 | sethi %hi(@K[$i+1]),$t2 | |
194 | xor $c,$t1,$t1 | |
195 | or $t2,%lo(@K[$i+1]),$t2 | |
196 | add $t1,$a,$a | |
197 | sll $a,$rot,$t3 | |
198 | add $xi,$t2,$t2 ! X[$j]+K[`$i+1`] | |
199 | srl $a,32-$rot,$a | |
200 | add $b,$t3,$t3 | |
201 | add $t3,$a,$a | |
202 | ___ | |
203 | } | |
204 | ||
# Generated assembly, part 1: file preamble, the hardware path and the
# prologue of the software path, up to the point where X[0]+K[0] is ready
# in $t2 and C^D in $t1 for the first step.
#
# Entry: OPENSSL_sparcv9cap_P[1] is tested against CFR_MD5; when the bit is
# clear control goes to .Lsoftware.
#
# Hardware path (runs before any "save", so the arguments are still in
# %o0-%o2): the context is loaded into %f0-%f3 with ASI 0x88 loads and each
# 64-byte block into %f8-%f22 ahead of the MD5 opcode, which is given as a
# literal .word.  Input that is not 8-byte aligned goes through
# .Lhwunaligned, which realigns it with alignaddr/faligndata and carries
# the last doubleword over to the next iteration in the branch delay slot.
# The alignaddr, faligndata and "for" mnemonics are turned into .word
# directives by the post-processing loop at the end of this script.
#
# Software path (.Lsoftware): saves %asi and sets it to 0x88
# (ASI_PRIMARY_LITTLE) so that ldxa reads the input little-endian, rounds
# $inp down to an 8-byte boundary and derives the shift counts
# $shr/$shl1/$shl2 (which add up to 64) used to realign the 64-bit words.
# .Loop loads X[0..7], plus X[8] only when the input was misaligned (the
# load sits in the delay slot of an annulled branch), and packs A,B and C,D
# into $AB/$CD so they can be added back after the 64 steps.
e98c526b | 205 | $code.=<<___; |
e66055b8 AP |
206 | #include "sparc_arch.h" |
207 | ||
1efd5830 AP |
208 | #ifdef __arch64__ |
209 | .register %g2,#scratch | |
210 | .register %g3,#scratch | |
211 | #endif | |
212 | ||
e98c526b AP |
213 | .section ".text",#alloc,#execinstr |
214 | ||
e66055b8 AP |
215 | #ifdef __PIC__ |
216 | SPARC_PIC_THUNK(%g1) | |
217 | #endif | |
218 | ||
e98c526b AP |
219 | .globl md5_block_asm_data_order |
220 | .align 32 | |
221 | md5_block_asm_data_order: | |
e66055b8 AP |
222 | SPARC_LOAD_ADDRESS_LEAF(OPENSSL_sparcv9cap_P,%g1,%g5) |
223 | ld [%g1+4],%g1 ! OPENSSL_sparcv9cap_P[1] | |
224 | ||
225 | andcc %g1, CFR_MD5, %g0 | |
226 | be .Lsoftware | |
227 | nop | |
228 | ||
d17b59e4 | 229 | mov 4, %g1 |
e66055b8 | 230 | andcc %o1, 0x7, %g0 |
d17b59e4 AP |
231 | lda [%o0 + %g0]0x88, %f0 ! load context |
232 | lda [%o0 + %g1]0x88, %f1 | |
233 | add %o0, 8, %o0 | |
234 | lda [%o0 + %g0]0x88, %f2 | |
235 | lda [%o0 + %g1]0x88, %f3 | |
e66055b8 | 236 | bne,pn %icc, .Lhwunaligned |
d17b59e4 | 237 | sub %o0, 8, %o0 |
e66055b8 AP |
238 | |
239 | .Lhw_loop: | |
240 | ldd [%o1 + 0x00], %f8 | |
241 | ldd [%o1 + 0x08], %f10 | |
242 | ldd [%o1 + 0x10], %f12 | |
243 | ldd [%o1 + 0x18], %f14 | |
244 | ldd [%o1 + 0x20], %f16 | |
245 | ldd [%o1 + 0x28], %f18 | |
246 | ldd [%o1 + 0x30], %f20 | |
609b0852 | 247 | subcc %o2, 1, %o2 ! done yet? |
e66055b8 AP |
248 | ldd [%o1 + 0x38], %f22 |
249 | add %o1, 0x40, %o1 | |
aea4126e | 250 | prefetch [%o1 + 63], 20 |
e66055b8 AP |
251 | |
252 | .word 0x81b02800 ! MD5 | |
253 | ||
1efd5830 | 254 | bne,pt SIZE_T_CC, .Lhw_loop |
e66055b8 AP |
255 | nop |
256 | ||
257 | .Lhwfinish: | |
d17b59e4 AP |
258 | sta %f0, [%o0 + %g0]0x88 ! store context |
259 | sta %f1, [%o0 + %g1]0x88 | |
260 | add %o0, 8, %o0 | |
261 | sta %f2, [%o0 + %g0]0x88 | |
262 | sta %f3, [%o0 + %g1]0x88 | |
e66055b8 | 263 | retl |
d17b59e4 | 264 | nop |
e66055b8 AP |
265 | |
266 | .align 8 | |
267 | .Lhwunaligned: | |
268 | alignaddr %o1, %g0, %o1 | |
269 | ||
270 | ldd [%o1 + 0x00], %f10 | |
271 | .Lhwunaligned_loop: | |
272 | ldd [%o1 + 0x08], %f12 | |
273 | ldd [%o1 + 0x10], %f14 | |
274 | ldd [%o1 + 0x18], %f16 | |
275 | ldd [%o1 + 0x20], %f18 | |
276 | ldd [%o1 + 0x28], %f20 | |
277 | ldd [%o1 + 0x30], %f22 | |
278 | ldd [%o1 + 0x38], %f24 | |
279 | subcc %o2, 1, %o2 ! done yet? | |
280 | ldd [%o1 + 0x40], %f26 | |
281 | add %o1, 0x40, %o1 | |
aea4126e | 282 | prefetch [%o1 + 63], 20 |
e66055b8 AP |
283 | |
284 | faligndata %f10, %f12, %f8 | |
285 | faligndata %f12, %f14, %f10 | |
286 | faligndata %f14, %f16, %f12 | |
287 | faligndata %f16, %f18, %f14 | |
288 | faligndata %f18, %f20, %f16 | |
289 | faligndata %f20, %f22, %f18 | |
290 | faligndata %f22, %f24, %f20 | |
291 | faligndata %f24, %f26, %f22 | |
292 | ||
293 | .word 0x81b02800 ! MD5 | |
294 | ||
1efd5830 | 295 | bne,pt SIZE_T_CC, .Lhwunaligned_loop |
e66055b8 AP |
296 | for %f26, %f26, %f10 ! %f10=%f26 |
297 | ||
298 | ba .Lhwfinish | |
299 | nop | |
300 | ||
301 | .align 16 | |
302 | .Lsoftware: | |
1efd5830 | 303 | save %sp,-STACK_FRAME,%sp |
e98c526b AP |
304 | |
305 | rd %asi,$saved_asi | |
306 | wr %g0,0x88,%asi ! ASI_PRIMARY_LITTLE | |
307 | and $inp,7,$shr | |
308 | andn $inp,7,$inp | |
309 | ||
310 | sll $shr,3,$shr ! *=8 | |
311 | mov 56,$shl2 | |
312 | ld [$ctx+0],$A | |
313 | sub $shl2,$shr,$shl2 | |
314 | ld [$ctx+4],$B | |
315 | and $shl2,32,$shl1 | |
316 | add $shl2,8,$shl2 | |
317 | ld [$ctx+8],$C | |
318 | sub $shl2,$shl1,$shl2 ! shr+shl1+shl2==64 | |
319 | ld [$ctx+12],$D | |
320 | nop | |
321 | ||
322 | .Loop: | |
323 | cmp $shr,0 ! was inp aligned? | |
324 | ldxa [$inp+0]%asi,@X[0] ! load little-endian input | |
325 | ldxa [$inp+8]%asi,@X[1] | |
326 | ldxa [$inp+16]%asi,@X[2] | |
327 | ldxa [$inp+24]%asi,@X[3] | |
328 | ldxa [$inp+32]%asi,@X[4] | |
329 | sllx $A,32,$AB ! pack A,B | |
330 | ldxa [$inp+40]%asi,@X[5] | |
331 | sllx $C,32,$CD ! pack C,D | |
332 | ldxa [$inp+48]%asi,@X[6] | |
333 | or $B,$AB,$AB | |
334 | ldxa [$inp+56]%asi,@X[7] | |
335 | or $D,$CD,$CD | |
336 | bnz,a,pn %icc,.+8 | |
337 | ldxa [$inp+64]%asi,@X[8] | |
338 | ||
339 | srlx @X[0],$shr,@X[0] ! align X[0] | |
340 | sllx @X[1],$shl1,$tx | |
341 | sethi %hi(@K[0]),$t2 | |
342 | sllx $tx,$shl2,$tx | |
343 | or $t2,%lo(@K[0]),$t2 | |
344 | or $tx,@X[0],@X[0] | |
345 | xor $C,$D,$t1 | |
346 | add @X[0],$t2,$t2 ! X[0]+K[0] | |
347 | ___ | |
# Emit the 64 MD5 steps.  Each stage names the first step index it does NOT
# cover together with the generator responsible up to that point.  After
# every step the register list is rotated right by one position, so the
# registers take turns in the roles of the four working variables.
$i = 0;
foreach my $stage ([15, \&R0], [16, \&R0_1], [32, \&R1], [48, \&R2], [64, \&R3]) {
    my ($limit, $emit_step) = @$stage;
    while ($i < $limit) {
        $emit_step->($i, @V);
        unshift(@V, pop(@V));
        $i++;
    }
}
# Generated assembly, part 2: the tail of .Loop and the function epilogue.
# The A,B,C,D values packed into $AB/$CD before the 64 steps are unpacked
# and added to the freshly computed working variables, $inp is advanced by
# 64 and $len decremented; the loop repeats until $len reaches zero.  The
# state is then written back to $ctx, %asi is restored from $saved_asi and
# the function returns.
353 | $code.=<<___; | |
354 | srlx $AB,32,$t1 ! unpack A,B,C,D and accumulate | |
355 | add $inp,64,$inp ! advance inp | |
356 | srlx $CD,32,$t2 | |
357 | add $t1,$A,$A | |
358 | subcc $len,1,$len ! done yet? | |
359 | add $AB,$B,$B | |
360 | add $t2,$C,$C | |
361 | add $CD,$D,$D | |
362 | srl $B,0,$B ! clruw $B | |
1efd5830 | 363 | bne SIZE_T_CC,.Loop |
e98c526b AP |
364 | srl $D,0,$D ! clruw $D |
365 | ||
366 | st $A,[$ctx+0] ! write out ctx | |
367 | st $B,[$ctx+4] | |
368 | st $C,[$ctx+8] | |
369 | st $D,[$ctx+12] | |
370 | ||
371 | wr %g0,$saved_asi,%asi | |
372 | ret | |
373 | restore | |
374 | .type md5_block_asm_data_order,#function | |
375 | .size md5_block_asm_data_order,(.-md5_block_asm_data_order) | |
376 | ||
377 | .asciz "MD5 block transform for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>" | |
378 | .align 4 | |
379 | ___ | |
380 | ||
e66055b8 AP |
381 | # Purpose of these subroutines is to explicitly encode VIS instructions, |
382 | # so that one can compile the module without having to specify VIS | |
478b50cf | 383 | # extensions on compiler command line, e.g. -xarch=v9 vs. -xarch=v9a. |
e66055b8 AP |
384 | # Idea is to reserve for option to produce "universal" binary and let |
385 | # programmer detect if current CPU is VIS capable at run-time. | |
# unvis(mnemonic, rs1, rs2, rd): hand-encode a three-operand VIS instruction
# on floating-point registers as a ".word" directive.
#
#   $mnemonic       - instruction name; only "faligndata" and "for" are known
#   $rs1, $rs2, $rd - operands written as "%fN"
#
# Returns ".word\t0x<encoding> !<original text>" when the instruction can be
# encoded.  Otherwise the original "mnemonic\trs1,rs2,rd" text is returned
# unchanged: for an unknown mnemonic, an operand that is not "%fN", or an
# odd-numbered register >= 32 (not addressable as a double register).
sub unvis {
    my ($mnemonic,$rs1,$rs2,$rd)=@_;
    # Parenthesised on purpose: "my $ref,$opf;" would declare only $ref and
    # leave $opf as a package global that persists between calls.
    my ($ref,$opf);
    my %visopf = ( "faligndata" => 0x048,
                   "for"        => 0x07c );

    $ref = "$mnemonic\t$rs1,$rs2,$rd";

    # Anything we have no opf code for is passed through untouched.
    $opf = $visopf{$mnemonic} or return $ref;

    my @field;
    foreach my $operand ($rs1,$rs2,$rd) {
        return $ref unless $operand =~ /%f([0-9]{1,2})/;
        my $regno = $1;
        if ($regno >= 32) {
            return $ref if ($regno & 1);
            # re-encode for upper double register addressing
            $regno = ($regno | $regno>>5) & 31;
        }
        push @field, $regno;
    }
    my ($enc_rs1,$enc_rs2,$enc_rd) = @field;

    return sprintf ".word\t0x%08x !%s",
                   0x81b00000|$enc_rd<<25|$enc_rs1<<14|$opf<<5|$enc_rs2,
                   $ref;
}
# unalignaddr(mnemonic, rs1, rs2, rd): hand-encode an alignaddr instruction
# on integer registers as a ".word" directive.
#
# Operands are written as "%gN", "%oN", "%lN" or "%iN" with N in 0..7; the
# register number is the group's bias (g=0, o=8, l=16, i=24) plus N.
# Returns ".word\t0x<encoding> !<original text>", or the original
# "mnemonic\trs1,rs2,rd" text unchanged if any operand does not have that
# form.
sub unalignaddr {
    my ($mnemonic,$rs1,$rs2,$rd)=@_;
    my $ref="$mnemonic\t$rs1,$rs2,$rd";
    my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );

    my @regno;
    for my $operand ($rs1,$rs2,$rd) {
        return $ref unless $operand =~ /%([goli])([0-7])/;
        push @regno, $bias{$1}+$2;
    }
    my ($src1,$src2,$dst) = @regno;

    return sprintf ".word\t0x%08x !%s",
                   0x81b00300|$dst<<25|$src1<<14|$src2,
                   $ref;
}
425 | ||
# Post-process the accumulated assembly line by line and print it:
#   1. text between backticks is evaluated as Perl and replaced by the result;
#   2. three-operand instructions starting with "f" on %f registers are
#      handed to unvis();
#   3. alignaddr on integer registers is handed to unalignaddr().
# Both helpers return either a ".word" encoding or the text they were given.
for my $line (split("\n",$code)) {
    $line =~ s/\`([^\`]*)\`/eval $1/ge;

    $line =~ s/\b(f[^\s]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/unvis($1,$2,$3,$4)/ge;
    $line =~ s/\b(alignaddr)\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/unalignaddr($1,$2,$3,$4)/ge;

    print $line,"\n";
}

# Buffered write errors only surface here, so a failed close is fatal.
close STDOUT or die "error closing STDOUT: $!";