]>
Commit | Line | Data |
---|---|---|
e98c526b AP |
1 | #!/usr/bin/env perl |
2 | ||
3 | # ==================================================================== | |
aea4126e | 4 | # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL |
e98c526b AP |
5 | # project. The module is, however, dual licensed under OpenSSL and |
6 | # CRYPTOGAMS licenses depending on where you obtain it. For further | |
7 | # details see http://www.openssl.org/~appro/cryptogams/. | |
e66055b8 AP |
8 | # |
9 | # Hardware SPARC T4 support by David S. Miller <davem@davemloft.net>. | |
e98c526b AP |
10 | # ==================================================================== |
11 | ||
27e0c863 | 12 | # MD5 for SPARCv9, 6.9 cycles per byte on UltraSPARC, >40% faster than |
e98c526b AP |
13 | # code generated by Sun C 5.2. |
14 | ||
d17b59e4 | 15 | # SPARC T4 MD5 hardware achieves 3.20 cycles per byte, which is 2.1x |
e66055b8 AP |
16 | # faster than software. Multi-process benchmark saturates at 12x |
17 | # single-process result on 8-core processor, or ~11GBps per 2.85GHz | |
18 | # socket. | |
19 | ||
e98c526b AP |
# The first command-line argument names the file that receives the
# generated assembly.  When it is absent (or empty) the output is left on
# the inherited STDOUT.  A three-argument open is used so that characters
# in the path can never be interpreted as an open mode, and a failure to
# create the file is fatal instead of silently falling back to STDOUT.
$output=shift;
if (defined($output) && length($output)) {
    open(STDOUT,'>',$output) or die "can't open $output: $!";
}
22 | ||
23 | use integer; | |
24 | ||
# input arguments
$ctx="%i0";
$inp="%i1";
$len="%i2";

# 64-bit values: nine registers for the input words plus one scratch
# register and the packed A,B / C,D copies
@X=qw(%o0 %o1 %o2 %o3 %o4 %o5 %o7 %g1 %g2);
$tx="%g3";
$AB="%g4";
$CD="%g5";

# 32-bit values: the four working variables live in %l0..%l3, the
# temporaries and the saved %asi in %l4..%l7
@V=($A,$B,$C,$D)=map { "%l$_" } 0..3;
($t1,$t2,$t3,$saved_asi)=map { "%l$_" } 4..7;

# shift counts used to realign input that is not 8-byte aligned
$shr="%i3";
$shl1="%i4";
$shl2="%i5";
36 | ||
# Additive round constants, sixteen per pass.  The table carries one
# extra trailing zero because every round emitter reads @K[$i+1], so the
# last round (63) indexes element 64.
my @K=map { hex($_) } (
    # rounds 0..15
    qw(d76aa478 e8c7b756 242070db c1bdceee),
    qw(f57c0faf 4787c62a a8304613 fd469501),
    qw(698098d8 8b44f7af ffff5bb1 895cd7be),
    qw(6b901122 fd987193 a679438e 49b40821),

    # rounds 16..31
    qw(f61e2562 c040b340 265e5a51 e9b6c7aa),
    qw(d62f105d 02441453 d8a1e681 e7d3fbc8),
    qw(21e1cde6 c33707d6 f4d50d87 455a14ed),
    qw(a9e3e905 fcefa3f8 676f02d9 8d2a4c8a),

    # rounds 32..47
    qw(fffa3942 8771f681 6d9d6122 fde5380c),
    qw(a4beea44 4bdecfa9 f6bb4b60 bebfbc70),
    qw(289b7ec6 eaa127fa d4ef3085 04881d05),
    qw(d9d4d039 e6db99e5 1fa27cf8 c4ac5665),

    # rounds 48..63
    qw(f4292244 432aff97 ab9423a7 fc93a039),
    qw(655b59c3 8f0ccc92 ffeff47d 85845dd1),
    qw(6fa87e4f fe2ce6e0 a3014314 4e0811a1),
    qw(f7537e82 bd3af235 2ad7d2bb eb86d391),

    # padding entry read by round 63
    qw(0),
);
56 | ||
# R0 - append the assembly for round $i of the first pass to $code.
#
# Arguments: the round index followed by the four working registers in
# their current rotation ($a,$b,$c,$d).  Besides finishing round $i the
# emitted code fetches the next input word, leaves X[$i+1]+K[$i+1] in
# $t2 and leaves $b xor $c in $t1 for the round that follows.
sub R0 {
    my $i = shift;
    my ($a,$b,$c,$d) = @_;
    my @rotations = (7,12,17,22);
    my $rot = $rotations[$i%4];
    # 64-bit register that holds the next input word; the file runs under
    # "use integer", so this is an integer division
    my $j = ($i+1)/2;

    if (($i&1) == 0) {
        # even round: the next word is the upper half of @X[$j]
        $code.=<<___;
srlx @X[$j],32,$tx ! extract X[`2*$j+1`]
and $b,$t1,$t1 ! round $i
add $t2,$a,$a
xor $d,$t1,$t1
sethi %hi(@K[$i+1]),$t2
add $t1,$a,$a
or $t2,%lo(@K[$i+1]),$t2
sll $a,$rot,$t3
add $tx,$t2,$t2 ! X[`2*$j+1`]+K[`$i+1`]
srl $a,32-$rot,$a
add $b,$t3,$t3
xor $b,$c,$t1
add $t3,$a,$a
___
    } else {
        # odd round: the next word starts a fresh 64-bit register, which
        # is first realigned by merging in bits of the following register
        # ($shr/$shl1/$shl2 are the precomputed shift counts)
        $code.=<<___;
srlx @X[$j],$shr,@X[$j] ! align X[`$i+1`]
and $b,$t1,$t1 ! round $i
sllx @X[$j+1],$shl1,$tx
add $t2,$a,$a
sllx $tx,$shl2,$tx
xor $d,$t1,$t1
or $tx,@X[$j],@X[$j]
sethi %hi(@K[$i+1]),$t2
add $t1,$a,$a
or $t2,%lo(@K[$i+1]),$t2
sll $a,$rot,$t3
add @X[$j],$t2,$t2 ! X[`$i+1`]+K[`$i+1`]
srl $a,32-$rot,$a
add $b,$t3,$t3
xor $b,$c,$t1
add $t3,$a,$a
___
    }
}
99 | ||
# R0_1 - variant of R0 used for the last round of the first pass (the
# driver loop calls it for round 15 only).
#
# It differs from R0 in two ways: the next input word is always X[1],
# taken from the upper half of @X[0], and $t1 is left holding
# "andn $b,$c" instead of "xor $b,$c" for the round that follows.
# Arguments are the round index and the working registers ($a,$b,$c,$d);
# the generated text is appended to $code.
sub R0_1 {
    my ($i,@regs) = @_;
    my ($a,$b,$c,$d) = @regs;
    my @rotations = (7,12,17,22);
    my $rot = $rotations[$i%4];

    $code.=<<___;
srlx @X[0],32,$tx ! extract X[1]
and $b,$t1,$t1 ! round $i
add $t2,$a,$a
xor $d,$t1,$t1
sethi %hi(@K[$i+1]),$t2
add $t1,$a,$a
or $t2,%lo(@K[$i+1]),$t2
sll $a,$rot,$t3
add $tx,$t2,$t2 ! X[1]+K[`$i+1`]
srl $a,32-$rot,$a
add $b,$t3,$t3
andn $b,$c,$t1
add $t3,$a,$a
___
}
120 | ||
# R1 - append the assembly for round $i of the second pass (rounds
# 16..31 in the driver loop) to $code.
#
# Arguments: the round index and the working registers ($a,$b,$c,$d).
# $j is the index of the input word consumed by the *next* round; round
# 31 already uses the word schedule of the following pass.  Words with an
# odd index sit in the upper half of their 64-bit register and are first
# extracted into $tx.
sub R1 {
    my $i = shift;
    my ($a,$b,$c,$d) = @_;
    my @rotations = (5,9,14,20);
    my $rot = $rotations[$i%4];

    my $j;
    if ($i<31) {
        $j = (1+5*($i+1))%16;
    } else {
        $j = (5+3*($i+1))%16;
    }
    my $xi = $X[$j/2];

    if (($j&1) && ($xi=$tx)) {
        $code.=<<___;
srlx @X[$j/2],32,$xi ! extract X[$j]
___
    }

    $code.=<<___;
and $b,$d,$t3 ! round $i
add $t2,$a,$a
or $t3,$t1,$t1
sethi %hi(@K[$i+1]),$t2
add $t1,$a,$a
or $t2,%lo(@K[$i+1]),$t2
sll $a,$rot,$t3
add $xi,$t2,$t2 ! X[$j]+K[`$i+1`]
srl $a,32-$rot,$a
add $b,$t3,$t3
`$i<31?"andn":"xor"` $b,$c,$t1
add $t3,$a,$a
___
}
145 | ||
# R2 - append the assembly for round $i of the third pass (rounds
# 32..47 in the driver loop) to $code.
#
# Arguments: the round index and the working registers ($a,$b,$c,$d).
# $j is the index of the input word consumed by the *next* round; round
# 47 already uses the word schedule of the final pass.  Words with an odd
# index sit in the upper half of their 64-bit register and are first
# extracted into $tx.
sub R2 {
    my $i = shift;
    my ($a,$b,$c,$d) = @_;
    my @rotations = (4,11,16,23);
    my $rot = $rotations[$i%4];

    my $j;
    if ($i<47) {
        $j = (5+3*($i+1))%16;
    } else {
        $j = (0+7*($i+1))%16;
    }
    my $xi = $X[$j/2];

    if (($j&1) && ($xi=$tx)) {
        $code.=<<___;
srlx @X[$j/2],32,$xi ! extract X[$j]
___
    }

    $code.=<<___;
add $t2,$a,$a ! round $i
xor $b,$t1,$t1
sethi %hi(@K[$i+1]),$t2
add $t1,$a,$a
or $t2,%lo(@K[$i+1]),$t2
sll $a,$rot,$t3
add $xi,$t2,$t2 ! X[$j]+K[`$i+1`]
srl $a,32-$rot,$a
add $b,$t3,$t3
xor $b,$c,$t1
add $t3,$a,$a
___
}
169 | ||
# R3 - append the assembly for round $i of the final pass (rounds
# 48..63 in the driver loop) to $code.
#
# Arguments: the round index and the working registers ($a,$b,$c,$d).
# $j is the index of the input word consumed by the *next* round.  Unlike
# the earlier passes the round function is computed entirely within the
# round (orn/xor), so nothing is precomputed in $t1 for the next round.
sub R3 {
    my $i = shift;
    my ($a,$b,$c,$d) = @_;
    my @rotations = (6,10,15,21);
    my $rot = $rotations[$i%4];
    my $j = (0+7*($i+1))%16;
    my $xi = $X[$j/2];

    $code.=<<___;
add $t2,$a,$a ! round $i
___

    # odd-indexed words sit in the upper half of their 64-bit register
    if (($j&1) && ($xi=$tx)) {
        $code.=<<___;
srlx @X[$j/2],32,$xi ! extract X[$j]
___
    }

    $code.=<<___;
orn $b,$d,$t1
sethi %hi(@K[$i+1]),$t2
xor $c,$t1,$t1
or $t2,%lo(@K[$i+1]),$t2
add $t1,$a,$a
sll $a,$rot,$t3
add $xi,$t2,$t2 ! X[$j]+K[`$i+1`]
srl $a,32-$rot,$a
add $b,$t3,$t3
add $t3,$a,$a
___
}
195 | ||
e98c526b | 196 | $code.=<<___; |
e66055b8 AP |
197 | #include "sparc_arch.h" |
198 | ||
1efd5830 AP |
199 | #ifdef __arch64__ |
200 | .register %g2,#scratch | |
201 | .register %g3,#scratch | |
202 | #endif | |
203 | ||
e98c526b AP |
204 | .section ".text",#alloc,#execinstr |
205 | ||
e66055b8 AP |
206 | #ifdef __PIC__ |
207 | SPARC_PIC_THUNK(%g1) | |
208 | #endif | |
209 | ||
e98c526b AP |
210 | .globl md5_block_asm_data_order |
211 | .align 32 | |
212 | md5_block_asm_data_order: | |
e66055b8 AP |
213 | SPARC_LOAD_ADDRESS_LEAF(OPENSSL_sparcv9cap_P,%g1,%g5) |
214 | ld [%g1+4],%g1 ! OPENSSL_sparcv9cap_P[1] | |
215 | ||
216 | andcc %g1, CFR_MD5, %g0 | |
217 | be .Lsoftware | |
218 | nop | |
219 | ||
d17b59e4 | 220 | mov 4, %g1 |
e66055b8 | 221 | andcc %o1, 0x7, %g0 |
d17b59e4 AP |
222 | lda [%o0 + %g0]0x88, %f0 ! load context |
223 | lda [%o0 + %g1]0x88, %f1 | |
224 | add %o0, 8, %o0 | |
225 | lda [%o0 + %g0]0x88, %f2 | |
226 | lda [%o0 + %g1]0x88, %f3 | |
e66055b8 | 227 | bne,pn %icc, .Lhwunaligned |
d17b59e4 | 228 | sub %o0, 8, %o0 |
e66055b8 AP |
229 | |
230 | .Lhw_loop: | |
231 | ldd [%o1 + 0x00], %f8 | |
232 | ldd [%o1 + 0x08], %f10 | |
233 | ldd [%o1 + 0x10], %f12 | |
234 | ldd [%o1 + 0x18], %f14 | |
235 | ldd [%o1 + 0x20], %f16 | |
236 | ldd [%o1 + 0x28], %f18 | |
237 | ldd [%o1 + 0x30], %f20 | |
238 | subcc %o2, 1, %o2 ! done yet? | |
239 | ldd [%o1 + 0x38], %f22 | |
240 | add %o1, 0x40, %o1 | |
aea4126e | 241 | prefetch [%o1 + 63], 20 |
e66055b8 AP |
242 | |
243 | .word 0x81b02800 ! MD5 | |
244 | ||
1efd5830 | 245 | bne,pt SIZE_T_CC, .Lhw_loop |
e66055b8 AP |
246 | nop |
247 | ||
248 | .Lhwfinish: | |
d17b59e4 AP |
249 | sta %f0, [%o0 + %g0]0x88 ! store context |
250 | sta %f1, [%o0 + %g1]0x88 | |
251 | add %o0, 8, %o0 | |
252 | sta %f2, [%o0 + %g0]0x88 | |
253 | sta %f3, [%o0 + %g1]0x88 | |
e66055b8 | 254 | retl |
d17b59e4 | 255 | nop |
e66055b8 AP |
256 | |
257 | .align 8 | |
258 | .Lhwunaligned: | |
259 | alignaddr %o1, %g0, %o1 | |
260 | ||
261 | ldd [%o1 + 0x00], %f10 | |
262 | .Lhwunaligned_loop: | |
263 | ldd [%o1 + 0x08], %f12 | |
264 | ldd [%o1 + 0x10], %f14 | |
265 | ldd [%o1 + 0x18], %f16 | |
266 | ldd [%o1 + 0x20], %f18 | |
267 | ldd [%o1 + 0x28], %f20 | |
268 | ldd [%o1 + 0x30], %f22 | |
269 | ldd [%o1 + 0x38], %f24 | |
270 | subcc %o2, 1, %o2 ! done yet? | |
271 | ldd [%o1 + 0x40], %f26 | |
272 | add %o1, 0x40, %o1 | |
aea4126e | 273 | prefetch [%o1 + 63], 20 |
e66055b8 AP |
274 | |
275 | faligndata %f10, %f12, %f8 | |
276 | faligndata %f12, %f14, %f10 | |
277 | faligndata %f14, %f16, %f12 | |
278 | faligndata %f16, %f18, %f14 | |
279 | faligndata %f18, %f20, %f16 | |
280 | faligndata %f20, %f22, %f18 | |
281 | faligndata %f22, %f24, %f20 | |
282 | faligndata %f24, %f26, %f22 | |
283 | ||
284 | .word 0x81b02800 ! MD5 | |
285 | ||
1efd5830 | 286 | bne,pt SIZE_T_CC, .Lhwunaligned_loop |
e66055b8 AP |
287 | for %f26, %f26, %f10 ! %f10=%f26 |
288 | ||
289 | ba .Lhwfinish | |
290 | nop | |
291 | ||
292 | .align 16 | |
293 | .Lsoftware: | |
1efd5830 | 294 | save %sp,-STACK_FRAME,%sp |
e98c526b AP |
295 | |
296 | rd %asi,$saved_asi | |
297 | wr %g0,0x88,%asi ! ASI_PRIMARY_LITTLE | |
298 | and $inp,7,$shr | |
299 | andn $inp,7,$inp | |
300 | ||
301 | sll $shr,3,$shr ! *=8 | |
302 | mov 56,$shl2 | |
303 | ld [$ctx+0],$A | |
304 | sub $shl2,$shr,$shl2 | |
305 | ld [$ctx+4],$B | |
306 | and $shl2,32,$shl1 | |
307 | add $shl2,8,$shl2 | |
308 | ld [$ctx+8],$C | |
309 | sub $shl2,$shl1,$shl2 ! shr+shl1+shl2==64 | |
310 | ld [$ctx+12],$D | |
311 | nop | |
312 | ||
313 | .Loop: | |
314 | cmp $shr,0 ! was inp aligned? | |
315 | ldxa [$inp+0]%asi,@X[0] ! load little-endian input | |
316 | ldxa [$inp+8]%asi,@X[1] | |
317 | ldxa [$inp+16]%asi,@X[2] | |
318 | ldxa [$inp+24]%asi,@X[3] | |
319 | ldxa [$inp+32]%asi,@X[4] | |
320 | sllx $A,32,$AB ! pack A,B | |
321 | ldxa [$inp+40]%asi,@X[5] | |
322 | sllx $C,32,$CD ! pack C,D | |
323 | ldxa [$inp+48]%asi,@X[6] | |
324 | or $B,$AB,$AB | |
325 | ldxa [$inp+56]%asi,@X[7] | |
326 | or $D,$CD,$CD | |
327 | bnz,a,pn %icc,.+8 | |
328 | ldxa [$inp+64]%asi,@X[8] | |
329 | ||
330 | srlx @X[0],$shr,@X[0] ! align X[0] | |
331 | sllx @X[1],$shl1,$tx | |
332 | sethi %hi(@K[0]),$t2 | |
333 | sllx $tx,$shl2,$tx | |
334 | or $t2,%lo(@K[0]),$t2 | |
335 | or $tx,@X[0],@X[0] | |
336 | xor $C,$D,$t1 | |
337 | add @X[0],$t2,$t2 ! X[0]+K[0] | |
338 | ___ | |
# Emit all 64 rounds.  Each entry pairs an emitter with the first round
# index it no longer handles; after every round the register list is
# rotated by one position so that the roles of the four working
# registers shift for the next round.
$i=0;
foreach my $stage ([15,\&R0],[16,\&R0_1],[32,\&R1],[48,\&R2],[64,\&R3]) {
    my ($limit,$emit) = @$stage;
    while ($i<$limit) {
        $emit->($i,@V);
        unshift(@V,pop(@V));
        $i++;
    }
}
344 | $code.=<<___; | |
345 | srlx $AB,32,$t1 ! unpack A,B,C,D and accumulate | |
346 | add $inp,64,$inp ! advance inp | |
347 | srlx $CD,32,$t2 | |
348 | add $t1,$A,$A | |
349 | subcc $len,1,$len ! done yet? | |
350 | add $AB,$B,$B | |
351 | add $t2,$C,$C | |
352 | add $CD,$D,$D | |
353 | srl $B,0,$B ! clruw $B | |
1efd5830 | 354 | bne SIZE_T_CC,.Loop |
e98c526b AP |
355 | srl $D,0,$D ! clruw $D |
356 | ||
357 | st $A,[$ctx+0] ! write out ctx | |
358 | st $B,[$ctx+4] | |
359 | st $C,[$ctx+8] | |
360 | st $D,[$ctx+12] | |
361 | ||
362 | wr %g0,$saved_asi,%asi | |
363 | ret | |
364 | restore | |
365 | .type md5_block_asm_data_order,#function | |
366 | .size md5_block_asm_data_order,(.-md5_block_asm_data_order) | |
367 | ||
368 | .asciz "MD5 block transform for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>" | |
369 | .align 4 | |
370 | ___ | |
371 | ||
e66055b8 AP |
# The purpose of these subroutines is to encode VIS instructions
# explicitly, so that the module can be compiled without specifying VIS
# extensions on the compiler command line, e.g. -xarch=v9 vs. -xarch=v9a.
# The idea is to keep open the option of producing a "universal" binary
# and letting the programmer detect at run-time whether the current CPU
# is VIS capable.
# unvis - hand-encode a three-operand VIS instruction as a raw ".word".
#
# Arguments: the mnemonic and the rs1, rs2 and rd operands as written in
# the assembly text (e.g. "%f10").
# Returns:   ".word\t0x........ !<original instruction>" when the mnemonic
#            is listed in %visopf and all three operands are encodable
#            floating-point registers; otherwise the instruction text
#            "<mnemonic>\t<rs1>,<rs2>,<rd>" is returned unchanged so that
#            the assembler handles it itself.
sub unvis {
    my ($mnemonic,$rs1,$rs2,$rd)=@_;
    # Both variables must be inside the parentheses: "my $ref,$opf;"
    # declares only $ref and leaves $opf a package global that every call
    # would overwrite.
    my ($ref,$opf);
    my %visopf = ( "faligndata" => 0x048,
                   "for"        => 0x07c );

    $ref = "$mnemonic\t$rs1,$rs2,$rd";

    # unknown mnemonic: pass the instruction through untouched
    $opf = $visopf{$mnemonic} or return $ref;

    # $_ aliases each operand in turn, so assigning to it replaces the
    # register name with its numeric encoding
    foreach ($rs1,$rs2,$rd) {
        return $ref if (!/%f([0-9]{1,2})/);
        $_=$1;
        if ($1>=32) {
            # odd-numbered registers above %f31 are rejected
            return $ref if ($1&1);
            # re-encode for upper double register addressing
            $_=($1|$1>>5)&31;
        }
    }

    return sprintf ".word\t0x%08x !%s",
                   0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
                   $ref;
}
# unalignaddr - hand-encode "alignaddr rs1,rs2,rd" as a raw ".word".
#
# Arguments: the mnemonic and the three operands as written in the
# assembly text (e.g. "%o1").
# Returns:   ".word\t0x........ !<original instruction>" when all three
#            operands are integer registers (%g, %o, %l or %i followed by
#            0..7); otherwise the instruction text is returned unchanged.
sub unalignaddr {
    my ($mnemonic,$rs1,$rs2,$rd)=@_;
    my $ref = "$mnemonic\t$rs1,$rs2,$rd";
    # number of the first register in each register group
    my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );

    # $reg aliases each operand in turn, so assigning to it replaces the
    # register name with its number
    for my $reg ($rs1,$rs2,$rd) {
        return $ref unless ($reg =~ /%([goli])([0-7])/);
        $reg = $bias{$1}+$2;
    }

    my $insn = 0x81b00300|$rd<<25|$rs1<<14|$rs2;
    return sprintf(".word\t0x%08x !%s", $insn, $ref);
}
416 | ||
# Post-process the accumulated assembly line by line and print it:
#   1. every `...` span is replaced by the value of the Perl expression
#      it contains (the expressions come from the templates in this file,
#      never from external input);
#   2. three-operand instructions on %f registers whose mnemonic starts
#      with "f" are offered to unvis();
#   3. "alignaddr" on integer registers is offered to unalignaddr().
foreach my $line (split("\n",$code)) {
    $line =~ s/\`([^\`]*)\`/eval $1/ge;

    $line =~ s/\b(f[^\s]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/
        &unvis($1,$2,$3,$4)
        /ge;
    $line =~ s/\b(alignaddr)\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/
        &unalignaddr($1,$2,$3,$4)
        /ge;

    print $line,"\n";
}

# Buffered write errors (e.g. a full disk) only surface when the handle
# is closed; treat them as fatal so a truncated file is never mistaken
# for a successful run.
close STDOUT or die "error closing STDOUT: $!";